def main(argv=None):
    """Dump sentences from a CWB corpus in the format selected on the command line."""
    args = oparse.parse_args(argv)
    corpus_name = args.corpus
    # Interpret the range options: (start, end), (0, end), or the whole corpus.
    if args.end is not None:
        sent_start, sent_end = args.start, args.end
    elif args.start is not None:
        sent_start, sent_end = 0, args.start
    else:
        sent_start, sent_end = 0, None
    corpus = Corpus(corpus_name, registry_dir=CQP_REGISTRY)
    columns = [None] * 14
    columns[0] = corpus.attribute('word', 'p')
    sent_attr = corpus.attribute('s', 's')
    if args.fmt == 'conll':
        next_idx = 1
        for spec in args.xcolumns:
            # A column spec is either "IDX=ATTR" (explicit slot) or a bare
            # attribute name that takes the next free slot.
            if '=' in spec:
                col_idx, att_name = spec.split('=')
                col_idx = int(col_idx)
            else:
                col_idx, att_name = next_idx, spec
            columns[col_idx] = corpus.attribute(att_name, 'p')
            next_idx = col_idx + 1
        output_sentences(sent_attr, columns, sent_start, sent_end)
    elif args.fmt == 'line':
        output_sentences_line(sent_attr, columns, sent_start, sent_end)
    elif args.fmt == 'bllip':
        output_sentences_bllip(sent_attr, columns, sent_start, sent_end,
                               corpus_name=corpus_name, max_len=args.max_len)
def cqp2vocab_main(argv=None):
    """Collect a frequency-thresholded vocabulary from one or more CQP corpora
    and print the surviving types, sorted, one per line."""
    opts, args = oparse.parse_args(argv)
    frequencies = defaultdict(int)
    for corpus_name in args:
        crp = Corpus(corpus_name, registry_dir=CQP_REGISTRY)
        att = crp.attribute(opts.attr, 'p')
        # Re-encode tokens only when the corpus encoding differs from the
        # requested target encoding.
        if opts.encoding is not None and crp.get_encoding() != opts.encoding:
            print >> sys.stderr, "Recoding %s items from %s to %s" % (
                corpus_name, crp.get_encoding(), opts.encoding)
            to_uni = crp.to_unicode
            enc = opts.encoding

            def recode(w, _to_uni=to_uni, _enc=enc):
                return _to_uni(w).encode(_enc)
        else:
            def recode(w):
                return w
        dic = att.getDictionary()
        for i in xrange(len(dic)):
            word = dic.get_word(i)
            frequencies[recode(word)] += att.frequency(word)
    # Drop rare items.  keys() returns a list copy in Python 2, so deleting
    # while iterating is safe here.
    for word in frequencies.keys():
        if frequencies[word] < opts.threshold:
            del frequencies[word]
    if opts.out_fname is None:
        f_out = sys.stdout
    else:
        f_out = file(opts.out_fname, 'w')
    for word in sorted(frequencies):
        print >> f_out, word
def make_bigram_alph(corpora, attr_name='word', suffix='', outdir='.'):
    """Build unigram and bigram alphabets from one or more CWB corpora.

    Reads the positional attribute `attr_name` from every corpus in
    `corpora`, sums per-item unigram/bigram frequencies across corpora,
    truncates each list to MAX_LIST entries and writes the surviving items
    as CPPAlphabet files ('unigram<suffix>_alph.txt' and
    'bigram<suffix>_alph.txt') into `outdir`.
    """
    unigram_freqs = defaultdict(int)
    bigram_freqs = defaultdict(int)
    for corpus_name in corpora:
        print >> sys.stderr, "Reading corpus: %s" % (corpus_name, )
        corpus = Corpus(corpus_name)
        att = corpus.attribute(attr_name, 'p')
        unigram_list, bigram_list = make_frequencies(att)
        for v, k in unigram_list:
            unigram_freqs[k] += v
        for v, k in bigram_list:
            bigram_freqs[k] += v
    _write_ngram_alphabet(
        unigram_freqs,
        os.path.join(outdir, 'unigram%s_alph.txt' % (suffix, )))
    _write_ngram_alphabet(
        bigram_freqs,
        os.path.join(outdir, 'bigram%s_alph.txt' % (suffix, )))


def _write_ngram_alphabet(freqs, fname):
    """Truncate `freqs` to MAX_LIST items and write them as a CPPAlphabet.

    Replicates the (formerly duplicated) unigram/bigram tail of
    make_bigram_alph; the output file is now closed deterministically.
    """
    items = [(v, k) for (k, v) in freqs.iteritems()]
    # NOTE(review): ascending sort + truncation keeps the MAX_LIST *least*
    # frequent items, as in the original code -- confirm this is intended
    # (a descending sort would keep the most frequent ones).
    items.sort()
    del items[MAX_LIST:]
    alph = CPPAlphabet()
    for c, k in items:
        # Looking the key up registers it in the alphabet.
        alph[k]
    # Fix: the original opened the file via file(...) and never closed it.
    with open(fname, 'w') as f_out:
        alph.tofile(f_out)
def cqp2vocab_main(argv=None):
    # Build a vocabulary from one or more CQP corpora: sum per-type
    # frequencies of the chosen positional attribute, drop items below
    # opts.threshold and print the surviving types, sorted, one per line.
    opts, args = oparse.parse_args(argv)
    frequencies = defaultdict(int)
    for arg in args:
        crp = Corpus(arg, registry_dir=CQP_REGISTRY)
        att = crp.attribute(opts.attr, 'p')
        # Re-encode tokens only when the corpus encoding differs from the
        # requested target encoding.
        if opts.encoding is not None and crp.get_encoding() != opts.encoding:
            print >> sys.stderr, "Recoding %s items from %s to %s" % (
                arg, crp.get_encoding(), opts.encoding)
            to_uni = crp.to_unicode
            enc = opts.encoding
            recode = lambda w: to_uni(w).encode(enc)
        else:
            recode = lambda x: x
        dic = att.getDictionary()
        for i in xrange(len(dic)):
            word = dic.get_word(i)
            frequencies[recode(word)] += att.frequency(word)
    # keys() returns a list copy in Python 2, so deleting during iteration
    # is safe here.
    for word in frequencies.keys():
        if frequencies[word] < opts.threshold:
            del frequencies[word]
    if opts.out_fname is None:
        f_out = sys.stdout
    else:
        f_out = file(opts.out_fname, 'w')
    for word in sorted(frequencies):
        print >> f_out, word
def __init__(self, corpus_name):
    """Open `corpus_name` and cache its word, sentence and text_id attributes."""
    self.name = corpus_name
    self.corpus = Corpus(corpus_name, registry_dir=CQP_REGISTRY)
    self.words = self.corpus.attribute('word', 'p')
    self.sentences = self.corpus.attribute('s', 's')
    # Map every text id to the corpus position where that text begins.
    text_ids = self.corpus.attribute('text_id', 's')
    self.id_to_start = dict(
        (fname, start) for start, end, fname in text_ids)
def con_source(request, qpos):
    """Django view: show the concordance context around one query match.

    `qpos` is an encrypted token holding (corpus name, match start, match
    end); the response is HTML with the match highlighted in red.
    """
    window_size = 100
    corp_name, start, end = cy.decrypt(qpos)
    start, end = int(start), int(end)
    corpus = Corpus(corp_name.upper(),
                    registry_dir='/usr/local/share/cwb/registry')
    words = corpus.attribute('word', 'p')
    corp_len = len(words)
    # Clamp the context window to the corpus boundaries.
    if start - window_size < 0:
        lb = 0
    else:
        lb = start - window_size
    if end + window_size > corp_len:
        # NOTE(review): corp_len - 1 drops the final token at the corpus
        # edge (slices below are end-exclusive) -- confirm intended.
        rb = corp_len - 1
    else:
        rb = end + window_size
    lw = ''.join(words[lb:start])
    qw = '<span style="color:red;font-size:24px;">' + ''.join(
        words[start:end]) + '</span>'
    rw = ''.join(words[end:rb])
    # The tccm/ntuspk corpora carry speaker annotation; render whole
    # utterances with their speaker labels instead of a flat window.
    if corp_name == 'tccm' or corp_name == 'ntuspk':
        if corp_name == 'tccm':
            s_attrs = corpus.attribute('s_addresser', 's')
        if corp_name == 'ntuspk':
            s_attrs = corpus.attribute('s_speaker', 's')
        # Regions (start, end, label) fully inside the clamped window.
        top = s_attrs.cpos2struc(lb)
        top = s_attrs[top]
        bottom = s_attrs.cpos2struc(rb)
        bottom = s_attrs[bottom]
        attr_con = []
        for attr in s_attrs:
            if attr[0] >= top[0] and attr[1] <= bottom[1]:
                attr_con.append(attr)
        output = ''
        for a in attr_con:
            # Highlight only inside the utterance containing the match.
            if start in xrange(a[0], a[1]):
                sent =\
                    a[-1] + ': ' +\
                    ' '.join(words[a[0]:start]) + ' ' +\
                    '<span style="color:red;font-size:24px;">' + ' '.join(words[start:end]) + '</span>' + ' ' +\
                    ' '.join(words[end:a[1]])
            else:
                sent = '%s: %s' % (a[-1], ' '.join(words[a[0]:a[1]]))
            output += sent + '<br>'
        # output = ['%s: %s' % (i[-1], ' '.join(words[i[0]:i[1]])) for i in attr_con]
        # output = '<br>'.join(output)
        return HttpResponse(output)
    return HttpResponse(lw + qw + rw)
def con_source(qpos):
    """Concordance source.

    `qpos` is "corpus_start_end"; returns HTML with the matched span
    wrapped in a red <span>.  For the speaker-annotated corpora
    (tccm/ntuspk) the whole utterances around the match are rendered with
    their speaker labels.
    """
    window_size = 100
    corp_name, start, end = qpos.split('_')
    start, end = int(start), int(end)
    registry_dir = CONF.get('main', 'registry_dir')
    corpus = Corpus(corp_name.upper(), registry_dir=registry_dir)
    words = corpus.attribute('word', 'p')
    corp_len = len(words)
    # Clamp the context window to the corpus boundaries.
    if start - window_size < 0:
        lb = 0
    else:
        lb = start - window_size
    if end + window_size > corp_len:
        # NOTE(review): corp_len - 1 drops the final token at the corpus
        # edge (slices below are end-exclusive) -- confirm intended.
        rb = corp_len - 1
    else:
        rb = end + window_size
    lw = ''.join(words[lb:start])
    qw = '<span style="color:red;font-size:24px;">' + ''.join(
        words[start:end]) + '</span>'
    rw = ''.join(words[end:rb])
    if corp_name == 'tccm' or corp_name == 'ntuspk':
        if corp_name == 'tccm':
            s_attrs = corpus.attribute('s_addresser', 's')
        if corp_name == 'ntuspk':
            s_attrs = corpus.attribute('s_speaker', 's')
        # Regions (start, end, label) fully inside the clamped window.
        top = s_attrs.cpos2struc(lb)
        top = s_attrs[top]
        bottom = s_attrs.cpos2struc(rb)
        bottom = s_attrs[bottom]
        attr_con = []
        for attr in s_attrs:
            if attr[0] >= top[0] and attr[1] <= bottom[1]:
                attr_con.append(attr)
        output = ''
        for a in attr_con:
            # Highlight only inside the utterance containing the match.
            if start in xrange(a[0], a[1]):
                sent =\
                    a[-1] + ': ' +\
                    ' '.join(words[a[0]:start]) + ' ' +\
                    '<span style="color:red;font-size:24px;">' +\
                    ' '.join(words[start:end]) + '</span>' + ' ' +\
                    ' '.join(words[end:a[1]])
            else:
                sent = '%s: %s' % (a[-1], ' '.join(words[a[0]:a[1]]))
            output += sent + '<br>'
        return output
    return lw + qw + rw
class CorpusInfo:
    """Cached handle on a CWB corpus plus a file-id -> sentence lookup."""

    def __init__(self, corpus_name):
        self.name = corpus_name
        self.corpus = Corpus(corpus_name, registry_dir=CQP_REGISTRY)
        self.words = self.corpus.attribute('word', 'p')
        self.sentences = self.corpus.attribute('s', 's')
        # First corpus position of every file, keyed by file id.
        file_starts = {}
        for start, end, fname in self.corpus.attribute('file_id', 's'):
            file_starts[fname] = start
        self.id_to_start = file_starts

    def __getitem__(self, fname):
        """Return the index of the sentence in which file `fname` starts."""
        return self.sentences.cpos2struc(self.id_to_start[fname])
class CorpusInfo:
    """Wraps a CWB corpus and maps file ids to their first sentence."""

    def __init__(self, corpus_name):
        self.name = corpus_name
        self.corpus = Corpus(corpus_name, registry_dir=CQP_REGISTRY)
        self.words = self.corpus.attribute('word', 'p')
        self.sentences = self.corpus.attribute('s', 's')
        # file id -> corpus position where that file begins
        self.id_to_start = dict(
            (fname, start)
            for start, end, fname in self.corpus.attribute('file_id', 's'))

    def __getitem__(self, fname):
        """Sentence index (struc number) of the first token of `fname`."""
        return self.sentences.cpos2struc(self.id_to_start[fname])
def create_bow_tag(corpora, language, pos_tags, outdir='.', alph_suffix=''):
    # Build bag-of-words feature matrices per POS tag from the given
    # corpora, using previously written unigram/bigram alphabets as the
    # feature space, and write one alphabet + one binary matrix per POS.
    # NOTE(review): relies on module-level `opts` and `prefix_l` rather
    # than its own parameters, and `language` is partly shadowed by
    # `opts.language` below -- confirm these globals are set before calling.
    # Step 1: extract unigram distributions for words
    unigram_alph = CPPUniAlphabet()
    unigram_alph.fromfile(
        file(os.path.join(outdir, 'unigram%s_alph.txt' % (alph_suffix, ))))
    unigram_alph.growing = False
    bigram_alph = CPPUniAlphabet()
    bigram_alph.fromfile(
        file(os.path.join(outdir, 'bigram%s_alph.txt' % (alph_suffix, ))))
    bigram_alph.growing = False
    # infix encodes the prefix list (and optionally the row limit) in the
    # output file names
    infix = '_'.join(prefix_l)
    if infix != '':
        infix = '_' + infix
    if opts.limit != -1:
        prefix_l.append('%d' % (opts.limit / 1000))
    word_matrix = None
    word_alphs = get_word_alphs_by_pos(language)
    for word_pos in pos_tags:
        word_alph = word_alphs[word_pos]
        word_feat_alph = CPPUniAlphabet()
        for corpus_name in corpora:
            corpus = Corpus(corpus_name)
            att = corpus.attribute(opts.attr_name, 'p')
            att_find = corpus.attribute('tb_lemma', 'p')
            att_sent = corpus.attribute('s', 's')
            pair_alphs = get_pair_alphs_by_pos(opts.language)
            word_alphs = get_word_alphs_by_pos(opts.language)
            print "word features for %s in %s" % (word_pos, corpus_name)
            # accumulate word co-occurrence vectors over all corpora
            wmat = gather_word_vectors(list(word_alph), att, att_find,
                                       att_sent, unigram_alph, bigram_alph,
                                       word_feat_alph,
                                       forward_mapping_by_pos(word_pos),
                                       opts.limit)
            if word_matrix is None:
                word_matrix = wmat
            else:
                word_matrix += wmat
        # write the per-POS feature alphabet and the accumulated matrix
        word_feat_alph.tofile_utf8(
            file(
                os.path.join(opts.outdir, 'word_bow%s%s_alph.txt' % (
                    infix,
                    word_pos,
                )), 'w'))
        word_matrix.write_binary(
            file(
                os.path.join(opts.outdir, 'word_bow%s%s_mtx.bin' % (
                    infix,
                    word_pos,
                )), 'w'))
def __init__(self, corpus_name,
             registry_path='/usr/local/share/cwb/registry/'):
    """Remember corpus name / registry path and open the attribute handle."""
    self.corpus_name = corpus_name
    self.registry_path = registry_path
    # Attribute access for the corpus registered under `corpus_name`.
    self.attributes = Crps(corpus_name, registry_dir=registry_path)
def __init__(self, corpus_name, lib_path=None, cqp_bin='cqp', registry_path='/usr/local/share/cwb/registry/', data_path='/tmp/ccc-data/'): """Establish connection to CQP and corpus attributes, set paths, read library. Raises KeyError if corpus not in registry. :param str corpus_name: name of corpus in CWB registry :param str lib_path: /path/to/macros/and/wordlists/ :param str cqp_bin: /path/to/cqp-binary :param str registry_path: /path/to/cwb/registry/ :param str data_path: /path/to/data/and/cache/ """ # process data path if data_path is not None: if not data_path.endswith(corpus_name): data_path = os.path.join(data_path, corpus_name) self.data_path = data_path if not os.path.isdir(self.data_path): os.makedirs(self.data_path) cache_path = os.path.join(self.data_path, "CACHE") else: self.data_path = None cache_path = None # set registry path and cqp_bin self.registry_path = registry_path self.cqp_bin = cqp_bin # macros and wordlists self.lib_path = lib_path # init (sub-)corpus information self.corpus_name = corpus_name self.subcorpus = None # init attributes self.attributes = Attributes(self.corpus_name, registry_dir=self.registry_path) # get corpus size self.corpus_size = len(self.attributes.attribute('word', 'p')) # get available corpus attributes self.attributes_available = self._attributes_available() # init cache self.cache = Cache(cache_path) # init counts self.counts = Counts(self.corpus_name, self.registry_path)
def __init__(self, bin_path, corpus_name, registry_dir):
    """Build `cqp` connection.

    :bin_path: absolute path of `cqp` bin (usually `/usr/local/bin/cqp`)
    :param: corpus_name: name of the corpus
    :param: registry_dir: absolute path of `cqp` registry directory
    """
    # Interactive CQP child process (-c) pointed at the registry/corpus.
    self.cqp = PyCQP_interface.CQP(
        bin=bin_path,
        options=f'-c -r {registry_dir} -D {corpus_name}')
    # Direct low-level access to the corpus attributes.
    self.corpus = Corpus(corpus_name, registry_dir=registry_dir)
def main(argv=None):
    """Dump sentences from a CWB corpus in CoNLL, one-line or BLLIP format."""
    opts, args = oparse.parse_args(argv)
    if not args:
        oparse.print_help()
        sys.exit(1)
    corpus_name = args[0]
    # Positional arguments select the sentence range: CORPUS [START] END.
    if len(args) == 3:
        sent_start, sent_end = int(args[1]), int(args[2])
    elif len(args) == 2:
        sent_start, sent_end = 0, int(args[1])
    else:
        sent_start, sent_end = 0, None
    corpus = Corpus(corpus_name, registry_dir=CQP_REGISTRY)
    columns = [None] * 14
    columns[0] = corpus.attribute('word', 'p')
    sent_attr = corpus.attribute('s', 's')
    if opts.fmt == 'conll':
        next_idx = 1
        for spec in opts.xcolumns:
            # "IDX=ATTR" pins an attribute to a column; a bare name takes
            # the next free slot.
            if '=' in spec:
                col_idx, att_name = spec.split('=')
                col_idx = int(col_idx)
            else:
                col_idx, att_name = next_idx, spec
            columns[col_idx] = corpus.attribute(att_name, 'p')
            next_idx = col_idx + 1
        output_sentences(sent_attr, columns, sent_start, sent_end)
    elif opts.fmt == 'line':
        output_sentences_line(sent_attr, columns, sent_start, sent_end)
    elif opts.fmt == 'bllip':
        output_sentences_bllip(sent_attr, columns, sent_start, sent_end,
                               corpus_name=corpus_name, max_len=opts.max_len)
def find(self, token, rsize=None, show_pos=False, begin_time=None,
         end_time=None, board_list=None):
    # Run a CQP query for `token`, collect up to `rsize` concordance
    # lines into self.conclst (sorted by post time), and -- for the PTT
    # corpus -- per-year relative frequencies into self.freq_by_year.
    # Returns 'nores' when the query has no matches.
    if begin_time:
        if not isinstance(begin_time, int):
            raise TypeError('"begin_time" should be an "int"')
    if end_time:
        if not isinstance(end_time, int):
            raise TypeError('"end_time" should be an "int"')
    if board_list:
        if not isinstance(board_list, list):
            raise TypeError('"board_list" should be a "list"')
    # Reset the per-year counters (years start at 2001).
    for i in xrange(2001, datetime.today().year + 1):
        self.freq_by_year[i] = 0
    self.conclst = []
    registry_dir = '/usr/local/share/cwb/registry'
    cqp = PyCQP_interface.CQP(bin='/usr/local/bin/cqp',
                              options='-c -r ' + registry_dir)
    cqp.Exec(self.corpus_name + ";")
    if token.startswith('['):
        # Raw CQP query: every run of '.' wildcards is tripled.
        # NOTE(review): presumably because the corpus stores multi-byte
        # characters, so one character spans three bytes -- confirm.
        wildcard = '.'
        for i in reversed(range(1, 6)):
            token = token.replace(wildcard * i, wildcard * i * 3)
            # token = re.sub('(\[word=")(%s)("\])' % (wildcard*i), '\\1%s\\3' % (wildcard*i*3), token)
            # else:
            #     token = re.sub('(\[word=")(%s)("\])' % (wildcard*i), '\\1\\2|%s\\3' % (wildcard*i*3), token)
        cqp.Query(token)
    else:
        cqp.Query('[word="%s"];' % token)
    if rsize == None:
        rsize = int(cqp.Exec("size Last;"))
    self.results = cqp.Dump(first=0, last=rsize)
    # NOTE(review): kills every cqp process on the machine, not only the
    # one spawned above -- confirm this is acceptable.
    os.system('kill -9 $(pidof cqp)')
    if self.results == [['']]:
        return 'nores'
    corpus = Corpus(self.corpus_name, registry_dir=registry_dir)
    words = corpus.attribute("word", "p")
    if show_pos == True:
        postags = corpus.attribute("pos", "p")
    elif show_pos == False:
        pass
    else:
        raise
    # sentences = corpus.attribute("s","s") -> find position in sentences (line number)
    ids = corpus.attribute("text_id", "s")
    boards = corpus.attribute("text_board", "s")
    ptimes = corpus.attribute("text_time", "s")
    for line in self.results:
        output = dict()
        start = int(line[0])
        end = int(line[1]) + 1
        # post_time filter
        ptime = ptimes.find_pos(start)[-1]
        if begin_time != None and end_time != None:
            if begin_time <= int(ptime) <= end_time:
                pass
            else:
                continue
        elif begin_time != None and end_time == None:
            if int(ptime) < begin_time:
                continue
        elif begin_time == None and end_time != None:
            if int(ptime) > end_time:
                continue
        # board_list filter
        board = boards.find_pos(start)[-1]
        if board_list:
            if board not in board_list:
                continue
        # Context windows around the match.
        lw = words[start - self.window_size:start]
        rw = words[end:end + self.window_size]
        qw = words[start:end]
        if show_pos is True:
            lp = postags[start - self.window_size:start]
            rp = postags[end:end + self.window_size]
            qp = postags[start:end]
            left = ' '.join([
                '%s<span>/%s</span>' % (word, pos)
                for word, pos in zip(lw, lp)
            ])
            mid = ' '.join([
                '%s<span>/%s</span>' % (word, pos)
                for word, pos in zip(qw, qp)
            ])
            right = ' '.join([
                '%s<span>/%s</span>' % (word, pos)
                for word, pos in zip(rw, rp)
            ])
        elif show_pos is False:
            left = ' '.join(['%s' % word for word in lw])
            mid = ' '.join(['%s' % word for word in qw])
            right = ' '.join(['%s' % word for word in rw])
        # s_bounds = sentences.find_pos(end-1)
        mongoid = ids.find_pos(start)[-1]
        if self.corpus_name == 'PTT':
            self.freq_by_year[int(ptime[:4])] += 1
        output['conc'] = (left, mid, right)
        output['board'] = board
        output['post_time'] = ptime
        output['mongoid'] = mongoid
        output['board_cht'] = BOARDREF[board]
        self.conclst.append(output)
    # time_order -1 = newest first, 1 = oldest first.
    if self.time_order == -1:
        rev = True
    elif self.time_order == 1:
        rev = False
    else:
        raise ValueError('time order should be either 1 or -1')
    self.conclst.sort(key=lambda x: x['post_time'], reverse=rev)
    if self.corpus_name == 'PTT':
        # Normalise the yearly counts by the yearly token totals.
        for y in self.freq_by_year.iterkeys():
            if y in self.freq_by_year and y in toknumByYear:
                self.freq_by_year[y] = self.freq_by_year[y] / toknumByYear[
                    str(y)]
class Counts:
    """ returns df_counts:

    def: (p_att_1, p_att_2, ...), freq
    all p_atts are strings, " "-delimited for MWUs (split=NO)

    attributes:
    .corpus_name
    .attributes

    methods:
    ._cpos2patts

    .cpos (cpos_list, p_atts)

    .dump (df_dump, start, end, p_atts, split)
    - strategy 1: split NO|YES; flags ; combo x
    - strategy 2: split |YES; flags ; combo

    .matches (name, p_att, split, flags)
    - strategy 1: split NO| ; flags x; combo
    - strategy 2: split NO|YES; flags x; combo
    - strategy 3: split |YES; flags ; combo x

    .mwus (queries)
    - strategy 1: split NO| - ; flags x; combo x; mwu NO
    - strategy 2: split NO| - ; flags x; combo x; mwu YES
    - strategy 3: split NO| - ; flags x; combo ; mwu YES

    TODO: counting with group?
    """

    def __init__(self, corpus_name,
                 registry_path='/usr/local/share/cwb/registry/'):
        # Corpus to count in and the CWB registry it is registered in.
        self.corpus_name = corpus_name
        self.registry_path = registry_path
        # NOTE(review): Crps presumably is the low-level CWB corpus/attribute
        # handle -- confirm against the module's imports.
        self.attributes = Crps(self.corpus_name, registry_dir=registry_path)

    def _cpos2patts(self, cpos, p_atts=['word'], ignore_missing=True):
        """Retrieves p-attributes of corpus position.

        :param int cpos: corpus position to fill
        :param list p_atts: p-attribute(s) to fill position with
        :param bool ignore_missing: whether to return -1 for out-of-bounds

        :return: p_att(s)
        :rtype: tuple
        """
        if ignore_missing and cpos == -1:
            token = [None] * len(p_atts)
        else:
            token = [
                self.attributes.attribute(p_att, 'p')[cpos]
                for p_att in p_atts
            ]
        return tuple(token)

    def cpos(self, cpos_list, p_atts=['word']):
        """Creates a frequency table for the p_att-values of the cpos-list.

        :param list cpos_list: corpus positions to fill
        :param list p_atts: p-attribute (combinations) to count

        :return: counts of the p_attribute (combinations) of the positions
        :rtype: DataFrame
        """
        lex_items = [self._cpos2patts(p, p_atts=p_atts) for p in cpos_list]
        counts = Counter(lex_items)
        df_counts = DataFrame.from_dict(counts, orient='index',
                                        columns=['freq'])
        df_counts.index = MultiIndex.from_tuples(df_counts.index,
                                                 names=p_atts)
        return df_counts

    @time_it
    def dump(self, df_dump, start='match', end='matchend', p_atts=['word'],
             split=False, strategy=2):
        """Counts tokens in [start .. end] (columns in df_dump).

        :param list df_dump: corpus positions to fill
        :param str start: column name where to start counting
        :param str end: column name where to end counting
        :param list p_atts: p-attribute (combinations) to count
        :param bool split: token-based count? (default: MWU)
        :param int strategy: strategy 2 (cwb-scan-corpus) is faster,
            does not support MWU counts though

        :return: counts of the p_attribute (combinations) of the positions
        :rtype: DataFrame
        """
        # choose strategy
        if strategy == 2 and not split:
            logger.warning("dump: cannot use cwb-scan-corpus for MWUs")
            strategy = 1
        logger.info("dump: strategy %d" % strategy)
        df_dump = df_dump.reset_index()  # for working with match, matchend
        if strategy == 1:
            logger.info("... extracting tokens")
            ls = df_dump.apply(
                lambda x: [
                    self._cpos2patts(cpos, p_atts)
                    for cpos in range(x[start], x[end] + 1)
                ],
                axis=1
            ).values  # list of list of tuples (p_att_1, p_att_2, ...)
            logger.info("... splitting")
            if split:
                tokens = [token for tokens in ls for token in tokens]
            else:
                # join each MWU into one " "-delimited string per p-att
                tokens = [
                    tuple([" ".join(m) for m in zip(*mwu_list)])
                    for mwu_list in ls
                ]
            logger.info("... counting")
            counts = Counter(tokens)
            df_counts = DataFrame.from_dict(counts, orient='index',
                                            columns=['freq'])
            df_counts.index = MultiIndex.from_tuples(df_counts.index,
                                                     names=p_atts)
        elif strategy == 2:
            df_dump = df_dump.reset_index()
            with NamedTemporaryFile(mode="wt") as f:
                logger.info("... writing dump temporarily to disk")
                df_dump[[start, end]].to_csv(f.name, sep="\t", header=None,
                                             index=False)
                df_counts = cwb_scan_corpus(f.name, self.corpus_name,
                                            self.registry_path, p_atts)
        df_counts = df_counts.sort_values(by='freq', ascending=False)
        return df_counts

    @time_it
    def matches(self, cqp, name, p_atts=["word"], split=False, flags=None,
                strategy=3):
        """Counts tokens in [match .. matchend] of named subcorpus defined
        in running cqp.

        :param CQP cqp: running cqp process
        :param list name: name of the subcorpus
        :param list p_atts: p-attribute(-combinations) to count
        :param bool split: token-based count? (default: MWU)
        :param str flags: %c, %d, %cd

        :return: counts of the p_attribute (combinations) of the positions
        :rtype: DataFrame
        """
        # choose strategy
        combo = len(p_atts) > 1
        #    s f c
        # 1: - - -
        # 1: - x -
        # 2: - - -
        # 2: - x -
        # 2: x - -
        # 2: x x -
        # 3: x - -
        # 3: x - x
        # implemented:
        # - - - 1,2
        # - x - 1,2
        # x - - 2,3
        # x x - 2
        # x - x 3
        # not implemented:
        # - - x
        # - x x
        # x x x
        if combo:
            if flags or (not flags and not split):
                raise NotImplementedError(
                    "matches does not support parameter combination:",
                    str(" ".join([
                        'x' if x else '-'
                        for x in [split, len(flags) > 0, combo]
                    ])))
        if strategy == 1:
            if split or combo:
                logger.warning("matches: cannot use cqp-count")
                strategy = 2
        if strategy == 2:
            if combo:
                logger.warning("matches: cannot use cqp-tabulate")
                strategy = 3
        if strategy == 3:
            if flags or not split:
                logger.warning("matches: cannot use cwb-scan-corpus")
                strategy = 2
        logger.info("matches: strategy %s" % strategy)
        if strategy == 1:
            # split NO; flags NO/YES; combo NO
            # generally slow
            logger.info("... cqp is counting")
            cqp_return = cqp.Exec('count %s by %s %s;' %
                                  (name, p_atts[0], flags))
            df_counts = read_csv(StringIO(cqp_return), sep="\t", header=None,
                                 names=["freq", "unknown", "item"])
            df_counts = df_counts.set_index('item')
            df_counts = df_counts[['freq']]
            df_counts.index.name = p_atts[0]
        elif strategy == 2:
            # split NO/YES; flags NO/YES; combo NO
            # generally faster
            logger.info("... cqp is tabulating")
            cqp_return = cqp.Exec('tabulate %s match .. matchend %s %s;' %
                                  (name, p_atts[0], flags))
            logger.info("... splitting tokens")
            if split:
                # split strings into tokens
                cqp_return = cqp_return.replace(" ", "\n")
            tokens = cqp_return.split("\n")
            logger.info("... counting %d tokens" % len(tokens))
            df_counts = DataFrame.from_dict(Counter(tokens), orient='index',
                                            columns=['freq'])
            df_counts = df_counts[['freq']]
            df_counts.index.name = p_atts[0]
        elif strategy == 3:
            # split YES; flags NO; combo YES
            # generally fastest
            with NamedTemporaryFile(mode="wt") as f:
                logger.info("... writing dump temporarily to disk")
                cqp.Exec('dump %s > "%s";' % (name, f.name))
                df_counts = cwb_scan_corpus(f.name, self.corpus_name,
                                            self.registry_path, p_atts)
        df_counts = df_counts.sort_values(by='freq', ascending=False)
        return df_counts

    @time_it
    def mwus(self, cqp, queries, p_atts=None, fill_missing=True, strategy=1):
        """Calculates frequencies for MWU queries in activated subcorpus.

        queries are a list of valid CQP queries, e.g.
        '[lemma="Angela"%cd & pos="NE"] [lemma="Merkel"%cd & pos="NE"]?'

        caveat: different indices for different strategies

        :param CQP cqp: running cqp process
        :param set queries: set of query strings to get frequency breakdown for
        :param bool fill_missing: count 0 for missing items?
        :param int strategy: strategy to use (see below)

        :return: counts (index: queries(strategy 1) or tokens (, column: freq)
        :rtype: DataFrame

        Strategy 1:
        for each item
            (1) run query for item
            (2) get size of corpus via cqp

        Strategy 2:
        (1) run query for all items at the same time
        (2) dump df
        (3) count_dump()

        Strategy 3:
        (1) run query for all items at the same time
        (2) count_matches()
        """
        queries = set(queries)  # only process each one query once
        name = 'tmp'  # subcorpus name to use
        if strategy == 1:
            if p_atts:
                logger.warning(
                    "mwus: cannot get frequency breakdown when not inspecting dump"
                )
                strategy = 2
        if not p_atts:
            p_atts = ['word']  # necessary for strategies 2 & 3
        if strategy == 3 and len(p_atts) > 1:
            logger.warning(
                "mwus: cannot combine query when looking at several p-attributes"
            )
            strategy = 2
        logger.info("mwus: strategy %s" % strategy)
        if strategy == 1:
            logger.info("... running each query")
            freqs = list()
            for query in queries:
                cqp.Exec('%s=%s;' % (name, query))
                freq = cqp.Exec('size %s;' % name)
                freqs.append(freq)
            df = DataFrame(data=freqs, index=queries, columns=['freq'])
            df.index.name = 'query'
        elif strategy == 2:
            query = "|".join(queries)
            cqp.Exec('%s=%s;' % (name, query))
            df_dump = cqp.Dump(name)
            df = self.dump(df_dump, start='match', end='matchend',
                           p_atts=p_atts, split=False, strategy=1)
            # flatten the one-level MultiIndex for a single p-attribute
            if len(p_atts) == 1:
                df.index = [item[0] for item in df.index]
                df.index.name = p_atts[0]
        elif strategy == 3:
            query = "|".join(queries)
            cqp.Exec('%s=%s;' % (name, query))
            df = self.matches(cqp, name, p_atts=p_atts, split=False,
                              flags=None, strategy=2)
        # post-process dataframe
        df["freq"] = df["freq"].astype(int)
        df = df.sort_values(by=["freq"], ascending=False)
        # df = df.loc[df["freq"] != 0]
        return df
registry_dir='/usr/local/share/cwb/registry' for corpus_name in ["DEWAC01"]: #+['DEWAC%02d'%(x,) for x in range(1,10)]: print corpus_name cqp=PyCQP_interface.CQP(bin='/usr/local/bin/cqp',options='-c -r '+registry_dir) cqp.Exec(corpus_name+";") cqp.Query('[word="ist|sind|war|waren|seid"] [pos="ADV|PPER"]* [word=".+[elr]n"] [pos="\$.|KON"];') cqp.Exec("sort Last by word;") rsize=int(cqp.Exec("size Last;")) results=cqp.Dump(first=0,last=rsize) cqp.Terminate() f=file(corpus_name+'_absentiv.txt','w') #f=sys.stdout corpus=Corpus(corpus_name,registry_dir=registry_dir); words=corpus.attribute("word","p") postags=corpus.attribute("pos","p") sentences=corpus.attribute("s","s") texts=corpus.attribute("text_id","s") for line in results: start=int(line[0]) end=int(line[1]) wn=words[end-1] posn=postags[end-1] if posn.startswith('VV'): s_bounds=sentences.find_pos(end-1) text_bounds=texts.find_pos(end-1) print >>f,"# %s"%(text_bounds[2],) print >>f,"%10d:"%(int(line[1]),), for pos in xrange(s_bounds[0],s_bounds[1]+1):
def query():
    """Flask endpoint: run the submitted CQP query over every departmental corpus.

    Fans the query out over all corpora with a process pool, aggregates
    per-department frequencies, computes specificity scores, and formats a
    random sample of at most 200 concordance lines for display.  Returns a
    JSON string with keys "result", "specif", "nbResults" and
    "nbOccurrences" (or the string "Erreur de syntaxe" on a bad query).
    """
    query = request.form["query"] + ";"
    query_result = []
    # French departments: 1-19 and 21-95 plus Corsica (2a, 2b); there is no
    # department 20.  Generates the same 96 (corpus, query) tuples the
    # original hand-written list spelled out.
    dep_codes = [str(n) for n in range(1, 96) if n != 20] + ["2a", "2b"]
    corpus_list = [("dep_%s" % code, query) for code in dep_codes]
    # One worker per CPU (processes=None); each worker runs the query on a
    # share of the departmental corpora.
    try:
        pool = Pool(processes=None)
        query_result = pool.starmap(f, corpus_list)
    finally:
        pool.close()
        pool.join()
    if query_result[0] == False:
        return "Erreur de syntaxe"
    allResults = []
    freqParDepartement = defaultdict(int)
    # Flatten all per-department results and record the per-department
    # frequency of the pattern.
    for depResult in query_result:
        for codeDep in depResult:
            freqParDepartement[codeDep] = depResult[codeDep][
                "nbTotalResults"]
            if depResult[codeDep]["results"] != [['']]:
                for result in depResult[codeDep]["results"]:
                    allResults.append({"dep": codeDep, "result": result})
    # Specificity scores from the per-department frequency table.
    freqParDepartementOrdered = OrderedDict(
        sorted(freqParDepartement.items(), key=lambda t: t[0]))
    df_queryFreq = pd.DataFrame(freqParDepartementOrdered,
                                index=["freq"]).fillna(0)
    specif = specificities(df_queryFreq)
    resultsExtract = []
    registry_dir = "/usr/local/share/cwb/registry"
    # Format left/right contexts for a random sample of at most 200 matches.
    # (The original shuffled and then iterated the *whole* list while using
    # only the first 200 entries; slicing avoids the wasted iterations.)
    random.shuffle(allResults)
    for dic in allResults[:200]:
        dep = dic["dep"]
        # Zero-padded single-digit codes map back to corpus names dep_1..dep_9.
        if re.match(r"^0\d$", dep):
            corpus_name = "dep_" + re.match(r"^0(\d)$", dep).group(1).lower()
        else:
            corpus_name = "dep_" + dep.lower()
        r = dic["result"]
        # Attribute handles used to fetch token / POS / lemma per position.
        corpus = Corpus(corpus_name, registry_dir=registry_dir)
        words = corpus.attribute("word", "p")
        postags = corpus.attribute("pos", "p")
        lemmas = corpus.attribute("lemma", "p")
        sentences = corpus.attribute(b"text", "s")
        start = int(r[0])
        end = int(r[1])
        # Boundaries of the tweet containing the match.
        s_bounds = sentences.find_pos(end)
        tweet_positions = range(s_bounds[0], s_bounds[1] + 1)
        left_context = [p for p in tweet_positions if p < start]
        right_context = [p for p in tweet_positions if p > end]
        # Left context (tokens, POS and lemmas).
        lc_tokens = reconstituteString([words[lp] for lp in left_context])
        lc_pos = " ".join(postags[lp] for lp in left_context)
        lc_lemmas = " ".join(lemmas[lp] for lp in left_context)
        # Matched pattern (tokens, POS and lemmas).
        pattern_tokens = reconstituteString(words[start:end + 1])
        pattern_pos = " ".join(postags[start:end + 1])
        pattern_lemmas = " ".join(lemmas[start:end + 1])
        # Right context (tokens, POS and lemmas).
        rc_tokens = reconstituteString([words[rp] for rp in right_context])
        rc_pos = " ".join(postags[rp] for rp in right_context)
        rc_lemmas = " ".join(lemmas[rp] for rp in right_context)
        # HTML is prepared server-side to avoid client-side processing.
        result = {
            "dep": dep,
            "hide_column": lc_tokens[::-1],
            "left_context": "<span title=\"" + lc_pos + " " + lc_lemmas +
                            "\">" + lc_tokens + "</span>",
            "pattern": "<span title=\"" + pattern_pos + " " +
                       pattern_lemmas + "\">" + pattern_tokens + "</span>",
            "right_context": "<span title=\"" + rc_pos + " " + rc_lemmas +
                             "\">" + rc_tokens + "</span>",
        }
        resultsExtract.append(result)
    resultAndSpec = {}
    resultAndSpec["result"] = resultsExtract
    resultAndSpec["specif"] = specif
    resultAndSpec["nbResults"] = int(df_queryFreq.sum().sum())
    resultAndSpec["nbOccurrences"] = freqParDepartement
    resultAndSpec = ujson.dumps(resultAndSpec)
    return resultAndSpec
def f(corpus, query):
    """Run a CQP query on one departmental corpus and format the matches.

    Worker function (used via multiprocessing): queries the CWB corpus,
    then lexicalizes each match with its tweet metadata and left/right
    contexts, pre-formatted as HTML spans.

    :param str corpus: corpus file/name; its basename (without extension)
        is the CWB corpus name, e.g. "dep_75"
    :param str query: CQP query to execute
    :return: one dict per match with keys id/date/geo/dep/user plus the
        HTML-formatted left_context, pattern and right_context
    :rtype: list
    """
    import signal  # local import keeps the worker self-contained

    registry_dir = "/usr/local/share/cwb/registry"
    cqp = PyCQP_interface.CQP(bin='/usr/local/cwb/bin//cqp',
                              options='-c -r ' + registry_dir)
    corpus_name = splitext(basename(corpus))[0].upper()

    # department code, zero-padded to two digits ("1" -> "01")
    dep = corpus_name.split("_")[1].upper()
    if re.match(r"^\d$", dep):
        dep = "0" + dep

    resultDep = []

    # send the query; sort so result order is deterministic
    cqp.Exec(corpus_name + ";")
    cqp.Query(query)
    cqp.Exec("sort Last by word;")

    # Each dumped row holds the corpus positions of the first and last
    # token of one match.
    rsize = int(cqp.Exec("size Last;"))
    results = cqp.Dump(first=0, last=rsize)

    # positional/structural attributes used to lexicalize corpus positions
    corpus = Corpus(corpus_name, registry_dir=registry_dir)
    words = corpus.attribute("word", "p")
    postags = corpus.attribute("pos", "p")
    lemmas = corpus.attribute("lemma", "p")
    sentences = corpus.attribute(b"text", "s")
    text_ids = corpus.attribute(b"text_id", "s")  # renamed: `id` shadowed the builtin
    dates = corpus.attribute(b"text_date", "s")
    geo = corpus.attribute(b"text_geo", "s")
    users = corpus.attribute(b"text_user", "s")
    cqp.Terminate()

    if results != [[""]]:
        for r in results:
            start = int(r[0])
            end = int(r[1])

            # span of the tweet containing the match, and its metadata
            s_bounds = sentences.find_pos(end)
            id_bounds = text_ids.find_pos(end)
            date_bounds = dates.find_pos(end)
            geo_bounds = geo.find_pos(end)
            user_bounds = users.find_pos(end)
            coord = geo_bounds[-1].decode("utf8").split(", ")

            # positions of the left and right contexts within the tweet
            left_context = [pos for pos in range(s_bounds[0], s_bounds[1] + 1)
                            if pos < start]
            right_context = [pos for pos in range(s_bounds[0], s_bounds[1] + 1)
                             if pos > end]

            result = {
                "id": id_bounds[-1],
                "date": date_bounds[-1].decode("utf8").split("T")[0],
                "geo": coord,
                "dep": dep,
                "user": user_bounds[-1],
                "hide_column": "",
                "left_context": "",
                "pattern": "",
                "right_context": ""
            }

            # left context (tokens, POS and lemmas)
            lc_tokens = reconstituteString([words[lp] for lp in left_context])
            lc_pos = " ".join([postags[lp] for lp in left_context])
            lc_lemmas = " ".join([lemmas[lp] for lp in left_context])

            # matched pattern (tokens, POS and lemmas)
            pattern_tokens = reconstituteString(words[start:end + 1])
            pattern_pos = " ".join(postags[start:end + 1])
            pattern_lemmas = " ".join(lemmas[start:end + 1])

            # right context (tokens, POS and lemmas)
            rc_tokens = reconstituteString([words[rp] for rp in right_context])
            rc_pos = " ".join([postags[rp] for rp in right_context])
            rc_lemmas = " ".join([lemmas[rp] for rp in right_context])

            # HTML formatting done server-side to avoid client processing
            result["hide_column"] = lc_tokens[::-1]
            result["left_context"] = "<span title=\"" + lc_pos + " " + lc_lemmas + "\">" + lc_tokens + "</span>"
            result["pattern"] = "<span title=\"" + pattern_pos + " " + pattern_lemmas + "\">" + pattern_tokens + "</span>"
            result["right_context"] = "<span title=\"" + rc_pos + " " + rc_lemmas + "\">" + rc_tokens + "</span>"
            resultDep.append(result)

    # the CQP child process does not exit on its own: kill it explicitly
    # (os.kill replaces the previous shell-out `os.popen("kill -9 ...")`)
    os.kill(cqp.CQP_process.pid, signal.SIGKILL)
    return resultDep
def find(self,
         token,
         begin_time=20140601,
         end_time=20150531,
         board_list=['Gossiping']):
    """Collect concordance lines for *token* into self.conclst.

    Queries the CWB corpus self.corpus_name via CQP, filters matches by
    post time and board, builds (left, mid, right) windows of
    self.window_size tokens around each match, tallies matches per month
    in self.freq_by_month, and finally sorts self.conclst by post_time
    according to self.time_order (1 = ascending, -1 = descending).

    :param token: word form to search for (interpolated into a CQP query)
    :param begin_time: earliest post time to keep, as int -- presumably
        YYYYMMDD, TODO confirm; falsy/None disables the lower bound
    :param end_time: latest post time to keep (same format); falsy/None
        disables the upper bound
    :param board_list: boards to keep; falsy/None disables the filter.
        NOTE(review): mutable default argument -- harmless here since it
        is never mutated, but a None sentinel would be safer.
    :return: None if the query produced no results; otherwise None with
        results accumulated in self.conclst
    """
    # validate filter types only when the filters are truthy
    if begin_time:
        if not isinstance(begin_time, int):
            raise TypeError('"begin_time" should be an "int"')
    if end_time:
        if not isinstance(end_time, int):
            raise TypeError('"end_time" should be an "int"')
    if board_list:
        if not isinstance(board_list, list):
            raise TypeError('"board_list" should be a "list"')
    self.conclst = []
    registry_dir = '/usr/local/share/cwb/registry'
    cqp = PyCQP_interface.CQP(bin='/usr/local/bin/cqp',
                              options='-c -r ' + registry_dir)
    cqp.Exec(self.corpus_name + ";")
    cqp.Query('[word="%s"];' % token)
    # each dumped row holds [match_start, match_end, ...] corpus positions
    rsize = int(cqp.Exec("size Last;"))
    self.results = cqp.Dump(first=0, last=rsize)
    # NOTE(review): kills EVERY cqp process on the machine, not just this
    # one -- concurrent queries would be terminated too; verify intent
    os.system('kill -9 $(pidof cqp)')
    if self.results == [['']]:
        return None
    corpus = Corpus(self.corpus_name, registry_dir=registry_dir)
    # positional attribute for word forms; structural attributes for metadata
    words = corpus.attribute("word", "p")
    ids = corpus.attribute("text_id", "s")
    boards = corpus.attribute("text_board", "s")
    ptimes = corpus.attribute("text_time", "s")
    for num, line in enumerate(self.results, 1):
        # progress indicator (Python 2 print statement)
        print num
        output = dict()
        start = int(line[0])
        end = int(line[1]) + 1  # exclusive end for slicing
        # post_time filter
        ptime = ptimes.find_pos(start)[-1]
        if begin_time != None and end_time != None:
            if begin_time <= int(ptime) <= end_time:
                pass
            else:
                continue
        elif begin_time != None and end_time == None:
            if int(ptime) < begin_time:
                continue
        elif begin_time == None and end_time != None:
            if int(ptime) > end_time:
                continue
        # board_list filter
        board = boards.find_pos(start)[-1]
        if board_list:
            if board not in board_list:
                continue
        # context windows around the match
        lw = words[start - self.window_size:start]
        rw = words[end:end + self.window_size]
        qw = words[start:end]
        left = ' '.join(['%s' % word for word in lw])
        mid = ' '.join(['%s' % word for word in qw])
        right = ' '.join(['%s' % word for word in rw])
        mongoid = ids.find_pos(start)[-1]
        # drop the trailing two digits -- presumably YYYYMMDD -> YYYYMM
        # for the per-month tally; TODO confirm time format
        ptime = int(str(ptime)[:-2])
        print ptime
        self.freq_by_month[ptime] += 1
        output['conc'] = (left, mid, right)
        output['board'] = board
        output['post_time'] = ptime
        output['mongoid'] = mongoid
        self.conclst.append(output)
    # sort concordance lines chronologically per self.time_order
    if self.time_order == -1:
        rev = True
    elif self.time_order == 1:
        rev = False
    else:
        raise ValueError('time order should be either 1 or -1')
    self.conclst.sort(key=lambda x: x['post_time'], reverse=rev)
def find(self, token, show_pos=False, rsize=None):
    """Get concordance of a word.

    Queries the CWB corpus self.corpus_name via CQP and accumulates one
    dict per match in self.conclst, each with the (left, mid, right)
    context windows of self.window_size tokens, the s-attribute metadata
    found at the match position, and a position key for later retrieval.

    :param token: plain word form, or a raw CQP query prefixed with
        'cql:', or a query prefixed with 'ncql:' to be converted by
        convert_cql() first
    :param show_pos: when 1/True, attach "/POS" tags to every token in
        the output (requires a "pos" p-attribute on the corpus)
    :param rsize: maximum number of matches to dump; None means all,
        values larger than the actual result size are clamped
    :return: 'nores' when the query has no results; otherwise None with
        results accumulated in self.conclst
    """
    # Python 2: normalize the token to a UTF-8 byte string
    if isinstance(token, unicode):
        token = token.encode('utf-8')
    else:
        try:
            token.decode('utf-8')
        except BaseException:
            raise UnicodeError('Encoding error!')
    self.conclst = []
    registry_dir = CONF.get('main', 'registry_dir')
    cqp = PyCQP_interface.CQP(bin=CONF.get('main', 'cqp_bin'),
                              options='-c -r ' + registry_dir)
    cqp.Exec(self.corpus_name.upper() + ";")
    # three query modes: raw CQL, "natural" CQL to convert, or bare word
    if token.startswith('cql:'):
        token = token[4:]
        cqp.Query(token)
    elif token.startswith('ncql:'):
        token = token[5:]
        token = convert_cql(token)
        cqp.Query(token)
    else:
        cqp.Query('[word="%s"];' % token)
    # clamp requested size to the actual number of matches
    _rsize = int(cqp.Exec("size Last;"))
    if rsize is None:
        rsize = _rsize
    else:
        if rsize > _rsize:
            rsize = _rsize
    self.results = cqp.Dump(first=0, last=rsize)
    cqp.Terminate()
    if self.results == [['']]:
        return 'nores'
    corpus = Corpus(self.corpus_name, registry_dir=registry_dir)
    words = corpus.attribute("word", "p")
    # discover the corpus's s-attributes by scanning its registry file
    with open(registry_dir + '/' + self.corpus_name) as f:
        cqpreg = f.read()
    s_attrs = re.findall('STRUCTURE\s(\w+)', cqpreg)
    s_attrs_dic = {}
    for attr in s_attrs:
        if attr != 's':  # skip the sentence attribute itself
            s_attrs_dic[attr] = corpus.attribute(attr, "s")
    if show_pos == 1:
        postags = corpus.attribute("pos", "p")
    elif show_pos == 0:
        pass
    for line in self.results:
        output = dict()
        start = int(line[0])
        end = int(line[1]) + 1  # exclusive end for slicing
        lw = words[start - self.window_size:start]
        # NOTE(review): right window is capped by rsize here, not by the
        # per-line match length -- looks odd; verify against caller
        if rsize < self.window_size:
            rw = words[end:end + rsize]
        else:
            rw = words[end:end + self.window_size]
        qw = words[start:end]
        if show_pos == 1:
            lp = postags[start - self.window_size:start]
            rp = postags[end:end + self.window_size]
            qp = postags[start:end]
            left = ' '.join([
                '%s<span>/%s</span>' % (word, pos)
                for word, pos in zip(lw, lp)
            ])
            mid = ' '.join([
                '%s<span>/%s</span>' % (word, pos)
                for word, pos in zip(qw, qp)
            ])
            right = ' '.join([
                '%s<span>/%s</span>' % (word, pos)
                for word, pos in zip(rw, rp)
            ])
        elif show_pos == 0:
            left = ' '.join(['%s' % word for word in lw])
            mid = ' '.join(['%s' % word for word in qw])
            right = ' '.join(['%s' % word for word in rw])
        # metadata of the structural regions covering the match start
        metainfo = dict()
        for k in s_attrs_dic.iterkeys():
            metainfo[k] = s_attrs_dic[k].find_pos(start)[-1]
        output['conc'] = (left, mid, right)
        output['corp_name'] = DB[self.corpus_name]
        output['metainfo'] = metainfo
        # position key "<corpus>_<start>_<end>" to re-locate this match
        output['qpos'] = '%s_%s_%s' % (self.corpus_name, start, end)
        self.conclst.append(output)
class Corpus:
    """Interface to CWB-indexed corpus.

    After initializing, the corpus class has ...

    ... the following attributes:
    .data_path
    .registry_path
    .cqp_bin
    .lib_path
    .corpus_name
    .subcorpus [None]
    .attributes_available
    .corpus_size

    ... the following initialized classes:
    .attributes
    .cache
    .counts

    ... the following methods:
    .__str__
    ._attributes_available
    .start_cqp
    .copy
    .cpos2patts         # p-attributes
    .marginals          # p-attributes
    .cpos2sid           # s-attributes
    .show_nqr           # subcorpora
    .dump_from_s_att    # creating dumps
    .dump_from_query    # creating dumps
    .dump2patt          # working on dumps
    .dump2satt          # working on dumps
    .dump2context       # working on dumps
    .query_s_att        # query alias
    .query              # query alias

    """

    def __init__(self, corpus_name, lib_path=None, cqp_bin='cqp',
                 registry_path='/usr/local/share/cwb/registry/',
                 data_path='/tmp/ccc-data/'):
        """Establish connection to CQP and corpus attributes, set paths, read
        library. Raises KeyError if corpus not in registry.

        :param str corpus_name: name of corpus in CWB registry
        :param str lib_path: /path/to/macros/and/wordlists/
        :param str cqp_bin: /path/to/cqp-binary
        :param str registry_path: /path/to/cwb/registry/
        :param str data_path: /path/to/data/and/cache/
        """
        # process data path: each corpus gets its own sub-directory
        if data_path is not None:
            if not data_path.endswith(corpus_name):
                data_path = os.path.join(data_path, corpus_name)
            self.data_path = data_path
            if not os.path.isdir(self.data_path):
                os.makedirs(self.data_path)
            cache_path = os.path.join(self.data_path, "CACHE")
        else:
            # no data path = no on-disk cache
            self.data_path = None
            cache_path = None

        # set registry path and cqp_bin
        self.registry_path = registry_path
        self.cqp_bin = cqp_bin

        # macros and wordlists
        self.lib_path = lib_path

        # init (sub-)corpus information
        self.corpus_name = corpus_name
        self.subcorpus = None

        # init attributes
        self.attributes = Attributes(self.corpus_name,
                                     registry_dir=self.registry_path)

        # get corpus size (number of tokens = length of the 'word' p-att)
        self.corpus_size = len(self.attributes.attribute('word', 'p'))

        # get available corpus attributes
        self.attributes_available = self._attributes_available()

        # init cache
        self.cache = Cache(cache_path)

        # init counts
        self.counts = Counts(self.corpus_name, self.registry_path)

    def __str__(self):
        """Method for printing.

        :return: corpus_name, corpus_size, data_path, subcorpus
        :rtype: str
        """
        return "\n".join([
            'a ccc.Corpus: "%s"' % self.corpus_name,
            "size        : %s" % str(self.corpus_size),
            "data        : %s" % str(self.data_path),
            "subcorpus   : %s" % str(self.subcorpus),
            "attributes  :",
            self.attributes_available.to_string(),
        ])

    def _attributes_available(self):
        """Get indexed p- and s-attributes. Will be run once when initializing
        the corpus.

        :return: attributes and annotation info
        :rtype: DataFrame
        """
        # use CQP's context descriptor, which lists all attributes
        cqp = self.start_cqp()
        cqp_ret = cqp.Exec('show cd;')
        cqp.__kill__()

        # read as dataframe (tab-separated "show cd;" output)
        attributes = read_csv(
            StringIO(cqp_ret),
            sep='\t',
            names=['type', 'attribute', 'annotation', 'active']).fillna(False)

        # post-process Boolean columns: "*" marks active, "-V" marks
        # annotated (value-carrying) s-attributes
        attributes['active'] = (attributes['active'] == "*")
        attributes['annotation'] = (attributes['annotation'] == '-V')

        return attributes

    def start_cqp(self):
        """Start CQP process.

        :return: CQP process
        :rtype: CQP
        """
        return start_cqp(self.cqp_bin, self.registry_path, self.data_path,
                         self.corpus_name, self.lib_path, self.subcorpus)

    def copy(self):
        """Get a fresh initialization of the corpus.

        :return: corpus
        :rtype: Corpus
        """
        return Corpus(self.corpus_name, self.lib_path, self.cqp_bin,
                      self.registry_path, self.data_path)

    ################
    # p-attributes #
    ################
    def cpos2patts(self, cpos, p_atts=['word'], ignore=True):
        """Retrieve p-attributes of corpus position.

        :param int cpos: corpus position to fill
        :param list p_atts: p-attribute(s) to fill position with
        :param bool ignore: whether to return (None, .*) for -1

        :return: p-attribute(s) at cpos
        :rtype: tuple
        """
        # -1 marks out-of-corpus positions (e.g. unset anchors)
        if cpos == -1 and ignore:
            token = [None] * len(p_atts)
        else:
            token = [
                self.attributes.attribute(p_att, 'p')[cpos]
                for p_att in p_atts
            ]
        return tuple(token)

    def marginals(self, items, p_att='word', flags=0, pattern=False):
        """Extract marginal frequencies for given unigrams or unigram patterns
        of a single p-attribute. 0 if not in corpus. For combinations of
        p-attributes, see marginals_complex.

        :param list items: items to get marginals for
        :param str p_att: p-attribute to get frequencies for
        :param int flags: 1 = %c, 2 = %d, 3 = %cd (will activate wildcards)
        :param bool pattern: activate wildcards?

        :return: frequencies of the items in the whole corpus indexed by items
        :rtype: FreqFrame
        """
        # any flag implies pattern matching
        pattern = True if flags > 0 else pattern
        tokens_all = self.attributes.attribute(p_att, 'p')

        # loop through items and collect frequencies
        counts = list()
        for item in items:
            if not pattern:
                # exact lookup; items not in the corpus get frequency 0
                try:
                    counts.append(tokens_all.frequency(item))
                except KeyError:
                    counts.append(0)
            else:
                # wildcard lookup: count all matching corpus positions
                cpos = tokens_all.find_pattern(item, flags=flags)
                counts.append(len(cpos))

        # create dataframe
        df = DataFrame({'freq': counts, p_att: items})
        df = df.set_index(p_att)

        return df

    def marginals_complex(self, items, p_atts=['word']):
        """Extract marginal frequencies for p-attribute combinations,
        e.g. ["word", "lemma"]. 0 if not in corpus. Marginals are retrieved
        using cwb-scan-corpus, result is cached.

        :param list items: list of tuples
        :param list p_atts: list of p-attributes

        :return: counts of the items in the whole corpus indexed by items
        :rtype: FreqFrame
        """
        # retrieve all marginals for p-att combination from cache if possible
        identifier = "_".join(p_atts) + "_marginals"
        df = self.cache.get(identifier)
        if df is not None:
            logger.info('using cached version of marginals of "%s"' %
                        "_".join(p_atts))
        else:
            # calculate all marginals for p-att combination
            df = cwb_scan_corpus(None, self.corpus_name, self.registry_path,
                                 p_atts)
            self.cache.set(identifier, df)

        # select relevant rows; missing items become frequency 0
        df = df.reindex(items)
        df = df.fillna(0, downcast='infer')
        return df

    ################
    # s-attributes #
    ################
    def cpos2sid(self, cpos, s_att):
        """Get cwb-id of s-att at cpos. -1 if not present.

        :param int cpos: corpus position
        :param str s_atts: s-attribute to get cwb-id for

        :return: cwb-id
        :rtype: int
        """
        s_attributes = self.attributes.attribute(s_att, "s")
        try:
            return s_attributes.cpos2struc(cpos)
        except KeyError:
            # cpos not covered by any region of this s-attribute
            return -1

    ##############
    # subcorpora #
    ##############
    def show_nqr(self):
        """Get subcorpora defined in CQP as DataFrame.

        :return: available subcorpora
        :rtype: DataFrame
        """
        cqp = self.start_cqp()
        cqp_return = cqp.Exec("show named;")
        try:
            df = read_csv(StringIO(cqp_return), sep="\t", header=None)
            df.columns = ["storage", "corpus:subcorpus", "size"]
            # split "CORPUS:subcorpus" into two separate columns
            crpssbcrps = df["corpus:subcorpus"].str.split(":", 1).str
            df['corpus'] = crpssbcrps[0]
            df['subcorpus'] = crpssbcrps[1]
            df.drop('corpus:subcorpus', axis=1, inplace=True)
            df = df[['corpus', 'subcorpus', 'size', 'storage']]
        except EmptyDataError:
            # "show named;" returned nothing
            logger.info("no subcorpora defined")
            df = DataFrame()
        cqp.__kill__()
        return df

    def activate_subcorpus(self, nqr=None, df_dump=None):
        """Activate subcorpus. If no df_dump is given, this sets self.subcorpus
        and logs an error if subcorpus not defined. If a df_dump is given, the
        df_dump will be undumped.

        :param str subcorpus: subcorpus name defined in CQP
        :param DataFrame df_dump: DataFrame indexed by (match, matchend)
                                  with optional columns 'target' and 'keyword'
        """
        if df_dump is not None:
            # materialize the dump as an NQR on disk first
            cqp = self.start_cqp()
            cqp.nqr_from_dump(df_dump, nqr)
            cqp.nqr_save(self.corpus_name, nqr)
            cqp.__kill__()

        if nqr is not None:
            # raise an error if subcorpus not available
            if nqr not in self.show_nqr()['subcorpus'].values:
                logger.error('subcorpus "%s" not defined)' % nqr)
                # fall back to the full corpus
                self.activate_subcorpus()
            else:
                logger.info('switched to subcorpus "%s"' % nqr)
        else:
            logger.info('switched to corpus "%s"' % self.corpus_name)

        # activate subcorpus
        self.subcorpus = nqr

    ##################
    # CREATING DUMPS #
    ##################
    def dump_from_s_att(self, s_att, annotation=True):
        """Create s-attribute spans as DataFrame of corpus positions.

        Resulting df_dump is indexed by (match, matchend).

        Note that s-attribute values are not indexed by the CWB.
        CWB.CL implementation just iterates over all annotations.

        This method thus creates a dataframe with the
        (non-overlapping) spans encoded as matches

        === (match, matchend) $s_cwbid, $s* ===

        and caches the result.

        $s is only created if annotation is True and attribute is
        actually annotated.

        :param str s_att: s-attribute to get spans and annotation for
        :param bool annotation: whether to retrieve annotation (if present)

        :return: df_dump
        :rtype: DataFrame
        """
        # retrieve from cache if possible
        identifier = s_att + "_spans"
        df = self.cache.get(identifier)
        if df is not None:
            logger.info('using cached version of spans of "%s"' % s_att)
            return df

        # compute
        logger.info('creating dataframe of spans of "%s"' % s_att)

        df = DataFrame(list(self.attributes.attribute(s_att, 's')))

        # two or three columns: 0 (start), 1 (end), 2* (annotation)
        if annotation:
            annotation = (2 in df.columns)  # just to make sure ...
            if not annotation:
                logger.info('s-att "%s" does not have any annotation' % s_att)
            else:
                # CWB.CL yields bytes; decode the annotation column
                df[2] = df[2].apply(lambda x: x.decode('utf-8'))

        # post-process
        df = df.reset_index()
        df = df.rename(
            {
                'index': s_att + '_cwbid',
                0: 'match',
                1: 'matchend',
                2: s_att
            },
            axis=1)
        df = df.set_index(['match', 'matchend'])

        # put into cache
        self.cache.set(identifier, df)

        return df

    def dump_from_query(self, query, s_query=None, anchors=[],
                        match_strategy='standard', name='Last', save=False):
        """Execute query, get DataFrame of corpus positions (CWB dump).

        Resulting df_dump is indexed by (match, matchend).

        Note that in the CWB, only two anchors can be active
        simultaneously. The method thus runs the query once with
        anchors set to [0, 1], and then collects the remaining anchor
        points by running the query again on the NQR of the first
        query run for each pair of remaining anchor points.

        Optional columns for each anchor:

        === (match, matchend), 0*, ..., 9* ===

        The result is cached.

        :param str query: valid CQP query (without 'within' clause)
        :param str s_query: s-attribute used for initial query
        :param list anchors: anchors to search for
        :param str match_strategy: CQP matching strategy
        :param str name: name for NQR
        :param bool save: whether to save NQR to disk

        :return: df_dump
        :rtype: DataFrame
        """
        # identify query
        if self.subcorpus is not None:
            # check subcorpus size to avoid confusion when re-naming
            cqp = self.start_cqp()
            sbcrpssize = cqp.Exec("size %s" % self.subcorpus)
            cqp.__kill__()
        else:
            sbcrpssize = None
        # cache key covers everything that can change the result
        identifier = self.cache.generate_idx([
            query, s_query, anchors, match_strategy, self.subcorpus,
            sbcrpssize
        ], prefix="df_dump:")

        # retrieve from cache if possible
        df_dump = self.cache.get(identifier)
        if df_dump is not None:
            logger.info(
                'using cached version "%s" of df_dump with %d matches' %
                (identifier, len(df_dump)))
            return df_dump

        # init cqp and set matching strategy
        cqp = self.start_cqp()
        cqp.Exec('set MatchingStrategy "%s";' % match_strategy)

        # include optional within clause
        if s_query is None:
            start_query = query
        else:
            start_query = query + ' within ' + s_query

        # first run: anchors at 0 and 1 (considering within clause)
        logger.info("running CQP query")
        cqp.Exec('set ant 0; ank 1;')
        df_dump = cqp.nqr_from_query(query=start_query,
                                     name=name,
                                     match_strategy=match_strategy,
                                     return_dump=True)
        df_dump.columns = [0, 1]
        logger.info("found %d matches" % len(df_dump))

        # if there's nothing to return ...
        if df_dump.empty:
            cqp.__kill__()
            return df_dump

        # join all other anchors (two at a time -- CWB limitation)
        remaining_anchors = list(chunk_anchors(anchors, 2))
        if len(remaining_anchors) > 0:

            # restrict subsequent queries on initial matches
            cqp.nqr_activate(self.corpus_name, name)

            for pair in remaining_anchors:
                logger.info(".. running query for anchor(s) %s" % str(pair))
                # set appropriate anchors
                cqp.Exec('set ant %d;' % pair[0])
                if len(pair) == 2:
                    cqp.Exec('set ank %d;' % pair[1])
                else:
                    # dummy keyword anchor; its column is dropped below
                    cqp.Exec('set ank %d;' % 1)
                # dump new anchors
                cqp.Query('tmp = <match> ( %s );' % query)
                df = cqp.Dump("tmp")
                # select columns and join to global df
                if len(pair) == 2:
                    df.columns = [pair[0], pair[1]]
                else:
                    df.columns = [pair[0], 1]
                    df = df.drop(1, axis=1)
                df_dump = df_dump.join(df)

            # NA handling: -1 marks anchors that did not match
            logger.info("post-processing dataframe")
            df_dump = df_dump.dropna(axis=1, how='all')
            df_dump = df_dump.fillna(-1, downcast='integer')
            # restrict output to requested anchors
            df_dump = df_dump[anchors]

        # put into cache
        self.cache.set(identifier, df_dump)

        if save:
            cqp.nqr_save(self.corpus_name, name)

        cqp.__kill__()

        return df_dump

    #################################################
    # WORKING ON DUMPS ##############################
    #################################################
    def _dump2patt_row(self, row, p_att, start, end):
        """Retrieve p-attribute annotation from start to end of one row.

        :param Series row: dataframe row that contain start and end keys
        :param str p_att: p-attribute to retrieve
        :param str start: key of start column (int or str)
        :param str end: key of end column (int or str)

        :return: p-attribute annotation
        :rtype: str
        """
        cpos_start = row[start]
        cpos_end = row[end]

        # if both are missing, return empty string
        if cpos_start == cpos_end == -1:
            return ""

        # if one of them is missing, set start = end or end = start
        if cpos_start == -1:
            cpos_start = cpos_end
        if cpos_end == -1:
            cpos_end = cpos_start

        # lexicalize: join the tokens of the span
        p = self.attributes.attribute(p_att, 'p')
        return " ".join(p[int(cpos_start):int(cpos_end) + 1])

    def dump2patt(self, df_dump, p_att='word', start='match', end='matchend'):
        """Retrieve p-attribute annotation from start to end.

        === (match, matchend), $p ===

        Any additional columns of df_dump are preserved as long as
        there are no conflicts (in which case the original column will
        be overwritten).

        :param DataFrame df_dump: DataFrame with specified columns
                                  (possibly as index)
        :param str p_att: p-attribute to retrieve
        :param str start: key of start column (int or str)
        :param str end: key of end column (int or str)

        :return: df_dump
        :rtype: DataFrame
        """
        # pre-process: remember the index so it can be restored
        index_names = df_dump.index.names
        df = df_dump.reset_index()

        # retrieve attribute (row-wise; see _dump2patt_row)
        df[p_att] = df.apply(
            lambda row: self._dump2patt_row(row, p_att, start, end), axis=1)

        # post-process
        df = df.set_index(index_names)

        return df

    def dump2satt(self, df_dump, s_att, annotation=True):
        """Retrieve cwb-id, span, and annotation of s-attribute at match.

        Note that this only takes into account the match, not the
        matchend. This is reasonable assuming that the retrieved s-att
        comprises complete matches (match..matchend).

        === (match, matchend), $s_cwbid, $s_span, $s_spanend, $s* ===

        $s is only created if annotation is True and attribute is
        actually annotated.

        Any additional columns of df_dump are preserved as long as
        there are no conflicts (in which case the original columns
        will be overwritten).

        :param DataFrame df_dump: DataFrame indexed by (match, matchend)
        :param str s_att: s-attribute to retrieve
        :param bool annotation: whether to retrieve annotation of s-att

        :return: df_dump
        :rtype: DataFrame
        """
        # init dataframe
        df = df_dump.reset_index()[['match', 'matchend']]

        # get $s_id
        df[s_att + "_cwbid"] = df.match.apply(
            lambda x: self.cpos2sid(x, s_att))
        nr_missing = (df[s_att + '_cwbid'] == -1).sum()
        logger.info('s-att "%s" exists at %d of %d matches' %
                    (s_att, len(df) - nr_missing, len(df)))

        # retrieve where possible (rows whose cwbid is not -1)
        s_attributes = self.attributes.attribute(s_att, "s")
        df_s = df.loc[~(df[s_att + "_cwbid"] == -1)]
        df = df.join(
            DataFrame(index=df_s.index,
                      data=df_s[s_att + "_cwbid"].apply(
                          lambda x: s_attributes[int(x)]).to_list()))

        # two or three columns: 0 (start), 1 (end), 2* (annotation)
        if annotation:
            annotation = (2 in df.columns)  # just to make sure ...
            if not annotation:
                logger.info('s-att "%s" does not have any annotation' % s_att)
            else:
                df[2] = df[2].apply(lambda x: x.decode('utf-8'))

        # restore original index and post-process
        df = df.set_index(['match', 'matchend'])
        df = df.rename({
            0: s_att + '_span',
            1: s_att + '_spanend',
            2: s_att
        }, axis=1)

        # join to original dataframe; conflicting old columns get a
        # '_bak' suffix and are dropped right after (i.e. overwritten)
        df_dump = df_dump.join(df, lsuffix='_bak')
        df_dump = df_dump[[
            col for col in df_dump if not str(col).endswith('_bak')
        ]]
        df_dump[[s_att + '_span', s_att + '_spanend'
                 ]] = df[[s_att + '_span', s_att +
                          '_spanend']].fillna(-1, downcast='infer')

        return df_dump

    def dump2context(self, df_dump, context_left, context_right,
                     context_break):
        """Extend df_dump to context, breaking the context at context_break.

        === (match, matchend), contextid*, context, contextend ===

        Columns for $context_break, $context_break_cwbid,
        $context_break_span, $context_break_spanend, are also created
        (if applicable).

        Any additional columns of df_dump are preserved.

        Note that in contrast to matches, contexts may overlap.

        For positions where the s-att specified by context_break does
        not exist, context_break is ignored.

        The context creation algorithm does not take into account that
        match and matchend may be part of different spans defined by
        context_break; it only looks at the s-attributes of match, not
        of matchend.

        For the _context_ column (left hand side), the strategy is as
        follows; the strategy for _contextend_ (right hand side) is
        analogous (using context_right, matchend and
        context_break_spanend).

        if context_break_span is None and context_left is None
            => context = match
        if context_break_span is None and context_left is not None
            => context = max(0, match - context_left)
        if context_break_span is not None and context_left is None
            => context = context_break_span
        if context_break_span is not None and context_left is not None
            => context = max(match - context_left, s_start)

        :param DataFrame df_dump: DataFrame indexed by (match, matchend)
        :param int context_left: maximum context to the left of match
        :param int context_right: maximum context to the right of matchend
        :param str context_break: s-attribute to confine context

        """
        if context_break is None:
            df = df_dump.reset_index()
            # left
            if context_left is None:
                df['context'] = df.match
            else:
                # element-wise clamp to corpus start (presumably
                # numpy.maximum imported at module level -- TODO confirm)
                df['context'] = maximum(0, df.match - context_left)
            # right
            if context_right is None:
                df['contextend'] = df.matchend
            else:
                # element-wise clamp to corpus end
                df['contextend'] = minimum(self.corpus_size - 1,
                                           df.matchend + context_right)
        else:
            # get context confined by s-attribute
            df = self.dump2satt(df_dump, context_break,
                                annotation=False).reset_index()

            # save contextid
            df['contextid'] = df[context_break + '_cwbid']

            # replace -1 to not confuse min()
            df[[context_break + '_spanend'
                ]] = df[[context_break + '_spanend'
                         ]].replace(-1, self.corpus_size + 1)

            # left
            if context_left is None:
                df['context'] = df[context_break + '_span']
            else:
                df['context'] = df.match - context_left
                df['context'] = df[['context', context_break +
                                    '_span']].max(axis=1)

            # right
            if context_right is None:
                df['contextend'] = df[context_break + '_spanend']
            else:
                df['contextend'] = df.matchend + context_right
                df['contextend'] = df[[
                    'contextend', context_break + '_spanend'
                ]].min(axis=1)

            # revert replacement
            df[[context_break + '_spanend'
                ]] = df[[context_break + '_spanend'
                         ]].replace(self.corpus_size + 1, -1)

        # restore original index
        df = df.set_index(['match', 'matchend'])

        return df

    #################################################
    # QUERY ALIASES #################################
    #################################################
    def query_s_att(self, s_att, values=set(), name=None):
        """Get s-attribute spans as Dump, optionally restricting the spans by
        matching the provided values against the s-att annotations.

        === (match, matchend), $s_cwbid, $s* ===

        :param str s_att: s-attribute to use for spans
        :param set values: values of s-att annotation to restrict spans to

        :return: dump
        :rtype: Dump
        """
        df_spans = self.dump_from_s_att(s_att)

        # restrict to certain values
        if len(values) > 0:
            if s_att not in df_spans.columns:
                # cannot filter an unannotated s-attribute: empty result
                logger.error("cannot restrict spans without annotation")
                df_spans = DataFrame(columns=['match', 'matchend']).set_index(
                    ['match', 'matchend'])
            else:
                values = set(values)
                logger.info("restricting spans using %d values" % len(values))
                df_spans = df_spans.loc[df_spans[s_att].isin(values)]

        # save as NQR
        if name is not None:
            # undump the dump and save to disk
            cqp = self.start_cqp()
            cqp.nqr_from_dump(df_spans, name)
            cqp.nqr_save(self.corpus_name, name)
            cqp.__kill__()

        # return proper Dump
        return Dump(self.copy(), df_spans, name_cqp=None)

    def query(self, cqp_query, context=20, context_left=None,
              context_right=None, context_break=None, corrections=dict(),
              match_strategy='standard', name=None):
        """Get query result as (context-extended) Dump (with corrected
        anchors). If a name is given, the resulting NQR (without context and
        before anchor correction) will be written to disk in CWB binary
        format.

        === (match, matchend), 0*, ..., 9*, contextid*, context*, contextend* ===

        :param str query: CQP query
        :param int context: maximum context around match..matchend (symmetric)
        :param int context_left: maximum context left to the match
        :param int context_right: maximum context right to the matchend
        :param str context_break: s-attribute to confine context to
        :param dict corrections: corrections {anchor: offset}
        :param str match_strategy: CQP matching strategy
        :param str name: name for NQR

        :return: dump
        :rtype: Dump
        """
        # preprocess input
        save = False if name is None else True  # save NQR from CQP to disk?
        name = 'Last' if name is None else name  # name in CQP
        query, s_query, anchors = preprocess_query(cqp_query)
        s_query = context_break if s_query is None else s_query
        context_left = context if context_left is None else context_left
        context_right = context if context_right is None else context_right

        # get dump from query
        df_dump = self.dump_from_query(query=query,
                                       s_query=s_query,
                                       anchors=anchors,
                                       match_strategy=match_strategy,
                                       name=name,
                                       save=save)

        # if dump has been retrieved from cache, NQR might not exist
        if self.show_nqr().empty or \
           name not in self.show_nqr()['subcorpus'].values:
            # undump the dump and save to disk
            cqp = self.start_cqp()
            cqp.nqr_from_dump(df_dump, name)
            cqp.nqr_save(self.corpus_name, name)
            cqp.__kill__()

        # empty return?
        if len(df_dump) == 0:
            logger.warning("found 0 matches")
            df_dump = DataFrame(columns=['match', 'matchend']).set_index(
                ['match', 'matchend'])
        else:
            # extend dump to context
            df_dump = self.dump2context(df_dump, context_left, context_right,
                                        context_break)
            # apply corrections to anchor points
            df_dump = correct_anchors(df_dump, corrections)

        # return proper dump
        return Dump(self.copy(), df_dump, name_cqp=name)