import sys
from collections import defaultdict

from CWB.CL import Corpus

# `oparse` (the script's OptionParser) and CQP_REGISTRY are assumed to
# be defined at module level.


def cqp2vocab_main(argv=None):
    opts, args = oparse.parse_args(argv)
    frequencies = defaultdict(int)
    for arg in args:
        crp = Corpus(arg, registry_dir=CQP_REGISTRY)
        att = crp.attribute(opts.attr, 'p')
        if opts.encoding is not None and crp.get_encoding() != opts.encoding:
            print >> sys.stderr, "Recoding %s items from %s to %s" % (
                arg, crp.get_encoding(), opts.encoding)
            to_uni = crp.to_unicode
            enc = opts.encoding
            recode = lambda w: to_uni(w).encode(enc)
        else:
            recode = lambda x: x
        # Walk the attribute's lexicon and sum corpus frequencies.
        dic = att.getDictionary()
        for i in xrange(len(dic)):
            word = dic.get_word(i)
            frequencies[recode(word)] += att.frequency(word)
    # Drop words below the threshold; in Python 2, keys() returns a
    # list, so deleting entries while iterating is safe here.
    for word in frequencies.keys():
        if frequencies[word] < opts.threshold:
            del frequencies[word]
    if opts.out_fname is None:
        f_out = sys.stdout
    else:
        f_out = file(opts.out_fname, 'w')
    for word in sorted(frequencies):
        print >> f_out, word
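# A hedged usage sketch: the option flags below (-a, -t, -o mapping to
# opts.attr, opts.threshold and opts.out_fname) are assumptions about
# the `oparse` setup, and TUEPP is a hypothetical corpus name.
#
#     cqp2vocab_main(['-a', 'word', '-t', '5', '-o', 'vocab.txt', 'TUEPP'])
#
# This would write every `word` form occurring at least 5 times in
# TUEPP to vocab.txt, one form per line, sorted alphabetically.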
# make_frequencies, CPPAlphabet and MAX_LIST are project-level helpers
# assumed to be importable alongside this function; os, sys, defaultdict
# and Corpus are used as above.
def make_bigram_alph(corpora, attr_name='word', suffix='', outdir='.'):
    unigram_freqs = defaultdict(int)
    bigram_freqs = defaultdict(int)
    for corpus_name in corpora:
        print >> sys.stderr, "Reading corpus: %s" % (corpus_name, )
        corpus = Corpus(corpus_name)
        att = corpus.attribute(attr_name, 'p')
        unigram_list, bigram_list = make_frequencies(att)
        for v, k in unigram_list:
            unigram_freqs[k] += v
        for v, k in bigram_list:
            bigram_freqs[k] += v
    # Keep only the MAX_LIST most frequent unigrams: sort descending
    # by count, then truncate.
    unigram_list = [(v, k) for (k, v) in unigram_freqs.iteritems()]
    unigram_list.sort(reverse=True)
    del unigram_list[MAX_LIST:]
    unigram_alph = CPPAlphabet()
    for c, k in unigram_list:
        # Looking up a key in a growing alphabet assigns it the next id.
        unigram_alph[k]
    unigram_alph.tofile(
        file(os.path.join(outdir, 'unigram%s_alph.txt' % (suffix, )), 'w'))
    # Same for the MAX_LIST most frequent bigrams.
    bigram_list = [(v, k) for (k, v) in bigram_freqs.iteritems()]
    bigram_list.sort(reverse=True)
    del bigram_list[MAX_LIST:]
    bigram_alph = CPPAlphabet()
    for c, k in bigram_list:
        bigram_alph[k]
    bigram_alph.tofile(
        file(os.path.join(outdir, 'bigram%s_alph.txt' % (suffix, )), 'w'))
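# Hedged usage sketch; the corpus names, suffix and output directory
# are hypothetical:
#
#     make_bigram_alph(['TIGER', 'TUEBA'], attr_name='word',
#                      suffix='_de', outdir='alphabets')
#
# This would write alphabets/unigram_de_alph.txt and
# alphabets/bigram_de_alph.txt, each listing the MAX_LIST most
# frequent items pooled over both corpora.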
from django.http import HttpResponse

# `cy` (which decodes the encrypted query position) is assumed to be
# imported at module level.


def con_source(request, qpos):
    window_size = 100
    corp_name, start, end = cy.decrypt(qpos)
    start, end = int(start), int(end)
    corpus = Corpus(corp_name.upper(),
                    registry_dir='/usr/local/share/cwb/registry')
    words = corpus.attribute('word', 'p')
    corp_len = len(words)
    # Clamp the context window to the corpus boundaries.
    if start - window_size < 0:
        lb = 0
    else:
        lb = start - window_size
    if end + window_size > corp_len:
        rb = corp_len - 1
    else:
        rb = end + window_size
    lw = ''.join(words[lb:start])
    qw = ('<span style="color:red;font-size:24px;">'
          + ''.join(words[start:end]) + '</span>')
    rw = ''.join(words[end:rb])
    if corp_name in ('tccm', 'ntuspk'):
        if corp_name == 'tccm':
            s_attrs = corpus.attribute('s_addresser', 's')
        if corp_name == 'ntuspk':
            s_attrs = corpus.attribute('s_speaker', 's')
        # Map the window boundaries to their enclosing s-regions.
        top = s_attrs.cpos2struc(lb)
        top = s_attrs[top]
        bottom = s_attrs.cpos2struc(rb)
        bottom = s_attrs[bottom]
        attr_con = []
        for attr in s_attrs:
            if attr[0] >= top[0] and attr[1] <= bottom[1]:
                attr_con.append(attr)
        output = ''
        for a in attr_con:
            if start in xrange(a[0], a[1]):
                # The region containing the match: highlight the hit.
                sent = (a[-1] + ': ' +
                        ' '.join(words[a[0]:start]) + ' ' +
                        '<span style="color:red;font-size:24px;">' +
                        ' '.join(words[start:end]) + '</span> ' +
                        ' '.join(words[end:a[1]]))
            else:
                sent = '%s: %s' % (a[-1], ' '.join(words[a[0]:a[1]]))
            output += sent + '<br>'
        return HttpResponse(output)
    return HttpResponse(lw + qw + rw)
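# Hedged sketch of how this view might be routed; the URL pattern is an
# assumption, not the project's actual urls.py (Django 1.x style url()
# to match the Python 2 code above):
#
#     from django.conf.urls import url
#     from . import views
#
#     urlpatterns = [
#         url(r'^source/(?P<qpos>.+)$', views.con_source),
#     ]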
def create_bow_tag(corpora, language, pos_tags, outdir='.', alph_suffix=''):
    # Step 1: extract unigram distributions for words
    # NB: besides its parameters, this function reads the module-level
    # `opts` (from option parsing) and `prefix_l`; CPPUniAlphabet,
    # gather_word_vectors and the *_by_pos helpers are project-level.
    unigram_alph = CPPUniAlphabet()
    unigram_alph.fromfile(
        file(os.path.join(outdir, 'unigram%s_alph.txt' % (alph_suffix, ))))
    unigram_alph.growing = False
    bigram_alph = CPPUniAlphabet()
    bigram_alph.fromfile(
        file(os.path.join(outdir, 'bigram%s_alph.txt' % (alph_suffix, ))))
    bigram_alph.growing = False
    infix = '_'.join(prefix_l)
    if infix != '':
        infix = '_' + infix
    if opts.limit != -1:
        prefix_l.append('%d' % (opts.limit / 1000))
    word_matrix = None
    word_alphs = get_word_alphs_by_pos(language)
    for word_pos in pos_tags:
        word_alph = word_alphs[word_pos]
        word_feat_alph = CPPUniAlphabet()
        for corpus_name in corpora:
            corpus = Corpus(corpus_name)
            att = corpus.attribute(opts.attr_name, 'p')
            att_find = corpus.attribute('tb_lemma', 'p')
            att_sent = corpus.attribute('s', 's')
            # (pair_alphs is computed here but not used below.)
            pair_alphs = get_pair_alphs_by_pos(opts.language)
            word_alphs = get_word_alphs_by_pos(opts.language)
            print "word features for %s in %s" % (word_pos, corpus_name)
            wmat = gather_word_vectors(
                list(word_alph), att, att_find, att_sent,
                unigram_alph, bigram_alph, word_feat_alph,
                forward_mapping_by_pos(word_pos), opts.limit)
            # Pool the count matrices across corpora.
            if word_matrix is None:
                word_matrix = wmat
            else:
                word_matrix += wmat
        word_feat_alph.tofile_utf8(
            file(os.path.join(opts.outdir,
                              'word_bow%s%s_alph.txt' % (infix, word_pos)),
                 'w'))
        word_matrix.write_binary(
            file(os.path.join(opts.outdir,
                              'word_bow%s%s_mtx.bin' % (infix, word_pos)),
                 'w'))
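# Hedged usage sketch; the corpus name and POS tags are hypothetical,
# and the call only works once the module-level `opts` and `prefix_l`
# have been initialised by the script's option parsing:
#
#     create_bow_tag(['TUEPP'], 'de', ['NN', 'ADJA'],
#                    outdir='alphabets', alph_suffix='_de')
#
# For each POS tag this writes a feature alphabet (word_bow*_alph.txt)
# and a binary count matrix (word_bow*_mtx.bin) into opts.outdir.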
import os
import re
from os.path import basename, splitext

import PyCQP_interface
from CWB.CL import Corpus

# `reconstituteString` (detokenisation helper) is assumed to be defined
# elsewhere in the module.


def f(corpus, query):
    """
    Send the query to CQP and format the data that comes back.

    Input: the corpus file the query should run on, and the query itself.
    Returns: a list of result dictionaries, one per match.
    """
    registry_dir = "/usr/local/share/cwb/registry"
    # cqp = PyCQP_interface.CQP(bin='/usr/local/bin/cqp',
    #                           options='-c -r ' + registry_dir)
    cqp = PyCQP_interface.CQP(bin='/usr/local/cwb/bin/cqp',
                              options='-c -r ' + registry_dir)
    corpus_name = splitext(basename(corpus))[0].upper()
    # The département number is the part of the corpus name after the
    # underscore; pad single digits with a leading zero.
    dep = corpus_name.split("_")[1].upper()
    if re.match(r"^\d$", dep):
        dep = "0" + dep
    resultDep = []
    # Submit the query.
    cqp.Exec(corpus_name + ";")
    cqp.Query(query)
    cqp.Exec("sort Last by word;")
    # The results come back as a list (results) with one entry per
    # match; each entry gives the corpus positions of the first and
    # last token of the matched pattern.
    rsize = int(cqp.Exec("size Last;"))
    results = cqp.Dump(first=0, last=rsize)
    # Used below to look up the token, POS tag or lemma at a position.
    corpus = Corpus(corpus_name, registry_dir=registry_dir)
    words = corpus.attribute("word", "p")
    postags = corpus.attribute("pos", "p")
    lemmas = corpus.attribute("lemma", "p")
    sentences = corpus.attribute(b"text", "s")
    id = corpus.attribute(b"text_id", "s")
    dates = corpus.attribute(b"text_date", "s")
    geo = corpus.attribute(b"text_geo", "s")
    users = corpus.attribute(b"text_user", "s")
    cqp.Terminate()
    if results != [[""]]:
        for r in results:
            left_context = []
            right_context = []
            start = int(r[0])
            end = int(r[1])
            # Start and end positions of the tweet containing the match.
            s_bounds = sentences.find_pos(end)
            # Its attributes (id, date, coordinates and user id).
            id_bounds = id.find_pos(end)
            date_bounds = dates.find_pos(end)
            geo_bounds = geo.find_pos(end)
            user_bounds = users.find_pos(end)
            coord = geo_bounds[-1].decode("utf8").split(", ")
            # Positions of the words in the left and right contexts.
            for pos in range(s_bounds[0], s_bounds[1] + 1):
                if pos < start:
                    left_context.append(pos)
                if pos > end:
                    right_context.append(pos)
            # Dictionary holding the information we care about.
            result = {
                "id": id_bounds[-1],
                "date": date_bounds[-1].decode("utf8").split("T")[0],
                "geo": coord,
                "dep": dep,
                "user": user_bounds[-1],
                "hide_column": "",
                "left_context": "",
                "pattern": "",
                "right_context": ""
            }
            lc_tokens = []
            lc_pos = []
            lc_lemmas = []
            rc_tokens = []
            rc_pos = []
            rc_lemmas = []
            # Left context (tokens, POS tags and lemmas).
            for lp in left_context:
                lc_tokens.append(words[lp])
                lc_pos.append(postags[lp])
                lc_lemmas.append(lemmas[lp])
            lc_tokens = reconstituteString(lc_tokens)
            lc_pos = " ".join(lc_pos)
            lc_lemmas = " ".join(lc_lemmas)
            # The matched pattern (tokens, POS tags and lemmas).
            pattern_tokens = reconstituteString(words[start:end + 1])
            pattern_pos = " ".join(postags[start:end + 1])
            pattern_lemmas = " ".join(lemmas[start:end + 1])
            # Right context (tokens, POS tags and lemmas).
            for rp in right_context:
                rc_tokens.append(words[rp])
                rc_pos.append(postags[rp])
                rc_lemmas.append(lemmas[rp])
            rc_tokens = reconstituteString(rc_tokens)
            rc_pos = " ".join(rc_pos)
            rc_lemmas = " ".join(rc_lemmas)
            # Format here rather than spending processing time on the
            # client side; hide_column holds the reversed left context.
            result["hide_column"] = lc_tokens[::-1]
            result["left_context"] = ('<span title="' + lc_pos + ' ' +
                                      lc_lemmas + '">' + lc_tokens +
                                      '</span>')
            result["pattern"] = ('<span title="' + pattern_pos + ' ' +
                                 pattern_lemmas + '">' + pattern_tokens +
                                 '</span>')
            result["right_context"] = ('<span title="' + rc_pos + ' ' +
                                       rc_lemmas + '">' + rc_tokens +
                                       '</span>')
            resultDep.append(result)
    # Kill the CQP process explicitly; otherwise it does not terminate.
    os.popen("kill -9 " + str(cqp.CQP_process.pid))
    return resultDep
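# Hedged example of calling f(); the corpus file name (whose stem,
# here CORPUS_75, must be a registered CWB corpus with the text_*
# attributes used above) and the CQP query are hypothetical:
#
#     hits = f("corpus_75.vrt", '[lemma="aller"] [pos="VER.*"];')
#     for hit in hits:
#         print(hit["date"], hit["dep"], hit["pattern"])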