Example #1
def cqp2vocab_main(argv=None):
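    """Build a frequency-thresholded vocabulary from one or more CWB corpora and write it out, one item per line."""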
    opts, args = oparse.parse_args(argv)
    frequencies = defaultdict(int)
    for arg in args:
        crp = Corpus(arg, registry_dir=CQP_REGISTRY)
        att = crp.attribute(opts.attr, 'p')
        if opts.encoding is not None and crp.get_encoding() != opts.encoding:
            print >> sys.stderr, "Recoding %s items from %s to %s" % (
                arg, crp.get_encoding(), opts.encoding)
            to_uni = crp.to_unicode
            enc = opts.encoding
            recode = lambda w: to_uni(w).encode(enc)
        else:
            recode = lambda x: x
        dic = att.getDictionary()
        for i in xrange(len(dic)):
            word = dic.get_word(i)
            frequencies[recode(word)] += att.frequency(word)
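    # drop all entries below the frequency threshold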
    for word in frequencies.keys():
        if frequencies[word] < opts.threshold:
            del frequencies[word]
    if opts.out_fname is None:
        f_out = sys.stdout
    else:
        f_out = file(opts.out_fname, 'w')
    for word in sorted(frequencies):
        print >> f_out, word
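
The function above reads its options from a module-level oparse parser and a CQP_REGISTRY constant that are not shown. A minimal sketch of what that setup might look like, assuming the CWB.CL bindings and purely hypothetical option names derived from the attributes used in the function:

# Hypothetical module-level setup assumed by cqp2vocab_main (not the original code).
import sys
from collections import defaultdict
from optparse import OptionParser
from CWB.CL import Corpus

CQP_REGISTRY = '/usr/local/share/cwb/registry'  # assumed registry path

oparse = OptionParser(usage='%prog [options] CORPUS ...')
oparse.add_option('--attr', dest='attr', default='word',
                  help='positional attribute to read')
oparse.add_option('--encoding', dest='encoding', default=None,
                  help='recode items to this encoding')
oparse.add_option('--threshold', dest='threshold', type='int', default=1,
                  help='minimum frequency for an item to be kept')
oparse.add_option('-o', '--out', dest='out_fname', default=None,
                  help='output file (default: stdout)')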
Example #2
def make_bigram_alph(corpora, attr_name='word', suffix='', outdir='.'):
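    """Count unigram and bigram frequencies over the given corpora and write
    the (size-capped) unigram and bigram alphabets to outdir."""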
    unigram_freqs = defaultdict(int)
    bigram_freqs = defaultdict(int)
    for corpus_name in corpora:
        print >> sys.stderr, "Reading corpus: %s" % (corpus_name, )
        corpus = Corpus(corpus_name)
        att = corpus.attribute(attr_name, 'p')
        unigram_list, bigram_list = make_frequencies(att)
        for v, k in unigram_list:
            unigram_freqs[k] += v
        for v, k in bigram_list:
            bigram_freqs[k] += v
    unigram_list = [(v, k) for (k, v) in unigram_freqs.iteritems()]
    # sort by descending frequency so truncation keeps the most frequent entries
    unigram_list.sort(reverse=True)
    del unigram_list[MAX_LIST:]
    unigram_alph = CPPAlphabet()
    for c, k in unigram_list:
        unigram_alph[k]
    unigram_alph.tofile(
        file(os.path.join(outdir, 'unigram%s_alph.txt' % (suffix, )), 'w'))
    bigram_list = [(v, k) for (k, v) in bigram_freqs.iteritems()]
    # likewise keep only the MAX_LIST most frequent bigrams
    bigram_list.sort(reverse=True)
    del bigram_list[MAX_LIST:]
    bigram_alph = CPPAlphabet()
    for c, k in bigram_list:
        bigram_alph[k]
    bigram_alph.tofile(
        file(os.path.join(outdir, 'bigram%s_alph.txt' % (suffix, )), 'w'))
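
make_frequencies is not shown above; judging from how its return values are consumed (lists of (count, key) pairs), it counts unigram and bigram occurrences over a positional attribute. A rough, purely illustrative reconstruction, assuming bigrams are adjacent token pairs:

def make_frequencies(att):
    # Illustrative stand-in, not the original helper: count unigrams and
    # adjacent-token bigrams over a positional attribute and return them
    # as (count, key) lists, as the caller above expects.
    unigram_freqs = defaultdict(int)
    bigram_freqs = defaultdict(int)
    prev = None
    for i in xrange(len(att)):
        w = att[i]
        unigram_freqs[w] += 1
        if prev is not None:
            bigram_freqs[prev + ' ' + w] += 1
        prev = w
    unigram_list = [(v, k) for (k, v) in unigram_freqs.iteritems()]
    bigram_list = [(v, k) for (k, v) in bigram_freqs.iteritems()]
    return unigram_list, bigram_list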
Example #3
def con_source(request, qpos):
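    """Return an HTML view of the context around a query hit; qpos decrypts
    to the corpus name and the start/end positions of the hit."""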
    window_size = 100
    corp_name, start, end = cy.decrypt(qpos)
    start, end = int(start), int(end)
    corpus = Corpus(corp_name.upper(),
                    registry_dir='/usr/local/share/cwb/registry')
    words = corpus.attribute('word', 'p')
    corp_len = len(words)
    lb = max(start - window_size, 0)
    rb = corp_len - 1 if end + window_size > corp_len else end + window_size

    lw = ''.join(words[lb:start])
    qw = '<span style="color:red;font-size:24px;">' + ''.join(
        words[start:end]) + '</span>'
    rw = ''.join(words[end:rb])
    if corp_name in ('tccm', 'ntuspk'):
        if corp_name == 'tccm':
            s_attrs = corpus.attribute('s_addresser', 's')
        else:
            s_attrs = corpus.attribute('s_speaker', 's')
        # regions enclosing the left and right window boundaries
        top = s_attrs[s_attrs.cpos2struc(lb)]
        bottom = s_attrs[s_attrs.cpos2struc(rb)]

        attr_con = []
        for attr in s_attrs:
            if attr[0] >= top[0] and attr[1] <= bottom[1]:
                attr_con.append(attr)
        output = ''
        for a in attr_con:
            if a[0] <= start < a[1]:
                sent = (a[-1] + ': '
                        + ' '.join(words[a[0]:start]) + ' '
                        + '<span style="color:red;font-size:24px;">'
                        + ' '.join(words[start:end]) + '</span> '
                        + ' '.join(words[end:a[1]]))
            else:
                sent = '%s: %s' % (a[-1], ' '.join(words[a[0]:a[1]]))
            output += sent + '<br>'
        return HttpResponse(output)

    return HttpResponse(lw + qw + rw)
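
The block above relies on the tuple layout of annotated structural attributes: indexing (or iterating over) an s-attribute yields (start, end, value) tuples of corpus positions plus the annotation string, and cpos2struc maps a corpus position to the index of the region containing it. A small sketch of that access pattern; the corpus and attribute names below are just placeholders:

# Placeholder names; any corpus with an annotated structural attribute works the same way.
corpus = Corpus('TCCM', registry_dir='/usr/local/share/cwb/registry')
speakers = corpus.attribute('s_addresser', 's')
idx = speakers.cpos2struc(1234)      # index of the region containing corpus position 1234
start, end, value = speakers[idx]    # region span plus its annotation value
print '%d..%d: %s' % (start, end, value)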
Example #4
def create_bow_tag(corpora, language, pos_tags, outdir='.', alph_suffix=''):
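    """Gather bag-of-words feature vectors for the words of each POS tag and
    write the per-tag feature alphabets and matrices to the output directory."""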
    # Step 1: extract unigram distributions for words
    unigram_alph = CPPUniAlphabet()
    unigram_alph.fromfile(
        file(os.path.join(outdir, 'unigram%s_alph.txt' % (alph_suffix, ))))
    unigram_alph.growing = False
    bigram_alph = CPPUniAlphabet()
    bigram_alph.fromfile(
        file(os.path.join(outdir, 'bigram%s_alph.txt' % (alph_suffix, ))))
    bigram_alph.growing = False
    # prefix_l and opts are presumably module-level globals (from option parsing) in the original script
    infix = '_'.join(prefix_l)
    if infix != '':
        infix = '_' + infix
    if opts.limit != -1:
        prefix_l.append('%d' % (opts.limit / 1000))
    word_matrix = None
    word_alphs = get_word_alphs_by_pos(language)
    for word_pos in pos_tags:
        word_alph = word_alphs[word_pos]
        word_feat_alph = CPPUniAlphabet()
        for corpus_name in corpora:
            corpus = Corpus(corpus_name)
            att = corpus.attribute(opts.attr_name, 'p')
            att_find = corpus.attribute('tb_lemma', 'p')
            att_sent = corpus.attribute('s', 's')
            pair_alphs = get_pair_alphs_by_pos(opts.language)
            word_alphs = get_word_alphs_by_pos(opts.language)
            print "word features for %s in %s" % (word_pos, corpus_name)
            wmat = gather_word_vectors(list(word_alph), att, att_find,
                                       att_sent, unigram_alph, bigram_alph,
                                       word_feat_alph,
                                       forward_mapping_by_pos(word_pos),
                                       opts.limit)
            if word_matrix is None:
                word_matrix = wmat
            else:
                word_matrix += wmat
        word_feat_alph.tofile_utf8(
            file(
                os.path.join(opts.outdir, 'word_bow%s%s_alph.txt' % (
                    infix,
                    word_pos,
                )), 'w'))
        word_matrix.write_binary(
            file(
                os.path.join(opts.outdir, 'word_bow%s%s_mtx.bin' % (
                    infix,
                    word_pos,
                )), 'w'))
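
Examples #2 and #4 treat CPPAlphabet / CPPUniAlphabet as growing symbol tables: looking up an unseen key registers it and assigns the next integer id, and setting growing = False freezes the mapping. A pure-Python stand-in illustrating that contract (an assumption inferred from usage, not the actual implementation):

class GrowingAlphabet(object):
    # Minimal stand-in for CPPAlphabet, inferred from how it is used above.
    def __init__(self):
        self.mapping = {}
        self.growing = True

    def __getitem__(self, key):
        if key not in self.mapping:
            if not self.growing:
                raise KeyError(key)
            self.mapping[key] = len(self.mapping)
        return self.mapping[key]

    def tofile(self, f_out):
        # write one symbol per line, in id order
        for key, _idx in sorted(self.mapping.items(), key=lambda kv: kv[1]):
            f_out.write('%s\n' % (key,))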
Example #5
def f(corpus, query):
    """
    Envoi de la requête à CQP et mise en forme des données récupérées
        entrée : nom du corpus sur lequel la requête sera effectuée et la requête en question
        sortie : requête à soumettre à CQP
    """

    registry_dir = "/usr/local/share/cwb/registry"
    #cqp=PyCQP_interface.CQP(bin='/usr/local/bin/cqp',options='-c -r '+registry_dir)
    cqp = PyCQP_interface.CQP(bin='/usr/local/cwb/bin//cqp',
                              options='-c -r ' + registry_dir)
    corpus_name = splitext(basename(corpus))[0].upper()
    dep = corpus_name.split("_")[1].upper()
    if re.match(r"^\d$", dep):
        # zero-pad single-digit department numbers
        dep = "0" + dep

    resultDep = []

    # Send the query
    cqp.Exec(corpus_name + ";")
    cqp.Query(query)
    cqp.Exec("sort Last by word;")
    """
        Récupération des résultats, sous la forme d'une liste (results) qui contient autant de listes que de résultats correspondant à la requête effectuée.
        Ces listes permettent de récupérer l'emplacement du premier et du dernier élément des motifs correspondants dans le corpus.
    """
    rsize = int(cqp.Exec("size Last;"))
    results = cqp.Dump(first=0, last=rsize)

    corpus = Corpus(corpus_name, registry_dir=registry_dir)

    # used below to fetch the token, POS tag or lemma at a given corpus position
    words = corpus.attribute("word", "p")
    postags = corpus.attribute("pos", "p")
    lemmas = corpus.attribute("lemma", "p")

    sentences = corpus.attribute(b"text", "s")
    id = corpus.attribute(b"text_id", "s")
    dates = corpus.attribute(b"text_date", "s")
    geo = corpus.attribute(b"text_geo", "s")
    users = corpus.attribute(b"text_user", "s")

    cqp.Terminate()

    if (results != [[""]]):
        for r in results:
            left_context = []
            right_context = []
            start = int(r[0])
            end = int(r[1])

            # Start and end positions of the tweet containing the match
            s_bounds = sentences.find_pos(end)
            # Its structural attributes (id, date, coordinates and user id)
            id_bounds = id.find_pos(end)
            date_bounds = dates.find_pos(end)
            geo_bounds = geo.find_pos(end)
            user_bounds = users.find_pos(end)

            coord = geo_bounds[-1].decode("utf8").split(", ")

            # positions of the tokens in the left and right contexts
            for pos in range(s_bounds[0], s_bounds[1] + 1):
                if (pos < start):
                    left_context.append(pos)
                if (pos > end):
                    right_context.append(pos)

            # Build the dictionary holding the information we are interested in
            result = {
                "id": id_bounds[-1],
                "date": date_bounds[-1].decode("utf8").split("T")[0],
                "geo": coord,
                "dep": dep,
                "user": user_bounds[-1],
                "hide_column": "",
                "left_context": "",
                "pattern": "",
                "right_context": ""
            }

            lc_tokens = []
            lc_pos = []
            lc_lemmas = []
            rc_tokens = []
            rc_pos = []
            rc_lemmas = []

            # left context (tokens, POS tags and lemmas)
            for lp in left_context:
                lc_tokens.append(words[lp])
                lc_pos.append(postags[lp])
                lc_lemmas.append(lemmas[lp])
            lc_tokens = reconstituteString(lc_tokens)
            lc_pos = " ".join(lc_pos)
            lc_lemmas = " ".join(lc_lemmas)

            # matched pattern (tokens, POS tags and lemmas)
            pattern_tokens = reconstituteString(words[start:end + 1])
            pattern_pos = " ".join(postags[start:end + 1])
            pattern_lemmas = " ".join(lemmas[start:end + 1])

            # right context (tokens, POS tags and lemmas)
            for rp in right_context:
                rc_tokens.append(words[rp])
                rc_pos.append(postags[rp])
                rc_lemmas.append(lemmas[rp])
            rc_tokens = reconstituteString(rc_tokens)
            rc_pos = " ".join(rc_pos)
            rc_lemmas = " ".join(rc_lemmas)

            # formatting is done here to avoid extra processing on the client side
            result["hide_column"] = lc_tokens[::-1]
            result["left_context"] = '<span title="%s&#10;%s">%s</span>' % (
                lc_pos, lc_lemmas, lc_tokens)
            result["pattern"] = '<span title="%s&#10;%s">%s</span>' % (
                pattern_pos, pattern_lemmas, pattern_tokens)
            result["right_context"] = '<span title="%s&#10;%s">%s</span>' % (
                rc_pos, rc_lemmas, rc_tokens)

            resultDep.append(result)

    # kill the CQP process explicitly, otherwise it does not terminate on its own
    os.popen("kill -9 " + str(cqp.CQP_process.pid))

    return resultDep
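
reconstituteString is an external helper that is not shown; from its use it turns a list of tokens back into a readable string. A naive stand-in along those lines (a guess at the behaviour, not the original function):

def reconstituteString(tokens):
    # Naive detokenization: join tokens with spaces, but do not put a space
    # before common punctuation marks. Illustrative stand-in only.
    no_space_before = set([',', '.', '!', '?', ';', ':', ')', "'"])
    out = ''
    for tok in tokens:
        if out and tok not in no_space_before:
            out += ' '
        out += tok
    return out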