Exemplos de ngram em Python, exemplos de segment.clean.ngram em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: new_word.py Projeto: lebinlebin/segment

def frequency(texts, fname='frequency.xlsx'):
    """Count word frequencies and export the most common ones to Excel.

    Every text is passed through ``clean.ngram``; each segment it yields is
    tokenised with ``tk1.cut`` and only tokens accepted by ``clean.is_word``
    are counted.  The ``N`` most common tokens (``N`` is defined outside this
    function) are written to ``fname`` with the columns ``word``, ``flag``
    (looked up via ``tk1.get_flag``) and ``freq``.  Afterwards ``tk1.bar`` is
    called once per token, least frequent first.

    Args:
        texts: Iterable of texts to analyse.
        fname: Path of the Excel file to write.
    """
    tokens = (
        w
        for t in texts
        for s in clean.ngram(t)
        for w in tk1.cut(s)
        if clean.is_word(w)
    )
    c = Counter(tokens).most_common(N)
    DataFrame(
        [(w, tk1.get_flag(w), f) for w, f in c],
        columns=['word', 'flag', 'freq'],
    ).to_excel(fname, index=False)
    if not c:
        # No token survived the filters: the (empty) sheet has been written
        # and there is nothing to chart. Indexing c[0] would raise IndexError.
        return
    # Bars are scaled on square-rooted counts; c[0] holds the highest count.
    maximum = c[0][1] ** .5
    for w, f in reversed(c):
        # NOTE(review): tk1.bar presumably renders a text bar labelled with
        # the word -- confirm against the tokenizer wrapper.
        tk1.bar(f ** .5, maximum, w)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: synonym.py Projeto: lebinlebin/segment

def synonym_bigram(texts, center_word, filtrate=get_flag):
    """Count the bigrams that contain ``center_word`` and export them.

    ``center_word`` is first registered with the tokenizer via
    ``tk.add_word(center_word, 2000, 'CENTER')``.  Each text containing the
    centre word is split with ``clean.ngram``, tokenised with ``tk.cut`` and
    filtered with ``filtrate``.  Every pair of adjacent tokens is then
    inspected:

    * ``left``  counts pairs whose first token is the centre word;
    * ``right`` counts pairs whose second token is the centre word.

    Both tables (columns ``word``, ``flag``, ``freq``, ``bar``) are written
    to ``synonym_bigram_<center_word>.xlsx`` as the sheets ``left`` and
    ``right``.

    Args:
        texts: Iterable of texts to scan.
        center_word: The word whose neighbours are collected.
        filtrate: Predicate applied to each token; falsy results drop the
            token before pairs are formed.
    """
    tk.add_word(center_word, 2000, 'CENTER')
    left, right = Counter(), Counter()
    for text in texts:
        if center_word not in text:
            continue
        for sentence in clean.ngram(text):
            words = [w for w in tk.cut(sentence) if filtrate(w)]
            for first, second in zip(words, words[1:]):
                if center_word != first and center_word != second:
                    continue
                key = (
                    ' '.join((first, second)),
                    ' '.join(tk.get_flag(w) for w in (first, second)),
                )
                # A pair made of the centre word twice is counted on both
                # sides, exactly as two independent checks would do.
                if first == center_word:
                    left[key] += 1
                if second == center_word:
                    right[key] += 1
    # Shared scale for both sheets.  Either counter may be empty (for
    # example when the centre word never has a preceding token), so take the
    # maximum over all counts instead of indexing most_common()[0].
    u = max([*left.values(), *right.values()], default=0)
    left = corpus.ls2df([(i, j, k, tk.bar(k, u))
                         for (i, j), k in left.most_common()],
                        ['word', 'flag', 'freq', 'bar'])
    right = corpus.ls2df([(i, j, k, tk.bar(k, u))
                          for (i, j), k in right.most_common()],
                         ['word', 'flag', 'freq', 'bar'])
    corpus.df2sheets([left, right], ['left', 'right'],
                     'synonym_bigram_%s.xlsx' % center_word)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: word_frequency.py Projeto: lebinlebin/NLP

def new_word_flag(texts, dictionary=_dict, fname='new_word_flag.xlsx'):
    """Explore new words together with their part-of-speech flags.

    Each text is split with ``ngram`` and tokenised with ``tk2.cut``.  A
    token is counted, keyed by ``(token.word, token.flag)``, when its word is
    not in ``dictionary`` and ``fullmatch`` accepts it.  All counted pairs,
    most frequent first, are written to ``fname`` with the columns ``word``,
    ``flag`` and ``freq``.

    Args:
        texts: Iterable of texts to analyse.
        dictionary: Container of already-known words to exclude.
        fname: Path of the Excel file to write.
    """
    tally = Counter()
    for text in texts:
        for segment in ngram(text):
            for token in tk2.cut(segment):
                if token.word in dictionary:
                    continue
                if fullmatch(token.word):
                    tally[(token.word, token.flag)] += 1
    rows = [(word, flag, freq) for (word, flag), freq in tally.most_common()]
    sheet = DataFrame(rows, columns=['word', 'flag', 'freq'])
    sheet.to_excel(fname, index=False)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: new_word.py Projeto: lebinlebin/segment

def new_word(texts, fname='new_word.xlsx'):
    """Explore new words: count tokens that are not in ``dictionary``.

    Each text is split with ``clean.ngram`` and tokenised with ``tk1.cut``.
    Tokens accepted by ``clean.is_word`` and absent from the externally
    defined ``dictionary`` are counted.  The ``N`` most common ones are
    written to ``fname`` with the columns ``word`` and ``freq``; afterwards
    ``tk1.bar`` is called once per token, least frequent first.

    Args:
        texts: Iterable of texts to analyse.
        fname: Path of the Excel file to write.
    """
    c = Counter(
        w for t in texts for s in clean.ngram(t) for w in tk1.cut(s)
        if clean.is_word(w) and w not in dictionary).most_common(N)
    DataFrame(c, columns=['word', 'freq']).to_excel(fname, index=False)
    if not c:
        # No new word found: the (empty) sheet has been written and there is
        # nothing to chart. Indexing c[0] would raise IndexError.
        return
    # Bars are scaled on square-rooted counts; c[0] holds the highest count.
    maximum = c[0][1] ** .5
    for w, f in reversed(c):
        # NOTE(review): tk1.bar presumably renders a text bar labelled with
        # the word -- confirm against the tokenizer wrapper.
        tk1.bar(f ** .5, maximum, w)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: new_word.py Projeto: lebinlebin/segment

def new_word_flag(texts, fname='new_word_flag.xlsx'):
    """Explore new words together with their part-of-speech flags.

    Each text is split with ``clean.ngram`` and tokenised with ``tk2.cut``.
    A token is counted, keyed by ``(token.word, token.flag)``, when
    ``clean.is_word`` accepts its word and the word is absent from the
    externally defined ``dictionary``.  The ``N`` most common pairs are
    written to ``fname`` with the columns ``word``, ``flag``, ``freq`` and
    ``bar``; afterwards ``tk1.bar`` is called once per pair, least frequent
    first, labelled with ``"<word> <flag>"``.

    Args:
        texts: Iterable of texts to analyse.
        fname: Path of the Excel file to write.
    """
    c = Counter(
        (w.word, w.flag)
        for t in texts
        for s in clean.ngram(t)
        for w in tk2.cut(s)
        if clean.is_word(w.word) and w.word not in dictionary
    ).most_common(N)
    # Bars are scaled on square-rooted counts; c[0] holds the highest count.
    # With no results the scale is never used (both loops below are empty),
    # so fall back to 0 rather than raising IndexError on c[0].
    maximum = c[0][1] ** .5 if c else 0
    DataFrame(
        [(i, j, k, tk2.bar(k ** .5, maximum)) for (i, j), k in c],
        columns=['word', 'flag', 'freq', 'bar'],
    ).to_excel(fname, index=False)
    for w, f in reversed(c):
        # NOTE(review): the sheet column uses tk2.bar while this call uses
        # tk1.bar -- kept as in the original; confirm this is intended.
        tk1.bar(f ** .5, maximum, ' '.join(w))

Exemplo n.º 6

0

Exibir arquivo

Arquivo: bigram.py Projeto: lebinlebin/segment

def trigram(texts, n=2, stop_words=STOP_WORDS):
    """Statistical language model: count word n-grams and export them.

    Each text is passed through ``clear`` and split with ``clean.ngram``;
    every sentence is tokenised with ``tk.cut`` and tokens found in
    ``stop_words`` are dropped.  Each window of ``n`` consecutive remaining
    tokens, joined with a single space, is counted.  The ``N`` most common
    n-grams are written to ``'<n>gram.xlsx'`` with the column order
    ``freq``, ``word``.

    Args:
        texts: Iterable of texts to analyse.
        n: Window size in tokens (defaults to 2 despite the function name).
        stop_words: Container of tokens to discard before windowing.
    """
    counts = Counter()
    for text in texts:
        for sentence in clean.ngram(clear(text)):
            kept = [tok for tok in tk.cut(sentence) if tok not in stop_words]
            window_total = len(kept) + 1 - n
            counts.update(
                ' '.join(kept[start:start + n])
                for start in range(window_total)
            )
    table = DataFrame(counts.most_common(N), columns=['word', 'freq'])
    reordered = table[['freq', 'word']]
    reordered.to_excel('%dgram.xlsx' % n, index=False)

Exemplo n.º 7

0

Exibir arquivo

Arquivo: bigram.py Projeto: lebinlebin/segment

def trigram_flag(texts, n=2, stop_words=STOP_WORDS):
    """Statistical language model with part-of-speech flags.

    Each text is passed through ``clear`` and split with ``clean.ngram``;
    every sentence is tokenised with ``tk.cut`` and tokens found in
    ``stop_words`` are dropped.  Each window of ``n`` consecutive remaining
    tokens is counted, keyed by the space-joined words and the space-joined
    flags returned by ``tk.get_flag``.  The ``N`` most common entries are
    written to ``'<n>gram_flag.xlsx'`` with the columns ``word``, ``flag``,
    ``freq`` and ``bar``.

    Args:
        texts: Iterable of texts to analyse.
        n: Window size in tokens (defaults to 2 despite the function name).
        stop_words: Container of tokens to discard before windowing.
    """
    c = Counter()
    for text in texts:
        for sentence in clean.ngram(clear(text)):
            words = [w for w in tk.cut(sentence) if w not in stop_words]
            for i in range(len(words) + 1 - n):
                window = words[i:i + n]
                word = ' '.join(window)
                flag = ' '.join(tk.get_flag(w) for w in window)
                c[(word, flag)] += 1
    # Highest count overall, used as the scale for the bar column.  max()
    # with a default avoids both an IndexError on an empty counter and a
    # full sort of the counter just to read its first entry.
    u = max(c.values(), default=0)
    rows = [(i, j, k, tk.bar(k, u)) for (i, j), k in c.most_common(N)]
    DataFrame(rows, columns=['word', 'flag', 'freq',
                             'bar']).to_excel('%dgram_flag.xlsx' % n,
                                              index=False)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: bigram.py Projeto: lebinlebin/segment

def trigram_flag_sort(texts, n=2, stop_words=STOP_WORDS):
    """Statistical language model with part-of-speech flags, order-free.

    Works like a flagged n-gram count, except that the ``(flag, word)``
    pairs inside each window are sorted before being joined, so windows
    holding the same tokens in a different order share one key.  The ``N``
    most common entries are written to ``'<n>gram_flag_sort.xlsx'`` with the
    columns ``freq``, ``flag`` and ``word``.

    Args:
        texts: Iterable of texts to analyse.
        n: Window size in tokens (defaults to 2 despite the function name).
        stop_words: Container of tokens to discard before windowing.
    """
    tally = Counter()
    for text in texts:
        for sentence in clean.ngram(clear(text)):
            kept = [tok for tok in tk.cut(sentence) if tok not in stop_words]
            for start in range(len(kept) + 1 - n):
                # Sort by flag first, then by word.
                pairs = sorted(
                    (tk.get_flag(tok), tok) for tok in kept[start:start + n]
                )
                joined_words = ' '.join(pair[1] for pair in pairs)
                joined_flags = ' '.join(pair[0] for pair in pairs)
                tally[(joined_words, joined_flags)] += 1
    rows = [
        (freq, flags, words)
        for (words, flags), freq in tally.most_common(N)
    ]
    sheet = DataFrame(rows, columns=['freq', 'flag', 'word'])
    sheet.to_excel('%dgram_flag_sort.xlsx' % n, index=False)

Exemplo n.º 9

0

Exibir arquivo

Arquivo: word_frequency.py Projeto: lebinlebin/NLP

def frequency(texts, fname='frequency.xlsx'):
    """Count word frequencies and export them to Excel.

    Each text is split with ``ngram`` and tokenised with ``tk0.cut``; tokens
    accepted by ``fullmatch`` are counted.  Every counted token, most
    frequent first, is written to ``fname`` with the columns ``word``,
    ``flag`` (looked up via ``tk0.get_flag``) and ``freq``.

    Args:
        texts: Iterable of texts to analyse.
        fname: Path of the Excel file to write.
    """
    tally = Counter()
    for text in texts:
        for segment in ngram(text):
            tally.update(tok for tok in tk0.cut(segment) if fullmatch(tok))
    ranked = tally.most_common()
    rows = [(word, tk0.get_flag(word), freq) for word, freq in ranked]
    sheet = DataFrame(rows, columns=['word', 'flag', 'freq'])
    sheet.to_excel(fname, index=False)

Exemplo n.º 10

0

Exibir arquivo

Arquivo: word_frequency.py Projeto: lebinlebin/NLP

def new_word(texts, dictionary=_dict, fname='new_word.xlsx'):
    """Explore new words: count tokens that are not in ``dictionary``.

    Each text is split with ``ngram`` and tokenised with ``tk1.cut``.  A
    token is counted when it is absent from ``dictionary`` and ``fullmatch``
    accepts it.  Every counted token, most frequent first, is written to
    ``fname`` with the columns ``word`` and ``freq``.

    Args:
        texts: Iterable of texts to analyse.
        dictionary: Container of already-known words to exclude.
        fname: Path of the Excel file to write.
    """
    tally = Counter()
    for text in texts:
        for segment in ngram(text):
            for token in tk1.cut(segment):
                if token in dictionary:
                    continue
                if fullmatch(token):
                    tally[token] += 1
    ranked = tally.most_common()
    sheet = DataFrame(ranked, columns=['word', 'freq'])
    sheet.to_excel(fname, index=False)

Exemplo n.º 11

0

Exibir arquivo

def cut(text):
    """Yield the tokens of ``text`` that pass the word and flag filters.

    The stripped text is split with ``clean.ngram`` and each sentence is
    tokenised with ``tk.cut``.  A token is yielded only when
    ``clean.is_word`` accepts it and its ``tk.get_flag`` value is not in the
    externally defined ``discarded_flags``.

    Args:
        text: The text to tokenise.

    Yields:
        Each accepted token, in the order produced by the tokenizer.
    """
    stripped = text.strip()
    for sentence in clean.ngram(stripped):
        for token in tk.cut(sentence):
            if not clean.is_word(token):
                continue
            # The flag is only looked up for tokens that are words.
            if tk.get_flag(token) in discarded_flags:
                continue
            yield token