Example No. 1
    def get_freq(self, word):
        """Return how often `word` occurs in the Sinica Treebank corpus."""
        from collections import Counter
        from nltk.corpus import sinica_treebank

        corpus = sinica_treebank.words()
        freq_list = Counter(corpus)
        # Report the count for the requested word rather than dumping the whole counter.
        print(freq_list[word])
        return freq_list[word]
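
A minimal usage sketch for the method above; the enclosing class is not shown in the snippet, so WordStats below is a hypothetical host class:

    stats = WordStats()   # hypothetical class exposing get_freq
    stats.get_freq('我')  # prints and returns the corpus count of '我'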
Example No. 2
    'English: Brown Corpus (Religion)':
    lambda: brown.words(categories='religion'),
    'English: Brown Corpus (Learned)':
    lambda: brown.words(categories='learned'),
    'English: Brown Corpus (Science Fiction)':
    lambda: brown.words(categories='science_fiction'),
    'English: Brown Corpus (Romance)':
    lambda: brown.words(categories='romance'),
    'English: Brown Corpus (Humor)':
    lambda: brown.words(categories='humor'),
    'English: NPS Chat Corpus':
    lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus':
    lambda: treebank.words(),
    'Chinese: Sinica Corpus':
    lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus':
    lambda: alpino.words(),
    'Hindi: Indian Languages Corpus':
    lambda: indian.words(files='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)':
    lambda: machado.words(),
    'Spanish: CESS-ESP Corpus':
    lambda: cess_esp.words()
}
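
Each value in the mapping is a zero-argument lambda, so a corpus is only read from disk when its entry is actually selected. A minimal sketch of that deferred loading, assuming a dict named CORPORA shaped like the one above:

    from nltk.corpus import brown

    CORPORA = {
        'English: Brown Corpus (Humor)':
        lambda: brown.words(categories='humor'),
    }

    loader = CORPORA['English: Brown Corpus (Humor)']
    words = loader()  # the corpus is tokenized here, not when the dict is built
    print(len(words))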

Example No. 3
            'English: Brown Corpus (Religion)':
                lambda: brown.words(categories='religion'),
            'English: Brown Corpus (Learned)':
                lambda: brown.words(categories='learned'),
            'English: Brown Corpus (Science Fiction)':
                lambda: brown.words(categories='science_fiction'),
            'English: Brown Corpus (Romance)':
                lambda: brown.words(categories='romance'),
            'English: Brown Corpus (Humor)':
                lambda: brown.words(categories='humor'),
            'English: NPS Chat Corpus':
                lambda: nps_chat.words(),
            'English: Wall Street Journal Corpus':
                lambda: treebank.words(),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.words(),
            'Dutch: Alpino Corpus':
                lambda: alpino.words(),
            'Hindi: Indian Languages Corpus':
                lambda: indian.words(files='hindi.pos'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.words(),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.words(),
            'Portuguese: Machado Corpus (Brazil)':
                lambda: machado.words(),
            'Spanish: CESS-ESP Corpus':
                lambda: cess_esp.words()
            }

class CollocationsView:
Example No. 4
# -*- coding:utf-8 -*-
# Filename: sinica_treebank.py
# Author:hankcs
# Date: 2014-04-08 11:44 AM
from __future__ import print_function
import nltk
import sqlite3  # not used in this excerpt
from nltk.corpus import sinica_treebank
 
sinica_text = nltk.Text(sinica_treebank.words())
print(sinica_text)
for (key, var) in sinica_treebank.tagged_words()[:8]:
    print('%s/%s' % (key, var))

sinica_text.concordance('我')  # concordance() prints its matches and returns None
sinica_text.concordance(u'\u5609\u73cd')  # u'嘉珍'
print("************* sinica_fd = nltk.FreqDist(sinica_treebank.words()): top Chinese word frequencies with NLTK **********")
sinica_fd = nltk.FreqDist(sinica_treebank.words())
top100 = sinica_fd.most_common(100)  # FreqDist.items() is not sliceable in Python 3; most_common() also sorts by frequency
for (x,y) in top100:
    print(x,y)
Example No. 5
# -*-coding:utf-8-*-

import jieba
import pandas as pd
import time
import uniout  # Python 2 helper that prints non-ASCII list contents readably; unnecessary on Python 3
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sinica_treebank


print(sinica_treebank.words()[100:120])
# sinica_treebank.parsed_sents()[33].draw()

# nltk.download()
# wn.synsets('love')

# print(wn.synsets(u'摩托车'))
# print(wn.synset('car.n.01').lemma_names())  # lemma_names() is a method in NLTK 3
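
To look up Chinese lemmas directly, NLTK's Open Multilingual Wordnet interface accepts a language code; a minimal sketch, assuming the 'wordnet' and 'omw-1.4' data have been downloaded:

    from nltk.corpus import wordnet as wn

    # nltk.download('wordnet'); nltk.download('omw-1.4')  # one-time setup
    for synset in wn.synsets('摩托车', lang='cmn'):  # 'cmn' = Mandarin Chinese
        print(synset, synset.lemma_names('cmn'))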
Example No. 6
    def __init__(self, min_nchar, fn, lang="ENG"):
        """
        TXT_FN : path to file containing text data.
        """
        self.min_nchar = min_nchar
        self.fdict = {
            'WORD': self.sample_word,
            'LINE': self.sample_line,
            'PARA': self.sample_para
        }
        self.lang = lang
        # parse English text
        if self.lang == "ENG":
            print('Generate English Data with NLTK:PlaintextCorpusReader')
            corpus = PlaintextCorpusReader("./", fn)

            self.words = corpus.words()
            self.sents = corpus.sents()
            self.paras = corpus.paras()

        # parse Japanese text
        elif self.lang == "JPN":
            print('Generate Japanese Data with NLTK:ChasenCorpusReader')
            # convert fn into a ChaSen-annotated file
            _, ext = os.path.splitext(os.path.basename(fn))
            fn_chasen = fn.replace(ext, ".chasen")
            print("Convert {} into {}".format(fn, fn_chasen))

            cmd = "mecab -Ochasen {} > {}".format(fn, fn_chasen)
            print(
                "The following command was executed to convert the input into ChaSen format (for Japanese):"
            )
            print("\t{}".format(cmd))
            p = subprocess.call(cmd, shell=True)
            data = ChasenCorpusReader('./', fn_chasen, encoding='utf-8')

            self.words = data.words()
            self.sents = data.sents()
            self.paras = data.paras()

            # jp_sent_tokenizer = nltk.RegexpTokenizer(u'[^ 「」!?。]*[!?。]')
            # jp_chartype_tokenizer = nltk.RegexpTokenizer(u'([ぁ-んー]+|[ァ-ンー]+|[\u4e00-\u9FFF]+|[^ぁ-んァ-ンー\u4e00-\u9FFF]+)')
            #
            # corpus = PlaintextCorpusReader("./",
            #                              fn,
            #                              encoding='utf-8',
            #                              para_block_reader=read_line_block,
            #                              sent_tokenizer=jp_sent_tokenizer,
            #                              word_tokenizer=jp_chartype_tokenizer)
        elif self.lang == "ZHTW":
            print(
                'Generate Traditional Chinese Data with NLTK:sinica_treebank')
            # data = SinicaTreebankCorpusReader('./', fn, encoding='utf-8')
            # self.words = data.words()
            # self.sents = data.sents()
            # self.paras = data.parsed_sents()
            self.words = sinica_treebank.words()
            self.sents = sinica_treebank.sents()
            self.paras = sinica_treebank.parsed_sents()
        else:
            # any other language code falls back to the Sinica Treebank as well
            # data = SinicaTreebankCorpusReader('./', fn, encoding='utf-8')
            # self.words = data.words()
            # self.sents = data.sents()
            # self.paras = data.parsed_sents()
            self.words = sinica_treebank.words()
            self.sents = sinica_treebank.sents()
            self.paras = sinica_treebank.parsed_sents()
        # distribution over line/words for LINE/PARA:
        self.p_line_nline = np.array([0.85, 0.10, 0.05])
        self.p_line_nword = [4, 3, 12]  # normal: (mu, std)
        self.p_para_nline = [1.0, 1.0]  #[1.7,3.0] # beta: (a, b), max_nline
        self.p_para_nword = [1.7, 3.0, 10]  # beta: (a,b), max_nword

        # probability to center-align a paragraph:
        self.center_para = 0.5
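
A hypothetical usage sketch for the class above (the class name is not shown in the snippet, so TextSource is a placeholder, and sample_word/sample_line/sample_para are referenced but not defined here):

    source = TextSource(min_nchar=2, fn='corpus.txt', lang='ENG')  # hypothetical name
    sampler = source.fdict['LINE']  # dispatch table: 'WORD' | 'LINE' | 'PARA'
    print(sampler())                # would draw one line-sized text sample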
Example No. 7
import nltk
import jieba
import re
from nltk import word_tokenize
from nltk.corpus import sinica_treebank

sinica_text = nltk.Text(sinica_treebank.words())
sinica_text.concordance('我')  # concordance() prints its matches and returns None

## Regexes for Twitter-specific tokens
emoticons_str = r"""
(?:
[:=;] # eyes
[oO\-]? # nose
[D\)\]\(\]/\\OpP] # mouth
)"""
regex_str = [
emoticons_str,
r'<[^>]+>', # HTML tags
r'(?:@[\w_]+)', # @-mentions
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hashtags
r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
# URLs
r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
r"(?:[a-z][a-z'\-_]+[a-z])", # words containing - and '
r'(?:[\w_]+)', # other words
r'(?:\S)' # anything else
]

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
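
A quick sketch of how these compiled patterns are typically applied (the tweet below is made up):

    def tokenize(s):
        return tokens_re.findall(s)

    tweet = "RT @user: check http://example.com #nltk :-)"
    for token in tokenize(tweet):
        print(token, bool(emoticon_re.match(token)))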
Example No. 8

#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import sinica_treebank
##################################################################
## quick sanity check
print(
    type(sinica_treebank)
)  # <class 'nltk.corpus.reader.sinica_treebank.SinicaTreebankCorpusReader'>
print(len(sinica_treebank.words()))  # 91627
print(sinica_treebank.words()
      )  # ['一', '友情', '嘉珍', '和', '我', '住在', '同一條', '巷子', '我們', ...]
# Looking directly inside ~/nltk_data/corpora/sinica_treebank/ reveals many other characters
##################################################################
## generate the Chinese-pinyin file 38k-cn-words-pinyin-sorted-by-frequency.txt
import re
from nltk import FreqDist
from pypinyin import pinyin, lazy_pinyin, Style
fd = FreqDist(sinica_treebank.words())
print(len(list(fd.keys())))  # 17273; number of unique types after deduplication
print(len(fd.most_common()))  # 17273
chars = ''.join([x[0] for x in fd.most_common()])  # avoid shadowing the built-in str
print(len(chars))  # 38844
chars = re.sub('[^\u4e00-\u9fa5]', '', chars)
print(len(chars))  # 38225; punctuation and non-Chinese characters removed
with open('38k-cn-words-pinyin-sorted-by-frequency.txt', 'w') as f:
    f.write('\n'.join(lazy_pinyin(chars)))
Example No. 9
    line = line.strip('\n')
    titles.append(line)
    print(line)

print(titles[:10])  # first 10 film titles
#----------


#nltk.download()
# load NLTK's English stopword list as the "stopwords" variable
import nltk
stopwords = nltk.corpus.stopwords.words('english')
print (stopwords[:10])
#
from nltk.corpus import sinica_treebank
print(sinica_treebank.words())




#-------  snownlp ---------
import snownlp
from snownlp import SnowNLP
s = SnowNLP('这东西真心很赞')  # "This thing is really awesome"; note the class is SnowNLP, not SnowLP
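
SnowNLP objects expose segmentation, pinyin, and a sentiment score; a short follow-up sketch (output values are illustrative):

    print(s.words)       # segmented words, e.g. ['这', '东西', '真心', '很', '赞']
    print(s.pinyin)      # pinyin transcription of the sentence
    print(s.sentiments)  # probability the sentence is positive, close to 1.0 here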



Example No. 10
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import sinica_treebank

sinica_text = nltk.Text(sinica_treebank.words())
print(sinica_text)
for (key, var) in sinica_treebank.tagged_words()[:8]:
    print('%s/%s' % (key, var))

print(sinica_treebank.parsed_sents()[15])
Example No. 11

#!/usr/bin/python3
# coding: utf-8
##################################################################
## Part 1: treebank, draw a parse tree
from nltk.corpus import treebank
t = treebank.parsed_sents('wsj_0001.mrg')[0]
t.draw()

##################################################################
## Part 2: sinica_treebank
from nltk.corpus import sinica_treebank
##################################################################
## quick sanity check
print(type(sinica_treebank))  # <class 'nltk.corpus.reader.sinica_treebank.SinicaTreebankCorpusReader'>
print(len(sinica_treebank.words()))  # 91627
print(sinica_treebank.words())  # ['一', '友情', '嘉珍', '和', '我', '住在', '同一條', '巷子', '我們', ...]
# Looking directly inside ~/nltk_data/corpora/sinica_treebank/ reveals many other characters
##################################################################
## generate the Chinese-pinyin file 38k-cn-words-pinyin-sorted-by-frequency.txt
import re
from nltk import FreqDist
from pypinyin import pinyin, lazy_pinyin, Style
fd = FreqDist(sinica_treebank.words())
print(len(list(fd.keys())))  # 17273; number of unique types after deduplication
print(len(fd.most_common()))  # 17273
chars = ''.join([x[0] for x in fd.most_common()])  # avoid shadowing the built-in str
print(len(chars))  # 38844
chars = re.sub('[^\u4e00-\u9fa5]', '', chars)
print(len(chars))  # 38225; punctuation and non-Chinese characters removed
with open('38k-cn-words-pinyin-sorted-by-frequency.txt', 'w') as f:
    f.write('\n'.join(lazy_pinyin(chars)))
Example No. 12

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pickle
import nltk
from nltk.corpus import sinica_treebank

cnt = 0

sinica_fd = nltk.FreqDist(sinica_treebank.words())
# sinica_fd.keys()
# sinica_fd.values()
cnt = sum([float(x) for x in sinica_fd.values()])
print(cnt)

c = {}
for key, val in sinica_fd.items():
    c[key] = float(val) / cnt
    print(key, c[key])

d = dict(c)
# print(d)
with open("data/models/char_freq.cp", 'wb') as f:
    pickle.dump(d, f)

# with open("data/models/char_freq.cp", 'rb') as f:
#     print(pickle.load(f))
Example No. 13
# -*- coding:utf-8 -*-
# Filename: sinica_treebank.py
# Author:hankcs
# Date: 2014-04-08 11:44 AM
from __future__ import print_function
import nltk
import sqlite3  # not used in this excerpt
from nltk.corpus import sinica_treebank

sinica_text = nltk.Text(sinica_treebank.words())
print(sinica_text)
for (key, var) in sinica_treebank.tagged_words()[:8]:
    print('%s/%s' % (key, var))

sinica_text.concordance('我')  # concordance() prints its matches and returns None
sinica_text.concordance(u'\u5609\u73cd')  # u'嘉珍'
print(
    "************* sinica_fd = nltk.FreqDist(sinica_treebank.words()): top Chinese word frequencies with NLTK **********"
)
sinica_fd = nltk.FreqDist(sinica_treebank.words())
top100 = sinica_fd.most_common(100)  # FreqDist.items() is not sliceable in Python 3
for (x, y) in top100:
    print(x, y)
Example No. 14
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
POLL_INTERVAL = 100

_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
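
The queue and POLL_INTERVAL suggest the corpus is loaded on a background thread while the UI polls for results. A minimal sketch of that pattern, independent of the (unshown) CollocationsModel internals:

    import queue as q
    import threading

    def load_corpus_async(name, out_queue):
        """Run the selected corpus loader off the UI thread."""
        def worker():
            try:
                out_queue.put(_CORPORA[name]())  # the tokenized corpus
            except LookupError:
                # e.g. the corpus data has not been downloaded yet
                out_queue.put(ERROR_LOADING_CORPUS_EVENT)
        threading.Thread(target=worker, daemon=True).start()

    results = q.Queue()
    load_corpus_async("Chinese: Sinica Corpus", results)
    # ... a Tkinter after(POLL_INTERVAL, ...) callback would poll results.get_nowait()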
Example No. 15
## -*- coding: utf-8 -*-
import nltk
from nltk.corpus import sinica_treebank as sinica
import string
import time

non_hanzi = list(string.printable)
non_hanzi.append("。")
non_hanzi.append("!")
non_hanzi.append("?")
non_hanzi.append(",")
corpus_bank = set(sinica.words())

def sent_segment(sentence):
    palabras = []  # "palabras" is Spanish for "words" (identifier kept as-is)
    if isinstance(sentence, bytes):  # str has no .decode() on Python 3
        sentence = sentence.decode('utf-8')
    # print(sentence)
    num_characters = len(sentence)
    if num_characters == 0:
        return []
    ##
    ## Check whether what we got to analyze is not a hanzi, so we can skip it
    ini_non_hanzi = 0
    fin_non_hanzi = 0
    not_a_hanzi = False
    # Scan the non-hanzi text until something different is found
    while((fin_non_hanzi < num_characters) and (sentence[fin_non_hanzi] in non_hanzi)):
        not_a_hanzi = True
        fin_non_hanzi += 1
    if not_a_hanzi:
        palabras.append(sentence[ini_non_hanzi:fin_non_hanzi])  # append the run of Latin characters and punctuation as a single word
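
The snippet is cut off here; the full function presumably goes on to match hanzi runs against corpus_bank. For reference, a compact sketch of the same dictionary-based idea, greedy longest-match segmentation against the Sinica vocabulary (max_len=7 is an arbitrary cap; this is an illustration, not the original algorithm):

    def greedy_segment(text, vocab=corpus_bank, max_len=7):
        """Greedy longest-match segmentation against a word set."""
        out, i = [], 0
        while i < len(text):
            for n in range(min(max_len, len(text) - i), 0, -1):
                if n == 1 or text[i:i + n] in vocab:
                    out.append(text[i:i + n])
                    i += n
                    break
        return out

    print(greedy_segment('嘉珍和我住在同一條巷子'))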
Example No. 16
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)