import nltk
from nltk.corpus import sinica_treebank

print(sinica_treebank.sents())
print(sinica_treebank.parsed_sents()[27])
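If the corpus has not been fetched yet, the reader calls above raise a LookupError; the one-time download step (also noted further down in the doctest snippet):

import nltk

# One-time download of the Sinica Treebank sample bundled with nltk_data.
nltk.download('sinica_treebank')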
def test_sents(self):
    first_3_sents = sinica_treebank.sents()[:3]
    self.assertEqual(
        first_3_sents,
        [['一'], ['友情'], ['嘉珍', '和', '我', '住在', '同一條', '巷子']]
    )
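The method above presumably belongs to a larger test class; a minimal self-contained harness for running it, with the class name SinicaTreebankTest as an assumption:

import unittest
from nltk.corpus import sinica_treebank

class SinicaTreebankTest(unittest.TestCase):  # hypothetical class name
    def test_sents(self):
        first_3_sents = sinica_treebank.sents()[:3]
        self.assertEqual(
            first_3_sents,
            [['一'], ['友情'], ['嘉珍', '和', '我', '住在', '同一條', '巷子']]
        )

if __name__ == '__main__':
    unittest.main()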
import nltk
from nltk.corpus import sinica_treebank as s
import Segmenter
from Segmenter import sent_segment

frases = s.sents()

def haz_uno(sentence):
    ## 'sentence' is a list of word tokens (plain strings in Python 3);
    ## join them back into a single unsegmented string
    return "".join(sentence)

def compare_both(a, b):
    ## 'a' is always the corpus sentence
    ## 'b' is the sentence segmented by my tokenizer
    ## Count position-by-position matches over the shorter of the two,
    ## then normalize by the corpus sentence length.
    count = 0
    for i in range(min(len(a), len(b))):
        if a[i] == b[i]:
            count += 1
    return count / len(a)
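A minimal sketch of how these helpers might be combined; the call signature of sent_segment (a raw string in, a token list out) is an assumption, since only its import appears above:

gold = frases[2]                      # corpus segmentation (token list)
raw = haz_uno(gold)                   # rebuild the unsegmented string
predicted = sent_segment(raw)         # assumed: str -> list of tokens
print(compare_both(gold, predicted))  # fraction of matching positions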
def __init__(self, min_nchar, fn, lang="ENG"):
    """
    fn : path to file containing text data.
    """
    self.min_nchar = min_nchar
    self.fdict = {
        'WORD': self.sample_word,
        'LINE': self.sample_line,
        'PARA': self.sample_para
    }
    self.lang = lang

    # parse English text
    if self.lang == "ENG":
        print('Generate English Data with NLTK:PlaintextCorpusReader')
        corpus = PlaintextCorpusReader("./", fn)
        self.words = corpus.words()
        self.sents = corpus.sents()
        self.paras = corpus.paras()
    # parse Japanese text
    elif self.lang == "JPN":
        print('Generate Japanese Data with NLTK:ChasenCorpusReader')
        # convert fn into a chasen file
        _, ext = os.path.splitext(os.path.basename(fn))
        fn_chasen = fn.replace(ext, ".chasen")
        print("Convert {} into {}".format(fn, fn_chasen))
        cmd = "mecab -Ochasen {} > {}".format(fn, fn_chasen)
        print("The following cmd was executed to convert into chasen (for Japanese)")
        print("\t{}".format(cmd))
        p = subprocess.call(cmd, shell=True)
        data = ChasenCorpusReader('./', fn_chasen, encoding='utf-8')
        self.words = data.words()
        self.sents = data.sents()
        self.paras = data.paras()
        # jp_sent_tokenizer = nltk.RegexpTokenizer(u'[^ 「」!?。]*[!?。]')
        # jp_chartype_tokenizer = nltk.RegexpTokenizer(u'([ぁ-んー]+|[ァ-ンー]+|[\u4e00-\u9FFF]+|[^ぁ-んァ-ンー\u4e00-\u9FFF]+)')
        #
        # corpus = PlaintextCorpusReader("./",
        #                                fn,
        #                                encoding='utf-8',
        #                                para_block_reader=read_line_block,
        #                                sent_tokenizer=jp_sent_tokenizer,
        #                                word_tokenizer=jp_chartype_tokenizer)
    elif self.lang == "ZHTW":
        print('Generate Traditional Chinese Data with NLTK:sinica_treebank')
        # data = SinicaTreebankCorpusReader('./', fn, encoding='utf-8')
        # self.words = data.words()
        # self.sents = data.sents()
        # self.paras = data.parsed_sents()
        self.words = sinica_treebank.words()
        self.sents = sinica_treebank.sents()
        self.paras = sinica_treebank.parsed_sents()
    else:
        # fall back to the Sinica Treebank for any other language code
        self.words = sinica_treebank.words()
        self.sents = sinica_treebank.sents()
        self.paras = sinica_treebank.parsed_sents()

    # distribution over lines/words for LINE/PARA:
    self.p_line_nline = np.array([0.85, 0.10, 0.05])
    self.p_line_nword = [4, 3, 12]      # normal: (mu, std)
    self.p_para_nline = [1.0, 1.0]      # [1.7,3.0] # beta: (a, b), max_nline
    self.p_para_nword = [1.7, 3.0, 10]  # beta: (a, b), max_nword
    # probability to center-align a paragraph:
    self.center_para = 0.5
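The class name is not visible in this snippet; a hedged instantiation sketch using the placeholder name TextSource, assuming the module-level imports (os, subprocess, numpy as np, and the NLTK corpus readers) are in place:

# 'TextSource' is a placeholder for the class whose __init__ is shown above;
# 'data.txt' is likewise a hypothetical input file.
source = TextSource(min_nchar=2, fn='data.txt', lang='ZHTW')
print(len(source.sents))  # sentences loaded from the Sinica Treebank
print(source.words[:10])  # first ten word tokens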
import nltk
from nltk.corpus import sinica_treebank as sinica
import Segmenter
from Segmenter import sent_segment
import PCFG
from PCFG import PCFGChino
import time

####################################
## THIS PART IS NOT IMPORTANT FOR TESTING THE PARSER
tagged_sents = sinica.tagged_sents()
sents = sinica.sents()
size = int(len(tagged_sents) * 0.9)
train_set = tagged_sents[:size]
test_set = tagged_sents[size:]
## trigram_tagger = nltk.TrigramTagger(train_set)
## score = trigram_tagger.evaluate(test_set)

print("Training")
ini = time.time()
# Backoff chain: trigram -> bigram -> unigram -> default tag 'Nab'
t0 = nltk.DefaultTagger('Nab')
t1 = nltk.UnigramTagger(train_set, backoff=t0)
t2 = nltk.BigramTagger(train_set, backoff=t1)
t3 = nltk.TrigramTagger(train_set, backoff=t2)
fin = time.time()
score = t3.evaluate(test_set)
print("Training finished", str(fin - ini))
print("Tagger evaluation =", score)

####################################
## Create the parser
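A minimal usage sketch for the backoff tagger trained above; the sentence index is arbitrary and only for illustration:

# Tag the first held-out sentence with the trained backoff chain.
example = sents[size]
print(t3.tag(example))  # list of (word, tag) pairs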
print(treebank.fileids())  # doctest: +ELLIPSIS
print(treebank.words('wsj_0003.mrg'))
print(treebank.tagged_words('wsj_0003.mrg'))
print(treebank.parsed_sents('wsj_0003.mrg')[0])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# nltk.download('ptb')
print(ptb.fileids())  # doctest: +SKIP
# download the corpus from here: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/treebank.zip
# then extract and place it in the following location: .../nltk_data/corpora/ptb/
print(ptb.words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
print(ptb.tagged_words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
# print(ptb.categories())  # doctest: +SKIP
# print(ptb.fileids('news'))  # doctest: +SKIP
# print(ptb.words(categories=['humor', 'fiction']))  # doctest: +SKIP

# nltk.download('sinica_treebank')
print(sinica_treebank.sents())  # doctest: +SKIP
print(sinica_treebank.parsed_sents()[25])  # doctest: +SKIP

# nltk.download('conll2007')
print(conll2007.sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0].tree())  # doctest: +SKIP

# for tree in ycoe.parsed_sents('cocuraC')[:4]:
#     print(tree)  # doctest: +SKIP

# word lists and lexicons
print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS
print(stopwords.fileids())  # doctest: +ELLIPSIS
print(stopwords.words('portuguese'))  # doctest: +ELLIPSIS

# nltk.download('names')
print(names.fileids())
print(names.words('male.txt'))  # doctest: +ELLIPSIS
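The parsed Sinica sentences above are nltk.Tree objects; a small sketch of inspecting one with standard Tree methods:

from nltk.corpus import sinica_treebank

tree = sinica_treebank.parsed_sents()[25]
print(tree.leaves())  # the word tokens at the tree's frontier
print(tree.pos())     # (word, tag) pairs read off the preterminals
print(tree.height())  # depth of the parse tree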
    s.remove(u'}')
    b = "".join(s)
    ## print("B", b)
    f.write('Corpus: ' + str(b) + '\n')
    r1 = tree2set(str(a))
    ## print("Tree A: " + str(r1))
    r2 = tree2set(str(b))
    ## print("Tree B: " + str(r2))
    return lp_lr(r2, r1)  # parseval(r2,r1), labeled_recall(r2,r1), lp_lr(r2,r1)

##
## TRAIN + TEST 1000
##
size = 1000
frases = sinica.sents()
arboles = sinica.parsed_sents()
train = pcfg(size)
train.carga_pesos()
with open('gramatica1000total.txt', 'r') as g:
    gramatica = g.readlines()
train.carga_gramatica(gramatica)

F1 = 0
f = open('t1000.txt', 'w')
## f = open('Knownwords.txt', 'w')
for i in range(size):
    ## print(i)
    f.write(str(i) + '\n')