Example #1
 def test_parsed_sents(self):
     parsed_sents = sinica_treebank.parsed_sents()[25]
     self.assertEqual(
         parsed_sents,
         Tree('S', [
             Tree('NP', [Tree('Nba', ['嘉珍'])]),
             Tree('V‧地', [Tree('VA11', ['不停']),
                          Tree('DE', ['的'])]),
             Tree('VA4', ['哭泣'])
         ]))
Example #2
 def test_parsed_sents(self):
     parsed_sents = sinica_treebank.parsed_sents()[25]
     self.assertEqual(parsed_sents,
         Tree('S', [
             Tree('NP', [
                 Tree('Nba', ['嘉珍'])
             ]),
             Tree('V‧地', [
                 Tree('VA11', ['不停']),
                 Tree('DE', ['的'])
             ]),
             Tree('VA4', ['哭泣'])
         ]))
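As an aside, the same expected tree can be built more compactly from its bracketed string form. A minimal sketch, assuming NLTK is installed and the sinica_treebank corpus has been downloaded:

from nltk.tree import Tree

# Tree equality compares labels and children recursively, so a tree parsed
# from bracketed notation is interchangeable with the nested Tree(...) calls above.
expected = Tree.fromstring("(S (NP (Nba 嘉珍)) (V‧地 (VA11 不停) (DE 的)) (VA4 哭泣))")
print(expected.label())   # 'S'
print(expected.leaves())  # ['嘉珍', '不停', '的', '哭泣']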
Example #3
 def test_parsed_sents(self):
     parsed_sents = sinica_treebank.parsed_sents()[25]
     self.assertEqual(
         parsed_sents,
         Tree(
             "S",
             [
                 Tree("NP", [Tree("Nba", ["嘉珍"])]),
                 Tree("V‧地", [Tree("VA11", ["不停"]), Tree("DE", ["的"])]),
                 Tree("VA4", ["哭泣"]),
             ],
         ),
     )
Example #4
 def test_parsed_sents(self):
     parsed_sents = sinica_treebank.parsed_sents()[25]
     self.assertEqual(
         parsed_sents,
         Tree(
             "S",
             [
                 Tree("NP", [Tree("Nba", ["嘉珍"])]),
                 Tree("V‧地", [Tree("VA11", ["不停"]),
                              Tree("DE", ["的"])]),
                 Tree("VA4", ["哭泣"]),
             ],
         ),
     )
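All of the variants above are bodies of a unittest test method. A minimal harness for running one of them might look like this (the class name is illustrative, and the assertion shown is a weaker check than the full-tree comparisons above):

import unittest
from nltk.corpus import sinica_treebank

class SinicaTreebankTest(unittest.TestCase):
    def test_parsed_sents(self):
        parsed = sinica_treebank.parsed_sents()[25]
        # Check only the root label rather than the whole tree.
        self.assertEqual(parsed.label(), 'S')

if __name__ == '__main__':
    unittest.main()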
Example #5
import nltk
from nltk.corpus import sinica_treebank

print(sinica_treebank.sents())
print(sinica_treebank.parsed_sents()[27])
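If the corpus has not been installed yet, the calls above raise a LookupError; a one-time download fixes that:

import nltk
nltk.download('sinica_treebank')  # one-time download into ~/nltk_data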
Example #6
from nltk.book import text1, text2, text3

text1.concordance('god')
text2.concordance('god')
text3.concordance('god', lines=10, width=30)

text1.similar('monstrous')
text2.similar('monstrous')
text3.similar('monstrous')
help(text1.similar)

from nltk.corpus import sinica_treebank, indian
import random

num = random.randrange(len(indian.sents()))
print(indian.sents()[num])

sinica_treebank.parsed_sents()[888].draw()

from nltk.corpus import gutenberg as G
print(G.fileids())
emma = G.words('austen-emma.txt')

# number of words in each file
for fileid in G.fileids():
    words = G.words(fileid)
    print(fileid, len(words))

# number of characters in austen-emma.txt
num_chars = len(G.raw('austen-emma.txt'))
print(num_chars)

# number of characters in each file
for fileid in G.fileids():
    print(fileid, len(G.raw(fileid)))
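Building on the counts above, the two statistics can be combined per file, for example to estimate average word length (a small illustrative sketch):

for fileid in G.fileids():
    n_chars = len(G.raw(fileid))
    n_words = len(G.words(fileid))
    print(fileid, round(n_chars / n_words, 2))  # average characters per word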
Example #7
import nltk
from nltk.tree import Tree
from nltk.corpus import sinica_treebank

# print(sinica_treebank.words())

sinica_treebank.parsed_sents()[36].draw()  # draw() opens a Tk window and returns None
# Tree.fromstring(str(sinica_treebank.parsed_sents()[33])).draw()
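When no display is available, the tree can also be rendered as text; in recent NLTK versions, pretty_print() writes an ASCII-art rendering to stdout:

# Text rendering for headless environments (no Tk window needed):
sinica_treebank.parsed_sents()[36].pretty_print()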
Example #8
import nltk
from nltk.corpus import sinica_treebank
print(sinica_treebank.sents())
print(sinica_treebank.parsed_sents()[27])
Example #9
    def __init__(self, min_nchar, fn, lang="ENG"):
        """
        min_nchar : minimum number of characters to sample.
        fn : path to the file containing text data.
        lang : corpus language ("ENG", "JPN", or "ZHTW"; any other value falls back to sinica_treebank).
        """
        self.min_nchar = min_nchar
        self.fdict = {
            'WORD': self.sample_word,
            'LINE': self.sample_line,
            'PARA': self.sample_para
        }
        self.lang = lang
        # parse English text
        if self.lang == "ENG":
            print('Generate English Data with NLTK:PlaintextCorpusReader')
            corpus = PlaintextCorpusReader("./", fn)

            self.words = corpus.words()
            self.sents = corpus.sents()
            self.paras = corpus.paras()

        # parse Japanese text
        elif self.lang == "JPN":
            print('Generate Japanese Data with NLTK:ChasenCorpusReader')
            # convert fs into chasen file
            _, ext = os.path.splitext(os.path.basename(fn))
            fn_chasen = fn.replace(ext, ".chasen")
            print("Convert {} into {}".format(fn, fn_chasen))

            cmd = "mecab -Ochasen {} > {}".format(fn, fn_chasen)
            print(
                "The following command was executed to convert the file into ChaSen format (for Japanese)"
            )
            print("\t{}".format(cmd))
            subprocess.call(cmd, shell=True)
            data = ChasenCorpusReader('./', fn_chasen, encoding='utf-8')

            self.words = data.words()
            self.sents = data.sents()
            self.paras = data.paras()

            # jp_sent_tokenizer = nltk.RegexpTokenizer(u'[^ 「」!?。]*[!?。]')
            # jp_chartype_tokenizer = nltk.RegexpTokenizer(u'([ぁ-んー]+|[ァ-ンー]+|[\u4e00-\u9FFF]+|[^ぁ-んァ-ンー\u4e00-\u9FFF]+)')
            #
            # corpus = PlaintextCorpusReader("./",
            #                              fn,
            #                              encoding='utf-8',
            #                              para_block_reader=read_line_block,
            #                              sent_tokenizer=jp_sent_tokenizer,
            #                              word_tokenizer=jp_chartype_tokenizer)
        elif self.lang == "ZHTW":
            print(
                'Generate Traditional Chinese Data with NLTK:sinica_treebank')
            self.words = []
            self.sents = []
            self.paras = []
            #data = SinicaTreebankCorpusReader('./', fn, encoding='utf-8')
            #self.words = data.words()
            #self.sents = data.sents()
            #self.paras = data.parsed_sents()
            self.words = sinica_treebank.words()
            self.sents = sinica_treebank.sents()
            self.paras = sinica_treebank.parsed_sents()
        else:
            self.words = []
            self.sents = []
            self.paras = []
            #data = SinicaTreebankCorpusReader('./', fn, encoding='utf-8')
            #self.words = data.words()
            #self.sents = data.sents()
            #self.paras = data.parsed_sents()
            self.words = sinica_treebank.words()
            self.sents = sinica_treebank.sents()
            self.paras = sinica_treebank.parsed_sents()
        # distribution over line/words for LINE/PARA:
        self.p_line_nline = np.array([0.85, 0.10, 0.05])
        self.p_line_nword = [4, 3, 12]  # normal: (mu, std)
        self.p_para_nline = [1.0, 1.0]  #[1.7,3.0] # beta: (a, b), max_nline
        self.p_para_nword = [1.7, 3.0, 10]  # beta: (a,b), max_nword

        # probability to center-align a paragraph:
        self.center_para = 0.5
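For illustration, here is a hedged sketch of how the distribution parameters above might be consumed when sampling a line. The clipping bounds and rng usage are assumptions, not taken from the original code:

import numpy as np

rng = np.random.default_rng()
# Number of lines drawn from p_line_nline; words per line from a clipped normal.
nline = rng.choice([1, 2, 3], p=[0.85, 0.10, 0.05])
mu, std, max_nword = 4, 3, 12  # matches p_line_nword above
nword = int(np.clip(rng.normal(mu, std), 1, max_nword))
print(nline, nword)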
Example #10
# -*- coding: utf-8 -*-
"""
Created on Tue Aug  2 08:14:38 2016

@author: alex
"""

from nltk.corpus import sinica_treebank

sents = sinica_treebank.parsed_sents()[15]

print(type(sents))

sents.draw()
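Besides draw(), the returned Tree supports programmatic inspection; a few commonly used accessors:

tree = sinica_treebank.parsed_sents()[15]
print(tree.label())   # root nonterminal
print(tree.leaves())  # the tokens of the sentence
print(tree.pos())     # (token, tag) pairs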
Example #11
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import sinica_treebank

sinica_text = nltk.Text(sinica_treebank.words())
print(sinica_text)
for (word, tag) in sinica_treebank.tagged_words()[:8]:
    print('%s/%s' % (word, tag))

print(sinica_treebank.parsed_sents()[15])
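The tagged words also lend themselves to a quick tag-frequency count, for example:

from nltk import FreqDist

fd = FreqDist(tag for (word, tag) in sinica_treebank.tagged_words())
print(fd.most_common(5))  # the five most frequent POS tags in the corpus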
Example #12
import nltk
from nltk.corpus import sinica_treebank
print(sinica_treebank.sents())
print(sinica_treebank.parsed_sents()[27])
sinica_treebank.parsed_sents()[27].draw()
Example #13
from nltk.corpus import (treebank, ptb, sinica_treebank, conll2007,
                         words, stopwords, names)

print(treebank.words('wsj_0003.mrg'))
print(treebank.tagged_words('wsj_0003.mrg'))
print(treebank.parsed_sents('wsj_0003.mrg')[0])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('ptb')
print(ptb.fileids())  # doctest: +SKIP
# download the corpus from here: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/treebank.zip
# then extract and place to the following location: .../nltk_data/corpora/ptb/
print(ptb.words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
print(ptb.tagged_words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
# print(ptb.categories())  # doctest: +SKIP
# print(ptb.fileids('news'))  # doctest: +SKIP
# print(ptb.words(categories=['humor', 'fiction']))  # doctest: +SKIP
# nltk.download('sinica_treebank')
print(sinica_treebank.sents())  # doctest: +SKIP
print(sinica_treebank.parsed_sents()[25])  # doctest: +SKIP
# nltk.download('conll2007')
print(conll2007.sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0].tree())  # doctest: +SKIP
# for tree in ycoe.parsed_sents('cocuraC')[:4]:
#     print(tree)  # doctest: +SKIP
# word lists and lexicons
print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS
print(stopwords.fileids())  # doctest: +ELLIPSIS
print(stopwords.words('portuguese'))  # doctest: +ELLIPSIS
# nltk.download('names')
print(names.fileids())
print(names.words('male.txt'))  # doctest: +ELLIPSIS
print(names.words('female.txt'))  # doctest: +ELLIPSIS
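As a small follow-up combining two of the lexicons above, here is a sketch that filters English stopwords out of the word list:

en_stop = set(stopwords.words('english'))
content_words = [w for w in words.words('en') if w.lower() not in en_stop]
print(len(content_words))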
Example #14
    b= "".join(s)
##    print "B",b
    f.write('Corpus: '+str(b)+'\n')
    r1= tree2set(str(a))
##    print "Tree A: "+str(r1)
    r2= tree2set(str(b))
##    print "Tree B: "+str(r2)
    return lp_lr(r2,r1)#parseval(r2,r1), labeled_recall(r2,r1),lp_lr(r2,r1)



##    ## TRAIN + TEST 1000

from nltk.corpus import sinica_treebank as sinica

size = 1000
frases = sinica.sents()
arboles = sinica.parsed_sents()
train = pcfg(size)
train.carga_pesos()

with open('gramatica1000total.txt', 'r') as g:
    gramatica = g.readlines()

train.carga_gramatica(gramatica)

F1 = 0
f = open('t1000.txt', 'w')
## f = open('Knownwords.txt', 'w')
for i in range(size):
##    print(i)
    f.write(str(i) + '\n')
##    print('\n\n')
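For context, the F1 accumulated above is the harmonic mean of labeled precision and recall. A minimal sketch of that combination (the actual lp_lr() implementation is not shown in this snippet):

def f1_score(precision, recall):
    # Harmonic mean of labeled precision and recall.
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)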