Example #1
 def test_sents(self):
     sents = conll2007.sents('esp.train')[0]
     self.assertEqual(
         sents[:6], ['El', 'aumento', 'del', 'índice', 'de', 'desempleo']
     )
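The method above is an extract from a unittest test case. A minimal self-contained harness it could run in might look as follows; the TestConll2007 class name and the surrounding boilerplate are assumptions, only the test body comes from the original:

import unittest

from nltk.corpus import conll2007


class TestConll2007(unittest.TestCase):  # hypothetical wrapper class
    def test_sents(self):
        # First sentence of the Spanish CoNLL-2007 training file.
        sents = conll2007.sents('esp.train')[0]
        self.assertEqual(
            sents[:6], ['El', 'aumento', 'del', 'índice', 'de', 'desempleo']
        )


if __name__ == '__main__':
    unittest.main()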
Example #2
 def test_sents(self):
     sents = conll2007.sents('esp.train')[0]
     self.assertEqual(
         sents[:6],
         ['El', 'aumento', 'del', 'índice', 'de', 'desempleo']
     )
Example #3
 def test_sents(self):
     sents = conll2007.sents("esp.train")[0]
     self.assertEqual(sents[:6],
                      ["El", "aumento", "del", "índice", "de", "desempleo"])
Example #4
import nltk
from nltk.corpus import (abc, brown, conll2000, conll2002, conll2007,
                         framenet as fn, genesis, inaugural, state_union,
                         subjectivity)

# `books_raw` (the raw text) and `sentence_to_wordlist` (a tokenizing helper)
# are defined earlier in the original script; `tokenizer` is assumed to be a
# Punkt sentence tokenizer loaded along these lines:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

raw_sentences = tokenizer.tokenize(books_raw)

book_sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        book_sentences.append(sentence_to_wordlist(raw_sentence))

#print(raw_sentences[5])
#print(book_sentences[5])

conll2000_corp_sents = conll2000.sents()
print("condll2000 to sents")
conll2002_corp_sents = conll2002.sents()
print("conll2002 to sents")

conll2007_corp_sents = conll2007.sents()
print("condll2007 to sents")
inaugural_corp_sents = inaugural.sents()
print("inaugural to sents")
abc_corp_sents = abc.sents()
print("ABC to sentences")
genesis_corp_sents = genesis.sents()
print("Genesis to sents")
frame_net_corp_sents = fn.sents()
print("Frame_net to sents")
state_union_corp_sents = state_union.sents()
print('state union to sents')
subject_corp_sents = subjectivity.sents()
print('Subjectivity to sents')
brown_corp_sents = brown.sents()
print("Brown corpus to sents")
Example #5
 def test_sents(self):
     sents = conll2007.sents("esp.train")[0]
     self.assertEqual(sents[:6], ["El", "aumento", "del", "índice", "de", "desempleo"])
Example #6
from nltk.corpus import (cmudict, conll2007, names, ptb, sinica_treebank,
                         stopwords, treebank, words)

print(treebank.parsed_sents('wsj_0003.mrg')[0])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('ptb')
print(ptb.fileids())  # doctest: +SKIP
# download the corpus from here: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/treebank.zip
# then extract and place to the following location: .../nltk_data/corpora/ptb/
print(ptb.words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
print(ptb.tagged_words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
# print(ptb.categories())  # doctest: +SKIP
# print(ptb.fileids('news'))  # doctest: +SKIP
# print(ptb.words(categories=['humor', 'fiction']))  # doctest: +SKIP
# nltk.download('sinica_treebank')
print(sinica_treebank.sents())  # doctest: +SKIP
print(sinica_treebank.parsed_sents()[25])  # doctest: +SKIP
# nltk.download('conll2007')
print(conll2007.sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0].tree())  # doctest: +SKIP
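# The parsed sentence is an nltk DependencyGraph; besides .tree(), its .nodes
# dict (token address -> feature dict) can be walked directly. A sketch,
# assuming the conll2007 corpus is installed:
graph = conll2007.parsed_sents('esp.train')[0]
for address, node in sorted(graph.nodes.items()):
    print(address, node['word'], node['rel'])  # address 0 is the artificial root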
# for tree in ycoe.parsed_sents('cocuraC')[:4]:
#     print(tree)  # doctest: +SKIP
# word lists and lexicons
print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS
print(stopwords.fileids())  # doctest: +ELLIPSIS
print(stopwords.words('portuguese'))  # doctest: +ELLIPSIS
# nltk.download('names')
print(names.fileids())
print(names.words('male.txt'))  # doctest: +ELLIPSIS
print(names.words('female.txt'))  # doctest: +ELLIPSIS
# nltk.download('cmudict')
print(cmudict.entries()[653:659])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
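cmudict is usually more convenient as a lookup table than as a flat entry list; a short sketch, with the word 'fire' as an illustrative lookup:

from nltk.corpus import cmudict

# Build a word -> list-of-pronunciations mapping once, then query it.
prondict = cmudict.dict()
print(prondict['fire'])  # [['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]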