def test_sents(self):
    """Verify the first six tokens of the first CoNLL-2007 Spanish training sentence.

    The source contained this method twice (identical except for quote
    style); in a single class the second definition would silently shadow
    the first, so the duplicate is collapsed into one method here.
    """
    # conll2007.sents(fileid) yields sentences as lists of word strings;
    # assumes the 'conll2007' nltk corpus has been downloaded — TODO confirm.
    sents = conll2007.sents("esp.train")[0]
    self.assertEqual(
        sents[:6],
        ["El", "aumento", "del", "índice", "de", "desempleo"],
    )
# Split the raw book text into sentences and convert each non-empty
# sentence into a word list (sentence_to_wordlist is defined elsewhere).
raw_sentences = tokenizer.tokenize(books_raw)
book_sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if raw_sentence  # skip empty strings produced by the tokenizer
]

# Load sentence lists from a range of NLTK corpora. Each print confirms
# the corresponding corpus reader was created; note .sents() is lazy, so
# the heavy file reads happen on first iteration, not here.
# NOTE(review): progress-message typos fixed ("condll" -> "conll",
# "Subjectvity" -> "Subjectivity"); wording otherwise unchanged.
conll2000_corp_sents = conll2000.sents()
print("conll2000 to sents")

conll2002_corp_sents = conll2002.sents()
print("conll2002 to sents")

conll2007_corp_sents = conll2007.sents()
print("conll2007 to sents")

inaugural_corp_sents = inaugural.sents()
print("inaugural to sents")

abc_corp_sents = abc.sents()
print("ABC to sentences")

genesis_corp_sents = genesis.sents()
print("Genesis to sents")

frame_net_corp_sents = fn.sents()
print("Frame_net to sents")

state_union_corp_sents = state_union.sents()
print('state union to sents')

subject_corp_sents = subjectivity.sents()
print('Subjectivity to sents')

brown_corp_sents = brown.sents()
print("Brown corpus to sents")
print(treebank.parsed_sents('wsj_0003.mrg') [0]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE # nltk.download('ptb') print(ptb.fileids()) # doctest: +SKIP # download the corpus from here: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/treebank.zip # then extract and place to the following location: .../nltk_data/corpora/ptb/ print(ptb.words('treebank/combined/wsj_0003.mrg')) # doctest: +SKIP print(ptb.tagged_words('treebank/combined/wsj_0003.mrg')) # doctest: +SKIP # print(ptb.categories()) # doctest: +SKIP # print(ptb.fileids('news')) # doctest: +SKIP # print(ptb.words(categories=['humor', 'fiction'])) # doctest: +SKIP # nltk.download('sinica_treebank') print(sinica_treebank.sents()) # doctest: +SKIP print(sinica_treebank.parsed_sents()[25]) # doctest: +SKIP # nltk.download('conll2007') print(conll2007.sents('esp.train')[0]) # doctest: +SKIP print(conll2007.parsed_sents('esp.train')[0]) # doctest: +SKIP print(conll2007.parsed_sents('esp.train')[0].tree()) # doctest: +SKIP # for tree in ycoe.parsed_sents('cocuraC')[:4]: # print(tree) # doctest: +SKIP # word lists and lexicons print(words.fileids()) print(words.words('en')) # doctest: +ELLIPSIS print(stopwords.fileids()) # doctest: +ELLIPSIS print(stopwords.words('portuguese')) # doctest: +ELLIPSIS # nltk.download('names') print(names.fileids()) print(names.words('male.txt')) # doctest: +ELLIPSIS print(names.words('female.txt')) # doctest: +ELLIPSIS # nltk.download('cmudict') print(cmudict.entries()[653:659]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE