Example no. 1
0
def load_genesis_corpus() -> np.ndarray:
    """Download the NLTK Genesis corpus and return its sentences as strings.

    Keeps only sentences of 1..30 tokens that contain at least one purely
    alphabetic token, normalizes the LaTeX-style quote tokens (`` '' `)
    to plain ASCII quotes, and detokenizes each sentence into one string.

    Returns:
        np.ndarray: 1-D object array of detokenized sentence strings, in
        corpus order.
    """
    nltk.download('genesis')

    def keep(sent):
        # 1..30 tokens and at least one alphabetic token.
        return 1 <= len(sent) <= 30 and any(word.isalpha() for word in sent)

    detokenizer = TreebankWordDetokenizer()
    cleaned = []
    for sent in genesis.sents():
        if not keep(sent):
            continue
        text = ' '.join(sent).replace('``', '"').replace("''", '"').replace('`', "'")
        cleaned.append(detokenizer.detokenize(text.split()))
    return np.array(cleaned, dtype=object)
Example no. 2
0
             # NOTE(review): this fragment opens mid-`try` — the `try:` line,
             # the enclosing loop/function, and the `thing` variable are all
             # outside the visible chunk.
             #print("SEARCH TERM: "+thing)
             #print(wikipedia.page(thing))
             #print(wikipedia.page(thing).content)
             # Append the full article text for the current search term.
             pages += wikipedia.page(thing).content
     except wikipedia.DisambiguationError as e:
         # Ambiguous title: retry with a randomly chosen suggested option.
         # NOTE(review): get_wiki's return value is discarded — confirm
         # whether it was meant to contribute to `pages`.
         s = random.choice(e.options)
         get_wiki(s)
         pass
     except:
         # NOTE(review): bare `except:` swallows everything, including
         # KeyboardInterrupt/SystemExit — narrow it when this code is edited.
         pass
 # Reference sentence collections: the tokenized Wikipedia text plus the
 # standard NLTK corpora (each *.sents() is a lazy sequence of token lists).
 b = brown.sents()
 sents = tokenizer.tokenize(pages)
 sense = gutenberg.sents('austen-sense.txt')
 emma = gutenberg.sents('austen-emma.txt')
 persuasion = gutenberg.sents('austen-persuasion.txt')
 bible = genesis.sents('english-kjv.txt')
 blake = gutenberg.sents('blake-poems.txt')
 bryant = gutenberg.sents('bryant-stories.txt')
 burgess = gutenberg.sents('burgess-busterbrown.txt')
 carroll = gutenberg.sents('carroll-alice.txt')
 ch_ball = gutenberg.sents('chesterton-ball.txt')
 ch_brown = gutenberg.sents('chesterton-brown.txt')
 ch_thurs = gutenberg.sents('chesterton-thursday.txt')
 edge = gutenberg.sents('edgeworth-parents.txt')
 mel = gutenberg.sents('melville-moby_dick.txt')
 mil = gutenberg.sents('milton-paradise.txt')
 caesar = gutenberg.sents('shakespeare-caesar.txt')
 hamlet = gutenberg.sents('shakespeare-hamlet.txt')
 macbeth = gutenberg.sents('shakespeare-macbeth.txt')
 whit = gutenberg.sents('whitman-leaves.txt')
 rural = abc.sents('rural.txt')
Example no. 3
0
def pmi_with_cython(input_corpus):
    """Run PMI feature selection with the Cython backend and report timing.

    Args:
        input_corpus (dict): mapping of corpus label -> list of token lists,
            passed straight through to ``interface.run_feature_selection``.

    Returns:
        The scored-matrix object produced by the feature selection.
        (The original computed this and silently discarded it; returning it
        is backward-compatible and lets callers actually use the result.)
    """
    logging.debug(msg='With cython is True')
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus, method='pmi', n_jobs=-1, use_cython=True)
    elapsed_time = time.time() - start
    # Single set of parens: print(("...")) was a py2->py3 conversion artifact.
    print("elapsed_time with cython:{} [sec]".format(elapsed_time))
    return scored_matrix_obj


from nltk.corpus import gutenberg, webtext, genesis, abc

# Lazy sentence views over the four NLTK corpora.
abs_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

# The feature selector wants concrete lists, so force each lazy view here.
input_corpus = dict(
    abs=list(abs_corpus),
    genesis=list(genesis_corpus),
    web=list(web_corpus),
    gutenberg=list(gutenberg_corpus),
)

pmi_with_cython(input_corpus)
pmi_with_parallel(input_corpus)
#pmi_with_threading(input_corpus)
Example no. 4
0
#print(raw_sentences[5])
#print(book_sentences[5])

# Pull sentence lists from each NLTK corpus, printing progress after each
# (some of these are large and slow to read the first time).
# NOTE(review): the corpus readers (conll2000, fn, brown, ...) are imported
# outside this chunk. Progress-message typos fixed ("condll2000",
# "condll2007", "Subjectvity", "Guttenberg", "Freebank"); variable names are
# kept as-is since later code may reference them.
conll2000_corp_sents = conll2000.sents()
print("conll2000 to sents")
conll2002_corp_sents = conll2002.sents()
print("conll2002 to sents")

conll2007_corp_sents = conll2007.sents()
print("conll2007 to sents")
inaugural_corp_sents = inaugural.sents()
print("inaugural to sents")
abc_corp_sents = abc.sents()
print("ABC to sentences")
genesis_corp_sents = genesis.sents()
print("Genesis to sents")
frame_net_corp_sents = fn.sents()
print("Frame_net to sents")
state_union_corp_sents = state_union.sents()
print('state union to sents')
subject_corp_sents = subjectivity.sents()
print('Subjectivity to sents')
brown_corp_sents = brown.sents()
print("Brown corpus to sents")
movie_reviews_corp_sents = movie_reviews.sents()
print("Movie reviews to sents ")
guttenberg_corp_sents = gutenberg.sents()
print("Gutenberg to sents")
treebank_corb_sents = treebank.sents()
print("Treebank to sents")
Example no. 5
0
# Shared token filter for all corpora: keep tokens whose stripped form
# starts with an ASCII letter, apostrophe or hyphen and is not a leftover
# quote artifact. Compiled once instead of re-matching the raw pattern for
# every token of every corpus, and the 9x-duplicated filter expression is
# hoisted into one helper.
_TOKEN_RE = re.compile("[a-zA-Z'-]+")


def _is_clean(w):
    """Return True when the stripped token passes the shared word filter."""
    stripped = w.strip()
    return _TOKEN_RE.match(stripped) is not None and stripped not in ("''", "'")


# Lower-case every clean token from the word corpora into `words`
# (defined earlier, outside this chunk).
for _corpus_words in (words_list.words(), abc.words(),
                      movie_reviews.words(), genesis.words()):
    words.extend(w.lower() for w in _corpus_words if _is_clean(w))

# print(...) form works on both Python 2 and 3 (original used the py2
# print statement).
print("Building clean sentences list")
sentences = []
# One lower-cased, space-joined string per corpus sentence.
for _corpus_sents in (brown.sents(), treebank.sents(), abc.sents(),
                      movie_reviews.sents(), genesis.sents()):
    for s in _corpus_sents:
        sentences.append(' '.join(w.lower() for w in s if _is_clean(w)))

    
def singles(words):
    """Yield the tokens from *words* that pass the word filter.

    A token is kept when it begins with an ASCII letter, apostrophe or
    hyphen, and its stripped form is not the double-quote artifact ``''``.
    An empty sequence yields nothing.
    """
    if len(words) < 1:
        return
    for token in words:
        looks_like_word = re.match("[a-zA-Z'-]+", token)
        if looks_like_word and token.strip() != "''":
            yield token

def doubles(sentences):
    # NOTE(review): this block looks like two unrelated snippets fused by a
    # bad paste. The loop below does no work (nothing happens after the
    # length check, and the rebinding of `s` is discarded), and from
    # `start = ...` onward the body switches to the PMI timing code that
    # also appears at module level further down, referencing names
    # (`time`, `interface`, `input_corpus`) not defined in this function.
    # Recover the original implementation before relying on this.
    for s in sentences:
        s = s.split(' ')
        if len(s) < 2:
            continue
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
        use_cython=True
    )
    elapsed_time = time.time() - start
    print ("elapsed_time with cython:{} [sec]".format(elapsed_time))

from nltk.corpus import abc, genesis, gutenberg, webtext

# Lazy sentence views over the four NLTK corpora.
abs_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

# Materialize each lazy view into a plain list for the feature selector;
# insertion order matches the original literal.
input_corpus = {
    label: list(view)
    for label, view in (
        ('abs', abs_corpus),
        ('genesis', genesis_corpus),
        ('web', web_corpus),
        ('gutenberg', gutenberg_corpus),
    )
}

pmi_with_cython(input_corpus)
pmi_with_parallel(input_corpus)
#pmi_with_threading(input_corpus)