Example #1
def ksEvaluator(doc, verbose=False):
    # jaccard, cdf, ks, grid_pts, and baselineCDF are bound in the
    # enclosing scope (see ksFunctionGenerator in Example #4).
    tokenList = text_fun.prune(doc)  # tokenize and clean the document
    if verbose:
        print('Token list created')
    jaccard_mat = jaccard(tokenList)  # pairwise Jaccard similarities
    if verbose:
        print('Jaccard matrix computed')
    cdf_vec = cdf(jaccard_mat, grid_pts)  # empirical CDF on the shared grid
    if verbose:
        print('CDF vector created')
    # Kolmogorov-Smirnov distance from the baseline distribution
    return ks(cdf_vec, baselineCDF)
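Example #4 obtains this evaluator from ksmirnov_fun.ksFunctionGenerator(textList), so jaccard, cdf, ks, grid_pts, and baselineCDF are names closed over from that generator's scope. Below is a minimal self-contained sketch of the same generator pattern; make_evaluator and the NumPy ECDF machinery are illustrative stand-ins, not the actual ksmirnov_fun code.

import numpy as np

def make_evaluator(baseline_samples, grid_pts=None):
    # Compute the baseline CDF once; the inner function closes over it.
    if grid_pts is None:
        grid_pts = np.linspace(0.0, 1.0, 101)
    baseline = np.sort(baseline_samples)
    baselineCDF = np.searchsorted(baseline, grid_pts, side='right') / len(baseline)

    def evaluator(samples):
        s = np.sort(samples)
        sampleCDF = np.searchsorted(s, grid_pts, side='right') / len(s)
        # KS statistic: maximum absolute gap between the two CDFs.
        return np.max(np.abs(sampleCDF - baselineCDF))

    return evaluator

# Each evaluator keeps its own baseline state, like ksEvaluator above.
ev = make_evaluator(np.random.uniform(size=1000))
print(ev(np.random.uniform(size=200)))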
Example #2
def __iter__(self):
    # Stream one bag-of-words vector per document so the whole corpus
    # never has to sit in memory. files, titlextractor, textractor,
    # dictionary, and tf are assumed to be bound at module scope.
    for i, file_name in enumerate(files):
        if i % 10000 == 0 and i != 0:
            print(time(), '%i files added to corpus.' % i)
        titles = titlextractor(file_name)
        docs = textractor(file_name)
        for title, doc in zip(titles, docs):
            # Log the title, then yield the pruned document as a BoW vector.
            with open('titles.txt', 'a') as f:
                f.write(''.join((title, '\n')))
            yield dictionary.doc2bow(tf.prune(doc))
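The point of implementing __iter__ this way is that gensim treats any re-iterable object as a corpus, vectorizing documents lazily. A minimal self-contained sketch of that usage; BowCorpus and the toy documents are assumptions for illustration.

from gensim import corpora

# Toy documents stand in for the tokenized files above.
docs = [['human', 'interface'], ['graph', 'trees'], ['human', 'trees']]
dictionary = corpora.Dictionary(docs)

class BowCorpus(object):
    def __iter__(self):
        # One bag-of-words vector per document, produced lazily.
        for doc in docs:
            yield dictionary.doc2bow(doc)

# gensim consumers iterate the corpus without loading it into memory.
corpora.MmCorpus.serialize('corpus.mm', BowCorpus())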
Example #3
import pickle

import enchant
import nltk
import pandas as pd
from gensim import corpora, models, similarities

import text_fun

def median_words(path_to_inputs, num_ideas=30):
    # Load and prune the scraped full texts. (path_to_inputs is unused
    # here; the pickles are read from the working directory.)
    textList = pickle.load(open('fulltext.p', 'rb'))
    textList = [text_fun.prune(doc) for doc in textList]

    dictionary = corpora.Dictionary(textList)  # collects stats for each word
    corpus = [dictionary.doc2bow(doc) for doc in textList]
    lsi = models.LsiModel(corpus, id2word=dictionary,
                          num_topics=len(textList))

    search_text = pickle.load(open('search_text.p', 'rb'))  # loads the search text
    index = similarities.MatrixSimilarity(lsi[corpus])

    # Unique candidate words drawn from the corpus itself.
    searchWords = set()
    for text in textList:
        searchWords.update(text)

    # Score each real dictionary word by its total LSI similarity to the
    # corpus when appended to the search text.
    d = enchant.Dict('en_US')
    simList = []
    wordList = []
    for word in searchWords:
        if d.check(word):
            wordList.append(word)
            phrase = search_text + ' ' + word
            vec_repr = dictionary.doc2bow(phrase.split())
            vec_lsi = lsi[vec_repr]  # convert the query to LSI space
            simList.append(sum(index[vec_lsi]))

    # New stimulus words: the ones closest to the median similarity.
    simFrame = pd.DataFrame(simList, index=wordList,
                            columns=['Similarity'])
    medDistFrame = abs(simFrame - simFrame.median())
    idxMed = medDistFrame.sort_values(by='Similarity', ascending=True).index
    newStimulusWords = pd.Series(idxMed)

    # Turn each stimulus word into an idea prompt, phrased by part of speech.
    counter = 0
    ideaList = []
    for word in newStimulusWords:
        if counter >= num_ideas:
            break
        wordTag = nltk.pos_tag([word])[0][1]
        if wordTag == 'NN':
            ideaList.append('Try blending %s with a %s \n' % (search_text, word))
        elif wordTag == 'NNS':
            ideaList.append('Try blending %s with %s \n' % (search_text, word))
        elif wordTag == 'JJ':
            ideaList.append('Try making %s more %s \n' % (search_text, word))
        elif wordTag in ['JJS', 'RBS']:
            ideaList.append('Imagine the %s version of %s \n' % (word, search_text))
        elif wordTag in ['JJR', 'RBR']:
            ideaList.append('Imagine a %s version of %s \n' % (word, search_text))
        elif wordTag == 'RB':
            ideaList.append('How would %s change if you implemented it %s? \n' % (search_text, word))
        elif wordTag == 'VB':
            ideaList.append('How could you %s with %s? \n' % (word, search_text))
        else:
            continue  # skip tags with no template
        counter += 1

    return ideaList
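A sketch of how the function might be invoked; the path argument is illustrative, since as noted in the comments path_to_inputs is not actually used and the pickles are read from the working directory.

ideas = median_words('.', num_ideas=10)
print(''.join(ideas))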
Example #4
import os
import pickle

import text_fun
os.chdir(BlenderPath + '/Evaluators')  # BlenderPath is assumed to be set earlier
import ksmirnov_fun
import wikipedia
import gensim
import nltk
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger

# Load the scraped full texts and the seed term produced in earlier steps.
googleList = pickle.load(open('../fulltext.p', 'rb'))
seed_term = pickle.load(open('../search_text.p', 'rb'))
seed_term = seed_term.lower()
textList = [text_fun.prune(doc) for doc in googleList]
article = wikipedia.page(seed_term).content
ksEvaluator = ksmirnov_fun.ksFunctionGenerator(textList)

# Sanity check: score the first paragraph of the Wikipedia summary,
# then the same text with one word swapped.
header = wikipedia.summary(seed_term)
header = header[0:header.find('\n')]
print(ksEvaluator(header))
print(ksEvaluator(header.replace('consumption', 'processing')))

# Candidate vocabulary: unique, unstemmed dictionary words from the corpus.
candidates = []
for item in googleList:
    item = text_fun.prune(item, stem=False, english_dictionary_words=True)
    candidates.extend(item)
candidates = list(set(candidates))
candidates_blob = TextBlob(' '.join(candidates),
                           pos_tagger=PerceptronTagger())
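From here, the blob's .tags property runs the perceptron tagger over every candidate in one pass. A hedged sketch of that follow-on step; the noun filter is an assumption, mirroring the tag templates in Example #3.

# .tags yields (word, POS) pairs from the PerceptronTagger passed above.
nouns = [word for word, tag in candidates_blob.tags if tag.startswith('NN')]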