def ksEvaluator(doc, verbose=None):
    """Score *doc* against the baseline CDF.

    The document is pruned with ``text_fun.prune``; the resulting tokens are
    passed through ``jaccard``, then ``cdf`` (evaluated on ``grid_pts``), and
    the outcome is compared with ``baselineCDF`` by ``ks``.

    Args:
        doc: Document to evaluate, in whatever form ``text_fun.prune`` accepts.
        verbose: When truthy, a progress message is printed after each
            intermediate stage.

    Returns:
        Whatever ``ks`` returns for the document's CDF vector and the baseline.
    """
    # Quiet path: run the whole pipeline in one expression.
    if not verbose:
        return ks(cdf(jaccard(text_fun.prune(doc)), grid_pts), baselineCDF)

    # Verbose path: same pipeline, reporting after every stage.
    tokens = text_fun.prune(doc)
    print('Token list created')
    similarity = jaccard(tokens)
    print('Jaccard matrix computed')
    distribution = cdf(similarity, grid_pts)
    print('CDF vector created')
    return ks(distribution, baselineCDF)
def __iter__(self):
    """Yield a bag-of-words vector for every document extracted from ``files``.

    For each file, titles and documents are extracted and paired up. Each
    title is appended as its own line to ``titles.txt`` right before the
    matching document's vector is yielded. A timestamped progress message is
    printed every 10000 files (never for the very first file).

    Yields:
        The result of ``dictionary.doc2bow`` applied to the pruned document.
    """
    for position, file_name in enumerate(files):
        # Progress report on every non-zero multiple of 10000.
        if position and position % 10000 == 0:
            print(time(), '%i files added to corpus.' % position)

        file_titles = titlextractor(file_name)
        file_docs = textractor(file_name)

        for title, doc in zip(file_titles, file_docs):
            # The log is reopened in append mode per title, so it is closed
            # (and flushed) before control is handed back to the consumer.
            with open('titles.txt', 'a') as title_log:
                title_log.write(title + '\n')
            yield dictionary.doc2bow(tf.prune(doc))
# POS tag -> (template, seed_first).  ``seed_first`` is True when the search
# text fills the first placeholder and the stimulus word the second; False
# when the order is reversed.
_IDEA_TEMPLATES = {
    'NN': ('Try blending %s with a %s \n', True),
    'NNS': ('Try blending %s with %s \n', True),
    'JJ': ('Try making %s more %s \n', True),
    'JJS': ('Imagine the %s version of %s \n', False),
    'RBS': ('Imagine the %s version of %s \n', False),
    'JJR': ('Imagine a %s version of %s \n', False),
    'RBR': ('Imagine a %s version of %s \n', False),
    'RB': ('How would %s change if you implemented it %s? \n', True),
    'VB': ('How could you %s with %s? \n', False),
}


def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def median_words(path_to_inputs, num_ideas=30):
    """Generate blending prompts from words of median similarity to the search text.

    Reads ``fulltext.p`` (a collection of documents) and ``search_text.p``
    (the search text) from the current working directory, builds an LSI model
    over the pruned documents, and scores every dictionary-valid corpus word
    by the summed similarity of "<search text> <word>" to all documents.
    Words are then visited in order of increasing distance from the median
    score, and each word whose part-of-speech tag has a template contributes
    one prompt.

    Args:
        path_to_inputs: Currently unused; kept so existing callers keep
            working. Input files are resolved against the working directory.
        num_ideas: Maximum number of prompts to return.

    Returns:
        A list of at most ``num_ideas`` prompt strings, each ending in a
        space followed by a newline.
    """
    textList = [text_fun.prune(doc) for doc in _load_pickle("fulltext.p")]
    dictionary = corpora.Dictionary(textList)  # collects stats for each word
    corpus = [dictionary.doc2bow(doc) for doc in textList]
    lsi = models.LsiModel(corpus, id2word=dictionary,
                          num_topics=len(textList))
    search_text = _load_pickle('search_text.p')
    index = similarities.MatrixSimilarity(lsi[corpus])

    # Every distinct token that survived pruning is a candidate stimulus.
    searchWords = {token for tokens in textList for token in tokens}

    spell_checker = enchant.Dict("en_US")
    wordList = []
    simList = []
    for word in searchWords:
        if not spell_checker.check(word):
            continue
        wordList.append(word)
        phrase = search_text + ' ' + word
        vec_lsi = lsi[dictionary.doc2bow(phrase.split())]  # query in LSI space
        simList.append(sum(index[vec_lsi]))

    # Rank the words by how close their score is to the median score.
    simFrame = pd.DataFrame(simList, index=wordList, columns=['Similarity'])
    medDistFrame = abs(simFrame - simFrame.median())
    ranked_words = medDistFrame.sort_values(by='Similarity',
                                            ascending=True).index

    ideaList = []
    for word in ranked_words:
        if len(ideaList) >= num_ideas:
            break
        wordTag = nltk.pos_tag([word])[0][1]
        entry = _IDEA_TEMPLATES.get(wordTag)
        if entry is None:
            # No prompt template for this part of speech.
            continue
        template, seed_first = entry
        args = (search_text, word) if seed_first else (word, search_text)
        ideaList.append(template % args)
    return ideaList
import text_fun

# NOTE(review): the working directory is switched before the next import,
# presumably so that ksmirnov_fun is found in the Evaluators folder and the
# '../' data paths below resolve against its parent — confirm.
os.chdir(BlenderPath + '/Evaluators')
import ksmirnov_fun
import wikipedia
import gensim
import nltk
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger

# Load the pickled inputs; the context managers close the files promptly.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('../fulltext.p', 'rb') as fulltext_file:
    googleList = pickle.load(fulltext_file)
with open('../search_text.p', 'rb') as search_text_file:
    seed_term = pickle.load(search_text_file)
seed_term = seed_term.lower()

textList = [text_fun.prune(doc) for doc in googleList]
article = wikipedia.page(seed_term).content
ksEvaluator = ksmirnov_fun.ksFunctionGenerator(textList)

# Keep only the text before the first newline of the summary.  Splitting
# (rather than slicing up to ``find('\n')``) keeps the full text when the
# summary contains no newline at all: ``find`` would return -1 and the slice
# would drop the last character.
header = wikipedia.summary(seed_term)
header = header.split('\n', 1)[0]

# Score the header as-is, then with 'consumption' swapped for 'processing'.
print(ksEvaluator(header))
print(ksEvaluator(header.replace('consumption', 'processing')))

# Collect the distinct unstemmed tokens that prune() keeps when
# english_dictionary_words is enabled, then tag them as a single blob.
candidates = []
for item in googleList:
    item = text_fun.prune(item, stem=False, english_dictionary_words=True)
    candidates.extend(item)
candidates = list(set(candidates))
candidates_blob = TextBlob(' '.join(candidates), pos_tagger=PerceptronTagger())