Exemplo n.º 1
0
def Pdf2Vec(titles):
    """Vectorize local PDFs into Log Entropy TF-IDF vectors.

    Each title is resolved to ``../data/articleData/pdfs/<title>.pdf``. The
    PDF text is extracted with slate, tokenized with ``wikicorpus.tokenize``,
    converted to bag-of-words with the saved wiki dictionary and finally
    transformed by the saved LogEntropy model. The resulting vectors can then
    be queried against your similarity index.

    Args:
        titles: Iterable of PDF file names, without the ``.pdf`` extension.

    Returns:
        [document-logent-vec-1, document-logent-vec-2, ..., document-logent-vec-N]
        where N is the number of titles, in the same order as ``titles``.
    """
    # TODO: Accept a model as an argument so a document can be vectorized
    # with any trained gensim model.

    # Loaded once up front; both objects are reused for every title.
    logent = LogEntropyModel.load('../models/logEntropy.model')
    diction = Dictionary.load('../models/wiki_dict.dict')

    ret_lst = []
    for title in titles:
        pdf_path = '../data/articleData/pdfs/' + title + '.pdf'
        # PDFs are binary, so open in 'rb'. The with-block guarantees the
        # handle is closed even when parsing raises.
        with open(pdf_path, 'rb') as curr_file:
            doc = ' '.join(slate.PDF(curr_file))
        doc_tokens = wikicorpus.tokenize(doc)
        bow = diction.doc2bow(doc_tokens)
        ret_lst.append(logent[bow])

    return ret_lst
Exemplo n.º 2
0
# Persist the bag-of-words corpus so later stages can stream it from disk.
print('Finished making the wikicorpus, saving BOW corpus\n')
corpora.mmcorpus.MmCorpus.serialize('../data/wiki_en_vocab200k', wiki_corpus)
print('Done saving BOW Corpus\n')

# Save the dictionary; you will need it to convert future documents into
# BOW format. (Left disabled here, as in the original script.)

#wiki.dictionary.save("../data/wiki_dict.dict")
#print 'Saved dictionary'

print('Creating LogEntropy TF-IDF and regular TF-IDF matrices and models')
BOW_corpus = MmCorpus('../data/wiki_en_vocab200k')  # Resurrect BOW corpus

# The LogEntropy model ships with the project, so it is loaded rather than
# retrained. To rebuild it from the BOW corpus instead:
#log_entropy = LogEntropyModel(BOW_corpus)
#log_entropy.save('../models/logEntropy.model') #already provided
log_entropy = LogEntropyModel.load('../models/logEntropy.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
                                    log_entropy[BOW_corpus])

print('Saved LogEntropy TF-IDF matrix')

# Same pattern for the plain TF-IDF model. To rebuild it instead:
#tfidf = TfidfModel(BOW_corpus)
#tfidf.save('../models/tfidf.model') #already provided
tfidf = TfidfModel.load('../models/tfidf.model')
# Write to a dedicated file: reusing '../data/log_entropy_matrix' here would
# overwrite the LogEntropy matrix that the similarity index is built from.
corpora.mmcorpus.MmCorpus.serialize('../data/tfidf_matrix',
                                    tfidf[BOW_corpus])

print('Saved TF-IDF matrix')

print('Creating Similarity Index')
logent_corpus = MmCorpus('../data/log_entropy_matrix')