def search(self, text, isSvd=False):
    """Find the articles that best match a free-text query.

    The query is split on whitespace, passed through ``cleanVector`` and
    then scored with one of two back-ends:

    * ``isSvd`` true: ``createBagOfWordsFromVector`` followed by
      ``fasterCorrelations`` produces the ranked candidates.
    * ``isSvd`` false (default): the LDA bag-of-words is multiplied by the
      transposed ``self.model.topic_word_`` and then by the transposed
      ``self.model.doc_topic_``; the five highest entries of the first
      row of that product are kept.

    Every candidate's first element is used as an index into
    ``self.listOfArticles`` and resolved through
    ``DatabaseManager.get_link``. The chosen back-end, the cleaned query
    and each result's ``url`` and ``title`` are printed.

    Args:
        text: Query string.
        isSvd: Selects the SVD back-end when true, LDA otherwise.

    Returns:
        A ``(results, elapsed)`` tuple: the list of objects returned by
        ``get_link`` and the wall-clock duration in seconds (the result
        printing is included in the measurement).
    """
    # Single-argument print(...) behaves identically as a Python 2 print
    # statement and as a Python 3 function call.
    if isSvd:
        print("Search with SVD")
    else:
        print("Search with LDA")
    start = time.time()

    cleanedVector = cleanVector(text.split())
    print(cleanedVector)

    if isSvd:
        bagOfWords, indices = createBagOfWordsFromVector(
            cleanedVector, self.amountOfWords, self.dictOfWords, self.idfs)
        # NOTE(review): presumably a sequence of (article index, score)
        # pairs -- only element 0 of each entry is read below; confirm
        # against fasterCorrelations.
        ranked = fasterCorrelations(
            self.matrix, indices, bagOfWords, self.amountOfFiles)
    else:
        bagOfWords = createBagOfWordsForLDA(
            cleanedVector, self.amountOfWords, self.dictOfWords)
        topicScores = np.dot(bagOfWords, np.transpose(self.model.topic_word_))
        docScores = np.dot(topicScores, np.transpose(self.model.doc_topic_))
        # Only the first row is ranked; assumes the query bag-of-words is a
        # single row -- TODO confirm.
        scores = list(docScores[0])
        # One stable descending sort of (index, score) pairs. Matching
        # scores back to indices by equality would append every tied index
        # once per tied value, producing duplicate results and more than
        # five entries. Ties keep ascending index order because sorted()
        # stays stable with reverse=True.
        ranked = sorted(
            enumerate(scores), key=lambda pair: pair[1], reverse=True)[:5]

    dbMan = DatabaseManager()
    results = [dbMan.get_link(self.listOfArticles[hit[0]]) for hit in ranked]
    for link in results:
        print(link.url)
        print(link.title)

    stop = time.time()
    return results, stop - start
def getArticles(self, drillDownPath):
    """Return the links of the articles in the cluster selected by a path.

    Every element of ``drillDownPath`` except the last contributes
    ``'_' + str(element)`` to a file-name suffix. The pickle at
    ``CLUST_DIR + 'b' + suffix + '.pickle'`` is loaded and indexed with
    the last path element; each value found there is used as an index
    into ``self.listOfArticles`` and resolved with
    ``DatabaseManager.get_link``.

    Args:
        drillDownPath: Indexable sequence of cluster selectors; the last
            element picks the entry inside the loaded pickle.

    Returns:
        List of the objects returned by ``get_link``, in the order stored
        in the pickle entry.
    """
    # The pickle to open is named after the path *without* its last step.
    parentSuffix = ''.join('_' + str(step) for step in drillDownPath[:-1])
    picklePath = CLUST_DIR + 'b' + parentSuffix + '.pickle'
    with open(picklePath, 'rb') as clusterFile:
        parentClusters = pickle.load(clusterFile)

    # Index before creating the manager so a bad path fails first.
    articleNumbers = parentClusters[drillDownPath[-1]]
    manager = DatabaseManager()
    return [manager.get_link(self.listOfArticles[number])
            for number in articleNumbers]