Exemplo n.º 1
0
    def search(self, text, isSvd=False):
        """Look up the articles that best match a free-text query.

        The query is split on whitespace, passed through ``cleanVector`` and
        then scored against the indexed articles, either with
        ``fasterCorrelations`` over ``self.matrix`` (SVD mode) or with the
        topic/document matrices of ``self.model`` (LDA mode, the default).

        Args:
            text: Query string.
            isSvd: Select SVD mode when true, LDA mode otherwise.

        Returns:
            A ``(results, elapsed)`` tuple. ``results`` is the list of objects
            returned by ``DatabaseManager.get_link`` for every match, in the
            order the ranking step produced them; ``elapsed`` is the duration
            of the search in seconds as measured with ``time.time()``.
        """
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("Search with SVD" if isSvd else "Search with LDA")

        start = time.time()
        cleanedVector = cleanVector(text.split())
        print(cleanedVector)

        if isSvd:
            bagOfWords, indices = createBagOfWordsFromVector(
                cleanedVector, self.amountOfWords, self.dictOfWords, self.idfs)
            # Each returned item is indexed with [0] below to obtain a
            # position in self.listOfArticles.
            matches = fasterCorrelations(
                self.matrix, indices, bagOfWords, self.amountOfFiles)
        else:
            bagOfWords = createBagOfWordsForLDA(
                cleanedVector, self.amountOfWords, self.dictOfWords)

            # NOTE(review): presumably this maps query words -> topics ->
            # documents, yielding one score per document in row 0; confirm
            # against the shapes of topic_word_ / doc_topic_.
            topicScores = np.dot(bagOfWords,
                                 np.transpose(self.model.topic_word_))
            docScores = np.dot(topicScores,
                               np.transpose(self.model.doc_topic_))

            # Rank (index, score) pairs directly. Matching indices back by
            # score value would emit the same article several times whenever
            # scores tie. sorted() is stable even with reverse=True, so tied
            # documents keep ascending index order.
            topCount = 5
            matches = sorted(enumerate(docScores[0]),
                             key=lambda pair: pair[1],
                             reverse=True)[:topCount]

        dbMan = DatabaseManager()
        results = [dbMan.get_link(self.listOfArticles[match[0]])
                   for match in matches]

        for link in results:
            print(link.url)
            print(link.title)

        stop = time.time()
        return results, stop - start
Exemplo n.º 2
0
    def getArticles(self, drillDownPath):
        """Return the links of the articles selected by ``drillDownPath``.

        All path entries except the last are joined as ``_<entry>`` pieces to
        name a pickle file ``CLUST_DIR + 'b' + <pieces> + '.pickle'``. The
        object loaded from that file is then indexed with the last path entry
        to obtain the article numbers.

        Args:
            drillDownPath: Sequence of indices; must be non-empty because its
                last element is used as the lookup key.

        Returns:
            A list with the result of ``DatabaseManager.get_link`` for each
            selected entry of ``self.listOfArticles``.
        """
        suffixes = ['_' + str(ind) for ind in drillDownPath]
        # The file is named after the parent path, i.e. without the last index.
        parentSuffix = ''.join(suffixes[:-1])
        picklePath = CLUST_DIR + 'b' + parentSuffix + '.pickle'

        # NOTE(review): pickle.load can execute code embedded in the file;
        # this assumes the files under CLUST_DIR are produced by this project.
        with open(picklePath, 'rb') as handle:
            parentClusters = pickle.load(handle)

        artNumbers = parentClusters[drillDownPath[-1]]

        dbMan = DatabaseManager()
        return [dbMan.get_link(self.listOfArticles[num]) for num in artNumbers]