def search(searchTerm):
    """Return the best-scoring half of the articles that contain ``searchTerm``.

    The term is converted to ``str`` and stripped, then looked up in
    ``Corpus.wordIndex``. Every article listed for the term is scored with
    TF-IDF:

    * TF  = occurrences of the term in the article / the article's entry in
      ``Corpus.articleIndex`` (used as its total word count),
    * IDF = log10(number of indexed articles / number of articles that
      contain the term).

    The hits are sorted by descending score and only the top half (rounded
    down) is kept, printed and returned.

    Args:
        searchTerm: The term to look up; any object accepted by ``str()``.

    Returns:
        A list of ``(article, tfidf)`` tuples in descending score order, or
        an empty list when the term is unknown or fewer than two articles
        contain it.
    """
    starttime = time.time()
    print(">>SEARCHARTICLES: Search for top TF-IDF values has started.")
    totaldoccount = len(Corpus.articleIndex)
    term = str(searchTerm).strip()

    if term not in Corpus.wordIndex:
        print(">>SEARCHARTICLES: '%s' 0 articles. Search terminated." % term)
        return []

    articlehits = Corpus.wordIndex[term]
    doccount = len(articlehits)
    # One result is produced per hit, so the "too few" check can run before
    # any scoring. This also keeps an empty posting list from dividing by
    # zero in the IDF computation below.
    if doccount < 2:
        print(">>SEARCHARTICLES: '%s' too few articles. Search terminated." % term)
        return []

    idf = math.log10(totaldoccount / float(doccount))
    results = [
        (article, articlehits[article] / float(Corpus.articleIndex[article]) * idf)
        for article in articlehits
    ]

    # Rank on the TF-IDF score, best first.
    results.sort(key=lambda hit: hit[1], reverse=True)
    print(">>SEARCHARTICLES: '%s' \t %s articles." % (term, len(results)))

    # Drop the bottom 50%. Floor division keeps the slice bound an integer
    # even when true division is in effect.
    results = results[:len(results) // 2]
    for article, _ in results:
        print(article)

    totalTime = round(time.time() - starttime, 3)
    print(">>SEARCHARTICLES: Search has completed in %s seconds." % totalTime)
    searchLog(term, len(results), totalTime)
    return results
def search(self, searchTerm):
    """Return the best-scoring half of the articles that contain ``searchTerm``.

    The term is converted to ``str`` and stripped, then looked up in
    ``self.wordIndex``. Every article listed for the term is scored with
    TF-IDF:

    * TF  = occurrences of the term in the article /
      ``self.articleIndex[article][0]`` (used as the article's word count),
    * IDF = log10(number of indexed articles / number of articles that
      contain the term).

    The hits are sorted by descending score and only the top half (rounded
    down) is kept. ``self.sentimentscore`` is set to the sum of
    ``self.articleIndex[article][1]`` over the kept articles; it is 0 when
    the search terminates early.

    Args:
        searchTerm: The term to look up; any object accepted by ``str()``.

    Returns:
        A list of article keys in descending TF-IDF order, or an empty list
        when the term is unknown or fewer than two articles contain it.
    """
    starttime = time.time()
    # Reset up front so an early return never leaves the score of a previous
    # search (or no attribute at all) behind.
    self.sentimentscore = 0
    totaldoccount = len(self.articleIndex)
    term = str(searchTerm).strip()

    if term not in self.wordIndex:
        print(">>SEARCHARTICLES: '%s' has 0 articles. Search terminated." % term)
        return []

    articlehits = self.wordIndex[term]
    doccount = len(articlehits)
    # One result is produced per hit, so the "too few" check can run before
    # any scoring. This also keeps an empty posting list from dividing by
    # zero in the IDF computation below.
    if doccount < 2:
        print(">>SEARCHARTICLES: '%s' has %s article(s). Search terminated." % (term, doccount))
        return []

    idf = math.log10(totaldoccount / float(doccount))
    scored = [
        (article, articlehits[article] / float(self.articleIndex[article][0]) * idf)
        for article in articlehits
    ]

    # Rank on the TF-IDF score, best first.
    scored.sort(key=lambda hit: hit[1], reverse=True)

    # Only the top 50% survives. Floor division keeps both the reported count
    # and the slice bound integral even when true division is in effect.
    kept = len(scored) // 2
    print(">>SEARCHARTICLES: '%s' has %s articles (%s returned)." % (term, len(scored), kept))

    # Strip the scores from the surviving hits and total their sentiment.
    results = [article for article, _ in scored[:kept]]
    self.sentimentscore = sum(self.articleIndex[article][1] for article in results)

    totalTime = round(time.time() - starttime, 3)
    searchLog(term, len(results), totalTime)
    sentimentLog(term, self.sentimentscore)
    return results
def search(self, searchTerm):
    """Return the best-scoring half of the articles that contain ``searchTerm``.

    The term is converted to ``str`` and stripped, then looked up in
    ``self.wordIndex``. Every article listed for the term is scored with
    TF-IDF:

    * TF  = occurrences of the term in the article /
      ``self.articleIndex[article][0]`` (used as the article's word count),
    * IDF = log10(number of indexed articles / number of articles that
      contain the term).

    The hits are sorted by descending score and only the top half (rounded
    down) is kept. ``self.sentimentscore`` is set to the sum of
    ``self.articleIndex[article][1]`` over the kept articles; it is 0 when
    the search terminates early.

    Args:
        searchTerm: The term to look up; any object accepted by ``str()``.

    Returns:
        A list of article keys in descending TF-IDF order, or an empty list
        when the term is unknown or fewer than two articles contain it.
    """
    starttime = time.time()  # logging purposes
    # Reset up front so an early return never leaves the score of a previous
    # search (or no attribute at all) behind.
    self.sentimentscore = 0
    totaldoccount = len(self.articleIndex)
    term = str(searchTerm).strip()

    if term not in self.wordIndex:
        print(">>SEARCHARTICLES: '%s' has 0 articles. Search terminated." % term)
        return []

    articlehits = self.wordIndex[term]
    doccount = len(articlehits)
    # One entry is produced per hit, so the "too few" check can run before
    # any scoring. This also keeps an empty posting list from dividing by
    # zero in the IDF computation below.
    if doccount < 2:
        print(">>SEARCHARTICLES: '%s' has %s article(s). Search terminated." % (term, doccount))
        return []

    idf = math.log10(totaldoccount / float(doccount))
    scored = [
        (article, articlehits[article] / float(self.articleIndex[article][0]) * idf)
        for article in articlehits
    ]

    # Rank on the TF-IDF score, best first.
    scored.sort(key=lambda hit: hit[1], reverse=True)

    # Only the top 50% survives. Floor division keeps both the reported count
    # and the slice bound integral even when true division is in effect.
    kept = len(scored) // 2
    print(">>SEARCHARTICLES: '%s' has %s articles (%s returned)." % (term, len(scored), kept))

    # Strip the scores from the surviving hits and total their sentiment.
    subset = [article for article, _ in scored[:kept]]
    self.sentimentscore = sum(self.articleIndex[article][1] for article in subset)

    totalTime = round(time.time() - starttime, 3)  # logging purposes
    print(">>SEARCHARTICLES: Search has completed in %s seconds." % totalTime)
    searchLog(term, len(subset), totalTime)  # logging purposes
    sentimentArticleLog(term, self.sentimentscore)  # logging purposes
    return subset