def main_search(Query, WebQuery=None, EmailQuery=None, Path=None):
    """Rank the links stored in ``DumpDB.csv`` against ``Query``.

    Loads ``<Path>/DumpDB.csv``, keeps the query words that exist as column
    names in that table, ranks the rows with ``SortUtils.rank_tfidfMatrix``
    and reorders the ``1311WebLinks`` column with ``SortUtils.filter_result``.

    NOTE(review): another ``main_search`` appears later in this file; if both
    live at module level the later definition shadows this one -- confirm
    which one callers are meant to reach.

    Args:
        Query: Raw query string.
        WebQuery: When truthy, the function returns ``None`` (checked before
            ``EmailQuery``).
        EmailQuery: When truthy (and ``WebQuery`` is falsy), the first ranked
            link is returned.
        Path: Directory that contains ``DumpDB.csv``. Required in practice
            even though it defaults to ``None``.

    Returns:
        ``sorted_Links[0]`` when only ``EmailQuery`` is truthy, otherwise
        ``None``.

    Raises:
        TypeError: If ``Path`` is ``None``.
    """
    if Path is None:
        # The original ``None + str`` failed with an opaque TypeError; keep
        # the exception type but say what is actually wrong.
        raise TypeError("Path must be the directory containing DumpDB.csv")

    data = pd.read_csv(Path + '/DumpDB.csv')
    Links = data['1311WebLinks']

    # Turn the raw query into the list of words used for column lookup.
    [Query_clean] = SortUtils.cleanStrings([Query])
    QueryChars = SortUtils.target_words_extract(Query_clean)

    # Keep only the words that are real columns. The DataFrame attribute is
    # ``columns``; the previous ``data.column`` raised AttributeError.
    known_columns = set(data.columns)
    QueryChars = [c for c in QueryChars if c in known_columns]

    # presumably one TF-IDF column per word -- TODO confirm against the
    # script that produces DumpDB.csv.
    QueryTFIDFMatrix = data[QueryChars]
    # NOTE(review): meaning of the second argument (0) is defined by
    # SortUtils.rank_tfidfMatrix and is not visible here.
    RankResult_index = SortUtils.rank_tfidfMatrix(QueryTFIDFMatrix, 0)
    [sorted_Links] = SortUtils.filter_result([Links], RankResult_index)

    if WebQuery:
        return None
    if EmailQuery:
        return sorted_Links[0]
    return None
def main_search(Query, WebQuery=None, EmailQuery=None):
    """Run a live search for ``Query`` and return the re-ranked results.

    Links come from ``google_search_engine`` and are scraped with
    ``scrape_webs``. Each scraped question/answer pair is added to a
    ``tfidf.tfidf()`` table and compared with the query words. The final
    score of a result is ``similarity * 0.6 + votes * 0.4`` after both
    inputs have gone through ``SortUtils.normalize`` (votes additionally go
    through ``SortUtils.mathlog`` first).

    Args:
        Query: Raw query string.
        WebQuery: When truthy, the re-ranked web results are returned
            (checked before ``EmailQuery``).
        EmailQuery: When truthy (and ``WebQuery`` is falsy), the first
            re-ranked link is returned.

    Returns:
        The sorted web results, the top-ranked link, or ``None`` when
        neither flag is truthy.
    """

    def descending_order(scores):
        # Positions of ``scores`` listed from the highest score to the lowest.
        return sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)

    found_links = google_search_engine(Query)
    kept_links, votes, questions, answers, pages = scrape_webs(found_links)
    log("Orignal link:")
    log(kept_links)

    # Break the query down into the words used for the similarity lookup.
    (clean_query,) = SortUtils.cleanStrings([Query])
    query_terms = SortUtils.target_words_extract(clean_query)

    questions = SortUtils.cleanStrings(questions)
    answers = SortUtils.cleanStrings(answers)
    qa_texts = [
        "%s %s" % (question, answer)
        for question, answer in zip(questions, answers)
    ]

    # One TF-IDF document per question/answer pair, named "1", "2", ...
    table = tfidf.tfidf()
    for doc_number, text in enumerate(qa_texts, 1):
        stripped = SortUtils.remove_punctuation(text)
        tokens = [
            token.encode("ascii", "ignore")
            for token in SortUtils.target_words_extract(stripped)
        ]
        table.addDocument(str(doc_number), tokens)

    # Only the second item of each similarity entry is used as the score.
    similarity = [entry[1] for entry in table.similarities(query_terms)]
    similarity = SortUtils.normalize(similarity)

    votes = SortUtils.normalize(SortUtils.mathlog(votes))

    fitness = [sim * 0.6 + vote * 0.4 for sim, vote in zip(similarity, votes)]
    order_by_fitness = descending_order(fitness)
    order_by_similarity = descending_order(similarity)

    log("sort by similarity")
    log(order_by_similarity)
    log("votes")
    log(votes)
    log("sort by Fitvalue")
    log(order_by_fitness)

    ranked_links, ranked_pages = SortUtils.filter_result(
        [kept_links, pages], order_by_fitness
    )
    log("new sorted links")
    log(ranked_links)
    log("sorted_WebResult")
    log(ranked_pages)

    log(WebQuery)
    if WebQuery:
        return ranked_pages
    if EmailQuery:
        return ranked_links[0]
    return None