Пример #1
0
def main_search(Query, WebQuery=None, EmailQuery=None, Path=None):
    """Rank the links stored in the dumped TF-IDF table against a query.

    Loads ``<Path>/DumpDB.csv``, keeps only the query terms that exist as
    columns of that table, ranks the rows with
    ``SortUtils.rank_tfidfMatrix`` and reorders the ``1311WebLinks`` column
    with ``SortUtils.filter_result``.

    Args:
        Query: Raw query string.
        WebQuery: When truthy the function returns ``None``; this variant
            produces no web result to hand back.
        EmailQuery: When truthy (and ``WebQuery`` is falsy) the first ranked
            link is returned.
        Path: Directory that contains ``DumpDB.csv``. Required in practice
            even though it defaults to ``None``.

    Returns:
        The top-ranked link when ``EmailQuery`` is truthy and ``WebQuery``
        is not; ``None`` otherwise.

    Raises:
        TypeError: If ``Path`` is ``None``.
        IndexError: Presumably when the ranking yields no link at all and
            ``EmailQuery`` is requested -- depends on what
            ``SortUtils.filter_result`` returns; TODO confirm.
    """
    if Path is None:
        # Same exception type the bare string concatenation would raise,
        # but with a message that points at the actual problem.
        raise TypeError("Path must be the directory that contains DumpDB.csv")

    data = pd.read_csv(Path + '/DumpDB.csv')
    Links = data['1311WebLinks']

    # Normalise the query and split it into candidate terms.
    [Query_clean] = SortUtils.cleanStrings([Query])
    QueryChars = SortUtils.target_words_extract(Query_clean)

    # Only terms that are column labels of the table can be selected below.
    QueryChars = [c for c in QueryChars if c in data.columns]

    QueryTFIDFMatrix = data[QueryChars]
    RankResult_index = SortUtils.rank_tfidfMatrix(QueryTFIDFMatrix, 0)

    [sorted_Links] = SortUtils.filter_result([Links], RankResult_index)

    if WebQuery:
        return None
    if EmailQuery:
        return sorted_Links[0]
    return None
Пример #2
0
    writer = csv.writer(f)
    # Materialise every table's rows once, up front, instead of converting
    # all five tables again for each output row.
    link_rows = filter_links.values.tolist()
    question_rows = filter_question.values.tolist()
    answer_rows = filter_answer.values.tolist()
    vote_rows = filter_questionvotes.values.tolist()
    title_rows = titles.values.tolist()
    # One CSV row per link: the link, question, answer, vote and title cells
    # concatenated in that order. The sibling tables are indexed by position
    # so a table shorter than ``filter_links`` still raises IndexError rather
    # than silently truncating the output.
    for i, link_row in enumerate(link_rows):
        writer.writerow(link_row + question_rows[i] + answer_rows[i]
                        + vote_rows[i] + title_rows[i])

# Flatten each table into a list of strings taken from the first cell of
# every row; question and answer texts are additionally passed through
# SortUtils.cleanStrings.
Links = ['%s' % row[0] for row in filter_links.values.tolist()]
AnswerContent = SortUtils.cleanStrings(
    ['%s' % row[0] for row in filter_answer.values.tolist()])
QuestionContent = SortUtils.cleanStrings(
    ['%s' % row[0] for row in filter_question.values.tolist()])
QuestionVotes = ['%s' % row[0] for row in filter_questionvotes.values.tolist()]

# Each document is a question followed by its answer, separated by a space.
QuestionAndAnswerContent = [
    '%s %s' % (question, answer)
    for question, answer in zip(QuestionContent, AnswerContent)
]

TfidfValueMatrix = SortUtils.cal_tfidf(QuestionAndAnswerContent)

# Remove every column whose name consists solely of digits.
print("cleanning")
numeric_columns = [name for name in TfidfValueMatrix.columns.values
                   if name.isdigit()]
for name in numeric_columns:
    del TfidfValueMatrix[name]
Пример #3
0
def main_search(Query, WebQuery=None, EmailQuery=None):
    """
    Search for ``Query`` and return the scraped results ordered by fit.

    Links from ``google_search_engine`` are scraped with ``scrape_webs``.
    Every "question answer" text is added to a ``tfidf.tfidf`` table and
    compared with the query terms. The normalised similarity (weight 0.6) is
    blended with the question votes after ``SortUtils.mathlog`` and
    ``SortUtils.normalize`` (weight 0.4); results are ordered by that blend,
    highest first.

    Returns the ordered web results when ``WebQuery`` is truthy, otherwise
    the first ordered link when ``EmailQuery`` is truthy, otherwise ``None``.
    """
    candidate_links = google_search_engine(Query)
    kept_links, votes, questions, answers, web_results = scrape_webs(candidate_links)

    log("Orignal link:")
    log(kept_links)

    # Clean the query and break it into the terms used for the comparison.
    (clean_query,) = SortUtils.cleanStrings([Query])
    query_terms = SortUtils.target_words_extract(clean_query)

    questions = SortUtils.cleanStrings(questions)
    answers = SortUtils.cleanStrings(answers)
    documents = ["%s %s" % (question, answer) for question, answer in zip(questions, answers)]

    # Register every document in the TF-IDF table under a 1-based string id.
    table = tfidf.tfidf()
    for doc_number, document in enumerate(documents, 1):
        stripped = SortUtils.remove_punctuation(document)
        words = [w.encode("ascii", "ignore") for w in SortUtils.target_words_extract(stripped)]
        table.addDocument(str(doc_number), words)

    # Keep only the second field of each similarity entry, then normalise.
    similarities = SortUtils.normalize([entry[1] for entry in table.similarities(query_terms)])
    votes = SortUtils.normalize(SortUtils.mathlog(votes))

    fit_values = [sim * 0.6 + vote * 0.4 for sim, vote in zip(similarities, votes)]

    def descending_order(scores):
        # Positions of ``scores`` arranged from the highest score to the lowest.
        return sorted(range(len(scores)), key=lambda position: scores[position], reverse=True)

    order_by_fit = descending_order(fit_values)
    order_by_similarity = descending_order(similarities)
    log("sort by similarity")
    log(order_by_similarity)
    log("votes")
    log(votes)
    log("sort by Fitvalue")
    log(order_by_fit)

    sorted_links, sorted_web_results = SortUtils.filter_result(
        [kept_links, web_results], order_by_fit
    )
    log("new sorted links")
    log(sorted_links)

    log("sorted_WebResult")
    log(sorted_web_results)
    log(WebQuery)

    if WebQuery:
        return sorted_web_results
    if EmailQuery:
        return sorted_links[0]
    return None