Example #1
def snippetGeneration():
    # build the baseline index and runs, then fetch the query map
    baselineRunsNoTextTrans()
    qMap = RetrievalModels.fetchQueryMap()
    docScorePerQuery = RetrievalModels.selectRetrievalModel(
        INVERTED_INDEX[0], NUM_OF_TOKEN_PER_DOC[0], 1, TYPE_OF_OUTPUTS[0],
        qMap)
    DisplayResult.main(docScorePerQuery)
Example #2
def queryEnrichment():
    # fetch queryMap
    qMap = RetrievalModels.fetchQueryMap()
    # run baseline for BM25 (no text transformation)
    docScorePerQuery = RetrievalModels.selectRetrievalModel(
        INVERTED_INDEX[0], NUM_OF_TOKEN_PER_DOC[0], 1, TYPE_OF_OUTPUTS[0],
        qMap)
    DOC_SCORES_PER_QUERY_PER_RUN['QueryRefinement-BM25'] = \
        QueryEnrichment.main(docScorePerQuery)
Example #3
def main(qmap):

    correctedQueries = {}
    create_words_dict()
    create_bigram_index()
    create_query_bigram_index()

    global INVERTED_INDEX

    print "Inducing errors in Queries:"
    error_queries = SpellingErrorGenerator.main(qmap)
    sorted_error_queries = sorted(error_queries.items(),
                                  key=operator.itemgetter(0))
    INVERTED_INDEX = RetrievalModels.fetchInvertedIndex(
        RetrievalModels.INVERTED_INDEX[0])
    generate_term_frequency_dict(INVERTED_INDEX)

    print "Generating new queries:"
    for query_id, query in sorted_error_queries:
        new_query = softmatching(query_id, query)
        print("Old Query -> " + str(query_id) + ": " + query)
        print("New Query -> " + str(query_id) + ": " + new_query + "\n")
        correctedQueries[query_id] = new_query

    return correctedQueries
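
The softmatching() helper is not shown in this excerpt. A common way to implement this kind of correction is character-bigram overlap (Dice coefficient) against a dictionary of known words; the sketch below illustrates that approach under that assumption, with hypothetical names rather than the project's API.

def char_bigrams(word):
    # character bigrams of a word, e.g. "query" -> {"qu", "ue", "er", "ry"}
    return set(word[i:i + 2] for i in range(len(word) - 1))


def best_correction(term, dictionary):
    # pick the dictionary word whose bigram set has the highest Dice
    # coefficient with the (possibly misspelled) term's bigram set
    term_bigrams = char_bigrams(term)
    best, best_score = term, 0.0
    for word in dictionary:
        word_bigrams = char_bigrams(word)
        if not term_bigrams or not word_bigrams:
            continue
        score = 2.0 * len(term_bigrams & word_bigrams) / \
            (len(term_bigrams) + len(word_bigrams))
        if score > best_score:
            best, best_score = word, score
    return best

# e.g. best_correction("portible", ["portable", "possible"]) -> "portable"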
Example #4
def softMatching():
    global DOC_SCORES_PER_QUERY_PER_RUN
    baselineRunsNoTextTrans()
    qmap = RetrievalModels.fetchQueryMap()
    newQueries = SoftMatchingQuerHandler.main(qmap)
    if os.path.exists(TYPE_OF_OUTPUTS[4]):
        shutil.rmtree(TYPE_OF_OUTPUTS[4])
    os.makedirs(TYPE_OF_OUTPUTS[4])
    docScorePerQuery = RetrievalModels.selectRetrievalModel(
        RetrievalModels.INVERTED_INDEX[0],
        RetrievalModels.NUM_OF_TOKEN_PER_DOC[0], 1, TYPE_OF_OUTPUTS[4],
        newQueries)

    if os.path.exists(DOC_SCORES_PER_QUERY_PER_RUN_PICKLE):
        DOC_SCORES_PER_QUERY_PER_RUN = \
            PerformanceEvaluation.fetchDocScoresPerQueryPerRun()
    DOC_SCORES_PER_QUERY_PER_RUN['SoftQueryMatching-BM25'] = docScorePerQuery
    writeDocScoresToPickleFile(DOC_SCORES_PER_QUERY_PER_RUN)
    PerformanceEvaluation.main(DOC_SCORES_PER_QUERY_PER_RUN)
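
writeDocScoresToPickleFile() and PerformanceEvaluation.fetchDocScoresPerQueryPerRun() are not shown here; they presumably round-trip the run dictionary through the file named by DOC_SCORES_PER_QUERY_PER_RUN_PICKLE. A minimal sketch under that assumption (the file name is hypothetical):

import pickle

# hypothetical file name; the real constant lives elsewhere in the project
DOC_SCORES_PER_QUERY_PER_RUN_PICKLE = "doc_scores_per_query_per_run.pickle"


def writeDocScoresToPickleFile(docScoresPerQueryPerRun):
    # persist the accumulated run results so later phases can reload them
    with open(DOC_SCORES_PER_QUERY_PER_RUN_PICKLE, "wb") as f:
        pickle.dump(docScoresPerQueryPerRun, f)


def fetchDocScoresPerQueryPerRun():
    # reload the run results written by an earlier phase
    with open(DOC_SCORES_PER_QUERY_PER_RUN_PICKLE, "rb") as f:
        return pickle.load(f)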
Example #5
def baselineRunsWithStemming():
    # generate tokens with stemming as text transformation
    GenerateTokenizedCorpus.selectTypeOfTextTransformation(3)
    # generate unigram index with the stemmed corpus
    Indexer.selectTheCorpusForIndexing(3)
    # fetch queryMap
    qMap = RetrievalModels.fetchStemmedQueries()
    if os.path.exists(TYPE_OF_OUTPUTS[2]):
        shutil.rmtree(TYPE_OF_OUTPUTS[2])
    os.makedirs(TYPE_OF_OUTPUTS[2])

    # run all baselines (BM25, TF-IDF, Smoothed Query Likelihood Model)
    # on the stemmed corpus
    for i in range(1, 4):
        RetrievalModels.selectRetrievalModel(INVERTED_INDEX[2],
                                             NUM_OF_TOKEN_PER_DOC[2], i,
                                             TYPE_OF_OUTPUTS[2], qMap)
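
The delete-then-recreate handling of the output folders recurs throughout these examples; it could be factored into a single helper, sketched below (the name is hypothetical):

import os
import shutil


def reset_output_dir(path):
    # drop any stale results from a previous run, then recreate the folder
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)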
Example #6
def evaluate(docScoresPerQuery):
    tableQueryMap = {}
    for queryID in docScoresPerQuery:
        #print "Generating precision recall table for queryID: "+str(queryID)
        relevantDocs = RetrievalModels.fetchRelevantDocIds(queryID)
        if len(relevantDocs) == 0:
            continue
        tableQueryMap[queryID] = generatePrecisionRecallTables(
            docScoresPerQuery[queryID], relevantDocs, queryID)
    return tableQueryMap
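
generatePrecisionRecallTables() is not shown above. For reference, a minimal sketch of the standard precision/recall-at-rank table it presumably computes for one query (names hypothetical; assumes relevantDocs is non-empty, which the caller above guarantees):

import operator


def precision_recall_table(docScores, relevantDocs):
    # walk the ranking from best to worst score, recording precision
    # and recall after each retrieved document
    ranked = sorted(docScores.items(), key=operator.itemgetter(1),
                    reverse=True)
    table, hits = [], 0
    for rank, (doc_id, _score) in enumerate(ranked, start=1):
        if doc_id in relevantDocs:
            hits += 1
        table.append((rank, doc_id,
                      float(hits) / rank,
                      float(hits) / len(relevantDocs)))
    return table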
def main(docScorePerQuery):

    if os.path.exists(TYPE_OF_OUTPUTS[3]):
        shutil.rmtree(TYPE_OF_OUTPUTS[3])
    os.makedirs(TYPE_OF_OUTPUTS[3])
    qMap = RetrievalModels.fetchQueryMap()
    invertedIndex = RetrievalModels.fetchInvertedIndex(
        RetrievalModels.INVERTED_INDEX[0])
    newQMap = {}
    for queryID in docScorePerQuery:
        newQuery = performQueryEnrichment(docScorePerQuery[queryID],
                                          qMap[queryID], invertedIndex,
                                          queryID)
        writeNewQueryTofile(ENRICHED_QUERY_FILE_NAME, newQuery, queryID)
        newQMap[queryID] = newQuery
    docScorePerQuery = RetrievalModels.selectRetrievalModel(
        RetrievalModels.INVERTED_INDEX[0],
        RetrievalModels.NUM_OF_TOKEN_PER_DOC[0], 1, TYPE_OF_OUTPUTS[3],
        newQMap)
    return docScorePerQuery
Example #8
def baselineRunsNoTextTrans():

    # generate tokens with no text transformation
    GenerateTokenizedCorpus.selectTypeOfTextTransformation(1)
    # generate unigram index from the untransformed corpus
    Indexer.selectTheCorpusForIndexing(1)
    # fetch queryMap
    qMap = RetrievalModels.fetchQueryMap()
    if os.path.exists(TYPE_OF_OUTPUTS[0]):
        shutil.rmtree(TYPE_OF_OUTPUTS[0])
    os.makedirs(TYPE_OF_OUTPUTS[0])
    # run all baselines (BM25, TF-IDF, Smoothed Query Likelihood Model)
    # on the corpus with no text transformation
    for i in range(1, 4):
        docScorePerQuery = RetrievalModels.selectRetrievalModel(
            INVERTED_INDEX[0], NUM_OF_TOKEN_PER_DOC[0], i, TYPE_OF_OUTPUTS[0],
            qMap)
        DOC_SCORES_PER_QUERY_PER_RUN[
            'NoTextTran-' + ORDER_OF_EXECUTION[i - 1]] = docScorePerQuery
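
The run labels above assume ORDER_OF_EXECUTION lists the model names in the same order as the selector values 1 to 3 passed to selectRetrievalModel; under that assumption the constant would look like this (values hypothetical):

# hypothetical: entry i - 1 must name the model chosen by selector i
ORDER_OF_EXECUTION = ['BM25', 'TFIDF', 'SmoothedQueryLikelihood']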
Example #9
def create_query_bigram_index():
    # include the query word pairs in the bigram index
    global QUERY_BIGRAM_INDEX
    qMap = RetrievalModels.fetchQueryMap()
    for queryId in qMap:
        line = qMap[queryId]
        line = line.split()
        for i in range(0, len(line) - 1):
            term = line[i] + " " + line[i + 1]
            if term in QUERY_BIGRAM_INDEX:
                QUERY_BIGRAM_INDEX[term] += 1
            else:
                QUERY_BIGRAM_INDEX[term] = 1
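
A quick illustration of the counting above, with hypothetical queries rather than the project's data:

qMap = {1: "code fault recovery", 2: "fault recovery time"}
index = {}
for line in qMap.values():
    words = line.split()
    for i in range(len(words) - 1):
        term = words[i] + " " + words[i + 1]
        index[term] = index.get(term, 0) + 1
# index == {"code fault": 1, "fault recovery": 2, "recovery time": 1}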
def main(docScorePerQuery):
    # if the folder storing retrieved results with snippets already
    # exists, delete it and create a fresh one
    if os.path.exists(SNIPPETS_FOLDER_PATH):
        shutil.rmtree(SNIPPETS_FOLDER_PATH)
    os.mkdir(SNIPPETS_FOLDER_PATH)
    # fetch stopwords
    fetch_stopwords()

    # fetch QueryMap
    queryMap = RetrievalModels.fetchQueryMap()
    print("Storing the top 10 documents with snippets for all queries")
    for queryID in docScorePerQuery:
        generate_snippets(queryID, queryMap[queryID],
                          docScorePerQuery[queryID])
Example #11
def generate_term_frequency_dict(invertedIndex):
    global TERM_FREQUENCY_DICT

    # total collection frequency of every indexed term
    for term, doc_dict in invertedIndex.items():
        TERM_FREQUENCY_DICT[term.lower()] = sum(doc_dict.values())

    qMap = RetrievalModels.fetchQueryMap()
    for queryId in qMap:
        line = qMap[queryId]
        for term in line.split():
            if term in TERM_FREQUENCY_DICT:
                TERM_FREQUENCY_DICT[term] += 1
            else:
                TERM_FREQUENCY_DICT[term] = 1
    fileIndex.write("------------------------------------------\n\n")
    fileIndex.write(newQuery + "\n\n\n")


def main(docScorePerQuery):

    if os.path.exists(TYPE_OF_OUTPUTS[3]):
        shutil.rmtree(TYPE_OF_OUTPUTS[3])
    os.makedirs(TYPE_OF_OUTPUTS[3])
    qMap = RetrievalModels.fetchQueryMap()
    invertedIndex = RetrievalModels.fetchInvertedIndex(
        RetrievalModels.INVERTED_INDEX[0])
    newQMap = {}
    for queryID in docScorePerQuery:
        newQuery = performQueryEnrichment(docScorePerQuery[queryID],
                                          qMap[queryID], invertedIndex,
                                          queryID)
        writeNewQueryTofile(ENRICHED_QUERY_FILE_NAME, newQuery, queryID)
        newQMap[queryID] = newQuery
    docScorePerQuery = RetrievalModels.selectRetrievalModel(
        RetrievalModels.INVERTED_INDEX[0],
        RetrievalModels.NUM_OF_TOKEN_PER_DOC[0], 1, TYPE_OF_OUTPUTS[3],
        newQMap)
    return docScorePerQuery


if __name__ == '__main__':
    # fetch queryMap
    qMap = RetrievalModels.fetchQueryMap()
    docScorePerQuery = RetrievalModels.selectRetrievalModel(
        RetrievalModels.INVERTED_INDEX[0],
        RetrievalModels.NUM_OF_TOKEN_PER_DOC[0], 1, TYPE_OF_OUTPUTS[0],
        qMap)
    main(docScorePerQuery)
def generate_snippets(query_id, query, docScore):
    DOCUMENT_SNIPPET_DICT = {}
    query = str(query).lower()
    sorted_doc_score = sorted(docScore.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    c = 0
    for doc_id, _score in sorted_doc_score:
        c += 1
        # frequencies of every term in the current document
        WORD_FREQUENCY_DICT = {}
        # significance scores for every sentence in the document
        SENTENCE_SCORES = {}
        # fetch the text content of the document from the corpus
        current_doc_path = os.path.join(CORPUS_PATH, doc_id + ".html")
        with open(current_doc_path, "r") as f:
            content = f.read()
        text_content = GenerateTokenizedCorpus.parseHTML(content)
        text_content = text_content.lower()

        sentence_count = 0
        # count term frequencies and the number of sentences

        for line in text_content.split("\n"):
            if line != "":
                sentence_count += 1
                for term in line.split():
                    term = RetrievalModels.removePunctuation(term)
                    if term in WORD_FREQUENCY_DICT:
                        WORD_FREQUENCY_DICT[term] += 1
                    else:
                        WORD_FREQUENCY_DICT[term] = 1
        # compute the significance score for each sentence

        for line in text_content.split("\n"):
            if line != "":
                significant_word_count = 0
                first_index = 0
                last_index = 0
                term_list = line.split()

                for i in range(0, len(term_list)):
                    term = RetrievalModels.removePunctuation(term_list[i])
                    if check_significant_term(term, sentence_count,
                                              str(query),
                                              WORD_FREQUENCY_DICT):
                        significant_word_count += 1
                        if first_index == 0:
                            first_index = i + 1
                        last_index = i + 1

                span_length = (last_index - first_index) + 1

                # cast before dividing so integer division cannot
                # truncate the score
                SENTENCE_SCORES[line] = float(
                    significant_word_count**2) / span_length

        sorted_sentence_score = sorted(SENTENCE_SCORES.items(),
                                       key=operator.itemgetter(1),
                                       reverse=True)
        DOCUMENT_SNIPPET_DICT[doc_id] = sorted_sentence_score

        if c == 10:
            break
    # generate retrieved results with snippets for the current query
    genarate_snippet_files(sorted_doc_score, DOCUMENT_SNIPPET_DICT, query_id,
                           query)
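
The per-sentence score in generate_snippets() is the classic Luhn significance factor: the squared count of significant words divided by the length of the span that contains them. A self-contained illustration, using a deliberately simple significance test (query terms only; check_significant_term presumably also applies a frequency threshold based on sentence_count and WORD_FREQUENCY_DICT):

def luhn_score(sentence, significant):
    # squared count of significant words over the span between the
    # first and last significant word, inclusive
    words = sentence.lower().split()
    hits = [i for i, w in enumerate(words) if w in significant]
    if not hits:
        return 0.0
    span = hits[-1] - hits[0] + 1
    return float(len(hits)**2) / span

# e.g. luhn_score("fault recovery is hard", {"fault", "recovery"}) == 2.0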