def snippetGeneration():
    """Run the BM25 baseline on the untransformed corpus and display the
    retrieved results (with snippets) for every query."""
    baselineRunsNoTextTrans()
    # fetch queryMap
    qMap = RetrievalModels.fetchQueryMap()
    # model 1 = BM25 over the no-text-transformation index
    docScorePerQuery = RetrievalModels.selectRetrievalModel(
        INVERTED_INDEX[0], NUM_OF_TOKEN_PER_DOC[0], 1,
        TYPE_OF_OUTPUTS[0], qMap)
    DisplayResult.main(docScorePerQuery)
def queryEnrichment():
    """Run the BM25 baseline (no text transformation) and hand its result
    to the query-enrichment stage; cache the enriched run's scores under
    the 'QueryRefinement-BM25' run name."""
    # fetch queryMap
    qMap = RetrievalModels.fetchQueryMap()
    # run baseline for BM25 (no text transformation)
    baselineScores = RetrievalModels.selectRetrievalModel(
        INVERTED_INDEX[0], NUM_OF_TOKEN_PER_DOC[0], 1,
        TYPE_OF_OUTPUTS[0], qMap)
    DOC_SCORES_PER_QUERY_PER_RUN['QueryRefinement-BM25'] = \
        QueryEnrichment.main(baselineScores)
def main(qmap): correctedQueries = {} create_words_dict() create_bigram_index() create_query_bigram_index() global INVERTED_INDEX print "Inducing errors in Queries:" error_queries = SpellingErrorGenerator.main(qmap) sorted_error_queries = sorted(error_queries.items(), key=operator.itemgetter(0)) INVERTED_INDEX = RetrievalModels.fetchInvertedIndex( RetrievalModels.INVERTED_INDEX[0]) generate_term_frequency_dict(INVERTED_INDEX) print "Generating new queries:" for t in sorted_error_queries: query_id = t[0] query = t[1] new_query = softmatching(query_id, query) print "Old Query -> " + str(query_id) + ": " + query print "New Query -> " + str(query_id) + ": " + new_query + "\n" correctedQueries[query_id] = new_query return correctedQueries
def softMatching():
    """Soft-query-matching pipeline: run the baselines, correct the
    queries, re-run BM25 with the corrected queries, then evaluate."""
    global DOC_SCORES_PER_QUERY_PER_RUN
    baselineRunsNoTextTrans()
    qmap = RetrievalModels.fetchQueryMap()
    newQueries = SoftMatchingQuerHandler.main(qmap)
    # recreate the output folder for this run from scratch
    if os.path.exists(TYPE_OF_OUTPUTS[4]):
        shutil.rmtree(TYPE_OF_OUTPUTS[4])
    os.makedirs(TYPE_OF_OUTPUTS[4])
    docScorePerQuery = RetrievalModels.selectRetrievalModel(
        RetrievalModels.INVERTED_INDEX[0],
        RetrievalModels.NUM_OF_TOKEN_PER_DOC[0],
        1, TYPE_OF_OUTPUTS[4], newQueries)
    # merge with scores persisted by earlier runs, if any
    if os.path.exists(DOC_SCORES_PER_QUERY_PER_RUN_PICKLE):
        DOC_SCORES_PER_QUERY_PER_RUN = \
            PerformanceEvaluation.fetchDocScoresPerQueryPerRun()
    DOC_SCORES_PER_QUERY_PER_RUN['SoftQueryMatching-BM25'] = docScorePerQuery
    writeDocScoresToPickleFile(DOC_SCORES_PER_QUERY_PER_RUN)
    PerformanceEvaluation.main(DOC_SCORES_PER_QUERY_PER_RUN)
def baselineRunsWithStemming():
    """Run all three baselines (BM25, TF-IDF, smoothed query likelihood)
    over the stemmed corpus."""
    # tokenize with stemming, then index the stemmed corpus
    GenerateTokenizedCorpus.selectTypeOfTextTransformation(3)
    Indexer.selectTheCorpusForIndexing(3)
    qMap = RetrievalModels.fetchStemmedQueries()
    # recreate the output folder from scratch
    if os.path.exists(TYPE_OF_OUTPUTS[2]):
        shutil.rmtree(TYPE_OF_OUTPUTS[2])
    os.makedirs(TYPE_OF_OUTPUTS[2])
    # models 1..3: BM25, TF-IDF, smoothed query likelihood
    for model in range(1, 4):
        RetrievalModels.selectRetrievalModel(
            INVERTED_INDEX[2], NUM_OF_TOKEN_PER_DOC[2], model,
            TYPE_OF_OUTPUTS[2], qMap)
def evaluate(docScoresPerQuery):
    """Generate a precision/recall table for every query that has
    relevance judgements; queries with no judged documents are skipped.

    Returns a dict mapping query id -> precision/recall table."""
    tables = {}
    for qid, docScores in docScoresPerQuery.items():
        judgedDocs = RetrievalModels.fetchRelevantDocIds(qid)
        if not judgedDocs:
            # no relevance judgements -> nothing to evaluate against
            continue
        tables[qid] = generatePrecisionRecallTables(docScores, judgedDocs,
                                                    qid)
    return tables
def main(docScorePerQuery):
    """Enrich every query from its top-ranked documents, persist the
    enriched queries, and re-run BM25 with them.

    Returns the doc scores produced by the enriched-query run."""
    # recreate the output folder for the enriched run
    if os.path.exists(TYPE_OF_OUTPUTS[3]):
        shutil.rmtree(TYPE_OF_OUTPUTS[3])
    os.makedirs(TYPE_OF_OUTPUTS[3])
    qMap = RetrievalModels.fetchQueryMap()
    invertedIndex = RetrievalModels.fetchInvertedIndex(
        RetrievalModels.INVERTED_INDEX[0])
    enrichedQueries = {}
    for queryID in docScorePerQuery:
        enriched = performQueryEnrichment(docScorePerQuery[queryID],
                                          qMap[queryID], invertedIndex,
                                          queryID)
        writeNewQueryTofile(ENRICHED_QUERY_FILE_NAME, enriched, queryID)
        enrichedQueries[queryID] = enriched
    # model 1 = BM25 over the no-text-transformation index
    return RetrievalModels.selectRetrievalModel(
        RetrievalModels.INVERTED_INDEX[0],
        RetrievalModels.NUM_OF_TOKEN_PER_DOC[0],
        1, TYPE_OF_OUTPUTS[3], enrichedQueries)
def baselineRunsNoTextTrans():
    """Run all three baselines (BM25, TF-IDF, smoothed query likelihood)
    over the raw (no text transformation) corpus and cache each run's
    scores under 'NoTextTran-<run name>'."""
    # tokenize with no transformation, then index that corpus
    GenerateTokenizedCorpus.selectTypeOfTextTransformation(1)
    Indexer.selectTheCorpusForIndexing(1)
    qMap = RetrievalModels.fetchQueryMap()
    # recreate the output folder from scratch
    if os.path.exists(TYPE_OF_OUTPUTS[0]):
        shutil.rmtree(TYPE_OF_OUTPUTS[0])
    os.makedirs(TYPE_OF_OUTPUTS[0])
    # models 1..3: BM25, TF-IDF, smoothed query likelihood
    for model in range(1, 4):
        scores = RetrievalModels.selectRetrievalModel(
            INVERTED_INDEX[0], NUM_OF_TOKEN_PER_DOC[0], model,
            TYPE_OF_OUTPUTS[0], qMap)
        runName = 'NoTextTran-' + ORDER_OF_EXECUTION[model - 1]
        DOC_SCORES_PER_QUERY_PER_RUN[runName] = scores
def create_query_bigram_index():
    """Add every adjacent word pair occurring in the queries to the global
    QUERY_BIGRAM_INDEX, counting occurrences.

    Bigram keys are space-separated: "word1 word2"."""
    global QUERY_BIGRAM_INDEX
    # include query text in the bigram index
    qMap = RetrievalModels.fetchQueryMap()
    for queryId in qMap:
        words = qMap[queryId].split()
        for i in range(len(words) - 1):
            bigram = words[i] + " " + words[i + 1]
            # dict.get replaces the deprecated dict.has_key idiom
            QUERY_BIGRAM_INDEX[bigram] = QUERY_BIGRAM_INDEX.get(bigram, 0) + 1
def main(docScorePerQuery):
    """Generate snippet files for the top retrieved documents of every
    query in *docScorePerQuery*."""
    # if the snippets folder already exists, delete it and start fresh
    if os.path.exists(SNIPPETS_FOLDER_PATH):
        shutil.rmtree(SNIPPETS_FOLDER_PATH)
    os.mkdir(SNIPPETS_FOLDER_PATH)
    # load the stopword list used by snippet generation
    fetch_stopwords()
    # fetch QueryMap
    queryMap = RetrievalModels.fetchQueryMap()
    print("Storing the top 10 documents with snippets for all queries")
    for queryID in docScorePerQuery:
        generate_snippets(queryID, queryMap[queryID],
                          docScorePerQuery[queryID])
def generate_term_frequency_dict(INVERTED_INDEX):
    """Populate the global TERM_FREQUENCY_DICT.

    First pass: for every indexed term (lower-cased), store its corpus-wide
    frequency, i.e. the sum of its per-document frequencies.
    Second pass: every occurrence of a term in the query set adds 1 to its
    count (terms seen only in queries start at 1).

    INVERTED_INDEX -- dict mapping term -> {docID: frequency in doc}
    """
    global TERM_FREQUENCY_DICT
    # Original code reused `k, v` for both the outer and inner loops,
    # shadowing the index entry; tuple unpacking + sum() avoids that.
    for term, doc_dict in INVERTED_INDEX.items():
        TERM_FREQUENCY_DICT[term.lower()] = sum(doc_dict.values())
    qMap = RetrievalModels.fetchQueryMap()
    for queryId in qMap:
        for term in qMap[queryId].split():
            # NOTE: query terms are not lower-cased here (matches the
            # original behaviour), so case-variant keys may coexist.
            TERM_FREQUENCY_DICT[term] = TERM_FREQUENCY_DICT.get(term, 0) + 1
# NOTE(review): the two writes below are the tail of a definition whose
# "def" line falls outside this chunk (presumably the query writer used by
# main below); they are reproduced verbatim and should be re-attached to
# their enclosing function when the full file is available.
fileIndex.write("------------------------------------------\n\n")
fileIndex.write(newQuery + "\n\n\n")


def main(docScorePerQuery):
    """Enrich each query from its top-ranked documents, write the enriched
    queries to file, and re-run BM25 with them.

    Returns the doc scores of the enriched-query run."""
    # recreate the output folder for the enriched run
    if os.path.exists(TYPE_OF_OUTPUTS[3]):
        shutil.rmtree(TYPE_OF_OUTPUTS[3])
    os.makedirs(TYPE_OF_OUTPUTS[3])
    qMap = RetrievalModels.fetchQueryMap()
    invertedIndex = RetrievalModels.fetchInvertedIndex(
        RetrievalModels.INVERTED_INDEX[0])
    newQMap = {}
    for queryID in docScorePerQuery:
        newQuery = performQueryEnrichment(docScorePerQuery[queryID],
                                          qMap[queryID], invertedIndex,
                                          queryID)
        writeNewQueryTofile(ENRICHED_QUERY_FILE_NAME, newQuery, queryID)
        newQMap[queryID] = newQuery
    return RetrievalModels.selectRetrievalModel(
        RetrievalModels.INVERTED_INDEX[0],
        RetrievalModels.NUM_OF_TOKEN_PER_DOC[0],
        1, TYPE_OF_OUTPUTS[3], newQMap)


if __name__ == '__main__':
    # stand-alone run: BM25 baseline first, then enrichment on its output
    qMap = RetrievalModels.fetchQueryMap()
    docScorePerQuery = RetrievalModels.selectRetrievalModel(
        RetrievalModels.INVERTED_INDEX[0],
        RetrievalModels.NUM_OF_TOKEN_PER_DOC[0],
        1, TYPE_OF_OUTPUTS[0], qMap)
    main(docScorePerQuery)
def generate_snippets(query_id, query, docScore):
    """Build Luhn-style snippets for the top 10 documents retrieved for a
    query and write them out via genarate_snippet_files.

    query_id -- identifier of the query
    query    -- raw query text (lower-cased here before use)
    docScore -- dict mapping doc id -> retrieval score
    """
    DOCUMENT_SNIPPET_DICT = {}
    query = str(query).lower()
    sorted_doc_score = sorted(docScore.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    c = 0
    for doc_id, _score in sorted_doc_score:
        c += 1
        # per-document term frequencies and per-sentence significance scores
        WORD_FREQUENCY_DICT = {}
        SENTENCE_SCORES = {}
        # fetch the text content of the document from the corpus;
        # the context manager closes the handle (the original leaked it)
        current_doc_path = os.path.join(CORPUS_PATH, doc_id + ".html")
        with open(current_doc_path, "r") as f:
            content = f.read()
        text_content = GenerateTokenizedCorpus.parseHTML(content).lower()
        # first pass: frequency of each term and number of sentences
        sentence_count = 0
        for line in text_content.split("\n"):
            if line == "":
                continue
            sentence_count += 1
            for term in line.split():
                term = RetrievalModels.removePunctuation(term)
                WORD_FREQUENCY_DICT[term] = \
                    WORD_FREQUENCY_DICT.get(term, 0) + 1
        # second pass: significance score for each sentence
        for line in text_content.split("\n"):
            if line == "":
                continue
            significant_word_count = 0
            first_index = 0
            last_index = 0
            term_list = line.split()
            for i in range(0, len(term_list)):
                term = RetrievalModels.removePunctuation(term_list[i])
                if check_significant_term(term, sentence_count, str(query),
                                          WORD_FREQUENCY_DICT):
                    significant_word_count += 1
                    # 1-based span of significant words within the sentence
                    if first_index == 0:
                        first_index = i + 1
                    last_index = i + 1
            span_length = (last_index - first_index) + 1
            # BUG FIX: the original wrote float(count**2 / span), which in
            # Python 2 floor-divides two ints *before* the float conversion,
            # truncating every score; convert first so the division keeps
            # its fractional part.
            SENTENCE_SCORES[line] = \
                float(significant_word_count ** 2) / span_length
        sorted_sentence_score = sorted(SENTENCE_SCORES.items(),
                                       key=operator.itemgetter(1),
                                       reverse=True)
        DOCUMENT_SNIPPET_DICT[doc_id] = sorted_sentence_score
        # only the top 10 documents get snippets
        if c == 10:
            break
    # generate retrieved results with snippets for the current query
    genarate_snippet_files(sorted_doc_score, DOCUMENT_SNIPPET_DICT,
                           query_id, query)