Example #1
import os

import Indexer
import Retriever


# get_stemmed_corpus() is defined earlier in the same file; its tail
# appears in Example #5 below.
def task3b(model):
    output_directory = os.path.join(os.getcwd(), "output")
    stemmed_corpus = get_stemmed_corpus()
    f = open('cacm_stem.query.txt', 'r')
    stemmed_queries = f.readlines()
    f.close()

    I = Indexer.InvertedIndexer('')
    I.stemmed_indexer(stemmed_corpus)
    r = Retriever.Retriever('', I, os.getcwd())
    file_name = os.path.join(output_directory,
                             'task3b_' + model + '_stemmed.txt')
    f = open(file_name, 'w')
    query_no = [12, 13, 19, 23, 24, 25, 50]
    q_iter = 0
    for each_query in stemmed_queries:
        r.process_query(each_query)
        docs_and_scores = r.get_scores_for_docs(model, query_no[q_iter])

        # save results into appropriate file
        docs = docs_and_scores[0]
        scores = docs_and_scores[1]
        for i in range(100):
            f.write(str(query_no[q_iter]) \
                    + " Q0 " \
                    + str(docs[i]) + ' ' \
                    + str((i + 1)) + " " \
                    + str(scores[i]) + " " \
                    + model + "\n")
        q_iter += 1
    f.close()
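Each line task3b writes is a TREC-style run entry: query ID, the literal Q0, document ID, rank, score, and a run tag (here the model name). As a minimal sketch, assuming only that six-column layout, a run file could be read back like this (read_run_file is an illustrative name, not part of the project):

def read_run_file(path):
    # query ID -> list of (rank, doc ID, score), in file order
    runs = {}
    for line in open(path):
        qid, _, doc_id, rank, score, _ = line.split()
        runs.setdefault(int(qid), []).append((int(rank), doc_id, float(score)))
    return runs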
Example #2
import os

from bs4 import BeautifulSoup

import Indexer
import Retriever


def task3a(model, raw_corpus_directory):
    project_directory = os.getcwd()
    p = Indexer.Parser()
    corpus_directory = p.build_corpus(raw_corpus_directory, stopped=True)
    output_directory = os.path.join(project_directory, "output")

    I = Indexer.InvertedIndexer(corpus_directory)
    I.ngram_indexer(1)  # builds a unigram inverted index
    # create a Retriever object, which implements the different retrieval models
    r = Retriever.Retriever(corpus_directory, I, project_directory)

    os.chdir(raw_corpus_directory)
    os.chdir(os.pardir)
    f = open('cacm.query.txt', 'r')
    soup = BeautifulSoup(f.read(), 'html.parser')
    f.close()

    f_stop_words = open('common_words.txt', 'r')
    stop_words_list = f_stop_words.readlines()
    stop_words = [i.strip() for i in stop_words_list]
    f_stop_words.close()
    file_name = os.path.join(output_directory, 'task3a_' + model + '.txt')
    f = open(file_name, 'w')  # open file for writing results
    for _ in range(64):
        # extract the query number and the query text
        query_no = soup.find('docno').text.encode('utf-8')
        soup.find('docno').decompose()
        query = soup.find('doc').text.encode('utf-8')
        soup.find('doc').decompose()

        r.process_query(query, stopped=True,
                        stopwords=stop_words)  # parse the query
        # r.clean_content(query)
        docs_and_scores = r.get_scores_for_docs(
            model, int(query_no))  # retrieve relevant documents

        # save results into appropriate file
        docs = docs_and_scores[0]
        scores = docs_and_scores[1]
        for i in range(100):
            f.write(str(int(query_no)) \
                        + " Q0 " \
                        + str(docs[i]) + ' ' \
                        + str((i+1)) + " " \
                        + str(scores[i]) + " " \
                        + model + "\n")
    f.close()
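task3a reads cacm.query.txt destructively: each pass extracts the first remaining <docno> and <doc>, then decompose() removes them so the next find() call sees the following query. A toy illustration of the same idiom (the two-query markup below is made up for the example):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<doc><docno>1</docno> first query</doc>"
                     "<doc><docno>2</docno> second query</doc>", 'html.parser')
for _ in range(2):
    docno = soup.find('docno')
    number = docno.text
    docno.decompose()  # remove the tag so the next find() moves on
    doc = soup.find('doc')
    text = doc.text.strip()
    doc.decompose()
    print number, text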
Example #3
        if each_term in word_count:
            word_count[each_term] += k
        else:
            word_count[each_term] = k
    top_terms = sorted(word_count.items(),
                       key=operator.itemgetter(1),
                       reverse=True)
    expanded_query_list = [i for i, j in top_terms][:len(query_terms) + n]
    expanded_query = " ".join(expanded_query_list)
    return expanded_query


current_directory = os.getcwd()

corpus_directory = os.path.join(current_directory, "processed_corpus")
I = Indexer.InvertedIndexer(corpus_directory)
I.ngram_indexer(1)
r = Retriever.Retriever(corpus_directory, I)

model = 'bm25'
# get the results from the previous runs (bm25 and tfidf)
file_name = 'task1_' + model + '.txt'
results_file_dir = os.path.join(current_directory, "task1")
results_file_dir = os.path.join(results_file_dir, file_name)

f = open(results_file_dir, 'r')
data = f.readlines()
f.close()
list_of_lines = []
for each_line in data:
    list_of_lines.append(each_line.split())  # parsed lines from the task1 output file
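This fragment is the tail of Query_Expander, the term-selection step of pseudo-relevance feedback: tally how often each term occurs in the top-ranked documents, sort the tallies, and keep the original query length plus n extra terms. A self-contained sketch of just that step (the function name, terms, and counts are illustrative):

import operator

def expand(query_terms, word_count, n=2):
    top_terms = sorted(word_count.items(),
                       key=operator.itemgetter(1), reverse=True)
    # keep the len(query_terms) + n most frequent feedback terms
    return " ".join(t for t, _ in top_terms[:len(query_terms) + n])

print expand(['parallel', 'sort'],
             {'parallel': 9, 'sort': 8, 'merge': 6, 'algorithm': 5, 'the': 3})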
Example #4
import glob
import os
import sys

from bs4 import BeautifulSoup

import Indexer
import Retriever


# Query_Expander() is defined earlier in the same file; its tail appears
# in Example #3 above.
def run_task(task, model, raw_corpus_directory):
    project_directory = os.getcwd()
    output_directory = os.path.join(project_directory, "output")

    # Parser (to process the raw corpus (no stopping))
    p = Indexer.Parser()
    corpus_directory = p.build_corpus(raw_corpus_directory)

    # Indexer - Builds the inverted indexes for the processed corpus
    I = Indexer.InvertedIndexer(corpus_directory)
    I.ngram_indexer(1)  # builds a unigram inverted index

    # Retriever - based on the model specified, this object can be
    #             used to get the results.
    r = Retriever.Retriever(corpus_directory, I, project_directory)

    # Get the queries from the given file
    query_dic = {}  # stores the queries; key - query ID, value - query text
    os.chdir(project_directory)
    f = open('cacm.query.txt', 'r')
    soup = BeautifulSoup(f.read(), 'html.parser')
    f.close()
    for _ in range(64):
        # extract the query number and the query text
        query_no = soup.find('docno').text.encode('utf-8')
        soup.find('docno').decompose()
        query = soup.find('doc').text.encode('utf-8')
        soup.find('doc').decompose()
        query_dic[int(query_no)] = query

    # task 1
    if task == 1:
        os.chdir(project_directory)
        if not os.path.exists(output_directory):
            os.mkdir(output_directory, 0755)
        os.chdir(output_directory)

        f = open('task1_' + model + '.txt', 'w')
        for query_no in range(len(query_dic)):
            r.process_query(query_dic[query_no + 1])  # parse the query
            docs_and_scores = r.get_scores_for_docs(
                model, (query_no + 1))  # retrieve relevant documents

            # save results into appropriate file
            docs = docs_and_scores[0]
            scores = docs_and_scores[1]
            for i in range(100):
                f.write(str(query_no + 1) \
                            + " Q0 " \
                            + str(docs[i]) + ' ' \
                            + str((i+1)) + " " \
                            + str(scores[i]) + " " \
                            + model + "\n")
        f.close()

    # task 2
    if task == 2:
        # read output files from task 1
        file_name = 'task1_' + model + '.txt'
        try:
            f = open(os.path.join(output_directory, file_name), 'r')
        except IOError:
            print "Run Task 1 before Task 2"
            sys.exit()
        data = f.readlines()
        f.close()
        list_of_lines = []
        for each_line in data:
            # parsed lines from the task1 output file
            list_of_lines.append(each_line.split())

        # task 1 results for each query; key = query ID (number),
        # value = list of relevant files
        task1_output = {}
        for each_line in list_of_lines:
            task1_output.setdefault(int(each_line[0]), []).append(each_line[2])

        # get stopwords
        f_stop_words = open('common_words.txt', 'r')
        stop_words_list = f_stop_words.readlines()
        stop_words = [i.strip() for i in stop_words_list]
        f_stop_words.close()

        # get corpus
        os.chdir(corpus_directory)
        files_list = glob.glob('*.html')
        corpus = {}
        for each_file in files_list:
            doc_name = each_file[:-5]  # strip the ".html" extension
            text = open(each_file).read()
            corpus[doc_name] = text.split()

        file_name = 'expanded_queries_' + model + '.txt'
        f = open(os.path.join(output_directory, file_name), 'w')
        expanded_query_dic = {}
        for query_no, query in query_dic.viewitems():
            processed_query = r.process_query(query, True)
            expanded_query_dic[query_no] = Query_Expander(
                processed_query,
                task1_output[query_no],
                corpus,
                stopwords=stop_words)
            f.write(str(query_no) + " " + expanded_query_dic[query_no] + "\n")
        f.close()

        file_name = 'task2_' + model + '.txt'
        f = open(os.path.join(output_directory, file_name), 'w')
        for query_no, query in expanded_query_dic.viewitems():
            r.process_query(query)  # parse the query
            # r.clean_content(query)
            docs_and_scores = r.get_scores_for_docs(
                model, query_no)  # retrieve relevant documents

            # save results into appropriate file
            docs = docs_and_scores[0]
            scores = docs_and_scores[1]
            for i in range(100):
                f.write(str(query_no) \
                        + " Q0 " \
                        + str(docs[i]) + ' ' \
                        + str((i + 1)) + " " \
                        + str(scores[i]) + " " \
                        + model + "\n")
        f.close()
        print "Results stored in " + output_directory + " directory"
Example #5
            if stop_if[0] in content[length - i - 1] \
                    or stop_if[1] in content[length - i - 1]:
                index = length - i
                stemmed_corpus[doc_id] = total_stemmed_corpus[doc_id][:index]
                break

    return stemmed_corpus


stemmed_corpus = get_stemmed_corpus()
f = open('cacm_stem.query.txt', 'r')
stemmed_queries = f.readlines()
f.close()

I = Indexer.InvertedIndexer('')
I.stemmed_indexer(stemmed_corpus)
r = Retriever.Retriever('', I, os.getcwd())

f = open('task3b_' + model + '_stemmed.txt', 'w')
query_no = [12, 13, 19, 23, 24, 25, 50]
q_iter = 0
for each_query in stemmed_queries:
    r.process_query(each_query)
    docs_and_scores = r.get_scores_for_docs(model, query_no[q_iter])

    # save results into appropriate file
    docs = docs_and_scores[0]
    scores = docs_and_scores[1]
    for i in range(100):
        f.write(str(query_no[q_iter]) \
                + " Q0 " \
                + str(docs[i]) + ' ' \
                + str((i + 1)) + " " \
                + str(scores[i]) + " " \
                + model + "\n")
    q_iter += 1
f.close()
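The loop at the top of this example scans each document's tokens from the end and truncates just past the last token containing one of the two sentinel substrings in stop_if. A standalone sketch of that backward scan (the function name and data are illustrative):

def trim_after_last_marker(tokens, markers):
    # walk backwards; keep everything up to and including the last
    # token that contains any marker substring
    for i in range(len(tokens)):
        token = tokens[len(tokens) - i - 1]
        if any(m in token for m in markers):
            return tokens[:len(tokens) - i]
    return tokens

print trim_after_last_marker(['intro', 'pm', 'body', 'ca561210', 'refs'],
                             ['pm', 'ca'])  # keeps up to 'ca561210'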