def find_10_most_relevant(query, dictionary, postings, num_of_doc):
    '''
    Compute cosine similarity between the query and each document,
    i.e., lnc.ltc tf-idf: documents use log tf with cosine normalization,
    the query uses log tf × idf with cosine normalization.
    Compute the score for each document containing one of the terms in
    the query. Return (at most) the 10 most relevant document IDs,
    sorted by score.
    @param query - The query string: str
    @param dictionary - The dictionary containing the doc frequency of a token: DefaultDict[str, Entry]
    @param postings - The postings dictionary containing a mapping of doc ID to the weight for a given token: Posting
    @param num_of_doc - The number of documents indexed
    '''
    '''
    Get tokens (stemmed words in the query), terms (set of tokens), and
    the dictionary of term frequencies in the query: DefaultDict[str, int]
    '''
    tokens, terms, term_freq = get_term_freq(query)

    if phrasal_query:
        doc_candidate = intersection(terms, postings)
        doc_to_rank = verify(doc_candidate, tokens, postings)

    # Compute the query weights following the tf × idf calculation,
    # then apply cosine normalization.
    query_weight = normalize([
        get_tf(freq) * get_idf(num_of_doc, dictionary[term].frequency)
        for (term, freq) in term_freq.items()
    ])

    # Compute the score for each document containing one of the terms
    # in the query.
    score = Counter()
    for ((term, _), q_weight) in zip(term_freq.items(), query_weight):
        if q_weight > 0:
            ''' get the postings list of the term, update the score '''
            for doc_id, value in postings[term].items():
                if phrasal_query and (doc_id not in doc_to_rank):
                    continue
                score[doc_id] += q_weight * value.weight

    ''' rank and get the result '''
    return [doc_id for (doc_id, _) in score.most_common(TOP_K)]
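# The helpers get_tf, get_idf, and normalize are used above but not defined in
# this listing. The sketch below is one plausible set of definitions consistent
# with the lnc/ltc weighting described in the docstring; it is an assumption,
# not the original implementation.
from math import log10, sqrt

def get_tf(freq):
    # Logarithmic term frequency: 1 + log10(tf) for tf > 0 (assumed definition).
    return 1 + log10(freq) if freq > 0 else 0

def get_idf(num_of_doc, doc_freq):
    # Inverse document frequency: log10(N / df), 0 for an unseen term (assumed definition).
    return log10(num_of_doc / doc_freq) if doc_freq > 0 else 0

def normalize(weights):
    # Cosine (L2) normalization of a weight vector (assumed definition).
    norm = sqrt(sum(w * w for w in weights))
    return [w / norm for w in weights] if norm > 0 else weights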
def build_index(in_dir, out_dict, out_postings):
    """
    build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('indexing...')

    # read the files
    corpus = PlaintextCorpusReader(in_dir, '.*')
    file_names_str = corpus.fileids()
    file_names = sorted(map(int, file_names_str))

    # Load corpus and generate the postings dictionary
    postings = defaultdict(dict)
    tokens = list()

    for docID in file_names:
        content = corpus.raw(str(docID))  # read file content
        content = preprocess(content)
        words = tokenize(content)  # tokenization: content -> words
        tokens = stemming(words)  # stemming

        if phrasal_query:
            token_len = defaultdict(list)
        else:
            token_len = defaultdict(int)

        # count how many times each token appears in the file
        # (and, for phrasal queries, record its positions)
        term_pos = 0
        for token in tokens:
            if phrasal_query:
                if token in token_len.keys():
                    token_len[token][0] += 1
                    token_len[token][1].append(term_pos)
                else:
                    token_len[token] = [1, [term_pos]]
            else:
                token_len[token] += 1
            term_pos += 1

        '''
        Generate weighted token frequencies.
        Generate a dictionary of key -> token, value -> a dict with k, v as
        file_name, weighted_token_frequency
        '''
        if phrasal_query:
            weighted_tokenfreq = normalize(
                [get_tf(y[0]) for (x, y) in token_len.items()])
            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][docID] = PhrasalToken(freq[0], freq[1], w_tf)
        else:
            weighted_tokenfreq = normalize(
                [get_tf(y) for (x, y) in token_len.items()])
            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][docID] = Token(w_tf)

    '''
    Output dictionary and postings files
    - Dictionary file stores all the tokens, with their doc frequency,
      the offset in the postings file, and the size (in bytes).
    - Postings file stores, for each token, a dict mapping document ID
      to its weighted term frequency.
    '''
    # write postings file
    dictionary = defaultdict(Entry)
    with open(out_postings, mode="wb") as postings_file:
        for key, value in postings.items():
            '''
            len(value) := the document frequency of the token
                       := the number of documents containing the token
            offset := current writing position in the postings file
            size := the number of bytes written for this token
            '''
            offset = postings_file.tell()
            size = postings_file.write(pickle.dumps(value))
            dictionary[key] = Entry(len(value), offset, size)

    # write dictionary file
    with open(out_dict, mode="wb") as dictionary_file:
        pickle.dump(url_map, dictionary_file)
        pickle.dump(doc_id_map, dictionary_file)
        pickle.dump(pr_result, dictionary_file)
        pickle.dump(dictionary, dictionary_file)
    print("dictionary done")
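# Entry, Token, and PhrasalToken are constructed above but their definitions
# are not part of this listing. A minimal sketch, assuming plain namedtuples
# whose fields are inferred from how build_index constructs them (other
# variants in this repo add a raw-frequency field to Token and drop the size
# field from Entry):
from collections import namedtuple

Entry = namedtuple('Entry', ['frequency', 'offset', 'size'])  # doc freq, byte offset, byte size in postings file
Token = namedtuple('Token', ['weight'])                       # normalized log-tf weight
PhrasalToken = namedtuple('PhrasalToken', ['frequency', 'positions', 'weight'])  # tf, position list, weight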
def execute_search(query, dictionary, postings, num_of_doc):
    '''
    Compute cosine similarity between the query and each document,
    i.e., lnc.ltc tf-idf: documents use log tf with cosine normalization,
    the query uses log tf × idf with cosine normalization.
    Compute the score for each document containing one of the terms in
    the query. Return a Counter mapping document ID to score.
    @param query - The query string: str
    @param dictionary - The dictionary containing the doc frequency of a token: DefaultDict[str, Entry]
    @param postings - The postings dictionary containing a mapping of doc ID to the weight for a given token: Posting
    @param num_of_doc - The number of documents indexed
    '''
    '''
    Get tokens (stemmed words in the query), terms (set of tokens), and
    the dictionary of term frequencies in the query: DefaultDict[str, int]
    '''
    if not boolean_query:
        if lesk_on:
            query = lesk(query)
        if expand:
            query = expand_query(query)

    tokens, terms, term_freq = get_term_freq(query)

    if phrasal_query:
        doc_candidate = intersection(terms, postings)
        doc_to_rank = verify(doc_candidate, tokens, postings)

    # Compute the query weights following the tf × idf calculation,
    # then apply cosine normalization.
    query_weight = normalize([
        get_tf(freq) * get_idf(num_of_doc, dictionary[term].frequency)
        for (term, freq) in term_freq.items()
    ])

    # Compute the score for each document containing one of the terms
    # in the query.
    score = Counter()
    query_vector = {}
    for ((term, _), q_weight) in zip(term_freq.items(), query_weight):
        query_vector[term] = q_weight
        if q_weight > 0:
            ''' get the postings list of the term, update the score '''
            for doc_id, value in postings[term].items():
                if phrasal_query and (doc_id not in doc_to_rank):
                    continue
                score[doc_id] += q_weight * value.weight

    if not boolean_query and prf_on:
        ''' rank and take the top documents as pseudo-relevant '''
        most_rel_docs = [
            doc_id for (doc_id, _) in score.most_common(K_MOST_RELEVANT)
        ]
        new_query = pseudo_rel_feedback(postings, dictionary, most_rel_docs,
                                        query_vector)

        ''' normalize the new query '''
        norm = sqrt(sum([i * i for i in new_query.values()], 0))
        for term in new_query:
            new_query[term] = new_query[term] / norm

        ''' re-score the documents with the expanded query '''
        score = Counter()
        for term in new_query:
            try:
                items = postings[term].items()
            except Exception:
                continue
            for doc_id, value in items:
                if phrasal_query and (doc_id not in doc_to_rank):
                    continue
                score[doc_id] += new_query[term] * value.weight

    return score
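# pseudo_rel_feedback is called above but not shown in this listing. The sketch
# below is a Rocchio-style expansion under assumed coefficients ALPHA and BETA:
# the new query is the original query vector plus the centroid of the term
# weights of the top-ranked (pseudo-relevant) documents. It is an illustration
# of one possible implementation, not the original one.
from collections import Counter

ALPHA, BETA = 1.0, 0.75  # assumed Rocchio coefficients

def pseudo_rel_feedback(postings, dictionary, most_rel_docs, query_vector):
    rel_docs = set(most_rel_docs)
    centroid = Counter()
    # Average, over the pseudo-relevant documents, the weight of every term.
    for term in dictionary:
        for doc_id, value in postings[term].items():
            if doc_id in rel_docs:
                centroid[term] += value.weight / len(rel_docs)
    # Rocchio update: ALPHA * original query + BETA * relevant-document centroid.
    new_query = Counter()
    for term, weight in query_vector.items():
        new_query[term] += ALPHA * weight
    for term, weight in centroid.items():
        new_query[term] += BETA * weight
    return new_query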
def build_index(in_dir, out_dict, out_postings):
    """
    build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('indexing...')

    ''' Create a list of the files inside the directory '''
    corpus = PlaintextCorpusReader(in_dir, r'.*\.txt')
    file_id_strs = corpus.fileids()
    # file_ids = sorted(convert2int(file_id_strs))

    ''' Load corpus and generate the postings dictionary '''
    postings = defaultdict(dict)
    tokens = list()
    docsInfo = defaultdict(dict)

    for fn_str in file_id_strs:
        content_raw = corpus.raw(fn_str)  # read file content
        title, anchor_text, content = splitContent(content_raw)
        words = tokenize(uk2us(content))  # tokenization: content -> words
        tokens = stemming(words, stopword=False)  # stemming
        fn = convert2int(fn_str)
        docsInfo[fn] = [title, anchor_text]
        print("processing: " + fn_str)

        if phrasal_query:
            token_len = defaultdict(list)
        else:
            token_len = defaultdict(int)

        # count how many times each token appears in the file
        # (and, for phrasal queries, record its positions)
        term_pos = 0
        for token in tokens:
            if phrasal_query:
                if token in token_len.keys():
                    token_len[token][0] += 1
                    token_len[token][1].append(term_pos)
                else:
                    token_len[token] = [1, [term_pos]]
            else:
                token_len[token] += 1
            term_pos += 1

        '''
        Generate weighted token frequencies.
        Generate a dictionary of key -> token, value -> a dict with k, v as
        file_name, (frequency, weighted_token_frequency)
        '''
        if phrasal_query:
            weighted_tokenfreq = normalize(
                [get_tf(y[0]) for (x, y) in token_len.items()])
            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][fn] = PhrasalToken(freq[0], freq[1], w_tf)
        else:
            weighted_tokenfreq = normalize(
                [get_tf(y) for (x, y) in token_len.items()])
            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][fn] = Token(freq, w_tf)

    '''
    Output dictionary and postings files
    - Dictionary file stores all the tokens, with their doc frequency,
      the offset in the postings file, and the size (in bytes).
    - Postings file stores, for each token, a dict mapping document ID
      to its (frequency, weighted term frequency).
    '''
    # write postings file
    dictionary = defaultdict(Entry)
    with open(out_postings, mode="wb") as postings_file:
        for key, value in postings.items():
            '''
            len(value) := the document frequency of the token
                       := the number of documents containing the token
            offset := current writing position in the postings file
            size := the number of bytes written to the postings file
                    for this token
            '''
            offset = postings_file.tell()
            size = postings_file.write(pickle.dumps(value))
            dictionary[key] = Entry(len(value), offset, size)
    print("postings done.")

    # write dictionary file
    with open(out_dict, mode="wb") as dictionary_file:
        pickle.dump(len(file_id_strs), dictionary_file)
        print("docs length: " + str(len(file_id_strs)))
        pickle.dump(docsInfo, dictionary_file)
        print("docsInfo done.")
        pickle.dump(dictionary, dictionary_file)
        print("dictionary done.")
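# Sketch of how the files written above could be read back at search time,
# assuming the same pickle layout: the dictionary file holds the document
# count, docsInfo, and the term dictionary in that order, and each posting
# list is recovered by seeking to its Entry.offset and reading Entry.size
# bytes from the postings file. The function names are illustrative.
import pickle

def load_dictionary(out_dict):
    with open(out_dict, mode="rb") as dictionary_file:
        num_of_doc = pickle.load(dictionary_file)
        docsInfo = pickle.load(dictionary_file)
        dictionary = pickle.load(dictionary_file)
    return num_of_doc, docsInfo, dictionary

def load_posting_list(postings_file, entry):
    # postings_file must be opened in binary mode ("rb").
    postings_file.seek(entry.offset)
    return pickle.loads(postings_file.read(entry.size))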
def build_index(in_dir, out_dict, out_postings):
    """
    build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('indexing...')

    ''' read the csv file into a nested list '''
    maxInt = sys.maxsize
    while True:
        try:
            csv.field_size_limit(maxInt)
            break
        except OverflowError:
            maxInt = int(maxInt / 10)

    with open(in_dir, 'r', encoding='UTF-8') as csvfile:
        reader = csv.reader(csvfile)
        rows = [row for row in reader]
    rows.pop(0)  # drop the header row

    ''' Load corpus and generate the postings dictionary '''
    postings = defaultdict(dict)
    tokens = list()
    docsInfo = defaultdict(dict)
    # docs_to_terms = defaultdict(dict)
    print(str(len(rows)) + " rows in total.")

    rowID = 1
    consecutive_ids = defaultdict(dict)
    doc_num = 0
    for docID, _, content, date, court in rows:
        # map the consecutive internal ID back to the original document ID
        consecutive_ids[doc_num] = docID
        docID = doc_num
        doc_num += 1
        print("processing row: " + str(rowID))
        rowID += 1
        docsInfo[docID] = [date, court]
        words = tokenize(uk2us(content))  # tokenization: content -> words
        tokens = stemming(words, stopword=True, lemma=True)  # stemming
        # docs_to_terms[docID] = tokens

        if phrasal_query:
            token_len = defaultdict(list)
        else:
            token_len = defaultdict(int)

        # count how many times each token appears in the document
        # (and, for phrasal queries, record its positions)
        term_pos = 0
        for token in tokens:
            if phrasal_query:
                if token in token_len.keys():
                    token_len[token][0] += 1
                    token_len[token][1].append(term_pos)
                else:
                    token_len[token] = [1, [term_pos]]
            else:
                token_len[token] += 1
            term_pos += 1

        '''
        Generate weighted token frequencies.
        Generate a dictionary of key -> token, value -> a dict with k, v as
        file_name, weighted_token_frequency
        '''
        if phrasal_query:
            weighted_tokenfreq = normalize(
                [get_tf(y[0]) for (x, y) in token_len.items()])
            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][docID] = PhrasalToken(freq[1], w_tf)
        else:
            weighted_tokenfreq = normalize(
                [get_tf(y) for (x, y) in token_len.items()])
            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][docID] = Token(w_tf)

    '''
    Output dictionary and postings files
    - Dictionary file stores all the tokens, with their doc frequency and
      the offset in the postings file.
    - Postings file stores, for each token, a dict mapping document ID
      to its weighted term frequency.
    '''
    # write postings file
    dictionary = defaultdict(Entry)
    with open(out_postings, mode="wb") as postings_file:
        for key, value in postings.items():
            '''
            len(value) := the document frequency of the token
                       := the number of documents containing the token
            offset := current writing position in the postings file
            '''
            offset = postings_file.tell()
            pickle.dump(value, postings_file)
            dictionary[key] = Entry(len(value), offset)

    # write dictionary file
    with open(out_dict, mode="wb") as dictionary_file:
        pickle.dump(len(rows), dictionary_file)
        print("length done.")
        pickle.dump(consecutive_ids, dictionary_file)
        pickle.dump(docsInfo, dictionary_file)
        print("docsInfo done.")
        # pickle.dump(docs_to_terms, dictionary_file)
        # print("docs_to_terms done")
        pickle.dump(dictionary, dictionary_file)
        print("dictionary done")
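# Sketch of the matching read path for this CSV variant, assuming the same
# pickle layout: the dictionary file holds the row count, the consecutive-ID
# mapping, docsInfo, and the term dictionary in that order; a posting list is
# recovered by seeking to its Entry.offset and unpickling a single object
# (no size is needed, because pickle.load stops at the end of one pickled
# object). The function names are illustrative.
import pickle

def load_dictionary(out_dict):
    with open(out_dict, mode="rb") as dictionary_file:
        num_of_doc = pickle.load(dictionary_file)
        consecutive_ids = pickle.load(dictionary_file)
        docsInfo = pickle.load(dictionary_file)
        dictionary = pickle.load(dictionary_file)
    return num_of_doc, consecutive_ids, docsInfo, dictionary

def load_posting_list(postings_file, entry):
    # postings_file must be opened in binary mode ("rb").
    postings_file.seek(entry.offset)
    return pickle.load(postings_file)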