def vectorQuery(self, k):
    """Vector query processing, using cosine similarity.

    Returns the top k pairs of (docID, similarity), ranked by their cosine
    similarity with the query in descending order. Term frequency or TF-IDF
    can be used to construct the vectors; TF-IDF is used here.
    """
    result = {}
    ivObj = InvertedIndex()
    ivObj.load(self.filename)  # load the inverted index

    # For every term in the query, collect the document IDs where the term
    # appears, and build the query's TF-IDF weight vector. One weight is
    # appended per query token so the query and document vectors stay the
    # same length.
    doc_set = set()
    term_idf_list = []
    for term in self.tokens:
        if term in self.index:
            doc_set = doc_set.union(set(self.index[term].posting.keys()))
        term_idf_list.append(ivObj.idf(term) * 1.0 / len(self.tokens))

    # Calculate TF-IDF weights for the matching documents; terms missing
    # from the index contribute a 0.0 component.
    doc_list = list(doc_set)
    for docID in doc_list:
        for term in self.tokens:
            if term in self.index:
                if docID in result:
                    result[docID].append(ivObj.tfidf(term, docID))
                else:
                    result[docID] = [ivObj.tfidf(term, docID)]
            else:
                if docID in result:
                    result[docID].append(0.0)
                else:
                    result[docID] = [0.0]

    # Normalize the query vector once, then score each document by the dot
    # product of unit vectors, which is exactly the cosine similarity.
    score_dict = {}
    term_idf_list_np = np.array(self.unitVector(term_idf_list))
    for docID in doc_list:
        unit_np = np.array(self.unitVector(result[docID]))
        score_dict[docID] = np.dot(term_idf_list_np, unit_np)

    final = sorted(score_dict.items(), key=itemgetter(1), reverse=True)
    # Slice rather than loop so k larger than the result set cannot raise
    # an IndexError.
    return final[:min(k, len(final))]  # list of (docID, cosine similarity), ranked
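# The scoring above reduces to a dot product of L2-normalized vectors. Below is
# a minimal, self-contained sketch of that identity (illustrative values only;
# unit_vector here is a local stand-in for the class's unitVector helper):
def _demo_unit_vector_cosine():
    import numpy as np

    def unit_vector(v):
        n = np.linalg.norm(v)
        return np.zeros(len(v)) if n == 0 else np.array(v) / n

    q = [0.30, 0.00, 0.48]  # query tf-idf weights (made up for the demo)
    d = [0.60, 0.30, 0.00]  # document tf-idf weights (made up for the demo)
    # dot(q_hat, d_hat) == cos(q, d) == dot(q, d) / (||q|| * ||d||)
    cos = np.dot(unit_vector(q), unit_vector(d))
    assert abs(cos - np.dot(q, d) / (np.linalg.norm(q) * np.linalg.norm(d))) < 1e-12
    return cos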
def main():
    #########
    # SETUP #
    #########
    # Get input args
    newsgroups_root_dir = argv[1]
    feat_def_path = argv[2]
    class_def_path = argv[3]
    training_data_path = argv[4]

    # Generate index
    #index_newsgroups(newsgroups_root_dir, "idx_save.pkl")
    ii = InvertedIndex()
    ii.load("idx_save.pkl")

    # Write out feature/term pairs to feat_def_path
    feature_id = 0
    with open(feat_def_path, 'w') as outf:
        for item in ii.items:
            outf.write(str(feature_id) + " " + str(item) + "\n")
            feature_id += 1

    # Read back in the feature/term pairs for later
    with open(feat_def_path, 'r') as inf:
        ft_pairs = inf.readlines()

    # Put the ft_pairs into a dictionary for quick lookup
    ft_dict = {}
    for pair in ft_pairs:
        ft_dict[pair.split()[1].strip()] = pair.split()[0]

    # Map the different newsgroups to a given class.
    # This is fairly manual...
    with open(class_def_path, 'w') as outf:
        for dir in listdir(newsgroups_root_dir):
            outf.write(class_def_helper(dir) + " " + dir + "\n")

    ############################
    # TRAINING DATA GENERATION #
    ############################
    # Create the training data. For each document:
    #   find its containing folder and extract the class from the class def;
    #   for each term in the document, compute tf-idf, tf, or idf.
    current_file_id = 1
    with open(training_data_path + ".TFIDF", 'w') as outf:
        # Compute tf-idf: go through each document in the newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # Find the class label from the containing folder
            local_dir = root.split(sep)[-1]
            # For each file...
            for file in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file)

                # Put all the feature:value strings into a set (for uniqueness)
                data_set = set()
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "":
                        continue

                    # Calculate the TF-IDF; current_file_id is our doc_id
                    tf = ii.find(word).posting[current_file_id].term_freq()
                    idf = ii.idf(word)
                    data_set.add(ft_dict[word] + ":" + str(log10(1 + tf) * idf))

                # Write the features sorted by term ID, one document per line
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) + "\n")
                outf.flush()

                # Increment our current doc
                current_file_id += 1

    current_file_id = 1
    with open(training_data_path + ".TF", 'w') as outf:
        # Compute tf: go through each document in the newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # Find the class label from the containing folder
            local_dir = root.split(sep)[-1]
            # For each file...
            for file in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file)

                # Put all the feature:value strings into a set (for uniqueness)
                data_set = set()
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "":
                        continue

                    # Record the TF; current_file_id is our doc_id
                    data_set.add(ft_dict[word] + ":" + str(
                        ii.find(word).posting[current_file_id].term_freq()))

                # Write the features sorted by term ID, one document per line
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) + "\n")

                # Increment our current doc
                current_file_id += 1

    current_file_id = 1
    with open(training_data_path + ".IDF", 'w') as outf:
        # Compute idf: go through each document in the newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # Find the class label from the containing folder
            local_dir = root.split(sep)[-1]
            # For each file...
            for file in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file)

                # Put all the feature:value strings into a set (for uniqueness)
                data_set = set()
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "":
                        continue

                    # Record the IDF for this term
                    data_set.add(ft_dict[word] + ":" + str(ii.idf(word)))

                # Write the features sorted by term ID, one document per line
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) + "\n")
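# Each line written above follows a libsvm-style "label feat_id:value ..."
# layout, sorted by feature ID. A minimal sketch of that formatting step in
# isolation (label, IDs, and weights are hypothetical, not from a real index):
def _demo_training_line():
    label = "3"
    features = {"17": 0.301, "4": 0.125, "52": 0.977}  # term ID -> weight
    body = " ".join(
        f"{fid}:{w}" for fid, w in sorted(features.items(), key=lambda p: int(p[0])))
    return label + " " + body  # "3 4:0.125 17:0.301 52:0.977"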
class QueryProcessor:
    ##
    # @param self
    # @param query
    # @param index_file
    # @param collection
    # @return None
    # @brief The constructor.
    #        This is extremely expensive because it loads the entire pickled
    #        index into memory. That is fine when executing a single query,
    #        but for the evaluation use loadQuery instead.
    # @exception None documented yet
    ##
    def __init__(self, query, index_file, collection):
        ''' index is the inverted index; collection is the document collection'''
        self.raw_query = query
        self.index = InvertedIndex()
        self.index = self.index.loadData(index_file)
        self.docs = collection
        self.tokenizer = Tokenizer(
            known_words=set(self.index.get_items_inverted().keys()))
        if self.raw_query:
            self.processed_query = self.preprocessing(self.raw_query)

    ##
    # @brief This method loads the next query for evaluation.
    # @param self
    # @param query
    # @return None
    # @exception None
    ##
    def loadQuery(self, query):
        self.raw_query = query
        self.processed_query = self.preprocessing(self.raw_query)

    ##
    # @brief This method applies the indexing preprocessing steps to a raw query.
    # @param self
    # @param raw_query
    # @return list of processed query tokens
    # @exception None
    ##
    def preprocessing(self, raw_query):
        ''' apply the same preprocessing steps used by indexing, and also use
        the provided spelling corrector. Note that the spelling corrector
        should be applied before stopword removal and stemming (why?)'''
        return self.tokenizer.transpose_document_tokenized_stemmed_spelling(
            raw_query)

    ##
    # @brief This method does the boolean query processing.
    # @param self
    # @return results: list[docID]
    # @bug Fixed
    # @exception None
    ##
    def booleanQuery(self):
        ''' boolean query processing; note that a query like "A B C" is
        transformed to "A AND B AND C" for retrieving posting lists and
        merging them '''
        ''' This method would likely be faster due to the use of hashes,
        but I wanted to do what was shown in the slides:

        from functools import reduce
        docs = [set(self.index[w]) for w in self.processed_query]
        docs.sort(key=len)  # notice it is still smart to order by size
        return reduce(set.intersection, docs)
        '''
        if len(self.processed_query) == 0:
            return []

        # Check that all of our query words are in the index; if not, return []
        for w in self.processed_query:
            if not w in self.index.get_items_inverted():
                return []

        # If the query has only one term, return its posting list directly
        if len(self.processed_query) == 1:
            return list(self.index.get_items_inverted()[
                self.processed_query[0]].get_posting_list().keys())

        # document_ids is a list of lists containing only document IDs
        document_ids = [
            list(self.index.get_items_inverted()[w].get_posting_list().keys())
            for w in self.processed_query
        ]

        # Sorting so that we start with the shortest list of documents gives
        # a potential speed-up
        document_ids.sort(key=len)
        results = document_ids[0]

        # Iterate through each query word and intersect the doc IDs from its
        # posting list with all those before it (two-pointer merge). This
        # could be done faster if the index were a set or another hash-based
        # data structure.
        for p in document_ids[1:]:
            intermediate = []
            i, j = 0, 0
            while i < len(results) and j < len(p):
                if int(results[i]) < int(p[j]):
                    i += 1
                elif int(results[i]) > int(p[j]):
                    j += 1
                else:
                    intermediate.append(p[j])
                    j += 1
                    i += 1
            results = intermediate
            # Stop early if the terms so far are totally disjoint
            if len(results) == 0:
                return results

        return results
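    # A hypothetical, self-contained sketch of the two-pointer merge used by
    # booleanQuery above, run on toy posting lists (this helper is not part of
    # the original API; names and values are illustrative):
    @staticmethod
    def _intersect_example():
        a, b = ["1", "3", "5", "9"], ["3", "4", "9"]  # sorted doc-ID lists
        i, j, out = 0, 0, []
        while i < len(a) and j < len(b):
            if int(a[i]) < int(b[j]):
                i += 1  # advance the list with the smaller head
            elif int(a[i]) > int(b[j]):
                j += 1
            else:
                out.append(b[j])  # match: keep the doc ID, advance both
                i += 1
                j += 1
        return out  # ["3", "9"]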
    ##
    # @brief This method computes the cosine similarity of two vectors.
    # @param self
    # @param vec1
    # @param vec2
    # @return score cosine: float
    # @exception None
    ##
    def cosine_similarity(self, vec1, vec2):
        # compute cosine similarity: (vec1*vec2)/(||vec1||*||vec2||)
        AA, AB, BB = 0, 0, 0
        for i in range(len(vec1)):
            x = vec1[i]
            y = vec2[i]
            AA += x * x
            BB += y * y
            AB += x * y
        return round(AB / math.sqrt(AA * BB), 4)

    ##
    # @brief This method computes the vector model.
    # @param self
    # @param k
    # @return list of (docID, score) pairs
    # @bug Fixed
    # @exception ValueError
    ##
    def vectorQuery(self, k):
        ''' vector query processing, using the cosine similarity. Returns the
        top k pairs of (docID, similarity), ranked by their cosine similarity
        with the query in descending order. Term frequency or TF-IDF can be
        used to construct the vectors; TF-IDF is used here. '''
        if len(self.processed_query) == 0:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            return [(str(id), 0)
                    for id in sorted(list(map(int, all_docids)))[:k]]

        query_words = list(set(self.processed_query))
        idfs = [self.index.idf(w) for w in query_words]

        # Behavior is undefined by the instructions when k is larger than the
        # corpus, so fail loudly
        try:
            if k > self.index.get_total_number_Doc():
                raise ValueError('k is greater than number of documents')
        except ValueError as err:
            print(err.args)
            return

        # Below we define the behavior when none of the query words appear in
        # any document. This case was not defined in the instructions, so
        # returning documents with zero similarity seems most appropriate: if
        # Google scored everything at 0 cosine, it would return 0 documents
        # even if you asked for the 50 most relevant.
        if set(idfs) == {0}:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            return [(str(id), 0)
                    for id in sorted(list(map(int, all_docids)))[:k]]

        # Remove any words with 0 idf: they do not appear in the corpus, so
        # dropping them saves memory. Turning the result into lists is
        # probably unnecessary; leaving them as tuples may be more appropriate.
        idfs, query_words = map(
            list,
            zip(*[i for i in list(zip(idfs, query_words)) if not i[0] == 0]))

        # Calculate tfs of the remaining words (log normalization)
        query_term_counter = Counter(self.processed_query)
        query_tf_vector = [
            round(math.log10(query_term_counter[w] + 1), 4)
            for w in query_words
        ]
        # Other way of doing tf:
        # query_tf_vector = [round(1 + math.log10(query_term_counter[w]), 4)
        #                    if query_term_counter[w] > 0 else 0
        #                    for w in query_words]

        # NCC change: if a term in a query does not appear in our inverted
        # index, forget/discount the term. postings is a list of posting
        # lists, one per remaining query word.
        postings = [
            self.index.get_items_inverted()[w].get_posting_list()
            for w in query_words if w in self.index.get_items_inverted()
        ]
        document_ids = set().union(*postings)
        document_tfs = {d: [0] * len(query_words) for d in document_ids}
        for inx, term in enumerate(postings):
            for document_id, posting in term.items():
                # log normalization
                document_tfs[document_id][inx] = math.log10(
                    posting.term_freq() + 1)
                # Other:
                # tf = posting.term_freq()
                # if tf > 0:
                #     tf = 1 + math.log10(tf)
                # else:
                #     tf = 0
                # document_tfs[document_id][inx] = tf

        query_tfidf = np.multiply(query_tf_vector, idfs)
        cosines = Counter({
            d: self.cosine_similarity(query_tfidf, np.multiply(d_tf, idfs))
            for d, d_tf in document_tfs.items()
        })

        # This has to be a list, as dicts are not sorted...
        # We need a consistent ordering when multiple documents share the same
        # score, so we sort first on score and then on docID; this is very
        # slow. If we knew k or the number of documents, we could use numpy to
        # preallocate memory and copy instead of appending.
        temp_k = k
        scores = sorted(list(set(cosines.values())), reverse=True)
        ret = []
        for s in scores:
            docs_with_score_s = sorted(
                [int(d) for d, v in cosines.items() if v == s])
            if len(docs_with_score_s) >= temp_k:
                docs_with_score_s = docs_with_score_s[:temp_k]
                ret.extend([(str(d), s) for d in docs_with_score_s])
                temp_k = 0
                break
            else:
                temp_k = temp_k - len(docs_with_score_s)
                ret.extend([(str(d), s) for d in docs_with_score_s])

        # If fewer than k documents matched, pad the result with unmatched
        # documents at similarity 0
        if not temp_k == 0:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            ret.extend([(str(j), 0) for j in sorted(
                list(map(int, all_docids.difference({i[0] for i in ret}))))
                [:temp_k]])

        return ret
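# A hypothetical usage sketch of the class above (the index path and the
# collection object are assumptions for illustration, not fixed names):
#
#   qp = QueryProcessor("boundary layer transition", "index.pkl", collection)
#   print(qp.booleanQuery())       # docIDs containing every query term
#   print(qp.vectorQuery(5))       # top-5 (docID, cosine) pairs, ties broken by docID
#   qp.loadQuery("heat transfer")  # reuse the already-loaded index for the next query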
def test(index_loc, cran_loc, qrels_loc):
    ''' test your code thoroughly. put the testing cases here'''

    ##### SETUP ITEMS #####
    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_loc)

    # Get the document collection
    cf = CranFile(cran_loc)

    # Get ground-truth results from qrels.txt
    with open(qrels_loc) as f:
        qrels = f.readlines()

    # Index qrels into a dict
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        if int(qrel_split[0]) in qrel_dict:
            qrel_dict[int(qrel_split[0])].append(int(qrel_split[1]))
        else:
            qrel_dict[int(qrel_split[0])] = [int(qrel_split[1])]

    ##### INITIAL TEST ITEMS #####
    print("TESTS BASED ON SUGGESTED TESTING POINTS")

    # Ensure tf is correct:
    # pick a word and check the TF value against what is computed manually
    posting_list = ii.find("experiment").posting
    tf_vector = []
    for posting in posting_list:
        tf_vector.append(len(posting_list[posting].positions) \
            == posting_list[posting].term_freq())
    print("TF is computed correctly:", all(tf_vector))

    # Ensure idf is correct
    print("IDF is computed correctly:", log10(ii.nDocs / len(posting_list)) \
        == ii.idf("experiment"))

    # As both tf and idf are correct, and tf-idf is a product of the two,
    # it is reasonable to assume tf-idf is computed correctly

    ##### BOOL QUERY TESTS #####
    # Here, I use very specific boolean queries to ensure that a
    # limited number of documents are returned
    print("\nBOOL QUERY TESTS")

    # Ensure that the exact title of doc 8 matches for doc 8
    doc8 = ("measurements of the effect of two-dimensional and "
            "three-dimensional roughness elements on boundary layer transition")
    qp1 = QueryProcessor(doc8, ii, cf)
    print("Bool query matches on exact title:", qp1.booleanQuery() == [8])

    # Ensure that bool query matches a very specific AND query
    qp2 = QueryProcessor("hugoniot and infinitesimally", ii, cf)
    print(
        "Bool query matches on specific AND query ('hugoniot and infinitesimally'):",
        qp2.booleanQuery() == [329])

    # Test that an OR query is handled properly.
    # gravel and stagnation have completely distinct posting lists;
    # OR should merge them.
    gravel_postings = ii.find("gravel").sorted_postings[:]
    stag_postings = ii.find("stagnat").sorted_postings[:]
    gravel_postings.extend(stag_postings)
    qp3 = QueryProcessor("gravel or stagnation", ii, cf)
    print("Bool query successfully handles OR ('gravel or stagnation'):",
          qp3.booleanQuery() == sorted(gravel_postings))

    # Test that NOT is handled properly.
    # The posting list for "diameter" is a subset of "slipstream"'s postings
    # (oddly enough). To test this works, do "slipstream and not diameter",
    # and we should get slipstream's postings minus those of diameter.
    slip_postings = ii.find("slipstream").sorted_postings[:]
    diam_postings = ii.find("diamet").sorted_postings[:]
    slip_not_diam = [t for t in slip_postings if t not in diam_postings]
    print("Bool query successfully handles NOT ('slipstream and not diameter'):",
          QueryProcessor("slipstream and not diameter", ii, cf).booleanQuery() \
              == slip_not_diam)

    # Ensure AND/OR order doesn't matter
    print("Bool query can handle query regardless of AND order ('a and b' = 'b and a'):",
          QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
              == QueryProcessor("diameter and slipstream", ii, cf).booleanQuery())
    print("Bool query can handle query regardless of OR order ('a or b' = 'b or a'):",
          QueryProcessor("slipstream or diameter", ii, cf).booleanQuery() \
              == QueryProcessor("diameter or slipstream", ii, cf).booleanQuery())

    # Ensure that the presence of parens does not change query results
    print("Bool query can handle query regardless of parens ('slipstream and diameter'):",
          QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
              == QueryProcessor("(slipstream and diameter)", ii, cf).booleanQuery())

    # Ensure parentheses do not change the order of processing for
    # AND-AND and OR-OR queries
    print("Bool query AND is associative ('(a and b) and c' = 'a and (b and c)'):",
          QueryProcessor("(slipstream and diameter) and thrust", ii, cf).booleanQuery() \
              == QueryProcessor("slipstream and (diameter and thrust)", ii, cf).booleanQuery())
    print("Bool query OR is associative ('(a or b) or c' = 'a or (b or c)'):",
          QueryProcessor("(slipstream or diameter) or thrust", ii, cf).booleanQuery() \
              == QueryProcessor("slipstream or (diameter or thrust)", ii, cf).booleanQuery())

    # Ensure parentheses properly group items.
    # Tested by doing the query "manually": ANDing/ORing the correct terms
    part_one = QueryProcessor("conduction and cylinder and gas", ii, cf).booleanQuery()
    part_two = QueryProcessor("radiation and gas", ii, cf).booleanQuery()
    part_one.extend(part_two)
    expected_result = QueryProcessor("hugoniot", ii, cf).booleanQuery()
    expected_result.extend(part_one)
    print("Bool query parens successfully group conflicting operators:",
          QueryProcessor("(conduction and cylinder and gas) or (radiation and gas) or hugoniot", ii, cf).booleanQuery() \
              == sorted(list(set(expected_result))))

    ##### VECTOR QUERY TESTS #####
    # For these, just ensure that most of the results are in the expected list
    print("\nVECTOR QUERY TESTS")

    # Ensure vector query can match on exact title
    print("Vector query matches on exact title:",
          qp1.vectorQuery(1)[0][0] == 8)

    # Try a few example queries from query.text.
    # As long as one-fifth of the top 10 are in gt_result, call it a pass.
    # Note that queries with larger answer sets were chosen to ensure there
    # were enough to get to one-fifth of ten.
    qc = loadCranQry("query.text")
    poss_queries = list(qc)

    # Query 001
    result = QueryProcessor(qc["001"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("001") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 001:",
          sum(correct_vector) > 2)

    # Query 128
    result = QueryProcessor(qc["128"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("128") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 128:",
          sum(correct_vector) > 2)

    # Query 226
    result = QueryProcessor(qc["226"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("226") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 226:",
          sum(correct_vector) > 2)

    # Query 196
    result = QueryProcessor(qc["196"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("196") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 196:",
          sum(correct_vector) > 2)

    # Query 291
    result = QueryProcessor(qc["291"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("291") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 291:",
          sum(correct_vector) > 2)
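# The five checks above repeat the same pattern: run a vector query and count
# how many of the top-10 docIDs appear in the ground truth. A hypothetical
# helper capturing that pattern (the name is illustrative, not part of the
# original test suite):
def _hits_in_ground_truth(result, gt_result):
    """Count how many returned docIDs appear in the ground-truth list."""
    return sum(1 for doc_id, _ in result if doc_id in gt_result)

# Usage sketch mirroring the checks above:
#   result = QueryProcessor(qc["001"].text, ii, cf).vectorQuery(10)
#   print(_hits_in_ground_truth(result, qrel_dict[poss_queries.index("001") + 1]) > 2)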