# NOTE: the preprocessing helpers below rely on project-local utilities
# (Tokenizer, load_vocab, build_*, write_*) and module-level constants
# (_TRAIN_FILE, _VALID_FILE, _TEST_FILE, _SUFFIX, ...) defined elsewhere in the project.
import os

from transformers import AutoTokenizer


def prepare_tokenizer(config, model):
    args = config['args']
    if config['emb_class'] == 'glove':
        vocab = load_vocab(args.vocab_path)
        tokenizer = Tokenizer(vocab, config)
    else:
        tokenizer = model.bert_tokenizer
    return tokenizer

def preprocess_glove(config):
    args = config['args']

    # vocab, embedding
    init_vocab = build_init_vocab(config)
    vocab, embedding = build_vocab_from_embedding(args.embedding_path,
                                                  init_vocab, config)

    # build data
    tokenizer = Tokenizer(vocab, config)
    if args.augmented:
        path = os.path.join(args.data_dir, args.augmented_filename)
    else:
        path = os.path.join(args.data_dir, _TRAIN_FILE)
    train_data = build_data(path, tokenizer)
    path = os.path.join(args.data_dir, _VALID_FILE)
    valid_data = build_data(path, tokenizer)
    path = os.path.join(args.data_dir, _TEST_FILE)
    test_data = build_data(path, tokenizer)

    # build labels
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    labels = build_label(path)

    # write data, vocab, embedding, labels
    if args.augmented:
        path = os.path.join(args.data_dir, args.augmented_filename + _SUFFIX)
    else:
        path = os.path.join(args.data_dir, _TRAIN_FILE + _SUFFIX)
    write_data(train_data, path, tokenizer, labels)
    path = os.path.join(args.data_dir, _VALID_FILE + _SUFFIX)
    write_data(valid_data, path, tokenizer, labels)
    path = os.path.join(args.data_dir, _TEST_FILE + _SUFFIX)
    write_data(test_data, path, tokenizer, labels)
    path = os.path.join(args.data_dir, _VOCAB_FILE)
    write_vocab(vocab, path)
    path = os.path.join(args.data_dir, _EMBED_FILE)
    write_embedding(embedding, path)
    path = os.path.join(args.data_dir, _LABEL_FILE)
    write_label(labels, path)

def preprocess_glove_or_elmo(config):
    args = config['args']

    # vocab, embedding
    init_vocab = build_init_vocab(config)
    vocab, embedding = build_vocab_from_embedding(args.embedding_path,
                                                  init_vocab, config)

    # build poss, chars, labels
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    poss, chars, labels, _ = build_dict(path, config)

    tokenizer = Tokenizer(vocab, config)

    # build data
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    train_data = build_data(path, tokenizer)
    path = os.path.join(args.data_dir, _VALID_FILE)
    valid_data = build_data(path, tokenizer)
    path = os.path.join(args.data_dir, _TEST_FILE)
    test_data = build_data(path, tokenizer)

    # write data, vocab, embedding, poss, labels
    path = os.path.join(args.data_dir, _TRAIN_FILE + _SUFFIX)
    write_data(args, train_data, path, tokenizer, poss, labels)
    path = os.path.join(args.data_dir, _VALID_FILE + _SUFFIX)
    write_data(args, valid_data, path, tokenizer, poss, labels)
    path = os.path.join(args.data_dir, _TEST_FILE + _SUFFIX)
    write_data(args, test_data, path, tokenizer, poss, labels)
    path = os.path.join(args.data_dir, _VOCAB_FILE)
    write_vocab(vocab, path)
    path = os.path.join(args.data_dir, _EMBED_FILE)
    write_embedding(embedding, path)
    path = os.path.join(args.data_dir, _POS_FILE)
    write_dict(poss, path)
    path = os.path.join(args.data_dir, _LABEL_FILE)
    write_dict(labels, path)

def preprocess_bert(config):
    args = config['args']

    w_tokenizer = None
    if args.bert_use_subword_pooling and args.bert_use_word_embedding:
        args = config['args']
        # vocab, embedding
        init_vocab = build_init_vocab(config)
        vocab, embedding = build_vocab_from_embedding(args.embedding_path,
                                                      init_vocab, config)
        w_tokenizer = Tokenizer(vocab, config)
        # write embedding
        path = os.path.join(args.data_dir, _EMBED_FILE)
        write_embedding(embedding, path)

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model_name_or_path)

    # build poss, chars, labels, glabels
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    poss, chars, labels, glabels = build_dict(path, config)

    # build features
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    train_features = build_features(path, tokenizer, poss, labels, config,
                                    mode='train', w_tokenizer=w_tokenizer,
                                    glabels=glabels)
    path = os.path.join(args.data_dir, _VALID_FILE)
    valid_features = build_features(path, tokenizer, poss, labels, config,
                                    mode='valid', w_tokenizer=w_tokenizer,
                                    glabels=glabels)
    path = os.path.join(args.data_dir, _TEST_FILE)
    test_features = build_features(path, tokenizer, poss, labels, config,
                                   mode='test', w_tokenizer=w_tokenizer,
                                   glabels=glabels)

    # write features
    path = os.path.join(args.data_dir, _TRAIN_FILE + _FSUFFIX)
    write_features(train_features, path)
    path = os.path.join(args.data_dir, _VALID_FILE + _FSUFFIX)
    write_features(valid_features, path)
    path = os.path.join(args.data_dir, _TEST_FILE + _FSUFFIX)
    write_features(test_features, path)

    # write poss, labels, glabels
    path = os.path.join(args.data_dir, _POS_FILE)
    write_dict(poss, path)
    path = os.path.join(args.data_dir, _LABEL_FILE)
    write_dict(labels, path)
    path = os.path.join(args.data_dir, _GLABEL_FILE)
    write_dict(glabels, path)

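
# Usage sketch (an assumption, not part of the original project): it shows the shape of
# the `config` dict the preprocess_* functions above expect -- an 'emb_class' string
# plus an 'args' namespace carrying the paths they read. Every concrete value below is
# a hypothetical placeholder.
from types import SimpleNamespace

if __name__ == '__main__':
    args = SimpleNamespace(
        data_dir='data/conll2003',                       # hypothetical directory
        embedding_path='embeddings/glove.6B.300d.txt',   # hypothetical embedding file
        augmented=False,
        augmented_filename='augmented.txt',              # hypothetical file name
    )
    config = {'emb_class': 'glove', 'args': args}
    preprocess_glove(config)
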
# Imports used by the retrieval classes below; the project-local InvertedIndex,
# Tokenizer, Posting and IndexItem classes are assumed to be defined or imported
# elsewhere in the project.
import collections
import json
import math
import operator
import pickle
from collections import Counter

import numpy as np


class QueryProcessor:
    ##
    # @param self
    # @param query
    # @param index_file
    # @param collection
    # @return None
    # @brief The constructor.
    #        This process is expensive because it loads the entire pickled index into
    #        memory. That is fine when executing a single query, but for evaluation
    #        runs use loadQuery instead.
    # @exception None documented yet
    ##
    def __init__(self, query, index_file, collection):
        ''' index is the inverted index; collection is the document collection'''
        self.raw_query = query
        self.index = InvertedIndex()
        self.index = self.index.loadData(index_file)
        self.docs = collection
        self.tokenizer = Tokenizer(
            known_words=set(self.index.get_items_inverted().keys()))
        if self.raw_query:
            self.processed_query = self.preprocessing(self.raw_query)

    ##
    # @brief This method is used to load the next query for evaluation
    # @param self
    # @param query
    # @return None
    # @exception None
    ##
    def loadQuery(self, query):
        self.raw_query = query
        self.processed_query = self.preprocessing(self.raw_query)

    ##
    # @brief This method applies the same preprocessing to the query that was used
    #        during indexing
    # @param self
    # @param raw_query
    # @return processed_query: list[str]
    # @exception None
    ##
    def preprocessing(self, raw_query):
        ''' apply the same preprocessing steps used by indexing, and also use the
            provided spelling corrector. Note that the spelling corrector should be
            applied before stopword removal and stemming (why?)'''
        return self.tokenizer.transpose_document_tokenized_stemmed_spelling(
            raw_query)

    ##
    # @brief This method does the boolean query processing
    # @param self
    # @return results: list[docID]
    # @bug Fixed
    # @exception None
    ##
    def booleanQuery(self):
        ''' boolean query processing; note that a query like "A B C" is transformed
            into "A AND B AND C" for retrieving posting lists and merging them'''
        ''' This alternative would likely be faster due to the use of hashes, but I
            wanted to do what was shown in the slides:

            from functools import reduce
            docs = [set(self.index[w]) for w in self.processed_query]
            docs.sort(key=len)  # notice it is still smart to order by size
            return reduce(set.intersection, docs)
        '''
        if len(self.processed_query) == 0:
            return []

        # check that all of the query words are in the index; if not, return []
        for w in self.processed_query:
            if not w in self.index.get_items_inverted():
                return []

        # if the query contains only one term, return its posting list directly
        if len(self.processed_query) == 1:
            return list(self.index.get_items_inverted()[
                self.processed_query[0]].get_posting_list().keys())

        # document_ids is a list of lists containing only document IDs
        document_ids = [
            list(self.index.get_items_inverted()[w].get_posting_list().keys())
            for w in self.processed_query
        ]

        # starting with the shortest list of documents gives a potential speed-up
        document_ids.sort(key=len)
        results = document_ids[0]

        # intersect the docIDs of each remaining posting list with those found so far;
        # this could be faster if the index were backed by a set or another hash-based
        # data structure
        for p in document_ids[1:]:
            intermediate = []
            i, j = 0, 0
            while i < len(results) and j < len(p):
                if int(results[i]) < int(p[j]):
                    i += 1
                elif int(results[i]) > int(p[j]):
                    j += 1
                else:
                    intermediate.append(p[j])
                    j += 1
                    i += 1
            results = intermediate

            # stop early if the terms are completely disjoint
            if len(results) == 0:
                return results

        return results

    ##
    # @brief This method computes the cosine similarity of two vectors
    # @param self
    # @param vec1
    # @param vec2
    # @return score: float
    # @exception None
    ##
    def cosine_similarity(self, vec1, vec2):
        # compute cosine similarity: (vec1*vec2)/(||vec1||*||vec2||)
        AA, AB, BB = 0, 0, 0
        for i in range(len(vec1)):
            x = vec1[i]
            y = vec2[i]
            AA += x * x
            BB += y * y
            AB += x * y
        return round(AB / math.sqrt(AA * BB), 4)

    ##
    # @brief This method computes the vector space model ranking
    # @param self
    # @param k
    # @return results: list[(docID, score)]
    # @bug Fixed
    # @exception ValueError
    ##
    def vectorQuery(self, k):
        ''' vector query processing, using the cosine similarity. '''
        # ToDo: return the top k pairs of (docID, similarity), ranked by their cosine
        # similarity with the query in descending order.
        # Either term frequency or TF-IDF can be used to construct the vectors.
        if len(self.processed_query) == 0:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            return [(str(id), 0)
                    for id in sorted(list(map(int, all_docids)))[:k]]

        query_words = list(set(self.processed_query))
        idfs = [self.index.idf(w) for w in query_words]

        # behavior is undefined if k is larger than the corpus
        try:
            if k > self.index.get_total_number_Doc():
                raise ValueError('k is greater than number of documents')
        except ValueError as err:
            print(err.args)
            return

        # Behavior when none of the query words appear in any document: the
        # instructions leave this undefined, so returning documents with score 0
        # seems most appropriate -- a search engine that found only zero-cosine
        # matches has nothing relevant to return, even if you asked for the top 50.
        if set(idfs) == {0}:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            return [(str(id), 0)
                    for id in sorted(list(map(int, all_docids)))[:k]]

        # drop any word with idf == 0, since it does not appear in the corpus; this
        # saves memory. Converting to lists is probably unnecessary -- leaving them
        # as tuples may be more appropriate.
        idfs, query_words = map(
            list,
            zip(*[i for i in list(zip(idfs, query_words)) if not i[0] == 0]))

        # calculate the tfs of the remaining query words
        query_term_counter = Counter(self.processed_query)
        query_tf_vector = [
            round(math.log10(query_term_counter[w] + 1), 4)
            for w in query_words
        ]
        # alternative way of computing tf:
        # query_tf_vector = [round(1 + math.log10(query_term_counter[w]), 4)
        #                    if query_term_counter[w] > 0 else 0
        #                    for w in query_words]

        # NCC change: if a query term does not appear in our inverted index,
        # forget/discount the term.
        # postings is a list of posting lists, one per remaining query word
        postings = [
            self.index.get_items_inverted()[w].get_posting_list()
            for w in query_words if w in self.index.get_items_inverted()
        ]

        document_ids = set().union(*postings)
        document_tfs = {d: [0] * len(query_words) for d in document_ids}

        for inx, term in enumerate(postings):
            for document_id, posting in term.items():
                # log normalization
                document_tfs[document_id][inx] = math.log10(
                    posting.term_freq() + 1)
                # alternative:
                # tf = posting.term_freq()
                # if tf > 0:
                #     tf = 1 + math.log10(tf)
                # else:
                #     tf = 0
                # document_tfs[document_id][inx] = tf

        query_tfidf = np.multiply(query_tf_vector, idfs)
        cosines = Counter({
            d: self.cosine_similarity(query_tfidf, np.multiply(d_tf, idfs))
            for d, d_tf in document_tfs.items()
        })

        # The scores have to be kept in a list because dicts are not sorted. We also
        # need a consistent ordering when multiple documents share the same score, so
        # we sort first on score and then on docID, which is slow. If we knew k or the
        # number of documents up front, we could preallocate with numpy instead of
        # appending.
        temp_k = k
        scores = sorted(list(set(cosines.values())), reverse=True)
        ret = []
        for s in scores:
            docs_with_score_s = sorted(
                [int(d) for d, v in cosines.items() if v == s])
            if len(docs_with_score_s) >= temp_k:
                docs_with_score_s = docs_with_score_s[:temp_k]
                ret.extend([(str(d), s) for d in docs_with_score_s])
                temp_k = 0
                break
            else:
                temp_k = temp_k - len(docs_with_score_s)
                ret.extend([(str(d), s) for d in docs_with_score_s])

        # pad with zero-score documents if fewer than k documents had a nonzero match
        if not temp_k == 0:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            ret.extend([
                (str(j), 0) for j in sorted(
                    list(map(int,
                             all_docids.difference({i[0] for i in ret}))))[:temp_k]
            ])
        return ret

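
# Standalone sketch (not from the original code base) of the two-pointer posting-list
# intersection that booleanQuery() performs above: start with the shortest list and
# repeatedly intersect it with the next one. Plain lists of integer docIDs stand in
# for the project's InvertedIndex/Posting classes; all names here are hypothetical.
def intersect_posting_lists(posting_lists):
    """Intersect sorted lists of integer docIDs, returning docIDs present in all."""
    if not posting_lists:
        return []
    posting_lists = sorted(posting_lists, key=len)  # shortest list first
    results = posting_lists[0]
    for p in posting_lists[1:]:
        merged, i, j = [], 0, 0
        while i < len(results) and j < len(p):
            if results[i] < p[j]:
                i += 1
            elif results[i] > p[j]:
                j += 1
            else:
                merged.append(p[j])
                i += 1
                j += 1
        results = merged
        if not results:  # the terms are disjoint, stop early
            break
    return results

# tiny worked example: docID lists for three query terms
print(intersect_posting_lists([[1, 2, 5, 9], [2, 5, 7], [2, 3, 5, 8]]))  # -> [2, 5]
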
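
# Standalone sketch (not from the original code base) of the weighting scheme that
# vectorQuery() describes: log-scaled term frequencies multiplied by
# idf = log10(N / df), compared with cosine similarity, and ranked by descending
# score with docID as the tie-breaker. The toy corpus and all helper names are
# hypothetical.
import math
from collections import Counter

def log_tf(count):
    return math.log10(1 + count)

def idf_weight(term, docs):
    df = sum(1 for d in docs if term in d)
    return math.log10(len(docs) / df) if df else 0.0

def cosine(u, v):
    dot = sum(x * y for x, y in zip(u, v))
    norm_u = math.sqrt(sum(x * x for x in u))
    norm_v = math.sqrt(sum(y * y for y in v))
    return dot / (norm_u * norm_v) if norm_u and norm_v else 0.0

def rank(query_tokens, docs, k):
    terms = sorted(set(query_tokens))
    idfs = [idf_weight(t, docs) for t in terms]
    q_counts = Counter(query_tokens)
    q_vec = [log_tf(q_counts[t]) * w for t, w in zip(terms, idfs)]
    scored = []
    for doc_id, tokens in enumerate(docs):
        d_counts = Counter(tokens)
        d_vec = [log_tf(d_counts[t]) * w for t, w in zip(terms, idfs)]
        scored.append((doc_id, round(cosine(q_vec, d_vec), 4)))
    # sort by score (descending), then docID (ascending) for a stable ordering
    return sorted(scored, key=lambda x: (-x[1], x[0]))[:k]

toy_docs = [["search", "engine", "index"],
            ["boolean", "query", "index"],
            ["cosine", "similarity", "query", "index"]]
print(rank(["cosine", "query"], toy_docs, k=2))  # doc 2 ranks first
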
class InvertedIndex:
    ##
    # @param self
    # @return None
    # @brief The constructor.
    # @exception None documented yet
    ##
    def __init__(self):
        self.__items = {}  # dict mapping term -> IndexItem
        self.__nDocs = 0   # the number of indexed documents
        self.__tokenizer = Tokenizer()

    ##
    # @brief This method returns the total number of documents in our data set
    # @param self
    # @return nDocs: int
    # @exception None
    ##
    def get_total_number_Doc(self):
        return self.__nDocs

    ##
    # @brief This method returns the inverted index dictionary (term -> IndexItem)
    # @param self
    # @return items: dict
    # @exception None
    ##
    def get_items_inverted(self):
        return self.__items

    ##
    # @brief This method indexes a document, using the simple SPIMI algorithm,
    #        but there is no need to store blocks due to the small collection we are
    #        handling; we save/load the whole index instead.
    #
    #        ToDo: index only title and body; use some functions defined in util.py
    #        (1) convert to lower case,
    #        (2) remove stopwords,
    #        (3) stemming
    #
    # @param self
    # @param doc
    # @return None
    # @exception None
    ##
    def indexDoc(self, doc):  # indexing a Document object
        # concatenate document title, author and body
        newDoc = doc.title + " " + doc.author + " " + doc.body
        docID = doc.docID
        full_stemmed_list = self.__tokenizer.transpose_document_tokenized_stemmed(
            newDoc)

        for position, term in enumerate(full_stemmed_list):
            if self.__items.get(term) != None:
                self.__items[term].add(docID, position)
            else:
                # key does not exist in dict yet
                newPosting = Posting(docID)
                newPosting.append(position)
                self.__items[term] = IndexItem(term)
                self.__items[term].set_posting_list(docID, newPosting)

        self.__nDocs += 1

    ##
    # @brief This method sorts all posting lists by document ID.
    #        NOTE: This method seems redundant, since documents are read in a fixed
    #        order and the document IDs in each posting list are therefore already
    #        in order by default.
    # @param self
    # @return None
    # @exception None
    ##
    def sort(self):
        ''' sort all posting lists by docID'''
        for term, posting in self.__items.items():
            posting.sort()

    ##
    # @brief This method sorts all indexing terms in our index
    # @param self
    # @return OrderedDict
    # @exception None
    ##
    def sort_terms(self):
        ''' sort all index terms alphabetically'''
        return collections.OrderedDict(
            sorted(self.__items.items(), key=operator.itemgetter(0)))

    ##
    # @brief This method finds a term in the index and returns its IndexItem,
    #        which holds the term's posting list
    # @param self
    # @param term
    # @return item: IndexItem
    # @exception None
    ##
    def find(self, term):
        return self.__items[term]

    ##
    # @brief This method is a dumper for json serialization
    # @param self
    # @param obj
    # @return obj.toJSON() if available, otherwise obj.__dict__
    # @exception None
    ##
    def dumper(self, obj):
        try:
            return obj.toJSON()
        except:
            return obj.__dict__

    ##
    # @brief This method serializes the inverted index to a json file
    # @param self
    # @param filename
    # @return None
    # @exception ValueError
    ##
    def save(self, filename):
        write_stream = open(filename, 'w')
        listTerm = self.sort_terms()
        dictMain = {}
        listInfo = {}
        for term, postingList in listTerm.items():
            dictTemp = postingList.posting_list_to_string()
            dictTemp["idf"] = self.idf(term)
            dictMain[term] = dictTemp
        listInfo["nDoc"] = self.get_total_number_Doc()
        listInfo["Data"] = dictMain
        try:
            write_stream.write(json.dumps(listInfo, indent=3))
        except ValueError:
            print("Is not valid json")
        write_stream.close()

    ##
    # @brief This method deserializes a json file into a dict by reallocating
    #        self.__items
    # @param self
    # @param filename
    # @return json: dict
    # @exception ValueError
    ##
    def load(self, filename):
        try:
            with open(filename) as json_file:
                return json.load(json_file)
        except ValueError:
            print("Is not valid json")

    ##
    # @brief This method computes the inverted document frequency for a given term.
    #        We use IDF = log10((total number of documents) / (number of documents
    #        containing the term)).
    # @param self
    # @param term
    # @return idf: float
    # @exception None
    ##
    def idf(self, term):
        if not term in self.__items:
            return 0
        termData = self.__items[term]
        N = self.get_total_number_Doc()
        df = len(termData.get_posting_list())
        # inverse document frequency
        idf = round(math.log10(N / (float(df))), 4)
        # probabilistic inverse document frequency:
        # idf = round(math.log10((N - df) / (float(df))), 4)
        return idf

    ##
    # @brief This method creates an IDF dictionary for the whole index
    # @param self
    # @return idf: {term: idf}
    # @exception None
    ##
    def idfDict(self):
        idf = collections.OrderedDict()
        for term, postingList in self.sort_terms().items():
            idf[term] = self.idf(term)
        return idf

    ##
    # @brief This method creates the TF values for every document.
    #        There are different ways to represent TF; we use tf = log10(1 + tf).
    #        Another way is TF = (frequency of the word in the document) /
    #        (total number of words in the document).
    # @param self
    # @return word_tf_values: {term: {docID: tf, docID: tf}}
    # @exception None
    ##
    def tf_doc(self):
        word_tf_values = collections.OrderedDict()
        for term, postingList in self.sort_terms().items():
            doc_tf = collections.OrderedDict()
            for docID, post in postingList.get_posting_list().items():
                # log normalize
                doc_tf[docID] = round(math.log10(1 + post.term_freq()), 4)
            word_tf_values[term] = doc_tf
        return word_tf_values

    ##
    # @brief This method creates tf-idf values for all documents.
    #        Its structure is of the form {docID: [{term: tf-idf}, {term: tf-idf}]}
    # @param self
    # @param word_tf_valuesm
    # @param idfDict
    # @return TFIDF_dict: {docID: [{term: tf-idf}, {term: tf-idf}]}
    # @exception None
    ##
    def tf_idf(self, word_tf_valuesm, idfDict):
        TFIDF_dict = collections.defaultdict(list)
        for term, postingList in self.sort_terms().items():
            tf_idf = 0.0
            for doc, doctf in word_tf_valuesm[term].items():
                term_tf_idf_doc = {}
                tf_idf = doctf * idfDict[term]
                term_tf_idf_doc[term] = tf_idf
                TFIDF_dict[doc].append(term_tf_idf_doc)
        return TFIDF_dict

    ##
    # @brief This method saves the current state of the InvertedIndex with pickle
    # @param self
    # @param filename
    # @return None
    # @exception AttributeError, pickle.PickleError
    ##
    def storeData(self, filename):
        try:
            fileP = open(filename, "wb")
            pickle.dump(self, fileP)  # serialize the class object
        except (AttributeError, pickle.PickleError):
            print("Error pickle.dump InvertedIndex ")
        fileP.close()

    ##
    # @brief This method loads a saved InvertedIndex from a pickle file
    # @param self
    # @param filename
    # @return invertedIndexer
    # @exception (pickle.UnpicklingError, ImportError, EOFError, IndexError, TypeError)
    ##
    def loadData(self, filename):
        invertedIndexer = None  # avoids a NameError if unpickling fails below
        try:
            fileP = open(filename, "rb")
            invertedIndexer = pickle.load(fileP)
        except (pickle.UnpicklingError, ImportError, EOFError, IndexError,
                TypeError) as err:
            print(err)
            print("Error pickle.load InvertedIndex ")
        fileP.close()
        return invertedIndexer

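
# Usage sketch (an assumption about how the two classes above fit together, based only
# on the methods shown): index a couple of toy documents, pickle the index with
# storeData(), then answer a query through QueryProcessor. The SimpleNamespace
# documents and the file name are hypothetical stand-ins for the project's own
# Document/collection types.
from types import SimpleNamespace

if __name__ == '__main__':
    toy_collection = [
        SimpleNamespace(docID='1', title='boolean retrieval', author='smith',
                        body='intersect sorted posting lists'),
        SimpleNamespace(docID='2', title='vector space model', author='jones',
                        body='rank documents by cosine similarity of tf-idf vectors'),
    ]

    index = InvertedIndex()
    for d in toy_collection:
        index.indexDoc(d)
    index.storeData('index.pkl')  # hypothetical file name

    qp = QueryProcessor('cosine similarity', 'index.pkl', toy_collection)
    print(qp.booleanQuery())      # docIDs containing every query term
    print(qp.vectorQuery(k=2))    # top-2 (docID, cosine score) pairs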