def run(self, query, page=Page(0, 20)):
    '''
    Theory:
        t -> term / token
        r -> relevant
        D -> document
        Q -> query
        sum -> sum over all terms from the query
        w_t -> weight of term t
        sum(w_t) -> sum of all term weights from the query

        document weight = sum(log(p(D_t|Q,r) / p(D_t|Q,not r)))
        where
            p(D_t|Q,r) = 0.5
            p(D_t|Q,not r) = n_t / N_d
        where
            n_t = number of documents containing term t
            N_d = total number of documents

    Args:
        query: query string
        page: user defined page
    '''
    tokens = tokenize_text(query)
    if len(tokens) <= 0:
        return []

    results = []
    for doc_key, document in self.documents.items():
        result = 0
        for token in tokens:
            # skip terms that do not occur in this document
            if token not in document.bag:
                continue
            token_docs = self.docs_bag.get(token, {})
            token_docs_len = len(token_docs)
            if token_docs_len <= 0:
                continue
            # n_t / N_d, the probability of the term in a non-relevant document
            ratio = 1.0 * token_docs_len / self.docs_no
            # guard against a near-zero ratio before taking the logarithm
            if abs(ratio) < 10e-8:
                continue
            result += math.log(0.5 / ratio)
        if result > 0:
            results.append((doc_key, result))

    # sort by score (descending) and keep only the document keys
    results = sorted(results, key=lambda x: x[1], reverse=True)
    results = list(map(lambda x: x[0], results))
    return results[page.start_index:page.end_index]
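
# A minimal, self-contained sketch of the scoring rule above, run on a toy
# corpus of three documents. It only illustrates sum(log(0.5 / (n_t / N_d)));
# the real method iterates over self.documents and self.docs_bag, which are
# replaced here by local dictionaries for readability.
import math

def toy_score(query_tokens, doc_tokens, docs_with_term, total_docs):
    score = 0
    for token in query_tokens:
        if token not in doc_tokens:
            continue
        n_t = docs_with_term.get(token, 0)
        if n_t <= 0:
            continue
        ratio = n_t / total_docs          # p(D_t|Q, not r)
        score += math.log(0.5 / ratio)    # log(p(D_t|Q, r) / p(D_t|Q, not r))
    return score

# "search" occurs in 1 of 3 documents, so its contribution is log(0.5 / (1/3)).
print(toy_score(['search'], {'search', 'engine'}, {'search': 1}, 3))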
def run(self, query, page=Page(0, 20)):
    '''
    Procedure:
        1. tokenize the query
        2. calculate query weight
        3. calculate all distances
        4. sort distances
        5. return sorted result

    Args:
        query: query string
        page: page size and offset

    Returns:
        list of identifiers
    '''
    # tokenize query
    tokens = tokenize_text(query)
    if len(tokens) <= 0:
        return []

    # calculate query weight (tf-idf vector of the query terms)
    bag_of_words = bag(tokens)
    max_freq = bag_of_words[max(bag_of_words, key=bag_of_words.get)]
    query_tf = lil_matrix((1, self.tokens_no))
    for token, freq in bag_of_words.items():
        if token not in self.tokens:
            continue
        index = self.tokens[token]
        query_tf[0, index] = tf(freq, max_freq)
    query_tf = csr_matrix(query_tf)
    query_w = csr_matrix(query_tf.multiply(self.idf))

    # calculate distances between all documents and the query
    distances = self.distance(self.tf_idf, query_w)

    # sort results and return the requested page
    distances = distances[:, 0]
    sorted_indices = np.argsort(distances)
    top = sorted_indices[page.start_index:page.end_index]
    f = np.vectorize(lambda x: self.iterative_docs[x])
    result = list(f(top))
    return result
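
# A minimal sketch of step 2 (query weighting) with concrete numbers. The
# tf(freq, max_freq) helper and the idf vector are assumptions here: tf is
# taken to be freq / max_freq, and idf is a hand-written dense row rather
# than the values stored on self.idf.
import numpy as np
from scipy.sparse import lil_matrix, csr_matrix

vocabulary = {'search': 0, 'engine': 1, 'index': 2}    # token -> column index
idf = np.array([[1.10, 0.40, 0.92]])                   # assumed idf per token

query_bag = {'search': 2, 'engine': 1}                 # token -> frequency in the query
max_freq = max(query_bag.values())

query_tf = lil_matrix((1, len(vocabulary)))
for token, freq in query_bag.items():
    query_tf[0, vocabulary[token]] = freq / max_freq   # assumed tf definition

query_w = csr_matrix(csr_matrix(query_tf).multiply(idf))
print(query_w.toarray())   # tf-idf weights of the query, one column per token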
def preprocess_one(raw_file):
    '''
    Procedure:
        1. create document
        2. fill it with the data

    Args:
        raw_file: text (string)

    Returns:
        Document
    '''
    document = Document()
    document.text = raw_file
    document.identifier = text_hash(raw_file)
    document.content_hash = document.identifier
    document.tokens = tokenize_text(raw_file)
    document.bag = bag(document.tokens)
    return document
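
# The helpers used above (text_hash, tokenize_text, bag) are defined elsewhere
# in the project; the sketch below only shows plausible minimal versions so the
# preprocessing step can be read end to end. The exact hashing and tokenization
# rules are assumptions, not the project's actual implementations.
import hashlib
import re
from collections import Counter

def text_hash(text):
    # stable identifier for the raw text (assumed to be a content hash)
    return hashlib.sha1(text.encode('utf-8')).hexdigest()

def tokenize_text(text):
    # lowercase word tokenizer (assumed; the project may also strip stop words)
    return re.findall(r'\w+', text.lower())

def bag(tokens):
    # bag of words: token -> number of occurrences
    return dict(Counter(tokens))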
def run(self, query, page=Page(0, 20)):
    '''
    Procedure:
        1. tokenize the query
        2. for each token, get a normalized score for each document that
           contains the token
        3. sum all normalized scores

    Why normalization:
        E.g. if the query is "test", the document "Test one more time." is
        more relevant than the document "Test one more time because something
        could go wrong.", because "test" has a bigger impact in the shorter
        document.

    Args:
        query: query string
        page: page size and offset

    Returns:
        list of identifiers
    '''
    tokens = tokenize_text(query)
    if len(tokens) <= 0:
        return []

    data = {}
    for token in tokens:
        if token not in self.docs_bag:
            continue
        docs_bag_item = self.docs_bag[token]
        for doc_key, token_occurrence in docs_bag_item.items():
            doc_size = len(self.documents[doc_key].tokens)
            if doc_size <= 0:
                normalized = 0
            else:
                # normalize the term count by the document length
                normalized = token_occurrence / doc_size
            data[doc_key] = data.get(doc_key, 0) + normalized

    # sort document keys by their accumulated score (descending)
    data = sorted(data, key=data.get, reverse=True)
    return data[page.start_index:page.end_index]
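
# A standalone illustration of the length normalization above, using the two
# example documents from the docstring. The token counts are computed inline
# instead of coming from self.docs_bag / self.documents, so the numbers are
# easy to check by hand.
doc_a = "test one more time".split()
doc_b = "test one more time because something could go wrong".split()

# "test" occurs once in each document, but the shorter one scores higher
score_a = doc_a.count("test") / len(doc_a)   # 1 / 4  = 0.25
score_b = doc_b.count("test") / len(doc_b)   # 1 / 9  ~ 0.11
print(score_a > score_b)                     # True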