def run(self, query, page=Page(0, 20)):
    '''
    Procedure:
        1. tokenize the query
        2. calculate query weight
        3. calculate all distances
        4. sort distances
        5. return sorted result

    Args:
        query: query string
        page: page size and offset

    Returns:
        list of identifiers
    '''
    # tokenize query
    tokens = tokenize_text(query)
    if len(tokens) <= 0:
        return []

    # calculate query weight
    bag_of_words = bag(tokens)
    max_freq = bag_of_words[max(bag_of_words, key=bag_of_words.get)]
    query_tf = lil_matrix((1, self.tokens_no))
    for token, freq in bag_of_words.items():
        if token not in self.tokens:
            continue
        index = self.tokens[token]
        query_tf[0, index] = tf(freq, max_freq)
    query_tf = csr_matrix(query_tf)
    query_w = csr_matrix(query_tf.multiply(self.idf))

    # calculate distances between all documents and the query
    distances = self.distance(self.tf_idf, query_w)

    # sort results and return the requested page
    distances = distances[:, 0]
    sorted_indices = np.argsort(distances)
    top = sorted_indices[page.start_index:page.end_index]
    f = np.vectorize(lambda x: self.iterative_docs[x])
    result = list(f(top))
    return result
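
# `tf` and `self.distance` are defined elsewhere in the module; below is a
# minimal sketch of plausible implementations, assuming augmented term
# frequency and cosine distance over scipy sparse matrices (the real code
# may differ).
from sklearn.metrics.pairwise import cosine_distances

def tf(freq, max_freq):
    # augmented term frequency: 0.5 + 0.5 * freq / max_freq stays in [0.5, 1],
    # so long documents do not dominate purely by repeating terms
    return 0.5 + 0.5 * freq / max_freq

def distance(tf_idf, query_w):
    # returns an (n_docs, 1) column of cosine distances between each document
    # row of `tf_idf` and the query vector; smaller means more similar, which
    # is why `run` sorts ascending with np.argsort
    return cosine_distances(tf_idf, query_w)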
def preprocess_one(raw_file):
    '''
    Procedure:
        1. create document
        2. fill with the data

    Args:
        raw_file: text (string)

    Returns:
        Document
    '''
    document = Document()
    document.text = raw_file
    document.identifier = text_hash(raw_file)
    document.content_hash = document.identifier
    document.tokens = tokenize_text(raw_file)
    document.bag = bag(document.tokens)
    return document
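
# Hypothetical usage sketch: the engine class name, its constructor, and the
# `index` step are assumptions for illustration only, not part of the code
# above. It shows how `preprocess_one` feeds documents into the index that
# `run` later queries.
if __name__ == '__main__':
    texts = ['the quick brown fox', 'lazy dogs sleep all day']
    documents = [preprocess_one(text) for text in texts]

    engine = SearchEngine()             # hypothetical engine holding tokens, idf, tf_idf
    engine.index(documents)             # hypothetical step that builds the TF-IDF matrix
    identifiers = engine.run('quick fox', page=Page(0, 10))
    print(identifiers)                  # identifiers of the best-matching documents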