def handle_data(self):
    '''
    Scan the corpus and build two outputs:

    * ``self.doc_id_output`` — one line per document: the header capture
      group, the assigned numeric doc id, and the document's word count.
    * the in-memory word index — every non-stop, non-empty stemmed word is
      registered via ``self.__add_word_index`` with its character offset
      inside the current document.

    Offsets are tracked as ``self.doc_offset`` (characters consumed in the
    current document so far) plus ``offset_inline`` (position of the word
    within the current line, located with ``str.find``).

    NOTE(review): an identical definition of this method appears again
    later in the file and will shadow this one — likely an accidental
    duplicate; confirm and delete one copy.
    '''
    with open(self.doc_id_output, 'w+') as doc_file, \
        open(self.corpus, 'r') as corpus:
        doclen = 0  # running word count of the current document
        for line in corpus:
            offset_inline = 0
            # presumably matches a document-header line; group 0 looks
            # like the document's name/id string — TODO confirm against
            # match_docheader's pattern.
            match = match_docheader(line)
            if match:
                self.doc_id += 1
                self.doc_offset = 0  # offsets restart at each new document
                # Flush the previous document's word count before starting
                # the new record (skipped for the very first document).
                if self.doc_id > 1 and doclen > 0:
                    doc_file.write(str(doclen) + '\n')
                    doclen = 0
                doc_file.write(match.groups()[0] + ' ' + str(self.doc_id) + ' ')
            # Header lines are tokenized and indexed too — intentional?
            # TODO confirm; the header is not skipped here.
            line_words = split_words(line)
            doclen += len(line_words)
            lastword = ''
            for word in line_words:
                # Locate this word after the end of the previous one.
                # NOTE(review): assumes split_words returns verbatim
                # substrings of `line`; if not, find() returns -1 and the
                # offsets go wrong — verify split_words.
                offset_inline = line.find(word, offset_inline + len(lastword))
                lastword = word
                # Stem reduction
                word = stem(word).lower()
                if word not in self.stop_word and len(word) != 0:
                    self.__add_word_index(word, self.doc_offset + offset_inline)
            self.doc_offset = self.doc_offset + len(line)
        # Last document's word count (no trailing newline, matching the
        # flush format above minus the separator).
        doc_file.write(str(doclen))
def handle_data(self):
    '''
    Walk the corpus once, writing a doc-id record per document to
    ``self.doc_id_output`` and feeding every usable stemmed word into
    ``self.__add_word_index`` together with its character offset inside
    the current document.

    NOTE(review): this is a duplicate of an identical method defined
    earlier in the file; this later copy is the one that takes effect.
    '''
    with open(self.doc_id_output, 'w+') as doc_file, \
        open(self.corpus, 'r') as corpus:
        words_in_doc = 0
        for line in corpus:
            cursor = 0
            header = match_docheader(line)
            if header:
                self.doc_id += 1
                self.doc_offset = 0
                # Emit the finished document's word count before the
                # next record begins (not before the first document).
                if self.doc_id > 1 and words_in_doc > 0:
                    doc_file.write(str(words_in_doc) + '\n')
                    words_in_doc = 0
                doc_file.write(header.groups()[0] + ' ' + str(self.doc_id) + ' ')
            tokens = split_words(line)
            words_in_doc += len(tokens)
            previous = ''
            for token in tokens:
                # Advance the in-line cursor past the previous token,
                # then locate this one.
                cursor = line.find(token, cursor + len(previous))
                previous = token
                # Stem reduction
                reduced = stem(token).lower()
                if reduced not in self.stop_word and len(reduced) != 0:
                    self.__add_word_index(reduced, self.doc_offset + cursor)
            self.doc_offset = self.doc_offset + len(line)
        # Word count of the final document (no trailing newline).
        doc_file.write(str(words_in_doc))
def stem_query(query_string):
    '''
    Stem and lowercase each word of the query.

    Bug fixed: the original used ``filter``, which treats the stemmed word
    only as a truthiness predicate and yields the ORIGINAL, un-stemmed
    items (silently dropping any whose stem is empty). ``map`` actually
    returns the transformed words.

    NOTE(review): if callers pass a raw string rather than an iterable of
    words, this iterates characters — presumably the input is pre-split
    (e.g. via split_words); confirm at the call sites.

    :param query_string: iterable of query words
    :returns: lazy iterator of stemmed, lowercased words
    '''
    return map(lambda word: stem(word).lower(), query_string)