def handle_data(self):
    '''
    Scan the corpus once, writing per-document records and indexing words.

    The corpus (self.corpus) is read line by line. A line accepted by
    match_docheader() starts a new document: self.doc_id is incremented and
    self.doc_offset is reset to 0. Every line, the header line included, is
    tokenized with split_words() and its tokens are added to the running
    word count of the current document.

    Each token is located inside its line with str.find(); the token is then
    stemmed and lower-cased, and unless it is empty or listed in
    self.stop_word it is registered through self.__add_word_index() together
    with its character offset from the start of the current document.

    The file self.doc_id_output receives one record per document in the form
    "<first header group> <doc_id> <word count>". Records are separated by
    newlines; the last record has no trailing newline.
    '''
    with open(self.doc_id_output, 'w+') as id_out, \
            open(self.corpus, 'r') as source:
        word_total = 0
        for text in source:
            header = match_docheader(text)
            if header:
                # A header line opens a new document.
                self.doc_id += 1
                self.doc_offset = 0
                if self.doc_id > 1 and word_total > 0:
                    # Close the previous document's record with its length.
                    id_out.write(str(word_total) + '\n')
                word_total = 0
                doc_name = header.groups()[0]
                id_out.write(doc_name + ' ' + str(self.doc_id) + ' ')

            tokens = split_words(text)
            word_total += len(tokens)

            # Position in the line from which the next token is searched, so
            # repeated words resolve to successive occurrences.
            position = 0
            search_from = 0
            for token in tokens:
                position = text.find(token, search_from)
                search_from = position + len(token)
                # Stem reduction
                term = stem(token).lower()
                if term in self.stop_word or len(term) == 0:
                    continue
                self.__add_word_index(term, self.doc_offset + position)

            # Offsets are relative to the start of the current document.
            self.doc_offset += len(text)

        # Length of the last document (no trailing newline).
        id_out.write(str(word_total))
def main():
    '''
    Parse the command line from sys.argv and run a query through BKing.

    Recognized options:
      -w <name>   value passed to BKing as its first argument (required;
                  presumably the file to search - confirm against BKing)
      -r <count>  integer passed to BKing as the result count (default 10)
      -ql <terms> every argument after -ql is taken as one query term
      -qs <text>  every argument after -qs is joined with spaces into one
                  string, which is then split with split_words()

    Exactly one of -ql / -qs must be present. Both consume everything up to
    the end of the command line, so -w and -r have to appear before them.

    usage() is called, and nothing is queried, when fewer than three
    arguments are given, when -w is absent, when -w or -r lacks a value,
    when the -r value is not an integer, or when the query option is
    missing or given in both forms.
    '''
    args = sys.argv
    return_count = 10

    def option_value(flag):
        # Argument that follows `flag`, or None when `flag` is the last one.
        pos = args.index(flag) + 1
        return args[pos] if pos < len(args) else None

    # parse parameters
    if len(args) < 3 or "-w" not in args:
        usage()
        return

    file_name = option_value("-w")
    if file_name is None:
        usage()
        return

    if "-r" in args:
        try:
            return_count = int(option_value("-r"))
        except (TypeError, ValueError):
            # Missing (None) or non-numeric count.
            usage()
            return

    has_list = "-ql" in args
    has_string = "-qs" in args
    if has_list and not has_string:
        query_strlist = stem_query(args[args.index("-ql") + 1:])
    elif has_string and not has_list:
        # Join the raw arguments; str() on the list would feed the list's
        # repr (brackets, quotes, commas) into the tokenizer instead.
        query_string = " ".join(args[args.index("-qs") + 1:])
        query_strlist = stem_query(split_words(query_string))
    else:
        usage()
        return

    BKing(file_name, query_strlist, return_count)
def handle_data(self):
    '''
    Scan the corpus once, writing per-document records and indexing words.

    The corpus (self.corpus) is read line by line. A line accepted by
    match_docheader() starts a new document: self.doc_id is incremented and
    self.doc_offset is reset to 0. Every line, the header line included, is
    tokenized with split_words() and its tokens are added to the running
    word count of the current document.

    Each token is located inside its line with str.find(); the token is then
    stemmed and lower-cased, and unless it is empty or listed in
    self.stop_word it is registered through self.__add_word_index() together
    with its character offset from the start of the current document.

    The file self.doc_id_output receives one record per document in the form
    "<first header group> <doc_id> <word count>". Records are separated by
    newlines; the last record has no trailing newline.
    '''
    with open(self.doc_id_output, 'w+') as id_out, \
            open(self.corpus, 'r') as source:
        word_total = 0
        for text in source:
            header = match_docheader(text)
            if header:
                # A header line opens a new document.
                self.doc_id += 1
                self.doc_offset = 0
                if self.doc_id > 1 and word_total > 0:
                    # Close the previous document's record with its length.
                    id_out.write(str(word_total) + '\n')
                word_total = 0
                doc_name = header.groups()[0]
                id_out.write(doc_name + ' ' + str(self.doc_id) + ' ')

            tokens = split_words(text)
            word_total += len(tokens)

            # Position in the line from which the next token is searched, so
            # repeated words resolve to successive occurrences.
            position = 0
            search_from = 0
            for token in tokens:
                position = text.find(token, search_from)
                search_from = position + len(token)
                # Stem reduction
                term = stem(token).lower()
                if term in self.stop_word or len(term) == 0:
                    continue
                self.__add_word_index(term, self.doc_offset + position)

            # Offsets are relative to the start of the current document.
            self.doc_offset += len(text)

        # Length of the last document (no trailing newline).
        id_out.write(str(word_total))