Пример #1
0
    def handle_data(self):
        """Index every corpus word and record a word count per document.

        Reads ``self.corpus`` line by line. A line matched by
        ``match_docheader`` starts a new document: ``self.doc_id`` is
        incremented, ``self.doc_offset`` is reset to 0 and a record
        ``<first header group> <doc id> <word count>`` is begun in
        ``self.doc_id_output``. The count is appended when the next header
        (or the end of the corpus) is reached, and it includes the words of
        the header line itself.

        Each word returned by ``split_words`` is stemmed and lower-cased;
        results that are non-empty and not in ``self.stop_word`` are handed
        to ``__add_word_index`` together with their offset measured from the
        start of the current document's header line.
        """
        with open(self.doc_id_output, 'w+') as id_out, \
                open(self.corpus, 'r') as source:
            word_total = 0
            for text in source:
                header = match_docheader(text)
                if header:
                    self.doc_id += 1
                    self.doc_offset = 0
                    # Close the previous document's record before opening a
                    # new one; nothing is closed for the very first header
                    # or when no words were counted.
                    if self.doc_id > 1 and word_total > 0:
                        id_out.write(str(word_total) + '\n')
                        word_total = 0
                    doc_name = header.groups()[0]
                    id_out.write(doc_name + ' ' + str(self.doc_id) + ' ')

                tokens = split_words(text)
                word_total += len(tokens)

                # Resume each search just past the previous hit so repeated
                # words on one line map to successive positions.
                resume_at = 0
                for token in tokens:
                    position = text.find(token, resume_at)
                    resume_at = position + len(token)
                    # Stem reduction
                    term = stem(token).lower()
                    if term in self.stop_word or len(term) == 0:
                        continue
                    self.__add_word_index(term, self.doc_offset + position)
                self.doc_offset += len(text)

            # The last document's count is written without a trailing newline.
            id_out.write(str(word_total))
Пример #2
0
	def handle_data(self):
		"""Index every corpus word and record a word count per document.

		Reads ``self.corpus`` line by line. A line matched by
		``match_docheader`` starts a new document: ``self.doc_id`` is
		incremented, ``self.doc_offset`` is reset to 0 and a record
		``<first header group> <doc id> <word count>`` is begun in
		``self.doc_id_output``. The count is appended when the next header
		(or the end of the corpus) is reached, and it includes the words of
		the header line itself.

		Each word returned by ``split_words`` is stemmed and lower-cased;
		results that are non-empty and not in ``self.stop_word`` are handed
		to ``__add_word_index`` together with their offset measured from the
		start of the current document's header line.
		"""
		with open(self.doc_id_output, 'w+') as id_out, \
			open(self.corpus, 'r') as source:
			word_total = 0
			for text in source:
				header = match_docheader(text)
				if header:
					self.doc_id += 1
					self.doc_offset = 0
					# Close the previous document's record before opening a
					# new one; nothing is closed for the very first header
					# or when no words were counted.
					if self.doc_id > 1 and word_total > 0:
						id_out.write(str(word_total) + '\n')
						word_total = 0
					doc_name = header.groups()[0]
					id_out.write(doc_name + ' ' + str(self.doc_id) + ' ')

				tokens = split_words(text)
				word_total += len(tokens)

				# Resume each search just past the previous hit so repeated
				# words on one line map to successive positions.
				resume_at = 0
				for token in tokens:
					position = text.find(token, resume_at)
					resume_at = position + len(token)
					# Stem reduction
					term = stem(token).lower()
					if term in self.stop_word or len(term) == 0:
						continue
					self.__add_word_index(term, self.doc_offset + position)
				self.doc_offset += len(text)

			# The last document's count is written without a trailing newline.
			id_out.write(str(word_total))
Пример #3
0
def stem_query(query_string):
    """Return the stemmed, lower-cased form of each word in a query.

    Args:
        query_string: Iterable of query words. NOTE(review): iterating a
            plain ``str`` yields single characters, so this presumably
            expects an already-split sequence of words -- confirm against
            the caller.

    Returns:
        list: ``stem(word).lower()`` for every word, in input order, with
        words whose stem is empty left out.
    """
    # filter() would use the stem only as a truth test and hand back the
    # unstemmed words, so the transformation is applied explicitly and the
    # emptiness check is done on the stemmed result.
    stemmed_words = (stem(word).lower() for word in query_string)
    return [term for term in stemmed_words if term]
Пример #4
0
def stem_query(query_string):
    """Return the stemmed, lower-cased form of each word in a query.

    Args:
        query_string: Iterable of query words. NOTE(review): iterating a
            plain ``str`` yields single characters, so this presumably
            expects an already-split sequence of words -- confirm against
            the caller.

    Returns:
        list: ``stem(word).lower()`` for every word, in input order, with
        words whose stem is empty left out.
    """
    # filter() would use the stem only as a truth test and hand back the
    # unstemmed words, so the transformation is applied explicitly and the
    # emptiness check is done on the stemmed result.
    stemmed_words = (stem(word).lower() for word in query_string)
    return [term for term in stemmed_words if term]