def __init__(self, arg_stop_words, inverted_index, postingIndexFile):
    """Initialise the indexer's state.

    Args:
        arg_stop_words: Stop-word source; handed unchanged to
            ``self.stop_words_set`` and the result is kept as
            ``self.stop_words``.
        inverted_index: Initial inverted index, stored as given.
        postingIndexFile: Stored as given on ``self.postingIndexFile``
            (presumably the posting-index file or its path -- confirm
            against callers).
    """
    # Derive the stop-word collection first; the remaining attributes
    # are plain assignments.
    prepared_stop_words = self.stop_words_set(arg_stop_words)
    self.stop_words = prepared_stop_words

    self.inverted_index = inverted_index

    # One stemmer instance kept on the object.
    stemmer = PorterStemmer()
    self.p = stemmer

    self.postingIndexFile = postingIndexFile
def parse_collection(self, arg_collection):
    """Parse a tagged page collection and rebuild the inverted index.

    The input is consumed line by line.  Only lines whose first character
    is ``<`` are treated as tags, and only these tag names are recognised
    (the name is everything between the leading ``<`` and the first ``>``):

    * ``page``  -- resets the accumulated text.
    * ``id``    -- the integer before the next ``<`` becomes the current
      page id.
    * ``title`` -- the text before the next ``<`` becomes the current
      title and is stored in ``self.title_index`` under the current page
      id.  NOTE: the id used is whichever one was parsed most recently,
      so the ``<id>`` line must come before the ``<title>`` line for the
      mapping to be correct.
    * ``text``  -- the page body; it may end on the same line or continue
      over following lines until one ends with ``</text>``.  Continuation
      lines are joined with single spaces.

    Each page's ``title + '\\n' + text`` is then lower-cased, every run of
    characters outside ``a-z``/``0-9`` is collapsed to one space, stop
    words (``self.stop_words``) are dropped and the remaining tokens are
    stemmed with ``self.p``.  The resulting ``{page_id: "tok tok ..."}``
    mapping is passed to ``self.build_inverted_index`` and the result is
    stored in ``self.inverted_index``.

    Args:
        arg_collection: Any iterable of text lines (an open file, a list,
            a generator).  A trailing newline on each line is optional.

    Raises:
        ValueError: If the content of an ``<id>`` line is not an integer.
    """
    closing_tag = '</text>'

    def strip_eol(raw_line):
        # Remove at most one trailing newline so the last line of the
        # input (which may lack one) is handled like every other line.
        return raw_line[:-1] if raw_line.endswith('\n') else raw_line

    # A single shared iterator: the inner loop that gathers a multi-line
    # <text> body must continue where the outer loop stopped.  Iterating
    # a list a second time would restart from its first element.
    lines = iter(arg_collection)

    pageID = -1
    title = ''
    text = ''
    id_page_dict = {}

    # --- classify tags -------------------------------------------------
    for line in lines:
        if not line.startswith('<'):
            continue
        head, sep, rest = line[1:].partition('>')
        if not sep:
            # '<' without a closing '>' is not a usable tag line.
            continue

        if head == 'page':
            text = ''
        elif head == 'id':
            pageID = int(rest.partition('<')[0])
        elif head == 'title':
            title = rest.partition('<')[0]
            self.title_index[pageID] = title
        elif head == 'text':
            first = strip_eol(rest)
            if first.endswith(closing_tag):
                chunks = [first[:-len(closing_tag)]]
            else:
                chunks = [first]
                for continuation in lines:
                    continuation = strip_eol(continuation)
                    if continuation.endswith(closing_tag):
                        chunks.append(continuation[:-len(closing_tag)])
                        break
                    chunks.append(continuation)
            text += ' '.join(chunks)
            id_page_dict[pageID] = title + '\n' + text

    # --- normalise, drop stop words, stem ------------------------------
    stop_words = self.stop_words
    stemmer = self.p  # reuse the instance stemmer instead of one per page
    for page_id, raw in list(id_page_dict.items()):
        lowered = raw.lower()
        # Map every non [a-z0-9] character to a space; split() then
        # collapses runs of spaces and never yields empty tokens.
        spaced = ''.join(
            ch if ('a' <= ch <= 'z' or '0' <= ch <= '9') else ' '
            for ch in lowered
        )
        stems = [
            stemmer.stem(token, 0, len(token) - 1)
            for token in spaced.split()
            if token not in stop_words
        ]
        id_page_dict[page_id] = ' '.join(stems)

    # --- build inverted index ------------------------------------------
    self.inverted_index = self.build_inverted_index(id_page_dict)