def stem_file(filename, vocab, stop_word=None):
    """Tokenize, stem, and count the words of a text file.

    Runs of alphabetic characters are lower-cased, stemmed with the Porter
    algorithm, filtered against an optional stop-word lookup, registered in
    *vocab*, and tallied.

    Args:
        filename: path of the text file to read.
        vocab: object exposing add_token(token); every kept stem is added.
        stop_word: optional lookup exposing get_id_from_token(token) that
            returns -1 for unknown tokens; recognized tokens are skipped.

    Returns:
        dict mapping stemmed word -> number of occurrences.
    """
    p = PorterStemmer()
    word_count = {}

    def _consume(raw):
        # Stem one raw token and fold it into vocab / word_count.
        if not raw:
            return
        stemmed = p.stem(raw, 0, len(raw) - 1)
        if not stemmed:
            return
        if stop_word is not None and stop_word.get_id_from_token(stemmed) != -1:
            return
        vocab.add_token(stemmed)
        word_count[stemmed] = word_count.get(stemmed, 0) + 1

    # 'with' guarantees the handle is closed even if stemming raises
    # (the original leaked the file on any exception).
    with open(filename, 'r') as infile:
        for line in infile:
            word = ''
            for c in line:
                if c.isalpha():
                    word += c.lower()
                else:
                    _consume(word)
                    word = ''
            # Bug fix: flush the trailing token. The original only flushed on a
            # non-alpha character, so a word ending exactly at EOF (file with no
            # final newline) was silently dropped.
            _consume(word)
    return word_count
def stem_file(filename, vocab, stop_word=None): p = PorterStemmer() article = [] infile = open(filename, 'r') while 1: word = '' line = infile.readline() if line == '': break for c in line: if c.isalpha(): word += c.lower() else: if word: stemmed_word = p.stem(word, 0, len(word) - 1) word = '' if stemmed_word is None or len(stemmed_word) == 0: continue if stop_word is not None and stop_word.get_id_from_token( stemmed_word) != -1: continue vocab.add_token(stemmed_word) article.append(stemmed_word) infile.close() # random.shuffle(article) return article
class queryIndex:
    """Answer search queries against an on-disk inverted index.

    Supports four query kinds (see determine_query):
      OWQ - one-word query
      FTQ - free-text (multi-word) query, results OR-merged
      BQ  - boolean query containing AND/OR operators
      PQ  - phrase query, written between double quotes

    NOTE(review): the truthiness test on filter(...) in ftq() only works when
    filter returns a list, i.e. under Python 2 — confirm target version.
    """

    def __init__(self, arg_stop_words, inverted_index, postingIndexFile):
        # arg_stop_words: iterable of stop-word lines (trailing '\n' allowed).
        # inverted_index: dict term -> "offset length" locating its postings.
        # postingIndexFile: seekable file handle holding the posting lists.
        self.stop_words = self.stop_words_set(arg_stop_words)
        self.inverted_index = inverted_index
        self.p = PorterStemmer()
        self.postingIndexFile = postingIndexFile

    #build stop_words_set
    def stop_words_set(self, arg_stop_words):
        """Return the set of stop words, one per input line, newline stripped."""
        stop_words_set = set()
        for line in arg_stop_words:
            line = line.rstrip('\n')
            stop_words_set.add(line)
        return stop_words_set

    def determine_query(self, query):
        """Classify *query* as 'PQ', 'OWQ', 'BQ' or 'FTQ' (see class docstring)."""
        # Quotes take precedence: a quoted string is a phrase query.
        if query[0] == '"' and query[-1] == '"':
            return 'PQ'
        word_list = query.split()
        if len(word_list) == 1:
            return 'OWQ'
        elif 'OR' in word_list or 'AND' in word_list:
            return 'BQ'
        else:
            return 'FTQ'

    def owq(self, query):
        """Normalize and stem a one-word query.

        Returns a one-element list with the stem, or [] if it is a stop word.
        """
        query = query.lower()
        tokened_query = ''
        i = 0
        # Keep only [a-z0-9]; everything else is dropped from the token.
        for i in range(len(query)):
            if query[i] >= 'a' and query[i] <= 'z' or query[
                    i] >= '0' and query[i] <= '9':
                tokened_query += query[i]
        stemed_query = self.p.stem(tokened_query, 0, len(tokened_query) - 1)
        if stemed_query in self.stop_words:
            return []
        return [stemed_query]

    def ftq(self, query):
        """Normalize, stop-word-filter and stem a free-text query.

        Returns the list of stemmed tokens (possibly empty).
        """
        query = query.lower()
        tokened_query = ''
        flag = False  # True while we are inside a run of separator characters
        i = 0
        for i in range(len(query)):
            if query[i] >= 'a' and query[i] <= 'z' or query[
                    i] >= '0' and query[i] <= '9':
                tokened_query += query[i]
                flag = False
            else:
                # Collapse each run of non-alphanumerics to a single space.
                if not flag:
                    tokened_query += ' '
                    flag = True
        tokened_query = tokened_query.rstrip()
        query_list = tokened_query.split(' ')
        query_list = filter(lambda token: token not in self.stop_words,
                            query_list)
        # NOTE(review): assumes Python 2 filter() (a list); a Python 3 filter
        # object is always truthy, so this emptiness check would never fire.
        if not query_list:
            return []
        stemmed_list = []
        for word in query_list:
            stemmed_list.append(self.p.stem(word, 0, len(word) - 1))
        return stemmed_list

    def bq(self, bool_tuples):
        """Recursively normalize a boolean-query AST.

        *bool_tuples* is (operator_string, operand_list); operands are either
        nested tuples or plain query strings. Each string operand is run
        through ftq() and re-joined with spaces.

        Returns (operator_token_list, parsed_operand_list).
        """
        parsed_list = []
        for word in bool_tuples[1]:
            if isinstance(word, tuple):
                # Nested sub-expression: recurse.
                parsed_list.append(self.bq(word))
            else:
                query_list = self.ftq(word)
                if query_list:
                    token = ' '.join(query_list)
                    parsed_list.append(token)
        parsed_tuple = (bool_tuples[0].split(), parsed_list)
        return parsed_tuple

    def parse_query(self, query):
        """Classify *query* and return (query_type, normalized_content)."""
        query_type = self.determine_query(query)
        if query_type == 'BQ':
            # bool_expr_ast is provided elsewhere in the project.
            res = bool_expr_ast(query)
            return query_type, self.bq(res)
        elif query_type == 'OWQ':
            return query_type, self.owq(query)
        elif query_type == 'FTQ':
            return query_type, self.ftq(query)
        else:
            # Phrase query: strip the surrounding quotes, then treat the
            # inside like free text (order is restored in matching_documents).
            query = query[1:-1]
            return query_type, self.ftq(query)

    def merge_result_and(self, result1, result2):
        """Return the intersection of two doc-id lists (order not preserved)."""
        result = set(result1).intersection(set(result2))
        return list(result)

    def merge_result_or(self, result1, result2):
        """Return the union of two doc-id lists (order not preserved)."""
        result = set(result1).union(set(result2))
        return list(result)

    def match_bq_query(self, query_content):
        """Evaluate a parsed boolean query; return the matching doc-id list.

        *query_content* is (operator_token_list, operand_list) as produced by
        bq(); nested tuples are evaluated recursively.
        """
        doc_list = []
        for item in query_content[1]:
            if isinstance(item, tuple):
                doc_list = doc_list + [self.match_bq_query(item)]
            else:
                if item not in self.inverted_index:
                    value_list = []
                else:
                    value_list = self.getPostingList(item)
                doc_list = doc_list + [value_list]
        # Fold the operand result lists together under the single operator.
        index = 1
        result_list = doc_list[0]
        while index < len(doc_list):
            if query_content[0] == ['AND']:
                result_list = self.merge_result_and(result_list,
                                                    doc_list[index])
            else:
                result_list = self.merge_result_or(result_list,
                                                   doc_list[index])
            index += 1
        return result_list

    def getPostingList(self, term, query_type=None):
        """Read *term*'s posting list from the posting file.

        The inverted index stores "offset length" into postingIndexFile; the
        record read there is "docids | positions", where position groups are
        ';'-separated and space-delimited.

        Returns the doc-id list, or [doc_ids, positions] when *query_type*
        is given (used for phrase queries).
        """
        offset_len = self.inverted_index[term]
        offset_len_list = offset_len.split(' ')
        self.postingIndexFile.seek(int(offset_len_list[0]))
        postingIndex = self.postingIndexFile.read(int(offset_len_list[1]))
        postingIndex = postingIndex.rstrip('\n')
        posting_pos = postingIndex.split('|')
        posting = posting_pos[0].rstrip(' ')
        posting = posting.split(' ')
        if query_type == None:
            return posting
        else:
            pos = posting_pos[1].split(';')
            pos_list = []
            for item in pos:
                if item:  # skip the empty string left by a trailing ';'
                    pos_list += [item.split(' ')]
            return [posting] + [pos_list]

    def matching_documents(self, query):
        """Answer *query*; return the list of matching document ids."""
        parsed_query = self.parse_query(query)
        query_type = parsed_query[0]
        query_content = parsed_query[1]
        if query_type == 'BQ':
            return self.match_bq_query(query_content)
        elif query_type == 'OWQ':
            if query_content[0] not in self.inverted_index:
                return []
            res_list = self.getPostingList(query_content[0])
            return res_list
        elif query_type == 'FTQ':
            # OR-merge the posting lists of every query term.
            result_list = []
            for word in query_content:
                if word not in self.inverted_index:
                    temp_list = []
                else:
                    temp_list = self.getPostingList(word)
                result_list = self.merge_result_or(result_list, temp_list)
            return result_list
        else:
            # Phrase query: intersect posting lists pairwise, keeping only
            # documents where term k+1 occurs at position(term k) + 1.
            if query_content[0] not in self.inverted_index:
                page_list_1 = []
                pos_list_1 = []
            else:
                first_word_list = self.getPostingList(query_content[0], 'PQ')
                page_list_1 = first_word_list[0]
                pos_list_1 = first_word_list[1]
            index = 1
            while index in range(len(query_content)):
                page_list = []
                pos_list = []
                if query_content[index] in self.inverted_index:
                    item_list = self.getPostingList(query_content[index], 'PQ')
                    page_list_2 = item_list[0]
                    pos_list_2 = item_list[1]
                    # Classic sorted-list merge over the two doc-id lists.
                    i = 0
                    j = 0
                    while i < len(page_list_1) and j < len(page_list_2):
                        if int(page_list_1[i]) == int(page_list_2[j]):
                            # Same document: look for adjacent positions.
                            ii = 0
                            jj = 0
                            temp_list = []
                            while ii < len(pos_list_1[i]) and jj < len(
                                    pos_list_2[j]):
                                if int(pos_list_1[i][ii]) + 1 == int(
                                        pos_list_2[j][jj]):
                                    temp_list.append(int(pos_list_2[j][jj]))
                                    ii += 1
                                    jj += 1
                                elif int(pos_list_1[i][ii]) + 1 < int(
                                        pos_list_2[j][jj]):
                                    ii += 1
                                else:
                                    jj += 1
                            if temp_list:
                                pos_list += [temp_list]
                                page_list.append(page_list_2[j])
                            i += 1
                            j += 1
                        elif int(page_list_1[i]) < int(page_list_2[j]):
                            i += 1
                        else:
                            j += 1
                index += 1
                # The surviving docs/positions seed the next term's merge.
                page_list_1 = page_list
                pos_list_1 = pos_list
            return page_list_1
def __init__(self, arg_stop_words, inverted_index, postingIndexFile):
    """Wire up the query engine's collaborators.

    Args:
        arg_stop_words: iterable of stop-word lines, folded into a set.
        inverted_index: term -> posting-file location mapping.
        postingIndexFile: open, seekable handle over the posting data.
    """
    # Plain references first, derived state afterwards.
    self.inverted_index = inverted_index
    self.postingIndexFile = postingIndexFile
    self.p = PorterStemmer()
    self.stop_words = self.stop_words_set(arg_stop_words)
class queryIndex:
    """Answer search queries against an on-disk inverted index, ranked.

    This variant carries a score alongside each document id (presumably a
    tf-idf score, per the variable names) and returns results sorted by
    descending score. Query kinds (see determine_query):
      OWQ - one-word query
      FTQ - free-text query, results OR-merged
      BQ  - boolean query containing AND/OR operators
      PQ  - phrase query, written between double quotes

    NOTE(review): this class relies on Python 2 semantics — map()/zip()
    returning lists (len(), indexing and list-concatenation are applied to
    their results) and the truthiness test on filter(...) in ftq(). Confirm
    the target interpreter before porting.
    """

    def __init__(self, arg_stop_words, inverted_index, postingIndexFile):
        # arg_stop_words: iterable of stop-word lines (trailing '\n' allowed).
        # inverted_index: dict term -> "offset length" locating its postings.
        # postingIndexFile: seekable file handle holding the posting lists.
        self.stop_words = self.stop_words_set(arg_stop_words)
        self.inverted_index = inverted_index
        self.p = PorterStemmer()
        self.postingIndexFile = postingIndexFile

    #build stop_words_set
    def stop_words_set(self, arg_stop_words):
        """Return the set of stop words, one per input line, newline stripped."""
        stop_words_set = set()
        for line in arg_stop_words:
            line = line.rstrip('\n')
            stop_words_set.add(line)
        return stop_words_set

    def determine_query(self, query):
        """Classify *query* as 'PQ', 'OWQ', 'BQ' or 'FTQ' (see class docstring)."""
        # Quotes take precedence: a quoted string is a phrase query.
        if query[0] == '"' and query[-1] == '"':
            return 'PQ'
        word_list = query.split()
        if len(word_list) == 1:
            return 'OWQ'
        elif 'OR' in word_list or 'AND' in word_list:
            return 'BQ'
        else:
            return 'FTQ'

    def owq(self, query):
        """Normalize and stem a one-word query.

        Returns a one-element list with the stem, or [] if it is a stop word.
        """
        query = query.lower()
        tokened_query = ''
        i = 0
        # Keep only [a-z0-9]; everything else is dropped from the token.
        for i in range(len(query)):
            if query[i] >= 'a' and query[i] <= 'z' or query[i] >= '0' and query[i] <= '9':
                tokened_query += query[i]
        stemed_query = self.p.stem(tokened_query, 0, len(tokened_query) - 1)
        if stemed_query in self.stop_words:
            return []
        return [stemed_query]

    def ftq(self, query):
        """Normalize, stop-word-filter and stem a free-text query.

        Returns the list of stemmed tokens (possibly empty).
        """
        query = query.lower()
        tokened_query = ''
        flag = False  # True while we are inside a run of separator characters
        i = 0
        for i in range(len(query)):
            if query[i] >= 'a' and query[i] <= 'z' or query[i] >= '0' and query[i] <= '9':
                tokened_query += query[i]
                flag = False
            else:
                # Collapse each run of non-alphanumerics to a single space.
                if not flag:
                    tokened_query += ' '
                    flag = True
        tokened_query = tokened_query.rstrip()
        query_list = tokened_query.split(' ')
        query_list = filter(lambda token: token not in self.stop_words,
                            query_list)
        # NOTE(review): assumes Python 2 filter() (a list); a Python 3 filter
        # object is always truthy, so this emptiness check would never fire.
        if not query_list:
            return []
        stemmed_list = []
        for word in query_list:
            stemmed_list.append(self.p.stem(word, 0, len(word) - 1))
        return stemmed_list

    def bq(self, bool_tuples):
        """Recursively normalize a boolean-query AST.

        *bool_tuples* is (operator_string, operand_list); operands are either
        nested tuples or plain query strings. Each string operand is run
        through ftq() and re-joined with spaces.

        Returns (operator_token_list, parsed_operand_list).
        """
        parsed_list = []
        for word in bool_tuples[1]:
            if isinstance(word, tuple):
                # Nested sub-expression: recurse.
                parsed_list.append(self.bq(word))
            else:
                query_list = self.ftq(word)
                if query_list:
                    token = ' '.join(query_list)
                    parsed_list.append(token)
        parsed_tuple = (bool_tuples[0].split(), parsed_list)
        return parsed_tuple

    def parse_query(self, query):
        """Classify *query* and return (query_type, normalized_content)."""
        query_type = self.determine_query(query)
        if query_type == 'BQ':
            # bool_expr_ast is provided elsewhere in the project.
            res = bool_expr_ast(query)
            return query_type, self.bq(res)
        elif query_type == 'OWQ':
            return query_type, self.owq(query)
        elif query_type == 'FTQ':
            return query_type, self.ftq(query)
        else:
            # Phrase query: strip the surrounding quotes, then treat the
            # inside like free text (order is restored in matching_documents).
            query = query[1:-1]
            return query_type, self.ftq(query)

    def merge_result_and(self, result1, result2):
        """Intersect two sorted (doc_id, score) lists, summing scores.

        Both inputs must be sorted by doc_id; only docs present in both
        survive, with their scores added.
        """
        i = 0
        j = 0
        result = []
        while i < len(result1) and j < len(result2):
            if result1[i][0] < result2[j][0]:
                i += 1
            elif result1[i][0] > result2[j][0]:
                j += 1
            else:
                # Same document on both sides: keep it, combine the scores.
                result = result + [(result1[i][0],
                                    result1[i][1] + result2[j][1])]
                i += 1
                j += 1
        return result

    def merge_result_or(self, result1, result2):
        """Union two sorted (doc_id, score) lists, summing scores on overlap.

        Both inputs must be sorted by doc_id; the output preserves that order.
        """
        i = 0
        j = 0
        result = []
        while i < len(result1) and j < len(result2):
            if result1[i][0] < result2[j][0]:
                result = result + [result1[i]]
                i += 1
            elif result1[i][0] > result2[j][0]:
                result = result + [result2[j]]
                j += 1
            else:
                # Same document on both sides: emit once with summed score.
                result = result + [(result1[i][0],
                                    result1[i][1] + result2[j][1])]
                i += 1
                j += 1
        # Drain whichever side still has entries.
        while i < len(result1):
            result = result + [result1[i]]
            i += 1
        while j < len(result2):
            result = result + [result2[j]]
            j += 1
        return result

    def match_bq_query(self, query_content):
        """Evaluate a parsed boolean query; return (doc_id, score) tuples.

        *query_content* is (operator_token_list, operand_list) as produced by
        bq(); nested tuples are evaluated recursively.
        """
        doc_list = []
        for item in query_content[1]:
            if isinstance(item, tuple):
                doc_list = doc_list + [self.match_bq_query(item)]
            else:
                if item not in self.inverted_index:
                    value_list = []
                else:
                    value_list = self.getPostingList(item)
                doc_list = doc_list + [value_list]
        # Fold the operand result lists together under the single operator.
        index = 1
        result_list = doc_list[0]
        while index < len(doc_list):
            if query_content[0] == ['AND']:
                result_list = self.merge_result_and(result_list,
                                                    doc_list[index])
            else:
                result_list = self.merge_result_or(result_list,
                                                   doc_list[index])
            index += 1
        return result_list

    def getPostingList(self, term, query_type=None):
        """Read *term*'s posting record from the posting file.

        The inverted index stores "offset length" into postingIndexFile; the
        record read there is "docids | positions | scores", where position
        groups are ';'-separated and space-delimited.

        Returns a list of (doc_id, score) tuples, or
        [doc_ids, positions, scores] when *query_type* is given (used for
        phrase queries).
        """
        offset_len = self.inverted_index[term]
        offset_len_list = offset_len.split(' ')
        self.postingIndexFile.seek(int(offset_len_list[0]))
        postingIndex = self.postingIndexFile.read(int(offset_len_list[1]))
        postingIndex = postingIndex.rstrip('\n')
        postingIndex = postingIndex.rstrip(' ')
        posting_pos = postingIndex.split('|')
        posting = posting_pos[0].rstrip(' ')
        posting = posting.split(' ')
        # NOTE(review): map()/zip() results are indexed and len()'d below —
        # Python 2 list semantics assumed.
        posting = map(int, posting)
        tfidf_list = posting_pos[2].split(' ')
        tfidf_list = map(float, tfidf_list)
        res_tfidf_list = zip(posting, tfidf_list)
        if query_type == None:
            return res_tfidf_list
        else:
            pos = posting_pos[1].split(';')
            pos_list = []
            for item in pos:
                if item:  # skip the empty string left by a trailing ';'
                    pos_list += [item.split(' ')]
            return [posting] + [pos_list] + [tfidf_list]

    def matching_documents(self, query):
        """Answer *query*; return (doc_id_tuple, score_tuple) ranked by score.

        Returns [] when nothing matches.
        """
        parsed_query = self.parse_query(query)
        query_type = parsed_query[0]
        query_content = parsed_query[1]
        if query_type == 'BQ':
            result_tuple = self.match_bq_query(query_content)
            if not result_tuple:
                return []
            # Rank by score, highest first.
            sorted_list = sorted(result_tuple, key=lambda tup: tup[1],
                                 reverse=True)
            res_list, tfidf_list = zip(*sorted_list)
            return res_list, tfidf_list
        elif query_type == 'OWQ':
            if query_content[0] not in self.inverted_index:
                return []
            res_tfidf_list = self.getPostingList(query_content[0])
            sorted_list = sorted(res_tfidf_list, key=lambda tup: tup[1],
                                 reverse=True)
            res_list, tfidf_list = zip(*sorted_list)
            return res_list, tfidf_list
        elif query_type == 'FTQ':
            # OR-merge the scored posting lists of every query term.
            result_tuple = []
            for word in query_content:
                if word not in self.inverted_index:
                    temp_tuple = []
                else:
                    temp_tuple = self.getPostingList(word)
                result_tuple = self.merge_result_or(result_tuple, temp_tuple)
            sorted_list = sorted(result_tuple, key=lambda tup: tup[1],
                                 reverse=True)
            res_list, tfidf_list = zip(*sorted_list)
            return res_list, tfidf_list
        else:
            # Phrase query: intersect posting lists pairwise, keeping only
            # documents where term k+1 occurs at position(term k) + 1, and
            # accumulating scores along the way.
            if query_content[0] not in self.inverted_index:
                page_list_1 = []
                pos_list_1 = []
            else:
                first_word_list = self.getPostingList(query_content[0], 'PQ')
                page_list_1 = first_word_list[0]
                pos_list_1 = first_word_list[1]
                score_list_1 = first_word_list[2]
            index = 1
            while index in range(len(query_content)):
                page_list = []
                pos_list = []
                score_list = []
                if query_content[index] in self.inverted_index:
                    item_list = self.getPostingList(query_content[index],
                                                    'PQ')
                    page_list_2 = item_list[0]
                    pos_list_2 = item_list[1]
                    score_list_2 = item_list[2]
                    # Classic sorted-list merge over the two doc-id lists.
                    i = 0
                    j = 0
                    while i < len(page_list_1) and j < len(page_list_2):
                        if int(page_list_1[i]) == int(page_list_2[j]):
                            # Same document: look for adjacent positions.
                            ii = 0
                            jj = 0
                            temp_list = []
                            while ii < len(pos_list_1[i]) and jj < len(
                                    pos_list_2[j]):
                                if int(pos_list_1[i][ii]) + 1 == int(
                                        pos_list_2[j][jj]):
                                    temp_list.append(int(pos_list_2[j][jj]))
                                    ii += 1
                                    jj += 1
                                elif int(pos_list_1[i][ii]) + 1 < int(
                                        pos_list_2[j][jj]):
                                    ii += 1
                                else:
                                    jj += 1
                            if temp_list:
                                pos_list += [temp_list]
                                page_list.append(page_list_2[j])
                                score_list.append(score_list_1[i] +
                                                  score_list_2[j])
                            i += 1
                            j += 1
                        elif int(page_list_1[i]) < int(page_list_2[j]):
                            i += 1
                        else:
                            j += 1
                index += 1
                # The surviving docs/positions/scores seed the next merge.
                page_list_1 = page_list
                pos_list_1 = pos_list
                score_list_1 = score_list
            if not page_list:
                return []
            res_tfidf_list = zip(page_list, score_list)
            sorted_list = sorted(res_tfidf_list, key=lambda tup: tup[1],
                                 reverse=True)
            res_list, tfidf_list = zip(*sorted_list)
            return res_list, tfidf_list
def test_word_stemming(self):
    """The Porter stemmer maps 'stem', 'stemmed' and 'stemming' to 'stem'."""
    stemmer = PorterStemmer()
    # Identity case plus two inflected variants — all collapse to 'stem'.
    for variant in ('stem', 'stemmed', 'stemming'):
        self.assertEqual('stem', stemmer.stem_word(variant))
def parse_collection(self, arg_collection):
    """Parse an XML-like page collection and build the inverted index.

    *arg_collection* is an iterable of lines containing <page>, <id>,
    <title> and <text> tags. For each page the title and text are
    collected, lower-cased, reduced to [a-z0-9] runs, stop-word filtered
    and Porter-stemmed; the result feeds self.build_inverted_index.
    Side effects: fills self.title_index and sets self.inverted_index.

    NOTE(review): the truthiness/iteration behaviour of filter(...) below
    assumes Python 2 (list-returning filter) — confirm target version.
    """
    pageID = -1
    title = ''
    text = ''
    id_page_dict = {}  # pageID -> "title\ntext" raw page content
    #classify tags
    for line in arg_collection:
        split = []
        if line[0] == '<':
            # Split "<tag>rest" into the tag name and the remainder.
            split = line.split('>', 1)
        if len(split) != 0:
            head = split[0][1:]
            if head == 'page':
                text = ''
            elif head == 'id':
                id_str = split[1].split('<')
                id_str = id_str[0]
                pageID = int(id_str)
            elif head == 'text':
                # [-8:-1] checks for a closing '</text>' before the newline.
                if split[1][-8:-1] == '</text>':
                    text += split[1][:-8]
                else:
                    text += split[1][:-1]
                    # Multi-line text body: consume lines from the same
                    # iterator until the closing tag appears.
                    for line in arg_collection:
                        if line[-8:-1] == '</text>':
                            text += (' ' + line[:-8])
                            break
                        else:
                            text += ' ' + line[:-1]
                title_text = title + '\n' + text
                id_page_dict[pageID] = title_text
            elif head == 'title':
                title_list = split[1].split('<')
                title = title_list[0]
                self.title_index[pageID] = title
    #lower cases
    for key, value in id_page_dict.items():
        temp = value.lower()
        value = ''
        flag = False  # True while inside a run of separator characters
        for c in temp:
            if c >= 'a' and c <= 'z' or c >= '0' and c <= '9':
                value += c
                flag = False
            else:
                # Collapse each run of non-alphanumerics to one space.
                if not flag:
                    value += ' '
                    flag = True
        value = value.strip(' ')
        #filter out stop words and porter stemmer
        p = PorterStemmer()
        value_list = value.split(' ')
        value_list = filter(lambda token: token not in self.stop_words,
                            value_list)
        new_value = []
        for s in value_list:
            new_value.append(p.stem(s, 0, len(s) - 1))
        value = ' '.join(new_value)
        id_page_dict[key] = value
    #build inverted index
    self.inverted_index = self.build_inverted_index(id_page_dict)
def parse_collection(self, arg_collection):
    """Parse an XML-like page collection; build the index and tf statistics.

    *arg_collection* is an iterable of lines containing <page>, <id>,
    <title> and <text> tags. For each page the title and text are
    collected, lower-cased, reduced to [a-z0-9] runs, stop-word filtered
    and Porter-stemmed. Side effects: fills self.title_index, sets
    self.doc_no and self.inverted_index, and calls self.compute_tf on the
    normalized pages.

    NOTE(review): the truthiness/iteration behaviour of filter(...) below
    assumes Python 2 (list-returning filter) — confirm target version.
    """
    pageID = -1
    title = ''
    text = ''
    id_page_dict = {}  # pageID -> "title\ntext" raw page content
    #classify tags
    for line in arg_collection:
        split = []
        if line[0] == '<':
            # Split "<tag>rest" into the tag name and the remainder.
            split = line.split('>', 1)
        if len(split) != 0:
            head = split[0][1:]
            if head == 'page':
                text = ''
            elif head == 'id':
                id_str = split[1].split('<')
                id_str = id_str[0]
                pageID = int(id_str)
            elif head == 'text':
                # [-8:-1] checks for a closing '</text>' before the newline.
                if split[1][-8:-1] == '</text>':
                    text += split[1][:-8]
                else:
                    text += split[1][:-1]
                    # Multi-line text body: consume lines from the same
                    # iterator until the closing tag appears.
                    for line in arg_collection:
                        if line[-8:-1] == '</text>':
                            text += (' ' + line[:-8])
                            break
                        else:
                            text += ' ' + line[:-1]
                title_text = title + '\n' + text
                id_page_dict[pageID] = title_text
            elif head == 'title':
                title_list = split[1].split('<')
                title = title_list[0]
                self.title_index[pageID] = title
    #lower cases
    for key, value in id_page_dict.items():
        temp = value.lower()
        value = ''
        flag = False  # True while inside a run of separator characters
        for c in temp:
            if c >= 'a' and c <= 'z' or c >= '0' and c <= '9':
                value += c
                flag = False
            else:
                # Collapse each run of non-alphanumerics to one space.
                if not flag:
                    value += ' '
                    flag = True
        value = value.strip(' ')
        #filter out stop words and porter stemmer
        p = PorterStemmer()
        value_list = value.split(' ')
        value_list = filter(lambda token: token not in self.stop_words,
                            value_list)
        new_value = []
        for s in value_list:
            new_value.append(p.stem(s, 0, len(s) - 1))
        value = ' '.join(new_value)
        id_page_dict[key] = value
    #build inverted index
    self.doc_no = len(id_page_dict)
    self.inverted_index = self.build_inverted_index(id_page_dict)
    self.compute_tf(id_page_dict)