def load_dict(path):
    format = ">2I"
    try:
        d = RecordDAWG(format)
        d.load(path)
        return d
    except Exception, e:
        print "load dict error:", e.message
        return None

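
# Hedged usage sketch (not part of the original source): with the ">2I"
# record format, looking up a key in a loaded RecordDAWG returns a list of
# 2-tuples of unsigned ints, one tuple per record stored under that key.
# The helper below only relies on load_dict() defined above; path and word
# are placeholders.
def lookup_example(path, word):
    d = load_dict(path)
    if d is None or word not in d:
        return []
    # e.g. [(312216, 65)] -- one (value1, value2) tuple per stored record
    return d[word]
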
def __init__(self, dict_path):
    reload(sys)
    sys.setdefaultencoding('utf8')
    self.dict_path = dict_path
    self.format = ">2I"
    try:
        self.dict = RecordDAWG(self.format)
        self.dict.load(dict_path)
    except Exception, e:
        print "load dict error:", dict_path, e.message

def build_dict(path, path_build):
    format = ">2I"
    keys = []
    values = []
    file_handler = open(path, 'rb')
    for line in file_handler:
        line = line.strip('\r\n')
        arr = line.split('\t')
        try:
            if len(arr) == 3:
                keys.append(arr[0].decode("utf-8"))
                values.append([int(arr[1]), int(arr[2])])
            else:
                continue
        except:
            continue
    data = zip(keys, values)
    record = RecordDAWG(format, data)
    with open(path_build, 'wb') as f:
        record.write(f)

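
# Hedged round-trip sketch (not part of the original source; assumes a small
# tab-separated input file with lines of the form "key<TAB>int<TAB>int" and
# the build_dict()/load_dict() helpers above): build a ">2I" RecordDAWG,
# reload it, and confirm every key resolves to at least one stored record.
def build_and_check(src_path, dawg_path):
    build_dict(src_path, dawg_path)   # write key -> (int, int) records
    d = load_dict(dawg_path)          # read the saved DAWG back
    if d is None:
        return False
    for key in d.keys():
        assert len(d[key]) >= 1
    return True
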
        nameIdMap[short_name] = []
        # print short_name, region_id, region_id_online
        nameIdMap[short_name].append([region_id, region_id_online])
    return nameIdMap


if __name__ == '__main__':
    # "654226", "和布克赛尔蒙古自治县", "和布克赛尔县", "0", "312216", "65",
    # "amqp://*****:*****@120.27.247.47:5672/%2F"
    build_file = 'region_all.dawg'
    # generate dict
    format = ">2I"
    keys = []
    values = []
    nameIdMap = get_region_list()
    for k in nameIdMap.keys():
        t = nameIdMap[k]
        for v in t:
            if v[0] and v[1]:
                keys.append(k)
                values.append([int(v[0]), int(v[1])])
    print len(values), len(keys)
    for x in range(0, len(values)):
        print keys[x], values[x]
    data = zip(keys, values)
    record = RecordDAWG(format, data)
    with open(build_file, 'wb') as f:
        record.write(f)

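
# Hedged usage sketch (not part of the original source): get_region_list()
# may map one short_name to several [region_id, region_id_online] pairs, and
# RecordDAWG keeps all of them -- indexing returns every record stored under
# the key. 'region_all.dawg' is the file written by the block above.
def resolve_region(short_name, dawg_path='region_all.dawg'):
    d = RecordDAWG(">2I")
    d.load(dawg_path)
    # one (region_id, region_id_online) tuple per record saved for the name
    return d[short_name] if short_name in d else []
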
class IndexCreator():
    def __init__(self, directory):
        self.directory = directory
        assert (os.path.isfile(f'{self.directory}/comments.csv'))
        sys.setrecursionlimit(10000)
        self.report = Report(quiet_mode=False)

    def create_index(self):
        # read csv to create comment_list
        with self.report.measure('processing comments.csv'):
            number_of_processes = min(os.cpu_count(), 2)
            print(f'starting {number_of_processes} processes')
            csv_size = os.stat(f'{self.directory}/comments.csv').st_size
            with multiprocessing.Pool(processes=number_of_processes) as pool:
                offsets = [0]
                with open(f'{self.directory}/comments.csv', mode='rb') as f:
                    for i in range(1, number_of_processes + 1):
                        f.seek(int(i * csv_size / number_of_processes))
                        f.readline()
                        next_offset = f.tell()
                        if next_offset != offsets[-1]:
                            offsets.append(next_offset)

                def on_error(exception):
                    raise exception

                for start_offset, end_offset in zip(offsets, offsets[1:]):
                    pool.apply_async(process_comments_file,
                                     args=(self.directory, start_offset,
                                           end_offset),
                                     error_callback=on_error)
                pool.close()
                pool.join()

            self.partial_index_names = []
            reply_to_index = {}
            cid_to_offset = {}
            for end_offset in offsets[1:]:
                file_number_path = \
                    f'{self.directory}/{end_offset}_file_number.pickle'
                with open(file_number_path, mode='rb') as f:
                    file_number = pickle.load(f)
                for i in range(file_number):
                    self.partial_index_names.append(
                        f'{self.directory}/{end_offset}_{i}')
                os.remove(file_number_path)

                reply_to_index_part_path = f'{self.directory}/' \
                    f'{end_offset}_reply_to_index.pickle'
                with open(reply_to_index_part_path, mode='rb') as f:
                    reply_to_index_part = pickle.load(f)
                for key, value in reply_to_index_part.items():
                    if key not in reply_to_index.keys():
                        reply_to_index[key] = value
                    else:
                        reply_to_index[key].extend(value)
                os.remove(reply_to_index_part_path)

                cid_to_offset_part_path = f'{self.directory}/' \
                    f'{end_offset}_cid_to_offset.pickle'
                with open(cid_to_offset_part_path, mode='rb') as f:
                    cid_to_offset_part = pickle.load(f)
                cid_to_offset.update(cid_to_offset_part)
                os.remove(cid_to_offset_part_path)

            with open(f'{self.directory}/reply_to_index.pickle',
                      mode='wb') as f:
                pickle.dump(reply_to_index, f, pickle.HIGHEST_PROTOCOL)

            tempa = numpy.array([])
            ret = []
            ret2 = []
            for key in sorted(cid_to_offset.keys()):
                ret.append(numpy.int64(key))
                ret2.append(numpy.int64(cid_to_offset[key]))
            tempa = numpy.array(ret)
            numpy.save(f'{self.directory}/cids.npy', tempa)
            tempa2 = numpy.array(ret2)
            numpy.save(f'{self.directory}/comment_offsets_cid.npy', tempa2)

        # merge indices
        with self.report.measure('merging index'):
            # comment term counts
            self.comment_term_count_dict = {}
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_comment_term_count_dict.pickle'
                with open(file_path, mode='rb') as f:
                    self.comment_term_count_dict.update(pickle.load(f))
                os.remove(file_path)

            tempa = numpy.array([])
            ret = []
            ret2 = []
            for key in sorted(self.comment_term_count_dict.keys()):
                ret.append(numpy.int64(key))
                ret2.append(numpy.int32(self.comment_term_count_dict[key]))
            tempa = numpy.array(ret)
            numpy.save(f'{self.directory}/comment_offsets.npy', tempa)
            tempa2 = numpy.array(ret2)
            numpy.save(f'{self.directory}/comment_term_counts.npy', tempa2)

            # collection term count
            self.collection_term_count = 0
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_collection_term_count.pickle'
                with open(file_path, mode='rb') as f:
                    self.collection_term_count += pickle.load(f)
                os.remove(file_path)
            with open(f'{self.directory}/collection_term_count.pickle',
                      mode='wb') as f:
                pickle.dump(self.collection_term_count,
                            f, pickle.HIGHEST_PROTOCOL)

            # index
            index_files = []
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_index.csv'
                index_files.append(open(file_path, mode='rb'))

            current_terms = []
            current_meta = []
            current_posting_lists = []
            global_active_indices = []
            global_active_file_count = 0
            for file in index_files:
                line = file.readline().decode('utf-8').rstrip('\n').split(
                    posting_list_separator, 2)
                current_terms.append(line[0])
                current_meta.append(int(line[1]))
                current_posting_lists.append(line[2])
                global_active_indices.append(True)
                global_active_file_count += 1

            current_active_indices = []
            current_min_term = None
            self.seek_list = []
            current_offset = 0
            terms_done = 0
            with open(f'{self.directory}/index.csv', mode='wb') as f:
                while global_active_file_count > 0:
                    # find next term to write
                    for key, term in enumerate(current_terms):
                        if not global_active_indices[key]:
                            continue
                        if current_min_term is None or \
                                term < current_min_term:
                            current_active_indices = [key]
                            current_min_term = term
                        elif term == current_min_term:
                            current_active_indices.append(key)

                    # merge all lines containing term
                    if len(current_min_term) <= 128:
                        meta = 0
                        for key in current_active_indices:
                            meta += current_meta[key]
                        line_string = \
                            f'{current_min_term}{posting_list_separator}{meta}'
                        for key in current_active_indices:
                            line_string += f'{posting_list_separator}' \
                                f'{current_posting_lists[key]}'
                        line_string += '\n'
                        line_raw = line_string.encode()
                        f.write(line_raw)

                        term = current_min_term[1:-1].replace('""', '"')
                        self.seek_list.append((term, [current_offset]))
                        current_offset += len(line_raw)

                    # reload lines where necessary
                    for key in current_active_indices:
                        linetest = index_files[key].readline().decode('utf-8')
                        if linetest == '':
                            # end of file
                            global_active_indices[key] = False
                            global_active_file_count -= 1
                            print('one file out, '
                                  f'{global_active_file_count} remaining')
                        else:
                            line = linetest.rstrip('\n').split(
                                posting_list_separator, 2)
                            current_terms[key] = line[0]
                            current_meta[key] = int(line[1])
                            current_posting_lists[key] = line[2]
                    current_min_term = None
                    current_active_indices = []
                    terms_done += 1
                    if terms_done % 100000 == 0:
                        print(f'Merged {terms_done} terms.')

            self.seek_list = RecordDAWG('>Q', self.seek_list)
            self.seek_list.save(f'{self.directory}/seek_list.dawg')

            for f in index_files:
                f.close()
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_index.csv'
                os.remove(file_path)

        self.huffman_compression(generate_encoding=False)

        with self.report.measure('processing authors & articles'):
            with open(f'{self.directory}/authors_list.pickle',
                      mode='wb') as f:
                pickle.dump(
                    create_list_from_csv(f'{self.directory}/authors.csv'),
                    f, pickle.HIGHEST_PROTOCOL)
            with open(f'{self.directory}/articles_list.pickle', mode='wb') \
                    as f:
                pickle.dump(
                    create_list_from_csv(f'{self.directory}/articles.csv'),
                    f, pickle.HIGHEST_PROTOCOL)

    def huffman_compression(self, generate_encoding=False):
        # compress using Huffman encoding
        symbol_to_encoding_dict = {}
        # count all occurring UTF-8 characters
        if generate_encoding:
            symbol_to_frequency_dict = Counter()
            with self.report.measure('counting utf8 characters'):
                with open(f'{self.directory}/index.csv') as index_file:
                    chunk_size = 100000

                    def next_chunk_generator():
                        chunk = index_file.read(chunk_size)
                        while chunk:
                            yield chunk
                            chunk = index_file.read(chunk_size)

                    for i, chunk in enumerate(next_chunk_generator(), 1):
                        symbol_to_frequency_dict.update(Counter(chunk))
                        self.report.progress(
                            i, f' chunks counted ({chunk_size} characters '
                            'each)', 100)
                if '\n' in symbol_to_frequency_dict.keys():
                    del symbol_to_frequency_dict['\n']

            # derive huffman encoding from character counts
            with self.report.measure('deriving huffman encoding'):
                symbol_to_encoding_dict = Huffman.derive_encoding(
                    symbol_to_frequency_dict)
                for key, value in symbol_to_encoding_dict.items():
                    assert (len(key) == 1)
                    symbol_to_encoding_list[ord(key[0])] = value
            with open(f'{self.directory}/symbol_to_encoding_dict.pickle',
                      mode='wb') as f:
                pickle.dump(symbol_to_encoding_dict, f,
                            pickle.HIGHEST_PROTOCOL)
        else:
            # optimal encoding for guardian
            # character distribution should be similar for all datasets
            symbol_to_encoding_dict = {
                '\a': BitArray('1111'),
                ',': BitArray('001'),
                '0': BitArray('1000'),
                '1': BitArray('011'),
                '2': BitArray('010'),
                '3': BitArray('000'),
                '4': BitArray('1110'),
                '5': BitArray('1101'),
                '6': BitArray('1100'),
                '7': BitArray('1011'),
                '8': BitArray('1010'),
                '9': BitArray('1001')
            }
            with open(f'{self.directory}/symbol_to_encoding_dict.pickle',
                      mode='wb') as f:
                pickle.dump(symbol_to_encoding_dict, f,
                            pickle.HIGHEST_PROTOCOL)

        # save compressed index and corresponding seek_list
        with self.report.measure('saving compressed files'):
            self.compressed_seek_list = []
            with open(f'{self.directory}/compressed_index', mode='wb') \
                    as compressed_index_file:
                offset = 0
                for i, orig_line in enumerate(
                        binary_read_line_generator_path(
                            f'{self.directory}/index.csv'), 1):
                    term = next(
                        csv.reader(io.StringIO(orig_line),
                                   delimiter=posting_list_separator))[0]
                    line_without_term = orig_line[len(term) + 3:]
                    encoded_line = Huffman.encode(line_without_term,
                                                  symbol_to_encoding_dict)
                    compressed_index_file.write(encoded_line)
                    self.compressed_seek_list.append(
                        (term, (offset, len(encoded_line))))
                    self.report.progress(i, ' index lines compressed', 100000)
                    offset += len(encoded_line)
            self.compressed_seek_list = \
                RecordDAWG('>QQ', self.compressed_seek_list)
            self.compressed_seek_list.save(
                f'{self.directory}/compressed_seek_list.dawg')

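
# Hedged sketch (not the project's Huffman module; a simplified stand-in
# using plain '0'/'1' strings instead of BitArray): the posting-list lines
# written to index.csv contain only '\a', ',' and digits, so the fixed
# prefix-free table hard-coded above needs just 3-4 bits per symbol. The
# real code packs the encoded bits into bytes and stores one
# (offset, length) pair per term in the '>QQ' compressed_seek_list DAWG.
FIXED_CODE = {
    '\a': '1111', ',': '001', '0': '1000', '1': '011', '2': '010',
    '3': '000', '4': '1110', '5': '1101', '6': '1100', '7': '1011',
    '8': '1010', '9': '1001',
}


def encode_bits(text):
    return ''.join(FIXED_CODE[c] for c in text)


def decode_bits(bits):
    reverse = {code: symbol for symbol, code in FIXED_CODE.items()}
    out, current = [], ''
    for bit in bits:
        current += bit
        if current in reverse:  # prefix-free: the first match is the symbol
            out.append(reverse[current])
            current = ''
    return ''.join(out)


assert decode_bits(encode_bits('123,456\a789')) == '123,456\a789'
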
    all_forms = [norm] + forms
    for form in all_forms:
        pr, sf = split(form, stem)
        prefixes.append(get_index(pr, ALL_PREFIXES))
        suffixes.append(get_index(sf, ALL_SUFFIXES))
        tags.append(get_index(ps, ALL_TAGS))

    # scheme = array.array('H', prefixes + suffixes + tags)
    scheme = prefixes + suffixes + tags
    if scheme not in ALL_SCHEMES:
        ALL_SCHEMES.append(scheme)
    scheme_id = ALL_SCHEMES.index(scheme)

    for i, form in enumerate(all_forms):
        ALL_MAP.append((form, (scheme_id, i)))

record_dawg = RecordDAWG(u">II", ALL_MAP)
record_dawg.save('words.dawg')

with open('ALL_PREFIXES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_PREFIXES, fp, ensure_ascii=False)
with open('ALL_SUFFIXES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_SUFFIXES, fp, ensure_ascii=False)
with open('ALL_TAGS.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_TAGS, fp, ensure_ascii=False)
with open('ALL_SCHEMES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_SCHEMES, fp, ensure_ascii=False)

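
# Hedged usage sketch (not part of the original source): read back the
# words.dawg and ALL_SCHEMES.json files written above and map a surface form
# to its paradigm. Each ">II" record stores (scheme_id, form index); the
# scheme lists the prefix/suffix/tag indices for every form of the lexeme.
# The import assumes the DAWG package's `dawg` module is the one in use.
import json
from dawg import RecordDAWG


def analyse(form):
    words = RecordDAWG(u">II")
    words.load('words.dawg')
    with open('ALL_SCHEMES.json', encoding='utf-8') as fp:
        schemes = json.load(fp)
    if form not in words:
        return []
    # one (scheme, position within the scheme) pair per stored record
    return [(schemes[scheme_id], form_index)
            for scheme_id, form_index in words[form]]
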
class SearchEngine():
    def __init__(self):
        self.seek_list = None
        self.comment_file = None
        self.index_file = None
        self.symbol_to_encoding_dict = None
        self.cids = None
        self.comment_offsets_cid = None
        self.comment_offsets = None
        self.comment_term_counts = None
        self.comment_csv_reader = None
        self.authors_list = None
        self.articles_list = None
        self.reply_to_index = None
        self.collection_term_count = 0
        self.stemmer = Stemmer.Stemmer('english')
        self.tokenizer = nltk.tokenize.ToktokTokenizer()
        self.report = Report()

    def load_index(self, directory):
        self.seek_list = RecordDAWG('>QQ')
        self.seek_list.load(f'{directory}/compressed_seek_list.dawg')
        self.index_file = open(f'{directory}/compressed_index', mode='rb')
        with open(f'{directory}/symbol_to_encoding_dict.pickle',
                  mode='rb') as f:
            self.symbol_to_encoding_dict = pickle.load(f)
        self.comment_offsets = numpy.load(
            f'{directory}/comment_offsets.npy', mmap_mode=None)
        self.comment_term_counts = numpy.load(
            f'{directory}/comment_term_counts.npy', mmap_mode=None)
        with open(f'{directory}/collection_term_count.pickle',
                  mode='rb') as f:
            self.collection_term_count = pickle.load(f)
        self.comment_file = open(f'{directory}/comments.csv', mode='rb')
        self.comment_csv_reader = csv.reader(
            binary_read_line_generator(self.comment_file))
        with open(f'{directory}/authors_list.pickle', mode='rb') as f:
            self.authors_list = pickle.load(f)
        with open(f'{directory}/articles_list.pickle', mode='rb') as f:
            self.articles_list = pickle.load(f)
        with open(f'{directory}/reply_to_index.pickle', mode='rb') as f:
            self.reply_to_index = pickle.load(f)
        self.cids = numpy.load(f'{directory}/cids.npy', mmap_mode='r')
        self.comment_offsets_cid = numpy.load(
            f'{directory}/comment_offsets_cid.npy', mmap_mode='r')

    def load_posting_list_parts(self, stem):
        offset, size = self.seek_list[stem][0]
        self.index_file.seek(offset)
        binary_data = self.index_file.read(size)
        decoded_posting_list = Huffman.decode(
            binary_data, self.symbol_to_encoding_dict)
        return [stem] + decoded_posting_list.split(posting_list_separator)

    def get_comment_term_count(self, comment_offset):
        return self.comment_term_counts[numpy.searchsorted(
            self.comment_offsets, comment_offset)]

    def get_cid_to_offset(self, cid):
        return self.comment_offsets_cid[numpy.searchsorted(self.cids, cid)]

    # returns score based on natural language model with dirichlet smoothing
    # query_terms: list of query terms, stemmed and filtered
    # comment_offsets: list of offsets of comments into comment file
    def get_dirichlet_smoothed_score(self, query_terms, comment_offsets,
                                     mu=1500):
        ranked_comments = [[0, offset] for offset in comment_offsets]
        for query_term in query_terms:
            query_stem = self.stemmer.stemWord(query_term)
            if query_stem not in self.seek_list or \
                    self.seek_list[query_stem][0][1] > \
                    self.collection_term_count / 100:
                continue
            posting_list_parts = self.load_posting_list_parts(query_stem)
            query_term_count = int(posting_list_parts[1])
            comment_offsets_index = 0
            for comment_list in posting_list_parts[2:]:
                if comment_offsets_index >= len(comment_offsets):
                    break
                first_occurence = int(comment_list.partition(',')[0])
                len_occurrences = comment_list.count(',') + 1
                while (comment_offsets_index < len(comment_offsets)
                        and first_occurence >
                        comment_offsets[comment_offsets_index]):
                    # term not found -> 0 occurrences in comment
                    ranked_comments[comment_offsets_index][0] += math.log(
                        (mu * query_term_count / self.collection_term_count)
                        / (self.get_comment_term_count(comment_offsets[
                            comment_offsets_index]) + mu))
                    comment_offsets_index += 1
                if (comment_offsets_index < len(comment_offsets)
                        and first_occurence ==
                        comment_offsets[comment_offsets_index]):
                    fD_query_term = len_occurrences - 1
                    ranked_comments[comment_offsets_index][0] += math.log(
                        (fD_query_term + (mu * query_term_count
                                          / self.collection_term_count))
                        / (self.get_comment_term_count(comment_offsets[
                            comment_offsets_index]) + mu))
                    comment_offsets_index += 1
            while comment_offsets_index < len(comment_offsets):
                # no matches found
                ranked_comments[comment_offsets_index][0] += math.log(
                    (mu * query_term_count / self.collection_term_count)
                    / (self.get_comment_term_count(comment_offsets[
                        comment_offsets_index]) + mu))
                comment_offsets_index += 1
        return ranked_comments

    # load comment from given offset into comment file
    def load_comment(self, offset):
        self.comment_file.seek(offset)
        comment_as_list = next(self.comment_csv_reader)
        comment = Comment()
        comment.cid = int(comment_as_list[0])
        # comment.article_url = self.articles_list[int(comment_as_list[1])]
        # comment.author = self.authors_list[int(comment_as_list[2])]
        comment.text = comment_as_list[3]
        # comment.timestamp = comment_as_list[4]
        # comment.parent_cid = int(comment_as_list[5]) \
        #     if comment_as_list[5] != '' else -1
        comment.upvotes = int(comment_as_list[6]) \
            if len(comment_as_list) >= 7 else 0
        comment.downvotes = int(comment_as_list[7]) \
            if len(comment_as_list) >= 8 else 0
        return comment

    def load_comment_from_cid(self, cid):
        return self.load_comment(self.get_cid_to_offset(cid))

    def load_cid_only(self, offset):
        self.comment_file.seek(offset)
        csv_line_start = self.comment_file.read(8)
        comma_position = csv_line_start.find(b',')
        while comma_position == -1:
            csv_line_start += self.comment_file.read(8)
            comma_position = csv_line_start.find(b',')
        return csv_line_start[:comma_position].decode()

    # returns offsets into comment file for all comments containing stem in
    # ascending order
    def get_offsets_for_stem(self, stem):
        if stem not in self.seek_list:
            return []
        posting_list_parts = self.load_posting_list_parts(stem)
        return [int(x.partition(',')[0]) for x in posting_list_parts[2:]]

    def phrase_query(self, phrase, suffix=''):
        if phrase == '' and suffix != '':
            # suffix of the phrase now becomes prefix for a prefix query
            return self.prefix_query(suffix)
        if ' ' not in phrase:
            offsets = self.keyword_query(phrase)
        else:
            stem_offset_size_list = []  # may contain duplicates!
            for sentence in nltk.tokenize.sent_tokenize(phrase):
                for token in self.tokenizer.tokenize(sentence):
                    stem = self.stemmer.stemWord(token)
                    if stem not in self.seek_list:
                        continue
                    stem_offset_size_list.append((stem, self.seek_list[stem]))
            if len(stem_offset_size_list) == 0:
                return []
            # sort by posting_list size
            stem_offset_size_list.sort(key=lambda t: t[1][0][1])
            smallest_stem = stem_offset_size_list[0][0]
            second_smallest_stem = stem_offset_size_list[1][0] \
                if len(stem_offset_size_list) >= 2 and \
                stem_offset_size_list[1][1][0][1] < \
                self.collection_term_count / 100 else ''
            offsets = self.get_offsets_for_stem(smallest_stem)
            if second_smallest_stem != '':
                offsets = set(offsets)
                offsets.intersection_update(
                    self.get_offsets_for_stem(second_smallest_stem))

        result = []
        phrase_to_check = phrase if suffix == '' else f'{phrase} {suffix}'
        for offset in offsets:
            comment = self.load_comment(offset)
            if phrase_to_check in comment.text.lower():
                result.append(offset)
        return result

    def prefix_query(self, prefix):
        stems_with_prefix = self.seek_list.keys(prefix)
        result = []
        for stem in stems_with_prefix:
            result.extend(self.get_offsets_for_stem(stem))
        return result

    def keyword_query(self, keyword):
        return self.get_offsets_for_stem(
            self.stemmer.stemWord(keyword))

    def reply_to_query(self, target_cid):
        return [self.get_cid_to_offset(cid)
                for cid in self.reply_to_index.get(target_cid, ())]

    def basic_search(self, token_node):
        # search for a single query token
        if token_node.kind == 'phrase_prefix':
            # phrase prefix query: 'hi ye'*
            return self.phrase_query(
                token_node.phrase_start, token_node.prefix)
        elif token_node.kind == 'phrase':
            # phrase query: 'european union'
            return self.phrase_query(token_node.phrase)
        elif token_node.kind == 'prefix':
            # prefix query: isra*
            return self.prefix_query(token_node.prefix)
        elif token_node.kind == 'reply_to':
            # ReplyTo query: ReplyTo:12345
            return self.reply_to_query(token_node.target_cid)
        elif token_node.kind == 'keyword':
            # keyword query: merkel
            return self.keyword_query(token_node.keyword)
        else:
            raise RuntimeError(f'unknown token_node.kind: {token_node.kind}')

    def print_comments(self, offset_iterable, printIdsOnly=True):
        if printIdsOnly:
            print(','.join((self.load_cid_only(offset)
                            for offset in offset_iterable)))
        else:
            for offset in offset_iterable:
                comment = self.load_comment(offset)
                print(f'{comment.cid},{comment.text}')

    def search(self, query, top_k=None, printIdsOnly=True):
        print(f'\nsearching for "{query}":')
        query_tree_root = build_query_tree(query)

        if query_tree_root.is_boolean_query:
            or_result = set()
            with self.report.measure('searching'):
                for and_node in query_tree_root.children:
                    and_result = None
                    to_be_removed = []
                    for child in and_node.children:
                        child_result = self.basic_search(child)
                        if child.is_negated:
                            to_be_removed.append(child_result)
                        elif and_result is None:
                            and_result = set(child_result)
                        else:
                            and_result.intersection_update(child_result)
                    and_result.difference_update(*to_be_removed)
                    or_result.update(and_result)
            self.print_comments(or_result, printIdsOnly)
        else:  # non-boolean query
            with self.report.measure('searching'):
                children_results = (self.basic_search(child)
                                    for child in query_tree_root.children)
                comment_offsets = list(frozenset().union(*children_results))
            with self.report.measure('calculating scores'):
                # rated_comment is a tuple of (score, offset)
                rated_comments = self.get_dirichlet_smoothed_score(
                    query_tree_root.query_terms, comment_offsets)
                if top_k is not None and len(rated_comments) > top_k:
                    top_k_rated_comments = rated_comments[:top_k]
                    heapq.heapify(top_k_rated_comments)
                    for rated_comment in rated_comments[top_k:]:
                        heapq.heappushpop(top_k_rated_comments, rated_comment)
                    result = top_k_rated_comments
                else:
                    result = rated_comments
                result.sort(key=lambda x: x[0], reverse=True)
            self.print_comments(
                (offset for score, offset in result), printIdsOnly)

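
# Hedged sketch (standalone restatement, not the class method above): the
# value accumulated in get_dirichlet_smoothed_score is the query-likelihood
# language model with Dirichlet smoothing. For each query term t and
# comment D it adds
#     log( (f(t, D) + mu * cf(t) / |C|) / (|D| + mu) )
# where f(t, D) is the term frequency in the comment, cf(t) the collection
# frequency, |D| the comment's term count and |C| the collection term count.
import math


def dirichlet_score(term_freqs_in_comment, comment_len, collection_freqs,
                    collection_len, mu=1500):
    score = 0.0
    for term, cf in collection_freqs.items():
        f_td = term_freqs_in_comment.get(term, 0)  # 0 if the term is absent
        score += math.log((f_td + mu * cf / collection_len)
                          / (comment_len + mu))
    return score


# toy example: a query term that occurs twice in a 50-token comment
assert dirichlet_score({'merkel': 2}, 50, {'merkel': 120}, 1000000) < 0
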