Example #2
def load_dict(path):
    format = ">2I"
    try:
        # records are pairs of big-endian unsigned 32-bit integers
        d = RecordDAWG(format)
        d.load(path)
        return d
    except Exception as e:
        print("load dict error:", e)
        return None
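
For context, a minimal sketch of the RecordDAWG API that these snippets rely on (assuming the pytries `dawg` package); the keys and record values below are invented for illustration:

from dawg import RecordDAWG

# ">2I": every key maps to records of two big-endian unsigned 32-bit integers
data = [(u'apple', (10, 1)), (u'apricot', (7, 2)), (u'banana', (3, 1))]
d = RecordDAWG(">2I", data)

print(d[u'apple'])     # [(10, 1)]  - list of records stored for the key
print(u'pear' in d)    # False
print(d.keys(u'ap'))   # [u'apple', u'apricot']  - prefix lookup
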
Example #3
def __init__(self, dict_path):
    self.dict_path = dict_path
    self.format = ">2I"
    try:
        # load a prebuilt RecordDAWG of (uint32, uint32) records
        self.dict = RecordDAWG(self.format)
        self.dict.load(dict_path)
    except Exception as e:
        print("load dict error:", dict_path, e)
Example #4
def build_dict(path, path_build):
    format = ">2I"
    keys = []
    values = []
    # each input line: "<word>\t<int>\t<int>"
    with open(path, 'rb') as file_handler:
        for line in file_handler:
            arr = line.strip(b'\r\n').split(b'\t')
            try:
                if len(arr) == 3:
                    keys.append(arr[0].decode("utf-8"))
                    values.append([int(arr[1]), int(arr[2])])
            except (ValueError, UnicodeDecodeError):
                continue
    data = zip(keys, values)
    record = RecordDAWG(format, data)
    with open(path_build, 'wb') as f:
        record.write(f)
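
A possible round-trip check of build_dict together with the load_dict helper from the earlier example; the file names and the two sample rows are made up for this sketch:

# build a tiny "<word>\t<int>\t<int>" file, compile it, and query the result
with open('toy_dict.txt', 'wb') as src:
    src.write('北京\t110000\t1\n'.encode('utf-8'))
    src.write('上海\t310000\t2\n'.encode('utf-8'))

build_dict('toy_dict.txt', 'toy_dict.dawg')
d = load_dict('toy_dict.dawg')
if d is not None:
    print(d['北京'])   # expected: [(110000, 1)]
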
Example #5
                nameIdMap[short_name] = []
            #print short_name, region_id, region_id_online
            nameIdMap[short_name].append([region_id, region_id_online])
    return nameIdMap


if __name__ == '__main__':
    #"654226", "和布克赛尔蒙古自治县", "和布克赛尔县", "0", "312216", "65", "amqp://*****:*****@120.27.247.47:5672/%2F"
    build_file = 'region_all.dawg'

    #generate dict
    format = ">2I"
    keys = []
    values = []

    nameIdMap = get_region_list()
    for k in nameIdMap.keys():
        t = nameIdMap[k]
        for v in t:
            if v[0] and v[1]:
                keys.append(k)
                values.append([int(v[0]), int(v[1])])

    print(len(values), len(keys))
    for x in range(0, len(values)):
        print(keys[x], values[x])
    data = zip(keys, values)
    record = RecordDAWG(format, data)
    with open(build_file, 'wb') as f:
        record.write(f)
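
Once the script above has written region_all.dawg, the lookup side could look roughly like the following sketch; the short name is taken from the commented sample row and only serves as an example:

from dawg import RecordDAWG

# resolve a short region name to its (region_id, region_id_online) pairs
regions = RecordDAWG(">2I")
regions.load('region_all.dawg')

name = '和布克赛尔县'
if name in regions:
    for region_id, region_id_online in regions[name]:
        print(name, region_id, region_id_online)
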
class IndexCreator():
    def __init__(self, directory):
        self.directory = directory
        assert (os.path.isfile(f'{self.directory}/comments.csv'))
        sys.setrecursionlimit(10000)
        self.report = Report(quiet_mode=False)

    def create_index(self):
        # read csv to create comment_list

        with self.report.measure('processing comments.csv'):
            number_of_processes = min(os.cpu_count(), 2)
            print(f'starting {number_of_processes} processes')
            csv_size = os.stat(f'{self.directory}/comments.csv').st_size
            with multiprocessing.Pool(processes=number_of_processes) as pool:
                offsets = [0]
                with open(f'{self.directory}/comments.csv', mode='rb') as f:
                    for i in range(1, number_of_processes + 1):
                        f.seek(int(i * csv_size / number_of_processes))
                        f.readline()
                        next_offset = f.tell()
                        if next_offset != offsets[-1]:
                            offsets.append(next_offset)

                def on_error(exception):
                    raise exception

                for start_offset, end_offset in zip(offsets, offsets[1:]):
                    pool.apply_async(process_comments_file,
                                     args=(self.directory, start_offset,
                                           end_offset),
                                     error_callback=on_error)
                pool.close()
                pool.join()

            self.partial_index_names = []
            reply_to_index = {}
            cid_to_offset = {}
            for end_offset in offsets[1:]:
                file_number_path = \
                    f'{self.directory}/{end_offset}_file_number.pickle'
                with open(file_number_path, mode='rb') as f:
                    file_number = pickle.load(f)
                    for i in range(file_number):
                        self.partial_index_names.append(
                            f'{self.directory}/{end_offset}_{i}')
                os.remove(file_number_path)

                reply_to_index_part_path = f'{self.directory}/' \
                    f'{end_offset}_reply_to_index.pickle'
                with open(reply_to_index_part_path, mode='rb') as f:
                    reply_to_index_part = pickle.load(f)
                    for key, value in reply_to_index_part.items():
                        if key not in reply_to_index.keys():
                            reply_to_index[key] = value
                        else:
                            reply_to_index[key].extend(value)
                os.remove(reply_to_index_part_path)

                cid_to_offset_part_path = f'{self.directory}/' \
                    f'{end_offset}_cid_to_offset.pickle'
                with open(cid_to_offset_part_path, mode='rb') as f:
                    cid_to_offset_part = pickle.load(f)
                    cid_to_offset.update(cid_to_offset_part)
                os.remove(cid_to_offset_part_path)

            with open(f'{self.directory}/reply_to_index.pickle',
                      mode='wb') as f:
                pickle.dump(reply_to_index, f, pickle.HIGHEST_PROTOCOL)

            # save sorted cids and their comment offsets as parallel arrays
            sorted_cids = sorted(cid_to_offset.keys())
            numpy.save(f'{self.directory}/cids.npy',
                       numpy.array(sorted_cids, dtype=numpy.int64))
            numpy.save(
                f'{self.directory}/comment_offsets_cid.npy',
                numpy.array([cid_to_offset[cid] for cid in sorted_cids],
                            dtype=numpy.int64))

        # merge indices
        with self.report.measure('merging index'):
            # comment term counts
            self.comment_term_count_dict = {}
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_comment_term_count_dict.pickle'
                with open(file_path, mode='rb') as f:
                    self.comment_term_count_dict.update(pickle.load(f))
                os.remove(file_path)
            # parallel arrays: comment offsets (sorted) and their term counts
            sorted_offsets = sorted(self.comment_term_count_dict.keys())
            numpy.save(f'{self.directory}/comment_offsets.npy',
                       numpy.array(sorted_offsets, dtype=numpy.int64))
            numpy.save(
                f'{self.directory}/comment_term_counts.npy',
                numpy.array(
                    [self.comment_term_count_dict[o] for o in sorted_offsets],
                    dtype=numpy.int32))

            # collection term count
            self.collection_term_count = 0
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_collection_term_count.pickle'
                with open(file_path, mode='rb') as f:
                    self.collection_term_count += pickle.load(f)
                os.remove(file_path)

            with open(f'{self.directory}/collection_term_count.pickle',
                      mode='wb') as f:
                pickle.dump(self.collection_term_count, f,
                            pickle.HIGHEST_PROTOCOL)

            # index
            index_files = []
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_index.csv'
                index_files.append(open(file_path, mode='rb'))

            current_terms = []
            current_meta = []
            current_posting_lists = []
            global_active_indices = []
            global_active_file_count = 0

            for file in index_files:
                line = file.readline().decode('utf-8').rstrip('\n').split(
                    posting_list_separator, 2)
                current_terms.append(line[0])
                current_meta.append(int(line[1]))
                current_posting_lists.append(line[2])
                global_active_indices.append(True)
                global_active_file_count += 1

            current_active_indices = []
            current_min_term = None
            self.seek_list = []
            current_offset = 0
            terms_done = 0

            with open(f'{self.directory}/index.csv', mode='wb') as f:
                while global_active_file_count > 0:
                    # find next term to write
                    for key, term in enumerate(current_terms):
                        if not global_active_indices[key]:
                            continue
                        if current_min_term is None or term < current_min_term:
                            current_active_indices = [key]
                            current_min_term = term
                        elif term == current_min_term:
                            current_active_indices.append(key)

                    # merge all lines containing term

                    if len(current_min_term) <= 128:
                        meta = 0
                        for key in current_active_indices:
                            meta += current_meta[key]

                        line_string = \
                            f'{current_min_term}{posting_list_separator}{meta}'
                        for key in current_active_indices:
                            line_string += f'{posting_list_separator}' \
                                f'{current_posting_lists[key]}'

                        line_string += '\n'
                        line_raw = line_string.encode()
                        f.write(line_raw)
                        term = current_min_term[1:-1].replace('""', '"')
                        self.seek_list.append((term, [current_offset]))
                        current_offset += len(line_raw)

                    # reload lines where necessary
                    for key in current_active_indices:
                        linetest = index_files[key].readline().decode('utf-8')
                        if linetest == '':
                            # end of file
                            global_active_indices[key] = False
                            global_active_file_count -= 1
                            print('one file out, '
                                  f'{global_active_file_count} remaining')
                        else:
                            line = linetest.rstrip('\n').split(
                                posting_list_separator, 2)
                            current_terms[key] = line[0]
                            current_meta[key] = int(line[1])
                            current_posting_lists[key] = line[2]

                    current_min_term = None
                    current_active_indices = []
                    terms_done += 1
                    if terms_done % 100000 == 0:
                        print(f'Merged {terms_done} terms.')

            self.seek_list = RecordDAWG('>Q', self.seek_list)
            self.seek_list.save(f'{self.directory}/seek_list.dawg')

            for f in index_files:
                f.close()

            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_index.csv'
                os.remove(file_path)

        self.huffman_compression(generate_encoding=False)

        with self.report.measure('processing authors & articles'):
            with open(f'{self.directory}/authors_list.pickle', mode='wb') as f:
                pickle.dump(
                    create_list_from_csv(f'{self.directory}/authors.csv'), f,
                    pickle.HIGHEST_PROTOCOL)

            with open(f'{self.directory}/articles_list.pickle', mode='wb') \
                    as f:
                pickle.dump(
                    create_list_from_csv(f'{self.directory}/articles.csv'), f,
                    pickle.HIGHEST_PROTOCOL)

    def huffman_compression(self, generate_encoding=False):
        # compress using Huffman encoding
        symbol_to_encoding_dict = {}

        # count all occurring UTF-8 characters
        if generate_encoding:
            symbol_to_frequency_dict = Counter()
            with self.report.measure('counting utf8 characters'):
                with open(f'{self.directory}/index.csv') as index_file:
                    chunk_size = 100000

                    def next_chunk_generator():
                        chunk = index_file.read(chunk_size)
                        while chunk:
                            yield chunk
                            chunk = index_file.read(chunk_size)

                    for i, chunk in enumerate(next_chunk_generator(), 1):
                        symbol_to_frequency_dict.update(Counter(chunk))
                        self.report.progress(
                            i, f' chunks counted ({chunk_size} characters '
                            'each)', 100)
                if '\n' in symbol_to_frequency_dict.keys():
                    del symbol_to_frequency_dict['\n']

            # derive huffman encoding from character counts
            with self.report.measure('deriving huffman encoding'):
                symbol_to_encoding_dict = Huffman.derive_encoding(
                    symbol_to_frequency_dict)
            # sanity check: every Huffman symbol must be a single character;
            # the encoding dict itself is pickled below for both branches
            assert all(len(key) == 1 for key in symbol_to_encoding_dict)
        else:
            # optimal encoding for guardian
            # character distribution should be similar for all datasets
            symbol_to_encoding_dict = {
                '\a': BitArray('1111'),
                ',': BitArray('001'),
                '0': BitArray('1000'),
                '1': BitArray('011'),
                '2': BitArray('010'),
                '3': BitArray('000'),
                '4': BitArray('1110'),
                '5': BitArray('1101'),
                '6': BitArray('1100'),
                '7': BitArray('1011'),
                '8': BitArray('1010'),
                '9': BitArray('1001')
            }

        with open(f'{self.directory}/symbol_to_encoding_dict.pickle',
                  mode='wb') as f:
            pickle.dump(symbol_to_encoding_dict, f, pickle.HIGHEST_PROTOCOL)

        # save compressed index and corresponding seek_list
        with self.report.measure('saving compressed files'):
            self.compressed_seek_list = []
            with open(f'{self.directory}/compressed_index', mode='wb') \
                    as compressed_index_file:
                offset = 0
                for i, orig_line in enumerate(
                        binary_read_line_generator_path(
                            f'{self.directory}/index.csv'), 1):
                    term = next(
                        csv.reader(io.StringIO(orig_line),
                                   delimiter=posting_list_separator))[0]
                    line_without_term = orig_line[len(term) + 3:]
                    encoded_line = Huffman.encode(line_without_term,
                                                  symbol_to_encoding_dict)
                    compressed_index_file.write(encoded_line)

                    self.compressed_seek_list.append(
                        (term, (offset, len(encoded_line))))

                    self.report.progress(i, ' index lines compressed', 100000)

                    offset += len(encoded_line)
            self.compressed_seek_list = \
                RecordDAWG('>QQ', self.compressed_seek_list)
            self.compressed_seek_list.save(
                f'{self.directory}/compressed_seek_list.dawg')
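
The compressed_seek_list built above stores an (offset, length) record per term so that a single term's compressed posting list can be read without scanning the whole index file. A self-contained sketch of that idea, using made-up terms and a plain byte file in place of the Huffman-coded index:

from dawg import RecordDAWG

# write a few posting lists back to back, remembering (offset, length) per term
entries = {'merkel': b'12,3|99,1', 'union': b'7,2', 'zebra': b'42,1'}
seek_entries = []
offset = 0
with open('toy_index.bin', 'wb') as f:
    for term in sorted(entries):
        data = entries[term]
        f.write(data)
        seek_entries.append((term, (offset, len(data))))
        offset += len(data)

# '>QQ': two big-endian unsigned 64-bit integers (offset, length) per term
seek_list = RecordDAWG('>QQ', seek_entries)

# random access: seek to the stored offset and read exactly `length` bytes
with open('toy_index.bin', 'rb') as f:
    off, length = seek_list['union'][0]
    f.seek(off)
    print(f.read(length))   # b'7,2'
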
    all_forms = [norm] + forms
    for form in all_forms:
        pr, sf = split(form, stem)

        prefixes.append(get_index(pr, ALL_PREFIXES))
        suffixes.append(get_index(sf, ALL_SUFFIXES))
        tags.append(get_index(ps, ALL_TAGS))
    # scheme = array.array('H', prefixes + suffixes + tags)
    scheme = prefixes + suffixes + tags
    if scheme not in ALL_SCHEMES:
        ALL_SCHEMES.append(scheme)
    scheme_id = ALL_SCHEMES.index(scheme)

    for i, form in enumerate(all_forms):
        ALL_MAP.append((form, (scheme_id, i)))

record_dawg = RecordDAWG(u">II", ALL_MAP)
record_dawg.save('words.dawg')

with open('ALL_PREFIXES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_PREFIXES, fp, ensure_ascii=False)

with open('ALL_SUFFIXES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_SUFFIXES, fp, ensure_ascii=False)

with open('ALL_TAGS.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_TAGS, fp, ensure_ascii=False)

with open('ALL_SCHEMES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_SCHEMES, fp, ensure_ascii=False)
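
With words.dawg and the JSON side tables written, looking a surface form back up might look like the sketch below; the form is only a placeholder, since the real vocabulary comes from the (truncated) corpus loop above:

import json
from dawg import RecordDAWG

# words.dawg maps each form to (scheme_id, position of the form in its scheme)
words = RecordDAWG(u">II")
words.load('words.dawg')

with open('ALL_SCHEMES.json', encoding='utf-8') as fp:
    ALL_SCHEMES = json.load(fp)

form = u'word'  # placeholder; any form that was fed into ALL_MAP will work
if form in words:
    for scheme_id, form_index in words[form]:
        scheme = ALL_SCHEMES[scheme_id]
        print(form, scheme_id, form_index, len(scheme))
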
class SearchEngine():
    def __init__(self):
        self.seek_list = None
        self.comment_file = None
        self.index_file = None
        self.symbol_to_encoding_dict = None
        self.cids = None
        self.comment_offsets_cid = None
        self.comment_offsets = None
        self.comment_term_counts = None
        self.comment_csv_reader = None
        self.authors_list = None
        self.articles_list = None
        self.reply_to_index = None
        self.collection_term_count = 0
        self.stemmer = Stemmer.Stemmer('english')
        self.tokenizer = nltk.tokenize.ToktokTokenizer()
        self.report = Report()

    def load_index(self, directory):
        self.seek_list = RecordDAWG('>QQ')
        self.seek_list.load(f'{directory}/compressed_seek_list.dawg')
        self.index_file = open(f'{directory}/compressed_index', mode='rb')
        with open(f'{directory}/symbol_to_encoding_dict.pickle',
                  mode='rb') as f:
            self.symbol_to_encoding_dict = pickle.load(f)
        self.comment_offsets = numpy.load(
            f'{directory}/comment_offsets.npy', mmap_mode=None)
        self.comment_term_counts = numpy.load(
            f'{directory}/comment_term_counts.npy', mmap_mode=None)
        with open(f'{directory}/collection_term_count.pickle', mode='rb') as f:
            self.collection_term_count = pickle.load(f)
        self.comment_file = open(f'{directory}/comments.csv', mode='rb')
        self.comment_csv_reader = csv.reader(
            binary_read_line_generator(self.comment_file))
        with open(f'{directory}/authors_list.pickle', mode='rb') as f:
            self.authors_list = pickle.load(f)
        with open(f'{directory}/articles_list.pickle', mode='rb') as f:
            self.articles_list = pickle.load(f)
        with open(f'{directory}/reply_to_index.pickle', mode='rb') as f:
            self.reply_to_index = pickle.load(f)
        self.cids = numpy.load(f'{directory}/cids.npy', mmap_mode='r')
        self.comment_offsets_cid = numpy.load(
            f'{directory}/comment_offsets_cid.npy', mmap_mode='r')

    def load_posting_list_parts(self, stem):
        offset, size = self.seek_list[stem][0]
        self.index_file.seek(offset)
        binary_data = self.index_file.read(size)
        decoded_posting_list = Huffman.decode(
            binary_data, self.symbol_to_encoding_dict)
        return [stem] + decoded_posting_list.split(posting_list_separator)

    def get_comment_term_count(self, comment_offset):
        return self.comment_term_counts[numpy.searchsorted(
            self.comment_offsets, comment_offset)]

    def get_cid_to_offset(self, cid):
        return self.comment_offsets_cid[numpy.searchsorted(self.cids, cid)]

    # returns score based on natural language model with dirichlet smoothing
    # query_terms: list of query terms, stemmed and filtered
    # comment_offsets: list of offsets of comments into comment file
    def get_dirichlet_smoothed_score(self, query_terms, comment_offsets,
                                     mu=1500):
        ranked_comments = [[0, offset] for offset in comment_offsets]
        for query_term in query_terms:
            query_stem = self.stemmer.stemWord(query_term)
            if query_stem not in self.seek_list or \
                    self.seek_list[query_stem][0][1] > \
                    self.collection_term_count / 100:
                continue
            posting_list_parts = self.load_posting_list_parts(query_stem)
            query_term_count = int(posting_list_parts[1])
            comment_offsets_index = 0
            for comment_list in posting_list_parts[2:]:
                if comment_offsets_index >= len(comment_offsets):
                    break
                first_occurence = int(comment_list.partition(',')[0])
                len_occurrences = comment_list.count(',') + 1
                while (comment_offsets_index < len(comment_offsets)
                        and first_occurence >
                        comment_offsets[comment_offsets_index]):
                    # term not found -> 0 occurrences in comment
                    ranked_comments[comment_offsets_index][0] += math.log(
                        (mu * query_term_count / self.collection_term_count)
                        / (self.get_comment_term_count(comment_offsets[
                            comment_offsets_index]) + mu))
                    comment_offsets_index += 1

                if(comment_offsets_index < len(comment_offsets)
                        and first_occurence ==
                        comment_offsets[comment_offsets_index]):
                    fD_query_term = len_occurrences - 1
                    ranked_comments[comment_offsets_index][0] += math.log(
                        (fD_query_term + (mu * query_term_count
                                          / self.collection_term_count))
                        / (self.get_comment_term_count(comment_offsets[
                            comment_offsets_index]) + mu))
                    comment_offsets_index += 1
            while comment_offsets_index < len(comment_offsets):
                # no matches found
                ranked_comments[comment_offsets_index][0] += math.log(
                    (mu * query_term_count / self.collection_term_count)
                    / (self.get_comment_term_count(comment_offsets[
                        comment_offsets_index]) + mu))
                comment_offsets_index += 1

        return ranked_comments

    # load comment from given offset into comment file
    def load_comment(self, offset):
        self.comment_file.seek(offset)
        comment_as_list = next(self.comment_csv_reader)
        comment = Comment()
        comment.cid = int(comment_as_list[0])
        # comment.article_url = self.articles_list[int(comment_as_list[1])]
        # comment.author = self.authors_list[int(comment_as_list[2])]
        comment.text = comment_as_list[3]
        # comment.timestamp = comment_as_list[4]
        # comment.parent_cid = int(comment_as_list[5]) \
        #    if comment_as_list[5] != '' else -1
        comment.upvotes = int(comment_as_list[6]) \
            if len(comment_as_list) >= 7 else 0
        comment.downvotes = int(comment_as_list[7]) \
            if len(comment_as_list) >= 8 else 0

        return comment

    def load_comment_from_cid(self, cid):
        return self.load_comment(self.get_cid_to_offset(cid))

    def load_cid_only(self, offset):
        self.comment_file.seek(offset)
        csv_line_start = self.comment_file.read(8)
        comma_position = csv_line_start.find(b',')
        while comma_position == -1:
            csv_line_start += self.comment_file.read(8)
            comma_position = csv_line_start.find(b',')
        return csv_line_start[:comma_position].decode()

    # returns offsets into comment file for all comments containing stem in
    # ascending order
    def get_offsets_for_stem(self, stem):
        if stem not in self.seek_list:
            return []
        posting_list_parts = self.load_posting_list_parts(stem)
        return [int(x.partition(',')[0]) for x in posting_list_parts[2:]]

    def phrase_query(self, phrase, suffix=''):
        if phrase == '' and suffix != '':
            # suffix of the phrase now becomes prefix for a prefix query
            return self.prefix_query(suffix)

        if ' ' not in phrase:
            offsets = self.keyword_query(phrase)
        else:
            stem_offset_size_list = []  # may contain duplicates!
            for sentence in nltk.tokenize.sent_tokenize(phrase):
                for token in self.tokenizer.tokenize(sentence):
                    stem = self.stemmer.stemWord(token)
                    if stem not in self.seek_list:
                        continue
                    stem_offset_size_list.append((stem, self.seek_list[stem]))

            if len(stem_offset_size_list) == 0:
                return []

            # sort by posting_list size
            stem_offset_size_list.sort(key=lambda t: t[1][0][1])
            smallest_stem = stem_offset_size_list[0][0]
            second_smallest_stem = stem_offset_size_list[1][0] \
                if len(stem_offset_size_list) >= 2 and \
                stem_offset_size_list[1][1][0][1] < \
                self.collection_term_count / 100 else ''
            offsets = self.get_offsets_for_stem(smallest_stem)
            if second_smallest_stem != '':
                offsets = set(offsets)
                offsets.intersection_update(
                    self.get_offsets_for_stem(second_smallest_stem))

        result = []
        phrase_to_check = phrase if suffix == '' else f'{phrase} {suffix}'
        for offset in offsets:
            comment = self.load_comment(offset)
            if phrase_to_check in comment.text.lower():
                result.append(offset)
        return result

    def prefix_query(self, prefix):
        stems_with_prefix = self.seek_list.keys(prefix)
        result = []
        for stem in stems_with_prefix:
            result.extend(self.get_offsets_for_stem(stem))
        return result

    def keyword_query(self, keyword):
        return self.get_offsets_for_stem(
            self.stemmer.stemWord(keyword))

    def reply_to_query(self, target_cid):
        return [self.get_cid_to_offset(cid)
                for cid in self.reply_to_index.get(target_cid, ())]

    def basic_search(self, token_node):
        # search for a single query token

        if token_node.kind == 'phrase_prefix':  # phrase prefix query: 'hi ye'*
            return self.phrase_query(
                token_node.phrase_start, token_node.prefix)
        elif token_node.kind == 'phrase':  # phrase query: 'european union'
            return self.phrase_query(token_node.phrase)
        elif token_node.kind == 'prefix':  # prefix query: isra*
            return self.prefix_query(token_node.prefix)
        elif token_node.kind == 'reply_to':  # ReplyTo query: ReplyTo:12345
            return self.reply_to_query(token_node.target_cid)
        elif token_node.kind == 'keyword':  # keyword query: merkel
            return self.keyword_query(token_node.keyword)
        else:
            raise RuntimeError(f'unknown token_node.kind: {token_node.kind}')

    def print_comments(self, offset_iterable, printIdsOnly=True):
        if printIdsOnly:
            print(','.join((self.load_cid_only(offset)
                            for offset in offset_iterable)))
        else:
            for offset in offset_iterable:
                comment = self.load_comment(offset)
                print(f'{comment.cid},{comment.text}')

    def search(self, query, top_k=None, printIdsOnly=True):
        print(f'\nsearching for "{query}":')

        query_tree_root = build_query_tree(query)
        if query_tree_root.is_boolean_query:
            or_result = set()
            with self.report.measure('searching'):
                for and_node in query_tree_root.children:
                    and_result = None
                    to_be_removed = []
                    for child in and_node.children:
                        child_result = self.basic_search(child)
                        if child.is_negated:
                            to_be_removed.append(child_result)
                        elif and_result is None:
                            and_result = set(child_result)
                        else:
                            and_result.intersection_update(child_result)
                    and_result.difference_update(*to_be_removed)
                    or_result.update(and_result)

            self.print_comments(or_result, printIdsOnly)
        else:  # non bool query
            with self.report.measure('searching'):
                children_results = (self.basic_search(child)
                                    for child in query_tree_root.children)
                comment_offsets = list(frozenset().union(*children_results))

            with self.report.measure('calculating scores'):
                # rated_comment is a tuple of (score, offset)
                rated_comments = self.get_dirichlet_smoothed_score(
                    query_tree_root.query_terms, comment_offsets)
                if top_k is not None and len(rated_comments) > top_k:
                    top_k_rated_comments = \
                        rated_comments[:top_k]
                    heapq.heapify(top_k_rated_comments)
                    for rated_comment in rated_comments[top_k:]:
                        heapq.heappushpop(top_k_rated_comments, rated_comment)
                    result = top_k_rated_comments
                else:
                    result = rated_comments

                result.sort(key=lambda x: x[0], reverse=True)

            self.print_comments(
                (offset for score, offset in result), printIdsOnly)
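
The Dirichlet-smoothed language-model score used in get_dirichlet_smoothed_score above can be checked in isolation. A small stand-alone sketch with invented counts, where fD is the term frequency in a comment, dl the comment's term count, cf the term's collection frequency, and C the collection term count:

import math

def dirichlet_term_score(fD, dl, cf, C, mu=1500):
    # log P(term | comment) with Dirichlet smoothing: (fD + mu*cf/C) / (dl + mu)
    return math.log((fD + mu * cf / C) / (dl + mu))

# a term occurring twice in a 40-term comment, 5000 times in a
# collection of 10 million terms ...
print(dirichlet_term_score(fD=2, dl=40, cf=5000, C=10_000_000))
# ... scores higher than the same comment not containing the term at all
print(dirichlet_term_score(fD=0, dl=40, cf=5000, C=10_000_000))
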