Example #1
def main(argc, argv):
    # Default to the bundled Google vectors; an alternative file can be
    # passed as the first command-line argument.
    filename = '../vectors/google_vecs.txt'
    if argc > 1:
        filename = argv[1]
    words, vecs, size = utils.get_vectors(filename)
    print(size)
    # Compute similarity and dissimilarity statistics, then plot them.
    sim_values, dis_sim_values = calculate_similarity_values(words, vecs, size)
    print(sim_values)
    print(dis_sim_values)
    plot_graph(sim_values[1:], dis_sim_values[:-1])
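All three examples rely on a project-local utils.get_vectors helper that returns the vocabulary, the vector data, and the vector count. Its implementation is not part of the listing; the following is a minimal sketch, assuming a plain-text format with one word followed by its vector components per line (the real loader may support other formats and, as Examples #2 and #3 suggest, an optional logger argument):

# Minimal sketch of the assumed utils.get_vectors helper; the actual loader
# may differ (binary formats, logging, error handling).
import numpy as np

def get_vectors(filename, logger=None):
    # Parse 'word v1 v2 ...' lines into a vocabulary list and a float matrix.
    words, vecs = [], []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:
                continue
            words.append(parts[0])
            vecs.append([float(x) for x in parts[1:]])
    return words, np.array(vecs, dtype=np.float32), len(words)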
Example #2
def main(argc, argv):
    db_config = Configuration('config/db_config.json')
    logger = Logger(db_config.get_value('log'))
    if argc < 2:
        logger.log(Logger.ERROR,
                   'Configuration file for index creation required')
        return
    index_config = Configuration(argv[1])
    batch_size = db_config.get_value('batch_size')

    train_size_coarse = index_config.get_value('train_size_coarse')
    train_size_fine = index_config.get_value('train_size_fine')
    centr_num_coarse = index_config.get_value('k_coarse')
    m = index_config.get_value('m')
    k = index_config.get_value('k')

    # get vectors
    words, vectors, vectors_size = \
        utils.get_vectors(index_config.get_value('vec_file_path'), logger)
    logger.log(logger.INFO, 'vectors_size: ' + str(vectors_size))

    # determine coarse quantizer
    cq_filename = index_config.get_value('coarse_quantizer_file') if \
        index_config.has_key('coarse_quantizer_file') else None
    cq_output_name = cq_filename if cq_filename is not None else 'coarse_quantizer.pcl'
    if COARSE_TYPE == 'MULTI_INDEX':
        cq = qcreator.construct_quantizer(
            qcreator.create_quantizer,
            (vectors[:train_size_fine], 2, centr_num_coarse, logger),
            logger,
            input_name=cq_filename,
            output_name=cq_output_name)
    else:
        cq = qcreator.construct_quantizer(
            qcreator.create_coarse_quantizer,
            (vectors[:train_size_coarse], centr_num_coarse),
            logger,
            input_name=cq_filename,
            output_name=cq_output_name)

    # determine codebook
    codebook_filename = index_config.get_value('codebook_file') if \
        index_config.has_key('codebook_file') else None
    codebook_output_name = codebook_filename if codebook_filename is not None else 'codebook.pcl'
    codebook = qcreator.construct_quantizer(
        qcreator.create_quantizer, (vectors[:train_size_fine], m, k, logger),
        logger,
        input_name=codebook_filename,
        output_name=codebook_output_name)

    # create db connection
    con, cur = db_export.create_connection(db_config, logger)

    # prepare database
    utils.init_tables(con, cur, get_table_information(index_config), logger)
    utils.disable_triggers(index_config.get_value('fine_table_name'), con, cur)

    # create index with quantizers
    logger.log(logger.INFO, 'Start index creation (single cycle)')
    start = time.time()
    index, coarse_counts, fine_counts = \
        create_index_data(vectors[:vectors_size], cq, codebook, logger)
    end = time.time()
    logger.log(logger.INFO,
               'Finish index creation after ' + str(end - start) + ' seconds')
    # add to database
    add_to_database(words, cq, codebook, index, coarse_counts, fine_counts,
                    con, cur, index_config, batch_size, logger)
    logger.log(logger.INFO, 'Create database index structures')
    utils.create_index(index_config.get_value('fine_table_name'),
                       index_config.get_value('fine_word_index_name'), 'id',
                       con, cur, logger)
    utils.create_index(index_config.get_value('fine_table_name'),
                       index_config.get_value('fine_coarse_index_name'),
                       'coarse_id', con, cur, logger)

    # create statistics
    if (index_config.has_key('statistic_table')
            and index_config.has_key('statistic_column')
            and CREATE_STATS_TABLE):
        utils.create_statistics_table(
            index_config.get_value('statistic_table'),
            index_config.get_value('statistic_column'),
            index_config.get_value('coarse_table_name'), con, cur, logger)

    utils.enable_triggers(index_config.get_value('fine_table_name'), con, cur)
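Examples #2 and #3 both read their settings through a Configuration wrapper with get_value and has_key accessors. Its definition is not shown in the listing; a minimal sketch, assuming the configuration files are plain JSON objects:

# Minimal sketch of the assumed Configuration wrapper; the real class may
# add validation, defaults, or nested lookups.
import json

class Configuration:
    def __init__(self, path):
        with open(path, 'r') as f:
            self._data = json.load(f)

    def has_key(self, key):
        # Mirrors the dict-style membership test used in the examples.
        return key in self._data

    def get_value(self, key):
        return self._data[key]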
Example #3
def main(argc, argv):
    db_config = Configuration('config/db_config.json')
    logger = Logger(db_config.get_value('log'))
    if argc < 2:
        logger.log(Logger.ERROR, 'Configuration file for index creation required')
        return
    index_config = Configuration(argv[1])

    batch_size = db_config.get_value('batch_size')

    train_size_coarse = index_config.get_value('train_size_coarse')
    train_size_fine = index_config.get_value('train_size_fine')
    centr_num_coarse = index_config.get_value('k_coarse')
    m = index_config.get_value('m')
    k = index_config.get_value('k')

    # get vectors
    words, vectors, vectors_size = utils.get_vectors(index_config.get_value('vec_file_path'), logger)
    logger.log(logger.INFO, 'vectors_size: ' + str(vectors_size))

    # determine coarse quantizer
    cq = None
    if index_config.has_key('coarse_quantizer_file'):
        cq_filename = index_config.get_value('coarse_quantizer_file')
        if cq_filename:
            logger.log(Logger.INFO, 'Use coarse quantizer from ' + cq_filename)
            cq = qcreator.load_quantizer(cq_filename)
    if cq is None:
        logger.log(Logger.INFO, 'Create new coarse quantizer')
        # create coarse quantizer
        cq = qcreator.create_coarse_quantizer(vectors[:train_size_coarse], centr_num_coarse)
        # store coarse quantizer
        qcreator.store_quantizer(cq, 'coarse_quantizer.pcl')

    # determine codebook
    codebook = None
    if index_config.has_key('residual_codebook_file'):
        codebook_filename = index_config.get_value('residual_codebook_file')
        if codebook_filename:
            logger.log(Logger.INFO, 'Use residual codebook from ' + codebook_filename)
            codebook = qcreator.load_quantizer(codebook_filename)
    if codebook is None:
        logger.log(Logger.INFO, 'Create new residual codebook')
        # calculate codebook based on residuals
        codebook = create_fine_quantizer(cq, vectors[:train_size_fine], m, k, logger)
        # store codebook
        qcreator.store_quantizer(codebook, 'residual_codebook.pcl')

    con = None
    cur = None
    if index_config.get_value('add_to_database'):
        # create db connection
        try:
            con = psycopg2.connect(
                "dbname='" + db_config.get_value('db_name') +
                "' user='" + db_config.get_value('username') +
                "' host='" + db_config.get_value('host') +
                "' password='" + db_config.get_value('password') + "'")
        except psycopg2.Error:
            logger.log(logger.ERROR, 'Cannot connect to database')
            return
        cur = con.cursor()

        utils.init_tables(con, cur, get_table_information(index_config), logger)
        utils.disable_triggers(index_config.get_value('fine_table_name'), con, cur)

    # create index with quantizers
    use_pipeline = False
    if index_config.has_key('pipeline'):
        use_pipeline = index_config.get_value('pipeline')

    # single cycle
    if not use_pipeline:
        logger.log(logger.INFO, 'Start index creation (single cycle)')
        start = time.time()
        index, coarse_counts, fine_counts = create_index_with_faiss(vectors[:vectors_size], cq, codebook, logger)
        end = time.time()
        logger.log(logger.INFO, 'Finish index creation after ' + str(end - start) + ' seconds')
        # add to file
        if index_config.get_value('export_filename'):
            index_data = {
                'words': words,
                'cq': cq,
                'codebook': codebook,
                'index': index,
                'coarse_counts': coarse_counts,
                'fine_counts': fine_counts
            }
            im.save_index(index_data, index_config.get_value('export_filename'))

        if index_config.get_value('add_to_database'):
            add_to_database(words, cq, codebook, index, coarse_counts, fine_counts, con, cur, index_config, batch_size, logger)
            logger.log(logger.INFO, 'Create database index structures')
            utils.create_index(index_config.get_value('fine_table_name'), index_config.get_value('fine_word_index_name'), 'word', con, cur, logger)
            utils.create_index(index_config.get_value('fine_table_name'), index_config.get_value('fine_coarse_index_name'), 'coarse_id', con, cur, logger)
            utils.enable_triggers(index_config.get_value('fine_table_name'), con, cur)

    # pipeline approach
    if use_pipeline:
        logger.log(logger.INFO, 'Start index creation (pipeline)')
        start = time.time()
        feeder = VectorFeeder(vectors[:vectors_size], words)
        m = len(codebook)  # number of subquantizers, re-derived from the stored codebook
        len_centr = int(len(vectors[0]) / m)
        calculation = IVFADCIndexCreator(cq, codebook, m, len_centr, logger)
        fine_counts = dict()
        coarse_counts = dict()
        output_file = None
        if index_config.get_value('export_pipeline_data'):
            output_file = open(index_config.get_value('export_pipeline_data'), 'wb')
        while feeder.has_next():
            # calculate
            batch, word_batch = feeder.get_next_batch(batch_size)
            entries, coarse_counts, fine_counts = calculation.index_batch(batch)
            # write to database or add to file
            if index_config.get_value('add_to_database'):
                # add to database
                add_batch_to_database(word_batch, entries, con, cur, index_config, batch_size, logger)
                logger.log(logger.INFO, 'Added ' + str(feeder.get_cursor() - batch_size + len(batch)) + ' vectors to the database')
            if index_config.get_value('export_pipeline_data'):
                # write to file
                index_batch = {
                    'words': word_batch,
                    'index': entries,
                }
                count_data = {
                    'coarse_counts': coarse_counts,
                    'fine_counts': fine_counts
                }
                pickle.dump(index_batch, output_file)
                with open(index_config.get_value('export_pipeline_data') + '.tmp', 'wb') as f:
                    pickle.dump(count_data, f)
                logger.log(logger.INFO, 'Processed ' + str(feeder.get_cursor() - batch_size + len(batch)) + ' vectors')
        if output_file:
            output_file.close()
        if index_config.get_value('add_to_database'):
            # add codebook and cq to database
            add_codebook_to_database(codebook, fine_counts, con, cur, index_config)
            logger.log(Logger.INFO, 'Added residual codebook entries into database')
            add_cq_to_database(cq, coarse_counts, con, cur, index_config)
            logger.log(Logger.INFO, 'Added coarse quantizer entries into database')
            logger.log(logger.INFO, 'Create database index structures')
            utils.create_index(index_config.get_value('fine_table_name'), index_config.get_value('fine_word_index_name'), 'word', con, cur, logger)
            utils.create_index(index_config.get_value('fine_table_name'), index_config.get_value('fine_coarse_index_name'), 'coarse_id', con, cur, logger)
            utils.enable_triggers(index_config.get_value('fine_table_name'), con, cur)

        end = time.time()
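The pipeline branch above depends on a VectorFeeder that hands out fixed-size batches and tracks a cursor. Its implementation is also not shown; below is a minimal in-memory sketch consistent with the calls has_next, get_next_batch, and get_cursor (the cursor advances by the full batch size, which matches the progress arithmetic in the logging statements; the real class may instead stream vectors from disk):

# Minimal sketch of the assumed VectorFeeder batching helper.
class VectorFeeder:
    def __init__(self, vectors, words):
        self._vectors = vectors
        self._words = words
        self._cursor = 0

    def has_next(self):
        return self._cursor < len(self._vectors)

    def get_next_batch(self, batch_size):
        # The last batch may be shorter than batch_size.
        batch = self._vectors[self._cursor:self._cursor + batch_size]
        word_batch = self._words[self._cursor:self._cursor + batch_size]
        self._cursor += batch_size
        return batch, word_batch

    def get_cursor(self):
        return self._cursor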