Example #1
def main(argc, argv):

    if argc > 2:
        db_config = Configuration(argv[2])
    else:
        db_config = Configuration('config/db_config.json')

    logger = Logger(db_config.get_value('log'))

    if argc < 2:
        logger.log(Logger.ERROR,
                   'Configuration file for index creation required')
        return
    vec_config = Configuration(argv[1])

    user = db_config.get_value('username')
    password = db_config.get_value('password')
    host = db_config.get_value('host')
    db_name = db_config.get_value('db_name')

    # init db connection
    try:
        con = psycopg2.connect("dbname='" + db_name + "' user='******' host='" + host + "' password='******'")
    except:
        logger.log(Logger.ERROR, 'Can not connect to database')
        return

    cur = con.cursor()

    init_tables(con, cur, vec_config.get_value('table_name'), logger)

    insert_vectors(vec_config.get_value('vec_file_path'), con, cur,
                   vec_config.get_value('table_name'),
                   db_config.get_value('batch_size'),
                   vec_config.get_value('normalized'), logger)

    # commit changes
    con.commit()

    # create index
    utils.create_index(vec_config.get_value('table_name'),
                       vec_config.get_value('index_name'), 'word', con, cur,
                       logger)

    # close connection
    con.close()
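
A minimal way to drive this C-style entry point, assuming the usual sys.argv convention (the __main__ guard itself does not appear in the snippets):

    if __name__ == '__main__':
        import sys
        # argv[0] is the script name, so argc mirrors C's argument count;
        # argv[1] = vector/index config, optional argv[2] = db config
        main(len(sys.argv), sys.argv)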
Example #2
def add_to_database(db_config, index_config, type, index_file, logger):

    # create db connection
    con = None
    try:
        con = psycopg2.connect("dbname='" + db_config.get_value('db_name') +
                               "' user='******'username') +
                               "' host='" + db_config.get_value('host') +
                               "' password='******'password') + "'")
    except:
        logger.log(Logger.ERROR, 'Can not connect to database')
        return
    cur = con.cursor()

    if type == 'pq':
        data = im.load_index(index_file)
        utils.init_tables(con, cur,
                          pq_index.get_table_information(index_config), logger)
        pq_index.add_to_database(data['words'], data['codebook'],
                                 data['index'], data['counts'], con,
                                 cur, index_config,
                                 db_config.get_value('batch_size'), logger)

        utils.create_index(index_config.get_value("pq_table_name"),
                           index_config.get_value("pq_index_name"), 'word',
                           con, cur, logger)

    elif type == 'ivfadc':
        data = im.load_index(index_file)
        utils.init_tables(con, cur, ivfadc.get_table_information(index_config),
                          logger)

        ivfadc.add_to_database(data['words'], data['cq'], data['codebook'],
                               data['index'], data['coarse_counts'],
                               data['fine_counts'], con, cur, index_config,
                               db_config.get_value('batch_size'), logger)

        utils.create_index(index_config.get_value('fine_table_name'),
                           index_config.get_value('fine_word_index_name'),
                           'word', con, cur, logger)
        utils.create_index(index_config.get_value('fine_table_name'),
                           index_config.get_value('fine_coarse_index_name'),
                           'coarse_id', con, cur, logger)
    elif type == 'ivfadc_pipeline':
        # TODO test
        data = im.load_pipeline_ivfadc_index(
            index_file, index_file + '.tmp',
            index_config.get_value('coarse_quantizer_file'),
            index_config.get_value('residual_codebook_file'))
        utils.init_tables(con, cur, ivfadc.get_table_information(index_config),
                          logger)

        ivfadc.add_to_database(data['words'], data['cq'], data['codebook'],
                               data['index'], data['coarse_counts'],
                               data['fine_counts'], con, cur, index_config,
                               db_config.get_value('batch_size'), logger)

        utils.create_index(index_config.get_value('fine_table_name'),
                           index_config.get_value('fine_word_index_name'),
                           'word', con, cur, logger)
        utils.create_index(index_config.get_value('coarse_table_name'),
                           index_config.get_value('fine_coarse_index_name'),
                           'coarse_id', con, cur, logger)
    elif type == 'pq_pipeline':
        # TODO test
        data = im.load_pipeline_pq_index(
            index_file, index_file + '.tmp',
            index_config.get_value('codebook_file'))
        utils.init_tables(con, cur, pq.get_table_information(index_config),
                          logger)

        pq.add_to_database(data['words'], data['codebook'], data['index'],
                           data['counts'], con, cur, index_config,
                           db_config.get_value('batch_size'), logger)

        utils.create_index(index_config.get_value("pq_table_name"),
                           index_config.get_value("pq_index_name"), 'word',
                           con, cur, logger)
    else:
        logger.log(logger.WARNING, 'Index type ' + str(type) + ' unknown')
    return
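
The database examples all assemble the libpq connection string by concatenation, which breaks as soon as a password contains a single quote. psycopg2.connect also accepts keyword arguments, so a quoting-safe equivalent of the pattern above would be (a sketch, not code from this project):

    import psycopg2

    def connect_from_config(db_config):
        # keyword arguments let psycopg2/libpq handle quoting and escaping
        return psycopg2.connect(dbname=db_config.get_value('db_name'),
                                user=db_config.get_value('username'),
                                host=db_config.get_value('host'),
                                password=db_config.get_value('password'))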
Example #3
def main(argc, argv):
    db_config = Configuration('config/db_config.json')
    logger = Logger(db_config.get_value('log'))
    if argc < 2:
        logger.log(Logger.ERROR,
                   'Configuration file for index creation required')
        return
    index_config = Configuration(argv[1])
    batch_size = db_config.get_value("batch_size")

    train_size_coarse = index_config.get_value('train_size_coarse')
    train_size_fine = index_config.get_value('train_size_fine')
    centr_num_coarse = index_config.get_value('k_coarse')
    m = index_config.get_value('m')
    k = index_config.get_value('k')

    # get vectors
    words, vectors, vectors_size = \
        utils.get_vectors(index_config.get_value('vec_file_path'), logger)
    logger.log(logger.INFO, 'vectors_size: ' + str(vectors_size))

    # determine coarse quantizer
    cq = None
    cq_filename = index_config.get_value('coarse_quantizer_file') if \
        index_config.has_key('coarse_quantizer_file') else None
    cq_output_name = cq_filename if cq_filename is not None else 'coarse_quantizer.pcl'
    if COARSE_TYPE == 'MULTI_INDEX':
        cq = qcreator.construct_quantizer(
            qcreator.create_quantizer,
            (vectors[:train_size_fine], 2, centr_num_coarse, logger),
            logger,
            input_name=cq_filename,
            output_name=cq_output_name)
    else:
        cq = qcreator.construct_quantizer(
            qcreator.create_coarse_quantizer,
            (vectors[:train_size_coarse], centr_num_coarse),
            logger,
            input_name=cq_filename,
            output_name=cq_output_name)

    # determine codebook
    codebook = None
    codebook_filename = index_config.get_value('codebook_file') if \
        index_config.has_key('codebook_file') else None
    codebook_output_name = codebook_filename if codebook_filename is not None else 'codebook.pcl'
    codebook = qcreator.construct_quantizer(
        qcreator.create_quantizer, (vectors[:train_size_fine], m, k, logger),
        logger,
        input_name=codebook_filename,
        output_name=codebook_output_name)

    # create db connection
    con, cur = db_export.create_connection(db_config, logger)

    # prepare database
    utils.init_tables(con, cur, get_table_information(index_config), logger)
    utils.disable_triggers(index_config.get_value('fine_table_name'), con, cur)

    # create index with quantizers
    logger.log(logger.INFO, 'Start index creation (single cycle)')
    start = time.time()
    index, coarse_counts, fine_counts = \
        create_index_data(vectors[:vectors_size], cq, codebook, logger)
    end = time.time()
    logger.log(logger.INFO,
               'Finish index creation after ' + str(end - start) + ' seconds')
    # add to database
    add_to_database(words, cq, codebook, index, coarse_counts, fine_counts,
                    con, cur, index_config, batch_size, logger)
    logger.log(logger.INFO, 'Create database index structures')
    utils.create_index(index_config.get_value('fine_table_name'),
                       index_config.get_value('fine_word_index_name'), 'id',
                       con, cur, logger)
    utils.create_index(index_config.get_value('fine_table_name'),
                       index_config.get_value('fine_coarse_index_name'),
                       'coarse_id', con, cur, logger)

    # create statistics
    if (index_config.has_key('statistic_table')
            and index_config.has_key('statistic_column')
            and CREATE_STATS_TABLE):
        utils.create_statistics_table(
            index_config.get_value('statistic_table'),
            index_config.get_value('statistic_column'),
            index_config.get_value('coarse_table_name'), con, cur, logger)

    utils.enable_triggers(index_config.get_value('fine_table_name'), con, cur)
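
Example #3 is the only snippet that hides connection setup behind db_export.create_connection(db_config, logger). That helper is not shown on this page; based on how the other examples connect, it plausibly looks like the sketch below (only the function name and the (con, cur) call shape come from the snippet, the body is an assumption):

    import psycopg2

    def create_connection(db_config, logger):
        # returns the (connection, cursor) pair unpacked by Example #3
        try:
            con = psycopg2.connect(dbname=db_config.get_value('db_name'),
                                   user=db_config.get_value('username'),
                                   host=db_config.get_value('host'),
                                   password=db_config.get_value('password'))
        except psycopg2.Error:
            logger.log(logger.ERROR, 'Cannot connect to database')
            raise
        return con, con.cursor()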
Example #4
def main(argc, argv):
    db_config = Configuration('config/db_config.json')
    logger = Logger(db_config.get_value('log'))
    if argc < 2:
        logger.log(Logger.ERROR, 'Configuration file for index creation required')
        return
    index_config = Configuration(argv[1])

    batch_size = db_config.get_value("batch_size")

    train_size_coarse = index_config.get_value('train_size_coarse')
    train_size_fine = index_config.get_value('train_size_fine')
    centr_num_coarse = index_config.get_value('k_coarse')
    m = index_config.get_value('m')
    k = index_config.get_value('k')

    # get vectors
    words, vectors, vectors_size = utils.get_vectors(index_config.get_value('vec_file_path'), logger)
    logger.log(logger.INFO, 'vectors_size: ' + str(vectors_size))

    # determine coarse quantizer
    cq = None
    if index_config.has_key('coarse_quantizer_file'):
        cq_filename = index_config.get_value('coarse_quantizer_file')
        if cq_filename:
            logger.log(Logger.INFO, 'Use coarse quantizer from ' + cq_filename)
            cq = qcreator.load_quantizer(cq_filename)
    if cq is None:
        logger.log(Logger.INFO, 'Create new coarse quantizer')
        # create coarse quantizer
        cq = qcreator.create_coarse_quantizer(vectors[:train_size_coarse], centr_num_coarse)
        # store coarse quantizer
        qcreator.store_quantizer(cq, 'coarse_quantizer.pcl')

    # determine codebook
    codebook = None
    if index_config.has_key('residual_codebook_file'):
        codebook_filename = index_config.get_value('residual_codebook_file')
        if codebook_filename:
            logger.log(Logger.INFO, 'Use residual codebook from ' + codebook_filename)
            codebook = qcreator.load_quantizer(codebook_filename)
    if codebook is None:
        logger.log(Logger.INFO, 'Create new residual codebook')
        # calculate codebook based on residuals
        codebook = create_fine_quantizer(cq, vectors[:train_size_fine], m, k, logger)
        # store codebook
        qcreator.store_quantizer(codebook, 'residual_codebook.pcl')

    con = None
    cur = None
    if (index_config.get_value('add_to_database')):
        # create db connection
        try:
            con = psycopg2.connect("dbname='" + db_config.get_value('db_name') + "' user='******'username') + "' host='" + db_config.get_value('host') + "' password='******'password') + "'")
        except:
            logger.log(logger.ERROR, 'Can not connect to database')
            return
        cur = con.cursor()

        utils.init_tables(con, cur, get_table_information(index_config), logger)
        utils.disable_triggers(index_config.get_value('fine_table_name'), con, cur)

    # create index with quantizers
    use_pipeline = False
    if index_config.has_key('pipeline'):
        use_pipeline = index_config.get_value('pipeline')

    # single cycle
    if not use_pipeline:
        logger.log(logger.INFO, 'Start index creation (single cycle)')
        start = time.time()
        index, coarse_counts, fine_counts = create_index_with_faiss(vectors[:vectors_size], cq, codebook, logger)
        end = time.time()
        logger.log(logger.INFO, 'Finish index creation after ' + str(end - start) + ' seconds')
        # add to file
        if (index_config.get_value('export_filename')):
            index_data = dict({
                'words': words,
                'cq': cq,
                'codebook': codebook,
                'index': index,
                'coarse_counts': coarse_counts,
                'fine_counts': fine_counts
            })
            im.save_index(index_data, index_config.get_value('export_filename'))

        if (index_config.get_value('add_to_database')):

            add_to_database(words, cq, codebook, index, coarse_counts, fine_counts, con, cur, index_config, batch_size, logger)
            logger.log(logger.INFO, 'Create database index structures')
            utils.create_index(index_config.get_value('fine_table_name'), index_config.get_value('fine_word_index_name'), 'word', con, cur, logger)
            utils.create_index(index_config.get_value('fine_table_name'), index_config.get_value('fine_coarse_index_name'), 'coarse_id', con, cur, logger)
            utils.enable_triggers(index_config.get_value('fine_table_name'), con, cur)

    # pipeline approach
    if use_pipeline:
        logger.log(logger.INFO, 'Start index creation (pipeline)')
        start = time.time()
        feeder = VectorFeeder(vectors[:vectors_size], words)
        m = len(codebook)
        len_centr = int(len(vectors[0]) / m)
        calculation = IVFADCIndexCreator(cq, codebook, m, len_centr, logger)
        fine_counts = dict()
        coarse_counts = dict()
        output_file = None
        if (index_config.get_value('export_pipeline_data')):
            output_file = open(index_config.get_value('export_pipeline_data'), 'wb')
        while (feeder.has_next()):
            # calculate
            batch, word_batch = feeder.get_next_batch(batch_size)
            entries, coarse_counts, fine_counts = calculation.index_batch(batch)
            # write to database or add to file
            if (index_config.get_value('add_to_database')):
                # add to database
                add_batch_to_database(word_batch, entries, con, cur, index_config, batch_size, logger)
                logger.log(logger.INFO, 'Added ' + str(feeder.get_cursor() - batch_size + len(batch)) + ' vectors to the database')
            if (index_config.get_value('export_pipeline_data')):
                # write to file
                index_batch = dict({
                    'words': word_batch,
                    'index': entries,
                })
                count_data = dict({
                    'coarse_counts': coarse_counts,
                    'fine_counts': fine_counts
                })
                pickle.dump(index_batch, output_file)
                f = open(index_config.get_value('export_pipeline_data')+'.tmp', 'wb')
                pickle.dump(count_data, f)
                f.close()
                logger.log(logger.INFO, 'Processed ' + str(feeder.get_cursor() - batch_size + len(batch)) + ' vectors')
        if output_file:
            output_file.close()
        if (index_config.get_value('add_to_database')):
            # add codebook and cq to database
            add_codebook_to_database(codebook, fine_counts, con, cur, index_config)
            logger.log(Logger.INFO, 'Added residual codebook entries into database')
            add_cq_to_database(cq, coarse_counts, con, cur, index_config)
            logger.log(Logger.INFO, 'Added coarse quantizer entries into database')
            logger.log(logger.INFO, 'Create database index structures')
            utils.create_index(index_config.get_value('fine_table_name'), index_config.get_value('fine_word_index_name'), 'word', con, cur, logger)
            utils.create_index(index_config.get_value('fine_table_name'), index_config.get_value('fine_coarse_index_name'), 'coarse_id', con, cur, logger)
            utils.enable_triggers(index_config.get_value('fine_table_name'), con, cur)

        end = time.time()
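
Example #4 reads a large number of configuration keys; collecting them in one place makes the two JSON files easier to write. A minimal sketch of both configs, generated from Python (all values are placeholders, not taken from the source; get_table_information and the add_to_database helpers may require further keys):

    import json

    # keys read through db_config.get_value(...) across the examples
    db_config = {
        'db_name': 'postgres',     # placeholder
        'username': 'postgres',    # placeholder
        'password': 'secret',      # placeholder
        'host': 'localhost',
        'log': 'log.txt',
        'batch_size': 50000
    }

    # keys read through index_config.get_value(...)/has_key(...) in Example #4
    index_config = {
        'train_size_coarse': 100000,
        'train_size_fine': 100000,
        'k_coarse': 1000,
        'm': 12,
        'k': 256,
        'vec_file_path': 'vectors.txt',
        'coarse_quantizer_file': 'coarse_quantizer.pcl',    # optional
        'residual_codebook_file': 'residual_codebook.pcl',  # optional
        'pipeline': False,                                  # optional
        'add_to_database': True,
        'export_filename': 'index.pcl',
        'export_pipeline_data': 'pipeline_data.pcl',        # pipeline mode only
        'fine_table_name': 'fine_quantization',             # placeholder
        'fine_word_index_name': 'fine_word_idx',            # placeholder
        'fine_coarse_index_name': 'fine_coarse_idx'         # placeholder
    }

    with open('config/db_config.json', 'w') as f:
        json.dump(db_config, f, indent=4)
    with open('config/index_config.json', 'w') as f:  # filename is a placeholder
        json.dump(index_config, f, indent=4)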