def main(argc, argv):
    """Entry point: load word vectors, compute similarity curves, plot them.

    argv[1], when present, overrides the default vector file path.
    """
    vec_path = argv[1] if argc > 1 else '../vectors/google_vecs.txt'
    words, vecs, size = utils.get_vectors(vec_path)
    print(size)
    sim_values, dis_sim_values = calculate_similarity_values(words, vecs, size)
    print(sim_values)
    print(dis_sim_values)
    # NOTE(review): slicing is asymmetric (first sim value dropped, last
    # dis-sim value dropped) — presumably to align the two series for
    # plotting; confirm against plot_graph's expectations.
    plot_graph(sim_values[1:], dis_sim_values[:-1])
def main(argc, argv):
    """Build an IVFADC-style index (single cycle) and export it to the database.

    Expects argv[1] to name an index-configuration JSON file; the database
    configuration is always read from config/db_config.json. Trains (or
    loads) a coarse quantizer and a fine codebook, indexes all vectors in
    one pass, writes the result to the database, and creates the database
    index structures plus optional statistics.
    """
    db_config = Configuration('config/db_config.json')
    logger = Logger(db_config.get_value('log'))
    if argc < 2:
        logger.log(Logger.ERROR, 'Configuration file for index creation required')
        return
    index_config = Configuration(argv[1])

    batch_size = db_config.get_value("batch_size")
    train_size_coarse = index_config.get_value('train_size_coarse')
    train_size_fine = index_config.get_value('train_size_fine')
    centr_num_coarse = index_config.get_value('k_coarse')
    m = index_config.get_value('m')
    k = index_config.get_value('k')

    # get vectors
    words, vectors, vectors_size = \
        utils.get_vectors(index_config.get_value('vec_file_path'), logger)
    logger.log(logger.INFO, 'vectors_size :' + str(vectors_size))

    # determine coarse quantizer (loaded from file when configured,
    # otherwise trained and stored under the default name)
    cq_filename = index_config.get_value('coarse_quantizer_file') if \
        index_config.has_key('coarse_quantizer_file') else None
    cq_output_name = cq_filename if cq_filename is not None else 'coarse_quantizer.pcl'
    if COARSE_TYPE == 'MULTI_INDEX':
        # multi-index: a product quantizer with 2 subspaces serves as the
        # coarse stage
        cq = qcreator.construct_quantizer(
            qcreator.create_quantizer,
            (vectors[:train_size_fine], 2, centr_num_coarse, logger), logger,
            input_name=cq_filename, output_name=cq_output_name)
    else:
        cq = qcreator.construct_quantizer(
            qcreator.create_coarse_quantizer,
            (vectors[:train_size_coarse], centr_num_coarse), logger,
            input_name=cq_filename, output_name=cq_output_name)

    # determine codebook (fine quantizer), analogous load-or-train logic
    codebook_filename = index_config.get_value('codebook_file') if \
        index_config.has_key('codebook_file') else None
    codebook_output_name = codebook_filename if codebook_filename is not None else 'codebook.pcl'
    codebook = qcreator.construct_quantizer(
        qcreator.create_quantizer,
        (vectors[:train_size_fine], m, k, logger), logger,
        input_name=codebook_filename, output_name=codebook_output_name)

    # create db connection
    con, cur = db_export.create_connection(db_config, logger)

    # prepare database: (re)create tables and disable triggers for bulk load
    utils.init_tables(con, cur, get_table_information(index_config), logger)
    utils.disable_triggers(index_config.get_value('fine_table_name'), con, cur)

    # create index with quantizers
    logger.log(logger.INFO, 'Start index creation (single cycle)')
    start = time.time()
    index, coarse_counts, fine_counts = \
        create_index_data(vectors[:vectors_size], cq, codebook, logger)
    end = time.time()
    logger.log(logger.INFO,
               'Finish index creation after ' + str(end - start) + ' seconds')

    # add to database
    add_to_database(words, cq, codebook, index, coarse_counts, fine_counts,
                    con, cur, index_config, batch_size, logger)
    logger.log(logger.INFO, 'Create database index structures')
    # NOTE(review): the word index is built on column 'id' here, while the
    # sibling variant of this script indexes 'word' — confirm which column
    # the fine table actually uses.
    utils.create_index(index_config.get_value('fine_table_name'),
                       index_config.get_value('fine_word_index_name'),
                       'id', con, cur, logger)
    utils.create_index(index_config.get_value('fine_table_name'),
                       index_config.get_value('fine_coarse_index_name'),
                       'coarse_id', con, cur, logger)

    # create statistics (optional, gated by config keys and global flag)
    if (index_config.has_key('statistic_table')
            and index_config.has_key('statistic_column')
            and CREATE_STATS_TABLE):
        utils.create_statistics_table(
            index_config.get_value('statistic_table'),
            index_config.get_value('statistic_column'),
            index_config.get_value('coarse_table_name'),
            con, cur, logger)
    utils.enable_triggers(index_config.get_value('fine_table_name'), con, cur)
def main(argc, argv):
    """Build an IVFADC index and export it to a database and/or pickle files.

    Expects argv[1] to name an index-configuration JSON file; the database
    configuration is read from config/db_config.json. Supports two modes,
    selected by the optional 'pipeline' config key:

    * single cycle — index all vectors at once via FAISS, then export;
    * pipeline — stream batches through an IVFADCIndexCreator, writing each
      batch to the database and/or a pickle stream as it is produced.
    """
    db_config = Configuration('config/db_config.json')
    logger = Logger(db_config.get_value('log'))
    if argc < 2:
        logger.log(Logger.ERROR, 'Configuration file for index creation required')
        return
    index_config = Configuration(argv[1])

    batch_size = db_config.get_value("batch_size")
    train_size_coarse = index_config.get_value('train_size_coarse')
    train_size_fine = index_config.get_value('train_size_fine')
    centr_num_coarse = index_config.get_value('k_coarse')
    m = index_config.get_value('m')
    k = index_config.get_value('k')

    # get vectors
    words, vectors, vectors_size = \
        utils.get_vectors(index_config.get_value('vec_file_path'), logger)
    logger.log(logger.INFO, 'vectors_size :' + str(vectors_size))

    # determine coarse quantizer: load a pickled one when configured,
    # otherwise train a new one and persist it under the default name
    cq = None
    if index_config.has_key('coarse_quantizer_file'):
        cq_filename = index_config.get_value('coarse_quantizer_file')
        if cq_filename:
            logger.log(Logger.INFO, 'Use coarse quantizer from ' + cq_filename)
            cq = qcreator.load_quantizer(cq_filename)
    if cq is None:
        logger.log(Logger.INFO, 'Create new coarse quantizer')
        cq = qcreator.create_coarse_quantizer(
            vectors[:train_size_coarse], centr_num_coarse)
        qcreator.store_quantizer(cq, 'coarse_quantizer.pcl')

    # determine residual codebook: same load-or-train-and-store logic
    codebook = None
    if index_config.has_key('residual_codebook_file'):
        codebook_filename = index_config.get_value('residual_codebook_file')
        if codebook_filename:
            logger.log(Logger.INFO,
                       'Use residual codebook from ' + codebook_filename)
            codebook = qcreator.load_quantizer(codebook_filename)
    if codebook is None:
        logger.log(Logger.INFO, 'Create new residual codebook')
        # calculate codebook based on residuals w.r.t. the coarse quantizer
        codebook = create_fine_quantizer(
            cq, vectors[:train_size_fine], m, k, logger)
        qcreator.store_quantizer(codebook, 'residual_codebook.pcl')

    con = None
    cur = None
    if index_config.get_value('add_to_database'):
        # create db connection; keyword arguments let psycopg2 handle the
        # quoting (safe even when credentials contain quote characters)
        try:
            con = psycopg2.connect(
                dbname=db_config.get_value('db_name'),
                user=db_config.get_value('username'),
                host=db_config.get_value('host'),
                password=db_config.get_value('password'))
        except Exception:
            logger.log(logger.ERROR, 'Can not connect to database')
            return
        cur = con.cursor()
        utils.init_tables(con, cur, get_table_information(index_config), logger)
        utils.disable_triggers(
            index_config.get_value('fine_table_name'), con, cur)

    # create index with quantizers
    use_pipeline = False
    if index_config.has_key('pipeline'):
        use_pipeline = index_config.get_value('pipeline')

    # single cycle
    if not use_pipeline:
        logger.log(logger.INFO, 'Start index creation (single cycle)')
        start = time.time()
        index, coarse_counts, fine_counts = create_index_with_faiss(
            vectors[:vectors_size], cq, codebook, logger)
        end = time.time()
        logger.log(logger.INFO,
                   'Finish index creation after ' + str(end - start) + ' seconds')
        # add to file
        if index_config.get_value('export_filename'):
            index_data = dict({
                'words': words,
                'cq': cq,
                'codebook': codebook,
                'index': index,
                'coarse_counts': coarse_counts,
                'fine_counts': fine_counts
            })
            im.save_index(index_data, index_config.get_value('export_filename'))
        if index_config.get_value('add_to_database'):
            add_to_database(words, cq, codebook, index, coarse_counts,
                            fine_counts, con, cur, index_config, batch_size,
                            logger)
            logger.log(logger.INFO, 'Create database index structures')
            utils.create_index(index_config.get_value('fine_table_name'),
                               index_config.get_value('fine_word_index_name'),
                               'word', con, cur, logger)
            utils.create_index(index_config.get_value('fine_table_name'),
                               index_config.get_value('fine_coarse_index_name'),
                               'coarse_id', con, cur, logger)
            utils.enable_triggers(
                index_config.get_value('fine_table_name'), con, cur)

    # pipeline approach
    if use_pipeline:
        logger.log(logger.INFO, 'Start index creation (pipeline)')
        start = time.time()
        feeder = VectorFeeder(vectors[:vectors_size], words)
        m = len(codebook)
        len_centr = int(len(vectors[0]) / m)
        calculation = IVFADCIndexCreator(cq, codebook, m, len_centr, logger)
        fine_counts = dict()
        coarse_counts = dict()
        output_file = None
        if index_config.get_value('export_pipeline_data'):
            output_file = open(
                index_config.get_value('export_pipeline_data'), 'wb')
        while feeder.has_next():
            # calculate one batch
            batch, word_batch = feeder.get_next_batch(batch_size)
            # NOTE(review): counts are overwritten per batch rather than
            # merged — assumes index_batch returns cumulative counts; confirm.
            entries, coarse_counts, fine_counts = calculation.index_batch(batch)
            # write to database or add to file
            if index_config.get_value('add_to_database'):
                add_batch_to_database(word_batch, entries, con, cur,
                                      index_config, batch_size, logger)
                logger.log(logger.INFO, 'Added ' +
                           str(feeder.get_cursor() - batch_size + len(batch)) +
                           ' vectors to the database')
            if index_config.get_value('export_pipeline_data'):
                index_batch = dict({
                    'words': word_batch,
                    'index': entries,
                })
                count_data = dict({
                    'coarse_counts': coarse_counts,
                    'fine_counts': fine_counts
                })
                pickle.dump(index_batch, output_file)
                # counts side file is rewritten each batch with the latest
                # snapshot; 'with' guarantees it is closed even on error
                with open(index_config.get_value('export_pipeline_data')
                          + '.tmp', 'wb') as f:
                    pickle.dump(count_data, f)
                logger.log(logger.INFO, 'Processed ' +
                           str(feeder.get_cursor() - batch_size + len(batch)) +
                           ' vectors')
        if output_file:
            output_file.close()
        if index_config.get_value('add_to_database'):
            # add codebook and cq to database
            add_codebook_to_database(codebook, fine_counts, con, cur,
                                     index_config)
            logger.log(Logger.INFO,
                       'Added residual codebook entries into database')
            add_cq_to_database(cq, coarse_counts, con, cur, index_config)
            logger.log(Logger.INFO,
                       'Added coarse quantizer entries into database')
            logger.log(logger.INFO, 'Create database index structures')
            utils.create_index(index_config.get_value('fine_table_name'),
                               index_config.get_value('fine_word_index_name'),
                               'word', con, cur, logger)
            utils.create_index(index_config.get_value('fine_table_name'),
                               index_config.get_value('fine_coarse_index_name'),
                               'coarse_id', con, cur, logger)
            utils.enable_triggers(
                index_config.get_value('fine_table_name'), con, cur)
        end = time.time()