def create_search_index(argv=None):
    """Create NMSLib index and a reverse lookup CSV file.

    Reads every CSV file matching ``*index*.csv`` under ``args.data_dir``.
    The last column of each row holds the embedding as a comma-separated
    string of floats; it is parsed and collected for the index. The
    remaining columns are written to a single combined lookup CSV, in the
    same order the embeddings are collected, so the i-th lookup row belongs
    to the i-th embedding handed to the index builder.

    Both outputs are first written under ``args.tmp_dir`` (created if it
    does not exist) and then copied to ``args.lookup_file`` and
    ``args.index_file`` with ``tf.gfile.Copy``.

    Blank lines in the input CSV files are skipped.

    Args:
      argv: A list of strings representing command line arguments.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    args = arguments.parse_arguments(argv)

    if not os.path.isdir(args.tmp_dir):
        logging.info("Creating directory %s", args.tmp_dir)
        os.makedirs(args.tmp_dir)

    # Work on local temporary copies; the final destinations are only
    # written via tf.gfile.Copy at the end.
    tmp_index_file = os.path.join(args.tmp_dir,
                                  os.path.basename(args.index_file))
    tmp_lookup_file = os.path.join(args.tmp_dir,
                                   os.path.basename(args.lookup_file))

    embeddings_data = []

    with open(tmp_lookup_file, 'w') as lookup_file:
        lookup_writer = csv.writer(lookup_file)

        for csv_file_path in tf.gfile.Glob(
                '{}/*index*.csv'.format(args.data_dir)):
            logging.info('Reading %s', csv_file_path)

            with tf.gfile.Open(csv_file_path) as csv_file:
                for row in csv.reader(csv_file):
                    # csv.reader yields an empty list for a blank line;
                    # indexing row[-1] on it would raise IndexError. Skip
                    # it before touching either output so the lookup rows
                    # and the embeddings stay aligned.
                    if not row:
                        continue

                    embedding_vector = [
                        float(value) for value in row[-1].split(',')
                    ]
                    embeddings_data.append(embedding_vector)

                    # Everything except the embedding column goes to the
                    # reverse lookup file.
                    lookup_writer.writerow(row[:-1])

    embeddings_data = np.array(embeddings_data)

    search_engine.CodeSearchEngine.create_index(embeddings_data,
                                                tmp_index_file)

    logging.info("Copying file %s to %s", tmp_lookup_file, args.lookup_file)
    tf.gfile.Copy(tmp_lookup_file, args.lookup_file)

    logging.info("Copying file %s to %s", tmp_index_file, args.index_file)
    tf.gfile.Copy(tmp_index_file, args.index_file)

    logging.info("Finished creating the index")
def start_search_server(argv=None):
    """Start a Flask REST server.

    Loads the reverse-lookup CSV named by ``args.lookup_file`` into memory,
    makes sure a local copy of the index file exists under ``args.tmp_dir``,
    builds a query-embedding function, and then runs a ``CodeSearchServer``
    backed by a ``CodeSearchEngine`` on ``args.host``/``args.port``. The
    server is also given ``args.ui_dir`` for the UI.

    Args:
      argv: A list of strings representing command line arguments.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    args = arguments.parse_arguments(argv)

    if not os.path.isdir(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    # Pull the whole reverse-lookup table into memory, one list per CSV row.
    tf.logging.debug('Reading {}'.format(args.lookup_file))
    with tf.gfile.Open(args.lookup_file) as lookup_file:
        lookup_data = list(csv.reader(lookup_file))

    # The index is used from a local path; copy it only when no local
    # copy is present yet.
    local_index_path = os.path.join(args.tmp_dir,
                                    os.path.basename(args.index_file))
    tf.logging.debug('Reading {}'.format(args.index_file))
    if not os.path.isfile(local_index_path):
        tf.gfile.Copy(args.index_file, local_index_path)

    # Build an encoder for the natural language strings.
    encoder = build_query_encoder(args.problem, args.data_dir,
                                  embed_code=False)
    embedding_fn = functools.partial(embed_query, encoder, args.serving_url)

    engine = CodeSearchEngine(local_index_path, lookup_data, embedding_fn)
    server = CodeSearchServer(engine, args.ui_dir,
                              host=args.host, port=args.port)
    server.run()