def search(ctx):
    """Interactive search loop over the document index.

    Builds document embeddings and a nearest-neighbour index, then
    repeatedly prompts for a query, embeds it with the inference model,
    and prints the 3 closest documents. Loop ends on empty input or
    the literal string 'exit'.
    """
    from officeanswers.search import build_search_index
    logger.info("Build search index...")
    config = ctx.obj['CONFIG']
    preprocess_dir = config.paths['preprocess_dir']
    embed_model = get_inference_model(config)
    pre = engine.load_preprocessor(preprocess_dir, config.net_name)
    docs, embeds = build_document_embeddings(config)
    search = build_search_index(embeds)
    query = click.prompt("\nWhat do you want to search?\n", type=str)
    while query and query != 'exit':
        sparse_input = pre.transform_list([query])[0]
        # Model expects a batch dimension; wrap the single query.
        sparse_input = np.expand_dims(sparse_input, axis=0)
        dense_input = embed_model.predict(sparse_input)[0]
        idxs, _dists = search.knnQuery(dense_input, k=3)
        for idx in idxs:
            print("\n----------------\n", docs[idx])
        query = click.prompt("\nWhat do you want to search?\n", type=str)
def test_dssm(train, test):
    """Test DSSM model end-to-end: preprocess, train, save, reload, predict."""
    # do pre-processing.
    dssm_preprocessor = preprocessor.DSSMPreprocessor()
    processed_train = dssm_preprocessor.fit_transform(train, stage='train')
    processed_test = dssm_preprocessor.fit_transform(test, stage='test')
    # the dimension of dssm model is the length of tri-letters.
    input_shapes = processed_train.context['input_shapes']
    # generator.
    generator = generators.PointGenerator(processed_train, stage='train')
    # Create a dssm model
    dssm_model = models.DSSMModel()
    dssm_model.params['input_shapes'] = input_shapes
    dssm_model.guess_and_fill_missing_params()
    dssm_model.build()
    dssm_model.compile()
    dssm_model.fit_generator(generator)
    # save
    dssm_preprocessor.save('.tmpdir')
    dssm_model.save('.tmpdir')
    try:
        # testing: reload the saved artifacts and run predictions.
        dssm_preprocessor = engine.load_preprocessor('.tmpdir')
        processed_test = dssm_preprocessor.fit_transform(test, stage='test')
        generator = generators.PointGenerator(processed_test, stage='test')
        X, y = generator[0]
        dssm_model = engine.load_model('.tmpdir')
        predictions = dssm_model.predict([X.text_left, X.text_right])
        assert len(predictions) > 0
        # isinstance instead of type(...) ==, per idiomatic type checks.
        assert isinstance(predictions[0][0], np.float32)
    finally:
        # Clean up even when an assertion above fails, so the temp
        # directory does not poison subsequent test runs.
        shutil.rmtree('.tmpdir')
def test_save_load(base_preprocessor):
    """Round-trip a preprocessor through save/load and verify that a
    second save to the same directory raises FileExistsError."""
    target_dir = '.tmpdir'
    base_preprocessor.save(target_dir)
    loaded = engine.load_preprocessor(target_dir)
    with pytest.raises(FileExistsError):
        base_preprocessor.save(target_dir)
    shutil.rmtree(target_dir)
def predict(config: Config,
            model: engine.BaseModel,
            query: str,
            nlargest: int = 5) -> typing.List[typing.Tuple[str, float, str]]:
    """Rank every corpus document against a query and return the best matches.

    Args:
        config: configuration object whose ``paths['preprocess_dir']``
            points at the saved preprocessor and corpus dill file.
        model: trained ranking model used to score (query, document) pairs.
        query: the question text to rank documents against.
        nlargest: number of top-scoring documents to return.

    Returns:
        List of ``(document_id, score, document_text)`` tuples, highest
        score first.
    """
    logger.info('Running predictions...')
    net_name = config.net_name
    pp_dir = config.paths['preprocess_dir']
    corpus_d_path = os.path.join(pp_dir, net_name + "_documents.dill")
    # Use a context manager so the corpus file handle is not leaked.
    with open(corpus_d_path, 'rb') as corpus_file:
        docs = dill.load(corpus_file)
    doc_lookup = list(docs.keys())
    num_docs = len(doc_lookup)
    docs_df = pd.DataFrame.from_dict(docs,
                                     orient='index',
                                     columns=['Document'])
    docs_df['QID'] = 'Q'
    task = tasks.Ranking()
    pre = engine.load_preprocessor(dirpath=pp_dir, name=net_name)
    # Pair the same query with every document so the model scores all of them.
    query_df = docs_df.copy()
    query_df['Question'] = query
    inputs = pre.transform(list(query_df.itertuples()), stage='predict')
    gen_predict = generators.PointGenerator(inputs,
                                            task,
                                            shuffle=False,
                                            stage='test')
    # NOTE(review): reaches into the model's private Keras backend;
    # consider exposing a public predict API on engine.BaseModel.
    predictions = model._backend.predict_generator(gen_predict, verbose=1)
    # heapq.nlargest with predictions.ravel().take as key picks the
    # indices of the highest-scoring documents.
    idx = heapq.nlargest(nlargest, range(num_docs), predictions.ravel().take)
    results = []
    for candidate in idx:
        did = doc_lookup[candidate]
        score = predictions[candidate][0]
        results.append((did, score, docs[did]))
    return results
def build_document_embeddings(config: Config
                              ) -> typing.Tuple[typing.List[str], np.ndarray]:
    """
    Build document embeddings by running inference on model

    Args:
        config (Config): configuration model object after
            preprocessing and training phase

    Return:
        tuple with raw docs and their numpy embeddings

    Raises:
        KeyError: required entries are missing from the config.
        FileNotFoundError: the corpus or the trained model file is missing.
        ValueError: a corpus line is not tab-separated into three fields.
    """
    logger.info("Building embeddings...")
    try:
        dataset_path = config.inputs['share']['custom_corpus']
        preprocess_dir = config.paths['preprocess_dir']
        processed_dir = config.paths['processed_dir']
    except KeyError as e:
        error_msg = f"KeyError {e}\nCheck config file"
        logger.error(error_msg)
        # Bare raise keeps the original exception and traceback instead
        # of wrapping the KeyError inside a new KeyError.
        raise
    if not os.path.exists(dataset_path):
        error_msg = f"Dataset: `{dataset_path}` does not exist."
        logger.error(error_msg)
        raise FileNotFoundError(error_msg)
    model_path = os.path.join(processed_dir, f"{config.net_name}.h5")
    if not os.path.exists(model_path):
        error_msg = f"{model_path} does not exist. Train model first"
        logger.error(error_msg)
        raise FileNotFoundError(error_msg)
    logger.info("Loading embedding model...")
    # The custom loss must be registered for Keras to deserialize the model.
    custom_object = {'rank_hinge_loss': losses.rank_hinge_loss}
    embed_model = load_model(model_path,
                             custom_objects=custom_object).get_layer('model_1')
    docs = []
    logger.info("Getting embeddings...")
    with open(dataset_path, 'r') as f:
        for line in f:
            line = line.strip()
            try:
                _question, doc, _label = line.split('\t')
            except ValueError:
                error_msg = "Invalid format for relation text." + \
                            "Should be `question\tdocument\tlabel`"
                logger.error(error_msg)
                raise
            docs.append(doc)
    pre = engine.load_preprocessor(preprocess_dir, config.net_name)
    preprocessed_docs = pre.transform_list(docs)
    embeddings = []
    for doc in preprocessed_docs:
        # Model expects a batch dimension; embed one document at a time.
        sparse_input = np.expand_dims(doc, axis=0)
        embeddings.append(embed_model.predict(sparse_input)[0])
    return docs, np.array(embeddings)
# Script setup: resolve data directories, load model/preprocessor, and
# build (or reuse) the on-disk nearest-neighbour search index.
# NOTE(review): `config` on the next line is the imported module (it has
# WORKBUDDY_DIR) and is immediately rebound to a Config instance below —
# shadowing worth confirming/renaming with the module's author.
base_dir = config.WORKBUDDY_DIR
config = Config()
config.from_json_file(config_path)
data_dir = os.path.join(base_dir, 'data')
preprocess_dir = os.path.join(data_dir, 'preprocessed')
processed_dir = os.path.join(data_dir, 'processed')
config.paths['preprocess_dir'] = preprocess_dir
config.paths['processed_dir'] = processed_dir
embed_model = get_inference_model(config)
# Prefer an explicitly configured preprocessor name; otherwise fall back
# to the network name.
if 'preprocess' in config.inputs['share']:
    pre = engine.load_preprocessor(preprocess_dir,
                                   config.inputs['share']['preprocess'])
else:
    pre = engine.load_preprocessor(preprocess_dir, config.net_name)
# Make the corpus path absolute relative to the workbuddy base directory.
config.inputs['share']['custom_corpus'] = os.path.join(
    base_dir, config.inputs['share']['custom_corpus'])
docs, embeds = build_document_embeddings(config)
logger.info("Loading search index...")
index_name = 'custom_index'
if not os.path.exists(index_name):
    # Build and persist the index only when it is not already on disk.
    logger.info("Search index not found. Building it...")
    search_engine = build_search_index(embeds)
    search_engine.saveIndex(index_name)