Пример #1
0
def search(ctx):
    """Interactively query the document search index.

    Builds embeddings for the configured corpus, indexes them, then
    repeatedly prompts for a query, embeds it with the trained inference
    model, and prints the 3 nearest documents. An empty query or 'exit'
    terminates the loop.

    Args:
        ctx: click context; ``ctx.obj['CONFIG']`` must hold the Config.
    """
    from officeanswers.search import build_search_index

    logger.info("Build search index...")

    config = ctx.obj['CONFIG']
    preprocess_dir = config.paths['preprocess_dir']
    embed_model = get_inference_model(config)

    pre = engine.load_preprocessor(preprocess_dir, config.net_name)

    docs, embeds = build_document_embeddings(config)
    search = build_search_index(embeds)

    query = click.prompt("\nWhat do you want to search?\n", type=str)
    while query and query != 'exit':
        # Preprocess the raw query, add a batch axis, and embed it.
        sparse_input = pre.transform_list([query])[0]
        sparse_input = np.expand_dims(sparse_input, axis=0)
        dense_input = embed_model.predict(sparse_input)[0]

        idxs, dists = search.knnQuery(dense_input, k=3)

        for idx, dist in zip(idxs, dists):
            print("\n----------------\n", docs[idx])

        query = click.prompt("\nWhat do you want to search?\n", type=str)
Пример #2
0
def test_dssm(train, test):
    """End-to-end test of the DSSM model: fit, save, reload, predict."""
    # Pre-processing: fit tri-letter vocabulary and transform both splits.
    dssm_preprocessor = preprocessor.DSSMPreprocessor()
    processed_train = dssm_preprocessor.fit_transform(train, stage='train')
    processed_test = dssm_preprocessor.fit_transform(test, stage='test')
    # The input dimension of the DSSM model is the length of tri-letters.
    input_shapes = processed_train.context['input_shapes']
    generator = generators.PointGenerator(processed_train, stage='train')
    # Build, train and persist the model.
    dssm_model = models.DSSMModel()
    dssm_model.params['input_shapes'] = input_shapes
    dssm_model.guess_and_fill_missing_params()
    dssm_model.build()
    dssm_model.compile()
    dssm_model.fit_generator(generator)
    dssm_preprocessor.save('.tmpdir')
    dssm_model.save('.tmpdir')

    # Reload the saved artifacts and verify predictions round-trip.
    loaded_preprocessor = engine.load_preprocessor('.tmpdir')
    processed_test = loaded_preprocessor.fit_transform(test, stage='test')
    generator = generators.PointGenerator(processed_test, stage='test')
    X, y = generator[0]
    dssm_model = engine.load_model('.tmpdir')
    predictions = dssm_model.predict([X.text_left, X.text_right])
    assert len(predictions) > 0
    # isinstance is the idiomatic type check (was `type(...) == np.float32`).
    assert isinstance(predictions[0][0], np.float32)
    shutil.rmtree('.tmpdir')
Пример #3
0
def test_save_load(base_preprocessor):
    """Saving twice to the same directory must raise FileExistsError."""
    dirpath = '.tmpdir'
    base_preprocessor.save(dirpath)
    # Verify the saved artifact loads back. Named `loaded` rather than
    # `preprocessor` to avoid shadowing the module-level import.
    loaded = engine.load_preprocessor(dirpath)
    assert loaded is not None
    with pytest.raises(FileExistsError):
        base_preprocessor.save(dirpath)
    shutil.rmtree(dirpath)
Пример #4
0
def predict(config: Config,
            model: engine.BaseModel,
            query: str,
            nlargest: int = 5) -> typing.List[typing.Tuple[str, float, str]]:
    """Rank every corpus document against *query* and return the best.

    Args:
        config: configuration object providing ``net_name`` and
            ``paths['preprocess_dir']``.
        model: trained ranking model; its Keras backend runs the batch
            prediction.
        query: free-text question to score documents against.
        nlargest: number of top-scoring documents to return.

    Returns:
        List of ``(doc_id, score, doc_text)`` tuples, highest score first.
    """
    logger.info('Running predictions...')

    net_name = config.net_name
    pp_dir = config.paths['preprocess_dir']
    corpus_d_path = os.path.join(pp_dir, net_name + "_documents.dill")

    # Use a context manager so the corpus file handle is not leaked.
    with open(corpus_d_path, 'rb') as corpus_file:
        docs = dill.load(corpus_file)
    doc_lookup = list(docs.keys())
    num_docs = len(doc_lookup)
    docs_df = pd.DataFrame.from_dict(docs,
                                     orient='index',
                                     columns=['Document'])
    docs_df['QID'] = 'Q'
    task = tasks.Ranking()
    pre = engine.load_preprocessor(dirpath=pp_dir, name=net_name)

    # Pair the same query with every document and score each pair.
    query_df = docs_df.copy()
    query_df['Question'] = query
    inputs = pre.transform(list(query_df.itertuples()), stage='predict')
    gen_predict = generators.PointGenerator(inputs,
                                            task,
                                            shuffle=False,
                                            stage='test')
    predictions = model._backend.predict_generator(gen_predict, verbose=1)
    # Indices of the `nlargest` highest scores, best first.
    idx = heapq.nlargest(nlargest, range(num_docs), predictions.ravel().take)
    results = []
    for candidate in idx:
        did = doc_lookup[candidate]
        results.append((did, predictions[candidate][0], docs[did]))

    return results
Пример #5
0
def build_document_embeddings(config: Config) -> typing.Tuple[typing.List[str],
                                                              np.ndarray]:
    """
    Build document embeddings by running inference on the trained model.

    Args:
        config (Config): configuration object after the preprocessing and
            training phases.

    Returns:
        Tuple of raw documents and their numpy embeddings.

    Raises:
        KeyError: if a required config entry is missing.
        FileNotFoundError: if the corpus file or trained model is absent.
        ValueError: if a corpus line is not three tab-separated fields.
    """
    logger.info("Building embeddings...")
    try:
        dataset_path = config.inputs['share']['custom_corpus']
        preprocess_dir = config.paths['preprocess_dir']
        processed_dir = config.paths['processed_dir']
    except KeyError as e:
        error_msg = f"KeyError {e}\nCheck config file"
        logger.error(error_msg)
        # Bare re-raise keeps the original traceback (was `raise KeyError(e)`,
        # which discarded it); callers still catch KeyError either way.
        raise

    if not os.path.exists(dataset_path):
        error_msg = f"Dataset: `{dataset_path}` does not exist."
        logger.error(error_msg)
        raise FileNotFoundError(error_msg)

    model_path = os.path.join(processed_dir,
                              f"{config.net_name}.h5")
    if not os.path.exists(model_path):
        error_msg = f"{model_path} does not exist. Train model first"
        logger.error(error_msg)
        raise FileNotFoundError(error_msg)

    logger.info("Loading embedding model...")
    # The custom loss must be registered for deserialization; the embedding
    # sub-network is the inner layer named 'model_1'.
    custom_object = {'rank_hinge_loss': losses.rank_hinge_loss}
    embed_model = load_model(model_path,
                             custom_objects=custom_object).get_layer('model_1')

    docs = []
    logger.info("Getting embeddings...")
    with open(dataset_path, 'r') as f:
        for line in f:
            line = line.strip()
            try:
                question, doc, label = line.split('\t')
            except ValueError:
                error_msg = "Invalid format for relation text." + \
                    "Should be `question\tdocument\tlabel`"
                logger.error(error_msg)
                raise
            docs.append(doc)

    pre = engine.load_preprocessor(preprocess_dir,
                                   config.net_name)

    # Embed each preprocessed document (batch axis added per document).
    preprocessed_docs = pre.transform_list(docs)
    embeddings = [embed_model.predict(np.expand_dims(doc, axis=0))[0]
                  for doc in preprocessed_docs]

    return docs, np.array(embeddings)
Пример #6
0
# Script-level setup: load config, resolve data paths, build embeddings,
# and prepare the search index.
base_dir = config.WORKBUDDY_DIR

# Load configuration from JSON and point it at the data directories.
# NOTE(review): `config_path` is defined outside this chunk — verify it is
# set before this block runs.
config = Config()
config.from_json_file(config_path)
data_dir = os.path.join(base_dir, 'data')
preprocess_dir = os.path.join(data_dir,
                              'preprocessed')
processed_dir = os.path.join(data_dir,
                             'processed')
config.paths['preprocess_dir'] = preprocess_dir
config.paths['processed_dir'] = processed_dir

embed_model = get_inference_model(config)
# Prefer an explicitly configured preprocessor name; fall back to net_name.
if 'preprocess' in config.inputs['share']:
    pre = engine.load_preprocessor(preprocess_dir,
                                   config.inputs['share']['preprocess'])
else:
    pre = engine.load_preprocessor(preprocess_dir,
                                   config.net_name)

# Make the corpus path absolute relative to the base directory.
config.inputs['share']['custom_corpus'] = os.path.join(
    base_dir,
    config.inputs['share']['custom_corpus'])
docs, embeds = build_document_embeddings(config)

logger.info("Loading search index...")
index_name = 'custom_index'
# Build and persist the index only if it is not already on disk.
if not os.path.exists(index_name):
    logger.info("Search index not found. Building it...")
    search_engine = build_search_index(embeds)
    search_engine.saveIndex(index_name)