def word_embeddings(learner: LanguageLearner, s: str, debug: bool = False) -> Tensor:
    """Return the encoder activations for the string *s*.

    Tokenizes *s* through the learner's data pipeline, resets the model's
    hidden state, runs the encoder (``learner.model[0]``) over the tokens,
    and returns the last output of the final layer. Each step is timed via
    ``measure`` when *debug* is true.
    """
    tokens, _ = measure("tokenizing", lambda: learner.data.one_item(s), debug)
    measure("resetting model", lambda: learner.model.reset(), debug)
    outputs = measure("predicting", lambda: learner.model[0](tokens), debug)
    # outputs[-1] is the final layer; its last element holds the activations
    return outputs[-1][-1]
def main(models_path: Path, test_data_json: Path, debug: bool):
    """Evaluates a language model against a test data set."""
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)

        print(f"Loading test data from {test_data_json}...")
        # Collect all valid dict rows from the JSON-lines file.
        with jsonlines.open(test_data_json) as reader:
            rows = [obj for obj in reader.iter(type=dict, skip_invalid=True)]
        df = pd.DataFrame(rows)

        test_databunch = (
            TextList.from_df(df, path=models_path, cols=["title", "content"])
            .split_none()
            .label_for_lm()
            .databunch(bs=4)
        )

        learner = measure(
            "model loading",
            lambda: from_model(models_path, model_name="model_large_finetuned"),
            debug,
        )
        # Validate against the freshly built databunch's training dataloader.
        print(learner.validate(dl=test_databunch.train_dl))
def search(
    es: Elasticsearch,
    learner: LanguageLearner,
    index_name: str,
    query: str,
    debug=False,
):
    """Return the title of the single best-matching indexed document.

    Embeds *query* with ``doc2vec`` and scores every document by the sum
    of cosine similarities between each query chunk and the corresponding
    stored ``embeddings_<i>`` field.
    """
    embeddings = doc2vec(learner, query, debug)

    params = {}
    score_terms = []
    for idx, emb in enumerate(embeddings):
        params[f"queryVector{idx}"] = emb.tolist()
        score_terms.append(
            f"cosineSimilarity(params.queryVector{idx}, doc['embeddings_{idx}'])"
        )

    body = {
        "size": 1,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    # trailing "+0.0" keeps the script expression well-formed
                    # (presumably also guarantees a float score — confirm)
                    "source": "+".join(score_terms) + "+0.0",
                    "params": params,
                },
            }
        },
    }

    result = measure("search", lambda: es.search(index=index_name, body=body), debug)
    return result["hits"]["hits"][0]["_source"]["title"]
def doc2vec(learner: LanguageLearner, s: str, debug: bool = False, max_dim: int = 1024) -> Sequence[Tensor]:
    """Build a document vector for *s* via concat-pooling of word embeddings.

    Concatenates the final hidden state with max- and mean-pools over the
    sequence dimension, moves the result to CPU, squeezes it, and splits it
    into chunks of at most *max_dim* elements.
    """
    with torch.no_grad():
        embeddings = measure(
            "get_full_embeddings", lambda: word_embeddings(learner, s, debug), debug
        )
        # last hidden state; double-cast works around a pyright indexing issue
        last_state = cast(Tensor, cast(Any, embeddings)[:, -1])
        max_pool = embeddings.max(dim=1)[0]
        mean_pool = embeddings.mean(dim=1)
        pooled = torch.cat([last_state, max_pool, mean_pool], 1)
        return pooled.to("cpu").squeeze().split(max_dim)
def main(
    models_path: Path,
    data_path: Path,
    drop_index: bool,
    index_name: str,
    host: str,
    port: int,
    limit_bytes: int,
    debug: bool,
):
    """Index all the training rows in <databunch.pkl> into ElasticSearch."""
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)

        learner = measure(
            "encoder loading",
            lambda: from_encoder(models_path, encoder_name="encoder_large_finetuned"),
            debug,
        )
        es = Elasticsearch(hosts=[{"host": host, "port": port}])

        if drop_index:
            print("Recreating index...")
            recreate_index(es, learner, index_name, debug)

        print("Loading data...")
        df = load_databunch(Path(data_path), debug).train_ds.inner_df
        total = df.shape[0]

        print(f"Indexing {total} rows...")
        for idx, row in df.iterrows():
            # Bind the row's dict now; measure invokes the thunk immediately.
            doc = row.to_dict()
            measure(
                f"{idx}/{total}",
                lambda doc=doc: index_document(
                    es, learner, index_name, doc, limit_bytes, debug
                ),
                debug,
            )
def main(
    models_path: Path,
    query: str,
    index_name: str = "boe",
    host: str = "localhost",
    port: int = 9200,
    debug: bool = False,
):
    """Search *index_name* for *query* and print the best-matching title.

    Loads the fine-tuned encoder, connects to Elasticsearch at
    *host*:*port*, runs the embedding-based search, and prints the result.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        learner = measure(
            "model loading",
            lambda: from_encoder(models_path, encoder_name="encoder_large_finetuned"),
            debug,
        )
        es = Elasticsearch(hosts=[{"host": host, "port": port}])
        # BUG FIX: `search` returns a single title *string* (it takes hit [0]
        # of a size-1 query), so iterating over it printed one character per
        # line. Print the title once instead.
        title = search(es, learner, index_name, query, debug)
        print("\n")
        print(title)
def load_databunch(pkl_path: Path, debug=False) -> TextLMDataBunch:
    """Load a pickled databunch from *pkl_path*, timing the load when *debug*.

    Delegates to fastai's ``load_data`` with the path split into its
    directory and file name.
    """
    return measure(
        "loading dataframe",
        lambda: load_data(pkl_path.parent, pkl_path.name),
        debug,
    )