def train_fasttext(hf_dataset, output_dir):
    """
    Run with: $ ./data_cli.py train_fasttext paperswithcode_aspects ./output

    :return:
    """
    tokens_fp = os.path.join(output_dir, 'tokens.txt')
    fasttext_bin_fp = os.path.join(output_dir, 'fasttext.bin')
    fasttext_w2v_fp = os.path.join(output_dir, 'fasttext.w2v.txt')

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Tokenized text
    doc_delimiter = '\n'
    token_delimiter = ' '
    tokens_count = 0

    with open(tokens_fp, 'w') as f:
        for doc in docs_ds:
            # Extract plain text
            text = doc['title'] + ': ' + doc['abstract']

            for token in gensim.utils.simple_preprocess(text, min_len=2, max_len=15):
                f.write(token + token_delimiter)
                tokens_count += 1
            f.write(doc_delimiter)

    logger.info(f'Total tokens: {tokens_count:,}')

    # Train actual fastText model
    logger.info('Training fastText model...')
    model = fasttext.train_unsupervised(
        tokens_fp,
        model='skipgram',
        lr=0.05,   # learning rate [default: 0.05]
        dim=300,   # size of word vectors [default: 100]
        ws=5,      # size of the context window [default: 5]
        epoch=5,   # number of epochs [default: 5]
        # thread   # number of threads [default: number of CPUs]
    )
    model.save_model(fasttext_bin_fp)

    del model

    # Convert binary model to word2vec text format for downstream use
    ft_model = FastText.load_fasttext_format(fasttext_bin_fp)
    ft_model.wv.save_word2vec_format(fasttext_w2v_fp)

    logger.info(f'Output saved to: {fasttext_w2v_fp}')
    logger.info('Done')
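# A minimal downstream usage sketch (hypothetical helper, not part of the CLI):
# the exported fasttext.w2v.txt can be loaded with gensim's KeyedVectors, the
# same API used by build_avg_word_vectors below. The query word 'network' is
# illustrative only.
def example_load_fasttext_w2v(fasttext_w2v_fp: str):
    from gensim.models import KeyedVectors

    w2v_model = KeyedVectors.load_word2vec_format(fasttext_w2v_fp)

    # Nearest neighbors in the word embedding space
    return w2v_model.most_similar('network', topn=5)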
def build_explirefit_inputs(hf_dataset, aspect, output_dir):
    """
    Run with: $ ./data_cli.py build_explirefit_inputs paperswithcode_aspects task ./output/

    Format of synonyms.txt:
    <doc_id_a> <doc_id_b>
    <doc_id_a> <doc_id_c>
    ...

    Format of antonyms.txt:
    <doc_id_a> <doc_id_b>
    <doc_id_a> <doc_id_c>
    ...

    :param aspect:
    :param hf_dataset:
    :param output_dir:
    :return:
    """
    for fold in [1, 2, 3, 4]:
        fold = str(fold)
        fold_dir = os.path.join(output_dir, fold)

        if not os.path.exists(fold_dir):
            os.makedirs(fold_dir)

        synonyms_fp = os.path.join(fold_dir, 'synonyms.txt')
        if os.path.exists(synonyms_fp):
            raise FileExistsError(f'Output exists already: {synonyms_fp}')

        antonyms_fp = os.path.join(fold_dir, 'antonyms.txt')
        if os.path.exists(antonyms_fp):
            raise FileExistsError(f'Output exists already: {antonyms_fp}')

        train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                                name='relations',
                                cache_dir='./data/nlp_cache',
                                split=get_train_split(aspect, fold))
        logger.info(f'Training samples: {len(train_ds):,}')

        with open(synonyms_fp, 'w') as synonyms_f:
            with open(antonyms_fp, 'w') as antonyms_f:
                for row in tqdm(train_ds, desc='Writing output'):
                    line = 'en_' + row['from_paper_id'] + ' en_' + row['to_paper_id'] + '\n'

                    if row['label'] == 'y':
                        synonyms_f.write(line)
                    elif row['label'] == 'n':
                        antonyms_f.write(line)
                    else:
                        raise ValueError(f'Unsupported label: {row}')

    logger.info('Done')
def build_whoosh_index(index_dir: str, override=False):
    # Use Whoosh search index
    import shutil

    from whoosh.fields import Schema, TEXT, KEYWORD, ID
    from whoosh.analysis import StemmingAnalyzer
    from whoosh import index

    hf_dataset = 'paperswithcode_aspects'
    nlp_cache_dir = './data/nlp_cache'

    if os.path.exists(index_dir):
        if override:
            # os.rmdir() only removes empty directories; delete the whole tree instead
            shutil.rmtree(index_dir)
        else:
            logger.error('Index dir exists already and override is disabled')
            return

    # Load meta data
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')
    paper_id2paper = {p['paper_id']: Paper(**p) for p in docs_ds}

    paper_schema = Schema(
        paper_id=ID(stored=True),
        title=TEXT(stored=True),
        abstract=TEXT(analyzer=StemmingAnalyzer()),
        paper_url=TEXT(),
        aspect_tasks=KEYWORD,
        aspect_methods=KEYWORD,
        aspect_datasets=KEYWORD,
    )

    # Reset index manually via: $ rm -r ./output/pwc/whoosh_index
    logger.info('Creating new index')

    # Index does not exist yet
    os.makedirs(index_dir)
    ix = index.create_in(index_dir, paper_schema)

    # Save documents
    writer = ix.writer()

    for paper_id, paper in tqdm(paper_id2paper.items()):
        writer.add_document(**paper.__dict__)

    writer.commit()

    logger.info('Done')
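# A minimal query sketch against the index built above (hypothetical helper,
# assuming the standard Whoosh search API; the query string is illustrative).
def example_search_whoosh_index(index_dir: str, query_str: str = 'neural networks'):
    from whoosh import index
    from whoosh.qparser import QueryParser

    ix = index.open_dir(index_dir)

    with ix.searcher() as searcher:
        # Search the stemmed abstract field; paper_id and title can be returned
        # because they are stored fields in the schema above
        query = QueryParser('abstract', ix.schema).parse(query_str)
        results = searcher.search(query, limit=10)

        return [(hit['paper_id'], hit['title']) for hit in results]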
def build_avg_word_vectors(hf_dataset, w2v_path, output_path, override=False):
    """
    Run with: $ ./data_cli.py build_avg_word_vectors paperswithcode_aspects ./output/fasttext.w2v.txt ./output/pwc_doc_id2avg_fasttext.w2v.txt

    :param hf_dataset:
    :param w2v_path:
    :param output_path:
    :param override:
    :return:
    """
    stop_words = 'english'
    count_vector_size = 100000

    if os.path.exists(output_path):
        if override:
            logger.debug(f'Override {output_path}')
            os.remove(output_path)
        else:
            logger.info(f'Stop. Output file exists already (override disabled): {output_path}')
            return

    w2v_model = KeyedVectors.load_word2vec_format(w2v_path)
    doc_model = KeyedVectors(vector_size=w2v_model.vector_size)
    count_vec = CountVectorizer(stop_words=stop_words,
                                analyzer='word',
                                lowercase=True,
                                ngram_range=(1, 1),
                                max_features=count_vector_size)

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Extract plain text
    texts = []
    doc_id2idx = {}
    idx2doc_id = {}

    for idx, doc in enumerate(docs_ds):
        texts.append(doc['title'] + ': ' + doc['abstract'])
        doc_id2idx[doc['paper_id']] = idx
        idx2doc_id[idx] = doc['paper_id']

    # Transform the data into a bag of words
    count_train = count_vec.fit(texts)
    idx2bow = count_vec.transform(texts)

    vidx2word = {v: k for k, v in count_train.vocabulary_.items()}

    assert len(vidx2word) == len(count_train.vocabulary_)

    logger.info(f'Vocab size: {len(count_train.vocabulary_)}')

    for idx, text in enumerate(tqdm(texts, total=len(texts), desc='Converting docs to vectors')):
        bow = idx2bow[idx].A[0]
        vectors = []
        weights = []

        for _idx, count in enumerate(bow):
            if count > 0:
                word = vidx2word[_idx]
                try:
                    v = w2v_model.get_vector(word)
                    vectors.append(v)
                    weights.append(count)
                except KeyError:
                    # Unknown word: no vector available, skip it
                    pass

        # Check if at least one document term exists as word vector
        if vectors and weights:
            # Count-weighted average of the word vectors
            doc = np.average(np.array(vectors), axis=0, weights=np.array(weights))

            # Add to model with doc_id
            doc_model.add([str(idx2doc_id[idx])], [doc])
        else:
            logger.debug(f'Cannot add document {idx2doc_id[idx]} due to missing word vectors')

    # Save to disk
    doc_model.save_word2vec_format(output_path)

    logger.info(f'Saved to: {output_path}')
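# The document embedding above is a count-weighted mean of its word vectors:
# doc = (sum_i count_i * v_i) / (sum_i count_i). A tiny self-contained check of
# the np.average call (illustrative values only):
def example_weighted_average():
    import numpy as np

    vectors = np.array([[1.0, 0.0], [0.0, 1.0]])  # two word vectors
    weights = np.array([3, 1])                    # their counts from the BoW vector

    doc = np.average(vectors, axis=0, weights=weights)
    assert np.allclose(doc, [0.75, 0.25])  # (3*[1,0] + 1*[0,1]) / 4

    return doc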
def build_specter_input(hf_dataset: str,
                        aspect,
                        fold: Union[str, int],
                        output_path: str,
                        override: bool = False) -> None:
    """
    Run with: $ ./data_cli.py build_specter_input paperswithcode_aspects task 1 ./output/specter_input/1

    Builds the following files (needed for SPECTER training):
    - data.json containing the document ids and their relationship.
    - metadata.json containing mapping of document ids to textual fields (e.g., title, abstract)
    - train.txt, val.txt, test.txt containing document ids corresponding to train/val/test sets (one doc id per line).

    Data structure:
    - count = 5 (same aspect)
    - count = 1 (???) => ignore

    :param aspect:
    :param hf_dataset:
    :param fold:
    :param output_path:
    :param override:
    :return:
    """
    nlp_cache_dir = './data/nlp_cache'

    if os.path.exists(output_path) and not override:
        logger.error(f'Output path exists already: {output_path}')
        sys.exit(1)
    else:
        os.makedirs(output_path)

    train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                            name='relations',
                            cache_dir=nlp_cache_dir,
                            split=get_train_split(aspect, fold))
    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')

    # Metadata
    metadata = {}
    for doc in docs_ds:
        metadata[doc['paper_id']] = {
            'paper_id': doc['paper_id'],
            'title': doc['title'],
            'abstract': doc['abstract'],
        }

    logger.info('Writing metadata')
    with open(os.path.join(output_path, 'metadata.json'), 'w') as f:
        json.dump(metadata, f)

    # Train/val/test ids
    train_doc_ids = set()
    test_doc_ids = set()

    # Data
    data = defaultdict(dict)

    for pair in train_ds:
        # TODO include negative samples?
        count = 5 if pair['label'] == 'y' else 1

        data[pair['from_paper_id']][pair['to_paper_id']] = {'count': count}
        train_doc_ids.add(pair['from_paper_id'])
        train_doc_ids.add(pair['to_paper_id'])

    for pair in test_ds:
        count = 5 if pair['label'] == 'y' else 1

        data[pair['from_paper_id']][pair['to_paper_id']] = {'count': count}
        test_doc_ids.add(pair['from_paper_id'])
        test_doc_ids.add(pair['to_paper_id'])

    logger.info('Writing data')
    with open(os.path.join(output_path, 'data.json'), 'w') as f:
        json.dump(data, f)

    train_doc_ids = list(train_doc_ids)
    full_test_doc_ids = list(test_doc_ids)

    # Hold out 10% of the test documents as validation set
    random.shuffle(full_test_doc_ids)
    split_at = int(0.1 * len(full_test_doc_ids))
    val_doc_ids = full_test_doc_ids[:split_at]
    test_doc_ids = full_test_doc_ids[split_at:]

    logger.info('Writing train/val/test')

    with open(os.path.join(output_path, 'train.txt'), 'w') as f:
        for i in train_doc_ids:
            f.write(i + '\n')

    with open(os.path.join(output_path, 'val.txt'), 'w') as f:
        for i in val_doc_ids:
            f.write(i + '\n')

    with open(os.path.join(output_path, 'test.txt'), 'w') as f:
        for i in test_doc_ids:
            f.write(i + '\n')

    logger.info('Done')
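# For reference, the data.json written above has the following nested structure
# (counts taken directly from the loops above; the IDs are placeholders):
#
# {
#   "<from_paper_id>": {
#     "<to_paper_id>":       {"count": 5},   # label == 'y' (same aspect)
#     "<other_to_paper_id>": {"count": 1}    # label == 'n'
#   }
# }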
def build_specter_vectors(hf_dataset: str,
                          specter_path: str,
                          output_path: str,
                          cuda_device: int = -1,
                          batch_size: int = 32,
                          vector_size: int = 768,
                          override=False):
    """
    Run with: $ ./data_cli.py build_specter_vectors paperswithcode_aspects ./specter_archive ./output/pwc_doc_id2specter.w2v.txt --cuda_device=5

    Download SPECTER:
    $ wget https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/specter/archive.tar.gz
    $ tar -xzvf archive.tar.gz

    :param vector_size:
    :param output_path: ./output
    :param override:
    :param cuda_device:
    :param batch_size:
    :param hf_dataset:
    :param specter_path: Path to SPECTER
    :return:
    """
    from specter.predict_command import predictor_from_archive
    from allennlp.models import load_archive

    # Import to register the classes with AllenNLP
    from specter.model import Model
    from specter.data import DataReader, DataReaderFromPickled
    from specter.predictor import SpecterPredictor

    if Model and DataReader and SpecterPredictor:
        pass

    if os.path.exists(output_path) and not override:
        logger.error(f'Output file exists already: {output_path}')
        return

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    papers_to_embed = [doc for doc in docs_ds]

    # SPECTER settings
    archive_path = os.path.join(specter_path, 'model.tar.gz')
    metadata_path = os.path.join(specter_path, 'metadata_sample.json')
    included_text_fields = 'abstract title'
    vocab_dir = os.path.join(specter_path, 'data/vocab/')

    cuda_device = int(cuda_device)

    overrides = f"{{'model':{{'predict_mode':'true','include_venue':'false'}}," \
                f"'dataset_reader':{{'type':'specter_data_reader','predict_mode':'true'," \
                f"'paper_features_path':'{metadata_path}'," \
                f"'included_text_fields': '{included_text_fields}'}}," \
                f"'vocabulary':{{'directory_path':'{vocab_dir}'}}}}"

    logger.info(f'SPECTER overrides: {overrides}')

    archive = load_archive(archive_path,
                           cuda_device=cuda_device,
                           overrides=overrides)
    predictor = predictor_from_archive(archive,
                                       predictor_name='specter_predictor',
                                       paper_features_path=metadata_path)

    # Batches
    def chunks(lst, chunk_size):
        """Splits a longer list to respect batch size"""
        for i in range(0, len(lst), chunk_size):
            yield lst[i:i + chunk_size]

    # Round up so a final partial batch is counted as well
    batches_count = (len(papers_to_embed) + batch_size - 1) // batch_size
    batch_embed_papers = []

    # Takes about 30min on GPU
    for batch in tqdm(chunks(papers_to_embed, batch_size), total=batches_count):
        batch_out = predictor.predict_batch_json(batch)
        batch_embed_papers += batch_out

    # To keyed vectors
    doc_model = KeyedVectors(vector_size=vector_size)

    for embed_paper in tqdm(batch_embed_papers):
        doc_model.add([embed_paper['paper_id']], [embed_paper['embedding']])

    # Save to disk
    doc_model.save_word2vec_format(output_path)

    logger.info('Done')
def build_transformers_vectors(hf_dataset: str,
                               model_name_or_path: str,
                               output_path: str,
                               pooling: str,
                               batch_size: int = 16,
                               override: bool = False):
    """
    Run with: $ ./data_cli.py build_transformers_vectors paperswithcode_aspects scibert-scivocab-uncased ./output/scibert-cls --pooling=cls --batch_size=16

    :param hf_dataset:
    :param model_name_or_path:
    :param output_path:
    :param pooling:
    :param override:
    :return:
    """
    env = get_env()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pooling_strategies = ['cls', 'mean']

    if os.path.exists(output_path) and not override:
        logger.error(f'Output file exists already: {output_path}')
        sys.exit(1)

    if pooling not in pooling_strategies:
        raise ValueError(f'Invalid pooling: {pooling}')

    # Model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Model
    model = AutoModel.from_pretrained(model_name_or_path)
    model = model.to(device)

    # Tokenize docs
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    texts = [doc['title'] + ': ' + doc['abstract'] for doc in docs_ds]
    inputs = tokenizer(texts,
                       add_special_tokens=True,
                       return_tensors='pt',
                       padding=True,
                       max_length=model.config.max_position_embeddings,
                       truncation=True,
                       return_token_type_ids=False,
                       return_attention_mask=True)

    ds = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
    dl = DataLoader(ds, shuffle=False, batch_size=batch_size)

    # Vectors
    doc_model = KeyedVectors(vector_size=model.config.hidden_size)

    with torch.no_grad():
        for batch_idx, batch_data in enumerate(tqdm(dl, desc='Inference')):
            batch_data = tuple(t.to(device) for t in batch_data)
            outputs = model(*batch_data, return_dict=True)

            if pooling == 'cls':
                batch_embeddings = outputs['pooler_output'].detach().cpu().numpy()
            elif pooling == 'mean':
                batch_embeddings = np.mean(outputs['last_hidden_state'].detach().cpu().numpy(), axis=1)
            else:
                raise NotImplementedError()

            batch_ids = docs_ds[batch_idx * batch_size:batch_idx * batch_size + batch_size]['paper_id']
            doc_model.add(batch_ids, batch_embeddings)

    # Save to disk
    doc_model.save_word2vec_format(output_path)

    logger.info('Done')
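# Note: the 'mean' pooling above averages over all token positions, padding
# included. A common alternative is attention-mask-weighted mean pooling; a
# sketch under that assumption (not what build_transformers_vectors does):
def example_masked_mean_pooling(last_hidden_state, attention_mask):
    # last_hidden_state: (batch, seq_len, hidden); attention_mask: (batch, seq_len)
    mask = attention_mask.unsqueeze(-1).float()

    # Zero out padded positions, then divide by the number of real tokens
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)

    return summed / counts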
def main():
    # Auto-environment
    env = get_env()

    parser = HfArgumentParser((ModelArguments, TrainingArguments, ExperimentArguments))
    model_args, training_args, experiment_args = parser.parse_args_into_dataclasses()

    # Adjust output with folds and model name
    # TODO disabled
    # training_args.output_dir = os.path.join(training_args.output_dir, str(experiment_args.cv_fold), model_args.get_model_name())

    # Model path from env
    if not os.path.exists(model_args.model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_args.model_name_or_path)):
        model_args.model_name_or_path = os.path.join(env['bert_dir'], model_args.model_name_or_path)

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Dataset args
    label_classes = get_label_classes_from_hf_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset))
    num_labels = len(label_classes)

    if num_labels > 1 and experiment_args.binary_classification:
        # In binary classification we have only a single label (with y=[0;1])
        num_labels = 1
        logger.warning(f'Forcing label classes to binary: {label_classes}')

    # Input to transformers.forward
    columns = ['input_ids', 'attention_mask', 'token_type_ids', 'labels']

    # Build dataset for splits
    train_ds = load_dataset(get_local_hf_dataset_path(experiment_args.hf_dataset),
                            name='relations',
                            cache_dir=experiment_args.hf_dataset_cache_dir,
                            split=get_train_split(experiment_args.aspect, experiment_args.cv_fold))
    test_ds = load_dataset(get_local_hf_dataset_path(experiment_args.hf_dataset),
                           name='relations',
                           cache_dir=experiment_args.hf_dataset_cache_dir,
                           split=get_test_split(experiment_args.aspect, experiment_args.cv_fold))
    docs_ds = load_dataset(get_local_hf_dataset_path(experiment_args.hf_dataset),
                           name='docs',
                           cache_dir=experiment_args.hf_dataset_cache_dir,
                           split=datasets.Split('docs'))

    # Forced limit
    if experiment_args.dataset_limit > 0:
        logger.info(f'Train and test datasets limited to {experiment_args.dataset_limit} samples')

        train_ds = Dataset(train_ds.data[:experiment_args.dataset_limit])
        test_ds = Dataset(test_ds.data[:experiment_args.dataset_limit])

    # Build ID => Doc mapping
    doc_id2doc = {doc[experiment_args.doc_id_col]: doc for doc in docs_ds}

    if model_args.model_name_or_path.startswith('baseline-rnn'):
        # Load spaCy as tokenizer
        spacy_nlp = spacy.load(experiment_args.spacy_model,
                               disable=["tagger", "ner", "textcat"])

        if experiment_args.multi_label:
            # Baseline model
            model = RNNForMultiLabelSequenceClassification(
                word_vectors=get_vectors_from_spacy_model(spacy_nlp),
                hidden_size=experiment_args.rnn_hidden_size,
                rnn=experiment_args.rnn_type,
                num_labels=num_labels,
                num_layers=experiment_args.rnn_num_layers,
                dropout=experiment_args.rnn_dropout,
            )
        else:
            raise NotImplementedError('RNN baseline is only available for multi-label classification')

        tokenizer = None
    else:
        # Load pretrained Transformers model and tokenizer
        model_config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                                  num_labels=num_labels,
                                                  cache_dir=model_args.cache_dir)

        # No need for spaCy
        spacy_nlp = None

        if 'longformer' in model_args.model_name_or_path:
            # TVM: a custom CUDA kernel implementation of the sliding window attention (works only on GPU)
            model_config.attention_mode = 'tvm'

            # Override tokenizer name if not set
            if model_args.tokenizer_name is None:
                roberta_path = os.path.join(env['bert_dir'], 'roberta-base')
                model_args.tokenizer_name = roberta_path if os.path.exists(roberta_path) else 'roberta-base'

                logger.info(f'Overriding tokenizer: {model_args.tokenizer_name}')

            # Override max length
            experiment_args.max_length = 4096

        if experiment_args.multi_label:
            model_cls = AutoModelForMultiLabelSequenceClassification
        else:
            model_cls = AutoModelForSequenceClassification

        model = model_cls.from_pretrained(model_args.model_name_or_path,
                                          config=model_config,
                                          cache_dir=model_args.cache_dir)
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        # Set token limit if defined by model (e.g., for Longformer)
        if model.config.max_position_embeddings > 0:
            tokenizer.model_max_length = model.config.max_position_embeddings

    # Init helper
    dpt = DocRelTrainerHelper(
        id2doc=doc_id2doc,
        transformers_tokenizer=tokenizer,
        spacy_nlp=spacy_nlp,
        label_classes=label_classes,
        binary_classification=experiment_args.binary_classification,
        doc_a_col=experiment_args.doc_a_col,
        doc_b_col=experiment_args.doc_b_col,
        label_col=experiment_args.label_col,
        text_from_doc_func=get_non_empty_text_from_doc,
        classification_threshold=experiment_args.classification_threshold,
        max_length=experiment_args.max_length,
        multi_label=experiment_args.multi_label,
    )

    logger.info('Converting to features (doc mapping, tokenize, ...)')

    # Build hash from settings for caching
    data_settings_hash = hashlib.md5(
        dataclasses.asdict(experiment_args).__str__().encode("utf-8") +
        dataclasses.asdict(model_args).__str__().encode("utf-8")).hexdigest()

    train_tensor_ds = train_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(experiment_args.hf_dataset_cache_dir,
                                     "cache-train-" + data_settings_hash + ".arrow"))
    train_tensor_ds.set_format(type='torch', columns=columns)

    test_tensor_ds = test_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(experiment_args.hf_dataset_cache_dir,
                                     "cache-test-" + data_settings_hash + ".arrow"))
    test_tensor_ds.set_format(type='torch', columns=columns)

    logger.info(f'Dataset columns: {columns}')
    logger.info(f'Train sample: {train_ds[0]}')
    logger.debug(f'- as tensor: {train_tensor_ds[0]}')
    logger.info(f'Test sample: {test_ds[0]}')
    logger.debug(f'- as tensor: {test_tensor_ds[0]}')

    # Load model weights (when not training but predicting)
    model_weights_path = os.path.join(training_args.output_dir, 'pytorch_model.bin')

    if not training_args.do_train and experiment_args.save_predictions:
        logger.info(f'Loading existing model weights from disk: {model_weights_path}')
        if os.path.exists(model_weights_path):
            model.load_state_dict(torch.load(model_weights_path))
        else:
            logger.error('Weights file does not exist!')

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tensor_ds,
        eval_dataset=test_tensor_ds,
        data_collator=DocRelDataCollator(),
        # prediction_loss_only=False,
        compute_metrics=dpt.compute_metrics,
    )

    # Log additional config (to Weights & Biases)
    if is_wandb_available():
        extra_config = {}
        extra_config.update(dataclasses.asdict(experiment_args))
        extra_config.update(dataclasses.asdict(model_args))

        wandb.config.update(extra_config, allow_val_change=True)

    if training_args.do_train:
        logger.info('Training started...')
        trainer.train()

        if isinstance(model, PreTrainedModel):
            trainer.save_model()
            tokenizer.save_pretrained(training_args.output_dir)
        elif isinstance(model, nn.Module):
            # RNN model
            torch.save(model.state_dict(), model_weights_path)

    if experiment_args.save_predictions:
        logger.info('Predicting...')
        predictions = trainer.predict(test_tensor_ds)

        df = dpt.get_df_from_predictions(test_ds,
                                         docs_ds,
                                         predictions,
                                         exclude_columns=['abstract'])

        # Save results to disk
        df.to_csv(os.path.join(training_args.output_dir, 'results.csv'), index=False)
        with open(os.path.join(training_args.output_dir, 'metrics.json'), 'w') as f:
            json.dump(predictions.metrics, f)

    logger.info('Done')
def get_evaluation_df(name, doc_model, hf_dataset, aspect, fold) -> Tuple[DataFrame, Dict]:
    # Init dataframe
    metrics = [
        'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
        'precision', 'recall', 'avg_p', 'reciprocal_rank', 'ndcg'
    ]
    df = pd.DataFrame([], columns=['name', 'aspect', 'fold', 'top_k'] + metrics)

    # Dataset
    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir='./data/nlp_cache',
                           split=get_test_split(aspect, fold))
    logger.info(f'Test samples: {len(test_ds):,}')

    # Unique paper IDs in test set
    test_paper_ids = set(test_ds['from_paper_id']).union(set(test_ds['to_paper_id']))
    logger.info(f'Test paper IDs: {len(test_paper_ids):,}')
    logger.info(f'Examples: {list(test_paper_ids)[:10]}')

    # Relevance mapping: positive pairs are relevant in both directions
    doc_id2related_ids = defaultdict(set)  # type: Dict[str, Set[str]]
    for row in test_ds:
        if row['label'] == 'y':
            a = row['from_paper_id']
            b = row['to_paper_id']
            doc_id2related_ids[a].add(b)
            doc_id2related_ids[b].add(a)

    # Filter for documents in test set
    test_doc_model = KeyedVectors(vector_size=doc_model.vector_size)
    test_doc_ids = []
    test_doc_vectors = []
    missed_doc_ids = 0

    for doc_id in doc_model.vocab:
        if doc_id in test_paper_ids:
            vec = doc_model.get_vector(doc_id)

            if len(vec) != doc_model.vector_size:
                raise ValueError(f'Test document has an invalid shape: {doc_id} => {vec.shape}')

            test_doc_ids.append(doc_id)
            test_doc_vectors.append(vec)
        else:
            missed_doc_ids += 1
            # logger.warning(f'Document ID is not part of test set: {doc_id} ({type(doc_id)})')

    if len(test_doc_ids) != len(test_doc_vectors):
        raise ValueError(
            f'Test document IDs do not match vector count: {len(test_doc_ids)} vs {len(test_doc_vectors)}')

    logger.info(f'Test document IDs: {len(test_doc_ids)} (missed {missed_doc_ids})')
    logger.info(f'Test document vectors: {len(test_doc_vectors)}')

    test_doc_model.add(test_doc_ids, test_doc_vectors)
    test_doc_model.init_sims(replace=True)

    logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')

    # Actual evaluation
    seed_ids_without_recommendations = []
    max_top_k = max(top_ks)
    eval_rows = {top_k: defaultdict(list) for top_k in top_ks}  # top_k => metric_name => list of values
    seed_id2ret_docs = {}

    for seed_id in tqdm(test_paper_ids,
                        desc=f'Evaluation ({name},aspect={aspect},fold={fold})'):
        try:
            rel_docs = doc_id2related_ids[seed_id]
            max_ret_docs = [d for d, score in test_doc_model.most_similar(seed_id, topn=max_top_k)]
            seed_id2ret_docs[seed_id] = max_ret_docs

            for top_k in top_ks:
                ret_docs = max_ret_docs[:top_k]
                rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))

                if ret_docs and rel_docs:
                    # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                    precision = rel_ret_docs_count / len(ret_docs)

                    # Recall = No. of relevant documents retrieved / No. of total relevant documents
                    recall = rel_ret_docs_count / len(rel_docs)

                    # Avg. precision (for MAP)
                    avg_p = get_avg_precision(ret_docs, rel_docs)

                    # Reciprocal rank (for MRR)
                    reciprocal_rank = get_reciprocal_rank(ret_docs, rel_docs)

                    # NDCG@k
                    predicted_relevance = [1 if ret_doc_id in rel_docs else 0 for ret_doc_id in ret_docs]
                    true_relevances = [1] * len(rel_docs)
                    ndcg_value = compute_dcg_at_k(predicted_relevance, top_k) / compute_dcg_at_k(true_relevances, top_k)

                    # Save metrics
                    eval_rows[top_k]['retrieved_docs'].append(len(ret_docs))
                    eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                    eval_rows[top_k]['relevant_retrieved_docs'].append(rel_ret_docs_count)
                    eval_rows[top_k]['precision'].append(precision)
                    eval_rows[top_k]['recall'].append(recall)
                    eval_rows[top_k]['avg_p'].append(avg_p)
                    eval_rows[top_k]['reciprocal_rank'].append(reciprocal_rank)
                    eval_rows[top_k]['ndcg'].append(ndcg_value)
        except (IndexError, ValueError, KeyError) as e:
            seed_ids_without_recommendations.append(seed_id)
            logger.warning(f'Cannot retrieve recommendations for #{seed_id}: {e}')

    logger.info(
        f'Completed with {len(eval_rows[top_ks[0]][metrics[0]]):,} rows '
        f'(missed {len(seed_ids_without_recommendations):,})')

    # Summarize evaluation
    for top_k in top_ks:
        try:
            row = [name, aspect, fold, top_k]
            for metric in metrics:
                # Mean over all metric values
                values = eval_rows[top_k][metric]
                row.append(np.mean(values) if len(values) > 0 else None)
            df.loc[len(df)] = row
        except ValueError as e:
            logger.error(f'Cannot summarize row: {top_k} {fold} {metrics} {e}')

    return df, seed_id2ret_docs
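# compute_dcg_at_k is used above but not defined in this section; a sketch
# assuming the standard DCG definition (as in sentence-transformers' evaluators):
def compute_dcg_at_k_sketch(relevances, k):
    import math

    # DCG@k = sum over the top-k ranks i (1-indexed) of rel_i / log2(i + 1)
    return sum(rel / math.log2(rank + 2) for rank, rel in enumerate(relevances[:k]))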
def evaluate_vectors(hf_dataset: str,
                     aspect: str,
                     input_path: str,
                     name: str,
                     folds: Union[str, list],
                     top_ks: Union[str, list],
                     output_path: str):
    """
    Run with: $ ./eval_cli.py evaluate_vectors paperswithcode_aspects task ./output/pwc_doc_id2st.txt --name=sentence_transformers --folds=1,2,3,4 --top_ks=5,10,25,50 --output_path=./output/eval.csv

    :param aspect:
    :param folds:
    :param top_ks:
    :param name:
    :param hf_dataset:
    :param input_path:
    :param output_path:
    :return:
    """
    if isinstance(folds, str):
        folds = folds.split(',')
    elif isinstance(folds, int):
        folds = [folds]

    if isinstance(top_ks, str):
        # Cast to int so that max() and list slicing below work as expected
        top_ks = [int(k) for k in top_ks.split(',')]
    elif isinstance(top_ks, int):
        top_ks = [top_ks]

    logger.info(f'Folds: {folds}')
    logger.info(f'Top-Ks: {top_ks}')

    if len(folds) < 1:
        logger.error('No folds provided')
        return

    if len(top_ks) < 1:
        logger.error('No top-k values provided')
        return

    # Load documents
    doc_model = KeyedVectors.load_word2vec_format(input_path)
    logger.info(f'Document vectors: {doc_model.vectors.shape}')

    # Normalize vectors
    doc_model.init_sims(replace=True)

    # Init dataframe
    metrics = [
        'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
        'precision', 'recall', 'avg_p', 'reciprocal_rank'
    ]
    df = pd.DataFrame([], columns=['name', 'fold', 'top_k'] + metrics)

    # Iterate over folds
    for fold in folds:
        logger.info(f'Current fold: {fold}')

        # Dataset
        test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                               name='relations',
                               cache_dir='./data/nlp_cache',
                               split=get_test_split(aspect, fold))
        logger.info(f'Test samples: {len(test_ds):,}')

        # Unique paper IDs in test set
        test_paper_ids = set(test_ds['from_paper_id']).union(set(test_ds['to_paper_id']))
        logger.info(f'Test paper IDs: {len(test_paper_ids):,}')
        logger.info(f'Examples: {list(test_paper_ids)[:10]}')

        # Relevance mapping: positive pairs are relevant in both directions
        doc_id2related_ids = defaultdict(set)  # type: Dict[str, Set[str]]
        for row in test_ds:
            if row['label'] == 'y':
                a = row['from_paper_id']
                b = row['to_paper_id']
                doc_id2related_ids[a].add(b)
                doc_id2related_ids[b].add(a)

        # Filter for documents in test set
        test_doc_model = KeyedVectors(vector_size=doc_model.vector_size)
        test_doc_ids = []
        test_doc_vectors = []
        missed_doc_ids = 0

        for doc_id in doc_model.vocab:
            if doc_id in test_paper_ids:
                vec = doc_model.get_vector(doc_id)

                if len(vec) != doc_model.vector_size:
                    raise ValueError(f'Test document has an invalid shape: {doc_id} => {vec.shape}')

                test_doc_ids.append(doc_id)
                test_doc_vectors.append(vec)
            else:
                missed_doc_ids += 1
                # logger.warning(f'Document ID is not part of test set: {doc_id} ({type(doc_id)})')

        if len(test_doc_ids) != len(test_doc_vectors):
            raise ValueError(
                f'Test document IDs do not match vector count: {len(test_doc_ids)} vs {len(test_doc_vectors)}')

        logger.info(f'Test document IDs: {len(test_doc_ids)} (missed {missed_doc_ids})')
        logger.info(f'Test document vectors: {len(test_doc_vectors)}')

        test_doc_model.add(test_doc_ids, test_doc_vectors)
        test_doc_model.init_sims(replace=True)

        logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')

        # Actual evaluation
        seed_ids_without_recommendations = []
        max_top_k = max(top_ks)
        eval_rows = {top_k: defaultdict(list) for top_k in top_ks}  # top_k => metric_name => list of values

        for seed_id in tqdm(test_paper_ids, desc=f'Evaluation (fold={fold})'):
            try:
                rel_docs = doc_id2related_ids[seed_id]
                max_ret_docs = [d for d, score in test_doc_model.most_similar(seed_id, topn=max_top_k)]

                for top_k in top_ks:
                    ret_docs = max_ret_docs[:top_k]
                    rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))

                    if ret_docs and rel_docs:
                        # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                        precision = rel_ret_docs_count / len(ret_docs)

                        # Recall = No. of relevant documents retrieved / No. of total relevant documents
                        recall = rel_ret_docs_count / len(rel_docs)

                        # Avg. precision (for MAP)
                        avg_p = get_avg_precision(ret_docs, rel_docs)

                        # Reciprocal rank (for MRR)
                        reciprocal_rank = get_reciprocal_rank(ret_docs, rel_docs)

                        # NDCG@k is computed in get_evaluation_df only

                        # Save metrics
                        eval_rows[top_k]['retrieved_docs'].append(len(ret_docs))
                        eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                        eval_rows[top_k]['relevant_retrieved_docs'].append(rel_ret_docs_count)
                        eval_rows[top_k]['precision'].append(precision)
                        eval_rows[top_k]['recall'].append(recall)
                        eval_rows[top_k]['avg_p'].append(avg_p)
                        eval_rows[top_k]['reciprocal_rank'].append(reciprocal_rank)
            except (IndexError, ValueError, KeyError) as e:
                seed_ids_without_recommendations.append(seed_id)
                logger.warning(f'Cannot retrieve recommendations for #{seed_id}: {e}')

        logger.info(
            f'Completed with {len(eval_rows[top_ks[0]][metrics[0]]):,} rows '
            f'(missed {len(seed_ids_without_recommendations):,})')

        # Summarize evaluation
        for top_k in top_ks:
            try:
                row = [name, fold, top_k]
                for metric in metrics:
                    # Mean over all metric values
                    values = eval_rows[top_k][metric]
                    row.append(np.mean(values) if len(values) > 0 else None)
                df.loc[len(df)] = row
            except ValueError as e:
                logger.error(f'Cannot summarize row: {top_k} {fold} {metrics} {e}')

    logger.info(f'Writing {len(df)} rows to {output_path}')

    if os.path.exists(output_path):
        # Append new rows to existing evaluation file
        df.to_csv(output_path, mode='a', header=False, index=False)
    else:
        # Write new file
        df.to_csv(output_path, header=True, index=False)

    logger.info('Done')
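# get_avg_precision and get_reciprocal_rank are used in both evaluation functions
# but not defined in this section; sketches assuming the standard IR definitions
# (normalizing AP by the total number of relevant documents is an assumption):
def get_avg_precision_sketch(ret_docs, rel_docs):
    # AP = mean of precision@i over the ranks i of relevant retrieved documents
    hits = 0
    precisions = []

    for rank, doc_id in enumerate(ret_docs, start=1):
        if doc_id in rel_docs:
            hits += 1
            precisions.append(hits / rank)

    return sum(precisions) / len(rel_docs) if rel_docs else 0.0


def get_reciprocal_rank_sketch(ret_docs, rel_docs):
    # RR = 1 / rank of the first relevant retrieved document (0 if none is found)
    for rank, doc_id in enumerate(ret_docs, start=1):
        if doc_id in rel_docs:
            return 1.0 / rank

    return 0.0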
def train(model_name_or_path: str,
          hf_dataset: str,
          aspect: str,
          fold: Union[int, str],
          output_path: str,
          train_epochs: int = 3,
          train_batch_size: int = 25,
          eval_batch_size: int = 32,
          evaluation_steps: int = 5000,
          train_on_test: bool = False,
          loss: str = 'multiple_negatives_ranking',
          override: bool = False):
    """
    Run with:
    $ export CUDA_VISIBLE_DEVICES=1
    $ ./sentence_transformer_cli.py train scibert-scivocab-uncased paperswithcode_task_docs 1 ./output/st_scibert/1 --train_epochs=3 --train_batch_size=25 --eval_batch_size=32

    :param loss: Training loss function (choices: multiple_negatives_ranking, cosine)
    :param train_on_test: If True, joint training on train and test set (validation disabled)
    :param aspect:
    :param evaluation_steps:
    :param train_epochs:
    :param model_name_or_path:
    :param hf_dataset:
    :param fold:
    :param output_path:
    :param train_batch_size:
    :param eval_batch_size:
    :param override:
    :return:
    """
    top_ks = [5, 10, 25, 50]
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'

    # output_path = os.path.join(output_path, str(fold), model_name_or_path)  # output/1/sci-bert

    if os.path.exists(output_path) and not override:
        logger.error(f'Stop. Output path exists already: {output_path}')
        sys.exit(1)

    # Model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    word_embedding_model = Transformer(model_name_or_path, max_seq_length=max_token_length)
    pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')
    train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                            name='relations',
                            cache_dir=nlp_cache_dir,
                            split=get_train_split(aspect, fold))
    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    # Filter for positive labels only
    train_ds = train_ds.filter(lambda row: row['label'] == 'y')
    logger.info(f'After filtering: {len(train_ds):,}')

    # Joint training on train and test?
    if train_on_test:
        # full_ds_table = pyarrow.concat_tables([train_ds.data, test_ds.data])
        # full_ds = Dataset(arrow_table=full_ds_table)
        raise NotImplementedError('TODO Evaluator')
    else:
        # Standard training on the train set only
        train_sds = DocumentPairSentencesDataset(docs_ds,
                                                 train_ds,
                                                 model,
                                                 max_length=max_token_length,
                                                 forced_length=0)
        train_sds.tokenize_all_docs()

        evaluator = NearestNeighborsEvaluator(model,
                                              docs_ds,
                                              test_ds,
                                              top_ks=top_ks,
                                              batch_size=eval_batch_size,
                                              show_progress_bar=True)

    if loss == 'cosine':
        train_loss = losses.CosineSimilarityLoss(model)
    elif loss == 'multiple_negatives_ranking':
        # A nice advantage of MultipleNegativesRankingLoss is that it only requires positive pairs
        # https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/quora_duplicate_questions
        train_loss = losses.MultipleNegativesRankingLoss(model)
    else:
        raise ValueError(f'Unsupported loss function: {loss}')

    train_dl = DataLoader(train_sds, shuffle=True, batch_size=train_batch_size)

    # Training
    model.fit(train_objectives=[(train_dl, train_loss)],
              epochs=train_epochs,  # try 1-4
              warmup_steps=100,
              evaluator=evaluator,
              evaluation_steps=evaluation_steps,  # increase to 5000 (full dataset => 20k steps)
              output_path=output_path,
              output_path_ignore_not_empty=True)

    logger.info('Training done')
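# MultipleNegativesRankingLoss treats all other pairs in a batch as negatives, so
# only positive (anchor, positive) text pairs are required. A sketch of an
# equivalent batch item using sentence-transformers' InputExample
# (DocumentPairSentencesDataset presumably yields a comparable structure):
def example_positive_pair(doc_a: dict, doc_b: dict):
    from sentence_transformers import InputExample

    return InputExample(texts=[
        doc_a['title'] + ': ' + doc_a['abstract'],
        doc_b['title'] + ': ' + doc_b['abstract'],
    ])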
def build_vectors(st_output_path: str,
                  hf_dataset: str,
                  aspect: str,
                  fold: Union[int, str],
                  include_all_docs: bool = False,
                  override: bool = False):
    """
    :param override:
    :param include_all_docs: Generate vectors for all documents in the corpus (including training data), not only the test set
    :param st_output_path: Path to Sentence Transformer model
    :param hf_dataset: Huggingface dataset path or name
    :param aspect:
    :param fold:
    :return:
    """
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'

    out_fn = 'pwc_id2vec__all_docs.w2v.txt' if include_all_docs else 'pwc_id2vec.w2v.txt'
    out_fp = os.path.join(st_output_path, out_fn)

    if not os.path.exists(st_output_path):
        logger.error(f'Sentence Transformer directory does not exist: {st_output_path}')
        return

    if os.path.exists(out_fp) and not override:
        logger.error(f'Output path exists already and override is disabled: {out_fp}')
        return

    # Inference with the best model
    best_model = SentenceTransformer(st_output_path)

    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')

    test_sds = DocumentPairSentencesDataset(docs_ds, test_ds, best_model)

    if include_all_docs:
        # Use all document ids
        input_paper_ids = set(docs_ds['paper_id'])
        logger.info(f'All documents in corpus: {len(input_paper_ids):,}')
    else:
        # Generate vectors from unique test documents only
        input_paper_ids = set(test_ds['from_paper_id']).union(set(test_ds['to_paper_id']))

    with open(out_fp, 'w') as f:
        # Header: vocabulary size and vector dimension (word2vec text format)
        f.write(f'{len(input_paper_ids)} {best_model.get_sentence_embedding_dimension()}\n')

        # Body: one line per document => <paper_id> <vector>
        for paper_id in tqdm(input_paper_ids, desc='Inference'):
            vec = [str(v) for v in best_model.encode(test_sds.get_text_from_doc(paper_id),
                                                     show_progress_bar=False)]

            assert len(vec) == best_model.get_sentence_embedding_dimension()

            vec_str = ' '.join(vec)
            f.write(f'{paper_id} {vec_str}\n')

    logger.info(f'Encoded {len(input_paper_ids):,} documents into {out_fp}')