Example #1
File: data_cli.py Project: j5bd/q
def train_fasttext(hf_dataset, output_dir):
    """

    Run with: $ ./data_cli.py train_fasttext paperswithcode_aspects ./output

    :return:
    """

    tokens_fp = os.path.join(output_dir, 'tokens.txt')
    fasttext_bin_fp = os.path.join(output_dir, 'fasttext.bin')
    fasttext_w2v_fp = os.path.join(output_dir, 'fasttext.w2v.txt')

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')

    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Tokenized text
    doc_delimiter = '\n'
    token_delimiter = ' '
    tokens_count = 0

    with open(tokens_fp, 'w') as f:
        for doc in docs_ds:
            # Extract plain text
            text = doc['title'] + ': ' + doc['abstract']

            for token in gensim.utils.simple_preprocess(text,
                                                        min_len=2,
                                                        max_len=15):
                f.write(token + token_delimiter)
                tokens_count += 1
            f.write(doc_delimiter)

    logger.info(f'Total tokens: {tokens_count:,}')

    # Train actual fasttext model
    logger.info('Training fasttext model...')

    model = fasttext.train_unsupervised(
        tokens_fp,
        model='skipgram',
        lr=0.05,  # learning rate [0.05]
        dim=300,  # size of word vectors [100]
        ws=5,  # size of the context window [5]
        epoch=5  # number of epochs [5]
        # thread            # number of threads [number of cpus]
    )
    model.save_model(fasttext_bin_fp)

    del model

    ft_model = FastText.load_fasttext_format(fasttext_bin_fp)
    ft_model.wv.save_word2vec_format(fasttext_w2v_fp)

    logger.info(f'Output saved to: {fasttext_w2v_fp}')

    logger.info('Done')
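
Note: the exported fasttext.w2v.txt is plain word2vec text format, so it can be loaded back with gensim for a quick sanity check. A minimal sketch; the query token 'convolutional' is only assumed to be in the vocabulary:

from gensim.models import KeyedVectors

# Load the vectors exported by train_fasttext
w2v_model = KeyedVectors.load_word2vec_format('./output/fasttext.w2v.txt')

# Nearest neighbors of an (assumed) in-vocabulary token
print(w2v_model.most_similar('convolutional', topn=5))
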
Example #2
File: data_cli.py Project: j5bd/q
def build_explirefit_inputs(hf_dataset, aspect, output_dir):
    """

    Run with: $ ./data_cli.py build_explirefit_inputs paperswithcode_aspects task ./output/

    Format of synonyms.txt (one pair per line, ids prefixed with "en_"):
    en_<doc_id_a> en_<doc_id_b>
    en_<doc_id_a> en_<doc_id_c>
    ...

    antonyms.txt uses the same format:
    en_<doc_id_a> en_<doc_id_b>
    en_<doc_id_a> en_<doc_id_c>
    ...

    :param aspect:
    :param hf_dataset:
    :param output_dir:
    :return:
    """

    for fold in [1, 2, 3, 4]:
        fold = str(fold)

        fold_dir = os.path.join(output_dir, fold)

        if not os.path.exists(fold_dir):
            os.makedirs(fold_dir)

        synonyms_fp = os.path.join(fold_dir, 'synonyms.txt')
        if os.path.exists(synonyms_fp):
            raise FileExistsError(f'Output exists already: {synonyms_fp}')

        antonyms_fp = os.path.join(fold_dir, 'antonyms.txt')
        if os.path.exists(antonyms_fp):
            raise FileExistsError(f'Output exists already: {antonyms_fp}')

        train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                                name='relations',
                                cache_dir='./data/nlp_cache',
                                split=get_train_split(aspect, fold))

        logger.info(f'Training samples: {len(train_ds):,}')

        with open(synonyms_fp, 'w') as synonyms_f:
            with open(antonyms_fp, 'w') as antonyms_f:

                for row in tqdm(train_ds, desc='Writing output'):
                    line = 'en_' + row['from_paper_id'] + ' en_' + row[
                        'to_paper_id'] + '\n'

                    if row['label'] == 'y':
                        synonyms_f.write(line)
                    elif row['label'] == 'n':
                        antonyms_f.write(line)
                    else:
                        raise ValueError(f'Unsupported label: {row}')

    logger.info('Done')
Example #3
File: data_cli.py Project: j5bd/q
def build_whoosh_index(index_dir: str, override=False):
    # use search index
    from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
    from whoosh.analysis import StemmingAnalyzer
    from whoosh import index
    from whoosh.qparser import QueryParser

    hf_dataset = 'paperswithcode_aspects'
    nlp_cache_dir = './data/nlp_cache'

    if os.path.exists(index_dir):
        if override:
            # os.rmdir fails on non-empty directories; rmtree removes the full index
            import shutil
            shutil.rmtree(index_dir)
        else:
            logger.error('Index dir exists already and override is disabled')
            return

    # Load meta data
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')

    paper_id2paper = {p['paper_id']: Paper(**p) for p in docs_ds}

    paper_schema = Schema(
        paper_id=ID(stored=True),
        title=TEXT(stored=True),
        abstract=TEXT(analyzer=StemmingAnalyzer()),
        paper_url=TEXT(),
        aspect_tasks=KEYWORD,
        aspect_methods=KEYWORD,
        aspect_datasets=KEYWORD,
    )

    # reset index via: $ rm -r ./output/pwc/whoosh_index
    logger.info('Creating new index')
    # index does not exist
    os.makedirs(index_dir)

    ix = index.create_in(index_dir, paper_schema)

    # save documents
    writer = ix.writer()

    for paper_id, paper in tqdm(paper_id2paper.items()):
        writer.add_document(**paper.__dict__)
        # break
    writer.commit()

    logger.info('Done')
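
Note: build_whoosh_index imports QueryParser but never queries the index itself. A minimal retrieval sketch against the index built above; the index path and query string are assumptions:

from whoosh import index
from whoosh.qparser import QueryParser

ix = index.open_dir('./output/pwc/whoosh_index')  # hypothetical index_dir

with ix.searcher() as searcher:
    # Search the stemmed abstract field; paper_id and title are stored fields
    query = QueryParser('abstract', ix.schema).parse('graph neural networks')
    for hit in searcher.search(query, limit=10):
        print(hit['paper_id'], hit['title'])
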
Example #4
File: data_cli.py Project: j5bd/q
def build_avg_word_vectors(hf_dataset, w2v_path, output_path, override=False):
    """

    Run with: $ ./data_cli.py build_avg_word_vectors paperswithcode_aspects ./output/fasttext.w2v.txt ./output/pwc_doc_id2avg_fasttext.w2v.txt

    :param hf_dataset:
    :param w2v_path:
    :param output_path:
    :param override:
    :return:
    """
    stop_words = 'english'
    count_vector_size = 100000

    if os.path.exists(output_path):
        if override:
            logger.debug(f'Override {output_path}')
            os.remove(output_path)
        else:
            logger.info(
                f'Stop. Output file exists already (override disabled): {output_path}'
            )
            return

    w2v_model = KeyedVectors.load_word2vec_format(w2v_path)
    doc_model = KeyedVectors(vector_size=w2v_model.vector_size)

    count_vec = CountVectorizer(stop_words=stop_words,
                                analyzer='word',
                                lowercase=True,
                                ngram_range=(1, 1),
                                max_features=count_vector_size)

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Extract plain text
    texts = []
    doc_id2idx = {}
    idx2doc_id = {}

    for idx, doc in enumerate(docs_ds):
        # Extract plain text
        texts.append(doc['title'] + ': ' + doc['abstract'])
        doc_id2idx[doc['paper_id']] = idx
        idx2doc_id[idx] = doc['paper_id']

    # Transforms the data into a bag of words
    count_train = count_vec.fit(texts)
    idx2bow = count_vec.transform(texts)
    vidx2word = {v: k for k, v in count_train.vocabulary_.items()}

    assert len(vidx2word) == len(count_train.vocabulary_)

    logger.info(f'Vocab size: {len(count_train.vocabulary_)}')

    for idx, text in enumerate(
            tqdm(texts, total=len(texts), desc='Converting docs to vectors')):
        bow = idx2bow[idx].A[0]

        vectors = []
        weights = []

        for _idx, count in enumerate(bow):
            if count > 0:
                word = vidx2word[_idx]
                try:
                    v = w2v_model.get_vector(word)
                    vectors.append(v)
                    weights.append(count)
                except KeyError:
                    # skip out-of-vocabulary words
                    pass

        # Check if at least one document term exists as word vector
        if vectors and weights:
            # Weight avg
            doc = np.average(np.array(vectors),
                             axis=0,
                             weights=np.array(weights))

            # Add to model with doc_id
            doc_model.add([str(idx2doc_id[idx])], [doc])
        else:
            logger.debug(
                f'Cannot add document {idx2doc_id[idx]} due to missing word vectors'
            )

    # Save to disk
    doc_model.save_word2vec_format(output_path)
    logger.info(f'Saved to: {output_path}')
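
Note: the document vector is the term-frequency-weighted mean of its word vectors, doc = sum_w(count_w * v_w) / sum_w(count_w). A tiny self-contained check of the np.average call used above:

import numpy as np

vectors = np.array([[1.0, 0.0], [0.0, 1.0]])  # two word vectors
weights = np.array([3, 1])                    # their counts in the document

doc = np.average(vectors, axis=0, weights=weights)
assert np.allclose(doc, [0.75, 0.25])  # (3*v1 + 1*v2) / 4
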
Example #5
File: data_cli.py Project: j5bd/q
def build_specter_input(hf_dataset: str,
                        aspect,
                        fold: Union[str, int],
                        output_path: str,
                        override: bool = False) -> None:
    """
    Run with: $ ./data_cli.py build_specter_input paperswithcode_aspects task 1 ./output/specter_input/1

    Builds the following files (needed for SPECTER training):
    - data.json containing the document ids and their relationship.
    - metadata.json containing a mapping of document ids to textual fields (e.g., title, abstract)
    - train.txt, val.txt, test.txt containing document ids corresponding to train/val/test sets (one doc id per line).

    Data structure:
    - count = 5 (same aspect, label == 'y')
    - count = 1 (negative pair, label == 'n')

    :param aspect:
    :param hf_dataset:
    :param fold:
    :param output_path:
    :param override:
    :return:
    """
    nlp_cache_dir = './data/nlp_cache'

    if os.path.exists(output_path) and not override:
        logger.error(f'Output path exists already: {output_path}')
        sys.exit(1)
    else:
        os.makedirs(output_path)

    train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                            name='relations',
                            cache_dir=nlp_cache_dir,
                            split=get_train_split(aspect, fold))

    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')

    # metadata
    metadata = {}
    for doc in docs_ds:
        metadata[doc['paper_id']] = {
            'paper_id': doc['paper_id'],
            'title': doc['title'],
            'abstract': doc['abstract'],
        }
    logger.info('Writing metadata')
    with open(os.path.join(output_path, 'metadata.json'), 'w') as f:
        json.dump(metadata, f)

    # train/val/test ids
    train_doc_ids = set()
    test_doc_ids = set()

    # data
    data = defaultdict(dict)
    for pair in train_ds:
        # TODO include negative samples?
        count = 5 if pair['label'] == 'y' else 1
        data[pair['from_paper_id']][pair['to_paper_id']] = {'count': count}

        train_doc_ids.add(pair['from_paper_id'])
        train_doc_ids.add(pair['to_paper_id'])

    for pair in test_ds:
        count = 5 if pair['label'] == 'y' else 1
        data[pair['from_paper_id']][pair['to_paper_id']] = {'count': count}

        test_doc_ids.add(pair['from_paper_id'])
        test_doc_ids.add(pair['to_paper_id'])

    logger.info('Writing data')
    with open(os.path.join(output_path, 'data.json'), 'w') as f:
        json.dump(data, f)

    train_doc_ids = list(train_doc_ids)
    full_test_doc_ids = list(test_doc_ids)
    random.shuffle(full_test_doc_ids)

    split_at = int(0.1 * len(full_test_doc_ids))

    val_doc_ids = full_test_doc_ids[:split_at]
    test_doc_ids = full_test_doc_ids[split_at:]

    logger.info('Writing train/val/test')
    with open(os.path.join(output_path, 'train.txt'), 'w') as f:
        for i in train_doc_ids:
            f.write(i + '\n')
    with open(os.path.join(output_path, 'val.txt'), 'w') as f:
        for i in val_doc_ids:
            f.write(i + '\n')
    with open(os.path.join(output_path, 'test.txt'), 'w') as f:
        for i in test_doc_ids:
            f.write(i + '\n')

    logger.info('Done')
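
Note: for reference, the data.json written above maps each source paper to its paired papers with a per-pair count; the ids below are hypothetical:

data_example = {
    'paper-123': {
        'paper-456': {'count': 5},  # label == 'y' (same aspect)
        'paper-789': {'count': 1},  # label == 'n' (negative pair)
    }
}
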
Example #6
File: data_cli.py Project: j5bd/q
def build_specter_vectors(hf_dataset: str,
                          specter_path: str,
                          output_path: str,
                          cuda_device: int = -1,
                          batch_size: int = 32,
                          vector_size: int = 768,
                          override=False):
    """
    Run with: $ ./data_cli.py build_specter_vectors paperswithcode_aspects ./specter_archive ./output/pwc_doc_id2specter.w2v.txt --cuda_device=5

    Download specter:
    $ wget https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/specter/archive.tar.gz
    $ tar -xzvf archive.tar.gz

    :param vector_size:
    :param output_path: ./output
    :param override:
    :param cuda_device:
    :param batch_size:
    :param hf_dataset:
    :param specter_path: Path to specter
    :return:
    """
    from specter.predict_command import predictor_from_archive
    from allennlp.models import load_archive

    # load to register
    from specter.model import Model
    from specter.data import DataReader, DataReaderFromPickled
    from specter.predictor import SpecterPredictor

    if Model and DataReader and SpecterPredictor:
        pass

    if os.path.exists(output_path) and not override:
        logger.error(f'Output file exists already: {output_path}')
        return

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')
    papers_to_embed = [doc for doc in docs_ds]

    # Specter settings
    archive_path = os.path.join(specter_path, 'model.tar.gz')
    metadata_path = os.path.join(specter_path, 'metadata_sample.json')
    included_text_fields = 'abstract title'
    vocab_dir = os.path.join(specter_path, 'data/vocab/')

    cuda_device = int(cuda_device)

    overrides = f"{{'model':{{'predict_mode':'true','include_venue':'false'}},'dataset_reader':{{'type':'specter_data_reader','predict_mode':'true','paper_features_path':'{metadata_path}','included_text_fields': '{included_text_fields}'}},'vocabulary':{{'directory_path':'{vocab_dir}'}}}}"

    logger.info(f'SPECTER overrides: {overrides}')

    archive = load_archive(archive_path,
                           cuda_device=cuda_device,
                           overrides=overrides)

    predictor = predictor_from_archive(archive,
                                       predictor_name='specter_predictor',
                                       paper_features_path=metadata_path)

    # Batches
    def chunks(lst, chunk_size):
        """Splits a longer list to respect batch size"""
        for i in range(0, len(lst), chunk_size):
            yield lst[i:i + chunk_size]

    batches_count = (len(papers_to_embed) + batch_size - 1) // batch_size  # ceil so tqdm shows the correct total
    batch_embed_papers = []

    # 30min on GPU
    for batch in tqdm(chunks(papers_to_embed, batch_size),
                      total=batches_count):
        batch_out = predictor.predict_batch_json(batch)
        batch_embed_papers += batch_out

    # To keyed vectors
    doc_model = KeyedVectors(vector_size=vector_size)

    for embed_paper in tqdm(batch_embed_papers):
        doc_model.add([embed_paper['paper_id']], [embed_paper['embedding']])

    # Save to disk
    doc_model.save_word2vec_format(output_path)

    logger.info('Done')
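
Note: the single-line overrides string above is hard to read; expanded as a Python dict it has the following structure (the two paths are hypothetical placeholders, filled from specter_path in the function):

metadata_path = './specter_archive/metadata_sample.json'  # hypothetical
vocab_dir = './specter_archive/data/vocab/'               # hypothetical

overrides_dict = {
    'model': {'predict_mode': 'true', 'include_venue': 'false'},
    'dataset_reader': {
        'type': 'specter_data_reader',
        'predict_mode': 'true',
        'paper_features_path': metadata_path,
        'included_text_fields': 'abstract title',
    },
    'vocabulary': {'directory_path': vocab_dir},
}
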
Example #7
File: data_cli.py Project: j5bd/q
def build_transformers_vectors(hf_dataset: str,
                               model_name_or_path: str,
                               output_path: str,
                               pooling: str,
                               batch_size: int = 16,
                               override: bool = False):
    """

    $ ./data_cli.py build_transformers_vectors paperswithcode_aspects scibert-scivocab-uncased ./output/scibert-cls --pooling=cls --batch_size=16

    :param hf_dataset:
    :param model_name_or_path:
    :param output_path:
    :param pooling:
    :param override:
    :return:
    """

    env = get_env()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pooling_strategies = ['cls', 'mean']

    if os.path.exists(output_path) and not override:
        logger.error(f'Output file exists already: {output_path}')
        sys.exit(1)

    if pooling not in pooling_strategies:
        raise ValueError(f'Invalid pooling: {pooling}')

    # Model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Model
    model = AutoModel.from_pretrained(model_name_or_path)
    model = model.to(device)

    # Tokenize docs
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    texts = [doc['title'] + ': ' + doc['abstract'] for doc in docs_ds]

    inputs = tokenizer(texts,
                       add_special_tokens=True,
                       return_tensors='pt',
                       padding=True,
                       max_length=model.config.max_position_embeddings,
                       truncation=True,
                       return_token_type_ids=False,
                       return_attention_mask=True)

    ds = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
    dl = DataLoader(ds, shuffle=False, batch_size=batch_size)

    # Vectors
    doc_model = KeyedVectors(vector_size=model.config.hidden_size)

    with torch.no_grad():
        for batch_idx, batch_data in enumerate(tqdm(dl, desc='Inference')):
            batch_data = tuple(t.to(device) for t in batch_data)

            outputs = model(*batch_data, return_dict=True)

            if pooling == 'cls':
                batch_embeddings = outputs['pooler_output'].detach().cpu().numpy()
            elif pooling == 'mean':
                batch_embeddings = np.mean(
                    outputs['last_hidden_state'].detach().cpu().numpy(),
                    axis=1)
            else:
                raise NotImplementedError()

            batch_ids = docs_ds[batch_idx * batch_size:batch_idx * batch_size +
                                batch_size]['paper_id']
            doc_model.add(batch_ids, batch_embeddings)

    # Save to disk
    doc_model.save_word2vec_format(output_path)

    logger.info('Done')
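
Note: the 'mean' pooling branch above averages over all token positions, padding included. A common alternative (a sketch, not part of this codebase) masks padding via the attention mask before averaging:

import torch

def masked_mean_pooling(last_hidden_state, attention_mask):
    """Average token embeddings while ignoring padded positions."""
    mask = attention_mask.unsqueeze(-1).float()     # (batch, seq_len, 1)
    summed = (last_hidden_state * mask).sum(dim=1)  # (batch, hidden)
    counts = mask.sum(dim=1).clamp(min=1e-9)        # avoid division by zero
    return summed / counts
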
Example #8
def main():
    # Auto-environment
    env = get_env()

    parser = HfArgumentParser(
        (ModelArguments, TrainingArguments, ExperimentArguments))
    model_args, training_args, experiment_args = parser.parse_args_into_dataclasses(
    )

    # Adjust output with folds and model name
    #TODO disabled
    # training_args.output_dir = os.path.join(training_args.output_dir, str(experiment_args.cv_fold), model_args.get_model_name())

    # Model path from env
    if not os.path.exists(model_args.model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_args.model_name_or_path)):
        model_args.model_name_or_path = os.path.join(
            env['bert_dir'], model_args.model_name_or_path)

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Dataset args
    label_classes = get_label_classes_from_hf_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset))
    num_labels = len(label_classes)

    if num_labels > 1 and experiment_args.binary_classification:
        # In binary classification we have only single label (with y=[0;1])
        num_labels = 1
        logger.warning(f'Forcing label classes to binary: {label_classes}')

    columns = ['input_ids', 'attention_mask', 'token_type_ids',
               'labels']  # Input to transformers.forward

    # Build dataset for splits
    train_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='relations',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=get_train_split(experiment_args.aspect, experiment_args.cv_fold))
    test_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='relations',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=get_test_split(experiment_args.aspect, experiment_args.cv_fold))
    docs_ds = load_dataset(get_local_hf_dataset_path(
        experiment_args.hf_dataset),
                           name='docs',
                           cache_dir=experiment_args.hf_dataset_cache_dir,
                           split=datasets.Split('docs'))

    # Forced limit
    if experiment_args.dataset_limit > 0:
        logger.info(
            f'Train and test datasets limited to {experiment_args.dataset_limit} samples'
        )

        train_ds = Dataset(train_ds.data[:experiment_args.dataset_limit])
        test_ds = Dataset(test_ds.data[:experiment_args.dataset_limit])

    # Build ID => Doc mapping
    doc_id2doc = {doc[experiment_args.doc_id_col]: doc for doc in docs_ds}

    if model_args.model_name_or_path.startswith('baseline-rnn'):
        # Load Spacy as tokenizer
        spacy_nlp = spacy.load(experiment_args.spacy_model,
                               disable=["tagger", "ner", "textcat"])

        if experiment_args.multi_label:
            # Baseline models
            model = RNNForMultiLabelSequenceClassification(
                word_vectors=get_vectors_from_spacy_model(spacy_nlp),
                hidden_size=experiment_args.rnn_hidden_size,
                rnn=experiment_args.rnn_type,
                num_labels=num_labels,
                num_layers=experiment_args.rnn_num_layers,
                dropout=experiment_args.rnn_dropout,
            )
        else:
            raise NotImplementedError(
                'RNN baseline is only available for multi label classification'
            )

        tokenizer = None

    else:
        # Load pretrained Transformers models and tokenizers
        model_config = AutoConfig.from_pretrained(
            model_args.model_name_or_path,
            num_labels=num_labels,
            cache_dir=model_args.cache_dir)

        # No need for spacy
        spacy_nlp = None

        if 'longformer' in model_args.model_name_or_path:
            # TVM: a custom CUDA kernel implementation of our sliding window attention (works only on GPU)
            model_config.attention_mode = 'tvm'

            # override tokenizer name if not set
            if model_args.tokenizer_name is None:
                roberta_path = os.path.join(env['bert_dir'], 'roberta-base')
                model_args.tokenizer_name = roberta_path if os.path.exists(
                    roberta_path) else 'roberta-base'

                logger.info(
                    f'Overriding tokenizer: {model_args.tokenizer_name}')

            # override max length
            experiment_args.max_length = 4096

        if experiment_args.multi_label:
            model_cls = AutoModelForMultiLabelSequenceClassification
        else:
            model_cls = AutoModelForSequenceClassification

        model = model_cls.from_pretrained(model_args.model_name_or_path,
                                          config=model_config,
                                          cache_dir=model_args.cache_dir)
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name
            if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        # Set token limit if defined by model (for Longformer)
        if model.config.max_position_embeddings > 0:
            tokenizer.model_max_length = model.config.max_position_embeddings

    # Init helper
    dpt = DocRelTrainerHelper(
        id2doc=doc_id2doc,
        transformers_tokenizer=tokenizer,
        spacy_nlp=spacy_nlp,
        label_classes=label_classes,
        binary_classification=experiment_args.binary_classification,
        doc_a_col=experiment_args.doc_a_col,
        doc_b_col=experiment_args.doc_b_col,
        label_col=experiment_args.label_col,
        text_from_doc_func=get_non_empty_text_from_doc,
        classification_threshold=experiment_args.classification_threshold,
        max_length=experiment_args.max_length,
        multi_label=experiment_args.multi_label,
    )

    logger.info('Converting to features (doc mapping, tokenize, ...)')

    # Build hash from settings for caching
    data_settings_hash = hashlib.md5(
        dataclasses.asdict(experiment_args).__str__().encode("utf-8") +
        dataclasses.asdict(model_args).__str__().encode("utf-8")).hexdigest()

    train_tensor_ds = train_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(
            experiment_args.hf_dataset_cache_dir,
            "cache-train-" + data_settings_hash + ".arrow"))
    train_tensor_ds.set_format(type='torch', columns=columns)

    test_tensor_ds = test_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(
            experiment_args.hf_dataset_cache_dir,
            "cache-test-" + data_settings_hash + ".arrow"))
    test_tensor_ds.set_format(type='torch', columns=columns)

    logger.info(f'Dataset columns: {columns}')
    logger.info(f'Train sample: {train_ds[0]}')
    logger.debug(f'- as tensor: {train_tensor_ds[0]}')

    logger.info(f'Test sample: {test_ds[0]}')
    logger.debug(f'- as tensor: {test_tensor_ds[0]}')

    # Load models weights (when no training but predictions)
    model_weights_path = os.path.join(training_args.output_dir,
                                      'pytorch_model.bin')

    if not training_args.do_train and experiment_args.save_predictions:
        logger.info(
            f'Loading existing model weights from disk: {model_weights_path}')
        if os.path.exists(model_weights_path):
            model.load_state_dict(torch.load(model_weights_path))
        else:
            logger.error('Weights file does not exist!')

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tensor_ds,
        eval_dataset=test_tensor_ds,
        data_collator=DocRelDataCollator(),
        #prediction_loss_only=False,
        compute_metrics=dpt.compute_metrics,
    )

    # Log additional config (to Weights & Biases)
    if is_wandb_available():
        extra_config = {}
        extra_config.update(dataclasses.asdict(experiment_args))
        extra_config.update(dataclasses.asdict(model_args))

        wandb.config.update(extra_config, allow_val_change=True)

    if training_args.do_train:
        logger.info('Training started...')
        trainer.train()

        if isinstance(model, PreTrainedModel):
            trainer.save_model()
            tokenizer.save_pretrained(training_args.output_dir)

        elif isinstance(model, nn.Module):  # RNN model
            torch.save(model.state_dict(), model_weights_path)

    if experiment_args.save_predictions:
        logger.info('Predicting...')

        predictions = trainer.predict(test_tensor_ds)

        df = dpt.get_df_from_predictions(test_ds,
                                         docs_ds,
                                         predictions,
                                         exclude_columns=['abstract'])

        # Save results to disk
        df.to_csv(os.path.join(training_args.output_dir, 'results.csv'),
                  index=False)
        with open(os.path.join(training_args.output_dir, 'metrics.json'), 'w') as f:
            json.dump(predictions.metrics, f)

    logger.info('Done')
Example #9
File: eval_cli.py Project: j5bd/q
    def get_evaluation_df(name, doc_model, hf_dataset, aspect,
                          fold) -> Tuple[DataFrame, Dict]:
        # Init dataframe
        metrics = [
            'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
            'precision', 'recall', 'avg_p', 'reciprocal_rank', 'ndcg'
        ]
        df = pd.DataFrame([],
                          columns=['name', 'aspect', 'fold', 'top_k'] +
                          metrics)

        # Dataset
        test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                               name='relations',
                               cache_dir='./data/nlp_cache',
                               split=get_test_split(aspect, fold))

        logger.info(f'Test samples: {len(test_ds):,}')

        # Unique paper IDs in test set
        test_paper_ids = set(test_ds['from_paper_id']).union(
            set(test_ds['to_paper_id']))

        logger.info(f'Test paper IDs: {len(test_paper_ids):,}')
        logger.info(f'Examples: {list(test_paper_ids)[:10]}')

        # Relevance mapping
        doc_id2related_ids = defaultdict(set)  # type: Dict[str, Set[str]]
        for row in test_ds:
            if row['label'] == 'y':
                a = row['from_paper_id']
                b = row['to_paper_id']
                doc_id2related_ids[a].add(b)
                doc_id2related_ids[b].add(a)

        # Filter for documents in test set
        test_doc_model = KeyedVectors(vector_size=doc_model.vector_size)
        test_doc_ids = []
        test_doc_vectors = []
        missed_doc_ids = 0

        for doc_id in doc_model.vocab:
            if doc_id in test_paper_ids:
                vec = doc_model.get_vector(doc_id)
                if len(vec) != doc_model.vector_size:
                    raise ValueError(
                        f'Test document has invalid shape: {doc_id} => {vec.shape}'
                    )

                test_doc_ids.append(doc_id)
                test_doc_vectors.append(vec)
            else:
                missed_doc_ids += 1
                # logger.warning(f'Document ID is not part of test set: {doc_id} ({type(doc_id)})')

        if len(test_doc_ids) != len(test_doc_vectors):
            raise ValueError(
                f'Number of test document IDs does not match vector count: {len(test_doc_ids)} vs {len(test_doc_vectors)}'
            )

        logger.info(
            f'Test document IDs: {len(test_doc_ids)} (missed {missed_doc_ids})'
        )
        logger.info(f'Test document vectors: {len(test_doc_vectors)}')

        test_doc_model.add(test_doc_ids, test_doc_vectors)
        test_doc_model.init_sims(replace=True)

        logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')

        # Actual evaluation
        # k2eval_rows = defaultdict(list)
        seed_ids_without_recommendations = []
        max_top_k = max(top_ks)
        eval_rows = {top_k: defaultdict(list)
                     for top_k in top_ks
                     }  # top_k => metric_name => list of value

        seed_id2ret_docs = {}

        for seed_id in tqdm(
                test_paper_ids,
                desc=f'Evaluation ({name},aspect={aspect},fold={fold})'):
            try:
                rel_docs = doc_id2related_ids[seed_id]
                max_ret_docs = [
                    d
                    for d, score in test_doc_model.most_similar(seed_id,
                                                                topn=max_top_k)
                ]
                seed_id2ret_docs[seed_id] = max_ret_docs

                for top_k in top_ks:
                    ret_docs = max_ret_docs[:top_k]
                    rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))

                    if ret_docs and rel_docs:
                        # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                        precision = rel_ret_docs_count / len(ret_docs)

                        # Recall = No. of relevant documents retrieved / No. of total relevant documents
                        recall = rel_ret_docs_count / len(rel_docs)

                        # Avg. precision (for MAP)
                        avg_p = get_avg_precision(ret_docs, rel_docs)

                        # Reciprocal rank (for MRR)
                        reciprocal_rank = get_reciprocal_rank(
                            ret_docs, rel_docs)

                        # NDCG@k
                        predicted_relevance = [
                            1 if ret_doc_id in rel_docs else 0
                            for ret_doc_id in ret_docs
                        ]
                        true_relevances = [1] * len(rel_docs)
                        ndcg_value = compute_dcg_at_k(
                            predicted_relevance, top_k) / compute_dcg_at_k(
                                true_relevances, top_k)

                        # Save metrics
                        eval_rows[top_k]['retrieved_docs'].append(
                            len(ret_docs))
                        eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                        eval_rows[top_k]['relevant_retrieved_docs'].append(
                            rel_ret_docs_count)
                        eval_rows[top_k]['precision'].append(precision)
                        eval_rows[top_k]['recall'].append(recall)
                        eval_rows[top_k]['avg_p'].append(avg_p)
                        eval_rows[top_k]['reciprocal_rank'].append(
                            reciprocal_rank)
                        eval_rows[top_k]['ndcg'].append(ndcg_value)

            except (IndexError, ValueError, KeyError) as e:
                seed_ids_without_recommendations.append(seed_id)

                logger.warning(
                    f'Cannot retrieve recommendations for #{seed_id}: {e}')

        logger.info(
            f'Completed with {len(eval_rows[top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
        )

        # Summarize evaluation
        for top_k in top_ks:
            try:
                row = [name, aspect, fold, top_k]
                for metric in metrics:
                    # mean over all metrics
                    values = eval_rows[top_k][metric]
                    if len(values) > 0:
                        row.append(np.mean(values))
                    else:
                        row.append(None)

                df.loc[len(df)] = row

            except ValueError as e:
                logger.error(
                    f'Cannot summarize row: {top_k} {fold} {metrics} {e}')

        return df, seed_id2ret_docs
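
Note: the helpers get_avg_precision, get_reciprocal_rank and compute_dcg_at_k are not part of this excerpt. A sketch based on standard IR definitions (an assumption; the project's actual implementations, e.g. the AP normalization, may differ):

import numpy as np

def get_avg_precision(ret_docs, rel_docs):
    # Mean of precision@i over the ranks i at which a relevant doc was retrieved
    hits, precision_sum = 0, 0.0
    for i, doc_id in enumerate(ret_docs, start=1):
        if doc_id in rel_docs:
            hits += 1
            precision_sum += hits / i
    return precision_sum / min(len(rel_docs), len(ret_docs)) if hits else 0.0

def get_reciprocal_rank(ret_docs, rel_docs):
    # 1 / rank of the first relevant retrieved document
    for i, doc_id in enumerate(ret_docs, start=1):
        if doc_id in rel_docs:
            return 1.0 / i
    return 0.0

def compute_dcg_at_k(relevances, k):
    # DCG@k with binary relevance: sum_i rel_i / log2(i + 1), ranks starting at i=1
    return sum(rel / np.log2(i + 2) for i, rel in enumerate(relevances[:k]))
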
Example #10
File: eval_cli.py Project: j5bd/q
def evaluate_vectors(hf_dataset: str, aspect: str, input_path: str, name: str,
                     folds: Union[str, list], top_ks: Union[str, list],
                     output_path: str):
    """

    Run with: $ ./eval_cli.py evaluate_vectors paperswithcode_aspects task ./output/pwc_doc_id2st.txt --name=sentence_transformers --folds=1,2,3,4 --top_ks=5,10,25,50 --output_path=./output/eval.csv

    :param aspect:
    :param folds:
    :param top_ks:
    :param name:
    :param hf_dataset:
    :param input_path:
    :param output_path:
    :return:
    """

    if isinstance(folds, str):
        folds = folds.split(',')
    elif isinstance(folds, int):
        folds = [folds]

    if isinstance(top_ks, str):
        # Cast to int so that max() and list slicing below behave numerically
        top_ks = [int(k) for k in top_ks.split(',')]
    elif isinstance(top_ks, int):
        top_ks = [top_ks]

    logger.info(f'Folds: {folds}')
    logger.info(f'Top-Ks: {top_ks}')

    if len(folds) < 1:
        logger.error('No folds provided')
        return

    if len(top_ks) < 1:
        logger.error('No top-k values provided')
        return

    # Load documents
    doc_model = KeyedVectors.load_word2vec_format(input_path)
    logger.info(f'Document vectors: {doc_model.vectors.shape}')

    # Normalize vectors
    doc_model.init_sims(replace=True)

    # Init dataframe
    metrics = [
        'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
        'precision', 'recall', 'avg_p', 'reciprocal_rank'
    ]
    df = pd.DataFrame([], columns=['name', 'fold', 'top_k'] + metrics)

    # Iterate over folds
    for fold in folds:
        logger.info(f'Current fold: {fold}')

        # Dataset
        test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                               name='relations',
                               cache_dir='./data/nlp_cache',
                               split=get_test_split(aspect, fold))

        logger.info(f'Test samples: {len(test_ds):,}')

        # Unique paper IDs in test set
        test_paper_ids = set(test_ds['from_paper_id']).union(
            set(test_ds['to_paper_id']))

        logger.info(f'Test paper IDs: {len(test_paper_ids):,}')
        logger.info(f'Examples: {list(test_paper_ids)[:10]}')

        # Relevance mapping
        doc_id2related_ids = defaultdict(set)  # type: Dict[str, Set[str]]
        for row in test_ds:
            if row['label'] == 'y':
                a = row['from_paper_id']
                b = row['to_paper_id']
                doc_id2related_ids[a].add(b)
                doc_id2related_ids[b].add(a)

        # Filter for documents in test set
        test_doc_model = KeyedVectors(vector_size=doc_model.vector_size)
        test_doc_ids = []
        test_doc_vectors = []
        missed_doc_ids = 0

        for doc_id in doc_model.vocab:
            if doc_id in test_paper_ids:
                vec = doc_model.get_vector(doc_id)
                if len(vec) != doc_model.vector_size:
                    raise ValueError(
                        f'Test document has invalid shape: {doc_id} => {vec.shape}'
                    )

                test_doc_ids.append(doc_id)
                test_doc_vectors.append(vec)
            else:
                missed_doc_ids += 1
                # logger.warning(f'Document ID is not part of test set: {doc_id} ({type(doc_id)})')

        if len(test_doc_ids) != len(test_doc_vectors):
            raise ValueError(
                f'Number of test document IDs does not match vector count: {len(test_doc_ids)} vs {len(test_doc_vectors)}'
            )

        logger.info(
            f'Test document IDs: {len(test_doc_ids)} (missed {missed_doc_ids})'
        )
        logger.info(f'Test document vectors: {len(test_doc_vectors)}')

        test_doc_model.add(test_doc_ids, test_doc_vectors)
        test_doc_model.init_sims(replace=True)

        logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')

        # Actual evaluation
        # k2eval_rows = defaultdict(list)
        seed_ids_without_recommendations = []
        max_top_k = max(top_ks)
        eval_rows = {top_k: defaultdict(list)
                     for top_k in top_ks
                     }  # top_k => metric_name => list of value

        for seed_id in tqdm(test_paper_ids, desc=f'Evaluation (fold={fold})'):
            try:
                rel_docs = doc_id2related_ids[seed_id]
                max_ret_docs = [
                    d
                    for d, score in test_doc_model.most_similar(seed_id,
                                                                topn=max_top_k)
                ]
                for top_k in top_ks:
                    ret_docs = max_ret_docs[:top_k]
                    rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))

                    if ret_docs and rel_docs:
                        # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                        precision = rel_ret_docs_count / len(ret_docs)

                        # Recall = No. of relevant documents retrieved / No. of total relevant documents
                        recall = rel_ret_docs_count / len(rel_docs)

                        # Avg. precision (for MAP)
                        avg_p = get_avg_precision(ret_docs, rel_docs)

                        # Reciprocal rank (for MRR)
                        reciprocal_rank = get_reciprocal_rank(
                            ret_docs, rel_docs)

                        # # NDCG@k
                        # predicted_relevance = [1 if ret_doc_id in rel_docs else 0 for ret_doc_id in ret_docs]
                        # true_relevances = [1] * len(rel_docs)
                        # ndcg_value = self.compute_dcg_at_k(predicted_relevance, top_k) / self.compute_dcg_at_k(true_relevances, top_k)

                        # Save metrics
                        eval_rows[top_k]['retrieved_docs'].append(
                            len(ret_docs))
                        eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                        eval_rows[top_k]['relevant_retrieved_docs'].append(
                            rel_ret_docs_count)
                        eval_rows[top_k]['precision'].append(precision)
                        eval_rows[top_k]['recall'].append(recall)
                        eval_rows[top_k]['avg_p'].append(avg_p)
                        eval_rows[top_k]['reciprocal_rank'].append(
                            reciprocal_rank)

            except (IndexError, ValueError, KeyError) as e:
                seed_ids_without_recommendations.append(seed_id)

                logger.warning(
                    f'Cannot retrieve recommendations for #{seed_id}: {e}')

        logger.info(
            f'Completed with {len(eval_rows[top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
        )

        # Summarize evaluation
        for top_k in top_ks:
            try:
                row = [name, fold, top_k]
                for metric in metrics:
                    # mean over all metrics
                    values = eval_rows[top_k][metric]
                    if len(values) > 0:
                        row.append(np.mean(values))
                    else:
                        row.append(None)

                df.loc[len(df)] = row

            except ValueError as e:
                logger.error(
                    f'Cannot summarize row: {top_k} {fold} {metrics} {e}')

    logger.info(f'Writing {len(df)} rows to {output_path}')

    if os.path.exists(output_path):
        # Append new rows to evaluation file
        df.to_csv(output_path, mode='a', header=False, index=False)
    else:
        # Write new files
        df.to_csv(output_path, header=True, index=False)

    logger.info('Done')
Example #11
def train(model_name_or_path: str,
          hf_dataset: str,
          aspect: str,
          fold: Union[int, str],
          output_path: str,
          train_epochs: int = 3,
          train_batch_size: int = 25,
          eval_batch_size: int = 32,
          evaluation_steps: int = 5000,
          train_on_test: bool = False,
          loss: str = 'multiple_negatives_ranking',
          override: bool = False):
    """

    # $MODEL_NAME $HF_DATASET $ASPECT $FOLD $OUTPUT_DIR --train_epochs=3 --train_batch_size=$TRAIN_BATCH_SIZE --eval_batch_size=$EVAL_BATCH_SIZE

    Run with:
    $ export CUDA_VISIBLE_DEVICES=1
    $ ./sentence_transformer_cli.py train scibert-scivocab-uncased paperswithcode_task_docs 1 ./output/st_scibert/1 --train_epochs=3 --train_batch_size=25 --eval_batch_size=32


    :param loss: Training loss function (choices: multiple_negatives_ranking, cosine)
    :param train_on_test: If True, joint training on train and test set (validation disabled)
    :param aspect:
    :param evaluation_steps:
    :param train_epochs:
    :param model_name_or_path:
    :param hf_dataset:
    :param fold:
    :param output_path:
    :param train_batch_size:
    :param eval_batch_size:
    :param override:
    :return:
    """

    top_ks = [5, 10, 25, 50]
    # cuda_device = -1

    # hf_dataset = 'paperswithcode_task_docs'
    # model_name_or_path = 'scibert-scivocab-uncased'
    # fold = 1
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'

    # train_batch_size = 25
    # eval_batch_size = 32
    # override = False

    # output_path = './output/pwc_task_st/1/sci-bert'
    # output_path = os.path.join(output_path, str(fold), model_name_or_path)  # output/1/sci-bert

    if os.path.exists(output_path) and not override:
        logger.error(f'Stop. Output path exists already: {output_path}')
        sys.exit(1)

    # if cuda_device >= 0:
    #     os.environ["CUDA_VISIBLE_DEVICES"] = str(cuda_device)

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    word_embedding_model = Transformer(model_name_or_path,
                                       max_seq_length=max_token_length)
    pooling_model = Pooling(
        word_embedding_model.get_word_embedding_dimension())

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    # tokenizer = BertTokenizer.from_pretrained(model_name_or_path)

    # dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')
    train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                            name='relations',
                            cache_dir=nlp_cache_dir,
                            split=get_train_split(aspect, fold))
    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    # filter for positive labels only
    train_ds = train_ds.filter(lambda row: row['label'] == 'y')

    logger.info(f'After filtering: {len(train_ds):,}')

    # joint training on train and test?
    if train_on_test:
        #
        # import pyarrow
        # from datasets.arrow_dataset import Dataset
        #
        # full_ds_table = pyarrow.concat_tables([train_ds.data, test_ds.data])
        # full_ds = Dataset(arrow_table=full_ds_table)
        raise NotImplementedError('TODO Evaluator')
    else:
        # standard training: fit on the train split, evaluate on the test split
        train_sds = DocumentPairSentencesDataset(docs_ds,
                                                 train_ds,
                                                 model,
                                                 max_length=max_token_length,
                                                 forced_length=0)
        train_sds.tokenize_all_docs()

        evaluator = NearestNeighborsEvaluator(model,
                                              docs_ds,
                                              test_ds,
                                              top_ks=top_ks,
                                              batch_size=eval_batch_size,
                                              show_progress_bar=True)

    if loss == 'cosine':
        train_loss = losses.CosineSimilarityLoss(model)
    elif loss == 'multiple_negatives_ranking':
        # A nice advantage of MultipleNegativesRankingLoss is that it only requires positive pairs
        # https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/quora_duplicate_questions
        train_loss = losses.MultipleNegativesRankingLoss(model)
    else:
        raise ValueError(f'Unsupported loss function: {loss}')

    train_dl = DataLoader(train_sds, shuffle=True, batch_size=train_batch_size)

    # Training
    model.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=train_epochs,  # try 1-4
        warmup_steps=100,
        evaluator=evaluator,
        evaluation_steps=evaluation_steps,  # increase to 5000 (full dataset => 20k steps)
        output_path=output_path,
        output_path_ignore_not_empty=True)

    logger.info('Training done')
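
Note: DocumentPairSentencesDataset is project-specific. With plain sentence-transformers, feeding positive pairs to MultipleNegativesRankingLoss would look roughly like this (the pair texts are hypothetical):

from sentence_transformers import InputExample

# Hypothetical positive pairs (title + abstract strings in the real pipeline)
positive_pairs = [
    ('Paper A title: abstract ...', 'Paper B title: abstract ...'),
]

# Each InputExample holds one positive pair; within a batch, the other pairs'
# second texts act as in-batch negatives for MultipleNegativesRankingLoss.
train_examples = [InputExample(texts=[a, b]) for a, b in positive_pairs]
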
Example #12
def build_vectors(st_output_path: str,
                  hf_dataset: str,
                  aspect: str,
                  fold: Union[int, str],
                  include_all_docs: bool = False,
                  override: bool = False):
    """

    :param override:
    :param include_all_docs: Generate also vectors for samples from training data
    :param st_output_path: Path to Sentence Transformer model
    :param hf_dataset: Huggingface dataset path or name
    :param aspect:
    :param fold:
    :return:
    """
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'

    out_fn = 'pwc_id2vec__all_docs.w2v.txt' if include_all_docs else 'pwc_id2vec.w2v.txt'
    out_fp = os.path.join(st_output_path, out_fn)

    if not os.path.exists(st_output_path):
        logger.error(
            f'Sentence Transformer directory does not exist: {st_output_path}')
        return

    if os.path.exists(out_fp) and not override:
        logger.error(
            f'Output path exists already and override is disabled: {out_fp}')
        return

    # Inference with the best model
    best_model = SentenceTransformer(st_output_path)

    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')
    test_sds = DocumentPairSentencesDataset(docs_ds, test_ds, best_model)

    if include_all_docs:
        # use all document ids
        input_paper_ids = set(docs_ds['paper_id'])
        logger.info(f'All documents in corpus: {len(input_paper_ids):,}')

    else:
        # generate vectors from unique test documents only
        input_paper_ids = set(test_ds['from_paper_id']).union(
            set(test_ds['to_paper_id']))

    with open(out_fp, 'w') as f:
        # header
        f.write(
            f'{len(input_paper_ids)} {best_model.get_sentence_embedding_dimension()}\n'
        )

        # body
        for paper_id in tqdm(input_paper_ids, desc='Inference'):
            vec = [
                str(v) for v in best_model.encode(test_sds.get_text_from_doc(
                    paper_id),
                                                  show_progress_bar=False)
            ]

            assert len(vec) == best_model.get_sentence_embedding_dimension()

            vec_str = ' '.join(vec)
            line = f'{paper_id} {vec_str}\n'
            f.write(line)
            # break
    logger.info(f'Encoded {len(input_paper_ids):,} documents into {out_fp}')
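
Note: the output follows the word2vec text format written above (a '<count> <dim>' header, then one '<paper_id> <vector>' line per document), so it loads directly with gensim. The path and paper id below are assumptions:

from gensim.models import KeyedVectors

doc_model = KeyedVectors.load_word2vec_format('./output/st_scibert/1/pwc_id2vec.w2v.txt')

# Nearest papers for a (hypothetical) paper id
print(doc_model.most_similar('some-paper-id', topn=5))
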