Example #1
def model_trainer(model_path, train_dataset, test_dataset):
    # model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels =4)
    model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels = 3, return_dict=True)


    training_args = TrainingArguments(
        output_dir=model_path,           # output directory
        num_train_epochs=1,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=16,   # batch size for evaluation
        # warmup_steps=0,                # number of warmup steps for the learning rate scheduler
        weight_decay=0.1,                # strength of weight decay
        logging_dir=os.path.join(model_path, 'logs'),  # directory for storing logs
        learning_rate=5e-5,
        logging_steps=1000,
        save_steps=2700,
        # save_model = os.path.join(model_path, 'final_model')
    )

    trainer = Trainer(
        model=model,                  # the instantiated 🤗 Transformers model to be trained
        args=training_args,           # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=test_dataset,    # evaluation dataset
        compute_metrics=compute_metrics,
    )
    return trainer, model
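A minimal usage sketch for the function above; the dataset objects and compute_metrics are assumed to be defined elsewhere in the surrounding script.

# Hypothetical usage; train_dataset/test_dataset and compute_metrics come from
# code that is not shown in this example.
trainer, model = model_trainer('./model_output', train_dataset, test_dataset)
trainer.train()               # fine-tune RoBERTa for token classification
metrics = trainer.evaluate()  # evaluate on test_dataset
print(metrics)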
Example #2
    def __init__(self, use_gpu=True, tokenizer=None):
        super().__init__()
        MODEL_NAME = 'iarfmoose/roberta-small-bulgarian-pos'
        if tokenizer:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
        self.model = RobertaForTokenClassification.from_pretrained(MODEL_NAME)
        self.model.to(self.device)

        self.tag_to_id = {
            'ADJ': 0,
            'ADP': 1,
            'PUNCT': 2,
            'ADV': 3,
            'AUX': 4,
            'SYM': 5,
            'INTJ': 6,
            'CCONJ': 7,
            'X': 8,
            'NOUN': 9,
            'DET': 10,
            'PROPN': 11,
            'NUM': 12,
            'VERB': 13,
            'PART': 14,
            'PRON': 15,
            'SCONJ': 16
        }

        self.id_to_tag = {self.tag_to_id[tag]: tag for tag in self.tag_to_id}
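A rough decoding sketch (not part of the original class) showing how id_to_tag can map the model's argmax predictions back to POS tags; the helper below is hypothetical and assumes the tokenizer and model initialized above.

import torch

def decode_pos_tags(text, tokenizer, model, id_to_tag):
    # Tokenize, run the token classifier, and map each prediction id to its tag.
    enc = tokenizer(text, return_tensors='pt').to(model.device)
    with torch.no_grad():
        logits = model(**enc).logits            # (1, seq_len, num_labels)
    pred_ids = logits.argmax(dim=-1)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(enc['input_ids'][0].tolist())
    return list(zip(tokens, (id_to_tag[i] for i in pred_ids)))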
Example #3
    def __init__(self, use_gpu=True, tokenizer=None):
        super().__init__()
        MODEL_NAME = 'iarfmoose/roberta-small-bulgarian-ner'
        if tokenizer:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

        self.model = RobertaForTokenClassification.from_pretrained(MODEL_NAME)
        self.model.to(self.device)

        self.tag_to_id = {
            'O': 0,
            'I-PRO': 1,
            'I-PER': 2,
            'I-ORG': 3,
            'I-LOC': 4,
            'I-EVT': 5,
            'B-PRO': 6,
            'B-PER': 7,
            'B-ORG': 8,
            'B-LOC': 9,
            'B-EVT': 10
        }

        self.id_to_tag = {self.tag_to_id[tag]: tag for tag in self.tag_to_id}
Example #4
 def __init__(self, tokenizer):
     super(UpperSentDetectorModel, self).__init__()
     self.tokenizer = tokenizer
     self.transformer = \
         RobertaForTokenClassification.from_pretrained('roberta-base')
     self.transformer.to(device)
     self.softmax = torch.nn.Softmax(dim=2)
     self.threshold = 0.9  # may be changed, if you need it
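A hedged sketch of how the softmax and threshold above might be used at inference time; the original predict/forward method is not shown in this example, so the method below is purely illustrative.

 # Illustrative only: apply the softmax over the label dimension and keep
 # predictions whose top probability exceeds self.threshold.
 def predict(self, input_ids, attention_mask=None):
     logits = self.transformer(input_ids, attention_mask=attention_mask).logits
     probs = self.softmax(logits)              # (batch, seq_len, num_labels)
     top_probs, top_ids = probs.max(dim=2)
     confident = top_probs >= self.threshold   # boolean mask of confident tokens
     return top_ids, confident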
Example #5
 def create_and_check_roberta_for_token_classification(
         self, config, input_ids, token_type_ids, input_mask,
         sequence_labels, token_labels, choice_labels):
     config.num_labels = self.num_labels
     model = RobertaForTokenClassification(config=config)
     model.eval()
     loss, logits = model(input_ids,
                          attention_mask=input_mask,
                          token_type_ids=token_type_ids,
                          labels=token_labels)
     result = {
         "loss": loss,
         "logits": logits,
     }
     self.parent.assertListEqual(
         list(result["logits"].size()),
         [self.batch_size, self.seq_length, self.num_labels])
     self.check_loss_output(result)
Example #6
def model_fn(model_dir):
    print("Loading model.")

    from transformers import RobertaForTokenClassification

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = RobertaForTokenClassification.from_pretrained('roberta-base',
                                                          num_labels=20)

    with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
        model.load_state_dict(torch.load(f))

    return model.to(device)
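model_fn follows the SageMaker PyTorch serving convention; a minimal, hypothetical predict_fn that could accompany it might look like the sketch below (the exact input format depends on the input_fn used, which is not shown).

def predict_fn(input_data, model):
    # Hypothetical companion to model_fn(); assumes input_data is a dict of
    # tensors produced by an input_fn that is not part of this example.
    import torch
    device = next(model.parameters()).device
    input_ids = input_data['input_ids'].to(device)
    attention_mask = input_data.get('attention_mask')
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
    return logits.argmax(dim=-1).cpu()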
Example #7
 def create_and_check_for_token_classification(
     self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     config.num_labels = self.num_labels
     model = RobertaForTokenClassification(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
     self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
Example #8
def get_model():

    # Load model
    config = RobertaConfig.from_pretrained(
        "BERTweet_base_transformers/config.json", num_labels=3)
    BERTweet = RobertaForTokenClassification.from_pretrained(
        "BERTweet_base_transformers/model.bin", config=config)

    optimizer = AdamW(
        BERTweet.parameters(),
        lr=1e-05,  # args.learning_rate - default is 5e-5, 
        eps=1e-8  # args.adam_epsilon  - default is 1e-8.
    )

    return BERTweet, optimizer
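A single training-step sketch using the model and optimizer returned by get_model(); the batch layout below is an assumption, since the real training loop is not part of this example.

import torch

def train_step(model, optimizer, batch):
    # Hypothetical step; `batch` is assumed to hold tokenized inputs plus
    # token-level labels of shape (batch_size, seq_len).
    model.train()
    outputs = model(input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['labels'])
    loss = outputs[0] if isinstance(outputs, tuple) else outputs.loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()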
Example #9
def sentenceLabel(sentence):
    f = open('./model_save/tag2idx.pckl', 'rb')
    tag2idx = pickle.load(f)
    f.close()
    device = torch.device("cpu")
    output_dir = './model_save/'
    idx2tag = dict((v, k) for k, v in tag2idx.items())
    tokenizer = RobertaTokenizer.from_pretrained(output_dir)
    model = RobertaForTokenClassification.from_pretrained(output_dir)
    model.to(device)

    # predict
    all_tokens = []
    all_entities = []

    tokenized_sentence = tokenizer.encode(sentence)
    input_ids = torch.tensor([tokenized_sentence]).to(device)

    predictions = []
    with torch.no_grad():
        output = model(input_ids)
        output = output[0].detach().cpu().numpy()
        predictions.extend([list(p) for p in np.argmax(output, axis=2)])

    tags_predictions = []
    for x in predictions[0]:
        tags_predictions.append(idx2tag[int(x)])

    tokens = []
    count = 0

    ### get tokens from ids (strip the leading 'Ġ' that marks a word boundary)
    for x in tokenizer.convert_ids_to_tokens(tokenized_sentence):
        if count == 1:
            tokens.append(x)
        elif x.startswith('Ġ'):
            tokens.append(x[1:])
        else:
            tokens.append(x)
        count += 1

    all_entities.append(tags_predictions[1:-1])
    all_tokens.append(tokens[1:-1])

    #print(all_tokens)
    #print(all_entities)

    return all_tokens, all_entities
Example #10
    def train(self):
        if self.has_started():
            last_checkpoint = self.get_latest_checkpoint()
            logger.info(f"Resuming training from: {last_checkpoint}")

            model = AutoModelForTokenClassification.from_pretrained(
                last_checkpoint, config=self.config)

        else:
            model = RobertaForTokenClassification.from_pretrained(
                "neurocode/IsRoBERTa", config=self.config)

        trainer = Trainer(model=model,
                          args=self.training_args,
                          train_dataset=self.dataset)

        trainer.train()

        trainer.save_model(f"{self.model_dir}")
        self.upload()
Example #11
def model_trainer(args, test_dataset):
    # model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels =4)
    model = RobertaForTokenClassification.from_pretrained(args.model_path,
                                                          num_labels=3,
                                                          return_dict=True)

    #/anfs/bigdisc/rmya2/faiss_data/results_table_to_cell2/checkpoint-1400/'
    training_args = TrainingArguments(
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=16,  # batch size for evaluation
        # warmup_steps=0,                # number of warmup steps for learning rate scheduler
        logging_dir='./logs',
        output_dir='./model_output')

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        eval_dataset=test_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,
    )
    return trainer, model
Example #12
import pickle

f = open('tags_vals.pckl', 'rb')
tags_vals = pickle.load(f)
f.close()
print(tags_vals)

f = open('tag2idx.pckl', 'rb')
tag2idx = pickle.load(f)
f.close()
idx2tag = dict((v, k) for k, v in tag2idx.items())

device = torch.device("cpu")
output_dir = './roberta_few_labels/'
tokenizer = RobertaTokenizer.from_pretrained(output_dir)
model = RobertaForTokenClassification.from_pretrained(output_dir)
model.to(device)

text = 'O, B-Diagnostic_procedure, I-Diagnostic_procedure,B-Biological_structure, I-Biological_structure, B-Sign_symptom, I-Sign_symptom, B-Detailed_description, I-Detailed_description, B-Lab_value, I-Lab_value, B-Date, I-Date, B-Age, I-Age, B-Clinical_event, I-Clinical_event, B-Date, I-Date, B-Disease_disorder, I-Disease_disorder, B-Nonbiological_location, I-Nonbiological_location, B-Severity, I-Severity, B-Sex, B-Therapeutic_procedure, I-Therapeutic_procedure'
tag_values = text.split(',')

print(tag_values)

query = "a woman aged 65 has a fever and a cough on march at a hospital"
tokenized_sentence = tokenizer.encode(query)
print(tokenized_sentence)
input_ids = torch.tensor([tokenized_sentence]).to(device)
print(input_ids)
predictions = []
with torch.no_grad():
    output = model(input_ids)
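The snippet above ends right after the forward pass; a hedged sketch of how the decoding could continue (mirroring the other examples on this page, and assuming numpy is imported as np) is:

    # Possible continuation (not in the original snippet): pick the argmax
    # label for each token and map it back to a tag name with idx2tag.
    logits = output[0].detach().cpu().numpy()

predicted_ids = np.argmax(logits, axis=2)[0]
predicted_tags = [idx2tag[int(i)] for i in predicted_ids]
tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)
print(list(zip(tokens, predicted_tags)))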
Example #13
test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

if args.pretrained_model == 'bert-base-cased':
    model = BertForTokenClassification.from_pretrained(
        args.pretrained_model,
        num_labels=len(tag2idx),
        output_attentions=False,
        output_hidden_states=False)
if args.pretrained_model == 'bert-base-cased-crf':
    model = bertCRF(num_classes=len(tag2idx), model_name=args.pretrained_model)
if args.pretrained_model == 'roberta-base':
    model = RobertaForTokenClassification.from_pretrained(
        args.pretrained_model,
        num_labels=len(tag2idx),
        output_attentions=False,
        output_hidden_states=False)
if args.pretrained_model == 'roberta-base-crf':
    model = bertCRF(num_classes=len(tag2idx), model_name=args.pretrained_model)
model.cuda()

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay_rate':
    0.01
}, {
    'params':
Example #14
    parser.add_argument(
        "--use_goodreads",
        help="calculate citation graph with external goodreads database",
        action='store_true')
    parser.add_argument(
        "--use_citation_model",
        help="whether to use the NER model to remove false positives from detected citations",
        action='store_true')

    args = parser.parse_args()

    metadata_to_use = 'goodreads' if args.use_goodreads else 'calibre'

    model = RobertaForTokenClassification.from_pretrained(
        'fine-tuned-model-ner-better-data-3')
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    model.share_memory()

    bkp = BookProcesserFactory(create_dataset=True,
                               verbose=False,
                               use_citation_model=args.use_citation_model,
                               metadata_to_use=metadata_to_use)

    book_processer = bkp.GetProcessFunction()

    if args.use_goodreads:
        # if it doesn't find the file, run the script first without the --use_goodreads argument
        G = pickle.load(open("pickled_graphs/small_graph.p", "rb"))
    else:
Example #15
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

model = RobertaForTokenClassification.from_pretrained("roberta-base",
                                                      num_labels=len(tag2idx))
model.cuda()

FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
Example #16
def train(no_cache: bool, dataset_path: str, data_config_name: str,
          training_args: TrainingArguments, tokenizer: RobertaTokenizerFast):
    print(f"tokenizer vocab size: {tokenizer.vocab_size}")

    print(f"\nLoading and tokenizing datasets found in {dataset_path}.")
    train_dataset, eval_dataset, test_dataset = load_dataset(
        'EMBO/sd-nlp',
        # './tokcl/loader.py',
        data_config_name,
        script_version="main",
        # data_dir=dataset_path,
        split=["train", "validation", "test"],
        # download_mode=GenerateMode.FORCE_REDOWNLOAD if no_cache else GenerateMode.REUSE_DATASET_IF_EXISTS,
        cache_dir=CACHE)
    print(f"\nTraining with {len(train_dataset)} examples.")
    print(f"Evaluating on {len(eval_dataset)} examples.")

    if data_config_name in ["NER", "ROLES"]:
        # use our fancy data collator that randomly masks some of the inputs to enforce context learning
        training_args.remove_unused_columns = False  # we need tag_mask
        data_collator = DataCollatorForMaskedTokenClassification(
            tokenizer=tokenizer,
            max_length=config.max_length,
            masking_probability=training_args.masking_probability,
            replacement_probability=training_args.replacement_probability,
            select_labels=training_args.select_labels)
    else:
        # normal token classification
        data_collator = DataCollatorForTokenClassification(
            tokenizer=tokenizer, max_length=config.max_length)

    num_labels = train_dataset.info.features['labels'].feature.num_classes
    label_list = train_dataset.info.features['labels'].feature.names
    print(f"\nTraining on {num_labels} features:")
    print(", ".join(label_list))

    compute_metrics = MetricsComputer(label_list=label_list)

    model = RobertaForTokenClassification.from_pretrained(
        LM_MODEL_PATH,
        num_labels=num_labels,
        max_position_embeddings=config.max_length + 2)

    print("\nTraining arguments:")
    print(training_args)

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      compute_metrics=compute_metrics,
                      callbacks=[ShowExample(tokenizer)])

    print(f"CUDA available: {torch.cuda.is_available()}")

    trainer.train()
    trainer.save_model(training_args.output_dir)

    print(f"Testing on {len(test_dataset)}.")
    pred: NamedTuple = trainer.predict(test_dataset, metric_key_prefix='test')
    print(f"{pred.metrics}")
Example #17
bert_out_address = 'models/'
if not os.path.exists(bert_out_address):
    os.makedirs(bert_out_address)
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(
    model, 'module') else model  # Only save the model itself
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
output_config_file = os.path.join(bert_out_address, "config.json")
# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(bert_out_address)

# %%
model = RobertaForTokenClassification.from_pretrained(bert_out_address,
                                                      num_labels=len(tag2idx))
# Set model to GPU
model.cuda()
if n_gpu > 1:
    model = torch.nn.DataParallel(model)
# Evaluation loop
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
Example #18
from argparse import ArgumentParser
from transformers import pipeline, RobertaForTokenClassification
from common import TOKCL_MODEL_PATH
from common.config import config

_EXAMPLE = """<s> F. Western blot of input and eluates of Upf1 domains purification in a Nmd4-HA strain. The band with the # might corresponds to a dimer of Upf1-CH,
 bands marked with a star correspond to residual signal with the anti-HA antibodies (Nmd4). Fragments in the eluate have a smaller size because the protein A pa
rt of the tag was removed by digestion with the TEV protease. G6PDH served as a loading control in the input samples </s>"""

if __name__ == "__main__":
    parser = ArgumentParser(description="Quick try of a NER model")
    parser.add_argument("text",
                        nargs="?",
                        default=_EXAMPLE,
                        help="Text to analyze.")
    parser.add_argument("-M",
                        "--model-path",
                        default=TOKCL_MODEL_PATH,
                        help="Path to the model.")

    args = parser.parse_args()
    text = args.text
    model_path = args.model_path
    model = RobertaForTokenClassification.from_pretrained(model_path)
    tokenizer = config.tokenizer
    pipe = pipeline('ner', model, tokenizer=tokenizer)
    res = pipe(text)
    for r in res:
        print(r['word'], r['entity'])
Example #19
from transformers import BertTokenizer, RobertaForTokenClassification, DataProcessor

if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained("RoBERTa_zh_Large_PyTorch")

bert_model = RobertaForTokenClassification.from_pretrained(
    "./RoBERTa_zh_Large_PyTorch/",  # use the 12-layer BERT model
    num_labels=5000,  # the multi-class task has len(tag2idx) output labels
    output_attentions=False,  # do not return attention weights
    output_hidden_states=False,  # do not return all hidden states
)


class enity_identifing(nn.Module):
    def __init__(self, vocab_size, embedding_dim, bert_model):
        super(enity_identifing, self).__init__()
        self.bert_model = bert_model.to(device)
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        initrange = 0.1
        self.embed.weight.data.uniform_(-initrange, initrange)
        self.rnn_type = "LSTM"
        self.nhid = 512
        self.rnn = nn.LSTM(5000, self.nhid, bidirectional=True,
                           dropout=0.5).to(device)
Example #20
def sentenceLabel(sentence):
    f = open('./model_save/tag2idx.pckl', 'rb')
    tag2idx = pickle.load(f)
    f.close()
    device = torch.device("cpu")

    output_dir = './model_save/'
    idx2tag = dict((v,k) for k, v in tag2idx.items())
    tokenizer = RobertaTokenizer.from_pretrained(output_dir)
    model = RobertaForTokenClassification.from_pretrained(output_dir)
    model.aux_logits = False
    model.to(device)

    all_tokens = []
    origin_tokens = sentence.split(' ')
    print(origin_tokens)
    all_entities = []
    entity_types = []
    tokenized_sentence = tokenizer.encode(sentence)
    input_ids = torch.tensor([tokenized_sentence]).to(device)

    predictions = []
    with torch.no_grad():
        output = model(input_ids)
        output = output[0].detach().cpu().numpy()
        predictions.extend([list(p) for p in np.argmax(output, axis=2)])


    tags_predictions = []
    for x in predictions[0]:
        tags_predictions.append(idx2tag[int(x)])

    tokens = []
    count = 0
    
    ### get tokens from ids
    for x in tokenizer.convert_ids_to_tokens(tokenized_sentence):
        if count == 1:
            tokens.append(x)
        elif x[0] == 'Ġ':
            tokens.append(x[1:])
        else:
            tokens.append(x)
        count+=1

    wordIndex = 0
    startIndex = 0
    entityIndex = 0
    entity_types.append(tags_predictions[1:-1])

    for x in tokens[1:-1]:
        entity = entity_types[0][entityIndex]
        entityIndex += 1
        if wordIndex == len(origin_tokens):
            break
        if x in origin_tokens[wordIndex].lower():
            if startIndex == 0:
                all_tokens.append(origin_tokens[wordIndex])
                if(len(entity) < 2):
                    all_entities.append(entity)
                else:
                    all_entities.append(entity[2:])
            startIndex = startIndex + len(x)
            if startIndex  >= len(origin_tokens[wordIndex]):
                wordIndex += 1
                startIndex = 0
            


    print(all_tokens)
    print(all_entities)

    return all_tokens, all_entities
Example #21
    parser.add_argument("--bsz", type=int)
    parser.add_argument("--path-to-parallel-data-json")
    parser.add_argument("--output-path")
    parser.add_argument("--update-freq", type=int, default=1)
    args = parser.parse_args()

    with open(args.path_to_parallel_data_json, "r") as f:
        text = [i.strip() for i in f.readlines()]
    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
    dataset = NERDataset(
        text, None, tokenizer, label_mapping=["KEEP", "MASK", "DELETE"]
    )
    config = RobertaConfig.from_pretrained("roberta-base")
    config.num_labels = 3
    model = RobertaForTokenClassification.from_pretrained(
        args.hf_dump, config=config
    ).cuda()
    print("Loaded roberta model")

    output_masks = open(os.path.join(args.output_path, "masks.txt"), "w")

    list_of_edits = []
    total_out_list = []
    for v, example in enumerate(DataLoader(dataset, batch_size=args.bsz)):
        output = model(**example)
        out_list = []
        for i, x in enumerate(output.logits.argmax(-1)):
            current_sent = []
            current_edits = []
            for j, tok in enumerate(tokenizer.tokenize(text[args.bsz * v + i])):
                val = x[j + 1]
Example #22
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device {}.".format(device))

    torch.manual_seed(args.seed)

    # ------ LOAD DATA ------
    train_loader = _get_data_loader(args.batch_size, args.data_dir,
                                    "train_roberta.csv", args.max_len)
    test_loader = _get_data_loader(args.batch_size, args.val_dir,
                                   "test_roberta.csv", args.max_len)

    # ------ CREATE ROBERTA MODEL ------
    model = RobertaForTokenClassification.from_pretrained(
        'roberta-base', num_labels=args.n_tags).to(device)

    # ------ SPECIFY OPTIMIZER ------
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [{
        "params": [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay":
        0.01
    }, {
        "params": [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
Example #23
    label_list = processor.get_labels(data)
    num_labels = len(label_list)
    if task in ['pos-tagging', 'ner']:
        num_labels += 1  # we add 1 because of the padding which is labelled 0
    logging.info("\tDone.")

    logging.info(
        "Fetching pre-trained RoBerta model: {} and Tokenizer: {} for the task: {}..."
        .format(parameters['pretrained_model'],
                parameters['pretrained_tokenizer'], parameters['task']))
    if task in ['pos-tagging', 'ner']:
        model = RobertaForTokenClassification.from_pretrained(
            parameters['pretrained_model'],
            num_labels=num_labels,  # The number of output labels for classification.
            output_attentions=parameters['output_attentions'],  # Whether the model returns attention weights.
            output_hidden_states=parameters['output_hidden_states'],  # Whether the model returns all hidden states.
        )
    elif task in ['sentence-classification']:
        model = RobertaForSequenceClassification.from_pretrained(
            parameters['pretrained_model'],
            num_labels=num_labels,  # The number of output labels for classification.
            output_attentions=parameters['output_attentions'],  # Whether the model returns attention weights.
            output_hidden_states=parameters['output_hidden_states'],  # Whether the model returns all hidden states.
        )
    tokenizer = RobertaTokenizer.from_pretrained(
Example #24
    accumulation_steps = args.update_freq

    with open(args.path_to_parallel_data_json, "r") as f:
        data = [json.loads(i.strip()) for i in f.readlines()]
    text = [i["source"] for i in data]
    labels = [i["label"] for i in data]

    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
    dataset = NERDataset(text,
                         labels,
                         tokenizer,
                         label_mapping=["KEEP", "MASK", "DELETE"])
    model = RobertaForTokenClassification.from_pretrained(
        f"{args.hf_dump}/pytorch_model.bin",
        config=f"{args.hf_dump}/config.json",
        num_labels=len(dataset.label_mapping),
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2,
    ).cuda()
    optimizer = optim.Adam(model.parameters(), lr=3e-6)
    scheduler = transformers.get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=5000,
        num_training_steps=40000,
    )

    os.makedirs(args.save_dir, exist_ok=False)

    for _ in range(args.epochs):
        for i, example in tqdm(
                enumerate(
Example #25
from transformers import RobertaTokenizer, RobertaForTokenClassification
import torch

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForTokenClassification.from_pretrained('roberta-base')

s = 'Hello, my dog is cute'
encoded_tokens = tokenizer.encode(s, add_special_tokens=True)
input_ids = torch.tensor(encoded_tokens).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1

outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
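On recent transformers releases the same outputs are usually read as attributes of the returned ModelOutput; an equivalent variant of the last two lines:

outputs = model(input_ids, labels=labels)
loss = outputs.loss      # scalar token-classification loss
scores = outputs.logits  # shape: (1, sequence_length, num_labels)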
Example #26
            # restore special_tokens_mask for potential carry-over to the next serial model
            entities["special_tokens_mask"] = special_tokens_mask
            output.append(entities)
        return output


if __name__ == "__main__":
    parser = ArgumentParser(description="SmartTagging of free text.")
    parser.add_argument(
        "text",
        nargs="?",
        default=
        "We studied mice with genetic ablation of the ERK1 gene in brain and muscle.",
        help="The text to tag.")
    args = parser.parse_args()
    text = args.text
    panel_model = RobertaForTokenClassification.from_pretrained("EMBO/sd-panels")
    ner_model = RobertaForTokenClassification.from_pretrained("EMBO/sd-ner")
    role_model = RobertaForTokenClassification.from_pretrained("EMBO/sd-roles")
    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
    tagger = Tagger(
        tokenizer,
        panel_model,  # segments figure legends into panel legends
        ner_model,    # tags biological entities
        role_model    # semantic roles of entities
    )
    tagged = tagger(text)
    print(tagged)