Example #1
    def predict(self, input_json) -> List[str]:

        sentence = input_json[0]['sentence']

        tokenized_sentence = config.TOKENIZER.encode(sentence)

        sentence = sentence.split()
        #print(sentence)
        #print(tokenized_sentence)

        test_dataset = EntityDataset(texts=[sentence],
                                     pos=[[0] * len(sentence)],
                                     tags=[[0] * len(sentence)])

        device = torch.device("cpu")

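        # Forward pass in inference mode; unsqueeze(0) adds a batch dimension to every tensor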
        with torch.no_grad():
            data = test_dataset[0]
            for k, v in data.items():
                data[k] = v.to(device).unsqueeze(0)
            tag, pos, _ = self.artifacts.ner(**data)

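            # Map the predicted tag indices back to label strings, truncated to the tokenized length;
            # enc_tag is assumed to be a fitted LabelEncoder available in this scope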
            tags = enc_tag.inverse_transform(
                tag.argmax(2).cpu().numpy().reshape(
                    -1))[:len(tokenized_sentence)]

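            # Group runs of consecutive "B-per" tokens and decode them back into person names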
            i = 0
            names = []
            while i < len(tags):
                if tags[i] == "B-per":
                    # Collect the indices of this run of "B-per" tokens,
                    # guarding against running past the end of the tag list
                    indices = []
                    while i < len(tags) and tags[i] == "B-per":
                        indices.append(i)
                        i += 1
                    tokenized_name = tokenized_sentence[
                        indices[0]:indices[-1] + 1]
                    name = config.TOKENIZER.decode(tokenized_name)
                    names.append(name)
                i += 1

            resp = ','.join(names)

            return [resp]
Example #2
def predict():
    meta_data = joblib.load(config.META_PATH)

    enc_pos = meta_data['enc_pos']
    enc_tag = meta_data['enc_tag']

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    sentence = '''
    Alice will go to China this Saturday! Her father works in WHO
     
    .
    '''
    tokenized_sentence = config.TOKENIZER.encode(sentence)
    sentence = sentence.split()
    print('\n')
    print('sentence', sentence)
    print('tokenized_sentence', tokenized_sentence)

    test_dataset = EntityDataset(words=[sentence],
                                 pos=[[0] * len(sentence)],
                                 tags=[[0] * len(sentence)],
                                 tokenizer=config.TOKENIZER,
                                 max_len=config.MAX_LEN)

    model = MODEL_DISPATCHER[config.BASE_MODEL](bert_path=config.BERT_PATH,
                                                num_tag=num_tag,
                                                num_pos=num_pos)

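    # Restore the trained weights and move the model to the target device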
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(DEVICE)

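    # Forward pass in inference mode; print the predicted NER tags and POS tags,
    # truncated to the length of the tokenized sentence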
    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(DEVICE).unsqueeze(0)
        tag, pos, _ = model(**data)

        print(
            enc_tag.inverse_transform(tag.argmax(2).cpu().numpy().reshape(-1))
            [:len(tokenized_sentence)])

        print(
            enc_pos.inverse_transform(pos.argmax(2).cpu().numpy().reshape(-1))
            [:len(tokenized_sentence)])
Example #3
def predict(sentence):
    tokenized_sentence = config.TOKENIZER.encode(sentence)

    sentence = sentence.split()
    #print(sentence)
    #print(tokenized_sentence)

    test_dataset = EntityDataset(texts=[sentence],
                                 pos=[[0] * len(sentence)],
                                 tags=[[0] * len(sentence)])

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, pos, _ = model(**data)

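        # Decode the predicted tag indices to label strings, truncated to the tokenized length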
        tags = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)]

        i = 0
        names = []
        while i < len(tags):
            if tags[i] == "B-per":
                # Collect the indices of this run of "B-per" tokens,
                # guarding against running past the end of the tag list
                indices = []
                while i < len(tags) and tags[i] == "B-per":
                    indices.append(i)
                    i += 1
                tokenized_name = tokenized_sentence[indices[0]:indices[-1] + 1]
                name = config.TOKENIZER.decode(tokenized_name)
                names.append(name)
            i += 1

        return names
Example #4
if __name__ == '__main__':

    data_path = './../input/ner_dataset.csv'
    text_list, pos_list, tag_list, enc_pos, enc_tag = process_data(
        config.TRAINING_FILE)

    metadata = joblib.load('metadata.bin')
    enc_pos = metadata['enc_pos']
    enc_tag = metadata['enc_tag']

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    sentence = """
    anoop lives in bangalore
    """

    tokenized_sentence = config.TOKENIZER.encode(sentence)
    sentence = sentence.split()
    print(sentence)
    print(tokenized_sentence)

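    # NOTE: test_text, valid_text and their pos/tag counterparts are assumed to come
    # from an earlier train/validation split that is not shown in this snippet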
    test_dataset = EntityDataset(test_text, train_pos, train_tag)
    test_dataloader = torch.utils.data.DataLoader(
        dataset=test_dataset, batch_size=config.TRAIN_BATCH_SIZE)

    valid_dataset = EntityDataset(valid_text, valid_pos, valid_tag)
    valid_dataloader = torch.utils.data.DataLoader(
        dataset=valid_dataset, batch_size=config.VALID_BATCH_SIZE)

    device = torch.device('cpu')
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.to(device)
Example #5
# meta_data is assumed to have been loaded earlier in this (truncated) snippet
enc_pos = meta_data["enc_pos"]
enc_tag = meta_data["enc_tag"]

num_pos = len(list(enc_pos.classes_))
num_tag = len(list(enc_tag.classes_))

sentence = """
daijianghai was born in jiangsu and his school is harvard
"""
tokenized_sentence = config.TOKENIZER.encode(sentence)

sentence = sentence.split()
print(sentence)
print(tokenized_sentence)

test_dataset = EntityDataset(texts=[sentence],
                             pos=[[0] * len(sentence)],
                             tags=[[0] * len(sentence)])

device = torch.device("cuda")
model = EntityModel(num_tag=num_tag, num_pos=num_pos)
model.load_state_dict(torch.load(config.MODEL_PATH))
model.to(device)

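# Forward pass in inference mode; the predicted NER tags are printed below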
with torch.no_grad():
    data = test_dataset[0]
    for k, v in data.items():
        data[k] = v.to(device).unsqueeze(0)
    tag, pos, _ = model(**data)

    print(
        enc_tag.inverse_transform(tag.argmax(2).cpu().numpy().reshape(-1))
        [:len(tokenized_sentence)])
Example #6
    meta_data = {"enc_pos": enc_pos, "enc_tag": enc_tag}

    joblib.dump(meta_data, "meta.bin")

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

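    # Hold out 10% of the data for evaluation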
    (train_sentences, test_sentences, train_pos, test_pos, train_tag,
     test_tag) = model_selection.train_test_split(sentences,
                                                  pos,
                                                  tag,
                                                  random_state=42,
                                                  test_size=0.1)

    train_dataset = EntityDataset(texts=train_sentences,
                                  pos=train_pos,
                                  tags=train_tag)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = EntityDataset(texts=test_sentences,
                                  pos=test_pos,
                                  tags=test_tag)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.to(device)
Example #7
def run():
    sentences, pos, tag, enc_pos, enc_tag = process_data(DF_PATH)

    meta_data = {
        'enc_pos': enc_pos,
        'enc_tag': enc_tag
    }

    joblib.dump(meta_data, META_PATH)

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    (
        train_sentences,
        valid_sentences,
        train_pos,
        valid_pos,
        train_tag,
        valid_tag,
    ) = model_selection.train_test_split(sentences, pos, tag, random_state=2020, test_size=0.1)

    tokenizer = transformers.BertTokenizer.from_pretrained(TOKENIZER_PATH, do_lower_case=True)

    train_dataset = EntityDataset(
        words=train_sentences,
        pos=train_pos,
        tags=train_tag,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = EntityDataset(
        words=valid_sentences,
        pos=valid_pos,
        tags=valid_tag,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALIDATION_BATCH_SIZE,
        num_workers=4
    )

    model = MODEL_DISPATCHER[BASE_MODEL](bert_path=BERT_PATH,
                                         num_tag=num_tag,
                                         num_pos=num_pos
                                         )
    model.to(DEVICE)

    # parameters_optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

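    # Apply weight decay to every parameter except biases and LayerNorm parameters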
    parameters_optimizer = [
        {
            'params': [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001,
        },
        {
            'params': [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0,
        }
    ]

    optimizer = AdamW(parameters_optimizer, lr=LR)
    num_training_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

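    # Keep only the checkpoint with the lowest validation loss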
    best_loss = np.inf
    for epoch in range(EPOCHS):
        train_loss = train_loop_fn(train_dataloader, model, optimizer, DEVICE, scheduler)
        valid_loss = eval_loop_fn(valid_dataloader, model, DEVICE)

        print(f'Train_loss = {train_loss}, Valid_loss = {valid_loss}')

        if valid_loss < best_loss:
            torch.save(model.state_dict(), MODEL_PATH)
            best_loss = valid_loss