Exemplo n.º 1
0
    def __init__(self, modelname="", dataset=None, use_saved_model=False):
        """Build the wrapped simpletransformers ``NERModel``.

        Args:
            modelname: Suffix of the output directory, ``outputs_<modelname>``.
            dataset: Mapping providing a ``'labels_list'`` entry. May be
                ``None``, in which case no label list is passed to the model.
            use_saved_model: When true, load the model from the output
                directory instead of starting from ``"bert-base-cased"``.
        """
        self.dataset = dataset

        output_dir = "outputs_{}".format(modelname)
        model_args = {
            'output_dir': output_dir,
            'overwrite_output_dir': True,
            'reprocess_input_data': True,

            'save_eval_checkpoints': False,
            'save_steps': -1,
            'save_model_every_epoch': False,

            'train_batch_size': 10,
            'num_train_epochs': 10,
            'max_seq_length': 256,
            'gradient_accumulation_steps': 8,
        }

        # ``dataset`` defaults to None; subscripting it unconditionally raised
        # TypeError. Only forward the label list when a dataset was supplied.
        if dataset is not None:
            model_args['labels_list'] = dataset['labels_list']

        model_name = output_dir if use_saved_model else "bert-base-cased"
        self.model = NERModel("bert", model_name, use_cuda=False, args=model_args)
Exemplo n.º 2
0
def predict(sentence):
    """Tag ``sentence`` with the ``NERMODEL1`` model and return the tags as JSON.

    Returns ``None`` when ``sentence`` is falsy. The model is constructed on
    every call.
    """
    if not sentence:
        return None

    # NOTE(review): the adjacent entries "I-" and "funda" look like a split
    # "I-funda" (which already appears earlier) -- confirm against the label
    # set the saved model was trained with before changing anything.
    tag_names = [
        "B-sector", "I-sector", "B-funda", "O",
        "operator", "threshold", "Join", "B-attr",
        "I-funda", "TPQty", "TPUnit", "Sortby", "B-eco",
        "I-eco", "B-index", "Capitalization", "I-",
        "funda", "B-security", 'I-security', 'Number',
        'Sector', 'TPMonth', 'TPYr', 'TPRef',
    ]
    options = {
        "save_eval_checkpoints": False,
        "save_steps": -1,
        "output_dir": "NERMODEL",
        'overwrite_output_dir': True,
        "save_model_every_epoch": False,
        'reprocess_input_data': True,
        "train_batch_size": 10,
        'num_train_epochs': 15,
        "max_seq_length": 64,
    }
    tagger = NERModel('bert', 'NERMODEL1', labels=tag_names, args=options,
                      use_cuda=False)

    tagged, _raw = tagger.predict([sentence])
    return json.dumps(tagged[0])
Exemplo n.º 3
0
def training():
    """Run one wandb trial: train a roberta-base NER model, then close the run.

    Reads ``model_args``, ``train_df`` and ``trial_df`` from the enclosing
    module scope.
    """
    wandb.init()

    # The original snippet also kept a commented-out "distilbert" /
    # "distilbert-base-cased" variant of this constructor call.
    sweep_model = NERModel(
        "roberta",
        "roberta-base",
        use_cuda=True,
        args=model_args,
        sweep_config=wandb.config,
    )
    sweep_model.train_model(train_df, eval_data=trial_df)

    wandb.join()
Exemplo n.º 4
0
 def roberta_ner_train(self):
     """Train a roberta-base NER model on ``self.roberta_ner_train_data``.

     The trained model is stored on ``self.roberta_ner_model``.
     """
     frame = pd.DataFrame(
         self.roberta_ner_train_data,
         columns=['sentence_id', 'words', 'labels'],
     )
     training_args = {
         "num_train_epochs": 3,
         "overwrite_output_dir": True,
         "output_dir": "ner_outputs/",
     }
     self.roberta_ner_model = NERModel(
         "roberta", "roberta-base", labels=self.labels, args=training_args)
     self.roberta_ner_model.train_model(frame)
Exemplo n.º 5
0
    def __init__(self, ner_path: str, deps_path: str):
        """Create/Load a new DependencyModel

        Args:
            ner_path (str): directory of NER model. (if not exists, create a new model)
            deps_path (str): directory of classification model.
        """
        self.ner_path = ner_path
        try:
            self.ner_model = NERModel("distilbert", ner_path, use_cuda=False)
        except Exception:
            # Loading from ``ner_path`` failed: fall back to a fresh
            # distilbert-base-uncased model. ``except Exception`` (rather than
            # a bare ``except:``) lets KeyboardInterrupt/SystemExit propagate.
            self.ner_model = NERModel("distilbert", "distilbert-base-uncased",
                labels=custom_labels, use_cuda=False)
        self.deps_model = ClassifyModel(deps_path)
Exemplo n.º 6
0
def create_model(model_class, model_type, model_name, num_labels, weight, args,
                 use_cuda, cuda_device, **kwargs):
    """Instantiate the model wrapper named by ``model_class``.

    ``num_labels`` and ``weight`` are only forwarded to the two classification
    wrappers. For ``"T5Model"`` the incoming ``args`` is replaced by a fresh
    ``T5Args`` with multiprocessed decoding disabled.

    Raises:
        ValueError: if ``model_class`` is not one of the handled names.
    """
    if model_class == "ClassificationModel":
        return ClassificationModel(
            model_type, model_name, num_labels, weight, args, use_cuda,
            cuda_device, **kwargs)

    if model_class == "MultiLabelClassificationModel":
        return MultiLabelClassificationModel(
            model_type, model_name, num_labels, weight, args, use_cuda,
            cuda_device, **kwargs)

    if model_class == "QuestionAnsweringModel":
        return QuestionAnsweringModel(
            model_type, model_name, args, use_cuda, cuda_device, **kwargs)

    if model_class == "NERModel":
        return NERModel(
            model_type, model_name, args=args, use_cuda=use_cuda,
            cuda_device=cuda_device, **kwargs)

    if model_class == "T5Model":
        t5_args = T5Args()
        t5_args.use_multiprocessed_decoding = False
        return T5Model(
            model_type, model_name, args=t5_args, use_cuda=use_cuda,
            cuda_device=cuda_device, **kwargs)

    message = "{} is either invalid or not yet implemented.".format(model_class)
    raise ValueError(message)
def load_models():
    """Load the Telugu POS and NER models plus a blank Telugu spaCy pipeline.

    Returns:
        Tuple ``(pos_model, ner_model, spacy_telugu_model)``.
    """
    no_multiprocessing = {"use_multiprocessing": False}

    pos_tags = [
        'QC', 'JJ', 'NN', 'QF', 'RDP', 'O', 'NNO', 'PRP',
        'RP', 'VM', 'WQ', 'PSP', 'UT', 'CC', 'INTF', 'SYMP',
        'NNP', 'INJ', 'SYM', 'CL', 'QO', 'DEM', 'RB', 'NST',
    ]
    pos_tagger = NERModel(
        'bert',
        'kuppuluri/telugu_bertu_pos',
        args={"use_multiprocessing": False},
        labels=pos_tags,
        use_cuda=False,
    )

    entity_tags = [
        'B-PERSON', 'I-ORG', 'B-ORG', 'I-LOC', 'B-MISC',
        'I-MISC', 'I-PERSON', 'B-LOC', 'O',
    ]
    entity_tagger = NERModel(
        'bert',
        'kuppuluri/telugu_bertu_ner',
        labels=entity_tags,
        use_cuda=False,
        args=no_multiprocessing,
    )

    telugu_nlp = spacy.blank("te")

    return pos_tagger, entity_tagger, telugu_nlp
Exemplo n.º 8
0
def predict():
    """Tag the uploaded CSV's ``sentence`` column and redirect to the dashboard.

    On a POST request the uploaded ``file`` is read as CSV, every entry of its
    ``sentence`` column is tagged with the aspect model, and the tags are
    printed. Any other request method is redirected straight to the dashboard
    without running the model.
    """
    dashboard_url = 'https://dub01.online.tableau.com/#/site/absa/views/absa_project/MAIN?:iid=20&:original_view=y'

    if request.method != 'POST':
        # Without an upload there is nothing to tag. Falling through, as the
        # code used to, hit an unbound ``test_sents`` and raised NameError.
        return redirect(dashboard_url)

    test_sents = pd.read_csv(request.files.get('file'))
    aspect_model = NERModel("bert", "/Users/mutaz/Desktop/Mutaz Thesis bert/hotel_reviews", use_cuda=False,
                            labels=["B-A", "I-A", "O"],
                            args={"use_cuda": False,
                                  "save_eval_checkpoints": False,
                                  "save_steps": -1,
                                  "output_dir": "MODEL",
                                  'overwrite_output_dir': True,
                                  "save_model_every_epoch": False,
                                  })

    predictions, raw_outputs = aspect_model.predict(test_sents["sentence"])
    # Each prediction is a list of {word: tag} dicts; keep only the tags.
    pred = [[tag for word in sentence for tag in word.values()]
            for sentence in predictions]
    print(pred)
    return redirect(dashboard_url)
Exemplo n.º 9
0
 def __init__(self, wrds_per_pred=250, device="cuda:1"):
     """Load the punctuation-restoration model and warm it up.

     Args:
         wrds_per_pred: Number of words per slice sent to the model.
         device: Device the model is assigned to. Defaults to ``"cuda:1"``,
             the value that used to be hard-coded here.
     """
     self.wrds_per_pred = wrds_per_pred
     self.overlap_wrds = 30
     self.valid_labels = [
         'OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U',
         "'O", '-O', '?O', '?U'
     ]
     self.model = NERModel("bert",
                           "felflare/bert-restore-punctuation",
                           labels=self.valid_labels,
                           args={
                               "silent": True,
                               "max_seq_length": 512
                           })
     # HACK (kept from the original): use_cuda isn't working, and assigning
     # the device directly seems to load the model correctly onto the GPU.
     self.model.device = torch.device(device)
     # Dummy call so the model is loaded onto the device up front.
     self.punctuate("hello how are you")
Exemplo n.º 10
0
def main(trainingdataset, testdataset, outputdir):
    """Train a multilingual BERT NER model, evaluating during training.

    Args:
        trainingdataset: Training data handed to ``train_model``.
        testdataset: Evaluation data, passed as ``eval_df``.
        outputdir: Value used for the ``output_dir`` model argument.
    """
    print()
    torch.cuda.empty_cache()

    entity_labels = [
        "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG",
        "B-LOC", "I-LOC",
    ]

    # Notes carried over from the original author (03.04.2020):
    # - evaluate_during_training needs the eval data to be passed to
    #   train_model();
    # - classification_report adds per-label results to eval_results.txt;
    # - save_eval_checkpoints / save_model_every_epoch are disabled so the
    #   model is not saved at every checkpoint or epoch.
    run_args = {
        'save_model_every_epoch': False,
        'save_steps': 10000,
        'output_dir': outputdir,
        'evaluate_during_training': True,
        'overwrite_output_dir': True,
        'classification_report': True,
        'save_eval_checkpoints': False,
    }

    ner = NERModel(
        'bert',
        'bert-base-multilingual-cased',
        labels=entity_labels,
        args=run_args,
    )

    # The eval data must accompany training because evaluation is enabled.
    ner.train_model(trainingdataset, eval_df=testdataset)
def test_named_entity_recognition():
    """Smoke-test NERModel: train, evaluate and predict on tiny toy frames."""
    column_names = ["sentence_id", "words", "labels"]

    # Toy training rows: (sentence_id, word, label).
    train_rows = [
        [0, "Simple", "B-MISC"],
        [0, "Transformers", "I-MISC"],
        [0, "started", "O"],
        [1, "with", "O"],
        [0, "text", "O"],
        [0, "classification", "B-MISC"],
        [1, "Simple", "B-MISC"],
        [1, "Transformers", "I-MISC"],
        [1, "can", "O"],
        [1, "now", "O"],
        [1, "perform", "O"],
        [1, "NER", "B-MISC"],
    ]
    train_df = pd.DataFrame(train_rows, columns=column_names)

    # Toy evaluation rows in the same layout.
    eval_rows = [
        [0, "Simple", "B-MISC"],
        [0, "Transformers", "I-MISC"],
        [0, "was", "O"],
        [1, "built", "O"],
        [1, "for", "O"],
        [0, "text", "O"],
        [0, "classification", "B-MISC"],
        [1, "Simple", "B-MISC"],
        [1, "Transformers", "I-MISC"],
        [1, "then", "O"],
        [1, "expanded", "O"],
        [1, "to", "O"],
        [1, "perform", "O"],
        [1, "NER", "B-MISC"],
    ]
    eval_df = pd.DataFrame(eval_rows, columns=column_names)

    ner = NERModel(
        "bert",
        "bert-base-cased",
        args={
            "no_save": True,
            "overwrite_output_dir": True,
            "reprocess_input_data": False,
        },
        use_cuda=False,
    )

    ner.train_model(train_df)

    result, model_outputs, predictions = ner.eval_model(eval_df)

    # Prediction on an arbitrary text string.
    predictions, raw_outputs = ner.predict(["Some arbitary sentence"])
Exemplo n.º 12
0
def main():
    """Streamlit page: tag user-entered text with the saved roberta model."""
    st.title("NER - Simple Transformers")

    has_gpu = torch.cuda.is_available()
    tag_set = [
        'NN', 'NNP', 'IN', 'DT', 'JJ', 'NNS', 'VBD', 'VBN', 'VBZ', 'CD', 'VB',
        'CC', 'TO', 'RB', 'VBG', 'VBP', 'PRP', 'POS', 'PRP$', 'MD', 'WDT', 'JJS',
        'JJR', 'WP', 'NNPS', 'RP', 'WRB', 'RBR', 'EX', 'RBS', 'PDT', 'WP$', 'UH', 'FW',
    ]
    tagger = NERModel("roberta", "best_model", use_cuda=has_gpu, labels=tag_set)

    st.sidebar.subheader("Parameters")
    tagger.args.max_seq_length = st.sidebar.slider(
        "Max Seq Length",
        min_value=1,
        max_value=92,
        value=tagger.args.max_seq_length,
    )
    tagger.args.use_multiprocessing = False

    st.subheader("Digite o texto: ")
    # Local name chosen so the builtin ``input`` is not shadowed.
    user_text = st.text_area("")

    if st.button("Analisar"):
        first_prediction = get_prediction(tagger, user_text)[0]
        st.write(first_prediction)
Exemplo n.º 13
0
from simpletransformers.ner import NERModel, NERArgs

# Notebook-style cell sequence: train, evaluate and query a BERT NER model.
# ``data``, ``train_data``, ``test_data`` and ``accuracy_score`` are not
# defined in this excerpt and must come from earlier cells.

# Distinct values of the "labels" column become the model's label set.
label=data["labels"].unique().tolist()

# Bare expression: shown as cell output when run in a notebook.
label

# Training configuration; fields not set here keep the NERArgs defaults.
args=NERArgs()
args.num_train_epochs=1
args.learning_rate=1e-4
args.overwrite_output_dir=True
args.train_batch_size=32
args.eval_batch_size=32



model=NERModel('bert', 'bert-base-cased', labels=label, args=args)

# NOTE(review): ``acc=accuracy_score`` is presumably picked up as an extra
# evaluation metric -- confirm against the installed simpletransformers.
model.train_model(train_data, eval_data=test_data, acc=accuracy_score)

result, model_outputs, preds_list=model.eval_model(test_data)

result

# Tag a single ad-hoc sentence.
prediction, model_output=model.predict(["This is Nishi"])

prediction

!pip install bert-extractive-summarizer

!pip install wikipedia
Exemplo n.º 14
0
        "do_lower_case": True,
        "silent": True,
        "reprocess_input_data": True
    }
    labels = [
        'B-ORGANIZATION', 'I-ORGANIZATION', 'B-SUBJECT', 'I-SUBJECT',
        'B-OBJECT', 'I-OBJECT', 'B-CODEX', 'I-CODEX', 'B-NUMBER', 'I-NUMBER',
        'B-LAWFACE', 'I-LAWFACE', 'B-PHYSFACE', 'I-PHYSFACE', 'B-REGISTRY',
        'I-REGISTRY', 'B-SIZES', 'I-SIZES', 'B-DATE', 'I-DATE',
        'B-SPARESUBJECT', 'I-SPARESUBJECT', 'B-CADASTRE', 'I-CADASTRE',
        'B-ADDRESS', 'I-ADDRESS', 'O'
    ]

    model = NERModel(model_type='distilbert',
                     model_name='outputs/',
                     args=args,
                     use_cuda=False,
                     labels=labels)

    async def main(loop=None):
        """Start the HTTP app exposing the NER controller at POST /api/ner.

        Host and port are read from the ``APP_HOST`` / ``APP_PORT`` entries of
        ``environ``. ``loop`` is accepted but not used here.
        """
        controller = NERController(model)

        app = Application(middlewares=[middleware.logging], logger=logger)
        app.router.add_post('/api/ner', controller.post)

        app_runner = AppRunner(app)
        await app_runner.setup()

        host = environ['APP_HOST']
        port = int(environ['APP_PORT'])
        await TCPSite(app_runner, host, port).start()
Exemplo n.º 15
0
class NerModel:
    """Thin wrapper around a simpletransformers BERT ``NERModel``.

    ``dataset`` is a mapping; this class reads its ``'labels_list'``,
    ``'train'`` and ``'val'`` entries.
    """

    def __init__(self, modelname="", dataset=None, use_saved_model=False):
        """Build the wrapped model.

        Args:
            modelname: Suffix of the output directory, ``outputs_<modelname>``.
            dataset: Mapping providing ``'labels_list'``. May be ``None``, in
                which case no label list is passed to the model.
            use_saved_model: When true, load the model from the output
                directory instead of starting from ``"bert-base-cased"``.
        """
        self.dataset = dataset

        output_dir = "outputs_{}".format(modelname)
        model_args = {
            'output_dir': output_dir,
            'overwrite_output_dir': True,
            'reprocess_input_data': True,

            'save_eval_checkpoints': False,
            'save_steps': -1,
            'save_model_every_epoch': False,

            'train_batch_size': 10,
            'num_train_epochs': 10,
            'max_seq_length': 256,
            'gradient_accumulation_steps': 8,
        }

        # ``dataset`` defaults to None; subscripting it unconditionally raised
        # TypeError. Only forward the label list when a dataset was supplied.
        if dataset is not None:
            model_args['labels_list'] = dataset['labels_list']

        model_name = output_dir if use_saved_model else "bert-base-cased"
        self.model = NERModel("bert", model_name, use_cuda=False, args=model_args)

    def train(self):
        """Train on ``dataset['train']``.

        Raises:
            ValueError: if no (or an empty) dataset was supplied.
        """
        if not self.dataset:
            raise ValueError("dataset is None")
        self.model.train_model(self.dataset['train'])

    def eval(self):
        """Evaluate on ``dataset['val']`` and print the result.

        Raises:
            ValueError: if no (or an empty) dataset was supplied.
        """
        if not self.dataset:
            raise ValueError("dataset is None")
        result, model_outputs, predictions = self.model.eval_model(self.dataset['val'])
        print("Evaluation result:", result)

    def simple_test(self):
        """Predict two fixed sentences and print per-word label probabilities."""
        sentences = ["Some arbitary sentence", "Simple Transformers sentence"]
        predictions, raw_outputs = self.model.predict(sentences)
        print(predictions)

        # More detailed predictions: one line per word.
        for sentence, preds, outs in zip(sentences, predictions, raw_outputs):
            print("\n___________________________")
            print("Sentence: ", sentence)
            for pred, out in zip(preds, outs):
                key = list(pred.keys())[0]
                # Average the raw outputs stored under this word, then
                # softmax. A separate name is used so the loop variable
                # ``preds`` is not rebound mid-iteration.
                probs = list(softmax(np.mean(out[key], axis=0)))
                print(key, pred[key], probs[np.argmax(probs)], probs)

    def predict(self, sentences):
        """Return the model's predictions for ``sentences`` (raw outputs dropped)."""
        predictions, raw_outputs = self.model.predict(sentences)
        return predictions
    "use_early_stopping": True,
    "early_stopping_patience": 4,
    "evaluate_during_training": True,
    "reprocess_input_data": False,
    "use_cached_eval_features": True,
    "fp16": False,
    "num_train_epochs": 10,
    "evaluate_during_training_steps": 10000,
    "train_batch_size": 32,
    'cross_entropy_ignore_index': 0,
    'classification_report': True
}

# ELECTRA token classifier loaded from the locally trained discriminator.
model = NERModel("electra",
                 "discriminator_trained/discriminator_model",
                 args=train_args,
                 labels=labels,
                 use_cuda=True,
                 crf=True)

# Train the model
model.train_model(train_file, eval_data=eval_file)

# Evaluate the model on the test file. This call previously passed
# ``train_file``, which left ``test_file`` unused and reported scores on the
# training data.
test_file = 'data_set/test.ner.small.txt'
result, model_outputs, predictions = model.eval_model(test_file)

print(result)

# from transformers import ElectraTokenizer, ElectraForPreTraining
# model_name = r'D:\git_learn\simpletransformers\examples\language_model\discriminator_trained\discriminator_model'
# model = ElectraForPreTraining.from_pretrained(model_name)
Exemplo n.º 17
0
transformers_logger.setLevel(logging.WARNING)

# Toy train/eval frames for demonstration; rows are (sentence_id, word, label).
train_data = [
    [0, 'Simple', 'B-MISC'],
    [0, 'Transformers', 'I-MISC'],
    [0, 'started', 'O'],
    [1, 'with', 'O'],
    [0, 'text', 'O'],
    [0, 'classification', 'B-MISC'],
    [1, 'Simple', 'B-MISC'],
    [1, 'Transformers', 'I-MISC'],
    [1, 'can', 'O'],
    [1, 'now', 'O'],
    [1, 'perform', 'O'],
    [1, 'NER', 'B-MISC'],
]
train_df = pd.DataFrame(
    train_data,
    columns=['sentence_id', 'words', 'labels'],
)

eval_data = [
    [0, 'Simple', 'B-MISC'],
    [0, 'Transformers', 'I-MISC'],
    [0, 'was', 'O'],
    [1, 'built', 'O'],
    [1, 'for', 'O'],
    [0, 'text', 'O'],
    [0, 'classification', 'B-MISC'],
    [1, 'Simple', 'B-MISC'],
    [1, 'Transformers', 'I-MISC'],
    [1, 'then', 'O'],
    [1, 'expanded', 'O'],
    [1, 'to', 'O'],
    [1, 'perform', 'O'],
    [1, 'NER', 'B-MISC'],
]
eval_df = pd.DataFrame(
    eval_data,
    columns=['sentence_id', 'words', 'labels'],
)

# Build the model; output is overwritten and features are re-processed.
model = NERModel(
    'bert',
    'bert-base-cased',
    args={'overwrite_output_dir': True, 'reprocess_input_data': True},
)

# Fit on the toy training frame.
model.train_model(train_df)

# Score on the toy evaluation frame.
result, model_outputs, predictions = model.eval_model(eval_df)
print(result, predictions)

# Tag one free-text sentence and show the result.
predictions, raw_outputs = model.predict(
    ["Simple Transformers started with text classification"]
)

print(predictions)
# print(raw_outputs)
Exemplo n.º 18
0
class RestorePuncts:
    """Restore punctuation and capitalisation with a token-classification model.

    Every label has two characters. The first is the punctuation mark to
    append to the word ('O' for none); a trailing 'U' means the word is
    capitalised.
    """

    def __init__(self, wrds_per_pred=250, device="cuda:1"):
        """Load the model and warm it up.

        Args:
            wrds_per_pred: Number of words per slice sent to the model.
            device: Device the model is assigned to. Defaults to ``"cuda:1"``,
                the value that used to be hard-coded here.
        """
        self.wrds_per_pred = wrds_per_pred
        self.overlap_wrds = 30
        self.valid_labels = [
            'OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U',
            "'O", '-O', '?O', '?U'
        ]
        self.model = NERModel("bert",
                              "felflare/bert-restore-punctuation",
                              labels=self.valid_labels,
                              args={
                                  "silent": True,
                                  "max_seq_length": 512
                              })
        # HACK (kept from the original): use_cuda isn't working, and assigning
        # the device directly seems to load the model correctly onto the GPU.
        self.model.device = torch.device(device)
        # Dummy call so the model is loaded onto the device up front.
        self.punctuate("hello how are you")

    def punctuate(self, text: str, batch_size: int = 32, lang: str = ''):
        """
        Performs punctuation restoration on arbitrarily large text.

        Args:
            - text (str): Text to punctuate, can be few words to as large as you want.
            - batch_size (int): Number of text slices passed to the model per call.
            - lang (str): Explicit language of input text. Currently unused:
              the language-detection check it belonged to is disabled.

        Returns:
            The punctuated text as a single string.
        """
        # Split large text into BERT-digestible, overlapping slices.
        splits = self.split_on_toks(text, self.wrds_per_pred,
                                    self.overlap_wrds)
        texts = [piece["text"] for piece in splits]

        # Predict slice by slice in batches; the raw outputs are discarded.
        preds_lst = []
        for start in range(0, len(texts), batch_size):
            batch_preds, _ = self.model.predict(texts[start:start + batch_size])
            preds_lst.extend(batch_preds)

        # Stitch slice predictions back together, then render the text.
        combined_preds = self.combine_results(text, preds_lst)
        return self.punctuate_texts(combined_preds)

    def predict(self, input_slice):
        """
        Passes the unpunctuated text to the model for punctuation.

        Returns the ``(predictions, raw_outputs)`` pair produced by the model
        for the single-element batch ``[input_slice]``.
        """
        predictions, raw_outputs = self.model.predict([input_slice])
        return predictions, raw_outputs

    @staticmethod
    def split_on_toks(text, length, overlap):
        """
        Splits text into predefined slices of overlapping text with indexes (offsets)
        that tie-back to original text.
        This is done to bypass 512 token limit on transformer models by sequentially
        feeding chunks of < 512 toks.
        Example output:
        [{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}]

        Raises:
            ValueError: if ``length`` is not positive (the slicing loop would
                otherwise never advance).
        """
        if length <= 0:
            raise ValueError("length must be a positive number of words")

        wrds = text.replace('\n', ' ').split(" ")
        resp = []
        lst_chunk_idx = 0
        i = 0

        while True:
            # words in the chunk and the overlapping portion
            wrds_len = wrds[(length * i):(length * (i + 1))]
            wrds_ovlp = wrds[(length * (i + 1)):((length * (i + 1)) + overlap)]
            wrds_split = wrds_len + wrds_ovlp

            # Break loop if no more words
            if not wrds_split:
                break

            wrds_str = " ".join(wrds_split)
            nxt_chunk_start_idx = len(" ".join(wrds_len))
            lst_char_idx = len(wrds_str)

            resp.append({
                "text": wrds_str,
                "start_idx": lst_chunk_idx,
                "end_idx": lst_char_idx + lst_chunk_idx,
            })
            # +1 accounts for the space separating this chunk from the next.
            lst_chunk_idx += nxt_chunk_start_idx + 1
            i += 1
        logging.info(f"Sliced transcript into {len(resp)} slices.")
        return resp

    @staticmethod
    def combine_results(full_text: str, text_slices):
        """
        Given a full text and predictions of each slice combines predictions into a single text again.
        Performs validation whether text was combined correctly.

        Args:
            full_text: The original, unpunctuated text.
            text_slices: One prediction list per slice; each prediction is a
                single-entry ``{word: label}`` dict.

        Returns:
            List of ``(word, label)`` tuples covering ``full_text`` in order.

        Raises:
            AssertionError: if the combined words do not match ``full_text``.
        """
        split_full_text = [
            wrd for wrd in full_text.replace('\n', ' ').split(" ") if wrd
        ]
        split_full_text_len = len(split_full_text)
        output_text = []
        index = 0

        # Drop a very short trailing slice (<= 3 predictions) when there are
        # other slices. The length test is ordered so that an empty
        # ``text_slices`` no longer raises IndexError.
        if len(text_slices) > 1 and len(text_slices[-1]) <= 3:
            text_slices = text_slices[:-1]

        last_slice = text_slices[-1] if text_slices else None
        for _slice in text_slices:
            slice_wrds = len(_slice)
            # Hoisted: the comparison is constant for the whole slice.
            is_last = _slice == last_slice
            for ix, wrd in enumerate(_slice):
                if index == split_full_text_len:
                    break

                pred_item_tuple = list(wrd.items())[0]
                if split_full_text[index] != str(pred_item_tuple[0]):
                    continue
                # For non-final slices the last two positions are skipped;
                # the final slice is consumed completely.
                if is_last or ix <= slice_wrds - 3:
                    index += 1
                    output_text.append(pred_item_tuple)
        assert [i[0] for i in output_text] == split_full_text, \
            "combined predictions do not match the input text"
        return output_text

    @staticmethod
    def punctuate_texts(full_pred: list):
        """
        Given a list of Predictions from the model, applies the predictions to text,
        thus punctuating it.

        Returns an empty string for an empty prediction list.
        """
        punct_wrds = []
        for word, label in full_pred:
            punct_wrd = word.capitalize() if label[-1] == "U" else word
            if label[0] != "O":
                punct_wrd += label[0]
            punct_wrds.append(punct_wrd)

        punct_resp = " ".join(punct_wrds).strip()
        # Append a trailing period if the text ends on a letter or digit; the
        # emptiness check avoids an IndexError when there is no text.
        if punct_resp and punct_resp[-1].isalnum():
            punct_resp += "."
        return punct_resp
    [0, "built", "O"],
    [0, "for", "O"],
    [0, "text", "O"],
    [0, "classification", "B-MISC"],
    [1, "Simple", "B-MISC"],
    [1, "Transformers", "I-MISC"],
    [1, "then", "O"],
    [1, "expanded", "O"],
    [1, "to", "O"],
    [1, "perform", "O"],
    [1, "NER", "B-MISC"],
]
eval_df = pd.DataFrame(
    eval_data,
    columns=["sentence_id", "words", "labels"],
)

# First pass: a bert-base-cased model used for prediction without any
# training or evaluation step.
model = NERModel(
    "bert",
    "bert-base-cased",
    args={"overwrite_output_dir": True, "reprocess_input_data": True},
)

# Tag two arbitrary sentences and show the result.
sentences = [
    "Some arbitary sentence",
    "Simple Transformers sentence",
]
predictions, raw_outputs = model.predict(sentences)

print(predictions)

# Second pass: a CPU-only model with explicit training settings and no
# intermediate checkpoints.
model_args = {
    'overwrite_output_dir': True,
    'reprocess_input_data': True,

    'save_eval_checkpoints': False,
    'save_steps': -1,
    'save_model_every_epoch': False,

    'train_batch_size': 10,
    'num_train_epochs': 100,
    'max_seq_length': 256,
    'gradient_accumulation_steps': 8,
}

model = NERModel(
    "bert",
    "bert-base-cased",
    use_cuda=False,
    args=model_args,
)

# The evaluation frame doubles as the training frame here.
train_df = eval_df

model.train_model(train_df)

result, model_outputs, predictions = model.eval_model(eval_df)

# Tag the same two arbitrary sentences with the trained model.
sentences = [
    "Some arbitary sentence",
    "Simple Transformers sentence",
]
predictions, raw_outputs = model.predict(sentences)
Exemplo n.º 21
0
if __name__ == '__main__':
    # Load the raw train/test files and normalise the test index.
    train_df = txt_to_df("/Users/mutaz/Desktop/Mutaz Thesis bert/Data/raw/train.txt")
    test_df = txt_to_df("/Users/mutaz/Desktop/Mutaz Thesis bert/Data/raw/test.txt")
    test_df.reset_index(drop=True, inplace=True)

    # Quieten the transformers logger while keeping INFO for everything else.
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    # Re-label the columns with the names used below.
    train_df = pd.DataFrame(
        train_df.values, columns=['sentence_id', 'words', 'labels'])
    test_df = pd.DataFrame(
        test_df.values, columns=['sentence_id', 'words', 'labels'])

    model_used = "/Users/mutaz/Desktop/Mutaz Thesis bert/hotel_reviews"
    mm, epp, lrr, f11, results = [], [], [], [], {}

    aspect_model = NERModel(
        "bert",
        "{}".format(model_used),
        labels=["B-A", "I-A", "O"],
        args={
            "save_eval_checkpoints": False,
            "save_steps": -1,
            'overwrite_output_dir': True,
            "save_model_every_epoch": False,
            'reprocess_input_data': True,
            "train_batch_size": 5,
            'num_train_epochs': 2,
            "gradient_accumulation_steps": 5,
            "output_dir": "/Users/mutaz/Desktop/Mutaz Thesis bert",
            "learning_rate": 0.0001,
        },
        use_cuda=False,
    )
    # Training is skipped here; the model at ``model_used`` is used as-is.

    # Evaluate on the first 100 rows, grouped back into sentences.
    test_df = test_df.iloc[:100]
    a = test_df.groupby("sentence_id")
    gps = [group for _key, group in a]
    actual = [list(g.labels.values) for g in gps]
    sentences = [" ".join(list(g.words.values)) for g in gps]

    predictions, raw_outputs = aspect_model.predict(sentences)
    # Each prediction is a list of {word: tag} dicts; keep only the tags.
    pred = [
        [tag for word in sentence for tag in word.values()]
        for sentence in predictions
    ]
    clear_output()
    f1 = f1_score(actual, pred, mode='strict', scheme=IOB2)
    print(f1)
Exemplo n.º 22
0
            line = line.split('\t')
            if len(line) == 3:
                eval_data.append(line)

    eval_df = pd.DataFrame(eval_data,
                           columns=['sentence_id', 'words', 'labels'])

    # test = eval_df.groupby('sentence_id').count()[['words']]

    # print(test.query("words > 128"))

    # Create a NERModel
    model = NERModel('albert',
                     'albert-base-v2',
                     use_cuda=False,
                     args={
                         'overwrite_output_dir': True,
                         'reprocess_input_data': True
                     })

    # # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, predictions = model.eval_model(eval_df)

    # Predictions on arbitary text strings
    predictions, raw_outputs = model.predict(
        ["Peter er lige begyndt til badminton."])

    print(predictions)
Exemplo n.º 23
0
    "evaluate_during_training": False,

    #    "process_count": cpu_count() - 2 if cpu_count() > 2 else 1,
    "process_count": 2,
    "n_gpu": 1,
}

from simpletransformers.ner import NERModel
import logging

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Create a NERModel: BERT architecture initialised from 'bert-base-cased',
# with CUDA enabled. `label_list` and `args` are defined earlier in the script.
model = NERModel('bert',
                 'bert-base-cased',
                 use_cuda=True,
                 labels=label_list,
                 args=args)

# Train on `train_df` (built earlier in the script).
model.train_model(train_df)

# Two free-text sentences used as a smoke test for prediction.
sentences = [
    "Mary M. Mrdutt, M.D., from the Baylor Scott & White Memorial Hospital in Temple, Texas, and colleagues prospectively measured frailty in elective surgery patients in a health care system.",
    "To be clear, there are currently no legal requirements for any cosmetic manufacturer marketing products to American consumers to test their products for safety"
]
# predict() returns a pair; only the first element is printed below.
predictions, raw_outputs = model.predict(sentences)

print(predictions)
Exemplo n.º 24
0
def main(sentence):
    """Print the NER predictions for a single sentence.

    A 'bert' NERModel is loaded from the ``outputs/`` directory with CUDA
    disabled, and ``sentence`` is submitted to it as a one-element batch.
    """
    ner_model = NERModel('bert', 'outputs/', use_cuda=False)
    batch = [sentence]
    tagged, _raw_outputs = ner_model.predict(batch)
    print(tagged)
Exemplo n.º 25
0
    [0, "text", "O"],
    [0, "classification", "B-MISC"],
    [1, "Simple", "B-MISC"],
    [1, "Transformers", "I-MISC"],
    [1, "then", "O"],
    [1, "expanded", "O"],
    [1, "to", "O"],
    [1, "perform", "O"],
    [1, "NER", "B-MISC"],
]
# Evaluation frame: one row per token, with the sentence it belongs to and
# its tag.
eval_df = pd.DataFrame(eval_data, columns=["sentence_id", "words", "labels"])

# Create a NERModel (BERT architecture, 'bert-base-cased' weights).
# The args allow an existing output directory to be overwritten and force the
# input data to be reprocessed.
model = NERModel("bert",
                 "bert-base-cased",
                 args={
                     "overwrite_output_dir": True,
                     "reprocess_input_data": True
                 })

# Train the model (`train_df` is built earlier in the script)
model.train_model(train_df)

# Evaluate the model; eval_model returns a 3-tuple unpacked here
result, model_outputs, predictions = model.eval_model(eval_df)

# Predictions on arbitrary text strings
# NOTE: this rebinds `predictions`, replacing the value from eval_model above.
predictions, raw_outputs = model.predict(["Some arbitary sentence"])

print(predictions)
Exemplo n.º 26
0
class DependencyModel:
    """Relation extractor built from two models.

    An NER model tags the entity words of a sentence, and a classification
    model labels the dependency-tree path between the two entities with a
    relation.
    """

    def __init__(self, ner_path: str, deps_path: str):
        """Create/Load a new DependencyModel

        Args:
            ner_path (str): directory of NER model. (if not exists, create a new model)
            deps_path (str): directory of classification model.
        """
        self.ner_path = ner_path
        try:
            self.ner_model = NERModel("distilbert", ner_path, use_cuda=False)
        except Exception:
            # Nothing loadable at ner_path: start from the pretrained base
            # model. `Exception` (not a bare except) so that Ctrl-C and
            # SystemExit still propagate.
            self.ner_model = NERModel("distilbert", "distilbert-base-uncased",
                labels=custom_labels, use_cuda=False)
        self.deps_model = ClassifyModel(deps_path)

    def train_ner(self, ner_data: Iterator[Tuple[str, str, str]] ):
        """Train the NER model

        Each sentence is split into runs of letters/apostrophes; a word is
        labelled "E" when it equals either entity (case-insensitively) and
        "O" otherwise.

        Args:
            ner_data (Iterator[Tuple[str, str, str]]): iterator of tuple of (sentence, entity1, entity2).
        """
        tokenizer = nltk.RegexpTokenizer(r"[A-Za-z']+")

        rows = []
        for i, (sentence, e1, e2) in enumerate(ner_data):
            # Lower-case the entities once per sentence, not once per word.
            entity_words = {e1.lower(), e2.lower()}
            for word in tokenizer.tokenize(sentence):
                label = "E" if word.lower() in entity_words else "O"
                rows.append((i, word, label))

        # Passing `columns=` up front also works when `rows` is empty, whereas
        # assigning three column names to an empty frame raises.
        train_df = pd.DataFrame(rows, columns=["sentence_id", "words", "labels"])
        # train_model is the public entry point that accepts a DataFrame;
        # NERModel.train is the inner loop and expects a prepared dataset.
        self.ner_model.train_model(train_df, output_dir=self.ner_path)

    def train_deps(self, train_data: Iterator[Tuple[str, Tuple[str, str, str]]]):
        """Train the dependency tree path classification model.

        Every example is converted to a pair of (space-joined shortest
        dependency path between the two entities, index of the relation in
        ``relation_list``), and those pairs are handed to the classifier.

        Args:
            train_data (Iterator[Tuple[str, Tuple[str, str, str]]]): iterator of (sentence, (relation, entity1, entity2))
        """
        classify_data: List[Tuple[str, int]] = []

        for sentence, (relation, e1, e2) in train_data:
            tree = DepTree(sentence)
            path = " ".join(tree.shortest_path(e1, e2))
            classify_data.append((path, relation_list.index(relation)))
        # Train on the converted pairs, not on the raw input (which may be an
        # iterator that the loop above has already exhausted).
        self.deps_model.train(classify_data)

    def train(self, train_data: Iterator[Tuple[str, Tuple[str, str, str]]]):
        """Train both the NER model AND the classification model.

        Args:
            train_data (Iterator[Tuple[str, Tuple[str, str, str]]]): iterator of (sentence, (relation, entity1, entity2))
        """
        # Materialise once: the data is traversed twice below.
        train_data = list(train_data)
        self.train_ner([(sent, e1, e2) for sent, (_, e1, e2) in train_data])
        self.train_deps(train_data)

    def predict(self, data: Iterable[str]) -> List[Tuple[str, Tuple[str, str]]]:
        """Predict the relation extracted from input sentences.

        Args:
            data (Iterable[str]): list of input sentences.

        Returns:
            List[Tuple[str, Tuple[str, str]]]: list of (relation, (entity1, entity2))
        """
        data = list(data)
        # Only the raw outputs are consumed; get_entity derives the entity
        # pair from them.
        _, raws = self.ner_model.predict(data)

        predict_data = []
        entities = []
        for raw, sentence in zip(raws, data):
            e1, e2 = get_entity(raw)
            tree = DepTree(sentence)
            entities.append((e1, e2))
            predict_data.append(" ".join(tree.shortest_path(e1, e2)))
        return list(zip(self.deps_model.predict(predict_data), entities))
Exemplo n.º 27
0
class BioAnalysis:
    """Experiment harness around three taggers trained on the same data.

    It holds a CRF sequence tagger, a RoBERTa token classifier (NER) and a
    RoBERTa sentence classifier (called NEL here), plus a pipeline that feeds
    the NER predictions into the NEL classifier.
    """

    def __init__(self,
                 train_file="./data/train.tsv",
                 dev_file="./data/dev.tsv",
                 test_file="./data/test.tsv"):
        """Load the three splits and precompute the inputs of every model.

        Args:
            train_file: path handed to ``file_opener`` for the training split.
            dev_file: path handed to ``file_opener`` for the dev split.
            test_file: path handed to ``file_opener`` for the test split.
        """
        self.train_data = file_opener(train_file)
        self.dev_data = file_opener(dev_file)
        self.test_data = file_opener(test_file)
        # NOTE(review): item 192 of the test split is dropped unconditionally;
        # the reason is not visible here (presumably a known-bad record).
        # This raises IndexError on a shorter split - confirm.
        self.test_data.pop(192)
        self.crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                        c1=0.09684573395986483,
                                        c2=0.0800864058815976,
                                        max_iterations=100,
                                        all_possible_transitions=True)
        # Filled in by crf_predict().
        self.dev_predicted = None
        self.test_predicted = None
        self.dev_labels = None
        self.test_labels = None
        # Tag inventory given to the RoBERTa NER model.
        self.labels = [
            'B-Abiotic_Entity', 'B-Aggregate_Biotic_Abiotic_Entity',
            'B-Biotic_Entity', 'B-Eventuality', 'B-Location', 'B-Quality',
            'B-Time', 'B-Unit', 'B-Value', 'I-Abiotic_Entity',
            'I-Aggregate_Biotic_Abiotic_Entity', 'I-Biotic_Entity',
            'I-Eventuality', 'I-Location', 'I-Quality', 'I-Time', 'I-Unit',
            'I-Value', 'O'
        ]

        # NEL state: model, evaluation results and links are set later.
        self.roberta_nel_model = None
        self.roberta_nel_dev_eval = None
        self.roberta_nel_test_eval = None
        self.roberta_nel_dev_links = None
        self.roberta_nel_test_links = None
        self.roberta_nel_train_data, _ = get_roberta_nel_data(self.train_data)
        self.roberta_nel_dev_data, self.roberta_nel_dev_spans = get_roberta_nel_data(
            self.dev_data)
        self.roberta_nel_test_data, self.roberta_nel_test_spans = get_roberta_nel_data(
            self.test_data)

        # NER state: model and evaluation results are set later.
        self.roberta_ner_model = None
        self.roberta_ner_dev_eval = None
        self.roberta_ner_test_eval = None
        self.roberta_ner_train_data = get_roberta_ner_data(self.train_data)
        self.roberta_ner_dev_data = get_roberta_ner_data(self.dev_data)
        self.roberta_ner_test_data = get_roberta_ner_data(self.test_data)

    def crf_fit(self):
        """Fit the CRF on the features/labels of the training split."""
        self.crf.fit(*get_features_labels(self.train_data))

    def crf_predict(self):
        """Run the CRF on dev and test, storing gold labels and predictions."""
        dev_feat, self.dev_labels = get_features_labels(self.dev_data)
        test_feat, self.test_labels = get_features_labels(self.test_data)
        self.dev_predicted = self.crf.predict(dev_feat)
        self.test_predicted = self.crf.predict(test_feat)

    @staticmethod
    def _print_crf_scores(gold, predicted, kwargs, verbose):
        """Print weighted F1 (and precision/recall when verbose) for one split."""
        if verbose:
            print("Precision:",
                  metrics.flat_precision_score(gold, predicted, **kwargs))
            print("Recall:",
                  metrics.flat_recall_score(gold, predicted, **kwargs))
        print("F1:", metrics.flat_f1_score(gold, predicted, **kwargs))

    def crf_evaluate(self, verbose=False, labels=False):
        """Print CRF scores for the dev and test splits.

        Requires crf_predict() to have been called first.

        Args:
            verbose: also print precision and recall, not only F1.
            labels: labels to score over; when falsy, every class known to
                the CRF except "O" is used.
        """
        if labels:
            lab = labels
        else:
            # Filter into a new list instead of calling .remove("O"): this
            # leaves the list returned by classes_ untouched and does not
            # raise when the training data contained no "O" tag.
            lab = [label for label in self.crf.classes_ if label != "O"]
        kwargs = {"average": "weighted", "labels": lab}
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            print("Dev Results\n===========")
            self._print_crf_scores(self.dev_labels, self.dev_predicted,
                                   kwargs, verbose)
            print("\nTest Results\n============")
            self._print_crf_scores(self.test_labels, self.test_predicted,
                                   kwargs, verbose)

    def roberta_nel_train(self):
        """Train a fresh roberta-base classifier; output goes to nel_outputs/."""
        train_df = pd.DataFrame(self.roberta_nel_train_data)
        self.roberta_nel_model = ClassificationModel(
            "roberta",
            "roberta-base",
            args={
                "num_train_epochs": 3,
                "overwrite_output_dir": True,
                "output_dir": "nel_outputs/"
            })
        self.roberta_nel_model.train_model(train_df)

    def roberta_nel_eval(self):
        """Evaluate the NEL classifier on dev and test and print both F1s.

        ``f1_score`` is passed to eval_model under the keyword ``acc``, so
        the F1 value is read back from the ``"acc"`` entry of the result.
        """
        dev_df = pd.DataFrame(self.roberta_nel_dev_data)
        test_df = pd.DataFrame(self.roberta_nel_test_data)
        self.roberta_nel_dev_eval = self.roberta_nel_model.eval_model(
            dev_df, acc=f1_score)
        self.roberta_nel_test_eval = self.roberta_nel_model.eval_model(
            test_df, acc=f1_score)
        print("Dev NEL Results\n===========")
        print("F1:", self.roberta_nel_dev_eval[0]["acc"])
        print("\nTest NEL Results\n============")
        print("F1:", self.roberta_nel_test_eval[0]["acc"])

    def roberta_nel_load_model(self):
        """Load the NEL classifier from the nel_outputs/ directory."""
        self.roberta_nel_model = ClassificationModel(
            "roberta", "nel_outputs/", args={"num_train_epochs": 3})

    def roberta_ner_train(self):
        """Train a fresh roberta-base NER model; output goes to ner_outputs/."""
        train_df = pd.DataFrame(self.roberta_ner_train_data,
                                columns=['sentence_id', 'words', 'labels'])
        self.roberta_ner_model = NERModel("roberta",
                                          "roberta-base",
                                          labels=self.labels,
                                          args={
                                              "num_train_epochs": 3,
                                              "overwrite_output_dir": True,
                                              "output_dir": "ner_outputs/"
                                          })
        self.roberta_ner_model.train_model(train_df)

    def roberta_ner_eval(self):
        """Evaluate the NER model on dev and test and print P/R/F1 for each.

        The full eval_model results are kept on the instance because the
        pipeline reads the per-token predictions out of them.
        """
        dev_df = pd.DataFrame(self.roberta_ner_dev_data,
                              columns=['sentence_id', 'words', 'labels'])
        test_df = pd.DataFrame(self.roberta_ner_test_data,
                               columns=['sentence_id', 'words', 'labels'])
        self.roberta_ner_dev_eval = self.roberta_ner_model.eval_model(
            dev_df, "./ner_outputs/")
        self.roberta_ner_test_eval = self.roberta_ner_model.eval_model(
            test_df, "./ner_outputs/")
        print("Dev NER Results\n===========")
        print("Precision:", self.roberta_ner_dev_eval[0]["precision"])
        print("Recall:", self.roberta_ner_dev_eval[0]["recall"])
        print("F1:", self.roberta_ner_dev_eval[0]["f1_score"])
        print("\nTest NER Results\n============")
        print("Precision:", self.roberta_ner_test_eval[0]["precision"])
        print("Recall:", self.roberta_ner_test_eval[0]["recall"])
        print("F1:", self.roberta_ner_test_eval[0]["f1_score"])

    def roberta_ner_load_model(self):
        """Load the NER model from the ner_outputs/ directory."""
        self.roberta_ner_model = NERModel("roberta",
                                          "ner_outputs/",
                                          labels=self.labels,
                                          args={"num_train_epochs": 3})

    @staticmethod
    def _with_predicted_labels(sentences, predicted):
        """Return a deep copy of sentences with field 2 of each token replaced.

        ``predicted[ii][jj]`` supplies the new value for token ``jj`` of
        sentence ``ii``. Indexing (rather than zip) is deliberate: a
        prediction list shorter than its sentence raises IndexError instead
        of silently leaving tokens unchanged.
        """
        relabelled = deepcopy(sentences)
        for ii, sentence in enumerate(relabelled):
            for jj, token in enumerate(sentence):
                fields = list(token)
                fields[2] = predicted[ii][jj]
                sentence[jj] = fields
        return relabelled

    def roberta_ner_nel_pipeline(self):
        """Run NER, feed its predictions to NEL, and print the combined F1.

        Each model is loaded from disk if possible and trained otherwise
        (a load that raises OSError triggers training). The dev and test
        link lists are stored on the instance.
        """
        try:
            self.roberta_ner_load_model()
        except OSError:
            self.roberta_ner_train()
        self.roberta_ner_eval()

        # Element 2 of the eval_model result holds the per-token predictions.
        roberta_dev_phrases, roberta_dev_spans = get_roberta_nel_data(
            self._with_predicted_labels(self.dev_data,
                                        self.roberta_ner_dev_eval[2]))
        roberta_test_phrases, roberta_test_spans = get_roberta_nel_data(
            self._with_predicted_labels(self.test_data,
                                        self.roberta_ner_test_eval[2]))

        try:
            self.roberta_nel_load_model()
        except OSError:
            self.roberta_nel_train()

        # The phrase texts feed both the classifier and the link output.
        dev_texts = [x[0] for x in roberta_dev_phrases]
        test_texts = [x[0] for x in roberta_test_phrases]
        roberta_dev_prediction = self.roberta_nel_model.predict(dev_texts)[0]
        roberta_test_prediction = self.roberta_nel_model.predict(test_texts)[0]

        roberta_dev_actual = [x[1] for x in self.roberta_nel_dev_data]
        roberta_test_actual = [x[1] for x in self.roberta_nel_test_data]

        dev_prediction = transform_nel_results(roberta_dev_prediction,
                                               roberta_dev_spans)
        dev_actual = transform_nel_results(roberta_dev_actual,
                                           self.roberta_nel_dev_spans)
        dev_actual, dev_prediction = resolve_diff(dev_actual, dev_prediction)

        test_prediction = transform_nel_results(roberta_test_prediction,
                                                roberta_test_spans)
        test_actual = transform_nel_results(roberta_test_actual,
                                            self.roberta_nel_test_spans)
        test_actual, test_prediction = resolve_diff(test_actual,
                                                    test_prediction)
        print("Dev NEL Combined Results\n===========")
        print("F1:", f1_score(dev_actual, dev_prediction))
        print("Test NEL Combined Results\n===========")
        print("F1:", f1_score(test_actual, test_prediction))

        dev_output = list(zip(dev_texts, roberta_dev_prediction))
        self.roberta_nel_dev_links = get_links(dev_output)
        test_output = list(zip(test_texts, roberta_test_prediction))
        self.roberta_nel_test_links = get_links(test_output)
Exemplo n.º 28
0
    labels = f.readlines()
labels = [i.strip() for i in labels]

# Options handed to NERModel through `args=` below.
train_args = {
    "output_dir": "ner_output",
    "overwrite_output_dir": True,
    "use_multiprocessing": False,
    "save_steps": 0,
    "use_early_stopping": True,
    "early_stopping_patience": 4,
    "evaluate_during_training": True,
    "reprocess_input_data": False,
    "use_cached_eval_features": True,
    "fp16": False,
    "num_train_epochs": 10,
    "evaluate_during_training_steps": 10000,
    "train_batch_size": 32,
    'cross_entropy_ignore_index': 0,
    'classification_report': True
}

# Load an ELECTRA NER model from a saved checkpoint directory, on CUDA, using
# the `labels` list read from file above.
# NOTE(review): `crf=True` presumably enables a CRF output layer - confirm
# that the NERModel version in use accepts this keyword.
model = NERModel("electra",
                 "ner_output/checkpoint-150000",
                 args=train_args,
                 labels=labels,
                 use_cuda=True,
                 crf=True)

# Each input is turned into a list of single characters with list(...), and
# split_on_space=False is passed alongside (presumably so predict() does not
# split the input on whitespace again - confirm).
result = model.predict([list('发烧头痛3天'), list('见盲肠底,升结肠近肝曲')],
                       split_on_space=False)
print(result)
Exemplo n.º 29
0
 def roberta_ner_load_model(self):
     """Load the RoBERTa NER model stored under ``ner_outputs/``.

     The model is created with this object's label list and stored on
     ``self.roberta_ner_model``.
     """
     load_args = {"num_train_epochs": 3}
     self.roberta_ner_model = NERModel(
         "roberta", "ner_outputs/", labels=self.labels, args=load_args)
Exemplo n.º 30
0
              [1, 'perform', 'O'], [1, 'NER', 'B-MISC']]
# Training frame: one row per token (sentence id, word, tag).
train_df = pd.DataFrame(train_data, columns=['sentence_id', 'words', 'labels'])

# Evaluation rows in the same (sentence_id, word, tag) layout.
# NOTE(review): 'built' and 'for' carry sentence_id 1 although the rows around
# them ('was', 'text') carry 0 - this looks like a data slip; confirm intent.
eval_data = [[0, 'Simple', 'B-MISC'], [0, 'Transformers', 'I-MISC'],
             [0, 'was', 'O'], [1, 'built', 'O'], [1, 'for', 'O'],
             [0, 'text', 'O'], [0, 'classification', 'B-MISC'],
             [1, 'Simple', 'B-MISC'], [1, 'Transformers', 'I-MISC'],
             [1, 'then', 'O'], [1, 'expanded', 'O'], [1, 'to', 'O'],
             [1, 'perform', 'O'], [1, 'NER', 'B-MISC']]
eval_df = pd.DataFrame(eval_data, columns=['sentence_id', 'words', 'labels'])

# Create a NERModel (BERT architecture, 'bert-base-cased' weights, CPU only).
# Multiprocessing is turned off; an existing output directory may be
# overwritten and the input data is reprocessed.
model = NERModel('bert',
                 'bert-base-cased',
                 args={
                     'use_multiprocessing': False,
                     'overwrite_output_dir': True,
                     'reprocess_input_data': True
                 },
                 use_cuda=False)

# Train the model
model.train_model(train_df)

# Evaluate the model; eval_model returns a 3-tuple unpacked here
result, model_outputs, predictions = model.eval_model(eval_df)

# Predictions on arbitrary text strings
# NOTE: this rebinds `predictions`, replacing the value from eval_model above.
predictions, raw_outputs = model.predict(["Some arbitary sentence"])

print(predictions)