Example #1
File: main.py Project: Soumi7/St_webapp
def predict(sentence):
    if sentence:
        model1 = NERModel('bert',
                          'NERMODEL1',
                          labels=[
                              "B-sector", "I-sector", "B-funda", "O",
                              "operator", "threshold", "Join", "B-attr",
                              "I-funda", "TPQty", "TPUnit", "Sortby", "B-eco",
                              "I-eco", "B-index", "Capitalization", "I-",
                              "funda", "B-security", 'I-security', 'Number',
                              'Sector', 'TPMonth', 'TPYr', 'TPRef'
                          ],
                          args={
                              "save_eval_checkpoints": False,
                              "save_steps": -1,
                              "output_dir": "NERMODEL",
                              'overwrite_output_dir': True,
                              "save_model_every_epoch": False,
                              'reprocess_input_data': True,
                              "train_batch_size": 10,
                              'num_train_epochs': 15,
                              "max_seq_length": 64
                          },
                          use_cuda=False)

        predictions, raw_outputs = model1.predict([sentence])
        result = json.dumps(predictions[0])
        return result
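
For context, NERModel.predict returns a (predictions, raw_outputs) pair in which predictions holds one list of single-key {word: label} dicts per input sentence, so the JSON string built above decodes back into that token/label structure. A minimal sketch of consuming it (the sentence and tags below are made up):

import json

# Made-up example of what the function above might return; the actual
# tags depend on the trained NERMODEL1 checkpoint.
result = '[{"screen": "O"}, {"large": "Capitalization"}, {"caps": "Capitalization"}]'

for token_pred in json.loads(result):
    # each element is a single-key dict mapping a word to its predicted label
    (word, label), = token_pred.items()
    print(word, "->", label)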
Example #2
def test_named_entity_recognition():
    # Creating train_df and eval_df for demonstration
    train_data = [
        [0, "Simple", "B-MISC"],
        [0, "Transformers", "I-MISC"],
        [0, "started", "O"],
        [1, "with", "O"],
        [0, "text", "O"],
        [0, "classification", "B-MISC"],
        [1, "Simple", "B-MISC"],
        [1, "Transformers", "I-MISC"],
        [1, "can", "O"],
        [1, "now", "O"],
        [1, "perform", "O"],
        [1, "NER", "B-MISC"],
    ]
    train_df = pd.DataFrame(train_data,
                            columns=["sentence_id", "words", "labels"])

    eval_data = [
        [0, "Simple", "B-MISC"],
        [0, "Transformers", "I-MISC"],
        [0, "was", "O"],
        [1, "built", "O"],
        [1, "for", "O"],
        [0, "text", "O"],
        [0, "classification", "B-MISC"],
        [1, "Simple", "B-MISC"],
        [1, "Transformers", "I-MISC"],
        [1, "then", "O"],
        [1, "expanded", "O"],
        [1, "to", "O"],
        [1, "perform", "O"],
        [1, "NER", "B-MISC"],
    ]
    eval_df = pd.DataFrame(eval_data,
                           columns=["sentence_id", "words", "labels"])

    # Create a NERModel
    model = NERModel(
        "bert",
        "bert-base-cased",
        args={
            "no_save": True,
            "overwrite_output_dir": True,
            "reprocess_input_data": False
        },
        use_cuda=False,
    )

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, predictions = model.eval_model(eval_df)

    # Predictions on arbitrary text strings
    predictions, raw_outputs = model.predict(["Some arbitrary sentence"])
Example #3
def predict():
    if request.method == 'POST':
        test_sents = pd.read_csv(request.files.get('file'))
    aspect_model = NERModel("bert", "/Users/mutaz/Desktop/Mutaz Thesis bert/hotel_reviews", use_cuda=False,
                            labels=["B-A", "I-A", "O"],
                            args={"use_cuda": False,
                                  "save_eval_checkpoints": False,
                                  "save_steps": -1,
                                  "output_dir": "MODEL",
                                  'overwrite_output_dir': True,
                                  "save_model_every_epoch": False,
                                  })

    predictions, raw_outputs = aspect_model.predict(test_sents["sentence"])
    pred = [[tag for token in sentence for _, tag in token.items()] for sentence in predictions]
    print(pred)
    # connect(test_sents, aspects_prds)
    return redirect('https://dub01.online.tableau.com/#/site/absa/views/absa_project/MAIN?:iid=20&:original_view=y')
Example #4
args.learning_rate = 1e-4
args.overwrite_output_dir = True
args.train_batch_size = 32
args.eval_batch_size = 32

model = NERModel('bert', 'bert-base-cased', labels=label, args=args)

model.train_model(train_data, eval_data=test_data, acc=accuracy_score)

result, model_outputs, preds_list = model.eval_model(test_data)

result

prediction, model_output = model.predict(["This is Nishi"])

prediction

!pip install bert-extractive-summarizer

!pip install wikipedia

import wikipedia

wiki = wikipedia.page('Amsterdam')
article=wiki.content
print(article)

from summarizer import Summarizer
model = Summarizer()
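
The snippet stops before producing a summary; with bert-extractive-summarizer the Summarizer object is callable on the text directly (the ratio value below is only an illustration):

summary = model(article, ratio=0.2)  # keep roughly 20% of the sentences
print(summary)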
Example #5
class NerModel:
    def __init__(self, modelname="", dataset=None, use_saved_model=False):
        self.dataset = dataset
        #labels_list = ["O", "B-ACT",  "I-ACT", "B-OBJ", "I-OBJ", "B-VAL", "I-VAL", "B-VAR", "I-VAR"]
        #labels_list = dataset.get_labels_list()
        labels_list = dataset['labels_list']

        output_dir = "outputs_{}".format(modelname)
        # Create a NERModel
        model_args = {
            'output_dir': output_dir,
            'overwrite_output_dir': True,
            'reprocess_input_data': True,
            
            'save_eval_checkpoints': False,
            'save_steps': -1,
            'save_model_every_epoch': False,
            
            'train_batch_size': 10, # 10
            'num_train_epochs': 10,   # 5
            'max_seq_length': 256,
            'gradient_accumulation_steps': 8,

            'labels_list': labels_list
        }
                
        if use_saved_model:
            self.model = NERModel("bert", output_dir, use_cuda=False, args=model_args)
        else:
            self.model = NERModel("bert", "bert-base-cased", use_cuda=False, args=model_args)
            # args={"overwrite_output_dir": True, "reprocess_input_data": True}

    def train(self):
        # Train the model
        if self.dataset:
            self.model.train_model(self.dataset['train'])
        else:
            raise Exception("dataset is None")

    def eval(self):
        # Evaluate the model
        if self.dataset:
            result, model_outputs, predictions = self.model.eval_model(self.dataset['val'])
            print("Evaluation result:", result)
        else:
            raise Exception("dataset is None")

    def simple_test(self):
        # Predictions on arbitrary text strings
        sentences = ["Some arbitrary sentence", "Simple Transformers sentence"]
        predictions, raw_outputs = self.model.predict(sentences)
        print(predictions)

        # More detailed predictions
        for n, (preds, outs) in enumerate(zip(predictions, raw_outputs)):
            print("\n___________________________")
            print("Sentence: ", sentences[n])
            for pred, out in zip(preds, outs):
                key = list(pred.keys())[0]
                new_out = out[key]
                preds = list(softmax(np.mean(new_out, axis=0)))
                print(key, pred[key], preds[np.argmax(preds)], preds)

    def predict(self, sentences):
        predictions, raw_outputs = self.model.predict(sentences)
        return predictions
Example #6
class DependencyModel:

    def __init__(self, ner_path: str, deps_path: str):
        """Create/Load a new DependencyModel

        Args:
            ner_path (str): directory of the NER model (if it does not exist, a new model is created).
            deps_path (str): directory of the classification model.
        """
        self.ner_path = ner_path
        try:
            self.ner_model = NERModel("distilbert", ner_path, use_cuda=False)
        except Exception:
            self.ner_model = NERModel("distilbert", "distilbert-base-uncased",
                labels=custom_labels, use_cuda=False)
        self.deps_model = ClassifyModel(deps_path)
    
    def train_ner(self, ner_data: Iterator[Tuple[str, str, str]] ):
        """Train the NER model

        Args:
            ner_data (Iterator[Tuple[str, str, str]]): iterator of tuple of (sentence, entity1, entity2).
        """

        train_data = []
        tokenizer = nltk.RegexpTokenizer(r"[A-Za-z']+")

        for i, (sentence, e1, e2) in enumerate(ner_data):
            for word in tokenizer.tokenize(sentence):
                label = "O"
                if word.lower() == e1.lower() or word.lower() == e2.lower():
                    label = "E"
                train_data.append((i, word, label))
        
        train_data = pd.DataFrame(train_data)
        train_data.columns = ["sentence_id", "words", "labels"]
        # NERModel's training entry point is train_model (not train)
        self.ner_model.train_model(train_data, output_dir=self.ner_path)

    def train_deps(self, train_data: Iterator[Tuple[str, Tuple[str, str, str]]]):
        """Train the dependency tree path classification model.

        Args:
            train_data (Iterator[Tuple[str, Tuple[str, str, str]]]): iterator of (sentence, (relation, entity1, entity2))
        """
        classify_data: List[Tuple[str, int]] = []

        for sentence, (relation, e1, e2) in train_data:
            tree = DepTree(sentence)
            classify_data.append((" ".join(tree.shortest_path(e1,e2)), relation_list.index(relation)))
        self.deps_model.train(classify_data)
    
    def train(self, train_data: Iterator[Tuple[str, Tuple[str, str, str]]]):
        """Train both the NER model AND the classification model.

        Args:
            train_data (Iterator[Tuple[str, Tuple[str, str, str]]]): iterator of (sentence, (relation, entity1, entity2))
        """
        train_data = list(train_data)
        self.train_ner([ (sent, e1, e2) for sent, (rela, e1, e2) in train_data ])
        self.train_deps(train_data)
    
    def predict(self, data: Iterable[str]) -> List[Tuple[str, Tuple[str, str]]]:
        """Predict the relation extracted from input sentences.

        Args:
            data (Iterable[str]): list of input sentences.

        Returns:
            List[Tuple[str, Tuple[str, str]]]: list of (relation, (entity1, entity2))
        """
        data = list(data)
        predictions, raws = self.ner_model.predict(data)

        predict_data = []
        entities = []
        for raw, sentence in zip(raws, data):
            [e1, e2] = get_entity(raw)
            tree = DepTree(sentence)
            entities.append((e1, e2))
            predict_data.append(" ".join(tree.shortest_path(e1,e2)))
        return list(zip(self.deps_model.predict(predict_data), entities))
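
Example #6 relies on a get_entity helper that is not shown. A plausible sketch, assuming the raw outputs follow the usual simpletransformers shape (one list of {word: per-sub-token logits} dicts per sentence) and that index 1 of the logit vector corresponds to the "E" label; both assumptions would need checking against the real helper:

import numpy as np

def get_entity(raw_output):
    # raw_output: raw predictions for one sentence, i.e. a list of
    # {word: logits-per-sub-token} dicts (assumed shape).
    scores = []
    for token in raw_output:
        (word, logits), = token.items()
        mean_logits = np.mean(np.asarray(logits), axis=0)  # average over sub-tokens
        scores.append((float(mean_logits[1]), word))  # assumed: index 1 == "E" label
    # return the two words scored most strongly as entities
    return [word for _, word in sorted(scores, reverse=True)[:2]]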
Example #7
if __name__ == '__main__':
    train_df = txt_to_df("/Users/mutaz/Desktop/Mutaz Thesis bert/Data/raw/train.txt")
    test_df = txt_to_df("/Users/mutaz/Desktop/Mutaz Thesis bert/Data/raw/test.txt")
    test_df.reset_index(drop=True, inplace=True)
    # test_df["Sentence #"] = test_df["Sentence #"].values
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)
    train_df = pd.DataFrame(train_df.values, columns=['sentence_id', 'words', 'labels'])
    test_df = pd.DataFrame(test_df.values, columns=['sentence_id', 'words', 'labels'])
    model_used = "/Users/mutaz/Desktop/Mutaz Thesis bert/hotel_reviews"
    mm, epp, lrr, f11, results = [], [], [], [], {}
    aspect_model = NERModel("bert", model_used,
                            labels=["B-A", "I-A", "O"],
                            args={"save_eval_checkpoints": False, "save_steps": -1, 'overwrite_output_dir': True,
                                  "save_model_every_epoch": False,
                                  'reprocess_input_data': True, "train_batch_size": 5, 'num_train_epochs': 2,
                                  "gradient_accumulation_steps": 5,
                                  "output_dir": "/Users/mutaz/Desktop/Mutaz Thesis bert"
                                , "learning_rate": 0.0001}, use_cuda=False)
    # aspect_model.train_model(train_df)
    test_df = test_df.iloc[:100]
    a = test_df.groupby("sentence_id")
    gps = [a.get_group(key) for key, item in a]
    actual = [list(g.labels.values) for g in gps]
    sentences = [" ".join(i) for i in [[w for w in g.words.values] for g in gps]]
    predictions, raw_outputs = aspect_model.predict(sentences)
    pred = [[tag for token in sentence for _, tag in token.items()] for sentence in predictions]
    clear_output()
    f1 = f1_score(actual, pred, mode='strict', scheme=IOB2)
    print(f1)
Example #8
            line = line.split('\t')
            if len(line) == 3:
                eval_data.append(line)

    eval_df = pd.DataFrame(eval_data,
                           columns=['sentence_id', 'words', 'labels'])

    # test = eval_df.groupby('sentence_id').count()[['words']]

    # print(test.query("words > 128"))

    # Create a NERModel
    model = NERModel('albert',
                     'albert-base-v2',
                     use_cuda=False,
                     args={
                         'overwrite_output_dir': True,
                         'reprocess_input_data': True
                     })

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, predictions = model.eval_model(eval_df)

    # Predictions on arbitrary text strings
    predictions, raw_outputs = model.predict(
        ["Peter er lige begyndt til badminton."])

    print(predictions)
Example #9
class CustomSlotAnalysis(BaseEstimator, TransformerMixin):
	def __init__(self):
		device = str(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
		if device == 'cpu':
			use_cuda = False
		else:
			use_cuda = True
		model_args = {"use_multiprocessing": False}
		self.labels = ['B-aircraft_code',
				'B-airline_code',
				'B-airline_name',
				'B-airport_code',
				'B-airport_name',
				'B-arrive_date.date_relative',
				'B-arrive_date.day_name',
				'B-arrive_date.day_number',
				'B-arrive_date.month_name',
				'B-arrive_date.today_relative',
				'B-arrive_time.end_time',
				'B-arrive_time.period_mod',
				'B-arrive_time.period_of_day',
				'B-arrive_time.start_time',
				'B-arrive_time.time',
				'B-arrive_time.time_relative',
				'B-booking_class',
				'B-city_name',
				'B-class_type',
				'B-compartment',
				'B-connect',
				'B-cost_relative',
				'B-day_name',
				'B-day_number',
				'B-days_code',
				'B-depart_date.date_relative',
				'B-depart_date.day_name',
				'B-depart_date.day_number',
				'B-depart_date.month_name',
				'B-depart_date.today_relative',
				'B-depart_date.year',
				'B-depart_time.end_time',
				'B-depart_time.period_mod',
				'B-depart_time.period_of_day',
				'B-depart_time.start_time',
				'B-depart_time.time',
				'B-depart_time.time_relative',
				'B-economy',
				'B-fare_amount',
				'B-fare_basis_code',
				'B-flight',
				'B-flight_days',
				'B-flight_mod',
				'B-flight_number',
				'B-flight_stop',
				'B-flight_time',
				'B-fromloc.airport_code',
				'B-fromloc.airport_name',
				'B-fromloc.city_name',
				'B-fromloc.state_code',
				'B-fromloc.state_name',
				'B-meal',
				'B-meal_code',
				'B-meal_description',
				'B-mod',
				'B-month_name',
				'B-or',
				'B-period_of_day',
				'B-restriction_code',
				'B-return_date.date_relative',
				'B-return_date.day_name',
				'B-return_date.day_number',
				'B-return_date.month_name',
				'B-return_date.today_relative',
				'B-return_time.period_mod',
				'B-return_time.period_of_day',
				'B-round_trip',
				'B-state_code',
				'B-state_name',
				'B-stoploc.airport_code',
				'B-stoploc.airport_name',
				'B-stoploc.city_name',
				'B-stoploc.state_code',
				'B-time',
				'B-time_relative',
				'B-today_relative',
				'B-toloc.airport_code',
				'B-toloc.airport_name',
				'B-toloc.city_name',
				'B-toloc.country_name',
				'B-toloc.state_code',
				'B-toloc.state_name',
				'B-transport_type',
				'I-airline_name',
				'I-airport_name',
				'I-arrive_date.date_relative',
				'I-arrive_date.day_number',
				'I-arrive_time.end_time',
				'I-arrive_time.period_of_day',
				'I-arrive_time.start_time',
				'I-arrive_time.time',
				'I-arrive_time.time_relative',
				'I-city_name',
				'I-class_type',
				'I-cost_relative',
				'I-depart_date.date_relative',
				'I-depart_date.day_number',
				'I-depart_date.today_relative',
				'I-depart_time.end_time',
				'I-depart_time.period_of_day',
				'I-depart_time.start_time',
				'I-depart_time.time',
				'I-depart_time.time_relative',
				'I-economy',
				'I-fare_amount',
				'I-fare_basis_code',
				'I-flight_mod',
				'I-flight_number',
				'I-flight_stop',
				'I-flight_time',
				'I-fromloc.airport_name',
				'I-fromloc.city_name',
				'I-fromloc.state_name',
				'I-meal_code',
				'I-meal_description',
				'I-period_of_day',
				'I-restriction_code',
				'I-return_date.date_relative',
				'I-return_date.day_number',
				'I-return_date.today_relative',
				'I-round_trip',
				'I-state_name',
				'I-stoploc.city_name',
				'I-time',
				'I-today_relative',
				'I-toloc.airport_name',
				'I-toloc.city_name',
				'I-toloc.state_name',
				'I-transport_type',
				'O']
		self.model = NERModel('bert', '../models/checkpoint-1120-epoch-2', use_cuda=use_cuda, labels=self.labels, args=model_args)



	def fit(self):
		print('\n>>>>>>>fit() called. \n')
		return self



	def convert_to_dict(self, flag, ent, text):
		l = []
		for i in range(len(flag)):
			l.append({"slot": flag[i],
			          "value": ent[i],
			          "extractor": "slot_extractor"})
		text['entities'] = l
		return text



	def transform(self, text):
		slot_res = text['correctness']['value']
		prediction = self.model.predict([text['text']])
		# keep only the token predictions for the single input sentence
		prediction = prediction[0][0]
		ent = []
		flag = []
		for i in prediction:
			key = list(i.keys())[0]
			if i[key] != 'O':
				flag.append(i[key])
				ent.append(key.split('{')[0])

		if slot_res == 'correct':
			entities = self.convert_to_dict(flag, ent, text)
			return entities
		else:
			entities = self.convert_to_dict('-', '-', text)
			return entities
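
A usage sketch of the transformer above (the message dict shape is inferred from transform(), and the checkpoint path passed in __init__ must exist locally):

slot_extractor = CustomSlotAnalysis()
message = {"text": "show me flights from boston to denver",
           "correctness": {"value": "correct"}}
# returns the same dict with an added 'entities' list of
# {"slot": ..., "value": ..., "extractor": "slot_extractor"} items
print(slot_extractor.transform(message))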
Example #10
transformers_logger.setLevel(logging.WARNING)

# Creating train_df and eval_df for demonstration
train_data = [
    [0, 'Simple', 'B-MISC'], [0, 'Transformers', 'I-MISC'], [0, 'started', 'O'], [1, 'with', 'O'], [0, 'text', 'O'], [0, 'classification', 'B-MISC'],
    [1, 'Simple', 'B-MISC'], [1, 'Transformers', 'I-MISC'], [1, 'can', 'O'], [1, 'now', 'O'], [1, 'perform', 'O'], [1, 'NER', 'B-MISC']
]
train_df = pd.DataFrame(train_data, columns=['sentence_id', 'words', 'labels'])

eval_data = [
    [0, 'Simple', 'B-MISC'], [0, 'Transformers', 'I-MISC'], [0, 'was', 'O'], [1, 'built', 'O'], [1, 'for', 'O'], [0, 'text', 'O'], [0, 'classification', 'B-MISC'],
    [1, 'Simple', 'B-MISC'], [1, 'Transformers', 'I-MISC'], [1, 'then', 'O'], [1, 'expanded', 'O'], [1, 'to', 'O'], [1, 'perform', 'O'], [1, 'NER', 'B-MISC']
]
eval_df = pd.DataFrame(eval_data, columns=['sentence_id', 'words', 'labels'])
# print(train_df)
# print(eval_df)
# Create a NERModel
model = NERModel('bert', 'bert-base-cased', args={'overwrite_output_dir': True, 'reprocess_input_data': True})

# Train the model
model.train_model(train_df)

# Evaluate the model
result, model_outputs, predictions = model.eval_model(eval_df)
print(result, predictions)

# Predictions on arbitrary text strings
predictions, raw_outputs = model.predict(["Simple Transformers started with text classification"])

print(predictions)
# print(raw_outputs)
Example #11
class RestorePuncts:
    def __init__(self, wrds_per_pred=250):
        self.wrds_per_pred = wrds_per_pred
        self.overlap_wrds = 30
        self.valid_labels = [
            'OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U',
            "'O", '-O', '?O', '?U'
        ]
        self.model = NERModel("bert",
                              "felflare/bert-restore-punctuation",
                              labels=self.valid_labels,
                              args={
                                  "silent": True,
                                  "max_seq_length": 512
                              })
        # use_cuda isn't working; this workaround loads the model onto the GPU correctly
        self.model.device = torch.device("cuda:1")
        # dummy punctuate call to load the model onto the GPU
        self.punctuate("hello how are you")

    def punctuate(self, text: str, batch_size: int = 32, lang: str = ''):
        """
        Performs punctuation restoration on arbitrarily large text.
        Detects if input is not English, if non-English was detected terminates predictions.
        Overrride by supplying `lang='en'`
        
        Args:
            - text (str): Text to punctuate, can be few words to as large as you want.
            - lang (str): Explicit language of input text.
        """

        #if not lang and len(text) > 10:
        #    lang = detect(text)
        #if lang != 'en':
        #    raise Exception(F"""Non English text detected. Restore Punctuation works only for English.
        #    If you are certain the input is English, pass argument lang='en' to this function.
        #    Punctuate received: {text}""")

        def chunks(L, n):
            return [L[x:x + n] for x in range(0, len(L), n)]

        # Split large text into BERT-digestible chunks
        splits = self.split_on_toks(text, self.wrds_per_pred,
                                    self.overlap_wrds)

        texts = [i["text"] for i in splits]
        batches = chunks(texts, batch_size)
        preds_lst = []

        for batch in batches:
            batch_preds, _ = self.model.predict(batch)
            preds_lst.extend(batch_preds)

        # predict slices
        # full_preds_lst contains tuple of labels and logits
        #full_preds_lst = [self.predict(i['text']) for i in splits]
        # extract predictions, and discard logits
        #preds_lst = [i[0][0] for i in full_preds_lst]
        # join text slices
        combined_preds = self.combine_results(text, preds_lst)
        # create punctuated prediction
        punct_text = self.punctuate_texts(combined_preds)
        return punct_text

    def predict(self, input_slice):
        """
        Passes the unpunctuated text to the model for punctuation.
        """
        predictions, raw_outputs = self.model.predict([input_slice])
        return predictions, raw_outputs

    @staticmethod
    def split_on_toks(text, length, overlap):
        """
        Splits text into predefined slices of overlapping text with indexes (offsets)
        that tie-back to original text.
        This is done to bypass 512 token limit on transformer models by sequentially
        feeding chunks of < 512 toks.
        Example output:
        [{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}]
        """
        wrds = text.replace('\n', ' ').split(" ")
        resp = []
        lst_chunk_idx = 0
        i = 0

        while True:
            # words in the chunk and the overlapping portion
            wrds_len = wrds[(length * i):(length * (i + 1))]
            wrds_ovlp = wrds[(length * (i + 1)):((length * (i + 1)) + overlap)]
            wrds_split = wrds_len + wrds_ovlp

            # Break loop if no more words
            if not wrds_split:
                break

            wrds_str = " ".join(wrds_split)
            nxt_chunk_start_idx = len(" ".join(wrds_len))
            lst_char_idx = len(" ".join(wrds_split))

            resp_obj = {
                "text": wrds_str,
                "start_idx": lst_chunk_idx,
                "end_idx": lst_char_idx + lst_chunk_idx,
            }

            resp.append(resp_obj)
            lst_chunk_idx += nxt_chunk_start_idx + 1
            i += 1
        logging.info(f"Sliced transcript into {len(resp)} slices.")
        return resp

    @staticmethod
    def combine_results(full_text: str, text_slices):
        """
        Given a full text and predictions of each slice combines predictions into a single text again.
        Performs validataion wether text was combined correctly
        """
        split_full_text = full_text.replace('\n', ' ').split(" ")
        split_full_text = [i for i in split_full_text if i]
        split_full_text_len = len(split_full_text)
        output_text = []
        index = 0

        if len(text_slices[-1]) <= 3 and len(text_slices) > 1:
            text_slices = text_slices[:-1]

        for _slice in text_slices:
            slice_wrds = len(_slice)
            for ix, wrd in enumerate(_slice):
                # print(index, "|", str(list(wrd.keys())[0]), "|", split_full_text[index])
                if index == split_full_text_len:
                    break

                if split_full_text[index] == str(list(wrd.keys())[0]) and \
                        ix <= slice_wrds - 3 and text_slices[-1] != _slice:
                    index += 1
                    pred_item_tuple = list(wrd.items())[0]
                    output_text.append(pred_item_tuple)
                elif split_full_text[index] == str(list(
                        wrd.keys())[0]) and text_slices[-1] == _slice:
                    index += 1
                    pred_item_tuple = list(wrd.items())[0]
                    output_text.append(pred_item_tuple)
        assert [i[0] for i in output_text] == split_full_text
        return output_text

    @staticmethod
    def punctuate_texts(full_pred: list):
        """
        Given a list of Predictions from the model, applies the predictions to text,
        thus punctuating it.
        """
        punct_resp = ""
        for i in full_pred:
            word, label = i
            if label[-1] == "U":
                punct_wrd = word.capitalize()
            else:
                punct_wrd = word

            if label[0] != "O":
                punct_wrd += label[0]

            punct_resp += punct_wrd + " "
        punct_resp = punct_resp.strip()
        # Append a trailing period if one doesn't exist.
        if punct_resp[-1].isalnum():
            punct_resp += "."
        return punct_resp
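
For clarity, each two-character label above packs the punctuation to append into the first position ('O' meaning none) and the casing into the second ('U' meaning capitalize, 'O' meaning leave as-is). A small worked example of the decoding done by punctuate_texts, with made-up predictions:

# (word, label) pairs as produced by combine_results (made-up values)
full_pred = [("hello", ",U"), ("how", "OO"), ("are", "OO"), ("you", "?O")]
print(RestorePuncts.punctuate_texts(full_pred))  # -> Hello, how are you?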
Example #12
    labels = f.readlines()
labels = [i.strip() for i in labels]

train_args = {
    "output_dir": "ner_output",
    "overwrite_output_dir": True,
    "use_multiprocessing": False,
    "save_steps": 0,
    "use_early_stopping": True,
    "early_stopping_patience": 4,
    "evaluate_during_training": True,
    "reprocess_input_data": False,
    "use_cached_eval_features": True,
    "fp16": False,
    "num_train_epochs": 10,
    "evaluate_during_training_steps": 10000,
    "train_batch_size": 32,
    'cross_entropy_ignore_index': 0,
    'classification_report': True
}

model = NERModel("electra",
                 "ner_output/checkpoint-150000",
                 args=train_args,
                 labels=labels,
                 use_cuda=True,
                 crf=True)

result = model.predict([list('发烧头痛3天'), list('见盲肠底,升结肠近肝曲')],
                       split_on_space=False)
print(result)
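
Because split_on_space=False expects pre-tokenized input, wrapping each string in list(...) yields one prediction per Chinese character. Assuming this (CRF-enabled) variant returns the same (predictions, raw_outputs) pair as stock simpletransformers, the output could be flattened back into a tagged string as follows:

# unpack the tuple returned by predict() above
predictions, raw_outputs = result
for sentence in predictions:
    # each token is a single-key {character: label} dict
    print(" ".join("{}/{}".format(char, label)
                   for token in sentence
                   for char, label in token.items()))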
Example #13
class NerModel:
    def __init__(self,
                 modelname="",
                 dataset=None,
                 use_saved_model=False,
                 input_dir=None,
                 output_dir=None):

        #pretrained_model_name = "lm_outputs_test/from_scratch/best_model"
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.dataset = dataset
        #labels_list = ["O", "B-ACT",  "I-ACT", "B-OBJ", "I-OBJ", "B-VAL", "I-VAL", "B-VAR", "I-VAR"]
        #labels_list = dataset.get_labels_list()

        labels_list = dataset['labels_list']
        #labels_list = ['O', 'B-ACT', 'I-ACT', 'B-OBJ', 'I-OBJ', 'B-CNT', 'I-CNT',
        #    'B-OPE', 'I-OPE', 'B-ORD', 'B-PRE', 'I-PRE', 'B-TYP',
        #    'B-VAL', 'I-VAL', 'B-ATT', 'I-ATT', 'B-VAR', 'I-VAR']

        #output_dir = "outputs_{}".format(modelname)
        # remove any previous output directory
        os.system("rm -rf {}".format(output_dir))

        use_cuda = torch.cuda.is_available()

        # Create a NERModel
        model_args = {
            'labels_list': labels_list,
            'output_dir': output_dir,
            'overwrite_output_dir': True,
            'reprocess_input_data': True,
            'save_eval_checkpoints': False,
            'save_steps': -1,
            'save_model_every_epoch': False,
            #'no_save' : True,
            #'no_cache': True,
            'evaluate_during_training': True,
            'num_train_epochs': 15,  # 5
            'train_batch_size': 10,  # 10   (<=10 for bert, <=5 for longformer)
            'eval_batch_size': 10,
            'max_seq_length': 128,  # default 128
            'gradient_accumulation_steps': 8,
            'learning_rate': 0.0001,  # default 4e-5; a good value is 0.0001 for albert

            #'max_position_embeddings': 64,
        }

        #self.model = NERModel("bert", pretrained_model_name, use_cuda=False, args=model_args)
        #self.model = NERModel("bert", "bert-base-uncased", use_cuda=False, args=model_args)
        #self.model = NERModel("longformer", "allenai/longformer-base-4096", use_cuda=False, args=model_args)
        #self.model = NERModel("longformer", pretrained_model_name, use_cuda=False, args=model_args)
        #self.model = NERModel("xlmroberta", "xlm-roberta-base", use_cuda=False, args=model_args)
        #self.model = NERModel("albert", "albert-base-v2", use_cuda=False, args=model_args)
        #self.model = NERModel("electra", 'google/electra-small-generator', use_cuda=False, args=model_args)
        #self.model = NERModel("layoutlm", 'microsoft/layoutlm-base-uncased', use_cuda=False, args=model_args)
        #self.model = NERModel("distilbert", "distilbert-base-cased-distilled-squad", use_cuda=False, args=model_args)

        #model_type, english_model_name  = "longformer", "allenai/longformer-base-4096"
        #model_type, english_model_name  = "mpnet", "microsoft/mpnet-base"
        #model_type, english_model_name  = "electra", "google/electra-small-discriminator"
        #model_type, english_model_name  = "squeezebert", "squeezebert/squeezebert-uncased"
        #model_type, english_model_name  = "albert", "albert-base-v2"
        #model_type, english_model_name  = "xlmroberta", "xlm-roberta-base"
        model_type, english_model_name = "roberta", "distilroberta-base"
        #model_type, english_model_name  = "bert", "bert-base-uncased"
        #model_type, english_model_name  = "distilbert", "distilbert-base-uncased"

        if input_dir:
            # Use a previously trained model (on NER or LM tasks)
            self.model = NERModel(model_type,
                                  input_dir,
                                  use_cuda=use_cuda,
                                  args=model_args)
        else:
            # Use a pre-trained (English) model
            self.model = NERModel(model_type,
                                  english_model_name,
                                  use_cuda=use_cuda,
                                  args=model_args)  # force_download=True
        """
        if use_saved_model:
            if path:
                # Use a model located in a given folder
                self.model = NERModel("longformer", path, use_cuda=False, args=model_args)
            else:
                # Use a previously trained model (on NER or LM tasks)
                self.model = NERModel("longformer", output_dir, use_cuda=False, args=model_args)
        else:
            # Use a pre-trained (English) model
            self.model = NERModel("longformer", "allenai/longformer-base-4096", use_cuda=False, args=model_args)
        """
        """
        if use_saved_model:
            self.model = NERModel("bert", output_dir, use_cuda=False, args=model_args)
        else:
            self.model = NERModel("bert", pretrained_model_name, use_cuda=False, args=model_args)
            # args={"overwrite_output_dir": True, "reprocess_input_data": True}
        """

        self.model_info = {
            'model_type': model_type,
            'english_model_name': english_model_name,
            'input_dir': input_dir
        }

    def train(self):
        # Train the model
        if self.dataset:
            global_step, training_details = self.model.train_model(
                self.dataset['train'], eval_data=self.dataset['val'])
        else:
            raise Exception("dataset is None")

        print("global_step:", global_step)
        print("training_details:", training_details)
        #training_details: {'global_step': [4], 'precision': [0.6987951807228916], 'recall': [0.4027777777777778],
        #    'f1_score': [0.5110132158590308], 'train_loss': [0.41127926111221313], 'eval_loss': [0.6365557760000229]}
        # it contains f1_score only for the validation dataset
        return training_details

    def eval(self):
        # Evaluate the model
        if self.dataset:
            res_train, model_outputs, predictions = self.model.eval_model(
                self.dataset['train'])
            res_val, model_outputs, predictions = self.model.eval_model(
                self.dataset['val'])
            print("Evaluation")
            #print("On train data:", result)
            #{'eval_loss': 0.8920, 'precision': 0.0833, 'recall': 0.027, 'f1_score': 0.0416}
            print("train loss: {:.3f}; prec/recall/f1: {:.3f}/{:.3f}/{:.3f}".
                  format(res_train['eval_loss'], res_train['precision'],
                         res_train['recall'], res_train['f1_score']))
            #print("On validation data:", result)
            print("valid loss: {:.3f}; prec/recall/f1: {:.3f}/{:.3f}/{:.3f}".
                  format(res_val['eval_loss'], res_val['precision'],
                         res_val['recall'], res_val['f1_score']))
            print(
                "Summary. Loss (train/val): {:.3f}/{:.3f}, F1: {:.3f}/{:.3f}".
                format(res_train['eval_loss'], res_val['eval_loss'],
                       res_train['f1_score'], res_val['f1_score']))
        else:
            raise Exception("dataset is None")

        print("model_info:", self.model_info)

        return res_val

    def test(self):
        sentence_id = self.dataset['test']['sentence_id']
        words = self.dataset['test']['words']
        labels = self.dataset['test']['labels']

        prev_id = 0
        s_words = []
        s_labels = []
        samples = []

        for i in range(len(sentence_id)):
            s_id = sentence_id[i]
            word = words[i]
            label = labels[i]

            if s_id != prev_id:
                sentence = " ".join(s_words)
                #print("sentence id={}: {}".format(prev_id, sentence))
                samples.append({
                    'text': sentence,
                    'tokens': s_words,
                    'labels': s_labels
                })
                #print("s_labels: {}".format(s_labels))
                s_words = []
                s_labels = []
                prev_id = s_id

            s_words.append(words[i])
            s_labels.append(labels[i])
            #print("i={}, word={}, label={}".format(s_id, word, label))

        sentence = " ".join(s_words)
        #print("sentence id={}: {}".format(prev_id, sentence))
        samples.append({
            'text': sentence,
            'tokens': s_words,
            'labels': s_labels
        })

        texts = [sample['text'] for sample in samples]
        predictions, raw_outputs = self.model.predict(texts)
        #print(predictions)

        acc_list = []
        success_list = []

        # More detailed predictions
        for i, (preds, raw_outs) in enumerate(zip(predictions, raw_outputs)):
            print()
            print("text: ", texts[i])
            #print("\npreds: ", preds)
            pred_labels = [list(t.values())[0] for t in preds]
            print("pred_labels: ", pred_labels)
            true_labels = samples[i]['labels']
            print("true_labels: ", true_labels)
            #print("raw_outs: ", raw_outs)

            if len(true_labels) != len(pred_labels):
                raise Exception("len(true_labels) != len(pred_labels)")
            comp = [
                true_labels[i] == pred_labels[i]
                for i in range(len(pred_labels))
            ]
            acc1sentence = np.mean(comp)
            print("acc={:.3f}".format(acc1sentence))
            acc_list.append(acc1sentence)
            success = 1 if acc1sentence == 1.0 else 0
            success_list.append(success)

        avg_acc = np.mean(acc_list)
        avg_success = np.mean(success_list)

        return {'avg_acc': avg_acc, 'avg_success': avg_success}

        #for pred, out in zip(preds, outs):
        #print("pred:", pred)
        #print("out:", out)
        #key = list(pred.keys())[0]
        #new_out = out[key]
        #preds = list(softmax(np.mean(new_out, axis=0)))
        #print(key, pred[key], preds[np.argmax(preds)], preds)

    def simple_test(self):
        # Predictions on arbitrary text strings
        sentences = ["Some arbitrary sentence", "Simple Transformers sentence"]
        predictions, raw_outputs = self.model.predict(sentences)
        print(predictions)

        # More detailed predictions
        for n, (preds, outs) in enumerate(zip(predictions, raw_outputs)):
            print("\n___________________________")
            print("Sentence: ", sentences[n])
            for pred, out in zip(preds, outs):
                key = list(pred.keys())[0]
                new_out = out[key]
                preds = list(softmax(np.mean(new_out, axis=0)))
                print(key, pred[key], preds[np.argmax(preds)], preds)

    def predict(self, sentences):
        predictions, raw_outputs = self.model.predict(sentences)
        #tokenized_sentences = [self.tokenizer.tokenize(sentence) for sentence in sentences]
        #predictions, raw_outputs = self.model.predict(tokenized_sentences, split_on_space=False)
        return predictions

    def raw_predict(self, sentences):
        predictions, raw_outputs = self.model.predict(sentences)
        #print("raw_outputs:", raw_outputs)
        #print(self.model.args.labels_list)
        labels_list = self.model.args.labels_list
        confidences = [
            calc_confidence(raw_output, labels_list)
            for raw_output in raw_outputs
        ]
        #print("confidence:", confidence)
        return {
            'predictions': predictions,
            'raw_outputs': raw_outputs,
            'confidences': confidences
        }
        """
Example #14

model_args = NERArgs(
                    labels_list = ['B','I', 'O'],
                    manual_seed = 2021,
                    num_train_epochs = 4,
                    max_seq_length = 512,
                    use_early_stopping = True,
                    overwrite_output_dir = True,
                    train_batch_size = 8
                    )

model = NERModel(
    "roberta", "roberta-base", args=model_args, weight=[2.52853895, 12.18978944, 0.39643544], use_cuda=True, cuda_device=-1
)

model.train_model(train_df)

result, model_outputs, wrong_predictions = model.eval_model(
    eval_df
)

terms = []
preds = []
for lines_ in lines:
    sentences = [[token.text for token in nlp1(line.strip())] for line in lines_]
    predictions, raw_outputs = model.predict(sentences, split_on_space=False)
    preds.extend(predictions)

with open('ann_weighted_roberta.pkl', 'wb') as f:
    pkl.dump(preds, f)
    [1, "perform", "O"],
    [1, "NER", "B-MISC"],
]
eval_df = pd.DataFrame(eval_data, columns=["sentence_id", "words", "labels"])

# Create a NERModel
model = NERModel("bert", "bert-base-cased", args={"overwrite_output_dir": True, "reprocess_input_data": True})

# # Train the model
# model.train_model(train_df)

# # Evaluate the model
# result, model_outputs, predictions = model.eval_model(eval_df)


# Predictions on arbitrary text strings
sentences = ["Some arbitrary sentence", "Simple Transformers sentence"]
predictions, raw_outputs = model.predict(sentences)

print(predictions)

# More detailed predictions
for n, (preds, outs) in enumerate(zip(predictions, raw_outputs)):
    print("\n___________________________")
    print("Sentence: ", sentences[n])
    for pred, out in zip(preds, outs):
        key = list(pred.keys())[0]
        new_out = out[key]
        preds = list(softmax(np.mean(new_out, axis=0)))
        print(key, pred[key], preds[np.argmax(preds)], preds)
Example #16
    [0, "text", "O"],
    [0, "classification", "B-MISC"],
    [1, "Simple", "B-MISC"],
    [1, "Transformers", "I-MISC"],
    [1, "then", "O"],
    [1, "expanded", "O"],
    [1, "to", "O"],
    [1, "perform", "O"],
    [1, "NER", "B-MISC"],
]
eval_df = pd.DataFrame(eval_data, columns=["sentence_id", "words", "labels"])

# Create a NERModel
model = NERModel("bert",
                 "bert-base-cased",
                 args={
                     "overwrite_output_dir": True,
                     "reprocess_input_data": True
                 })

# Train the model
model.train_model(train_df)

# Evaluate the model
result, model_outputs, predictions = model.eval_model(eval_df)

# Predictions on arbitrary text strings
predictions, raw_outputs = model.predict(["Some arbitrary sentence"])

print(predictions)
Example #17
def main(sentence):
    """Predicts NER labels""" 
    model = NERModel('bert', 'outputs/', use_cuda=False) #
    predictions, raw_outputs = model.predict([sentence])
    print(predictions)