def run_pipeline_test(self, model, tokenizer):
    """Check the output structure of a text-classification pipeline.

    Builds a ``TextClassificationPipeline`` from ``model`` and ``tokenizer``
    and verifies, for a single string and for a list of two strings, that
    each result is a ``{"label": str, "score": float}`` dict whose label is
    one of the values of ``model.config.id2label``.
    """
    text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
    known_labels = model.config.id2label.values()

    # Small inputs because BartTokenizer tiny has maximum position embeddings = 22
    valid_inputs = "HuggingFace is in"
    outputs = text_classifier(valid_inputs)
    self.assertEqual(
        nested_simplify(outputs),
        [{"label": ANY(str), "score": ANY(float)}],
    )
    # assertIn reports the unexpected label on failure, unlike assertTrue.
    self.assertIn(outputs[0]["label"], known_labels)

    valid_inputs = ["HuggingFace is in ", "Paris is in France"]
    outputs = text_classifier(valid_inputs)
    self.assertEqual(
        nested_simplify(outputs),
        [
            {"label": ANY(str), "score": ANY(float)},
            {"label": ANY(str), "score": ANY(float)},
        ],
    )
    self.assertIn(outputs[0]["label"], known_labels)
    self.assertIn(outputs[1]["label"], known_labels)
def predict_fn(input_data, model):
    """Classify ``input_data`` with the model/tokenizer pair held in ``model``.

    ``model`` is unpacked into exactly two items (trained model, tokenizer).
    A new ``TextClassificationPipeline`` is built on every call and its
    output is returned unchanged. The call and the input are logged at INFO.
    """
    logger.info('***** PREDICT_FN ********')
    classifier_model, classifier_tokenizer = model
    classify = TextClassificationPipeline(
        model=classifier_model,
        tokenizer=classifier_tokenizer,
    )
    logger.info('***** TEXT INPUT : %s', input_data)
    return classify(input_data)
def __init__(self,
             model_type="DISTILBERT",
             model_name="distilbert-base-uncased",
             num_labels: int = 2,
             load_path: str = ""):
    """Set up the sequence-classification model, pipeline and trainer.

    The config is always read from ``model_name`` with ``num_labels`` labels.
    Weights come from ``load_path`` unless it is the empty string, in which
    case they come from ``model_name``.
    """
    self.adaptor = get_adaptor(model_type)
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)

    # An empty load_path means "no local checkpoint": use the hub name.
    weights_source = model_name if load_path == "" else load_path
    model = AutoModelForSequenceClassification.from_pretrained(
        weights_source, config=config)

    super().__init__(model_type, model_name, model)

    self._pipeline = TextClassificationPipeline(
        model=self.model,
        tokenizer=self.tokenizer,
        device=detect_cuda_device_number(),
    )
    self._trainer = TCTrainer(
        self.model,
        self.model_type,
        self.tokenizer,
        self._device,
        self.logger,
    )
def simple_inference():
    '''
    Run three sample sentences through a TextClassificationPipeline on CPU.

    The tokenizer and model are loaded from "./model_out/". For every sentence
    the elapsed time since the previous measurement and the pipeline result
    are printed. Only the pipeline's own result is shown, not the distribution
    over all sentiments.
    :return:
    '''
    tokenizer = DistilBertTokenizer.from_pretrained("./model_out/")
    model = DistilBertForSequenceClassification.from_pretrained("./model_out/")
    model.to('cpu')
    sentiment_classifier = TextClassificationPipeline(model=model,
                                                      tokenizer=tokenizer,
                                                      device=-1)

    samples = (
        "this is so cute!",
        "That's so disgusting!",
        "this is a simple test.",
    )
    previous = time.time()
    for sample in samples:
        result = sentiment_classifier(sample)
        now = time.time()
        print(now - previous, result)
        # The next interval starts at the end of this inference.
        previous = now
def classify_sentiment():
    """Classify the sentence carried in the current request body.

    The request body is decoded as UTF-8 JSON and its ``"sentence"`` entry is
    coerced to ``str``. The pipeline is placed on device 0 when
    ``config.use_cuda`` is truthy and on -1 otherwise. The pipeline result is
    returned as its ``str()`` representation.
    """
    payload = json.loads(request.data.decode('utf-8'))
    sentence = str(payload["sentence"])

    if config.use_cuda:
        device = 0
    else:
        device = -1
    classifier = TextClassificationPipeline(model=model,
                                            tokenizer=tokenizer,
                                            device=device)
    return str(classifier(sentence))
def sentiment_model(text: str):
    """Return the ``(label, score)`` of the first pipeline result for ``text``.

    Loads the ``distilbert-base-uncased-finetuned-sst-2-english`` tokenizer
    and model, wraps them in a ``TextClassificationPipeline`` and runs it on
    ``text``.

    Args:
        text: The text to classify.

    Returns:
        A tuple ``(label, score)`` taken from the first entry of the
        pipeline output.
    """
    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    classifier = TextClassificationPipeline(model=model,
                                            tokenizer=tokenizer,
                                            task="sentiment-analysis")
    # Run inference once; the previous code ran the model twice on the same
    # text, once for the label and once for the score.
    top = classifier(text)[0]
    return top['label'], top['score']
def test_unbatch_attentions_hidden_states(self):
    """Batched inference must work when the model also returns hidden states and attentions."""
    checkpoint = "hf-internal-testing/tiny-random-distilbert"
    model = DistilBertForSequenceClassification.from_pretrained(
        checkpoint,
        output_hidden_states=True,
        output_attentions=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)

    # Used to throw an error because `hidden_states` are a tuple of tensors
    # instead of the expected tensor.
    sentences = ["This is great !"] * 20
    outputs = text_classifier(sentences, batch_size=32)
    self.assertEqual(len(outputs), 20)
def __init__(self, args: Namespace):
    """Prepare a training run from parsed command-line arguments.

    Creates the output directory, loads the pipeline for ``args.task`` (only
    ``"text_classification"`` is implemented; ``"token_classification"`` and
    ``"question_answering"`` raise ``NotImplementedError``), loads the
    training CSV and, when ``args.validation_data`` is set, the validation
    CSV, then copies the optimisation hyper-parameters onto the instance.
    """
    self.logger = getLogger("transformers-cli/training")
    self.framework = "tf" if is_tf_available() else "torch"

    os.makedirs(args.output, exist_ok=True)
    assert os.path.isdir(args.output)
    self.output = args.output

    self.column_label = args.column_label
    self.column_text = args.column_text
    self.column_id = args.column_id

    self.logger.info(f"Loading {args.task} pipeline for {args.model}")
    if args.task == "text_classification":
        self.pipeline = TextClassificationPipeline.from_pretrained(args.model)
    elif args.task in ("token_classification", "question_answering"):
        raise NotImplementedError

    self.logger.info(f"Loading dataset from {args.train_data}")
    self.train_dataset = Processor.create_from_csv(
        args.train_data,
        column_label=args.column_label,
        column_text=args.column_text,
        column_id=args.column_id,
        skip_first_row=args.skip_first_row,
    )

    self.valid_dataset = None
    if args.validation_data:
        self.logger.info(
            f"Loading validation dataset from {args.validation_data}")
        # Same column layout as the training file.
        self.valid_dataset = Processor.create_from_csv(
            args.validation_data,
            column_label=args.column_label,
            column_text=args.column_text,
            column_id=args.column_id,
            skip_first_row=args.skip_first_row,
        )

    self.validation_split = args.validation_split
    self.train_batch_size = args.train_batch_size
    self.valid_batch_size = args.valid_batch_size
    self.learning_rate = args.learning_rate
    self.adam_epsilon = args.adam_epsilon
def __init__(
    self,
    model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection",
):
    """
    Load the classification model and tokenizer and wrap them in a pipeline.

    :param model_name_or_path: Transformer based fine tuned mini bert model for query classification
    """
    # save init parameters to enable export of component config as YAML
    self.set_config(model_name_or_path=model_name_or_path)

    classifier_model = AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path)
    classifier_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    self.query_classification_pipeline = TextClassificationPipeline(
        model=classifier_model,
        tokenizer=classifier_tokenizer,
    )
def main():
    """Rebuild the ProtBert-based classifier, load its weights and score one sequence.

    The network is the ``Rostlab/prot_bert_bfd`` encoder followed by global
    max pooling, batch normalisation and dense layers ending in a 6-way
    softmax. Weights are read from ``./checkpoints/mini_test2/tf_model.h5``.

    NOTE(review): the original code ended with ``print(pipeline(seq))`` while
    the construction of ``pipeline`` was commented out, so that call could
    not work. The sequence is now tokenized and passed to the Keras model
    directly; confirm this matches the intended inference path.
    """
    encoder = TFAutoModel.from_pretrained('Rostlab/prot_bert_bfd')

    input_ids = tf.keras.layers.Input(shape=(MAX_LEN, ),
                                      name='input_ids',
                                      dtype='int64')
    mask = tf.keras.layers.Input(shape=(MAX_LEN, ),
                                 name='attention_mask',
                                 dtype='int64')

    embeddings = encoder(input_ids, attention_mask=mask)[0]
    X = tf.keras.layers.GlobalMaxPooling1D()(embeddings)
    X = tf.keras.layers.BatchNormalization()(X)
    X = tf.keras.layers.Dense(64, activation='relu')(X)
    X = tf.keras.layers.Dropout(0.1)(X)
    X = tf.keras.layers.Dense(16, activation='relu')(X)
    y = tf.keras.layers.Dense(6, activation='softmax', name='outputs')(X)

    # Kept under a separate name so the encoder is not shadowed.
    classifier = tf.keras.Model(inputs=[input_ids, mask], outputs=[y])
    classifier.load_weights('./checkpoints/mini_test2/tf_model.h5')

    # TODO: load the tokenizer and the pretrained model from (checkpoints directory)
    tokenizer = AutoTokenizer.from_pretrained(
        'Rostlab/prot_bert_bfd',
        do_lower_case=False,
    )

    seq = (
        'M E N H S K Q T E A P H P G T Y M P A G Y P P P Y P P A A F Q G P S D H A A Y P I P Q A G Y Q G P P G P Y P G P Q P G Y P V P P G G Y A G G '
        'G P S G F P V Q N Q P A Y N H P G G P G G T P W M P A P P P P L N C P P G L E Y L A Q I D Q L L V H Q Q I E L L E V L T G F E T N N K Y E I '
        'K N S L G Q R V Y F A V E D T D C C T R N C C G A S R P F T L R I L D N L G R E V M T L E R P L R C S S C C F P C C L Q E I E I Q A P P G V '
        'P V G Y V T Q T W H P C L P K F T L Q N E K K Q D V L K V V G P C V V C S C C S D I D F E L K S L D E E S V V G K I S K Q W S G F V R E A F '
        'T D A D N F G I Q F P L D L D V K M K A V M L G A C F L I D F M F F E R T G N E E Q R S G A W Q '
    )

    # Pad/truncate to MAX_LEN so the arrays match the fixed-size input layers.
    encoded = tokenizer(seq,
                        padding='max_length',
                        truncation=True,
                        max_length=MAX_LEN,
                        return_tensors='np')
    model_inputs = {
        'input_ids': encoded['input_ids'].astype('int64'),
        'attention_mask': encoded['attention_mask'].astype('int64'),
    }
    print(classifier.predict(model_inputs))
def __init__(self,
             model_type="BERT",
             model_name="bert-base-multilingual-cased",
             num_labels=3):
    """Set up the sequence-classification model, its pipeline and the ABSA trainer.

    Config and weights are both loaded from ``model_name``; the config is
    created with ``num_labels`` labels.
    """
    self.adaptor = get_adaptor(model_type)

    model_config = AutoConfig.from_pretrained(model_name,
                                              num_labels=num_labels)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name, config=model_config)
    super().__init__(model_type, model_name, classifier)

    self._pipeline = TextClassificationPipeline(
        model=self.model,
        tokenizer=self.tokenizer,
        device=detect_cuda_device_number(),
    )
    self._trainer = ABSATrainer(
        self.model,
        self.model_type,
        self.tokenizer,
        self._device,
        self.logger,
    )
def __init__(self,
             model_type="DISTILBERT",
             model_name="distilbert-base-uncased",
             num_labels=2):
    """Load model and tokenizer through the adaptor, then build pipeline and trainer.

    The pipeline is built from the freshly loaded ``model``/``tokenizer``
    objects, while the trainer receives ``self._model`` and
    ``self._tokenizer`` as set by the parent initialiser.
    """
    self.adaptor = get_adaptor(model_type)

    model_config = AutoConfig.from_pretrained(model_name,
                                              num_labels=num_labels)
    model = self.adaptor.SequenceClassification.from_pretrained(
        model_name, config=model_config)
    tokenizer = self.adaptor.Tokenizer.from_pretrained(model_name)
    super().__init__(model_type, model_name, model, tokenizer)

    self._pipeline = TextClassificationPipeline(
        model=model,
        tokenizer=tokenizer,
        device=detect_cuda_device_number(),
    )
    self._trainer = TCTrainer(
        self._model,
        self.model_type,
        self._tokenizer,
        self._device,
        self.logger,
    )
2: 1, 3: 2, 4: 3, 5: 4 }) tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') if num_gpus >= 1: inference_device = 0 # GPU 0 else: inference_device = -1 # CPU print('inference_device {}'.format(inference_device)) inference_pipeline = TextClassificationPipeline(model=loaded_model, tokenizer=tokenizer, framework='tf', device=inference_device) print("""I loved it! I will recommend this to everyone.""", inference_pipeline("""I loved it! I will recommend this to everyone.""")) print("""It's OK.""", inference_pipeline("""It's OK.""")) print("""Really bad. I hope they don't make this anymore.""", inference_pipeline("""Really bad. I hope they don't make this anymore.""")) import csv df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', delimiter='\t', quoting=csv.QUOTE_NONE, compression='gzip')[['review_body', 'star_rating']] df_test_reviews = df_test_reviews.sample(n=100) df_test_reviews.shape
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    TextClassificationPipeline,
)

# Tokenizer and model are both read from the same local directory.
MODEL_DIR = "./model_out/"

tokenizer = DistilBertTokenizer.from_pretrained(MODEL_DIR)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_DIR)
sentiment_classifier = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
)

# Smoke test: classify one sentence and show the raw pipeline output.
result = sentiment_classifier("this is so cute!")
print(result)
df = pd.read_csv(os.path.join(_DATA_PATH, _DATA_FILE)) index_list = df.index.tolist() n_rows = len(index_list) batches = [ index_list[i:i + _MAX_NUMBER_ROWS] for i in range(0, n_rows, _MAX_NUMBER_ROWS) ] # DOWNLOAD DE MODEL AND TOKENIZER FROM HUGGINGFACE model = BertForSequenceClassification.from_pretrained( pretrained_model_name_or_path=_HUGGINGFACE_MODEL) tokenizer = BertTokenizer.from_pretrained( pretrained_model_name_or_path=_HUGGINGFACE_MODEL) # CREATE THE CLASSIFIER sentiment_analyzer = TextClassificationPipeline(model=model, tokenizer=tokenizer) # INFER SENTIMENTS for i in range(len(batches)): # get a portion of data df_i = df.loc[batches[i]] # create the filename filename = os.path.join( _DATA_PATH_NEW, _DATA_FILE_NEW.format( n=str(i).rjust(len(str(len(batches))), '0'))) if not os.path.exists(filename): try: # extract the text clean, but is going to be cleaned again by the tokenizer text = list(df_i['text_clean'].astype(str).values) # infer sentiments
args = parser.parse_args()

print(f"Loading data from {args.source_file}")
data_df = pd.read_csv(args.source_file, index_col=0)
print(data_df.tail())

# Load the tokenizer and the checkpoint.
print(f"- Loading tokenizer {args.tokenizer}.")
print(f"- Loading checkpoint {args.checkpoint}.")
tokenizer = BertTokenizerFast.from_pretrained(args.tokenizer)
model = BertForSequenceClassification.from_pretrained(args.checkpoint)

# Prepare the inference pipeline.
print("- Preparing inference pipeline.")
pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    # batch_size=args.bs,  # TODO: does not work with batch_size; update the transformers library eventually.
    device=args.device,
)

print("- Running inference:")
texts = data_df[args.text_column].values.tolist()
preds_dict = pipeline(texts)

# Extract the predicted label from each returned dictionary.
preds = []
for example in preds_dict:
    preds.append(example["label"])
data_df["predictions"] = preds

# Save the results; this overwrites the source file with the added column.
print(f"- Saving results to {args.source_file}.")
data_df.to_csv(f"{args.source_file}")
def get_test_pipeline(self, model, tokenizer, feature_extractor):
    """Return a text-classification pipeline together with two example inputs.

    ``feature_extractor`` is accepted but not used.
    """
    example_inputs = ["HuggingFace is in", "This is another test"]
    return (
        TextClassificationPipeline(model=model, tokenizer=tokenizer),
        example_inputs,
    )
def _bert_class_scores(pipeline, text):
    """Return the four per-class BERT scores for ``text`` with a leading batch axis."""
    # Truncates text so that the model will run.
    pred = pipeline(text[:2000])
    scores = [pred[0][i]["score"] for i in range(4)]
    return np.expand_dims(np.array(scores), axis=0)


def _fetch_url_text(raw_url):
    """Return ``(loaded, text)`` for ``raw_url``, reporting a parse failure in the UI."""
    try:
        return True, get_text(raw_url)
    except Exception:
        # Best effort: any extraction failure is shown to the user instead
        # of crashing the app.
        st.error("Cannot parse url")
        return False, None


def _read_user_text(text_key, url_key):
    """Render the input-method widgets and return ``(ready, text)``.

    ``ready`` is False while the widget still holds its "Type Here"
    placeholder or when the URL could not be parsed.
    """
    user_input_type = st.selectbox("Select a method to input data",
                                   ['Text Box', 'URL'])
    if user_input_type == 'Text Box':
        text = st.text_area("Enter Text", "Type Here", key=text_key)
        return text != 'Type Here', text
    if user_input_type == 'URL':
        raw_url = st.text_input("Enter URL", "Type Here", key=url_key)
        if raw_url != 'Type Here':
            return _fetch_url_text(raw_url)
    return False, None


def _render_prediction_tab(tokenizer, model_lstm, pipeline):
    """Render the "Fake News Prediction" tab."""
    st.subheader("Prediction on article")
    st.markdown(Path("predTab.md").read_text())
    ready, text = _read_user_text("predText", "predURL")
    if not ready:
        return

    model_selected = st.selectbox("Select a model",
                                  ['Baseline LSTM', 'Fine-tuned BERT'])
    if model_selected == 'Baseline LSTM':
        if st.button("Analyze"):
            X = preprocess_text(tokenizer, [text])
            displayPredictedDf(model_lstm.predict(X))
    elif model_selected == 'Fine-tuned BERT':
        if st.button("Analyze"):
            displayPredictedDf(_bert_class_scores(pipeline, text))


def _match_claims(text, claim_model_bert, class2label):
    """Return a DataFrame of training claims similar to the sentences of ``text``.

    For every sentence the three most similar training claims are
    considered; a claim is kept when its rounded cosine similarity exceeds
    0.8 and its label is not -1.
    """
    # NOTE(review): the original comment said "Take first 20 sentences", but
    # no truncation was ever applied; all sentences are processed.
    sentences = sentence_splitter(text)
    embedding_data = load_embedded_data("healthFactTrainData.pkl")

    matched_claims = []
    for sent in sentences:
        claim_embed = claim_model_bert.encode(sent)
        sim_scores = cosine_similarity(embedding_data['claim_embedding'],
                                       claim_embed.reshape(1, -1))
        top = np.flip(np.argsort(sim_scores.flatten()))[:3]
        for idx in top:
            score = round(sim_scores[idx].item(), 3)
            if score > 0.8 and embedding_data['label'][idx] != -1:
                matched_claims.append([
                    score,
                    sent,
                    embedding_data['claim'][idx],
                    class2label[embedding_data['label'][idx]],
                ])

    df = pd.DataFrame(matched_claims,
                      columns=[
                          'Similarity Score',
                          'Trigger Sentence in Article',
                          'Claim in Training Data',
                          'Claim Label',
                      ])
    return df.sort_values(by=['Similarity Score'], ascending=False)


def _render_similarity_tab(claim_model_bert, class2label):
    """Render the "Similarity Matching" tab."""
    st.subheader("Similar Claim Finder")
    st.markdown(Path("simTab.md").read_text())
    ready, text = _read_user_text("simText", "simURL")
    if not ready:
        return

    model_selected = st.selectbox("Select a model", ['BERT Similarity'])
    if model_selected == 'BERT Similarity':
        if st.button("Analyze Claim"):
            df = _match_claims(text, claim_model_bert, class2label)
            # Blank index column so the table is shown without row numbers.
            st.table(df.assign(hack='').set_index('hack'))


def _render_split_statistics(lower_split, upper_split, tokenizer, model_lstm):
    """Show LSTM accuracy statistics for the two halves of a split on demand."""
    if st.button("Generate Split Statistics"):
        accDf, totalaccuracyDF = generateAccDfLSTM(lower_split,
                                                   upper_split,
                                                   tokenizer,
                                                   model_lstm,
                                                   sampleSize=SAMPLE_SIZE)
        st.write(totalaccuracyDF)
        st.write(accDf)


def _render_testing_tab(testing_data, tokenizer, model_lstm):
    """Render the "Testing" tab: slice the test set on one feature and compare halves."""
    st.subheader("Prediction on testing data")
    st.markdown(Path("testTab.md").read_text())
    model_selected = st.selectbox("Select a model", ['Baseline LSTM'])
    if model_selected != 'Baseline LSTM':
        return

    st.write("{0} Accuracy on whole Test Dataset: 66%".format(model_selected))
    user_input = st.selectbox("Slicing Type", [
        "Word Count",
        "Year Published",
        "Average Sentence Length",
        "Percentage Punctuation",
    ])

    if user_input == "Word Count":
        column = "word_counts"
        threshold = st.slider(label="Word Count Split",
                              min_value=200,
                              max_value=1300,
                              step=1)
        data = testing_data
    elif user_input == "Year Published":
        column = "year_published"
        threshold = st.slider(label="Year Published Split",
                              min_value=2010,
                              max_value=2019,
                              step=1)
        # Drop rows whose year is recorded as 0.
        data = testing_data[testing_data[column] != 0]
    elif user_input == "Average Sentence Length":
        column = "average_sentence_length"
        threshold = st.slider(label="Average Sentence Length Split",
                              min_value=101,
                              max_value=199,
                              step=1)
        # Drop rows whose average sentence length is recorded as 0.
        data = testing_data[testing_data[column] != 0]
    elif user_input == "Percentage Punctuation":
        column = "percentage_punc_to_word"
        threshold = st.slider(label="Percentage Punctuation Split",
                              min_value=.12,
                              max_value=.8,
                              step=.01)
        # Drop rows whose punctuation ratio is recorded as 1.
        data = testing_data[testing_data[column] != 1]
    else:
        return

    # The "Year Published" branch used to pass the lower split twice; every
    # slicing type now compares the lower half against the upper half.
    lower_split = data[data[column] <= threshold]
    upper_split = data[data[column] > threshold]
    _render_split_statistics(lower_split, upper_split, tokenizer, model_lstm)


def main():
    """Run the Streamlit "Public Health Fake News Analyzer" app.

    Loads the fine-tuned BERT pipeline, the baseline LSTM with its tokenizer,
    the sentence-similarity model and the testing data, then renders the page
    selected in the sidebar: About, Fake News Prediction, Similarity Matching
    or Testing.
    """
    bert_tokenizer = DistilBertTokenizer.from_pretrained('bert_tokenizer')
    bert_fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(
        'bert_fine_tuned')
    pipeline = TextClassificationPipeline(model=bert_fine_tuned_model,
                                          tokenizer=bert_tokenizer,
                                          return_all_scores=True)

    # Loading in lstm tokenizer and model
    tokenizer = load_tokenizer('tokenizer.pkl')
    model_lstm = keras.models.load_model('lstm_token300_dim32_softmax.h5')

    # Loading in sentence similarity embedding model
    claim_model_bert = torch.load("claim_model_bert")

    # Dictionary relating number label to written label
    class2label = {0: 'False', 1: 'Mixture', 2: 'True', 3: 'Unproven'}

    # Loading in the testing data and removing erroneous results
    testing_data = pd.read_csv('TestDataWithSlicingDF')
    testing_data = testing_data[testing_data["label"] != -1]

    st.title("Public Health Fake News Analyzer")
    options = st.sidebar.selectbox(
        "Choose a page",
        ["About", "Fake News Prediction", "Similarity Matching", "Testing"])

    if options == "About":
        st.write("Creators: Alex Gui, Vivek Lam and Sathya Chitturi")
        st.write("Dataset Source: https://arxiv.org/abs/2010.09926")
        st.write(
            " Due to the nature and popularity of social networking sites, misinformation can propagate rapidly leading to widespread dissemination of misleading and even harmful information. A plethora of misinformation can make it hard for the public to understand what claims hold merit and which are baseless. This machine learning tool allows users to quickly learn whether an article contains fake news. The tabs in this website correspond to predictive fake news detection, user claim similarity matching, and model performance evaluation."
        )
        st.error(
            "Disclaimer: This project is still a work in progress. The best source of information regarding fake news comes directly from verified fact-checkers."
        )
    elif options == "Fake News Prediction":
        _render_prediction_tab(tokenizer, model_lstm, pipeline)
    elif options == "Similarity Matching":
        _render_similarity_tab(claim_model_bert, class2label)
    elif options == "Testing":
        _render_testing_tab(testing_data, tokenizer, model_lstm)
def __init__(self, name_model: str = SENTIMENT_MODEL):
    """Build ``self.pipe``, a text-classification pipeline for ``name_model``.

    Both the tokenizer and the sequence-classification model are loaded from
    ``name_model``.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name_model)
    loaded_model = AutoModelForSequenceClassification.from_pretrained(name_model)
    self.pipe = TextClassificationPipeline(
        model=loaded_model,
        tokenizer=loaded_tokenizer,
    )
def upload_model_to_s3(file, bucket):
    """Upload a local file, or every file under a local directory, to S3.

    Args:
        file: Path of a local file or directory. A directory is walked
            recursively and each contained file is uploaded.
        bucket: Either a plain bucket name or an ``s3://bucket/prefix`` URI.
            With a URI, the prefix is prepended to every object key.

    Object keys are the local paths. For a single file and a plain bucket
    name the key is exactly ``file``, as before.
    """
    s3 = boto3.client('s3')

    bucket_name, key_prefix = bucket, ''
    if bucket.startswith('s3://'):
        # boto3 expects a bare bucket name; split the URI into name + prefix.
        bucket_name, _, key_prefix = bucket[len('s3://'):].partition('/')
        key_prefix = key_prefix.strip('/')

    if os.path.isdir(file):
        # open() cannot read a directory, so upload its files one by one.
        local_paths = [
            os.path.join(root, name)
            for root, _, names in os.walk(file)
            for name in names
        ]
    else:
        local_paths = [file]

    for local_path in local_paths:
        key = local_path
        if key_prefix or os.path.isdir(file):
            key = local_path.replace(os.sep, '/')
        if key_prefix:
            key = '{}/{}'.format(key_prefix, key.lstrip('/'))
        with open(local_path, "rb") as f:
            s3.upload_fileobj(f, bucket_name, key)

    print('Model file {} uploaded to {}'.format(file, bucket))


if __name__ == '__main__':
    # Fine-tunes DistilBERT for sequence classification with TensorFlow,
    # saves the result in transformers and SavedModel formats, uploads both
    # to S3 and optionally evaluates on sample reviews.
    train_data = 's3://sagemaker-us-west-2-231218423789/training-pipeline-2020-09-05-16-19-31/processing/output/bert-train'
    test_data = 's3://sagemaker-us-west-2-231218423789/training-pipeline-2020-09-05-16-19-31/processing/output/bert-test'
    validation_data = 's3://sagemaker-us-west-2-231218423789/training-pipeline-2020-09-05-16-19-31/processing/output/bert-validation'
    model_dir = 'opt/ml/model'
    output_dir = 's3://sagemaker-us-west-2-231218423789/dlc/output'
    use_xla = False
    use_amp = False
    max_seq_length = 64
    train_batch_size = 64
    validation_batch_size = 64
    test_batch_size = 64
    epochs = 1
    learning_rate = 0.00003
    epsilon = 0.00000001
    train_steps_per_epoch = 50
    validation_steps = 10
    test_steps = 10
    freeze_bert_layer = True
    run_validation = False
    run_test = False
    run_sample_predictions = False

    # Model Output
    transformer_fine_tuned_model_path = os.path.join(model_dir, 'transformers/fine-tuned/')
    os.makedirs(transformer_fine_tuned_model_path, exist_ok=True)

    # SavedModel Output
    tensorflow_saved_model_path = os.path.join(model_dir, 'tensorflow/saved_model/0')
    os.makedirs(tensorflow_saved_model_path, exist_ok=True)

    distributed_strategy = tf.distribute.MirroredStrategy()
    # NOTE(review): the source lost its indentation; the strategy scope is
    # assumed to cover model creation through saving. Confirm.
    with distributed_strategy.scope():
        tf.config.optimizer.set_jit(use_xla)
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp})

        train_data_filenames = glob(os.path.join(train_data, '*.tfrecord'))
        print('train_data_filenames {}'.format(train_data_filenames))
        train_dataset = file_based_input_dataset_builder(
            channel='train',
            input_filenames=train_data_filenames,
            is_training=True,
            drop_remainder=False,
            batch_size=train_batch_size,
            epochs=epochs,
            steps_per_epoch=train_steps_per_epoch,
            max_seq_length=max_seq_length).map(select_data_and_label_from_record)

        tokenizer = None
        config = None
        model = None

        # This is required when launching many instances at once... the
        # urllib request seems to get denied periodically
        successful_download = False
        retries = 0
        while retries < 5 and not successful_download:
            try:
                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
                config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
                                                          num_labels=len(CLASSES))
                model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                                              config=config)
                successful_download = True
                print('Sucessfully downloaded after {} retries.'.format(retries))
            except Exception:
                # Narrowed from a bare except so Ctrl-C and SystemExit still
                # stop the script.
                retries = retries + 1
                random_sleep = random.randint(1, 30)
                print('Retry #{}. Sleeping for {} seconds'.format(retries, random_sleep))
                time.sleep(random_sleep)

        callbacks = []
        initial_epoch_number = 0

        if tokenizer is None or model is None or config is None:
            print('Not properly initialized...')
            # Previously execution continued and failed later on None.
            raise RuntimeError('Could not download the tokenizer/config/model after {} retries'.format(retries))

        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
        print('** use_amp {}'.format(use_amp))
        if use_amp:
            # loss scaling is currently required when using mixed precision
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')

        print('*** OPTIMIZER {} ***'.format(optimizer))

        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

        model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
        print('Compiled model {}'.format(model))
        model.layers[0].trainable = not freeze_bert_layer
        print(model.summary())

        if run_validation:
            validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord'))
            print('validation_data_filenames {}'.format(validation_data_filenames))
            validation_dataset = file_based_input_dataset_builder(
                channel='validation',
                input_filenames=validation_data_filenames,
                is_training=False,
                drop_remainder=False,
                batch_size=validation_batch_size,
                epochs=epochs,
                steps_per_epoch=validation_steps,
                max_seq_length=max_seq_length).map(select_data_and_label_from_record)

            print('Starting Training and Validation...')
            validation_dataset = validation_dataset.take(validation_steps)
            train_and_validation_history = model.fit(train_dataset,
                                                     shuffle=True,
                                                     epochs=epochs,
                                                     initial_epoch=initial_epoch_number,
                                                     steps_per_epoch=train_steps_per_epoch,
                                                     validation_data=validation_dataset,
                                                     validation_steps=validation_steps,
                                                     callbacks=callbacks)
            print(train_and_validation_history)
        else:  # Not running validation
            print('Starting Training (Without Validation)...')
            train_history = model.fit(train_dataset,
                                      shuffle=True,
                                      epochs=epochs,
                                      initial_epoch=initial_epoch_number,
                                      steps_per_epoch=train_steps_per_epoch,
                                      callbacks=callbacks)
            print(train_history)

        if run_test:
            test_data_filenames = glob(os.path.join(test_data, '*.tfrecord'))
            print('test_data_filenames {}'.format(test_data_filenames))
            test_dataset = file_based_input_dataset_builder(
                channel='test',
                input_filenames=test_data_filenames,
                is_training=False,
                drop_remainder=False,
                batch_size=test_batch_size,
                epochs=epochs,
                steps_per_epoch=test_steps,
                max_seq_length=max_seq_length).map(select_data_and_label_from_record)

            print('Starting test...')
            test_history = model.evaluate(test_dataset,
                                          steps=test_steps,
                                          callbacks=callbacks)
            print('Test history {}'.format(test_history))

        # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model
        print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path))
        model.save_pretrained(transformer_fine_tuned_model_path)
        upload_model_to_s3(transformer_fine_tuned_model_path, output_dir)

        # Save the TensorFlow SavedModel for Serving Predictions
        print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
        model.save(tensorflow_saved_model_path, save_format='tf')
        upload_model_to_s3(tensorflow_saved_model_path, output_dir)

    # Copy inference.py and requirements.txt to the code/ directory
    #   Note: This is required for the SageMaker Endpoint to pick them up.
    #         This appears to be hard-coded and must be called code/
    # `local_model_dir` was never defined; `model_dir` is the only model
    # directory this script declares.
    inference_path = os.path.join(model_dir, 'code/')
    print('Copying inference source files to {}'.format(inference_path))
    os.makedirs(inference_path, exist_ok=True)
    os.system('cp inference.py {}'.format(inference_path))
    print(glob(inference_path))
    # os.system('cp requirements.txt {}/code'.format(inference_path))

    if run_sample_predictions:
        # Imports used only by this optional evaluation step; they are placed
        # before first use (pandas was previously imported after pd.read_csv).
        import csv
        import itertools

        import matplotlib.pyplot as plt
        import numpy as np
        import pandas as pd
        from sklearn.metrics import accuracy_score
        from sklearn.metrics import classification_report
        from sklearn.metrics import confusion_matrix

        loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
            transformer_fine_tuned_model_path,
            id2label={0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
            label2id={1: 0, 2: 1, 3: 2, 4: 3, 5: 4})

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

        inference_pipeline = TextClassificationPipeline(model=loaded_model,
                                                        tokenizer=tokenizer,
                                                        framework='tf',
                                                        device=-1)

        print("""I loved it!  I will recommend this to everyone.""",
              inference_pipeline("""I loved it!  I will recommend this to everyone."""))
        print("""It's OK.""", inference_pipeline("""It's OK."""))
        print("""Really bad.  I hope they don't make this anymore.""",
              inference_pipeline("""Really bad.  I hope they don't make this anymore."""))

        df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
                                      delimiter='\t',
                                      quoting=csv.QUOTE_NONE,
                                      compression='gzip')[['review_body', 'star_rating']]
        df_test_reviews = df_test_reviews.sample(n=100)

        def predict(review_body):
            """Return the label of the first pipeline result for one review."""
            prediction_map = inference_pipeline(review_body)
            return prediction_map[0]['label']

        # y_test holds the model predictions, y_actual the ground truth.
        y_test = df_test_reviews['review_body'].map(predict)
        y_actual = df_test_reviews['star_rating']

        # The ground truth goes in y_true and the predictions in y_pred;
        # the two were previously swapped.
        print(classification_report(y_true=y_actual, y_pred=y_test))
        print('Accuracy: ', accuracy_score(y_true=y_actual, y_pred=y_test))

        def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens):
            """Draw confusion matrix ``cm`` with one tick per entry of ``classes``."""
            print(cm)
            plt.imshow(cm, interpolation='nearest', cmap=cmap)
            plt.title(title)
            plt.colorbar()
            tick_marks = np.arange(len(classes))
            plt.xticks(tick_marks, classes, rotation=45)
            plt.yticks(tick_marks, classes)

            fmt = 'd'
            for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
                # The original chose "black" on both sides of a threshold
                # test, so the text colour is simply black.
                plt.text(j, i, format(cm[i, j], fmt),
                         horizontalalignment="center",
                         color="black")

            plt.tight_layout()
            plt.ylabel('True label')
            plt.xlabel('Predicted label')

        cm = confusion_matrix(y_true=y_actual, y_pred=y_test)

        fig, ax = plt.subplots(figsize=(10, 5))
        plot_conf_mat(cm,
                      classes=['1', '2', '3', '4', '5'],
                      title='Confusion Matrix')

        # Save the confusion matrix before showing it: plt.show() may leave an
        # empty figure behind, which would be written out as a blank image.
        metrics_path = os.path.join(model_dir, 'metrics/')
        os.makedirs(metrics_path, exist_ok=True)
        plt.savefig(os.path.join(metrics_path, 'confusion_matrix.png'))
        plt.show()
def predict_fn(input_data, model):
    """Run ``input_data`` through a pipeline built from ``model``.

    ``model`` is unpacked into exactly two items (trained model, tokenizer);
    the pipeline output is returned unchanged.
    """
    classifier_model, classifier_tokenizer = model
    classify = TextClassificationPipeline(
        model=classifier_model,
        tokenizer=classifier_tokenizer,
    )
    return classify(input_data)
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 15 03:40:33 2021

@author: nashe
"""

import os

# Must be set before TensorFlow is imported (via transformers) to take effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}

from transformers import (
    AutoTokenizer,
    TFAutoModelForSequenceClassification,
    TextClassificationPipeline,
)

MODEL_DIR = r"D:\Fine-tuned Models\NLP\bert\tf-distilbert-base-uncased\epoch_1_loss_0.39"
TOKENIZER_DIR = r"D:\Models\NLP\bert\tf-distilbert-base-uncased"

# Text classification pipeline: fine-tuned weights, base-model tokenizer.
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    framework='tf',
    device=0,
)

# Keep only the first entry of the pipeline output for the sample sentence.
result = pipeline("It was a good watch. But a little boring.")[0]