def run_pipeline_test(self, model, tokenizer):
        """Smoke-test a ``TextClassificationPipeline`` built from ``model`` and ``tokenizer``.

        The pipeline is called once with a single string and once with a list
        of two strings.  Each output must compare equal to a list of
        ``{"label": ANY(str), "score": ANY(float)}`` entries, and every
        returned label must be one of ``model.config.id2label``'s values.
        """

        def expected_entry():
            # Fresh matcher objects for every expected prediction.
            return {"label": ANY(str), "score": ANY(float)}

        classifier = TextClassificationPipeline(model=model,
                                                tokenizer=tokenizer)

        # Inputs stay small because BartTokenizer tiny has maximum position embeddings = 22
        single_output = classifier("HuggingFace is in")
        self.assertEqual(nested_simplify(single_output), [expected_entry()])
        self.assertTrue(
            single_output[0]["label"] in model.config.id2label.values())

        batch_output = classifier(["HuggingFace is in ", "Paris is in France"])
        self.assertEqual(
            nested_simplify(batch_output),
            [expected_entry(), expected_entry()],
        )
        for position in (0, 1):
            self.assertTrue(batch_output[position]["label"] in
                            model.config.id2label.values())
Пример #2
0
def predict_fn(input_data, model):
    """Classify ``input_data`` with a pipeline built from the pair held in ``model``.

    Args:
        input_data: Passed unchanged to the pipeline call (and logged).
        model: Two-item iterable unpacked as ``(trained_model, tokenizer)``.

    Returns:
        Whatever the ``TextClassificationPipeline`` call returns.
    """
    logger.info('***** PREDICT_FN ********')
    classifier_model, classifier_tokenizer = model
    classifier = TextClassificationPipeline(model=classifier_model,
                                            tokenizer=classifier_tokenizer)
    logger.info('***** TEXT INPUT : %s', input_data)
    return classifier(input_data)
Пример #3
0
    def __init__(self,
                 model_type="DISTILBERT",
                 model_name="distilbert-base-uncased",
                 num_labels: int = 2,
                 load_path: str = ""):
        """Create the sequence-classification model, its pipeline and its trainer.

        Args:
            model_type: Key handed to ``get_adaptor`` and to the parent initializer.
            model_name: Name used to load the configuration and, when
                ``load_path`` is empty, the model weights.
            num_labels: Forwarded to ``AutoConfig.from_pretrained``.
            load_path: When not ``""``, the model is loaded from here instead
                of from ``model_name``.
        """
        self.adaptor = get_adaptor(model_type)

        model_config = AutoConfig.from_pretrained(model_name,
                                                  num_labels=num_labels)

        # An explicit load path takes precedence over the model name.
        weights_source = model_name if load_path == "" else load_path
        classifier = AutoModelForSequenceClassification.from_pretrained(
            weights_source, config=model_config)

        super().__init__(model_type, model_name, classifier)

        cuda_device = detect_cuda_device_number()
        self._pipeline = TextClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            device=cuda_device,
        )

        self._trainer = TCTrainer(
            self.model,
            self.model_type,
            self.tokenizer,
            self._device,
            self.logger,
        )
Пример #4
0
def simple_inference():
    '''
    Run three sample sentences through the model stored in ``./model_out/``.

    The tokenizer and model are loaded from ``./model_out/``, the model is
    moved to ``'cpu'`` and both are wrapped in a
    ``TextClassificationPipeline`` created with ``device=-1``.  For each
    sample the elapsed time since the previous measurement is printed
    together with the pipeline result.
    :return: None
    '''
    model_dir = "./model_out/"
    tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
    model = DistilBertForSequenceClassification.from_pretrained(model_dir)
    model.to('cpu')
    sentiment_classifier = TextClassificationPipeline(model=model,
                                                      tokenizer=tokenizer,
                                                      device=-1)

    samples = (
        "this is so cute!",
        "That's so disgusting!",
        "this is a simple test.",
    )

    # Each interval is measured from the previous timestamp, so it also
    # covers the time spent printing the preceding result.
    previous_tick = time.time()
    for sample in samples:
        result = sentiment_classifier(sample)
        current_tick = time.time()
        print(current_tick - previous_tick, result)
        previous_tick = current_tick
Пример #5
0
def classify_sentiment():
    """Classify the ``sentence`` field of the current request's JSON body.

    The request data is decoded as UTF-8 JSON, its ``"sentence"`` entry is
    converted to ``str`` and passed to a ``TextClassificationPipeline`` built
    from the module-level ``model`` and ``tokenizer``.

    Returns:
        ``str()`` of the pipeline output.
    """
    payload = json.loads(request.data.decode('utf-8'))
    sentence = str(payload["sentence"])

    # Device 0 when config.use_cuda is truthy, otherwise -1.
    classifier = TextClassificationPipeline(
        model=model,
        tokenizer=tokenizer,
        device=0 if config.use_cuda else -1,
    )
    return str(classifier(sentence))
Пример #6
0
def sentiment_model(text: str):
    """Classify ``text`` with the ``distilbert-base-uncased-finetuned-sst-2-english`` checkpoint.

    Args:
        text: Input passed to the text-classification pipeline.

    Returns:
        A ``(label, score)`` tuple read from the first prediction the
        pipeline returns.
    """
    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    classifier = TextClassificationPipeline(model=model,
                                            tokenizer=tokenizer,
                                            task="sentiment-analysis")

    # Run inference exactly once and read both fields from the same
    # prediction; calling the pipeline separately for the label and for the
    # score would repeat the whole forward pass.
    top_prediction = classifier(text)[0]
    return top_prediction['label'], top_prediction['score']
    def test_unbatch_attentions_hidden_states(self):
        """Batched calls must work when the model is loaded with hidden states and attentions enabled."""
        checkpoint = "hf-internal-testing/tiny-random-distilbert"
        model = DistilBertForSequenceClassification.from_pretrained(
            checkpoint, output_hidden_states=True, output_attentions=True
        )
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)

        # This call used to throw an error because `hidden_states` are a tuple
        # of tensors instead of the expected tensor.
        repeated_inputs = ["This is great !" for _ in range(20)]
        predictions = classifier(repeated_inputs, batch_size=32)
        self.assertEqual(len(predictions), 20)
Пример #8
0
    def __init__(self, args: Namespace):
        """Configure the training command from parsed command-line arguments.

        Creates the output directory, loads the pipeline for ``args.task``
        (only ``"text_classification"`` is implemented), reads the training
        CSV and, when ``args.validation_data`` is truthy, the validation CSV,
        then copies the optimisation settings onto the instance.

        Raises:
            NotImplementedError: If ``args.task`` is ``"token_classification"``
                or ``"question_answering"``.
        """
        self.logger = getLogger("transformers-cli/training")

        self.framework = "tf" if is_tf_available() else "torch"

        # Make sure the output location exists before anything is loaded.
        os.makedirs(args.output, exist_ok=True)
        assert os.path.isdir(args.output)
        self.output = args.output

        self.column_label = args.column_label
        self.column_text = args.column_text
        self.column_id = args.column_id

        self.logger.info("Loading {} pipeline for {}".format(
            args.task, args.model))
        if args.task in ("token_classification", "question_answering"):
            raise NotImplementedError
        if args.task == "text_classification":
            self.pipeline = TextClassificationPipeline.from_pretrained(
                args.model)

        self.logger.info("Loading dataset from {}".format(args.train_data))
        # Both CSV files are read with the same column layout.
        csv_layout = dict(
            column_label=args.column_label,
            column_text=args.column_text,
            column_id=args.column_id,
            skip_first_row=args.skip_first_row,
        )
        self.train_dataset = Processor.create_from_csv(args.train_data,
                                                       **csv_layout)

        self.valid_dataset = None
        if args.validation_data:
            self.logger.info("Loading validation dataset from {}".format(
                args.validation_data))
            self.valid_dataset = Processor.create_from_csv(
                args.validation_data, **csv_layout)

        self.validation_split = args.validation_split
        self.train_batch_size = args.train_batch_size
        self.valid_batch_size = args.valid_batch_size
        self.learning_rate = args.learning_rate
        self.adam_epsilon = args.adam_epsilon
Пример #9
0
    def __init__(
        self,
        model_name_or_path: Union[
            Path, str] = "shahrukhx01/bert-mini-finetune-question-detection"):
        """
        Build the query classification pipeline from a sequence-classification checkpoint.

        :param model_name_or_path: Transformer based fine tuned mini bert model for query classification
        """
        # Record the init parameters so the component config can be exported as YAML.
        self.set_config(model_name_or_path=model_name_or_path)

        # Keyword arguments are evaluated left to right, so the model is
        # loaded before the tokenizer.
        self.query_classification_pipeline = TextClassificationPipeline(
            model=AutoModelForSequenceClassification.from_pretrained(
                model_name_or_path),
            tokenizer=AutoTokenizer.from_pretrained(model_name_or_path),
        )
def main():
    """Rebuild the ProtBert-based Keras classifier, load its weights and classify one sequence.

    The network is ``Rostlab/prot_bert_bfd`` followed by global max pooling,
    batch normalisation and a small dense head with a 6-way softmax output.
    Weights are read from ``./checkpoints/mini_test2/tf_model.h5`` and the
    prediction for a single hard-coded sequence is printed.
    """
    backbone = TFAutoModel.from_pretrained('Rostlab/prot_bert_bfd')

    input_ids = tf.keras.layers.Input(shape=(MAX_LEN, ),
                                      name='input_ids',
                                      dtype='int64')
    attention_mask = tf.keras.layers.Input(shape=(MAX_LEN, ),
                                           name='attention_mask',
                                           dtype='int64')

    token_embeddings = backbone(input_ids, attention_mask=attention_mask)[0]

    # Classification head stacked on top of the backbone output.
    features = tf.keras.layers.GlobalMaxPooling1D()(token_embeddings)
    features = tf.keras.layers.BatchNormalization()(features)
    features = tf.keras.layers.Dense(64, activation='relu')(features)
    features = tf.keras.layers.Dropout(0.1)(features)
    features = tf.keras.layers.Dense(16, activation='relu')(features)
    probabilities = tf.keras.layers.Dense(6,
                                          activation='softmax',
                                          name='outputs')(features)

    classifier = tf.keras.Model(inputs=[input_ids, attention_mask],
                                outputs=[probabilities])
    classifier.load_weights('./checkpoints/mini_test2/tf_model.h5')

    # TODO: load the tokenizer and the pretrained model from (checkpoints directory)
    tokenizer = AutoTokenizer.from_pretrained(
        'Rostlab/prot_bert_bfd',
        do_lower_case=False,
    )
    # Alternative kept for reference:
    # bert = TFAutoModelForSequenceClassification.from_pretrained('./checkpoints/mini_test/weights.h5', from_pt=True)

    # TODO: change device to read from cuda apis
    ec_pipeline = TextClassificationPipeline(
        model=classifier,
        tokenizer=tokenizer,
        device=0,
        framework='tf',
        task="first EC number prediction",
    )

    # Space-separated residues; adjacent literals are concatenated into one string.
    seq = (
        'M E N H S K Q T E A P H P G T Y M P A G Y P P P Y P P A A F Q G P S D H A A Y P I P Q A G Y Q G P P G P Y P G P Q P G Y P V P P G G Y A G G '
        'G P S G F P V Q N Q P A Y N H P G G P G G T P W M P A P P P P L N C P P G L E Y L A Q I D Q L L V H Q Q I E L L E V L T G F E T N N K Y E I '
        'K N S L G Q R V Y F A V E D T D C C T R N C C G A S R P F T L R I L D N L G R E V M T L E R P L R C S S C C F P C C L Q E I E I Q A P P G V '
        'P V G Y V T Q T W H P C L P K F T L Q N E K K Q D V L K V V G P C V V C S C C S D I D F E L K S L D E E S V V G K I S K Q W S G F V R E A F '
        'T D A D N F G I Q F P L D L D V K M K A V M L G A C F L I D F M F F E R T G N E E Q R S G A W Q '
    )

    print(ec_pipeline(seq))
Пример #11
0
    def __init__(self,
                 model_type="BERT",
                 model_name="bert-base-multilingual-cased",
                 num_labels=3):
        """Create the sequence-classification model, its pipeline and an ``ABSATrainer``.

        Args:
            model_type: Key handed to ``get_adaptor`` and to the parent initializer.
            model_name: Name used to load both the configuration and the model.
            num_labels: Forwarded to ``AutoConfig.from_pretrained``.
        """
        self.adaptor = get_adaptor(model_type)

        model_config = AutoConfig.from_pretrained(model_name,
                                                  num_labels=num_labels)
        classifier = AutoModelForSequenceClassification.from_pretrained(
            model_name, config=model_config)

        super().__init__(model_type, model_name, classifier)

        cuda_device = detect_cuda_device_number()
        self._pipeline = TextClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            device=cuda_device,
        )

        self._trainer = ABSATrainer(
            self.model,
            self.model_type,
            self.tokenizer,
            self._device,
            self.logger,
        )
    def __init__(self,
                 model_type="DISTILBERT",
                 model_name="distilbert-base-uncased",
                 num_labels=2):
        """Create model and tokenizer through the adaptor, then the pipeline and a ``TCTrainer``.

        Args:
            model_type: Key handed to ``get_adaptor`` and to the parent initializer.
            model_name: Name used to load the configuration, the model and the tokenizer.
            num_labels: Forwarded to ``AutoConfig.from_pretrained``.
        """
        self.adaptor = get_adaptor(model_type)

        model_config = AutoConfig.from_pretrained(model_name,
                                                  num_labels=num_labels)
        classifier = self.adaptor.SequenceClassification.from_pretrained(
            model_name, config=model_config)
        classifier_tokenizer = self.adaptor.Tokenizer.from_pretrained(
            model_name)

        super().__init__(model_type, model_name, classifier,
                         classifier_tokenizer)

        cuda_device = detect_cuda_device_number()
        # The pipeline is built from the freshly loaded objects, while the
        # trainer receives the attributes held on the instance.
        self._pipeline = TextClassificationPipeline(
            model=classifier,
            tokenizer=classifier_tokenizer,
            device=cuda_device,
        )

        self._trainer = TCTrainer(
            self._model,
            self.model_type,
            self._tokenizer,
            self._device,
            self.logger,
        )
                                                                        2: 1,
                                                                        3: 2,
                                                                        4: 3,
                                                                        5: 4
                                                                       })

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

        if num_gpus >= 1:
            inference_device = 0 # GPU 0
        else:
            inference_device = -1 # CPU
        print('inference_device {}'.format(inference_device))

        inference_pipeline = TextClassificationPipeline(model=loaded_model, 
                                                        tokenizer=tokenizer,
                                                        framework='tf',
                                                        device=inference_device)  

        print("""I loved it!  I will recommend this to everyone.""", inference_pipeline("""I loved it!  I will recommend this to everyone."""))
        print("""It's OK.""", inference_pipeline("""It's OK."""))
        print("""Really bad.  I hope they don't make this anymore.""", inference_pipeline("""Really bad.  I hope they don't make this anymore."""))

        import csv

        df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', 
                                        delimiter='\t', 
                                        quoting=csv.QUOTE_NONE,
                                        compression='gzip')[['review_body', 'star_rating']]

        df_test_reviews = df_test_reviews.sample(n=100)
        df_test_reviews.shape
Пример #14
0
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TextClassificationPipeline

# Directory holding both the saved tokenizer and the saved model.
_MODEL_DIR = "./model_out/"

tokenizer = DistilBertTokenizer.from_pretrained(_MODEL_DIR)
model = DistilBertForSequenceClassification.from_pretrained(_MODEL_DIR)

# Wrap model and tokenizer in a text-classification pipeline.
sentiment_classifier = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
)

# Classify one sample sentence and show the pipeline output.
result = sentiment_classifier("this is so cute!")
print(result)
Пример #15
0
        df = pd.read_csv(os.path.join(_DATA_PATH, _DATA_FILE))
        index_list = df.index.tolist()
        n_rows = len(index_list)
        batches = [
            index_list[i:i + _MAX_NUMBER_ROWS]
            for i in range(0, n_rows, _MAX_NUMBER_ROWS)
        ]

        # DOWNLOAD DE MODEL AND TOKENIZER FROM HUGGINGFACE
        model = BertForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=_HUGGINGFACE_MODEL)
        tokenizer = BertTokenizer.from_pretrained(
            pretrained_model_name_or_path=_HUGGINGFACE_MODEL)

        # CREATE THE CLASSIFIER
        sentiment_analyzer = TextClassificationPipeline(model=model,
                                                        tokenizer=tokenizer)

        # INFER SENTIMENTS
        for i in range(len(batches)):
            # get a portion of data
            df_i = df.loc[batches[i]]
            # create the filename
            filename = os.path.join(
                _DATA_PATH_NEW,
                _DATA_FILE_NEW.format(
                    n=str(i).rjust(len(str(len(batches))), '0')))
            if not os.path.exists(filename):
                try:
                    # extract the text clean, but is going to be cleaned again by the tokenizer
                    text = list(df_i['text_clean'].astype(str).values)
                    # infer sentiments
Пример #16
0
    args = parser.parse_args()

    # Read the input CSV, using its first column as the index.
    print(f"Loading data from {args.source_file}")
    data_df = pd.read_csv(args.source_file, index_col=0)
    print(data_df.tail())

    # Load the checkpoint and the tokenizer
    print(f"- Loading tokenizer {args.tokenizer}.")
    print(f"- Loading checkpoint {args.checkpoint}.")
    tokenizer = BertTokenizerFast.from_pretrained(args.tokenizer)
    model = BertForSequenceClassification.from_pretrained(args.checkpoint)

    # Prepare the inference pipeline
    print(f"- Preparing inference pipeline.")
    pipeline = TextClassificationPipeline(
        model=model,
        tokenizer=tokenizer,
        # batch_size=args.bs, # TODO: does not work with batch_size; update the transformers library eventually.
        device=args.device)

    # Run the pipeline over the whole text column at once.
    print(f"- Running inference:")
    preds_dict = pipeline(data_df[args.text_column].values.tolist())
    # Extract the predictions from the returned dictionaries
    preds = [example["label"] for example in preds_dict]
    data_df["predictions"] = preds

    # Save the results
    # NOTE(review): the output path is args.source_file, i.e. the same file
    # that was read above, so the input CSV is overwritten - confirm intended.
    print(f"- Saving results to {args.source_file}.")
    data_df.to_csv(f"{args.source_file}")
Пример #17
0
 def get_test_pipeline(self, model, tokenizer, feature_extractor):
     """Return a text-classification pipeline together with two sample inputs.

     ``feature_extractor`` is accepted but not used here.
     """
     classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
     sample_inputs = ["HuggingFace is in", "This is another test"]
     return classifier, sample_inputs
Пример #18
0
# Slicing type -> (slider label, slider min, slider max, slider step,
#                  test-data column, value excluded before splitting or None).
# The excluded value is presumably a placeholder for unavailable data
# (the year case was documented as such) - TODO confirm for the others.
_SLICING_OPTIONS = {
    "Word Count": ("Word Count Split", 200, 1300, 1, "word_counts", None),
    "Year Published": ("Year Published Split", 2010, 2019, 1,
                       "year_published", 0),
    "Average Sentence Length": ("Average Sentence Length Split", 101, 199, 1,
                                "average_sentence_length", 0),
    "Percentage Punctuation": ("Percentage Punctuation Split", .12, .8, .01,
                               "percentage_punc_to_word", 1),
}


def _read_user_text(text_key, url_key):
    """Render the input-method widgets and return ``(text, ready)``.

    ``ready`` is False while the widget still holds its "Type Here"
    placeholder or when the URL could not be parsed.
    """
    user_input_type = st.selectbox("Select a method to input data",
                                   ['Text Box', 'URL'])

    if user_input_type == 'Text Box':
        text = st.text_area("Enter Text", "Type Here", key=text_key)
        return text, text != 'Type Here'

    if user_input_type == 'URL':
        raw_url = st.text_input("Enter URL", "Type Here", key=url_key)
        if raw_url != 'Type Here':
            try:
                return get_text(raw_url), True
            except Exception:
                # Only ordinary errors are reported; a bare ``except`` would
                # also swallow KeyboardInterrupt/SystemExit and other
                # BaseException-based control flow.
                st.error("Cannot parse url")

    return None, False


def _bert_class_scores(pipeline, text):
    """Return the four class scores of the fine-tuned BERT pipeline as a ``(1, 4)`` array."""
    # Truncates text so that the model will run.
    pred = pipeline(text[:2000])
    return np.expand_dims(
        np.array([pred[0][i]["score"] for i in range(4)]), axis=0)


def _render_prediction(text, tokenizer, model_lstm, pipeline):
    """Let the user pick a model and display its prediction for ``text``."""
    model_selected = st.selectbox("Select a model",
                                  ['Baseline LSTM', 'Fine-tuned BERT'])

    if model_selected == 'Baseline LSTM':
        if st.button("Analyze"):
            X = preprocess_text(tokenizer, [text])
            displayPredictedDf(model_lstm.predict(X))
    elif model_selected == 'Fine-tuned BERT':
        if st.button("Analyze"):
            displayPredictedDf(_bert_class_scores(pipeline, text))


def _render_similar_claims(text, claim_model_bert, class2label):
    """Show training claims that closely match sentences of ``text``."""
    model_selected = st.selectbox("Select a model", ['BERT Similarity'])
    if model_selected != 'BERT Similarity':
        return
    if not st.button("Analyze Claim"):
        return

    # Every sentence of the article is scanned; no truncation is applied.
    sentences = sentence_splitter(text)
    embedding_data = load_embedded_data("healthFactTrainData.pkl")

    matched_claims = []
    for sent in sentences:
        claim_embed = claim_model_bert.encode(sent)
        sim_scores = cosine_similarity(embedding_data['claim_embedding'],
                                       claim_embed.reshape(1, -1))
        # Indices of the three highest similarity scores, best first.
        top = np.flip(np.argsort(sim_scores.flatten()))[:3]

        for idx in top:
            score = round(sim_scores[idx].item(), 3)
            # Keep strong matches only, and skip claims labelled -1.
            if score > 0.8 and embedding_data['label'][idx] != -1:
                matched_claims.append([
                    score,
                    sent,
                    embedding_data['claim'][idx],
                    class2label[embedding_data['label'][idx]],
                ])

    df = pd.DataFrame(matched_claims,
                      columns=[
                          'Similarity Score',
                          'Trigger Sentence in Article',
                          'Claim in Training Data',
                          'Claim Label'
                      ])
    df = df.sort_values(by=['Similarity Score'], ascending=False)
    # Replace the index with an empty column so the table shows no row numbers.
    st.table(df.assign(hack='').set_index('hack'))


def _render_split_statistics(testing_data, slicing_type, tokenizer,
                             model_lstm):
    """Split ``testing_data`` at a slider threshold and report LSTM accuracy per split."""
    spec = _SLICING_OPTIONS.get(slicing_type)
    if spec is None:
        return
    slider_label, min_value, max_value, step, column, excluded_value = spec

    threshold = st.slider(label=slider_label,
                          min_value=min_value,
                          max_value=max_value,
                          step=step)

    if excluded_value is not None:
        testing_data = testing_data[testing_data[column] != excluded_value]

    lower_split = testing_data[testing_data[column] <= threshold]
    upper_split = testing_data[testing_data[column] > threshold]

    if st.button("Generate Split Statistics"):
        # Always compare the lower split against the upper split; passing the
        # lower split twice (as the year branch once did) would report the
        # same half of the data in both rows.
        accDf, totalaccuracyDF = generateAccDfLSTM(lower_split,
                                                   upper_split,
                                                   tokenizer,
                                                   model_lstm,
                                                   sampleSize=SAMPLE_SIZE)
        st.write(totalaccuracyDF)
        st.write(accDf)


def main():
    """Run the "Public Health Fake News Analyzer" Streamlit app.

    Loads the fine-tuned BERT pipeline, the baseline LSTM with its
    tokenizer, the sentence-similarity model and the testing data, then
    renders the page chosen in the sidebar: "About", "Fake News Prediction",
    "Similarity Matching" or "Testing".
    """
    bert_tokenizer = DistilBertTokenizer.from_pretrained('bert_tokenizer')
    bert_fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(
        'bert_fine_tuned')
    pipeline = TextClassificationPipeline(model=bert_fine_tuned_model,
                                          tokenizer=bert_tokenizer,
                                          return_all_scores=True)

    # Loading in lstm tokenizer and model
    tokenizer = load_tokenizer('tokenizer.pkl')
    model_lstm = keras.models.load_model('lstm_token300_dim32_softmax.h5')

    # Loading in sentence similarity embedding model
    claim_model_bert = torch.load("claim_model_bert")

    # Dictionary relating number label to written label
    class2label = {0: 'False', 1: 'Mixture', 2: 'True', 3: 'Unproven'}

    # Loading in the testing data and removing erroneous results
    testingData = pd.read_csv('TestDataWithSlicingDF')
    testingData = testingData[testingData["label"] != -1]

    st.title("Public Health Fake News Analyzer")

    options = st.sidebar.selectbox(
        "Choose a page",
        ["About", "Fake News Prediction", "Similarity Matching", "Testing"])

    if options == "About":

        st.write("Creators: Alex Gui, Vivek Lam and Sathya Chitturi")
        st.write("Dataset Source: https://arxiv.org/abs/2010.09926")

        st.write(
            " Due to the nature and popularity of social networking sites, misinformation can propagate rapidly leading to widespread dissemination of misleading and even harmful information. A plethora of misinformation can make it hard for the public to understand what claims hold merit and which are baseless. This machine learning tool allows users to quickly learn whether an article contains fake news. The tabs in this website correspond to predictive fake news detection, user claim similarity matching, and model performance evaluation."
        )

        st.error(
            "Disclaimer: This project is still a work in progress. The best source of information regarding fake news comes directly from verified fact-checkers."
        )

    # ---- Tab 1: prediction on a single article ----
    elif options == "Fake News Prediction":

        st.subheader("Prediction on article")
        st.markdown(Path("predTab.md").read_text())

        text, ready = _read_user_text("predText", "predURL")
        if ready:
            _render_prediction(text, tokenizer, model_lstm, pipeline)

    # ---- Tab 2: similar claims from the training data ----
    elif options == "Similarity Matching":

        st.subheader("Similar Claim Finder")
        st.markdown(Path("simTab.md").read_text())

        text, ready = _read_user_text("simText", "simURL")
        if ready:
            _render_similar_claims(text, claim_model_bert, class2label)

    # ---- Tab 3: accuracy on slices of the testing data ----
    elif options == "Testing":

        st.subheader("Prediction on testing data")
        st.markdown(Path("testTab.md").read_text())

        model_selected = st.selectbox("Select a model", ['Baseline LSTM'])

        if model_selected == 'Baseline LSTM':
            st.write("{0} Accuracy on whole Test Dataset: 66%".format(
                model_selected))

            user_input = st.selectbox("Slicing Type", list(_SLICING_OPTIONS))
            _render_split_statistics(testingData, user_input, tokenizer,
                                     model_lstm)
Пример #19
0
 def __init__(self, name_model: str = SENTIMENT_MODEL):
     """Load tokenizer and model for ``name_model`` and store the resulting pipeline in ``self.pipe``."""
     loaded_tokenizer = AutoTokenizer.from_pretrained(name_model)
     loaded_model = AutoModelForSequenceClassification.from_pretrained(
         name_model)
     self.pipe = TextClassificationPipeline(
         model=loaded_model,
         tokenizer=loaded_tokenizer,
     )
Пример #20
0
def upload_model_to_s3(file, bucket):
    """Upload one local file to S3, using its local path as the object key.

    Args:
        file: Path of the file to upload. It is opened in binary mode, so it
            must be a regular file (``open`` fails on a directory).
        bucket: Bucket argument forwarded unchanged to ``upload_fileobj``.
    """
    s3 = boto3.client('s3')
    # The upload has to run inside the ``with`` body: the handle is only
    # open there, and it is closed again even if the upload raises.
    with open(file, "rb") as f:
        s3.upload_fileobj(f, bucket, file)
    print('Model file {} uploaded to {}'.format(file, bucket))


if __name__ == '__main__':
    # ---- Input channels ---------------------------------------------------
    # Locations that are searched for ``*.tfrecord`` files further down.
    train_data = 's3://sagemaker-us-west-2-231218423789/training-pipeline-2020-09-05-16-19-31/processing/output/bert-train'
    test_data = 's3://sagemaker-us-west-2-231218423789/training-pipeline-2020-09-05-16-19-31/processing/output/bert-test'
    validation_data = 's3://sagemaker-us-west-2-231218423789/training-pipeline-2020-09-05-16-19-31/processing/output/bert-validation'

    # ---- Output locations -------------------------------------------------
    # NOTE(review): model_dir is a relative path - confirm that is intended.
    model_dir = 'opt/ml/model'
    output_dir = 's3://sagemaker-us-west-2-231218423789/dlc/output'

    # ---- Graph / precision switches ---------------------------------------
    use_xla = False
    use_amp = False

    # ---- Batching and schedule --------------------------------------------
    max_seq_length = 64
    train_batch_size = 64
    validation_batch_size = 64
    test_batch_size = 64
    epochs = 1
    learning_rate = 3e-5
    epsilon = 1e-8
    train_steps_per_epoch = 50
    validation_steps = 10
    test_steps = 10

    # ---- Which optional stages to run -------------------------------------
    freeze_bert_layer = True
    run_validation = False
    run_test = False
    run_sample_predictions = False

    # Local output directories: one for the fine-tuned transformers
    # checkpoint, one for the TensorFlow SavedModel.
    transformer_fine_tuned_model_path = os.path.join(model_dir, 'transformers/fine-tuned/')
    tensorflow_saved_model_path = os.path.join(model_dir, 'tensorflow/saved_model/0')
    for output_path in (transformer_fine_tuned_model_path, tensorflow_saved_model_path):
        os.makedirs(output_path, exist_ok=True)

    # Strategy whose scope wraps the model construction and training below.
    distributed_strategy = tf.distribute.MirroredStrategy()
    
    with distributed_strategy.scope():
        # Build the data pipeline, download and compile the model, train it
        # (optionally with validation and test), then save the artifacts.
        tf.config.optimizer.set_jit(use_xla)
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": use_amp})

        # NOTE(review): train_data is an s3:// URI; if `glob` is the stdlib
        # function it only matches local paths - confirm the data is
        # available locally under that name.
        train_data_filenames = glob(os.path.join(train_data, '*.tfrecord'))
        print('train_data_filenames {}'.format(train_data_filenames))
        train_dataset = file_based_input_dataset_builder(
            channel='train',
            input_filenames=train_data_filenames,
            is_training=True,
            drop_remainder=False,
            batch_size=train_batch_size,
            epochs=epochs,
            steps_per_epoch=train_steps_per_epoch,
            max_seq_length=max_seq_length).map(select_data_and_label_from_record)

        tokenizer = None
        config = None
        model = None

        # This is required when launching many instances at once...  the urllib request seems to get denied periodically
        max_download_attempts = 5
        successful_download = False
        retries = 0
        while retries < max_download_attempts and not successful_download:
            try:
                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
                config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
                                                          num_labels=len(CLASSES))
                model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                                              config=config)
                successful_download = True
                print('Sucessfully downloaded after {} retries.'.format(retries))
            except Exception as download_error:
                # Catch Exception (not a bare except) so Ctrl-C / SystemExit
                # still stop the script, and report what actually failed.
                retries = retries + 1
                random_sleep = random.randint(1, 30)
                print('Download failed: {}'.format(download_error))
                print('Retry #{}.  Sleeping for {} seconds'.format(retries, random_sleep))
                time.sleep(random_sleep)

        callbacks = []

        initial_epoch_number = 0

        if not successful_download:
            # Fail here with a clear message instead of crashing later with
            # an AttributeError on a None model.
            raise RuntimeError(
                'Not properly initialized... tokenizer/config/model could not be '
                'downloaded after {} attempts'.format(max_download_attempts))

        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
        print('** use_amp {}'.format(use_amp))
        if use_amp:
            # loss scaling is currently required when using mixed precision
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')

        print('*** OPTIMIZER {} ***'.format(optimizer))

        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

        # Set `trainable` BEFORE compile(): Keras documents that a change to
        # `trainable` is only taken into account once compile() is called.
        model.layers[0].trainable = not freeze_bert_layer
        model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
        print('Compiled model {}'.format(model))
        # summary() prints the table itself; wrapping it in print() only
        # added a stray "None" line.
        model.summary()

        if run_validation:
            validation_data_filenames = glob(os.path.join(validation_data, '*.tfrecord'))
            print('validation_data_filenames {}'.format(validation_data_filenames))
            validation_dataset = file_based_input_dataset_builder(
                channel='validation',
                input_filenames=validation_data_filenames,
                is_training=False,
                drop_remainder=False,
                batch_size=validation_batch_size,
                epochs=epochs,
                steps_per_epoch=validation_steps,
                max_seq_length=max_seq_length).map(select_data_and_label_from_record)

            print('Starting Training and Validation...')
            validation_dataset = validation_dataset.take(validation_steps)
            train_and_validation_history = model.fit(train_dataset,
                                                     shuffle=True,
                                                     epochs=epochs,
                                                     initial_epoch=initial_epoch_number,
                                                     steps_per_epoch=train_steps_per_epoch,
                                                     validation_data=validation_dataset,
                                                     validation_steps=validation_steps,
                                                     callbacks=callbacks)
            print(train_and_validation_history)
        else:  # Not running validation
            print('Starting Training (Without Validation)...')
            train_history = model.fit(train_dataset,
                                      shuffle=True,
                                      epochs=epochs,
                                      initial_epoch=initial_epoch_number,
                                      steps_per_epoch=train_steps_per_epoch,
                                      callbacks=callbacks)
            print(train_history)

        if run_test:
            test_data_filenames = glob(os.path.join(test_data, '*.tfrecord'))
            print('test_data_filenames {}'.format(test_data_filenames))
            test_dataset = file_based_input_dataset_builder(
                channel='test',
                input_filenames=test_data_filenames,
                is_training=False,
                drop_remainder=False,
                batch_size=test_batch_size,
                epochs=epochs,
                steps_per_epoch=test_steps,
                max_seq_length=max_seq_length).map(select_data_and_label_from_record)

            print('Starting test...')
            test_history = model.evaluate(test_dataset,
                                          steps=test_steps,
                                          callbacks=callbacks)

            print('Test history {}'.format(test_history))

        # NOTE(review): upload_model_to_s3 open()s its first argument and
        # passes the second one on as the bucket. Both calls below hand it a
        # directory and an s3:// URI - confirm this is what the helper expects.

        # Save the Fine-Tuned Transformers Model as a New "Pre-Trained" Model
        print('transformer_fine_tuned_model_path {}'.format(transformer_fine_tuned_model_path))
        model.save_pretrained(transformer_fine_tuned_model_path)
        upload_model_to_s3(transformer_fine_tuned_model_path, output_dir)

        # Save the TensorFlow SavedModel for Serving Predictions
        print('tensorflow_saved_model_path {}'.format(tensorflow_saved_model_path))
        model.save(tensorflow_saved_model_path, save_format='tf')
        upload_model_to_s3(tensorflow_saved_model_path, output_dir)

        # Copy inference.py and requirements.txt to the code/ directory
        #   Note: This is required for the SageMaker Endpoint to pick them up.
        #         This appears to be hard-coded and must be called code/
        # `local_model_dir` was never defined in this script; the model root
        # configured above is `model_dir`.
        inference_path = os.path.join(model_dir, 'code/')
        print('Copying inference source files to {}'.format(inference_path))
        os.makedirs(inference_path, exist_ok=True)
        os.system('cp inference.py {}'.format(inference_path))
        print(glob(inference_path))
        # os.system('cp requirements.txt {}/code'.format(inference_path))
        
    if run_sample_predictions:
        # Reload the fine-tuned model, classify a few sample reviews, then
        # score 100 sampled test reviews and save a confusion-matrix plot.

        # Imports are local to this optional stage and sit at its top so
        # every name is bound before first use (pandas used to be imported
        # only after `pd.read_csv` had already been called).
        import csv
        import itertools

        import matplotlib.pyplot as plt
        import numpy as np
        import pandas as pd
        from sklearn.metrics import accuracy_score
        from sklearn.metrics import classification_report
        from sklearn.metrics import confusion_matrix

        # Class index <-> star rating: index 0 is one star, index 4 is five.
        star_ratings = [1, 2, 3, 4, 5]
        id2label = dict(enumerate(star_ratings))
        label2id = {rating: index for index, rating in id2label.items()}

        loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
            transformer_fine_tuned_model_path,
            id2label=id2label,
            label2id=label2id)

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

        inference_pipeline = TextClassificationPipeline(model=loaded_model,
                                                        tokenizer=tokenizer,
                                                        framework='tf',
                                                        device=-1)

        # Quick smoke test on three hand-written reviews.
        for sample_review in (
                """I loved it!  I will recommend this to everyone.""",
                """It's OK.""",
                """Really bad.  I hope they don't make this anymore."""):
            print(sample_review, inference_pipeline(sample_review))

        df_test_reviews = pd.read_csv('./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
                                      delimiter='\t',
                                      quoting=csv.QUOTE_NONE,
                                      compression='gzip')[['review_body', 'star_rating']]

        df_test_reviews = df_test_reviews.sample(n=100)

        def predict(review_body):
            """Return the label of the first prediction for ``review_body``."""
            prediction_map = inference_pipeline(review_body)
            return prediction_map[0]['label']

        y_predicted = df_test_reviews['review_body'].map(predict)
        y_actual = df_test_reviews['star_rating']

        # Ground truth goes in y_true and model output in y_pred; they were
        # swapped, which exchanges precision/recall and transposes the matrix.
        print(classification_report(y_true=y_actual, y_pred=y_predicted))
        print('Accuracy: ', accuracy_score(y_true=y_actual, y_pred=y_predicted))

        def plot_conf_mat(cm, classes, title, cmap=plt.cm.Greens):
            """Draw ``cm`` as an annotated heat map on the current figure.

            Args:
                cm: 2-D array of counts; rows are drawn on the y axis.
                classes: Tick labels, used for both axes.
                title: Plot title.
                cmap: Matplotlib colormap for the cells.
            """
            print(cm)
            plt.imshow(cm, interpolation='nearest', cmap=cmap)
            plt.title(title)
            plt.colorbar()
            tick_marks = np.arange(len(classes))
            plt.xticks(tick_marks, classes, rotation=45)
            plt.yticks(tick_marks, classes)

            # Write each count into its cell. The old threshold conditional
            # picked "black" in both branches, so the colour is constant.
            for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
                plt.text(j, i, format(cm[i, j], 'd'),
                         horizontalalignment="center",
                         color="black")

            # Axis labels and layout are set once, after the loop (they used
            # to be re-applied for every single cell).
            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.tight_layout()

        # Pin the matrix to all five classes so it always lines up with the
        # tick labels, even when a rating is missing from the sample.
        # NOTE(review): assumes predictions and star_rating are ints 1..5.
        cm = confusion_matrix(y_true=y_actual, y_pred=y_predicted, labels=star_ratings)

        # A single figure: the extra plt.figure() only created an empty one.
        fig, _ = plt.subplots(figsize=(10, 5))
        plot_conf_mat(cm,
                      classes=[str(rating) for rating in star_ratings],
                      title='Confusion Matrix')

        # Save BEFORE show(): once an interactive window is closed the
        # figure is gone and savefig would write a blank image.
        # `local_model_dir` was never defined; the configured root is `model_dir`.
        metrics_path = os.path.join(model_dir, 'metrics/')
        os.makedirs(metrics_path, exist_ok=True)
        fig.savefig(os.path.join(metrics_path, 'confusion_matrix.png'))

        plt.show()
Пример #21
0
def predict_fn(input_data, model):
    """Classify ``input_data`` and return the pipeline's output.

    Args:
        input_data: Input handed straight to the classification pipeline.
        model: Two-item iterable ``(trained_model, tokenizer)``.

    Returns:
        Whatever the ``TextClassificationPipeline`` call returns.
    """
    trained_model, tokenizer = model
    classifier = TextClassificationPipeline(tokenizer=tokenizer,
                                            model=trained_model)
    return classifier(input_data)
Пример #22
0
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 15 03:40:33 2021

@author: nashe
"""


import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}

from transformers import TextClassificationPipeline, TFAutoModelForSequenceClassification, AutoTokenizer

# Directory the sequence-classification model is loaded from (a fine-tuned
# checkpoint, judging by the path).
MODEL_DIR = r"D:\Fine-tuned Models\NLP\bert\tf-distilbert-base-uncased\epoch_1_loss_0.39"

# Text-classification pipeline: the model comes from MODEL_DIR, while the
# tokenizer is loaded from a separate directory.
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(r"D:\Models\NLP\bert\tf-distilbert-base-uncased")

# NOTE(review): device=0 presumably selects the first GPU - confirm against
# the transformers pipeline documentation.
pipeline = TextClassificationPipeline(model=model,
                                      tokenizer=tokenizer,
                                      framework='tf',
                                      device=0)

# The pipeline output is indexed with [0] to keep only the first prediction.
result = pipeline("It was a good watch. But a little boring.")[0]