def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / 'imdb')
    label_dict = corpus.make_label_dictionary()

    embedding = FlairEmbeddings('news-forward-fast')
    document_embeddings = DocumentRNNEmbeddings([embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence('Berlin is a really nice city.')

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_load_use_classifier_flair(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb", label_type="topic")
    label_dict = corpus.make_label_dictionary(label_type="topic")

    flair_document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [flair_embeddings], 128, 1, False, 64, False, False)
    model: TextClassifier = TextClassifier(
        document_embeddings=flair_document_embeddings,
        label_dictionary=label_dict,
        label_type="topic",
        multi_label=False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")
    model.predict(sentence)

    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float

    del trainer, model, corpus, flair_document_embeddings

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model
def test_train_load_use_classifier_with_sampler(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    model: TextClassifier = TextClassifier(document_embeddings, label_dict, multi_label=False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        max_epochs=2,
        shuffle=False,
        sampler=ImbalancedClassificationDatasetSampler,
    )

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    del trainer, model, corpus

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model
def main(FIN, FOUT):
    print("Reading in data.")
    df = pd.read_csv(FIN)
    count = 0

    # filter dataframe to English-language USA tweets
    df = df[df['place_country_code'] == 'US'].fillna('None')
    df = df[df['language'] == 'en']

    # load classifier
    classifier = TextClassifier.load('sentiment')
    print('Model has been loaded from flair.')

    # create the output directory if it does not already exist
    os.makedirs(FOUT, exist_ok=True)

    print('Running Script.')
    for row in df[['created_at', 'place_full_name', 'language', 'mentions',
                   'hashtags', 'clean_text']].iterrows():
        tweet, count = run_stack(count, row[1]['created_at'],
                                 row[1]['place_full_name'], row[1]['language'],
                                 row[1]['mentions'], row[1]['hashtags'],
                                 row[1]['clean_text'])
        dump_tweet(FOUT + '/sentiment_tweets.json', tweet)
    print('Script has finished.')
def test_train_load_use_classifier_with_prob(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence, multi_class_prob=True):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence, multi_class_prob=True)
    loaded_model.predict([sentence, sentence_empty], multi_class_prob=True)
    loaded_model.predict([sentence_empty], multi_class_prob=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def main():
    db_name = 'restaurants.db'
    years = ['2010', '2011', '2012', '2013', '2014', '2015',
             '2016', '2017', '2018', '2019', '2020']
    restaurants = ['Tibits', 'Mildreds Soho', 'By Chloe']

    classifier = TextClassifier.load('sentiment')

    for restaurant in restaurants:
        print('restaurant', restaurant)
        for year in years:
            print('year', year)
            sentiment_scores = calcualte_sentiment_for_given_restaurant_year(
                db_name, classifier, restaurant, year)

            number_of_positive = 0
            number_of_negative = 0
            for score in sentiment_scores:
                if score == 1:
                    number_of_positive += 1
                else:
                    number_of_negative += 1

            print('positive', number_of_positive)
            print('negative', number_of_negative * -1)
import re
import sys

import pandas as pd
import streamlit as st
from flair.data import Sentence
from flair.models import TextClassifier
from twitterscraper import query_tweets

sys.path.append('/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages')

# Set page title
st.title('Twitter Sentiment Analysis')

# Load classification model
with st.spinner('Loading classification model...'):
    classifier = TextClassifier.load('models/best-model.pt')

# Preprocessing setup
allowed_chars = ' AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789~`!@#$%^&*()-=_+[]{}|;:",./<>?'
punct = '!?,.@#'
maxlen = 280


def preprocess(text):
    # Replace URLs with 'http', drop disallowed characters, and pad punctuation with spaces
    return ''.join([' ' + char + ' ' if char in punct else char
                    for char in [char for char in
                                 re.sub(r'http\S+', 'http', text, flags=re.MULTILINE)
                                 if char in allowed_chars]])


### SINGLE TWEET CLASSIFICATION ###
st.subheader('Single tweet classification')

# Get sentence input, preprocess it, and convert to flair.data.Sentence format
tweet_input = st.text_input('Tweet:')
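# Quick, illustrative sanity check of preprocess() (made-up input, not from the
# original app): URLs collapse to the token 'http', disallowed characters are
# dropped, and punctuation is padded with spaces.
print(preprocess('Loving this library! https://example.com/page #nlp'))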
)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# 7. start the training
trainer.train('resources/taggers/ag_news',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

# 8. plot training curves (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves('resources/taggers/ag_news/loss.tsv')
plotter.plot_weights('resources/taggers/ag_news/weights.txt')

classifier = TextClassifier.load('resources/taggers/ag_news/final-model.pt')

# create example sentence
sentence = Sentence('France is the current world cup winner.')

# predict tags and print
classifier.predict(sentence)
print(sentence.labels)
    reproject_words_dimension=256,
)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# 7. start the training
trainer.train('resources/classifiers',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

# below, load the trained model and try it out
classifier = TextClassifier.load('resources/classifiers/best-model.pt')

# create example sentence
sentence = Sentence("現実を受け入れて生きるしかない", use_tokenizer=japanese_tokenizer)
print(sentence.to_tokenized_string())

# predict class and print
classifier.predict(sentence)
label_dict = sentence.to_dict()["labels"][0]
label_dict["confidence"] if label_dict["value"] == "__label__O" else 0
# apply document LSTM to the stacked embeddings
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    # hidden_size=512,
    # reproject_words=True,
    # reproject_words_dimension=256,
)

# build model
classifier = TextClassifier(document_embeddings,
                            label_dictionary=corpus.make_label_dictionary(),
                            multi_label=False)
trainer = ModelTrainer(classifier, corpus)

# specify parameters and train model
trainer.train(PATH / 'models/', max_epochs=3, checkpoint=True, learning_rate=1e-1)

classifier = TextClassifier.load('/content/drive/My Drive/emnlp/models/best-model.pt')

"""## Dev Set Prediction"""

dev_folder = ""  # adjust these variables to point at the dev data
dev_template_labels_file = ""
task_SLC_output_file = ""


def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    file_list = glob.glob(os.path.join(folder_name, file_pattern))
    article_id_list, sentence_id_list, sentence_list = ([], [], [])
import datetime as dt
import re

import pandas as pd
import streamlit as st
from flair.data import Sentence
from flair.models import TextClassifier

# Set page title
st.title('Sentiment Analysis')

# Load classification model
with st.spinner('Loading classification model...'):
    classifier = TextClassifier.load(
        '/Users/mengyu/Desktop/engineering/models/best-model.pt')

# Preprocessing setup
allowed_chars = ' AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789~`!@#$%^&*()-=_+[]{}|;:",./<>?'
punct = '!?,.@#'
maxlen = 280


def preprocess(text):
    # Replace URLs with 'http', drop disallowed characters, and pad punctuation with spaces
    return ''.join([
        ' ' + char + ' ' if char in punct else char for char in [
            char for char in re.sub(r'http\S+', 'http', text, flags=re.MULTILINE)
            if char in allowed_chars
        ]
    ])
        return thevalence  # Surely I missed the builtin method that just returns the value?
    except Exception:
        print("An exception occurred. Text was not passed to get_valence")
        return 'n/a'


def plotone_df(df):
    plt.hist(df['valence'], color='blue', edgecolor='black')
    # plt.tight_layout()
    plt.show()


print_intro()
classifier = TextClassifier.load('en-sentiment')

num = 0
for df in dfs:
    threelines()
    print("Preparing valence for df %d - this may take a second" % num)
    # print("Preparing valence for df %s" % str(dfs.index(df)))
    df['valence'] = df['text'].apply(get_valence)
    # df.reset_index(drop=True)
    # plt.hist(df['valence'], color='blue', edgecolor='black')
    # plt.tight_layout()
    # plt.show()
    num += 1
    # plotone_df(df)
    # print(df['valence'].dtypes)
import json
import heapq

import pandas as pd
from flask import Flask, jsonify, request
from fastai.text import *
from flair.models import TextClassifier
from flair.data import Sentence

application = Flask(__name__)
classifier_sentiment = TextClassifier.load('./models/best-model.pt')


@application.route("/")
def hello():
    return "Hello World!"


@application.route('/classify', methods=['POST'])
def post_tasks():
    return_object = []
    data = json.loads(request.data)
    df = pd.DataFrame(data)
    items = TextList.from_df(df[['sample']])
    learn = load_learner('./models', 'export.pkl', test=items)
    preds = learn.get_preds(ds_type=DatasetType.Test)[0].tolist()
    preds = [get_classes(item) for item in preds]
    for indx, sample in enumerate(data, start=0):
        sample["categories"] = preds[indx]
        print(sample)
        return_object.append(sample)
    return jsonify(return_object)
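# Hedged client sketch (not part of the original source): one way to call the
# /classify endpoint above. Host/port are assumptions; the payload shape (a JSON
# list of objects with a 'sample' field) follows from the handler's use of
# json.loads(request.data) and df[['sample']].
import json
import requests

resp = requests.post('http://localhost:5000/classify',
                     data=json.dumps([{'sample': 'Great product, would buy again.'}]))
print(resp.json())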
import os
import re
import sys
import webbrowser

import pandas as pd
from flair.data import Sentence
from flair.models import SequenceTagger, TextClassifier

debug = False
play_tags = ['ok', 'play', 'yes', 'sure', 'like', 'love', 'awesome', 'nice', 'yep', 'yeah', 'good']
retry_tags = ['no', 'next', 'shuffle', 'hate', 'dislike', 'another', 'nope', 'nay', 'jeez', 'nah', 'ugh', 'not']

# model loading
tagger = SequenceTagger.load('pos')
mood = TextClassifier.load('en-sentiment')
classifier = TextClassifier.load(sys.argv[1])

df = pd.read_pickle('./data/music.pkl')
df = df.loc[df['valence'] != '0.0']
genres = set(df['genre'])

mood_history = []
current_genre = False


def debug_print(*objects):
    global debug
    if debug:
        print(objects)
import streamlit as st
import pandas as pd
import numpy as np
from flair.models import TextClassifier
from flair.data import Sentence

new_data_folder = './gdp_benchmark_classifier/'
finetuned_classifier = TextClassifier.load(new_data_folder + 'best-model.pt')


def finetuned_model_predictions(input_file_path, col_text, finetuned_classifier,
                                output_file_path):
    '''Makes sentiment predictions on unannotated data points contained in the
    input CSV file by loading the user-defined classifier. Exports the CSV file
    after adding new columns filled with the model predictions.
    '''
    if col_text.isdigit():  # no text header: address the column by position
        unannotated_df = pd.read_csv(input_file_path, header=None)
        col_text = int(col_text)  # integer indexing works after converting the input string
    else:
        unannotated_df = pd.read_csv(input_file_path)

    # drop some duplicated rows
    # unannotated_df = unannotated_df.drop_duplicates(col_text)

    # add new columns to export the predictions, including the predicted
    # probabilities of the less likely labels
def __init__(self, ckpt=CKPT, name2nats=DICT):
    self.classifier = TextClassifier.load(ckpt)
    self.name2nats = self.construct(name2nats)
consumer_key = 'XXXX'
consumer_secret = 'XXXX'
access_token = 'XXXX'
access_token_secret = 'XXXX'

auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

# Set page title
st.title('Twitter Sentiment Analysis')

# Load classification model
with st.spinner('Loading classification model...'):
    classifier = TextClassifier.load('model-saves/my_fine_tuned_bert1.pt')

# Preprocessing setup
allowed_chars = ' AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789~`!@#$%^&*()-=_+[]{}|;:",./<>?'
punct = '!?,.@#'
maxlen = 280


def preprocess(text):
    # Replace URLs with 'http', drop disallowed characters, and pad punctuation with spaces
    return ''.join([
        ' ' + char + ' ' if char in punct else char for char in [
            char for char in re.sub(r'http\S+', 'http', text, flags=re.MULTILINE)
            if char in allowed_chars
        ]
    ])
def predict_labels(data, pretrained_model):
    """Read a CoNLL-style file, extract B/I-tagged outcome spans per sentence,
    classify each span with the flair model, and print a summary table."""
    classifier = TextClassifier.load(pretrained_model)
    data_to_classify, sub_outcomes = [], []
    with open(data, 'r') as d:
        instances = []
        for i in d.readlines():
            if i != '\n':
                i = i.split()
                instances.append(i)
            else:
                # blank line: end of sentence, process the accumulated tokens
                # if instances:
                instances_copy = instances.copy()
                data_to_classify.append(instances_copy)
                outcome, sub_instances, l = '', [], 0
                _outcomes_ = ()
                # print(instances_copy)
                for x in range(len(instances_copy)):
                    if x == l:
                        x_str = instances_copy[x][1]
                        # if x_str != 'O':
                        if x_str.startswith('B') or x_str.startswith('I'):
                            outcome = instances_copy[x][0]
                            if x == len(instances_copy) - 1:
                                if str(outcome.strip()) != 'nan':
                                    sent = Sentence(outcome)
                                    classifier.predict(sent)
                                    sub_instances.append('{}:{}'.format(
                                        outcome.strip(), sent.labels[0].value))
                            else:
                                # extend the span while subsequent tokens stay inside it
                                for y in range(x + 1, len(instances_copy)):
                                    if (not instances_copy[y][1].startswith('B')
                                            and instances_copy[y][1] != 'O'):
                                        outcome += ' {}'.format(instances_copy[y][0])
                                        outcome = outcome.strip()
                                        if y == len(instances_copy) - 1:
                                            outcome_copy = outcome
                                            if str(outcome_copy.strip()) != 'nan':
                                                sent = Sentence(outcome_copy)
                                                classifier.predict(sent)
                                                sub_instances.append('{}:{}'.format(
                                                    outcome_copy.strip(),
                                                    sent.labels[0].value))
                                            outcome = ''
                                        l = y
                                    else:
                                        if outcome:
                                            outcome_copy = outcome
                                            if str(outcome_copy.strip()) != 'nan':
                                                sent = Sentence(outcome_copy)
                                                classifier.predict(sent)
                                                sub_instances.append('{}:{}'.format(
                                                    outcome_copy.strip(),
                                                    sent.labels[0].value))
                                            outcome = ''
                                        break
                    l += 1
                sub_outcomes.append(tuple(sub_instances))
                instances.clear()

    data_to_classify = [' '.join(j[0] for j in i) for i in data_to_classify]
    data_to_classify_frame = pd.DataFrame(data_to_classify, columns=['Abstract'])
    max_outcomes_per_sentence = max([len(i) for i in sub_outcomes])
    columns_ = ['Outcome {}'.format(i + 1) for i in range(max_outcomes_per_sentence)]
    sub_outcomes_frame = pd.DataFrame(sub_outcomes, columns=columns_)
    data_to_classify_frame = pd.concat(
        [data_to_classify_frame, sub_outcomes_frame], axis=1)
    print(tabulate(data_to_classify_frame, headers='keys', tablefmt='psql'))
def clean(raw):
    """ Remove hyperlinks and markup """
    result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
    result = re.sub('&gt;', "", result)
    result = re.sub('&#x27;', "'", result)
    result = re.sub('&quot;', '"', result)
    result = re.sub('&#x2F;', ' ', result)
    result = re.sub('<p>', ' ', result)
    result = re.sub('</i>', '', result)
    result = re.sub('&#62;', '', result)
    result = re.sub('<i>', ' ', result)
    result = re.sub("\n", '', result)
    return result


classifier = TextClassifier.load('./model_result/final-model.pt')

# df = review_dataframe
# df = df.head(10)
df['text'] = df['text'].fillna('').apply(str)

d = []
for i, row in df.iterrows():
    document = row['text']
    document = clean(document)
    sentence = Sentence(document)
    classifier.predict(sentence)
    print(document + "\n\n")
    print(sentence.labels)
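# Illustrative check of clean() on a made-up input (not from the original
# source), assuming the HTML-entity patterns reconstructed above:
print(clean('<p>Great read &#x27;here&#x27;: <a href="x">see this</a></p>'))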
def get_classifier_model(model_name) -> TextClassifier:
    return TextClassifier.load(model_name)
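# Hedged usage sketch (not from the original source): TextClassifier.load reads
# the full model from disk, so callers may want to memoize this getter. The
# 'en-sentiment' name is just an example of a model identifier flair accepts.
from functools import lru_cache


@lru_cache(maxsize=None)
def get_cached_classifier(model_name: str) -> TextClassifier:
    return get_classifier_model(model_name)


classifier = get_cached_classifier('en-sentiment')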
    reproject_words_dimension=256,
)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# 7. start the training
trainer.train('/home/anna/Desktop/markup/8',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

# 8. plot weight traces (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_weights('/home/anna/Desktop/markup/8/weights.txt')

classifier = TextClassifier.load('/home/anna/Desktop/markup/8/final-model.pt')

# create example sentence
sentence = Sentence('France is the current world cup winner.')

# predict class and print
classifier.predict(sentence)
print(sentence.labels)
def load_flair():
    return TextClassifier.load('en-sentiment')
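# Hedged usage sketch (not from the original source): load once, then reuse the
# classifier across predictions, since every call to load_flair() re-reads the
# model from disk.
from flair.data import Sentence

classifier = load_flair()
sentence = Sentence('The service was slow but the food was excellent.')
classifier.predict(sentence)
print(sentence.labels[0].value, sentence.labels[0].score)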
def __init__(self, model_name_or_path: str):
    self.classifier = TextClassifier.load(model_name_or_path)
# Create model
from flair.models import TextClassifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# Create model trainer
from flair.trainers import ModelTrainer
trainer = ModelTrainer(classifier, corpus)

# Train the model
trainer.train('model-saves',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=8,
              max_epochs=200)

# Load the model and make predictions
from flair.data import Sentence
classifier = TextClassifier.load('model-saves/final-model.pt')

pos_sentence = Sentence(preprocess('I love Python!'))
neg_sentence = Sentence(preprocess('Python is the worst!'))

classifier.predict(pos_sentence)
classifier.predict(neg_sentence)

print(pos_sentence.labels, neg_sentence.labels)
def __init__(self, path_to_model: str) -> None:
    """Load a trained Flair sentiment model from the given path."""
    from flair.models import TextClassifier
    self.classifier = TextClassifier.load(path_to_model)
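# Hedged usage sketch (not from the original source; the enclosing class is not
# shown above, so the name 'FlairSentiment' here is hypothetical):
# analyzer = FlairSentiment('model-saves/final-model.pt')
# sentence = Sentence('Works great!')
# analyzer.classifier.predict(sentence)
# print(sentence.labels)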
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "multi_class", label_type="topic")
    label_dict = corpus.make_label_dictionary(label_type="topic")

    model: TextClassifier = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        label_type="topic",
        multi_label=True,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        mini_batch_size=1,
        max_epochs=20,
        shuffle=False,
        checkpoint=False,
        train_with_test=True,
        train_with_dev=True,
    )

    sentence = Sentence("apple tv")
    model.predict(sentence)

    assert "apple" in [label.value for label in sentence.labels]
    assert "tv" in [label.value for label in sentence.labels]

    for label in sentence.labels:
        print(label)
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float

    del trainer, model, corpus

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("apple tv")
    loaded_model.predict(sentence)

    assert "apple" in [label.value for label in sentence.labels]
    assert "tv" in [label.value for label in sentence.labels]

    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    del loaded_model
from flask import Flask, request, jsonify
from flask_cors import CORS
from flair.models import TextClassifier
from flair.data import Sentence

classifier = TextClassifier.load('./model/best-model.pt')

mapping = {
    'sad': '😞',
    'smile': '😀',
    'food': '🍽',
    'heart': '❤',
    'baseball': '⚾'
}

app = Flask(__name__)
CORS(app)


@app.route('/emojify', methods=['POST'])
def emoji():
    data = request.form.get('text')
    if not len(data.strip()):
        return ''
    sentence = Sentence(data)
    classifier.predict(sentence)
    print(str(sentence.labels))
    if 'sad' in str(sentence.labels):
        return mapping['sad']
    elif 'smile' in str(sentence.labels):
        return mapping['smile']
    elif 'food' in str(sentence.labels):
        return mapping['food']
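# Hedged client sketch (not from the original source): /emojify reads the text
# from form data (request.form.get('text')), so a client would POST a form
# field named 'text'. Host/port are assumptions.
import requests

resp = requests.post('http://localhost:5000/emojify', data={'text': 'so happy today'})
print(resp.text)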
class SentimentAnalysisAPI(object):
    default_sentence: str = "N/A"
    flair_sent_model: TextClassifier = TextClassifier.load("sentiment")
    nltk_sent_model: SentimentIntensityAnalyzer = SentimentIntensityAnalyzer()

    @staticmethod
    def get_document_language(doc: str) -> str:
        language: str = ""
        try:
            language = detect(doc)
        except Exception as e:
            logger.error(e)
        return language

    @staticmethod
    def translate_to_english(src_doc: str, src_lang: str) -> str:
        translation: str = ""
        try:
            eng_translator: Translator = Translator()
            res: Translated = eng_translator.translate(src_doc, src=src_lang, dest="en")
            translation = res.text
        except Exception as e:
            logger.error(e)
        return translation

    @staticmethod
    def make_sentences(text: str, min_char: int = 3) -> list:
        """ Break apart text into a list of sentences """
        if len(text) > min_char:
            sentences: list = [sent for sent in split_single(text) if len(sent) > min_char]
        else:
            sentences = []
        if not sentences:
            logger.warning("Default sentence was added")
            sentences = [SentimentAnalysisAPI.default_sentence]
        return sentences

    @staticmethod
    def get_label_decision(score: float, lower_boundary: float = .4,
                           upper_boundary: float = .6) -> str:
        final_label: str = ""
        try:
            if score < lower_boundary:
                final_label = "Negative"
            elif lower_boundary <= score < upper_boundary:
                final_label = "Neutral"
            else:
                final_label = "Positive"
        except Exception as e:
            logger.error(e)
        return final_label

    @staticmethod
    def get_flair_sentiment_analysis(doc: str) -> FlairSentOutput:
        output: FlairSentOutput = FlairSentOutput()
        try:
            was_translated: bool = False
            # 1. Get the language of the document
            lang: str = SentimentAnalysisAPI.get_document_language(doc=doc)
            if lang != "en":
                # Translate
                doc = SentimentAnalysisAPI.translate_to_english(src_doc=doc, src_lang=lang)
                was_translated = True
            sentences: list = SentimentAnalysisAPI.make_sentences(text=doc)
            scores: list = []
            for sent in sentences:
                # 2. Predict with the (already loaded) flair model
                sentence: Sentence = Sentence(sent)
                SentimentAnalysisAPI.flair_sent_model.predict(sentence)
                single_res: Label = sentence.labels[0]
                single_label: str = single_res.value
                single_score: float = single_res.score if single_label == "POSITIVE" else (
                    1 - single_res.score)
                scores.append(single_score)
            final_score: float = round(float(np.mean(scores)), 3)
            final_label: str = SentimentAnalysisAPI.get_label_decision(score=final_score)
            output = FlairSentOutput(label=final_label,
                                     confidence=final_score,
                                     translated=was_translated,
                                     analysed=True)
        except Exception as e:
            logger.error(e)
        return output

    @staticmethod
    def get_textblob_sentiment_analysis(doc: str) -> TextBlobSentOutput:
        output: TextBlobSentOutput = TextBlobSentOutput()
        try:
            was_translated: bool = False
            # 1. Get the language of the document
            lang: str = SentimentAnalysisAPI.get_document_language(doc=doc)
            if lang != "en":
                # Translate
                doc = SentimentAnalysisAPI.translate_to_english(src_doc=doc, src_lang=lang)
                was_translated = True
            sentences: list = SentimentAnalysisAPI.make_sentences(text=doc)
            polarity_scores: list = []
            subjectivity_scores: list = []
            for sent in sentences:
                subjectivity: float = TextBlob(sent).sentiment.subjectivity
                polarity: float = TextBlob(sent).sentiment.polarity
                polarity_scores.append(polarity)
                subjectivity_scores.append(subjectivity)
            final_subjectivity: float = round(float(np.mean(subjectivity_scores)), 3)
            final_polarity: float = round(float(np.mean(polarity_scores)), 3)
            output = TextBlobSentOutput(analysed=True,
                                        polarity=final_polarity,
                                        subjectivity=final_subjectivity,
                                        translated=was_translated)
        except Exception as e:
            logger.error(e)
        return output

    @staticmethod
    def get_nltk_sentiment_analysis(doc: str) -> NLTKSentOutput:
        output: NLTKSentOutput = NLTKSentOutput()
        try:
            was_translated: bool = False
            # 1. Get the language of the document
            lang: str = SentimentAnalysisAPI.get_document_language(doc=doc)
            if lang != "en":
                # Translate
                doc = SentimentAnalysisAPI.translate_to_english(src_doc=doc, src_lang=lang)
                was_translated = True
            sentences: list = SentimentAnalysisAPI.make_sentences(text=doc)
            polarity_scores: list = []
            for sent in sentences:
                polarity_scores.append(
                    SentimentAnalysisAPI.nltk_sent_model.polarity_scores(text=sent))
            neg_prob: float = SentimentAnalysisAPI.get_nltk_scores(scores=polarity_scores, key="neg")
            neu_prob: float = SentimentAnalysisAPI.get_nltk_scores(scores=polarity_scores, key="neu")
            pos_prob: float = SentimentAnalysisAPI.get_nltk_scores(scores=polarity_scores, key="pos")
            compound_prob: float = SentimentAnalysisAPI.get_nltk_scores(scores=polarity_scores, key="compound")
            output = NLTKSentOutput(analysed=True,
                                    negative_prob=neg_prob,
                                    neutral_prob=neu_prob,
                                    positive_prob=pos_prob,
                                    compound_prob=compound_prob,
                                    translated=was_translated)
        except Exception as e:
            logger.error(e)
        return output

    @staticmethod
    def get_nltk_scores(scores: list, key: str) -> float:
        score: float = 0.0
        try:
            score = round(float(np.mean([i.get(key) for i in scores])), 3)
        except Exception as e:
            logger.error(e)
        return score
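# Hedged usage sketch (not from the original source): running all three analyses
# on one document. FlairSentOutput, TextBlobSentOutput and NLTKSentOutput are
# defined elsewhere in this codebase; the fields printed below are taken from
# the constructor calls above.
doc = "The new release is fantastic, though the install was painful."
flair_out = SentimentAnalysisAPI.get_flair_sentiment_analysis(doc=doc)
blob_out = SentimentAnalysisAPI.get_textblob_sentiment_analysis(doc=doc)
nltk_out = SentimentAnalysisAPI.get_nltk_sentiment_analysis(doc=doc)
print(flair_out.label, flair_out.confidence, blob_out.polarity, nltk_out.compound_prob)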
def get_lyrics_df(folder, artist):
    lyrics_df = concat_lyrics_df(folder, artist)
    lyrics_df.loc[:, 'artist'] = artist

    # Create columns
    lyrics_df.loc[:, 'artist_wrote_song'] = lyrics_df.apply(
        lambda x: 1 if x['artist'] in x['writers'] else 0, axis=1)
    lyrics_df.loc[:, 'artist_produced_song'] = lyrics_df.apply(
        lambda x: 1 if x['artist'] in x['producers'] else 0, axis=1)
    lyrics_df.loc[:, 'structure_tags'] = lyrics_df['lyrics'].apply(
        lambda x: re.findall(r"(\[.*\])", x))
    lyrics_df.loc[:, 'lyrics_clean'] = lyrics_df.apply(
        lambda x: clean_lyrics(x['lyrics'], x['structure_tags']), axis=1)
    lyrics_df.loc[:, 'structure_tags_clean'] = lyrics_df['structure_tags'].apply(
        lambda x: clean_structure_tags(x))
    lyrics_df.loc[:, 'song_structure'] = lyrics_df['structure_tags_clean'].apply(
        lambda x: get_song_structure(x))

    # Analyze tokens and text
    nlp = spacy.load('en', disable=['parser', 'ner'])
    lyrics_df.loc[:, 'total_word_count'] = lyrics_df['lyrics_clean'].apply(
        lambda x: len(nlp(x)))
    lyrics_df.loc[:, 'lyrics_lemmatized'] = lyrics_df['lyrics_clean'].apply(
        lambda x: ' '.join([token.lemma_.lower() for token in nlp(x)
                            if token.is_alpha and not token.is_stop]))
    lyrics_df['lemma_count'] = lyrics_df['lyrics_lemmatized'].apply(
        lambda x: len(x.split(' ')))
    lyrics_df['unique_lemmas_on_song'] = lyrics_df['lyrics_lemmatized'].apply(
        lambda x: len(set(x.split(' '))))

    # Sentiment analysis
    classifier = TextClassifier.load('sentiment')
    sents = []
    for i in range(0, len(lyrics_df.index.tolist())):
        text = lyrics_df.iloc[i]['lyrics_clean']
        if text:
            sentence = Sentence(text)
            classifier.predict(sentence)
            sents.append(str(sentence.labels))
        else:
            sents.append('')
    lyrics_df.loc[:, 'flair_sentiment'] = sents
    lyrics_df.loc[:, 'sentiment_label'] = lyrics_df['flair_sentiment'].str[1:-1].str.split(
        '(', expand=True).iloc[:, 0].fillna('').str.strip()
    lyrics_df.loc[:, 'sentiment_probability'] = lyrics_df['flair_sentiment'].str[1:-1].str.split(
        '(', expand=True).iloc[:, 1].str[:-1].fillna(0)
    lyrics_df.loc[:, 'sentiment'] = lyrics_df['sentiment_label'].apply(
        lambda x: 1 if x == 'POSITIVE' else -1 if x == 'NEGATIVE' else 0)
    lyrics_df.loc[:, 'sentiment_score'] = (
        lyrics_df['sentiment_probability'].astype(float) - 0.5)
    lyrics_df.loc[:, 'sentiment_score'] = (
        lyrics_df['sentiment_score'] * lyrics_df['sentiment'])

    os.chdir(folder / '{}'.format(artist))
    lyrics_df.to_csv('lyrics_final.csv', index=False)
    return lyrics_df
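# Hedged alternative sketch (not from the original source): reading label.value
# and label.score directly avoids the fragile string-splitting of
# str(sentence.labels) used above.
labels, scores = [], []
for text in lyrics_df['lyrics_clean']:
    if text:
        sentence = Sentence(text)
        classifier.predict(sentence)
        label = sentence.labels[0]
        labels.append(label.value)  # e.g. 'POSITIVE' or 'NEGATIVE'
        scores.append(label.score)  # model confidence in [0, 1]
    else:
        labels.append('')
        scores.append(0.0)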
import logging

from pymongo import MongoClient
import pandas as pd
import numpy as np
from flair.models import TextClassifier
from flair.data import Sentence

logging.basicConfig(level=logging.ERROR)

###### MongoDB connection
client = MongoClient('localhost', 27017)
db = client.covML
data_col = db.scrapedData
############################

flair_sentiment = TextClassifier.load('en-sentiment')


##
def analyze_sentiment(headline):
    s = Sentence(headline)
    flair_sentiment.predict(s)
    total_sentiment = s.labels[0].to_dict()
    return total_sentiment


##
def analyseSentiments():
    myNews = data_col.find()
    df = pd.DataFrame(myNews)
    del df['_id']
    # run the model once per headline and reuse the result for both columns
    results = [analyze_sentiment(headline) for headline in df['headline']]
    df['Result'] = np.array([r['value'] for r in results])
    df['confidence'] = np.array([r['confidence'] for r in results])
    data_col.delete_many({})