def get_text_features_tfidf(input_df: pd.DataFrame, text_col: str, n_comp=10, clean=True):
    vectorizer = make_pipeline(
        TfidfVectorizer(),
        make_union(
            TruncatedSVD(n_components=n_comp, random_state=42),
            NMF(n_components=n_comp, random_state=42),
            make_pipeline(
                BM25Transformer(use_idf=True, k1=2.0, b=0.75),
                TruncatedSVD(n_components=n_comp, random_state=42)),
            n_jobs=1))
    X = input_df[text_col].fillna('')
    if clean:
        pipeline = [
            hero.preprocessing.fillna,
            hero.preprocessing.remove_digits,
            hero.preprocessing.remove_punctuation,
            hero.preprocessing.remove_diacritics,
            hero.preprocessing.remove_whitespace
        ]
        # hero.clean returns a new Series, so the result must be assigned back
        X = hero.clean(X, pipeline=pipeline)
    X = vectorizer.fit_transform(X).astype(np.float32)
    output_df = pd.DataFrame(
        X,
        columns=[f'{text_col}_tfidf_svd_{i}' for i in range(n_comp)] +
                [f'{text_col}_tfidf_nmf_{i}' for i in range(n_comp)] +
                [f'{text_col}_tfidf_bm25_{i}' for i in range(n_comp)])
    return output_df
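# A minimal, self-contained sketch of the same pattern (TF-IDF followed by SVD
# components as features), assuming only pandas and scikit-learn are installed;
# BM25Transformer is omitted because it is not part of scikit-learn, and the
# data below is illustrative only.
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

demo_df = pd.DataFrame(
    {'summary': ['data science jobs', 'machine learning jobs', 'cooking recipes']})
svd_pipe = make_pipeline(TfidfVectorizer(),
                         TruncatedSVD(n_components=2, random_state=42))
svd_feats = svd_pipe.fit_transform(demo_df['summary'])  # shape: (3, 2)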
def indeed_postprocess(i_df, query_term, query_jobtype, verbose=False, shorten_links=False):
    print("Starting postprocess - ", datetime.now())
    # apply texthero cleaning
    i_df["titles"] = hero.clean(i_df["titles"])
    i_df["summary"] = hero.clean(i_df["summary"])
    # use bit.ly to shorten links
    if shorten_links:
        try:
            len(i_df["short_link"])  # raises KeyError if the column is missing
            print("found values for short_link, not re-creating")
        except KeyError:
            print("no values exist for short_link, creating them now")
            # a random delay avoids overloading the API; max runtime is 5 s * num_rows
            i_df["short_link"] = i_df["links"].apply(shorten_URL_bitly)
    else:
        i_df["short_link"] = "not_created"
    # save file to excel
    rn = datetime.now()
    i_PP_date = rn.strftime("_%m.%d.%Y-%H-%M_")
    i_df["date_pulled"] = rn.strftime("%m.%d.%Y")
    i_df["time_pulled"] = rn.strftime("%H:%M:%S")
    out_name = ("JS_DB_" + "query=[term(s)=" + query_term + ", type=" +
                query_jobtype + "]" + i_PP_date + ".xlsx")
    i_df.to_excel(out_name)
    if verbose:
        print("Saved {} - ".format(out_name), datetime.now())
    # download if requested
    return i_df
def preprocess(self, document, pipeline=None, add_preprocess=None):
    """
    Clean `document` with the given pipeline.

    If `pipeline` is None, texthero's default cleaning pipeline is used,
    which applies the following functions:

    1. :meth:`texthero.preprocessing.fillna`
    2. :meth:`texthero.preprocessing.lowercase`
    3. :meth:`texthero.preprocessing.remove_digits`
    4. :meth:`texthero.preprocessing.remove_html_tags`
    5. :meth:`texthero.preprocessing.remove_punctuation`
    6. :meth:`texthero.preprocessing.remove_diacritics`
    7. :meth:`texthero.preprocessing.remove_stopwords`
    8. :meth:`texthero.preprocessing.remove_whitespace`

    Any callables in `add_preprocess` are appended to the pipeline.
    """
    if pipeline is None:  # fall back to the default pipeline
        pipeline = heropreprocess.get_default_pipeline()
    if add_preprocess is not None:
        for preprocess in add_preprocess:
            pipeline.append(preprocess)
    clean_text = hero.clean(document, pipeline=pipeline)
    return clean_text
def cleaning_text(df_name):
    '''
    All the steps of preprocessing
    :param df_name: name of the df on which the content column must be preprocessed
    :return: the df with a "clean_content" column
    '''
    # delete pseudos (handles) starting with @
    df_name['clean_content'] = df_name['content'].apply(delete_pseudo)
    # clean method from texthero
    df_name['clean_content'] = hero.clean(df_name['clean_content'])
    # delete stopwords with texthero
    default_stopwords = stopwords.DEFAULT
    custom_stopwords = default_stopwords.union(
        set([
            "feel", "feeling", "im", "get", "http", "ive", "go", "day",
            "com", "got", "see", "4pm"  # a comma was missing, silently concatenating "see4pm"
        ]))
    df_name['clean_content'] = hero.remove_stopwords(df_name['clean_content'],
                                                     custom_stopwords)
    # remove urls
    df_name['clean_content'] = hero.remove_urls(df_name['clean_content'])
    # remove angle brackets
    df_name['clean_content'] = hero.remove_angle_brackets(
        df_name['clean_content'])
    # remove digits
    df_name['clean_content'] = hero.preprocessing.remove_digits(
        df_name['clean_content'], only_blocks=False)
    return df_name
def get_wordcloud(all_comments):
    df = pd.DataFrame(all_comments, columns=['text'])
    df['clean_text'] = hero.clean(df['text'])
    figure = hero.visualization.wordcloud(df['clean_text'], return_figure=True)
    # figure.savefig('plot.png')
    return figure, df
def _clean(self, series: pd.Series) -> pd.Series:
    custom_pipeline = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_digits,
        preprocessing.remove_punctuation,
        preprocessing.remove_diacritics,
        preprocessing.remove_whitespace
    ]
    return texthero.clean(series, pipeline=custom_pipeline)
def clean_text(raw_text):
    """
    Example preprocessing with texthero.
    Installing the nltk library makes its corpora available for stopword removal.
    """
    import os
    import texthero as hero
    clean_text = hero.clean(raw_text,
                            pipeline=[
                                hero.preprocessing.fillna,
                                hero.preprocessing.lowercase,
                                hero.preprocessing.remove_digits,
                                hero.preprocessing.remove_punctuation,
                                hero.preprocessing.remove_diacritics,
                                hero.preprocessing.remove_stopwords
                            ])
    # visualize with a word cloud
    hero.visualization.wordcloud(clean_text,
                                 colormap='viridis',
                                 background_color='white')
    import nltk
    nltk.download('stopwords')
    os.listdir(os.path.expanduser('~/nltk_data/corpora/stopwords/'))
    # use both English and Dutch stopwords
    custom_stopwords = nltk.corpus.stopwords.words(
        'dutch') + nltk.corpus.stopwords.words('english')
    # add custom stopword removal to the preprocessing pipeline
    apply_stopword_text = hero.clean(
        raw_text,
        pipeline=[
            hero.preprocessing.fillna,
            hero.preprocessing.lowercase,
            hero.preprocessing.remove_digits,
            hero.preprocessing.remove_punctuation,
            hero.preprocessing.remove_diacritics,
            lambda x: hero.preprocessing.remove_stopwords(
                x, stopwords=custom_stopwords)
        ])
    return apply_stopword_text
def clean_by_hero(x):
    custom_pipeline = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_digits,
        preprocessing.remove_punctuation,
        preprocessing.remove_diacritics,
        preprocessing.remove_whitespace,
        preprocessing.remove_stopwords
    ]
    return hero.clean(x, custom_pipeline)
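# A quick usage sketch for the pipeline-based cleaning above, assuming texthero
# is installed and imported as in the snippet (hero, preprocessing); the sample
# Series is illustrative.
import pandas as pd

sample = pd.Series(["  Héllo, World 123!  ", None])
print(clean_by_hero(sample))
# expected (roughly): 0    hello world
#                     1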
def preprocess(self, input_text):
    """Preprocessing."""
    clean_text = hero.clean(input_text,
                            pipeline=[
                                hero.preprocessing.fillna,
                                hero.preprocessing.lowercase,
                                hero.preprocessing.remove_digits,
                                hero.preprocessing.remove_punctuation,
                                hero.preprocessing.remove_diacritics,
                                hero.preprocessing.remove_stopwords
                            ])
    return clean_text
def cleaner(df, col):
    """
    :param df: a pandas dataframe
    :param col: a string (name of col to clean)
    :return: a dataframe cleaned with the texthero pipeline
    """
    # ignore FutureWarning re: texthero's default call to regex
    warnings.filterwarnings("ignore")
    pipeline = [
        pp.fillna, pp.remove_digits, pp.lowercase, pp.remove_punctuation,
        pp.remove_diacritics, pp.remove_stopwords, pp.remove_whitespace,
        pp.stem
    ]
    df[col] = hero.clean(df[col], pipeline=pipeline)
    return df
def process_resume(doc):
    text = ""
    for page in doc:
        text = text + str(page.getText())
    # Join the extracted lines into a single string
    tx = " ".join(text.split('\n'))
    # Remove unnecessary punctuation
    cleaned_tx = re.sub(r'[§_|]', '', tx)
    # Extract and remove phone numbers
    pattern = re.compile(
        r'([+(]?\d+[)\-]?[ \t\r\f\v]*[(]?\d{2,}[()\-]?[ \t\r\f\v]*\d{2,}[()\-]?[ \t\r\f\v]*\d*[ \t\r\f\v]*\d*[ \t\r\f\v]*)'
    )
    match = pattern.findall(cleaned_tx)
    match = [
        re.sub(r'[,.]', '', el) for el in match
        if len(re.sub(r'[()\-.,\s+]', '', el)) > 6
    ]
    match = [re.sub(r'\D$', '', el).strip() for el in match]
    phoneNum_match = [el for el in match if len(re.sub(r'\D', '', el)) <= 15]
    phoneNum_match = ' '.join([str(elem) for elem in phoneNum_match])
    cleaned_tx = cleaned_tx.replace(phoneNum_match, '')
    # Extract and remove email addresses
    email_pattern = re.compile(r'\S*@\S*')
    email_match = email_pattern.findall(cleaned_tx)
    email_match = ' '.join([str(elem) for elem in email_match])
    cleaned_tx = cleaned_tx.replace(email_match, '')
    # Further cleaning using texthero
    resume_df = pd.DataFrame([cleaned_tx], columns=['resume'])
    df2 = pd.DataFrame()
    custom_pipeline = [
        preprocessing.lowercase,
        preprocessing.remove_digits,
        preprocessing.remove_punctuation,
        preprocessing.remove_diacritics,
        preprocessing.remove_whitespace
        # preprocessing.stem
    ]
    df2['cleaned'] = hero.clean(resume_df['resume'], pipeline=custom_pipeline)
    pd.set_option('max_colwidth', 100000)
    cleaned_resume = df2['cleaned'].to_string(index=False)
    return cleaned_resume
def text_normalization(text):
    # use both English and Dutch stopwords
    custom_stopwords = nltk.corpus.stopwords.words(
        'dutch') + nltk.corpus.stopwords.words('english')
    x = hero.clean(text,
                   pipeline=[
                       hero.preprocessing.fillna,
                       hero.preprocessing.lowercase,
                       hero.preprocessing.remove_digits,
                       hero.preprocessing.remove_punctuation,
                       hero.preprocessing.remove_diacritics,
                       lambda x: hero.preprocessing.remove_stopwords(
                           x, stopwords=custom_stopwords)
                   ])
    return x
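# Usage sketch for text_normalization, assuming the nltk stopword corpora have
# been downloaded first; the input Series is illustrative. Dutch "de" and
# English "and"/"the" are stripped along with digits and punctuation.
import nltk
nltk.download('stopwords', quiet=True)
import pandas as pd

print(text_normalization(pd.Series(["De and the 123 Apple!"])))
# expected (roughly): 0    apple   -- with leftover spaces, since the
# pipeline does not end with remove_whitespace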
def preProcess(crawledData: list, models, headers):
    predictions = []
    for i in range(len(crawledData)):
        series = pd.Series(crawledData[i])
        series = hero.clean(series)
        toString = pd.Series.to_string(series, index=False)
        predictions.append(ModelUtils.fastPredict(toString, models))
    resFrame = pd.DataFrame()
    for i in range(len(predictions)):
        for j in range(len(predictions[i])):
            dataframe = headers.copy()
            new = predictions[i][j].replace('__label__', '')
            dataframe[new] = [1]
            dataframe["TEXT"] = crawledData[i]
            resFrame = pd.concat([resFrame, dataframe], ignore_index=True)
    return resFrame
def run_nlp_match(target_column, df, model):
    charge_description, categories = target_column
    logging.info(f'- Starting NLP match for {charge_description}')
    results = []
    for category in categories:
        if 'micro' in category:
            # do mapping where there is a description but no category mapping
            df['flag'] = ~df[charge_description].isna() & df[category].isna()
            start_counts = df['flag'].value_counts()
            logging.info(
                f'-- In {category}, there are {start_counts[True]} to classify.'
            )
            temp = hero.clean(
                df[df['flag'] == True][charge_description].copy(),
                pipeline=Config.text_pipeline)
            idx = temp.index.values.tolist()
            predictions = model.predict(temp)
            df[category].update(
                pd.Series(predictions, name=charge_description, index=idx))
            df['flag'] = ~df[charge_description].isna() & df[category].isna()
            end_counts = df['flag'].value_counts()
            try:
                logging.info(
                    f'There are {end_counts[True]} charges left to classify')
            except KeyError:
                logging.info(
                    f'Classification Status 100%. Started with {start_counts[True]} to map. Ended with {end_counts[False]} mapped.'
                )
            result = tuple((category, df[category]))
            results.append(result)
    # currently returning the first element of a list of (name, series) pairs
    return results[0]
def upload():
    if request.method == 'POST':
        # Check if the post request has the file part
        if 'file' not in request.files:
            flash('No file attached in request')
            return redirect(request.url)
        file = request.files['file']
        # if user does not select file, browser also
        # submits an empty part without filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        elif not allowed_file(file.filename):
            abort(400, 'Incorrect file extension')
            flash('Incorrect file extension. Must be PDF!')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(
                os.path.join(BASE_DIR, app.config['UPLOAD_FOLDER'], filename))
            resume_cleaned = final_resume_clean(
                os.path.join(app.config['UPLOAD_FOLDER'], filename), filename)
            jobs_df = pd.read_csv('cleaned_data.csv')
            global df
            df = get_recommendations(resume_cleaned, jobs_df)
            # Remove diacritics from original job descriptions
            custom_pipeline = [preprocessing.remove_diacritics]
            df['Description'] = hero.clean(df['Description'],
                                           pipeline=custom_pipeline)
            return render_template("recommendation.html",
                                   column_names=df.columns.values,
                                   target_column="Company",
                                   hide_column="Job Description",
                                   row_data=list(df.values.tolist()),
                                   zip=zip)
def name2feats(df, column='Name', methods=['pca', 'kmeans', 'ngram']):
    # preprocessing
    custom_pipeline = [
        hero.preprocessing.fillna,
        hero.preprocessing.lowercase,
        hero.preprocessing.remove_digits,
        hero.preprocessing.remove_punctuation,
        hero.preprocessing.remove_diacritics,
        hero.preprocessing.remove_whitespace
    ]
    df['clean_name'] = hero.clean(df[column], pipeline=custom_pipeline)
    # tfidf -> pca
    if 'pca' in methods:
        df['pca_name'] = hero.tfidf(df['clean_name'], max_features=200)
        df['pca_name'] = hero.pca(df['pca_name'], n_components=10)
        for i in np.arange(len(df['pca_name'].values[0])):
            df[f'tfidf_pca_{column}{i}'] = df['pca_name'].apply(lambda x: x[i])
        df.drop(columns=['pca_name'], inplace=True)
    # tfidf -> kmeans
    if 'kmeans' in methods:
        df[f'tfidf_kmeans_{column}'] = hero.tfidf(df['clean_name'],
                                                  max_features=200)
        df[f'tfidf_kmeans_{column}'] = hero.kmeans(
            df[f'tfidf_kmeans_{column}'], n_clusters=10)
    # n-gram
    if 'ngram' in methods:
        for n in [2, 3]:
            name_grams = df[column].apply(lambda x: line_ngram(x, n))
            grams = [x for row in name_grams for x in row if len(x) > 0]
            top_grams = pd.Series(grams).value_counts().head(20).index
            df[f'{column}_in_top_{n}gram'] = name_grams.map(
                lambda x: any([i for i in x if i in top_grams]))
    df.drop(columns=['clean_name'], inplace=True)
    return df
def clean_dataframe(df):
    df['clean_reviewText'] = hero.clean(df['reviewText'])
    return df
f"{proposals_directory}/{target_state}/{target_city}/{file_name}" ).read() custom_stop_words = [ "feira", "santana", "municipio", "municipal", "município", "municipais", "cidade", "publico", "publica", ] cleaned_text = remove_portuguese_stopwords(content, custom_stop_words) cleaned_text = pd.Series(cleaned_text) cleaned_text = hero.clean(cleaned_text, custom_pipeline) candidates_and_proposals[candidate_name] = cleaned_text for candidate, proposal in candidates_and_proposals.items(): print(f"------------------------ {candidate}") text = " ".join(proposal) wordcloud = WordCloud( background_color="white", width=2000, height=800, colormap="PuOr", collocations=False, ).generate(text) fig = plt.figure(figsize=(20, 10)) plt.imshow(wordcloud, interpolation="bilinear") plt.axis("off")
from model.utils import vectorize
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc, precision_score, accuracy_score, recall_score, f1_score, confusion_matrix
import pandas as pd
from utils import vectorize, TEXTHERO_FILTER  # note: shadows the earlier vectorize import
import numpy as np
import pickle
from texthero import preprocessing
import texthero
from text_matching_clf import TextMatchingClassifier

reviews = pd.read_csv("./clean/clean-geomdash.csv")
reviews["clean"] = texthero.clean(reviews["content"], pipeline=TEXTHERO_FILTER)
# SAMPLE_PER_CLASS = 250
# sampled_reviews = reviews.groupby('score').apply(lambda x: x.sample(SAMPLE_PER_CLASS))
sampled_reviews = reviews

with open('./model/vectorizer-reviews.pkl', 'rb') as vf:
    vectorizer = pickle.load(vf)
with open('./model/svc-reviews.pkl', 'rb') as svc_f:
    model = pickle.load(svc_f)
with open('./model/text-match-reviews.pkl', 'rb') as text_f:
    text_match_model = pickle.load(text_f)

X = vectorizer.transform(sampled_reviews["clean"])
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

test_df = pd.read_csv(
    r"E:\Hackathon\Research paper search\test_8iecVfC\test.csv",
    encoding='utf-8')
train_df = pd.read_csv(
    r"E:\Hackathon\Research paper search\train_tGmol3O\train.csv",
    encoding='utf-8')
pd.set_option('max_columns', None)

test_df["TITLE"] = hero.clean(test_df["TITLE"])
train_df["TITLE"] = hero.clean(train_df["TITLE"])
test_df["ABSTRACT"] = hero.clean(test_df["ABSTRACT"])
train_df["ABSTRACT"] = hero.clean(train_df["ABSTRACT"])
train_df["Content"] = train_df["TITLE"] + train_df["ABSTRACT"]
test_df["Content"] = test_df["TITLE"] + test_df["ABSTRACT"]
train_df["Content"] = hero.clean(train_df["Content"])
train_df_x = train_df["Content"]
test_df_x = test_df["Content"]
train_df_y = train_df.iloc[:, [3, 4, 5, 6, 7, 8]]

stp = string.punctuation
# stp = "!”#$%&’()*+,-./:;<=>?@[]^_`{|}~\\"
print(fig.renderText('Lyrics Scraper'))
print("Hi! I am a bot that can guess the artist from the lyrics.")
print("Test me! But let me prepare first. Enter the name of an artist:")
artist_input = []
user_input = input("-> ")
print("Enter the name of the second artist:")
user_input2 = input("-> ")
ask_user()

df = pd.DataFrame()
df["artist"] = artist_name
df["lyrics"] = lyrics
df['lyrics_clean'] = df['lyrics'].replace(r'\n', ' ', regex=True)
# clean the newline-stripped text rather than discarding it
df["lyrics_clean"] = hero.clean(df['lyrics_clean'])

X = df['lyrics_clean']
y = df['artist']
tfv = TfidfVectorizer()
X_tfv = tfv.fit_transform(X)
X_vec = pd.DataFrame(X_tfv.todense(), columns=tfv.get_feature_names())
mnb = MultinomialNB()
mnb.fit(X_vec, y)
mnb_score = mnb.score(X_vec, y)


def play():
    text = []
    print("Let's play! Write some lyrics:")
    'redmi note 9', 'oneplus 7t pro', 'nokia 5.3', 'Samsung Galaxy M21',
    'Apple iPhone 11', 'Vivo Y20', 'Redmi 8A', 'OPPO A5', 'OnePlus 8',
    'Samsung S10'
]

# store the mobile phone reviews in the list reviews
reviews = []
for i in range(len(mobile_phones)):
    reviews.append(
        pd.read_csv(f"../reviews/{mobile_phones[i]}_review.csv"))

# custom pipeline to clean the data
custom_pipeline = [
    preprocessing.fillna, preprocessing.lowercase,
    preprocessing.remove_whitespace, preprocessing.remove_angle_brackets,
    preprocessing.remove_html_tags, preprocessing.remove_digits,
    preprocessing.remove_stopwords, preprocessing.remove_diacritics,
    preprocessing.remove_round_brackets, preprocessing.remove_square_brackets,
    preprocessing.remove_curly_brackets, preprocessing.remove_punctuation
]

# clean each phone review in reviews
for r in reviews:
    r.review = hero.clean(r.review, custom_pipeline)

# lemmatize reviews
for r in reviews:
    r.review = r.review.apply(lemmatization)

# save clean reviews
joblib.dump(reviews, '../inputs/mobile_reviews.pkl')
def vectorize(df: pd.DataFrame, text_col: str, kwargs: dict):
    data = texthero.clean(df[text_col], TEXTHERO_FILTER)
    vectorizer = TfidfVectorizer(**kwargs)
    data_mat = vectorizer.fit_transform(data)
    return data_mat, vectorizer
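# Usage sketch for vectorize(); TEXTHERO_FILTER is defined elsewhere in that
# project, so a stand-in pipeline of texthero preprocessing functions is used
# here (this works when it lives in the same module as vectorize), and the
# DataFrame is illustrative.
import pandas as pd
import texthero
from texthero import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

TEXTHERO_FILTER = [
    preprocessing.fillna, preprocessing.lowercase,
    preprocessing.remove_punctuation
]
demo = pd.DataFrame({'content': ['Great game!', 'Too many ads.']})
mat, vec = vectorize(demo, 'content', {'min_df': 1})
print(mat.shape)  # (2, n_terms)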
def _cleanText(self, text) -> str:
    textSeries = Series([text])
    textSeries = hero.clean(textSeries)
    text = Series.to_string(textSeries, index=False)
    return text
def apply_nlp_match_police_related(
        df,
        model_file='arrest_police_flag_classification',
        known_description='charge_1_description',
        known_mapping='charge_1_description_police_related'):
    model_path = os.sep.join([MODELS_FOLDER, model_file])
    if not os.path.exists(model_path):
        logging.info(
            'apply_nlp_match_police_related() NLP model not found, learning a model for classification.'
        )
        known_classifications = df[[known_description,
                                    known_mapping]].copy(deep=True)
        known_classifications = known_classifications.dropna()
        known_classifications = known_classifications.reset_index(drop=True)
        known_classifications = known_classifications.rename(
            columns={
                known_description: 'description_original',
                known_mapping: 'category'
            })
        known_classifications['category'] = known_classifications[
            'category'].map({
                False: 0,
                True: 1
            })
        known_classifications['description_cleaned'] = hero.clean(
            known_classifications['description_original'],
            pipeline=Config.text_pipeline)
        x_train, x_test, y_train, y_test = tts(
            known_classifications[['description_cleaned']],
            known_classifications['category'],
            test_size=0.3,
            shuffle=True)
        logging.info('Fit Train Predict Model')
        model = Config.nlp_ppl.fit(x_train['description_cleaned'], y_train)
        y_pred = model.predict(x_test['description_cleaned'])
        y_true = y_test.tolist()
        acc = accuracy_score(y_true, y_pred)
        logging.info(f'Accuracy Score is {acc}')
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        logging.info(f'CM tn {tn}, fp {fp}, fn {fn}, tp {tp}')
        plt.figure()
        plot_confusion_matrix(model, x_test['description_cleaned'], y_true)
        plt.title(f'Confusion Matrix for Police Related Flag Classification.'
                  f'\nOverall Accuracy Score is {acc}'
                  f'. Train Size={len(x_train)}. Test Size={len(x_test)}.')
        cm_matrix_plot = os.sep.join(
            [MODELS_FOLDER, 'arrest_police_related_confusion_matrix.png'])
        plt.tight_layout()
        plt.savefig(cm_matrix_plot)
        plt.show()
        joblib.dump(model, model_path)
        logging.info(
            f'Saving Police Related Classification Model to {model_path}')
    else:
        logging.info(
            f'Found arrest classification model for police related flag at {model_path}'
        )
        model = joblib.load(model_path)
    logging.info('Applying NLP Model.')
    # create a nested list to iterate through
    target_columns = list(
        zip(Config.charge_columns, Config.police_related_flags))
    # run nlp match
    for charge_description, category in target_columns:
        df['flag'] = ~df[charge_description].isna() & df[category].isna()
        start_counts = df['flag'].value_counts()
        logging.info(
            f'-- In {category}, there are {start_counts[True]} to classify.')
        temp = hero.clean(df[df['flag'] == True][charge_description].copy(),
                          pipeline=Config.text_pipeline)
        idx = temp.index.values.tolist()
        predictions = model.predict(temp)
        predictions = [False if i == 0 else True for i in predictions]
        df[category].update(
            pd.Series(predictions, name=charge_description, index=idx))
        df['flag'] = ~df[charge_description].isna() & df[category].isna()
        end_counts = df['flag'].value_counts()
        try:
            logging.info(
                f'There are {end_counts[True]} charges left to classify')
        except KeyError:
            logging.info('Classification Status 100%.')
    df = df.drop(columns=['flag'])
    return df
encoding="ISO-8859-1") test = pd.read_csv(config.TEST, sep="\t", names=['label', 'text'], encoding="ISO-8859-1") dev = pd.read_csv(config.DEV, sep="\t", names=['label', 'text'], encoding="ISO-8859-1") #combine train,test and dev data for data cleaning final_dataset = pd.concat([train, dev, test], axis=0) #create custom pipeline to clean the dataset custom_pipeline = [ preprocessing.fillna, preprocessing.lowercase, preprocessing.remove_whitespace, preprocessing.remove_punctuation, preprocessing.remove_stopwords, preprocessing.remove_digits, preprocessing.remove_urls ] #copy dataset to df to clean it df = final_dataset.copy() #claen text data in the dataset df['text'] = hero.clean(df['text'], custom_pipeline) #lemmatize reviews df.text = df.text.apply(lemmatization) #save the cleaned dataset df.to_csv("../inputs/data.csv", index=False)
import lib.data_processing as lib
import importlib
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import texthero as hero
from collections import Counter

df = pd.read_csv('./bias_data/bias_data/transcripts/transcripts.csv')
print(df.shape)
df = df.iloc[:2000, :]
df['clean_text'] = hero.clean(df['transcript'])
df['tfidf_clean_text'] = hero.tfidf(df['clean_text'], max_features=200)
df['pca'] = hero.pca(df['tfidf_clean_text'], 3)
# print(df.head(5))
print(Counter(list(df['host'])).keys())
print(Counter(list(df['host'])).values())
# print(list(df['clean_text'])[:100])
hostNum = len(Counter(list(df['host'])).values())

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x = list(x[0] for x in df['pca'])
y = list(x[1] for x in df['pca'])
z = list(x[2] for x in df['pca'])
maxLen = max([len(x) for x in df['host']])
c = []
nameMapping = dict()
for i in df['host']:
from pathlib import Path

sys.path.append(str(Path(__file__).parents[2]))
from kaggle_shopee.factories.config_factory import ConfigFactory, EnvEnum
from kaggle_shopee.factories.data_factory import Data, DataFactory
from kaggle_shopee.factories.preprocessing import Pp
from kaggle_shopee.utils.args_util import ArgsUtil

args = ArgsUtil.get_args(EnvEnum.LOCAL, "exp017", [])
config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, False)
data = DataFactory.load_data(config)
data, config = Pp.main(data, config)
# %%
import pycld2
import texthero as hero

data.train["title_cleaned"] = hero.clean(data.train["title"])
data.train["title_lang"] = (
    data.train["title"].fillna("").map(lambda x: pycld2.detect(x)[2][0][1]))
# %%
data.train[["title", "title_cleaned"]].sample(10)
# %%
data.train[~data.train["title"].map(lambda x: "\\" in x)].sample(10)[[
    "title", "title_cleaned"
]]
custom_pipeline = [
    preprocessing.fillna, preprocessing.lowercase,
    preprocessing.remove_stopwords, preprocessing.remove_digits,
    preprocessing.remove_whitespace
]

import sys

GAME = sys.argv[1]
if len(sys.argv) > 2:
    if sys.argv[2] == 'all':
        RATING = 'all'
    else:
        RATING = max(int(sys.argv[2]), 1)
else:
    RATING = 1

df = pd.read_csv(f'./clean/clean-{GAME}.csv')
df['clean_text'] = hero.clean(df['content'], custom_pipeline)


def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([
            feature_names[i]
            for i in topic.argsort()[:-no_top_words - 1:-1]
        ]))


vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
text_df = df[df.score == RATING] if RATING != 'all' else df
text_count = vectorizer.fit_transform(text_df['clean_text'])
lda = LatentDirichletAllocation(learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(text_count)
display_topics(lda, vectorizer.get_feature_names(), 10)
def make_nlp_classification_model_charge_descriptions(
        model_name_charge_classification,
        df=None,
        filename='arrests_redacted.bz2',
        known_mapping='charge_1_description_category_micro'):
    model_path_charge_classification = os.sep.join(
        [MODELS_FOLDER, model_name_charge_classification])
    if os.path.exists(model_path_charge_classification):
        logging.info(
            f'Found existing model for charge description classification, loading it from {model_path_charge_classification}'
        )
        model = joblib.load(model_path_charge_classification)
        return model
    else:
        logging.info(
            f'Did not find NLP model for charge description classification at {model_path_charge_classification}, starting NLP model training pipeline.'
        )
        if df is None:
            data_file = os.sep.join([DATA_FOLDER, filename])
            logging.info(f'Starting NLP Pipeline from {data_file}')
            df = pd.read_pickle(data_file)
        known_classifications = df[['charge_1_description',
                                    known_mapping]].copy()
        known_classifications = known_classifications.dropna()
        known_classifications = known_classifications.reset_index(drop=True)
        known_classifications = known_classifications.rename(
            columns={
                'charge_1_description': 'description_original',
                'charge_1_description_category_micro': 'category'
            })
        known_classifications['description_cleaned'] = hero.clean(
            known_classifications['description_original'],
            pipeline=Config.text_pipeline)
        x_train, x_test, y_train, y_test = tts(
            known_classifications[['description_cleaned']],
            known_classifications['category'],
            test_size=0.3,
            shuffle=True)
        logging.info('Fit Train Predict Model')
        model = Config.nlp_ppl.fit(x_train['description_cleaned'], y_train)
        y_pred = model.predict(x_test['description_cleaned'])
        y_true = y_test.tolist()
        acc = accuracy_score(y_true, y_pred)
        logging.info('==== Model Results')
        logging.info(f'==== Accuracy Score is {acc}')
        labels = df[known_mapping].dropna().astype('str').unique().tolist()
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        cm_df = pd.DataFrame(cm, columns=labels, index=labels)
        cm_matrix_data = os.sep.join(
            [MODELS_FOLDER, 'arrest_charge_descr_confusion_matrix.csv'])
        cm_df.to_csv(cm_matrix_data)
        plt.figure(figsize=(15, 15))
        cmap = plt.cm.get_cmap('viridis')
        plot_confusion_matrix(
            model,
            x_test['description_cleaned'],
            y_true,
            display_labels=labels,
            # normalize='all',
            include_values=False,
            xticks_rotation=30,
            cmap=cmap)
        plt.yticks(fontsize="x-small")
        plt.xticks(fontsize="xx-small")
        cm_matrix_plot = os.sep.join(
            [MODELS_FOLDER, 'arrest_charge_descr_confusion_matrix.png'])
        plt.title(
            f'Confusion Matrix for Arrest Charge Description Classification Model\nOverall Accuracy is {acc}. Train Size={len(x_train)} Test Size={len(x_test)}'
        )
        plt.tight_layout()
        plt.savefig(cm_matrix_plot)
        plt.show()
        joblib.dump(model, model_path_charge_classification)
        logging.info(f'Saving Model to {model_path_charge_classification}')
        return model
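# A condensed, self-contained sketch of the train/evaluate pattern above, with
# a toy scikit-learn text pipeline standing in for Config.nlp_ppl and invented
# toy data standing in for the arrest records; everything here is illustrative.
import pandas as pd
import texthero as hero
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score

toy = pd.DataFrame({
    'description_original': ['THEFT OVER $500', 'theft of property',
                             'DUI - alcohol', 'driving under the influence'] * 5,
    'category': ['theft', 'theft', 'dui', 'dui'] * 5,
})
toy['description_cleaned'] = hero.clean(toy['description_original'])
x_train, x_test, y_train, y_test = tts(toy['description_cleaned'],
                                       toy['category'],
                                       test_size=0.3,
                                       shuffle=True)
nlp_ppl = make_pipeline(TfidfVectorizer(), LogisticRegression())
model = nlp_ppl.fit(x_train, y_train)
print(accuracy_score(y_test, model.predict(x_test)))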