def get_text_features_tfidf(input_df: pd.DataFrame, text_col: str, n_comp=10, clean=True):
    vectorizer = make_pipeline(
        TfidfVectorizer(),
        make_union(
            TruncatedSVD(n_components=n_comp, random_state=42),
            NMF(n_components=n_comp, random_state=42),
            make_pipeline(
                BM25Transformer(use_idf=True, k1=2.0, b=0.75),
                TruncatedSVD(n_components=n_comp, random_state=42)),
            n_jobs=1))
    X = input_df[text_col].fillna('')
    if clean:
        pipeline = [
            hero.preprocessing.fillna,
            hero.preprocessing.remove_digits,
            hero.preprocessing.remove_punctuation,
            hero.preprocessing.remove_diacritics,
            hero.preprocessing.remove_whitespace
        ]
        # hero.clean returns a new Series, so the result must be assigned back
        X = hero.clean(X, pipeline=pipeline)
    X = vectorizer.fit_transform(X).astype(np.float32)
    output_df = pd.DataFrame(
        X,
        columns=[f'{text_col}_tfidf_svd_{i}' for i in range(n_comp)] +
                [f'{text_col}_tfidf_nmf_{i}' for i in range(n_comp)] +
                [f'{text_col}_tfidf_bm25_{i}' for i in range(n_comp)])
    return output_df
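# A minimal, self-contained sketch of the same pattern (TF-IDF followed by SVD
# components as features), assuming only pandas and scikit-learn are installed;
# BM25Transformer is omitted because it is not part of scikit-learn, and the
# data below is illustrative only.
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

demo_df = pd.DataFrame(
    {'summary': ['data science jobs', 'machine learning jobs', 'cooking recipes']})
svd_pipe = make_pipeline(TfidfVectorizer(),
                         TruncatedSVD(n_components=2, random_state=42))
svd_feats = svd_pipe.fit_transform(demo_df['summary'])  # shape: (3, 2)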
def indeed_postprocess(i_df, query_term, query_jobtype, verbose=False, shorten_links=False):
    print("Starting postprocess - ", datetime.now())
    # apply texthero cleaning
    i_df["titles"] = hero.clean(i_df["titles"])
    i_df["summary"] = hero.clean(i_df["summary"])
    # use bit.ly to shorten links
    if shorten_links:
        try:
            len(i_df["short_link"])  # raises KeyError if the column is missing
            print("found values for short_link, not re-creating")
        except KeyError:
            print("no values exist for short_link, creating them now")
            # a random delay avoids overloading the API; max runtime is 5 s * num_rows
            i_df["short_link"] = i_df["links"].apply(shorten_URL_bitly)
    else:
        i_df["short_link"] = "not_created"
    # save file to excel
    rn = datetime.now()
    i_PP_date = rn.strftime("_%m.%d.%Y-%H-%M_")
    i_df["date_pulled"] = rn.strftime("%m.%d.%Y")
    i_df["time_pulled"] = rn.strftime("%H:%M:%S")
    out_name = ("JS_DB_" + "query=[term(s)=" + query_term + ", type=" +
                query_jobtype + "]" + i_PP_date + ".xlsx")
    i_df.to_excel(out_name)
    if verbose:
        print("Saved {} - ".format(out_name), datetime.now())
    # download if requested
    return i_df
def preprocess(self, document, pipeline=None, add_preprocess=None):
    """
    Clean `document` with the given pipeline.

    If `pipeline` is None, texthero's default cleaning pipeline is used,
    which applies the following functions:

    1. :meth:`texthero.preprocessing.fillna`
    2. :meth:`texthero.preprocessing.lowercase`
    3. :meth:`texthero.preprocessing.remove_digits`
    4. :meth:`texthero.preprocessing.remove_html_tags`
    5. :meth:`texthero.preprocessing.remove_punctuation`
    6. :meth:`texthero.preprocessing.remove_diacritics`
    7. :meth:`texthero.preprocessing.remove_stopwords`
    8. :meth:`texthero.preprocessing.remove_whitespace`

    Any callables in `add_preprocess` are appended to the pipeline.
    """
    if pipeline is None:  # fall back to the default pipeline
        pipeline = heropreprocess.get_default_pipeline()
    if add_preprocess is not None:
        for preprocess in add_preprocess:
            pipeline.append(preprocess)
    clean_text = hero.clean(document, pipeline=pipeline)
    return clean_text
def cleaning_text(df_name):
    '''
    All the steps of preprocessing
    :param df_name: name of the df on which the content column must be preprocessed
    :return: the df with a "clean_content" column
    '''
    # delete pseudos (handles) starting with @
    df_name['clean_content'] = df_name['content'].apply(delete_pseudo)
    # clean method from texthero
    df_name['clean_content'] = hero.clean(df_name['clean_content'])
    # delete stopwords with texthero
    default_stopwords = stopwords.DEFAULT
    custom_stopwords = default_stopwords.union(
        set([
            "feel", "feeling", "im", "get", "http", "ive", "go", "day",
            "com", "got", "see", "4pm"  # a comma was missing, silently concatenating "see4pm"
        ]))
    df_name['clean_content'] = hero.remove_stopwords(df_name['clean_content'],
                                                     custom_stopwords)
    # remove urls
    df_name['clean_content'] = hero.remove_urls(df_name['clean_content'])
    # remove angle brackets
    df_name['clean_content'] = hero.remove_angle_brackets(
        df_name['clean_content'])
    # remove digits
    df_name['clean_content'] = hero.preprocessing.remove_digits(
        df_name['clean_content'], only_blocks=False)
    return df_name
def get_wordcloud(all_comments):
    df = pd.DataFrame(all_comments, columns=['text'])
    df['clean_text'] = hero.clean(df['text'])
    figure = hero.visualization.wordcloud(df['clean_text'], return_figure=True)
    # figure.savefig('plot.png')
    return figure, df
def _clean(self, series: pd.Series) -> pd.Series:
    custom_pipeline = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_digits,
        preprocessing.remove_punctuation,
        preprocessing.remove_diacritics,
        preprocessing.remove_whitespace
    ]
    return texthero.clean(series, pipeline=custom_pipeline)
def clean_text(raw_text):
    """
    Example preprocessing with texthero.
    Installing the nltk library makes its corpora available for stopword removal.
    """
    import os
    import texthero as hero
    clean_text = hero.clean(raw_text,
                            pipeline=[
                                hero.preprocessing.fillna,
                                hero.preprocessing.lowercase,
                                hero.preprocessing.remove_digits,
                                hero.preprocessing.remove_punctuation,
                                hero.preprocessing.remove_diacritics,
                                hero.preprocessing.remove_stopwords
                            ])
    # visualize with a word cloud
    hero.visualization.wordcloud(clean_text,
                                 colormap='viridis',
                                 background_color='white')
    import nltk
    nltk.download('stopwords')
    os.listdir(os.path.expanduser('~/nltk_data/corpora/stopwords/'))
    # use both English and Dutch stopwords
    custom_stopwords = nltk.corpus.stopwords.words(
        'dutch') + nltk.corpus.stopwords.words('english')
    # add custom stopword removal to the preprocessing pipeline
    apply_stopword_text = hero.clean(
        raw_text,
        pipeline=[
            hero.preprocessing.fillna,
            hero.preprocessing.lowercase,
            hero.preprocessing.remove_digits,
            hero.preprocessing.remove_punctuation,
            hero.preprocessing.remove_diacritics,
            lambda x: hero.preprocessing.remove_stopwords(
                x, stopwords=custom_stopwords)
        ])
    return apply_stopword_text
def clean_by_hero(x):
    custom_pipeline = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_digits,
        preprocessing.remove_punctuation,
        preprocessing.remove_diacritics,
        preprocessing.remove_whitespace,
        preprocessing.remove_stopwords
    ]
    return hero.clean(x, custom_pipeline)
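# A quick usage sketch for the pipeline-based cleaning above, assuming texthero
# is installed and imported as in the snippet (hero, preprocessing); the sample
# Series is illustrative.
import pandas as pd

sample = pd.Series(["  Héllo, World 123!  ", None])
print(clean_by_hero(sample))
# expected (roughly): 0    hello world
#                     1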
def preprocess(self, input_text):
    """Preprocessing."""
    clean_text = hero.clean(input_text,
                            pipeline=[
                                hero.preprocessing.fillna,
                                hero.preprocessing.lowercase,
                                hero.preprocessing.remove_digits,
                                hero.preprocessing.remove_punctuation,
                                hero.preprocessing.remove_diacritics,
                                hero.preprocessing.remove_stopwords
                            ])
    return clean_text
def cleaner(df, col):
    """
    :param df: a pandas dataframe
    :param col: a string (name of col to clean)
    :return: a dataframe cleaned with the texthero pipeline
    """
    # ignore FutureWarning re: texthero's default call to regex
    warnings.filterwarnings("ignore")
    pipeline = [
        pp.fillna, pp.remove_digits, pp.lowercase, pp.remove_punctuation,
        pp.remove_diacritics, pp.remove_stopwords, pp.remove_whitespace,
        pp.stem
    ]
    df[col] = hero.clean(df[col], pipeline=pipeline)
    return df
def process_resume(doc):
    text = ""
    for page in doc:
        text = text + str(page.getText())
    # Join the extracted lines into a single string
    tx = " ".join(text.split('\n'))
    # Remove unnecessary punctuation
    cleaned_tx = re.sub(r'[§_|]', '', tx)
    # Extract and remove phone numbers
    pattern = re.compile(
        r'([+(]?\d+[)\-]?[ \t\r\f\v]*[(]?\d{2,}[()\-]?[ \t\r\f\v]*\d{2,}[()\-]?[ \t\r\f\v]*\d*[ \t\r\f\v]*\d*[ \t\r\f\v]*)'
    )
    match = pattern.findall(cleaned_tx)
    match = [
        re.sub(r'[,.]', '', el) for el in match
        if len(re.sub(r'[()\-.,\s+]', '', el)) > 6
    ]
    match = [re.sub(r'\D$', '', el).strip() for el in match]
    phoneNum_match = [el for el in match if len(re.sub(r'\D', '', el)) <= 15]
    phoneNum_match = ' '.join([str(elem) for elem in phoneNum_match])
    cleaned_tx = cleaned_tx.replace(phoneNum_match, '')
    # Extract and remove email addresses
    email_pattern = re.compile(r'\S*@\S*')
    email_match = email_pattern.findall(cleaned_tx)
    email_match = ' '.join([str(elem) for elem in email_match])
    cleaned_tx = cleaned_tx.replace(email_match, '')
    # Further cleaning using texthero
    resume_df = pd.DataFrame([cleaned_tx], columns=['resume'])
    df2 = pd.DataFrame()
    custom_pipeline = [
        preprocessing.lowercase,
        preprocessing.remove_digits,
        preprocessing.remove_punctuation,
        preprocessing.remove_diacritics,
        preprocessing.remove_whitespace
        # preprocessing.stem
    ]
    df2['cleaned'] = hero.clean(resume_df['resume'], pipeline=custom_pipeline)
    pd.set_option('max_colwidth', 100000)
    cleaned_resume = df2['cleaned'].to_string(index=False)
    return cleaned_resume
def text_normalization(text):
    # use both English and Dutch stopwords
    custom_stopwords = nltk.corpus.stopwords.words(
        'dutch') + nltk.corpus.stopwords.words('english')
    x = hero.clean(text,
                   pipeline=[
                       hero.preprocessing.fillna,
                       hero.preprocessing.lowercase,
                       hero.preprocessing.remove_digits,
                       hero.preprocessing.remove_punctuation,
                       hero.preprocessing.remove_diacritics,
                       lambda x: hero.preprocessing.remove_stopwords(
                           x, stopwords=custom_stopwords)
                   ])
    return x
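# Usage sketch for text_normalization, assuming the nltk stopword corpora have
# been downloaded first; the input Series is illustrative. Dutch "de" and
# English "and"/"the" are stripped along with digits and punctuation.
import nltk
nltk.download('stopwords', quiet=True)
import pandas as pd

print(text_normalization(pd.Series(["De and the 123 Apple!"])))
# expected (roughly): 0    apple   -- with leftover spaces, since the
# pipeline does not end with remove_whitespace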
def preProcess(crawledData: list, models, headers):
    predictions = []
    for i in range(len(crawledData)):
        series = pd.Series(crawledData[i])
        series = hero.clean(series)
        toString = pd.Series.to_string(series, index=False)
        predictions.append(ModelUtils.fastPredict(toString, models))
    resFrame = pd.DataFrame()
    for i in range(len(predictions)):
        for j in range(len(predictions[i])):
            dataframe = headers.copy()
            new = predictions[i][j].replace('__label__', '')
            dataframe[new] = [1]
            dataframe["TEXT"] = crawledData[i]
            resFrame = pd.concat([resFrame, dataframe], ignore_index=True)
    return resFrame
def run_nlp_match(target_column, df, model):
    charge_description, categories = target_column
    logging.info(f'- Starting NLP match for {charge_description}')
    results = []
    for category in categories:
        if 'micro' in category:
            # do mapping where there is a description but no category mapping
            df['flag'] = ~df[charge_description].isna() & df[category].isna()
            start_counts = df['flag'].value_counts()
            logging.info(
                f'-- In {category}, there are {start_counts[True]} to classify.'
            )
            temp = hero.clean(
                df[df['flag'] == True][charge_description].copy(),
                pipeline=Config.text_pipeline)
            idx = temp.index.values.tolist()
            predictions = model.predict(temp)
            df[category].update(
                pd.Series(predictions, name=charge_description, index=idx))
            df['flag'] = ~df[charge_description].isna() & df[category].isna()
            end_counts = df['flag'].value_counts()
            try:
                logging.info(
                    f'There are {end_counts[True]} charges left to classify')
            except KeyError:
                logging.info(
                    f'Classification Status 100%. Started with {start_counts[True]} to map. Ended with {end_counts[False]} mapped.'
                )
            result = tuple((category, df[category]))
            results.append(result)
    # currently returning the first element of a list of (name, series) pairs
    return results[0]
def upload():
    if request.method == 'POST':
        # Check if the post request has the file part
        if 'file' not in request.files:
            flash('No file attached in request')
            return redirect(request.url)
        file = request.files['file']
        # if user does not select file, browser also
        # submits an empty part without filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        elif not allowed_file(file.filename):
            abort(400, 'Incorrect file extension')
            flash('Incorrect file extension. Must be PDF!')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(
                os.path.join(BASE_DIR, app.config['UPLOAD_FOLDER'], filename))
            resume_cleaned = final_resume_clean(
                os.path.join(app.config['UPLOAD_FOLDER'], filename), filename)
            jobs_df = pd.read_csv('cleaned_data.csv')
            global df
            df = get_recommendations(resume_cleaned, jobs_df)
            # Remove diacritics from original job descriptions
            custom_pipeline = [preprocessing.remove_diacritics]
            df['Description'] = hero.clean(df['Description'],
                                           pipeline=custom_pipeline)
            return render_template("recommendation.html",
                                   column_names=df.columns.values,
                                   target_column="Company",
                                   hide_column="Job Description",
                                   row_data=list(df.values.tolist()),
                                   zip=zip)
def name2feats(df, column='Name', methods=['pca', 'kmeans', 'ngram']):
    # preprocessing
    custom_pipeline = [
        hero.preprocessing.fillna,
        hero.preprocessing.lowercase,
        hero.preprocessing.remove_digits,
        hero.preprocessing.remove_punctuation,
        hero.preprocessing.remove_diacritics,
        hero.preprocessing.remove_whitespace
    ]
    df['clean_name'] = hero.clean(df[column], pipeline=custom_pipeline)
    # tfidf -> pca
    if 'pca' in methods:
        df['pca_name'] = hero.tfidf(df['clean_name'], max_features=200)
        df['pca_name'] = hero.pca(df['pca_name'], n_components=10)
        for i in np.arange(len(df['pca_name'].values[0])):
            df[f'tfidf_pca_{column}{i}'] = df['pca_name'].apply(lambda x: x[i])
        df.drop(columns=['pca_name'], inplace=True)
    # tfidf -> kmeans
    if 'kmeans' in methods:
        df[f'tfidf_kmeans_{column}'] = hero.tfidf(df['clean_name'],
                                                  max_features=200)
        df[f'tfidf_kmeans_{column}'] = hero.kmeans(
            df[f'tfidf_kmeans_{column}'], n_clusters=10)
    # n-gram
    if 'ngram' in methods:
        for n in [2, 3]:
            name_grams = df[column].apply(lambda x: line_ngram(x, n))
            grams = [x for row in name_grams for x in row if len(x) > 0]
            top_grams = pd.Series(grams).value_counts().head(20).index
            df[f'{column}_in_top_{n}gram'] = name_grams.map(
                lambda x: any([i for i in x if i in top_grams]))
    df.drop(columns=['clean_name'], inplace=True)
    return df
def clean_dataframe(df):
    df['clean_reviewText'] = hero.clean(df['reviewText'])
    return df
f"{proposals_directory}/{target_state}/{target_city}/{file_name}" ).read() custom_stop_words = [ "feira", "santana", "municipio", "municipal", "município", "municipais", "cidade", "publico", "publica", ] cleaned_text = remove_portuguese_stopwords(content, custom_stop_words) cleaned_text = pd.Series(cleaned_text) cleaned_text = hero.clean(cleaned_text, custom_pipeline) candidates_and_proposals[candidate_name] = cleaned_text for candidate, proposal in candidates_and_proposals.items(): print(f"------------------------ {candidate}") text = " ".join(proposal) wordcloud = WordCloud( background_color="white", width=2000, height=800, colormap="PuOr", collocations=False, ).generate(text) fig = plt.figure(figsize=(20, 10)) plt.imshow(wordcloud, interpolation="bilinear") plt.axis("off")
from model.utils import vectorize
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc, precision_score, accuracy_score, recall_score, f1_score, confusion_matrix
import pandas as pd
from utils import vectorize, TEXTHERO_FILTER  # note: shadows the earlier vectorize import
import numpy as np
import pickle
from texthero import preprocessing
import texthero
from text_matching_clf import TextMatchingClassifier

reviews = pd.read_csv("./clean/clean-geomdash.csv")
reviews["clean"] = texthero.clean(reviews["content"], pipeline=TEXTHERO_FILTER)
# SAMPLE_PER_CLASS = 250
# sampled_reviews = reviews.groupby('score').apply(lambda x: x.sample(SAMPLE_PER_CLASS))
sampled_reviews = reviews

with open('./model/vectorizer-reviews.pkl', 'rb') as vf:
    vectorizer = pickle.load(vf)
with open('./model/svc-reviews.pkl', 'rb') as svc_f:
    model = pickle.load(svc_f)
with open('./model/text-match-reviews.pkl', 'rb') as text_f:
    text_match_model = pickle.load(text_f)

X = vectorizer.transform(sampled_reviews["clean"])
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

test_df = pd.read_csv(
    r"E:\Hackathon\Research paper search\test_8iecVfC\test.csv",
    encoding='utf-8')
train_df = pd.read_csv(
    r"E:\Hackathon\Research paper search\train_tGmol3O\train.csv",
    encoding='utf-8')
pd.set_option('max_columns', None)

test_df["TITLE"] = hero.clean(test_df["TITLE"])
train_df["TITLE"] = hero.clean(train_df["TITLE"])
test_df["ABSTRACT"] = hero.clean(test_df["ABSTRACT"])
train_df["ABSTRACT"] = hero.clean(train_df["ABSTRACT"])
train_df["Content"] = train_df["TITLE"] + train_df["ABSTRACT"]
test_df["Content"] = test_df["TITLE"] + test_df["ABSTRACT"]
train_df["Content"] = hero.clean(train_df["Content"])
train_df_x = train_df["Content"]
test_df_x = test_df["Content"]
train_df_y = train_df.iloc[:, [3, 4, 5, 6, 7, 8]]

stp = string.punctuation
# stp = "!”#$%&’()*+,-./:;<=>?@[]^_`{|}~\\"
print(fig.renderText('Lyrics Scraper'))
print("Hi! I am a bot that can guess the artist from the lyrics.")
print("Test me! But let me prepare first. Enter the name of an artist:")
artist_input = []
user_input = input("-> ")
print("Enter the name of the second artist:")
user_input2 = input("-> ")
ask_user()

df = pd.DataFrame()
df["artist"] = artist_name
df["lyrics"] = lyrics
df['lyrics_clean'] = df['lyrics'].replace(r'\n', ' ', regex=True)
# clean the newline-stripped text rather than discarding it
df["lyrics_clean"] = hero.clean(df['lyrics_clean'])

X = df['lyrics_clean']
y = df['artist']
tfv = TfidfVectorizer()
X_tfv = tfv.fit_transform(X)
X_vec = pd.DataFrame(X_tfv.todense(), columns=tfv.get_feature_names())
mnb = MultinomialNB()
mnb.fit(X_vec, y)
mnb_score = mnb.score(X_vec, y)


def play():
    text = []
    print("Let's play! Write some lyrics:")
    'redmi note 9', 'oneplus 7t pro', 'nokia 5.3', 'Samsung Galaxy M21',
    'Apple iPhone 11', 'Vivo Y20', 'Redmi 8A', 'OPPO A5', 'OnePlus 8',
    'Samsung S10'
]

# store the mobile phone reviews in the list reviews
reviews = []
for i in range(len(mobile_phones)):
    reviews.append(
        pd.read_csv(f"../reviews/{mobile_phones[i]}_review.csv"))

# custom pipeline to clean the data
custom_pipeline = [
    preprocessing.fillna, preprocessing.lowercase,
    preprocessing.remove_whitespace, preprocessing.remove_angle_brackets,
    preprocessing.remove_html_tags, preprocessing.remove_digits,
    preprocessing.remove_stopwords, preprocessing.remove_diacritics,
    preprocessing.remove_round_brackets, preprocessing.remove_square_brackets,
    preprocessing.remove_curly_brackets, preprocessing.remove_punctuation
]

# clean each phone review in reviews
for r in reviews:
    r.review = hero.clean(r.review, custom_pipeline)

# lemmatize reviews
for r in reviews:
    r.review = r.review.apply(lemmatization)

# save clean reviews
joblib.dump(reviews, '../inputs/mobile_reviews.pkl')
def vectorize(df: pd.DataFrame, text_col: str, kwargs: dict):
    data = texthero.clean(df[text_col], TEXTHERO_FILTER)
    vectorizer = TfidfVectorizer(**kwargs)
    data_mat = vectorizer.fit_transform(data)
    return data_mat, vectorizer
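# Usage sketch for vectorize(); TEXTHERO_FILTER is defined elsewhere in that
# project, so a stand-in pipeline of texthero preprocessing functions is used
# here (this works when it lives in the same module as vectorize), and the
# DataFrame is illustrative.
import pandas as pd
import texthero
from texthero import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

TEXTHERO_FILTER = [
    preprocessing.fillna, preprocessing.lowercase,
    preprocessing.remove_punctuation
]
demo = pd.DataFrame({'content': ['Great game!', 'Too many ads.']})
mat, vec = vectorize(demo, 'content', {'min_df': 1})
print(mat.shape)  # (2, n_terms)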
def _cleanText(self, text) -> str:
    textSeries = Series([text])
    textSeries = hero.clean(textSeries)
    text = Series.to_string(textSeries, index=False)
    return text
def apply_nlp_match_police_related(
        df,
        model_file='arrest_police_flag_classification',
        known_description='charge_1_description',
        known_mapping='charge_1_description_police_related'):
    model_path = os.sep.join([MODELS_FOLDER, model_file])
    if not os.path.exists(model_path):
        logging.info(
            'apply_nlp_match_police_related() NLP model not found, learning a model for classification.'
        )
        known_classifications = df[[known_description,
                                    known_mapping]].copy(deep=True)
        known_classifications = known_classifications.dropna()
        known_classifications = known_classifications.reset_index(drop=True)
        known_classifications = known_classifications.rename(
            columns={
                known_description: 'description_original',
                known_mapping: 'category'
            })
        known_classifications['category'] = known_classifications[
            'category'].map({
                False: 0,
                True: 1
            })
        known_classifications['description_cleaned'] = hero.clean(
            known_classifications['description_original'],
            pipeline=Config.text_pipeline)
        x_train, x_test, y_train, y_test = tts(
            known_classifications[['description_cleaned']],
            known_classifications['category'],
            test_size=0.3,
            shuffle=True)
        logging.info('Fit Train Predict Model')
        model = Config.nlp_ppl.fit(x_train['description_cleaned'], y_train)
        y_pred = model.predict(x_test['description_cleaned'])
        y_true = y_test.tolist()
        acc = accuracy_score(y_true, y_pred)
        logging.info(f'Accuracy Score is {acc}')
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        logging.info(f'CM tn {tn}, fp {fp}, fn {fn}, tp {tp}')
        plt.figure()
        plot_confusion_matrix(model, x_test['description_cleaned'], y_true)
        plt.title(f'Confusion Matrix for Police Related Flag Classification.'
                  f'\nOverall Accuracy Score is {acc}'
                  f'. Train Size={len(x_train)}. Test Size={len(x_test)}.')
        cm_matrix_plot = os.sep.join(
            [MODELS_FOLDER, 'arrest_police_related_confusion_matrix.png'])
        plt.tight_layout()
        plt.savefig(cm_matrix_plot)
        plt.show()
        joblib.dump(model, model_path)
        logging.info(
            f'Saving Police Related Classification Model to {model_path}')
    else:
        logging.info(
            f'Found arrest classification model for police related flag at {model_path}'
        )
        model = joblib.load(model_path)
    logging.info('Applying NLP Model.')
    # create a nested list to iterate through
    target_columns = list(
        zip(Config.charge_columns, Config.police_related_flags))
    # run nlp match
    for charge_description, category in target_columns:
        df['flag'] = ~df[charge_description].isna() & df[category].isna()
        start_counts = df['flag'].value_counts()
        logging.info(
            f'-- In {category}, there are {start_counts[True]} to classify.')
        temp = hero.clean(df[df['flag'] == True][charge_description].copy(),
                          pipeline=Config.text_pipeline)
        idx = temp.index.values.tolist()
        predictions = model.predict(temp)
        predictions = [False if i == 0 else True for i in predictions]
        df[category].update(
            pd.Series(predictions, name=charge_description, index=idx))
        df['flag'] = ~df[charge_description].isna() & df[category].isna()
        end_counts = df['flag'].value_counts()
        try:
            logging.info(
                f'There are {end_counts[True]} charges left to classify')
        except KeyError:
            logging.info('Classification Status 100%.')
    df = df.drop(columns=['flag'])
    return df
encoding="ISO-8859-1") test = pd.read_csv(config.TEST, sep="\t", names=['label', 'text'], encoding="ISO-8859-1") dev = pd.read_csv(config.DEV, sep="\t", names=['label', 'text'], encoding="ISO-8859-1") #combine train,test and dev data for data cleaning final_dataset = pd.concat([train, dev, test], axis=0) #create custom pipeline to clean the dataset custom_pipeline = [ preprocessing.fillna, preprocessing.lowercase, preprocessing.remove_whitespace, preprocessing.remove_punctuation, preprocessing.remove_stopwords, preprocessing.remove_digits, preprocessing.remove_urls ] #copy dataset to df to clean it df = final_dataset.copy() #claen text data in the dataset df['text'] = hero.clean(df['text'], custom_pipeline) #lemmatize reviews df.text = df.text.apply(lemmatization) #save the cleaned dataset df.to_csv("../inputs/data.csv", index=False)
import lib.data_processing as lib
import importlib
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import texthero as hero
from collections import Counter

df = pd.read_csv('./bias_data/bias_data/transcripts/transcripts.csv')
print(df.shape)
df = df.iloc[:2000, :]
df['clean_text'] = hero.clean(df['transcript'])
df['tfidf_clean_text'] = hero.tfidf(df['clean_text'], max_features=200)
df['pca'] = hero.pca(df['tfidf_clean_text'], 3)
# print(df.head(5))
print(Counter(list(df['host'])).keys())
print(Counter(list(df['host'])).values())
# print(list(df['clean_text'])[:100])
hostNum = len(Counter(list(df['host'])).values())

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x = list(x[0] for x in df['pca'])
y = list(x[1] for x in df['pca'])
z = list(x[2] for x in df['pca'])
maxLen = max([len(x) for x in df['host']])
c = []
nameMapping = dict()
for i in df['host']:
from pathlib import Path

sys.path.append(str(Path(__file__).parents[2]))
from kaggle_shopee.factories.config_factory import ConfigFactory, EnvEnum
from kaggle_shopee.factories.data_factory import Data, DataFactory
from kaggle_shopee.factories.preprocessing import Pp
from kaggle_shopee.utils.args_util import ArgsUtil

args = ArgsUtil.get_args(EnvEnum.LOCAL, "exp017", [])
config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, False)
data = DataFactory.load_data(config)
data, config = Pp.main(data, config)
# %%
import pycld2
import texthero as hero

data.train["title_cleaned"] = hero.clean(data.train["title"])
data.train["title_lang"] = (
    data.train["title"].fillna("").map(lambda x: pycld2.detect(x)[2][0][1]))
# %%
data.train[["title", "title_cleaned"]].sample(10)
# %%
data.train[~data.train["title"].map(lambda x: "\\" in x)].sample(10)[[
    "title", "title_cleaned"
]]
custom_pipeline = [
    preprocessing.fillna, preprocessing.lowercase,
    preprocessing.remove_stopwords, preprocessing.remove_digits,
    preprocessing.remove_whitespace
]

import sys

GAME = sys.argv[1]
if len(sys.argv) > 2:
    if sys.argv[2] == 'all':
        RATING = 'all'
    else:
        RATING = max(int(sys.argv[2]), 1)
else:
    RATING = 1

df = pd.read_csv(f'./clean/clean-{GAME}.csv')
df['clean_text'] = hero.clean(df['content'], custom_pipeline)


def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([
            feature_names[i]
            for i in topic.argsort()[:-no_top_words - 1:-1]
        ]))


vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
text_df = df[df.score == RATING] if RATING != 'all' else df
text_count = vectorizer.fit_transform(text_df['clean_text'])
lda = LatentDirichletAllocation(learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(text_count)
display_topics(lda, vectorizer.get_feature_names(), 10)
def make_nlp_classification_model_charge_descriptions(
        model_name_charge_classification,
        df=None,
        filename='arrests_redacted.bz2',
        known_mapping='charge_1_description_category_micro'):
    model_path_charge_classification = os.sep.join(
        [MODELS_FOLDER, model_name_charge_classification])
    if os.path.exists(model_path_charge_classification):
        logging.info(
            f'Found existing model for charge description classification, loading it from {model_path_charge_classification}'
        )
        model = joblib.load(model_path_charge_classification)
        return model
    else:
        logging.info(
            f'Did not find NLP model for charge description classification at {model_path_charge_classification}, starting NLP model training pipeline.'
        )
        if df is None:
            data_file = os.sep.join([DATA_FOLDER, filename])
            logging.info(f'Starting NLP Pipeline from {data_file}')
            df = pd.read_pickle(data_file)
        known_classifications = df[['charge_1_description',
                                    known_mapping]].copy()
        known_classifications = known_classifications.dropna()
        known_classifications = known_classifications.reset_index(drop=True)
        known_classifications = known_classifications.rename(
            columns={
                'charge_1_description': 'description_original',
                'charge_1_description_category_micro': 'category'
            })
        known_classifications['description_cleaned'] = hero.clean(
            known_classifications['description_original'],
            pipeline=Config.text_pipeline)
        x_train, x_test, y_train, y_test = tts(
            known_classifications[['description_cleaned']],
            known_classifications['category'],
            test_size=0.3,
            shuffle=True)
        logging.info('Fit Train Predict Model')
        model = Config.nlp_ppl.fit(x_train['description_cleaned'], y_train)
        y_pred = model.predict(x_test['description_cleaned'])
        y_true = y_test.tolist()
        acc = accuracy_score(y_true, y_pred)
        logging.info('==== Model Results')
        logging.info(f'==== Accuracy Score is {acc}')
        labels = df[known_mapping].dropna().astype('str').unique().tolist()
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        cm_df = pd.DataFrame(cm, columns=labels, index=labels)
        cm_matrix_data = os.sep.join(
            [MODELS_FOLDER, 'arrest_charge_descr_confusion_matrix.csv'])
        cm_df.to_csv(cm_matrix_data)
        plt.figure(figsize=(15, 15))
        cmap = plt.cm.get_cmap('viridis')
        plot_confusion_matrix(
            model,
            x_test['description_cleaned'],
            y_true,
            display_labels=labels,
            # normalize='all',
            include_values=False,
            xticks_rotation=30,
            cmap=cmap)
        plt.yticks(fontsize="x-small")
        plt.xticks(fontsize="xx-small")
        cm_matrix_plot = os.sep.join(
            [MODELS_FOLDER, 'arrest_charge_descr_confusion_matrix.png'])
        plt.title(
            f'Confusion Matrix for Arrest Charge Description Classification Model\nOverall Accuracy is {acc}. Train Size={len(x_train)} Test Size={len(x_test)}'
        )
        plt.tight_layout()
        plt.savefig(cm_matrix_plot)
        plt.show()
        joblib.dump(model, model_path_charge_classification)
        logging.info(f'Saving Model to {model_path_charge_classification}')
        return model
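# A condensed, self-contained sketch of the train/evaluate pattern above, with
# a toy scikit-learn text pipeline standing in for Config.nlp_ppl and invented
# toy data standing in for the arrest records; everything here is illustrative.
import pandas as pd
import texthero as hero
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score

toy = pd.DataFrame({
    'description_original': ['THEFT OVER $500', 'theft of property',
                             'DUI - alcohol', 'driving under the influence'] * 5,
    'category': ['theft', 'theft', 'dui', 'dui'] * 5,
})
toy['description_cleaned'] = hero.clean(toy['description_original'])
x_train, x_test, y_train, y_test = tts(toy['description_cleaned'],
                                       toy['category'],
                                       test_size=0.3,
                                       shuffle=True)
nlp_ppl = make_pipeline(TfidfVectorizer(), LogisticRegression())
model = nlp_ppl.fit(x_train, y_train)
print(accuracy_score(y_test, model.predict(x_test)))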