Example #1
def norm_words(df):
    '''
    Tokenizes and normalizes words. Differs from norm_text() because it does not lemmatize
    the words. Used in word embedding models.  Returns df with new columns "tokenized_words" and "normalized_words"
    '''
    df['tokenized_words'] = df['text'].apply(
        lambda x: lucem_illud_2020.word_tokenize(x))
    df['normalized_words'] = df['tokenized_words'].apply(
        lambda x: lucem_illud_2020.normalizeTokens(x, lemma=False))

    return df
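
A minimal usage sketch (assuming lucem_illud_2020 is importable and using a toy DataFrame in place of real data):

import pandas as pd
import lucem_illud_2020

df = pd.DataFrame({'text': ['The cats were sitting on the mats.']})
df = norm_words(df)
# 'tokenized_words' holds the raw tokens; 'normalized_words' holds cleaned but
# unlemmatized tokens ('cats' is not reduced to 'cat'), ready for word embedding models
print(df[['tokenized_words', 'normalized_words']].iloc[0])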
Example #2
def norm_text(df):
    '''
    Tokenizes and normalizes text data.  Returns df with new columns "tokenized_text" and "normalized_text"
    '''
    # Tokenized and normalized texts
    df['tokenized_text'] = df['text'].apply(
        lambda x: lucem_illud_2020.word_tokenize(x))
    df['normalized_text'] = df['tokenized_text'].apply(
        lambda x: lucem_illud_2020.normalizeTokens(x))

    return df
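
norm_text() differs from norm_words() only in that normalizeTokens() runs with its default lemmatizing behavior. A sketch of the contrast on the same toy data:

import pandas as pd
import lucem_illud_2020

df = pd.DataFrame({'text': ['The cats were sitting on the mats.']})
df = norm_text(df)
# 'normalized_text' holds lemmatized tokens (inflected forms reduced to their
# lemmas), unlike the unlemmatized output of norm_words() above
print(df['normalized_text'].iloc[0])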
Example #3
def norm_sent(df):
    '''
    Tokenizes and normalizes sentences. Returns df with new columns "tokenized_sents" and "normalized_sents"
    '''
    # Tokenized and normalized sents
    df['tokenized_sents'] = df['text'].apply(lambda x: [
        lucem_illud_2020.word_tokenize(s)
        for s in lucem_illud_2020.sent_tokenize(x)
    ])
    df['normalized_sents'] = df['tokenized_sents'].apply(
        lambda x:
        [lucem_illud_2020.normalizeTokens(s, lemma=False) for s in x])

    return df
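
A sketch of the nested structure norm_sent() produces: each row of 'normalized_sents' is a list of sentences, and each sentence is itself a list of normalized tokens (exact output depends on normalizeTokens()):

import pandas as pd
import lucem_illud_2020

df = pd.DataFrame({'text': ['First sentence here. Second sentence follows.']})
df = norm_sent(df)
# one inner list per sentence, roughly:
# [['first', 'sentence', ...], ['second', 'sentence', ...]]
for sent in df['normalized_sents'].iloc[0]:
    print(sent)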
Example #4
# assumed imports for this script: pandas, scikit-learn, and the
# lucem_illud_2020 helper package aliased as lucem (alias inferred from the calls below)
import pandas as pd
import sklearn.feature_extraction.text
import lucem_illud_2020 as lucem

# load tweets for all days
days = pd.read_csv(r'.\Classified\all.csv')
days = days.reset_index().drop(['index', 'Unnamed: 0'], axis=1)

#%% subset variables
tweets = days[[
    'screen_name', 'status_id', 'created_at', 'text', 'cap', 'is_retweet',
    'favorite_count', 'retweet_count', 'followers_count', 'friends_count'
]]

# keep only one copy of each tweet text (drops duplicated retweet text)
tweets = tweets.drop_duplicates(subset='text')

#%% tokenize and normalize the text of each tweet
tweets['tokenized_text'] = tweets['text'].apply(
    lambda x: lucem.word_tokenize(x))
tweets['normalized_tokens'] = tweets['tokenized_text'].apply(
    lambda x: lucem.normalizeTokens(x, extra_stop=['amp']))

#%% build a TF-IDF matrix: keep at most 1500 terms that appear in at least
# 100 tweets (min_df) and in no more than half of all tweets (max_df)
# initialize the vectorizer
twTFVectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    max_df=0.5, max_features=1500, min_df=100, stop_words='english', norm='l2')
# fit the vectorizer on the raw tweet text and transform it
twTFVects = twTFVectorizer.fit_transform(tweets['text'])
print(twTFVects.shape)
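
#%% (sketch) inspect the fitted TF-IDF vocabulary as a sanity check
# get_feature_names_out() assumes scikit-learn >= 1.0; older versions use get_feature_names()
import numpy as np
terms = twTFVectorizer.get_feature_names_out()
print(len(terms), 'terms kept, e.g.:', list(terms[:10]))
# idf_ holds the learned inverse document frequencies, aligned with terms;
# the highest-idf entries are the rarest terms that survived the min_df cutoff
rarest = np.argsort(twTFVectorizer.idf_)[-10:]
print('highest-idf (rarest) terms:', [terms[i] for i in rarest])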


#%% create dropMissing function to apply the TF-IDF vocabulary filter
def dropMissing(wordLst, vocab):