Example #1
def transform(self, X, y=None):
    # Tokenize and lemmatize every document in X with the textClean preprocessing pipeline.
    preprocessed_tokens = textClean.pipeline(X.to_list(), multi_gram=[1], lower_case=True, deacc=False,
                                             encoding='utf8', errors='strict', stem_lemma='lemma', tag_drop=[],
                                             nltk_stop=True, stop_word_list=[], check_numbers=False,
                                             word_length=3, remove_consecutives=True)

    return preprocessed_tokens
Example #2
import math

import nltk
import numpy as np
import pandas as pd
# Assumption: `singular_value_decomposition` is numpy's SVD under a project alias;
# the `textClean` and `DocVector` helpers used below come from the project's own
# preprocessing modules and are assumed to already be importable in this scope.
from numpy.linalg import svd as singular_value_decomposition


def lsa_text_extraction(textdoc,
                        smooth=0.4,
                        MIN_DIMENSIONS=3,
                        REDUCTION_RATIO=1.0,
                        topn=5):
    """
    reduction_ratio: used to reduce computation cost: limit diagonal size, when it is 1 it keeps original diagonal size, when it is 0.4 only keep 0.4 * original diagonal size
    smooth: is a factor appened to matrix normalization, small value might cause overfitting and large value might cause underfitting
    """
    ''' document to sentences '''
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    document = tokenizer.tokenize(textdoc)
    ''' generate term freq matrix '''
    assert 0.0 <= smooth < 1.0
    preprocessed_text = textClean.pipeline(document,
                                           multi_gram=[1],
                                           lower_case=True,
                                           deacc=False,
                                           encoding='utf8',
                                           errors='strict',
                                           stem_lemma='lemma',
                                           tag_drop=[],
                                           nltk_stop=True,
                                           stop_word_list=[],
                                           check_numbers=False,
                                           word_length=2,
                                           remove_consecutives=True)
    dictionary = DocVector.generate_corpus_dict(preprocessed_text,
                                                no_below=2,
                                                no_above=0.5,
                                                keep_n=100000)
    doc_vec = DocVector.create_document_vector(preprocessed_text, dictionary)
    tfmatrix = DocVector.get_vocab_matrix(doc_vec, dictionary)
    matrix_copy = tfmatrix.values.T
    '''
    Compute TF metrics for each sentence (column) in the given matrix and normalize
    the tf weights of all terms occurring in a document by the maximum tf in that
    document, according to ntf_{t,d} = a + (1 - a) \frac{tf_{t,d}}{tf_{max}(d)}.

    The smoothing term $a$ damps the contribution of the second term, which may be
    viewed as scaling down tf by the largest tf value in $d$.
    '''
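    # Illustration (not from the original source): with smooth = 0.4 and a sentence
    # column whose raw term frequencies are [3, 1, 0], the maximum tf is 3, so the
    # normalized weights become 0.4 + 0.6 * [3/3, 1/3, 0/3] = [1.0, 0.6, 0.4].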
    max_word_frequencies = np.max(matrix_copy, axis=0)
    rows, cols = matrix_copy.shape
    for row in range(rows):
        for col in range(cols):
            max_word_frequency = max_word_frequencies[col]
            if max_word_frequency != 0:
                frequency = matrix_copy[row, col] / max_word_frequency
                matrix_copy[row, col] = smooth + (1.0 - smooth) * frequency
    ''' get ranks '''
    u, sigma, v_matrix = singular_value_decomposition(matrix_copy,
                                                      full_matrices=False)
    assert len(sigma) == v_matrix.shape[0]
    dimensions = max(MIN_DIMENSIONS, int(len(sigma) * REDUCTION_RATIO))
    powered_sigma = tuple(s**2 if i < dimensions else 0.0
                          for i, s in enumerate(sigma))
    ranks = []
    for column_vector in v_matrix.T:
        rank = sum(s * v**2 for s, v in zip(powered_sigma, column_vector))
        ranks.append(math.sqrt(rank))
    ''' output result '''
    percentile_list = pd.DataFrame({
        'sentence': document,
        'rank': ranks,
    }).sort_values(by='rank', ascending=False)

    output_sentence = [i for i in percentile_list.head(topn)['sentence']]
    return output_sentence
fig.show()


# In[48]:


test = data[(data.date.astype(str) == '2021-06-01') & (data.country == 'India')].reset_index(drop=True)
test.orig_text[5]


# In[49]:


test['explore_text'] = textClean.pipeline(test['text'].to_list(), multi_gram=[1], lower_case=True,
                                          deacc=False, encoding='utf8', errors='strict', stem_lemma='lemma',
                                          tag_drop=[], nltk_stop=True,
                                          stop_word_list=['effect', 'vaccine', 'side', 'covid'],
                                          check_numbers=False, word_length=2, remove_consecutives=True)
print(tfidf.get_top_n_tfidf_bow(list(test['explore_text']), top_n_tokens = 30))
DataExploration.generate_word_cloud(list(test['explore_text']))
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(list(test['explore_text']), num_topics = 10)
lda.lda_topics(lda_allbow)


# From the above plots we can see that people in India and Canada tweet a lot in March and April, and the spikes in the overall volume trend were driven by India, so we will explore Canada and India further. For India, it seems most tweets are neutral.
# 
# In India, on 2021-Mar-01 most tweets are about vaccines; on 2021-Apr-21 most tweets are about vaccines, infection, and medical services; and on 2021-Jun-01, India started using Sputnik, etc.

# In[76]:
Example #4
# In[56]:

print(train['LABEL'].value_counts())

# In[ ]:

# In[38]:

train['title_tokens'] = textClean.pipeline(train['TITLE'].to_list(),
                                           multi_gram=[1],
                                           lower_case=True,
                                           deacc=False,
                                           encoding='utf8',
                                           errors='strict',
                                           stem_lemma='lemma',
                                           tag_drop=[],
                                           nltk_stop=True,
                                           stop_word_list=[],
                                           remove_pattern=[],
                                           check_numbers=True,
                                           word_length=2,
                                           remove_consecutives=True)

# In[43]:

selected_tokens = train[train.LABEL == 'Mathematics']['title_tokens'].to_list()
top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(selected_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(selected_tokens, top_n_tokens=30)
Example #5
raw_data = pd.read_csv(os.path.join(root_path, "IMDB Dataset.csv"))
data = raw_data[raw_data.review.notnull()].copy()  # drop rows with missing reviews
data['label'] = np.where(data['sentiment'] == 'positive', 1, 0)
data = data.drop_duplicates()
data.insert(0, 'index', data.index + 1)

# In[53]:

# Clean data using the Bumblebee pipeline
preprocessed_text = textClean.pipeline(
    data['review'][0:1000].to_list(),
    multi_gram=[1],
    lower_case=True,
    deacc=False,
    encoding='utf8',
    errors='strict',
    stem_lemma='lemma',
    tag_drop=['V'],
    nltk_stop=True,
    stop_word_list=['movie', 'film', 'movies', 'films'],
    check_numbers=True,
    word_length=3,
    remove_consecutives=True)
preprocessed_text = [' '.join(i) for i in preprocessed_text]

# In[9]:

data.review[0]

# ## Cleaned Text

# In[10]:
Example #6
                       'purchase', 'question',
                       'rather', 'ratio',
                       'reported', 'reserve', 'result', 'retained',
                       'rule', 'say',
                       'september', 'service', 'shoe',
                       'star', 'state', 'store', 'subsidiary', 'sunday', 'super',
                       'take', 'ten', 'th', 'eht',
                       'therefore', 'utility', 'volume',
                       'zero']


# In[22]:


preprocessed_text = textClean.pipeline(letters, multi_gram=[1], lower_case=True, deacc=False, encoding='utf8',
                                       errors='strict', stem_lemma='lemma', tag_drop=[], nltk_stop=True,
                                       stop_word_list=frequent_words_list, remove_pattern=['www'],
                                       check_numbers=True, word_length=2, remove_consecutives=True)
preprocessed_text = [' '.join(i) for i in preprocessed_text]


# In[23]:


processed_letter = {
    'year': yearid,
    'clean_letter': preprocessed_text
}
processed_letter_df = pd.DataFrame(processed_letter, columns = ['year', 'clean_letter'])
processed_letter_df.to_csv(os.path.join(data_path, 'processed_letter.csv'))

Example #7
def convert_str(df, columns):
    # Cast the given columns to strings so the text pipeline receives uniform input.
    for i in columns:
        df[i] = df[i].apply(lambda x: str(x))
    return df


raw_data = convert_str(
    raw_data, ['company_profile', 'description', 'requirements', 'benefits'])

# In[4]:

raw_data['profile_tokens'] = textClean.pipeline(
    raw_data['company_profile'].to_list(),
    multi_gram=[1],
    lower_case=True,
    deacc=False,
    encoding='utf8',
    errors='strict',
    stem_lemma='lemma',
    tag_drop=[],
    nltk_stop=True,
    stop_word_list=[],
    check_numbers=False,
    word_length=2,
    remove_consecutives=True)
raw_data['description_tokens'] = textClean.pipeline(
    raw_data['description'].to_list(),
    multi_gram=[1],
    lower_case=True,
    deacc=False,
    encoding='utf8',
    errors='strict',
    stem_lemma='lemma',
    tag_drop=[],
Example #8
content_data = books[['id', 'authors', 'original_title', 'language_code', 'average_rating']].copy()
content_data['doc'] = (content_data['authors'] + ' ' + content_data['original_title'] + ' '
                       + content_data['language_code'] + ' ' + content_data['average_rating'].astype(str))
content_data = content_data.dropna()
content_data.head(3)


# In[10]:


processed_doc = TextProcessing.pipeline(content_data['doc'].to_list(), 
                                        multi_gram = [1,2], 
                                        lower_case=True, 
                                        deacc=False, encoding='utf8',
                                        errors='strict', 
                                        stem_lemma = 'lemma', 
                                        tag_drop = ['J'], 
                                        nltk_stop=True, 
                                        stop_word_list=['course','courses'], 
                                        check_numbers=False, 
                                        word_length=0, 
                                        remove_consecutives=True)


# In[163]:


dataExploration.generate_word_cloud(processed_doc)


# In[11]:
print('nasdaq ticker', len(nasdaq.Ticker.unique()))
print('----------------')
print('nasdaq sample')
print(nasdaq.Headline[0])

# # NLP Exploration

# In[16]:

djia_tokens = textClean.pipeline(djia_news['Headline'].to_list(),
                                 multi_gram=[1],
                                 lower_case=True,
                                 deacc=False,
                                 encoding='utf8',
                                 errors='strict',
                                 stem_lemma='lemma',
                                 tag_drop=['V'],
                                 nltk_stop=True,
                                 stop_word_list=[],
                                 remove_pattern=[],
                                 check_numbers=True,
                                 word_length=2,
                                 remove_consecutives=True)

# In[26]:

nasdaq_tokens = textClean.pipeline(nasdaq['Headline'].to_list(),
                                   multi_gram=[1],
                                   lower_case=True,
                                   deacc=False,
                                   encoding='utf8',
                                   errors='strict',
Example #10
#                                            stop_word_list=[],
#                                            check_numbers=False, word_length=2, remove_consecutives=True)
fraud_tokens = list(raw_data[raw_data.fraudulent == 1]['jd_tokens'])
print(tfidf.get_top_n_tfidf_bow(fraud_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(fraud_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(fraud_tokens, num_topics=10)
lda.lda_topics(lda_allbow)

# In[146]:

raw_data['jd_tokens2'] = textClean.pipeline(raw_data['jd'].to_list(),
                                            multi_gram=[2],
                                            lower_case=True,
                                            deacc=False,
                                            encoding='utf8',
                                            errors='strict',
                                            stem_lemma='lemma',
                                            tag_drop=[],
                                            nltk_stop=True,
                                            stop_word_list=[],
                                            check_numbers=False,
                                            word_length=2,
                                            remove_consecutives=True)
print(tfidf.get_top_n_tfidf_bow(list(raw_data['jd_tokens2']), top_n_tokens=30))
DataExploration.generate_word_cloud(list(raw_data['jd_tokens2']))
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(list(raw_data['jd_tokens2']),
                                                 num_topics=10)
lda.lda_topics(lda_allbow)