Example #1
import math

import nltk
import numpy as np
import pandas as pd
from numpy.linalg import svd as singular_value_decomposition

# Imports above are inferred from usage below; textClean and DocVector are
# project-local preprocessing helpers assumed to be importable.


def lsa_text_extraction(textdoc,
                        smooth=0.4,
                        MIN_DIMENSIONS=3,
                        REDUCTION_RATIO=1.0,
                        topn=5):
    """
    reduction_ratio: used to reduce computation cost: limit diagonal size, when it is 1 it keeps original diagonal size, when it is 0.4 only keep 0.4 * original diagonal size
    smooth: is a factor appened to matrix normalization, small value might cause overfitting and large value might cause underfitting
    """
    # document to sentences
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    document = tokenizer.tokenize(textdoc)
    # generate term-frequency matrix
    assert 0.0 <= smooth < 1.0
    preprocessed_text = textClean.pipeline(document,
                                           multi_gram=[1],
                                           lower_case=True,
                                           deacc=False,
                                           encoding='utf8',
                                           errors='strict',
                                           stem_lemma='lemma',
                                           tag_drop=[],
                                           nltk_stop=True,
                                           stop_word_list=[],
                                           check_numbers=False,
                                           word_length=2,
                                           remove_consecutives=True)
    dictionary = DocVector.generate_corpus_dict(preprocessed_text,
                                                no_below=2,
                                                no_above=0.5,
                                                keep_n=100000)
    doc_vec = DocVector.create_document_vector(preprocessed_text, dictionary)
    tfmatrix = DocVector.get_vocab_matrix(doc_vec, dictionary)
    # Cast to float so the smoothed weights below are not truncated if the
    # underlying term counts are stored as integers.
    matrix_copy = tfmatrix.values.T.astype(float)
    # Compute normalized term-frequency weights for each sentence (column):
    # the tf of every term in a document is normalized by the maximum tf in
    # that document, ntf_{t,d} = a + (1 - a) \frac{tf_{t,d}}{tf_{\max}(d)}.
    # The smoothing term a damps the contribution of the second term, which
    # may be viewed as scaling tf down by the largest tf value in d.
    max_word_frequencies = np.max(matrix_copy, axis=0)
    rows, cols = matrix_copy.shape
    for row in range(rows):
        for col in range(cols):
            max_word_frequency = max_word_frequencies[col]
            if max_word_frequency != 0:
                frequency = matrix_copy[row, col] / max_word_frequency
                matrix_copy[row, col] = smooth + (1.0 - smooth) * frequency
    # get ranks
    u, sigma, v_matrix = singular_value_decomposition(matrix_copy,
                                                      full_matrices=False)
    assert len(sigma) == v_matrix.shape[0]
    dimensions = max(MIN_DIMENSIONS, int(len(sigma) * REDUCTION_RATIO))
    powered_sigma = tuple(s**2 if i < dimensions else 0.0
                          for i, s in enumerate(sigma))
    ranks = []
    for column_vector in v_matrix.T:
        rank = sum(s * v**2 for s, v in zip(powered_sigma, column_vector))
        ranks.append(math.sqrt(rank))
    # output result
    percentile_list = pd.DataFrame({
        'sentence': document,
        'rank': ranks,
    }).sort_values(by='rank', ascending=False)

    output_sentence = percentile_list.head(topn)['sentence'].tolist()
    return output_sentence
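
# Hypothetical usage sketch (not part of the original function): the sample
# text and parameter values below are illustrative only.
if __name__ == '__main__':
    sample_doc = ("Latent semantic analysis ranks each sentence by its weight "
                  "in the reduced singular space. The highest-ranked sentences "
                  "form the extractive summary. The rest are discarded.")
    print(lsa_text_extraction(sample_doc, smooth=0.4, topn=2))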
Example #2
# ## Model Development

# ### 1. Split Dataset

# In[3]:


preprocessed_tokens = textClean.pipeline(data['review'][0:100].to_list(), multi_gram=[1], lower_case=True, deacc=False,
                                         encoding='utf8', errors='strict', stem_lemma='lemma', tag_drop=[], nltk_stop=True,
                                         stop_word_list=['movie'], check_numbers=False, word_length=3, remove_consecutives=True)

dictionary = DocVector.generate_corpus_dict(preprocessed_tokens, no_below=1,
                                            no_above=0.5, keep_n=100000)
bow_corpus = DocVector.create_document_vector(preprocessed_tokens, dictionary)
my_df = DocVector.get_vocab_matrix(bow_corpus, dictionary)
my_df.head(3)


# In[4]:


X_train, X_test, y_train, y_test = train_test_split(my_df.loc[:99, :], data['label'][0:100], test_size=0.33, random_state=11)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=11)


# ### 2. Feature Engineering
# * Apart from the vectorization process, additional features can be created from the dataset (see the sketch below), such as:
#     * Length of comments
#     * Number of entities (using Named-Entity Recognition (NER))
#     * One-hot encoding of entities
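

# A minimal sketch of the extra features listed above (not from the original
# notebook), assuming spaCy and its small English model are installed
# (`python -m spacy download en_core_web_sm`); all names are illustrative.
import pandas as pd
import spacy

nlp = spacy.load('en_core_web_sm')

def extra_features(comments):
    rows = []
    for text in comments:
        doc = nlp(text)
        row = {'comment_length': len(text), 'n_entities': len(doc.ents)}
        for ent in doc.ents:  # count occurrences per entity label (PERSON, ORG, ...)
            row['ent_' + ent.label_] = row.get('ent_' + ent.label_, 0) + 1
        rows.append(row)
    # Absent labels become 0, yielding a one-hot/count encoding of entities.
    return pd.DataFrame(rows).fillna(0).astype(int)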
Example #3
    def transform(self, X, y=None):
        # Relies on a module-level `dictionary` built beforehand with
        # DocVector.generate_corpus_dict; see the class sketch below.
        bow_corpus = DocVector.create_document_vector(X, dictionary)
        my_df = DocVector.get_vocab_matrix(bow_corpus, dictionary)
        return my_df
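
# A hypothetical enclosing transformer for the method above (the class name,
# fit() behavior, and dictionary settings are assumptions, not from the
# original source): fitting builds the vocabulary on the training fold only,
# so the dictionary never leaks information from the test fold.
from sklearn.base import BaseEstimator, TransformerMixin

class BowVectorizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Learn the vocabulary from the training documents only.
        self.dictionary = DocVector.generate_corpus_dict(
            X, no_below=1, no_above=0.5, keep_n=100000)
        return self

    def transform(self, X, y=None):
        # Map documents onto the learned vocabulary as a document-term matrix.
        bow_corpus = DocVector.create_document_vector(X, self.dictionary)
        return DocVector.get_vocab_matrix(bow_corpus, self.dictionary)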
Example #4
# In[5]:

tfidf_value_data = tfidf.get_tfidf_dataframe(preprocessed_tokens)
top10_tfidf_bow = tfidf.get_top_n_tfidf_bow(preprocessed_tokens,
                                            top_n_tokens=10)
top10_tfidf_bow

# In[6]:

dictionary = DocVector.generate_corpus_dict(preprocessed_tokens,
                                            no_below=1,
                                            no_above=0.5,
                                            keep_n=100000)
bow_corpus = DocVector.create_document_vector(preprocessed_tokens, dictionary)
tfidf_trans = models.TfidfModel(bow_corpus)  # gensim's tf-idf re-weighting of the bag-of-words counts
my_df = DocVector.get_vocab_matrix(tfidf_trans[bow_corpus], dictionary)

# In[7]:

my_df.head(3)

# ## Task 3: Use a Traditional Machine Learning Model to Classify the Documents

# You're free to use any traditional machine learning model you like, but make sure to follow the best practices in your model-building pipeline that were covered in Day 2 (as time allows)! A sketch of one such pipeline follows the list below.
#
# * Proper Evaluation Metrics
# * Cross Validation
# * Hyperparameter Tuning
# * Feature Selection
# * Model Interpretability
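
# A minimal sketch of one such pipeline (not from the original notebook),
# assuming the tf-idf matrix `my_df` and a matching label series `y` are
# available; the model choice and parameter grid are illustrative only.
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('select', SelectKBest(chi2, k=min(500, my_df.shape[1]))),  # feature selection
    ('clf', LogisticRegression(max_iter=1000)),                 # interpretable linear model
])
param_grid = {'clf__C': [0.01, 0.1, 1, 10]}  # hyperparameter tuning
search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')  # cross validation with a clear metric
search.fit(my_df, y)
print(search.best_params_, search.best_score_)
# Coefficients of the fitted linear model give per-token weights (interpretability).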