Example #1
def updated_topic_extraction(corpus, tm_obj, cluster_num):
    '''
    Main function of topic modeling when a new document is assigned to 
    the nearest cluster
    '''

    n_topics = int(os.getenv('TOPIC_NUMBER_PER_CLUSTER'))
    print("Cluster #{}:".format(cluster_num))
    norm_corpus = normalize_corpus(corpus)
    vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                                    feature_type='tfidf')
    feature_names = vectorizer.get_feature_names()

    # Update the model object
    tm_obj.fit_transform(tfidf_matrix)
    weights = tm_obj.components_

    topics = get_topics_terms_weights(weights, feature_names)
    print_topics_udf(topics=topics,
                     total_topics=n_topics,
                     num_terms=10,
                     display_weights=True)

    # Return the updated model object
    return tm_obj
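# A hedged usage sketch for the function above. The pickle file name comes from the
# TOPIC_MODELING_PKL_FILENAME variable used in the categorization example further
# below; the cluster corpus and cluster number here are purely illustrative.
import os
import pickle

os.environ.setdefault('TOPIC_NUMBER_PER_CLUSTER', '3')

cluster_num = 0
cluster_corpus = [
    'existing document already assigned to this cluster',
    'another document in the same cluster',
    'newly arrived document assigned to the nearest cluster',
]

# Load the list of per-cluster topic models saved by the categorization pipeline.
with open(os.getenv('TOPIC_MODELING_PKL_FILENAME', 'topic_models.pkl'), 'rb') as f:
    topic_models = pickle.load(f)

# Refit the topic model of the affected cluster on its enlarged corpus.
topic_models[cluster_num] = updated_topic_extraction(cluster_corpus,
                                                     topic_models[cluster_num],
                                                     cluster_num)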
Example #2
def topic_extraction(documents, labels):
    '''
    Main function of topic modeling
    '''

    num_clusters = len(set(labels))
    n_topics = int(os.getenv('TOPIC_NUMBER_PER_CLUSTER'))
    matched = False
    tm_obj = []
    for c in range(num_clusters):
        print("=" * 70)
        print("Cluster #{}:".format(c))
        corpus = [
            document for i, document in enumerate(documents) if labels[i] == c
        ]
        norm_corpus = normalize_corpus(corpus)
        vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                                        feature_type='tfidf')
        feature_names = vectorizer.get_feature_names()
        if os.getenv('TOPIC_MODELING') == "lda":
            # Use Latent Dirichlet Allocation for topic modeling
            lda = LatentDirichletAllocation(n_components=n_topics,
                                            max_iter=1000,
                                            learning_method='online',
                                            learning_offset=10.,
                                            random_state=42)
            lda.fit(tfidf_matrix)
            weights = lda.components_
            matched = True
            tm_obj.append(lda)

        if os.getenv('TOPIC_MODELING') == "nmf":
            # Use Nonnegative Matrix Factorization for topic modeling
            nmf = NMF(n_components=n_topics,
                      random_state=42,
                      alpha=.1,
                      l1_ratio=.5)
            nmf.fit(tfidf_matrix)
            weights = nmf.components_
            matched = True
            tm_obj.append(nmf)

        if not matched:
            raise ValueError("Unknown topic modeling algorithm!")

        topics = get_topics_terms_weights(weights, feature_names)
        print_topics_udf(topics=topics,
                         total_topics=n_topics,
                         num_terms=10,
                         display_weights=True)

    return tm_obj
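# A hedged usage sketch: `documents` and `labels` would normally come from one of
# the clustering examples further below (e.g. km_obj.labels_); the environment
# variables and the toy inputs here are illustrative only.
import os

os.environ['TOPIC_MODELING'] = 'nmf'            # or 'lda'
os.environ['TOPIC_NUMBER_PER_CLUSTER'] = '2'

documents = ['first toy document about python programming',
             'second toy document about java programming',
             'a short note about foxes and dogs',
             'another note about dogs and cats']
labels = [0, 0, 1, 1]

topic_models = topic_extraction(documents, labels)  # one fitted model per cluster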
def lsa_text_summarizer(documents, num_sentences=2,
                        num_topics=2, feature_type='frequency',
                        sv_threshold=0.5):
                            
    vec, dt_matrix = build_feature_matrix(documents, 
                                          feature_type=feature_type)

    td_matrix = dt_matrix.transpose()
    td_matrix = td_matrix.multiply(td_matrix > 0)

    u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)  
    min_sigma_value = max(s) * sv_threshold
    s[s < min_sigma_value] = 0
    
    salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
    top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1]
    top_sentence_indices.sort()
    
    for index in top_sentence_indices:
        print(documents[index])
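# The summarizers in these examples rely on a low_rank_svd helper from utils that
# is not shown on this page; a minimal sketch of what it plausibly does, built on
# scipy.sparse.linalg.svds (an assumption, not the original implementation).
from scipy.sparse.linalg import svds

def low_rank_svd(matrix, singular_count=2):
    # Truncated SVD: keep only the top `singular_count` singular triplets.
    u, s, vt = svds(matrix, k=singular_count)
    return u, s, vt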
def textrank_text_summarizer(documents,
                             num_sentences=2,
                             feature_type='frequency'):

    vec, dt_matrix = build_feature_matrix(documents, feature_type='tfidf')
    similarity_matrix = (dt_matrix * dt_matrix.T)

    similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
    scores = networkx.pagerank(similarity_graph)

    ranked_sentences = sorted(
        ((score, index) for index, score in scores.items()), reverse=True)

    top_sentence_indices = [
        ranked_sentences[index][1] for index in range(num_sentences)
    ]
    top_sentence_indices.sort()

    for index in top_sentence_indices:
        print(documents[index])
def lsa_text_summarizer(documents, num_sentences=2,
                        num_topics=2, feature_type='frequency',
                        sv_threshold=0.5):
                            
    vec, dt_matrix = build_feature_matrix(documents, 
                                          feature_type=feature_type)

    td_matrix = dt_matrix.transpose()
    td_matrix = td_matrix.multiply(td_matrix > 0)

    u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)  
    min_sigma_value = max(s) * sv_threshold
    s[s < min_sigma_value] = 0
    
    salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
    top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1]
    top_sentence_indices.sort()
    
    for index in top_sentence_indices:
        print(documents[index])
def textrank_text_summarizer(documents, num_sentences=2,
                             feature_type='frequency'):
    
    vec, dt_matrix = build_feature_matrix(documents,
                                          feature_type='tfidf')
    similarity_matrix = (dt_matrix * dt_matrix.T)
        
    similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
    scores = networkx.pagerank(similarity_graph)   
    
    ranked_sentences = sorted(((score, index) 
                                for index, score 
                                in scores.items()), 
                              reverse=True)

    top_sentence_indices = [ranked_sentences[index][1] 
                            for index in range(num_sentences)]
    top_sentence_indices.sort()
    
    for index in top_sentence_indices:
        print(documents[index])
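# A hedged driver for the two summarizers above; parse_document, normalize_corpus
# and toy_text are assumed to be available as in the surrounding examples. Note
# that the summaries then print the normalized sentence text.
sentences = parse_document(toy_text)
norm_sentences = normalize_corpus(sentences, lemmatize=False)

print('LSA summary:')
lsa_text_summarizer(norm_sentences, num_sentences=3, num_topics=2,
                    feature_type='frequency', sv_threshold=0.5)

print('TextRank summary:')
textrank_text_summarizer(norm_sentences, num_sentences=3, feature_type='tfidf')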
Example #7
def run():
    answers = [
        'Functions are used as one-time processing snippet for inling and jumbling the code.',
        'Functions are used for reusing, inlining and jumbling the code.',
        'Functions are used as one-time processing snippet for inlining and organizing the code.',
        'Functions are used as one-time processing snippet for modularizing and jumbling the code.',
        'Functions are used for reusing, inling and organizing the code.',
        'Functions are used as one-time processing snippet for modularizing and organizing the code.',
        'Functions are used for reusing, modularizing and jumbling the code.',
        'Functions are used for reusing, modularizing and organizing the code.'
    ]

    model_answer = [
        "Functions are used for reusing, modularizing and organizing the code."
    ]

    # normalize answers
    norm_corpus = normalize_corpus(answers, lemmatize=True)

    # normalize model_answer
    norm_model_answer = normalize_corpus(model_answer, lemmatize=True)

    vectorizer, corpus_features = build_feature_matrix(
        norm_corpus, feature_type='frequency')

    # extract features from model_answer
    model_answer_features = vectorizer.transform(norm_model_answer)

    doc_lengths = [len(doc.split()) for doc in norm_corpus]
    avg_dl = np.average(doc_lengths)
    corpus_term_idfs = compute_corpus_term_idfs(corpus_features, norm_corpus)

    for index, doc in enumerate(model_answer):

        doc_features = model_answer_features[index]
        bm25_scores = compute_bm25_similarity(doc_features,
                                              corpus_features,
                                              doc_lengths,
                                              avg_dl,
                                              corpus_term_idfs,
                                              k1=1.5,
                                              b=0.75)
        semantic_similarity_scores = []
        for sentence in answers:
            score = (sentence_similarity(sentence, model_answer[0]) +
                     sentence_similarity(model_answer[0], sentence)) / 2
            semantic_similarity_scores.append(score)
        print('Model Answer', ':', doc)
        print('-' * 40)
        doc_index = 0
        for score_tuple in zip(semantic_similarity_scores, bm25_scores):
            sim_score = ((score_tuple[0] * 10) + score_tuple[1]) / 2
            if (sim_score < 1):
                sim_score = 0
            elif (1 <= sim_score <= 2):
                sim_score = 1
            elif (2 < sim_score <= 4):
                sim_score = 2
            elif (4 < sim_score <= 6):
                sim_score = 3
            elif (6 < sim_score <= 8):
                sim_score = 4
            elif (8 < sim_score <= 10):
                sim_score = 5
            print('Ans num: {} Score: {}\nAnswer: {}'.format(
                doc_index + 1, sim_score, answers[doc_index]))
            print('-' * 40)
            doc_index = doc_index + 1
        print()
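# compute_bm25_similarity is an external helper in these examples; a minimal sketch
# of the Okapi BM25 scoring it presumably implements (the name, signature and dense
# vector inputs below are assumptions for illustration only).
import numpy as np

def bm25_score_sketch(query_tf, doc_tf, doc_len, avg_dl, idfs, k1=1.5, b=0.75):
    # Length-normalized term-frequency saturation (Okapi BM25).
    norm = k1 * (1.0 - b + b * (doc_len / avg_dl))
    term_scores = idfs * (doc_tf * (k1 + 1.0)) / (doc_tf + norm)
    # Only terms that actually occur in the query contribute to the score.
    return float(np.sum(term_scores * (query_tf > 0)))

# Example: score one document of length 3 against a two-term query.
# bm25_score_sketch(np.array([1, 0, 1]), np.array([2, 1, 0]), 3, 4.0,
#                   np.array([0.3, 1.2, 0.9]))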
docs = parse_document(toy_text)
text = ' '.join(docs)
text_summarization_gensim(text, summary_ratio=0.4)


    
sentences = parse_document(toy_text)
norm_sentences = normalize_corpus(sentences,lemmatize=False) 

total_sentences = len(norm_sentences)
print('Total Sentences in Document:', total_sentences)

num_sentences = 3
num_topics = 2

vec, dt_matrix = build_feature_matrix(sentences, 
                                      feature_type='frequency')

td_matrix = dt_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)

u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)  
                                         
sv_threshold = 0.5
min_sigma_value = max(s) * sv_threshold
s[s < min_sigma_value] = 0

salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
print(np.round(salience_scores, 2))

top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1]
top_sentence_indices.sort()
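# The fragment above stops after selecting the indices; the remaining step, as in
# the summarizer functions earlier on this page, is to print the chosen sentences
# in document order.
for index in top_sentence_indices:
    print(sentences[index])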
Example #9
def main():
    no_list = 5000
    dataframe = pd.read_csv(
        './data/final_questions_data.csv',
        names=['user', 'college', 'category', 'problems', 'problem_link'])

    answers = list(dataframe['problems'][1:no_list])
    answers = answers[1:no_list]

    if os.path.isfile('norm_corpus.csv'):
        read_df = pd.read_csv('norm_corpus.csv',
                              names=['norm'],
                              index_col=False)
        norm_corpus = read_df['norm'][1:].values.astype('U').tolist()
    else:
        norm_corpus = normalize_corpus(answers, lemmatize=True)
        write_df = pd.DataFrame(norm_corpus)
        write_df.to_csv('norm_corpus.csv', index=False, header=None)
    vectorizer, corpus_features = build_feature_matrix(norm_corpus,
                                                       feature_type='tfidf')

    doc_lengths = [len(doc.split()) for doc in norm_corpus]
    avg_dl = np.average(doc_lengths)
    corpus_term_idfs = compute_corpus_term_idfs(corpus_features, norm_corpus)

    for answer in answers:
        answers = list(dataframe['problems'][1:no_list])

        answers.remove(convert(answer))
        model_answer = convert(answer)
        print(model_answer)
        # normalize answers
        norm_corpus = normalize_corpus(answers, lemmatize=True)

        # normalize model_answer
        norm_model_answer = normalize_corpus(model_answer, lemmatize=True)

        # extract features from model_answer
        model_answer_features = vectorizer.transform(norm_model_answer)

        for index, doc in enumerate(model_answer):
            doc_features = model_answer_features[index]
            bm25_scores = compute_bm25_similarity(doc_features,
                                                  corpus_features,
                                                  doc_lengths,
                                                  avg_dl,
                                                  corpus_term_idfs,
                                                  k1=1.5,
                                                  b=0.75)
            semantic_similarity_scores = []
            for sentence in answers:
                score = (sentence_similarity(sentence, model_answer[0]) +
                         sentence_similarity(model_answer[0], sentence)) / 2
                semantic_similarity_scores.append(score)
            print('Model Answer', ':', doc)
            print('-' * 40)
            doc_index = 0
            sim_scores = []
            for score_tuple in zip(semantic_similarity_scores, bm25_scores):
                sim_score = ((score_tuple[0] * 10) + score_tuple[1]) / 2
                sim_scores.append(sim_score)
            # print(sim_scores)
            print(
                sorted(range(len(sim_scores)), key=lambda i: sim_scores[i])[-5:])
            break
            print('Ans num: {} Score: {}\nAnswer: {}'.format(
                doc_index + 1, sim_score, answers[doc_index]))
            print('-' * 40)
            doc_index = doc_index + 1

        break
'Python is a great Programming language',
'Python and Java are popular Programming languages',
'Among Programming languages, both Python and Java are the most used in Analytics',
'The fox is quicker than the lazy dog',
'The dog is smarter than the fox',
'The dog, fox and cat are good friends']

query_docs = ['The fox is definitely smarter than the dog',
            'Java is a static typed programming language unlike Python',
            'I love to relax under the beautiful blue sky!']  


# normalize and extract features from the toy corpus
norm_corpus = normalize_corpus(toy_corpus, lemmatize=True)
tfidf_vectorizer, tfidf_features = build_feature_matrix(norm_corpus,
                                                        feature_type='tfidf',
                                                        ngram_range=(1, 1), 
                                                        min_df=0.0, max_df=1.0)
                                                        
# normalize and extract features from the query corpus
norm_query_docs =  normalize_corpus(query_docs, lemmatize=True)            
query_docs_tfidf = tfidf_vectorizer.transform(norm_query_docs)

def compute_cosine_similarity(doc_features, corpus_features,
                              top_n=3):
    # get document vectors
    doc_features = doc_features[0]
    # compute similarities
    similarity = np.dot(doc_features, 
                        corpus_features.T)
    similarity = similarity.toarray()[0]
    # get docs with highest similarity scores and return them with rounded scores
    top_docs = similarity.argsort()[::-1][:top_n]
    return [(index, round(similarity[index], 3)) for index in top_docs]


for row in reader:  # each row is a list
    results.append(row[1])

# print results

sentences = results

# normalize corpus
norm_req_synopses = normalize_corpus(sentences,
                                     lemmatize=True,
                                     only_text_chars=False)

# extract tf-idf features
vectorizer, feature_matrix = build_feature_matrix(norm_req_synopses,
                                                  feature_type='tfidf',
                                                  min_df=0.1,
                                                  max_df=0.9,
                                                  ngram_range=(1, 2))

# view number of features
#print feature_matrix.shape

# get feature names
feature_names = vectorizer.get_feature_names()

# print sample features
#print feature_names[:20]

topn_features = 10
cluster_details = {}
def main():
    '''
    Main function of document categorization
    '''

    # Get a list of file names of all documents in a specified folder
    fnames = get_filenames()
    print("The total number of files: %g" % len(fnames))

    titles = []
    documents = []
    for i, fname in enumerate(fnames):
        print("*" * 70)
        print("File no.%d %s is being processed ..." %
              (i, os.path.basename(fname)))
        text, corrupted_files = tika_parser(fname)
        if text:  # ignore corrupted files
            all_tokens = preprocess_text(text)
            # Append this list as the new content describing the original document
            documents.append(all_tokens)
            # Keep only the file name without the .pdf extension
            title = os.path.splitext(os.path.basename(fname))[0]
            # Append the document title
            titles.append(title)
            # Write the document title and content to an SQLite database
            sqlite_entry(db, title, all_tokens)

    # Extract features from documents
    vectorizer, feature_matrix = build_feature_matrix(documents,
                                                      feature_type='tfidf',
                                                      min_df=0.0, max_df=1.0,
                                                      ngram_range=(1, 1))
    # Notice that 'feature_matrix' is normalized so that no extra normalization
    # is required

    # Save vectorizer in a file in the current folder
    with open(os.getenv('VECTORIZER_PKL_FILENAME'), 'wb') as file:
        pickle.dump(vectorizer, file)

    print(feature_matrix.shape)
    # Get feature names
    feature_names = vectorizer.get_feature_names()
    # Get the number of top features describing each cluster centroid
    topn_features = int(os.getenv('FEATURE_NUMBER'))

    matched = False

    if os.getenv('CLUSTERING') == "affinity":
        from document_clustering import (affinity_propagation,
                                         cluster_analysis)
        from topic_modeling import topic_extraction

        # Get clusters using affinity propagation
        ap_obj, clusters = affinity_propagation(feature_matrix=feature_matrix)
        cl_obj = ap_obj

        cluster_analysis(ap_obj, feature_names, titles, clusters,
                         topn_features, feature_matrix)

        # Extract topics of each cluster
        tm_obj = topic_extraction(documents, ap_obj.labels_)

        matched = True

    if os.getenv('CLUSTERING') == "kmeans":
        from document_clustering import (k_means, cluster_analysis)
        from topic_modeling import topic_extraction

        # Get clusters using k-means
        num_clusters = int(os.getenv('CLUSTER_NUMBER'))
        km_obj, clusters = k_means(feature_matrix=feature_matrix,
                                   num_clusters=num_clusters)
        cl_obj = km_obj

        cluster_analysis(km_obj, feature_names, titles, clusters,
                         topn_features, feature_matrix)

        # Extract topics of each cluster
        tm_obj = topic_extraction(documents, km_obj.labels_)
        matched = True

    if os.getenv('CLUSTERING') == "hierarchical":
        from document_clustering import (ward_hierarchical_clustering,
                                         plot_hierarchical_clusters)

        data = pd.DataFrame({'Title': titles})
        # Build ward's linkage matrix
        linkage_matrix = ward_hierarchical_clustering(feature_matrix)
        # Plot the dendrogram
        plot_hierarchical_clusters(linkage_matrix=linkage_matrix,
                                   data=data,
                                   figure_size=(8, 10))
        matched = True

    if not matched:
        raise ValueError("Unknown clustering algorithm!")

    # Save clustering and topic modeling objects in files in the current folder
    with open(os.getenv('CLUSTERING_PKL_FILENAME'), 'wb') as file:
        pickle.dump(cl_obj, file)
    with open(os.getenv('TOPIC_MODELING_PKL_FILENAME'), 'wb') as file:
        pickle.dump(tm_obj, file)
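# get_filenames and tika_parser are project helpers not shown on this page; a
# minimal sketch of what get_filenames might look like (the DOCUMENTS_FOLDER
# environment variable and the .pdf filter are assumptions for illustration).
import glob
import os

def get_filenames(folder=None):
    # Collect all PDF documents from the input folder, in a stable order.
    folder = folder or os.getenv('DOCUMENTS_FOLDER', './documents')
    return sorted(glob.glob(os.path.join(folder, '*.pdf')))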
print('Movie:', movie_titles[0])
print('Movie Synopsis:', movie_synopses[0][:1000])


from normalization import normalize_corpus
from utils import build_feature_matrix

# normalize corpus
norm_movie_synopses = normalize_corpus(movie_synopses,
                                       lemmatize=True,
                                       only_text_chars=True)

# extract tf-idf features
vectorizer, feature_matrix = build_feature_matrix(norm_movie_synopses,
                                                  feature_type='tfidf',
                                                  min_df=0.24, max_df=0.85,
                                                  ngram_range=(1, 2))
# view number of features
print(feature_matrix.shape)

# get feature names
feature_names = vectorizer.get_feature_names()

# print sample features
print(feature_names[:20])

                    
from sklearn.cluster import KMeans

def k_means(feature_matrix, num_clusters=5):
    km = KMeans(n_clusters=num_clusters,
Example #14
            tw = [term for term, wt in topic]
            print(tw[:num_terms] if num_terms else tw)
        print()


print_topics_gensim(topic_model=lsi,
                    total_topics=total_topics,
                    num_terms=5,
                    display_weights=True)

# LSI custom built topic model
from utils import build_feature_matrix, low_rank_svd

norm_corpus = normalize_corpus(toy_corpus)

vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                                feature_type='tfidf')
td_matrix = tfidf_matrix.transpose()

td_matrix = td_matrix.multiply(td_matrix > 0)

total_topics = 2
feature_names = vectorizer.get_feature_names()

u, s, vt = low_rank_svd(td_matrix, singular_count=total_topics)
weights = u.transpose() * s[:, None]


def get_topics_terms_weights(weights, feature_names):
    feature_names = np.array(feature_names)
    sorted_indices = np.array(
        [list(row[::-1]) for row in np.argsort(np.abs(weights))])
# Using Gensim Summarization Method
docs = parse_document(document1)
text = ' '.join(docs)
text_summarization_gensim(text, summary_ratio=0.3)

sentences = parse_document(document1)
norm_sentences = normalize_corpus(sentences, lemmatize=False)

total_sentences = len(norm_sentences)
print('Total Sentences in Document:', total_sentences)

num_sentences = 3
num_topics = 1

vec, dt_matrix = build_feature_matrix(sentences, feature_type='frequency')

td_matrix = dt_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)

u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)

sv_threshold = 0.5
min_sigma_value = max(s) * sv_threshold
s[s < min_sigma_value] = 0

salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
print(np.round(salience_scores, 2))

top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1]
top_sentence_indices.sort()

sample_docs = [100, 5817, 7626, 7356, 1008, 7155, 3533, 13010]
sample_data = [(test_reviews[index],
                test_sentiments[index])
                  for index in sample_docs]

sample_data    

# normalization
norm_train_reviews = normalize_corpus(train_reviews,
                                      lemmatize=True,
                                      only_text_chars=True)
# feature extraction                                                                            
vectorizer, train_features = build_feature_matrix(documents=norm_train_reviews,
                                                  feature_type='tfidf',
                                                  ngram_range=(1, 1), 
                                                  min_df=0.0, max_df=1.0)                                      
                                      
                                      

from sklearn.linear_model import SGDClassifier
# build the model
svm = SGDClassifier(loss='hinge', n_iter=500)
svm.fit(train_features, train_sentiments)



# normalize reviews                        
norm_test_reviews = normalize_corpus(test_reviews,
                                     lemmatize=True,
                                     only_text_chars=True)  
Example #17
# Divide the data into the data (review) and the label (sentiment) in both training and testing sets
train_reviews = np.array(train_data['review'])
train_sentiments = np.array(train_data['sentiment'])
test_reviews = np.array(test_data['review'])
test_sentiments = np.array(test_data['sentiment'])

# Normalize the training review data using the normalization.py module
norm_train_reviews = normalize_corpus(train_reviews,
                                      lemmatize=False,
                                      only_text_chars=True)

# Extract features from these normalized training reviews
# - which features? Try other features using parameters provided in utils.py
vectorizer, train_features = build_feature_matrix(documents=norm_train_reviews,
                                                  feature_type='tfidf',
                                                  ngram_range=(1, 1),
                                                  min_df=0.0,
                                                  max_df=1.0)

from sklearn.linear_model import SGDClassifier

# Build/train an SVM classifier model with the train features extracted from reviews
svm = SGDClassifier(loss='hinge', n_iter=500)
svm.fit(train_features,
        train_sentiments)  # We give the features and the correct labels

# Normalize the test reviews
norm_test_reviews = normalize_corpus(test_reviews,
                                     lemmatize=False,
                                     only_text_chars=True)
# Extract features from the normalized test reviews
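# The example cuts off at the test-feature step; a plausible continuation reuses
# the same vectorizer and evaluates the trained classifier (the accuracy metric
# below is an assumption about how the results were inspected).
from sklearn.metrics import accuracy_score

test_features = vectorizer.transform(norm_test_reviews)
predicted_sentiments = svm.predict(test_features)
print('Test accuracy:', accuracy_score(test_sentiments, predicted_sentiments))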
print('Movie:', movie_titles[0])
print('Movie Synopsis:', movie_synopses[0][:1000])

from normalization import normalize_corpus  ###
from utils import build_feature_matrix

# normalize corpus
norm_movie_synopses = normalize_corpus(movie_synopses,
                                       lemmatize=True,
                                       only_text_chars=True)  ####

# extract tf-idf features
vectorizer, feature_matrix = build_feature_matrix(norm_movie_synopses,
                                                  feature_type='tfidf',
                                                  min_df=0.24,
                                                  max_df=0.85,
                                                  ngram_range=(1, 2))
# view number of features
print(feature_matrix.shape)

# get feature names
feature_names = vectorizer.get_feature_names()

# print sample features
print(feature_names[:20])

from sklearn.cluster import KMeans


def k_means(feature_matrix, num_clusters=5):
    'The dog, fox and cat are good friends'
]

# Documents that we will be measuring similarities for
query_docs = [
    'The fox is definitely smarter than the dog',
    'Java is a static typed programming language unlike Python',
    'I love to relax under the beautiful blue sky!'
]

# We normalize and extract features from the toy corpus
norm_corpus = normalize_corpus(toy_corpus, lemmatize=True)
# NB: As before it returns the particular 'vectorizer' used as well as the extracted feature matrix
tfidf_vectorizer, tfidf_features = build_feature_matrix(norm_corpus,
                                                        feature_type='tfidf',
                                                        ngram_range=(1, 1),
                                                        min_df=0.0,
                                                        max_df=1.0)

# Similarly, we normalize and extract features from the query corpus
norm_query_docs = normalize_corpus(query_docs, lemmatize=True)
# We use the same vectorizer that we used to build the feature matrix for the corpus also for query doc
query_docs_tfidf = tfidf_vectorizer.transform(norm_query_docs)


def compute_cosine_similarity(doc_features, corpus_features, top_n=3):
    # Get document vectors
    doc_features = doc_features[0]
    # Compute similarities via a dot product with the transposed corpus feature matrix
    similarity = np.dot(doc_features, corpus_features.T)
    similarity = similarity.toarray()[0]
    # Return the indices and (rounded) scores of the top_n most similar corpus documents
    top_docs = similarity.argsort()[::-1][:top_n]
    return [(index, round(similarity[index], 3)) for index in top_docs]
            print(tw[:num_terms] if num_terms else tw)
        print()
    

print_topics_gensim(topic_model=lsi,
                    total_topics=total_topics,
                    num_terms=5,
                    display_weights=True)

    
# LSI custom built topic model    
from utils import build_feature_matrix, low_rank_svd

norm_corpus = normalize_corpus(toy_corpus)

vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus, 
                                    feature_type='tfidf')
td_matrix = tfidf_matrix.transpose()
                              
td_matrix = td_matrix.multiply(td_matrix > 0)

total_topics = 2
feature_names = vectorizer.get_feature_names()

u, s, vt = low_rank_svd(td_matrix, singular_count=total_topics)
weights = u.transpose() * s[:, None]

def get_topics_terms_weights(weights, feature_names):
    feature_names = np.array(feature_names)
    sorted_indices = np.array([list(row[::-1]) 
                           for row 
                           in np.argsort(np.abs(weights))])
Example #21
# vectorizer, sample_features = build_feature_matrix(documents=norm_train_reviews,
#                                                    feature_type='tfidf',
#                                                    ngram_range=(1, 1),
#                                                    min_df=0.0, max_df=1.0)


# normalization
norm_train_reviews = normalize_corpus(train_reviews,
                                      lemmatize=True,
                                      only_text_chars=True)



# feature extraction using tfidf with unigram                                                                            
vectorizer, train_features = build_feature_matrix(documents=norm_train_reviews,
                                                  feature_type='tfidf',
                                                  ngram_range=(1, 1), 
                                                  min_df=0.0, max_df=1.0)                                      
                                      
                                      
print(train_features)
# Import SGDClassified and LogisticRegression models for training and testing to see the results 
from sklearn.linear_model import SGDClassifier, LogisticRegression

# Build the model 
svm = SGDClassifier(loss='hinge', n_iter=500)


# Train the model on training set 
svm.fit(train_features, train_sentiments)
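# LogisticRegression is imported above but never used in this fragment; a short
# hedged sketch of fitting it on the same TF-IDF features for comparison
# (hyperparameters are illustrative).
lr = LogisticRegression(penalty='l2', C=1.0, max_iter=1000)
lr.fit(train_features, train_sentiments)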
Example #22
def run():
    """
    answers=['Functions are used as one-time processing snippet for inling and jumbling the code.',
    'Functions are used for reusing, inlining and jumbling the code.',
    'Functions are used as one-time processing snippet for inlining and organizing the code.',
    'Functions are used as one-time processing snippet for modularizing and jumbling the code.',
    'Functions are used for reusing, inling and organizing the code.',
    'Functions are used as one-time processing snippet for modularizing and organizing the code.',
    'Functions are used for reusing, modularizing and jumbling the code.',
    'Functions are used for reusing, modularizing and organizing the code.']

    model_answer = ["Functions are used for reusing, modularizing and organizing the code."]
    """
    dev_questions = []
    dev_question_answers = []
    train_questions = []
    train_question_answers = []
    filep = os.path.dirname(os.path.abspath(__file__))
    #train_file = os.path.join(filep, "NQ-open.train.jsonl")
    #dev_file = os.path.join(filep, "NQ-open.efficientqa.dev.1.1.jsonl")
    train_file = os.path.join(filep, "test_train.jsonl")
    dev_file = os.path.join(filep, "test_dev.jsonl")

    with open(train_file, "r") as f:
        for line in f:
            d = json.loads(line)
            train_questions.append((d["question"]))
            if "answer" not in d:
                d["answer"] = "random"
            train_question_answers.append(d["answer"])

    len_train = len(train_questions)

    with open(dev_file, "r") as f:
        for line in f:
            d = json.loads(line)
            dev_questions.append((d["question"]))
            if "answer" not in d:
                d["answer"] = "random"
            dev_question_answers.append(d["answer"])

    len_dev = len(dev_questions)

    answers = train_questions
    model_answer = dev_questions

    # normalize answers
    norm_corpus = normalize_corpus(answers, lemmatize=True)
    print(sys.getsizeof(norm_corpus))
    print(len(norm_corpus))
    # normalize model_answer
    norm_model_answer = normalize_corpus(model_answer, lemmatize=True)

    vectorizer, corpus_features = build_feature_matrix(
        norm_corpus, feature_type='frequency')

    # extract features from model_answer
    model_answer_features = vectorizer.transform(norm_model_answer)

    doc_lengths = [len(doc.split()) for doc in norm_corpus]
    avg_dl = np.average(doc_lengths)
    corpus_term_idfs = compute_corpus_term_idfs(corpus_features, norm_corpus)

    train_predict = [None] * len_dev
    dev_predict = [None] * len_dev
    for index, doc in enumerate(model_answer):
        print(index)
        doc_features = model_answer_features[index]
        #bm25_scores = compute_bm25_similarity(model_answer_features,corpus_features,doc_lengths,avg_dl,corpus_term_idfs,k1=0.82, b=0.68)
        bm25_scores = compute_bm25_similarity(doc_features,
                                              corpus_features,
                                              doc_lengths,
                                              avg_dl,
                                              corpus_term_idfs,
                                              k1=0.82,
                                              b=0.68)
        exit()
        semantic_similarity_scores = []
        for sentence in answers:
            score = (sentence_similarity(sentence, model_answer[0]) +
                     sentence_similarity(model_answer[0], sentence)) / 2
            semantic_similarity_scores.append(score)
        doc_index = 0
        max_index = 0
        max_score = 0
        for score_tuple in zip(semantic_similarity_scores, bm25_scores):
            sim_score = ((score_tuple[0] * 10) + score_tuple[1]) / 2
            if sim_score > max_score:
                max_score = sim_score
                max_index = doc_index
            doc_index = doc_index + 1
        dev_predict[index] = train_question_answers[max_index][0]
    predict_output = [None] * len_dev
    for i in range(len_dev):
        output_dict = {
            'question': dev_questions[i],
            'prediction': dev_predict[i]
        }
        predict_output[i] = output_dict

    pred_file = os.path.join(filep, 'ef_dev_predict.json')
    with open(pred_file, 'w') as output:
        output.write(json.dumps(predict_output, indent=4) + '\n')
text_summarization_gensim(text, summary_ratio=0.4)


    
sentences = parse_document(toy_text)
norm_sentences = normalize_corpus(sentences,lemmatize=False) 

total_sentences = len(norm_sentences)
print('Total Sentences in Document:', total_sentences)



num_sentences = 3
num_topics = 2

vec, dt_matrix = build_feature_matrix(sentences, 
                                      feature_type='frequency')

td_matrix = dt_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)

u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)  
                                         
sv_threshold = 0.5
min_sigma_value = max(s) * sv_threshold
s[s < min_sigma_value] = 0

salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
print(np.round(salience_scores, 2))

top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1]
top_sentence_indices.sort()