def train_model(self, X, words, anchors, anchor_strength=3):
        print("trainning model", end="\r")
        # Train the first layer
        model = ct.Corex(n_hidden=20, seed=8)
        model = model.fit(
            X,
            words=words,
            anchors=anchors,  # Pass the anchors in here
            anchor_strength=anchor_strength,  # Tell the model how much it should rely on the anchors
        )
        return model

        # TODO: Train successive layers (unreachable until the return above is removed).
        # Successive layers are fit on the previous layer's topic labels rather than on
        # the original word counts, so words/anchors do not apply at these layers.
        tm_layer2 = ct.Corex(n_hidden=10, seed=16)
        tm_layer2.fit(model.labels)

        tm_layer3 = ct.Corex(n_hidden=9, max_iter=300, verbose=1)
        tm_layer3.fit(tm_layer2.labels)
        print("finished")
        return tm_layer3
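A minimal sketch of how the trained first-layer model is typically inspected afterwards, assuming the `model` and `words` from the method above (corextopic exposes `get_topics()` and the total correlation attribute `tc`):

# Sketch: inspect the topics learned by the first-layer model above.
topics = model.get_topics(n_words=10)
for n, topic in enumerate(topics):
    topic_words = [t[0] for t in topic]  # each entry is a (word, mutual_information, ...) tuple
    print('Topic {}: {}'.format(n, ', '.join(topic_words)))
print('Total correlation (TC):', model.tc)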
Example #2
def get_topic(anchor_words):
    topic_model = ct.Corex(n_hidden=4, seed=0)
    topic_model.fit(doc_word,
                    words=words,
                    anchors=anchor_words,
                    anchor_strength=1000)
    print(topic_model.get_topics(topic=0, n_words=10))
Example #3
 def __init__(self,
              text,
              anchors=anchor_words,
              n_topic=25,
              max_features=20000,
              max_iter=200,
              seed=100,
              anchor_strength=10):
     """
     Initialize and train the CorEx model
     :param text: a text series of customer service transcripts
     :param anchors: anchor word lists used to seed the CorEx topics
     :param n_topic: number of topics
     :param max_features: maximum number of features for the CountVectorizer vocabulary
     :param max_iter: maximum number of iterations
     :param seed: random state
     :param anchor_strength: weight given to the anchor words
     """
     self.text = text
     self.n_topic = n_topic
     cv = CountVectorizer(stop_words='english',
                          max_features=max_features,
                          binary=True)
     # Corpus
     corpus = cv.fit_transform(text)
     # Vocabulary
     words = list(np.asarray(cv.get_feature_names()))
     # Build model
     self.topic_model = ct.Corex(n_hidden=n_topic,
                                 max_iter=max_iter,
                                 seed=seed)
     self.topic_model.fit(corpus,
                          words=words,
                          anchors=anchors,
                          anchor_strength=anchor_strength)
     self.topics = self.topic_model.get_topics()
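Note that these snippets call `get_feature_names()` on the fitted CountVectorizer/TfidfVectorizer; scikit-learn 1.0 renamed this to `get_feature_names_out()` and 1.2 removed the old name. A small compatibility helper (a sketch, not part of the original examples) if you need to run them on a newer scikit-learn:

def feature_names(vectorizer):
    # Return the vocabulary list from a fitted vectorizer across scikit-learn versions.
    try:
        return list(vectorizer.get_feature_names_out())  # scikit-learn >= 1.0
    except AttributeError:
        return list(vectorizer.get_feature_names())  # older scikit-learn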
Example #4
def sample_add(X,
               sc1,
               X_sub2,
               vocab,
               anchor,
               strength,
               enjeux_list,
               num_average=20):
    """
    Add samples based on the delta of the scores.
    Returns True if the F1 score is improved.
    """
    deltamoy = [0, 0, 0, 0]
    for it in range(num_average):
        model2 = ct.Corex(n_hidden=len(enjeux_list))
        model2.fit(X_sub2,
                   words=vocab,
                   anchors=anchor,
                   anchor_strength=strength)
        # Predict and evaluate on all the data
        test = model2.predict(X)
        sc2 = evaluate(test, returnscore=True)

        # Compute the delta of the evaluations
        deltamoy = vadd(deltamoy, delta(sc1, sc2, returnmoy=True))
    deltamoy = np.array(deltamoy) / num_average
    if deltamoy[3] > 0:
        return (True, sc2)
    else:
        return (False, sc1)
Example #5
        def __init__(self, train_data, n_topics, model_choice='CorEx'):
            '''
            Description : 

                Class constructor 

            Parameters : 

                - train_data (List[List[Float]]) : the output probabilities for (a) document(s) to belong to each topic. 
                                                   This will be the output of CorExModel.transform() for the whole corpus or CorExModel.predict_proba() for a unique entry.

                - model_choice (String) : the model we want to use for the unsupervised classifier. You can choose between:

                                                + a Hierarchical Corex Model which will use the output of the first model of CorExModel.tune() (high dimension) 
                                                  as an input and will output a multi-label classification array in lower dimension.

                                                + a simple KMeans (for now) which will use the output of a first CorExModel.tune() (high dimension) and create clusters based on
                                                  the vectorized version of each description in the topic membership probabilities space.
            '''

            self.train_data = train_data

            if model_choice not in ['CorEx', 'KMeans']:
                raise TypeError(
                    "Wrong model choice, please choose between 'CorEx' and 'KMeans'."
                )

            elif model_choice == 'CorEx':
                self.model = ct.Corex(n_hidden=n_topics)

            elif model_choice == 'KMeans':
                self.model = KMeans(n_clusters=n_topics, random_state=42)
Example #6
 def train_model(self):
     log.info('Running model training...')
     # Train the CorEx topic model with n_topic topics
     topic_model = ct.Corex(n_hidden=self.n_topic, words=self.words, max_iter=200, verbose=False, seed=1)
     topic_model.fit(self.doc_words, words=self.words)
     # save to class
     self.topic_model = topic_model
     if self.print_words:
         self.print_topic_words(topic_model=topic_model)
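Example #7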
    def __init__(self, config, preprocessor, load=False, seed=True):
        self.model_path = config.paths.save_model_path
        if load:
            self.model = pickle.load(open(self.model_path, "rb"))
        else:
            self.model = ct.Corex(n_hidden=config.model.num_topics,
                                  seed=config.model.random_state)

        self.vocab = preprocessor.vocab
        self.seed_topics = None
        if seed:
            self.seed_topics = preprocessor.seed_topics
Example #8
 def train_model(self):
     """Train the semisupervised topic model with n topics."""
     log.info('Running model training...')
     # set anchor words
     self.anchor_dict = set_anchor_words(anchor_path=self.anchor_path)
     anchor_words = list(self.anchor_dict.values())
     # train model
     topic_model = ct.Corex(n_hidden=self.n_topic, words=self.words, max_iter=200, verbose=False, seed=1)
     topic_model.fit(self.doc_words, words=self.words, anchors=anchor_words, anchor_strength=self.anchor_strength)
     # save to class
     self.topic_model = topic_model
     if self.print_words:
         self.print_topic_words(topic_model=topic_model)
Example #9
    def __init__(self):
        self.vectorizer = CountVectorizer(stop_words='english',
                                          max_features=20000,
                                          binary=True)
        filenames = glob.glob('data/reddit/*_comments_*.json.gz')
        filename = filenames[0]
        input_data: pd.DataFrame = data_source.load_from_file(
            filename)  #HACK: speedup
        # Each "Document" is a text comment
        self.doc_word = self.vectorizer.fit_transform(input_data.text)
        self.doc_word = ss.csr_matrix(self.doc_word)

        sub_name = os.path.basename(filename).split('_')[0]

        print(self.doc_word.shape)  # n_docs x m_words

        # Get words that label the columns
        # Encode/decode to get rid of annoying unicode characters that break the topic_model obj when saving/loading
        words = [
            x for x in list(np.asarray(self.vectorizer.get_feature_names()))
        ]

        topic_model_filename = f'data/models/{sub_name}_topic_model.pkl'
        if os.path.exists(topic_model_filename):
            topic_model = cPickle.load(open(topic_model_filename, 'rb'))
        else:
            # Train the CorEx topic model with 25 latent (hidden) topics, using some
            # forum-specific anchor words. In corextopic, anchors and anchor_strength
            # are passed to fit() rather than to the constructor.
            topic_model = ct.Corex(n_hidden=25)
            topic_model.fit(
                self.doc_word,
                words=words,
                anchors=[['xmr', 'monero'],
                         ['btc', 'bitcoin', 'satoshi', 'nakamoto'],
                         ['stellar', 'xlm'], ['ltc', 'litecoin'],
                         ['xrp', 'ripple'], ['eth', 'ethereum', 'vitalik'],
                         ['binance', 'coinbase', 'exchange'],
                         ['electrum', 'wallet']],
                anchor_strength=4)
            cPickle.dump(topic_model, open(topic_model_filename, 'wb'))
            # topic_model.save(topic_model_filename, ensure_compatibility=False)
        self.topic_model = topic_model
        # Print all topics from the model
        topics = topic_model.get_topics()
        for n, topic in enumerate(topics):
            topic_words, _ = zip(*topic)
            print('{}: '.format(n) + ','.join(topic_words))
Example #10
        def __init__(self,
                     corpus,
                     n_topics,
                     stem=False,
                     anchors=None,
                     anchor_strength=None,
                     process=False,
                     verbose=False):
            '''
            Description:
            
                Class constructor
                
            Parameters:
            
                - corpus (List[String]) : the list of raw descriptions.
                - n_topics (Integer) : the default number of topics you're looking for. This parameter may be changed by the @tune() method.
                - anchors (List[String] | List[List[String]]) : chosen anchors for the CorEx models. Anchors should be specific to destinations, or at least to clusters of destinations.
                - anchor_strength (Integer) : the weight given to the anchors.
                - process (Boolean) : specify whether the corpus needs to be processed using the @process method.
                - verbose (Boolean) : specify whether we want information about the model while it is training.
                - stem (Boolean) : specify whether we want to use stemming in the processing step.
            '''

            self.corpus = corpus
            self.n_topics = n_topics
            self.model = ct.Corex(n_hidden=self.n_topics,
                                  anchors=anchors,
                                  anchor_strength=anchor_strength,
                                  verbose=verbose,
                                  process=process,
                                  seed=42)
            self.is_fitted = False

            if process:
                self.train_data = self.process_corpus(stem=stem)
            else:
                self.train_data = self.corpus

            self.vectorizer = TfidfVectorizer(max_df=.7,
                                              min_df=.01,
                                              max_features=None,
                                              ngram_range=(1, 2),
                                              norm=None,
                                              binary=True,
                                              use_idf=True,
                                              sublinear_tf=False)
Example #11
    def train(self,
              df,
              n_hidden=8,
              anchors=[["oil", "gas"]],
              anchor_strength=3):
        print('anchor_strength ', str(anchor_strength), '   n_hidden ',
              str(n_hidden))
        vectorizer = TfidfVectorizer(max_df=.5,
                                     min_df=10,
                                     max_features=None,
                                     ngram_range=(1, 2),
                                     norm=None,
                                     binary=True,
                                     use_idf=False,
                                     sublinear_tf=False,
                                     stop_words='english')
        vectorizer = vectorizer.fit(df['body'])
        tfidf = vectorizer.transform(df['body'])
        vocab = vectorizer.get_feature_names()

        # Anchors designed to nudge the model towards measuring specific genres

        anchors = [[a for a in topic if a in vocab] for topic in anchors]

        model = ct.Corex(n_hidden=n_hidden, seed=42)
        model = model.fit(
            tfidf,
            words=vocab,
            anchors=anchors,  # Pass the anchors in here
            anchor_strength=anchor_strength,  # Tell the model how much it should rely on the anchors
        )

        topic_hash = {}
        for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
            topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
            topic_hash[i] = topic_ngrams

        self._vectorizer = vectorizer
        self._model = model
        self._topic_hash = topic_hash
        self._oil_and_gas_topic_num = [
            topic_num for topic_num, topic_ngrams in self._topic_hash.items()
            if ('oil' in topic_ngrams) or ('gas' in topic_ngrams)
        ]
Example #12
def get_corex_topics(num_topics_list, docs, features, print_flag = False):
    """
    outputs correlation list for model selection
    """
    
    total_corr = []
    for i in num_topics_list:
        topic_model = ct.Corex(n_hidden=i, seed = 10)
        topic_model.fit(docs, words=features) 
        total_corr.append(topic_model.tc)
        
        if print_flag == True:
            topics = topic_model.get_topics()
            print('Num topics: ', i)
            for topic_n, topic in enumerate(topics):
                words,mis = zip(*topic)
                topic_str = str(topic_n+1)+': '+', '.join(words)
                print(topic_str)
            print('')

    return total_corr
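A hedged sketch of how the returned total-correlation list is typically used for model selection, assuming `docs` and `features` are the same document-term matrix and vocabulary passed above; the plotting with matplotlib is not part of the original snippet. The usual heuristic is to pick the number of topics where TC starts to plateau:

import matplotlib.pyplot as plt

num_topics_list = [5, 10, 15, 20, 25, 30]
total_corr = get_corex_topics(num_topics_list, docs, features)

plt.plot(num_topics_list, total_corr, marker='o')
plt.xlabel('Number of topics (n_hidden)')
plt.ylabel('Total correlation (TC)')
plt.show()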
Example #13
 def train_Corex(self,n_topics,data_df):
     #self.countvectorizer=countvectorizer
     self.data_df=data_df
     text_corpus=self.data_df[self.sentence_col].values.tolist()
     print(len(text_corpus))
     if len(text_corpus)==0:
         raise PipelineError('Please provide text corpus', 'This object provides advanced corex vectors.')
     if self.countvectorizer:
         doc_word=self.countvectorizer.transform(text_corpus)
     else:
         countvectorizer_obj=ClassicVectorizationTrain(countvectorizer=1)
         self.countvectorizer=countvectorizer_obj.get_countVectorizer(text_corpus=text_corpus)
         doc_word=self.countvectorizer.transform(text_corpus)
     doc_word=csr_matrix(doc_word)
     words = list(np.asarray(self.countvectorizer.get_feature_names()))
     not_digit_inds = [ind for ind,word in enumerate(words) if not word.isdigit()]
     doc_word = doc_word[:,not_digit_inds]
     words = [word for ind,word in enumerate(words) if not word.isdigit()]
     # Train the CorEx topic model with n_topics topics
     self.corex = ct.Corex(n_hidden=n_topics, words=words, max_iter=200, verbose=False, seed=1)
     self.corex.fit(doc_word, words=words)
     return dict(corex_model=self.corex, countvectorizer= self.countvectorizer)
Example #14
def fit_topics(
    dataset_label,
    doc_vectors,
    feature_names,
    titles,
    n_topics,
    anchors,
    anchor_strength=10,
    max_iter=25,
):
    """Apply Corex topic modelling to a set of document vectors,
    and save the model and output to disk.

    Args:
      dataset_label(str): Name of this dataset, for labelling the output files
      doc_vectors(np.array): Count (or equivalent) vector representation of the documents.
      feature_names(list): Names of each feature in the doc_vectors
      titles(list): Name of each document in doc_vectors
      n_topics(int): Number of topics for Corex to generate
      anchors(list of list): Corex anchor terms
      anchor_strength(int, optional): Corex anchor strength multiplier. Defaults to 10.
      max_iter(int, optional): Number of model iterations. Defaults to 25.

    Returns:
      topic_model: trained Corex topic model
    """
    topic_model = ct.Corex(max_iter=max_iter, n_hidden=n_topics)
    topic_model.fit(
        X=doc_vectors,
        words=feature_names,
        docs=titles,
        anchors=anchors,
        anchor_strength=anchor_strength,
    )
    # Use Corex tools for writing the data to the local directory
    label = make_model_label(dataset_label, n_topics, max_iter)
    vt.vis_rep(topic_model, column_label=feature_names, prefix=label)
    return topic_model
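A hedged usage sketch for `fit_topics`, assuming `doc_vectors`, `feature_names`, and `titles` have already been produced by an earlier count-vectorization step; the dataset label and anchor lists below are purely illustrative (anchor terms must exist in `feature_names`):

anchors = [
    ['solar', 'photovoltaic'],
    ['wind', 'turbine'],
]
topic_model = fit_topics(
    dataset_label='example_corpus',
    doc_vectors=doc_vectors,
    feature_names=feature_names,
    titles=titles,
    n_topics=20,
    anchors=anchors,
)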
Example #15
def params_search(fit_text, num_max_df, num_min_df, num_topics):
    '''
    Fit a TF-IDF vectorizer and an (unanchored) CorEx model with the given
    parameters, write the topic report via vt.vis_rep, and return a summary string.
    '''
    # Model Parameters
    vectorizer = TfidfVectorizer(strip_accents='ascii',
                                 encoding='unicode',
                                 max_df=num_max_df,
                                 min_df=num_min_df,
                                 max_features=None,
                                 ngram_range=(1, 2),
                                 norm=None,
                                 binary=True,
                                 use_idf=False,
                                 sublinear_tf=False)

    model = ct.Corex(n_hidden=num_topics, seed=42)

    # vectorizer
    vect_fit = vectorizer.fit(fit_text)
    tfidf = vectorizer.transform(fit_text)
    vocab = vect_fit.get_feature_names()

    anchors = []
    model = model.fit(tfidf, words=vocab)

    vt.vis_rep(model,
               column_label=vocab,
               prefix='./corex_models/{}-topic-model'.format(num_topics))

    model_tc = model.tc

    vect_print = 'Vect params: min_df={}, max_df={}'.format(
        num_min_df, num_max_df)
    corex_print = 'CorEx params: n_t={}, tc={}'.format(num_topics, model_tc)

    return vect_print + ' ' + corex_print
Example #16
#     print(params_search(all_tweets['text'], x1, num_min_df, num_topics))

# Model Parameters
vectorizer = TfidfVectorizer(strip_accents='ascii',
                             encoding='unicode',
                             max_df=num_max_df,
                             min_df=num_min_df,
                             max_features=None,
                             ngram_range=(1, 2),
                             norm=None,
                             binary=True,
                             use_idf=False,
                             sublinear_tf=False,
                             stop_words='english')

model = ct.Corex(n_hidden=num_topics, seed=42)

anchors = [
    'trump', ['win', 'giveaway'], 'vote',
    ['sanders', 'warren', 'biden', 'democratic',
     'buttigieg'], ['kobe', 'bryant'], 'book', ['super', 'bowl'],
    ['climate', 'change'], ['podcast', 'episode'], ['mental', 'health'],
    ['health', 'care'], 'coronavirus', 'australia', 'jesus',
    ['sexually', 'assaulted'], ['social', 'justice'], 'brexit',
    ['black', 'history'], ['conspiracy', 'theories'], ['trans', 'people'],
    'song', ['movie', 'film'], 'food', 'drink'
]
do_vect = True

if do_vect:
    print('Vectorizing tweets...', end='')
Example #17
def run_CorEx(documents, anchorList, n_topics, n_words_per_topic):
    """ Performs CorEx on corpus documents using anchorList.
        Returns topics as strings in topicList.
    """
    # CorEx uses a TF-IDF vectorization
    vectorizer = TfidfVectorizer(max_df=.5, min_df=10, max_features=None,
    ##    ngram_range=(1, 2),  for bi-grams
    ##    ngram_range=(1,3),   for bi-grams and tri-grams
        ngram_range=(1,1),     # for no bi-grams or tri-grams
        norm=None,
        binary=True,
        use_idf=False,
        sublinear_tf=False
    )

    # Fit chat corpus to TF-IDF vectorization
    vectorizer = vectorizer.fit(documents)
    tfidf = vectorizer.transform(documents)
    vocab = vectorizer.get_feature_names()

    # Apply CorEx with no anchors for a comparison
    anchors = []
    model = ct.Corex(n_hidden=n_topics, seed=42) # n_hidden specifies the # of topics
    model = model.fit(tfidf, words=vocab)

    # Display and write to file the results of CorEx with no anchors
    fileName = "CorEx_no_anchors_"+str(n_topics)+"topoics_"+str(n_words_per_topic)+"words.txt"
    outputFile = open(fileName, 'w')
    outputFile.write("File: " + fileName +"\n\n")

    print("\nCorEx Topics with no anchors:")
    for i, topic_ngrams in enumerate(model.get_topics(n_words=n_words_per_topic)):
        topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
        print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))
        outputFile.write("{}".format(" ".join(topic_ngrams))+"\n")
    outputFile.close()

    ## remove anchor words that are not in the chat corpus
    anchors = [
        [a for a in topic if a in vocab]
        for topic in anchorList
    ]

    model = ct.Corex(n_hidden=n_topics, seed=42)
    model = model.fit(
        tfidf,
        words=vocab,
        anchors=anchors, # Pass the anchors in here
        anchor_strength=3 # Tell the model how much it should rely on the anchors
    )

    # Display and write to file the results of CorEx with anchors
    fileName = "CorEx_anchors_"+str(len(anchors))+"_"+str(n_topics) \
               +"topics_"+str(n_words_per_topic)+"words.txt"
    outputFile = open(fileName, 'w')
    outputFile.write("File: " + fileName +"\n\n")
    topicList = []
    print("\nCorEx Topics with anchors:")
    for i, topic_ngrams in enumerate(model.get_topics(n_words=n_words_per_topic)):
        topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
        topicList.append(" ".join(topic_ngrams))
        print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))
        outputFile.write("{}".format(" ".join(topic_ngrams))+"\n")
    outputFile.close()    
    return topicList
Example #18
#%%

obj2 = instance.optimize_selectivity(bnds=(0.1, 0.9))
prediction = instance.predict(instance.X, selectivity=obj2.x)
sc2 = evaluate(docs_df, prediction, returnscore=True)
#%%
prediction = instance.predict(instance.X)
sc1 = evaluate(docs_df, prediction, returnscore=True)
#delta(sc1,sc2,returnmoy=True)

#%%
# We will use this model as the initial model (an approximate ground truth
# used to refine everything)
k = 2
topic_model = ct.Corex(n_hidden=len(enjeux_list))
topic_model.fit(instance.X,
                words=instance.vocab,
                anchors=instance.thesau_list,
                anchor_strength=k)
mat = topic_model.labels
sc2 = evaluate(docs_df, mat, returnscore=True)
delta(sc1, sc2, returnmoy=True)
#%%
# We will try to optimize by doing stratified sampling
y_true, X_sub, y_pred = separate(instance.docs,
                                 instance.X,
                                 prediction=instance.predict(instance.X))

from sklearn.metrics import label_ranking_loss
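Example #19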
def predict_final():

    t1 = [request.args.get('topic10'), request.args.get('topic11'), request.args.get('topic12')]
    t1 = [s.lower() for s in t1 if s]
    t2 = [request.args.get('topic20'), request.args.get('topic21'), request.args.get('topic22')]
    t2 = [s.lower() for s in t2 if s]
    t3 = [request.args.get('topic30'), request.args.get('topic31'), request.args.get('topic32')]
    t3 = [s.lower() for s in t3 if s]
    t4 = [request.args.get('topic40'), request.args.get('topic41'), request.args.get('topic42')]
    t4 = [s.lower() for s in t4 if s]
    t5 = [request.args.get('topic50'), request.args.get('topic51'), request.args.get('topic52')]
    t5 = [s.lower() for s in t5 if s]
    anchors = [t1, t2, t3, t4, t5]

    infile = open('stopwords_final', 'rb')
    stopwords = pickle.load(infile)
    infile.close()
    df = pd.read_pickle('cm_19_06')
    df.drop_duplicates(subset='body', keep=False, inplace=True)
    df = df[df['author'] != 'Ilackfocus']
    df = df[df['author'] != '[deleted]']
    df = df[df['author'] != 'AutoModerator']

    token_pattern_no_number = u'(?ui)\\b\\w*[a-zA-Z]+\\w*\\b'
    vectorizer_corex = CountVectorizer(stop_words=stopwords,
                                       binary=True,
                                       token_pattern=token_pattern_no_number,
                                       ngram_range=(1, 2),
                                       max_df=0.5,
                                       min_df=2,
                                       max_features=20000)
    c_word = vectorizer_corex.fit_transform(df['body'])
    vocab = vectorizer_corex.get_feature_names()

    ct_model = ct.Corex(n_hidden=5, seed=42)
    c_model_fitted = ct_model.fit(c_word, words=vocab, anchors=anchors, anchor_strength=4)

    topic_dist = []
    topic_count = np.asarray(c_model_fitted.labels).sum(axis=0)
    for i, count in enumerate(topic_count):
        topic_dist.append(round((count / len(c_model_fitted.labels)) * 100, 2))

    wd = []
    for i, topic_ngrams in enumerate(c_model_fitted.get_topics(n_words=10)):
        wd.append([ngram[0] for ngram in topic_ngrams if ngram[1] > 0])
    top_df = pd.DataFrame(data={'Topic': [1, 2, 3, 4, 5],
                                '% of all comments': topic_dist})
    top_df['Keywords'] = pd.Series(anchors)
    top_df['Top Words'] = pd.Series(wd)
    top_df.set_index('Topic', inplace=True)

    df['Topic1'] = c_model_fitted.labels[:, 0]
    df['Topic2'] = c_model_fitted.labels[:, 1]
    df['Topic3'] = c_model_fitted.labels[:, 2]
    df['Topic4'] = c_model_fitted.labels[:, 3]
    df['Topic5'] = c_model_fitted.labels[:, 4]

    c1 = df[df['Topic1'] == True][['created_utc', 'author', 'score', 'body',
                                   'Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5']].sample(5)
    c2 = df[df['Topic2'] == True][['created_utc', 'author', 'score', 'body',
                                   'Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5']].sample(5)
    c3 = df[df['Topic3'] == True][['created_utc', 'author', 'score', 'body',
                                   'Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5']].sample(5)
    c4 = df[df['Topic4'] == True][['created_utc', 'author', 'score', 'body',
                                   'Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5']].sample(5)
    c5 = df[df['Topic5'] == True][['created_utc', 'author', 'score', 'body',
                                   'Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5']].sample(5)

    comment_df = pd.concat([c1, c2, c3, c4, c5], ignore_index=True)

    comment_df['date'] = pd.to_datetime(comment_df['created_utc'], unit='s').dt.strftime('%m/%d/%Y')

    comment_df = comment_df[['date', 'author', 'score', 'body',
                             'Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5']]


    return flask.render_template('predict_final.html', table1=[top_df.to_html(table_id='cm')],
                                 table2=[comment_df.to_html(table_id='sc')])
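Example #20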
# Check if topic word exists
for topic in topic_list:
    anchor = []
    topic_words = topic.split('\n')
    for topic_word in topic_words:
        if not topic_word.strip() in features_df['KeywordLabel'].tolist():
            pdb.set_trace()
        else:
            anchor.append(topic_word.strip())
    anchors.append(anchor)

# Sparse matrices are also supported
X = ss.csr_matrix(data)

# Train the CorEx topic model
topic_model = ct.Corex(n_hidden=len(
    anchors))  # Define the number of latent (hidden) topics to use.

topic_model.fit(X,
                docs=segment_df.updated_id.tolist(),
                words=features,
                anchors=anchors,
                anchor_strength=10)

topics = topic_model.get_topics()

for topic_n, topic in enumerate(topics):
    words, mis = zip(*topic)
    topic_str = str(topic_n) + ': ' + ','.join(words)
    print(topic_str)

top_docs = topic_model.get_top_docs()
Example #21
        if Case == '1':
            cut_case = input('1.jieba or 2.monpa')
            words_list, Label = cut(rows, stop_word_list, cut_case)
            cPickle.dump(words_list,
                         open('words_list_' + cut_case + '.pkl', 'wb'))
        elif Case == '2':
            words_list = cPickle.load(open('words_list_2.pkl', 'rb'))

        anchor = get_anchor()

        for j in range(100):
            vectorizer = CountVectorizer(
                token_pattern='\\b\\w+\\b')  # originally only tokens of 2+ characters were used; changed so single-character tokens are also allowed
            X = vectorizer.fit_transform(words_list)
            words = list(np.asarray(vectorizer.get_feature_names()))
            topic_model = ct.Corex(n_hidden=N_topic, words=words, seed=3)
            # topic_model = cPickle.load(open('model_monpa.pkl', 'rb'))

            if j > 0:
                for k in range(N_topic):  # separate loop variable so the outer j is not shadowed
                    anchor_words = input(
                        'input topic_%s\'s anchor words (split with space):\n' %
                        str(k + 1))
                    t = anchor_words.split()
                    for word in t:
                        anchor[k].append(word)
                    print('anchors added this round:', t)
            print(anchor)
            topic_model.fit(X, words=words, anchors=anchor,
                            anchor_strength=4)  # the anchors are currently set manually
            # cPickle.dump(topic_model, open('model.pkl', 'wb'))
Example #22

acsdata = read_articles("txt-files/research")
data_words = []
for file in acsdata:
    data_words.append(cleanText(file))

data = [' '.join(words) for words in data_words]
data_train, data_test, idx_train, idx_test = train_test_split(data, range(len(data)), \
                                                                  test_size=0.2, random_state=0)

id2word = corpora.Dictionary(data_words)
texts = data_words
corpus = [id2word.doc2bow(text) for text in texts]

vectorizer = CountVectorizer(stop_words='english', binary=True)
doc_word = vectorizer.fit_transform(data_train)
doc_word = ss.csr_matrix(doc_word)

words = list(np.asarray(vectorizer.get_feature_names()))

topic_model = ct.Corex(n_hidden= 25, words=words, max_iter=200, verbose=False)#, seed=1)
topic_model.fit(doc_word, words=words);
coherence = c_v_topic_coherence(topic_model, corpus = corpus, texts = texts, dictionary = id2word, topn = 20)
print('Coherence Score: ', coherence)

topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_ = zip(*topic)
    print('{}: '.format(n) + ', '.join(topic_words))
Example #23
    def fit(self, df=None, **kwargs):
        """
        Fits a model using the method specified when initializing the ``TopicModel``. Details on model-specific \
        parameters are below:

        **sklearn_lda**

        Fits a model using :py:class:`sklearn.decomposition.LatentDirichletAllocation`. For more information on \
        available parameters, please refer to the official documentation: \
        https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param alpha: Represents document-topic density. When values are higher, documents will be comprised of more \
        topics; when values are lower, documents will be primarily comprised of only a few topics. This parameter is \
        used instead of the doc_topic_prior sklearn parameter, and will be passed along to sklearn using the formula: \
        ``doc_topic_prior = alpha / num_topics``
        :param beta: Represents topic-word density. When values are higher, topics will be comprised of more words; \
        when values are lower, only a few words will be loaded onto each topic. This parameter is used instead of the \
        topic_word_prior sklearn parameter, and will be passed along to sklearn using the formula: \
        ``topic_word_prior = beta / num_topics``.
        :param learning_decay: See sklearn documentation.
        :param learning_offset: See sklearn documentation.
        :param learning_method: See sklearn documentation.
        :param max_iter: See sklearn documentation.
        :param batch_size: See sklearn documentation.
        :param verbose: See sklearn documentation.

        **sklearn_nmf**

        Fits a model using :py:class:`sklearn.decomposition.NMF`. For more information on available parameters, \
        please refer to the official documentation: \
        https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param alpha: See sklearn documentation.
        :param l1_ratio: See sklearn documentation.
        :param tol: See sklearn documentation.
        :param max_iter: See sklearn documentation.
        :param shuffle: See sklearn documentation.

        **gensim_lda**

        Fits an LDA model using :py:class:`gensim.models.LdaModel` or \
        :py:class:`gensim.models.ldamulticore.LdaMulticore`. \
        When ``use_multicore`` is set to True, the multicore implementation will be used, otherwise the standard \
        LDA implementation will be used. \
        For more information on available parameters, please refer to the official documentation below:

            - use_multicore=True: https://radimrehurek.com/gensim/models/ldamulticore.html
            - use_multicore=False: https://radimrehurek.com/gensim/models/ldamodel.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param alpha: Represents document-topic density. When values are higher, documents will be comprised of \
        more topics; when values are lower, documents will be primarily comprised of only a few topics. Gensim \
        options are a bit different than sklearn though; refer to the documentation for the accepted values here.
        :param beta: Represents topic-word density. When values are higher, topics will be comprised of more words; \
        when values are lower, only a few words will be loaded onto each topic. Gensim options are a bit different \
        than sklearn though; refer to the documentation for the accepted values here. Gensim calls this parameter \
        ``eta``. We renamed it to be consistent with the sklearn implementations.
        :param chunksize: See gensim documentation.
        :param passes: See gensim documentation.
        :param decay: See gensim documentation.
        :param offset: See gensim documentation.
        :param workers: Number of cores to use (if using multicore)
        :param use_multicore: Whether or not to use multicore

        **gensim_hdp**

        Fits an HDP model using the gensim implementation. Unlike LDA and NMF, HDP attempts to auto-detect the
        correct number of topics. In practice, it actually fits ``T`` topics (default is 150), but many are extremely rare
        or occur in only a very small number of documents. To identify the topics that are actually useful, this function
        passes the original :py:class:`pandas.DataFrame` through the trained model after fitting, and identifies \
        topics that compose at least 1% of a document in at least 1% of all documents in the corpus. In other words, \
        topics are thrown out if the number of documents in which they appear at a rate of at least 1% is fewer than 1% of \
        the total number of documents. Subsequent use of the model will only make use of topics that meet this \
        threshold. For more information on available parameters, please refer to the official documentation: \
        https://radimrehurek.com/gensim/models/hdpmodel.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param max_chunks: See gensim documentation.
        :param max_time: See gensim documentation.
        :param chunksize: See gensim documentation.
        :param kappa: See gensim documentation.
        :param tau: See gensim documentation.
        :param T: See gensim documentation.
        :param K: See gensim documentation.
        :param alpha: See gensim documentation.
        :param beta: See gensim documentation.
        :param gamma: See gensim documentation.
        :param scale: See gensim documentation.
        :param var_converge: See gensim documentation.

        **corex**

        Fits a CorEx topic model. Anchors can be provided in the form of a list of lists, with each item
        corresponding to a set of words to be used to seed a topic. For example:

        .. code-block:: python

            anchors=[
                ['cat', 'kitten'],
                ['dog', 'puppy']
            ]

        The list of anchors cannot be longer than the specified number of topics, and all of the words must
        exist in the vocabulary. The ``anchor_strength`` parameter determines how strongly the anchor words are weighted
        relative to the data; providing higher values is a way of "insisting" more strongly
        that the model keep the provided words together in a single topic. For more information on available \
        parameters, please refer to the official documentation: https://github.com/gregversteeg/corex_topic

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param anchors: A list of lists that contain words that the model should try to group together into topics
        :param anchor_strength: The degree to which the provided anchors should be preserved regardless of the data

        """

        fit_params = self.get_fit_params(**kwargs)

        if self.method in ["sklearn_lda", "sklearn_nmf"]:

            if self.method == "sklearn_lda":
                self.model = LatentDirichletAllocation(
                    n_components=self.num_topics, **fit_params)
            if self.method == "sklearn_nmf":
                self.model = NMF(n_components=self.num_topics, **fit_params)

            if is_not_null(df):
                features = self.get_features(df)
            else:
                features = self.train_features
            self.model.fit(features)

        elif self.method in ["gensim_lda", "gensim_hdp"]:

            vocab_dict = dict([(i, s) for i, s in enumerate(self.ngrams)])
            if is_not_null(df):
                features = self.get_features(df, keep_sparse=True)
            else:
                features = self.train_features
            matrix = gensim.matutils.Sparse2Corpus(features,
                                                   documents_columns=False)

            if self.method == "gensim_lda":
                fit_params["num_topics"] = self.num_topics
                fit_params["id2word"] = vocab_dict
                if fit_params["use_multicore"]:
                    model_class = gensim.models.ldamulticore.LdaMulticore
                else:
                    model_class = gensim.models.LdaModel
                    del fit_params["workers"]
                del fit_params["use_multicore"]
                self.model = model_class(**fit_params)
                self.model.update(matrix)
            elif self.method == "gensim_hdp":
                model_class = gensim.models.hdpmodel.HdpModel
                self.model = model_class(matrix, vocab_dict, **fit_params)
                doc_topics = self.get_document_topics(self.df)
                topics = ((doc_topics >= 0.01).astype(int).mean() >=
                          0.01).astype(int)
                self.topic_ids = [
                    int(col.split("_")[-1])
                    for col in topics[topics == 1].index
                    if col.startswith("topic_")
                ]
                self.num_topics = len(self.topic_ids)

        elif self.method == "corex":

            if is_not_null(df):
                features = self.get_features(df, keep_sparse=True)
            else:
                features = self.get_features(self.train_df, keep_sparse=True)
            self.model = corextopic.Corex(n_hidden=self.num_topics)
            self.model.fit(features, words=self.ngrams, **fit_params)
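A hedged usage sketch for the CorEx branch above, assuming `tm` is a TopicModel instance configured with method="corex" and `df` contains `tm.text_col`; the anchors follow the pattern shown in the docstring:

tm.fit(
    df,
    anchors=[
        ['cat', 'kitten'],
        ['dog', 'puppy'],
    ],
    anchor_strength=5,
)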
Example #24
df_top = df_product[df_p1_top_mask]

df_p1_bot_mask = (df_product['vader'] < -0.25) & (df_product['nps'] == -1)
df_bot = df_product[df_p1_bot_mask]

# In[24]:

cor_vectorizer = CountVectorizer(max_features=20000,
                                 ngram_range=(1, 2),
                                 binary=True,
                                 token_pattern="\\b[a-z][a-z]+\\b",
                                 stop_words='english')

cor_doc_word_top = cor_vectorizer.fit_transform(df_top['nltk_terms'])
cor_words = list(np.asarray(cor_vectorizer.get_feature_names()))
topic_model_top = ct.Corex(n_hidden=6, words=cor_words, seed=1)
topic_model_top.fit(cor_doc_word_top, words=cor_words, docs=df_top.nltk_terms)

# repeat the process for the bottom reviews:

cor_doc_word_bot = cor_vectorizer.fit_transform(df_bot['nltk_terms'])
cor_words = list(np.asarray(cor_vectorizer.get_feature_names()))
topic_model_bot = ct.Corex(n_hidden=6, words=cor_words,
                           seed=1)  # must be repeated
topic_model_bot.fit(cor_doc_word_bot, words=cor_words, docs=df_bot.nltk_terms)

# In[25]:

# Print all topics from the top topic model:

topics = topic_model_top.get_topics()
Example #25
import multiprocessing ########
cores = multiprocessing.cpu_count() ########


overall_list=list(itertools.chain.from_iterable(text_list))

model = Word2Vec([overall_list], min_count=1, iter=3, sg=1, hs=1, negative=2) #workers=cores, vector_size=100, window=8 #my current computer has 8 cores
#min_count: ignores all words with total frequency lower than this
#sg=0 is CBOW, sg=1 is Skip-gram
#hs=1 employs hierarchical softmax
#negative > 0 employs negative sampling. 2-5 for large datasets, 5-20 for small datasets


# Train the CorEx topic model
topic_model = ct.Corex(n_hidden=10, seed=seed)  # Define the number of latent (hidden) topics to use.
topic_model.fit(X, words=all_vocabs, docs=topics)

top_docs = topic_model.get_top_docs()

print ('\n')
print ('COREX TOP DOCUMENTS:')

for topic_n, topic_docs in enumerate(top_docs):
    docs,probs = zip(*topic_docs)
    topic_str = str('Topic ') + str(topic_n+1)+': ' + ''.join(str(docs))
    print(topic_str)
    
print ('\n')
print ('xxx')
print ('\n')
Example #26
# Remove those entries that were removed from document term matrix
for i in sorted(entries_for_remove, reverse=True):
    del chunked_bows[i]

# Create a list from the features (features_list) so that they can be reused
features_list = [element[1] for element in gensim_dic.iteritems()]

# Transform the document term matrix into a binary matrix
doc_word = np.where(matrix_documents > 0, 1, 0)

print("\n")
print ("The following topics were extracted from the Anglo-Saxon Chronicle")
print("\n")
# Run Corex topic modelling
topic_model = ct.Corex(n_hidden=20, max_iter=200, verbose=False, seed=8)
topic_model.fit(np.matrix(doc_word), words=features_list)

# Print document topic matrix: topic_model.log_p_y_given_x

# Print the key topics
topics = topic_model.get_topics()
topics_to_print = []
for n,topic in enumerate(topics):
    topic_words,_ = zip(*topic)
    topics_words_values = []
    for element in topic:
        topics_words_values.append(element[0] + ' (' +
                                   str(np.round(element[1], decimals=3)) + ')')
    topics_to_print.append(','.join(topics_words_values))
    print('{}: '.format(n) + ','.join(topic_words))
Example #27
#%%
model.fit(X, seed_topics=seed_topics, seed_confidence=0.9)
#%%
n_top_words = 20
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                             1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))
# %%
import numpy as np
import scipy.sparse as ss
from corextopic import corextopic as ct

# Train the CorEx topic model
topic_model = ct.Corex(n_hidden=len(
    enjeux_list))  # Define the number of latent (hidden) topics to use.
topic_model.fit(X, words=vocab)
# %%
topics = topic_model.get_topics()
for topic_n, topic in enumerate(topics):
    # w: word, mi: mutual information, s: sign
    topic = [(w, mi, s) if s > 0 else ('~' + w, mi, s) for w, mi, s in topic]
    # Unpack the info about the topic
    words, mis, signs = zip(*topic)
    # Print topic
    topic_str = str(topic_n + 1) + ': ' + ', '.join(words)
    print('\n' + topic_str)
# %%
import scipy.sparse as ss
from corextopic import corextopic as ct
Example #28
def run():
    s3_path_in = os.environ['BATCHPAR_s3_path_in']
    n_hidden = int(literal_eval(os.environ['BATCHPAR_n_hidden']))

    # Load and shape the data
    s3 = boto3.resource('s3')
    s3_obj_in = s3.Object(*parse_s3_path(s3_path_in))
    data = json.load(s3_obj_in.get()['Body'])

    # Pack the data into a sparse matrix
    ids = []  # Index of each row
    indptr = [0]  # Number of non-null entries per row
    indices = []  # Positions of non-null entries per row
    counts = []  # Term counts/weights per position
    vocab = {}  # {Term: position} lookup
    for row in data:
        ids.append(row.pop('id'))
        for term, count in row.items():
            idx = vocab.setdefault(term, len(vocab))
            indices.append(idx)
            counts.append(count)
        indptr.append(len(indices))
    X = csr_matrix((counts, indices, indptr), dtype=int)

    # {Position: term} lookup
    _vocab = {v: k for k, v in vocab.items()}

    # Fit the model
    topic_model = ct.Corex(n_hidden=n_hidden)
    topic_model.fit(X)
    topics = topic_model.get_topics()

    # Generate topic names
    topic_names = {
        f'topic_{itop}': [_vocab[idx] for idx, weight in topic]
        for itop, topic in enumerate(topics)
    }

    # Calculate topic weights as sum(bool(term in doc)*{term_weight})
    rows = [{
        f'topic_{itop}':
        sum(row.getcol(idx).toarray()[0][0] * weight for idx, weight in topic)
        for itop, topic in enumerate(topics)
    } for row in X]
    # Zip the row indexes back in, and ignore small weights
    rows = [
        dict(id=id, **{k: v
                       for k, v in row.items() if v > WEIGHT_THRESHOLD})
        for id, row in zip(ids, rows)
    ]

    # Curate the output
    output = {
        'loss': topic_model.tc,
        'data': {
            'topic_names': topic_names,
            'rows': rows
        }
    }

    # Mark the task as done and save the data
    if "BATCHPAR_outinfo" in os.environ:
        s3_path_out = os.environ["BATCHPAR_outinfo"]
        s3 = boto3.resource('s3')
        s3_obj = s3.Object(*parse_s3_path(s3_path_out))
        s3_obj.put(Body=json.dumps(output))
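For reference, a sketch (illustrative values only) of the input format the packing loop above assumes: each row of the loaded JSON is an object with an `id` field plus term: count pairs, for example:

data = [
    {'id': 'doc-1', 'topic': 3, 'model': 2, 'corex': 1},
    {'id': 'doc-2', 'anchor': 1, 'strength': 4},
]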
Example #29
                  'segment_keyword_matrix_merged_birkenau.txt',
                  dtype=int)
features_df = pd.read_csv(input_directory +
                          'keyword_index_merged_segments_birkenau.csv')
segment_df = pd.read_csv(input_directory + 'segment_index_merged_birkenau.csv')
features = features_df['KeywordLabel'].values.tolist()
node_filters = constants.output_data_filtered_nodes + "node_filter_1_output.json"

# Sparse matrices are also supported
X = ss.csr_matrix(data)
# Word labels for each column can be provided to the model
# Document labels for each row can be provided

#anchors=['camp adaptation methods']
# Train the CorEx topic model
topic_model = ct.Corex(
    n_hidden=18)  # Define the number of latent (hidden) topics to use.

topic_model.fit(X, docs=segment_df.updated_id.tolist(), words=features)

topics = topic_model.get_topics()
for topic_n, topic in enumerate(topics):
    words, mis = zip(*topic)
    topic_str = str(topic_n + 1) + ': ' + ','.join(words)
    print(topic_str)

top_docs = topic_model.get_top_docs()
for topic_n, topic_docs in enumerate(top_docs):
    docs, probs = zip(*topic_docs)
    docs = [str(element) for element in docs]
    topic_str = str(topic_n + 1) + ': ' + ','.join(docs)
    print(topic_str)
Example #30
    vectorizer = TfidfVectorizer(max_df=.5,
                                 min_df=5,
                                 max_features=None,
                                 ngram_range=(1, 2),
                                 norm=None,
                                 binary=True,
                                 use_idf=True,
                                 sublinear_tf=False)

    vectorizer = vectorizer.fit(data_df['lemma'])
    tfidf = vectorizer.transform(data_df['lemma'])
    vocab = vectorizer.get_feature_names()
    N_TOPICS = 10

    anchors = []
    model = ct.Corex(n_hidden=N_TOPICS, seed=42)
    model = model.fit(tfidf, words=vocab)

    for i, topic_ngrams in enumerate(model.get_topics(n_words=20)):
        topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
        st.write("Topic #{}: {}".format(i + 1, ", ".join(topic_ngrams)))

    import scattertext as scatter_text
    st.header("Scatter Text")

    def get_scattertext_corpus(df,
                               dep_data_col,
                               group1_name,
                               group2_name,
                               lang="en"):