Example No. 1
# NOTE: the original snippet is truncated. The imports and the class/constructor
# header below are reconstructed from how sohu_corpus is used in __main__, and
# __iter__ is assumed to yield bag-of-words vectors (the raw lines the truncated
# snippet yielded cannot be serialized by MmCorpus).
import codecs
import os

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel


class sohu_corpus(object):
    def __init__(self, fname, dic):
        self.fname = fname
        self.dic = dic

    def __iter__(self):
        for line in codecs.open(filename=self.fname,
                                mode='r',
                                encoding='utf-8'):
            # allow_update=True lets the initially empty dictionary grow as new tokens appear.
            yield self.dic.doc2bow(line.split(), allow_update=True)


if __name__ == '__main__':

    training_file_path = 'E:/2017_Deep_learning/text similarity'
    #training_file_path = './'

    # Lsi model
    dictionary = Dictionary()
    corpus = sohu_corpus(fname=os.path.join(
        training_file_path, 'sohu_text_similarity_training.corpus'),
                         dic=dictionary)

    # save dictionary
    #dictionary.save(os.path.join(training_file_path, '07_11_dictionary.dict'))
    MmCorpus.serialize(os.path.join(training_file_path, '01_16_corpus_12.mm'),
                       corpus)
    #dictionary = Dictionary.load(os.path.join(training_file_path, '07_11_dictionary.dict'))
    corpus_tfidf_mm = MmCorpus(
        os.path.join(training_file_path, '01_16_corpus_12.mm'))

    # convert counts to tfidf
    tfidf = TfidfModel(corpus=corpus_tfidf_mm)
    corpus_tfidf = tfidf[corpus_tfidf_mm]
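
    # The "# Lsi model" comment above points at a step the truncated snippet never
    # reaches; a minimal sketch of it (num_topics and the output filename are assumptions):
    from gensim.models import LsiModel

    lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=300)
    lsi.save(os.path.join(training_file_path, '01_16_lsi_300.model'))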
Example No. 2
def get_people(txtstream, my_nlp):
    """Return, for each text in txtstream, the spaCy entities labeled PERSON."""
    people = []
    for txt in txtstream:
        doc = my_nlp(txt)
        ppl = [ent for ent in doc.ents if ent.label_ == "PERSON"]
        people.append(ppl)
    return people # set(people)

if __name__ == "__main__":

    #
    # GENSIM TOPIC APPROACH
    #

    dictionary = Dictionary(token_stream(NOVELS_DIRPATH))
    dictionary.filter_extremes(no_below=10, no_above=0.66) # excludes terms like "the", "to", "and", "of", "i", etc.
    print("-------------")
    print("TOKENS", len(dictionary.token2id), list(dictionary.token2id.items())[0:4], "...")

    bags_of_words = [dictionary.doc2bow(tokens) for tokens in token_stream(NOVELS_DIRPATH)]
    print("-------------")
    print("BAGS OF WORDS (CORPUS)", len(bags_of_words), bags_of_words[0])

    lda = LdaMulticore(corpus=bags_of_words, id2word=dictionary, random_state=723812, num_topics=15, passes=10, workers=4)
    print("-------------")
    print("LDA MODEL", type(lda))

    results = lda.print_topics()
    print("-------------")
    print("TOPICS (RAW RESULTS)...")
Example No. 3
def run_tm(topics, below, above, chunksize, passes, iterations):

    m, valid = arevalid(topics, below, above, chunksize, passes, iterations)
    if not valid:

        fehlerfenster = Toplevel()
        fehlerfenster.title('Fehler')
        fehlerfenster.geometry('300x300')
        # Label with the error message
        labelfehler = Label(master=fehlerfenster, text=m)
        labelfehler.place(x=10, y=10, width=300, height=300)

    else:

        with open('../data/docs', 'rb') as f:
            docs = pickle.load(f)

        tweet_dictionary = Dictionary(docs)
        tweet_dictionary.filter_extremes(no_below=int(below),
                                         no_above=float(above))
        tweet_dictionary.save('../data/tweet_dictionary')

        ngram_docs = ngrams(input_docs=docs)
        corpus = make_bow_corpus(tweet_dictionary, ngram_docs)
        with open('../data/bow_corpus', 'wb') as f:
            pickle.dump(corpus, f)
        print('Number of unique tokens: %d' % len(tweet_dictionary))
        print('Number of documents: %d' % len(corpus))
        """Training parameters."""
        # Number of topics; kept relatively low so the topics are easier to
        # interpret (can be set higher).
        num_topics = int(topics)
        # Number of documents fed to the training algorithm per update (we have 7).
        chunk_size = int(chunksize)
        passes = int(passes)  # Number of times trained on the entire corpus
        iterations = int(iterations)  # Number of loops over each document
        eval_every = None  # Don't evaluate model perplexity, takes too much time.
        """ Make a index to word dictionary."""
        temp = tweet_dictionary[0]  # This is only to "load" the dictionary.
        id2word = tweet_dictionary.id2token
        """Create model
        We set alpha = 'auto' and eta = 'auto'. Again this is somewhat technical, but essentially we are automatically learning
        two parameters in the model that we usually would have to specify explicitly."""
        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         chunksize=chunk_size,
                         alpha='auto',
                         eta='auto',
                         iterations=iterations,
                         num_topics=num_topics,
                         passes=passes,
                         eval_every=eval_every)
        model_file = '../data/model/LDA_model_v1'
        model.save(model_file)
        """ Tests """
        # Top topics
        # The default number of words per topic is 20; the input is our corpus in BoW format.
        top_topics = model.top_topics(corpus)

        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        """Topic Coherence measures score a single topic by measuring the degree of semantic similarity between high scoring 
        words in the topic. These measurements help distinguish between topics that are semantically interpretable topics and 
        topics that are artifacts of statistical inference """
        avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)

        pprint(top_topics)
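
        # Usage sketch (an addition, not in the original function): the model saved
        # above can be reloaded later with gensim's standard load API.
        reloaded = LdaModel.load(model_file)
        print('Reloaded model with %d topics.' % reloaded.num_topics)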
Example No. 4
from util.TextSimilarity import TextSimilarity
from util.TaskReader import TaskReader

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

print("LDA Output: ")

first_num = 244

task = TaskReader.read("text.txt")
similarity = TextSimilarity('french')
doc_set = similarity.get_modified_text(task.text)
edu_set = similarity.get_modified_text(task.education)

dictionary = Dictionary([doc.split() for doc in edu_set])
for i, doc in enumerate(doc_set):
    num = i + first_num
    corpus = [dictionary.doc2bow(doc.split())]
    ldamodel = LdaModel(corpus, num_topics=1, id2word=dictionary, passes=50)
    for _, topic in ldamodel.print_topics(num_topics=1, num_words=6):
        print("Topic № " + str(num) + " : " + topic)
Example No. 5
                labels.append(label_id)

print('Found %s texts.' % len(texts))

# Tokenize the texts using gensim.

tokens = list()
for text in texts:
    tokens.append(simple_preprocess(text))

# Vectorize the text samples into a 2D integer tensor.

MAX_NUM_WORDS = 10000 # 2 words reserved: 0=pad, 1=oov
MAX_SEQUENCE_LENGTH = 1000

dictionary = Dictionary(tokens)
dictionary.filter_extremes(no_below=0, no_above=1.0,
                           keep_n=MAX_NUM_WORDS-2)

word_index = dictionary.token2id
print('Found %s unique tokens.' % len(word_index))

data = [dictionary.doc2idx(t) for t in tokens]  # unknown tokens map to -1 by default

# Truncate and pad sequences.

data = [i[:MAX_SEQUENCE_LENGTH] for i in data]
data = np.array([np.pad(i, (0, MAX_SEQUENCE_LENGTH-len(i)),
                        mode='constant', constant_values=-2)
                 for i in data], dtype=int)
data = data + 2  # shift so that padding (-2) becomes 0 and OOV (-1) becomes 1
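
# Sanity check (an addition, not in the original script): after the +2 shift,
# padding (-2) maps to 0 and doc2idx's unknown index (-1) maps to 1, so every
# value fits inside the MAX_NUM_WORDS-sized vocabulary.
assert data.shape == (len(texts), MAX_SEQUENCE_LENGTH)
assert data.min() >= 0 and data.max() < MAX_NUM_WORDS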
Example No. 6
    def lda(self,
            cat_list: list,
            below: int = 100,
            above: float = 0.1,
            eta: float = 0.9):

        assert set(cat_list).issubset(set(self.table.category.unique()))

        df_topic2 = self.table[self.table.category.isin(
            cat_list)].reset_index().iloc[:, 1:]
        instances = df_topic2.clean_text.apply(str.split)
        d = Dictionary(instances)
        print("Dictionary is:", d)
        d.filter_extremes(no_below=below, no_above=above)
        print("Dictionary after filtering:", d)
        ldacorpus = [d.doc2bow(text) for text in instances]
        tfidfmodel = TfidfModel(ldacorpus)
        model_corpus = tfidfmodel[ldacorpus]
        num_topics = len(df_topic2.groupby(['category']).count())
        temp = df_topic2.groupby(['category']).count()
        prior_probabilities = temp["app"] / temp["app"].sum()
        alpha = prior_probabilities.values
        print("Prior probabilities of the topics -alpha- are:", alpha)
        num_passes = 10
        chunk_size = len(model_corpus) * num_passes // 200  # keep chunksize an integer
        print("Preliminary steps to prepare the model done")
        model = LdaMulticore(
            num_topics=num_topics,  # number of topics
            corpus=model_corpus,  # what to train on 
            id2word=d,  # mapping from IDs to words
            workers=min(10, multiprocessing.cpu_count() - 1),  # at most 10 workers, capped at cores - 1
            passes=num_passes,  # make this many passes over data
            chunksize=chunk_size,  # update after this many instances
            alpha=alpha,
            eta=eta,
            random_state=5)
        print("Model is ready")
        topic_corpus = model[model_corpus]
        topic_sep = re.compile(r"0\.[0-9]{3}\*")
        model_topics = [(topic_no, re.sub(topic_sep, '',
                                          model_topic).split(' + '))
                        for topic_no, model_topic in model.print_topics(
                            num_topics=num_topics, num_words=5)]

        descriptors = []
        for i, m in model_topics:
            print(i + 1, ", ".join(m[:3]))
            descriptors.append(", ".join(m[:2]).replace('"', ''))
        print(descriptors)
        # NOTE: this assumes topic_corpus returns a probability for every topic per
        # document; with gensim's default minimum_probability, low-weight topics are
        # dropped and rows may not align with `descriptors`.
        scores = [[t[1] for t in topic_corpus[entry]]
                  for entry in range(len(instances))]
        topic_distros = pd.DataFrame(data=scores, columns=descriptors)
        topic_distros['category'] = df_topic2['category']
        #%matplotlib inline

        print("Preparing graph")

        sns.set_context('poster')

        fig, ax = plt.subplots(figsize=(20, 10))

        aggregate_by_category = topic_distros.groupby(
            topic_distros.category).mean()

        aggregate_by_category[descriptors].plot.bar(ax=ax)

        fig.set_size_inches(30, 30)
        plt.legend(loc='center left',
                   bbox_to_anchor=(1.0, 0.5),
                   prop={'size': 25})
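
        # Not in the original method: outside a notebook the figure set up above is
        # never rendered or written to disk, so something like this is assumed:
        plt.tight_layout()
        fig.savefig('topic_distribution_by_category.png')  # hypothetical filename
        plt.show()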
Example No. 7
STEP_SIZE = 200

# Load Data - corp.pkl contains data_lemmatized, id2word, corpus
with open('corp.pkl', 'rb') as f:
    data_lemmatized, _, _ = pickle.load(f)

# Initialize Parameters
total_time = 0
coherence_arr = []
time_arr = []

# Set Data State to that of existing model in simulation
data = data_lemmatized[:INITIAL_DOC_SIZE]
# When updating online LDA I kept getting key errors with a normal Dictionary,
# which is why the online-LDA run alone uses a HashDictionary; this baseline
# simulation uses a plain Dictionary.
id2word = Dictionary(documents=data)
corpus = [id2word.doc2bow(doc) for doc in data]

# Building for the first time - To be considered as the starting/existing model in simulation.
start = timeit.default_timer()
lda = LdaMulticore(corpus,
                   num_topics=35,
                   id2word=id2word,
                   workers=3,
                   chunksize=2000,
                   passes=10,
                   batch=False)
end = timeit.default_timer()

time_taken = end - start
total_time += time_taken
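
# A hedged sketch of the incremental-update loop that the setup above (STEP_SIZE,
# time_arr, coherence_arr) points toward; the original loop is not shown, so the
# chunking scheme and the c_v coherence choice here are assumptions.
from gensim.models import CoherenceModel

for start_idx in range(INITIAL_DOC_SIZE, len(data_lemmatized), STEP_SIZE):
    chunk = data_lemmatized[start_idx:start_idx + STEP_SIZE]
    chunk_bow = [id2word.doc2bow(doc) for doc in chunk]

    start = timeit.default_timer()
    lda.update(chunk_bow)  # online update of the existing model
    end = timeit.default_timer()
    time_arr.append(end - start)
    total_time += end - start

    cm = CoherenceModel(model=lda, texts=data_lemmatized[:start_idx + STEP_SIZE],
                        dictionary=id2word, coherence='c_v')
    coherence_arr.append(cm.get_coherence())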
Example No. 8
    FLAGS, unparsed = parser.parse_known_args()

    print('Reading data...')
    data = load_data(FLAGS.trainfile)
    comments_text = data['comment_text']
    comments_text = comments_text.tolist()

    print('Finding tokens with embeddings...')
    ft_model = load_embedding(FLAGS.embedfile)
    docs = [c.split(' ') for c in comments_text]
    for i in range(len(docs)):
        docs[i] = [t for t in docs[i] if t in ft_model.vocab]

    print('Building dictionary...')
    comments_dictionary = Dictionary(docs)
    comments_corpus = [comments_dictionary.doc2bow(d) for d in docs]

    print("Creating tfidf model...")
    model_tfidf = TfidfModel(comments_corpus)

    print("Converting to tfidf vectors...")
    comments_tfidf = model_tfidf[comments_corpus]
    comments_vecs = corpus2csc(comments_tfidf).T

    print('Finding important terms...')
    labelcols = data.columns.tolist()[2:]
    terms = Counter()
    for l in labelcols:
        cl = data[l]
        model_fdr = SelectFdr(chi2, alpha=0.025)
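        # The snippet cuts off here; a hedged guess at the intended continuation:
        # fit the FDR-controlled chi2 selector against this label and count the
        # terms that survive (get_support() is scikit-learn's standard mask accessor).
        model_fdr.fit(comments_vecs, cl)
        for idx, keep in enumerate(model_fdr.get_support()):
            if keep:
                terms[comments_dictionary[idx]] += 1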