def read_project_data(mtc, csc, fname):
    d1 = Dictionary.load(mtc + ".dict") 
    d2 = Dictionary.load(csc + ".dict")
    #d3 = Dictionary.load('data/postgresql-d4f8dde3-CommitLogCorpus.mallet.dict')
    
    MultiTextCorpus = MalletCorpus(mtc, d1) 
    ChangesetCorpus = MalletCorpus(csc, d2)
    #CommitLogCorpus = MalletCorpus('data/postgresql-d4f8dde3-CommitLogCorpus.mallet', d3)
    
    u1 = set(d1.values())
    u2 = set(d2.values())
    #u3 = set(d3.values())
    
    common = u1.intersection(u2)
    uc_set = (len(u1),len(u2))

    u1_uniq = u1.difference(common)
    u2_uniq = u2.difference(common)
    print(u1_uniq)
    
    fname = "common_words_comparison.txt"
    with open(fname, 'a') as f:
        parts = mtc.split("-")
        f.write(str(parts[0]) + "\n")
        f.write("length of MultiTextCorpus: " + str(len(MultiTextCorpus)) + "\n")
        f.write("length of ChangesetCorpus: " + str(len(ChangesetCorpus)) + "\n" + "\n")
        f.write("(MTC,CSC)  in common" + "\n")
        f.write(str(uc_set) + " " + str(len(common)))
        f.write('\n' + '\n')
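# A hedged usage sketch: the base names below are hypothetical and must point at
# existing MALLET corpora with matching '<base>.dict' dictionaries alongside them.
read_project_data('data/postgresql-d4f8dde3-MultiTextCorpus.mallet',
                  'data/postgresql-d4f8dde3-ChangesetCorpus.mallet',
                  'common_words_comparison.txt')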
def evaluate_log(context, config):
    logger.info('Evaluating models for: %s' % config.project.name)

    model_fname = config.model_fname % ChangesetCorpus.__name__
    changeset_fname = config.corpus_fname % ChangesetCorpus.__name__
    commit_fname = config.corpus_fname % CommitLogCorpus.__name__

    try:
        commit_id2word = Dictionary.load(commit_fname + '.dict')
        commit_corpus = MalletCorpus(commit_fname,
                                     id2word=commit_id2word)
        changeset_id2word = Dictionary.load(changeset_fname + '.dict')
        changeset_corpus = MalletCorpus(changeset_fname,
                                        id2word=changeset_id2word)
    except:
        error('Corpora not built yet -- cannot evaluate')

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except:
        error('Cannot evaluate: LDA model not built yet!')

    changeset_doc_topic = get_doc_topic(changeset_corpus, model)
    commit_doc_topic = get_doc_topic(commit_corpus, model)

    first_shared = dict()
    for id_ in commit_doc_topic:
        i = 0
        commit_topics = [topic[0] for topic in commit_doc_topic[id_]]
        try:
            changeset_topics = [topic[0] for topic in changeset_doc_topic[id_]]
        except:
            continue

        maximum = 101
        minimum = maximum

        for i, topic in enumerate(commit_topics):
            if topic in changeset_topics:
                j = changeset_topics.index(topic)
                minimum = min(minimum, max(i, j))

        for i, topic in enumerate(changeset_topics):
            if topic in commit_topics:
                j = commit_topics.index(topic)
                minimum = min(minimum, max(i, j))

        first_shared[id_] = minimum

        if minimum == maximum:
            logger.info('No common topics found for %s' % str(id_))
            del first_shared[id_]

    mean = sum(first_shared.values()) / len(first_shared)

    with open('data/evaluate-log-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, mean] + list(first_shared.values()))
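evaluate_log relies on a get_doc_topic helper that is not shown in this example; a minimal sketch of what it might look like, assuming a gensim LdaModel and a metadata-enabled MalletCorpus whose documents carry (id, label) metadata:

def get_doc_topic(corpus, model):
    """Map each document id to its topics, sorted by descending probability (sketch)."""
    doc_topic = dict()
    corpus.metadata = True
    for doc, meta in corpus:
        id_ = meta[0]
        # get_document_topics returns (topic_id, probability) pairs
        topics = model.get_document_topics(doc, minimum_probability=0)
        doc_topic[id_] = sorted(topics, key=lambda t: t[1], reverse=True)
    corpus.metadata = False
    return doc_topic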
Example #3
    def test_constructor_with_file_wikicorpus(self):
        
        #load tf-idf corpus
        tfidf_corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')
        
        #load lda corpus
        #lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')
        
        #load dictionary
        id2token = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")
        
        #load article titles
        document_titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")

        #train esa model
        esa_model = EsaModel(tfidf_corpus, num_clusters = 15, 
                             document_titles = document_titles,
                             num_features = len(id2token))
        
        print esa_model
        
        esa_model.save('/media/sdc1/test_dump/result/wiki_esa.model')
        
        tmp_esa = EsaModel.load('/media/sdc1/test_dump/result/wiki_esa.model') 
        print tmp_esa  
Example #4
def write_topics(model_path, csv_name, k):
    model = LdaModel.load(model_path)
    topics = []
    for topic_id in range(model.num_topics):
        topics.append(model.return_topic(topicid=topic_id))

    dictionary = Dictionary.load('data/dictionary/tweets.dict')
    word_indices = dictionary.id2token
    writer = csv.writer(file(csv_name, 'w'))

    output = [[0 for i in range(model.num_topics)] for j in range(k)]
    for topic_id, topic in enumerate(topics):
        for rank, index in enumerate(topic.argsort()[::-1]):
            output[rank][topic_id] = {}
            output[rank][topic_id]['word'] = word_indices[index]
            output[rank][topic_id]['p'] = topic[index]
            rank += 1
            if rank >= k:
                break

    for topic_id in range(model.num_topics):
        row = ['z = ' + str(topic_id)]

        for rank in range(k):
            row.append(output[rank][topic_id]['word'] + ':' + str(output[rank][topic_id]['p']))

        writer.writerow(row)
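A hedged usage example; the model path is hypothetical, and the loaded model is assumed to expose the return_topic() helper used above (it is not part of the stock gensim LdaModel):

write_topics('data/lda/tweets_topics.lda', 'data/topic_words.csv', k=20)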
Example #5
    def __iter__(self):
        list_dict = Dictionary.load('terms.dict')
        # list_dict.filter_extremes(no_below=1000,no_above=0.99)
        counter = 0
        doc_id = 0
        for member_id, count in self.members:
            if counter % 100 == 0:
                print('Done', counter)

            self.cursor.execute(self.query, (member_id,))
            expert_text = Counter()

            for result in self.cursor:
                parsed_text = self.parser.parse_list(title=result[1], description=result[2])

                expert_text.update(parsed_text['text'])

            terms = sorted([(e, v) for e, v in expert_text.items() if v > 1], key=operator.itemgetter(1), reverse=True)
            counter += 1

            if len(terms):
                if terms[0][1] > 10:
                    word_bag = []
                    for k, v in terms:
                        try:
                            word_bag.append((list_dict.token2id[k], v))
                        except KeyError:
                            pass
                    expert2doc[member_id] = doc_id
                    doc_id += 1
                    yield word_bag
def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    with BZ2File(corpus_fn) as f:
        corpus = SublexicalizedCorpus(WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
                                      order=(3, 6), clean_func=normalize_whitespace, n_proc=n_proc_sublex,
                                      create_dictionary=False)

        if vocab_fn and os.path.exists(vocab_fn):
            logging.info("Loading vocabulary from %s" % vocab_fn)
            vocab = Dictionary.load(vocab_fn)
        else:
            logging.info("Creating vocabulary")

            start = time.clock()
            vocab = Dictionary(corpus.get_texts())
            end = time.clock()
            logging.info("Vocabulary created in %d seconds" % (end - start))

            if vocab_fn:
                logging.info("Saving dictionary to %s" % vocab_fn)
                vocab.save(vocab_fn)

        corpus.dictionary = vocab

        corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
        corpus.dictionary.compactify()

        if tfidf_fn and os.path.exists(tfidf_fn):
            logging.info("Reading TF-IDF model from %s" % tfidf_fn)
            tfidf = TfidfModel.load(tfidf_fn)
        else:
            logging.info("creating TF-IDF model")
            tfidf = TfidfModel(corpus)

            if tfidf_fn:
                logging.info("Saving TFF-IDF model to %s" % tfidf_fn)
                tfidf.save(tfidf_fn)

        bow_corpus = (tfidf[art] for art in corpus)

        model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)

        model.save(model_fn)
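A hedged follow-up sketch showing how the saved model might be inspected later; the path stands in for whatever was passed via --model-file:

from gensim.models import LsiModel

lsi = LsiModel.load('models/sublexical_lsi.model')   # hypothetical path
print(lsi.show_topics(num_topics=10))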
Example #7
    def load(self):

        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)
Example #8
File: dmp.py Project: npiaq/dmp
 def load(self):
     '''Load the LDA model and the dic dictionary.
     '''
     lda_file = config.get('dmp', 'lda_file')
     dic_file = config.get('dmp', 'dic_file')
     self.lda = LdaModel.load(lda_file)
     self.dic = Dictionary.load(dic_file)
Example #9
    def addHarassingTweet(self, txt):
        '''
        Add a harassing tweet to the model corpus.

        While gensim purports to train models incrementally,
        it'll crash if you try.  Instead, we just rebuild
        the model each time we get a new tweet, remembering
        all the old ones as we go.
        '''
        if txt in self.harassment:
            return

        words = txt.split()
        if os.path.exists('/tmp/MyDict.dict'):
            self.d = Dictionary.load('/tmp/MyDict.dict')
            # merge_with updates self.d in place with the new document's tokens
            self.d.merge_with(Dictionary([words]))
        else:
            # build dictionary
            self.d = Dictionary([words])

        # generate bag of words
        #bow = self.d.doc2bow(words, allow_update=True)

        #self.harassment[txt] = bow

        #corpus = []
        #for txt in self.harassment:
        #    corpus.append(self.harassment[txt])

        self.model = LdaModel(id2word=self.d)
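A hedged illustration of Dictionary.merge_with, the gensim method the fix above relies on: it mutates the left-hand dictionary in place and returns a transform that re-maps the other dictionary's ids.

from gensim.corpora import Dictionary

d1 = Dictionary([['spam', 'eggs']])
d2 = Dictionary([['spam', 'ham']])
transform = d1.merge_with(d2)          # d1 now contains spam, eggs, ham
print(d1.token2id)
print(transform[d2.doc2bow(['ham'])])  # 'ham' re-mapped into d1's id space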
Example #10
    def __iter__(self):
        list_dict = Dictionary.load('terms.dict')
        # list_dict.filter_extremes(no_below=1000,no_above=0.99)
        counter = 0
        doc_id = 0
        for member_id, count in self.members:
            if counter % 1000 == 0:
                print('Done', counter)

            print(member_id, count)
            self.cursor.execute(self.query, (member_id,))
            expert_text = Counter()

            for result in self.cursor:
                parsed_text = self.parser.parse_list(title=result[1], description=result[2])

                expert_text.update(parsed_text['text'])
                # expert_text.update(parsed_text['bigrams'])

            # materialize to a list: printing a generator here would exhaust it
            # before the word_bag loop below ever sees a term
            terms = [(e, v) for e, v in expert_text.items() if v > 10 and any(e.startswith(t) for t in topics)]
            counter += 1

            print(terms)

            word_bag = []
            for k, v in terms:
                try:
                    word_bag.append((list_dict.token2id[k], v))
                except KeyError:
                    pass
            expert2doc[member_id] = doc_id
            doc_id += 1

            yield word_bag
Example #11
    def load_data(self):

        if not self.tf_idf_model:
            if not os.path.exists(self.tf_idf_model_path):
                raise Exception('TF-IDF model file not found')

            self.dictionary = Dictionary.load(self.dictionary_path)
            self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)
Example #12
def create_corpus(project, repos, Kind, use_level=True, forced_ref=None):
    corpus_fname_base = project.full_path + Kind.__name__

    if use_level:
        corpus_fname_base += project.level

    if forced_ref:
        corpus_fname_base += forced_ref[:8]

    corpus_fname = corpus_fname_base + '.mallet.gz'
    dict_fname = corpus_fname_base + '.dict.gz'
    made_one = False

    if not os.path.exists(corpus_fname):
        combiner = CorpusCombiner()

        for repo in repos:
            try:
                if repo or forced_ref:
                    corpus = Kind(project=project,
                                  repo=repo,
                                  lazy_dict=True,
                                  ref=forced_ref,
                                  )
                else:
                    corpus = Kind(project=project, lazy_dict=True)

            except KeyError:
                continue
            except TaserError as e:
                if repo == repos[-1] and not made_one:
                    raise e
                    # basically, if we are at the last repo and we STILL
                    # haven't successfully extracted a corpus, ring some bells
                else:
                    # otherwise, keep trying. winners never quit.
                    continue

            combiner.add(corpus)
            made_one = True

        # write the corpus and dictionary to disk. this will take a while.
        combiner.metadata = True
        MalletCorpus.serialize(corpus_fname, combiner, id2word=combiner.id2word,
                               metadata=True)
        combiner.metadata = False

        # write out the dictionary
        combiner.id2word.save(dict_fname)

    # re-open the compressed versions of the dictionary and corpus
    id2word = None
    if os.path.exists(dict_fname):
        id2word = Dictionary.load(dict_fname)

    corpus = MalletCorpus(corpus_fname, id2word=id2word)

    return corpus
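A hedged usage sketch, assuming the project object, repository list and corpus class (here ChangesetCorpus) that the surrounding codebase supplies:

changeset_corpus = create_corpus(project, repos, ChangesetCorpus,
                                 use_level=False, forced_ref='d4f8dde3')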
Example #13
def train(text_corpus_file, dict_file, model_file):
    """train lsi model from text corpus"""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    dictionary = Dictionary.load(dict_file)
    lsi = LsiModel(corpus=gutenberg_corpus, id2word=dictionary, num_topics=400)
    lsi.save(model_file)
    print lsi.projection.u
    print lsi.projection.u.size
    print lsi.projection.u[0].size
Example #14
File: lda.py Project: pranab/avenir
	def analyze(self, docs):
		# load dictionary and model
		self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
		self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))

		# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
		docTermMatrix = [self.dictionary.doc2bow(doc) for doc in docs]

		docTopicDistr = self.getDocumentTopics(docTermMatrix)
		return docTopicDistr
    def __init__(self, dictionary, **kwargs):
        super(GensimIDFProvider, self).__init__(**kwargs)

        if {'missing', 'linear'} <= set(kwargs):
            logging.warning('<%s> arguments to GensimIDFProvider can generate incorrect weights and should not be used'
                            % '|'.join({'missing', 'linear'}))

        if isinstance(dictionary, (str, unicode)):
            dictionary = Dictionary.load(dictionary)
        self.dictionary = dictionary
        self.tfidf = TfidfModel(dictionary=dictionary, normalize=False)
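    # A hedged sketch (not from the original class) of the kind of lookup such a
    # provider might expose, using TfidfModel's precomputed idfs table (term id -> idf).
    def idf(self, token, default=0.0):
        # look the token up in the dictionary, then in the precomputed idf table
        token_id = self.dictionary.token2id.get(token)
        if token_id is None:
            return default
        return self.tfidf.idfs.get(token_id, default)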
def create_evaluation_corpora(config, Kind):
    corpus_fname = config.corpus_fname % Kind.__name__

    try:
        id2word = Dictionary.load(corpus_fname + '.dict')
        corpus = MalletCorpus(corpus_fname, id2word=id2word)
    except:
        error('Corpora not built yet -- cannot evaluate')

    word_freq = list(reversed(sorted(count_words(corpus))))
    print("Top 10 words in %s: %s", (corpus_fname, str(word_freq[:10])))
    print("Bottom 10 words in %s: %s", (corpus_fname, str(word_freq[-10:])))
Example #17
def main():
    args = parse_args()
    if args.text:
        print('Creating text')
        create_text(args.text_file_name)
    elif args.dict:
        print('Creating dict')
        stream_dict(args.dict)
    elif args.corp:
        print('Creating corpi')
        dictionary = Dictionary.load('cars2.txt')
        corpi()
Example #18
 def __init__(self):
     self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
     self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
     self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
     self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
     self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
     self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
     self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
     self.job_labels = {
         int(k): v
         for k, v in (line.split("=") for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
     }
Example #19
File: lda.py Project: pranab/avenir
	def update(self, docs):
		# load dictionary and model
		self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
		self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))

		# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
		docTermMatrix = [self.dictionary.doc2bow(doc) for doc in docs]

		numPasses = self.config.getIntConfig("train.num.pass")[0]
		self.ldaModel.update(docTermMatrix, passes=numPasses)

		docTopicDistr = self.getDocumentTopics(docTermMatrix)
		return docTopicDistr
Example #20
    def test_cluster(self):
        
        #load tf-idf corpus
        tfidf_corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')
        
        #load dictionary
        id2token = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")
        
        kmedoids = self.get_kmedoids(tfidf_corpus, len(id2token), 
                                 num_clusters = 15,
                                 max_iterations = 5)
        clusters = self.cluster(kmedoids)

        print clusters
def create_evaluation_corpora_cosine(config, Kind, Kind2):
    corpus1_fname = config.corpus_fname % Kind.__name__
    corpus2_fname = config.corpus_fname % Kind2.__name__

    try:
        id2word1 = Dictionary.load(corpus1_fname + '.dict')
        corpus1 = MalletCorpus(corpus1_fname, id2word=id2word1)
        id2word2 = Dictionary.load(corpus2_fname + '.dict')
        corpus2 = MalletCorpus(corpus2_fname, id2word=id2word2)
    except:
        error('Corpora not built yet -- cannot evaluate')

    word_freq1 = get_word_freq(corpus1)
    word_freq2 = get_word_freq(corpus2)

    total1 = float(sum(x[1] for x in word_freq1.items()))
    total2 = float(sum(x[1] for x in word_freq2.items()))

    all_words = set(word_freq1.keys()) | set(word_freq2.keys())
    for word in all_words:
        if word not in word_freq1:
            word_freq1[word] = 0
        if word not in word_freq2:
            word_freq2[word] = 0

    dist1 = [x[1]/total1 for x in sorted(word_freq1.items())]
    dist2 = [x[1]/total2 for x in sorted(word_freq2.items())]
    rdist = numpy.random.random_sample(len(all_words))

    res = utils.hellinger_distance(dist1, dist2, filter_by=0.0)
    res1 = utils.hellinger_distance(dist1, rdist, filter_by=0.0)
    res2 = utils.hellinger_distance(dist2, rdist, filter_by=0.0)
    logger.info("Cosine distance between corpora: %f" % res)
    with open(config.path + 'evaluate-hellinger-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([corpus1_fname, corpus2_fname, res, res1, res2])
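utils.hellinger_distance is not shown in this example; a minimal numpy sketch of the standard Hellinger distance, with filter_by interpreted (an assumption) as dropping dimensions where both probabilities fall at or below the threshold:

import numpy

def hellinger_distance_sketch(p, q, filter_by=0.0):
    p, q = numpy.asarray(p, dtype=float), numpy.asarray(q, dtype=float)
    keep = (p > filter_by) | (q > filter_by)   # assumed meaning of filter_by
    p, q = p[keep], q[keep]
    return numpy.sqrt(0.5 * numpy.sum((numpy.sqrt(p) - numpy.sqrt(q)) ** 2))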
Example #22
    def create(pathtomapping, pathtocorpus, corpusname, window, numtokeep=50000, save_raw=True, shifts=(1, 5, 10)):
        """
        Creates a Shifted Positive Pointwise Mutual Information matrix.

        :param pathtomapping: The path to the id2word mapping. If this is left empty, the id2word mapping gets
        recreated. Warning: this takes a long time.
        :param pathtocorpus: The path to the corpus folder. The corpus can be spread out over multiple files or folders,
        and is read iteratively.
        :param corpusname: The name of the corpus. Used for saving the files.
        :param window: The window used to consider co-occurrences.
        :param numtokeep: The number of most frequent words to keep. Note that the matrix is non-sparse.
        Because of this, the memory requirements of the code are quadratic.
        :param save_raw: Whether to save the raw co-occurrence matrix as a numpy matrix.
        :param shifts: The shifts to apply to the co-occurrence matrix. Each shifted matrix
        gets saved as a separate model.
        """

        start = time.time()

        if not pathtomapping:
            id2word = Dictionary(SentenceIter(pathtocorpus), prune_at=None)
            id2word.filter_extremes(no_below=5, keep_n=numtokeep)
            id2word.compactify()
            logger.info("Creating the word2id took {0} seconds".format(time.time() - start))
        else:
            id2word = Dictionary.load(pathtomapping)

        inter = time.time()

        word2id = gensim.utils.revdict(id2word)

        corpus = SentenceIter(pathtocorpus)
        raw = get_cooccur(corpus, word2id, window=window)

        logger.info("Creating raw co-occurrence matrix took {0} seconds".format(time.time() - inter))

        if save_raw:
            np.save('{0}-cooccur.npy'.format(corpusname), raw)

        SPPMIFactory._save_word2id(word2id, "{0}mapping.json".format(corpusname))
        SPPMIFactory._save_freqs(id2word, "{0}freqs.json".format(corpusname))

        raw = SPPMIFactory.raw2pmi(raw)

        for k in shifts:
            sparse = SPPMIFactory.shift_clip_pmi(np.copy(raw), k_shift=k)
            SPPMIFactory._save_sparse_mtr(sparse, "{0}-SPPMI-sparse-{1}-shift.npz".format(corpusname, k))
            del sparse
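# Hedged sketches (not from the original class) of what raw2pmi and shift_clip_pmi
# usually compute: PMI from raw co-occurrence counts, then max(PMI - log k, 0).
import numpy as np
from scipy import sparse as sp

def raw2pmi_sketch(cooccur):
    # PMI(w, c) = log( P(w, c) / (P(w) * P(c)) ), computed from raw counts
    cooccur = np.asarray(cooccur, dtype=float)
    total = cooccur.sum()
    p_w = cooccur.sum(axis=1, keepdims=True) / total
    p_c = cooccur.sum(axis=0, keepdims=True) / total
    with np.errstate(divide='ignore', invalid='ignore'):
        pmi = np.log((cooccur / total) / (p_w * p_c))
    pmi[~np.isfinite(pmi)] = 0.0
    return pmi

def shift_clip_pmi_sketch(pmi, k_shift=1):
    # SPPMI = max(PMI - log k, 0), stored as a sparse matrix
    return sp.csr_matrix(np.maximum(pmi - np.log(k_shift), 0))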
def main():
    file = 'f:/projects/elasticsearch-enterprise-system/data/topic_models/nowiki_v2_3pass_lda_250'
    mod = LdaModel.load(file)
    dict_path = 'f:/projects/elasticsearch-enterprise-system/data/topic_models/voc_vocabulary_0.vocab'
    vocab = Dictionary.load(dict_path)
    corpfile = 'f:/projects/comperio-text-analytics/models/topicmodel/mojo_lda_100.corp'
    corpus = gensim.corpora.MmCorpus(corpfile)

    print mod.show_topic(0)
    print mod.id2word
    mod.id2word = vocab

    print mod.show_topic(0)

    pydavis = pyLDAvis.gensim.prepare(mod, corpus, vocab)
    pyLDAvis.save_html(pydavis, 'pydavis_250_v2_3passes.html')
    pyLDAvis.show(pydavis)
Example #24
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    logger.info('load the articles..')
    article_path = path.join(result_path, p['article_label'])
    wiki = pickle.load(open(path.join(article_path, 'articles.pickle')))

    logger.info('load dictionary and models')
    dictionary = Dictionary.load(path.join(result_path,
                                           p['model_label'],
                                           'dic.dict'))
    model_path = path.join(result_path, p['model_label'])
    lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
    pre = pickle.load(open(path.join(model_path, 'pre.model')))
    if int(p['num_topics']) > lsi.num_topics:
        logger.error('model too small')
    lsi.num_topics = int(p['num_topics'])

    data = {}
    for topic, entries in wiki.iteritems():
        logger.info('working on: %s' % topic)

        data[topic] = {}
        data[topic]['keys'] = []
        vecs = []
        data[topic]['ratings'] = []
        for key, val in entries.iteritems():
            data[topic]['keys'].append(key)
            vecs.append(lsi[pre[dictionary.doc2bow(val['text'])]])
            data[topic]['ratings'].append(val['rating'])
        data[topic]['vecs'] = np.squeeze(np.array(vecs)[:, :, 1:2]).T

        U, d, _ = np.linalg.svd(data[topic]['vecs'], full_matrices=False)
        data[topic]['U'] = U
        data[topic]['d'] = d

    f = open(os.path.join(output_dir, "data.pickle"), 'wb')
    pickle.dump(data, f)
def create_model(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    if not os.path.exists(model_fname):
        try:
            id2word = Dictionary.load(corpus_fname + '.dict')
            corpus = MalletCorpus(corpus_fname, id2word=id2word)
            logger.info('Opened previously created corpus: %s' % corpus_fname)
        except:
            error('Corpora for building file models not found!')

        file_model = LdaModel(corpus,
                              id2word=corpus.id2word,
                              alpha=config.alpha,
                              passes=config.passes,
                              num_topics=config.num_topics)

        file_model.save(model_fname)
Example #26
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA. 
    LDA is a little better than LSA as it provides a reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only. So for a particular topic modeling task,
    it is a lighter option to install and run. It can also be run distributed and updated on top of an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print "Dictionary file does not exist. Creating one"
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.iteritems() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print dictionary

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print "Corpus file does not exist. Creating one"
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print mm
    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1, chunksize=1000,
                   passes=1)
    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
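A hedged usage example; tokens is a list of token lists (one per document), and the resources/ directory used inside the function is assumed to exist:

tokens = [['cheap', 'pizza', 'deal', 'today'],
          ['spa', 'massage', 'voucher', 'deal']]
for topic in get_topics_lda(tokens, n_topics=2):
    print(topic)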
Example #27
def create_queries(project):
    corpus_fname_base = project.full_path + 'Queries'
    corpus_fname = corpus_fname_base + '.mallet.gz'
    dict_fname = corpus_fname_base + '.dict.gz'

    if not os.path.exists(corpus_fname):
        pp = GeneralCorpus(lazy_dict=True)
        id2word = Dictionary()

        with open(os.path.join(project.full_path, 'ids.txt')) as f:
            ids = [x.strip() for x in f.readlines()]

        queries = list()
        for id in ids:
            with open(os.path.join(project.full_path, 'queries',
                                    'ShortDescription' + id + '.txt')) as f:
                short = f.read()

            with open(os.path.join(project.full_path, 'queries',
                                    'LongDescription' + id + '.txt')) as f:
                long = f.read()

            text = ' '.join([short, long])
            text = pp.preprocess(text)

            # this step will remove any words not found in the dictionary
            bow = id2word.doc2bow(text, allow_update=True)

            queries.append((bow, (id, 'query')))

        # write the corpus and dictionary to disk. this will take a while.
        MalletCorpus.serialize(corpus_fname, queries, id2word=id2word,
                               metadata=True)

    # re-open the compressed versions of the dictionary and corpus
    id2word = None
    if os.path.exists(dict_fname):
        id2word = Dictionary.load(dict_fname)

    corpus = MalletCorpus(corpus_fname, id2word=id2word)

    return corpus
Example #28
	def __init__(self, ldaModelFile, dictionaryfile, stopfile="english.stop.txt"):
		'''
			Const
			Parameters:
				ldaModelFile: the model file that was trained 
				dictionaryfile: id2word mapping file
		'''
		logging.info("[Start] Loading the dictionary " + dictionaryfile)
		self.id2word = Dictionary.load(dictionaryfile)
		logging.info("[Stop] Loading the dictionary " + dictionaryfile)

		logging.info("[Start] Loading the model file " + ldaModelFile)
		self.ldamodel = LdaModel.load(ldaModelFile)
		logging.info("[Done] Loading the model file " + ldaModelFile)

		logging.info("[Start] Loading all topics")
		self.alltopics = self.ldamodel.show_topics(-1)
		logging.info("[Start] Loading all topics")

		self.stopwords = self.loadStop(stopfile)
Example #29
    def __init__(self, text, dictionary, stopwords=False, stemming=False):
        self.text = text
        self.remove_stopwords = stopwords
        self.stemming = stemming
        self.dictionary = Dictionary.load(dictionary)

        # blacklist of words to be removed from text
        # combines stopwords from nltk, gensim and the stop_words package
        self.en_stopwords = set(
            stop_words.get_stop_words('en') +
            nltk.corpus.stopwords.words("english") +
            list(gensim.parsing.preprocessing.STOPWORDS)
        )

        # keep -, +, # in words
        self.punctuation = re.sub("[-+#.]", " ", punctuation)

        # make translation dictionary converting punctuations to white spaces
        self.translate_dict = maketrans(punctuation, ' '*len(punctuation))

        # replace patterns
        self.invalid_char = re.compile(r'[0-9]|\\~|\`|\@|\$|\%|\^|\& \
                |\*|\(|\)|\_|\=|\[|\]|\\|\<|\<|\>|\?|\/|\;|\\.')
        self.url_pattern = re.compile(r'(' +
              # Scheme (HTTP, HTTPS, FTP and SFTP):
              r'(?:(https?|s?ftp):\/\/)?' +
              # www:
              r'(?:www\.)?' + 
              r'(' +
              # Host and domain (including ccSLD):
              r'(?:(?:[A-Z0-9][A-Z0-9-]{0,61}[A-Z0-9]\.)+)' +
              # TLD:
              r'([A-Z]{2,6})' +
              # IP Address:
              r'|(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' +
              r')' +
              # Port:
              r'(?::(\d{1,5}))?' +
              # Query path:
              r'(?:(\/\S+)*)' +
              r')', re.IGNORECASE)
Example #30
def get_topics():
    '''Computes distribution over topics for each abstract'''

    dictionary = Dictionary.load('lda.dict')
    lda = LdaMulticore.load('lda.gensim')

    base = 'datasets/dspace'
    new_base = 'datasets/dspace_topics'
    for filename in tqdm(os.listdir(base)):
        path = os.path.join(base, filename)
        with open(path, 'r') as f:
            d = json.load(f)
            abstract = d['abstract']
            if abstract is not None:
                words = tokenize(abstract.split())
                bow = dictionary.doc2bow(words)
                topics = lda.get_document_topics(bow, minimum_probability=0)
                topics = to_vec(topics)
                d['topics'] = topics
                new_path = os.path.join(new_base, filename)
                with open(new_path, 'w') as new_f:
                    json.dump(d, new_f)
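to_vec (and tokenize) are assumed to be defined elsewhere in this module; a minimal sketch of to_vec, under the assumption that it densifies the (topic_id, probability) pairs into a list ordered by topic id:

def to_vec(topics):
    # topics: (topic_id, probability) pairs covering every topic
    # (minimum_probability=0 above guarantees all topics are present)
    return [float(prob) for _, prob in sorted(topics)]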
Example #31
 def load_dictionary(self):
     path = os.path.join(
         self.model_directory, "dictionary-%d-%d-%d.pkl" %
         (self.ndocs, self.phrase_min_count, self.vocabulary_size))
     self.dictionary = Dictionary.load(path)
Example #32
from gensim.corpora import Dictionary
from gensim.models.lsimodel import LsiModel
from nltk.corpus import stopwords as nltk_stopwords
from os.path import dirname, realpath

try:
    path_to_directory_of_this_file = dirname(realpath(__file__))

    stopwords = []
    with open(path_to_directory_of_this_file + "/stopwords.txt") as f:
        stopwords.extend([word for word in f.read().decode("utf-8").split("\n") if word and not word.startswith("#")])   
    stopwords = set(stopwords)

    lsi = LsiModel.load(path_to_directory_of_this_file + "/model")
   
    dictionary = Dictionary.load(path_to_directory_of_this_file + "/dictionary")
except Exception as e:
    print("Exception trying to load LSI index.  You can most likely ignore this:", e)

def run(text):

    try:

        words = text.lower().replace("#"," ").replace("_"," ").replace("("," ").replace(")"," ").replace("/"," ").replace(":"," ").replace("."," ").split()
        words = [word for word in words if len(word) > 3 and word not in stopwords]

        if words:
            probabilities = lsi[dictionary.doc2bow(words)]
            if probabilities:
                return sorted(probabilities, key=lambda tup: -1*tup[1])[0][0]
Example #33
#!/usr/bin/env python

from gensim.models import LdaModel
from gensim.corpora import MmCorpus, Dictionary
import sys, os
import pyLDAvis.gensim

if len(sys.argv) < 2:
    print("usage: {0} [path to model.lda]\n".format(sys.argv[0]))
    sys.exit(1)

path, file = os.path.split(sys.argv[1])
corpusname = file.split(".")[0]

dictionary = Dictionary.load(path + "/" + corpusname + ".dict")
corpus = MmCorpus(path + "/" + corpusname + ".mm")
model = LdaModel.load(sys.argv[1])

##############
# cf. https://pyldavis.readthedocs.org/en/latest/modules/API.html

vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)

pyLDAvis.save_html(vis, path + "/" + corpusname + "_interactive.html")
pyLDAvis.show(vis)
Example #35
### Generating a large training/background corpus using Wikipedia
from gensim.corpora import WikiCorpus, wikicorpus

articles = "enwiki-latest-pages-articles.xml.bz2"  # available from http://en.wikipedia.org/wiki/Wikipedia:Database_download

wiki_corpus = WikiCorpus(
    articles
)  # This will take many hours! Output is Wikipedia in bag-of-words (BOW) sparse matrix.
wiki_corpus.dictionary.save("wiki_dict.dict")

MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  #  File will be several GBs.

### Working with persisted corpus and dictionary
bow_corpus = MmCorpus("wiki_corpus.mm")  # Revive a corpus

dictionary = Dictionary.load("wiki_dict.dict")  # Load a dictionary

### Transformations among vector spaces
from gensim.models import LsiModel, LogEntropyModel

logent_transformation = LogEntropyModel(
    wiki_corpus, id2word=dictionary
)  # Log Entropy weights frequencies of all document features in the corpus

tokenize_func = wikicorpus.tokenize  # The tokenizer used to create the Wikipedia corpus
document = "Some text to be transformed."
bow_document = dictionary.doc2bow(
    tokenize_func(document)
)  # First, tokenize document using the same tokenization as was used on the background corpus, and then convert it to BOW representation using the dictionary created when generating the background corpus.
logent_document = logent_transformation[[
    bow_document
]]  # Apply the log-entropy weighting; wrapping the document in a list transforms a one-document corpus.
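The snippet above imports LsiModel but is cut off before using it; a hedged continuation in the same annotated style, training an LSI model on top of the log-entropy-weighted corpus:

logent_corpus = logent_transformation[
    bow_corpus
]  # Stream the whole background corpus through the log-entropy transformation.

lsi_transformation = LsiModel(
    corpus=logent_corpus, id2word=dictionary, num_topics=400
)  # Latent Semantic Indexing on top of the weighted corpus.

lsi_transformation.save("lsi_transformation.model")  # Hypothetical output path.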
Example #36
import numpy as np

from gensim.corpora import Dictionary

import os

import keras
from k_max_pooling import *
from keras.models import load_model
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, TensorBoard, ReduceLROnPlateau

label_dict = Dictionary.load(CURRENT_MAIN_PATH + '/dicts/label_dict.dict')
total_label = len(label_dict)

print("Total classes : %d" % total_label)

data = np.load(CURRENT_MAIN_PATH + '/npz_data/train.npz')
x_data = data['x_data']
y_data = data['y_data']

# Shuffle the data
indices = np.random.permutation(x_data.shape[0])
x_data = x_data[indices]
y_data = y_data[indices]

x_data = sequence.pad_sequences(x_data,
                                maxlen=SEQ_LEN)
Example #37
def LDA_Analysis():
    #http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb

    if 0 == 1:
        with open('data/review_text_all.txt','w') as myfile:
            myfile.write("")
        
        '''
        loop through db and write jobs descriptions
        '''
        
        with open('data/review_text_all.txt','a') as myfile:
            with Job() as db:
                a=0
                max_ = int(db.getNoJobs()[0][0])
                while (a < max_):
                    #print(a)
                    sample_review = db.readJobDetailClean(a)[0][1]
                    if (sample_review != 'Json Error'):
                        myfile.write(str(sample_review)+'\n')
                    a += 1
    
    #unigram_sentences_filepath = os.path.join(intermediate_directory, 'unigram_sentences_all.txt')
    
    if 0 == 1:
    
        with codecs.open('data/unigram_sentences_all.txt', 'w', encoding='utf_8') as f:
            for sentence in lemmatized_sentence_corpus('data/review_text_all.txt'):
                f.write(sentence + '\n')
    
    unigram_sentences = LineSentence('data/unigram_sentences_all.txt')
   
    '''
    for unigram_sentence in it.islice(unigram_sentences, 230, 240):
        print(u' '.join(unigram_sentence))
        print(u'')
    '''
        
    #bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')
    
    if 0 == 1:

        bigram_model = Phrases('data/unigram_sentences_all.txt')
    
        bigram_model.save('data/bigram_model_all')
    
    # load the finished model from disk
    bigram_model = Phrases.load('data/bigram_model_all')
    
    #bigram_sentences_filepath = os.path.join(intermediate_directory, 'bigram_sentences_all.txt')
   
    if 0 == 1:
    
        with codecs.open('data/bigram_sentences_all.txt', 'w', encoding='utf_8') as f:
            
            for unigram_sentence in unigram_sentences:
                
                bigram_sentence = u' '.join(bigram_model[unigram_sentence])
                
                f.write(bigram_sentence + '\n')
            
    bigram_sentences = LineSentence('data/bigram_sentences_all.txt')
            
    '''                    
    for bigram_sentence in it.islice(bigram_sentences, 230, 240):
        print(u' '.join(bigram_sentence))
        print(u'')  
    '''

    #trigram_model_filepath = os.path.join(intermediate_directory, 'trigram_model_all')

    if 0 == 1:
    
        trigram_model = Phrases(bigram_sentences)
    
        trigram_model.save('data/trigram_model_all')
        
    # load the finished model from disk
    trigram_model = Phrases.load('data/trigram_model_all')

    #trigram_sentences_filepath = os.path.join(intermediate_directory, 'trigram_sentences_all.txt')                     

    if 0 == 1:
    
        with codecs.open('data/trigram_sentences_all.txt', 'w', encoding='utf_8') as f:
            
            for bigram_sentence in bigram_sentences:
                
                trigram_sentence = u' '.join(trigram_model[bigram_sentence])
                
                f.write(trigram_sentence + '\n')
                
    trigram_sentences = LineSentence('data/trigram_sentences_all.txt')

    '''
    for trigram_sentence in it.islice(trigram_sentences, 230, 240):
        print(u' '.join(trigram_sentence))
        print(u'')
    '''

    #trigram_reviews_filepath = os.path.join(intermediate_directory, 'trigram_transformed_reviews_all.txt')
    
    if  0 == 1:
      
        import csv
        
        '''
        Variant A: Use Stopwords
        1) download StopWords.csv from MySQL table: KeyWords.
        2) Remove all relevant words by hand ;)
        '''
        with open('data/StopWords.csv', newline='') as csvfile:
          
          stopwords_ = csv.reader(csvfile, delimiter=' ', quotechar='|')
          for words_ in stopwords_:
            #print(words_[0])
            STOP_WORDS.add(words_[0])
    
        #print(STOP_WORDS)
        
        '''
        Variant B: Use Dictionary
        '''
        with open('data/Dictionary.csv', 'r', newline='') as csvfile:
          
          file_ = csv.reader(csvfile, delimiter=',', quotechar='"')
          
          dictionary_ = []

          for row in file_:
              dictionary_.append(row[0])
          
          #with open('file.csv', 'r') as f:
  #reader = csv.reader(f)
  #your_list = list(reader)
    
    
        with codecs.open('data/trigram_transformed_reviews_all.txt', 'w', encoding='utf_8') as f:
            
            for parsed_review in nlp.pipe(line_review('data/review_text_all.txt'), batch_size=10000, n_threads=4):
                
                # lemmatize the text, removing punctuation and whitespace
                unigram_review = [token.lemma_ for token in parsed_review
                                  if not punct_space(token)]
                
                # apply the first-order and second-order phrase models
                bigram_review = bigram_model[unigram_review]
                trigram_review = trigram_model[bigram_review]
                
                # remove any remaining stopwords
                '''
                Variant A:
                '''
                #trigram_review = [term for term in trigram_review
                #                  if term not in STOP_WORDS]#spacy.en.STOPWORDS] !!!!! CHECK THIS !!!!! module 'spacy' has no attribute 'en'
                
                '''
                Variant B:
                '''
                trigram_review = [term for term in trigram_review
                                  if term in dictionary_]#
                
                # write the transformed review as a line in the new file
                trigram_review = u' '.join(trigram_review)
                f.write(trigram_review + '\n')
                
    '''
    print(u'Original:' + u'\n')
    
    for review in it.islice(line_review('review_text_all.txt'), 11, 12):
        print(review)
    
    print(u'----' + u'\n')
    print(u'Transformed:' + u'\n')
    
    with codecs.open('trigram_transformed_reviews_all.txt', encoding='utf_8') as f:
        for review in it.islice(f, 11, 12):
            print(review)
    '''

    #trigram_dictionary_filepath = os.path.join(intermediate_directory, 'trigram_dict_all.dict')

    if 0 == 1:
    
        trigram_reviews = LineSentence('data/trigram_transformed_reviews_all.txt')
    
        # learn the dictionary by iterating over all of the reviews
        trigram_dictionary = Dictionary(trigram_reviews)
        
        # filter tokens that are very rare or too common from
        # the dictionary (filter_extremes) and reassign integer ids (compactify)
        trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)#,keep_n=100000)#,)
        trigram_dictionary.compactify()
    
        trigram_dictionary.save('data/trigram_dict_all.dict')
        
    # load the finished dictionary from disk
    trigram_dictionary = Dictionary.load('data/trigram_dict_all.dict')
    
    #trigram_bow_filepath = os.path.join(intermediate_directory, 'trigram_bow_corpus_all.mm')
    
    if 0 == 1:
    
        # generate bag-of-words representations for
        # all reviews and save them as a matrix
        MmCorpus.serialize('data/trigram_bow_corpus_all.mm', trigram_bow_generator(trigram_dictionary,'data/trigram_transformed_reviews_all.txt'))
        
    # load the finished bag-of-words corpus from disk
    trigram_bow_corpus = MmCorpus('data/trigram_bow_corpus_all.mm')
    
    #lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')
    
    if 0 == 1:
    
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            
            # workers => sets the parallelism, and should be
            # set to your number of physical cores minus one
            lda = LdaMulticore(trigram_bow_corpus,
                               num_topics=15,
                               id2word=trigram_dictionary,
                               workers=1)
        
        lda.save('data/lda_model_all')
        
    # load the finished LDA model from disk
    lda = LdaMulticore.load('data/lda_model_all')

    #explore_topic(lda, topic_number=1)

    topic_names = {0:u'Risk Management Bank', 
                   1:u'Big Data Report', 
                   2:u'Automotive SAP', 
                   3:u'Microsoft Java Scrum', 
                   4:u'Medical Consultant', 
                   5:u'Java Engineer', 
                   6:u'Computer Vision Developer', 
                   7:u'Data Analyst', 
                   8:u'BI SAP BW', 
                   9:u'IOT Reporting R', 
                   10:u'Global Project Presentation',
                   11:u'Cloud Engineer IOT', 
                   12:u'Industry 4.0', 
                   13:u'Risk Consulting', 
                   14:u'Machine Learning Data Science'}
    
    #topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')
    
    with open('data/topic_names.pkl', 'wb') as f:
        pickle.dump(topic_names, f)
    
    #load sameple_review from database
    #sample_review = get_sample_review(10)
    
    #lda_description(bigram_model, trigram_model, trigram_dictionary, lda, topic_names, sample_review)

    #LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared')
    
    if 0 == 1:
        
        #term_ix = np.sort(topic_info.index.unique().values)
    
        LDAvis_prepared = pyLDAvis.gensim_.prepare(lda, trigram_bow_corpus, trigram_dictionary)
    
        with open('data/ldavis_prepared', 'wb') as f:
            pickle.dump(LDAvis_prepared, f)
            
    '''
    export LDA file
    '''
    
    # load the pre-prepared pyLDAvis data from disk
    with open('data/ldavis_prepared', 'rb') as f:
        LDAvis_prepared = pickle.load(f)

    with open('data/DSJobs_LDA.html', 'w') as f:
        pyLDAvis.save_html(LDAvis_prepared, f)            
Example #38
    def run_training_batch(self, batch, batch_idx):
        """

        :param batch: dict; contains three keys: input_ids, attention_mask, decoder_input_ids
            Example for 'batch':
                batch: {'input_ids': tensor([[  0,  36, 230,  ...,   8,  41,   2]]),
                'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]]),
                'decoder_input_ids': tensor([[    0,   287,    10,  2107,   111, 10468,   226, 47385, 11579,  1012,
                                                2156,     5,  5302, 47385,   281, 47385, 10003,   255, 47385,   347,
                                                111,  2107, 47385,   574, 47385,  1000, 47385,   398, 47385,   245,
                                                16,    10,   205,  1374, 12576,   479,   646,  1000,  1215,  3388,
                                                510,   742,    85,   128,   579,    65,     9,     5,   357,  3092,
                                                23,    63,  1836,    11,     5,  3555,   111,   672,  2156, 26180,
                                                47385,   642,   111,  3547,  4120,   479,   646,  1000,  1215,  3388,
                                                510,   742,  7192,  8806, 10262,  3444,  7951,  2170,  1318,     2]])}
        :param batch_idx: number of batch
        :return:
        """
        # load tokenizer
        tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
        # load config for GSM
        config = yaml_load(f"{self.default_root_dir}/data/config/gsm.yaml")
        # load dict
        dictionary = Dictionary.load(datapath('dict-www-cnndm-unigram'))
        # remove [SEP]
        sep_list = [
            '[SEP_0]', '[SEP_1]', '[SEP_2]', '[SEP_3]', '[SEP_4]', '[SEP_5]',
            '[SEP_6]', '[SEP_7]', '[SEP_8]', '[SEP_9]', '<S_SEP>'
        ]
        # vocab size for topic modeling
        vocab_size = len(dictionary)
        # model
        config['hidden']['features'][0] = vocab_size

        # trainer batch
        config['trainer_batch']['test_sample'] = 1
        config = extend_config_reference(config)
        gsm_trainer = config['GSMtrainer']
        gsm_trainer[
            'base_dir'] = f"{self.default_root_dir}/log/bart-large-cnn-finetune"
        gsm_trainer = GSMTrainer.from_config(gsm_trainer)

        # number of topics
        K = config['gsmtopic']['k']

        # yaml_dump(gsm_trainer,
        #           os.path.join(f"{self.default_root_dir}/log/bart-large-cnn-finetune", "gsm_trainer.yaml"))

        # -----------------------------------------
        # Topic Modeling - GSM
        # -----------------------------------------
        batch_size = batch['input_ids'].size()[0]

        docs = []
        for batch_num in range(batch_size):
            # extract the batch_sentence
            batch_sentence = tokenizer.decode(
                batch['input_ids'][batch_num].tolist(),
                skip_special_tokens=True)
            # change to lowercase and split to list
            batch_sentence_list = batch_sentence.split(" ")
            # remove [SEP]
            batch_sentence_list_nosep = [
                item for item in batch_sentence_list if item not in sep_list
            ]
            text = ' '.join([x for x in batch_sentence_list_nosep])
            fine_text = text.replace(' ##', '').lower()
            batch_sentence = re.sub(r'[^\w\s]', '', fine_text)
            # batch_sentence: change to the cleaned news for topic modeling
            # change to training data format in topic modeling
            gsm_data_bow = dictionary.doc2bow(batch_sentence.split(" "))
            docs.append(gsm_data_bow)

        # gsm_data: data for topic modeling
        gsm_data = DataLoader(DocDataset(docs, len(dictionary), device='cuda'),
                              batch_size=config['dataset']['batch_size'],
                              drop_last=False,
                              num_workers=0)

        gsm_trainer.__dict__['train_iterator'] = gsm_data

        gsm_loss, gsm_p = gsm_trainer.co_train(vocab_size, training=True)

        del gsm_data

        # track grad norms
        grad_norm_dic = {}

        # track all metrics for callbacks
        batch_callback_metrics = []

        # track metrics to log
        batch_log_metrics = []

        if batch is None:
            return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

        # Batch start events
        with self.profiler.profile('on_batch_start'):
            # callbacks
            self.on_batch_start()
            # hooks
            if self.is_function_implemented('on_batch_start'):
                response = self.get_model().on_batch_start(batch)
                if response == -1:
                    return AttributeDict(signal=-1,
                                         grad_norm_dic=grad_norm_dic)

        splits = [batch]
        if self.truncated_bptt_steps is not None:
            model_ref = self.get_model()
            with self.profiler.profile('tbptt_split_batch'):
                splits = model_ref.tbptt_split_batch(batch,
                                                     self.truncated_bptt_steps)

        self.hiddens = None
        for split_idx, split_batch in enumerate(splits):
            self.split_idx = split_idx

            for opt_idx, optimizer in self._get_optimizers_iterable():
                # make sure only the gradients of the current optimizer's parameters are calculated
                # in the training step to prevent dangling gradients in multiple-optimizer setup.
                if len(self.optimizers) > 1:
                    for param in self.get_model().parameters():
                        param.requires_grad = False
                    for group in optimizer.param_groups:
                        for param in group['params']:
                            param.requires_grad = True

                # -------------------
                # calculate loss
                # -------------------
                beta = 0.01
                opt_closure_result = self.optimizer_closure(
                    split_batch,
                    batch_idx,
                    opt_idx,
                    optimizer,
                    self.hiddens,
                    gsm_p,  # topic distribution
                    gsm_loss,  # loss for topic modeling
                    K,  # number of topics
                    beta,
                )

                # ------------------------------
                # POST forward bookkeeping
                # ------------------------------
                batch_callback_metrics.append(
                    opt_closure_result.training_step_output.callback_metrics)
                batch_log_metrics.append(
                    opt_closure_result.training_step_output.log_metrics)

                self.add_progress_bar_metrics(
                    opt_closure_result.training_step_output.pbar_on_batch_end)

                # track hiddens
                self.hiddens = opt_closure_result.hiddens

                # check if loss or model weights are nan
                if self.terminate_on_nan:
                    self.detect_nan_tensors(opt_closure_result.loss)

                # track total loss for logging (avoid mem leaks)
                self.batch_loss_value.append(opt_closure_result.loss)

                # ------------------------------
                # BACKWARD PASS
                # ------------------------------
                # gradient update with accumulated gradients
                if (self.batch_idx + 1) % self.accumulate_grad_batches == 0:
                    # backward
                    grad_norm_dic = self.run_batch_backward_pass(
                        split_batch, batch_idx, opt_idx, optimizer)

                    # calculate running loss for display
                    self.running_loss.append(self.batch_loss_value.mean())

                    # reset for next set of accumulated grads
                    self.batch_loss_value.reset()

        # Batch end events
        with self.profiler.profile('on_batch_end'):
            # callbacks
            self.on_batch_end()
            # model hooks
            if self.is_function_implemented('on_batch_end'):
                self.get_model().on_batch_end()

        # collapse all metrics into one dict
        batch_log_metrics = {
            k: v
            for d in batch_log_metrics for k, v in d.items()
        }

        # track all metrics for callbacks
        self.callback_metrics.update(
            {k: v
             for d in batch_callback_metrics for k, v in d.items()})

        result = AttributeDict(
            signal=0,
            grad_norm_dic=grad_norm_dic,
            batch_log_metrics=batch_log_metrics,
            training_step_output_for_epoch_end=opt_closure_result.
            training_step_output_for_epoch_end)
        return result
Example #39
    def train(self):
        questions = copy.copy(self.additional)
        for i, q1id in enumerate(self.trainset):
            question = self.trainset[q1id]
            if self.proctrain:
                q1 = [w.lower() for w in question['tokens']
                      ] if self.lowercase else question['tokens']
                q1 = self.remove_punctuation(q1) if self.punctuation else q1
                q1 = self.remove_stopwords(q1) if self.stop else q1
            else:
                q1 = question['tokens']
            questions.append(q1)

            duplicates = question['duplicates']
            for duplicate in duplicates:
                rel_question = duplicate['rel_question']
                if self.proctrain:
                    q2 = [w.lower() for w in rel_question['tokens']
                          ] if self.lowercase else rel_question['tokens']
                    q2 = self.remove_punctuation(
                        q2) if self.punctuation else q2
                    q2 = self.remove_stopwords(q2) if self.stop else q2
                else:
                    q2 = rel_question['tokens']
                questions.append(q2)

                rel_comments = duplicate['rel_comments']
                for rel_comment in rel_comments:
                    if self.proctrain:
                        q3 = [w.lower() for w in rel_comment['tokens']
                              ] if self.lowercase else rel_comment['tokens']
                        q3 = self.remove_punctuation(
                            q3) if self.punctuation else q3
                        q3 = self.remove_stopwords(q3) if self.stop else q3
                    else:
                        q3 = rel_comment['tokens']
                    if len(q3) == 0:
                        q3 = ['eos']
                    questions.append(q3)

        fname = 'transdict'
        if self.lowercase: fname += '.lower'
        if self.stop: fname += '.stop'
        if self.punctuation: fname += '.punct'
        if self.proctrain: fname += '.proctrain'
        fname += '.model'

        path = os.path.join(self.path, fname)
        if not os.path.exists(path):
            self.vocabulary = Dictionary(questions)
            self.vocabulary.save(path)
        else:
            self.vocabulary = Dictionary.load(path)
        self.w_C = compute_w_C(questions, self.vocabulary)  # background lm
        self.model = TRLM([],
                          self.w_C,
                          self.alignments,
                          len(self.vocabulary),
                          alpha=self.alpha,
                          sigma=self.sigma)

        del self.additional
        del self.trainset
    comments_text = data['comment_text']
    data.drop(['comment_text'], inplace=True, axis=1)
    docs = lematize_comments(comments_text, nthreads=16)

    # XXX Add phrasing

    comments_dictionary = None
    if doTrain:
        print("Creating dictionary....")
        comments_dictionary = Dictionary(docs)
        comments_dictionary.filter_extremes(no_below=10, no_above=0.3)
        comments_dictionary.compactify()
        comments_dictionary.save(FLAGS.dictFile)
    else:
        print("Loading dictionary...")
        comments_dictionary = Dictionary.load(FLAGS.dictFile)

    print("Converting to BOW vectors...")
    comments_corpus = [comments_dictionary.doc2bow(d) for d in docs]

    model_tfidf = None
    if doTrain:
        print("Creating tfidf model...")
        model_tfidf = TfidfModel(comments_corpus)
        model_tfidf.save(FLAGS.tfidfFile)
    else:
        print("Loading tfidf model...")
        model_tfidf = TfidfModel.load(FLAGS.tfidfFile)

    print("Converting to tfidf vectors...")
    comments_tfidf = model_tfidf[comments_corpus]
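
    # A minimal sketch of feeding the TF-IDF vectors above into scikit-learn style
    # code: gensim's corpus2csc turns the streamed corpus into a sparse matrix.
    # The variable name comments_matrix is illustrative, not from the original code.
    from gensim.matutils import corpus2csc
    comments_matrix = corpus2csc(comments_tfidf,
                                 num_terms=len(comments_dictionary)).T  # docs x terms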
Example No. 41
    STOPWORDS = f.readlines()
    STOPWORDS = set([item.strip(string.whitespace) for item in STOPWORDS])
    STOP_WORDS = STOP_WORDS.union(STOPWORDS)


# encodings:
replace_dict = {
    '\ufb01' : 'fi',
    '\u2019' : '',
    '\u00e9' : 'e', 
    '\u00a8' : '',
    'ямБ': 'fi',
}
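
# A minimal sketch of how replace_dict might be applied before tokenisation;
# the helper name fix_encoding is illustrative, not from the original code.
def fix_encoding(text, mapping=replace_dict):
    for bad, good in mapping.items():
        text = text.replace(bad, good)
    return text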

# tfidf model
dct = Dictionary.load("../data/models/tfidf/dictionary.model")
tfidf = TfidfModel.load("../data/models/tfidf/tfidf.model")

def clean_chunk(chunk):
    
    result = []
    for token in chunk:
        # if token.text.lower() == 'the':
        #     print(token.text.lower().strip(), token.text.lower().strip() in STOP_WORDS)
Example No. 42
    parser.add_argument('-d', '--dictionary', metavar='PATH', default='dict.pk',
                        help="Pickled dictionary file (Gensim)")
    parser.add_argument('-e', '--epochs', type=int, metavar='N', default=5,
                        help="Number of epochs to train for")
    parser.add_argument('-b', '--batch_size', type=int, metavar='N',
                        default=32, help="Batch size used in training.")
    parser.add_argument('-l', '--load', metavar='FILE',
                        help="Load model from file.")
    parser.add_argument('-s', '--save', metavar='FILE',
                        help="Save model to file.")
    parser.add_argument('-v', '--vector_size',
                        metavar='SIZE', type=int, default=0,
                        help="Size of input vectors (if sequence of vectors)")

    args = parser.parse_args()

    dictionary = Dictionary.load(args.dictionary)

    # Input dataset
    data = pk.load(open(args.datafile, 'rb'))

    model = RNNModel(vocab_size=len(dictionary), load=args.load,
                     vector_size=args.vector_size)

    # Fit and test expect a Dataset object (they use the proper subset)
    model.fit(data, epochs=args.epochs, batch_size=args.batch_size)
    model.test(data, batch_size=args.batch_size)

    if args.save:
        model.save(args.save)
Example No. 43
def load_current_dictionary():
    return Dictionary.load(os.path.join(module_path, "models", f"dictionary_{datetime.now().strftime('%Y-%m-%d')}"))
Example No. 44
    def load(self):
        if os.path.exists(self.path):
            self.id2word = Dictionary.load(self.path)
Example No. 45
    review_txt_filepath = os.path.join('../Reviews', 'review_text_all.txt')

    wrt_trigram_rvs_to_txt(trigram_reviews_filepath, review_txt_filepath,
                           trigram_model, bigram_model)
    """
    create bag of words
    """
    trigram_reviews_filepath = os.path.join(
        'results', 'trigram_transformed_reviews_all.txt')

    trigram_dictionary_filepath = os.path.join('trigram_dict_all.dict')

    learn_vocab_corpus(trigram_reviews_filepath, trigram_dictionary_filepath)

    # load the finished dictionary from disk
    trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

    trigram_bow_filepath = os.path.join('trigram_bow_corpus_all.mm')

    create_bow(trigram_reviews_filepath, trigram_bow_filepath,
               trigram_dictionary)

    # load the finished bag-of-words corpus from disk
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
    """
    find topics
    """

    lda_model_filepath = os.path.join('lda_model_all')

    create_topics(lda_model_filepath, trigram_bow_corpus, trigram_dictionary)
Example No. 46
    def LDALoad(self):
        self.ldamodel = LdaModel.load("fixed_time_window_lda.model")
        self.dictionary = Dictionary.load("lda_dictionary.model")
Example No. 47
                                  0.10):int(len(trash_tokens) * .90)]

cleared_docs = [[token for token in document if token in cleared_tokens]
                for document in cleared_docs]

## Save dictionary in serialized form
dictionary = Dictionary(cleared_docs)
dictionary.save('./dictionaries/python_tags.dict')
corpus = [dictionary.doc2bow(document) for document in cleared_docs]
MmCorpus.serialize('./dictionaries/python_tags.mm', corpus)

########################################
## Load Data
########################################
if (os.path.exists("./dictionaries/python_tags.dict")):
    dictionary = Dictionary.load('./dictionaries/python_tags.dict')
    corpus = MmCorpus('./dictionaries/python_tags.mm')
    print("Used dictionary generated")
else:
    print("Please run the preprocessing to generate a dictionary file")

########################################
## Create Model
########################################
print(corpus)
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

########################################
## Applying LSI
########################################
Example No. 48
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
# %matplotlib inline

from gensim.corpora import Dictionary, MmCorpus
trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')
trigram_bow_corpus = MmCorpus('./models2/trigram_bow_corpus.nm')
# Document to matrix
import numpy as np
from scipy.sparse import csr_matrix
rows = []
cols = []
data = []
Nrow = 1000000  #len(trigram_bow_corpus)
Ncol = len(trigram_dictionary)
for i in range(0, Nrow):  #
    line = trigram_bow_corpus[i]
    for indx, freq in line:
        rows.append(i)
        cols.append(indx)
        data.append(freq)
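
# A sketch of the likely next step (not shown in the snippet): assemble the
# collected (data, rows, cols) triples into a sparse document-term matrix.
# The name doc_term_matrix is illustrative.
doc_term_matrix = csr_matrix((data, (rows, cols)), shape=(Nrow, Ncol))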
Example No. 49
# with open(files[0], 'r') as f:
#     s = json.load(f)
#     pprint(s)
"""
FILTER AND SAVE CORPUS
"""
print("---[" + "FILTER AND SAVE CORPUS" + "]---")
news = glob('news_corpus/*.txt')
corpus = BOWCorpus(news)
tfidf = TfidfModel(corpus)

filter_low_tfidf(corpus, tfidf)

del tfidf
gc.collect()

corpus.dictionary.save('bow_corpus.dict')
MmCorpus.serialize('bow_corpus.mm', corpus)
print("-" * 6)
"""

"""

from gensim.corpora import Dictionary

d = Dictionary.load('bow_corpus.dict')
pprint(d.token2id)

# pic.twitter.com/funssqbvdr
Example No. 50
def convert_docid2from_from2docids(docid2from):
    from2docids = defaultdict(list)
    for docid, from_name in enumerate(docid2from):
        from2docids[from_name].append(docid)
    return from2docids


if __name__ == '__main__':
    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    interval = WEEK  # only WEEK is implemented for now

    model = LdaModel.load('result/model_wiki.lda')
    dictionary = Dictionary.load('data/dictionary/report_(NN).dict')

    from2docids = convert_docid2from_from2docids(dictionary.docid2from)
    time2docids = sort_by_time(dictionary.docid2date, interval)

    p_z_d = model.inference(dictionary.corpus)[0].T
    p_z_d = p_z_d / p_z_d.sum(axis=0).reshape(1, p_z_d.shape[1])  # normalize to make it probability

    # iterate over every interval
    from_similarity = {}
    for time in range(max(time2docids.keys())):
        print('\ncompute similarity for time = ' + str(time) + '...')
        from_vectors, from_frequencies = create_from_vectors(p_z_d, from2docids, time2docids, time)
        from_matrix, from_indices = convert_from_vectors(from_vectors)
        similarities = compute_similarity(from_matrix)
        id_frequencies = convert_from_id(from_frequencies, from_indices)
Example No. 51
                            phraser = Phraser(
                                Phrases(transformed_paragraphs,
                                        **phrases_parameters))
                            phraser.save(
                                phraser_filename.format(phraser_iteration + 1))
                        reader_kwargs['phraser'] = phraser
                        transformed_paragraphs = ArXMLivParagraphIterator(
                            *reader_args, **phraser_reader_kwargs)
                    del transformed_paragraphs
                reader_kwargs['phraser'] = phraser

            paragraphs = ArXMLivParagraphIterator(*reader_args,
                                                  **reader_kwargs)

            try:
                dictionary = Dictionary.load(dictionary_filename)
            except IOError:
                dictionary = Dictionary(paragraphs)
                dictionary.save(dictionary_filename)

            try:
                topic_tfidf = TfidfModel.load(topic_tfidf_filename)
            except IOError:
                topic_tfidf = TfidfModel(dictionary=dictionary,
                                         smartirs='dtb',
                                         slope=0.2)
                topic_tfidf.save(topic_tfidf_filename)

            try:
                document_tfidf = TfidfModel.load(document_tfidf_filename)
            except IOError:
Example No. 52
def LDA_Review(review_df, min_topic_freq=0):
    """
    Takes a Pandas DataFrame as input, with one
    review text string per row in the
    'FullReview' column
    """
    from tqdm import tqdm
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-processing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation
    """
    text = review_df['FullReview']
    # parse the review text with spaCy
    with codecs.open('./uni_temporary.txt', 'w', encoding='utf_8') as f:
        for sentence in tqdm(lemmatized_sentence_corpus(text)):
            # print(sentence)
            f.write(sentence + '\n')
    f.close()
    # load and apply the first-order and second-order phrase models
    bigram_model = Phrases.load('./models2/bigram_model.txt')
    trigram_model = Phrases.load('./models2/trigram_model.txt')

    unigram_review = LineSentence('./uni_temporary.txt')
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]
    # remove any remaining stopwords
    trigram_review = [
        term for term in trigram_review
        if term not in spacy.lang.en.stop_words.STOP_WORDS
    ]
    with codecs.open('./tri_temporary.txt', 'w', encoding='utf_8') as ftri:
        for sentence in trigram_review:
            sentence = u' '.join(sentence)
            ftri.write(sentence + '\n')
    ftri.close()

    trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')
    lda = LdaMulticore.load('./models2/lda_model')
    trigram_review = LineSentence('./tri_temporary.txt')
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)
    # create an LDA representation
    review_lda = lda.get_document_topics(review_bow)
    review_lda = sorted(review_lda, key=itemgetter(1), reverse=True)
    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break
        # print the most highly related topic names and frequencies
        print('{:25} {}'.format(lda_topics[topic_number], round(freq, 3)))

    ### Step 2: Generate the contents of the doctors' snapshots.

    counter = 0
    # The temporary list that stores all of the review highlights in each round of the for loop below.
    big_str = []
    # For every doctor, find two things:
    #     1. The most mentioned FIVE topics in their reviews.
    #         1.1 The sentiments of these topics.
    #     2. The 3 most positive sentences and the 3 most negative sentences.
    #         2.1 Rank all sentences according to sentiment analysis.
    # I do NOT keep info about individual reviews. All sentences are stored in a
    # long list regardless of whether they are from the same reviews or not!
    ###########################################################################
    # Build sentence dataframe for the current doctor.
    ###########################################################################
    this_hotel = pd.DataFrame(columns=[
        "HotelName", "Sentence", "Sentiment_neg", "Sentiment_neu",
        "Sentiment_pos", "Sentiment_compound", "topic_1", "topic1_score",
        "topic_2", "topic2_score"
    ])
    sent_count = 0

    # For every review sentence
    for sentence in unigram_review:
        # Assess sentiment.
        sentiments = senti.polarity_scores(sentence)
        sentiment_neg = sentiments["neg"]
        sentiment_neu = sentiments["neu"]
        sentiment_pos = sentiments["pos"]
        sentiment_compound = sentiments["compound"]
        # Assign topic.
        # Default topic to -1.
        this_topic = -1
        # Preprocess sentence.
        sent_tokens = tokenizer.tokenize(str(sentence).lower())
        cleaned_sent = [p_stemmer.stem(i) for i in sent_tokens]
        # Evaluate for topic.
        sent_topics = []
        for mod_id in range(0, mod_num):
            model = ldamodel[mod_id]
            dicti = dictionary[mod_id]
            lda_score = model[dicti.doc2bow(cleaned_sent)]
            for item in lda_score:
                sent_topics.append((mod_id, item[0], item[1]))
        sent_topics = sorted(sent_topics, key=lambda x: x[2], reverse=True)
        # Assign the most relevant topic to a sentence only if the topic is more than 70% dominant.
        if sent_topics[0][2] > 0.7:
            this_topic = topics_matrix[sent_topics[0][0]][sent_topics[0][1]]

        # Add processed sentence and its meta information to the sentence dataframe.
        this_doc.loc[sent_count] = [
            sentence, sentiment, this_topic, sent_topics[0][2]
        ]
        sent_count += 1

    ###########################################################################
    # Compiling results for a hotel.
    ###########################################################################
    # Review highlights.
    # Save the most positive and negative sentiments.
    this_doc2 = this_doc.sort_values(["sentiment"],
                                     ascending=[0]).reset_index(drop=True)
    this_doc2 = this_doc2.loc[this_doc2["topic"] != -1].reset_index(drop=True)
    this_doc2 = this_doc2.loc[this_doc2["topic_score"] > 0.5].reset_index(
        drop=True)
    sent_count_2 = len(this_doc2)
    composite = "NONE"
    # Save the most polarizing sentiments only if there are at least 6 sentences.
    if sent_count_2 > 5:
        sent1 = sent2 = sent3 = sent4 = sent5 = sent6 = ""
        # Only keep positive sentiment if its score is above 0.4 (within [-1, 1]).
        if this_doc2.loc[0]["sentiment"] > 0.4:
            sent1 = this_doc2.loc[0]["sentence"]
        if this_doc2.loc[1]["sentiment"] > 0.4:
            sent2 = this_doc2.loc[1]["sentence"]
        if this_doc2.loc[2]["sentiment"] > 0.4:
            sent3 = this_doc2.loc[2]["sentence"]
        # Only keep negative sentiment if its score is below -0.2 (within [-1, 1]).
        if this_doc2.loc[sent_count_2 - 1]["sentiment"] < -0.2:
            sent4 = this_doc2.loc[sent_count_2 - 1]["sentence"]
        if this_doc2.loc[sent_count_2 - 2]["sentiment"] < -0.2:
            sent5 = this_doc2.loc[sent_count_2 - 2]["sentence"]
        if this_doc2.loc[sent_count_2 - 3]["sentiment"] < -0.2:
            sent6 = this_doc2.loc[sent_count_2 - 3]["sentence"]
        composite = sent1 + "SSEEPP" + sent2 + "SSEEPP" + sent3 + "SSEEPP" + sent4 + "SSEEPP" + sent5 + "SSEEPP" + sent6 + "SSEEPP" + str(
            sent_count)
    # Add review highlights to the doctor dataframe.
    doctor_info.set_value(doctor_id, "summary", composite)

    # Top topics and their ratings.
    # Ratings are the percentage of positive sentences belonging to a topic.
    doc_topics = [[0 for i in range(2)]
                  for j in range(topic_num)]  # [total count, count positive]
    for index2 in range(0, len(this_doc2)):
        topic_index = this_doc2.loc[index2]["topic"]
        if topic_index != -1:
            doc_topics[topic_index][0] += 1
            topic_sentiment = this_doc2.loc[index2]["sentiment"]
            # A topic sentence is positive if its sentiment is greater than 0.1.
            if topic_sentiment > 0.1:
                doc_topics[topic_index][1] += 1
    # Do not display dentist-specific topics for non-dentists
    if not is_dentist:
        doc_topics[3][0] = 0
    # Do not output "positive comment" as a topic. It is non-informative.
    doc_topics[0][0] = 0

    # Putting the results into a format to be parsed by the webapp.
    doc_topic_tuples = []
    for index3, item in enumerate(doc_topics):
        doc_topic_tuples.append((index3, item[0], item[1]))
    doc_topic_tuples = sorted(doc_topic_tuples,
                              key=lambda x: x[1],
                              reverse=True)
    for index4 in range(0, 5):
        if doc_topic_tuples[index4][1] >= 10:
            topic_name = topics[doc_topic_tuples[index4][0]][0]
            percent_positive = str(
                int(doc_topic_tuples[index4][2] / doc_topic_tuples[index4][1] *
                    100))
            composite = topic_name + "SSEEPP" + percent_positive + "SSEEPP" + str(
                doc_topic_tuples[index4][1])
            doctor_info.set_value(doctor_id,
                                  "percent{0}".format(str(index4 + 1)),
                                  composite)

            print(topic_name, "XXXXXX",
                  doctor_info.loc[doctor_id]["specialty"])
            big_str.append(topic_name + "XXXXXX" +
                           str(doctor_info.loc[doctor_id]["specialty"]))
        else:
            doctor_info.set_value(doctor_id,
                                  "percent{0}".format(str(index4 + 1)), "NONE")

    # Print progress.
    print(counter / 5088)
    counter += doctor_review_count
    del this_doc
    del this_doc2
                        help='language of data',
                        type=check_lang)
    parser.add_argument(
        '-f, --filter',
        dest='filter',
        action='store_true',
        help='remove unfrequent and too frequent words from dictionary')
    parser.set_defaults(lang='en', filter=True)
    args = parser.parse_args()

    logging.info("Creating training corpora from data in directories: %s" %
                 args.dirs)
    logging.info("Language: %s" % args.lang)

    dictionary = None if not os.path.exists(
        dictionary_file) else Dictionary.load(dictionary_file)

    # Create BoW corpus and dictionary
    logging.info("Creating BoW corpus...")
    training_corpus = BowNewsCorpus(input=args.dirs,
                                    dictionary=dictionary,
                                    language=args.lang)

    dictionary = training_corpus.dictionary
    if args.filter:
        logging.info("Filtering dictionary...")
        # https://onlinelibrary.wiley.com/doi/epdf/10.1111/j.1756-8765.2010.01108.x
        dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=2000000)
        dictionary.compactify()

    # Serialize pre-processed BoW corpus and dictionary to files
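    # A sketch of the serialization step the comment above announces, assuming
    # gensim's MmCorpus is imported; 'training_corpus.mm' is an illustrative file
    # name, only dictionary_file appears earlier in the snippet.
    dictionary.save(dictionary_file)
    MmCorpus.serialize('training_corpus.mm', training_corpus)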
Example No. 54
import itertools as it
import warnings
import time

trigram_posts_file = 'trigram_posts.txt'
trigram_dict_file = 'trigram_dict.dict'

trigram_posts = LineSentence(trigram_posts_file)
trigram_dictionary = Dictionary(trigram_posts)
trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
trigram_dictionary.compactify()

trigram_dictionary.save(trigram_dict_file)

trigram_dictionary = Dictionary.load(trigram_dict_file)

print(trigram_dictionary)
# 34,487 unique tokens

trigram_threads_bow_file = 'trigram_threads_bow_corpus.mm'
trigram_users_bow_file = 'trigram_users_bow_corpus.mm'


def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """

    for post in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(post)
Example No. 55
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from extractor import Document
from os import getcwd

docs_file = "data/0607.head.uc.head"
docs = []
line_counter = 0
with open(docs_file) as f:
    for line in f:
        splits = line.strip().split('\t')
        assert len(splits) == 2, len(splits)
        content = splits[-1]
        docs.append(Document(content).get_string_clean())
        line_counter += 1
        if line_counter == 200:
            break
for i in range(3):
    print(i, docs[i])
print("=============================================")

doc_clean = [doc.split() for doc in docs]
path_dictionary = getcwd() + "/data/dictionary"
path_ldamodel = getcwd() + "/data/ldamodel"

dictionary = Dictionary.load(path_dictionary)
lda_load = LdaModel.load(path_ldamodel)
unseen_doc = dictionary.doc2bow(doc_clean[-1])
vector = lda_load[unseen_doc]
print(vector)
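
# The inferred vector is a list of (topic_id, probability) pairs; a minimal
# sketch (not from the original snippet) of picking the dominant topic:
dominant_topic, dominant_prob = max(vector, key=lambda pair: pair[1])
print("dominant topic:", dominant_topic, "probability:", dominant_prob)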
Example No. 56
def doc_processing(documents,
                   stopwordsFilePath='',
                   thres=10,
                   doc=False,
                   dicPath=False):

    # read the stopwords file
    if stopwordsFilePath != '':
        file = codecs.open(stopwordsFilePath, 'r', 'utf-8')
        stopwords = [line.strip() for line in file]
        file.close()
    else:
        stopwords = ''

    if dicPath:
        dictionary = Dictionary.load('../dictionary_2corpora.dic')

        # file = codecs.open(dicPath, 'r', 'utf-8')
        # dictionary = [line.strip() for line in file]
        # file.close()

    N = len(documents)
    wordCounts = []
    word2id = {}
    id2word = {}
    currentId = 0
    my_punctuation = '!"#$%&\'()*+,-./:;<=>?@[]^_`{|}~'

    # generate the word2id and id2word maps and count the number of times of words showing up in documents
    # bigram = gensim.models.Phrases(documents)
    # documents = bigram[documents]

    documents_ = []
    for i, document in enumerate(documents):
        # if i%1000 == 0:
        #     print('Document #%d ...' % i)

        if doc == False:
            words_in_sent = tokenize(document, deacc=False)

            wordCount = {}
            for word in words_in_sent:
                if len(word) > 1 and word not in stopwords and word not in my_punctuation:
                    if word not in word2id.keys():
                        word2id[word] = currentId
                        id2word[currentId] = word
                        currentId += 1
                    if word in wordCount:
                        wordCount[word] += 1
                    else:
                        wordCount[word] = 1
            wordCounts.append(wordCount)
            i += 1

        else:
            doc = []
            words_in_sent = tokenize(document, deacc=False)
            for word in words_in_sent:
                if dicPath:
                    if word in dictionary:
                        doc.append(word)
                elif len(word) > 1 and word not in stopwords and word not in my_punctuation:
                    doc.append(word)

            # if stopwordsFilePath != '':
            #     for word in words_in_sent:
            #         if dicPath:
            #             if word in dictionary:
            #                 doc.append(word)
            #         elif len(word) > 1 and word not in stopwords and word not in my_punctuation:
            #             doc.append(word)
            # else:
            #     stopwords = ''
            #     for word in words_in_sent:
            #         doc.append(word)
            documents_.append(doc)

    if doc == False:
        word2id_ = {}
        id2word_ = {}
        M = len(word2id)

        # generate the document-word matrix
        X = np.zeros([N, M], dtype=np.int8)
        for i in range(N):
            for word in wordCounts[i]:
                j = word2id[word]
                # Occasionally a (seemingly random) negative count appears here,
                # most likely int8 overflow for very frequent words; clamp it to zero.
                if wordCounts[i][word] < 0:
                    wordCounts[i][word] = 0
                X[i, j] = wordCounts[i][word]

        # Drop words at the frequency extremes
        X2 = []
        for w in range(X.shape[1]):
            thres_up = X.shape[0] * 10
            if thres <= np.sum(X[:, w]) < thres_up:
                X2.append(X[:, w])
                word = id2word[w]
                word2id_[word] = word2id[word]
                id2word_[len(X2) - 1] = word

        X2 = np.array(X2)
        X2 = X2.T
        M = X2.shape[1]
        print('Dictionary size: %d' % M)

        return N, M, word2id_, id2word_, X2

    else:
        return documents_, stopwords
Example No. 57
from gensim.models.phrases import Phraser
from gensim.corpora import Dictionary
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from util import *
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
import time

parquetpath = './dataset/final/'
trigram = Phraser.load("./vocab/trigram")
bigram = Phraser.load("./vocab/bigram")
dct = Dictionary.load("./gensim_dct")
reviews = pd.read_parquet(path=parquetpath)
reviews = reviews[reviews["length"] > 5].sample(10000)

rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_resample(reviews[["text"]],
                                            reviews["sentiment"])
X_resampled = pd.DataFrame(X_resampled)
y_resampled = pd.DataFrame(y_resampled)
X_resampled.columns = reviews[["text"]].columns
y_resampled.columns = reviews[["sentiment"]].columns

model = TfidfModel(dictionary=dct)
t0 = time.time()
X_csc = apply_tfidf(dct, model, X_resampled, bigram, trigram)
t1 = time.time()
print("Applied tfidf:", t1 - t0)
# use SVD only
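
# A minimal sketch of the "use SVD only" step noted above, assuming X_csc is the
# document-by-term sparse matrix returned by apply_tfidf; n_components=100 is an
# illustrative value, not from the original code.
svd = TruncatedSVD(n_components=100)
X_svd = svd.fit_transform(X_csc)
print("Reduced matrix shape:", X_svd.shape)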
                for i, a_tweet in enumerate(TweetRawCorpusStream(file_path)):
                    token_f = [
                        x for x in a_tweet.tokens_str.split(",") if len(x) > 1
                    ]
                    dct.add_documents([token_f], prune_at=None)
                sizeofCorpus = i - 1
                print(f"Totally {sizeofCorpus} tweets in {each_collection}.")
        print("Original size of vocabs: {}".format(len(dct)))
        # control the vocabulary
        dct.filter_extremes(no_below=40,
                            no_above=0.5,
                            keep_n=len(dct),
                            keep_tokens=None)
        print("Truncated size of vocabs: {}".format(len(dct)))
    elif preDictTag is not None:
        dct = Dictionary.load('{}{}.dict'.format(corpora_path, preDictTag))

    #### Step 2, apply Tf-IDF representation ####
    bow_corpus = []
    meta_wf = open("{}{}-Meta.csv".format(corpora_path, fileTag), "w")
    meta_wf.write("position_index,id_str,created_time\n")

    # use Timer to print elapsed time
    with Timer():
        for each_collection in collections:
            print("Transforming the corpus for {}".format(each_collection))
            file_path = f"{corpora_path}{each_collection}-raw-corpus.tsv"
            for i, a_tweet in enumerate(TweetRawCorpusStream(file_path)):
                # gensim's Dictionary.doc2bow will ignore words that are not in dictionary by default
                bow_per_doc = dct.doc2bow(a_tweet.tokens_str.split(","))
                if len(bow_per_doc) > 4:
Example No. 59
    def __init__(self, path='model'):
        self.ldamodel = LdaModel.load(path + "/fixed_time_window_lda.model")
        self.dictionary = Dictionary.load(path + "/lda_dictionary.model")
Example No. 60
def categorisation(semi, model_name, category, update):
    '''
    Apply pre-trained LDA model to a set of space mission design requirements

    Inputs:
    - semi: if True, the model to be used for the categorisation is semi-supervised
    - model_name: name of the saved model to load, usually in the format 'model_topicNumber';
    all saved models can be found under LDAmodels
    - category: category of requirements to use for the categorisation test, one of the following options found in
    Corpora/requirementsCorpus: 'AOCS', 'com', 'environment', 'GS', 'Launch', 'MA', 'OBDH', 'payload', 'Power', 'prop',
    'thermal'
    - update: if True, the selected unsupervised LDA model will be updated with the update corpus found in
    Corpora/updateCorpus for the chosen category.

    Outputs: the Accuracy Score and Mean Reciprocal Ranking of the categorisation

    CAREFUL 1: LDA model generation is a stochastic process, so in the case of an updated model the user will need
    to manually label the topic dictionaries and save them as .txt under TopicModeling/inputs4Categorisation.
    See the label .txt files for the LDA models in this folder as examples.

    CAREFUL 2: The same applies to the semi-supervised and unsupervised models. Manual label files are provided for the
    unsupervised and semi-supervised models used in the paper, but newly trained models require new labels each time.

    CAREFUL 3: some modifications to the requirement pre-processing may have changed the semi-supervised model results
    w.r.t. the original paper presented at IAC 2019.'''

    start = time.time()

    # Unsupervised LDA model case
    if not semi:
        # Load LDA model and corresponding dictionary ------------------------------------------------------------------
        ldaModel = parentDir + '/TopicModeling/LDAmodels/unsupervised/' + str(model_name)
        lda = models.ldamodel.LdaModel.load(ldaModel)
        print('Model Topics Number:', lda.num_topics)

        dic = parentDir + '/TopicModeling/LDAmodels/unsupervised/dic_' + str(model_name) + '.dict'
        modelDic = Dictionary.load(dic)

        # Recreating the topics dictionaries ---------------------------------------------------------------------------
        ldaTopics = lda.show_topics(formatted=False, num_topics=lda.num_topics, num_words=15)
        print('Loaded LDA Topics Dictionaries, top 15 words:', *ldaTopics, sep='\n')

        # Get manual labels --------------------------------------------------------------------------------------------
        labels = []
        with open(parentDir + '/TopicModeling/inputs4Categorisation/manualLabels_' + model_name + '.txt', 'r',
                  encoding="utf-8") as labelsFile:
            labelLine = labelsFile.read().split('\n')
            for line in labelLine:
                if line:
                    labels.append(line.split(', '))

        labels = [[int(label[0]), label[1]] for label in labels]
        labels = list(itertools.chain.from_iterable(labels))
        print('\n Loaded Model Labels:', labels)

        if update:
            # Updated Unsupervised LDA model case
            # Update LDA model with wikipedia pages focused on one topic -----------------------------------------------
            print('\n Generating a specific LDA model for category', category, ':')
            # Only currently available for GS (Ground Segment), Launch, MA (Mission Analysis), OBDH, payload categories
            filepath = parentDir + '/TopicModeling/Corpora/updateCorpus/' + category + '_update/'

            # Pre-processing of .json docs into tokens
            reqdoc = corpusProcessing(filepath)

            # Use lda model dictionary to transform into document-term matrix understood by the model
            addcorpus = [modelDic.doc2bow(text) for text in reqdoc]

            # Update model
            lda.update(addcorpus, passes=600, offset=1500)

            # Print new dictionary of topics
            ldaTopics = lda.show_topics(formatted=False, num_topics=lda.num_topics, num_words=15)
            print('\n LDA Topics after update', *ldaTopics, sep='\n')

            # Get manual labels ----------------------------------------------------------------------------------------
            labels = []
            with open(parentDir + '/TopicModeling/inputs4Categorisation/manualLabels_' + model_name + '_'+ category +'.txt', 'r',
                      encoding="utf-8") as labelsFile:
                labelLine = labelsFile.read().split('\n')
                for line in labelLine:
                    if line:
                        labels.append(line.split(', '))

            labels = [[int(label[0]), label[1]] for label in labels]
            labels = list(itertools.chain.from_iterable(labels))
            print('\n Labels:', labels)

    else:
        # Semi-supervised LDA model case

        # Load LDA model and corresponding dictionary
        ldaModel = parentDir+'/TopicModeling/LDAmodels/semisupervised/guided'+str(model_name)
        lda = models.ldamodel.LdaModel.load(ldaModel)
        print('topics number:', lda.num_topics)

        dic = parentDir +'/TopicModeling/LDAmodels/semisupervised/dic_guided'+str(model_name)+'.dict'
        modelDic = Dictionary.load(dic)

        # Recreating the topics dictionaries
        ldaTopics = lda.show_topics(formatted=False, num_topics=lda.num_topics, num_words=20)
        print('LDA Topics ', *ldaTopics, sep='\n')

        # Get manual labels --------------------------------------------------------------------------------------------
        labels = []
        with open(parentDir + '/TopicModeling/inputs4Categorisation/manualLabels_'+model_name+'_semisupervised.txt', 'r',
                  encoding="utf-8") as labelsFile:
            labelLine = labelsFile.read().split('\n')
            for line in labelLine:
                if line:
                    labels.append(line.split(', '))

        labels = [[int(label[0]), label[1]] for label in labels]
        labels = list(itertools.chain.from_iterable(labels))
        print('\n Labels:', labels)

    # Get test requirements List ---------------------------------------------------------------------------------------
    requirementsList = []
    with open(parentDir + '/TopicModeling/Corpora/requirementsCorpus/req_'+ category+'.txt', 'r', encoding="utf-8") as filteredList:
        requirements = filteredList.read().split('\n')
        for req in requirements:
            if req:
                requirementsList.append(req.split(" | "))

    # Categorisation ---------------------------------------------------------------------------------------------------
    gt = []
    allResults = []
    all_req = []

    for item in requirementsList:
        req = item[0]
        gt.append(item[1])

        # pre-process requirement
        req = NLPPipe(req)
        all_req.append(req)

        # Use the same dictionary as pre-trained model to  convert a list of words into bag of word format
        unseen_doc = modelDic.doc2bow(req)

        # get topic probability distribution for the unseen document
        vector = lda[unseen_doc]
        sorted_vector = sorted(vector, key=itemgetter(1), reverse=True)

        # Threshold - keep the top 2 associated topics, with probabilities
        results = list(map(list, sorted_vector[0:2]))

        # associate top results with manually assigned labels
        for item in results:
            item[0] = labels[labels.index(item[0]+1) + 1]

        allResults.append(results)

    #print('\n All requirements:\n', *all_req, sep='\n')
    print('\n All Results for category', category, ' :')
    print(len(requirementsList), ' requirements were analysed.')
    print(*allResults, sep='\n')

    # Categorisation Evaluation -------------------------------------------------------------------------------------------
    # we have per requirement i, the ground truth gt[i] and the LDA model topic distribution results[i]

    # Accuracy calculation
    firstChoice = [item[0][0] for item in allResults]
    firstChoiceAccuracy = accuracy_score(gt, firstChoice)
    print('First Choice Accuracy : ', firstChoiceAccuracy)

    # Mean Reciprocal Ranking
    bigScore = 0
    for item in allResults:
        i = allResults.index(item)
        score = 0
        if item[0][0] == gt[i]:
            score = 1
        elif len(item) > 1:
            if item[1][0] == gt[i]:
                score = 0.5
        bigScore = bigScore + score
    meanReciprocalrank = bigScore / len(requirementsList)
    print('Mean Reciprocal Rank : ', meanReciprocalrank, '\n ---------')

    print('Computation Time:', round((time.time() - start) / 60, 2), 'minutes')

    return
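
# A minimal usage sketch of categorisation(); 'model_22' is an illustrative
# placeholder for a saved model name, and 'MA' is one of the category options
# listed in the docstring.
if __name__ == '__main__':
    categorisation(semi=False, model_name='model_22', category='MA', update=False)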