Example #1
def merge_dictionaries(dictionaries_path, merged_dictionary_path=None):
    dict_paths = list(iglob(dictionaries_path))

    final_dictionary = Dictionary.load(dict_paths[0])

    for dict_path in dict_paths[1:]:
        dictionary = Dictionary.load(dict_path)

        final_dictionary.merge_with(dictionary)

    if merged_dictionary_path:
        final_dictionary.save(merged_dictionary_path)

    return final_dictionary
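A minimal usage sketch for the helper above, assuming the shard dictionaries were previously saved with Dictionary.save() (the glob pattern and paths here are hypothetical):

# Imports the helper relies on.
from glob import iglob
from gensim.corpora.dictionary import Dictionary

# Merge every shard dictionary matching the pattern and persist the result.
merged = merge_dictionaries('shards/*.dict', merged_dictionary_path='merged.dict')
print(len(merged))  # vocabulary size after merging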
Example #3
def loadModelfromFile(modelPath, readOnly=False):

    if readOnly:
        lda_model = LdaModel.load(fname=modelPath, mmap='r')
        dictionary = Dictionary.load(fname=modelPath.replace(
            '.topic', '.dict'),
                                     mmap='r')
    else:
        lda_model = LdaModel.load(fname=modelPath)
        dictionary = Dictionary.load(
            fname=modelPath.replace('.topic', '.dict'))
    print('loaded LDA model from {0}'.format(modelPath))

    return lda_model, dictionary
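For large models, mmap='r' memory-maps the model's numpy arrays read-only instead of copying them into RAM, which lets several processes share one on-disk model. A hypothetical call to the helper above, assuming a matching '.topic'/'.dict' file pair exists:

lda_model, dictionary = loadModelfromFile('models/news.topic', readOnly=True)
print(lda_model.num_topics, len(dictionary))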
Example #4
def analyze_top_dfs(tokendict, tagdict, cutoff_factor=1):
    ''' Given gensim dictionaries `tokendict` and `tagdict`, show the top token document frequencies. '''
    if type(tokendict) == str:
        tokendict = Dictionary.load(tokendict)
    if type(tagdict) == str:
        tagdict = Dictionary.load(tagdict)
    
    max_tag_df = max(tagdict.dfs.iteritems(), key=operator.itemgetter(1))
    sorted_dfs = sorted(tokendict.dfs.iteritems(), key=operator.itemgetter(1), reverse=True)
    print "count threshold: %-15s\t%d" % (tagdict[max_tag_df[0]], max_tag_df[1])
    print "----------------------------------------------"
    for tup in sorted_dfs[:100]:
        if tup[1] > max_tag_df[1] * cutoff_factor:
            print "%-15s\t%d" % (tokendict[tup[0]][:15], tup[1])
        else: break
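For reference, Dictionary.dfs maps each token id to its document frequency (the number of documents containing that token), which is what the helper above sorts on. A small self-contained sketch:

from gensim.corpora.dictionary import Dictionary

docs = [['cat', 'dog'], ['dog', 'bird'], ['dog']]
d = Dictionary(docs)
# dfs: token id -> number of documents containing the token
for token_id, df in sorted(d.dfs.items(), key=lambda kv: kv[1], reverse=True):
    print(d[token_id], df)  # 'dog' -> 3, 'cat' -> 1, 'bird' -> 1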
Example #5
File: lsi.py Project: jwilber/artcamp
    def load(lsi_path=None, id2word_path=None, index_path=None):
        """
        If specified, attempts to load gensim LsiModel from `lsi_path`
        and gensim Dictionary from `dictionary_path`.

        Parameters
        ----------
        lsi_path: str
            File-path designating where self.model should be saved.
        id2word_path: str
            File-path designating where self.dictionary should be saved.
        """
        if lsi_path is not None:
            from gensim.models import LsiModel
            if not os.path.exists(lsi_path):
                raise IOError(
                    'The provided file path to the LsiModel was not found. '
                    'Please ensure that the argument is the correct path.')
            return LsiModel.load(lsi_path)
        if id2word_path is not None:
            from gensim.corpora.dictionary import Dictionary
            if not os.path.exists(id2word_path):
                raise IOError(
                    'The provided file path to the Dictionary was not found. '
                    'Please ensure that the argument is the correct path.')
            return Dictionary.load(id2word_path)
        if index_path is not None:
            from gensim.similarities import MatrixSimilarity
            if not os.path.exists(index_path):
                raise IOError(
                    'The provided file path to the MatrixSimilarity index was not found. '
                    'Please ensure that the argument is the correct path.')
            return MatrixSimilarity.load(index_path)
Example #6
def main(args):
    if args.corpus_type != "wiki":
        if args.processed_corpus_save_path is not None:
            raise ValueError("Processed corpus saving only supported " "for 'wiki' corpus type")

    kwargs = {}
    if args.dictionary_path is not None:
        kwargs["dictionary"] = Dictionary.load(args.dictionary_path)
    if args.dictionary_out_path is not None:
        kwargs["dictionary_save_path"] = args.dictionary_out_path

    if args.corpus_type == "wiki" and args.processed_corpus_save_path is not None:
        kwargs["sentences_save_path"] = args.processed_corpus_save_path

    logging.debug("Building corpus")
    corpus = CORPUS_TYPES[args.corpus_type](args.corpus_path, **kwargs)
    documents = corpus.get_texts()

    logging.debug("Now beginning VSM construction with Word2Vec")

    model = Word2Vec(
        sentences=documents,
        vocab_path=args.vocab_path,
        window=args.window_size,
        drop_capitals=args.drop_capitals,
        min_count=args.minimum_token_count,
        size=args.vector_dimensions,
        workers=multiprocessing.cpu_count(),
    )

    model.save(args.out_path)

    if args.vocab_out_path is not None:
        model.save_vocab(args.vocab_out_path)
Example #7
 def _load(self):
     modeldir = self._workdir.joinpath("ldamodel_{}".format(self._name))
     if not modeldir.exists():
         return False
     self._lda = LdaMulticore.load(str(modeldir))
     self._dictionary = Dictionary.load(
         str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
Example #8
    def __init__(self,
                 topics=10,
                 worker=3,
                 pretrained_model=None,
                 dictionary=None):
        """
        Initialize LDA model training.
        Args:
            topics -- number of topics
            worker -- degree of parallelism; usually the number of CPU cores minus one
            pretrained_model -- a previously trained model; online updates are supported,
                                so the model from a previous run can be loaded and updated
            dictionary -- a dictionary mapping words to IDs, paired with the model
        Example:
            >>> lda = LDA(topics = 20, worker = 2, 
                          pretrained_model = model_file, 
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """

        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)
Example #9
def plot_dict_hist(gdict):
    ''' Provided gensim-dict `gdict`, plot hist statistics '''
    if type(gdict) == str:
        gdict = Dictionary.load(gdict)
    sorted_dfs = sorted(gdict.dfs.iteritems(), key=operator.itemgetter(1), reverse=True)
    y = [tup[1] for tup in sorted_dfs]
    x = arange(0, len(y))
    
    plt.figure(figsize=(8,5));
    plt.loglog(x, y);
    plt.grid();
    plt.xlabel("Token rank");
    plt.ylabel("Document count");
    
    cdf = np.empty(len(y))
    delta(y, cdf)
    cdf /= np.max(cdf) # normalize
    
    x50 = x[cdf > 0.50][0]
    x80 = x[cdf > 0.80][0]
    x90 = x[cdf > 0.90][0]
    x95 = x[cdf > 0.95][0]
    
    plt.axvline(x50, color='c');
    plt.axvline(x80, color='g');
    plt.axvline(x90, color='r');
    plt.axvline(x95, color='k');
    
    print "50%\t", x50
    print "80%\t", x80
    print "90%\t", x90
    print "95%\t", x95
Example #10
 def load_model(self, username):
     if username not in self.models:
         self.models[username] = models.LdaModel.load(
             self.get_model_path(username=username))
     if username not in self.dictionaries:
         self.dictionaries[username] = Dictionary.load(
             self.get_dictionary_path(username=username))
Example #11
class Corpus(object):
    def __init__(self, path, dict_path):
        self.dictionary = Dictionary()
        add_to_dict = True
        if dict_path and os.path.exists(dict_path):
            print('loading dictionary')
            self.dictionary = self.dictionary.load(dict_path)
            add_to_dict = False
        self.train = self.tokenize(os.path.join(path, 'train.txt'),
                                   add_to_dict)
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'),
                                   add_to_dict)
        self.test = self.tokenize(os.path.join(path, 'test.txt'), add_to_dict)
        if dict_path and not os.path.exists(dict_path):
            self.dictionary.save(dict_path)

    def tokenize(self, path, add_to_dict):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        all_words = list(
            chain.from_iterable([
                sent.split() + ['<eos>']
                for sent in open(path).read().split('\n')
            ]))
        if add_to_dict:
            self.dictionary.add_documents([all_words])
        return torch.LongTensor(self.dictionary.doc2idx(all_words))
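Dictionary.doc2idx, used in tokenize() above, maps each token to its integer id; tokens missing from the dictionary map to -1 by default. A toy illustration:

from gensim.corpora.dictionary import Dictionary

d = Dictionary([['the', 'cat', 'sat', '<eos>']])
print(d.doc2idx(['the', 'cat', 'flew', '<eos>']))  # unknown token 'flew' becomes -1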
Example #12
File: lda.py Project: freygit/36
    def __init__(self, topics = 10, 
                 worker = 3, 
                 pretrained_model = None, 
                 dictionary = None):
        """
        Initialize LDA model training.
        Args:
            topics -- number of topics
            worker -- degree of parallelism; usually the number of CPU cores minus one
            pretrained_model -- a previously trained model; online updates are supported,
                                so the model from a previous run can be loaded and updated
            dictionary -- a dictionary mapping words to IDs, paired with the model
        Example:
            >>> lda = LDA(topics = 20, worker = 2, 
                          pretrained_model = model_file, 
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """

        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)
Example #13
def prune_dictionary(src_dictionary_path, dest_dictionary_path=None,
                     no_below=None, no_above=None, keep_n=None):
    dictionary = Dictionary.load(src_dictionary_path)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above,
                               keep_n=keep_n)

    if dest_dictionary_path:
        dictionary.save(dest_dictionary_path)

    return dictionary
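filter_extremes drops tokens that appear in fewer than no_below documents or in more than no_above (a fraction) of all documents, then keeps at most keep_n of the remaining tokens. A hypothetical call to the helper above:

# Keep tokens seen in at least 5 documents and in at most 50% of documents,
# capped at a 100,000-term vocabulary (paths are hypothetical).
pruned = prune_dictionary('full.dict', dest_dictionary_path='pruned.dict',
                          no_below=5, no_above=0.5, keep_n=100000)
print(len(pruned))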
Example #14
 def representation(self):
     if not self.model:
         print("LOAD MODEL...")
         self.model = LsiModel.load(
             os.path.join(self.preprocessor.source.path,
                          self.preprocessor.source.info + '.model'))
         self.dictionary = Dictionary.load(
             os.path.join(self.preprocessor.source.path,
                          self.preprocessor.source.info + '.dic'))
     pass
Example #15
def getDictionary(word_corpus, useSavedTill):
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        dictionary = Dictionary.load(file_lda_gensim_dictionary)
        return dictionary
    else:
        common_logger.info("Creating dictionary from corpus")
        dictionary = Dictionary(word_corpus.values())
        common_logger.info("saving dictionary")
        dictionary.save(file_lda_gensim_dictionary)
        return dictionary
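The caching pattern above reduces to a save/load round trip on the Dictionary object; a minimal sketch with hypothetical paths:

from gensim.corpora.dictionary import Dictionary

dictionary = Dictionary([['hello', 'world'], ['hello', 'gensim']])
dictionary.save('example.dict')  # serialized via gensim's SaveLoad machinery
reloaded = Dictionary.load('example.dict')
assert reloaded.token2id == dictionary.token2id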
Example #16
 def __init__(self, examples, vocab, lda_vocab_path, lda_model_path, args):
     self.data = examples
     self.vocab = vocab
     self.args = args
     self.item_vocab = load_item_vocab(args)
     self.lda_vocab = Dictionary.load(lda_vocab_path)
     self.lda_model = LdaMulticore.load(lda_model_path)
     self.sent_lim = [
         self.args.cp_sentNum, self.args.desc_sentNum,
         self.args.require_sentNum, self.args.benefit_sentNum
     ]
Example #17
File: summriz.py Project: diahnuri/TMSS
def f4 (berita):
    from Sastrawi.Stemmer import StemmerFactory
    from Sastrawi.StopWordRemover import StopWordRemoverFactory
    
    import gensim
    from gensim import corpora
    from gensim.corpora.dictionary import Dictionary
    from gensim.models.ldamodel import LdaModel
    from gensim.matutils import cossim as cs
    import os
#     os.chdir('D:/[Projects]/corpus/wiki2')
    
    id2word = Dictionary.load(os.path.join(dir_path, 'wiki_mini.dict'))
    mm_corp = corpora.MmCorpus(os.path.join(dir_path, 'wiki_mini_bow.mm'))
    lda = LdaModel.load(os.path.join(dir_path, 'lda_model_mini_wiki.model'))
    
    
    stopword_factory = StopWordRemoverFactory.StopWordRemoverFactory()
    stemmer_factory = StemmerFactory.StemmerFactory()
    stopwords = stopword_factory.create_stop_word_remover()
    stemmer = stemmer_factory.create_stemmer()
    
    

    judul = stemmer.stem(berita['judul'])
    judul = stopwords.remove(judul)
    bow_judul = id2word.doc2bow(judul.lower().split())
    lda_judul = lda[bow_judul]

    sentences = berita['kalimat_bersih']

    berita['skor']['f4'] = []
    skor = berita['skor']['f4']

# distance can't use JSD yet because there is no solution when the matrix sizes differ
# possibly because the LDA dictionary is still small, so it has few entries

    for kalimat in sentences:
        bow_kalimat = id2word.doc2bow(kalimat.lower().split())
        lda_kalimat = lda[bow_kalimat]
        skor_cs = cs(bow_kalimat,bow_judul)
        skor.append(skor_cs)           

#             print(lda_kalimat)
#             print(lda_judul)
#             print(kalimat)
#             print(judul)            
#             distance = jsd(lda_kalimat,lda_judul)
#             if -1 < distance < 1:
#                 skor.append(distance-1)               
#             else :
#                 skor.append(0)
    return berita
Example #18
    def __init__(self,
                 corpus,
                 wiki_dict,
                 wordfile,
                 vocab_size=200000,
                 window_size=5):
        self.w2id_dict = util.load_worddict(wordfile, vocab_size)
        self.window_size = window_size

        print('Starting loading Wiki Corpus...', end='')
        wiki_d = Dictionary.load(wiki_dict)
        self.wiki_corpus = WikiCorpus(corpus, dictionary=wiki_d)
        print('[done]')
Example #19
def filter_extremes_wrapper(gdict, no_below=1, no_above=1.0, keep_n=None, save_pickle=None):
    ''' Given unfiltered gensim-dict `gdict`, wrap filter_extremes '''
    if type(gdict) == str:
        gdict = Dictionary.load(gdict)
    print "Before filtering:", gdict
    gdict.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    print "After filtering:", gdict
    
    if save_pickle:
        print "\nsaving..."
        gdict.save(save_pickle)
    
    return gdict
Example #20
def prune_dictionary(src_dictionary_path,
                     dest_dictionary_path=None,
                     no_below=None,
                     no_above=None,
                     keep_n=None):
    dictionary = Dictionary.load(src_dictionary_path)
    dictionary.filter_extremes(no_below=no_below,
                               no_above=no_above,
                               keep_n=keep_n)

    if dest_dictionary_path:
        dictionary.save(dest_dictionary_path)

    return dictionary
Example #21
def get_vocab(tweets=None):
    if 'vocab_sentiment' in os.listdir('.'):
        if not tweets:
            print("Loading vocabulary...")
            vocab = Dictionary.load('vocab_sentiment')
            print("Loaded vocabulary")
            return vocab
        response = input('Vocabulary found. Do you want to load it? (Y/n)'\
                             ': ')
        if response.lower() in ['n', 'no', 'nah', 'nono', 'nahi', 'nein']:
            if not tweets:
                tweets, labels = export()
                del labels
            return create_vocab(tweets)
        else:
            print("Loading vocabulary...")
            vocab = Dictionary.load('vocab_sentiment')
            print("Loaded vocabulary")
            return vocab
    else:
        if not tweets:
            tweets, labels = export()
            del labels
        return create_vocab(tweets)
Example #22
def load_data():
    '''Load the processed data set, re-parse the nested list columns back into Python lists, and load the gensim dictionaries.'''
    df = pd.read_csv('data/processed_full.tsv', sep='\t')
    df['english_tokens'] = df['english_tokens'].apply(
        lambda x: x.strip("['']").split("', '"))
    df['french_tokens'] = df['french_tokens'].apply(
        lambda x: x.strip("['']").split("', '"))
    df['english_bow'] = df['english_bow'].apply(str_to_int)
    df['french_bow'] = df['french_bow'].apply(str_to_int)
    df['english_padded'] = df['english_padded'].apply(str_to_int)
    df['french_padded'] = df['french_padded'].apply(str_to_int)
    df = df.drop('Unnamed: 0', axis=1)

    eng = Dictionary.load('data/Dictionaries/eng')
    fren = Dictionary.load('data/Dictionaries/fren')

    # create ML data
    X_eng = np.vstack(df['english_padded'].values)
    y_fren = np.vstack(df['french_padded'].values)

    y_fren = y_fren.reshape(*y_fren.shape, 1)
    X_eng = X_eng.reshape(*X_eng.shape, 1)

    return df, eng, fren, X_eng, y_fren
Example #23
def pre_processing():
    global vocab, model
    try:
        model = load_model('SentimentAnalysis/model_nn.h5')
    except IOError:
        if 'model_nn.tar.gz' not in os.listdir('SentimentAnalysis'):
            raise IOError("Could not find Sentiment Analysis model. Ensure model "\
                      "is present in: ./SentimentAnalysis")
        else:
            process = subprocess.Popen("cd SentimentAnalysis/; "\
                                   "tar -zxf model_nn.tar.gz; cd ..",
                                   shell=True, stdout=subprocess.PIPE)
            process.wait()
            model = load_model('/content/PClub-Project-master/SentimentAnalysis/model_nn.h5')
    vocab = Dictionary.load('SentimentAnalysis/vocab_sentiment')
Example #24
def main():
    logformat = '%(asctime)s %(name)-12s: %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=logformat)
    kera = NOB_kera()
    es = Elasticsearch(port=9201)
    mod = LdaModel.load(modelfile)
    vocab = Dictionary.load(vocabulary)
    tfidf = TfidfModel(dictionary=vocab)
    results = []
    for (topics, topicid) in get_doc_topics(mod, mod.num_topics, num_words_from_topic, vocab, tfidf):
        res = es.search(index='wiki4', body={"query": {"match": {"_all": topics}}}, size=num_results_from_es)
        results.append({'topics': topics, 'result': res, 'topicid': topicid})
    results = add_keywords(results, kera)
    df = pd.DataFrame(results)
    df.to_csv('nowiki_4_with_kera_250_topics.csv', encoding='utf-8')
Example #25
def main(coursesList):
    lda = LDA.load("./best_model.lda")
    dictionary = Dictionary.load("best_model.lda.id2word")
    bigrams = Phraser.load("./bigram_model.pkl")
    trigrams = Phraser.load("./trigram_model.pkl")
    text_clean = [doc.split(' ') for doc in coursesList['description']]
    corpus = [dictionary.doc2bow(text) for text in text_clean]
    create_vector_topics(lda, corpus, dictionary, coursesList)
    courses_topic = config.matrix_courses_topic.to_numpy()

    #lda, dictionary, bigrams, trigrams = create_LDA_model(coursesList)
    #courses_topic = config.matrix_courses_topic.to_numpy()

    cursor.execute("select id from auth_group")
    id_groups = cursor.fetchall()
    for i in id_groups:
        cursor.execute(
            "select distinct studyplan_id from students where group_id = %(id)s ",
            {'id': i[0]})
        studyplan_id = cursor.fetchall()
        for j in studyplan_id:
            subject_list = pd.DataFrame(columns=['id_subject', 'description'])
            subject_list = WordProcessing.word_processing(
                get_work_program(j[0], subject_list))
            #for k in subject_list:
            token_stud_prog = [
                program.split(' ') for program in subject_list['description']
            ]
            #token_stud_prog = add_n_grams(token_stud_prog, bigrams, trigrams)
            prog_corp = [
                dictionary.doc2bow(program) for program in token_stud_prog
            ]
            topic_prog = lda.get_document_topics(prog_corp)
            for l in range(0, len(topic_prog)):
                profile_student = np.zeros(config.num_lda_topic)
                dense_topic_prog = np.zeros(config.num_lda_topic)
                for m in topic_prog[l]:
                    dense_topic_prog[m[0]] += m[1]
                #mask = np.argsort(dense_topic_prog)[::-1][:1]
                #profile_student[mask] += 1
                profile_student = dense_topic_prog
                cosine_similarities = linear_kernel(
                    profile_student.reshape(1, -1), courses_topic).flatten()
                top_courses = np.where(cosine_similarities >= 0.2)[0]
                print(subject_list.loc[l, 'id_subject'])
                #print(top_courses)
                print(coursesList.loc[top_courses, 'name':'link'])
Example #26
File: building.py Project: diahnuri/TMSS
def latentDir():
    #     just a quick check that the LDA model can actually be loaded here
    import gensim
    from gensim import corpora
    from gensim.corpora.dictionary import Dictionary
    from gensim.models.ldamodel import LdaModel
    import os
    os.chdir('D:/[Projects]/corpus/wiki2')
    mm_corp = corpora.MmCorpus('./LDA/wiki_mini_bow.mm')
    id2word = Dictionary.load('./LDA/wiki_mini.dict')
    lda = LdaModel.load('./LDA/lda_model_mini_wiki.model')

    if lda is not None:
        print('LDA model loaded successfully')
    else:
        print('failed to load the LDA model')
    return
Example #27
    def __init__(self, examples, tokenizer, lda_vocab_path, lda_model_path,
                 args):
        self.data = examples
        self.tokenizer = tokenizer
        # add new special token
        self.spec_tokens = load_special_tokens(args)
        self.tokenizer.additional_special_tokens = self.spec_tokens
        self.tokenizer.add_tokens(self.spec_tokens)
        self.args = args
        self.item_vocab = load_item_vocab(args)
        self.lda_vocab = Dictionary.load(lda_vocab_path)
        self.lda_model = LdaMulticore.load(lda_model_path)

        self.sent_lim = [
            self.args.cp_sentNum, self.args.desc_sentNum,
            self.args.require_sentNum, self.args.benefit_sentNum
        ]
        self.text_fields = self.data[0]._fields[:4]
Example #28
def questions_to_keywords(questions, per_question):
    with open('corpus.json') as bowfile:
        bow = json.load(bowfile)
        dictionary = Dictionary.load('dictionary.dict')
        if per_question:
            keywords_per_question = []
            for question in questions:
                words = preprocess_question(question)
                keywords_per_question.append(
                    tf_idf_keywords(words, bow, dictionary))
            return keywords_per_question
        else:
            for i in range(len(questions)):
                question = questions[i]
                words = preprocess_question(question)
                questions[i] = words
            questions = [word for question in questions for word in question]
            return tf_idf_keywords(questions, bow, dictionary)
Example #29
def corpus_tfidf():
    path = "" 
    corpus = MmCorpus(path + "corpus.mm")
    id2word = Dictionary.load(path + 'corpus.mm.dict')
    
    # TF-IDF the corpus
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    tfidf.save("5_topics_tfidf_only.model")
    
    lda_model_tfidf = models.LdaModel(corpus_tfidf, num_topics=5, id2word=id2word)#models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=id2word, passes=2, workers=4) # better model
    print('\nPerplexity: ', lda_model_tfidf.log_perplexity(corpus))  # a measure of how good the model is. lower the better.
    
    for idx, topic in lda_model_tfidf.print_topics(-1):
        print('Topic: {} Word: {}'.format(idx, topic))
        
    lda_model_tfidf.save(path + "5_topics_test.model")
    lda_model_tfidf.wv.save(path + "5_topics_test_kv.model")
Example #30
File: parser.py Project: cboix/rdaneel
def buildDictionary(force=False):
    """ Build a dictionary in which each post corresponds to a document. """

    global globalDict

    if force or not isfile(dictName):
        postids = getPostids()
        numPosts = len(postids)

        count = 0
        for postid in postids:
            if count % 100 == 0:
                print "Added %d out of %d to dictionary: %s" % (count, numPosts, time.strftime("%H:%M:%S"))
            addPostToDict(postid)
            count += 1
    else:
        globalDict = Dictionary.load(dictName)

    # Filter out extremely common words
    globalDict.filter_extremes(no_below=2, no_above=0.5)
Example #31
    def __init__(self, analyzed_items_path=None, dictionary_path=None,
                 corpus_path=None, tfidf_model_path=None):
        if dictionary_path:
            self.dictionary = Dictionary.load(dictionary_path)
        else:
            self.dictionary = None

        if analyzed_items_path:
            self.analyzed_items_path = analyzed_items_path
        else:
            self.analyzed_items_path = None

        if corpus_path:
            self.corpus = MmCorpus(corpus_path)
        else:
            self.corpus = None

        if tfidf_model_path:
            self.tfidf_model = TfidfModel.load(tfidf_model_path)
        else:
            self.tfidf_model = None
Example #32
    def build_lda_model(self, topics: int=20):
        ignore_words = [
            'like', 'know', 'f**k', 'f*****g', 'want', 'shit', 'know', 'sure',
            'isn', 'CHANBOARD', 'think', 'people', 'good', 'time', 'going',
            'WEBLINK', 'got', 'way', ''
        ]
        filename = op.join(self.input_dir, f'{self.board}.dictionary')
        dictionary: Dictionary = Dictionary.load(filename)
        documents = ReadThreads(
            self.board, input_dir=self.input_dir, file_type='phrases',
            return_func=lambda x, y: dictionary.doc2bow(
                [w for w in y.split() if w not in ignore_words]
            )
        )

        lda = LdaMulticore(
            documents, id2word=dictionary, num_topics=topics, iterations=2)

        filename = op.join(self.input_dir, f'{self.board}.lda')
        lda.save(filename)

        return lda
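Once the dictionary and LDA model above are saved, inferring topics for a new document is a doc2bow plus get_document_topics call. A sketch with hypothetical paths and text:

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore

dictionary = Dictionary.load('input/board.dictionary')
lda = LdaMulticore.load('input/board.lda')
bow = dictionary.doc2bow('some new thread text'.split())
print(lda.get_document_topics(bow))  # [(topic_id, probability), ...]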
Example #33
def matcher_attribute_descriptions(path, text1, text2):
    # using gensim lda

    temp_file = datapath(path + 'lda_model')
    lda = LdaModel.load(temp_file)

    dictionary = Dictionary.load(path + 'dict')
    # common_dictionary = Dictionary(common_texts)
    # print(lda.print_topics(5))

    text1 = rm_special_chars(text1)
    text2 = rm_special_chars(text2)

    text1 = text1.split()
    text2 = text2.split()

    corpus = [text1, text2]
    # print(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    # print(corpus)

    vector1 = lda[corpus[0]]
    vector2 = lda[corpus[1]]

    from pprint import pprint
    # pprint(vector1)
    # pprint(vector2)
    vector1 = sorted(vector1, key=lambda x: x[1], reverse=True)
    vector2 = sorted(vector2, key=lambda x: x[1], reverse=True)
    print(vector1)
    print(vector2)

    topics1 = [(dictionary[tup[0]], tup[1]) for tup in vector1]
    topics2 = [(dictionary[tup[0]], tup[1]) for tup in vector2]
    print(topics1)
    print(topics2)

    return
Example #34
    def _create_dictionary(self, mongo_client):
        """
        Creates the gensim Dictionary (gensim.corpora.dictionary.Dictionary) or loads it if it already exists and sets
        the object's dictionary property.
        :param mongo_client: server.db.MongoClientContext
        """
        from gensim.corpora.dictionary import Dictionary

        if self._resource_exists(self.dictionary_file):
            self.logger().debug(
                    "Dictionary file found, loading it [%s]" % self._create_resource_path(self.dictionary_file))
            self._dictionary = Dictionary.load(self._create_resource_path(self.dictionary_file))
        else:
            self.logger().debug("Dictionary file not found, creating a new Dictionary file")
            self._dictionary = Dictionary()

        documents = []
        for doc in [di for d in mongo_client.scrappers_collections() for di in d.find()]:
            documents.append(self.tokenize_sentence(doc[self.considerable_doc_property]))

        self.logger().debug("Adding %d documents to dictionary (will skip existing ones)" % len(documents))
        self._dictionary.add_documents(documents)
        self._dictionary.save(self._create_resource_path(self.dictionary_file))
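add_documents, as used above, extends an existing Dictionary in place: unseen tokens get new ids while the ids of known tokens stay stable (only their counts are updated), which is what makes the load-then-update flow above safe to re-run. A toy illustration:

from gensim.corpora.dictionary import Dictionary

d = Dictionary([['old', 'tokens']])
before = len(d)
d.add_documents([['old', 'tokens'], ['brand', 'new', 'tokens']])
print(before, '->', len(d))  # only 'brand' and 'new' add entries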
Example #35
def questions_to_keywords(questions, per_question):
    with open('corpus.json') as bowfile:
        bow = json.load(bowfile)
        dictionary = Dictionary.load('dictionary.dict')
        if per_question:
            keywords_per_question = []
            for question in questions:
                question = question.lower()
                question = re.sub("'", ' ',
                                  question).replace('_',
                                                    ' ').replace(' -', ' ')
                question = re.sub(r'[^A-Za-z^-]', ' ', question)
                question = re.sub(r'\s+', ' ', question)
                words = [
                    word for word in question.split()
                    if word not in stopwords.words('dutch')
                ]
                keywords_per_question.append(
                    tf_idf_keywords(words, bow, dictionary))
            return keywords_per_question
        else:
            for i in range(len(questions)):
                question = questions[i]
                question = question.lower()
                question = re.sub("'", ' ',
                                  question).replace('_',
                                                    ' ').replace(' -', ' ')
                question = re.sub(r'[^A-Za-z^-]', ' ', question)
                question = re.sub(r'\s+', ' ', question)
                words = [
                    word for word in question.split()
                    if word not in stopwords.words('dutch')
                ]
                questions[i] = words
            questions = [word for question in questions for word in question]
            return tf_idf_keywords(questions, bow, dictionary)
Example #36
    def __init__(self,
                 analyzed_items_path=None,
                 dictionary_path=None,
                 corpus_path=None,
                 tfidf_model_path=None):
        if dictionary_path:
            self.dictionary = Dictionary.load(dictionary_path)
        else:
            self.dictionary = None

        if analyzed_items_path:
            self.analyzed_items_path = analyzed_items_path
        else:
            self.analyzed_items_path = None

        if corpus_path:
            self.corpus = MmCorpus(corpus_path)
        else:
            self.corpus = None

        if tfidf_model_path:
            self.tfidf_model = TfidfModel.load(tfidf_model_path)
        else:
            self.tfidf_model = None
Example #37
 def update(self, name, n=500, method='FastICA'):
     settings = self._setstorage.load(encode_name(name))
     clusterer = Clusterer(settings)
     
     # load the models
     dictionary = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY]))
     ngram_size = len(dictionary[0])
     transformer = NgramTransformer(ngram_size)
     ldamodel = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL]))
     
     # get the input
     segments = self._segstorage.load(name=settings[SEGMENT_NAME], limit=int(n))
     documents = [s.value for s in segments]
     
     # prepare args
     kwargs = {'dictionary': dictionary,
               'ngramtransformer': transformer,
               'ldamodel': ldamodel,
               'method': method}
     Xt = clusterer.fit_transform(documents, **kwargs)
     labels = clusterer.assign_labels(documents)
     data = self._make_data(Xt, labels, documents)
     return json.dumps({'result': 'OK',
                        'data': data})
Example #38
#print title_corpus
#print title_corpus.dictionary

#description_corpus = Corpus_Column(fname, "FullDescription")
#print len(description_corpus)
#for word in description_corpus.get_texts():
    #a = 5
#joblib.dump(cnt, path_join(cache_dir, "counter_train_desc_nltk"), compress=3)

cnt = joblib.load(path_join(cache_dir, "counter_train_desc_nltk"))

for word, freq in cnt.most_common(10): #[:-100:-1]:
    print word, freq
#MmCorpus.serialize(path_join(cache_dir, "train_desc_nltk_corpus.pickle1"), description_corpus)
#description_corpus.dictionary.save(path_join(cache_dir, "train_desc_nltk_dic.pickle"))
dicti = Dictionary.load(path_join(cache_dir, "train_desc_nltk_dic.pickle"))
#dicti = description_corpus.dictionary
print dicti
#print description_corpus
#print description_corpus.dictionary
#print files.dictionary

#id2token = dicti.id2token
i = 0
for k, v in sorted(dicti.dfs.items(), key=operator.itemgetter(1), reverse=True):
    if i < 10:
        print dicti[k], v, "ID:", k
        i = i + 1
k = 0
print "printing token", k
print dicti[k], dicti.dfs[k], "ID:", k
Example #39
        fscore_np = np.asarray(fscore)

        mean_jaccard.append(np.mean(jacc_np))
        mean_bleu.append(np.mean(bleu_np))
        mean_cos.append(np.mean(cos_np))
        mean_fscore.append(np.mean(fscore_np))
    return np.max(np.asarray(mean_bleu)), np.max(
        np.asarray(mean_jaccard)), np.max(np.asarray(mean_cos)), np.max(
            np.asarray(mean_fscore))


GH_IDs, SO_IDs, GH_annotation_intersect, GH_annotation_union, SO_annotation_intersect, SO_annotation_union = load_annotations(
)
path = "/home/norberteke/PycharmProjects/Thesis/data/"

dictionary = Dictionary.load(path + 'GH_full_processed_Dictionary.dict')
corpus = MmCorpus(datapath(path + 'corpus_processed_GH_full.mm'))

texts = []
with open(path + 'GH_full_processed_corpus.csv', 'r') as f:
    reader = csv.reader(f)
    texts = list(reader)

terms = []
for (key, value) in dictionary.iteritems():
    terms.append(value)


def write_results_to_file(path, lda_model, max_bleu, max_jaccard, max_cos,
                          max_fscore):
    with open(path, 'a') as f:
Example #40
    def fit(self):
        self._lda = LdaModel(corpus=self._corpus, id2word=self._dictionary, num_topics=self._num_topics, distributed=True)
    
    def get(self):
        return self._lda

def usage():
    print 'usage: ldalearner.py [segment_name] [dictionary_name] [resulting_model_name]'
    sys.exit(0)


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    args = sys.argv[1:]
    
    if len(args) != 3:
        usage()
    
    segment_name = unicode(args[0])
    dict_path = os.path.join(DICTIONARY_PATH, args[1])
    dictionary = Dictionary.load(dict_path)
    
    model_path = os.path.join(LDA_PATH, args[2])
    
    corpus = SegmentCorpus(segment_name, dictionary, MongoSegmentStorage())
    learner = LdaLearner(corpus, dictionary)
    learner.fit()
    
    learner.get().save(model_path)

Example #41
from __future__ import division
from collections import Counter, defaultdict
from gensim.corpora.dictionary import Dictionary
from lib.iterators import row_stream
from itertools import izip

import networkx as nx
from itertools import combinations

common, usefulness = defaultdict(int), defaultdict(int)
total = Dictionary.load("../working/titledict.pickle")

num_eng = 4
for eid in xrange(num_eng):
    for row in row_stream("../data/pruned_Train_%d.csv" % eid):
        ID, title, body, tags = row
        title_tokens = title.split()
        tags = set(tags.split())
        for token in title_tokens:
            if token in tags:
                common[token] += 1

for (hash_id, count) in total.dfs.iteritems():
    token = total[hash_id]
    usefulness[token] = common[token] / count
''' Tag==>Tag recommender '''
G = nx.Graph()

num_eng = 4
for eid in xrange(num_eng):
    for row in row_stream("../data/pruned_Train_%d.csv" % eid):
Example #42
    def load(self):
        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)
Example #43
 def load(self):
     self._lda = LdaModel.load(self._model_file)
     self._dictionary = Dictionary.load(self._dict_file)
Example #44
from __future__ import division
from collections import Counter, defaultdict
from gensim.corpora.dictionary import Dictionary
from lib.iterators import row_stream
from itertools import izip

import networkx as nx
from itertools import combinations

common, usefulness = defaultdict(int), defaultdict(int)
total = Dictionary.load("../working/titledict.pickle")

num_eng = 4
for eid in xrange(num_eng):
    for row in row_stream("../data/pruned_Train_%d.csv" % eid):
        ID, title, body, tags = row
        title_tokens = title.split()
        tags = set(tags.split())
        for token in title_tokens:
            if token in tags:
                common[token] += 1
            
for (hash_id, count) in total.dfs.iteritems():
    token = total[hash_id]
    usefulness[token] = common[token] / count


''' Tag==>Tag recommender '''
G = nx.Graph()

num_eng = 4
Example #45
            break
    return segments

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Cluster segments')
    parser.add_argument('clustermodel', type=unicode, help='The clusterer model to use.')
    
    args = parser.parse_args()

    setstorage = MongoSettingsStorage()
    docstorage = MongoDocumentStorage()
    segstorage = MongoSegmentStorage()
    
    logger.info('Loading clusterer model')
    settings = setstorage.load(encode_name(args.clustermodel))
    dictionary = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY]))
    ngram_size = len(dictionary[0])
    transformer = NgramTransformer(ngram_size)
    ldamodel = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL]))
    logger.info('Clusterer model loaded!')
    
    kwargs = {'dictionary': dictionary,
                  'ngramtransformer': transformer,
                  'ldamodel': ldamodel,
                  'method': 'LDA'}
    
    
    logger.info('Fitting clusterer')
    clusterer = Clusterer(settings)
    texts, labels = clusterer.get_training_data()
    clusterer.fit(texts, labels, **kwargs)
Example #46
def scorer(model, dic):
    tfidf = TfidfModel.load(model)
    dictionary = Dictionary.load(dic)
    def score(words):
        return tfidf[dictionary.doc2bow(words)]
    return score
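A usage sketch for the closure above (the model and dictionary paths are hypothetical); the result is a sparse list of (token id, tf-idf weight) pairs for the given word list:

score = scorer('tfidf.model', 'corpus.dict')
weights = score(['gensim', 'dictionary', 'gensim'])
print(weights)  # list of (token_id, tf-idf weight) tuples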
Example #47
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    model_path = path.join(base_path,
                           p['result_path'],
                           p['model_label'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # train the model on the small marketing corpus
    preprocess = []

    if 'stoplist' in p.as_dict():
        stoplist = open(path.join(base_path, p['stoplist'])).readlines()
        stoplist = [unicode(s.strip(), encoding='utf-8').lower() for s in stoplist]
        def remove_stopwords(sentence):
            return [word for word in sentence if not word in stoplist]
        preprocess.append(remove_stopwords)

    if 'stemmer' in p.as_dict():
        stemmer = Stemmer.Stemmer(p['stemmer'])
        preprocess.append(stemmer.stemWords)

    if not p['model_label']:
        cor = TextFilesCorpus(path.join(base_path, p['corpus_path']),
                              no_below=p['no_below'],
                              no_above=p['no_above'],
                              preprocess=preprocess)
        dictionary = cor.dictionary

        pre = LogEntropyModel(cor, id2word=dictionary, normalize=True)
        lsi = LsiModel(pre[cor], id2word=dictionary, num_topics=p['num_topics'])
    else:
        dictionary = Dictionary.load(path.join(model_path, p['dict_name']))
        pre = SaveLoad.load(path.join(model_path, 'pre.model'))
        lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
        lsi.num_topics = p['num_topics']

    test_cor_path = path.join(base_path, p['test_cor_path'])
    test_answers, gold_answers, ratings = [], [], []


    flist = glob.glob(path.join(test_cor_path, 'corpus_3', '*.txt'))
    for file in flist:
        match = re.search('data3_(\d)_\d+.txt', file)
        ratings.append(int(match.group(1)))
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            test_answers.append(corpus)
    flist = glob.glob(path.join(test_cor_path, 'corpus_3_golden', '*.txt'))
    for file in flist:
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            gold_answers.append(corpus)


    sim = MatrixSimilarity(test_answers)[gold_answers]
    mean_sim = np.mean(sim, axis=0)
    print 'pearsons corrcoef: %f' % np.corrcoef(ratings, mean_sim)[0,1]
    print 'spearmans r: %f with p: %f' % stats.spearmanr(ratings, mean_sim)
Example #48
File: lda.py Project: msushkov/cs224w-wiki
def get_dictionary():
    return Dictionary.load(DICTIONARY_FILE)
Example #49
out stopwords without an explicit list.

@author: dedan
'''

from __future__ import division
from gensim.corpora.dictionary import Dictionary
import pylab as plt
import numpy as np

min_freq = 1000
n_words  = 200

stoplist = open('/Users/dedan/projects/mpi/data/stoplists/german_stoplist.txt').readlines()
stoplist = [unicode(s.strip(), encoding='utf-8').lower() for s in stoplist]
dic = Dictionary.load('/Users/dedan/projects/mpi/data/results/20110628-170809/dic.dict')


# word frequency distribution of the dictionary
freqs = np.array(dic.dfs.values())
freqs = freqs[freqs > min_freq]
plt.figure()
plt.subplot(3,1,1)
plt.hist(freqs, bins=100)
plt.title('distribution of word frequencies with frequency > %s' % min_freq)

# most frequent words in the dictionary
freqs   = np.array([dic.dfs[dic.token2id[key]] for key in dic.token2id.keys()])
words   = dic.token2id.keys()
idx     = np.argsort(freqs)
freqs   = freqs[idx[-n_words:]]