Example #1
def main(argv):
	token_list = make_token_list()
	dictionary = Dictionary(token_list)
	corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in token_list]
	dictionary.save('dictionary.dict')
	with open('corpus.json', 'w') as out:
		json.dump(corpus, out)
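A minimal companion sketch (not part of the original example) showing how the two files saved above could be loaded back, assuming the same file names:

import json
from gensim.corpora.dictionary import Dictionary

# Reload the token <-> id mapping and the bag-of-words corpus written by main().
dictionary = Dictionary.load('dictionary.dict')
with open('corpus.json') as inp:
    # json stores the (token_id, count) tuples as lists; convert them back to tuples.
    corpus = [[tuple(pair) for pair in doc] for doc in json.load(inp)]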
Example #2
def main():
    parser = ArgumentParser()
    parser.add_argument('-d', '--wiki-dump')
    parser.add_argument('-l', '--limit', default=None, type=int)
    parser.add_argument('-p', '--num-procs', default=1, type=int)
    parser.add_argument('-o', '--out', default='vocab')
    opts = parser.parse_args()

    dump_loc = opts.wiki_dump
    limit = opts.limit
    n_procs = opts.num_procs
    out_fn = opts.out

    dump_gen = get_dump_gen(dump_loc, limit=limit, n_procs=n_procs)

    nlp = spacy.en.English()
    vocab = Dictionary(([
        token.text.lower().strip() for token in doc if token.text.strip() != ""
    ] for doc in nlp.pipe((art['article.text'] for art in dump_gen),
                          n_threads=n_procs,
                          parse=False,
                          tag=False,
                          entity=False)))

    vocab.save('%s.vocab' % out_fn)
    vocab.save_as_text('%s.txt' % out_fn)
Example #3
class LdaMalletHandler:
    def __init__(self, mallet_path):
        self.mallet_path = mallet_path

    def run_model(self, model_name, corpus, **kwargs):
        self.model_name = model_name
        self.dictionary = Dictionary(corpus)
        corpus_bow = [self.dictionary.doc2bow(text) for text in corpus]
        os.makedirs("ldamodels/"+model_name, exist_ok=True )
        self.model = LdaMallet(self.mallet_path, corpus_bow, id2word=self.dictionary, prefix="./ldamodels/"+model_name+"/", **kwargs)

    def save_model(self):
        self.model.save("ldamodels/"+self.model_name+"/model.model")
        self.dictionary.save("ldamodels/"+self.model_name+"/dict.dict")

    def load_model(self, model_name):
        self.model_name = model_name
        self.dictionary  = corpora.Dictionary.load("ldamodels/"+self.model_name+"/dict.dict")
        self.model = LdaMallet.load("ldamodels/"+self.model_name+"/model.model")
        self.model.mallet_path = self.mallet_path
    
    def doc_topics(self, doc_idx):
        if(not hasattr(self, 'doc_retriever')):
            self.doc_retriever =  DocumentRetriever(self.model.fdoctopics())
        return self.doc_retriever.doc_topics(doc_idx)    
    
    def ext_doc_topics(self, ext_doc):
        doc_bow = self.dictionary.doc2bow(ext_doc)
        doc_topics = self.model[doc_bow]
        doc_topics.sort(key=lambda x: x[1], reverse=True)
        return doc_topics

    def ext_doc_n_most_similar(self, ext_doc, n=5, metric='cosine'):
        if(not hasattr(self, 'doc_retriever')):
            self.doc_retriever =  DocumentRetriever(self.model.fdoctopics())
        doc_bow = self.dictionary.doc2bow(ext_doc)
        doc_topics = self.model[doc_bow]
        topics = []
        for topic in doc_topics:
            topics.append(topic[1])    
        most_similar = self.doc_retriever.n_most_similar(topics, n=n, metric=metric)    
        return most_similar

    def n_most_representative(self, topic, n=3):
         if(not hasattr(self, 'doc_retriever')):
            self.doc_retriever =  DocumentRetriever(self.model.fdoctopics())
         topics = np.zeros(self.model.num_topics)
         topics[topic]=1
         most_similar = self.doc_retriever.n_most_similar(topics, n=n)
         return most_similar
        
    def get_string_topics(self, num_topics=-1, num_words=10):
        if(num_topics==-1):
            num_topics = self.model.num_topics 
        string_topics = []
        for topic in self.model.print_topics(num_topics=num_topics, num_words=num_words):
            splitted = topic[1].split("\"")
            result = [splitted[2*i+1] for i in range(0,int(len(splitted)/2))]
            string_topics.append(" ".join(result))
        return string_topics    
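A hedged usage sketch for the handler above; the MALLET binary path, the model name and the tokenized `texts` are illustrative placeholders, not part of the original code:

# Assumes MALLET is installed at the given path and `texts` is a list of token lists.
handler = LdaMalletHandler(mallet_path='/opt/mallet/bin/mallet')  # hypothetical path
handler.run_model('demo_model', texts, num_topics=20)
handler.save_model()

# Reload later and print the topics as plain word strings.
handler = LdaMalletHandler(mallet_path='/opt/mallet/bin/mallet')
handler.load_model('demo_model')
print(handler.get_string_topics(num_words=5))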
Example #4
class Corpus(object):
    def __init__(self, path, dict_path):
        self.dictionary = Dictionary()
        add_to_dict = True
        if dict_path and os.path.exists(dict_path):
            print('loading dictionary')
            self.dictionary = self.dictionary.load(dict_path)
            add_to_dict = False
        self.train = self.tokenize(os.path.join(path, 'train.txt'),
                                   add_to_dict)
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'),
                                   add_to_dict)
        self.test = self.tokenize(os.path.join(path, 'test.txt'), add_to_dict)
        if dict_path and not os.path.exists(dict_path):
            self.dictionary.save(dict_path)

    def tokenize(self, path, add_to_dict):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        all_words = list(
            chain.from_iterable([
                sent.split() + ['<eos>']
                for sent in open(path).read().split('\n')
            ]))
        if add_to_dict:
            self.dictionary.add_documents([all_words])
        return torch.LongTensor(self.dictionary.doc2idx(all_words))
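A short usage sketch, assuming the snippet's missing imports (torch, gensim's Dictionary, itertools.chain) are in place and that ./data holds train.txt, valid.txt and test.txt:

# Builds (or reuses) the dictionary and returns each split as a 1-D LongTensor of token ids.
corpus = Corpus('./data', dict_path='./data/corpus.dict')
print(corpus.train.size(), corpus.valid.size(), corpus.test.size())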
Example #5
def create_dictionary(analyzed_items_path, dictionary_path=None):
    dictionary = Dictionary(iter_docs(analyzed_items_path))

    if dictionary_path:
        dictionary.save(dictionary_path)

    return dictionary
Example #6
File: lda.py Project: msushkov/cs224w-wiki
def build_dictionary():
    dictionary = Dictionary()
    for line in open(wiki_index.ARTICLES_FILE):
        dictionary.add_documents([line.lower().split()])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.save(DICTIONARY_FILE)
    return dictionary
Example #7
def create_dictionary(analyzed_items_path, dictionary_path=None):
    dictionary = Dictionary(iter_docs(analyzed_items_path))

    if dictionary_path:
        dictionary.save(dictionary_path)

    return dictionary
Example #8
def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None):
    ''' Builds all 3 dictionaries from splits. If provided, `save_pickle_tup` must
        be a 3-tuple of the picklefile names in the following order:
        
        (title, body, tags)
        
        If `save_pickle_tup[i]` is None, the corresponding dictionary will not be saved.
    '''
    utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary()
    for eid in xrange(n):
        for row in row_stream(splits_template % eid):
            ID, title, body, tags = row
            utitledict.doc2bow(title.split(), allow_update=True)
            ubodydict.doc2bow(body.split(), allow_update=True)
            utagdict.doc2bow(tags.split(), allow_update=True)
    
    assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs
    print "Before filtering..."
    print "utitledict:", utitledict
    print "ubodydict:", ubodydict
    print "utagdict:", utagdict
    
    if save_pickle_tup:
        assert len(save_pickle_tup) == 3
        if save_pickle_tup[0]:
            print "saving utitledict..."
            utitledict.save(save_pickle_tup[0])
        if save_pickle_tup[1]:
            print "saving ubodydict..."
            ubodydict.save(save_pickle_tup[1])
        if save_pickle_tup[2]:
            print "saving utagdict..."
            utagdict.save(save_pickle_tup[2])
            
    return (utitledict, ubodydict, utagdict)
Example #9
File: topics.py Project: no-name-coder/GFI
def lda_train(train_data, part, save_root):
    ids = list(train_data['id'])
    texts = list(train_data[part])

    with Pool() as pool:
        texts = list(
            tqdm.tqdm(pool.imap(tokenize, texts), total=len(texts), ncols=100))

    text_dictionary = Dictionary(texts)
    text_dictionary.save(os.path.join(save_root, 'dict'))

    with Pool(initializer=make_dictionary_global,
              initargs=(text_dictionary, )) as pool:
        texts = list(
            tqdm.tqdm(pool.imap(doc2bow_unit, texts),
                      total=len(texts),
                      ncols=100))

    lda_model = LdaMulticore(texts, workers=7)
    lda_model.save(os.path.join(save_root, 'model'))

    with Pool(initializer=make_model_global, initargs=(lda_model, )) as pool:
        rows = list(
            tqdm.tqdm(pool.imap(get_document_topics_unit, texts),
                      total=len(texts),
                      ncols=100))
    topics = pd.DataFrame(rows, columns=['topics', 'topic_num'])
    topics.insert(0, 'id', ids)
    topics.to_csv(os.path.join(save_root, 'train.csv'), index=False)

    return text_dictionary, lda_model
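The worker helpers referenced above (make_dictionary_global, doc2bow_unit, make_model_global, get_document_topics_unit) are not shown; a minimal sketch of what they might look like, assuming the usual global-initializer pattern for multiprocessing.Pool:

# Hypothetical worker-side helpers: the Pool initializer stores the shared object
# in a module-level global so the mapped function can reach it in each worker.
def make_dictionary_global(dictionary):
    global _dictionary
    _dictionary = dictionary

def doc2bow_unit(tokens):
    return _dictionary.doc2bow(tokens)

def make_model_global(model):
    global _model
    _model = model

def get_document_topics_unit(bow):
    # Return the full topic distribution plus the id of the most probable topic.
    topics = _model.get_document_topics(bow)
    topic_num = max(topics, key=lambda t: t[1])[0] if topics else -1
    return topics, topic_num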
Example #10
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\

    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
Example #11
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\

    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None  # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
Example #12
 def build_dictionary(self):
     documents = ReadThreads(
         self.board, input_dir=self.input_dir, file_type='phrases',
         return_func=lambda x, y: y.split())
     dictionary = Dictionary(documents)
     dictionary.save(f'{self.board}.dictionary')
     
     return dictionary
Example #13
def getDictionary(word_corpus, useSavedTill):
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        dictionary = Dictionary.load(file_lda_gensim_dictionary)
        return dictionary
    else:
        common_logger.info("Creating dictionary from corpus")
        dictionary = Dictionary(word_corpus.values())
        common_logger.info("saving dictionary")
        dictionary.save(file_lda_gensim_dictionary)
        return dictionary
Example #14
def build_dictionary_from_splits(splits_template, column, n, save_pickle=None):
    ''' Build dictionary from splits. If `save_pickle` is provided, then save. '''
    unfiltered_dict = Dictionary()
    for eid in xrange(n):
        unfiltered_dict.add_documents(csv_isolator("../../data/proc_Train_%d.csv" % eid, column))
    print "Before filtering,", unfiltered_dict
    if save_pickle:
        print "\nsaving..."
        unfiltered_dict.save(save_pickle)
    
    return unfiltered_dict
Example #15
File: text.py Project: hans/deepBLE
class TextCorpus(gensim.corpora.TextCorpus):
    """A corpus class which makes some minor extensions to the Gensim
    `TextCorpus` implementation:

    - Support loading of pre-built dictionary
    """

    def __init__(self, input=None, dictionary=None, dictionary_save_path=None,
                 pre_tokenized=False, lowercase=False):
        super(gensim.corpora.TextCorpus, self).__init__()

        self.input = input
        self.metadata = False

        self.pre_tokenized = pre_tokenized
        self.lowercase = lowercase

        if dictionary is None:
            self.dictionary = Dictionary()

            if input is not None:
                self.dictionary.add_documents(self.get_texts())
            else:
                logging.warning("No input document stream provided; "
                                "assuming dictionary will be "
                                "initialized in some other way.")
        else:
            self.dictionary = dictionary

        if dictionary_save_path is not None:
            self.dictionary.save(dictionary_save_path)

    def get_texts(self):
        length = 0

        # Input should have one document (sentence, for the word2vec case) per line
        for line in getstream(self.input):
            length += 1

            if self.pre_tokenized:
                if not isinstance(line, unicode):
                    line = unicode(line, encoding='utf8', errors='strict')
                yield line
            else:
                yield gensim.utils.tokenize(line, lowercase=self.lowercase)

        self.length = length
Example #16
 def init_dictionary(self, save=True):
     import gzip
     from collections import Counter
     corpus_file = self.params.get(
         'dictionary__corpus_file') or self.params.get(
             'corpus_file') or 'sentences.txt.gz'
     doc_id = 0
     num_pos = 0
     num_nnz = 0
     cfs = Counter()
     dfs = Counter()
     f = gzip.open(self.path + corpus_file, 'rt', encoding='utf8')
     f = tqdm(f, 'dictionary', self.sentences_cnt)
     unique = set()
     for line in f:
         line = line.strip()
         if not line:  # end of document
             dfs.update(unique)
             num_nnz += len(unique)
             #
             doc_id += 1
             unique = set()
             continue
         tokens = line.split(' ')
         cfs.update(tokens)
         num_pos += len(tokens)
         unique.update(tokens)
     f.close()
     #
     token2id = {t: i for i, (t, cnt) in enumerate(cfs.most_common())}
     dictionary = GensimDictionary()
     dictionary.num_pos = num_pos
     dictionary.num_nnz = num_nnz
     dictionary.num_docs = doc_id
     dictionary.token2id = token2id
     #dictionary.cfs = {i:cfs[t] for t,i in token2id.items()}
     #dictionary.dfs = {i:dfs[t] for t,i in token2id.items()}
     for t, i in token2id.items():
         dictionary.cfs[i] = cfs[t]
         dictionary.dfs[i] = dfs[t]
     #dictionary.patch_with_special_tokens({'<PAD>':0})
     if save:
         dictionary.save(self.path + 'dictionary.pkl')
     self.dictionary = dictionary
Example #17
class LDAembedding(InputEmbedding):
    def __init__(self, workdir="./embedding-models", name="lda-embedding"):
        """
        Creates a vocabulary when pretrain() is called
        :param workdir:
        :param name:
        """
        super(LDAembedding, self).__init__(workdir=workdir, name=name)
        self._normalizer = TweetNormalisation()

    def _load(self):
        modeldir = self._workdir.joinpath("ldamodel_{}".format(self._name))
        if not modeldir.exists():
            return False
        self._lda = LdaMulticore.load(str(modeldir))
        self._dictionary = Dictionary.load(
            str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))

    def pretrain(self, texts: typing.Iterable[typing.Text]):
        texts = [self._normalizer(text).split() for text in tqdm(texts)]
        self._dictionary = Dictionary(texts, prune_at=200000)
        corpus = [self._dictionary.doc2bow(text) for text in tqdm(texts)]
        self._lda = LdaMulticore(corpus=corpus,
                                 id2word=self._dictionary,
                                 workers=15,
                                 num_topics=50)

        self._dictionary.save(
            str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
        self._lda.save(
            str(self._workdir.joinpath("ldamodel_{}".format(self._name))))

    def get_train_data(self, texts: typing.Iterable[typing.Text]) -> np.array:
        to_array = lambda x: np.array([
            v
            for _, v in self._lda.get_document_topics(x, minimum_probability=0)
        ])
        return np.stack([
            to_array(self._dictionary.doc2bow(self._normalizer(text).split()))
            for text in texts
        ])
Example #18
def main():
    parser = ArgumentParser()
    parser.add_argument('-d', '--wiki-dump')
    parser.add_argument('-l', '--limit', default=None, type=int)
    parser.add_argument('-p', '--num-procs', default=1, type=int)
    parser.add_argument('-o', '--out', default='vocab')
    opts = parser.parse_args()

    dump_loc = opts.wiki_dump
    limit = opts.limit
    n_procs = opts.num_procs
    out_fn = opts.out

    dump_gen = get_dump_gen(dump_loc, limit=limit, n_procs=n_procs)

    nlp = spacy.en.English()
    vocab = Dictionary(([token.text.lower().strip() for token in doc if token.text.strip() != ""]
                        for doc in nlp.pipe((art['article.text'] for art in dump_gen), n_threads=n_procs,
                                            parse=False, tag=False, entity=False)))

    vocab.save('%s.vocab' % out_fn)
    vocab.save_as_text('%s.txt' % out_fn)
Example #19
                    help='File name to give the dictionary upon saving')

args = parser.parse_args()

input_path = args.input_path
output_name = args.output_name
CHUNK_SIZE = args.chunk_size

# Stream in documents from path
rdr = lmd.Reader(input_path)
gnr = rdr.stream_data(get_meta=True)

# Build a dictionary out of the validation documents
dictionary = Dictionary()
docs = rdr.stream_data(threaded=True)
doc_chunks = chunks(docs, size=CHUNK_SIZE)
# Progress in chunks
for chunk in doc_chunks:
    print("Adding ", CHUNK_SIZE, " docs")
    tokenized = [[
        tok.lower_ for tok in doc if not tok.is_stop and tok.is_alpha
    ] for doc in tokenizer.pipe(
        [item for item in chunk if language(item) == 'en'],
        batch_size=CHUNK_SIZE)]
    dictionary.add_documents(tokenized)

# Keep only 2**16 most frequent tokens
dictionary.filter_extremes(keep_n=2**16)
dictionary.compactify()
dictionary.save(output_name)
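The `chunks` helper used above is not defined in the snippet; one possible implementation (an assumption, not the original code) that yields lists of at most `size` items from a document stream:

from itertools import islice

def chunks(iterable, size):
    """Yield successive lists of up to `size` items from `iterable`."""
    it = iter(iterable)
    while True:
        batch = list(islice(it, size))
        if not batch:
            break
        yield batch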
Example #20
File: LDA.py Project: ZJUHSY/LDA
    def __init__(
            self,
            path='test.json',
            tf_idf=True,
            dic_below=15,
            dic_above=0.9,
            dic_keep=80000,
            new=False):  # tf_idf: whether or not use tf_idf method to produce
        if os.path.isfile(
                'dictionary.gensim'
        ) and not new:  # if new,corpus beside model should be loaded
            # load data
            inp = open(path, 'rb')
            self.data = pd.DataFrame(json.load(inp))
            inp.close()
            _inp = open('pro_docs.json', 'rb')
            self.processed_docs = json.load(_inp)
            _inp.close()

            self.dictionary = gensim.corpora.Dictionary.load(
                'dictionary.gensim')
            if tf_idf:
                self.corpus = pickle.load(open('corpus.pkl_tfidf', 'rb'))
            else:
                self.corpus = pickle.load(open('corpus.pkl', 'rb'))
            return
        else:
            # use jieba to produce word list
            jb = jb_cut(
                path)  # see jieba_prepary/ cut the document into words and
            processed_docs = jb.process(
            )  # get format lists of list: like [[......],[word1,word2,word3......],[.....]]
            self.processed_docs = processed_docs  # used for train

            outp = open("pro_docs.json", 'w',
                        encoding="utf-8")  # save list of lists
            outp.write(
                json.dumps(self.processed_docs, indent=4, ensure_ascii=False))
            outp.close()

            # use processed_docs to produce dictionary and corpus
            dictionary = Dictionary(
                processed_docs
            )  # use lists of lists to get overall model dictionary
            dictionary.filter_extremes(no_below=dic_below,
                                       no_above=dic_above,
                                       keep_n=dic_keep)  # filter dictionary
            self.dictionary = dictionary
            if not new:
                dictionary.save('dictionary.gensim')

            # format: lists of lists of tuples
            corpus = [dictionary.doc2bow(text) for text in processed_docs
                      ]  # get doc2bow for each corpus in the corpora

            if not new:
                pickle.dump(corpus, open('corpus.pkl', 'wb'))

            if tf_idf:
                tfidf_model = models.TfidfModel(corpus)
                corpus = tfidf_model[corpus]
                if not new:
                    pickle.dump(corpus, open('corpus.pkl_tfidf',
                                             'wb'))  # save corpus
            self.corpus = corpus
Example #21
def main(args):
    #set path variable
    data_path = os.path.join(args.data_dir, args.task)
    embed_path = f'{args.model_type}_{args.embed_type}_{args.corpora}_{args.embed_dim}d.kv'
    embed_path = os.path.join('./process/model/embed_model', embed_path)

    #load model for nlp pipeline & embed
    spacy.require_gpu()
    en_nlp = spacy.load(args.nlp_type)
    special_tokens = load_special_tokens(args)
    logger.info(f'loading pretrain embed model from {embed_path}')
    if os.path.isfile(embed_path):
        if args.model_type == 'orig':
            kv_model = keyedvectors.KeyedVectors.load(embed_path)
        elif args.model_type == 'ft':
            kv_model = fasttext.FastTextKeyedVectors.load(embed_path)
    else:
        raise FileNotFoundError('Embed file path incorrect!')

    # read data
    df = pd.read_csv(os.path.join(data_path, args.mode, 'data.csv'),
                     encoding=args.encode_format)
    df = df.drop_duplicates(subset=['description', 'title'], keep=False)

    logger.info(f'Read data from {os.path.join(data_path,args.mode)} success!')
    logger.info(f'df shape:{df.shape}')
    logger.info(f'Special token: {special_tokens} nums: {len(special_tokens)}')

    #build corpus using nlp pipeline
    logger.info('****Start to build vocab****')
    if args.select_context_name:
        logger.info(
            f'Start to build context vocab for {args.select_context_name}!')

        #text prepare
        context_data = combine_data(df, args.select_context_name).tolist()
        context_corpus = corpus_process(context_data, en_nlp)
        # build lda model vocab
        documents = [[w for sent in doc for w in sent]
                     for doc in context_corpus]
        context_dict = Dictionary(documents)
        context_dict.filter_extremes(1, 1, args.max_context_vocab)
        #add special token
        context_vocab = [
            w
            for w in set(context_dict.token2id).difference(set(special_tokens))
        ]

        if args.spec_first:
            context_vocab = special_tokens + context_vocab
        else:
            context_vocab += special_tokens
        context_dict.token2id.update(
            {w: w_id
             for w_id, w in enumerate(context_vocab)})

        logger.info(f'context vocab num : {len(context_vocab)}')
        logger.info(f'top 15 context vocab : {context_vocab[:15]}')
        save_vocab_file(context_vocab, data_path, args.context_vocab_file)
        context_dict.save(args.lda_vocab_file)

        logger.info('Build pretrain embed model..')
        #build embed model
        logger.info('Create item_vocab pretrain embed!')
        create_embeds(context_dict.token2id, kv_model, data_path, args)

    if args.select_item_name:

        logger.info(
            f'Start to build string vocab for {args.select_item_name}!')

        #data prepare
        item_data = df[args.select_item_name].values.tolist()
        item_corpus = list(map(text_clean, item_data))  #clean text

        #build title vocab
        item_corpus = [[
            w.lower() for w in doc.split()
            if (w not in string.punctuation) and w.isalpha()
        ] for doc in item_corpus]
        item_vocab, item_token2id = build_vocab(item_corpus, special_tokens,
                                                args.max_item_size,
                                                args.min_item_freq,
                                                args.spec_first)

        logger.info(f'item vocab size : {len(item_vocab)}')
        logger.info(f'top 15 item vocab : {item_vocab[:15]}')
        #save item vocab
        save_vocab_file(item_vocab, data_path, args.str_vocab_file)

        logger.info('Start to create item_vocab embed')
        #create item embed vector
        create_embeds(item_token2id, kv_model, data_path, args)
Example #22
        for word in sentence:
            for c in word:
                char_set.append(c)

    with open('char_set.pkl', 'wb') as f:
        pickle.dump(set(char_set), f)

else:
    with open('char_set.pkl', 'rb') as f:
        char_set = pickle.load(f)

# In[329]:

if 1 == 0:
    vocabulary = Dictionary(x_tokenized)
    vocabulary.save('voca')
else:
    vocabulary = Dictionary.load('voca')

word_to_idx = {vocabulary[idx]: idx for idx in range(len(vocabulary))}
idx_to_word = {idx: vocabulary[idx] for idx in range(len(vocabulary))}

char_to_idx = {c: idx for idx, c in enumerate(char_set)}
idx_to_char = {idx: c for idx, c in enumerate(char_set)}


def hf(word):
    return word_to_idx[word]


# In[83]:
Example #23
File: snippet.py Project: szabo92/gistable
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey"
]

# gensim
texts = [[word for word in document.lower().split()] for document in documents]
dictionary = Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')
print(dictionary)
print(dictionary.token2id)
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)
MmCorpus.serialize('/tmp/corpus.mm', corpus)
corpus = MmCorpus('/tmp/corpus.mm')
print(corpus)
print(list(corpus))

# sklearn
vec = CountVectorizer(min_df=1,
                      stop_words=None,
Example #24
    cleanDatafile = arguments['--in']
    dictfile = arguments['--out-dict']
    ldafile = arguments['--out-model']
    ldavectorfile = arguments['--out-topics']

    print('Loading data...')
    rawfile = open(cleanDatafile, 'rb').read()
    encodeInfo = chardet.detect(rawfile[:50000])

    sentences = SentenceIterator(cleanDatafile, encoding=encodeInfo['encoding'],
                            row2record=lambda row, index: row[1].split())

    # Create a corpus from a list of texts
    print('Creating dictionary...')
    cases_dict = Dictionary(sentences)
    cases_dict.save(dictfile)

    # Train the model on the corpus.
    print('Training LDA topic model...')
    cases_corpus = SentenceIterator(cleanDatafile, encoding=encodeInfo['encoding'],
                            row2record=lambda row, index: cases_dict.doc2bow(row[1].split()))

    lda = LdaModel(cases_corpus, num_topics=N_TOPICS)
    lda.save(ldafile)

    print('Building LDA topics...')
    idx_documents = SentenceIterator(cleanDatafile, encoding=encodeInfo['encoding'],
                            row2record=lambda row, index: (row[0], row[1]))

    with open(ldavectorfile, 'w') as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_NONNUMERIC)
Example #25
def train_model(corpus_path, dic_conf, lda_conf):
    logging.info('Loading corpus from file {}'.format(corpus_path))
    corpus = FastTextCorpus(corpus_path, bufsize=20000000, length=5926250)
    # corpus = LineSentence(corpus_path, 10000000)
    print '-' * 80
    if lda_conf["build_dict"]:
        logging.info("Building dictionary ...")
        dic = Dictionary(corpus)
        dic.filter_extremes(no_below=dic_conf["min_tf"],
                            no_above=dic_conf["max_df"],
                            keep_n=dic_conf["vocab_size"])
        dic.compactify()
        logging.info("Saving dictionary ...")
        dic.save(dic_conf["dic"])
    else:
        logging.info("Loading dictionary ..")
        dic = Dictionary.load(dic_conf["dic"])

    bow = IntCorpus(corpus, dic)
    l = len(bow)
    print l

    tfMod = TfidfModel.load(lda_conf["tfmod"])
    #save corpus to disk for later usage
    # logging.info("Saving corpus to disk ...")
    # MmCorpus.serialize("data/corpus.mm", bow)
    # bow = MmCorpus("data/large_corpus.mm")

    print '-' * 80
    if lda_conf["new"]:
        logging.info("Training new lda model")
        logging.info("Loading defined keywords ...")
        keywords = {}
        topics = []
        with codecs.open(lda_conf["kw_file"], "r", "utf-8") as f:
            for l in f:
                sp = l.strip().split(':')
                topic = int(sp[0])
                topics.append(sp[1])
                kws = sp[2].split(',')
                for kw in kws:
                    if kw not in keywords:
                        keywords[kw] = set([topic])
                    else:
                        keywords[kw].add(topic)
                    #keywords[kw.lower()] = topic

        logging.info("Number of defined keywords: {}".format(len(keywords)))
        if lda_conf["threads"] <= 1:
            model = LdaModelNew(corpus=bow,
                                id2word=dic,
                                iterations=lda_conf["iterations"],
                                num_topics=lda_conf["num_topics"],
                                passes=lda_conf["passes"],
                                chunksize=lda_conf["chunksize"],
                                defined_kws=keywords,
                                alpha='auto',
                                eval_every=lda_conf["eval_every"])
        else:
            logging.info("Training model using mutlicore lda version")
            model = LdaMulticoreNew(corpus=bow,
                                    id2word=dic,
                                    workers=lda_conf["threads"],
                                    iterations=lda_conf["iterations"],
                                    num_topics=lda_conf["num_topics"],
                                    passes=lda_conf["passes"],
                                    defined_kws=keywords,
                                    alpha='symmetric',
                                    chunksize=lda_conf["chunksize"],
                                    eval_every=lda_conf["eval_every"],
                                    tfMod=tfMod,
                                    topic_names=topics)

    else:
        logging.info("Training ldamodel implemented in gensim")
        model = LdaModelOld(corpus=bow,
                            id2word=dic,
                            iterations=lda_conf["iterations"],
                            num_topics=lda_conf["num_topics"],
                            passes=lda_conf["passes"],
                            chunksize=lda_conf["chunksize"],
                            alpha='auto',
                            eval_every=lda_conf["eval_every"])

    logging.info('Saving lda model to {}'.format(lda_conf["model_path"]))
    model.save(lda_conf["model_path"])
    logging.info('Saving model done!')
Example #26
        logging.info('no calculated files found, recomputing...')
        logging.info('loading files...')
        with open(data_dark_file, 'r') as f1, open(data_clean_file, 'r') as f2:
            logging.info('loading dark text...')
            dark_text = [line.split() for line in f1.readlines()]
            logging.info('loading clean text...')
            clean_text = [line.split() for line in f2.readlines()]
            logging.info('load file done')

        if os.path.exists(dict_file):
            dictionary = Dictionary.load(dict_file)
        else:
            logging.info('creating the dictionary...')
            dictionary = Dictionary(dark_text)
            dictionary.add_documents(clean_text)
            dictionary.save(dict_file)

        dictionary = filter_dict(args.vocab_size, dictionary,
                                 get_keep_tokens(dictionary))
        logging.info('dictionary created')

        logging.info('building neighbor unigrams...')

        if os.path.exists(file_unigram_dark) and os.path.exists(
                file_unigram_dark_all):
            unigram_dark = np.load(file_unigram_dark)
            unigram_dark_all = np.load(file_unigram_dark_all)
        else:
            unigram_dark, unigram_dark_all = get_neighbor_unigram(
                dictionary, dark_text, args.num_neighbors)
            np.save(file_unigram_dark, unigram_dark)
Example #27
class my_LDA(object):
    def __init__(self):
        self.low_corpus = list()
        self.bow_corpus = list()
        self.dictionary = None

    @classmethod
    def tokenize_text(cls, text):
        text = unicodedata.normalize("NFKD", text).replace("\n", " ").replace(
            "\t", "").replace("  ", " ")
        # tokenize text as English
        text_tokenized = word_tokenize(text, language='English')
        # convert to lowercase and remove punctuation
        text_tokenized = [
            word.lower() for word in text_tokenized if word.isalpha()
        ]
        # lemmatize token
        text_tokenized = [
            wordlemmatizer.lemmatize(word) for word in text_tokenized
            if word not in stopwords.words('english')
        ]
        return text_tokenized

    def callback(self, text_tokenized):
        if len(text_tokenized) > 10:
            self.low_corpus.append(text_tokenized)

    def text_to_low(self, texts):
        """
        convert texts to list of words, concurrency-enabled
        args:
            texts: list of strings
        """
        logging.info('running text_to_low')
        pool = multiprocessing.Pool(os.cpu_count())
        for text in texts:
            pool.apply_async(self.tokenize_text,
                             args=(text, ),
                             callback=self.callback)
        pool.close()
        pool.join()

    def low_to_bow(self):
        """
        list of words to bag of words
        """
        logging.info('running low_to_bow')
        if not self.low_corpus:
            raise ValueError("Run text_to_low First")
        self.dictionary = Dictionary(self.low_corpus)
        self.dictionary.filter_extremes(no_below=15, no_above=0.9)
        self.bow_corpus = [
            self.dictionary.doc2bow(doc) for doc in self.low_corpus
        ]

    def run_lda(self):
        logging.info('running run_lda')
        if not self.bow_corpus or self.dictionary is None:
            raise ValueError("Run low_to_bow First")
        lda_model = models.LdaMulticore(self.bow_corpus,
                                        alpha=0.001,
                                        num_topics=10,
                                        id2word=self.dictionary,
                                        workers=os.cpu_count())
        lda_model.save("data/LDA/my_LDA/topic_model.model")
        self.dictionary.save("data/LDA/my_LDA/dictionary.dict")
        corpora.MmCorpus.serialize("data/LDA/my_LDA/corpus.mm",
                                   self.bow_corpus)
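A hedged driver sketch for the class above; `raw_texts` and the pre-created data/LDA/my_LDA/ output directory are assumptions, not part of the original code:

# Assumes `raw_texts` is a list of document strings and data/LDA/my_LDA/ exists.
lda = my_LDA()
lda.text_to_low(raw_texts)   # tokenize and lemmatize in parallel
lda.low_to_bow()             # build dictionary + bag-of-words corpus
lda.run_lda()                # train and persist model, dictionary and corpus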
Example #28
class text_corpus(object):
    def __init__(self, tsv_path, n_examples=100000):
        print("Getting %s iterator..." % tsv_path)
        self.n_examples = n_examples
        self.document_path = tsv_path
        self.fin = open(self.document_path, 'rb')
        self.instances = sum(1 for line in open(tsv_path))
        self.bigram = Phraser(Phrases())
        self.trigram = Phraser(Phrases())

    def __iter__(self):
        for i, doc in self.indexed_docs(self.n_examples):
            yield TaggedDocument(self.process(doc), [i])

    def process(self, text):
        return self.trigram[self.bigram[tokenize(text)]]

    def docs(self, n_examples=None):
        if n_examples == None:
            n_examples = self.n_examples
        for _, doc in self.indexed_docs(n_examples):
            yield self.process(doc)

    def reset_docs(self):
        self.fin.close()
        self.fin = open(self.document_path, 'rb')

    def indexed_docs(self, n_examples=-1):
        if n_examples == -1:
            with open(self.document_path, 'rb') as fin:
                for line in fin:
                    try:
                        i, doc = line.decode(
                            'utf-8', errors='replace').strip().split('\t')
                        yield i, doc
                    except:
                        pass
        else:
            current_example = 0
            for line in self.fin:
                if (current_example < n_examples):
                    try:
                        i, doc = line.decode(
                            'utf-8', errors='replace').strip().split('\t')
                        current_example += 1
                        yield i, doc
                    except:
                        pass
                else:
                    return  # PEP 479: raising StopIteration inside a generator is an error in Python 3.7+

    def get_phraser(self, directory, sensitivity=3):

        if not os.path.isdir(directory):
            os.makedirs(directory)

        print("\t\tGetting bigram detector...")
        if not os.path.isfile(directory + '/bigrams.pkl'):
            self.bigram = Phraser(
                Phrases(self.docs(n_examples=-1),
                        min_count=2,
                        threshold=sensitivity,
                        max_vocab_size=2000000))
            self.bigram.save(directory + '/bigrams.pkl')
        else:
            self.bigram = Phraser.load(directory + '/bigrams.pkl')

        print("\t\tGetting trigram detector...")
        if not os.path.isfile(directory + '/trigrams.pkl'):
            self.trigram = Phraser(
                Phrases(self.bigram[self.docs(n_examples=-1)],
                        min_count=2,
                        threshold=sensitivity + 1,
                        max_vocab_size=2000000))
            self.trigram.save(directory + '/trigrams.pkl')
        else:
            self.trigram = Phraser.load(directory + '/trigrams.pkl')

    def load_phraser(self, directory):
        print("\tLoading gram detector...")
        self.bigram = Phraser.load(directory + '/bigrams.pkl')
        self.trigram = Phraser.load(directory + '/trigrams.pkl')

    def get_dictionary(self, directory, keep=100000):
        if not os.path.isdir(directory):
            os.makedirs(directory)
        if not os.path.isfile(directory + '/dictionary.dict'):
            print("\tBuilding dictionary...")
            self.dictionary = Dictionary(self.docs(n_examples=-1),
                                         prune_at=2000000)
            print("\tFiltering dictionary extremes...")
            self.dictionary.filter_extremes(no_below=3,
                                            no_above=0.5,
                                            keep_n=keep)
            print("\tSaving dictionary...")
            self.dictionary.save(directory + '/dictionary.dict')
            self.dictionary.save_as_text(directory + '/word_list.tsv')
        else:
            self.load_dictionary(directory)

    def get_word_ids(self):
        word_list = set()
        for doc in self.docs(n_examples=-1):
            word_list.update(doc)
        return dict(zip(range(len(word_list)), word_list))

    def load_dictionary(self, directory):
        print("\tLoading dictionary...")
        self.dictionary = Dictionary.load(directory + '/dictionary.dict')
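A rough usage sketch for the corpus class above; the TSV path and the model directory are hypothetical:

# Assumes docs.tsv has one "<id>\t<text>" record per line.
corpus = text_corpus('docs.tsv', n_examples=50000)
corpus.get_phraser('models')      # fit (or load) bigram/trigram detectors
corpus.get_dictionary('models')   # build (or load) the filtered Dictionary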
Example #29
class Similarities(object):
    """
    Class for text similarities stuff
    """

    def __init__(self, mongo_conn_rec, stopwords=None):
        self._stopwords = set(stopwords) if stopwords is not None else set()
        self._mongo_connection_record = mongo_conn_rec
        self._lsi_mapping = dict()
        self._sim_index = None
        self._dictionary = None
        self._lsimodel = None

        self._run_transformers()

    @staticmethod
    def logger():
        """
        Scrapper's specific logger instance. Use this to log inside scrappers.
        :return: Returns a logging.Logger('openews.scrappers') instance.
        """
        return logging.getLogger('openews.language')

    @property
    def considerable_doc_property(self):
        """
        The document property to use for training. This is the actual data we take from the MongoDB documents to
        parse and train.
        :return: str
        """
        return 'title'

    @property
    def dictionary_file(self):
        """
        The filename to use when serializing gensim.corpora.dictionary.Dictionary to disk.
        :return: str
        """
        return "openews.processors.dict"

    @property
    def dictionary(self):
        """
        The used Dictionary.
        :return: gensim.corpora.dictionary.Dictionary
        """
        return self._dictionary

    @property
    def lsi_model(self):
        """
        The used LSI model.
        :return: gensim.models.lsimodel.LsiModel
        """
        return self._lsimodel

    @property
    def similarity_index(self):
        """
        The similarity index instance
        :return: gensim.similarities.docsim.MatrixSimilarity
        """
        return self._sim_index

    @property
    def similarity_threshold(self):
        """
        The similarity threshold.
        Anything at or above this value is considered a similar document.
        :return: float
        """
        return server_app.config['SIMILARITY_THRESHOLD']

    @property
    def lsi_index_mapping(self):
        """
        A mapping between the LSI model index (key) and a (collection the document belongs to, document) tuple
        :return: dict
        """
        return self._lsi_mapping

    @staticmethod
    def _create_resource_path(resource_file):
        """
        Creates an absolute path to resource_file based on the system's temp directory.
        :param resource_file: str
        :return: str
        """
        return os.path.join(tempfile.gettempdir(), resource_file)

    def _resource_exists(self, resource_file):
        """
        Checks if resource_file exists in the given system's temp directory.
        :param resource_file: str
        :return: bool
        """
        return os.path.isfile(self._create_resource_path(resource_file))

    def _run_transformers(self):
        """
        Runs all the transformer methods listed providing the MongoDB client context instance.
        """
        with MongoClientContext(self._mongo_connection_record) as client:
            self._create_dictionary(client)
            self._create_lsi_similarity_index(client)

    def _create_dictionary(self, mongo_client):
        """
        Creates the gensim Dictionary (gensim.corpora.dictionary.Dictionary) or loads it if it already exists and sets
        the object's dictionary property.
        :param mongo_client: server.db.MongoClientContext
        """
        from gensim.corpora.dictionary import Dictionary

        if self._resource_exists(self.dictionary_file):
            self.logger().debug(
                    "Dictionary file found, loading it [%s]" % self._create_resource_path(self.dictionary_file))
            self._dictionary = Dictionary.load(self._create_resource_path(self.dictionary_file))
        else:
            self.logger().debug("Dictionary file not found, creating a new Dictionary file")
            self._dictionary = Dictionary()

        documents = []
        for doc in [di for d in mongo_client.scrappers_collections() for di in d.find()]:
            documents.append(self.tokenize_sentence(doc[self.considerable_doc_property]))

        self.logger().debug("Adding %d documents to dictionary (will skip existing ones)" % len(documents))
        self._dictionary.add_documents(documents)
        self._dictionary.save(self._create_resource_path(self.dictionary_file))

    def _create_lsi_similarity_index(self, mongo_client):
        """
        Creates a Similarity index based on LSI model from the available dictionary. Sets the object's lsi_model and
        similarity_index object properties.
        """
        from gensim.models import LsiModel
        from gensim.similarities import MatrixSimilarity

        self._lsi_mapping.clear()
        bow_corpus = []
        for idx, tp in enumerate([(c, di) for c in mongo_client.scrappers_collections() for di in c.find()]):
            self._lsi_mapping[idx] = tp
            bow_corpus.append(self.sentence_to_bow(tp[1][self.considerable_doc_property]))

        self._lsimodel = LsiModel(bow_corpus, id2word=self.dictionary)
        self._sim_index = MatrixSimilarity(self._lsimodel[bow_corpus])

    def calculate_similarities(self):
        """
        Find / calculate similarities between documents in the index.
        Returns a defaultdict whose keys are LSI indices and whose values are lists of
        (LSI model index, similarity score - numpy.float32) tuples.
        :return: defaultdict(list)
        """
        similarities = defaultdict(list)
        if not self.lsi_index_mapping:
            return

        for idx, tp in sorted(self.lsi_index_mapping.items(), key=itemgetter(0)):
            sentence = tp[1][self.considerable_doc_property]
            bow = self.sentence_to_bow(sentence)
            latent_space_vector = self.lsi_model[bow]
            sim_vector = self.similarity_index[latent_space_vector]
            sorted_mapped_vector = list(sorted(enumerate(sim_vector), key=itemgetter(1)))
            for sit in [v for v in sorted_mapped_vector if
                        v[0] != idx and v[1] >= self.similarity_threshold and tp[0].name !=
                                self.lsi_index_mapping[v[0]][0].name]:
                if sit[0] not in similarities:
                    similarities[idx].append(sit)

        for s in similarities.items():
            main_sentence = self.lsi_index_mapping[s[0]][1][self.considerable_doc_property]
            print("[%s] %s:" % (self.lsi_index_mapping[s[0]][0].name, main_sentence))
            for sm in s[1]:
                print("\t[%f][%s]: %s" % (sm[1], self._lsi_mapping[sm[0]][0].name,
                                          self.lsi_index_mapping[sm[0]][1][self.considerable_doc_property]))
        return similarities

    def store_similarities(self, update=False):
        """
        Stores the similarities to the database
        :param update: True to update existing, False to delete and add new items
        """
        with MongoClientContext(self._mongo_connection_record) as client:
            pass

    def tokenize_sentence(self, sentence):
        """
        Tokenize a sentence (see the 'tokenized_corpus_sentences' method for what tokenization means in this context).
        :param sentence: str
        :return: a list
        """
        excluded = set(chain(self._stopwords, string.punctuation))
        return [w.lower() for w in word_tokenize(sentence) if w.lower() not in excluded]

    def sentence_to_bow(self, sentence):
        """
        Transforms a string sentence to a VSM bag-of-words representation.
        :param sentence: str
        :return: list of tuples
        """
        return self.dictionary.doc2bow(self.tokenize_sentence(sentence))
Example #30
def train_LDA(base_path,
              table_paths,
              batch_size,
              limit,
              use_dictionary=False,
              **kwargs):

    model_name = dic2name(kwargs)
    print("Model: ", model_name)
    topic_num = kwargs['tn']

    # Pass 1 get the dictionary
    if use_dictionary == 'True':
        dic = Dictionary.load(
            join(LDA_CACHE, 'dictionary_{}'.format(model_name)))
    else:

        dic = Dictionary([])
        b = 0
        for corpus in corpus_iter(base_path, table_paths, batch_size, limit,
                                  **kwargs):
            dic.add_documents(corpus)
            print('Dictionary batch {}: current dic size {}'.format(
                b, len(dic)))
            b += 1

        # save dictionary
        dic.save(join(LDA_CACHE, 'dictionary_{}'.format(model_name)))

    print("Dictionary size", len(dic))

    # Pass 2 train LDA
    whole_corpus = corpus_iter(base_path, table_paths, batch_size, limit,
                               **kwargs)
    first_batch = next(whole_corpus)
    first_bow = [dic.doc2bow(text, allow_update=False) for text in first_batch]
    #print(first_bow)

    lda = LdaModel(first_bow,
                   id2word=dic,
                   num_topics=topic_num,
                   minimum_probability=0.0)
    batch_no = 0
    print('LDA update batch {}'.format(batch_no))

    for batch in whole_corpus:
        batch_bow = [dic.doc2bow(text, allow_update=False) for text in batch]
        #print(corpus_bow)
        lda.update(batch_bow)
        batch_no += 1
        print('LDA update batch {}'.format(batch_no))

    # Save model to disk.
    temp_file = join(LDA_CACHE, "model_{}".format(model_name))
    lda.save(temp_file)

    print(
        "Training from {} done. Batch_size: {}, long str tokenization threshold: {}, numerical representations: {}.\
          \nTotal size of dictionary: {}".format(table_paths, batch_size,
                                                 kwargs['thr'], kwargs['num'],
                                                 len(dic)))
    return
Example #31
class GensimBOW(BaseEstimator, TransformerMixin):
    """
    Custom sklearn transformer to convert tokenized,
    preprocessed data to bag-of-words representation.
    """
    def __init__(self, id2word_path=None, use_sparse_representation=False):
        """
        Parameters
        ----------
        id2word_path : str
            Path to location of gensim id2word dict.
            If specified, the model will load and use this object
            as its id2_word dict.
        use_sparse_representation: Boolean (default=False)
            When True, a sparse representation of the array is returned.
                Use this when feeding into a gensim model.
            When False, the full array is returned.
                Use this if feeding into sklearn estimator.
        """
        self.id2word = None
        self.use_sparse_representation = use_sparse_representation
        if id2word_path:
            self._load(id2word_path=id2word_path)

    def _load(self, id2word_path):
        """
        If self.id2word_path specified, loads gensim.id2word dict from path.

        Parameters
        ----------
        id2word_path: str
            File-path designating where self.id2word should be saved.
        """
        from gensim.corpora.dictionary import Dictionary
        if not os.path.exists(id2word_path):
            raise IOError(
                'The provided file path to id2word_path was not found.'
                'Please ensure that the argument is the correct path.')
        self.id2word = Dictionary().load(id2word_path)

    def save(self, id2word_path):
        """
        Saves self.id2word to id2word_path.
        If id2word does not exist, AttributeError is raised.

        Parameters
        ----------
        id2word_path: str
            File-path designating where self.id2word should be saved.
        """
        if not self.id2word:
            raise AttributeError('Nothing to save yet, please run .fit first.')
        self.id2word.save(id2word_path)

    def fit(self, documents, labels=None):
        """
        Creates a map between words and their integer ids,
        storing it as `self.id2word`.

        Parameters
        ----------
        documents: iterable
            List of documents; each document a list of preprocessed tokens.
        labels:
            Optional list of same size as documents, specifying label for each document.
        """
        from gensim.corpora.dictionary import Dictionary
        self.id2word = Dictionary(documents)
        return self  # sklearn transformers should return self from fit (fit_transform chains fit().transform())

    def transform(self, documents):
        """
        Converts a collection of words to its bag-of-words representation.

        Parameters
        ----------
        documents: iterable
            List of documents. Each document must be a list of tokens.

        Returns
        -------
            generator: yields vectorized representation of each document.
        """
        from gensim.matutils import sparse2full
        if self.id2word is None:
            raise AttributeError('Must have a fit id2word in order'
                                 ' to call transform.')

        def generator():
            """
            Closure to mutate return type depending on value of `use_sparse_representation`.
            """
            for document in documents:
                docbow = self.id2word.doc2bow(document)
                if self.use_sparse_representation:
                    yield docbow
                else:
                    yield sparse2full(docbow, len(self.id2word))

        return list(generator())
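A hedged sketch of how this transformer could slot into an sklearn pipeline; the LatentDirichletAllocation step and the toy documents are illustrative assumptions:

from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation

docs = [['human', 'computer', 'interface'], ['graph', 'minors', 'survey']]  # pre-tokenized toy docs
pipe = Pipeline([
    ('bow', GensimBOW(use_sparse_representation=False)),   # dense vectors for sklearn
    ('lda', LatentDirichletAllocation(n_components=2)),
])
doc_topics = pipe.fit_transform(docs)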
Example #32
def train_lda(path):
    # from gensim.test.utils import common_texts
    # from gensim.corpora.dictionary import Dictionary
    # from gensim.models.ldamodel import LdaModel
    # from gensim.test.utils import datapath

    # common_dictionary = Dictionary(common_texts)
    # common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

    # print(common_dictionary.get(80))

    # lda = LdaModel(common_corpus, num_topics=5)
    # temp_file = datapath(path)
    # lda.save(temp_file)
    # lda = LdaModel.load(temp_file)

    documents = [
        "Amazon sells many things ", "Apple is releasing a new product ",
        "Microsoft announces Nokia acquisition ",
        'Julie loves me more than Linda loves me ',
        'Jane likes me more than Julie loves me'
    ]

    documents = [rm_special_chars(s) for s in documents]
    stoplist = [
        'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there',
        'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they',
        'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into',
        'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who',
        'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below',
        'are', 'we', 'these', 'your', 'his', 'through', "don't", 'nor', 'me',
        'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our',
        'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she',
        'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and',
        'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then',
        'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not',
        'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too',
        'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom',
        'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it',
        'how', 'further', 'was', 'here', 'than'
    ]

    texts = [[
        word for word in document.lower().split() if word not in stoplist
    ] for document in documents]
    # print(texts)
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # print(dictionary.get(0))

    num_topics = 3
    lda = LdaModel(corpus=corpus,
                   id2word=dictionary,
                   num_topics=num_topics,
                   update_every=1,
                   random_state=100,
                   chunksize=100,
                   passes=20,
                   alpha='auto')

    temp_file = datapath(path + 'lda_model')
    lda.save(temp_file)
    lda = LdaModel.load(temp_file)

    dictionary.save(path + 'dict')

    return lda, dictionary
Example #33
def create_vocab(tweets):
    print("Building vocabulary...")
    vocab = Dictionary()
    vocab.add_documents(tweets)
    vocab.save('vocab_sentiment')
    return vocab
Example #34
0
class GensimTfidf(BaseEstimator, TransformerMixin):
    """
    Custom sklearn transformer to convert tokenized,
    preprocessed data to tf-idf representation.
    """
    def __init__(self,
                 tfidf_path=None,
                 dictionary_path=None,
                 use_sparse_representation=False):
        """
        Instantiate GensimTfidf object. If loading previously fit Dictionary and
        TfidfModel, you must specify a path to both the Dictionary and the TfidfModel.

        Parameters
        ----------
        tfidf_path : str
            Path to location of saved gensim TfidfModel.
            If specified, the model will load and use this object
            as its TfidfModel.
        dictionary_path : str
            Path to location of saved gensim Dictionary.
            If specified, the model will load and use this object
            as its Dictionary.
        use_sparse_representation: Boolean (default=False)
            When True, a sparse representation of the array is returned.
                Use this when feeding into a gensim model.
            When False, the full array is returned.
                Use this if feeding into sklearn estimator.
        """
        self.use_sparse_representation = use_sparse_representation
        self.dictionary = None
        self.tfidf = None
        # if both paths specified, load object
        if tfidf_path and dictionary_path:
            self._load(tfidf_path=tfidf_path, dictionary_path=dictionary_path)
        elif tfidf_path or dictionary_path:
            raise AttributeError(
                'If loading pre-fit Dictionary and TfidfModel,'
                ' both must be specified, not just one.')

    def _load(self, tfidf_path, dictionary_path):
        """
        If specified, attempts to load gensim TfidfModel from `tfidf_path`
        and gensim Dictionary from `dictionary_path`.

        Parameters
        ----------
        tfidf_path: str
            File-path from which self.tfidf should be loaded.
        dictionary_path: str
            File-path from which self.dictionary should be loaded.
        """
        import os
        from gensim.models import TfidfModel
        from gensim.corpora.dictionary import Dictionary
        if not os.path.exists(tfidf_path):
            raise IOError(
                'The provided file path to the TfidfModel was not found. '
                'Please ensure that the argument is the correct path.')
        if not os.path.exists(dictionary_path):
            raise IOError(
                'The provided file path to the Dictionary was not found. '
                'Please ensure that the argument is the correct path.')
        self.tfidf = TfidfModel.load(tfidf_path)
        self.dictionary = Dictionary.load(dictionary_path)

    def save(self, tfidf_path, dictionary_path):
        """
        Saves objects from fit process: gensim.TfidfModel to `tfidf_path`
        and gensim.Dictionary to `dictionary_path`.
        If either self.tfidf or self.dictionary does not exist, an
        AttributeError is raised.

        Parameters
        ----------
        tfidf_path: str
            File-path designating where self.tfidf should be saved.
        dictionary_path: str
            File-path designating where self.dictionary should be saved.
        """
        if not (self.tfidf and self.dictionary):
            raise AttributeError('Nothing to save yet, please run .fit first.')
        self.tfidf.save(tfidf_path)
        self.dictionary.save(dictionary_path)

    def fit(self, documents, labels=None):
        """
        Fits a gensim TfidfModel to documents.

        Parameters
        ----------
        documents: iterable
            List of documents. Each document must be a list of preprocessed tokens.
        labels: iterable
            Optional list of same size as documents, specifying label for each document.

        """
        from gensim.models import TfidfModel
        from gensim.corpora.dictionary import Dictionary
        self.dictionary = Dictionary(documents)
        self.tfidf = TfidfModel(
            [self.dictionary.doc2bow(doc) for doc in documents],
            id2word=self.dictionary)
        return self

    def transform(self, documents):
        """
        Returns a vectorized embedding of each document in documents.

        Parameters
        -----------
        documents: iterable
            List of documents. Each document must be a list of tokens.

        Returns
        -------
            iterable: list of vectorized documents.
        """
        from gensim.matutils import sparse2full
        if self.dictionary is None:
            raise AttributeError('Must have a fit vocab in order'
                                 ' to call transform.')

        def generator():
            """
            Generator closure whose yielded type depends on the value of `use_sparse_representation`.
            """
            for document in documents:
                vec = self.tfidf[self.dictionary.doc2bow(document)]
                if self.use_sparse_representation:
                    yield vec
                else:
                    yield sparse2full(vec, len(self.dictionary))

        return list(generator())
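# A hedged usage sketch for GensimTfidf above: fit on tokenized documents,
# transform to dense vectors for sklearn, persist the fitted objects, and
# reload them later. The documents and file names are illustrative.
docs = [['cats', 'and', 'dogs'], ['dogs', 'chase', 'cats'], ['birds', 'fly']]
vectorizer = GensimTfidf(use_sparse_representation=False)
vectors = vectorizer.fit(docs).transform(docs)   # list of dense numpy arrays
vectorizer.save('model.tfidf', 'model.dict')

# Reload without refitting:
vectorizer2 = GensimTfidf(tfidf_path='model.tfidf', dictionary_path='model.dict')
print(vectorizer2.transform([['dogs', 'and', 'birds']])[0])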
Example #35
0
# path = 'C:\\Users\\okigboo\\Desktop\\PythonDataScience\\tweeter\\'
os.chdir(path)

data = pd.read_csv('nyt.csv')

text_clean = []

for text in data['News_content']:
    text_clean.append(pptext(text).split())

print(text_clean[:3])

dictionary = Dictionary(text_clean)
corpus = [dictionary.doc2bow(text) for text in text_clean]
pickle.dump(corpus, open('topicModels//corpus2.pkl', 'wb'))
dictionary.save('topicModels//dictionary2.gensim')

ldamodel = LdaModel(corpus, num_topics=15, id2word=dictionary, passes=15)
ldamodel.save('topicModels//model15.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

# Visualization works only in Jupyter Notebook
# (launch with `jupyter notebook` or `jupyter console`)
dictionary = Dictionary.load('topicModels//dictionary2.gensim')
corpus = pickle.load(open('topicModels//corpus2.pkl', 'rb'))
ldamd = LdaModel.load('topicModels//model15.gensim')

lda_display = pyLDAvis.gensim.prepare(ldamd,
                                      corpus,
Example #36
0
  def saveGensim(self, topic):
    if topic is None:
      # generate all
      self.saveGensim('movie')
      self.saveGensim('celebrity')
      self.saveGensim('syria')
      self.saveGensim('ufo')
      return

    posDocs = []
    negDocs = []

    if topic == 'movie':
      topic = 'movie_reviews'
    elif topic == 'celebrity':
      topic = 'bieber'

    if topic == 'movie_reviews':
      count = 100
      posDocs = self.movieReviews('positive', count)
      negDocs = self.movieReviews('negative', count)
    else:
      posDocs = self.getArticlesHelper('positive', topic)
      negDocs = self.getArticlesHelper('negative', topic)

    listOfTokens = [] # dictionary
    docs = [] # corpus

    for posDoc in posDocs:
      processed = self.processDocForGensim(posDoc)
      tokens = self.tokensFromText(processed)
      listOfTokens.append(tokens)
      docs.append(processed)
    for negDoc in negDocs:
      processed = self.processDocForGensim(negDoc)
      tokens = self.tokensFromText(processed)
      listOfTokens.append(tokens)
      docs.append(processed)

    dictionaryFilename = 'gensim_dictionary.txt'
    corpusFilename = 'gensim_corpus.mm'

    # make destination files if they don't exist
    dictionaryPath = os.path.join(
      os.path.dirname(os.path.abspath(__file__)),
      'james_data',
      topic,
      dictionaryFilename
    )

    corpusPath = os.path.join(
      os.path.dirname(os.path.abspath(__file__)),
      'james_data',
      topic,
      corpusFilename
    )

    corpusTempPath = corpusPath + '.tmp'

    if os.path.exists(dictionaryPath):
      os.remove(dictionaryPath)

    if os.path.exists(corpusPath):
      os.remove(corpusPath)

    if os.path.exists(corpusTempPath):
      os.remove(corpusTempPath)

    with open(dictionaryPath, 'w') as f:
      f.write(' ')

    with open(corpusPath, 'w') as f:
      f.write(' ')

    # save dictionary and corpus
    d = Dictionary(listOfTokens)
    d.save(dictionaryPath)

    with open(corpusTempPath, 'w') as f:
      f.write('\n'.join(docs))

    corpus = TextCorpus(corpusTempPath)
    MmCorpus.save_corpus(corpusPath, corpus)

    return
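# A minimal follow-up sketch: reload what saveGensim wrote for one topic.
# The concrete paths are assumptions mirroring the layout used above.
from gensim.corpora.dictionary import Dictionary
from gensim.corpora import MmCorpus
import os

base = os.path.join('james_data', 'movie_reviews')
loaded_dict = Dictionary.load(os.path.join(base, 'gensim_dictionary.txt'))
loaded_corpus = MmCorpus(os.path.join(base, 'gensim_corpus.mm'))
print(len(loaded_dict), len(loaded_corpus))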
Example #37
0
    ]

    test_texts = [
        text_to_word_sequence(data['text'])
        for data in tqdm(imdb_dataset(test=True))
    ]
    test_labels = [
        sentiment[data['sentiment']] for data in imdb_dataset(test=True)
    ]

    # test = imdb_dataset(test=True)

    all_texts = np.concatenate((train_texts, test_texts)).tolist()

    vocabulary = Dictionary(documents=all_texts)
    vocabulary.save('imdb_vocabulary')

    train_x = np.asarray([
        np.asarray(vocabulary.doc2idx(doc), dtype=np.int32) + 1
        for doc in tqdm(train_texts)
    ])
    train_y = np.asarray(train_labels, dtype=np.int32)

    test_x = np.asarray([
        np.asarray(vocabulary.doc2idx(doc), dtype=np.int32) + 1
        for doc in tqdm(test_texts)
    ])
    test_y = np.asarray(test_labels, dtype=np.int32)

    np.save('train_x.npy', train_x)
    np.save('train_y', train_y)
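# A short sketch of reading the artifacts back (file names as saved above).
# Note that the doc2idx ids were shifted by +1, so 0 marks out-of-vocabulary
# tokens / padding and real token ids start at 1.
import numpy as np
from gensim.corpora.dictionary import Dictionary

vocabulary = Dictionary.load('imdb_vocabulary')
train_x = np.load('train_x.npy', allow_pickle=True)   # ragged array of id sequences
first_doc = [vocabulary[i - 1] for i in train_x[0] if i > 0]   # ids back to tokens
print(first_doc[:10])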
Example #38
0
        )
    else:
        langlinks_fname = sys.argv[1]
        in_dict_fnames = sys.argv[2:-1]
        num_langs = len(in_dict_fnames)
        out_dict_fname = sys.argv[-1]

        with open(langlinks_fname) as langlinks_file:
            langlinks = csv.reader(langlinks_file)
            lang_names = next(langlinks)  # Read header row

        out_dict = Dictionary()
        id_offset = 0
        for in_dict_fname, lang_name in zip(in_dict_fnames, lang_names):
            in_dict = Dictionary.load(in_dict_fname)
            for token, old_id in in_dict.token2id.items():
                df = in_dict.dfs[old_id]
                new_id = old_id + id_offset
                new_token = '{}#{}'.format(lang_name, token)

                out_dict.token2id[new_token] = new_id
                out_dict.dfs[new_id] = df

            out_dict.num_docs += in_dict.num_docs
            out_dict.num_pos += in_dict.num_pos
            out_dict.num_nnz += in_dict.num_nnz

            id_offset += len(in_dict)

        out_dict.save(out_dict_fname)
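# A quick sanity-check sketch for the merged dictionary written above: tokens
# are stored as '<lang>#<token>', so lookups must use the prefixed form.
# 'merged.dict', 'en' and 'apple' are illustrative placeholders.
from gensim.corpora.dictionary import Dictionary

merged = Dictionary.load('merged.dict')
print(merged.token2id.get('en#apple'))          # token id, or None if absent
print(merged.num_docs, merged.num_pos, merged.num_nnz)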
Example #39
0
    lt = LoopTimer(update_after=10, avg_length=1000, target=target)
    for abstract_id, row in infoDF.iterrows():
        doc = Doc(vocab).from_disk(
            os.path.join(path_to_annotations, f"{abstract_id}.spacy"))

        doc = replace_cluster_in_doc(doc, replace_dic, sorted_mentions, nlp)

        lemma_s_list.append(doc_2_token(doc, split_sentences=True))
        lemma_d_list.append(doc_2_token(doc, split_sentences=False))
        abstract_id_list.append(abstract_id)

        breaker = lt.update(f"Create Pandas - {len(lemma_d_list)}")

    dictionary = Dictionary(lemma_d_list)
    id_d_list = [dictionary.doc2idx(document) for document in lemma_d_list]
    id_s_list = [[dictionary.doc2idx(sentence) for sentence in document]
                 for document in lemma_s_list]

    corpus = {
        "abstract_id": abstract_id_list,
        "lemma_sentence": lemma_s_list,
        "lemma_document": lemma_d_list,
        "lemma_id_sentence": id_s_list,
        "lemma_id_document": id_d_list
    }

    with open(os.path.join(path_to_pandas, corpus_file_name), "wb") as handle:
        pickle.dump(corpus, handle)

    dictionary.save(os.path.join(path_to_pandas, dictionary_file_name))
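# A follow-up sketch: reload the pickled corpus and the Dictionary saved
# above and rebuild a bag-of-words view (path_to_pandas, corpus_file_name and
# dictionary_file_name are the same variables used in the snippet above).
import os
import pickle
from gensim.corpora.dictionary import Dictionary

with open(os.path.join(path_to_pandas, corpus_file_name), "rb") as handle:
    corpus = pickle.load(handle)
dictionary = Dictionary.load(os.path.join(path_to_pandas, dictionary_file_name))
bow = [dictionary.doc2bow(doc) for doc in corpus["lemma_document"]]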
Example #40
0
def saveWords(words, wordfile):
  from gensim.corpora.dictionary import Dictionary
  # Build a Dictionary from the tokenized documents and persist it to disk.
  dictionary = Dictionary(words)
  dictionary.save(wordfile)
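# A tiny usage sketch for saveWords above ('words.dict' and the token lists
# are illustrative): save a Dictionary and load it back.
from gensim.corpora.dictionary import Dictionary

saveWords([['hello', 'world'], ['hello', 'gensim']], 'words.dict')
print(Dictionary.load('words.dict').token2id)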
Example #41
0
File: parser.py Project: cboix/rdaneel
    
    db.hset('idlookup', index, postid)

class RedisCorpus(object):
    def __init__(self, postids):
        self.postids = postids
        self.numPosts = len(self.postids)
        
    def __iter__(self):
        count = 0
        for postid in self.postids:
            if count % 100 == 0:
                print "Wrote %d out of %d to corpus: %s" % (count, self.numPosts, time.strftime("%H:%M:%S"))
            addCorpusMap(count, postid)
            count += 1
            yield corpusOfPost(postid, force=True)

def buildCorpus():
    """ Returns a corpus object that contains sparse vectors from every post. """

    postids = getPostids()
    corpus = RedisCorpus(postids)
    return corpus

if __name__ == "__main__":
    buildDictionary(force=True)
    globalDict.save(dictName)

    corpus = buildCorpus()
    BleiCorpus.serialize('redditcorpus.lda-c', corpus)
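# A possible next step (sketch only): load the serialized Blei-format corpus
# and the saved dictionary, then fit an LDA model. File names follow the
# script above; the topic count is an arbitrary illustration.
from gensim.corpora.bleicorpus import BleiCorpus
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

corpus = BleiCorpus('redditcorpus.lda-c')
dictionary = Dictionary.load(dictName)   # dictName as in the script above; assumes globalDict is a gensim Dictionary
lda = LdaModel(corpus, id2word=dictionary, num_topics=20)
print(lda.print_topics(num_words=5))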
Example #42
0
File: corpus.py Project: Tooa/cablemap
class CableCorpus(BaseCorpus):
    """\
    The cable corpus consists of several files which are written into a directory.

    * a dictionary with a ``<word id> <word> <frequency>`` mapping saved under "wordids.pickle"
    * a JSON file with a ``<cable reference id> <document number>`` mapping under "id2docid.json"
    * a `Matrix Market format <http://math.nist.gov/MatrixMarket/formats.html>`_ vector space model file "bow.mm"

    CAUTION: The corpus overwrites any existing files with the same file name in the specified directory.

    By default, the corpus creates the word dictionary and the vector space model, which
    may lead to an unhelpful vector space model. To filter certain words, the corpus may be
    initialized with a pre-generated word dictionary. To make the dictionary immutable, the
    attribute ``allow_dict_updates`` should be set to ``False`` (updates are allowed by default);
    the resulting vector space model then contains only words that are present in the dictionary.

    Example to reduce the clutter::

        corpus = CableCorpus('/my/directory/')
        # Add some texts here
        corpus.add_text('ref-1', u'bla bla bla')
        corpus.add_text('ref-2', u'bla bla blub')
        ...
        corpus.dct.filter_extremes()
        corpus.close()

        from gensim.corpora.dictionary import Dictionary

        # Load previously created dict
        dct = Dictionary.load('/my/directory/cables_wordids.pickle')
        # Create another corpus with the previously word dict
        corpus = CableCorpus('/my/directory/', dct, allow_dict_updates=False)
        # Add some texts
        ....
        corpus.close()
    """
    def __init__(self, path, dct=None, tokenizer=None, allow_dict_updates=True, prefix=None):
        """\
        Initializes the cable corpus.
        
        `path`
            Directory where the generated files are stored.
        `dct`
            An existing `gensim.corpora.dictionary.Dictionary`
            If it's ``None`` (default), a new dictionary will be created.
        `tokenizer`
            A function to tokenize/normalize/clean-up/remove stop words from strings.
            If it's ``None`` (default), a default function will be used to tokenize texts.
        `allow_dict_updates`
            Indicates whether unknown words should be added to the dictionary (default ``True``).
        `prefix`
            A prefix for the generated file names.
        """
        super(CableCorpus, self).__init__(tokenizer)
        if not os.path.isdir(path):
            raise IOError('Expected a directory path')
        self.dct = Dictionary() if dct is None else dct
        self._path = path
        self._prefix = prefix or 'cables_'
        self._mw = IncrementalMmWriter(os.path.join(path, self._prefix + 'bow.mm'))
        self.allow_dict_updates = allow_dict_updates
        self._cables = []

    def add_words(self, reference_id, words):
        self._cables.append(reference_id)
        self._mw.add_vector(self.dct.doc2bow(words, self.allow_dict_updates))

    def close(self):
        self._mw.close()
        self.dct.save(os.path.join(self._path, self._prefix + 'wordids.pickle'))
        json_filename = os.path.join(self._path, self._prefix + 'id2docid.json')
        json.dump(dict(zip(self._cables, count())), open(json_filename, 'w'))
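# A hedged usage sketch for CableCorpus, following its docstring: add a few
# token lists, close, then reload the generated artifacts ('/my/directory/'
# and the tokens are illustrative; the directory must already exist).
from gensim.corpora.dictionary import Dictionary
from gensim.corpora import MmCorpus

cables = CableCorpus('/my/directory/')
cables.add_words('ref-1', [u'bla', u'bla', u'bla'])
cables.add_words('ref-2', [u'bla', u'bla', u'blub'])
cables.close()

dct = Dictionary.load('/my/directory/cables_wordids.pickle')
bow = MmCorpus('/my/directory/cables_bow.mm')
print(len(dct), len(bow))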