Example #1
def tfidf_filter(dataset, threshold):
    tokens = []
    #print('tokenizing documents...')
    for doc in dataset:
        #doc = clean_text(doc)
        tokenize = regTokenize(doc)
        tokens.append(tokenize)
    #print('creating dictionary...')
    dct = Dictionary(tokens)
    corpus = [dct.doc2bow(line) for line in tokens]
    #print(len(corpus))
    #print('creating tf-idf model...')
    model = TfidfModel(corpus, id2word=dct)
    low_value_words = []
    for bow in corpus:
        low_value_words += [
            id for id, value in model[bow] if (value < threshold)
        ]  #and dct[id] != "reforma_tributaria")]
    #print("low_value_words:",len(low_value_words))
    dct.filter_tokens(bad_ids=low_value_words)
    new_corpus = [dct.doc2bow(doc) for doc in tokens]
    #print(len(new_corpus))
    corp = []
    for doc in new_corpus:
        corp.append([dct[id] for id, value in doc])
    return corp
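A minimal usage sketch for the filter above (hypothetical documents; assumes regTokenize and the gensim Dictionary/TfidfModel imports used by the snippet are available):

sample_docs = [
    "tax reform changes the consumption tax rules",
    "the committee debated the tax reform bill at length",
]
# Every token whose TF-IDF weight falls below 0.1 in some document is dropped.
filtered_docs = tfidf_filter(sample_docs, threshold=0.1)
print(filtered_docs)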
def remove_rare_often_word(texts, low_value, high_value):
    #removing frequent and rare words
    texts_tokenized = [simple_preprocess(doc) for doc in texts]
    dictionary = Dictionary(texts_tokenized)
    corpus = [dictionary.doc2bow(doc) for doc in texts_tokenized]

    tfidf = TfidfModel(corpus, id2word=dictionary)
    corpus_tfidf = tfidf[corpus]

    bad_words = []
    for sent_tfidf in tqdm(corpus_tfidf, desc="selecting bad words"):
        bad_words += [
            id for id, value in sent_tfidf
            if (value < low_value) or (value > high_value)
        ]

    dictionary.filter_tokens(bad_ids=bad_words)

    out_bow = [dictionary.doc2bow(doc) for doc in texts_tokenized]

    out_corpus = []
    for doc in tqdm(out_bow, desc='Creating out corpus'):
        out_corpus.append([dictionary.get(id) for id, value in doc])

    dict_tfidf = {
        dictionary.get(id): value
        for doc in corpus_tfidf for id, value in doc
        if (value >= low_value) and (value <= high_value)
    }

    return {
        'texts': out_corpus,
        'dict_tfidf': dict_tfidf,
        'dictionary': dictionary
    }
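A hedged usage sketch for remove_rare_often_word (toy texts; assumes simple_preprocess, Dictionary, TfidfModel and tqdm are imported as in the snippet):

texts = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs play together",
]
result = remove_rare_often_word(texts, low_value=0.1, high_value=0.95)
print(result["texts"])            # tokenized documents without extreme TF-IDF words
print(len(result["dictionary"]))  # size of the filtered dictionary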
Example #3
class MiCorpus:
    """
    Iterable: each iteration yields bag-of-words vectors, one per document.
    Processes one document at a time using generators; the full corpus is never loaded into RAM.
    """
    def __init__(self, directorio, lenguaje, otros=None):
        self.directorio = directorio
        self.lenguaje = lenguaje
        self.otros = otros

        self.ngramas = model_ngrams(
            iter_sentences(self.directorio, self.lenguaje, self.otros))

        self.diccionario = Dictionary(
            iter_documents(self.ngramas, self.directorio, self.lenguaje,
                           self.otros))
        self.diccionario.filter_extremes(no_above=0.8)
        self.diccionario.filter_tokens(
            bad_ids=(tokid for tokid, freq in self.diccionario.dfs.items()
                     if freq == 1))
        self.diccionario.compactify()

    def __iter__(self):
        """
        CorpusConsultivos is a streamed iterable.
        """
        for tokens in iter_documents(self.ngramas, self.directorio,
                                     self.lenguaje, self.otros):
            yield self.diccionario.doc2bow(tokens)
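A sketch of how this streamed corpus could feed an LDA model (hypothetical constructor arguments; assumes iter_sentences, iter_documents and model_ngrams from the surrounding module):

from gensim.models import LdaModel

corpus = MiCorpus("data/consultivos", "spanish")
lda = LdaModel(corpus, id2word=corpus.diccionario, num_topics=20, passes=2)
print(lda.show_topics(5))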
Example #4
def texts2corpus(documents,
                 tfidf=False,
                 stopwords=None,
                 filter_below=5,
                 filter_above=0.5,
                 keep_n=100000,
                 logg=print):
    logg(f'generating {"tfidf" if tfidf else "bow"} corpus and dictionary')

    dictionary = Dictionary(documents, prune_at=None)
    dictionary.filter_extremes(no_below=filter_below,
                               no_above=filter_above,
                               keep_n=keep_n)

    # filter some noise (e.g. special characters)
    if stopwords:
        stopword_ids = [dictionary.token2id[token] for token in stopwords
                        if token in dictionary.token2id]
        dictionary.filter_tokens(bad_ids=stopword_ids, good_ids=None)

    bow_corpus = [dictionary.doc2bow(text) for text in documents]
    if tfidf:
        tfidf_model = TfidfModel(bow_corpus)
        corpus = tfidf_model[bow_corpus]
    else:
        corpus = bow_corpus

    return corpus, dictionary
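A small, self-contained call (toy token lists; the thresholds are relaxed so the tiny vocabulary survives filter_extremes):

docs = [
    ["human", "machine", "interface"],
    ["graph", "of", "trees"],
    ["human", "graph", "survey"],
]
corpus, dictionary = texts2corpus(docs, tfidf=True, stopwords={"of"},
                                  filter_below=1, filter_above=1.0)
print(len(dictionary), [list(doc) for doc in corpus])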
Example #5
    def testFilterTokens(self):
        self.maxDiff = 10000
        d = Dictionary(self.texts)

        removed_word = d[0]
        d.filter_tokens([0])

        expected = {
            'computer': 0,
            'eps': 8,
            'graph': 10,
            'human': 1,
            'interface': 2,
            'minors': 11,
            'response': 3,
            'survey': 4,
            'system': 5,
            'time': 6,
            'trees': 9,
            'user': 7
        }
        del expected[removed_word]
        self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))

        expected[removed_word] = len(expected)
        d.add_documents([[removed_word]])
        self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))
Example #6
    def _build_vocab(self, max_vocab_cnt):
        all_words = []
        for data in self.valid + self.non_valid:
            all_words.append(data["title"] + data["content"])
        vocab = Dictionary(all_words)
        raw_vocab_size = len(vocab)

        vocab.filter_extremes(no_below=5)
        vocab.filter_extremes(keep_n=max_vocab_cnt)
        len_1_words = list(
            filter(
                lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and w
                not in ["a", "i"] and True or False, vocab.values()))
        vocab.filter_tokens(list(map(vocab.token2id.get, len_1_words)))
        if self.config.use_dict == "seq" and self.config.enable_pad:
            vocab.token2id[PAD] = len(vocab)
            vocab.compactify()
            self.pad_wid = vocab.token2id.get(PAD)
        self.vocab_seq = vocab  # seq dictionary
        # build bow dictionary
        self.vocab_bow = copy.deepcopy(vocab)
        self.vocab_bow.filter_tokens(
            map(self.vocab_bow.token2id.get, STOPWORDS))  # filter stop words
        self.vocab_bow.compactify()
        if self.config.tfidf:
            tfidf_corpus = [self.vocab_bow.doc2bow(line) for line in all_words]
            self.tfidf_model = TfidfModel(tfidf_corpus)
        print("Load corpus with non_valid size %d, valid size %d, "
              "raw vocab size %d seq vocab size %d, bow vocab size %d" %
              (len(self.non_valid), len(self.valid), raw_vocab_size,
               len(self.vocab_seq), len(self.vocab_bow)))
Example #7
def prepare_corpus(tweets_file, corpus_file, dictionary_file, author_topic):
    stop_words = set(stopwords.words('english'))
    stop_words.add(u'rt')

    print('Loading tweets from ' + tweets_file)
    tweets = pd.read_pickle(tweets_file)

    if author_topic:
        tweets = tweets.groupby('user').agg({'text': 'sum'})

    print('%d tweets loaded' % len(tweets.index))

    dictionary = Dictionary(tweets['text'])
    stopword_ids = map(dictionary.token2id.get, stop_words)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
    dictionary.compactify()

    corpus = [dictionary.doc2bow(doc) for doc in tweets['text']]

    # print(corpus)
    print("Writing corpus to " + corpus_file)
    MmCorpus.serialize(corpus_file, corpus)
    # print(dictionary)
    print("Writing dictionary to " + dictionary_file)

    dictionary.save(dictionary_file)
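A hedged call sketch (hypothetical file names); the pickle is assumed to hold a DataFrame whose 'text' column already contains token lists. With author_topic=True the function would first concatenate each user's tweets into one document.

prepare_corpus("data/tweets.pkl", "data/tweets_corpus.mm",
               "data/tweets.dict", author_topic=False)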
Example #8
class process_corpus(object):

    def __init__(self, sql=None,lemmatize=False,first_sentences=False,n_sentences=10):

        self.sql=sql
        self.first_sentences=first_sentences
        self.n_sentences=n_sentences
        self.wordnet=WordNetLemmatizer()
        self.pstemmer=PorterStemmer()
        self.lemmatize=lemmatize
        self.dictionary = Dictionary(self.iterrecords())

        print('dictionary before:', self.dictionary.token2id)


        once_ids = [tokenid for tokenid, docfreq in self.dictionary.dfs.items() if docfreq == 1]
        self.dictionary.filter_tokens(once_ids)
        self.dictionary.compactify()
        print('dictionary after filtering:', self.dictionary.token2id)

    def __iter__(self):
        self.cl=0
        for tokens in self.iterrecords():  # generates the document tokens and creates bow using dictionary
            self.cl+=1

            yield self.dictionary.doc2bow(tokens)






    def iterrecords(self): # generates document tokens for the dictionary

        self.index=[]
        cursor.execute(self.sql)
        ct=0


        for doc in cursor:
                print(ct)
                self.index.append(str(doc[0]).strip())

                doc=doc[1]
#                print to_beautiful(doc[1])


                if self.first_sentences:

                    doc=get_first_n_sentences_from_document(doc,self.n_sentences)


                tokens=clean_text_by_word(doc)

                ct+=1
                yield  tokens # or whatever tokenization suits you

    def __len__(self):

        return self.cl

def fetch_dict():
    global dictionary
    dictionary = Dictionary([i for i in my_dictionary])
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    dictionary.save("Topic/dic.loc")
    return dictionary
Example #10
def pipline(data: DataFrame):
    if os.path.isfile(cache_path('run/' + _ARGS.name)):
        corpus, dictionary, documents = load_cache('run/' + _ARGS.name)
    elif data:
        documents = data['tokens'].to_list()
        # Create a dictionary representation of the documents.
        dictionary = Dictionary(documents)

        # Filter out words that occur less than 20 documents, or more than 50% of the documents.
        dictionary.filter_extremes(no_below=20, no_above=0.5)

        # Remove stopwords
        bad_ids = [dictionary.token2id[t] for t in STOP_WORDS if t in dictionary.token2id]
        dictionary.filter_tokens(bad_ids=bad_ids)

        # Bag-of-words representation of the documents.
        corpus = [dictionary.doc2bow(doc) for doc in documents]
        dump_cache((corpus, dictionary, documents), 'run/' + _ARGS.name)
    else:
        raise ValueError('cache does not exist and no data was passed in')

    _ = dictionary[0]  # This is only to "load" the dictionary.
    output('Number of unique tokens: ', len(dictionary))
    output('Number of documents: ', len(corpus))
    # test = get_model(6, corpus, dictionary.id2token)

    topic_range = tuple(int(s.strip()) for s in _ARGS.range.split(','))
    kwargs = dict(
        id2word=dictionary.id2token, chunksize=len(corpus),
        passes=_ARGS.passes, alpha='auto', eta='auto', eval_every=1,
        iterations=_ARGS.iterations, random_state=123)
    if len(corpus) < 1e6:  # train models in parallel
        pool = Pool(_ARGS.pool_size)
        result_dict = dict()
        for k in range(*topic_range):
            result_dict[k] = pool.apply_async(get_model, (corpus, k, kwargs))
        result_dict = {k: v.get() for k, v in result_dict.items()}
        pool.close()  # close the pool once the child processes have finished
        pool.join()
        output(f"Searched range{topic_range}")
        # the coherence computation uses multiprocessing itself, so this part runs serially
        for k, (model, ids) in result_dict.items():
            eval_and_write(data, k, documents, dictionary, corpus, model, ids)
    else:
        # kwargs['alpha'] = 'symmetric'
        kwargs['chunksize'] = len(corpus) // 8 // _ARGS.pool_size + 1
        # kwargs['batch'] = True
        for k in range(*topic_range, 2):  # use a coarser sweep for large data
            # model = LdaMulticore(corpus, k, workers=_ARGS.pool_size, **kwargs)
            model = LdaModel(corpus, k, **kwargs)
            ids = save_and_inference(model, corpus, k, kwargs['chunksize'])
            # result_dict[k] = (model, ids)  # not enough memory for 4M sentences
            eval_and_write(None, k, documents, dictionary, corpus, model, ids)
            del model, ids
            gc.collect()

    output(f"===> {_ARGS.name} complete. \n")
Example #11
 def fetch_dict():
     global dictionary
     dictionary = Dictionary([i for i in my_dictionary])
     once_ids = [
         tokenid for tokenid, docfreq in dictionary.dfs.items()
         if docfreq == 1
     ]
     dictionary.filter_tokens(once_ids)
     dictionary.compactify()
     dictionary.save("Topic/dic.loc")
     return dictionary
Example #12
        def prep_corpus(docs, additional_stopwords=set(), no_below=2, no_above=0.05):

            dictionary = Dictionary(docs)
            stopwords = nltk_stopwords().union(additional_stopwords)
            stopword_ids = map(dictionary.token2id.get, stopwords)
            dictionary.filter_tokens(stopword_ids)
            dictionary.compactify()
            dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
            dictionary.compactify()
            corpus = [dictionary.doc2bow(doc) for doc in docs]

            return dictionary, corpus
Example #13
def Gensim_Dic(sentences, tmp_fname):
    dct = Dictionary(sentences)

    a = []
    for w in stopwords:
        if w in dct.token2id.keys():
            a.append(dct.token2id[w])

    dct.filter_extremes(no_below=10)

    dct.filter_tokens(bad_ids=a)
    dct.compactify()
    dct.save_as_text(tmp_fname)
Example #14
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
  print('Building dictionary...')
  dictionary = Dictionary(docs)
  stopwords = nltk_stopwords().union(additional_stopwords)
  stopword_ids = map(dictionary.token2id.get, stopwords)
  dictionary.filter_tokens(stopword_ids)
  dictionary.compactify()
  dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
  dictionary.compactify()

  print('Building corpus...')
  corpus = [dictionary.doc2bow(doc) for doc in docs]

  return dictionary, corpus
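A quick, hedged call (toy docs; nltk_stopwords() is assumed to return NLTK's English stopword set, as in the later snippet):

docs = [
    ["human", "computer", "interface", "survey"],
    ["graph", "minors", "trees"],
    ["human", "graph", "survey"],
]
dictionary, corpus = prep_corpus(docs, additional_stopwords={"nbsp"},
                                 no_below=1, no_above=1.0)
print(dictionary.token2id)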
Example #15
    def __prep_dict(self, doc):
        def nltk_stopwords():
            return set(nltk.corpus.stopwords.words('english'))

        additional_stopwords = [
            'nbsp', '.', ',', '"', "'", '?', '!', '>', ':', ';', '(', ')', '[',
            ']', '{', '}', '/', '.com'
        ]
        dictionary = Dictionary(doc)
        stopwords = nltk_stopwords().union(additional_stopwords)
        stopword_ids = map(dictionary.token2id.get, stopwords)
        dictionary.filter_tokens(stopword_ids)
        dictionary.compactify()
        return dictionary.doc2bow(doc)

def fetch_dict():
    print("Fetching Dictionary...", end=" ")
    try:
        dictionary = Dictionary().load("Topic/dic.tm")
        print("Dictionary loaded!")
    except IOError:
        print("Dictionary not found, building Dictionary...")
        dictionary = Dictionary(i for i in MyDictionary())
        once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
        dictionary.filter_tokens(once_ids)
        dictionary.compactify()
        print("\rDictionary Built!")
        print(dictionary)
        dictionary.save("Topic/dic.tm")
    return dictionary
Example #17
def extract_topics(words):
    word_id_map=Dictionary([words])
    word_id_map.filter_tokens([id for id, occurrence in word_id_map.dfs.items() if occurrence == 2])
    word_id_map.compactify()
    deals_corpus=[word_id_map.doc2bow(words)]
    lda=LdaModel(corpus=deals_corpus, id2word=word_id_map, num_topics=15, update_every=1, chunksize=1000,passes=1)
    topics=[]
    for i in range(15):
        tokens=lda.print_topic(i).split('+')
        topic_scores=[]
        for token in tokens:
            score,token_val=token.split('*')
            topic_scores.append((token_val,score))
        topics.append(topic_scores)
    return topics
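A toy invocation (a single short document, so the docfreq == 2 filter removes nothing here; assumes the Dictionary and LdaModel imports used above):

words = "machine learning topic models latent dirichlet allocation".split()
for topic in extract_topics(words):
    print(topic[:3])  # top (token, score) pairs for each topic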
Example #18
    def testFilterTokens(self):
        self.maxDiff = 10000
        d = Dictionary(self.texts)

        removed_word = d[0]
        d.filter_tokens([0])

        expected = {'computer': 0, 'eps': 8, 'graph': 10, 'human': 1,
                'interface': 2, 'minors': 11, 'response': 3, 'survey': 4,
                'system': 5, 'time': 6, 'trees': 9, 'user': 7}
        del expected[removed_word]
        self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))

        expected[removed_word] = len(expected)
        d.add_documents([[removed_word]])
        self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))

def fetch_dict():
    print("Fetching Dictionary...", end=" ")
    try:
        dictionary = Dictionary().load("Topic/dic.tm")
        print("Dictionary loaded!")
    except IOError:
        print("Dictionary not found, building Dictionary...")
        dictionary = Dictionary(i for i in MyDictionary())
        once_ids = [
            tokenid for tokenid, docfreq in dictionary.dfs.items()
            if docfreq == 1
        ]
        dictionary.filter_tokens(once_ids)
        dictionary.compactify()
        print("\rDictionary Built!")
        print(dictionary)
        dictionary.save("Topic/dic.tm")
    return dictionary
Example #20
    def execute(self, data, passes=10):
        wordlists = [corpus.contents.lower().split() for corpus in data]

        stoplist = stopwords.words('english')

        dictionary = Dictionary(wordlists)

        # Remove stop words and words that appear too much or too little
        stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
        dictionary.filter_tokens(stop_ids)
        dictionary.filter_extremes(no_below=2, no_above=0.2)

        bags_of_words = [dictionary.doc2bow(t) for t in wordlists]

        # This can take a while to run:
        lda = LdaModel(bags_of_words, id2word=dictionary, num_topics=self.num_topics, passes=passes)

        results = self.assemble_topics(lda)
        return results
Example #21
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA. 
    LDA is a little better than LSA as it provides a reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only, so for a particular topic modeling task
    it is a lighter option to install and run. It can also be run distributed and updated over an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print("Dictionary file does not exist. Creating one")
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.items() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print dictionary

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print("Corpus file does not exist. Creating one")
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print mm
    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1, chunksize=1000,
                   passes=1)
    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
def prepare_word_embedding():
    """Construct vocabulary file and word embedding file.
    """
    df = pd.read_csv(
        "data/raw/train.csv", usecols=["original_phrase1", "original_phrase2", "ytrue"]
    )

    model = KeyedVectors.load_word2vec_format(
        "/data/mayu-ot/Data/Model/GoogleNews-vectors-negative300.bin.gz", binary=True
    )

    CUSTOM_FILTERS = [
        lambda x: x.lower(),
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
    ]

    doc = [preprocess_string(x, CUSTOM_FILTERS) for x in df.values[:, :2].ravel()]

    dct = Dictionary(doc)

    bad_ids = []
    for k, v in dct.iteritems():
        if v not in model:
            bad_ids.append(k)
    dct.filter_tokens(bad_ids)

    dct.compactify()

    for k, v in dct.iteritems():
        print(k, v)
        if k == 10:
            break

    dct.save_as_text("data/processed/dictionary.txt")

    word_emb = np.ones((len(dct), 300))

    for k, v in dct.iteritems():
        word_emb[k] = model[v]

    np.save("data/processed/word2vec", word_emb)
Example #23
    def train_lda(self, df, n_topics, min_count=2, labels=None, tag=False):
        """
        Learn an LDA topic model from input data using gensim
        :param df:
        :param n_topics:
        :param min_count:
        :return:
        """
        
        #Save class labels if necessary
        if labels is not None:
            y = df.loc[:, labels].values
        
        #Clean and find phrases
        df = read_clean(df, phraser=self.phrases)
        
        # Get gensim dictionary, remove function words and infrequent words
        common_dictionary = Dictionary(df)
        common_dictionary.filter_extremes(no_below=min_count)
        remove_ids = [common_dictionary.token2id[x] for x in self.function_words_single if
                      x in common_dictionary.token2id]

        # Filter out words we don't want
        common_dictionary.filter_tokens(bad_ids=remove_ids)
        common_corpus = [common_dictionary.doc2bow(text) for text in df]

        # Train LDA
        lda = LdaModel(common_corpus, 
                        num_topics=n_topics,
                        distributed=False,
                        passes=10,
                        iterations=10,
                        )

        # Save to class
        self.lda = lda
        self.lda_dictionary = common_dictionary
        ai_logger.debug("Done learning LDA model")
        
        #If necessary, annotate the corpus as well
        if tag:
            tag_df = self.use_lda(df, y, cleaned=True)
            return tag_df
Example #24
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    # remove stopwords
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    # get ids for short words len(word)<=3
    shortword_ids = [tokenid for tokenid, word in dictionary.iteritems() if len(word.split('/')[0])<= 3]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    # get ids for short words len(word)<=3
    shortword_ids = [tokenid for tokenid, word in dictionary.iteritems() if len(word.split('/')[0])<= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()
    # remove words that appear only once
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    # filter extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus
Example #25
def build_dictionary(text_file, stop_words):
    """
    This function takes a text file and a file of stop words, and builds dictionary with pairs of word indexes and word
    counts in every paragraph.

    :param text_file: Input text file
    :param stop_words: Text file of stop words
    :return: Corpus object (=list of paragraphs); each paragraph is a list of pairs (word-index, word-count)
    """
    words, paragraphs = process_text(text_file)
    dictionary = Dictionary(words)

    # Gather all stop words
    with codecs.open(stop_words, "r", "utf-8") as stop_w:
        stop_words = stop_w.read().split(',')

    # Gather all stop word ids
    stop_word_ids = []
    for i in range(len(dictionary)):
        if dictionary[i] in stop_words:  # Check if stop word exists in dictionary
            stop_word_ids.append(dictionary.token2id[dictionary[i]])
    dictionary.filter_tokens(stop_word_ids)  # Filter out all stop words

    bags_of_words = []
    printProgressBar(0,
                     len(words),
                     prefix='Building dictionary:',
                     suffix='Complete',
                     length=50)
    for i in range(len(words)):
        printProgressBar(i + 1,
                         len(words),
                         prefix='Building dictionary:',
                         suffix='Complete',
                         length=50)
        bags_of_words.append(dictionary.doc2bow(words[i]))

    return bags_of_words, dictionary, paragraphs
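A hedged usage sketch (hypothetical file names; process_text and printProgressBar are assumed to exist as used above):

bags, dictionary, paragraphs = build_dictionary("corpus.txt", "common-english-words.txt")
print(len(dictionary), "tokens remain after stop-word filtering")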
Example #26
def prep_text_lda(docs, vocab_size=20000):
    """ docs: (pd.Series str) cleaned text """

    english_stopwords = set([s.replace("\'", "") for s in stopwords.words("english")])
    tqdm.pandas(desc="Tokenizing")
    tokenized_docs = docs.progress_apply(lambda x: [w.lower() for w in tokenize(x)])

    bigram = Phrases(tokenized_docs.values.tolist())
    phraser = Phraser(bigram)
    tqdm.pandas(desc="Bigrams")
    bigrammed_docs = tokenized_docs.progress_apply(lambda tokens_: phraser[tokens_])

    id2word = Dictionary(bigrammed_docs.values.tolist())
    id2word.filter_extremes(keep_n=vocab_size, no_above=0.5)
    id2word.filter_tokens(bad_ids=[id2word.token2id[a] for a in english_stopwords if a in id2word.token2id])
    id2word.compactify()

    tqdm.pandas(desc="Cleaning")
    tokenized = bigrammed_docs.progress_apply(lambda doc_tokens: " ".join([w for w in doc_tokens if w in id2word.token2id]))
    reconst_docs = tokenized.apply(lambda x: x.split())

    return id2word, reconst_docs
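A sketch of calling prep_text_lda on a small pandas Series (assumes the tokenize, Phrases/Phraser, Dictionary, tqdm and stopwords imports used above):

import pandas as pd

docs = pd.Series([
    "new york city approves new housing budget",
    "the housing budget for new york city grows",
    "city council debates the housing plan",
])
id2word, lda_docs = prep_text_lda(docs, vocab_size=1000)
print(len(id2word), lda_docs.tolist())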
Example #27
 def __init__(self, directory=None, dictionary=None, distributions=None, corpus=None, max_docs=None):
     if directory:
         docs = self.get_docs(directory, distributions, max_docs)
         if not dictionary:
             """ Construct dictionary without having all texts in memory, based off the example in the Gensim docs"""
             dictionary = Dictionary(filter_common(codecs.open(doc, encoding='utf-8').read().lower().split()) for doc in docs)
             once_words = [id for id, freq in dictionary.dfs.items() if freq == 1]
             dictionary.filter_tokens(once_words)     # Exclude if appears once
             dictionary.compactify()                  # Remove gaps in ids left by removing words
             dictionary.filter_extremes(no_below=20, no_above=0.75, keep_n=None)  # Filter if in less than 20 docs and if in more than 75%
             self.dictionary = dictionary
         else:
             self.dictionary = Dictionary.load(dictionary)
         self.docs = PaperCorpus(docs)
     elif dictionary and corpus:
         self.dictionary = Dictionary.load(dictionary)
         self.docs = MmCorpus(corpus)
     else:
         self.dictionary = Dictionary([])
         self.docs = PaperCorpus([])
     self.transformation = IdentityTransformation()
     self.train_time = None
     self.sim_index = None
     return
Example #28
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    # remove stopwords
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    # get ids for short words len(word)<=3
    shortword_ids = [
        tokenid for tokenid, word in dictionary.iteritems()
        if len(word.split('/')[0]) <= 3
    ]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    # get ids for short words len(word)<=3
    shortword_ids = [
        tokenid for tokenid, word in dictionary.iteritems()
        if len(word.split('/')[0]) <= 3
    ]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()
    # remove words that appear only once
    once_ids = [
        tokenid for tokenid, docfreq in dictionary.dfs.items()
        if docfreq == 1
    ]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    # filter extreme values
    dictionary.filter_extremes(no_below=no_below,
                               no_above=no_above,
                               keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus
Example #29
mecab = MeCab.Tagger("-Owakati")

# Words to exclude from the dictionary
words_blacklist = [
    ">>",  # chat annotation marker
    "some_agent",
    "\u3000",  # full-width space
    "。",
    "、",
]

dct = Dictionary()
# Read the CSV file
df = pd.read_csv(filepath, delimiter=",", names=["talker", "words", "type"])
# Tokenize each sentence with MeCab, split on single-byte spaces, and drop the trailing newline token
wakati_df = df["words"].map(lambda x: mecab.parse(x).split(" ")[:-1])
# Add the tokenized sentences to the dictionary
dct.add_documents(wakati_df)

# Look up the dictionary ids of the blacklisted words
words_blacklist_id = dct.doc2idx(words_blacklist)
# Remove them from the dictionary
dct.filter_tokens(bad_ids=words_blacklist_id)
#dct.filter_n_most_frequent(600)

# Save the dictionary
dct.save(os.path.join(filedir, ".".join([filename, "dict"])))

# Show the dictionary contents and vocabulary size
print(dct.token2id)
print(len(dct.token2id))
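A quick sanity check one might add after saving (same hypothetical paths as above): reload the dictionary and confirm the blacklisted tokens are gone.

loaded = Dictionary.load(os.path.join(filedir, ".".join([filename, "dict"])))
assert all(word not in loaded.token2id for word in words_blacklist)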
Example #30
class CMVCorpus(object):
    logger = logging.getLogger(__name__)

    def __init__(self, config):
        self.config = config
        self._path = config.data_dir[0]
        self.max_data_size = config.max_data_size
        self.max_utt_len = config.max_utt_len
        self.tokenize = get_chat_tokenize()
        self.train_corpus, self.test_corpus = self._read_file(
            os.path.join(self._path))
        self._build_vocab(config.max_vocab_cnt)
        print("Done loading corpus")

    def _process_dialog(self, data):
        new_dialog = []
        all_lens = []
        all_dialog_lens = []

        for raw_dialog in data:
            dialog = {
                "title": self.tokenize(raw_dialog['title'].lower()),
                "op": self.tokenize(raw_dialog["content"].lower()),
                "pos_conv_lst": [],
                "neg_conv_lst": []
            }
            for i, turns in enumerate(
                    raw_dialog['comments']):  # for each comment lst
                if turns["win"]:
                    conv_lst = dialog["pos_conv_lst"]
                else:
                    conv_lst = dialog["neg_conv_lst"]
                new_utt_lst = []
                for turn in turns["utt_lst"]:
                    argument = self.tokenize(turn.lower())
                    all_lens.append(len(argument))
                    new_utt_lst.append(argument)
                conv_lst.append(new_utt_lst)
                all_dialog_lens.append(len(new_utt_lst))
            new_dialog.append(dialog)
            # cut for the max data size
            if len(new_dialog) >= self.max_data_size:
                break

        print("Max utt len %d, mean utt len %.2f" %
              (np.max(all_lens), float(np.mean(all_lens))))
        print("Max dialog len %d, mean dialog len %.2f" %
              (np.max(all_dialog_lens), float(np.mean(all_dialog_lens))))
        return new_dialog

    def _build_vocab(self, max_vocab_cnt):
        all_words = []
        for dialog in self.train_corpus:
            all_words.append(dialog["op"] + dialog["title"])
            for turns in dialog["pos_conv_lst"] + dialog["neg_conv_lst"]:
                for turn in turns:
                    all_words.append(turn)

        self.vocab_bow = Dictionary(all_words)
        raw_vocab_size = len(self.vocab_bow)
        raw_wc = np.sum(list(self.vocab_bow.dfs.values()))

        # build useless stopwords vocab (e.g. very few words, single-ascii words, some punctuation ,."')
        self.vocab_bow.filter_extremes(no_below=10, keep_n=max_vocab_cnt)
        bad_ids = HTML_STOPWORDS + ['cmv']
        self.vocab_bow.filter_tokens(
            list(map(self.vocab_bow.token2id.get, bad_ids)))
        self.vocab_bow.compactify()
        self.vocab_seq = copy.deepcopy(self.vocab_bow)  # for sequence model
        self.vocab_seq.token2id[self.vocab_seq[0]] = len(self.vocab_seq)
        self.vocab_seq.token2id[PAD] = 0
        self.vocab_seq.token2id[UNK] = len(self.vocab_seq)
        self.vocab_seq.compactify()
        self.pad_wid = self.vocab_seq.token2id.get(PAD)

        len_1_words = list(
            filter(
                lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and w
                not in ["[", "]", "$", "?", "!", "\"", "'", "i", "a"
                        ] and True or False, self.vocab_bow.values()))
        self.vocab_bow.filter_tokens(
            list(map(self.vocab_bow.token2id.get, len_1_words)))
        # some makeup words
        # makeup_lst = [PAD]
        # for w in makeup_lst:
        #     self.vocab_bow.token2id[w] = len(self.vocab_bow)
        # self.vocab_bow.compactify()
        # self.pad_wid = self.vocab_bow.token2id.get(PAD)
        # here we keep stopwords and some meaningful punctuations
        non_stopwords = filter(
            lambda w: re.match(r"^[\w\d_-]*$", w) and w not in STOPWORDS and
            True or False, self.vocab_bow.values())
        self.vocab_bow_stopwords = copy.deepcopy(self.vocab_bow)
        self.vocab_bow_stopwords.filter_tokens(
            map(self.vocab_bow_stopwords.token2id.get, non_stopwords))
        self.vocab_bow_stopwords.compactify()
        self.vocab_bow_non_stopwords = copy.deepcopy(self.vocab_bow)
        self.vocab_bow_non_stopwords.filter_tokens(
            map(self.vocab_bow_non_stopwords.token2id.get,
                self.vocab_bow_stopwords.values()))
        self.vocab_bow_non_stopwords.compactify()
        remain_wc = np.sum(list(self.vocab_bow.dfs.values()))
        min_count = np.min(list(self.vocab_bow.dfs.values()))
        # create vocabulary list sorted by count
        print(
            "Load corpus with train size %d, "
            "test size %d raw vocab size %d vocab size %d at cut_off %d OOV rate %f"
            % (len(self.train_corpus), len(self.test_corpus), raw_vocab_size,
               len(self.vocab_bow), min_count, 1 - float(remain_wc) / raw_wc))

    def _read_file(self, path):
        with open(path, 'r') as f:
            data = json.load(f)
        return self._process_dialog(data["train"]), self._process_dialog(
            data["test"])

    def _sent2id_seq(self, sent, vocab):
        return list(
            filter(lambda x: x is not None,
                   [vocab.token2id.get(t) for t in sent]))

    def _sent2id_bow(self, sent, vocab):
        if sent:
            return vocab.doc2bow(sent)
        else:
            return []

    def _to_id_corpus(self, data, vocab_seq, vocab_bow):
        results = []
        word_cnt = 0
        msg_cnt = 0

        for dialog in data:
            # convert utterance and feature into numeric numbers
            id_dialog = Pack(title=self._sent2id_seq(dialog["title"],
                                                     vocab_seq),
                             op=self._sent2id_seq(dialog["op"], vocab_seq),
                             pos_conv_seq_lst=[],
                             pos_conv_bow_lst=[],
                             neg_conv_seq_lst=[],
                             neg_conv_bow_lst=[])
            for turns in dialog["pos_conv_lst"]:
                new_turns_bow = []
                new_turns_seq = []
                for turn in turns:
                    id_turn_seq = self._sent2id_seq(turn, vocab_seq)
                    id_turn_bow = self._sent2id_bow(turn, vocab_bow)
                    if id_turn_seq and id_turn_bow:  # filter empty utt
                        new_turns_bow.append(id_turn_bow)
                        new_turns_seq.append(id_turn_seq)
                        word_cnt += len(id_turn_seq)
                        msg_cnt += 1
                if new_turns_seq and new_turns_bow:
                    id_dialog["pos_conv_bow_lst"].append(new_turns_bow)
                    id_dialog["pos_conv_seq_lst"].append(new_turns_seq)
            for turns in dialog["neg_conv_lst"]:
                new_turns_bow = []
                new_turns_seq = []
                for turn in turns:
                    id_turn_seq = self._sent2id_seq(turn, vocab_seq)
                    id_turn_bow = self._sent2id_bow(turn, vocab_bow)
                    if id_turn_seq and id_turn_bow:  # filter empty utt
                        new_turns_bow.append(id_turn_bow)
                        new_turns_seq.append(id_turn_seq)
                        word_cnt += len(id_turn_seq)
                        msg_cnt += 1
                if new_turns_seq and new_turns_bow:
                    id_dialog["neg_conv_bow_lst"].append(new_turns_bow)
                    id_dialog["neg_conv_seq_lst"].append(new_turns_seq)
            if id_dialog.pos_conv_bow_lst and id_dialog.neg_conv_bow_lst:
                results.append(id_dialog)
        print("Load seq with %d msgs, %d words" % (msg_cnt, word_cnt))
        return results, msg_cnt, word_cnt

    def _to_id_corpus_bow(self, data, vocab):
        results = []
        word_cnt = 0
        msg_cnt = 0

        for dialog in data:
            # convert utterance and feature into numeric numbers
            id_dialog = Pack(title=self._sent2id_bow(dialog["title"], vocab),
                             op=self._sent2id_bow(dialog["op"], vocab),
                             pos_conv_bow_lst=[],
                             neg_conv_bow_lst=[])
            for turns in dialog["pos_conv_lst"]:
                new_turns = []
                for turn in turns:
                    id_turn = self._sent2id_bow(turn, vocab)
                    if id_turn:  # filter empty utt
                        new_turns.append(id_turn)
                        word_cnt += np.sum([j for i, j in id_turn])
                        msg_cnt += 1
                if new_turns:
                    id_dialog["pos_conv_bow_lst"].append(new_turns)
            for turns in dialog["neg_conv_lst"]:
                new_turns = []
                for turn in turns:
                    id_turn = self._sent2id_bow(turn, vocab)
                    if id_turn:  # filter empty utt
                        new_turns.append(id_turn)
                        word_cnt += np.sum([j for i, j in id_turn])
                        msg_cnt += 1
                if new_turns:
                    id_dialog["neg_conv_bow_lst"].append(new_turns)
            if id_dialog.pos_conv_bow_lst and id_dialog.neg_conv_bow_lst:
                results.append(id_dialog)
        print("Load bow with %d msgs, %d words" % (msg_cnt, word_cnt))
        return results, msg_cnt, word_cnt

    def get_corpus_bow(self, keep_stopwords=True):
        if keep_stopwords:
            vocab = self.vocab_bow
        else:
            vocab = self.vocab_bow_non_stopwords
        id_train = self._to_id_corpus_bow(self.train_corpus, vocab)
        id_test = self._to_id_corpus_bow(self.test_corpus, vocab)
        return Pack(train=id_train, test=id_test, vocab_size=len(vocab))

    def get_corpus_seq(self):
        vocab = self.vocab_seq

        id_train = self._to_id_corpus_seq(self.train_corpus, vocab)
        id_test = self._to_id_corpus_seq(self.test_corpus, vocab)
        return Pack(train=id_train, test=id_test, vocab_size=len(vocab))

    def get_corpus(self):
        id_train = self._to_id_corpus(self.train_corpus, self.vocab_seq,
                                      self.vocab_bow)
        id_test = self._to_id_corpus(self.test_corpus, self.vocab_seq,
                                     self.vocab_bow)
        # id_valid = self._to_id_corpus(self.valid_corpus, self.vocab_seq, self.vocab_bow)
        return Pack(train=id_train,
                    test=id_test,
                    vocab_size=len(self.vocab_bow))
Example #31
stop = set(stopwords.words('english'))
stop_words = [
    'name', 'traceback', 'time', 'require', 'create', 'yamanashi', 'int',
    'byte', 'lyt', 'still', 'thu', 'total', 'cisco', 'type', 'actual', 'node',
    'show', 'needed', 'init', 'clear', 'set', 'ok', 'please', 'jan', 'feb',
    'mar', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'mon', 'tue',
    'wed', 'thur', 'fri', 'sat', 'sun', 'utc', 'ist', 'changed', 'info',
    'saved', 'successfully', 'need', 'collecting', 'second', 'minute', 'hour',
    'timer', 'timed', 'manager', 'director', 'major', 'fujitsu', 'us', 'india',
    'united states', 'japan', 'china'
]
stop_words = list(set(list(stop) + stop_words))

for word in stop_words:
    if (word in dictionary.token2id):
        dictionary.filter_tokens(bad_ids=[dictionary.token2id[word]])

# Vectorize data
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# Set training parameters.
num_topics = 8
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index-to-word dictionary.
Example #32
        for indexOfWord in range(len(listOfParagraphs[paragraph])):
            listOfParagraphs[paragraph][indexOfWord] = stemmer.stem(listOfParagraphs[paragraph][indexOfWord])

   
# Making dictionary and removing stopwords
dictionary = Dictionary(listOfParagraphs)
f = codecs.open("common-english-words.txt", "r", "utf-8")
stopwords = f.read().split(',')
stopword_ids = []
for word in stopwords:
    try:
        stopwordid = dictionary.token2id[word]
        stopword_ids.append(stopwordid)
    except KeyError:
        continue
dictionary.filter_tokens(stopword_ids)
documentToBow = []
for para in listOfParagraphs:
    documentToBow.append(dictionary.doc2bow(para))

# Creating tfidf model, lsi model and matrices

tfidfModel = gensim.models.TfidfModel(documentToBow)
tfidfCorpus = tfidfModel[documentToBow]
tfidfMatrix = gensim.similarities.MatrixSimilarity(tfidfCorpus)
lsiModel = gensim.models.LsiModel(tfidfCorpus, id2word=dictionary, num_topics=100)
lsiCorpus = lsiModel[documentToBow]
lsiMatrix = gensim.similarities.MatrixSimilarity(lsiCorpus)
print("Report and try to interpret first 3 LSI topics: ")
topics = lsiModel.show_topics(3)
print("3 first lsi topics: ", topics)
Example #33
def build_dict(data_lst):
    dictionary = Dictionary(data_lst)
    dictionary.filter_tokens(list(map(dictionary.token2id.get, STOPWORDS)))
    dictionary.filter_extremes(no_below=3)  #, keep_n=10000)
    dictionary.compactify()
    return dictionary
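A brief, hedged usage example (toy data; STOPWORDS is presumably gensim.parsing.preprocessing.STOPWORDS, as used above):

data_lst = [
    ["graph", "theory", "and", "graph", "minors"],
    ["graph", "minors", "survey"],
    ["survey", "of", "graph", "theory"],
]
d = build_dict(data_lst)
print(d.token2id)  # only tokens appearing in at least 3 documents survive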