Example #1
def train(args, output_dir):
    """Build the corpus, trains the DTM, and saves the model to the output
    dir."""
    corpus = Corpus()

    # Create the dictionary.
    dictionary = Dictionary(corpus.debates.bag_of_words)
    dictionary.filter_extremes(no_below=100)

    # Save empirical term distribution within each time step.
    term_counts = corpus2csc(
        corpus.debates.groupby('year').agg({
            'bag_of_words': 'sum'
        }).bag_of_words.apply(dictionary.doc2bow))
    save_npz(os.path.join(output_dir, 'term_counts.npz'), term_counts)

    # Train and save dtm.
    time_slices = corpus.debates.groupby('year').size()
    dtm_corpus = corpus.debates.bag_of_words.apply(dictionary.doc2bow)
    model = Dtm(args.executable,
                corpus=dtm_corpus,
                id2word=dictionary,
                num_topics=args.num_topics,
                time_slices=time_slices.values,
                rng_seed=args.random_seed)
    model.save(os.path.join(output_dir, 'dtm.gensim'))
Example #2
class EnronCorpus(TextCorpus):
    def __init__(self,
                 root_name,
                 no_below=20,
                 keep_words=DEFAULT_DICT_SIZE,
                 dictionary=None):
        """
    Initialize the corpus. This scans through all the emails once, to determine the corpus
    vocabulary. (only the first `keep_words` most frequent words that appear in at least 
    `no_below` documents are kept).
    """
        self.root_name = root_name
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below,
                                            no_above=0.1,
                                            keep_n=keep_words)
        else:
            self.dictionary = dictionary

    def get_texts(self, return_raw=False):
        """
    Walk the file system, strip punctuation, normalize all numbers to be '2'.
    """
        filenames = walk_os(self.root_name)
        opened_files = gen_open(filenames)
        stripped_files = strip_punct(opened_files)
        length = 0
        for email in stripped_files:
            if len(email) > ARTICLE_MIN_CHARS:
                length += 1
                print('Iteration: %i' % length)
                yield tokenize(email)
        self.length = length  # cache corpus length
Example #3
def make_item_descriptions(max_sentence_length=None):
    descriptions = pd.read_csv(os.path.join(
        'data', 'descriptions.csv')).rename(columns={'movie': 'item'})
    texts = descriptions.description
    texts = texts.apply(lambda x: x.strip().split())
    dictionary = Dictionary(texts.values)
    dictionary.filter_extremes()
    eos_id = len(dictionary.keys())

    # to index list
    texts = texts.apply(
        lambda x: dictionary.doc2idx(x, unknown_word_index=eos_id))
    texts = texts.apply(lambda x: np.array([a for a in x if a != eos_id]))
    max_sentence_length = max(
        texts.apply(len)) if max_sentence_length is None else min(
            max(texts.apply(len)), max_sentence_length)

    # padding
    texts = texts.apply(lambda x: x[:max_sentence_length])
    texts = texts.apply(lambda x: np.pad(x, (0, max_sentence_length - len(x)),
                                         'constant',
                                         constant_values=(0, eos_id)))

    # change types
    texts = texts.apply(lambda x: x.astype(np.int32))
    descriptions.id = descriptions.id.astype(np.int32)

    return descriptions.id.values, texts.values, len(dictionary.keys()) + 1
Example #4
File: lda.py Project: msushkov/cs224w-wiki
def build_dictionary():
    dictionary = Dictionary()
    for line in open(wiki_index.ARTICLES_FILE):
        dictionary.add_documents([line.lower().split()])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.save(DICTIONARY_FILE)
    return dictionary
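Since every example on this page centers on Dictionary.filter_extremes, here is a minimal, self-contained sketch (toy data, not taken from any of the projects above) of what the call does: drop tokens that appear in fewer than no_below documents or in more than a no_above fraction of documents, then keep at most keep_n of the remaining tokens.

from gensim.corpora import Dictionary

# Toy corpus: "common" appears in every document, "rare" and "cherry" in only one.
texts = [
    ["common", "apple", "rare"],
    ["common", "apple", "banana"],
    ["common", "banana", "cherry"],
]
dictionary = Dictionary(texts)

# Keep tokens present in at least 2 documents and in at most 70% of documents,
# then cap the vocabulary at 100000 tokens (gensim's default keep_n).
dictionary.filter_extremes(no_below=2, no_above=0.7, keep_n=100000)

# "rare"/"cherry" (too infrequent) and "common" (too frequent) are filtered out.
print(sorted(dictionary.token2id))  # ['apple', 'banana']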
Example #5
class EnronCorpus(TextCorpus):
    def __init__(self, root_name, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
    Initialize the corpus. This scans through all the emails once, to determine the corpus
    vocabulary. (only the first `keep_words` most frequent words that appear in at least 
    `no_below` documents are kept).
    """
        self.root_name = root_name
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary

    def get_texts(self, return_raw=False):
        """
    Walk the file system, strip punctuation, normalize all numbers to be '2'.
    """
        filenames = walk_os(self.root_name)
        opened_files = gen_open(filenames)
        stripped_files = strip_punct(opened_files)
        length = 0
        for email in stripped_files:
            if len(email) > ARTICLE_MIN_CHARS:
                length += 1
                print "Iteration: %i" % length
                yield tokenize(email)
        self.length = length  # cache corpus length
Example #6
    def fill_dictionary(self, prune_at=2000000):
        """
        Update dictionary from a collection of documents. Each document is a list
        of tokens = **tokenized and normalized** strings (either utf8 or unicode).

        This is a convenience wrapper for calling `doc2bow` on each document
        with `allow_update=True`, which also prunes infrequent words, keeping the
        total number of unique words <= `prune_at`. This is to save memory on very
        large inputs. To disable this pruning, set `prune_at=None`.
        """
        if self.metadata:
            dictionary = Dictionary()

            for docno, item in enumerate(self.get_texts()):
                title, document = item
                self.titles.append(title)
                # log progress & run a regular check for pruning, once
                # every 10k docs
                if docno % 10000 == 0:
                    if prune_at is not None and len(dictionary) > prune_at:
                        dictionary.filter_extremes(no_below=0,
                                                   no_above=1.0,
                                                   keep_n=prune_at)
                    logger.info("adding document #%i to %s", docno, dictionary)

                # update Dictionary with the document
                dictionary.doc2bow(document, allow_update=True)

            logger.info(
                "built %s from %i documents (total %i corpus positions)",
                dictionary, dictionary.num_docs, dictionary.num_pos)

            return dictionary
        else:
            return Dictionary(self.get_texts())
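The docstring above describes building a Dictionary incrementally with doc2bow(..., allow_update=True) and pruning it once it grows past prune_at. A rough, standalone sketch of that pattern (the stream of tokenized documents is hypothetical):

from gensim.corpora import Dictionary

def build_dictionary_incrementally(token_stream, prune_at=2000000):
    """Grow a Dictionary one document at a time, pruning when it gets too large."""
    dictionary = Dictionary()
    for docno, tokens in enumerate(token_stream):
        # Updates the dictionary in place (and returns the BoW vector, unused here).
        dictionary.doc2bow(tokens, allow_update=True)
        # Check every 10k documents whether the vocabulary needs pruning.
        if docno % 10000 == 0 and prune_at is not None and len(dictionary) > prune_at:
            dictionary.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
    return dictionary

# Hypothetical usage with an in-memory iterable of tokenized documents:
docs = [["hello", "world"], ["hello", "gensim"]]
print(len(build_dictionary_incrementally(docs, prune_at=None)))  # 3 unique tokens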
Example #7
def main():
    docs = get_train(
        'D:/ByResearch/基于文本的原油油价预测/20200615code/code/SeaNMF-master/data/wedata.txt'
    )
    docs = [s.strip().split() for s in docs]

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=10, no_above=0.2)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Make a index to word dictionary.
    temp = dictionary[0]  # only to "load" the dictionary.
    id2word = dictionary.id2token

    PMI = []
    for i in range(2, 11):
        print(i)
        lda_model = LdaModel(corpus=corpus,
                             id2word=id2word,
                             iterations=100,
                             num_topics=i)
        # Print the keywords for each topic
        print(lda_model.print_topics())

        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=docs,
                                             dictionary=dictionary,
                                             coherence='c_uci')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)
        del lda_model
        PMI.append(coherence_lda)
    print(PMI)
Example #8
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\

    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None  # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
Example #10
def create_lda_model(data_csv, num_topics):
    custom_texts = []
    # for i in range(0, 30):
    for i in range(0, len(data_csv)):
        # add context vocab to dict
        context = data_csv['Context'][i]
        # hard-coded condition to train only contexts corresponding to first 20000 questions
        # if (context[: 42] == "Agricultural production is concentrated on"):
        #     break
        context = context.lower()
        context = context.replace("\'s", '')
        context = context.replace("\'", '')
        lst_words_context = re.findall(r"[\w']+|[.,!?;]", context)

        words = [w for w in lst_words_context
                 if not w in stop_words]  # remove stopwords
        words = [word for word in words
                 if word.isalpha()]  # remove punctuation
        custom_texts.append(words)

    custom_dict = Dictionary(custom_texts)
    custom_dict.filter_extremes(no_below=1, no_above=0.3)
    custom_corpus = [custom_dict.doc2bow(text) for text in custom_texts]

    # Train the model on the corpus.
    lda = models.LdaModel(custom_corpus,
                          num_topics=num_topics,
                          id2word=custom_dict)

    return lda, custom_dict
Example #11
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """


    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora: 
    :param stopwords: 
    :param allowed_pos: 
    :param max_doc: 
    :return: 
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print('\r', count, '/', corpus_num, end='')
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if not w in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print('\n')

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
Example #12
def create_gensim_dict_corpus(docs_raw, num_below, num_above, num_features):
    '''
    Create corpus to be used to determine optimal number of components using the gensim package.
    
    '''
    gensim_dict = Dictionary(docs_raw)
    gensim_dict.filter_extremes(no_below=num_below,
                                no_above=num_above,
                                keep_n=num_features)
    corpus = [gensim_dict.doc2bow(doc) for doc in docs_raw]

    return gensim_dict, corpus
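A hypothetical way to use the helper above, feeding the returned dictionary and corpus into LdaModel (the toy docs_raw and the parameter values are purely illustrative):

from gensim.models import LdaModel

docs_raw = [
    ["topic", "model", "corpus"],
    ["corpus", "dictionary", "filter"],
    ["topic", "dictionary", "gensim"],
]

# Keep tokens seen in at least 1 document and at most 90% of documents,
# capped at 10000 features.
gensim_dict, corpus = create_gensim_dict_corpus(docs_raw,
                                                num_below=1,
                                                num_above=0.9,
                                                num_features=10000)

lda = LdaModel(corpus=corpus, id2word=gensim_dict, num_topics=2, passes=5)
print(lda.print_topics())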
Example #13
    def _make_property(self, review_dict_list: list) -> tuple:
        """
        review_dict's keys are 'date', 'star', 'vote', 'name', 'title' and 'review' 
        """
        reviews = OrderedDict()
        for idx, review_dict in enumerate(review_dict_list):
            review = normalize(review_dict['review'])
            reviews[idx] = review

        text_list = [[
            term.word for term in self._tokenizer.get_baseforms(review)
        ] for review in reviews.values()]

        dictionary = Dictionary(text_list)
        dictionary.filter_extremes(no_below=1, no_above=0.6)
        corpus = [dictionary.doc2bow(words) for words in text_list]

        return corpus, dictionary
Example #14
def preprocess_text(docs):
    num_task = os.cpu_count()
    len_slices = len(docs) // num_task
    remainder_slices = len(docs) % num_task

    texts = []
    stoplist = set(stopwords.words('english'))
    
    wn.ensure_loaded()
    t_start = time.perf_counter()
    with ProcessPoolExecutor(max_workers=num_task) as executor:

        futures_tokenize = []
        for n in range(0, num_task):

            upper_bound = (n+1) * len_slices
            if n == num_task - 1:
                upper_bound = (n+1) * len_slices + remainder_slices

            print(n, upper_bound)
            futures_tokenize.append(executor.submit(preprocess_tokenize, docs[n * len_slices:upper_bound],
                            stoplist))

        for future in concurrent.futures.as_completed(futures_tokenize):
            texts += future.result()

    t_stop = time.perf_counter()
    print("removed stopwords and lemmatized in {} s".format(t_stop - t_start))
    # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
    bigram = Phraser(Phrases(texts, min_count=20))
    for idx in range(len(texts)):
        for token in bigram[texts[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                texts[idx].append(token)

    print("Done bigrams")
    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=30, no_above=0.5)
    dictionary.filter_tokens(bad_ids=[dictionary.token2id["like"]])
    special_tokens = {'_pad_': 0}
    dictionary.patch_with_special_tokens(special_tokens)

    return texts, dictionary
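In the snippet above, patch_with_special_tokens pins '_pad_' to id 0 so that 0 can later be reserved for padding; the token that previously held id 0 gets remapped. A minimal sketch of that behaviour on toy data:

from gensim.corpora import Dictionary

dictionary = Dictionary([["hello", "world"], ["hello", "gensim"]])
print(dictionary.token2id)           # e.g. {'hello': 0, 'world': 1, 'gensim': 2}

# Force '_pad_' onto id 0; whichever token held id 0 is moved to a free id.
dictionary.patch_with_special_tokens({'_pad_': 0})
print(dictionary.token2id['_pad_'])  # 0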
Example #15
    def compute_coherence_values(self, kmin, kmax, kstep):
        """
        Compute c_v coherence for various number of topics

        Parameters:
        ----------
        kmin : The minimum number of topics
        kmax : Max num of topics
        kstep : The step size of the topics

        Returns:
        -------
        k_values: The number of topics used. 
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
        topic_list: The list of topics. 
        """
        dictionary = Dictionary(self.docs)
        dictionary.filter_extremes(no_below=10, no_above=0.2)
        corpus = [dictionary.doc2bow(doc) for doc in self.docs]

        k_values = []
        coherence_values = []
        topic_list = []
        for num_topics in range(kmin, kmax + 1, kstep):
            # The following print line is so that you can visually see it go and don't freak out
            print("num_topics:\t" + str(num_topics))
            model = LdaModel(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics)
            coherencemodel = CoherenceModel(model=model,
                                            texts=self.docs,
                                            dictionary=dictionary,
                                            coherence='c_v')
            coherence_lda = coherencemodel.get_coherence()
            coherence_values.append(coherence_lda)
            topic_list.append(
                model.show_topics(num_topics=num_topics,
                                  num_words=20,
                                  log=False,
                                  formatted=True))
            k_values.append(num_topics)
        return k_values, coherence_values, topic_list
Example #16
def get_classif_perf(theta,
                     tokens,
                     labels,
                     embeds,
                     methods=['theta', 'lda', 's-bert', 'tfidf']):
    # print('Checking inputs dim for classif:', len(theta), len(labels))
    import pandas as pd
    perf = []

    if 'theta' in methods:
        X = theta
        perf.append(train_predict(X, labels))

    if 'lda' in methods:
        corpus = tokens.tolist()
        corpus = [[str(w) for w in d[0]] for d in corpus]
        dictionary = Dictionary(corpus)
        bow_corpus = [dictionary.doc2bow(x) for x in corpus]
        mod = LdaModel(bow_corpus, num_topics=theta.shape[1])
        transcorp = mod[bow_corpus]
        X = transcorp2matrix(transcorp, bow_corpus, theta.shape[1])
        perf.append(train_predict(X, labels))

    if 's-bert' in methods:
        from sklearn.decomposition import PCA
        X = PCA(n_components=theta.shape[1]).fit_transform(embeds)
        perf.append(train_predict(X, labels))

    if 'tfidf' in methods:
        corpus = tokens.tolist()
        corpus = [[str(w) for w in d[0]] for d in corpus]
        dictionary = Dictionary(corpus)
        dictionary.filter_extremes(keep_n=theta.shape[1])
        bow_corpus = [dictionary.doc2bow(x) for x in corpus]
        mod = TfidfModel(bow_corpus, dictionary=dictionary)
        corpus_tfidf = mod[bow_corpus]
        X = corpus2dense(corpus_tfidf, num_terms=theta.shape[1]).T
        perf.append(train_predict(X, labels))

    perf = pd.DataFrame(perf, index=methods)
    print('Model performance on classification:\n{}'.format(perf))
Example #17
def main():
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except:
        vector_model = LsiModel(corpus=RCV1BowCorpus(),
                                num_topics=100,
                                id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """
        Must return either numpy array or dictionary
        """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train,
                           train_targets=rcv1_train_target,
                           get_features=get_lsi_features,
                           classifier="sgd")

    evaluate_classifier(clf,
                        rcv1_test,
                        rcv1_test_target,
                        get_features=get_lsi_features)
Example #18
File: utils_gensim.py Project: wpli/ptr
class FolderCorpus(corpora.TextCorpus):
    def __init__(self, filepaths, preprocess=[], dictionary=None):
        self.filepaths = filepaths
        self.preprocess = preprocess
        self.metadata = None

        self.dictionary = Dictionary()

        self.dictionary.add_documents(self.get_texts())
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)
        self.dictionary.compactify()

    def get_texts(self):
        for path in self.filepaths:
            with codecs.open(path, encoding='utf8') as f:
                raw_text = f.read()
                raw_text = raw_text.lower()
                for filt in self.preprocess:
                    raw_text = filt(raw_text)
                text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
                yield text
Example #19
def further_preprocessing_phase(temp_data_frame):
    temp_data_frame['text'] = temp_data_frame['text'].apply(lambda text: th.tokenize_text(text) if text is not None else '')
    # textlist = temp_data_frame['text'].to_numpy()
    textlist = temp_data_frame['text'].tolist()

    # if this raises an exception, it could be due to empty texts
    patent_dictionary = Dictionary(textlist)
    corpus = [patent_dictionary.doc2bow(text) for text in textlist]

    print('original dictionary size: ', len(patent_dictionary))

    vocab_tf = {}
    for i in corpus:
        for item, count in dict(i).items():
            if item in vocab_tf:
                vocab_tf[item] += int(count)
            else:
                vocab_tf[item] = int(count)

    remove_ids = []
    no_of_ids_below_limit = 0
    for id, count in vocab_tf.items():
        if count <= 5:
            remove_ids.append(id)
    patent_dictionary.filter_tokens(bad_ids=remove_ids)

    patent_dictionary.filter_extremes(no_below=0)
    patent_dictionary.filter_n_most_frequent(30)

    print('parsed dictionary size: ', len(patent_dictionary))

    vocabulary = list(patent_dictionary.token2id.keys())

    ids_list = []
    data_frame = pd.DataFrame(columns=['patent_id', 'text', 'classification'])
    temp_data_frame.apply(lambda row : shrink_vocabulary(row, vocabulary, data_frame, ids_list), axis=1)
    print(len(ids_list))
    data_frame.set_index(data_frame['patent_id'], inplace=True)
    data_frame.drop(ids_list, axis=0, inplace=True)
    return data_frame
Example #20
def create_LDA_model(coursesList):
    warnings.filterwarnings('ignore')
    text_clean = [doc.split(' ') for doc in coursesList['description']]
    bigrams, trigrams = create_n_grams(text_clean)
    text_clean = add_n_grams(text_clean, bigrams, trigrams)

    id2word = Dictionary(text_clean)
    id2word.filter_extremes(no_below=5, no_above=0.45)
    corpus = [id2word.doc2bow(text) for text in text_clean]

    num_topics = config.num_lda_topic
    lda_model = LDA(corpus=corpus,
                    id2word=id2word,
                    num_topics=num_topics,
                    random_state=42,
                    alpha='asymmetric',
                    passes=25)
    lda_model.save("./best_model.lda")
    coherence_model_c_v = CoherenceModel(model=lda_model,
                                         texts=text_clean,
                                         dictionary=id2word,
                                         coherence='c_v')
    c_v = coherence_model_c_v.get_coherence()
    term_topic_mat = lda_model.get_topics()
    aver_cosine_similarities = 0
    for i in range(0, (num_topics - 1)):
        cosine_similarities = linear_kernel(term_topic_mat[i].reshape(1, -1),
                                            term_topic_mat[i + 1:]).flatten()
        aver_cosine_similarities += sum(cosine_similarities)
    if num_topics != 1:
        aver_cosine_similarities = aver_cosine_similarities / (
            num_topics * (num_topics - 1) / 2)
    print(c_v)
    print(aver_cosine_similarities)

    create_vector_topics(lda_model, corpus, id2word, coursesList)

    visual_data = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(visual_data, 'topics.html')
    return lda_model, id2word, bigrams, trigrams
Example #21
def LDA_model(corpus_Quran, corpus_NT, corpus_OT):
    # run LDA on the entire set of verses from all corpora
    total_corpus = corpus_Quran + corpus_NT + corpus_OT
    dictionary = Dictionary(total_corpus)
    dictionary.filter_extremes(no_below=50, no_above=0.1)
    corpus = [dictionary.doc2bow(text) for text in total_corpus]
    lda = LdaModel(corpus, num_topics=20, id2word=dictionary, random_state=1)
    # compute document-topic probability for Quran
    dictionary1 = Dictionary(corpus_Quran)
    dictionary1.filter_extremes(no_below=50, no_above=0.1)
    corpus1 = [dictionary1.doc2bow(text) for text in corpus_Quran]
    topics_Quran = lda.get_document_topics(corpus1)
    topic_dic_Quran = {}
    for doc in topics_Quran:
        for topic in doc:
            if topic[0] not in topic_dic_Quran.keys():
                topic_dic_Quran[topic[0]] = topic[1]
            else:
                topic_dic_Quran[topic[0]] += topic[1]
    # compute document-topic probability for OT
    dictionary2 = Dictionary(corpus_OT)
    dictionary2.filter_extremes(no_below=50, no_above=0.1)
    corpus2 = [dictionary2.doc2bow(text) for text in corpus_OT]
    topics_OT = lda.get_document_topics(corpus2)
    topic_dic_OT = {}
    for doc in topics_OT:
        for topic in doc:
            if topic[0] not in topic_dic_OT.keys():
                topic_dic_OT[topic[0]] = topic[1]
            else:
                topic_dic_OT[topic[0]] += topic[1]
    # compute document-topic probability for NT
    dictionary3 = Dictionary(corpus_NT)
    dictionary3.filter_extremes(no_below=50, no_above=0.1)
    corpus3 = [dictionary3.doc2bow(text) for text in corpus_NT]
    topics_NT = lda.get_document_topics(corpus3)
    topic_dic_NT = {}
    for doc in topics_NT:
        for topic in doc:
            if topic[0] not in topic_dic_NT.keys():
                topic_dic_NT[topic[0]] = topic[1]
            else:
                topic_dic_NT[topic[0]] += topic[1]
    for k, v in topic_dic_Quran.items():
        topic_dic_Quran[k] = v / len(corpus_Quran)
    for k, v in topic_dic_OT.items():
        topic_dic_OT[k] = v / len(corpus_OT)
    for k, v in topic_dic_NT.items():
        topic_dic_NT[k] = v / len(corpus_NT)
    return lda, topic_dic_Quran, topic_dic_NT, topic_dic_OT
Example #22
File: hn.py Project: imclab/HN_stats
class HNCorpus(TextCorpus):
    def __init__(self, hn_folder, dictionary=None):
        """
        Takes the HN folder of articles 
        as input and builds the dictionary and corpus
        """
        self.hn_folder = hn_folder
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=NO_BELOW, 
                    no_above=NO_ABOVE, keep_n=VOCAB_SIZE)
        else:
            self.dictionary = dictionary


    def get_texts(self):
        """
        Iterate over the HN articles returning text
        """
        positions, hn_articles = 0, 0

        # ************ HN articles ************
        fnamelist = []
        for g in glob.iglob(self.hn_folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist):
            hn_text = open(fname).read()
            hn_articles += 1
            if LEMMATIZE:
                result = utils.lemmatize(hn_text)
                positions += len(result)
                yield result
            else:
                result = tokenize(hn_text) # text into tokens here
                positions += len(result)
                yield result

        print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions))

        self.length = hn_articles # cache corpus length
Example #23
def training_vectorize(holder):
    #Vector uses BOW to store features of the corpus. Uses dictionary
    #for facilitating this operation. This is an important part of the
    #sequential vectorization

    # split the data
    holder.content = holder['content'].apply(lambda row: row.split())
    # make a dictionary
    dictionary = Dictionary(holder.content.tolist())

    # filter the dictionary
    dictionary.filter_extremes(no_above=0.8, no_below=5)
    dictionary.compactify()

    # transform the data with the dictionary
    holder["content"] = holder["content"].apply(
        lambda row: dictionary.doc2bow(row))

    # transform with tf-idf
    # tfidf = TfidfModel(holder["content"].tolist())
    # holder["content"] = holder["content"].apply(lambda col: tfidf[col])
    return holder, dictionary  #, tfidf
Example #24
class ArchiveCorpus(corpora.TextCorpus):

	def __init__(self, datafile, preprocess=[], dictionary=None):
		self.datafile = datafile
		self.preprocess = preprocess
		self.metadata = None

		if dictionary:
				self.dictionary = dictionary
		else:
				self.dictionary = Dictionary()
				if datafile is not None:
					self.dictionary.add_documents(self.get_texts())
					self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)


	def get_texts(self):
		with utils.smart_open(self.datafile) as inputfile:
			for line in inputfile:
				for f in self.preprocess:
					line = f(line)
				text = list(utils.tokenize(line, deacc=True, lowercase=True))
				yield text
Example #25
def preprocess(tweets):
    # Get only negative ones (for this task)
    newTweets = tweets.copy()

    newTweets = remove_airline_tags(newTweets)
    newTweets.text = remove_links(newTweets.text)
    newTweets.text = lt_gt_conversion(
        ampersand_conversion(arrow_conversion(newTweets.text)))
    newTweets.text = with_without_conversion(newTweets.text)
    newTweets.text = hashtag_to_words(newTweets.text)
    newTweets = translate_all_emoji(newTweets)
    newTweets.text = remove_contractions(newTweets.text)
    newTweets.text = remove_punctuation(newTweets.text)
    newTweets.text = lemmatize_texts(newTweets.text)
    newTweets.text = remove_stopwords(newTweets.text)
    newTweets.text = newTweets.text.str.lower()
    texts = newTweets["text"].values

    # Tokenize and remove short words or filtered words
    tokenized_texts = []
    for text in texts:
        split_text = text.split()
        split_text = [
            word for word in split_text
            if len(word) > 2 and word not in FILTERED_WORDS
        ]
        tokenized_texts.append(split_text)

    # Create a dictionary for each word, and a bag of words
    text_dictionary = Dictionary(tokenized_texts)

    # Remove words that appear in fewer than 5 documents or in more than 50% of
    # the documents, and keep at most half of the original vocabulary
    text_dictionary.filter_extremes(no_below=5,
                                    no_above=0.5,
                                    keep_n=len(text_dictionary) // 2)
    text_corpus = [text_dictionary.doc2bow(text) for text in tokenized_texts]
    return (text_dictionary, text_corpus)
Example #26
    def buildDict(self):
        batchiter = BatchIterBert(self.trainDataIter,
                                  filling_last_batch=False,
                                  postProcessor=xonlyBatchProcessor,
                                  batch_size=1)
        common_dictionary = Dictionary(batchiter)
        print(len(common_dictionary))
        if self.testReaderargs:
            print('update vocab from test set')
            batchiter = BatchIterBert(self.testDataIter,
                                      filling_last_batch=False,
                                      postProcessor=xonlyBatchProcessor,
                                      batch_size=1)
            common_dictionary.add_documents(batchiter)
            print(len(common_dictionary))

        common_dictionary.filter_extremes(no_below=self.dict_no_below,
                                          no_above=self.dict_no_above,
                                          keep_n=self.dict_keep_n)
        self.dictProcess = DictionaryProcess(common_dictionary)
        self.postProcessor.dictProcess = self.dictProcess
        self.vocab_dim = len(self.dictProcess)
        self.have_dict = True

        if 1:
            count_list = []
            self.trainDataIter._reset_iter()
            batchiter = BatchIterBert(self.trainDataIter,
                                      filling_last_batch=False,
                                      postProcessor=xonlyBatchProcessor,
                                      batch_size=1)
            for item in batchiter:
                current_count = sum(item)
                count_list.append(current_count)
                #print(current_count)
            print(sum(count_list) / len(count_list))
Example #27
def train(docs):
    num_topics = lda_cfg("topics")
    epochs = lda_cfg("epochs")
    label = f'{datetime.now().isoformat(".", timespec="minutes")}({num_topics}-topics,{epochs}-epochs)'

    log_path = config("path.lda-log").format(label)
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    logging.basicConfig(filename=log_path,
                        format='%(asctime)s : %(levelname)s : %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.INFO)

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=lda_cfg("word-extremes.min-count"),
                               no_above=lda_cfg("word-extremes.max-freq"))
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    model = LdaMulticore(corpus,
                         id2word=dictionary,
                         num_topics=num_topics,
                         passes=epochs,
                         eval_every=lda_cfg.dict_like.get("eval-every"),
                         chunksize=lda_cfg("chunk-size"))

    return label, model, dictionary, corpus
Example #28
def evaluate(docs):
    # global docs
    # Perform function on our document
    docs = docs_preprocessor(docs)
    # Create bigram & trigram models
    from gensim.models import Phrases
    if __name__ == "__main__":
        # Add bigrams and trigrams to docs; min_count=10 keeps only n-grams that appear 10 times or more.
        bigram = Phrases(docs, min_count=10)
        trigram = Phrases(bigram[docs])

        for idx in range(len(docs)):
            for token in bigram[docs[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    docs[idx].append(token)
            for token in trigram[docs[idx]]:
                if '_' in token:
                    # Token is a trigram (or bigram), add to document.
                    docs[idx].append(token)
        # Remove rare & common tokens
        # Create a dictionary representation of the documents.
        dictionary = Dictionary(docs)
        dictionary.filter_extremes(no_below=10, no_above=0.2)
        # Create dictionary and corpus required for Topic Modeling
        corpus = [dictionary.doc2bow(doc) for doc in docs]
        print('Number of unique tokens: %d' % len(dictionary))
        print('Number of documents: %d' % len(corpus))
        print(corpus[:1])

        # Set parameters.
        num_topics = 20
        chunksize = 500
        passes = 20
        iterations = 400
        eval_every = 1

        # Make a index to word dictionary.
        temp = dictionary[0]  # only to "load" the dictionary.
        id2word = dictionary.id2token

        lda_model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                             alpha='auto', eta='auto', \
                             iterations=iterations, num_topics=num_topics, \
                             passes=passes, eval_every=eval_every)
        # Print the keywords in the 20 topics
        print(lda_model.print_topics())

        # Compute Coherence Score using c_v
        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=docs,
                                             dictionary=dictionary,
                                             coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)

        # Compute Coherence Score using UMass
        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=docs,
                                             dictionary=dictionary,
                                             coherence="u_mass")
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)
        """
               Compute c_v coherence for various number of topics
    
               Parameters:
               ----------
               dictionary : Gensim dictionary
               corpus : Gensim corpus
               texts : List of input texts
               limit : Max num of topics
    
               Returns:
               -------
               model_list : List of LDA topic models
               coherence_values : Coherence values corresponding to the LDA model with respective number of topics
               """

        model_list, coherence_values = compute_coherence_values(
            dictionary=dictionary,
            corpus=corpus,
            texts=docs,
            start=2,
            limit=40,
            step=6)
        # Show graph
        import matplotlib.pyplot as plt

        limit = 40
        start = 2
        step = 6
        x = range(start, limit, step)
        plt.plot(x, coherence_values)
        plt.xlabel("Num Topics")
        plt.ylabel("Coherence score")
        plt.legend(("coherence_values"), loc='best')
        plt.show()

        return coherence_lda
Example #29
from multiprocessing import Pool
from functools import partial
import math
import numpy as np

# use the newsgroup data as corpus
df = pd.read_json(
    "https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json"
)
documents = df.content.tolist()
documents = preprocess_documents(documents)

# fit an LDA model, n_topic = 5
news_dictionary = Dictionary(documents)
news_dictionary.filter_extremes(no_below=5,
                                no_above=0.5,
                                keep_n=5000,
                                keep_tokens=None)
corpus = [news_dictionary.doc2bow(text) for text in documents]
lda = gensim.models.LdaModel(corpus, num_topics=5, id2word=news_dictionary)

lda.show_topics()

# convert gensim corpus to a sparse document-term matrix for coherence measure
corpus_dense = gensim.matutils.corpus2csc(corpus,
                                          num_terms=len(
                                              news_dictionary.keys()))
corpus_dense = corpus_dense.astype(int)
corpus_dense = corpus_dense.transpose()
print(corpus_dense.shape)

Example #30
    return total


df = pd.read_csv("data_lda_final.csv", header=[0], sep='\t')

clear = df["tokenized"].tolist()
clear = [str(i) for i in clear]
clear = trial_docs_preprocessor(clear)
docs = clear

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
print('Number of unique words in initial documents:', len(dictionary))

# Filter out words that occur in fewer than 3 documents or in more than 70% of the documents.
dictionary.filter_extremes(no_below=3, no_above=0.70)
print('Number of unique words after removing rare and common words:',
      len(dictionary))

corpus = [dictionary.doc2bow(doc) for doc in docs]
# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

num_topics = 30
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     alpha='auto',
                     eta='auto',
                     num_topics=num_topics)
Example #31
File: LDA_.py Project: Joker-Chan/IE3320
from gensim.models.ldamodel import LdaModel

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
import os

import numpy as np
import matplotlib.pyplot as plt
num_topics = 10
f = open("./data/weibo_nof_vec.txt", "r", encoding='utf-8')

texts = [document.split() for document in f]

dictionary = Dictionary(texts)

dictionary.filter_extremes(no_above=0.2)


corpus = [dictionary.doc2bow(text) for text in texts]

print('done with corpus.')

lda = LdaModel(corpus, id2word=dictionary, iterations=100, num_topics=10)

#print(lda.print_topics())
for i in range(0,num_topics):
    print("-----------------------------------")
    print(lda.print_topic(i))

#lda.save('lda')
topic = []
Example #32
class WikiCorpus(TextCorpus):
    """
    Treat a wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.

    The documents are extracted on-the-fly, so that the whole (massive) dump
    can stay compressed on disk.

    >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
    >>> wiki.saveAsText('wiki_en_vocab200k') # another 8h, creates a file in MatrixMarket format plus file with id->word

    """
    def __init__(self, fname, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
        Initialize the corpus. This scans the corpus once, to determine its
        vocabulary (only the first `keep_words` most frequent words that
        appear in at least `no_below` documents are kept).
        """
        self.fname = fname
        if keep_words is None:
            keep_words = DEFAULT_DICT_SIZE
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary


    def get_texts(self, return_raw=False):
        """
        Iterate over the dump, returning text version of each article.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """
        articles, articles_all = 0, 0
        intext, positions = False, 0
        if LEMMATIZE:
            lemmatizer = utils.lemmatizer
            yielded = 0

        for _, text in _extract_pages(bz2.BZ2File(self.fname)):
            text = filter_wiki(text)
            articles_all += 1
            if len(text) > ARTICLE_MIN_CHARS: # article redirects are pruned here
                articles += 1
                if return_raw:
                    result = text
                    yield result
                else:
                    if LEMMATIZE:
                        _ = lemmatizer.feed(text)
                        while lemmatizer.has_results():
                            _, result = lemmatizer.read() # not necessarily the same text as entered above!
                            positions += len(result)
                            yielded += 1
                            yield result
                    else:
                        result = tokenize(text) # text into tokens here
                        positions += len(result)
                        yield result

        if LEMMATIZE:
            logger.info("all %i articles read; waiting for lemmatizer to finish the %i remaining jobs" %
                        (articles, articles - yielded))
            while yielded < articles:
                _, result = lemmatizer.read()
                positions += len(result)
                yielded += 1
                yield result

        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
                     " (total %i articles before pruning)" %
                     (articles, positions, articles_all))
        self.length = articles # cache corpus length
Example #33
File: preprocess.py Project: yakzan/ktext
class processor(processor_base):
    """
    Pre-process text in memory.

    Includes utilities for cleaning, tokenization, and vectorization in parallel.
    """
    def __init__(self,
                 hueristic_pct_padding: float = .90,
                 append_indicators: bool = False,
                 keep_n: int = 150000,
                 padding: str = 'pre',
                 padding_maxlen: Union[int, None] = None,
                 truncating: str = 'post'):
        """
        Parameters:
        ----------
        hueristic_pct_padding: float
            This parameter is only used if `padding_maxlen` = None.  A histogram
            of document lengths is calculated, and the maxlen is set to the
            `hueristic_pct_padding` percentile of that histogram.
        append_indicators: bool
            If True, will append the tokens '_start_' and '_end_' to the beginning
            and end of your tokenized documents.  This can be useful when training
            seq2seq models.
        keep_n: int = 150000
            This is the maximum size of your vocabulary (unique number of words
            allowed).  Consider limiting this to a reasonable size based upon
            your corpus.
        padding : str
            'pre' or 'post', pad either before or after each sequence.
        padding_maxlen : int or None
            Maximum sequence length, longer sequences are truncated and shorter
            sequences are padded with zeros at the end.  Note if this is specified,
            the `hueristic_pct_padding` is ignored.
        truncating : str
            'pre' or 'post', remove values from sequences larger than padding_maxlen
            either in the beginning or in the end of the sequence.

        See https://keras.io/preprocessing/sequence/

        Attributes:
        -----------
        vocabulary : gensim.corpora.dictionary.Dictionary
            This is a gensim object that is built after parsing all the tokens
            in your corpus.
        n_tokens : int
            The total number of tokens in the corpus.  Will be less than or
            equal to keep_n
        id2token : dict
            dict with {int: str} ex: {2: 'the', 3: 'cat'}
            this is used for decoding predictions back to tokens
        token2id : dict
            dict with {str: int} ex: {'the': 2, 'cat': 3}
            this is used for converting tokens to integers
        document_length_stats : pandas.DataFrame
            histogram of document lengths.  Can be used to decide padding_maxlen.
        """
        super().__init__()
        self.hueristic_pct = hueristic_pct_padding
        self.append_indicators = append_indicators
        self.keep_n = keep_n
        self.padding = padding
        self.padding_maxlen = padding_maxlen
        self.truncating = truncating

        # These are placeholders for data that will be collected or calculated
        self.vocabulary = Dictionary()
        self.n_tokens = None
        self.id2token = None
        self.token2id = None
        self.document_length_histogram = Counter()
        self.document_length_stats = None
        self.doc_length_huerestic = None

        # These values are 'hardcoded' for now
        self.padding_value = 0.0
        self.padding_dtype = 'int32'
        self.start_tok = '_start_'
        self.end_tok = '_end_'
        self.keep_tokens = [self.start_tok, self.end_tok]

    def process_text(self, text: List[str]) -> List[List[str]]:
        """Combine the cleaner and tokenizer."""
        return self.__apply_tokenizer(self.__apply_cleaner(text))

    def __apply_cleaner(self, data: List[str]) -> List[str]:
        """Apply the cleaner over a list."""
        return [self.cleaner(doc) for doc in data]

    def __apply_tokenizer(self, data: List[str]) -> List[List[str]]:
        """Apply the tokenizer over a list."""
        if self.append_indicators:
            tmp = [[self.start_tok] + self.tokenizer(doc) + [self.end_tok]
                   for doc in data]
            return tmp
        else:
            return [self.tokenizer(doc) for doc in data]

    def parallel_process_text(self, data: List[str]) -> List[List[str]]:
        """Apply cleaner -> tokenizer."""
        return apply_parallel(data, self.process_text)

    def generate_doc_length_stats(self):
        """Analyze document length statistics for padding strategy"""
        hueristic = self.hueristic_pct
        histdf = (pd.DataFrame(
            [(a, b) for a, b in self.document_length_histogram.items()],
            columns=['bin', 'doc_count']).sort_values(by='bin'))
        histdf['cumsum_pct'] = histdf.doc_count.cumsum(
        ) / histdf.doc_count.sum()

        self.document_length_stats = histdf
        self.doc_length_huerestic = histdf.query(
            f'cumsum_pct >= {hueristic}').bin.head(1).values[0]
        logging.warning(' '.join([
            "Setting maximum document length to",
            f'{self.doc_length_huerestic} based upon',
            f'hueristic of {hueristic} percentile.\n',
            'See full histogram by inspecting the',
            "`document_length_stats` attribute."
        ]))
        self.padding_maxlen = self.doc_length_huerestic

    def fit(self,
            data: List[str],
            return_tokenized_data: bool = False,
            no_below: int = 100,
            no_above: float = .9) -> Union[None, List[List[str]]]:
        """
        TODO: update docs

        Apply cleaner and tokenizer to raw data and build vocabulary.

        Parameters
        ----------
        data : List[str]
            These are raw documents, which are a list of strings. ex:
            [["The quick brown fox"], ["jumps over the lazy dog"]]
        return_tokenized_data : bool
            Return the tokenized strings.  This is primarily used for debugging
            purposes.
        no_below : int
            See below explanation
        no_above : float
            See below explanation

        When tokenizing documents, filter tokens according to these rules:
        1. occur less than `no_below` documents (absolute number) or
        2. occur more than `no_above` documents (fraction of total corpus size, not absolute number).
        3. after (1), and (2), keep only the first keep_n most frequent tokens.

        Returns
        -------
        None or List[List[str]]
            if return_tokenized_data=True then will return tokenized documents,
            otherwise will not return anything.

        This method heavily leverages gensim https://radimrehurek.com/gensim/corpora/dictionary.html
        """
        now = get_time()
        logging.warning(f'....tokenizing data')
        tokenized_data = list(
            chain.from_iterable(self.parallel_process_text(data)))

        if not self.padding_maxlen:
            document_len_counters = apply_parallel(tokenized_data, count_len)

            for doc_counter in document_len_counters:
                self.document_length_histogram.update(doc_counter)
            self.generate_doc_length_stats()

        # chunk the data manually for corpus build and pass to the build corpus method
        logging.warning(f'(1/3) done. {time_diff(now)} sec')
        logging.warning(f'....building corpus')
        now = get_time()
        corpus = build_corpus(tokenized_data)

        # Merge the corpuses from each thread together, this is like a "reduce" step
        logging.warning(f'(2/3) done. {time_diff(now)} sec')
        logging.warning(f'....consolidating corpus')
        now = get_time()
        self.vocabulary.merge_with(corpus)

        # # get rid of rare tokens from corpus such that they will get the same id
        self.vocabulary.filter_extremes(no_below,
                                        no_above,
                                        self.keep_n,
                                        keep_tokens=self.keep_tokens)

        # compactify the ids for each word
        self.vocabulary.compactify()

        # Shift the ids by 2: reserve 0 for padding and 1 for unknown and rare words
        self.token2id = dict([(k, v + 2)
                              for k, v in self.vocabulary.token2id.items()])
        self.id2token = dict([(v, k) for k, v in self.token2id.items()])
        self.n_tokens = len(self.id2token.keys())

        # logging
        logging.warning(f'(3/3) done. {time_diff(now)} sec')
        logging.warning(
            f'Finished parsing {self.vocabulary.num_docs:,} documents.')

        if return_tokenized_data:
            return tokenized_data

    def token_count_pandas(self):
        """ See token counts as pandas dataframe"""
        freq_df = pd.DataFrame(
            [b for a, b in self.vocabulary.dfs.items()],
            index=[a for a, b in self.vocabulary.dfs.items()],
            columns=['count'])

        id2tokens = [(b, a) for a, b in self.vocabulary.token2id.items()]

        token_df = pd.DataFrame([b for a, b in id2tokens],
                                index=[a for a, b in id2tokens],
                                columns=['token'])

        return freq_df.join(token_df).sort_values('count', ascending=False)

    def fit_transform(self,
                      data: List[str],
                      no_below: int = 25,
                      no_above: float = 0.8) -> List[List[int]]:
        """
        Apply cleaner and tokenizer to raw data, build vocabulary and return
        the transformed dataset as a List[List[int]].  This will use
        process-based-threading on all available cores.

        ex:
        >>> data = [["The quick brown fox"], ["jumps over the lazy dog"]]
        >>> pp = preprocess(maxlen=5, no_below=0)
        >>> pp.fit_transform(data)
        # 0 padding is applied
        [[0, 2, 3, 4, 5], [6, 7, 2, 8, 9]]

        Parameters
        ----------
        data : List[str]
            These are raw documents, which are a list of strings. ex:
            [["The quick brown fox"], ["jumps over the lazy dog"]]
        no_below : int
            See below explanation
        no_above : float
            See below explanation

        When tokenizing documents, filter tokens according to these rules:
        1. occur less than `no_below` documents (absolute number) or
        2. occur more than `no_above` documents (fraction of total corpus size, not absolute number).
        3. after (1), and (2), keep only the first keep_n most frequent tokens.

        Returns
        -------
        numpy.array with shape (number of documents, max_len)


        This method leverages gensim https://radimrehurek.com/gensim/corpora/dictionary.html
        """
        tokdata = self.fit(data,
                           return_tokenized_data=True,
                           no_below=no_below,
                           no_above=no_above)

        logging.warning(f'...fit is finished, beginning transform')
        now = get_time()
        vec_data = self.vectorize_parallel(tokdata)
        logging.warning(f'done. {time_diff(now)} sec')
        return vec_data

    def transform(self, data: List[str]) -> List[List[int]]:
        """
        Transform List of documents into List[List[int]]
        If transforming a large number of documents consider using the method
        `transform_parallel` instead.

        ex:
        >> pp = processor()
        >> pp.fit(docs)
        >> new_docs = [["The quick brown fox"], ["jumps over the lazy dog"]]
        >> pp.transform(new_docs)
        [[1, 2, 3, 4], [5, 6, 1, 7, 8]]
        """
        return self.vectorize(self.process_text(data))

    def transform_parallel(self, data: List[str]) -> List[List[int]]:
        """
        Transform List of documents into List[List[int]].  Uses process based
        threading on all available cores.  If only processing a small number of
        documents ( < 10k ) then consider using the method `transform` instead.

        ex:
        >> pp = processor()
        >> pp.fit(docs)
        >> new_docs = [["The quick brown fox"], ["jumps over the lazy dog"]]
        >> pp.transform_parallel(new_docs)
        [[1, 2, 3, 4], [5, 6, 1, 7, 8]]
        """
        return np.vstack(apply_parallel(data, self.transform))

    def get_idx(self, token: str) -> int:
        """Get integer index from token."""
        # Return the index for the token; if not found, return the out-of-vocabulary index, which is 1.
        return self.token2id.get(token, 1)

    def __vec_one_doc(self, doc: List[str]) -> List[int]:
        """
        Vectorize a single tokenized document.
        ex: ['hello', 'world']
        """
        return [self.get_idx(tok) for tok in doc]

    def vectorize(self, docs: List[List[str]]) -> List[List[int]]:
        """
        Vectorize and apply padding to a set of tokenized documents.
        ex: [['hello', 'world'], ['goodbye', 'now']]

        """
        # First apply indexing to all the rows, then pad_sequences (this proved
        # faster than doing both steps row by row).
        return pad_sequences(list(map(self.__vec_one_doc, docs)),
                             maxlen=self.padding_maxlen,
                             dtype=self.padding_dtype,
                             padding=self.padding,
                             truncating=self.truncating,
                             value=self.padding_value)

    def vectorize_parallel(self, data: List[List[str]]) -> np.array:
        """
        Apply token -> idx mappings in parallel and apply padding.

        Arguments:
        data: List of List of strings
        """
        indexed_data = apply_parallel(data, self.vectorize)
        # concatenate list of arrays vertically
        return np.vstack(indexed_data)
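
# A minimal standalone sketch (not part of the class above) of the
# vectorize-then-pad pattern it implements, assuming Keras' pad_sequences;
# the toy token2id mapping below is an assumption for illustration.
from tensorflow.keras.preprocessing.sequence import pad_sequences

toy_token2id = {'the': 2, 'quick': 3, 'brown': 4, 'fox': 5}
toy_docs = [['the', 'quick', 'brown', 'fox'], ['the', 'lazy', 'dog']]

# unknown tokens fall back to the out-of-vocabulary index 1, as in get_idx()
indexed = [[toy_token2id.get(tok, 1) for tok in doc] for doc in toy_docs]
padded = pad_sequences(indexed, maxlen=5, padding='pre', value=0)
# padded -> [[0, 2, 3, 4, 5],
#            [0, 0, 2, 1, 1]]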
Example #34
0
    dense_corpus_sent2vec = np.array(feature_vector)
    target_vector = np.array(target_vector)

    print()
    print(dense_corpus_sent2vec.shape)
    print(target_vector.shape)

    print(len(docs_lemma))
    print(len(docs_pos))

    print('Make Dictionary')
    dictionary_lemma = Dictionary(docs_lemma)
    dictionary_pos = Dictionary(docs_pos)
    print('Number of unique pos: %d' % len(dictionary_pos))
    dictionary_lemma.filter_extremes(
        no_below=10,
        no_above=0.2,
        keep_tokens=trigger_words if allow_tw else None)
    print('Number of unique lemma: %d' % len(dictionary_lemma))

    lemma_bigrams = list()
    for d in docs_lemma:
        lemma_bigram = [
            f"{bigram[0]}_{bigram[1]}" for bigram in list(ngrams(d, 2))
        ]
        lemma_bigrams.append(lemma_bigram)

    pos_bigrams = list()
    for d in docs_pos:
        pos_bigram = [
            f"{bigram[0]}_{bigram[1]}" for bigram in list(ngrams(d, 2))
        ]
        pos_bigrams.append(pos_bigram)
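
    # Illustrative continuation (an assumption, not part of the original
    # snippet): the bigram lists can be wrapped in their own Dictionary
    # objects just like docs_lemma and docs_pos above; the filter_extremes
    # thresholds mirror the lemma dictionary and are assumptions.
    dictionary_lemma_bigram = Dictionary(lemma_bigrams)
    dictionary_lemma_bigram.filter_extremes(no_below=10, no_above=0.2)
    dictionary_pos_bigram = Dictionary(pos_bigrams)
    print('Number of unique lemma bigrams: %d' % len(dictionary_lemma_bigram))
    print('Number of unique pos bigrams: %d' % len(dictionary_pos_bigram))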
Example #35
0
def createDictionary(texts):
    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=2, no_above=0.4, keep_n=1000000)
    dictionary.compactify()
    return dictionary
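
# A minimal usage sketch; the toy `texts` below (a list of tokenized
# documents) is an assumption for illustration.
texts = [['human', 'computer', 'interaction'],
         ['graph', 'minors', 'survey'],
         ['human', 'computer', 'survey']]
dictionary = createDictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]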
Example #36
0
                    help='File name to give the dictionary upon saving')

args = parser.parse_args()

input_path = args.input_path
output_name = args.output_name
CHUNK_SIZE = args.chunk_size

# Stream in documents from path
rdr = lmd.Reader(input_path)
gnr = rdr.stream_data(get_meta=True)

# Build a dictionary out of the validation documents
dictionary = Dictionary()
docs = rdr.stream_data(threaded=True)
doc_chunks = chunks(docs, size=CHUNK_SIZE)
# Progress in chunks
for chunk in doc_chunks:
    print("Adding ", CHUNK_SIZE, " docs")
    tokenized = [[
        tok.lower_ for tok in doc if not tok.is_stop and tok.is_alpha
    ] for doc in tokenizer.pipe(
        [item for item in chunk if language(item) == 'en'],
        batch_size=CHUNK_SIZE)]
    dictionary.add_documents(tokenized)

# Keep only 2**16 most frequent tokens
dictionary.filter_extremes(keep_n=2**16)
dictionary.compactify()
dictionary.save(output_name)
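
# A hedged follow-up sketch: the saved dictionary can be reloaded later and
# used to vectorize new (already tokenized) documents; the token list below
# is an assumption for illustration.
from gensim.corpora import Dictionary

reloaded = Dictionary.load(output_name)
bow = reloaded.doc2bow(['sample', 'validation', 'document'])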
Example #37
0
    trigram = Phrases(bigram[docs])

    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)
        for token in trigram[docs[idx]]:
            if '_' in token:
                # Token is a trigram (or bigram), add to document.
                docs[idx].append(token)

    # Remove rare & common tokens
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=10, no_above=0.2)
    # Create dictionary and corpus required for Topic Modeling
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))
    print(corpus[:1])

    # Train the model
    print("Stage 2: Train the model.")
    # Set parameters.
    num_topics = 5
    chunksize = 500
    passes = 10  # 20
    iterations = 100  # 400

    # Make an index-to-word dictionary.
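
    # A hedged sketch of the training step the snippet stops short of,
    # assuming gensim's LdaModel; the alpha/eta/eval_every settings below
    # are assumptions rather than part of the original code.
    from gensim.models import LdaModel

    temp = dictionary[0]  # touch the dictionary so id2token is populated
    id2word = dictionary.id2token
    model = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics,
                     chunksize=chunksize, passes=passes, iterations=iterations,
                     alpha='auto', eta='auto', eval_every=None)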
Example #38
0
class CDS_Corpus(TextCorpus):
    def __init__(self, folder, dictionary=None):
        """
        Takes the list of txt files in a folder from Isabelle 
        as input and builds the dictionary and corpus
        """
        self.folder = folder
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=NO_BELOW, 
                    no_above=NO_ABOVE, keep_n=VOCAB_SIZE)
        else:
            self.dictionary = dictionary


    def get_texts(self):
        """
        Iterate over the "documents" (sessions/places) returning text
        """
        filter_words = set()
        if FILTER_WORDS:
            filter_words = []
            with open(FILTER_WORDS) as f:
                for line in f:
                    filter_words.append(line.rstrip('\n'))
            filter_words = set(filter_words)
            #print "the following words will be filtered", filter_words
        filter_words_add = set()
        if FILTER_WORDS_ADD:
            filter_words_add = []
            with open(FILTER_WORDS_ADD) as f:
                for line in f:
                    filter_words_add.append(line.rstrip('\n'))
            filter_words_add = set(filter_words_add)

        positions, hn_articles = 0, 0
        fnamelist = []
        docs = 0
        for g in glob.iglob(self.folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist):
            with open(fname) as f:
                text = ""
                for line in f:
                    if line[0] != '@':
                        #sentence = re.sub('\d+', '', line.rstrip('\n').strip('\t').replace('\n', '')).split(' ')
                        sentence = tokenize(re.sub('\d+', '', line.rstrip('\n').strip('\t').replace('\n', '')))
                        for ind, word in enumerate(sentence):
                            w = word.lower().rstrip(' ').strip(' ').strip('\t')
                            sentence[ind] = w
                        if FILTER_WORDS:
                            for ind, word in enumerate(sentence):
                                if word.upper() in filter_words:
                                    sentence[ind] = ''
                        if FILTER_WORDS_ADD:
                            for ind, word in enumerate(sentence):
                                if word in filter_words_add:
                                    sentence[ind] = ''
                        text += ' '.join(sentence) + '\n'
                    else:
                        docs += 1
                        if LEMMATIZE:
                            result = lemmatizer(text)
                            positions += len(result)
                            yield result
                        else:
                            result = tokenize(text) # text into tokens here
                            positions += len(result)
                            yield result
                        text = ""
                docs += 1
                if LEMMATIZE:
                    result = lemmatizer(text)
                    positions += len(result)
                    yield result
                else:
                    result = tokenize(text) # text into tokens here
                    positions += len(result)
                    yield result

        print (">>> finished iterating over the corpus of %i documents with %i positions" % (docs, positions))

        self.length = docs # cache corpus length
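
# A hedged usage sketch: as a TextCorpus subclass, iterating over CDS_Corpus
# yields bag-of-words vectors. The folder path and output file names are
# placeholders, and MmCorpus is assumed to be imported from gensim.corpora.
from gensim.corpora import MmCorpus

corpus = CDS_Corpus('/path/to/isabelle_txt_folder')
corpus.dictionary.save_as_text('cds_wordids.txt')
MmCorpus.serialize('cds_bow.mm', corpus)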
Example #39
0
class WikiCorpus(TextCorpus):
    """
    Treat a wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.

    The documents are extracted on-the-fly, so that the whole (massive) dump
    can stay compressed on disk.

    >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
    >>> wiki.saveAsText('wiki_en_vocab200k') # another 8h, creates a file in MatrixMarket format plus file with id->word

    """

    def __init__(self, fname, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
        Initialize the corpus. This scans the corpus once, to determine its
        vocabulary (only the first `keep_words` most frequent words that
        appear in at least `no_below` documents are kept).
        """
        self.fname = fname
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary

    def get_texts(self, return_raw=False):
        """
        Iterate over the dump, returning text version of each article.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print vec
        """
        articles, articles_all = 0, 0
        intext, positions = False, 0
        for lineno, line in enumerate(bz2.BZ2File(self.fname)):
            if line.startswith("      <text"):
                intext = True
                line = line[line.find(">") + 1 :]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find("</text>")  # can be on the same line as <text>
            if pos >= 0:
                articles_all += 1
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filter_wiki("".join(lines))
                if len(text) > ARTICLE_MIN_CHARS:  # article redirects are pruned here
                    articles += 1
                    if return_raw:
                        result = text
                    else:
                        result = tokenize(text)  # text into tokens here
                        positions += len(result)
                    yield result

        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles before pruning)" % (articles, positions, articles_all)
        )
        self.length = articles  # cache corpus length
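
# A hedged follow-up to the docstring example above: the streamed bag-of-words
# vectors can also be serialized in MatrixMarket format; MmCorpus is assumed
# to be imported from gensim.corpora and the file names are placeholders.
from gensim.corpora import MmCorpus

wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2')
MmCorpus.serialize('wiki_en_bow.mm', wiki, progress_cnt=10000)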
Example #40
0
class WikiHNCorpus(TextCorpus):
    def __init__(self, wiki_file, hn_folder, dictionary=None, processes=None, 
            lemmatize=utils.HAS_PATTERN):
        """
        Takes the wikipedia *articles.xml.bz2 and the HN folder of articles 
        as input and builds the dictionary and corpus
        """
        global outputname
        self.lemmatize = lemmatize
        if self.lemmatize:
            print ("We will lemmatize ('you were'->'be/VB')")
            self.outputname = outputname + "_lemmatized"
        else:
            print ("We will only tokenize ('you were'->'you','were')")
            self.outputname = outputname  # keep the base output name when only tokenizing

        self.wiki_file = wiki_file
        self.hn_folder = hn_folder

        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes

        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=NO_BELOW, 
                    no_above=NO_ABOVE, keep_n=VOCAB_SIZE)
        else:
            self.dictionary = dictionary


    def get_texts(self):
        """
        Iterate over the Wikipedia dump and the HN articles returning text
        """
        wiki_articles, hn_articles, articles_all = 0, 0, 0
        positions, positions_all = 0, 0

        # ************ Wikipedia ************
        texts = ((text, self.lemmatize) for _, text in wikicorpus._extract_pages(bz2.BZ2File(self.wiki_file)))
        pool = multiprocessing.Pool(self.processes)
        for group in utils.chunkize(texts, chunksize=10 * pool._processes, maxsize=1): # otherwise imap puts all the corpus into memory
            for tokens in pool.imap(wikicorpus.process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) > WIKI_ARTICLE_MIN_WORDS:
                    wiki_articles += 1
                    positions += len(tokens)
                    yield tokens
        pool.terminate()

        print (">>> finished iterating over Wikipedia corpus of %i documents with %i positions (total %i articles, %i positions before pruning articles shorter than %i words)" % (wiki_articles, positions, articles_all, positions_all, WIKI_ARTICLE_MIN_WORDS))

        # ************ HN articles ************
        positions_after_wiki = positions
        fnamelist = []
        for g in glob.iglob(self.hn_folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist): # TODO parallelize as Wiki
            hn_text = open(fname).read()
            if self.lemmatize:
                result = utils.lemmatize(hn_text) # text into lemmas here
            else:
                result = tokenize(hn_text) # text into tokens here
            articles_all += 1
            positions_all += len(result)
            if len(result) > HN_ARTICLE_MIN_WORDS:
                hn_articles += 1
                positions += len(result)
                yield result

        print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions - positions_after_wiki))
        # ************ /HN articles ************

        self.length = wiki_articles + hn_articles # cache corpus length
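
# A hedged usage sketch; the file paths are placeholders, and the global
# `outputname` must be defined before construction (see __init__ above).
outputname = 'wiki_hn'
corpus = WikiHNCorpus('enwiki-latest-pages-articles.xml.bz2', '/path/to/hn_articles')
corpus.dictionary.save_as_text(outputname + '_wordids.txt')
for docno, bow in enumerate(corpus):
    if docno >= 3:
        break
    print (bow)  # the first few documents as (token_id, count) pairs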