def download_dictionary(corpus_name: str, target_path: str) -> Dictionary:
    """
    Download only the vocabulary (dictionary) file for a corpus from the UCI
    bag-of-words repository.

    :param corpus_name: name of UCI corpus
    :param target_path: output directory for dictionary file
    :return: gensim Dictionary
    """
    url_root = "https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/"
    target_path = os.path.join(target_path, "uci", "raw")
    if not os.path.exists(target_path):
        print("creating target path: {}".format(target_path))
        os.makedirs(target_path)

    vocab_file = os.path.join(target_path, "vocab.{}.txt".format(corpus_name))
    print("downloading {} vocab file to: {}".format(corpus_name, vocab_file))
    urllib.request.urlretrieve(url_root + "vocab.{}.txt".format(corpus_name), filename=vocab_file)

    dictionary = Dictionary()
    with open(vocab_file) as f:
        for line in f:
            dictionary.add_documents([[line.strip()]])
    dictionary.compactify()
    return dictionary
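# Usage sketch (not part of the original snippet): the corpus name and output
# directory below are illustrative assumptions; the UCI bag-of-words repository
# hosts corpora such as "kos", "nips", and "enron".
kos_dict = download_dictionary("kos", "./data")
print("downloaded vocabulary with {} terms".format(len(kos_dict)))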
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """
    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora: dict mapping document ids to raw document text
    :param stopwords: collection of stop words to filter out
    :param allowed_pos: POS tags passed to `lemmatize` as `allowed_tags`
    :param max_doc: maximum number of documents to process
    :param no_above: passed to Dictionary.filter_extremes
    :param no_below: passed to Dictionary.filter_extremes
    :param keep_n: passed to Dictionary.filter_extremes
    :return: Dictionary with `corpus`, `id2token` and `corpus_id2orig_id` attached
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]

        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
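# Usage sketch (assumptions, not from the original source): the helpers
# clean_text, convert_compound and revdict must be importable from the same
# project, and `allowed_pos` is a compiled POS regex as expected by gensim's
# `lemmatize`. Documents with a None body are skipped, as shown above.
import re
sample_corpora = {10: "Cats chase mice.", 11: "Dogs chase cats.", 12: None}
sample_stopwords = set(["the", "a", "an"])
lemma_dict = preprocess_corpora(sample_corpora, sample_stopwords,
                                allowed_pos=re.compile('(NN|VB|JJ|RB)'))
print lemma_dict.corpus[:2], lemma_dict.corpus_id2orig_id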
def preprocess_corpus(cls, raw_corpus):
    """Preprocess a corpus for the downage categories

    Parameters:
        raw_corpus: A list of strings where each string is a document

    Returns:
        A tuple (dictionary, id2token, corpus_bow) where
            dictionary: The gensim.corpora.dictionary for the preprocessed corpus
            id2token: A python dictionary mapping BOW id to token
            corpus_bow: The preprocessed corpus in BOW form, using BOW ids from `dictionary`
    """
    # Define filters to apply to each word:
    #   - Make each token lowercase
    #   - Remove punctuation
    #   - Remove numeric characters
    #   - Remove any token shorter than 2 characters
    FILTERS = [(lambda x: x.lower()),
               strip_punctuation,
               strip_numeric,
               (lambda x: strip_short(x, minsize=2))]
    preprocessed_corpus = [[word for word in preprocess_string(doc, FILTERS)
                            if word not in STOPWORDS]
                           for doc in raw_corpus]

    # Porter stemming
    # porter = PorterStemmer()
    # tweet_corpus = [[porter.stem(word) for word in doc] for doc in tweet_corpus]

    # Discover useful bigrams like "hot dog" via mutual information
    # (see https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf).
    # Bigrams will have a _ put between them (so the bigram "hot dog" will be
    # transformed to "hot_dog").
    phrases = Phrases(preprocessed_corpus, min_count=BIGRAM_MIN_COUNT, threshold=BIGRAM_SCORE_THRESHOLD)
    preprocessed_corpus = phrases[preprocessed_corpus]

    dictionary = Dictionary(preprocessed_corpus)
    dictionary.compactify()

    # So we can convert BOW ids back to tokens
    id2token = {bow_id: token for (token, bow_id) in dictionary.token2id.items()}

    corpus_bow = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]

    return (dictionary, id2token, corpus_bow)
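# Usage sketch (assumptions, not from the original source): BIGRAM_MIN_COUNT,
# BIGRAM_SCORE_THRESHOLD and STOPWORDS are module-level constants in the
# original project, and preprocess_corpus is a classmethod on some owning
# class (called SomeModel here purely for illustration).
raw_docs = ["Power lines down near the river", "River levels rising after the storm"]
dictionary, id2token, corpus_bow = SomeModel.preprocess_corpus(raw_docs)
print(corpus_bow[0])                    # bag-of-words pairs for the first document
print(id2token[corpus_bow[0][0][0]])    # map the first BOW id back to its token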
class FolderCorpus(corpora.TextCorpus):
    def __init__(self, filepaths, preprocess=[], dictionary=None):
        self.filepaths = filepaths
        self.preprocess = preprocess
        self.metadata = None

        self.dictionary = Dictionary()
        self.dictionary.add_documents(self.get_texts())
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)
        self.dictionary.compactify()

    def get_texts(self):
        for path in self.filepaths:
            with codecs.open(path, encoding='utf8') as f:
                raw_text = f.read()
            raw_text = raw_text.lower()
            for filt in self.preprocess:
                raw_text = filt(raw_text)
            text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
            yield text
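# Usage sketch (assumptions, not from the original source): the glob pattern
# and the whitespace-normalizing filter below are illustrative.
import glob

paths = glob.glob("./corpus/*.txt")
folder_corpus = FolderCorpus(paths, preprocess=[lambda s: s.replace("\n", " ")])
print(len(folder_corpus.dictionary), "terms kept after filter_extremes")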
def training_vectorize(holder):
    # Vectorize the corpus as bag-of-words features, using a gensim Dictionary
    # to map tokens to ids. This is an important part of the sequential
    # vectorization.

    # split the data
    holder.content = holder['content'].apply(lambda row: row.split())

    # make a dictionary
    dictionary = Dictionary(holder.content.tolist())

    # filter the dictionary
    dictionary.filter_extremes(no_above=0.8, no_below=5)
    dictionary.compactify()

    # transform the data with the dictionary
    holder["content"] = holder["content"].apply(lambda row: dictionary.doc2bow(row))

    # transform with tf-idf
    # tfidf = TfidfModel(holder["content"].tolist())
    # holder["content"] = holder["content"].apply(lambda col: tfidf[col])

    return holder, dictionary  # , tfidf
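# Usage sketch (assumption, not from the original source): `holder` behaves
# like a pandas DataFrame with a raw-text column named "content". Note that
# with a tiny toy frame the no_below=5 filter removes almost every token, so
# meaningful bag-of-words output needs a real corpus.
import pandas as pd

holder = pd.DataFrame({"content": ["the cat sat on the mat",
                                   "the dog sat on the log"]})
holder, dictionary = training_vectorize(holder)
print(holder["content"].iloc[0])   # bag-of-words pairs for the first row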
class processor(processor_base):
    """
    Pre-process text in memory.

    Includes utilities for cleaning, tokenization, and vectorization in parallel.
    """
    def __init__(self,
                 hueristic_pct_padding: float = .90,
                 append_indicators: bool = False,
                 keep_n: int = 150000,
                 padding: str = 'pre',
                 padding_maxlen: Union[int, None] = None,
                 truncating: str = 'post'):
        """
        Parameters
        ----------
        hueristic_pct_padding : float
            This parameter is only used if `padding_maxlen` = None.  A histogram
            of document lengths is calculated, and maxlen is set to this
            percentile of that histogram.
        append_indicators : bool
            If True, will append the tokens '_start_' and '_end_' to the
            beginning and end of your tokenized documents.  This can be useful
            when training seq2seq models.
        keep_n : int = 150000
            This is the maximum size of your vocabulary (unique number of words
            allowed).  Consider limiting this to a reasonable size based upon
            your corpus.
        padding : str
            'pre' or 'post', pad either before or after each sequence.
        padding_maxlen : int or None
            Maximum sequence length; longer sequences are truncated and shorter
            sequences are padded with zeros at the end.  Note if this is
            specified, `hueristic_pct_padding` is ignored.
        truncating : str
            'pre' or 'post', remove values from sequences larger than
            padding_maxlen either at the beginning or at the end of the
            sequence.

        See https://keras.io/preprocessing/sequence/

        Attributes
        ----------
        vocabulary : gensim.corpora.dictionary.Dictionary
            This is a gensim object that is built after parsing all the tokens
            in your corpus.
        n_tokens : int
            The total number of tokens in the corpus.  Will be less than or
            equal to keep_n.
        id2token : dict
            dict with {int: str}, ex: {2: 'the', 3: 'cat'}.  This is used for
            decoding predictions back to tokens.
        token2id : dict
            dict with {str: int}, ex: {'the': 2, 'cat': 3}.  This is used for
            converting tokens to integers.
        document_length_stats : pandas.DataFrame
            Histogram of document lengths.  Can be used to decide padding_maxlen.
        """
        super().__init__()
        self.hueristic_pct = hueristic_pct_padding
        self.append_indicators = append_indicators
        self.keep_n = keep_n
        self.padding = padding
        self.padding_maxlen = padding_maxlen
        self.truncating = truncating

        # These are placeholders for data that will be collected or calculated
        self.vocabulary = Dictionary()
        self.n_tokens = None
        self.id2token = None
        self.token2id = None
        self.document_length_histogram = Counter()
        self.document_length_stats = None
        self.doc_length_huerestic = None

        # These values are 'hardcoded' for now
        self.padding_value = 0.0
        self.padding_dtype = 'int32'
        self.start_tok = '_start_'
        self.end_tok = '_end_'
        self.keep_tokens = [self.start_tok, self.end_tok]

    def process_text(self, text: List[str]) -> List[List[str]]:
        """Combine the cleaner and tokenizer."""
        return self.__apply_tokenizer(self.__apply_cleaner(text))

    def __apply_cleaner(self, data: List[str]) -> List[str]:
        """Apply the cleaner over a list."""
        return [self.cleaner(doc) for doc in data]

    def __apply_tokenizer(self, data: List[str]) -> List[List[str]]:
        """Apply the tokenizer over a list."""
        if self.append_indicators:
            tmp = [[self.start_tok] + self.tokenizer(doc) + [self.end_tok] for doc in data]
            return tmp
        else:
            return [self.tokenizer(doc) for doc in data]

    def parallel_process_text(self, data: List[str]) -> List[List[str]]:
        """Apply cleaner -> tokenizer."""
        return apply_parallel(data, self.process_text)

    def generate_doc_length_stats(self):
        """Analyze document length statistics for padding strategy."""
        hueristic = self.hueristic_pct
        histdf = (pd.DataFrame([(a, b) for a, b in self.document_length_histogram.items()],
                               columns=['bin', 'doc_count'])
                  .sort_values(by='bin'))
        histdf['cumsum_pct'] = histdf.doc_count.cumsum() / histdf.doc_count.sum()

        self.document_length_stats = histdf
        self.doc_length_huerestic = histdf.query(f'cumsum_pct >= {hueristic}').bin.head(1).values[0]
        logging.warning(' '.join(["Setting maximum document length to",
                                  f'{self.doc_length_huerestic} based upon',
                                  f'hueristic of {hueristic} percentile.\n',
                                  'See full histogram by inspecting the',
                                  "`document_length_stats` attribute."]))
        self.padding_maxlen = self.doc_length_huerestic

    def fit(self,
            data: List[str],
            return_tokenized_data: bool = False,
            no_below: int = 100,
            no_above: float = .9) -> Union[None, List[List[str]]]:
        """
        TODO: update docs

        Apply cleaner and tokenizer to raw data and build vocabulary.

        Parameters
        ----------
        data : List[str]
            These are raw documents, which are a list of strings. ex:
            [["The quick brown fox"], ["jumps over the lazy dog"]]
        return_tokenized_data : bool
            Return the tokenized strings.  This is primarily used for
            debugging purposes.
        no_below : int
            See below explanation
        no_above : float
            See below explanation

        When tokenizing documents, filter tokens according to these rules:
        1. occur in fewer than `no_below` documents (absolute number) or
        2. occur in more than `no_above` documents (fraction of total corpus
           size, not absolute number).
        3. after (1) and (2), keep only the first keep_n most frequent tokens.

        Returns
        -------
        None or List[List[str]]
            If return_tokenized_data=True then will return tokenized documents,
            otherwise will not return anything.

        This method heavily leverages gensim
        https://radimrehurek.com/gensim/corpora/dictionary.html
        """
        now = get_time()
        logging.warning('....tokenizing data')
        tokenized_data = list(chain.from_iterable(self.parallel_process_text(data)))

        if not self.padding_maxlen:
            document_len_counters = apply_parallel(tokenized_data, count_len)

            for doc_counter in document_len_counters:
                self.document_length_histogram.update(doc_counter)

            self.generate_doc_length_stats()

        # chunk the data manually for corpus build and pass to build corpus method
        logging.warning(f'(1/3) done. {time_diff(now)} sec')
        logging.warning('....building corpus')
        now = get_time()
        corpus = build_corpus(tokenized_data)

        # Merge the corpora from each thread together, this is like a "reduce" step
        logging.warning(f'(2/3) done. {time_diff(now)} sec')
        logging.warning('....consolidating corpus')
        now = get_time()
        self.vocabulary.merge_with(corpus)

        # get rid of rare tokens from corpus such that they will get the same id
        self.vocabulary.filter_extremes(no_below,
                                        no_above,
                                        self.keep_n,
                                        keep_tokens=self.keep_tokens)

        # compactify the ids for each word
        self.vocabulary.compactify()

        # Build dictionary accounting for 0 padding, and reserve 1 for unknown and rare words
        self.token2id = dict([(k, v + 2) for k, v in self.vocabulary.token2id.items()])
        self.id2token = dict([(v, k) for k, v in self.token2id.items()])
        self.n_tokens = len(self.id2token.keys())

        # logging
        logging.warning(f'(3/3) done. {time_diff(now)} sec')
        logging.warning(f'Finished parsing {self.vocabulary.num_docs:,} documents.')

        if return_tokenized_data:
            return tokenized_data

    def token_count_pandas(self):
        """See token counts as a pandas DataFrame."""
        freq_df = pd.DataFrame([b for a, b in self.vocabulary.dfs.items()],
                               index=[a for a, b in self.vocabulary.dfs.items()],
                               columns=['count'])

        id2tokens = [(b, a) for a, b in self.vocabulary.token2id.items()]

        token_df = pd.DataFrame([b for a, b in id2tokens],
                                index=[a for a, b in id2tokens],
                                columns=['token'])

        return freq_df.join(token_df).sort_values('count', ascending=False)

    def fit_transform(self,
                      data: List[str],
                      no_below: int = 25,
                      no_above: float = 0.8) -> List[List[int]]:
        """
        Apply cleaner and tokenizer to raw data, build vocabulary and return
        the transformed dataset, which is a List[List[int]].  This will use
        process-based threading on all available cores.

        ex:
        >>> data = [["The quick brown fox"], ["jumps over the lazy dog"]]
        >>> pp = preprocess(maxlen=5, no_below=0)
        >>> pp.fit_transform(data)
        # 0 padding is applied
        [[0, 2, 3, 4, 5], [6, 7, 2, 8, 9]]

        Parameters
        ----------
        data : List[str]
            These are raw documents, which are a list of strings. ex:
            [["The quick brown fox"], ["jumps over the lazy dog"]]
        no_below : int
            See below explanation
        no_above : float
            See below explanation

        When tokenizing documents, filter tokens according to these rules:
        1. occur in fewer than `no_below` documents (absolute number) or
        2. occur in more than `no_above` documents (fraction of total corpus
           size, not absolute number).
        3. after (1) and (2), keep only the first keep_n most frequent tokens.

        Returns
        -------
        numpy.array with shape (number of documents, max_len)

        This method leverages gensim
        https://radimrehurek.com/gensim/corpora/dictionary.html
        """
        tokdata = self.fit(data,
                           return_tokenized_data=True,
                           no_below=no_below,
                           no_above=no_above)

        logging.warning('...fit is finished, beginning transform')
        now = get_time()
        vec_data = self.vectorize_parallel(tokdata)
        logging.warning(f'done. {time_diff(now)} sec')
        return vec_data

    def transform(self, data: List[str]) -> List[List[int]]:
        """
        Transform a list of documents into List[List[int]].

        If transforming a large number of documents consider using the method
        `transform_parallel` instead.

        ex:
        >> pp = processor()
        >> pp.fit(docs)
        >> new_docs = [["The quick brown fox"], ["jumps over the lazy dog"]]
        >> pp.transform(new_docs)
        [[1, 2, 3, 4], [5, 6, 1, 7, 8]]
        """
        return self.vectorize(self.process_text(data))

    def transform_parallel(self, data: List[str]) -> List[List[int]]:
        """
        Transform a list of documents into List[List[int]].  Uses process-based
        threading on all available cores.

        If only processing a small number of documents (< 10k) then consider
        using the method `transform` instead.

        ex:
        >> pp = processor()
        >> pp.fit(docs)
        >> new_docs = [["The quick brown fox"], ["jumps over the lazy dog"]]
        >> pp.transform_parallel(new_docs)
        [[1, 2, 3, 4], [5, 6, 1, 7, 8]]
        """
        return np.vstack(apply_parallel(data, self.transform))

    def get_idx(self, token: str) -> int:
        """Get integer index from token."""
        # return the index for the token, or if not found return the
        # out-of-vocabulary index, which is 1
        return self.token2id.get(token, 1)

    def __vec_one_doc(self, doc: List[str]) -> List[int]:
        """
        Vectorize a single tokenized document.
        ex: ['hello', 'world']
        """
        return [self.get_idx(tok) for tok in doc]

    def vectorize(self, docs: List[List[str]]) -> List[List[int]]:
        """
        Vectorize and apply padding on a set of tokenized documents.
        ex: [['hello', 'world'], ['goodbye', 'now']]
        """
        # First apply indexing on all the rows, then pad_sequences (I found this
        # faster than trying to do these steps on each row)
        return pad_sequences(list(map(self.__vec_one_doc, docs)),
                             maxlen=self.padding_maxlen,
                             dtype=self.padding_dtype,
                             padding=self.padding,
                             truncating=self.truncating,
                             value=self.padding_value)

    def vectorize_parallel(self, data: List[List[str]]) -> np.array:
        """
        Apply idx -> token mappings in parallel and apply padding.

        Arguments:
            data: List of List of strings
        """
        indexed_data = apply_parallel(data, self.vectorize)
        # concatenate list of arrays vertically
        return np.vstack(indexed_data)
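# End-to-end usage sketch (assumptions, not from the original source):
# processor_base is expected to supply `cleaner` and `tokenizer`, and the
# helpers apply_parallel/build_corpus/count_len come from the same module.
train_docs = ["The quick brown fox", "jumps over the lazy dog"]
pp = processor(keep_n=50000, padding_maxlen=8)
train_vecs = pp.fit_transform(train_docs, no_below=0, no_above=1.0)   # shape (2, 8)
new_vecs = pp.transform_parallel(["a previously unseen document"])
print(pp.n_tokens, train_vecs.shape, new_vecs.shape)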
def createDictionary(texts):
    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=2, no_above=0.4, keep_n=1000000)
    dictionary.compactify()
    return dictionary
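# Usage sketch: `texts` is a list of tokenized documents. Note that with the
# thresholds above (no_below=2, no_above=0.4), a token must appear in at least
# 2 documents but in no more than 40% of them, so very small corpora can end
# up with an empty dictionary.
toy_texts = [["graph", "minors", "survey"],
             ["graph", "trees", "computer"],
             ["human", "computer", "interaction"],
             ["survey", "of", "user", "opinion"],
             ["the", "eps", "user", "interface"],
             ["relation", "of", "user", "time"]]
toy_dict = createDictionary(toy_texts)
print(toy_dict.token2id)
print(toy_dict.doc2bow(["graph", "computer", "unseen"]))  # unseen tokens are ignored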
                    help='File name to give the dictionary upon saving')
args = parser.parse_args()

input_path = args.input_path
output_name = args.output_name
CHUNK_SIZE = args.chunk_size

# Stream in documents from path
rdr = lmd.Reader(input_path)
gnr = rdr.stream_data(get_meta=True)

# Build a dictionary out of the validation documents
dictionary = Dictionary()
docs = rdr.stream_data(threaded=True)
doc_chunks = chunks(docs, size=CHUNK_SIZE)

# Progress in chunks
for chunk in doc_chunks:
    print("Adding ", CHUNK_SIZE, " docs")
    tokenized = [[tok.lower_ for tok in doc if not tok.is_stop and tok.is_alpha]
                 for doc in tokenizer.pipe(
                     [item for item in chunk if language(item) == 'en'],
                     batch_size=CHUNK_SIZE)]
    dictionary.add_documents(tokenized)

# Keep only the 2**16 most frequent tokens
dictionary.filter_extremes(keep_n=2**16)
dictionary.compactify()
dictionary.save(output_name)
def get_texts(path):
    with open(path, encoding='latin') as corpus_file:
        for line in corpus_file:
            yield tok.tokenize(line.strip())

#%%
texts = [line.strip() for line in open(corpus_path, encoding='latin')]

#%%
common_dictionary = Dictionary(get_texts(corpus_path))
common_dictionary.filter_extremes()
common_dictionary.compactify()

#%%
common_corpus = [common_dictionary.doc2bow(text) for text in get_texts(corpus_path)]

#%%
from gensim.test.utils import get_tmpfile

index_tmpfile = get_tmpfile("index")
index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary))
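#%%
# Query sketch (not part of the original snippet): a new document is converted
# to bag-of-words with the same dictionary and scored against the whole index.
# Assumes `tok.tokenize` returns a token list, as in get_texts above.
query_bow = common_dictionary.doc2bow(tok.tokenize("some query text"))
sims = index[query_bow]                                   # similarity to every indexed document
top_hits = sorted(enumerate(sims), key=lambda pair: -pair[1])[:5]
print(top_hits)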
def cooccurence_matrix(infile, total, window, smoothing):
    """
    Generates a co-occurrence matrix using symmetric-window skip-grams of
    length window.  Then generates a PPMI transform using smoothed
    probabilities.

    :param infile: bz2-compressed file to read.
    :param total: the total number of files, if known, for TQDM to use.
    :param window: symmetric window size to use.
    :param smoothing: smoothing value for smoothed prior distributions
    :return: tuple (U, vocab) of SVD word vectors and the gensim Dictionary
    """
    with bz2.open(infile, "r") as F:
        # gensim Dictionary for word<->id mappings
        vocab = Dictionary(i.split()[1:] for i in tqdm(
            F, total=total, desc=f"{infile}: {'Gathering Vocabulary':<25s}"))
        vocab.compactify()
    sleep(.5)
    print("\nVOCAB SIZE: {}".format(len(vocab)))
    sleep(.5)

    with bz2.open(infile, "r") as F:
        INDS = Counter((DOC[i], DOC[i + j])
                       for DOC in (np.array(vocab.doc2idx(J.split()[1:])) for J in tqdm(
                           F, total=total, desc=f"{infile}: {'Co-occurrence Matrix':<25s}"))
                       for i in range(1, len(DOC))
                       for j in range(min(window, len(DOC) - i)))

    # Convert {(A, B): C} dict structure to np.array([C, A, B]) for
    # sparse matrix construction.
    INDS = np.array([[INDS[I], I[0], I[1]]
                     for I in tqdm(INDS.keys(), desc=f"{infile}: {'Generating Indices':<25s}")
                     if I[0] != I[1] and I[0] > 0 and I[1] > 0])
    print(INDS.shape)

    ppmi_mat = csr_matrix((INDS[:, 0], (INDS[:, 1], INDS[:, 2])), shape=(len(vocab), len(vocab)))
    print("PPMI matrix shape: {}".format(ppmi_mat.shape))
    del INDS
    # ppmi_mat.eliminate_zeros()

    # Add transpose, since PPMI is symmetric -- PPMI(i,j) = PPMI(j,i)
    ppmi_mat = ppmi_mat + ppmi_mat.transpose()

    ### PPMI TRANSFORMATION ###
    print("Generating matrices for PPMI transform...")
    # We'll use these more than once, so only calculate them the one time
    POW = ppmi_mat.power(smoothing)
    TOT = np.sum(ppmi_mat)
    p_i_star = np.array(np.sum(ppmi_mat, axis=1) / TOT).astype(np.float32).reshape((-1, ))
    p_star_j = np.array(np.sum(POW, axis=0) / np.sum(POW)).astype(np.float32).reshape((-1, ))
    ppmi_mat = ppmi_mat / TOT

    ### PPMI TRANSFORM ###
    data = ppmi_mat.data.astype(np.float32)
    indices = ppmi_mat.indices.astype(np.int32)
    indptr = ppmi_mat.indptr.astype(np.int32)

    for i in trange(indptr.shape[0] - 1, desc=f"{infile}: {'PPMI Transform':<25s}"):
        data[indptr[i]:indptr[i+1]] = \
            np.maximum(
                0,
                np.log2(data[indptr[i]:indptr[i+1]] /
                        (p_i_star[i] * p_star_j[indices[indptr[i]:indptr[i+1]]]))
            )

    ppmi_mat = csr_matrix((data, indices, indptr))
    ppmi_mat.eliminate_zeros()

    ### SVD ###
    sleep(.5)
    print("SVD...")
    # per https://web.stanford.edu/~jurafsky/slp3/16.pdf we only
    # use the raw left singular vectors as the word embedding vectors
    U = svds(ppmi_mat, k=300, return_singular_vectors="u")[0]

    return U, vocab
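# Usage sketch (assumptions, not from the original source): the input is a
# bz2-compressed text file with one document per line whose first
# whitespace-separated field (e.g. a document id) is dropped by split()[1:]
# above. File name, window, and smoothing values are illustrative; because the
# file is opened in binary mode, dictionary tokens may be stored as bytes.
U, vocab = cooccurence_matrix("corpus.txt.bz2", total=None, window=5, smoothing=0.75)
word_vec = U[vocab.token2id[b"example"]]   # 300-dimensional embedding for "example"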
def train_model(corpus_path, dic_conf, lda_conf):
    logging.info('Loading corpus from file {}'.format(corpus_path))
    corpus = FastTextCorpus(corpus_path, bufsize=20000000, length=5926250)
    # corpus = LineSentence(corpus_path, 10000000)
    print '-' * 80

    if lda_conf["build_dict"]:
        logging.info("Building dictionary ...")
        dic = Dictionary(corpus)
        dic.filter_extremes(no_below=dic_conf["min_tf"],
                            no_above=dic_conf["max_df"],
                            keep_n=dic_conf["vocab_size"])
        dic.compactify()
        logging.info("Saving dictionary ...")
        dic.save(dic_conf["dic"])
    else:
        logging.info("Loading dictionary ..")
        dic = Dictionary.load(dic_conf["dic"])

    bow = IntCorpus(corpus, dic)
    l = len(bow)
    print l

    tfMod = TfidfModel.load(lda_conf["tfmod"])

    # save corpus to disk for later usage
    # logging.info("Saving corpus to disk ...")
    # MmCorpus.serialize("data/corpus.mm", bow)
    # bow = MmCorpus("data/large_corpus.mm")
    print '-' * 80

    if lda_conf["new"]:
        logging.info("Training new lda model")
        logging.info("Loading defined keywords ...")
        keywords = {}
        topics = []
        with codecs.open(lda_conf["kw_file"], "r", "utf-8") as f:
            for l in f:
                sp = l.strip().split(':')
                topic = int(sp[0])
                topics.append(sp[1])
                kws = sp[2].split(',')
                for kw in kws:
                    if kw not in keywords:
                        keywords[kw] = set([topic])
                    else:
                        keywords[kw].add(topic)
                    # keywords[kw.lower()] = topic
        logging.info("Number of defined keywords: {}".format(len(keywords)))

        if lda_conf["threads"] <= 1:
            model = LdaModelNew(corpus=bow,
                                id2word=dic,
                                iterations=lda_conf["iterations"],
                                num_topics=lda_conf["num_topics"],
                                passes=lda_conf["passes"],
                                chunksize=lda_conf["chunksize"],
                                defined_kws=keywords,
                                alpha='auto',
                                eval_every=lda_conf["eval_every"])
        else:
            logging.info("Training model using multicore lda version")
            model = LdaMulticoreNew(corpus=bow,
                                    id2word=dic,
                                    workers=lda_conf["threads"],
                                    iterations=lda_conf["iterations"],
                                    num_topics=lda_conf["num_topics"],
                                    passes=lda_conf["passes"],
                                    defined_kws=keywords,
                                    alpha='symmetric',
                                    chunksize=lda_conf["chunksize"],
                                    eval_every=lda_conf["eval_every"],
                                    tfMod=tfMod,
                                    topic_names=topics)
    else:
        logging.info("Training ldamodel implemented in gensim")
        model = LdaModelOld(corpus=bow,
                            id2word=dic,
                            iterations=lda_conf["iterations"],
                            num_topics=lda_conf["num_topics"],
                            passes=lda_conf["passes"],
                            chunksize=lda_conf["chunksize"],
                            alpha='auto',
                            eval_every=lda_conf["eval_every"])

    logging.info('Saving lda model to {}'.format(lda_conf["model_path"]))
    model.save(lda_conf["model_path"])
    logging.info('Saving model done!')
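# Illustrative configuration sketch (only the keys are taken from the code
# above; all values, paths, and the corpus file name are assumptions):
dic_conf = {
    "min_tf": 5,             # no_below for filter_extremes
    "max_df": 0.5,           # no_above for filter_extremes
    "vocab_size": 100000,    # keep_n for filter_extremes
    "dic": "data/corpus.dict",
}
lda_conf = {
    "build_dict": True,
    "new": True,
    "kw_file": "data/keywords.txt",   # lines of the form "0:topic_name:kw1,kw2"
    "threads": 4,
    "iterations": 100,
    "num_topics": 50,
    "passes": 1,
    "chunksize": 2000,
    "eval_every": 0,
    "tfmod": "data/tfidf.model",
    "model_path": "data/lda.model",
}
train_model("data/corpus.txt", dic_conf, lda_conf)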