def download_dictionary(corpus_name: str, target_path: str) -> Dictionary:
    """
    Download only the vocabulary (dictionary) file for a corpus from the UCI
    bag-of-words repository.

    :param corpus_name: name of UCI corpus
    :param target_path: output directory for dictionary file
    :return: gensim Dictionary
    """
    url_root = "https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/"
    target_path = os.path.join(target_path, "uci", "raw")
    if not os.path.exists(target_path):
        print("creating target path: {}".format(target_path))
        os.makedirs(target_path)

    vocab_file = os.path.join(target_path, "vocab.{}.txt".format(corpus_name))
    print("downloading {} vocab file to: {}".format(corpus_name, vocab_file))
    urllib.request.urlretrieve(url_root + "vocab.{}.txt".format(corpus_name), filename=vocab_file)

    dictionary = Dictionary()
    with open(vocab_file) as f:
        for line in f:
            dictionary.add_documents([[line.strip()]])
    dictionary.compactify()
    return dictionary
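# Usage sketch (not part of the original snippet): the corpus name and output
# directory below are illustrative assumptions; the UCI bag-of-words repository
# hosts corpora such as "kos", "nips", and "enron".
kos_dict = download_dictionary("kos", "./data")
print("downloaded vocabulary with {} terms".format(len(kos_dict)))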
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """
    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora: dict mapping document ids to raw document text
    :param stopwords: collection of stop words to filter out
    :param allowed_pos: POS tags passed to `lemmatize` as `allowed_tags`
    :param max_doc: maximum number of documents to process
    :param no_above: passed to Dictionary.filter_extremes
    :param no_below: passed to Dictionary.filter_extremes
    :param keep_n: passed to Dictionary.filter_extremes
    :return: Dictionary with `corpus`, `id2token` and `corpus_id2orig_id` attached
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]

        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
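# Usage sketch (assumptions, not from the original source): the helpers
# clean_text, convert_compound and revdict must be importable from the same
# project, and `allowed_pos` is a compiled POS regex as expected by gensim's
# `lemmatize`. Documents with a None body are skipped, as shown above.
import re
sample_corpora = {10: "Cats chase mice.", 11: "Dogs chase cats.", 12: None}
sample_stopwords = set(["the", "a", "an"])
lemma_dict = preprocess_corpora(sample_corpora, sample_stopwords,
                                allowed_pos=re.compile('(NN|VB|JJ|RB)'))
print lemma_dict.corpus[:2], lemma_dict.corpus_id2orig_id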
def preprocess_corpus(cls, raw_corpus):
    """Preprocess a corpus for the downage categories

    Parameters:
        raw_corpus: A list of strings where each string is a document

    Returns:
        A tuple (dictionary, id2token, corpus_bow) where
            dictionary: The gensim.corpora.dictionary for the preprocessed corpus
            id2token: A python dictionary mapping BOW id to token
            corpus_bow: The preprocessed corpus in BOW form, using BOW ids from `dictionary`
    """
    # Define filters to apply to each word:
    #   - Make each token lowercase
    #   - Remove punctuation
    #   - Remove numeric characters
    #   - Remove any token shorter than 2 characters
    FILTERS = [(lambda x: x.lower()),
               strip_punctuation,
               strip_numeric,
               (lambda x: strip_short(x, minsize=2))]
    preprocessed_corpus = [[word for word in preprocess_string(doc, FILTERS)
                            if word not in STOPWORDS]
                           for doc in raw_corpus]

    # Porter stemming
    # porter = PorterStemmer()
    # tweet_corpus = [[porter.stem(word) for word in doc] for doc in tweet_corpus]

    # Discover useful bigrams like "hot dog" via mutual information
    # (see https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf).
    # Bigrams will have a _ put between them (so the bigram "hot dog" will be
    # transformed to "hot_dog").
    phrases = Phrases(preprocessed_corpus, min_count=BIGRAM_MIN_COUNT, threshold=BIGRAM_SCORE_THRESHOLD)
    preprocessed_corpus = phrases[preprocessed_corpus]

    dictionary = Dictionary(preprocessed_corpus)
    dictionary.compactify()

    # So we can convert BOW ids back to tokens
    id2token = {bow_id: token for (token, bow_id) in dictionary.token2id.items()}

    corpus_bow = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]

    return (dictionary, id2token, corpus_bow)
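# Usage sketch (assumptions, not from the original source): BIGRAM_MIN_COUNT,
# BIGRAM_SCORE_THRESHOLD and STOPWORDS are module-level constants in the
# original project, and preprocess_corpus is a classmethod on some owning
# class (called SomeModel here purely for illustration).
raw_docs = ["Power lines down near the river", "River levels rising after the storm"]
dictionary, id2token, corpus_bow = SomeModel.preprocess_corpus(raw_docs)
print(corpus_bow[0])                    # bag-of-words pairs for the first document
print(id2token[corpus_bow[0][0][0]])    # map the first BOW id back to its token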
class FolderCorpus(corpora.TextCorpus):
    def __init__(self, filepaths, preprocess=[], dictionary=None):
        self.filepaths = filepaths
        self.preprocess = preprocess
        self.metadata = None

        self.dictionary = Dictionary()
        self.dictionary.add_documents(self.get_texts())
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)
        self.dictionary.compactify()

    def get_texts(self):
        for path in self.filepaths:
            with codecs.open(path, encoding='utf8') as f:
                raw_text = f.read()
            raw_text = raw_text.lower()
            for filt in self.preprocess:
                raw_text = filt(raw_text)
            text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
            yield text
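# Usage sketch (assumptions, not from the original source): the glob pattern
# and the whitespace-normalizing filter below are illustrative.
import glob

paths = glob.glob("./corpus/*.txt")
folder_corpus = FolderCorpus(paths, preprocess=[lambda s: s.replace("\n", " ")])
print(len(folder_corpus.dictionary), "terms kept after filter_extremes")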
def training_vectorize(holder):
    # Vectorize the corpus as bag-of-words features, using a gensim Dictionary
    # to map tokens to ids. This is an important part of the sequential
    # vectorization.

    # split the data
    holder.content = holder['content'].apply(lambda row: row.split())

    # make a dictionary
    dictionary = Dictionary(holder.content.tolist())

    # filter the dictionary
    dictionary.filter_extremes(no_above=0.8, no_below=5)
    dictionary.compactify()

    # transform the data with the dictionary
    holder["content"] = holder["content"].apply(lambda row: dictionary.doc2bow(row))

    # transform with tf-idf
    # tfidf = TfidfModel(holder["content"].tolist())
    # holder["content"] = holder["content"].apply(lambda col: tfidf[col])

    return holder, dictionary  # , tfidf
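# Usage sketch (assumption, not from the original source): `holder` behaves
# like a pandas DataFrame with a raw-text column named "content". Note that
# with a tiny toy frame the no_below=5 filter removes almost every token, so
# meaningful bag-of-words output needs a real corpus.
import pandas as pd

holder = pd.DataFrame({"content": ["the cat sat on the mat",
                                   "the dog sat on the log"]})
holder, dictionary = training_vectorize(holder)
print(holder["content"].iloc[0])   # bag-of-words pairs for the first row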
class processor(processor_base):
    """
    Pre-process text in memory.

    Includes utilities for cleaning, tokenization, and vectorization in parallel.
    """
    def __init__(self,
                 hueristic_pct_padding: float = .90,
                 append_indicators: bool = False,
                 keep_n: int = 150000,
                 padding: str = 'pre',
                 padding_maxlen: Union[int, None] = None,
                 truncating: str = 'post'):
        """
        Parameters
        ----------
        hueristic_pct_padding : float
            This parameter is only used if `padding_maxlen` = None.  A histogram
            of document lengths is calculated, and maxlen is set to this
            percentile of that histogram.
        append_indicators : bool
            If True, will append the tokens '_start_' and '_end_' to the
            beginning and end of your tokenized documents.  This can be useful
            when training seq2seq models.
        keep_n : int = 150000
            This is the maximum size of your vocabulary (unique number of words
            allowed).  Consider limiting this to a reasonable size based upon
            your corpus.
        padding : str
            'pre' or 'post', pad either before or after each sequence.
        padding_maxlen : int or None
            Maximum sequence length; longer sequences are truncated and shorter
            sequences are padded with zeros at the end.  Note if this is
            specified, `hueristic_pct_padding` is ignored.
        truncating : str
            'pre' or 'post', remove values from sequences larger than
            padding_maxlen either at the beginning or at the end of the
            sequence.

        See https://keras.io/preprocessing/sequence/

        Attributes
        ----------
        vocabulary : gensim.corpora.dictionary.Dictionary
            This is a gensim object that is built after parsing all the tokens
            in your corpus.
        n_tokens : int
            The total number of tokens in the corpus.  Will be less than or
            equal to keep_n.
        id2token : dict
            dict with {int: str}, ex: {2: 'the', 3: 'cat'}.  This is used for
            decoding predictions back to tokens.
        token2id : dict
            dict with {str: int}, ex: {'the': 2, 'cat': 3}.  This is used for
            converting tokens to integers.
        document_length_stats : pandas.DataFrame
            Histogram of document lengths.  Can be used to decide padding_maxlen.
        """
        super().__init__()
        self.hueristic_pct = hueristic_pct_padding
        self.append_indicators = append_indicators
        self.keep_n = keep_n
        self.padding = padding
        self.padding_maxlen = padding_maxlen
        self.truncating = truncating

        # These are placeholders for data that will be collected or calculated
        self.vocabulary = Dictionary()
        self.n_tokens = None
        self.id2token = None
        self.token2id = None
        self.document_length_histogram = Counter()
        self.document_length_stats = None
        self.doc_length_huerestic = None

        # These values are 'hardcoded' for now
        self.padding_value = 0.0
        self.padding_dtype = 'int32'
        self.start_tok = '_start_'
        self.end_tok = '_end_'
        self.keep_tokens = [self.start_tok, self.end_tok]

    def process_text(self, text: List[str]) -> List[List[str]]:
        """Combine the cleaner and tokenizer."""
        return self.__apply_tokenizer(self.__apply_cleaner(text))

    def __apply_cleaner(self, data: List[str]) -> List[str]:
        """Apply the cleaner over a list."""
        return [self.cleaner(doc) for doc in data]

    def __apply_tokenizer(self, data: List[str]) -> List[List[str]]:
        """Apply the tokenizer over a list."""
        if self.append_indicators:
            tmp = [[self.start_tok] + self.tokenizer(doc) + [self.end_tok] for doc in data]
            return tmp
        else:
            return [self.tokenizer(doc) for doc in data]

    def parallel_process_text(self, data: List[str]) -> List[List[str]]:
        """Apply cleaner -> tokenizer."""
        return apply_parallel(data, self.process_text)

    def generate_doc_length_stats(self):
        """Analyze document length statistics for padding strategy."""
        hueristic = self.hueristic_pct
        histdf = (pd.DataFrame([(a, b) for a, b in self.document_length_histogram.items()],
                               columns=['bin', 'doc_count'])
                  .sort_values(by='bin'))
        histdf['cumsum_pct'] = histdf.doc_count.cumsum() / histdf.doc_count.sum()

        self.document_length_stats = histdf
        self.doc_length_huerestic = histdf.query(f'cumsum_pct >= {hueristic}').bin.head(1).values[0]
        logging.warning(' '.join(["Setting maximum document length to",
                                  f'{self.doc_length_huerestic} based upon',
                                  f'hueristic of {hueristic} percentile.\n',
                                  'See full histogram by inspecting the',
                                  "`document_length_stats` attribute."]))
        self.padding_maxlen = self.doc_length_huerestic

    def fit(self,
            data: List[str],
            return_tokenized_data: bool = False,
            no_below: int = 100,
            no_above: float = .9) -> Union[None, List[List[str]]]:
        """
        TODO: update docs

        Apply cleaner and tokenizer to raw data and build vocabulary.

        Parameters
        ----------
        data : List[str]
            These are raw documents, which are a list of strings. ex:
            [["The quick brown fox"], ["jumps over the lazy dog"]]
        return_tokenized_data : bool
            Return the tokenized strings.  This is primarily used for
            debugging purposes.
        no_below : int
            See below explanation
        no_above : float
            See below explanation

        When tokenizing documents, filter tokens according to these rules:
        1. occur in fewer than `no_below` documents (absolute number) or
        2. occur in more than `no_above` documents (fraction of total corpus
           size, not absolute number).
        3. after (1) and (2), keep only the first keep_n most frequent tokens.

        Returns
        -------
        None or List[List[str]]
            If return_tokenized_data=True then will return tokenized documents,
            otherwise will not return anything.

        This method heavily leverages gensim
        https://radimrehurek.com/gensim/corpora/dictionary.html
        """
        now = get_time()
        logging.warning('....tokenizing data')
        tokenized_data = list(chain.from_iterable(self.parallel_process_text(data)))

        if not self.padding_maxlen:
            document_len_counters = apply_parallel(tokenized_data, count_len)

            for doc_counter in document_len_counters:
                self.document_length_histogram.update(doc_counter)

            self.generate_doc_length_stats()

        # chunk the data manually for corpus build and pass to build corpus method
        logging.warning(f'(1/3) done. {time_diff(now)} sec')
        logging.warning('....building corpus')
        now = get_time()
        corpus = build_corpus(tokenized_data)

        # Merge the corpora from each thread together, this is like a "reduce" step
        logging.warning(f'(2/3) done. {time_diff(now)} sec')
        logging.warning('....consolidating corpus')
        now = get_time()
        self.vocabulary.merge_with(corpus)

        # get rid of rare tokens from corpus such that they will get the same id
        self.vocabulary.filter_extremes(no_below,
                                        no_above,
                                        self.keep_n,
                                        keep_tokens=self.keep_tokens)

        # compactify the ids for each word
        self.vocabulary.compactify()

        # Build dictionary accounting for 0 padding, and reserve 1 for unknown and rare words
        self.token2id = dict([(k, v + 2) for k, v in self.vocabulary.token2id.items()])
        self.id2token = dict([(v, k) for k, v in self.token2id.items()])
        self.n_tokens = len(self.id2token.keys())

        # logging
        logging.warning(f'(3/3) done. {time_diff(now)} sec')
        logging.warning(f'Finished parsing {self.vocabulary.num_docs:,} documents.')

        if return_tokenized_data:
            return tokenized_data

    def token_count_pandas(self):
        """See token counts as a pandas DataFrame."""
        freq_df = pd.DataFrame([b for a, b in self.vocabulary.dfs.items()],
                               index=[a for a, b in self.vocabulary.dfs.items()],
                               columns=['count'])

        id2tokens = [(b, a) for a, b in self.vocabulary.token2id.items()]

        token_df = pd.DataFrame([b for a, b in id2tokens],
                                index=[a for a, b in id2tokens],
                                columns=['token'])

        return freq_df.join(token_df).sort_values('count', ascending=False)

    def fit_transform(self,
                      data: List[str],
                      no_below: int = 25,
                      no_above: float = 0.8) -> List[List[int]]:
        """
        Apply cleaner and tokenizer to raw data, build vocabulary and return
        the transformed dataset, which is a List[List[int]].  This will use
        process-based threading on all available cores.

        ex:
        >>> data = [["The quick brown fox"], ["jumps over the lazy dog"]]
        >>> pp = preprocess(maxlen=5, no_below=0)
        >>> pp.fit_transform(data)
        # 0 padding is applied
        [[0, 2, 3, 4, 5], [6, 7, 2, 8, 9]]

        Parameters
        ----------
        data : List[str]
            These are raw documents, which are a list of strings. ex:
            [["The quick brown fox"], ["jumps over the lazy dog"]]
        no_below : int
            See below explanation
        no_above : float
            See below explanation

        When tokenizing documents, filter tokens according to these rules:
        1. occur in fewer than `no_below` documents (absolute number) or
        2. occur in more than `no_above` documents (fraction of total corpus
           size, not absolute number).
        3. after (1) and (2), keep only the first keep_n most frequent tokens.

        Returns
        -------
        numpy.array with shape (number of documents, max_len)

        This method leverages gensim
        https://radimrehurek.com/gensim/corpora/dictionary.html
        """
        tokdata = self.fit(data,
                           return_tokenized_data=True,
                           no_below=no_below,
                           no_above=no_above)

        logging.warning('...fit is finished, beginning transform')
        now = get_time()
        vec_data = self.vectorize_parallel(tokdata)
        logging.warning(f'done. {time_diff(now)} sec')
        return vec_data

    def transform(self, data: List[str]) -> List[List[int]]:
        """
        Transform a list of documents into List[List[int]].

        If transforming a large number of documents consider using the method
        `transform_parallel` instead.

        ex:
        >> pp = processor()
        >> pp.fit(docs)
        >> new_docs = [["The quick brown fox"], ["jumps over the lazy dog"]]
        >> pp.transform(new_docs)
        [[1, 2, 3, 4], [5, 6, 1, 7, 8]]
        """
        return self.vectorize(self.process_text(data))

    def transform_parallel(self, data: List[str]) -> List[List[int]]:
        """
        Transform a list of documents into List[List[int]].  Uses process-based
        threading on all available cores.

        If only processing a small number of documents (< 10k) then consider
        using the method `transform` instead.

        ex:
        >> pp = processor()
        >> pp.fit(docs)
        >> new_docs = [["The quick brown fox"], ["jumps over the lazy dog"]]
        >> pp.transform_parallel(new_docs)
        [[1, 2, 3, 4], [5, 6, 1, 7, 8]]
        """
        return np.vstack(apply_parallel(data, self.transform))

    def get_idx(self, token: str) -> int:
        """Get integer index from token."""
        # return the index for the token, or if not found return the
        # out-of-vocabulary index, which is 1
        return self.token2id.get(token, 1)

    def __vec_one_doc(self, doc: List[str]) -> List[int]:
        """
        Vectorize a single tokenized document.
        ex: ['hello', 'world']
        """
        return [self.get_idx(tok) for tok in doc]

    def vectorize(self, docs: List[List[str]]) -> List[List[int]]:
        """
        Vectorize and apply padding on a set of tokenized documents.
        ex: [['hello', 'world'], ['goodbye', 'now']]
        """
        # First apply indexing on all the rows, then pad_sequences (I found this
        # faster than trying to do these steps on each row)
        return pad_sequences(list(map(self.__vec_one_doc, docs)),
                             maxlen=self.padding_maxlen,
                             dtype=self.padding_dtype,
                             padding=self.padding,
                             truncating=self.truncating,
                             value=self.padding_value)

    def vectorize_parallel(self, data: List[List[str]]) -> np.array:
        """
        Apply idx -> token mappings in parallel and apply padding.

        Arguments:
            data: List of List of strings
        """
        indexed_data = apply_parallel(data, self.vectorize)
        # concatenate list of arrays vertically
        return np.vstack(indexed_data)
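# End-to-end usage sketch (assumptions, not from the original source):
# processor_base is expected to supply `cleaner` and `tokenizer`, and the
# helpers apply_parallel/build_corpus/count_len come from the same module.
train_docs = ["The quick brown fox", "jumps over the lazy dog"]
pp = processor(keep_n=50000, padding_maxlen=8)
train_vecs = pp.fit_transform(train_docs, no_below=0, no_above=1.0)   # shape (2, 8)
new_vecs = pp.transform_parallel(["a previously unseen document"])
print(pp.n_tokens, train_vecs.shape, new_vecs.shape)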
def createDictionary(texts):
    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=2, no_above=0.4, keep_n=1000000)
    dictionary.compactify()
    return dictionary
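# Usage sketch: `texts` is a list of tokenized documents. Note that with the
# thresholds above (no_below=2, no_above=0.4), a token must appear in at least
# 2 documents but in no more than 40% of them, so very small corpora can end
# up with an empty dictionary.
toy_texts = [["graph", "minors", "survey"],
             ["graph", "trees", "computer"],
             ["human", "computer", "interaction"],
             ["survey", "of", "user", "opinion"],
             ["the", "eps", "user", "interface"],
             ["relation", "of", "user", "time"]]
toy_dict = createDictionary(toy_texts)
print(toy_dict.token2id)
print(toy_dict.doc2bow(["graph", "computer", "unseen"]))  # unseen tokens are ignored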
                    help='File name to give the dictionary upon saving')
args = parser.parse_args()

input_path = args.input_path
output_name = args.output_name
CHUNK_SIZE = args.chunk_size

# Stream in documents from path
rdr = lmd.Reader(input_path)
gnr = rdr.stream_data(get_meta=True)

# Build a dictionary out of the validation documents
dictionary = Dictionary()
docs = rdr.stream_data(threaded=True)
doc_chunks = chunks(docs, size=CHUNK_SIZE)

# Progress in chunks
for chunk in doc_chunks:
    print("Adding ", CHUNK_SIZE, " docs")
    tokenized = [[tok.lower_ for tok in doc if not tok.is_stop and tok.is_alpha]
                 for doc in tokenizer.pipe(
                     [item for item in chunk if language(item) == 'en'],
                     batch_size=CHUNK_SIZE)]
    dictionary.add_documents(tokenized)

# Keep only the 2**16 most frequent tokens
dictionary.filter_extremes(keep_n=2**16)
dictionary.compactify()
dictionary.save(output_name)
def get_texts(path):
    with open(path, encoding='latin') as corpus_file:
        for line in corpus_file:
            yield tok.tokenize(line.strip())

#%%
texts = [line.strip() for line in open(corpus_path, encoding='latin')]

#%%
common_dictionary = Dictionary(get_texts(corpus_path))
common_dictionary.filter_extremes()
common_dictionary.compactify()

#%%
common_corpus = [common_dictionary.doc2bow(text) for text in get_texts(corpus_path)]

#%%
from gensim.test.utils import get_tmpfile

index_tmpfile = get_tmpfile("index")
index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary))
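#%%
# Query sketch (not part of the original snippet): a new document is converted
# to bag-of-words with the same dictionary and scored against the whole index.
# Assumes `tok.tokenize` returns a token list, as in get_texts above.
query_bow = common_dictionary.doc2bow(tok.tokenize("some query text"))
sims = index[query_bow]                                   # similarity to every indexed document
top_hits = sorted(enumerate(sims), key=lambda pair: -pair[1])[:5]
print(top_hits)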
def cooccurence_matrix(infile, total, window, smoothing):
    """
    Generates a co-occurrence matrix using symmetric-window skip-grams of
    length window.  Then generates a PPMI transform using smoothed
    probabilities.

    :param infile: bz2-compressed file to read.
    :param total: the total number of files, if known, for TQDM to use.
    :param window: symmetric window size to use.
    :param smoothing: smoothing value for smoothed prior distributions
    :return: tuple (U, vocab) of SVD word vectors and the gensim Dictionary
    """
    with bz2.open(infile, "r") as F:
        # gensim Dictionary for word<->id mappings
        vocab = Dictionary(i.split()[1:] for i in tqdm(
            F, total=total, desc=f"{infile}: {'Gathering Vocabulary':<25s}"))
        vocab.compactify()
    sleep(.5)
    print("\nVOCAB SIZE: {}".format(len(vocab)))
    sleep(.5)

    with bz2.open(infile, "r") as F:
        INDS = Counter((DOC[i], DOC[i + j])
                       for DOC in (np.array(vocab.doc2idx(J.split()[1:])) for J in tqdm(
                           F, total=total, desc=f"{infile}: {'Co-occurrence Matrix':<25s}"))
                       for i in range(1, len(DOC))
                       for j in range(min(window, len(DOC) - i)))

    # Convert {(A, B): C} dict structure to np.array([C, A, B]) for
    # sparse matrix construction.
    INDS = np.array([[INDS[I], I[0], I[1]]
                     for I in tqdm(INDS.keys(), desc=f"{infile}: {'Generating Indices':<25s}")
                     if I[0] != I[1] and I[0] > 0 and I[1] > 0])
    print(INDS.shape)

    ppmi_mat = csr_matrix((INDS[:, 0], (INDS[:, 1], INDS[:, 2])), shape=(len(vocab), len(vocab)))
    print("PPMI matrix shape: {}".format(ppmi_mat.shape))
    del INDS
    # ppmi_mat.eliminate_zeros()

    # Add transpose, since PPMI is symmetric -- PPMI(i,j) = PPMI(j,i)
    ppmi_mat = ppmi_mat + ppmi_mat.transpose()

    ### PPMI TRANSFORMATION ###
    print("Generating matrices for PPMI transform...")
    # We'll use these more than once, so only calculate them the one time
    POW = ppmi_mat.power(smoothing)
    TOT = np.sum(ppmi_mat)
    p_i_star = np.array(np.sum(ppmi_mat, axis=1) / TOT).astype(np.float32).reshape((-1, ))
    p_star_j = np.array(np.sum(POW, axis=0) / np.sum(POW)).astype(np.float32).reshape((-1, ))
    ppmi_mat = ppmi_mat / TOT

    ### PPMI TRANSFORM ###
    data = ppmi_mat.data.astype(np.float32)
    indices = ppmi_mat.indices.astype(np.int32)
    indptr = ppmi_mat.indptr.astype(np.int32)

    for i in trange(indptr.shape[0] - 1, desc=f"{infile}: {'PPMI Transform':<25s}"):
        data[indptr[i]:indptr[i+1]] = \
            np.maximum(
                0,
                np.log2(data[indptr[i]:indptr[i+1]] /
                        (p_i_star[i] * p_star_j[indices[indptr[i]:indptr[i+1]]]))
            )

    ppmi_mat = csr_matrix((data, indices, indptr))
    ppmi_mat.eliminate_zeros()

    ### SVD ###
    sleep(.5)
    print("SVD...")
    # per https://web.stanford.edu/~jurafsky/slp3/16.pdf we only
    # use the raw left singular vectors as the word embedding vectors
    U = svds(ppmi_mat, k=300, return_singular_vectors="u")[0]

    return U, vocab
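# Usage sketch (assumptions, not from the original source): the input is a
# bz2-compressed text file with one document per line whose first
# whitespace-separated field (e.g. a document id) is dropped by split()[1:]
# above. File name, window, and smoothing values are illustrative; because the
# file is opened in binary mode, dictionary tokens may be stored as bytes.
U, vocab = cooccurence_matrix("corpus.txt.bz2", total=None, window=5, smoothing=0.75)
word_vec = U[vocab.token2id[b"example"]]   # 300-dimensional embedding for "example"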
def train_model(corpus_path, dic_conf, lda_conf):
    logging.info('Loading corpus from file {}'.format(corpus_path))
    corpus = FastTextCorpus(corpus_path, bufsize=20000000, length=5926250)
    # corpus = LineSentence(corpus_path, 10000000)
    print '-' * 80

    if lda_conf["build_dict"]:
        logging.info("Building dictionary ...")
        dic = Dictionary(corpus)
        dic.filter_extremes(no_below=dic_conf["min_tf"],
                            no_above=dic_conf["max_df"],
                            keep_n=dic_conf["vocab_size"])
        dic.compactify()
        logging.info("Saving dictionary ...")
        dic.save(dic_conf["dic"])
    else:
        logging.info("Loading dictionary ..")
        dic = Dictionary.load(dic_conf["dic"])

    bow = IntCorpus(corpus, dic)
    l = len(bow)
    print l

    tfMod = TfidfModel.load(lda_conf["tfmod"])

    # save corpus to disk for later usage
    # logging.info("Saving corpus to disk ...")
    # MmCorpus.serialize("data/corpus.mm", bow)
    # bow = MmCorpus("data/large_corpus.mm")
    print '-' * 80

    if lda_conf["new"]:
        logging.info("Training new lda model")
        logging.info("Loading defined keywords ...")
        keywords = {}
        topics = []
        with codecs.open(lda_conf["kw_file"], "r", "utf-8") as f:
            for l in f:
                sp = l.strip().split(':')
                topic = int(sp[0])
                topics.append(sp[1])
                kws = sp[2].split(',')
                for kw in kws:
                    if kw not in keywords:
                        keywords[kw] = set([topic])
                    else:
                        keywords[kw].add(topic)
                    # keywords[kw.lower()] = topic
        logging.info("Number of defined keywords: {}".format(len(keywords)))

        if lda_conf["threads"] <= 1:
            model = LdaModelNew(corpus=bow,
                                id2word=dic,
                                iterations=lda_conf["iterations"],
                                num_topics=lda_conf["num_topics"],
                                passes=lda_conf["passes"],
                                chunksize=lda_conf["chunksize"],
                                defined_kws=keywords,
                                alpha='auto',
                                eval_every=lda_conf["eval_every"])
        else:
            logging.info("Training model using multicore lda version")
            model = LdaMulticoreNew(corpus=bow,
                                    id2word=dic,
                                    workers=lda_conf["threads"],
                                    iterations=lda_conf["iterations"],
                                    num_topics=lda_conf["num_topics"],
                                    passes=lda_conf["passes"],
                                    defined_kws=keywords,
                                    alpha='symmetric',
                                    chunksize=lda_conf["chunksize"],
                                    eval_every=lda_conf["eval_every"],
                                    tfMod=tfMod,
                                    topic_names=topics)
    else:
        logging.info("Training ldamodel implemented in gensim")
        model = LdaModelOld(corpus=bow,
                            id2word=dic,
                            iterations=lda_conf["iterations"],
                            num_topics=lda_conf["num_topics"],
                            passes=lda_conf["passes"],
                            chunksize=lda_conf["chunksize"],
                            alpha='auto',
                            eval_every=lda_conf["eval_every"])

    logging.info('Saving lda model to {}'.format(lda_conf["model_path"]))
    model.save(lda_conf["model_path"])
    logging.info('Saving model done!')
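# Illustrative configuration sketch (only the keys are taken from the code
# above; all values, paths, and the corpus file name are assumptions):
dic_conf = {
    "min_tf": 5,             # no_below for filter_extremes
    "max_df": 0.5,           # no_above for filter_extremes
    "vocab_size": 100000,    # keep_n for filter_extremes
    "dic": "data/corpus.dict",
}
lda_conf = {
    "build_dict": True,
    "new": True,
    "kw_file": "data/keywords.txt",   # lines of the form "0:topic_name:kw1,kw2"
    "threads": 4,
    "iterations": 100,
    "num_topics": 50,
    "passes": 1,
    "chunksize": 2000,
    "eval_every": 0,
    "tfmod": "data/tfidf.model",
    "model_path": "data/lda.model",
}
train_model("data/corpus.txt", dic_conf, lda_conf)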