def walk_corpus(walk_dir, chunk_name='document', encoding='utf8',
                ignore=IGNORE, nltk_stop=True, stop_freq=1, add_stop=None,
                decode=False, verbose=1, simple=False,
                tokenizer=word_tokenize):
    """
    Walks `walk_dir` recursively and builds a Corpus object from the
    plain-text files found, skipping files whose suffixes appear in
    `ignore`. See :meth:`dir_corpus` for descriptions of the remaining
    parameters.
    """
    filenames = []
    for root, dirs, files in os.walk(walk_dir):
        for file in files:
            filenames.append(os.path.join(root, file))

    # filter the blacklist (typically .json, .log, etc.)
    filenames = filter_by_suffix(filenames, ignore)

    files = []
    for filename in filenames:
        if encoding == 'detect':
            encoding = detect_encoding(filename)

        try:
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(f.read())
        except UnicodeDecodeError:
            # fall back to a detected encoding and retry
            encoding = detect_encoding(filename)
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(f.read())

    words, tok = dir_tokenize(files, filenames, chunk_name=chunk_name,
                              paragraphs=False, verbose=verbose,
                              simple=simple, tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c, nltk_stop=nltk_stop,
                          freq=stop_freq, add_stop=add_stop)
    return c
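
# Usage sketch (hypothetical path; assumes this module's imports). Unlike
# dir_corpus below, walk_corpus recurses into subdirectories:
#
#     >>> c = walk_corpus('corpus_root', chunk_name='document')
#     >>> c.view_contexts('document', as_strings=True)[0][:10]  # first tokens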
def test_LdaCgsQuerySampler_init(self):
    old_corp = Corpus([], remove_empty=False)
    old_corp.corpus = np.array([0, 1, 1, 0, 0, 1], dtype='i')
    old_corp.context_data = [np.array([(3, ), (3, )], dtype=[('idx', 'i')])]
    old_corp.context_types = ['document']
    old_corp.words = np.array(['0', '1'], dtype='i')
    old_corp.words_int = {'0': 0, '1': 1}

    new_corp = Corpus([], remove_empty=False)
    new_corp.corpus = np.array([0, 0], dtype='i')
    new_corp.context_data = [np.array([(2, )], dtype=[('idx', 'i')])]
    new_corp.context_types = ['document']
    new_corp.words = np.array(['0', '1'], dtype='i')
    new_corp.words_int = {'0': 0, '1': 1}

    m = LdaCgsSeq(corpus=old_corp, context_type='document', K=2, V=2)
    m.Z[:] = np.array([0, 0, 0, 1, 1, 1], dtype='i')
    m.word_top[:] = np.array([[1.01, 2.01], [2.01, 1.01]], dtype='d')
    m.top_doc[:] = np.array([[3.01, 0.01], [0.01, 3.01]], dtype='d')
    m.inv_top_sums[:] = 1. / m.word_top.sum(0)

    q = LdaCgsQuerySampler(m, new_corpus=new_corp, old_corpus=old_corp)
    self.assertTrue(q.V == 2)
    self.assertTrue(q.K == 2)
    self.assertTrue(len(q.corpus) == 2)
    self.assertTrue((q.corpus == new_corp.corpus).all())
    self.assertTrue(len(q.indices) == 1)
    self.assertTrue(
        (q.indices == new_corp.view_metadata('document')['idx']).all())
    self.assertTrue(q.word_top.shape == (2, 2))
    self.assertTrue((q.word_top == m.word_top).all())
    self.assertTrue(q.top_doc.shape == (2, 1))
    self.assertTrue((q.top_doc == np.array([[0.01], [0.01]],
                                           dtype=q.top_doc.dtype)).all())
    self.assertTrue(q.inv_top_sums.shape == (2, ))
    self.assertTrue((q.inv_top_sums == m.inv_top_sums).all())
    self.assertTrue(q.alpha.shape == (2, 1))
    self.assertTrue((q.alpha == m.alpha).all())
    self.assertTrue(q.beta.shape == (2, 1))
    self.assertTrue((q.beta == m.beta).all())
def empty_corpus(context_type='document'):
    """
    Creates an empty Corpus with defined context_type.

    :param context_type: A type of tokenization. Default is 'document'.
    :type context_type: string

    :returns: An empty Corpus with no words or context_data.

    :See Also: :class:`vsm.corpus.Corpus`
    """
    # `int` rather than the removed `np.int` alias keeps this working on
    # current NumPy releases.
    return Corpus([],
                  context_data=[np.array([], dtype=[('idx', int)])],
                  context_types=[context_type])
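
# Usage sketch: an empty Corpus is handy as a placeholder or as a test
# fixture (the test above builds its corpora by hand in a similar way):
#
#     >>> c = empty_corpus('sentence')
#     >>> len(c.corpus), c.context_types
#     (0, ['sentence'])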
def corpus_fromlist(ls, context_type='context', remove_empty=True):
    """
    Takes a list of lists or arrays containing strings or integers
    and returns a Corpus object. The label associated with a given context
    is `context_type` prepended to the context index.

    :param ls: List of lists or list of arrays containing strings
        or integers.
    :type ls: list

    :param context_type: A type of tokenization.
    :type context_type: string, optional

    :param remove_empty: If `True`, empty contexts are removed.
        Default is `True`.
    :type remove_empty: boolean, optional

    :returns: A Corpus object built from `ls`.

    :See Also: :class:`vsm.corpus.Corpus`

    **Examples**

    >>> ls = [['a', 'b'], ['c'], ['d', 'e']]
    >>> c = corpus_fromlist(ls, context_type='sentence')
    >>> c.view_contexts('sentence', as_strings=True)
    [array(['a', 'b'], dtype='<U1'), array(['c'], dtype='<U1'),
     array(['d', 'e'], dtype='<U1')]
    >>> c.context_data
    [array([(2, 'sentence_0'), (3, 'sentence_1'), (5, 'sentence_2')],
           dtype=[('idx', '<i8'), ('sentence_label', 'O')])]
    """
    corpus = chain.from_iterable(ls)  # i.e., [w for ctx in ls for w in ctx]

    indices = np.cumsum([len(sbls) for sbls in ls])
    metadata = ['{0}_{1}'.format(context_type, i)
                for i in range(len(indices))]
    md_type = np.object_  # object dtype accommodates labels of any length
    dtype = [('idx', int), (context_type + '_label', md_type)]
    context_data = [np.array(list(zip(indices, metadata)), dtype=dtype)]

    return Corpus(corpus, context_data=context_data,
                  context_types=[context_type],
                  words_corpus=chain.from_iterable(copy(ctx) for ctx in ls),
                  remove_empty=remove_empty)
def corpus_from_strings(strings, metadata=[], decode=False,
                        nltk_stop=True, stop_freq=0, add_stop=None,
                        tokenizer=word_tokenize):
    """
    Takes a list of strings and returns a Corpus object whose document
    tokens are the strings.

    :param tokenizer: word tokenization function. Defaults to
        `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens
    """
    if decode:
        for i in range(len(strings)):
            if isinstance(strings[i], str):
                strings[i] = unidecode(strings[i])

    documents = [tokenizer(s) for s in strings]
    corpus = sum(documents, [])
    indices = np.cumsum([len(d) for d in documents])
    del documents

    if len(metadata) == 0:
        metadata = ['document_{0}'.format(i) for i in range(len(strings))]
    md_type = np.object_  # object dtype accommodates labels of any length
    dtype = [('idx', int), ('document_label', md_type)]
    context_data = [np.array(list(zip(indices, metadata)), dtype=dtype)]

    c = Corpus(corpus, context_data=context_data, context_types=['document'])
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c, nltk_stop=nltk_stop,
                          freq=stop_freq, add_stop=add_stop)
    return c
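
# Usage sketch, in the spirit of corpus_fromlist's doctest above:
#
#     >>> strings = ['the cat sat', 'the dog barked']
#     >>> c = corpus_from_strings(strings, nltk_stop=False)
#     >>> c.view_contexts('document', as_strings=True)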
def coll_corpus(coll_dir, encoding='utf8', ignore=IGNORE,
                nltk_stop=True, stop_freq=1, add_stop=None, decode=False,
                verbose=1, simple=False, tokenizer=word_tokenize):
    """
    `coll_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    It will also strip punctuation and arabic numerals outside the range
    1-29. All letters are made lowercase.

    :param coll_dir: Directory containing a collection of books, each of
        which contains pages as plain-text files.
    :type coll_dir: string-like

    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the
        encoding. Default is `utf8`.
    :type encoding: string, optional

    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is
        ['.json', '.log', '.pickle', '.DS_Store'].
    :type ignore: list of strings, optional

    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on the
        basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional

    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :param verbose: Verbosity level. 1 prints a progress bar.
    :type verbose: int, default 1

    :param tokenizer: word tokenization function. Defaults to
        `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the plain-text files
        in `coll_dir`. Document tokens are named `documents`.
    """
    books = []
    book_names = os.listdir(coll_dir)
    book_names = filter_by_suffix(book_names, ignore)
    book_names.sort()

    for book_name in book_names:
        pages = []
        book_path = os.path.join(coll_dir, book_name)
        page_names = os.listdir(book_path)
        page_names = filter_by_suffix(page_names, ignore)
        page_names.sort()

        for page_name in page_names:
            page_file = book_name + '/' + page_name
            page_name = os.path.join(book_path, page_name)
            if encoding == 'detect':
                encoding = detect_encoding(page_name)

            try:
                if decode:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((unidecode(f.read()), page_file))
                else:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((f.read(), page_file))
            except UnicodeDecodeError:
                # fall back to a detected encoding and retry
                encoding = detect_encoding(page_name)
                if decode:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((unidecode(f.read()), page_file))
                else:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((f.read(), page_file))

        books.append(pages)

    words, tok = coll_tokenize(books, book_names, simple=simple,
                               tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    in_place_stoplist(c, nltk_stop=nltk_stop, freq=stop_freq,
                      add_stop=add_stop)
    return c
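
# Usage sketch (hypothetical layout: one subdirectory per book, each holding
# its pages as plain-text files):
#
#     >>> c = coll_corpus('library/', stop_freq=2)
#     >>> c.context_types  # tokenizations produced by coll_tokenize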
def random_corpus(corpus_len, n_words, min_token_len, max_token_len,
                  context_type='document', metadata=False, seed=None):
    """
    Generates a random integer corpus.

    :param corpus_len: Size of the Corpus.
    :type corpus_len: int

    :param n_words: Number of words to draw random integers from.
    :type n_words: int

    :param min_token_len: minimum token length used to create indices
        for corpus.
    :type min_token_len: int

    :param max_token_len: maximum token length used to create indices
        for corpus.
    :type max_token_len: int

    :param context_type: A type of tokenization. Default is 'document'.
    :type context_type: string, optional

    :param metadata: If `True` generates metadata. If `False` the only
        metadata for the corpus is the index information.
    :type metadata: boolean, optional

    :param seed: Seed for the random number generator, for reproducible
        corpora. Default is `None`.
    :type seed: int, optional

    :returns: Corpus object with random integers as its entries.

    :See Also: :class:`vsm.corpus.Corpus`
    """
    random_state = np.random.RandomState(seed)
    corpus = random_state.randint(n_words, size=corpus_len)
    corpus = [str(word) for word in corpus]

    indices = []
    # Draw token boundaries from the seeded random state so that `seed`
    # controls the tokenization as well as the corpus entries.
    i = random_state.randint(min_token_len, max_token_len)
    while i < corpus_len:
        indices.append(i)
        i += random_state.randint(min_token_len, max_token_len)
    indices.append(corpus_len)

    if metadata:
        metadata_ = ['{0}_{1}'.format(context_type, i)
                     for i in range(len(indices))]
        dtype = [('idx', np.array(indices).dtype),
                 (context_type + '_label', np.object_)]
        rand_tok = np.array(list(zip(indices, metadata_)), dtype=dtype)
    else:
        rand_tok = np.array([(i, ) for i in indices],
                            dtype=[('idx', np.array(indices).dtype)])

    return Corpus(corpus, context_types=[context_type],
                  context_data=[rand_tok])
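
# Usage sketch: with a fixed seed the corpus is reproducible, which makes
# random_corpus convenient for deterministic tests:
#
#     >>> c1 = random_corpus(100, 10, 1, 5, seed=42, metadata=True)
#     >>> c2 = random_corpus(100, 10, 1, 5, seed=42, metadata=True)
#     >>> (c1.corpus == c2.corpus).all()
#     True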
def dir_corpus(plain_dir, chunk_name='article', encoding='utf8',
               paragraphs=True, ignore=IGNORE, nltk_stop=True,
               stop_freq=1, add_stop=None, decode=False, verbose=1,
               simple=False, tokenizer=word_tokenize):
    """
    `dir_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    `dir_corpus` will retain file-level tokenization and perform
    sentence and word tokenizations. Optionally, it will provide
    paragraph-level tokenizations.

    It will also strip punctuation and arabic numerals outside the range
    1-29. All letters are made lowercase.

    :param plain_dir: String containing directory containing a
        plain-text corpus.
    :type plain_dir: string-like

    :param chunk_name: The name of the tokenization corresponding to
        individual files. For example, if the files are pages of a book,
        one might set `chunk_name` to `pages`. Default is `article`.
    :type chunk_name: string-like, optional

    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the
        encoding. Default is `utf8`.
    :type encoding: string, optional

    :param paragraphs: If `True`, a paragraph-level tokenization is
        included. Defaults to `True`.
    :type paragraphs: boolean, optional

    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is
        ['.json', '.log', '.pickle', '.DS_Store'].
    :type ignore: list of strings, optional

    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on the
        basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional

    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :param verbose: Verbosity level. 1 prints a progress bar.
    :type verbose: int, default 1

    :param tokenizer: word tokenization function. Defaults to
        `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`vsm.corpus.Corpus`, :meth:`dir_tokenize`,
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    chunks = []
    filenames = os.listdir(plain_dir)
    filenames = filter_by_suffix(filenames, ignore)
    filenames.sort()

    for filename in filenames:
        filename = os.path.join(plain_dir, filename)
        if encoding == 'detect':
            encoding = detect_encoding(filename)
        try:
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())
        except UnicodeDecodeError:
            # fall back to a detected encoding and retry
            encoding = detect_encoding(filename)
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())

    words, tok = dir_tokenize(chunks, filenames, chunk_name=chunk_name,
                              paragraphs=paragraphs, verbose=verbose,
                              simple=simple, tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c, nltk_stop=nltk_stop,
                          freq=stop_freq, add_stop=add_stop)
    return c
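
# Usage sketch (hypothetical directory of article files, one file per
# article; label field names follow the `context_type + '_label'`
# convention used elsewhere in this module):
#
#     >>> c = dir_corpus('articles/', chunk_name='article', paragraphs=False)
#     >>> c.view_metadata('article')['article_label'][:3]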
def json_corpus(json_file, doc_key, label_key, encoding='utf8',
                nltk_stop=False, stop_freq=0, add_stop=None,
                tokenizer=word_tokenize):
    """
    `json_corpus` is a convenience function for generating Corpus
    objects from a json file. It constructs a corpus, document labels
    and metadata respectively from the specified fields in the json file.

    `json_corpus` will perform word-level tokenization. It will also
    strip punctuation and arabic numerals outside the range 1-29.
    All letters are made lowercase.

    :param json_file: Json file name containing documents and metadata.
    :type json_file: string-like

    :param doc_key: Name of the key for documents.
    :type doc_key: string-like

    :param label_key: Name of the key used for document labels. Labels are
        used when a viewer function outputs a list of documents. Any field
        other than `doc_key` and `label_key` is stored as metadata.
    :type label_key: string-like

    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the
        encoding. Default is `utf8`.
    :type encoding: string, optional

    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `False`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on the
        basis of its collection frequency. Default is 0.
    :type stop_freq: int, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional

    :param tokenizer: word tokenization function. Defaults to
        `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the json file.
        Document tokens are named `documents`.

    :See Also: :class:`vsm.corpus.Corpus`,
        :meth:`vsm.corpus.util.paragraph_tokenize`,
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    import json

    if encoding == 'detect':
        encoding = detect_encoding(json_file)
    with open(json_file, 'r', encoding=encoding) as f:
        json_data = json.load(f)

    docs = []
    label = []
    metadata = []
    for i in json_data:
        # strip non-ascii characters, decoding back to str for the tokenizer
        docs.append(i.pop(doc_key, None).encode('ascii',
                                                'ignore').decode('ascii'))
        label.append(i.pop(label_key, None))
        metadata.append(i)  # metadata are all the remaining fields

    docs = [tokenizer(d) for d in docs]
    corpus = sum(docs, [])
    tok = np.cumsum(np.array([len(d) for d in docs]))

    # add document label and metadata
    dtype = [('idx', np.array(tok).dtype),
             ('document_label', np.object_),
             ('metadata', np.array(metadata).dtype)]
    # todo: create a separate dtype for each metadata key?
    tok = np.array(list(zip(tok, label, metadata)), dtype=dtype)

    c = Corpus(corpus, context_data=[tok], context_types=['document'])
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c, nltk_stop=nltk_stop,
                          freq=stop_freq, add_stop=add_stop)
    return c
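
# Usage sketch (hypothetical file 'docs.json' containing a list of objects
# such as [{"text": "...", "title": "...", "year": 1900}, ...]):
#
#     >>> c = json_corpus('docs.json', doc_key='text', label_key='title')
#     >>> c.view_metadata('document')['document_label'][:3]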
def file_corpus(filename, encoding='utf8', nltk_stop=True, stop_freq=1,
                add_stop=None, decode=False, simple=False,
                tokenizer=word_tokenize):
    """
    `file_corpus` is a convenience function for generating Corpus
    objects from a plain-text corpus contained in a single file.

    `file_corpus` will strip punctuation and arabic numerals outside
    the range 1-29. All letters are made lowercase.

    :param filename: File name of the plain text file.
    :type filename: string-like

    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the
        encoding. Default is `utf8`.
    :type encoding: string, optional

    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on the
        basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional

    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`vsm.corpus.Corpus`, :meth:`file_tokenize`,
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    if encoding == 'detect':
        encoding = detect_encoding(filename)

    try:
        with open(filename, mode='r', encoding=encoding) as f:
            text = f.read()
    except UnicodeDecodeError:
        # fall back to a detected encoding and re-read the file, so that
        # `text` is always defined below
        encoding = detect_encoding(filename)
        with open(filename, mode='r', encoding=encoding) as f:
            text = f.read()

    if decode:
        text = unidecode(text)

    words, tok = file_tokenize(text, simple=simple, tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c, nltk_stop=nltk_stop,
                          freq=stop_freq, add_stop=add_stop)
    return c
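
# Usage sketch (hypothetical plain-text file):
#
#     >>> c = file_corpus('moby_dick.txt', encoding='detect')
#     >>> c.context_types  # tokenizations produced by file_tokenize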
def toy_corpus(plain_corpus, is_filename=False, encoding='utf8',
               nltk_stop=False, stop_freq=0, add_stop=None, decode=False,
               metadata=None, autolabel=False, tokenizer=word_tokenize,
               simple=False):
    """
    `toy_corpus` is a convenience function for generating Corpus objects
    from a given string or a single file.

    `toy_corpus` will perform both word and document-level tokenization.
    It will also strip punctuation and arabic numerals outside the range
    1-29. All letters are made lowercase.

    Document tokens are delimited by two or more line breaks. E.g.,

        <document 0>

        <document 1>

        ...

        <document n>

    where <document i> is any chunk of text to be tokenized by word.

    :param plain_corpus: String containing a plain-text corpus or a
        filename of a file containing one.
    :type plain_corpus: string-like

    :param is_filename: If `True` then `plain_corpus` is treated like
        a filename. Otherwise, `plain_corpus` is presumed to contain
        the corpus. Default is `False`.
    :type is_filename: boolean, optional

    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the
        encoding. Default is `utf8`.
    :type encoding: string, optional

    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `False`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on the
        basis of its collection frequency. Default is 0.
    :type stop_freq: int, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional

    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :param metadata: A list of strings providing metadata about the
        documents. If provided, must have length equal to the number
        of documents. Default is `None`.
    :type metadata: array-like, optional

    :param autolabel: A boolean specifying whether to automatically
        label documents by position in file. Default is `False`.
    :type autolabel: boolean, optional

    :param tokenizer: word tokenization function. Defaults to
        `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`vsm.corpus.Corpus`,
        :meth:`vsm.corpus.util.paragraph_tokenize`,
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    if is_filename:
        if encoding == 'detect':
            encoding = detect_encoding(plain_corpus)
        # text mode, not 'rb': an encoding cannot be combined with binary mode
        with open(plain_corpus, 'r', encoding=encoding) as f:
            plain_corpus = f.read()

    if decode:
        plain_corpus = unidecode(plain_corpus)

    docs = paragraph_tokenize(plain_corpus)
    docs = [tokenizer(d) for d in docs]

    corpus = sum(docs, [])
    tok = np.cumsum(np.array([len(d) for d in docs]))

    if not metadata and autolabel:
        metadata = ['Document {0}'.format(i) for i in range(len(tok))]

    if metadata:
        if not len(metadata) == len(tok):
            msg = ('Metadata mismatch: metadata length is {0} and number '
                   'of documents is {1}'.format(len(metadata), len(tok)))
            raise Exception(msg)
        else:
            md_type = np.object_
            dtype = [('idx', np.array(tok).dtype),
                     ('document_label', md_type)]
            tok = np.array(list(zip(tok, metadata)), dtype=dtype)
    else:
        dtype = [('idx', np.array(tok).dtype)]
        tok = np.array([(i, ) for i in tok], dtype=dtype)

    c = Corpus(corpus, context_data=[tok], context_types=['document'])
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c, nltk_stop=nltk_stop,
                          freq=stop_freq, add_stop=add_stop)
    return c
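
# Usage sketch: blank lines delimit documents, so one string yields a
# multi-document corpus; autolabel names documents by position:
#
#     >>> text = 'the cat sat\n\nthe dog barked\n\nthe bird sang'
#     >>> c = toy_corpus(text, autolabel=True)
#     >>> c.view_metadata('document')['document_label']
#     # -> 'Document 0', 'Document 1', 'Document 2'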
def dir_corpus(plain_dir, chunk_name='article', encoding='utf8',
               paragraphs=True, word_len=2, nltk_stop=True, stop_freq=1,
               add_stop=None, corpus_sent=True,
               ignore=['.log', '.pickle', '.xml'], decode=False,
               simple=False):
    """
    `dir_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    `dir_corpus` will retain file-level tokenization and perform
    sentence and word tokenizations. Optionally, it will provide
    paragraph-level tokenizations.

    It will also strip punctuation and arabic numerals outside the range
    1-29. All letters are made lowercase.

    :param plain_dir: String containing directory containing a
        plain-text corpus.
    :type plain_dir: string-like

    :param chunk_name: The name of the tokenization corresponding to
        individual files. For example, if the files are pages of a book,
        one might set `chunk_name` to `pages`. Default is `article`.
    :type chunk_name: string-like, optional

    :param paragraphs: If `True`, a paragraph-level tokenization is
        included. Defaults to `True`.
    :type paragraphs: boolean, optional

    :param word_len: Filters words whose lengths are <= word_len.
        Default is 2.
    :type word_len: int, optional

    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on the
        basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional

    :param corpus_sent: If `True` a CorpusSent object is returned.
        Otherwise a Corpus object is returned. Default is `True`.
    :type corpus_sent: boolean, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional

    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is
        ['.log', '.pickle', '.xml'].
    :type ignore: list of strings, optional

    :returns: c : Corpus or CorpusSent
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`Corpus`, :class:`CorpusSent`,
        :meth:`dir_tokenize`, :meth:`apply_stoplist`
    """
    chunks = []
    filenames = os.listdir(plain_dir)
    filenames = filter_by_suffix(filenames, ignore)
    filenames.sort()

    for filename in filenames:
        filename = os.path.join(plain_dir, filename)
        if encoding == 'detect':
            encoding = detect_encoding(filename)
        try:
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())
        except UnicodeDecodeError:
            # fall back to a detected encoding and retry
            encoding = detect_encoding(filename)
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())

    words, tok, sent = dir_tokenize(chunks, filenames, chunk_name=chunk_name,
                                    paragraphs=paragraphs)
    names, data = zip(*tok.items())

    if corpus_sent:
        c = CorpusSent(words, sent, context_data=data, context_types=names,
                       remove_empty=False)
    else:
        c = Corpus(words, context_data=data, context_types=names)
    in_place_stoplist(c, nltk_stop=nltk_stop, add_stop=add_stop,
                      freq=stop_freq)
    return c
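
# Usage sketch for this CorpusSent variant: with corpus_sent=True the
# return value keeps sentence boundaries alongside the other tokenizations:
#
#     >>> c = dir_corpus('articles/', corpus_sent=True)
#     >>> type(c).__name__
#     'CorpusSent'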