Example #1
def walk_corpus(walk_dir,
                chunk_name='document',
                encoding='utf8',
                ignore=IGNORE,
                nltk_stop=True,
                stop_freq=1,
                add_stop=None,
                decode=False,
                verbose=1,
                simple=False,
                tokenizer=word_tokenize):
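    """
    Walks `walk_dir` recursively, reads every plain-text file found there
    (skipping files whose suffixes appear in `ignore`), tokenizes the texts
    with `dir_tokenize`, and returns the resulting Corpus with a stoplist
    applied when `nltk_stop`, `stop_freq`, or `add_stop` is set.
    """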

    filenames = []
    for root, dirs, files in os.walk(walk_dir):
        for file in files:
            filenames.append(os.path.join(root, file))

    # filter the blacklist (typically .json, .log, etc.)
    filenames = filter_by_suffix(filenames, ignore)
    files = []
    for filename in filenames:
        if encoding == 'detect':
            encoding = detect_encoding(filename)

        try:
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(f.read())
        except UnicodeDecodeError:
            encoding = detect_encoding(filename)
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(f.read())

    words, tok = dir_tokenize(files,
                              filenames,
                              chunk_name=chunk_name,
                              paragraphs=False,
                              verbose=verbose,
                              simple=simple,
                              tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c,
                           nltk_stop=nltk_stop,
                           freq=stop_freq,
                           add_stop=add_stop)
    return c
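
A minimal usage sketch (the directory path and printed output are illustrative, not part of the library):

# Hypothetical: build a Corpus from every text file under a directory tree,
# using the default document-level chunking and stoplist settings.
corpus = walk_corpus('data/plain_text')   # hypothetical path
print(corpus.context_types)               # context levels produced by dir_tokenize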
Example #2
    def test_LdaCgsQuerySampler_init(self):

        old_corp = Corpus([], remove_empty=False)
        old_corp.corpus = np.array([0, 1, 1, 0, 0, 1], dtype='i')
        old_corp.context_data = [
            np.array([(3, ), (3, )], dtype=[('idx', 'i')])
        ]
        old_corp.context_types = ['document']
        old_corp.words = np.array(['0', '1'], dtype='i')
        old_corp.words_int = {'0': 0, '1': 1}

        new_corp = Corpus([], remove_empty=False)
        new_corp.corpus = np.array([0, 0], dtype='i')
        new_corp.context_data = [np.array([(2, )], dtype=[('idx', 'i')])]
        new_corp.context_types = ['document']
        new_corp.words = np.array(['0', '1'], dtype='i')
        new_corp.words_int = {'0': 0, '1': 1}

        m = LdaCgsSeq(corpus=old_corp, context_type='document', K=2, V=2)
        m.Z[:] = np.array([0, 0, 0, 1, 1, 1], dtype='i')
        m.word_top[:] = np.array([[1.01, 2.01], [2.01, 1.01]], dtype='d')
        m.top_doc[:] = np.array([[3.01, 0.01], [0.01, 3.01]], dtype='d')
        m.inv_top_sums[:] = 1. / m.word_top.sum(0)

        q = LdaCgsQuerySampler(m, new_corpus=new_corp, old_corpus=old_corp)
        self.assertTrue(q.V == 2)
        self.assertTrue(q.K == 2)
        self.assertTrue(len(q.corpus) == 2)
        self.assertTrue((q.corpus == new_corp.corpus).all())
        self.assertTrue(len(q.indices) == 1)
        self.assertTrue(
            (q.indices == new_corp.view_metadata('document')['idx']).all())
        self.assertTrue(q.word_top.shape == (2, 2))
        self.assertTrue((q.word_top == m.word_top).all())
        self.assertTrue(q.top_doc.shape == (2, 1))
        self.assertTrue((q.top_doc == np.array([[0.01], [0.01]],
                                               dtype=q.top_doc.dtype)).all())
        self.assertTrue(q.inv_top_sums.shape == (2, ))
        self.assertTrue((q.inv_top_sums == m.inv_top_sums).all())
        self.assertTrue(q.alpha.shape == (2, 1))
        self.assertTrue((q.alpha == m.alpha).all())
        self.assertTrue(q.beta.shape == (2, 1))
        self.assertTrue((q.beta == m.beta).all())
Example #3
def empty_corpus(context_type='document'):
    """
    Creates an empty Corpus with defined context_type.

    :param context_type: A type of tokenization. Default is 'document'.
    :type context_type: string

    :returns: An empty Corpus with no words or context_data.

    :See Also: :class:`vsm.corpus.Corpus`
    """
    return Corpus([],
                  context_data=[np.array([], dtype=[('idx', int)])],
                  context_types=[context_type])
Example #4
def corpus_fromlist(ls, context_type='context', remove_empty=True):
    """
    Takes a list of lists or arrays containing strings or integers and
    returns a Corpus object. The label associated with a given context
    is `context_type` prepended to the context index.
    
    :param ls: List of lists or List of arrays containing strings or integers.
    :type ls: list

    :param context_type: A type of tokenization.
    :type context_type: string, optional

    :returns: A Corpus object built from `ls`.

    :See Also: :class:`vsm.corpus.Corpus`

    **Examples**

    >>> ls = [['a', 'b'], ['c'], ['d', 'e']]
    >>> c = corpus_fromlist(ls, context_type='sentence')
    >>> c.view_contexts('sentence', as_strings=True)
    [array(['a', 'b'], dtype='|S1'),
     array(['c'], dtype='|S1'),
     array(['d', 'e'], dtype='|S1')]
    >>> c.context_data
    [array([(2, 'sentence_0'), (3, 'sentence_1'), (5, 'sentence_2')], 
          dtype=[('idx', '<i8'), ('sentence_label', '|S10')])]
    """
    corpus = chain.from_iterable(ls)  #[w for ctx in ls for w in ctx]
    indices = np.cumsum([len(sbls) for sbls in ls])

    metadata = ['{0}_{1}'.format(context_type, i) for i in range(len(indices))]
    md_type = np.object_
    dtype = [('idx', int), (context_type + '_label', md_type)]
    context_data = [np.array(list(zip(indices, metadata)), dtype=dtype)]

    return Corpus(corpus,
                  context_data=context_data,
                  context_types=[context_type],
                  words_corpus=chain.from_iterable(copy(ctx) for ctx in ls),
                  remove_empty=remove_empty)
Example #5
def corpus_from_strings(strings,
                        metadata=[],
                        decode=False,
                        nltk_stop=True,
                        stop_freq=0,
                        add_stop=None,
                        tokenizer=word_tokenize):
    """
    Takes a list of strings and returns a Corpus object in which each
    string is tokenized and treated as a single document.

    :param tokenizer: word tokenization function. Defaults to `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens

    """
    if decode:
        for i in range(len(strings)):
            if isinstance(strings[i], str):
                strings[i] = unidecode(strings[i])

    documents = [tokenizer(s) for s in strings]
    corpus = sum(documents, [])
    indices = np.cumsum([len(d) for d in documents])
    del documents

    if len(metadata) == 0:
        metadata = ['document_{0}'.format(i) for i in range(len(strings))]
    md_type = np.object_
    dtype = [('idx', int), ('document_label', md_type)]
    context_data = [np.array(list(zip(indices, metadata)), dtype=dtype)]

    c = Corpus(corpus, context_data=context_data, context_types=['document'])
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c,
                           nltk_stop=nltk_stop,
                           freq=stop_freq,
                           add_stop=add_stop)
    return c
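
A short sketch of calling corpus_from_strings on in-memory documents (the strings and labels are illustrative):

docs = ['The cat sat on the mat.', 'The dog chased the cat.']
labels = ['cat_doc', 'dog_doc']   # optional; defaults to document_0, document_1, ...
c = corpus_from_strings(docs, metadata=labels, nltk_stop=True, stop_freq=0)
print(c.view_metadata('document')['document_label'])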
Example #6
def coll_corpus(coll_dir,
                encoding='utf8',
                ignore=IGNORE,
                nltk_stop=True,
                stop_freq=1,
                add_stop=None,
                decode=False,
                verbose=1,
                simple=False,
                tokenizer=word_tokenize):
    """
    `coll_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    It will also strip punctuation and arabic numerals outside the
    range 1-29. All letters are made lowercase.

    :param coll_dir: Directory containing a collection of books
        which contain pages as plain-text files.
    :type coll_dir: string-like
    
    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the encoding.
        Default is `utf8`.
    :type encoding: string, optional
    
    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is ['.json',
        '.log','.pickle', '.DS_Store'].
    :type ignore: list of strings, optional

    :param nltk_stop: If `True` then the corpus object is masked
        using the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional
    
    :param stop_freq: The upper bound for a word to be masked on 
        the basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional
    
    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    
    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :param verbose: Verbosity level. 1 prints a progress bar.
    :type verbose: int, default 1 

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the plain-text files
        in `coll_dir`. Document tokens are named `documents`.
    """
    books = []
    book_names = os.listdir(coll_dir)
    book_names = filter_by_suffix(book_names, ignore)
    book_names.sort()

    for book_name in book_names:
        pages = []
        book_path = os.path.join(coll_dir, book_name)
        page_names = os.listdir(book_path)
        page_names = filter_by_suffix(page_names, ignore)
        page_names.sort()

        for page_name in page_names:
            page_file = book_name + '/' + page_name
            page_name = os.path.join(book_path, page_name)
            if encoding == 'detect':
                encoding = detect_encoding(page_name)
            try:
                if decode:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((unidecode(f.read()), page_file))
                else:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((f.read(), page_file))
            except UnicodeDecodeError:
                encoding = detect_encoding(page_name)
                if decode:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((unidecode(f.read()), page_file))
                else:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((f.read(), page_file))

        books.append(pages)

    words, tok = coll_tokenize(books,
                               book_names,
                               simple=simple,
                               tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    in_place_stoplist(c,
                      nltk_stop=nltk_stop,
                      freq=stop_freq,
                      add_stop=add_stop)

    return c
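
coll_corpus expects a two-level layout: one subdirectory per book inside the collection directory, each holding its pages as plain-text files. A hedged sketch with hypothetical paths:

# coll_dir/
#   book_A/  0001.txt  0002.txt  ...
#   book_B/  0001.txt  ...
c = coll_corpus('coll_dir', nltk_stop=True, stop_freq=1)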
Example #7
def random_corpus(corpus_len,
                  n_words,
                  min_token_len,
                  max_token_len,
                  context_type='document',
                  metadata=False,
                  seed=None):
    """
    Generates a random integer corpus.

    :param corpus_len: Size of the Corpus.
    :type corpus_len: int

    :param n_words: Number of words to draw random integers from.
    :type n_words: int

    :param min_token_len: minimum token length used to create indices
        for corpus.
    :type min_token_len: int

    :param max_token_len: maximum token length used to create indices
        for corpus.
    :type max_token_len: int

    :param context_type: A type of tokenization. Default is 'document'.
    :type context_type: string, optional

    :param metadata: If `True` generates metadata. If `False` the only
        metadata for the corpus is the index information.
    :type metadata: boolean, optional
    
    :param seed: Seed for the random number generator, making the output
        reproducible. Default is `None`.
    :type seed: int, optional

    :returns: Corpus object with random integers as its entries. 

    :See Also: :class:`vsm.corpus.Corpus`
    """
    random_state = np.random.RandomState(seed)
    corpus = random_state.randint(n_words, size=corpus_len)
    corpus = [str(word) for word in corpus]

    indices = []
    i = random_state.randint(min_token_len, max_token_len)
    while i < corpus_len:
        indices.append(i)
        i += random_state.randint(min_token_len, max_token_len)
    indices.append(corpus_len)

    if metadata:
        metadata_ = [
            '{0}_{1}'.format(context_type, i) for i in range(len(indices))
        ]
        dtype = [('idx', np.array(indices).dtype),
                 (context_type + '_label', np.object_)]
        rand_tok = np.array(list(zip(indices, metadata_)), dtype=dtype)
    else:
        rand_tok = np.array([(i, ) for i in indices],
                            dtype=[('idx', np.array(indices).dtype)])

    return Corpus(corpus,
                  context_types=[context_type],
                  context_data=[rand_tok])
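
A sketch of generating a small, reproducible random corpus (the numbers are illustrative):

c = random_corpus(corpus_len=100, n_words=20,
                  min_token_len=5, max_token_len=10,
                  metadata=True, seed=42)
print(len(c.view_contexts('document')))   # number of randomly sized 'document' chunks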
Example #8
def dir_corpus(plain_dir,
               chunk_name='article',
               encoding='utf8',
               paragraphs=True,
               ignore=IGNORE,
               nltk_stop=True,
               stop_freq=1,
               add_stop=None,
               decode=False,
               verbose=1,
               simple=False,
               tokenizer=word_tokenize):
    """
    `dir_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    `dir_corpus` will retain file-level tokenization and perform
    sentence and word tokenizations. Optionally, it will provide
    paragraph-level tokenizations.

    It will also strip punctuation and arabic numerals outside the
    range 1-29. All letters are made lowercase.

    :param plain_dir: String containing directory containing a 
        plain-text corpus.
    :type plain_dir: string-like
    
    :param chunk_name: The name of the tokenization corresponding
        to individual files. For example, if the files are pages
        of a book, one might set `chunk_name` to `pages`. Default
        is `article`.
    :type chunk_name: string-like, optional
    
    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the encoding.
        Default is `utf8`.
    :type encoding: string, optional
    
    :param paragraphs: If `True`, a paragraph-level tokenization 
        is included. Defaults to `True`.
    :type paragraphs: boolean, optional
    
    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is ['.json',
        '.log','.pickle', '.DS_Store'].
    :type ignore: list of strings, optional

    :param nltk_stop: If `True` then the corpus object is masked
        using the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional
    
    :param stop_freq: The upper bound for a word to be masked on 
        the basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional
    
    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    
    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :param verbose: Verbosity level. 1 prints a progress bar.
    :type verbose: int, default 1 

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.
    
    :See Also: :class:`vsm.corpus.Corpus`, 
            :meth:`dir_tokenize`, 
            :meth:`vsm.corpus.util.apply_stoplist`
    """
    chunks = []
    filenames = os.listdir(plain_dir)
    filenames = filter_by_suffix(filenames, ignore)
    filenames.sort()

    for filename in filenames:
        filename = os.path.join(plain_dir, filename)
        if encoding == 'detect':
            encoding = detect_encoding(filename)
        try:
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())
        except UnicodeDecodeError:
            encoding = detect_encoding(filename)
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())

    words, tok = dir_tokenize(chunks,
                              filenames,
                              chunk_name=chunk_name,
                              paragraphs=paragraphs,
                              verbose=verbose,
                              simple=simple,
                              tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c,
                           nltk_stop=nltk_stop,
                           freq=stop_freq,
                           add_stop=add_stop)
    return c
Example #9
def json_corpus(json_file,
                doc_key,
                label_key,
                encoding='utf8',
                nltk_stop=False,
                stop_freq=0,
                add_stop=None,
                tokenizer=word_tokenize):
    """
    `json_corpus` is a convenience function for generating Corpus
    objects from a JSON file. It constructs a corpus, document labels,
    and metadata from the specified fields in the JSON file.

    `json_corpus` will perform word-level tokenization. 
    It will also strip punctuation and arabic numerals
    outside the range 1-29. All letters are made lowercase.

    :param json_file: Json file name containing documents and metadata.
    :type json_file: string-like
    
    :param doc_key: Name of the key for documents.
    :type doc_key: string-like

    :param label_key: Name of the key used for document labels. Labels are
        used when a viewer function outputs a list of documents. Any field
        other than `doc_key` and `label_key` is stored as metadata.
    :type label_key: string-like

    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the encoding.
        Default is `utf8`.
    :type encoding: string, optional
    
    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `False`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on the basis of its
        collection frequency. Default is 0.
    :type stop_freq: int, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    
    :param tokenizer: word tokenization function. Defaults to `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`vsm.corpus.Corpus`, 
        :meth:`vsm.corpus.util.paragraph_tokenize`, 
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    import json

    if encoding == 'detect':
        encoding = detect_encoding(json_file)
    with open(json_file, 'r', encoding=encoding) as f:
        json_data = json.load(f)

    docs = []
    label = []
    metadata = []
    for i in json_data:
        # encode/decode round-trip strips non-ASCII characters but keeps str for the tokenizer
        docs.append(i.pop(doc_key, None).encode('ascii', 'ignore').decode('ascii'))
        label.append(i.pop(label_key, None))
        metadata.append(i)  # metadata are all the rest

    docs = [tokenizer(d) for d in docs]

    corpus = sum(docs, [])
    tok = np.cumsum(np.array([len(d) for d in docs]))

    # add document label and metadata
    dtype = [('idx', np.array(tok).dtype), ('document_label', np.object_),
             ('metadata', np.array(metadata).dtype)
             ]  # todo: create separate dtype for each key?
    tok = np.array(list(zip(tok, label, metadata)), dtype=dtype)

    c = Corpus(corpus, context_data=[tok], context_types=['document'])
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c,
                           nltk_stop=nltk_stop,
                           freq=stop_freq,
                           add_stop=add_stop)
    return c
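
A sketch with a hypothetical JSON file in which each record carries the document text, a label, and extra fields that become metadata:

# docs.json (hypothetical):
# [{"text": "...", "title": "Doc 1", "year": 1990},
#  {"text": "...", "title": "Doc 2", "year": 1991}]
c = json_corpus('docs.json', doc_key='text', label_key='title')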
Example #10
def file_corpus(filename,
                encoding='utf8',
                nltk_stop=True,
                stop_freq=1,
                add_stop=None,
                decode=False,
                simple=False,
                tokenizer=word_tokenize):
    """
    `file_corpus` is a convenience function for generating Corpus
    objects from a plain-text corpus contained in a single file.

    `file_corpus` will strip punctuation and arabic numerals outside
    the range 1-29. All letters are made lowercase.

    :param filename: File name of the plain text file.
    :type filename: string-like

    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the encoding.
        Default is `utf8`.
    :type encoding: string, optional
    
    :param nltk_stop: If `True` then the corpus object is masked
        using the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional
    
    :param stop_freq: The upper bound for a word to be masked on 
        the basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional
    
    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    
    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.
    
    :See Also: :class:`vsm.corpus.Corpus`, 
        :meth:`file_tokenize`, 
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    if encoding == 'detect':
        encoding = detect_encoding(filename)
    try:
        with open(filename, mode='r', encoding=encoding) as f:
            text = f.read()
    except UnicodeDecodeError:
        # retry with a detected encoding so `text` is always defined
        encoding = detect_encoding(filename)
        with open(filename, mode='r', encoding=encoding) as f:
            text = f.read()

    if decode:
        text = unidecode(text)

    words, tok = file_tokenize(text, simple=simple, tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c,
                           nltk_stop=nltk_stop,
                           freq=stop_freq,
                           add_stop=add_stop)
    return c
Example #11
def toy_corpus(plain_corpus,
               is_filename=False,
               encoding='utf8',
               nltk_stop=False,
               stop_freq=0,
               add_stop=None,
               decode=False,
               metadata=None,
               autolabel=False,
               tokenizer=word_tokenize,
               simple=False):
    """
    `toy_corpus` is a convenience function for generating Corpus
    objects from a given string or a single file.

    `toy_corpus` will perform both word and document-level
    tokenization. It will also strip punctuation and arabic numerals
    outside the range 1-29. All letters are made lowercase.

    Document tokens are delimited by two or more line breaks. E.g.,

        <document 0>

        <document 1>

        ...

        <document n>

    where <document i> is any chunk of text to be tokenized by word.

    :param plain_corpus: String containing a plain-text corpus or a 
        filename of a file containing one.
    :type plain_corpus: string-like
    
    :param is_filename: If `True` then `plain_corpus` is treated like
        a filename. Otherwise, `plain_corpus` is presumed to contain 
        the corpus. Default is `False`.
    :type is_filename: boolean, optional

    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the encoding.
        Default is `utf8`.
    :type encoding: string, optional
    
    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `False`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on the basis of its
        collection frequency. Default is 0.
    :type stop_freq: int, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    
    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :param metadata: A list of strings providing metadata about the documents. If
        provided, must have length equal to the number of documents.
        Default is `None`.
    :type metadata: array-like, optional
    
    :param autolabel: A boolean specifying whether to automatically label
        documents by position in file. Default is `False`.
    :type autolabel: boolean, optional
    
    :param tokenizer: word tokenization function. Defaults to `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens
    
    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`vsm.corpus.Corpus`, 
        :meth:`vsm.corpus.util.paragraph_tokenize`, 
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    if is_filename:
        if encoding == 'detect':
            encoding = detect_encoding(plain_corpus)

        with open(plain_corpus, 'r', encoding=encoding) as f:
            plain_corpus = f.read()

    if decode:
        plain_corpus = unidecode(plain_corpus)

    docs = paragraph_tokenize(plain_corpus)
    docs = [tokenizer(d) for d in docs]

    corpus = sum(docs, [])
    tok = np.cumsum(np.array([len(d) for d in docs]))

    if not metadata and autolabel:
        metadata = ['Document {0}'.format(i) for i in range(len(tok))]

    if metadata:
        if not len(metadata) == len(tok):
            msg = 'Metadata mismatch: metadata length is {0} and number '\
                  'of documents is {1}'.format(len(metadata), len(tok))
            raise Exception(msg)
        else:
            md_type = np.object_
            dtype = [('idx', np.array(tok).dtype), ('document_label', md_type)]
            tok = np.array(list(zip(tok, metadata)), dtype=dtype)
    else:
        dtype = [('idx', np.array(tok).dtype)]
        tok = np.array([(i, ) for i in tok], dtype=dtype)

    c = Corpus(corpus, context_data=[tok], context_types=['document'])
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c,
                           nltk_stop=nltk_stop,
                           freq=stop_freq,
                           add_stop=add_stop)
    return c
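
A sketch showing the blank-line document delimiter described in the docstring (the text is illustrative):

text = "A first toy document.\n\nA second toy document, separated by a blank line."
c = toy_corpus(text, autolabel=True)
print(c.view_metadata('document')['document_label'])   # roughly ['Document 0', 'Document 1']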
Example #12
def dir_corpus(plain_dir, chunk_name='article', encoding='utf8', 
               paragraphs=True, word_len=2, nltk_stop=True, stop_freq=1, 
               add_stop=None, corpus_sent=True, 
               ignore=['.log', '.pickle', '.xml'], decode=False, simple=False):
    """
    `dir_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    `dir_corpus` will retain file-level tokenization and perform
    sentence and word tokenizations. Optionally, it will provide
    paragraph-level tokenizations.

    It will also strip punctuation and arabic numerals outside the
    range 1-29. All letters are made lowercase.

    :param plain_dir: String containing directory containing a 
        plain-text corpus.
    :type plain_dir: string-like
    
    :param chunk_name: The name of the tokenization corresponding
        to individual files. For example, if the files are pages
        of a book, one might set `chunk_name` to `pages`. Default
        is `article`.
    :type chunk_name: string-like, optional
    
    :param paragraphs: If `True`, a paragraph-level tokenization 
        is included. Defaults to `True`.
    :type paragraphs: boolean, optional
    
    :param word_len: Filters words whose lengths are <= word_len.
        Default is 2.
    :type word_len: int, optional

    :param nltk_stop: If `True` then the corpus object is masked
        using the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional
    
    :param stop_freq: The upper bound for a word to be masked on 
        the basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional

    :param corpus_sent: If `True` a CorpusSent object is returned.
        Otherwise Corpus object is returned. Default is `True`. 
    :type corpus_sent: boolean, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional

    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is ['.log',
        '.pickle', '.xml'].
    :type ignore: list of strings, optional

    :returns: c : Corpus or CorpusSent
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.
    
    :See Also: :class:`Corpus`, :class:`CorpusSent`, :meth:`dir_tokenize`,
        :meth:`apply_stoplist`
    """
    chunks = []
    filenames = os.listdir(plain_dir)
    filenames = filter_by_suffix(filenames, ignore)
    filenames.sort()

    for filename in filenames:
        filename = os.path.join(plain_dir, filename)
        if encoding == 'detect':
            encoding = detect_encoding(filename)
        try:
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())
        except UnicodeDecodeError:
            encoding = detect_encoding(filename)
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())

    words, tok, sent = dir_tokenize(chunks, filenames, chunk_name=chunk_name,
                                    paragraphs=paragraphs)
    names, data = zip(*tok.items())
    
    if corpus_sent:
        c = CorpusSent(words, sent, context_data=data, context_types=names,
                       remove_empty=False)
    else:
        c = Corpus(words, context_data=data, context_types=names)
    
    in_place_stoplist(c, nltk_stop=nltk_stop, add_stop=add_stop, freq=stop_freq)

    return c
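
A brief sketch of the CorpusSent path (the directory is hypothetical); with corpus_sent=False a plain Corpus is returned instead:

cs = dir_corpus('articles/', chunk_name='article', corpus_sent=True)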