Example #1
def texts2json(ids, names, field, text_docs):
    """Convert a set of text documents into a
    JSON array of document objects."""

    docs = []

    names = read_names(names)
    ids = read_names(ids)

    for idx, path in enumerate(text_docs):
        # click.open_file handles regular paths as well as '-' for stdin
        # and closes the handle when the block exits
        with click.open_file(path, "r") as tokens_doc:
            content = tokens_doc.read()

        # ordered so that these attributes stay at the top
        doc = OrderedDict()

        # use the supplied id/name when one exists for this document,
        # otherwise fall back to the file path
        if idx < len(ids):
            doc["id"] = ids[idx]
        else:
            doc["id"] = path

        if idx < len(names):
            doc["name"] = names[idx]
        else:
            doc["name"] = path

        doc[field] = content
        docs.append(doc)

    out_content = json.dumps(docs, indent=2)
    output(out_content)
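All of the snippets in this listing lean on a few textkit helpers (output, read_tokens, read_names) and on imports such as json, click, OrderedDict, and nltk that the excerpts omit. The following is a minimal sketch of what those helpers might look like, inferred only from how they are used here; treat it as an assumption, not the project's actual implementation.

# Assumed stand-ins for the textkit helpers used throughout these examples;
# inferred from usage, not taken from the real library.
import click


def output(line):
    # every example emits one result per line
    click.echo(line)


def read_tokens(tokens_doc):
    # assumed format: one token per line in an already-open file or stdin
    return [line.strip() for line in tokens_doc if line.strip()]


def read_names(names_path):
    # assumed: optional file of ids/names, one per line; empty list if not given
    if not names_path:
        return []
    with click.open_file(names_path) as handle:
        return [line.strip() for line in handle if line.strip()]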
Example #2
def tokens2stem(tokens, algorithm):
    '''Stem a list of tokens to get their root.'''
    content = read_tokens(tokens)
    stemmer = ALGOS[algorithm]()

    if algorithm == 'wordnet':
        for token in content:
            output(stemmer.lemmatize(token))
    else:
        for token in content:
            output(stemmer.stem(token))
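ALGOS is defined elsewhere in the project. Given the call ALGOS[algorithm]() and the special case for 'wordnet' (which lemmatizes rather than stems), it is presumably a mapping from algorithm names to NLTK classes, roughly as below; the exact keys are an assumption.

# Assumed mapping of algorithm names to NLTK stemmer/lemmatizer classes.
import nltk

ALGOS = {
    'porter': nltk.stem.PorterStemmer,
    'lancaster': nltk.stem.LancasterStemmer,
    'wordnet': nltk.stem.WordNetLemmatizer,  # exposes lemmatize() instead of stem()
}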
Example #3
def words2ngrams(sep, length, tokens):
    '''Tokenize words into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, length))
    [output(sep.join(ngram)) for ngram in ngrams]
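For reference, nltk.ngrams yields tuples of consecutive tokens, and sep.join flattens each tuple into a single output line:

import nltk

tokens = ['the', 'quick', 'brown', 'fox']
print(list(nltk.ngrams(tokens, 3)))
# [('the', 'quick', 'brown'), ('quick', 'brown', 'fox')]
print([' '.join(gram) for gram in nltk.ngrams(tokens, 3)])
# ['the quick brown', 'quick brown fox']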
Example #4
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    bigrams = list(nltk.bigrams(content))
    [output(sep.join(bigram)) for bigram in bigrams]
Example #5
def filterwords(language, custom, tokens):
    '''Remove stop words from tokens, returning tokens without stop words.'''
    content = read_tokens(tokens)
    stopwords = get_stopwords(language)
    if custom:
        stopwords = stopwords + read_tokens(custom)

    [output(token) for token in content if token.lower() not in stopwords]
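get_stopwords is another project helper that is not shown here. A plausible minimal version built on NLTK's stopword corpus might look like the sketch below (an assumption; textkit may ship its own word lists):

# Assumed helper: return the stop word list for a language, e.g. 'english'.
import nltk


def get_stopwords(language):
    return nltk.corpus.stopwords.words(language)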
Example #6
def tokens2pos(sep, tokens):
    '''Tokenize words into their parts of speech. Each item
    is the original word with its role as the second part
    of the item. Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    nltk.data.path.append(data_item())
    tags = nltk.pos_tag(content)
    [output("{},{}".format(t[0], t[1])) for t in tags]
Example #7
def top_bigrams(sep, measure, freq, scores, tokens):
    '''Find top most interesting bi-grams in a token document.
    Uses the --measure argument to determine what measure to use to define
    'interesting'.
    '''

    content = read_tokens(tokens)
    bcf = nltk.collocations.BigramCollocationFinder.from_words(content)
    bcf.apply_freq_filter(freq)

    nltk_measure = MEASURES[measure]
    bigrams = bcf.score_ngrams(nltk_measure)

    out = [b[0] for b in bigrams]
    if scores:
        out = [b[0] + tuple([str(b[1])]) for b in bigrams]
    [output(sep.join(line)) for line in out]
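MEASURES is defined elsewhere in the project. Since score_ngrams expects an NLTK association measure, the --measure argument is presumably mapped onto nltk.collocations.BigramAssocMeasures, along these lines (the key names are an assumption):

# Assumed mapping of --measure values to NLTK bigram association measures.
import nltk

bigram_measures = nltk.collocations.BigramAssocMeasures()
MEASURES = {
    'pmi': bigram_measures.pmi,
    'chi_sq': bigram_measures.chi_sq,
    'likelihood': bigram_measures.likelihood_ratio,
    'raw_freq': bigram_measures.raw_freq,
}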
Example #8
def transliterate(file):
    '''Convert international text to ascii.'''
    content = ''.join(file.readlines())
    try:
        content = content.decode(chardet.detect(content)['encoding'])
    except AttributeError:
        # Strings do not have a decode method in python 3.
        pass
    output(unidecode(content).encode('ascii', 'ignore'))
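chardet.detect guesses the encoding of a byte string and unidecode maps non-ASCII characters to rough ASCII equivalents, which is all this command does:

import chardet
from unidecode import unidecode

raw = 'Café déjà vu'.encode('utf-8')
guess = chardet.detect(raw)              # e.g. {'encoding': 'utf-8', ...}
text = raw.decode(guess['encoding'] or 'utf-8')
print(unidecode(text))                   # Cafe deja vu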
Example #9
def text2punc(text):
    '''Tokenize text into punctuation tokens.
    Words and numbers are removed, leaving only punctuation.'''

    # from: http://stackoverflow.com/questions/17485092/how-to-just-keep-punctuation-with-a-string-in-python

    content = '\n'.join([open(f).read() for f in text])
    out = re.sub(r'[^{}]+'.format(punctuation), ' ', content)
    out = out.split()
    [output(p) for p in out]
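The substitution replaces every run of non-punctuation characters with a single space, so only punctuation marks survive. A quick check of the pattern on a made-up string:

import re
from string import punctuation

sample = 'Hello, world! (Really?) Yes... end.'
out = re.sub(r'[^{}]+'.format(punctuation), ' ', sample)
print(out.split())  # [',', '!', '(', '?)', '...', '.']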
Example #10
def text2sentences(text):
    '''Tokenize text into sentence tokens.'''
    content = '\n'.join([open(f).read() for f in text])
    sentences = []
    try:
        sentences = sent_tokenize(content)
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(s.strip()) for s in sentences]
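sent_tokenize here is NLTK's Punkt sentence tokenizer (from nltk.tokenize), which is why the command suggests running "textkit download" when its model data is missing. For example:

from nltk.tokenize import sent_tokenize

print(sent_tokenize('Dr. Smith arrived. He sat down.'))
# ['Dr. Smith arrived.', 'He sat down.']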
Example #11
def tokens2json(ids, names, field, split, sep, token_docs):
    '''Convert a set of token documents into a
    JSON array of document objects.'''

    docs = []

    names = read_names(names)
    ids = read_names(ids)

    for idx, path in enumerate(token_docs):
        if path == '-':
            tokens_doc = sys.stdin
        else:
            tokens_doc = open(path, 'r')
        if split:
            content = read_csv(tokens_doc, sep)
            content = coerce_types(content)
        else:
            content = read_tokens(tokens_doc)

        # ordered so that these attributes stay at the top
        doc = OrderedDict()

        # use the supplied id/name when one exists for this document,
        # otherwise fall back to the file path
        if idx < len(ids):
            doc['id'] = ids[idx]
        else:
            doc['id'] = path

        if idx < len(names):
            doc['name'] = names[idx]
        else:
            doc['name'] = path

        doc[field] = content
        docs.append(doc)
        if tokens_doc is not sys.stdin:
            tokens_doc.close()

    out_content = json.dumps(docs, indent=2)
    output(out_content)
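The output is a JSON array of objects whose keys are id, name, and whatever field name was given; a minimal illustration of the shape (values are made up):

# Illustrative only: shape of the emitted JSON for one token document.
import json
from collections import OrderedDict

doc = OrderedDict([('id', 'docs/a.txt'), ('name', 'docs/a.txt'),
                   ('tokens', ['the', 'quick', 'brown', 'fox'])])
print(json.dumps([doc], indent=2))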
Example #12
def text2words(text):
    '''Tokenize text into word tokens.
    Punctuation is considered as a separate token.'''
    content = '\n'.join([open(f).read() for f in text])
    tokens = []
    try:
        tokens = nltk.word_tokenize(content)
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(token) for token in tokens]
Example #13
def words2bigrams(sep, tokens):
    """Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token."""

    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message='Have you run "textkit download"?', nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(sep.join(bigram)) for bigram in bigrams]
Example #14
def filterpunc(tokens):
    '''Remove tokens that are only punctuation from a list of tokens.'''
    content = read_tokens(tokens)
    [output(token) for token in content if token not in punctuation]
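punctuation here (and in text2punc above) is presumably string.punctuation from the standard library; note that the membership test only catches single-character tokens:

from string import punctuation

print(punctuation)           # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
print('.' in punctuation)    # True
print('...' in punctuation)  # False, so multi-character punctuation tokens pass through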
Example #15
def nonewlines(text):
    '''Remove newlines from a text file.'''
    content = '\n'.join([open(f).read() for f in text])
    content = re.sub('\n|\r\n|\r', ' ', content).strip()
    output(content)
Example #16
File: words.py Project: justalfred/textkit
def text2words(text):
    '''Tokenize text into word tokens.
    Punctuation is considered as a separate token.'''
    content = '\n'.join([open(f).read() for f in text])
    tokens = nltk.word_tokenize(content)
    [output(token) for token in tokens]
Example #17
def uppercase(tokens):
    '''Transform all tokens to uppercase.'''
    content = read_tokens(tokens)
    [output(token.upper()) for token in content]
Example #18
def filterlengths(minimum, tokens):
    '''Remove tokens that are shorter than the minimum length provided.'''
    content = read_tokens(tokens)
    [output(token) for token in content if len(token) >= minimum]
Example #19
def tokens2text(sep, tokens):
    '''Combine tokens in a token document into a single text file.'''

    content = read_tokens(tokens)
    out = sep.join(content)
    output(out)
Example #20
def nonewlines(text):
    """Remove newlines from a text file."""
    content = "\n".join([open(f).read() for f in text])
    content = content.replace("\n", " ").strip()
    output(content)
Example #21
def showstops(language):
    '''Display stop words used by textkit for a given language.'''
    stopwords = get_stopwords(language)

    [output(token) for token in stopwords]
Example #22
def text2sentences(text):
    '''Tokenize text into sentence tokens.'''
    content = '\n'.join([open(f).read() for f in text])
    sentences = sent_tokenize(content)
    [output(s.strip()) for s in sentences]
Example #23
def tokens2lower(tokens):
    '''Transform all tokens to lowercase.'''
    content = read_tokens(tokens)
    [output(token.lower()) for token in content]