Example #1
def filterwords(language, custom, tokens):
    '''Remove stop words from tokens, returning tokens without stop words.'''
    content = read_tokens(tokens)
    stopwords = get_stopwords(language)
    if custom:
        stopwords = stopwords + read_tokens(custom)

    [output(token) for token in content
        if token.lower() not in stopwords]
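
All of the snippets on this page lean on a handful of shared helpers (read_tokens, output, write_csv) from what appears to be the textkit command-line toolkit (note the "textkit download" hint in the error messages further down); none of them are shown here. Below is a minimal sketch of how they are assumed to behave, one token per line in and one item per line out. This is an illustration, not the library's actual implementation.

import csv
import sys

def read_tokens(file_handle):
    # Assumption: input tokens arrive one per line; blank lines are dropped.
    return [line.strip() for line in file_handle if line.strip()]

def output(text):
    # Assumption: results are written one item per line to stdout.
    sys.stdout.write(text + '\n')

def write_csv(rows, delimiter):
    # Assumption: rows of fields are written to stdout with the given separator.
    writer = csv.writer(sys.stdout, delimiter=delimiter)
    writer.writerows(rows)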
Example #2
def words2ngrams(sep, length, tokens):
    '''Tokenize words into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, length))
    [output(sep.join(ngram)) for ngram in ngrams]
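
For reference, nltk.ngrams slides a window of the given length over the token list and yields tuples, which the comprehension then joins with the separator. A self-contained illustration (the sample tokens are made up):

import nltk

words = ['the', 'quick', 'brown', 'fox']   # illustrative tokens
for ngram in nltk.ngrams(words, 3):        # window of length 3
    print(' '.join(ngram))
# the quick brown
# quick brown fox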
Example #3
def test_read_tokens():
    with open('test_data/word_tokens.txt', 'r') as f:
        tokens = read_tokens(f)

    assert len(tokens) == 6
Example #4
def words2ngrams(sep, num, tokens):
    '''Convert word tokens into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, num))
    write_csv(ngrams, str(sep))
Example #5
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    bigrams = list(nltk.bigrams(content))
    [output(sep.join(bigram)) for bigram in bigrams]
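
nltk.bigrams is simply the two-token case of nltk.ngrams, so this command is equivalent to the n-gram variant above with length 2. A quick check (sample tokens are made up):

import nltk

words = ['to', 'be', 'or', 'not', 'to', 'be']
assert list(nltk.bigrams(words)) == list(nltk.ngrams(words, 2))
print(list(nltk.bigrams(words))[:2])   # [('to', 'be'), ('be', 'or')]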
Example #6
def tokens2pos(sep, tokens):
    '''Tokenize words into their parts of speech. Each item
    is the original word with its role as the second part
    of the item. Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    nltk.data.path.append(data_item())
    tags = nltk.pos_tag(content)
    [output("{},{}".format(t[0], t[1])) for t in tags]
Example #7
def tokens2pos(sep, tokens):
    '''Tokenize words into their parts of speech. Output contains the
       word token followed by its part-of-speech tag, separated by the
       character specified by --sep.
    '''

    content = read_tokens(tokens)
    nltk.data.path.append(data_item())
    tags = nltk.pos_tag(content)
    write_csv(tags, str(sep))
Example #8
def tokens2stem(tokens, algorithm):
    '''Stem a list of tokens to get their root.'''
    content = read_tokens(tokens)
    stemmer = ALGOS[algorithm]()

    if algorithm == 'wordnet':
        for token in content:
            output(stemmer.lemmatize(token))
    else:
        for token in content:
            output(stemmer.stem(token))
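
The ALGOS table is not part of this excerpt; presumably it maps the algorithm name to an NLTK stemmer or lemmatizer class, with 'wordnet' special-cased because a lemmatizer exposes lemmatize() rather than stem(). A hypothetical mapping and a quick comparison (this dictionary is an assumption, not the library's actual table):

import nltk

# Hypothetical mapping; the real ALGOS table is not shown in the excerpt.
ALGOS = {
    'porter': nltk.stem.PorterStemmer,
    'lancaster': nltk.stem.LancasterStemmer,
    'wordnet': nltk.stem.WordNetLemmatizer,
}

print(ALGOS['porter']().stem('running'))        # run
print(ALGOS['wordnet']().lemmatize('corpora'))  # corpus (needs the wordnet corpus)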
Example #9
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(sep.join(bigram)) for bigram in bigrams]
Example #10
def words2bigrams(sep, tokens):
    """Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token."""

    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message='Have you run "textkit download"?', nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(sep.join(bigram)) for bigram in bigrams]
Example #11
def tokens2counts(sep, limit, tokens):
    '''Count unique tokens in a list of tokens.
    Tokens are sorted by top counts.'''
    content = read_tokens(tokens)
    counts = sort_counts(get_counts(content))

    # we want the argument type to be an INT, but Python only
    # has a float infinity. So if the limit is negative,
    # treat it as infinite.
    if limit < 0:
        limit = float('inf')

    # using the csv writer to ensure proper encoding of the separator.
    rows = [list(map(str, vals)) for ind, vals in enumerate(counts) if ind < limit]
    write_csv(rows, str(sep))
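
get_counts and sort_counts are not shown either; the behavior in the docstring (unique tokens with counts, highest first) matches what collections.Counter already provides. A sketch under that assumption (hypothetical helpers, not the library's code):

from collections import Counter

def get_counts(tokens):
    # Assumption: map each unique token to its frequency.
    return Counter(tokens)

def sort_counts(counts):
    # Assumption: (token, count) pairs, highest counts first.
    return counts.most_common()

print(sort_counts(get_counts(['a', 'b', 'a', 'c', 'a', 'b'])))
# [('a', 3), ('b', 2), ('c', 1)]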
Example #12
def tokens2topbigrams(sep, measure, freq, scores, tokens):
    '''Find the most interesting bi-grams in a token document.
    Uses the --measure argument to determine what measure to use to define
    'interesting'.
    '''

    content = read_tokens(tokens)
    bcf = nltk.collocations.BigramCollocationFinder.from_words(content)
    bcf.apply_freq_filter(freq)

    nltk_measure = MEASURES[measure]
    bigrams = bcf.score_ngrams(nltk_measure)

    out = [b[0] for b in bigrams]
    if scores:
        out = [b[0] + tuple([str(b[1])]) for b in bigrams]
    write_csv(out, str(sep))
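
MEASURES is not shown; presumably it maps the --measure option onto association measures from nltk.collocations.BigramAssocMeasures. A hypothetical mapping plus a minimal run (the dictionary keys and the sample words are assumptions):

import nltk

bigram_measures = nltk.collocations.BigramAssocMeasures()
# Hypothetical mapping; the real MEASURES table is not shown in the excerpt.
MEASURES = {
    'pmi': bigram_measures.pmi,
    'chi_sq': bigram_measures.chi_sq,
    'likelihood': bigram_measures.likelihood_ratio,
}

words = ['strong', 'tea', 'strong', 'tea', 'powerful', 'computer']
finder = nltk.collocations.BigramCollocationFinder.from_words(words)
for bigram, score in finder.score_ngrams(MEASURES['pmi'])[:3]:
    print(bigram, score)   # highest-scoring bigrams first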
Example #13
def tokens2json(ids, names, field, split, sep, token_docs):
    '''Convert a set of token documents into a
    JSON array of document objects.'''

    docs = []

    names = read_names(names)
    ids = read_names(ids)

    for idx, path in enumerate(token_docs):
        if path == '-':
            tokens_doc = sys.stdin
        else:
            tokens_doc = open(path, 'r')
        if split:
            content = read_csv(tokens_doc, sep)
            content = coerce_types(content)
        else:
            content = read_tokens(tokens_doc)

        # ordered so that these attributes stay at the top
        doc = OrderedDict()

        if idx < len(ids) - 1:
            doc['id'] = ids[idx]
        else:
            doc['id'] = path

        if idx < len(names) - 1:
            doc['name'] = names[idx]
        else:
            doc['name'] = path

        doc[field] = content
        docs.append(doc)
        tokens_doc.close()

    out_content = json.dumps(docs, indent=2)
    output(out_content)
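
The resulting JSON is an array of objects, each carrying id, name, and the token content under the chosen field. A small illustration of the shape, built the same way (the sample values and the 'tokens' field name are made up):

import json
from collections import OrderedDict

doc = OrderedDict()
doc['id'] = 'doc1.txt'     # illustrative values
doc['name'] = 'doc1.txt'
doc['tokens'] = ['hello', 'world']

print(json.dumps([doc], indent=2))
# [
#   {
#     "id": "doc1.txt",
#     "name": "doc1.txt",
#     "tokens": [
#       "hello",
#       "world"
#     ]
#   }
# ]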
Example #14
def uppercase(tokens):
    '''Transform all tokens to uppercase.'''
    content = read_tokens(tokens)
    [output(token.upper()) for token in content]
Example #15
def filterlengths(minimum, tokens):
    '''Remove tokens that are shorter than the minimum length provided.'''
    content = read_tokens(tokens)
    [output(token) for token in content if len(token) >= minimum]
Example #16
def get_stopwords(stopword_name):
    path = data_item('/stopwords/' + stopword_name + '.txt')
    stopwords = []
    with open(path) as filename:
        stopwords = read_tokens(filename)
    return stopwords
Example #17
def read_names(names_path):
    names = []
    if names_path:
        names_doc = open(names_path, 'r')
        names = read_tokens(names_doc)
    return names
Example #18
def filterpunc(tokens):
    '''Remove tokens that are only punctuation from a list of tokens.'''
    content = read_tokens(tokens)
    [output(token) for token in content if token not in punctuation]
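
Here, punctuation is presumably string.punctuation (the import is not part of the excerpt). Note that "token not in punctuation" is a substring test against that string, so single punctuation characters are dropped while multi-character runs such as '...' pass through:

from string import punctuation

print('.' in punctuation)     # True  -> filtered out
print('...' in punctuation)   # False -> kept; '...' is not a substring of string.punctuation
print('word' in punctuation)  # False -> kept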
Example #19
def tokens2lower(tokens):
    '''Transform all tokens to lowercase.'''
    content = read_tokens(tokens)
    [output(token.lower()) for token in content]
Example #20
def tokens2text(sep, tokens):
    '''Combine tokens in a token document into a single text file.'''

    content = read_tokens(tokens)
    out = sep.join(content)
    output(out)
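
Taken together, the examples follow one pattern: read tokens one per line, transform or aggregate them, and write the result back out one item per line (or as CSV). The sketch below is an illustrative standalone composition of a few of the steps above, under the same assumptions as the helper sketch in Example #1; it is not part of the library:

import sys
from string import punctuation

def pipeline(lines, stopwords=('the', 'a', 'an')):
    # lowercase -> drop punctuation-only tokens -> drop stop words
    tokens = [line.strip().lower() for line in lines if line.strip()]
    tokens = [t for t in tokens if t not in punctuation]
    return [t for t in tokens if t not in stopwords]

if __name__ == '__main__':
    for token in pipeline(sys.stdin):
        print(token)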