Exemplo n.º 1
0
def words2ngrams(sep, num, tokens):
    '''Turn a stream of word tokens into n-grams and write them out.

    An n-gram here is a run of ``num`` consecutive tokens; punctuation
    counts as a token of its own.

    sep    -- passed (as a string) to write_csv as the separator.
    num    -- length of each n-gram, forwarded to nltk.ngrams.
    tokens -- token source handed to read_tokens.
    '''
    words = read_tokens(tokens)
    # nltk.ngrams is lazy; materialise it before handing it to the writer.
    grams = [gram for gram in nltk.ngrams(words, num)]
    write_csv(grams, str(sep))
Exemplo n.º 2
0
def words2ngrams(sep, num, tokens):
    '''Build n-grams (sequences of ``num`` adjacent word tokens) from the
    given tokens and emit them through write_csv.

    Punctuation is treated as a separate token. ``sep`` is converted to
    a string and passed to write_csv as the separator.
    '''
    token_list = read_tokens(tokens)
    ngram_iter = nltk.ngrams(token_list, num)
    # Consume the iterator up front so write_csv receives a concrete list.
    ngram_rows = list(ngram_iter)
    write_csv(ngram_rows, str(sep))
Exemplo n.º 3
0
def tokens2pos(sep, tokens):
    '''Tag each word token with its part of speech.

    Every output row holds the word token followed by its
    part-of-speech tag, separated by the character given via --sep.

    sep    -- passed (as a string) to write_csv as the separator.
    tokens -- token source handed to read_tokens.
    '''
    words = read_tokens(tokens)

    # Add the location returned by data_item() to NLTK's data search path
    # before tagging (presumably where the tagger resources live).
    extra_data_path = data_item()
    nltk.data.path.append(extra_data_path)

    tagged_words = nltk.pos_tag(words)
    write_csv(tagged_words, str(sep))
Exemplo n.º 4
0
def tokens2pos(sep, tokens):
    '''Annotate word tokens with part-of-speech tags.

    The output pairs each word token with its tag; the two are separated
    by the character specified by --sep.
    '''
    token_list = read_tokens(tokens)
    # Extend NLTK's data search path with data_item() before calling the
    # tagger.
    nltk.data.path.append(data_item())
    write_csv(nltk.pos_tag(token_list), str(sep))
Exemplo n.º 5
0
def text2ngrams(sep, num, text):
    '''Tokenize plain text into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.

    sep  -- passed (as a string) to write_csv as the separator.
    num  -- length of each n-gram, forwarded to nltk.ngrams.
    text -- iterable of file paths; their contents are joined with
            newlines before tokenizing.

    A LookupError raised while tokenizing or writing is reported with
    click.echo instead of propagating.
    '''
    # Read every input inside a context manager so each file handle is
    # closed right away instead of being left to the garbage collector.
    pieces = []
    for path in text:
        with open(path) as handle:
            pieces.append(handle.read())
    content = '\n'.join(pieces)

    try:
        tokens = nltk.word_tokenize(content)
        ngrams = list(nltk.ngrams(tokens, num))
        write_csv(ngrams, str(sep))
    except LookupError as err:
        # Print a hint about "textkit download" followed by the original
        # error text.
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
Exemplo n.º 6
0
def text2ngrams(sep, num, text):
    '''Tokenize plain text into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.

    The files named in ``text`` are read, joined with newlines, tokenized
    with nltk.word_tokenize, grouped into n-grams of length ``num`` and
    written through write_csv using ``str(sep)`` as the separator.

    If a LookupError is raised during that processing, a hint and the
    original error are printed with click.echo and the function returns
    normally.
    '''
    def _read(path):
        # Close the file deterministically; a bare open(path).read()
        # leaves the handle open until it is garbage collected.
        with open(path) as handle:
            return handle.read()

    content = '\n'.join([_read(f) for f in text])
    try:
        tokens = nltk.word_tokenize(content)
        ngrams = list(nltk.ngrams(tokens, num))
        write_csv(ngrams, str(sep))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
Exemplo n.º 7
0
def tokens2counts(sep, limit, tokens):
    '''Count the unique tokens in a list of tokens.

    Counts come from get_counts and are ordered by sort_counts (top
    counts first, per the command's contract).

    sep    -- passed (as a string) to write_csv as the separator.
    limit  -- maximum number of rows to emit; a negative value means
              "no limit".
    tokens -- token source handed to read_tokens.
    '''
    token_list = read_tokens(tokens)
    ordered_counts = sort_counts(get_counts(token_list))

    # The option is an integer, so a negative limit is used to mean
    # "emit everything" rather than needing an infinite int.
    unlimited = limit < 0

    # Rows go through the csv writer so the separator is encoded properly.
    rows = [
        [str(field) for field in entry]
        for position, entry in enumerate(ordered_counts)
        if unlimited or position < limit
    ]
    write_csv(rows, str(sep))
Exemplo n.º 8
0
def tokens2topbigrams(sep, measure, freq, scores, tokens):
    '''Find the most interesting bi-grams in a token document.

    "Interesting" is defined by the association measure selected with
    --measure, looked up in MEASURES.

    sep     -- passed (as a string) to write_csv as the separator.
    measure -- key into MEASURES selecting the scoring function.
    freq    -- value given to the finder's apply_freq_filter.
    scores  -- when truthy, append each bi-gram's score (as a string)
               to its output row.
    tokens  -- token source handed to read_tokens.
    '''
    words = read_tokens(tokens)

    finder = nltk.collocations.BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(freq)

    score_fn = MEASURES[measure]
    scored = finder.score_ngrams(score_fn)

    # Each scored entry is a (bigram, score) pair.
    if scores:
        rows = [pair + (str(score),) for pair, score in scored]
    else:
        rows = [pair for pair, _score in scored]
    write_csv(rows, str(sep))
Exemplo n.º 9
0
def tokens2topbigrams(sep, measure, freq, scores, tokens):
    '''Rank the bi-grams of a token document by how interesting they are.

    The --measure argument picks the scoring function (via MEASURES)
    that defines "interesting". The finder's frequency filter is applied
    with ``freq`` before scoring. With ``scores`` set, each row also
    carries the bi-gram's score as a trailing string field.
    '''
    token_list = read_tokens(tokens)
    collocations = nltk.collocations.BigramCollocationFinder.from_words(
        token_list)
    collocations.apply_freq_filter(freq)

    ranked = collocations.score_ngrams(MEASURES[measure])

    def _row(entry):
        # entry[0] is the bi-gram tuple, entry[1] its score.
        if scores:
            return entry[0] + (str(entry[1]),)
        return entry[0]

    write_csv([_row(entry) for entry in ranked], str(sep))