Example #1
def texts2json(ids, names, field, text_docs):
    """Convert a set of text documents into a
    JSON array of document objects."""

    docs = []

    names = read_names(names)
    ids = read_names(ids)

    for idx, path in enumerate(text_docs):
        # click.open_file handles regular file paths as well as '-' for stdin
        with click.open_file(path, "r") as tokens_doc:
            content = tokens_doc.read()

        # ordered so that these attributes stay at the top
        doc = OrderedDict()

        if idx < len(ids) - 1:
            doc["id"] = ids[idx]
        else:
            doc["id"] = path

        if idx < len(names) - 1:
            doc["name"] = names[idx]
        else:
            doc["name"] = path

        doc[field] = content
        docs.append(doc)

    out_content = json.dumps(docs, indent=2)
    output(out_content)
Example #2
def texts2json(ids, names, field, text_docs):
    '''Convert a set of text documents into a
    JSON array of document objects.'''

    docs = []

    names = read_names(names)
    ids = read_names(ids)

    for idx, path in enumerate(text_docs):
        # click.open_file handles regular file paths as well as '-' for stdin
        with click.open_file(path, 'r') as tokens_doc:
            content = tokens_doc.read()

        # ordered so that these attributes stay at the top
        doc = OrderedDict()

        if idx < len(ids) - 1:
            doc['id'] = ids[idx]
        else:
            doc['id'] = path

        if idx < len(names) - 1:
            doc['name'] = names[idx]
        else:
            doc['name'] = path

        doc[field] = content
        docs.append(doc)

    out_content = json.dumps(docs, indent=2)
    output(out_content)
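The interesting part of texts2json is the shape of its output: one OrderedDict per document, serialized as a JSON array. Below is a minimal, self-contained sketch of that shape; the ids, names and texts are made-up stand-ins for whatever textkit's read_names helper and the input files would supply, and output is assumed to simply write to stdout.

import json
from collections import OrderedDict

ids = ["doc-1", "doc-2"]                       # stand-in for read_names(ids)
names = ["First Document", "Second Document"]  # stand-in for read_names(names)
texts = ["the quick brown fox", "jumps over the lazy dog"]

docs = []
for idx, text in enumerate(texts):
    doc = OrderedDict()  # ordered so id and name stay at the top of each object
    doc["id"] = ids[idx]
    doc["name"] = names[idx]
    doc["text"] = text
    docs.append(doc)

print(json.dumps(docs, indent=2))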
Example #3
def tokens2stem(tokens, algorithm):
    '''Stem a list of tokens to get their root.'''
    content = read_tokens(tokens)
    stemmer = ALGOS[algorithm]()

    if algorithm == 'wordnet':
        for token in content:
            output(stemmer.lemmatize(token))
    else:
        for token in content:
            output(stemmer.stem(token))
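For context, a standalone sketch of the NLTK calls tokens2stem leans on. The ALGOS mapping is not shown in the example; the assumption here is that it points at classes like these, and the WordNet lemmatizer additionally needs the wordnet corpus downloaded.

from nltk.stem import PorterStemmer, WordNetLemmatizer

print(PorterStemmer().stem("running"))         # -> "run"
print(WordNetLemmatizer().lemmatize("geese"))  # -> "goose" (requires the wordnet corpus)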
Example #4
def words2ngrams(sep, length, tokens):
    '''Tokenize words into ngrams. Ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, length))
    [output(sep.join(ngram)) for ngram in ngrams]
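The command is a thin wrapper around nltk.ngrams, which yields tuples of n consecutive tokens (n being the length argument above). A minimal sketch with inline sample tokens:

import nltk

tokens = ["the", "quick", "brown", "fox"]
for ngram in nltk.ngrams(tokens, 3):  # 3 plays the role of the length argument
    print(" ".join(ngram))
# the quick brown
# quick brown fox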
Example #5
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    bigrams = list(nltk.bigrams(content))
    [output(sep.join(bigram)) for bigram in bigrams]
Example #6
def filterwords(language, custom, tokens):
    '''Remove stop words from tokens, returning tokens without stop words.'''
    content = read_tokens(tokens)
    stopwords = get_stopwords(language)
    if custom:
        stopwords = stopwords + read_tokens(custom)

    [output(token) for token in content if token.lower() not in stopwords]
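get_stopwords is a textkit helper that is not shown here; a reasonable guess is that it wraps NLTK's per-language stop word lists (available after nltk.download('stopwords')). The filtering itself is just a lowercased membership test, exactly as in the command above:

from nltk.corpus import stopwords

stop = set(stopwords.words("english"))
tokens = ["The", "cat", "sat", "on", "the", "mat"]
print([t for t in tokens if t.lower() not in stop])  # ['cat', 'sat', 'mat']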
Example #7
def filterwords(language, custom, tokens):
    '''Remove stop words from tokens, returning tokens without stop words.'''
    content = read_tokens(tokens)
    stopwords = get_stopwords(language)
    if custom:
        stopwords = stopwords + read_tokens(custom)

    [output(token) for token in content
        if token.lower() not in stopwords]
Example #8
def tokens2pos(sep, tokens):
    '''Tokenize words into their parts of speech. Each item
    is the original word with its role as the second part
    of the item. Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    nltk.data.path.append(data_item())
    tags = nltk.pos_tag(content)
    [output("{},{}".format(t[0], t[1])) for t in tags]
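nltk.pos_tag does the actual tagging; it needs the default tagger model installed via NLTK's downloader, which is presumably why the example appends data_item() to nltk.data.path. In isolation:

import nltk

print(nltk.pos_tag(["The", "cat", "sat"]))
# e.g. [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]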
Example #9
def top_bigrams(sep, measure, freq, scores, tokens):
    '''Find the most interesting bigrams in a token document.
    Uses the --measure argument to determine what measure to use to define
    'interesting'.
    '''

    output(sep)
    content = read_tokens(tokens)
    bcf = nltk.collocations.BigramCollocationFinder.from_words(content)
    bcf.apply_freq_filter(freq)

    nltk_measure = MEASURES[measure]
    bigrams = bcf.score_ngrams(nltk_measure)

    out = [b[0] for b in bigrams]
    if scores:
        out = [b[0] + tuple([str(b[1])]) for b in bigrams]
    [output(sep.join(line)) for line in out]
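MEASURES is not shown in the example; presumably it maps names like 'pmi' onto attributes of nltk.collocations.BigramAssocMeasures. The collocation calls used above, in isolation:

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

words = "the big cat sat on the big red mat".split()
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(2)  # keep only bigrams seen at least twice
for bigram, score in finder.score_ngrams(BigramAssocMeasures().pmi):
    print(" ".join(bigram), score)  # -> "the big" plus its PMI score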
Example #10
def transliterate(file):
    '''Convert international text to ascii.'''
    content = ''.join(file.readlines())
    try:
        content = content.decode(chardet.detect(content)['encoding'])
    except AttributeError:
        # Strings do not have a decode method in python 3.
        pass
    output(unidecode(content).encode('ascii', 'ignore'))
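unidecode does the heavy lifting here: it maps non-ASCII characters to rough ASCII equivalents, while chardet is only used to decode the raw bytes on Python 2. A minimal sketch:

from unidecode import unidecode

print(unidecode("café, naïve, Привет"))  # -> roughly: cafe, naive, Privet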
Example #11
def text2punc(text):
    '''Tokenize text into punctuation tokens.
    Words and numbers are removed, leaving only punctuation.'''

    # from: http://stackoverflow.com/questions/17485092/how-to-just-keep-punctuation-with-a-string-in-python

    content = '\n'.join([open(f).read() for f in text])
    out = re.sub(r'[^{}]+'.format(punctuation), ' ', content)
    out = out.split()
    [output(p) for p in out]
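The regular expression keeps only punctuation: every run of characters that are not in string.punctuation collapses to a single space, and split() then yields the punctuation tokens. Traced on a small string:

import re
from string import punctuation

out = re.sub(r'[^{}]+'.format(punctuation), ' ', "Hello, world! (yes)")
print(out.split())  # [',', '!', '(', ')']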
Example #12
def text2sentences(text):
    '''Tokenize text into sentence tokens.'''
    content = '\n'.join([open(f).read() for f in text])
    sentences = []
    try:
        sentences = sent_tokenize(content)
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(s.strip()) for s in sentences]
Example #13
def text2sentences(text):
    """Tokenize text into sentence tokens."""
    content = "\n".join([open(f).read() for f in text])
    sentences = []
    try:
        sentences = sent_tokenize(content)
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message='Have you run "textkit download"?', nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(s.strip()) for s in sentences]
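sent_tokenize splits raw text on sentence boundaries using NLTK's punkt models, which is what the "textkit download" hint in the error message is about. In isolation:

from nltk.tokenize import sent_tokenize

print(sent_tokenize("It rained. We stayed inside!"))
# ['It rained.', 'We stayed inside!']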
Example #14
def tokens2json(ids, names, field, split, sep, token_docs):
    '''Convert a set of token documents into a
    JSON array of document objects.'''

    docs = []

    names = read_names(names)
    ids = read_names(ids)

    for idx, path in enumerate(token_docs):
        if path == '-':
            tokens_doc = sys.stdin
        else:
            tokens_doc = open(path, 'r')
        if split:
            content = read_csv(tokens_doc, sep)
            content = coerce_types(content)
        else:
            content = read_tokens(tokens_doc)

        # ordered so that these attributes stay at the top
        doc = OrderedDict()

        if idx < len(ids) - 1:
            doc['id'] = ids[idx]
        else:
            doc['id'] = path

        if idx < len(names) - 1:
            doc['name'] = names[idx]
        else:
            doc['name'] = path

        doc[field] = content
        docs.append(doc)
        # do not close stdin when reading from '-'
        if tokens_doc is not sys.stdin:
            tokens_doc.close()

    out_content = json.dumps(docs, indent=2)
    output(out_content)
Example #15
def text2words(text):
    '''Tokenize text into word tokens.
    Punctuation is considered as a separate token.'''
    content = '\n'.join([open(f).read() for f in text])
    tokens = []
    try:
        tokens = nltk.word_tokenize(content)
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(token) for token in tokens]
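nltk.word_tokenize is what keeps punctuation as separate tokens, as the docstring promises; it relies on the same punkt data as sentence tokenization. In isolation:

import nltk

print(nltk.word_tokenize("Hello, world!"))  # ['Hello', ',', 'world', '!']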
Example #16
def words2bigrams(sep, tokens):
    """Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token."""

    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message='Have you run "textkit download"?', nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(sep.join(bigram)) for bigram in bigrams]
Example #17
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(sep.join(bigram)) for bigram in bigrams]
Example #18
def filterpunc(tokens):
    '''Remove tokens that are only punctuation from a list of tokens.'''
    content = read_tokens(tokens)
    [output(token) for token in content if token not in punctuation]
Example #19
def nonewlines(text):
    '''Remove newlines from a text file.'''
    content = '\n'.join([open(f).read() for f in text])
    content = re.sub('\n|\r\n|\r', ' ', content).strip()
    output(content)
Example #20
def text2words(text):
    '''Tokenize text into word tokens.
    Punctuation is considered as a separate token.'''
    content = '\n'.join([open(f).read() for f in text])
    tokens = nltk.word_tokenize(content)
    [output(token) for token in tokens]
Example #21
def uppercase(tokens):
    '''Transform all tokens to uppercase.'''
    content = read_tokens(tokens)
    [output(token.upper()) for token in content]
Example #22
def filterlengths(minimum, tokens):
    '''Remove tokens that are shorter than the minimum length provided.'''
    content = read_tokens(tokens)
    [output(token) for token in content if len(token) >= minimum]
Example #23
def tokens2text(sep, tokens):
    '''Combine tokens in a token document into a single text file.'''

    content = read_tokens(tokens)
    out = sep.join(content)
    output(out)
Example #24
def nonewlines(text):
    """Remove newlines from a text file."""
    content = "\n".join([open(f).read() for f in text])
    content = content.replace("\n", " ").strip()
    output(content)
Example #25
def showstops(language):
    '''Display stop words used by textkit for a given language.'''
    stopwords = get_stopwords(language)

    [output(token) for token in stopwords]
Example #26
def text2sentences(text):
    '''Tokenize text into sentence tokens.'''
    content = '\n'.join([open(f).read() for f in text])
    sentences = sent_tokenize(content)
    [output(s.strip()) for s in sentences]
Example #27
def tokens2lower(tokens):
    '''Transform all tokens to lowercase.'''
    content = read_tokens(tokens)
    [output(token.lower()) for token in content]
Example #28
def nonewlines(text):
    '''Remove newlines from a text file.'''
    content = '\n'.join([open(f).read() for f in text])
    content = content.replace('\n', ' ').strip()
    output(content)