Пример #1
0
def ngrams(s, n):
    """Tokenize *s* and print every n-gram produced by nltk's ngrams.

    Thin wrapper over nltkNgrams (which yields a generator); note that
    this prints each gram rather than returning the generator itself.
    """
    for gram in nltkNgrams(tokenize(s), n):
        print(gram)
Пример #2
0
def test():
    """Smoke-test tokenization on a slice of a local Gutenberg text file."""
    filename = "/tmp/103.txt"
    # guten.from_s3('103', filename)
    with open(filename, 'r', errors='ignore') as fh:
        flattened = fh.read().replace('\n', ' ')

    result = tokenizers.tokenize(flattened[4000:4800])
    print(" ".join(result['tokens'][0:200]))
    print(result['entities'])
Пример #3
0
def encodeDocument(voc, doc):
    """One-hot encode the tokens of *doc* against vocabulary *voc*.

    Returns a len(tokens) x (len(voc)+1) array where row i has a 1 in the
    column of token i's vocabulary index; the extra last column is the
    "out of vocabulary" indicator.
    """
    tokens = tokenize(doc)
    # add an "out of vocabulary" element
    arr = initArray(len(tokens), len(voc) + 1)
    # Build a token -> column map once.  The original did `tok in voc`
    # plus `voc.index(tok)` per token, i.e. O(len(tokens) * len(voc)).
    # Keep the FIRST index for duplicate vocab entries, matching
    # list.index() semantics.
    col = {}
    for idx, tok in enumerate(voc):
        if tok not in col:
            col[tok] = idx
    oov_col = len(voc)
    for i, tok in enumerate(tokens):
        arr[i, col.get(tok, oov_col)] = 1
    return arr
Пример #4
0
 def import_file(self, filename):
     """Read a Project Gutenberg text, strip the *** START/END ***
     boilerplate markers, and feed the tokenized body to self.train_words.
     """
     with open(filename, 'r') as myfile:
         in_text = myfile.read().replace('\n', ' ')
     start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
     start_point = max(0, in_text.find(start_marker))
     if start_point > 0:
         # skip past the marker itself
         start_point += len(start_marker)
     # BUG FIX: str.find returns -1 when the end marker is absent; the
     # original `min(len(in_text), find(...))` then became -1 and the
     # slice silently dropped the final character.  Fall back to the
     # full text length instead.
     end_point = in_text.find("*** END OF THIS PROJECT GUTENBERG EBOOK")
     if end_point == -1:
         end_point = len(in_text)
     trimmed = in_text[start_point:end_point]
     # mc.train_words(in_text.split())
     self.train_words(tokenizers.tokenize(trimmed))
Пример #5
0
def lowMemBinaryEncodeDocument(voc, doc):
    """Binary-encode *doc* as a dict of in-vocabulary tokens mapped to 1.

    Storing presence flags in a dict is far lighter than materializing
    full one-hot vectors.  Tokens outside *voc* all collapse into the
    single 'out_out_vocab' key.
    """
    # use voc as the set of tokens you care about; the alternative is to
    # collect all tokens and decide on a vocabulary after the fact.
    return {(tok if tok in voc else 'out_out_vocab'): 1
            for tok in tokenize(doc)}
Пример #6
0
def format_error_analysis(output, compare=None, per_token=False,
                          only_differing=False, show_all=False, show_tokens=True):
    """Render HTML tables (Worst/Best/Head, plus comparison tables) of
    per-example predictions and probabilities.

    output: results object with .data, .scores, .predictions and
        (optionally) .tokens parallel sequences.
    compare: optional second results object; when valid, adds comparison
        columns and "Biggest decline"/"Biggest improvement" tables.
    per_token: normalize each log score by output token count before exp().
    only_differing: restrict the comparison tables to examples whose two
        predictions differ.
    show_all: third table shows all examples instead of the first 100.
    show_tokens: render a per-token view of the gold output when token
        data is available.
    Returns the concatenated HTML for all tables.
    """
    examples_table_template = '''    <h3>{cond}</h3>
    <table>
        <tr><th>input</th>{alt_inputs_header}{alt_outputs_header}<th>gold</th><th>prediction</th><th>{prob_header}</th>{compare_header}</tr>
{examples}
    </table>'''

    example_template = '        <tr>{input}{alt_inputs}{alt_outputs}{output}' \
                       '{prediction}{pprob}{comparison}{cprob}</tr>'
    score_template = '<td>{}</td>'
    show_alt_inputs = max_len(output.data, 'alt_inputs')
    show_alt_outputs = max_len(output.data, 'alt_outputs')

    if compare and 'input' not in compare.data[0]:
        # Results when there's an error loading the comparison file;
        # no need to print a second warning.
        compare = None
    if compare and len(compare.data) != len(output.data):
        warnings.warn("Skipping comparison--mismatch between number of output examples (%s) "
                      "and number of comparison examples (%s)" %
                      (len(output.data), len(compare.data)))
        compare = None

    collated = []
    for i, (inst, score, pred) in enumerate(zip(output.data, output.scores, output.predictions)):
        example = {}
        example['input'] = format_value(inst['input'])
        example['alt_inputs'] = format_alts(inst['alt_inputs'], show_alt_inputs)
        if show_tokens and output.tokens:
            example['output'] = format_tokens(inst['output'], output.tokens[i])
        else:
            example['output'] = format_value(inst['output'])
        example['alt_outputs'] = format_alts(inst['alt_outputs'], show_alt_outputs)
        example['prediction'] = format_value(pred)
        # BUG FIX: num_tokens was previously bound only when `score` was a
        # Number, yet reused below for the comparison score -- leaving it
        # unbound on the first iteration (NameError) or stale from a prior
        # iteration when `score` was non-numeric.  Compute it every pass.
        if per_token:
            num_tokens = len(tokenize(inst['output'])) + 1
        else:
            num_tokens = 1
        if isinstance(score, Number):
            pprob = np.exp(score / num_tokens)
        else:
            pprob = score
        example['pprob'] = score_template.format(format_number(pprob))
        example['pprob_val'] = pprob if isinstance(pprob, Number) else 0
        if compare:
            if compare.data[i]['input'] != inst['input']:
                warnings.warn((u"Comparison input doesn't match this input: %s != %s" %
                               (compare.data[i]['input'], inst['input'])).encode('utf-8'))
            example['comparison'] = format_value(compare.predictions[i])
            cscore = compare.scores[i]
            if isinstance(cscore, Number):
                cprob = np.exp(cscore / num_tokens)
            else:
                cprob = cscore
            example['cprob'] = score_template.format(format_number(cprob))
            example['cprob_val'] = cprob if isinstance(cprob, Number) else 0
        else:
            example['comparison'] = ''
            example['cprob'] = ''
            example['cprob_val'] = 0.0
        collated.append(example)

    score_order = sorted(collated, key=lambda e: e['pprob_val'])
    tables = [
        ('Worst', score_order[:100]),
        ('Best', reversed(score_order[-100:])),
        (('All', collated)
         if show_all else
         ('Head', collated[:100])),
    ]
    if compare:
        if only_differing:
            differing = [e for e in collated if e['prediction'] != e['comparison']]
        else:
            differing = collated
        # sort by change in probability: most hurt first, most helped last
        diff_order = sorted(differing, key=lambda e: e['pprob_val'] - e['cprob_val'])
        tables.extend([
            ('Biggest decline', diff_order[:100]),
            ('Biggest improvement', reversed(diff_order[-100:])),
        ])

    prob_header = 'prob (per token)' if per_token else 'prob'
    compare_header = ('<th>comparison</th><th>{prob_header}</th>'.format(prob_header=prob_header)
                      if compare else '')
    return '\n'.join(examples_table_template.format(
        cond=cond,
        alt_inputs_header=(('<th>alt inputs</th>' if show_alt_inputs else '') +
                           '<th></th>' * (show_alt_inputs - 1)),
        alt_outputs_header=(('<th>alt outputs</th>' if show_alt_outputs else '') +
                            '<th></th>' * (show_alt_outputs - 1)),
        compare_header=compare_header,
        prob_header=prob_header,
        examples='\n'.join(
            example_template.format(**inst) for inst in examples
        )
    ) for cond, examples in tables)
Пример #7
0
def format_error_analysis(output, compare=None, per_token=False):
    """Render HTML tables (Worst/Best/Head, plus comparison tables) of
    per-example predictions and probabilities.

    output: results object with .data, .scores, .predictions parallel
        sequences.
    compare: optional second results object; adds comparison columns and
        "Biggest decline"/"Biggest improvement" tables.
    per_token: normalize each log score by output token count before exp().
    Returns the concatenated HTML for all tables.
    """
    examples_table_template = '''    <h3>{cond}</h3>
    <table>
        <tr><th>input</th>{alt_inputs_header}{alt_outputs_header}<th>gold</th><th>prediction</th><th>{prob_header}</th>{compare_header}</tr>
{examples}
    </table>'''

    example_template = '        <tr>{input}{alt_inputs}{alt_outputs}{output}' \
                       '{prediction}{pprob}{comparison}{cprob}</tr>'
    score_template = '<td>{}</td>'
    show_alt_inputs = max_len(output.data, 'alt_inputs')
    show_alt_outputs = max_len(output.data, 'alt_outputs')
    collated = []
    for i, (inst, score, pred) in enumerate(
            zip(output.data, output.scores, output.predictions)):
        example = {}
        example['input'] = format_value(inst['input'])
        example['alt_inputs'] = format_alts(inst['alt_inputs'],
                                            show_alt_inputs)
        example['output'] = format_value(inst['output'])
        example['alt_outputs'] = format_alts(inst['alt_outputs'],
                                             show_alt_outputs)
        example['prediction'] = format_value(pred)
        # BUG FIX: num_tokens was previously bound only when `score` was a
        # Number, yet reused below for the comparison score -- leaving it
        # unbound on the first iteration (NameError) or stale from a prior
        # iteration when `score` was non-numeric.  Compute it every pass.
        if per_token:
            num_tokens = len(tokenize(inst['output'])) + 1
        else:
            num_tokens = 1
        if isinstance(score, Number):
            pprob = np.exp(score / num_tokens)
        else:
            pprob = score
        example['pprob'] = score_template.format(format_number(pprob))
        example['pprob_val'] = pprob if isinstance(pprob, Number) else 0
        if compare:
            if compare.data[i]['input'] == inst['input']:
                example['comparison'] = format_value(compare.predictions[i])
                cscore = compare.scores[i]
                if isinstance(cscore, Number):
                    cprob = np.exp(cscore / num_tokens)
                else:
                    cprob = cscore
                example['cprob'] = score_template.format(format_number(cprob))
                example['cprob_val'] = cprob if isinstance(cprob,
                                                           Number) else 0
            else:
                warnings.warn(
                    "Comparison input doesn't match this input: %s != %s" %
                    (compare.data[i]['input'], inst['input']))
                example['comparison'] = ''
                example['cprob'] = ''
                # BUG FIX: 'cprob_val' was never set on this path, so the
                # diff_order sort below raised KeyError whenever a
                # comparison input mismatched.
                example['cprob_val'] = 0.0
        else:
            example['comparison'] = ''
            example['cprob'] = ''
            # defensive default so every example carries the same keys
            example['cprob_val'] = 0.0
        collated.append(example)

    score_order = sorted(collated, key=lambda e: e['pprob_val'])
    tables = [
        ('Worst', score_order[:100]),
        ('Best', reversed(score_order[-100:])),
        ('Head', collated[:100]),
    ]
    if compare:
        # sort by change in probability: most hurt first, most helped last
        diff_order = sorted(collated,
                            key=lambda e: e['pprob_val'] - e['cprob_val'])
        tables.extend([
            ('Biggest decline', diff_order[:100]),
            ('Biggest improvement', reversed(diff_order[-100:])),
        ])

    prob_header = 'prob (per token)' if per_token else 'prob'
    compare_header = ('<th>comparison</th><th>{prob_header}</th>'.format(
        prob_header=prob_header) if compare else '')
    return '\n'.join(
        examples_table_template.format(
            cond=cond,
            alt_inputs_header=(
                ('<th>alt inputs</th>' if show_alt_inputs else '') +
                '<th></th>' * (show_alt_inputs - 1)),
            alt_outputs_header=(
                ('<th>alt outputs</th>' if show_alt_outputs else '') +
                '<th></th>' * (show_alt_outputs - 1)),
            compare_header=compare_header,
            prob_header=prob_header,
            examples='\n'.join(
                example_template.format(**inst) for inst in examples))
        for cond, examples in tables)
Пример #8
0
def sentence2dict(sentence):
    """Map every token of *sentence* to 1 (bag-of-words presence dict)."""
    # dict comprehension instead of dict([(k, v) for ...]) (flake8 C404)
    return {token: 1 for token in tokenize(sentence)}
Пример #9
0
def getVocab(s):
    """Return the sorted list of unique tokens appearing in *s*."""
    unique_tokens = set(tokenize(s))
    return sorted(unique_tokens)