def ngrams(s, n):
    # wrapper for nltk ngrams
    # nltk's ngrams() returns a generator
    tokens = tokenize(s)
    g = nltkNgrams(tokens, n)
    for i in g:
        print(i)
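# Hedged usage sketch (not from the original module): nltk.util.ngrams consumes
# any token sequence and yields n-tuples lazily. A plain whitespace split stands
# in for the tokenize() helper assumed above, which is not defined in this file.
def _example_nltk_ngrams():
    from nltk.util import ngrams as nltk_ngrams
    tokens = "the quick brown fox".split()   # stand-in for tokenize(s)
    for gram in nltk_ngrams(tokens, 2):
        print(gram)   # ('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')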
def test():
    filename = "/tmp/103.txt"
    # guten.from_s3('103', filename)
    with open(filename, 'r', errors='ignore') as myfile:
        in_text = myfile.read().replace('\n', ' ')
    processed = tokenizers.tokenize(in_text[4000:4800])
    print(" ".join(processed['tokens'][0:200]))
    print(processed['entities'])
def encodeDocument(voc, doc):
    tokens = tokenize(doc)
    # add an "out of vocabulary" element as the final column
    arr = initArray(len(tokens), len(voc) + 1)
    for i, tok in enumerate(tokens):
        if tok in voc:
            arr[i, voc.index(tok)] = 1
        else:
            arr[i, len(voc)] = 1
    return arr
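# Hedged usage sketch for encodeDocument(): each row is a one-hot vector for one
# token, with the last column reserved for out-of-vocabulary tokens. A numpy zero
# matrix stands in for the initArray() helper assumed above.
def _example_encode_document():
    import numpy as np
    voc = ["the", "cat", "sat"]
    tokens = ["the", "dog", "sat"]            # "dog" is out of vocabulary
    arr = np.zeros((len(tokens), len(voc) + 1))
    for i, tok in enumerate(tokens):
        arr[i, voc.index(tok) if tok in voc else len(voc)] = 1
    # arr == [[1, 0, 0, 0],    "the"
    #         [0, 0, 0, 1],    "dog" -> OOV column
    #         [0, 0, 1, 0]]    "sat"
    return arr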
def import_file(self, filename):
    with open(filename, 'r') as myfile:
        in_text = myfile.read().replace('\n', ' ')
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    end_marker = "*** END OF THIS PROJECT GUTENBERG EBOOK"
    start_point = max(0, in_text.find(start_marker))
    if start_point > 0:
        start_point += len(start_marker)
    end_point = in_text.find(end_marker)
    if end_point < 0:
        # end marker not found: keep the rest of the text
        end_point = len(in_text)
    trimmed = in_text[start_point:end_point]
    # mc.train_words(in_text.split())
    self.train_words(tokenizers.tokenize(trimmed))
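# Hedged sketch of the trimming step in import_file(): only the text between the
# Project Gutenberg START/END markers is kept for training. The marker strings
# match the ones searched for above; the sample text here is made up.
def _example_gutenberg_trim():
    text = ("license header *** START OF THIS PROJECT GUTENBERG EBOOK SAMPLE *** "
            "the actual book text *** END OF THIS PROJECT GUTENBERG EBOOK SAMPLE ***")
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    end_marker = "*** END OF THIS PROJECT GUTENBERG EBOOK"
    start = text.find(start_marker)
    start = start + len(start_marker) if start >= 0 else 0
    end = text.find(end_marker)
    end = end if end >= 0 else len(text)
    return text[start:end]   # " SAMPLE *** the actual book text "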
def lowMemBinaryEncodeDocument(voc, doc):
    # only store the binary presence of tokens in doc using a dict;
    # this is clearly less memory-intensive than storing the dense vectors
    enc = {}
    tokens = tokenize(doc)
    for tok in tokens:
        # use voc as the set of tokens you care about;
        # the alternative is to collect all of them and decide on a vocab after the fact
        if tok in voc:
            enc[tok] = 1
        else:
            enc['out_of_vocab'] = 1
    return enc
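# Hedged usage sketch for lowMemBinaryEncodeDocument(): instead of a dense
# token-by-vocabulary matrix, only the vocabulary tokens that actually occur are
# stored, plus one shared out-of-vocabulary flag. A whitespace split stands in
# for tokenize().
def _example_low_mem_encode():
    voc = {"the", "cat", "sat"}
    enc = {}
    for tok in "the dog sat on the mat".split():
        if tok in voc:
            enc[tok] = 1
        else:
            enc["out_of_vocab"] = 1
    return enc   # {'the': 1, 'out_of_vocab': 1, 'sat': 1}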
def format_error_analysis(output, compare=None, per_token=False, only_differing=False,
                          show_all=False, show_tokens=True):
    examples_table_template = '''
    <h3>{cond}</h3>
    <table>
        <tr><th>input</th>{alt_inputs_header}{alt_outputs_header}<th>gold</th><th>prediction</th><th>{prob_header}</th>{compare_header}</tr>
        {examples}
    </table>'''
    example_template = '        <tr>{input}{alt_inputs}{alt_outputs}{output}' \
                       '{prediction}{pprob}{comparison}{cprob}</tr>'
    score_template = '<td>{}</td>'

    show_alt_inputs = max_len(output.data, 'alt_inputs')
    show_alt_outputs = max_len(output.data, 'alt_outputs')

    if compare and 'input' not in compare.data[0]:
        # Results when there's an error loading the comparison file;
        # no need to print a second warning.
        compare = None

    if compare and len(compare.data) != len(output.data):
        warnings.warn("Skipping comparison--mismatch between number of output examples (%s) "
                      "and number of comparison examples (%s)" %
                      (len(output.data), len(compare.data)))
        compare = None

    collated = []
    for i, (inst, score, pred) in enumerate(zip(output.data, output.scores,
                                                output.predictions)):
        example = {}
        example['input'] = format_value(inst['input'])
        example['alt_inputs'] = format_alts(inst['alt_inputs'], show_alt_inputs)
        if show_tokens and output.tokens:
            example['output'] = format_tokens(inst['output'], output.tokens[i])
        else:
            example['output'] = format_value(inst['output'])
        example['alt_outputs'] = format_alts(inst['alt_outputs'], show_alt_outputs)
        example['prediction'] = format_value(pred)
        if isinstance(score, Number):
            if per_token:
                num_tokens = len(tokenize(inst['output'])) + 1
            else:
                num_tokens = 1
            pprob = np.exp(score / num_tokens)
        else:
            pprob = score
        example['pprob'] = score_template.format(format_number(pprob))
        example['pprob_val'] = pprob if isinstance(pprob, Number) else 0

        if compare:
            if compare.data[i]['input'] != inst['input']:
                warnings.warn((u"Comparison input doesn't match this input: %s != %s" %
                               (compare.data[i]['input'], inst['input'])).encode('utf-8'))
            example['comparison'] = format_value(compare.predictions[i])
            cscore = compare.scores[i]
            if isinstance(cscore, Number):
                cprob = np.exp(cscore / num_tokens)
            else:
                cprob = cscore
            example['cprob'] = score_template.format(format_number(cprob))
            example['cprob_val'] = cprob if isinstance(cprob, Number) else 0
        else:
            example['comparison'] = ''
            example['cprob'] = ''
            example['cprob_val'] = 0.0
        collated.append(example)

    score_order = sorted(collated, key=lambda e: e['pprob_val'])
    tables = [
        ('Worst', score_order[:100]),
        ('Best', reversed(score_order[-100:])),
        (('All', collated) if show_all else ('Head', collated[:100])),
    ]
    if compare:
        if only_differing:
            differing = [e for e in collated if e['prediction'] != e['comparison']]
        else:
            differing = collated
        diff_order = sorted(differing, key=lambda e: e['pprob_val'] - e['cprob_val'])
        tables.extend([
            ('Biggest decline', diff_order[:100]),
            ('Biggest improvement', reversed(diff_order[-100:])),
        ])
    prob_header = 'prob (per token)' if per_token else 'prob'
    compare_header = ('<th>comparison</th><th>{prob_header}</th>'.format(prob_header=prob_header)
                      if compare else '')
    return '\n'.join(examples_table_template.format(
        cond=cond,
        alt_inputs_header=(('<th>alt inputs</th>' if show_alt_inputs else '') +
                           '<th></th>' * (show_alt_inputs - 1)),
        alt_outputs_header=(('<th>alt outputs</th>' if show_alt_outputs else '') +
                            '<th></th>' * (show_alt_outputs - 1)),
        compare_header=compare_header,
        prob_header=prob_header,
        examples='\n'.join(
            example_template.format(**inst)
            for inst in examples
        )
    ) for cond, examples in tables)
def format_error_analysis(output, compare=None, per_token=False):
    examples_table_template = '''
    <h3>{cond}</h3>
    <table>
        <tr><th>input</th>{alt_inputs_header}{alt_outputs_header}<th>gold</th><th>prediction</th><th>{prob_header}</th>{compare_header}</tr>
        {examples}
    </table>'''
    example_template = '        <tr>{input}{alt_inputs}{alt_outputs}{output}' \
                       '{prediction}{pprob}{comparison}{cprob}</tr>'
    score_template = '<td>{}</td>'

    show_alt_inputs = max_len(output.data, 'alt_inputs')
    show_alt_outputs = max_len(output.data, 'alt_outputs')

    collated = []
    for i, (inst, score, pred) in enumerate(
            zip(output.data, output.scores, output.predictions)):
        example = {}
        example['input'] = format_value(inst['input'])
        example['alt_inputs'] = format_alts(inst['alt_inputs'], show_alt_inputs)
        example['output'] = format_value(inst['output'])
        example['alt_outputs'] = format_alts(inst['alt_outputs'], show_alt_outputs)
        example['prediction'] = format_value(pred)
        if isinstance(score, Number):
            if per_token:
                num_tokens = len(tokenize(inst['output'])) + 1
            else:
                num_tokens = 1
            pprob = np.exp(score / num_tokens)
        else:
            pprob = score
        example['pprob'] = score_template.format(format_number(pprob))
        example['pprob_val'] = pprob if isinstance(pprob, Number) else 0

        if compare:
            if compare.data[i]['input'] == inst['input']:
                example['comparison'] = format_value(compare.predictions[i])
                cscore = compare.scores[i]
                if isinstance(cscore, Number):
                    cprob = np.exp(cscore / num_tokens)
                else:
                    cprob = cscore
                example['cprob'] = score_template.format(format_number(cprob))
                example['cprob_val'] = cprob if isinstance(cprob, Number) else 0
            else:
                warnings.warn(
                    "Comparison input doesn't match this input: %s != %s" %
                    (compare.data[i]['input'], inst['input']))
                example['comparison'] = ''
                example['cprob'] = ''
                # keep the sort key present even when no comparison score is available
                example['cprob_val'] = 0.0
        else:
            example['comparison'] = ''
            example['cprob'] = ''
            example['cprob_val'] = 0.0
        collated.append(example)

    score_order = sorted(collated, key=lambda e: e['pprob_val'])
    tables = [
        ('Worst', score_order[:100]),
        ('Best', reversed(score_order[-100:])),
        ('Head', collated[:100]),
    ]
    if compare:
        diff_order = sorted(collated, key=lambda e: e['pprob_val'] - e['cprob_val'])
        tables.extend([
            ('Biggest decline', diff_order[:100]),
            ('Biggest improvement', reversed(diff_order[-100:])),
        ])
    prob_header = 'prob (per token)' if per_token else 'prob'
    compare_header = ('<th>comparison</th><th>{prob_header}</th>'.format(
        prob_header=prob_header) if compare else '')
    return '\n'.join(
        examples_table_template.format(
            cond=cond,
            alt_inputs_header=(
                ('<th>alt inputs</th>' if show_alt_inputs else '') +
                '<th></th>' * (show_alt_inputs - 1)),
            alt_outputs_header=(
                ('<th>alt outputs</th>' if show_alt_outputs else '') +
                '<th></th>' * (show_alt_outputs - 1)),
            compare_header=compare_header,
            prob_header=prob_header,
            examples='\n'.join(
                example_template.format(**inst)
                for inst in examples))
        for cond, examples in tables)
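# Hedged illustration of the per-token probability used by format_error_analysis():
# each score is assumed to be a total log-probability, so dividing by the token
# count (plus one, mirroring the "+ 1" above, e.g. for an end-of-sequence token)
# before exponentiating gives a length-normalised probability.
def _example_per_token_prob():
    import numpy as np
    score = -6.0                       # total log-probability of one output
    tokens = ["a", "short", "reply"]   # stand-in for tokenize(inst['output'])
    num_tokens = len(tokens) + 1
    per_token_prob = np.exp(score / num_tokens)   # ~0.223
    total_prob = np.exp(score)                    # ~0.0025
    return per_token_prob, total_prob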
def sentence2dict(sentence):
    return {token: 1 for token in tokenize(sentence)}
def getVocab(s):
    tokens = tokenize(s)
    return sorted(set(tokens))