def process_questions(questions, return_score_modifiers=False):

    # Make a list
    if not isinstance(questions, list):
        questions = [questions]

    # Clean and tokenize
    prepared_questions = []
    for question in questions:
        question = question.strip()
        prepared_questions.append(
            apply_bpe(tokenize(question)) if question else '##emptyquestion##'
        )

    # Run inference
    answers_list = inference_helper(prepared_questions)

    # Process answers
    prepared_answers_list = []
    for index, answers in enumerate(answers_list):
        answers = detokenize(answers)
        answers = replace_in_answers(answers)
        answers = normalize_new_lines(answers)
        answers_score = score_answers(questions[index], answers)
        best_index, best_score = get_best_score(answers_score['score'])

        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
        elif return_score_modifiers:
            prepared_answers_list.append({
                'answers': answers,
                'scores': answers_score['score'],
                'best_index': best_index,
                'best_score': best_score,
                'score_modifiers': answers_score['score_modifiers']
            })
        else:
            prepared_answers_list.append({
                'answers': answers,
                'scores': answers_score['score'],
                'best_index': best_index,
                'best_score': best_score
            })

    return prepared_answers_list
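# Hedged usage sketch for process_questions, assuming the helpers it calls
# (apply_bpe, tokenize, inference_helper, score_answers, get_best_score, ...)
# are defined elsewhere in the project. It only illustrates the return shape
# produced above; the question text and function name below are hypothetical.
def example_process_questions_usage():
    results = process_questions("How old is the universe?")
    result = results[0]
    if result is not None:
        # 'answers' holds the detokenized candidate answers; 'best_index'
        # points at the highest-scoring one per score_answers/get_best_score.
        best_answer = result['answers'][result['best_index']]
        print(best_answer, result['best_score'])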
def process(input_folder, type, output_folder):
    # Collect all matching files for this data split into a single output JSON file
    updated_json = open(os.path.join(output_folder, type + ".json"), mode="w", encoding="utf-8")
    file_list = os.listdir(input_folder)
    for filename in file_list:
        if type in filename:
            print("filename", filename)
            json_file = open(os.path.join(input_folder, filename), mode="r", encoding="utf-8")
            data = json.load(json_file)
            upd_trdata = []
            for entry_index, entry in enumerate(data):
                # Detokenize the summary, then re-split it into word tokens
                summary = entry['summary']
                summary = detokenize(summary)
                summary = " ".join(word_tokenize(summary))
                upd_entry = entry
                upd_entry['summary'] = summary
                upd_trdata.append(upd_entry)
                if entry_index % 50 == 0:
                    print(entry_index)
            json.dump(upd_trdata, updated_json)
def detokenize(corpus: iCorpus,
               lang: Lang,
               tokenizer: str = "",
               model: str = "",
               progress: bool = True) -> iCorpus:
    if progress:
        corpus = tqdm(corpus)
    if lang == 'en':
        if tokenizer is None or tokenizer == "" or tokenizer == 'moses':
            return (_lazy_load_moses_detokenizer('en').detokenize(
                line.split(' '), return_str=True, unescape=False)
                for line in corpus)
        elif tokenizer == 'bpe':
            # The bpe tokenizer will not remove \n, but the others will. Make BPE remove \n
            return (_lazy_load_bpe_tokenizer('en', model=model).DecodePieces(
                line.split(' ')).replace('▁', ' ').replace('\n', '')
                for line in corpus)
        else:
            raise ValueError(f'Unknown tokenizer={tokenizer}')
    elif lang == 'is':
        if tokenizer is None or tokenizer == "":
            return (mideind_tok.detokenize(
                list(mideind_tok.tokenize(line, normalize=False)), normalize=False)
                for line in corpus)
        elif tokenizer == 'bpe':
            # The bpe tokenizer will not remove \n, but the others will. Make BPE remove \n
            return (_lazy_load_bpe_tokenizer('is', model=model).DecodePieces(
                line.split(' ')).replace('▁', ' ').replace('\n', '')
                for line in corpus)
        elif tokenizer == 'moses':
            return (_lazy_load_moses_detokenizer('is').detokenize(
                line.split(' '), return_str=True, unescape=False)
                for line in corpus)
        else:
            raise ValueError(f'Unknown tokenizer={tokenizer}')
    else:
        raise ValueError(f'Unknown language={lang}')
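# Hedged usage sketch for the corpus-level detokenize above: it returns a lazy
# generator, so it must be consumed (e.g. with list() or by iterating). The
# 'moses' and 'bpe' backends rely on the lazily loaded detokenizers/models
# referenced above, which are assumed to be available in the environment.
# The sample lines and function name below are illustrative only.
def example_corpus_detokenize():
    tokenized_lines = ["Halló , heimur !", "Þetta er próf ."]
    detok = detokenize(tokenized_lines, lang='is', tokenizer='moses', progress=False)
    for line in detok:
        print(line)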
def create_json(input_folder, input_summaries, output_folder):
    for filename in os.listdir(input_folder):
        d = None
        with codecs.open(input_folder + filename) as json_data:
            d = json.load(json_data)
            print('filename', input_folder + filename)
        output = []
        for entry in d:
            # Build the expected summary file name from date, team names and final score
            datetime_object = datetime.strptime(entry['day'], '%m_%d_%y')
            html_file_name = []
            html_file_name.append(datetime_object.strftime("%Y%m%d"))
            visname_homename = entry['vis_name'].replace(" ", "_") + "-" + entry['home_name'].replace(" ", "_")
            visname_homename = visname_homename.replace('D-backs', 'Diamondbacks')
            html_file_name.append(visname_homename)
            html_file_name.append(str(entry['vis_line']['team_runs']) + "-" + str(entry['home_line']['team_runs']))
            files = glob.glob(input_summaries + "*" + "_".join(html_file_name))
            if len(files) < 1:
                print(input_summaries + "*" + "_".join(html_file_name) + " not found")
            elif len(files) > 1:
                print(input_summaries + "*" + "_".join(html_file_name) + " multiple found")
            else:
                fname = files[0]
                with codecs.open(fname, encoding='utf-8') as f:
                    content = f.readlines()
                # Detokenize and re-tokenize each paragraph, then join with a paragraph marker
                updated_content = []
                for line in content:
                    words = word_tokenize(detokenize(line.strip().split()))
                    updated_content.append(" ".join(words))
                text = " *NEWPARAGRAPH* ".join(updated_content)
                entry['summary'] = text.split()
                output.append(entry)
        if len(output) > 0:
            with codecs.open(output_folder + 'combined_' + filename, 'w+') as outfile:
                json.dump(output, outfile)
def check_grammar(**options: Any) -> str:
    """Do a full spelling and grammar check of the source text"""
    accumul: List[str] = []
    offset = 0
    inneroptions: Dict[str, Union[str, bool]] = {}
    inneroptions["annotate_unparsed_sentences"] = options.get(
        "annotate_unparsed_sentences", True
    )
    inneroptions["ignore_rules"] = options.get("ignore_rules", set())
    annlist: List[str] = []
    format = options.get("format", "json")
    for toklist in sentence_stream(**options):
        len_tokens = len(toklist)
        # Invoke the spelling and grammar checker on the token list
        # Only contains options relevant to the grammar check
        sent = check_tokens(toklist, **inneroptions)
        if sent is None:
            # Should not happen?
            continue
        tokens: List[AnnTokenDict]
        if sent.tree is None:
            # Not parsed: use the raw token list
            tokens = [
                AnnTokenDict(k=d.kind, x=d.txt, o=d.original or d.txt)
                for d in sent.tokens
            ]
        else:
            # Successfully parsed: use the text from the terminals (where available)
            # since we have more info there, for instance on em/en dashes.
            # Create a map of token indices to corresponding terminal text
            assert sent.terminals is not None
            token_map = {t.index: t.text for t in sent.terminals}
            tokens = [
                AnnTokenDict(
                    k=d.kind, x=token_map.get(ix, d.txt), o=d.original or d.txt
                )
                for ix, d in enumerate(sent.tokens)
            ]
        # Maintain token character offsets, accumulated over the entire source text
        token_offsets: Dict[int, int] = dict()
        for ix, t in enumerate(toklist):
            token_offsets[ix] = offset
            offset += len(t.original or t.txt or "")
        # Create a normalized form of the sentence
        cleaned = detokenize(toklist, normalize=True)
        # Extract the annotation list (defensive programming here)
        a: List[Annotation] = getattr(sent, "annotations", cast(List[Annotation], []))
        # Sort in ascending order by token start index, and then by end index
        # (more narrow/specific annotations before broader ones)
        a.sort(key=lambda ann: (ann.start, ann.end))
        if format == "text" or format == "textplustoks":
            arev = sorted(a, key=lambda ann: (ann.start, ann.end), reverse=True)
            cleantoklist: List[CorrectToken] = toklist[:]
            for xann in arev:
                if xann.suggest is None:
                    # Nothing to correct with, nothing we can do
                    continue
                cleantoklist[xann.start + 1].txt = xann.suggest
                if xann.end > xann.start:
                    # Annotation spans many tokens
                    # "Okkur börnunum langar í fisk"
                    # "Leita að kílómeter af féinu" → leita að kílómetri af fénu → leita að kílómetra af fénu
                    # "dást af þeim" → "dást að þeim"
                    # Single-token annotations for this span have already been handled
                    # Only case is one ann, many toks in toklist
                    # Give the first token the correct value
                    # Delete the other tokens
                    del cleantoklist[xann.start + 2 : xann.end + 2]
            txt = detokenize(cleantoklist, normalize=True)
            if options.get("annotations", False):
                for aann in a:
                    annlist.append(str(aann))
                if annlist and not options.get("print_all", False):
                    txt = txt + "\n" + "\n".join(annlist)
                    annlist = []
            accumul.append(txt)
        elif format == "json":
            # Create final dictionary for JSON encoding
            # Convert the annotations to a standard format before encoding in JSON
            annotations: List[AnnDict] = [
                AnnDict(
                    # Start token index of this annotation
                    start=ann.start,
                    # End token index (inclusive)
                    end=ann.end,
                    # Character offset of the start of the annotation in the original text
                    start_char=token_offsets[ann.start],
                    # Character offset of the end of the annotation in the original text
                    # (inclusive, i.e. the offset of the last character)
                    end_char=(
                        token_offsets[ann.end + 1] if ann.end + 1 < len_tokens else offset
                    ) - 1,
                    code=ann.code,
                    text=ann.text,
                    detail=ann.detail or "",
                    suggest=ann.suggest or "",
                )
                for ann in a
            ]
            ard = AnnResultDict(
                original=cleaned,
                corrected=sent.tidy_text,
                tokens=tokens,
                annotations=annotations,
            )
            accumul.append(json_dumps(ard))
        elif format == "csv":
            for cann in a:
                accumul.append(
                    "{},{},{},{},{},{}".format(
                        cann.code,
                        cann.original,
                        cann.suggest,
                        cann.start,
                        cann.end,
                        cann.suggestlist,
                    )
                )
        elif format == "m2":
            accumul.append("S {0}".format(cleaned))
            for mann in a:
                accumul.append(
                    "A {0} {1}|||{2}|||{3}|||REQUIRED|||-NONE-|||0".format(
                        mann.start, mann.end, mann.code, mann.suggest
                    )
                )
            accumul.append("")
    if options.get("print_all", True):
        accumstr = " ".join(accumul)
        if annlist:
            # We want the annotations at the bottom
            accumstr = accumstr + "\n" + "\n".join(annlist)
    else:
        accumstr = "\n".join(accumul)
    return accumstr
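# Hedged usage sketch for check_grammar: it forwards its keyword options both
# to sentence_stream() (which supplies the tokenized sentences) and to the
# output-formatting logic above. The option name used to pass in the source
# text ("input") is an assumption here; only "format", "annotations",
# "ignore_rules" and "print_all" are read directly by check_grammar itself.
def example_check_grammar():
    result = check_grammar(input="Þesi setning er rang.", format="json")
    # With the default print_all=True, the per-sentence JSON objects are
    # joined into a single string; see AnnResultDict/AnnDict above.
    print(result)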
def test_grammar(**options: Any) -> Tuple[str, TokenSumType]:
    """Do a full spelling and grammar check of the source text"""
    accumul: List[str] = []
    offset = 0
    alltoks: TokenSumType = []
    inneroptions: Dict[str, Union[str, bool]] = {}
    inneroptions["annotate_unparsed_sentences"] = options.get(
        "annotate_unparsed_sentences", True
    )
    inneroptions["ignore_rules"] = options.get("ignore_rules", set())
    annlist: List[str] = []
    for toklist in sentence_stream(**options):
        # Invoke the spelling and grammar checker on the token list
        # Only contains options relevant to the grammar check
        sent = check_tokens(toklist, **inneroptions)
        if sent is None:
            # Should not happen?
            continue
        # Maintain token character offsets, accumulated over the entire source text
        token_offsets: Dict[int, int] = dict()
        for ix, t in enumerate(toklist):
            token_offsets[ix] = offset
            offset += len(t.original or t.txt or "")
        # Extract the annotation list (defensive programming here)
        a: List[Annotation] = getattr(sent, "annotations", cast(List[Annotation], []))
        # Sort in ascending order by token start index, and then by end index
        # (more narrow/specific annotations before broader ones)
        a.sort(key=lambda ann: (ann.start, ann.end))
        arev = sorted(a, key=lambda ann: (ann.start, ann.end), reverse=True)
        cleantoklist: List[CorrectToken] = toklist[:]
        alltoks.extend(cleantoklist)
        for xann in arev:
            if xann.suggest is None:
                # Nothing to correct with, nothing we can do
                continue
            cleantoklist[xann.start + 1].txt = xann.suggest
            if xann.end > xann.start:
                # Annotation spans many tokens
                # "Okkur börnunum langar í fisk"
                # "Leita að kílómeter af féinu" → leita að kílómetri af fénu → leita að kílómetra af fénu
                # "dást af þeim" → "dást að þeim"
                # Single-token annotations for this span have already been handled
                # Only case is one ann, many toks in toklist
                # Give the first token the correct value
                # Delete the other tokens
                del cleantoklist[xann.start + 2 : xann.end + 2]
        txt = detokenize(cleantoklist, normalize=True)
        if options.get("annotations", False):
            for aann in a:
                annlist.append(str(aann))
            if annlist and not options.get("print_all", False):
                txt = txt + "\n" + "\n".join(annlist)
                annlist = []
        accumul.append(txt)
    accumstr = "\n".join(accumul)
    return accumstr, alltoks
def should_be(s1: str, s2: str) -> None:
    toklist = t.tokenize(s1, **options)
    assert s2 == t.detokenize(toklist, **options)
def should_be_equal(s: str) -> None:
    toklist = t.tokenize(s, **options)
    assert s == t.detokenize(toklist, **options)
def should_be(s1, s2):
    toklist = t.tokenize(s1, **options)
    assert s2 == t.detokenize(toklist, **options)
def correct_spaces(tokens: Iterable[Tuple[str, str]]) -> str:
    """Returns a string with a reasonably correct concatenation
    of the tokens, where each token is a (tag, text) tuple."""
    return detokenize(
        Tok(TOK.PUNCTUATION if tag == "c" else TOK.WORD, txt, None)
        for tag, txt in tokens
    )
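# Hedged usage sketch for correct_spaces: each token is a (tag, text) tuple
# where tag "c" marks punctuation and anything else is treated as a word.
# The exact spacing in the output depends on the underlying detokenize()
# implementation; the sample tags and expected string below are illustrative.
def example_correct_spaces():
    tagged = [("o", "Páll"), ("c", ","), ("o", "sem"), ("o", "kom"), ("c", ".")]
    print(correct_spaces(tagged))  # expected roughly: "Páll, sem kom."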
def test_correction():
    SENT = [
        (
            """Hann sagði: "Þú ert fífl"! Ég mótmælti því.""",
            """Hann sagði: „Þú ert fífl“! Ég mótmælti því.""",
        ),
        (
            """Hann sagði: Þú ert "fífl"! Ég mótmælti því.""",
            """Hann sagði: Þú ert „fífl“! Ég mótmælti því.""",
        ),
        (
            """Hann sagði: Þú ert «fífl»! Ég mótmælti því.""",
            """Hann sagði: Þú ert „fífl“! Ég mótmælti því.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 3ja sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 1sta sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu 2svar í bað.""",
        ),
        (
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
        ),
        (
            """Hann sagði: Þú ert ´fífl´! Hringdu í 7771234.""",
            """Hann sagði: Þú ert ‚fífl‘! Hringdu í 7771234.""",
        ),
        (
            """Hann sagði: Þú ert (´fífl´)! Ég mótmælti því.""",
            """Hann sagði: Þú ert (‘ fífl‘)! Ég mótmælti því.""",  # !!!
        ),
        (
            """Hann "gaf" mér 10,780.65 dollara.""",
            """Hann „gaf“ mér 10,780.65 dollara.""",
        ),
        (
            """Hann "gaf" mér €10,780.65.""",
            """Hann „gaf“ mér €10,780.65.""",
        ),
        (
            """Hann "gaf" mér €10.780,65.""",
            """Hann „gaf“ mér €10.780,65.""",
        ),
    ]
    SENT_KLUDGY_ORDINALS_MODIFY = [
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja herbergja íbúð.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í þriggja herbergja íbúð.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í fyrsta sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu tvisvar í bað.""",
        ),
        (
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
            """Ég keypti fjögurra herbergja íbúð á verði tveggja herbergja.""",
        ),
    ]
    SENT_KLUDGY_ORDINALS_TRANSLATE = [
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 3ja sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 1sta sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu 2svar í bað.""",
        ),
        (
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
        ),
    ]
    SENT_CONVERT_NUMBERS = [
        (
            """Hann "gaf" mér 10,780.65 dollara.""",
            """Hann „gaf“ mér 10.780,65 dollara.""",
        ),
        (
            """Hann "gaf" mér €10,780.65.""",
            """Hann „gaf“ mér €10.780,65.""",
        ),
        (
            """Hann "gaf" mér €10.780,65.""",
            """Hann „gaf“ mér €10.780,65.""",
        ),
    ]
    for sent, correct in SENT:
        s = t.tokenize(sent)
        txt = t.detokenize(s, normalize=True)
        assert txt == correct
    for sent, correct in SENT_KLUDGY_ORDINALS_MODIFY:
        s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_MODIFY)
        txt = t.detokenize(s, normalize=True)
        assert txt == correct
    for sent, correct in SENT_KLUDGY_ORDINALS_TRANSLATE:
        s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_TRANSLATE)
        txt = t.detokenize(s, normalize=True)
        assert txt == correct
    for sent, correct in SENT_CONVERT_NUMBERS:
        s = t.tokenize(sent, convert_numbers=True)
        txt = t.detokenize(s, normalize=True)
        assert txt == correct