import csv
from typing import IO

from mosestokenizer import MosesSentenceSplitter


def create_new_segmentation(infile: IO, outfile: IO, lang: str):
    simple_fieldnames = [
        "simple_article_id",
        "simple_section_id",
        "simple_sent_id",
        "simple_url",
        "simple_article_title",
        "simple_section_title",
        "simple_sent",
        "de_article_title",
        "de_url",
        "simplede_sent",
    ]
    simple_reader = csv.DictReader(infile, simple_fieldnames, delimiter="\t")
    simple_writer = csv.DictWriter(
        outfile, simple_fieldnames, delimiter="\t", extrasaction="ignore")
    last_article_id = None
    last_section_id = None
    last_line = None
    section_content = ""
    with MosesSentenceSplitter(lang) as splitsents:
        for line in simple_reader:
            # A new section begins at article borders or, within articles, at
            # section borders; in subsequent short articles the section ID does
            # not necessarily change, hence the check on both IDs.
            if (last_article_id != line["simple_article_id"]
                    or last_section_id != line["simple_section_id"]) \
                    and last_section_id is not None:
                sent_id = 1
                if section_content.strip():
                    for sent in splitsents([section_content]):
                        out_line = last_line
                        out_line.update({
                            "simple_sent": "NOT_PARALLEL",
                            "simplede_sent": sent,
                            "simple_sent_id": sent_id,
                        })
                        simple_writer.writerow(out_line)
                        sent_id += 1
                section_content = ""
            section_content += " " + line["simplede_sent"].strip()
            last_article_id = line["simple_article_id"]
            last_section_id = line["simple_section_id"]
            last_line = line
        # Flush the final section; sentence numbering restarts at 1 here as well.
        sent_id = 1
        for sent in splitsents([section_content]):
            out_line = last_line
            out_line.update({
                "simple_sent": "NOT_PARALLEL",
                "simplede_sent": sent,
                "simple_sent_id": sent_id,
            })
            simple_writer.writerow(out_line)
            sent_id += 1
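# Hypothetical invocation of the function above; the file names below are
# placeholders rather than paths from the original project.
with open("aligned.tsv", encoding="utf-8") as fin, \
        open("resegmented.tsv", "w", encoding="utf-8") as fout:
    create_new_segmentation(fin, fout, "de")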
def moses_sentenize(text):
    from mosestokenizer import MosesSentenceSplitter
    global MOSES_SENT
    # Lazily create a single Russian splitter and reuse it across calls.
    if not MOSES_SENT:
        MOSES_SENT = MosesSentenceSplitter('ru')
    chunks = MOSES_SENT([text])
    return find_substrings(chunks, text)
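# Usage sketch for the helper above. MOSES_SENT is the module-level cache the
# function expects, and find_substrings (defined elsewhere in the module) is
# assumed to map the returned sentence chunks back to spans of the input text.
MOSES_SENT = None

spans = moses_sentenize('Привет, мир. Как дела?')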
def __init__(self, lm_type: LMType, language: str):
    """
    lm_type: LMType
    language: language code
    """
    self.language = language
    self.tokenizer = MosesTokenizer(self.language)
    self.normalizer = MosesPunctuationNormalizer(self.language)
    self.splitter = MosesSentenceSplitter(self.language, more=False)
    self.type = lm_type
import html
import json

from mosestokenizer import MosesDetokenizer, MosesSentenceSplitter


def main(args):
    splits = MosesSentenceSplitter('fi')
    detok = MosesDetokenizer("fi")
    with open(args.infile, "r") as fi, open(args.outfile, "w") as fo:
        for line in fi:
            data = json.loads(line.strip())
            # Detokenize and unescape first if the input text is Moses-tokenized.
            text = (html.unescape(detok(data["text"].split()))
                    if args.moses_tokenized else data["text"])
            sents = splits([text])
            for i, s in enumerate(sents):
                d = data.copy()
                d["text"] = s
                if "id" in d.keys():
                    d["id"] = d["id"] + f"-s{i}"
                fo.write(json.dumps(d, ensure_ascii=False) + "\n")
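# A minimal argument parser matching the attributes main() reads; the option
# names are inferred from the snippet rather than copied from the original CLI.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("infile")
    parser.add_argument("outfile")
    parser.add_argument("--moses-tokenized", dest="moses_tokenized", action="store_true")
    main(parser.parse_args())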
def __init__(self, srclang, targetlang, sourcebpe=None, targetbpe=None,
             sourcespm=None, targetspm=None):
    self.bpe_source = None
    self.bpe_target = None
    self.sp_processor_source = None
    self.sp_processor_target = None
    self.sentences = []

    # load BPE model for pre-processing
    if sourcebpe:
        # print("load BPE codes from " + sourcebpe, flush=True)
        BPEcodes = open(sourcebpe, 'r', encoding="utf-8")
        self.bpe_source = BPE(BPEcodes)
    if targetbpe:
        # print("load BPE codes from " + targetbpe, flush=True)
        BPEcodes = open(targetbpe, 'r', encoding="utf-8")
        self.bpe_target = BPE(BPEcodes)

    # load SentencePiece model for pre-processing
    if sourcespm:
        # print("load sentence piece model from " + sourcespm, flush=True)
        self.sp_processor_source = sentencepiece.SentencePieceProcessor()
        self.sp_processor_source.Load(sourcespm)
    if targetspm:
        # print("load sentence piece model from " + targetspm, flush=True)
        self.sp_processor_target = sentencepiece.SentencePieceProcessor()
        self.sp_processor_target.Load(targetspm)

    # pre- and post-processing tools
    self.tokenizer = None
    self.detokenizer = None

    # TODO: should we have support for other sentence splitters?
    # print("start pre- and post-processing tools")
    self.sentence_splitter = MosesSentenceSplitter(srclang)
    self.normalizer = MosesPunctuationNormalizer(srclang)
    if self.bpe_source:
        self.tokenizer = MosesTokenizer(srclang)
    if self.bpe_source:
        self.detokenizer = MosesDetokenizer(targetlang)
def __init__(self, mosestokenizer_language_code="en", store_data=False,
             spell_checker_lang=None, n_jobs=1):
    self.mosestokenizer_language_code = mosestokenizer_language_code
    self.splitsents = MosesSentenceSplitter(self.mosestokenizer_language_code)
    self.tokenize = MosesTokenizer(self.mosestokenizer_language_code)
    nltk.download('wordnet', quiet=False)
    self.lemmatizer = nltk.stem.WordNetLemmatizer()
    self.stop = False
    self.store_data = store_data
    if spell_checker_lang is None:
        logger.info("The spell checker is disabled.")
        self.spell_checker = None
    else:
        logger.info("The spell checker is enabled for %s." % (spell_checker_lang))
        self.spell_checker = SpellChecker(language=spell_checker_lang, n_jobs=n_jobs)
def split_sentences(paragraph, language):
    if language == "en":
        with MosesSentenceSplitter(language) as splitter:
            return splitter([paragraph])
    elif language in INDIC:
        return sentence_tokenize.sentence_split(paragraph, lang=language)
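# INDIC is not shown in the excerpt; presumably it is a collection of language
# codes routed to the Indic NLP Library splitter, along these lines:
INDIC = {"as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"}

english_sents = split_sentences("Dr. Smith arrived today. He will speak tomorrow.", "en")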
indic_language_dict = {
    'Assamese': 'as',
    'Hindi': 'hi',
    'Marathi': 'mr',
    'Tamil': 'ta',
    'Bengali': 'bn',
    'Kannada': 'kn',
    'Oriya': 'or',
    'Telugu': 'te',
    'Gujarati': 'gu',
    'Malayalam': 'ml',
    'Punjabi': 'pa',
}

splitter = MosesSentenceSplitter('en')


def get_inference_params():
    source_language = request.form['source_language']
    target_language = request.form['target_language']

    if source_language in indic_language_dict and target_language == 'English':
        model = indic2en_model
        source_lang = indic_language_dict[source_language]
        target_lang = 'en'
    elif source_language == 'English' and target_language in indic_language_dict:
        model = en2indic_model
        source_lang = 'en'
        target_lang = indic_language_dict[target_language]
    elif source_language in indic_language_dict and target_language in indic_language_dict:
def main(args):
    splits = MosesSentenceSplitter('fi')
    with open(os.path.join(args.model, "config.json"), "r") as f:
        config = json.load(f)
    id2label = config["id2label"]
    label2id = config["label2id"]
    labels = [v for k, v in sorted(id2label.items(), key=lambda x: int(x[0]))]
    tokenizer = transformers.BertTokenizer.from_pretrained(args.model)
    model = models.BertForMultiLabelSequenceClassification.from_pretrained(
        args.model, num_labels=len(id2label)
    )
    model.eval()
    with open(args.data, "r") as f:
        data = [json.loads(line.strip()) for line in f]
    if args.sentences:
        for sample in data:
            preds = []
            sents = splits([sample["text"]])
            batches = utils.get_batches(sents, 8)
            for batch in batches:
                input = tokenizer(batch, truncation=True, padding=True, return_tensors="pt")
                result = model(
                    input_ids=input["input_ids"],
                    token_type_ids=input["token_type_ids"],
                    attention_mask=input["attention_mask"]
                )
                pred = sigmoid(result.logits.detach().numpy())
                pred = (pred >= args.classification_threshold).astype(int)
                preds.append(pred)
            preds = np.concatenate(preds, axis=0)
            docpred = list(preds.sum(axis=0))
            doctrue = [0] * len(label2id)
            for tag in sample["tags"]:
                doctrue[label2id[tag]] = 1
            print("-" * 50)
            print(f"DOCUMENT: {sample['id']}, #sents: {len(sents)}")
            print(tabulate.tabulate(
                [["pred"] + docpred, ["gold"] + doctrue],
                headers=[""] + labels)
            )
            print()
            if args.print_sentences:
                for sent, inds in zip(sents, preds):
                    print(sent, end="")
                    for i, ind in enumerate(inds):
                        if ind == 1:
                            print(" " + id2label[str(i)], end="")
                    print()
                print()
    else:
        texts = [sample["text"] for sample in data]
        batches = utils.get_batches(texts, 8)
        for batch in batches:
            input = tokenizer(batch, truncation=True, padding=True, return_tensors="pt")
            result = model(
                input_ids=input["input_ids"],
                token_type_ids=input["token_type_ids"],
                attention_mask=input["attention_mask"]
            )
            print(f"DOCUMENT: {sample['id']}, #chars: {len(sents)}")
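# sigmoid() is referenced but not defined in the excerpt; a NumPy definition
# like the following is presumably what the script relies on.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))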
def split(self, data: list) -> list:
    data = [line for line in data if line.strip()]
    if len(data):
        with MosesSentenceSplitter(self.language_code) as splitter:
            data = splitter(data)
    return data
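# Minimal sketch attaching split() to a hypothetical host class; in the
# original it is a method of a larger class, and language_code is the only
# attribute it needs. The blank-line filter keeps the splitter from being
# called with nothing to split.
class SentenceSplitService:
    def __init__(self, language_code: str = "en"):
        self.language_code = language_code

SentenceSplitService.split = split
print(SentenceSplitService().split(["First sentence. Second sentence.", "   "]))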
def __init__(self):
    from mosestokenizer import MosesSentenceSplitter
    self.splitter = MosesSentenceSplitter('ru')
from mosestokenizer import (
    MosesTokenizer, MosesPunctuationNormalizer, MosesSentenceSplitter,
    MosesDetokenizer)

JSON_HEADER = {'Content-type': 'application/json'}

APP = Flask(__name__)
APP.sentiment_en_address = None
APP.sentiment_cs_address = None

EN_MOSES_TOKENIZER = MosesTokenizer("en")
CS_MOSES_TOKENIZER = MosesTokenizer("cs")
EN_MOSES_PUNCT_NORM = MosesPunctuationNormalizer("en")
CS_MOSES_PUNCT_NORM = MosesPunctuationNormalizer("cs")
EN_MOSES_SENT_SPLITTER = MosesSentenceSplitter("en")
EN_MOSES_DETOKENIZER = MosesDetokenizer("en")
CS_MOSES_DETOKENIZER = MosesDetokenizer("cs")

ALPHANUMERIC_CHARSET = set(
    chr(i) for i in range(sys.maxunicode)
    if (unicodedata.category(chr(i)).startswith("L")
        or unicodedata.category(chr(i)).startswith("N")))


def root_dir():  # pragma: no cover
    return os.path.abspath(os.path.dirname(__file__))


def get_file(file_name):
    try:
def _generate_lines(
    self,
    articles: List[Tuple[Dict[str, str], str]],
    match_lang: str,
    input_lang: str,
    cursor: "CMySQLCursor",
) -> Tuple[List[Tuple[str]], List[Tuple[str]]]:
    """
    generates tuples representing lines in a final output file
    """
    match_lines = []
    no_match_lines = []
    with MosesSentenceSplitter(input_lang) as splitsents:
        for attrs, text in articles:
            lines = []
            section_name = "Summary"
            section_id = 1
            sent_id = 1
            section_str = ""
            for line in text.split("\n"):
                # filtering out empty lines and the title line
                if line.strip() and line != attrs["title"]:
                    # the beginning of a new section
                    if line.startswith("Section::::"):
                        if section_str.strip():
                            for sent in splitsents([section_str]):
                                lin = [
                                    attrs["id"],
                                    section_id,
                                    sent_id,
                                    attrs["url"],
                                    attrs["title"],
                                    section_name,
                                    sent,
                                ]
                                lines.append(lin)
                                sent_id += 1
                            section_str = ""
                        section_id += 1
                        section_name = line.replace("Section::::", "").rstrip(".")
                    # normal text rows
                    else:
                        section_str += " " + line.strip()
            # flush the last section of the article
            if section_str.strip():
                for sent in splitsents([section_str]):
                    lin = [
                        attrs["id"],
                        section_id,
                        sent_id,
                        attrs["url"],
                        attrs["title"],
                        section_name,
                        sent,
                    ]
                    lines.append(lin)
                    sent_id += 1
            if self.find_corresponding_article_title:
                matched_title = self._find_other_lang_title(
                    attrs["id"], cursor, match_lang)
                if matched_title:
                    match_lines += [
                        tuple(line + [matched_title]) for line in lines
                    ]
                else:
                    no_match_lines += [tuple(line) for line in lines]
            else:
                match_lines += [tuple(line) for line in lines]
    return match_lines, no_match_lines