def create_new_segmentation(infile: IO, outfile: IO, lang: str):
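    """
    Re-split the sentences in a tab-separated file: the "simplede_sent" text of each
    section is concatenated, split again with Moses, and written out as one row per
    new sentence, with "simple_sent" set to NOT_PARALLEL and a renumbered
    "simple_sent_id". Section borders are detected via changes of the article or
    section ID.
    """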
    simple_fieldnames = [
        "simple_article_id",
        "simple_section_id",
        "simple_sent_id",
        "simple_url",
        "simple_article_title",
        "simple_section_title",
        "simple_sent",
        "de_article_title",
        "de_url",
        "simplede_sent",
    ]
    simple_reader = csv.DictReader(infile, simple_fieldnames, delimiter="\t")
    simple_writer = csv.DictWriter(outfile,
                                   simple_fieldnames,
                                   delimiter="\t",
                                   extrasaction="ignore")

    last_article_id = None
    last_section_id = None
    last_line = None
    section_content = ""
    with MosesSentenceSplitter(lang) as splitsents:
        for line in simple_reader:
            # new section begins at article borders or within articles at section borders
            # in subsequent short articles, section ID does not necessarily change
            if (last_article_id != line["simple_article_id"]
                    or last_section_id != line["simple_section_id"]
                ) and last_section_id is not None:
                sent_id = 1
                if section_content.strip():
                    for sent in splitsents([section_content]):
                        out_line = last_line
                        out_line.update({
                            "simple_sent": "NOT_PARALLEL",
                            "simplede_sent": sent,
                            "simple_sent_id": sent_id,
                        })
                        simple_writer.writerow(out_line)
                        sent_id += 1
                section_content = ""

            section_content += " " + line["simplede_sent"].strip()
            last_article_id = line["simple_article_id"]
            last_section_id = line["simple_section_id"]
            last_line = line

        # flush the last section once the input is exhausted
        if last_line is not None and section_content.strip():
            sent_id = 1
            for sent in splitsents([section_content]):
                out_line = last_line
                out_line.update({
                    "simple_sent": "NOT_PARALLEL",
                    "simplede_sent": sent,
                    "simple_sent_id": sent_id,
                })
                simple_writer.writerow(out_line)
                sent_id += 1
Example #2
def moses_sentenize(text):
    from mosestokenizer import MosesSentenceSplitter

    global MOSES_SENT
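    # lazily create one module-level splitter so repeated calls reuse the same instance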
    if not MOSES_SENT:
        MOSES_SENT = MosesSentenceSplitter('ru')

    chunks = MOSES_SENT([text])
    return find_substrings(chunks, text)
Example #3
 def __init__(self, lm_type: LMType, language: str):
     """
     lm_type: LMType
     language: language code
     """
     self.language = language
     self.tokenizer = MosesTokenizer(self.language)
     self.normalizer = MosesPunctuationNormalizer(self.language)
     self.splitter = MosesSentenceSplitter(self.language, more=False)
     self.type = lm_type
Example #4
def main(args):
    splits = MosesSentenceSplitter('fi')
    detok = MosesDetokenizer("fi")
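    # read one JSON document per line, split its text into sentences,
    # and write each sentence back out as its own JSON line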
    with open(args.infile, "r") as fi, open(args.outfile, "w") as fo:
        for line in fi:
            data = json.loads(line.strip())
            text = html.unescape(detok(data["text"].split())) if args.moses_tokenized else data["text"]
            sents = splits([text])
            for i, s in enumerate(sents):
                d = data.copy()
                d["text"] = s
                if "id" in d.keys():
                    d["id"] = d["id"] + f"-s{i}"

                fo.write(json.dumps(d, ensure_ascii=False) + "\n")
Example #5
    def __init__(self,
                 srclang,
                 targetlang,
                 sourcebpe=None,
                 targetbpe=None,
                 sourcespm=None,
                 targetspm=None):
        self.bpe_source = None
        self.bpe_target = None
        self.sp_processor_source = None
        self.sp_processor_target = None
        self.sentences = []
        # load BPE model for pre-processing
        if sourcebpe:
            # print("load BPE codes from " + sourcebpe, flush=True)
            BPEcodes = open(sourcebpe, 'r', encoding="utf-8")
            self.bpe_source = BPE(BPEcodes)
        if targetbpe:
            # print("load BPE codes from " + targetbpe, flush=True)
            BPEcodes = open(targetbpe, 'r', encoding="utf-8")
            self.bpe_target = BPE(BPEcodes)

        # load SentencePiece model for pre-processing
        if sourcespm:
            # print("load sentence piece model from " + sourcespm, flush=True)
            self.sp_processor_source = sentencepiece.SentencePieceProcessor()
            self.sp_processor_source.Load(sourcespm)
        if targetspm:
            # print("load sentence piece model from " + targetspm, flush=True)
            self.sp_processor_target = sentencepiece.SentencePieceProcessor()
            self.sp_processor_target.Load(targetspm)

        # pre- and post-processing tools
        self.tokenizer = None
        self.detokenizer = None

        # TODO: should we have support for other sentence splitters?
        # print("start pre- and post-processing tools")
        self.sentence_splitter = MosesSentenceSplitter(srclang)
        self.normalizer = MosesPunctuationNormalizer(srclang)
        if self.bpe_source:
            self.tokenizer = MosesTokenizer(srclang)
            self.detokenizer = MosesDetokenizer(targetlang)
Example #6
 def __init__(self,
              mosestokenizer_language_code="en",
              store_data=False,
              spell_checker_lang=None,
              n_jobs=1):
     self.mosestokenizer_language_code = mosestokenizer_language_code
     self.splitsents = MosesSentenceSplitter(
         self.mosestokenizer_language_code)
     self.tokenize = MosesTokenizer(self.mosestokenizer_language_code)
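     # WordNet data is required by the NLTK lemmatizer below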
     nltk.download('wordnet', quiet=False)
     self.lemmatizer = nltk.stem.WordNetLemmatizer()
     self.stop = False
     self.store_data = store_data
     if spell_checker_lang is None:
         logger.info("The spell checker is disabled.")
         self.spell_checker = None
     else:
         logger.info("The spell checker is enabled for %s." %
                     (spell_checker_lang))
         self.spell_checker = SpellChecker(language=spell_checker_lang,
                                           n_jobs=n_jobs)
Example #7
def split_sentences(paragraph, language):
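    # English goes through Moses; languages listed in INDIC use sentence_tokenize instead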
    if language == "en":
        with MosesSentenceSplitter(language) as splitter:
            return splitter([paragraph])
    elif language in INDIC:
        return sentence_tokenize.sentence_split(paragraph, lang=language)
Example #8
indic_language_dict = {
    'Assamese': 'as',
    'Hindi': 'hi',
    'Marathi': 'mr',
    'Tamil': 'ta',
    'Bengali': 'bn',
    'Kannada': 'kn',
    'Oriya': 'or',
    'Telugu': 'te',
    'Gujarati': 'gu',
    'Malayalam': 'ml',
    'Punjabi': 'pa',
}

splitter = MosesSentenceSplitter('en')


def get_inference_params():
    source_language = request.form['source_language']
    target_language = request.form['target_language']

    if source_language in indic_language_dict and target_language == 'English':
        model = indic2en_model
        source_lang = indic_language_dict[source_language]
        target_lang = 'en'
    elif source_language == 'English' and target_language in indic_language_dict:
        model = en2indic_model
        source_lang = 'en'
        target_lang = indic_language_dict[target_language]
    elif source_language in indic_language_dict and target_language in indic_language_dict:
Example #9
def main(args):
    splits = MosesSentenceSplitter('fi')

    with open(os.path.join(args.model, "config.json"), "r") as f:
        config = json.load(f)

    id2label = config["id2label"]
    label2id = config["label2id"]
    labels = [v for k, v in sorted(id2label.items(), key=lambda x: int(x[0]))]
    tokenizer = transformers.BertTokenizer.from_pretrained(args.model)
    model = models.BertForMultiLabelSequenceClassification.from_pretrained(
        args.model, num_labels=len(id2label)
    )
    model.eval()

    with open(args.data, "r") as f:
        data = [json.loads(line.strip()) for line in f]

    if args.sentences:
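        # per-sentence mode: split each document into sentences, classify them in
        # batches, and aggregate the sentence predictions into a document prediction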
        for sample in data:
            preds = []
            sents = splits([sample["text"]])
            batches = utils.get_batches(sents, 8)
            for batch in batches:
                input = tokenizer(batch, truncation=True, padding=True, return_tensors="pt")
                result = model(
                    input_ids=input["input_ids"],
                    token_type_ids=input["token_type_ids"],
                    attention_mask=input["attention_mask"]
                )
                pred = sigmoid(result.logits.detach().numpy())
                pred = (pred >= args.classification_threshold).astype(int)
                preds.append(pred)

            preds = np.concatenate(preds, axis=0)
            docpred = list(preds.sum(axis=0))
            doctrue = [0]*len(label2id)
            for tag in sample["tags"]:
                doctrue[label2id[tag]] = 1

            print("-"*50)
            print(f"DOCUMENT: {sample['id']}, #sents: {len(sents)}")
            print(tabulate.tabulate(
                [["pred"] + docpred, ["gold"] + doctrue],
                headers=[""] + labels)
            )
            print()
            if args.print_sentences:
                for sent, inds in zip(sents, preds):
                    print(sent, end="")
                    for i, ind in enumerate(inds):
                        if ind == 1:
                            print(" " + id2label[str(i)], end="")
                    print()

            print()
    else:
        texts = [sample["text"] for sample in data]
        batches = utils.get_batches(texts, 8)
        for batch in batches:
            input = tokenizer(batch, truncation=True, padding=True, return_tensors="pt")
            result = model(
                input_ids=input["input_ids"],
                token_type_ids=input["token_type_ids"],
                attention_mask=input["attention_mask"]
            )

            print(f"DOCUMENT: {sample['id']}, #chars: {len(sents)}")
Example #10
 def split(self, data: list) -> list:
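     # drop blank lines, then let Moses split whatever remains into sentences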
     data = [line for line in data if line.strip()]
     if len(data):
         with MosesSentenceSplitter(self.language_code) as splitter:
             data = splitter(data)
     return data
Example #11
    def __init__(self):
        from mosestokenizer import MosesSentenceSplitter

        self.splitter = MosesSentenceSplitter('ru')
Example #12
from mosestokenizer import (
        MosesTokenizer, MosesPunctuationNormalizer, MosesSentenceSplitter,
        MosesDetokenizer)

JSON_HEADER = {'Content-type': 'application/json'}

APP = Flask(__name__)
APP.sentiment_en_address = None
APP.sentiment_cs_address = None

EN_MOSES_TOKENIZER = MosesTokenizer("en")
CS_MOSES_TOKENIZER = MosesTokenizer("cs")
EN_MOSES_PUNCT_NORM = MosesPunctuationNormalizer("en")
CS_MOSES_PUNCT_NORM = MosesPunctuationNormalizer("cs")
EN_MOSES_SENT_SPLITTER = MosesSentenceSplitter("en")
EN_MOSES_DETOKENIZER = MosesDetokenizer("en")
CS_MOSES_DETOKENIZER = MosesDetokenizer("cs")

ALPHANUMERIC_CHARSET = set(
    chr(i) for i in range(sys.maxunicode)
    if (unicodedata.category(chr(i)).startswith("L")
        or unicodedata.category(chr(i)).startswith("N")))


def root_dir():  # pragma: no cover
    return os.path.abspath(os.path.dirname(__file__))


def get_file(file_name):
    try:
Example #13
 def _generate_lines(
     self,
     articles: List[Tuple[Dict[str, str], str]],
     match_lang: str,
     input_lang: str,
     cursor: "CMySQLCursor",
 ) -> Tuple[List[Tuple[str]], List[Tuple[str]]]:
     """
     generates tuples representing lines in a final output file
     """
     match_lines = []
     no_match_lines = []
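      # accumulate each section's raw text and sentence-split it once the section ends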
     with MosesSentenceSplitter(input_lang) as splitsents:
         for attrs, text in articles:
             lines = []
             section_name = "Summary"
             section_id = 1
             sent_id = 1
             section_str = ""
             for line in text.split("\n"):
                  # skip empty lines and the title line
                  if line.strip() and line != attrs["title"]:
                     # the beginning of a new section
                     if line.startswith("Section::::"):
                         if section_str.strip():
                             for sent in splitsents([section_str]):
                                 lin = [
                                     attrs["id"],
                                     section_id,
                                     sent_id,
                                     attrs["url"],
                                     attrs["title"],
                                     section_name,
                                     sent,
                                 ]
                                 lines.append(lin)
                                 sent_id += 1
                         section_str = ""
                         section_id += 1
                         section_name = line.replace("Section::::",
                                                     "").rstrip(".")
                     # normal text rows
                     else:
                         section_str += " " + line.strip()
             if section_str.strip():
                 for sent in splitsents([section_str]):
                     lin = [
                         attrs["id"],
                         section_id,
                         sent_id,
                         attrs["url"],
                         attrs["title"],
                         section_name,
                         sent,
                     ]
                     lines.append(lin)
                     sent_id += 1
             if self.find_corresponding_article_title:
                 matched_title = self._find_other_lang_title(
                     attrs["id"], cursor, match_lang)
                 if matched_title:
                     match_lines += [
                         tuple(line + [matched_title]) for line in lines
                     ]
                 else:
                     no_match_lines += [tuple(line) for line in lines]
             else:
                 match_lines += [tuple(line) for line in lines]
     return match_lines, no_match_lines