Example #1
import os

from somajo import SoMaJo


def split(list_of_text, thread_number, TMP_DIR):
    """
    Splits the texts into sentences.
    Writes one sentence per line with a leading space (for BPE).
    Documents are separated by an empty line.
    """
    out_path = os.path.join(TMP_DIR, "Splitted_{:05d}.txt".format(thread_number))
    print(out_path)
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    with open(out_path, "w") as outF:
        for part in list_of_text:
            sentences = tokenizer.tokenize_text([part])
            for sentence in sentences:
                output = ""
                for token in sentence:
                    if (token.space_after and not token.last_in_sentence
                            and not token.first_in_sentence):
                        output += token.text + ' '
                    elif token.first_in_sentence:
                        output += ' ' + token.text + ' '
                    else:
                        output += token.text
                outF.write(output)
                outF.write("\n")
            outF.write("\n")

    return thread_number
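
A minimal driver for split() might look like this; the sample documents and the temporary directory are invented for illustration.

# Hypothetical usage sketch; assumes split() from the example above is in scope.
import tempfile

docs = [
    "Das ist der erste Satz. Hier kommt noch einer.",
    "Ein zweites Dokument mit einem einzigen Satz.",
]

tmp_dir = tempfile.mkdtemp()
split(docs, thread_number=0, TMP_DIR=tmp_dir)
# Writes <tmp_dir>/Splitted_00000.txt: one sentence per line,
# documents separated by an empty line.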
Example #2
    def __init__(self, language, processes=None):
        from somajo import SoMaJo

        tokenizer_type = {"de": "de_CMC", "en": "en_PTB"}[language]
        self.tokenizer = SoMaJo(tokenizer_type,
                                split_camel_case=True,
                                split_sentences=True)
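
Only the constructor is shown above; the same language lookup and a single tokenize_text() call boil down to roughly the following sketch (the sample sentence and the printed result are illustrative):

# Illustrative sketch of how the wrapped tokenizer is typically driven.
from somajo import SoMaJo

language = "de"  # or "en"
tokenizer_type = {"de": "de_CMC", "en": "en_PTB"}[language]
tokenizer = SoMaJo(tokenizer_type, split_camel_case=True, split_sentences=True)

for sentence in tokenizer.tokenize_text(["Ein Beispiel. Noch ein Satz."]):
    print([token.text for token in sentence])
# roughly: ['Ein', 'Beispiel', '.'] then ['Noch', 'ein', 'Satz', '.']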
Example #3
def tokenize(text):
    tokenizer = SoMaJo(language="de_CMC")
    for i in range(len(text)):
        text[i] = text[i].split()
        tok = tokenizer.tokenize_text(text[i])
        tok_sent = []
        for sent in tok:
            for word in sent:
                tok_sent.append(word.text)
        text[i] = tok_sent
Example #4
    def tokenizer(self, text):
        tokenizer = SoMaJo("en_PTB")
        tokenized_object = tokenizer.tokenize_text([text])
        sentences = []
        types = []
        for sent in tokenized_object:
            sentence = []
            for token in sent:
                sentence.append(token.text)
                types.append(token.token_class)
            sentences.append(sentence)
        self.output['tokens'] = sentences
        self.output['types'] = types
        return sentences, types
Example #5
def replace_hashtags_tokenizer(text):
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for i in range(len(text)):
        line = text[i].split()
        for j in range(len(line)):
            if line[j].startswith('#'):
                hashtag = []
                line[j] = line[j].replace('#', "")
                hashtag.append(line[j])
                tok_hashtag = tokenizer.tokenize_text(hashtag)
                for tok in tok_hashtag:
                    for t in tok:
                        print(t.text)
        text[i] = " ".join(line)
    return text
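
The example above only prints the tokenized hashtag; a variant that splices the tokens back into the line could look like this (the in-place replacement is an assumption, not part of the original snippet):

# Illustrative variant (not the original): write the camel-case split back in.
from somajo import SoMaJo

def replace_hashtags(text):
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for i in range(len(text)):
        line = text[i].split()
        for j in range(len(line)):
            if line[j].startswith('#'):
                word = line[j].replace('#', "")
                tokenized = tokenizer.tokenize_text([word])
                # splice the tokenized hashtag back into the line
                line[j] = " ".join(t.text for sent in tokenized for t in sent)
        text[i] = " ".join(line)
    return text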
Example #6
def main():
    args = arguments()
    n_tokens = 0
    n_sentences = 0
    t0 = time.perf_counter()
    is_xml = False
    if args.xml or args.tag is not None:
        is_xml = True
    tokenizer = SoMaJo(args.language,
                       split_camel_case=args.split_camel_case,
                       split_sentences=args.split_sentences,
                       xml_sentences=args.sentence_tag)
    if is_xml:
        eos_tags = args.tag
        if eos_tags is None:
            eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split(
            )
        eos_tags = set(eos_tags)
        chunks = tokenizer.tokenize_xml_file(args.FILE,
                                             eos_tags,
                                             strip_tags=args.strip_tags,
                                             parallel=args.parallel)
    else:
        chunks = tokenizer.tokenize_text_file(args.FILE,
                                              args.paragraph_separator,
                                              parallel=args.parallel)
    for chunk in chunks:
        n_sentences += 1
        for token in chunk:
            output = token.text
            if not token.markup:
                n_tokens += 1
                if args.token_classes:
                    output += "\t" + token.token_class
                if args.extra_info:
                    output += "\t" + token.extra_info
            print(output)
        if args.split_sentences and args.sentence_tag is None:
            print()
    t1 = time.perf_counter()
    if args.split_sentences:
        logging.info(
            "Tokenized %d tokens (%d sentences) in %d seconds (%d tokens/s)" %
            (n_tokens, n_sentences, t1 - t0, n_tokens / (t1 - t0)))
    else:
        logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" %
                     (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
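
For reference, the non-XML branch above boils down to roughly the following; the file name is invented and "empty_lines" is assumed as the paragraph separator value.

# Minimal sketch of the plain-text path: tokenize a file and print one token
# per line, with a blank line after each sentence.
from somajo import SoMaJo

tokenizer = SoMaJo("de_CMC", split_camel_case=True, split_sentences=True)
sentences = tokenizer.tokenize_text_file("input.txt", "empty_lines", parallel=1)
for sentence in sentences:
    for token in sentence:
        print(token.text)
    print()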
Example #7
    def make(self, prerequisite_data):
        paragraphs = prerequisite_data['paragraph']
        tokenizer = SoMaJo("de_CMC", split_camel_case=True)
        sentences = tokenizer.tokenize_text(paragraphs)

        tokens = []
        sentence_alignment = []

        for i, s in enumerate(sentences):
            tokens += [token.text for token in s]
            sentence_alignment += [i] * len(s)

        return {
            'token-somajo': tokens,
            'sentence-somajo': sentence_alignment,
            'token': tokens,
            'sentence': sentence_alignment
        }
Example #8
File: rouge.py  Project: domfr/GeRouge
    def __init__(self, alpha, stemming=True, split_compounds=True, minimal_mode=False):
        self.tokenizer = SoMaJo('de_CMC')
        self.sentence_splitter = SentenceSplitter(is_tuple=False)
        self.alpha = alpha
        self.stemming = stemming
        self.split_compounds = split_compounds
        self.stemmer = SnowballStemmer('german')
        self.minimal_mode = minimal_mode
        self.base_path = pathlib.Path(__file__).parent.absolute()

        self.remove_chars = ['²', '³', '“', '„', ',', '†', '‚', '‘', '–']
        self.remove_chars.extend(list(string.punctuation))
        self.replace_chars = [('ss', 'ß'), ('ä', 'ae'), ('ü', 'ue'), ('ö', 'oe')]

        self.stop = set()
        with open(os.path.join(self.base_path, 'data', 'GermanST_utf8.txt'), 'r') as f:
            for line in f:
                self.stop.add(line.strip())
        if not minimal_mode:
            self.smart_stop = set()
            with open(os.path.join(self.base_path, 'data', 'smart_stop.txt'), 'r') as f:
                for line in f:
                    word = line.strip().lower()
                    self.smart_stop.add(word)
                    for replace_char in self.replace_chars:
                        word = word.replace(replace_char[0], replace_char[1])
                    # also keep the variant with normalized umlauts/ß
                    self.smart_stop.add(word)
            self.lemmas = {}
            with open(os.path.join(self.base_path, 'data', 'baseforms_by_projekt_deutscher_wortschatz.txt'), 'r') as f:
                for line in f:
                    parts = line.strip().split('\t')
                    parts[0] = parts[0].strip().lower()
                    parts[1] = parts[1].strip().lower()
                    for replace_char in self.replace_chars:
                        parts[0] = parts[0].replace(replace_char[0], replace_char[1])
                        parts[1] = parts[1].replace(replace_char[0], replace_char[1])
                    self.lemmas[parts[0]] = parts[1]
Example #9
            or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


# =============================================================================
# SoMaJo taken from https://github.com/tsproisl/SoMaJo
# =============================================================================
if False:
    from tqdm import tqdm

    sen_out = []
    tokenizer = SoMaJo("de_CMC", split_camel_case=True)
    for part in tqdm(raw_text):
        sentences = tokenizer.tokenize_text([part])
        for sentence in sentences:
            word_list = [token.text for token in sentence]
            output = " ".join(word_list[:-1])
            output += word_list[-1]
            sen_out.append(output)

    _is_punctuation(raw_text[-1][-1])

    stripped = []
    for index, part in tqdm(enumerate(sen_out)):
        reordered = ""
        for char in part:
            if not _is_punctuation(char):
Example #10
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = SoMaJo("de_CMC", split_camel_case=True, split_sentences=True)
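
A test built on this fixture might look like the following; the expected token list is an assumption about how de_CMC splits this particular sentence.

    def test_simple_sentence(self):
        """Hypothetical test: a plain sentence becomes word tokens plus the final period."""
        sentences = list(self.tokenizer.tokenize_text(["Das ist ein Test."]))
        self.assertEqual([token.text for token in sentences[0]],
                         ["Das", "ist", "ein", "Test", "."])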
Example #11
import gzip
import orjson
from somajo import SoMaJo
from tqdm import tqdm
import argparse

tokenizer = SoMaJo("de_CMC")


# see https://github.com/tsproisl/SoMaJo/issues/17
def detokenize(tokens):
    out = []
    for token in tokens:
        if token.original_spelling is not None:
            out.append(token.original_spelling)
        else:
            out.append(token.text)

        if token.space_after:
            out.append(" ")

    return "".join(out)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    args = parser.parse_args()
    input_filename = args.filename
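
A round trip through detokenize() for one paragraph could look like this, using the module-level tokenizer defined above; the sample string is invented, and since tokenize_text() yields one token list per sentence, the lists are chained together first.

# Illustrative round trip: tokenize a paragraph, then rebuild the surface string.
from itertools import chain

paragraph = "Das geht z.B. so."
tokens = chain.from_iterable(tokenizer.tokenize_text([paragraph]))
print(detokenize(tokens))  # should closely reproduce the original spacing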
Example #12
def SentenceSplit(text):
    tokenizer = SoMaJo("de_CMC")
    tokens = tokenizer.tokenize_text(text)
    return tokens
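
SentenceSplit() expects a list of paragraphs and hands back the generator from tokenize_text(), so it is usually consumed like this (sample input invented):

# Illustrative consumption of the returned sentence generator.
for sentence in SentenceSplit(["Erster Satz. Zweiter Satz."]):
    print(" ".join(token.text for token in sentence))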
Example #13
    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = SoMaJo(model_name, split_sentences=False)
Example #14
    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = SoMaJo(model_name)
Example #15
class TokenizedBaseDocument(BaseDocument):
    _tokenizers: ClassVar[Mapping[str, SoMaJo]] = {
        "en": SoMaJo("en_PTB", split_sentences=False),
        "de": SoMaJo("de_CMC", split_sentences=False),
    }
    _lang_callback: ClassVar[Callable[[Mapping[str, object]], str]] = lambda _: "en"
    _text_field_map: ClassVar[Mapping[str, object]]

    @classmethod
    def _make_text_field_map(
        cls,
        document_cls: _T_DocumentMeta,
    ) -> Mapping[str, object]:
        text_field_map: MutableMapping[str, object] = {}

        mapping = document_cls._doc_type.mapping
        for field_name in mapping:
            field = mapping[field_name]

            if isinstance(field, Text):
                text_field_map[field_name] = True

            elif isinstance(field, Object):
                inner_class = field._doc_class
                inner_text_field_map = cls._make_text_field_map(inner_class)
                if inner_text_field_map:
                    text_field_map[field_name] = inner_text_field_map

        return text_field_map

    @classmethod
    @overrides
    def prepare_doc_dict(cls, doc_dict: MutableMapping[str, object]) -> None:
        super().prepare_doc_dict(doc_dict)

        lang = cls._lang_callback(doc_dict)
        if lang not in cls._tokenizers.keys():
            _LOGGER.error(
                "No tokenizer available for language '{}'. Defaulting to '{}'. "
                "Available languages: {}",
                lang,
                "en",
                ", ".join(cls._tokenizers.keys()),
            )
            lang = "en"

        cls._tokenize_doc_dict(doc_dict, cls._text_field_map, lang)

    @classmethod
    def _tokenize_doc_dict(
        cls,
        doc_dict: MutableMapping[str, object],
        text_field_map: Mapping[str, object],
        lang: str,
    ) -> None:
        for field_name, text_field_or_childs in text_field_map.items():
            # text_field_or_childs is either True or a mapping
            value = doc_dict.get(field_name)
            if not value:
                continue
            elif text_field_or_childs is True:
                (
                    doc_dict[field_name],
                    doc_dict[field_name + "_orig"],
                    doc_dict[field_name + "_tokens"],
                ) = cls._tokenize(checked_cast(str, value), lang)
            elif isinstance(value, MutableMapping):
                cls._tokenize_doc_dict(
                    value, cast(Mapping[str, object], text_field_or_childs), lang
                )
            elif isinstance(value, Sequence):
                for v in value:
                    cls._tokenize_doc_dict(
                        v, cast(Mapping[str, object], text_field_or_childs), lang
                    )
            else:
                raise ValueError(
                    f"Value for Object-field needs to be either a Mapping or a "
                    f"Sequence. The value was: {value}"
                )

    @classmethod
    def _tokenize(cls, text_orig: str, lang: str) -> Tuple[str, str, Sequence[str]]:
        text = text_orig.strip()
        text = normalize("NFKC", text)
        if not text:
            return "", "", []

        try:
            text = str(html.fromstring(text).text_content())
        except LxmlError:
            _LOGGER.warning(
                "lxml HTML parsing failed. Skipping it for this document.",
                exc_info=True,
            )

        if not text:
            return "", "", []

        tokens = [
            token.text.lower()
            for token in next(cls._tokenizers[lang].tokenize_text([text]))
            if (token.token_class not in ["URL", "symbol"])
        ]
        return " ".join(tokens), text_orig, tokens