Example #1
File: app.py Project: qurator-spk/sbb_ner
    def __init__(self):

        self._word_tokenizer = Tokenizer(split_camel_case=True,
                                         token_classes=False,
                                         extra_info=False)

        self._sentence_splitter = SentenceSplitter()
Example #2
File: nlp.py Project: epochx/PEER
    def __init__(self, language='en'):
        self.language = language
        if language == 'en':
            self.tokenizer = PunktSentenceTokenizer()
        elif language == 'de':
            self.tokenizer = SentenceSplitter(is_tuple=False)
        else:
            raise NotImplementedError
Example #3
def main():
    args = arguments()
    n_tokens = 0
    t0 = time.perf_counter()
    is_xml = False
    if args.xml or args.tag is not None:
        is_xml = True
    tokenizer = Tokenizer(args.split_camel_case, args.token_classes,
                          args.extra_info, args.language)
    sentence_splitter = SentenceSplitter(args.token_classes or args.extra_info,
                                         args.language)
    if is_xml:
        if args.parallel > 1:
            logging.warning(
                "Parallel tokenization of XML files is currently not supported."
            )
        eos_tags = args.tag
        if eos_tags is None:
            eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split(
            )
        eos_tags = set(eos_tags)
        tokenized_paragraphs = [tokenizer.tokenize_xml(args.FILE)]
        if args.split_sentences:
            tokenized_paragraphs = list(
                sentence_splitter.split_xml(tokenized_paragraphs[0], eos_tags))
    else:
        if args.paragraph_separator == "empty_lines":
            paragraphs = utils.get_paragraphs(args.FILE)
        elif args.paragraph_separator == "single_newlines":
            paragraphs = (line for line in args.FILE if line.strip() != "")
        if args.parallel > 1:
            pool = multiprocessing.Pool(
                min(args.parallel, multiprocessing.cpu_count()))
            tokenized_paragraphs = pool.imap(tokenizer.tokenize, paragraphs,
                                             250)
        else:
            tokenized_paragraphs = map(tokenizer.tokenize, paragraphs)
        tokenized_paragraphs = (tp for tp in tokenized_paragraphs if tp)
        if args.split_sentences:
            tokenized_paragraphs = map(sentence_splitter.split,
                                       tokenized_paragraphs)
            tokenized_paragraphs = (s for tp in tokenized_paragraphs
                                    for s in tp)
    if args.token_classes or args.extra_info:
        if is_xml:
            tokenized_paragraphs = ([(l[0], ) if l[1] is None else l
                                     for l in tp]
                                    for tp in tokenized_paragraphs)
        tokenized_paragraphs = (["\t".join(t) for t in tp]
                                for tp in tokenized_paragraphs)
    for tp in tokenized_paragraphs:
        n_tokens += len(tp)
        print("\n".join(tp), "\n", sep="")
    t1 = time.perf_counter()
    logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" %
                 (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
Example #4
def SentenceSplit(text):

    tokenizer = Tokenizer(split_camel_case=False,
                          token_classes=False,
                          extra_info=False)
    tokens = tokenizer.tokenize(text)

    sentence_splitter = SentenceSplitter(is_tuple=False)
    sentences = sentence_splitter.split(tokens)
    return sentences
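The snippets on this page omit their import lines. Assuming SoMaJo's v1-style top-level Tokenizer and SentenceSplitter classes, the function above could be driven roughly like this (the sample text is hypothetical):

from somajo import SentenceSplitter, Tokenizer  # the imports omitted by the snippet above

for sentence in SentenceSplit("Das ist ein Satz. Hier kommt noch einer."):
    # split() returns one list of token strings per sentence
    print(" ".join(sentence))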
Example #5
def get_sents(texts):
    tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
    sentence_splitter = SentenceSplitter(is_tuple=False)
    
    results = []
    for text in texts:
#         text = clean(text, lang='de', lower=False)
        tokens = tokenizer.tokenize_paragraph(text)
        sentences = sentence_splitter.split(tokens)
        cleaned = [clean(' '.join(s), no_urls=True, no_digits=True, no_punct=True, no_line_breaks=True, lang='de') for s in sentences]
        results.append(cleaned)
    return results
Example #6
File: nlp.py Project: epochx/PEER
class SentenceTokenizer(object):
    def __init__(self, language='en'):
        self.language = language
        if language == 'en':
            self.tokenizer = PunktSentenceTokenizer()
        elif language == 'de':
            self.tokenizer = SentenceSplitter(is_tuple=False)
        else:
            raise NotImplementedError

    def tokenize(self, sentences):
        if self.language == 'en':
            return self.tokenizer.tokenize(sentences)
        else:
            return self.tokenizer.split(sentences)
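Note that the two branches of this class expect different input types: PunktSentenceTokenizer.tokenize() takes a raw string, whereas SentenceSplitter.split() (as in the other examples on this page) takes an already tokenized list of strings and returns one token list per sentence. A hypothetical call to the German branch, assuming the class definition above and its imports:

de_splitter = SentenceTokenizer(language='de')
sentences = de_splitter.tokenize(["Das", "ist", "ein", "Satz", ".", "Noch", "einer", "."])
# expected roughly: [['Das', 'ist', 'ein', 'Satz', '.'], ['Noch', 'einer', '.']]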
Example #7
File: cli.py Project: Horsmann/SoMaJo
def main():
    args = arguments()
    n_tokens = 0
    t0 = time.perf_counter()
    tokenizer = Tokenizer(args.split_camel_case, args.token_classes,
                          args.extra_info)
    sentence_splitter = SentenceSplitter(args.token_classes or args.extra_info)
    if args.paragraph_separator == "empty_lines":
        paragraphs = get_paragraphs(args.FILE)
    elif args.paragraph_separator == "single_newlines":
        paragraphs = (line for line in args.FILE if line.strip() != "")
    if args.parallel > 1:
        pool = multiprocessing.Pool(
            min(args.parallel, multiprocessing.cpu_count()))
        tokenized_paragraphs = pool.imap(tokenizer.tokenize, paragraphs, 250)
    else:
        tokenized_paragraphs = map(tokenizer.tokenize, paragraphs)
    tokenized_paragraphs = (tp for tp in tokenized_paragraphs if tp)
    if args.split_sentences:
        tokenized_paragraphs = map(sentence_splitter.split,
                                   tokenized_paragraphs)
        tokenized_paragraphs = (s for tp in tokenized_paragraphs for s in tp)
    if args.token_classes or args.extra_info:
        tokenized_paragraphs = (["\t".join(t) for t in tp]
                                for tp in tokenized_paragraphs)

    for tp in tokenized_paragraphs:
        n_tokens += len(tp)
        print("\n".join(tp), "\n", sep="")
    t1 = time.perf_counter()
    logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" %
                 (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
Example #8
File: app.py Project: qurator-spk/sbb_ner
class NERTokenizer:
    def __init__(self):

        self._word_tokenizer = Tokenizer(split_camel_case=True,
                                         token_classes=False,
                                         extra_info=False)

        self._sentence_splitter = SentenceSplitter()

    def parse_text(self, text):
        tokens = self._word_tokenizer.tokenize_paragraph(text)

        sentences_tokenized = self._sentence_splitter.split(tokens)

        sentences = []
        for sen in sentences_tokenized:

            sen = [tok.replace(" ", "") for tok in sen]

            if len(sen) == 0:
                continue

            sentences.append((sen, []))

        return sentences
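A hypothetical call to the class above, assuming the SoMaJo imports omitted by the snippet; parse_text() returns one (token_list, empty_label_list) pair per sentence:

ner_tokenizer = NERTokenizer()
for tokens, labels in ner_tokenizer.parse_text("Berlin ist groß. Paris auch."):
    print(tokens, labels)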
Example #9
class TestSentenceSplitterPretokenized(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.sentence_splitter = SentenceSplitter(language="de_CMC")

    def _equal(self, tokens, tokenized_sentences):
        """"""
        sentences = self.sentence_splitter.split(tokens.split())
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])

    def _equal_xml(self, tokens, tokenized_sentences):
        """"""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        sentences = self.sentence_splitter.split_xml(tokens.split(), eos_tags)
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])
Example #10
def splitSentTokenIdx(text):
    # generate tokens from text:
    tokens = tokenSplit(text)

    # sort to sentences:
    sentence_splitter = SentenceSplitter(is_tuple=False)
    sentences = sentence_splitter.split(tokens)

    # add start and end indexes of token in text:
    endIdxUpdate = 0
    sents_idxd = []
    for sent in sentences:
        tokens_idxd = []
        for token in sent:
            startIdx = text.find(token, endIdxUpdate)
            endIdx = startIdx + len(token)
            if startIdx != -1:
                endIdxUpdate = endIdx
            tokens_idxd.append((token, startIdx, endIdx))
        sents_idxd.append(tokens_idxd)
    return sents_idxd
Example #11
class TestSentenceSplitter(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """"""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml(self, raw, tokenized_sentences):
        """"""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br div ol ul dl table".split()
        eos_tags = set(eos_tags)
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split_xml(tokens, eos_tags)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
Example #12
class TestSentenceSplitter(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """"""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
Example #13
File: rouge.py Project: domfr/GeRouge
    def __init__(self, alpha, stemming=True, split_compounds=True, minimal_mode=False):
        self.tokenizer = SoMaJo('de_CMC')
        self.sentence_splitter = SentenceSplitter(is_tuple=False)
        self.alpha = alpha
        self.stemming = stemming
        self.split_compounds = split_compounds
        self.stemmer = SnowballStemmer('german')
        self.minimal_mode = minimal_mode
        self.base_path = pathlib.Path(__file__).parent.absolute()

        self.remove_chars = ['²', '³', '“', '„', ',', '†', '‚', '‘', '–']
        self.remove_chars.extend(list(string.punctuation))
        self.replace_chars = [('ss', 'ß'), ('ä', 'ae'), ('ü', 'ue'), ('ö', 'oe')]

        self.stop = set()
        with open(os.path.join(self.base_path, 'data', 'GermanST_utf8.txt'), 'r') as f:
            for line in f:
                self.stop.add(line.strip())
        if not minimal_mode:
            self.smart_stop = set()
            with open(os.path.join(self.base_path, 'data', 'smart_stop.txt'), 'r') as f:
                for line in f:
                    word = line.strip().lower()
                    self.smart_stop.add(word)
                    for replace_char in self.replace_chars:
                        word = word.replace(replace_char[0], replace_char[1])
            self.lemmas = {}
            with open(os.path.join(self.base_path, 'data', 'baseforms_by_projekt_deutscher_wortschatz.txt'), 'r') as f:
                for line in f:
                    l = line.strip().split('\t')
                    l[0] = l[0].strip().lower()
                    l[1] = l[1].strip().lower()
                    for replace_char in self.replace_chars:
                        l[0] = l[0].replace(replace_char[0], replace_char[1])
                        l[1] = l[1].replace(replace_char[0], replace_char[1])
                    self.lemmas[l[0]] = l[1]
Example #14
def myprocessor(myinput):
    tokenizer = Tokenizer(language="de")
    sentsplitter = SentenceSplitter(language="de")
    tokenized = tokenizer.tokenize_paragraph(myinput)
    sentsplit = sentsplitter.split(tokenized)
    return sentsplit
Example #15
def read_clef(clef_file):

    with open(clef_file, 'r') as f:

        sentence_splitter = SentenceSplitter()

        docs = []
        segments = []
        text_part = []
        header = None
        urls = []

        def make_segement():

            nonlocal docs, segments, text_part

            if len(text_part) == 0:
                return

            tmp = None
            # noinspection PyBroadException
            try:
                tmp = pd.read_csv(StringIO(header + "".join(text_part)),
                                  sep='\t',
                                  comment='#',
                                  quoting=3)
            except:
                import ipdb
                ipdb.set_trace()

            tmp = tmp.reset_index().rename(columns={'index': 'TOKEN_ID'})

            tmp['url_id'] = len(docs)
            tmp['segment_id'] = len(segments)

            segments.append(tmp)

            text_part = []

        def make_doc():

            nonlocal docs, segments, sentence_splitter

            doc = pd.concat(segments)

            sentences = sentence_splitter.split(
                doc.TOKEN.astype(str).to_list())
            doc['TOKEN_ID'] = [i for s in sentences for i in range(len(s))]

            docs.append(doc)
            segments = []

        for line in tqdm(f):

            if header is None:
                header = "\t".join(line.split()) + '\n'
                continue

            if not line.startswith('#'):
                text_part.append(line)

            if re.match(r'#\s+segment_iiif_link\s+=.*', line):

                make_segement()

            if re.match(r'#\s+document_id\s+=.*', line):

                make_segement()

                urls.append(line)
                if len(segments) > 0:
                    make_doc()

        make_segement()
        make_doc()

        return urls, pd.concat(docs).reset_index(drop=True)
Example #16
    def setUp(self):
        """Necessary preparations"""
        self.sentence_splitter = SentenceSplitter(language="de_CMC")
Example #17
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()