Example #1
import logging
import multiprocessing
import time

# Note: arguments(), Tokenizer, SentenceSplitter and utils come from the
# surrounding package and are not defined in this excerpt.


def main():
    args = arguments()
    n_tokens = 0
    t0 = time.perf_counter()
    # Treat the input as XML if the XML option or end-of-sentence tags were given
    is_xml = False
    if args.xml or args.tag is not None:
        is_xml = True
    tokenizer = Tokenizer(args.split_camel_case, args.token_classes,
                          args.extra_info, args.language)
    # When token classes or extra info are requested, tokens are tuples rather
    # than plain strings, and the sentence splitter needs to know that
    sentence_splitter = SentenceSplitter(args.token_classes or args.extra_info,
                                         args.language)
    if is_xml:
        if args.parallel > 1:
            logging.warning(
                "Parallel tokenization of XML files is currently not supported."
            )
        # XML tags that end a sentence; default to common block-level tags
        eos_tags = args.tag
        if eos_tags is None:
            eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        # XML input is tokenized in a single pass (no parallel processing)
        tokenized_paragraphs = [tokenizer.tokenize_xml(args.FILE)]
        if args.split_sentences:
            tokenized_paragraphs = list(
                sentence_splitter.split_xml(tokenized_paragraphs[0], eos_tags))
    else:
        # Read the input either as paragraphs separated by empty lines or as one paragraph per line
        if args.paragraph_separator == "empty_lines":
            paragraphs = utils.get_paragraphs(args.FILE)
        elif args.paragraph_separator == "single_newlines":
            paragraphs = (line for line in args.FILE if line.strip() != "")
        # Tokenize paragraphs in a worker pool, handing out chunks of 250 paragraphs
        if args.parallel > 1:
            pool = multiprocessing.Pool(
                min(args.parallel, multiprocessing.cpu_count()))
            tokenized_paragraphs = pool.imap(tokenizer.tokenize, paragraphs, 250)
        else:
            tokenized_paragraphs = map(tokenizer.tokenize, paragraphs)
        # Drop paragraphs that produced no tokens
        tokenized_paragraphs = (tp for tp in tokenized_paragraphs if tp)
        # Split each tokenized paragraph into sentences and flatten the result
        if args.split_sentences:
            tokenized_paragraphs = map(sentence_splitter.split,
                                       tokenized_paragraphs)
            tokenized_paragraphs = (s for tp in tokenized_paragraphs
                                    for s in tp)
    # With token classes or extra info, each token is a tuple of fields that is
    # joined with tabs for output
    if args.token_classes or args.extra_info:
        if is_xml:
            tokenized_paragraphs = ([(l[0],) if l[1] is None else l
                                     for l in tp]
                                    for tp in tokenized_paragraphs)
        tokenized_paragraphs = (["\t".join(t) for t in tp]
                                for tp in tokenized_paragraphs)
    # Print one token per line, with a blank line after each paragraph or sentence
    for tp in tokenized_paragraphs:
        n_tokens += len(tp)
        print("\n".join(tp), "\n", sep="")
    t1 = time.perf_counter()
    logging.info("Tokenized %d tokens in %d seconds (%d tokens/s)" %
                 (n_tokens, t1 - t0, n_tokens / (t1 - t0)))
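Stripped of the command-line plumbing, the example above boils down to a tokenize-then-split pipeline. The sketch below shows that minimal flow; the somajo import path and the constructor defaults are assumptions inferred from the calls above, not something this listing states.

# Minimal sketch, assuming the classes are importable from a package named
# `somajo` and that the constructor defaults used here exist.
from somajo import SentenceSplitter, Tokenizer

tokenizer = Tokenizer(split_camel_case=True)
sentence_splitter = SentenceSplitter()

paragraph = "This is a test. This is another test."
tokens = tokenizer.tokenize(paragraph)          # flat list of token strings
for sentence in sentence_splitter.split(tokens):
    print(" ".join(sentence))                   # one sentence per line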
Example #2
import unittest

# Assumption: Tokenizer is imported from the surrounding package
# (e.g. from somajo import Tokenizer); the import is not part of this excerpt.


class TestTokenizer(unittest.TestCase):
    """Tests for the Tokenizer class."""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)

    def _equal(self, raw, tokenized):
        """"""
        self.assertEqual(self.tokenizer.tokenize(raw), tokenized.split())

    def _equal_xml(self, raw, tokenized):
        """"""
        self.assertEqual(self.tokenizer.tokenize_xml(raw, is_file=False), tokenized.split())

    def _fail_means_improvement(self, raw, tokenized):
        """"""
        self.assertNotEqual(self.tokenizer.tokenize(raw), tokenized.split())
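
    # Illustrative use of the helpers above; this test case and its expected
    # tokenization are assumptions, not taken from the project's test suite.
    def test_simple_punctuation(self):
        self._equal("Hello, world!", "Hello , world !")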