Exemplo n.º 1
0
def clean_element(xml: str, selector=':root',
                  progress_iterator=None, num_processes=1, **kwargs) -> str:
    """Return xml with the text of the selected elements cleaned up.

    Args:
        xml: The markup to process.
        selector: CSS selector choosing the elements whose text nodes are
            cleaned. Defaults to the document root.
        progress_iterator: Optional callable that takes an iterable and
            returns an iterable over the same items (for example a progress
            bar wrapper). Defaults to a pass-through.
        num_processes: When greater than 1, the text nodes are cleaned in a
            multiprocessing.Pool with that many workers.
        **kwargs: Passed to text_cleanup.raw.cleanup().

    Returns:
        The serialized document with every non-whitespace text node under
        the selected elements replaced by its cleaned-up version.
    """
    # Use html.parser so that it doesn't try to fix the structure
    soup = BeautifulSoup(xml, 'html.parser')

    # Build the entire list first to avoid modifying a live iterator.
    # Text nodes hash and compare like plain strings, so a set would merge
    # distinct nodes that happen to hold the same text and leave all but one
    # of them uncleaned. De-duplicate on identity instead: that still guards
    # against nested matches yielding the very same node twice.
    # NOTE: kwargs are meant for cleanup() only and are deliberately not
    # forwarded to select().
    nodes = []
    seen_ids = set()
    for element in soup.select(selector):
        for node in element.strings:
            if node.isspace() or id(node) in seen_ids:
                continue
            seen_ids.add(id(node))
            nodes.append(node)

    if progress_iterator is None:
        progress_iterator = lambda x: x  # noqa: E731

    texts = [str(node) for node in nodes]

    if num_processes > 1:
        with multiprocessing.Pool(num_processes) as pool:
            futures = [
                pool.apply_async(cleanup, (text,), kwargs)
                for text in texts]
            # Results must be collected while the pool is still alive.
            fixed = [future.get() for future in progress_iterator(futures)]
    else:
        # Wrap the inputs, not the finished results, so that the progress
        # iterator advances while the work is actually being done.
        fixed = [cleanup(text, **kwargs)
                 for text in progress_iterator(texts)]

    for node, new in zip(nodes, fixed):
        node.replace_with(new)
        # Maybe show small diff here?

    return str(soup)
Exemplo n.º 2
0
 def test_complicated_sample(self):
     """A multi-line paragraph with several kinds of errors is repaired."""
     garbled = """
     In the context of 1960, Stranger in a Strange Land was a book that his
     publishers feared-itwas too far off the beaten path. So, in order to
     mini- mize possible losses, Robert was asked to cutthe monuscript down
     to 150,000 words-a loss of about 70,000 words. Other changes were
     alsorequested, before the editor was willing to take a chance on
     publication.
     """
     wanted = """
     In the context of 1960, Stranger in a Strange Land was a book that his
     publishers feared-it was too far off the beaten path. So, in order to
     minimize possible losses, Robert was asked to cut the manuscript down
     to 150,000 words-a loss of about 70,000 words. Other changes were
     also requested, before the editor was willing to take a chance on
     publication.
     """
     self.assertEqual(raw.cleanup(garbled), wanted)
Exemplo n.º 3
0
def main(argv=None):
    """Entry point for text-cleanup cli.

    Args:
        argv: Command line arguments without the program name. When None,
            argparse falls back to sys.argv[1:].
    """
    # The text must be passed as description=; the first positional
    # parameter of ArgumentParser is the program name.
    parser = argparse.ArgumentParser(description="Clean up text.")
    parser.add_argument('input',
                        nargs='?',
                        type=argparse.FileType(encoding='utf-8'),
                        help="The input file to clean up.",
                        default=sys.stdin)
    parser.add_argument('--output',
                        type=argparse.FileType(mode='w', encoding='utf-8'),
                        help="Write results to this filename.",
                        default=sys.stdout)
    parser.add_argument(
        '--selector',
        '-s',
        help="Only clean elements matching this CSS selector. Implies --xml.")
    parser.add_argument('--xml', action='store_true', help="Assume XML input.")
    parser.add_argument('--num_processes',
                        '-n',
                        metavar='N',
                        help="Utilize N processes.",
                        type=int,
                        default=1)
    # These are opt-out switches: they default to False and become True only
    # when given, so "not args.disallow_x" below allows x by default.
    parser.add_argument(
        '--disallow_substitution',
        action='store_true',
        help='Do not allow the correction to substitute letters.')
    parser.add_argument(
        '--disallow_deletion',
        action='store_true',
        help='Do not allow the correction to delete letters.')
    parser.add_argument(
        '--disallow_insertion',
        action='store_true',
        help='Do not allow the correction to insert letters.')
    parser.add_argument('--avoid_capitalized_words',
                        action='store_true',
                        help=("Ignore words starting with a capital letter "
                              "unless we're *really* sure."))
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--reformat-only',
        action='store_true',
        help="Prettify XML input without changing any of the text.")

    # Pass argv through unchanged so that an explicit empty list is honoured
    # instead of silently falling back to the process arguments.
    args = parser.parse_args(argv)

    # Fix dependencies between arguments (e.g. x implies y)
    if args.selector or args.reformat_only:
        args.xml = True
    if args.selector is None:
        args.selector = ':root'

    # Options shared by both the XML and the plain text code path.
    cleanup_options = dict(
        insertion=not args.disallow_insertion,
        deletion=not args.disallow_deletion,
        substitution=not args.disallow_substitution,
        avoid_capitalized_words=args.avoid_capitalized_words,
    )

    try:
        source = args.input.read()

        if not args.xml:
            output = raw.cleanup(source, **cleanup_options)
        elif args.reformat_only:
            output = XML.reformat(source)
        else:

            def make_bar(items):
                return progressbar.progressbar(items)

            output = XML.clean_element(
                source,
                args.selector,
                progress_iterator=make_bar,
                num_processes=args.num_processes,
                **cleanup_options,
            )

        args.output.write(output)
    finally:
        # Close files opened by argparse, but leave the standard streams
        # open for the rest of the process.
        if args.input is not sys.stdin:
            args.input.close()
        if args.output is not sys.stdout:
            args.output.close()
Exemplo n.º 4
0
 def test_complicated_punctuation(self):
     """Misspellings next to quotes, dashes and apostrophes are repaired."""
     garbled = """"Wait! I con't!" he said agaon-twice thot day now."""
     wanted = """"Wait! I can't!" he said again-twice that day now."""
     self.assertEqual(raw.cleanup(garbled), wanted)
Exemplo n.º 5
0
 def test_missing_spaces_with_errors(self):
     """Joined words are split even when one of them is also misspelled."""
     garbled = "This texthqs missingspaces, but also someerrors."
     wanted = "This text has missing spaces, but also some errors."
     self.assertEqual(raw.cleanup(garbled), wanted)
Exemplo n.º 6
0
 def test_missing_spaces(self):
     """Words that were run together are separated again."""
     garbled = "This texthas a few missingspaces."
     wanted = "This text has a few missing spaces."
     self.assertEqual(raw.cleanup(garbled), wanted)
Exemplo n.º 7
0
 def test_simple_misspelling(self):
     """A single misspelled word is corrected."""
     garbled = "This tixt has one error."
     wanted = "This text has one error."
     self.assertEqual(raw.cleanup(garbled), wanted)
Exemplo n.º 8
0
 def test_noop(self):
     """Text without errors must come back unchanged."""
     clean_text = "This text has no errors."
     self.assertEqual(raw.cleanup(clean_text), clean_text)