Exemplo n.º 1
0
def read_gold_standard_file(data_dir, fileroot, encoding=None, cetr=False, format='1.0'):
    """
    Load the gold standard file identified by ``fileroot`` and return its
    content text and comments text.

    The file is located below ``data_dir`` using the directory name and file
    extension of the dataset format registered in ``FORMATS`` under ``format``.

    Args:
        data_dir (str): root directory of the dataset
        fileroot (str): file name without its extension
        encoding (str): encoding used to read the file; when falsy, 'utf-8',
            'utf-16' and 'iso-8859-1' are tried in that order
        cetr (bool): only consulted for format '1.0'; if True, assume no
            comments and parse the gold standard as HTML to remove tags
        format (str): dataset format version, used as a key into ``FORMATS``.
            For '1.0' the text is split once on ``RE_COMMENTS_DELIM`` (unless
            ``cetr`` is set); for '2.0' the segment following the first
            ``'''`` marker is parsed as HTML and comments are left empty.

    Returns:
        List[str, str]: contents string and comments string, respectively;
        both are empty when the file is empty or could not be decoded

    Raises:
        NotImplementedError: for a non-empty file when ``format`` passed the
            ``FORMATS`` lookup but is neither '1.0' nor '2.0'
    """
    ds_format: DatasetFormat = FORMATS[format]
    fname = os.path.join(
        data_dir,
        ds_format.gold_standard_dirname,
        fileroot + ds_format.gold_standard_ext,
    )

    candidate_encodings = (encoding,) if encoding else ('utf-8', 'utf-16', 'iso-8859-1')
    gold_standard = None
    for candidate in candidate_encodings:
        try:
            with io.open(fname, mode='rt', encoding=candidate) as f:
                gold_standard = f.read()
        except (UnicodeDecodeError, UnicodeError):
            # wrong guess: discard the result and try the next encoding
            gold_standard = None
        else:
            break

    if not gold_standard:
        return [u'', u'']

    if format == '1.0':
        if cetr:
            tree = etree.fromstring(gold_standard, parser=etree.HTMLParser())
            pieces = [u' '.join(text_from_subtree(tree)), u'']
        else:
            pieces = RE_COMMENTS_DELIM.split(gold_standard, maxsplit=1)
            # no comments delimiter found: pair the content with empty comments
            if len(pieces) == 1:
                pieces = [pieces[0], u'']
    elif format == '2.0':
        # The file is TOML, but the toml parser has an issue with multiline
        # text strings, so the text is cut out by hand instead.
        text = gold_standard.split("'''")[1]
        tree = etree.fromstring(text, parser=etree.HTMLParser())
        pieces = [u' '.join(text_from_subtree(tree)), u'']
    else:
        raise NotImplementedError(f'Format version {format} is not implemented')

    # fix text in case of mangled encodings
    return [ftfy.fix_encoding(piece).strip() for piece in pieces[:2]]
Exemplo n.º 2
0
 def test_text_from_subtree(self):
     """Text collected from a small anchor subtree joins into one sentence."""
     markup = '<a href=".">WILL <img src="."> THIS PASS <b>THE TEST</b> ??</a>'
     root = etree.fromstring(markup, etree.HTMLParser(recover=True))
     # strip each fragment once, then drop the ones that ended up empty
     fragments = [raw.strip() for raw in blocks.text_from_subtree(root)]
     joined = " ".join(frag for frag in fragments if frag != "")
     assert joined == "WILL THIS PASS THE TEST ??"
Exemplo n.º 3
0
 def test_text_from_subtree_decode_error(self):
     """
     Markup holding a byte that is not valid UTF-8 is expected to produce
     no text when parsed as UTF-8 with a recovering HTML parser.
     """
     from lxml import etree
     # A lone 0x92 byte is not valid UTF-8. The input must be a bytes literal:
     # on Python 3 a plain str literal would hold the code point U+0092
     # instead of the raw byte, and the decode-error path would never run.
     s = b'<div>\x92</div>'
     tree = etree.fromstring(s, etree.HTMLParser(recover=True, encoding='utf-8'))
     text_list = blocks.text_from_subtree(tree)
     # ignore fragments that are empty or whitespace-only
     text_str = ' '.join([ele.strip() for ele in text_list if ele.strip() != ''])
     self.assertEqual(text_str, '')
Exemplo n.º 4
0
 def test_text_from_subtree(self):
     """Text gathered from a small anchor subtree joins into the expected sentence."""
     from lxml import etree
     markup = """<a href=".">WILL <img src="."> THIS PASS <b>THE TEST</b> ??</a>"""
     root = etree.fromstring(markup, etree.HTMLParser(recover=True))
     # strip every fragment once, then keep only the non-empty ones
     fragments = [raw.strip() for raw in blocks.text_from_subtree(root)]
     joined = ' '.join(frag for frag in fragments if frag != '')
     self.assertEqual(joined, 'WILL THIS PASS THE TEST ??')
Exemplo n.º 5
0
 def test_text_from_subtree(self):
     """Check the text fragments extracted from an anchor with nested tags."""
     from lxml import etree
     html = """<a href=".">WILL <img src="."> THIS PASS <b>THE TEST</b> ??</a>"""
     parser = etree.HTMLParser(recover=True)
     root = etree.fromstring(html, parser)
     kept = []
     for piece in blocks.text_from_subtree(root):
         piece = piece.strip()
         # whitespace-only fragments contribute nothing to the joined text
         if piece != '':
             kept.append(piece)
     self.assertEqual(' '.join(kept), 'WILL THIS PASS THE TEST ??')
Exemplo n.º 6
0
 def test_text_from_subtree_decode_error(self):
     """A byte that is not valid UTF-8 inside the markup should yield no text."""
     # 0x92 on its own is not a valid UTF-8 sequence
     raw_markup = b"<div>\x92</div>"
     utf8_parser = etree.HTMLParser(recover=True, encoding="utf-8")
     root = etree.fromstring(raw_markup, utf8_parser)
     # strip each fragment once, then drop the ones that ended up empty
     fragments = [raw.strip() for raw in blocks.text_from_subtree(root)]
     joined = " ".join(frag for frag in fragments if frag != "")
     assert joined == ""
Exemplo n.º 7
0
 def test_text_from_subtree_decode_error(self):
     """
     Markup holding a byte that is not valid UTF-8 is expected to produce
     no text when parsed as UTF-8 with a recovering HTML parser.
     """
     from lxml import etree
     # A lone 0x92 byte is not valid UTF-8. The input must be a bytes literal:
     # on Python 3 a plain str literal would hold the code point U+0092
     # instead of the raw byte, and the decode-error path would never run.
     s = b'<div>\x92</div>'
     tree = etree.fromstring(
         s, etree.HTMLParser(recover=True, encoding='utf-8'))
     text_list = blocks.text_from_subtree(tree)
     # ignore fragments that are empty or whitespace-only
     text_str = ' '.join(
         [ele.strip() for ele in text_list if ele.strip() != ''])
     self.assertEqual(text_str, '')
Exemplo n.º 8
0
def read_gold_standard_file(data_dir, fileroot, encoding=None, cetr=False):
    """
    Load the gold standard file identified by ``fileroot`` from the gold
    standard directory below ``data_dir`` and return its content text and
    comments text.

    Args:
        data_dir (str): root directory of the dataset
        fileroot (str): file name without its extension
        encoding (str): encoding used to read the file; when falsy, 'utf-8',
            'utf-16' and 'iso-8859-1' are tried in that order
        cetr (bool): if True, assume no comments and parse the gold standard
            as HTML to remove tags; otherwise split the text once on
            ``RE_COMMENTS_DELIM``

    Returns:
        List[str, str]: contents string and comments string, respectively;
        both are empty when the file is empty or could not be decoded
    """
    fname = os.path.join(
        data_dir, GOLD_STANDARD_DIRNAME, fileroot + GOLD_STANDARD_EXT)

    candidate_encodings = (encoding,) if encoding else ('utf-8', 'utf-16', 'iso-8859-1')
    gold_standard = None
    for candidate in candidate_encodings:
        try:
            with io.open(fname, mode='rt', encoding=candidate) as f:
                gold_standard = f.read()
        except (UnicodeDecodeError, UnicodeError):
            # wrong guess: discard the result and try the next encoding
            gold_standard = None
        else:
            break

    if not gold_standard:
        return [u'', u'']

    if cetr:
        tree = etree.fromstring(gold_standard, parser=etree.HTMLParser())
        pieces = [u' '.join(text_from_subtree(tree)), u'']
    else:
        pieces = RE_COMMENTS_DELIM.split(gold_standard, maxsplit=1)
        # no comments delimiter found: pair the content with empty comments
        if len(pieces) == 1:
            pieces = [pieces[0], u'']

    # fix text in case of mangled encodings
    return [ftfy.fix_encoding(piece).strip() for piece in pieces[:2]]