Python Text примеры, fnl.text.text.Text Python примеры использования

Пример #1

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testEq(self):
     """
     A Text built from another Text compares equal to it, even after an
     entry is written to the second one's attributes, yet is a distinct
     object.
     """
     span_tag = ('ns', 'key', (0, 3))
     original = Text('n\U0010ABCDn', [(span_tag, None)])
     duplicate = Text(original)
     # Only the second instance's attribute mapping is touched here.
     duplicate.attributes[span_tag] = {'a': 'v'}
     self.assertTrue(original == duplicate,
                     '{!r} != {!r}'.format(original, duplicate))
     self.assertFalse(original is duplicate,
                      '{!r} is not {!r}'.format(original, duplicate))

Пример #2

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testUpdate(self):
     """
     Check update(): tags and attributes of the argument are merged into
     the receiver; ValueError is expected for a Text built on another
     string and TypeError for a plain str argument.
     """
     first_span = ('ns', 'k', (1, 2))
     second_span = ('ns', 'k', (4, 6))
     target = Text('blabla', [(first_span, {'a1': 'v1', 'a2': 'v1'})])
     source = Text('blabla', [
         (first_span, {'a1': 'v2'}),
         (second_span, {'a1': 'v1'}),
     ])
     other_string = Text('blahblah', [(first_span, {'a1': 'v1'})])

     target.update(source)

     self.assertListEqual([first_span, second_span], list(target))
     # 'a1' takes the value from the argument, 'a2' is kept as it was.
     merged_attributes = {
         'ns': {
             first_span: {'a1': 'v2', 'a2': 'v1'},
             second_span: {'a1': 'v1'},
         },
     }
     self.assertDictEqual(merged_attributes, target.attributes)
     self.assertRaises(ValueError, target.update, other_string)
     self.assertRaises(TypeError, target.update, 'bla')

Пример #3

0

Показать файл

Файл: extract.py Проект: living1069/libfnl

def Extract(filename: str,
            encoding: str = None,
            mime_type: str = None) -> Text:
    """
    Read a file and return its content as a :class:`.Text`.

    HTML/XHTML files are fed to an :class:`HtmlExtractor`; the string and
    tags it collected are used to build the text. Plain-text files are read
    and NFC-normalized.

    :param filename: The path and name of the file to extract.
    :param encoding: The charset encoding of the file; if not given, the
        encoding reported by :func:`mimetypes.guess_type` is used, and if
        that is empty too, UTF-8 is assumed.
    :param mime_type: Optionally, set the MIME type that describes the
        file's type (instead of having `mimetypes` guess it). If it cannot
        be guessed either, ``text/plain`` is assumed.
    :return: A :class:`.Text` instance.
    :raise IOError: If the file cannot be opened.
    :raise RuntimeError: If there are no extraction rules for the file's MIME
        type or the HTML parser fails.
    """
    logger = getLogger('Extract')
    logger.info('filename %s', filename)

    if not encoding or not mime_type:
        guessed_mime_type, guessed_encoding = guess_type(filename)

        if not encoding:
            # NOTE(review): mimetypes.guess_type() documents its second
            # result as the *content* encoding (e.g. 'gzip', 'compress'),
            # not a charset, so a non-empty guess is unlikely to be a
            # usable codec name for open(). Behavior kept as-is; confirm.
            if not guessed_encoding:
                logger.warning('encoding of %s unknown - using UTF-8',
                               filename)
                guessed_encoding = 'utf-8'

            encoding = guessed_encoding

        if not mime_type:
            if not guessed_mime_type:
                logger.warning('could not guess MIME type of %s', filename)
                logger.info('assuming text/plain for %s', filename)
                mime_type = 'text/plain'
            else:
                mime_type = guessed_mime_type

    # 'application/xhtml+xml' is the registered XHTML type (and what
    # mimetypes reports for .xhtml); the shorter spelling is kept for
    # callers that pass it explicitly.
    if mime_type in ('text/html', 'application/xhtml',
                     'application/xhtml+xml'):
        html = HtmlExtractor()
        try:
            # Close the file as soon as its content has been read.
            with open(filename, encoding=encoding) as stream:
                html.feed(stream.read())
            html.close()
        except HTMLParseError as err:
            raise RuntimeError("could not parse {}: {} at line {}".format(
                filename, err, html.lineno)) from err
        text = Text(html.string)
        tags = [(t, html.tags[t]) for t in sorted(html.tags, key=Text.Key)]
        text.add(tags, html.namespace)
    elif mime_type == 'text/plain':
        # Text mode is required here: open() rejects an ``encoding``
        # argument in binary mode, and normalize() needs a str.
        # ``encoding`` is always set at this point (see above).
        with open(filename, encoding=encoding) as stream:
            plain_text = stream.read()
        text = Text(normalize('NFC', plain_text))
    else:
        msg = 'no extraction rules for MIME type {}'.format(mime_type)
        raise RuntimeError(msg)

    return text

Пример #4

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testAddIllegalTags(self):
     """add() is expected to raise TypeError or ValueError on bad input."""
     valid_tag = ('ns', 'id', (0, 2))
     shared = Text('abcd')
     type_error_inputs = (
         1,
         [(1, None)],
         [(['ns', 'id', 'o'], {1: 2})],
         [(valid_tag, 1)],
     )
     for bad_input in type_error_inputs:
         with self.assertRaises(TypeError):
             shared.add(bad_input)

     # Each ValueError case runs against its own freshly built Text.
     value_error_inputs = (
         [(valid_tag, [('a', 'b', 'c')])],
         [valid_tag],
     )
     for bad_input in value_error_inputs:
         fresh = Text('abcd')
         with self.assertRaises(ValueError):
             fresh.add(bad_input)

Пример #5

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testNamespaces(self):
     """The namespaces of a Text are those of the tags it was built with."""
     tagged = [
         (('ns1', 'key1', (0, 3)), None),
         (('ns2', 'key2', (1, 2)), None),
         (('ns3', 'key1', (2, 3)), None),
     ]
     text = Text('a\U0010ABCDb', tagged)
     found = list(text.namespaces)
     found.sort()
     self.assertListEqual(['ns1', 'ns2', 'ns3'], found)

Пример #6

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testGetitem(self):
     """Index and slice lookups on a Text return lists of tags."""
     inner = ('ns', 'key1', (1, 3))
     leading = ('ns', 'key2', (0, 2))
     text = Text('n\U0010ABCDn', [(inner, None), (leading, None)])
     both = [leading, inner]
     self.assertEqual(both, text[1])
     self.assertEqual(both, text[1:2])
     # The slice step is the literal True; with it only the tag whose
     # offsets are (1, 3) is expected back.
     self.assertEqual([inner], text[1:3:True])
     self.assertEqual([inner], text[-1])

Пример #7

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testTagInitializationUnicode(self):
     """Tags passed to the constructor are all present when iterating."""
     expected = [
         ('ns1', 'key1', (0, 3)),
         ('ns1', 'key1', (1, 2)),
         ('ns2', 'key2', (2, 3)),
     ]
     # Every tag gets its own empty attribute dict.
     text = Text('a\U0010ABCDb', [(tag, {}) for tag in expected])
     self.assertListEqual(expected, sorted(text))

Пример #8

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testDigestCallsEncodeOnce(self, mock):
     """
     Reading ``digest`` twice gives equal values while the injected mock
     records exactly one call.
     """
     # NOTE(review): what `mock` replaces is set up outside this method.
     sample = 'aü\U0010ABCD'
     mock.return_value = sample.encode()
     text = Text(sample)
     first_digest = text.digest
     second_digest = text.digest
     self.assertEqual(first_digest, second_digest)
     self.assertEqual(mock.call_count, 1)

Пример #9

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testTags(self):
     """tags() returns the tags in default order or ordered by a key."""
     whole = ('ns1', 'id', (0, 3))
     tail = ('ns2', 'id', (1, 3))
     head = ('ns3', 'id', (0, 2))
     tagged = [(tag, None) for tag in (whole, tail, head)]
     text = Text('a\U0010ABCDb', tagged)
     default_order = text.tags()
     self.assertListEqual([whole, head, tail], default_order)
     reverse_order = text.tags(Text.ReverseKey)
     self.assertListEqual([whole, tail, head], reverse_order)

Пример #10

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testContains(self):
     """
     ``in`` is true for the stored tag and false for tuples that differ in
     offsets or carry an arbitrary object in any position.
     """
     present = ('ns', 'key', (0, 2))
     text = Text('n\U0010ABCDn', [(present, None)])
     self.assertTrue(present in text)
     absent_candidates = (
         ('ns', 'key', (1, 2)),
         ('ns', 'key', object()),
         ('ns', object(), (1, 2)),
         (object(), 'key', (1, 2)),
     )
     for candidate in absent_candidates:
         self.assertFalse(candidate in text)

Пример #11

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testIter(self):
     """iter(namespace) yields (ID, tagged substring, attributes) triples."""
     content = 'abcd'
     spans = [(0, 2), (1, 3), (2, 4)]
     tagged = []
     expected = []
     for span in spans:
         start, end = span
         tagged.append((('n', 'i', span), {'a': span}))
         expected.append(('i', content[start:end], {'a': span}))
     text = Text(content, tagged)
     self.assertListEqual(expected, list(text.iter('n')))

     # A tag with four offsets: the expected string is 'a' plus 'd'.
     multi_span = ('ms', 'i', (0, 1, 3, 4))
     text.add([(multi_span, None)])
     self.assertListEqual([('i', 'ad', None)], list(text.iter('ms')))

Пример #12

0

Показать файл

Файл: test_text.py Проект: pombredanne/libfnl

 def testFromJson(self):
     """fromJson() restores the string and the UTF-8/UTF-16 offset maps."""
     utf8_map = (0, 1, 2, 3, 4)
     utf16_map = (2, 4, 6, 8, 10)
     checksum = {"MD-5": md5(b"abcd").hexdigest(), "encoding": "UTF-8"}
     payload = {
         "text": "abcd",
         "checksum": checksum,
         "maps": {"UTF-8": utf8_map, "UTF-16": utf16_map},
     }
     restored = Text.fromJson(payload)
     self.assertEqual("abcd", str(restored))
     # The maps are expected under the private '_utf8' / '_utf16' keys.
     expected_maps = {"_utf8": utf8_map, "_utf16": utf16_map}
     self.assertDictEqual(expected_maps, restored._maps)

Пример #13

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testString(self):
     """``Text.string`` behaves like a str of four characters."""
     sample = 'aü\U0010ABCDb'
     text = Text(sample)
     characters = ('a', 'ü', '\U0010ABCD', 'b')
     self.assertTupleEqual(characters, tuple(text.string))
     # Forward, reversed and stepped slices; `string` is re-read each time.
     slice_cases = (
         ('\U0010ABCD', slice(2, 3)),
         ('b\U0010ABCDüa', slice(4, -5, -1)),
         ('bü', slice(4, -5, -2)),
     )
     for expected, window in slice_cases:
         self.assertEqual(expected, text.string[window])
     self.assertTrue(isinstance(text.string, str))
     self.assertEqual('Aü\U0010ABCDb', text.string.capitalize())

Пример #14

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testDelitem(self):
     """``del text[i]`` removes tags; afterwards no attributes remain."""
     head = ('ns', 'key', (0, 2))
     middle = ('ns', 'key', (1, 3))
     tail = ('ns', 'key', (2, 3))
     text = Text('n\U0010ABCDn', [(head, None), (middle, {'a': 'v'})])

     del text[0]
     remaining = list(text.get())
     self.assertListEqual([(middle, {'a': 'v'})], remaining)

     text.add([(tail, None)])
     del text[2]
     # Nothing is expected to be left: no tags and no attributes.
     self.assertListEqual([], list(text.get()))
     self.assertListEqual([], list(text))
     self.assertDictEqual({}, text.attributes)

Пример #15

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def assertUtfCalls(self, mock, attr):
     """
     Assert that reading ``attr`` calls `mock` once while the value is
     stored in ``text._maps`` and again only after that entry is reset.
     """
     text = Text('aä\U0010ABCD!')
     cache_key = '_' + attr
     sentinel = (True, )

     def reset():
         # Clear the stored map and hand the mock a fresh iterator.
         text._maps[cache_key] = None
         mock.return_value = iter(sentinel)

     reset()
     self.assertTupleEqual(sentinel, getattr(text, attr))
     self.assertTupleEqual(sentinel, getattr(text, attr))
     self.assertEqual(1, mock.call_count)

     reset()
     self.assertTupleEqual(sentinel, getattr(text, attr))
     self.assertEqual(2, mock.call_count)

Пример #16

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testGetitemSliceSorting(self):
     """Slices of a Text return the tags in the order listed here."""
     offsets = [(0, 5), (1, 4), (1, 3), (1, 2), (2, 3), (15, 19)]
     ordered = [('a', 'a', span) for span in offsets]
     text = Text('0123456789ABDEFGHIJK', [(tag, None) for tag in ordered])

     # Everything except the (15, 19) tag.
     without_last = ordered[:-1]
     self.assertEqual(without_last, text[1:3])
     # With a True step only the (1, 3), (1, 2) and (2, 3) tags.
     self.assertEqual(ordered[2:5], text[1:3:True])
     self.assertEqual(ordered, text[0:])
     self.assertEqual(ordered, text[0::True])

Пример #17

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testToJson(self):
     """toJson() returns the string, its MD5 checksum and offset maps."""
     tagged = Text('abcd', [(('ns', 'id', (0, 4)), {'a': 'v'})])
     checksum = {'md5': md5(b'abcd').hexdigest(), 'encoding': 'utf8'}
     offset_maps = {
         'utf8': (0, 1, 2, 3, 4),
         'utf16': (2, 4, 6, 8, 10),
     }
     expected = {'text': 'abcd', 'checksum': checksum, 'maps': offset_maps}
     self.assertDictEqual(expected, tagged.toJson())

Пример #18

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testRemove(self):
     """remove() drops listed tags, or every tag of a namespace."""
     outer = ('ns1', 'key1', (0, 3))
     inner = ('ns1', 'key1', (1, 2))
     other = ('ns2', 'key2', (2, 3))
     text = Text('a\U0010ABCDb', [
         (outer, None),
         (inner, {1: 2}),
         (other, None),
     ])
     self.assertListEqual([outer, inner, other], sorted(text))

     text.remove([outer, other])
     self.assertListEqual([inner], list(text))
     self.assertDictEqual({'ns1': {inner: {1: 2}}}, text.attributes)

     # No tag list, only a namespace.
     text.remove(None, 'ns1')
     self.assertListEqual([], list(text))
     self.assertDictEqual({}, text.attributes)

Пример #19

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testAdd(self):
     """add() inserts new tags and sets attributes on existing ones."""
     outer = ('ns1', 'key1', (0, 3))
     inner = ('ns1', 'key1', (1, 2))
     other = ('ns2', 'key2', (2, 3))
     text = Text('a\U0010ABCDb', [(outer, None)])
     self.assertListEqual([(outer, None)], list(text.get()))

     # One dict object is deliberately reused for every tag below.
     shared_attrs = {'n': {'x': 'y'}}
     first_batch = [(outer, shared_attrs), (inner, None)]
     text.add(first_batch, 'ns1')
     self.assertListEqual([(outer, shared_attrs), (inner, None)],
                          list(text.get()))

     second_batch = [(tag, shared_attrs) for tag in (outer, inner, other)]
     text.add(second_batch)
     self.assertListEqual([outer, inner, other], sorted(text))
     self.assertListEqual([(outer, shared_attrs), (inner, shared_attrs)],
                          list(text.get('ns1')))
     self.assertListEqual([(other, shared_attrs)], list(text.get('ns2')))

Пример #20

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

    def testBase64Digest(self):
        """
        ``base64digest`` equals the MD5 digest in base64 with '+'/'-' and
        '/'/'.' swapped and the last two characters removed.
        """
        # Identity byte table with the two character pairs exchanged.
        table = bytearray(range(256))
        for left, right in (('+', '-'), ('/', '.')):
            table[ord(left)], table[ord(right)] = ord(right), ord(left)

        sample = 'aü\U0010ABCD'
        text = Text(sample)
        raw_digest = md5(sample.encode()).digest()
        # b2a_base64() appends a newline; strip it before translating.
        encoded = b2a_base64(raw_digest)[:-1].translate(table)
        # Drop the last two characters (the padding of a 16-byte digest).
        expected = encoded[:-2].decode()
        self.assertEqual(expected, text.base64digest)

Пример #21

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

    def testIter(self):
        """
        Iterating a Text yields its tags in the order this test lists them.

        The number of received tags is checked first, because ``zip`` stops
        at the shorter input and would otherwise hide missing or surplus
        tags.
        """
        tags = OrderedDict([
            (('a', 'a', (0, 5)), None),
            (('a', 'a', (1, 4)), None),
            (('a', 'b', (1, 3)), {
                'n': 'v'
            }),
            (('a', 'a', (1, 2)), None),
            (('a', 'a', (2, 3)), {
                'a': 'b'
            }),
        ])
        text = Text('01234', list(tags.items()))

        expected_tags = list(tags.keys())
        received_tags = list(text)
        self.assertEqual(len(expected_tags), len(received_tags))

        for expected, received in zip(expected_tags, received_tags):
            self.assertTupleEqual(expected, received)

Пример #22

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testGet(self):
     """get(namespace) yields that namespace's (tag, attributes) pairs."""
     first = ('ns1', 'key1', (0, 3))
     second = ('ns1', 'key2', (1, 2))
     third = ('ns2', 'key1', (2, 3))
     text = Text('a\U0010ABCDb', [
         (first, {1: 2}),
         (second, {3: 4}),
         (third, {5: 6}),
     ])
     in_ns1 = list(text.get('ns1'))
     self.assertListEqual([(first, {1: 2}), (second, {3: 4})], in_ns1)
     in_ns2 = list(text.get('ns2'))
     self.assertListEqual([(third, {5: 6})], in_ns2)

Пример #23

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

    def testAddUpdateAttributes(self):
        """
        Adding a tag twice combines its attribute values: per the cases
        below a str is replaced, a list is extended and a dict is updated.
        """
        tag = ('ns1', 'key1', (0, 3))
        # (value added first, value added second, value expected afterwards)
        cases = [
            ('a', 'b', 'b'),
            ([1], [2], [1, 2]),
            ({1: 1}, {1: 3, 2: 2}, {1: 3, 2: 2}),
        ]

        for initial, addition, merged in cases:
            text = Text('test')
            text.add([(tag, {'n': initial})], 'ns1')
            self.assertEqual(initial, text.attributes['ns1'][tag]['n'])
            text.add([(tag, {'n': addition})], 'ns1')
            self.assertEqual(merged, text.attributes['ns1'][tag]['n'])

Пример #24

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testFromJson(self):
     """fromJson() builds a Text whose str() and ``_maps`` match the input."""
     utf8_offsets = (0, 1, 2, 3, 4)
     utf16_offsets = (2, 4, 6, 8, 10)
     expected_maps = {'_utf8': utf8_offsets, '_utf16': utf16_offsets}
     document = {
         'text': 'abcd',
         'checksum': {
             'MD-5': md5(b'abcd').hexdigest(),
             'encoding': 'UTF-8',
         },
         'maps': {'UTF-8': utf8_offsets, 'UTF-16': utf16_offsets},
     }

     text = Text.fromJson(document)

     self.assertEqual('abcd', str(text))
     self.assertDictEqual(expected_maps, text._maps)

Пример #25

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testInitializationWithTags(self):
     """
     Attributes given as a dict, a list of pairs, or a list of
     two-character strings all end up stored as dicts.
     """
     spans = [('ns', 'key', (start, start + 1)) for start in range(4)]
     as_dict, as_pairs, as_strings, empty = spans
     text = Text('text', [
         (as_dict, {'n': 'v'}),
         (as_pairs, [('dict', 'like')]),
         (as_strings, ['kv']),
         (empty, {}),
     ])
     self.assertListEqual(spans, list(text))
     # The tag given an empty dict has no entry in the attribute mapping.
     expected = {
         as_dict: {'n': 'v'},
         as_pairs: {'dict': 'like'},
         as_strings: {'k': 'v'},
     }
     self.assertDictEqual(expected, text.attributes['ns'])

Пример #26

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testAddFromDict(self):
     """
     addFromDict() takes a namespace -> ID -> 'start.end' -> attributes
     mapping and adds the matching tags.
     """
     text = Text('abcd')
     nested = {
         'ns1': {
             'id1': {'0.1': {'a': 'v'}, '1.2': None},
             'id2': {'2.3': {'x': 'y'}},
         },
         'ns2': {
             'id1': {'0.4': {'k': 'l'}},
         },
     }
     text.addFromDict(nested)
     # Each 'start.end' key is expected back as an integer offset pair.
     flattened = [
         (('ns1', 'id1', (0, 1)), {'a': 'v'}),
         (('ns1', 'id1', (1, 2)), None),
         (('ns1', 'id2', (2, 3)), {'x': 'y'}),
         (('ns2', 'id1', (0, 4)), {'k': 'l'}),
     ]
     self.assertListEqual(flattened, list(text.get()))

Пример #27

0

Показать файл

Файл: corpus.py Проект: living1069/libfnl

    def toText(self, stream:TextIOBase) -> iter([Text]):
        """
        Generate ``(article ID, Text)`` tuples from an open XML stream, one
        per article element.

        According to the PoS attributes on the XML token elements, tags are
        created on the text with the Penn tag names as tag IDs; the start
        and end positions of the title, abstract, and sentences go into the
        section tag namespace under their XML element names. Articles for
        which :meth:`_parseArticle` returns a false length are skipped.
        """
        for _event, node in iterparse(stream, events=('end',)):
            if node.tag != self.article_elem:
                continue

            # Reset the per-article buffers before parsing the element.
            self.article = []
            self.section_tags = []
            self.pos_tags = []

            if not self._parseArticle(node):
                continue

            text = Text(''.join(self.article))
            text.add(self.section_tags, self.section_tag_ns)
            text.add(self.pos_tags, self.pos_tag_ns)
            id_node = node.find(self.article_id_path)
            yield id_node.text.strip(), text

Пример #28

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testTagsAsDict(self):
     """
     tagsAsDict() returns the tags as a namespace -> ID -> 'start.end' ->
     attributes mapping.
     """
     tagged = [
         (('ns1', 'id1', (0, 1)), {'a': 'v'}),
         (('ns1', 'id1', (1, 2)), None),
         (('ns1', 'id2', (2, 3)), {'x': 'y'}),
         (('ns2', 'id1', (0, 4)), {'k': 'l'}),
     ]
     text = Text('abcd', tagged)
     # Offsets are expected as 'start.end' string keys.
     nested = {
         'ns1': {
             'id1': {'0.1': {'a': 'v'}, '1.2': None},
             'id2': {'2.3': {'x': 'y'}},
         },
         'ns2': {
             'id1': {'0.4': {'k': 'l'}},
         },
     }
     self.assertDictEqual(nested, text.tagsAsDict())

Пример #29

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testInitialization(self):
     """A Text built from a string alone has an empty attribute dict."""
     untagged = Text('text')
     self.assertDictEqual({}, untagged.attributes)

Пример #30

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testStringLen(self):
     """The string length counts the non-BMP character as one."""
     sample = Text('n\U0010ABCDn')
     measured = len(sample.string)
     self.assertEqual(3, measured)

Пример #31

0

Показать файл

Файл: test_text.py Проект: living1069/libfnl

 def testDigest(self):
     """``digest`` is the MD5 digest of the encoded string."""
     sample = 'aü\U0010ABCD'
     text = Text(sample)
     expected = md5(sample.encode()).digest()
     self.assertEqual(expected, text.digest)

Python Text примеры использования