/
patent.py
93 lines (70 loc) · 3.31 KB
/
patent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import lib.xmltodict as xmltodict
import unittest
# coding=utf-8
__author__ = 'Tang'
"""
Models each XML patent file using a Patent object with a dictionary attribute
that stores the XML file's element-content pairs for easier access and
manipulation.
The xmltodict module was chosen as it "makes working with XML feel like you
are working with JSON" (https://github.com/martinblech/xmltodict).
Running this python module on its own just runs the unit tests defined within.
"""
class Patent:
    """Patent data as extracted from an XML patent file using xmltodict."""

    def __init__(self, filename):
        """Initializes Patent object with data from XML file mentioned in arg,
        and stores that data in a dictionary attribute for easier access and
        manipulation.

        :param filename: Filename of XML patent file to extract data from.
        :raises PatentFileException: If the file contains no data (it is
            empty or holds nothing but whitespace).
        """
        with open(filename, 'r') as infile:
            # Ensure that data only resides on a single line
            data = infile.read().replace('\n', '')

        # read() returns an empty string for an empty file, never None, so
        # emptiness has to be detected by truthiness. Checking before parsing
        # makes an empty file raise PatentFileException instead of reaching
        # the XML parser.
        if not data.strip():
            raise PatentFileException(filename)

        # All XML contents are within 'doc' and 'str' tags consecutively,
        # hence after parsing with xmltodict, all other tags are nested
        # under 'str' as dictionaries with u'@name' or u'#text' keys and XML
        # element or content values respectively.
        self.dict = self._flatten_fields(xmltodict.parse(data)['doc']['str'])

    @staticmethod
    def _flatten_fields(fields):
        """Collapses parsed 'str' entries into a plain name -> text mapping.

        :param fields: The value found under ['doc']['str'] after parsing;
            a list of dictionaries when the tag is repeated, or a single
            value when it occurs only once.
        :return: Dictionary mapping each entry's u'@name' value to its
            u'#text' value; entries without text are left out.
        """
        # A tag that occurs only once is not wrapped in a list by the
        # parser, so normalise to a list before iterating.
        if not isinstance(fields, list):
            fields = [fields]

        flat = dict()
        for field in fields:
            # Ignore empty fields: entries that are not dictionaries carry
            # no name, and dictionaries without u'#text' carry no content.
            if isinstance(field, dict) and u'#text' in field:
                # Simplify dictionary structure
                flat[field[u'@name']] = field[u'#text']
        return flat

    def get_data(self):
        """Returns python dictionary with key-value pairs based on XML
        element-content pairs from the original XML patent file.

        :return: Python dictionary with key-value pairs based on XML
            element-content pairs from the original XML patent file.
        """
        return self.dict
class PatentFileException(Exception):
    """Signals that an XML patent file turned out to be empty."""

    def __init__(self, filename):
        """Records which patent file was found to be empty.

        :param filename: The empty patent file that raised this exception.
        """
        # Hand the filename to the base class explicitly; ``args`` ends up
        # as ``(filename,)``, exactly what it is when left implicit.
        super(PatentFileException, self).__init__(filename)
        self.filename = filename
class TestPatentClass(unittest.TestCase):
    """Checks that XML patent files are parsed into the expected data."""

    def test_read_patent(self):
        """Verifies the dictionary built from a known XML patent file.

        The XML file "EP0049154B2.xml" (from the PatSnap corpus) is parsed
        with the Patent class, and the string form of the resulting
        dictionary is compared against the contents of
        "patent_class_test1.txt" with its newlines removed. That text file
        holds the human-vetted expected dictionary derived from the same
        XML file.
        """
        with open("tests/json/patent_class_test1.txt", 'r') as expected_file:
            raw_expected = expected_file.read()
        # Join the fixture onto a single line before comparing.
        expected = ''.join(raw_expected.split('\n'))

        patent = Patent("tests/patsnap_corpus/EP0049154B2.xml")
        actual = str(patent.get_data())

        self.assertEqual(expected, actual)
# When this module is executed directly (rather than imported), run the unit
# tests defined in this module.
if __name__ == '__main__':
    unittest.main()