forked from buchanaa/pypat
/
XMLParser.py
188 lines (157 loc) · 7.54 KB
/
XMLParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# Originally written by Andy Buchanan to produce a dictionary of Patent objects.
# I've modified this so that it produces an array of dictionaries (which can
# easily be loaded into MongoDB or saved as .json).
import datetime, os, xml.dom.minidom, logging
import Patent
# DB: `fr` is the source directory for the xml files to be parsed.
# It is not referenced by the XMLParser class below; presumably it is read by
# the code that drives the parser -- verify against callers.
# Alternative locations that have been used (swap the active line as needed):
# fr = '/Users/Shared/patent_raw_data/ipgb-2005-present/'
# fr = '/Users/drewblunt/Desktop/tests/'
fr = '/Volumes/7200/patent_raw_data/ipgb-2005-present/'
class XMLParser:
    """Parse files of concatenated patent XML documents into plain dicts.

    Each input file holds many XML documents back to back, every one starting
    with its own ``<?xml`` declaration line.  Every successfully parsed
    document becomes one dict appended to ``self.patns`` (an array of dicts
    that can be loaded into MongoDB or saved as .json).

    Attributes:
        patns:    list of patent dicts accumulated over all parseFile calls.
        badPatns: dict of patent numbers to skip; nothing in this class ever
                  populates it, so it stays empty unless a caller fills it.
        fn:       base name of the file currently being parsed (for logging).
        dom:      DOM of the most recently parsed document (debugging aid).
        patn:     the patent dict most recently built by parseXMLDom.
    """

    def __init__(self):
        # self.patns will be an array of dicts
        self.patns = []
        # self.badPatns isn't filled anywhere in this class (always left empty)
        self.badPatns = {}
        # Defined up front so parseXMLDom can be called directly (without
        # parseFile) and still log warnings without an AttributeError.
        self.fn = None
        self.dom = None
        self.patn = None

    def parseFile(self, fp):
        """Parse every XML document contained in the file at path ``fp``.

        The file is split into documents at each line beginning with
        ``<?xml``; each document is parsed with minidom and converted by
        parseXMLDom.  Documents for which parseXMLDom returns None, or whose
        'pno' is a key of ``self.badPatns``, are skipped.

        Returns:
            The tuple ``(self.patns, self.badPatns)``.  ``self.patns`` keeps
            growing across calls; it is not reset per file.

        Raises:
            Whatever ``open`` or ``xml.dom.minidom.parseString`` raise for an
            unreadable file or malformed document.
        """
        self.fn = os.path.basename(fp)

        bufs = []
        current = []
        with open(fp) as fXml:
            for line in fXml:
                # A new declaration closes the document collected so far.
                if line.startswith('<?xml') and current:
                    bufs.append(''.join(current))
                    current = []
                current.append(line)
        if current:
            bufs.append(''.join(current))  # make sure to catch the last one

        for sPatn in bufs:
            # An empty file, or stray whitespace before the first declaration,
            # is not a document; skip it instead of crashing in parseString.
            if not sPatn.strip():
                continue
            self.dom = xml.dom.minidom.parseString(sPatn)  # save dom for debugging convenience
            self.patn = self.parseXMLDom(self.dom)
            if self.patn is not None and self.patn['pno'] not in self.badPatns:
                self.patns.append(self.patn)
        return (self.patns, self.badPatns)

    def parseXMLDom(self, dom):
        """Convert one parsed patent document into a patent dict.

        Args:
            dom: a minidom Document for a single patent.

        Returns:
            None when the publication doc-number is not an integer (presumed
            not to be a utility patent).  Otherwise a dict, also stored as
            ``self.patn``, with the keys:
              'pno'      integer patent number
              'isd'      issue date as a datetime at midnight
              'isq'      Patent.d2q(isd)
              'apd'      application date as a datetime at midnight
              'apq'      Patent.d2q(apd)
              'uspc'     main national classification
              'ipc'      IPC classification string
              'title'    invention title text
              'assignee' only if an <assignees> element exists
              'abstract' only if an <abstract> element exists
              'rawcites' list of integer numbers of cited US patents
              'cites', 'citedby'  empty lists (not filled in here)

        Raises:
            IndexError if a required element (publication-reference,
            application-reference, classification, title, ...) is missing.
        """
        elmPubRef = dom.getElementsByTagName('publication-reference')[0].getElementsByTagName('document-id')[0]
        try:
            pno = int(elmPubRef.getElementsByTagName('doc-number')[0].childNodes[0].data)
        except ValueError:
            # presume that pno found is not a utility patent and ignore
            return None
        self.patn = {
            'rawcites': [],
            'cites': [],
            'citedby': [],
            'pno': pno,
        }

        isd = elmPubRef.getElementsByTagName('date')[0].childNodes[0].data
        isd = datetime.datetime.strptime(isd, "%Y%m%d").date()
        # Mongo cannot accept dates, only date+time. Gotta pad isd with time = midnight.
        self.patn['isd'] = datetime.datetime.combine(isd, datetime.datetime.min.time())
        self.patn['isq'] = Patent.d2q(self.patn['isd'])

        elmAppRef = dom.getElementsByTagName('application-reference')[0].getElementsByTagName('document-id')[0]
        apd = elmAppRef.getElementsByTagName('date')[0].childNodes[0].data
        apd = datetime.datetime.strptime(apd, "%Y%m%d").date()
        # again, have to pad the date to make it a date+time
        self.patn['apd'] = datetime.datetime.combine(apd, datetime.datetime.min.time())
        self.patn['apq'] = Patent.d2q(self.patn['apd'])  # NB: may be out of nQuarters range

        uspc = dom.getElementsByTagName('classification-national')[0]
        uspc = uspc.getElementsByTagName('main-classification')[0].childNodes[0].data
        self.patn['uspc'] = str(uspc.encode('ascii', 'replace'))

        elmIpc = (dom.getElementsByTagName('classification-ipc')
                  + dom.getElementsByTagName('classification-ipcr'))[0]
        try:
            # for classification-ipcr which breaks out each part of the IPC
            # section-class-subclass group/subgroup
            # NB: this is not the same format for these as in the DAT files
            ipc = "%s%s%s %s/%s" % tuple(
                [x.childNodes[0].data for x in elmIpc.childNodes[5:14]
                 if x.nodeType == x.ELEMENT_NODE])
        except IndexError:
            # sometimes the main-group is just <main-group/> instead of a real value
            # this is treated as '1' in the online database
            # so that's what we'll do too
            ipc1 = "%s%s%s" % tuple(
                [x.childNodes[0].data for x in elmIpc.childNodes[5:10]
                 if x.nodeType == x.ELEMENT_NODE])
            # Use the text of the subgroup element; formatting the element
            # itself would embed the DOM object's repr in the IPC string.
            ipc2 = " 01/%s" % (elmIpc.childNodes[13].childNodes[0].data,)
            ipc = ipc1 + ipc2
        except TypeError:
            # for classification-ipc which just gives a single string for each IPC
            # (the element count does not match the format string above)
            ipc = elmIpc.getElementsByTagName('main-classification')[0].childNodes[0].data
        self.patn['ipc'] = str(ipc.encode('ascii', 'replace'))

        elmsTitles = dom.getElementsByTagName('invention-title')[0].childNodes
        titleParts = []
        for node in elmsTitles:
            # sometimes the title has subelements like italic text or what have you
            # sometimes the subelements don't have text at the bottom
            # 7632827, I'm looking at you here
            while node.nodeType != node.TEXT_NODE and node.childNodes:
                node = node.childNodes[0]
            if node.nodeType == node.TEXT_NODE:
                titleParts.append(str(node.data.encode('ascii', 'replace')))
            else:
                logging.warning('Skipped part of title %d in %s: %s', self.patn['pno'], self.fn, node)
        self.patn['title'] = ''.join(titleParts)

        elmAssig = dom.getElementsByTagName('assignees')
        if elmAssig:
            elmAssig = elmAssig[0]
            # sometimes it's an orgname, sometimes first + last, get all
            ass = (elmAssig.getElementsByTagName('orgname')
                   + elmAssig.getElementsByTagName('first-name')
                   + elmAssig.getElementsByTagName('last-name'))
            self.patn['assignee'] = str(' '.join(
                [x.childNodes[0].data.encode('ascii', 'replace') for x in ass]))

        # DB: I wrote this part to load the abstracts, copying the elmAssig codeblock above
        elmAbstract = dom.getElementsByTagName('abstract')
        if elmAbstract:
            pgraphs = elmAbstract[0].getElementsByTagName('p')
            # Reading p.childNodes[0].data directly fails with "Element
            # instance has no attribute 'data'" on some abstracts, so the
            # text nodes of each paragraph are gathered through handleTok.
            # see https://mail.python.org/pipermail/tutor/2004-July/030397.html
            abstract = str(self.handleTok(pgraphs).encode('ascii', 'replace'))
            self.patn['abstract'] = abstract

        elmRefCit = dom.getElementsByTagName('references-cited')
        if not elmRefCit:
            # the data changes citation field-name convention between '05 and '14.
            elmRefCit = dom.getElementsByTagName('us-references-cited')
        if elmRefCit:
            for cite in elmRefCit[0].getElementsByTagName('patcit'):
                if cite.getElementsByTagName('country')[0].childNodes[0].data != 'US':
                    continue
                try:
                    citedPno = int(cite.getElementsByTagName('doc-number')[0].childNodes[0].data)
                except ValueError:
                    # presume that pno cited is not a utility patent and ignore
                    continue
                self.patn['rawcites'].append(citedPno)
        else:
            # The stale title-loop variable is deliberately not logged here:
            # it is unrelated to citations and is unbound for an empty title.
            logging.warning('No citations for %d in %s', self.patn['pno'], self.fn)
        return self.patn

    # Following two functions help parse especially long abstracts.
    # found on StackOverflow: http://stackoverflow.com/questions/6653128
    # Thanks to asker 'coffee' and answerer 'kmdent'.
    # Each takes 'self' in keeping with Andy's style.
    def getText(self, nodelist):
        """Return the concatenated data of the text nodes found in ``nodelist``.

        Only nodes that are themselves text nodes contribute; text nested
        inside child elements is not descended into.
        """
        return ''.join(
            [node.data for node in nodelist if node.nodeType == node.TEXT_NODE])

    def handleTok(self, tokenlist):
        """Return the text of each element in ``tokenlist``, joined by newlines.

        The StackOverflow original appended each piece with " " in a loop;
        this uses '\\n'.join so that each element lands on its own line.
        """
        return '\n'.join([self.getText(token.childNodes) for token in tokenlist])