def __init__(self, filename, useFile=False):
    """Open the compressed dump and set up the chosen output sink.

    Args:
        filename: Path handed to ``BZ2File`` to open the dump.
        useFile: When truthy, results are written to ``output.dat``
            (opened for UTF-8 writing); otherwise a ``WikiDao`` is created
            from ``../resources/datasource.properties``.
    """
    self.file = BZ2File(filename)
    # The flag is stored before the sink is opened, so it is already set
    # even when opening the sink raises.
    self.writeToFile = bool(useFile)
    if self.writeToFile:
        self.output = codecs.open('output.dat', 'w', 'utf-8')
    else:
        self.dao = WikiDao('../resources/datasource.properties')
    # Prefix prepended to tag names when matching elements of the dump.
    self.namespace = '{http://www.mediawiki.org/xml/export-0.8/}'
class WikiParser:
    """Stream a bz2-compressed MediaWiki XML export and persist its contents.

    Depending on ``useFile`` the extracted data goes either to the text file
    ``output.dat`` (pairs of user ids per page) or to a ``WikiDao`` (pages,
    users and revisions).
    """

    def __init__(self, filename, useFile=False):
        """Open the compressed dump and set up the chosen output sink.

        Args:
            filename: Path handed to ``BZ2File`` to open the dump.
            useFile: When truthy, write to ``output.dat`` (UTF-8); otherwise
                create a ``WikiDao`` from
                ``../resources/datasource.properties``.
        """
        self.file = BZ2File(filename)
        if useFile:
            self.writeToFile = True
            self.output = codecs.open('output.dat', 'w', 'utf-8')
        else:
            self.writeToFile = False
            self.dao = WikiDao('../resources/datasource.properties')
        # Prefix prepended to tag names when matching elements of the dump.
        self.namespace = '{http://www.mediawiki.org/xml/export-0.8/}'

    def __del__(self):
        """Release the input file and whichever output sink was opened.

        Attributes are looked up defensively because ``__del__`` also runs
        for instances whose ``__init__`` raised before setting them.
        """
        source = getattr(self, 'file', None)
        if source is not None:
            source.close()
        if getattr(self, 'writeToFile', False):
            sink = getattr(self, 'output', None)
            if sink is not None:
                sink.close()
        elif hasattr(self, 'dao'):
            del self.dao

    def tag_name(self, name):
        """Return ``name`` prefixed with the export namespace."""
        return self.namespace + name

    def cope_with_data(self, data):
        """Persist the data collected for one page.

        Args:
            data: Dict with the keys ``'users'`` and, in database mode, also
                ``'page'`` and ``'revisions'``.

        Raises:
            KeyError: If a key required by the active mode is missing.
        """
        if self.writeToFile:
            userlist = data['users']
            if len(userlist) == 1:
                # Terminate the record like the pair records below so that
                # consecutive records do not run together in the file.
                self.output.write('%s\n' % userlist[0].id)
            else:
                # One line per unordered pair, in list order (i < j).
                for index, first in enumerate(userlist):
                    for second in userlist[index + 1:]:
                        self.output.write('%s;%s\n' % (first.id, second.id))
        else:
            self.dao.savePage(data['page'])
            for user in data['users']:
                self.dao.saveUser(user)
            for revision in data['revisions']:
                self.dao.saveRevision(revision)
            self.dao.connection.commit()

    def revIsValid(self, id, time):
        """Return True when both elements exist and carry text."""
        return (id is not None and id.text is not None
                and time is not None and time.text is not None)

    def process_xml(self):
        """Walk the dump and hand every valid page to ``cope_with_data``.

        Pages are handled on their "end" event: the parser only guarantees
        that an element's children and text are complete at that point, so
        reading them on "start" can silently miss revisions.
        """
        page_tag = self.tag_name('page')
        for _event, elem in etree.iterparse(self.file, events=("end",)):
            if elem.tag != page_tag:
                # Children must stay intact until their page has ended.
                continue
            data = self._extract_page_data(elem)
            if data is not None:
                self.cope_with_data(data)
            # Drop the finished page's subtree to keep memory bounded.
            elem.clear()

    def _extract_page_data(self, elem):
        """Build the page/users/revisions dict for one page element.

        Returns None when the page lacks a title or id element or when
        either of them has no text.
        """
        title = elem.find(self.tag_name('title'))
        pg_id = elem.find(self.tag_name('id'))
        if title is None or pg_id is None:
            return None
        if title.text is None or pg_id.text is None:
            print('Page title or id is none')
            return None

        # Strip single quotes and cap the title at 100 characters.
        page = Page(pg_id.text, title.text.replace('\'', '')[:100])
        revisions = []
        users = []
        for rev_elem in elem.findall(self.tag_name('revision')):
            parsed = self._parse_revision(rev_elem, page)
            if parsed is not None:
                user, revision = parsed
                users.append(user)
                revisions.append(revision)
        return {'page': page, 'revisions': revisions, 'users': users}

    def _parse_revision(self, rev_elem, page):
        """Return ``(User, Revision)`` for one revision element, or None.

        None is returned when the revision id/timestamp is unusable or when
        no contributor with both an id and a username is present.
        """
        rev_id = rev_elem.find(self.tag_name('id'))
        rev_timestamp = rev_elem.find(self.tag_name('timestamp'))
        if not self.revIsValid(rev_id, rev_timestamp):
            print('Invalid revision')
            return None

        # Only the date part, in format YYYY-MM-DD.
        timestamp = rev_timestamp.text.split('T')[0]

        rev_comment = rev_elem.find(self.tag_name('comment'))
        comment = '' if rev_comment is None else rev_comment.text
        # Strip single quotes and cap the comment at 200 characters.
        comment = '' if comment is None else comment.replace('\'', '')[:200]

        contrib = rev_elem.find(self.tag_name('contributor'))
        if contrib is None:
            return None
        username = contrib.find(self.tag_name('username'))
        user_id = contrib.find(self.tag_name('id'))
        if username is None or user_id is None:
            return None
        if user_id.text is None or username.text is None:
            print('UserId or username not found')
            return None

        user = User(user_id.text, username.text)
        revision = Revision(rev_id.text, page.id, user.id, timestamp, comment)
        return user, revision