def _validate(aString, firstOccurrenceOnly, loggedEvents, base, encoding, selfURIs=None):
    """Validate RSS from a string and return the validator object.

    Arguments:
      aString             -- the feed text (expected to be Unicode by now)
      firstOccurrenceOnly -- forwarded to validator.setFirstOccurrenceOnly()
      loggedEvents        -- events added to validator.loggedEvents before
                             parsing starts
      base                -- passed to SAXDispatcher; also used as the only
                             self URI when selfURIs is empty or None
      encoding            -- passed to SAXDispatcher
      selfURIs            -- optional list of self URIs

    Returns the SAXDispatcher instance that received the parse events.
    SAXException raised while parsing is swallowed; UnicodeError is logged
    on the validator as logging.UnicodeError.
    """
    from xml.sax import make_parser, handler
    from base import SAXDispatcher
    # NOTE(review): kept as a local import; presumably this guarantees that
    # the except clause below sees the built-in exception even if the module
    # namespace defines its own UnicodeError -- confirm before removing.
    from exceptions import UnicodeError
    from cStringIO import StringIO

    # By now, aString should be Unicode
    source = InputSource()
    source.setByteStream(StringIO(xmlEncoding.asUTF8(aString)))

    validator = SAXDispatcher(base, selfURIs or [base], encoding)
    validator.setFirstOccurrenceOnly(firstOccurrenceOnly)

    validator.loggedEvents += loggedEvents

    # experimental RSS-Profile draft 1.06 support
    validator.setLiterals(re.findall('&(\w+);', aString))

    xmlver = re.match("^<\?\s*xml\s+version\s*=\s*['\"]([-a-zA-Z0-9_.:]*)['\"]", aString)
    if xmlver and xmlver.group(1) != '1.0':
        validator.log(logging.BadXmlVersion({"version": xmlver.group(1)}))

    try:
        from xml.sax.expatreader import ExpatParser

        class fake_dtd_parser(ExpatParser):
            def reset(self):
                ExpatParser.reset(self)
                self._parser.UseForeignDTD(1)

        parser = fake_dtd_parser()
    except Exception:
        # Fall back to whatever SAX parser is available.  Exception (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
        parser = make_parser()

    parser.setFeature(handler.feature_namespaces, 1)
    parser.setContentHandler(validator)
    parser.setErrorHandler(validator)
    parser.setEntityResolver(validator)
    if hasattr(parser, '_ns_stack'):
        # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
        # PyXML doesn't have this problem, and it doesn't have _ns_stack either
        parser._ns_stack.append({'http://www.w3.org/XML/1998/namespace': 'xml'})

    def xmlvalidate(log):
        """Re-read the document with libxml2 validation on; report via log()."""
        import libxml2
        from StringIO import StringIO
        from random import random

        # Random marker handed to newTextReader(); only error lines that
        # start with it are reported below.
        prefix = "...%s..." % str(random()).replace('0.', '')
        messages = []
        libxml2.registerErrorHandler(lambda ctx, text: ctx.append(text), messages)

        buf = libxml2.inputBuffer(StringIO(xmlEncoding.asUTF8(aString)))
        reader = buf.newTextReader(prefix)
        reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)
        ret = reader.Read()
        while ret == 1:
            ret = reader.Read()

        for line in ''.join(messages).splitlines():
            if line.startswith(prefix):
                # Keep only the text after the fourth ':' of the error line.
                log(line.split(':', 4)[-1].strip())

    validator.xmlvalidator = xmlvalidate

    try:
        parser.parse(source)
    except SAXException:
        pass
    except UnicodeError:
        import sys
        value = sys.exc_info()[1]
        validator.log(logging.UnicodeError({"exception": value}))

    if validator.getFeedType() == TYPE_RSS1:
        # Second pass over the same bytes with an RDF/XML handler; failures
        # here (including rdflib being unavailable) are deliberately ignored.
        try:
            from rdflib.syntax.parsers.RDFXMLHandler import RDFXMLHandler

            class Handler(RDFXMLHandler):
                ns_prefix_map = {}
                prefix_ns_map = {}

                def add(self, triple):
                    pass

                def __init__(self, dispatcher):
                    RDFXMLHandler.__init__(self, self)
                    self.dispatcher = dispatcher

                def error(self, message):
                    self.dispatcher.log(InvalidRDF({"message": message}))

            source.getByteStream().reset()
            parser.reset()
            parser.setContentHandler(Handler(parser.getContentHandler()))
            parser.setErrorHandler(handler.ErrorHandler())
            parser.parse(source)
        except Exception:
            # Best effort only, but no longer traps KeyboardInterrupt/SystemExit.
            pass

    return validator
class DataImporter(Digester):
    """Replay the events of a bzip2-compressed XML packet on a DB connection.

    The packet is parsed through the Digester rules registered in
    _add_rules(); every <event> element is turned into one INSERT, UPDATE
    or DELETE statement executed on the cursor of ictx['conn'].connection.
    close() commits when load() finished successfully and rolls back
    otherwise.
    """

    def __init__(self, ictx, file):
        """Set up the input stream, DB cursor and parsing rules.

        ictx -- mapping that provides 'conn', 'schema_seq' and
                'replication_seq'
        file -- open file object; its .name is re-opened through BZ2File
        """
        Digester.__init__(self)
        self._ictx = ictx
        self._file = file
        self._input = InputSource(file.name)
        self._input.setByteStream(BZ2File(file.name, 'r'))
        self._conn = ictx['conn'].connection
        self._cursor = self._conn.cursor()
        self.success = self._closed = False
        self._add_rules()

    def _add_rules(self):
        """Register the element callbacks for the packet structure."""
        self.addOnBegin('packet', self._check_packet)
        self.addOnBeginAndEnd('packet/transaction/event',
                              self._on_event, self._on_event_end)
        self.addOnBody('packet/transaction/event/keys/column',
                       self._on_key_column)
        self.addOnBody('packet/transaction/event/values/column',
                       self._on_value_column)
        self.addOnFinish(self._on_finish)

    def _require_seq(self, attrs, name):
        """Raise Exception unless attribute `name` equals self._ictx[name]."""
        actual = attrs.getValue(name)
        expected = self._ictx[name]
        if expected != int(actual):
            raise Exception(
                '<packet> {0}: {1} not matched the expected seq number {2}'
                .format(name, actual, expected))

    def _check_packet(self, tag, attrs):
        """Verify the <packet> sequence numbers against the import context.

        Raises Exception with a formatted message on the first mismatch;
        schema_seq is checked before replication_seq.
        """
        self._require_seq(attrs, 'schema_seq')
        self._require_seq(attrs, 'replication_seq')

    def _on_key_column(self, tag, attrs, val):
        """Record a key column value on the event currently being built."""
        event = self.peek()
        event['keys'][attrs.getValue('name')] = val

    def _on_value_column(self, tag, attrs, val):
        """Record a value column; null="yes" stores None instead of val."""
        event = self.peek()
        is_null = attrs.getValue("null") if attrs.has_key('null') else None
        event['values'][attrs.getValue('name')] = val if is_null != "yes" else None

    def _on_event(self, tag, attrs):
        """Push a fresh event record when an <event> element opens."""
        event = {
            'op': attrs.getValue('op'),
            'table': attrs.getValue('table'),
            'keys': OrderedDict(),    # column name -> column value, in document order
            'values': OrderedDict(),  # column name -> column value, in document order
        }
        self.push(event)

    def _on_event_end(self, tag):
        """Build and execute the SQL statement for the finished event.

        op 'I' -> INSERT, 'U' -> UPDATE, 'D' -> DELETE; any other op raises
        Exception.  Column values are passed as statement parameters.

        NOTE(review): table and column names are interpolated straight into
        the SQL text, so the packet has to come from a trusted source.
        """
        event = self.pop()
        op = event['op']
        table = event['table']
        keys = event['keys']
        values = event['values']
        params = []
        if op == 'I':
            sql_columns = ', '.join(values.keys())
            sql_values = ', '.join(['%s'] * len(values))
            sql = 'INSERT INTO %s (%s) VALUES (%s)' % (table, sql_columns, sql_values)
            # list(): .values() is only guaranteed to be a list on Python 2.
            params = list(values.values())
        elif op == 'U':
            sql_values = ', '.join('%s=%%s' % column for column in values)
            sql = 'UPDATE %s SET %s' % (table, sql_values)
            params = list(values.values())
        elif op == 'D':
            sql = 'DELETE FROM %s' % table
        else:
            raise Exception('Invalid <event> op: %s' % op)

        if op == 'D' or op == 'U':
            # None keys are compared with IS, everything else with '='.
            sql += ' WHERE ' + ' AND '.join(
                '%s%s%%s' % (column, ' IS ' if keys[column] is None else '=')
                for column in keys.keys())
            params.extend(keys.values())

        self._cursor.execute(sql, params)

    def _on_finish(self):
        """Hook called at end of parsing; intentionally does nothing."""
        pass

    def load(self):
        """Parse the input and mark the import successful if no error occurs."""
        logger.warning('Saving dataset....')
        self.parse(self._input)
        self.success = True

    def recover(self):
        """
        This is a dirty hack to remove weird characters present in some
        replication files, using the tidy tool.

        The current byte stream is copied to a temporary file, run through
        `tidy -xml`, and the cleaned output is loaded instead.  Both
        temporary files are removed afterwards.
        """
        logger.warning('Trying to recover invalid XML...')
        originalXML = None
        fixedXML = None
        try:
            originalXML = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)  # bunzipped tmp
            fixedXML = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)  # fixed tmp
            fixedXML.close()

            # Fetch uncompressed file data to recover
            bzf = self._input.getByteStream()
            bzf.seek(0)
            shutil.copyfileobj(bzf, originalXML)
            originalXML.close()

            cmd = ['tidy', '-xml', '-o', fixedXML.name, originalXML.name]
            logger.warning('Running: %s', ' '.join(cmd))
            ret = subprocess.call(cmd)
            if ret:
                # Still best effort: try to load whatever tidy produced, but
                # leave a trace of the non-zero status instead of hiding it.
                logger.warning('tidy exited with status %s', ret)

            # ready to load
            self.close()
            self._file = open(fixedXML.name, 'r')
            self._input = InputSource(fixedXML.name)
            self._input.setByteStream(self._file)
            self._cursor = self._conn.cursor()
            self.success = self._closed = False
            self.reset()
            self._add_rules()
            self.load()
        finally:
            for f in [originalXML, fixedXML]:
                if f and not f.closed:
                    f.close()
                if f and os.path.exists(f.name):
                    os.unlink(f.name)

    def close(self):
        """Commit or roll back, then release cursor and files (idempotent)."""
        if self._closed:
            return
        try:
            if self.success:
                self._conn.commit()
                logger.warning('Done')
            else:
                logger.warning('Rolling back transaction. Seq number: {0}'.format(
                    self._ictx['replication_seq']))
                self._conn.rollback()
            self._cursor.close()
        finally:
            self._closed = True
            try:
                self._input.getByteStream().close()
            finally:
                # Close the underlying file even if closing the stream failed.
                self._file.close()
def _validate(aString, firstOccurrenceOnly, loggedEvents, base, encoding, selfURIs=None, mediaType=None):
    """Validate RSS from a string and return the validator object.

    Arguments:
      aString             -- the feed text (expected to be Unicode by now)
      firstOccurrenceOnly -- forwarded to validator.setFirstOccurrenceOnly()
      loggedEvents        -- list of events added to validator.loggedEvents
                             before parsing; a WPBlankLine event may be
                             appended to this list in place
      base                -- passed to SAXDispatcher; also used as the only
                             self URI when selfURIs is empty or None
      encoding            -- passed to SAXDispatcher
      selfURIs            -- optional list of self URIs
      mediaType           -- 'application/atomsvc+xml' or
                             'application/atomcat+xml' presets the feed type

    Returns the SAXDispatcher instance that received the parse events.
    SAXException raised while parsing is swallowed; UnicodeError is logged
    on the validator as logging.UnicodeError.
    """
    from xml.sax import make_parser, handler
    from .base import SAXDispatcher
    # NOTE(review): kept as a local import; presumably this guarantees that
    # the except clause below sees the built-in exception even if the module
    # namespace defines its own UnicodeError -- confirm before removing.
    from exceptions import UnicodeError
    from cStringIO import StringIO

    if re.match("^\s+<\?xml", aString) and re.search("<generator.*wordpress.*</generator>", aString):
        lt = aString.find('<')
        gt = aString.find('>')
        if lt > 0 and gt > 0 and lt < gt:
            loggedEvents.append(logging.WPBlankLine({'line': 1, 'column': 1}))
            # rearrange so that other errors can be found: move the first
            # <...> construct in front of the leading whitespace
            aString = aString[lt:gt + 1] + aString[0:lt] + aString[gt + 1:]

    # By now, aString should be Unicode
    source = InputSource()
    source.setByteStream(StringIO(xmlEncoding.asUTF8(aString)))

    validator = SAXDispatcher(base, selfURIs or [base], encoding)
    validator.setFirstOccurrenceOnly(firstOccurrenceOnly)

    if mediaType == 'application/atomsvc+xml':
        validator.setFeedType(TYPE_APP_SERVICE)
    elif mediaType == 'application/atomcat+xml':
        validator.setFeedType(TYPE_APP_CATEGORIES)

    validator.loggedEvents += loggedEvents

    # experimental RSS-Profile support: one flag per input line, true when
    # the line contains a hexadecimal character reference
    validator.rssCharData = [s.find('&#x') >= 0 for s in aString.split('\n')]

    xmlver = re.match("^<\?\s*xml\s+version\s*=\s*['\"]([-a-zA-Z0-9_.:]*)['\"]", aString)
    if xmlver and xmlver.group(1) != '1.0':
        validator.log(logging.BadXmlVersion({"version": xmlver.group(1)}))

    try:
        from xml.sax.expatreader import ExpatParser

        class fake_dtd_parser(ExpatParser):
            def reset(self):
                ExpatParser.reset(self)
                self._parser.UseForeignDTD(1)

        parser = fake_dtd_parser()
    except Exception:
        # Fall back to whatever SAX parser is available.  Exception (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
        parser = make_parser()

    parser.setFeature(handler.feature_namespaces, 1)
    parser.setContentHandler(validator)
    parser.setErrorHandler(validator)
    parser.setEntityResolver(validator)
    if hasattr(parser, '_ns_stack'):
        # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
        # PyXML doesn't have this problem, and it doesn't have _ns_stack either
        parser._ns_stack.append({'http://www.w3.org/XML/1998/namespace': 'xml'})

    def xmlvalidate(log):
        """Re-read the document with libxml2 validation on; report via log()."""
        import libxml2
        from StringIO import StringIO
        from random import random

        # Random marker handed to newTextReader(); only error lines that
        # start with it are reported below.
        prefix = "...%s..." % str(random()).replace('0.', '')
        messages = []
        libxml2.registerErrorHandler(lambda ctx, text: ctx.append(text), messages)

        buf = libxml2.inputBuffer(StringIO(xmlEncoding.asUTF8(aString)))
        reader = buf.newTextReader(prefix)
        reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)
        ret = reader.Read()
        while ret == 1:
            ret = reader.Read()

        for line in ''.join(messages).splitlines():
            if line.startswith(prefix):
                # Keep only the text after the fourth ':' of the error line.
                log(line.split(':', 4)[-1].strip())

    validator.xmlvalidator = xmlvalidate

    try:
        parser.parse(source)
    except SAXException:
        pass
    except UnicodeError:
        import sys
        value = sys.exc_info()[1]
        validator.log(logging.UnicodeError({"exception": value}))

    if validator.getFeedType() == TYPE_RSS1:
        # Second pass over the same bytes with an RDF/XML handler; failures
        # here (including rdflib being unavailable) are deliberately ignored.
        try:
            from rdflib.syntax.parsers.RDFXMLHandler import RDFXMLHandler

            class Handler(RDFXMLHandler):
                ns_prefix_map = {}
                prefix_ns_map = {}

                def add(self, triple):
                    pass

                def __init__(self, dispatcher):
                    RDFXMLHandler.__init__(self, self)
                    self.dispatcher = dispatcher

                def error(self, message):
                    self.dispatcher.log(InvalidRDF({"message": message}))

            source.getByteStream().reset()
            parser.reset()
            parser.setContentHandler(Handler(parser.getContentHandler()))
            parser.setErrorHandler(handler.ErrorHandler())
            parser.parse(source)
        except Exception:
            # Best effort only, but no longer traps KeyboardInterrupt/SystemExit.
            pass

    return validator
class DataImporter(Digester):
    """Replay the events of a bzip2-compressed XML packet on a DB connection.

    The packet is parsed through the Digester rules registered in
    _add_rules(); every <event> element is turned into one INSERT, UPDATE
    or DELETE statement executed on the cursor of ictx['conn'].connection.
    close() commits when load() finished successfully and rolls back
    otherwise.
    """

    def __init__(self, ictx, file):
        """Set up the input stream, DB cursor and parsing rules.

        ictx -- mapping that provides 'conn', 'schema_seq' and
                'replication_seq'
        file -- open file object; its .name is re-opened through BZ2File
        """
        Digester.__init__(self)
        self._ictx = ictx
        self._file = file
        self._input = InputSource(file.name)
        self._input.setByteStream(BZ2File(file.name, 'r'))
        self._conn = ictx['conn'].connection
        self._cursor = self._conn.cursor()
        self.success = self._closed = False
        self._add_rules()

    def _add_rules(self):
        """Register the element callbacks for the packet structure."""
        self.addOnBegin('packet', self._check_packet)
        self.addOnBeginAndEnd('packet/transaction/event',
                              self._on_event, self._on_event_end)
        self.addOnBody('packet/transaction/event/keys/column',
                       self._on_key_column)
        self.addOnBody('packet/transaction/event/values/column',
                       self._on_value_column)
        self.addOnFinish(self._on_finish)

    def _require_seq(self, attrs, name):
        """Raise Exception unless attribute `name` equals self._ictx[name]."""
        actual = attrs.getValue(name)
        expected = self._ictx[name]
        if expected != int(actual):
            raise Exception(
                '<packet> {0}: {1} not matched the expected seq number {2}'
                .format(name, actual, expected))

    def _check_packet(self, tag, attrs):
        """Verify the <packet> sequence numbers against the import context.

        Raises Exception with a formatted message on the first mismatch;
        schema_seq is checked before replication_seq.
        """
        self._require_seq(attrs, 'schema_seq')
        self._require_seq(attrs, 'replication_seq')

    def _on_key_column(self, tag, attrs, val):
        """Record a key column value on the event currently being built."""
        event = self.peek()
        event['keys'][attrs.getValue('name')] = val

    def _on_value_column(self, tag, attrs, val):
        """Record a value column; null="yes" stores None instead of val."""
        event = self.peek()
        is_null = attrs.getValue("null") if attrs.has_key('null') else None
        event['values'][attrs.getValue('name')] = val if is_null != "yes" else None

    def _on_event(self, tag, attrs):
        """Push a fresh event record when an <event> element opens."""
        event = {
            'op': attrs.getValue('op'),
            'table': attrs.getValue('table'),
            'keys': OrderedDict(),    # column name -> column value, in document order
            'values': OrderedDict(),  # column name -> column value, in document order
        }
        self.push(event)

    def _on_event_end(self, tag):
        """Build and execute the SQL statement for the finished event.

        op 'I' -> INSERT, 'U' -> UPDATE, 'D' -> DELETE; any other op raises
        Exception.  Column values are passed as statement parameters.

        NOTE(review): table and column names are interpolated straight into
        the SQL text, so the packet has to come from a trusted source.
        """
        event = self.pop()
        op = event['op']
        table = event['table']
        keys = event['keys']
        values = event['values']
        params = []
        if op == 'I':
            sql_columns = ', '.join(values.keys())
            sql_values = ', '.join(['%s'] * len(values))
            sql = 'INSERT INTO %s (%s) VALUES (%s)' % (table, sql_columns, sql_values)
            # list(): .values() is only guaranteed to be a list on Python 2.
            params = list(values.values())
        elif op == 'U':
            sql_values = ', '.join('%s=%%s' % column for column in values)
            sql = 'UPDATE %s SET %s' % (table, sql_values)
            params = list(values.values())
        elif op == 'D':
            sql = 'DELETE FROM %s' % table
        else:
            raise Exception('Invalid <event> op: %s' % op)

        if op == 'D' or op == 'U':
            # None keys are compared with IS, everything else with '='.
            sql += ' WHERE ' + ' AND '.join(
                '%s%s%%s' % (column, ' IS ' if keys[column] is None else '=')
                for column in keys.keys())
            params.extend(keys.values())

        self._cursor.execute(sql, params)

    def _on_finish(self):
        """Hook called at end of parsing; intentionally does nothing."""
        pass

    def load(self):
        """Parse the input and mark the import successful if no error occurs."""
        logger.warning('Saving dataset....')
        self.parse(self._input)
        self.success = True

    def recover(self):
        """
        This is a dirty hack to remove weird characters present in some
        replication files, using the tidy tool.

        The current byte stream is copied to a temporary file, run through
        `tidy -xml`, and the cleaned output is loaded instead.  Both
        temporary files are removed afterwards.
        """
        logger.warning('Trying to recover invalid XML...')
        originalXML = None
        fixedXML = None
        try:
            originalXML = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)  # bunzipped tmp
            fixedXML = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)  # fixed tmp
            fixedXML.close()

            # Fetch uncompressed file data to recover
            bzf = self._input.getByteStream()
            bzf.seek(0)
            shutil.copyfileobj(bzf, originalXML)
            originalXML.close()

            cmd = ['tidy', '-xml', '-o', fixedXML.name, originalXML.name]
            logger.warning('Running: %s', ' '.join(cmd))
            ret = subprocess.call(cmd)
            if ret:
                # Still best effort: try to load whatever tidy produced, but
                # leave a trace of the non-zero status instead of hiding it.
                logger.warning('tidy exited with status %s', ret)

            # ready to load
            self.close()
            self._file = open(fixedXML.name, 'r')
            self._input = InputSource(fixedXML.name)
            self._input.setByteStream(self._file)
            self._cursor = self._conn.cursor()
            self.success = self._closed = False
            self.reset()
            self._add_rules()
            self.load()
        finally:
            for f in [originalXML, fixedXML]:
                if f and not f.closed:
                    f.close()
                if f and os.path.exists(f.name):
                    os.unlink(f.name)

    def close(self):
        """Commit or roll back, then release cursor and files (idempotent)."""
        if self._closed:
            return
        try:
            if self.success:
                self._conn.commit()
                logger.warning('Done')
            else:
                logger.warning('Rolling back transaction. Seq number: {0}'.format(
                    self._ictx['replication_seq']))
                self._conn.rollback()
            self._cursor.close()
        finally:
            self._closed = True
            try:
                self._input.getByteStream().close()
            finally:
                # Close the underlying file even if closing the stream failed.
                self._file.close()
def _validate(aString, firstOccurrenceOnly, loggedEvents, base, encoding, selfURIs=None, mediaType=None):
    """Validate RSS from a string and return the validator object.

    Arguments:
      aString             -- the feed text (expected to be Unicode by now)
      firstOccurrenceOnly -- forwarded to validator.setFirstOccurrenceOnly()
      loggedEvents        -- list of events added to validator.loggedEvents
                             before parsing; a WPBlankLine event may be
                             appended to this list in place
      base                -- passed to SAXDispatcher; also used as the only
                             self URI when selfURIs is empty or None
      encoding            -- passed to SAXDispatcher
      selfURIs            -- optional list of self URIs
      mediaType           -- 'application/atomsvc+xml' or
                             'application/atomcat+xml' presets the feed type

    Returns the SAXDispatcher instance that received the parse events.
    SAXException raised while parsing is swallowed; UnicodeError is logged
    on the validator as logging.UnicodeError.
    """
    from xml.sax import make_parser, handler
    from .base import SAXDispatcher
    # NOTE(review): kept as a local import; presumably this guarantees that
    # the except clause below sees the built-in exception even if the module
    # namespace defines its own UnicodeError -- confirm before removing.
    from exceptions import UnicodeError
    from cStringIO import StringIO

    if re.match("^\s+<\?xml", aString) and re.search(
            "<generator.*wordpress.*</generator>", aString):
        lt = aString.find('<')
        gt = aString.find('>')
        if lt > 0 and gt > 0 and lt < gt:
            loggedEvents.append(logging.WPBlankLine({'line': 1, 'column': 1}))
            # rearrange so that other errors can be found: move the first
            # <...> construct in front of the leading whitespace
            aString = aString[lt:gt + 1] + aString[0:lt] + aString[gt + 1:]

    # By now, aString should be Unicode
    source = InputSource()
    source.setByteStream(StringIO(xmlEncoding.asUTF8(aString)))

    validator = SAXDispatcher(base, selfURIs or [base], encoding)
    validator.setFirstOccurrenceOnly(firstOccurrenceOnly)

    if mediaType == 'application/atomsvc+xml':
        validator.setFeedType(TYPE_APP_SERVICE)
    elif mediaType == 'application/atomcat+xml':
        validator.setFeedType(TYPE_APP_CATEGORIES)

    validator.loggedEvents += loggedEvents

    # experimental RSS-Profile support: one flag per input line, true when
    # the line contains a hexadecimal character reference
    validator.rssCharData = [s.find('&#x') >= 0 for s in aString.split('\n')]

    xmlver = re.match(
        "^<\?\s*xml\s+version\s*=\s*['\"]([-a-zA-Z0-9_.:]*)['\"]", aString)
    if xmlver and xmlver.group(1) != '1.0':
        validator.log(logging.BadXmlVersion({"version": xmlver.group(1)}))

    try:
        from xml.sax.expatreader import ExpatParser

        class fake_dtd_parser(ExpatParser):
            def reset(self):
                ExpatParser.reset(self)
                self._parser.UseForeignDTD(1)

        parser = fake_dtd_parser()
    except Exception:
        # Fall back to whatever SAX parser is available.  Exception (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
        parser = make_parser()

    parser.setFeature(handler.feature_namespaces, 1)
    parser.setContentHandler(validator)
    parser.setErrorHandler(validator)
    parser.setEntityResolver(validator)
    if hasattr(parser, '_ns_stack'):
        # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
        # PyXML doesn't have this problem, and it doesn't have _ns_stack either
        parser._ns_stack.append(
            {'http://www.w3.org/XML/1998/namespace': 'xml'})

    def xmlvalidate(log):
        """Re-read the document with libxml2 validation on; report via log()."""
        import libxml2
        from StringIO import StringIO
        from random import random

        # Random marker handed to newTextReader(); only error lines that
        # start with it are reported below.
        prefix = "...%s..." % str(random()).replace('0.', '')
        messages = []
        libxml2.registerErrorHandler(lambda ctx, text: ctx.append(text), messages)

        buf = libxml2.inputBuffer(StringIO(xmlEncoding.asUTF8(aString)))
        reader = buf.newTextReader(prefix)
        reader.SetParserProp(libxml2.PARSER_VALIDATE, 1)
        ret = reader.Read()
        while ret == 1:
            ret = reader.Read()

        for line in ''.join(messages).splitlines():
            if line.startswith(prefix):
                # Keep only the text after the fourth ':' of the error line.
                log(line.split(':', 4)[-1].strip())

    validator.xmlvalidator = xmlvalidate

    try:
        parser.parse(source)
    except SAXException:
        pass
    except UnicodeError:
        import sys
        value = sys.exc_info()[1]
        validator.log(logging.UnicodeError({"exception": value}))

    if validator.getFeedType() == TYPE_RSS1:
        # Second pass over the same bytes with an RDF/XML handler; failures
        # here (including rdflib being unavailable) are deliberately ignored.
        try:
            from rdflib.syntax.parsers.RDFXMLHandler import RDFXMLHandler

            class Handler(RDFXMLHandler):
                ns_prefix_map = {}
                prefix_ns_map = {}

                def add(self, triple):
                    pass

                def __init__(self, dispatcher):
                    RDFXMLHandler.__init__(self, self)
                    self.dispatcher = dispatcher

                def error(self, message):
                    self.dispatcher.log(InvalidRDF({"message": message}))

            source.getByteStream().reset()
            parser.reset()
            parser.setContentHandler(Handler(parser.getContentHandler()))
            parser.setErrorHandler(handler.ErrorHandler())
            parser.parse(source)
        except Exception:
            # Best effort only, but no longer traps KeyboardInterrupt/SystemExit.
            pass

    return validator