def parse(location, handler):
    """
    Parse the XML file at `location` and return the result of calling the
    `handler` function on the parsed etree document.

    Two approaches are tried in order:

    1. parse the file directly from `location` with lxml, then call
       `handler` on the result;
    2. if the parsing or the `handler` call fails, read the file as
       unicode text, parse that text from a string and call `handler`
       again on the new document.

    The second approach is a workaround for some lxml bug/weirdness wrt
    unicode in the 2.3 version in use and helps with weird encodings.

    Because `handler` can be called a second time after a failure, it must
    have no side effects. An exception raised by the second attempt is not
    caught here and propagates to the caller.
    """
    try:
        parser = etree.XMLParser(recover=True, remove_blank_text=True, resolve_entities=False)
        xdoc = etree.parse(location, parser)
        return handler(xdoc)
    except Exception:
        # Fall back on any regular error, whether it comes from the parsing
        # or from the handler. A bare `except:` would also swallow
        # KeyboardInterrupt and SystemExit and retry instead of stopping.
        parser = etree.XMLParser(recover=True, remove_blank_text=True, resolve_entities=False)
        text = analysis.unicode_text(location)
        xdoc = etree.fromstring(_as_unicode_bytes(text), parser)
        return handler(xdoc)
def __init__(self, location):
    """
    Build a POM from the POM file at `location`.

    The file is read as UTF-8 text, falling back to
    `analysis.unicode_text` when it cannot be decoded as UTF-8. Anything
    before the `<project` tag is dropped and the first match of
    STRIP_NAMESPACE_RE is replaced by a bare `<project>` tag before parsing.
    """
    # NOTE: most of this mirrors Pom.__init__
    try:
        with codecs.open(location, 'rb', encoding='UTF-8') as pom_file:
            pom_text = pom_file.read()
    except UnicodeDecodeError:
        pom_text = analysis.unicode_text(location)

    # NOTE(review): when '<project' is absent, find() returns -1 and the
    # slice below keeps only the last character of the text.
    project_start = pom_text.find('<project')
    pom_text = STRIP_NAMESPACE_RE.sub('<project>', pom_text[project_start:], 1)
    self._xml = etree.fromstring(pom_text, parser=POM_PARSER)

    # FIXME: no client is used for now: there are pending issues at
    # pymaven to address this
    self._client = None

    # short local aliases for the lookups repeated below
    get_attr = self._get_attribute
    find_parties = self._find_parties
    find_repos = self._find_repositories

    # project coordinates
    self.model_version = get_attr('modelVersion')
    self.group_id = get_attr('groupId')
    self.artifact_id = get_attr('artifactId')
    self.version = get_attr('version')
    self.classifier = get_attr('classifier')
    # the packaging is 'jar' when the POM does not provide one
    self.packaging = get_attr('packaging') or 'jar'

    # descriptive data
    self.name = get_attr('name')
    self.description = get_attr('description')
    self.inception_year = get_attr('inceptionYear')
    self.url = get_attr('url')
    self.organization_name = get_attr('organization/name')
    self.organization_url = get_attr('organization/url')

    # licenses, people and mailing lists
    self.licenses = list(self._find_licenses())
    self.developers = list(find_parties('developers/developer'))
    self.contributors = list(find_parties('contributors/contributor'))
    self.mailing_lists = list(self._find_mailing_lists())

    # project infrastructure
    self.scm = self._find_scm()
    self.issue_management = self._find_issue_management()
    self.ci_management = self._find_ci_management()
    self.distribution_management = self._find_distribution_management()
    self.repositories = list(find_repos('repositories/repository'))
    self.plugin_repositories = list(
        find_repos('pluginRepositories/pluginRepository'))
    self.modules = self._get_attributes_list('modules/module')

    # FIXME: this should be collected together with the parent, but
    # pymaven does not retrieve it yet. It is the relative path where the
    # full parent POM can be found.
    self.parent_relative_path = get_attr('relativePath')  # or '../pom.xml'

    # FIXME: other types are not collected for now (or only indirectly
    # through dependencies management): build, reporting, profiles, etc

    # dynamic attributes, all unset at construction time
    self._parent = None
    self._dep_mgmt = None
    self._dependencies = None
    self._properties = None
def _parse_as_string(location):
    """
    Return an etree doc for the XML file at `location`, parsed from the
    file content obtained as unicode text rather than from the file itself,
    trying hard to get unicode.
    """
    recovering_parser = etree.XMLParser(
        recover=True,
        remove_blank_text=True,
        resolve_entities=False,
    )
    unicode_xml = analysis.unicode_text(location)
    xml_bytes = _as_unicode_bytes(unicode_xml)
    return etree.fromstring(xml_bytes, recovering_parser)
def __init__(self, location=None, text=None):
    """
    Build a POM from either the POM file at `location` or a unicode `text`.

    Exactly one of `location` or `text` must be provided: this is checked
    with an assert. A file is read as UTF-8 text, falling back to
    `analysis.unicode_text` when it cannot be decoded as UTF-8.
    """
    assert bool(location) != bool(text)

    if not location:
        pom_text = text
    else:
        try:
            with io.open(location, encoding='utf-8') as pom_file:
                pom_text = pom_file.read()
        except UnicodeDecodeError:
            pom_text = analysis.unicode_text(location)

    # the parser is fed UTF-8 encoded bytes of the namespace-stripped text
    pom_bytes = strip_namespace(pom_text).encode('utf-8')
    if TRACE:
        logger.debug('MavenPom.__init__: xml_text: {}'.format(pom_bytes))

    self._pom_data = etree.fromstring(pom_bytes, parser=pom.POM_PARSER)

    # XML comments are collected first, then stripped from the elements tree
    self.comments = self._get_comments()
    etree.strip_tags(self._pom_data, etree.Comment)

    # FIXME: no client is used for now.
    # There are pending issues at pymaven to address this
    self._client = None

    # short local aliases for the lookups repeated below
    get_attr = self._get_attribute
    find_parties = self._find_parties
    find_repos = self._find_repositories

    # older POMs (version 3) use pomVersion rather than modelVersion
    self.model_version = get_attr('modelVersion') or get_attr('pomVersion')
    self.group_id = get_attr('groupId')
    self.artifact_id = get_attr('artifactId')
    if TRACE:
        logger.debug('MavenPom.__init__: self.artifact_id: {}'.format(self.artifact_id))

    self.version = get_attr('version')
    self.classifier = get_attr('classifier')
    # the packaging is 'jar' when the POM does not provide one
    self.packaging = get_attr('packaging') or 'jar'

    # descriptive data
    self.name = get_attr('name')
    self.description = get_attr('description')
    self.inception_year = get_attr('inceptionYear')
    self.url = get_attr('url')
    self.organization_name = get_attr('organization/name')
    self.organization_url = get_attr('organization/url')

    # licenses, people and mailing lists
    self.licenses = list(self._find_licenses())
    self.developers = list(find_parties('developers/developer'))
    self.contributors = list(find_parties('contributors/contributor'))
    self.mailing_lists = list(self._find_mailing_lists())

    # project infrastructure
    self.scm = self._find_scm()
    self.issue_management = self._find_issue_management()
    self.ci_management = self._find_ci_management()
    self.distribution_management = self._find_distribution_management()
    self.repositories = list(find_repos('repositories/repository'))
    self.plugin_repositories = list(find_repos('pluginRepositories/pluginRepository'))
    self.modules = self._get_attributes_list('modules/module')

    # FIXME: this should be collected together with the parent, but
    # pymaven does not retrieve it yet. It is the relative path where the
    # full parent POM can be found.
    self.parent_relative_path = get_attr('relativePath')  # or '../pom.xml_text'

    # FIXME: other types are not collected for now (or only indirectly
    # through dependencies management): build, reporting, profiles, etc

    # dynamic attributes, all unset at construction time
    self._parent = None
    self._dep_mgmt = None
    self._dependencies = None
    self._properties = None
def __init__(self, location=None, text=None):
    """
    Build a POM from either the POM file at `location` or a unicode `text`.

    Exactly one of `location` or `text` must be provided: this is checked
    with an assert. A file is read as UTF-8 text, falling back to
    `analysis.unicode_text` when it cannot be decoded as UTF-8. XML
    comments are collected in `self.comments` and then stripped from the
    parsed tree.
    """
    assert bool(location) != bool(text)

    # NOTE: most of this is derived from pymaven.Pom.__init__
    if not location:
        pom_text = text
    else:
        try:
            with io.open(location, encoding='utf-8') as pom_file:
                pom_text = pom_file.read()
        except UnicodeDecodeError:
            pom_text = analysis.unicode_text(location)

    # NOTE(review): when '<project' is absent, find() returns -1 and the
    # slice below keeps only the last character of the text.
    project_start = pom_text.find('<project')
    pom_text = STRIP_NAMESPACE_RE.sub('<project>', pom_text[project_start:], 1)

    pom_parser = etree.XMLParser(
        recover=True,
        # comments are kept at parsing time in case they contain a license
        remove_comments=False,
        remove_pis=True,
        remove_blank_text=True,
        resolve_entities=False,
    )
    self._xml = etree.fromstring(pom_text, parser=pom_parser)

    # XML comments are collected first, then stripped from the elements tree
    self.comments = self._get_comments()
    etree.strip_tags(self._xml, etree.Comment)

    # FIXME: no client is used for now. There are pending issues at
    # pymaven to address this
    self._client = None

    # short local aliases for the lookups repeated below
    get_attr = self._get_attribute
    find_parties = self._find_parties
    find_repos = self._find_repositories

    # POMs at version 3 use pomVersion rather than modelVersion
    self.model_version = get_attr('modelVersion') or get_attr('pomVersion')
    self.group_id = get_attr('groupId')
    self.artifact_id = get_attr('artifactId')
    if TRACE:
        logger.debug('MavenPom.__init__: self.artifact_id: {}'.format(self.artifact_id))

    self.version = get_attr('version')
    self.classifier = get_attr('classifier')
    # the packaging is 'jar' when the POM does not provide one
    self.packaging = get_attr('packaging') or 'jar'

    # descriptive data
    self.name = get_attr('name')
    self.description = get_attr('description')
    self.inception_year = get_attr('inceptionYear')
    self.url = get_attr('url')
    self.organization_name = get_attr('organization/name')
    self.organization_url = get_attr('organization/url')

    # licenses, people and mailing lists
    self.licenses = list(self._find_licenses())
    self.developers = list(find_parties('developers/developer'))
    self.contributors = list(find_parties('contributors/contributor'))
    self.mailing_lists = list(self._find_mailing_lists())

    # project infrastructure
    self.scm = self._find_scm()
    self.issue_management = self._find_issue_management()
    self.ci_management = self._find_ci_management()
    self.distribution_management = self._find_distribution_management()
    self.repositories = list(find_repos('repositories/repository'))
    self.plugin_repositories = list(find_repos('pluginRepositories/pluginRepository'))
    self.modules = self._get_attributes_list('modules/module')

    # FIXME: this should be collected together with the parent, but
    # pymaven does not retrieve it yet. It is the relative path where the
    # full parent POM can be found.
    self.parent_relative_path = get_attr('relativePath')  # or '../pom.xml'

    # FIXME: other types are not collected for now (or only indirectly
    # through dependencies management): build, reporting, profiles, etc

    # dynamic attributes, all unset at construction time
    self._parent = None
    self._dep_mgmt = None
    self._dependencies = None
    self._properties = None
def __init__(self, location=None, text=None):
    """
    Build a POM from either the POM file at `location` or a unicode `text`.

    Exactly one of `location` or `text` must be provided: this is checked
    with an assert. A file is read as UTF-8 text, falling back to
    `analysis.unicode_text` when it cannot be decoded as UTF-8. The parser
    used here removes XML comments and processing instructions.
    """
    assert bool(location) != bool(text)

    # NOTE: most of this mirrors Pom.__init__
    if not location:
        pom_text = text
    else:
        try:
            with codecs.open(location, 'rb', encoding='UTF-8') as pom_file:
                pom_text = pom_file.read()
        except UnicodeDecodeError:
            pom_text = analysis.unicode_text(location)

    # NOTE(review): when '<project' is absent, find() returns -1 and the
    # slice below keeps only the last character of the text.
    project_start = pom_text.find('<project')
    pom_text = STRIP_NAMESPACE_RE.sub('<project>', pom_text[project_start:], 1)

    pom_parser = etree.XMLParser(
        recover=True,
        remove_comments=True,
        remove_pis=True,
        remove_blank_text=True,
        resolve_entities=False,
    )
    self._xml = etree.fromstring(pom_text, parser=pom_parser)

    # FIXME: no client is used for now: there are pending issues at
    # pymaven to address this
    self._client = None

    # short local aliases for the lookups repeated below
    get_attr = self._get_attribute
    find_parties = self._find_parties
    find_repos = self._find_repositories

    # POMs at version 3 use pomVersion rather than modelVersion
    self.model_version = get_attr('modelVersion') or get_attr('pomVersion')
    self.group_id = get_attr('groupId')
    self.artifact_id = get_attr('artifactId')
    self.version = get_attr('version')
    self.classifier = get_attr('classifier')
    # the packaging is 'jar' when the POM does not provide one
    self.packaging = get_attr('packaging') or 'jar'

    # descriptive data
    self.name = get_attr('name')
    self.description = get_attr('description')
    self.inception_year = get_attr('inceptionYear')
    self.url = get_attr('url')
    self.organization_name = get_attr('organization/name')
    self.organization_url = get_attr('organization/url')

    # licenses, people and mailing lists
    self.licenses = list(self._find_licenses())
    self.developers = list(find_parties('developers/developer'))
    self.contributors = list(find_parties('contributors/contributor'))
    self.mailing_lists = list(self._find_mailing_lists())

    # project infrastructure
    self.scm = self._find_scm()
    self.issue_management = self._find_issue_management()
    self.ci_management = self._find_ci_management()
    self.distribution_management = self._find_distribution_management()
    self.repositories = list(find_repos('repositories/repository'))
    self.plugin_repositories = list(find_repos('pluginRepositories/pluginRepository'))
    self.modules = self._get_attributes_list('modules/module')

    # FIXME: this should be collected together with the parent, but
    # pymaven does not retrieve it yet. It is the relative path where the
    # full parent POM can be found.
    self.parent_relative_path = get_attr('relativePath')  # or '../pom.xml'

    # FIXME: other types are not collected for now (or only indirectly
    # through dependencies management): build, reporting, profiles, etc

    # dynamic attributes, all unset at construction time
    self._parent = None
    self._dep_mgmt = None
    self._dependencies = None
    self._properties = None