Exemplo n.º 1
0
    def _initialize_directory(self):
        """Initialize the local DTD/XSD directories (PRIVATE).

        Added to allow for custom directory (cache) locations,
        for example when code is deployed on AWS Lambda.

        If ``self.directory`` is None, a per-user default is chosen:
        ``%APPDATA%/biopython`` on Windows (falling back to the user's home
        directory when ``APPDATA`` is not set) and ``~/.config/biopython``
        on every other platform.

        Sets ``self.local_dtd_dir`` and ``self.local_xsd_dir`` to the
        ``Bio/Entrez/DTDs`` and ``Bio/Entrez/XSDs`` subdirectories of
        ``self.directory`` and creates both if they do not exist yet.

        Raises the underlying OSError if a directory cannot be created and
        does not already exist as a directory.
        """
        # If user hasn't set a custom cache location, initialize it.
        if self.directory is None:
            import platform

            if platform.system() == "Windows":
                # os.getenv returns None when APPDATA is unset, which would
                # make os.path.join fail with an unhelpful TypeError.
                base = os.getenv("APPDATA") or os.path.expanduser("~")
                self.directory = os.path.join(base, "biopython")
            else:  # Unix/Linux/Mac
                self.directory = os.path.join(
                    os.path.expanduser("~"), ".config", "biopython"
                )

        entrez_dir = os.path.join(self.directory, "Bio", "Entrez")
        self.local_dtd_dir = os.path.join(entrez_dir, "DTDs")
        self.local_xsd_dir = os.path.join(entrez_dir, "XSDs")

        for path in (self.local_dtd_dir, self.local_xsd_dir):
            try:
                os.makedirs(path)  # use exist_ok=True on Python >= 3.2
            except OSError as exception:
                # Trying os.makedirs first and only then checking
                # os.path.isdir avoids a race condition between the check
                # and the creation. An existing directory is fine; anything
                # else (e.g. a regular file in the way, or missing
                # permissions) is re-raised.
                if not os.path.isdir(path):
                    _raise_from(exception, None)
Exemplo n.º 2
0
    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """Handle external entity reference in order to cache DTD locally.

        The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. If the DTD is not found locally,
        we try to download it. If new DTDs become available from NCBI,
        putting them in Bio/Entrez/DTDs will allow the parser to see them.

        Arguments:
         - context  - opaque value passed on to ExternalEntityParserCreate.
         - base     - not used by this handler.
         - systemId - absolute (http/https/ftp) or relative URL of the DTD.
         - publicId - not used by this handler.

        Returns 1 after the DTD was parsed.

        Raises ValueError if systemId uses any other URL scheme, and
        RuntimeError if the DTD is not available locally and cannot be
        downloaded.

        The DTD handle is always closed and the URL pushed onto
        ``self.dtd_urls`` is always popped again, even if reading or
        parsing the DTD fails.
        """
        urlinfo = _urlparse(systemId)
        # Index 0 is the scheme; indexing (rather than urlinfo.scheme) is
        # kept for consistency with the rest of the file.
        scheme = urlinfo[0]
        if scheme in ("http", "https", "ftp"):
            # Then this is an absolute path to the DTD.
            url = systemId
        elif scheme == "":
            # Then this is a relative path to the DTD.
            # Look at the parent URL to find the full path.
            try:
                source = self.dtd_urls[-1]
            except IndexError:
                # Assume the default URL for DTDs if the top parent
                # does not contain an absolute path
                source = "http://www.ncbi.nlm.nih.gov/dtd/"
            else:
                source = os.path.dirname(source)
            # urls always have a forward slash, don't use os.path.join
            url = source.rstrip("/") + "/" + systemId
        else:
            raise ValueError("Unexpected URL scheme %r" % (scheme))

        # Nested DTDs resolve their relative references against this URL.
        self.dtd_urls.append(url)
        try:
            # First, try to load the local version of the DTD file
            filename = os.path.basename(systemId)
            handle = self.open_dtd_file(filename)
            if not handle:
                # DTD is not available as a local file. Try accessing it
                # through the internet instead.
                try:
                    handle = _urlopen(url)
                except IOError:
                    _raise_from(
                        RuntimeError("Failed to access %s at %s" % (filename, url)),
                        None,
                    )
                try:
                    text = handle.read()
                finally:
                    # Do not leak the connection if the download fails midway.
                    handle.close()
                self.save_dtd_file(filename, text)
                handle = BytesIO(text)

            try:
                parser = self.parser.ExternalEntityParserCreate(context)
                parser.ElementDeclHandler = self.elementDecl
                parser.ParseFile(handle)
            finally:
                handle.close()
        finally:
            # Keep the URL stack balanced, also when an error propagates.
            self.dtd_urls.pop()
        return 1
Exemplo n.º 3
0
    def parse(self, handle):
        """Parse the XML in the given file handle, yielding records.

        The handle is read in blocks of 1024 and each block is fed to
        ``self.parser``. After every block ``self.record`` is inspected;
        it must be a list. All entries but the last one are complete and
        are yielded immediately, the last one may still be under
        construction. Once the handle is exhausted the remaining entries
        are yielded as well.

        Raises CorruptedXMLError or NotXMLError if expat reports an error
        or no record was produced, ValueError if the XML does not describe
        a list, and CorruptedXMLError if the stream ends while an element
        is still open.
        """
        BLOCK = 1024
        while True:
            # Fetch the next piece of the file and hand it to expat.
            chunk = handle.read(BLOCK)
            try:
                self.parser.Parse(chunk, False)
            except expat.ExpatError as error:
                # With a start element handler installed we did see the
                # initial XML declaration, so the data is XML but most
                # likely corrupted. Without it the input is probably not
                # XML at all.
                if self.parser.StartElementHandler:
                    wrapper = CorruptedXMLError
                else:
                    wrapper = NotXMLError
                _raise_from(wrapper(error), None)

            try:
                records = self.record
            except AttributeError:
                if self.parser.StartElementHandler:
                    # The XML declaration was seen and expat reported no
                    # error, so self.record ought to exist; if it does not,
                    # that is a bug.
                    failure = RuntimeError(
                        "Failed to parse the XML file correctly, possibly due to a bug "
                        "in Bio.Entrez. Please contact the Biopython developers via "
                        "the mailing list or GitHub for assistance."
                    )
                else:
                    # No XML declaration was seen, so the input is probably
                    # not in XML format.
                    failure = NotXMLError("XML declaration not found")
                _raise_from(failure, None)

            if not isinstance(records, list):
                raise ValueError(
                    "The XML file does not represent a list. Please use Entrez.read "
                    "instead of Entrez.parse"
                )

            if not chunk:
                # Nothing left to read.
                break

            # Everything except the last entry is finished; the last one
            # is still a work in progress.
            while len(records) > 1:
                yield records.pop(0)

        # We have reached the end of the XML file
        self.parser = None
        if self.element is not None:
            # The data ran out while an element was still being processed.
            raise CorruptedXMLError("Premature end of XML stream")

        # Hand out whatever is left.
        for leftover in records:
            yield leftover
Exemplo n.º 4
0
    def read(self, handle):
        """Parse the whole handle with ``self.parser`` and return the record.

        Text-mode wrappers are unwrapped (or, for a StringIO, converted)
        so that the parser receives binary data. Returns ``self.record``.

        Raises IOError for a closed handle, CorruptedXMLError or
        NotXMLError if expat reports an error, and RuntimeError or
        NotXMLError if parsing finished without producing a record.
        """
        # HACK: undo the Bio._py3k handle conversion, because the Entrez
        # XML parser works on binary data.
        if handle.__class__.__name__ == "EvilHandleHack":
            handle = handle._handle
        if handle.__class__.__name__ == "TextIOWrapper":
            handle = handle.buffer

        if hasattr(handle, "closed") and handle.closed:
            # Guards against a possible Segmentation Fault, see:
            # http://bugs.python.org/issue4877
            raise IOError("Can't parse a closed handle")

        if sys.version_info >= (3,):
            # Another hack: a unicode StringIO handle has to be turned
            # into bytes, since that is what the parser expects.
            from io import StringIO

            if isinstance(handle, StringIO):
                from Bio._py3k import _as_bytes

                handle = BytesIO(_as_bytes(handle.read()))

        try:
            self.parser.ParseFile(handle)
        except expat.ExpatError as error:
            # With a start element handler installed we did see the initial
            # XML declaration, so the data is XML but most likely corrupted.
            # Without it the input is probably not XML at all.
            if self.parser.StartElementHandler:
                wrapper = CorruptedXMLError
            else:
                wrapper = NotXMLError
            _raise_from(wrapper(error), None)

        try:
            return self.record
        except AttributeError:
            if self.parser.StartElementHandler:
                # The XML declaration was seen and expat reported no error,
                # so self.record ought to exist; if it does not, that is a
                # bug.
                failure = RuntimeError(
                    "Failed to parse the XML file correctly, possibly due to a bug "
                    "in Bio.Entrez. Please contact the Biopython developers via "
                    "the mailing list or GitHub for assistance."
                )
            else:
                # No XML declaration was seen, so the input is probably not
                # in XML format.
                failure = NotXMLError("XML declaration not found")
            _raise_from(failure, None)