Пример #1
0
 def startElementHandler(self, name, attrs):
     """Handle an XML start tag by creating the container for the element.

     If ``self.is_schema`` is set and the tag carries exactly one attribute,
     the value of that attribute is used as the schema location: the XSD is
     read from the local copy returned by ``open_xsd_file`` or, if there is
     none, downloaded with ``_urlopen`` and stored with ``save_xsd_file``.
     The text is then handed to ``parse_xsd``.

     Afterwards ``self.content`` is reset and a new element object is chosen
     depending on whether *name* is found in ``self.lists``,
     ``self.dictionaries``, ``self.structures`` or ``self.items``.  For tags
     listed in ``self.strings``, ``self.errors`` or ``self.integers`` only
     the attributes are remembered in ``self.attributes``.  The new object
     is attached to the element on top of ``self.stack`` and then pushed
     onto that stack.

     Raises ValidationError if the tag is unknown and ``self.validating``
     is true.
     """
     # preprocessing the xml schema
     if self.is_schema and len(attrs) == 1:
         schema = list(attrs.values())[0]
         xsd_name = os.path.basename(schema)
         handle = self.open_xsd_file(xsd_name)
         # if there is no local xsd file grab the url and parse the file
         downloaded = not handle
         if downloaded:
             handle = _urlopen(schema)
         try:
             text = handle.read()
         finally:
             # Release the handle even if reading fails.
             handle.close()
         if downloaded:
             self.save_xsd_file(xsd_name, text)
         self.parse_xsd(ET.fromstring(text))
     self.content = ""
     if name in self.lists:
         element = ListElement()
     elif name in self.dictionaries:
         element = DictionaryElement()
     elif name in self.structures:
         element = StructureElement(self.structures[name])
     elif name in self.items:  # Only appears in ESummary
         # The real name and type of an Item are stored in its attributes.
         name = str(attrs["Name"])  # convert from Unicode
         del attrs["Name"]
         itemtype = str(attrs["Type"])  # convert from Unicode
         del attrs["Type"]
         if itemtype == "Structure":
             element = DictionaryElement()
         elif name in ("ArticleIds", "History"):
             element = StructureElement(["pubmed", "medline"])
         elif itemtype == "List":
             element = ListElement()
         else:
             element = StringElement()
         element.itemname = name
         element.itemtype = itemtype
     elif (name in self.strings or name in self.errors
           or name in self.integers):
         # No container is created here; only the attributes are kept.
         self.attributes = attrs
         return
     else:
         # Element not found in DTD
         if self.validating:
             raise ValidationError(name)
         # this will not be stored in the record
         element = ""
     # Deliberately a comparison by value: it skips the "" placeholder and
     # any element that compares equal to the empty string.
     # NOTE(review): presumably a fresh StringElement also compares equal
     # to "" and is therefore not attached here - confirm before changing.
     if element != "":
         element.tag = name
         if attrs:
             element.attributes = dict(attrs)
         if self.stack:
             current = self.stack[-1]
             try:
                 current.append(element)
             except AttributeError:
                 # The parent has no append(); store the child by name.
                 current[name] = element
     self.stack.append(element)
Пример #2
0
 def startElementHandler(self, name, attrs):
     """Create the object that will hold the element opened by this tag.

     Schema handling: when ``self.is_schema`` is true and *attrs* has
     exactly one entry, its value is taken as the location of an XSD file.
     A local copy is requested from ``open_xsd_file``; if that yields
     nothing the file is fetched with ``_urlopen`` and saved through
     ``save_xsd_file``.  Either way the text is passed to ``parse_xsd``.

     Element handling: ``self.content`` is cleared, then *name* is looked
     up in ``self.lists``, ``self.dictionaries``, ``self.structures`` and
     ``self.items`` to decide which element class to instantiate.  Tags
     found in ``self.strings``, ``self.errors`` or ``self.integers`` only
     have their attributes stored in ``self.attributes`` and nothing is
     pushed.  Otherwise the new object is added to the element on top of
     ``self.stack`` and pushed onto the stack itself.

     Raises ValidationError for an unknown tag when ``self.validating`` is
     true.
     """
     # preprocessing the xml schema
     if self.is_schema and len(attrs) == 1:
         schema = list(attrs.values())[0]
         basename = os.path.basename(schema)
         handle = self.open_xsd_file(basename)
         # if there is no local xsd file grab the url and parse the file
         need_download = not handle
         if need_download:
             handle = _urlopen(schema)
         try:
             text = handle.read()
         finally:
             # Always close the handle, also when read() raises.
             handle.close()
         if need_download:
             self.save_xsd_file(basename, text)
         self.parse_xsd(ET.fromstring(text))
     self.content = ""
     if name in self.lists:
         node = ListElement()
     elif name in self.dictionaries:
         node = DictionaryElement()
     elif name in self.structures:
         node = StructureElement(self.structures[name])
     elif name in self.items:  # Only appears in ESummary
         # Name and Type are consumed here and removed from the attributes.
         name = str(attrs["Name"])  # convert from Unicode
         del attrs["Name"]
         itemtype = str(attrs["Type"])  # convert from Unicode
         del attrs["Type"]
         if itemtype == "Structure":
             node = DictionaryElement()
         elif name in ("ArticleIds", "History"):
             node = StructureElement(["pubmed", "medline"])
         elif itemtype == "List":
             node = ListElement()
         else:
             node = StringElement()
         node.itemname = name
         node.itemtype = itemtype
     elif (name in self.strings or name in self.errors
           or name in self.integers):
         # Only the attributes are recorded for these tags.
         self.attributes = attrs
         return
     else:
         # Element not found in DTD
         if self.validating:
             raise ValidationError(name)
         # this will not be stored in the record
         node = ""
     # Value comparison on purpose: anything equal to "" is not attached.
     # NOTE(review): this presumably includes an empty StringElement as
     # well as the "" placeholder - verify before replacing this test.
     if node != "":
         node.tag = name
         if attrs:
             node.attributes = dict(attrs)
         if self.stack:
             parent = self.stack[-1]
             try:
                 parent.append(node)
             except AttributeError:
                 # Parent without append(): fall back to item assignment.
                 parent[name] = node
     self.stack.append(node)
Пример #3
0
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Any exception
    raised while opening the URL propagates to the caller unchanged.

    The argument post should be a boolean to explicitly control if an HTTP
    POST should be used rather than an HTTP GET based on the query length.
    By default (post=None), POST is used if the URL encoded parameters
    would be over 1000 characters long.

    This function also spaces out consecutive requests to avoid abusing
    the NCBI servers: at least a third of a second between queries, or a
    tenth of a second when the module-level api_key is set.
    """
    params = _construct_params(params)
    options = _encode_options(ecitmatch, params)

    # NCBI requirement: At most three queries per second if no API key is
    # provided. Equivalently, at least a third of second between queries.
    delay = 0.1 if api_key else 0.333333334
    current = time.time()
    # _open.previous holds the time stamp recorded for the previous request.
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # By default, post is None. Set to a boolean to over-ride length choice:
    if post is None and len(options) > 1000:
        post = True
    cgi = _construct_cgi(cgi, post, options)

    # No try/except here: a handler that only re-raises adds nothing.
    if post:
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        handle = _urlopen(cgi)

    return _binary_to_string_handle(handle)
Пример #4
0
    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """Handle external entity reference in order to cache DTD locally.

        The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. If the DTD is not found locally,
        we try to download it. If new DTDs become available from NCBI,
        putting them in Bio/Entrez/DTDs will allow the parser to see them.

        The resolved URL of the DTD is pushed onto ``self.dtd_urls`` while
        the DTD is being parsed, so that relative references inside it can
        be resolved, and is removed again afterwards (also on failure).

        Returns 1 once the DTD has been parsed.  Raises ValueError for a
        URL scheme other than http, https, ftp or none, and RuntimeError if
        the DTD is not available locally and cannot be opened via its URL.
        """
        scheme = _urlparse(systemId)[0]
        if scheme in ('http', 'https', 'ftp'):
            # Then this is an absolute path to the DTD.
            url = systemId
        elif scheme == '':
            # Then this is a relative path to the DTD.
            # Look at the parent URL to find the full path.
            try:
                source = self.dtd_urls[-1]
            except IndexError:
                # Assume the default URL for DTDs if the top parent
                # does not contain an absolute path
                source = "http://www.ncbi.nlm.nih.gov/dtd/"
            else:
                source = os.path.dirname(source)
            # urls always have a forward slash, don't use os.path.join
            url = source.rstrip("/") + "/" + systemId
        else:
            raise ValueError("Unexpected URL scheme %r" % (scheme))
        self.dtd_urls.append(url)
        try:
            # First, try to load the local version of the DTD file
            filename = os.path.basename(systemId)
            handle = self.open_dtd_file(filename)
            if not handle:
                # DTD is not available as a local file. Try accessing it
                # through the internet instead.
                try:
                    handle = _urlopen(url)
                except IOError:
                    raise RuntimeError("Failed to access %s at %s"
                                       % (filename, url))
                try:
                    text = handle.read()
                finally:
                    handle.close()
                self.save_dtd_file(filename, text)
                handle = BytesIO(text)
            try:
                parser = self.parser.ExternalEntityParserCreate(context)
                parser.ElementDeclHandler = self.elementDecl
                parser.ParseFile(handle)
            finally:
                # Close the DTD handle even if parsing fails.
                handle.close()
        finally:
            # Keep the URL stack balanced whatever happened above.
            self.dtd_urls.pop()
        return 1
Пример #5
0
 def startElementHandler(self, name, attrs):
     """Handle an XML start tag by installing a consumer for the element.

     The tag is first offered to the current consumer; if its
     ``startElementHandler`` returns a true value nothing else is done.

     If ``self.is_schema`` is set and the tag carries exactly one attribute,
     the value of that attribute is used as the schema location: the XSD is
     read from the local copy returned by ``open_xsd_file`` or, if there is
     none, downloaded with ``_urlopen`` and stored with ``save_xsd_file``.
     The text is then handed to ``parse_xsd``.

     Finally the class registered for *name* in ``self.classes`` (or
     ``Consumer`` for an unknown tag when not validating) is instantiated
     with ``(name, attrs)``, linked to the previous consumer through its
     ``parent`` attribute, and made the current consumer.

     Raises ValidationError if the tag is unknown and ``self.validating``
     is true.
     """
     # First, check if the current consumer can use the tag
     if self.consumer is not None:
         if self.consumer.startElementHandler(name, attrs):
             return
     # preprocessing the xml schema
     if self.is_schema and len(attrs) == 1:
         schema = list(attrs.values())[0]
         xsd_name = os.path.basename(schema)
         handle = self.open_xsd_file(xsd_name)
         # if there is no local xsd file grab the url and parse the file
         downloaded = not handle
         if downloaded:
             handle = _urlopen(schema)
         try:
             text = handle.read()
         finally:
             # Release the handle even if reading fails.
             handle.close()
         if downloaded:
             self.save_xsd_file(xsd_name, text)
         self.parse_xsd(ET.fromstring(text))
     cls = self.classes.get(name)
     if cls is not None:
         consumer = cls(name, attrs)
     elif self.validating:
         # Element not found in DTD
         raise ValidationError(name)
     else:
         # this will not be stored in the record
         consumer = Consumer(name, attrs)
     consumer.parent = self.consumer
     if self.consumer is None:
         # This is relevant only for Entrez.parse, not for Entrez.read.
         # If self.consumer is None, then this is the first start tag we
         # encounter, and it should refer to a list. Store this list in
         # the record attribute, so that Entrez.parse can iterate over it.
         # The record attribute will be set again at the last end tag;
         # However, it doesn't hurt to set it twice.
         value = consumer.value
         if value is not None:
             self.record = value
     self.consumer = consumer
Пример #6
0
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it; entries whose
    value is None are left out, and the dictionary passed in by the caller
    is not modified.  Any exception raised while opening the URL
    propagates to the caller unchanged.

    The argument post should be a boolean to explicitly control if an HTTP
    POST should be used rather than an HTTP GET based on the query length.
    By default (post=None), POST is used if the URL encoded parameters
    would be over 1000 characters long.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    # _open.previous holds the time stamp recorded for the previous request.
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Remove None values from the parameters. Build a new dictionary so
    # that the one owned by the caller is left untouched.
    if params is None:
        params = {}
    else:
        params = dict((key, value) for key, value in params.items()
                      if value is not None)
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    # _urlencode encodes pipes, which NCBI expects in ECitMatch
    if ecitmatch:
        options = options.replace('%7C', '|')

    # By default, post is None. Set to a boolean to over-ride length choice:
    if post is None and len(options) > 1000:
        post = True
    # No try/except here: a handler that only re-raises adds nothing.
    if post:
        # HTTP POST
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        # HTTP GET
        handle = _urlopen(cgi + "?" + options)

    return _binary_to_string_handle(handle)
Пример #7
0
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Options whose
    value is None are dropped; this is done on a copy, so the dictionary
    supplied by the caller is never changed.  Exceptions raised while
    opening the URL are passed on to the caller as they are.

    The argument post should be a boolean to explicitly control if an HTTP
    POST should be used rather than an HTTP GET based on the query length.
    By default (post=None), POST is used if the URL encoded parameters
    would be over 1000 characters long.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    # The time stamp of the previous request is kept on the function itself.
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Copy the parameters, leaving out None values, instead of deleting
    # entries from the dictionary the caller handed in.
    query = {}
    if params is not None:
        for key, value in params.items():
            if value is not None:
                query[key] = value
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in query:
        query["tool"] = tool
    # Tell Entrez who we are
    if "email" not in query:
        if email is not None:
            query["email"] = email
        else:
            warnings.warn(
                """
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # Open a handle to Entrez.
    options = _urlencode(query, doseq=True)
    # _urlencode encodes pipes, which NCBI expects in ECitMatch
    if ecitmatch:
        options = options.replace('%7C', '|')

    # By default, post is None. Set to a boolean to over-ride length choice:
    if post is None and len(options) > 1000:
        post = True
    # Errors from _urlopen propagate on their own; catching them only to
    # raise them again served no purpose.
    if post:
        # HTTP POST
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        # HTTP GET
        handle = _urlopen(cgi + "?" + options)

    return _binary_to_string_handle(handle)