Пример #1
0
class make_rate_limited_function:
    """make_rate_limited_function(function, delay) -> callable object

    Create a version of function that does not run more often than
    once every delay seconds.

    """
    def __init__(self, function, delay):
        self.fn = function
        self.limiter = RequestLimiter(delay)
    def __call__(self, *args, **keywds):
        self.limiter.wait()
        return self.fn(*args, **keywds)
Пример #2
0
    def __init__(self, delay=5.0, parser=None):
        """__init__(self, delay=5.0, parser=None)

        Create a new Dictionary to access SwissProt.  parser is an optional
        parser (e.g. SProt.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        import warnings
        from Bio.WWW import RequestLimiter
        warnings.warn(
            "Bio.SwissProt.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_sprot_raw instead.",
            DeprecationWarning)
        self.parser = parser
        self.limiter = RequestLimiter(delay)
Пример #3
0
    def __init__(self, delay=5.0, parser=None):
        """Dictionary(delay=5.0, parser=None)

        Create a new Dictionary to access PubMed.  parser is an optional
        parser (e.g. Medline.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        self.parser = parser
        self.limiter = RequestLimiter(delay)
Пример #4
0
    def __init__(self, delay=5.0, parser=None):
        """__init__(self, delay=5.0, parser=None)

        Create a new Dictionary to access PROSITE.  parser is an optional
        parser (e.g. Prosite.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        import warnings
        from Bio.WWW import RequestLimiter
        warnings.warn("Bio.Prosite.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_prosite_raw instead.",
              DeprecationWarning)
        self.parser = parser
        self.limiter = RequestLimiter(delay)
Пример #5
0
class Dictionary:
    """Access PubMed using a read-only dictionary interface.

    Methods:
    
    """
    def __init__(self, delay=5.0, parser=None):
        """Dictionary(delay=5.0, parser=None)

        Create a new Dictionary to access PubMed.  parser is an optional
        parser (e.g. Medline.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        self.parser = parser
        self.limiter = RequestLimiter(delay)

    def __len__(self):
        raise NotImplementedError, "PubMed contains lots of entries"
    def clear(self):
        raise NotImplementedError, "This is a read-only dictionary"
    def __setitem__(self, key, item):
        raise NotImplementedError, "This is a read-only dictionary"
    def update(self):
        raise NotImplementedError, "This is a read-only dictionary"
    def copy(self):
        raise NotImplementedError, "You don't need to do this..."
    def keys(self):
        raise NotImplementedError, "You don't really want to do this..."
    def items(self):
        raise NotImplementedError, "You don't really want to do this..."
    def values(self):
        raise NotImplementedError, "You don't really want to do this..."
    
    def has_key(self, id):
        """S.has_key(id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        try:
            return self[id]
        except KeyError:
            return failobj
        raise "How did I get here?"

    def __getitem__(self, id):
        """S.__getitem__(id) -> object

        Return the Medline entry.  id is either the Medline Unique ID
        or the Pubmed ID of the article.  Raises a KeyError if there's an
        error.
        
        """
        # First, check to see if enough time has passed since my
        # last query.
        self.limiter.wait()
        
        try:
            handle = NCBI.efetch(
                db="pubmed", id=id, retmode='text', rettype='medlars')
        except IOError, x:
            # raise a KeyError instead of an IOError
            # XXX I really should distinguish between a real IOError and
            # if the id is not in the database.
            raise KeyError, x
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
Пример #6
0
def download_many(ids, callback_fn, broken_fn=None, delay=120.0, faildelay=5.0,
                  batchsize=500, parser=None):
    """download_many(ids, callback_fn[, broken_fn][, delay][, faildelay][, batchsize])

    Download many records from PubMed.  ids is a list of either the
    Medline Unique ID or the PubMed ID's of the articles.  Each time a
    record is downloaded, callback_fn is called with the text of the
    record.  broken_fn is an optional function that is called with the
    id of records that were not able to be downloaded.  delay is the
    number of seconds to wait between requests.  batchsize is the
    number of records to request each time.

    """
    # parser is an undocumented parameter that allows people to
    # specify an optional parser to handle each record.  This is
    # dangerous because the results may be malformed, and exceptions
    # in the parser may disrupt the whole download process.
    if batchsize > 500 or batchsize < 1:
        raise ValueError, "batchsize must be between 1 and 500"
    limiter = RequestLimiter(delay)
    current_batchsize = batchsize
    
    # Loop until all the ids are processed.  We want to process as
    # many as possible with each request.  Unfortunately, errors can
    # occur.  Some id may be incorrect, or the server may be
    # unresponsive.  In addition, one broken id out of a list of id's
    # can cause a non-specific error.  Thus, the strategy I'm going to
    # take, is to start by downloading as many as I can.  If the
    # request fails, I'm going to half the number of records I try to
    # get.  If there's only one more record, then I'll report it as
    # broken and move on.  If the request succeeds, I'll double the
    # number of records until I get back up to the batchsize.
    nsuccesses = 0
    while ids:
        if current_batchsize > len(ids):
            current_batchsize = len(ids)
        
        id_str = ','.join(ids[:current_batchsize])

        # Make sure enough time has passed before I do another query.
        if not nsuccesses:
            limiter.wait(faildelay)
        else:
            limiter.wait()
        try:
            # Query PubMed.  If one or more of the id's are broken,
            # this will raise an IOError.
            handle = NCBI.efetch(
                db="pubmed", id=id_str, retmode='text', rettype='medlars')

            # I'm going to check to make sure PubMed returned the same
            # number of id's as I requested.  If it didn't then I'm going
            # to raise an exception.  This could take a lot of memory if
            # the batchsize is large.
            results = handle.read()
            num_ids = 0
            for x in Medline.Iterator(File.StringHandle(results)):
                num_ids = num_ids + 1
            if num_ids != current_batchsize:
                raise IOError
            handle = File.StringHandle(results)
        except IOError:   # Query did not work.
            if current_batchsize == 1:
                # There was only 1 id in the query.  Report it as
                # broken and move on.
                id = ids.pop(0)
                if broken_fn is not None:
                    broken_fn(id)
            else:
                # I don't know which one is broken.  Try again with
                # fewer id's.
                current_batchsize = current_batchsize / 2
            nsuccesses = 0
            continue
        nsuccesses = nsuccesses + 1

        # Iterate through the results and pass the records to the
        # callback.
        idnum = 0
        for rec in Medline.Iterator(handle, parser):
            callback_fn(ids[idnum], rec)
            idnum = idnum + 1

        ids = ids[current_batchsize:]

        # If I'm not downloading the maximum number of articles,
        # double the number for next time.
        if nsuccesses >= 2 and current_batchsize < batchsize:
            current_batchsize = current_batchsize * 2
            if current_batchsize > batchsize:
                current_batchsize = batchsize
Пример #7
0
def search_for(search, reldate=None, mindate=None, maxdate=None,
               batchsize=100, delay=2, callback_fn=None,
               start_id=0, max_ids=None):
    """search_for(search[, reldate][, mindate][, maxdate]
    [, batchsize][, delay][, callback_fn][, start_id][, max_ids]) -> ids

    Search PubMed and return a list of the PMID's that match the
    criteria.  search is the search string used to search the
    database.  reldate is the number of dates prior to the current
    date to restrict the search.  mindate and maxdate are the dates to
    restrict the search, e.g. 2002/01/01.  batchsize specifies the
    number of ids to return at one time.  By default, it is set to
    10000, the maximum.  delay is the number of seconds to wait
    between queries (default 2).  callback_fn is an optional callback
    function that will be called as passed a PMID as results are
    retrieved.  start_id specifies the index of the first id to
    retrieve and max_ids specifies the maximum number of id's to
    retrieve.

    XXX The date parameters don't seem to be working with NCBI's
    script.  Please let me know if you can get it to work.
    
    """
    class ResultParser(sgmllib.SGMLParser):
        # Parse the ID's out of the XML-formatted page that PubMed
        # returns.  The format of the page is:
        # [...]
        #    <Id>...</Id>
        # [...]
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.ids = []
            self.in_id = 0
        def start_id(self, attributes):
            self.in_id = 1
        def end_id(self):
            self.in_id = 0
        _not_pmid_re = re.compile(r'\D')
        def handle_data(self, data):
            if not self.in_id:
                return
            # If data is just whitespace, then ignore it.
            data = string.strip(data)
            if not data:
                return
            # Everything here should be a PMID.  Check and make sure
            # data really is one.  A PMID should be a string consisting
            # of only integers.  Should I check to make sure it
            # meets a certain minimum length?
            if self._not_pmid_re.search(data):
                raise ValueError, \
                      "I expected an ID, but %s doesn't look like one." % \
                      repr(data)
            self.ids.append(data)

    params = {
        'db' : 'pubmed',
        'term' : search,
        'reldate' : reldate,
        'mindate' : mindate,
        'maxdate' : maxdate
        }
    for k, v in params.items():
        if v is None:
            del params[k]

    limiter = RequestLimiter(delay)
    ids = []
    while max_ids is None or len(ids) < max_ids:
        parser = ResultParser()
        
        # Check to make sure enough time has passed before my
        # last search.  If not, then wait.
        limiter.wait()

        start = start_id + len(ids)
        max = batchsize
        if max_ids is not None and max > max_ids - len(ids):
            max = max_ids - len(ids)

        params['retstart'] = start
        params['retmax'] = max
        h = NCBI.esearch(**params)
        parser.feed(h.read())
        ids.extend(parser.ids)
        if callback_fn is not None:
            # Call the callback function with each of the new ID's.
            for id in parser.ids:
                callback_fn(id)
        if len(parser.ids) < max or not parser.ids:  # no more id's to read
            break
    return ids
Пример #8
0
class ExPASyDictionary:
    """Access PROSITE at ExPASy using a read-only dictionary interface.

    """
    def __init__(self, delay=5.0, parser=None):
        """__init__(self, delay=5.0, parser=None)

        Create a new Dictionary to access PROSITE.  parser is an optional
        parser (e.g. Prosite.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        import warnings
        from Bio.WWW import RequestLimiter
        warnings.warn("Bio.Prosite.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_prosite_raw instead.",
              DeprecationWarning)
        self.parser = parser
        self.limiter = RequestLimiter(delay)

    def __len__(self):
        raise NotImplementedError("Prosite contains lots of entries")
    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")
    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")
    def update(self):
        raise NotImplementedError("This is a read-only dictionary")
    def copy(self):
        raise NotImplementedError("You don't need to do this...")
    def keys(self):
        raise NotImplementedError("You don't really want to do this...")
    def items(self):
        raise NotImplementedError("You don't really want to do this...")
    def values(self):
        raise NotImplementedError("You don't really want to do this...")
    
    def has_key(self, id):
        """has_key(self, id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        try:
            return self[id]
        except KeyError:
            return failobj

    def __getitem__(self, id):
        """__getitem__(self, id) -> object

        Return a Prosite entry.  id is either the id or accession
        for the entry.  Raises a KeyError if there's an error.
        
        """
        from Bio import ExPASy
        # First, check to see if enough time has passed since my
        # last query.
        self.limiter.wait()

        try:
            handle = ExPASy.get_prosite_entry(id)
        except IOError:
            raise KeyError(id)
        try:
            handle = File.StringHandle(_extract_record(handle))
        except ValueError:
            raise KeyError(id)
        
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
Пример #9
0
class ExPASyDictionary:
    """Access SwissProt at ExPASy using a read-only dictionary interface.

    """
    def __init__(self, delay=5.0, parser=None):
        """__init__(self, delay=5.0, parser=None)

        Create a new Dictionary to access SwissProt.  parser is an optional
        parser (e.g. SProt.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        self.parser = parser
        self.limiter = RequestLimiter(delay)

    def __len__(self):
        raise NotImplementedError, "SwissProt contains lots of entries"
    def clear(self):
        raise NotImplementedError, "This is a read-only dictionary"
    def __setitem__(self, key, item):
        raise NotImplementedError, "This is a read-only dictionary"
    def update(self):
        raise NotImplementedError, "This is a read-only dictionary"
    def copy(self):
        raise NotImplementedError, "You don't need to do this..."
    def keys(self):
        raise NotImplementedError, "You don't really want to do this..."
    def items(self):
        raise NotImplementedError, "You don't really want to do this..."
    def values(self):
        raise NotImplementedError, "You don't really want to do this..."
    
    def has_key(self, id):
        """has_key(self, id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        try:
            return self[id]
        except KeyError:
            return failobj
        raise "How did I get here?"

    def __getitem__(self, id):
        """__getitem__(self, id) -> object

        Return a SwissProt entry.  id is either the id or accession
        for the entry.  Raises a KeyError if there's an error.
        
        """
        # First, check to see if enough time has passed since my
        # last query.
        self.limiter.wait()

        try:
            handle = ExPASy.get_sprot_raw(id)
        except IOError:
            raise KeyError, id
        
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
Пример #10
0
def qblast(program, database, sequence,
           auto_format=None,composition_based_statistics=None,
           db_genetic_code=None,endpoints=None,entrez_query='(none)',
           expect=10.0,filter=None,gapcosts=None,genetic_code=None,
           hitlist_size=50,i_thresh=None,layout=None,lcase_mask=None,
           matrix_name=None,nucl_penalty=None,nucl_reward=None,
           other_advanced=None,perc_ident=None,phi_pattern=None,
           query_file=None,query_believe_defline=None,query_from=None,
           query_to=None,searchsp_eff=None,service=None,threshold=None,
           ungapped_alignment=None,word_size=None,
           alignments=500,alignment_view=None,descriptions=500,
           entrez_links_new_window=None,expect_low=None,expect_high=None,
           format_entrez_query=None,format_object=None,format_type='XML',
           ncbi_gi=None,results_file=None,show_overview=None
           ):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:
    program        BLASTP or BLASTN
    database       Which database to search against.
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show.  Def 500.
    alignments     Number of alignments to show.  Def 500.
    expect         An expect value cutoff.  Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering.  Default no filtering
    format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
    entrez_query   Entrez query to limit Blast search
    hitlist_size   Number of hits to return. Default 50

    This function does no checking of the validity of the parameters
    and passes the values to the server as is.  More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html

    """
    import urllib, urllib2
    from Bio.WWW import RequestLimiter

    assert program == 'blastn' or program == 'blastp'

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    parameters = [
        ('AUTO_FORMAT',auto_format),
        ('COMPOSITION_BASED_STATISTICS',composition_based_statistics),
        ('DATABASE',database),
        ('DB_GENETIC_CODE',db_genetic_code),
        ('ENDPOINTS',endpoints),
        ('ENTREZ_QUERY',entrez_query),
        ('EXPECT',expect),
        ('FILTER',filter),
        ('GAPCOSTS',gapcosts),
        ('GENETIC_CODE',genetic_code),
        ('HITLIST_SIZE',hitlist_size),
        ('I_THRESH',i_thresh),
        ('LAYOUT',layout),
        ('LCASE_MASK',lcase_mask),
        ('MATRIX_NAME',matrix_name),
        ('NUCL_PENALTY',nucl_penalty),
        ('NUCL_REWARD',nucl_reward),
        ('OTHER_ADVANCED',other_advanced),
        ('PERC_IDENT',perc_ident),
        ('PHI_PATTERN',phi_pattern),
        ('PROGRAM',program),
        ('QUERY',sequence),
        ('QUERY_FILE',query_file),
        ('QUERY_BELIEVE_DEFLINE',query_believe_defline),
        ('QUERY_FROM',query_from),
        ('QUERY_TO',query_to),
        ('SEARCHSP_EFF',searchsp_eff),
        ('SERVICE',service),
        ('THRESHOLD',threshold),
        ('UNGAPPED_ALIGNMENT',ungapped_alignment),
        ('WORD_SIZE',word_size),
        ('CMD', 'Put'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = urllib.urlencode(query)

    # Send off the initial query to qblast.
    request = urllib2.Request("http://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
                              message,
                              {"User-Agent":"BiopythonClient"})
    handle = urllib2.urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007    
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ('ALIGNMENTS',alignments),
        ('ALIGNMENT_VIEW',alignment_view),
        ('DESCRIPTIONS',descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW',entrez_links_new_window),
        ('EXPECT_LOW',expect_low),
        ('EXPECT_HIGH',expect_high),
        ('FORMAT_ENTREZ_QUERY',format_entrez_query),
        ('FORMAT_OBJECT',format_object),
        ('FORMAT_TYPE',format_type),
        ('NCBI_GI',ncbi_gi),
        ('RID',rid),
        ('RESULTS_FILE',results_file),
        ('SERVICE',service),
        ('SHOW_OVERVIEW',show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = urllib.urlencode(query)

    # Poll NCBI until the results are ready.
    limiter = RequestLimiter(3)
    while 1:
        limiter.wait()
        request = urllib2.Request("http://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
                                  message,
                                  {"User-Agent":"BiopythonClient"})
        handle = urllib2.urlopen(request)
        results = handle.read()
        # XML results don't have the Status tag when finished
        if results.find("Status=") < 0:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i+len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO.StringIO(results)
Пример #11
0
class ExPASyDictionary:
    """Access SwissProt at ExPASy using a read-only dictionary interface.

    """
    def __init__(self, delay=5.0, parser=None):
        """__init__(self, delay=5.0, parser=None)

        Create a new Dictionary to access SwissProt.  parser is an optional
        parser (e.g. SProt.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        import warnings
        from Bio.WWW import RequestLimiter
        warnings.warn(
            "Bio.SwissProt.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_sprot_raw instead.",
            DeprecationWarning)
        self.parser = parser
        self.limiter = RequestLimiter(delay)

    def __len__(self):
        raise NotImplementedError("SwissProt contains lots of entries")

    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")

    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")

    def update(self):
        raise NotImplementedError("This is a read-only dictionary")

    def copy(self):
        raise NotImplementedError("You don't need to do this...")

    def keys(self):
        raise NotImplementedError("You don't really want to do this...")

    def items(self):
        raise NotImplementedError("You don't really want to do this...")

    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """has_key(self, id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        try:
            return self[id]
        except KeyError:
            return failobj

    def __getitem__(self, id):
        """__getitem__(self, id) -> object

        Return a SwissProt entry.  id is either the id or accession
        for the entry.  Raises a KeyError if there's an error.
        
        """
        from Bio import ExPASy
        # First, check to see if enough time has passed since my
        # last query.
        self.limiter.wait()

        try:
            handle = ExPASy.get_sprot_raw(id)
        except IOError:
            raise KeyError(id)

        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
Пример #12
0
 def __init__(self, function, delay):
     self.fn = function
     self.limiter = RequestLimiter(delay)