Example #1
def test_esearchs(self):
    """Get the first batch of 16S GIs

    This test isn't fully reliable, as NCBI does not always return the
    same GIs once more records are added. Just make sure we get back
    10 GI-like (integer) ids.
    """
    # only pulling 10 for testing simplicity
    obs = list(esearch('16S', retmax=10, binsize=3))
    self.assertEqual(len(obs), 10)

    # every returned id should parse as an integer if it is GI-like
    try:
        map(int, obs)
    except ValueError:
        self.fail("esearch returned non-integer ids: %r" % (obs,))
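
For context, `esearch` here is assumed to wrap the NCBI E-utilities `esearch.fcgi` endpoint and to page through the id list in `binsize`-sized requests until `retmax` ids are collected. A minimal sketch under that assumption (the endpoint and its `db`/`term`/`retstart`/`retmax` parameters are the real E-utilities interface; `esearch_sketch` itself is illustrative, not the module's actual implementation):

import urllib
import urllib2
from xml.etree import ElementTree

EUTILS_ESEARCH = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

def esearch_sketch(query, retmax=10, binsize=3, db='nucleotide'):
    """Return up to retmax ids for query, fetching binsize ids per request."""
    ids = []
    while len(ids) < retmax:
        params = urllib.urlencode({'db': db,
                                   'term': query,
                                   'retstart': len(ids),
                                   'retmax': min(binsize, retmax - len(ids))})
        reply = urllib2.urlopen('%s?%s' % (EUTILS_ESEARCH, params)).read()
        batch = [e.text for e in ElementTree.fromstring(reply).findall('.//Id')]
        if not batch:
            break  # NCBI has no more records for this query
        ids.extend(batch)
    return ids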
Example #2
import time

# esearch, bulk_efetch, open_gz, parse_command_line_parameters, and
# script_info are assumed to come from elsewhere in this package.
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # if we already have these records, there is no need to fetch them again
    if opts.existing_gb:
        existing_gis = set([l.strip() for l in open(opts.existing_gb)])
    else:
        existing_gis = set([])

    if opts.verbose:
        print "Number of existing GIs: %d" % len(existing_gis)

    if opts.possible_new_gb_out is None:
        option_parser.error("Need to specify --possible-new-gb-output")

    if opts.cached_ids:
        possible_gis = set([l.strip() for l in open(opts.cached_ids)])
    else:
        #ncbi_record_queries = ['16S','18S','small subunit','rrna[fkey]','ribosomal']
        ncbi_record_queries = ['16S AND tm7']
        # grab all the ids
        possible_gis = set([])
        for query in ncbi_record_queries:
            if opts.verbose:
                cur_size = len(possible_gis)
            possible_gis.update(esearch(query, retmax=10000000))

            if opts.verbose:
                print "Query %s added %d to set" % (query, len(possible_gis) - cur_size)

    # drop out any existing ids
    possible_gis = possible_gis - existing_gis

    if opts.verbose:
        print "Total number of GIs to query: %d" % len(possible_gis)
   
    chunk_count = 0
    total_bytes = 0
    if opts.use_gz:
        poss_output = open_gz(opts.possible_new_gb_out, 'w')
    else:
        poss_output = open(opts.possible_new_gb_out, 'w')
    
    collected = set([])

    retries = 0
    while possible_gis and retries < 100:
        try:
            for chunk in bulk_efetch(possible_gis):
                chunk_count += 1
                total_bytes += len(chunk)

                # Occasionally, and silently, NCBI corrupts records and
                # returns an HTML error page in place of GenBank text.
                if '<html>' in chunk:
                    if opts.verbose:
                        print "Erroneous record in chunk, disregarding full chunk"
                    continue

                # pull out the GIs; in the classic GenBank flat-file format
                # the VERSION line ends with a GI: field
                records = []
                for l in chunk.splitlines():
                    if l.startswith('VERSION'):
                        records.append(l.split(':')[1])

                if opts.verbose:
                    print "%s - retry: %d, Chunk %d, covering %d records, writing %d bytes, %d written in total" % \
                        (time.strftime("%m-%d-%y %H:%M:%S"), retries, chunk_count, len(records), len(chunk), total_bytes)
                poss_output.write(chunk)
                collected.update(set(records))
        except Exception, e:
            retries += 1
            print "Caught exception: ", e

        # drop everything written so far from the work list so the next
        # pass only re-requests what is still missing
        possible_gis = possible_gis - collected
        collected = set([])
        
        # snapshot the remaining ids so an interrupted run can be resumed;
        # use open_gz so the contents actually match the .gz suffix
        possible_gis_at_retry = open_gz('possible_gis_at_retry_%d.txt.gz' % retries, 'w')
        possible_gis_at_retry.write('\n'.join(possible_gis))
        possible_gis_at_retry.close()
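
The GI extraction above depends on the classic GenBank flat-file format, in which every record's VERSION line carried a `GI:` field (NCBI stopped issuing GI numbers for new records in 2016). A toy illustration of what the parser sees; the record line is made up for the example:

# Illustrative only: a VERSION line in the classic GenBank flat-file format.
sample_chunk = "VERSION     XX000001.1  GI:1234567\n"

gis = []
for line in sample_chunk.splitlines():
    if line.startswith('VERSION'):
        # splitting on ':' leaves the GI number in the last field
        gis.append(line.split(':')[1])

print gis  # -> ['1234567']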
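
Similarly, `bulk_efetch` is assumed to wrap the E-utilities `efetch.fcgi` endpoint, POSTing the ids in batches and yielding each reply as one text chunk. A minimal sketch under that assumption (`bulk_efetch_sketch`, the batch size, and the lack of rate limiting are all illustrative; `db`, `id`, `rettype`, and `retmode` are the real efetch parameters):

import urllib
import urllib2

EUTILS_EFETCH = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

def bulk_efetch_sketch(gis, db='nucleotide', batch_size=500):
    """Yield GenBank-format text, one chunk per batch of ids."""
    gis = list(gis)
    for start in range(0, len(gis), batch_size):
        batch = gis[start:start + batch_size]
        # POST keeps large id lists out of the URL
        data = urllib.urlencode({'db': db,
                                 'id': ','.join(batch),
                                 'rettype': 'gb',
                                 'retmode': 'text'})
        yield urllib2.urlopen(EUTILS_EFETCH, data).read()

Yielding one chunk per batch is what lets main() above track collected GIs incrementally and, after a failure, retry only the ids that never made it to disk.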