示例#1
0
文件: gffrec.py 项目: zghnbv/deepgo
def gff_view(params):
    """
    Prints the selected features of every parameter as GFF version 3 rows.

    Each parameter must provide the json, name, seqid, start, end, gene,
    type and regexp attributes that are read below.
    """

    # The header is printed once, before any feature.
    print("##gff-version 3")

    for param in params:

        # Report the parameters that carry no data.
        if not param.json:
            utils.error(f"data not found: {param.name}")

        # A data source may contain several records.
        for record in param.json:

            features = record[const.FEATURES]

            # A user supplied sequence id takes precedence over the record id.
            anchor = param.seqid or record['id']

            # Keep only the features that match the selection.
            selected = jsonrec.filter_features(
                features,
                start=param.start,
                end=param.end,
                gene=param.gene,
                ftype=param.type,
                regexp=param.regexp,
            )

            # Every feature becomes one tab separated line.
            for feat in selected:
                columns = [str(value) for value in feature2gff(feat, anchor=anchor)]
                print("\t".join(columns))
示例#2
0
文件: ncbi.py 项目: jhpeach/bio
def genome(name, fname, update=False, genbank=None, refseq=None, summary=ASSEMBLY_FILE_NAME,
           jsondb=ASSEMBLY_JSON_DB):
    """
    Downloads the data registered for an accession number.

    name: the accession to look up.
    fname: destination handed to download_file.
    update: when true the assembly summary is downloaded again.
    genbank, refseq: mappings from accession to URL, GenBank is consulted
        first. Both default to an empty mapping.
    summary: path of the assembly summary file; a missing file forces an update.
    jsondb: path of the JSON database that has to exist already.

    Prints a message when the accession is in neither mapping.
    """

    # Fresh dictionaries on every call instead of shared mutable defaults.
    genbank = {} if genbank is None else genbank
    refseq = {} if refseq is None else refseq

    # Update assembly information if it is missing.
    if not os.path.isfile(summary):
        update = True

    # When update is true get the assembly summary file again.
    if update:
        logger.info("updating assembly summary")
        download_assembly()

    if not os.path.isfile(jsondb):
        utils.error("json db needs to be built")

    # GenBank takes precedence over RefSeq.
    urlpath = genbank.get(name) or refseq.get(name)

    if urlpath:
        download_file(url=urlpath, dest=fname)
    else:
        # If we go this far we have not found the data.
        print(f'*** accession not found: {name}')
示例#3
0
文件: dblink.py 项目: Natay/bio-2
def search(term, db='sra', tabular=False, limit=None):
    """
    Searches an Entrez database and prints the returned run information rows.

    The rows are written to standard output as a tab delimited table when
    tabular is set and are pretty printed otherwise.

    Returns the complete fetched data.
    """

    # A missing limit falls back to a large default.
    limit = limit or 10000

    # Search with history enabled, then fetch through the search result.
    env = entrez.esearch(db=db, term=term, usehistory="y")
    data = entrez.efetch(db=db, env=env, retmax=limit, rettype="runinfo")

    rows = data.get('SraRunInfo', {}).get("Row", {})

    if not rows:
        utils.error("the query at SRA has not returned results.")

    if not (tabular and rows):
        pprint(rows)
        return data

    # The first row provides the column names of the table.
    writer = csv.DictWriter(sys.stdout,
                            delimiter="\t",
                            fieldnames=rows[0].keys())
    writer.writeheader()
    writer.writerows(rows)

    return data
示例#4
0
def gff_view(params):
    """
    Prints the features of every parameter in GFF version 3 format.

    Sets the record attribute of each parameter to True as a side effect.
    """

    print("##gff-version 3")

    for param in params:

        # GFF rows describe intervals, switch the parameter to record mode.
        param.record = True

        # Report the parameters that carry no data.
        if not param.json:
            utils.error(f"data not found: {param.acc}")

        # A data source may contain several records.
        for record in param.json:

            features = jsonrec.get_json_features(record)

            # A user supplied sequence id takes precedence over the record id.
            anchor = param.seqid or record['id']

            # Keep only the features selected by the parameter.
            features = jsonrec.filter_features(features, param=param)

            for feat in features:
                # Parents are only allowed when no feature type was selected.
                rows = feature2gff(feat, anchor=anchor, allow_parent=not param.type)
                for row in rows:
                    print("\t".join(str(value) for value in row))
示例#5
0
def parse_file(fname, seqid=None):
    """
    Parses a GenBank or FASTA file into a JSON representation.

    Files ending in .gz are read through gzip and the format is taken from
    the extension in front of the compression suffix.

    Returns None after logging a warning when the file does not exist.
    """

    logger.info(f"parsing {fname}")

    if not os.path.exists(fname):
        logger.warning(f"File does not exist: {fname}")
        return

    # Compressed files go through gzip, everything else is opened as text.
    if fname.endswith(".gz"):
        stream = gzip.open(fname, 'rt')
    else:
        stream = open(fname, 'rt')

    # Find the extension that identifies the format.
    base, ext = os.path.splitext(fname)
    ext = ext.lower()

    # Look past the compression suffix.
    if ext == ".gz":
        ext = os.path.splitext(base)[1].lower()

    # Dispatch on the recognized extensions.
    if ext in (".gb", ".gbk", ".genbank"):
        records = SeqIO.parse(stream, format=const.GENBANK)
        data = convert_genbank(records, seqid=seqid)
    elif ext in (".fa", ".fasta"):
        records = SeqIO.parse(stream, format=const.FASTA)
        data = convert_fasta(records, seqid=seqid)
    else:
        utils.error(f"file format not recognized: {fname}")

    return data
示例#6
0
def parse_data(fname, study_size=10):
    """
    Reads the genes and GO terms of a .gaf association file.

    Returns a tuple (population, association): the set of all genes and a
    dictionary that maps each gene to the set of its GO terms.

    The study_size parameter is not used by this function.
    """

    if not os.path.isfile(fname):
        utils.error("Association file needs to be downloaded first.")

    population = set()
    association = {}

    stream = utils.gz_read(fname, 'r')
    print(f"*** parsing {fname}")

    for raw in stream:
        line = raw.decode()

        # Lines that start with an exclamation mark are skipped.
        if line.startswith('!'):
            continue

        # The third column holds the gene, the fifth the GO term.
        fields = line.split('\t')
        gene, goterm = fields[2], fields[4]

        association.setdefault(gene, set()).add(goterm)
        population.add(gene)

    return population, association
示例#7
0
def search_names(word, archive=TAXDB_NAME, name="names.dmp", limit=None):
    """
    Yields (taxid, name) pairs for the names that match a pattern.

    word: a regular expression, matched without regard to case.
    archive: path of the taxdump archive.
    name: the archive member that is read.
    limit: passed on to open_tarfile.
    """

    # The search cannot work without the archive.
    if not os.path.isfile(archive):
        utils.error("taxdump file not found (download and build it first)")

    rows = open_tarfile(archive=archive, filename=name, limit=limit)

    patt = re.compile(word, re.IGNORECASE)

    # Only rows that carry one of these labels are searched.
    valid = {'scientific name', 'equivalent name', 'genbank common name'}

    for row in rows:
        taxid, text, label = row[0], row[2], row[6]
        if label in valid and patt.search(text):
            yield taxid, text
示例#8
0
文件: jsonrec.py 项目: zghnbv/deepgo
def json_view(params):
    """
    Prints the JSON representation of every parameter to standard output.

    The complete data is printed when the parameter reports that nothing
    is set, otherwise only the matching features of each record.
    """
    for param in params:

        # Report the parameters that carry no data.
        if not param.json:
            utils.error(f"data not found: {param.name}")

        # Without any selection the whole data is printed.
        if param.unset():
            print(json.dumps(param.json, indent=4))
            continue

        # Print the selected features record by record.
        for record in param.json:
            selected = filter_features(
                record[const.FEATURES],
                start=param.start,
                end=param.end,
                ftype=param.type,
                gene=param.gene,
                regexp=param.regexp,
            )
            print(json.dumps(list(selected), indent=4))
示例#9
0
def run(start=1,
        end='',
        mode=LOCAL_ALIGN,
        gap_open=11,
        gap_extend=1,
        protein=False,
        translate=False,
        inter=False,
        verbose=False,
        query='',
        target=''):
    """
    Aligns every sequence of the query against every sequence of the target.

    Both inputs are resolved with storage.get_json in strict mode and every
    pair of sequences is passed to parasail_align together with the
    parameters of the query.
    """

    utils.set_verbosity(logger, level=int(verbose))

    # Start from a clean counter.
    jsonrec.reset_counter()

    # Both inputs are mandatory.
    if not query or not target:
        utils.error(f"Please specify both a QUERY and a TARGET")

    # The two parameters differ only by name.
    shared = dict(protein=protein,
                  translate=translate,
                  start=start,
                  end=end,
                  gap_open=gap_open,
                  gap_extend=gap_extend,
                  mode=mode)
    param1 = objects.Param(name=query, **shared)
    param2 = objects.Param(name=target, **shared)

    # Load the JSON data of both inputs.
    param1.json = storage.get_json(param1.name, inter=inter, strict=True)
    param2.json = storage.get_json(param2.name, inter=inter, strict=True)

    for rec1 in param1.json:

        for rec2 in param2.json:

            queries = fastarec.get_fasta(rec1, param=param1)
            targets = fastarec.get_fasta(rec2, param=param2)

            # NOTE(review): targets is iterated once per query sequence; if
            # get_fasta returns a one-shot iterator only the first query is
            # aligned against anything. Confirm it returns a list.
            for qseq in queries:
                for tseq in targets:
                    parasail_align(qseq=qseq, tseq=tseq, param=param1)
示例#10
0
def get_data(preload=False):
    """
    Returns the ontology terms.

    With preload the terms are read from the JSON file at JSON_DB,
    otherwise they are obtained with open_db.
    """
    if not preload:
        return open_db(TERM)

    if not os.path.isfile(JSON_DB):
        utils.error(
            f"ontology file not found (you must build it first): {JSON_DB}"
        )

    # The context manager closes the file once it has been parsed.
    with open(JSON_DB) as stream:
        store = json.load(stream)

    return store[TERM]
示例#11
0
def get_data(preload=False):
    """
    Returns the taxonomy names and graph as a tuple (names, graph).

    With preload both are read from the JSON file at JSON_DB,
    otherwise they are obtained with open_db.
    """
    if not preload:
        return open_db(NAMES), open_db(GRAPH)

    if not os.path.isfile(JSON_DB):
        utils.error(
            f"taxonomy file not found (you must build it first): {JSON_DB}"
        )

    # The context manager closes the file once it has been parsed.
    with open(JSON_DB) as stream:
        store = json.load(stream)

    return store[NAMES], store[GRAPH]
示例#12
0
def genbank_view(params):
    """
    Prints the content of the GenBank file of every parameter.

    The accession is tried as a file name first, then the name returned
    by resolve_fname for the "gb" format.
    """
    for param in params:
        altname = resolve_fname(param.acc, format="gb")

        # The first candidate that exists is the one that gets read.
        if os.path.isfile(param.acc):
            source = param.acc
        elif os.path.isfile(altname):
            source = altname
        else:
            source = None

        if source is None:
            # Nothing to print when neither file exists.
            stream = []
            utils.error(f"data not found: {param.acc}")
        else:
            stream = utils.gz_read(source)

        # Lines are printed without an additional line ending.
        for line in stream:
            print(line, end='')
示例#13
0
def fetch_genbank(acc, dest_name):
    """
    Downloads the GenBank record of an accession into dest_name.

    The record is requested from the nuccore database as text with the
    gbwithparts return type. Any exception is handed to utils.error.
    """
    params = dict(db='nuccore', rettype="gbwithparts", id=acc, retmode="text")

    try:
        utils.download(EFETCH_URL, params=params, dest_name=dest_name)
    except Exception as exc:
        utils.error(exc)
示例#14
0
def get_data(preload=False, acc=False):
    """
    Returns the taxid data and the graph as a tuple (names, graph).

    With preload both are read from the JSON file at JSON_DB,
    otherwise they are obtained with open_db.

    The acc parameter is not used by this function.
    """
    if not preload:
        return open_db(TAXID), open_db(GRAPH)

    if not os.path.isfile(JSON_DB):
        utils.error(f"taxonomy file not found (you must build it first): {JSON_DB}")

    # The context manager closes the file once it has been parsed.
    with open(JSON_DB) as stream:
        store = json.load(stream)

    return store[TAXID], store[GRAPH]
示例#15
0
def get_data(preload=False):
    """
    Returns the ontology data as a tuple (terms, nodes, names, back).

    With preload the four values are read from the JSON file at JSON_DB,
    otherwise each one is opened from SQLITE_DB with utils.open_db.
    """
    keys = (TERM, GRAPH, NAMES, CHILDREN)

    if not preload:
        terms, nodes, names, back = [
            utils.open_db(key, fname=SQLITE_DB) for key in keys
        ]
        return terms, nodes, names, back

    if not os.path.isfile(JSON_DB):
        utils.error(f"ontology file not found (you must build it first): {JSON_DB}")

    # The context manager closes the file once it has been parsed.
    with open(JSON_DB) as stream:
        store = json.load(stream)

    terms, nodes, names, back = [store[key] for key in keys]
    return terms, nodes, names, back
示例#16
0
def build_database(fname=TAXDB_NAME, limit=None):
    """
    Builds the taxonomy tables and the JSON model from a taxdump file.

    fname: name of the taxdump file, checked inside utils.DATADIR.
    limit: passed on to parse_names and parse_nodes.
    """
    print(f"*** building database from: {fname}")
    path = os.path.join(utils.DATADIR, fname)

    # Check the file.
    if not os.path.isfile(path):
        utils.error(f"no taxdump file found, run the --download flag")

    # Parse the names
    name_dict = parse_names(fname, limit=limit)

    # Parse the nodes, the second returned value is not needed here.
    node_dict, _ = parse_nodes(fname, name_dict=name_dict, limit=limit)

    def save_table(name, obj):
        # Stores every key and value of obj in the named table.
        size = len(obj)
        table = open_db(table=name, flag='w')
        try:
            for index, (key, value) in enumerate(obj.items()):
                table[key] = value
                # Commit and report the progress once per chunk.
                if index % CHUNK == 0:
                    perc = round(index / size * 100)
                    print(
                        f"*** saving {name} with {size:,} elements ({perc:.0f}%)",
                        end="\r")
                    table.commit()
            print(f"*** saved {name} with {size:,} elements (100%)", end="\r")
            print("")
            table.commit()
        finally:
            # The table is closed even when a write fails.
            table.close()

    # Save the names into the database
    save_table(NAMES, name_dict)

    # Save the nodes.
    save_table(GRAPH, node_dict)

    print("*** saving the JSON model")
    json_path = os.path.join(utils.DATADIR, JSON_DB)

    # JSON will only have the graph and names.
    # NOTE(review): the keys below are the literal strings "NAMES", "GRAPH",
    # ... while the tables above are named by the values of the NAMES and
    # GRAPH constants; confirm that the two agree.
    store = dict(NAMES=name_dict, GRAPH=node_dict, SYNONYMS={}, BACK={})

    # The context manager closes the file even when the dump fails.
    with open(json_path, 'wt') as fp:
        json.dump(store, fp, indent=4)
示例#17
0
def filter_file(stream, terms, keep, remove, graph, colidx=0):
    """
    Filters tab delimited rows by the taxid found in one column.

    stream: the input rows; when it is empty the file named by the first
        element of terms is opened instead (and closed when done).
    keep: comma separated taxids; when set, only rows whose taxid was
        collected from these are written.
    remove: comma separated taxids; when set, rows whose taxid was
        collected from these are dropped.
    graph: passed to dfs_visitor, which is expected to record every node
        it visits in the dictionary it is given.
    colidx: index of the column that holds the taxid.

    The surviving rows are written to standard output, tab delimited.
    """
    opened = None
    if not stream:
        if not terms:
            msg = f"filtering needs an input stream or a filename"
            utils.error(msg)
        stream = opened = open(terms[0])

    try:
        # Read the stream.
        reader = csv.reader(stream, delimiter="\t")

        # An empty selection is skipped entirely, so no traversal is
        # started from an empty node name.
        if keep:
            keep_dict = {}
            for term in keep.split(","):
                dfs_visitor(graph=graph, node=term, visited=keep_dict)

            def keep_func(row):
                return row[colidx] in keep_dict

            reader = filter(keep_func, reader)

        if remove:
            remove_dict = {}
            for term in remove.split(","):
                dfs_visitor(graph=graph, node=term, visited=remove_dict)

            def remove_func(row):
                return row[colidx] not in remove_dict

            reader = filter(remove_func, reader)

        # Generate the output.
        writer = csv.writer(sys.stdout, delimiter="\t")
        writer.writerows(reader)
    finally:
        # Only a file that was opened here is closed here.
        if opened is not None:
            opened.close()
示例#18
0
文件: taxdb.py 项目: jhpeach/bio
def build_database(fname=TAXDB_NAME, limit=None):
    """
    Builds the taxonomy tables and the JSON model from a taxdump file.

    fname: name of the taxdump file, checked inside utils.DATADIR.
    limit: passed on to parse_names and parse_nodes.
    """
    print(f"*** building database from: {fname}")
    path = os.path.join(utils.DATADIR, fname)

    # Check the file.
    if not os.path.isfile(path):
        utils.error(f"no taxdump file found, run the --download flag")

    # Only the third value of the assembly summary is needed.
    _, _, taxon_acc = ncbi.parse_summary()

    # Parse the names
    name_dict, latin_dict = parse_names(fname,
                                        limit=limit,
                                        taxon_acc=taxon_acc)

    # Parse the nodes, the second returned value is not needed here.
    node_dict, _ = parse_nodes(fname, name_dict=name_dict, limit=limit)

    def save_table(name, obj):
        # Shortcut that always writes into SQLITE_DB.
        utils.save_table(name=name, obj=obj, fname=SQLITE_DB)

    # Save the names into the database
    save_table(NAMES, name_dict)

    # Save the nodes.
    save_table(GRAPH, node_dict)

    # Save the latin names.
    save_table(LATIN, latin_dict)

    print("*** saving the JSON model")
    json_path = os.path.join(utils.DATADIR, JSON_DB)

    # NOTE(review): the keys below are the literal strings "NAMES", "GRAPH",
    # ... while the tables above are named by the values of the constants;
    # confirm that the two agree.
    store = dict(NAMES=name_dict,
                 GRAPH=node_dict,
                 SYNONYMS={},
                 BACK={},
                 LATIN=latin_dict)

    # The context manager closes the file even when the dump fails.
    with open(json_path, 'wt') as fp:
        json.dump(store, fp, indent=4)
示例#19
0
def get_json(name, seqid=None, update=False, inter=False, strict=False):
    """
    Resolves a name to JSON formatted data.

    The name is tried in this order: an existing file, a stored JSON file,
    a stored GenBank file (which is converted and saved as JSON) and, in
    interactive mode, the name itself as a sequence.

    Returns None when update is requested for a name that is not a file,
    and when nothing was found.
    """

    # An existing file is parsed directly.
    if os.path.isfile(name):
        return jsonrec.parse_file(name, seqid=seqid)

    # An update request reports stored data as not found.
    if update:
        return None

    # Names under which the data may be stored.
    json_name = resolve_fname(name=name, format="json")
    gbk_name = resolve_fname(name=name, format="gb")

    # The stored JSON takes precedence.
    if os.path.isfile(json_name):
        logger.info(f"found {json_name}")
        return read_json_file(json_name)

    # Convert the stored GenBank file and keep the result as JSON.
    if os.path.isfile(gbk_name):
        logger.info(f"found {gbk_name}")
        parsed = jsonrec.parse_file(fname=gbk_name, seqid=seqid)
        return save_json_file(fname=json_name, data=parsed)

    # In interactive mode the name itself becomes the sequence.
    if inter:
        return jsonrec.make_jsonrec(seq=name, seqid=seqid)

    # Nothing matched.
    if strict:
        utils.error(f"data not found: {name}")

    return None
示例#20
0
def json_view(params):
    """
    Prints the JSON data of every parameter to standard output.

    When a parameter has a sequence id, it replaces the sequence id of
    each of its records before printing.
    """
    for param in params:

        # Report the parameters that carry no data.
        if not param.json:
            utils.error(f"data not found: {param.acc}")

        # The user supplied sequence id is applied to all records.
        if param.seqid:
            for record in param.json:
                record[const.SEQID] = param.seqid

        # Indented output is easier to read.
        print(json.dumps(param.json, indent=4))
示例#21
0
def modify_record(seq, param):
    """
    Applies the slicing and the transformations requested by a parameter.

    Returns a tuple (seq, desc) where desc lists a short label for every
    operation that was applied, in the order of application.
    """

    start, end = param.start, param.end

    # Labels of the operations that were applied.
    desc = []

    # Slice when either coordinate is set.
    if start != 0 or end:
        # The end never goes past the sequence.
        end = min(end, len(seq)) if end else len(seq)
        seq = seq[start:end]
        desc.append(f'[{start + 1}:{end}]')

    # Attribute to check, transformation to apply and label to record.
    steps = (
        ("revcomp", lambda s: s.reverse_complement(), "reverse-complemented"),
        ("reverse", lambda s: s[::-1], "reversed"),
        ("complement", lambda s: s.complement(), "complemented"),
        ("translate", lambda s: s.translate(), "translated"),
        ("transcribe", lambda s: s.transcribe(), "transcribed DNA"),
    )

    try:
        for flag, transform, label in steps:
            if getattr(param, flag):
                seq = transform(seq)
                desc.append(label)
    except Exception as exc:
        utils.error(exc)

    return seq, desc
示例#22
0
def fasta_view(params):
    """
    Prints the data of every parameter in FASTA format.
    """

    for param in params:

        # Report the parameters that carry no data.
        if not param.json:
            utils.error(f"data not found: {param.acc}")

        # Every record is converted and printed on its own.
        for record in param.json:
            print_fasta(get_fasta(record, param=param))
示例#23
0
def get_json(name, seqid=None, inter=False, strict=False):
    """
    Resolves a name to JSON formatted data.

    The name is tried in this order: an existing file, a stored JSON file,
    a stored GenBank file (which is converted and saved as JSON) and, in
    interactive mode, the name itself.

    Exits the process with -1 when an existing file cannot be parsed.
    Returns None when nothing was found.
    """

    # An existing file is parsed directly.
    if os.path.isfile(name):
        try:
            return jsonrec.parse_file(name, seqid=seqid)
        except Exception as exc:
            logger.error(f"JSON parsing error for file {name}: {exc}")
            sys.exit(-1)

    # Names under which the data may be stored.
    json_name = resolve_fname(name=name, format="json")
    gbk_name = resolve_fname(name=name, format="gb")

    # The stored JSON takes precedence.
    if os.path.isfile(json_name):
        logger.info(f"found {json_name}")
        return read_json_file(json_name)

    # Convert the stored GenBank file and keep the result as JSON.
    if os.path.isfile(gbk_name):
        logger.info(f"found {gbk_name}")
        parsed = jsonrec.parse_file(fname=gbk_name, seqid=seqid)
        return save_json_file(fname=json_name, data=parsed)

    # In interactive mode the JSON is made from the name itself.
    if inter:
        return jsonrec.make_jsonrec(name, seqid=seqid)

    # Nothing matched.
    if strict:
        utils.error(f"data not found: {name}")

    return None
示例#24
0
文件: taxdb.py 项目: jhpeach/bio
def get_data(preload=False, acc=False):
    """
    Returns the taxonomy data as a tuple (names, graph, taxon_acc, latin).

    With preload the names, graph and latin names are read from the JSON
    file at JSON_DB, otherwise they are obtained with open_db.

    taxon_acc is the second value returned by ncbi.get_data when acc is
    set and an empty dictionary otherwise.
    """
    if preload:
        if not os.path.isfile(JSON_DB):
            utils.error(
                f"taxonomy file not found (you must build it first): {JSON_DB}"
            )

        # The context manager closes the file once it has been parsed.
        with open(JSON_DB) as stream:
            store = json.load(stream)

        names = store[NAMES]
        graph = store[GRAPH]
        latin = store[LATIN]
    else:
        names = open_db(NAMES)
        graph = open_db(GRAPH)
        latin = open_db(LATIN)

    # The accessions are only loaded on request.
    if acc:
        _, taxon_acc, _ = ncbi.get_data()
    else:
        taxon_acc = {}

    return names, graph, taxon_acc, latin
示例#25
0
文件: main.py 项目: zghnbv/deepgo
    def make_param(name):
        """
        Creates the parameter object of one accession and loads its JSON data.

        All settings other than the name come from the enclosing scope.
        """
        utils.set_verbosity(logger, level=int(verbose))

        # Names that start with a dash are rejected, presumably because
        # they are stray command line options.
        if name.startswith("-"):
            utils.error(f"Invalid accession number: {name}")

        # Collect every setting that travels with the parameter.
        settings = dict(
            start=start, end=end, seqid=seqid, protein=protein, revcomp=revcomp,
            update=update, name=name, gff=gff, translate=translate,
            reverse=reverse, complement=complement, fasta=fasta, type=type,
            gene=gene, regexp=match, transcribe=transcribe,
        )
        p = objects.Param(**settings)

        # Attach the JSON data that the name resolves to.
        p.json = storage.get_json(p.name, seqid=seqid, inter=inter)
        return p
示例#26
0
def search_names(word, fname=TAXDB_NAME, name="names.dmp", limit=None):
    """
    Yields (taxid, name) pairs for the names that match a pattern.

    word: a regular expression, matched without regard to case.
    fname: path of the gzip compressed taxdump archive.
    name: the archive member that is read.
    limit: passed on to get_stream.

    The archive is closed when the generator finishes or is closed.
    """

    if not os.path.isfile(fname):
        utils.error("taxdump file not found (download and build it first)")

    # Compiled before the archive is opened so that an invalid pattern
    # does not leave an open archive behind.
    patt = re.compile(word, re.IGNORECASE)

    # Only rows that carry one of these labels are searched.
    valid = ('scientific name', 'equivalent name', 'genbank common name')

    # The taxdump file.
    with tarfile.open(fname, "r:gz") as tar:
        stream = get_stream(tar=tar, name=name, limit=limit)

        for elems in csv.reader(stream, delimiter="\t"):
            taxid, text, label = elems[0], elems[2], elems[6]
            if label in valid and patt.search(text):
                yield taxid, text
示例#27
0
def build_database(archive=TAXDB_NAME, limit=None):
    """
    Updates the taxdump archive and builds the tables and the JSON model.

    archive: name of the taxdump archive, checked inside utils.DATADIR.
    limit: passed on to parse_names and parse_nodes.
    """
    print(f"*** building database from: {archive}")

    # The location of the archive.
    path = os.path.join(utils.DATADIR, archive)

    # Download the latest taxdump file.
    update_taxdump()

    # Check the file.
    if not os.path.isfile(path):
        utils.error(f"no taxdump file found")

    # Parse the names
    tax2data = parse_names(archive, limit=limit)

    # Parse the nodes and backpropagation.
    graph = parse_nodes(archive, tax2data=tax2data, limit=limit)

    # A shortcut that always writes into SQLITE_DB.
    def save_table(name, obj):
        utils.save_table(name=name, obj=obj, fname=SQLITE_DB)

    # Save the taxid definitions.
    save_table(TAXID, tax2data)

    # Save the graph.
    save_table(GRAPH, graph)

    print("*** saving the JSON model")
    json_path = os.path.join(utils.DATADIR, JSON_DB)

    # NOTE(review): the keys below are the literal strings "TAXID" and
    # "GRAPH" while the tables above are named by the values of the
    # constants; confirm that the two agree.
    store = dict(TAXID=tax2data, GRAPH=graph)

    # The context manager closes the file even when the dump fails.
    with open(json_path, 'wt') as fp:
        json.dump(store, fp, indent=4)
示例#28
0
def run(start=1, end='', gap_open=11, gap_extend=1, local_=False, global_=False, semiglobal=False,
        protein=False, translate=False, inter=False, table=False, mutations=False, strict=False,
        pep1=False, pep3=False, limit=1, verbose=False, target=None, query=None):
    """
    Aligns the records of the query with the records of the target.

    Records and their sequences are paired up in order. At most limit
    records of each input are used. A sequence longer than 100,000 is
    reported through utils.error.
    """

    # Longer sequences take too long to align.
    MAX_LEN = 100000

    utils.set_verbosity(logger, level=int(verbose))

    # Keeps the generated names consistent between runs.
    jsonrec.reset_sequence_names()

    # Both inputs are mandatory.
    if not query or not target:
        utils.error(f"Please specify a TARGET and a QUERY")

    # Global wins over the other flags and is also the fallback.
    if local_ and not global_:
        mode = const.LOCAL_ALIGN
    elif semiglobal and not global_:
        mode = const.SEMIGLOBAL_ALIGN
    else:
        mode = const.GLOBAL_ALIGN

    # Settings shared by the two parameters.
    common = dict(
        protein=protein, translate=translate, mutations=mutations, pep1=pep1, pep3=pep3,
        table=table, strict=strict, start=start, end=end, gap_open=gap_open, gap_extend=gap_extend,
        mode=mode
    )

    # One parameter for each input.
    param_t = objects.Param(acc=target, **common)
    param_q = objects.Param(acc=query, **common)

    # Load the data, keeping no more than limit records.
    param_t.json = fetch.get_json(param_t.acc, inter=inter, strict=True)[:limit]
    param_q.json = fetch.get_json(param_q.acc, inter=inter, strict=True)[:limit]

    # Records are aligned pairwise, in order.
    for qrec, trec in zip(param_q.json, param_t.json):
        queries = fastarec.get_fasta(qrec, param=param_q)
        targets = fastarec.get_fasta(trec, param=param_t)

        for qseq, tseq in zip(queries, targets):

            if len(qseq) > MAX_LEN:
                utils.error(f"query is longer than maximum: {len(qseq):,} > {MAX_LEN:,}")

            if len(tseq) > MAX_LEN:
                utils.error(f"target sequence is longer than maximum: {len(tseq):,} > {MAX_LEN:,}")

            biopython_align(qseq=qseq, tseq=tseq, param=param_q)
示例#29
0
文件: storage.py 项目: zghnbv/deepgo
def ncbi_efetch(name, gbk_name, db=None):
    """
    Fetches the GenBank record of an accession from Entrez and saves it.

    name: the accession to fetch.
    gbk_name: file name the fetched stream is saved under.
    db: database to query; when empty it is guessed from the first two
        characters of the accession.
    """
    # The complete GenBank file is requested as text.
    rettype, retmode = "gbwithparts", "text"

    # These prefixes are treated as protein accessions.
    is_protein = name[:2] in ["AP", "NP", "YP", "XP", "WP", "AK"]
    db = db or ("protein" if is_protein else "nuccore")

    try:
        logger.info(f"connecting to Entrez for {name}")
        stream = Entrez.efetch(id=name, db=db, rettype=rettype, retmode=retmode)
    except Exception as exc:
        utils.error(f"{exc} for efetch acc={name} db={db} format={rettype} mode={retmode}")

    # Store the fetched stream as the GenBank file.
    utils.save_stream(stream=stream, fname=gbk_name)
示例#30
0
def get_metadata(taxid, limit=None):
    """
    Returns an iterator over the decoded lines of the genome table of a taxid.

    The table is requested as TSV from the NCBI datasets API. At most
    limit lines are produced, all of them when limit is None. A status
    code other than 200 is reported through utils.error.
    """
    import requests

    # The dataset access point.
    url = f"https://api.ncbi.nlm.nih.gov/datasets/v1alpha/virus/taxon/{taxid}/genome/table"

    # The columns of the requested table.
    table_fields = [
        'host_tax_id',
        'species_tax_id',
        'nucleotide_accession',
        'collection_date',
        'geo_location',
        'isolate_name',
    ]

    params = {
        'format': 'tsv',
        'refseq_only': "false",
        'complete_only': 'true',
        'table_fields': table_fields,
    }

    conn = requests.get(url, stream=True, params=params)

    if conn.status_code != 200:
        utils.error(f"HTTP status code: {conn.status_code}")

    # Lines are read and decoded lazily.
    return map(decode, islice(conn.iter_lines(), limit))