def handle(self, *args, **options):
    tmp = os.path.abspath(options['tmp'])
    if not os.path.exists(tmp):
        os.makedirs(tmp)
    qs = PDB.objects.filter(code=options["pdb"]) if options["pdb"] else PDB.objects.all()
    total = qs.count()
    utils = PDBs(options["pdbs_dir"])
    with tqdm(qs, total=total) as pbar:
        for pdb in pbar:
            pbar.set_description(pdb.code)
            try:
                fpocket2sql = FPocket2SQL()
                fpocket2sql.create_or_get_pocket_properties()
                fpocket2sql.load_pdb(pdb.code)
                # use the absolute tmp path computed above
                fpocket2sql.run_fpocket(tmp, pdb_path=utils.pdb_path(pdb.code),
                                        pockets_path=utils.pdb_pockets_path(pdb.code),
                                        force=options["force"])
                fpocket2sql.load_pockets()
                # res.delete_dir()
            except IOError as ex:
                traceback.print_exc()
                self.stderr.write("error processing pockets from %s: %s" % (pdb.code, str(ex)))
            except Exception as ex:
                traceback.print_exc()
                raise CommandError(ex)
def __init__(self, pdb_dir="/data/databases/pdb/"):
    self.utils = PDBs(pdb_dir)
    self.seqs_path = "/tmp/seq.faa"
    self.aln_path = "/tmp/msa.faa"
    self.ref_seq = None
    self.pdbfile = None
    self.pdb_data = defaultdict(dict)
def handle(self, *args, **options):
    if options["verbose"] == 1:
        import logging
        logging.basicConfig(level=logging.DEBUG)
    pdbs = PDBs(options["pdbs_dir"])
    pdbs.url_pdb_entries = options["entries_url"]
    if not os.path.exists(options["entries_path"]):
        pdbs.download_pdb_entries()
    pdbio = PDBIO(options['pdbs_dir'] + "/", options['entries_path'], options['tmp'])
    pdbio.init()
    try:
        pdbs.update_pdb(options['code'])
        pdbio.process_pdb(options['code'], force=options['force'],
                          pocket_path=pdbs.pdb_pockets_path(options['code']),
                          pdb_path=pdbs.pdb_path(options['code']))
    except IOError as ex:
        traceback.print_exc()
        self.stderr.write("error processing pockets from %s: %s" % (options['code'], str(ex)))
    except Exception as ex:
        traceback.print_exc()
        raise CommandError(ex)
def pdbs_seq_for_modelling(self, out_fasta=None, pdbsIter=None, reuse_previous=None):
    if pdbsIter is None:
        pdbsIter = PDBs(self.pdb_dir)
    if not out_fasta:
        out_fasta = self.pdb_dir + "processed/seqs_from_pdb.fasta"
    pdb_codes = {x.lower(): 1 for x in self.entries_df().IDCODE}
    reuse = defaultdict(list)
    if reuse_previous:
        for x in bpio.parse(reuse_previous, "fasta"):
            pdb = x.id.split("_")[0]
            reuse[pdb].append(x)
    reuse = dict(reuse)
    pdblist = list(pdbsIter)
    with open(out_fasta, "w") as out_fasta_handle:
        for (pdb, pdb_file_path) in tqdm(pdblist):
            if pdb in pdb_codes:
                if pdb in reuse:
                    bpio.write(reuse[pdb], out_fasta_handle, "fasta")
                else:
                    self.seq_from_pdb(out_fasta_handle, pdb, pdb_file_path)
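# The reuse file above is expected to contain FASTA records whose IDs start
# with the PDB code followed by an underscore (e.g. "1xyz_A"). A minimal
# sketch of that grouping convention, using Biopython's SeqIO directly
# (the source aliases it as bpio); the file path is a placeholder:
from collections import defaultdict
from Bio import SeqIO

def group_by_pdb(fasta_path):
    """Group FASTA records by the PDB-code prefix of their ID."""
    groups = defaultdict(list)
    for record in SeqIO.parse(fasta_path, "fasta"):
        pdb_code = record.id.split("_")[0]  # "1xyz_A" -> "1xyz"
        groups[pdb_code].append(record)
    return dict(groups)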
def load_pdb_pocket(self, pdb, pdb_dir="/data/databases/pdb/"):
    utils = PDBs(pdb_dir)
    if not os.path.exists(utils.pdb_pockets_path(pdb)):
        # no cached pocket file: fetch the structure and run fpocket
        utils.update_pdb(pdb)
        fpocket = FPocket(utils.pdb_path(pdb))
        result = fpocket.hunt_pockets()
        mkdir(os.path.dirname(utils.pdb_pockets_path(pdb)))
        result.save(utils.pdb_pockets_path(pdb))
    with open(utils.pdb_pockets_path(pdb)) as h:
        result = json.load(h)
    self.pdb_data[pdb]["pockets"] = result
    return self.pdb_data[pdb]["pockets"]
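# Hypothetical usage sketch of load_pdb_pocket. The no-arg StructureAnnotator
# constructor and the PDB code "1xyz" are assumptions; the pocket-dict layout
# ("number", "residues", "properties" -> "Druggability Score") is inferred
# from load_pdb_ann further down, not from documented API.
annotator = StructureAnnotator()
pockets = annotator.load_pdb_pocket("1xyz", pdb_dir="/data/databases/pdb/")
for pocket in pockets:
    score = pocket["properties"]["Druggability Score"]
    print(pocket["number"], score, len(set(pocket["residues"])))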
def add_arguments(self, parser):
    pdbs = PDBs()
    parser.add_argument('--code', required=True, help="4 letter PDB code")
    parser.add_argument('--tmp', default="data/tmp/load_pdb")
    parser.add_argument('--pdbs_dir', default="/data/databases/pdb/divided/")
    parser.add_argument('--entries_path', default="/data/databases/pdb/entries.idx")
    parser.add_argument('--entries_url', default=pdbs.url_pdb_entries)
def add_arguments(self, parser):
    pdbs = PDBs()
    parser.add_argument('--pdbs_dir', default="data/pdb/")
    parser.add_argument('--entries_path', default=None)
    # Note the inverted semantics: store_false means only_annotated defaults
    # to True, and passing --only_annotated turns it off.
    parser.add_argument('--only_annotated', action='store_false',
                        help="download all PDBs; by default only cross-referenced PDBs are downloaded")
    parser.add_argument('--entries_url', default=pdbs.url_pdb_entries)
def add_arguments(self, parser):
    pdbs = PDBs()
    parser.add_argument('--code', required=True, help="4 letter PDB code")
    parser.add_argument('--tmp', default="data/tmp/load_pdb")
    parser.add_argument('--pdbs_dir', default="/data/databases/pdb/")
    parser.add_argument('--entries_path', default="/data/databases/pdb/entries.idx")
    parser.add_argument('--entries_url', default=pdbs.url_pdb_entries)
    parser.add_argument('--force', action="store_true")
    parser.add_argument('--verbose', default=0, choices=[0, 1], type=int)
def handle(self, *args, **options):
    pdbs = PDBs()
    pdbs.url_pdb_entries = options["entries_url"]
    if not os.path.exists(options["entries_path"]):
        pdbs.download_pdb_entries()
    pdbio = PDBIO(options['pdbs_dir'] + "/", options['entries_path'], options['tmp'])
    pdbio.init()
    try:
        pdbio.process_pdb(options['code'])
    except IOError as ex:
        traceback.print_exc()
        self.stderr.write("error processing pockets from %s: %s" % (options['code'], str(ex)))
    except Exception as ex:
        traceback.print_exc()
        raise CommandError(ex)
def handle(self, *args, **options):
    pdbs_utils = PDBs(pdb_dir=options['pdbs_dir'])
    pdbs_utils.url_pdb_entries = options["entries_url"]
    if not options['entries_path']:
        options['entries_path'] = options['pdbs_dir'] + "/entries.idx"
    # re-download the entries index if it is missing or older than a week
    if not os.path.exists(options["entries_path"]) or (
            datetime.now() - datetime.fromtimestamp(
                os.path.getctime(options["entries_path"]))).days > 7:
        pdbs_utils.download_pdb_entries()
    pdb2sql = PDB2SQL(options['pdbs_dir'], options['entries_path'])
    pdb2sql.load_entries()
    if options["only_annotated"]:
        self.stderr.write("only_annotated option activated by default")
        from bioseq.models.Dbxref import Dbxref
        pdbs = [(x.accession.lower(), pdbs_utils.pdb_path(x.accession.lower()))
                for x in Dbxref.objects.filter(dbname="PDB")]
    else:
        pdbs = list(tqdm(iterpdbs(options['pdbs_dir'])))
    # 4zux 42 mer 2lo7 ("5my5","/data/databases/pdb/divided/my/pdb5my5.ent")
    # ("4zu4", "/data/databases/pdb/divided/zu/pdb4zu4.ent")
    with tqdm(pdbs) as pbar:
        for code, pdb_path in pbar:
            code = code.lower()
            if PDBsWS.is_obsolete(code):
                self.stderr.write(f"{code} entry is obsolete")
                continue
            try:
                pdb_path = pdbs_utils.update_pdb(code)
            except KeyboardInterrupt:
                raise
            except Exception:
                self.stderr.write("PDB %s could not be downloaded" % code)
                continue
            if PDB.objects.filter(code=code).exists():
                self.stderr.write("PDB %s already exists" % code)
                continue
            pbar.set_description(code)
            try:
                pdb2sql.create_pdb_entry(code, pdb_path)
                pdb2sql.update_entry_data(code, pdb_path)
            except KeyboardInterrupt:
                raise
            except Exception as ex:
                import traceback
                traceback.print_exc()
                raise CommandError(ex)
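# PDBsWS.is_obsolete is used above as a black box. A minimal sketch of what
# such a check might look like, assuming the public PDBe entry-status
# endpoint; the real PDBsWS implementation is not shown here and may differ.
import requests

def is_obsolete(code: str) -> bool:
    """Report whether a PDB entry is flagged obsolete by the PDBe status API."""
    url = f"https://www.ebi.ac.uk/pdbe/api/pdb/entry/status/{code.lower()}"
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200:
        return False  # unknown entries are treated as non-obsolete in this sketch
    data = resp.json()
    # assumed response shape: {"<code>": [{"status_code": "REL" | "OBS", ...}]}
    return any(e.get("status_code") == "OBS" for e in data.get(code.lower(), []))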
def handle(self, *args, **options):
    pdbs = PDBs(pdb_dir=options['pdbs_dir'])
    pdbs.url_pdb_entries = options["entries_url"]
    if not os.path.exists(options["entries_path"]):
        pdbs.download_pdb_entries()
    pdb2sql = PDB2SQL(options['pdbs_dir'], options['entries_path'])
    pdb2sql.load_entries()
    if options["only_annotated"]:
        self.stderr.write("only_annotated option activated by default")
        from bioseq.models.Dbxref import Dbxref
        pdbs = [(x.accession.lower(), pdbs.pdb_path(x.accession.lower()))
                for x in Dbxref.objects.filter(dbname="PDB")]
    else:
        pdbs = list(tqdm(iterpdbs(options['pdbs_dir'])))
    # 4zux 42 mer 2lo7 ("5my5","/data/databases/pdb/divided/my/pdb5my5.ent")
    # ("4zu4", "/data/databases/pdb/divided/zu/pdb4zu4.ent")
    with tqdm(pdbs) as pbar:
        for code, pdb_path in pbar:
            code = code.lower()
            try:
                pdb_path = pdb2sql.download(code)
            except Exception:
                self.stderr.write("PDB %s could not be downloaded" % code)
                continue
            if PDB.objects.filter(code=code).exists():
                self.stderr.write("PDB %s already exists" % code)
                continue
            pbar.set_description(code)
            try:
                pdb2sql.create_pdb_entry(code, pdb_path)
                pdb2sql.update_entry_data(code, pdb_path)
            except Exception as ex:
                raise CommandError(ex)
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter) parser.add_argument("-i", "--data_path", default='/data/databases/pdb/') parser.add_argument( "-o", "--output_path", default='/data/databases/pdb/processed/domain_analisis') args = parser.parse_args() domains = defaultdict(lambda: []) for seq in bpio.parse(args.data_path + "/processed/domains.fasta", "fasta"): domains["_".join(seq.id.split("_")[0:2])].append(seq.id.split("_")) for (code, pdb_path) in tqdm(PDBs(pdb_dir=args.data_path)): pdb_model = PDB(code=code) pdb_model.save() p = PDBParser(PERMISSIVE=True, QUIET=True) try: for chain in p.get_structure(code, pdb_path).get_chains(): chains_dir = args.output_path + "/chains/" + code[1:3] + "/" mkdir(chains_dir) cs = ChainSplitter(chains_dir) process_chain(pdb_path, code, chain.id, pdb_model) for (_, _, res_start, res_end, dn, dn_start, dn_end) in domains[code + "_" + chain.id]: # 1r9d_A_2_787_PF02901.14_8_648
class StructureVariant:
    def __init__(self, pdb_dir="/data/databases/pdb/"):
        self.utils = PDBs(pdb_dir)
        self.seqs_path = "/tmp/seq.faa"
        self.aln_path = "/tmp/msa.faa"
        self.ref_seq = None
        self.pdbfile = None
        self.pdb_data = defaultdict(dict)

    def load_msa(self, input_sequence, pdb_code, pdb_chain=None):
        pdb_code = pdb_code.lower()
        self.utils.update_pdb(pdb_code)
        self.ref_seq = bpio.read(input_sequence, "fasta")
        self.pdbfile = PDBFile(pdb_code, self.utils.pdb_path(pdb_code))
        with open(self.seqs_path, "w") as h:
            bpio.write(self.ref_seq, h, "fasta")
            bpio.write(self.pdbfile.seq(selected_chain=pdb_chain), h, "fasta")
        cmd = docker_wrap_command(
            f'mafft --quiet --localpair --maxiterate 1000 {self.seqs_path} > {self.aln_path} ')
        execute(cmd)
        self.msa = MSAMap.from_msa(self.aln_path)
        self.res_map = self.pdbfile.residues_map(pdb_chain)

    def residues_from_pos(self, pos):
        pos_data = []
        for sample in self.msa.samples():
            if sample != self.ref_seq.id:
                pdb, chain = sample.split("_")[:2]
                if self.msa.exists_pos(self.ref_seq.id, pos, sample):
                    msa_pos = self.msa.pos_seq_msa_map[self.ref_seq.id][pos]
                    sample_pos = self.msa.pos_from_seq(self.ref_seq.id, pos, sample)
                    line = {
                        "pos": pos + 1,
                        "ref": self.msa.seqs[self.ref_seq.id][msa_pos],
                        "alt": self.msa.seqs[sample][msa_pos],
                        "pdb": pdb,
                        "chain": chain,
                        "resid": str(self.res_map[chain][sample_pos][1]),
                        "icode": str(self.res_map[chain][sample_pos][2]),
                        "pdb_pos": sample_pos
                    }
                    pos_data.append(line)
        return pos_data

    def residues_from_aln_seq(self, input_sequence, pdb_code, pdb_chain=None):
        self.load_msa(input_sequence, pdb_code, pdb_chain)
        variants = [(k, v) for k, v in sorted(self.msa.variants(self.ref_seq.id).items(),
                                              key=lambda x: int(x[0].split("_")[1]))]
        output = []
        for ref_pos, alt_samples in variants:
            ref, pos = ref_pos.split("_")
            pos = int(pos)
            for alt, samples in alt_samples.items():
                if alt != self.msa.gap_code:
                    pos_data = self.residues_from_pos(pos)
                    output += pos_data
        return pd.DataFrame(output)

    def annotate_resid(self, pdb: str, resid: str, structure_annotator: StructureAnnotator):
        pdb = pdb.lower()
        data = {}
        if pdb not in self.pdb_data:
            self.load_pdb_ann(pdb, structure_annotator)
        if str(resid) in self.pdb_data[pdb]["binding"]:
            data["binding"] = self.pdb_data[pdb]["binding"][str(resid)]
        if str(resid) in self.pdb_data[pdb]["pockets"]:
            data["pockets"] = self.pdb_data[pdb]["pockets"][str(resid)]
        return data

    def load_pdb_ann(self, pdb, structure_annotator: StructureAnnotator):
        binding_data = structure_annotator.load_pdb_binding_data(pdb)
        binding_dict = defaultdict(list)
        for site in binding_data:
            for site_res in site["site_residues"]:
                res = str(site_res["residue_number"]) + (site_res.get(
                    "author_insertion_code", "") or "")
                binding_dict[res].append({
                    "site_id": site["site_id"],
                    "details": site["details"],
                    "ligands": [{c: x[c]
                                 for c in ["chain_id", "author_residue_number", "chem_comp_id"]}
                                for x in site["site_residues"]
                                if x["chem_comp_id"] in binding_dict and (x["chem_comp_id"] != "HOH")]
                })
        self.pdb_data[pdb]["binding"] = binding_dict
        pockets_data = structure_annotator.load_pdb_pocket(pdb, self.utils.pdb_dir)
        pockets_dict = defaultdict(list)
        for pocket in pockets_data:
            for residue in set(pocket["residues"]):
                pockets_dict[residue].append({
                    "pocket_num": pocket["number"],
                    "druggability": pocket["properties"]['Druggability Score']
                })
        self.pdb_data[pdb]["pockets"] = dict(pockets_dict)

    def annotate_residue_list(self, df, structure_annotator: StructureAnnotator):
        """
        :param df: columns=["pdb", "chain", "resid", "alt", "ref", "pos"],
            or the DataFrame generated by residues_from_aln_seq
        :return:
        """
        output = {}
        for i, r in df.iterrows():
            output[f'{r.pdb}_{r.chain}_{r.resid}_{r.alt}'] = self.annotate_resid(
                r.pdb, str(r.resid), structure_annotator)
        return output
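# A hedged usage sketch of the class above; the module path and input file
# names are assumptions, but the call sequence follows the methods as defined.
# residues_from_aln_seq aligns the query against the PDB chain sequence with
# MAFFT and returns one DataFrame row per variant position.
from SNDG.Structure.StructureVariant import StructureVariant  # assumed module path

sv = StructureVariant(pdb_dir="/data/databases/pdb/")
df = sv.residues_from_aln_seq("query.faa", "1xyz", pdb_chain="A")  # placeholder inputs
print(df[["pos", "ref", "alt", "pdb", "chain", "resid"]].head())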
def old_or_inexistent(filepath, period=30):
    return not os.path.exists(filepath) or (
        ((time.time() - os.path.getatime(filepath)) / 60 / 60 / 24) > period)

# os.environ["http_proxy"] = "http://proxy.fcen.uba.ar:8080"
# os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"

mkdir("/data/pdb/")
download_file("ftp://ftp.wwpdb.org/pub/pdb/derived_data/index/entries.idx",
              "/data/pdb/entries.idx", ovewrite=True)  # "ovewrite" (sic) is the keyword expected by download_file here
pdbs = PDBs("/data/pdb/")
pdbs.download_pdb_seq_ses()
pdbs.update_pdb_dir()
mkdir("/data/pdb/processed/")
pdbs.pdbs_seq_for_modelling()
execute("makeblastdb -dbtype prot -in /data/pdb/processed/seqs_from_pdb.fasta")

if old_or_inexistent("/data/uniprot/uniref/uniref90/uniref90.fasta"):
    mkdir("/data/uniprot/uniref/uniref90")
    download_file(
        "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz",
        "/data/uniprot/uniref/uniref90/uniref90.fasta.gz", ovewrite=True)
    execute("gunzip /data/uniprot/uniref/uniref90/uniref90.fasta.gz")
if old_or_inexistent("/data/uniprot/uniref/uniref90/uniref90.fasta.pal"):
def main(argv=None):  # IGNORE:C0111
    '''Command line options.'''
    if argv is None:
        argv = sys.argv
    else:
        sys.argv.extend(argv)

    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")
    # parser.add_argument("-dir", "--structs_dir", required=True)
    parser.add_argument("-db", "--database_name", default='pdb')
    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument("--csa", default='/data/databases/csa/csa.txt')
    parser.add_argument("--hmm", default='/data/databases/pdb/pdb_seq_res.hmm')
    parser.add_argument("--pdbs", default='/data/databases/pdb/')
    parser.add_argument("--distances", default='/data/databases/pdb/processed/distances.tbl')
    args = parser.parse_args()

    # pdbs = PDBs()
    # pdbs.update('/data/pdb/divided/')

    BioMongoDB(args.database_name)  # args.db_host
    # update_quaternary()
    # # clusters cd hit
    # update_clusters()
    # # residues near ligands --> metal drug/cofactor

    if not os.path.exists(args.csa):
        sys.stderr.write("%s not found. Download it from %s" % (
            args.csa,
            "http://www.ebi.ac.uk/thornton-srv/databases/CSA/downloads/CSA_2_0_121113.txt"))
        sys.exit(1)
    if not os.path.exists(args.pdbs):
        sys.stderr.write("%s not found. Specify where the pdbs/divided directory is" % args.pdbs)
        sys.exit(1)
    if not os.path.exists(args.distances):
        sys.stderr.write("%s not found. Run the extended_domain.py script to create it." % args.distances)
        sys.exit(1)

    pdbUtils = PDBs(pdb_dir=args.pdbs)
    print("Update Quaternary")
    update_quaternary(pdbUtils)
    print("Update CSA")
    update_csa(args.csa)
    print("Update CYS/TYR")
    free_cys_tyr(pdbUtils)
    print("Update Important Pfam")
    important_pfam(args.hmm)
    print("Update Binding residues")
    update_binding_residues(args.distances)
    _log.info("update pdb properties finished!!")
parser.add_argument("-p", "--dbpass", required=True) parser.add_argument("-i", "--pdb_dir", default="/data/databases/pdb/") parser.add_argument("-db", "--dbname", default="pdbdb") parser.add_argument("-u", "--dbuser", default="root") args = parser.parse_args() from peewee import MySQLDatabase mysql_db = MySQLDatabase(args.dbname, user=args.dbuser, password=args.dbpass) mysql_db.close() sqldb.initialize(mysql_db) pdb_utils = PDBs(pdb_dir=args.pdb_dir) df = pdb_utils.entries_df() pdbs = list(pdb_utils) with tqdm(pdbs) as pbar: for (code, pdb_path) in pbar: mysql_db.connect(reuse_if_open=True) pbar.set_description(code) try: entry = df[df.IDCODE == code.upper()].iloc[0] except IndexError: continue pdb_model = PDB(code=code, experiment=str(entry.EXPERIMENT)) try: resolution = float(entry.RESOLUTION)
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter) parser.add_argument("-p", "--dbpass", required=True) parser.add_argument("-i", "--pdb_dir", default="/data/databases/pdb/") parser.add_argument("-db", "--dbname", default="pdbdb") parser.add_argument("-u", "--dbuser", default="root") args = parser.parse_args() from peewee import MySQLDatabase mysql_db = MySQLDatabase(args.dbname, user=args.dbuser, password=args.dbpass) sqldb.initialize(mysql_db) pdb_utils = PDBs(pdb_dir=args.pdb_dir) props = {x.name: x for x in Property.select()} pdbs = list(pdb_utils) with tqdm(pdbs) as pbar: for (code, pdb_path) in pbar: pdb_model = PDB.select().where(PDB.code == code).first() p = PDBParser(PERMISSIVE=True, QUIET=True) try: for chain in p.get_structure(code, pdb_path).get_chains(): chains_dir = args.pdb_dir + "/chains/" + code[1:3] + "/" mkdir(chains_dir) cs = ChainSplitter(chains_dir) process_chain(pdb_path, code, chain.id, pdb_model, props)
def add_arguments(self, parser): pdbs = PDBs() parser.add_argument('--pdbs_dir', default="data/pdb/") parser.add_argument('--entries_path', default="data/pdb/entries.idx") parser.add_argument('--only_annotated', action='store_false') parser.add_argument('--entries_url', default=pdbs.url_pdb_entries)
        return self.pdbs_dir + pdb[1:3] + "/pdb" + pdb + ".ent"

    @staticmethod
    def sequence_from_residues(residues):
        return "".join([
            protein_letters_3to1[res.get_resname()[0] + res.get_resname()[1:3].lower()]
            for res in residues
        ])


if __name__ == '__main__':
    from SNDG import init_log
    import argparse
    from SNDG.Structure.PDBs import PDBs

    parser = argparse.ArgumentParser(description='PDB Update utils')
    init_log()
    pdbs = PDBs(pdb_dir="/data/databases/pdb/")
    # os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"
    # pdbs.download_pdb_seq_ses()
    pdbs.download_pdb_entries()
    pdbs.update_pdb_dir()

    # from SNDG.Structure.PDBs import PDBs
    # pdbs = PDBs(pdb_dir="/data/databases/pdb/")
    # pdbs.pdbs_seq_for_modelling("/data/databases/pdb/processed/seqs_from_pdb.fasta")
    # pepe = pdbs.entries_df()
    # print pepe
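# sequence_from_residues maps three-letter residue names ("ALA") onto the
# title-case keys of Biopython's protein_letters_3to1 ("Ala" -> "A").
# A small sketch of feeding it a chain parsed with Bio.PDB; the file path is
# a placeholder, and the assumption that the static method lives on PDBs
# follows from the surrounding class context.
from Bio.PDB import PDBParser
from Bio.Data.IUPACData import protein_letters_3to1

structure = PDBParser(PERMISSIVE=True, QUIET=True).get_structure("1xyz", "pdb1xyz.ent")
chain = next(structure.get_chains())
residues = [r for r in chain if r.id[0] == " "]  # keep standard residues; hetero atoms would KeyError
print(PDBs.sequence_from_residues(residues))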
"--verbose", dest="verbose", action="count", help="set verbosity level [default: %(default)s]") parser.add_argument("-host", "--db_host", default='127.0.0.1') parser.add_argument("-db", "--db_name", default='tdr') parser.add_argument("--pdb_entries", default='/data/databases/pdb/entries.idx') parser.add_argument("--pdbs", default='/data/databases/pdb/') args = parser.parse_args() BioMongoDB(args.db_name) pdbUtils = PDBs(pdb_dir=args.pdbs) db = MongoClient(args.db_host)["pdb"] col_name = "pdb" if not os.path.exists(args.pdb_entries): sys.stderr.write("%s does not exists" % args.pdb_entries) sys.exit(1) """ collection = SeqCollection.objects(name=col_name) if len(collection): collection = collection.get() else: collection = SeqCollection(name=col_name, description="Protein Data Bank", organism="?") collection.save() """
parser = argparse.ArgumentParser(description='PDB utils')
subparsers = parser.add_subparsers(help='commands', description='valid subcommands', dest='command')

update_pdb = subparsers.add_parser('update', help='update the local PDB mirror')
update_pdb.add_argument('-i', '--pdbs_dir', help="pdbs_directory", default="/data/databases/pdb/")
# update_pdb = subparsers.add_parser('getpdb', help='List contents')
# update_pdb.add_argument('-i', '--pdb_code', help="4 letter code", required=True)
# update_pdb.add_argument('-o', '--output_file', help="output file")

args = parser.parse_args()

if args.command == "update":
    # remember to configure the FTP proxy if needed
    pdbs = PDBs(pdb_dir=args.pdbs_dir)
    pdbs.download_pdb_entries()
    pdbs.update_pdb_dir()
    pdbs.download_pdb_seq_ses()
    sys.exit(0)

# os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"
# pdbs.download_pdb_seq_ses()
# from SNDG.Structure.PDBs import PDBs
# pdbs = PDBs(pdb_dir="/data/databases/pdb/")
# pdbs.pdbs_seq_for_modelling("/data/databases/pdb/processed/seqs_from_pdb.fasta")
# pepe = pdbs.entries_df()
# print pepe
                    help='pdb files directory')
parser.add_argument('--tmp_dir', default=mkdtemp(), help='temporary directory')
parser.add_argument('--cpus', default=1, type=int, help='cpu cores to use')
# parser.add_argument('--max_alns', default=3, type=int, help='max different templates to use')
parser.add_argument('-t', "--templates_to_use", default=3, type=int,
                    help='max amount of templates to use.')
args = parser.parse_args()

pdbs_dir = args.pdbs_dir + ("/" if args.pdbs_dir[-1] != "/" else "")
mkdir(f'{pdbs_dir}/divided')
pdb_utils = PDBs(pdbs_dir)

# pbar = tqdm(args.alns)
sys.stderr.write(str(args))
sys.stderr.write('reading alignment file\n')
alns = [{
    "aln_file": x,
    "templates2use": args.templates_to_use,
    "output_dir": args.output_dir,
    "tmp_dir": args.tmp_dir
} for x in args.alns]

mkdir(args.output_dir)
assert os.path.exists(args.output_dir), f'"{args.output_dir}" could not be created'
sys.stderr.write('processing alignment files\n')
help="set verbosity level [default: %(default)s]") parser.add_argument("-host", "--db_host", default='127.0.0.1') parser.add_argument("-db", "--db_name", default='tdr') parser.add_argument( "--pdb_entries", default='/data/databases/pdb/entries.idx') parser.add_argument( "--pdbs", default='/data/databases/pdb/') parser.add_argument( "--pdb_timeout", default=60,type=int) parser.add_argument( "--review_pockets", action="store_true") parser.add_argument("--organism_name", default=None) args = parser.parse_args() mdb = BioMongoDB(args.db_name,host=args.db_host) pdbUtils = PDBs(pdb_dir=args.pdbs) db = MongoClient(args.db_host)["pdb"] col_name = "pdb" if not os.path.exists(args.pdb_entries): sys.stderr.write("%s does not exists" % args.pdb_entries) sys.exit(1) """ collection = SeqCollection.objects(name=col_name) if len(collection): collection = collection.get() else: collection = SeqCollection(name=col_name, description="Protein Data Bank", organism="?") collection.save()