def main(argv=None):  # IGNORE:C0111
    """Tag genomes, and their proteins and contigs, with taxonomy keywords.

    For every document in ``sequence_collection`` of the genome database that
    has a ``tax`` entry, the taxon is resolved through ``get_or_load_by_id``
    and the genome is passed to ``update_element``. When a taxon is found,
    every protein and contig whose ``organism`` equals the genome ``name``
    gets ``sndg_index.tax`` set to the taxon keywords and the same keywords
    added to its ``keywords`` array; contigs additionally receive
    ``sndg_index.assemblyStatus`` from the genome.

    Args:
        argv: Optional list of command line arguments (without the program
            name). ``None`` makes argparse read ``sys.argv[1:]``.
    """
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version,
                                                     program_build_date)

    # The short description is the second line of the entry script docstring.
    # Fall back to an empty string when the entry module has no docstring or
    # only a single line, instead of crashing before argument parsing.
    doc_lines = (__import__('__main__').__doc__ or "").split("\n")
    program_shortdesc = doc_lines[1] if len(doc_lines) > 1 else ""
    program_license = '''%s

  Created by user_name on %s.
  Copyright 2015 BIA. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    parser = ArgumentParser(description=program_license,
                            formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")
    # --db_structure is still accepted so existing command lines keep working,
    # although the structure indexing pass is currently disabled (see below).
    parser.add_argument("-db_structure", "--db_structure",
                        help="Mongo structure db", default='pdb')
    parser.add_argument("-db_genome", "--db_genome",
                        help="Mongo proteins db", default='saureus')
    parser.add_argument('-o', '--overwrite', default=True, action='store_true')
    # --overwrite defaults to True and is a store_true flag, so on its own it
    # can never be switched off; this companion flag makes the incremental
    # mode (only documents without the index field) reachable.
    parser.add_argument('--no-overwrite', dest='overwrite',
                        action='store_false',
                        help="only process documents that are not indexed yet")
    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument('-V', '--version', action='version',
                        version=program_version_message)
    # Honour the argv parameter; None falls back to sys.argv[1:].
    args = parser.parse_args(argv)

    # One client is enough: the structure and genome databases share a host.
    client = pymongo.MongoClient(args.db_host)
    # Instantiated only for its side effects.
    # NOTE(review): presumably this sets up the ODM connection used by
    # Taxonomy below; it is not given args.db_host -- confirm that is intended.
    BioMongoDB(args.db_genome)

    logging.getLogger("peewee").setLevel(logging.WARN)
    from peewee import MySQLDatabase
    from SNDG.BioMongo.Process.Taxon import tax_db
    # NOTE(review): the MySQL database name and credentials are hard-coded.
    tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito"))

    # Cache every taxon under each of its lower-cased names and its NCBI id.
    tax_cache = {}
    for taxon in Taxonomy.objects().no_cache():
        for taxon_name in taxon.names:
            tax_cache[taxon_name.lower()] = taxon
        tax_cache[taxon.ncbi_taxon_id] = taxon

    idx_name = "sndg_index"
    # Without overwrite, restrict the run to documents lacking the index field.
    query = {} if args.overwrite else {idx_name: {"$exists": 0}}

    # NOTE: the passes over db.structures (organism -> taxonomy, ligand flag)
    # and db.barcodes were commented out in the original code and are left
    # out here; only the genome pass below is active.
    db = client[args.db_genome]
    genomes = db.sequence_collection

    total = genomes.count(query)
    cursor = genomes.find(query,
                          {"name": 1, "tax": 1, "assemblyStatus": 1},
                          no_cursor_timeout=True)
    try:
        with tqdm(cursor, total=total) as pbar:
            for genome in pbar:
                if "tax" not in genome:
                    continue
                tax_id = genome["tax"]["tid"]
                val = get_or_load_by_id(int(tax_id), tax_cache)
                update_element(val, genomes, genome, idx_name, tax_cache,
                               tax_id)
                if not val:
                    continue

                select = {"organism": genome["name"]}
                kws = list(val.keywords)
                add_keywords = {"keywords": {"$each": kws}}
                # $set and $addToSet touch different fields, so they can be
                # sent together: one round trip per collection instead of two.
                db.proteins.update_many(select, {
                    "$set": {idx_name + ".tax": kws},
                    "$addToSet": add_keywords,
                })
                db.contig_collection.update_many(select, {
                    "$set": {
                        idx_name + ".tax": kws,
                        idx_name + ".assemblyStatus": genome["assemblyStatus"],
                    },
                    "$addToSet": add_keywords,
                })
    finally:
        # Cursors opened with no_cursor_timeout must be closed explicitly,
        # otherwise they stay open on the server if the loop is interrupted.
        cursor.close()

    print("Ok")
sf.qualifiers["locus_tag"] = [ x.id + "_" + sf.qualifiers["locus_tag"][0].replace(tag, "") ] contigs.append(x) GFF.write(contigs, h, False) if __name__ == '__main__': init_log() logging.getLogger("peewee").setLevel(logging.WARN) from peewee import MySQLDatabase from SNDG.BioMongo.Process.Taxon import tax_db tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito")) mdb = BioMongoDB("saureus", port=27017) # mdb.delete_seq_collection("ILEX_PARA2") def extract_annotation_feature(feature): mrnas = [f for f in feature.sub_features if f.type == "mRNA"] return mrnas[0] if feature.type == "gene" and len(mrnas) else feature def accept_protein_feature(feature): return feature.type == "gene" and feature.sub_features and feature.sub_features[ 0].type == "mRNA" # prot_dict = bpio.to_dict(bpio.parse("/data/organismos/ILEX_PARA/contigs/ncbi_IP4.faa","fasta")) def extract_sequence(c, f):
init_log() parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter) parser.add_argument("-p", "--dbpass", required=True) parser.add_argument("-a", "--assemblyAccession", required=True) parser.add_argument("-mdb", "--mongodbname", required=True) parser.add_argument("-mydbtax", "--mysqldbtaxname", default="bioseqdb") parser.add_argument("--cpus", default=multiprocessing.cpu_count()) parser.add_argument("-mydbunip", "--mysqldbunip", default="unipmap") parser.add_argument("-myu", "--mysqldbuser", default="root") args = parser.parse_args() args.cpus = int(args.cpus) mdb = BioMongoDB(args.mongodbname) tax_db.initialize( MySQLDatabase(args.mysqldbtaxname, user=args.mysqldbuser, passwd=args.dbpass)) ProteinAnnotator.connect_to_db(database=args.mysqldbunip, user=args.mysqldbuser, password=args.dbpass) assert not mdb.seq_col_exists( args.assemblyAccession), "assembly already exists" Entrez.email = "*****@*****.**" assembly_id = Entrez.read( Entrez.esearch(db="assembly", term=args.assemblyAccession, retmax=1))["IdList"][0] resource = Entrez.read( Entrez.esummary(db="assembly", id=assembly_id, validate=False)) try: