Exemplo n.º 1
0
def main(argv=None):  # IGNORE:C0111
    """Tag proteins and contigs with the taxonomy keywords of their genome.

    Parses the command line, connects to MongoDB and to the MySQL taxonomy
    database, then walks every ``sequence_collection`` document that has a
    ``tax`` entry. Each such genome is handed to ``update_element`` and, when
    its taxon could be resolved, the taxon keywords are written to the
    ``proteins`` and ``contig_collection`` documents whose ``organism``
    equals the genome name.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` makes argparse fall back to ``sys.argv[1:]``.
    """
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version,
                                                     program_build_date)
    program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s

  Created by user_name on %s.
  Copyright 2015 BIA. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    parser = ArgumentParser(description=program_license,
                            formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v",
                        "--verbose",
                        dest="verbose",
                        action="count",
                        help="set verbosity level [default: %(default)s]")

    parser.add_argument("-db_structure",
                        "--db_structure",
                        help="Mongo structure db",
                        default='pdb')
    parser.add_argument("-db_genome",
                        "--db_genome",
                        help="Mongo proteins db",
                        default='saureus')
    # "--overwrite" keeps its historical default (True). On its own it could
    # never be switched off, so "--no-overwrite" is provided to reach the
    # incremental branch below without changing the default behaviour.
    parser.add_argument('-o', '--overwrite', default=True, action='store_true',
                        help="re-index documents that already have an index "
                        "entry [default: %(default)s]")
    parser.add_argument('--no-overwrite',
                        dest='overwrite',
                        action='store_false',
                        help="only process documents without an index entry")
    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument('-V',
                        '--version',
                        action='version',
                        version=program_version_message)

    # Honour the argv parameter; parse_args(None) still reads sys.argv[1:].
    args = parser.parse_args(argv)

    # One client is enough for both databases on the same host.
    client = pymongo.MongoClient(args.db_host)
    # Structure database: only the disabled block further down reads it.
    db = client[args.db_structure]
    # NOTE(review): the return value is discarded; presumably this sets up the
    # connection used by Taxonomy.objects() below - confirm.
    BioMongoDB(args.db_genome)
    logging.getLogger("peewee").setLevel(logging.WARN)
    from peewee import MySQLDatabase
    from SNDG.BioMongo.Process.Taxon import tax_db
    # NOTE(review): MySQL credentials are hard-coded here; consider taking
    # them from the command line or the environment.
    tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito"))

    # Cache every taxon under its NCBI id and under each lower-cased name.
    tax_cache = {}
    for t in Taxonomy.objects().no_cache():
        # Registered once per taxon, so taxa without names are cached too.
        tax_cache[t.ncbi_taxon_id] = t
        for n in t.names:
            tax_cache[n.lower()] = t
    query = {}
    idx_name = "sndg_index"
    if not args.overwrite:
        # Incremental run: restrict to documents that lack the index field.
        query = {idx_name: {"$exists": 0}}
    # total = db.structures.count(query)
    # with tqdm(db.structures.find(query, {"organism": 1}), total=total) as pbar:
    #     for struct in pbar:
    #         if "organism" in struct:
    #             for org in [x for x in set(struct["organism"].lower().split(";") + struct["organism"].lower().split(",") +
    #                   [struct["organism"].lower().split("(")[0]]) if ";" not in x and "," not in x and "(" not in x]:
    #                 org = org.strip()
    #                 val = get_or_load_by_name(org, tax_cache)
    #                 if val:
    #                     db.structures.update({"_id": struct["_id"]}, {"$set": {idx_name + ".tax": list(val.keywords)}})
    #                 else:
    #                     tax_cache[org.lower()] = None
    #                     _log.warn(org + " not found")

    # db.structures.update({"ligands.0":{"$exists",1}},  {"$set": {idx_name + ".ligand": 1}},multi=True);

    db = client[args.db_genome]

    # total = db.barcodes.count(query)
    # with tqdm(db.barcodes.find(query, {"tax": 1}), total=total) as pbar:
    #     for barcode in pbar:
    #         val = get_or_load_by_id(barcode["tax"], tax_cache)
    #         update_element(val, db.barcodes, barcode, idx_name, tax_cache,barcode["tax"])

    total = db.sequence_collection.count(query)
    genomes = db.sequence_collection.find(
        query,
        {
            "name": 1,
            "tax": 1,
            "assemblyStatus": 1
        },
        # The per-genome updates can be slow; keep the server-side cursor
        # from expiring while they run.
        no_cursor_timeout=True)
    with tqdm(genomes, total=total) as pbar:
        for genome in pbar:
            if "tax" not in genome:
                continue
            tid = genome["tax"]["tid"]
            val = get_or_load_by_id(int(tid), tax_cache)
            update_element(val, db.sequence_collection, genome, idx_name,
                           tax_cache, tid)
            if not val:
                continue

            # Propagate the taxon keywords to everything that belongs to
            # this genome (matched through the "organism" field).
            select = {"organism": genome["name"]}
            kws = list(val.keywords)
            db.proteins.update(select,
                               {"$set": {
                                   idx_name + ".tax": kws
                               }},
                               multi=True)
            db.proteins.update(
                select, {"$addToSet": {
                    "keywords": {
                        "$each": kws
                    }
                }},
                multi=True)
            db.contig_collection.update(select, {
                "$set": {
                    idx_name + ".tax": kws,
                    idx_name + ".assemblyStatus":
                    genome["assemblyStatus"]
                }
            },
                                        multi=True)
            db.contig_collection.update(
                select, {"$addToSet": {
                    "keywords": {
                        "$each": kws
                    }
                }},
                multi=True)

    print("Ok")
Exemplo n.º 2
0
                        sf.qualifiers["locus_tag"] = [
                            x.id + "_" +
                            sf.qualifiers["locus_tag"][0].replace(tag, "")
                        ]
            contigs.append(x)
        GFF.write(contigs, h, False)


if __name__ == '__main__':
    # Script entry point: set up logging and the database connections that
    # the rest of this block relies on.
    init_log()

    # Silence peewee messages below WARN level.
    logging.getLogger("peewee").setLevel(logging.WARN)
    from peewee import MySQLDatabase
    from SNDG.BioMongo.Process.Taxon import tax_db

    # Bind the taxonomy model database to a local MySQL "bioseqdb" schema.
    # NOTE(review): credentials are hard-coded; consider reading them from
    # the command line or the environment.
    tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito"))
    # Mongo database "saureus" on port 27017.
    mdb = BioMongoDB("saureus", port=27017)

    # mdb.delete_seq_collection("ILEX_PARA2")


    def extract_annotation_feature(feature):
        """Pick the feature that carries the annotation.

        For a ``gene`` feature with at least one ``mRNA`` sub-feature the
        first such mRNA is returned; in every other case the feature itself
        is returned unchanged.
        """
        transcripts = [
            child for child in feature.sub_features if child.type == "mRNA"
        ]
        if feature.type != "gene" or not transcripts:
            return feature
        return transcripts[0]

    def accept_protein_feature(feature):
        """Tell whether *feature* is a gene whose first sub-feature is an mRNA.

        Always returns a real ``bool``. The previous chained ``and``
        expression leaked the (empty or ``None``) ``sub_features`` value as
        the result when a gene had no sub-features; truthiness is unchanged.
        """
        if feature.type != "gene":
            return False
        children = feature.sub_features
        return bool(children) and children[0].type == "mRNA"

    # prot_dict = bpio.to_dict(bpio.parse("/data/organismos/ILEX_PARA/contigs/ncbi_IP4.faa","fasta"))
    def extract_sequence(c, f):
    # Driver fragment: parse the command line, connect to the Mongo and MySQL
    # databases, make sure the assembly is not loaded yet and fetch its NCBI
    # summary record.
    init_log()

    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-p", "--dbpass", required=True)
    parser.add_argument("-a", "--assemblyAccession", required=True)
    parser.add_argument("-mdb", "--mongodbname", required=True)
    parser.add_argument("-mydbtax", "--mysqldbtaxname", default="bioseqdb")
    # type=int lets argparse validate the value and report bad input as a
    # usage error; the default is already an int.
    parser.add_argument("--cpus",
                        type=int,
                        default=multiprocessing.cpu_count())
    parser.add_argument("-mydbunip", "--mysqldbunip", default="unipmap")
    parser.add_argument("-myu", "--mysqldbuser", default="root")

    args = parser.parse_args()
    mdb = BioMongoDB(args.mongodbname)
    tax_db.initialize(
        MySQLDatabase(args.mysqldbtaxname,
                      user=args.mysqldbuser,
                      passwd=args.dbpass))
    ProteinAnnotator.connect_to_db(database=args.mysqldbunip,
                                   user=args.mysqldbuser,
                                   password=args.dbpass)

    # Explicit check instead of ``assert`` so the guard survives ``python -O``;
    # the exception type and message are kept as before.
    if mdb.seq_col_exists(args.assemblyAccession):
        raise AssertionError("assembly already exists")
    Entrez.email = "*****@*****.**"
    id_list = Entrez.read(
        Entrez.esearch(db="assembly", term=args.assemblyAccession,
                       retmax=1))["IdList"]
    if not id_list:
        # Same exception type as the former bare ``[0]`` lookup, but with a
        # message that names the accession that was not found.
        raise IndexError("no NCBI assembly found for accession %s" %
                         args.assemblyAccession)
    assembly_id = id_list[0]
    resource = Entrez.read(
        Entrez.esummary(db="assembly", id=assembly_id, validate=False))
    try: