def save(self, file_path):
    mkdir(os.path.dirname(os.path.abspath(file_path)))
    with open(file_path, "w") as handle:
        json.dump(
            [{"number": p.pocket_num,
              "residues": p.residues,
              "as_lines": p.alpha_spheres,
              "atoms": p.atoms,
              "properties": p.properties}
             for p in self.pockets
             if p.properties["Druggability Score"] > 0.2],
            handle)
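# Usage sketch for save() above (hedged: assumes FPocket is importable from
# SNDG.Structure.FPocket, consistent with the SNDG imports in the other
# snippets here; the input PDB path and output path are illustrative):
from SNDG.Structure.FPocket import FPocket

result = FPocket("/tmp/pdb1xdn.ent").hunt_pockets()
result.save("/tmp/pockets/1xdn.json")  # only pockets with Druggability Score > 0.2 are written
result.delete_dir()  # clean up fpocket's working directory, as other snippets here do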
def trim(strains, source_dir, dst_dir, clip="", headcrop=13, quality=20,
         windowsize=4, minlen=36):
    """
    :param strains: strain name prefixes used to locate the paired *.gz read files
    :param source_dir: directory containing the raw reads
    :param dst_dir: output directory for the trimmed reads
    :param clip: adapter clipping spec, e.g. ILLUMINACLIP:../data/external/NexteraPE-PE.fa:2:30:10
    :param headcrop: number of bases to crop from the start of each read
    :param quality: sliding-window quality threshold
    :param windowsize: sliding-window size
    :param minlen: minimum read length to keep
    :return:
    """
    mkdir(dst_dir)
    with tqdm(strains) as pbar:
        for strain in pbar:
            filenames = [
                os.path.basename(x)
                for x in glob(source_dir + "/" + strain + "*.gz")
            ]
            FastQ.trim_pairs(filenames[0], filenames[1], source_dir, dst_dir,
                             clip, headcrop, quality, windowsize, minlen)
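# Usage sketch for trim() (hedged: strain names and directories are
# illustrative; assumes source_dir holds exactly two *.gz files per strain,
# e.g. sampleA_R1.fastq.gz and sampleA_R2.fastq.gz, so the glob pairs them):
trim(["sampleA", "sampleB"], "data/raw_reads", "data/trimmed",
     clip="ILLUMINACLIP:data/external/NexteraPE-PE.fa:2:30:10")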
def update_proteins(annotation_dir, proteome, seq_col_name, tax_id,
                    identity=0.9, cpus=multiprocessing.cpu_count(),
                    db_init=None):
    print(seq_col_name)
    if db_init:
        from SNDG.Sequence.ProteinAnnotator import PABase
        PABase.sqldb.initialize(db_init)
    mkdir(annotation_dir)
    out = annotation_dir + "/species_blast.tbl"

    tax = Tax.select().where(Tax.ncbi_taxon_id == tax_id).get()
    species_tax = None
    for tax in Tax.parents(tax):
        if tax.node_rank == "genus":
            species_tax = tax
            break
    tax_data = "/data/xomeq/tax/"
    species_fasta = tax_data + str(int(species_tax.ncbi_taxon_id)) + ".fasta"

    if not os.path.exists(out):
        if not os.path.exists(species_fasta):
            Uniprot.download_proteome_from_tax(str(species_tax.ncbi_taxon_id), tax_data)
        cmd = ("blastp -query %s -db %s -evalue 0.00001 -outfmt 6 -max_hsps 1 "
               "-qcov_hsp_perc 0.9 -num_threads %i -out %s")
        execute(cmd % (proteome, species_fasta, cpus, out))

    species_desc = {
        x.id.split("|")[1]: " ".join(x.description.split()[1:])
        for x in bpio.parse(species_fasta, "fasta")
    }

    total = Protein.objects(organism=seq_col_name).count()
    with tqdm(bpsio.parse(out, "blast-tab"), total=total) as pbar:
        for query in pbar:
            pbar.set_description(query.id)
            if query[0][0].ident_pct > identity:
                unip = query[0].id.split("|")[1] if "|" in query[0].id else query[0].id
                dbxrefs = [
                    x.db + "||" + x.value
                    for x in Mapping.select().where(Mapping.uniprot == unip)
                ]
                p = Protein.objects(gene=query.id, organism=seq_col_name).no_cache().get()
                if not p.description and unip in species_desc:
                    p.description = (species_desc[unip].split("OS=")[0]
                                     + " | homology with: " + unip)
                    p.save()
                if dbxrefs:
                    p = SearchLoader.update_protein_with_dbxref(query.id, dbxrefs, seq_col_name)
                    p.save()
def alignment(wd, ref, trimmed_1="trimmed_1.fastq", trimmed_2="trimmed_2.fastq",
              cpus=multiprocessing.cpu_count(), strain="sample1", species=None,
              force=False, read_group="group1"):
    if not species:
        species = strain
    mkdir(wd)
    wd = os.path.abspath(wd) + "/"
    ref = os.path.abspath(ref)

    assert os.path.exists(wd), f'{wd} could not be created'
    assert os.path.exists(ref), f'{ref} does not exist'
    assert os.path.exists(trimmed_1), f'{trimmed_1} does not exist'
    assert os.path.exists(trimmed_2), f'{trimmed_2} does not exist'

    # Generate a SAM file containing aligned reads
    if force or not os.path.exists(f"{wd}mapped_reads_raw.bam"):
        tab = "\\t"
        e(f"bwa mem -t {cpus} -M -R \'@RG{tab}ID:{read_group}{tab}SM:{strain}{tab}PL:illumina{tab}LB:{species}\' "
          f"{ref} {trimmed_1} {trimmed_2} > {wd}aligned_reads.sam")
        assert os.path.getsize(f"{wd}aligned_reads.sam") > 10, f"{wd}aligned_reads.sam can't be empty"

    # Filter mapped reads and convert to BAM
    if force or (not os.path.exists(f"{wd}dedup.bam")
                 and not os.path.exists(f"{wd}mapped_reads_raw.bam")):
        e(f"samtools view -@ {cpus} -F 4 -S -b -h {wd}aligned_reads.sam | samtools sort - > {wd}mapped_reads_raw.bam")
        e(f"samtools view -@ {cpus} -f 4 -S -b -h {wd}aligned_reads.sam > {wd}unmapped_reads.bam")
        e(f"bedtools bamtofastq -i {wd}unmapped_reads.bam -fq {wd}unmapped_1.fastq -fq2 {wd}unmapped_2.fastq")
        if os.path.exists(f"{wd}unmapped_reads.bam"):
            os.remove(f"{wd}unmapped_reads.bam")
        if os.path.exists(f"{wd}aligned_reads.sam"):
            os.remove(f"{wd}aligned_reads.sam")

    # Sort and mark duplicates
    e(f"gatk MarkDuplicates -INPUT {wd}mapped_reads_raw.bam -OUTPUT {wd}dedup.bam -METRICS_FILE {wd}metrics.txt")
    assert os.path.getsize(f"{wd}dedup.bam") > 10, f"{wd}dedup.bam can't be empty"
    os.remove(f"{wd}mapped_reads_raw.bam")
    e(f'samtools sort {wd}dedup.bam > {wd}mapped_reads.bam')
    os.remove(f"{wd}dedup.bam")
    e(f'samtools index {wd}mapped_reads.bam')
    e(f"gatk CollectInsertSizeMetrics -I {wd}mapped_reads.bam -O {wd}insert_size_metrics.txt "
      f"-H {wd}insert_size_histogram.pdf -M 0.5")

    return f'{wd}mapped_reads.bam'
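# Usage sketch for alignment() (hedged: paths are illustrative; assumes bwa,
# samtools, bedtools and gatk are on the PATH and the reference is already
# bwa-indexed):
bam = alignment("results/sampleA/", "refs/genome.fasta",
                trimmed_1="data/trimmed/sampleA_1.fastq",
                trimmed_2="data/trimmed/sampleA_2.fastq",
                strain="sampleA")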
def assemble_pe(r1: str, r2: str, out: str, name: str, ss: str = None,
                trusted_contigs: str = None, untrusted_contigs: str = None,
                cov_cutoff: int = 5, tmp_dir: str = "/tmp/",
                cpus=multiprocessing.cpu_count()):
    """
    :param r1: forward reads path
    :param r2: reverse reads path
    :param out: output dir
    :param name: sample name, used for the tmp dir
    :param ss: optional unpaired/single reads path
    :param trusted_contigs: trusted contigs path
    :param untrusted_contigs: untrusted contigs path
    :param cov_cutoff: coverage cutoff passed to spades.py
    :return:
    """
    if tmp_dir == "/tmp/":
        tmp_dir = tmp_dir + name
    workdir1 = tmp_dir
    workdir2 = os.path.dirname(r1)
    assert workdir2 == os.path.dirname(r2), "r1 and r2 must be in the same directory"
    mkdir(out)

    mappings = f" -v {workdir1}:/out "
    in_dir = "/out/"
    if workdir1 != workdir2:
        mappings = mappings + f" -v {workdir2}:/in "
        in_dir = "/in/"

    template = """docker run -u $(id -u):$(id -g) --rm -w /out {mappings} {image} spades.py \
        {libs} {tcont} {utcont} -t {cpus} --isolate --cov-cutoff {cov_cutoff} -o /out """

    libs = ""
    i = 1
    r1_img = in_dir + r1.split(workdir2)[1]
    r2_img = in_dir + r2.split(workdir2)[1]
    libs += f' --pe{i}-1 "{r1_img}" --pe{i}-2 "{r2_img}" '
    if ss:
        ss_img = in_dir + ss.split(workdir2)[1]
        libs += f' --pe{i}-s "{ss_img}" '

    tcont = " --trusted-contigs " + trusted_contigs if trusted_contigs else ""
    utcont = " --untrusted-contigs " + untrusted_contigs if untrusted_contigs else ""

    cmd = template.format(libs=libs, tcont=tcont, utcont=utcont,
                          cov_cutoff=cov_cutoff, out=out, mappings=mappings,
                          image=Assembly.SPADES_DOCKER_IMAGE, cpus=cpus)
    print(cmd)
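# Usage sketch for assemble_pe() (hedged: file names are illustrative; as
# written above, the function only prints the dockerized spades.py command,
# it does not run it):
assemble_pe("data/trimmed/sampleA_1.fastq", "data/trimmed/sampleA_2.fastq",
            out="results/assembly_sampleA", name="sampleA", cov_cutoff=5)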
def fastqc(source_dir, dst_dir):
    mkdir(dst_dir)
    for filename in tqdm(
            sorted(
                glob(source_dir + "/*.fastq") + glob(source_dir + "/*.fastq.gz") +
                glob(source_dir + "/*.fq") + glob(source_dir + "/*.fq.gz"))):
        execute("fastqc {src} -q --extract -o {dst}",
                src=filename, dst=dst_dir)
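# Usage sketch for fastqc() (hedged: directories are illustrative; assumes the
# fastqc binary is on the PATH):
fastqc("data/raw_reads", "results/fastqc")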
def update_pdb(self, pdb):
    pdb = pdb.lower()
    mkdir(self.pdbs_dir + pdb[1:3])
    if os.path.exists(self.pdb_path_gzipped(pdb)):
        execute("gunzip " + self.pdb_path_gzipped(pdb))
    elif not os.path.exists(self.pdb_path(pdb)):
        download_file(
            self.url_pdb_files + pdb[1:3] + "/pdb" + pdb + self.pdb_download_extention,
            self.pdbs_dir + pdb[1:3] + "/pdb" + pdb + self.pdb_download_extention)
        execute("gunzip " + self.pdb_path_gzipped(pdb))
def prepare_dir(directory, df,
                mfilter=lambda df: df[(df.zqmean >= -2) & (df.zqmean <= 2)],
                csv_name="models.csv"):
    df = mfilter(df)
    mkdir(directory)
    df.to_csv(directory + "/" + csv_name, index=False, columns=Modelome.columns)
    for _, r in df.iterrows():
        shutil.copy(r.path, directory + "/" + r.model + ".pdb")
def complete_pockets(pdb, strdoc, structure, pdbUtils):
    pdb_file = pdbUtils.pdb_path(pdb)
    pockets_json = pdbUtils.pdb_pockets_path(pdb)
    mkdir(os.path.dirname(pockets_json))
    if not os.path.exists(pockets_json) or os.path.getsize(pockets_json) < 10:
        r = FPocket(pdb_file).hunt_pockets()
        r.save(pockets_json)
        r.delete_dir()
    if os.path.exists(pockets_json):
        strdoc.pockets = StructureAnotator.pocket_residue_set(
            pockets_json, structure.get_atoms())
def update_pdb(self, pdb):
    pdb = pdb.lower()
    mkdir(self.pdbs_dir + pdb[1:3])
    if not os.path.exists(self.pdb_path(pdb)) or (os.path.getsize(self.pdb_path(pdb)) < 100):
        if os.path.exists(self.pdb_path_gzipped(pdb)) and (os.path.getsize(self.pdb_path_gzipped(pdb)) > 100):
            execute("gunzip " + self.pdb_path_gzipped(pdb))
        if os.path.exists(self.pdb_path_gzipped(pdb)) and not os.path.exists(self.pdb_path(pdb)):
            os.remove(self.pdb_path_gzipped(pdb))
        elif not os.path.exists(self.pdb_path(pdb)):
            download_file(self.url_pdb_files + pdb[1:3] + "/pdb" + pdb + self.pdb_download_extention,
                          self.pdbs_dir + pdb[1:3] + "/pdb" + pdb + self.pdb_download_extention,
                          ovewrite=True)
            execute("gunzip " + self.pdb_path_gzipped(pdb))
    return self.pdb_path(pdb)
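# Usage sketch for PDBs.update_pdb() (hedged: the PDBs class and mirror dir
# follow the other snippets in this file; the PDB code is illustrative):
utils = PDBs(pdb_dir="/data/databases/pdb/")
pdb_file = utils.update_pdb("1xdn")  # downloads/gunzips if missing, returns the local path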
def load_pdb_pocket(self, pdb, pdb_dir="/data/databases/pdb/"):
    utils = PDBs(pdb_dir)
    if not os.path.exists(utils.pdb_pockets_path(pdb)):
        utils.update_pdb(pdb)
        fpocket = FPocket(utils.pdb_path(pdb))
        result = fpocket.hunt_pockets()
        mkdir(os.path.dirname(utils.pdb_pockets_path(pdb)))
        result.save(utils.pdb_pockets_path(pdb))
    with open(utils.pdb_pockets_path(pdb)) as h:
        result = json.load(h)
    self.pdb_data[pdb]["pockets"] = result
    return self.pdb_data[pdb]["pockets"]
def handle(self, *args, **options):
    input_file = options['input']
    accession = options['accession']
    self.stderr.write(f"trying to import {options['accession']}")
    if not os.path.exists(input_file):
        raise CommandError(f'{input_file} does not exist')

    extra_attrs = {}
    taxon = self.detect_tax(input_file, extra_attrs)
    taxon = options['taxon'] if options['taxon'] else taxon
    description = options["description"] if options["description"] else " ".join(
        [f'{k}:{v}' for k, v in extra_attrs.items()])

    io = BioIO(accession, taxon, stderr=self.stderr)
    if options["force"]:
        if io.exists():
            res = io.delete()
            self.stderr.write(str(res))
    elif io.exists():
        raise CommandError(f'{accession} already exists, use --force to overwrite')

    grep_cmd = 'grep -c "FEATURES *Location/Qualifiers" "%s"' % input_file
    if input_file.endswith(".gz"):
        grep_cmd = 'z' + grep_cmd
    total = int(sp.check_output(grep_cmd, shell=True))
    io.create_db(description)

    seqstore = SeqStore.instance()
    if options['seqs']:
        if not os.path.exists(options['seqs']):
            raise CommandError(f'{options["seqs"]} does not exist')
        it = smart_parse(input_file, smart_parse(options["seqs"]))
    else:
        it = smart_parse(input_file)

    mkdir(seqstore.db_path(accession))
    s1 = seqstore.stream(seqstore.genome_db_path(accession), force=True,
                         stderr=sys.stderr, stdout=sys.stderr)
    s2 = seqstore.stream(seqstore.proteome_db_path(accession), force=True,
                         stderr=sys.stderr, stdout=sys.stderr)
    with s1 as genome_stream, s2 as proteome_stream:
        io.process_record_list(it, total, genome_stream, proteome_stream)
    self.stderr.write(f"genome {options['accession']} imported!")
def create_human_microbiome(dst="/data/databases/human/", update=False):
    dst_accs = dst + "gut_microbiota_assemblies/"
    mkdir(dst_accs)
    final_file = dst + Offtarget.DEFAULT_GUT_FILENAME
    utils = GenebankUtils()
    with gzip.open(final_file, "wt") as h:
        for accession in tqdm(gut_microbiote_assemblies, file=sys.stderr):
            genome_path = dst_accs + accession + ".genomic.gbff.gz"
            if update or not os.path.exists(genome_path):
                genome_path = NCBI.download_assembly(accession, dst_accs)
            utils.proteins(genome_path, h)
    return final_file
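# Usage sketch for create_human_microbiome() (hedged: the destination is
# illustrative; the returned path is the gzipped gut-microbiome protein set):
gut_faa = create_human_microbiome(dst="/data/databases/human/", update=False)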
def genotype_call(reference, vcf, output_file="./combined.vcf", ploidy=2):
    wd = os.path.dirname(os.path.abspath(output_file)) + "/"
    reference = os.path.abspath(reference)
    vcf = os.path.abspath(vcf)
    mkdir(wd)
    assert os.path.exists(wd), f'{wd} could not be created'
    assert os.path.exists(reference), f'{reference} does not exist'
    assert os.path.exists(vcf), f'{vcf} does not exist'

    e(f"""gatk GenotypeGVCFs \
            -R "{reference}" -ploidy {ploidy} \
            -V "{vcf}" \
            -O "{output_file}" """)
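# Usage sketch for genotype_call() (hedged: assumes gatk on the PATH and a
# GVCF produced by HaplotypeCaller, as in variant_call() below; paths are
# illustrative):
genotype_call("refs/genome.fasta", "results/sampleA/raw.g.vcf.gz",
              output_file="results/sampleA/combined.vcf")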
def offtarget(organism, offtarget_databases, offtarget_names, tmp_dir=None):
    if not tmp_dir:
        tmp_dir = "/data/organismos/" + organism + "/annotation/"
    mkdir(tmp_dir)
    proteins = tmp_dir + "proteins.fasta"
    if not os.path.exists(proteins):
        BioMongoDB.protein_fasta(proteins, organism)
    results = Offtarget.offtargets(proteins, tmp_dir, offtarget_databases)
    for i, name in enumerate(offtarget_names):
        load_blast_features(organism, results[i], name, min_identity=0.4,
                            min_query_coverage=0.4, min_hit_coverage=0.4)
def process_domain(domains_dir, chain, dn_start, dn_end, pdb_model):
    mkdir(domains_dir)
    cs.filter = SelectResidues(chain.id, {
        y: 1
        for y in [x.id[1] for x in chain.get_residues()][dn_start:dn_end]
    })
    domain_pdb_path = cs.make_pdb(pdb_path, code, chain.id, overwrite=True)
    res = FPocket(domain_pdb_path, domains_dir).hunt_pockets()
    for pocket in res.pockets:
        rs = ResidueSet(name="DomainPocket%i" % pocket.pocket_num, pdb=pdb_model)
        rs.save()
        for k, v in pocket.properties.items():
            ResidueSetProperty(residue_set=rs, name=k, value=v).save()
    res.delete_dir()
    qm = QMean.assesment(domain_pdb_path)
    residues_qm = qm["residues"]
    del qm["residues"]
    for k, v in qm.items():
        ChainProperty(pdb=pdb_model, chain=chain.id, name=k, value=v).save()
def variant_call(wd, reference, alignment, ploidy=2):
    wd = os.path.abspath(wd) + "/"
    reference = os.path.abspath(reference)
    alignment = os.path.abspath(alignment)
    mkdir(wd)
    assert os.path.exists(wd), f'{wd} could not be created'
    assert os.path.exists(reference), f'{reference} does not exist'
    assert os.path.exists(alignment), f'{alignment} does not exist'

    e(f"""gatk HaplotypeCaller -ERC GVCF \
            -R "{reference}" -ploidy {ploidy} \
            -I "{alignment}" --output-mode EMIT_ALL_CONFIDENT_SITES \
            -O "{wd}raw.g.vcf.gz" """)
    e(f"""gatk GenotypeGVCFs \
            -R "{reference}" -ploidy {ploidy} \
            -V "{wd}raw.g.vcf.gz" \
            -O "{wd}output.vcf.gz" """)
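# Usage sketch for variant_call() (hedged: assumes gatk on the PATH and the
# BAM produced by alignment() above; paths are illustrative):
variant_call("results/sampleA/", "refs/genome.fasta",
             "results/sampleA/mapped_reads.bam", ploidy=1)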
from SNDG import mkdir, execute, execute_from, init_log
from SNDG.WebServices import download_file
from SNDG.Structure.PDBs import PDBs

init_log("/tmp/createdb.log")


def old_or_inexistent(filepath, period=30):
    return not os.path.exists(filepath) or (
        ((time.time() - os.path.getatime(filepath)) / 60 / 60 / 24) > period)


# os.environ["http_proxy"] = "http://proxy.fcen.uba.ar:8080"
# os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"

mkdir("/data/pdb/")
download_file("ftp://ftp.wwpdb.org/pub/pdb/derived_data/index/entries.idx",
              "/data/pdb/entries.idx", ovewrite=True)
pdbs = PDBs("/data/pdb/")
pdbs.download_pdb_seq_ses()
pdbs.update_pdb_dir()
mkdir("/data/pdb/processed/")
pdbs.pdbs_seq_for_modelling()
execute("makeblastdb -dbtype prot -in /data/pdb/processed/seqs_from_pdb.fasta")

if old_or_inexistent("/data/uniprot/uniref/uniref90/uniref90.fasta"):
    mkdir("/data/uniprot/uniref/uniref90")
    download_file(
        "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz",
                      '--annotation', action='store', dest='annotation', required=True)
required.add_argument('-S', '--strain', action='store', dest='strain', default="sample")
required.add_argument('-R1', '--read1', action='store', dest='read1', required=True)
required.add_argument('-R2', '--read2', action='store', dest='read2', required=True)
# parser.add_argument('--useSingletons', action = 'store_true', dest = 'singletons')

args = parser.parse_args()

mkdir(args.work_dir)
Mapping.clean_reads(args.work_dir, args.read1, args.read2)
alignment_path = Mapping.alignment(args.work_dir, args.reference, strain=args.strain)
Mapping.variant_call(args.work_dir, args.reference, alignment_path, args.strain)
def from_TriTrypDB(name, gff, fasta, tax, tmp_dir=None):
    genome = {x.id: x for x in sp(fasta)}
    from BCBio import GFF
    import re
    annotation = list(GFF.parse(gff, base_dict=genome))
    contig = annotation[0]
    seqCol = BioDocFactory.create_genome(name, contig, tax, Tax)
    seqCol.save()
    if not tmp_dir:
        tmp_dir = "/tmp/" + name + "/"
    mkdir(tmp_dir)
    gene_ids = {}
    with tqdm(annotation) as pbar:
        for contig in pbar:
            pbar.set_description(contig.id)
            if len(contig.seq) > 15000000:
                contig.seq = ""
            contigDoc, gene_ids2 = BioDocFactory.create_contig(
                contig, seqCol,
                type_map={
                    "rRNA": "rRNA",
                    "ncRNA": "ncRNA",
                    NCBI.f_mRNA: "gene",
                    "exon": "exon",
                    "gene": "gene",
                    NCBI.f_CDS: NCBI.f_CDS,
                    "tRNA": "tRNA",
                    "tmRNA": "tmRNA",
                    "snoRNA": "snoRNA",
                    "three_prime_UTR": "three_prime_UTR",
                    "five_prime_UTR": "five_prime_UTR"
                })
            gene_ids.update(gene_ids2)
            contigDoc.save()

    prots = []
    with tqdm(tritryp_protein_iter(annotation)) as pbar:
        for (protein, cds_f) in pbar:
            protDoc = Protein(seq=str(protein.seq), name=protein.id)

            if "description" in cds_f.qualifiers:
                protein_description = cds_f.qualifiers['description'][0]
            elif "Note" in cds_f.qualifiers:
                protein_description = cds_f.qualifiers['Note'][0]
            elif "product" in cds_f.qualifiers:
                protein_description = cds_f.qualifiers['product'][0]
            else:
                protein_description = ""
            protDoc.description = protein_description

            gos = []
            if "Ontology_term" in cds_f.qualifiers:
                gos = [
                    x.lower() for x in cds_f.qualifiers["Ontology_term"]
                    if "GO:" in x and (x not in ["GO:0008150", "GO:0003674", "GO:0005575"])
                ]

            note = cds_f.qualifiers["Note"][0].split(" ")[0] if "Note" in cds_f.qualifiers else ""
            ecs = ["ec:" + note] if re.match(r'^[0-9]+\.[0-9\-]+\.[0-9\-]+\.[0-9\-]$', note) else []
            ontologies = list(set(ecs + gos))

            protDoc.gene = [protein.id]
            protDoc.ontologies = ontologies
            protDoc.alias = [protein.id]

            if len(protDoc.seq) > 30000:
                raise Exception("There are no proteins this long...")
            protDoc.gene_id = gene_ids[protein.id]
            protDoc.organism = name
            protDoc.auth = str(BioMongoDB.demo_id)
            protDoc.seq_collection_id = seqCol
            prots.append(protDoc)
            if pbar.n and ((pbar.n % 1000) == 0):
                Protein.objects.insert(prots)
                prots = []
    if prots:
        Protein.objects.insert(prots)
    _common_annotations(name, tmp_dir)
def from_ref_seq(
        name, ann_path, seqs=None, tax=None, tmp_dir=None,
        extract_annotation_feature=lambda feature: feature.sub_features[0]
        if feature.type == "gene" and hasattr(feature, "sub_features")
        and len(feature.sub_features) else feature,
        accept_protein_feature=lambda f: ((f.type == "CDS") and ("translation" in f.qualifiers)),
        extract_sequence=lambda c, f: f.qualifiers["translation"][0]
        if "translation" in f.qualifiers else f.extract(c).seq.translate(),
        cpus=1):
    if seqs:
        seqs = {r.id: r.seq for r in bpio.parse(seqs, "fasta")}
    iter_seqs = list(sp(ann_path, seqs=seqs) if seqs else sp(ann_path))
    for contig in iter_seqs:
        if tax:
            seqCol = BioDocFactory.create_genome(name, contig, tax, Tax)
        else:
            seqCol = BioDocFactory.create_genome(name, contig)
        seqCol.save()
        break
    if not tmp_dir:
        tmp_dir = "/tmp/" + name + "/"
    mkdir(tmp_dir)
    gene_ids = {}
    with tqdm(iter_seqs) as pbar:
        for contig in pbar:
            pbar.set_description(contig.id)
            if len(contig.seq) > 15000000:
                contig.seq = ""
            contigDoc, gene_ids2 = BioDocFactory.create_contig(
                contig, seqCol,
                type_map={
                    "rRNA": "rRNA",
                    "ncRNA": "ncRNA",
                    NCBI.f_mRNA: NCBI.f_mRNA,
                    "gene": "gene",
                    NCBI.f_CDS: NCBI.f_CDS,
                    "tRNA": "tRNA",
                    "tmRNA": "tmRNA"
                },
                extract_annotation_feature=extract_annotation_feature,
            )
            gene_ids.update(gene_ids2)
            contigDoc.save()

    prots = []
    with tqdm(
            _protein_iter(
                iter_seqs,
                accept_feature=accept_protein_feature,
                extract_annotation_feature=extract_annotation_feature,
                extract_sequence=extract_sequence)) as pbar:
        for (protein, cds_f) in pbar:
            if "locus_tag" in cds_f.qualifiers:
                protDoc = BioDocFactory.create_protein(protein, cds_f)
                if len(protDoc.seq) > 30000:
                    raise Exception("There are no proteins this long...")
                if protDoc.seq.count("*") > 1:
                    print(f"{cds_f.qualifiers['locus_tag'][0]}: Too many stop codons!")
                    continue
                if protDoc.seq.count("+") > 1:
                    print(f"{cds_f.qualifiers['locus_tag'][0]}: + signs found...!")
                    continue
                protDoc.gene_id = gene_ids[cds_f.qualifiers["locus_tag"][0]]
                protDoc.organism = name
                protDoc.auth = str(BioMongoDB.demo_id)
                protDoc.seq_collection_id = seqCol
                for f in protein.features:
                    protDoc.features.append(
                        Feature(identifier=f.qualifiers["Ontology_term"][0],
                                type=f.type,
                                location=Location(start=int(f.location.start),
                                                  end=int(f.location.end))))
                prots.append(protDoc)
                if pbar.n and ((pbar.n % 1000) == 0):
                    Protein.objects.insert(prots)
                    prots = []
    if prots:
        Protein.objects.insert(prots)
    # _common_annotations(name, tmp_dir, cpu=cpus)
    return seqCol
search.add_argument('--alns_dir', default=None, help='save blast alignments in this folder')
search.add_argument("-d", '--database', required=True, help='db to be searched with the pssm/s')
search.add_argument('--cpu', default=4, type=int, help='cpus to use')
search.add_argument('--format', choices=["table", "fasta"], default="table")

args = parser.parse_args()

if args.command == "pssm":
    mkdir(args.output)
    assert os.path.exists(args.output), f"{args.output} could not be created"
    for record in bpio.parse(args.seqs, "fasta"):
        pssm_file = f'{args.output}/{record.id}.pssm'
        query_file = mktemp()
        bpio.write(record, query_file, "fasta")
        if os.path.exists(pssm_file) and (os.path.getsize(pssm_file) > 100):
            print(pssm_file)
        else:
            PsiProfile.build_profile(query_file, args.database, args.iterations,
                                     pssm_file, args.cpu)
        if (not os.path.exists(pssm_file)) or (os.path.getsize(pssm_file) < 100):
    for x in assessment[0].all_scores:
        result[x.name + "_norm"] = x.norm
        result[x.name + "_zscore"] = x.z_score
    result["residues"] = {}
    for row in assessment[1].score_table.rows:
        r = {f: row[i] for i, f in enumerate(assessment[1].score_table.col_names[4:], 4)}
        result["residues"][row[0] + "_" + str(row[2]) + "_" + str(row[3])] = r
    return result


if __name__ == '__main__':
    from SNDG import init_log, arg_file_iter
    init_log()
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-acc", "--accpro", default="/opt/sspro4/bin/predict_acc.sh")
    parser.add_argument("-psi", "--psipred", default="/opt/psipred/runpsipred")
    parser.add_argument("-i", "--inputpdb", default="-")
    parser.add_argument("-o", "--outdir", default="./")
    parser.add_argument("--cpus", default=multiprocessing.cpu_count())

    args = parser.parse_args()
    # "/data/databases/pdb/divided/ok/pdb4oke.ent"
    mkdir(args.outdir)
    assessment = QMean.assesment(args.inputpdb, output_dir=args.outdir,
                                 accpro_path=args.accpro,
                                 psipred_path=args.psipred, cpus=args.cpus)
    print(assessment)
tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito"))
mysql_db.initialize(MySQLDatabase('sndg', user='******', passwd="mito"))
assemblies = list(ExternalAssembly.select().where(
    ExternalAssembly.sample_source.is_null(False)))
ProteinAnnotator.connect_to_db(database="unipmap", user="******", password="******")

with tqdm(assemblies) as pbar:
    for x in pbar:
        if mdb.seq_col_exists(x.assembly_accession):
            continue
        pbar.set_description(x.assembly_accession)
        try:
            dst_dir = "/data/organismos/" + x.assembly_accession + "/annotation/"
            mkdir(dst_dir)
            gbpath = x.download_gbk(dst_dir)
            from_ref_seq(x.assembly_accession, gbpath, tax=x.ncbi_tax, tmp_dir=dst_dir)
            tid = int(mdb.db.sequence_collection.find_one(
                {"name": x.assembly_accession})["tax"]["tid"])
            tmp_dir = "/data/organismos/" + x.assembly_accession + "/annotation/"
            proteome_dir = "/data/organismos/" + x.assembly_accession + "/contigs/"
            mkdir(tmp_dir)
            mkdir(proteome_dir)
            protein_fasta = create_proteome(proteome_dir, x.assembly_accession)
"uploader": "demo", "_class": "ar.com.bia.entity.SeqCollectionDoc", "type": "value", "options": ["No", "Yes"], "description": "Has a hit in Database of Essential Genes" }) } from SNDG.Sequence import read_blast_table from tqdm import tqdm # cols = list(SeqCollection.objects(name__nin=["cruzi","pdb"])) cols = list(SeqCollection.objects(name__nin=["cruzi", "pdb"])) cpus = 4 db = mdb.db for seqCol in tqdm(cols): mkdir("/data/organismos/" + seqCol.name + "/contigs") proteome = "/data/organismos/" + seqCol.name + "/contigs/genoma.fasta" if not os.path.exists(proteome): mdb.protein_fasta(proteome, seqCol.name) out = "/data/organismos/" + seqCol.name + "/annotation/offtarget/" mkdir(out) if not seqCol.has_druggability_param("human_offtarget"): seqCol.druggabilityParams.append(off_props["human_offtarget"]) db = "/data/databases/human/gencode.v17.pc_translations.fa" execute( "blastp -evalue 1e-5 -max_hsps 1 -outfmt 6 -max_target_seqs 1 -db {db} -query {query} -out {out} -num_threads {cpus}", db=db,
required.add_argument('-t', '--tmp_dir', default="/tmp")
required.add_argument("--cpus", default=1)

args = parser.parse_args()

if not os.path.exists(args.ref_dirs):
    raise FileNotFoundError(f"{args.ref_dirs} does not exist")
if args.in_fasta_s and not os.path.exists(args.in_fasta_s):
    raise FileNotFoundError(f"{args.in_fasta_s} does not exist")
if not os.path.exists(args.in_fasta_r2):
    raise FileNotFoundError(f"{args.in_fasta_r2} does not exist")
if not os.path.exists(args.in_fasta_r1):
    raise FileNotFoundError(f"{args.in_fasta_r1} does not exist")

refs = glob(f'{args.ref_dirs}/*.fasta') + glob(f'{args.ref_dirs}/*.fna')
if not refs:
    raise FileNotFoundError(f"no references detected in {args.ref_dirs}")

mkdir(args.output)
if not os.path.exists(args.output):
    raise FileNotFoundError(f"could not create {args.output}")
if not args.output.endswith("/"):
    args.output = args.output + "/"

mrca = MultiRefCoreAln(base_refs=refs, cpus=args.cpus)
mrca.core_reads(args.output, args.sample_name, args.in_fasta_r1,
                args.in_fasta_r2, args.in_fasta_s, args.tmp_dir)
mysqldb = ProteinAnnotator.connect_to_db(database="unipmap", user="******", password="******")
orgs = [
    ("Mpylori26695", "Helicobacter pylori 26695 (e-proteobacteria)",
     "/data/organismos/Mpylori26695/GCF_000008525.1_ASM852v1_genomic.gbff", 85962),
    ("MpyloriIndia", "Helicobacter pylori India7 (e-proteobacteria)",
     "/data/organismos/MpyloriIndia/GCF_000185185.1_ASM18518v1_genomic.gbff", 907238),
]
for name, org, ann_path, tax in orgs:
    organism = name
    mkdir("/data/organismos/" + name + "/annotation/offtarget")
    mkdir("/data/organismos/" + name + "/annotation/pwtools")
    mkdir("/data/organismos/" + name + "/annotation/pathways")
    mkdir("/data/organismos/" + name + "/estructura/raw")
    mkdir("/data/organismos/" + name + "/estructura/sndg/modelos")
    mkdir("/data/organismos/" + name + "/estructura/sndg/pockets")

    from_ref_seq(name, ann_path, tax=tax, cpus=3)
    mdb.protein_fasta("/data/organismos/" + name + "/annotation/proteins.faa", name)
    update_proteins("/tmp/" + name + "/",
                    "/data/organismos/" + name + "/annotation/proteins.faa",
                    name, 1003200, db_init=mysqldb)
parser.add_argument("-u", "--dbuser", default="root") args = parser.parse_args() from peewee import MySQLDatabase mysql_db = MySQLDatabase(args.dbname, user=args.dbuser, password=args.dbpass) sqldb.initialize(mysql_db) pdb_utils = PDBs(pdb_dir=args.pdb_dir) props = {x.name: x for x in Property.select()} pdbs = list(pdb_utils) with tqdm(pdbs) as pbar: for (code, pdb_path) in pbar: pdb_model = PDB.select().where(PDB.code == code).first() p = PDBParser(PERMISSIVE=True, QUIET=True) try: for chain in p.get_structure(code, pdb_path).get_chains(): chains_dir = args.pdb_dir + "/chains/" + code[1:3] + "/" mkdir(chains_dir) cs = ChainSplitter(chains_dir) process_chain(pdb_path, code, chain.id, pdb_model, props) except Exception as ex: traceback.print_stack() _log.error(code + ": " + str(ex))
from SNDG.WebServices import download_file

init_log("/tmp/createdb.log")


def old_or_inexistent(filepath, period=30):
    return not os.path.exists(filepath) or (
        ((time.time() - os.path.getatime(filepath)) / 60 / 60 / 24) > period)


os.environ["http_proxy"] = "http://proxy.fcen.uba.ar:8080"
os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"

if not os.path.exists("/data/cog/whog"):
    mkdir("/data/cog/")
    download_file("ftp://ftp.ncbi.nih.gov/pub/COG/COG/whog", "/data/cog/whog")
if not os.path.exists("/data/cog/myva"):
    mkdir("/data/cog/")
    download_file("ftp://ftp.ncbi.nih.gov/pub/COG/COG/myva", "/data/cog/myva")
    execute("formatdb -i /data/cog/myva -o T")
if not os.path.exists("/data/ec/PRIAM_MAR15/priam"):
    mkdir("/data/ec/")
    download_file("http://priam.prabi.fr/REL_MAR15/Distribution.zip",
                  "/data/ec/PRIAM_MAR15.zip")
    execute_from("unzip /data/ec/PRIAM_MAR15.zip; exit 0;", "/data/ec/", retcodes=[0, 1])
        )
    if args.databases in ["all", "human"]:
        path = f'{args.output}/human/'
        if args.force or not os.path.exists(path + Offtarget.DEFAULT_HUMAN_FILENAME):
            path = Offtarget.download_human_prots(dst=path)
        else:
            sys.stderr.write(f'{path} already exists, overwrite using --force')
        filename = os.path.basename(path)
        execute(
            f"zcat {path}{Offtarget.DEFAULT_HUMAN_FILENAME} | makeblastdb -title human "
            f"-out {path}{Offtarget.DEFAULT_HUMAN_FILENAME} -dbtype prot -in -"
        )
    if args.databases in ["all", "deg"]:
        mkdir(f'{args.output}/deg/')
        Offtarget.download_deg(f'{args.output}/deg/')
elif args.command == "gut_microbiote_blast":
    blast_gut_path = f'{args.output}/gut_microbiome.blast.tbl'
    gut_result_path = f'{args.output}/gut_microbiome.tbl'
    # if not os.path.exists(args.database + ".phr"):
    #     raise FileNotFoundError(f"{args.database} index files could not be found. Run makeblastdb")
    if args.force or not os.path.exists(blast_gut_path):
        Offtarget.offtargets(args.input_faa, blast_gut_path,
                             offtarget_db=args.database, cpus=args.cpus)
    else:
        sys.stderr.write(f'{blast_gut_path} already exists, overwrite using --force')