def multi_align():
    """Write one multi-FASTA per reference region and align it with clustalo.

    Reads the module-level ``multi_align_indi_dict``, whose keys are
    ``(ref_region, ref_e_region)`` tuples and whose values map a VCF file key
    to ``(iso_e_region, iso_consensus_fasta)``.  For every key the reference
    sequence is extracted with ``samtools faidx``, each isolate sequence is
    appended to the same FASTA file, and the file is aligned with ``clustalo``.
    """
    for ref_info in multi_align_indi_dict:
        ref_region, ref_e_region = ref_info
        print("region2: " + ref_region + " " + ref_e_region)
        region_fasta = "{}.fasta".format(ref_region)
        alignment_out = "{}_{}.clustalo_num".format(prefix, ref_region)

        ## write ref to fasta file
        command("echo {} >{}".format(ref_e_region,
                                     tmp_region_fname)).run_comm(0)
        command("samtools faidx -r {} {} >{}".format(
            tmp_region_fname, REF_FASTA, region_fasta)).run_comm(0)
        # Replace the FASTA header produced by samtools with ">ref_<region>".
        command("sed -i \"1s/.*/>ref_{}/\" {}".format(
            ref_e_region, region_fasta)).run_comm(0)

        ## write isolates to fasta file
        isolates = multi_align_indi_dict[ref_info]
        for vcf_file_key in sorted(isolates):
            iso_e_region, iso_concensus_fasta = isolates[vcf_file_key]
            raw_record = get_region_seq(iso_e_region, iso_concensus_fasta)
            header = ">{}_{}".format(vcf_file_key, iso_e_region)
            record = replace_the_first_line(raw_record, header)
            command("echo \"{}\" >> {}".format(record,
                                               region_fasta)).run_comm(0)

        ## do alignment for each expanded region
        command(
            "clustalo --infile {} --threads 8 --verbose --outfmt clustal --resno --outfile {} --output-order input-order --seqtype dna --force"
            .format(region_fasta, alignment_out)).run_comm(0)
def qc_by_quast():
    """Run QUAST on the assembled scaffolds against the reference genome.

    Sets the module-level ``qc_files`` to the report file names that are
    collected later, then invokes ``quast`` with the reference FASTA and GFF
    located under ``<workdir>/../../ref``.
    """
    global qc_files
    qc_files = ["report.txt", "report.tsv", "report.pdf", "report.html"]

    # Reference inputs are resolved relative to the working directory.
    ref_fasta = "{}/../../ref/{}.fasta".format(workdir, genome_name)
    ref_gff = "{}/../../ref/{}.gff".format(workdir, genome_name)

    quast_cmd = "quast {} -r {} -g {} -o {}".format(scaffold_fasta, ref_fasta,
                                                    ref_gff, workdir)
    command(quast_cmd).run_comm(0)
def get_gff_from_genome_name():
    """Set the module-level ``genome_gff_fname`` for the selected genome.

    For every genome except ``cryptosporidium_hominis`` the GFF3 file is
    obtained via ``misc.download``.  For ``cryptosporidium_hominis`` the file
    configured under the ``ch_gff`` property is copied into the current
    directory and a fixed file name is used.
    """
    global genome_gff_fname

    if genome != "cryptosporidium_hominis":
        genome_gff_fname = misc.download("gff3", genome, "")
        return

    local_gff = prop.get_attrib("ch_gff")
    command("cp -p {} .".format(local_gff)).run_comm(0)
    genome_gff_fname = "GCA_002223825.1_C.hominis.v1_genomic.gff"
def run_trim_galore(fastqfiles):
    """Run Trim Galore and record the expected output file paths.

    Sets the module-level ``fqout1`` (and ``fqout2`` for paired-end data) to
    the trimmed FASTQ paths expected under ``workdir`` and ``tg_out_files``
    to the list of trimmed files plus trimming reports.

    Args:
        fastqfiles: list with one (single-end) or two (paired-end) FASTQ
            paths passed to ``trim_galore``.  Any other length runs nothing.
    """
    global fqout1
    global fqout2
    global tg_out_files

    def _strip_fastq_ext(name):
        # str.rstrip(".fastq") removes any trailing run of the characters
        # ".", "f", "a", "s", "t", "q" (e.g. "reads_a.fastq" -> "reads_"),
        # so remove the extension as a real suffix instead.  ".fq" is kept
        # because the old character-set strip happened to remove it as well.
        for ext in (".fastq", ".fq"):
            if name.endswith(ext):
                return name[:-len(ext)]
        return name

    tg_out_files = []
    fastq1_name = os.path.basename(fastq1)
    fastq1_name_base = _strip_fastq_ext(fastq1_name)
    report1 = os.path.join(workdir, fastq1_name + "_trimming_report.txt")
    if fastq2 is None:
        fqout1 = os.path.join(workdir, fastq1_name_base + "_trimmed.fq")
        tg_out_files = [fqout1, report1]
    else:
        fqout1 = os.path.join(workdir, fastq1_name_base + "_val_1.fq")
        fastq2_name = os.path.basename(fastq2)
        fastq2_name_base = _strip_fastq_ext(fastq2_name)
        fqout2 = os.path.join(workdir, fastq2_name_base + "_val_2.fq")
        report2 = os.path.join(workdir, fastq2_name + "_trimming_report.txt")
        tg_out_files = [fqout1, report1, fqout2, report2]

    if len(fastqfiles) == 2:
        # paired fastq files
        command("trim_galore --paired -q 20 " + fastqfiles[0] + " " +
                fastqfiles[1]).run_comm(0)
    elif len(fastqfiles) == 1:
        # single fastq file
        command("trim_galore -q 20 " + fastqfiles[0]).run_comm(0)
def vcf_analysis():
    """Build a consensus FASTA and BLAST database per VCF, then analyse it.

    For every path in the module-level ``vcf_files`` the VCF is bgzip
    compressed and tabix indexed, a consensus sequence is produced with
    ``bcftools consensus`` from ``REF_FASTA``, a BLAST database is built from
    that consensus, and ``ref_exp_region_fasta`` is blasted against it.  The
    result is handed to ``exp_region_analysis``.

    Also (re)initialises the module-level ``multi_align_indi_dict``.
    """
    global multi_align_indi_dict
    multi_align_indi_dict = defaultdict(lambda: defaultdict(str))

    for vcf_fpath in vcf_files:
        # Greedy match: everything before the last "." of the base name.
        # Raw string avoids the invalid "\." escape in a normal literal.
        vcf_fname_key = re.findall(r"(.*)\.", os.path.basename(vcf_fpath))[0]
        compressed_vcf_name = vcf_fname_key + ".bgzip"
        consensus_fasta_name = vcf_fname_key + ".consensus.fasta"

        command("bgzip -c {} > {}".format(vcf_fpath,
                                          compressed_vcf_name)).run_comm(0)
        command("tabix {} -f".format(compressed_vcf_name)).run_comm(0)
        command("cat {} | bcftools consensus {} > {}".format(
            REF_FASTA, compressed_vcf_name, consensus_fasta_name)).run_comm(0)
        command(
            "makeblastdb -in {} -parse_seqids -dbtype nucl -out {}.DBblast".
            format(consensus_fasta_name, vcf_fname_key)).run_comm(0)

        blast_out_fname = vcf_fname_key + ".ref.exp.blast_out"
        command(
            "blastn -query {} -db {}.DBblast -dust no -outfmt 7 -max_target_seqs 1 -out {}"
            .format(ref_exp_region_fasta, vcf_fname_key,
                    blast_out_fname)).run_comm(0)

        exp_region_analysis(vcf_fname_key, blast_out_fname,
                            consensus_fasta_name)
def get_info_from_gff3():
    """Index gene/CDS features of the reference GFF and write a sorted BED.

    Populates the module-level ``gff3_dict`` as
    ``gff3_dict[chrom][feature_type][(start, end)] = gene_name`` for ``gene``
    and ``CDS`` rows, and writes one line per ``gene`` row to a temporary
    file that is then sorted by chromosome and start into
    ``<workdir>/gene.bed`` (path stored in ``gene_bed_fPath``).
    """
    global gff3_dict
    global gene_bed_fPath
    gff3_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(str)))

    ref_gff = "{}/../ref/{}.gff".format(workdir, genome_name)
    gene_bed_fPath = "{}/gene.bed".format(workdir)
    tmp_fPath = "{}/tmp.bed".format(workdir)

    # Context managers guarantee both handles are closed even when a row
    # fails to parse.
    with open(ref_gff, "r") as fh_gff3, open(tmp_fPath, "w") as fh_tmp:
        for line in fh_gff3:
            if re.search('^#', line):
                continue  # header / comment line
            (chrom, mol_type, start, end, strand, phase,
             gene_part_ori) = getVar(line.split(), [0, 2, 3, 4, 6, 7, 8])
            if mol_type == 'gene':
                gene_name = re.findall("ID=(.*?);", gene_part_ori)[0]
                fh_tmp.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    chrom, start, end, strand, phase, gene_name))
            elif mol_type == 'CDS':
                # CDS rows are keyed by the parent ID up to the first "-".
                gene_name = re.findall("Parent=(.*?)-", gene_part_ori)[0]
            else:
                continue
            gff3_dict[chrom][mol_type][(start, end)] = gene_name

    command("sort -k1,1 -k2,2n {} > {}".format(tmp_fPath,
                                               gene_bed_fPath)).run_comm(0)
    command("rm {}".format(tmp_fPath)).run_comm(0)
def run_QC():
    """Run FastQC on raw and trimmed reads and summarise with MultiQC.

    Sets the module-level ``sample_runID``, ``fastqc_out_files``,
    ``fastqc_out_files_str`` and ``multiqc_out_prefix``.
    """
    global fastqc_out_files
    global fastqc_out_files_str
    global multiqc_out_prefix
    global sample_runID

    ## define variables
    if mapping_file is None:
        sample_runID = runID
    else:
        sample = MISC.get_samples_by_runIDs(mapping_file)[runID]
        sample_runID = "{}_{}".format(sample, runID)

    infiles = [fq_ori1, fqout1]
    if fastq2 is not None:
        infiles += [fq_ori2, fqout2]

    ## fastQC
    fastqc_out_files = []
    fastqc_out_files.extend(fastQC(infile) for infile in infiles)
    # Space-separated list of FastQC outputs passed to multiqc.
    fastqc_out_files_str = " ".join(fastqc_out_files).rstrip(" ")

    ## multiQC
    multiqc_out_prefix = sample_runID + ".multiQC"
    multiqc_cmd = "multiqc -f {} -o {} --filename {} -v".format(
        fastqc_out_files_str, workdir, multiqc_out_prefix)
    command(multiqc_cmd).run_comm(0)
def run_trim_galore(fastqfiles):
    """Run the configured read-trimming tool and record expected outputs.

    Sets the module-level FASTQ name variables and ``fqout1`` / ``fqout2``
    (expected trimmed FASTQ paths under ``workdir``).  The executable is
    looked up in the property file under the key held by ``qc_sw``.

    Args:
        fastqfiles: list with one (single-end) or two (paired-end) FASTQ
            paths passed to the tool.  Any other length runs nothing.
    """
    global fastq1_name
    global fastq1_name_base
    global fastq2_name
    global fastq2_name_base
    global fqout1
    global fqout2

    def _strip_fastq_ext(name):
        # str.rstrip(".fastq") removes any trailing run of the characters
        # ".", "f", "a", "s", "t", "q" (e.g. "reads_a.fastq" -> "reads_"),
        # so remove the extension as a real suffix instead.  ".fq" is kept
        # because the old character-set strip happened to remove it as well.
        for ext in (".fastq", ".fq"):
            if name.endswith(ext):
                return name[:-len(ext)]
        return name

    fastq1_name = os.path.basename(fastq1)
    fastq1_name_base = _strip_fastq_ext(fastq1_name)
    if fastq2 is None:
        fqout1 = os.path.join(workdir, fastq1_name_base + "_trimmed.fq")
    else:
        fqout1 = os.path.join(workdir, fastq1_name_base + "_val_1.fq")
        fastq2_name = os.path.basename(fastq2)
        fastq2_name_base = _strip_fastq_ext(fastq2_name)
        fqout2 = os.path.join(workdir, fastq2_name_base + "_val_2.fq")

    # Path of the QC/trimming software as defined in the property file.
    qc_sw_path = prop.get_attrib(qc_sw)

    if len(fastqfiles) == 2:
        # paired fastq files
        command(qc_sw_path + " --paired -q 20 " + fastqfiles[0] + " " +
                fastqfiles[1]).run_comm(0)
    elif len(fastqfiles) == 1:
        # single fastq file
        command(qc_sw_path + " -q 20 " + fastqfiles[0]).run_comm(0)
def write_gene_matrix_summary():
    """Write the per-gene variant summary table and a filtered copy of it.

    The unfiltered, tab-separated table goes to
    ``<prefix>_gene_summary_ori.csv``.  An awk filter then keeps only rows
    whose 14th or 16th column is non-zero and writes them to
    ``<prefix>_gene_summary.csv`` (stored in ``gene_matrix_summary_file``).

    Also sets the module-level ``sample_num`` and ``alt_isolates_nums``
    (25 %, 50 % and 75 % of the sample count, truncated, as strings).
    """
    global alt_isolates_nums
    global sample_num
    global gene_matrix_summary_file

    gene_matrix_summary_file = "{}_gene_summary.csv".format(prefix)
    sample_num = len(sample_names)
    q25 = str(int(0.25 * sample_num))
    q50 = str(int(0.5 * sample_num))
    q75 = str(int(0.75 * sample_num))
    alt_isolates_nums = [q25, q50, q75]
    ann_types = ["dN", "dS"]
    fPath_out = "{}_gene_summary_ori.csv".format(prefix)

    # "with" closes the file even if a lookup below raises, and guarantees
    # the data is flushed before the awk filter reads it.
    with open(fPath_out, 'w') as fileout:
        # write the column names
        # NOTE(review): "STRAT" looks like a typo for "START"; left as is
        # because downstream readers may depend on the current header.
        fileout.write("{}\t{}\t{}\t{}\t{}".format("GENE", "CHROMOSOME",
                                                  "STRAT", "END", "LENGTH"))
        for go_col in go_col_names:
            fileout.write("\t{}".format(go_col))
        for num_range in alt_isolates_nums:
            for ann_type in ann_types:
                fileout.write("\t{}_locusVar_num(>={}_isolate(s))\t{}_locusVar_num(>={}_isolate(s))/kb"
                              .format(ann_type, num_range, ann_type, num_range))
            fileout.write("\tdN/dS(>={}_isolate(s))".format(num_range))
        fileout.write("\n")

        # write the content
        for gene in sorted(gene_variant_dict.keys()):
            gene_info = genome_gene_dict[gene]
            fileout.write("{}".format(gene))
            fileout.write("\t{}\t{}\t{}\t{}".format(gene_info[0], gene_info[1],
                                                    gene_info[2], gene_info[3]))
            for each_go in go_dict[gene]:
                fileout.write("\t{}".format(each_go))
            fileout.write("\t{}\n".format(
                get_str(gene_locus[gene], gene_info[3])))

    cmd = "cat {} | awk '{{split($0,a,\"\\t\");if(a[14]!=0 || a[16]!=0) print $0}}' >{}"
    command(cmd.format(fPath_out, gene_matrix_summary_file)).run_comm(0)
def run_vcf_merge():
    """Merge the per-sample VCFs into one file with GATK CombineVariants.

    The output path ``<workdir>/<prefix>_merged.vcf`` is stored in the
    module-level ``merged_vcf``.
    """
    global merged_vcf

    # Variant arguments for GATK, built by the helper.
    variant_args = get_variant_str()
    merged_vcf = workdir + "/" + prefix + "_merged.vcf"

    merge_cmd = "gatk -T CombineVariants -R {} {} -o {} -genotypeMergeOptions UNIQUIFY".format(
        REF_FASTA, variant_args, merged_vcf)
    command(merge_cmd).run_comm(0)
def post_process():
    """Copy assembly outputs and QC reports to their destination folders.

    ``scaffolds.fasta`` and ``spades.log`` are copied to ``outdir`` with
    ``prefix + "_"`` prepended; every file in ``qc_files`` and the MultiQC
    HTML report(s) are copied to ``qcdir``.
    """
    print("post_processing...")

    name_prefix = prefix + "_"
    for assembly_output in ("scaffolds.fasta", "spades.log"):
        FI.copy_file_add_prefix(assembly_output, outdir, name_prefix)

    for qc_report in qc_files:
        FI.copy_file_to_destdir(qc_report, qcdir)

    copy_multiqc = "cp -p {}.multiQC*.html {}".format(prefix, qcdir)
    command(copy_multiqc).run_comm(0)
def create_upset_matrix():
    """Build the interval-sharing matrix across all VCFs for an UpSet plot.

    Writes chromosome names and lengths of ``REF_FASTA`` to
    ``<workdir>/genome.size`` with ``infoseq``, then runs
    ``bedtools multiinter`` over ``vcf_files_str`` and stores the result in
    ``<workdir>/<prefix>_upset.mat``.  The file name (without directory) is
    kept in the module-level ``upset_mat_file`` and finally passed to
    ``change_fileName_to_isoName``.
    """
    global upset_mat_file
    upset_mat_file = "{}_upset.mat".format(prefix)

    command(
        "infoseq -sequence {} -only -name -length -outfile {}/genome.size -nohead -auto"
        .format(REF_FASTA, workdir)).run_comm(0)

    # The flag was misspelled "-emtpy", which bedtools does not recognise;
    # "-empty" (report intervals covered by no file) is the documented
    # option and is the one that requires the "-g" genome file given here.
    command(
        "bedtools multiinter -g {}/genome.size -empty -header -i {} >{}/{}".
        format(workdir, vcf_files_str, workdir, upset_mat_file)).run_comm(0)

    change_fileName_to_isoName(upset_mat_file)
def run_snpEff():
    """Annotate every VCF in ``all_vcf_fpaths`` with snpEff.

    The annotated output of ``<name>.vcf`` is written to ``<name>.ann.vcf``
    in the current directory; the list of outputs is stored in the
    module-level ``ann_vcf_fpaths``.  The snpEff jar path comes from the
    ``snpeff`` property.
    """
    global ann_vcf_fpaths
    ann_vcf_fpaths = []
    snpEff = prop.get_attrib("snpeff")
    vcf_ext = ".vcf"

    for vcf_fpath in all_vcf_fpaths:
        vcf_fpath_prefix = os.path.basename(vcf_fpath)
        # str.rstrip(".vcf") strips the characters ".", "v", "c", "f" rather
        # than the suffix (e.g. "abc.vcf" -> "ab"); remove the real suffix.
        if vcf_fpath_prefix.endswith(vcf_ext):
            vcf_fpath_prefix = vcf_fpath_prefix[:-len(vcf_ext)]
        ann_vcf_fpath = vcf_fpath_prefix + ".ann.vcf"
        ann_vcf_fpaths.append(ann_vcf_fpath)
        # NOTE(review): other call sites use run_comm(0); confirm that this
        # project's command class really provides run().
        command("java -jar {} -c snpEff.config {} {} > {}".format(
            snpEff, genome, vcf_fpath, ann_vcf_fpath)).run(0)
def run_snpEff(self):
    """Build the snpEff database for the genome and annotate the VCF.

    Reads the input VCF name from the module-level ``fName_str_vcf`` and
    writes the annotation to ``<self.prefix>.ann.vcf``, whose name is stored
    in the module-level ``fName_ann_vcf``.
    """
    global fName_ann_vcf
    print("run annotation...")

    # Property key under which the snpEff jar path is configured.
    annot_sw = "snpeff"
    snpeff_jar = self.prop.get_attrib(annot_sw)

    database = snpEff_db(self.properties_file, self.genome_name)
    database.build_snpeff_db()

    fName_ann_vcf = self.prefix + ".ann.vcf"
    annotate_cmd = "java -jar {} -c snpEff.config {} {} > {}".format(
        snpeff_jar, self.genome_name, fName_str_vcf, fName_ann_vcf)
    command(annotate_cmd).run_comm(0)
def fastqc(bam):
    """Run FastQC on a copy of ``bam`` placed in ``workdir``.

    The copy gets a shortened name: ``_grouped_dedup``, ``_grouped`` and the
    ``prefix_ori + "_"`` text are removed from the base name (the sample/run
    ID is already part of the BAM file name).  Sets the module-level
    ``bam_for_fastqc`` and ``fastqc_file`` (the expected FastQC zip name).

    Args:
        bam: path of the BAM file to check.
    """
    global fastqc_file
    global bam_for_fastqc

    short_name = os.path.basename(bam)
    # Order matters: "_grouped_dedup" must go before the shorter "_grouped".
    for token in ("_grouped_dedup", "_grouped", prefix_ori + "_"):
        short_name = short_name.replace(token, "")
    bam_for_fastqc = short_name

    FI.copy_file(bam, "{}/{}".format(workdir, bam_for_fastqc))
    fastqc_cmd = "fastqc -o {} --noextract -f bam_mapped {}".format(
        workdir, bam_for_fastqc)
    command(fastqc_cmd).run_comm(0)

    fastqc_file = bam_for_fastqc.replace(".bam", "_fastqc.zip")
def generate_cluster3():
    """Merge all variants with GATK and cluster the samples with PLINK.

    GATK CombineVariants writes the merged variants to a file named
    ``prefix``; PLINK reads that file and writes its results with the same
    ``prefix``.  The module-level ``cluster_fname`` is set to
    ``<prefix>.cluster3``.
    """
    global cluster_fname

    gatk_jar = prop.get_attrib("gatk")
    combine_cmd = "java -jar {} -T CombineVariants -R {} {} -o {} -genotypeMergeOptions UNIQUIFY".format(
        gatk_jar, genome_fasta, get_variant_str(), prefix)
    command(combine_cmd).run_comm(0)

    plink_bin = prop.get_attrib("plink")
    cluster_cmd = "{} --vcf {} -cluster --allow-extra-chr -out {}".format(
        plink_bin, prefix, prefix)
    command(cluster_cmd).run_comm(0)

    cluster_fname = prefix + ".cluster3"
def check_seq_hunN(region):
    """Report whether the reference sequence of ``region`` matches ``sixtyNstr``.

    The region is written to ``tmp_region_fname`` and extracted from
    ``REF_FASTA`` with ``samtools faidx``; newlines are removed from the
    output before it is searched with the module-level ``sixtyNstr`` pattern.

    Args:
        region: region string handed to ``samtools faidx -r``.

    Returns:
        1 if the pattern is found, otherwise 0.
    """
    command("echo {} >{}".format(region, tmp_region_fname)).run_comm(0)

    faidx_cmd = "samtools faidx -r {} {}".format(tmp_region_fname, REF_FASTA)
    faidx_out = command(faidx_cmd).run_comm(1).decode("utf-8").rstrip()
    # Join the wrapped FASTA lines so the pattern can span line breaks.
    region_fasta_str = faidx_out.replace("\n", "")
    print("region_fasta_str=" + region + "\n" + region_fasta_str)

    if not re.search(sixtyNstr, region_fasta_str):
        return 0
    print("here 60N:" + region)
    return 1
def deduplication():
    """Remove duplicate reads from the trimmed FASTQ file(s) with clumpify.

    Each ``*.fq`` input produces a ``*.dedup.fq`` output; the outputs are
    collected in the module-level ``deduped_fqs`` (one entry for single-end
    data, two when ``fastq2`` is set).
    """
    global deduped_fqs

    def _clumpify(trimmed_fq):
        # Run clumpify on one file and return the de-duplicated file name.
        dedup_fq = trimmed_fq.replace(".fq", ".dedup.fq")
        command("clumpify in={} out={} dedupe=t".format(
            trimmed_fq, dedup_fq)).run_comm(0)
        return dedup_fq

    deduped_fqs = [_clumpify(fqout1)]
    if fastq2 is not None:
        deduped_fqs.append(_clumpify(fqout2))
def deduplication():
    """Remove duplicate reads using the tool configured under ``rm_dup_sw``.

    Sets the module-level ``fq_dedup_out1`` (and ``fq_dedup_out2`` when
    ``fastq2`` is set) to the ``*.dedup.fq`` output names.
    """
    global fq_dedup_out1
    global fq_dedup_out2

    dedup_tool = prop.get_attrib(rm_dup_sw)
    cmd_template = "{} in={} out={} dedupe=t"

    fq_dedup_out1 = fqout1.replace(".fq", ".dedup.fq")
    command(cmd_template.format(dedup_tool, fqout1,
                                fq_dedup_out1)).run_comm(0)

    if fastq2 is None:
        return  # single-end data: nothing more to do

    fq_dedup_out2 = fqout2.replace(".fq", ".dedup.fq")
    command(cmd_template.format(dedup_tool, fqout2,
                                fq_dedup_out2)).run_comm(0)
def multiqc():
    """Aggregate FastQC, mapper log and Qualimap results with MultiQC.

    The bowtie2 log is included only when ``mapping_tool`` is ``bowtie2``.
    Sets the module-level ``multiQC_outFN`` and ``fastqc_html``.
    """
    global multiQC_outFN
    global fastqc_html

    input_parts = [fastqc_file]
    if mapping_tool == "bowtie2":
        input_parts.append(bowtie2_log_for_multiqc)
    input_parts.append(qualimap_dir)
    multiQC_input = " ".join(input_parts)

    multiQC_outFN = "{}.multiQC".format(sample_runID)
    multiqc_cmd = "multiqc -f {} -o {} --filename {} -v".format(
        multiQC_input, workdir, multiQC_outFN)
    command(multiqc_cmd).run_comm(0)

    fastqc_html = "{}_{}_fastqc.html".format(prefix_ori, sample_runID)
def run_snpEff():
    """Annotate every VCF in ``all_vcf_fpaths`` with snpEff.

    The module-level ``genome`` code (``ch`` or ``cp``) is mapped to the
    snpEff database name.  The annotated output of ``<name>.vcf`` is written
    to ``<name>.ann.vcf`` in the current directory, and the list of outputs
    is stored in the module-level ``ann_vcf_fpaths``.
    """
    global ann_vcf_fpaths
    annot_sw = "snpeff"
    ann_vcf_fpaths = []
    snpEff = prop.get_attrib(annot_sw)
    genome_db = {"ch": "c_hominis", "cp": "c_parvum"}
    vcf_ext = ".vcf"

    for vcf_fpath in all_vcf_fpaths:
        vcf_fpath_prefix = os.path.basename(vcf_fpath)
        # str.rstrip(".vcf") strips the characters ".", "v", "c", "f" rather
        # than the suffix (e.g. "abc.vcf" -> "ab"); remove the real suffix.
        if vcf_fpath_prefix.endswith(vcf_ext):
            vcf_fpath_prefix = vcf_fpath_prefix[:-len(vcf_ext)]
        ann_vcf_fpath = vcf_fpath_prefix + ".ann.vcf"
        ann_vcf_fpaths.append(ann_vcf_fpath)
        command("java -jar {} -c snpEff.config {} {} > {}".format(
            snpEff, genome_db[genome], vcf_fpath, ann_vcf_fpath)).run_comm(0)
def add_group(fastqfiles):
    """Add read-group information to the sorted BAM with Picard.

    The read-group ID and sample name are both taken from the first line of
    the module-level ``fq1`` file: the text between the leading ``@`` and the
    first ``.``.  The output path is stored in the module-level
    ``grouped_bam``.

    Args:
        fastqfiles: not used by this function.

    Raises:
        IndexError: if the first line of ``fq1`` does not match the pattern.
    """
    global grouped_bam
    global picard
    grouped_bam = "{}/{}_grouped.bam".format(workdir, sample_runID)

    # Raw string: "\@" and "\." are invalid escapes in a normal literal.
    fq_id_pat = r"^\@(.*?)\."
    first_line_fq1 = command("head -n 1 " +
                             fq1).run_comm(1).decode("utf-8").rstrip()
    fq_id = re.findall(fq_id_pat, first_line_fq1)[0]

    cmd_str = "picard AddOrReplaceReadGroups I={} O={} RGID={} RGPU={} RGSM={} RGLB={} RGPL={} VALIDATION_STRINGENCY=LENIENT"
    command(
        cmd_str.format(bam_sorted, grouped_bam, fq_id, "NA", fq_id,
                       dna_library, platform)).run_comm(0)
def post_process():
    """Copy trimming, FastQC, MultiQC and de-duplication outputs.

    Trim Galore outputs (and de-duplicated FASTQ files when ``if_dedup`` is
    true) go to ``outdir`` with ``prefix + "_"`` prepended; FastQC zip/html
    files and the MultiQC report go to ``qcdir``.
    """
    print("post_processing...")
    name_prefix = prefix + "_"

    for trimmed_output in tg_out_files:
        FI.copy_file_add_prefix(trimmed_output, outdir, name_prefix)

    for fastqc_zip in fastqc_out_files:
        # Each FastQC zip has a matching html report next to it.
        fastqc_html_report = fastqc_zip.replace(".zip", ".html")
        FI.copy_file_to_destdir(fastqc_zip, qcdir)
        FI.copy_file_to_destdir(fastqc_html_report, qcdir)

    command("cp -p {}.html {}".format(multiqc_out_prefix, qcdir)).run_comm(0)

    if not if_dedup:
        return
    for dedup_output in deduped_fqs:
        FI.copy_file_add_prefix(dedup_output, outdir, name_prefix)
def report():
    """Write the omega lines of all codeml outputs to the file ``report``.

    For every file in ``codeml_outputs`` whose ``grep omega`` output contains
    a digit, the matching line(s) are written, followed by one
    ``<accession>_<genome>`` line per accession (second field of the lines
    starting with ``#``) and a blank line.
    """
    print("report...")

    def _text(raw):
        # presumably run_comm(1) returns the captured stdout; on Python 3
        # that may be bytes, which cannot be regex-searched with a str
        # pattern or written to a text-mode file.
        return raw if isinstance(raw, str) else raw.decode("utf-8")

    with open("report", 'w') as fh_report:
        for codeml_output in codeml_outputs:
            omega = _text(
                command("grep omega {}".format(codeml_output)).run_comm(1))
            if re.search(r"\d+", omega):
                fh_report.write(omega)
                accs_str = _text(
                    command("grep '^#' {} | awk '{{print $2}}'".format(
                        codeml_output)).run_comm(1))
                for acc in accs_str.rstrip().split():
                    fh_report.write("{}_{}\n".format(
                        acc, fasta_genome_map[acc_fasta_map[acc]]))
                fh_report.write("\n")
def get_intersect():
    """Write pairwise intersection and Jaccard matrices for all VCF files.

    Runs ``bedtools jaccard`` for every ordered pair of files in
    ``vcf_files`` and writes the first (intersection) and third (jaccard)
    output columns to ``<workdir>/intersect.matrix`` and
    ``<workdir>/jaccard.matrix``.  Row and column labels come from
    ``MISC.get_runID``; both files are finally passed to
    ``change_fileName_to_isoName``.

    The two file handles are module-level globals because
    ``out_files_write`` is called without them; it presumably writes the
    same text to both files.
    """
    global intersect_fPath
    global jaccard_fPath
    global fhout_intersect
    global fhout_jaccard

    vcf_dict = {}
    intersect_fPath = workdir + "/intersect.matrix"
    jaccard_fPath = workdir + "/jaccard.matrix"

    # "with" binds the global handle names and closes both files even when
    # a bedtools call or a label lookup fails.
    with open(intersect_fPath, 'w') as fhout_intersect, \
            open(jaccard_fPath, 'w') as fhout_jaccard:
        # header row
        out_files_write("name")
        for vcf in vcf_files:
            vcf_dict[vcf] = MISC.get_runID(vcf)
            out_files_write(" " + vcf_dict[vcf])
        out_files_write("\n")

        # one row per VCF
        for vcf1 in vcf_files:
            out_files_write(vcf_dict[vcf1])
            for vcf2 in vcf_files:
                (intersect, jaccard) = getVar(
                    command(
                        "bedtools jaccard -a {} -b {} |cut -f1,3|grep -v jaccard".
                        format(vcf1, vcf2)).run_comm(1).decode(
                            "utf-8").rstrip().split(), [0, 1])
                fhout_intersect.write(" " + intersect)
                fhout_jaccard.write(" " + jaccard)
            out_files_write("\n")

    change_fileName_to_isoName(intersect_fPath)
    change_fileName_to_isoName(jaccard_fPath)
def parse_mat():
    """Strip directory names from the sample columns of the matrix header.

    The first five header fields of ``upset_mat_file`` are kept as they are;
    for every later field everything up to the last ``/`` is removed.  The
    file is rewritten with the new tab-separated header and then passed to
    ``change_fileName_to_isoName``.
    """
    header_cmd = "head -n 1 {}".format(upset_mat_file)
    headline = command(header_cmd).run_comm(1).decode("utf-8").rstrip()
    fields = headline.split()

    fixed_cols = fields[:5]
    sample_cols = [re.sub("^.*/", "", field) for field in fields[5:]]
    new_headline = "\t".join(fixed_cols + sample_cols).rstrip("\t")

    command("mv {} {}.tmp".format(upset_mat_file, upset_mat_file)).run_comm(0)
    # remove dir of each sample from the head line
    command("sed '1s/.*/{}/' {}.tmp >{}".format(
        new_headline, upset_mat_file, upset_mat_file)).run_comm(0)

    change_fileName_to_isoName(upset_mat_file)
def ref_region_analsis():
    """Detect tandem repeats in the reference and collect expanded regions.

    Steps:
      1. Run Tandem Repeats Finder (``trf``) on ``REF_FASTA``.
      2. Convert its ``.dat`` output with ``jsat parseTRF`` to ``str`` format.
      3. For every repeat region compute the expanded region via
         ``get_exp_ref_region``, look up the repeat sequence (column 14 of
         the TRF ``.dat`` row with the same start, end and unit length),
         annotate the region and write a BED line.
      4. Extract all expanded regions from ``REF_FASTA`` into
         ``<prefix>.ref.exp.fasta`` and call ``get_closest_region_and_gene``
         on the temporary BED file.

    Sets the module-level ``sixtyNstr``, ``ref_exp_region_dict`` (keyed by
    expanded region), ``ann_dict`` (keyed by ``chrom:start-end``) and
    ``ref_exp_region_fasta``.
    """
    global ref_exp_region_dict
    global ref_exp_region_fasta
    global ann_dict
    global sixtyNstr
    sixtyNstr = get_60Nstr()
    ref_exp_region_dict = {}
    ann_dict = {}

    # Tandem Repeats Finder; its output file name encodes the parameters.
    command("trf {} 2 7 7 80 10 50 500 -f -d -m".format(REF_FASTA)).run_comm(0)

    ## define file names
    trf_out_name = os.path.basename(REF_FASTA) + ".2.7.7.80.10.50.500.dat"
    fName_str = "{}.trf.str".format(genome_name)
    exp_ref_region_fname = "{}.ref.exp.regions".format(prefix)
    ref_exp_region_fasta = "{}.ref.exp.fasta".format(prefix)
    tmp_bed_fPath = "{}/tmp.bed".format(workdir)

    ## run command
    # Convert the TRF output to the "str" table format.
    command("jsat parseTRF --input {} --output {} --format str".format(
        trf_out_name, fName_str)).run_comm(0)
    cmd_str = "grep -v '^#' {} | grep -v '^$' | awk '{{print $1\" \"$2\" \"$3\" \"$4\" \"$5}}'"
    ref_regions_str = command(cmd_str.format(fName_str)).run_comm(1).decode(
        "utf-8").rstrip().split("\n")

    # Looks up the repeat sequence of one region in the TRF .dat file.
    cmd = "grep '^[0-9]' {} | awk '{{if ($1=={} && $2=={} && $3=={}) print $14}}'"

    # "with" closes both outputs even if a region fails to process, and
    # guarantees they are flushed before samtools / the BED consumer run.
    with open(exp_ref_region_fname, "w") as fhout_exp_ref_region, \
            open(tmp_bed_fPath, "w") as fh_bed_tmp:
        for ref_region_str in ref_regions_str:
            (ref_chrom, ref_start, ref_end, ref_str_unit_len,
             ref_unit_num_ori) = getVar(ref_region_str.split(),
                                        [0, 1, 2, 3, 4])
            # Copy number may be fractional; keep the integer part only.
            ref_unit_num = str(int(float(ref_unit_num_ori)))
            (exp_ref_region,
             exp_ref_region_len) = get_exp_ref_region(ref_chrom, ref_start,
                                                      ref_end,
                                                      chrom_len_dict[ref_chrom])
            ref_region = "{}:{}-{}".format(ref_chrom, ref_start, ref_end)
            ref_len = str(abs(int(ref_start) - int(ref_end)) + 1)
            ref_str_seq = command(
                cmd.format(trf_out_name, ref_start, ref_end,
                           ref_str_unit_len)).run_comm(1).decode(
                               "utf-8").rstrip()
            ref_exp_region_dict[exp_ref_region] = (exp_ref_region_len,
                                                   ref_region, ref_str_seq,
                                                   ref_str_unit_len, ref_len,
                                                   ref_unit_num)
            fhout_exp_ref_region.write(exp_ref_region + "\n")
            ann_dict[ref_region] = get_region_ann(ref_region)
            print("ann_dict1={} {}".format(ann_dict[ref_region][0],
                                           ann_dict[ref_region][1]))
            fh_bed_tmp.write("{}\t{}\t{}\t{}:{}:{}\t.\t.\n".format(
                ref_chrom, ref_start, ref_end, ref_str_seq, ref_str_unit_len,
                ref_unit_num))

    command("samtools faidx -r {} {} -o {}".format(
        exp_ref_region_fname, REF_FASTA, ref_exp_region_fasta)).run_comm(0)
    get_closest_region_and_gene(tmp_bed_fPath)
def download(genome):
    """Download and unpack the cDNA FASTA of ``genome`` from Ensembl Genomes.

    The genome directory is searched directly under the protists FASTA root
    and, if not found there, one level below each of its sub-directories.
    The ``*cdna*`` file listed in the genome's ``cdna/`` directory is fetched
    with ``curl`` and unpacked with ``gunzip``.  The module-level
    ``fasta_genome_map`` gets an entry mapping the FASTA name to ``genome``.

    Args:
        genome: genome name; compared in lower case with directory names.

    Returns:
        The name of the unpacked cDNA FASTA file.

    Exits the program with status 1 when no directory for the genome exists.
    """

    def _text(raw):
        # presumably run_comm(1) returns the captured stdout; on Python 3
        # that may be bytes, which would never compare equal to the str
        # genome name and cannot be used in str.format() paths.
        return raw if isinstance(raw, str) else raw.decode("utf-8")

    ftp_dir = ""
    cDNA_fasta_fname = ""
    ftp_root_dir = "ftp://ftp.ensemblgenomes.org/pub/current/protists/fasta/"
    genome_dir = genome.lower()

    sub_dirs1 = _text(
        command("curl -s {} | awk '{{print $9}}'".format(
            ftp_root_dir)).run_comm(1)).split()
    if genome_dir in sub_dirs1:
        ftp_dir = "{}/{}/cdna/".format(ftp_root_dir, genome_dir)
    else:
        for sub_dir1 in sub_dirs1:
            sub_dirs2 = _text(
                command("curl -s {}/ | awk '{{print $9}}'".format(
                    ftp_root_dir + sub_dir1)).run_comm(1)).split()
            if genome_dir in sub_dirs2:
                ftp_dir = "{}/{}/{}/cdna/".format(ftp_root_dir, sub_dir1,
                                                  genome_dir)

    if ftp_dir == "":
        print("can not find cDNA fasta file for {}".format(genome))
        sys.exit(1)

    cDNA_fasta_gz = _text(
        command("curl -s {} | awk '{{print $9}}' | grep cdna".format(
            ftp_dir)).run_comm(1)).rstrip()
    command("curl -o {} {}".format(cDNA_fasta_gz,
                                   ftp_dir + cDNA_fasta_gz)).run_comm(0)

    # str.rstrip(".gz") strips the characters ".", "g", "z" rather than the
    # suffix (e.g. "seq.fastg.gz" -> "seq.fast"); remove the real suffix.
    gz_ext = ".gz"
    if cDNA_fasta_gz.endswith(gz_ext):
        cDNA_fasta_fname = cDNA_fasta_gz[:-len(gz_ext)]
    else:
        cDNA_fasta_fname = cDNA_fasta_gz
    command("gunzip -c {} > {}".format(
        cDNA_fasta_gz, cDNA_fasta_fname)).run_comm_no_exit(1)

    fasta_genome_map[cDNA_fasta_fname] = genome
    return cDNA_fasta_fname
def get_chrom_len():
    """Read chromosome lengths from the FASTA header lines of ``REF_FASTA``.

    Every header must look like ``>NAME ... length=NNN ...`` (with a space
    after the number).  The module-level ``chrom_len_dict`` maps each name
    to its length as a string of digits.

    Raises:
        IndexError: if a header line does not match the expected layout.
    """
    global chrom_len_dict
    chrom_len_dict = {}

    # Raw string: "\d" is an invalid escape in a normal literal.  Compiled
    # once because the same pattern is applied to every header line.
    header_pat = re.compile(r">(.*?) .*?length=(\d+) ")

    chro_ori_lines = command("grep '>' {}".format(REF_FASTA)).run_comm(
        1).decode("utf-8").rstrip().split("\n")
    for chro_ori_line in chro_ori_lines:
        (chrom, chr_len) = header_pat.findall(chro_ori_line)[0]
        chrom_len_dict[chrom] = chr_len
def run_snpEff():
    """Build the snpEff database and annotate every VCF in ``all_vcf_fpaths``.

    For each VCF the annotated file ``<prefix>_<name>_ann.vcf`` and a CSV
    statistics file are produced.  The statistics file is named
    ``<mirror[runID]>_<runID>.ann_stats`` when ``mapping_file`` is set and
    ``<runID>.ann_stats`` otherwise.  Output names are collected in the
    module-level ``ann_vcf_fpaths`` and ``ann_stat_fpaths``;
    ``ann_stat_fpath`` keeps the name from the last iteration.
    """
    global ann_stat_fpath
    global ann_stat_fpaths
    global ann_vcf_fpaths
    ann_vcf_fpaths = []
    ann_stat_fpaths = []

    snpeff_db = snpEff_db(properties_file, genome_name)
    snpeff_db.build_snpeff_db()
    vcf_ext = ".vcf"

    for vcf_fpath in all_vcf_fpaths:
        runID = MISC.get_runID(vcf_fpath)
        vcf_base = os.path.basename(vcf_fpath)
        # str.rstrip(".vcf") strips the characters ".", "v", "c", "f" rather
        # than the suffix (e.g. "abc.vcf" -> "ab"); remove the real suffix.
        if vcf_base.endswith(vcf_ext):
            vcf_base = vcf_base[:-len(vcf_ext)]
        vcf_prefix = "{}_{}".format(prefix, vcf_base)
        ann_vcf_fpath = vcf_prefix + "_ann.vcf"
        ann_vcf_fpaths.append(ann_vcf_fpath)

        if mapping_file is not None:
            ann_stat_fpath = "{}_{}.ann_stats".format(mirror[runID], runID)
        else:
            ann_stat_fpath = runID + ".ann_stats"
        ann_stat_fpaths.append(ann_stat_fpath)

        # NOTE(review): other call sites use run_comm(0); confirm that this
        # project's command class really provides run().
        command("snpEff -c snpEff.config {} {} -csvStats {} > {}".format(
            genome_name, vcf_fpath, ann_stat_fpath, ann_vcf_fpath)).run(0)