def generate_surv_params(param_file):
    """
    Run `SURVIVOR simSV` with param_file as its only argument.

    The command's stderr and stdout are logged at debug level. If the
    command returns a non-zero code, the error is logged and the process
    exits with that code.
    """
    logging.debug("Running SURVIVOR")
    result = cmd_exe(f"SURVIVOR simSV {param_file}")
    for stream in (result.stderr, result.stdout):
        logging.debug(stream)

    # Nothing more to do when the command succeeded.
    if result.ret_code == 0:
        return

    logging.error("Problem running SURVIVOR")
    logging.error(result.stderr)
    exit(result.ret_code)
def sim_reads_art(workdir, coverage=30, readlen=150, meanfrag=400, insertsd=50, instrument="HS25"):
    """
    Run art_illumina read simulator

    Changes the process working directory to workdir and runs art_illumina
    on 'svteaser.altered.fa' found there, writing outputs with the prefix
    'art_illumina.simReads'.

    :param workdir: directory containing svteaser.altered.fa
    :param coverage: passed to art_illumina as -f
    :param readlen: passed to art_illumina as -l
    :param meanfrag: passed to art_illumina as -m
    :param insertsd: passed to art_illumina as -s
    :param instrument: passed to art_illumina as -ss

    Exits the process (after logging an error) when art_illumina cannot be
    found, when workdir cannot be entered, or when art_illumina fails.
    """
    ret = cmd_exe("which art_illumina")
    if ret.ret_code != 0:
        logging.error("Cannot find art_illumina executable in the environment")
        # The result attribute is `ret_code` (as used everywhere else);
        # `retcode` raised AttributeError instead of exiting cleanly.
        exit(ret.ret_code)

    # NOTE: the working directory change is process-wide and is not undone.
    try:
        os.chdir(workdir)
    except OSError:
        logging.error(f"Cannot change into {workdir} directory")
        exit(1)

    alt_ref = 'svteaser.altered.fa'
    ret = cmd_exe((f"art_illumina -ss {instrument} -sam -na -i {alt_ref} -p "
                   f"-l {readlen} -m {meanfrag} -s {insertsd} -f {coverage} -o art_illumina.simReads"))
    if ret.ret_code != 0:
        logging.error("Problem running art_illumina")
        logging.error(ret.stderr)
        logging.error(ret.stdout)
        exit(ret.ret_code)
def run_trf(self, altseqs, refseqs=None):
    """
    Runs trf on the ref/alt sequences
    returns {'a': [althitsdict,..], 'r':[refhitsdict]}

    The sequences are written to TRFAnno.FANAME (alt entries named 'a', ref
    entries named 'r'), self.cmd is executed, and TRFAnno.TRNAME is parsed.

    :param altseqs: iterable of alternate sequences
    :param refseqs: optional iterable of reference sequences; None means
                    no reference sequences are written

    Exits the process (after logging an error) when self.cmd fails.
    """
    def parse_output():
        """
        Parse the outputs from trf, turn to a dictionary

        Returns a defaultdict(list) mapping each sequence name to its hits.
        """
        hits = defaultdict(list)
        with open(TRFAnno.TRNAME, 'r') as fh:
            name = fh.readline()
            if name == "":  # no hits
                return hits
            # Drop the leading marker character of the header line.
            name = name.strip()[1:]
            # If there are multiple, need to parameters for 'take best' or take top N or something
            # Will need name now that there's ref/alt seq
            for line in fh:
                if line.startswith("@"):
                    # Header line: subsequent hits belong to this sequence.
                    name = line.strip()[1:]
                    continue
                fields = line.strip().split(' ')
                # Pair each column definition with its value, skipping the
                # columns whose name starts with "unk".
                hit = {
                    col[0]: val
                    for col, val in zip(TRFAnno.TRFCOLS, fields)
                    if not col[0].startswith("unk")
                }
                # don't really need until parallel
                hit["TRF_scores"] = int(hit["TRF_scores"])
                hits[name].append(hit)
        return hits

    with open(TRFAnno.FANAME, 'w') as fout:
        for seq in altseqs:
            fout.write(">a\n%s\n" % (seq))
        # refseqs defaults to None; iterating it unguarded raised TypeError.
        if refseqs is not None:
            for seq in refseqs:
                fout.write(">r\n%s\n" % (seq))

    ret = cmd_exe(self.cmd)
    if ret.ret_code != 0:
        logging.error("Couldn't run trf")
        logging.error(str(ret))
        exit(ret.ret_code)

    return parse_output()
def pcmd_exe(cmd):
    """
    Run cmd via cmd_exe with `set -o pipefail; ` prepended to it.

    Returns whatever cmd_exe returns for the prefixed command.
    """
    piped_cmd = "set -o pipefail; " + cmd
    return cmd_exe(piped_cmd)
def vcf_compress(fn):
    """
    Run vcftools to sort/compress/index a vcf file

    Executes `vcf-sort fn | bgzip > fn.gz && tabix fn.gz` through cmd_exe.

    :param fn: path of the VCF file; the compressed output is written to
               fn + ".gz"

    Exits the process (after logging an error) when the command returns a
    non-zero code, instead of silently ignoring the failure.
    """
    ret = cmd_exe(f"vcf-sort {fn} | bgzip > {fn}.gz && tabix {fn}.gz")
    # NOTE(review): the pipeline is not run with pipefail here, so a failing
    # vcf-sort may still go unnoticed unless cmd_exe enables it - confirm.
    if ret.ret_code != 0:
        logging.error("Problem sorting/compressing/indexing %s", fn)
        logging.error(ret.stderr)
        exit(ret.ret_code)
def find_survivor():
    """
    Check that SURVIVOR can be run by invoking `SURVIVOR -h`.

    If the command returns a non-zero code, an error is logged and the
    process exits with that code.
    """
    status = cmd_exe("SURVIVOR -h").ret_code
    if status == 0:
        return
    logging.error("Cannot find SURVIVOR in environment")
    exit(status)
def process_regions(ref_file, regions, out_dir, param_file):
    """
    Simulate SVs with SURVIVOR inside each region and merge the results.

    For every (chrom, start, end) in regions the reference sequence is
    fetched from ref_file, its inner part (without `padding` bases on each
    end) is handed to `SURVIVOR simSV`, and the per-region results are
    appended to combined outputs inside out_dir:

      - svteaser.ref.fa      original region sequences
      - svteaser.altered.fa  altered region sequences
      - svteaser.sim.vcf     simulated variants (then passed to vcf_compress)

    :param ref_file: path to the reference FASTA (opened with pysam)
    :param regions: sequence of (chrom, start, end) tuples
    :param out_dir: existing directory that receives outputs and temp files
    :param param_file: SURVIVOR parameter file

    Exits the process (after logging an error) when SURVIVOR fails. When
    regions is empty no VCF is written and compression is skipped.
    """
    import tempfile

    out_vcf_path = os.path.join(out_dir, "svteaser.sim.vcf")
    out_ref_fa_path = os.path.join(out_dir, "svteaser.ref.fa")
    out_altered_fa_path = os.path.join(out_dir, "svteaser.altered.fa")

    # Define padding in reference region where SVs are not to be inserted.
    padding = 800

    # Created lazily: the header comes from the first per-region VCF.
    out_vcf_fh = None
    ref = pysam.FastaFile(ref_file)
    try:
        with open(out_ref_fa_path, "w+") as out_ref_fa_fh, \
                open(out_altered_fa_path, "w+") as out_altered_fa_fh:
            for i, (chrom, start, end) in enumerate(regions):
                # Track status.
                if (i + 1) % 50 == 0:
                    logging.info("Processed {}/{} regions...".format(i + 1, len(regions)))

                # Extract ref sequence.
                name = "{}_{}_{}".format(chrom, start, end)
                ref_seq = ref.fetch(chrom, start, end)
                # Remove some buffer from beginning and ending,
                # so that the tails do not contain SVs. These will be added
                # back later on.
                ref_seq_surv = ref_seq[padding:len(ref_seq) - padding]

                # A fresh, uniquely named temporary dir per region: it cannot
                # collide with leftovers of an earlier run and is removed
                # even when an error occurs.
                with tempfile.TemporaryDirectory(prefix="temp_", dir=out_dir) as temp_dir:
                    # Write ref sequence to temporary fa file.
                    temp_ref_fa = os.path.join(temp_dir, "temp_ref.fa")
                    with open(temp_ref_fa, "w") as fh:
                        add_fasta_entry(name, ref_seq_surv, fh)

                    # Run SURVIVOR.
                    prefix = os.path.join(temp_dir, "simulated")
                    survivor_cmd = " ".join(["SURVIVOR", "simSV", temp_ref_fa,
                                             param_file, "0.0", "0", prefix])
                    ret = cmd_exe(survivor_cmd)
                    if ret.ret_code != 0:
                        logging.error("Problem running SURVIVOR")
                        logging.error(ret.stderr)
                        exit(ret.ret_code)

                    # Read output of SURVIVOR
                    altered_fa_path = "{}.fasta".format(prefix)
                    insertions_fa_path = "{}.insertions.fa".format(prefix)
                    sim_vcf = "{}.vcf".format(prefix)

                    # Update VCF
                    temp_vcf = os.path.join(temp_dir, "temp.vcf")
                    update_vcf(temp_ref_fa, insertions_fa_path, sim_vcf,
                               temp_vcf, pos_padding=padding)

                    # Merge seqs and variants entries into single FA/VCF files
                    # Add the initial and last 800bp back to the altered fasta
                    altered_fa = pysam.FastaFile(altered_fa_path)
                    try:
                        altered_seq = altered_fa.fetch(name)
                    finally:
                        altered_fa.close()
                    altered_seq = update_altered_fa(ref_seq, altered_seq, padding)
                    add_fasta_entry(name, altered_seq, out_altered_fa_fh)
                    add_fasta_entry(name, ref_seq, out_ref_fa_fh)

                    vcf_reader = pysam.VariantFile(temp_vcf)
                    try:
                        if out_vcf_fh is None:
                            out_vcf_fh = pysam.VariantFile(out_vcf_path, 'w',
                                                           header=vcf_reader.header)
                        for record in vcf_reader:
                            out_vcf_fh.write(record)
                    finally:
                        vcf_reader.close()
    finally:
        ref.close()
        if out_vcf_fh is not None:
            out_vcf_fh.close()

    # With no regions there is no VCF to compress (previously this path
    # crashed on out_vcf_fh.close() with out_vcf_fh still None).
    if out_vcf_fh is None:
        logging.warning("No regions were processed; no VCF written")
        return
    vcf_compress(out_vcf_path)