def bedtools_gcov(OMA):
    """Write per-base coverage tables for every BAM file of one taxon.

    Changes into ``<new_dir><OMA>/bedtool_coverage/bam_files``, runs
    ``bedtools genomecov -d`` on each ``*.bam`` file found there, and then
    moves every ``*.txt`` file in that directory to ``../genCov_files``.
    The working directory is left inside ``bam_files``.
    """
    os.chdir(new_dir + OMA + "/bedtool_coverage/bam_files")

    for bam_path in glob.glob("*.bam"):
        # The output tag is the BAM file name minus its last 11 characters.
        sample_tag = bam_path[:-11]
        coverage_cmd = "bedtools genomecov -ibam {} -d > positionsCov_{}.txt".format(
            bam_path, sample_tag)
        unix(coverage_cmd, shell=True)

    unix("mv *.txt ../genCov_files", shell=True)
def bowtie_build(OMA):
    """Build one bowtie2 index per FASTA file of a taxon.

    Creates ``<new_dir>/<OMA>/bowtie_index``, changes into
    ``<old_dir><OMA>/fusion_nuc_fams`` and runs ``bowtie2-build -f`` on every
    ``*.fasta`` file there, writing each index into the new directory.
    The working directory is left inside ``fusion_nuc_fams``.
    """
    # Output side: <new_dir>/<OMA>/bowtie_index
    os.chdir(new_dir)
    os.mkdir(OMA)
    os.chdir(OMA)
    os.mkdir("bowtie_index")

    # Input side: <old_dir><OMA>/fusion_nuc_fams
    os.chdir(old_dir + OMA)
    os.chdir("fusion_nuc_fams")

    index_root = new_dir + OMA + "/bowtie_index/"
    for fasta_path in glob.glob("*.fasta"):
        # The index name is the FASTA file name minus its last 18 characters.
        index_name = fasta_path[:-18]
        unix("bowtie2-build -f " + fasta_path + " " + index_root + index_name,
             shell=True)
def second_blast(fasta):
    """Search one FASTA file for the requested gene with ``mmseqs easy-search``.

    The query is ``../../raw/hbx_data/family_data/<args.gene>.fasta`` and the
    target is *fasta*. Hits are written to
    ``hbx_mmseqoutput/<assembly>_<args.gene>.m8`` and that file name (without
    the directory) is appended to the module-level ``mmseq_outfile`` list.
    """
    # Assembly name: last path component of *fasta*, up to its first '.'.
    assembly = fasta.rsplit('/', 1)[-1].partition('.')[0]

    #Search for hbx genes with mmseq easy-search
    print('\n')
    print('Running MMseqs;', assembly)

    result_name = assembly + '_' + args.gene + '.m8'
    search_cmd = ''.join([
        'mmseqs easy-search ../../raw/hbx_data/family_data/',
        args.gene,
        '.fasta ',
        fasta,
        ' hbx_mmseqoutput/',
        result_name,
        ' tmp --spaced-kmer-pattern 1101111 -k 6 -a -e 1 --num-iterations 2',
    ])
    unix(search_cmd, shell=True)
    mmseq_outfile.append(result_name)
def sam_to_bam(OMA):
    """Convert every mapped SAM file of a taxon into a sorted BAM file.

    Creates ``bedtool_coverage/bam_files`` and ``bedtool_coverage/genCov_files``
    under ``<new_dir><OMA>``, then, inside ``<new_dir><OMA>/bowtie_mapping/``,
    pipes each ``*.sam`` file through ``samtools view -b`` and
    ``samtools sort`` and finally moves all ``*.bam`` files to
    ``../bedtool_coverage/bam_files``.
    """
    taxon_dir = new_dir + OMA

    # Directory skeleton for the coverage step.
    os.chdir(taxon_dir)
    os.mkdir("bedtool_coverage")
    os.chdir("bedtool_coverage")
    for subdir in ("bam_files", "genCov_files"):
        os.mkdir(subdir)

    os.chdir(taxon_dir + "/bowtie_mapping/")
    for sam_path in glob.glob("*.sam"):
        # The output stem is the SAM file name minus its last 11 characters.
        stem = sam_path[:-11]
        unix("samtools view -b {} | samtools sort -o {}_sorted.bam".format(
            sam_path, stem), shell=True)

    unix("mv *.bam ../bedtool_coverage/bam_files", shell=True)
def hbx_blast(fasta):
    """Build a BLAST database from a genome FASTA and search it with tblastn.

    The species name is the last path component of *fasta* up to the first
    occurrence of ``.fast``. The database is written to
    ``genome_blastdb/<species>`` and the tabular tblastn result to
    ``<species>.blastoutput.fa`` in the current directory.
    """
    species = fasta.rsplit('/', 1)[-1].split('.fast')[0]
    print('Running BLAST;', species)
    database = 'genome_blastdb/' + species

    #Make blast database from genome fasta files
    unix('makeblastdb -dbtype nucl -in ' + fasta + ' -out ' + database,
         shell=True)

    #Search for hbx genes in unannotated genomes
    search_cmd = (
        'tblastn -query ../../raw/hbx_data/homeobox.fasta -db ' + database
        + ' -evalue 1 -seg yes -max_target_seqs 5000 -outfmt "6 qseqid sseqid evalue pident bitscore qstart qend qlen sstart send slen" -out '
        + species + '.blastoutput.fa'
    )
    unix(search_cmd, shell=True)
def get_zpool_status(status_text=None):
    """Render ``zpool status`` output as an HTML ``<pre>`` block with meters.

    Args:
        status_text: Optional pre-captured ``zpool status`` output. When it
            is ``None`` (the default) the command
            ``ZPOOL_SCRIPTS_AS_ROOT=1 zpool status -c size`` is run through
            ``unix``, which is expected to return a ``(status, output)`` pair.

    Returns:
        An HTML string. Lines containing ``state:`` and every line of the
        device table (from the line containing ``CKSUM`` up to the next blank
        line) get a ``<meter>`` element: ``optimum=10`` when the line contains
        ``ONLINE``, ``optimum=0`` otherwise. All other lines are copied
        unchanged. Every input line is terminated with ``<br>``.
    """
    if status_text is None:
        _status, status_text = unix("ZPOOL_SCRIPTS_AS_ROOT=1 zpool status -c size")

    healthy = "<meter id=\"bullet\" max=10 min=0 value=10 high=9 low=2 optimum=10></meter> "
    unhealthy = "<meter id=\"bullet\" max=10 min=0 value=10 high=9 low=2 optimum=0></meter> "

    def meter_for(text):
        # Use a membership test: str.find() returns -1 (truthy) when the text
        # is absent and 0 (falsy) when it is at the start of the line, so
        # testing its truthiness reported degraded pools as healthy.
        return healthy if "ONLINE" in text else unhealthy

    parts = ['<pre>\n']
    in_device_table = False
    for line in status_text.split('\n'):
        if 'state:' in line:
            parts.append(line + " " + meter_for(line) + "<br>")
        elif "CKSUM" in line:
            # Table header: drop its first character and start the table.
            in_device_table = True
            parts.append(" " + line[1:] + '<br>')
        elif not line.strip():
            # A blank line ends the device table.
            in_device_table = False
            parts.append('<br>')
        elif in_device_table:
            # Device row: the meter replaces the row's first character.
            parts.append(meter_for(line) + line[1:] + "<br>")
        else:
            parts.append(line + '<br>')
    parts.append('</pre>')
    return ''.join(parts)
def show_data():
    """Build the short multi-line status text for the backup server.

    Returns two ``Unavaiable ]`` lines when ``hostisup()`` is false.
    Otherwise runs the ``running``, ``backingup``, ``preparing`` and
    ``latest`` commands from ``cmds`` (in that order) and sets the matching
    module-level flag to ``True`` for each one that exits with status 0;
    flags are never reset here. The result is ``Server Off-line ]`` when
    ``ServerPresent`` is not set, else a state line followed, when the latest
    backup time is earlier than the current local time, by its date as
    ``dd/mm/yy``. The trailing newline is stripped.
    """
    global Running, Preparing, BackingUp, ServerPresent

    if not hostisup():
        return "Unavaiable ]\n" + "Unavaiable ]"

    if unix(cmds["running"])[0] == 0:
        Running = True
    if unix(cmds["backingup"])[0] == 0:
        BackingUp = True
    if unix(cmds["preparing"])[0] == 0:
        Preparing = True
    latest_status, latest_output = unix(cmds["latest"])
    if latest_status == 0:
        ServerPresent = True

    if not ServerPresent:
        report = "Server Off-line ]\n"
        return report[:-1]

    # The timestamp starts three characters after the first '->'.
    stamp = latest_output[latest_output.find('->') + 3:]
    latest_time = time.strptime(stamp, "%Y-%m-%d-%H%M%S")
    shown_date = time.strftime("%d/%m/%y", latest_time)

    # First matching flag wins, in this priority order.
    if BackingUp:
        report = "Backing Up ]\n"
    elif Preparing:
        report = "Preparing ]\n"
    elif Running:
        report = "Running ]\n"
    else:
        report = "Idle ]\n"

    if latest_time < time.localtime():
        report += "%s ]\n" % (shown_date)
    return report[:-1]
def get_usage(df_output=None):
    """Render disk usage as an HTML ``<pre>`` block with one meter per line.

    Args:
        df_output: Optional pre-captured ``df -h`` style text. When it is
            ``None`` (the default) the filtered ``df -h`` command is run
            through ``unix``, which is expected to return a
            ``(status, output)`` pair.

    Returns:
        An HTML string. For each input line whose fifth whitespace-separated
        field is an integer percentage, a ``<meter>`` with that value is
        placed before the line. Every other line (header, blank line, short
        line) gets a single space instead. Each line ends with ``<br>``.
    """
    if df_output is None:
        _status, df_output = unix(
            "df -h | grep -vE ' /dev| /run| /sys| /boot| /etc| /$'")

    parts = ['<pre>\n']
    for line in df_output.split('\n'):
        fields = line.split()
        try:
            perc = int(fields[4].replace('%', ''))
        except (IndexError, ValueError):
            # IndexError: fewer than five fields (blank line or empty
            # output), which previously escaped and aborted the page.
            # ValueError: the field is not a number (e.g. the header).
            parts.append(" ")
        else:
            parts.append(
                "<meter max=100 min=0 value=%d high=75 low=50 optimum=19></meter> "
                % perc)
        parts.append(line + '<br>')
    parts.append('</pre>')
    return ''.join(parts)
def get_drive_temps(pwd):
    """Run ``<pwd>/zfs_drive_temps.sh -t`` and render its output as HTML.

    A single leading newline in the script output is dropped. For each line
    containing ``Celsius`` the second whitespace-separated field is parsed as
    an integer and shown as a ``<meter>`` before the line. In every line
    `` Celsius`` becomes ``°C`` and ``-`` and ``_`` become spaces. Returns the
    lines joined with ``<br>`` inside a ``<pre>`` element.
    """
    _status, report = unix(pwd + '/zfs_drive_temps.sh -t')
    if report.startswith('\n'):
        report = report[1:]

    chunks = ['<pre>\n']
    for row in report.split('\n'):
        if "Celsius" in row:
            degrees = int(row.split()[1])
            chunks.append(
                "<meter max=75 min=0 value=%d high=40 low=30 optimum=19></meter> "
                % degrees)
        pretty = row.replace(' Celsius', '°C')
        pretty = pretty.replace('-', ' ')
        pretty = pretty.replace('_', ' ')
        chunks.append(pretty + '<br>')
    chunks.append('</pre>')
    return ''.join(chunks)
def backupAlreadyRunning(errlog):
    """Report whether another backup process is already running.

    Lists processes with ``ps``, keeps command lines matching
    ``python .*myowncrashplan`` (case-insensitive), and removes the ``grep``
    itself and this process's own pid. Any remaining output means another
    instance exists: it is logged through ``errlog.info`` and ``True`` is
    returned; otherwise ``False``.
    - this works for MacOs and Linux
    """
    from subprocess import getstatusoutput as run_shell

    own_pid = os.getpid()
    probe = (
        "ps -eo pid,command | grep -i 'python .*myowncrashplan' "
        "| grep -v grep | awk -F' ' '{print $1}' | grep -v '^%d$'" % own_pid
    )
    _status, other_pids = run_shell(probe)

    if other_pids == "":
        return False
    errlog.info("Backup already running. [pid %s], so exit here." % (other_pids))
    return True
def bowtie_map(OMA):
    """Map the trimmed reads of one taxon against each of its bowtie2 indexes.

    Creates ``<new_dir><OMA>/bowtie_mapping``, works inside
    ``<new_dir><OMA>/bowtie_index`` and reads the trimmed FASTQ files from
    ``<old_dir><OMA>/data_fastq_trimmed``. Taxa listed in ``SE_reads`` are
    mapped with ``bowtie2 -U`` (single-end), taxa listed in ``PE_reads`` with
    ``-1``/``-2`` (paired-end); a taxon in neither list is not mapped. Only
    mapped reads are kept (``samtools view -F4``). All ``*.sam`` files are
    finally moved to ``../bowtie_mapping/``.
    """
    os.chdir(new_dir + OMA)
    os.mkdir("bowtie_mapping")
    os.chdir("bowtie_index")
    if OMA in SE_reads:
        # Index names: the part of each .bt2 file name before its first '.'.
        single_IDs = []
        for single_file in glob.glob("*.bt2"):
            SRR_ID = single_file.split(".")[0]
            single_IDs.append(SRR_ID)
        # set() collapses the several .bt2 files that share one index name.
        for single_ID in set(single_IDs):
            for read_file in glob.glob(old_dir + OMA + "/data_fastq_trimmed/*.gz"):
                read = read_file.split("/")[-1]
                # Drop the last 14 characters of the file name
                # (presumably the "_trim.fastq.gz" suffix - confirm).
                read_ID = read[:-14]
                unix("bowtie2 -x " + single_ID + " -U " + read_file +
                     " | samtools view -S -h -F4 - > " + single_ID + "_" +
                     read_ID + "_mapped.sam",
                     shell=True)
    else:
        if OMA in PE_reads:
            IDs = []
            for paired_file in glob.glob("*.bt2"):
                SRR_ID = paired_file.split(".")[0]
                IDs.append(SRR_ID)
            # NOTE(review): read_files is created once, before the index
            # loop, so it keeps accumulating across indexes; duplicates are
            # removed again by set() below.
            read_files = []
            for ID in set(IDs):
                for read_file in glob.glob(old_dir + OMA + "/data_fastq_trimmed/*.gz"):
                    read = read_file.split("/")[-1]
                    read_ID = read[:-14]
                    # Split off the trailing 9 characters of the read name.
                    read_SRR = read_ID[:-9]
                    trim_unpair = read_ID[-9:]
                    # Skip the unpaired files; only paired reads are mapped.
                    if trim_unpair == '_unpaired':
                        continue
                    else:
                        read_files.append(read_SRR)
                # NOTE(review): nesting inferred from the use of ID in the
                # command below - confirm against the original layout.
                for reads in set(read_files):
                    unix("bowtie2 -x " + ID + " -1 " + old_dir + OMA +
                         "/data_fastq_trimmed/" + reads +
                         "_1_paired_trim.fastq.gz " + "-2 " + old_dir + OMA +
                         "/data_fastq_trimmed/" + reads +
                         "_2_paired_trim.fastq.gz | samtools view -S -h -F4 - > "
                         + ID + "_" + reads + "_mapped.sam",
                         shell=True)
    #Move output SAM files to bowtie_mapping directory
    unix("mv *.sam ../bowtie_mapping/", shell=True)
def data_download_user(query, species):
    '''
    Function called when using --query flag. Requires fasta file of query
    species barcode, along with name of species or genus to search against
    from BOLD database. Gets barcode sequence for the query species and
    related species specified.

    query: file name of the query fasta inside ``queries/``; the text before
        the first ``.fa`` names the working directory and output files.
    species: ``Genus_species`` to search against. An empty or missing species
        part (``Genus_`` or ``Genus``) selects a genus-level search.

    Works inside ``output/<query>_query`` (or ``output/<query>_genus_query``),
    which is emptied first if it already exists, writes the de-duplicated
    sequences to ``<query>_query.fa`` there and changes back two directory
    levels when done.
    '''
    os.makedirs('output', exist_ok=True)
    sp_query = query.split('.fa')[0]

    # A name without '_' used to raise IndexError here; treat it the same as
    # "Genus_", i.e. as a genus-level search.
    species_parts = species.split('_')
    species_epithet = species_parts[1] if len(species_parts) > 1 else ''
    suffix = '_genus_query' if species_epithet == '' else '_query'
    work_dir = 'output/' + sp_query + suffix

    # Start from an empty working directory. Only "already exists" triggers
    # the wipe; any other OS error is reported as-is instead of being masked.
    try:
        os.mkdir(work_dir)
    except FileExistsError:
        shutil.rmtree(work_dir)
        os.mkdir(work_dir)
    os.chdir(work_dir)

    # Copy the query sequence(s), tagging every header with "_query".
    with open('../../queries/' + query) as f, open(sp_query + '_query.fasta', 'w') as outF:
        for record in SeqIO.parse(f, 'fasta'):
            ID = record.description
            seq = str(record.seq)
            outF.write('>' + ID + '_query' + '\n' + seq + '\n')

    expected_sp_BOLD = sp_query.split('_')[0].title() + '%20' + sp_query.split('_')[1]
    sister_sp_BOLD = species_parts[0].title() + '%20' + species_epithet

    #Download barcode sequences from BOLD
    unix('wget http://www.boldsystems.org/index.php/API_Public/sequence?taxon=' + expected_sp_BOLD, shell=True)
    unix('wget http://www.boldsystems.org/index.php/API_Public/sequence?taxon=' + sister_sp_BOLD, shell=True)

    # Rename each download to <taxon>.fas (spaces -> '_'). Renaming from
    # Python avoids shell quoting of the file name and the invalid '\ '
    # escape sequence the shell command needed.
    for bold in glob.glob('sequence*'):
        sp_fasta = bold.split('=')[1].replace(' ', '_')
        shutil.move(bold, sp_fasta + '.fas')

    #Combine BOLD barcode seqs with DToL query barcode seq
    unix('cat *fas >> ' + sp_query + '_query.fasta', shell=True)

    #Remove duplicate sequences from the fasta file
    with open(sp_query + '_query.fasta') as f, open(sp_query + '_query.fa', 'w') as outF:
        bold_id_dict = {}
        for record in SeqIO.parse(f, 'fasta'):
            # Keep the first sequence seen for each header.
            bold_id_dict.setdefault(record.description, str(record.seq))
        for k, v in bold_id_dict.items():
            # Only COI barcodes and the query itself are kept.
            if ('COI' in k) or ('query' in k):
                outF.write('>' + k + '\n' + v + '\n')

    os.chdir('../../')
def trim_data(OMA_ID):
    """Trim the raw FASTQ files of one taxon with Trimmomatic, then run FastQC.

    Works inside ``/data0/bspm/shortRead_rerun_mapping/<OMA_ID>``: creates
    ``data_fastq_trimmed``, trims every run found in ``data_fastq`` in
    single-end mode when the taxon is in ``SE_reads`` or in paired-end mode
    when it is in ``PE_reads``, moves the trimmed files and log files to
    ``data_fastq_trimmed`` and writes FastQC reports to ``FastQC_postTrim``.
    """
    print(OMA_ID, ' is being processed')
    taxon_dir = "/data0/bspm/shortRead_rerun_mapping/" + OMA_ID
    trimmomatic = "java -jar /home/bspm/bin/Trimmomatic-0.38/trimmomatic-0.38.jar"

    #Trim fastq files to remove adapter (TruSeq3 file) and based on quality and read length
    os.chdir(taxon_dir)
    os.mkdir("data_fastq_trimmed")
    os.chdir("data_fastq")

    if OMA_ID in SE_reads:
        unix("cp ../../TruSeq3-SE.fa .", shell=True)
        # Run ids: file names minus their last 9 characters, de-duplicated.
        runs = {name[:-9] for name in glob.glob("*.gz")}
        for run in runs:
            unix((trimmomatic + " SE -phred33 -trimlog {0}_Logfile.txt "
                  "{0}.fastq.gz {0}_trim.fastq.gz "
                  "ILLUMINACLIP:TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 "
                  "SLIDINGWINDOW:10:30 MINLEN:35").format(run),
                 shell=True)
    elif OMA_ID in PE_reads:
        unix("cp ../../TruSeq3-PE.fa .", shell=True)
        # Run ids: file names minus their last 11 characters, de-duplicated.
        runs = {name[:-11] for name in glob.glob("*.gz")}
        for run in runs:
            unix((trimmomatic + " PE -phred33 -trimlog {0}_Logfile.txt "
                  "{0}_1.fastq.gz {0}_2.fastq.gz "
                  "{0}_1_paired_trim.fastq.gz {0}_1_unpaired_trim.fastq.gz "
                  "{0}_2_paired_trim.fastq.gz {0}_2_unpaired_trim.fastq.gz "
                  "ILLUMINACLIP:TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 "
                  "SLIDINGWINDOW:10:30 MINLEN:35").format(run),
                 shell=True)

    # Collect the outputs and remove the copied adapter file.
    for cleanup in ("mv *trim* ../data_fastq_trimmed/",
                    "mv *Logfile* ../data_fastq_trimmed/",
                    "rm *TruSeq3*"):
        unix(cleanup, shell=True)

    #Run FastQC on trimmed files
    os.chdir(taxon_dir)
    os.mkdir("FastQC_postTrim")
    os.chdir("data_fastq_trimmed")
    unix("perl /home/bspm/bin/FastQC/fastqc -o ../FastQC_postTrim --noextract -q *.gz", shell=True)
    print(OMA_ID, ' is finished processed')
gene_end + '.fasta', 'w') as outF1: outF1.write('>' + gene_header + '\n' + gene_nuc_seq[0] + '\n') else: with open( 'temp_seqs/' + spName + '_' + geneName + '_' + contig + '_' + gene_start + '_' + gene_end + '.fasta', 'w') as outF3: outF3.write('>' + gene_header + '\n' + gene_nuc_seq[0] + '\n') ##Translate nucleotide hbx sequences os.chdir('temp_seqs') for fasta in glob.glob("*.fasta"): unix('sixpack -sequence ' + fasta + ' -outfile ' + fasta + '.sixpack -outseq ' + fasta + '.sixpack.fa', shell=True) outF = open(args.gene + '_HD_AA.fasta', 'w') sorted_fasta = [] for fa in glob.glob("*fa"): sorted_fasta.append(fa) sorted_fasta = sorted(sorted_fasta) for fa in sorted_fasta: fa_info = fa.split('.')[0] if len(fa_info.split('_')) == 6: spName = '_'.join(fa_info.split('_')[:1]) geneID = fa_info.split('_')[2] contig = fa_info.split('_')[3] pos = '_'.join(fa_info.split('_')[4:5])
def download_data(OMA_ID, SRA_ID):
    """Download the SRA runs of one taxon and convert them to gzipped FASTQ.

    OMA_ID names the taxon directory, which is created in the current
    directory; all later steps use
    ``/data1/bspm/shortRead_mapping/<OMA_ID>``. SRA_ID is an iterable of
    study accessions. Each study is fetched recursively with ``wget``, its
    ``.sra`` files are gathered in ``data_sra`` and converted with
    ``fastq-dump`` into ``data_fastq``, and the intermediate files are
    removed. Ends in ``/data1/bspm/shortRead_mapping/``.
    """
    print(OMA_ID, ' is being processed')
    os.mkdir(OMA_ID)
    base_dir = "/data1/bspm/shortRead_mapping/"
    taxon_dir = base_dir + OMA_ID
    study_path = "ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByStudy/sra/{}/{}/{}/"
    os.chdir(taxon_dir)

    #Download the sra directories for each taxa bioproject
    for accession in SRA_ID:
        remote = "ftp://" + study_path.format(accession[:3], accession[:6], accession)
        unix("wget -r " + remote, shell=True)

    ##Put all the .sra read files into data_fastq dir, and remove ftp folder
    os.chdir(taxon_dir)
    for accession in SRA_ID:
        # Runs of an "SRP" study are named SRR*; anything else uses ERR*.
        run_prefix = "SRR" if accession[:3] == "SRP" else "ERR"
        local = study_path.format(accession[:3], accession[:6], accession)
        unix("mv " + local + "*" + run_prefix + "*" + "/*.sra " + taxon_dir,
             shell=True)
    os.chdir(taxon_dir)
    for step in ("rm -r ftp-trace.ncbi.nih.gov",
                 "mkdir data_sra",
                 "mv *.sra data_sra"):
        unix(step, shell=True)
    os.chdir(base_dir)

    ##Get fastq file for the SRA files using fastq-dump
    os.chdir(taxon_dir)
    os.mkdir("data_fastq")
    os.chdir("data_sra")
    for sra_path in glob.glob("*.sra"):
        unix("fastq-dump --gzip --skip-technical --readids --dumpbase --split-files "
             + sra_path, shell=True)
    unix("mv *.gz ../data_fastq", shell=True)
    unix("rm *.sra", shell=True)
    os.chdir("../")
    unix("rm -r data_sra", shell=True)
    os.chdir(base_dir)
    print(OMA_ID, ' is finished processing')
def data_download(barcode_results):
    '''
    Function called when using --barcode flag. Parse the DToL barcode sheet
    to find cases where the species identification does not match the barcode
    sequence, and download barcode data from boldsystems.org for the queried
    species.

    The input path is taken from ``args.barcode`` (.xlsx or .csv); the
    ``barcode_results`` argument is overwritten with that path minus its
    extension. Returns None after printing an error for any other extension.

    Rows with flag 'C' and a non-empty result species are appended to
    ``barphy_results.csv``; those whose expected species part is 'sp' are
    written to ``queries/``, the others get a working directory
    ``output/<specimen_ID>`` with the downloaded, de-duplicated sequences in
    ``<specimen_ID>_query.fa`` and their specimen id appended to the
    module-level ``query_list``. Rows with flag 'B' are appended to
    ``barphy_results.csv`` and written to ``queries/``.
    '''
    #Convert excel format to csv for parsing
    if args.barcode.lower().endswith('.xlsx'):
        barcode_results = args.barcode.split('.xlsx')[0]
        data_xls = pd.read_excel(args.barcode, dtype=str, index_col=None)
        data_xls.to_csv(barcode_results + '.csv', encoding='utf-8', index=False)
    elif args.barcode.lower().endswith('.csv'):
        barcode_results = args.barcode.split('.csv')[0]
    else:
        print('Error:')
        print('Excel or csv file required as input for --barcode')
        return None
    os.makedirs('output', exist_ok=True)
    os.makedirs('queries', exist_ok=True)
    #Download data for query species from BOLD database
    with open(barcode_results + '.csv') as f:
        # Skip the header row.
        next(f)
        for line in f:
            # NOTE(review): plain split(',') - a quoted field containing a
            # comma would shift every column index below; confirm the sheet
            # never has one.
            lines = line.split(',')
            specimen_ID = lines[2]
            expected_sp = lines[6].title() + '_' + lines[7]
            result_sp = lines[10].title() + '_' + lines[11]
            DNA = lines[15]
            flag = lines[14]
            #Currently just parsing cases where barcode is flagged as yellow and has at least genus level hit
            if flag == 'C':
                # Both result columns empty -> no hit at all, skip the row.
                if result_sp.split('_') != ['', '']:
                    with open('barphy_results.csv', 'a+') as outBar:
                        outBar.write(specimen_ID + ',' + expected_sp + ',' + result_sp + '\n')
                    if expected_sp.split('_')[1] == 'sp':
                        with open('queries/' + specimen_ID + '_' + expected_sp + '.fasta', 'w') as outF_B:
                            outF_B.write('>' + expected_sp + '|' + specimen_ID + '\n' + DNA + '\n')
                    else:
                        #Save specimen IDs to list
                        query_list.append(specimen_ID)
                        # Start from an empty per-specimen directory.
                        # NOTE(review): the bare except treats every failure
                        # of mkdir as "directory exists" - consider
                        # FileExistsError.
                        try:
                            os.mkdir('output/' + specimen_ID)
                        except:
                            shutil.rmtree('output/' + specimen_ID)
                            os.mkdir('output/' + specimen_ID)
                        os.chdir('output/' + specimen_ID)
                        with open(specimen_ID + '_query.fasta', 'w') as outF:
                            outF.write('>' + expected_sp + '_DToL' + '\n' + DNA + '\n')
                        expected_sp_BOLD = lines[6].title() + '%20' + lines[7]
                        result_sp_BOLD = lines[10].title() + '%20' + lines[11]
                        #Download barcode sequences from BOLD
                        print('Downloading barcode data from BOLD (boldsystems.org)...')
                        unix('wget -q --show-progress http://www.boldsystems.org/index.php/API_Public/sequence?taxon=' + expected_sp_BOLD, shell=True)
                        unix('wget -q --show-progress http://www.boldsystems.org/index.php/API_Public/sequence?taxon=' + result_sp_BOLD, shell=True)
                        # Rename each download to <taxon>.fas (spaces -> '_').
                        for bold in glob.glob('sequence*'):
                            sp_fasta = bold.split('=')[1]
                            sp_fasta = sp_fasta.replace(' ', '_')
                            # Backslash-escape spaces for the shell command.
                            # NOTE(review): '\ ' is an invalid escape
                            # sequence and warns on newer Python versions;
                            # '\\ ' is the same string.
                            bold = bold.replace(' ', '\ ')
                            unix('mv ' + bold + ' ' + sp_fasta + '.fas', shell=True)
                        #Combine BOLD barcode seqs with DToL query barcode seq
                        unix('cat *fas >> ' + specimen_ID + '_query.fasta', shell=True)
                        #Remove duplicate sequences from the fasta file
                        # This rebinds the name f; the enclosing loop keeps
                        # iterating over the csv file object it already holds.
                        with open(specimen_ID + '_query.fasta') as f, open(specimen_ID + '_query.fa', 'w') as outF:
                            bold_id_dict = {}
                            for record in SeqIO.parse(f, 'fasta'):
                                ID = record.description
                                seq = str(record.seq)
                                # Keep the first sequence seen per header.
                                if ID not in bold_id_dict.keys():
                                    bold_id_dict[ID] = seq
                            for k, v in bold_id_dict.items():
                                # Only COI barcodes and the DToL query are kept.
                                if ('COI' in k) or ('DToL' in k):
                                    outF.write('>' + k + '\n' + v + '\n')
                        os.chdir('../../')
            elif flag == 'B':
                with open('barphy_results.csv', 'a+') as outBar:
                    outBar.write(specimen_ID + ',' + expected_sp + ',' + result_sp + '\n')
                with open('queries/' + specimen_ID + '_' + expected_sp + '.fasta', 'w') as outF_B:
                    outF_B.write('>' + expected_sp + '|' + specimen_ID + '\n' + DNA + '\n')
def tree_build(query):
    '''
    Create multiple sequence alignments from the barcode fasta files in
    directory *query* using mafft, infer a tree for each with IQTree
    (GTR+G), midpoint-root every ``*.treefile`` and render each ``*.rooted``
    tree to ``<query basename>_tree.pdf``. Tip labels containing "query" or
    "DToL" are drawn in red. The original working directory is restored at
    the end.
    '''
    start_dir = os.getcwd()
    os.chdir(query)

    #Align using mafft and infer gene tree with IQTree
    for fasta_name in glob.glob("*fa"):
        # Replace spaces with underscores in place before aligning.
        unix('sed -i "s/ /_/g" ' + fasta_name, shell=True)
        print('\n')
        print('Constructing MSA and phylogenetic tree...')
        print('(If this step fails, see log files for what went wrong)')
        print('\n')
        alignment = fasta_name.split('.')[0] + '.mft'
        unix('mafft --quiet ' + fasta_name + ' > ' + alignment, shell=True)
        unix('iqtree -quiet -m GTR+G -s ' + alignment, shell=True)

    #Midpoint rooting to root the gene tree
    for tree_file in glob.glob("*.treefile"):
        midpoint_root(tree_file)

    #Create pdf file with tree image
    for rooted_file in glob.glob("*.rooted"):
        query_id = query.split('/')[-1]
        tree = toytree.tree(rooted_file)
        node_count = tree.nnodes
        tip_colors = []
        for tip in tree.get_tip_labels():
            highlighted = ("query" in tip) or ("DToL" in tip)
            tip_colors.append("#de2d26" if highlighted else "#000000")

        # Only the canvas height depends on the size of the tree.
        if node_count < 80:
            canvas_height = 1000
        elif node_count < 600:
            canvas_height = 5000
        else:
            canvas_height = 8000

        canvas, axes, mark = tree.draw(
            tip_labels_align=True,
            tip_labels_colors=tip_colors,
            width=1000,
            height=canvas_height,
            tip_labels_style={"font-size": "15px"})
        toyplot.pdf.render(canvas, query_id + "_tree.pdf")

    os.chdir(start_dir)
def hostisup():
    """Return True when a single quiet ping to host ``skynet`` exits with 0."""
    exit_status, _output = unix('ping -c1 -t1 -q skynet 2>/dev/null 1>&2')
    return exit_status == 0
if q.qsize() == (bp_end - bp_start + 1): outF.write(famID + "\t" + geneID + "\t" + readID + "\n") while q.empty() == False: outF1.write(q.get()) with open(my_file) as f: for line in f: lines = line.split('\t') cov = lines[-1].strip() nuc_pos = lines[1] if (int(nuc_pos) >= bp_start_extended) and ( int(nuc_pos) <= bp_end_extended): if int(cov) == 0: covered = 'FALSE' break else: bp_ext_len += 1 if bp_ext_len == (bp_end_extended - bp_start_extended + 1): outF_01.write(famID + "\t" + geneID + "\t" + readID + "\n") unix("find -iname '*breakpoint_cov.txt' -type f -empty -delete", shell=True) os.chdir("../../../") # if covered == 'TRUE': # outF.write(famID + "\t" + geneID + "\t" + readID + "\n") # os.chdir("../../../")
def tearDown(self):
    """Remove the files this test created on the server ``skynet``."""
    for leftover in ("/tmp/spam2.log", "/tmp/fred"):
        unix("ssh skynet rm -rf " + leftover)
parse.add_argument("--infile", type=str, help="input file to run reciprocal BLAST on", required=True) parse.add_argument("--outfile", type=str, help="output file name from reciprocal BLAST run", required=True) args = parse.parse_args() #Run reciprocal blast print('Running reciprocal BLASTx search...') unix( 'blastx -query ' + args.infile + ' -db ../../../raw/hbx_data/homeobox -evalue 1e-5 -num_threads 4 -seg yes -max_target_seqs 1 -outfmt "6 qseqid sseqid evalue pident bitscore qstart qend qlen sstart send slen" -out ' + args.outfile, shell=True) #Parse reciprocal BLASTx output print('\n') print('Parsing reciprocal BLAST output...') sp_assem_list = {} with open(args.outfile) as f: for line in f: sp = line.split('.')[0] sp_assem = line.split('|')[0] if sp_assem not in sp_assem_list: sp_assem_list[sp_assem] = sp for assemb, species in sp_assem_list.items():
# Usage python3 recip_blast.py --gene <gene name> parse = argparse.ArgumentParser() parse.add_argument("--gene", type=str, help="name of homeobox gene to obtain results for", required=True) args = parse.parse_args() #Run reciprocal blast print('Running reciprocal BLASTx search...') unix( 'blastx -query genome_' + args.gene + '_recipBlast.fasta -db ../../../raw/hbx_data/family_data/' + args.gene + ' -evalue 1e-5 -num_threads 5 -seg yes -max_target_seqs 1 -outfmt "6 qseqid sseqid evalue pident bitscore qstart qend qlen sstart send slen" -out recipBlast_' + args.gene + '.fa', shell=True) #Parse reciprocal BLASTx output print('\n') print('Parsing reciprocal BLAST output...') sp_list = [] with open('recipBlast_' + args.gene + '.fa') as f: for line in f: sp = line.split('|')[0] if sp not in sp_list: sp_list.append(sp) os.makedirs('species_hbx', exist_ok=True) for species in sp_list: