def get_otu_by_rdp(workdir, input_fa, genus_loc='.'):
    """Pick OTUs for ``input_fa`` with the QIIME scripts and analyse them.

    The function changes into ``workdir`` while it runs: ``pick_otus.py``
    is told to write into ``./`` and the derived OTU file names are bare
    file names, so both are resolved against the current directory. The
    previous working directory is restored on the way out, even when one
    of the steps raises.

    Args:
        workdir: Directory created if necessary and used as the working
            directory for the external commands.
        input_fa: Path of the input FASTA file. A relative path is
            resolved against ``workdir`` because it is only used after
            the directory change.
        genus_loc: ``'.'`` (default) reads the genus table from
            ``blast/<name>_blast_result.tsv`` next to ``input_fa``; any
            other value reads ``<name>_genus_result.tsv`` next to
            ``input_fa``.
    """
    safe_makedir(workdir)
    previous_dir = os.getcwd()
    os.chdir(workdir)
    try:
        fasta_dir = os.path.dirname(input_fa)
        fasta_base_name = os.path.basename(input_fa)
        if genus_loc == '.':
            genus_result = os.path.join(
                fasta_dir, 'blast',
                fasta_base_name.replace('.fa', '_blast_result.tsv'))
        else:
            genus_result = os.path.join(
                fasta_dir,
                fasta_base_name.replace('.fa', '_genus_result.tsv'))

        # Bare file names: they live in the working directory.
        otu_result = fasta_base_name.replace('.fa', '_otus.txt')
        analysis_result = fasta_base_name.replace('.fa', '_otus_result.tsv')

        # The scripts are looked up on PATH (they used to be called from
        # /usr/lib/qiime/bin/).
        commands = (
            'pick_otus.py -i %s -m blast -o ./ -b %s' % (input_fa, db_16s),
            'pick_rep_set.py -i %s -f %s -o rep.fna' % (otu_result, input_fa),
        )
        for cmd in commands:
            logger.info(cmd)
        for cmd in commands:
            status = os.system(cmd)
            if status != 0:
                # Best effort as before, but do not hide the failure.
                logger.warning(
                    'Command exited with status %s: %s' % (status, cmd))

        analysis_otu_info(genus_result, otu_result, analysis_result)
    finally:
        # Never leave the process in workdir, even after an exception.
        os.chdir(previous_dir)
def get_fasta_seq(user, excel, input_dir_raw, output_dir, qual_length, seq_start, seq_end):
    """Collect the successful reads of one sequencing batch into a FASTA file.

    The batch is read from ``<base_dir>/raw_data/<user>/<input_dir_raw>``
    and the results are written to ``<base_dir>/data/<user>/<input_dir_raw>``.
    A read is kept when its length in the spreadsheet is greater than
    ``qual_length`` and its result column contains the success marker.

    Args:
        user: User sub-directory name below ``raw_data`` and ``data``.
        excel: File name of the spreadsheet inside the input directory.
        input_dir_raw: Name of the batch directory; also used as the stem
            of the output file names.
        output_dir: Not used by this function; kept so existing callers
            keep working.
        qual_length: Minimum (exclusive) fragment length of a kept read.
        seq_start: Start index of the slice taken from every sequence.
        seq_end: End index of the slice taken from every sequence.

    Returns:
        Tuple ``(fasta_file, fasta_info_file)`` with the paths of the
        written FASTA file and of the per-read status table.

    Raises:
        SystemExit: via ``exit()`` when the input directory is missing or
            when no read passed the filter.
    """
    input_dir = os.path.join(base_dir, 'raw_data', user, input_dir_raw)
    if not os.path.exists(input_dir):
        logger.warn('We can not find sequences dir %s' % input_dir)
        exit()

    excel_file = os.path.join(input_dir, excel)
    seqs_info = ReadFiles.read_excel_onesheet(
        excel_file, sheet_name=' sheet1', same_line_debug=False)

    output_dir1 = os.path.join(base_dir, 'data', user)
    output_dir2 = os.path.join(base_dir, 'data', user, input_dir_raw)
    safe_makedir(output_dir1)
    safe_makedir(output_dir2)
    fasta_file = os.path.join(output_dir2, '%s.fa' % input_dir_raw)
    fasta_info_file = os.path.join(
        output_dir2, '%s_seq_info.tsv' % input_dir_raw)

    sucessful_seqs = 0
    # Only used for membership tests, so a set is sufficient.
    seqs_name = set()
    header = ['#seq_name', 'status', 'result']
    # ``with`` closes the table even if a row is malformed.
    with open(fasta_info_file, 'w') as info_fp:
        info_fp.write('%s\n' % '\t'.join(header))
        for k1 in seqs_info:
            row = seqs_info[k1]
            # Spreadsheet column headers: sample name, fragment size,
            # reaction result.
            seq_name = row['样品名称']
            seq_length = row['片段大小']
            seq_status = row['反应结果']
            result = 0
            # '成功' is the "success" marker in the result column.
            if int(seq_length) > qual_length and '成功' in seq_status:
                sucessful_seqs += 1
                seqs_name.add(seq_name)
                result = 1
            info = [seq_name, seq_status, str(result)]
            info_fp.write('%s\n' % '\t'.join(info))

    with open(fasta_file, 'w') as result_fp:
        for one_file in os.listdir(input_dir):
            if not one_file.endswith('seq'):
                continue
            # The sample name is the part of the file name before the
            # first underscore.
            seq_name2 = one_file.split('_')[0]
            if seq_name2 not in seqs_name:
                continue
            with open(os.path.join(input_dir, one_file)) as data1:
                seq = data1.read().strip()
            seq2 = seq[seq_start:seq_end]
            result_fp.write('>%s\n' % seq_name2)
            result_fp.write('%s\n' % seq2)

    logger.info("%s has %s successful seqs" % (input_dir, sucessful_seqs))
    if sucessful_seqs == 0:
        logger.info('There is no successful seqs,please check it!')
        exit()
    return fasta_file, fasta_info_file
def run_main(excel, input_dir, output_dir, qual_length, seq_start, seq_end, filter_identity, user, rdp):
    """Run the whole analysis for one batch and refresh the user's totals.

    Steps, in order: set up logging, build the batch FASTA file
    (``get_fasta_seq``), run ``blast_input`` and
    ``analysis_blast_result_xml`` on it, run ``get_taxonomy_info_by_rdp``,
    pick OTUs for the batch, merge the BLAST and RDP tables, rebuild the
    per-user ``Total`` files and, unless ``rdp`` is the string ``'False'``,
    pick OTUs for the merged ``total.fa`` as well.

    Args:
        excel, input_dir, output_dir, qual_length, seq_start, seq_end, user:
            Passed through to ``get_fasta_seq``.
        filter_identity: Passed through to ``analysis_blast_result_xml``.
        rdp: The string ``'False'`` skips the OTU analysis of the merged
            sequences; every other value runs it.
    """
    base_logger_result = create_base_logger()
    setup_local_logging(config)

    fasta_file, fasta_info_file = get_fasta_seq(
        user, excel, input_dir, output_dir, qual_length, seq_start, seq_end)
    fasta_dir, fasta_base_name = os.path.split(fasta_file)

    def renamed(suffix):
        # File name of the batch FASTA with '.fa' swapped for ``suffix``.
        return fasta_base_name.replace('.fa', suffix)

    blast_dir = os.path.join(fasta_dir, 'blast')
    safe_makedir(blast_dir)
    blast_output = os.path.join(blast_dir, renamed('_blast_result.xml'))
    analysis_result = os.path.join(blast_dir, renamed('_blast_result.tsv'))
    fail_fasta_file = fasta_file.replace('.fa', '_fail_blast.fa')
    rdp_result_file = os.path.join(
        fasta_dir, 'rdp_assigned_taxonomy', renamed('_tax_assignments.txt'))
    blast_rdp_file = fasta_file.replace('.fa', '_blast_rdp_result.tsv')

    # Per-batch analysis.
    blast_input(fasta_file, blast_output)
    analysis_blast_result_xml(
        blast_output, analysis_result, fasta_file, fail_fasta_file,
        filter_identity)
    get_taxonomy_info_by_rdp(fasta_file)
    get_otu_by_rdp(os.path.join(os.path.dirname(fasta_file), 'otu'), fasta_file)
    merge_blast_rdp_file(analysis_result, rdp_result_file, blast_rdp_file)

    # Per-user totals over all batches.
    user_data_dir = os.path.join(data_dir, user)
    merge_result(user_data_dir)

    if rdp != 'False':
        # '../total.fa' is relative to the Total/otu working directory.
        total_otu_dir = os.path.join(user_data_dir, 'Total', 'otu')
        get_otu_by_rdp(total_otu_dir, '../total.fa', genus_loc='Y')
    else:
        print('We will skip total seqs otu analysis!')

    logger.warn('Finish analysis! Thanks for using anBank')
def run_split(fasta, otu_file, outdir):
    """Write one FASTA file per OTU listed in ``otu_file``.

    Every non-blank line of ``otu_file`` is split on whitespace: the first
    field is the OTU identifier, the remaining fields are the member ids.
    For each line ``extract_seqs_by_sample_id.py`` is run to write
    ``<outdir>/<accession>.fa``.

    Args:
        fasta: FASTA file handed to the extraction script as input.
        otu_file: Path of the OTU table to read.
        outdir: Output directory; created if necessary.
    """
    safe_makedir(outdir)
    command_template = "extract_seqs_by_sample_id.py -i %s -o %s/%s.fa -s %s "

    with open(otu_file) as otu_handle:
        for raw_line in otu_handle:
            stripped = raw_line.strip()
            if stripped == '':
                continue
            fields = stripped.split()
            otu_id, member_ids = fields[0], fields[1:]
            # 4th '|' field -> part after the first '_' -> part before
            # the first '.'.
            accession = otu_id.split('|')[3].split('_')[1].split('.')[0]
            command = command_template % (
                fasta, outdir, accession, ','.join(member_ids))
            print(command)
            logger.info(command)
            os.system(command)
def merge_result(data_dir):
    """Merge the per-batch results below ``data_dir`` into ``data_dir/Total``.

    For every entry ``fn`` of ``data_dir`` whose name does not start with
    ``Total`` the following files are appended when they exist:

    * ``fn/fn.fa``                   -> ``Total/total.fa``
    * ``fn/fn_seq_info.tsv``         -> ``Total/total_seq_info.tsv``
    * ``fn/fn_blast_rdp_result.tsv`` -> ``Total/total_genus_result.tsv``

    Blank lines are dropped and every copied line is written stripped.

    Args:
        data_dir: Directory holding one sub-directory per batch.

    Raises:
        SystemExit: via ``exit()`` when a batch FASTA file contains the
            same sequence name twice.
    """
    total_result = '%s/Total' % data_dir
    safe_makedir(total_result)
    total_fa = os.path.join(total_result, 'total.fa')
    total_seq_info = os.path.join(total_result, 'total_seq_info.tsv')
    total_genus_result = os.path.join(total_result, 'total_genus_result.tsv')

    # ``with`` closes all three outputs, also on the exit() path below.
    with open(total_fa, 'w') as fa_out, \
            open(total_genus_result, 'w') as genus_out, \
            open(total_seq_info, 'w') as info_out:
        info_out.write('#seq_name\tstatus\tresult\n')

        for fn in os.listdir(data_dir):
            if fn.startswith('Total'):
                continue
            fn_fa = os.path.join(data_dir, fn, '%s.fa' % fn)
            fn_seq = os.path.join(data_dir, fn, '%s_seq_info.tsv' % fn)
            blast_analysis_fp = os.path.join(
                data_dir, fn, '%s_blast_rdp_result.tsv' % fn)

            # NOTE(review): duplicates are only detected inside one batch,
            # not across batches - confirm that this is intended.
            seen_names = set()
            if os.path.exists(fn_fa):
                with open(fn_fa) as fa_in:
                    for each_line in fa_in:
                        stripped = each_line.strip()
                        if stripped == '':
                            continue
                        if each_line.startswith('>'):
                            # Compare the name as it is written out, so a
                            # last line without a newline cannot slip
                            # through the duplicate check.
                            seq_name = stripped.replace('>', '')
                            if seq_name in seen_names:
                                print('You have same name %s please check it!'
                                      % seq_name)
                                exit()
                            seen_names.add(seq_name)
                        fa_out.write('%s\n' % stripped)

            if os.path.exists(fn_seq):
                with open(fn_seq) as info_in:
                    for each_line in info_in:
                        stripped = each_line.strip()
                        # The header was written once above.
                        if stripped == '' or each_line.startswith('#'):
                            continue
                        info_out.write('%s\n' % stripped)

            if os.path.exists(blast_analysis_fp):
                # NOTE(review): the flag is reset for every batch, so the
                # merged table gets one '#' header line per batch -
                # confirm that downstream readers expect this.
                header_written = False
                with open(blast_analysis_fp) as genus_in:
                    for each_line in genus_in:
                        stripped = each_line.strip()
                        if stripped == '':
                            continue
                        if each_line.startswith('#'):
                            # Keep only the first '#' line of the file.
                            if header_written:
                                continue
                            header_written = True
                        genus_out.write('%s\n' % stripped)
def _create_log_handler(config, add_hostname=False, direct_hostname=False):
    """Build the nested logbook handler setup described by ``config``.

    Handlers are stacked in this order: a ``NullHandler``; when
    ``get_log_dir(config)`` returns a directory, three file handlers
    (``<LOG_NAME>.log``, ``<LOG_NAME>-debug.log`` and
    ``<LOG_NAME>-commands.log``); a stdout stream handler; a mail handler
    when an e-mail address is configured; and a stderr stream handler.

    Args:
        config: Mapping read for ``include_time``, ``email`` and
            ``resources.log.email``; also handed to ``get_log_dir``.
        add_hostname: Prefix messages with ``record.extra[source]``.
        direct_hostname: Prefix messages with this machine's host name.

    Returns:
        A ``CloseableNestedSetup`` wrapping the handler list.
    """
    logbook.set_datetime_format("local")
    handlers = [logbook.NullHandler()]

    # Assemble the shared message format from its optional prefixes.
    pieces = []
    if config.get("include_time", True):
        pieces.append("[{record.time:%Y-%m-%dT%H:%M}] ")
    if add_hostname:
        pieces.append("{record.extra[source]}: ")
    if direct_hostname:
        pieces.append("%s: " % socket.gethostname())
    pieces.append("{record.message}")
    format_str = "".join(pieces)

    log_dir = get_log_dir(config)
    if log_dir:
        if not os.path.exists(log_dir):
            filehandler.safe_makedir(log_dir)
            # Wait to propagate, otherwise see logging errors on
            # distributed filesystems.
            time.sleep(5)
        # (file name template, extra FileHandler keyword arguments)
        file_specs = (
            ("%s.log", dict(level="INFO", filter=_not_cl)),
            ("%s-debug.log", dict(level="DEBUG", bubble=True, filter=_not_cl)),
            ("%s-commands.log", dict(level="DEBUG", filter=_is_cl)),
        )
        for name_template, options in file_specs:
            log_path = os.path.join(log_dir, name_template % LOG_NAME)
            handlers.append(
                logbook.FileHandler(log_path, format_string=format_str,
                                    **options))

    handlers.append(
        logbook.StreamHandler(sys.stdout, format_string="{record.message}",
                              level="DEBUG", filter=_is_stdout))

    # The top-level "email" key wins over resources -> log -> email.
    fallback_email = config.get("resources", {}).get("log", {}).get("email")
    email = config.get("email", fallback_email)
    if email:
        email_str = u'''Subject: [bcbio-nextgen] {record.extra[run]} \n\n {record.message}'''
        handlers.append(
            logbook.MailHandler(email, [email], format_string=email_str,
                                level='INFO', bubble=True))

    handlers.append(
        logbook.StreamHandler(sys.stderr, format_string=format_str,
                              bubble=True, filter=_not_cl))
    return CloseableNestedSetup(handlers)