def __main__():
    """Copy every FASTQ read that contains a target sequence on either strand.

    Command-line arguments (taken from ``sys.argv``):
        1. path of the input FASTQ file
        2. the sequence to search for
        3. directory that receives the output file

    The output file is ``<output_dir>/<name>_greped.fq`` where ``<name>`` is
    the input basename up to its first ``.``.  Reads that contain the target
    as given are copied unchanged.  Reads that only contain its reverse
    complement are written reverse-complemented, with ``|reverse`` appended
    to the read id.  The number of written reads is printed at the end.

    Exits with status 1 when fewer than three arguments are supplied.
    """
    try:
        input_seqs_ul = sys.argv[1]
        target_seq = sys.argv[2]
        output_dir = sys.argv[3]
    except IndexError:
        # sys.exit(<str>) prints the text to stderr and exits with status 1.
        sys.exit('usage: <script> input.fastq target_seq output_dir')

    outfile_name = os.path.basename(input_seqs_ul).split('.')[0] + '_greped.fq'
    output_fq = output_dir + '/' + outfile_name

    # The target never changes, so reverse-complement it once instead of once
    # per read.
    rc_target_seq = reverse_com_seq(target_seq)

    greped_num = 0
    # `with` guarantees the output is flushed and closed even if the parser
    # raises part-way through the input.
    with open(output_fq, 'w') as outf:
        for seq_id, seq, qual in MinimalFastqParser(input_seqs_ul, strict=False):
            if target_seq in seq:
                outf.write('@%s\n%s\n+\n%s\n' % (seq_id, seq, qual))
                greped_num += 1
            elif rc_target_seq in seq:
                # The read is flipped to the target's orientation; the quality
                # string has to be reversed as well so that every score stays
                # attached to its own base.
                outf.write('@%s\n%s\n+\n%s\n'
                           % (seq_id + '|reverse', reverse_com_seq(seq), qual[::-1]))
                greped_num += 1

    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print('greped_num is %s' % greped_num)
def __main__():
    """Copy FASTQ reads that start with a known barcode (on either strand).

    Command-line arguments (taken from ``sys.argv``):
        1. path of the input FASTQ file
        2. path handed to ``get_barcodes`` to obtain the barcodes
        3. output directory

    Two files are written into the output directory:

    * ``greped.fq`` - reads starting with a barcode are copied unchanged;
      reads ending with the reverse complement of a barcode are written
      reverse-complemented, with the quality string reversed and
      ``|reverse`` appended to the read id.
    * ``grep_seqs_num_log.txt`` - total and matched read counts followed by
      one ``barcode<TAB>count<TAB>library`` line per barcode, where the
      library name is the input basename up to its first ``.``.

    Barcodes are tried in the order ``get_barcodes`` returns them and the
    first one that matches a read wins.  Calls ``usage()`` and exits with
    status 1 when fewer than three arguments are supplied.
    """
    try:
        input_seqs_ul = sys.argv[1]
        mapping_ul = sys.argv[2]
        output_dir = sys.argv[3]
    except IndexError:
        usage()
        sys.exit(1)

    barcodes = get_barcodes(mapping_ul)
    # Reverse-complement every barcode once up front rather than once per
    # read and barcode inside the hot loop.
    barcode_pairs = [(barcode, reverse_com_seq(barcode)) for barcode in barcodes]

    output_fq = output_dir + '/greped.fq'
    grep_seqs_log = output_dir + '/grep_seqs_num_log.txt'

    # Per-barcode counters for the matched reads.
    grep_seqs_dic = {}
    for barcode in barcodes:
        grep_seqs_dic[barcode] = 0
    matched_seqs = 0
    total_seqs_num = 0

    # `with` closes both files even if parsing fails part-way through.
    with open(output_fq, 'w') as outf, open(grep_seqs_log, 'w') as grep_log:
        for seq_id, seq, qual in MinimalFastqParser(input_seqs_ul, strict=False):
            total_seqs_num += 1
            for barcode, rc_barcode in barcode_pairs:
                if seq.startswith(barcode):
                    outf.write('@%s\n%s\n+\n%s\n' % (seq_id, seq, qual))
                elif seq.endswith(rc_barcode):
                    new_seq = reverse_com_seq(seq)
                    new_qual = reverse_seq(qual)
                    outf.write('@%s\n%s\n+\n%s\n'
                               % (seq_id + '|reverse', new_seq, new_qual))
                else:
                    continue
                # Reached only when one of the two branches above matched.
                grep_seqs_dic[barcode] += 1
                matched_seqs += 1
                break

        grep_log.write("There are %s seqs totally!\n" % total_seqs_num)
        grep_log.write("Matched seqs: %s !\n" % matched_seqs)
        lib_name = os.path.basename(input_seqs_ul).split('.')[0]
        for key in grep_seqs_dic:
            grep_log.write('%s\t%s\t%s\n' % (key, grep_seqs_dic[key], lib_name))
def __main__():
    """Write reads that start with a known primer prefix as FASTA records.

    Command-line arguments (taken from ``sys.argv``):
        1. path of the input FASTQ file
        2. path handed to ``primer_list`` to obtain the primer entries
        3. output directory

    Each entry returned by ``primer_list`` is indexed positionally: item 0 is
    used as the label/counter key and item 1 as the prefix a read must start
    with.

    Two files are written into the output directory:

    * ``greped.fq`` - despite the extension, records are FASTA
      (``>label_N read_id`` followed by the sequence), where ``N`` is the
      number of reads matched before this one.
    * ``grep_seqs_num_log.txt`` - total, matched and unmatched read counts
      followed by one ``label<TAB>count<TAB>library`` line per label, where
      the library name is the input basename up to its first ``.``.

    Entries are tried in order and the first match wins.  Calls ``usage()``
    and exits with status 1 when fewer than three arguments are supplied.
    """
    try:
        input_seqs_ul = sys.argv[1]
        primer_ul = sys.argv[2]
        output_dir = sys.argv[3]
    except IndexError:
        usage()
        sys.exit(1)

    primers = primer_list(primer_ul)
    output_fq = output_dir + '/greped.fq'
    grep_seqs_log = output_dir + '/grep_seqs_num_log.txt'

    # Per-label counters for the matched reads.
    grep_seqs_dic = {}
    for primer in primers:
        grep_seqs_dic[primer[0]] = 0
    matched_seqs = 0
    total_seqs_num = 0

    # `with` closes both files even if parsing fails part-way through.
    with open(output_fq, 'w') as outf, open(grep_seqs_log, 'w') as grep_log:
        for seq_id, seq, qual in MinimalFastqParser(input_seqs_ul, strict=False):
            total_seqs_num += 1
            for primer in primers:
                label = primer[0]
                barcode = primer[1]
                if not seq.startswith(barcode):
                    continue
                # The running match count (before increment) makes every
                # FASTA header unique.
                header = label + '_' + str(matched_seqs) + ' ' + seq_id
                outf.write('>%s\n%s\n' % (header, seq))
                grep_seqs_dic[label] += 1
                matched_seqs += 1
                break

        grep_log.write("There are %s seqs totally!\n" % total_seqs_num)
        grep_log.write("Matched seqs: %s !\n" % matched_seqs)
        grep_log.write("No matched seqs: %s !\n" % (total_seqs_num - matched_seqs))
        lib_name = os.path.basename(input_seqs_ul).split('.')[0]
        for key in grep_seqs_dic:
            grep_log.write('%s\t%s\t%s\n' % (key, grep_seqs_dic[key], lib_name))
def __main__():
    """Rewrite the read ids of a FASTQ file in place, keeping a ``.bak`` copy.

    Command-line argument (taken from ``sys.argv``):
        1. path of the FASTQ file to rewrite

    Every read id is replaced by its second whitespace-separated field, e.g.
    the header

        /share/.../DRR000534.sra.12 HWI-EAS370_34:2:1:0:82 length=76

    becomes ``HWI-EAS370_34:2:1:0:82``.  ``/1`` or ``/2`` is appended when
    the file name contains ``_1.fastq`` or ``_2.fastq`` respectively.

    The result is first written to ``<input>.tmp`` next to the input; the
    original is then renamed to ``<input>.bak`` and the temporary file takes
    its place.

    Exits with status 1 when no argument is supplied.  Raises ``IndexError``
    if a read id has no second whitespace-separated field; in that case the
    original file is left untouched.
    """
    try:
        input_seqs_ul = sys.argv[1]
    except IndexError:
        # sys.exit(<str>) prints the text to stderr and exits with status 1.
        sys.exit('usage: <script> input.fastq')

    infile_name = os.path.basename(input_seqs_ul)
    # os.path.join keeps the temporary file beside the input even when the
    # input is a bare file name (dirname == ''); plain '/' concatenation
    # would point at the filesystem root in that case.
    output_fq = os.path.join(os.path.dirname(input_seqs_ul), infile_name + '.tmp')

    # The mate suffix depends only on the file name, so decide it once.
    if '_1.fastq' in infile_name:
        mate_suffix = '/1'
    elif '_2.fastq' in infile_name:
        mate_suffix = '/2'
    else:
        mate_suffix = ''

    field_sep = re.compile(r'\s+')

    # `with` makes sure the temporary file is complete and closed before it
    # is moved over the original.
    with open(output_fq, 'w') as outf:
        for seq_id, seq, qual in MinimalFastqParser(input_seqs_ul, strict=False):
            seq_id_spot_info = field_sep.split(seq_id)[1]
            outf.write('@%s\n%s\n+\n%s\n'
                       % (seq_id_spot_info + mate_suffix, seq, qual))

    # os.rename instead of shelling out to `mv`: paths containing spaces or
    # shell metacharacters work, and a failed backup raises before the
    # original can be overwritten.  All three paths share one directory, so
    # no cross-filesystem move is involved.
    os.rename(input_seqs_ul, input_seqs_ul + '.bak')
    os.rename(output_fq, input_seqs_ul)
def __main__():
    """Copy reads carrying a known forward primer and trim the reverse primer.

    Command-line arguments (taken from ``sys.argv``):
        1. path of the input FASTQ file
        2. path handed to ``primer_list`` to obtain the primer entries
        3. output directory

    Each entry returned by ``primer_list`` is indexed positionally: item 0 is
    used as the label/counter key, item 1 as the forward primer and item 2 as
    the reverse primer.  Only the reverse primer of the FIRST entry is used,
    for every read.  ``change_primer_seqs`` turns it into the regular
    expression that is searched for in the reads (per the original author's
    note, e.g. GGACTACCVGGGTATCTAAT, which contains degenerate bases, becomes
    ATTAGATACCC[CTG]GGTAGTCC).

    For every read the entries are tried in order; the first match wins:

    * read starts with the forward primer - everything from the first match
      of the reverse-primer pattern onwards is cut off, and the quality
      string is cut to the same length;
    * the reverse complement of the forward primer occurs in the last 20
      bases - the read is rebuilt in forward orientation and written with
      ``|reverse`` appended to its id.

    Two files are written into the output directory: ``greped.fq`` with the
    selected reads and ``grep_seqs_num_log.txt`` with total / matched /
    unmatched counts, the reverse primer and its pattern, and one
    ``label<TAB>count<TAB>library`` line per label.

    Calls ``usage()`` and exits with status 1 when fewer than three arguments
    are supplied.  Raises ``IndexError`` when ``primer_list`` returns nothing.
    """
    try:
        input_seqs_ul = sys.argv[1]
        primer_ul = sys.argv[2]
        output_dir = sys.argv[3]
    except IndexError:
        usage()
        sys.exit(1)

    primers = primer_list(primer_ul)
    output_fq = output_dir + '/greped.fq'
    grep_seqs_log = output_dir + '/grep_seqs_num_log.txt'

    # Width of the 3' window searched for the reverse-complemented forward
    # primer.
    tail_len = 20

    # `with` closes both files even if parsing fails part-way through.
    with open(output_fq, 'w') as outf, open(grep_seqs_log, 'w') as grep_log:
        # Per-label counters for the matched reads.
        grep_seqs_dic = {}
        for primer in primers:
            grep_seqs_dic[primer[0]] = 0
        matched_seqs = 0
        total_seqs_num = 0

        rprimer = primers[0][2]
        rprimer_len = len(rprimer)
        rc_rprimer = change_primer_seqs(rprimer)
        pattern = re.compile(rc_rprimer)

        # Reverse-complement every forward primer once instead of up to twice
        # per read and primer inside the loop.
        primer_forms = [(primer[0], primer[1], reverse_com_seq(primer[1]))
                        for primer in primers]

        for seq_id, seq, qual in MinimalFastqParser(input_seqs_ul, strict=False):
            total_seqs_num += 1
            rpart_seq = seq[-tail_len:]
            for label, fprimer, rc_fprimer in primer_forms:
                if seq.startswith(fprimer):
                    # split() returns the whole read as element 0 when the
                    # pattern does not occur, so no fallback path is needed.
                    new_seq = pattern.split(seq)[0]
                    # Trim the qualities with the bases; otherwise the record
                    # has more quality values than bases and is not valid
                    # FASTQ.
                    new_qual = qual[:len(new_seq)]
                    outf.write('@%s\n%s\n+\n%s\n' % (seq_id, new_seq, new_qual))
                elif rc_fprimer in rpart_seq:
                    # Drop the leading len(reverse primer) bases and everything
                    # from the reverse-complemented forward primer onwards,
                    # flip the rest and put the forward primer back in front.
                    fpart_seq = seq[rprimer_len:-tail_len]
                    new_rpart = rpart_seq[:rpart_seq.index(rc_fprimer)]
                    new_seq = fprimer + reverse_com_seq(fpart_seq + new_rpart)
                    # NOTE(review): this only makes the lengths agree - the
                    # qualities are the leading slice of the original string,
                    # neither reversed nor shifted to the retained bases.
                    # Confirm whether downstream steps rely on them.
                    new_qual = qual[:len(new_seq)]
                    outf.write('@%s\n%s\n+\n%s\n'
                               % (seq_id + '|reverse', new_seq, new_qual))
                else:
                    continue
                # Reached only when one of the two branches above matched.
                grep_seqs_dic[label] += 1
                matched_seqs += 1
                break

        grep_log.write("There are %s seqs totally!\n" % total_seqs_num)
        grep_log.write("Matched seqs: %s !\n" % matched_seqs)
        grep_log.write("No matched seqs: %s !\n" % (total_seqs_num - matched_seqs))
        grep_log.write("Reverse primer is: %s !\n" % rprimer)
        grep_log.write("RC. reverse primer is: %s !\n" % rc_rprimer)
        lib_name = os.path.basename(input_seqs_ul).split('.')[0]
        for key in grep_seqs_dic:
            grep_log.write('%s\t%s\t%s\n' % (key, grep_seqs_dic[key], lib_name))