예제 #1
0
def __main__():
    try:
        input_seqs_ul = sys.argv[1]
        target_seq = sys.argv[2]
        output_dir = sys.argv[3]
    except:
        sys.exit(1)

    outfile_name = os.path.basename(input_seqs_ul).split('.')[0] + '_greped.fq'
    output_fq = output_dir + '/' + outfile_name
    outf = open(output_fq, 'w')

    greped_num = 0
    for seq_id, seq, qual in MinimalFastqParser(input_seqs_ul, strict=False):
        if target_seq in seq:
            outf.write('@%s\n%s\n+\n%s\n' % (seq_id, seq, qual))
            greped_num += 1
        elif reverse_com_seq(target_seq) in seq:
            outf.write('@%s\n%s\n+\n%s\n' %
                       (seq_id + '|reverse', reverse_com_seq(seq), qual))
            greped_num += 1
        else:
            pass
    print 'greped_num is %s' % greped_num
    outf.close()
예제 #2
0
def __main__():
    """Extract the reads that carry one of the known barcodes.

    Command-line arguments (read from ``sys.argv``):
        1. path of the input FASTQ file
        2. path of the mapping file handed to ``get_barcodes``
        3. output directory

    A read that starts with a barcode is written unchanged to
    ``<output_dir>/greped.fq``.  A read that ends with
    ``reverse_com_seq(barcode)`` is written with its sequence passed through
    ``reverse_com_seq``, its quality passed through ``reverse_seq`` and
    ``|reverse`` appended to its id.  Each read is counted for the first
    barcode that matches only.

    The total read count, the matched read count and one
    ``barcode<TAB>count<TAB>library`` line per barcode are written to
    ``<output_dir>/grep_seqs_num_log.txt``; the library name is the input
    basename up to the first '.'.

    Calls ``usage()`` and exits with status 1 when fewer than three
    arguments are supplied.
    """
    try:
        input_seqs_ul = sys.argv[1]
        mapping_ul = sys.argv[2]
        output_dir = sys.argv[3]
    except IndexError:
        usage()
        sys.exit(1)

    barcodes = get_barcodes(mapping_ul)
    output_fq = output_dir + '/greped.fq'
    grep_seqs_log = output_dir + '/grep_seqs_num_log.txt'

    # Per-barcode hit counters, all starting at zero.
    grep_seqs_dic = {}
    for barcode in barcodes:
        grep_seqs_dic[barcode] = 0

    # The converted barcodes do not depend on the read, so build them once
    # instead of once per read/barcode pair.
    # NOTE(review): assumes reverse_com_seq is a pure function -- confirm.
    barcode_pairs = [(barcode, reverse_com_seq(barcode))
                     for barcode in barcodes]

    matched_seqs = 0
    total_seqs_num = 0

    # 'with' closes both files even when parsing or writing fails.
    with open(output_fq, 'w') as outf, open(grep_seqs_log, 'w') as grep_log:
        for seq_id, seq, qual in MinimalFastqParser(input_seqs_ul,
                                                    strict=False):
            total_seqs_num += 1
            for barcode, rc_barcode in barcode_pairs:
                if seq.startswith(barcode):
                    outf.write('@%s\n%s\n+\n%s\n' % (seq_id, seq, qual))
                elif seq.endswith(rc_barcode):
                    new_seq = reverse_com_seq(seq)
                    new_qual = reverse_seq(qual)
                    outf.write('@%s\n%s\n+\n%s\n' %
                               (seq_id + '|reverse', new_seq, new_qual))
                else:
                    # This barcode does not match; try the next one.
                    continue
                grep_seqs_dic[barcode] += 1
                matched_seqs += 1
                break  # first matching barcode wins

        grep_log.write("There are %s seqs totally!\n" % total_seqs_num)
        grep_log.write("Matched seqs: %s !\n" % matched_seqs)

        lib_name = os.path.basename(input_seqs_ul).split('.')[0]
        for key in grep_seqs_dic:
            grep_log.write('%s\t%s\t%s\n' %
                           (key, grep_seqs_dic[key], lib_name))
def __main__():
    """Extract the reads that start with a known primer, as FASTA records.

    Command-line arguments (read from ``sys.argv``):
        1. path of the input FASTQ file
        2. path of the primer file handed to ``primer_list``
        3. output directory

    Each entry returned by ``primer_list`` is indexed here as
    ``entry[0]`` (used as the counter key and header prefix) and
    ``entry[1]`` (the sequence a read must start with).

    A matching read is written to ``<output_dir>/greped.fq`` as
    ``><entry[0]>_<running match index> <read id>`` followed by the
    sequence; qualities are not written.  Each read is counted for the first
    matching entry only.

    The total, matched and unmatched read counts and one
    ``key<TAB>count<TAB>library`` line per entry are written to
    ``<output_dir>/grep_seqs_num_log.txt``; the library name is the input
    basename up to the first '.'.

    Calls ``usage()`` and exits with status 1 when fewer than three
    arguments are supplied.
    """
    try:
        input_seqs_ul = sys.argv[1]
        primer_ul = sys.argv[2]
        output_dir = sys.argv[3]
    except IndexError:
        usage()
        sys.exit(1)

    primers = primer_list(primer_ul)
    output_fq = output_dir + '/greped.fq'
    grep_seqs_log = output_dir + '/grep_seqs_num_log.txt'

    # Per-primer hit counters, all starting at zero.
    grep_seqs_dic = {}
    for primer in primers:
        grep_seqs_dic[primer[0]] = 0

    matched_seqs = 0
    total_seqs_num = 0

    # 'with' closes both files even when parsing or writing fails.
    with open(output_fq, 'w') as outf, open(grep_seqs_log, 'w') as grep_log:
        for seq_id, seq, qual in MinimalFastqParser(input_seqs_ul,
                                                    strict=False):
            total_seqs_num += 1
            for primer in primers:
                if not seq.startswith(primer[1]):
                    continue
                # The header index is the number of matches written so far,
                # i.e. it is taken before the counter is incremented.
                header = primer[0] + '_' + str(matched_seqs) + ' ' + seq_id
                outf.write('>%s\n%s\n' % (header, seq))
                grep_seqs_dic[primer[0]] += 1
                matched_seqs += 1
                break  # first matching primer wins

        grep_log.write("There are %s seqs totally!\n" % total_seqs_num)
        grep_log.write("Matched seqs: %s !\n" % matched_seqs)
        grep_log.write("No matched seqs: %s !\n" %
                       (total_seqs_num - matched_seqs))

        lib_name = os.path.basename(input_seqs_ul).split('.')[0]
        for key in grep_seqs_dic:
            grep_log.write('%s\t%s\t%s\n' %
                           (key, grep_seqs_dic[key], lib_name))
예제 #4
0
def __main__():
    try:
        input_seqs_ul = sys.argv[1]
    except:
        sys.exit(1)

    infile_name = os.path.basename(input_seqs_ul)
    outfile_name = infile_name + '.tmp'
    output_fq = os.path.dirname(input_seqs_ul) + '/' + outfile_name
    outf = open(output_fq, 'w')

    for seq_id, seq, qual in MinimalFastqParser(input_seqs_ul, strict=False):
        # @/share/bioCloud/cloud/rawdata/download/PRJDA50447/DRX000300/DRR000534.sra.12 HWI-EAS370_34:2:1:0:82 length=76
        seq_id_spot_info = re.split('\s+', seq_id)[1]
        if '_1.fastq' in infile_name:
            new_seq_id = seq_id_spot_info + '/1'
        elif '_2.fastq' in infile_name:
            new_seq_id = seq_id_spot_info + '/2'
        else:
            new_seq_id = seq_id_spot_info
        outf.write('@%s\n%s\n+\n%s\n' % (new_seq_id, seq, qual))
    outf.close()
    os.system('mv %s %s' % (input_seqs_ul, input_seqs_ul + '.bak'))
    os.system('mv %s %s' % (output_fq, input_seqs_ul))
def __main__():
    """Extract primer-matching reads and cut them at the reverse primer.

    Command-line arguments (read from ``sys.argv``):
        1. path of the input FASTQ file
        2. path of the primer file handed to ``primer_list``
        3. output directory

    Each entry returned by ``primer_list`` is indexed here as ``entry[0]``
    (counter key) and ``entry[1]`` (sequence matched against the read).
    ``entry[2]`` of the FIRST entry only is taken as the reverse primer; it
    is passed through ``change_primer_seqs`` and compiled as a regular
    expression.

    For each read the first matching entry decides the output written to
    ``<output_dir>/greped.fq``:

    * read starts with ``entry[1]``: the read is cut at the first match of
      the compiled pattern and the quality string is cut to the same length;
    * ``reverse_com_seq(entry[1])`` occurs in the last 20 characters: the
      read is rebuilt in the other orientation and written with ``|reverse``
      appended to its id.

    Read counts, both reverse-primer strings and one
    ``key<TAB>count<TAB>library`` line per entry are written to
    ``<output_dir>/grep_seqs_num_log.txt``; the library name is the input
    basename up to the first '.'.

    Calls ``usage()`` and exits with status 1 when fewer than three
    arguments are supplied.  Raises ``IndexError`` when ``primer_list``
    returns no entries.
    """
    try:
        input_seqs_ul = sys.argv[1]
        primer_ul = sys.argv[2]
        output_dir = sys.argv[3]
    except IndexError:
        usage()
        sys.exit(1)

    primers = primer_list(primer_ul)
    output_fq = output_dir + '/greped.fq'
    grep_seqs_log = output_dir + '/grep_seqs_num_log.txt'

    # Per-primer hit counters, all starting at zero.
    grep_seqs_dic = {}
    for primer in primers:
        grep_seqs_dic[primer[0]] = 0

    matched_seqs = 0
    total_seqs_num = 0

    # Example from the original author: reverse primer GGACTACCVGGGTATCTAAT
    # (contains a degenerate base) becomes the pattern
    # ATTAGATACCC[CTG]GGTAGTCC.
    # NOTE(review): presumably change_primer_seqs produces that
    # reverse-complement regex -- confirm against its definition.
    rprimer = primers[0][2]
    rprimer_len = len(rprimer)
    rc_rprimer = change_primer_seqs(rprimer)
    pattern = re.compile(rc_rprimer)

    # 'with' closes both files even when parsing or writing fails.
    with open(output_fq, 'w') as outf, open(grep_seqs_log, 'w') as grep_log:
        for seq_id, seq, qual in MinimalFastqParser(input_seqs_ul,
                                                    strict=False):
            total_seqs_num += 1
            for primer in primers:
                fprimer = primer[1]
                if seq.startswith(fprimer):
                    # Keep everything before the reverse primer.  When the
                    # pattern does not match, split() returns the whole read.
                    new_seq = pattern.split(seq)[0]
                    # Cut the qualities to the same length; a FASTQ record
                    # needs one quality character per base.
                    new_qual = qual[:len(new_seq)]
                    outf.write('@%s\n%s\n+\n%s\n' %
                               (seq_id, new_seq, new_qual))
                else:
                    rc_fprimer = reverse_com_seq(fprimer)
                    rpart_seq = seq[-20:]
                    if rc_fprimer not in rpart_seq:
                        # Neither orientation matches; try the next primer.
                        continue
                    # Drop the first rprimer_len characters and the final 20.
                    fpart_seq = seq[rprimer_len:-20]
                    # Part of the final 20 characters that precedes the
                    # converted primer.
                    new_rpart = rpart_seq[:rpart_seq.index(rc_fprimer)]
                    new_seq = fprimer + reverse_com_seq(fpart_seq + new_rpart)
                    # NOTE(review): this takes a prefix of the original
                    # quality string without reversing it, so the values may
                    # not line up with the rebuilt bases.  Kept as written.
                    new_qual = qual[:len(new_seq)]
                    outf.write('@%s\n%s\n+\n%s\n' %
                               (seq_id + '|reverse', new_seq, new_qual))
                grep_seqs_dic[primer[0]] += 1
                matched_seqs += 1
                break  # first matching primer wins

        grep_log.write("There are %s seqs totally!\n" % total_seqs_num)
        grep_log.write("Matched seqs: %s !\n" % matched_seqs)
        grep_log.write("No matched seqs: %s !\n" %
                       (total_seqs_num - matched_seqs))
        grep_log.write("Reverse primer is: %s !\n" % rprimer)
        grep_log.write("RC. reverse primer is: %s !\n" % rc_rprimer)

        lib_name = os.path.basename(input_seqs_ul).split('.')[0]
        for key in grep_seqs_dic:
            grep_log.write('%s\t%s\t%s\n' %
                           (key, grep_seqs_dic[key], lib_name))