def main(chrs, scale, step, meth_dir, cpg_dir, out_dir, sname):
    """Build CpG sites per chromosome and locate their methylation files.

    Opens 'create_microarray_tracks.LOG' for writing and, for every name in
    *chrs*:

    1. resolves '<chr>_cpgs' through check_file(..., PATH=cpg_dir), parses
       it with SiteParser in 'CpGSimple' mode and gives every parsed site
       the placeholder score '-10000';
    2. resolves 'methstatus_<chr>' through check_file(..., PATH=meth_dir).

    A MethError in step 1 is reported on stderr and the chromosome is
    skipped; a MethError in step 2 is reported and ends the program with
    exit status 2.

    NOTE(review): scale, step, out_dir and sname are not referenced in the
    visible part of this function and the log handle is still open when it
    ends -- presumably the function continues beyond this point; confirm.
    """
    LOGFH = open('create_microarray_tracks.LOG', 'w')
    LOGFH.write('Start the program ... %s\n' % datetime.datetime.now())

    # Create methylation tracks for each chromosome
    for chr in chrs:
        # 1. Build CpGs
        LOGFH.write('... Build CpG sites for [ %s ] ... %s\n'
                    % (chr, datetime.datetime.now()))
        try:
            cpgfile = check_file(chr + '_cpgs', PATH=cpg_dir)
        except MethError as e:
            # No CpG annotation for this chromosome: report it and move on.
            sys.stderr.write(str(e.value) + '\n')
            continue

        siteparser = SiteParser(chr)
        siteparser.parse_sites(cpgfile, 'CpGSimple')
        cpg_sites = siteparser.get_sites()
        for cpg in cpg_sites:
            # define empty meth score
            cpg.meth_score = '-10000'

        # 2. Get methylation scores
        # note files are saved in 'methstatus_chrN'
        LOGFH.write('... Obtain methylation scores ...\n')
        try:
            mfile = check_file('methstatus_' + chr, PATH=meth_dir)
        except MethError as e:
            # Unlike a missing CpG file, a missing score file is fatal.
            sys.stderr.write(str(e.value) + '\n')
            sys.exit(2)
def main(chrs, scale, step, meth_dir, cpg_dir, out_dir, sname):
    """Build CpG sites per chromosome and locate their methylation files.

    Opens 'create_microarray_tracks.LOG' for writing and, for every name in
    *chrs*:

    1. resolves '<chr>_cpgs' through check_file(..., PATH=cpg_dir), parses
       it with SiteParser in 'CpGSimple' mode and gives every parsed site
       the placeholder score '-10000';
    2. resolves 'methstatus_<chr>' through check_file(..., PATH=meth_dir).

    A MethError in step 1 is reported on stderr and the chromosome is
    skipped; a MethError in step 2 is reported and ends the program with
    exit status 2.

    NOTE(review): scale, step, out_dir and sname are not referenced in the
    visible part of this function and the log handle is still open when it
    ends -- presumably the function continues beyond this point; confirm.
    """
    LOGFH = open('create_microarray_tracks.LOG', 'w')
    LOGFH.write('Start the program ... %s\n' % datetime.datetime.now())

    # Create methylation tracks for each chromosome
    for chr in chrs:
        # 1. Build CpGs
        LOGFH.write('... Build CpG sites for [ %s ] ... %s\n'
                    % (chr, datetime.datetime.now()))
        try:
            cpgfile = check_file(chr + '_cpgs', PATH=cpg_dir)
        except MethError as e:
            # No CpG annotation for this chromosome: report it and move on.
            sys.stderr.write(str(e.value) + '\n')
            continue

        siteparser = SiteParser(chr)
        siteparser.parse_sites(cpgfile, 'CpGSimple')
        cpg_sites = siteparser.get_sites()
        for cpg in cpg_sites:
            # define empty meth score
            cpg.meth_score = '-10000'

        # 2. Get methylation scores
        # note files are saved in 'methstatus_chrN'
        LOGFH.write('... Obtain methylation scores ...\n')
        try:
            mfile = check_file('methstatus_' + chr, PATH=meth_dir)
        except MethError as e:
            # Unlike a missing CpG file, a missing score file is fatal.
            sys.stderr.write(str(e.value) + '\n')
            sys.exit(2)
def main(parafile, chr, anno_dir, re_fragfile, mcrbc_fragfile, out_dir):
    """Resolve the CpG, RE and McrBC annotation files for one chromosome.

    The three files are named '<chr>_cpgs', '<chr>_re' and '<chr>_mcrbc'
    and are resolved through check_file(..., PATH=anno_dir).  If any lookup
    raises MethError, the error value is written to stderr and the program
    exits with status 2.

    NOTE(review): parafile, re_fragfile, mcrbc_fragfile and out_dir are not
    referenced in the visible part of this function -- presumably it
    continues beyond this point; confirm.
    """
    # Get annotation files: e.g., chr1_cpgs, chr1_re, chr1_mcrbc
    try:
        cpg_file = check_file(chr + '_cpgs', PATH=anno_dir)
        re_sitefile = check_file(chr + '_re', PATH=anno_dir)
        mcrbc_sitefile = check_file(chr + '_mcrbc', PATH=anno_dir)
    except MethError as e:
        sys.stderr.write(str(e.value) + '\n')
        sys.exit(2)
def main(chrs, meth_dir, out_dir, sname):
    """Write one variableStep wiggle track per chromosome.

    Opens 'create_wiggle_tracks.LOG' for writing, then for every name in
    *chrs* resolves 'methstatus_<chr>' through check_file(..., PATH=meth_dir),
    reads it as tab-separated text (column 2 = integer coordinate,
    column 9 = score, kept as text) and writes 'meth_<chr>.wig' with two
    header lines followed by 'coordinate<TAB>score' records in ascending
    coordinate order.  *sname* only appears in the track description.

    A MethError from check_file is reported on stderr and ends the program
    with exit status 2.

    NOTE(review): out_dir is accepted but never used -- the .wig file is
    created relative to the current working directory.  The log handle is
    also still open when the visible code ends; confirm whether the function
    continues beyond this point.
    """
    LOGFH = open('create_wiggle_tracks.LOG', 'w')
    LOGFH.write('Start the program ... %s\n' % datetime.datetime.now())

    # Create methylation tracks for each chromosome
    for chrom in chrs:
        LOGFH.write('... Obtain methylation scores for [ %s ] ...\n' % chrom)

        # 1. Get methylation scores
        # note files are saved in 'methstatus_chrN'
        try:
            mfile = check_file('methstatus_' + chrom, PATH=meth_dir)
        except MethError as e:
            sys.stderr.write(str(e.value) + '\n')
            sys.exit(2)

        cpg_sites = {}  # coordinate: score
        # 'with' closes the handle even if a malformed line raises.
        with open(mfile) as mfh:
            for line in mfh:
                fields = line.rstrip().split('\t')
                cpg_sites[int(fields[1])] = fields[8]

        # 2. Write to the output file
        # Some fixed info in the output file
        header1 = ('track type=wiggle_0 name=' + chrom
                   + ' description=Wiggle custom track for '
                   + sname + '_' + chrom
                   + ' color=128,0,0 visibility=full')
        header2 = 'variableStep chrom=' + chrom
        outfile = 'meth_' + chrom + '.wig'
        with open(outfile, 'w') as fout:
            fout.write(header1 + '\n' + header2 + '\n')
            # sorted() works on both Python 2 and 3; keys().sort() does not.
            for coord in sorted(cpg_sites):
                fout.write('\t'.join([str(coord), cpg_sites[coord]]) + '\n')

        # Drop the per-chromosome table before loading the next one.
        del cpg_sites
def main(chrs, meth_dir, out_dir, sname):
    """Write one variableStep wiggle track per chromosome.

    Opens 'create_wiggle_tracks.LOG' for writing, then for every name in
    *chrs* resolves 'methstatus_<chr>' through check_file(..., PATH=meth_dir),
    reads it as tab-separated text (column 2 = integer coordinate,
    column 9 = score, kept as text) and writes 'meth_<chr>.wig' with two
    header lines followed by 'coordinate<TAB>score' records in ascending
    coordinate order.  *sname* only appears in the track description.

    A MethError from check_file is reported on stderr and ends the program
    with exit status 2.

    NOTE(review): out_dir is accepted but never used -- the .wig file is
    created relative to the current working directory.  The log handle is
    also still open when the visible code ends; confirm whether the function
    continues beyond this point.
    """
    LOGFH = open('create_wiggle_tracks.LOG', 'w')
    LOGFH.write('Start the program ... %s\n' % datetime.datetime.now())

    # Create methylation tracks for each chromosome
    for chrom in chrs:
        LOGFH.write('... Obtain methylation scores for [ %s ] ...\n' % chrom)

        # 1. Get methylation scores
        # note files are saved in 'methstatus_chrN'
        try:
            mfile = check_file('methstatus_' + chrom, PATH=meth_dir)
        except MethError as e:
            sys.stderr.write(str(e.value) + '\n')
            sys.exit(2)

        cpg_sites = {}  # coordinate: score
        # 'with' closes the handle even if a malformed line raises.
        with open(mfile) as mfh:
            for line in mfh:
                fields = line.rstrip().split('\t')
                cpg_sites[int(fields[1])] = fields[8]

        # 2. Write to the output file
        # Some fixed info in the output file
        header1 = ('track type=wiggle_0 name=' + chrom
                   + ' description=Wiggle custom track for '
                   + sname + '_' + chrom
                   + ' color=128,0,0 visibility=full')
        header2 = 'variableStep chrom=' + chrom
        outfile = 'meth_' + chrom + '.wig'
        with open(outfile, 'w') as fout:
            fout.write(header1 + '\n' + header2 + '\n')
            # sorted() works on both Python 2 and 3; keys().sort() does not.
            for coord in sorted(cpg_sites):
                fout.write('\t'.join([str(coord), cpg_sites[coord]]) + '\n')

        # Drop the per-chromosome table before loading the next one.
        del cpg_sites
def main(chrs, cpg_dir, out_dir):
    """Write CpG, RE and McrBC BED tracks for every chromosome.

    Opens 'create_cpg_tracks.LOG' for writing, then for every name in
    *chrs* resolves '<chr>_cpgs' through check_file(name, cpg_dir) and reads
    it as tab-separated text: column 2 is an integer position, columns 3
    and 4 are integer flags.  Three files are created in *out_dir*:

    * '<chr>_cpgs.bed'  -- one record per input line,
    * '<chr>_re.bed'    -- only lines whose column-3 flag is non-zero,
    * '<chr>_mcrbc.bed' -- only lines whose column-4 flag is non-zero.

    Each file starts with a 'browser position' line and a 'track' line, and
    every record is '<chr>\\t<position>\\t<position + 2>'.

    A MethError from check_file is reported on stderr and ends the program
    with exit status 2.

    NOTE(review): the log handle is still open when the visible code ends;
    confirm whether the function continues beyond this point.
    """
    LOGFH = open('create_cpg_tracks.LOG', 'w')
    LOGFH.write('Start the program ... %s\n' % datetime.datetime.now())

    # Get track values for each chromosome
    for chrom in chrs:
        LOGFH.write('... Read CpGs on [ %s ] ... %s\n'
                    % (chrom, datetime.datetime.now()))

        # Some fixed info in the output file
        header1 = 'browser position ' + chrom + ':1-10000'
        header_cpg = ('track name="' + chrom + ' CpG" description="CpGs on '
                      + chrom + '" color=0,0,0')
        header_re = ('track name="' + chrom + ' RE" description="RE sites on '
                     + chrom + '" color=255,0,0')
        header_mcrbc = ('track name="' + chrom
                        + ' McrBC" description="McrBC sites on '
                        + chrom + '" color=0,0,255')

        # Read CpG file
        try:
            cpgfile = check_file(chrom + '_cpgs', cpg_dir)
        except MethError as e:
            # The original wrote to a bare 'stderr' name; use sys.stderr so
            # the error path does not depend on a separate import.
            sys.stderr.write(str(e.value) + '\n')
            sys.exit(2)

        # One 'with' owns all four handles so they are closed together,
        # including when a malformed input line raises.
        with open(cpgfile) as infh, \
                open(os.path.join(out_dir, chrom + '_cpgs.bed'), 'w') as cpgout, \
                open(os.path.join(out_dir, chrom + '_re.bed'), 'w') as reout, \
                open(os.path.join(out_dir, chrom + '_mcrbc.bed'), 'w') as mcrbcout:
            cpgout.write(header1 + '\n')
            cpgout.write(header_cpg + '\n')
            reout.write(header1 + '\n')
            reout.write(header_re + '\n')
            mcrbcout.write(header1 + '\n')
            mcrbcout.write(header_mcrbc + '\n')

            for line in infh:
                fields = line.rstrip().split('\t')
                position = int(fields[1])
                isre = int(fields[2])
                ismcrbc = int(fields[3])

                # The same record text goes to every track the site is in.
                record = '%s\t%d\t%d\n' % (chrom, position, position + 2)
                cpgout.write(record)
                if isre:
                    reout.write(record)
                if ismcrbc:
                    mcrbcout.write(record)
return lib_dict[arg] if __name__ == '__main__': parser = argparse.ArgumentParser(description='Parse mate-pair reads to get methylation compartments: \ methylated fragments and unmethylated fragments.', epilog='Save coordinates of parsed methylated/unmethylated fragments \ for each chromosome in BED format, generating files like chr*_re.bed.') parser.add_argument('lib', choices=['re', 'mcrbc'], help='sequences generated by the RE or McrBC library') parser.add_argument('cmap', help='chromosome map provided by SOLiD') parser.add_argument('mates', help='mate-pair reads from either the RE or McrBC library') parser.add_argument('--out_dir', default=os.getcwd(), help='directory for parsed fragments, default=current dir') # Parse arguments args = parser.parse_args() readlib = get_readlib(args.lib) try: cmapfile = check_file(args.cmap) matesfile = check_file(args.mates) except MethError, e: parser.print_usage() print >> sys.stderr, 'MethError: ', e.value sys.exit(2) if os.path.isdir(args.out_dir): out_dir = os.path.abspath(args.out_dir) else: parser.print_usage() print >> sys.stderr, 'Invalid output directory' sys.exit(2) main(readlib, cmapfile, matesfile, out_dir)
parser = argparse.ArgumentParser(description='Parse mate-pair reads for a specific chromosome. Reads are saved in SAM/BAM format.', epilog='Save coordinates of fragments that are formed by properly paired reads \ in BED formate, generating files like chr1_re.bed.') parser.add_argument('--flag', type=bool, nargs='?', const=True, default=False, \ help='use flag value to extract paired reads, default=False') parser.add_argument('--min_ins', type=int, default=0, help='the minimum insert size of mate-pair reads, default=0') parser.add_argument('--max_ins', type=int, default=15000, help='the maximum insert size of mate-pair reads, default=15000') parser.add_argument('--out_dir', default=os.getcwd(), help='directory for parsed fragments, default=current dir') parser.add_argument('format', choices=['sam', 'bam'], help='input file format') parser.add_argument('lib', choices=['re', 'mcrbc'], help='sequences generated by the RE or McrBC library') parser.add_argument('chr', help='chromosome name, e.g., chr1') parser.add_argument('input', help='input file name') # Parse arguments args = parser.parse_args() readlib = get_readlib(args.lib) fformat = get_format(args.format) try: infile = check_file(args.input) except MethError, e: parser.print_usage() print >> sys.stderr, 'MethError: ', e.value sys.exit(2) if os.path.isdir(args.out_dir): out_dir = os.path.abspath(args.out_dir) else: parser.print_usage() print >> sys.stderr, 'Invalid output directory' sys.exit(2) main(args.flag, args.min_ins, args.max_ins, fformat, readlib, args.chr, infile, out_dir)
# NOTE(review): the loop and the two close() calls below are the tail of a
# definition whose header lies outside the visible span; they are reproduced
# unchanged and must keep the indentation of that definition.
for line in fragfh:
    # BED format
    line_list = line.rstrip().split("\t")
    length = str(int(line_list[2]) - int(line_list[1]))
    id = line_list[3] + '-' + length
    id_insert_point = bisect.bisect_right(idlist, id)
    if id_insert_point != 0 and id_insert_point <= len(idlist) and idlist[id_insert_point-1] == id:
        newfh.write(line)
fragfh.close()
newfh.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Create custom track files for RE/McrBC fragments '
                    '(BED format) to display in the UCSC Genome Browser.')
    parser.add_argument('--header', type=int, default=2,
                        help='number of header lines, default=2')
    parser.add_argument('id_list',
                        help='list of [fragment IDs, start coordinate, and '
                             'end coordinate], generated by filter.py')
    parser.add_argument('frags',
                        help='the original fragments (BED format) generated '
                             'by parse_mates.py')
    parser.add_argument('newfrags', help='the new fragments based on id_list')

    # Parse arguments
    args = parser.parse_args()
    header = args.header
    outfile = args.newfrags

    # Both input files must pass check_file before any work starts.
    try:
        listfile = check_file(args.id_list)
        fragfile = check_file(args.frags)
    except MethError as e:
        parser.print_usage()
        # The original wrote to a bare 'stderr' name; sys.stderr matches the
        # sys.exit() call on the next line.
        sys.stderr.write(str(e.value) + '\n')
        sys.exit(2)

    main(header, listfile, fragfile, outfile)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='The master script to generate all sub-scripts for the '
                    'data analysis pipeline')
    # NOTE(review): type=bool converts any non-empty value to True, so
    # "--run False" still enables the run; only a bare "--run" or omitting
    # the flag behaves as the help text suggests.
    parser.add_argument('--run', type=bool, nargs='?', const=True,
                        default=False,
                        help='run the analysis pipeline or just save scripts, '
                             'default=False')
    parser.add_argument('--format', choices=['mates', 'sam', 'bam'],
                        default='mates',
                        help='paired file format, default=mates')
    parser.add_argument('para', help='parameters required in the pipeline')
    parser.add_argument('re_reads', help='paired read file for RE fragments')
    parser.add_argument('mcrbc_reads',
                        help='paired read file for McrBC fragments')
    parser.add_argument('out_dir',
                        help='directory for all output files of the pipeline')

    # Parse arguments
    args = parser.parse_args()

    try:
        parafile = check_file(args.para)
        refile = check_file(args.re_reads)
        mcrbcfile = check_file(args.mcrbc_reads)
    except MethError as e:
        parser.print_usage()
        sys.stderr.write(str(e.value) + '\n')
        sys.exit(2)

    # Without this error branch an invalid directory left out_dir unbound
    # and the main() call below failed with a NameError.
    if not os.path.isdir(args.out_dir):
        parser.print_usage()
        sys.stderr.write('Invalid output directory\n')
        sys.exit(2)
    out_dir = os.path.abspath(args.out_dir)

    main(args.run, args.format, parafile, refile, mcrbcfile, out_dir)
1. failed fragments in four classes: failed_refrags_ends, failed_refrags_mid, failed_mcrbcfrags_ends, failed_mcrbcfrags_mid 2. passed fragments in four classes: passed_refrags_1end, passed_refrags_2ends, passed_mcrbcfrags_2ends, passed_mcrbcfrags_1end 3. save filtering results to files''')) parser.add_argument('--para', help='filtering parameters', default=os.path.join(DIR, 'data/filter_para')) parser.add_argument('--out_dir', help='directory for output files, default=currect dir', default=os.getcwd()) parser.add_argument('chr', help='chromosome') parser.add_argument('anno_dir', help='directory storing annotation files for CpG, RE, and McrBC sites') parser.add_argument('re_frags', help='RE fragments') parser.add_argument('mcrbc_frags', help='McrBC fragments') # Parse arguments args = parser.parse_args() try: parafile = check_file(args.para) # parafile except MethError, e: parser.print_usage() print >> sys.stderr, e.value sys.exit(2) if os.path.isdir(args.anno_dir) is True and os.path.isdir(args.out_dir) is True: anno_dir = os.path.abspath(args.anno_dir) # anno_dir out_dir = os.path.abspath(args.out_dir) # out_dir else: parser.print_usage() print >> sys.stderr, 'Invalid directories' sys.exit(2) try: chr = check_chr(args.chr) # chr re_fragfile = check_file(args.re_frags) # re_fragfile mcrbc_fragfile = check_file(args.mcrbc_frags) # mcrbc_fragfile
# NOTE(review): the three statements below are the tail of a definition whose
# header lies outside the visible span; they are reproduced unchanged and
# must keep the indentation of that definition.
outfh.close()
LOGFH.write('Finish the program ... %s\n\n' % str(datetime.datetime.now()))
LOGFH.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Estimate DNA methylation states for CpGs in a sample')
    parser.add_argument('--out_dir', default=os.getcwd(),
                        help='directory for output files, default=current dir')
    parser.add_argument('meth_ave', type=float,
                        help='global methylation level estimated by LUMA')
    parser.add_argument('chr', help='chromosome')
    parser.add_argument('chr_len', type=int, help='chromosome length')
    parser.add_argument('methdata',
                        help='methylation data generated by filter.py')

    # Parse arguments
    args = parser.parse_args()
    p_bar = args.meth_ave
    chr_len = args.chr_len

    # Validate the chromosome name and the data file before doing any work.
    try:
        chr = check_chr(args.chr)
        meth_data = check_file(args.methdata)
    except MethError as e:
        parser.print_usage()
        sys.stderr.write(str(e.value) + '\n')
        sys.exit(2)

    if not os.path.isdir(args.out_dir):
        parser.print_usage()
        sys.stderr.write('Invalid directories\n')
        sys.exit(2)
    out_dir = os.path.abspath(args.out_dir)

    main(p_bar, chr, chr_len, meth_data, out_dir)
# NOTE(review): this call is the tail of a definition whose header lies
# outside the visible span; it is reproduced unchanged and must keep the
# indentation of that definition.
LOGFH.close()


def write2output(filename, sites, chr):
    """Write one line per site to *filename*.

    filename -- path of the text file to create (opened with mode 'w')
    sites    -- object whose sorted_iter() yields the sites to write, in
                the order they should appear in the file
    chr      -- passed to each site's get_record() to build its record

    Every record returned by get_record() is written followed by a newline.
    """
    # 'with' guarantees the handle is closed even if get_record() raises.
    with open(filename, 'w') as fh:
        for pos in sites.sorted_iter():
            fh.write(pos.get_record(chr) + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Parse genomic sequences for CpG, RE, and McrBC sites')
    parser.add_argument('chr', help='chromosome')
    parser.add_argument('fasta', help='Fasta sequence for the chromosome')
    parser.add_argument('--outname',
                        help='name for the output files, default=chrN')

    # Parse arguments
    args = parser.parse_args()

    try:
        chr = check_chr(args.chr)
        seqfile = check_file(args.fasta)
    except MethError as e:
        parser.print_usage()
        sys.stderr.write(str(e.value) + '\n')
        sys.exit(2)

    # Output files are named after the chromosome unless --outname is given.
    outname = args.outname if args.outname is not None else chr

    main(chr, seqfile, outname)
parser.add_argument( '--out_dir', default=os.getcwd(), help='directory for parsed fragments, default=current dir') parser.add_argument('format', choices=['sam', 'bam'], help='input file format') parser.add_argument('lib', choices=['re', 'mcrbc'], help='sequences generated by the RE or McrBC library') parser.add_argument('chr', help='chromosome name, e.g., chr1') parser.add_argument('input', help='input file name') # Parse arguments args = parser.parse_args() readlib = get_readlib(args.lib) fformat = get_format(args.format) try: infile = check_file(args.input) except MethError, e: parser.print_usage() print >> sys.stderr, 'MethError: ', e.value sys.exit(2) if os.path.isdir(args.out_dir): out_dir = os.path.abspath(args.out_dir) else: parser.print_usage() print >> sys.stderr, 'Invalid output directory' sys.exit(2) main(args.flag, args.min_ins, args.max_ins, fformat, readlib, args.chr, infile, out_dir)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Create custom track files for RE/McrBC fragments '
                    '(BED format) to display in the UCSC Genome Browser.')
    parser.add_argument('--header', type=int, default=2,
                        help='number of header lines, default=2')
    parser.add_argument('id_list',
                        help='list of [fragment IDs, start coordinate, and '
                             'end coordinate], generated by filter.py')
    parser.add_argument('frags',
                        help='the original fragments (BED format) generated '
                             'by parse_mates.py')
    parser.add_argument('newfrags', help='the new fragments based on id_list')

    # Parse arguments
    args = parser.parse_args()
    header = args.header
    outfile = args.newfrags

    # Both input files must pass check_file before any work starts.
    try:
        listfile = check_file(args.id_list)
        fragfile = check_file(args.frags)
    except MethError as e:
        parser.print_usage()
        # The original wrote to a bare 'stderr' name; sys.stderr matches the
        # sys.exit() call on the next line.
        sys.stderr.write(str(e.value) + '\n')
        sys.exit(2)

    main(header, listfile, fragfile, outfile)
epilog='Save coordinates of parsed methylated/unmethylated fragments \ for each chromosome in BED format, generating files like chr*_re.bed.' ) parser.add_argument('lib', choices=['re', 'mcrbc'], help='sequences generated by the RE or McrBC library') parser.add_argument('cmap', help='chromosome map provided by SOLiD') parser.add_argument( 'mates', help='mate-pair reads from either the RE or McrBC library') parser.add_argument( '--out_dir', default=os.getcwd(), help='directory for parsed fragments, default=current dir') # Parse arguments args = parser.parse_args() readlib = get_readlib(args.lib) try: cmapfile = check_file(args.cmap) matesfile = check_file(args.mates) except MethError, e: parser.print_usage() print >> sys.stderr, 'MethError: ', e.value sys.exit(2) if os.path.isdir(args.out_dir): out_dir = os.path.abspath(args.out_dir) else: parser.print_usage() print >> sys.stderr, 'Invalid output directory' sys.exit(2) main(readlib, cmapfile, matesfile, out_dir)
for line in lines: if not re.search('^#', line): # skip comments k, v = line.rstrip().split('\t') para_dict[k] = v return para_dict if __name__ == '__main__': parser = argparse.ArgumentParser(description='The master script to generate all sub-scripts for the data analysis pipeline') parser.add_argument('--run', type=bool, nargs='?', const=True, default=False, \ help='run the analysis pipeline or just save scripts, default=False') parser.add_argument('--format', choices=['mates', 'sam', 'bam'], default='mates', help='paired file format, default=mates') parser.add_argument('para', help='parameters required in the pipeline') parser.add_argument('re_reads', help='paired read file for RE fragments') parser.add_argument('mcrbc_reads', help='paired read file for McrBC fragments') parser.add_argument('out_dir', help='directory for all output files of the pipeline') # Parse arguments args = parser.parse_args() try: parafile = check_file(args.para) refile = check_file(args.re_reads) mcrbcfile = check_file(args.mcrbc_reads) except MethError, e: parser.print_usage() print >> sys.stderr, e.value sys.exit(2) if os.path.isdir(args.out_dir): out_dir = os.path.abspath(args.out_dir) main(args.run, args.format, parafile, refile, mcrbcfile, out_dir)