# This function emulates the exact formating that perl uses for # float values. It is of no functional importance for this script. def perl_like_float_format(value): int_part = str(int(value)) format_string = "{0:." + str(15 - len(int_part)) + "f}" return format_string.format(value).rstrip('0').rstrip('.') outfile.write(chromo + '\t' + str(position) + '\t' + perl_like_float_format(normal) + '\t' + perl_like_float_format(tumor) + '\n') for chromo in [str(n) for n in range(1, 22 + 1)] + ["X", "Y"]: print chromo input_filename = options["inputpath"] + chromo + options["inputsuffix"] infile = Tabfile.Input(gzip.open(input_filename, 'rb')) accum_lines = [infile.readline()] for line in infile: if (int(line["normal"]) < options["coverage"] or float(line["map"]) < options["mappability"]): continue if (line["chr"] == accum_lines[0]["chr"] and int(line["pos"]) // 10000 == int(accum_lines[0]["pos"]) // 10000): accum_lines.append(line) else: process_accumulated_lines(accum_lines)
# Command-line options; `parser` is created before this fragment.
# NOTE(review): indentation below was reconstructed from a whitespace-collapsed
# source -- confirm nesting.

# type=file uses the Python 2 builtin, so argparse opens the given path itself.
parser.add_argument('--file', '-f', type=file, help="segment file with copy number information")
# Defaults to the sys.stdout object; a user-supplied value is a path string.
parser.add_argument('--out', '-o', default=sys.stdout, type=str, help='outputfile')
# NOTE(review): the help text duplicates the one for --out, and type=str means
# a user-supplied value arrives as a string while the default 900 is an int --
# confirm how args.length is consumed.
parser.add_argument('--length', '-l', default=900, type=str, help='outputfile')
args = parser.parse_args()
out = args.out

# Wrap the already opened segment file in the project's tab-file reader.
try:
    infile = Tabfile.Input(args.file)
except IOError as (errno, strerr):
    sys.stderr.write("IOError %i: %s\n" % (errno, strerr))
    sys.exit(2)

# Only open a real file when --out was given; otherwise keep writing to stdout.
if out != sys.stdout:
    try:
        out = open(args.out, 'w')
    except IOError as (errno, strerr):
        # sys.exit() with a string prints it to stderr and exits with status 1.
        sys.exit("IOError %i: %s\n" % (errno, strerr))


# NOTE(review): the body of this function continues beyond the visible
# fragment; only its first statements are shown here.
def merge_lines_CN(prior_line, newline, next_line):
    start = ""
    end = ""
    #check whether segment could be merged
# This script replaces segments_to_data.pl and segments_to_data_2.pl. # # usage: segments_to_data.py --pscbs [FILE] --input [FILE] --output [FILE] from python_modules import Tabfile from python_modules import Options import subprocess import gzip import sys options = Options.parse({"pscbs": str, "input": str, "output": str}) if options: pscbsfile = Tabfile.Input(gzip.open(options["pscbs"])) #SNPs could be gzipped or not try: outfile = subprocess.Popen("bgzip >%s" % options["output"], shell=True, stdin=subprocess.PIPE) except IOError as (errno, strerror): syst.stderr.write("I/O error (%i): %s\n" % (errno, strerror)) pscbs_line = pscbsfile.readline() while pscbs_line: current_chromo = pscbs_line["chromosome"] print current_chromo
)  # NOTE(review): closes a call that was opened before this fragment
    # NOTE(review): indentation reconstructed from a whitespace-collapsed
    # source; this exit presumably sits inside an `if` that starts before the
    # visible fragment, mirroring the checks below -- confirm.
    sys.exit(2)

# Both output paths are mandatory.
if not args.sv_out or not args.output:
    sys.stderr.write( "Please specify all output files. For more information use -h\n")
    sys.exit(2)

# The truthiness test also rejects a DDI length of 0.
if not args.DDI_length:
    sys.stderr.write( "Please specify all minimum duplication deletion and inversion (DDI) lengths. For more information use -h\n" )
    sys.exit(2)

# Open all inputs and outputs up front; any IOError aborts the script with the
# OS error number as exit status.
try:
    sv_file = Tabfile.Input(open(args.variants, "r"))
    sv_out = open(args.sv_out, "w")
    knownseg_file = Tabfile.Input(open(args.known_segments, "r"))
    file_out = open(args.output, "w")
    # Only the structural-variant file is scanned by the loop below.
    files = [sv_file]
except IOError as (errno, strerr):
    sys.stderr.write("IOError %i: %s\n" % (errno, strerr))
    sys.exit(errno)

breakpoints = []
# Chromosome names as strings "1" .. "24".
chromosomes = [str(a) for a in range(1, 24 + 1)]
for f in files:
    for line in f:
        # NOTE(review): the condition is cut off at the end of the fragment.
        if line['svtype'] == 'INV' or line['svtype'] == 'DUP' or line[
# NOTE(review): indentation below was reconstructed from a whitespace-collapsed
# source -- confirm nesting.

# Normalise a missing --snps value to an empty path.
if not args.snps:
    args.snps = ""

out = args.out
# When an output path was given, pipe everything through bgzip into that file.
# Note that `out` is then a Popen object rather than a file object.
if args.out != sys.stdout:
    try:
        out = subprocess.Popen("bgzip >%s" % args.out, shell=True, stdin=subprocess.PIPE)
    except IOError as (errno, strerr):
        sys.stderr.write("IOError %i: %s\n" % (errno, strerr))
        sys.exit(errno)

# Try the SNP file as gzip first; if opening it raises IOError, fall back to
# reading from standard input.
try:
    snpFile = Tabfile.Input(gzip.open(args.snps))
except IOError as (errno, strerr):
    try:
        snpFile = Tabfile.Input(sys.stdin)
    except IOError:
        # The message reports errno/strerr of the original gzip.open failure,
        # not of the stdin fallback.
        sys.stderr.write("IOError %i: %s\n" % (errno, strerr))
        sys.exit(errno)

curr_chrom = ''
for line in snpFile:
    chrom = line['chr']
    # Strip the 'chr' prefix from both the local name and the record itself.
    # str.replace removes every occurrence of 'chr', not only the prefix.
    if chrom.startswith('chr'):
        chrom = chrom.replace('chr', '')
        line['chr'] = line['chr'].replace('chr', '')
    # NOTE(review): the loop body continues beyond the visible fragment.
    pos = int(line['startPos'])
# Write the SNP counts of all chromosomes into one bgzip-compressed table.
# NOTE(review): indentation was reconstructed from a whitespace-collapsed
# source; the X/Y renaming is assumed to sit outside the 'chr'-prefix check
# and the row write inside the coverage check -- confirm.
if options:
    # Everything written to outfile.stdin is piped through bgzip into the
    # file named by the "output" option.
    outfile = subprocess.Popen("bgzip >%s" % options["output"], shell=True, stdin=subprocess.PIPE)
    outfile.stdin.write( "chr\tstartPos\tAnormal\tBnormal\tAtumor\tBtumor\thaplotype\n" )  #header

    for chromo in [str(n) for n in range(1, 22 + 1)] + ["X", "Y"]:
        # One gzip-compressed input per chromosome:
        # <inputpath><chromosome><inputsuffix>.
        infile = gzip.open( options["inputpath"] + chromo + options["inputsuffix"], 'rb')
        try:
            for line in Tabfile.Input(infile):
                # Keep only positions whose summed An + Bn counts reach the
                # configured coverage threshold.
                if (int(line["An"]) + int(line["Bn"]) >= options["coverage"]):
                    # The haplotype column is always written as 0 here.
                    line['haplotype'] = 0
                    if line["chr"].startswith("chr"):
                        line["chr"] = line["chr"].replace("chr", "")
                    # Sex chromosomes are written numerically: X -> 23, Y -> 24.
                    line["chr"] = line["chr"].replace("X", "23")
                    line["chr"] = line["chr"].replace("Y", "24")
                    outfile.stdin.write(line["chr"] + '\t' + line["pos"] + '\t' + line["An"] + '\t' + line["Bn"] + '\t' + line["At"] + '\t' + line["Bt"] + '\t' + str(line['haplotype']) + '\n')
        finally:
            # Release the handle before the next chromosome is opened, so the
            # loop does not leave one open gzip file per chromosome behind.
            infile.close()
# This script merges all segmentation approaches into a final segmentation. from python_modules import Tabfile from python_modules import Options options = Options.parse({ "crest_deldupinv": str, "crest_tx": str, "known_segments": str, "output": str, "crest_out": str, "DDI_length": int }) if options: crest_ddi_file = Tabfile.Input(open(options["crest_deldupinv"], "r")) crest_tx_file = Tabfile.Input(open(options["crest_tx"], "r")) crest_out = open(options["crest_out"], "w") file_out = open(options["output"], "w") breakpoints = [] for line in crest_ddi_file: line["LENGTH"] = str(int(line["END"]) - int(line["POS"]) + 1) if (line["SOMATIC_GERMLINE_CLASSIFICATION"] == "somatic" and int(line["LENGTH"]) >= options["DDI_length"] and "CHROM" in line): line["CHROM"] = line["CHROM"].replace("chr", "").replace(
coverage = sum([float(line["coverage"]) for line in lines]) * 10 / len(lines) # This function emulates the exact formating that perl uses for # float values. It is of no functional importance for this script. def perl_like_float_format(value): int_part = str(int(value)) format_string = "{0:." + str(15 - len(int_part)) + "f}" return format_string.format(value).rstrip('0').rstrip('.') outfile.write(chromo + '\t' + str(position) + '\t' + perl_like_float_format(coverage) + '\t' + '\n') # input_filename = options["inputpath"] + chromo + options["inputsuffix"] infile = Tabfile.Input(gzip.open(options['inputfile'], 'rb')) accum_lines = [infile.readline()] for line in infile: if (int(line["coverage"]) < options["coverage"] or float(line["map"]) < options["mappability"]): continue if (line["chr"] == accum_lines[0]["chr"] and int(line["pos"]) // 10000 == int(accum_lines[0]["pos"]) // 10000): accum_lines.append(line) else: process_accumulated_lines(accum_lines) accum_lines = [line]
# NOTE(review): indentation below was reconstructed from a whitespace-collapsed
# source. The fragment starts inside a `try:` block that opens before this
# view; the first four statements are shown at one common level because their
# real nesting (e.g. inside a loop) cannot be determined from here.

    # Column 3 is collected as a distance and column 1 as a ploidy; the whole
    # row is kept as well.
    distances.append(float(fields[3]) )
    ploidies.append(float(fields[1]) )
    entries.append(fields)
    contin=1
except:
    # NOTE(review): a bare except reports every failure in the try block
    # (including conversion or index errors) as a missing file.
    sys.stderr.write( "FILE for %s does not exist\n"% pid)
    sys.exit(2)

# Smallest absolute difference between any ploidy and 2.0 ...
m = min( [ abs( j-2.0 ) for j in ploidies ] )
# ... and the positions of all entries that reach it; only the first is used.
index = [i for i,j in enumerate(ploidies) if abs(j-2.0)==m ]
# Solution 1 is the entry closest to ploidy 2.0, named "<col1>_<col2>".
solutions={1: "%s_%s"% (entries[index[0]][1], entries[index[0]][2] ) }
count=2
# The remaining entries are numbered from 2 upwards in their original order.
for i,j in enumerate(entries):
    if i==index[0]:
        continue
    solutions[count]="%s_%s"% ( entries[i][1], entries[i][2] )
    count+=1

jsonMain={}
for key in solutions.keys():
    try:
        # Parameter file of this solution:
        # <path>/<pid>_cnv_parameter_<solution>.txt
        infile="%s/%s_cnv_parameter_%s.txt"% (path, pid, solutions[key])
        # NOTE(review): the opened file is not closed in the visible code.
        tabfile = Tabfile.Input( open(infile) )
    except IOError as (errno, strerr ):
        sys.exit("IOError %i:%s\n" % (errno, strerr))
    # Convert a simple tab-separated file with a header: only the first record
    # returned by readline() is stored for this solution.
    jsonMain[key]=tabfile.readline()

# Emit all solutions as one indented JSON document.
out.write( json.dumps(jsonMain, indent=2, separators=(",",":")) )