示例#1
0
文件: roc.py 项目: drio/py.analysis
def parse_args():
    """Parse the command line and open the two input files.

    Returns:
        The argparse namespace. ``calls_f`` and ``truth_f`` are replaced by
        whatever ``drdcommon.xopen`` returns for the given names;
        ``output_f`` is the value given with ``-o`` or ``sys.stdout`` when
        the option is omitted.
    """
    parser = argparse.ArgumentParser(description='cnv pipeline')
    parser.add_argument('-c', '--calls', metavar='calls', required=True,
                        dest='calls_f', action='store',
                        help='input bed file with the calls of your classifier')

    parser.add_argument('-t', '--truth', metavar='truth', required=True,
                        dest='truth_f', action='store',
                        help='list of validated cnv calls')

    # This option names the output; its help text must not repeat --truth's.
    parser.add_argument('-o', '--output_f', metavar='output_fn', required=False,
                        dest='output_f', action='store', default=sys.stdout,
                        help='output file name (defaults to stdout)')

    parser.add_argument('-w', '--buffer', metavar='win_buffer', required=False,
                        dest='buffer_size', action='store', type=int, default=0,
                        help='buffer to use when checking hit against truth')

    parser.add_argument('-m', '--min_size', metavar='min_size', required=False,
                        dest='min_size', action='store', type=int, default=0,
                        help='min size of the events you want to consider when loading the truth. ')

    parser.add_argument('-r', '--chrm', metavar='calls_chrm', required=False,
                        dest='calls_chrm', action='store',
                        help='Chrm we used for the calls.')

    args = parser.parse_args()
    args.calls_f = drdcommon.xopen(args.calls_f)
    args.truth_f = drdcommon.xopen(args.truth_f)
    return args
示例#2
0
def main():
    """Run ``do_work`` on stdin and the file named by the first argument.

    When ``sys.argv`` has exactly four entries, stdin and ``sys.argv[1]``
    are opened through ``drdcommon.xopen``, passed to ``do_work`` and then
    closed. Otherwise ``drdcommon.error`` is called with the usage text.
    """
    # NOTE(review): four argv entries are required but only sys.argv[1] is
    # read here -- confirm the expected argument count.
    if len(sys.argv) != 4:
        drdcommon.error("Incorrect # of params.", usage)
    else:
        streams = [drdcommon.xopen(name) for name in ("-", sys.argv[1])]
        do_work(*streams)
        for stream in streams:
            stream.close()
示例#3
0
def main():
    """Run ``do_work`` on stdin plus the two files named on the command line.

    With exactly two arguments, stdin and both files are opened through
    ``drdcommon.xopen`` (in that order), handed to ``do_work`` and closed
    afterwards. Otherwise ``drdcommon.error`` is called with the usage text.
    """
    if len(sys.argv) != 3:
        drdcommon.error("Incorrect # of params.", usage)
    else:
        streams = [drdcommon.xopen(name)
                   for name in ("-", sys.argv[1], sys.argv[2])]
        do_work(*streams)
        for stream in streams:
            stream.close()
示例#4
0
def parse_args():
    """Parse ``-e/--events`` and ``-r/--reference`` and open both files.

    Returns:
        The argparse namespace with two extra attributes,
        ``events_stream`` and ``reference_stream``, holding the results of
        ``drdcommon.xopen`` on the corresponding file names.
    """
    parser = argparse.ArgumentParser(description='cnv pipeline')

    # (short flag, long flag, name used for metavar and dest, help text)
    option_specs = (
        ('-e', '--events', 'events',
         'List of events to introduce in the genome'),
        ('-r', '--reference', 'reference',
         'original fasta file of the reference'),
    )
    for short_flag, long_flag, name, help_text in option_specs:
        parser.add_argument(short_flag, long_flag, metavar=name,
                            required=True, dest=name, action='store',
                            help=help_text)

    args = parser.parse_args()
    args.events_stream = drdcommon.xopen(args.events)
    args.reference_stream = drdcommon.xopen(args.reference)
    return args
示例#5
0
 def load_mapq(self):
     """Fill ``self.d_mq`` from the file named by ``self.sam_fn``.

     Lines whose first character is '@' are skipped. For every other line
     the whitespace-separated fields 2, 3 and 4 are stored as a list under
     the key taken from field 0.
     """
     drdcommon.log("Loading mapq")
     for record in drdcommon.xopen(self.sam_fn):
         if record[0] == '@':
             continue
         fields = record.split()
         self.d_mq[fields[0]] = [fields[2], fields[3], fields[4]]
示例#6
0
 def iterate_over_eg_cov(self):
     """Yield ``(n_hits, probe_id)`` for each line of ``self.fn_hits``.

     Each line is split on whitespace; field 0 is converted to int and
     field 1 is yielded as the probe id.

     Raises:
         ValueError / IndexError: when a line does not start with an
             integer followed by a second field.
     """
     fd_hits = drdcommon.xopen(self.fn_hits)
     try:
         for line in fd_hits:
             s = line.split()
             n_hits, p_id = int(s[0]), s[1].rstrip()
             yield n_hits, p_id
     finally:
         # Also runs when the consumer stops early or a line fails to
         # parse, so the stream is always closed.
         fd_hits.close()
示例#7
0
def main():
    args = parse_args()
    stream = drdcommon.xopen("-")
    if not drdcommon.data_in_stdin():
        drdcommon.error(usage)
    print Saturation(stream, args.at_least_seen).csv("\t")
    stream.close()
示例#8
0
def main():
  """Feed stdin to ``do_work``; no command line arguments are accepted."""
  if len(sys.argv) != 1:
    drdcommon.error("Incorrect # of params.", usage)
  else:
    reads_stream = drdcommon.xopen("-")
    do_work(reads_stream)
    reads_stream.close()
示例#9
0
def method1(input1, input2):
    """
           4      3              5            1
    i1  |------|-------|------------------|--------|
              7                10               1
    i2  |----------|-------------------------|-----|
             3.5             4                  1
    out |----------|-------------------------|-----|
    """
    a = np.zeros(shape=(three_hundre_mil))

    # Save the metric values for all the bp from the stdin input
    working_chrm = ""
    for i in input1:
        working_chrm, start, stop, val = i.split()
        for j in range(int(start), int(stop) + 1):
            a[j] = val

    # Iterate over second output and generate new windows for first input
    for i in drd.xopen(input2):
        chrm, start, stop, val = i.split()
        if working_chrm == chrm:
            s, e = int(start), int(stop)
            print "%s\t%s\t%s\t%s" % (chrm, start, stop,
                                      int(np.median(a[s:e + 1])))
示例#10
0
 def load_mapq(self):
     """Populate ``self.d_mq`` with ``[chrm, coor, mq]`` lists per probe id.

     Reads the file named by ``self.sam_fn``; lines starting with '@' are
     ignored and whitespace-separated fields 0, 2, 3 and 4 of every other
     line are used.
     """
     drdcommon.log("Loading mapq")
     alignments = drdcommon.xopen(self.sam_fn)
     for entry in alignments:
         if entry[0] == '@':
             continue
         columns = entry.split()
         key = columns[0]
         value = [columns[2], columns[3], columns[4]]
         self.d_mq[key] = value
示例#11
0
 def iterate_over_eg_cov(self):
     """Yield ``(n_hits, probe_id)`` for each line of ``self.fn_hits``.

     Each line is split on whitespace; field 0 is converted to int and
     field 1 is yielded as the probe id.

     Raises:
         ValueError / IndexError: when a line does not start with an
             integer followed by a second field.
     """
     fd_hits = drdcommon.xopen(self.fn_hits)
     try:
         for line in fd_hits:
             s = line.split()
             n_hits, p_id = int(s[0]), s[1].rstrip()
             yield n_hits, p_id
     finally:
         # Executed on exhaustion, on a parse error and when the generator
         # is closed before being fully consumed.
         fd_hits.close()
示例#12
0
def main():
    """Pass the stdin stream to ``do_work`` when no arguments are given.

    Any extra command line argument leads to ``drdcommon.error`` with the
    usage text instead.
    """
    no_extra_args = len(sys.argv) == 1
    if not no_extra_args:
        drdcommon.error("Incorrect # of params.", usage)
    else:
        stdin_stream = drdcommon.xopen("-")
        do_work(stdin_stream)
        stdin_stream.close()
示例#13
0
def main():
    """Run ``CnvStateMachine`` over stdin with the threshold from argv[1].

    ``drdcommon.error`` is called when the argument count is not exactly
    one or when ``drdcommon.data_in_stdin()`` is false.
    """
    argc = len(sys.argv)
    if argc != 2:
        drdcommon.error("Wrong # of args", usage)
    if not drdcommon.data_in_stdin():
        drdcommon.error("No data in stdin.", usage)
    machine = CnvStateMachine(drdcommon.xopen("-"), float(sys.argv[1]))
    machine.run()
示例#14
0
def main():
    """Hand the stdin stream to ``do_work`` after validating the call.

    ``drdcommon.error`` is invoked when any command line argument is given
    or when ``drdcommon.data_in_stdin()`` is false.
    """
    has_extra_args = len(sys.argv) != 1
    if has_extra_args:
        drdcommon.error("Wrong # of args", usage)
    stdin_ready = drdcommon.data_in_stdin()
    if not stdin_ready:
        drdcommon.error("No data in stdin.", usage)
    vcf_stream = drdcommon.xopen("-")
    do_work(vcf_stream)
    vcf_stream.close()
示例#15
0
def main():
    """Plot the data read from stdin using title and axis labels from argv.

    ``drdcommon.error`` is called when stdin has no data or when the
    number of command line arguments is not exactly three.
    """
    if not drdcommon.data_in_stdin():
        drdcommon.error("I need a data stream in stdin.", usage=_usage)
    if len(sys.argv) != 4:
        drdcommon.error("Wrong number of parameters", usage=_usage)

    title, x_label, y_label = sys.argv[1:]
    xs, ys = process_data(drdcommon.xopen("-"))
    plot(xs, ys, title, xlabel=x_label, ylabel=y_label)
示例#16
0
def main():
    """Call ``do_work`` with the stdin stream and the integer in argv[1].

    ``drdcommon.error`` is invoked when the argument count is not exactly
    one or when ``drdcommon.data_in_stdin()`` is false.
    """
    argc = len(sys.argv)
    if argc != 2:
        drdcommon.error("Wrong # of args", usage)
    stdin_ready = drdcommon.data_in_stdin()
    if not stdin_ready:
        drdcommon.error("No data in stdin.", usage)
    vcf_stream = drdcommon.xopen("-")
    window_size = int(sys.argv[1])
    do_work(vcf_stream, window_size)
    vcf_stream.close()
示例#17
0
def main():
    """Read (x, y) data from stdin and plot it with labels taken from argv.

    The three arguments are, in order, the title, the x label and the y
    label. ``drdcommon.error`` is called when stdin is empty or the
    argument count is wrong.
    """
    stdin_ready = drdcommon.data_in_stdin()
    if not stdin_ready:
        drdcommon.error("I need a data stream in stdin.", usage=_usage)
    if len(sys.argv) != 4:
        drdcommon.error("Wrong number of parameters", usage=_usage)

    title, label_x, label_y = sys.argv[1:]
    series = process_data(drdcommon.xopen("-"))
    x, y = series
    plot(x, y, title, xlabel=label_x, ylabel=label_y)
示例#18
0
 def process_alignments(self):
     """Record, per probe id, whether its mapping quality exceeds the minimum.

     Reads the file named by ``self.fn_sam``; lines starting with '@' are
     skipped. For the rest, ``self.probe_info[field 0]`` is set to a
     two-item list: ``int(field 4) > self.min_mq`` and an initial False.
     """
     drdcommon.log("Processing alignments")
     for record in drdcommon.xopen(self.fn_sam):
         if record[0] == '@':
             continue
         fields = record.split()
         probe_id, mq = fields[0], int(fields[4])
         # The second slot always starts out as False.
         self.probe_info[probe_id] = [mq > self.min_mq, False]
示例#19
0
def main():
    """Plot the data read from stdin under the title given in argv[1].

    ``drdcommon.error`` is called when stdin has no data or when the
    argument count is not exactly one.
    """
    if not drdcommon.data_in_stdin():
        drdcommon.error("I need a data stream in stdin.", usage="-")
    if len(sys.argv) != 2:
        drdcommon.error("Wrong number of parameters", usage="-")

    title = sys.argv[1]
    xs, ys = process_data(drdcommon.xopen("-"))
    axis_labels = dict(xlabel="genomic window", ylabel="Average Read Depth")
    plot(xs, ys, title, **axis_labels)
示例#20
0
 def run(self):
     logging.basicConfig(level=logging.INFO)
     fd_vcf = drdcommon.xopen(self.options.vcf_fn)
     sf = SnpFreq(fd_vcf, self.exp_type, self.options)
     if self.options.list_s_snps:
         sf.run()
     else:
         print "%f" % sf.run()
     fd_vcf.close()
示例#21
0
def main():
    """Pass the stdin stream to ``do_work``.

    The tool takes no command line arguments, so any extra argument is
    reported through ``drdcommon.error``; the same happens when
    ``drdcommon.data_in_stdin()`` is false.
    """
    # Only stdin is consumed below, so the only valid argv length is 1.
    # Testing ``== 2`` rejected just the one-argument case and let every
    # other count through.
    if len(sys.argv) != 1:
        drdcommon.error("Wrong # of args", usage)
    if not drdcommon.data_in_stdin():
        drdcommon.error("Need data in stdin.", usage)

    fd_vcf = drdcommon.xopen("-")
    do_work(fd_vcf)
    fd_vcf.close()
示例#22
0
 def run(self):
   logging.basicConfig(level=logging.INFO)
   fd_vcf = drdcommon.xopen(self.options.vcf_fn)
   sf = SnpFreq(fd_vcf, self.exp_type, self.options)
   if self.options.list_s_snps:
     sf.run()
   else:
     print "%f" % sf.run()
   fd_vcf.close()
示例#23
0
 def load_hits(self):
     """Load the per-sample hit counts into ``self.d_hits``.

     Every file returned by ``drdcommon.files_in_dir('.', self.pattern)``
     is read; each line must hold exactly two whitespace-separated fields
     and is stored as ``self.d_hits[field 1][sample id] = field 0``
     (the count is kept as a string).
     """
     drdcommon.log("Loading hits")
     for fname in drdcommon.files_in_dir('.', self.pattern):
         sample_id = self.extract_id(fname)
         drdcommon.log("fn: %s | id: %s" % (fname, sample_id))
         for row in drdcommon.xopen(fname):
             columns = row.split()
             assert len(columns) == 2
             n_hits, p_id = columns
             self.d_hits[p_id.rstrip()][sample_id] = n_hits
示例#24
0
def main():
    """Call ``compute_ratios`` with the stdin stream and the file in argv[1].

    ``drdcommon.error`` is invoked for a wrong argument count, for missing
    stdin data, and when argv[1] is not an existing file.
    """
    argc = len(sys.argv)
    if argc != 2:
        drdcommon.error("Wrong # of args", usage)
    stdin_ready = drdcommon.data_in_stdin()
    if not stdin_ready:
        drdcommon.error("No data in stdin.", usage)
    windows = drdcommon.xopen("-")
    bam_name = sys.argv[1]
    bam_exists = os.path.isfile(bam_name)
    if not bam_exists:
        drdcommon.error("Invalid bam file.", usage)
    compute_ratios(windows, bam_name)
示例#25
0
def main():
    """Call ``do_work`` with the stdin stream and the integer in argv[1].

    ``drdcommon.error`` is invoked when the argument count is not exactly
    one or when ``drdcommon.data_in_stdin()`` equals False.
    """
    argc = len(sys.argv)
    if argc != 2:
        drdcommon.error("Wrong # of args", usage)
    stdin_ready = drdcommon.data_in_stdin()
    if stdin_ready == False:
        drdcommon.error("Need data in stdin.", usage)

    min_num_samples = int(sys.argv[1])
    vcf_stream = drdcommon.xopen("-")
    do_work(vcf_stream, min_num_samples)
    vcf_stream.close()
示例#26
0
def main():
    if len(sys.argv) == 3:
        sample_id, vcf_file = sys.argv[1:]

        d = {}
        for l in drdcommon.xopen(vcf_file):
            if l[0] == "#":
                continue
            v_chrm, v_coor = l.split("\t")[0:2]
            d[v_chrm + "_" + v_coor] = True

        for l in drdcommon.xopen("-"):
            l = l.rstrip()
            s = l.split("\t")
            chrm, coor = s[0:2]
            if (chrm + "_" + coor) in d:
                print sample_id + "\t" + l
    else:
        print usage
示例#27
0
 def load_hits(self):
     """Read every matching hits file into ``self.d_hits``.

     Files come from ``drdcommon.files_in_dir('.', self.pattern)`` and the
     sample id from ``self.extract_id``. Each line must split into exactly
     two fields; field 0 (kept as a string) is stored under
     ``self.d_hits[field 1][sample id]``.
     """
     drdcommon.log("Loading hits")
     hit_files = drdcommon.files_in_dir('.', self.pattern)
     for path in hit_files:
         sample_id = self.extract_id(path)
         drdcommon.log("fn: %s | id: %s" % (path, sample_id))
         for entry in drdcommon.xopen(path):
             parts = entry.split()
             assert len(parts) == 2
             count, probe = parts[0], parts[1].rstrip()
             self.d_hits[probe][sample_id] = count
示例#28
0
def main():
  """Plot the values computed from stdin against their 1-based bin number.

  argv[1] is the plot title and argv[2] the output file name. With any
  other argument count ``drdcommon.error`` is called.
  """
  if len(sys.argv) != 3:
    drdcommon.error("Wrong number of args. <title> <output.filename>")
  else:
    logratios = process_data(drdcommon.xopen("-"))
    bin_nums = range(1, len(logratios) + 1)
    title, output_fn = sys.argv[1], sys.argv[2]
    axis_labels = dict(xlabel="bin #", ylabel="log2ratios (sample/control)")
    plot(output_fn, bin_nums, logratios, title, **axis_labels)
示例#29
0
 def process_alignments(self):
     """Fill ``self.probe_info`` from the file named by ``self.fn_sam``.

     Lines starting with '@' are ignored. For every other line the entry
     keyed by field 0 becomes ``[int(field 4) > self.min_mq, False]``.
     """
     drdcommon.log("Processing alignments")
     alignments = drdcommon.xopen(self.fn_sam)
     for entry in alignments:
         if entry[0] == '@':
             continue
         columns = entry.split()
         probe_id = columns[0]
         has_good_qual = int(columns[4]) > self.min_mq
         # The second element is the initial value of the other flag.
         self.probe_info[probe_id] = [has_good_qual, False]
示例#30
0
def main():
  dep_fn = "deps." + str(randint(1,1000000))
  if len(sys.argv) == 2:
    # Dirty hack since I don't know how to make pandas.read_table work
    # off of stdin.
    data = common.xopen(sys.argv[1]).read()
    f = tempfile.NamedTemporaryFile(delete=False)
    f.write(data)
    f.close()
    for i,s in pd.read_table(f.name).iterrows(): # index, pandas series (line)
      print Job(s, dep_fn, i == 0)
  else:
    main_help('Need input file (use - for stdin).', main_help=True)
示例#31
0
def main():
    if len(sys.argv) == 2:
        lo_file = sys.argv[1]
        link = {}
        for l in drdcommon.xopen(lo_file):
            s = l.rstrip().split("\t")
            chrm_from, start_from, end_from = s[0:3]  # hsap
            chrm_to, start_to, end_to = s[3:6]  # rhmac
            # human -> rhmac
            link[chrm_from + "_" + end_from] = chrm_to + "_" + end_to
            #link[chrm_from + "_" + start_from] = chrm_to + "_" + start_to

        for l in drdcommon.xopen("-"):
            s = l.rstrip().split("\t")
            key_hsap = "_".join(s[0:2])
            if key_hsap in link:
                rh_coor = link[key_hsap]
            else:
                rh_coor = "-_-"
            print rh_coor.replace("_", "\t") + "\t" + l.rstrip()
    else:
        print usage
示例#32
0
def main():
    if len(sys.argv) == 2:
        lo_file = sys.argv[1]
        link = {}
        for l in drdcommon.xopen(lo_file):
            s = l.rstrip().split("\t")
            chrm_from, start_from, end_from = s[0:3] # hsap
            chrm_to, start_to, end_to = s[3:6] # rhmac
            # human -> rhmac
            link[chrm_from + "_" + end_from] = chrm_to + "_" + end_to
            #link[chrm_from + "_" + start_from] = chrm_to + "_" + start_to

        for l in drdcommon.xopen("-"):
            s = l.rstrip().split("\t")
            key_hsap = "_".join(s[0:2])
            if key_hsap in link:
                rh_coor = link[key_hsap]
            else:
                rh_coor = "-_-"
            print rh_coor.replace("_", "\t") + "\t" + l.rstrip()
    else:
        print usage
示例#33
0
def main():
  """Scatter-plot the values computed from stdin into "std.dist.png".

  ``process_data`` supplies the x values and the counts; the counts are
  passed through ``log_it(counts, 10)`` before plotting. Any command line
  argument leads to ``drdcommon.error``.
  """
  if len(sys.argv) != 1:
    drdcommon.error("Wrong number of args. Just need std values in stdin.")
  else:
    fd = drdcommon.xopen("-")
    std, counts = process_data(fd)
    plot_options = dict(title="std dev freq of var allele ratios",
                        xlabel="std deviation",
                        ylabel="log10(counts)",
                        dot_size=10)
    drdplots.scatter_plot("std.dist.png", std, log_it(counts, 10),
                          **plot_options)
    fd.close()
示例#34
0
def main():
    if len(sys.argv) == 2:
        lines = []
        prev_dep_file = None
        for idx, line in enumerate(common.xopen(sys.argv[1])):
            if line in ['\n', '\r\n']:
                print "#--------"
                prev_dep_file = cmd2submit(lines, prev_dep_file)
                lines = []
            else:
                lines.append(line.rstrip())
    else:
        main_help('Need input file (use - for stdin).', main_help=True)
示例#35
0
def main():
    dep_fn = "deps." + str(randint(1, 1000000))
    if len(sys.argv) == 2:
        # Dirty hack since I don't know how to make pandas.read_table work
        # off of stdin.
        data = common.xopen(sys.argv[1]).read()
        f = tempfile.NamedTemporaryFile(delete=False)
        f.write(data)
        f.close()
        for i, s in pd.read_table(
                f.name).iterrows():  # index, pandas series (line)
            print Job(s, dep_fn, i == 0)
    else:
        main_help('Need input file (use - for stdin).', main_help=True)
示例#36
0
def main():
    """Plot the stdin-derived values against 1-based bin numbers.

    argv[1] supplies the title and argv[2] the output file name; any other
    argument count is reported through ``drdcommon.error``.
    """
    if len(sys.argv) != 3:
        drdcommon.error("Wrong number of args. <title> <output.filename>")
    else:
        logratios = process_data(drdcommon.xopen("-"))
        bin_nums = range(1, len(logratios) + 1)
        title, output_fn = sys.argv[1:3]
        plot(output_fn, bin_nums, logratios, title,
             xlabel="bin #", ylabel="log2ratios (sample/control)")
示例#37
0
def load_predictions(i_file, chrm_col, coor_col, columns):
    """Index selected columns of a tab separated file by chrm and coordinate.

    Args:
        i_file: name passed to ``drdcommon.xopen``; its first line is
            skipped as a header.
        chrm_col: index of the chromosome column (normalised through
            ``drdcommon.canonic_chrm``).
        coor_col: index of the coordinate column.
        columns: indexes of the columns to keep.

    Returns:
        dict mapping "<chrm>_<coor>" to the kept columns joined by tabs.
    """
    pre = {}
    header = True
    for l in drdcommon.xopen(i_file):
        if header:
            header = False
            continue
        # Drop only the line terminator so that the last column does not
        # carry a trailing newline into keys or values; empty trailing
        # fields are preserved.
        s = l.rstrip("\r\n").split("\t")
        key = drdcommon.canonic_chrm(s[chrm_col]) + "_" + s[coor_col]
        pre[key] = "\t".join([s[c] for c in columns])
    return pre
示例#38
0
 def __load_species_snp_coordinates(self):
     """Load the coordinates in ``self.coor_fn`` into ``self.d_species_coor``.

     Every line must hold two whitespace-separated fields, chrm and
     coordinate; the result is a dict of dicts with
     ``d[chrm][int(coor)] = 1``. The number of lines read and the current
     memory usage are logged afterwards.
     """
     fd = drdcommon.xopen(self.coor_fn)
     d = {}
     self.d_species_coor = d
     n = 0
     try:
         for l in fd:
             n += 1
             chrm, coor = l.split()
             # setdefault replaces the deprecated dict.has_key() check.
             d.setdefault(chrm, {})[int(coor)] = 1
     finally:
         # Close the stream even when a malformed line aborts the load.
         fd.close()
     logging.info("# of coordinates loaded: %d" % n)
     logging.info("current memory usage in %dkb" % drdcommon.memory_usage())
示例#39
0
 def __load_species_snp_coordinates(self):
     """Load the coordinates in ``self.coor_fn`` into ``self.d_species_coor``.

     Every line must hold two whitespace-separated fields, chrm and
     coordinate; the result is a dict of dicts with
     ``d[chrm][int(coor)] = 1``. The number of lines read and the current
     memory usage are logged afterwards.
     """
     fd = drdcommon.xopen(self.coor_fn)
     d = {}
     self.d_species_coor = d
     n = 0
     try:
         for l in fd:
             n += 1
             chrm, coor = l.split()
             # setdefault replaces the deprecated dict.has_key() check.
             d.setdefault(chrm, {})[int(coor)] = 1
     finally:
         # Close the stream even when a malformed line aborts the load.
         fd.close()
     logging.info("# of coordinates loaded: %d" % n)
     logging.info("current memory usage in %dkb" % drdcommon.memory_usage())
示例#40
0
def process_file(wild_file, d, _passes):
    """Add every data line of the files matching ``wild_file`` to ``d``.

    Only files whose name starts with digits followed by a dot are read.
    For each of them the numeric prefix is written to stderr when the
    first line (treated as a header) is seen, and every following line is
    passed to ``d.add(line, _passes)``.

    Returns:
        The same ``d`` object.
    """
    # Raw string: "\d" and "\." are regex escapes, not string escapes.
    id_prefix = re.compile(r"^(\d+)\.")
    for f in glob.glob(wild_file):
        match = id_prefix.search(f)
        if not match:
            continue
        _id = int(match.group(1))
        first_line = True
        for l in xopen(f):
            if first_line:
                sys.stderr.write("%s\n" % _id)
                first_line = False
                continue
            d.add(l, _passes)
    return d
示例#41
0
def main():
    if len(sys.argv) == 2:
        lines = []
        prev_dep_file = None
        for idx, line in enumerate(common.xopen(sys.argv[1])):
            if line in ['\n', '\r\n']:
                print ""
                prev_dep_file = cmd2submit(lines, prev_dep_file)
                lines = []
            else:
                lines.append(line.rstrip())
        print ""
        cmd2submit(lines, prev_dep_file)
    else:
        main_help('Need input file (use - for stdin).', main_help=True)
示例#42
0
def main():
    """Write a scatter plot of the stdin-derived values to "std.dist.png".

    ``process_data`` returns the x values and the counts; the y values are
    ``log_it(counts, 10)``. Any command line argument is reported through
    ``drdcommon.error``.
    """
    if len(sys.argv) != 1:
        drdcommon.error("Wrong number of args. Just need std values in stdin.")
    else:
        stdin_stream = drdcommon.xopen("-")
        std, counts = process_data(stdin_stream)
        title = "std dev freq of var allele ratios"
        y_values = log_it(counts, 10)
        drdplots.scatter_plot("std.dist.png", std, y_values, title=title,
                              xlabel="std deviation", ylabel="log10(counts)",
                              dot_size=10)
        stdin_stream.close()
示例#43
0
def loadCalls(ds, fn, idx, chrm):
    """Store the rounded call value of every base of ``chrm`` in ``ds[idx]``.

    Each line of ``fn`` holds four whitespace-separated fields: chrm,
    start, end and a value. For lines on ``chrm``, every position of the
    inclusive range start..end gets ``round(float(value))``. Progress is
    written to stderr every million bases. ``error`` is called when no
    line for ``chrm`` is found.
    """
    log("Loading calls from %s; idx=%s" % (fn, idx))
    chrm_found = False
    nbp = 0
    for record in drdcommon.xopen(fn):
        c, start, end, cnv = record.strip().split()
        if c != chrm:
            continue
        chrm_found = True
        for position in range(int(start), int(end) + 1):
            if nbp % 1000000 == 0:
                progress = "MEM: %s nbp: %s\r" % (drdcommon.memory_usage(), nbp)
                sys.stderr.write(progress)
            ds[idx][position] = round(float(cnv))
            nbp += 1

    if not chrm_found:
        error("\nCould not find chrm in file. Bailing out.")
    log("\n%s bp loaded" % nbp)
示例#44
0
def loadCalls(ds, fn, idx, chrm):
    """Fill ``ds[idx]`` with the rounded call values found for ``chrm``.

    Lines of ``fn`` carry chrm, start, end and a value. When the
    chromosome matches, ``ds[idx][i]`` is set to ``round(float(value))``
    for every ``i`` in start..end (inclusive), reporting memory usage on
    stderr every million bases. ``error`` is called if the chromosome never
    appears.
    """
    log("Loading calls from %s; idx=%s" % (fn, idx))
    seen_chrm = False
    loaded = 0
    for entry in drdcommon.xopen(fn):
        name, first, last, cnv = entry.strip().split()
        if name == chrm:
            seen_chrm = True
            i = int(first)
            stop = int(last) + 1
            while i < stop:
                if loaded % 1000000 == 0:
                    sys.stderr.write("MEM: %s nbp: %s\r" %
                                     (drdcommon.memory_usage(), loaded))
                ds[idx][i] = round(float(cnv))
                loaded += 1
                i += 1

    if not seen_chrm:
        error("\nCould not find chrm in file. Bailing out.")
    log("\n%s bp loaded" % loaded)
示例#45
0
def loadChrm(ds, ref, chrm):
    """Mark the non-N positions of chromosome ``chrm`` in ``ds[0]``.

    Scans the FASTA file ``ref`` for the record whose header names
    ``chrm`` and sets ``ds[0][i] = 1`` for every base that is not 'N'/'n',
    with ``i`` counting bases from 1. Reading stops at the next header.
    ``error`` is called when the file is missing or holds no such record.
    """
    if not os.path.exists(ref):
        # Interpolate the path into the message; passing it as a second
        # argument left the "%s" placeholder unformatted.
        error("Cannot find reference file: %s" % ref)

    log("Reading reference genome chrm: %s" % chrm)
    i = 1
    found = False
    for l in drdcommon.xopen(ref):
        l = l.strip()
        if not l:
            continue  # blank lines carry no bases and have no l[0]
        if l[0] == '>':
            if found:
                break  # next record: the wanted chromosome is complete
            header = l[1:]
            # Accept ">chrm" as well as ">chrm description".
            found = header == chrm or header.split()[:1] == [chrm]
            continue
        if not found:
            continue  # sequence of another record, never count it

        for bp in l:
            if bp.upper() != 'N':
                ds[0][i] = 1
            if i % 10000000 == 0:
                sys.stderr.write("MEM: %s nbp: %s\r" % (drdcommon.memory_usage(), i))
            i += 1

    if not found:
        error("\nCould not find chrm in file. Bailing out.")
    # i is the next free position, so i - 1 bases were read.
    log("\n%s bp read." % (i - 1))
示例#46
0
def parse_args():
    """Parse the command line and open the input file.

    Returns:
        The argparse namespace. ``input_fn`` is replaced by the result of
        ``xopen`` on the given name, ``output_fn`` falls back to
        ``sys.stdout`` when not given, ``resolution`` is an int or None and
        ``threshold`` a float.
    """
    parser = argparse.ArgumentParser(description='cnv pipeline')
    parser.add_argument('-i',
                        '--input_fn',
                        metavar='input_fn',
                        required=True,
                        dest='input_fn',
                        action='store',
                        help='input data file')

    parser.add_argument('-o',
                        '--output_fn',
                        metavar='output_fn',
                        required=False,
                        dest='output_fn',
                        action='store',
                        help='output data file')

    # dest must be an attribute name; the descriptive sentence that was
    # stored there made the value unreachable as ``args.resolution``.
    parser.add_argument('-r',
                        '--resolution',
                        metavar='resolution',
                        required=False,
                        dest='resolution',
                        action='store',
                        type=int,
                        help='Change the resolution of the resulting')

    parser.add_argument('-t',
                        '--threshold',
                        metavar='threshold',
                        required=True,
                        dest='threshold',
                        action='store',
                        type=float,
                        help='read depth threashold for calling an event')

    args = parser.parse_args()
    args.input_fn = xopen(args.input_fn)
    if not args.output_fn:
        args.output_fn = sys.stdout
    return args
示例#47
0
def loadChrm(ds, ref, chrm):
    """Mark the non-N positions of chromosome ``chrm`` in ``ds[0]``.

    Scans the FASTA file ``ref`` for the record whose header names
    ``chrm`` and sets ``ds[0][i] = 1`` for every base that is not 'N'/'n',
    with ``i`` counting bases from 1. Reading stops at the next header.
    ``error`` is called when the file is missing or holds no such record.
    """
    if not os.path.exists(ref):
        # Interpolate the path into the message; passing it as a second
        # argument left the "%s" placeholder unformatted.
        error("Cannot find reference file: %s" % ref)

    log("Reading reference genome chrm: %s" % chrm)
    i = 1
    found = False
    for l in drdcommon.xopen(ref):
        l = l.strip()
        if not l:
            continue  # blank lines carry no bases and have no l[0]
        if l[0] == '>':
            if found:
                break  # next record: the wanted chromosome is complete
            header = l[1:]
            # Accept ">chrm" as well as ">chrm description".
            found = header == chrm or header.split()[:1] == [chrm]
            continue
        if not found:
            continue  # sequence of another record, never count it

        for bp in l:
            if bp.upper() != 'N':
                ds[0][i] = 1
            if i % 10000000 == 0:
                sys.stderr.write("MEM: %s nbp: %s\r" %
                                 (drdcommon.memory_usage(), i))
            i += 1

    if not found:
        error("\nCould not find chrm in file. Bailing out.")
    # i is the next free position, so i - 1 bases were read.
    log("\n%s bp read." % (i - 1))
示例#48
0
def parse_args():
  """Parse the command line and open the input file.

  Returns:
      The argparse namespace. ``input_fn`` is replaced by the result of
      ``xopen`` on the given name, ``output_fn`` falls back to
      ``sys.stdout`` when not given, ``resolution`` is an int or None and
      ``threshold`` a float.
  """
  parser = argparse.ArgumentParser(description='cnv pipeline')
  parser.add_argument('-i', '--input_fn', metavar='input_fn', required=True,
                        dest='input_fn', action='store',
                        help='input data file')

  parser.add_argument('-o', '--output_fn', metavar='output_fn', required=False,
                        dest='output_fn', action='store',
                        help='output data file')

  # dest must be an attribute name; the descriptive sentence that was stored
  # there made the value unreachable as ``args.resolution``.
  parser.add_argument('-r', '--resolution', metavar='resolution', required=False,
                        dest='resolution', action='store', type=int,
                        help='Change the resolution of the resulting')

  parser.add_argument('-t', '--threshold', metavar='threshold', required=True,
                        dest='threshold', action='store', type=float,
                        help='read depth threashold for calling an event')

  args = parser.parse_args()
  args.input_fn = xopen(args.input_fn)
  if not args.output_fn:
    args.output_fn = sys.stdout
  return args
示例#49
0
def method1(input1, input2):
    """
           4      3              5            1
    i1  |------|-------|------------------|--------|
              7                10               1
    i2  |----------|-------------------------|-----|
             3.5             4                  1
    out |----------|-------------------------|-----|
    """
    a = np.zeros(shape=(three_hundre_mil))

    # Save the metric values for all the bp from the stdin input
    working_chrm = ""
    for i in input1:
        working_chrm, start, stop, val = i.split()
        for j in range(int(start), int(stop)+1):
            a[j] = val

    # Iterate over second output and generate new windows for first input
    for i in drd.xopen(input2):
        chrm, start, stop, val = i.split()
        if working_chrm == chrm:
            s, e = int(start), int(stop)
            print "%s\t%s\t%s\t%s" % (chrm, start, stop, int(np.median(a[s:e+1])))
示例#50
0
      if num_ns > _max:
        return True
  return False

# Validate the call: three arguments plus a FASTA stream on stdin.
if len(sys.argv) < 4 or not drdcommon.data_in_stdin():
  sys.stderr.write("cat ref.fa | tool <n_events> <chrm> <chrm_size>" + "\n")
  sys.exit(1)

_ = sys.argv
n_events, chrm, chrm_size = int(_[1]), _[2], int(_[3])

# Store N locations
sys.stderr.write("Loading N locations ..." + "\n")
ref = BitMask()
i = 0  # 0-based position of the current base within the sequence
for l in drdcommon.xopen("-"):
  # FASTA header lines start with '>' and hold no bases.
  if l[0] != '>':
    for c in l.rstrip():
      if c.upper() == 'N':
        ref.set(i)
      # Advance once per base so the mask is indexed by base position,
      # not by line number.
      i += 1

# Generate events
sys.stderr.write("Generating events ..." + "\n")
coor = next_coor(100)
i = 0
while (i < n_events):
  # chrm start end 0..n
  s = next_size()
  if coor + s < chrm_size:
    if i % 2 == 0: # deletion
示例#51
0
#!/usr/bin/env python

import sys
from drdcommon import xopen

# Minimum number of per-sample columns that must be > 0 for a row to go
# to stdout; rows below it are written to stderr instead.
min_num_samples = int(sys.argv[1])

first_line = True
for l in xopen("-"):
    if first_line:
        # The first line is a header and is dropped.
        first_line = False
        continue

    # chrm start end gene exon_number transcript_number <per-sample values ...>
    fields = l.strip().split("\t")
    num = sum(1 for value in fields[6:] if int(value) > 0)
    out = "\t".join(fields[0:6]) + "\t" + str(num) + "\n"

    target = sys.stdout if num >= min_num_samples else sys.stderr
    target.write(out)
示例#52
0
    s2   |------|-----------------------------|-----------|

    out  |------|----|--------------|---------|-----------|
           4,5    2,5      10,2         4,2       4,6
    """
    log("Creating arrays for first stream")
    a_s1_vals, a_s1_wins = compute_vals_wins(stream1, working_chrm)
    log("Creating arrays for second stream")
    a_s2_vals, a_s2_wins = compute_vals_wins(stream2, working_chrm)

    log("Finding coordinate locations")
    o_wins = a_s1_wins | a_s2_wins
    log("Iterating over %s windows" % len(o_wins))
    _first, _prev = True, None
    for coor in np.where(o_wins == 1)[0]:
        if _first:
            _prev = coor
            _first = False
        else:
            s, e = _prev, coor
            print "%s\t%s\t%s\t%s\t%s" % (working_chrm, s, e, cv(a_s1_vals, s, e), cv(a_s2_vals, s, e))
            _prev = coor

if __name__ == "__main__":
    # method1(drd.xopen(sys.argv[1]), drd.xopen(sys.argv[2]))
    # Expect two bed files and a chromosome name.
    if len(sys.argv) != 4:
        sys.stderr.write("Usage: tool <bed_file1> <bed_file2> <chromosome>\n")
        # sys.exit is always available; the bare exit() helper is only
        # installed by the site module.
        sys.exit(1)
    method2(drd.xopen(sys.argv[1]), drd.xopen(sys.argv[2]), sys.argv[3])

示例#53
0
from pandas import Series

# Two positional arguments are required: the targets bed file and the
# per-base coverage bed file.
if len(sys.argv) != 3:
    sys.stderr.write("tool <target/exons bed file> <bed base coverage>" + "\n")
    sys.exit(1)

out, err = sys.stdout.write, sys.stderr.write

_ = sys.argv
fn_targets, fn_base_cov = _[1:3]

err("Loading exons/targets\n")
depth = {}  # chrm -> {position: 0} for every base covered by a target
n = 0       # number of target lines read
for t in drdcommon.xopen(fn_targets):
    l = t.strip()
    sl = l.split()  # splitted line
    chrm, start, end = sl[0:3]
    per_chrm = depth.setdefault(chrm, {})
    # NOTE(review): start is a str while the keys stored below are ints, so
    # this check looks like it can never fire -- confirm the intent.
    if start in per_chrm:
        raise Exception('Two exons starting in same location! bailing out: ' + l)
    for i in range(int(start), int(end) + 1):
        per_chrm[i] = 0
    n += 1
err("%s\n" % n)

err("Reading read depth bed\n")
total = n
hits = 0
示例#54
0
文件: means.py 项目: drio/py.analysis
        o = ""
        o += "chrm start end gene exon_number transcript_number "
        for i in ids:
            o += ("%s " % i)
        print re.sub("\s", self.sep, o)
        for k, means in self.d.items():
            o = k + " "
            for _id in ids:
                if _id not in means:
                    means[_id] = 0
                o += "%s " % means[_id]
            print re.sub("\s", self.sep, o)

# Load every "<digits>.*.pass.gz" file of the current directory into d,
# remembering the numeric ids of the files that had at least one line.
d = Data()
ids = []
for f in glob.glob("*.pass.gz"):
    # Raw string: "\d" and "\." are regex escapes, not string escapes.
    match = re.search(r"^(\d+)\.", f)
    if match:
        _id = int(match.group(1))
        first_line = True
        for l in xopen(f):
            if first_line:
                # The first line is a header: report the id and skip it.
                sys.stderr.write("%s\n" % _id)
                ids.append(_id)
                first_line = False
                continue
            else:
                d.add(l, _id)

d.dump(ids)
示例#55
0
#!/usr/bin/env python
#
# Given the enumeration from enumerate.sh as input,
# report, per each location, how many samples we have
# each filtering category
#
import sys
import drdcommon

d = {}
_a = {}
for l in drdcommon.xopen("-"):
    _id, chrm, start, end, _type = l.rstrip().split("\t")

    if _id not in _a:
        sys.stderr.write(_id + "\n")
        _a[_id] = {}

    k = "%s_%s_%s" % (chrm, start, end)

    if k not in _a[_id]:
        _a[_id][k] = True

        if not k in d:
            d[k] = {}
            for _t in ["min", "max", "pass"]:
                d[k][_t] = 0

        d[k][_type] += 1

print "chrm start stop min max pass".replace("\s", "\t")
示例#56
0
def load_data(sid, fn, h):
  """Store the values of file ``fn`` in ``h`` under sample id ``sid``.

  Every line must hold six whitespace-separated fields. A leading
  "chr"/"Chr"/"chrm"/"Chrm" is removed from field 0 and the entry
  ``h[chrm][int(field 1)][sid]`` is set to ``(int(field 4), float(field 5))``.
  """
  chrm_prefix = re.compile(r'(^[cC]hrm?)')
  for record in drdcommon.xopen(fn):
    # chrm start end n_reads n_reads_ref log2ratio
    # NOTE(review): per the column list above, the stored count (fifth
    # field) would be n_reads_ref -- confirm which count is intended.
    chrm, start, _end, _fourth, fifth, log2ratio = record.split()
    chrm_key = chrm_prefix.sub('', chrm)
    h[chrm_key][int(start)][sid] = (int(fifth), float(log2ratio))
示例#57
0
# Load data for all genes all samples
# data: "chrm start end" -> {"coor": [chrm, start, end, gene], "samples": {id: True}}
data = {}
ids = []
for f in drdcommon.files_in_dir(".", file_pattern):
    # extract sample id
    match = re_id.search(f)
    if not match:
        raise Exception("Problems extracting id for: " + f)
    _id = match.group(1)

    err("Working on id: %s\n" % (_id))
    ids.append(_id)

    for line_no, l in enumerate(drdcommon.xopen(f)):
        if line_no == 0:
            continue  # the first line of each file is skipped

        chrm, start, end, g_name = l.strip().split()
        start, end = int(start), int(end)

        k = "%s %s %s" % (chrm, start, end)
        if k not in data:
            data[k] = {"coor": [chrm, start, end, g_name], "samples": {}}

        # This sample (_id) passes the filters for that gene
        data[k]["samples"][_id] = True
示例#58
0
import re

def help(msg):
  """Write ``msg`` as an error plus the usage line to stderr, then exit 1."""
  banner = "ERROR: " + msg + "\n"
  usage_line = "Usage: cat gtf.txt | tool list_genes_names.txt > genes.coor.bed\n"
  for text in (banner, usage_line):
    sys.stderr.write(text)
  sys.exit(1)

# Main
# Both a stdin stream and exactly one argument (the gene list) are required.
stdin_ready = drdcommon.data_in_stdin()
if not stdin_ready:
  help("Need data in stdin")

if len(sys.argv) != 2:
  help("Invalid list of arguments")

# The first whitespace-separated token of every line is a gene name.
gene_names = {}
for l in drdcommon.xopen(sys.argv[1]):
  name = l.split()[0]
  gene_names[name] = True
n_genes = len(gene_names)
drdcommon.log("%s genes loaded." % n_genes)

for l in drdcommon.xopen("-"):
  s = l.split("\t")
  if s[2] == "CDS":
    chrm, start, end, _list = s[0], s[3], s[4], s[8]
    g_name, e_name, t_name = None, None, None

    for e in _list.split(";"):
      _ = e.split()
      if len(_) == 2 and _[0] == "transcript_name":
        t_name = re.sub('\"', '', _[1])
      if len(_) == 2 and _[0] == "gene_name":