Example #1
def test_cleanup():
    """
    make sure the tempdir and cleanup work
    """
    assert os.path.abspath(pybedtools.get_tempdir()) == os.path.abspath('.')

    # make a fake tempfile, not created during this pybedtools session
    testfn = 'pybedtools.TESTING.tmp'
    os.system('touch %s' % testfn)
    assert os.path.exists(testfn)

    # make some temp files
    a = pybedtools.BedTool(os.path.join(testdir, 'data', 'a.bed'))
    b = pybedtools.BedTool(os.path.join(testdir, 'data', 'b.bed'))
    c = a.intersect(b)

    # after standard cleanup, c's fn should be gone but the fake one still
    # there...
    pybedtools.cleanup(verbose=True)
    assert os.path.exists(testfn)
    assert not os.path.exists(c.fn)

    # Unless we force the removal of all temp files.
    pybedtools.cleanup(remove_all=True)
    assert not os.path.exists(testfn)

    # a.fn and b.fn better be there still!
    assert os.path.exists(a.fn)
    assert os.path.exists(b.fn)
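
The pattern exercised here (point pybedtools at a dedicated temp directory, then clean up afterwards) is useful on its own. A minimal sketch, assuming nothing beyond the public pybedtools helpers used above:

import os
import tempfile
import pybedtools

# send all pybedtools temp files to a dedicated directory
workdir = tempfile.mkdtemp(prefix='pybedtools_work_')
pybedtools.set_tempdir(workdir)
assert os.path.abspath(pybedtools.get_tempdir()) == workdir

# ... create BedTool objects, intersect them, etc. ...

# remove this session's temp files; remove_all=True also deletes
# leftover 'pybedtools.*.tmp' files from earlier sessions
pybedtools.cleanup(remove_all=True)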
Example #2
def test_stream():
    """
    Stream and file-based equality, both whole-file and Interval by
    Interval
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)

    # make an unwriteable dir...
    orig_tempdir = pybedtools.get_tempdir()
    if os.path.exists('unwriteable'):
        os.system('rm -rf unwriteable')
    os.system('mkdir unwriteable')
    os.system('chmod -w unwriteable')

    # ...set that to the new tempdir
    pybedtools.set_tempdir('unwriteable')

    # this should really not be written anywhere
    d = a.intersect(b, stream=True)

    assert_raises(NotImplementedError, c.__eq__, d)
    d_contents = d.fn.read()
    c_contents = open(c.fn).read()
    assert d_contents == c_contents

    # reconstruct d and check Interval-by-Interval equality
    pybedtools.set_tempdir('unwriteable')
    d = a.intersect(b, stream=True)

    for i, j in zip(c, d):
        assert str(i) == str(j)

    # Now do something similar with GFF files.
    a = pybedtools.example_bedtool('a.bed')
    f = pybedtools.example_bedtool('d.gff')

    # file-based
    pybedtools.set_tempdir(orig_tempdir)
    g1 = f.intersect(a)

    # streaming
    pybedtools.set_tempdir('unwriteable')
    g2 = f.intersect(a, stream=True)

    for i,j in zip(g1, g2):
        assert str(i) == str(j)

    # this was segfaulting at one point, just run to make sure
    g3 = f.intersect(a, stream=True)
    for i in iter(g3):
        print(i)

    for row in f.cut(range(3), stream=True):
        row[0], row[1], row[2]
        assert_raises(IndexError, row.__getitem__, 3)

    pybedtools.set_tempdir(orig_tempdir)
    os.system('rm -fr unwriteable')
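
The property under test: with stream=True the result is an open stream rather than a file in the tempdir, so it can be consumed only once. A short sketch of persisting a streamed result when it must be iterated again ('intersection.bed' is an arbitrary output name):

import pybedtools

a = pybedtools.example_bedtool('a.bed')
b = pybedtools.example_bedtool('b.bed')

# streamed result: nothing is written to the tempdir
d = a.intersect(b, stream=True)

# persist it explicitly so it can be re-read later
d_saved = d.saveas('intersection.bed')
for interval in d_saved:
    print(interval)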
Example #3
def cast(bedx, new_type):

	allowed_types = gqltypes.flat_bed_types

	if type(bedx) not in allowed_types:
		raise ToolsException('Type mismatch in CAST. ' +\
					bedx.name + ' not supported.',\
					'cast')

	if type(bedx) == gqltypes.BED12 and \
				new_type not in \
				(gqltypes.BED3,gqltypes.BED4,gqltypes.BED6,gqltypes.BED12):
		raise ToolsException (\
				'Type mismatch in CAST. Cannot cast from ' + \
				bedx.name + ' to ' + new_type.name,\
				'cast')

	elif type(bedx) == gqltypes.BED6 and \
				new_type not in \
				(gqltypes.BED3,gqltypes.BED4,gqltypes.BED6):
		raise ToolsException (\
				'Type mismatch in CAST. Cannot cast from ' + \
				bedx.name + ' to ' + new_type.name,\
				'cast')

	elif type(bedx) == gqltypes.BED4 and \
				new_type not in \
				(gqltypes.BED3,gqltypes.BED4):
		raise ToolsException (\
				'Type mismatch in CAST. Cannot cast from ' + \
				bedx.name + ' to ' + new_type.name,\
				'cast')

	elif type(bedx) == gqltypes.BED3 and \
				new_type not in \
				(gqltypes.BED3,):
		raise ToolsException (\
				'Type mismatch in CAST. Cannot cast from ' + \
				bedx.name + ' to ' + new_type.name,\
				'cast')

	start_range = 0
	end_range = new_type.cols

	new_file_name = get_temp_file_name(pybedtools.get_tempdir(), \
									 'cast', \
									 'tmp')

	new_file = new_type(new_file_name, True)
	add_tmp_file(new_file)

	in_file = open(bedx.val, 'r')
	out_file = open(new_file_name, 'w')
	for line in in_file:
		cols = line.rstrip().split('\t')
		out_file.write("\t".join(cols[start_range:end_range]) + "\n")
	in_file.close()
	out_file.close()

	return new_file
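
Stripped of the type checks, the cast itself is a column truncation: each BED type is a prefix of the wider ones, so writing the first new_type.cols fields of every row performs the downcast. A sketch of the same effect using pybedtools directly (any BED file with at least three columns would do):

import pybedtools

bed = pybedtools.example_bedtool('a.bed')
# keep only the first three columns, i.e. downcast to BED3
bed3 = bed.cut(range(3))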
Example #4
def test_stream():
    """
    Stream and file-based equality, both whole-file and Interval by
    Interval
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)

    # make an unwriteable dir...
    orig_tempdir = pybedtools.get_tempdir()
    if os.path.exists('unwriteable'):
        os.system('rm -rf unwriteable')
    os.system('mkdir unwriteable')
    os.system('chmod -w unwriteable')

    # ...set that to the new tempdir
    pybedtools.set_tempdir('unwriteable')

    # this should really not be written anywhere
    d = a.intersect(b, stream=True)

    assert_raises(NotImplementedError, c.__eq__, d)
    d_contents = d.fn.read()
    c_contents = open(c.fn).read()
    assert d_contents == c_contents

    # reconstruct d and check Interval-by-Interval equality
    pybedtools.set_tempdir('unwriteable')
    d = a.intersect(b, stream=True)

    for i, j in zip(c, d):
        assert str(i) == str(j)

    # Now do something similar with GFF files.
    a = pybedtools.example_bedtool('a.bed')
    f = pybedtools.example_bedtool('d.gff')

    # file-based
    pybedtools.set_tempdir(orig_tempdir)
    g1 = f.intersect(a)

    # streaming
    pybedtools.set_tempdir('unwriteable')
    g2 = f.intersect(a, stream=True)

    for i, j in zip(g1, g2):
        assert str(i) == str(j)

    # this was segfaulting at one point, just run to make sure
    g3 = f.intersect(a, stream=True)
    for i in iter(g3):
        print(i)

    for row in f.cut(range(3), stream=True):
        row[0], row[1], row[2]
        assert_raises(IndexError, row.__getitem__, 3)

    pybedtools.set_tempdir(orig_tempdir)
    os.system('rm -fr unwriteable')
Example #5
    def print_inputs_summary(self):
        print('VCF\tis:\t', self.vcf_file)
        print('annof\tis:\t', self.annof)
        print('outvcfname\tis:\t', self.outvcfname)
        print('get_inbtw_genes\tis:\t', self.get_genes_btw_bps)
        print('threads\tis:\t', self.threads)
        print("pyBedtools Temporary Dir:\t{}".format(str(get_tempdir())))
        # distinguish the user-configured dir from the active pybedtools one
        print("Configured Temporary Dir:\t{}".format(str(self.tempdir)))
Example #6
def getoptions():
    desc = "Intersect BED file with conservation bedGraph files " + \
           "and return conservation scores"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('bedfile',
                        metavar='BED',
                        nargs=1,
                        help="Input BED file")
    parser.add_argument('consfiles',
                        metavar='BEDGRAPH',
                        nargs='+',
                        help="Conservation bedGraph files to intersect. "
                        "e.g. could be all chr*.bedGraph.gz files")
    parser.add_argument('-s',
                        '--summarize',
                        action="store_true",
                        help="Summarize conservation scores by taking the "
                        "average per BED interval [%(default)s]")
    parser.add_argument('-c',
                        '--cores',
                        type=int,
                        default=4,
                        help="Number of processing cores [%(default)s]")
    parser.add_argument('-d',
                        '--cores2',
                        type=int,
                        default=2,
                        help="Number of processing cores for summary step. "
                        "[%(default)s]")
    parser.add_argument('-S',
                        '--splitdir',
                        type=str,
                        default=None,
                        help="Directory to keep intersections separate for "
                        "each conservation file. e.g. chromosome-specific. "
                        "Output won't be written to stdout. [%(default)s]")
    parser.add_argument('-t',
                        '--temp',
                        type=str,
                        help="set temp directory [{}]".format(
                            pybedtools.get_tempdir()))
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='%(prog)s ' + __version__)
    args = parser.parse_args()

    if args.splitdir and not os.path.exists(args.splitdir):
        os.makedirs(args.splitdir)

    return args
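
A hypothetical invocation of the resulting parser (the script name and file names are placeholders):

python get_conservation.py --summarize --cores 8 --temp /scratch/tmp input.bed chr*.bedGraph.gz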
Example #7
def get_intersect_result(bed_pair):
	A = bed_pair[0]
	B = bed_pair[1]
	AB = bed_pair[2]

	R_file_name = get_temp_file_name(pybedtools.get_tempdir(), \
									 'unary_intersect_beds', \
									 'tmp')

	offset = A.cols

#	if A.name == 'BED3':
#		offset = 3
#	elif A.name == 'BED6':
#		offset = 6
#	if A.name == 'BED12':
#		offset = 12 

	curr_line = 1

	out_file = open(R_file_name, 'w')
	in_file = open(AB.val, 'r')
	for line in in_file:
		cols = line.rstrip().split('\t')

		bed6_1 = cols[0:6]
		bed6_2 = cols[(offset + 0):(offset + 6)]

		bed6_r = [ bed6_1[0],
				   str( max(int(bed6_1[1]), int(bed6_2[1])) ),
				   str( min(int(bed6_1[2]), int(bed6_2[2])) ),
				   str(0),
				   str(curr_line),
				   bed6_1[5],
		]

		out_file.write("\t".join(bed6_r) + "\n")

		curr_line+=1
	in_file.close()
	out_file.close()

	R = gqltypes.BED6(R_file_name, True)

	add_tmp_file(R)

	return R
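
The body of the loop applies the standard interval-overlap formula: two intervals overlap on [max(start1, start2), min(end1, end2)). A worked example:

# chr1:100-200 intersected with chr1:150-300
start = max(100, 150)  # 150
end = min(200, 300)    # 200
# the overlap is chr1:150-200; if start >= end the intervals are disjoint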
Example #8
    def test_tempfile_management(self):
        R_file_name = gqltools.get_temp_file_name(pybedtools.get_tempdir(), "unittest", "tmp")
        r = random.randint(1, sys.maxsize)
        f = open(R_file_name, "w")
        f.write(str(r))
        f.close()

        # test to see if the file was created
        self.assertTrue(os.path.isfile(R_file_name))

        R = gqltypes.BED6(R_file_name, True)

        gqltools.add_tmp_file(R)

        gqltools.clear_tmp_files()

        self.assertEqual(os.path.isfile(R_file_name), False)
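
get_temp_file_name and add_tmp_file are gqltools helpers, but the pattern they implement (register each temp file on creation, delete them all in one call) is easy to sketch independently; this is an illustration, not the gqltools implementation:

import os
import tempfile

_tmp_files = []

def make_tmp_file(prefix='gql', suffix='.tmp'):
    # create a named temp file and remember it for later cleanup
    fd, path = tempfile.mkstemp(prefix=prefix, suffix=suffix)
    os.close(fd)
    _tmp_files.append(path)
    return path

def clear_tmp_files():
    # delete every registered temp file and reset the registry
    for path in _tmp_files:
        if os.path.isfile(path):
            os.remove(path)
    del _tmp_files[:]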
Example #9
def test_call():
    tmp = os.path.join(pybedtools.get_tempdir(), 'test.output')
    from pybedtools.helpers import call_bedtools, BEDToolsError
    assert_raises(BEDToolsError, call_bedtools, *(['intersectBe'], tmp))

    a = pybedtools.example_bedtool('a.bed')

    # momentarily redirect stderr to file so the error message doesn't spew all
    # over the place when testing
    orig_stderr = sys.stderr
    sys.stderr = open(a._tmp(), 'w')
    #assert_raises(BEDToolsError, a.intersect, a=a.fn, b=a.fn, z=True)
    sys.stderr = orig_stderr

    pybedtools.set_bedtools_path('nonexistent')
    a = pybedtools.example_bedtool('a.bed')
    assert_raises(OSError, a.intersect, a)
    pybedtools.set_bedtools_path()
    assert a.intersect(a, u=True) == a
Example #10
def test_call():
    tmp = os.path.join(pybedtools.get_tempdir(), "test.output")
    from pybedtools.helpers import call_bedtools, BEDToolsError

    with pytest.raises(BEDToolsError):
        call_bedtools(*(["intersectBe"], tmp))

    a = pybedtools.example_bedtool("a.bed")

    # momentarily redirect stderr to file so the error message doesn't spew all
    # over the place when testing
    orig_stderr = sys.stderr
    sys.stderr = open(a._tmp(), "w")
    sys.stderr = orig_stderr

    pybedtools.set_bedtools_path("nonexistent")
    a = pybedtools.example_bedtool("a.bed")
    with pytest.raises(NotImplementedError):
        a.intersect(a)
    pybedtools.set_bedtools_path()
    a = pybedtools.example_bedtool("a.bed")
    assert a.intersect(a, u=True) == a
Example #11
def subset_conservation(bg, bd):
    """
    Perform intersectBed between BED file and chromosome-specific
    bedGraph file
    """
    # Load bedGraph
    cons = pybedtools.BedTool(bg)
    chrom, track = get_chrom_from_file(bg)

    # Filter bed file
    chrom_bd = subset_chrom(bd, chrom)
    if chrom_bd.file_type == 'empty':
        eprint("Skipping %s" % chrom)
        fn = None
    else:
        assert len(chrom_bd) > 0
        outfile = os.path.join(pybedtools.get_tempdir(),
                               'pybedtools.%s.%s.tmp' % (chrom, track))
        inter = chrom_bd.intersect(cons, wo=True, sorted=True, output=outfile)
        fn = inter.fn

    os.remove(chrom_bd.fn)
    return fn
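
Each bedGraph file is processed independently, so calls like this one parallelize naturally. A sketch with multiprocessing, assuming bg_files is a list of bedGraph paths and bd is picklable (e.g. a file path rather than an open BedTool):

import functools
import multiprocessing

with multiprocessing.Pool(4) as pool:
    # run subset_conservation(bg, bd) once per bedGraph file
    result_files = pool.map(
        functools.partial(subset_conservation, bd=bd), bg_files)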
Example #12
def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
               scan_window_size, scan_maf, min_mapq,
               min_dp, max_dp, good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao,
               ins_min_af, del_min_af, del_merge_min_af,
               ins_merge_min_af, merge_r, truth_vcf, tsv_batch_size,
               matrix_width, matrix_base_pad, min_ev_frac_per_col,
               ensemble_tsv, long_read, restart, first_do_without_qual, 
               filter_duplicate,
               num_threads,
               scan_alignments_binary,):
    logger = logging.getLogger(preprocess.__name__)

    logger.info("----------------------Preprocessing------------------------")
    if restart or not os.path.exists(work):
        os.mkdir(work)

    original_tempdir = pybedtools.get_tempdir()
    pybedtmp = os.path.join(work, "pybedtmp_preprocess")
    if not os.path.exists(pybedtmp):
        os.mkdir(pybedtmp)
    pybedtools.set_tempdir(pybedtmp)

    if not os.path.exists(tumor_bam):
        logger.error("Aborting!")
        raise Exception("No tumor BAM file {}".format(tumor_bam))
    if not os.path.exists(normal_bam):
        logger.error("Aborting!")
        raise Exception("No normal BAM file {}".format(normal_bam))
    if not os.path.exists(tumor_bam + ".bai"):
        logger.error("Aborting!")
        raise Exception(
            "No tumor .bai index file {}".format(tumor_bam + ".bai"))
    if not os.path.exists(normal_bam + ".bai"):
        logger.error("Aborting!")
        raise Exception(
            "No normal .bai index file {}".format(normal_bam + ".bai"))

    ensemble_bed = None
    if ensemble_tsv:
        ensemble_bed = os.path.join(work, "ensemble.bed")
        logger.info("Extract ensemble info.")
        if restart or not os.path.exists(ensemble_bed):
            ensemble_bed = extract_ensemble(work, ensemble_tsv)

    merge_d_for_short_read = 100
    candidates_split_regions = []
    dbsnp_regions_q = []
    ensemble_beds = []
    if not long_read and first_do_without_qual:
        logger.info("Scan tumor bam (first without quality scores).")
        work_tumor_without_q = os.path.join(work, "work_tumor_without_q")
        if restart or not os.path.exists(work_tumor_without_q):
            os.mkdir(work_tumor_without_q)
        filtered_candidates_vcf_without_q = os.path.join(
            work_tumor_without_q, "filtered_candidates.vcf")

        tumor_outputs_without_q = process_split_region("tumor", work_tumor_without_q, region_bed, reference, mode,
                                                       tumor_bam, dbsnp, scan_window_size, scan_maf, min_mapq,
                                                       filtered_candidates_vcf_without_q, min_dp, max_dp, 
                                                       filter_duplicate,
                                                       good_ao, min_ao,
                                                       snp_min_af, -10000, snp_min_ao,
                                                       ins_min_af, del_min_af, del_merge_min_af,
                                                       ins_merge_min_af, merge_r,
                                                       scan_alignments_binary, restart, num_threads,
                                                       calc_qual=False, dbsnp_regions=[])
        tumor_counts_without_q, split_regions, filtered_candidates_vcfs_without_q, dbsnp_regions_q = tumor_outputs_without_q

        if ensemble_tsv:
            ensemble_beds = get_ensemble_beds(
                work, reference, ensemble_bed, split_regions, matrix_base_pad, num_threads)
        candidates_split_regions = extract_candidate_split_regions(
            work_tumor_without_q, filtered_candidates_vcfs_without_q, split_regions, ensemble_beds,
            reference, matrix_base_pad, merge_d_for_short_read)
    work_tumor = os.path.join(work, "work_tumor")
    if restart or not os.path.exists(work_tumor):
        os.mkdir(work_tumor)
    filtered_candidates_vcf = os.path.join(
        work_tumor, "filtered_candidates.vcf")

    logger.info("Scan tumor bam (and extracting quality scores).")
    tumor_outputs = process_split_region("tumor", work_tumor, region_bed, reference, mode,
                                         tumor_bam, dbsnp, scan_window_size, scan_maf, min_mapq,
                                         filtered_candidates_vcf, min_dp, max_dp, 
                                         filter_duplicate,
                                         good_ao, min_ao,
                                         snp_min_af, snp_min_bq, snp_min_ao,
                                         ins_min_af, del_min_af, del_merge_min_af,
                                         ins_merge_min_af, merge_r,
                                         scan_alignments_binary, restart, num_threads,
                                         calc_qual=True,
                                         regions=candidates_split_regions,
                                         dbsnp_regions=dbsnp_regions_q)
    tumor_counts, split_regions, filtered_candidates_vcfs, _ = tumor_outputs

    if ensemble_tsv and not ensemble_beds:
        ensemble_beds = get_ensemble_beds(
            work, reference, ensemble_bed, split_regions, matrix_base_pad, num_threads)

    if (not long_read):
        candidates_split_regions = extract_candidate_split_regions(
            work_tumor, filtered_candidates_vcfs, split_regions, ensemble_beds,
            reference, matrix_base_pad, merge_d_for_short_read)

    if not candidates_split_regions:
        candidates_split_regions = split_regions
    work_normal = os.path.join(work, "work_normal")
    if restart or not os.path.exists(work_normal):
        os.mkdir(work_normal)
    logger.info("Scan normal bam (and extracting quality scores).")
    normal_counts, _, _, _ = process_split_region("normal", work_normal, region_bed, reference, mode, normal_bam,
                                                  None, scan_window_size, 0.2, min_mapq,
                                                  None, min_dp, max_dp, 
                                                  filter_duplicate,
                                                  good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao,
                                                  ins_min_af, del_min_af, del_merge_min_af,
                                                  ins_merge_min_af, merge_r,
                                                  scan_alignments_binary, restart, num_threads,
                                                  calc_qual=True,
                                                  regions=candidates_split_regions,
                                                  dbsnp_regions=[])

    work_dataset = os.path.join(work, "dataset")
    if restart or not os.path.exists(work_dataset):
        os.mkdir(work_dataset)
    logger.info("Generate dataset.")
    for i, (tumor_count, normal_count, filtered_vcf, candidates_split_region) in enumerate(zip(tumor_counts, normal_counts, filtered_candidates_vcfs, candidates_split_regions)):
        logger.info("Dataset for region {}".format(candidates_split_region))
        work_dataset_split = os.path.join(work_dataset, "work.{}".format(i))
        if restart or not os.path.exists("{}/done.txt".format(work_dataset_split)):
            if os.path.exists(work_dataset_split):
                shutil.rmtree(work_dataset_split)
            os.mkdir(work_dataset_split)
            generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf,
                                    candidates_split_region, tumor_count, normal_count, reference,
                                    matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads,
                                    ensemble_beds[i] if ensemble_tsv else None, tsv_batch_size)

    shutil.rmtree(pybedtmp)
    pybedtools.set_tempdir(original_tempdir)

    logger.info("Preprocessing is Done.")
Example #13
def mergemin_bedn(bednfile):

	allowed_types = (gqltypes.BEDN,)

	if type(bednfile) not in allowed_types:
		raise ToolsException('Type mismatch in MERGEMIN. ' +
				bednfile.name + ' not supported.',
				'mergemin_bedn')
	
	pybedtools.settings.KEEP_TEMPFILES=True

	#relative positions: starts 1, ends 2, name 3, score 4
	o_starts = []
	o_ends = []
	o_names = []
	o_scores = []
	o_strands = []
	
	bed_types = bednfile.types
	curr_offset = 0
	for bed_type in bed_types:
		o_starts.append(curr_offset + 1)
		o_ends.append(curr_offset + 2)
		if bed_type in (gqltypes.BED6, gqltypes.BED12):
			o_names.append(curr_offset + 3)
			o_scores.append(curr_offset + 4)
			o_strands.append(curr_offset + 5)
		if bed_type == gqltypes.BED4:
			o_names.append(curr_offset + 3)

		curr_offset += bed_type.cols
#		if bed_type == 'BED3':
#			curr_offset += 3
#		elif bed_type == 'BED4':
#			curr_offset += 4
#		elif bed_type == 'BED6':
#			curr_offset += 6
#		elif bed_type == 'BED12':
#			curr_offset += 12 
#
	

	R_file_name = get_temp_file_name(pybedtools.get_tempdir(), \
									 'mergemin', \
									 'tmp')

	in_file = open(bednfile.val, 'r')
	out_file = open(R_file_name, 'w')

	for line in in_file:
		cols = line.rstrip().split('\t')
		out_row = (cols[0], \
			str(max( [ int(cols[i]) for i in range(0,len(cols)) \
				if i in o_starts] )), \
			str(min( [ int(cols[i]) for i in range(0,len(cols)) \
				if i in o_ends] )), \
		)

		# if o_names is empty, then all of the beds were BED3
		if len(o_names) > 0 :
			out_row = out_row + ( \
					"::".join([ cols[i] for i in range(0,len(cols)) \
						if i in o_names]), \
					str(0), \
					random.choice([ cols[i] for i in range(0,len(cols)) \
						if i in o_strands]), \
			)
		out_file.write("\t".join(out_row) + "\n")
	in_file.close()
	out_file.close()

	new_type = gqltypes.BED6
	if len(o_names) == 0 :
		new_type = gqltypes.BED3

	#new_class = gsltypes.source_type_map(new_type)

	R = new_type(R_file_name, True)

	add_tmp_file(R)

	return R
Example #14
def filter_bedx(_N_list, filter_opts):
	pybedtools.settings.KEEP_TEMPFILES=True

	allowed_types = gqltypes.bed_types

	N_list = make_mixed_list(_N_list, allowed_types,'FILTER')

	input_types = []

	for bed in N_list:
		input_types.append(type(bed))

	output_type = ''
	if gqltypes.BED3 in input_types:
		output_type = gqltypes.BED3
	elif gqltypes.BED4 in input_types:
		output_type = gqltypes.BED4
	elif gqltypes.BED6 in input_types:
		output_type = gqltypes.BED6
	elif gqltypes.BED12 in input_types:
		output_type = gqltypes.BED12
	else:
		raise ToolsException(\
				'Output type could not be determined in FILTER.',\
				'filter_bedx')

	filter_file_name = get_temp_file_name(pybedtools.get_tempdir(), \
									 'filter_bedx', \
									 'tmp')
	filter_bedx=output_type(filter_file_name, True)
	add_tmp_file(filter_bedx)
	filter_file = open(filter_bedx.val, 'w')
	
	for bed in N_list:
		f = open(bed.val,'r')
		bed_type =  gqltypes.source_type_map[bed.name]
		for line in f:
			cols = line.rstrip().split('\t')
			keep_line = True
			for opt in filter_opts:
				bool_string = ""
				if opt not in bed_type.col:
					raise ToolsException(\
							'Invalid field for given filetype ' + \
							'in FILTER. ' + opt + ' and ' + bed_type.name,\
							'filter_bedx')
				opt_col = bed_type.col[opt]

				# build a boolean expression string, e.g. "True and False",
				# then eval it to decide whether the row passes the filter
				for test in filter_opts[opt]:
					if len(test) == 2:
						op = test[0]
						val = test[1]
						test = cols[opt_col]
						if type(val) is str:
							# quote the column value so eval compares strings
							test = '"' + str(test) + '"'
						result = eval(str(test) + op + str(val))
						bool_string = bool_string + str(result)
					else:
						# bare tokens are boolean connectives, e.g. 'and'/'or'
						bool_string = bool_string + test[0]
				keep_line = keep_line & eval(bool_string)
			if keep_line:
				filter_file.write(line)

	return filter_bedx
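
The eval-based comparison above executes arbitrary expression strings. A sketch of the same per-column test built on the operator module instead (the token set is an assumption; gqltools may accept different operators):

import operator

# map comparison tokens to functions instead of eval'ing strings
OPS = {'==': operator.eq, '!=': operator.ne,
       '<': operator.lt, '<=': operator.le,
       '>': operator.gt, '>=': operator.ge}

def passes(cell, op, val):
    # coerce the column text to the type of the literal, then compare
    return OPS[op](type(val)(cell), val)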
Example #15
def merge_beds(merge_type, _N_list, merge_opts):
	pybedtools.settings.KEEP_TEMPFILES=True

	allowed_types = gqltypes.bed_types

	N_list = make_mixed_list(_N_list, allowed_types,'MERGE')

	input_types = []

	for bed in N_list:
		input_types.append(type(bed))

	# Parse input arguments and add/modify default arguments
	# Default args
	valid_args = {'distance':'d', \
				  'score':'scores', \
				  'name':'nms', \
				  'stranded':'s'}

	score_functions = { 'MIN':'min', 'MAX':'max', 'SUM':'sum', \
			'MEAN':'mean', 'MEDIAN':'median', 'MODE':'mode', \
			'ANTIMODE':'antimode', 'COLLAPSE':'collapse',
			'COUNT':'count'}

	kwargs = {}

	for merge_opt in merge_opts:
		if not ( merge_opt in valid_args ):
			raise ToolsException('Invalid option in MERGE. ' + \
						merge_opt + ' not supported.',\
						'merge_beds')

		if merge_opt == 'score':
			if not ( merge_opts[ merge_opt ] in score_functions ) :
				raise ToolsException(\
						'SCORE function not supported by MERGE. ' + \
						merge_opts[ merge_opt ],
						'merge_beds')
			else:
				kwargs[ valid_args[ merge_opt ] ] = \
						score_functions[ merge_opts[ merge_opt ] ]
		elif merge_opt == 'stranded':
			if (gqltypes.BED3 in input_types ) or \
				( gqltypes.BED4 in input_types) or \
				( gqltypes.BED5 in input_types)  :
				raise ToolsException(\
						'Type mismatch in MERGE. Cannot match by ' + \
						'strand with given input types',\
						'merge_beds')
			kwargs[ valid_args[ merge_opt ] ] = True

		elif (merge_opt == 'distance'):
			if merge_type == 'flat':
				raise ToolsException('DISTANCE not supported for MERGEFLAT',\
						'merge_beds')
			elif merge_type == 'min':
				raise ToolsException('DISTANCE not supported for MERGEMIN',\
						'merge_beds')
			elif merge_type == 'max':
				raise ToolsException('DISTANCE not supported for MERGEMAX',\
						'merge_beds')
			else:
				kwargs[ valid_args[ merge_opt ] ] = merge_opts[ merge_opt ]

		elif (merge_opt == 'name'):
			if merge_opts[ merge_opt ] == 'COLLAPSE':
				kwargs[ valid_args[ merge_opt ] ] = True
			else :
				raise ToolsException(\
						'NAME function not supported by MERGE. ' + \
						merge_opts[ merge_opt ],
						'merge_beds')
	

	output_type = gqltypes.BED3
	if (len(kwargs) > 0) :
		output_type = gqltypes.BED6

	# merge the file
	merge_bed = pybedtools.BedTool()

	if merge_type == 'merge':
		#{{{ combine files into one 
		combo_file_name = get_temp_file_name(pybedtools.get_tempdir(), \
										 'merge_beds', \
										 'tmp')

		combo_file = open(combo_file_name, 'w')
		for bed in N_list:
			in_file = open(bed.val, 'r')
			for line in in_file:
				combo_file.write(line)
			in_file.close()
		combo_file.close()
		# register the combined file for cleanup (typed as the merged output)
		add_tmp_file(output_type(combo_file_name, True))
		# sort the combined file
		sorted_bed = pybedtools.BedTool(combo_file_name).sort()
		add_tmp_file(output_type(sorted_bed.fn, True))

		#}}}
		merged_bed = sorted_bed.merge(**kwargs)
	elif  merge_type in ['flat','min','max'] :
		#{{{ sort each file, make list of files
		# make sure all the input files are sorted
		sorted_beds = []
		sorted_bed_files = []
		for bed in N_list:
			sorted_bed = pybedtools.BedTool(bed.val).sort()
			add_tmp_file( eval( 'gqltypes.'+ bed.name + \
				'("' + sorted_bed.fn + '",True)' ) )
			sorted_beds.append(sorted_bed)
			sorted_bed_files.append(sorted_bed.fn)

		kwargs['gql'] = True
		kwargs['i'] = sorted_bed_files
		#}}}
		x = pybedtools.BedTool()
		if merge_type == 'flat':
			try:
				merged_bed = x.multi_intersect(**kwargs)
			except pybedtools.helpers.BEDToolsError as e:
				raise ToolsException('Error in MERGE. ' +  e.msg,\
						'merge_beds')
		elif merge_type == 'min':
			kwargs['cluster'] = True
			try:
				merged_bed = x.multi_intersect(**kwargs)
			except pybedtools.helpers.BEDToolsError as e:
				raise ToolsException('Error in MERGE. ' +  e.msg,\
						'merge_beds')
		elif merge_type == 'max':
			kwargs['merge'] = True
			try:
				merged_bed = x.multi_intersect(**kwargs)
			except pybedtools.helpers.BEDToolsError as e:
				raise ToolsException('Error in MERGE. ' +  e.msg,\
						'merge_beds')
	else:
		raise ToolsException('Merge type not supported by MERGE: ' + merge_type,\
				'merge_beds')
	
	result = output_type(merged_bed.fn, True)

	add_tmp_file(result)

	return result
Example #16
def merge_bed_stack(out_stack):

	out_rows = deque()
	next_rows = []
	next_row_i = 0
	offset = 6

	while ( len(out_stack) > 0 ):
		A,B,AB = out_stack.pop()
		curr_next_rows = []
		curr_out_rows = []
		curr_row = 1

		in_file = open(AB.val, 'r')

		for line in in_file:
			curr_row_takes = 0

			if len(next_rows) == 0:
				curr_row_takes = 1
			else: 
				while ( (next_row_i < len(next_rows)) and
						( next_rows[next_row_i] == curr_row ) ):
					curr_row_takes += 1
					next_row_i += 1
				
			for i in range(0, curr_row_takes):
				cols = line.rstrip().split('\t')
				# the score field of the first entry gives the line number in
				# the associated file that pairs with the current line
				curr_next_rows.append(int(cols[4]) )
				if len(out_stack) > 0:
					# the 2nd and on entries have a pointer and a data entry,
					# pointer entries are BED6, so take all but the first 6
					# entries, also the last col is the size of the overlap,
					# ignore it
					curr_out_rows.append("\t".join(cols[6:-1]))
				else:
					# the first entry in the stack has no pointer entry, so
					# keep the whole row (minus the overlap size)
					curr_out_rows.append("\t".join(cols[:-1]))

			curr_row+=1
		next_rows = curr_next_rows
		next_row_i = 0

		out_rows.appendleft(curr_out_rows)

	R_file_name = get_temp_file_name(pybedtools.get_tempdir(), \
									 'unary_intersect_beds', \
									 'tmp')

	# check to make sure all the out_rows are the same length

	same_size = True
	for i in range(0, len(out_rows) - 1):
		same_size = same_size and \
				( len(out_rows[i]) == len(out_rows[i + 1]) )

	if not same_size:
		raise ToolsException('Unmatched sizes in intersection', \
				'merge_bed_stack')


	out_file = open(R_file_name, 'w')

	for i in range(0, len(out_rows[0])):
		out_line = ''
		for j in range(0, len(out_rows)):
			if j != 0:
				out_line = out_line + '\t'
			out_line = out_line + (out_rows[j])[i]
		out_file.write(out_line + '\n')

	out_file.close()

	R = gqltypes.BEDN(R_file_name, True)

	add_tmp_file(R)

	return R
Example #17
def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ensemble_tsv,
                tumor_bam, min_len,
                postprocess_max_dist, long_read,
                lr_pad, lr_chunk_size, lr_chunk_scale,
                lr_snp_min_af, lr_ins_min_af, lr_del_min_af, lr_match_score, lr_mismatch_penalty,
                lr_gap_open_penalty, lr_gap_ext_penalty,
                pass_threshold, lowqual_threshold,
                msa_binary, num_threads):
    logger = logging.getLogger(postprocess.__name__)

    logger.info("----------------------Postprocessing-----------------------")
    if not os.path.exists(work):
        os.mkdir(work)

    original_tempdir = pybedtools.get_tempdir()
    pybedtmp = os.path.join(work, "pybedtmp_postprocess")
    if not os.path.exists(pybedtmp):
        os.mkdir(pybedtmp)
    pybedtools.set_tempdir(pybedtmp)

    candidates_preds = os.path.join(work, "candidates_preds.vcf")
    ensembled_preds = os.path.join(work, "ensembled_preds.vcf")
    pred_vcf = pybedtools.BedTool(pred_vcf_file)
    pred_vcf.window(candidates_vcf, w=5, v=True).saveas(ensembled_preds)
    pred_vcf.window(candidates_vcf, w=5, u=True).saveas(candidates_preds)

    logger.info("Extract targets")
    postprocess_pad = 1 if not long_read else 10
    extract_postprocess_targets(
        candidates_preds, min_len, postprocess_max_dist, postprocess_pad)

    no_resolve = os.path.join(work, "candidates_preds.no_resolve.vcf")
    target_vcf = os.path.join(work, "candidates_preds.resolve_target.vcf")
    target_bed = os.path.join(work, "candidates_preds.resolve_target.bed")
    resolved_vcf = os.path.join(work, "candidates_preds.resolved.vcf")

    logger.info("Resolve targets")
    if not long_read:
        resolve_variants(tumor_bam, resolved_vcf,
                         reference, target_vcf, target_bed, num_threads)
    else:
        work_lr_indel_realign = os.path.join(work, "work_lr_indel_realign")
        if os.path.exists(work_lr_indel_realign):
            shutil.rmtree(work_lr_indel_realign)
        os.mkdir(work_lr_indel_realign)
        ra_resolved_vcf = os.path.join(
            work, "candidates_preds.ra_resolved.vcf")
        long_read_indelrealign(work_lr_indel_realign, tumor_bam, None, ra_resolved_vcf, target_bed,
                               reference, num_threads, lr_pad,
                               lr_chunk_size, lr_chunk_scale, lr_snp_min_af,
                               lr_del_min_af, lr_ins_min_af,
                               lr_match_score, lr_mismatch_penalty, lr_gap_open_penalty,
                               lr_gap_ext_penalty, msa_binary)
        resolve_scores(tumor_bam, ra_resolved_vcf, target_vcf, resolved_vcf)

    all_no_resolve = concatenate_files(
        [no_resolve, ensembled_preds], os.path.join(work, "no_resolve.vcf"))

    logger.info("Merge vcfs")
    merged_vcf = os.path.join(work, "merged_preds.vcf")
    merge_post_vcfs(reference, resolved_vcf,
                    all_no_resolve, merged_vcf,
                    pass_threshold, lowqual_threshold)
    add_vcf_info(work, reference, merged_vcf,
                 candidates_vcf, ensemble_tsv, output_vcf,
                 pass_threshold, lowqual_threshold)

    logger.info("Output NeuSomatic prediction at {}".format(output_vcf))

    shutil.rmtree(pybedtmp)
    pybedtools.set_tempdir(original_tempdir)

    logger.info("Postprocessing is Done.")
    return output_vcf
Example #18
def load_file(file_path, filetype_name):

	# local files are not temp files, but remote files are
	is_remote = False

	return_files = []
	return_labels = []

	# attempt to get the files from the local path
	files = glob.glob(file_path)

	if len(files) == 0:
		# if nothing at the local path, then see if it is a remote path
		# if so, then fetch the files, store at temp path, and place 
		# the temp file path in the files list
		is_remote = True
		#url = 'http://localhost/cgi-bin/name.py?path=' + file_path

		# retrieve the path from the name server 
		try:
			url = config['fileserver'] + 'name.py?path=' + file_path
			json_response = urllib.urlopen(url)
			s = json_response.read()
			remote_paths = json.loads(s)
			json_response.close()
		except Exception as e:
			raise ToolsException ('Error retrieving file',\
					'load_file')

		# fetch remote files
		for remote_path in remote_paths:
			tmp_file_path = get_temp_file_name(pybedtools.get_tempdir(), \
											   'load', \
											   'tmp')
			# first value is the label
			return_labels.append(remote_path[0])
			# second is the path
			urllib.urlretrieve(remote_path[1], tmp_file_path)
			files.append(tmp_file_path)

		# if there is no remote file at this URL, then raise
		if len(remote_paths) == 0:
			raise ToolsException (\
					'No file(s) found at ' + file_path, 'load_file')

	for f in files:
		if filetype_name == 'auto':
			type_found = False
			# loops through the types to see which one matches
			for source_type in gqltypes.source_types:
				if source_type.test_filetype(f):
					type_found = True
					# if the file is remote, then the temp parameter is True;
					# otherwise it is False
					new_file = source_type(f, is_remote)

					if is_remote:
						add_tmp_file(new_file)
					else:
						# remote labels were collected previously 
						return_labels.append(os.path.basename(f))

					return_files.append(new_file)
			if not type_found:
				raise ToolsException('Unknown filetype for:' + f,'load_file')
		else:
			source_type = gqltypes.source_type_map[filetype_name]
			if source_type.test_filetype(f):
				# if the file is remote, then the temp parameter is True;
				# otherwise it is False
				new_file = source_type(f, is_remote)
				if is_remote:
					add_tmp_file(new_file)
				else:
					# remote labels were collected previously 
					return_labels.append(os.path.basename(f))

				return_files.append(new_file)
			else:
				raise ToolsException('Filetype mismatch:' + f + \
						" does not appear to be " + filetype_name,
						'load_file')

	if len(return_files) == 1:
		return return_files[0]
	else:
		return gqltypes.BEDL(return_files, return_labels)
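
load_file uses the Python 2 urllib API (urllib.urlopen, urllib.urlretrieve). For reference, a Python 3 sketch of the same two calls, with url, remote_path, and tmp_file_path as in the function above:

import json
import urllib.request

# Python 3 replacement for urllib.urlopen + read + json.loads
with urllib.request.urlopen(url) as json_response:
    remote_paths = json.loads(json_response.read())

# Python 3 replacement for urllib.urlretrieve
urllib.request.urlretrieve(remote_path[1], tmp_file_path)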