def split(parameters):
    """Build a split (top/bottom) Hi-C comparison plot from CLI arguments.

    :return: Tuple of (plot object, parsed argument namespace).
    """
    parser = parsers.split_parser()
    args = parser.parse_args(parameters)

    # Explicit colormap wins; otherwise O/E matrices get a diverging map.
    if args.colormap is not None:
        cmap = args.colormap
    elif args.oe:
        cmap = 'bwr'
    else:
        cmap = config.colormap_hic

    top = fanc.load(os.path.expanduser(args.hic_top), mode='r')
    bottom = fanc.load(os.path.expanduser(args.hic_bottom), mode='r')

    scale = "log" if args.log else "lin"

    # O/E plots default to a colorbar centred on 0 unless explicitly set.
    symmetry = args.colorbar_symmetry
    if args.oe and symmetry is None:
        symmetry = 0

    # NOTE: log=args.oe (not args.log) — O/E values are log-transformed,
    # while the colour scale itself is controlled via norm.
    plot = kplt.HicComparisonPlot2D(top, bottom,
                                    colormap=cmap, norm=scale,
                                    vmin=args.vmin, vmax=args.vmax,
                                    adjust_range=args.adjust_range,
                                    scale_matrices=args.scaling,
                                    oe=args.oe, log=args.oe,
                                    colorbar_symmetry=symmetry,
                                    show_colorbar=args.show_colorbar)
    return plot, args
def test_bed(self):
    """fanc.load recognises BED files and rejects unrecognisable text files."""
    base_dir = os.path.dirname(os.path.realpath(__file__))
    with fanc.load(base_dir + '/test_load/test.bed') as bed:
        assert isinstance(bed, gr.Bed)
    # A plain text file with no recognisable format must raise ValueError.
    with pytest.raises(ValueError):
        fanc.load(base_dir + '/test_load/foo.txt')
def test_sambam(self):
    """fanc.load opens both SAM and BAM files as pysam alignment files."""
    base_dir = os.path.dirname(os.path.realpath(__file__))
    for relative_path in ('/test_load/test.sam', '/test_load/test.bam'):
        with fanc.load(base_dir + relative_path, mode='r') as alignments:
            assert isinstance(alignments, pysam.AlignmentFile)
def mirror(parameters):
    """Build a mirrored pair of triangular Hi-C plots from CLI arguments.

    Upper and lower matrices are configured independently and combined into
    a single vertical split plot.

    :return: Tuple of (VerticalSplitPlot, parsed argument namespace).
    """
    parser = parsers.mirror_parser()
    args = parser.parse_args(parameters)

    def _pick_colormap(explicit, oe):
        # Explicit choice wins; O/E matrices default to a diverging map.
        if explicit is not None:
            return explicit
        return 'bwr' if oe else config.colormap_hic

    colormap_upper = _pick_colormap(args.colormap_upper, args.oe_upper)
    colormap_lower = _pick_colormap(args.colormap_lower, args.oe_lower)

    matrix_upper = fanc.load(os.path.expanduser(args.hic_upper), mode='r')
    matrix_lower = fanc.load(os.path.expanduser(args.hic_lower), mode='r')

    from fanc.tools.general import str_to_int
    max_dist = str_to_int(args.max_dist)

    # O/E plots default to a colorbar centred on 0 unless explicitly set.
    symmetry_upper = args.colorbar_symmetry_upper
    if args.oe_upper and symmetry_upper is None:
        symmetry_upper = 0
    symmetry_lower = args.colorbar_symmetry_lower
    if args.oe_lower and symmetry_lower is None:
        symmetry_lower = 0

    upper_plot = kplt.HicPlot(matrix_upper, colormap=colormap_upper,
                              max_dist=max_dist,
                              norm="log" if args.log_upper else "lin",
                              vmin=args.vmin_upper, vmax=args.vmax_upper,
                              oe=args.oe_upper, log=args.oe_upper,
                              colorbar_symmetry=symmetry_upper,
                              show_colorbar=args.show_colorbar,
                              adjust_range=False)
    lower_plot = kplt.HicPlot(matrix_lower, colormap=colormap_lower,
                              max_dist=max_dist,
                              norm="log" if args.log_lower else "lin",
                              vmin=args.vmin_lower, vmax=args.vmax_lower,
                              oe=args.oe_lower, log=args.oe_lower,
                              colorbar_symmetry=symmetry_lower,
                              show_colorbar=args.show_colorbar,
                              adjust_range=False)

    return kplt.VerticalSplitPlot(upper_plot, lower_plot), args
def bar(parameters):
    """Build a bar plot of region attributes from CLI arguments.

    :return: Tuple of (BarPlot, parsed argument namespace).
    """
    parser = parsers.bar_parser()
    args = parser.parse_args(parameters)

    datasets = [fanc.load(file_name) for file_name in args.regions]

    # Labels, when given, must correspond one-to-one with the datasets.
    if args.labels is not None and len(args.labels) != len(datasets):
        parser.error("Number of labels ({}) must be the same as number "
                     "of datasets ({})".format(len(args.labels), len(datasets)))

    plot = kplt.BarPlot(datasets,
                        attribute=args.attribute,
                        labels=args.labels,
                        ylim=args.ylim,
                        plot_kwargs={'alpha': args.alpha},
                        colors=args.colors,
                        legend_location=args.legend_location)
    return plot, args
def square(parameters):
    """Build a square Hi-C matrix plot from CLI arguments.

    :return: Tuple of (HicPlot2D, parsed argument namespace).
    """
    parser = parsers.square_parser()
    args = parser.parse_args(parameters)

    scale = "log" if args.log else "lin"

    # Explicit colormap wins; otherwise O/E matrices get a diverging map.
    if args.colormap is not None:
        cmap = args.colormap
    else:
        cmap = 'bwr' if args.oe else config.colormap_hic

    # O/E plots default to a colorbar centred on 0 unless explicitly set.
    symmetry = args.colorbar_symmetry
    if args.oe and symmetry is None:
        symmetry = 0

    matrix = fanc.load(os.path.expanduser(args.hic), mode='r')
    plot = kplt.HicPlot2D(matrix, colormap=cmap, norm=scale,
                          vmin=args.vmin, vmax=args.vmax,
                          show_colorbar=args.show_colorbar,
                          adjust_range=args.adjust_range,
                          oe=args.oe, log=args.oe,
                          colorbar_symmetry=symmetry,
                          flip=args.flip,
                          matrix_norm=args.norm,
                          weight_field=args.weight_field)
    return plot, args
def scores(parameters):
    """Build a score-array heatmap (parameter vs. genomic position) from CLI arguments.

    :return: Tuple of (GenomicVectorArrayPlot, parsed argument namespace).
    """
    parser = parsers.scores_parser()
    args = parser.parse_args(parameters)

    array = fanc.load(os.path.expanduser(args.scores), mode='r')
    y_scale = "log" if args.log else "linear"

    # Choose which score parameters (e.g. window sizes) to display:
    # an explicit range beats an explicit list beats "all of them".
    if args.range is not None:
        low, high = args.range[0], args.range[1]
        data_selection = [y for y in array._parameters if low <= y <= high]
    elif args.parameters is not None:
        data_selection = args.parameters
    else:
        data_selection = array._parameters

    plot = kplt.GenomicVectorArrayPlot(array, parameters=data_selection,
                                       y_scale=y_scale,
                                       colormap=args.colormap,
                                       colorbar_symmetry=0 if args.symmetry else None,
                                       vmin=args.vmin, vmax=args.vmax,
                                       show_colorbar=args.show_colorbar,
                                       replacement_color=args.replacement_color,
                                       genomic_format=args.genomic_format)
    return plot, args
def test_conversion(self, tmpdir):
    """Old-style Hic files (without meta information) are still loadable."""
    file_name = str(tmpdir) + '/x.hic'
    with dummy.sample_hic(file_name=file_name) as hic:
        # Strip the meta information table to simulate an old-style object.
        hic.file.remove_node('/meta_information', recursive=True)

    loaded = fanc.load(file_name, mode='r')
    assert isinstance(loaded, fanc.Hic)
    loaded.close()

    # Re-open with the concrete class, then check auto-detection still works.
    fanc.Hic(file_name).close()
    loaded = fanc.load(file_name, mode='r')
    loaded.close()
    assert isinstance(loaded, fanc.Hic)
def load_score_data(data):
    """Return *data* unchanged if it is already region-based, otherwise
    delegate loading to fanc.load (paths, other supported inputs)."""
    return data if isinstance(data, RegionBased) else fanc.load(data)
def write_insulation(hic_file):
    """Compute insulation scores for a Hi-C file and export bigWig tracks.

    The bin size is parsed from the file name (e.g. "sample_10kb.hic") and
    insulation is computed for windows of 4x, 6x, 8x and 10x the bin size.
    """
    logging.info("working on %s", hic_file)
    hic = fanc.load(hic_file, mode='r')

    base_name = os.path.basename(hic_file)
    prefix = base_name.replace(".hic", "")
    # Resolution in kb is encoded in the file name, e.g. "..._10kb.hic".
    res = int(re.findall('([0-9]+)kb', base_name)[0])
    logging.info("resolution detected as %s", str(res))

    bin_size = res * 1000
    window_sizes = [bin_size * multiple for multiple in [4, 6, 8, 10]]

    scores_file = os.path.join("data", "boundaries", prefix + "_micro-c.ii")
    with InsulationScores.from_hic(hic, normalise=True, log=True,
                                   window_sizes=window_sizes,
                                   file_name=scores_file) as ii:
        for window_size in window_sizes:
            multiple = window_size / res / 1000
            logging.info("Writing insulation index for window size %i", window_size)
            bigwig_file = os.path.join(
                "data", "boundaries",
                prefix + '_micro-c_{}.bw'.format(int(multiple)))
            ii.to_bigwig(bigwig_file, window_size)
def triangular(parameters):
    """Build a triangular Hi-C plot from CLI arguments.

    :return: Tuple of (HicPlot, parsed argument namespace).
    """
    parser = parsers.triangular_parser()
    args = parser.parse_args(parameters)

    # Explicit colormap wins; otherwise O/E matrices get a diverging map.
    if args.colormap is not None:
        cmap = args.colormap
    else:
        cmap = 'bwr' if args.oe else config.colormap_hic

    matrix = fanc.load(os.path.expanduser(args.hic), mode='r')

    from fanc.tools.general import str_to_int
    max_dist = None if args.max_dist is None else str_to_int(args.max_dist)

    # O/E plots default to a colorbar centred on 0 unless explicitly set.
    symmetry = args.colorbar_symmetry
    if args.oe and symmetry is None:
        symmetry = 0

    plot = kplt.HicPlot(matrix, colormap=cmap,
                        max_dist=max_dist,
                        norm="log" if args.log else "lin",
                        vmin=args.vmin, vmax=args.vmax,
                        show_colorbar=args.show_colorbar,
                        adjust_range=args.adjust_range,
                        oe=args.oe, log=args.oe,
                        colorbar_symmetry=symmetry,
                        ylabel=args.ylabel,
                        weight_field=args.weight_field,
                        default_value=args.default_value,
                        matrix_norm=args.norm)
    return plot, args
def line(parameters):
    """Build a line plot of region attributes from CLI arguments.

    :return: Tuple of (LinePlot, parsed argument namespace).
    """
    parser = parsers.line_parser()
    args = parser.parse_args(parameters)

    datasets = [fanc.load(file_name) for file_name in args.regions]

    # Labels, when given, must correspond one-to-one with the datasets.
    if args.labels is not None and len(args.labels) != len(datasets):
        parser.error("Number of labels ({}) must be the same as number "
                     "of datasets ({})".format(len(args.labels), len(datasets)))

    plot = kplt.LinePlot(datasets,
                         bin_size=args.bin_size,
                         fill=args.fill,
                         attribute=args.attribute,
                         labels=args.labels,
                         style=args.line_style,
                         ylim=args.ylim,
                         colors=args.colors,
                         legend_location=args.legend_location,
                         plot_kwargs={'alpha': args.alpha})
    return plot, args
def export_marginals(hic_file, output_file):
    """Write per-region raw marginals (coverage) of a Hi-C file to a BED file."""
    hic = fanc.load(hic_file)
    coverage = hic.marginals(masked=False, norm=False)
    regions = list(hic.regions())
    # Attach each region's marginal value as its BED score.
    for pos, region in enumerate(regions):
        region.set_attribute("score", coverage[pos])
    write_bed(output_file, regions)
def test_auto_identification(self, tmpdir):
    """fanc.load detects the correct class for freshly created files."""
    class_names = ('Hic', 'AccessOptimisedHic', 'FragmentMappedReadPairs',
                   'Reads', 'GenomicTrack', 'RaoPeakInfo',
                   'AccessOptimisedReadPairs')
    for class_name in class_names:
        cls_ = class_name_dict[class_name]
        file_name = str(tmpdir) + '/{}.h5'.format(class_name)
        cls_(file_name=file_name, mode='w').close()
        loaded = fanc.load(file_name, mode='r')
        assert isinstance(loaded, cls_)
        loaded.close()
def test_old_style_index(self, tmpdir):
    """Old-style files (missing meta information) are still auto-identified."""
    with dummy.sample_hic() as hic:
        # Objects derived from a single Hic ('ABDomains' listed twice in the
        # original suite; kept for identical coverage).
        single_hic_classes = ('ABDomains', 'ABDomainMatrix', 'ExpectedContacts',
                              'ObservedExpectedRatio', 'ABDomains',
                              'PossibleContacts', 'RegionContactAverage',
                              'InsulationIndex', 'DirectionalityIndex')
        for class_name in single_hic_classes:
            cls_ = class_name_dict[class_name]
            file_name = str(tmpdir) + '/{}.h5'.format(class_name)
            cls_(hic, file_name=file_name, mode='w').close()
            loaded = fanc.load(file_name, mode='r')
            assert isinstance(loaded, cls_)
            loaded.close()

        # Objects comparing two Hic inputs.
        for class_name in ('FoldChangeMatrix', ):
            cls_ = class_name_dict[class_name]
            file_name = str(tmpdir) + '/{}.h5'.format(class_name)
            cls_(hic, hic, file_name=file_name, mode='w').close()
            loaded = fanc.load(file_name, mode='r')
            assert isinstance(loaded, cls_)
            loaded.close()

        # Standalone objects with the meta information table removed to
        # simulate files written by older versions.
        for class_name in ('Hic', 'AccessOptimisedHic',
                           'FragmentMappedReadPairs', 'Reads', 'GenomicTrack'):
            cls_ = class_name_dict[class_name]
            file_name = str(tmpdir) + '/{}.h5'.format(class_name)
            obj = cls_(file_name=file_name, mode='w')
            obj.file.remove_node('/meta_information', recursive=True)
            obj.close()
            loaded = fanc.load(file_name, mode='r')
            assert isinstance(loaded, cls_)
            loaded.close()
def calc_and_write(hic_file):
    """Compute the first two A/B compartment eigenvectors and write BED files.

    The correlation matrix is cached on disk next to the input file;
    existing output is never overwritten.
    """
    logging.info("working on %s", hic_file)
    eig1_out = hic_file.replace(".hic", "_eig1.bed")
    eig2_out = hic_file.replace(".hic", "_eig2.bed")

    # Guard clause: skip files whose output already exists.
    if os.path.exists(eig1_out):
        logging.info("output file exists; skipping!")
        return

    cor_file = hic_file.replace(".hic", ".cor")
    if os.path.exists(cor_file):
        logging.info("Correlation matrix %s exists, loading it", cor_file)
        ab = fanc.load(cor_file)
    else:
        logging.info("Calculating correlation matrix and saving to %s", cor_file)
        ab = fanc.ABCompartmentMatrix.from_hic(fanc.load(hic_file),
                                               file_name=cor_file)

    eig1, eig2 = calc_2_eigenvectors(ab)
    write_bed_eig(ab, eig1, file_name=eig1_out)
    write_bed_eig(ab, eig2, file_name=eig2_out)
def write_stats(input_dir, output_file):
    """Append per-file pair statistics from *input_dir* to *output_file* as CSV."""
    for file_name in os.listdir(input_dir):
        if not file_name.endswith("pairs"):
            continue
        logging.info("Working on %s", file_name)
        pairs = fanc.load(os.path.join(input_dir, file_name))
        statistics, total = stats(pairs, pairs._edges)
        # Append mode: results for several pairs files accumulate in one CSV.
        with open(output_file, "a+") as handle:
            writer = csv.writer(handle)
            writer.writerow([file_name, "total", total])
            for key, val in statistics.items():
                writer.writerow([file_name, key, val])
def test_auto_identification(self, tmpdir):
    """fanc.load detects the correct class for freshly created files."""
    class_names = ('Hic', 'LegacyHic', 'ReadPairs', 'AggregateMatrix',
                   'ComparisonMatrix', 'FoldChangeMatrix', 'DifferenceMatrix',
                   'ComparisonRegions', 'FoldChangeRegions',
                   'DifferenceRegions', 'DirectionalityIndex')
    for class_name in class_names:
        cls_ = class_name_dict[class_name]
        file_name = str(tmpdir) + '/{}.h5'.format(class_name)
        cls_(file_name=file_name, mode='w').close()
        loaded = fanc.load(file_name, mode='r')
        assert isinstance(loaded, cls_)
        loaded.close()
def test_hic_based_auto_identification(self, tmpdir):
    """fanc.load detects the correct class for Hic-derived files."""
    with dummy.sample_hic() as hic:
        # Objects derived from a single Hic.
        for class_name in ('ABDomains', 'ABDomainMatrix', 'ExpectedContacts',
                           'ObservedExpectedRatio', 'PossibleContacts',
                           'RegionContactAverage', 'InsulationIndex',
                           'DirectionalityIndex'):
            cls_ = class_name_dict[class_name]
            file_name = str(tmpdir) + '/{}.h5'.format(class_name)
            cls_(hic, file_name=file_name, mode='w').close()
            loaded = fanc.load(file_name, mode='r')
            assert isinstance(loaded, cls_)
            loaded.close()

        # Objects comparing two Hic inputs.
        for class_name in ('FoldChangeMatrix', ):
            cls_ = class_name_dict[class_name]
            file_name = str(tmpdir) + '/{}.h5'.format(class_name)
            cls_(hic, hic, file_name=file_name, mode='w').close()
            loaded = fanc.load(file_name, mode='r')
            assert isinstance(loaded, cls_)
            loaded.close()
def load_oe_contacts(matrix_file, regions_file=None):
    """Load an observed/expected contact matrix via fanc or from sparse text.

    :param matrix_file: Path to a fanc-compatible matrix, or a sparse matrix
        text file (the latter requires *regions_file*).
    :param regions_file: Region definitions for sparse input; ignored when
        *matrix_file* can be loaded with fanc.
    :return: Tuple of (edges dict, region interval trees, regions).
    :raises ValueError: If fanc cannot load the file and no *regions_file*
        is given for the sparse fallback.
    """
    import fanc
    try:
        # First choice: let fanc auto-detect and load the matrix.
        reference_loaded = fanc.load(matrix_file)
        edges = edges_dict_from_fanc(reference_loaded)
        regions = reference_loaded.regions
        region_trees = region_interval_trees(regions)
    except ValueError:
        # Fall back to sparse-matrix input, which needs explicit regions.
        try:
            assert regions_file is not None, ("Regions file needs to be "
                                              "specified for sparse input.")
            regions, ix_converter, _ = load_regions(regions_file)
            region_trees = region_interval_trees(regions)
            edges = edges_dict_from_sparse(
                edges_from_sparse_matrix(matrix_file, ix_converter))
        except AssertionError as error:
            # Propagate the reason instead of a bare ValueError, matching
            # the behaviour of load_contacts().
            raise ValueError(error)
    return edges, region_trees, regions
def __init__(self, hic_data, adjust_range=False, buffering_strategy="relative",
             buffering_arg=1, weight_field=None, default_value=None,
             smooth_sigma=None, matrix_norm=True, oe=False, log=False, **kwargs):
    """
    Base initialiser for Hi-C matrix plotters: loads the data (when given as
    a file path) and wraps it in a buffered-matrix accessor.

    :param hic_data: Path to Hi-C data on disk or :class:`~fanc.data.genomic.Hic`
                     or :class:`~fanc.data.genomic.RegionMatrix`
    :param adjust_range: Draw a slider to adjust vmin/vmax interactively. Default: False
    :param buffering_strategy: A valid buffering strategy for :class:`~BufferedMatrix`
    :param buffering_arg: Adjust range of buffering for :class:`~BufferedMatrix`
    :param weight_field: Forwarded to prepare_hic_buffer (presumably the edge
                         attribute used as matrix weight -- confirm in buffer docs)
    :param default_value: Forwarded to prepare_hic_buffer (presumably the fill
                          value for missing contacts)
    :param smooth_sigma: Forwarded to prepare_hic_buffer (smoothing parameter)
    :param matrix_norm: Forwarded to prepare_hic_buffer as ``norm``
    :param oe: Forwarded to prepare_hic_buffer (observed/expected values)
    :param log: Forwarded to prepare_hic_buffer (log-transform values)
    :param kwargs: Forwarded to the parent plotter class
    """
    super(BasePlotterHic, self).__init__(**kwargs)
    # Accept a file path as well as an already-loaded matrix object.
    if isinstance(hic_data, string_types):
        hic_data = fanc.load(hic_data, mode="r")
    self.hic_data = hic_data
    # All matrix access goes through a buffer object created here.
    self.hic_buffer = prepare_hic_buffer(hic_data,
                                         buffering_strategy=buffering_strategy,
                                         buffering_arg=buffering_arg,
                                         weight_field=weight_field,
                                         default_value=default_value,
                                         smooth_sigma=smooth_sigma,
                                         norm=matrix_norm,
                                         oe=oe,
                                         log=log)
    # Interactive range sliders are created later (only if adjust_range is set).
    self.slider = None
    self.adjust_range = adjust_range
    self.vmax_slider = None
def load_contacts(matrix_file, regions_file=None):
    """Load a contact matrix as O/E edges, via fanc or from sparse text input.

    :param matrix_file: Path to a fanc-compatible matrix, or a sparse matrix
        text file (the latter requires *regions_file*).
    :param regions_file: Region definitions for sparse input; ignored when
        *matrix_file* can be loaded with fanc.
    :return: Tuple of (edges dict, region interval trees, regions).
    :raises ValueError: If fanc cannot load the file and no *regions_file*
        is given for the sparse fallback.
    """
    import fanc
    from chess.oe import observed_expected
    try:
        # First choice: let fanc auto-detect and load the matrix.
        reference_loaded = fanc.load(matrix_file)
        edges = oe_edges_dict_from_fanc(reference_loaded)
        regions = reference_loaded.regions
        region_trees = region_interval_trees(regions)
    except ValueError as initial_error:
        # TODO(review): debug print left in place to preserve behaviour;
        # consider switching to logging.
        print(initial_error)
        try:
            # Fix: the original implicit string concatenation was missing a
            # space ("...needs to bespecified...").
            assert regions_file is not None, ("Regions file needs to be "
                                              "specified for sparse input.")
            regions, _ix_converter, _ = load_regions(regions_file)
            region_trees = region_interval_trees(regions)
            _, reference_oe = observed_expected(regions_file, matrix_file)
            edges = edges_dict_from_sparse(reference_oe)
        except AssertionError as error:
            raise ValueError(error)
    return edges, region_trees, regions
def write_expected(input_file, output_prefix):
    """Export expected contact values of a Hi-C file to tab-separated text.

    Writes ``<prefix>_expected_values_all.txt`` (genome-wide intra-chromosomal
    expected values) and ``<prefix>_expected_values_per_chrom.txt``
    (per-chromosome values).

    :param input_file: Path to a fanc-compatible Hi-C file.
    :param output_prefix: Prefix for the two output file paths.
    """
    # mode="a" presumably so freshly computed expected values can be cached
    # in the file -- NOTE(review): confirm read-only mode is insufficient.
    hic = fanc.load(input_file, mode="a")
    intra_expected, expected_by_chromosome, inter_expected = hic.expected_values()
    bin_size = hic.bin_size

    output_file = output_prefix + "_expected_values_all.txt"
    with open(output_file, "w+") as out:
        out.write("distance\texpected\n")
        for i, value in enumerate(intra_expected):
            out.write(str(i * bin_size) + "\t" + str(value) + "\n")

    output_file = output_prefix + "_expected_values_per_chrom.txt"
    with open(output_file, "w+") as out:
        out.write("chrom\tdistance\texpected\n")
        for chrom in expected_by_chromosome.keys():
            expected = expected_by_chromosome[chrom]
            # Compute each distance directly (i * bin_size) instead of
            # indexing a list sized from the genome-wide vector: a
            # per-chromosome vector of a different length would otherwise
            # raise IndexError. Output is identical when lengths match.
            for i in range(0, len(expected)):
                out.write(chrom + "\t" + str(i * bin_size) + "\t" +
                          str(expected[i]) + "\n")
def test_call_peaks(self):
    """RaoPeakCaller reproduces the expected peak counts on the bundled example."""
    this_dir = os.path.dirname(os.path.realpath(__file__))
    hic_10kb = fanc.load(this_dir + "/test_peaks/rao2014.chr11_77400000_78600000.hic",
                         mode='r')
    peaks = RaoPeakCaller().call_peaks(hic_10kb)
    assert len(peaks.edges) == 6525

    # Collect peaks significant in all four local neighbourhoods and check
    # that the known 43-57 peak is among them.
    valid_peaks = []
    has_43_57 = False
    for peak in peaks.edges:
        if (peak.fdr_ll < 0.1 and peak.fdr_v < 0.1
                and peak.fdr_h < 0.1 and peak.fdr_d < 0.1):
            valid_peaks.append(peak)
            if peak.source == 43 and peak.sink == 57:
                has_43_57 = True
    assert len(valid_peaks) == 134
    assert has_43_57

    hic_10kb.close()
    peaks.close()
def auto(argv, **kwargs):
    """Run the full FAN-C processing pipeline (FASTQ -> SAM -> pairs -> Hic)
    by assembling `fanc` sub-commands and scheduling them on a task runner
    (local parallel, SGE, Slurm, or a dry-run 'test' runner).

    :param argv: Full command-line argument vector; parsing starts at argv[2:].
    :param kwargs: Supports 'verbosity' (int, default 2) and 'log_file' (path),
                   both forwarded to the spawned fanc commands.
    :return: 0 on completion (tasks are executed by runner.run()).
    """
    from fanc.tools.general import which
    parser = auto_parser()
    args = parser.parse_args(argv[2:])

    # Base command shared by all spawned sub-processes; carries verbosity
    # and log-file flags through to every step.
    verbosity = kwargs.get("verbosity", 2)
    if verbosity > 0:
        verbosity_flag = '-' + 'v' * verbosity
        fanc_base_command = ['fanc', verbosity_flag]
    else:
        fanc_base_command = ['fanc']
    log_file = kwargs.get("log_file", None)
    if log_file is not None:
        fanc_base_command += ['-l', log_file]

    # Unpack parsed arguments into locals for readability below.
    bin_sizes = args.bin_sizes
    split_ligation_junction = args.split_ligation_junction
    restriction_enzyme = args.restriction_enzyme
    threads = args.threads
    genome = args.genome
    genome_index = args.genome_index
    basename = args.basename
    quality_cutoff = args.quality_cutoff
    iterative_quality_cutoff = args.iterative_quality_cutoff
    tmp = args.tmp
    mapper_parallel = args.mapper_parallel
    split_fastq = args.split_fastq
    memory_map = args.memory_map
    iterative = args.iterative
    step_size = args.step_size
    sam_sort = args.sam_sort
    filter_pairs = args.filter_pairs
    inward_cutoff = args.inward_cutoff
    outward_cutoff = args.outward_cutoff
    auto_le_cutoff = args.auto_le_cutoff
    process_hic = args.process_hic
    ice = args.ice
    norm_method = args.norm_method
    restore_coverage = args.restore_coverage
    run_with = args.run_with
    job_prefix = args.job_prefix
    grid_startup_commands = os.path.expanduser(args.grid_startup_commands) \
        if args.grid_startup_commands is not None else None
    grid_cleanup_commands = os.path.expanduser(args.grid_cleanup_commands) \
        if args.grid_cleanup_commands is not None else None
    force_overwrite = args.force_overwrite
    output_folder = os.path.expanduser(args.output_folder)

    file_names = [os.path.expanduser(file_name) for file_name in args.input]
    file_types = [file_type(file_name) for file_name in file_names]
    file_basenames = [file_basename(file_name) for file_name in file_names]

    # Deprecated flag maps onto the newer --norm-method option.
    if ice:
        warnings.warn("The --ice option is deprecated. "
                      "Please use '--norm-method ice' instead!")
        norm_method = 'ice'

    for file_name in file_names:
        if not os.path.exists(file_name):
            parser.error("File '{}' does not exist!".format(file_name))

    # Select the task runner backend.
    runner = None
    if run_with == 'parallel':
        runner = ParallelTaskRunner(threads)
    elif run_with == 'sge':
        from fanc.config import config
        if which(config.sge_qsub_path) is None:
            parser.error("Using SGE not possible: "
                         "Cannot find 'qsub' at path '{}'. You can change "
                         "this path using fanc config files and the "
                         "'sge_qsub_path' parameter".format(config.sge_qsub_path))
        from fanc.tools.files import mkdir
        sge_log_dir = mkdir(output_folder, 'sge_logs')
        runner = SgeTaskRunner(log_dir=sge_log_dir, task_prefix=job_prefix,
                               startup_commands_file=grid_startup_commands,
                               cleanup_commands_file=grid_cleanup_commands)
    elif run_with == 'slurm':
        from fanc.config import config
        if which(config.slurm_sbatch_path) is None:
            parser.error("Using Slurm not possible: "
                         "Cannot find 'sbatch' at path '{}'. You can change "
                         "this path using fanc config files and the "
                         "'slurm_sbatch_path' parameter".format(config.slurm_sbatch_path))
        from fanc.tools.files import mkdir
        slurm_log_dir = mkdir(output_folder, 'slurm_logs')
        runner = SlurmTaskRunner(log_dir=slurm_log_dir, task_prefix=job_prefix,
                                 startup_commands_file=grid_startup_commands,
                                 cleanup_commands_file=grid_cleanup_commands)
    elif run_with == 'test':
        runner = ParallelTaskRunner(threads, test=True)
    else:
        parser.error("Runner '{}' is not valid. See --run-with "
                     "parameter for options".format(run_with))

    # Unknown extensions: try fanc auto-detection to classify the input.
    for i in range(len(file_types)):
        if file_types[i] not in ('fastq', 'sam', 'pairs_txt', 'pairs', 'hic'):
            import fanc
            try:
                ds = fanc.load(file_names[i], mode='r')
                if isinstance(ds, fanc.Hic):
                    file_types[i] = 'hic'
                elif isinstance(ds, fanc.ReadPairs):
                    file_types[i] = 'pairs'
                else:
                    raise ValueError("Could not detect file type using fanc load.")
            except ValueError:
                parser.error("Not a valid input file type: {}".format(file_types[i]))

    # Derive a common basename from the shared prefix of all input names
    # when none was given explicitly.
    if basename is None:
        if len(file_basenames) == 1:
            basename = file_basenames[0]
        else:
            basename = []
            for pair in zip(*file_basenames):
                if pair[0] == pair[1]:
                    basename.append(pair[0])
                else:
                    break
            if len(basename) == 0:
                basename = file_basenames[0]
            else:
                # Strip a trailing separator left over from the common prefix.
                if basename[-1] in ['.', '_']:
                    basename = "".join(basename[:-1])
                else:
                    basename = "".join(basename)

    if not output_folder[-1] == '/':
        output_folder += '/'

    # 0. Do some sanity checks on required flags
    is_bwa = False
    is_bowtie2 = False
    if 'fastq' in file_types:
        if args.genome_index is None:
            parser.error("Must provide genome index (-i) when mapping FASTQ files!")
        else:
            check_path = os.path.expanduser(genome_index)
            if check_path.endswith('.'):
                check_path = check_path[:-1]
            # Detect the aligner by probing for its index file extensions.
            is_bowtie2 = True
            for i in range(1, 5):
                if not os.path.exists(check_path + '.{}.bt2'.format(i)):
                    is_bowtie2 = False
            for i in range(1, 3):
                if not os.path.exists(check_path + '.rev.{}.bt2'.format(i)):
                    is_bowtie2 = False
            is_bwa = True
            for ending in ('amb', 'ann', 'bwt', 'pac', 'sa'):
                if not os.path.exists(check_path + '.{}'.format(ending)):
                    is_bwa = False
            if not is_bowtie2 and not is_bwa:
                parser.error("Cannot detect Bowtie2 or BWA index.")
            if is_bowtie2 and not which('bowtie2'):
                parser.error("bowtie2 must be in PATH for mapping!")
            if is_bwa and not which('bwa'):
                parser.error("bwa must be in PATH for mapping!")

    if 'fastq' in file_types or 'sam' in file_types:
        if genome is None:
            parser.error("Must provide genome (-g) to process read pair files!")
        if restriction_enzyme is None:
            # A genome that already encodes fragments makes -r optional.
            from fanc.regions import genome_regions
            try:
                genome_regions(genome)
            except ValueError:
                parser.error("Must provide restriction enzyme (-r) to process read pair files!")
        else:
            # Validate each enzyme name against Biopython's catalogue.
            res = restriction_enzyme.split(",")
            from Bio import Restriction
            for r in res:
                try:
                    getattr(Restriction, r)
                except AttributeError:
                    parser.error("Restriction enzyme string '{}' is not recognized".format(restriction_enzyme))

    logger.info("Output folder: {}".format(output_folder))
    logger.info("Input files: {}".format(", ".join(file_names)))
    logger.info("Input file types: {}".format(", ".join(file_types)))
    logger.info("Final basename: %s (you can change this with the -n option!)" % basename)

    # 1. create default folders in root directory
    if run_with != 'test':
        logger.info("Creating output folders...")
        from ..tools.general import mkdir
        mkdir(output_folder, 'fastq')
        mkdir(output_folder, 'sam')
        mkdir(output_folder, 'pairs/')
        mkdir(output_folder, 'hic/binned')
        mkdir(output_folder, 'plots/stats')

    # 2. If input files are (gzipped) FASTQ, map them iteratively first
    fastq_files = []
    for i in range(len(file_names)):
        if file_types[i] != 'fastq':
            continue
        fastq_files.append(i)

    sam_created = [False] * len(file_names)
    mapping_tasks = []
    if len(fastq_files) > 0:
        if genome_index.endswith('.'):
            genome_index = genome_index[:-1]
        bam_files = []
        for i, ix in enumerate(fastq_files):
            bam_file = output_folder + 'sam/' + file_basenames[ix] + '.bam'
            if not force_overwrite and os.path.exists(bam_file):
                parser.error("File exists ({}), use -f to force overwriting it.".format(bam_file))
            bam_files.append(bam_file)

            mapping_command = fanc_base_command + ['map', '-m', '25',
                                                   '-s', str(step_size),
                                                   '-t', str(threads)]
            if iterative_quality_cutoff is not None:
                mapping_command += ['-q', str(iterative_quality_cutoff)]
            if tmp:
                mapping_command.append('-tmp')
            if not mapper_parallel:
                mapping_command.append('--fanc-parallel')
            if split_fastq:
                mapping_command.append('--split-fastq')
            if memory_map:
                mapping_command.append('--memory-map')
            if not iterative:
                mapping_command.append('--no-iterative')
            if split_ligation_junction:
                mapping_command.append('--restriction-enzyme')
                mapping_command.append(restriction_enzyme)
            mapping_command += [file_names[ix], genome_index, bam_file]

            mapping_task = CommandTask(mapping_command)
            runner.add_task(mapping_task, threads=threads)
            mapping_tasks.append(mapping_task)

        # From here on the FASTQ inputs are represented by their BAM outputs.
        for ix, i in enumerate(fastq_files):
            file_names[i] = bam_files[ix]
            file_types[i] = 'sam'
            sam_created[i] = True

    if sam_sort:
        sort_threads = min(4, threads)
        sam_sort_tasks = []
        # sort SAM files
        sam_files = []
        in_place = []
        for i in range(len(file_names)):
            if file_types[i] != 'sam':
                continue
            sam_files.append(i)
            in_place.append(sam_created[i])

        if len(sam_files) > 0:
            sorted_sam_files = []
            for i, ix in enumerate(sam_files):
                sort_command = fanc_base_command + ['sort_sam', '-t', str(sort_threads), file_names[ix]]
                if in_place[i]:
                    # BAM files we created ourselves are sorted in place.
                    sorted_sam_files.append(file_names[ix])
                else:
                    sam_path, sam_extension = os.path.splitext(file_names[ix])
                    sam_basename = os.path.basename(sam_path)
                    sorted_sam_file = os.path.join(output_folder, 'sam',
                                                   sam_basename + '_sort' + sam_extension)
                    if not force_overwrite and os.path.exists(sorted_sam_file):
                        parser.error("File exists ({}), use -f to force overwriting it.".format(sorted_sam_file))
                    sorted_sam_files.append(sorted_sam_file)
                    sort_command.append(sorted_sam_file)
                if tmp:
                    sort_command.append('-tmp')

                sam_sort_task = CommandTask(sort_command)
                runner.add_task(sam_sort_task, wait_for=mapping_tasks, threads=1)
                sam_sort_tasks.append(sam_sort_task)

            for ix, i in enumerate(sam_files):
                file_names[i] = sorted_sam_files[ix]
    else:
        sam_sort_tasks = mapping_tasks

    total_pairs = 0
    pairs_txt_tasks = []
    # sort SAM files
    pairs_txt_files = []
    for i in range(len(file_names)):
        if file_types[i] != 'pairs_txt':
            continue
        pairs_txt_files.append(i)

    if len(pairs_txt_files) > 0:
        load_threads = max(int(threads / len(pairs_txt_files)), 1)
        pairs_files = []
        for ix in pairs_txt_files:
            pairs_txt_file = file_names[ix]
            pairs_file = os.path.join(output_folder, 'pairs',
                                      '{}_{}.pairs'.format(basename, total_pairs))
            total_pairs += 1
            if not force_overwrite and os.path.exists(pairs_file):
                parser.error("File exists ({}), use -f to force overwriting it.".format(pairs_file))
            pairs_files.append(pairs_file)

            pairs_command = fanc_base_command + ['pairs', '-f',
                                                 # loading
                                                 '-g', genome,
                                                 '-t', str(load_threads)]
            if restriction_enzyme is not None:
                pairs_command += ['-r', restriction_enzyme]
            if is_bwa:
                pairs_command.append('--bwa')
            if tmp:
                pairs_command.append('-tmp')
            if sam_sort:
                pairs_command.append('-S')
            pairs_command += [pairs_txt_file, pairs_file]

            pairs_task = CommandTask(pairs_command)
            runner.add_task(pairs_task, wait_for=[], threads=load_threads)
            pairs_txt_tasks.append(pairs_task)
            # NOTE(review): pairs_file was already appended above; this second
            # append looks redundant and would misalign the rename loop below
            # when several pairs_txt inputs are present -- verify upstream.
            pairs_files.append(pairs_file)

        for ix, i in enumerate(pairs_txt_files):
            file_names[i] = pairs_files[ix]
            file_types[i] = 'pairs'

    # load pairs directly from SAM
    sam_file_pairs = []
    i = 0
    while i < len(file_names):
        if file_types[i] == 'sam':
            # SAM inputs must arrive as consecutive mate-pair files.
            if not file_types[i + 1] == 'sam':
                parser.error("Cannot create SAM pairs, because {} "
                             "is missing a partner file".format(file_names[i]))
            sam_file_pairs.append((i, i + 1))
            i += 1
        i += 1

    if len(sam_file_pairs) > 0:
        sam_to_pairs_tasks = pairs_txt_tasks
        load_threads = max(int(threads/len(sam_file_pairs)), 1)
        pairs_files = []
        for i, j in sam_file_pairs:
            if len(sam_file_pairs) > 1 or total_pairs > 0:
                pairs_file = os.path.join(output_folder, 'pairs',
                                          '{}_{}.pairs'.format(basename, total_pairs))
                total_pairs += 1
            else:
                pairs_file = output_folder + 'pairs/' + basename + '.pairs'
            if not force_overwrite and os.path.exists(pairs_file):
                parser.error("File exists ({}), use -f to force overwriting it.".format(pairs_file))

            pairs_command = fanc_base_command + ['pairs', '-f',
                                                 # loading
                                                 '-g', genome,
                                                 '-t', str(load_threads),
                                                 # filtering
                                                 '-us']
            if restriction_enzyme is not None:
                pairs_command += ['-r', restriction_enzyme]
            if quality_cutoff is not None:
                pairs_command += ['-q', str(quality_cutoff)]
            if is_bwa:
                pairs_command.append('--bwa')
            if tmp:
                pairs_command.append('-tmp')
            if sam_sort:
                pairs_command.append('-S')
            pairs_command += [file_names[i], file_names[j], pairs_file]

            pairs_task = CommandTask(pairs_command)
            runner.add_task(pairs_task, wait_for=sam_sort_tasks, threads=load_threads)
            sam_to_pairs_tasks.append(pairs_task)
            pairs_files.append(pairs_file)

        # Collapse each SAM mate pair into a single 'pairs' entry; iterate in
        # reverse so deletions do not shift the indices still to be processed.
        for ix, sam_pair in enumerate(reversed(sam_file_pairs)):
            file_names[sam_pair[0]] = pairs_files[ix]
            del file_names[sam_pair[1]]
            file_types[sam_pair[0]] = 'pairs'
            del file_types[sam_pair[1]]
    else:
        sam_to_pairs_tasks = pairs_txt_tasks + sam_sort_tasks

    # 7. Pairs stats and filtering
    pairs_files = []
    for i in range(len(file_names)):
        if file_types[i] != 'pairs':
            continue
        pairs_files.append(i)

    if len(pairs_files) > 0 and filter_pairs:
        pairs_tasks = []
        for ix in pairs_files:
            pair_basename = os.path.basename(os.path.splitext(file_names[ix])[0])
            pairs_stats_file = output_folder + 'plots/stats/' + pair_basename + '.pairs.stats.pdf'
            ligation_error_file = output_folder + 'plots/stats/' + pair_basename + '.pairs.ligation_error.pdf'
            re_dist_file = output_folder + 'plots/stats/' + pair_basename + '.pairs.re_dist.pdf'

            pairs_command = fanc_base_command + ['pairs',
                                                 # filtering
                                                 '-d', '10000',
                                                 '-l', '-p', '2']
            if tmp:
                pairs_command.append('-tmp')
            # When only one of the inward/outward cutoffs is given, the other
            # is explicitly disabled with 0.
            if inward_cutoff is not None:
                pairs_command += ['-i', str(inward_cutoff)]
                if outward_cutoff is None:
                    pairs_command += ['-o', '0']
            if outward_cutoff is not None:
                pairs_command += ['-o', str(outward_cutoff)]
                if inward_cutoff is None:
                    pairs_command += ['-i', '0']
            if inward_cutoff is None and outward_cutoff is None and auto_le_cutoff:
                pairs_command += ['--filter-ligation-auto']
            pairs_command += ['--statistics-plot', pairs_stats_file, file_names[ix]]

            pairs_task = CommandTask(pairs_command)
            runner.add_task(pairs_task, wait_for=sam_to_pairs_tasks, threads=1)
            pairs_tasks.append(pairs_task)

            ligation_error_command = fanc_base_command + ['pairs', '--ligation-error-plot',
                                                          ligation_error_file, file_names[ix]]
            ligation_error_task = CommandTask(ligation_error_command)
            runner.add_task(ligation_error_task, wait_for=pairs_task, threads=1)

            re_dist_command = fanc_base_command + ['pairs', '--re-dist-plot',
                                                   re_dist_file, file_names[ix]]
            re_dist_task = CommandTask(re_dist_command)
            runner.add_task(re_dist_task, wait_for=pairs_task, threads=1)
    else:
        pairs_tasks = sam_to_pairs_tasks

    # 8. Pairs to Hic
    pairs_files = []
    for i in range(len(file_names)):
        if file_types[i] != 'pairs':
            continue
        pairs_files.append(i)

    if len(pairs_files) > 0 and process_hic:
        hic_tasks = []
        hic_files = []
        for ix in pairs_files:
            hic_basename = os.path.basename(os.path.splitext(file_names[ix])[0])
            # len('_filtered') == 9; strip the filtering suffix.
            if hic_basename.endswith('_filtered'):
                hic_basename = hic_basename[:-9]
            hic_file = output_folder + 'hic/' + hic_basename + '.hic'
            if not force_overwrite and os.path.exists(hic_file):
                parser.error("File exists ({}), use -f to force overwriting it.".format(hic_file))

            hic_command = fanc_base_command + ['hic', '-f']
            if tmp:
                hic_command.append('-tmp')
            hic_command += [file_names[ix], hic_file]

            hic_task = CommandTask(hic_command)
            runner.add_task(hic_task, wait_for=pairs_tasks, threads=1)
            hic_tasks.append(hic_task)
            hic_files.append(hic_file)

        for ix, i in enumerate(pairs_files):
            file_names[i] = hic_files[ix]
            file_types[i] = 'hic'
    else:
        hic_tasks = pairs_tasks

    # 9. Merge Hic
    hic_files = []
    for i in range(len(file_names)):
        if file_types[i] != 'hic':
            continue
        hic_files.append(i)

    if len(hic_files) > 1:
        merge_hic_tasks = []
        output_hic = output_folder + 'hic/' + basename + '.hic'
        if not force_overwrite and os.path.exists(output_hic):
            parser.error("File exists ({}), use -f to force overwriting it.".format(output_hic))

        merge_hic_command = fanc_base_command + ['hic', '-f']
        if tmp:
            merge_hic_command.append('-tmp')
        hics = [file_names[i] for i in hic_files]
        merge_hic_command += hics + [output_hic]
        merge_hic_task = CommandTask(merge_hic_command)
        runner.add_task(merge_hic_task, wait_for=hic_tasks, threads=1)
        merge_hic_tasks.append(merge_hic_task)

        # The merged file replaces the first Hic entry; remove the rest
        # (reverse order so deletions do not shift pending indices).
        file_names[hic_files[0]] = output_hic
        hic_files.pop(0)
        for ix, i in enumerate(reversed(hic_files)):
            del file_names[i]
            del file_types[i]
    else:
        merge_hic_tasks = hic_tasks

    from fanc.tools.general import human_format, str_to_int
    hic_files = []
    for i in range(len(file_names)):
        if file_types[i] != 'hic':
            continue
        hic_files.append(i)

    # Bin, filter and normalise the (merged) Hic at every requested resolution.
    if len(hic_files) > 0:
        for ix in hic_files:
            hic_file = file_names[ix]
            binned_hic_file_base = output_folder + 'hic/binned/' + basename + '_'
            bin_threads = min(4, threads)
            for bin_size in bin_sizes:
                bin_size = str_to_int(str(bin_size))
                bin_size_str = human_format(bin_size, 0).lower() + 'b'
                binned_hic_file = binned_hic_file_base + bin_size_str + '.hic'
                if not force_overwrite and os.path.exists(binned_hic_file):
                    parser.error("File exists ({}), use -f to force overwriting it.".format(binned_hic_file))

                hic_basename = os.path.basename(os.path.splitext(binned_hic_file)[0])
                hic_stats_file = output_folder + 'plots/stats/' + \
                    hic_basename + '.stats.pdf'

                hic_command = fanc_base_command + ['hic', '-f',
                                                   '-b', str(bin_size),
                                                   '-r', '0.1',
                                                   '-t', str(bin_threads),
                                                   '--statistics-plot', hic_stats_file,
                                                   '-n', '--norm-method', norm_method]
                if tmp:
                    hic_command.append('-tmp')
                if restore_coverage:
                    hic_command.append('-c')
                hic_command += [hic_file, binned_hic_file]
                hic_task = CommandTask(hic_command)
                runner.add_task(hic_task, wait_for=merge_hic_tasks, threads=bin_threads)

    # Execute the accumulated task graph.
    runner.run()
    return 0
# chr18: 899140-(899308[-1])-899476 -- chr18: 1509911-(1510021[1])-1510076 # end snippet pairs filter masked # start snippet pairs filter exclude for pair in pairs.pairs(excluded_filters=['self-ligation']): print(pair) # end snippet pairs filter exclude # start snippet hic convert hic_folder = mkdir(os.path.join(output_folder, 'hic')) hic_file = os.path.join(hic_folder, 'example.hic') hic = pairs.to_hic(file_name=hic_file) # end snippet hic convert hic.close() hic = fanc.load(hic_file) # start snippet hic bin binned_hic = hic.bin(1000000, file_name=os.path.join(hic_folder, 'binned_1mb.hic'), threads=4) # end snippet hic bin # start snippet hic filter from fanc.hic import LowCoverageFilter lc_filter = LowCoverageFilter(binned_hic, rel_cutoff=0.2) binned_hic.filter(lc_filter) binned_hic.run_queued_filters() # end snippet hic filter # start snippet hic balance
# mode="r") # Tollrm910_nc14_hic_plot = fancplot.HicPlot(Tollrm910_nc14_hic, vmin=1e-03, # vmax=1e-01, norm="log", # draw_minor_ticks=False, title="Tollrm910") # Toll10B_nc14_hic = fanc.load(os.path.join("data", "hic", "merged", # "Toll10B-nc14", "hic", # "Toll10B-nc14_1kb.hic"), mode="r") # Toll10B_nc14_hic_plot = fancplot.HicPlot(Toll10B_nc14_hic, vmin=1e-03, # vmax=1e-01, norm="log", # draw_minor_ticks=False, # title="Toll10B") gd7_microc = fanc.load(os.path.join("data", "micro-c", "merged", "gd7", "hic", "gd7_" + res + ".hic"), mode="r") gd7_microc_plot = fancplot.HicPlot(gd7_microc, vmin=1e-03, vmax=1e-01, norm="log", draw_minor_ticks=False, title="gd7") control_microc = fanc.load(os.path.join("data", "micro-c", "merged", "control", "hic", "control_" + res + ".hic"), mode="r") control_microc_plot = fancplot.HicPlot(control_microc, vmin=1e-03, vmax=1e-01, norm="log", draw_minor_ticks=False, title="control") genes = "external_data/flybase/dmel-all-r6.30.gtf.gz"
# start snippet ab setup import fanc import fanc.plotting as fancplot import matplotlib.pyplot as plt hic_1mb = fanc.load("output/hic/binned/fanc_example_1mb.hic") # end snippet ab setup # start snippet alternative cooler hic_1mb = fanc.load("architecture/other-hic/fanc_example.mcool@1mb") # end snippet alternative cooler # start snippet alternative juicer hic_1mb = fanc.load("architecture/other-hic/fanc_example.juicer.hic@1mb") # end snippet alternative juicer # start snippet ab matrix ab = fanc.ABCompartmentMatrix.from_hic(hic_1mb) # end snippet ab matrix # start snippet ab subset ab_chr18 = ab.matrix(('chr18', 'chr18')) # end snippet ab subset # start snippet ab fancplot-correlation fig, ax = plt.subplots() mp = fancplot.SquareMatrixPlot(ab, ax=ax, norm='lin', colormap='RdBu_r', vmin=-1, vmax=1, draw_minor_ticks=False) mp.plot('chr18')
def plot_regions(regions):
    """Render a multi-track genomic figure (Hi-C, genes, PolII, RNA-seq,
    H3K27ac, candidate enhancers) for each requested region and save one
    PDF page per region.

    Parameters
    ----------
    regions : iterable of (name, region, rnaseq_ylim) tuples
        `region` is passed to GenomicFigure.plot; `rnaseq_ylim` is the upper
        y-limit applied to the three RNA-seq track axes. `name` is unpacked
        but not used in the loop body.

    Relies on module-level names defined elsewhere in this file:
    `output_file`, `PdfPages`, `logging`, `fanc`, `fancplot`, `os`.
    """
    # 2 kb Hi-C matrix for the 3-4h embryo time point, drawn as a log-scaled
    # heatmap with a fixed color range.
    h = fanc.load(os.path.join("data", "hic", "merged", "3-4h", "hic",
                               "3-4h_2kb.hic"), mode="r")
    h_plot = fancplot.HicPlot(h, vmin=1e-03, vmax=1e-01, norm="log",
                              draw_minor_ticks=False)

    # FlyBase gene models, squashed into one row per gene symbol.
    genes = "external_data/flybase/dmel-all-r6.30.gtf.gz"
    genes_plot = fancplot.GenePlot(genes, squash=True, group_by="gene_symbol",
                                   aspect=0.15, label_field="gene_symbol",
                                   show_labels=False, draw_minor_ticks=False)

    # BigWig coverage tracks keyed by condition. Note the key spellings differ
    # between the two dicts ('tlrm910'/'tl10b' vs 'Tollrm910').
    rnaseq_dict = {
        name: os.path.join("external_data", "koenecke_2016_2017",
                           "rnaseq_aligned",
                           name + "_sorted_filtered_merged_canonical_chrs_rnaseq.bw")
        for name in ["gd7", "tlrm910", "tl10b"]
    }
    h3k27ac_dict = {
        name: os.path.join(
            "external_data", "koenecke_2016_2017", "chipseq_aligned",
            "H3K27ac_" + name + "_sorted_filtered_merged_canonical_chrs.bw")
        for name in ["gd7", "tl10b"]
    }
    h3k27ac_dict["Tollrm910"] = os.path.join(
        "external_data", "extra_chip-seq", "chipseq_aligned",
        "H3K27ac_Tollrm910_sorted_filtered_merged_canonical_chrs.bw")

    # Shared y-axis limit groups so related tracks use a common scale.
    # NOTE(review): `rnaseq_ylim` is never passed to any plot and is shadowed
    # by the loop variable of the same name below; the RNA-seq axes get their
    # limits from set_ylim instead. `h3k27ac_ylim` and `polii_ylim` are used.
    rnaseq_ylim = fancplot.helpers.LimitGroup()
    h3k27ac_ylim = fancplot.helpers.LimitGroup()
    polii_ylim = fancplot.helpers.LimitGroup()

    # polii_chip_early = os.path.join("external_data", "blythe_2015", "aligned",
    #                                 "PolII-pSer5_NC14-early_sorted_filtered_merged_canonical_chrs.bw")
    # polii_chip_mid = os.path.join("external_data", "blythe_2015", "aligned",
    #                               "PolII-pSer5_NC14-middle_sorted_filtered_merged_canonical_chrs.bw")
    polii_chip_late = os.path.join(
        "external_data", "blythe_2015", "aligned",
        "PolII-pSer5_NC14-late_sorted_filtered_merged_canonical_chrs.bw")

    # polii_early_plot = fancplot.LinePlot(polii_chip_early, fill=True, plot_kwargs={'color': "black"},
    #                                      draw_minor_ticks=False, aspect=0.05,
    #                                      ylim=polii_ylim, n_yticks=2)
    # polii_mid_plot = fancplot.LinePlot(polii_chip_mid, fill=True, plot_kwargs={'color': "black"},
    #                                    draw_minor_ticks=False, aspect=0.05,
    #                                    ylim=polii_ylim, n_yticks=2)
    polii_late_plot = fancplot.LinePlot(polii_chip_late, fill=True,
                                        plot_kwargs={'color': "black"},
                                        draw_minor_ticks=False, aspect=0.05,
                                        ylim=polii_ylim, n_yticks=2)

    # One RNA-seq + one H3K27ac track per condition, color-coded per condition
    # (gd7 blue, Tollrm910 magenta, Toll10B yellow). H3K27ac tracks share
    # h3k27ac_ylim; RNA-seq limits are set per region in the loop below.
    rnaseq_plot_gd7 = fancplot.LinePlot(rnaseq_dict['gd7'], fill=True,
                                        plot_kwargs={'color': "#648fff"},
                                        draw_minor_ticks=False, aspect=0.05,
                                        n_yticks=2)
    h3k27ac_plot_gd7 = fancplot.LinePlot(h3k27ac_dict['gd7'], fill=True,
                                         plot_kwargs={'color': "#648fff"},
                                         draw_minor_ticks=False, aspect=0.05,
                                         ylim=h3k27ac_ylim, n_yticks=2)
    rnaseq_plot_Tollrm910 = fancplot.LinePlot(rnaseq_dict['tlrm910'], fill=True,
                                              plot_kwargs={'color': "#dc267f"},
                                              draw_minor_ticks=False, aspect=0.05,
                                              n_yticks=2)
    h3k27ac_plot_Tollrm910 = fancplot.LinePlot(
        h3k27ac_dict['Tollrm910'], fill=True,
        plot_kwargs={'color': "#dc267f"},
        draw_minor_ticks=False, aspect=0.05,
        ylim=h3k27ac_ylim, n_yticks=2)
    rnaseq_plot_toll10b = fancplot.LinePlot(rnaseq_dict['tl10b'], fill=True,
                                            plot_kwargs={'color': "#ffb000"},
                                            draw_minor_ticks=False, aspect=0.05,
                                            n_yticks=2)
    h3k27ac_plot_toll10b = fancplot.LinePlot(h3k27ac_dict['tl10b'], fill=True,
                                             plot_kwargs={'color': "#ffb000"},
                                             draw_minor_ticks=False, aspect=0.05,
                                             ylim=h3k27ac_ylim, n_yticks=2)

    # Candidate enhancer interval tracks, one per condition.
    gd7_enh = "data/supplementary_tables/gd7_candidate_enhancers.bed"
    gd7_enh_plot = fancplot.GenomicFeaturePlot(gd7_enh, aspect=0.02,
                                               color="#648fff",
                                               draw_minor_ticks=False)
    Tollrm910_enh = "data/supplementary_tables/Tollrm910_candidate_enhancers.bed"
    Tollrm910_enh_plot = fancplot.GenomicFeaturePlot(Tollrm910_enh, aspect=0.02,
                                                     color="#dc267f",
                                                     draw_minor_ticks=False)
    toll10b_enh = "data/supplementary_tables/Toll10B_candidate_enhancers.bed"
    toll10b_enh_plot = fancplot.GenomicFeaturePlot(toll10b_enh, aspect=0.02,
                                                   color="#ffb000",
                                                   draw_minor_ticks=False)

    # Track order determines axis indices: axes[3..5] below are the three
    # RNA-seq tracks (h_plot=0, genes_plot=1, polii_late_plot=2).
    plots = [
        h_plot,
        # ins_plot,
        # boundaries_plot,
        genes_plot,
        # hk_plot,
        # polii_early_plot, polii_mid_plot,
        polii_late_plot,
        rnaseq_plot_gd7, rnaseq_plot_Tollrm910, rnaseq_plot_toll10b,
        h3k27ac_plot_gd7, h3k27ac_plot_Tollrm910, h3k27ac_plot_toll10b,
        gd7_enh_plot, Tollrm910_enh_plot, toll10b_enh_plot
    ]

    # One PDF page per region; `output_file` is presumably a module-level
    # path — verify against caller.
    with PdfPages(output_file) as pdf:
        with fancplot.GenomicFigure(plots, ticks_last=True) as gfig:
            for name, region, rnaseq_ylim in regions:
                logging.info(region)
                fig, axes = gfig.plot(region)
                # Apply the per-region y-limit to the three RNA-seq axes.
                axes[3].set_ylim([0, rnaseq_ylim])
                axes[4].set_ylim([0, rnaseq_ylim])
                axes[5].set_ylim([0, rnaseq_ylim])
                pdf.savefig()
# start snippet oe setup import fanc import matplotlib.pyplot as plt import fanc.plotting as fancplot hic_500kb = fanc.load("output/hic/binned/fanc_example_500kb.hic") # end snippet oe setup hic_500kb.close() # start snippet oe append hic_500kb = fanc.load("output/hic/binned/fanc_example_500kb.hic", mode='a') # end snippet oe append # start snippet alternative cooler hic_500kb = fanc.load("architecture/other-hic/fanc_example.mcool@500kb") # end snippet alternative cooler # start snippet alternative juicer hic_500kb = fanc.load("architecture/other-hic/fanc_example.juicer.hic@500kb") # end snippet alternative juicer # start snippet oe basic intra_expected, intra_expected_chromosome, inter_expected = hic_500kb.expected_values( ) # end snippet oe basic # start snippet oe ddplot # obtain bin distances bin_size = hic_500kb.bin_size distance = list( range(0, bin_size * len(intra_expected_chromosome['chr19']), bin_size))