def fromSegments(options, args):
    '''run analysis from segment files.

    This is the most common use case.

    Intervals are loaded with ``IO.buildSegments``, restricted to the
    workspace with ``IO.applyIsochores`` and then handed to ``gat.run``
    together with the sampler, counters and workspace generator
    selected in *options*.

    *args* is accepted for interface compatibility but is not used.

    Raises ValueError if *options* names an unknown sampler, counter or
    conditional workspace, if a centered conditional workspace is
    requested without an expansion or extension, or if
    ``options.reference`` is given but lacks a track or annotation.

    returns the value returned by ``gat.run``.
    '''
    tstart = time.time()

    # build segments
    segments, annotations, workspaces, isochores = IO.buildSegments(options)
    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    # open various additional output files
    outfiles = {}
    for section in ("sample", "segment_metrics", "sample_metrics"):
        # a section is selected by exact name, by "all", or by any
        # entry of output_stats that matches the section name as a
        # regular expression
        if section in options.output_stats or \
           "all" in options.output_stats or \
           any(re.search(x, section) for x in options.output_stats):
            outfiles[section] = E.openOutputFile(section)

    if 'sample_metrics' in outfiles:
        outfiles['sample_metrics'].write(
            "track\tsection\tmetric\t%s\n" %
            "\t".join(Stats.Summary().getHeaders()))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments,
        annotations,
        workspaces,
        options,
        isochores,
        truncate_segments_to_workspace=options.truncate_segments_to_workspace,
        truncate_workspace_to_annotations=options.truncate_workspace_to_annotations,
        restrict_workspace=options.restrict_workspace)

    # initialize sampler
    if options.sampler == "annotator":
        sampler = GatEngine.SamplerAnnotator(
            bucket_size=options.bucket_size,
            nbuckets=options.nbuckets)
    elif options.sampler == "shift":
        sampler = GatEngine.SamplerShift(
            radius=options.shift_expansion,
            extension=options.shift_extension)
    elif options.sampler == "segments":
        sampler = GatEngine.SamplerSegments()
    elif options.sampler == "local-permutation":
        sampler = GatEngine.SamplerLocalPermutation()
    elif options.sampler == "global-permutation":
        sampler = GatEngine.SamplerGlobalPermutation()
    elif options.sampler == "brute-force":
        sampler = GatEngine.SamplerBruteForce()
    elif options.sampler == "uniform":
        sampler = GatEngine.SamplerUniform()
    else:
        # fail here rather than with an unbound local further down
        raise ValueError("unknown sampler '%s'" % options.sampler)

    # initialize counter
    counters = []
    for counter in options.counters:
        if counter == "nucleotide-overlap":
            counters.append(GatEngine.CounterNucleotideOverlap())
        elif counter == "nucleotide-density":
            counters.append(GatEngine.CounterNucleotideDensity())
        elif counter == "segment-overlap":
            counters.append(GatEngine.CounterSegmentOverlap())
        elif counter == "annotations-overlap":
            counters.append(GatEngine.CounterAnnotationsOverlap())
        elif counter == "segment-midoverlap":
            counters.append(GatEngine.CounterSegmentMidpointOverlap())
        elif counter == "annotations-midoverlap":
            counters.append(GatEngine.CounterAnnotationsMidpointOverlap())
        else:
            raise ValueError("unknown counter '%s'" % counter)

    # initialize workspace generator
    if options.conditional == "unconditional":
        workspace_generator = GatEngine.UnconditionalWorkspace()
    elif options.conditional == "cooccurance":
        workspace_generator = GatEngine.ConditionalWorkspaceCooccurance()
    elif options.conditional == "annotation-centered":
        # at least one of expansion/extension is required
        if options.conditional_extension is None and \
           options.conditional_expansion is None:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = GatEngine.ConditionalWorkspaceAnnotationCentered(
            options.conditional_extension,
            options.conditional_expansion)
    elif options.conditional == "segment-centered":
        # at least one of expansion/extension is required
        if options.conditional_extension is None and \
           options.conditional_expansion is None:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = GatEngine.ConditionalWorkspaceSegmentCentered(
            options.conditional_extension,
            options.conditional_expansion)
    else:
        raise ValueError("unknown conditional workspace '%s'" %
                         options.conditional)

    # check if reference is complete: every segment track needs an
    # entry, and every entry needs all annotation tracks
    if options.reference:
        for track in segments.tracks:
            if track not in options.reference:
                raise ValueError("missing track '%s' in reference" % track)
            r = options.reference[track]
            for annotation in annotations.tracks:
                if annotation not in r:
                    raise ValueError(
                        "missing annotation '%s' in annotations for "
                        "track='%s'" % (annotation, track))

    # compute
    annotator_results = gat.run(
        segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator=workspace_generator,
        num_samples=options.num_samples,
        cache=options.cache,
        outfiles=outfiles,
        output_counts_pattern=options.output_counts_pattern,
        output_samples_pattern=options.output_samples_pattern,
        sample_files=options.sample_files,
        conditional=options.conditional,
        conditional_extension=options.conditional_extension,
        reference=options.reference,
        pseudo_count=options.pseudo_count,
        num_threads=options.num_threads)

    return annotator_results
def dumpStats(coll, section, options):
    '''write statistics of *coll* to the output file for *section*.

    Output happens only if *section* is listed in
    ``options.output_stats``, if ``"all"`` is listed, or if any entry
    of ``options.output_stats`` matches *section* as a regular
    expression. Otherwise nothing is done.
    '''
    wanted = section in options.output_stats or \
        "all" in options.output_stats
    if not wanted:
        # every entry is tried as a pattern against the section name
        hits = [pattern for pattern in options.output_stats
                if re.search(pattern, section)]
        wanted = len(hits) > 0
    if not wanted:
        return
    coll.outputStats(E.openOutputFile(section))
def dumpBed(coll, section, options):
    '''save *coll* to the output file ``<section>.bed``.

    Output happens only if *section* is listed in
    ``options.output_bed``, if ``"all"`` is listed, or if any entry of
    ``options.output_bed`` matches *section* as a regular expression.
    Otherwise nothing is done.
    '''
    wanted = section in options.output_bed or \
        "all" in options.output_bed
    if not wanted:
        # every entry is tried as a pattern against the section name
        hits = [pattern for pattern in options.output_bed
                if re.search(pattern, section)]
        wanted = len(hits) > 0
    if not wanted:
        return
    coll.save(E.openOutputFile(section + ".bed"))
def applyIsochores(
        segments,
        annotations,
        workspaces,
        options,
        isochores=None,
        truncate_segments_to_workspace=False,
        truncate_workspace_to_annotations=False,
        restrict_workspace=False,
):
    '''apply isochores to segments and annotations.

    Segments and annotations are filtered in place to keep only those
    overlapping the workspace.

    If *isochores* are given, isochores are applied.

    If *truncate_segments_to_workspace*, truncate segments to
    workspace. For backward compatibility,
    ``options.truncate_segments_to_workspace`` is used when the
    argument is not set.

    If *restrict_workspace* is set, the workspace is confined to
    those parts that overlap both a segment and an annotation.

    If *truncate_workspace_to_annotations* is set, the workspace
    is truncated to keep only those parts that overlap annotations.

    Depending on ``options.output_stats`` and ``options.output_bed``,
    statistics and bed files of the intermediate collections are
    written via ``dumpStats`` and ``dumpBed``.

    Raises ValueError if, after applying isochores, workspaces,
    annotations or segments are empty.

    returns the ``collapsed`` track of *workspaces*.
    '''
    # honour the explicit argument; fall back to the option so callers
    # relying on options alone keep their behaviour
    truncate_segments = (truncate_segments_to_workspace or
                         options.truncate_segments_to_workspace)

    if isochores:
        # intersect isochores and workspaces, segments and annotations
        # workspace and annotations are truncated
        # with segments it is optional.
        E.info("adding isochores to workspace")
        workspaces.toIsochores(isochores, truncate=True)
        annotations.toIsochores(isochores, truncate=True)
        segments.toIsochores(isochores, truncate=truncate_segments)

        if workspaces.sum() == 0:
            raise ValueError("isochores and workspaces do not overlap")
        if annotations.sum() == 0:
            raise ValueError("isochores and annotations do not overlap")
        if segments.sum() == 0:
            raise ValueError("isochores and segments do not overlap")

        dumpStats(workspaces, "stats_workspaces_isochores", options)
        dumpStats(annotations, "stats_annotations_isochores", options)
        dumpStats(segments, "stats_segments_isochores", options)

        dumpBed(workspaces, "workspaces_isochores", options)
        dumpBed(annotations, "annotations_isochores", options)
        dumpBed(segments, "segments_isochores", options)

    else:
        # intersect workspace and segments/annotations
        # annotations and segments are truncated by workspace
        if truncate_segments:
            segments.intersect(workspaces["collapsed"])
        else:
            segments.filter(workspaces["collapsed"])

        annotations.intersect(workspaces["collapsed"])

        dumpStats(annotations, "stats_annotations_truncated", options)
        dumpStats(segments, "stats_segments_truncated", options)

    workspace = workspaces["collapsed"]

    if restrict_workspace:
        E.info("restricting workspace")
        # this is very cumbersome - refactor merge and collapse
        # to return an IntervalDictionary instead of adding it
        # to the list of tracks
        for collection in (segments, annotations):
            if "merged" in collection:
                workspace.filter(collection["merged"])
            else:
                # build the merged track temporarily and remove it
                # again so the collection is left as it was found
                collection.merge()
                workspace.filter(collection["merged"])
                del collection["merged"]

        dumpStats(workspaces, "stats_workspaces_restricted", options)

    if truncate_workspace_to_annotations:
        E.info("truncating workspace to annotations")
        annotations.merge()
        annotations["merged"].normalize()
        workspace.intersect(annotations["merged"])
        del annotations["merged"]

        dumpStats(workspaces, "stats_workspaces_truncated", options)

    # output overlap stats
    # output segment densities per workspace
    if "overlap" in options.output_stats or \
       "all" in options.output_stats:
        for track in segments.tracks:
            workspaces.outputOverlapStats(
                E.openOutputFile("overlap_%s" % track),
                segments[track])

    return workspace
def applyIsochores(segments,
                   annotations,
                   workspaces,
                   options,
                   isochores=None,
                   truncate_segments_to_workspace=False,
                   truncate_workspace_to_annotations=False,
                   restrict_workspace=False,
                   ):
    '''apply isochores to segments and annotations.

    Segments and annotations are filtered in place to keep only those
    overlapping the workspace.

    If *isochores* are given, isochores are applied.

    If *truncate_segments_to_workspace*, truncate segments to
    workspace. For backward compatibility,
    ``options.truncate_segments_to_workspace`` is used when the
    argument is not set.

    If *restrict_workspace* is set, the workspace is confined to
    those parts that overlap both a segment and an annotation.

    If *truncate_workspace_to_annotations* is set, the workspace
    is truncated to keep only those parts that overlap annotations.

    Depending on ``options.output_stats`` and ``options.output_bed``,
    statistics and bed files of the intermediate collections are
    written via ``dumpStats`` and ``dumpBed``.

    Raises ValueError if, after applying isochores, workspaces,
    annotations or segments are empty.

    returns the ``collapsed`` track of *workspaces*.
    '''
    # honour the explicit argument; fall back to the option so callers
    # relying on options alone keep their behaviour
    truncate_segments = (truncate_segments_to_workspace or
                         options.truncate_segments_to_workspace)

    if isochores:
        # intersect isochores and workspaces, segments and annotations
        # workspace and annotations are truncated
        # with segments it is optional.
        E.info("adding isochores to workspace")
        workspaces.toIsochores(isochores, truncate=True)
        annotations.toIsochores(isochores, truncate=True)
        segments.toIsochores(isochores, truncate=truncate_segments)

        if workspaces.sum() == 0:
            raise ValueError("isochores and workspaces do not overlap")
        if annotations.sum() == 0:
            raise ValueError("isochores and annotations do not overlap")
        if segments.sum() == 0:
            raise ValueError("isochores and segments do not overlap")

        dumpStats(workspaces, "stats_workspaces_isochores", options)
        dumpStats(annotations, "stats_annotations_isochores", options)
        dumpStats(segments, "stats_segments_isochores", options)

        dumpBed(workspaces, "workspaces_isochores", options)
        dumpBed(annotations, "annotations_isochores", options)
        dumpBed(segments, "segments_isochores", options)

    else:
        # intersect workspace and segments/annotations
        # annotations and segments are truncated by workspace
        if truncate_segments:
            segments.intersect(workspaces["collapsed"])
        else:
            segments.filter(workspaces["collapsed"])

        annotations.intersect(workspaces["collapsed"])

        dumpStats(annotations, "stats_annotations_truncated", options)
        dumpStats(segments, "stats_segments_truncated", options)

    workspace = workspaces["collapsed"]

    if restrict_workspace:
        E.info("restricting workspace")
        # this is very cumbersome - refactor merge and collapse
        # to return an IntervalDictionary instead of adding it
        # to the list of tracks
        for collection in (segments, annotations):
            if "merged" in collection:
                workspace.filter(collection["merged"])
            else:
                # build the merged track temporarily and remove it
                # again so the collection is left as it was found
                collection.merge()
                workspace.filter(collection["merged"])
                del collection["merged"]

        dumpStats(workspaces, "stats_workspaces_restricted", options)

    if truncate_workspace_to_annotations:
        E.info("truncating workspace to annotations")
        annotations.merge()
        workspace.intersect(annotations["merged"])
        del annotations["merged"]

        dumpStats(workspaces, "stats_workspaces_truncated", options)

    # output overlap stats
    # output segment densities per workspace
    if "overlap" in options.output_stats or \
       "all" in options.output_stats:
        for track in segments.tracks:
            workspaces.outputOverlapStats(
                E.openOutputFile("overlap_%s" % track),
                segments[track])

    return workspace
def fromSegments(options, args):
    '''run analysis from segment files.

    This is the most common use case.

    Intervals are loaded with ``IO.buildSegments``, restricted to the
    workspace with ``IO.applyIsochores`` and then handed to ``gat.run``
    together with the sampler, counters and workspace generator
    selected in *options*.

    *args* is accepted for interface compatibility but is not used.

    Raises ValueError if *options* names an unknown sampler, counter or
    conditional workspace, if a centered conditional workspace is
    requested without an expansion or extension, or if
    ``options.reference`` is given but lacks a track or annotation.

    returns the value returned by ``gat.run``.
    '''
    tstart = time.time()

    # build segments
    segments, annotations, workspaces, isochores = IO.buildSegments(options)
    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    # open various additional output files
    outfiles = {}
    for section in ("sample", "segment_metrics", "sample_metrics"):
        # a section is selected by exact name, by "all", or by any
        # entry of output_stats that matches the section name as a
        # regular expression
        if section in options.output_stats or \
           "all" in options.output_stats or \
           any(re.search(x, section) for x in options.output_stats):
            outfiles[section] = E.openOutputFile(section)

    if 'sample_metrics' in outfiles:
        outfiles['sample_metrics'].write(
            "track\tsection\tmetric\t%s\n" %
            "\t".join(Stats.Summary().getHeaders()))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments,
        annotations,
        workspaces,
        options,
        isochores,
        truncate_segments_to_workspace=options.truncate_segments_to_workspace,
        truncate_workspace_to_annotations=options.truncate_workspace_to_annotations,
        restrict_workspace=options.restrict_workspace)

    # initialize sampler
    if options.sampler == "annotator":
        sampler = Engine.SamplerAnnotator(bucket_size=options.bucket_size,
                                          nbuckets=options.nbuckets)
    elif options.sampler == "shift":
        sampler = Engine.SamplerShift(radius=options.shift_expansion,
                                      extension=options.shift_extension)
    elif options.sampler == "segments":
        sampler = Engine.SamplerSegments()
    elif options.sampler == "local-permutation":
        sampler = Engine.SamplerLocalPermutation()
    elif options.sampler == "global-permutation":
        sampler = Engine.SamplerGlobalPermutation()
    elif options.sampler == "brute-force":
        sampler = Engine.SamplerBruteForce()
    elif options.sampler == "uniform":
        sampler = Engine.SamplerUniform()
    else:
        # fail here rather than with an unbound local further down
        raise ValueError("unknown sampler '%s'" % options.sampler)

    # initialize counter
    counters = []
    for counter in options.counters:
        if counter == "nucleotide-overlap":
            counters.append(Engine.CounterNucleotideOverlap())
        elif counter == "nucleotide-density":
            counters.append(Engine.CounterNucleotideDensity())
        elif counter == "segment-overlap":
            counters.append(Engine.CounterSegmentOverlap())
        elif counter == "annotation-overlap":
            counters.append(Engine.CounterAnnotationOverlap())
        elif counter == "segment-midoverlap":
            counters.append(Engine.CounterSegmentMidpointOverlap())
        elif counter == "annotation-midoverlap":
            counters.append(Engine.CounterAnnotationMidpointOverlap())
        else:
            raise ValueError("unknown counter '%s'" % counter)

    # initialize workspace generator
    if options.conditional == "unconditional":
        workspace_generator = Engine.UnconditionalWorkspace()
    elif options.conditional == "cooccurance":
        workspace_generator = Engine.ConditionalWorkspaceCooccurance()
    elif options.conditional == "annotation-centered":
        # either option is sufficient, as the error message states;
        # only reject the request when both are missing
        if options.conditional_extension is None and \
           options.conditional_expansion is None:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = Engine.ConditionalWorkspaceAnnotationCentered(
            options.conditional_extension,
            options.conditional_expansion)
    elif options.conditional == "segment-centered":
        # either option is sufficient, as the error message states;
        # only reject the request when both are missing
        if options.conditional_extension is None and \
           options.conditional_expansion is None:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = Engine.ConditionalWorkspaceSegmentCentered(
            options.conditional_extension,
            options.conditional_expansion)
    else:
        raise ValueError("unknown conditional workspace '%s'" %
                         options.conditional)

    # check if reference is complete: every segment track needs an
    # entry, and every entry needs all annotation tracks
    if options.reference:
        for track in segments.tracks:
            if track not in options.reference:
                raise ValueError("missing track '%s' in reference" % track)
            r = options.reference[track]
            for annotation in annotations.tracks:
                if annotation not in r:
                    raise ValueError(
                        "missing annotation '%s' in annotations for "
                        "track='%s'" % (annotation, track))

    # compute
    annotator_results = gat.run(
        segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator=workspace_generator,
        num_samples=options.num_samples,
        cache=options.cache,
        outfiles=outfiles,
        output_counts_pattern=options.output_counts_pattern,
        output_samples_pattern=options.output_samples_pattern,
        sample_files=options.sample_files,
        conditional=options.conditional,
        conditional_extension=options.conditional_extension,
        reference=options.reference,
        pseudo_count=options.pseudo_count,
        num_threads=options.num_threads)

    return annotator_results