def test_correct_multidim_result(self):
    data = np.random.randn(10, 16, 2)
    correct_ans = np.array([dtw_std(x, y) for x, y in combinations(data, 2)])
    parallel_ans = parallel_pdist(data, n_processes=1)
    assert_array_equal(correct_ans, parallel_ans)
def test_result_returned_same_as_scipy_spatial_distance_pdist_one_process_with_metric(self):
    correct_ans = pdist(self.sample_data,
                        lambda x, y: dtw_std(x, y, metric='euclidean'))
    parallel_ans = parallel_pdist(self.sample_data_three_dim,
                                  n_processes=1, metric='euclidean')
    assert_array_equal(correct_ans, parallel_ans)
def test_result_returned_same_as_scipy_spatial_distance_pdist(self):
    correct_ans = pdist(self.sample_data, dtw_std)
    parallel_ans = parallel_pdist(self.sample_data_three_dim)
    assert_array_equal(correct_ans, parallel_ans)
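# A sketch of an extra consistency check, not part of the original suite: it
# assumes parallel_pdist accepts n_processes > 1 with the same signature used
# above, and that splitting the pairwise computations across processes changes
# neither the order nor the values of the returned condensed distance matrix.
def test_result_same_regardless_of_process_count(self):
    single = parallel_pdist(self.sample_data_three_dim, n_processes=1)
    multi = parallel_pdist(self.sample_data_three_dim, n_processes=2)
    assert_array_equal(single, multi)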
def main():
    # --- Argument parsing -----------------------
    parser = argument_parser()
    args = parser.parse_args()

    if args.datasets and args.processed_dataset:
        parser.error('Must specify either --dataset or --processed_dataset only.')
    elif not args.processed_dataset:
        if not args.regions or not args.datasets:
            parser.error('Must specify both --regions and --dataset')

    if args.metric is None:
        if args.processed_dataset:
            parser.error('Must provide a metric if using processed dataset')
        elif len(args.datasets) >= 2:
            print "> Defaulting to cosine distance as more than one dataset was given"
            args.metric = 'cosine'
        else:
            print "> Defaulting to sqeuclidean distance as only one dataset was given"
            args.metric = 'sqeuclidean'
    elif args.metric == 'cosine':
        if args.datasets and len(args.datasets) < 2:
            parser.error('Cannot use cosine distance with just one dataset. '
                         'Choose sqeuclidean or euclidean instead.')

    if args.no_dtw:
        # This is what --no-dtw actually does
        args.slanted_band = 0
        args.scale = True
        if args.prototyping_method is None:
            args.prototyping_method = 'mean'
    else:
        if args.prototyping_method is None:
            args.prototyping_method = 'standard'

    if args.verbose:
        logging.root.setLevel(logging.DEBUG)

    # Disable trying to reverse regions if strand information is given
    if args.use_strand_information:
        args.no_reverse = True

    configuration = Configuration(args)

    # --- Pre-processing ------------------------
    if args.regions:
        print '> Reading regions from {0!r} ...'.format(args.regions)
        regions, total_regions, used_regions = read_regions(args.regions,
                                                            args.random_sample,
                                                            args.resolution)

        if args.use_strand_information and not regions.has_strand_data():
            logging.debug('Parsed columns: {0}'.format(regions.columns))
            parser.error('--use-strand-information is set but the input BED file '
                         'has no strand information.')

        # Drop regions shorter than the --min-bins threshold
        too_short_regions = (regions.lengths / args.resolution) < args.min_bins
        too_short_regions = regions.ix[too_short_regions[too_short_regions].index]
        if len(too_short_regions) > 0:
            print '> {0} regions are shorter than {1} bins. Saving them to {2!r} as they won\'t be processed'\
                .format(len(too_short_regions), args.min_bins, configuration.too_short_regions_filename)
            too_short_regions.to_bed(configuration.too_short_regions_filename)
            regions = regions.ix[regions.index - too_short_regions.index]

        if args.max_bins:
            # Drop regions longer than the --max-bins threshold
            too_long_regions = (regions.lengths / args.resolution) >= args.max_bins
            too_long_regions = regions.ix[too_long_regions[too_long_regions].index]
            if len(too_long_regions) > 0:
                print '> {0} regions are longer than {1} bins. ' \
                      'Saving them to {2!r} as they won\'t be processed due to the --max-bins constraint'\
                    .format(len(too_long_regions), args.max_bins, configuration.too_long_regions_filename)
                too_long_regions.to_bed(configuration.too_long_regions_filename)
                regions = regions.ix[regions.index - too_long_regions.index]

        print '> {0} regions remain'.format(len(regions))
    else:
        regions = None

    if args.points_of_interest:
        print '> Reading points of interest'
        poi_file = args.points_of_interest
        try:
            poi = from_simple(poi_file, regions, resolution=configuration.resolution)
        except ValueError:
            poi = Regions.from_bed(poi_file)
            poi = poi.as_bins_of(regions, resolution=configuration.resolution,
                                 ignore_non_overlaps=args.ignore_poi_non_overlaps,
                                 account_for_strand_information=configuration.use_strand_information)

        if not poi:
            raise Exception('POI file provided, but no POIs were parsed from {}. '
                            'Try using dgw-overlaps2poi'.format(poi_file))
    else:
        poi = None

    print '> Reading dataset ...'
    dataset, missing_regions, filtered_regions = read_datasets(args, regions)

    if args.datasets:
        if poi:
            dataset.add_points_of_interest(poi, name=args.points_of_interest)
            if args.ignore_no_poi_regions:
                poi_dataset = dataset.drop_no_pois()
                if len(poi_dataset) != len(dataset):
                    dropped_regions = regions.ix[dataset.items - poi_dataset.items]
                    print '> {0} regions were removed as they have no POI data associated with them ' \
                          'and --ignore-no-poi-regions was set'.format(len(dropped_regions))
                    print '> Saving them to {0!r}'.format(configuration.no_poi_regions_filename)
                    dropped_regions.to_bed(configuration.no_poi_regions_filename)
                    dataset = poi_dataset
                    del dropped_regions
                del poi_dataset

        if len(missing_regions) > 0:
            print "> {0} regions were not found in the dataset; they were saved to {1}".format(
                len(missing_regions), configuration.missing_regions_filename)
            regions.ix[missing_regions].to_bed(configuration.missing_regions_filename,
                                               track_title='DGWMissingRegions',
                                               track_description='Regions that are in the input, but missing from the dataset')

        if len(filtered_regions) > 0:
            print "> {0} regions were filtered out of the dataset due to the --min-pileup constraint; they were saved to {1}".format(
                len(filtered_regions), configuration.filtered_regions_filename)
            regions.ix[filtered_regions].to_bed(configuration.filtered_regions_filename,
                                                track_title='DGWFilteredRegions',
                                                track_description='Regions that were filtered out from the dataset')

        # Get the remaining regions
        regions = regions.ix[dataset.items]
        if len(missing_regions) > 0 or len(filtered_regions) > 0:
            print '> {0} regions remain and will be processed'.format(len(regions))

        if args.output_raw_dataset:
            print '> Serialising raw dataset to {0}'.format(configuration.raw_dataset_filename)
            serialise(dataset, configuration.raw_dataset_filename)

        dataset = dataset.to_log_scale()

        if args.normalise_pileups:
            print '> Dividing the number of reads in each bin by the maximum number of reads per region as --normalise-pileups is set'
            dataset = dataset.normalise_bin_heights()

        missing_regions = regions.index - dataset.items
        if len(missing_regions) > 0:
            print "> {0} regions were not found in the dataset; they were saved to {1}".format(
                len(missing_regions), configuration.missing_regions_filename)
            regions.ix[missing_regions].to_bed(configuration.missing_regions_filename,
                                               track_title='DGWMissingRegions',
                                               track_description='Regions that are in the input, but missing from the dataset')
    else:
        print "> Not converting dataset to log scale as a processed dataset was already provided"

    # --- Serialise the regions as they will be needed in the explorer ----------
    if regions is not None:
        print '> Serialising regions to {0}'.format(configuration.parsed_regions_filename)
        serialise(regions, configuration.parsed_regions_filename)

    # --- Saving of dataset -------------------
    print '> Saving dataset to {0}'.format(configuration.dataset_filename)
    serialise(dataset, configuration.dataset_filename)

    if not args.blank:
        # --- Actual work ---------------------------
        print '> Calculating pairwise distances (this might take a while) ...'
        if args.n_processes is not None:
            print '> Using {0} processes'.format(args.n_processes)
        else:
            args.n_processes = cpu_count()
            print '> Using all available cpu cores ({0})'.format(args.n_processes)

        if args.no_dtw:
            print '> Not using DTW as the --no-dtw option is set'

        logging.debug('Running DTW with the following kwargs: {0!r}'.format(configuration.dtw_kwargs))
        start = datetime.now()
        dm = parallel_pdist(dataset, args.n_processes, **configuration.dtw_kwargs)
        end = datetime.now()

        delta = end - start
        print '> Pairwise distances calculation took {0} s'.format(delta.total_seconds())

        if args.random_sample:
            multiplier = binomial_coefficent(total_regions, 2) / float(binomial_coefficent(args.random_sample, 2))
            print '> Expected calculation duration if --random-sample was not used: {0} s'\
                .format(delta.total_seconds() * multiplier)

        # --- Saving of the work --------------
        if configuration.pairwise_distances_filename:
            print '> Saving the pairwise distance matrix to {0!r}'.format(configuration.pairwise_distances_filename)
            np.save(configuration.pairwise_distances_filename, dm)

        # Linkage matrix
        print '> Computing linkage matrix'
        linkage = fastcluster.complete(dm)
        print '> Saving linkage matrix to {0!r}'.format(configuration.linkage_filename)
        np.save(configuration.linkage_filename, linkage)

        print '> Computing prototypes'
        # Hierarchical clustering object to compute the prototypes
        hc = HierarchicalClustering(dataset, regions, linkage,
                                    dtw_function=configuration.dtw_function,
                                    prototyping_method=configuration.prototyping_method)
        prototypes = hc.extract_prototypes()
        print '> Saving prototypes to {0!r}'.format(configuration.prototypes_filename)
        serialise(prototypes, configuration.prototypes_filename)

        print '> Computing warping paths'
        nodes = hc.tree_nodes_list
        paths = compute_paths(dataset, nodes, hc.num_obs,
                              n_processes=args.n_processes,
                              **configuration.dtw_kwargs)
        print '> Saving warping paths to {0!r}'.format(configuration.warping_paths_filename)
        serialise(paths, configuration.warping_paths_filename)
    else:
        print '> Skipping pairwise distances step because the --blank option is set'

    print '> Saving configuration to {0!r}'.format(configuration.configuration_filename)
    f = open(configuration.configuration_filename, 'w')
    try:
        configuration.to_json(f)
    finally:
        f.close()

    print '> Done'
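# Minimal entry-point guard, a sketch only: the excerpt above does not show how
# main() is invoked, so this assumes the module is run directly as a script.
if __name__ == '__main__':
    main()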