Example #1
    def test_correct_multidim_result(self):

        data = np.random.randn(10, 16, 2)

        correct_ans = np.array([dtw_std(x, y) for x, y in combinations(data, 2)])

        parallel_ans = parallel_pdist(data, n_processes=1)
        assert_array_equal(correct_ans, parallel_ans)
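
The assertion above works because iterating itertools.combinations(data, 2) yields pairs in exactly the condensed order used by scipy.spatial.distance.pdist, which the later examples compare against parallel_pdist directly. Below is a minimal, self-contained sketch of that ordering guarantee; it is not dgw code, and plain Euclidean distance stands in for dtw_std.

# Sketch (not from dgw): itertools.combinations enumerates pairs in the same
# condensed order that scipy.spatial.distance.pdist uses, which is why a flat
# array comparison like the assertion above is valid. Euclidean distance
# stands in for dtw_std here.
from itertools import combinations

import numpy as np
from numpy.testing import assert_allclose
from scipy.spatial.distance import pdist

data = np.random.randn(10, 16)  # 10 observations, 16 samples each
loop_ans = np.array([np.linalg.norm(x - y) for x, y in combinations(data, 2)])
assert_allclose(loop_ans, pdist(data, metric='euclidean'))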
Example #2
    def test_result_returned_same_as_scipy_spatial_distance_pdist_one_process_with_metric(self):
        correct_ans = pdist(self.sample_data, lambda x, y: dtw_std(x, y, metric='euclidean'))
        parallel_ans = parallel_pdist(self.sample_data_three_dim, n_processes=1, metric='euclidean')
        assert_array_equal(correct_ans, parallel_ans)
Example #3
    def test_result_returned_same_as_scipy_spatial_distance_pdist(self):
        correct_ans = pdist(self.sample_data, dtw_std)
        parallel_ans = parallel_pdist(self.sample_data_three_dim)
        assert_array_equal(correct_ans, parallel_ans)
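
Both reference computations above hand a plain Python callable to scipy's pdist, which accepts any function of two 1-D vectors. The sketch below illustrates that pattern on its own; it is not dgw code, and a toy metric stands in for dtw_std.

# Sketch (not from dgw): scipy.spatial.distance.pdist accepts an arbitrary
# Python callable taking two 1-D arrays, which is how the reference answers
# in these tests are built. A toy metric stands in for dtw_std.
import numpy as np
from scipy.spatial.distance import pdist

sample_data = np.random.randn(6, 8)  # 6 observations, 8 samples each
reference = pdist(sample_data, lambda x, y: np.abs(x - y).sum())
print(reference.shape)  # condensed vector: 6 choose 2 == 15 entries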
Example #4
File: worker.py  Project: lukauskas/dgw
def main():
    # --- Argument parsing -----------------------
    parser = argument_parser()

    args = parser.parse_args()
    if args.datasets and args.processed_dataset:
        parser.error('Must specify either --dataset or --processed_dataset only.')
    elif not args.processed_dataset:
        if not args.regions or not args.datasets:
            parser.error('Must specify both --regions and --dataset')

    if args.metric is None:
        if args.processed_dataset:
            parser.error('Must provide a metric if using processed dataset')
        elif len(args.datasets) >= 2:
            print "> Defaulting to cosine distance as more than 2 dataset given"
            args.metric = 'cosine'
        else:
            print "> Defaulting to sqeuclidean distance as only one dataset given"
            args.metric = 'sqeuclidean'
    elif args.metric == 'cosine':
        if args.datasets and len(args.datasets) < 2:
            parser.error('Cannot use cosine distance with just one dataset. Choose sqeuclidean or euclidean instead.')

    if args.no_dtw:
        # That's what no-dtw actually does
        args.slanted_band = 0
        args.scale = True
        if args.prototyping_method is None:
            args.prototyping_method = 'mean'
    else:
        if args.prototyping_method is None:
            args.prototyping_method = 'standard'

    if args.verbose:
        logging.root.setLevel(logging.DEBUG)

    # Disable trying to reverse regions if strand information given
    if args.use_strand_information:
        args.no_reverse = True

    configuration = Configuration(args)

    # --- pre-processing ------------------------
    if args.regions:
        print '> Reading regions from {0!r} ....'.format(args.regions)
        regions, total_regions, used_regions = read_regions(args.regions, args.random_sample, args.resolution)
        if args.use_strand_information and not regions.has_strand_data():
            logging.debug('Parsed columns: {0}'.format(regions.columns))
            parser.error('--use-strand-information is set but the input BED file has no strand information.')

        too_short_regions = (regions.lengths / args.resolution) < args.min_bins  # regions shorter than --min-bins bins
        too_short_regions = regions.ix[too_short_regions[too_short_regions].index]
        if len(too_short_regions) > 0:
            print '> {0} regions have their length shorter than {1} bins. Saving them to {2!r} as they won\'t be processed'\
                .format(len(too_short_regions), args.min_bins, configuration.too_short_regions_filename)
            too_short_regions.to_bed(configuration.too_short_regions_filename)

            regions = regions.ix[regions.index - too_short_regions.index]

        if args.max_bins:
            too_long_regions = (regions.lengths / args.resolution) >= args.max_bins
            too_long_regions = regions.ix[too_long_regions[too_long_regions].index]

            if len(too_long_regions) > 0:
                print '> {0} regions have their length longer than {1} bins. ' \
                      'Saving them to {2!r} as they won\'t be processed due to --max-bins constraint'\
                      .format(len(too_long_regions), args.max_bins, configuration.too_long_regions_filename)
                too_long_regions.to_bed(configuration.too_long_regions_filename)

                regions = regions.ix[regions.index - too_long_regions.index]

        print '> {0} regions remain'.format(len(regions))
    else:
        regions = None

    if args.points_of_interest:
        print '> Reading points of interest'

        poi_file = args.points_of_interest
        try:
            poi = from_simple(poi_file, regions, resolution=configuration.resolution)
        except ValueError:
            poi = Regions.from_bed(poi_file)
            poi = poi.as_bins_of(regions, resolution=configuration.resolution,
                                 ignore_non_overlaps=args.ignore_poi_non_overlaps,
                                 account_for_strand_information=configuration.use_strand_information)

        if not poi:
            raise Exception(
                'POI file provided, but no POIs were parsed from {}. Try using dgw-overlaps2poi'.format(
                    poi_file))
    else:
        poi = None

    print '> Reading dataset ...'
    dataset, missing_regions, filtered_regions = read_datasets(args, regions)

    if args.datasets:

        if poi:
            dataset.add_points_of_interest(poi, name=args.points_of_interest)

            if args.ignore_no_poi_regions:
                poi_dataset = dataset.drop_no_pois()

                if len(poi_dataset) != len(dataset):
                    dropped_regions = regions.ix[dataset.items - poi_dataset.items]
                    print '> {0} regions were removed as they have no POI data with them ' \
                          'and --ignore-no-poi-regions was set'.format(len(dropped_regions))
                    print '> Saving them to {0!r}'.format(configuration.no_poi_regions_filename)
                    dropped_regions.to_bed(configuration.no_poi_regions_filename)
                    dataset = poi_dataset
                    del dropped_regions
                del poi_dataset

        if len(missing_regions) > 0:
            print "> {0} regions were not found in the dataset, they were saved to {1}".format(len(missing_regions),
                                                                                configuration.missing_regions_filename)
            regions.ix[missing_regions].to_bed(configuration.missing_regions_filename, track_title='DGWMissingRegions',
                                   track_description='Regions that are in input, but missing from the dataset')

        if len(filtered_regions) > 0:
            print "> {0} regions were filtered out from dataset due to --min-pileup constraint, they were saved to {1}".format(len(filtered_regions),
                                                                                           configuration.filtered_regions_filename)
            regions.ix[filtered_regions].to_bed(configuration.filtered_regions_filename, track_title='DGWFilteredRegions',
                                        track_description='Regions that were filtered out from the dataset')

        # Get remaining regions
        regions = regions.ix[dataset.items]
        if len(missing_regions) > 0 or len(filtered_regions) > 0:
            print '> {0} regions remaining and will be processed'.format(len(regions))


        if args.output_raw_dataset:
            print '> Serialising raw dataset to {0}'.format(configuration.raw_dataset_filename)
            serialise(dataset, configuration.raw_dataset_filename)

        dataset = dataset.to_log_scale()

        if args.normalise_pileups:
            print '> Dividing the number of reads in each bin by the maximum number of reads per region as --normalise-pileups is set'
            dataset = dataset.normalise_bin_heights()


        missing_regions = regions.index - dataset.items

        if len(missing_regions) > 0:
            print "> {0} regions were not found in the dataset, they were saved to {1}".format(len(missing_regions),
                                                                                               configuration.missing_regions_filename)
            regions.ix[missing_regions].to_bed(configuration.missing_regions_filename, track_title='DGWMissingRegions',
                                               track_description='Regions that are in input, but missing from the dataset')
    else:
        print "> Not converting dataset to log scale as processed dataset already provided"

    # --- Serialise the regions as they will be needed in explorer ----------
    if regions is not None:
        print '> Serialising regions to {0}'.format(configuration.parsed_regions_filename)
        serialise(regions, configuration.parsed_regions_filename)

    # --- Saving of dataset -------------------
    print '> Saving dataset to {0}'.format(configuration.dataset_filename)
    serialise(dataset, configuration.dataset_filename)

    if not args.blank:
        # --- actual work ---------------------------
        print '> Calculating pairwise distances (this might take a while) ...'
        if args.n_processes is not None:
            print '> Using {0} processes'.format(args.n_processes)
        else:
            args.n_processes = cpu_count()
            print '> Using all available cpu cores ({0})'.format(args.n_processes)

        if args.no_dtw:
            print '> Not using DTW as --no-dtw option is set'

        logging.debug('Running DTW with the following kwargs: {0!r}'.format(configuration.dtw_kwargs))
        start = datetime.now()
        dm = parallel_pdist(dataset, args.n_processes, **configuration.dtw_kwargs)
        end = datetime.now()

        delta = end - start
        print '> Pairwise distances calculation took {0} s'.format(delta.total_seconds())

        if args.random_sample:
            multiplier = binomial_coefficent(total_regions, 2) / float(binomial_coefficent(args.random_sample, 2))
            print '> Expected calculation duration if random-sample was not used: {0} s'\
                   .format(delta.total_seconds() * multiplier)


        # --- Saving of the work --------------
        if configuration.pairwise_distances_filename:
            print '> Saving the pairwise distance matrix to {0!r}'.format(configuration.pairwise_distances_filename)
            np.save(configuration.pairwise_distances_filename, dm)

        # Linkage matrix
        print '> Computing linkage matrix'
        linkage = fastcluster.complete(dm)

        print '> Saving linkage matrix to {0!r}'.format(configuration.linkage_filename)
        np.save(configuration.linkage_filename, linkage)

        print '> Computing prototypes'
        # Hierarchical clustering object to compute the prototypes
        hc = HierarchicalClustering(dataset, regions, linkage, dtw_function=configuration.dtw_function,
                                    prototyping_method=configuration.prototyping_method)
        prototypes = hc.extract_prototypes()
        print '> Saving prototypes to {0!r}'.format(configuration.prototypes_filename)
        serialise(prototypes, configuration.prototypes_filename)

        print '> Computing warping paths'
        nodes = hc.tree_nodes_list
        paths = compute_paths(dataset, nodes, hc.num_obs, n_processes=args.n_processes,
                              **configuration.dtw_kwargs)
        print '> Saving warping paths to {0!r}'.format(configuration.warping_paths_filename)
        serialise(paths, configuration.warping_paths_filename)
    else:
        print '> Skipping pairwise distances step because of --blank option set'

    print '> Saving configuration to {0!r}'.format(configuration.configuration_filename)
    f = open(configuration.configuration_filename, 'w')
    try:
        configuration.to_json(f)
    finally:
        f.close()

    print '> Done'
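
For orientation: when --blank is not set, the numerical core of main() above is a condensed pairwise distance matrix followed by a complete-linkage matrix from fastcluster, which is then saved and reused for prototypes and warping paths. The sketch below shows just that distance-to-linkage step in isolation; the toy data, scipy's Euclidean pdist (standing in for the DTW-based parallel_pdist) and the fcluster cut at the end are illustrative additions, not part of dgw.

# Sketch: the distance -> linkage -> save sequence from main() above, with
# scipy's pdist standing in for dgw's parallel_pdist. fastcluster.complete()
# accepts a condensed distance vector and returns a SciPy-compatible linkage
# matrix, exactly as used above.
import numpy as np
import fastcluster
from scipy.cluster.hierarchy import fcluster
from scipy.spatial.distance import pdist

dataset = np.random.randn(50, 20)           # toy data: 50 regions, 20 bins each
dm = pdist(dataset, metric='sqeuclidean')   # stands in for parallel_pdist(dataset, ...)
linkage = fastcluster.complete(dm)          # same call as in main()
np.save('linkage.npy', linkage)

# Downstream, the linkage matrix can be cut into flat clusters:
labels = fcluster(linkage, t=4, criterion='maxclust')
print('cluster sizes: {0}'.format(np.bincount(labels)[1:]))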