Example #1
    def test_multi_chr_dataset(self):
        """Tests that the lassa dataset can be read.

        This does not test that the genomes are read correctly -- just
        that they can be read without issues.
        """
        genomes = seq_io.read_dataset_genomes(lassa)
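A minimal sketch (not part of the test suite) of exercising the same reader by hand; it assumes catch is installed and that Genome objects expose a seqs list, as the later examples do:

# Hedged sketch: call seq_io.read_dataset_genomes() interactively
from catch.datasets import lassa
from catch.utils import seq_io

genomes = seq_io.read_dataset_genomes(lassa)
print('genomes read:', len(genomes))
# Lassa virus is segmented, so each Genome is assumed to hold more
# than one sequence (one per chromosome/segment)
print('sequences in first genome:', len(genomes[0].seqs))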
Example #2
def main(args):
    # Read the genomes from FASTA sequences
    genomes_grouped = []
    genomes_grouped_names = []
    for ds in args.dataset:
        if ds.startswith('download:'):
            # Download a FASTA for an NCBI taxonomic ID
            taxid = ds[len('download:'):]
            ds_fasta_tf = ncbi_neighbors.construct_fasta_for_taxid(taxid)
            genomes_grouped += [
                seq_io.read_genomes_from_fasta(ds_fasta_tf.name)
            ]
            genomes_grouped_names += ['taxid:' + str(taxid)]
            ds_fasta_tf.close()
        elif os.path.isfile(ds):
            # Process a custom FASTA file with sequences
            genomes_grouped += [seq_io.read_genomes_from_fasta(ds)]
            genomes_grouped_names += [os.path.basename(ds)]
        else:
            # Process an individual dataset
            try:
                dataset = importlib.import_module('catch.datasets.' + ds)
            except ImportError:
                raise ValueError("Unknown dataset %s" % ds)
            genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
            genomes_grouped_names += [ds]

    if args.limit_target_genomes:
        genomes_grouped = [
            genomes[:args.limit_target_genomes] for genomes in genomes_grouped
        ]

    # Set the maximum number of processes in multiprocessing pools
    if args.max_num_processes:
        probe.set_max_num_processes_for_probe_finding_pools(
            args.max_num_processes)

    # Read the FASTA file of probes
    fasta = seq_io.read_fasta(args.probes_fasta)
    probes = [probe.Probe.from_str(seq) for _, seq in fasta.items()]

    # Run the coverage analyzer
    analyzer = coverage_analysis.Analyzer(
        probes,
        args.mismatches,
        args.lcf_thres,
        genomes_grouped,
        genomes_grouped_names,
        island_of_exact_match=args.island_of_exact_match,
        cover_extension=args.cover_extension,
        kmer_probe_map_k=args.kmer_probe_map_k)
    analyzer.run()
    if args.write_analysis_to_tsv:
        analyzer.write_data_matrix_as_tsv(args.write_analysis_to_tsv)
    if args.write_sliding_window_coverage:
        analyzer.write_sliding_window_coverage(
            args.write_sliding_window_coverage)
    if args.print_analysis:
        analyzer.print_analysis()
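For illustration, main() above could be driven directly with an argparse.Namespace whose attributes mirror what the function reads. This is a hypothetical invocation (the real script builds args with an argparse parser, and 'probes.fasta' is a placeholder path):

from argparse import Namespace

args = Namespace(
    dataset=['ebola_zaire_with_2014'],  # or 'download:TAXID', or a FASTA path
    probes_fasta='probes.fasta',        # hypothetical probe file
    mismatches=2,
    lcf_thres=100,
    island_of_exact_match=0,
    cover_extension=0,
    kmer_probe_map_k=10,
    limit_target_genomes=None,
    max_num_processes=None,
    write_analysis_to_tsv=None,
    write_sliding_window_coverage=None,
    print_analysis=True)
main(args)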
Example #3
    def test_single_chr_dataset(self):
        """Tests that the genomes obtained from reading the
        ebola_zaire_with_2014 dataset are the same as those obtained
        from directly reading the FASTA.

        This effectively executes most of the same code as
        seq_io.read_dataset_genomes(), but it does check that the
        function correctly enters the branch that reads just one
        sequence per genome.
        """
        genomes = seq_io.read_dataset_genomes(ebola_zaire_with_2014)
        assert len(ebola_zaire_with_2014.fasta_paths) == 1
        desired_genomes = [
            genome.Genome.from_one_seq(s)
            for s in seq_io.read_fasta(
                    ebola_zaire_with_2014.fasta_paths[0]).values()
        ]
        self.assertEqual(genomes, desired_genomes)
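The assertEqual above relies on Genome comparing by value. A small sketch of that assumption (not from the test suite; it assumes Genome defines value equality and that from_one_seq stores its argument in seqs):

from catch import genome

g1 = genome.Genome.from_one_seq('ATCGATCG')
g2 = genome.Genome.from_one_seq('ATCGATCG')
assert g1 == g2                 # equal contents are assumed to compare equal
assert g1.seqs == ['ATCGATCG']  # from_one_seq is assumed to wrap the sequence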
Example #4
    def setUp(self):
        """Read the dataset's genomes and create candidate probes.

        Only process the first 100 genomes to avoid using too much memory
        with the candidate probes.
        """
        # Disable logging
        logging.disable(logging.WARNING)

        seqs = [
            gnm.seqs[0]
            for gnm in seq_io.read_dataset_genomes(zaire_ebolavirus)
        ]
        seqs = seqs[:100]
        self.probes_100 = candidate_probes.make_candidate_probes_from_sequences(
            seqs, probe_length=100, probe_stride=50, min_n_string_length=2)
        self.probes_75 = candidate_probes.make_candidate_probes_from_sequences(
            seqs, probe_length=75, probe_stride=25, min_n_string_length=2)
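Conceptually, candidate probe generation is a tiling of each sequence at a fixed stride. An illustrative sketch (not catch's implementation; N-string handling via min_n_string_length and end-of-sequence probes are omitted):

def naive_tiling(seq, probe_length, probe_stride):
    """Tile seq with windows of probe_length, advancing by probe_stride."""
    return [seq[i:i + probe_length]
            for i in range(0, len(seq) - probe_length + 1, probe_stride)]

# e.g., a 200 nt sequence with probe_length=100 and probe_stride=50
# yields probes starting at positions 0, 50, and 100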
Example #5
def main(args):
    # Read the genomes from FASTA sequences
    genomes_grouped = []
    genomes_grouped_names = []
    for ds in args.dataset:
        try:
            dataset = importlib.import_module('catch.datasets.' + ds)
        except ImportError:
            raise ValueError("Unknown dataset %s" % ds)
        genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
        genomes_grouped_names += [ds]

    if args.limit_target_genomes:
        genomes_grouped = [
            genomes[:args.limit_target_genomes] for genomes in genomes_grouped
        ]

    # Set the maximum number of processes in multiprocessing pools
    if args.max_num_processes:
        probe.set_max_num_processes_for_probe_finding_pools(
            args.max_num_processes)

    # Read the FASTA file of probes
    fasta = seq_io.read_fasta(args.probes_fasta)
    probes = [probe.Probe.from_str(seq) for _, seq in fasta.items()]

    # Run the coverage analyzer
    analyzer = coverage_analysis.Analyzer(
        probes,
        args.mismatches,
        args.lcf_thres,
        genomes_grouped,
        genomes_grouped_names,
        island_of_exact_match=args.island_of_exact_match,
        cover_extension=args.cover_extension,
        kmer_probe_map_k=args.kmer_probe_map_k)
    analyzer.run()
    if args.write_analysis_to_tsv:
        analyzer.write_data_matrix_as_tsv(args.write_analysis_to_tsv)
    if args.write_sliding_window_coverage:
        analyzer.write_sliding_window_coverage(
            args.write_sliding_window_coverage)
    if args.print_analysis:
        analyzer.print_analysis()
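The probe-reading step above assumes seq_io.read_fasta() returns a mapping of FASTA header to sequence. A hedged sketch of that input path ('probes.fasta' is a hypothetical file with one record per probe):

from catch import probe
from catch.utils import seq_io

fasta = seq_io.read_fasta('probes.fasta')
probes = [probe.Probe.from_str(seq) for seq in fasta.values()]
print(len(probes), 'probes read')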
Example #6
def main(args):
    # Read the FASTA sequences
    ds = args.dataset
    try:
        if os.path.isfile(ds):
            # Process a custom FASTA file with sequences
            seqs = [seq_io.read_genomes_from_fasta(ds)]
        else:
            dataset = importlib.import_module(
                'catch.datasets.' + ds)
            seqs = [seq_io.read_dataset_genomes(dataset)]
    except ImportError:
        raise ValueError("Unknown file or dataset '%s'" % ds)

    if (args.limit_target_genomes and
            args.limit_target_genomes_randomly_with_replacement):
        raise Exception(("Cannot --limit-target-genomes and "
                         "--limit-target-genomes-randomly-with-replacement at "
                         "the same time"))
    elif args.limit_target_genomes:
        seqs = [genomes[:args.limit_target_genomes] for genomes in seqs]
    elif args.limit_target_genomes_randomly_with_replacement:
        k = args.limit_target_genomes_randomly_with_replacement
        seqs = [random.choices(genomes, k=k) for genomes in seqs]

    # Set up the filters needed for replication
    filters = []
    # The filters we use are, in order:
    #  Duplicate filter (df) -- condense all candidate probes that
    #  are identical down to one; this is not necessary for
    #  correctness, as the naive redundant filter achieves the same
    #  task implicitly, but it does significantly lower runtime by
    #  decreasing the input size to the naive redundant filter
    df = duplicate_filter.DuplicateFilter()
    filters += [df]

    if args.naive_redundant_filter and args.dominating_set_filter:
        raise Exception(("Cannot use both 'naive_redundant_filter' and "
            "'dominating_set_filter' at the same time. (You could "
            "of course do one after the other, but it was probably "
            "a mistake to specify both.)"))
    elif args.naive_redundant_filter or args.dominating_set_filter:
        if args.naive_redundant_filter:
            # Naive redundant filter -- execute a greedy algorithm to
            # condense 'similar' probes down to one
            mismatches, lcf_thres = args.naive_redundant_filter
            filt_class = naive_redundant_filter.NaiveRedundantFilter
        if args.dominating_set_filter:
            # Dominating set filter (dsf) -- construct a graph where each
            # node is a probe and edges connect 'similar' probes; then
            # approximate the smallest dominating set (see the greedy
            # sketch after this example)
            mismatches, lcf_thres = args.dominating_set_filter
            filt_class = dominating_set_filter.DominatingSetFilter
        # Construct a function to determine whether two probes are
        # redundant, and then instantiate the appropriate filter
        redundant_fn = naive_redundant_filter.redundant_longest_common_substring(
                            mismatches, lcf_thres)
        filt = filt_class(redundant_fn)
        filters += [filt]

    if args.add_reverse_complements:
        # Reverse complement (rc) -- add the reverse complement of
        # each probe as a candidate
        rc = reverse_complement_filter.ReverseComplementFilter()
        filters += [rc]

    # Design the probes
    pb = probe_designer.ProbeDesigner(seqs, filters,
                                      probe_length=args.probe_length,
                                      probe_stride=args.probe_stride)
    pb.design()

    if args.print_analysis:
        if args.naive_redundant_filter or args.dominating_set_filter:
            mismatch_thres = mismatches
        else:
            mismatch_thres = 0
        analyzer = coverage_analysis.Analyzer(pb.final_probes,
                                              mismatch_thres,
                                              args.probe_length,
                                              seqs,
                                              [args.dataset])
        analyzer.run()
        analyzer.print_analysis()
    else:
        # Just print the number of probes
        print(len(pb.final_probes))
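The dominating set filter comment above describes approximating a smallest dominating set over a 'similarity' graph of probes. An illustrative greedy sketch of that idea (not catch's code):

def greedy_dominating_set(adj):
    """Greedily pick probes until every probe is dominated.

    adj maps each probe to the set of probes it is 'similar' to,
    including itself.
    """
    undominated = set(adj)
    chosen = []
    while undominated:
        # Pick the probe that dominates the most not-yet-dominated probes
        best = max(adj, key=lambda p: len(adj[p] & undominated))
        chosen.append(best)
        undominated -= adj[best]
    return chosen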
Example #7
def main(args):
    logger = logging.getLogger(__name__)

    # Set NCBI API key
    if args.ncbi_api_key:
        ncbi_neighbors.ncbi_api_key = args.ncbi_api_key

    # Read the genomes from FASTA sequences
    genomes_grouped = []
    genomes_grouped_names = []
    for ds in args.dataset:
        if ds.startswith('collection:'):
            # Process a collection of datasets
            collection_name = ds[len('collection:'):]
            try:
                collection = importlib.import_module(
                    'catch.datasets.collections.' + collection_name)
            except ImportError:
                raise ValueError("Unknown dataset collection %s" %
                                 collection_name)
            for name, dataset in collection.import_all():
                genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
                genomes_grouped_names += [name]
        elif ds.startswith('download:'):
            # Download a FASTA for an NCBI taxonomic ID
            taxid = ds[len('download:'):]
            if args.write_taxid_acc:
                taxid_fn = os.path.join(args.write_taxid_acc,
                                        str(taxid) + '.txt')
            else:
                taxid_fn = None
            # A segment can be specified as 'download:TAXID-SEGMENT'
            if '-' in taxid:
                taxid, segment = taxid.split('-')
            else:
                segment = None
            ds_fasta_tf = ncbi_neighbors.construct_fasta_for_taxid(
                taxid, segment=segment, write_to=taxid_fn)
            genomes_grouped += [
                seq_io.read_genomes_from_fasta(ds_fasta_tf.name)
            ]
            genomes_grouped_names += ['taxid:' + str(taxid)]
            ds_fasta_tf.close()
        elif os.path.isfile(ds):
            # Process a custom FASTA file with sequences
            genomes_grouped += [seq_io.read_genomes_from_fasta(ds)]
            genomes_grouped_names += [os.path.basename(ds)]
        else:
            # Process an individual dataset
            try:
                dataset = importlib.import_module('catch.datasets.' + ds)
            except ImportError:
                raise ValueError("Unknown file or dataset '%s'" % ds)
            genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
            genomes_grouped_names += [ds]

    if (args.limit_target_genomes
            and args.limit_target_genomes_randomly_with_replacement):
        raise Exception(("Cannot --limit-target-genomes and "
                         "--limit-target-genomes-randomly-with-replacement at "
                         "the same time"))
    elif args.limit_target_genomes:
        genomes_grouped = [
            genomes[:args.limit_target_genomes] for genomes in genomes_grouped
        ]
    elif args.limit_target_genomes_randomly_with_replacement:
        k = args.limit_target_genomes_randomly_with_replacement
        genomes_grouped = [
            random.choices(genomes, k=k) for genomes in genomes_grouped
        ]

    # Store the FASTA paths of blacklisted genomes
    blacklisted_genomes_fasta = []
    if args.blacklist_genomes:
        for bg in args.blacklist_genomes:
            if os.path.isfile(bg):
                # Process a custom FASTA file with sequences
                blacklisted_genomes_fasta += [bg]
            else:
                # Process an individual dataset
                try:
                    dataset = importlib.import_module('catch.datasets.' + bg)
                except ImportError:
                    raise ValueError("Unknown file or dataset '%s'" % bg)
                for fp in dataset.fasta_paths:
                    blacklisted_genomes_fasta += [fp]

    # Set up and verify parameters related to probe length
    if not args.lcf_thres:
        args.lcf_thres = args.probe_length
    if args.probe_stride > args.probe_length:
        logger.warning(("PROBE_STRIDE (%d) is greater than PROBE_LENGTH "
                        "(%d), which is usually undesirable and may lead "
                        "to undefined behavior"), args.probe_stride,
                       args.probe_length)
    if args.lcf_thres > args.probe_length:
        logger.warning(("LCF_THRES (%d) is greater than PROBE_LENGTH "
                        "(%d), which is usually undesirable and may lead "
                        "to undefined behavior"), args.lcf_thres,
                       args.probe_length)
    if args.island_of_exact_match > args.probe_length:
        logger.warning(("ISLAND_OF_EXACT_MATCH (%d) is greater than "
                        "PROBE_LENGTH (%d), which is usually undesirable "
                        "and may lead to undefined behavior"),
                       args.island_of_exact_match, args.probe_length)

    # Set up and verify parameters related to k-mer length in probe map
    if args.kmer_probe_map_k:
        # Check that k is sufficiently small
        if args.kmer_probe_map_k > args.probe_length:
            raise Exception(("KMER_PROBE_MAP_K (%d) exceeds PROBE_LENGTH "
                             "(%d), which is not permitted") %
                            (args.kmer_probe_map_k, args.probe_length))

        # Use this value for the SetCoverFilter, AdapterFilter, and
        # the Analyzer
        kmer_probe_map_k_scf = args.kmer_probe_map_k
        kmer_probe_map_k_af = args.kmer_probe_map_k
        kmer_probe_map_k_analyzer = args.kmer_probe_map_k
    else:
        if args.probe_length <= 20:
            logger.warning(("PROBE_LENGTH (%d) is small; you may want to "
                            "consider setting --kmer-probe-map-k to be "
                            "small as well in order to be more sensitive "
                            "in mapping candidate probes to target sequence"),
                           args.probe_length)

        # Use a default k of 20 for the SetCoverFilter and AdapterFilter,
        # and 10 for the Analyzer since we would like to be more sensitive
        # (potentially at the cost of slower runtime) for the latter
        kmer_probe_map_k_scf = 20
        kmer_probe_map_k_af = 20
        kmer_probe_map_k_analyzer = 10

    # Set the maximum number of processes in multiprocessing pools
    if args.max_num_processes:
        probe.set_max_num_processes_for_probe_finding_pools(
            args.max_num_processes)
        cluster.set_max_num_processes_for_creating_distance_matrix(
            args.max_num_processes)

    # Raise exceptions or warn based on use of adapter arguments
    if args.add_adapters:
        if not (args.adapter_a or args.adapter_b):
            logger.warning(("Adapter sequences will be added, but default "
                            "sequences will be used; to provide adapter "
                            "sequences, use --adapter-a and --adapter-b"))
    else:
        if args.adapter_a or args.adapter_b:
            raise Exception(
                ("Adapter sequences were provided with "
                 "--adapter-a and --adapter-b, but --add-adapters is required "
                 "to add adapter sequences onto the ends of probes"))

    # Do not allow both --small-seq-skip and --small-seq-min, since they
    # have different intentions
    if args.small_seq_skip is not None and args.small_seq_min is not None:
        raise Exception(("Both --small-seq-skip and --small-seq-min were "
                         "specified, but both cannot be used together"))

    # Check arguments involving clustering
    if args.cluster_and_design_separately and args.identify:
        raise Exception(
            ("Cannot use --cluster-and-design-separately with "
             "--identify, because clustering collapses genome groupings into "
             "one"))
    if args.cluster_from_fragments and not args.cluster_and_design_separately:
        raise Exception(("Cannot use --cluster-from-fragments without also "
                         "setting --cluster-and-design-separately"))

    # Check for whether a custom hybridization function was provided
    if args.custom_hybridization_fn:
        custom_cover_range_fn = tuple(args.custom_hybridization_fn)
    else:
        custom_cover_range_fn = None
    if args.custom_hybridization_fn_tolerant:
        custom_cover_range_tolerant_fn = tuple(
            args.custom_hybridization_fn_tolerant)
    else:
        custom_cover_range_tolerant_fn = None

    # Set up the filters
    # The filters we use are, in order:
    filters = []

    # [Optional]
    # Fasta filter (ff) -- leave out candidate probes that are not
    # equal to sequences in the FASTA given by --filter-from-fasta
    if args.filter_from_fasta:
        ff = fasta_filter.FastaFilter(args.filter_from_fasta,
                                      skip_reverse_complements=True)
        filters += [ff]

    # [Optional]
    # Poly(A) filter (paf) -- leave out probes with stretches of 'A' or 'T'
    if args.filter_polya:
        polya_length, polya_mismatches = args.filter_polya
        if polya_length > args.probe_length:
            logger.warning(("Length of poly(A) stretch to filter (%d) is "
                            "greater than PROBE_LENGTH (%d), which is usually "
                            "undesirable"), polya_length, args.probe_length)
        if polya_length < 10:
            logger.warning(("Length of poly(A) stretch to filter (%d) is "
                            "short, and may lead to many probes being "
                            "filtered"), polya_length)
        if polya_mismatches > 10:
            logger.warning(("Number of mismatches to tolerate when searching "
                            "for poly(A) stretches (%d) is high, and may "
                            "lead to many probes being filtered"),
                           polya_mismatches)
        paf = polya_filter.PolyAFilter(polya_length, polya_mismatches)
        filters += [paf]

    # Duplicate filter (df) -- condense all candidate probes that
    #     are identical down to one; this is not necessary for
    #     correctness, as the set cover filter achieves the same task
    #     implicitly, but it does significantly lower runtime by
    #     decreasing the input size to the set cover filter
    # Near duplicate filter (ndf) -- condense candidate probes that
    #     are near-duplicates down to one using locality-sensitive
    #     hashing; like the duplicate filter, this is not necessary
    #     but can significantly lower runtime and reduce memory usage
    #     (even more than the duplicate filter)
    if (args.filter_with_lsh_hamming is not None
            and args.filter_with_lsh_minhash is not None):
        raise Exception(("Cannot use both --filter-with-lsh-hamming "
                         "and --filter-with-lsh-minhash"))
    if args.filter_with_lsh_hamming is not None:
        if args.filter_with_lsh_hamming > args.mismatches:
            logger.warning(
                ("Setting FILTER_WITH_LSH_HAMMING (%d) to be greater "
                 "than MISMATCHES (%d) may cause the probes to achieve less "
                 "than the desired coverage"), args.filter_with_lsh_hamming,
                args.mismatches)
        ndf = near_duplicate_filter.NearDuplicateFilterWithHammingDistance(
            args.filter_with_lsh_hamming, args.probe_length)
        filters += [ndf]
    elif args.filter_with_lsh_minhash is not None:
        ndf = near_duplicate_filter.NearDuplicateFilterWithMinHash(
            args.filter_with_lsh_minhash)
        filters += [ndf]
    else:
        df = duplicate_filter.DuplicateFilter()
        filters += [df]

    # Set cover filter (scf) -- solve the problem by treating it as
    #     an instance of the set cover problem (a greedy sketch of
    #     this idea appears after this example)
    scf = set_cover_filter.SetCoverFilter(
        mismatches=args.mismatches,
        lcf_thres=args.lcf_thres,
        island_of_exact_match=args.island_of_exact_match,
        mismatches_tolerant=args.mismatches_tolerant,
        lcf_thres_tolerant=args.lcf_thres_tolerant,
        island_of_exact_match_tolerant=args.island_of_exact_match_tolerant,
        custom_cover_range_fn=custom_cover_range_fn,
        custom_cover_range_tolerant_fn=custom_cover_range_tolerant_fn,
        identify=args.identify,
        blacklisted_genomes=blacklisted_genomes_fasta,
        coverage=args.coverage,
        cover_extension=args.cover_extension,
        cover_groupings_separately=args.cover_groupings_separately,
        kmer_probe_map_k=kmer_probe_map_k_scf,
        kmer_probe_map_use_native_dict=(
            args.use_native_dict_when_finding_tolerant_coverage))
    filters += [scf]

    # [Optional]
    # Adapter filter (af) -- add adapters to both the 5' and 3' ends
    #    of each probe
    if args.add_adapters:
        # Set default adapter sequences, if not provided
        if args.adapter_a:
            adapter_a = tuple(args.adapter_a)
        else:
            adapter_a = ('ATACGCCATGCTGGGTCTCC', 'CGTACTTGGGAGTCGGCCAT')
        if args.adapter_b:
            adapter_b = tuple(args.adapter_b)
        else:
            adapter_b = ('AGGCCCTGGCTGCTGATATG', 'GACCTTTTGGGACAGCGGTG')

        af = adapter_filter.AdapterFilter(
            adapter_a,
            adapter_b,
            mismatches=args.mismatches,
            lcf_thres=args.lcf_thres,
            island_of_exact_match=args.island_of_exact_match,
            custom_cover_range_fn=custom_cover_range_fn,
            kmer_probe_map_k=kmer_probe_map_k_af)
        filters += [af]

    # [Optional]
    # N expansion filter (nef) -- expand Ns in probe sequences
    # to avoid ambiguity
    if args.expand_n is not None:
        nef = n_expansion_filter.NExpansionFilter(
            limit_n_expansion_randomly=args.expand_n)
        filters += [nef]

    # [Optional]
    # Reverse complement (rc) -- add the reverse complement of each
    #    probe that remains
    if args.add_reverse_complements:
        rc = reverse_complement_filter.ReverseComplementFilter()
        filters += [rc]

    # If requested, don't apply the set cover filter; remember the
    # filter that precedes it so clustering can merge after that
    # filter instead
    if args.skip_set_cover:
        filter_before_scf = filters[filters.index(scf) - 1]
        filters.remove(scf)

    # Define parameters for clustering sequences
    if args.cluster_and_design_separately:
        cluster_threshold = args.cluster_and_design_separately
        if args.skip_set_cover:
            cluster_merge_after = filter_before_scf
        else:
            cluster_merge_after = scf
        cluster_fragment_length = args.cluster_from_fragments
    else:
        cluster_threshold = None
        cluster_merge_after = None
        cluster_fragment_length = None

    # Design the probes
    pb = probe_designer.ProbeDesigner(
        genomes_grouped,
        filters,
        probe_length=args.probe_length,
        probe_stride=args.probe_stride,
        allow_small_seqs=args.small_seq_min,
        seq_length_to_skip=args.small_seq_skip,
        cluster_threshold=cluster_threshold,
        cluster_merge_after=cluster_merge_after,
        cluster_fragment_length=cluster_fragment_length)
    pb.design()

    # Write the final probes to the file args.output_probes
    seq_io.write_probe_fasta(pb.final_probes, args.output_probes)

    if (args.print_analysis or args.write_analysis_to_tsv
            or args.write_sliding_window_coverage
            or args.write_probe_map_counts_to_tsv):
        analyzer = coverage_analysis.Analyzer(
            pb.final_probes,
            args.mismatches,
            args.lcf_thres,
            genomes_grouped,
            genomes_grouped_names,
            island_of_exact_match=args.island_of_exact_match,
            custom_cover_range_fn=custom_cover_range_fn,
            cover_extension=args.cover_extension,
            kmer_probe_map_k=kmer_probe_map_k_analyzer,
            rc_too=args.add_reverse_complements)
        analyzer.run()
        if args.write_analysis_to_tsv:
            analyzer.write_data_matrix_as_tsv(args.write_analysis_to_tsv)
        if args.write_sliding_window_coverage:
            analyzer.write_sliding_window_coverage(
                args.write_sliding_window_coverage)
        if args.write_probe_map_counts_to_tsv:
            analyzer.write_probe_map_counts(args.write_probe_map_counts_to_tsv)
        if args.print_analysis:
            analyzer.print_analysis()
    else:
        # Just print the number of probes
        print(len(pb.final_probes))
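The set cover filter above treats probe selection as an instance of set cover. An illustrative sketch of the standard greedy heuristic for that problem (not catch's implementation, which also handles mismatches, coverage fractions, and genome groupings):

def greedy_set_cover(universe, covered_by_probe):
    """Repeatedly pick the probe that adds the most new coverage.

    universe is the set of target positions to cover; covered_by_probe
    maps each probe to the set of positions it covers.
    """
    uncovered = set(universe)
    chosen = []
    while uncovered:
        best = max(covered_by_probe,
                   key=lambda p: len(covered_by_probe[p] & uncovered))
        gained = covered_by_probe[best] & uncovered
        if not gained:
            break  # remaining positions cannot be covered by any probe
        chosen.append(best)
        uncovered -= gained
    return chosen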