예제 #1
0
def test_clusterfinder_include_duplicates(datadir_copy):
    """Expect one cluster with nalt == 26 when duplicates are included."""
    finder_kwargs = {
        'input_path': str(datadir_copy[INPUT]),
        'include_duplicates': True,
        'max_proper_pair_size': DEFAULT_MAX_PROPER_PAIR_SIZE,
    }
    finder = ClusterFinder(**finder_kwargs)
    assert len(finder.clusters) == 1
    assert finder.clusters[0].nalt == 26
예제 #2
0
def test_clusterfinder_multiple_cluster(datadir_copy, tmpdir):
    """Expect three clusters for EXTENDED when a BAM output path is given."""
    source_bam = str(datadir_copy[EXTENDED])
    tagged_bam = tmpdir.join('tagged_clusters.bam')
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=tagged_bam.strpath,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    assert len(finder.clusters) == 3
예제 #3
0
def test_clusterfinder_single_cluster(datadir_copy):
    """Expect a single cluster on '3R' with nalt == 19 for INPUT."""
    finder = ClusterFinder(
        input_path=str(datadir_copy[INPUT]),
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    assert len(finder.clusters) == 1
    # Same access order as before: nalt first, then the reference name.
    assert finder.clusters[0].nalt == 19
    assert finder.clusters[0].reference_name == '3R'
예제 #4
0
def test_cornercase(datadir_copy, tmpdir):
    """Run ClusterFinder over three corner-case inputs.

    The first input (run with min_mapq=-1) must yield one cluster and the
    second must yield none. The third run is followed by no assertion, so
    it only checks that construction completes without raising.
    """
    first_input = str(datadir_copy[CORNERCASE])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=first_input,
        output_gff=gff_path,
        min_mapq=-1,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    assert len(finder.clusters) == 1

    second_input = str(datadir_copy[CORNERCASE2])
    finder = ClusterFinder(
        input_path=second_input,
        output_gff=gff_path,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    assert len(finder.clusters) == 0

    # Smoke check only: the result of this run is not inspected.
    third_input = str(datadir_copy[CORNERCASE3])
    finder = ClusterFinder(
        input_path=third_input,
        output_gff=gff_path,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
예제 #5
0
def test_nonevidence(datadir_copy):
    """Check nref of the first four clusters found in NON_SUPPORT."""
    source_bam = str(datadir_copy[NON_SUPPORT])
    leading = ClusterFinder(input_path=source_bam).clusters[:4]
    assert len(leading) == 4
    # Expected nref values, in cluster order.
    for position, expected_nref in enumerate((12, 39, 32, 30)):
        assert leading[position].nref == expected_nref
예제 #6
0
def test_clusterfinder_decoy_chromosome(datadir_copy, tmpdir,
                                        reference_fasta):  # noqa: F811
    """Expect 84 clusters for DECOY with max_proper_pair_size=649."""
    settings = dict(
        input_path=str(datadir_copy[DECOY]),
        output_bam=None,
        output_gff=tmpdir.join('output.gff').strpath,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    finder = ClusterFinder(**settings)
    assert len(finder.clusters) == 84
예제 #7
0
def test_clusterfinder_start_end_problem(datadir_copy, tmpdir,
                                         reference_fasta):  # noqa: F811
    """Expect no clusters for START_END_PROBLEM with skip_decoy=False."""
    source_bam = str(datadir_copy[START_END_PROBLEM])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
        skip_decoy=False,
    )
    assert len(finder.clusters) == 0
예제 #8
0
def test_clusterfinder_multisample(datadir_copy, tmpdir,
                                   reference_fasta):  # noqa: F811
    """Check soft-clip cluster count and first cluster for MULTI_H6.

    Expects eight clusters on the softclip_finder, and a first cluster
    with nalt == 1 whose valid_tsd is exactly False.
    """
    settings = {
        'input_path': str(datadir_copy[MULTI_H6]),
        'output_bam': None,
        'output_gff': tmpdir.join('output.gff').strpath,
        'transposon_reference_fasta': reference_fasta,
        'max_proper_pair_size': 649,
    }
    finder = ClusterFinder(**settings)
    assert len(finder.softclip_finder.clusters) == 8
    assert finder.clusters[0].nalt == 1
    # Identity check on purpose: a merely falsy value must not pass.
    assert finder.clusters[0].valid_tsd is False
예제 #9
0
def test_clusterfinder_split_cluster(datadir_copy, tmpdir):
    """Expect SPLIT_CLUSTER to yield three clusters.

    The second and third clusters must have lengths 39 and 27 and must
    compare unequal to each other.
    """
    source_bam = str(datadir_copy[SPLIT_CLUSTER])
    bam_path = tmpdir.join('output.bam').strpath
    gff_path = tmpdir.join('out.gff').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=bam_path,
        include_duplicates=False,
        remove_supplementary_without_primary=False,
        output_gff=gff_path,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    assert len(finder.clusters) == 3
    assert len(finder.clusters[1]) == 39
    assert len(finder.clusters[2]) == 27
    assert finder.clusters[1] != finder.clusters[2]
예제 #10
0
def test_clusterfinder_skip_abnormal(datadir_copy, tmpdir,
                                     reference_fasta):  # noqa: F811
    """Expect 26 clusters for ARTEFACT_ACCUMULATION, the third abnormal."""
    source_bam = str(datadir_copy[ARTEFACT_ACCUMULATION])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
        skip_decoy=False,
    )
    assert len(finder.clusters) == 26
    assert finder.clusters[2].abnormal
예제 #11
0
def test_clusterfinder_reorganize_cluster(datadir_copy, tmpdir,
                                          reference_fasta):  # noqa: F811
    """Expect the last REORGANIZE_CLUSTER cluster to be homozygous."""
    settings = dict(
        input_path=str(datadir_copy[REORGANIZE_CLUSTER]),
        output_bam=None,
        output_gff=tmpdir.join('output.gff').strpath,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    finder = ClusterFinder(**settings)
    last_cluster = finder.clusters[-1]
    likelihoods = last_cluster.genotype_likelihoods
    assert likelihoods.genotype == 'homozygous'
예제 #12
0
def test_clusterfinder_refine_split(datadir_copy, tmpdir):
    """Check genotype likelihoods of the first SPLIT_CLUSTER_OPT cluster.

    Expects nref == 73, nalt == 117 and a 'heterozygous' genotype.
    """
    source_bam = str(datadir_copy[SPLIT_CLUSTER_OPT])
    bam_path = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=bam_path,
        output_gff=tmpdir.join('output.gff').strpath,
        transposon_reference_fasta=None,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    first_cluster = finder.clusters[0]
    likelihoods = first_cluster.genotype_likelihoods
    assert likelihoods.nref == 73
    assert likelihoods.nalt == 117
    assert likelihoods.genotype == 'heterozygous'
예제 #13
0
def test_clusterfinder_reassemble(datadir_copy, tmpdir,
                                  reference_fasta):  # noqa: F811
    """Expect one cluster with a truthy valid_tsd for REASSEMBLE.

    Both a genome reference (GENOME_FRAGMENT) and the transposon
    reference are passed to ClusterFinder.
    """
    source_bam = str(datadir_copy[REASSEMBLE])
    genome_fasta = str(datadir_copy[GENOME_FRAGMENT])
    bam_path = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=bam_path,
        output_gff=tmpdir.join('output.gff').strpath,
        genome_reference_fasta=genome_fasta,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=480,
    )
    assert len(finder.clusters) == 1
    assert finder.clusters[0].valid_tsd
예제 #14
0
def test_clusterfinder_refine_coord(datadir_copy, tmpdir):
    """Check genotype likelihoods of the last REFINE_COORD cluster.

    Expects nref == 0, nalt == 3 and a 'homozygous' genotype.
    """
    source_bam = str(datadir_copy[REFINE_COORD])
    bam_path = tmpdir.join('output.bam').strpath
    settings = {
        'input_path': source_bam,
        'output_bam': bam_path,
        'output_gff': tmpdir.join('output.gff').strpath,
        'transposon_reference_fasta': None,
        'max_proper_pair_size': 480,
    }
    finder = ClusterFinder(**settings)
    last_cluster = finder.clusters[-1]
    likelihoods = last_cluster.genotype_likelihoods
    assert likelihoods.nref == 0
    assert likelihoods.nalt == 3
    assert likelihoods.genotype == 'homozygous'
예제 #15
0
def test_clusterfinder_refine_tsd(datadir_copy, tmpdir,
                                  reference_fasta):  # noqa: F811
    """Expect the first REFINE_TSD cluster to have a truthy valid_tsd."""
    # Original author's note: this cluster should not be split, since there
    # is a small deletion associated with a TE insertion.
    # NOTE(review): the same note appears on test_clusterfinder_dont_split;
    # confirm it also describes this dataset.
    source_bam = str(datadir_copy[REFINE_TSD])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    first_cluster = finder.clusters[0]
    assert first_cluster.valid_tsd
예제 #16
0
def test_clusterfinder_predicted_insertion_rover(
        datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Check soft-clip clusters and feature_args for PREDICTED_INSERTION.

    Expects two clusters on the softclip_finder and a single entry in the
    first cluster's feature_args.
    """
    source_bam = str(datadir_copy[PREDICTED_INSERTION])
    gff_path = tmpdir.join('output.gff').strpath
    fasta_path = tmpdir.join('output.fa').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=None,
        output_fasta=fasta_path,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    assert len(finder.softclip_finder.clusters) == 2
    assert len(finder.clusters[0].feature_args) == 1
예제 #17
0
def test_clusterfinder_nonsupport(datadir_copy, tmpdir):
    """Expect the last NON_SUPPORT cluster to be genotyped 'reference'.

    The cluster and its genotype likelihoods must both report nref == 25,
    and the likelihoods must report nalt == 1.
    """
    source_bam = str(datadir_copy[NON_SUPPORT])
    bam_path = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=bam_path,
        output_gff=None,
        transposon_reference_fasta=None,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    last_cluster = finder.clusters[-1]
    # Open question from the original author: 26 might also be valid here.
    assert last_cluster.nref == 25
    likelihoods = last_cluster.genotype_likelihoods
    assert likelihoods.nref == 25
    assert likelihoods.nalt == 1
    assert likelihoods.genotype == 'reference'
예제 #18
0
def test_clusterfinder_join_cluster_test(datadir_copy, tmpdir,
                                         reference_fasta):  # noqa: F811
    """Expect exactly two clusters for 'join_cluster_test.bam'.

    The first must have length 1 and a falsy valid_tsd; the second must
    have length 21 and a truthy valid_tsd.
    """
    source_bam = str(datadir_copy['join_cluster_test.bam'])
    bam_path = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=bam_path,
        output_gff=tmpdir.join('output.gff').strpath,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    # Unpacking fails unless exactly two clusters were found.
    gate, rover = finder.clusters
    assert len(gate) == 1
    assert len(rover) == 21
    assert not gate.valid_tsd
    assert rover.valid_tsd
예제 #19
0
def test_clusterfinder_skip_abnormal(datadir_copy, tmpdir,
                                     reference_fasta):  # noqa: F811
    """Expect 21 or 22 clusters for ARTEFACT_ACCUMULATION.

    The third cluster must additionally be flagged as abnormal.
    """
    settings = dict(
        input_path=str(datadir_copy[ARTEFACT_ACCUMULATION]),
        output_bam=None,
        output_gff=tmpdir.join('output.gff').strpath,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
        skip_decoy=False,
    )
    finder = ClusterFinder(**settings)
    # TODO: this should really be deterministic, what's going on here?
    assert len(finder.clusters) in (21, 22)
    assert finder.clusters[2].abnormal
예제 #20
0
def test_clusterfinder_jockey_not_found(datadir_copy, tmpdir,
                                        reference_fasta):  # noqa: F811
    """Check insert reference names of the two JOCKEY_NOT_FOUND clusters.

    Expects 'rover' for the first cluster and 'transposable_element_Ivk'
    for the second.
    """
    source_bam = str(datadir_copy[JOCKEY_NOT_FOUND])
    gff_path = tmpdir.join('output.gff').strpath
    fasta_path = tmpdir.join('output.fa').strpath
    bam_path = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=bam_path,
        output_fasta=fasta_path,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    # Unpacking fails unless exactly two clusters were found.
    first, second = finder.clusters
    assert first.insert_reference_name == 'rover'
    assert second.insert_reference_name == 'transposable_element_Ivk'
예제 #21
0
def test_clusterfinder_dont_split(datadir_copy, tmpdir,
                                  reference_fasta):  # noqa: F811
    """Expect DONT_SPLIT to stay one cluster with nref 18 and nalt 37."""
    # Original author's note: this cluster should not be split, since there
    # is a small deletion associated with a TE insertion.
    source_bam = str(datadir_copy[DONT_SPLIT])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    assert len(finder.clusters) == 1
    only_cluster = finder.clusters[0]
    assert only_cluster.nref == 18
    assert only_cluster.nalt == 37
예제 #22
0
def test_clusterfinder_nonsupport_reference_genotype(datadir_copy,
                                                     tmpdir):
    """Expect the last cluster to be genotyped 'reference'.

    Uses the MATE_SUPPORT_REFERENCE_GENOTYPE dataset. The cluster and its
    genotype likelihoods must both report nref == 50, and the likelihoods
    must report nalt == 2.
    """
    source_bam = str(datadir_copy[MATE_SUPPORT_REFERENCE_GENOTYPE])
    bam_path = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=bam_path,
        output_gff=None,
        transposon_reference_fasta=None,
        max_proper_pair_size=578,
    )
    last_cluster = finder.clusters[-1]
    # NOTE(review): the original inline comment said "Could also be 26";
    # that looks copied from test_clusterfinder_nonsupport (which expects
    # 25) and probably does not apply to the value 50 -- confirm.
    assert last_cluster.nref == 50
    likelihoods = last_cluster.genotype_likelihoods
    assert likelihoods.nref == 50
    assert likelihoods.nalt == 2
    assert likelihoods.genotype == 'reference'
예제 #23
0
def test_clusterfinder_estimate_coverage(datadir_copy, tmpdir,
                                         reference_fasta):  # noqa: F811
    """Check DONT_MERGE_5 results with max_proper_pair_size=1500.

    Expects two clusters, the first one genotyped 'reference' with
    nref == 66 and nalt == 4.
    """
    source_bam = str(datadir_copy[DONT_MERGE_5])
    gff_path = tmpdir.join('output.gff').strpath
    bam_path = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=bam_path,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=1500,
    )
    assert len(finder.clusters) == 2
    first_cluster = finder.clusters[0]
    assert first_cluster.genotype == 'reference'
    assert first_cluster.nref == 66
    assert first_cluster.nalt == 4
예제 #24
0
def test_clusterfinder_homozygous_copia(datadir_copy, tmpdir,
                                        reference_fasta):  # noqa: F811
    """Expect the first HOMOZYGOUS_COPIA cluster to be homozygous.

    The cluster must report nalt == 57; its genotype likelihoods must
    report nref == 0 and nalt == 57.
    """
    source_bam = str(datadir_copy[HOMOZYGOUS_COPIA])
    bam_path = tmpdir.join('output.bam').strpath
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=bam_path,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    first_cluster = finder.clusters[0]
    assert first_cluster.nalt == 57
    likelihoods = first_cluster.genotype_likelihoods
    assert likelihoods.genotype == 'homozygous'
    assert likelihoods.nref == 0
    assert likelihoods.nalt == 57
예제 #25
0
def test_clusterfinder_clip_assigned_to_insertion(
        datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Check cluster counts and the second cluster for CLIP_TO_INSERTION.

    Expects 16 clusters on the softclip_finder and 3 on the finder. The
    second cluster must have nalt == 66, a truthy valid_tsd and two
    entries in feature_args.
    """
    source_bam = str(datadir_copy[CLIP_TO_INSERTION])
    gff_path = tmpdir.join('output.gff').strpath
    vcf_path = tmpdir.join('output.vcf').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=None,
        output_gff=gff_path,
        output_vcf=vcf_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    assert len(finder.softclip_finder.clusters) == 16
    assert len(finder.clusters) == 3
    assert finder.clusters[1].nalt == 66
    assert finder.clusters[1].valid_tsd
    assert len(finder.clusters[1].feature_args) == 2
예제 #26
0
def test_clusterfinder_nanopore(datadir_copy, tmpdir,
                                reference_fasta):  # noqa: F811
    """Check support counts of the first NANOPORE_ROVER cluster.

    Expects nalt == 6, total_left_count == 1 and total_right_count == 5.
    """
    source_bam = str(datadir_copy[NANOPORE_ROVER])
    bam_path = tmpdir.join('output.bam').strpath
    gff_path = tmpdir.join('output.gff').strpath
    fasta_path = tmpdir.join('output.fasta').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=bam_path,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        output_fasta=fasta_path,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    first_cluster = finder.clusters[0]
    assert first_cluster.nalt == 6
    assert first_cluster.total_left_count == 1
    assert first_cluster.total_right_count == 5
예제 #27
0
def test_clusterfinder_check_consistency(datadir_copy, tmpdir,
                                         reference_fasta):  # noqa: F811
    """Check genotype and counts of the first two DONT_MERGE_6 clusters.

    Both clusters must be genotyped 'reference'; the first with nref == 69
    and nalt == 3, the second with nref == 64 and nalt == 1.
    """
    source_bam = str(datadir_copy[DONT_MERGE_6])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    # One (genotype, nref, nalt) triple per cluster, in cluster order.
    expected = (('reference', 69, 3), ('reference', 64, 1))
    for position, (genotype, nref, nalt) in enumerate(expected):
        cluster = finder.clusters[position]
        assert cluster.genotype == genotype
        assert cluster.nref == nref
        assert cluster.nalt == nalt
예제 #28
0
def test_clusterfinder_do_not_merge7(datadir_copy, tmpdir,
                                     reference_fasta):  # noqa: F811
    """Expect DONT_MERGE_7 to yield two separate clusters.

    The first must be 'reference' with nref == 31 and nalt == 2; the
    second must be 'heterozygous' with nref == 45 and nalt == 71.
    """
    source_bam = str(datadir_copy[DONT_MERGE_7])
    gff_path = tmpdir.join('output.gff').strpath
    bam_path = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=bam_path,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=900,
    )
    assert len(finder.clusters) == 2
    first, second = finder.clusters

    assert first.genotype == 'reference'
    assert first.nref == 31
    assert first.nalt == 2

    assert second.genotype == 'heterozygous'
    assert second.nref == 45
    assert second.nalt == 71
예제 #29
0
def test_clusterfinder_do_not_merge5(datadir_copy, tmpdir,
                                     reference_fasta):  # noqa: F811
    """Check genotype and counts of the first two DONT_MERGE_5 clusters.

    Both clusters must be genotyped 'reference'; the first with
    nref == 108 and nalt == 4, the second with nref == 66 and nalt == 4.
    """
    source_bam = str(datadir_copy[DONT_MERGE_5])
    gff_path = tmpdir.join('output.gff').strpath
    bam_path = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=source_bam,
        output_bam=bam_path,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    # One (genotype, nref, nalt) triple per cluster, in cluster order.
    expected = (('reference', 108, 4), ('reference', 66, 4))
    for position, (genotype, nref, nalt) in enumerate(expected):
        cluster = finder.clusters[position]
        assert cluster.genotype == genotype
        assert cluster.nref == nref
        assert cluster.nalt == nalt
예제 #30
0
def test_clusterfinder_complex_genotype(datadir_copy, tmpdir,
                                        reference_fasta):  # noqa: F811
    """Check genotype counts and FASTA output for the COMPLEX dataset.

    The first cluster must report nalt == 28, and its genotype likelihoods
    must report nref == 17, nalt == 28 and a 'heterozygous' genotype. The
    file passed as output_fasta must contain exactly four lines once
    ClusterFinder has run.
    """
    input_path = str(datadir_copy[COMPLEX])
    output_bam = tmpdir.join('output.bam').strpath
    output_gff = tmpdir.join('output.gff').strpath
    output_fasta = tmpdir.join('output.fasta').strpath
    clusters = ClusterFinder(input_path=input_path,
                             output_bam=output_bam,
                             output_gff=output_gff,
                             transposon_reference_fasta=reference_fasta,
                             output_fasta=output_fasta,
                             max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE)
    cluster = clusters.clusters[0]
    assert cluster.nalt == 28
    genotype = cluster.genotype_likelihoods
    assert genotype.nref == 17
    assert genotype.nalt == 28
    assert genotype.genotype == 'heterozygous'
    # Read through a context manager so the handle is closed right away
    # instead of being left open until garbage collection.
    with open(output_fasta) as fasta_handle:
        fasta_lines = fasta_handle.readlines()
    assert len(fasta_lines) == 4