def test_clusterfinder_include_duplicates(datadir_copy):
    """Check the single INPUT cluster when include_duplicates=True."""
    bam_path = str(datadir_copy[INPUT])
    finder = ClusterFinder(
        input_path=bam_path,
        include_duplicates=True,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    assert len(finder.clusters) == 1
    assert finder.clusters[0].nalt == 26
def test_clusterfinder_multiple_cluster(datadir_copy, tmpdir):
    """Expect three clusters for the EXTENDED input with a BAM output path."""
    bam_path = str(datadir_copy[EXTENDED])
    tagged_bam = tmpdir.join('tagged_clusters.bam').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=tagged_bam,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    assert len(finder.clusters) == 3
def test_clusterfinder_single_cluster(datadir_copy):
    """Expect one cluster on '3R' with nalt 19 for the INPUT dataset."""
    bam_path = str(datadir_copy[INPUT])
    finder = ClusterFinder(
        input_path=bam_path,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    assert len(finder.clusters) == 1
    assert finder.clusters[0].nalt == 19
    assert finder.clusters[0].reference_name == '3R'
def test_cornercase(datadir_copy, tmpdir):
    """Run ClusterFinder over the three corner-case inputs in sequence."""
    # First input is read before the GFF path is built, as in the original.
    bam_path = str(datadir_copy[CORNERCASE])
    gff_path = tmpdir.join('output.gff').strpath

    # CORNERCASE with min_mapq=-1 is expected to yield exactly one cluster.
    finder = ClusterFinder(
        input_path=bam_path,
        output_gff=gff_path,
        min_mapq=-1,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    assert len(finder.clusters) == 1

    # CORNERCASE2 is expected to yield no clusters at all.
    bam_path = str(datadir_copy[CORNERCASE2])
    finder = ClusterFinder(
        input_path=bam_path,
        output_gff=gff_path,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    assert len(finder.clusters) == 0

    # CORNERCASE3 has no assertion: the run only needs to complete
    # without raising.
    bam_path = str(datadir_copy[CORNERCASE3])
    finder = ClusterFinder(
        input_path=bam_path,
        output_gff=gff_path,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
def test_nonevidence(datadir_copy):
    """Check nref of the first four clusters found in NON_SUPPORT."""
    bam_path = str(datadir_copy[NON_SUPPORT])
    first_four = ClusterFinder(input_path=bam_path).clusters[:4]
    assert len(first_four) == 4
    # Expected nref values, in cluster order.
    expected_nref = (12, 39, 32, 30)
    for cluster, nref in zip(first_four, expected_nref):
        assert cluster.nref == nref
def test_clusterfinder_decoy_chromosome(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Expect 84 clusters for the DECOY input."""
    bam_path = str(datadir_copy[DECOY])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    assert len(finder.clusters) == 84
def test_clusterfinder_start_end_problem(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Expect no clusters for START_END_PROBLEM with skip_decoy=False."""
    bam_path = str(datadir_copy[START_END_PROBLEM])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
        skip_decoy=False,
    )
    assert len(finder.clusters) == 0
def test_clusterfinder_multisample(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Check softclip cluster count and the first cluster for MULTI_H6."""
    bam_path = str(datadir_copy[MULTI_H6])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    assert len(finder.softclip_finder.clusters) == 8
    assert finder.clusters[0].nalt == 1
    # Identity check: valid_tsd must be exactly False, not merely falsy.
    assert finder.clusters[0].valid_tsd is False
def test_clusterfinder_split_cluster(datadir_copy, tmpdir):
    """Expect three clusters for SPLIT_CLUSTER; the last two must differ."""
    bam_path = str(datadir_copy[SPLIT_CLUSTER])
    bam_out = tmpdir.join('output.bam').strpath
    gff_out = tmpdir.join('out.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        include_duplicates=False,
        remove_supplementary_without_primary=False,
        output_gff=gff_out,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    assert len(finder.clusters) == 3
    assert len(finder.clusters[1]) == 39
    assert len(finder.clusters[2]) == 27
    assert finder.clusters[1] != finder.clusters[2]
def test_clusterfinder_skip_abnormal(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Expect 26 clusters for ARTEFACT_ACCUMULATION, the third abnormal."""
    # NOTE(review): a second function with this exact name is defined later
    # in this module. The later definition replaces this one at import time,
    # so this version is never collected by pytest. Its expected count (26)
    # also differs from the later version's (21 or 22) -- decide which one
    # is current and rename or remove the other.
    bam_path = str(datadir_copy[ARTEFACT_ACCUMULATION])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
        skip_decoy=False,
    )
    assert len(finder.clusters) == 26
    assert finder.clusters[2].abnormal
def test_clusterfinder_reorganize_cluster(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Expect the last REORGANIZE_CLUSTER cluster to be 'homozygous'."""
    bam_path = str(datadir_copy[REORGANIZE_CLUSTER])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    last_cluster = finder.clusters[-1]
    likelihoods = last_cluster.genotype_likelihoods
    assert likelihoods.genotype == 'homozygous'
def test_clusterfinder_refine_split(datadir_copy, tmpdir):
    """Check genotype counts of the first SPLIT_CLUSTER_OPT cluster."""
    bam_path = str(datadir_copy[SPLIT_CLUSTER_OPT])
    bam_out = tmpdir.join('output.bam').strpath
    gff_out = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        output_gff=gff_out,
        transposon_reference_fasta=None,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    first_cluster = finder.clusters[0]
    likelihoods = first_cluster.genotype_likelihoods
    assert likelihoods.nref == 73
    assert likelihoods.nalt == 117
    assert likelihoods.genotype == 'heterozygous'
def test_clusterfinder_reassemble(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Expect one cluster with a valid TSD for the REASSEMBLE input."""
    bam_path = str(datadir_copy[REASSEMBLE])
    genome_fasta = str(datadir_copy[GENOME_FRAGMENT])
    bam_out = tmpdir.join('output.bam').strpath
    gff_out = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        output_gff=gff_out,
        genome_reference_fasta=genome_fasta,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=480,
    )
    assert len(finder.clusters) == 1
    assert finder.clusters[0].valid_tsd
def test_clusterfinder_refine_coord(datadir_copy, tmpdir):
    """Check genotype counts of the last REFINE_COORD cluster."""
    bam_path = str(datadir_copy[REFINE_COORD])
    bam_out = tmpdir.join('output.bam').strpath
    gff_out = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        output_gff=gff_out,
        transposon_reference_fasta=None,
        max_proper_pair_size=480,
    )
    last_cluster = finder.clusters[-1]
    likelihoods = last_cluster.genotype_likelihoods
    assert likelihoods.nref == 0
    assert likelihoods.nalt == 3
    assert likelihoods.genotype == 'homozygous'
def test_clusterfinder_refine_tsd(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Expect the first REFINE_TSD cluster to report a valid TSD."""
    # Original author's note: this cluster should not be split, since there
    # is a small deletion associated with a TE insertion.
    # NOTE(review): the same note appears on test_clusterfinder_dont_split;
    # confirm it also applies to this dataset.
    bam_path = str(datadir_copy[REFINE_TSD])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    first_cluster = finder.clusters[0]
    assert first_cluster.valid_tsd
def test_clusterfinder_predicted_insertion_rover(
        datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Check softclip cluster count and feature_args for PREDICTED_INSERTION."""
    bam_path = str(datadir_copy[PREDICTED_INSERTION])
    gff_path = tmpdir.join('output.gff').strpath
    fasta_path = tmpdir.join('output.fa').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=None,
        output_fasta=fasta_path,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    assert len(finder.softclip_finder.clusters) == 2
    assert len(finder.clusters[0].feature_args) == 1
def test_clusterfinder_nonsupport(datadir_copy, tmpdir):
    """Expect the last NON_SUPPORT cluster to genotype as 'reference'."""
    bam_path = str(datadir_copy[NON_SUPPORT])
    bam_out = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        output_gff=None,
        transposon_reference_fasta=None,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    last_cluster = finder.clusters[-1]
    # Could also be 26 -- need to figure that out.
    assert last_cluster.nref == 25
    likelihoods = last_cluster.genotype_likelihoods
    assert likelihoods.nref == 25
    assert likelihoods.nalt == 1
    assert likelihoods.genotype == 'reference'
def test_clusterfinder_join_cluster_test(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Expect exactly two clusters, and only the second with a valid TSD."""
    bam_path = str(datadir_copy['join_cluster_test.bam'])
    bam_out = tmpdir.join('output.bam').strpath
    gff_out = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        output_gff=gff_out,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    # Unpacking fails unless exactly two clusters were produced.
    gate_cluster, rover_cluster = finder.clusters
    assert len(gate_cluster) == 1
    assert len(rover_cluster) == 21
    assert not gate_cluster.valid_tsd
    assert rover_cluster.valid_tsd
def test_clusterfinder_skip_abnormal(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Expect 21 or 22 ARTEFACT_ACCUMULATION clusters, the third abnormal."""
    bam_path = str(datadir_copy[ARTEFACT_ACCUMULATION])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
        skip_decoy=False,
    )
    # TODO: this should really be deterministic, what's going on here?
    accepted_counts = (21, 22)
    assert len(finder.clusters) in accepted_counts
    assert finder.clusters[2].abnormal
def test_clusterfinder_jockey_not_found(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Check insert_reference_name of both JOCKEY_NOT_FOUND clusters."""
    bam_path = str(datadir_copy[JOCKEY_NOT_FOUND])
    gff_out = tmpdir.join('output.gff').strpath
    fasta_out = tmpdir.join('output.fa').strpath
    bam_out = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        output_fasta=fasta_out,
        output_gff=gff_out,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    # Unpacking fails unless exactly two clusters were produced.
    rover_cluster, jockey_cluster = finder.clusters
    assert rover_cluster.insert_reference_name == 'rover'
    assert jockey_cluster.insert_reference_name == 'transposable_element_Ivk'
def test_clusterfinder_dont_split(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Expect DONT_SPLIT to stay one cluster with nref 18 and nalt 37."""
    # This cluster should not be split, since there is a small deletion
    # associated with a TE insertion.
    bam_path = str(datadir_copy[DONT_SPLIT])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    assert len(finder.clusters) == 1
    only_cluster = finder.clusters[0]
    assert only_cluster.nref == 18
    assert only_cluster.nalt == 37
def test_clusterfinder_nonsupport_reference_genotype(datadir_copy, tmpdir):
    """Expect the last cluster of this input to genotype as 'reference'."""
    bam_path = str(datadir_copy[MATE_SUPPORT_REFERENCE_GENOTYPE])
    bam_out = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        output_gff=None,
        transposon_reference_fasta=None,
        max_proper_pair_size=578,
    )
    last_cluster = finder.clusters[-1]
    # Could also be 26 -- need to figure that out.
    # NOTE(review): the remark above sits next to an expected value of 50 and
    # is worded identically to the one in test_clusterfinder_nonsupport;
    # it may be a copy-paste leftover -- confirm.
    assert last_cluster.nref == 50
    likelihoods = last_cluster.genotype_likelihoods
    assert likelihoods.nref == 50
    assert likelihoods.nalt == 2
    assert likelihoods.genotype == 'reference'
def test_clusterfinder_estimate_coverage(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Check DONT_MERGE_5 results with max_proper_pair_size=1500."""
    bam_path = str(datadir_copy[DONT_MERGE_5])
    gff_out = tmpdir.join('output.gff').strpath
    bam_out = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        output_gff=gff_out,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=1500,
    )
    assert len(finder.clusters) == 2
    first_cluster = finder.clusters[0]
    assert first_cluster.genotype == 'reference'
    assert first_cluster.nref == 66
    assert first_cluster.nalt == 4
def test_clusterfinder_homozygous_copia(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Expect the first HOMOZYGOUS_COPIA cluster to be 'homozygous'."""
    bam_path = str(datadir_copy[HOMOZYGOUS_COPIA])
    bam_out = tmpdir.join('output.bam').strpath
    gff_out = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        output_gff=gff_out,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    first_cluster = finder.clusters[0]
    assert first_cluster.nalt == 57
    likelihoods = first_cluster.genotype_likelihoods
    assert likelihoods.genotype == 'homozygous'
    assert likelihoods.nref == 0
    assert likelihoods.nalt == 57
def test_clusterfinder_clip_assigned_to_insertion(
        datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Check cluster counts and the second cluster for CLIP_TO_INSERTION."""
    bam_path = str(datadir_copy[CLIP_TO_INSERTION])
    gff_out = tmpdir.join('output.gff').strpath
    vcf_out = tmpdir.join('output.vcf').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=None,
        output_gff=gff_out,
        output_vcf=vcf_out,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    assert len(finder.softclip_finder.clusters) == 16
    assert len(finder.clusters) == 3
    assert finder.clusters[1].nalt == 66
    assert finder.clusters[1].valid_tsd
    assert len(finder.clusters[1].feature_args) == 2
def test_clusterfinder_nanopore(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Check nalt and left/right counts of the first NANOPORE_ROVER cluster."""
    bam_path = str(datadir_copy[NANOPORE_ROVER])
    bam_out = tmpdir.join('output.bam').strpath
    gff_out = tmpdir.join('output.gff').strpath
    fasta_out = tmpdir.join('output.fasta').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        output_gff=gff_out,
        transposon_reference_fasta=reference_fasta,
        output_fasta=fasta_out,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    first_cluster = finder.clusters[0]
    assert first_cluster.nalt == 6
    assert first_cluster.total_left_count == 1
    assert first_cluster.total_right_count == 5
def test_clusterfinder_check_consistency(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Check genotype, nref and nalt of the first two DONT_MERGE_6 clusters."""
    bam_path = str(datadir_copy[DONT_MERGE_6])
    gff_path = tmpdir.join('output.gff').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=None,
        output_gff=gff_path,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    # (cluster index, expected genotype, expected nref, expected nalt)
    expectations = (
        (0, 'reference', 69, 3),
        (1, 'reference', 64, 1),
    )
    for index, genotype, nref, nalt in expectations:
        cluster = finder.clusters[index]
        assert cluster.genotype == genotype
        assert cluster.nref == nref
        assert cluster.nalt == nalt
def test_clusterfinder_do_not_merge7(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Expect DONT_MERGE_7 to give one reference and one heterozygous cluster."""
    bam_path = str(datadir_copy[DONT_MERGE_7])
    gff_out = tmpdir.join('output.gff').strpath
    bam_out = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        output_gff=gff_out,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=900,
    )
    assert len(finder.clusters) == 2
    first_cluster, second_cluster = finder.clusters

    assert first_cluster.genotype == 'reference'
    assert first_cluster.nref == 31
    assert first_cluster.nalt == 2

    assert second_cluster.genotype == 'heterozygous'
    assert second_cluster.nref == 45
    assert second_cluster.nalt == 71
def test_clusterfinder_do_not_merge5(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Check genotype, nref and nalt of the first two DONT_MERGE_5 clusters."""
    bam_path = str(datadir_copy[DONT_MERGE_5])
    gff_out = tmpdir.join('output.gff').strpath
    bam_out = tmpdir.join('output.bam').strpath
    finder = ClusterFinder(
        input_path=bam_path,
        output_bam=bam_out,
        output_gff=gff_out,
        transposon_reference_fasta=reference_fasta,
        max_proper_pair_size=649,
    )
    # (cluster index, expected genotype, expected nref, expected nalt)
    expectations = (
        (0, 'reference', 108, 4),
        (1, 'reference', 66, 4),
    )
    for index, genotype, nref, nalt in expectations:
        cluster = finder.clusters[index]
        assert cluster.genotype == genotype
        assert cluster.nref == nref
        assert cluster.nalt == nalt
def test_clusterfinder_complex_genotype(datadir_copy, tmpdir, reference_fasta):  # noqa: F811
    """Check the first COMPLEX cluster's genotype and the FASTA line count.

    The FASTA file written to ``output_fasta`` is expected to contain
    exactly four lines.
    """
    input_path = str(datadir_copy[COMPLEX])
    output_bam = tmpdir.join('output.bam').strpath
    output_gff = tmpdir.join('output.gff').strpath
    output_fasta = tmpdir.join('output.fasta').strpath
    clusters = ClusterFinder(
        input_path=input_path,
        output_bam=output_bam,
        output_gff=output_gff,
        transposon_reference_fasta=reference_fasta,
        output_fasta=output_fasta,
        max_proper_pair_size=DEFAULT_MAX_PROPER_PAIR_SIZE,
    )
    cluster = clusters.clusters[0]
    assert cluster.nalt == 28
    genotype = cluster.genotype_likelihoods
    assert genotype.nref == 17
    assert genotype.nalt == 28
    assert genotype.genotype == 'heterozygous'
    # Read through a context manager so the handle is closed deterministically
    # instead of being left for the garbage collector (avoids ResourceWarning).
    with open(output_fasta) as fasta_handle:
        fasta_lines = fasta_handle.readlines()
    assert len(fasta_lines) == 4