def get_dbs(self, sort=False, orientation=None, rm_duplicate=False, dbd_tag=False):
    """Collect the DNA binding sites of all entries into one GenomicRegionSet.

    For every entry of ``self.sequences`` a new GenomicRegion is built from the
    coordinates and orientation of its DNA side (``rd.dna``), with ``rd.score``
    stored as the region data.

    *Keyword arguments:*

        - sort -- If True, sort the resulting set before returning it.
        - orientation -- If given (truthy), keep only the entries whose ``orient``
          attribute equals this value.
        - rm_duplicate -- If True, call ``remove_duplicates()`` on the resulting set.
        - dbd_tag -- If True, name each region with ``rd.rna.str_rna()``;
          otherwise reuse the name of the DNA region.

    *Return:*

        - dna_set -- A GenomicRegionSet named "DNA_binding_sites".
    """
    dna_set = GenomicRegionSet(name="DNA_binding_sites")
    if len(self) == 0:
        return dna_set

    for rd in self.sequences:
        site = rd.dna
        label = rd.rna.str_rna() if dbd_tag else site.name
        dbs = GenomicRegion(chrom=site.chrom, initial=site.initial, final=site.final,
                            name=label, orientation=site.orientation, data=rd.score)
        # Without an orientation filter every site is kept.
        if not orientation or orientation == rd.orient:
            dna_set.add(dbs)

    if sort:
        dna_set.sort()
    if rm_duplicate:
        dna_set.remove_duplicates()
    return dna_set
def get_biotypes(self, gene_set=None):
    """Collect every "exon" annotation entry into a single GenomicRegionSet.

    Each region is named with the transcript ID of its annotation entry.

    NOTE(review): the method name suggests a split by biotype, but the code
    only gathers exon regions into one set -- confirm the intended behaviour.

    *Keyword arguments:*

        - gene_set -- Only used to choose the return shape; it does not narrow
          the query.

    *Return:*

        - result_grs -- A GenomicRegionSet named "exon".
        - None -- Returned as a second element only when gene_set is truthy
          (no gene-name mapping is performed here).
    """
    exon_query = {self.GeneField.FEATURE_TYPE: "exon"}
    exon_annotations = self.get(exon_query)

    result_grs = GenomicRegionSet("exon")
    for entry in exon_annotations.gene_list:
        # NOTE(review): the region object stored in the entry is renamed in place.
        region = entry[self.GeneField.GENOMIC_REGION]
        region.name = entry[self.GeneField.TRANSCRIPT_ID]
        result_grs.add(region)

    if not gene_set:
        return result_grs
    return result_grs, None
def get_genes(self, gene_set=None):
    """Gets the regions of genes as a merged GenomicRegionSet.

    The gene ID of each annotation entry is put in the NAME field of its
    GenomicRegion before the set is merged.

    Keyword arguments:
    gene_set -- A set of genes to narrow the search.

    Return:
    result_grs -- A GenomicRegionSet (named "genes") containing the genes.
    unmapped_gene_list -- Second value returned by fix_gene_names (genes that
                          could not be mapped); only returned when gene_set
                          is given.
    """
    fields = self.GeneField
    unmapped_gene_list = None

    # Build the query, restricting it to the mapped gene IDs when requested.
    gene_query = {fields.FEATURE_TYPE: "gene"}
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)
        gene_query[fields.GENE_ID] = mapped_gene_list
    matches = self.get(gene_query)

    result_grs = GenomicRegionSet("genes")
    for entry in matches.gene_list:
        # NOTE(review): the region object stored in the entry is renamed in place.
        region = entry[fields.GENOMIC_REGION]
        region.name = entry[fields.GENE_ID]
        result_grs.add(region)
    result_grs.merge()

    if gene_set:
        return result_grs, unmapped_gene_list
    return result_grs
def merge_rbs(self, rm_duplicate=False, asgene_organism=None, cutoff=0):
    """Merge overlapping RNA binding sites and group their DNA binding sites.

    The RNA binding sites returned by ``self.get_rbs()`` are merged; for every
    merged site, the DNA regions of all entries whose RNA side overlaps it are
    collected into a GenomicRegionSet. The result is stored in
    ``self.merged_dict`` as an OrderedDict mapping merged RNA binding site ->
    GenomicRegionSet. Nothing is returned.

    *Keyword arguments:*

        - rm_duplicate -- If True, remove duplicated DNA regions in each group.
        - asgene_organism -- If given, each kept group is replaced by the result
          of ``gene_association(organism=asgene_organism)``.
        - cutoff -- Only groups with more than this number of regions are kept.
    """
    rna_merged = self.get_rbs()
    rna_merged.merge()

    # Merged RBS as key, GenomicRegionSet of its DNA binding sites as value.
    new_dict = OrderedDict()
    for rbsm in rna_merged:
        regions = GenomicRegionSet(rbsm.toString())
        for rd in self:
            if rbsm.overlap(rd.rna):
                regions.add(rd.dna)
        if rm_duplicate:
            regions.remove_duplicates()

        if len(regions) <= cutoff:
            continue

        new_dict[rbsm] = regions
        if asgene_organism:
            # Best effort: keep the plain regions if the association fails.
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            try:
                new_dict[rbsm] = regions.gene_association(organism=asgene_organism)
            except Exception:
                print("* No annotation file for mapping associated genes.")

    self.merged_dict = new_dict
def get_transcripts(self, gene_set=None):
    """Gets the exon entries of genes, each named by its transcript ID.

    The annotation is queried for "exon" features and the transcript ID of
    each entry is put in the NAME field of its GenomicRegion. The resulting
    set is not merged.

    *Keyword arguments:*

        - gene_set -- A set of genes to narrow the search.

    *Return:*

        - result_grs -- A GenomicRegionSet (named "exon") containing the exons.
        - unmapped_gene_list -- Second value returned by fix_gene_names (genes
          that could not be mapped); only returned when gene_set is given.
    """
    fields = self.GeneField
    unmapped_gene_list = None

    exon_query = {fields.FEATURE_TYPE: "exon"}
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)
        exon_query[fields.GENE_ID] = mapped_gene_list
    exon_annotations = self.get(exon_query)

    result_grs = GenomicRegionSet("exon")
    for entry in exon_annotations.gene_list:
        # NOTE(review): the region object stored in the entry is renamed in place.
        region = entry[fields.GENOMIC_REGION]
        region.name = entry[fields.TRANSCRIPT_ID]
        result_grs.add(region)

    if not gene_set:
        return result_grs
    return result_grs, unmapped_gene_list
def merge_rbs(self, rm_duplicate=False, asgene_organism=None, region_set=None, cutoff=0):
    """Merge overlapping RNA binding sites and group their DNA binding sites.

    The RNA binding sites returned by ``self.get_rbs()`` are merged; for every
    merged site, the DNA regions of all entries whose RNA side overlaps it are
    collected into a GenomicRegionSet. The result is stored in
    ``self.merged_dict`` as an OrderedDict mapping merged RNA binding site ->
    GenomicRegionSet. Nothing is returned.

    *Keyword arguments:*

        - rm_duplicate -- If True, remove duplicated DNA regions in each group.
        - asgene_organism -- If given, each kept group is replaced by the result
          of ``gene_association(organism=asgene_organism)``.
        - region_set -- If given, ``replace_region_name(regions=region_set)`` is
          called on each kept group.
        - cutoff -- Only groups with more than this number of regions are kept.
    """
    rna_merged = self.get_rbs()
    rna_merged.merge()

    # Merged RBS as key, GenomicRegionSet of its DNA binding sites as value.
    new_dict = OrderedDict()
    for rbsm in rna_merged:
        regions = GenomicRegionSet(rbsm.toString())
        for rd in self:
            if rbsm.overlap(rd.rna):
                regions.add(rd.dna)
        if rm_duplicate:
            regions.remove_duplicates()

        if len(regions) <= cutoff:
            continue

        new_dict[rbsm] = regions
        if asgene_organism:
            # Deliberately best effort: when the association fails the plain
            # regions are kept. Exception (not a bare except) so that
            # Ctrl-C / SystemExit are no longer swallowed.
            try:
                new_dict[rbsm] = regions.gene_association(organism=asgene_organism)
            except Exception:
                pass
        if region_set:
            new_dict[rbsm].replace_region_name(regions=region_set)

    self.merged_dict = new_dict
def get_exons(self, start_site=False, end_site=False, gene_set=None, merge=True):
    """Gets exons of genes as a GenomicRegionSet.

    The transcript ID of each annotation entry is put in the NAME field of
    its GenomicRegion.

    *Keyword arguments:*

        - start_site -- If True, relocate the regions with the "leftend" mode.
        - end_site -- If True (and start_site is False), relocate the regions
          with the "rightend" mode.
        - gene_set -- A set of genes to narrow the search.
        - merge -- If True, merge the resulting regions.

    *Return:*

        - result_grs -- A GenomicRegionSet (named "exon") containing the exons.
        - unmapped_gene_list -- Second value returned by fix_gene_names (genes
          that could not be mapped); only returned when gene_set is given.
    """
    fields = self.GeneField
    unmapped_gene_list = None

    exon_query = {fields.FEATURE_TYPE: "exon"}
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)
        exon_query[fields.GENE_ID] = mapped_gene_list
    exon_annotations = self.get(exon_query)

    result_grs = GenomicRegionSet("exon")
    for entry in exon_annotations.gene_list:
        # NOTE(review): the region object stored in the entry is renamed in place.
        region = entry[fields.GENOMIC_REGION]
        region.name = entry[fields.TRANSCRIPT_ID]
        result_grs.add(region)

    # start_site takes precedence over end_site.
    if start_site:
        anchor = "leftend"
    elif end_site:
        anchor = "rightend"
    else:
        anchor = None
    if anchor is not None:
        result_grs.relocate_regions(anchor, left_length=1, right_length=1)

    if merge:
        result_grs.merge()

    if not gene_set:
        return result_grs
    return result_grs, unmapped_gene_list
def get_promoters(self, promoterLength=1000, gene_set=None, unmaplist=False):
    """Gets promoters of transcripts given a specific promoter length.

    The annotation is queried for "transcript" features. Every region is
    replaced by its promoter: promoterLength base pairs on the upstream side
    plus the coordinate of the 5' base pair, so each promoter's actual length
    is promoterLength+1. The gene ID of each entry is put in the NAME field.

    Keyword arguments:
    promoterLength -- The length of the promoter region.
    gene_set -- A set of genes to narrow the search.
    unmaplist -- If True, also return the unmapped gene list.

    Return:
    result_grs -- A GenomicRegionSet (named "promoters") containing the promoters.
    unmapped_gene_list -- Second value returned by fix_gene_names (None when no
                          gene_set was given); only returned when unmaplist is True.
    """
    fields = self.GeneField
    unmapped_gene_list = None

    transcript_query = {fields.FEATURE_TYPE: "transcript"}
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)
        transcript_query[fields.GENE_ID] = mapped_gene_list
    transcripts = self.get(transcript_query)

    result_grs = GenomicRegionSet("promoters")
    for entry in transcripts.gene_list:
        # NOTE(review): the region object stored in the entry is modified in place.
        region = entry[fields.GENOMIC_REGION]
        if region.orientation == "+":
            # Final must be derived before initial is shifted upstream.
            region.final = region.initial + 1
            region.initial -= promoterLength
        else:
            region.initial = region.final - 1
            region.final = region.initial + promoterLength + 1
        region.name = entry[fields.GENE_ID]
        result_grs.add(region)

    if not unmaplist:
        return result_grs
    return result_grs, unmapped_gene_list
def get_dbs(self, sort=False, orientation=None, rm_duplicate=False):
    """Collect the DNA binding sites of all entries into one GenomicRegionSet.

    *Keyword arguments:*

        - sort -- If True, sort the resulting set before returning it.
        - orientation -- If given (truthy), keep only the entries whose ``orient``
          attribute equals this value.
        - rm_duplicate -- If True, call ``remove_duplicates()`` on the resulting set.

    *Return:*

        - dna_set -- A GenomicRegionSet named "DNA_binding_sites" holding the
          ``dna`` region of every selected entry of ``self.sequences``.
    """
    dna_set = GenomicRegionSet(name="DNA_binding_sites")

    for binding in self.sequences:
        # Without an orientation filter every site is kept.
        if not orientation or orientation == binding.orient:
            dna_set.add(binding.dna)

    if sort:
        dna_set.sort()
    if rm_duplicate:
        dna_set.remove_duplicates()
    return dna_set
def get_promoters(self, promoterLength=1000, gene_set=None, unmaplist=False, variants=False):
    """Gets promoters of genes given a specific promoter length.

    Every queried region is replaced by its promoter: promoterLength base
    pairs on the upstream side plus the coordinate of the 5' base pair, so
    each promoter's actual length is promoterLength+1. The gene ID of each
    entry is put in the NAME field of its GenomicRegion.

    *Keyword arguments:*

        - promoterLength -- The length of the promoter region.
        - gene_set -- A set of genes to narrow the search.
        - unmaplist -- If True, also return the unmapped gene list (default = False).
        - variants -- If True, query "transcript" features instead of "gene"
          features.

    *Return:*

        - result_grs -- A GenomicRegionSet (named "promoters") containing the promoters.
        - unmapped_gene_list -- Second value returned by fix_gene_names (None when
          no gene_set was given); only returned when unmaplist is True.
    """
    fields = self.GeneField
    unmapped_gene_list = None

    target = "transcript" if variants else "gene"
    feature_query = {fields.FEATURE_TYPE: target}
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)
        feature_query[fields.GENE_ID] = mapped_gene_list
    features = self.get(feature_query)

    result_grs = GenomicRegionSet("promoters")
    for entry in features.gene_list:
        # NOTE(review): the region object stored in the entry is modified in place.
        region = entry[fields.GENOMIC_REGION]
        if region.orientation == "+":
            # Final must be derived before initial is shifted upstream.
            region.final = region.initial + 1
            region.initial -= promoterLength
        else:
            region.initial = region.final - 1
            region.final = region.initial + promoterLength + 1
        region.name = entry[fields.GENE_ID]
        result_grs.add(region)

    if not unmaplist:
        return result_grs
    return result_grs, unmapped_gene_list
def get_tts(self, gene_set=None):
    """Gets TTS (transcription termination site) of genes.

    It returns a merged GenomicRegionSet with one 1-bp region per gene,
    located at the 3' end of the gene: the last base (``final - 1``) for
    "+" strand genes and the first base (``initial``) for all other
    orientations. The ID of each gene is put in the NAME field of each
    GenomicRegion.

    *Keyword arguments:*

        - gene_set -- A set of genes to narrow the search.

    *Return:*

        - result_grs -- A GenomicRegionSet (named "TTS") containing the TTS.
        - unmapped_gene_list -- Second value returned by fix_gene_names (genes
          that could not be mapped); only returned when gene_set is given.
    """
    # Fetching gene names
    mapped_gene_list = None
    unmapped_gene_list = None
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)

    # Fetching genes
    query_dictionary = {self.GeneField.FEATURE_TYPE: "gene"}
    if gene_set:
        query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list
    query_annset = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("TTS")
    for e in query_annset.gene_list:
        # NOTE(review): the region object stored in the entry is modified in place.
        gr = e[self.GeneField.GENOMIC_REGION]
        if gr.orientation == "+":
            # Forward strand: transcription terminates at the right-most base.
            gr.initial = gr.final - 1
        else:
            # Reverse strand: transcription terminates at the left-most base.
            gr.final = gr.initial + 1
        gr.name = e[self.GeneField.GENE_ID]
        result_grs.add(gr)
    result_grs.merge()

    if gene_set:
        return result_grs, unmapped_gene_list
    return result_grs
def get_exons(self, start_site=False, end_site=False, gene_set=None, merge=True):
    """Gets exons of genes as a GenomicRegionSet.

    The transcript ID of each annotation entry is put in the NAME field of
    its GenomicRegion.

    *Keyword arguments:*

        - start_site -- If True, relocate the regions with the "leftend" mode.
        - end_site -- If True (and start_site is False), relocate the regions
          with the "rightend" mode.
        - gene_set -- A set of genes to narrow the search.
        - merge -- If True, merge the resulting regions.

    *Return:*

        - result_grs -- A GenomicRegionSet (named "exon") containing the exons.
        - unmapped_gene_list -- Second value returned by fix_gene_names (genes
          that could not be mapped); only returned when gene_set is given.
    """
    gene_field = self.GeneField
    unmapped_gene_list = None

    # Restrict the exon query to the mapped gene IDs when a gene set is given.
    query = {gene_field.FEATURE_TYPE: "exon"}
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)
        query[gene_field.GENE_ID] = mapped_gene_list
    hits = self.get(query)

    result_grs = GenomicRegionSet("exon")
    for record in hits.gene_list:
        # NOTE(review): the region object stored in the record is renamed in place.
        exon = record[gene_field.GENOMIC_REGION]
        exon.name = record[gene_field.TRANSCRIPT_ID]
        result_grs.add(exon)

    # start_site wins when both flags are set.
    relocation = "leftend" if start_site else ("rightend" if end_site else None)
    if relocation is not None:
        result_grs.relocate_regions(relocation, left_length=1, right_length=1)

    if merge:
        result_grs.merge()

    return (result_grs, unmapped_gene_list) if gene_set else result_grs
def get_tts(self, gene_set=None):
    """Gets TTS (transcription termination site) of genes.

    It returns a merged GenomicRegionSet with one 1-bp region per gene,
    located at the 3' end of the gene: the last base (``final - 1``) for
    "+" strand genes and the first base (``initial``) for all other
    orientations. The ID of each gene is put in the NAME field of each
    GenomicRegion.

    *Keyword arguments:*

        - gene_set -- A set of genes to narrow the search.

    *Return:*

        - result_grs -- A GenomicRegionSet (named "TTS") containing the TTS.
        - unmapped_gene_list -- Second value returned by fix_gene_names (genes
          that could not be mapped); only returned when gene_set is given.
    """
    # Fetching gene names
    mapped_gene_list = None
    unmapped_gene_list = None
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)

    # Fetching genes
    query_dictionary = {self.GeneField.FEATURE_TYPE: "gene"}
    if gene_set:
        query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list
    query_annset = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("TTS")
    for e in query_annset.gene_list:
        # NOTE(review): the region object stored in the entry is modified in place.
        gr = e[self.GeneField.GENOMIC_REGION]
        if gr.orientation == "+":
            # Forward strand: the termination site is the right-most base
            # (the previous code kept the left end, i.e. the start site).
            gr.initial = gr.final - 1
        else:
            # Reverse strand: the termination site is the left-most base.
            gr.final = gr.initial + 1
        gr.name = e[self.GeneField.GENE_ID]
        result_grs.add(gr)
    result_grs.merge()

    if gene_set:
        return result_grs, unmapped_gene_list
    return result_grs
def get_genes(self, gene_set=None):
    """Gets the regions of genes as a merged GenomicRegionSet.

    The gene ID of each annotation entry is put in the NAME field of its
    GenomicRegion before the set is merged.

    Keyword arguments:
    gene_set -- A set of genes to narrow the search.

    Return:
    result_grs -- A GenomicRegionSet (named "genes") containing the genes.
    unmapped_gene_list -- Second value returned by fix_gene_names (genes that
                          could not be mapped); only returned when gene_set
                          is given.
    """
    gene_field = self.GeneField
    unmapped_gene_list = None

    # Restrict the gene query to the mapped gene IDs when a gene set is given.
    query = {gene_field.FEATURE_TYPE: "gene"}
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)
        query[gene_field.GENE_ID] = mapped_gene_list
    hits = self.get(query)

    result_grs = GenomicRegionSet("genes")
    for record in hits.gene_list:
        # NOTE(review): the region object stored in the record is renamed in place.
        gene_region = record[gene_field.GENOMIC_REGION]
        gene_region.name = record[gene_field.GENE_ID]
        result_grs.add(gene_region)
    result_grs.merge()

    return (result_grs, unmapped_gene_list) if gene_set else result_grs
Updated on 22 May 2014 by Joseph """ ############################# Parameters ############################## parser = argparse.ArgumentParser(description='Return the random sequences according to the given parameters.') parser.add_argument('-o','-organism', default= "hg19", help='Define the organism. Default: hg19') parser.add_argument('-l','-length', type=int, help='Define the length of each sequence.') parser.add_argument('-n','-number', type=int, help='Define the number of random regions.') parser.add_argument('-f','-filter', default=None, help='Given the path to the BED file as the filter.') args = parser.parse_args() # Setup the entries region = GenomicRegion("sample", initial=0, final=args.l) template = GenomicRegionSet("tamplate") template.add(region) if not os.path.exists(bed_dir): os.makedirs(bed_dir) # Random region result = template.random_regions(organism= "hg19", total_size=args.n, multiply_factor=0, overlap_result=True, overlap_input=True, chrom_X=False, chrom_M=False, filter_path=args.f) result.write(os.path.join(bed_dir, "00total.bed")) chrom = GenomicRegionSet("chrom") chrom.get_genome_data(organism=args.o, chrom_X=False, chrom_M=False) chrom_list = [] for r in chrom.sequences: chrom_list.append(r.chrom) print("Settings:\n\tAllowing overlapping within random regions.")
def get_promoters(self, promoter_length=1000, tss=0, gene_set=None, unmaplist=False, variants=False,
                  gene_id=False, regiondata=False):
    """Gets promoters of genes given a specific promoter length.

    Every queried region is replaced by its promoter: promoter_length base
    pairs upstream of the 5' end, the 5' base pair itself and ``tss`` further
    base pairs into the gene body. Each promoter's actual length is therefore
    promoter_length + 1 + tss on both strands.

    *Keyword arguments:*

        - promoter_length -- The length of the promoter region.
        - tss -- Number of additional base pairs, past the 5' base pair towards
          the gene body, to include (default = 0).
        - gene_set -- A set of genes to narrow the search.
        - unmaplist -- If True, also return the unmapped gene list (default = False).
        - variants -- If True, query "transcript" features instead of "gene" features.
        - gene_id -- Without a gene_set: if True name regions by gene ID,
          otherwise by the GENE_NAMES field.
        - regiondata -- If True and a gene_set is given, set each region's data
          to ``gene_set.values[region name]`` (assumes gene_set exposes a
          ``values`` mapping -- TODO confirm).

    *Return:*

        - result_grs -- A GenomicRegionSet (named "promoters") containing the promoters.
        - unmapped_gene_list -- Second value returned by fix_gene_names (None when
          no gene_set was given); only returned when unmaplist is True.
    """
    fields = self.GeneField

    # Fetching gene names
    unmapped_gene_list = None
    mapping_dict = None
    target = "transcript" if variants else "gene"
    query_dictionary = {fields.FEATURE_TYPE: target}
    if gene_set:
        mapped_gene_list, unmapped_gene_list, mapping_dict = self.fix_gene_names(gene_set, output_dict=True)
        query_dictionary[fields.GENE_ID] = mapped_gene_list

    # Fetching genes
    query_annset = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("promoters")
    for e in query_annset.gene_list:
        # NOTE(review): the region object stored in the entry is modified in place.
        gr = e[fields.GENOMIC_REGION]
        if gr.orientation == "+":
            five_prime = gr.initial
            gr.initial = five_prime - promoter_length
            gr.final = five_prime + 1 + tss
        else:
            # Both ends are derived from the original 5' coordinate; deriving
            # final from the already shifted initial would shorten the
            # upstream part by tss.
            five_prime = gr.final
            gr.initial = five_prime - 1 - tss
            gr.final = five_prime + promoter_length

        if gene_set:
            # Prefer the name from the gene-name mapping; fall back to the raw
            # gene ID when the mapping has no entry for it.
            try:
                gr.name = mapping_dict[e[fields.GENE_ID]]
            except KeyError:
                gr.name = e[fields.GENE_ID]
        elif gene_id:
            gr.name = e[fields.GENE_ID]
        else:
            gr.name = e[fields.GENE_NAMES]

        if gene_set and regiondata:
            gr.data = gene_set.values[gr.name]

        result_grs.add(gr)

    if unmaplist:
        return result_grs, unmapped_gene_list
    return result_grs
class TestGenomicRegionSet(unittest.TestCase): def region_sets(self,listA,listB): """ Setting two GenomicRegionSets as self.setA and self.setB for each case test. """ self.setA = GenomicRegionSet('for Unit Test') for i in range(len(listA)): self.setA.add(GenomicRegion(chrom=listA[i][0], initial=listA[i][1], final=listA[i][2])) self.setB = GenomicRegionSet('for Unit Test') for i in range(len(listB)): self.setB.add(GenomicRegion(chrom=listB[i][0], initial=listB[i][1], final=listB[i][2])) def test_extend(self): """ Two empty sets A : none R : none """ self.region_sets([], []) self.setA.extend(100,100) self.assertEqual(len(self.setA.sequences), 0) """ One region A : ----- R : --------- """ self.region_sets([['chr1',5,10]], []) result = self.setA result.extend(4,4) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 14) """ Many region A : ----- ------ ----- ----- R : --------=--------- ------------------ """ self.region_sets([['chr1',5,10],['chr1',15,20],['chr1',40,50],['chr1',65,75]], []) result = self.setA result.extend(5,5) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 15) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 25) self.assertEqual(result[2].initial, 35) self.assertEqual(result[2].final, 55) self.assertEqual(result[3].initial, 60) self.assertEqual(result[3].final, 80) """ Many region in different chromosome A : ----- ------ ----- ----- R : none """ self.region_sets([['chr1',5,10],['chr2',15,20],['chr3',40,50],['chr4',65,75]], []) result = self.setA result.extend(5,5) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 15) self.assertEqual(result[0].chrom, 'chr1') self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 25) self.assertEqual(result[1].chrom, 'chr2') self.assertEqual(result[2].initial, 35) self.assertEqual(result[2].final, 55) 
self.assertEqual(result[2].chrom, 'chr3') self.assertEqual(result[3].initial, 60) self.assertEqual(result[3].final, 80) self.assertEqual(result[3].chrom, 'chr4') """ One region A : ----- R : --------- """ self.region_sets([['chr1',100,200]], []) result = self.setA result.extend(10,10,percentage=True) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 90) self.assertEqual(result[0].final, 210) def test_sort(self): self.region_sets([['chr1',15,20],['chr1',40,50],['chr1',65,75],['chr1',5,10]], []) self.setA.sort() def test_intersect(self): """ Two empty sets A : none B : none R : none """ self.region_sets([], []) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ One empty set A : ----- B : none R : none """ self.region_sets([['chr1',5,10]], []) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ A : none B : ----- R : none """ self.region_sets([], [['chr1',5,10]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ No overlapping A : ------ --------- ------- B : ---- ------ ------ R : none """ self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]], [['chr1',7,9],['chr1',20,25],['chr1',26,31]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = 
self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ End-to-end attach A : ------ ------ B : ------ R : none """ self.region_sets([['chr1',1,5],['chr1',11,20]], [['chr1',5,11]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ No length attach A : . . B : . . R : none """ self.region_sets([['chr1',2,2],['chr1',20,20]], [['chr1',5,5],['chr1',20,20]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Perfect overlapping A : ------ B : ------ R : ------ """ self.region_sets([['chr1',1,10],['chr1',500,550],['chr1',600,650],['chr1',700,750],['chr1',725,800]], [['chr1',1,10],['chr1',500,550],['chr1',600,650],['chr1',700,750],['chr1',725,800]]) result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP, rm_duplicates=True) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) """ One overlapping region A : ------ B : -------- R1: -- (overlap) R2: ------ (original) R3: (comp_incl) """ self.region_sets([['chr1',1,10]], [['chr1',7,20]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 7) 
self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Two simple overlapping regions A : ------- -------- B : ------------- R1: --- ---- (overlap) R2: ------- -------- (original) R3: (comp_incl) """ self.region_sets([['chr1',1,10],['chr1',26,35]], [['chr1',7,30]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 30) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Two separately overlapping regions A : ------- -------- B : ----- -------- R1: --- ---- (overlap) R2: ------- -------- (original) R3: (comp_incl) """ self.region_sets([['chr1',1,10],['chr1',26,35]], [['chr1',7,15],['chr1',30,40]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 30) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Many various overlapping (mixed) A : ------------------ 
-------- --------- B : ---- ------- ------ ---------- R1: -- ------- -- ---- --- (overlap) R2: ------------------ -------- --------- (original) R3: (comp_incl) """ self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]], [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]]) result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 3) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 27) self.assertEqual(result[2].final, 30) self.assertEqual(result[3].initial, 55) self.assertEqual(result[3].final, 60) self.assertEqual(result[4].initial, 70) self.assertEqual(result[4].final, 75) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 3) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 50) self.assertEqual(result[1].final, 60) self.assertEqual(result[2].initial, 70) self.assertEqual(result[2].final, 85) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Different chromosomes A : chr1 ------- B : chr2 ------- R : none """ self.region_sets([['chr1',1,10]], [['chr2',1,10]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Completely included overlapping A : --------------------------- B : ---- ------ ----------- R1: ---- ------ ------ (overlap) R2: --------------------------- (original) R3: (comp_incl) """ self.region_sets([['chr1',1,50]], [['chr1',1,5],['chr1',10,19],['chr1',45,60]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) 
self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ A : ---- ------ ----------- B : --------------------------- R1: ---- ------ ------ (overlap) R2: ---- ------ ----------- (original) R3: ---- ------ (comp_incl) """ self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]], [['chr1',1,50]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 60) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) """ A : -------------- ------- ------ B : ----- ---------------- R1: ----- ------- (overlap) ---- R2: -------------- ------- (original) ------ R3: ------- (comp_incl) """ self.region_sets([['chr1',1,50],['chr1',20,40],['chr1',70,80]], [['chr1',25,45],['chr1',65,95]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 25) 
self.assertEqual(result[0].final, 45) self.assertEqual(result[1].initial, 70) self.assertEqual(result[1].final, 80) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[1].initial, 20) self.assertEqual(result[1].final, 40) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 50) self.assertEqual(result[2].initial, 70) self.assertEqual(result[2].final, 80) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 70) self.assertEqual(result[0].final, 80) def test_closest(self): """ Two empty sets A : none B : none R : none """ self.region_sets([], []) result = self.setA.closest(self.setB) self.assertEqual(len(result), 0) # """ # One empty set # A : ----- # B : none # R : none # """ # self.region_sets([['chr1',5,10]], # []) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # A : none # B : ----- # R : none # """ # self.region_sets([], # [['chr1',5,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # Overlapping within set # A : -----====----- # B : ---- # R : ---- # """ # self.region_sets([['chr1',1,10],['chr1',6,15]], # [['chr1',6,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # """ # A : ---- # B : -----====----- # R : -----====----- # """ # self.region_sets([['chr1',6,10]], # [['chr1',1,10],['chr1',6,15]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # """ # No overlapping # A : ------ --------- ------- # B : ---- ------ ------ # R : ------ # """ # self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]], # [['chr1',7,9],['chr1',20,25],['chr1',26,31]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 3) # # self.assertEqual(result[0].initial, 20) # # self.assertEqual(result[0].final, 25) # """ # End-to-end attach # A : ------ ------ # B : 
------ # R : ------ # """ # self.region_sets([['chr1',1,5],['chr1',11,20]], # [['chr1',5,11]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # # self.assertEqual(result[0].initial, 5) # # self.assertEqual(result[0].final, 11) # """ # Perfect overlapping # A : ------ # B : ------ # R : ------ # """ # self.region_sets([['chr1',1,10]], # [['chr1',1,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # self.assertEqual(result[0].initial, 1) # self.assertEqual(result[0].final, 10) # """ # One overlapping region # A : ------ # B : -------- # R : -------- # """ # self.region_sets([['chr1',1,10]], # [['chr1',7,20]]) # result = self.setA.closest(self.setB) # self.assertEqual(result[0].initial, 7) # self.assertEqual(result[0].final, 20) # """ # Two simple overlapping regions # A : ------- -------- # B : ------------- # R : ------------- # """ # self.region_sets([['chr1',1,10],['chr1',26,35]], # [['chr1',7,30]]) # result = self.setA.closest(self.setB) # self.assertEqual(result[0].initial, 7) # self.assertEqual(result[0].final, 30) # """ # Two separately overlapping regions # A : ------- -------- # B : ----- -------- # R : none # """ # self.region_sets([['chr1',1,10],['chr1',26,35]], # [['chr1',7,15],['chr1',30,40]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # """ # Many various overlapping (mixed) # A : ------------------ -------- --------- # B : ---- ------- ------ ---------- # R : none # """ # self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]], # [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 4) # """ # Different chromosomes # A : chr1 ------- # B : chr2 ------- # R : chr2 ------- # # """ # self.region_sets([['chr1',1,10]], # [['chr2',1,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # Completely included overlapping # A : --------------------------- 
# B : ---- ------ ----------- # R : ---- ------ ----------- # """ # self.region_sets([['chr1',1,50]], # [['chr1',1,5],['chr1',10,19],['chr1',45,60]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 3) # """ # A : ---- ------ ----------- # B : --------------------------- # R : none # """ # self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]], # [['chr1',1,50]]) # result = self.setA.closest(self.setB) # self.assertEqual(result, False) # """ # A : ---- ------ --- # B : --- ----- # R : --- # """ # self.region_sets([['chr1',1,5],['chr1',27,45],['chr1',85,95]], # [['chr1',15,20],['chr1',55,65]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # self.assertEqual(result[0].initial, 15) # self.assertEqual(result[0].final, 20) def test_remove_duplicates(self): """ A : ===== ----- R : ----- ----- """ self.region_sets([['chr1',1,10],['chr1',1,10],['chr1',15,25]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) """ A : =====--- ----- R : =====--- ----- """ self.region_sets([['chr1',1,10],['chr1',1,15],['chr1',20,25]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 1) self.assertEqual(result[1].final, 15) self.assertEqual(result[2].initial, 20) self.assertEqual(result[2].final, 25) """ A : ===== ----- ------ ==== R : ----- ----- ------ ---- """ self.region_sets([['chr1',1,10],['chr1',1,10],['chr1',15,25],['chr1',30,35],['chr1',40,45],['chr1',40,45]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) 
self.assertEqual(result[1].final, 25) self.assertEqual(result[2].initial, 30) self.assertEqual(result[2].final, 35) self.assertEqual(result[3].initial, 40) self.assertEqual(result[3].final, 45) def test_window(self): """ A : ------- B : ------[ 99 ] [ 199 ]--- window = 100 R : - only one base overlaps with extending A """ self.region_sets([['chr1',200,300]], [['chr1',1,101],['chr1',499,550]]) result = self.setA.window(self.setB,adding_length=100) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 100) self.assertEqual(result[0].final, 101) """ A : ------- B : ------[ 99 ] [ 199 ]--- window = 200 R : ------ - left-hand side is covered, and the right-hand side is only one base overlapped """ self.region_sets([['chr1',200,300]], [['chr1',1,101],['chr1',499,550]]) result = self.setA.window(self.setB,adding_length=200) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) # GenomicRegion.extend will choose 1 rather than 0 self.assertEqual(result[0].final, 101) self.assertEqual(result[1].initial, 499) self.assertEqual(result[1].final, 500) """ A : ---- ---- B : -------- ---- window = 1000 (default) R : ---- ---- """ self.region_sets([['chr1',3000,3500],['chr1',4000,4500]], [['chr1',1500,2500],['chr1',5000,5500]]) result = self.setA.window(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 2000) self.assertEqual(result[0].final, 2500) self.assertEqual(result[1].initial, 5000) self.assertEqual(result[1].final, 5500) """ A : ---- ---- B : -------- ---- window = 2000 R : -------- ---- ---- ---- window = 100 R : none """ self.region_sets([['chr1',3000,3500],['chr1',4000,4500]], [['chr1',1500,2500],['chr1',5000,5500]]) result = self.setA.window(self.setB,adding_length=2000) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1500) self.assertEqual(result[0].final, 2500) self.assertEqual(result[1].initial, 5000) self.assertEqual(result[1].final, 5500) result = 
self.setA.window(self.setB,adding_length=100) self.assertEqual(len(result), 0) def test_subtract(self): """ A : none B : ------ R : none """ self.region_sets([], [['chr1',6,15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : ------ B : none R : ------ """ self.region_sets([['chr1',6,15]], []) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 6) self.assertEqual(result[0].final, 15) """ A : ------ B : ------ R : --- """ self.region_sets([['chr1',1,10]], [['chr1',6,15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 6) """ A : ------ B : ------ R : --- """ self.region_sets([['chr1',6,15]], [['chr1',1,10]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 10) self.assertEqual(result[0].final, 15) """ A : --- B : --------- R : none """ self.region_sets([['chr1',6,10]], [['chr1',1,15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : --------- B : --- R : --- --- """ self.region_sets([['chr1',1,15]], [['chr1',6,10]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 6) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 15) """ A : ------ B : ------ R : none """ self.region_sets([['chr1',6,15]], [['chr1',6,15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : ---------- ------ B : ---------- ---- R : ------- ------ """ self.region_sets([['chr1',5,30],['chr1',70,85]], [['chr1',20,50],['chr1',100,110]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 5) self.assertEqual(result[0].final, 20) self.assertEqual(result[1].initial, 70) self.assertEqual(result[1].final, 85) """ A : ------ ----- B 
: ------ R : ---- ----- """ self.region_sets([['chr1',20,30],['chr1',35,55]], [['chr1',10,23],['chr1',100,110]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 23) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 35) self.assertEqual(result[1].final, 55) """ A : ch1 --------------------- ch2 ------------------------- B : ch1 ------ ch2 ------ R : ch1 -------- ------- ch2 ------------------- """ self.region_sets([['chr1',0,30000],['chr2',0,35000]], [['chr1',20000,23000],['chr2',31000,35000]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 20000) self.assertEqual(result[1].initial, 23000) self.assertEqual(result[1].final, 30000) self.assertEqual(result[2].initial, 0) self.assertEqual(result[2].final, 31000) """ A : ----------------------------------------------------------- B : --- --------- ---- ---- R : - ---- --------- ----------- -------------- """ self.region_sets([['chr1',5,1000]], [['chr1',10,15],['chr1',30,70],['chr1',120,140],['chr1',200,240]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 5) """ A : ----------------------- ------ ----- ----- ----------- B : --- --------- ---- ---- R : - ---- ------ ---- --- --- ---- --- """ self.region_sets([['chr1',5,100],['chr1',20,40],['chr1',60,80],['chr1',95,150],['chr1',180,220]], [['chr1',10,15],['chr1',30,70],['chr1',120,140],['chr1',200,240]]) result = self.setA.subtract(self.setB) #print(result.sequences) self.assertEqual(len(result), 8) self.assertEqual(result[0].initial, 5) """ A : ----------------------------------------------------------- B : --- --------- ---- ---- R : - ---- --------- ----------- -------------- """ self.region_sets([['chr1',5,1000],['chr2',5,1000],['chr4',5,1000]], [['chr1',10,15],['chr1',30,70],['chr1',120,140],['chr1',200,240], 
['chr2',10,15],['chr2',30,70],['chr2',120,140],['chr2',200,240], ['chr4',10,15],['chr4',30,70],['chr4',120,140],['chr4',200,240]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 15) def test_merge(self): """ A : none R : none """ self.region_sets([], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 0) """ A : ----- ----- R : ----- ----- """ self.region_sets([['chr1',1,10],['chr1',15,25]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) """ A1: ------------ ---- A2: ----- R : ------------ ---- """ self.region_sets([['chr1',1,30],['chr1',11,20],['chr1',40,50]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 40) self.assertEqual(result[1].final, 50) """ A1: -------- ---- A2: --------- R : ------------ ---- """ self.region_sets([['chr1',1,30],['chr1',20,40],['chr1',50,60]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 40) self.assertEqual(result[1].initial, 50) self.assertEqual(result[1].final, 60) """ A : ======= R : ------- """ self.region_sets([['chr1',1,30],['chr1',1,30]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 30) def test_cluster(self): """ Empty sets A : none R : none """ self.region_sets([], []) result = self.setA.cluster(10) self.assertEqual(len(result), 0) """ A : ------- R : ------- """ self.region_sets([['chr1',1,10]], []) result = self.setA.cluster(10) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) """ A : ----- ------ R : 
----------- """ self.region_sets([['chr1',1,10],['chr1',10,20]], []) result = self.setA.cluster(10) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 20) """ A : ----- ----- R1: ----- ----- R2: ------------ """ self.region_sets([['chr1',1,10],['chr1',15,25]], []) result = self.setA.cluster(1) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) result = self.setA.cluster(5) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) result = self.setA.cluster(6) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 25) """ A : ---- ---- ---- ---- ---- R1: --------- ---- ---- ---- R2: --------------- ---- ---- R3: ---------------------- ---- R4: ------------------------------ R5: ------------------------------ """ self.region_sets([['chr1',1,10],['chr1',15,25],['chr1',35,45], ['chr1',60,70],['chr1',90,100]], []) result = self.setA.cluster(6) self.assertEqual(len(result), 4) result = self.setA.cluster(11) self.assertEqual(len(result), 3) result = self.setA.cluster(16) self.assertEqual(len(result), 2) result = self.setA.cluster(21) self.assertEqual(len(result), 1) result = self.setA.cluster(26) self.assertEqual(len(result), 1) def test_flank(self): """ A : ----- R1: --- --- """ self.region_sets([['chr1',60,75]], []) result = self.setA.flank(10) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 50) self.assertEqual(result[0].final, 60) self.assertEqual(result[1].initial, 75) self.assertEqual(result[1].final, 85) """ A : ----- ---- R1: ----- ===== ---- """ self.region_sets([['chr1',60,75],['chr1',90,100]], []) result = self.setA.flank(15) self.assertEqual(len(result), 4) 
self.assertEqual(result[0].initial, 45) self.assertEqual(result[0].final, 60) self.assertEqual(result[1].initial, 75) self.assertEqual(result[1].final, 90) self.assertEqual(result[2].initial, 75) self.assertEqual(result[2].final, 90) self.assertEqual(result[3].initial, 100) self.assertEqual(result[3].final, 115) def test_jaccard(self): """ self --8-- ---10--- -4- y ---10--- ---10--- intersect -5- -4- similarity: ( 5 + 4 )/[(8 + 10 + 4) + (10 +10) - (5 + 4 )] = 9/33 """ self.region_sets([['chr1',50,58],['chr1',70,80],['chr1',90,94]], [['chr1',45,55],['chr1',76,86]]) result = self.setA.jaccard(self.setB) self.assertEqual(result, 9/33) def test_get_genome_data(self): """hg19""" result = GenomicRegionSet("hg19") result.get_genome_data(organism="hg19") self.assertEqual(len(result), 23) """hg19, with Mitochondria chromosome""" result = GenomicRegionSet("hg19") result.get_genome_data(organism="hg19",chrom_M=True) self.assertEqual(len(result), 24) def test_random_regions(self): self.region_sets([['chr1',0,10000],['chr2',0,20000],['chrX',0,30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=False, overlap_input=False) result.sort() #print("-"*80) #print("The result random regions are: ") #for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) #print("Overlaps within result: ",result.within_overlap()) self.region_sets([['chr1',0,10000],['chr2',0,20000],['chrX',0,30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=True, overlap_input=False) result.sort() #print("-"*80) #print("The result random regions are: ") #for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) #print("Overlaps within result: ",result.within_overlap()) self.region_sets([['chr1',0,10000],['chr2',0,20000],['chrX',0,30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=False, overlap_input=True) 
result.sort() #print("-"*80) #print("The result random regions are: ") #for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) #print("Overlaps within result: ",result.within_overlap()) self.region_sets([['chr1',0,10000],['chr2',0,20000],['chrX',0,30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=True, overlap_input=True) result.sort() #print("-"*80) #print("The result random regions are: ") #for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) #print("Overlaps within result: ",result.within_overlap()) self.region_sets([['chr1',0,1000],['chr2',0,2000],['chrX',0,3000]], []) result = self.setA.random_regions(organism="mm9", multiply_factor=100, overlap_result=False, overlap_input=False) result.sort() #print("-"*80) #print("The result random regions are: ") #for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) #print("Overlaps within result: ",result.within_overlap()) self.region_sets([['chr1',0,1000],['chr2',0,2000],['chrX',0,3000]], []) result = self.setA.random_regions(organism="mm9", multiply_factor=100, overlap_result=False, overlap_input=False, chrom_M=True) result.sort()
class TestGenomicRegionSet(unittest.TestCase):

    @staticmethod
    def _build_set(regions):
        """Return a GenomicRegionSet holding one GenomicRegion per entry.

        Each entry is indexed as [chrom, initial, final]; any further
        elements are ignored, exactly as the index-based original did.
        """
        region_set = GenomicRegionSet('for Unit Test')
        for entry in regions:
            region_set.add(GenomicRegion(chrom=entry[0],
                                         initial=entry[1],
                                         final=entry[2]))
        return region_set

    def region_sets(self, listA, listB):
        """Set two GenomicRegionSets as self.setA and self.setB for a test case.

        Keyword arguments:
        listA -- list of [chrom, initial, final] entries used to fill self.setA.
        listB -- list of [chrom, initial, final] entries used to fill self.setB.

        Both attributes are rebuilt from scratch on every call, so each case
        inside a test method starts from fresh sets.
        """
        self.setA = self._build_set(listA)
        self.setB = self._build_set(listB)

    def test_extend(self):
        """
        Two empty sets
        A : none
        R : none
        """
        self.region_sets([], [])
        self.setA.extend(100, 100)
        self.assertEqual(len(self.setA.sequences), 0)
        """
        One region
        A :   -----
        R : ---------
        """
        self.region_sets([['chr1', 5, 10]], [])
        result = self.setA
        result.extend(4, 4)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 14)
        """
        Many region
        A : ----- ------ ----- -----
        R : --------=--------- ------------------
        """
        self.region_sets([['chr1', 5, 10], ['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75]], [])
        result = self.setA
        result.extend(5, 5)
        self.assertEqual(len(result), 4)
        self.assertEqual(result[0].initial, 0)
        self.assertEqual(result[0].final, 15)
        self.assertEqual(result[1].initial, 10)
        self.assertEqual(result[1].final, 25)
        self.assertEqual(result[2].initial, 35)
        self.assertEqual(result[2].final, 55)
        self.assertEqual(result[3].initial, 60)
        self.assertEqual(result[3].final, 80)
        """
        Many region in different chromosome
        A : ----- ------ ----- -----
        R : none
        """
        self.region_sets([['chr1', 5, 10], ['chr2', 15, 20], ['chr3', 40, 50], ['chr4', 65, 75]], [])
        result = self.setA
        result.extend(5, 5)
        self.assertEqual(len(result), 4)
        self.assertEqual(result[0].initial, 0)
        self.assertEqual(result[0].final, 15)
        self.assertEqual(result[0].chrom, 'chr1')
        self.assertEqual(result[1].initial, 10)
        self.assertEqual(result[1].final, 25)
        self.assertEqual(result[1].chrom, 'chr2')
        self.assertEqual(result[2].initial, 35)
self.assertEqual(result[2].final, 55) self.assertEqual(result[2].chrom, 'chr3') self.assertEqual(result[3].initial, 60) self.assertEqual(result[3].final, 80) self.assertEqual(result[3].chrom, 'chr4') """ One region A : ----- R : --------- """ self.region_sets([['chr1', 100, 200]], []) result = self.setA result.extend(10, 10, percentage=True) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 90) self.assertEqual(result[0].final, 210) def test_sort(self): self.region_sets([['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75], ['chr1', 5, 10]], []) self.setA.sort() def test_intersect(self): """ Two empty sets A : none B : none R : none """ self.region_sets([], []) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ One empty set A : ----- B : none R : none """ self.region_sets([['chr1', 5, 10]], []) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ A : none B : ----- R : none """ self.region_sets([], [['chr1', 5, 10]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ No overlapping A : ------ --------- ------- B : ---- ------ ------ R : none """ self.region_sets([['chr1', 1, 5], ['chr1', 11, 20], ['chr1', 33, 38]], [['chr1', 7, 9], ['chr1', 20, 25], ['chr1', 26, 31]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, 
mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ End-to-end attach A : ------ ------ B : ------ R : none """ self.region_sets([['chr1', 1, 5], ['chr1', 11, 20]], [['chr1', 5, 11]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ No length attach A : . . B : . . R : none """ self.region_sets([['chr1', 2, 2], ['chr1', 20, 20]], [['chr1', 5, 5], ['chr1', 20, 20]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Perfect overlapping A : ------ B : ------ R : ------ """ self.region_sets( [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650], ['chr1', 700, 750], ['chr1', 725, 800]], [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650], ['chr1', 700, 750], ['chr1', 725, 800]]) result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP, rm_duplicates=True) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) """ One overlapping region A : ------ B : -------- R1: -- (overlap) R2: ------ (original) R3: (comp_incl) """ self.region_sets([['chr1', 1, 10]], [['chr1', 7, 20]]) result = 
self.setA.intersect(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Two simple overlapping regions A : ------- -------- B : ------------- R1: --- ---- (overlap) R2: ------- -------- (original) R3: (comp_incl) """ self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]], [['chr1', 7, 30]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 30) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Two separately overlapping regions A : ------- -------- B : ----- -------- R1: --- ---- (overlap) R2: ------- -------- (original) R3: (comp_incl) """ self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]], [['chr1', 7, 15], ['chr1', 30, 40]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 30) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, 
mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Many various overlapping (mixed) A : ------------------ -------- --------- B : ---- ------- ------ ---------- R1: -- ------- -- ---- --- (overlap) R2: ------------------ -------- --------- (original) R3: (comp_incl) """ self.region_sets([['chr1', 3, 30], ['chr1', 50, 60], ['chr1', 70, 85]], [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 27, 35], ['chr1', 55, 75]]) result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 3) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 27) self.assertEqual(result[2].final, 30) self.assertEqual(result[3].initial, 55) self.assertEqual(result[3].final, 60) self.assertEqual(result[4].initial, 70) self.assertEqual(result[4].final, 75) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 3) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 50) self.assertEqual(result[1].final, 60) self.assertEqual(result[2].initial, 70) self.assertEqual(result[2].final, 85) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Different chromosomes A : chr1 ------- B : chr2 ------- R : none """ self.region_sets([['chr1', 1, 10]], [['chr2', 1, 10]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Completely included overlapping A : --------------------------- B : ---- ------ ----------- R1: ---- ------ ------ (overlap) R2: --------------------------- (original) R3: (comp_incl) """ self.region_sets([['chr1', 1, 50]], [['chr1', 1, 5], 
['chr1', 10, 19], ['chr1', 45, 60]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ A : ---- ------ ----------- B : --------------------------- R1: ---- ------ ------ (overlap) R2: ---- ------ ----------- (original) R3: ---- ------ (comp_incl) """ self.region_sets([['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]], [['chr1', 1, 50]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 60) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) """ A : -------------- ------- ------ B : ----- ---------------- R1: ----- ------- (overlap) ---- R2: -------------- ------- (original) ------ R3: ------- (comp_incl) """ self.region_sets([['chr1', 1, 50], ['chr1', 20, 40], 
['chr1', 70, 80]], [['chr1', 25, 45], ['chr1', 65, 95]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 25) self.assertEqual(result[0].final, 45) self.assertEqual(result[1].initial, 70) self.assertEqual(result[1].final, 80) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[1].initial, 20) self.assertEqual(result[1].final, 40) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 50) self.assertEqual(result[2].initial, 70) self.assertEqual(result[2].final, 80) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 70) self.assertEqual(result[0].final, 80) def test_closest(self): """ Two empty sets A : none B : none R : none """ self.region_sets([], []) result = self.setA.closest(self.setB) self.assertEqual(len(result), 0) # """ # One empty set # A : ----- # B : none # R : none # """ # self.region_sets([['chr1',5,10]], # []) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # A : none # B : ----- # R : none # """ # self.region_sets([], # [['chr1',5,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # Overlapping within set # A : -----====----- # B : ---- # R : ---- # """ # self.region_sets([['chr1',1,10],['chr1',6,15]], # [['chr1',6,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # """ # A : ---- # B : -----====----- # R : -----====----- # """ # self.region_sets([['chr1',6,10]], # [['chr1',1,10],['chr1',6,15]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # """ # No overlapping # A : ------ --------- ------- # B : ---- ------ ------ # R : ------ # """ # self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]], # [['chr1',7,9],['chr1',20,25],['chr1',26,31]]) # result = self.setA.closest(self.setB) # 
self.assertEqual(len(result), 3) # # self.assertEqual(result[0].initial, 20) # # self.assertEqual(result[0].final, 25) # """ # End-to-end attach # A : ------ ------ # B : ------ # R : ------ # """ # self.region_sets([['chr1',1,5],['chr1',11,20]], # [['chr1',5,11]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # # self.assertEqual(result[0].initial, 5) # # self.assertEqual(result[0].final, 11) # """ # Perfect overlapping # A : ------ # B : ------ # R : ------ # """ # self.region_sets([['chr1',1,10]], # [['chr1',1,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # self.assertEqual(result[0].initial, 1) # self.assertEqual(result[0].final, 10) # """ # One overlapping region # A : ------ # B : -------- # R : -------- # """ # self.region_sets([['chr1',1,10]], # [['chr1',7,20]]) # result = self.setA.closest(self.setB) # self.assertEqual(result[0].initial, 7) # self.assertEqual(result[0].final, 20) # """ # Two simple overlapping regions # A : ------- -------- # B : ------------- # R : ------------- # """ # self.region_sets([['chr1',1,10],['chr1',26,35]], # [['chr1',7,30]]) # result = self.setA.closest(self.setB) # self.assertEqual(result[0].initial, 7) # self.assertEqual(result[0].final, 30) # """ # Two separately overlapping regions # A : ------- -------- # B : ----- -------- # R : none # """ # self.region_sets([['chr1',1,10],['chr1',26,35]], # [['chr1',7,15],['chr1',30,40]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # """ # Many various overlapping (mixed) # A : ------------------ -------- --------- # B : ---- ------- ------ ---------- # R : none # """ # self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]], # [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 4) # """ # Different chromosomes # A : chr1 ------- # B : chr2 ------- # R : chr2 ------- # # """ # 
self.region_sets([['chr1',1,10]], # [['chr2',1,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # Completely included overlapping # A : --------------------------- # B : ---- ------ ----------- # R : ---- ------ ----------- # """ # self.region_sets([['chr1',1,50]], # [['chr1',1,5],['chr1',10,19],['chr1',45,60]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 3) # """ # A : ---- ------ ----------- # B : --------------------------- # R : none # """ # self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]], # [['chr1',1,50]]) # result = self.setA.closest(self.setB) # self.assertEqual(result, False) # """ # A : ---- ------ --- # B : --- ----- # R : --- # """ # self.region_sets([['chr1',1,5],['chr1',27,45],['chr1',85,95]], # [['chr1',15,20],['chr1',55,65]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # self.assertEqual(result[0].initial, 15) # self.assertEqual(result[0].final, 20) def test_remove_duplicates(self): """ A : ===== ----- R : ----- ----- """ self.region_sets([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) """ A : =====--- ----- R : =====--- ----- """ self.region_sets([['chr1', 1, 10], ['chr1', 1, 15], ['chr1', 20, 25]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 1) self.assertEqual(result[1].final, 15) self.assertEqual(result[2].initial, 20) self.assertEqual(result[2].final, 25) """ A : ===== ----- ------ ==== R : ----- ----- ------ ---- """ self.region_sets( [['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 30, 35], ['chr1', 40, 45], ['chr1', 40, 45]], 
[]) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) self.assertEqual(result[2].initial, 30) self.assertEqual(result[2].final, 35) self.assertEqual(result[3].initial, 40) self.assertEqual(result[3].final, 45) def test_window(self): """ A : ------- B : ------[ 99 ] [ 199 ]--- window = 100 R : - only one base overlaps with extending A """ self.region_sets([['chr1', 200, 300]], [['chr1', 1, 101], ['chr1', 499, 550]]) result = self.setA.window(self.setB, adding_length=100) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 100) self.assertEqual(result[0].final, 101) """ A : ------- B : ------[ 99 ] [ 199 ]--- window = 200 R : ------ - left-hand side is covered, and the right-hand side is only one base overlapped """ self.region_sets([['chr1', 200, 300]], [['chr1', 1, 101], ['chr1', 499, 550]]) result = self.setA.window(self.setB, adding_length=200) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) # GenomicRegion.extend will choose 1 rather than 0 self.assertEqual(result[0].final, 101) self.assertEqual(result[1].initial, 499) self.assertEqual(result[1].final, 500) """ A : ---- ---- B : -------- ---- window = 1000 (default) R : ---- ---- """ self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]], [['chr1', 1500, 2500], ['chr1', 5000, 5500]]) result = self.setA.window(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 2000) self.assertEqual(result[0].final, 2500) self.assertEqual(result[1].initial, 5000) self.assertEqual(result[1].final, 5500) """ A : ---- ---- B : -------- ---- window = 2000 R : -------- ---- ---- ---- window = 100 R : none """ self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]], [['chr1', 1500, 2500], ['chr1', 5000, 5500]]) result = self.setA.window(self.setB, adding_length=2000) 
self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1500) self.assertEqual(result[0].final, 2500) self.assertEqual(result[1].initial, 5000) self.assertEqual(result[1].final, 5500) result = self.setA.window(self.setB, adding_length=100) self.assertEqual(len(result), 0) def test_subtract(self): """ A : none B : ------ R : none """ self.region_sets([], [['chr1', 6, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : ------ B : none R : ------ """ self.region_sets([['chr1', 6, 15]], []) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 6) self.assertEqual(result[0].final, 15) """ A : ------ B : ------ R : --- """ self.region_sets([['chr1', 1, 10]], [['chr1', 6, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 6) """ A : ------ B : ------ R : --- """ self.region_sets([['chr1', 6, 15]], [['chr1', 1, 10]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 10) self.assertEqual(result[0].final, 15) """ A : --- B : --------- R : none """ self.region_sets([['chr1', 6, 10]], [['chr1', 1, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : --------- B : --- R : --- --- """ self.region_sets([['chr1', 1, 15]], [['chr1', 6, 10]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 6) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 15) """ A : ------ B : ------ R : none """ self.region_sets([['chr1', 6, 15]], [['chr1', 6, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : ---------- ------ B : ---------- ---- R : ------- ------ """ self.region_sets([['chr1', 5, 30], ['chr1', 70, 85]], [['chr1', 20, 50], ['chr1', 100, 110]]) result = 
self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 5) self.assertEqual(result[0].final, 20) self.assertEqual(result[1].initial, 70) self.assertEqual(result[1].final, 85) """ A : ------ ----- B : ------ R : ---- ----- """ self.region_sets([['chr1', 20, 30], ['chr1', 35, 55]], [['chr1', 10, 23], ['chr1', 100, 110]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 23) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 35) self.assertEqual(result[1].final, 55) """ A : ch1 --------------------- ch2 ------------------------- B : ch1 ------ ch2 ------ R : ch1 -------- ------- ch2 ------------------- """ self.region_sets([['chr1', 0, 30000], ['chr2', 0, 35000]], [['chr1', 20000, 23000], ['chr2', 31000, 35000]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 20000) self.assertEqual(result[1].initial, 23000) self.assertEqual(result[1].final, 30000) self.assertEqual(result[2].initial, 0) self.assertEqual(result[2].final, 31000) """ A : ----------------------------------------------------------- B : --- --------- ---- ---- R : - ---- --------- ----------- -------------- """ self.region_sets([['chr1', 5, 1000]], [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 5) """ A : ----------------------- ------ ----- ----- ----------- B : --- --------- ---- ---- R : - ---- ------ ---- --- --- ---- --- """ self.region_sets([['chr1', 5, 100], ['chr1', 20, 40], ['chr1', 60, 80], ['chr1', 95, 150], ['chr1', 180, 220]], [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240]]) result = self.setA.subtract(self.setB) # print(result.sequences) self.assertEqual(len(result), 8) self.assertEqual(result[0].initial, 5) """ A : 
----------------------------------------------------------- B : --- --------- ---- ---- R : - ---- --------- ----------- -------------- """ self.region_sets( [['chr1', 5, 1000], ['chr2', 5, 1000], ['chr4', 5, 1000]], [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240], ['chr2', 10, 15], ['chr2', 30, 70], ['chr2', 120, 140], ['chr2', 200, 240], ['chr4', 10, 15], ['chr4', 30, 70], ['chr4', 120, 140], ['chr4', 200, 240]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 15) def test_merge(self): """ A : none R : none """ self.region_sets([], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 0) """ A : ----- ----- R : ----- ----- """ self.region_sets([['chr1', 1, 10], ['chr1', 15, 25]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) """ A1: ------------ ---- A2: ----- R : ------------ ---- """ self.region_sets([['chr1', 1, 30], ['chr1', 11, 20], ['chr1', 40, 50]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 40) self.assertEqual(result[1].final, 50) """ A1: -------- ---- A2: --------- R : ------------ ---- """ self.region_sets([['chr1', 1, 30], ['chr1', 20, 40], ['chr1', 50, 60]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 40) self.assertEqual(result[1].initial, 50) self.assertEqual(result[1].final, 60) """ A : ======= R : ------- """ self.region_sets([['chr1', 1, 30], ['chr1', 1, 30]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 30) def test_cluster(self): """ Empty sets A : none R : 
none """ self.region_sets([], []) result = self.setA.cluster(10) self.assertEqual(len(result), 0) """ A : ------- R : ------- """ self.region_sets([['chr1', 1, 10]], []) result = self.setA.cluster(10) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) """ A : ----- ------ R : ----------- """ self.region_sets([['chr1', 1, 10], ['chr1', 10, 20]], []) result = self.setA.cluster(10) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 20) """ A : ----- ----- R1: ----- ----- R2: ------------ """ self.region_sets([['chr1', 1, 10], ['chr1', 15, 25]], []) result = self.setA.cluster(1) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) result = self.setA.cluster(5) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) result = self.setA.cluster(6) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 25) """ A : ---- ---- ---- ---- ---- R1: --------- ---- ---- ---- R2: --------------- ---- ---- R3: ---------------------- ---- R4: ------------------------------ R5: ------------------------------ """ self.region_sets([['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 35, 45], ['chr1', 60, 70], ['chr1', 90, 100]], []) result = self.setA.cluster(6) self.assertEqual(len(result), 4) result = self.setA.cluster(11) self.assertEqual(len(result), 3) result = self.setA.cluster(16) self.assertEqual(len(result), 2) result = self.setA.cluster(21) self.assertEqual(len(result), 1) result = self.setA.cluster(26) self.assertEqual(len(result), 1) def test_flank(self): """ A : ----- R1: --- --- """ self.region_sets([['chr1', 60, 75]], []) result = self.setA.flank(10) 
self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 50) self.assertEqual(result[0].final, 60) self.assertEqual(result[1].initial, 75) self.assertEqual(result[1].final, 85) """ A : ----- ---- R1: ----- ===== ---- """ self.region_sets([['chr1', 60, 75], ['chr1', 90, 100]], []) result = self.setA.flank(15) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 45) self.assertEqual(result[0].final, 60) self.assertEqual(result[1].initial, 75) self.assertEqual(result[1].final, 90) self.assertEqual(result[2].initial, 75) self.assertEqual(result[2].final, 90) self.assertEqual(result[3].initial, 100) self.assertEqual(result[3].final, 115) def test_jaccard(self): """ self --8-- ---10--- -4- y ---10--- ---10--- intersect -5- -4- similarity: ( 5 + 4 )/[(8 + 10 + 4) + (10 +10) - (5 + 4 )] = 9/33 """ self.region_sets( [['chr1', 50, 58], ['chr1', 70, 80], ['chr1', 90, 94]], [['chr1', 45, 55], ['chr1', 76, 86]]) result = self.setA.jaccard(self.setB) self.assertEqual(result, 9 / 33) def test_get_genome_data(self): """hg19""" result = GenomicRegionSet("hg19") result.get_genome_data(organism="hg19") self.assertEqual(len(result), 23) """hg19, with Mitochondria chromosome""" result = GenomicRegionSet("hg19") result.get_genome_data(organism="hg19", chrom_M=True) self.assertEqual(len(result), 24) def test_random_regions(self): self.region_sets( [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=False, overlap_input=False) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=True, overlap_input=False) result.sort() # 
print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=False, overlap_input=True) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=True, overlap_input=True) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 1000], ['chr2', 0, 2000], ['chrX', 0, 3000]], []) result = self.setA.random_regions(organism="mm9", multiply_factor=100, overlap_result=False, overlap_input=False) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 1000], ['chr2', 0, 2000], ['chrX', 0, 3000]], []) result = self.setA.random_regions(organism="mm9", multiply_factor=100, overlap_result=False, overlap_input=False, chrom_M=True) result.sort()
def load_exon_sequence(bed, directory, genome_path):
    """Write the exon sequences of the transcripts in a BED file as FASTA files.

    One ``<name>.fa`` file is written into ``directory`` per transcript name.

    Two layouts are supported, selected by inspecting the FIRST region only:

    * Block layout -- the region's ``data`` field splits into seven
      tab-separated values, of which index 4 is read as the block count,
      index 5 as the block sizes and index 6 as the block starts.  Every
      block becomes one FASTA record tagged ``exon:<k>``.  For regions on the
      "-" strand the sequence is reverse-complemented, exons are numbered
      from the far end and the records are written in reversed order.
    * Plain layout -- anything else.  Regions are sorted by name, consecutive
      regions sharing a name are grouped, and each region of a group becomes
      one FASTA record in that group's file.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    Keyword arguments:
    bed -- Path of the input BED file.
    directory -- Output directory (created if it does not exist).
    genome_path -- Path of the genome FASTA file, opened with pysam.

    Output:
        Each FASTA file represents a transcript and contains all the exons
        within the file.  Nothing is written for an empty BED file.

    Exits the process (``sys.exit``) when a block-layout region has no name.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read(bed)
    regionset.sort()

    # An empty set has nothing to export.  (Previously this path created a
    # stray ".fa" file and then failed on an undefined variable.)
    if not regionset.sequences:
        return

    # Always bind the flag: the old try/except left it undefined whenever the
    # first region had data that did not split into exactly seven fields.
    try:
        blockinfor = len(regionset.sequences[0].data.split("\t")) == 7
    except AttributeError:
        # data is missing (e.g. None), so there is no block information.
        blockinfor = False

    if not blockinfor:
        # Grouping below relies on equal names being adjacent.  Unnamed
        # regions sort first instead of breaking the comparison.
        regionset.sequences.sort(key=lambda g: g.name or "")

    # Both layouts write into `directory`, so create it up front.
    if not os.path.exists(directory):
        os.makedirs(directory)

    def write_group(name, members):
        """Write every region of `members` as one record of <name>.fa."""
        with open(os.path.join(directory, name + ".fa"), "w") as out:
            for g in members:
                fields = ["REGION", g.chrom, str(g.initial), str(g.final)]
                # Tag with the orientation of the region being written (the
                # old code used the outer loop variable, i.e. another
                # region).  A non-string orientation is simply left out.
                try:
                    regiontag = "_".join(fields + [g.orientation])
                except TypeError:
                    regiontag = "_".join(fields)
                print(">" + " ".join([g.name, regiontag]), file=out)
                print(genome.fetch(g.chrom, g.initial, g.final), file=out)

    genome = pysam.Fastafile(genome_path)
    try:
        if blockinfor:
            for gr in regionset:
                if not gr.name:
                    print("Error: For fetching exon sequences, please define the transcript name.")
                    sys.exit()

                data = gr.data.split("\t")
                with open(os.path.join(directory, gr.name + ".fa"), "w") as f:
                    if len(data) != 7:
                        # The (empty) file is still created, as before.
                        print("Warning: The given regions have no block information, please try write_bed_blocks")
                        continue

                    n = int(data[4])
                    # filter(None, ...) drops the empty item produced by a
                    # trailing comma in the BED block lists.
                    blocks = [int(b) for b in filter(None, data[5].split(","))]
                    starts = [int(s) for s in filter(None, data[6].split(","))]
                    reverse = gr.orientation == "-"

                    records = []
                    for i in range(n):
                        start = gr.initial + starts[i]
                        end = start + blocks[i]
                        # On the minus strand the last block is exon 1.
                        ex = "exon:" + str(n - i if reverse else i + 1)
                        header = ">" + " ".join([
                            gr.name, ex,
                            "_".join(["REGION", gr.chrom, str(start), str(end), gr.orientation]),
                        ])
                        # NOTE(review): this layout fetches [start-1, end-1)
                        # while the plain layout fetches [initial, final) --
                        # confirm which coordinate convention is intended.
                        seq = genome.fetch(gr.chrom, start - 1, end - 1)
                        if reverse:
                            seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
                        records.append((header, seq))

                    if reverse:
                        records.reverse()
                    for header, seq in records:
                        print(header, file=f)
                        print(seq, file=f)
        else:
            pre_id = None
            z = None
            for gr in regionset:
                if not gr.name:
                    gr.name = gr.toString()

                if gr.name != pre_id:
                    # A new transcript starts: flush the previous group.
                    if z is not None:
                        write_group(pre_id, z)
                    pre_id = gr.name
                    z = GenomicRegionSet(gr.name)
                z.add(gr)

            # Last TX
            if z is not None:
                write_group(pre_id, z)
    finally:
        # Release the FASTA handle even when writing fails or sys.exit fires.
        genome.close()
def load_exon_sequence(bed, directory, genome_path):
    """Write the exon sequences of the transcripts in a BED file as FASTA files.

    One ``<name>.fa`` file is written into ``directory`` per transcript name.

    Two layouts are supported, selected by inspecting the FIRST region only:

    * Block layout -- the region's ``data`` field splits into seven
      tab-separated values, of which index 4 is read as the block count,
      index 5 as the block sizes and index 6 as the block starts.  Every
      block becomes one FASTA record tagged ``exon:<k>``.  For regions on the
      "-" strand the sequence is reverse-complemented, exons are numbered
      from the far end and the records are written in reversed order.
    * Plain layout -- anything else.  Regions are sorted by name, consecutive
      regions sharing a name are grouped, and each region of a group becomes
      one FASTA record in that group's file.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    Keyword arguments:
    bed -- Path of the input BED file.
    directory -- Output directory (created if it does not exist).
    genome_path -- Path of the genome FASTA file, opened with pysam.

    Output:
        Each FASTA file represents a transcript and contains all the exons
        within the file.  Nothing is written for an empty BED file.

    Exits the process (``sys.exit``) when a block-layout region has no name.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read_bed(bed)
    regionset.sort()

    # An empty set has nothing to export.  (Previously this path created a
    # stray ".fa" file and then failed on an undefined variable.)
    if not regionset.sequences:
        return

    # Always bind the flag: the old try/except left it undefined whenever the
    # first region had data that did not split into exactly seven fields.
    try:
        blockinfor = len(regionset.sequences[0].data.split("\t")) == 7
    except AttributeError:
        # data is missing (e.g. None), so there is no block information.
        blockinfor = False

    if not blockinfor:
        # Grouping below relies on equal names being adjacent.  Unnamed
        # regions sort first instead of breaking the comparison.
        regionset.sequences.sort(key=lambda g: g.name or "")

    # Both layouts write into `directory`, so create it up front.
    if not os.path.exists(directory):
        os.makedirs(directory)

    def write_group(name, members):
        """Write every region of `members` as one record of <name>.fa."""
        with open(os.path.join(directory, name + ".fa"), "w") as out:
            for g in members:
                fields = ["REGION", g.chrom, str(g.initial), str(g.final)]
                # Tag with the orientation of the region being written (the
                # old code used the outer loop variable, i.e. another
                # region).  A non-string orientation is simply left out.
                try:
                    regiontag = "_".join(fields + [g.orientation])
                except TypeError:
                    regiontag = "_".join(fields)
                print(">" + " ".join([g.name, regiontag]), file=out)
                print(genome.fetch(g.chrom, g.initial, g.final), file=out)

    genome = pysam.Fastafile(genome_path)
    try:
        if blockinfor:
            for gr in regionset:
                if not gr.name:
                    print("Error: For fetching exon sequences, please define the transcript name.")
                    sys.exit()

                data = gr.data.split("\t")
                with open(os.path.join(directory, gr.name + ".fa"), "w") as f:
                    if len(data) != 7:
                        # The (empty) file is still created, as before.
                        print("Warning: The given regions have no block information, please try write_bed_blocks")
                        continue

                    n = int(data[4])
                    # filter(None, ...) drops the empty item produced by a
                    # trailing comma in the BED block lists.
                    blocks = [int(b) for b in filter(None, data[5].split(","))]
                    starts = [int(s) for s in filter(None, data[6].split(","))]
                    reverse = gr.orientation == "-"

                    records = []
                    for i in range(n):
                        start = gr.initial + starts[i]
                        end = start + blocks[i]
                        # On the minus strand the last block is exon 1.
                        ex = "exon:" + str(n - i if reverse else i + 1)
                        header = ">" + " ".join([
                            gr.name, ex,
                            "_".join(["REGION", gr.chrom, str(start), str(end), gr.orientation]),
                        ])
                        # NOTE(review): this layout fetches [start-1, end-1)
                        # while the plain layout fetches [initial, final) --
                        # confirm which coordinate convention is intended.
                        seq = genome.fetch(gr.chrom, start - 1, end - 1)
                        if reverse:
                            seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
                        records.append((header, seq))

                    if reverse:
                        records.reverse()
                    for header, seq in records:
                        print(header, file=f)
                        print(seq, file=f)
        else:
            pre_id = None
            z = None
            for gr in regionset:
                if not gr.name:
                    gr.name = gr.toString()

                if gr.name != pre_id:
                    # A new transcript starts: flush the previous group.
                    if z is not None:
                        write_group(pre_id, z)
                    pre_id = gr.name
                    z = GenomicRegionSet(gr.name)
                z.add(gr)

            # Last TX
            if z is not None:
                write_group(pre_id, z)
    finally:
        # Release the FASTA handle even when writing fails or sys.exit fires.
        genome.close()
import unittest
from rgt.GenomicRegionSet import *
from rgt.CoverageSet import CoverageSet

# Input files used by this module.  Both are machine-specific paths; the
# "~" in bedfile is not expanded here and bedfile is not read in this module.
bamfile = "/projects/lncRNA/local/cardio/total_rna/bam/d4_1.bam"
bedfile = "~/rgtdata/hg38/genes_hg38.bed"

# Two 1 kb regions on chr1 shared by every test in this module.
# NOTE(review): "+" / "-" are passed as the fourth positional argument of
# GenomicRegion -- confirm that slot is the orientation and not the name.
regions = GenomicRegionSet("test")
for _region_args in (("chr1", 10000, 11000, "+"),
                     ("chr1", 20000, 21000, "-")):
    regions.add(GenomicRegion(*_region_args))
del _region_args

# Coverage container built once, at import time, over the regions above.
cov = CoverageSet("coverage", regions)


class CoverageSet_Test(unittest.TestCase):
    """Checks CoverageSet against the module-level fixture ``cov``."""

    # NOTE(review): the method name does not start with "test", so the
    # default unittest loader will not collect it -- confirm whether that is
    # deliberate (the BAM path only exists on the author's machine).
    def coverage_from_genomicset(self):
        """Compute coverage from ``bamfile``, print it and compare it to 4."""
        cov.coverage_from_genomicset(bamfile)
        print(cov.coverage)
        self.assertEqual(cov.coverage, 4)
def test_filter_tts(self):
    """Run ``count_tts`` of a loaded interaction set against one chr2 window.

    The interaction set is read from ``sample_txp`` for organism hg19 and the
    query is the single region chr2:74000000-75000000.
    """
    # NOTE(review): the result is never asserted on, so this only checks that
    # the call does not raise; the name says "filter" but count_tts is called.
    interactions = RNADNAInteractionSet(organism="hg19", filename=sample_txp)

    query = GenomicRegionSet("g")
    query.add(GenomicRegion(chrom="chr2", initial=74000000, final=75000000))

    result = interactions.count_tts(query)