示例#1
0
    def get_dbs(self,
                sort=False,
                orientation=None,
                rm_duplicate=False,
                dbd_tag=False):
        """Return a GenomicRegionSet holding the DNA binding site of every entry.

        A new GenomicRegion is built for each entry of ``self.sequences`` from
        the chromosome, coordinates and orientation of its ``dna`` region, with
        the entry's ``score`` stored as ``data``.

        *Keyword arguments:*

            - sort -- If true, sort the resulting set before returning it.
            - orientation -- If truthy, keep only entries whose ``orient`` equals this value.
            - rm_duplicate -- If true, call ``remove_duplicates()`` on the resulting set.
            - dbd_tag -- If true, name each region with the entry's ``rna.str_rna()``;
              otherwise reuse the name of the entry's DNA region.

        *Return:*

            - A GenomicRegionSet named "DNA_binding_sites" (empty when ``len(self)`` is 0).
        """
        binding_sites = GenomicRegionSet(name="DNA_binding_sites")
        if len(self) == 0:
            return binding_sites

        for pair in self.sequences:
            site = pair.dna
            label = pair.rna.str_rna() if dbd_tag else site.name
            region = GenomicRegion(chrom=site.chrom,
                                   initial=site.initial,
                                   final=site.final,
                                   name=label,
                                   orientation=site.orientation,
                                   data=pair.score)
            # A missing (falsy) orientation filter accepts every entry.
            if not orientation or orientation == pair.orient:
                binding_sites.add(region)

        if sort:
            binding_sites.sort()
        if rm_duplicate:
            binding_sites.remove_duplicates()
        return binding_sites
示例#2
0
    def get_biotypes(self, gene_set=None):
        """Build a region set from the annotation entries of feature type "exon".

        NOTE(review): despite the method name, the code below does not group
        anything by biotype; it collects exon entries and names each region
        with its transcript ID. Confirm whether this is the intended behavior.

        *Keyword arguments:*

            - gene_set -- Not used to filter the query; it only selects the shape of the return value.

        *Return:*

            - result_grs -- A GenomicRegionSet named "exon".
            - When gene_set is truthy, the tuple ``(result_grs, None)`` is returned instead.
        """
        exon_annotations = self.get({self.GeneField.FEATURE_TYPE: "exon"})

        exon_regions = GenomicRegionSet("exon")
        for entry in exon_annotations.gene_list:
            # The region object stored in the entry is renamed in place.
            region = entry[self.GeneField.GENOMIC_REGION]
            region.name = entry[self.GeneField.TRANSCRIPT_ID]
            exon_regions.add(region)

        if not gene_set:
            return exon_regions
        # No gene-name mapping is performed here, so the second item is always None.
        return exon_regions, None
示例#3
0
    def get_genes(self, gene_set = None):
        """
        Collects the annotation entries of feature type "gene" into a GenomicRegionSet.
        Each region is named with the gene ID of its entry and the set is merged
        before it is returned.

        Keyword arguments:
        gene_set -- A set of genes to narrow the search. When given, it is passed
                    through self.fix_gene_names and the query is restricted to the
                    mapped genes.

        Return:
        result_grs -- A GenomicRegionSet named "genes".
        unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID.
                              Only returned (as the second item of a tuple) when
                              gene_set is given.
        """
        mapped_genes = None
        unmapped_genes = None
        if gene_set:
            mapped_genes, unmapped_genes = self.fix_gene_names(gene_set)

        # Start from the feature-type filter and narrow by gene ID on request.
        gene_query = {self.GeneField.FEATURE_TYPE: "gene"}
        if gene_set:
            gene_query[self.GeneField.GENE_ID] = mapped_genes
        gene_annotations = self.get(gene_query)

        genes = GenomicRegionSet("genes")
        for entry in gene_annotations.gene_list:
            # The region object stored in the entry is renamed in place.
            region = entry[self.GeneField.GENOMIC_REGION]
            region.name = entry[self.GeneField.GENE_ID]
            genes.add(region)
        genes.merge()

        if gene_set:
            return genes, unmapped_genes
        return genes
示例#4
0
    def merge_rbs(self, rm_duplicate=False, asgene_organism=None, cutoff=0):
        """Merge overlapping RNA binding regions and group their DNA binding regions.

        The set returned by ``self.get_rbs()`` is merged; for every merged RNA
        region, the ``dna`` region of each entry whose ``rna`` region overlaps
        it is collected into a GenomicRegionSet. The result is stored in
        ``self.merged_dict`` (an OrderedDict keyed by merged RNA region);
        nothing is returned.

        *Keyword arguments:*

            - rm_duplicate -- If true, remove duplicates from each collected DNA region set.
            - asgene_organism -- If given, each kept set is replaced by the result of
              ``gene_association(organism=asgene_organism)``; on failure a message is
              printed and the original set is kept.
            - cutoff -- Only merged RNA regions with more than ``cutoff`` DNA regions are kept.
        """
        # Merge RBS
        rna_merged = self.get_rbs()
        rna_merged.merge()
        # A dict: RBS as key, and GenomicRegionSet as its value
        new_dict = OrderedDict()

        for rbsm in rna_merged:
            regions = GenomicRegionSet(rbsm.toString())

            for rd in self:
                if rbsm.overlap(rd.rna):
                    regions.add(rd.dna)
            if rm_duplicate:
                regions.remove_duplicates()
            if len(regions) <= cutoff:
                continue

            new_dict[rbsm] = regions
            if asgene_organism:
                try:
                    new_dict[rbsm] = new_dict[rbsm].gene_association(organism=asgene_organism)
                except Exception:
                    # Catch Exception rather than everything, so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    print("* No annotation file for mapping associated genes.")

        self.merged_dict = new_dict
示例#5
0
    def get_biotypes(self, gene_set=None):
        """Return the "exon" annotation entries as a GenomicRegionSet.

        NOTE(review): nothing here is grouped by biotype, contrary to what the
        method name suggests; the query selects feature type "exon" and every
        region is named with its transcript ID. Confirm the intended behavior.

        *Keyword arguments:*

            - gene_set -- Does not filter the query; only its truthiness is used to pick the return shape.

        *Return:*

            - result_grs -- A GenomicRegionSet named "exon".
            - If gene_set is truthy, ``(result_grs, None)`` is returned instead of the bare set.
        """
        annotations = self.get({self.GeneField.FEATURE_TYPE: "exon"})

        collected = GenomicRegionSet("exon")
        for record in annotations.gene_list:
            # The region held by the record is renamed in place before being added.
            exon = record[self.GeneField.GENOMIC_REGION]
            exon.name = record[self.GeneField.TRANSCRIPT_ID]
            collected.add(exon)

        # No name mapping happens in this method, hence the constant None.
        return (collected, None) if gene_set else collected
示例#6
0
    def get_transcripts(self, gene_set = None):
        """Collect exon annotations into a GenomicRegionSet, naming each region with its transcript ID.

        NOTE(review): the query selects feature type "exon" (not "transcript") and
        the set is named "exon"; presumably transcripts are represented by their
        exons here -- confirm.

        *Keyword arguments:*

            - gene_set -- A set of genes to narrow the search. When given, it is passed
              through ``self.fix_gene_names`` and the query is restricted to the mapped genes.

        *Return:*

            - result_grs -- A GenomicRegionSet containing the exons.
            - unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID.
              Only returned (as the second item of a tuple) when gene_set is given.
        """
        mapped_genes = None
        unmapped_genes = None
        if gene_set:
            mapped_genes, unmapped_genes = self.fix_gene_names(gene_set)

        # Start from the feature-type filter and narrow by gene ID on request.
        exon_query = {self.GeneField.FEATURE_TYPE: "exon"}
        if gene_set:
            exon_query[self.GeneField.GENE_ID] = mapped_genes
        exon_annotations = self.get(exon_query)

        exons = GenomicRegionSet("exon")
        for entry in exon_annotations.gene_list:
            # The region object stored in the entry is renamed in place.
            region = entry[self.GeneField.GENOMIC_REGION]
            region.name = entry[self.GeneField.TRANSCRIPT_ID]
            exons.add(region)

        if gene_set:
            return exons, unmapped_genes
        return exons
示例#7
0
    def merge_rbs(self, rm_duplicate=False, asgene_organism=None, region_set=None, cutoff=0):
        """Merge overlapping RNA binding regions and group their DNA binding regions.

        The set returned by ``self.get_rbs()`` is merged; for every merged RNA
        region, the ``dna`` region of each entry whose ``rna`` region overlaps
        it is collected into a GenomicRegionSet. The result is stored in
        ``self.merged_dict`` (an OrderedDict keyed by merged RNA region);
        nothing is returned.

        *Keyword arguments:*

            - rm_duplicate -- If true, remove duplicates from each collected DNA region set.
            - asgene_organism -- If given, each kept set is replaced by the result of
              ``gene_association(organism=asgene_organism)``; if that call fails the
              original set is kept.
            - region_set -- If given, ``replace_region_name(regions=region_set)`` is called
              on each kept set.
            - cutoff -- Only merged RNA regions with more than ``cutoff`` DNA regions are kept.
        """
        # Merge RBS
        rna_merged = self.get_rbs()
        rna_merged.merge()
        # A dict: RBS as key, and GenomicRegionSet as its value
        new_dict = OrderedDict()

        for rbsm in rna_merged:
            regions = GenomicRegionSet(rbsm.toString())

            for rd in self:
                if rbsm.overlap(rd.rna):
                    regions.add(rd.dna)
            if rm_duplicate:
                regions.remove_duplicates()
            if len(regions) <= cutoff:
                continue

            new_dict[rbsm] = regions
            if asgene_organism:
                try:
                    new_dict[rbsm] = new_dict[rbsm].gene_association(organism=asgene_organism)
                except Exception:
                    # Best effort: keep the unannotated regions when the gene
                    # association fails. Only Exception is caught, so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    pass
            if region_set:
                new_dict[rbsm].replace_region_name(regions=region_set)

        self.merged_dict = new_dict
示例#8
0
    def get_exons(self,
                  start_site=False,
                  end_site=False,
                  gene_set=None,
                  merge=True):
        """Collect exon annotations into a GenomicRegionSet, naming each region with its transcript ID.

        *Keyword arguments:*

            - start_site -- If true, relocate the regions with
              ``relocate_regions("leftend", left_length=1, right_length=1)``.
            - end_site -- If true (and start_site is false), relocate the regions with
              ``relocate_regions("rightend", left_length=1, right_length=1)``.
            - gene_set -- A set of genes to narrow the search. When given, it is passed
              through ``self.fix_gene_names`` and the query is restricted to the mapped genes.
            - merge -- If true (default), merge the resulting set before returning it.

        *Return:*

            - result_grs -- A GenomicRegionSet containing the exons.
            - unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID.
              Only returned (as the second item of a tuple) when gene_set is given.
        """
        mapped_genes = None
        unmapped_genes = None
        if gene_set:
            mapped_genes, unmapped_genes = self.fix_gene_names(gene_set)

        # Start from the feature-type filter and narrow by gene ID on request.
        exon_query = {self.GeneField.FEATURE_TYPE: "exon"}
        if gene_set:
            exon_query[self.GeneField.GENE_ID] = mapped_genes
        exon_annotations = self.get(exon_query)

        exons = GenomicRegionSet("exon")
        for entry in exon_annotations.gene_list:
            # The region object stored in the entry is renamed in place.
            region = entry[self.GeneField.GENOMIC_REGION]
            region.name = entry[self.GeneField.TRANSCRIPT_ID]
            exons.add(region)

        # start_site takes precedence over end_site when both are set.
        if start_site:
            exons.relocate_regions("leftend", left_length=1, right_length=1)
        elif end_site:
            exons.relocate_regions("rightend", left_length=1, right_length=1)
        if merge:
            exons.merge()

        if gene_set:
            return exons, unmapped_genes
        return exons
示例#9
0
    def get_promoters(self,
                      promoterLength=1000,
                      gene_set=None,
                      unmaplist=False):
        """
        Builds a GenomicRegionSet of promoter regions from the "transcript" annotation entries.
        Each region is named with the gene ID of its entry. A promoter also includes the
        coordinate of the 5' base pair, therefore its actual length is promoterLength+1.

        Keyword arguments:
        promoterLength -- The length of the promoter region.
        gene_set -- A set of genes to narrow the search. When given, it is passed
                    through self.fix_gene_names and the query is restricted to the
                    mapped genes.
        unmaplist -- If true, also return the unmapped gene list (default False).

        Return:
        result_grs -- A GenomicRegionSet named "promoters".
        unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID
                              (None when no gene_set was given). Only returned, as the
                              second item of a tuple, when unmaplist is true.
        """
        mapped_genes = None
        unmapped_genes = None
        if gene_set:
            mapped_genes, unmapped_genes = self.fix_gene_names(gene_set)

        # Promoters are derived from transcript entries, optionally narrowed by gene ID.
        transcript_query = {self.GeneField.FEATURE_TYPE: "transcript"}
        if gene_set:
            transcript_query[self.GeneField.GENE_ID] = mapped_genes
        transcript_annotations = self.get(transcript_query)

        promoters = GenomicRegionSet("promoters")
        for entry in transcript_annotations.gene_list:
            # The region object stored in the entry is resized and renamed in place.
            region = entry[self.GeneField.GENOMIC_REGION]
            if region.orientation != "+":
                # Any other strand value: anchor on the last base and extend
                # towards higher coordinates.
                region.initial = region.final - 1
                region.final = region.initial + promoterLength + 1
            else:
                # Plus strand: anchor on the first base and extend towards
                # lower coordinates.
                region.final = region.initial + 1
                region.initial = region.initial - promoterLength

            region.name = entry[self.GeneField.GENE_ID]
            promoters.add(region)

        if unmaplist:
            return promoters, unmapped_genes
        return promoters
示例#10
0
 def get_dbs(self, sort=False, orientation=None, rm_duplicate=False):
     """Return a GenomicRegionSet with the DNA binding site of each entry.

     sort -> If true, sort the resulting set before returning it
     orientation -> If truthy, keep only entries whose ``orient`` equals this value
     rm_duplicate -> If true, call ``remove_duplicates()`` on the resulting set
     """
     binding_sites = GenomicRegionSet(name="DNA_binding_sites")
     for pair in self.sequences:
         # A missing (falsy) orientation filter accepts every entry.
         if not orientation or orientation == pair.orient:
             binding_sites.add(pair.dna)
     if sort:
         binding_sites.sort()
     if rm_duplicate:
         binding_sites.remove_duplicates()
     return binding_sites
示例#11
0
 def get_dbs(self, sort=False, orientation=None, rm_duplicate=False):
     """Gather the ``dna`` region of every entry in ``self.sequences`` into one set.

     sort -> When true, the set is sorted before it is returned
     orientation -> When truthy, entries whose ``orient`` differs from it are skipped
     rm_duplicate -> When true, ``remove_duplicates()`` is called on the set
     """
     collected = GenomicRegionSet(name="DNA_binding_sites")
     for entry in self.sequences:
         if orientation and orientation != entry.orient:
             # Filter active and this entry is on another orientation.
             continue
         collected.add(entry.dna)
     if sort:
         collected.sort()
     if rm_duplicate:
         collected.remove_duplicates()
     return collected
示例#12
0
    def get_promoters(self, promoterLength=1000, gene_set=None, unmaplist=False, variants=False):
        """
        Builds a GenomicRegionSet of promoter regions from gene or transcript annotation entries.
        Each region is named with the gene ID of its entry.
        A promoter also includes the coordinate of the 5' base pair, therefore its actual
        length is promoterLength+1.

        *Keyword arguments:*

            - promoterLength -- The length of the promoter region.
            - gene_set -- A set of genes to narrow the search. When given, it is passed
              through ``self.fix_gene_names`` and the query is restricted to the mapped genes.
            - unmaplist -- If True then also return the unmappable genes list (default = False).
            - variants -- If True, query "transcript" entries instead of "gene" entries (default = False).

        *Return:*

            - result_grs -- A GenomicRegionSet named "promoters".
            - unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID
              (None when no gene_set was given). Only returned, as the second item of a tuple,
              when unmaplist is true.
        """
        mapped_genes = None
        unmapped_genes = None
        if gene_set:
            mapped_genes, unmapped_genes = self.fix_gene_names(gene_set)

        # Feature type to query, optionally narrowed by gene ID.
        feature = "transcript" if variants else "gene"
        promoter_query = {self.GeneField.FEATURE_TYPE: feature}
        if gene_set:
            promoter_query[self.GeneField.GENE_ID] = mapped_genes
        annotations = self.get(promoter_query)

        promoters = GenomicRegionSet("promoters")
        for entry in annotations.gene_list:
            # The region object stored in the entry is resized and renamed in place.
            region = entry[self.GeneField.GENOMIC_REGION]
            if region.orientation != "+":
                # Any other strand value: anchor on the last base and extend
                # towards higher coordinates.
                region.initial = region.final - 1
                region.final = region.initial + promoterLength + 1
            else:
                # Plus strand: anchor on the first base and extend towards
                # lower coordinates.
                region.final = region.initial + 1
                region.initial = region.initial - promoterLength

            region.name = entry[self.GeneField.GENE_ID]
            promoters.add(region)

        if unmaplist:
            return promoters, unmapped_genes
        return promoters
示例#13
0
    def get_tts(self, gene_set=None):
        """Gets TTS (Transcription termination site) of genes. It returns a GenomicRegionSet with such TTS. The ID of each gene will be put in the NAME field of each GenomicRegion.

        Each TTS is the one-base region at the 3' end of the gene: the last base
        (``final - 1`` to ``final``) for genes on the "+" strand and the first base
        (``initial`` to ``initial + 1``) otherwise.

        *Keyword arguments:*

            - gene_set -- A set of genes to narrow the search. When given, it is passed
              through ``self.fix_gene_names`` and the query is restricted to the mapped genes.

        *Return:*

            - result_grs -- A GenomicRegionSet containing TTS (merged before being returned).
            - unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID.
              Only returned (as the second item of a tuple) when gene_set is given.
        """

        # Fetching gene names
        mapped_gene_list = None
        unmapped_gene_list = None
        if gene_set:
            mapped_gene_list, unmapped_gene_list = self.fix_gene_names(
                gene_set)

        # Fetching genes
        query_dictionary = {self.GeneField.FEATURE_TYPE: "gene"}
        if gene_set:
            query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list
        query_annset = self.get(query_dictionary)

        # Creating GenomicRegionSet
        result_grs = GenomicRegionSet("TTS")
        for e in query_annset.gene_list:
            # The region object stored in the entry is resized and renamed in place.
            gr = e[self.GeneField.GENOMIC_REGION]
            # Keep only the 3' end. The previous code kept the 5' end on both
            # strands, which is the transcription start site, not the TTS.
            if gr.orientation == "+":
                gr.initial = gr.final - 1
            else:
                gr.final = gr.initial + 1
            gr.name = e[self.GeneField.GENE_ID]
            result_grs.add(gr)
        result_grs.merge()
        if gene_set:
            return result_grs, unmapped_gene_list
        else:
            return result_grs
示例#14
0
    def get_exons(self, start_site=False, end_site=False, gene_set=None, merge=True):
        """Return the exon annotations as a GenomicRegionSet whose regions are named with their transcript ID.

        *Keyword arguments:*

            - start_site -- If true, relocate the regions with
              ``relocate_regions("leftend", left_length=1, right_length=1)``.
            - end_site -- If true (and start_site is false), relocate the regions with
              ``relocate_regions("rightend", left_length=1, right_length=1)``.
            - gene_set -- A set of genes to narrow the search. When given, it is passed
              through ``self.fix_gene_names`` and the query is restricted to the mapped genes.
            - merge -- If true (default), merge the resulting set before returning it.

        *Return:*

            - result_grs -- A GenomicRegionSet containing the exons.
            - unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID.
              Only returned (as the second item of a tuple) when gene_set is given.
        """
        mapped = None
        unmapped = None
        if gene_set:
            mapped, unmapped = self.fix_gene_names(gene_set)

        # Feature-type filter first; the gene-ID restriction is added on request.
        filters = {self.GeneField.FEATURE_TYPE: "exon"}
        if gene_set:
            filters[self.GeneField.GENE_ID] = mapped
        matches = self.get(filters)

        exon_set = GenomicRegionSet("exon")
        for record in matches.gene_list:
            # The region held by the record is renamed in place before being added.
            exon = record[self.GeneField.GENOMIC_REGION]
            exon.name = record[self.GeneField.TRANSCRIPT_ID]
            exon_set.add(exon)

        # start_site wins when both flags are set.
        if start_site:
            exon_set.relocate_regions("leftend", left_length=1, right_length=1)
        elif end_site:
            exon_set.relocate_regions("rightend", left_length=1, right_length=1)
        if merge:
            exon_set.merge()

        if gene_set:
            return exon_set, unmapped
        return exon_set
示例#15
0
    def get_tts(self, gene_set=None):
        """Gets TTS (Transcription termination site) of genes. It returns a GenomicRegionSet with such TTS. The ID of each gene will be put in the NAME field of each GenomicRegion.

        Each TTS is the one-base region at the 3' end of the gene: the last base
        (``final - 1`` to ``final``) for genes on the "+" strand and the first base
        (``initial`` to ``initial + 1``) otherwise.

        *Keyword arguments:*

            - gene_set -- A set of genes to narrow the search. When given, it is passed
              through ``self.fix_gene_names`` and the query is restricted to the mapped genes.

        *Return:*

            - result_grs -- A GenomicRegionSet containing TTS (merged before being returned).
            - unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID.
              Only returned (as the second item of a tuple) when gene_set is given.
        """

        # Fetching gene names
        mapped_gene_list = None
        unmapped_gene_list = None
        if gene_set:
            mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)

        # Fetching genes
        query_dictionary = {self.GeneField.FEATURE_TYPE: "gene"}
        if gene_set:
            query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list
        query_annset = self.get(query_dictionary)

        # Creating GenomicRegionSet
        result_grs = GenomicRegionSet("TTS")
        for e in query_annset.gene_list:
            # The region object stored in the entry is resized and renamed in place.
            gr = e[self.GeneField.GENOMIC_REGION]
            # Keep only the 3' end. The previous code kept the 5' end on both
            # strands, which is the transcription start site, not the TTS.
            if gr.orientation == "+":
                gr.initial = gr.final - 1
            else:
                gr.final = gr.initial + 1
            gr.name = e[self.GeneField.GENE_ID]
            result_grs.add(gr)
        result_grs.merge()
        if gene_set:
            return result_grs, unmapped_gene_list
        else:
            return result_grs
示例#16
0
    def get_genes(self, gene_set=None):
        """
        Returns the annotation entries of feature type "gene" as a GenomicRegionSet.
        The gene ID of each entry is put in the NAME field of its GenomicRegion and
        the set is merged before it is returned.

        Keyword arguments:
        gene_set -- A set of genes to narrow the search. When given, it is passed
                    through self.fix_gene_names and the query is restricted to the
                    mapped genes.

        Return:
        result_grs -- A GenomicRegionSet named "genes".
        unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID.
                              Only returned (as the second item of a tuple) when
                              gene_set is given.
        """
        mapped = None
        unmapped = None
        if gene_set:
            mapped, unmapped = self.fix_gene_names(gene_set)

        # Feature-type filter first; the gene-ID restriction is added on request.
        filters = {self.GeneField.FEATURE_TYPE: "gene"}
        if gene_set:
            filters[self.GeneField.GENE_ID] = mapped
        matches = self.get(filters)

        gene_regions = GenomicRegionSet("genes")
        for record in matches.gene_list:
            # The region held by the record is renamed in place before being added.
            gene = record[self.GeneField.GENOMIC_REGION]
            gene.name = record[self.GeneField.GENE_ID]
            gene_regions.add(gene)
        gene_regions.merge()

        if gene_set:
            return gene_regions, unmapped
        return gene_regions
示例#17
0
    def get_dbs(self, sort=False, orientation=None, rm_duplicate=False, dbd_tag=False):
        """Return a GenomicRegionSet with one new region per DNA binding site.

        Every entry of ``self.sequences`` yields a GenomicRegion copied from the
        chromosome, coordinates and orientation of its ``dna`` region, carrying
        the entry's ``score`` as ``data``.

        *Keyword arguments:*

            - sort -- When true, the set is sorted before it is returned.
            - orientation -- When truthy, entries whose ``orient`` differs from it are skipped.
            - rm_duplicate -- When true, ``remove_duplicates()`` is called on the set.
            - dbd_tag -- When true, regions are named with the entry's ``rna.str_rna()``
              instead of the name of the entry's DNA region.

        *Return:*

            - A GenomicRegionSet named "DNA_binding_sites" (empty when ``len(self)`` is 0).
        """
        collected = GenomicRegionSet(name="DNA_binding_sites")
        if len(self) == 0:
            return collected

        for entry in self.sequences:
            dna = entry.dna
            if dbd_tag:
                region_name = entry.rna.str_rna()
            else:
                region_name = dna.name
            # The region is built before the orientation filter is applied.
            candidate = GenomicRegion(chrom=dna.chrom, initial=dna.initial, final=dna.final,
                                      name=region_name, orientation=dna.orientation,
                                      data=entry.score)
            if orientation and orientation != entry.orient:
                continue
            collected.add(candidate)

        if sort:
            collected.sort()
        if rm_duplicate:
            collected.remove_duplicates()
        return collected
示例#18
0
Updated on 22 May 2014 by Joseph
"""

#############################  Parameters    ##############################
# Command-line options: organism, length of each sequence, number of random
# regions and an optional BED file used as a filter.
parser = argparse.ArgumentParser(description='Return the random sequences according to the given parameters.')
parser.add_argument('-o','-organism', default= "hg19", help='Define the organism. Default: hg19')
parser.add_argument('-l','-length', type=int, help='Define the length of each sequence.')
parser.add_argument('-n','-number', type=int, help='Define the number of random regions.')
parser.add_argument('-f','-filter', default=None, help='Given the path to the BED file as the filter.')

args = parser.parse_args()

# Setup the entries: a single template region spanning 0..length.
region = GenomicRegion("sample", initial=0, final=args.l)
template = GenomicRegionSet("tamplate")
template.add(region)

# NOTE(review): bed_dir is not defined in the visible part of this script --
# confirm it is set earlier in the file.
if not os.path.exists(bed_dir):
    os.makedirs(bed_dir)

# Random region
# Use the organism chosen on the command line. It was hard-coded to "hg19",
# which ignored -o and could disagree with the genome data loaded below.
result = template.random_regions(organism=args.o, total_size=args.n, multiply_factor=0, overlap_result=True, overlap_input=True, chrom_X=False, chrom_M=False, filter_path=args.f)
result.write(os.path.join(bed_dir, "00total.bed"))
chrom = GenomicRegionSet("chrom")
chrom.get_genome_data(organism=args.o, chrom_X=False, chrom_M=False)

# Names of the chromosomes loaded for the chosen organism.
chrom_list = []
for r in chrom.sequences:
    chrom_list.append(r.chrom)

print("Settings:\n\tAllowing overlapping within random regions.")
示例#19
0
    def get_promoters(self,
                      promoter_length=1000,
                      tss=0,
                      gene_set=None,
                      unmaplist=False,
                      variants=False,
                      gene_id=False,
                      regiondata=False):
        """
        Gets promoters of genes given a specific promoter length. It returns a GenomicRegionSet with such promoters.
        Each promoter includes also the coordinate of the 5' base pair plus ``tss`` further
        bases past it, therefore each promoter actual length is promoter_length+1+tss.

        *Keyword arguments:*

            - promoter_length -- The length of the promoter region.
            - tss -- Number of extra bases to include past the 5' base pair, on the
              side opposite to the promoter (default = 0).
            - gene_set -- A set of genes to narrow the search. When given, it is passed
              through ``self.fix_gene_names`` and the query is restricted to the mapped genes.
            - unmaplist -- If True then also return the unmappable genes list (default = False).
            - variants -- If True, query "transcript" entries instead of "gene" entries (default = False).
            - gene_id -- Without a gene_set: if True name regions with the gene ID,
              otherwise with the entry's gene names field (default = False).
            - regiondata -- If True and a gene_set is given, set each region's ``data`` to
              ``gene_set.values[<region name>]`` (default = False).

        *Return:*

            - result_grs -- A GenomicRegionSet containing the promoters. With a gene_set, each
              region is named through the mapping returned by ``self.fix_gene_names`` (falling
              back to the gene ID when the ID is not in the mapping).
            - unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID
              (None when no gene_set was given). Only returned, as the second item of a tuple,
              when unmaplist is true.
        """

        # Fetching gene names
        mapped_gene_list = None
        unmapped_gene_list = None

        if gene_set:
            mapped_gene_list, unmapped_gene_list, mapping_dict = self.fix_gene_names(
                gene_set, output_dict=True)

        # Fetching genes
        target = "transcript" if variants else "gene"
        query_dictionary = {self.GeneField.FEATURE_TYPE: target}
        if gene_set:
            query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list

        query_annset = self.get(query_dictionary)

        # Creating GenomicRegionSet
        result_grs = GenomicRegionSet("promoters")
        for e in query_annset.gene_list:
            # The region object stored in the entry is resized and renamed in place.
            gr = e[self.GeneField.GENOMIC_REGION]
            # Both ends are derived from the original 5' coordinate. The previous
            # code computed the minus-strand ``final`` from the already shifted
            # ``initial``, so a non-zero ``tss`` shortened the promoter on that
            # strand only. Results are unchanged for tss == 0.
            if gr.orientation == "+":
                anchor = gr.initial
                gr.final = anchor + 1 + tss
                gr.initial = anchor - promoter_length
            else:
                anchor = gr.final
                gr.initial = anchor - 1 - tss
                gr.final = anchor + promoter_length

            if gene_set:
                try:
                    gr.name = mapping_dict[e[self.GeneField.GENE_ID]]
                except KeyError:
                    # Gene ID missing from the mapping: fall back to the ID itself.
                    gr.name = e[self.GeneField.GENE_ID]
            elif gene_id:
                gr.name = e[self.GeneField.GENE_ID]
            else:
                gr.name = e[self.GeneField.GENE_NAMES]

            if gene_set and regiondata:
                gr.data = gene_set.values[gr.name]
            result_grs.add(gr)

        if unmaplist:
            return result_grs, unmapped_gene_list
        else:
            return result_grs
示例#20
0
class TestGenomicRegionSet(unittest.TestCase):
    
    def region_sets(self,listA,listB):
        """Rebuild the ``self.setA`` / ``self.setB`` fixtures for one test case.

        Every entry of ``listA`` and ``listB`` is read as
        ``[chrom, initial, final]`` and added to the matching set as one
        GenomicRegion.  Both sets are created fresh on every call.
        """
        self.setA = GenomicRegionSet('for Unit Test')
        for entry in listA:
            self.setA.add(GenomicRegion(chrom=entry[0], initial=entry[1], final=entry[2]))

        self.setB = GenomicRegionSet('for Unit Test')
        for entry in listB:
            self.setB.add(GenomicRegion(chrom=entry[0], initial=entry[1], final=entry[2]))
    
    def test_extend(self):
        """Check GenomicRegionSet.extend() against hand-computed bounds.

        Scenarios: an empty set, a single region, several regions on one
        chromosome, regions on four different chromosomes, and a call with
        ``percentage=True``.  The set is extended in place and then inspected.
        """
        def check(regions, size, *bounds):
            # Size first, then (initial, final) of the leading regions, in order.
            self.assertEqual(len(regions), size)
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(regions[idx].initial, start)
                self.assertEqual(regions[idx].final, end)

        # Empty set
        # A : none
        # R : none
        self.region_sets([], [])
        self.setA.extend(100, 100)
        self.assertEqual(len(self.setA.sequences), 0)

        # One region
        # A :   -----
        # R : ---------
        self.region_sets([['chr1', 5, 10]], [])
        extended = self.setA
        extended.extend(4, 4)
        check(extended, 1, (1, 14))

        # Many regions on one chromosome
        # A :   -----   ------         -----    -----
        # R : --------=---------     ------------------
        self.region_sets(
            [['chr1', 5, 10], ['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75]],
            [])
        extended = self.setA
        extended.extend(5, 5)
        check(extended, 4, (0, 15), (10, 25), (35, 55), (60, 80))

        # Many regions on different chromosomes
        # A :   -----   ------         -----    -----
        # R : each region grows by 5 on both sides and keeps its chromosome
        self.region_sets(
            [['chr1', 5, 10], ['chr2', 15, 20], ['chr3', 40, 50], ['chr4', 65, 75]],
            [])
        extended = self.setA
        extended.extend(5, 5)
        self.assertEqual(len(extended), 4)
        per_chrom = [('chr1', 0, 15), ('chr2', 10, 25), ('chr3', 35, 55), ('chr4', 60, 80)]
        for idx, (chrom, start, end) in enumerate(per_chrom):
            self.assertEqual(extended[idx].initial, start)
            self.assertEqual(extended[idx].final, end)
            self.assertEqual(extended[idx].chrom, chrom)

        # One region, percentage mode
        # A :   -----
        # R : ---------
        self.region_sets([['chr1', 100, 200]], [])
        extended = self.setA
        extended.extend(10, 10, percentage=True)
        check(extended, 1, (90, 210))
        
    def test_sort(self):
        """sort() must order the regions of one chromosome by position.

        A (as added) : 15-20   40-50   65-75   5-10
        R (sorted)   : 5-10    15-20   40-50   65-75
        """
        self.region_sets([['chr1',15,20],['chr1',40,50],['chr1',65,75],['chr1',5,10]],
                         [])
        self.setA.sort()
        result = self.setA
        # Calling sort() without inspecting the outcome would only prove that
        # it does not raise; pin the resulting order so a broken comparison
        # is actually detected.
        self.assertEqual(len(result), 4)
        expected = [(5, 10), (15, 20), (40, 50), (65, 75)]
        for idx, (start, end) in enumerate(expected):
            self.assertEqual(result[idx].initial, start)
            self.assertEqual(result[idx].final, end)
    
    def test_intersect(self):
        """Check intersect() with the default call, ORIGINAL and COMP_INCL.

        Every scenario rebuilds setA/setB and verifies the number of returned
        regions and, where listed, their (initial, final) bounds.  In the
        diagrams R1/R2/R3 are the results expected for the overlap /
        original / comp_incl variants; a single R applies to all of them.
        """
        def run(**options):
            # Intersect the current fixtures; no options means the default call.
            return self.setA.intersect(self.setB, **options)

        def check(regions, size, *bounds):
            # Size first, then (initial, final) of the leading regions, in order.
            self.assertEqual(len(regions), size)
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(regions[idx].initial, start)
                self.assertEqual(regions[idx].final, end)

        def check_nothing_in_any_mode():
            # Default call, ORIGINAL and COMP_INCL must all come back empty.
            check(run(), 0)
            check(run(mode=OverlapType.ORIGINAL), 0)
            check(run(mode=OverlapType.COMP_INCL), 0)

        # Two empty sets
        # A : none
        # B : none
        # R : none
        self.region_sets([], [])
        check_nothing_in_any_mode()

        # One empty set
        # A :   -----
        # B : none
        # R : none
        self.region_sets([['chr1', 5, 10]], [])
        check_nothing_in_any_mode()

        # A : none
        # B :   -----
        # R : none
        self.region_sets([], [['chr1', 5, 10]])
        check_nothing_in_any_mode()

        # No overlapping
        # A : ------      ---------               -------
        # B :        ----          ------  ------
        # R : none
        self.region_sets([['chr1', 1, 5], ['chr1', 11, 20], ['chr1', 33, 38]],
                         [['chr1', 7, 9], ['chr1', 20, 25], ['chr1', 26, 31]])
        check_nothing_in_any_mode()

        # End-to-end attach
        # A : ------      ------
        # B :       ------
        # R : none
        self.region_sets([['chr1', 1, 5], ['chr1', 11, 20]],
                         [['chr1', 5, 11]])
        check_nothing_in_any_mode()

        # No length attach
        # A : .      .
        # B :    .   .
        # R : none
        self.region_sets([['chr1', 2, 2], ['chr1', 20, 20]],
                         [['chr1', 5, 5], ['chr1', 20, 20]])
        check_nothing_in_any_mode()

        # Perfect overlapping
        # A : ------
        # B : ------
        # R : ------
        identical = [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650],
                     ['chr1', 700, 750], ['chr1', 725, 800]]
        self.region_sets(identical, identical)
        check(run(mode=OverlapType.OVERLAP, rm_duplicates=True), 4, (1, 10))
        check(run(mode=OverlapType.ORIGINAL), 5, (1, 10))
        check(run(mode=OverlapType.COMP_INCL), 5, (1, 10))

        # One overlapping region
        # A : ------
        # B :     --------
        # R1:     --       (overlap)
        # R2: ------       (original)
        # R3:              (comp_incl)
        self.region_sets([['chr1', 1, 10]],
                         [['chr1', 7, 20]])
        check(run(), 1, (7, 10))
        check(run(mode=OverlapType.ORIGINAL), 1, (1, 10))
        check(run(mode=OverlapType.COMP_INCL), 0)

        # Two simple overlapping regions
        # A : -------      --------
        # B :     -------------
        # R1:     ---      ----     (overlap)
        # R2: -------      -------- (original)
        # R3:                       (comp_incl)
        self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]],
                         [['chr1', 7, 30]])
        check(run(), 2, (7, 10), (26, 30))
        check(run(mode=OverlapType.ORIGINAL), 2, (1, 10), (26, 35))
        check(run(mode=OverlapType.COMP_INCL), 0)

        # Two separately overlapping regions
        # A : -------      --------
        # B :     -----        --------
        # R1:     ---          ----     (overlap)
        # R2: -------      --------     (original)
        # R3:                           (comp_incl)
        self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]],
                         [['chr1', 7, 15], ['chr1', 30, 40]])
        check(run(), 2, (7, 10), (30, 35))
        check(run(mode=OverlapType.ORIGINAL), 2, (1, 10), (26, 35))
        check(run(mode=OverlapType.COMP_INCL), 0)

        # Many various overlapping (mixed)
        # A :   ------------------            --------   ---------
        # B : ----   -------    ------            ----------
        # R1:   --   -------    --                ----   ---       (overlap)
        # R2:   ------------------            --------   --------- (original)
        # R3:                                                      (comp_incl)
        self.region_sets([['chr1', 3, 30], ['chr1', 50, 60], ['chr1', 70, 85]],
                         [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 27, 35], ['chr1', 55, 75]])
        check(run(mode=OverlapType.OVERLAP), 5,
              (3, 5), (10, 19), (27, 30), (55, 60), (70, 75))
        check(run(mode=OverlapType.ORIGINAL), 3, (3, 30), (50, 60), (70, 85))
        check(run(mode=OverlapType.COMP_INCL), 0)

        # Different chromosomes
        # A : chr1  -------
        # B : chr2  -------
        # R : none
        self.region_sets([['chr1', 1, 10]],
                         [['chr2', 1, 10]])
        check_nothing_in_any_mode()

        # Completely included overlapping
        # A : ---------------------------
        # B : ----    ------       -----------
        # R1: ----    ------       ------      (overlap)
        # R2: ---------------------------      (original)
        # R3:                                  (comp_incl)
        self.region_sets([['chr1', 1, 50]],
                         [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]])
        check(run(), 3, (1, 5), (10, 19), (45, 50))
        check(run(mode=OverlapType.ORIGINAL), 1, (1, 50))
        check(run(mode=OverlapType.COMP_INCL), 0)

        # A : ----    ------       -----------
        # B : ---------------------------
        # R1: ----    ------       ------      (overlap)
        # R2: ----    ------       ----------- (original)
        # R3: ----    ------                   (comp_incl)
        self.region_sets([['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]],
                         [['chr1', 1, 50]])
        check(run(), 3, (1, 5), (10, 19), (45, 50))
        check(run(mode=OverlapType.ORIGINAL), 3, (1, 5), (10, 19), (45, 60))
        check(run(mode=OverlapType.COMP_INCL), 2, (1, 5), (10, 19))

        # A : --------------         -------
        #         ------
        # B :       -----          ----------------
        # R1:       -----            -------      (overlap)
        #           ----
        # R2: --------------         -------      (original)
        #         ------
        # R3:                        -------      (comp_incl)
        self.region_sets([['chr1', 1, 50], ['chr1', 20, 40], ['chr1', 70, 80]],
                         [['chr1', 25, 45], ['chr1', 65, 95]])
        check(run(), 2, (25, 45), (70, 80))
        check(run(mode=OverlapType.ORIGINAL), 3, (1, 50), (20, 40), (70, 80))
        check(run(mode=OverlapType.COMP_INCL), 1, (70, 80))

    def test_closest(self):
        """closest() between two empty sets must return an empty result.

        A : none
        B : none
        R : none
        """
        self.region_sets([], [])
        nearest = self.setA.closest(self.setB)
        self.assertEqual(len(nearest), 0)
        # """
        # One empty set
        # A :   -----
        # B : none
        # R : none
        # """
        # self.region_sets([['chr1',5,10]],
        #                  [])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # A : none
        # B :   -----
        # R : none
        # """
        # self.region_sets([],
        #                  [['chr1',5,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # Overlapping within set
        # A : -----====-----
        # B :      ----
        # R :      ----
        # """
        # self.region_sets([['chr1',1,10],['chr1',6,15]],
        #                  [['chr1',6,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # """
        # A :      ----
        # B : -----====-----
        # R : -----====-----
        # """
        # self.region_sets([['chr1',6,10]],
        #                  [['chr1',1,10],['chr1',6,15]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # """
        # No overlapping
        # A : ------      ---------               -------
        # B :        ----          ------  ------
        # R :                      ------
        # """
        # self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]],
        #                  [['chr1',7,9],['chr1',20,25],['chr1',26,31]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 3)
        # # self.assertEqual(result[0].initial, 20)
        # # self.assertEqual(result[0].final, 25)
        # """
        # End-to-end attach
        # A : ------      ------
        # B :       ------
        # R :       ------
        # """
        # self.region_sets([['chr1',1,5],['chr1',11,20]],
        #                  [['chr1',5,11]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # # self.assertEqual(result[0].initial, 5)
        # # self.assertEqual(result[0].final, 11)
        # """
        # Perfect overlapping
        # A : ------
        # B : ------
        # R : ------
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr1',1,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # self.assertEqual(result[0].initial, 1)
        # self.assertEqual(result[0].final, 10)
        # """
        # One overlapping region
        # A : ------
        # B :     --------
        # R :     --------
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr1',7,20]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result[0].initial, 7)
        # self.assertEqual(result[0].final, 20)
        # """
        # Two simple overlapping regions
        # A : -------      --------
        # B :     -------------
        # R :     -------------
        # """
        # self.region_sets([['chr1',1,10],['chr1',26,35]],
        #                  [['chr1',7,30]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result[0].initial, 7)
        # self.assertEqual(result[0].final, 30)
        # """
        # Two separately overlapping regions
        # A : -------      --------
        # B :     -----        --------
        # R : none
        # """
        # self.region_sets([['chr1',1,10],['chr1',26,35]],
        #                  [['chr1',7,15],['chr1',30,40]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # """
        # Many various overlapping (mixed)
        # A :   ------------------            --------   ---------
        # B : ----   -------    ------            ----------
        # R : none
        # """
        # self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]],
        #                  [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 4)
        # """
        # Different chromosomes
        # A : chr1  -------
        # B : chr2  -------
        # R : chr2  -------
        #
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr2',1,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # Completely included overlapping
        # A : ---------------------------
        # B : ----    ------       -----------
        # R : ----    ------       -----------
        # """
        # self.region_sets([['chr1',1,50]],
        #                  [['chr1',1,5],['chr1',10,19],['chr1',45,60]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 3)
        # """
        # A : ----    ------       -----------
        # B : ---------------------------
        # R : none
        # """
        # self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]],
        #                  [['chr1',1,50]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result, False)
        # """
        # A : ----         ------                  ---
        # B :        ---              -----
        # R :        ---
        # """
        # self.region_sets([['chr1',1,5],['chr1',27,45],['chr1',85,95]],
        #                  [['chr1',15,20],['chr1',55,65]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # self.assertEqual(result[0].initial, 15)
        # self.assertEqual(result[0].final, 20)
    
    def test_remove_duplicates(self):
        """remove_duplicates() drops exact repeats and keeps everything else.

        The set is de-duplicated in place; regions that merely share a start
        coordinate are expected to survive.
        """
        def dedup_and_check(size, *bounds):
            # De-duplicate setA in place, then compare size and region bounds.
            self.setA.remove_duplicates()
            kept = self.setA
            self.assertEqual(len(kept), size)
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(kept[idx].initial, start)
                self.assertEqual(kept[idx].final, end)

        # A : ===== -----
        # R : ----- -----
        self.region_sets([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25]],
                         [])
        dedup_and_check(2, (1, 10), (15, 25))

        # A : =====--- -----
        # R : =====--- -----
        self.region_sets([['chr1', 1, 10], ['chr1', 1, 15], ['chr1', 20, 25]],
                         [])
        dedup_and_check(3, (1, 10), (1, 15), (20, 25))

        # A : ===== ----- ------  ====
        # R : ----- ----- ------  ----
        self.region_sets([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25],
                          ['chr1', 30, 35], ['chr1', 40, 45], ['chr1', 40, 45]],
                         [])
        dedup_and_check(4, (1, 10), (15, 25), (30, 35), (40, 45))

    def test_window(self):
        """Check window() for several ``adding_length`` values and the default.

        Each scenario verifies how many regions come back and their
        (initial, final) bounds.
        """
        def check(regions, size, *bounds):
            # Size first, then (initial, final) of the leading regions, in order.
            self.assertEqual(len(regions), size)
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(regions[idx].initial, start)
                self.assertEqual(regions[idx].final, end)

        # A :             -------
        # B : ------[ 99 ]       [   199   ]---
        # window = 100
        # R :       -                           only one base overlaps with extending A
        self.region_sets([['chr1', 200, 300]],
                         [['chr1', 1, 101], ['chr1', 499, 550]])
        check(self.setA.window(self.setB, adding_length=100), 1, (100, 101))

        # A :             -------
        # B : ------[ 99 ]       [   199   ]---
        # window = 200
        # R : ------                        -
        # left-hand side is covered, and the right-hand side is only one base overlapped
        self.region_sets([['chr1', 200, 300]],
                         [['chr1', 1, 101], ['chr1', 499, 550]])
        # First bound is 1, not 0: GenomicRegion.extend will choose 1 rather than 0.
        check(self.setA.window(self.setB, adding_length=200), 2, (1, 101), (499, 500))

        # A :                         ----    ----
        # B :             --------                    ----
        # window = 1000 (default)
        # R :                 ----                    ----
        self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
                         [['chr1', 1500, 2500], ['chr1', 5000, 5500]])
        check(self.setA.window(self.setB), 2, (2000, 2500), (5000, 5500))

        # A :                         ----    ----
        # B :             --------                    ----
        # window = 2000
        # R :             --------                    ----
        #                     ----                    ----
        # window = 100
        # R : none
        self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
                         [['chr1', 1500, 2500], ['chr1', 5000, 5500]])
        check(self.setA.window(self.setB, adding_length=2000), 2, (1500, 2500), (5000, 5500))
        check(self.setA.window(self.setB, adding_length=100), 0)
        
    def test_subtract(self):
        """Check subtract(): the parts of setA that are not covered by setB.

        Each scenario verifies the number of remaining regions and, where
        listed, their (initial, final) bounds.
        """
        def subtract_and_check(size, *bounds):
            # Subtract setB from setA, check size and leading bounds, return the result.
            remaining = self.setA.subtract(self.setB)
            self.assertEqual(len(remaining), size)
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(remaining[idx].initial, start)
                self.assertEqual(remaining[idx].final, end)
            return remaining

        # A : none
        # B :    ------
        # R : none
        self.region_sets([], [['chr1', 6, 15]])
        subtract_and_check(0)

        # A :    ------
        # B : none
        # R :    ------
        self.region_sets([['chr1', 6, 15]], [])
        subtract_and_check(1, (6, 15))

        # A : ------
        # B :    ------
        # R : ---
        self.region_sets([['chr1', 1, 10]], [['chr1', 6, 15]])
        subtract_and_check(1, (1, 6))

        # A :    ------
        # B : ------
        # R :       ---
        self.region_sets([['chr1', 6, 15]], [['chr1', 1, 10]])
        subtract_and_check(1, (10, 15))

        # A :    ---
        # B : ---------
        # R : none
        self.region_sets([['chr1', 6, 10]], [['chr1', 1, 15]])
        subtract_and_check(0)

        # A : ---------
        # B :    ---
        # R : ---   ---
        self.region_sets([['chr1', 1, 15]], [['chr1', 6, 10]])
        subtract_and_check(2, (1, 6), (10, 15))

        # A :    ------
        # B :    ------
        # R : none
        self.region_sets([['chr1', 6, 15]], [['chr1', 6, 15]])
        subtract_and_check(0)

        # A :   ----------              ------
        # B :          ----------                    ----
        # R :   -------                 ------
        self.region_sets([['chr1', 5, 30], ['chr1', 70, 85]],
                         [['chr1', 20, 50], ['chr1', 100, 110]])
        subtract_and_check(2, (5, 20), (70, 85))

        # A :        ------   -----
        # B :    ------
        # R :          ----   -----
        self.region_sets([['chr1', 20, 30], ['chr1', 35, 55]],
                         [['chr1', 10, 23], ['chr1', 100, 110]])
        subtract_and_check(2, (23, 30), (35, 55))

        # A :   ch1     ---------------------
        #       ch2     -------------------------
        # B :   ch1             ------
        #       ch2                        ------
        # R :   ch1     --------      -------
        #       ch2     -------------------
        self.region_sets([['chr1', 0, 30000], ['chr2', 0, 35000]],
                         [['chr1', 20000, 23000], ['chr2', 31000, 35000]])
        subtract_and_check(3, (0, 20000), (23000, 30000), (0, 31000))

        # A :   -----------------------------------------------------------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ---------    -----------    --------------
        self.region_sets([['chr1', 5, 1000]],
                         [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240]])
        subtract_and_check(5)

        # A :   -----------------------              ------
        #            -----     -----  -----------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ------              ----
        #            ---         ---  ----    ---
        self.region_sets([['chr1', 5, 100], ['chr1', 20, 40], ['chr1', 60, 80],
                          ['chr1', 95, 150], ['chr1', 180, 220]],
                         [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240]])
        remaining = subtract_and_check(8)
        # Only the start of the first fragment is pinned in this scenario.
        self.assertEqual(remaining[0].initial, 5)

        # Same pattern repeated on three chromosomes
        # A :   -----------------------------------------------------------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ---------    -----------    --------------
        self.region_sets([['chr1', 5, 1000], ['chr2', 5, 1000], ['chr4', 5, 1000]],
                         [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240],
                          ['chr2', 10, 15], ['chr2', 30, 70], ['chr2', 120, 140], ['chr2', 200, 240],
                          ['chr4', 10, 15], ['chr4', 30, 70], ['chr4', 120, 140], ['chr4', 200, 240]])
        subtract_and_check(15)
        
        
    def test_merge(self):
        """Exercise merge() on setA and verify the regions left afterwards.

        Every scenario pairs the regions loaded into setA with the
        (initial, final) bounds expected once merge() has been called.
        """
        scenarios = [
            # A : none
            # R : none
            ([], []),
            # A : -----  -----
            # R : -----  -----
            ([['chr1', 1, 10], ['chr1', 15, 25]],
             [(1, 10), (15, 25)]),
            # A1: ------------   ----
            # A2:    -----
            # R : ------------   ----
            ([['chr1', 1, 30], ['chr1', 11, 20], ['chr1', 40, 50]],
             [(1, 30), (40, 50)]),
            # A1: --------       ----
            # A2:    ---------
            # R : ------------   ----
            ([['chr1', 1, 30], ['chr1', 20, 40], ['chr1', 50, 60]],
             [(1, 40), (50, 60)]),
            # A : =======
            # R : -------
            ([['chr1', 1, 30], ['chr1', 1, 30]],
             [(1, 30)]),
        ]
        for regions, expected in scenarios:
            self.region_sets(regions, [])
            # merge() is called for its effect on setA; its return value is
            # not used by this test.
            self.setA.merge()
            merged = self.setA
            self.assertEqual(len(merged), len(expected))
            for idx, (start, end) in enumerate(expected):
                self.assertEqual(merged[idx].initial, start)
                self.assertEqual(merged[idx].final, end)
        
    def test_cluster(self):
        """Check the region sets returned by cluster() for several distances.

        Each scenario loads setA once and then runs one or more checks of the
        form (distance, expected length, expected (initial, final) bounds).
        An empty bounds list means only the length is verified.
        """
        scenarios = [
            # Empty sets
            # A : none
            # R : none
            ([], [(10, 0, [])]),
            # A :  -------
            # R :  -------
            ([['chr1', 1, 10]], [(10, 1, [(1, 10)])]),
            # A :  -----
            #           ------
            # R :  -----------
            ([['chr1', 1, 10], ['chr1', 10, 20]], [(10, 1, [(1, 20)])]),
            # A :  -----  -----
            # R1:  -----  -----
            # R2:  ------------
            ([['chr1', 1, 10], ['chr1', 15, 25]],
             [(1, 2, [(1, 10), (15, 25)]),
              (5, 2, [(1, 10), (15, 25)]),
              (6, 1, [(1, 25)])]),
            # A :  ---- ----  ----   ----    ----
            # R1:  ---------  ----   ----    ----
            # R2:  ---------------   ----    ----
            # R3:  ----------------------    ----
            # R4:  ------------------------------
            # R5:  ------------------------------
            ([['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 35, 45],
              ['chr1', 60, 70], ['chr1', 90, 100]],
             [(6, 4, []), (11, 3, []), (16, 2, []), (21, 1, []), (26, 1, [])]),
        ]
        for regions, checks in scenarios:
            # setA is built once per scenario and reused by every cluster() call.
            self.region_sets(regions, [])
            for distance, count, bounds in checks:
                clustered = self.setA.cluster(distance)
                self.assertEqual(len(clustered), count)
                for idx, (start, end) in enumerate(bounds):
                    self.assertEqual(clustered[idx].initial, start)
                    self.assertEqual(clustered[idx].final, end)
        
    def test_flank(self):
        """Check the flanking regions returned by flank() for setA.

        Each scenario gives the input regions, the flank size and the
        (initial, final) bounds expected in the returned set, in order.
        """
        scenarios = [
            # A :        -----
            # R1:     ---     ---
            ([['chr1', 60, 75]], 10,
             [(50, 60), (75, 85)]),
            # A :        -----     ----
            # R1:   -----     =====    ----
            # The gap between the two regions is reported twice (75-90).
            ([['chr1', 60, 75], ['chr1', 90, 100]], 15,
             [(45, 60), (75, 90), (75, 90), (100, 115)]),
        ]
        for regions, size, expected in scenarios:
            self.region_sets(regions, [])
            flanks = self.setA.flank(size)
            self.assertEqual(len(flanks), len(expected))
            for idx, (start, end) in enumerate(expected):
                self.assertEqual(flanks[idx].initial, start)
                self.assertEqual(flanks[idx].final, end)
        
    def test_jaccard(self):
        """Check jaccard() of setA against setB on partially overlapping sets.

        self           --8--      ---10---      -4-
        y         ---10---             ---10---
        intersect      -5-             -4-
        similarity:   ( 5 + 4 )/[(8 + 10 + 4) + (10 +10) - (5 + 4 )]
                      = 9/33
        """
        self.region_sets([['chr1',50,58],['chr1',70,80],['chr1',90,94]],
                         [['chr1',45,55],['chr1',76,86]])
        result = self.setA.jaccard(self.setB)
        # 9.0 forces true division: under Python 2 the expression 9/33 is
        # integer division and evaluates to 0. assertAlmostEqual also avoids
        # depending on the exact floating-point rounding of the result.
        self.assertAlmostEqual(result, 9.0 / 33)
    
    def test_get_genome_data(self):
        """Check the number of regions get_genome_data() loads for hg19."""
        expectations = [
            # hg19
            ({}, 23),
            # hg19, with Mitochondria chromosome
            ({'chrom_M': True}, 24),
        ]
        for options, expected_len in expectations:
            genome = GenomicRegionSet("hg19")
            genome.get_genome_data(organism="hg19", **options)
            self.assertEqual(len(genome), expected_len)
        
    def test_random_regions(self):
        """Smoke-test random_regions() with several option combinations.

        No assertions are made on the sampled regions: the test only checks
        that each call and the following sort() complete without raising.
        """
        large = [['chr1',0,10000],['chr2',0,20000],['chrX',0,30000]]
        small = [['chr1',0,1000],['chr2',0,2000],['chrX',0,3000]]
        runs = [
            (large, dict(total_size=100,
                         overlap_result=False, overlap_input=False)),
            (large, dict(total_size=100,
                         overlap_result=True, overlap_input=False)),
            (large, dict(total_size=100,
                         overlap_result=False, overlap_input=True)),
            (large, dict(total_size=100,
                         overlap_result=True, overlap_input=True)),
            (small, dict(multiply_factor=100,
                         overlap_result=False, overlap_input=False)),
            (small, dict(multiply_factor=100,
                         overlap_result=False, overlap_input=False,
                         chrom_M=True)),
        ]
        for regions, options in runs:
            self.region_sets(regions, [])
            sampled = self.setA.random_regions(organism="mm9", **options)
            sampled.sort()
            # For debugging, the sampled regions can be inspected through
            # sampled.sequences (chrom, initial, final) and
            # sampled.within_overlap().
示例#21
0
class TestGenomicRegionSet(unittest.TestCase):
    def region_sets(self, listA, listB):
        """Build self.setA and self.setB for a single test scenario.

        listA and listB are sequences of entries indexed as
        [chrom, initial, final]; one GenomicRegion is added per entry.
        """
        def fill(target, entries):
            # Add one GenomicRegion per [chrom, initial, final] entry.
            for entry in entries:
                target.add(
                    GenomicRegion(chrom=entry[0],
                                  initial=entry[1],
                                  final=entry[2]))

        self.setA = GenomicRegionSet('for Unit Test')
        fill(self.setA, listA)

        self.setB = GenomicRegionSet('for Unit Test')
        fill(self.setB, listB)

    def test_extend(self):
        """Check extend() on setA for fixed and percentage extensions.

        Each table entry holds the input regions, the positional arguments
        and keyword options passed to extend(), and the expected
        (initial, final, chrom) of every region afterwards. A chrom of None
        means the chromosome is not checked for that region.
        """
        # Two empty sets
        # A : none
        # R : none
        self.region_sets([], [])
        self.setA.extend(100, 100)
        self.assertEqual(len(self.setA.sequences), 0)

        scenarios = [
            # One region
            # A :   -----
            # R : ---------
            ([['chr1', 5, 10]], (4, 4), {},
             [(1, 14, None)]),
            # Many region
            # A :   -----   ------         -----    -----
            # R : --------=---------     ------------------
            ([['chr1', 5, 10], ['chr1', 15, 20], ['chr1', 40, 50],
              ['chr1', 65, 75]], (5, 5), {},
             [(0, 15, None), (10, 25, None), (35, 55, None), (60, 80, None)]),
            # Many region in different chromosome
            # A :   -----   ------         -----    -----
            # R : four regions, each extended and kept on its own chromosome
            ([['chr1', 5, 10], ['chr2', 15, 20], ['chr3', 40, 50],
              ['chr4', 65, 75]], (5, 5), {},
             [(0, 15, 'chr1'), (10, 25, 'chr2'), (35, 55, 'chr3'),
              (60, 80, 'chr4')]),
            # One region, extension given with percentage=True
            # A :   -----
            # R : ---------
            ([['chr1', 100, 200]], (10, 10), {'percentage': True},
             [(90, 210, None)]),
        ]
        for regions, amounts, options, expected in scenarios:
            self.region_sets(regions, [])
            extended = self.setA
            # extend() is called for its effect on the set itself.
            extended.extend(*amounts, **options)
            self.assertEqual(len(extended), len(expected))
            for idx, (start, end, chrom) in enumerate(expected):
                self.assertEqual(extended[idx].initial, start)
                self.assertEqual(extended[idx].final, end)
                if chrom is not None:
                    self.assertEqual(extended[idx].chrom, chrom)

    def test_sort(self):
        """Check that sort() leaves setA ordered by coordinates.

        The regions are added out of order (the lowest one last); after
        sort() they are expected in ascending order.
        """
        self.region_sets([['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75],
                          ['chr1', 5, 10]], [])
        self.setA.sort()
        # All regions share chr1, so ascending coordinates are the expected
        # order. NOTE(review): confirm against GenomicRegionSet.sort()'s
        # default ordering.
        expected = [(5, 10), (15, 20), (40, 50), (65, 75)]
        self.assertEqual(len(self.setA), len(expected))
        for idx, (start, end) in enumerate(expected):
            self.assertEqual(self.setA[idx].initial, start)
            self.assertEqual(self.setA[idx].final, end)

    def test_intersect(self):
        """Check intersect() of setA with setB under three overlap modes.

        Every scenario runs three calls in this order: the overlap
        intersection (default mode unless options are given), then
        mode=OverlapType.ORIGINAL, then mode=OverlapType.COMP_INCL.
        Each expectation is (length, bounds), where bounds lists the
        (initial, final) of the leading result regions that are verified.
        In the diagrams R1 is the overlap result, R2 the original-mode
        result and R3 the comp_incl result.
        """
        none = (0, [])
        original_mode = {'mode': OverlapType.ORIGINAL}
        comp_incl_mode = {'mode': OverlapType.COMP_INCL}

        def check(options, expectation):
            # Run one intersect() call and compare it with the expectation.
            count, bounds = expectation
            found = self.setA.intersect(self.setB, **options)
            self.assertEqual(len(found), count)
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(found[idx].initial, start)
                self.assertEqual(found[idx].final, end)

        perfect = [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650],
                   ['chr1', 700, 750], ['chr1', 725, 800]]

        # (setA regions, setB regions, options of the overlap call,
        #  overlap expectation, original expectation, comp_incl expectation)
        scenarios = [
            # Two empty sets
            # A : none
            # B : none
            # R : none
            ([], [], {}, none, none, none),
            # One empty set
            # A :   -----
            # B : none
            # R : none
            ([['chr1', 5, 10]], [], {}, none, none, none),
            # A : none
            # B :   -----
            # R : none
            ([], [['chr1', 5, 10]], {}, none, none, none),
            # No overlapping
            # A : ------      ---------               -------
            # B :        ----          ------  ------
            # R : none
            ([['chr1', 1, 5], ['chr1', 11, 20], ['chr1', 33, 38]],
             [['chr1', 7, 9], ['chr1', 20, 25], ['chr1', 26, 31]],
             {}, none, none, none),
            # End-to-end attach
            # A : ------      ------
            # B :       ------
            # R : none
            ([['chr1', 1, 5], ['chr1', 11, 20]], [['chr1', 5, 11]],
             {}, none, none, none),
            # No length attach
            # A : .      .
            # B :    .   .
            # R : none
            ([['chr1', 2, 2], ['chr1', 20, 20]],
             [['chr1', 5, 5], ['chr1', 20, 20]],
             {}, none, none, none),
            # Perfect overlapping
            # A : ------
            # B : ------
            # R : ------
            # Only the first result region is checked besides the length.
            (perfect, perfect,
             {'mode': OverlapType.OVERLAP, 'rm_duplicates': True},
             (4, [(1, 10)]),
             (5, [(1, 10)]),
             (5, [(1, 10)])),
            # One overlapping region
            # A : ------
            # B :     --------
            # R1:     --       (overlap)
            # R2: ------       (original)
            # R3:              (comp_incl)
            ([['chr1', 1, 10]], [['chr1', 7, 20]], {},
             (1, [(7, 10)]),
             (1, [(1, 10)]),
             none),
            # Two simple overlapping regions
            # A : -------      --------
            # B :     -------------
            # R1:     ---      ----     (overlap)
            # R2: -------      -------- (original)
            # R3:                       (comp_incl)
            ([['chr1', 1, 10], ['chr1', 26, 35]], [['chr1', 7, 30]], {},
             (2, [(7, 10), (26, 30)]),
             (2, [(1, 10), (26, 35)]),
             none),
            # Two separately overlapping regions
            # A : -------      --------
            # B :     -----        --------
            # R1:     ---          ----     (overlap)
            # R2: -------      --------     (original)
            # R3:                           (comp_incl)
            ([['chr1', 1, 10], ['chr1', 26, 35]],
             [['chr1', 7, 15], ['chr1', 30, 40]], {},
             (2, [(7, 10), (30, 35)]),
             (2, [(1, 10), (26, 35)]),
             none),
            # Many various overlapping (mixed)
            # A :   ------------------            --------   ---------
            # B : ----   -------    ------            ----------
            # R1:   --   -------    --                ----   ---       (overlap)
            # R2:   ------------------            --------   --------- (original)
            # R3:                                                      (comp_incl)
            ([['chr1', 3, 30], ['chr1', 50, 60], ['chr1', 70, 85]],
             [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 27, 35],
              ['chr1', 55, 75]],
             {'mode': OverlapType.OVERLAP},
             (5, [(3, 5), (10, 19), (27, 30), (55, 60), (70, 75)]),
             (3, [(3, 30), (50, 60), (70, 85)]),
             none),
            # Different chromosomes
            # A : chr1  -------
            # B : chr2  -------
            # R : none
            ([['chr1', 1, 10]], [['chr2', 1, 10]], {}, none, none, none),
            # Completely included overlapping
            # A : ---------------------------
            # B : ----    ------       -----------
            # R1: ----    ------       ------      (overlap)
            # R2: ---------------------------      (original)
            # R3:                                  (comp_incl)
            ([['chr1', 1, 50]],
             [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]], {},
             (3, [(1, 5), (10, 19), (45, 50)]),
             (1, [(1, 50)]),
             none),
            # A : ----    ------       -----------
            # B : ---------------------------
            # R1: ----    ------       ------      (overlap)
            # R2: ----    ------       ----------- (original)
            # R3: ----    ------                   (comp_incl)
            ([['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]],
             [['chr1', 1, 50]], {},
             (3, [(1, 5), (10, 19), (45, 50)]),
             (3, [(1, 5), (10, 19), (45, 60)]),
             (2, [(1, 5), (10, 19)])),
            # A : --------------         -------
            #         ------
            # B :       -----          ----------------
            # R1:       -----            -------      (overlap)
            #           ----
            # R2: --------------         -------      (original)
            #         ------
            # R3:                        -------      (comp_incl)
            ([['chr1', 1, 50], ['chr1', 20, 40], ['chr1', 70, 80]],
             [['chr1', 25, 45], ['chr1', 65, 95]], {},
             (2, [(25, 45), (70, 80)]),
             (3, [(1, 50), (20, 40), (70, 80)]),
             (1, [(70, 80)])),
        ]

        for (regions_a, regions_b, overlap_options,
             overlap_exp, original_exp, comp_incl_exp) in scenarios:
            self.region_sets(regions_a, regions_b)
            check(overlap_options, overlap_exp)
            check(original_mode, original_exp)
            check(comp_incl_mode, comp_incl_exp)

    def test_closest(self):
        """
        Two empty sets
        A : none 
        B : none
        R : none
        """
        self.region_sets([], [])
        result = self.setA.closest(self.setB)
        self.assertEqual(len(result), 0)
        # """
        # One empty set
        # A :   -----
        # B : none
        # R : none
        # """
        # self.region_sets([['chr1',5,10]],
        #                  [])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # A : none
        # B :   -----
        # R : none
        # """
        # self.region_sets([],
        #                  [['chr1',5,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # Overlapping within set
        # A : -----====-----
        # B :      ----
        # R :      ----
        # """
        # self.region_sets([['chr1',1,10],['chr1',6,15]],
        #                  [['chr1',6,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # """
        # A :      ----
        # B : -----====-----
        # R : -----====-----
        # """
        # self.region_sets([['chr1',6,10]],
        #                  [['chr1',1,10],['chr1',6,15]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # """
        # No overlapping
        # A : ------      ---------               -------
        # B :        ----          ------  ------
        # R :                      ------
        # """
        # self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]],
        #                  [['chr1',7,9],['chr1',20,25],['chr1',26,31]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 3)
        # # self.assertEqual(result[0].initial, 20)
        # # self.assertEqual(result[0].final, 25)
        # """
        # End-to-end attach
        # A : ------      ------
        # B :       ------
        # R :       ------
        # """
        # self.region_sets([['chr1',1,5],['chr1',11,20]],
        #                  [['chr1',5,11]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # # self.assertEqual(result[0].initial, 5)
        # # self.assertEqual(result[0].final, 11)
        # """
        # Perfect overlapping
        # A : ------
        # B : ------
        # R : ------
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr1',1,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # self.assertEqual(result[0].initial, 1)
        # self.assertEqual(result[0].final, 10)
        # """
        # One overlapping region
        # A : ------
        # B :     --------
        # R :     --------
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr1',7,20]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result[0].initial, 7)
        # self.assertEqual(result[0].final, 20)
        # """
        # Two simple overlapping regions
        # A : -------      --------
        # B :     -------------
        # R :     -------------
        # """
        # self.region_sets([['chr1',1,10],['chr1',26,35]],
        #                  [['chr1',7,30]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result[0].initial, 7)
        # self.assertEqual(result[0].final, 30)
        # """
        # Two separately overlapping regions
        # A : -------      --------
        # B :     -----        --------
        # R : none
        # """
        # self.region_sets([['chr1',1,10],['chr1',26,35]],
        #                  [['chr1',7,15],['chr1',30,40]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # """
        # Many various overlapping (mixed)
        # A :   ------------------            --------   ---------
        # B : ----   -------    ------            ----------
        # R : none
        # """
        # self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]],
        #                  [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 4)
        # """
        # Different chromosomes
        # A : chr1  -------
        # B : chr2  -------
        # R : chr2  -------
        #
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr2',1,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # Completely included overlapping
        # A : ---------------------------
        # B : ----    ------       -----------
        # R : ----    ------       -----------
        # """
        # self.region_sets([['chr1',1,50]],
        #                  [['chr1',1,5],['chr1',10,19],['chr1',45,60]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 3)
        # """
        # A : ----    ------       -----------
        # B : ---------------------------
        # R : none
        # """
        # self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]],
        #                  [['chr1',1,50]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result, False)
        # """
        # A : ----         ------                  ---
        # B :        ---              -----
        # R :        ---
        # """
        # self.region_sets([['chr1',1,5],['chr1',27,45],['chr1',85,95]],
        #                  [['chr1',15,20],['chr1',55,65]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # self.assertEqual(result[0].initial, 15)
        # self.assertEqual(result[0].final, 20)

    def test_remove_duplicates(self):
        """Check that remove_duplicates() collapses only identical regions of setA."""
        # Each scenario: (regions loaded into A, expected (initial, final) pairs)
        scenarios = [
            # A : ===== -----
            # R : ----- -----
            ([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25]],
             [(1, 10), (15, 25)]),
            # Same start but different end: nothing is removed.
            # A : =====--- -----
            # R : =====--- -----
            ([['chr1', 1, 10], ['chr1', 1, 15], ['chr1', 20, 25]],
             [(1, 10), (1, 15), (20, 25)]),
            # A : ===== ----- ------  ====
            # R : ----- ----- ------  ----
            ([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25],
              ['chr1', 30, 35], ['chr1', 40, 45], ['chr1', 40, 45]],
             [(1, 10), (15, 25), (30, 35), (40, 45)]),
        ]
        for input_regions, bounds in scenarios:
            self.region_sets(input_regions, [])
            self.setA.remove_duplicates()
            deduped = self.setA
            self.assertEqual(len(deduped), len(bounds))
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(deduped[idx].initial, start)
                self.assertEqual(deduped[idx].final, end)

    def test_window(self):
        """Check setA.window(setB) for several window sizes (adding_length)."""
        # Each scenario: (regions of A, regions of B, checks), where every check
        # is (keyword arguments for window(), expected (initial, final) pairs).
        scenarios = [
            # A :             -------
            # B : ------[ 99 ]       [   199   ]---
            # window = 100
            # R :       -              only one base overlaps with extending A
            ([['chr1', 200, 300]],
             [['chr1', 1, 101], ['chr1', 499, 550]],
             [({"adding_length": 100}, [(100, 101)])]),
            # window = 200
            # R : ------                        -
            # left-hand side is covered, and the right-hand side is only one
            # base overlapped; GenomicRegion.extend will choose 1 rather than 0
            ([['chr1', 200, 300]],
             [['chr1', 1, 101], ['chr1', 499, 550]],
             [({"adding_length": 200}, [(1, 101), (499, 500)])]),
            # A :                         ----    ----
            # B :             --------                    ----
            # window = 1000 (default)
            # R :                 ----                    ----
            ([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
             [['chr1', 1500, 2500], ['chr1', 5000, 5500]],
             [({}, [(2000, 2500), (5000, 5500)])]),
            # window = 2000
            # R :             --------                    ----
            # window = 100
            # R : none
            ([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
             [['chr1', 1500, 2500], ['chr1', 5000, 5500]],
             [({"adding_length": 2000}, [(1500, 2500), (5000, 5500)]),
              ({"adding_length": 100}, [])]),
        ]
        for regions_a, regions_b, checks in scenarios:
            self.region_sets(regions_a, regions_b)
            for window_kwargs, bounds in checks:
                hits = self.setA.window(self.setB, **window_kwargs)
                self.assertEqual(len(hits), len(bounds))
                for idx, (start, end) in enumerate(bounds):
                    self.assertEqual(hits[idx].initial, start)
                    self.assertEqual(hits[idx].final, end)

    def test_subtract(self):
        """Check setA.subtract(setB) against expected region counts and bounds."""
        # Each scenario: (regions of A, regions of B, expected number of
        # results, expected (initial, final) of the leading results).
        scenarios = [
            # A : none
            # B :    ------
            # R : none
            ([], [['chr1', 6, 15]], 0, []),
            # A :    ------
            # B : none
            # R :    ------
            ([['chr1', 6, 15]], [], 1, [(6, 15)]),
            # A : ------
            # B :    ------
            # R : ---
            ([['chr1', 1, 10]], [['chr1', 6, 15]], 1, [(1, 6)]),
            # A :    ------
            # B : ------
            # R :       ---
            ([['chr1', 6, 15]], [['chr1', 1, 10]], 1, [(10, 15)]),
            # A :    ---
            # B : ---------
            # R : none
            ([['chr1', 6, 10]], [['chr1', 1, 15]], 0, []),
            # A : ---------
            # B :    ---
            # R : ---   ---
            ([['chr1', 1, 15]], [['chr1', 6, 10]], 2, [(1, 6), (10, 15)]),
            # A :    ------
            # B :    ------
            # R : none
            ([['chr1', 6, 15]], [['chr1', 6, 15]], 0, []),
            # A :   ----------              ------
            # B :          ----------                    ----
            # R :   -------                 ------
            ([['chr1', 5, 30], ['chr1', 70, 85]],
             [['chr1', 20, 50], ['chr1', 100, 110]],
             2, [(5, 20), (70, 85)]),
            # A :        ------   -----
            # B :    ------
            # R :          ----   -----
            ([['chr1', 20, 30], ['chr1', 35, 55]],
             [['chr1', 10, 23], ['chr1', 100, 110]],
             2, [(23, 30), (35, 55)]),
            # A :   ch1     ---------------------
            #       ch2     -------------------------
            # B :   ch1             ------
            #       ch2                        ------
            # R :   ch1     --------      -------
            #       ch2     -------------------
            ([['chr1', 0, 30000], ['chr2', 0, 35000]],
             [['chr1', 20000, 23000], ['chr2', 31000, 35000]],
             3, [(0, 20000), (23000, 30000), (0, 31000)]),
            # A :   -----------------------------------------------------------
            # B :    ---    ---------         ----           ----
            # R :   -   ----         ---------    -----------    --------------
            ([['chr1', 5, 1000]],
             [['chr1', 10, 15], ['chr1', 30, 70],
              ['chr1', 120, 140], ['chr1', 200, 240]],
             5, []),
        ]
        for regions_a, regions_b, count, bounds in scenarios:
            self.region_sets(regions_a, regions_b)
            remainder = self.setA.subtract(self.setB)
            self.assertEqual(len(remainder), count)
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(remainder[idx].initial, start)
                self.assertEqual(remainder[idx].final, end)

        # Overlapping regions inside A itself; only the first start is pinned.
        # A :   -----------------------              ------
        #            -----     -----  -----------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ------              ----
        #            ---         ---  ----    ---
        self.region_sets([['chr1', 5, 100], ['chr1', 20, 40], ['chr1', 60, 80],
                          ['chr1', 95, 150], ['chr1', 180, 220]],
                         [['chr1', 10, 15], ['chr1', 30, 70],
                          ['chr1', 120, 140], ['chr1', 200, 240]])
        remainder = self.setA.subtract(self.setB)
        self.assertEqual(len(remainder), 8)
        self.assertEqual(remainder[0].initial, 5)

        # The single-chromosome pattern above repeated on chr1, chr2 and chr4.
        # A :   -----------------------------------------------------------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ---------    -----------    --------------
        self.region_sets(
            [['chr1', 5, 1000], ['chr2', 5, 1000], ['chr4', 5, 1000]],
            [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140],
             ['chr1', 200, 240], ['chr2', 10, 15], ['chr2', 30, 70],
             ['chr2', 120, 140], ['chr2', 200, 240], ['chr4', 10, 15],
             ['chr4', 30, 70], ['chr4', 120, 140], ['chr4', 200, 240]])
        remainder = self.setA.subtract(self.setB)
        self.assertEqual(len(remainder), 15)

    def test_merge(self):
        """Check that merge() fuses overlapping regions of setA in place."""
        # Each scenario: (regions loaded into A, expected (initial, final) pairs)
        scenarios = [
            # A : none
            # R : none
            ([], []),
            # A : -----  -----
            # R : -----  -----
            ([['chr1', 1, 10], ['chr1', 15, 25]],
             [(1, 10), (15, 25)]),
            # A1: ------------   ----
            # A2:    -----
            # R : ------------   ----
            ([['chr1', 1, 30], ['chr1', 11, 20], ['chr1', 40, 50]],
             [(1, 30), (40, 50)]),
            # A1: --------       ----
            # A2:    ---------
            # R : ------------   ----
            ([['chr1', 1, 30], ['chr1', 20, 40], ['chr1', 50, 60]],
             [(1, 40), (50, 60)]),
            # A : =======
            # R : -------
            ([['chr1', 1, 30], ['chr1', 1, 30]],
             [(1, 30)]),
        ]
        for input_regions, bounds in scenarios:
            self.region_sets(input_regions, [])
            self.setA.merge()
            merged = self.setA
            self.assertEqual(len(merged), len(bounds))
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(merged[idx].initial, start)
                self.assertEqual(merged[idx].final, end)

    def test_cluster(self):
        """Check setA.cluster(distance) for growing distance thresholds."""
        # Each scenario: (regions loaded into A, checks), where every check is
        # (distance, expected number of clusters, expected (initial, final) of
        # the leading clusters -- empty when only the count is verified).
        scenarios = [
            # Empty sets
            # A : none
            # R : none
            ([], [(10, 0, [])]),
            # A :  -------
            # R :  -------
            ([['chr1', 1, 10]], [(10, 1, [(1, 10)])]),
            # A :  -----
            #           ------
            # R :  -----------
            ([['chr1', 1, 10], ['chr1', 10, 20]], [(10, 1, [(1, 20)])]),
            # A :  -----  -----
            # R1:  -----  -----
            # R2:  ------------
            ([['chr1', 1, 10], ['chr1', 15, 25]],
             [(1, 2, [(1, 10), (15, 25)]),
              (5, 2, [(1, 10), (15, 25)]),
              (6, 1, [(1, 25)])]),
            # A :  ---- ----  ----   ----    ----
            # R1:  ---------  ----   ----    ----
            # R2:  ---------------   ----    ----
            # R3:  ----------------------    ----
            # R4:  ------------------------------
            # R5:  ------------------------------
            ([['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 35, 45],
              ['chr1', 60, 70], ['chr1', 90, 100]],
             [(6, 4, []), (11, 3, []), (16, 2, []), (21, 1, []), (26, 1, [])]),
        ]
        for input_regions, checks in scenarios:
            # The set is loaded once and clustered with every listed distance.
            self.region_sets(input_regions, [])
            for distance, count, bounds in checks:
                clusters = self.setA.cluster(distance)
                self.assertEqual(len(clusters), count)
                for idx, (start, end) in enumerate(bounds):
                    self.assertEqual(clusters[idx].initial, start)
                    self.assertEqual(clusters[idx].final, end)

    def test_flank(self):
        """Check that flank(size) yields one region on each side of every input."""
        # Each scenario: (regions loaded into A, flank size,
        #                 expected (initial, final) pairs)
        scenarios = [
            # A :        -----
            # R1:     ---     ---
            ([['chr1', 60, 75]], 10,
             [(50, 60), (75, 85)]),
            # The gap between the two inputs is reported twice (once per input).
            # A :        -----     ----
            # R1:   -----     =====    ----
            ([['chr1', 60, 75], ['chr1', 90, 100]], 15,
             [(45, 60), (75, 90), (75, 90), (100, 115)]),
        ]
        for input_regions, size, bounds in scenarios:
            self.region_sets(input_regions, [])
            flanks = self.setA.flank(size)
            self.assertEqual(len(flanks), len(bounds))
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(flanks[idx].initial, start)
                self.assertEqual(flanks[idx].final, end)

    def test_jaccard(self):
        """Check the Jaccard index of setA against setB.

        self           --8--      ---10---      -4-
        y         ---10---             ---10---
        intersect      -5-             -4-
        similarity:   ( 5 + 4 )/[(8 + 10 + 4) + (10 +10) - (5 + 4 )]
                      = 9/33
        """
        self.region_sets(
            [['chr1', 50, 58], ['chr1', 70, 80], ['chr1', 90, 94]],
            [['chr1', 45, 55], ['chr1', 76, 86]])
        result = self.setA.jaccard(self.setB)
        # Compare with a tolerance instead of exact float equality, and force
        # true division so the expected value is not 0 under integer division.
        self.assertAlmostEqual(result, 9.0 / 33)

    def test_get_genome_data(self):
        """Check how many regions get_genome_data() loads for hg19."""
        # (extra keyword arguments, expected number of regions)
        expectations = [
            ({}, 23),                 # hg19
            ({"chrom_M": True}, 24),  # hg19, with Mitochondria chromosome
        ]
        for extra_kwargs, expected_count in expectations:
            genome_regions = GenomicRegionSet("hg19")
            genome_regions.get_genome_data(organism="hg19", **extra_kwargs)
            self.assertEqual(len(genome_regions), expected_count)

    def test_random_regions(self):
        """Smoke-test random_regions() with several option combinations.

        Nothing is asserted about the sampled regions; the test only verifies
        that each call and the subsequent sort() finish without raising.
        """

        def wide_input():
            # Fresh list per run so no state is shared between scenarios.
            return [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]]

        def narrow_input():
            return [['chr1', 0, 1000], ['chr2', 0, 2000], ['chrX', 0, 3000]]

        # (factory for the regions of A, keyword arguments for random_regions)
        runs = [
            (wide_input, {"total_size": 100,
                          "overlap_result": False, "overlap_input": False}),
            (wide_input, {"total_size": 100,
                          "overlap_result": True, "overlap_input": False}),
            (wide_input, {"total_size": 100,
                          "overlap_result": False, "overlap_input": True}),
            (wide_input, {"total_size": 100,
                          "overlap_result": True, "overlap_input": True}),
            (narrow_input, {"multiply_factor": 100,
                            "overlap_result": False, "overlap_input": False}),
            (narrow_input, {"multiply_factor": 100,
                            "overlap_result": False, "overlap_input": False,
                            "chrom_M": True}),
        ]
        for make_input, options in runs:
            self.region_sets(make_input(), [])
            sampled = self.setA.random_regions(organism="mm9", **options)
            sampled.sort()
            # To inspect a run by hand, print s.chrom, s.initial, s.final for
            # s in sampled.sequences, and sampled.within_overlap().
示例#22
0
def _write_block_exons(gr, genome, directory):
    """Write one FASTA record per BED block (exon) of transcript *gr*.

    The output file is ``<gr.name>.fa`` inside *directory*. On the minus
    strand each exon is reverse-complemented, numbered in the opposite
    direction to the genomic coordinates, and the records are written in
    reversed order.
    """
    fields = gr.data.split("\t")
    with open(os.path.join(directory, gr.name + ".fa"), "w") as f:
        if len(fields) != 7:
            # The (empty) file is still created, as before.
            print("Warning: The given regions have no block information, please try write_bed_blocks")
            return

        n = int(fields[4])
        # Trailing commas in the BED block lists produce empty items; drop them.
        blocks = [int(b) for b in fields[5].split(",") if b]
        starts = [int(s) for s in fields[6].split(",") if s]
        minus_strand = gr.orientation == "-"

        records = []
        for i in range(n):
            start = gr.initial + starts[i]
            end = start + blocks[i]
            ex = "exon:" + str(n - i if minus_strand else i + 1)
            region_tag = "_".join(["REGION", gr.chrom, str(start), str(end),
                                   gr.orientation])
            seq = genome.fetch(gr.chrom, start - 1, end - 1)
            if minus_strand:
                seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
            records.append((">" + " ".join([gr.name, ex, region_tag]), seq))

        if minus_strand:
            records.reverse()
        for header, seq in records:
            print(header, file=f)
            print(seq, file=f)


def _write_region_group(regions, genome, directory):
    """Write all *regions* (which share one name) to ``<name>.fa`` in *directory*."""
    with open(os.path.join(directory, regions[0].name + ".fa"), "w") as f:
        for g in regions:
            tag_parts = ["REGION", g.chrom, str(g.initial), str(g.final)]
            try:
                # Use the strand of the region being written, not of whichever
                # region the caller's loop happens to be on.
                regiontag = "_".join(tag_parts + [g.orientation])
            except TypeError:
                # Orientation is not a string (e.g. None): leave it out.
                regiontag = "_".join(tag_parts)
            print(">" + " ".join([g.name, regiontag]), file=f)
            print(genome.fetch(g.chrom, g.initial, g.final), file=f)


def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequences of the transcripts in a BED file into FASTA files.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    If the data of the first region has 7 tab-separated fields, every region is
    treated as a transcript with block information and its exons are written to
    ``<name>.fa``. Otherwise regions are grouped by name (unnamed regions are
    named after ``toString()``) and each group is written to one file.

    Output:
        Each FASTA file represents a transcript and contains all of its exons.
        *directory* is created if it does not exist. An empty BED file
        produces no output.

    Raises:
        SystemExit: in block mode, when a region has no name.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read(bed)
    regionset.sort()

    if not regionset.sequences:
        return

    # Always bind the flag: a first region with data of another width simply
    # means "no block information".
    try:
        blockinfor = len(regionset.sequences[0].data.split("\t")) == 7
    except (AttributeError, TypeError):
        blockinfor = False

    genome = pysam.Fastafile(genome_path)
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)

        if blockinfor:
            for gr in regionset:
                if not gr.name:
                    print("Error: For fetching exon sequences, please define the transcript name.")
                    sys.exit()
                _write_block_exons(gr, genome, directory)
        else:
            # Fill in missing names first so sorting and grouping by name work.
            for gr in regionset:
                if not gr.name:
                    gr.name = gr.toString()
            regionset.sequences.sort(key=lambda g: g.name)

            group = []
            for gr in regionset:
                if group and gr.name != group[0].name:
                    _write_region_group(group, genome, directory)
                    group = []
                group.append(gr)
            # Last transcript
            _write_region_group(group, genome, directory)
    finally:
        genome.close()
示例#23
0
def _write_block_exons(gr, genome, directory):
    """Write one FASTA record per BED block (exon) of transcript *gr*.

    The output file is ``<gr.name>.fa`` inside *directory*. On the minus
    strand each exon is reverse-complemented, numbered in the opposite
    direction to the genomic coordinates, and the records are written in
    reversed order.
    """
    fields = gr.data.split("\t")
    with open(os.path.join(directory, gr.name + ".fa"), "w") as f:
        if len(fields) != 7:
            # The (empty) file is still created, as before.
            print("Warning: The given regions have no block information, please try write_bed_blocks")
            return

        n = int(fields[4])
        # Trailing commas in the BED block lists produce empty items; drop them.
        blocks = [int(b) for b in fields[5].split(",") if b]
        starts = [int(s) for s in fields[6].split(",") if s]
        minus_strand = gr.orientation == "-"

        records = []
        for i in range(n):
            start = gr.initial + starts[i]
            end = start + blocks[i]
            ex = "exon:" + str(n - i if minus_strand else i + 1)
            region_tag = "_".join(["REGION", gr.chrom, str(start), str(end),
                                   gr.orientation])
            seq = genome.fetch(gr.chrom, start - 1, end - 1)
            if minus_strand:
                seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
            records.append((">" + " ".join([gr.name, ex, region_tag]), seq))

        if minus_strand:
            records.reverse()
        for header, seq in records:
            print(header, file=f)
            print(seq, file=f)


def _write_region_group(regions, genome, directory):
    """Write all *regions* (which share one name) to ``<name>.fa`` in *directory*."""
    with open(os.path.join(directory, regions[0].name + ".fa"), "w") as f:
        for g in regions:
            tag_parts = ["REGION", g.chrom, str(g.initial), str(g.final)]
            try:
                # Use the strand of the region being written, not of whichever
                # region the caller's loop happens to be on.
                regiontag = "_".join(tag_parts + [g.orientation])
            except TypeError:
                # Orientation is not a string (e.g. None): leave it out.
                regiontag = "_".join(tag_parts)
            print(">" + " ".join([g.name, regiontag]), file=f)
            print(genome.fetch(g.chrom, g.initial, g.final), file=f)


def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequences of the transcripts in a BED file into FASTA files.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    If the data of the first region has 7 tab-separated fields, every region is
    treated as a transcript with block information and its exons are written to
    ``<name>.fa``. Otherwise regions are grouped by name (unnamed regions are
    named after ``toString()``) and each group is written to one file.

    Output:
        Each FASTA file represents a transcript and contains all of its exons.
        *directory* is created if it does not exist. An empty BED file
        produces no output.

    Raises:
        SystemExit: in block mode, when a region has no name.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read_bed(bed)
    regionset.sort()

    if not regionset.sequences:
        return

    # Always bind the flag: a first region with data of another width simply
    # means "no block information".
    try:
        blockinfor = len(regionset.sequences[0].data.split("\t")) == 7
    except (AttributeError, TypeError):
        blockinfor = False

    genome = pysam.Fastafile(genome_path)
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)

        if blockinfor:
            for gr in regionset:
                if not gr.name:
                    print("Error: For fetching exon sequences, please define the transcript name.")
                    sys.exit()
                _write_block_exons(gr, genome, directory)
        else:
            # Fill in missing names first so sorting and grouping by name work.
            for gr in regionset:
                if not gr.name:
                    gr.name = gr.toString()
            regionset.sequences.sort(key=lambda g: g.name)

            group = []
            for gr in regionset:
                if group and gr.name != group[0].name:
                    _write_region_group(group, genome, directory)
                    group = []
                group.append(gr)
            # Last transcript
            _write_region_group(group, genome, directory)
    finally:
        genome.close()
示例#24
0
import unittest
from rgt.GenomicRegionSet import *
from rgt.CoverageSet import CoverageSet

# Shared fixtures: two 1 kb regions on chr1, one per strand.
# The strand is passed by keyword so it cannot land in another positional
# slot of GenomicRegion (such as the region name).
regions = GenomicRegionSet("test")
regions.add(GenomicRegion("chr1", 10000, 11000, orientation="+"))
regions.add(GenomicRegion("chr1", 20000, 21000, orientation="-"))

cov = CoverageSet("coverage", regions)

# NOTE(review): machine-specific paths; the tests only work where these exist.
bamfile = "/projects/lncRNA/local/cardio/total_rna/bam/d4_1.bam"
bedfile = "~/rgtdata/hg38/genes_hg38.bed"

class CoverageSet_Test(unittest.TestCase):
    """Tests for the module-level CoverageSet ``cov``."""

    def test_coverage_from_genomicset(self):
        """Compute coverage from the BAM file and compare it with the expected value.

        The method carries the ``test`` prefix so unittest discovery runs it.
        It is skipped when the BAM file is not present on this machine.
        """
        import os  # local import: only needed for the fixture check

        if not os.path.exists(bamfile):
            self.skipTest("BAM file not available: " + bamfile)
        cov.coverage_from_genomicset(bamfile)
        print(cov.coverage)
        self.assertEqual(cov.coverage, 4)

    # Backward-compatible alias for the original (non-discoverable) name.
    coverage_from_genomicset = test_coverage_from_genomicset
示例#25
0
 def test_filter_tts(self):
     """Call count_tts() on the sample interactions for one chr2 window.

     The return value is not checked; the test passes when nothing raises.
     """
     interactions = RNADNAInteractionSet(organism="hg19", filename=sample_txp)
     window_set = GenomicRegionSet("g")
     window_set.add(
         GenomicRegion(chrom="chr2", initial=74000000, final=75000000))
     interactions.count_tts(window_set)
 def test_filter_tts(self):
 	"""Call count_tts() on the sample interactions for one chr2 window; the result is not asserted."""
 	# Interactions are loaded from the sample_txp file for organism hg19.
 	txp = RNADNAInteractionSet(organism="hg19", filename=sample_txp)
 	# Single query region: chr2, 74000000-75000000.
 	g = GenomicRegionSet("g")
 	s = GenomicRegion(chrom="chr2", initial=74000000, final=75000000)
 	g.add(s)
 	result = txp.count_tts(g)