示例#1
0
def mode_2(exp_matrix):
    """Write one line per peak with its value and its associated genes.

    For every region set returned by ``exp_matrix.get_regionsets()`` a file
    ``region_<name>.data`` is written to the current directory. Each line
    holds, tab-separated: chromosome, start, end, the ``data`` value of the
    peak and a comma-separated list of the associated genes (``NA`` if the
    peak has none).

    ``gene_file`` and ``genome_file`` are names defined outside this function.
    """
    # Remember the bedgraph value of every peak, keyed by its coordinates.
    value = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            value[(region.chrom, region.initial, region.final)] = region.data

    for region in exp_matrix.get_regionsets():
        # The context manager closes the file even if a lookup below fails.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, _, _, gene_peaks_mapping = region_set.filter_by_gene_association(
                region.fileName, None, gene_file, genome_file, threshDist=2000)

            for peak, assigned in gene_peaks_mapping.items():
                # Peak keys have the form "<chrom>:<start>-<end>".
                chrom, raw_positions = peak.split(':')
                start, end = [int(x) for x in raw_positions.split('-')]

                # If a peak is not assigned, an empty string occurs; drop it.
                genes = [g for g in assigned if g != ""]
                gene_names = ','.join(genes) if genes else 'NA'

                print(chrom, start, end, value[(chrom, start, end)], gene_names, sep='\t', file=f)
示例#2
0
    def get_biotypes(self, gene_set=None):
        """Collect all exon annotations into one GenomicRegionSet.

        *Keyword arguments:*

            - gene_set -- Only selects the shape of the return value; it is not used to narrow the query.

        *Return:*

            - result_grs -- A GenomicRegionSet named "exon"; every region carries its transcript ID as name.
            - None -- Returned as second element only when gene_set is given (no gene-name mapping happens here).
        """
        exon_annotations = self.get({self.GeneField.FEATURE_TYPE: "exon"})

        result_grs = GenomicRegionSet("exon")
        for entry in exon_annotations.gene_list:
            exon = entry[self.GeneField.GENOMIC_REGION]
            exon.name = entry[self.GeneField.TRANSCRIPT_ID]
            result_grs.add(exon)

        if not gene_set:
            return result_grs
        # Gene names are never mapped in this method, so there is no unmapped list.
        return result_grs, None
示例#3
0
    def merge_rbs(self, rm_duplicate=False, asgene_organism=None, region_set=None, cutoff=0):
        """Merge overlapping RNA binding regions and group their DNA binding regions.

        The result is stored in ``self.merged_dict``: an OrderedDict whose keys are
        the merged RNA binding regions and whose values are GenomicRegionSets with
        the DNA binding regions of every entry whose RNA region overlaps the key.

        *Keyword arguments:*

            - rm_duplicate -- Call remove_duplicates() on every collected set (default = False).
            - asgene_organism -- If given, replace every kept set by the result of its gene_association(organism=...).
            - region_set -- If given, call replace_region_name(regions=region_set) on every kept set.
            - cutoff -- Keep only merged regions with more than this many DNA binding regions (default = 0).
        """
        # Merge RBS
        rna_merged = self.get_rbs()
        rna_merged.merge()
        # A dict: RBS as key, and GenomicRegionSet as its value
        new_dict = OrderedDict()

        for rbsm in rna_merged:
            regions = GenomicRegionSet(rbsm.toString())

            for rd in self:
                if rbsm.overlap(rd.rna):
                    regions.add(rd.dna)
            if rm_duplicate:
                regions.remove_duplicates()
            if len(regions) <= cutoff:
                continue

            new_dict[rbsm] = regions
            if asgene_organism:
                try:
                    new_dict[rbsm] = regions.gene_association(organism=asgene_organism)
                except Exception:
                    # Best effort: keep the set without gene association. Unlike a bare
                    # except this lets KeyboardInterrupt and SystemExit propagate.
                    pass
            if region_set:
                new_dict[rbsm].replace_region_name(regions=region_set)

        self.merged_dict = new_dict
示例#4
0
def mode_2(exp_matrix):
    """Write one line per peak with its value and its associated genes.

    For every region set returned by ``exp_matrix.get_regionsets()`` a file
    ``region_<name>.data`` is written to the current directory. Each line
    holds, tab-separated: chromosome, start, end, the ``data`` value of the
    peak and a comma-separated list of the associated genes (``NA`` if the
    peak has none).

    ``gene_file`` and ``genome_file`` are names defined outside this function.
    """
    # Remember the bedgraph value of every peak, keyed by its coordinates.
    value = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            value[(region.chrom, region.initial, region.final)] = region.data

    for region in exp_matrix.get_regionsets():
        # The context manager closes the file even if a lookup below fails.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, _, _, gene_peaks_mapping = region_set.filter_by_gene_association(
                region.fileName, None, gene_file, genome_file, thresh_dist=2000)

            for peak, assigned in gene_peaks_mapping.items():
                # Peak keys have the form "<chrom>:<start>-<end>".
                chrom, raw_positions = peak.split(':')
                start, end = [int(x) for x in raw_positions.split('-')]

                # If a peak is not assigned, an empty string occurs; drop it.
                genes = [g for g in assigned if g != ""]
                # Must not be called "list": a local of that name would hide the
                # builtin for the whole function body.
                gene_names = ','.join(genes) if genes else 'NA'

                print(chrom, start, end, value[(chrom, start, end)], gene_names, sep='\t', file=f)
示例#5
0
    def get_biotypes(self, gene_set=None):
        """Return every exon annotation as a GenomicRegionSet.

        *Keyword arguments:*

            - gene_set -- Does not filter the query; it only decides whether a tuple is returned.

        *Return:*

            - result_grs -- A GenomicRegionSet named "exon" whose regions are named after their transcript ID.
            - None -- Second tuple element, present only when gene_set is given.
        """
        # Fetching exons
        exon_query = {self.GeneField.FEATURE_TYPE: "exon"}
        annotations = self.get(exon_query)

        # Creating GenomicRegionSet
        result_grs = GenomicRegionSet("exon")
        for annotation in annotations.gene_list:
            region = annotation[self.GeneField.GENOMIC_REGION]
            region.name = annotation[self.GeneField.TRANSCRIPT_ID]
            result_grs.add(region)

        # No gene names are mapped here, hence the constant None.
        return (result_grs, None) if gene_set else result_grs
示例#6
0
def mode_3(exp_matrix):
    """Write, per region set, the average peak score of every associated gene.

    For every region set returned by ``exp_matrix.get_regionsets()`` a file
    ``region_<name>.data`` is written to the current directory. Each line
    holds, tab-separated: gene, mean of the scores of all peaks assigned to
    that gene, and the ", "-joined peaks (formatted "<chrom>:<start>-<end>").

    ``gene_file`` and ``genome_file`` are names defined outside this function.
    """
    # Remember the bedgraph value of every peak, keyed like the peak names
    # used in the gene/peak mapping below.
    score = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            score[region.chrom + ':' + str(region.initial) + '-' + str(region.final)] = region.data

    for region in exp_matrix.get_regionsets():
        # The context manager closes the file even if a lookup below fails.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, _, _, gene_peaks_mapping = region_set.filter_by_gene_association(
                region.fileName, None, gene_file, genome_file, thresh_dist=2000)

            avg_score = {}  # scores of all peaks assigned to a gene
            genes = {}  # peaks assigned to a gene

            # Reverse the mapping peak -> genes into gene -> peaks.
            for peak, gene_list in gene_peaks_mapping.items():
                for gen in gene_list:
                    if not gen:
                        continue
                    genes.setdefault(gen, set()).add(peak)
                    avg_score.setdefault(gen, []).append(score[peak])

            for gen, peaks in genes.items():
                values = [float(x) for x in avg_score[gen]]
                avg = sum(values) / float(len(values))
                print(gen, avg, ", ".join(str(t) for t in peaks), sep='\t', file=f)
示例#7
0
    def get_transcripts(self, gene_set = None):
        """Gets the exon annotations of genes as a GenomicRegionSet. The transcript ID of each annotation is put in the NAME field of its GenomicRegion.

        *Keyword arguments:*

            - gene_set -- A set of genes to narrow the search.

        *Return:*

            - result_grs -- A GenomicRegionSet (named "exon") containing the regions.
            - unmapped_gene_list -- Second value returned by fix_gene_names; only returned when gene_set is given.
        """
        # The query always asks for exons and is narrowed to the mapped genes if requested.
        query = {self.GeneField.FEATURE_TYPE: "exon"}
        unmapped_gene_list = None
        if gene_set:
            mapped, unmapped_gene_list = self.fix_gene_names(gene_set)
            query[self.GeneField.GENE_ID] = mapped
        annotations = self.get(query)

        # Creating GenomicRegionSet
        result_grs = GenomicRegionSet("exon")
        for annotation in annotations.gene_list:
            region = annotation[self.GeneField.GENOMIC_REGION]
            region.name = annotation[self.GeneField.TRANSCRIPT_ID]
            result_grs.add(region)

        if not gene_set:
            return result_grs
        return result_grs, unmapped_gene_list
示例#8
0
    def get_genes(self, gene_set = None):
        """
        Gets regions of genes.
        It returns a merged GenomicRegionSet with such genes. The id of each gene is put
        in the NAME field of each GenomicRegion before merging.

        Keyword arguments:
        gene_set -- A set of genes to narrow the search.

        Return:
        result_grs -- A GenomicRegionSet (named "genes") containing the genes.
        unmapped_gene_list -- Second value returned by fix_gene_names; only returned when gene_set is given.
        """
        # The query always asks for genes and is narrowed to the mapped genes if requested.
        query = {self.GeneField.FEATURE_TYPE: "gene"}
        unmapped_gene_list = None
        if gene_set:
            mapped, unmapped_gene_list = self.fix_gene_names(gene_set)
            query[self.GeneField.GENE_ID] = mapped
        annotations = self.get(query)

        # Creating GenomicRegionSet
        result_grs = GenomicRegionSet("genes")
        for annotation in annotations.gene_list:
            region = annotation[self.GeneField.GENOMIC_REGION]
            region.name = annotation[self.GeneField.GENE_ID]
            result_grs.add(region)
        result_grs.merge()

        if not gene_set:
            return result_grs
        return result_grs, unmapped_gene_list
示例#9
0
    def get_dbs(self,
                sort=False,
                orientation=None,
                rm_duplicate=False,
                dbd_tag=False):
        """Return a GenomicRegionSet with a copy of every DNA binding site.

        *Keyword arguments:*

            - sort -- Sort the resulting set (default = False).
            - orientation -- If given, keep only entries whose ``orient`` equals it.
            - rm_duplicate -- Remove duplicates from the resulting set (default = False).
            - dbd_tag -- Name every site after ``rd.rna.str_rna()`` instead of the DNA region's own name.
        """
        dna_set = GenomicRegionSet(name="DNA_binding_sites")
        if len(self) == 0:
            return dna_set

        for rd in self.sequences:
            # Only the name differs between the tagged and the plain variant;
            # the score of the entry is stored in the data field.
            site_name = rd.rna.str_rna() if dbd_tag else rd.dna.name
            dbs = GenomicRegion(chrom=rd.dna.chrom,
                                initial=rd.dna.initial,
                                final=rd.dna.final,
                                name=site_name,
                                orientation=rd.dna.orientation,
                                data=rd.score)

            if not orientation or orientation == rd.orient:
                dna_set.add(dbs)

        if sort:
            dna_set.sort()
        if rm_duplicate:
            dna_set.remove_duplicates()
        return dna_set
示例#10
0
def mode_1(exp_matrix):
    """Print the number of mapped genes and the associated genes of every region set.

    For each region set of ``exp_matrix`` two lines go to stdout: the number of
    mapped genes reported by ``filter_by_gene_association`` and a tab-separated
    line with the region set's name followed by ``region_set.genes``.

    ``gene_file`` and ``genome_file`` are names defined outside this function.
    """
    for current in exp_matrix.get_regionsets():
        associated = GenomicRegionSet("")
        _, _, mapped_count, _, _ = associated.filter_by_gene_association(
            current.fileName, None, gene_file, genome_file, thresh_dist=50000)
        print('#number of mapped genes:', mapped_count)

        gene_columns = "\t".join(associated.genes)
        print(current.name + "\t" + gene_columns)
示例#11
0
def mode_3(exp_matrix):
    """Write, per region set, the average peak score of every associated gene.

    For every region set returned by ``exp_matrix.get_regionsets()`` a file
    ``region_<name>.data`` is written to the current directory. Each line
    holds, tab-separated: gene, mean of the scores of all peaks assigned to
    that gene, and the ", "-joined peaks (formatted "<chrom>:<start>-<end>").

    ``gene_file`` and ``genome_file`` are names defined outside this function.
    """
    # Remember the bedgraph value of every peak, keyed like the peak names
    # used in the gene/peak mapping below.
    score = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            score[region.chrom + ':' + str(region.initial) + '-' + str(region.final)] = region.data

    for region in exp_matrix.get_regionsets():
        # The context manager closes the file even if a lookup below fails.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, _, _, gene_peaks_mapping = region_set.filter_by_gene_association(
                region.fileName, None, gene_file, genome_file, threshDist=2000)

            avg_score = {}  # scores of all peaks assigned to a gene
            genes = {}  # peaks assigned to a gene

            # Reverse the mapping peak -> genes into gene -> peaks.
            for peak, gene_list in gene_peaks_mapping.items():
                for gen in gene_list:
                    if not gen:
                        continue
                    genes.setdefault(gen, set()).add(peak)
                    avg_score.setdefault(gen, []).append(score[peak])

            for gen, peaks in genes.items():
                values = [float(x) for x in avg_score[gen]]
                avg = sum(values) / float(len(values))
                print(gen, avg, ", ".join(str(t) for t in peaks), sep='\t', file=f)
示例#12
0
 def region_sets(self,listA,listB):
     """Build self.setA and self.setB from two lists of (chrom, initial, final) entries."""
     for attr, entries in (("setA", listA), ("setB", listB)):
         region_set = GenomicRegionSet('for Unit Test')
         # Publish the set before filling it, one GenomicRegion per entry.
         setattr(self, attr, region_set)
         for entry in entries:
             region_set.add(GenomicRegion(chrom=entry[0], initial=entry[1], final=entry[2]))
示例#13
0
    def get_exons(self,
                  start_site=False,
                  end_site=False,
                  gene_set=None,
                  merge=True):
        """Gets exons of genes as a GenomicRegionSet. The transcript ID of each exon is put in the NAME field of its GenomicRegion.

        *Keyword arguments:*

            - start_site -- Relocate the regions with relocate_regions("leftend", ...); takes precedence over end_site.
            - end_site -- Relocate the regions with relocate_regions("rightend", ...).
            - gene_set -- A set of genes to narrow the search.
            - merge -- Merge the resulting set before returning it (default = True).

        *Return:*

            - result_grs -- A GenomicRegionSet containing the exons.
            - unmapped_gene_list -- Second value returned by fix_gene_names; only returned when gene_set is given.
        """
        # The query always asks for exons and is narrowed to the mapped genes if requested.
        query = {self.GeneField.FEATURE_TYPE: "exon"}
        unmapped_gene_list = None
        if gene_set:
            mapped, unmapped_gene_list = self.fix_gene_names(gene_set)
            query[self.GeneField.GENE_ID] = mapped
        annotations = self.get(query)

        # Creating GenomicRegionSet
        result_grs = GenomicRegionSet("exon")
        for annotation in annotations.gene_list:
            exon = annotation[self.GeneField.GENOMIC_REGION]
            exon.name = annotation[self.GeneField.TRANSCRIPT_ID]
            result_grs.add(exon)

        anchor = None
        if start_site:
            anchor = "leftend"
        elif end_site:
            anchor = "rightend"
        if anchor is not None:
            result_grs.relocate_regions(anchor, left_length=1, right_length=1)

        if merge:
            result_grs.merge()

        if not gene_set:
            return result_grs
        return result_grs, unmapped_gene_list
示例#14
0
    def load_objects(self, is_bedgraph, verbose=False, test=False):
        """Load the file of every entry into ``self.objectsDict``.

        Entries of type "regions" are loaded as GenomicRegionSet, entries of type
        "genes" as GeneSet; entries of any other type are skipped.

        *Keyword arguments:*

            - is_bedgraph -- Whether region files are read with read_bedgraph instead of read_bed.
            - verbose -- Report progress on stderr (default = False).
            - test -- For BED files keep only the leading regions (slice [0:11]) (default = False).
        """
        for idx, kind in enumerate(self.types):
            if verbose:
                print("Loading file ", self.files[self.names[idx]], file=sys.stderr)
                if kind not in ("regions", "genes"):
                    print("Cannot load objects", file=sys.stderr)

            if kind == "regions":
                name = self.names[idx]
                loaded = GenomicRegionSet(name)
                # Files are always opened through their absolute path.
                path = os.path.abspath(self.files[name])
                if is_bedgraph:
                    loaded.read_bedgraph(path)
                elif test:
                    # Read the complete file into a scratch set and keep its head only.
                    full = GenomicRegionSet(name)
                    full.read_bed(path)
                    loaded.sequences = full.sequences[0:11]
                else:
                    loaded.read_bed(path)
                self.objectsDict[name] = loaded

            elif kind == "genes":
                name = self.names[idx]
                gene_set = GeneSet(name)
                gene_set.read(os.path.abspath(self.files[name]))
                self.objectsDict[name] = gene_set
示例#15
0
    def get_promoters(self,
                      promoterLength=1000,
                      gene_set=None,
                      unmaplist=False):
        """
        Gets promoters of transcripts given a specific promoter length.
        It returns a GenomicRegionSet with such promoters. The ID of each gene will be put
        in the NAME field of each GenomicRegion. Each promoter includes also the coordinate of
        the 5' base pair, therefore each promoter actual length is promoterLength+1.
        The GenomicRegion objects of the queried annotations are modified in place.

        Keyword arguments:
        promoterLength -- The length of the promoter region.
        gene_set -- A set of genes to narrow the search.
        unmaplist -- If True, also return the unmapped gene list (default = False).

        Return:
        result_grs -- A GenomicRegionSet containing the promoters.
        unmapped_gene_list -- Second value returned by fix_gene_names (None without gene_set);
                              only returned when unmaplist is True.
        """
        # The query asks for transcripts and is narrowed to the mapped genes if requested.
        query = {self.GeneField.FEATURE_TYPE: "transcript"}
        unmapped_gene_list = None
        if gene_set:
            mapped, unmapped_gene_list = self.fix_gene_names(gene_set)
            query[self.GeneField.GENE_ID] = mapped
        annotations = self.get(query)

        # Creating GenomicRegionSet
        result_grs = GenomicRegionSet("promoters")
        for annotation in annotations.gene_list:
            gr = annotation[self.GeneField.GENOMIC_REGION]
            if gr.orientation == "+":
                # Forward strand: promoterLength bases before the start plus the start itself.
                start = gr.initial
                gr.final = start + 1
                gr.initial = start - promoterLength
            else:
                # Otherwise: the last base plus promoterLength bases after it.
                gr.initial = gr.final - 1
                gr.final = gr.initial + promoterLength + 1

            gr.name = annotation[self.GeneField.GENE_ID]
            result_grs.add(gr)

        if not unmaplist:
            return result_grs
        return result_grs, unmapped_gene_list
示例#16
0
 def get_dbs(self, sort=False, orientation=None, rm_duplicate=False):
     """Return a GenomicRegionSet with the DNA binding site of every entry.

     If orientation is given, only entries whose ``orient`` equals it are kept.
     The set is optionally sorted and stripped of duplicates before it is returned.
     """
     dna_set = GenomicRegionSet(name="DNA_binding_sites")
     for rd in self.sequences:
         # Without an orientation filter every site is kept.
         if not orientation or orientation == rd.orient:
             dna_set.add(rd.dna)
     if sort:
         dna_set.sort()
     if rm_duplicate:
         dna_set.remove_duplicates()
     return dna_set
示例#17
0
    def sort_dbs_by_regions(self, regionset):
        """Group the sorted DNA binding sites by the regions they overlap.

        *Keyword arguments:*

            - regionset -- GenomicRegionSet to group by; it is sorted in place if it is not sorted yet.

        *Return:*

            - result -- A dict mapping ``region.toString()`` to a GenomicRegionSet named
              ``"RBS_" + region.toString()`` holding the overlapping binding sites. Only
              regions visited by the sweep get an entry; the dict is empty if there are
              no binding sites or no regions.
        """
        dbss = self.get_dbs(sort=True)

        result = {}

        if not regionset.sorted: regionset.sort()

        iter_dbs = iter(dbss)
        # The builtin next() works on Python 2 and 3; None marks an exhausted iterator.
        dbs = next(iter_dbs, None)
        if dbs is None or len(regionset) == 0:
            return result

        def bucket(region):
            # Create the set of a region on first use only, so that sites collected
            # earlier are not discarded when the sweep visits the region again.
            key = region.toString()
            if key not in result:
                result[key] = GenomicRegionSet("RBS_" + key)
            return result[key]

        last_j = len(regionset) - 1
        j = 0
        pre_inter = 0  # first region of the current run of overlaps
        cont_overlap = False
        bucket(regionset[0])

        while dbs is not None:
            region = regionset[j]
            # When the regions overlap
            if dbs.overlap(region):
                bucket(region).add(dbs)

                if not cont_overlap: pre_inter = j
                if j == last_j:
                    dbs = next(iter_dbs, None)
                else:
                    j = j + 1
                    bucket(regionset[j])
                cont_overlap = True

            elif dbs < region:
                # Move on to the next site and rewind to the start of the last overlap run.
                dbs = next(iter_dbs, None)
                j = pre_inter
                cont_overlap = False

            elif dbs > region:
                if j == last_j:
                    break
                j = j + 1
                bucket(regionset[j])
                cont_overlap = False
        return result
示例#18
0
    def get_promoters(self, promoterLength=1000, gene_set=None, unmaplist=False, variants=False):
        """
        Gets promoters of genes given a specific promoter length. It returns a GenomicRegionSet with such promoters.
        The ID of each gene will be put in the NAME field of each GenomicRegion.
        Each promoter includes also the coordinate of the 5' base pair, therefore each promoter actual
        length is promoterLength+1. The GenomicRegion objects of the queried annotations are modified in place.

        *Keyword arguments:*

            - promoterLength -- The length of the promoter region.
            - gene_set -- A set of genes to narrow the search.
            - unmaplist -- If True than also return the unmappable genes list (default = False).
            - variants -- If True query "transcript" annotations instead of "gene" annotations (default = False).

        *Return:*

            - result_grs -- A GenomicRegionSet containing the promoters.
            - unmapped_gene_list -- Second value returned by fix_gene_names (None without gene_set); only returned when unmaplist is True.
        """
        # Fetching gene names
        unmapped_gene_list = None
        mapped_gene_list = None
        if gene_set:
            mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)

        # Fetching genes or, on request, their transcripts
        target = "transcript" if variants else "gene"
        query = {self.GeneField.FEATURE_TYPE: target}
        if gene_set:
            query[self.GeneField.GENE_ID] = mapped_gene_list
        annotations = self.get(query)

        # Creating GenomicRegionSet
        result_grs = GenomicRegionSet("promoters")
        for annotation in annotations.gene_list:
            gr = annotation[self.GeneField.GENOMIC_REGION]
            if gr.orientation == "+":
                # Forward strand: promoterLength bases before the start plus the start itself.
                start = gr.initial
                gr.final = start + 1
                gr.initial = start - promoterLength
            else:
                # Otherwise: the last base plus promoterLength bases after it.
                gr.initial = gr.final - 1
                gr.final = gr.initial + promoterLength + 1

            gr.name = annotation[self.GeneField.GENE_ID]
            result_grs.add(gr)

        if not unmaplist:
            return result_grs
        return result_grs, unmapped_gene_list
示例#19
0
    def region_sets(self, listA, listB):
        """Build self.setA and self.setB from two lists of (chrom, initial, final) entries."""

        def fill(target, rows):
            # One GenomicRegion per row, in the given order.
            for row in rows:
                target.add(GenomicRegion(chrom=row[0], initial=row[1], final=row[2]))

        self.setA = GenomicRegionSet('for Unit Test')
        fill(self.setA, listA)

        self.setB = GenomicRegionSet('for Unit Test')
        fill(self.setB, listB)
示例#20
0
    def match_ms_tags(self, field, test=False):
        """Add more entries to match the missing tags of the given field. For example, there are tags for cell like 'cell_A' and 'cell_B' for reads, but no such tags for regions. Then the regions are repeated for each tag from the reads to match all reads.

        Every entry listed under the tag "ALL" of the field is duplicated once per other tag as
        ``<name>_<tag>``; the original name is appended to ``self.trash`` for each duplicate.

        *Keyword arguments:*

            - field -- Field to add extra entries.
            - test -- If True, keep only the leading regions (slice [0:11]) of every region set created here (default = False).
        """
        # Copy the keys into a list: on Python 3 dict.keys() is a view without remove().
        altypes = list(self.fieldsDict[field].keys())
        # Nothing to do unless some entries carry the catch-all tag.
        if "ALL" not in altypes:
            return
        altypes.remove("ALL")

        for name in self.fieldsDict[field]["ALL"]:
            i = self.names.index(name)
            for t in altypes:
                n = name + "_" + t
                self.names.append(n)
                self.types.append(self.types[i])
                self.files[n] = self.files[name]

                for f in self.fields[3:]:
                    # In the matched field the copy gets tag t; in every other field it is
                    # filed under whatever get_type reports for the original entry.
                    tag = t if f == field else self.get_type(name=name, field=f)
                    self.fieldsDict[f].setdefault(tag, []).append(n)

                if self.types[i] == "regions":
                    g = GenomicRegionSet(n)
                    g.read_bed(self.files[name])
                    if test: g.sequences = g.sequences[0:11]
                    self.objectsDict[n] = g
                self.trash.append(name)
示例#21
0
 def test_get_genome_data(self):
     """Check how many regions get_genome_data loads for hg19."""
     # (extra keyword arguments, expected number of regions)
     cases = (
         ({}, 23),                 # hg19
         ({"chrom_M": True}, 24),  # hg19, with Mitochondria chromosome
     )
     for extra, expected in cases:
         result = GenomicRegionSet("hg19")
         result.get_genome_data(organism="hg19", **extra)
         self.assertEqual(len(result), expected)
示例#22
0
 def get_dbs(self, sort=False, orientation=None, rm_duplicate=False):
     """Collect the DNA binding sites of all entries in a GenomicRegionSet.

     Entries whose ``orient`` differs from a given orientation are skipped.
     sort and rm_duplicate trigger sort() and remove_duplicates() on the result.
     """
     dna_set = GenomicRegionSet(name="DNA_binding_sites")
     for rd in self.sequences:
         if orientation and orientation != rd.orient:
             # Filtered out by the requested orientation.
             continue
         dna_set.add(rd.dna)
     if sort:
         dna_set.sort()
     if rm_duplicate:
         dna_set.remove_duplicates()
     return dna_set
示例#23
0
    def load_objects(self, is_bedgraph, verbose=False, test=False):
        """Load the file of every entry into ``self.objectsDict``.

        Entries of type "regions" are loaded as GenomicRegionSet, entries of type
        "genes" as GeneSet; entries of any other type are skipped.

        *Keyword arguments:*

            - is_bedgraph -- Whether region files are read with read_bedgraph instead of read_bed.
            - verbose -- Report progress on stderr (default = False).
            - test -- For BED files keep only the leading regions (slice [0:11]) (default = False).
        """
        for idx, kind in enumerate(self.types):
            if verbose:
                print("Loading file ", self.files[self.names[idx]], file=sys.stderr)
                if kind not in ("regions", "genes"):
                    print("Cannot load objects", file=sys.stderr)

            if kind == "regions":
                name = self.names[idx]
                loaded = GenomicRegionSet(name)
                # Files are always opened through their absolute path.
                path = os.path.abspath(self.files[name])
                if is_bedgraph:
                    loaded.read_bedgraph(path)
                else:
                    loaded.read_bed(path)
                    if test:
                        loaded.sequences = loaded.sequences[0:11]
                self.objectsDict[name] = loaded

            elif kind == "genes":
                name = self.names[idx]
                gene_set = GeneSet(name)
                gene_set.read(os.path.abspath(self.files[name]))
                self.objectsDict[name] = gene_set
示例#24
0
    def match_ms_tags(self,field):
        """Add more entries to match the missing tags of the given field. For example, there are tags for cell like 'cell_A' and 'cell_B' for reads, but no such tags for regions. Then the regions are repeated for each tag from the reads to match all reads.

        Every entry listed under the tag "ALL" of the field is duplicated once per other tag as
        ``<name>_<tag>``; the original name is appended to ``self.trash`` for each duplicate.

        *Keyword arguments:*

            - field -- Field to add extra entries.
        """
        # Copy the keys into a list: on Python 3 dict.keys() is a view without remove().
        altypes = list(self.fieldsDict[field].keys())
        # Nothing to do unless some entries carry the catch-all tag.
        if "ALL" not in altypes:
            return
        altypes.remove("ALL")

        for name in self.fieldsDict[field]["ALL"]:
            i = self.names.index(name)
            for t in altypes:
                n = name + "_" + t
                self.names.append(n)
                self.types.append(self.types[i])
                self.files[n] = self.files[name]

                for f in self.fields[3:]:
                    # In the matched field the copy gets tag t; in every other field it is
                    # filed under whatever get_type reports for the original entry.
                    tag = t if f == field else self.get_type(name=name, field=f)
                    self.fieldsDict[f].setdefault(tag, []).append(n)

                if self.types[i] == "regions":
                    g = GenomicRegionSet(n)
                    g.read_bed(self.files[name])
                    self.objectsDict[n] = g
                self.trash.append(name)
示例#25
0
def mode_3(exp_matrix, thresh, type_file):
    """Write, for every region set, the aggregated peak score of each gene.

    A score is first recorded for every peak of every region set, keyed by
    ``"chrom:initial-final"``. Each region set is then associated with genes
    and one ``region_<name>.data`` file is written in the current directory,
    holding one tab-separated line per gene: gene, aggregated score and the
    comma-separated peaks assigned to it.

    *Keyword arguments:*

        - exp_matrix -- Object providing ``get_regionsets()``.
        - thresh -- Passed as ``threshDist`` to the gene association.
        - type_file -- "ODIN" or "THOR" selects how the score is parsed from
          ``region.data``; any other value stores ``region.data`` unchanged.

    Relies on the module-level names ``gene_file``, ``genome_file`` and
    ``options`` (``options.metric`` must be 'mean' or 'max').
    """
    # Remember the score of each peak (bedgraph value), keyed by position.
    score = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            key = region.chrom + ':' + str(region.initial) + '-' + str(region.final)
            if type_file == "ODIN":
                # Last ';'-separated field of the last tab-separated column.
                score[key] = float(region.data.split("\t")[-1].split(";")[-1])
            elif type_file == "THOR":
                # Last ';'-separated field of the whole data string.
                score[key] = float(region.data.split(";")[-1])
            else:
                score[key] = region.data

    for i, region in enumerate(exp_matrix.get_regionsets()):
        # The context manager closes the file even if the association fails.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, mappedGenes, _, gene_peaks_mapping = region_set.filter_by_gene_association_old(
                region.fileName, None, gene_file, genome_file, threshDist=thresh)

            avg_score = {}  # gene -> scores of all peaks assigned to it
            genes = {}      # gene -> set of peaks assigned to it

            print('Consider row %s of exp. matrix, number of mapped genes is %s' %(i, mappedGenes), file=sys.stderr)
            # Reverse the mapping from peak -> genes to gene -> peaks.
            for peak, gene_list in gene_peaks_mapping.items():
                for gen in gene_list:
                    if not gen:
                        # Unassigned peaks show up as an empty gene name.
                        continue
                    genes.setdefault(gen, set()).add(peak)
                    avg_score.setdefault(gen, []).append(score[peak])

            for gen in genes:
                if options.metric == 'mean':
                    avg = np.mean(avg_score[gen])
                elif options.metric == 'max':
                    avg = np.max(avg_score[gen])
                else:
                    raise ValueError("Unsupported metric: %s" % options.metric)
                print(gen, avg, ", ".join(str(t) for t in genes[gen]), sep='\t', file = f)
示例#26
0
    def get_tts(self, gene_set=None):
        """Gets TTS (transcription termination site) of genes. It returns a GenomicRegionSet with such TTS. The ID of each gene will be put in the NAME field of each GenomicRegion.

        The TTS is the 1-bp region at the 3' end of the gene: the last position
        for genes on the "+" strand and the first position for all other genes.

        *Keyword arguments:*

            - gene_set -- A set of genes to narrow the search.

        *Return:*

            - result_grs -- A GenomicRegionSet containing TTS.
            - unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID
              (only returned when gene_set is given).
        """

        # Fetching gene names and building the query
        unmapped_gene_list = None
        if gene_set:
            mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)
            query_dictionary = {self.GeneField.FEATURE_TYPE: "gene", self.GeneField.GENE_ID: mapped_gene_list}
        else:
            query_dictionary = {self.GeneField.FEATURE_TYPE: "gene"}
        query_annset = self.get(query_dictionary)

        # Creating GenomicRegionSet
        result_grs = GenomicRegionSet("TTS")
        for e in query_annset.gene_list:
            # NOTE(review): the region taken from the annotation entry is
            # modified in place, as before -- confirm callers expect that.
            gr = e[self.GeneField.GENOMIC_REGION]
            if gr.orientation == "+":
                # Transcription ends at the right end on the forward strand.
                gr.initial = gr.final - 1
            else:
                # Transcription ends at the left end on the reverse strand.
                gr.final = gr.initial + 1
            gr.name = e[self.GeneField.GENE_ID]
            result_grs.add(gr)
        result_grs.merge()
        if gene_set:
            return result_grs, unmapped_gene_list
        else:
            return result_grs
示例#27
0
    def merge_rbs(self, rm_duplicate=False, asgene_organism=None, cutoff=0):
        """Merge the RNA binding regions which overlap each other and
           combine their corresponding DNA binding regions.

        The result is stored in ``self.merged_dict``: an OrderedDict with each
        merged RNA binding site as key and the set of DNA regions bound by the
        overlapping entries as value.

        rm_duplicate -> Remove duplicated DNA binding regions of each merged site
        asgene_organism -> If given, replace each set by its gene association
                           for this organism
        cutoff -> Keep only merged sites with more than this many DNA regions
        """
        # Merge RBS
        rna_merged = self.get_rbs()
        rna_merged.merge()
        # A dict: RBS as key, and GenomicRegionSet as its value
        new_dict = OrderedDict()

        for rbsm in rna_merged:
            regions = GenomicRegionSet(rbsm.toString())

            for rd in self:
                if rbsm.overlap(rd.rna):
                    regions.add(rd.dna)
            if rm_duplicate:
                regions.remove_duplicates()
            if len(regions) <= cutoff:
                continue

            if asgene_organism:
                try:
                    regions = regions.gene_association(organism=asgene_organism)
                except Exception:
                    # Best effort: keep the unannotated regions. Only ordinary
                    # errors are caught so Ctrl-C and sys.exit still work.
                    print("* No annotation file for mapping associated genes.")
            new_dict[rbsm] = regions

        self.merged_dict = new_dict
示例#28
0
    def load_objects(self, is_bedgraph, verbose=False):
        """Read every registered file and store the resulting object.

        Entries of type "regions" are read into a GenomicRegionSet (as
        bedgraph when ``is_bedgraph`` is true, as BED otherwise) and entries
        of type "genes" into a GeneSet. Each object is stored in
        ``self.objectsDict`` under the entry name. Other types are ignored.

        With ``verbose`` the file of each entry, and a notice for entries that
        cannot be loaded, are printed to stderr.
        """
        for idx, kind in enumerate(self.types):
            if verbose:
                print("Loading file ",
                      self.files[self.names[idx]],
                      file=sys.stderr)
                if kind not in ["regions", "genes"]:
                    print("Cannot load objects", file=sys.stderr)

            if kind == "regions":
                entry = self.names[idx]
                loaded = GenomicRegionSet(entry)
                # The relative path is turned into an absolute one before reading.
                source = os.path.abspath(self.files[entry])
                if is_bedgraph:
                    loaded.read_bedgraph(source)
                else:
                    loaded.read_bed(source)
                self.objectsDict[entry] = loaded
            elif kind == "genes":
                entry = self.names[idx]
                loaded = GeneSet(entry)
                # The relative path is turned into an absolute one before reading.
                loaded.read(os.path.abspath(self.files[entry]))
                self.objectsDict[entry] = loaded
示例#29
0
    def get_exons(self, start_site=False, end_site=False, gene_set=None, merge=True):
        """Gets exons of genes. It returns a GenomicRegionSet with such exons. The transcript id of each exon will be put in the NAME field of each GenomicRegion.

        *Keyword arguments:*

            - start_site -- Whether to relocate the regions to their left end.
            - end_site -- Whether to relocate the regions to their right end (only used when start_site is false).
            - gene_set -- A set of genes to narrow the search.
            - merge -- Whether to merge the resulting regions.

        *Return:*

            - result_grs -- A GenomicRegionSet containing the exons.
            - unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID
              (only returned when gene_set is given).
        """

        # Build the exon query, restricted to the mapped genes when requested
        if gene_set:
            mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)
            query_dictionary = {self.GeneField.FEATURE_TYPE: "exon", self.GeneField.GENE_ID: mapped_gene_list}
        else:
            unmapped_gene_list = None
            query_dictionary = {self.GeneField.FEATURE_TYPE: "exon"}
        exon_entries = self.get(query_dictionary).gene_list

        # Collect the exon regions, named after their transcript
        result_grs = GenomicRegionSet("exon")
        for entry in exon_entries:
            exon = entry[self.GeneField.GENOMIC_REGION]
            exon.name = entry[self.GeneField.TRANSCRIPT_ID]
            result_grs.add(exon)

        # Optional relocation; start_site takes precedence over end_site
        if start_site:
            result_grs.relocate_regions("leftend", left_length=1, right_length=1)
        elif end_site:
            result_grs.relocate_regions("rightend", left_length=1, right_length=1)

        if merge:
            result_grs.merge()

        if not gene_set:
            return result_grs
        return result_grs, unmapped_gene_list
示例#30
0
 def test_get_genome_data(self):
     """Check how many regions get_genome_data loads for hg19."""
     # First without extra options, then including the mitochondrial chromosome.
     cases = [({}, 23), ({"chrom_M": True}, 24)]
     for options, expected in cases:
         genome = GenomicRegionSet("hg19")
         genome.get_genome_data(organism="hg19", **options)
         self.assertEqual(len(genome), expected)
示例#31
0
    def get_dbs(self, sort=False, orientation=None, rm_duplicate=False, dbd_tag=False):
        """Return a GenomicRegionSet with the DNA binding site of every entry.

        sort -> Sort the resulting set
        orientation -> If given, keep only entries whose ``orient`` equals it
        rm_duplicate -> Remove duplicated regions from the result
        dbd_tag -> Name each region after its RNA site (``rna.str_rna()``)
                   instead of the name of the DNA region
        """
        dna_set = GenomicRegionSet(name="DNA_binding_sites")
        if not len(self):
            return dna_set

        for rd in self.sequences:
            site = rd.dna
            label = rd.rna.str_rna() if dbd_tag else site.name
            dbs = GenomicRegion(chrom=site.chrom, initial=site.initial, final=site.final,
                                name=label, orientation=site.orientation,
                                data=rd.score)
            # Without an orientation filter every site is kept.
            if not orientation or orientation == rd.orient:
                dna_set.add(dbs)

        if sort:
            dna_set.sort()
        if rm_duplicate:
            dna_set.remove_duplicates()
        return dna_set
示例#32
0
    def merge_by(self, rbss, rm_duplicate=False, asgene_organism=False):
        """Merge the RNA Binding Sites by the given list of Binding sites.

        For every site in ``rbss`` the DNA regions of all entries whose RNA
        site overlaps it are collected. The result is stored in
        ``self.merged_dict``, an OrderedDict keyed by the given sites.

        rbss -> Iterable of binding sites used as merge targets
        rm_duplicate -> Remove duplicated DNA regions of each site
        asgene_organism -> If given, replace each set by its gene association
                           for this organism
        """
        new_dict = OrderedDict()

        for rbsm in rbss:
            regions = GenomicRegionSet(rbsm.toString())

            for rd in self:
                if rbsm.overlap(rd.rna):
                    regions.add(rd.dna)

            if rm_duplicate:
                regions.remove_duplicates()
            if asgene_organism:
                try:
                    regions = regions.gene_association(organism=asgene_organism)
                except Exception:
                    # Best effort: keep the unannotated regions. Only ordinary
                    # errors are caught so Ctrl-C and sys.exit still work.
                    print("* No annotation file for mapping associated genes.")
            new_dict[rbsm] = regions
        self.merged_dict = new_dict
示例#33
0
    def get_tts(self, gene_set=None):
        """Gets TTS (transcription termination site) of genes. It returns a GenomicRegionSet with such TTS. The ID of each gene will be put in the NAME field of each GenomicRegion.

        The TTS is the 1-bp region at the 3' end of the gene: the last position
        for genes on the "+" strand and the first position for all other genes.

        *Keyword arguments:*

            - gene_set -- A set of genes to narrow the search.

        *Return:*

            - result_grs -- A GenomicRegionSet containing TTS.
            - unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID
              (only returned when gene_set is given).
        """

        # Fetching gene names and building the query
        unmapped_gene_list = None
        if gene_set:
            mapped_gene_list, unmapped_gene_list = self.fix_gene_names(
                gene_set)
            query_dictionary = {
                self.GeneField.FEATURE_TYPE: "gene",
                self.GeneField.GENE_ID: mapped_gene_list
            }
        else:
            query_dictionary = {self.GeneField.FEATURE_TYPE: "gene"}
        query_annset = self.get(query_dictionary)

        # Creating GenomicRegionSet
        result_grs = GenomicRegionSet("TTS")
        for e in query_annset.gene_list:
            # NOTE(review): the region taken from the annotation entry is
            # modified in place, as before -- confirm callers expect that.
            gr = e[self.GeneField.GENOMIC_REGION]
            if gr.orientation == "+":
                # Transcription ends at the right end on the forward strand.
                gr.initial = gr.final - 1
            else:
                # Transcription ends at the left end on the reverse strand.
                gr.final = gr.initial + 1
            gr.name = e[self.GeneField.GENE_ID]
            result_grs.add(gr)
        result_grs.merge()
        if gene_set:
            return result_grs, unmapped_gene_list
        else:
            return result_grs
示例#34
0
    def get_genes(self, gene_set=None):
        """
        Collects the regions of genes into a merged GenomicRegionSet.
        The id of each gene is written to the NAME field of its GenomicRegion.

        Keyword arguments:
        gene_set -- A set of genes to narrow the search.

        Return:
        result_grs -- A GenomicRegionSet containing the genes.
        unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID
                              (only returned when gene_set is given).
        """

        # Build the gene query, restricted to the mapped genes when requested
        if gene_set:
            mapped_gene_list, unmapped_gene_list = self.fix_gene_names(
                gene_set)
            query_dictionary = {
                self.GeneField.FEATURE_TYPE: "gene",
                self.GeneField.GENE_ID: mapped_gene_list
            }
        else:
            unmapped_gene_list = None
            query_dictionary = {self.GeneField.FEATURE_TYPE: "gene"}
        gene_entries = self.get(query_dictionary).gene_list

        # Collect the gene regions, named after their gene id
        result_grs = GenomicRegionSet("genes")
        for entry in gene_entries:
            region = entry[self.GeneField.GENOMIC_REGION]
            region.name = entry[self.GeneField.GENE_ID]
            result_grs.add(region)
        result_grs.merge()

        if not gene_set:
            return result_grs
        return result_grs, unmapped_gene_list
示例#35
0
def _region_tag(region, start, end):
    """Build the ``REGION_<chrom>_<start>_<end>[_<strand>]`` FASTA header tag."""
    parts = ["REGION", region.chrom, str(start), str(end)]
    # Regions without strand information get a tag without the strand suffix.
    if region.orientation is not None:
        parts.append(region.orientation)
    return "_".join(parts)


def _write_region_group(path, regions, genome):
    """Write the sequence of every region in *regions* to one FASTA file."""
    with open(path, "w") as f:
        for g in regions:
            print(">" + " ".join([g.name, _region_tag(g, g.initial, g.final)]), file=f)
            print(genome.fetch(g.chrom, g.initial, g.final), file=f)


def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequences of the transcripts and write them as FASTA.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    When the first record carries no block information, the regions are
    grouped by name instead and every group is written to one file.

    Arguments:
        bed -- Path of the BED file with the transcripts.
        directory -- Output directory; created when it does not exist.
        genome_path -- Path of the genome FASTA file opened with pysam.

    Output:
        Each FASTA file represents a transcript and contains all the exons
        within the file. Files are named ``<transcript name>.fa``.

    """
    from itertools import groupby

    regionset = GenomicRegionSet("bed")
    regionset.read_bed(bed)
    regionset.sort()

    genome = pysam.Fastafile(genome_path)

    # Block information is present when the first record has 7 extra columns.
    # An empty set or a record without data simply has no block information.
    try:
        first_fields = regionset.sequences[0].data.split("\t")
    except (IndexError, AttributeError):
        first_fields = []
    blockinfor = len(first_fields) == 7

    if blockinfor:
        for gr in regionset:
            if not gr.name:
                print("Error: For fetching exon sequences, please define the transcript name.")
                sys.exit()
            if not os.path.exists(directory):
                os.makedirs(directory)
            with open(os.path.join(directory, gr.name+".fa"), "w") as f:
                data = gr.data.split("\t")
                if len(data) != 7:
                    print("Warning: The given regions have no block information, please try write_bed_blocks")
                    continue

                n = int(data[4])
                blocks = [ int(b) for b in filter(None, data[5].split(",")) ]
                starts = [ int(s) for s in filter(None, data[6].split(",")) ]
                minus = gr.orientation == "-"
                entries = []

                for i in range(n):
                    start = gr.initial + starts[i]
                    end = start + blocks[i]
                    # Exons are numbered along the direction of transcription.
                    ex = "exon:"+str(n-i if minus else i+1)
                    header = ">"+ " ".join([gr.name, ex, _region_tag(gr, start, end)])
                    # NOTE(review): the fetch window is shifted by -1 relative
                    # to the block coordinates, as before -- confirm this
                    # matches the coordinate convention of read_bed.
                    seq = genome.fetch(gr.chrom, start-1, end-1)
                    if minus:
                        seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
                    entries.append((header, seq))

                # Reverse-strand transcripts are written from exon 1 onwards.
                if minus:
                    entries.reverse()
                for header, seq in entries:
                    print(header, file=f)
                    print(seq, file=f)
    else:
        # Unnamed regions are named after their position before sorting, so
        # the sort never has to compare a missing name with a string.
        for gr in regionset:
            if not gr.name:
                gr.name = gr.toString()
        regionset.sequences.sort(key=lambda g: g.name)

        if not os.path.exists(directory):
            os.makedirs(directory)

        # After sorting, regions of one transcript are adjacent.
        for tx_name, members in groupby(regionset, key=lambda g: g.name):
            _write_region_group(os.path.join(directory, tx_name+".fa"), members, genome)
    annotation_path = args[2]
    outputdir = args[3]
    
    
#     experimental_matrix_file = "/home/manuel/workspace/cluster_p/THOR/exp/exp23_macs2_payton/1"
#     gene_exp = "/home/manuel/workspace/cluster_p/allhoff/project_THOR/data/payton/gene_expression/CCmean.data"
#     annotation_path = "/home/manuel/workspace/cluster_h/rgtdata/hg19/"
#     outputdir = "/home/manuel/test/"
    
    exps = ExperimentalMatrix()
    exps.read(experimental_matrix_file)
    regionsets = exps.get_regionsets()
    
    genome_file = annotation_path + "/chrom.sizes"
    gene_file = annotation_path + "/association_file.bed"
    
    genes = GeneSet("Expression")
    genes.read_expression(gene_exp)
    
    for region in regionsets:
        bedNew = GenomicRegionSet("")
        [degenes, de_peak_genes, mappedGenes, totalPeaks, regionsToGenes] \
        = bedNew.filter_by_gene_association_old(region.fileName, genes.genes, gene_file, genome_file, threshDist=options.dist)
        
        [ct, labels] = averageExpression(region, genes, regionsToGenes)
        fileName = path.splitext(path.basename(region.fileName))[0]
	output(genes.cond, labels, ct, path.join(outputdir, fileName + ".txt"))
        
        

示例#37
0
class TestGenomicRegionSet(unittest.TestCase):
    
    def region_sets(self,listA,listB):
        """ Build self.setA and self.setB from two lists of [chrom, initial, final] entries. """
        def build(entries):
            # One GenomicRegion per entry, added in the given order.
            regionset = GenomicRegionSet('for Unit Test')
            for entry in entries:
                regionset.add(GenomicRegion(chrom=entry[0], initial=entry[1], final=entry[2]))
            return regionset

        self.setA = build(listA)
        self.setB = build(listB)
    
    def test_extend(self):
        """Check extend() on empty, single-region and multi-region sets.

        A is the input set and R the expected result in the sketches below.
        """
        # Empty set
        # A : none
        # R : none
        self.region_sets([],
                         [])
        self.setA.extend(100,100)
        self.assertEqual(len(self.setA.sequences), 0)

        # One region
        # A :   -----
        # R : ---------
        self.region_sets([['chr1',5,10]],
                         [])
        result = self.setA
        result.extend(4,4)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 14)

        # Many regions on one chromosome; extended regions are not merged
        # A :   -----   ------         -----    -----
        # R : --------=---------     ------------------
        self.region_sets([['chr1',5,10],['chr1',15,20],['chr1',40,50],['chr1',65,75]],
                         [])
        result = self.setA
        result.extend(5,5)
        expected = [(0, 15), (10, 25), (35, 55), (60, 80)]
        self.assertEqual(len(result), 4)
        for idx, (initial, final) in enumerate(expected):
            self.assertEqual(result[idx].initial, initial)
            self.assertEqual(result[idx].final, final)

        # Many regions on different chromosomes; each keeps its chromosome
        # A :   -----   ------         -----    -----
        self.region_sets([['chr1',5,10],['chr2',15,20],['chr3',40,50],['chr4',65,75]],
                         [])
        result = self.setA
        result.extend(5,5)
        expected = [(0, 15, 'chr1'), (10, 25, 'chr2'), (35, 55, 'chr3'), (60, 80, 'chr4')]
        self.assertEqual(len(result), 4)
        for idx, (initial, final, chrom) in enumerate(expected):
            self.assertEqual(result[idx].initial, initial)
            self.assertEqual(result[idx].final, final)
            self.assertEqual(result[idx].chrom, chrom)

        # One region extended by a percentage of its length
        # A :   -----
        # R : ---------
        self.region_sets([['chr1',100,200]],
                         [])
        result = self.setA
        result.extend(10,10,percentage=True)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].initial, 90)
        self.assertEqual(result[0].final, 210)
        
    def test_sort(self):
        """Sorting must order the regions of one chromosome by position."""
        self.region_sets([['chr1',15,20],['chr1',40,50],['chr1',65,75],['chr1',5,10]],
                         [])
        self.setA.sort()
        # No region may be lost and the order must be ascending.
        self.assertEqual(len(self.setA), 4)
        self.assertEqual([r.initial for r in self.setA.sequences], [5, 15, 40, 65])
        self.assertEqual([r.final for r in self.setA.sequences], [10, 20, 50, 75])
    
    def test_intersect(self):
        """
        Two empty sets
        A : none 
        B : none
        R : none
        """
        self.region_sets([],
                         [])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 0)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 0)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 0)
        """
        One empty set
        A :   -----
        B : none
        R : none
        """
        self.region_sets([['chr1',5,10]],
                         [])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 0)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 0)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 0)
        """
        A : none
        B :   -----
        R : none
        """
        self.region_sets([],
                         [['chr1',5,10]])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 0)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 0)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 0)
        """
        No overlapping
        A : ------      ---------               ------- 
        B :        ----          ------  ------   
        R : none
        """
        self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]],
                         [['chr1',7,9],['chr1',20,25],['chr1',26,31]])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 0)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 0)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 0)
        """
        End-to-end attach
        A : ------      ------
        B :       ------
        R : none
        """
        self.region_sets([['chr1',1,5],['chr1',11,20]],
                         [['chr1',5,11]])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 0)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 0)

        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 0)
        """
        No length attach
        A : .      .
        B :    .   .
        R : none
        """
        self.region_sets([['chr1',2,2],['chr1',20,20]],
                         [['chr1',5,5],['chr1',20,20]])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 0)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 0)

        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 0)
        
        """
        Perfect overlapping
        A : ------
        B : ------
        R : ------
        """
        self.region_sets([['chr1',1,10],['chr1',500,550],['chr1',600,650],['chr1',700,750],['chr1',725,800]],
                         [['chr1',1,10],['chr1',500,550],['chr1',600,650],['chr1',700,750],['chr1',725,800]])
        result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP, rm_duplicates=True)
        
        self.assertEqual(len(result), 4)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 10)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 10)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 10)
        """
        One overlapping region
        A : ------
        B :     --------
        R1:     --       (overlap)
        R2: ------       (original)
        R3:              (comp_incl)
        """

        self.region_sets([['chr1',1,10]],
                         [['chr1',7,20]])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].initial, 7)
        self.assertEqual(result[0].final, 10)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 10)

        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 0)
        """
        Two simple overlapping regions
        A : -------      --------
        B :     -------------
        R1:     ---      ----     (overlap)
        R2: -------      -------- (original)
        R3:                       (comp_incl)
        """
        self.region_sets([['chr1',1,10],['chr1',26,35]],
                         [['chr1',7,30]])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 2)
        self.assertEqual(result[0].initial, 7)
        self.assertEqual(result[0].final, 10)
        self.assertEqual(result[1].initial, 26)
        self.assertEqual(result[1].final, 30)

        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 2)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 10)
        self.assertEqual(result[1].initial, 26)
        self.assertEqual(result[1].final, 35)

        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 0)
        """
        Two separately overlapping regions 
        A : -------      --------
        B :     -----        --------
        R1:     ---          ----     (overlap)
        R2: -------      --------     (original)
        R3:                           (comp_incl)
        """
        self.region_sets([['chr1',1,10],['chr1',26,35]],
                         [['chr1',7,15],['chr1',30,40]])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 2)
        self.assertEqual(result[0].initial, 7)
        self.assertEqual(result[0].final, 10)
        self.assertEqual(result[1].initial, 30)
        self.assertEqual(result[1].final, 35)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 2)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 10)
        self.assertEqual(result[1].initial, 26)
        self.assertEqual(result[1].final, 35)

        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 0)
        """
        Many various overlapping (mixed)
        A :   ------------------            --------   ---------
        B : ----   -------    ------            ----------      
        R1:   --   -------    --                ----   ---       (overlap)
        R2:   ------------------            --------   --------- (original)
        R3:                                                      (comp_incl) 
        """

        self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]],
                         [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]])

        result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].initial, 3)
        self.assertEqual(result[0].final, 5)
        self.assertEqual(result[1].initial, 10)
        self.assertEqual(result[1].final, 19)
        self.assertEqual(result[2].initial, 27)
        self.assertEqual(result[2].final, 30)
        self.assertEqual(result[3].initial, 55)
        self.assertEqual(result[3].final, 60)
        self.assertEqual(result[4].initial, 70)
        self.assertEqual(result[4].final, 75)

        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 3)
        self.assertEqual(result[0].initial, 3)
        self.assertEqual(result[0].final, 30)
        self.assertEqual(result[1].initial, 50)
        self.assertEqual(result[1].final, 60)
        self.assertEqual(result[2].initial, 70)
        self.assertEqual(result[2].final, 85)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 0)
        """
        Different chromosomes
        A : chr1  -------
        B : chr2  -------
        R : none
        """
        self.region_sets([['chr1',1,10]],
                         [['chr2',1,10]])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 0)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 0)

        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 0)
        """
        Completely included overlapping
        A : ---------------------------
        B : ----    ------       -----------
        R1: ----    ------       ------      (overlap)
        R2: ---------------------------      (original)
        R3:                                  (comp_incl)
        """
        self.region_sets([['chr1',1,50]],
                         [['chr1',1,5],['chr1',10,19],['chr1',45,60]])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 3)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 5)
        self.assertEqual(result[1].initial, 10)
        self.assertEqual(result[1].final, 19)
        self.assertEqual(result[2].initial, 45)
        self.assertEqual(result[2].final, 50)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 50)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 0)
        """
        A : ----    ------       -----------
        B : ---------------------------
        R1: ----    ------       ------      (overlap)
        R2: ----    ------       ----------- (original)
        R3: ----    ------                   (comp_incl)
        """

        self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]],
                         [['chr1',1,50]])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 3)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 5)
        self.assertEqual(result[1].initial, 10)
        self.assertEqual(result[1].final, 19)
        self.assertEqual(result[2].initial, 45)
        self.assertEqual(result[2].final, 50)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 3)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 5)
        self.assertEqual(result[1].initial, 10)
        self.assertEqual(result[1].final, 19)
        self.assertEqual(result[2].initial, 45)
        self.assertEqual(result[2].final, 60)
        
        
        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 2)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 5)
        self.assertEqual(result[1].initial, 10)
        self.assertEqual(result[1].final, 19)

        """
        A : --------------         -------
                ------
        B :       -----          ----------------
        R1:       -----            -------      (overlap)
                  ----
        R2: --------------         -------      (original)
                ------
        R3:                        -------      (comp_incl)
        """
        self.region_sets([['chr1',1,50],['chr1',20,40],['chr1',70,80]],
                         [['chr1',25,45],['chr1',65,95]])
        result = self.setA.intersect(self.setB)
        self.assertEqual(len(result), 2)
        self.assertEqual(result[0].initial, 25)
        self.assertEqual(result[0].final, 45)
        self.assertEqual(result[1].initial, 70)
        self.assertEqual(result[1].final, 80)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(result), 3)
        self.assertEqual(result[1].initial, 20)
        self.assertEqual(result[1].final, 40)
        self.assertEqual(result[0].initial, 1)
        self.assertEqual(result[0].final, 50)
        self.assertEqual(result[2].initial, 70)
        self.assertEqual(result[2].final, 80)
        
        result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].initial, 70)
        self.assertEqual(result[0].final, 80)

    def test_closest(self):
        """closest() between two empty region sets gives an empty result.

        A : none
        B : none
        R : none
        """
        # NOTE(review): the remaining closest() scenarios that follow this
        # method are commented out, so only the empty/empty case is exercised.
        self.region_sets([], [])
        nearest = self.setA.closest(self.setB)
        self.assertEqual(len(nearest), 0)
        # """
        # One empty set
        # A :   -----
        # B : none
        # R : none
        # """
        # self.region_sets([['chr1',5,10]],
        #                  [])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # A : none
        # B :   -----
        # R : none
        # """
        # self.region_sets([],
        #                  [['chr1',5,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # Overlapping within set
        # A : -----====-----
        # B :      ----
        # R :      ----
        # """
        # self.region_sets([['chr1',1,10],['chr1',6,15]],
        #                  [['chr1',6,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # """
        # A :      ----
        # B : -----====-----
        # R : -----====-----
        # """
        # self.region_sets([['chr1',6,10]],
        #                  [['chr1',1,10],['chr1',6,15]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # """
        # No overlapping
        # A : ------      ---------               -------
        # B :        ----          ------  ------
        # R :                      ------
        # """
        # self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]],
        #                  [['chr1',7,9],['chr1',20,25],['chr1',26,31]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 3)
        # # self.assertEqual(result[0].initial, 20)
        # # self.assertEqual(result[0].final, 25)
        # """
        # End-to-end attach
        # A : ------      ------
        # B :       ------
        # R :       ------
        # """
        # self.region_sets([['chr1',1,5],['chr1',11,20]],
        #                  [['chr1',5,11]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # # self.assertEqual(result[0].initial, 5)
        # # self.assertEqual(result[0].final, 11)
        # """
        # Perfect overlapping
        # A : ------
        # B : ------
        # R : ------
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr1',1,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # self.assertEqual(result[0].initial, 1)
        # self.assertEqual(result[0].final, 10)
        # """
        # One overlapping region
        # A : ------
        # B :     --------
        # R :     --------
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr1',7,20]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result[0].initial, 7)
        # self.assertEqual(result[0].final, 20)
        # """
        # Two simple overlapping regions
        # A : -------      --------
        # B :     -------------
        # R :     -------------
        # """
        # self.region_sets([['chr1',1,10],['chr1',26,35]],
        #                  [['chr1',7,30]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result[0].initial, 7)
        # self.assertEqual(result[0].final, 30)
        # """
        # Two separately overlapping regions
        # A : -------      --------
        # B :     -----        --------
        # R : none
        # """
        # self.region_sets([['chr1',1,10],['chr1',26,35]],
        #                  [['chr1',7,15],['chr1',30,40]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # """
        # Many various overlapping (mixed)
        # A :   ------------------            --------   ---------
        # B : ----   -------    ------            ----------
        # R : none
        # """
        # self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]],
        #                  [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 4)
        # """
        # Different chromosomes
        # A : chr1  -------
        # B : chr2  -------
        # R : chr2  -------
        #
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr2',1,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # Completely included overlapping
        # A : ---------------------------
        # B : ----    ------       -----------
        # R : ----    ------       -----------
        # """
        # self.region_sets([['chr1',1,50]],
        #                  [['chr1',1,5],['chr1',10,19],['chr1',45,60]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 3)
        # """
        # A : ----    ------       -----------
        # B : ---------------------------
        # R : none
        # """
        # self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]],
        #                  [['chr1',1,50]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result, False)
        # """
        # A : ----         ------                  ---
        # B :        ---              -----
        # R :        ---
        # """
        # self.region_sets([['chr1',1,5],['chr1',27,45],['chr1',85,95]],
        #                  [['chr1',15,20],['chr1',55,65]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # self.assertEqual(result[0].initial, 15)
        # self.assertEqual(result[0].final, 20)
    
    def test_remove_duplicates(self):
        """remove_duplicates() collapses identical regions of setA in place.

        A : ===== -----
        R : ----- -----
        """
        def expect(regions, bounds):
            # Size first, then each region's (initial, final) in index order.
            self.assertEqual(len(regions), len(bounds))
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(regions[idx].initial, start)
                self.assertEqual(regions[idx].final, end)

        self.region_sets([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25]], [])
        self.setA.remove_duplicates()
        expect(self.setA, [(1, 10), (15, 25)])

        # Same start but different end: not a duplicate, everything is kept.
        # A : =====--- -----
        # R : =====--- -----
        self.region_sets([['chr1', 1, 10], ['chr1', 1, 15], ['chr1', 20, 25]], [])
        self.setA.remove_duplicates()
        expect(self.setA, [(1, 10), (1, 15), (20, 25)])

        # Duplicates at both ends of the set.
        # A : ===== ----- ------  ====
        # R : ----- ----- ------  ----
        self.region_sets(
            [['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25],
             ['chr1', 30, 35], ['chr1', 40, 45], ['chr1', 40, 45]],
            [])
        self.setA.remove_duplicates()
        expect(self.setA, [(1, 10), (15, 25), (30, 35), (40, 45)])

    def test_window(self):
        """window(): parts of B that fall inside A once A is extended.

        A :             -------
        B : ------[ 99 ]       [   199   ]---
        window = 100
        R :       -                           only one base overlaps with extending A
        """
        def expect(regions, bounds):
            # Size first, then each region's (initial, final) in index order.
            self.assertEqual(len(regions), len(bounds))
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(regions[idx].initial, start)
                self.assertEqual(regions[idx].final, end)

        self.region_sets([['chr1', 200, 300]],
                         [['chr1', 1, 101], ['chr1', 499, 550]])
        expect(self.setA.window(self.setB, adding_length=100), [(100, 101)])

        # A :             -------
        # B : ------[ 99 ]       [   199   ]---
        # window = 200
        # R : ------                        -
        # left-hand side is covered, and the right-hand side is only one base overlapped
        self.region_sets([['chr1', 200, 300]],
                         [['chr1', 1, 101], ['chr1', 499, 550]])
        # First start is 1: GenomicRegion.extend will choose 1 rather than 0.
        expect(self.setA.window(self.setB, adding_length=200),
               [(1, 101), (499, 500)])

        # A :                         ----    ----
        # B :             --------                    ----
        # window = 1000 (default)
        # R :                 ----                    ----
        self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
                         [['chr1', 1500, 2500], ['chr1', 5000, 5500]])
        expect(self.setA.window(self.setB), [(2000, 2500), (5000, 5500)])

        # A :                         ----    ----
        # B :             --------                    ----
        # window = 2000
        # R :             --------                    ----
        #                     ----                    ----
        # window = 100
        # R : none
        self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
                         [['chr1', 1500, 2500], ['chr1', 5000, 5500]])
        expect(self.setA.window(self.setB, adding_length=2000),
               [(1500, 2500), (5000, 5500)])
        # Same fixtures, narrower window: nothing is reached any more.
        expect(self.setA.window(self.setB, adding_length=100), [])
        
    def test_subtract(self):
        """subtract(): what is left of A after removing the parts covered by B.

        A : none
        B :    ------
        R : none
        """
        def minus(regions_a, regions_b):
            # Load both fixtures, then subtract B from A.
            self.region_sets(regions_a, regions_b)
            return self.setA.subtract(self.setB)

        def expect(regions, bounds):
            # Size first, then each region's (initial, final) in index order.
            self.assertEqual(len(regions), len(bounds))
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(regions[idx].initial, start)
                self.assertEqual(regions[idx].final, end)

        expect(minus([], [['chr1', 6, 15]]), [])

        # A :    ------
        # B : none
        # R :    ------
        expect(minus([['chr1', 6, 15]], []), [(6, 15)])

        # A : ------
        # B :    ------
        # R : ---
        expect(minus([['chr1', 1, 10]], [['chr1', 6, 15]]), [(1, 6)])

        # A :    ------
        # B : ------
        # R :       ---
        expect(minus([['chr1', 6, 15]], [['chr1', 1, 10]]), [(10, 15)])

        # A :    ---
        # B : ---------
        # R : none
        expect(minus([['chr1', 6, 10]], [['chr1', 1, 15]]), [])

        # A : ---------
        # B :    ---
        # R : ---   ---
        expect(minus([['chr1', 1, 15]], [['chr1', 6, 10]]), [(1, 6), (10, 15)])

        # A :    ------
        # B :    ------
        # R : none
        expect(minus([['chr1', 6, 15]], [['chr1', 6, 15]]), [])

        # A :   ----------              ------
        # B :          ----------                    ----
        # R :   -------                 ------
        expect(minus([['chr1', 5, 30], ['chr1', 70, 85]],
                     [['chr1', 20, 50], ['chr1', 100, 110]]),
               [(5, 20), (70, 85)])

        # A :        ------   -----
        # B :    ------
        # R :          ----   -----
        expect(minus([['chr1', 20, 30], ['chr1', 35, 55]],
                     [['chr1', 10, 23], ['chr1', 100, 110]]),
               [(23, 30), (35, 55)])

        # A :   ch1     ---------------------
        #       ch2     -------------------------
        # B :   ch1             ------
        #       ch2                        ------
        # R :   ch1     --------      -------
        #       ch2     -------------------
        expect(minus([['chr1', 0, 30000], ['chr2', 0, 35000]],
                     [['chr1', 20000, 23000], ['chr2', 31000, 35000]]),
               [(0, 20000), (23000, 30000), (0, 31000)])

        # Only the number of remaining pieces is checked from here on.
        # A :   -----------------------------------------------------------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ---------    -----------    --------------
        remainder = minus([['chr1', 5, 1000]],
                          [['chr1', 10, 15], ['chr1', 30, 70],
                           ['chr1', 120, 140], ['chr1', 200, 240]])
        self.assertEqual(len(remainder), 5)

        # A :   -----------------------              ------
        #            -----     -----  -----------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ------              ----
        #            ---         ---  ----    ---
        remainder = minus([['chr1', 5, 100], ['chr1', 20, 40], ['chr1', 60, 80],
                           ['chr1', 95, 150], ['chr1', 180, 220]],
                          [['chr1', 10, 15], ['chr1', 30, 70],
                           ['chr1', 120, 140], ['chr1', 200, 240]])
        self.assertEqual(len(remainder), 8)
        self.assertEqual(remainder[0].initial, 5)

        # Same single-chromosome layout repeated on chr1, chr2 and chr4.
        # A :   -----------------------------------------------------------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ---------    -----------    --------------
        chroms = ('chr1', 'chr2', 'chr4')
        holes = ((10, 15), (30, 70), (120, 140), (200, 240))
        remainder = minus([[chrom, 5, 1000] for chrom in chroms],
                          [[chrom, start, end]
                           for chrom in chroms
                           for start, end in holes])
        self.assertEqual(len(remainder), 15)
        
        
    def test_merge(self):
        """merge() fuses overlapping regions of setA in place.

        A : none
        R : none
        """
        def merged(regions_a):
            # Load the fixture into setA, merge it in place and hand it back.
            self.region_sets(regions_a, [])
            self.setA.merge()
            return self.setA

        def expect(regions, bounds):
            # Size first, then each region's (initial, final) in index order.
            self.assertEqual(len(regions), len(bounds))
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(regions[idx].initial, start)
                self.assertEqual(regions[idx].final, end)

        expect(merged([]), [])

        # A : -----  -----
        # R : -----  -----
        expect(merged([['chr1', 1, 10], ['chr1', 15, 25]]),
               [(1, 10), (15, 25)])

        # A1: ------------   ----
        # A2:    -----
        # R : ------------   ----
        expect(merged([['chr1', 1, 30], ['chr1', 11, 20], ['chr1', 40, 50]]),
               [(1, 30), (40, 50)])

        # A1: --------       ----
        # A2:    ---------
        # R : ------------   ----
        expect(merged([['chr1', 1, 30], ['chr1', 20, 40], ['chr1', 50, 60]]),
               [(1, 40), (50, 60)])

        # A : =======
        # R : -------
        expect(merged([['chr1', 1, 30], ['chr1', 1, 30]]), [(1, 30)])
        
    def test_cluster(self):
        """cluster(n): neighbours are joined when the gap between them is below n.

        Empty sets
        A : none
        R : none
        """
        def expect(regions, bounds):
            # Size first, then each region's (initial, final) in index order.
            self.assertEqual(len(regions), len(bounds))
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(regions[idx].initial, start)
                self.assertEqual(regions[idx].final, end)

        self.region_sets([], [])
        expect(self.setA.cluster(10), [])

        # A :  -------
        # R :  -------
        self.region_sets([['chr1', 1, 10]], [])
        expect(self.setA.cluster(10), [(1, 10)])

        # A :  -----
        #           ------
        # R :  -----------
        self.region_sets([['chr1', 1, 10], ['chr1', 10, 20]], [])
        expect(self.setA.cluster(10), [(1, 20)])

        # A :  -----  -----
        # R1:  -----  -----
        # R2:  ------------
        self.region_sets([['chr1', 1, 10], ['chr1', 15, 25]], [])
        expect(self.setA.cluster(1), [(1, 10), (15, 25)])
        expect(self.setA.cluster(5), [(1, 10), (15, 25)])
        expect(self.setA.cluster(6), [(1, 25)])

        # A :  ---- ----  ----   ----    ----
        # R1:  ---------  ----   ----    ----
        # R2:  ---------------   ----    ----
        # R3:  ----------------------    ----
        # R4:  ------------------------------
        # R5:  ------------------------------
        self.region_sets([['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 35, 45],
                          ['chr1', 60, 70], ['chr1', 90, 100]],
                         [])
        # Only the number of clusters is checked as the argument grows.
        for threshold, count in ((6, 4), (11, 3), (16, 2), (21, 1), (26, 1)):
            self.assertEqual(len(self.setA.cluster(threshold)), count)
        
    def test_flank(self):
        """flank(n) yields an n-long region on each side of every input region.

        A :        -----
        R1:     ---     ---
        """
        def expect(regions, bounds):
            # Size first, then each region's (initial, final) in index order.
            self.assertEqual(len(regions), len(bounds))
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(regions[idx].initial, start)
                self.assertEqual(regions[idx].final, end)

        self.region_sets([['chr1', 60, 75]], [])
        expect(self.setA.flank(10), [(50, 60), (75, 85)])

        # Coinciding flanks of adjacent regions are both reported (75-90 twice).
        # A :        -----     ----
        # R1:   -----     =====    ----
        self.region_sets([['chr1', 60, 75], ['chr1', 90, 100]], [])
        expect(self.setA.flank(15),
               [(45, 60), (75, 90), (75, 90), (100, 115)])
        
    def test_jaccard(self):
        """jaccard() equals intersection length over union length.

        self           --8--      ---10---      -4-
        y         ---10---             ---10---
        intersect      -5-             -4-
        similarity:   ( 5 + 4 )/[(8 + 10 + 4) + (10 +10) - (5 + 4 )]
                      = 9/33
        """
        self.region_sets([['chr1',50,58],['chr1',70,80],['chr1',90,94]],
                         [['chr1',45,55],['chr1',76,86]])
        similarity = self.setA.jaccard(self.setB)
        # The float literal forces true division even where `/` on two ints
        # truncates (Python 2 without `from __future__ import division`),
        # and assertAlmostEqual avoids relying on exact float equality.
        self.assertAlmostEqual(similarity, 9.0 / 33)
    
    def test_get_genome_data(self):
        """get_genome_data("hg19") gives 23 entries, or 24 with chrom_M=True."""
        # hg19
        without_m = GenomicRegionSet("hg19")
        without_m.get_genome_data(organism="hg19")
        self.assertEqual(len(without_m), 23)

        # hg19, with Mitochondria chromosome
        with_m = GenomicRegionSet("hg19")
        with_m.get_genome_data(organism="hg19", chrom_M=True)
        self.assertEqual(len(with_m), 24)
        
    def test_random_regions(self):
        """Run random_regions() with each option combination and sort the output.

        Nothing is asserted about the generated regions; every call (and the
        following sort) only has to complete without raising.
        """
        # (chromosome, end) pairs; every fixture region starts at 0.
        large = (('chr1', 10000), ('chr2', 20000), ('chrX', 30000))
        small = (('chr1', 1000), ('chr2', 2000), ('chrX', 3000))

        scenarios = (
            (large, dict(total_size=100,
                         overlap_result=False, overlap_input=False)),
            (large, dict(total_size=100,
                         overlap_result=True, overlap_input=False)),
            (large, dict(total_size=100,
                         overlap_result=False, overlap_input=True)),
            (large, dict(total_size=100,
                         overlap_result=True, overlap_input=True)),
            (small, dict(multiply_factor=100,
                         overlap_result=False, overlap_input=False)),
            (small, dict(multiply_factor=100,
                         overlap_result=False, overlap_input=False,
                         chrom_M=True)),
        )

        for spans, options in scenarios:
            # Fresh list fixtures for every scenario.
            self.region_sets([[chrom, 0, end] for chrom, end in spans], [])
            randomized = self.setA.random_regions(organism="mm9", **options)
            randomized.sort()
            # Debug dump, enable when inspecting a scenario by hand:
            #print("-"*80)
            #print("The result random regions are: ")
            #for s in randomized.sequences:
            #    print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__()))
            #print("Overlaps within result: ",randomized.within_overlap())
示例#38
0
if __name__ == "__main__":
    # Command-line entry point: build a BiasTable over the regions of a BED
    # file and write its tables out.
    #
    # Positional arguments, in order:
    #   1  BAM file           2  FASTA (genome) file   3  BED file
    #   4  k-mer size (int)   5  shift (int)           6  output target

    import sys

    from rgt.GenomicRegionSet import *

    argv = sys.argv
    bam_path = argv[1]
    fasta_path = argv[2]
    bed_path = argv[3]
    kmer_size = int(argv[4])
    shift_size = int(argv[5])
    out_target = argv[6]

    bed_regions = GenomicRegionSet("regions")
    bed_regions.read_bed(bed_path)

    # NOTE(review): BiasTable is not imported explicitly in this script;
    # presumably it arrives via the wildcard import above -- confirm.
    bias_table = BiasTable(regions=bed_regions,
                           dnase_file_name=bam_path,
                           genome_file_name=fasta_path,
                           k_nb=kmer_size,
                           shift=shift_size)
    bias_table.write_tables(out_target)









示例#39
0
def mode_4(exp_matrix,thresh,type_file,geneexp_file):
    """Write one gene/peak score table per region set of ``exp_matrix``.

    For every region set returned by ``exp_matrix.get_regionsets()`` a file
    ``region_<name>.data`` is written to the current working directory.  Each
    line holds, tab separated: the gene name, the gene's expression values
    (``gene_set.values[gene.upper()]``), the mean score of the peaks
    associated with the gene, that mean multiplied by the number of peaks,
    and the comma separated list of those peaks.  Genes without usable peak
    scores get ``0.0``, ``0`` and ``"_"``; genes whose expression values
    cannot be looked up are left out.

    Arguments:
        exp_matrix -- object providing ``get_regionsets()``.
        thresh -- forwarded as ``threshDist`` to
            ``filter_by_gene_association_old``.
        type_file -- when equal to ``"ODIN"`` the score is the last
            ``;``-separated item of the last tab-separated field of
            ``region.data``; otherwise ``region.data`` itself is the score.
        geneexp_file -- expression file read by ``GeneSet.read_expression``.

    The module-level names ``gene_file`` and ``genome_file`` are read as well.
    """
    gene_set = GeneSet("")
    gene_set.read_expression(geneexp_file)

    # Score of every peak, keyed by "chrom:initial-final" (the key format the
    # gene/peak mapping below is looked up with).
    score = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            key = region.chrom + ':' + str(region.initial) + '-' + str(region.final)
            if type_file == "ODIN":
                score[key] = region.data.split("\t")[-1].split(";")[-1]
            else:
                score[key] = region.data

    for region in exp_matrix.get_regionsets():
        # The context manager closes the file even if the association or the
        # score lookup below raises.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, mappedGenes, _, gene_peaks_mapping = region_set.filter_by_gene_association_old(region.fileName, gene_set.genes, gene_file, genome_file, threshDist=thresh)

            print(mappedGenes)

            avg_score = {}  # gene -> scores of all peaks assigned to it
            genes = {}      # gene -> set of peaks assigned to it

            print(region)
            # Reverse the mapping peak -> genes into gene -> peaks.
            for peak, gene_list in gene_peaks_mapping.items():
                for gen in gene_list:
                    if not gen:
                        # Unassigned peaks show up as empty entries.
                        continue
                    genes.setdefault(gen, set()).add(peak)
                    avg_score.setdefault(gen, []).append(score[peak])

            print(avg_score)

            for gen in gene_set.genes:
                try:
                    gene_scores = avg_score[gen]
                    avg = sum(float(x) for x in gene_scores) / float(len(gene_scores))
                    peaks = ", ".join(str(t) for t in genes[gen])
                    siz = avg * len(gene_scores)
                except Exception:
                    # Best effort: genes without peaks (or with scores that
                    # are not numeric) are reported with placeholder values.
                    avg = 0.0
                    siz = 0
                    peaks = "_"

                try:
                    expression = "\t".join([str(t) for t in gene_set.values[gen.upper()]])
                except Exception:
                    # Best effort: genes without expression values are skipped.
                    continue

                # Written outside the try block so that I/O errors are not
                # silently swallowed.
                print(gen, expression, avg, siz, peaks, sep='\t', file=f)
示例#40
0
def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequences of the transcripts in a BED file into FASTA files.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    Block information is taken to be present when the ``data`` of the first
    region splits into seven tab-separated fields.  In that case every region
    is written to its own file, one FASTA record per block.  Otherwise the
    regions are sorted by name and consecutive regions sharing a name are
    written together into one file.

    Arguments:
        bed -- BED file, read with ``GenomicRegionSet.read``.
        directory -- output directory; created when it does not exist.
        genome_path -- genome FASTA file, opened with ``pysam.Fastafile``.

    Output:
        Each FASTA file (``<name>.fa`` inside ``directory``) represents a
        transcript and contains all the exons within the file.

    The process exits via ``sys.exit()`` when block information is present
    but a region has no name.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read(bed)
    regionset.sort()

    genome = pysam.Fastafile(genome_path)

    if not regionset.sequences:
        # Nothing to write; without this guard the grouped branch below would
        # reference a group that was never created.
        return

    # Always assign the flag: the format is decided by the first region only.
    try:
        blockinfor = len(regionset.sequences[0].data.split("\t")) == 7
    except AttributeError:
        # The first region carries no data string at all.
        blockinfor = False

    # Both output modes write into the directory, so create it up front.
    if not os.path.exists(directory):
        os.makedirs(directory)

    if blockinfor:
        for gr in regionset:
            if not gr.name:
                print("Error: For fetching exon sequences, please define the transcript name.")
                sys.exit()

            with open(os.path.join(directory, gr.name+".fa"), "w") as f:
                data = gr.data.split("\t")
                if len(data) != 7:
                    print("Warning: The given regions have no block information, please try write_bed_blocks")
                    continue

                n = int(data[4])
                blocks = [ int(b) for b in filter(None, data[5].split(",")) ]
                starts = [ int(s) for s in filter(None, data[6].split(",")) ]
                printstr = []

                for i in range(n):
                    start = gr.initial + starts[i]
                    end = start + blocks[i]
                    # Exons are numbered along the transcript, so the count
                    # runs backwards on the minus strand.
                    if gr.orientation == "-":
                        ex = "exon:"+str(n-i)
                    else:
                        ex = "exon:"+str(i+1)

                    header = ">"+ " ".join([ gr.name, ex,
                                             "_".join(["REGION", gr.chrom,
                                                       str(start), str(end),
                                                       gr.orientation]) ])
                    # NOTE(review): this branch fetches [start-1, end-1) while
                    # the grouped branch fetches [initial, final) -- confirm
                    # which coordinate convention is intended.
                    seq = genome.fetch(gr.chrom, start-1, end-1)
                    if gr.orientation == "-":
                        seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
                    printstr.append((header, seq))

                # Minus-strand transcripts are emitted in transcript order.
                if gr.orientation == "-": printstr = printstr[::-1]
                for header, seq in printstr:
                    print(header, file=f)
                    print(seq, file=f)
    else:
        # Grouping relies on regions with the same name being adjacent.
        # Unnamed regions sort first and receive their own name below.
        regionset.sequences.sort(key=lambda g: g.name or "")

        pre_id = ""
        z = None
        for gr in regionset:
            if not gr.name:
                gr.name = gr.toString()

            if z is not None and gr.name == pre_id:
                z.add(gr)
                continue

            if z is not None:
                # A new transcript starts: flush the finished group first.
                _write_region_group(z, pre_id, directory, genome)

            pre_id = gr.name
            z = GenomicRegionSet(gr.name)
            z.add(gr)

        # Last TX
        if z is not None:
            _write_region_group(z, pre_id, directory, genome)


def _write_region_group(group, transcript_id, directory, genome):
    """Write every region of ``group`` as a FASTA record to ``<transcript_id>.fa``."""
    with open(os.path.join(directory, transcript_id+".fa"), "w") as f:
        for g in group:
            fields = ["REGION", g.chrom, str(g.initial), str(g.final)]
            # The orientation of the region itself is used; it is left out of
            # the tag when it is not a string (e.g. undefined).
            try:
                regiontag = "_".join(fields + [g.orientation])
            except (TypeError, AttributeError):
                regiontag = "_".join(fields)

            print(">"+ " ".join([g.name, regiontag]), file=f)
            print(genome.fetch(g.chrom, g.initial, g.final), file=f)
示例#41
0
class TestGenomicRegionSet(unittest.TestCase):
    def region_sets(self, listA, listB):
        """Build self.setA and self.setB from lists of [chrom, initial, final] entries."""

        def fill(target, entries):
            # Append one GenomicRegion per entry, in the given order.
            for entry in entries:
                target.add(
                    GenomicRegion(chrom=entry[0],
                                  initial=entry[1],
                                  final=entry[2]))

        self.setA = GenomicRegionSet('for Unit Test')
        fill(self.setA, listA)

        self.setB = GenomicRegionSet('for Unit Test')
        fill(self.setB, listB)

    def test_extend(self):
        """Check that extend() widens the regions of a set in place."""

        def expect(regions, *expected):
            # Every expected entry is (initial, final) or (initial, final, chrom).
            self.assertEqual(len(regions), len(expected))
            for position, wanted in enumerate(expected):
                self.assertEqual(regions[position].initial, wanted[0])
                self.assertEqual(regions[position].final, wanted[1])
                if len(wanted) > 2:
                    self.assertEqual(regions[position].chrom, wanted[2])

        # Empty set
        # A : none
        # R : none
        self.region_sets([], [])
        self.setA.extend(100, 100)
        self.assertEqual(len(self.setA.sequences), 0)

        # One region
        # A :   -----
        # R : ---------
        self.region_sets([['chr1', 5, 10]], [])
        extended = self.setA
        extended.extend(4, 4)
        expect(extended, (1, 14))

        # Many regions on one chromosome; each is extended on its own and the
        # number of regions stays the same.
        # A :   -----   ------         -----    -----
        self.region_sets([['chr1', 5, 10], ['chr1', 15, 20], ['chr1', 40, 50],
                          ['chr1', 65, 75]], [])
        extended = self.setA
        extended.extend(5, 5)
        expect(extended, (0, 15), (10, 25), (35, 55), (60, 80))

        # Many regions on different chromosomes; the chromosome is kept.
        self.region_sets([['chr1', 5, 10], ['chr2', 15, 20], ['chr3', 40, 50],
                          ['chr4', 65, 75]], [])
        extended = self.setA
        extended.extend(5, 5)
        expect(extended,
               (0, 15, 'chr1'),
               (10, 25, 'chr2'),
               (35, 55, 'chr3'),
               (60, 80, 'chr4'))

        # One region, extension requested with percentage=True
        # A :   -----
        # R : ---------
        self.region_sets([['chr1', 100, 200]], [])
        extended = self.setA
        extended.extend(10, 10, percentage=True)
        expect(extended, (90, 210))

    def test_sort(self):
        """Smoke test: sort() runs on a set whose regions are out of order."""
        unordered = [['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75],
                     ['chr1', 5, 10]]
        self.region_sets(unordered, [])
        self.setA.sort()

    def test_intersect(self):
        """Exercise intersect() with the default, ORIGINAL and COMP_INCL modes.

        Every scenario rebuilds setA/setB and checks the number of returned
        regions and, where listed, the (initial, final) bounds of the leading
        regions of the result.
        """
        default = {}
        original = {'mode': OverlapType.ORIGINAL}
        comp_incl = {'mode': OverlapType.COMP_INCL}

        def expect(options, count, *bounds):
            # Intersect A with B, then compare the size and the leading bounds.
            found = self.setA.intersect(self.setB, **options)
            self.assertEqual(len(found), count)
            for position, (start, stop) in enumerate(bounds):
                self.assertEqual(found[position].initial, start)
                self.assertEqual(found[position].final, stop)

        def expect_nothing():
            # All three modes must return an empty result.
            for options in (default, original, comp_incl):
                expect(options, 0)

        # Two empty sets
        # A : none
        # B : none
        # R : none
        self.region_sets([], [])
        expect_nothing()

        # One empty set
        # A :   -----
        # B : none
        # R : none
        self.region_sets([['chr1', 5, 10]], [])
        expect_nothing()

        # A : none
        # B :   -----
        # R : none
        self.region_sets([], [['chr1', 5, 10]])
        expect_nothing()

        # No overlapping
        # A : ------      ---------               -------
        # B :        ----          ------  ------
        # R : none
        self.region_sets([['chr1', 1, 5], ['chr1', 11, 20], ['chr1', 33, 38]],
                         [['chr1', 7, 9], ['chr1', 20, 25], ['chr1', 26, 31]])
        expect_nothing()

        # End-to-end attach
        # A : ------      ------
        # B :       ------
        # R : none
        self.region_sets([['chr1', 1, 5], ['chr1', 11, 20]], [['chr1', 5, 11]])
        expect_nothing()

        # No length attach
        # A : .      .
        # B :    .   .
        # R : none
        self.region_sets([['chr1', 2, 2], ['chr1', 20, 20]],
                         [['chr1', 5, 5], ['chr1', 20, 20]])
        expect_nothing()

        # Perfect overlapping
        # A : ------
        # B : ------
        # R : ------
        identical = [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650],
                     ['chr1', 700, 750], ['chr1', 725, 800]]
        self.region_sets(
            identical,
            [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650],
             ['chr1', 700, 750], ['chr1', 725, 800]])
        expect({'mode': OverlapType.OVERLAP, 'rm_duplicates': True},
               4, (1, 10))
        expect(original, 5, (1, 10))
        expect(comp_incl, 5, (1, 10))

        # One overlapping region
        # A : ------
        # B :     --------
        # R1:     --       (overlap)
        # R2: ------       (original)
        # R3:              (comp_incl)
        self.region_sets([['chr1', 1, 10]], [['chr1', 7, 20]])
        expect(default, 1, (7, 10))
        expect(original, 1, (1, 10))
        expect(comp_incl, 0)

        # Two simple overlapping regions
        # A : -------      --------
        # B :     -------------
        # R1:     ---      ----     (overlap)
        # R2: -------      -------- (original)
        # R3:                       (comp_incl)
        self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]],
                         [['chr1', 7, 30]])
        expect(default, 2, (7, 10), (26, 30))
        expect(original, 2, (1, 10), (26, 35))
        expect(comp_incl, 0)

        # Two separately overlapping regions
        # A : -------      --------
        # B :     -----        --------
        # R1:     ---          ----     (overlap)
        # R2: -------      --------     (original)
        # R3:                           (comp_incl)
        self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]],
                         [['chr1', 7, 15], ['chr1', 30, 40]])
        expect(default, 2, (7, 10), (30, 35))
        expect(original, 2, (1, 10), (26, 35))
        expect(comp_incl, 0)

        # Many various overlapping (mixed)
        # A :   ------------------            --------   ---------
        # B : ----   -------    ------            ----------
        # R1:   --   -------    --                ----   ---       (overlap)
        # R2:   ------------------            --------   --------- (original)
        # R3:                                                      (comp_incl)
        self.region_sets([['chr1', 3, 30], ['chr1', 50, 60], ['chr1', 70, 85]],
                         [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 27, 35],
                          ['chr1', 55, 75]])
        expect({'mode': OverlapType.OVERLAP},
               5, (3, 5), (10, 19), (27, 30), (55, 60), (70, 75))
        expect(original, 3, (3, 30), (50, 60), (70, 85))
        expect(comp_incl, 0)

        # Different chromosomes
        # A : chr1  -------
        # B : chr2  -------
        # R : none
        self.region_sets([['chr1', 1, 10]], [['chr2', 1, 10]])
        expect_nothing()

        # Completely included overlapping
        # A : ---------------------------
        # B : ----    ------       -----------
        # R1: ----    ------       ------      (overlap)
        # R2: ---------------------------      (original)
        # R3:                                  (comp_incl)
        self.region_sets([['chr1', 1, 50]],
                         [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]])
        expect(default, 3, (1, 5), (10, 19), (45, 50))
        expect(original, 1, (1, 50))
        expect(comp_incl, 0)

        # A : ----    ------       -----------
        # B : ---------------------------
        # R1: ----    ------       ------      (overlap)
        # R2: ----    ------       ----------- (original)
        # R3: ----    ------                   (comp_incl)
        self.region_sets([['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]],
                         [['chr1', 1, 50]])
        expect(default, 3, (1, 5), (10, 19), (45, 50))
        expect(original, 3, (1, 5), (10, 19), (45, 60))
        expect(comp_incl, 2, (1, 5), (10, 19))

        # A : --------------         -------
        #         ------
        # B :       -----          ----------------
        # R1:       -----            -------      (overlap)
        #           ----
        # R2: --------------         -------      (original)
        #         ------
        # R3:                        -------      (comp_incl)
        self.region_sets([['chr1', 1, 50], ['chr1', 20, 40], ['chr1', 70, 80]],
                         [['chr1', 25, 45], ['chr1', 65, 95]])
        expect(default, 2, (25, 45), (70, 80))
        expect(original, 3, (1, 50), (20, 40), (70, 80))
        expect(comp_incl, 1, (70, 80))

    def test_closest(self):
        """closest() between two empty sets yields an empty result.

        A : none
        B : none
        R : none
        """
        self.region_sets([], [])
        nearest = self.setA.closest(self.setB)
        self.assertEqual(len(nearest), 0)
        # NOTE: every further scenario that follows is commented out, so the
        # empty-set case above is the only one currently executed.
        # """
        # One empty set
        # A :   -----
        # B : none
        # R : none
        # """
        # self.region_sets([['chr1',5,10]],
        #                  [])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # A : none
        # B :   -----
        # R : none
        # """
        # self.region_sets([],
        #                  [['chr1',5,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # Overlapping within set
        # A : -----====-----
        # B :      ----
        # R :      ----
        # """
        # self.region_sets([['chr1',1,10],['chr1',6,15]],
        #                  [['chr1',6,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # """
        # A :      ----
        # B : -----====-----
        # R : -----====-----
        # """
        # self.region_sets([['chr1',6,10]],
        #                  [['chr1',1,10],['chr1',6,15]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # """
        # No overlapping
        # A : ------      ---------               -------
        # B :        ----          ------  ------
        # R :                      ------
        # """
        # self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]],
        #                  [['chr1',7,9],['chr1',20,25],['chr1',26,31]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 3)
        # # self.assertEqual(result[0].initial, 20)
        # # self.assertEqual(result[0].final, 25)
        # """
        # End-to-end attach
        # A : ------      ------
        # B :       ------
        # R :       ------
        # """
        # self.region_sets([['chr1',1,5],['chr1',11,20]],
        #                  [['chr1',5,11]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # # self.assertEqual(result[0].initial, 5)
        # # self.assertEqual(result[0].final, 11)
        # """
        # Perfect overlapping
        # A : ------
        # B : ------
        # R : ------
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr1',1,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # self.assertEqual(result[0].initial, 1)
        # self.assertEqual(result[0].final, 10)
        # """
        # One overlapping region
        # A : ------
        # B :     --------
        # R :     --------
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr1',7,20]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result[0].initial, 7)
        # self.assertEqual(result[0].final, 20)
        # """
        # Two simple overlapping regions
        # A : -------      --------
        # B :     -------------
        # R :     -------------
        # """
        # self.region_sets([['chr1',1,10],['chr1',26,35]],
        #                  [['chr1',7,30]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result[0].initial, 7)
        # self.assertEqual(result[0].final, 30)
        # """
        # Two separately overlapping regions
        # A : -------      --------
        # B :     -----        --------
        # R : none
        # """
        # self.region_sets([['chr1',1,10],['chr1',26,35]],
        #                  [['chr1',7,15],['chr1',30,40]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # """
        # Many various overlapping (mixed)
        # A :   ------------------            --------   ---------
        # B : ----   -------    ------            ----------
        # R : none
        # """
        # self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]],
        #                  [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 4)
        # """
        # Different chromosomes
        # A : chr1  -------
        # B : chr2  -------
        # R : chr2  -------
        #
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr2',1,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # Completely included overlapping
        # A : ---------------------------
        # B : ----    ------       -----------
        # R : ----    ------       -----------
        # """
        # self.region_sets([['chr1',1,50]],
        #                  [['chr1',1,5],['chr1',10,19],['chr1',45,60]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 3)
        # """
        # A : ----    ------       -----------
        # B : ---------------------------
        # R : none
        # """
        # self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]],
        #                  [['chr1',1,50]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result, False)
        # """
        # A : ----         ------                  ---
        # B :        ---              -----
        # R :        ---
        # """
        # self.region_sets([['chr1',1,5],['chr1',27,45],['chr1',85,95]],
        #                  [['chr1',15,20],['chr1',55,65]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # self.assertEqual(result[0].initial, 15)
        # self.assertEqual(result[0].final, 20)

    def test_remove_duplicates(self):
        """remove_duplicates() drops identical regions but keeps differing ones."""
        scenarios = [
            # A : ===== -----
            # R : ----- -----
            ([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25]],
             [(1, 10), (15, 25)]),
            # Same start but different end: nothing is removed.
            # A : =====--- -----
            # R : =====--- -----
            ([['chr1', 1, 10], ['chr1', 1, 15], ['chr1', 20, 25]],
             [(1, 10), (1, 15), (20, 25)]),
            # A : ===== ----- ------  ====
            # R : ----- ----- ------  ----
            ([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25],
              ['chr1', 30, 35], ['chr1', 40, 45], ['chr1', 40, 45]],
             [(1, 10), (15, 25), (30, 35), (40, 45)]),
        ]

        for regions, remaining in scenarios:
            self.region_sets(regions, [])
            self.setA.remove_duplicates()
            deduplicated = self.setA
            self.assertEqual(len(deduplicated), len(remaining))
            for position, (start, stop) in enumerate(remaining):
                self.assertEqual(deduplicated[position].initial, start)
                self.assertEqual(deduplicated[position].final, stop)

    def test_window(self):
        """Check window() against setB for several extension lengths."""

        def expect(result, expected):
            # Compare the (initial, final) pairs of `result`, in order.
            self.assertEqual(len(result), len(expected))
            for idx, (start, end) in enumerate(expected):
                self.assertEqual(result[idx].initial, start)
                self.assertEqual(result[idx].final, end)

        # A :             -------
        # B : ------[ 99 ]       [   199   ]---
        # window = 100
        # R :       -            only one base overlaps with extending A
        self.region_sets([['chr1', 200, 300]],
                         [['chr1', 1, 101], ['chr1', 499, 550]])
        expect(self.setA.window(self.setB, adding_length=100), [(100, 101)])

        # A :             -------
        # B : ------[ 99 ]       [   199   ]---
        # window = 200
        # R : ------                        -
        # left-hand side is covered, and the right-hand side is only one
        # base overlapped
        self.region_sets([['chr1', 200, 300]],
                         [['chr1', 1, 101], ['chr1', 499, 550]])
        # GenomicRegion.extend will choose 1 rather than 0
        expect(self.setA.window(self.setB, adding_length=200),
               [(1, 101), (499, 500)])

        # A :                         ----    ----
        # B :             --------                    ----
        # window = 1000 (default)
        # R :                 ----                    ----
        self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
                         [['chr1', 1500, 2500], ['chr1', 5000, 5500]])
        expect(self.setA.window(self.setB), [(2000, 2500), (5000, 5500)])

        # A :                         ----    ----
        # B :             --------                    ----
        # window = 2000
        # R :             --------                    ----
        # window = 100
        # R : none
        self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
                         [['chr1', 1500, 2500], ['chr1', 5000, 5500]])
        expect(self.setA.window(self.setB, adding_length=2000),
               [(1500, 2500), (5000, 5500)])
        expect(self.setA.window(self.setB, adding_length=100), [])

    def test_subtract(self):
        """Check subtract() of setB from setA on a range of overlap layouts."""

        def subtract(regions_a, regions_b):
            # Rebuild both fixture sets and return A minus B.
            self.region_sets(regions_a, regions_b)
            return self.setA.subtract(self.setB)

        def expect(result, expected):
            # Compare the (initial, final) pairs of `result`, in order.
            self.assertEqual(len(result), len(expected))
            for idx, (start, end) in enumerate(expected):
                self.assertEqual(result[idx].initial, start)
                self.assertEqual(result[idx].final, end)

        # Each entry: (regions of A, regions of B, expected (initial, final)).
        cases = [
            # A : none
            # B :    ------
            # R : none
            ([], [['chr1', 6, 15]], []),
            # A :    ------
            # B : none
            # R :    ------
            ([['chr1', 6, 15]], [], [(6, 15)]),
            # A : ------
            # B :    ------
            # R : ---
            ([['chr1', 1, 10]], [['chr1', 6, 15]], [(1, 6)]),
            # A :    ------
            # B : ------
            # R :       ---
            ([['chr1', 6, 15]], [['chr1', 1, 10]], [(10, 15)]),
            # A :    ---
            # B : ---------
            # R : none
            ([['chr1', 6, 10]], [['chr1', 1, 15]], []),
            # A : ---------
            # B :    ---
            # R : ---   ---
            ([['chr1', 1, 15]], [['chr1', 6, 10]], [(1, 6), (10, 15)]),
            # A :    ------
            # B :    ------
            # R : none
            ([['chr1', 6, 15]], [['chr1', 6, 15]], []),
            # A :   ----------              ------
            # B :          ----------                    ----
            # R :   -------                 ------
            ([['chr1', 5, 30], ['chr1', 70, 85]],
             [['chr1', 20, 50], ['chr1', 100, 110]],
             [(5, 20), (70, 85)]),
            # A :        ------   -----
            # B :    ------
            # R :          ----   -----
            ([['chr1', 20, 30], ['chr1', 35, 55]],
             [['chr1', 10, 23], ['chr1', 100, 110]],
             [(23, 30), (35, 55)]),
            # A :   ch1     ---------------------
            #       ch2     -------------------------
            # B :   ch1             ------
            #       ch2                        ------
            # R :   ch1     --------      -------
            #       ch2     -------------------
            ([['chr1', 0, 30000], ['chr2', 0, 35000]],
             [['chr1', 20000, 23000], ['chr2', 31000, 35000]],
             [(0, 20000), (23000, 30000), (0, 31000)]),
        ]
        for regions_a, regions_b, expected in cases:
            expect(subtract(regions_a, regions_b), expected)

        # One long region with four holes punched into it; only the number
        # of remaining pieces is checked.
        # A :   -----------------------------------------------------------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ---------    -----------    --------------
        result = subtract([['chr1', 5, 1000]],
                          [['chr1', 10, 15], ['chr1', 30, 70],
                           ['chr1', 120, 140], ['chr1', 200, 240]])
        self.assertEqual(len(result), 5)

        # Overlapping regions inside A itself.
        # A :   -----------------------              ------
        #            -----     -----  -----------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ------              ----
        #            ---         ---  ----    ---
        result = subtract([['chr1', 5, 100], ['chr1', 20, 40],
                           ['chr1', 60, 80], ['chr1', 95, 150],
                           ['chr1', 180, 220]],
                          [['chr1', 10, 15], ['chr1', 30, 70],
                           ['chr1', 120, 140], ['chr1', 200, 240]])
        self.assertEqual(len(result), 8)
        self.assertEqual(result[0].initial, 5)

        # The four-hole layout repeated on three chromosomes.
        result = subtract(
            [['chr1', 5, 1000], ['chr2', 5, 1000], ['chr4', 5, 1000]],
            [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140],
             ['chr1', 200, 240], ['chr2', 10, 15], ['chr2', 30, 70],
             ['chr2', 120, 140], ['chr2', 200, 240], ['chr4', 10, 15],
             ['chr4', 30, 70], ['chr4', 120, 140], ['chr4', 200, 240]])
        self.assertEqual(len(result), 15)

    def test_merge(self):
        """Check the in-place merge() of setA on several layouts."""
        # Each entry: (regions of A, expected (initial, final) after merge).
        cases = [
            # A : none
            # R : none
            ([], []),
            # A : -----  -----
            # R : -----  -----
            ([['chr1', 1, 10], ['chr1', 15, 25]], [(1, 10), (15, 25)]),
            # A1: ------------   ----
            # A2:    -----
            # R : ------------   ----
            ([['chr1', 1, 30], ['chr1', 11, 20], ['chr1', 40, 50]],
             [(1, 30), (40, 50)]),
            # A1: --------       ----
            # A2:    ---------
            # R : ------------   ----
            ([['chr1', 1, 30], ['chr1', 20, 40], ['chr1', 50, 60]],
             [(1, 40), (50, 60)]),
            # A : =======
            # R : -------
            ([['chr1', 1, 30], ['chr1', 1, 30]], [(1, 30)]),
        ]
        for regions, expected in cases:
            self.region_sets(regions, [])
            self.setA.merge()
            merged = self.setA
            self.assertEqual(len(merged), len(expected))
            for idx, (start, end) in enumerate(expected):
                self.assertEqual(merged[idx].initial, start)
                self.assertEqual(merged[idx].final, end)

    def test_cluster(self):
        """Check cluster() of setA for several maximum distances."""

        def expect(result, expected):
            # Compare the (initial, final) pairs of `result`, in order.
            self.assertEqual(len(result), len(expected))
            for idx, (start, end) in enumerate(expected):
                self.assertEqual(result[idx].initial, start)
                self.assertEqual(result[idx].final, end)

        # Empty sets
        # A : none
        # R : none
        self.region_sets([], [])
        expect(self.setA.cluster(10), [])

        # A :  -------
        # R :  -------
        self.region_sets([['chr1', 1, 10]], [])
        expect(self.setA.cluster(10), [(1, 10)])

        # A :  -----
        #           ------
        # R :  -----------
        self.region_sets([['chr1', 1, 10], ['chr1', 10, 20]], [])
        expect(self.setA.cluster(10), [(1, 20)])

        # A :  -----  -----
        # R1:  -----  -----
        # R2:  ------------
        self.region_sets([['chr1', 1, 10], ['chr1', 15, 25]], [])
        expect(self.setA.cluster(1), [(1, 10), (15, 25)])
        expect(self.setA.cluster(5), [(1, 10), (15, 25)])
        expect(self.setA.cluster(6), [(1, 25)])

        # A :  ---- ----  ----   ----    ----
        # R1:  ---------  ----   ----    ----
        # R2:  ---------------   ----    ----
        # R3:  ----------------------    ----
        # R4:  ------------------------------
        # R5:  ------------------------------
        self.region_sets([['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 35, 45],
                          ['chr1', 60, 70], ['chr1', 90, 100]], [])
        # Only the number of clusters is checked for each distance.
        for max_distance, n_clusters in ((6, 4), (11, 3), (16, 2), (21, 1),
                                         (26, 1)):
            result = self.setA.cluster(max_distance)
            self.assertEqual(len(result), n_clusters)

    def test_flank(self):
        """Check the flanking regions returned by flank() on both sides."""

        def check(regions, size, expected):
            # Load `regions` into setA and compare flank(size) with the
            # expected (initial, final) pairs, in order.
            self.region_sets(regions, [])
            result = self.setA.flank(size)
            self.assertEqual(len(result), len(expected))
            for idx, (start, end) in enumerate(expected):
                self.assertEqual(result[idx].initial, start)
                self.assertEqual(result[idx].final, end)

        # A :        -----
        # R1:     ---     ---
        check([['chr1', 60, 75]], 10, [(50, 60), (75, 85)])

        # The gap between the two regions is exactly one flank wide, so the
        # same interval is expected twice.
        # A :        -----     ----
        # R1:   -----     =====    ----
        check([['chr1', 60, 75], ['chr1', 90, 100]], 15,
              [(45, 60), (75, 90), (75, 90), (100, 115)])

    def test_jaccard(self):
        """
        Check jaccard() on two partially overlapping sets.

        self           --8--      ---10---      -4-
        y         ---10---             ---10---
        intersect      -5-             -4-    
        similarity:   ( 5 + 4 )/[(8 + 10 + 4) + (10 +10) - (5 + 4 )]
                      = 9/33
        """
        self.region_sets(
            [['chr1', 50, 58], ['chr1', 70, 80], ['chr1', 90, 94]],
            [['chr1', 45, 55], ['chr1', 76, 86]])
        result = self.setA.jaccard(self.setB)
        # 9 / 33 has no exact binary representation, so compare with the
        # default tolerance instead of requiring bit-for-bit float equality.
        self.assertAlmostEqual(result, 9 / 33)

    def test_get_genome_data(self):
        """Check how many regions get_genome_data() loads for hg19."""
        # (extra keyword arguments, expected number of regions); the second
        # entry additionally requests the mitochondrial chromosome.
        for extra, expected in (({}, 23), ({"chrom_M": True}, 24)):
            genome = GenomicRegionSet("hg19")
            genome.get_genome_data(organism="hg19", **extra)
            self.assertEqual(len(genome), expected)

    def test_random_regions(self):
        """Run random_regions() with several option sets (no assertions).

        The results are only sorted, never inspected, so this test fails
        only if one of the calls raises.
        """

        def run(scale, **options):
            # Rebuild setA with three regions sized 1x, 2x and 3x `scale`,
            # draw random regions for mm9 and sort them.
            self.region_sets([['chr1', 0, scale],
                              ['chr2', 0, 2 * scale],
                              ['chrX', 0, 3 * scale]], [])
            result = self.setA.random_regions(organism="mm9", **options)
            result.sort()

        # Fixed total size, every combination of the two overlap switches.
        run(10000, total_size=100, overlap_result=False, overlap_input=False)
        run(10000, total_size=100, overlap_result=True, overlap_input=False)
        run(10000, total_size=100, overlap_result=False, overlap_input=True)
        run(10000, total_size=100, overlap_result=True, overlap_input=True)

        # Size given as a multiple of the input, without and with chrM.
        run(1000, multiply_factor=100, overlap_result=False,
            overlap_input=False)
        run(1000, multiply_factor=100, overlap_result=False,
            overlap_input=False, chrom_M=True)
示例#42
0
def mode_1(exp_matrix):
    """Print the genes associated with every region set of *exp_matrix*.

    For each region set, a fresh GenomicRegionSet is filled through
    filter_by_gene_association() (threshDist=50000) using the module-level
    ``gene_file`` and ``genome_file``. Two lines are printed per region set:
    the mapped-gene value returned by that call, then the region set name
    followed by the tab-separated gene names.
    """
    for experiment in exp_matrix.get_regionsets():
        associated = GenomicRegionSet("")
        _, _, n_mapped, _, _ = associated.filter_by_gene_association(
            experiment.fileName, None, gene_file, genome_file,
            threshDist=50000)
        print('#number of mapped genes:', n_mapped)
        gene_columns = "\t".join(associated.genes)
        print(experiment.name + "\t" + gene_columns)
 def test_filter_tts(self):
     """Call count_tts() on a chr2 window; the result is not asserted."""
     interactions = RNADNAInteractionSet(organism="hg19", filename=sample_txp)
     targets = GenomicRegionSet("g")
     targets.add(GenomicRegion(chrom="chr2", initial=74000000, final=75000000))
     # NOTE(review): despite the test name, the method exercised here is
     # count_tts(), and its return value is never checked.
     interactions.count_tts(targets)
示例#44
0
from fisher import pvalue

# Command line: <design file> <annotation path> [background peaks BED]
back = False
designFile = sys.argv[1]
anotationPath = sys.argv[2]
# The annotation path is concatenated as-is, so it must end with a separator.
genomeFile = anotationPath + "chrom.sizes"
geneFile = anotationPath + "association_file.bed"

exps = ExperimentalMatrix()
exps.read(designFile)

beds = []
geneLists = []

# this should be improved
# Collect the distinct gene names found in the association file.
bedGenes = GenomicRegionSet(geneFile)
bedGenes.read_bed(geneFile)
allgenes = list({r.name for r in bedGenes})

genesets = exps.get_genesets()

# Create the (empty) background set first so that it is always defined, and
# only fill it when a background file was given. Previously the set was
# re-created after the read, which threw the loaded peaks away.
backBed = GenomicRegionSet("BACK")
if len(sys.argv) > 3:
    back = True
    backGroundPeaks = sys.argv[3]
    backBed.read_bed(backGroundPeaks)
示例#45
0
from fisher import pvalue

# Command line: <design file> <annotation path> [background peaks BED]
back = False
designFile = sys.argv[1]
anotationPath = sys.argv[2]
# The annotation path is concatenated as-is, so it must end with a separator.
genomeFile = anotationPath + "chrom.sizes"
geneFile = anotationPath + "association_file.bed"

exps = ExperimentalMatrix()
exps.read(designFile)

beds = []
geneLists = []

# this should be improved
# Collect the distinct gene names found in the association file.
bedGenes = GenomicRegionSet(geneFile)
bedGenes.read_bed(geneFile)
allgenes = list({r.name for r in bedGenes})

genesets = exps.get_genesets()

# Optional third argument: BED file with background peaks.
if len(sys.argv) > 3:
    back = True
    backGroundPeaks = sys.argv[3]
    backBed = GenomicRegionSet("BACK")
    backBed.read_bed(backGroundPeaks)
示例#46
0
##################################################################################
parser = argparse.ArgumentParser(description='Replace TCONs in BED file by assoicated gene names', 
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# -bed and -output are used unconditionally below (os.path.isfile / write_bed /
# os.makedirs), so a missing value must be reported by argparse instead of
# surfacing later as a TypeError on None.
parser.add_argument('-bed', type=str, required=True,
                    help="BED file or a directory containing BED files")
parser.add_argument('-output', type=str, required=True,
                    help="Define the output directory")
parser.add_argument('-organism', type=str, help="Define the organism")
args = parser.parse_args()

genome = GenomeData(args.organism)

if os.path.isfile(args.bed):
    # Single BED file: rename every region after its associated gene(s) and
    # write the result to the path given by -output.
    regionset = GenomicRegionSet("bed")
    regionset.read_bed(args.bed)
    gr = regionset.gene_association(organism=args.organism, promoterLength=1000,
                                    threshDist=500000, show_dis=True)
    regionset.replace_region_name(gr, combine=True)

    regionset.write_bed(args.output)

elif os.path.isdir(args.bed):
    # Directory: make sure the output directory exists, then walk the input
    # tree looking for file names containing ".bed".
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    for root, dirnames, filenames in os.walk(args.bed):

        for filename in filenames:
            if ".bed" in filename:
                print(filename)
示例#47
0
##################################################################################
parser = argparse.ArgumentParser(description='Replace TCONs in BED file by assoicated gene names', 
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# -bed and -output are used unconditionally below (os.path.isfile / write /
# os.makedirs), so a missing value must be reported by argparse instead of
# surfacing later as a TypeError on None.
parser.add_argument('-bed', type=str, required=True,
                    help="BED file or a directory containing BED files")
parser.add_argument('-output', type=str, required=True,
                    help="Define the output directory")
parser.add_argument('-organism', type=str, help="Define the organism")
args = parser.parse_args()

genome = GenomeData(args.organism)

if os.path.isfile(args.bed):
    # Single BED file: rename every region after its associated gene(s) and
    # write the result to the path given by -output.
    regionset = GenomicRegionSet("bed")
    regionset.read(args.bed)
    gr = regionset.gene_association(organism=args.organism, promoter_length=1000,
                                    thresh_dist=500000, show_dis=True)
    regionset.replace_region_name(gr, combine=True)

    regionset.write(args.output)

elif os.path.isdir(args.bed):
    # Directory: make sure the output directory exists, then walk the input
    # tree looking for file names containing ".bed".
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    for root, dirnames, filenames in os.walk(args.bed):

        for filename in filenames:
            if ".bed" in filename:
                print(filename)
示例#48
0
    def get_promoters(self,
                      promoter_length=1000,
                      tss=0,
                      gene_set=None,
                      unmaplist=False,
                      variants=False,
                      gene_id=False,
                      regiondata=False):
        """
        Gets promoters of genes given a specific promoter length. It returns a GenomicRegionSet with such promoters.
        The ID of each gene will be put in the NAME field of each GenomicRegion.
        Each promoter includes also the coordinate of the 5' base pair, therefore each promoter actual
        length is promoter_length+1.

        *Keyword arguments:*

            - promoter_length -- The length of the promoter region.
            - tss -- Offset applied at the 5' end: added to the final coordinate on the "+" strand and
              subtracted from the initial coordinate on any other strand (default = 0).
            - gene_set -- A set of genes to narrow the search.
            - unmaplist -- If True than also return the unmappable genes list (default = False).
            - variants -- If True, query "transcript" features instead of "gene" features (default = False).
            - gene_id -- If True and no gene_set is given, name each promoter after its gene ID instead of
              its gene names (default = False).
            - regiondata -- If True and a gene_set is given, copy gene_set.values[name] into the data field
              of each promoter (default = False).

        *Return:*

            - result_grs -- A GenomicRegionSet containing the promoters.
            - unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID
              (only returned when unmaplist is True; None when no gene_set was given).
        """

        # Fetching gene names
        mapped_gene_list = None
        unmapped_gene_list = None

        if gene_set:
            mapped_gene_list, unmapped_gene_list, mapping_dict = self.fix_gene_names(
                gene_set, output_dict=True)

        # Fetching genes

        if not variants:
            target = "gene"
        else:
            target = "transcript"
        if gene_set:
            query_dictionary = {
                self.GeneField.FEATURE_TYPE: target,
                self.GeneField.GENE_ID: mapped_gene_list
            }
        else:
            query_dictionary = {self.GeneField.FEATURE_TYPE: target}

        query_annset = self.get(query_dictionary)

        # Creating GenomicRegionSet
        result_grs = GenomicRegionSet("promoters")
        for e in query_annset.gene_list:
            # The region object stored in the query result is modified in place.
            gr = e[self.GeneField.GENOMIC_REGION]
            if gr.orientation == "+":
                gr.final = gr.initial + 1 + tss
                gr.initial = gr.initial - promoter_length
            else:
                # NOTE(review): final is derived from the already shifted initial,
                # so tss moves both ends here but only one end on the "+" strand.
                gr.initial = gr.final - 1 - tss
                gr.final = gr.initial + promoter_length + 1

            if gene_set:
                # Prefer the name the caller used; fall back to the gene ID
                # when the mapping has no entry for it.
                try:
                    gr.name = mapping_dict[e[self.GeneField.GENE_ID]]
                except KeyError:
                    gr.name = e[self.GeneField.GENE_ID]
            elif gene_id:
                gr.name = e[self.GeneField.GENE_ID]
            else:
                gr.name = e[self.GeneField.GENE_NAMES]

            if gene_set and regiondata:
                gr.data = gene_set.values[gr.name]
            result_grs.add(gr)

        if unmaplist:
            return result_grs, unmapped_gene_list
        else:
            return result_grs
示例#49
0
from rgt.GenomicRegionSet import *
from rgt.ExperimentalMatrix import *
#from fisher import pvalue
import scipy.stats

# Command line:
#   <design file> <genome name> <gene file> <randomize>
#   [background peaks BED] [distance] [output directory]
outdir = ""

back = False
designFile = sys.argv[1]
genomeName = sys.argv[2]
geneFile = sys.argv[3]
randomize = int(sys.argv[4])
backGroundPeaks = False
if len(sys.argv) > 5:
    # The guard only guarantees index 5; reading index 6 here raised
    # IndexError when exactly one optional argument was given.
    backGroundPeaksName = sys.argv[5]
    backBed = GenomicRegionSet("BACK")
    backBed.read_bed(backGroundPeaksName)
    backGroundPeaks = True

distance = 50000
if len(sys.argv) > 6:
    # Parse the value; len() returned the number of characters typed.
    distance = int(sys.argv[6])

if len(sys.argv) > 7:
    outdir = sys.argv[7]

#genomeFile=anotationPath+"chrom.sizes"
#geneFile=anotationPath+"association_file.bed"

exps = ExperimentalMatrix()
exps.read(designFile)
    annotation_path = args[2]
    outputdir = args[3]
    
    
#     experimental_matrix_file = "/home/manuel/workspace/cluster_p/THOR/exp/exp23_macs2_payton/1"
#     gene_exp = "/home/manuel/workspace/cluster_p/allhoff/project_THOR/data/payton/gene_expression/CCmean.data"
#     annotation_path = "/home/manuel/workspace/cluster_h/rgtdata/hg19/"
#     outputdir = "/home/manuel/test/"
    
    exps = ExperimentalMatrix()
    exps.read(experimental_matrix_file)
    regionsets = exps.get_regionsets()
    
    genome_file = annotation_path + "/chrom.sizes"
    gene_file = annotation_path + "/association_file.bed"
    
    genes = GeneSet("Expression")
    genes.read_expression(gene_exp)
    
    for region in regionsets:
        bedNew = GenomicRegionSet("")
        [degenes, de_peak_genes, mappedGenes, totalPeaks, regionsToGenes] \
        = bedNew.filter_by_gene_association_old(region.fileName, genes.genes, gene_file, genome_file, threshDist=options.dist)
        
        [ct, labels] = averageExpression(region, genes, regionsToGenes)
        fileName = path.splitext(path.basename(region.fileName))[0]
	output(genes.cond, labels, ct, path.join(outputdir, fileName + ".txt"))
        
        

示例#51
0
"""
Updated on 22 May 2014 by Joseph
"""

#############################  Parameters    ##############################
# Both spellings of every option start with a single dash, so argparse stores
# the values under the first one: args.o, args.l, args.n and args.f.
parser = argparse.ArgumentParser(description='Return the random sequences according to the given parameters.')
parser.add_argument('-o','-organism', default= "hg19", help='Define the organism. Default: hg19')
parser.add_argument('-l','-length', type=int, help='Define the length of each sequence.')
parser.add_argument('-n','-number', type=int, help='Define the number of random regions.')
parser.add_argument('-f','-filter', default=None, help='Given the path to the BED file as the filter.')

args = parser.parse_args()

# Setup the entries
# A single template region of the requested length drives the sampling.
region = GenomicRegion("sample", initial=0, final=args.l)
template = GenomicRegionSet("tamplate")
template.add(region)

# NOTE(review): bed_dir is not defined in the visible part of this script;
# presumably it is set elsewhere -- confirm.
if not os.path.exists(bed_dir):
    os.makedirs(bed_dir)

# Random region
# Use the organism chosen with -o (default "hg19") instead of a hard-coded
# "hg19", so the random regions match the genome data loaded below.
result = template.random_regions(organism=args.o, total_size=args.n, multiply_factor=0,
                                 overlap_result=True, overlap_input=True,
                                 chrom_X=False, chrom_M=False, filter_path=args.f)
result.write(os.path.join(bed_dir, "00total.bed"))
chrom = GenomicRegionSet("chrom")
chrom.get_genome_data(organism=args.o, chrom_X=False, chrom_M=False)

# Names of the chromosomes returned by get_genome_data().
chrom_list = [r.chrom for r in chrom.sequences]
        
示例#52
0
    
    
#     experimental_matrix_file = "/home/manuel/workspace/cluster_p/THOR/exp/exp23_macs2_payton/1"
#     gene_exp = "/home/manuel/workspace/cluster_p/allhoff/project_THOR/data/payton/gene_expression/CCmean.data"
#     annotation_path = "/home/manuel/workspace/cluster_h/rgtdata/hg19/"
#     outputdir = "/home/manuel/test/"
    
    exps = ExperimentalMatrix()
    exps.read(experimental_matrix_file)
    regionsets = exps.get_regionsets()
    
    genome_file = annotation_path + "/chrom.sizes"
    gene_file = annotation_path + "/association_file.bed"
    
    genes = GeneSet("Expression")
    genes.read_expression(gene_exp)
    
    for region in regionsets:
        bedNew = GenomicRegionSet("")
        [degenes, de_peak_genes, mappedGenes, totalPeaks, regionsToGenes] \
        = bedNew.filter_by_gene_association_old(region.fileName, genes.genes, gene_file, genome_file)
        
        [ct, labels] = averageExpression(region, genes, regionsToGenes)
        aux = region.fileName.split("/")
        fileName = aux[-1]
        fileName = fileName.split(".")
        output(genes.cond, labels, ct, outputdir + "/" + fileName[0] + ".txt")
        
        

示例#53
0
 def test_filter_tts(self):
     """Call count_tts() on a chr2 window; the result is not asserted."""
     interactions = RNADNAInteractionSet(organism="hg19", filename=sample_txp)
     targets = GenomicRegionSet("g")
     targets.add(GenomicRegion(chrom="chr2", initial=74000000, final=75000000))
     # NOTE(review): despite the test name, the method exercised here is
     # count_tts(), and its return value is never checked.
     interactions.count_tts(targets)
示例#54
0
import unittest
from rgt.GenomicRegionSet import *
from rgt.CoverageSet import CoverageSet

# Module-level fixtures shared by the test case below: two 1 kb regions on
# chr1, one per strand.
regions = GenomicRegionSet("test")
regions.add(GenomicRegion("chr1", 10000, 11000, "+"))
regions.add(GenomicRegion("chr1", 20000, 21000, "-"))

# Coverage container built over the two regions above.
cov = CoverageSet("coverage", regions)

# NOTE(review): absolute, machine-specific input paths; bedfile is not used
# in the visible code.
bamfile = "/projects/lncRNA/local/cardio/total_rna/bam/d4_1.bam"
bedfile = "~/rgtdata/hg38/genes_hg38.bed"

class CoverageSet_Test(unittest.TestCase):
    """Test case for CoverageSet using the module-level ``cov`` and ``bamfile``."""
    def coverage_from_genomicset(self):
        """Compute coverage from ``bamfile``, print it and expect the value 4.

        NOTE(review): the method name does not start with ``test``, so the
        default unittest loader will not collect it -- confirm whether that
        is intentional.
        """
        cov.coverage_from_genomicset(bamfile)
        print(cov.coverage)
        self.assertEqual(cov.coverage, 4)
示例#55
0
from rgt.GenomicRegionSet import *
from rgt.ExperimentalMatrix import *
#from fisher import pvalue
import scipy.stats


# Command line:
#   <design file> <annotation path> <randomize>
#   [background peaks BED] [distance] [output directory]
outdir = ""

back = False
designFile = sys.argv[1]
anotationPath = sys.argv[2]
randomize = int(sys.argv[3])
backGroundPeaks = False
if len(sys.argv) > 4:
    backGroundPeaksName = sys.argv[4]
    backBed = GenomicRegionSet("BACK")
    backBed.read_bed(backGroundPeaksName)
    backGroundPeaks = True

distance = 50000
if len(sys.argv) > 5:
    # Parse the value; len() returned the number of characters typed.
    distance = int(sys.argv[5])

if len(sys.argv) > 6:
    outdir = sys.argv[6]


# The annotation path is concatenated as-is, so it must end with a separator.
genomeFile = anotationPath + "chrom.sizes"
geneFile = anotationPath + "association_file.bed"

exps = ExperimentalMatrix()
示例#56
0
from rgt.ExperimentalMatrix import *
#from fisher import pvalue
import scipy.stats


# Command line:
#   <design file> <genome name> <gene file> <randomize>
#   [background peaks BED] [distance] [output directory]
outdir = ""

back = False
designFile = sys.argv[1]
genomeName = sys.argv[2]
geneFile = sys.argv[3]
randomize = int(sys.argv[4])
backGroundPeaks = False
if len(sys.argv) > 5:
    # The guard only guarantees index 5; reading index 6 here raised
    # IndexError when exactly one optional argument was given.
    backGroundPeaksName = sys.argv[5]
    backBed = GenomicRegionSet("BACK")
    backBed.read(backGroundPeaksName)
    backGroundPeaks = True

distance = 50000
if len(sys.argv) > 6:
    # Parse the value; len() returned the number of characters typed.
    distance = int(sys.argv[6])

if len(sys.argv) > 7:
    outdir = sys.argv[7]


#genomeFile=anotationPath+"chrom.sizes"
#geneFile=anotationPath+"association_file.bed"

exps = ExperimentalMatrix()