def test_len(self):
    """Check that Gene.len() counts both the start and the end position."""
    # Positions 1..10 inclusive cover ten bases:
    #  1234567890
    #  ==========
    ten_base_gene = Gene("chr1", "a", "b", "1", "10", "+")
    self.assertEqual(ten_base_gene.len(), 10)
    # Positions 10..20 inclusive cover eleven bases:
    #  01234567890
    #  ===========
    eleven_base_gene = Gene("chr1", "a", "b", "10", "20", "+")
    self.assertEqual(eleven_base_gene.len(), 11)
def test_set_type_of_5_prime_tss(self):
    """Check the TSS types assigned by _set_type_of_5_prime_tss().

    Two sense TSS are registered in the 5' region of the same gene with
    distances 10 and 20. After the call, the one recorded with distance
    10 must be typed as primary and the other one as secondary.
    """
    mapper = self.tss_gene_mapper
    mapper.genes_and_5_prime_tss = {}
    mapper.tss_and_hit_genes = {}

    def five_prime_sense_hit(distance):
        # Hit record whose TSS type has not been decided yet.
        return {
            "location": tssgenemapper.loc_5_prime_str,
            "orientation": tssgenemapper.sense_str,
            "distance": distance,
            "tss_type": None,
        }

    tss_primary = TSS("genomeX", 40, "+")
    tss_secondary = TSS("genomeX", 30, "+")
    gene = Gene("genomeX", "g", "g", 50, 100, "+")
    mapper.genes_and_5_prime_tss[gene] = [
        [10, tss_primary],
        [20, tss_secondary],
    ]
    mapper.tss_and_hit_genes[tss_primary] = {}
    mapper.tss_and_hit_genes[tss_secondary] = {}
    mapper.tss_and_hit_genes[tss_primary][gene] = five_prime_sense_hit(10)
    mapper.tss_and_hit_genes[tss_secondary][gene] = five_prime_sense_hit(20)

    mapper._set_type_of_5_prime_tss()

    expected_types = (
        (tss_primary, tssgenemapper.primary_str),
        (tss_secondary, tssgenemapper.secondary_str),
    )
    for tss, expected_type in expected_types:
        self.assertEqual(
            self.tss_gene_mapper.tss_and_hit_genes[tss][gene]["tss_type"],
            expected_type)
def test_init_without_extra(self):
    """Build a Gene without ``extra`` and verify each attribute."""
    gene = Gene("chr1", "gene_2342", "hacY", "3", "83", "+")
    expected_attributes = (
        ("seq_id", "chr1"),
        ("gene_id", "gene_2342"),
        ("name", "hacY"),
        # Coordinates are handed over as strings but compared as ints.
        ("start", 3),
        ("end", 83),
        ("strand", "+"),
    )
    for attribute, expected in expected_attributes:
        self.assertEqual(getattr(gene, attribute), expected)
    # No ``extra`` argument was given, so no such attribute may exist.
    self.assertFalse(hasattr(gene, "extra"))
def test_init_with_extra(self):
    """Build a Gene with the ``extra`` keyword and verify each attribute."""
    minus_gene = Gene(
        "chr2", "gene_0005", "hacZ", "15", "30", "-", extra="mope")
    self.assertEqual(minus_gene.seq_id, "chr2")
    self.assertEqual(minus_gene.gene_id, "gene_0005")
    self.assertEqual(minus_gene.name, "hacZ")
    # Coordinates are handed over as strings but compared as ints.
    self.assertEqual(minus_gene.start, 15)
    self.assertEqual(minus_gene.end, 30)
    self.assertEqual(minus_gene.strand, "-")
    # The keyword argument must be available under the same name.
    self.assertEqual(minus_gene.extra, "mope")
def create_gene_list(self):
    """Fill ``self.gene_list`` with one Gene per "gene" entry of the GFF.

    Entries are read from ``self.gff_fh``; entries with any other
    feature type are ignored. The ``locus_tag`` attribute is used as
    gene id and the ``Name`` attribute as gene name.
    """
    self.gene_list = []
    parser = Gff3Parser()
    for gff_entry in parser.entries(self.gff_fh):
        if gff_entry.feature == "gene":
            attributes = gff_entry.attributes
            gene = Gene(
                gff_entry.seq_id,
                attributes["locus_tag"],
                attributes["Name"],
                gff_entry.start,
                gff_entry.end,
                gff_entry.strand)
            self.gene_list.append(gene)
def _try_gene_merge(self, gene, row):
    """Merge ``gene`` with the gene stored in ``row`` where appropriate.

    Nothing happens unless the two genes overlap sufficiently, and
    nothing happens if they are identical. Otherwise the row is removed
    and a gene spanning both is stored. Its id and name are
    "<stored id>_merged_with_<new id>"; sequence id and strand are taken
    from ``gene``.
    """
    stored_gene = self._sql_row_to_gene(row)
    # The helpers are compared against True explicitly, so any other
    # return value counts as "no".
    if (self._have_sufficient_overlap(gene, stored_gene) is not True
            or self._genes_are_identical(gene, stored_gene) is True):
        return
    merged_start = min(gene.start, stored_gene.start)
    merged_end = max(gene.end, stored_gene.end)
    merged_id = "%s_merged_with_%s" % (stored_gene.gene_id, gene.gene_id)
    self._remove_row(row)
    merged_gene = Gene(
        gene.seq_id, merged_id, merged_id, merged_start, merged_end,
        gene.strand)
    self._store_gene_in_db(merged_gene)
def test_len(self):
    """Check that Gene.len() includes both boundary positions."""
    cases = (
        #  1234567890
        #  ==========
        ("1", "10", 10),
        #  01234567890
        #  ===========
        ("10", "20", 11),
    )
    for start, end, expected_length in cases:
        gene = Gene("chr1", "a", "b", start, end, "+")
        self.assertEqual(gene.len(), expected_length)
def _try_multi_gene_merge(self, gene, rows):
    """Merge ``gene`` with all sufficiently overlapping genes in ``rows``.

    The rows of all genes that overlap sufficiently are removed and a
    single gene covering ``gene`` and all of them is stored. Its id and
    name are the ids of the merged genes followed by the id of ``gene``,
    joined with "_merged_with_". If no stored gene qualifies, ``gene``
    is stored with its own id and coordinates.
    """
    candidates = [self._sql_row_to_gene(row) for row in rows]
    merge_partners = []
    obsolete_rows = []
    for candidate, row in zip(candidates, rows):
        # Compared against True explicitly; anything else means "no".
        if self._have_sufficient_overlap(gene, candidate) is True:
            merge_partners.append(candidate)
            obsolete_rows.append(row)
    for obsolete_row in obsolete_rows:
        self._remove_row(obsolete_row)
    # The new gene itself always takes part in the span calculation.
    starts = [gene.start] + [partner.start for partner in merge_partners]
    ends = [gene.end] + [partner.end for partner in merge_partners]
    merged_id = "_merged_with_".join(
        [partner.gene_id for partner in merge_partners] + [gene.gene_id])
    merged_gene = Gene(
        gene.seq_id, merged_id, merged_id, min(starts), max(ends),
        gene.strand)
    self._store_gene_in_db(merged_gene)
def test_init_start_end_sorting(self):
    """Coordinates given in descending order are stored in ascending order."""
    # "1000" is passed as start and "5" as end on purpose.
    swapped_gene = Gene("chr1", "gene_2342", "hacY", "1000", "5", "+")
    self.assertEqual(swapped_gene.start, 5)
    self.assertEqual(swapped_gene.end, 1000)
def test_has_5_prime_association_10(self):
    """None - the TSS lies in the 3' region of a minus-strand gene."""
    tss_in_3_prime = TSS("genomeX", 5, "-")
    minus_gene = Gene("genomeX", "g", "g", 10, 100, "-")
    association = self.tss_gene_mapper._has_5_prime_association(
        tss_in_3_prime, minus_gene)
    self.assertEqual(association, None)
def test_has_5_prime_association_9(self):
    """True - the TSS lies in the 5' region of a minus-strand gene."""
    tss_in_5_prime = TSS("genomeX", 105, "-")
    minus_gene = Gene("genomeX", "g", "g", 10, 100, "-")
    association = self.tss_gene_mapper._has_5_prime_association(
        tss_in_5_prime, minus_gene)
    self.assertEqual(association, True)
def test_has_5_prime_association_8(self):
    """None - a plus-strand TSS is antisense to a minus-strand gene."""
    plus_tss = TSS("genomeX", 600, "+")
    minus_gene = Gene("genomeX", "g", "g", 10, 100, "-")
    association = self.tss_gene_mapper._has_5_prime_association(
        plus_tss, minus_gene)
    self.assertEqual(association, None)
def test_has_antisense_association_8(self):
    """True - a minus-strand TSS at 200 is antisense to a plus gene 10-100."""
    minus_tss = TSS("genomeX", 200, "-")
    plus_gene = Gene("genomeX", "g", "g", 10, 100, "+")
    association = self.tss_gene_mapper._has_antisense_association(
        minus_tss, plus_gene)
    self.assertEqual(association, True)
def _merge_genes(self, gene_1, gene_2):
    """Return a new Gene that covers the combined span of two genes.

    The merged gene starts at the smaller of the two start positions
    and ends at the larger of the two end positions. This makes the
    result independent of the argument order and keeps it correct when
    one gene lies completely inside the other. (Using only
    ``gene_1.start`` and ``gene_2.end`` would cut off part of the
    region in both of these cases.)

    Name and gene id are the corresponding values of ``gene_1`` and
    ``gene_2`` joined with "_merged_with_"; sequence id and strand are
    taken from ``gene_1``.
    """
    start = min(gene_1.start, gene_2.start)
    end = max(gene_1.end, gene_2.end)
    name = "_merged_with_".join([gene_1.name, gene_2.name])
    gene_id = "_merged_with_".join([gene_1.gene_id, gene_2.gene_id])
    return Gene(gene_1.seq_id, gene_id, name, start, end, gene_1.strand)
def test_5_prime_dist_4(self):
    """A minus TSS at 120 has distance 20 to a minus gene spanning 10-100."""
    minus_tss = TSS("genomeX", 120, "-")
    minus_gene = Gene("genomeX", "g", "g", 10, 100, "-")
    distance = self.tss_gene_mapper._5_prime_dist(minus_tss, minus_gene)
    self.assertEqual(distance, 20)
def _sql_row_to_gene(self, row):
    """Build a Gene from a database row.

    Column 1 is used for both gene id and name, column 2 is the
    sequence id, column 3 the strand, and columns 4 and 5 the start and
    end positions.
    """
    gene_id = row[1]
    seq_id = row[2]
    strand = row[3]
    start = row[4]
    end = row[5]
    return Gene(seq_id, gene_id, gene_id, start, end, strand)
def test_has_internal_association_8(self):
    """None - TSS on the first base of a minus gene => 5' leaderless."""
    tss_on_first_base = TSS("genomeX", 100, "-")
    minus_gene = Gene("genomeX", "g", "g", 10, 100, "-")
    association = self.tss_gene_mapper._has_internal_association(
        tss_on_first_base, minus_gene)
    self.assertEqual(association, None)
def test_has_internal_association_7(self):
    """True - TSS on the last base of a minus-strand gene."""
    tss_on_last_base = TSS("genomeX", 10, "-")
    minus_gene = Gene("genomeX", "g", "g", 10, 100, "-")
    association = self.tss_gene_mapper._has_internal_association(
        tss_on_last_base, minus_gene)
    self.assertEqual(association, True)
def test_has_internal_association_3(self):
    """None - the TSS lies in the 3' region of a plus-strand gene."""
    tss_in_3_prime = TSS("genomeX", 150, "+")
    plus_gene = Gene("genomeX", "g", "g", 10, 100, "+")
    association = self.tss_gene_mapper._has_internal_association(
        tss_in_3_prime, plus_gene)
    self.assertEqual(association, None)
def test_has_internal_association_1(self):
    """True - the TSS lies inside a plus-strand gene."""
    internal_tss = TSS("genomeX", 20, "+")
    plus_gene = Gene("genomeX", "g", "g", 10, 100, "+")
    association = self.tss_gene_mapper._has_internal_association(
        internal_tss, plus_gene)
    self.assertEqual(association, True)
def test_has_antisense_association_10(self):
    """None - a minus-strand TSS at 201 is out of range of a plus gene."""
    distant_tss = TSS("genomeX", 201, "-")
    plus_gene = Gene("genomeX", "g", "g", 10, 100, "+")
    association = self.tss_gene_mapper._has_antisense_association(
        distant_tss, plus_gene)
    self.assertEqual(association, None)
def test_has_5_prime_association_3(self):
    """True - leaderless TSS on the first base of a plus-strand gene."""
    leaderless_tss = TSS("genomeX", 10, "+")
    plus_gene = Gene("genomeX", "g", "g", 10, 100, "+")
    association = self.tss_gene_mapper._has_5_prime_association(
        leaderless_tss, plus_gene)
    self.assertEqual(association, True)
def test_has_5_prime_association_4(self):
    """None - the TSS is in the 5' region but too far from the gene."""
    distant_tss = TSS("genomeX", 10, "+")
    plus_gene = Gene("genomeX", "g", "g", 1000, 1100, "+")
    association = self.tss_gene_mapper._has_5_prime_association(
        distant_tss, plus_gene)
    self.assertEqual(association, None)
parser = argparse.ArgumentParser(description=__description__) parser.add_argument("gff_file", type=argparse.FileType("r")) parser.add_argument("output_file", type=argparse.FileType("w")) parser.add_argument("--margin", type=int, default=0) parser.add_argument("--plus_only", default=False, action="store_true") args = parser.parse_args() # Build gene list gene_list = [] gff_parser = Gff3Parser() region_entry = None for entry in gff_parser.entries(args.gff_file): if entry.feature == "region": region_entry = entry continue gene_list.append(Gene( entry.seq_id, "", "", entry.start, entry.end, entry.strand)) # Find IGRs and generate GFF file igr_finder = IGRFinder() args.output_file.write("##gff-version 3\n") strands = ["+", "-"] if args.plus_only is True: strands = ["+"] for start, end in igr_finder.find_igrs(gene_list, region_entry.end): start = start + args.margin end = end - args.margin if end <= start: continue for strand in strands:
def test_has_antisense_association_7(self):
    """True - the TSS is antisense to the gene and inside the range."""
    minus_tss = TSS("genomeX", 900, "-")
    plus_gene = Gene("genomeX", "g", "g", 1000, 1100, "+")
    association = self.tss_gene_mapper._has_antisense_association(
        minus_tss, plus_gene)
    self.assertEqual(association, True)