def test_novelty_merging(self):
    """Transcript novelty labels should merge sensibly across datasets."""
    a_gtf = 'input_files/annot.gtf'
    b_gtf = 'input_files/annot_2.gtf'

    # Case 1: only dataset 'a' carries novelty labels. Transcripts that
    # appear only in dataset 'b' should come out 'Undefined'.
    print('Testing if novelty merging works...')
    sg = swan.SwanGraph()
    sg.add_dataset('a', a_gtf, include_isms=True)
    sg.t_df['novelty'] = ['Known', 'NIC', 'ISM']
    sg.add_dataset('b', b_gtf, include_isms=True)
    observed = sg.t_df.apply(lambda x: (x.tid, x.novelty), axis=1)
    expected = [('ENST01', 'Known'), ('ENST02', 'Undefined'),
                ('ENST08', 'Undefined'), ('ENST03', 'NIC'),
                ('ENST04', 'Undefined'), ('ENST07', 'ISM')]
    check_pairs(expected, observed)

    # Case 2: an existing 'Undefined' label should be overwritten when a
    # newly merged dataset supplies a real category.
    print('Testing merging with novelty=undefined given a new label')
    sg_b = swan.SwanGraph()
    sg_b.add_dataset('b_2', b_gtf, include_isms=True)
    sg_b.t_df['novelty'] = ['ISM', 'Antisense', 'Intergenic', 'Genomic']
    print(sg.t_df)
    print(sg_b.t_df)
    sg.merge_dfs(sg_b, 'b_2')
    expected = [('ENST01', 'Ambiguous'), ('ENST02', 'Antisense'),
                ('ENST08', 'Genomic'), ('ENST03', 'NIC'),
                ('ENST04', 'Intergenic'), ('ENST07', 'ISM')]
    observed = sg.t_df.apply(lambda x: (x.tid, x.novelty), axis=1)
    check_pairs(expected, observed)

    # Case 3: both datasets carry labels. Disagreements should become
    # 'Ambiguous' and NaNs should defer to the other dataset's label.
    ab_gtf = 'input_files/annot_3.gtf'
    sg_a = swan.SwanGraph()
    sg_a.add_dataset('a', ab_gtf, include_isms=True)
    sg_a.t_df['novelty'] = ['Known', 'Known', 'ISM', np.nan, 'NIC', 'NNC']
    sg_b = swan.SwanGraph()
    sg_b.add_dataset('b', ab_gtf, include_isms=True)
    sg_b.t_df['novelty'] = ['Known', 'ISM', np.nan, 'NIC', 'NNC', 'NNC']
    sg_a.merge_dfs(sg_b, 'b')
    observed = sg_a.t_df.apply(lambda x: (x.tid, x.novelty), axis=1)
    expected = [('ENST01', 'Known'), ('ENST02', 'Ambiguous'),
                ('ENST08', 'NNC'), ('ENST03', 'ISM'),
                ('ENST04', 'NIC'), ('ENST07', 'Ambiguous')]
    check_pairs(expected, observed)

    # Case 4: neither dataset has novelty info -> no novelty column at all.
    sg = swan.SwanGraph()
    sg.add_dataset('a', a_gtf, include_isms=True)
    sg.add_dataset('b', b_gtf, include_isms=True)
    assert 'novelty' not in sg.t_df.columns
def test_check_abundances(self):
    """check_abundances() should reject datasets lacking abundance info."""
    graph = swan.SwanGraph()
    graph.datasets = ['a', 'b', 'c']
    graph.counts = ['a_counts', 'b_counts']

    # 1: abundance that isn't there at all; also exercises bare string
    # (non-list) input.
    with pytest.raises(Exception) as err:
        graph.check_abundances('d')
    assert 'Abundance for dataset d' in str(err.value)

    # 2: dataset present in the graph but without abundance info; also
    # exercises a one-element list.
    with pytest.raises(Exception) as err:
        graph.check_abundances(['c'])
    assert 'Abundance for dataset c' in str(err.value)

    # 3: multiple datasets that all have abundance info pass silently.
    outcome = graph.check_abundances(['a', 'b'])
    assert outcome == None

    # 4: a missing dataset mixed in with valid ones still raises.
    with pytest.raises(Exception) as err:
        graph.check_abundances(['a', 'b', 'd'])
    assert 'Abundance for dataset d' in str(err.value)
def test_no_gene_name_gtf(self):
    """GTF entries without a gene_name should fall back to the gene ID."""
    sg = swan.SwanGraph()
    sg.add_dataset('test', 'input_files/Canx.gtf')
    names = sg.t_df.gname.tolist()
    ids = sg.t_df.gid.tolist()
    assert names == ids
def test_no_gene_name_db(self):
    """TALON db entries without a gene name should fall back to gene ID."""
    sg = swan.SwanGraph()
    sg.add_dataset('test', 'input_files/chr11_and_Tcf3_no_gname.db')
    names = sg.t_df.gname.tolist()
    ids = sg.t_df.gid.tolist()
    assert names == ids
def test_get_tpm_cols(self):
    """get_tpm_cols() should list TPM columns and validate dataset names.

    Bug fix: case 8 previously ended with the bare expression
    ``assert 'Abundance for dataset c'`` — a non-empty string, so the
    assert was always true and never inspected the raised exception.
    It now checks the message like the other error cases do.
    """
    a = swan.SwanGraph()
    # 1: empty graph, no datasets argument
    assert a.get_tpm_cols() == []
    # 2: one dataset in graph, no datasets argument
    a.tpm = ['a_tpm']
    assert a.get_tpm_cols() == ['a_tpm']
    # 3: > one dataset in graph, no datasets argument
    a.tpm = ['a_tpm', 'b_tpm']
    assert a.get_tpm_cols() == ['a_tpm', 'b_tpm']
    # 4: empty graph, datasets argument
    # also tests ability to handle char input
    a = swan.SwanGraph()
    with pytest.raises(Exception) as excinfo:
        a.get_tpm_cols('a')
    assert 'Abundance for dataset a' in str(excinfo.value)
    # 5: graph with one dataset and datasets argument already in graph
    a.tpm = ['a_tpm']
    a.counts = ['a_counts']
    assert a.get_tpm_cols('a') == ['a_tpm']
    # 6: graph with one dataset and datasets argument not in graph
    with pytest.raises(Exception) as excinfo:
        a.get_tpm_cols('b')
    assert 'Abundance for dataset b' in str(excinfo.value)
    # 7: graph with more than one dataset and datasets argument in graph
    a.tpm = ['a_tpm', 'b_tpm']
    a.counts = ['a_counts', 'b_counts']
    assert a.get_tpm_cols(['a', 'b']) == ['a_tpm', 'b_tpm']
    # 8: graph with more than one dataset and not all datasets in graph
    with pytest.raises(Exception) as excinfo:
        a.get_tpm_cols(['a', 'b', 'c'])
    # fixed: previously asserted the bare (always-truthy) string
    assert 'Abundance for dataset c' in str(excinfo.value)
def get_dummy_sg(special=None):
    """Build a tiny three-vertex SwanGraph fixture for unit tests.

    special: None for the default fixture, or one of
        'intron'   - turn the (0, 1) exon into an intron
        'no_locs'  - skip loc-type annotation
        'dataset'  - tag every table with a 'dataset_a' column
    """
    sg = swan.SwanGraph()

    loc_df = pd.DataFrame({'chrom': [1, 1, 1],
                           'coord': [1, 3, 2],
                           'strand': ['+', '+', '+'],
                           'vertex_id': [0, 1, 2]})
    loc_df = swan.create_dupe_index(loc_df, 'vertex_id')
    loc_df = swan.set_dupe_index(loc_df, 'vertex_id')

    edge_df = pd.DataFrame({'edge_id': [(0, 2), (0, 1), (1, 2)],
                            'v1': [0, 0, 1],
                            'v2': [2, 1, 2],
                            'edge_type': ['exon', 'exon', 'intron'],
                            'strand': ['+', '+', '+']})
    edge_df = swan.create_dupe_index(edge_df, 'edge_id')
    edge_df = swan.set_dupe_index(edge_df, 'edge_id')

    t_df = pd.DataFrame({'tid': [2, 1, 0],
                         'gid': [0, 0, 0],
                         'gname': ['0', '0', '0'],
                         'path': [[0, 1, 2], [1, 2], [0, 1]],
                         'counts_a': [0, 0, 12],
                         'counts_b': [1, 0, 14]})
    t_df = swan.create_dupe_index(t_df, 'tid')
    t_df = swan.set_dupe_index(t_df, 'tid')

    # variant: reclassify one exon as an intron
    if special == 'intron':
        edge_df.loc[(0, 1), 'edge_type'] = 'intron'

    sg.loc_df = loc_df
    sg.edge_df = edge_df
    sg.t_df = t_df

    # variant: leave loc types unassigned when requested
    if special != 'no_locs':
        sg.get_loc_types()

    # variant: mark every entry as belonging to 'dataset_a'
    if special == 'dataset':
        sg.datasets = ['dataset_a']
        sg.loc_df['dataset_a'] = True
        sg.edge_df['dataset_a'] = True
        sg.t_df['dataset_a'] = True

    return sg
def test_add_annotation(self):
    """add_annotation() should assign/refresh transcript novelty labels."""
    # annotation added to an empty graph: every transcript ends up 'Known'
    print('testing for correct novelty assignment when adding annotation')
    sg = swan.SwanGraph()
    sg.add_annotation('input_files/annot.gtf')
    n_total = len(sg.t_df.index)
    n_known = len(sg.t_df.loc[sg.t_df.novelty == 'Known'].index)
    assert n_total == n_known

    # annotation added on top of data without novelty info: annotated
    # transcripts become 'Known', unannotated ones 'Undefined'
    print('testing for correct novelty assignment when adding annotation '
          'to graph with preexisting data that does not contain novelty info')
    sg = swan.SwanGraph()
    sg.add_dataset('a', 'input_files/annot_2.gtf')
    sg.add_annotation('input_files/annot.gtf')
    expected = [('ENST01', 'Known'), ('ENST03', 'Known'),
                ('ENST07', 'Known'), ('ENST02', 'Undefined'),
                ('ENST04', 'Undefined'), ('ENST08', 'Undefined')]
    observed = sg.t_df.apply(lambda x: (x.tid, x.novelty), axis=1)
    check_pairs(expected, observed)

    # annotation added on top of data that already has novelty labels:
    # annotated transcripts become 'Known', the rest keep their labels
    print('testing for correct novelty assignment when adding annotation '
          'to graph with preexisting data that does contain novelty info')
    sg = swan.SwanGraph()
    sg.add_dataset('a', 'input_files/annot_2.gtf')
    sg.t_df['novelty'] = ['ISM', 'NNC', 'NIC', 'NIC']
    sg.add_annotation('input_files/annot.gtf')
    expected = [('ENST01', 'Known'), ('ENST03', 'Known'),
                ('ENST07', 'Known'), ('ENST02', 'NNC'),
                ('ENST04', 'NIC'), ('ENST08', 'NIC')]
    observed = sg.t_df.apply(lambda x: (x.tid, x.novelty), axis=1)
    check_pairs(expected, observed)
def test_is_empty(self):
    """is_empty() is True only while the graph holds no datasets."""
    graph = swan.SwanGraph()
    # 1: brand-new graph, no datasets (including annotation)
    print(graph.datasets)
    assert graph.is_empty() == True
    # 2: graph with exactly one dataset
    graph.datasets = ['a']
    print(graph.datasets)
    assert graph.is_empty() == False
    # 3: graph with more than one dataset
    graph.datasets = ['a', 'b']
    print(graph.datasets)
    assert graph.is_empty() == False
def test_get_dataset_cols(self):
    """get_dataset_cols() should mirror the graph's dataset list."""
    graph = swan.SwanGraph()
    # 1: empty graph, no datasets (including annotation)
    print(graph.datasets)
    assert graph.get_dataset_cols() == []
    # 2: graph with exactly one dataset
    graph.datasets = ['a']
    print(graph.datasets)
    assert graph.get_dataset_cols() == ['a']
    # 3: graph with more than one dataset
    graph.datasets = ['a', 'b']
    print(graph.datasets)
    assert graph.get_dataset_cols() == ['a', 'b']
def test_subset_on_gene(self):
    """subset_on_gene() should keep only one gene's rows in every table."""
    gid = 0
    graph = swan.SwanGraph()
    # three genes: gene 0 on chrom 1 (+), genes 1 and 2 on chrom 2 (-)
    graph.t_df = pd.DataFrame({
        'tid': [0, 1, 2, 3, 4, 5],
        'gid': [0, 0, 1, 1, 2, 2],
        'path': [[0, 1, 2], [2, 3, 4], [5, 6, 7],
                 [6, 7, 8], [9, 10, 11, 12], [9, 11, 12]]})
    graph.loc_df = pd.DataFrame({
        'vertex_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        'strand': ['+', '+', '+', '+', '+', '-', '-',
                   '-', '-', '-', '-', '-', '-'],
        'chrom': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2],
        'coord': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]})
    graph.edge_df = pd.DataFrame({
        'edge_id': [(0, 1), (1, 2), (2, 3), (3, 4), (5, 6),
                    (6, 7), (7, 8), (9, 10), (10, 11), (11, 12)]})
    graph.t_df = swan.create_dupe_index(graph.t_df, 'tid')
    graph.t_df = swan.set_dupe_index(graph.t_df, 'tid')
    graph.loc_df = swan.create_dupe_index(graph.loc_df, 'vertex_id')
    graph.loc_df = swan.set_dupe_index(graph.loc_df, 'vertex_id')
    graph.edge_df = swan.create_dupe_index(graph.edge_df, 'edge_id')
    graph.edge_df = swan.set_dupe_index(graph.edge_df, 'edge_id')

    # subset down to gene 0 and verify each table
    graph = swan.subset_on_gene(graph, gid)
    check_pairs([0, 1], graph.t_df['tid'].tolist())
    check_pairs([0, 1, 2, 3, 4], graph.loc_df['vertex_id'].tolist())
    check_pairs([(0, 1), (1, 2), (2, 3), (3, 4)],
                graph.edge_df['edge_id'].tolist())
def test_check_datasets(self):
    """check_datasets() should reject dataset names absent from the graph."""
    graph = swan.SwanGraph()
    graph.datasets = ['a', 'b']

    # 1: a dataset that's not in the graph raises
    with pytest.raises(Exception) as err:
        graph.check_datasets(['c'])
    assert "c not present in graph" in str(err.value)

    # 2: a present dataset given in string (non-list) form passes
    outcome = graph.check_datasets('a')
    assert outcome == None

    # 3: multiple datasets that are all in the graph pass
    outcome = graph.check_datasets(['a', 'b'])
    assert outcome == None

    # 4: one missing dataset mixed in with valid ones still raises
    with pytest.raises(Exception) as err:
        graph.check_datasets(['a', 'b', 'c'])
    assert 'c not present in graph' in str(err.value)
def test_plotting(self):
    """End-to-end exercise of the plotting pipeline on two hand-built genes.

    Builds a SwanGraph manually (GENE01 on the + strand of chrom 1, then
    GENE02 on the - strand of chrom 4), and drives the module-level
    plot0..plot8 helpers, which perform the actual figure assertions.
    """
    # set up testing swangraph
    sg = swan.SwanGraph()
    sg.datasets = ['annotation', 'a', 'b']
    # GENE01: 7 splice-site locations on chrom 1, + strand;
    # 'annotation'/'a'/'b' columns mark per-dataset presence
    gene1_loc_df = pd.DataFrame({
        'chrom': [1, 1, 1, 1, 1, 1, 1],
        'coord': [5, 10, 15, 20, 25, 30, 35],
        'strand': ['+', '+', '+', '+', '+', '+', '+'],
        'vertex_id': [0, 1, 2, 3, 4, 5, 6],
        'annotation': [True, True, True, True, True, True, False],
        'a': [True, True, False, True, True, True, True],
        'b': [False, False, True, True, True, True, False]
    })
    # four GENE01 transcripts; ENST04 is unannotated (novel)
    gene1_t_df = pd.DataFrame({
        'gname': ['GENE01', 'GENE01', 'GENE01', 'GENE01'],
        'gid': ['ENSG01', 'ENSG01', 'ENSG01', 'ENSG01'],
        'tid': ['ENST01', 'ENST02', 'ENST03', 'ENST04'],
        'path': [[0, 1, 2, 3, 4, 5], [0, 3, 4, 5],
                 [0, 1, 4, 6], [2, 3, 4, 5]],
        'annotation': [True, True, True, False],
        'a': [False, True, True, False],
        'b': [False, False, False, True]
    })
    gene1_edge_df = pd.DataFrame({
        'v1': [0, 1, 2, 3, 4, 0, 4, 1],
        'v2': [1, 2, 3, 4, 5, 3, 6, 4],
        'edge_type': ['exon', 'intron', 'exon', 'intron',
                      'exon', 'exon', 'exon', 'intron'],
        'annotation': [True, True, True, True, True, True, False, False],
        'a': [True, False, False, True, True, True, True, True],
        'b': [False, False, True, True, True, False, False, False]
    })
    # edge ids are (v1, v2) tuples, matching the dupe-index convention
    gene1_edge_df['edge_id'] = gene1_edge_df.apply(
        lambda x: (x.v1, x.v2), axis=1)

    sg.loc_df = gene1_loc_df
    sg.edge_df = gene1_edge_df
    sg.t_df = gene1_t_df
    sg.loc_df = create_dupe_index(sg.loc_df, 'vertex_id')
    sg.loc_df = set_dupe_index(sg.loc_df, 'vertex_id')
    sg.edge_df = create_dupe_index(sg.edge_df, 'edge_id')
    sg.edge_df = set_dupe_index(sg.edge_df, 'edge_id')
    sg.t_df = create_dupe_index(sg.t_df, 'tid')
    sg.t_df = set_dupe_index(sg.t_df, 'tid')
    sg.get_loc_types()

    # testing
    gene1_tids = gene1_t_df.tid.tolist()
    gene1_locs = gene1_loc_df.vertex_id.tolist()
    gene1_edges = gene1_edge_df.edge_id.tolist()

    # 0th plot - gene summary graph of ENSG01
    # (dataset 'b' temporarily removed for this plot)
    sg.datasets = ['annotation', 'a']
    sg = plot0(sg, gene1_tids, gene1_locs, gene1_edges)

    # GENE02: mirror of GENE01 on chrom 4, - strand; vertex ids 7-13
    # (shifted from 0-6 so they don't collide with GENE01's)
    gene2_loc_df = pd.DataFrame({
        'chrom': [4, 4, 4, 4, 4, 4, 4],
        'coord': [35, 30, 25, 20, 15, 10, 5],
        'strand': ['-', '-', '-', '-', '-', '-', '-'],
        # 'vertex_id': [0, 1, 2, 3, 4, 5, 6],
        'vertex_id': [7, 8, 9, 10, 11, 12, 13],
        'annotation': [True, True, True, True, True, True, False],
        'a': [True, True, False, True, True, True, True],
        'b': [False, False, True, True, True, True, False]
    })
    gene2_t_df = pd.DataFrame({
        'gname': ['GENE02', 'GENE02', 'GENE02', 'GENE02'],
        'gid': ['ENSG02', 'ENSG02', 'ENSG02', 'ENSG02'],
        'tid': ['ENST05', 'ENST06', 'ENST07', 'ENST08'],
        'path': [[7, 8, 9, 10, 11, 12], [7, 10, 11, 12],
                 [7, 8, 11, 13], [9, 10, 11, 12]],
        'annotation': [True, True, True, False],
        'a': [False, True, True, False],
        'b': [False, False, False, True]
    })
    gene2_edge_df = pd.DataFrame({
        'v1': [7, 8, 9, 10, 11, 7, 11, 8],
        'v2': [8, 9, 10, 11, 12, 10, 13, 11],
        'edge_type': ['exon', 'intron', 'exon', 'intron',
                      'exon', 'exon', 'exon', 'intron'],
        'annotation': [True, True, True, True, True, True, False, False],
        'a': [True, False, False, True, True, True, True, True],
        'b': [False, False, True, True, True, False, False, False]
    })
    gene2_edge_df['edge_id'] = gene2_edge_df.apply(
        lambda x: (x.v1, x.v2), axis=1)

    # rebuild the graph with both genes present
    sg.loc_df = pd.concat([gene1_loc_df, gene2_loc_df])
    sg.edge_df = pd.concat([gene1_edge_df, gene2_edge_df])
    sg.t_df = pd.concat([gene1_t_df, gene2_t_df])
    sg.loc_df = create_dupe_index(sg.loc_df, 'vertex_id')
    sg.loc_df = set_dupe_index(sg.loc_df, 'vertex_id')
    sg.edge_df = create_dupe_index(sg.edge_df, 'edge_id')
    sg.edge_df = set_dupe_index(sg.edge_df, 'edge_id')
    sg.t_df = create_dupe_index(sg.t_df, 'tid')
    sg.t_df = set_dupe_index(sg.t_df, 'tid')
    sg.get_loc_types()

    # testing
    gene2_tids = gene2_t_df.tid.tolist()
    gene2_locs = gene2_loc_df.vertex_id.tolist()
    gene2_edges = gene2_edge_df.edge_id.tolist()

    # remake the same plot and force it to update
    # NOTE(review): gene2_tids is passed with gene1_locs/gene1_edges here
    # (and again in plot6/plot8 below) — looks intentional for the
    # "force update" scenario, but confirm against the plot helpers.
    sg.datasets = ['annotation', 'a', 'b']
    sg = plot0_5(sg, gene2_tids, gene1_locs, gene1_edges)
    # first plot - gene summary graph of ENSG01
    sg = plot1(sg, gene1_tids, gene1_locs, gene1_edges)
    # plot a transcript through the same gene
    sg = plot2(sg, gene1_tids, gene1_locs, gene1_edges)
    # make sure we are doing the right thing after plotting ENST01
    # after plotting it as a browser image
    sg = plot3(sg, gene1_tids, gene1_locs, gene1_edges)
    # plot the same transcript with indicate_novel
    sg = plot4(sg, gene1_tids, gene1_locs, gene1_edges)
    # plot a different transcript but change the indicate opt
    sg = plot5(sg, gene1_tids, gene1_locs, gene1_edges)
    # plot a new gene and use indicate_dataset
    sg = plot6(sg, gene2_tids, gene1_locs, gene1_edges)
    # plot a transcript from the other gene using browser
    sg = plot7(sg, gene1_tids, gene1_locs, gene1_edges)
    # plot a transcript from the other gene using
    # indicate_dataset b
    sg = plot8(sg, gene2_tids, gene1_locs, gene1_edges)
def gen_toy_sg():
    """Return a SwanGraph with the toy annotation loaded as dataset 'a'."""
    toy = swan.SwanGraph()
    toy.add_dataset('a', 'input_files/annot.gtf')
    return toy
def test_merge_sgs(self):
    """Merging two GTF datasets should union and re-index all three dfs.

    Bug fixes relative to the original:
    * the chromosome check printed the test list (`chrs`) where the
      control list was intended, and asserted the tautology
      `control == control`; it now prints `control` and asserts
      `chrs == control` like every other check in this test.
    """
    a_gtf = 'input_files/annot.gtf'
    b_gtf = 'input_files/annot_2.gtf'
    sg = swan.SwanGraph()
    sg.add_dataset('a', a_gtf, include_isms=True)
    sg.add_dataset('b', b_gtf, include_isms=True)
    print(sg.loc_df.head())
    print(sg.edge_df.head())
    print(sg.t_df.head())

    # check that the format of dfs are ok
    assert sg.loc_df.index.names == ['vertex_id']
    control = ['coord', 'chrom', 'strand', 'a', 'b', 'vertex_id',
               'internal', 'TSS', 'TES']
    test = sg.loc_df.columns.tolist()
    check_pairs(control, test)

    assert sg.edge_df.index.names == ['edge_id']
    control = ['v1', 'v2', 'edge_type', 'strand', 'a', 'b', 'edge_id']
    test = sg.edge_df.columns.tolist()
    check_pairs(control, test)

    assert sg.t_df.index.names == ['tid']
    control = ['tid', 'gid', 'gname', 'path', 'a', 'b']
    test = sg.t_df.columns.tolist()
    check_pairs(control, test)

    # test that loc_df merging happened correctly
    # query chr, coord, strand, a and b columns
    chrs = sg.loc_df['chrom'].tolist()
    control = [1, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 1, 1, 1, 1, 1, 4, 4]
    print('test chrs: ')
    print(chrs)
    print('control chrs: ')
    print(control)  # fixed: previously re-printed `chrs`
    assert chrs == control  # fixed: previously `control == control`

    coords = sg.loc_df['coord'].tolist()
    control = [1, 90, 100, 500, 600, 900, 1000, 1, 10, 15, 20,
               2000, 1500, 1000, 900, 800, 4000, 1000]
    print('test coords: ')
    print(coords)
    print('control coords: ')
    print(control)
    assert coords == control

    strand = sg.loc_df['strand'].tolist()
    control = ['+', '+', '+', '+', '+', '+', '+', '+', '+', '+', '+',
               '-', '-', '-', '-', '-', '-', '-']
    print('test strands: ')
    print(strand)
    print('control strands: ')
    print(control)
    assert strand == control

    a = sg.loc_df['a'].tolist()
    control = [bool(i) for i in
               [1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]]
    print('test a presence: ')
    print(a)
    print('control a presence: ')
    print(control)
    assert a == control

    b = sg.loc_df['b'].tolist()
    control = [bool(i) for i in
               [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0]]
    print('test b presence: ')
    print(b)
    print('control b presence: ')
    print(control)
    assert b == control

    # test that edge_df merging and id mapping happened correctly
    # query edge_id, edge_type, a and b columns
    edge_id = sg.edge_df['edge_id'].tolist()
    control = [(0, 1), (0, 2), (1, 3), (2, 3), (3, 4), (4, 5), (5, 6),
               (7, 8), (8, 9), (9, 10), (11, 12), (12, 13), (13, 14),
               (13, 15), (16, 17)]
    print('test edge_ids: ')
    print(edge_id)
    print('control edge_ids')
    print(control)
    assert edge_id == control

    edge_type = sg.edge_df['edge_type'].tolist()
    control = ['exon', 'exon', 'intron', 'intron', 'exon', 'intron',
               'exon', 'exon', 'intron', 'exon', 'exon', 'intron',
               'exon', 'exon', 'exon']
    print('test edge_types: ')
    print(edge_type)
    print('control edge_types: ')
    print(control)
    assert edge_type == control

    a = sg.edge_df['a'].tolist()
    control = [bool(i) for i in
               [0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1]]
    print('test a presence: ')
    print(a)
    print('control a presence: ')
    print(control)
    assert a == control

    b = sg.edge_df['b'].tolist()
    control = [bool(i) for i in
               [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0]]
    print('test b presence: ')
    print(b)
    print('control b presence: ')
    print(control)
    assert b == control

    # test that t_df merging and id mapping happened correctly
    # query tid, path, a and b columns
    tid = sg.t_df['tid'].tolist()
    control = ['ENST01', 'ENST02', 'ENST03', 'ENST04', 'ENST07', 'ENST08']
    print('test tids: ')
    print(tid)
    print('control tids: ')
    print(control)
    assert tid == control

    paths = [tuple(path) for path in sg.t_df['path'].tolist()]
    control = [(0, 2, 3, 4, 5, 6), (0, 1, 3, 4, 5, 6), (11, 12, 13, 14),
               (11, 12, 13, 15), (16, 17), (7, 8, 9, 10)]
    print('test paths: ')
    print(paths)
    print('control paths: ')
    print(control)
    assert paths == control

    a = sg.t_df['a'].tolist()
    control = [bool(i) for i in [1, 0, 1, 0, 1, 0]]
    print('test a presence: ')
    print(a)
    print('control a presence: ')
    print(control)
    assert a == control

    b = sg.t_df['b'].tolist()
    control = [bool(i) for i in [1, 1, 0, 1, 0, 1]]
    print('test b presence: ')
    print(b)
    print('control b presence: ')
    print(control)
    assert b == control
def process_gtf():
    """Return a SwanGraph built from the malformed-entries test GTF."""
    graph = swan.SwanGraph()
    graph.add_dataset('test', 'input_files/weird_gtf_entries.gtf')
    return graph
# Integration script: build a SwanGraph from TALON GTFs plus abundance
# info, round-trip it through save/load, then run the differential
# expression and isoform-switching analyses.
import swan_vis as swan

# input files: filtered TALON abundance matrix, GENCODE v29 reference,
# and per-replicate TALON GTFs for HepG2 and HFFc6
# NOTE(review): ref_gtf is an absolute user-specific path — confirm it is
# meant to be hard-coded here.
ab_file = 'all_talon_abundance_filtered.tsv'
ref_gtf = '/Users/fairliereese/mortazavi_lab/ref/gencode.v29/gencode.v29.annotation.gtf'
hep_1_gtf = 'hepg2_1_talon.gtf'
hep_2_gtf = 'hepg2_2_talon.gtf'
hff_1_gtf = 'hffc6_1_talon.gtf'
hff_2_gtf = 'hffc6_2_talon.gtf'
hff_3_gtf = 'hffc6_3_talon.gtf'

# adding data to the swangraph
sg = swan.SwanGraph()
sg.add_annotation(ref_gtf)
sg.add_dataset('HepG2_1', hep_1_gtf,
               counts_file=ab_file,
               count_cols='hepg2_1')
sg.add_dataset('HepG2_2', hep_2_gtf,
               counts_file=ab_file,
               count_cols='hepg2_2')
sg.add_dataset('HFFc6_1', hff_1_gtf,
               counts_file=ab_file,
               count_cols='hffc6_1')
sg.add_dataset('HFFc6_2', hff_2_gtf,
               counts_file=ab_file,
               count_cols='hffc6_2')
sg.add_dataset('HFFc6_3', hff_3_gtf,
               counts_file=ab_file,
               count_cols='hffc6_3')

# save the graph, then reload it from the pickle to verify round-tripping
sg.save_graph('swan')
sg = swan.SwanGraph('swan.p')

# de gene and transcript tests: HepG2 replicates vs HFFc6 replicates
dataset_groups = [['HepG2_1', 'HepG2_2'],
                  ['HFFc6_1', 'HFFc6_2', 'HFFc6_3']]
sg.de_gene_test(dataset_groups)
sg.de_transcript_test(dataset_groups)

de_gids, _ = sg.get_de_genes()
print('Found {} differentially expressed genes'.format(len(de_gids)))
de_tids, _ = sg.get_de_transcripts()
print('Found {} differentially expressed transcripts'.format(len(de_tids)))
is_gids, _ = sg.find_isoform_switching_genes()
print('Found {} isoform switching genes'.format(len(is_gids)))
def test_num_novel_known_isoforms(self):
    """find_genes_with_novel_isoforms() should count known/novel models."""
    sg = swan.SwanGraph()
    sg.t_df = pd.DataFrame({
        'tid': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        'gid': [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3],
        'annotation': [True, True, True, True, False, True, True, False,
                       False, False, True, False, False, True, True, True],
        'a': [False, True, True, False, True, False, False, True,
              True, False, False, True, False, False, False, False],
        'b': [False, False, True, True, True, False, True, False,
              True, True, True, False, True, False, False, False]
    })
    sg.t_df = swan.create_dupe_index(sg.t_df, 'tid')
    sg.t_df = swan.set_dupe_index(sg.t_df, 'tid')
    sg.datasets = ['annotation', 'a', 'b']

    genes, g_df = sg.find_genes_with_novel_isoforms()

    # genes come back ranked by number of novel isoforms
    print(genes)
    assert genes == [2, 1, 0]

    # gene 3 (annotation-only, never observed) must be excluded
    g_df_genes = g_df.index.tolist()
    print(g_df_genes)
    check_pairs([0, 1, 2], g_df_genes)

    # per-gene known/novel totals: gene -> (known, novel)
    for gene, n_known, n_novel in [(0, 3, 0), (1, 1, 1), (2, 1, 5)]:
        known_models = g_df.loc[gene, 'known']
        print('gene {} num known:'.format(gene))
        print(known_models)
        novel_models = g_df.loc[gene, 'novel']
        print('gene {} num novel:'.format(gene))
        print(novel_models)
        assert known_models == n_known
        assert novel_models == n_novel

    # without an annotation column, the call must raise
    sg.t_df.drop('annotation', axis=1, inplace=True)
    sg.datasets.remove('annotation')
    with pytest.raises(Exception) as excinfo:
        genes, g_df = sg.find_genes_with_novel_isoforms()
    assert 'No annotation data' in str(excinfo.value)
def get_dummy_merge_sgs():
    """Build a pair of small, partially overlapping SwanGraphs for merge tests.

    Graph 'a' (returned first) is tagged with dataset column 'a'; the
    second graph is untagged. The two share some vertices/edges so merge
    id-mapping can be exercised.
    """
    sg_a = swan.SwanGraph()
    sg_b = swan.SwanGraph()

    # location tables: sg_a covers coords 1-3 on chrom 1 plus chrom 2;
    # sg_b overlaps at (1,1) and (1,2) and adds chrom 3
    sg_a.loc_df = pd.DataFrame({'chrom': [1, 1, 1, 2],
                                'coord': [1, 2, 3, 1],
                                'strand': ['+', '+', '+', '+'],
                                'vertex_id': [0, 1, 2, 3]})
    sg_b.loc_df = pd.DataFrame({'chrom': [1, 1, 1, 2, 3],
                                'coord': [1, 2, 4, 1, 1],
                                'strand': ['+', '+', '+', '-', '+'],
                                'vertex_id': [1, 2, 3, 4, 5]})
    sg_a.loc_df = swan.create_dupe_index(sg_a.loc_df, 'vertex_id')
    sg_b.loc_df = swan.create_dupe_index(sg_b.loc_df, 'vertex_id')
    sg_a.loc_df = swan.set_dupe_index(sg_a.loc_df, 'vertex_id')
    sg_b.loc_df = swan.set_dupe_index(sg_b.loc_df, 'vertex_id')

    # edge tables
    sg_a.edge_df = pd.DataFrame({'edge_id': [(0, 1), (1, 2), (0, 2), (2, 3)],
                                 'v1': [0, 1, 0, 2],
                                 'v2': [1, 2, 2, 3],
                                 'edge_type': ['exon', 'intron',
                                               'exon', 'exon'],
                                 'strand': ['+', '+', '+', '+']})
    sg_b.edge_df = pd.DataFrame({'edge_id': [(1, 2), (1, 3), (2, 4), (3, 4)],
                                 'v1': [1, 1, 2, 3],
                                 'v2': [2, 3, 4, 4],
                                 'strand': ['+', '+', '+', '+'],
                                 'edge_type': ['exon', 'exon',
                                               'intron', 'intron']})
    sg_a.edge_df = swan.create_dupe_index(sg_a.edge_df, 'edge_id')
    sg_b.edge_df = swan.create_dupe_index(sg_b.edge_df, 'edge_id')
    sg_a.edge_df = swan.set_dupe_index(sg_a.edge_df, 'edge_id')
    sg_b.edge_df = swan.set_dupe_index(sg_b.edge_df, 'edge_id')

    # transcript tables
    sg_a.t_df = pd.DataFrame({'tid': [0, 1, 3],
                              'gid': [0, 0, 0],
                              'gname': ['0', '0', '0'],
                              'path': [[0, 1], [0, 1, 2, 3], [0, 2]]})
    sg_b.t_df = pd.DataFrame({'tid': [0, 2, 4],
                              'gid': [0, 0, 0],
                              'gname': ['0', '0', '0'],
                              'path': [[1, 2], [1, 2, 4], [1, 2, 3]]})
    sg_a.t_df = swan.create_dupe_index(sg_a.t_df, 'tid')
    sg_b.t_df = swan.create_dupe_index(sg_b.t_df, 'tid')
    sg_a.t_df = swan.set_dupe_index(sg_a.t_df, 'tid')
    sg_b.t_df = swan.set_dupe_index(sg_b.t_df, 'tid')

    # add 'dataset a' to a
    sg_a.datasets = ['a']
    sg_a.loc_df['a'] = True
    sg_a.edge_df['a'] = True
    sg_a.t_df['a'] = True

    sg_a.get_loc_types()
    sg_b.get_loc_types()

    return sg_a, sg_b
### Getting Started import swan_vis as swan annot_gtf = 'data/gencode.v29.annotation.gtf' hep_1_gtf = 'data/hepg2_1_talon.gtf' hep_2_gtf = 'data/hepg2_2_talon.gtf' hff_1_gtf = 'data/hffc6_1_talon.gtf' hff_2_gtf = 'data/hffc6_2_talon.gtf' hff_3_gtf = 'data/hffc6_3_talon.gtf' ab_file = 'data/all_talon_abundance_filtered.tsv' talon_db = 'data/talon.db' # initialize a new SwanGraph sg = swan.SwanGraph() # add an annotation transcriptome sg.add_annotation(annot_gtf) # add a dataset's transcriptome and abundance information to # the SwanGraph sg.add_dataset('HepG2_1', hep_1_gtf, counts_file=ab_file, count_cols='hepg2_1') sg.add_dataset('HepG2_2', hep_2_gtf, counts_file=ab_file, count_cols='hepg2_2') sg.add_dataset('HFFc6_1', hff_1_gtf, counts_file=ab_file, count_cols='hffc6_1') sg.add_dataset('HFFc6_2', hff_2_gtf, counts_file=ab_file, count_cols='hffc6_2') sg.add_dataset('HFFc6_3', hff_3_gtf, counts_file=ab_file, count_cols='hffc6_3') # save the SwanGraph as a Python pickle file sg.save_graph('swan') ### Analysis