def test_find_datasets_ontology(df_datasets_ontology):
    """Test the available datasets returned by find_datasets()
    for the ENSEMBL_MART_ONTOLOGY mart."""
    expect = (df_datasets_ontology
              .sort_values(by="Dataset_ID", axis=0)
              .reset_index(drop=True))
    result = (find_datasets("ENSEMBL_MART_ONTOLOGY")
              .sort_values(by="Dataset_ID", axis=0)
              .reset_index(drop=True))
    assert_frame_equal(result, expect)


def test_find_datasets_sequence(df_datasets_sequence):
    """Test the available datasets returned by find_datasets()
    for the ENSEMBL_MART_SEQUENCE mart."""
    expect = (df_datasets_sequence
              .sort_values(by="Dataset_ID", axis=0)
              .reset_index(drop=True))
    result = (find_datasets("ENSEMBL_MART_SEQUENCE")
              .sort_values(by="Dataset_ID", axis=0)
              .reset_index(drop=True))
    assert_frame_equal(result, expect)


def test_find_datasets_ensembl(df_datasets_ensembl):
    """Test the available datasets returned by find_datasets()
    for the ENSEMBL_MART_ENSEMBL mart."""
    expect = (df_datasets_ensembl
              .sort_values(by="Dataset_ID", axis=0)
              .reset_index(drop=True))
    result = (find_datasets("ENSEMBL_MART_ENSEMBL")
              .sort_values(by="Dataset_ID", axis=0)
              .reset_index(drop=True))
    assert_frame_equal(result, expect)
def test_find_datasets_output(df_datasets_ensembl):
    """Test the available datasets returned by find_datasets() with a given
    output filename for the default mart (ENSEMBL_MART_ENSEMBL)."""
    expect = (df_datasets_ensembl
              .sort_values(by="Dataset_ID", axis=0)
              .reset_index(drop=True))
    _ = find_datasets(save=True, output="tested.csv")
    saved = pd.read_csv("tested.csv")
    # pd.np was removed in pandas 1.0; use numpy directly (requires `import numpy as np`).
    result = (saved.replace(np.nan, "")
              .sort_values(by="Dataset_ID", axis=0)
              .reset_index(drop=True))
    try:
        assert_frame_equal(result, expect)
    finally:
        os.remove("tested.csv")
def create_datasets():
    """Create and store the pickled datasets dataframes."""
    df1 = apy.find_datasets("ENSEMBL_MART_ENSEMBL")
    df1.to_pickle(os.path.join(DATADIR, "datasets_ensembl.pkl"))
    df2 = apy.find_datasets("ENSEMBL_MART_MOUSE")
    df2.to_pickle(os.path.join(DATADIR, "datasets_mouse.pkl"))
    df3 = apy.find_datasets("ENSEMBL_MART_SEQUENCE")
    df3.to_pickle(os.path.join(DATADIR, "datasets_sequence.pkl"))
    df4 = apy.find_datasets("ENSEMBL_MART_ONTOLOGY")
    df4.to_pickle(os.path.join(DATADIR, "datasets_ontology.pkl"))
    df5 = apy.find_datasets("ENSEMBL_MART_GENOMIC")
    df5.to_pickle(os.path.join(DATADIR, "datasets_genomic.pkl"))
    df6 = apy.find_datasets("ENSEMBL_MART_SNP")
    df6.to_pickle(os.path.join(DATADIR, "datasets_snp.pkl"))
    df7 = apy.find_datasets("ENSEMBL_MART_FUNCGEN")
    df7.to_pickle(os.path.join(DATADIR, "datasets_funcgen.pkl"))
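# A minimal sketch of how the fixtures used by the tests above
# (df_datasets_ensembl, df_datasets_ontology, ...) could be wired up in
# conftest.py, assuming they load the pickles written by create_datasets()
# from DATADIR. The fixture bodies and the DATADIR path are assumptions for
# illustration, not confirmed by the source.
import os

import pandas as pd
import pytest

DATADIR = os.path.join(os.path.dirname(__file__), "data")  # assumed location


@pytest.fixture
def df_datasets_ensembl():
    """Expected datasets dataframe for the ENSEMBL_MART_ENSEMBL mart."""
    return pd.read_pickle(os.path.join(DATADIR, "datasets_ensembl.pkl"))


@pytest.fixture
def df_datasets_ontology():
    """Expected datasets dataframe for the ENSEMBL_MART_ONTOLOGY mart."""
    return pd.read_pickle(os.path.join(DATADIR, "datasets_ontology.pkl"))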
def pull_ensembl(complete_file):
    """Download a BioMart gene table for every Ensembl dataset."""
    f = find_datasets()
    cols = {
        "ensembl_gene_id", "ensembl_peptide_id", "description",
        "external_gene_name", "external_gene_source", "external_synonym",
        "chromosome_name", "source", "gene_biotype", "entrezgene_id",
        "zfin_id_id", "mgi_id", "rgd_id", "flybase_gene_id", "sgd_gene",
        "wormbase_gene",
    }
    for ds in f["Dataset_ID"]:
        print(ds)
        outfile = make_local_name("BioMart.tsv", subpath=f"ENSEMBL/{ds}")
        # Really, we should let snakemake handle this, but then we would need
        # to put a list of all the 200+ datasets in our config and keep it up
        # to date. Maybe a job could fetch the datasets and write a dataset
        # file, but then it would have to update the config, which sounds bogus.
        if os.path.exists(outfile):
            continue
        atts = find_attributes(ds)
        existing_atts = set(atts["Attribute_ID"].to_list())
        atts_to_get = cols.intersection(existing_atts)
        df = query(attributes=atts_to_get, filters={}, dataset=ds)
        df.to_csv(outfile, index=False, sep="\t")
    with open(complete_file, "w") as outf:
        outf.write(f"Downloaded gene sets for {len(f)} data sets.")
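# A minimal sketch of what pull_ensembl() does for a single dataset, using
# apybiomart's find_attributes() and query() as in the loop above. The dataset
# name "hsapiens_gene_ensembl", the reduced attribute set, and the output path
# are illustrative assumptions.
from apybiomart import find_attributes, query

wanted = {"ensembl_gene_id", "external_gene_name", "entrezgene_id"}
atts = find_attributes("hsapiens_gene_ensembl")
available = set(atts["Attribute_ID"].to_list())

# Only request attributes this dataset actually exposes.
df = query(attributes=list(wanted & available),
           filters={},
           dataset="hsapiens_gene_ensembl")
df.to_csv("BioMart_hsapiens.tsv", index=False, sep="\t")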