示例#1
0
def test_find_datasets_ontology(df_datasets_ontology):
    """Test the available datasets returned by find_datasets() for the
    ENSEMBL_MART_ONTOLOGY mart."""
    # Sort both frames on the same column so the comparison is
    # independent of the row order returned by the server.
    sort_col = "Dataset_ID"
    expect = df_datasets_ontology.sort_values(by=sort_col, axis=0)
    expect = expect.reset_index(drop=True)
    result = find_datasets("ENSEMBL_MART_ONTOLOGY").sort_values(by=sort_col, axis=0)
    result = result.reset_index(drop=True)

    assert_frame_equal(result, expect)
示例#2
0
def test_find_datasets_sequence(df_datasets_sequence):
    """Test the available datasets returned by find_datasets() for the
    ENSEMBL_MART_SEQUENCE mart."""
    # Normalise row order on both sides before comparing.
    sort_col = "Dataset_ID"
    expect = df_datasets_sequence.sort_values(by=sort_col, axis=0)
    expect = expect.reset_index(drop=True)
    result = find_datasets("ENSEMBL_MART_SEQUENCE").sort_values(by=sort_col, axis=0)
    result = result.reset_index(drop=True)

    assert_frame_equal(result, expect)
示例#3
0
def test_find_datasets_ensembl(df_datasets_ensembl):
    """Test the available datasets returned by find_datasets() for the
    ENSEMBL_MART_ENSEMBL mart."""
    # Normalise row order on both sides before comparing.
    sort_col = "Dataset_ID"
    expect = df_datasets_ensembl.sort_values(by=sort_col, axis=0)
    expect = expect.reset_index(drop=True)
    result = find_datasets("ENSEMBL_MART_ENSEMBL").sort_values(by=sort_col, axis=0)
    result = result.reset_index(drop=True)

    assert_frame_equal(result, expect)
示例#4
0
def test_find_datasets_output(df_datasets_ensembl):
    """Test the available datasets returned by find_datasets with a given
    filename for the default mart (ENSEMBL_MART_ENSEMBL)."""
    expect = (df_datasets_ensembl.sort_values(by="Dataset_ID",
                                              axis=0).reset_index(drop=True))
    _ = find_datasets(save=True, output="tested.csv")
    try:
        saved = pd.read_csv("tested.csv")
        # Empty cells come back from read_csv as NaN; the in-memory frame
        # uses "" instead, so normalise before comparing.  fillna("") replaces
        # the old `replace(pd.np.nan, "")` — `pd.np` was deprecated in
        # pandas 1.0 and removed in pandas 2.0.
        result = (saved.fillna("").sort_values(by="Dataset_ID",
                                               axis=0).reset_index(drop=True))
        assert_frame_equal(result, expect)
    finally:
        # Clean up the temporary CSV even if reading/comparison fails.
        os.remove("tested.csv")
示例#5
0
def create_datasets():
    """Create and store the pickled datasets dataframes."""
    # (mart name, pickle filename) pairs, fetched and saved in order.
    marts = [
        ("ENSEMBL_MART_ENSEMBL", "datasets_ensembl.pkl"),
        ("ENSEMBL_MART_MOUSE", "datasets_mouse.pkl"),
        ("ENSEMBL_MART_SEQUENCE", "datasets_sequence.pkl"),
        ("ENSEMBL_MART_ONTOLOGY", "datasets_ontology.pkl"),
        ("ENSEMBL_MART_GENOMIC", "datasets_genomic.pkl"),
        ("ENSEMBL_MART_SNP", "datasets_snp.pkl"),
        ("ENSEMBL_MART_FUNCGEN", "datasets_funcgen.pkl"),
    ]
    for mart, pickle_name in marts:
        frame = apy.find_datasets(mart)
        frame.to_pickle(os.path.join(DATADIR, pickle_name))
示例#6
0
def pull_ensembl(complete_file):
    """Download a BioMart attribute table for every ENSEMBL dataset.

    For each dataset returned by find_datasets(), query the intersection of
    the desired attribute columns with the attributes the dataset actually
    exposes, and write the result to a per-dataset TSV.  Datasets whose
    output file already exists are skipped.  Finally, write *complete_file*
    as a sentinel recording how many datasets were covered.

    :param complete_file: path of the sentinel file written on completion.
    """
    f = find_datasets()
    # Attributes we would like to pull; each dataset may only offer a subset.
    cols = {
        "ensembl_gene_id", "ensembl_peptide_id", "description",
        "external_gene_name", "external_gene_source", "external_synonym",
        "chromosome_name", "source", "gene_biotype", "entrezgene_id",
        "zfin_id_id", "mgi_id", "rgd_id", "flybase_gene_id", "sgd_gene",
        "wormbase_gene",
    }
    for ds in f['Dataset_ID']:
        print(ds)
        outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
        # Really, we should let snakemake handle this, but then we would need
        # to put a list of all the 200+ sets in our config, and keep it up to
        # date.  Maybe you could have a job that gets the datasets and writes
        # a dataset file, but then updates the config? That sounds bogus.
        if os.path.exists(outfile):
            continue
        atts = find_attributes(ds)
        existing_atts = set(atts['Attribute_ID'].to_list())
        # Only request the attributes this dataset actually exposes.
        atts_to_get = cols.intersection(existing_atts)
        df = query(attributes=atts_to_get, filters={}, dataset=ds)
        df.to_csv(outfile, index=False, sep='\t')
    with open(complete_file, 'w') as outf:
        outf.write(f'Downloaded gene sets for {len(f)} data sets.')