Example #1
def create_queries():
    """Create and store the pickled queries dataframes."""
    df1 = apy.query(attributes=["ensembl_gene_id", "external_gene_name"],
                    filters={"chromosome_name": "1"},
                    dataset="hsapiens_gene_ensembl")
    df1.to_pickle(os.path.join(DATADIR, "query_hsapiens_gene_chrom_1.pkl"))
    df2 = apy.query(attributes=["ensembl_gene_id", "external_gene_name"],
                    filters={"chromosome_name": "2"},
                    dataset="hsapiens_gene_ensembl")
    df2.to_pickle(os.path.join(DATADIR, "query_hsapiens_gene_chrom_2.pkl"))
    df3 = apy.query(attributes=["ensembl_gene_id", "external_gene_name"],
                    filters={"chromosome_name": "3"},
                    dataset="hsapiens_gene_ensembl")
    df3.to_pickle(os.path.join(DATADIR, "query_hsapiens_gene_chrom_3.pkl"))
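The pickled frames written here are presumably loaded back as pytest fixtures such as the df_query_ensembl_hsapiens_gene_chrom_2 parameter used in Examples #3-#5. A minimal sketch of such a fixture, assuming the same DATADIR constant and file name (the fixture itself is hypothetical and not shown in the original project):

import os

import pandas as pd
import pytest

@pytest.fixture
def df_query_ensembl_hsapiens_gene_chrom_2():
    # Hypothetical fixture: reload the frame pickled by create_queries() above;
    # DATADIR is assumed to be the same constant used there.
    return pd.read_pickle(os.path.join(DATADIR, "query_hsapiens_gene_chrom_2.pkl"))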
Example #2
def getBiomart(fname):
	# marts = find_marts()
	# # ENSEMBL_MART_ENSEMBL ==  Ensembl Genes 101
	# print(marts) 
	# ds = find_datasets(mart="ENSEMBL_MART_ENSEMBL")
	# # print(ds)
	# qry = ds["Dataset_name"].str.contains('[Hh][Uu][Mm][Aa][Nn]')
	# print(ds[qry])
	# attrs = find_attributes(dataset="hsapiens_gene_ensembl")
	# print(attrs)
	if not os.path.isfile(fname):
		print("Downloading Biomart ...")
		attrs = ["ensembl_gene_id","chromosome_name","start_position",
			"end_position","strand","band","percentage_gene_gc_content",
			"gene_biotype","external_gene_name"]
		chrs = [str(i) for i in range(1, 23)]  # autosomes 1-22 (range end is exclusive)
		chrs.extend(["X", "Y"])  # plus the sex chromosomes
		bm = query(attributes=attrs,
			filters={"chromosome_name": chrs},
			dataset="hsapiens_gene_ensembl")
		# print(bm["Chromosome/scaffold name"].value_counts())
		bm = bm[bm["Gene type"] == "protein_coding"]
		bm.columns = ['stableID', 'chromName', 'gStart','gEnd', 'strand', 
			'band', 'gcCont', 'gBiotype', 'gName']
		bm.to_csv(fname,sep="\t", index=False)
	else:
		print("Reading Biomart ...")
		bm = pd.read_csv(fname,sep="\t")
	print("Biomart genes: ", bm.shape[0])
	return bm
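The commented-out lines at the top of getBiomart() record how the mart, dataset and attribute names were originally discovered. A standalone sketch of that exploration step, using the find_marts/find_datasets/find_attributes helpers that also appear in Example #7 and assuming they are importable from the apybiomart top level, as the bare calls in Examples #2 and #7 suggest (the case-insensitive filter replaces the original regex; the print is illustrative):

from apybiomart import find_attributes, find_datasets, find_marts

marts = find_marts()  # ENSEMBL_MART_ENSEMBL corresponds to the Ensembl Genes mart
datasets = find_datasets(mart="ENSEMBL_MART_ENSEMBL")
human = datasets[datasets["Dataset_name"].str.contains("human", case=False)]
attrs = find_attributes(dataset="hsapiens_gene_ensembl")
print(marts, human, attrs, sep="\n")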
Example #3
def test_query_ensembl(df_query_ensembl_hsapiens_gene_chrom_2):
    """Test the query results for the hsapiens_gene_ensembl dataset."""
    expect = (df_query_ensembl_hsapiens_gene_chrom_2
              .reset_index(drop=True))
    result = (query(attributes=["ensembl_gene_id", "external_gene_name"],
                    filters={"chromosome_name": "2"},
                    dataset="hsapiens_gene_ensembl")
              .reset_index(drop=True))

    assert_frame_equal(result, expect)
Example #4
def test_query_default_int(df_query_ensembl_hsapiens_gene_chrom_2):
    """Test the query results for the default dataset (hsapiens_gene_ensembl)
    with int filters parameter."""
    expect = (df_query_ensembl_hsapiens_gene_chrom_2
              .reset_index(drop=True))
    result = (query(attributes=["ensembl_gene_id", "external_gene_name"],
                    filters={"chromosome_name": 2})
              .reset_index(drop=True))

    assert_frame_equal(result, expect)
Example #5
def test_query_save(df_query_ensembl_hsapiens_gene_chrom_2):
    """Test the saved query results for the default dataset
    (hsapiens_gene_ensembl)."""
    expect = (df_query_ensembl_hsapiens_gene_chrom_2.reset_index(drop=True))
    _ = query(attributes=["ensembl_gene_id", "external_gene_name"],
              filters={"chromosome_name": "2"},
              save=True)
    saved = pd.read_csv("apybiomart_query.csv")
    result = (saved.replace(np.nan, "").reset_index(drop=True))

    try:
        assert_frame_equal(result, expect)
    finally:
        os.remove("apybiomart_query.csv")
Example #6
    def response(self):
        """Retrieve the related information for the given variant.

        Results are converted to a dictionary by the .to_dict() method of
        the pandas dataframe returned. If no results are retrieved, the
        empty dict is returned.
        """
        resp = apy.query(
            attributes=["allele", "ensembl_gene_stable_id", "refsnp_id",
                        "consequence_allele_string", "consequence_type_tv"],
            filters={"chr_name": "MT",
                     "start": str(self.position),
                     "end": str(self.position)},
            dataset=f"{self.species}_snp")
        resp.drop_duplicates("Variant alleles", inplace=True)

        return resp.to_dict(orient="records")
Example #7
def pull_ensembl(complete_file):
    f = find_datasets()
    cols = {
        "ensembl_gene_id", "ensembl_peptide_id", "description",
        "external_gene_name", "external_gene_source", "external_synonym",
        "chromosome_name", "source", "gene_biotype", "entrezgene_id",
        "zfin_id_id", "mgi_id", "rgd_id", "flybase_gene_id", "sgd_gene",
        "wormbase_gene",
    }
    for ds in f['Dataset_ID']:
        print(ds)
        outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
        #Really, we should let snakemake handle this, but then we would need to put a list of all the 200+ sets in our
        # config, and keep it up to date.  Maybe you could have a job that gets the datasets and writes a dataset file,
        # but then updates the config? That sounds bogus.
        if os.path.exists(outfile):
            continue
        atts = find_attributes(ds)
        existingatts = set(atts['Attribute_ID'].to_list())
        attsIcanGet = cols.intersection(existingatts)
        df = query(attributes=attsIcanGet, filters={}, dataset=ds)
        df.to_csv(outfile, index=False, sep='\t')
    with open(complete_file, 'w') as outf:
        outf.write(f'Downloaded gene sets for {len(f)} data sets.')
Example #8
# Define input/output files
infile = sys.argv[1]
out01 = sys.argv[2]
out03 = sys.argv[3]

# Read in the APA-Scan .tsv file
apa = pd.read_csv(infile, sep="\t")

# APA-Scan outputs gene names in lowercase; uppercase them so they match the
# HGNC symbols returned by the apybiomart query below
apa["Gene Name"] = apa["Gene Name"].str.upper()

# Run apybiomart query
bmart = query(attributes=["ensembl_gene_id", "hgnc_symbol"],
              filters={},
              dataset="hsapiens_gene_ensembl")

# Merge dataframes to get common
outdf = pd.merge(bmart, apa, left_on="HGNC symbol", right_on="Gene Name")

# Rearrange to BED format (Format 01)/Format 03 and remove any duplicated rows (PolyA sites)
outbed = outdf[[
    "Chrom", "Start", "End", "Gene stable ID", "p-value", "Strand"
]].drop_duplicates()
# Set the p-value field to "." so that it matches the BED format (Format 01)
# specification from the Execution Workflows README:
# https://github.com/iRNA-COSI/APAeval/tree/main/execution_workflows
outbed["p-value"] = "."
out03 = outdf[["Gene stable ID", "p-value"]].drop_duplicates()

# Remove existing files (else pandas will append when saving)
if os.path.exists(out01):