def main(args):
    """Print a tab-separated expression matrix for a list of genes.

    Looks up the PanCan-normalized HiSeqV2 dataset of the requested cancer
    type on the TCGA Xena hub and writes to standard output one header row
    (``Gene`` followed by every sample ID) and then one row per entry
    returned by the hub (gene name followed by its scores).

    Parameters
    ----------
    args
        Argument vector ``[script, cancerType, geneListPath]``. The gene
        list file holds one gene name per line; surrounding whitespace is
        stripped and blank lines are skipped.

    Raises
    ------
    SystemExit
        With the usage message when ``args`` does not hold three entries.
    """
    if len(args) != 3:
        sys.exit(
            "USAGE: python getExpressionFromXena.py cancerType[KIRC] geneList.txt > outFile"
        )

    # TCGA hub
    hub = "https://tcga.xenahubs.net"
    # PanCan normalized dataset
    dataset = "TCGA." + str(args[1]) + ".sampleMap/HiSeqV2_PANCAN"
    # get sample IDs
    samples = xena.dataset_samples(hub, dataset, None)

    # Read the gene list. Iterating the file (instead of slicing off the
    # last character of every line) keeps a final line that has no trailing
    # newline intact, and the context manager closes the handle.
    with open(args[2]) as gene_file:
        genes = [name for name in (line.strip() for line in gene_file) if name]

    # Header row. Single-argument print(...) behaves the same under
    # Python 2 and Python 3.
    print('Gene\t' + '\t'.join(samples))

    # One row per hub result.
    # NOTE(review): assumes the hub answers with one entry per requested
    # gene, in request order -- confirm against the xena client docs.
    results = xena.dataset_gene_probe_avg(hub, dataset, samples, genes)
    for entry in results:
        scores = entry['scores'][0]
        print(entry['gene'] + '\t' + '\t'.join(str(x) for x in scores))
def main(args):
    """Print every gene of a gene list that has no position in the dataset.

    Queries the KIRC PanCan-normalized HiSeqV2 dataset on the TCGA Xena hub
    for all genes in the list and prints, one per line, the name of each
    gene whose result carries an empty ``position`` list.

    Parameters
    ----------
    args
        Argument vector ``[script, geneListPath]``. The gene list file
        holds one gene name per line; surrounding whitespace is stripped
        and blank lines are skipped.

    Raises
    ------
    SystemExit
        With the usage message when ``args`` does not hold two entries.
    """
    if len(args) != 2:
        sys.exit("USAGE: python checkGeneList.py geneList.txt > outFile")

    # TCGA hub
    hub = "https://tcga.xenahubs.net"
    # PanCan normalized dataset
    dataset = "TCGA.KIRC.sampleMap/HiSeqV2_PANCAN"
    # get sample IDs
    samples = xena.dataset_samples(hub, dataset, None)

    # Read the gene list. Iterating the file keeps a final line without a
    # trailing newline intact, and the context manager closes the handle.
    with open(args[1]) as gene_file:
        genes = [name for name in (line.strip() for line in gene_file) if name]

    # NOTE(review): assumes one result entry per requested gene -- confirm.
    results = xena.dataset_gene_probe_avg(hub, dataset, samples, genes)
    for entry in results:
        # An empty position list is what this script reports; presumably it
        # marks a gene name the dataset does not recognise.
        if entry['position'] == []:
            # Single-argument print(...) works on Python 2 and Python 3.
            print(entry['gene'])
def add_gene_expression_by_genes(
    target_genes: Sequence,
    clinicals: pd.DataFrame,
    xena_hub: str,
    ds: str,
    *,
    gene_names: Union[None, dict] = None,
    colprefix: str = "gex_",
) -> pd.DataFrame:
    """
    Fetch expression values for selected genes and attach them as columns.

    Parameters
    ----------
    target_genes
        Genes whose expression should be retrieved.
    clinicals
        Sample table; its index supplies the sample identifiers sent to the
        hub. One column per target gene is written into this frame in place.
    xena_hub
        Url of the data repository hub.
    ds
        Name of the dataset on the repository hub.
    gene_names
        Optional mapping from a target gene to the name under which its
        expression data is queried (for differing symbols or genome
        versions). Genes missing from the mapping are queried as given.
    colprefix
        Prefix put in front of the target gene name to form the column label.

    Returns
    -------
    The same ``clinicals`` dataframe, now holding the expression columns.
    A gene for which the hub returned no scores gets the string ``"NaN"``
    in its column and a message is printed.
    """
    aliases = {} if gene_names is None else gene_names
    # Translate each target gene to the name used for the query.
    query_genes = [aliases[g] if g in aliases else g for g in target_genes]

    sample_ids = clinicals.index.values.tolist()
    results = xena.dataset_gene_probe_avg(xena_hub, ds, sample_ids, query_genes)

    for position, target in enumerate(target_genes):
        column = colprefix + target
        scores = results[position]["scores"][0]
        if len(scores) >= 1:
            clinicals[column] = scores
            continue
        print(
            column,
            "not found. Are you sure you provided the gene symbol corresponding to the genome version?",
        )
        clinicals[column] = "NaN"
    return clinicals
def gather_expression_data(hub, dataset, samples, genes, batch_size=100):
    """Collect the expression data from Xena Hub.

    The genes are requested in consecutive batches and the per-batch
    results are concatenated in request order.

    Parameters
    ----------
    hub
        Hub address passed through to ``xena.dataset_gene_probe_avg``.
    dataset
        Dataset name passed through to ``xena.dataset_gene_probe_avg``.
    samples
        Sample identifiers passed through to ``xena.dataset_gene_probe_avg``.
    genes
        Sliceable sequence of gene names to query.
    batch_size
        Number of genes per request (default 100, the previously
        hard-coded value).

    Returns
    -------
    list
        All batch results joined into one flat list. An empty ``genes``
        yields ``[]`` without contacting the hub.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    expression_data = np.array([])
    n_genes = len(genes)
    t_0 = time.time()

    # Step through the batch start offsets directly. The previous
    # ``range(int(n_genes / 100) + 1)`` produced one extra, empty batch (and
    # thus a pointless request with no genes) whenever n_genes was a
    # multiple of the batch size, including zero.
    for batch_index, lower_bound in enumerate(range(0, n_genes, batch_size)):
        # Progress report every 20 batches.
        if batch_index % 20 == 0:
            print('%i genes collected in %s s'
                  % (lower_bound, str(time.time() - t_0)))
        # Slicing clamps at the end of the sequence, so the last batch may
        # be shorter than batch_size.
        genes_batch = genes[lower_bound:lower_bound + batch_size]
        new_expression_batch = np.array(
            xena.dataset_gene_probe_avg(hub, dataset, samples, genes_batch))
        # np.append without an axis flattens both operands, which is what
        # turns the batches into one flat sequence.
        expression_data = np.append(expression_data, new_expression_batch)
    return expression_data.tolist()
def findAliasInXena(hub, dataset, samples, gene):
    """Find an alias of ``gene`` that has data in the given Xena dataset.

    Scans ``aliases.txt`` in the current working directory. Each row is
    tab-separated; the name in column 1 plus the comma-separated names in
    columns 4 and 5 (0-based) form one alias group. For every group that
    contains ``gene``, the whole group is queried on the hub and the first
    member with a non-empty ``position`` and a first score other than
    ``'NaN'`` is taken as the alias.

    Parameters
    ----------
    hub, dataset, samples
        Passed through unchanged to ``xena.dataset_gene_probe_avg``.
    gene
        Gene name to look up among the alias groups.

    Returns
    -------
    str
        The alias name as reported by the hub, or ``''`` when no alias
        group containing ``gene`` has a usable member.

    Raises
    ------
    IndexError
        If a non-blank row has fewer than six tab-separated columns.
    """
    xenaAlias = ''
    # The context manager closes the file even when parsing or the hub
    # query raises.
    with open("aliases.txt") as alias_file:
        for raw_line in alias_file:
            # Remove only the line terminator: slicing off the last
            # character truncated the final alias of a last row without a
            # newline, and stripping all whitespace would drop trailing
            # empty columns.
            line = raw_line.rstrip("\r\n")
            if not line:
                # Blank rows are skipped instead of ending the scan.
                continue
            fields = line.split("\t")
            aliases = [fields[1]] + fields[4].split(',') + fields[5].split(',')
            if gene not in aliases:
                continue
            results = xena.dataset_gene_probe_avg(hub, dataset, samples, aliases)
            for entry in results:
                if entry['position'] != [] and entry['scores'][0][0] != 'NaN':
                    xenaAlias = entry['gene']
                    break
            # NOTE(review): as before, the scan goes on after a hit, so a
            # later matching row overrides an earlier one -- confirm whether
            # returning on the first hit is what is actually wanted.
    return xenaAlias
def main(args):
    """Rewrite a gene list using names that the Xena dataset knows.

    Queries the PanCan-normalized HiSeqV2 dataset of the requested cancer
    type on the TCGA Xena hub for every gene in the list. A gene that is in
    Xena is printed unchanged. A gene with an empty ``position`` or a first
    score of ``'NaN'`` is looked up with ``findAliasInXena``; its alias is
    printed when one is found, otherwise nothing is printed for that gene.

    Parameters
    ----------
    args
        Argument vector ``[script, cancerType, geneListPath]``. The gene
        list file holds one gene name per line; surrounding whitespace is
        stripped and blank lines are skipped.

    Raises
    ------
    SystemExit
        With the usage message when ``args`` does not hold three entries.
    """
    if len(args) != 3:
        sys.exit(
            "USAGE: python replaceGeneListWithXenaAliasNames.py cancerType[KIRC] geneList.txt > newGeneList.txt"
        )

    # TCGA hub
    hub = "https://tcga.xenahubs.net"
    # PanCan normalized dataset
    dataset = "TCGA." + str(args[1]) + ".sampleMap/HiSeqV2_PANCAN"
    # get sample IDs
    samples = xena.dataset_samples(hub, dataset, None)

    # Read the gene list. Iterating the file keeps a final line without a
    # trailing newline intact, and the context manager closes the handle
    # even if reading fails.
    with open(args[2]) as gene_file:
        genes = [name for name in (line.strip() for line in gene_file) if name]

    # If in Xena, print gene. If not in Xena, determine if gene has an alias
    # that is in Xena.
    # NOTE(review): assumes one result entry per requested gene -- confirm.
    results = xena.dataset_gene_probe_avg(hub, dataset, samples, genes)
    for entry in results:
        if entry['position'] == [] or entry['scores'][0][0] == 'NaN':
            alias = findAliasInXena(hub, dataset, samples, entry['gene'])
            if alias != '':
                # Single-argument print(...) works on Python 2 and Python 3.
                print(alias)
        else:
            print(entry['gene'])