示例#1
0
def main(args):
    """Print a tab-separated expression table for a list of genes.

    Queries the TCGA Xena hub for the ``HiSeqV2_PANCAN`` dataset of the
    requested cancer type and writes to stdout a header row (``Gene``
    followed by the sample IDs), then one row per returned gene holding the
    gene name and its scores.

    Parameters
    ----------
    args
        Command-line argument list: ``args[1]`` is the cancer type code
        (e.g. ``KIRC``) and ``args[2]`` is the path of a text file holding
        one gene name per line. The process exits with a usage message
        unless exactly three items are supplied.
    """
    if len(args) != 3:
        sys.exit(
            "USAGE: python getExpressionFromXena.py cancerType[KIRC] geneList.txt  > outFile"
        )

    # TCGA hub
    hub = "https://tcga.xenahubs.net"

    # PanCan normalized dataset
    dataset = "TCGA." + str(args[1]) + ".sampleMap/HiSeqV2_PANCAN"

    # get sample IDs
    samples = xena.dataset_samples(hub, dataset, None)

    # Read the gene list. Stripping each line (instead of slicing off the
    # last character) keeps the final gene intact when the file does not end
    # with a newline, and blank lines are skipped rather than ending the read.
    with open(args[2]) as gene_file:
        stripped_lines = [line.strip() for line in gene_file]
    genes = [name for name in stripped_lines if name]

    # print header
    print('Gene\t' + '\t'.join(samples))

    # print expression: one row per entry returned by the hub
    # (assumes the hub returns one entry per requested gene -- TODO confirm)
    for entry in xena.dataset_gene_probe_avg(hub, dataset, samples, genes):
        scores = entry['scores'][0]
        print(entry['gene'] + '\t' + '\t'.join(str(x) for x in scores))
示例#2
0
def main(args):
    """Print the genes from a gene list whose Xena entry has no position.

    Queries the ``TCGA.KIRC.sampleMap/HiSeqV2_PANCAN`` dataset on the TCGA
    Xena hub for every gene in the list and writes to stdout the name of
    each returned entry whose ``position`` field is an empty list
    (presumably genes the hub does not know -- verify against the hub).

    Parameters
    ----------
    args
        Command-line argument list: ``args[1]`` is the path of a text file
        holding one gene name per line. The process exits with a usage
        message unless exactly two items are supplied.
    """
    if len(args) != 2:
        sys.exit("USAGE: python checkGeneList.py geneList.txt  > outFile")

    # TCGA hub
    hub = "https://tcga.xenahubs.net"

    # PanCan normalized dataset
    dataset = "TCGA.KIRC.sampleMap/HiSeqV2_PANCAN"

    # get sample IDs
    samples = xena.dataset_samples(hub, dataset, None)

    # Read the gene list. Stripping each line (instead of slicing off the
    # last character) keeps the final gene intact when the file does not end
    # with a newline, and blank lines are skipped rather than ending the read.
    with open(args[1]) as gene_file:
        stripped_lines = [line.strip() for line in gene_file]
    genes = [name for name in stripped_lines if name]

    # report every gene that came back without a position
    for entry in xena.dataset_gene_probe_avg(hub, dataset, samples, genes):
        if entry['position'] == []:
            print(entry['gene'])
示例#3
0
def add_gene_expression_by_genes(
    target_genes: Sequence,
    clinicals: pd.DataFrame,
    xena_hub: str,
    ds: str,
    *,
    gene_names: Union[None, dict] = None,
    colprefix: str = "gex_",
) -> pd.DataFrame:
    """
    Fetch per-sample expression for the requested genes and attach it as columns.

    The samples queried are the index values of ``clinicals``. One column per
    target gene is written into ``clinicals`` itself (the frame is modified
    in place and also returned).

    Parameters
    ----------
    target_genes
        Genes whose expression should be added.
    clinicals
        Sample table; its index supplies the sample identifiers passed to
        the hub, and it receives one new column per gene.
    xena_hub
        Url of the data repository hub.
    ds
        Name of the dataset on the repository hub.
    gene_names
        Optional mapping from a target gene to the name that should be used
        in the hub query instead. Genes missing from the mapping are queried
        under their own name.
    colprefix
        Prefix prepended to the target gene name to form the column label.

    Returns
    -------
    The same ``clinicals`` dataframe with the expression columns added. A gene
    for which the hub returns an empty score list gets the string ``"NaN"``
    in its column, and a notice is printed.
    """
    aliases = {} if gene_names is None else gene_names
    # Translate each target gene to its query name where a mapping exists.
    query_genes = [aliases[g] if g in aliases else g for g in target_genes]

    sample_ids = clinicals.index.values.tolist()
    expression_matrix = xena.dataset_gene_probe_avg(
        xena_hub, ds, sample_ids, query_genes)

    # Results are consumed positionally: entry i belongs to target gene i.
    for position, gene in enumerate(target_genes):
        colname = colprefix + gene
        scores = expression_matrix[position]["scores"][0]
        if len(scores) >= 1:
            clinicals[colname] = scores
            continue
        print(
            colname,
            "not found. Are you sure you provided the gene symbol corresponding to the genome version?",
        )
        clinicals[colname] = "NaN"
    return clinicals
示例#4
0
def gather_expression_data(hub, dataset, samples, genes, batch_size=100):
    """Collect the expression data from Xena Hub, one batch of genes at a time.

    Parameters
    ----------
    hub
        Url of the Xena hub to query.
    dataset
        Name of the dataset on the hub.
    samples
        Sample identifiers forwarded to every query.
    genes
        Genes to fetch; they are requested in consecutive slices of
        ``batch_size`` genes.
    batch_size
        Number of genes per request (defaults to 100).

    Returns
    -------
    A list with the entries of every batch result, in request order. No
    request is made when ``genes`` is empty.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    expression_data = []
    t_0 = time.time()
    # Stepping through the start offsets never yields an empty trailing
    # batch, so no request with an empty gene list is sent to the hub.
    for batch_index, lower_bound in enumerate(
            range(0, len(genes), batch_size)):
        # Progress report every 20 batches.
        if batch_index % 20 == 0:
            print('%i genes collected in %s s' %
                  (lower_bound, str(time.time() - t_0)))
        genes_batch = genes[lower_bound:lower_bound + batch_size]
        # Each batch result is assumed to be a list of per-gene entries
        # (dicts) -- TODO confirm against the xena client; extending a plain
        # list keeps them in order without re-copying the accumulated data.
        expression_data.extend(
            xena.dataset_gene_probe_avg(hub, dataset, samples, genes_batch))
    return expression_data
def findAliasInXena(hub, dataset, samples, gene, alias_file="aliases.txt"):
    """Return an alias of ``gene`` for which the Xena dataset has data.

    The alias file is tab-separated. For each record the candidate names are
    the second column plus the comma-separated values of the fifth and sixth
    columns. Only the first record that mentions ``gene`` is considered; its
    candidates are queried on the hub, and the last candidate that has a
    non-empty ``position`` and a first score other than ``'NaN'`` is
    returned.

    Parameters
    ----------
    hub
        Url of the Xena hub to query.
    dataset
        Name of the dataset on the hub.
    samples
        Sample identifiers forwarded to the query.
    gene
        Gene name to look up in the alias file.
    alias_file
        Path of the alias table (defaults to ``aliases.txt`` in the current
        working directory).

    Returns
    -------
    The matching alias, or ``''`` when the gene is not in the alias file or
    none of its aliases has data.
    """
    candidates = None
    with open(alias_file) as handle:
        for raw_line in handle:
            # Remove only the line terminator: slicing off the last character
            # would corrupt the final alias of an unterminated last line, and
            # a full strip() would drop empty trailing columns.
            line = raw_line.rstrip("\r\n")
            if not line:
                continue  # skip blank lines instead of ending the scan
            fields = line.split("\t")
            aliases = [fields[1]]
            aliases += fields[4].split(',')
            aliases += fields[5].split(',')
            if gene in aliases:
                candidates = aliases
                break

    if candidates is None:
        return ''

    xenaAlias = ''
    k = xena.dataset_gene_probe_avg(hub, dataset, samples, candidates)
    # No early exit: when several aliases have data, the last one wins.
    for entry in k:
        if entry['position'] != [] and entry['scores'][0][0] != 'NaN':
            xenaAlias = entry['gene']
    return xenaAlias
def main(args):
    """Rewrite a gene list using names that have data in the Xena dataset.

    For every gene in the input list, the ``HiSeqV2_PANCAN`` dataset of the
    requested cancer type is queried. A gene whose entry has a position and
    a first score other than ``'NaN'`` is printed unchanged. Otherwise
    ``findAliasInXena`` is asked for an alias, which is printed if one is
    found; genes with no usable alias are omitted from the output.

    Parameters
    ----------
    args
        Command-line argument list: ``args[1]`` is the cancer type code
        (e.g. ``KIRC``) and ``args[2]`` is the path of a text file holding
        one gene name per line. The process exits with a usage message
        unless exactly three items are supplied.
    """
    if len(args) != 3:
        sys.exit(
            "USAGE: python replaceGeneListWithXenaAliasNames.py cancerType[KIRC] geneList.txt > newGeneList.txt"
        )

    # TCGA hub
    hub = "https://tcga.xenahubs.net"

    # PanCan normalized dataset
    dataset = "TCGA." + str(args[1]) + ".sampleMap/HiSeqV2_PANCAN"

    # get sample IDs
    samples = xena.dataset_samples(hub, dataset, None)

    # Read the gene list. Stripping each line (instead of slicing off the
    # last character) keeps the final gene intact when the file does not end
    # with a newline, and blank lines are skipped rather than ending the read.
    # The with-block also closes the file if reading fails.
    with open(args[2]) as gene_file:
        stripped_lines = [line.strip() for line in gene_file]
    genes = [name for name in stripped_lines if name]

    # if in Xena, print gene. If not in Xena, determine if gene has an alias
    # that is in Xena
    for entry in xena.dataset_gene_probe_avg(hub, dataset, samples, genes):
        # The position test must come first: it short-circuits before the
        # score lookup is attempted on an entry without a position.
        if entry['position'] == [] or entry['scores'][0][0] == 'NaN':
            alias = findAliasInXena(hub, dataset, samples, entry['gene'])
            if alias != '':
                print(alias)
        else:
            print(entry['gene'])