示例#1
0
def get_variants_mc3(syn):
    """Reads mutation data from the MC3 Synapse file (syn7824274).

    Args:
        syn (Synapse): A logged-in synapseclient instance.

    Returns:
        muts (pandas DataFrame), shape = [n_mutations, 11]
            One row for each mutation appearing in an individual sample,
            with columns Gene, Form, Sample, Protein, Transcript, Exon,
            depth, ref_count, alt_count, SIFT and PolyPhen. Sample is
            truncated to the first four dash-separated barcode fields,
            PolyPhen is the numeric score (0 when missing) and SIFT is
            stored as one minus the numeric score (0 when missing).

    Raises:
        OSError: If the file still cannot be read after ten attempts.

    Examples:
        >>> import synapseclient
        >>> syn = synapseclient.Synapse()
        >>> syn.login()
        >>> muts = get_variants_mc3(syn)

    """
    mc3 = syn.get('syn7824274')

    # defines which mutation annotation MAF columns to use
    use_cols = [0, 8, 15, 36, 37, 38, 39, 40, 41, 71, 72]
    use_names = ['Gene', 'Form', 'Sample', 'Protein', 'Transcript', 'Exon',
                 'depth', 'ref_count', 'alt_count', 'SIFT', 'PolyPhen']

    # imports mutation data into a DataFrame; the read is retried on OSError
    # and the last error is re-raised instead of leaving `muts` undefined
    max_tries = 10
    for attempt in range(max_tries):
        try:
            muts = pd.read_csv(mc3.path, engine='python',
                               usecols=use_cols, sep='\t', header=None,
                               names=use_names, comment='#', skiprows=1)
            break

        except OSError:
            if attempt == max_tries - 1:
                raise

    def parse_score(val, null_val):
        # annotations look like `label(score)`; '.' marks a missing value
        if val == '.':
            return null_val
        return float(gsub(r'\)$', '', gsub(r'^.*\(', '', val)))

    for annt, null_val in zip(['PolyPhen', 'SIFT'], [0, 1]):
        muts[annt] = muts[annt].apply(parse_score, args=(null_val,))

    # keeps only the first four fields of each sample barcode
    muts.Sample = muts.Sample.apply(lambda smp: '-'.join(smp.split('-')[:4]))

    # flips SIFT so that a missing score (null value 1) becomes 0
    muts.SIFT = 1 - muts.SIFT

    return muts
示例#2
0
def get_mtree_newick(mtree):
    """Builds the Newick tree format representation of a MuTree.

    Branches are visited in order of their labels. Each branch contributes
    its label wrapped in braces (a "." label is shown as *none*), preceded
    by the parenthesised representation of its sub-tree when the branch is
    itself a MuTree. At depth zero the trailing comma is replaced by the
    terminating semicolon.
    """
    branches = list(mtree)
    branches.sort(key=lambda pair: pair[0])

    pieces = []
    for label, branch in branches:
        if isinstance(branch, MuTree):
            # sub-trees are nested without their own trailing comma
            nested = gsub(',$', '', get_mtree_newick(branch))
            pieces.append('(' + nested + ')')

        shown = '*none*' if label == "." else label
        pieces.append('{' + shown + '},')

    newick_str = ''.join(pieces)
    if mtree.depth != 0:
        return newick_str

    return gsub(',$', '', newick_str) + ';'
示例#3
0
def get_variants_mc3(syn):
    """Reads mutation data from the MC3 Synapse file (syn7824274).

    Args:
        syn (Synapse): A logged-in synapseclient instance.

    Returns:
        muts (pandas DataFrame), shape = [n_mutations, 7]
            One row for each mutation appearing in an individual sample,
            with columns Gene, Form, Sample, Protein, Transcript, Exon and
            PolyPhen. Sample is truncated to the first four dash-separated
            barcode fields. PolyPhen holds the text found inside the
            parentheses of the annotation (left as a string, not converted
            to a number), or the integer 0 where the annotation is '.'.

    Examples:
        >>> import synapseclient
        >>> syn = synapseclient.Synapse()
        >>> syn.login()
        >>> muts = get_variants_mc3(syn)

    """
    mc3 = syn.get('syn7824274')

    # defines which mutation annotation MAF columns to use
    use_cols = [0, 8, 15, 36, 37, 38, 72]
    use_names = [
        'Gene', 'Form', 'Sample', 'Protein', 'Transcript', 'Exon', 'PolyPhen'
    ]

    # imports mutation data into a DataFrame, skipping the header row and
    # any comment lines
    muts = pd.read_csv(mc3.path,
                       usecols=use_cols,
                       sep='\t',
                       header=None,
                       names=use_names,
                       comment='#',
                       skiprows=1)

    # keeps only the first four fields of each sample barcode
    muts['Sample'] = ['-'.join(s.split('-', 4)[:4]) for s in muts['Sample']]

    # strips everything up to the opening parenthesis and the closing
    # parenthesis itself, leaving just the score text
    muts['PolyPhen'] = [
        gsub(r'\)$', '', gsub(r'^.*\(', '', x)) if x != '.' else 0
        for x in muts['PolyPhen']
    ]

    return muts
示例#4
0
	def eval_node(self, variables, fallback):
		"""Evaluate this node's text as a Python expression.

		Backslashes are removed from the text returned by
		``self.full_value()`` before it is evaluated.

		:param variables: currently unused by this method.
		:param fallback: value returned when the text is not valid Python
			syntax.
		:return: the result of evaluating the expression, or ``fallback``
			on a SyntaxError. Any other exception raised by the evaluated
			expression propagates to the caller.
		"""
		# self.variableValues  =variables #woot?
		# `re` has no `gsub`, and a lone backslash is not a valid regex, so
		# the backslashes are stripped with a plain string replace instead
		expression = self.full_value().replace("\\", "")
		try:
			# SECURITY: eval() runs arbitrary code; this must never be fed
			# text that comes from an untrusted source
			return eval(expression)  # except fallback ## v0.0
		except SyntaxError:
			return fallback
示例#5
0
    def get_newick(self):
        """Builds the Newick tree format representation of this MuTree.

        Branches are visited in the order given by `sort_iter`. Each branch
        contributes its label wrapped in braces (a "0" label is shown as
        *none*), preceded by the parenthesised representation of its
        sub-tree when the branch is itself a MuTree. At depth zero the
        trailing comma is replaced by the terminating semicolon.
        """
        pieces = []

        for label, branch in self.sort_iter():
            if isinstance(branch, MuTree):
                # sub-trees are nested without their own trailing comma
                nested = gsub(',$', '', branch.get_newick())
                pieces.append('(' + nested + ')')

            shown = '*none*' if label == "0" else label
            pieces.append('{' + shown + '},')

        newick_str = ''.join(pieces)
        if self.depth != 0:
            return newick_str

        return gsub(',$', '', newick_str) + ';'
示例#6
0
    def sort_iter(self):
        """Iterates over the branches of the tree, in numeric order of
           their labels when the mutation level is 'Exon' or 'Location'.

        For those two levels a '.' label is replaced by "0", and branches
        are ordered by the integer obtained from the part of the label
        before the first '/', with every non-digit character replaced by
        a zero. Any other level falls back to the tree's own iterator.
        """
        if self.mut_level not in ['Exon', 'Location']:
            return self.__iter__()

        def numeric_key(pair):
            leading = pair[0].split('/')[0]
            return int(gsub('[^0-9]', '0', leading))

        labelled = []
        for lbl, branch in self._child.items():
            labelled.append(("0", branch) if lbl == '.' else (lbl, branch))
        labelled.sort(key=numeric_key)

        return iter(labelled)
示例#7
0
def get_gencode():
    """Gets annotation data for protein-coding genes on non-sex
       chromosomes from a Gencode file.

    Reads ``gencode.v22.annotation.gtf.gz`` from ``DATA_PATH``, keeps the
    'gene' records located on chr1 through chr22 and parses the info
    column of each into key/value pairs.

    Returns
    -------
    annot : dict
        Dictionary with keys corresponding to Ensembl gene IDs (with the
        ``.<digits>`` version removed) and values consisting of dicts with
        annotation fields: 'chr', 'Start', 'End' plus every two-token
        entry of the info column. Only 'gene_name' has its double quotes
        stripped; other info values are kept as found in the file.

    Raises
    ------
    KeyError
        If a retained record has no 'gene_type' entry, or a protein-coding
        record has no 'gene_id' or 'gene_name' entry.
    """
    annot = pd.read_csv(DATA_PATH + "gencode.v22.annotation.gtf.gz",
                        usecols=[0, 2, 3, 4, 8],
                        names=['Chr', 'Type', 'Start', 'End', 'Info'],
                        sep='\t',
                        header=None,
                        comment='#')

    # filter out annotation records that aren't
    # genes on non-sex chromosomes
    chroms_use = ['chr' + str(i + 1) for i in range(22)]
    keep = (annot['Type'] == 'gene') & annot['Chr'].isin(chroms_use)
    annot = annot.loc[keep]

    # parse the info field to get each gene's annotation data
    gn_annot = {}
    for chrom, _, start, end, info in annot.values:
        fields = {'chr': chrom, 'Start': start, 'End': end}

        # NOTE(review): splitting on '; ' leaves any trailing ';' attached
        # to the last value of the info string -- confirm callers do not
        # rely on that last field
        for entry in info.split('; '):
            tokens = entry.split(' ')
            if len(tokens) == 2:
                fields[tokens[0]] = tokens[1]

        if fields['gene_type'] != '"protein_coding"':
            continue

        # drop the version suffix and the surrounding quotes from the ID
        gene_id = gsub(r'\.[0-9]+', '', fields['gene_id']).replace('"', '')
        gn_annot[gene_id] = fields

    for g in gn_annot:
        gn_annot[g]['gene_name'] = gn_annot[g]['gene_name'].replace('"', '')

    return gn_annot
示例#8
0
    def __str__(self):
        """Printing a MuTree shows each of the branches of the tree and
           the samples at the end of each branch.

        Each branch is rendered as ``<level> IS <label>``. A branch that is
        itself a MuTree is followed by ``AND`` and its own representation
        on a new line, indented by one more tab than this tree's depth. A
        leaf lists its samples separated by commas, or only their count
        when there are more than eight of them.
        """
        new_str = self.mut_level

        for nm, mut in self:
            new_str += ' IS {}'.format(nm)

            if isinstance(mut, MuTree):
                new_str += (' AND ' + '\n' + '\t' * (self.depth + 1) +
                            str(mut))

            # if we have reached a leaf node, print the samples
            elif len(mut) > 8:
                new_str += ': ({} samples)'.format(len(mut))
            else:
                # join copes with an empty leaf, where reduce() without an
                # initial value raises TypeError
                new_str += ': {}'.format(','.join(str(smp) for smp in mut))

            new_str += ('\n' + '\t' * self.depth)
        new_str = gsub('\n$', '', new_str)

        return new_str
示例#9
0
def no_attrib_error(module, text):
    """Report a missing attribute and return an HTML error fragment.

    Only the first 20 characters of `text` are used, with line breaks
    removed. A one-line notice is printed to standard output and the same
    information is returned wrapped in a ``general-error`` div.

    NOTE(review): `module` and `text` are inserted into the HTML without
    escaping -- confirm they never carry untrusted markup.

    :param module: name shown in the error message.
    :param text: source text that triggered the error.
    :return: the HTML error string.
    """
    output = """<div class="%s">%s</div>"""
    # the re module offers sub(), not gsub()
    snippet = re.sub(r"\r\n|\n", "", text[0:20])
    # print() with one parenthesised argument works on Python 2 and 3
    print("* Error: '%s' with text: %s...\n" % (module, snippet))
    message = "Error! No attribue is found for '%s' with text => %s ..." % (
        module, snippet)
    return output % ("general-error", message)
示例#10
0
def plot_performance(clf_set='base', mtype_set='default'):
    """Plots boxplots of classifier performance for a set of mutations.

    Loads the 'baseline' output for the given classifier and mutation type
    sets, draws one subplot of AUC boxplots per mutation type on a 2x3
    grid with one box per classifier, and saves the figure as a PNG under
    ``base_dir + '/plots/'``.

    Args:
        clf_set (str): Key of the classifier set in `clf_list`.
        mtype_set (str): Key of the mutation type set in `mtype_list`.

    Note:
        The grid has six panels, so a mutation type set with more than six
        entries raises an IndexError.
    """
    out_data = load_output('baseline', clf_set, mtype_set)
    alg_order = [clf.__name__ for clf in clf_list[clf_set]]

    # gets AUC data, sets up plot and subplots
    auc_data = [x['AUC'] for x in out_data]
    auc_min = min([min(x.values()) for x in auc_data]) * 0.9
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(7, 11))

    for i, gene in enumerate(mtype_list[mtype_set]):
        ax = axes[i // 3, i % 3]

        # cast performance data into matrix format
        perf_data = pd.DataFrame(
            [{k[0].split('_')[0]: v
              for k, v in x.items() if k[1] == gene} for x in auc_data])

        # reorder the columns to follow the classifier order; alg_indx holds
        # integer positions, so .iloc replaces the removed .ix indexer
        alg_indx = [list(perf_data.columns).index(x) for x in alg_order]
        perf_data = perf_data.iloc[:, alg_indx]

        # create and plot the subplot titles describing mutation types
        gene_lbl = '{}-{}'.format(gene[0], gsub('(-|, )', '\n', str(gene[1])))
        ax.set_title(gene_lbl, fontsize=13)

        # plot the boxes showing performances
        ax.boxplot(x=np.array(perf_data),
                   boxprops={'linewidth': 1.5},
                   medianprops={
                       'linewidth': 3,
                       'color': '#960c20'
                   },
                   flierprops={'markersize': 2})

        # label x-axis ticks with algorithm names if we are on bottom row
        if (i // 3) == 1:
            ax.set_xticklabels(perf_data.columns,
                               fontsize=12,
                               rotation=45,
                               ha='right')
        else:
            ax.set_xticklabels(np.repeat('', len(alg_indx)))

        # add y-axis title if we are on left-most column
        if (i % 3) == 0:
            ax.set_ylabel('AUC', fontsize=19)
        else:
            ax.set_yticklabels([])

        # add dotted line at AUC=0.5, set AUC axis limits
        ax.plot(list(range(len(alg_indx) + 2)),
                np.repeat(0.5,
                          len(alg_indx) + 2),
                c="black",
                lw=0.8,
                ls='--',
                alpha=0.8)
        ax.set_ylim(auc_min, 1.0)

    # tweak subplot spacing and save plot to file
    plt.tight_layout(w_pad=-1.2, h_pad=1.5)
    plt.savefig(base_dir + '/plots/' + get_set_plotlbl(clf_set) + '_' +
                get_set_plotlbl(mtype_set) + '__performance.png',
                dpi=700)
示例#11
0
def get_variant_data(cohort, var_source, **var_args):
    """Loads variant calls from one of the supported data sources.

    Args:
        cohort (str): Cohort label; only used to locate the archive when
            `var_source` is 'Firehose'.
        var_source (str): One of 'mc3', 'Firehose' or 'BMEG'.
        **var_args: Source-specific options. For 'mc3':
            syn: object whose ``get('syn7824274')`` returns an entity with
                a ``path`` attribute pointing at the MAF file (required).
            mut_fields: optional iterable of field names to load; 'Sample'
                and 'Filter' are always loaded. All fields are loaded when
                it is missing or None.

    Returns:
        pandas DataFrame holding the variant data of the chosen source.
        For 'mc3', rows whose Filter contains 'nonpreferredpair' are
        dropped, Sample is cut to its first four dash-separated fields,
        PolyPhen is the numeric score (0 when missing) and SIFT is one
        minus the numeric score (0 when missing).

    Raises:
        ValueError: If `var_source` is not recognized.
        OSError: If the mc3 file still cannot be read after ten attempts.
    """
    if var_source == 'mc3':
        mc3 = var_args['syn'].get('syn7824274')

        field_dict = (('Gene', 0), ('Chr', 4), ('Start', 5), ('End', 6),
                      ('Strand', 7), ('Form', 8), ('RefAllele', 10),
                      ('TumorAllele', 12), ('Sample', 15), ('HGVS', 34),
                      ('Protein', 36), ('Transcript', 37), ('Exon', 38),
                      ('depth', 39), ('ref_count', 40), ('alt_count', 41),
                      ('SIFT', 71), ('PolyPhen', 72), ('Filter', 108))

        wanted = var_args.get('mut_fields')
        if wanted is None:
            use_fields, use_cols = tuple(zip(*field_dict))

        else:
            # Sample and Filter are needed by the processing below
            keep_fields = {'Sample', 'Filter'} | set(wanted)
            use_fields, use_cols = tuple(
                zip(*[(name, col) for name, col in field_dict
                      if name in keep_fields]))

        # imports mutation data into a DataFrame; the read is retried on
        # OSError and the last error is re-raised instead of leaving
        # `var_data` undefined
        #TODO: handle I/O errors on the cohort/experiment level?
        max_tries = 10
        for attempt in range(max_tries):
            try:
                var_data = pd.read_csv(mc3.path,
                                       engine='c',
                                       dtype='object',
                                       sep='\t',
                                       header=None,
                                       usecols=use_cols,
                                       names=use_fields,
                                       comment='#',
                                       skiprows=1)
                break

            except OSError:
                if attempt == max_tries - 1:
                    raise

        #TODO: more fine-grained Filtering control?
        # na=False keeps rows with a missing Filter value instead of
        # failing when the mask is negated
        var_data = var_data.loc[~var_data.Filter.str.contains(
            'nonpreferredpair', na=False)]

        def parse_score(val, null_val):
            # annotations look like `label(score)`; '.' marks a missing one
            if val == '.':
                return null_val
            return float(gsub(r'\)$', '', gsub(r'^.*\(', '', val)))

        for annt, null_val in zip(['PolyPhen', 'SIFT'], [0, 1]):
            if annt in var_data:
                var_data[annt] = var_data[annt].apply(parse_score,
                                                      args=(null_val,))

                if annt == 'SIFT':
                    var_data[annt] = 1 - var_data[annt]

        # parses sample barcodes down to their first four fields
        var_data.Sample = var_data.Sample.apply(
            lambda smp: '-'.join(smp.split('-')[:4]))

        return var_data

    elif var_source == 'Firehose':
        tar_path = glob.glob(
            os.path.join(
                data_dir, "stddata__2016_01_28", cohort, "20160128",
                "*Mutation_Packager_Oncotated_Calls.Level_3*tar.gz"))[0]

        mut_list = []
        # the context manager closes the archive even if reading fails
        with tarfile.open(tar_path) as mut_tar:
            for mut_fl in mut_tar.getmembers():

                # best effort: members that cannot be parsed are reported
                # and skipped
                try:
                    mut_tbl = pd.read_csv(
                        BytesIO(mut_tar.extractfile(mut_fl).read()),
                        sep='\t',
                        skiprows=4,
                        usecols=[0, 8, 15, 37, 41],
                        names=['Gene', 'Form', 'Sample', 'Exon', 'Protein'])

                except Exception:
                    print("Skipping mutations for {}".format(mut_fl))

                else:
                    mut_list.append(mut_tbl)

        muts = pd.concat(mut_list)
        muts.Sample = muts.Sample.apply(
            lambda smp: "-".join(smp.split("-")[:4]))

        return muts

    elif var_source == 'BMEG':
        # NOTE(review): sample_list, gene_list and mut_fields are not
        # defined in this function and must exist at module level for this
        # branch to run -- confirm where they are meant to come from
        oph = Ophion("http://bmeg.io")
        mut_list = {samp: {} for samp in sample_list}
        gene_lbls = ["gene:" + gn for gn in gene_list]

        print(oph.query().has(
            "gid", "biosample:" +
            sample_list[0]).incoming("variantInBiosample").outEdge(
                "variantInGene").mark("variant").inVertex().has(
                    "gid", oph.within(gene_lbls)).count().execute())
        # .mark("gene").select(["gene", "variant"]).count().execute())

        for samp in sample_list:
            for i in oph.query().has("gid", "biosample:" + samp)\
                    .incoming("variantInBiosample")\
                    .outEdge("variantInGene").mark("variant")\
                    .inVertex().has("gid", oph.within(gene_lbls))\
                    .mark("gene").select(["gene", "variant"]).execute():
                dt = json.loads(i)
                gene_name = dt["gene"]["properties"]["symbol"]
                mut_list[samp][gene_name] = {
                    k: v
                    for k, v in dt["variant"]["properties"].items()
                    if k in mut_fields
                }

        mut_table = pd.DataFrame(mut_list)

        return mut_table

    else:
        raise ValueError("Unrecognized source of variant data!")
示例#12
0
def subn_filter(s, find, replace, count=0):
    """A non-optimal implementation of a regex filter.

    Replaces matches of the regular expression `find` in `s` with
    `replace` and returns the resulting string.

    :param s: the string to process.
    :param find: regular expression pattern to search for.
    :param replace: replacement string (or callable accepted by re.sub).
    :param count: maximum number of replacements; 0 replaces every match.
    :return: the string after substitution.
    """
    # re has no gsub(); re.sub takes the string third and count fourth
    return re.sub(find, replace, s, count=count)