Exemplo n.º 1
0
def testZmGO(Zm5bFGS):
    """Return the "ZmGO" ontology, building it from the raw test files if needed.

    When ``cf.test.force.Ontology`` is truthy the stored "ZmGO" dataset is
    deleted first. If no "ZmGO" GOnt dataset is then available, it is built
    with ``co.GOnt.from_obo`` from the gzipped OBO file and gene-map file
    under ``<testdir>/raw/GOnt``; otherwise the stored one is loaded.

    Args:
        Zm5bFGS: Passed through as the last positional argument of
            ``co.GOnt.from_obo`` (presumably the maize reference genome --
            confirm against the caller).

    Returns:
        The ``co.GOnt`` object named "ZmGO".
    """
    if cf.test.force.Ontology:
        tools.del_dataset("GOnt", "ZmGO", force=True)

    # Guard clause: reuse the stored dataset when one is available.
    if tools.available_datasets("GOnt", "ZmGO"):
        return co.GOnt("ZmGO")

    raw_dir = os.path.join(cf.options.testdir, "raw", "GOnt")
    obo_path = os.path.join(raw_dir, "go.obo.gz")
    mapping_path = os.path.join(raw_dir, "zm_go.tsv.gz")
    return co.GOnt.from_obo(
        obo_path, mapping_path, "ZmGO", "Maize Gene Ontology", Zm5bFGS
    )
Exemplo n.º 2
0
def ZmGO(Zm5bFGS):
    """Return the 'ZmGO' ontology, creating it from bz2 raw files if absent.

    A truthy ``cf.test.force.Ontology`` deletes the stored dataset first.
    If no 'ZmGO' GOnt dataset is available afterwards, one is built with
    ``co.GOnt.from_obo`` from ``go.obo.bz2`` and ``zm_go.tsv.bz2`` located
    in ``<testdir>/raw/GOnt``; otherwise the stored dataset is loaded.

    Args:
        Zm5bFGS: Forwarded as the final positional argument of
            ``co.GOnt.from_obo`` (presumably the maize reference genome --
            confirm against the caller).

    Returns:
        The ``co.GOnt`` object named 'ZmGO'.
    """
    if cf.test.force.Ontology:
        tools.del_dataset('GOnt', 'ZmGO', force=True)

    dataset_exists = tools.available_datasets('GOnt', 'ZmGO')
    if dataset_exists:
        return co.GOnt('ZmGO')

    gont_dir = os.path.join(cf.options.testdir, 'raw', 'GOnt')
    return co.GOnt.from_obo(
        os.path.join(gont_dir, 'go.obo.bz2'),
        os.path.join(gont_dir, 'zm_go.tsv.bz2'),
        'ZmGO',
        'Maize Gene Ontology',
        Zm5bFGS,
    )
Exemplo n.º 3
0
def TestGO(Zm5bFGS):
    """Return the small 'TestGO' ontology, building it on first use.

    A truthy ``cf.test.force.Ontology`` deletes the stored dataset first.
    When ``co.available_datasets`` reports no 'TestGO' GOnt dataset, it is
    built from ``go.test.obo`` and ``go.test.tsv`` in ``<testdir>/raw/GOnt``
    through ``co.GOnt.from_obo``; otherwise the stored dataset is loaded.

    Args:
        Zm5bFGS: Forwarded as the final positional argument of
            ``co.GOnt.from_obo`` (presumably a reference genome -- confirm
            against the caller).

    Returns:
        The ``co.GOnt`` object named 'TestGO'.
    """
    if cf.test.force.Ontology:
        co.del_dataset('GOnt', 'TestGO', force=True)

    # Guard clause: nothing to build when the dataset is already stored.
    if co.available_datasets('GOnt', 'TestGO'):
        return co.GOnt('TestGO')

    source_dir = os.path.join(cf.options.testdir, 'raw', 'GOnt')
    obo_file = os.path.join(source_dir, 'go.test.obo')
    map_file = os.path.join(source_dir, 'go.test.tsv')
    return co.GOnt.from_obo(obo_file, map_file, 'TestGO', 'Test GO', Zm5bFGS)
Exemplo n.º 4
0
def AtGO(AtTair10):
    """Return the 'AtGO' ontology, building it from bz2 raw files if absent.

    A truthy ``cf.test.force.Ontology`` deletes the stored dataset first.
    If no 'AtGO' GOnt dataset is available afterwards, it is built with
    ``co.GOnt.from_obo`` from ``go.obo.bz2`` and ``ath_go.tsv.bz2`` in
    ``<testdir>/raw/GOnt``, passing ``id_col=0`` and ``go_col=5``; otherwise
    the stored dataset is loaded.

    Args:
        AtTair10: Forwarded as the fifth positional argument of
            ``co.GOnt.from_obo`` (presumably the Arabidopsis reference
            genome -- confirm against the caller).

    Returns:
        The ``co.GOnt`` object named 'AtGO'.
    """
    if cf.test.force.Ontology:
        tools.del_dataset('GOnt', 'AtGO', force=True)

    if tools.available_datasets('GOnt', 'AtGO'):
        return co.GOnt('AtGO')

    raw_gont = os.path.join(cf.options.testdir, 'raw', 'GOnt')
    build_kwargs = {'id_col': 0, 'go_col': 5}
    return co.GOnt.from_obo(
        os.path.join(raw_gont, 'go.obo.bz2'),
        os.path.join(raw_gont, 'ath_go.tsv.bz2'),
        'AtGO',
        'Arabidopsis Gene Ontology',
        AtTair10,
        **build_kwargs
    )
Exemplo n.º 5
0
def list_command(args):
    """Print the datasets, or the terms of one dataset, chosen on the CLI.

    Behaviour by argument combination:

    * ``type`` and ``name`` both given, ``terms`` truthy: print one term id
      per line for a 'GWAS' or 'GOnt' dataset (other types print nothing).
    * ``type`` and ``name`` both given, ``terms`` falsy: print the result of
      ``co.available_datasets(type, name)``.
    * ``name`` (or both) missing: the missing value(s) are replaced by
      ``'%'`` on ``args`` and the matching table is printed via
      ``to_string()``.

    Args:
        args: Parsed CLI namespace; reads ``type``, ``name`` and ``terms``.
            ``type`` / ``name`` are overwritten in place when missing.
    """
    # Identity checks are the idiomatic (and __eq__-proof) test for None.
    if args.type is not None and args.name is not None:
        if args.terms:
            if args.type == 'GWAS':
                gwas = co.GWAS(args.name)
                print('\n'.join([x.id for x in gwas.iter_terms()]))
            elif args.type == 'GOnt':
                gont = co.GOnt(args.name)
                print('\n'.join([x.id for x in gont.iter_terms()]))
        else:
            print(co.available_datasets(args.type, args.name))
    elif args.type is not None:
        # Only the type was given: '%' stands in for the missing name
        # (presumably a match-all pattern -- confirm in available_datasets).
        args.name = '%'
        print(co.available_datasets(args.type, args.name).to_string())
    else:
        # No type given: both fields are replaced, even if a name was given.
        args.type = '%'
        args.name = '%'
        print(co.available_datasets(args.type, args.name).to_string())
Exemplo n.º 6
0
def list_command(args):
    """Print the datasets, or the terms of one dataset, chosen on the CLI.

    Behaviour by argument combination:

    * ``type`` and ``name`` both given: when ``terms`` is truthy, print one
      term id per line for a "GWAS" or "GOnt" dataset. Then print either the
      space-joined ``Name`` column (``names`` truthy) or the table returned
      by ``available_datasets(type, name)``.
    * ``name`` (or both) missing: the missing value(s) are replaced by
      ``"%"`` on ``args`` and the matching table is printed via
      ``to_string()``.

    Args:
        args: Parsed CLI namespace; reads ``type``, ``name``, ``terms`` and
            ``names``. ``type`` / ``name`` are overwritten in place when
            missing.
    """
    # Identity checks are the idiomatic (and __eq__-proof) test for None.
    if args.type is not None and args.name is not None:
        if args.terms:
            if args.type == "GWAS":
                gwas = co.GWAS(args.name)
                print("\n".join([x.id for x in gwas.iter_terms()]))
            elif args.type == "GOnt":
                gont = co.GOnt(args.name)
                print("\n".join([x.id for x in gont.iter_terms()]))
        # NOTE(review): this is a separate `if`, so the dataset listing is
        # printed even after the term listing above. Confirm whether `elif`
        # was intended; behaviour is kept as-is here.
        if args.names:
            print(" ".join(available_datasets(args.type, args.name).Name))
        else:
            print(available_datasets(args.type, args.name))
    elif args.type is not None:
        # Only the type was given: "%" stands in for the missing name
        # (presumably a match-all pattern -- confirm in available_datasets).
        args.name = "%"
        print(available_datasets(args.type, args.name).to_string())
    else:
        # No type given: both fields are replaced, even if a name was given.
        args.type = "%"
        args.name = "%"
        print(available_datasets(args.type, args.name).to_string())
Exemplo n.º 7
0
    def from_CLI(cls, args):
        """
            Implements an interface for the CLI to perform overlap
            Analysis

            The overlap source is chosen from ``args`` in priority order:
            ``genes``, ``go``, ``gwas``, ``ontology``. For every selected
            term the overlap score is computed, bootstrapped, converted to
            z-scores and annotated with an FDR; unless ``args.dry_run`` is
            set, the collated table is written to ``self.args.out`` (tab
            separated) and appended to the "overlap" SQLite table.

            Parameters
            ----------
            args : argparse-style namespace
                Reads ``genes``, ``go``, ``gwas``, ``ontology``, ``cob``,
                ``snp2gene``, ``strongest_attr``, ``strongest_higher``,
                ``terms``, ``skip_terms``, ``candidate_window_size``,
                ``candidate_flank_limit``, ``min_term_size``,
                ``max_term_size``, ``dry_run``, ``method`` and ``out``.
                For the ``genes`` and ``go`` sources the window size and
                flank limit are overwritten with 1 and 0.

            Returns
            -------
            None

            Raises
            ------
            ValueError
                If none of the overlap sources was supplied.
        """
        # Pick the overlap source and fail fast when none was supplied.
        # (Previously this fell through with `source` unbound and crashed
        # with UnboundLocalError before the intended ValueError was reached.)
        if args.genes != [None]:
            source = "genes"
        elif args.go is not None:
            source = "go"
        elif args.gwas is not None:
            source = "gwas"
        elif args.ontology is not None:
            source = 'ontology'
        else:
            raise ValueError(
                "Please provide a valid overlap source (--genes, --go or --gwas)"
            )
        self = cls.create(source+'_CLI', description="CLI Overlap")
        self.source = source
        self.args = args
        # Build base camoco objects
        self.cob = co.COB(args.cob)

        # Generate the ontology of terms that we are going to look
        # at the overlap of
        if source == "genes":
            # Be smart about this: each --genes value may itself be a
            # comma/semicolon/space separated list, so split and flatten.
            import re

            args.genes = list(chain(*[re.split("[,; ]", x) for x in args.genes]))
            # Placeholder "ontology" that only carries a name for reporting
            self.ont = pd.DataFrame()
            self.ont.name = "GeneList"
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        elif source == "go":
            self.ont = co.GOnt(args.go)
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        elif source == "gwas":
            self.ont = co.GWAS(args.gwas)
        else:
            # source == 'ontology' (the only remaining validated value)
            self.ont = co.Ontology(args.ontology)
        try:
            self.generate_output_name()
        except ValueError:
            # NOTE(review): a ValueError here silently aborts the run;
            # presumably it signals that the output should not be
            # (re)generated -- confirm in generate_output_name.
            return

        # Save strongest description arguments if applicable
        if "strongest" in self.args.snp2gene:
            if not (self.ont._global("strongest_attr") == args.strongest_attr):
                self.ont.set_strongest(attr=args.strongest_attr)
            if not (
                bool(int(self.ont._global("strongest_higher")))
                == bool(args.strongest_higher)
            ):
                self.ont.set_strongest(higher=args.strongest_higher)

        # Generate a terms iterable
        if self.source == "genes":
            # make a single term
            loci = self.cob.refgen.from_ids(self.args.genes)
            if len(loci) < len(self.args.genes):
                self.cob.log("Some input genes not in network")
            terms = [Term("CustomTerm", desc="Custom from CLI", loci=loci)]
        else:
            # Generate terms from the ontology
            if "all" in self.args.terms:
                terms = list(self.ont.iter_terms())
            else:
                terms = [self.ont[term] for term in self.args.terms]
        results = []

        num_total_terms = len(terms)
        # Iterate through terms and calculate
        for i, term in enumerate(terms):
            self.cob.log(
                " ---------- Calculating overlap for {} of {} Terms", i, num_total_terms
            )
            if term.id in self.args.skip_terms:
                self.cob.log("Skipping {} since it was in --skip-terms", term.id)
                # Actually skip the term; previously it was only logged
                # and then processed anyway.
                continue
            self.cob.log("Generating SNP-to-gene mapping")
            # If appropriate, generate SNP2Gene Loci
            if self.args.candidate_flank_limit > 0:
                loci = self.snp2gene(term, self.ont)
            else:
                loci = list(term.loci)
                for x in loci:
                    x.window = 1

            # Filter out terms with insufficient or too many genes
            if len(loci) < 2 or len(loci) < args.min_term_size:
                self.cob.log("Not enough genes to perform overlap")
                continue
            if args.max_term_size is not None and len(loci) > args.max_term_size:
                self.cob.log("Too many genes to perform overlap")
                continue

            # Send some output to the terminal
            self.cob.log(
                "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
                term.id,
                self.ont.name,
                self.cob.name,
                self.args.candidate_window_size,
                self.args.candidate_flank_limit,
                len(loci),
            )
            if args.dry_run:
                continue
            # Do the dirty
            try:
                overlap = self.overlap(loci)
            except DataError:
                # Terms whose overlap cannot be computed are dropped
                continue
            self.cob.log("Generating bootstraps")
            bootstraps = self.generate_bootstraps(loci, overlap)
            # Average, over bootstrap iterations, of the per-iteration
            # mean and standard deviation of the score
            bs_mean = bootstraps.groupby("iter").score.apply(np.mean).mean()
            bs_std = bootstraps.groupby("iter").score.apply(np.std).mean()
            # Calculate z scores for density
            self.cob.log("Calculating Z-Scores")
            if bs_std != 0:
                overlap["zscore"] = (overlap.score - bs_mean) / bs_std
                bootstraps["zscore"] = (bootstraps.score - bs_mean) / bs_std
            else:
                # If there is no variation, make all Z-scores 0
                overlap["zscore"] = bootstraps["zscore"] = 0
            # Calculate FDR
            self.cob.log("Calculating FDR")
            overlap["fdr"] = np.nan
            max_zscore = int(overlap.zscore.max()) + 1
            # Walk the z-score cutoffs upward in 0.25 steps; rows passing a
            # higher cutoff are overwritten with that cutoff's statistics.
            for zscore in np.arange(0, max_zscore, 0.25):
                # Mean number of bootstrap rows per iteration at/above cutoff
                num_random = (
                    bootstraps.groupby("iter")
                    .apply(lambda df: sum(df.zscore >= zscore))
                    .mean()
                )
                num_real = sum(overlap.zscore >= zscore)
                # Calculate FDR
                if num_real != 0 and num_random != 0:
                    fdr = num_random / num_real
                elif num_real != 0 and num_random == 0:
                    fdr = 0
                else:
                    fdr = 1
                overlap.loc[overlap.zscore >= zscore, "fdr"] = fdr
                overlap.loc[overlap.zscore >= zscore, "num_real"] = num_real
                overlap.loc[overlap.zscore >= zscore, "num_random"] = num_random
                overlap.loc[overlap.zscore >= zscore, "bs_mean"] = bs_mean
                overlap.loc[overlap.zscore >= zscore, "bs_std"] = bs_std
                overlap.sort_values(by=["zscore"], ascending=False, inplace=True)
            # Fraction of bootstrap iterations whose mean score is at least
            # the observed mean score
            overlap_pval = (
                sum(
                    bootstraps.groupby("iter").apply(lambda x: x.score.mean())
                    >= overlap.score.mean()
                )
            ) / len(bootstraps.iter.unique())
            # This gets collated into the results table below
            overlap["COB"] = self.cob.name
            overlap["Ontology"] = self.ont.name
            overlap["Term"] = term.id
            overlap["WindowSize"] = self.args.candidate_window_size
            overlap["FlankLimit"] = self.args.candidate_flank_limit
            overlap["TermLoci"] = len(term.loci)
            overlap["TermCollapsedLoci"] = len(loci)
            overlap["TermPValue"] = overlap_pval
            overlap["NumBootstraps"] = len(bootstraps.iter.unique())
            overlap["Method"] = self.args.method
            overlap["SNP2Gene"] = self.args.snp2gene
            results.append(overlap.reset_index())
            # Summarize results
            if self.args.method == "density":
                overlap_score = np.nanmean(overlap.score) / (
                    1 / np.sqrt(overlap.num_trans_edges.mean())
                )
            elif self.args.method == "locality":
                overlap_score = np.nanmean(overlap.score)
            self.cob.log(
                "Overlap Score ({}): {} (p<{})".format(
                    self.args.method, overlap_score, overlap_pval
                )
            )
        if not args.dry_run:
            if not results:
                # pd.concat raises on an empty list; every term was skipped
                # or filtered out, so there is nothing to write.
                self.cob.log("No overlap results were generated; nothing to save")
                return
            # Consolidate results and output to files
            self.results = pd.concat(results)
            self.results.to_csv(self.args.out, sep="\t", index=None)

            # Make an actual results object if not exists
            overlap_object = cls.create(self.ont)

            # Save the results to the SQLite table
            self.results.to_sql(
                "overlap",
                sqlite3.connect(overlap_object.db.filename),
                if_exists="append",
                index=False,
            )
Exemplo n.º 8
0
def cob_health(args):
    """Generate the "health" report (plots and tables) for one COB network.

    Every output is written next to the prefix ``args.out`` and is skipped
    when the target file already exists:

    * ``_CoexPCC_raw.png`` / ``_CoexScore_zscore.png`` -- score plots
    * ``_Expr_raw.png`` / ``_Expr_norm.png`` / ``_Expr_cluster.png``
    * ``.summary.txt`` -- output of ``cob.summary``
    * ``_qc_gene.txt`` -- per-chromosome QC sums (only with ``args.refgen``)
    * ``_DegreeDist.png`` -- degree distribution with fitted models
    * ``_GO.csv`` / ``_GO.png`` -- GO term density/locality enrichment
      (only with ``args.go``)

    Args:
        args: Parsed CLI namespace. Reads ``cob``, ``out``, ``refgen``,
            ``go``, ``max_terms``, ``min_term_size``, ``max_term_size``,
            ``two_tailed_GO`` and ``num_bootstraps``. ``out`` is set to
            ``'<cob name>_Health'`` when it is None.
    """
    log = coblog()
    log('\n'
        '-----------------------\n'
        '   Network Health      \n'
        '-----------------------\n')
    cob = co.COB(args.cob)
    if args.out is None:
        args.out = '{}_Health'.format(cob.name)

    log('Plotting Scores ----------------------------------------------------')
    if not path.exists('{}_CoexPCC_raw.png'.format(args.out)):
        cob.plot_scores('{}_CoexPCC_raw.png'.format(args.out), pcc=True)
    else:
        log('Skipped Raw.')

    if not path.exists('{}_CoexScore_zscore.png'.format(args.out)):
        cob.plot_scores('{}_CoexScore_zscore.png'.format(args.out), pcc=False)
    else:
        log('Skipped Norm.')

    log('Plotting Expression ------------------------------------------------')
    if not path.exists('{}_Expr_raw.png'.format(args.out)):
        cob.plot('{}_Expr_raw.png'.format(args.out),
                 include_accession_labels=True,
                 raw=True,
                 cluster_method=None)
    else:
        log('Skipped raw.')
    if not path.exists('{}_Expr_norm.png'.format(args.out)):
        cob.plot('{}_Expr_norm.png'.format(args.out),
                 include_accession_labels=True,
                 raw=False,
                 cluster_method='leaf',
                 cluster_accessions=True)
    else:
        log('Skipped norm.')
    log('Plotting Cluster Expression-----------------------------------------')
    if not path.exists('{}_Expr_cluster.png'.format(args.out)):
        cob.plot('{}_Expr_cluster.png'.format(args.out),
                 include_accession_labels=True,
                 raw=False,
                 cluster_accessions=True,
                 avg_by_cluster=True)
    else:
        # Was 'Skipped norm.' (copy-paste from the block above)
        log('Skipped cluster.')
    log('Printing Summary ---------------------------------------------------')
    if not path.exists('{}.summary.txt'.format(args.out)):
        with open('{}.summary.txt'.format(args.out), 'w') as OUT:
            # Print out the network summary
            cob.summary(file=OUT)
    else:
        log('Skipped summary.')

    log('Printing QC Statistics ---------------------------------------------')
    if args.refgen is not None:
        if not path.exists('{}_qc_gene.txt'.format(args.out)):
            # Print out the breakdown of QC Values
            refgen = co.RefGen(args.refgen)
            gene_qc = cob._bcolz('qc_gene')
            gene_qc = gene_qc[gene_qc.pass_membership]
            gene_qc['chrom'] = [
                'chr' + str(refgen[x].chrom) for x in gene_qc.index
            ]
            gene_qc = gene_qc.groupby('chrom').agg(sum, axis=0)
            # Add totals at the bottom (all columns but the first).
            # `.iloc` replaces `.ix`, which was removed in pandas 1.0; the
            # integer slice was positional there as well.
            totals = gene_qc.iloc[:, 1:].apply(sum)
            totals.name = 'TOTAL'
            # NOTE(review): DataFrame.append was removed in pandas 2.0;
            # switch to pd.concat once the supported pandas range is known.
            gene_qc = gene_qc.append(totals)
            gene_qc.to_csv('{}_qc_gene.txt'.format(args.out), sep='\t')
        else:
            log('Skipped QC summary.')

    log('Plotting Degree Distribution ---------------------------------------')
    if not path.exists('{}_DegreeDist.png'.format(args.out)):
        degree = cob.degree['Degree'].values
        #Using powerlaw makes run-time warning the first time you use it.
        #This is still an open issue on the creators github.
        #The creator recommends removing this warning as long as there is a fit.
        # NOTE: np.seterr changes NumPy's error handling for the whole
        # process, not just for this fit.
        np.seterr(divide='ignore', invalid='ignore')
        fit = powerlaw.Fit(degree, discrete=True, xmin=1)
        # get an axis
        ax = plt.subplot()
        # Calculate log ratios (results are currently not used below)
        t2p = fit.distribution_compare('truncated_power_law', 'power_law')
        t2e = fit.distribution_compare('truncated_power_law', 'exponential')
        p2e = fit.distribution_compare('power_law', 'exponential')
        # Plot!
        emp = fit.plot_ccdf(ax=ax,
                            color='r',
                            linewidth=3,
                            label='Empirical Data')
        pwr = fit.power_law.plot_ccdf(ax=ax,
                                      color='b',
                                      linestyle='--',
                                      label='Power law')
        tpw = fit.truncated_power_law.plot_ccdf(ax=ax,
                                                color='k',
                                                linestyle='--',
                                                label='Truncated Power')
        exp = fit.exponential.plot_ccdf(ax=ax,
                                        color='g',
                                        linestyle='--',
                                        label='Exponential')
        ####
        ax.set_ylabel("p(Degree≥x)")
        ax.set_xlabel("Degree Frequency")
        ax.legend(loc='best')
        plt.title('{} Degree Distribution'.format(cob.name))
        # Save Fig
        try:
            plt.savefig('{}_DegreeDist.png'.format(args.out))
        except FutureWarning:
            # This is a matplotlib bug
            pass
    else:
        log('Skipping Degree Dist.')

    log('Plotting GO --------------------------------------------------------')
    if args.go is not None:
        if not path.exists('{}_GO.csv'.format(args.out)):
            go = co.GOnt(args.go)
            # Parallel per-term columns of the enrichment table
            term_ids = []
            density_emp = []
            density_pvals = []
            locality_emp = []
            locality_pvals = []
            term_sizes = []
            term_desc = []
            terms_tested = 0
            if args.max_terms is not None:
                log('Limiting to {} GO Terms', args.max_terms)
                terms = go.rand(n=args.max_terms,
                                min_term_size=args.min_term_size,
                                max_term_size=args.max_term_size)
            else:
                terms = go.iter_terms(min_term_size=args.min_term_size,
                                      max_term_size=args.max_term_size)
            for term in terms:
                # Keep only loci present in the network, then re-check the
                # size limits against the filtered term
                term.loci = list(filter(lambda x: x in cob, term.loci))
                if len(term) < args.min_term_size or len(
                        term) > args.max_term_size:
                    continue
                #set density value for two tailed go so we only test it once
                density = cob.density(term.loci)
                #one tailed vs two tailed test
                if args.two_tailed_GO is False:
                    #run one tail for only positive values
                    if density > 0:
                        density_emp.append(density)
                    #skip negative density values
                    else:
                        continue
                #if two_tailed_go is not none
                else:
                    density_emp.append(density)
                term_ids.append(term.id)
                term_sizes.append(len(term))
                term_desc.append(str(term.desc))
                # ------ Density
                # Calculate PVals: compare against random gene sets of the
                # same size, in the direction of the observed sign
                density_bs = np.array([
                    cob.density(cob.refgen.random_genes(n=len(term.loci))) \
                    for x in range(args.num_bootstraps)
                ])
                if density > 0:
                    pval = sum(density_bs >= density) / args.num_bootstraps
                else:
                    pval = sum(density_bs <= density) / args.num_bootstraps
                density_pvals.append(pval)

                # ------- Locality
                locality = cob.locality(term.loci,
                                        include_regression=True).resid.mean()
                locality_emp.append(locality)
                # Calculate PVals
                locality_bs = np.array([
                    cob.locality(
                        cob.refgen.random_genes(n=len(term.loci)),
                        include_regression=True
                    ).resid.mean() \
                    for x in range(args.num_bootstraps)
                ])
                if locality > 0:
                    pval = sum(locality_bs >= locality) / args.num_bootstraps
                else:
                    pval = sum(locality_bs <= locality) / args.num_bootstraps
                locality_pvals.append(pval)
                # -------------
                terms_tested += 1
                if terms_tested % 100 == 0 and terms_tested > 0:
                    log('Processed {} terms'.format(terms_tested))
            go_enrichment = pd.DataFrame({
                'GOTerm': term_ids,
                'desc': term_desc,
                'size': term_sizes,
                'density': density_emp,
                'density_pval': density_pvals,
                'locality': locality_emp,
                'locality_pval': locality_pvals
            })
            # Only the written CSV is sorted; the in-memory table keeps
            # its original order
            go_enrichment\
                .sort_values(by='density_pval',ascending=True)\
                .to_csv('{}_GO.csv'.format(args.out),index=False)
            if terms_tested == 0:
                log.warn('No GO terms met your min/max gene criteria!')
        else:
            go_enrichment = pd.read_table('{}_GO.csv'.format(args.out),
                                          sep=',')

        if go_enrichment.empty:
            # With no rows the fold-enrichment ratio below divides by zero,
            # so there is nothing meaningful to plot.
            log('No GO terms to plot. Skipping GO Volcano.')
        elif not path.exists('{}_GO.png'.format(args.out)):
            # Convert pvals to log10
            with np.errstate(divide='ignore'):
                # When no bootstraps are more extreme than the term, the minus log pval yields an infinite
                go_enrichment['density_pval'] = -1 * np.log10(
                    go_enrichment['density_pval'])
                go_enrichment['locality_pval'] = -1 * np.log10(
                    go_enrichment['locality_pval'])
                # Fix the infinites so they are plotted
                max_density = np.max(go_enrichment['density_pval'][np.isfinite(
                    go_enrichment['density_pval'])])
                max_locality = np.max(
                    go_enrichment['locality_pval'][np.isfinite(
                        go_enrichment['locality_pval'])])
                go_enrichment.loc[
                    np.logical_not(np.isfinite(go_enrichment['density_pval'])),
                    'density_pval'] = max_density + 1
                go_enrichment.loc[np.logical_not(
                    np.isfinite(go_enrichment['locality_pval'])),
                                  'locality_pval'] = max_locality + 1
            plt.clf()
            figure, axes = plt.subplots(3, 2, figsize=(12, 12))
            # -----------
            # Density
            # ----------
            axes[0, 0].scatter(go_enrichment['density'],
                               go_enrichment['density_pval'],
                               alpha=0.05)
            axes[0, 0].set_xlabel('Empirical Density (Z-Score)')
            axes[0, 0].set_ylabel('Bootstraped -log10(p-value)')
            # 1.3 ~= -log10(0.05): ratio of terms significant at p<0.05 to
            # the 5% of all terms used as the reference count
            fold = sum(np.array(go_enrichment['density_pval']) > 1.3) / (
                0.05 * len(go_enrichment))
            axes[0, 0].axhline(y=-1 * np.log10(0.05), color='red')
            axes[0, 0].text(min(axes[0, 0].get_xlim()),
                            -1 * np.log10(0.05),
                            '{:.3g} Fold Enrichement'.format(fold),
                            color='red')
            axes[1, 0].scatter(go_enrichment['size'],
                               go_enrichment['density_pval'],
                               alpha=0.05)
            axes[1, 0].set_ylabel('Bootstrapped -log10(p-value)')
            axes[1, 0].set_xlabel('Term Size')
            axes[1, 0].axhline(y=-1 * np.log10(0.05), color='red')
            axes[2, 0].scatter(go_enrichment['size'],
                               go_enrichment['density'],
                               alpha=0.05)
            # Overlay the significant terms in red
            axes[2,
                 0].scatter(go_enrichment.query('density_pval>1.3')['size'],
                            go_enrichment.query('density_pval>1.3')['density'],
                            alpha=0.05,
                            color='r')
            axes[2, 0].set_ylabel('Density')
            axes[2, 0].set_xlabel('Term Size')
            # ------------
            # Do Locality
            # ------------
            axes[0, 1].scatter(go_enrichment['locality'],
                               go_enrichment['locality_pval'],
                               alpha=0.05)
            axes[0, 1].set_xlabel('Empirical Locality (Residual)')
            axes[0, 1].set_ylabel('Bootstraped -log10(p-value)')
            fold = sum(np.array(go_enrichment['locality_pval']) > 1.3) / (
                0.05 * len(go_enrichment))
            axes[0, 1].axhline(y=-1 * np.log10(0.05), color='red')
            axes[0, 1].text(min(axes[0, 1].get_xlim()),
                            -1 * np.log10(0.05),
                            '{:.3g} Fold Enrichement'.format(fold),
                            color='red')
            axes[1, 1].scatter(go_enrichment['size'],
                               go_enrichment['locality_pval'],
                               alpha=0.05)
            axes[1, 1].set_xlabel('Term Size')
            axes[1, 1].set_ylabel('Bootstrapped -log10(p-value)')
            axes[1, 1].axhline(y=-1 * np.log10(0.05), color='red')
            axes[2, 1].scatter(go_enrichment['size'],
                               go_enrichment['locality'],
                               alpha=0.05)
            axes[2, 1].scatter(
                go_enrichment.query('locality_pval>1.3')['size'],
                go_enrichment.query('locality_pval>1.3')['locality'],
                alpha=0.05,
                color='r')
            # This panel plots locality; it was mislabelled 'Density'
            axes[2, 1].set_ylabel('Locality')
            axes[2, 1].set_xlabel('Term Size')
            # Save Figure
            plt.tight_layout()
            try:
                plt.savefig('{}_GO.png'.format(args.out))
            except FutureWarning:
                # Same matplotlib issue as for the degree-distribution plot
                pass
        else:
            log('Skipping GO Volcano.')
Exemplo n.º 9
0
    refgen = co.RefGen(ref)
    if refgen.has_annotations():
        print('Processing annotations for {}...'.format(ref))
        func_data_db[ref] = refgen
        func_data_db[ref].export_annotations(
            os.path.join(conf['scratch'], (ref + '.tsv')))
        if hasGWS:
            geneWordBuilder(ref,
                            [os.path.join(conf['scratch'], (ref + '.tsv'))],
                            [1], ['2 end'], ['tab'], [True])

# Collect one GO ontology per reference genome name; the first ontology
# listed for a given refgen is the one that is kept.
print('Finding applicable GO Ontologies...')
GOnt_db = {}
for name in co.Tools.available_datasets('GOnt')['Name']:
    gont = co.GOnt(name)
    # setdefault leaves an existing entry untouched, so later ontologies
    # for the same refgen never replace the first one found.
    GOnt_db.setdefault(gont.refgen.name, gont)
# Generate in memory term lists
print('Finding all available terms...')
terms = {}
for name, ont in onts.items():
    terms[name] = []
    for term in ont.iter_terms():
        terms[name].append({
            'name':
            term.id,
            'desc':
            term.desc,
            'snps':
Exemplo n.º 10
0
    def from_CLI(cls, args):
        '''
            Implements an interface for the CLI to perform overlap
            Analysis
        '''
        if args.genes != [None]:
            source = 'genes'
        elif args.go is not None:
            source = 'go'
        elif args.gwas is not None:
            source = 'gwas'
        self = cls.create(source, description='CLI Overlap')
        self.source = source
        self.args = args
        # Build base camoco objects
        self.cob = co.COB(args.cob)

        # Generate the ontology of terms that we are going to look
        # at the overlap of
        if source == 'genes':
            # Be smart about this
            import re
            args.genes = list(
                chain(*[re.split('[,; ]', x) for x in args.genes]))
            self.ont = pd.DataFrame()
            self.ont.name = 'GeneList'
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        elif source == 'go':
            self.ont = co.GOnt(args.go)
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        elif source == 'gwas':
            self.ont = co.GWAS(args.gwas)
        else:
            raise ValueError(
                'Please provide a valid overlap source (--genes, --go or --gwas)'
            )
        try:
            self.generate_output_name()
        except ValueError as e:
            return

        # Save strongest description arguments if applicable
        if 'strongest' in self.args.snp2gene:
            if not (self.ont._global('strongest_attr') == args.strongest_attr):
                self.ont.set_strongest(attr=args.strongest_attr)
            if not (bool(int(self.ont._global('strongest_higher'))) == bool(
                    args.strongest_higher)):
                self.ont.set_strongest(higher=args.strongest_higher)

        # Generate a terms iterable
        if self.source == 'genes':
            # make a single term
            loci = self.cob.refgen.from_ids(self.args.genes)
            if len(loci) < len(self.args.genes):
                self.cob.log('Some input genes not in network')
            terms = [Term('CustomTerm', desc='Custom from CLI', loci=loci)]
        else:
            # Generate terms from the ontology
            if 'all' in self.args.terms:
                terms = list(self.ont.iter_terms())
            else:
                terms = [self.ont[term] for term in self.args.terms]
        all_results = list()
        results = []

        num_total_terms = len(terms)
        # Iterate through terms and calculate
        for i, term in enumerate(terms):
            self.cob.log(' ---------- Calculating overlap for {} of {} Terms',
                         i, num_total_terms)
            if term.id in self.args.skip_terms:
                self.cob.log('Skipping {} since it was in --skip-terms',
                             term.id)
            self.cob.log('Generating SNP-to-gene mapping')
            # If appropriate, generate SNP2Gene Loci
            if self.args.candidate_flank_limit > 0:
                loci = self.snp2gene(term, self.ont)
            else:
                loci = list(term.loci)
                for x in loci:
                    x.window = 1

            # Filter out terms with insufficient or too many genes
            if len(loci) < 2 or len(loci) < args.min_term_size:
                self.cob.log('Not enough genes to perform overlap')
                continue
            if args.max_term_size != None and len(loci) > args.max_term_size:
                self.cob.log('Too many genes to perform overlap')
                continue

            # Send some output to the terminal
            self.cob.log(
                "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
                term.id, self.ont.name, self.cob.name,
                self.args.candidate_window_size,
                self.args.candidate_flank_limit, len(loci))
            if args.dry_run:
                continue
            # Do the dirty
            try:
                overlap = self.overlap(loci)
            except DataError as e:
                continue
            self.cob.log('Generating bootstraps')
            bootstraps = self.generate_bootstraps(loci, overlap)
            bs_mean = bootstraps.groupby('iter').score.apply(np.mean).mean()
            bs_std = bootstraps.groupby('iter').score.apply(np.std).mean()
            # Calculate z scores for density
            self.cob.log('Calculating Z-Scores')
            if bs_std != 0:
                overlap['zscore'] = (overlap.score - bs_mean) / bs_std
                bootstraps['zscore'] = (bootstraps.score - bs_mean) / bs_std
            else:
                # If there is no variation, make all Z-scores 0
                overlap['zscore'] = bootstraps['zscore'] = 0
            # Calculate FDR
            self.cob.log('Calculating FDR')
            overlap['fdr'] = np.nan
            max_zscore = int(overlap.zscore.max()) + 1
            for zscore in np.arange(0, max_zscore, 0.25):
                num_random = bootstraps\
                        .groupby('iter')\
                        .apply(lambda df: sum(df.zscore >= zscore))\
                        .mean()
                num_real = sum(overlap.zscore >= zscore)
                # Calculate FDR
                if num_real != 0 and num_random != 0:
                    fdr = num_random / num_real
                elif num_real != 0 and num_random == 0:
                    fdr = 0
                else:
                    fdr = 1
                overlap.loc[overlap.zscore >= zscore, 'fdr'] = fdr
                overlap.loc[overlap.zscore >= zscore, 'num_real'] = num_real
                overlap.loc[overlap.zscore >= zscore,
                            'num_random'] = num_random
                overlap.loc[overlap.zscore >= zscore, 'bs_mean'] = bs_mean
                overlap.loc[overlap.zscore >= zscore, 'bs_std'] = bs_std
                overlap.sort_values(by=['zscore'],
                                    ascending=False,
                                    inplace=True)
            overlap_pval = (
                (sum(bootstraps.groupby('iter').apply(lambda x: x.score.mean()) >= overlap.score.mean()))\
                / len(bootstraps.iter.unique())
            )
            # This gets collated into all_results below
            overlap['COB'] = self.cob.name
            overlap['Ontology'] = self.ont.name
            overlap['Term'] = term.id
            overlap['WindowSize'] = self.args.candidate_window_size
            overlap['FlankLimit'] = self.args.candidate_flank_limit
            overlap['TermLoci'] = len(term.loci)
            overlap['TermCollapsedLoci'] = len(loci)
            overlap['TermPValue'] = overlap_pval
            overlap['NumBootstraps'] = len(bootstraps.iter.unique())
            overlap['Method'] = self.args.method
            overlap['SNP2Gene'] = self.args.snp2gene
            results.append(overlap.reset_index())
            # Summarize results
            if self.args.method == 'density':
                overlap_score = np.nanmean(overlap.score) / (
                    1 / np.sqrt(overlap.num_trans_edges.mean()))
            elif self.args.method == 'locality':
                overlap_score = np.nanmean(overlap.score)
            self.cob.log('Overlap Score ({}): {} (p<{})'.format(
                self.args.method, overlap_score, overlap_pval))
        if not args.dry_run:
            # Consolidate results and output to files
            self.results = pd.concat(results)
            self.results.to_csv(self.args.out, sep='\t', index=None)

            # Make an actual results object if not exists
            overlap_object = cls.create(self.ont)

            # Save the results to the SQLite table
            self.results.to_sql('overlap',
                                sqlite3.connect(overlap_object.db.filename),
                                if_exists='append',
                                index=False)
Exemplo n.º 11
0
    def from_CLI(cls, args):
        """
            Implements an interface to the CLI to perform GWAS simulation.

            For every selected GO term the effective loci that are present
            in the COB are passed through `simulate_missing_candidates` and
            `simulate_false_candidates`, scored with `self.overlap`, and
            compared against `self.generate_bootstraps`. One annotated
            overlap table per term is collected, concatenated into
            `self.results` and written to `args.out` as a tab-separated
            file.

            Parameters
            ----------
            args : namespace of parsed CLI options. Attributes read here:
                GOnt, cob, terms, min_term_size, max_term_size,
                candidate_window_size, candidate_flank_limit, percent_mcr,
                percent_fcr, method, out.

            Returns
            -------
            None. Results are stored on `self.results`; when no term
            produces a result, `self.results` is an empty DataFrame and no
            file is written.
        """
        self = cls()
        # Build the base objects
        self.args = args
        # Load camoco objects
        self.go = co.GOnt(self.args.GOnt)
        self.cob = co.COB(self.args.cob)
        self.generate_output_name()

        # Generate a list of GO Terms
        if "all" in self.args.terms:
            # Every term within the size specification
            terms = list(
                self.go.iter_terms(
                    min_term_size=self.args.min_term_size,
                    max_term_size=self.args.max_term_size,
                ))
        elif os.path.exists(self.args.terms[0]):
            # The first parameter names a file: read one term id per line.
            # The context manager closes the handle once the ids are read.
            with open(self.args.terms[0]) as term_file:
                terms = [self.go[line.strip()] for line in term_file]
        else:
            # Terms were given directly as a parameter list
            terms = [self.go[x] for x in self.args.terms]

        log("Simulating GWAS for {} GO Terms", len(terms))
        if terms:
            # np.min/np.max raise on an empty sequence, so only report the
            # size range when there is at least one term
            term_sizes = [len(x) for x in terms]
            log("All terms are between {} and {} 'SNPs'", np.min(term_sizes),
                np.max(term_sizes))

        window_size = self.args.candidate_window_size
        flank_limit = self.args.candidate_flank_limit
        max_term_size = self.args.max_term_size
        min_term_size = self.args.min_term_size

        results = []
        for i, term in enumerate(terms):
            log("-" * 75)
            # Count the term's genes that are in the network and build the
            # (perturbed) effective loci for the simulation
            num_genes = len([x for x in term.loci if x in self.cob])
            eloci = [
                x for x in term.effective_loci(window_size=window_size)
                if x in self.cob
            ]
            eloci = self.simulate_missing_candidates(eloci,
                                                     self.args.percent_mcr)
            eloci = self.simulate_false_candidates(eloci,
                                                   self.args.percent_fcr)
            log(
                "GWAS Simulation {}: {} ({}/{} genes in {})",
                i,
                term.id,
                len(eloci),
                num_genes,
                self.cob.name,
            )
            # Make sure that the number of genes is adequate. A max size of
            # None means "no upper limit", matching the overlap CLI.
            if max_term_size is not None and num_genes > max_term_size:
                log("Too many genes... skipping")
                continue
            elif num_genes < min_term_size:
                log("Too few genes... skipping")
                continue
            elif num_genes == 0:
                continue
            # Generate candidate genes from the effective loci
            candidates = self.cob.refgen.candidate_genes(
                eloci, flank_limit=flank_limit)
            log(
                "SNP to gene mapping finds {} genes at window:{} bp, "
                "flanking:{} genes",
                len(candidates),
                window_size,
                flank_limit,
            )
            overlap = self.overlap(eloci)
            # Dont bother bootstrapping on terms with overlap score below 0
            if overlap.score.mean() < 0:
                continue
            bootstraps = self.generate_bootstraps(eloci, overlap)
            per_iter_scores = bootstraps.groupby("iter").score
            bs_mean = per_iter_scores.apply(np.mean).mean()
            bs_std = per_iter_scores.apply(np.std).mean()
            # Calculate z scores for density
            if bs_std != 0:
                overlap["zscore"] = (overlap.score - bs_mean) / bs_std
                bootstraps["zscore"] = (bootstraps.score - bs_mean) / bs_std
            else:
                # If there is no variation, make all Z-scores 0 instead of
                # dividing by zero
                overlap["zscore"] = bootstraps["zscore"] = 0
            # Fraction of bootstrap iterations whose mean score is at least
            # as large as the observed mean score
            num_bootstraps = len(bootstraps.iter.unique())
            overlap_pval = (sum(
                bootstraps.groupby("iter").apply(lambda x: x.score.mean()) >=
                overlap.score.mean())) / num_bootstraps
            # Create a results object
            overlap["COB"] = self.cob.name
            overlap["Ontology"] = self.go.name
            overlap["Term"] = term.id
            overlap["WindowSize"] = window_size
            overlap["FlankLimit"] = flank_limit
            overlap["FCR"] = args.percent_fcr
            overlap["MCR"] = args.percent_mcr
            overlap["NumRealGenes"] = num_genes
            overlap["NumEffective"] = len(eloci)
            overlap["NumCandidates"] = len(candidates)
            overlap["TermSize"] = len(term)
            overlap["TermCollapsedLoci"] = len(eloci)
            overlap["TermPValue"] = overlap_pval
            overlap["NumBootstraps"] = num_bootstraps
            overlap["Method"] = self.args.method
            results.append(overlap.reset_index())

        if not results:
            # pd.concat raises on an empty list; report it instead
            log("No terms produced simulation results, nothing written")
            self.results = pd.DataFrame()
            return
        self.results = pd.concat(results)
        self.results.to_csv(args.out, sep="\t", index=False)
Exemplo n.º 12
0
    def from_CLI(cls, args):
        '''
            Implements an interface for the CLI to perform overlap
            Analysis.

            For each requested term the SNP-to-gene loci are scored with
            `self.overlap`, compared against `self.generate_bootstraps`,
            converted to Z-scores and annotated with an FDR estimated in
            Z-score steps of 0.25. Unless `args.dry_run` is set, the
            per-term tables are concatenated into `self.results`, written
            to `self.args.out` as a tab-separated file and appended to the
            'overlap' table of the overlap object's SQLite database.

            Parameters
            ----------
            args : namespace of parsed CLI options. Attributes read here:
                gwas, cob, go, terms, skip_terms, min_term_size,
                max_term_size, candidate_window_size,
                candidate_flank_limit, dry_run, method, snp2gene, out.
                When `args.go` is set, candidate_window_size and
                candidate_flank_limit are overwritten with 1 and 0.

            Returns
            -------
            None.
        '''

        self = cls.create(args.gwas, description='CLI Overlap')
        # Build base camoco objects
        self.args = args
        self.cob = co.COB(args.cob)
        if args.go:
            self.ont = co.GOnt(args.gwas)
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        else:
            self.ont = co.GWAS(args.gwas)
        self.generate_output_name()

        # Generate a terms iterable
        if 'all' in self.args.terms:
            terms = self.ont.iter_terms()
        else:
            terms = [self.ont[term] for term in self.args.terms]

        results = []
        # Iterate through terms and calculate
        for term in terms:
            if term.id in self.args.skip_terms:
                self.cob.log('Skipping {} since it was in --skip-terms',
                             term.id)
                # Actually skip the term; without this the term was
                # processed despite the log message
                continue
            # Generate SNP2Gene Loci
            loci = self.snp2gene(term)
            if len(loci) < 2 or len(loci) < args.min_term_size:
                self.cob.log('Not enough genes to perform overlap')
                continue
            if args.max_term_size is not None and len(loci) > args.max_term_size:
                self.cob.log('Too many genes to perform overlap')
                continue
            # Send some output to the terminal
            self.cob.log(
                "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
                term.id, self.ont.name, self.cob.name,
                self.args.candidate_window_size,
                self.args.candidate_flank_limit, len(loci))
            if args.dry_run:
                continue

            # Do the dirty
            try:
                overlap = self.overlap(loci)
            except DataError as e:
                # Best effort: move on to the next term, but say why
                self.cob.log('Could not calculate overlap for {}: {}',
                             term.id, e)
                continue
            bootstraps = self.generate_bootstraps(loci, overlap)
            bs_mean = bootstraps.groupby('iter').score.apply(np.mean).mean()
            bs_std = bootstraps.groupby('iter').score.apply(np.std).mean()
            # Calculate z scores for density
            if bs_std != 0:
                overlap['zscore'] = (overlap.score - bs_mean) / bs_std
                bootstraps['zscore'] = (bootstraps.score - bs_mean) / bs_std
            else:
                # If there is no variation, make all Z-scores 0
                overlap['zscore'] = bootstraps['zscore'] = 0
            # Calculate FDR: for each Z-score cutoff compare the mean
            # number of bootstrap hits per iteration to the real hits
            overlap['fdr'] = np.nan
            max_zscore = int(overlap.zscore.max()) + 1
            for zscore in np.arange(0, max_zscore, 0.25):
                num_random = bootstraps\
                        .groupby('iter')\
                        .apply(lambda df: sum(df.zscore >= zscore))\
                        .mean()
                passing = overlap.zscore >= zscore
                num_real = sum(passing)
                if num_real != 0 and num_random != 0:
                    fdr = num_random / num_real
                elif num_real != 0 and num_random == 0:
                    fdr = 0
                else:
                    fdr = 1
                # Rows passing a higher cutoff are overwritten on later
                # iterations, so each row keeps its strictest cutoff
                overlap.loc[passing, 'fdr'] = fdr
                overlap.loc[passing, 'num_real'] = num_real
                overlap.loc[passing, 'num_random'] = num_random
                overlap.loc[passing, 'bs_mean'] = bs_mean
                overlap.loc[passing, 'bs_std'] = bs_std
            # Sorting does not depend on the cutoff, so do it once after
            # the loop rather than on every iteration
            overlap.sort_values(by=['zscore'],
                                ascending=False,
                                inplace=True)
            # Fraction of bootstrap iterations whose mean score is at
            # least as large as the observed mean score
            num_bootstraps = len(bootstraps.iter.unique())
            overlap_pval = (
                sum(bootstraps.groupby('iter').apply(lambda x: x.score.mean())
                    >= overlap.score.mean())
                / num_bootstraps
            )
            # This gets collated into the results below
            overlap['COB'] = self.cob.name
            overlap['Ontology'] = self.ont.name
            overlap['Term'] = term.id
            overlap['WindowSize'] = self.args.candidate_window_size
            overlap['FlankLimit'] = self.args.candidate_flank_limit
            overlap['TermLoci'] = len(term.loci)
            overlap['TermCollapsedLoci'] = len(loci)
            overlap['TermPValue'] = overlap_pval
            overlap['NumBootstraps'] = num_bootstraps
            overlap['Method'] = self.args.method
            overlap['SNP2Gene'] = self.args.snp2gene
            results.append(overlap.reset_index())
        if not args.dry_run:
            if not results:
                # pd.concat raises on an empty list; report it instead
                self.cob.log('No terms produced overlap results, nothing written')
                return
            self.results = pd.concat(results)
            self.results.to_csv(self.args.out, sep='\t', index=False)
            overlap_object = cls.create(self.ont)
            overlap_object.results = results
            # Close the connection explicitly instead of leaving it to the
            # garbage collector
            connection = sqlite3.connect(overlap_object.db.filename)
            try:
                self.results.to_sql('overlap',
                                    connection,
                                    if_exists='append',
                                    index=False)
            finally:
                connection.close()
Exemplo n.º 13
0
    sys.exit(2)

# Read the network and ontology names from the parsed command-line options
for opt, arg in opts:
    if opt in ("-h", "--help"):
        usage()
        sys.exit(2)
    elif opt in ("-c", "--cob"):
        cob = arg
    elif opt in ("-g", "--go"):
        go = arg
    else:
        assert False, "unhandled option"

# Replace the names with the actual network and GO objects
cob = co.COB(cob)
go = co.GOnt(go)

TotGenes = cob.num_genes()

# cob.clusters is converted to a DataFrame so it can be walked row by row
Clusters = pd.DataFrame(cob.clusters)

# Group row labels by the value found under label 0 of each row:
# key = cluster, value = list of row labels (genes) in that cluster,
# with clusters kept in order of first appearance
ClustDict = collections.OrderedDict()
for index, row in Clusters.iterrows():
    ClustDict.setdefault(row[0], []).append(index)
Exemplo n.º 14
0
def cob_health(args):
    """
    Write a set of "health" reports for the co-expression network args.cob.

    Every report is written to a file named after the prefix `args.out`
    (default: "<cob.name>_Health") and is skipped when that file already
    exists:

    - {out}.summary.txt          network summary (cob.summary)
    - {out}_CoexPCC_raw.png      score plot with pcc=True
    - {out}_CoexScore_zscore.png score plot with pcc=False
    - {out}_Expr_norm.png        expression heatmap
    - {out}_qc_gene.txt          per-chromosome QC totals (needs args.refgen)
    - {out}_DegreeDist.png       degree distribution with power-law fits
    - {out}_GO.csv / {out}_GO.png  bootstrapped density and locality
                                 p-values per GO term (needs args.go)

    Parameters
    ----------
    args : namespace of parsed CLI options. Attributes read here: cob,
        out, edge_zscore_cutoff, refgen, go, two_tailed, max_terms,
        min_term_size, max_term_size, num_bootstraps. `args.out` is
        filled in when it is None.

    Returns
    -------
    None.
    """
    log = coblog()
    log(
        f"\n"
        f"-----------------------------\n"
        f"   Network Health:{args.cob} \n"
        f"-----------------------------\n"
    )
    log(f"\nCreating reports in {os.getcwd()}\n\n")

    cob = co.COB(args.cob)
    if args.out is None:
        args.out = "{}_Health".format(cob.name)
    log(f"Output prefix: {args.out}")

    if args.edge_zscore_cutoff is not None:
        log("Changing Z-Score cutoff to {}", args.edge_zscore_cutoff)
        cob.set_sig_edge_zscore(args.edge_zscore_cutoff)

    log("Printing Summary ---------------------------------------------------")
    if not path.exists("{}.summary.txt".format(args.out)):
        with open("{}.summary.txt".format(args.out), "w") as OUT:
            # Print out the network summary
            cob.summary(file=OUT)
    else:
        log("Skipped summary.")

    log("Plotting Scores ----------------------------------------------------")
    if not path.exists("{}_CoexPCC_raw.png".format(args.out)):
        cob.plot_scores("{}_CoexPCC_raw.png".format(args.out), pcc=True)
    else:
        log("Skipped Raw.")

    if not path.exists("{}_CoexScore_zscore.png".format(args.out)):
        cob.plot_scores("{}_CoexScore_zscore.png".format(args.out), pcc=False)
    else:
        log("Skipped Norm.")

    log("Plotting Expression ------------------------------------------------")
    # if not path.exists('{}_Expr_raw.png'.format(args.out)):
    #    cob.plot(
    #        '{}_Expr_raw.png'.format(args.out),
    #        include_accession_labels=True,
    #        raw=True,
    #        cluster_method=None
    #    )
    # else:
    #    log('Skipped raw.')
    if not path.exists("{}_Expr_norm.png".format(args.out)):
        cob.plot_heatmap(
            "{}_Expr_norm.png".format(args.out),
            include_accession_labels=True,
            raw=False,
            cluster_method="ward",
            cluster_accessions=True,
        )
    else:
        log("Skipped norm.")
    # log('Plotting Cluster Expression-----------------------------------------')
    # if not path.exists('{}_Expr_cluster.png'.format(args.out)):
    #    cob.plot(
    #        '{}_Expr_cluster.png'.format(args.out),
    #        include_accession_labels=True,
    #        raw=False,
    #        cluster_accessions=True,
    #        avg_by_cluster=True
    #    )
    # else:
    #    log('Skipped norm.')

    log("Printing QC Statistics ---------------------------------------------")
    if args.refgen is not None:
        if not path.exists("{}_qc_gene.txt".format(args.out)):
            # Print out the breakdown of QC Values
            refgen = co.RefGen(args.refgen)
            gene_qc = cob._bcolz("qc_gene")
            gene_qc = gene_qc[gene_qc.pass_membership]
            gene_qc["chrom"] = ["chr" + str(refgen[x].chrom) for x in gene_qc.index]
            gene_qc = gene_qc.groupby("chrom").agg(sum, axis=0)
            # Add totals at the bottom. The first column is left out of the
            # totals, so it ends up empty (NaN) in the TOTAL row.
            # `.iloc` and `.loc` replace `DataFrame.ix` and
            # `DataFrame.append`, which no longer exist in current pandas.
            totals = gene_qc.iloc[:, 1:].apply(sum)
            gene_qc.loc["TOTAL"] = totals
            gene_qc.to_csv("{}_qc_gene.txt".format(args.out), sep="\t")
        else:
            log("Skipped QC summary.")

    log("Plotting Degree Distribution ---------------------------------------")
    if not path.exists("{}_DegreeDist.png".format(args.out)):
        degree = cob.degree["Degree"].values
        # Using powerlaw makes run-time warning the first time you use it.
        # This is still an open issue on the creators github.
        # The creator recommends removing this warning as long as there is a fit.
        # NOTE(review): np.seterr changes the numpy error state for the rest
        # of the process, not only for this fit.
        np.seterr(divide="ignore", invalid="ignore")
        fit = powerlaw.Fit(degree, discrete=True, xmin=1)
        # get an axis
        ax = plt.subplot()
        # Calculate log ratios
        # NOTE(review): these three results are not used further down.
        t2p = fit.distribution_compare("truncated_power_law", "power_law")
        t2e = fit.distribution_compare("truncated_power_law", "exponential")
        p2e = fit.distribution_compare("power_law", "exponential")
        # Plot the empirical CCDF together with the three fitted models
        fit.plot_ccdf(ax=ax, color="r", linewidth=3, label="Empirical Data")
        fit.power_law.plot_ccdf(
            ax=ax, linewidth=2, color="b", linestyle=":", label="Power law"
        )
        fit.truncated_power_law.plot_ccdf(
            ax=ax, linewidth=2, color="k", linestyle="-.", label="Truncated Power"
        )
        fit.exponential.plot_ccdf(
            ax=ax, linewidth=2, color="g", linestyle="--", label="Exponential"
        )
        ####
        ax.set_ylabel("p(Degree≥x)")
        ax.set_xlabel("Degree Frequency")
        ax.legend(loc="best")
        plt.title("{} Degree Distribution".format(cob.name))
        # Save Fig
        try:
            plt.savefig("{}_DegreeDist.png".format(args.out))
        except FutureWarning:
            # This is a matplotlib bug
            pass
    else:
        log("Skipping Degree Dist.")

    if args.go is not None:
        log("Plotting GO --------------------------------------------------------")
        # Set the alpha based on the tails
        if args.two_tailed == True:
            alpha = 0.05 / 2
        else:
            alpha = 0.05
        # Generate the GO Table
        if not path.exists("{}_GO.csv".format(args.out)):
            go = co.GOnt(args.go)
            term_ids = []
            density_emp = []
            density_pvals = []
            locality_emp = []
            locality_pvals = []
            term_sizes = []
            term_desc = []
            terms_tested = 0
            # max_terms limits the number of GO terms tested (sub-sampling)
            if args.max_terms is not None:
                log("Limiting to {} GO Terms", args.max_terms)
                terms = go.rand(
                    n=args.max_terms,
                    min_term_size=args.min_term_size,
                    max_term_size=args.max_term_size,
                )
            else:
                # Else do the whole set (default is terms between 10 and 300 genes)
                terms = go.iter_terms(
                    min_term_size=args.min_term_size, max_term_size=args.max_term_size
                )
            for term in terms:
                # Some terms will lose genes that are not in networks
                term.loci = list(filter(lambda x: x in cob, term.loci))
                # Skip terms that are not an adequate size
                if len(term) < args.min_term_size or len(term) > args.max_term_size:
                    continue
                # set density value for two tailed go so we only test it once
                density = cob.density(term.loci)
                density_emp.append(density)
                term_ids.append(term.id)
                term_sizes.append(len(term))
                term_desc.append(str(term.desc))
                # ------ Density
                # Bootstrap the density of random gene sets of the same size
                density_bs = np.array(
                    [
                        cob.density(cob.refgen.random_genes(n=len(term.loci)))
                        for x in range(args.num_bootstraps)
                    ]
                )
                # The p-value is taken from the tail the observed value is in
                if density > 0:
                    pval = sum(density_bs >= density) / args.num_bootstraps
                else:
                    pval = sum(density_bs <= density) / args.num_bootstraps
                density_pvals.append(pval)

                # ------- Locality
                locality = cob.locality(term.loci, include_regression=True).resid.mean()
                locality_emp.append(locality)
                # Bootstrap the locality of random gene sets of the same size
                locality_bs = np.array(
                    [
                        cob.locality(
                            cob.refgen.random_genes(n=len(term.loci)),
                            include_regression=True,
                        ).resid.mean()
                        for x in range(args.num_bootstraps)
                    ]
                )
                if locality > 0:
                    pval = sum(locality_bs >= locality) / args.num_bootstraps
                else:
                    pval = sum(locality_bs <= locality) / args.num_bootstraps
                locality_pvals.append(pval)
                # -------------
                terms_tested += 1
                if terms_tested % 100 == 0 and terms_tested > 0:
                    log("Processed {} terms".format(terms_tested))
            go_enrichment = pd.DataFrame(
                {
                    "GOTerm": term_ids,
                    "desc": term_desc,
                    "size": term_sizes,
                    "density": density_emp,
                    "density_pval": density_pvals,
                    "locality": locality_emp,
                    "locality_pval": locality_pvals,
                }
            )
            # Calculate significance
            go_enrichment["density_significant"] = go_enrichment.density_pval < alpha
            go_enrichment["locality_significant"] = go_enrichment.locality_pval < alpha
            # Calculate bonferonni
            go_enrichment["density_bonferroni"] = go_enrichment.density_pval < (
                alpha / len(go_enrichment)
            )
            go_enrichment["locality_bonferroni"] = go_enrichment.locality_pval < (
                alpha / len(go_enrichment)
            )
            # Store the GO results in a CSV
            go_enrichment.sort_values(by="density_pval", ascending=True).to_csv(
                "{}_GO.csv".format(args.out), index=False
            )
            if terms_tested == 0:
                log.warn("No GO terms met your min/max gene criteria!")
        else:
            go_enrichment = pd.read_table("{}_GO.csv".format(args.out), sep=",")

        if go_enrichment.empty:
            # The fold enrichment below divides by the number of terms, so
            # an empty table cannot be plotted
            log("Skipping GO Volcano: no GO terms to plot.")
        elif not path.exists("{}_GO.png".format(args.out)):
            # Convert pvals to log10
            with np.errstate(divide="ignore"):
                # When no bootstraps are more extreme than the term, the minus log pval yields an infinite
                go_enrichment["density_pval"] = -1 * np.log10(
                    go_enrichment["density_pval"]
                )
                go_enrichment["locality_pval"] = -1 * np.log10(
                    go_enrichment["locality_pval"]
                )
                # Fix the infinites so they are plotted: they are placed one
                # unit above the largest finite value
                max_density = np.max(
                    go_enrichment["density_pval"][
                        np.isfinite(go_enrichment["density_pval"])
                    ]
                )
                max_locality = np.max(
                    go_enrichment["locality_pval"][
                        np.isfinite(go_enrichment["locality_pval"])
                    ]
                )
                go_enrichment.loc[
                    np.logical_not(np.isfinite(go_enrichment["density_pval"])),
                    "density_pval",
                ] = (max_density + 1)
                go_enrichment.loc[
                    np.logical_not(np.isfinite(go_enrichment["locality_pval"])),
                    "locality_pval",
                ] = (max_locality + 1)
            plt.clf()
            # Calculate the transparency based on the number of terms
            if len(go_enrichment) > 20:
                transparency_alpha = 0.05
            else:
                transparency_alpha = 1

            # --------------------------------------------------------------------
            # Start Plotting
            figure, axes = plt.subplots(3, 2, figsize=(12, 12))
            # Significance threshold on the -log10 scale. Every threshold
            # line and fold enrichment below uses this value so that the
            # plot agrees with `alpha` for two-tailed runs as well.
            log_alpha = -1 * np.log10(alpha)
            # -----------
            # Density
            # ----------
            axes[0, 0].scatter(
                go_enrichment["density"],
                go_enrichment["density_pval"],
                alpha=transparency_alpha,
                color="blue",
            )
            axes[0, 0].set_xlabel("Empirical Density (Z-Score)")
            axes[0, 0].set_ylabel("Bootstraped -log10(p-value)")
            # Observed number of significant terms over the number expected
            # by chance at this alpha
            fold = sum(np.array(go_enrichment["density_pval"]) > log_alpha) / (
                alpha * len(go_enrichment)
            )
            axes[0, 0].axhline(y=log_alpha, color="red")
            axes[0, 0].text(
                min(axes[0, 0].get_xlim()),
                log_alpha + 0.1,
                "{:.3g} Fold Enrichement".format(fold),
                color="red",
            )
            axes[0, 0].set_title("Density Health")
            # Plot pvalue by term size
            axes[1, 0].scatter(
                go_enrichment["size"],
                go_enrichment["density_pval"],
                alpha=transparency_alpha,
                color="blue",
            )
            axes[1, 0].set_ylabel("Bootstrapped -log10(p-value)")
            axes[1, 0].set_xlabel("Term Size")
            axes[1, 0].axhline(y=log_alpha, color="red")
            # Plot raw density by term size
            axes[2, 0].scatter(
                go_enrichment["size"],
                go_enrichment["density"],
                alpha=transparency_alpha,
                color="blue",
            )
            # Overlay the significant terms in red
            axes[2, 0].scatter(
                go_enrichment.query(f"density_pval>{log_alpha}")["size"],
                go_enrichment.query(f"density_pval>{log_alpha}")["density"],
                alpha=transparency_alpha,
                color="r",
            )
            axes[2, 0].set_ylabel("Density")
            axes[2, 0].set_xlabel("Term Size")
            # ------------
            # Do Locality
            # ------------
            axes[0, 1].scatter(
                go_enrichment["locality"],
                go_enrichment["locality_pval"],
                alpha=transparency_alpha,
                color="blue",
            )
            axes[0, 1].set_xlabel("Empirical Locality (Residual)")
            axes[0, 1].set_ylabel("Bootstraped -log10(p-value)")
            fold = sum(np.array(go_enrichment["locality_pval"]) > log_alpha) / (
                alpha * len(go_enrichment)
            )
            axes[0, 1].axhline(y=log_alpha, color="red")
            axes[0, 1].text(
                min(axes[0, 1].get_xlim()),
                log_alpha,
                "{:.3g} Fold Enrichement".format(fold),
                color="red",
            )
            axes[0, 1].set_title("Locality Health")
            axes[1, 1].scatter(
                go_enrichment["size"],
                go_enrichment["locality_pval"],
                alpha=transparency_alpha,
                color="blue",
            )
            axes[1, 1].set_xlabel("Term Size")
            axes[1, 1].set_ylabel("Bootstrapped -log10(p-value)")
            axes[1, 1].axhline(y=log_alpha, color="red")
            axes[2, 1].scatter(
                go_enrichment["size"],
                go_enrichment["locality"],
                alpha=transparency_alpha,
                color="blue",
            )
            # Overlay the significant terms in red
            axes[2, 1].scatter(
                go_enrichment.query(f"locality_pval>{log_alpha}")["size"],
                go_enrichment.query(f"locality_pval>{log_alpha}")["locality"],
                alpha=transparency_alpha,
                color="r",
            )
            axes[2, 1].set_ylabel("Locality")
            axes[2, 1].set_xlabel("Term Size")
            # Save Figure
            plt.tight_layout()
            try:
                plt.savefig("{}_GO.png".format(args.out))
            except FutureWarning:
                pass
        else:
            log("Skipping GO Volcano.")