def locality(args):
    """Compute per-term network locality and write it as a tab-separated table.

    Parameters
    ----------
    args : argparse-style namespace
        Attributes read here: ``out`` (output path or ``sys.stdout``),
        ``cob``, ``gwas``, ``sig_edge_zscore`` and ``terms``. The whole
        namespace is also forwarded to ``generate_data``.

    Returns
    -------
    None
        Returns early (without computing anything) when the output file
        already exists.
    """
    log = coblog()
    log(
        "\n"
        "-----------------------\n"
        " Network Locality \n"
        "-----------------------\n"
    )
    # Path handling only makes sense for a real file name; sys.stdout is handed
    # to ``to_csv`` untouched (os.path functions raise on a stream object).
    if args.out != sys.stdout:
        # Normalise the name exactly once. The existence check below must use
        # this final path; re-applying the suffix would look for
        # "<name>_Locality_Locality.tsv" and never skip.
        args.out = "{}_Locality.tsv".format(args.out.replace(".tsv", ""))
        out_dir = os.path.dirname(args.out)
        if out_dir != "":
            os.makedirs(out_dir, exist_ok=True)
        if os.path.exists(args.out):
            log("{} exists! Skipping!".format(args.out))
            return None
    # Grab the COB and GWAS objects
    cob = co.COB(args.cob)
    gwas = co.GWAS(args.gwas)
    # If there is a different score for 'significant', update the COB object
    if args.sig_edge_zscore is not None:
        cob.set_sig_edge_zscore(args.sig_edge_zscore)
    # "all" selects every term in the GWAS; otherwise look terms up by name
    if "all" in args.terms:
        terms = gwas.iter_terms()
    else:
        terms = (gwas[x] for x in args.terms)
    # One row per term, as produced by generate_data
    table = pd.DataFrame([generate_data(cob, x, args) for x in terms])
    table.to_csv(args.out, sep="\t", index=None)
def ZmIonome(Zm5bFGS):
    """Return the "ZmIonome" GWAS dataset, building it from the raw CSV if needed.

    When ``cf.test.force.Ontology`` is set, any existing dataset is deleted
    first so it is rebuilt.

    Parameters
    ----------
    Zm5bFGS :
        Passed straight to ``co.GWAS.from_DataFrame`` as its fourth
        positional argument (presumably the reference genome; confirm
        against that method's signature).
    """
    # Delete the old dataset
    if cf.test.force.Ontology:
        tools.del_dataset("GWAS", "ZmIonome", force=True)
    if not tools.available_datasets("GWAS", "ZmIonome"):
        # Path to the raw CSV
        csv = os.path.join(
            cf.options.testdir,
            "raw",
            "GWAS",
            "Ionome",
            "sigGWASsnpsCombinedIterations.longhorn.allLoc.csv.gz",
        )
        # DataFrame.from_csv was removed in pandas 1.0; read_csv is the
        # supported equivalent and infers gzip compression from the extension.
        df = pd.read_csv(csv, index_col=None)
        # Import class from dataframe
        IONS = co.GWAS.from_DataFrame(
            df,
            "ZmIonome",
            "Maize Ionome",
            Zm5bFGS,
            term_col="el",
            chr_col="chr",
            pos_col="pos",
        )
        # Get rid of pesky Cobalt
        IONS.del_term("Co59")
        return IONS
    else:
        return co.GWAS("ZmIonome")
def ZmWallace(Zm5bFGS):
    """Return the "ZmWallace" GWAS dataset, building it from the raw table if needed.

    When ``cf.test.force.Ontology`` is set, any existing dataset is deleted
    first so it is rebuilt.

    Parameters
    ----------
    Zm5bFGS :
        Passed straight to ``co.GWAS.from_DataFrame`` as its fourth
        positional argument (presumably the reference genome; confirm
        against that method's signature).
    """
    if cf.test.force.Ontology:
        tools.del_dataset("GWAS", "ZmWallace", force=True)
    if not tools.available_datasets("GWAS", "ZmWallace"):
        # Path to the raw tab-separated file
        csv = os.path.join(
            cf.options.testdir,
            "raw",
            "GWAS",
            "WallacePLoSGenet",
            "Wallace_etal_2014_PLoSGenet_GWAS_hits-150112.txt.gz",
        )
        # DataFrame.from_csv was removed in pandas 1.0; read_csv is the
        # supported equivalent and infers gzip compression from the extension.
        df = pd.read_csv(csv, index_col=None, sep="\t")
        # Import class from dataframe
        gwas = co.GWAS.from_DataFrame(
            df,
            "ZmWallace",
            "Wallace PLoS ONE Dataset.",
            Zm5bFGS,
            term_col="trait",
            chr_col="chr",
            pos_col="pos",
        )
        return gwas
    else:
        return co.GWAS("ZmWallace")
def SNP2Gene_breakdown(self, COB=None):
    '''
        Provides a breakdown of SNP to gene mapping parameters for
        each term in the Overlap. Includes the number of initial
        Loci, the number of collapsed Loci (within a window) and
        the number of candidate genes (within a window and up to
        a flank limit)

        Parameters
        ----------
        COB : str (default: None)
            If specified, only rows of ``self.results`` whose ``COB``
            column equals this name are summarized. If None, all rows
            are used.

        Returns
        -------
        pandas.DataFrame
            Integer table indexed by Term (plus a 'Total' row) with
            three-level columns (Name, WindowSize, FlankLimit).
    '''
    def bp_to_kb(bp):
        # e.g. 50000 -> "50KB"
        return "{}KB".format(int(bp / 1000))

    def get_level(df, level):
        '''
            Returns the level values by name
        '''
        level_index = df.columns.names.index(level)
        return df.columns.levels[level_index]

    # Prepare the data frame results
    if COB is None:
        results = self.results
    else:
        results = self.results.query('COB=="{}"'.format(COB))
    # NOTE(review): the ontology and reference genome come from the first
    # entries of the unfiltered self.results, even when COB is given.
    ont = co.GWAS(self.results.Ontology.unique()[0])
    ref = co.COB(self.results.COB.unique()[0])._parent_refgen
    # Make an aggregate term holding the loci of every term
    total = co.Term(
        'total',
        loci=set(chain.from_iterable(x.loci for x in ont.terms()))
    )
    column_names = ['Name', 'WindowSize', 'FlankLimit']
    # Calculate number of SNPs
    snps = pd.DataFrame(
        pd.pivot_table(results, index="Term", values='TermLoci')
    )
    snps.columns = pd.MultiIndex.from_product(
        [['GWAS SNPs'], ['-'], ['-']], names=column_names
    )
    # .loc replaces the .ix indexer, which pandas 1.0 removed; assigning to a
    # new row label enlarges the frame just as .ix did.
    snps.loc['Total'] = len(total.loci)
    # Calculate number of Candidate Loci
    loci = pd.pivot_table(
        results, index="Term", columns=['WindowSize'],
        values='TermCollapsedLoci'
    )
    for window_size in loci.columns:
        loci.loc['Total', window_size] = len(
            total.effective_loci(window_size)
        )
    loci.columns = pd.MultiIndex.from_product(
        [['Collapsed Loci'], list(map(bp_to_kb, loci.columns)), ['-']],
        names=column_names
    )
    # Calculate number of Candidate Genes (unique genes per cell)
    genes = pd.pivot_table(
        results, index='Term', columns=['WindowSize', 'FlankLimit'],
        values='gene', aggfunc=lambda x: len(set(x))
    )
    for window_size in get_level(genes, 'WindowSize'):
        for flank_limit in get_level(genes, 'FlankLimit'):
            genes.loc['Total', (window_size, flank_limit)] = len(
                ref.candidate_genes(
                    total.effective_loci(window_size=window_size),
                    flank_limit=flank_limit
                )
            )
    genes.columns = pd.MultiIndex.from_product(
        [
            ['Candidate Genes'],
            list(map(bp_to_kb, get_level(genes, "WindowSize"))),
            get_level(genes, 'FlankLimit')
        ],
        names=column_names
    )
    results = snps.join(loci).join(genes)
    return results.astype(int)
def term_network():
    """Build the candidate-gene network for one GWAS term and return it as JSON.

    Reads 'network', 'ontology', 'term', 'nodeCutoff', 'edgeCutoff',
    'windowSize' and 'flankLimit' from the submitted form; the numeric
    options are passed through ``safeOpts``. The response holds the node
    list and the edges among the nodes marked for rendering.
    """
    form = request.form
    cob = networks[str(form['network'])]
    ontology = str(form['ontology'])
    term = str(form['term'])
    nodeCutoff = safeOpts('nodeCutoff', int(form['nodeCutoff']))
    edgeCutoff = safeOpts('edgeCutoff', float(form['edgeCutoff']))
    windowSize = safeOpts('windowSize', int(form['windowSize']))
    flankLimit = safeOpts('flankLimit', int(form['flankLimit']))

    # Apply the requested edge significance cutoff before collecting genes
    cob.set_sig_edge_zscore(edgeCutoff)
    effective_loci = co.GWAS(ontology)[term].effective_loci(
        window_size=windowSize)
    genes = cob.refgen.candidate_genes(
        effective_loci,
        flank_limit=flankLimit,
        chain=True,
        include_parent_locus=True,
        include_num_intervening=True,
        include_rank_intervening=True,
        include_num_siblings=True)

    # GWAS results are only passed along when this ontology has stored data
    node_options = {'nodeCutoff': nodeCutoff}
    if ontology in gwas_data_db:
        node_options['gwas_data'] = gwas_data_db[ontology].get_data(
            cob=cob.name, term=term,
            windowSize=windowSize, flankLimit=flankLimit)
    net = {'nodes': getNodes(genes, cob, term, **node_options)}

    # Edges are fetched only for the nodes flagged to be rendered
    render_list = [
        node['data']['id']
        for node in net['nodes']
        if node['data']['render'] == 'x'
    ]
    net['edges'] = getEdges(render_list, cob)

    # Log Data Point to COB Log
    cob.log(term + ': Found ' + str(len(net['nodes'])) + ' nodes, '
            + str(len(net['edges'])) + ' edges')

    return jsonify(net)
def list_command(args):
    """Print available datasets, or the term ids of one GWAS/GOnt dataset.

    Behaviour depends on which of ``args.type`` / ``args.name`` are set:

    * both set and ``args.terms`` truthy: print one term id per line for a
      'GWAS' or 'GOnt' dataset (other types print nothing);
    * both set, ``args.terms`` falsy: print the matching dataset listing;
    * otherwise: missing values are replaced by the wildcard '%' on
      ``args`` and the listing is printed via ``to_string()``.
    """
    if args.type is not None and args.name is not None:
        if args.terms:
            # Dataset classes that expose iter_terms()
            term_sources = {'GWAS': co.GWAS, 'GOnt': co.GOnt}
            if args.type in term_sources:
                dataset = term_sources[args.type](args.name)
                print('\n'.join([x.id for x in dataset.iter_terms()]))
        else:
            print(co.available_datasets(args.type, args.name))
    else:
        # Without a type, any supplied name is ignored and everything is
        # listed; with a type but no name, all datasets of that type are.
        if args.type is None:
            args.type = '%'
        args.name = '%'
        print(co.available_datasets(args.type, args.name).to_string())
def list_command(args):
    """Print available datasets, or the term ids of one GWAS/GOnt dataset.

    Behaviour depends on which of ``args.type`` / ``args.name`` are set:

    * both set: if ``args.terms`` is truthy, print one term id per line
      for a 'GWAS' or 'GOnt' dataset; then print either the
      space-joined dataset names (``args.names`` truthy) or the listing;
    * otherwise: missing values are replaced by the wildcard "%" on
      ``args`` and the listing is printed via ``to_string()``.
    """
    if args.type is not None and args.name is not None:
        if args.terms:
            # Dataset classes that expose iter_terms()
            term_sources = {"GWAS": co.GWAS, "GOnt": co.GOnt}
            if args.type in term_sources:
                dataset = term_sources[args.type](args.name)
                print("\n".join([x.id for x in dataset.iter_terms()]))
        # NOTE(review): assumed to be a sibling of the `args.terms` check
        # rather than nested inside it; confirm against the original layout.
        if args.names:
            print(" ".join(available_datasets(args.type, args.name).Name))
        else:
            print(available_datasets(args.type, args.name))
    else:
        # Without a type, any supplied name is ignored and everything is
        # listed; with a type but no name, all datasets of that type are.
        if args.type is None:
            args.type = "%"
        args.name = "%"
        print(available_datasets(args.type, args.name).to_string())
def ZmWallace(Zm5bFGS):
    '''Return the 'ZmWallace' GWAS dataset, building it from the raw table if needed.

    When ``cf.test.force.Ontology`` is set, any existing dataset is deleted
    first so it is rebuilt.

    Parameters
    ----------
    Zm5bFGS :
        Passed straight to ``co.GWAS.from_DataFrame`` as its fourth
        positional argument (presumably the reference genome; confirm
        against that method's signature).
    '''
    if cf.test.force.Ontology:
        tools.del_dataset('GWAS', 'ZmWallace', force=True)
    if not tools.available_datasets('GWAS', 'ZmWallace'):
        # Path to the raw tab-separated file
        csv = os.path.join(
            cf.options.testdir,
            'raw', 'GWAS', 'WallacePLoSGenet',
            'Wallace_etal_2014_PLoSGenet_GWAS_hits-150112.txt.bz2')
        # DataFrame.from_csv was removed in pandas 1.0; read_csv is the
        # supported equivalent and infers bz2 compression from the extension.
        df = pd.read_csv(csv, index_col=None, sep='\t')
        # Import class from dataframe
        gwas = co.GWAS.from_DataFrame(df,
                                      'ZmWallace',
                                      'Wallace PLoS ONE Dataset.',
                                      Zm5bFGS,
                                      term_col='trait',
                                      chr_col='chr',
                                      pos_col='pos')
        return gwas
    else:
        return co.GWAS('ZmWallace')
def _plot_gene_bars(ax, genes, locus, voffset, hoffset, label, color):
    """Draw one horizontal bar per gene on ``ax`` at vertical position ``voffset``."""
    for gene in genes:
        # y and width are positional: the old ``bottom=`` keyword is rejected
        # by matplotlib >= 3, while positional arguments work on every version.
        ax.barh(voffset,
                len(gene),
                height=5,
                zorder=1,
                left=hoffset + gene.start - locus.start + locus.window,
                label=label,
                color=color)


def plot_gwas(args):
    """Save one PNG per GWAS term showing its loci and nearby genes.

    Each chromosome with loci gets its own subplot. For every (collapsed)
    locus the plot shows all reference genes (grey), genes from the COB's
    reference genome (green), candidate genes within
    ``args.candidate_flank_limit`` (red), the window boundaries and the
    original sub-loci.

    Parameters
    ----------
    args : argparse-style namespace
        Attributes read here: ``cob``, ``gwas``, ``terms``, ``snp2gene``
        ('effective' or 'strongest'), ``candidate_window_size``,
        ``candidate_flank_limit``, ``strongest_attr``,
        ``strongest_higher`` and ``out``.

    Raises
    ------
    ValueError
        If ``args.snp2gene`` is neither 'effective' nor 'strongest'.
    """
    # snag the appropriate COB
    cob = co.COB(args.cob)
    # snag the GWAS object
    gwas = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = gwas.iter_terms()
    else:
        terms = [gwas[term] for term in args.terms]

    # Make a plot for each Term
    for term in terms:
        loci = list(term.loci)
        # create a dictionary of Loci which we can refer to using ids
        locus_lookup = {x.id: x for x in loci}
        # Each chromosome gets a plot
        chroms = {x.chrom for x in loci}

        # squeeze=False always yields a 2-D array of axes, so indexing below
        # also works when there is only a single chromosome (the default
        # returns a bare Axes object in that case).
        fig, axes = plt.subplots(len(chroms),
                                 figsize=(15, 4 * len(chroms)),
                                 squeeze=False)
        axes = axes[:, 0]
        plt.title('{} Term'.format(term.id))

        # Pull out the snp to gene mappings
        if args.snp2gene == 'effective':
            loci = sorted(
                term.effective_loci(window_size=args.candidate_window_size))
        elif args.snp2gene == 'strongest':
            # NOTE(review): ``strongest_higher`` is passed as ``lowest``;
            # confirm the polarity against strongest_loci's contract.
            loci = term.strongest_loci(window_size=args.candidate_window_size,
                                       attr=args.strongest_attr,
                                       lowest=args.strongest_higher)
        else:
            raise ValueError('{} not valid snp2gene mapping'.format(
                args.snp2gene))

        # iterate over Loci
        seen_chroms = set()
        voffset = 1  # Vertical Offset
        current_axis = 0
        y_labels = []
        y_ticks = []
        for locus in loci:
            hoffset = -1 * locus.window
            # First locus of a new chromosome: move to the next subplot
            if locus.chrom not in seen_chroms:
                seen_chroms.add(locus.chrom)
                current_axis = len(seen_chroms) - 1
                voffset = 1
                if len(y_labels) > 0 and current_axis > 0:
                    # Flush the collected ticks onto the previous subplot
                    axes[current_axis - 1].set_yticks(y_ticks)
                    axes[current_axis - 1].set_yticklabels(y_labels)
                y_labels = []
                y_ticks = []
            # Get current axis
            cax = axes[current_axis]
            # Set up labels if first time on this axis
            if voffset == 1:
                cax.set_ylabel('Chrom: ' + locus.chrom)
                cax.set_xlabel('Loci')
            # (Axes.hold was removed in matplotlib 3; artists accumulate on
            # an axes by default, so no call is needed.)

            # Plot ALL Genes
            _plot_gene_bars(
                cax, gwas.refgen.candidate_genes(locus, flank_limit=10e10),
                locus, voffset, hoffset, 'RefGen Genes', 'grey')
            # Plot the genes present in the COB's reference genome
            _plot_gene_bars(
                cax, cob.refgen.candidate_genes(locus, flank_limit=10e10),
                locus, voffset, hoffset, 'Gene Passed QC', 'green')
            # Plot the candidate genes
            _plot_gene_bars(
                cax,
                cob.refgen.candidate_genes(
                    locus, flank_limit=args.candidate_flank_limit),
                locus, voffset, hoffset, 'Candidate Gene', 'red')

            # Plot the Effective Locus
            cax.scatter(  # Upstream
                hoffset, voffset, marker='>', zorder=2)
            cax.scatter(  # Start
                hoffset + locus.window,
                voffset,
                marker='.',
                color='blue',
                zorder=2)
            cax.scatter(  # Stop
                hoffset + locus.window + len(locus),
                voffset,
                marker='.',
                color='blue',
                zorder=2)
            cax.scatter(  # Downstream
                hoffset + locus.window + len(locus) + locus.window,
                voffset,
                marker='<',
                zorder=2)
            # Plot the Sub Loci that belong to this term
            for sub_id in locus.sub_loci:
                if sub_id in locus_lookup:
                    sub_locus = locus_lookup[sub_id]
                    cax.scatter(hoffset + locus.window +
                                abs(sub_locus.start - locus.start),
                                voffset,
                                zorder=2,
                                marker='.',
                                label='SNP',
                                color='blue')

            # place a block for interlocal distance
            y_labels.append(commify(locus.start))
            y_ticks.append(voffset)
            voffset += 10
        # Have to finish off the ticks on the last chromosome
        axes[current_axis].set_yticks(y_ticks)
        axes[current_axis].set_yticklabels(y_labels)
        # Save Plot
        plt.savefig(args.out.replace('.png', '_{}.png'.format(term.id)))
        plt.close()
def from_CLI(cls, args):
    """
    Implements an interface for the CLI to perform overlap Analysis

    The term source is chosen from the first of ``args.genes``,
    ``args.go``, ``args.gwas`` and ``args.ontology`` that is set. For
    each term, the overlap is calculated, compared against bootstraps
    (z-scores, FDR, p-value) and collected. Unless ``args.dry_run`` is
    set, the combined table is written to ``args.out`` (tab separated)
    and appended to the "overlap" SQLite table.

    Returns None. Also returns early when ``generate_output_name``
    raises ValueError, or when no term produced any result.

    Raises
    ------
    ValueError
        If none of the source arguments is provided.
    """
    if args.genes != [None]:
        source = "genes"
    elif args.go is not None:
        source = "go"
    elif args.gwas is not None:
        source = "gwas"
    elif args.ontology is not None:
        source = 'ontology'
    else:
        # Fail here: `source` is used immediately below, so deferring this
        # check would surface as an unbound-variable error instead.
        raise ValueError(
            "Please provide a valid overlap source (--genes, --go or --gwas)"
        )
    self = cls.create(source+'_CLI', description="CLI Overlap")
    self.source = source
    self.args = args
    # Build base camoco objects
    self.cob = co.COB(args.cob)
    # Generate the ontology of terms that we are going to look
    # at the overlap of
    if source == "genes":
        # Gene lists may be separated by commas, semicolons or spaces
        import re

        args.genes = list(chain(*[re.split("[,; ]", x) for x in args.genes]))
        # Placeholder object that only carries a name for the gene list
        self.ont = pd.DataFrame()
        self.ont.name = "GeneList"
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    elif source == "go":
        self.ont = co.GOnt(args.go)
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    elif source == "gwas":
        self.ont = co.GWAS(args.gwas)
    elif source == 'ontology':
        self.ont = co.Ontology(args.ontology)

    try:
        self.generate_output_name()
    except ValueError:
        # NOTE(review): presumably raised when the output should not be
        # (re)generated; confirm against generate_output_name.
        return

    # Save strongest description arguments if applicable
    if "strongest" in self.args.snp2gene:
        if not (self.ont._global("strongest_attr") == args.strongest_attr):
            self.ont.set_strongest(attr=args.strongest_attr)
        if not (
            bool(int(self.ont._global("strongest_higher")))
            == bool(args.strongest_higher)
        ):
            self.ont.set_strongest(higher=args.strongest_higher)

    # Generate a terms iterable
    if self.source == "genes":
        # make a single term
        loci = self.cob.refgen.from_ids(self.args.genes)
        if len(loci) < len(self.args.genes):
            self.cob.log("Some input genes not in network")
        terms = [Term("CustomTerm", desc="Custom from CLI", loci=loci)]
    else:
        # Generate terms from the ontology
        if "all" in self.args.terms:
            terms = list(self.ont.iter_terms())
        else:
            terms = [self.ont[term] for term in self.args.terms]

    results = []
    num_total_terms = len(terms)
    # Iterate through terms and calculate
    for i, term in enumerate(terms):
        self.cob.log(
            " ---------- Calculating overlap for {} of {} Terms", i, num_total_terms
        )
        if term.id in self.args.skip_terms:
            self.cob.log("Skipping {} since it was in --skip-terms", term.id)
            # Actually skip the term; logging alone let it be processed.
            continue
        self.cob.log("Generating SNP-to-gene mapping")
        # If appropriate, generate SNP2Gene Loci
        if self.args.candidate_flank_limit > 0:
            loci = self.snp2gene(term, self.ont)
        else:
            loci = list(term.loci)
            for x in loci:
                x.window = 1
        # Filter out terms with insufficient or too many genes
        if len(loci) < 2 or len(loci) < args.min_term_size:
            self.cob.log("Not enough genes to perform overlap")
            continue
        if args.max_term_size is not None and len(loci) > args.max_term_size:
            self.cob.log("Too many genes to perform overlap")
            continue
        # Send some output to the terminal
        self.cob.log(
            "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
            term.id,
            self.ont.name,
            self.cob.name,
            self.args.candidate_window_size,
            self.args.candidate_flank_limit,
            len(loci),
        )
        if args.dry_run:
            continue
        # Terms whose overlap cannot be computed are skipped
        try:
            overlap = self.overlap(loci)
        except DataError:
            continue
        self.cob.log("Generating bootstraps")
        bootstraps = self.generate_bootstraps(loci, overlap)
        # Mean over iterations of the per-iteration mean / std of the score
        bs_mean = bootstraps.groupby("iter").score.apply(np.mean).mean()
        bs_std = bootstraps.groupby("iter").score.apply(np.std).mean()
        # Calculate z scores for density
        self.cob.log("Calculating Z-Scores")
        if bs_std != 0:
            overlap["zscore"] = (overlap.score - bs_mean) / bs_std
            bootstraps["zscore"] = (bootstraps.score - bs_mean) / bs_std
        else:
            # If there is no variation, make all Z-scores 0
            overlap["zscore"] = bootstraps["zscore"] = 0
        # Calculate FDR
        self.cob.log("Calculating FDR")
        overlap["fdr"] = np.nan
        max_zscore = int(overlap.zscore.max()) + 1
        # Walk z-score cutoffs upward; rows passing a higher cutoff are
        # overwritten with that cutoff's statistics.
        for zscore in np.arange(0, max_zscore, 0.25):
            num_random = (
                bootstraps.groupby("iter")
                .apply(lambda df: sum(df.zscore >= zscore))
                .mean()
            )
            passing = overlap.zscore >= zscore
            num_real = sum(passing)
            # Calculate FDR
            if num_real != 0 and num_random != 0:
                fdr = num_random / num_real
            elif num_real != 0 and num_random == 0:
                fdr = 0
            else:
                fdr = 1
            overlap.loc[passing, "fdr"] = fdr
            overlap.loc[passing, "num_real"] = num_real
            overlap.loc[passing, "num_random"] = num_random
            overlap.loc[passing, "bs_mean"] = bs_mean
            overlap.loc[passing, "bs_std"] = bs_std
        overlap.sort_values(by=["zscore"], ascending=False, inplace=True)
        # Fraction of bootstrap iterations whose mean score is at least the
        # observed mean score
        num_bootstraps = len(bootstraps.iter.unique())
        overlap_pval = (
            sum(
                bootstraps.groupby("iter").apply(lambda x: x.score.mean())
                >= overlap.score.mean()
            )
        ) / num_bootstraps
        # Annotate the rows; they are concatenated across terms below
        overlap["COB"] = self.cob.name
        overlap["Ontology"] = self.ont.name
        overlap["Term"] = term.id
        overlap["WindowSize"] = self.args.candidate_window_size
        overlap["FlankLimit"] = self.args.candidate_flank_limit
        overlap["TermLoci"] = len(term.loci)
        overlap["TermCollapsedLoci"] = len(loci)
        overlap["TermPValue"] = overlap_pval
        overlap["NumBootstraps"] = num_bootstraps
        overlap["Method"] = self.args.method
        overlap["SNP2Gene"] = self.args.snp2gene
        results.append(overlap.reset_index())
        # Summarize results
        if self.args.method == "density":
            overlap_score = np.nanmean(overlap.score) / (
                1 / np.sqrt(overlap.num_trans_edges.mean())
            )
        elif self.args.method == "locality":
            overlap_score = np.nanmean(overlap.score)
        self.cob.log(
            "Overlap Score ({}): {} (p<{})".format(
                self.args.method, overlap_score, overlap_pval
            )
        )
    if not args.dry_run:
        if not results:
            # pd.concat raises on an empty list; there is nothing to save
            self.cob.log("No terms produced overlap results, nothing to write")
            return
        # Consolidate results and output to files
        self.results = pd.concat(results)
        self.results.to_csv(self.args.out, sep="\t", index=None)
        # Make an actual results object if not exists
        overlap_object = cls.create(self.ont)
        # Save the results to the SQLite table, closing the connection after
        connection = sqlite3.connect(overlap_object.db.filename)
        try:
            self.results.to_sql(
                "overlap",
                connection,
                if_exists="append",
                index=False,
            )
        finally:
            connection.close()
# Collect display info for every loaded network, plus the reference link
# configured for its parent reference genome (when there is one).
refLinks = {}
for name, net in networks.items():
    info = {
        'name': net.name,
        'refgen': net._global('parent_refgen'),
        'desc': net.description,
    }
    network_info.append(info)
    if net._global('parent_refgen') in conf['refLinks']:
        refLinks[net.name] = conf['refLinks'][net._global('parent_refgen')]
print('Availible Networks: ' + str(networks))

# Generate ontology list based on allowed list and load them into memory
print('Preloading GWASes into Memory...')
if len(conf['gwas']) < 1:
    # No explicit list configured: fall back to every available GWAS
    conf['gwas'] = list(co.Tools.available_datasets('GWAS')['Name'].values)
onts = {gwas_name: co.GWAS(gwas_name) for gwas_name in conf['gwas']}

# For each network, list the GWASes built on the same reference genome
onts_info = {}
for m, net in networks.items():
    ref = net._global('parent_refgen')
    onts_info[net.name] = []
    matching = onts_info[net.name]
    for n, ont in onts.items():
        if ont.refgen.name != ref:
            continue
        matching.append({
            'name': ont.name,
            'refgen': ont.refgen.name,
            'desc': ont.description
        })
print('Availible GWASes: ' + str(onts_info))

# Prefetch the gene names for all the networks
print('Fetching gene names for networks...')
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        For every term, window size and flank limit, candidate genes are
        collected into one table, optionally joined with gene info files,
        written to ``args.out`` (tab separated) and summarized on stdout.

        Parameters
        ----------
        args : argparse-style namespace
            Attributes read here: ``out``, ``force``, ``refgen``,
            ``gwas``, ``terms``, ``candidate_window_size`` and
            ``candidate_flank_limit`` (both iterables), ``snp2gene``,
            ``strongest_attr``, ``strongest_higher``,
            ``include_parent_attrs`` and ``gene_info``.

        Returns
        -------
        None
            Returns early when the output exists and ``args.force`` is
            not set.

        Raises
        ------
        ValueError
            If ``args.refgen`` names neither an 'Expr' nor a 'RefGen'
            dataset, or ``args.snp2gene`` contains neither 'effective'
            nor 'strongest'.
    '''
    if args.out != sys.stdout:
        # Create any non-existent directories
        out_dir = os.path.dirname(args.out)
        if out_dir != '':
            os.makedirs(out_dir, exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(
                    args.out
                ), file=sys.stderr
            )
            return None
    # Name of the COB the refgen came from, or False for a plain RefGen
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.Tools.available_datasets('Expr', args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen
    elif co.Tools.available_datasets('RefGen', args.refgen):
        refgen = co.RefGen(args.refgen)
    else:
        # Without this, `refgen` would be unbound further down
        raise ValueError(
            '{} is not an available Expr or RefGen dataset'.format(
                args.refgen
            )
        )
    # Create the GWAS object
    ont = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    # Collect the per-combination tables and concatenate once at the end
    # rather than re-copying the growing table on every iteration.
    frames = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if 'effective' in args.snp2gene:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                elif 'strongest' in args.snp2gene:
                    # NOTE(review): ``strongest_higher`` is passed as
                    # ``lowest``; confirm the polarity is intended.
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                else:
                    raise ValueError(
                        '{} not valid snp2gene mapping'.format(args.snp2gene)
                    )
                genes = pd.DataFrame([
                    x.as_dict() for x in
                    refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term': term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob is not False:
                    genes['COB'] = from_cob
                frames.append(genes)
    data = pd.concat(frames) if frames else pd.DataFrame()

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in (args.gene_info or []):
        log('Adding info for {}', info_file)
        # Assume the file is tab separated; fall back to commas when that
        # yields a single column
        info = pd.read_table(info_file, sep='\t')
        if len(info.columns) == 1:
            info = pd.read_table(info_file, sep=',')
        # The merge joins on every column the two tables share
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}", ','.join(matching_columns))
        data = pd.merge(data, info, how='left')
        if len(data) != original_number_genes:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )

    # Generate the output file
    data.to_csv(args.out, index=None, sep='\t')

    log("Summary stats")
    print('-'*100)
    print("Mapped {} SNPs to {} genes".format(len(data.parent_locus.unique()), len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))
def from_CLI(cls, args):
    '''
        Implements an interface for the CLI to perform overlap Analysis

        The term source is chosen from the first of ``args.genes``,
        ``args.go`` and ``args.gwas`` that is set. For each term, the
        overlap is calculated, compared against bootstraps (z-scores,
        FDR, p-value) and collected. Unless ``args.dry_run`` is set, the
        combined table is written to ``args.out`` (tab separated) and
        appended to the 'overlap' SQLite table.

        Returns None. Also returns early when ``generate_output_name``
        raises ValueError, or when no term produced any result.

        Raises
        ------
        ValueError
            If none of the source arguments is provided.
    '''
    if args.genes != [None]:
        source = 'genes'
    elif args.go is not None:
        source = 'go'
    elif args.gwas is not None:
        source = 'gwas'
    else:
        # Fail here: `source` is used immediately below, so deferring this
        # check would surface as an unbound-variable error instead.
        raise ValueError(
            'Please provide a valid overlap source (--genes, --go or --gwas)'
        )
    self = cls.create(source, description='CLI Overlap')
    self.source = source
    self.args = args
    # Build base camoco objects
    self.cob = co.COB(args.cob)
    # Generate the ontology of terms that we are going to look
    # at the overlap of
    if source == 'genes':
        # Gene lists may be separated by commas, semicolons or spaces
        import re
        args.genes = list(
            chain(*[re.split('[,; ]', x) for x in args.genes]))
        # Placeholder object that only carries a name for the gene list
        self.ont = pd.DataFrame()
        self.ont.name = 'GeneList'
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    elif source == 'go':
        self.ont = co.GOnt(args.go)
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    elif source == 'gwas':
        self.ont = co.GWAS(args.gwas)

    try:
        self.generate_output_name()
    except ValueError:
        # NOTE(review): presumably raised when the output should not be
        # (re)generated; confirm against generate_output_name.
        return

    # Save strongest description arguments if applicable
    if 'strongest' in self.args.snp2gene:
        if not (self.ont._global('strongest_attr') == args.strongest_attr):
            self.ont.set_strongest(attr=args.strongest_attr)
        if not (bool(int(self.ont._global('strongest_higher'))) == bool(
                args.strongest_higher)):
            self.ont.set_strongest(higher=args.strongest_higher)

    # Generate a terms iterable
    if self.source == 'genes':
        # make a single term
        loci = self.cob.refgen.from_ids(self.args.genes)
        if len(loci) < len(self.args.genes):
            self.cob.log('Some input genes not in network')
        terms = [Term('CustomTerm', desc='Custom from CLI', loci=loci)]
    else:
        # Generate terms from the ontology
        if 'all' in self.args.terms:
            terms = list(self.ont.iter_terms())
        else:
            terms = [self.ont[term] for term in self.args.terms]

    results = []
    num_total_terms = len(terms)
    # Iterate through terms and calculate
    for i, term in enumerate(terms):
        self.cob.log(' ---------- Calculating overlap for {} of {} Terms',
                     i, num_total_terms)
        if term.id in self.args.skip_terms:
            self.cob.log('Skipping {} since it was in --skip-terms',
                         term.id)
            # Actually skip the term; logging alone let it be processed.
            continue
        self.cob.log('Generating SNP-to-gene mapping')
        # If appropriate, generate SNP2Gene Loci
        if self.args.candidate_flank_limit > 0:
            loci = self.snp2gene(term, self.ont)
        else:
            loci = list(term.loci)
            for x in loci:
                x.window = 1
        # Filter out terms with insufficient or too many genes
        if len(loci) < 2 or len(loci) < args.min_term_size:
            self.cob.log('Not enough genes to perform overlap')
            continue
        if args.max_term_size is not None and len(loci) > args.max_term_size:
            self.cob.log('Too many genes to perform overlap')
            continue
        # Send some output to the terminal
        self.cob.log(
            "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
            term.id, self.ont.name, self.cob.name,
            self.args.candidate_window_size,
            self.args.candidate_flank_limit, len(loci))
        if args.dry_run:
            continue
        # Terms whose overlap cannot be computed are skipped
        try:
            overlap = self.overlap(loci)
        except DataError:
            continue
        self.cob.log('Generating bootstraps')
        bootstraps = self.generate_bootstraps(loci, overlap)
        # Mean over iterations of the per-iteration mean / std of the score
        bs_mean = bootstraps.groupby('iter').score.apply(np.mean).mean()
        bs_std = bootstraps.groupby('iter').score.apply(np.std).mean()
        # Calculate z scores for density
        self.cob.log('Calculating Z-Scores')
        if bs_std != 0:
            overlap['zscore'] = (overlap.score - bs_mean) / bs_std
            bootstraps['zscore'] = (bootstraps.score - bs_mean) / bs_std
        else:
            # If there is no variation, make all Z-scores 0
            overlap['zscore'] = bootstraps['zscore'] = 0
        # Calculate FDR
        self.cob.log('Calculating FDR')
        overlap['fdr'] = np.nan
        max_zscore = int(overlap.zscore.max()) + 1
        # Walk z-score cutoffs upward; rows passing a higher cutoff are
        # overwritten with that cutoff's statistics.
        for zscore in np.arange(0, max_zscore, 0.25):
            num_random = bootstraps\
                .groupby('iter')\
                .apply(lambda df: sum(df.zscore >= zscore))\
                .mean()
            passing = overlap.zscore >= zscore
            num_real = sum(passing)
            # Calculate FDR
            if num_real != 0 and num_random != 0:
                fdr = num_random / num_real
            elif num_real != 0 and num_random == 0:
                fdr = 0
            else:
                fdr = 1
            overlap.loc[passing, 'fdr'] = fdr
            overlap.loc[passing, 'num_real'] = num_real
            overlap.loc[passing, 'num_random'] = num_random
            overlap.loc[passing, 'bs_mean'] = bs_mean
            overlap.loc[passing, 'bs_std'] = bs_std
        overlap.sort_values(by=['zscore'], ascending=False, inplace=True)
        # Fraction of bootstrap iterations whose mean score is at least the
        # observed mean score
        num_bootstraps = len(bootstraps.iter.unique())
        overlap_pval = (
            sum(bootstraps.groupby('iter').apply(lambda x: x.score.mean())
                >= overlap.score.mean())
            / num_bootstraps
        )
        # Annotate the rows; they are concatenated across terms below
        overlap['COB'] = self.cob.name
        overlap['Ontology'] = self.ont.name
        overlap['Term'] = term.id
        overlap['WindowSize'] = self.args.candidate_window_size
        overlap['FlankLimit'] = self.args.candidate_flank_limit
        overlap['TermLoci'] = len(term.loci)
        overlap['TermCollapsedLoci'] = len(loci)
        overlap['TermPValue'] = overlap_pval
        overlap['NumBootstraps'] = num_bootstraps
        overlap['Method'] = self.args.method
        overlap['SNP2Gene'] = self.args.snp2gene
        results.append(overlap.reset_index())
        # Summarize results
        if self.args.method == 'density':
            overlap_score = np.nanmean(overlap.score) / (
                1 / np.sqrt(overlap.num_trans_edges.mean()))
        elif self.args.method == 'locality':
            overlap_score = np.nanmean(overlap.score)
        self.cob.log('Overlap Score ({}): {} (p<{})'.format(
            self.args.method, overlap_score, overlap_pval))
    if not args.dry_run:
        if not results:
            # pd.concat raises on an empty list; there is nothing to save
            self.cob.log('No terms produced overlap results, nothing to write')
            return
        # Consolidate results and output to files
        self.results = pd.concat(results)
        self.results.to_csv(self.args.out, sep='\t', index=None)
        # Make an actual results object if not exists
        overlap_object = cls.create(self.ont)
        # Save the results to the SQLite table, closing the connection after
        connection = sqlite3.connect(overlap_object.db.filename)
        try:
            self.results.to_sql('overlap',
                                connection,
                                if_exists='append',
                                index=False)
        finally:
            connection.close()
network_info.append( { "name": net.name, "refgen": net._global("parent_refgen"), "desc": net.description, } ) if net._global("parent_refgen") in conf["refLinks"]: refLinks[net.name] = conf["refLinks"][net._global("parent_refgen")] print("Availible Networks: " + str(networks)) # Generate ontology list based on allowed list and load them into memory print("Preloading GWASes into Memory...") if len(conf["gwas"]) < 1: conf["gwas"] = list(co.Tools.available_datasets("GWAS")["Name"].values) onts = {x: co.GWAS(x) for x in conf["gwas"]} onts_info = {} for m, net in networks.items(): ref = net._global("parent_refgen") onts_info[net.name] = [] for n, ont in onts.items(): if ont.refgen.name == ref: onts_info[net.name].append( {"name": ont.name, "refgen": ont.refgen.name, "desc": ont.description} ) print("Availible GWASes: " + str(onts_info)) # Prefetch the gene names for all the networks print("Fetching gene names for networks...") network_genes = {} for name, net in networks.items():
def from_CLI(cls, args):
    '''
        Implements an interface for the CLI to perform overlap Analysis

        For each requested term, the overlap is calculated, compared
        against bootstraps (z-scores, FDR, p-value) and collected. Unless
        ``args.dry_run`` is set, the combined table is written to
        ``args.out`` (tab separated) and appended to the 'overlap'
        SQLite table.

        Returns None; also returns early when no term produced a result.
    '''
    self = cls.create(args.gwas, description='CLI Overlap')
    # Build base camoco objects
    self.args = args
    self.cob = co.COB(args.cob)
    if args.go:
        # NOTE(review): the GO ontology is loaded by the name in args.gwas
        self.ont = co.GOnt(args.gwas)
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    else:
        self.ont = co.GWAS(args.gwas)
    self.generate_output_name()

    # Generate a terms iterable
    if 'all' in self.args.terms:
        terms = self.ont.iter_terms()
    else:
        terms = [self.ont[term] for term in self.args.terms]
    results = []
    # Iterate through terms and calculate
    for term in terms:
        if term.id in self.args.skip_terms:
            self.cob.log('Skipping {} since it was in --skip-terms',
                         term.id)
            # Actually skip the term; logging alone let it be processed.
            continue
        # Generate SNP2Gene Loci
        loci = self.snp2gene(term)
        if len(loci) < 2 or len(loci) < args.min_term_size:
            self.cob.log('Not enough genes to perform overlap')
            continue
        if args.max_term_size is not None and len(loci) > args.max_term_size:
            self.cob.log('Too many genes to perform overlap')
            continue
        # Send some output to the terminal
        self.cob.log(
            "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
            term.id, self.ont.name, self.cob.name,
            self.args.candidate_window_size,
            self.args.candidate_flank_limit, len(loci))
        if args.dry_run:
            continue
        # Terms whose overlap cannot be computed are skipped
        try:
            overlap = self.overlap(loci)
        except DataError:
            continue
        bootstraps = self.generate_bootstraps(loci, overlap)
        # Mean over iterations of the per-iteration mean / std of the score
        bs_mean = bootstraps.groupby('iter').score.apply(np.mean).mean()
        bs_std = bootstraps.groupby('iter').score.apply(np.std).mean()
        # Calculate z scores for density
        if bs_std != 0:
            overlap['zscore'] = (overlap.score - bs_mean) / bs_std
            bootstraps['zscore'] = (bootstraps.score - bs_mean) / bs_std
        else:
            # If there is no variation, make all Z-scores 0
            overlap['zscore'] = bootstraps['zscore'] = 0
        # Calculate FDR
        overlap['fdr'] = np.nan
        max_zscore = int(overlap.zscore.max()) + 1
        # Walk z-score cutoffs upward; rows passing a higher cutoff are
        # overwritten with that cutoff's statistics.
        for zscore in np.arange(0, max_zscore, 0.25):
            num_random = bootstraps\
                .groupby('iter')\
                .apply(lambda df: sum(df.zscore >= zscore))\
                .mean()
            passing = overlap.zscore >= zscore
            num_real = sum(passing)
            # Calculate FDR
            if num_real != 0 and num_random != 0:
                fdr = num_random / num_real
            elif num_real != 0 and num_random == 0:
                fdr = 0
            else:
                fdr = 1
            overlap.loc[passing, 'fdr'] = fdr
            overlap.loc[passing, 'num_real'] = num_real
            overlap.loc[passing, 'num_random'] = num_random
            overlap.loc[passing, 'bs_mean'] = bs_mean
            overlap.loc[passing, 'bs_std'] = bs_std
        overlap.sort_values(by=['zscore'], ascending=False, inplace=True)
        # Fraction of bootstrap iterations whose mean score is at least the
        # observed mean score
        num_bootstraps = len(bootstraps.iter.unique())
        overlap_pval = (
            sum(bootstraps.groupby('iter').apply(lambda x: x.score.mean())
                >= overlap.score.mean())
            / num_bootstraps
        )
        # Annotate the rows; they are concatenated across terms below
        overlap['COB'] = self.cob.name
        overlap['Ontology'] = self.ont.name
        overlap['Term'] = term.id
        overlap['WindowSize'] = self.args.candidate_window_size
        overlap['FlankLimit'] = self.args.candidate_flank_limit
        overlap['TermLoci'] = len(term.loci)
        overlap['TermCollapsedLoci'] = len(loci)
        overlap['TermPValue'] = overlap_pval
        overlap['NumBootstraps'] = num_bootstraps
        overlap['Method'] = self.args.method
        overlap['SNP2Gene'] = self.args.snp2gene
        results.append(overlap.reset_index())
    if not args.dry_run:
        if not results:
            # pd.concat raises on an empty list; there is nothing to save
            self.cob.log('No terms produced overlap results, nothing to write')
            return
        self.results = pd.concat(results)
        self.results.to_csv(self.args.out, sep='\t', index=None)
        overlap_object = cls.create(self.ont)
        overlap_object.results = results
        # Save the results to the SQLite table, closing the connection after
        connection = sqlite3.connect(overlap_object.db.filename)
        try:
            self.results.to_sql('overlap',
                                connection,
                                if_exists='append',
                                index=False)
        finally:
            connection.close()
geneWordBuilder(func,[os.path.join(scratch_folder,(func+'.tsv'))],[1],['2 end'],['tab'],[True]) # Find any GO ontologies we have for the networks we have print('Finding applicable GO Ontologies...') GOnt_db = {} for name in co.available_datasets('GOnt')['Name']: gont = co.GOnt(name) if gont.refgen.name not in GOnt_db: GOnt_db[gont.refgen.name] = gont # Generate in memory term lists print('Finding all available terms...') terms = {} for ont in gwas_sets['data']: terms[ont[0]] = {'data': [(term.id,term.desc,len(term.loci), len(co.GWAS(ont[0]).refgen.candidate_genes(term.effective_loci(window_size=50000)))) for term in co.GWAS(ont[0]).iter_terms()]} # Set up the logging file handler = logging.FileHandler('COBErrors.log') handler.setLevel(logging.INFO) app.logger.addHandler(handler) app.logger.setLevel(logging.INFO) print('All Ready!') #--------------------------------------------- # Routes #--------------------------------------------- # Sends off the homepage @app.route('/') def index():