def simulate_false_candidates(self, eloci, FCR=0):
    """
        Simulate the effects of False Candidate Rates (FCR)

        Extra loci drawn from `self.cob.refgen.random_genes` are appended
        to the true loci; none of the input loci are removed.

        Parameters
        ----------
        eloci : (iterable of loci objects)
            Set of true starting loci
        FCR : (float - default = 0)
            False candidate rate. Can provide either a floating
            point percentage or a whole number.
            (i.e. method will convert 30 <-> 0.30)

        Returns
        -------
        an iterable of loci. When FCR <= 0 the input is returned
        untouched, otherwise a numpy array holding the input loci
        followed by the simulated false candidates.
    """
    # Convert a fractional rate (0 < FCR < 1) into a percentage
    if 0 < FCR < 1:
        FCR = FCR * 100
    # FCR: add a percentage of false positive loci
    if FCR > 0:
        # The number of false candidates is FCR percent of the number of
        # true loci, rounded up (divide by 100, not 101).
        num_fcr = math.ceil(len(eloci) * (FCR / 100))
        fcr_loci = self.cob.refgen.random_genes(
            num_fcr, window=self.args.candidate_window_size
        )
        log(
            "Simulating {}% of SNPs as false positive -> adding {} SNPs",
            FCR,
            len(fcr_loci),
        )
        # Append the false candidates after the true loci
        eloci = np.concatenate([eloci, np.array(list(fcr_loci))])
    return eloci
def simulate_missing_candidates(self, eloci, MCR=0):
    """
        Simulate the effects of Missing Candidate Rates (MCR)

        A random subset of the loci is kept so that roughly MCR percent
        of them go missing. At least two loci are always requested from
        the shuffled input.

        Parameters
        ----------
        eloci : (iterable of loci objects)
            Set of true starting loci
        MCR : (float - default = 0)
            Missing candidate rate. Can provide either a floating
            point percentage or a whole number.
            (i.e. method will convert 30 <-> 0.30)

        Returns
        -------
        an iterable of loci. The input itself when MCR is not positive,
        otherwise a slice of a random permutation of it.
    """
    # Fractions strictly between 0 and 1 are read as percentages
    rate = MCR * 100 if (MCR < 1 and MCR > 0) else MCR
    if not rate > 0:
        # Nothing to remove
        return eloci
    # Number of loci that survive, never fewer than two
    keep = max(math.ceil(len(eloci) * (1 - (rate / 100))), 2)
    survivors = np.random.permutation(eloci)[0:keep]
    log(
        "Simulating {}% of SNPs missed by GWAS ({} SNPs -> {})",
        rate,
        len(eloci),
        len(survivors),
    )
    return survivors
def generate_bootstraps(self, loci, overlap):
    """
        Bootstrapping procedure. Our target here is to provide enough
        bootstraps to identify loci that are significant at n==1000
        bootstraps. The auto procedure adds bootstraps in batches of 50
        and continues until 1000 have been generated OR more than 50 of
        them score at least as high as the empirical overlap, at which
        point significance at 1000 bootstraps is out of reach
        (0.05 * 1000 = 50).

        Parameters
        ----------
        loci : passed straight through to `self.overlap`
        overlap : object with a `score` column
            The empirical overlap; its mean score is the target.

        Returns
        -------
        The concatenation (pd.concat) of every bootstrap overlap.
    """
    empirical_score = overlap.score.mean()

    if self.args.num_bootstraps != "auto":
        # An explicit number of bootstraps was requested
        frames = []
        for idx in range(int(self.args.num_bootstraps)):
            frames.append(self.overlap(loci, bootstrap=True, iter_name=idx))
        return pd.concat(frames)

    ceiling = 1000
    # Lazily produce bootstraps so only the needed ones are computed
    pending = (
        self.overlap(loci, bootstrap=True, iter_name=idx)
        for idx in range(ceiling)
    )
    frames = []
    num_extreme = 0
    while num_extreme <= 50 and len(frames) < 1000:
        # Newest batch of 50 goes in front of the ones already drawn
        batch = [next(pending) for _ in range(50)]
        frames = batch + frames
        # Count bootstraps at least as extreme as the empirical score
        num_extreme = sum(
            [df.score.mean() >= empirical_score for df in frames]
        )
        log(
            "Iteration: {} -- current pval: {} {}% complete",
            len(frames),
            num_extreme / len(frames),
            max(len(frames) / 10, 100 * (num_extreme / 50)),
        )
    return pd.concat(frames)
def from_file(cls, filename, normalize=True):
    '''
        Build a new instance of `cls` by parsing a SOFT formatted
        text file.

        The file is read line by line:
          * `^TYPE = name` starts a new entity (Series, Sample,
            Platform or a generic Soft)
          * `!..._table_begin` / `!..._table_end` delimit a tab
            separated data table whose first row holds the headers
          * any other `!` line is a `key = value` info record
          * `#` lines are column descriptions

        Parameters
        ----------
        filename : str
            Path of the SOFT file to read.
        normalize : bool (default: True)
            When True, samples reporting `is_raw()` are passed through
            `transform()` before being stored.

        Returns
        -------
        The populated instance. Samples are appended to `samples`,
        every other entity is stored as an attribute named after its
        lowercased type.
    '''
    self = cls()

    def _store(soft):
        # File a completed entity on the family being built
        if soft.type == 'Sample':
            if soft.is_raw() and normalize:
                log("Normalizing {}", soft.name)
                soft.transform()
            self.samples.append(soft)
        else:
            setattr(self, soft.type.lower(), soft)

    with open(filename, 'r') as IN:
        in_data_table = False
        cur_soft = None
        cur_data = list()
        for line in IN:
            line = line.strip()
            if line.startswith('^'):
                if cur_soft:
                    # The previous entity is complete, keep it
                    _store(cur_soft)
                # We have a new SOFT entity
                soft_type, name = line.replace('^', '').replace(' = ', '=').split('=', 1)
                soft_type = soft_type.lower().capitalize()
                if soft_type == 'Series':
                    cur_soft = Series(name)
                elif soft_type == 'Sample':
                    cur_soft = Sample(name)
                elif soft_type == 'Platform':
                    cur_soft = Platform(name)
                else:
                    cur_soft = Soft(name, type=soft_type)
                cur_data = list()
            elif line.startswith('!') and 'table_begin' in line:
                in_data_table = True
            elif line.startswith('!') and 'table_end' in line:
                in_data_table = False
                # Create DataFrame and attach it to the current entity
                cur_headers = cur_data.pop(0)
                cur_soft.tbl = pd.DataFrame.from_records(data=cur_data, columns=cur_headers)
                # Index rows by the first column (`icol` no longer
                # exists in pandas, `iloc` is the positional accessor)
                cur_soft.tbl.index = cur_soft.tbl.iloc[:, 0]
                # Turn -Inf into NaNs
                cur_soft.tbl[cur_soft.tbl == float('-Inf')] = np.nan
                cur_data = list()
            elif line.startswith("!"):
                # Info record: strip the "!Type_" prefix, split key/value
                key, val = map(str.strip, line.replace('!' + cur_soft.type + '_', '').split('=', 1))
                cur_soft.update_info(key, val)
            elif line.startswith('#'):
                # Column descriptions
                cur_soft.headers.append(line)
            elif in_data_table:
                cur_data.append(line.replace('"', '').split('\t'))
    # Entities are only stored when the NEXT '^' line shows up, so the
    # last one in the file has to be stored here or it would be lost.
    if cur_soft:
        _store(cur_soft)
    return self
def effective_snps(self, window_size=None, max_genes_between=1):
    '''
        Collapse down loci that have overlapping windows.

        Loci from `self.locus_list` are sorted and walked in order;
        each locus that tests as `in` the previous collapsed locus is
        merged into it with `+`, otherwise it starts a new entry.

        Parameters
        ----------
        window_size : (default: None)
            When given, it is assigned to the `window` attribute of
            every locus before collapsing (the loci are modified in
            place).
        max_genes_between : (default: 1)
            Currently unused by this method.

        Returns
        -------
        A list of collapsed loci; empty when there are no loci.
    '''
    locus_list = sorted(self.locus_list)
    if window_size is not None:
        for locus in locus_list:
            locus.window = window_size
    collapsed = []
    for locus in locus_list:
        # The first locus always starts a group; later ones are merged
        # when their window overlaps the most recent group.
        if collapsed and locus in collapsed[-1]:
            collapsed[-1] = collapsed[-1] + locus
        else:
            collapsed.append(locus)
    log('{}: Found {} SNPs -> {} effective SNPs',self.name,len(self.locus_list),len(collapsed))
    return collapsed
def create(cls,name,description,type='Camoco'):
    '''
        This is a class method to create a new camoco type object.
        It initializes base directory hierarchy, registers the
        dataset in the base `datasets` table and returns `cls(name)`.

        A ConstraintError raised while registering (the name/type pair
        is the table's primary key) is only reported as a warning.
    '''
    # NOTE(review): `type` is recorded in the table but is not passed
    # on to `cls(name)` -- confirm that is intended.
    configured = cf.get('options','basedir')
    basedir = os.path.realpath(os.path.expanduser(configured))
    # Make sure the base directory and its sub-directories exist
    try:
        os.makedirs(basedir,exist_ok=True)
        for subdir in ("logs","databases","analyses","tmp"):
            os.makedirs(os.path.join(basedir,subdir),exist_ok=True)
    except Exception as e:
        log(' Could not create files in {}',basedir)
        raise
    registry_path = os.path.join(basedir,'databases','Camoco.Camoco.db')
    register_sql = '''
        CREATE TABLE IF NOT EXISTS datasets (
            name TEXT NOT NULL,
            description TEXT,
            type TEXT,
            added datetime DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY(name,type)
        );
        INSERT OR IGNORE INTO datasets (name,description,type)
            VALUES ('Camoco','Camoco base','Camoco');
        INSERT OR FAIL INTO datasets (name,description,type)
            VALUES (?,?,?)'''
    try:
        # Create the base camoco database and add this dataset to it
        lite.Connection(registry_path).cursor().execute(
            register_sql,(name,description,type)
        )
    except ConstraintError as e:
        log.warn('CAUTION! {}.{} Database already exists.',name,type)
    self = cls(name)
    return self
def _guess_groups(dataframe,max_r2=0.99,max_namediff=0.8): ''' Given a data frame, this method checks to see that each column has a correlation below the max_r2. If it is above, a new column is created using the mean.''' # Calculate correlation cors = dataframe.corr() # Each column starts in its own group column_groups = list(range(0,len(cors))) # Iterate over upper triangular, this guarantees that # we dont overwrite lower numbered groups # # #OOOOOOOOOOO <- group # i <- row number (along d) # ------------- <- matrix # |d x x <- row (x means r2 > max) # | d # | d # | d # | d for i,row in enumerate(np.triu(cors.as_matrix(),k=1)): if any(row > max_r2): # higher numbered columns are highly correlated with current column which = np.where(row > max_r2)[0] from difflib import SequenceMatcher for match in which: diffratio = SequenceMatcher(None,dataframe.columns[i],dataframe.columns[match]).ratio() # if column group is already assigned, keep assignment if column_groups[i] != i and diffratio > max_namediff: # Here, samples have high expression correlation AND similar names (i.e. rep1 vs rep2) group = column_groups[i] log("{} is {} correlated with {}", dataframe.columns[i], diffratio, dataframe.columns[match] ) else: # Otherwise start your own group group = i for x in which: column_groups[x] = group return column_groups
def create(cls, name, description, type='Camoco'):
    '''
        This is a class method to create a new camoco type object.
        It initializes base directory hierarchy, records the dataset
        in the base `datasets` table and hands back `cls(name)`.

        If the insert trips a ConstraintError (name and type form the
        primary key of the table) a warning is logged and the method
        carries on.
    '''
    # NOTE(review): `type` is stored in the table but `cls(name)` is
    # built without it -- confirm that is intended.
    basedir = os.path.expanduser(cf.get('options', 'basedir'))
    basedir = os.path.realpath(basedir)
    # Create the basedir (and its standard children) if not present
    needed = [basedir] + [
        os.path.join(basedir, child)
        for child in ("logs", "databases", "analyses", "tmp")
    ]
    try:
        for directory in needed:
            os.makedirs(directory, exist_ok=True)
    except Exception as e:
        log(' Could not create files in {}', basedir)
        raise
    bindings = (name, description, type)
    try:
        # Create the base camoco database
        lite.Connection(
            os.path.join(basedir, 'databases', 'Camoco.Camoco.db')
        ).cursor().execute(
            '''
            CREATE TABLE IF NOT EXISTS datasets (
                name TEXT NOT NULL,
                description TEXT,
                type TEXT,
                added datetime DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY(name,type)
            );
            INSERT OR IGNORE INTO datasets (name,description,type)
                VALUES ('Camoco','Camoco base','Camoco');
            INSERT OR FAIL INTO datasets (name,description,type)
                VALUES (?,?,?)''',
            bindings)
    except ConstraintError as e:
        log.warn('CAUTION! {}.{} Database already exists.', name, type)
    self = cls(name)
    return self
def __init__(self, name, type='Camoco', basedir="~/.camoco"):
    """
        Load an existing dataset.

        Opens the dataset's own database ("<type>.<name>"), reads its
        record from the `datasets` table of the 'Camoco.Camoco'
        database and makes sure the `globals` key/value table exists.

        Parameters
        ----------
        name : str
            Name of the dataset.
        type : str (default: 'Camoco')
            Type of the dataset.
        basedir : str (default: "~/.camoco")
            Not used by this method.

        Raises
        ------
        TypeError
            If no dataset with this name and type is registered.
    """
    self.log = log()
    self.db = self._database(".".join([type, name]))
    # Look the dataset up in the base table
    record = self._database('Camoco.Camoco').cursor().execute(
        "SELECT rowid,* FROM datasets WHERE name = ? AND type = ?",
        (name, type)).fetchone()
    if record is None:
        # Unpacking None would raise a TypeError with an unhelpful
        # message; keep the exception type but say what went wrong.
        raise TypeError('{}.{} does not exist'.format(type, name))
    (self.ID, self.name, self.description, self.type,
     self.added) = record
    cur = self.db.cursor()
    cur.execute('''
        CREATE TABLE IF NOT EXISTS globals (
            key TEXT,
            val TEXT
        );
        CREATE UNIQUE INDEX IF NOT EXISTS uniqkey ON globals(key)
        ''')
def __init__(self,name,type='Camoco',basedir="~/.camoco"):
    '''
        Load an existing dataset.

        Opens the dataset's own database ("<type>.<name>"), reads its
        record from the `datasets` table of the 'Camoco.Camoco'
        database and makes sure the `globals` key/value table exists.

        name    -- name of the dataset
        type    -- type of the dataset (default: 'Camoco')
        basedir -- not used by this method

        Raises a TypeError if no dataset with this name and type
        is registered.
    '''
    self.log = log()
    self.db = self._database(".".join([type,name]))
    # Look the dataset up in the base table
    record = self._database('Camoco.Camoco').cursor().execute(
        "SELECT rowid,* FROM datasets WHERE name = ? AND type = ?",
        (name,type)
    ).fetchone()
    if record is None:
        # Unpacking None would raise a TypeError with an unhelpful
        # message; keep the exception type but say what went wrong.
        raise TypeError('{}.{} does not exist'.format(type,name))
    (self.ID,self.name,self.description,
     self.type,self.added) = record
    cur = self.db.cursor()
    cur.execute('''
        CREATE TABLE IF NOT EXISTS globals (
            key TEXT,
            val TEXT
        );
        CREATE UNIQUE INDEX IF NOT EXISTS uniqkey ON globals(key)
        ''')
def wget(id,force=False):
    '''
        Downloads the GEO series family SOFT file into the current
        working directory as "<id>_family.soft.gz".

        id    -- series accession (e.g. "GSE12345")
        force -- download even if the file already exists

        Returns None. A failed download is logged (with the reason)
        rather than raised.
    '''
    target = "{}_family.soft.gz".format(id)
    if os.path.exists(target) and not force:
        log("{} already exists",id)
        return
    # The FTP site groups series into range directories named after the
    # accession with its last three digits replaced by "nnn". Accessions
    # with fewer than three digits must keep their three letter prefix
    # (GSE12 lives under GSEnnn), so never cut into the first 3 chars.
    stem = id[:max(len(id)-3,3)]
    url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/{}nnn/{}/soft/{}_family.soft.gz".format(stem,id,id)
    try:
        log("Fetching {}",id)
        urllib.request.urlretrieve(url,target)
    except Exception as e:
        # Best effort: report why the download failed and carry on
        log("Could not download {}: {}",id,e)
def __init__(self,name,type='Camoco',basedir="~/.camoco"):
    '''
        Load an existing dataset: open its database, read its record
        from the `datasets` table of the base 'Camoco' database and
        make sure the `globals` key/value table exists.

        Any TypeError raised while doing so (notably when the lookup
        finds no row) is re-raised as
        TypeError('<type>.<name> does not exist').
        `basedir` is not used by this method.
    '''
    self.log = log()
    self.type = type
    self.db = self._database(name)
    lookup = "SELECT rowid,* FROM datasets WHERE name = ? AND type = ?"
    try:
        row = self._database('Camoco',type='Camoco').cursor().execute(
            lookup,(name,type)
        ).fetchone()
        # Fails with a TypeError when no row came back
        self.ID,self.name,self.description,self.type,self.added = row
        cur = self.db.cursor()
        cur.execute('''
            CREATE TABLE IF NOT EXISTS globals (
                key TEXT,
                val TEXT
            );
            CREATE UNIQUE INDEX IF NOT EXISTS uniqkey ON globals(key)
            ''')
    except TypeError as e:
        raise TypeError('{}.{} does not exist'.format(type,name))
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        Reads from `args`: out, force, refgen, gwas, terms,
        candidate_window_size, candidate_flank_limit, snp2gene,
        strongest_attr, strongest_higher, include_parent_attrs and
        gene_info. For every term / window size / flank limit
        combination the candidate genes are collected into one table,
        optionally joined with the gene info files, written to
        `args.out` as tab separated text and summarised on stdout.

        Returns None (also when an existing output is skipped).
        Raises ValueError when `args.refgen` names neither an
        available COB nor RefGen, or when `args.snp2gene` contains
        neither 'effective' nor 'strongest'.
    '''
    if args.out != sys.stdout:
        # Create any non-existent directories
        out_dir = os.path.dirname(args.out)
        if out_dir != '':
            os.makedirs(out_dir,exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(
                    args.out
                ),file=sys.stderr
            )
            return None
    # Set a flag saying this is from a COB refgen
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.Tools.available_datasets('Expr',args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen
    elif co.Tools.available_datasets('RefGen',args.refgen):
        refgen = co.RefGen(args.refgen)
    else:
        # Without this `refgen` would be unbound further down
        raise ValueError(
            "Could not find a COB or RefGen named '{}'".format(args.refgen)
        )
    # Create the GWAS object
    ont = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    # Collect the per-combination tables and concatenate them once,
    # instead of growing a DataFrame inside the loop.
    frames = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if 'effective' in args.snp2gene:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                elif 'strongest' in args.snp2gene:
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                else:
                    raise ValueError(
                        "Unknown SNP to gene mapping '{}'".format(args.snp2gene)
                    )
                genes = pd.DataFrame([
                    x.as_dict() for x in
                    refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term':term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob is not False:
                    genes['COB'] = from_cob
                frames.append(genes)
    data = pd.concat(frames) if frames else pd.DataFrame()

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}',info_file)
        # Assume the file is a table
        info = pd.read_table(info_file,sep='\t')
        if len(info.columns) == 1:
            # A single column means it was probably comma separated
            info = pd.read_table(info_file,sep=',')
        # try to match as many columns as possible
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}",','.join(matching_columns))
        data = pd.merge(data,info,how='left')
        if len(data) != original_number_genes:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )

    # Generate the output file
    data.to_csv(args.out,index=None,sep='\t')

    log("Summary stats")
    print('-'*100)
    print("Mapped {} SNPs to {} genes".format(len(data.parent_locus.unique()),len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))
import sys
import os
import pandas as pd
import numpy as np
import scipy as sp
import camoco as co
from itertools import chain
from camoco.Tools import log

# Initialize a new log object
log = log()


def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        When `args.out` is not sys.stdout, its parent directory is
        created if needed. If the output already exists and
        `args.force` is not set, a notice is printed to stderr and
        None is returned.
    '''
    if args.out != sys.stdout:
        # Make sure the directory holding the output exists
        parent = os.path.dirname(args.out)
        if parent != '':
            os.makedirs(parent,exist_ok=True)
        already_there = os.path.exists(args.out)
        if already_there and not args.force:
            notice = "Output for {} exists! Skipping!".format(args.out)
            print(notice,file=sys.stderr)
            return None
def from_CLI(cls, args):
    """
        Implements an interface to the CLI to perform GWAS simulation

        For every selected GO term the effective loci found in the COB
        are degraded with the missing/false candidate simulations, the
        overlap is scored and compared against bootstraps, and one
        block of rows is appended to the results. The combined table
        is stored on `self.results` and written to `args.out` as tab
        separated text.

        Parameters
        ----------
        args : parsed CLI arguments
            Read here: GOnt, cob, terms, min_term_size, max_term_size,
            candidate_window_size, candidate_flank_limit, percent_mcr,
            percent_fcr, method and out.

        Returns
        -------
        None
    """
    # NOTE(review): an empty term list, or every term being skipped,
    # makes np.min / pd.concat below raise -- confirm whether that
    # should be handled.
    self = cls()
    # Build the base objects
    self.args = args
    # Load camoco objects
    self.go = co.GOnt(self.args.GOnt)
    self.cob = co.COB(self.args.cob)
    self.generate_output_name()

    # Generate an iterable of GO Terms
    if "all" in self.args.terms:
        # Create a list of all terms within the size specification
        terms = list(
            self.go.iter_terms(
                min_term_size=self.args.min_term_size,
                max_term_size=self.args.max_term_size,
            ))
    elif os.path.exists(self.args.terms[0]):
        # If parameter is a filename, read term names from the file
        # (one per line) and make sure the handle gets closed.
        with open(args.terms[0]) as term_file:
            terms = [self.go[line.strip()] for line in term_file]
    else:
        # Generate terms from a parameter list
        terms = [self.go[x] for x in self.args.terms]
    # Iterate and calculate
    log("Simulating GWAS for {} GO Terms", len(terms))
    min_term_size = np.min([len(x) for x in terms])
    max_term_size = np.max([len(x) for x in terms])
    log("All terms are between {} and {} 'SNPs'", min_term_size,
        max_term_size)

    results = []
    for i, term in enumerate(terms):
        log("-" * 75)
        window_size = self.args.candidate_window_size
        flank_limit = self.args.candidate_flank_limit
        # Generate a series of densities for parameters
        num_genes = len([x for x in term.loci if x in self.cob])
        eloci = [
            x for x in term.effective_loci(window_size=window_size)
            if x in self.cob
        ]
        eloci = self.simulate_missing_candidates(eloci,
                                                 self.args.percent_mcr)
        eloci = self.simulate_false_candidates(eloci,
                                               self.args.percent_fcr)
        log(
            "GWAS Simulation {}: {} ({}/{} genes in {})",
            i,
            term.id,
            len(eloci),
            num_genes,
            self.cob.name,
        )
        # Make sure that the number of genes is adequate
        if num_genes > self.args.max_term_size:
            log("Too many genes... skipping")
            continue
        elif num_genes < self.args.min_term_size:
            log("Too few genes... skipping")
            continue
        elif num_genes == 0:
            continue
        # Generate candidate genes from the effective loci
        candidates = self.cob.refgen.candidate_genes(
            eloci, flank_limit=flank_limit)
        log(
            "SNP to gene mapping finds {} genes at window:{} bp, "
            "flanking:{} genes",
            len(candidates),
            self.args.candidate_window_size,
            self.args.candidate_flank_limit,
        )
        overlap = self.overlap(eloci)
        # Dont bother bootstrapping on terms with overlap score below 0
        if overlap.score.mean() < 0:
            continue
        bootstraps = self.generate_bootstraps(eloci, overlap)
        bs_mean = bootstraps.groupby("iter").score.apply(np.mean).mean()
        bs_std = bootstraps.groupby("iter").score.apply(np.std).mean()
        # Calculate z scores for density
        overlap["zscore"] = (overlap.score - bs_mean) / bs_std
        bootstraps["zscore"] = (bootstraps.score - bs_mean) / bs_std
        # Fraction of bootstrap iterations whose mean score is at
        # least as high as the empirical mean score
        overlap_pval = (sum(
            bootstraps.groupby("iter").apply(lambda x: x.score.mean()) >=
            overlap.score.mean())) / len(bootstraps.iter.unique())
        # Create a results object
        overlap["COB"] = self.cob.name
        overlap["Ontology"] = self.go.name
        overlap["Term"] = term.id
        overlap["WindowSize"] = self.args.candidate_window_size
        overlap["FlankLimit"] = self.args.candidate_flank_limit
        overlap["FCR"] = args.percent_fcr
        overlap["MCR"] = args.percent_mcr
        overlap["NumRealGenes"] = num_genes
        overlap["NumEffective"] = len(eloci)
        overlap["NumCandidates"] = len(candidates)
        overlap["TermSize"] = len(term)
        overlap["TermCollapsedLoci"] = len(eloci)
        overlap["TermPValue"] = overlap_pval
        overlap["NumBootstraps"] = len(bootstraps.iter.unique())
        overlap["Method"] = self.args.method
        results.append(overlap.reset_index())

    self.results = pd.concat(results)
    self.results.to_csv(args.out, sep="\t", index=False)
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        Reads from `args`: out, force, refgen, gwas, terms,
        candidate_window_size, candidate_flank_limit, snp2gene,
        strongest_attr, strongest_higher, include_parent_attrs and
        gene_info. For every term / window size / flank limit
        combination the candidate genes are collected into one table,
        optionally joined with the gene info files, written to
        `args.out` as tab separated text and summarised on stdout.

        Returns None (also when an existing output is skipped).
        Raises ValueError when `args.refgen` names neither an
        available COB nor RefGen, or when `args.snp2gene` contains
        neither 'effective' nor 'strongest'.
    '''
    if args.out != sys.stdout:
        # Create any non-existent directories
        out_dir = os.path.dirname(args.out)
        if out_dir != '':
            os.makedirs(out_dir,exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(
                    args.out
                ),file=sys.stderr
            )
            return None
    # Set a flag saying this is from a COB refgen
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.available_datasets('Expr',args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen
    elif co.available_datasets('RefGen',args.refgen):
        refgen = co.RefGen(args.refgen)
    else:
        # Without this `refgen` would be unbound further down
        raise ValueError(
            "Could not find a COB or RefGen named '{}'".format(args.refgen)
        )
    # Create the GWAS object
    ont = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    # Collect the per-combination tables and concatenate them once,
    # instead of growing a DataFrame inside the loop.
    frames = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if 'effective' in args.snp2gene:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                elif 'strongest' in args.snp2gene:
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                else:
                    raise ValueError(
                        "Unknown SNP to gene mapping '{}'".format(args.snp2gene)
                    )
                genes = pd.DataFrame([
                    x.as_dict() for x in
                    refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term':term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob is not False:
                    genes['COB'] = from_cob
                frames.append(genes)
    data = pd.concat(frames) if frames else pd.DataFrame()

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}',info_file)
        # Assume the file is a table
        info = pd.read_table(info_file,sep='\t')
        if len(info.columns) == 1:
            # A single column means it was probably comma separated
            info = pd.read_table(info_file,sep=',')
        # try to match as many columns as possible
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}",','.join(matching_columns))
        data = pd.merge(data,info,how='left')
        if len(data) != original_number_genes:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )

    # Generate the output file
    data.to_csv(args.out,index=None,sep='\t')

    log("Summary stats")
    print('-'*100)
    print("Mapped {} SNPs to {} genes".format(len(data.parent_locus.unique()),len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))