Example #1
def locality(args):
    log = coblog()
    log("\n"
        "-----------------------\n"
        "   Network Locality    \n"
        "-----------------------\n")
    # Generate output dirs
    if args.out != sys.stdout:
        args.out = "{}_Locality.tsv".format(args.out.replace(".tsv", ""))
    if os.path.dirname(args.out) != "":
        os.makedirs(os.path.dirname(args.out), exist_ok=True)
    if os.path.exists("{}_Locality.tsv".format(args.out.replace(".tsv", ""))):
        log("{}_Locality.csv exists! Skipping!".format(
            args.out.replace(".tsv", "")))
        return None
    # Grab the COB object
    cob = co.COB(args.cob)
    gwas = co.GWAS(args.gwas)
    # If there is a different score for 'significant', update the COB object
    if args.sig_edge_zscore is not None:
        cob.set_sig_edge_zscore(args.sig_edge_zscore)
    # If 'all' was requested, grab a generator over every term
    if "all" in args.terms:
        terms = gwas.iter_terms()
    else:
        # Otherwise get the term out of the GWAS
        terms = (gwas[x] for x in args.terms)

    # Compute locality for each term and write out the table
    locality = pd.DataFrame([generate_data(cob, x, args) for x in terms])
    locality.to_csv(args.out, sep="\t", index=None)
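
A minimal sketch of driving this entry point directly from Python rather than through the camoco CLI. The attribute names mirror exactly what locality() reads above; the dataset names ("ZmPAN", "ZmIonome") and the output path are placeholders for whatever COB and GWAS datasets exist locally.

from argparse import Namespace

args = Namespace(
    out="results/ZmPAN",      # expanded to results/ZmPAN_Locality.tsv
    cob="ZmPAN",              # name of an existing co.COB network (placeholder)
    gwas="ZmIonome",          # name of an existing co.GWAS dataset (placeholder)
    terms=["all"],            # or a list of specific term ids
    sig_edge_zscore=None,     # keep the network's default significance cutoff
)
locality(args)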
Example #2
def ZmIonome(Zm5bFGS):
    # Delete the old dataset
    if cf.test.force.Ontology:
        tools.del_dataset("GWAS", "ZmIonome", force=True)
    if not tools.available_datasets("GWAS", "ZmIonome"):
        # Grab the path to the csv
        csv = os.path.join(
            cf.options.testdir,
            "raw",
            "GWAS",
            "Ionome",
            "sigGWASsnpsCombinedIterations.longhorn.allLoc.csv.gz",
        )
        # Read the raw GWAS hits into a DataFrame
        # (pd.DataFrame.from_csv was removed in pandas 1.0; read_csv replaces it)
        df = pd.read_csv(csv, index_col=None)
        # Build the GWAS object from the DataFrame
        IONS = co.GWAS.from_DataFrame(
            df,
            "ZmIonome",
            "Maize Ionome",
            Zm5bFGS,
            term_col="el",
            chr_col="chr",
            pos_col="pos",
        )
        # Get rid of pesky Cobalt
        IONS.del_term("Co59")
        # I guess we need a test in here too
        return IONS
    else:
        return co.GWAS("ZmIonome")
Example #3
def ZmWallace(Zm5bFGS):
    if cf.test.force.Ontology:
        tools.del_dataset("GWAS", "ZmWallace", force=True)
    if not tools.available_datasets("GWAS", "ZmWallace"):
        # Grab the path to the csv
        csv = os.path.join(
            cf.options.testdir,
            "raw",
            "GWAS",
            "WallacePLoSGenet",
            "Wallace_etal_2014_PLoSGenet_GWAS_hits-150112.txt.gz",
        )
        # Read the raw GWAS hits into a DataFrame
        df = pd.read_csv(csv, index_col=None, sep="\t")
        # Build the GWAS object from the DataFrame
        gwas = co.GWAS.from_DataFrame(
            df,
            "ZmWallace",
            "Wallace PLoS ONE Dataset.",
            Zm5bFGS,
            term_col="trait",
            chr_col="chr",
            pos_col="pos",
        )
        return gwas
    else:
        return co.GWAS("ZmWallace")
Example #4
    def SNP2Gene_breakdown(self, COB=None):
        '''
        Provides a breakdown of the SNP-to-gene mapping parameters for each
        term in the Overlap. Includes the number of initial Loci, the number
        of collapsed Loci (within a window) and the number of candidate genes
        (within a window and up to a flank limit).

        Parameters
        ----------
        COB : str (default: None)
            If specified, the results will include only SNP-to-gene mappings
            from that single COB network. If None, the results cover the set
            of genes across all COB networks.
        '''
        # Get some help
        def bp_to_kb(bp):
            return "{}KB".format(int(bp/1000))
        def get_level(df,level):
            ''' Returns the level values by name '''
            level_index = df.columns.names.index(level)
            return df.columns.levels[level_index]
        # Prepare the data frame results
        if COB is None:
            results = self.results
        else:
            results = self.results.query('COB=="{}"'.format(COB))
        # Total for the Ionome
        ont = co.GWAS(self.results.Ontology.unique()[0])
        ref = co.COB(self.results.COB.unique()[0])._parent_refgen
        # Make an aggregate term
        total = co.Term('total',loci=set(chain(* [x.loci for x in ont.terms()])))
        # Calculate number of SNPs
        snps = pd.DataFrame(pd.pivot_table(results,index="Term",values='TermLoci'))
        snps.columns = pd.MultiIndex.from_product([['GWAS SNPs'],['-'],['-']],names=['Name','WindowSize','FlankLimit'])
        snps.loc['Total'] = len(total.loci)  # .ix was removed in pandas 1.0; use .loc
        # Calculate number of Candidate Loci
        loci = pd.pivot_table(results,index="Term",columns=['WindowSize'],values='TermCollapsedLoci')
        for window_size in loci.columns:
            loci.loc['Total',window_size] = len(total.effective_loci(window_size))
        loci.columns = pd.MultiIndex.from_product([['Collapsed Loci'],list(map(bp_to_kb,loci.columns)),['-']],names=['Name','WindowSize','FlankLimit'])
        # Calculate number of Candidate Genes
        genes = pd.pivot_table(results,index='Term',columns=['WindowSize','FlankLimit'],values='gene',aggfunc=lambda x: len(set(x)))
        for window_size in get_level(genes,'WindowSize'):
            for flank_limit in get_level(genes,'FlankLimit'):
                genes.loc['Total',(window_size,flank_limit)] = len(ref.candidate_genes(total.effective_loci(window_size=window_size),flank_limit=flank_limit))
        genes.columns = pd.MultiIndex.from_product(
            [['Candidate Genes'],
                list(map(bp_to_kb,get_level(genes,"WindowSize"))),
                get_level(genes,'FlankLimit')
            ],
            names=['Name','WindowSize','FlankLimit']
        )
        results = snps.join(loci).join(genes)
        #ionome_eff_loci = [len()]
        return results.astype(int)
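
The table returned above carries a three-level column header (Name / WindowSize / FlankLimit) built with MultiIndex.from_product. A toy reconstruction of that shape, with made-up window sizes and flank limits:

import pandas as pd

cols = pd.MultiIndex.from_product(
    [["Candidate Genes"], ["50KB", "100KB"], [1, 2]],
    names=["Name", "WindowSize", "FlankLimit"],
)
toy = pd.DataFrame(0, index=["Term1", "Total"], columns=cols)
print(list(toy.columns.names))  # ['Name', 'WindowSize', 'FlankLimit']
print(toy)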
Example #5
def term_network():
    # Get data from the form and derive some stuff
    cob = networks[str(request.form['network'])]
    ontology = str(request.form['ontology'])
    term = str(request.form['term'])
    nodeCutoff = safeOpts('nodeCutoff',int(request.form['nodeCutoff']))
    edgeCutoff = safeOpts('edgeCutoff',float(request.form['edgeCutoff']))
    windowSize = safeOpts('windowSize',int(request.form['windowSize']))
    flankLimit = safeOpts('flankLimit',int(request.form['flankLimit']))
    
    # Get the candidates
    cob.set_sig_edge_zscore(edgeCutoff)
    genes = cob.refgen.candidate_genes(
        co.GWAS(ontology)[term].effective_loci(window_size=windowSize),
        flank_limit=flankLimit,
        chain=True,
        include_parent_locus=True,
        #include_parent_attrs=['numIterations', 'avgEffectSize'],
        include_num_intervening=True,
        include_rank_intervening=True,
        include_num_siblings=True)
    
    # Base of the result dict
    net = {}
    
    # If there are GWAS results, pass them in
    if ontology in gwas_data_db:
        gwas_data = gwas_data_db[ontology].get_data(cob=cob.name,
            term=term,windowSize=windowSize,flankLimit=flankLimit)
        net['nodes'] = getNodes(genes, cob, term, gwas_data=gwas_data, nodeCutoff=nodeCutoff)
    
    # Otherwise just run it without
    else:
        net['nodes'] = getNodes(genes, cob, term, nodeCutoff=nodeCutoff)
    
    # Get the edges of the nodes that will be rendered
    render_list = []
    for node in net['nodes']:
        if node['data']['render'] == 'x':
            render_list.append(node['data']['id'])
    net['edges'] = getEdges(render_list, cob)
    
    # Log Data Point to COB Log
    cob.log(term + ': Found ' +
        str(len(net['nodes'])) + ' nodes, ' +
        str(len(net['edges'])) + ' edges')
    
    # Return it as a JSON object
    return jsonify(net)
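
A sketch of exercising this handler through Flask's test client; the URL, dataset names, and term id below are placeholders (the route decorator is not shown in the snippet above), but the form fields match exactly what the handler reads from request.form.

with app.test_client() as client:
    resp = client.post("/term_network", data={
        "network": "ZmPAN",        # placeholder network name
        "ontology": "ZmIonome",    # placeholder GWAS name
        "term": "Fe57",            # placeholder term id
        "nodeCutoff": "20",
        "edgeCutoff": "3.0",
        "windowSize": "50000",
        "flankLimit": "2",
    })
    net = resp.get_json()
    print(len(net["nodes"]), len(net["edges"]))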
Example #6
def list_command(args):
    if args.type is not None and args.name is not None:
        if args.terms:
            if args.type == 'GWAS':
                gwas = co.GWAS(args.name)
                print('\n'.join([x.id for x in gwas.iter_terms()]))
            elif args.type == 'GOnt':
                gont = co.GOnt(args.name)
                print('\n'.join([x.id for x in gont.iter_terms()]))
        else:
            print(co.available_datasets(args.type,args.name))
    elif args.type is not None and args.name is None:
        args.name = '%'
        print(co.available_datasets(args.type,args.name).to_string())
    else:
        args.type = '%'
        args.name = '%'
        print(co.available_datasets(args.type,args.name).to_string())
Example #7
def list_command(args):
    if args.type is not None and args.name is not None:
        if args.terms:
            if args.type == "GWAS":
                gwas = co.GWAS(args.name)
                print("\n".join([x.id for x in gwas.iter_terms()]))
            elif args.type == "GOnt":
                gont = co.GOnt(args.name)
                print("\n".join([x.id for x in gont.iter_terms()]))
        if args.names:
            print(" ".join(available_datasets(args.type, args.name).Name))
        else:
            print(available_datasets(args.type, args.name))
    elif args.type is not None and args.name is None:
        args.name = "%"
        print(available_datasets(args.type, args.name).to_string())
    else:
        args.type = "%"
        args.name = "%"
        print(available_datasets(args.type, args.name).to_string())
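
One plausible way to wire this handler into an argparse sub-command; the flag names below are inferred from the attributes the function reads (type, name, terms, names) and are not necessarily the exact flags the real camoco CLI registers.

import argparse

parser = argparse.ArgumentParser(prog="camoco")
subparsers = parser.add_subparsers()
lister = subparsers.add_parser("list", help="List available datasets")
lister.add_argument("--type", default=None)
lister.add_argument("--name", default=None)
lister.add_argument("--terms", action="store_true")
lister.add_argument("--names", action="store_true")
lister.set_defaults(func=list_command)

args = parser.parse_args(["list", "--type", "GWAS"])
args.func(args)   # prints the table of available GWAS datasets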
Example #8
def ZmWallace(Zm5bFGS):
    if cf.test.force.Ontology:
        tools.del_dataset('GWAS', 'ZmWallace', force=True)
    if not tools.available_datasets('GWAS', 'ZmWallace'):
        # Grab the path to the csv
        csv = os.path.join(
            cf.options.testdir, 'raw', 'GWAS', 'WallacePLoSGenet',
            'Wallace_etal_2014_PLoSGenet_GWAS_hits-150112.txt.bz2')
        # Read the raw GWAS hits into a DataFrame
        df = pd.read_csv(csv, index_col=None, sep='\t')
        # Build the GWAS object from the DataFrame
        gwas = co.GWAS.from_DataFrame(df,
                                      'ZmWallace',
                                      'Wallace PLoS ONE Dataset.',
                                      Zm5bFGS,
                                      term_col='trait',
                                      chr_col='chr',
                                      pos_col='pos')
        return gwas
    else:
        return co.GWAS('ZmWallace')
Example #9
File: plotGWAS.py  Project: zhaijj/Camoco
def plot_gwas(args):
    # snag the appropriate COB
    cob = co.COB(args.cob)
    # snag the GWAS object
    gwas = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = gwas.iter_terms()
    else:
        terms = [gwas[term] for term in args.terms]

    # Make a plot for each Term
    for term in terms:
        loci = list(term.loci)
        # create a dictionary of Loci which we can refer to using ids
        locus_lookup = {x.id: x for x in loci}
        # Each chromosome gets a plot
        chroms = set([x.chrom for x in loci])

        # Create a figure with a subplot for each chromosome
        f, axes = plt.subplots(len(chroms), figsize=(15, 4 * len(chroms)))
        plt.title('{} Term'.format(term.id))
        # Pull out the snp to gene mappings
        if args.snp2gene == 'effective':
            loci = sorted(
                term.effective_loci(window_size=args.candidate_window_size))
        elif args.snp2gene == 'strongest':
            loci = term.strongest_loci(window_size=args.candidate_window_size,
                                       attr=args.strongest_attr,
                                       lowest=args.strongest_higher)
        else:
            raise ValueError('{} not valid snp2gene mapping'.format(
                args.snp2gene))

        # iterate over Loci
        seen_chroms = set()
        voffset = 1  # Vertical Offset
        current_axis = 0
        y_labels = []
        y_ticks = []
        for i, locus in enumerate(loci):
            hoffset = -1 * locus.window
            # Reset the temp variables if necessary
            if locus.chrom not in seen_chroms:
                seen_chroms.add(locus.chrom)
                current_axis = len(seen_chroms) - 1
                voffset = 1
                if len(y_labels) > 0 and current_axis > 0:
                    # Set the old labels in the current
                    axes[current_axis - 1].set_yticks(y_ticks)
                    axes[current_axis - 1].set_yticklabels(y_labels)
                y_labels = []
                y_ticks = []
            # Get current axis
            cax = axes[current_axis]
            # Set up labels the first time we touch this axis
            if voffset == 1:
                cax.set_ylabel('Chrom: ' + locus.chrom)
                cax.set_xlabel('Loci')
            # (Axes.hold() was removed in matplotlib 3.0; overplotting is now the default)

            # Plot ALL Genes
            for gene in gwas.refgen.candidate_genes(locus, flank_limit=10e10):
                cax.barh(y=voffset,  # barh's 'bottom' kwarg became 'y' in matplotlib 2.0
                         width=len(gene),
                         height=5,
                         zorder=1,
                         left=hoffset + gene.start - locus.start +
                         locus.window,
                         label='RefGen Genes',
                         color='grey')
            # Plot the genes that passed QC (present in the network's RefGen)
            for gene in cob.refgen.candidate_genes(locus, flank_limit=10e10):
                cax.barh(y=voffset,
                         width=len(gene),
                         height=5,
                         zorder=1,
                         left=hoffset + gene.start - locus.start +
                         locus.window,
                         label='Gene Passed QC',
                         color='green')

            # Plot the candidate genes
            for gene in cob.refgen.candidate_genes(
                    locus, flank_limit=args.candidate_flank_limit):
                cax.barh(y=voffset,
                         width=len(gene),
                         height=5,
                         zorder=1,
                         left=hoffset + gene.start - locus.start +
                         locus.window,
                         label='Candidate Gene',
                         color='red')

            # Plot the Effective Locus
            cax.scatter(  # Upstream
                hoffset, voffset, marker='>', zorder=2)
            cax.scatter(  # Start
                hoffset + locus.window,
                voffset,
                marker='.',
                color='blue',
                zorder=2)
            cax.scatter(  # Stop
                hoffset + locus.window + len(locus),
                voffset,
                marker='.',
                color='blue',
                zorder=2)
            cax.scatter(  # Downstream
                hoffset + locus.window + len(locus) + locus.window,
                voffset,
                marker='<',
                zorder=2)
            # Plot the Sub Loci
            for id in locus.sub_loci:
                if id in locus_lookup:
                    sub_locus = locus_lookup[id]
                    cax.scatter(hoffset + locus.window +
                                abs(sub_locus.start - locus.start),
                                voffset,
                                zorder=2,
                                marker='.',
                                label='SNP',
                                color='blue')

            # place a block for interlocal distance
            y_labels.append(commify(locus.start))
            y_ticks.append(voffset)
            voffset += 10
        # Have to finish off the ticks on the last chromosome
        axes[current_axis].set_yticks(y_ticks)
        axes[current_axis].set_yticklabels(y_labels)
        # Save Plot
        plt.savefig(args.out.replace('.png', '_{}.png'.format(term.id)))
        plt.close()
Example #10
    def from_CLI(cls, args):
        """
            Implements an interface for the CLI to perform overlap
            Analysis
        """
        if args.genes != [None]:
            source = "genes"
        elif args.go is not None:
            source = "go"
        elif args.gwas is not None:
            source = "gwas"
        elif args.ontology is not None:
            source = 'ontology'
        self = cls.create(source+'_CLI', description="CLI Overlap")
        self.source = source
        self.args = args
        # Build base camoco objects
        self.cob = co.COB(args.cob)

        # Generate the ontology of terms that we are going to look
        # at the overlap of
        if source == "genes":
            # Be smart about this
            import re

            args.genes = list(chain(*[re.split("[,; ]", x) for x in args.genes]))
            self.ont = pd.DataFrame()
            self.ont.name = "GeneList"
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        elif source == "go":
            self.ont = co.GOnt(args.go)
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        elif source == "gwas":
            self.ont = co.GWAS(args.gwas)
        elif source == 'ontology':
            self.ont = co.Ontology(args.ontology)
        else:
            raise ValueError(
                "Please provide a valid overlap source (--genes, --go or --gwas)"
            )
        try:
            self.generate_output_name()
        except ValueError as e:
            return

        # Save strongest description arguments if applicable
        if "strongest" in self.args.snp2gene:
            if not (self.ont._global("strongest_attr") == args.strongest_attr):
                self.ont.set_strongest(attr=args.strongest_attr)
            if not (
                bool(int(self.ont._global("strongest_higher")))
                == bool(args.strongest_higher)
            ):
                self.ont.set_strongest(higher=args.strongest_higher)

        # Generate a terms iterable
        if self.source == "genes":
            # make a single term
            loci = self.cob.refgen.from_ids(self.args.genes)
            if len(loci) < len(self.args.genes):
                self.cob.log("Some input genes not in network")
            terms = [Term("CustomTerm", desc="Custom from CLI", loci=loci)]
        else:
            # Generate terms from the ontology
            if "all" in self.args.terms:
                terms = list(self.ont.iter_terms())
            else:
                terms = [self.ont[term] for term in self.args.terms]
        all_results = list()
        results = []

        num_total_terms = len(terms)
        # Iterate through terms and calculate
        for i, term in enumerate(terms):
            self.cob.log(
                " ---------- Calculating overlap for {} of {} Terms", i, num_total_terms
            )
            if term.id in self.args.skip_terms:
                self.cob.log("Skipping {} since it was in --skip-terms", term.id)
                continue
            self.cob.log("Generating SNP-to-gene mapping")
            # If appropriate, generate SNP2Gene Loci
            if self.args.candidate_flank_limit > 0:
                loci = self.snp2gene(term, self.ont)
            else:
                loci = list(term.loci)
                for x in loci:
                    x.window = 1

            # Filter out terms with insufficient or too many genes
            if len(loci) < 2 or len(loci) < args.min_term_size:
                self.cob.log("Not enough genes to perform overlap")
                continue
            if args.max_term_size != None and len(loci) > args.max_term_size:
                self.cob.log("Too many genes to perform overlap")
                continue

            # Send some output to the terminal
            self.cob.log(
                "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
                term.id,
                self.ont.name,
                self.cob.name,
                self.args.candidate_window_size,
                self.args.candidate_flank_limit,
                len(loci),
            )
            if args.dry_run:
                continue
            # Do the dirty
            try:
                overlap = self.overlap(loci)
            except DataError as e:
                continue
            self.cob.log("Generating bootstraps")
            bootstraps = self.generate_bootstraps(loci, overlap)
            bs_mean = bootstraps.groupby("iter").score.apply(np.mean).mean()
            bs_std = bootstraps.groupby("iter").score.apply(np.std).mean()
            # Calculate z scores for density
            self.cob.log("Calculating Z-Scores")
            if bs_std != 0:
                overlap["zscore"] = (overlap.score - bs_mean) / bs_std
                bootstraps["zscore"] = (bootstraps.score - bs_mean) / bs_std
            else:
                # If there is no variation, make all Z-scores 0
                overlap["zscore"] = bootstraps["zscore"] = 0
            # Calculate FDR
            self.cob.log("Calculating FDR")
            overlap["fdr"] = np.nan
            max_zscore = int(overlap.zscore.max()) + 1
            for zscore in np.arange(0, max_zscore, 0.25):
                num_random = (
                    bootstraps.groupby("iter")
                    .apply(lambda df: sum(df.zscore >= zscore))
                    .mean()
                )
                num_real = sum(overlap.zscore >= zscore)
                # Calculate FDR
                if num_real != 0 and num_random != 0:
                    fdr = num_random / num_real
                elif num_real != 0 and num_random == 0:
                    fdr = 0
                else:
                    fdr = 1
                overlap.loc[overlap.zscore >= zscore, "fdr"] = fdr
                overlap.loc[overlap.zscore >= zscore, "num_real"] = num_real
                overlap.loc[overlap.zscore >= zscore, "num_random"] = num_random
                overlap.loc[overlap.zscore >= zscore, "bs_mean"] = bs_mean
                overlap.loc[overlap.zscore >= zscore, "bs_std"] = bs_std
                overlap.sort_values(by=["zscore"], ascending=False, inplace=True)
            overlap_pval = (
                sum(
                    bootstraps.groupby("iter").apply(lambda x: x.score.mean())
                    >= overlap.score.mean()
                )
            ) / len(bootstraps.iter.unique())
            # This gets collated into all_results below
            overlap["COB"] = self.cob.name
            overlap["Ontology"] = self.ont.name
            overlap["Term"] = term.id
            overlap["WindowSize"] = self.args.candidate_window_size
            overlap["FlankLimit"] = self.args.candidate_flank_limit
            overlap["TermLoci"] = len(term.loci)
            overlap["TermCollapsedLoci"] = len(loci)
            overlap["TermPValue"] = overlap_pval
            overlap["NumBootstraps"] = len(bootstraps.iter.unique())
            overlap["Method"] = self.args.method
            overlap["SNP2Gene"] = self.args.snp2gene
            results.append(overlap.reset_index())
            # Summarize results
            if self.args.method == "density":
                overlap_score = np.nanmean(overlap.score) / (
                    1 / np.sqrt(overlap.num_trans_edges.mean())
                )
            elif self.args.method == "locality":
                overlap_score = np.nanmean(overlap.score)
            self.cob.log(
                "Overlap Score ({}): {} (p<{})".format(
                    self.args.method, overlap_score, overlap_pval
                )
            )
        if not args.dry_run:
            # Consolidate results and output to files
            self.results = pd.concat(results)
            self.results.to_csv(self.args.out, sep="\t", index=None)

            # Make an actual results object if not exists
            overlap_object = cls.create(self.ont)

            # Save the results to the SQLite table
            self.results.to_sql(
                "overlap",
                sqlite3.connect(overlap_object.db.filename),
                if_exists="append",
                index=False,
            )
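
The FDR loop above compares, at each z-score threshold, how many real candidate genes pass the cutoff against the average number that pass in the bootstraps. A toy illustration of that bookkeeping with made-up scores:

import numpy as np
import pandas as pd

real_z = pd.Series([3.2, 2.8, 1.1, 0.4])
boot_z = pd.DataFrame({"iter": [0, 0, 1, 1], "zscore": [1.5, 0.2, 0.9, 2.6]})
for zscore in np.arange(0, 4, 1.0):
    num_real = (real_z >= zscore).sum()
    num_random = boot_z.groupby("iter").apply(lambda df: (df.zscore >= zscore).sum()).mean()
    if num_real != 0 and num_random != 0:
        fdr = num_random / num_real
    elif num_real != 0 and num_random == 0:
        fdr = 0
    else:
        fdr = 1
    print(zscore, num_real, num_random, round(fdr, 2))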
Example #11
File: server.py  Project: monprin/cob
refLinks = {}
for name, net in networks.items():
    network_info.append({
        'name': net.name,
        'refgen': net._global('parent_refgen'),
        'desc': net.description,
    })
    if net._global('parent_refgen') in conf['refLinks']:
        refLinks[net.name] = conf['refLinks'][net._global('parent_refgen')]
print('Available Networks: ' + str(networks))

# Generate ontology list based on allowed list and load them into memory
print('Preloading GWASes into Memory...')
if len(conf['gwas']) < 1:
    conf['gwas'] = list(co.Tools.available_datasets('GWAS')['Name'].values)
onts = {x: co.GWAS(x) for x in conf['gwas']}
onts_info = {}
for m, net in networks.items():
    ref = net._global('parent_refgen')
    onts_info[net.name] = []
    for n, ont in onts.items():
        if ont.refgen.name == ref:
            onts_info[net.name].append({
                'name': ont.name,
                'refgen': ont.refgen.name,
                'desc': ont.description
            })
print('Available GWASes: ' + str(onts_info))

# Prefetch the gene names for all the networks
print('Fetching gene names for networks...')
Example #12
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping
    '''

    if args.out != sys.stdout:
        # Create any non-existent directories
        if os.path.dirname(args.out) != '':
            os.makedirs(os.path.dirname(args.out),exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(
                    args.out
                ),file=sys.stderr
            )
            return None

    # Set a flag saying this is from a COB refgen
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.Tools.available_datasets('Expr',args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen 
    elif co.Tools.available_datasets('RefGen',args.refgen):
        refgen = co.RefGen(args.refgen)
    # Create the GWAS object
    ont = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    data = pd.DataFrame()
    results = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if 'effective' in args.snp2gene:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                elif 'strongest' in args.snp2gene:
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                genes = pd.DataFrame([ x.as_dict() for x in 
                    refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term':term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob != False:
                    genes['COB'] = from_cob
                data = pd.concat([data,genes])

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}',info_file)
        # Assume the file is a table
        info = pd.read_table(info_file,sep='\t')
        if len(info.columns) == 1:
            info = pd.read_table(info_file,sep=',')
        # try to match as many columns as possible
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}",','.join(matching_columns))
        data = pd.merge(data,info,how='left')
        if len(data) != original_number_genes:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )
    
    # Generate the output file
    data.to_csv(args.out,index=None,sep='\t')

    log("Summary stats")
    print('-'*100)
    #print('With {}kb windows and up to {} flanking genes'.format(int(args.candidate_window_size/1000),args.candidate_flank_limit))
    print("Mapped {} SNPs to {} genes".format(len(data.parent_locus.unique()),len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))
Example #13
    def from_CLI(cls, args):
        '''
            Implements an interface for the CLI to perform overlap
            Analysis
        '''
        if args.genes != [None]:
            source = 'genes'
        elif args.go is not None:
            source = 'go'
        elif args.gwas is not None:
            source = 'gwas'
        self = cls.create(source, description='CLI Overlap')
        self.source = source
        self.args = args
        # Build base camoco objects
        self.cob = co.COB(args.cob)

        # Generate the ontology of terms that we are going to look
        # at the overlap of
        if source == 'genes':
            # Be smart about this
            import re
            args.genes = list(
                chain(*[re.split('[,; ]', x) for x in args.genes]))
            self.ont = pd.DataFrame()
            self.ont.name = 'GeneList'
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        elif source == 'go':
            self.ont = co.GOnt(args.go)
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        elif source == 'gwas':
            self.ont = co.GWAS(args.gwas)
        else:
            raise ValueError(
                'Please provide a valid overlap source (--genes, --go or --gwas)'
            )
        try:
            self.generate_output_name()
        except ValueError as e:
            return

        # Save strongest description arguments if applicable
        if 'strongest' in self.args.snp2gene:
            if not (self.ont._global('strongest_attr') == args.strongest_attr):
                self.ont.set_strongest(attr=args.strongest_attr)
            if not (bool(int(self.ont._global('strongest_higher'))) == bool(
                    args.strongest_higher)):
                self.ont.set_strongest(higher=args.strongest_higher)

        # Generate a terms iterable
        if self.source == 'genes':
            # make a single term
            loci = self.cob.refgen.from_ids(self.args.genes)
            if len(loci) < len(self.args.genes):
                self.cob.log('Some input genes not in network')
            terms = [Term('CustomTerm', desc='Custom from CLI', loci=loci)]
        else:
            # Generate terms from the ontology
            if 'all' in self.args.terms:
                terms = list(self.ont.iter_terms())
            else:
                terms = [self.ont[term] for term in self.args.terms]
        all_results = list()
        results = []

        num_total_terms = len(terms)
        # Iterate through terms and calculate
        for i, term in enumerate(terms):
            self.cob.log(' ---------- Calculating overlap for {} of {} Terms',
                         i, num_total_terms)
            if term.id in self.args.skip_terms:
                self.cob.log('Skipping {} since it was in --skip-terms',
                             term.id)
                continue
            self.cob.log('Generating SNP-to-gene mapping')
            # If appropriate, generate SNP2Gene Loci
            if self.args.candidate_flank_limit > 0:
                loci = self.snp2gene(term, self.ont)
            else:
                loci = list(term.loci)
                for x in loci:
                    x.window = 1

            # Filter out terms with insufficient or too many genes
            if len(loci) < 2 or len(loci) < args.min_term_size:
                self.cob.log('Not enough genes to perform overlap')
                continue
            if args.max_term_size != None and len(loci) > args.max_term_size:
                self.cob.log('Too many genes to perform overlap')
                continue

            # Send some output to the terminal
            self.cob.log(
                "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
                term.id, self.ont.name, self.cob.name,
                self.args.candidate_window_size,
                self.args.candidate_flank_limit, len(loci))
            if args.dry_run:
                continue
            # Do the dirty
            try:
                overlap = self.overlap(loci)
            except DataError as e:
                continue
            self.cob.log('Generating bootstraps')
            bootstraps = self.generate_bootstraps(loci, overlap)
            bs_mean = bootstraps.groupby('iter').score.apply(np.mean).mean()
            bs_std = bootstraps.groupby('iter').score.apply(np.std).mean()
            # Calculate z scores for density
            self.cob.log('Calculating Z-Scores')
            if bs_std != 0:
                overlap['zscore'] = (overlap.score - bs_mean) / bs_std
                bootstraps['zscore'] = (bootstraps.score - bs_mean) / bs_std
            else:
                # If there is no variation, make all Z-scores 0
                overlap['zscore'] = bootstraps['zscore'] = 0
            # Calculate FDR
            self.cob.log('Calculating FDR')
            overlap['fdr'] = np.nan
            max_zscore = int(overlap.zscore.max()) + 1
            for zscore in np.arange(0, max_zscore, 0.25):
                num_random = bootstraps\
                        .groupby('iter')\
                        .apply(lambda df: sum(df.zscore >= zscore))\
                        .mean()
                num_real = sum(overlap.zscore >= zscore)
                # Calculate FDR
                if num_real != 0 and num_random != 0:
                    fdr = num_random / num_real
                elif num_real != 0 and num_random == 0:
                    fdr = 0
                else:
                    fdr = 1
                overlap.loc[overlap.zscore >= zscore, 'fdr'] = fdr
                overlap.loc[overlap.zscore >= zscore, 'num_real'] = num_real
                overlap.loc[overlap.zscore >= zscore,
                            'num_random'] = num_random
                overlap.loc[overlap.zscore >= zscore, 'bs_mean'] = bs_mean
                overlap.loc[overlap.zscore >= zscore, 'bs_std'] = bs_std
                overlap.sort_values(by=['zscore'],
                                    ascending=False,
                                    inplace=True)
            overlap_pval = (
                (sum(bootstraps.groupby('iter').apply(lambda x: x.score.mean()) >= overlap.score.mean()))\
                / len(bootstraps.iter.unique())
            )
            # This gets collated into all_results below
            overlap['COB'] = self.cob.name
            overlap['Ontology'] = self.ont.name
            overlap['Term'] = term.id
            overlap['WindowSize'] = self.args.candidate_window_size
            overlap['FlankLimit'] = self.args.candidate_flank_limit
            overlap['TermLoci'] = len(term.loci)
            overlap['TermCollapsedLoci'] = len(loci)
            overlap['TermPValue'] = overlap_pval
            overlap['NumBootstraps'] = len(bootstraps.iter.unique())
            overlap['Method'] = self.args.method
            overlap['SNP2Gene'] = self.args.snp2gene
            results.append(overlap.reset_index())
            # Summarize results
            if self.args.method == 'density':
                overlap_score = np.nanmean(overlap.score) / (
                    1 / np.sqrt(overlap.num_trans_edges.mean()))
            elif self.args.method == 'locality':
                overlap_score = np.nanmean(overlap.score)
            self.cob.log('Overlap Score ({}): {} (p<{})'.format(
                self.args.method, overlap_score, overlap_pval))
        if not args.dry_run:
            # Consolidate results and output to files
            self.results = pd.concat(results)
            self.results.to_csv(self.args.out, sep='\t', index=None)

            # Make an actual results object if not exists
            overlap_object = cls.create(self.ont)

            # Save the results to the SQLite table
            self.results.to_sql('overlap',
                                sqlite3.connect(overlap_object.db.filename),
                                if_exists='append',
                                index=False)
Example #14
File: server.py  Project: lisabang/cob
    network_info.append(
        {
            "name": net.name,
            "refgen": net._global("parent_refgen"),
            "desc": net.description,
        }
    )
    if net._global("parent_refgen") in conf["refLinks"]:
        refLinks[net.name] = conf["refLinks"][net._global("parent_refgen")]
print("Availible Networks: " + str(networks))

# Generate ontology list based on allowed list and load them into memory
print("Preloading GWASes into Memory...")
if len(conf["gwas"]) < 1:
    conf["gwas"] = list(co.Tools.available_datasets("GWAS")["Name"].values)
onts = {x: co.GWAS(x) for x in conf["gwas"]}
onts_info = {}
for m, net in networks.items():
    ref = net._global("parent_refgen")
    onts_info[net.name] = []
    for n, ont in onts.items():
        if ont.refgen.name == ref:
            onts_info[net.name].append(
                {"name": ont.name, "refgen": ont.refgen.name, "desc": ont.description}
            )
print("Availible GWASes: " + str(onts_info))

# Prefetch the gene names for all the networks
print("Fetching gene names for networks...")
network_genes = {}
for name, net in networks.items():
Example #15
    def from_CLI(cls, args):
        '''
            Implements an interface for the CLI to perform overlap
            Analysis
        '''

        self = cls.create(args.gwas, description='CLI Overlap')
        # Build base camoco objects
        self.args = args
        self.cob = co.COB(args.cob)
        if args.go:
            self.ont = co.GOnt(args.gwas)
            args.candidate_window_size = 1
            args.candidate_flank_limit = 0
        else:
            self.ont = co.GWAS(args.gwas)
        self.generate_output_name()

        # Generate a terms iterable
        if 'all' in self.args.terms:
            terms = self.ont.iter_terms()
        else:
            terms = [self.ont[term] for term in self.args.terms]
        all_results = list()

        results = []
        # Iterate through terms and calculate
        for term in terms:
            if term.id in self.args.skip_terms:
                self.cob.log('Skipping {} since it was in --skip-terms',
                             term.id)
                continue
            # Generate SNP2Gene Loci
            loci = self.snp2gene(term)
            if len(loci) < 2 or len(loci) < args.min_term_size:
                self.cob.log('Not enough genes to perform overlap')
                continue
            if args.max_term_size != None and len(loci) > args.max_term_size:
                self.cob.log('Too many genes to perform overlap')
                continue
            # Send some output to the terminal
            self.cob.log(
                "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
                term.id, self.ont.name, self.cob.name,
                self.args.candidate_window_size,
                self.args.candidate_flank_limit, len(loci))
            if args.dry_run:
                continue

            # Do the dirty
            try:
                overlap = self.overlap(loci)
            except DataError as e:
                continue
            bootstraps = self.generate_bootstraps(loci, overlap)
            bs_mean = bootstraps.groupby('iter').score.apply(np.mean).mean()
            bs_std = bootstraps.groupby('iter').score.apply(np.std).mean()
            # Calculate z scores for density
            if bs_std != 0:
                overlap['zscore'] = (overlap.score - bs_mean) / bs_std
                bootstraps['zscore'] = (bootstraps.score - bs_mean) / bs_std
            else:
                # If there is no variation, make all Z-scores 0
                overlap['zscore'] = bootstraps['zscore'] = 0
            # Calculate FDR
            overlap['fdr'] = np.nan
            max_zscore = int(overlap.zscore.max()) + 1
            for zscore in np.arange(0, max_zscore, 0.25):
                num_random = bootstraps\
                        .groupby('iter')\
                        .apply(lambda df: sum(df.zscore >= zscore))\
                        .mean()
                num_real = sum(overlap.zscore >= zscore)
                # Calculate FDR
                if num_real != 0 and num_random != 0:
                    fdr = num_random / num_real
                elif num_real != 0 and num_random == 0:
                    fdr = 0
                else:
                    fdr = 1
                overlap.loc[overlap.zscore >= zscore, 'fdr'] = fdr
                overlap.loc[overlap.zscore >= zscore, 'num_real'] = num_real
                overlap.loc[overlap.zscore >= zscore,
                            'num_random'] = num_random
                overlap.loc[overlap.zscore >= zscore, 'bs_mean'] = bs_mean
                overlap.loc[overlap.zscore >= zscore, 'bs_std'] = bs_std
                overlap.sort_values(by=['zscore'],
                                    ascending=False,
                                    inplace=True)
            overlap_pval = (
                (sum(bootstraps.groupby('iter').apply(lambda x: x.score.mean()) >= overlap.score.mean()))\
                / len(bootstraps.iter.unique())
            )
            # This gets collated into all_results below
            overlap['COB'] = self.cob.name
            overlap['Ontology'] = self.ont.name
            overlap['Term'] = term.id
            overlap['WindowSize'] = self.args.candidate_window_size
            overlap['FlankLimit'] = self.args.candidate_flank_limit
            overlap['TermLoci'] = len(term.loci)
            overlap['TermCollapsedLoci'] = len(loci)
            overlap['TermPValue'] = overlap_pval
            overlap['NumBootstraps'] = len(bootstraps.iter.unique())
            overlap['Method'] = self.args.method
            overlap['SNP2Gene'] = self.args.snp2gene
            results.append(overlap.reset_index())
        if not args.dry_run:
            self.results = pd.concat(results)
            self.results.to_csv(self.args.out, sep='\t', index=None)
            overlap_object = cls.create(self.ont)
            overlap_object.results = results
            self.results.to_sql('overlap',
                                sqlite3.connect(overlap_object.db.filename),
                                if_exists='append',
                                index=False)
Example #16
    geneWordBuilder(func,[os.path.join(scratch_folder,(func+'.tsv'))],[1],['2 end'],['tab'],[True])

# Find any GO ontologies we have for the networks we have
print('Finding applicable GO Ontologies...')
GOnt_db = {}
for name in co.available_datasets('GOnt')['Name']:
    gont = co.GOnt(name)
    if gont.refgen.name not in GOnt_db:
        GOnt_db[gont.refgen.name] = gont

# Generate in memory term lists
print('Finding all available terms...')
terms = {}
for ont in gwas_sets['data']:
    terms[ont[0]] = {'data': [(term.id,term.desc,len(term.loci),
        len(co.GWAS(ont[0]).refgen.candidate_genes(term.effective_loci(window_size=50000))))
        for term in co.GWAS(ont[0]).iter_terms()]}

# Set up the logging file
handler = logging.FileHandler('COBErrors.log')
handler.setLevel(logging.INFO)
app.logger.addHandler(handler)
app.logger.setLevel(logging.INFO)
print('All Ready!')

#---------------------------------------------
#                 Routes
#---------------------------------------------
# Sends off the homepage
@app.route('/')
def index():