if args.html: with open(args.html, "w") as htmlOut: print >> htmlOut, etree.tostring(html, pretty_print=True) # Finishing script logger.info(u"Count Digits Complete!") if __name__ == '__main__': # Command line options args = getOptions() # Setting logger logger = logging.getLogger() if args.debug: sl.setLogger(logger, logLevel='debug') else: sl.setLogger(logger) # Starting script with the following parameters logger.info(u"Importing data with following parameters: "\ "\n\tWide: {0}"\ "\n\tDesign: {1}"\ "\n\tUnique ID: {2}"\ "\n\tGroup: {3}"\ "\n\tHtml: {4}".\ format(args.input,args.design, args.uniqID, args.group, args.html)) # Main main(args)
# Iterating over groups for name, group in dat.design.groupby(args.group): logger.info(u"Plotting for group {0}".format(name)) # Plotting Density and Box plot for the group plotDensity(data=wide.T[group.index], name=name, pdf=pdf) # Get colors for each feature for "All groups" logger.info(u"Plotting for group {0}".format("samples")) palette.getColors(design=dat.design, groups=[]) # Plotting density and boxplots for all plotDensity(data=wide, name="samples", pdf=pdf) #Ending script logger.info(u"Ending script") if __name__ == '__main__': args = getOptions() logger = logging.getLogger() sl.setLogger(logger) logger.info("Importing data with following parameters: "\ "\n\tWide: {0}"\ "\n\tDesign: {1}"\ "\n\tUnique ID: {2}".format(args.input, args.design, args.uniqID)) palette = colorHandler(pal=args.palette, col=args.color) logger.info(u"Using {0} color scheme from {1} palette".format( args.color, args.palette)) main(args)
def main():
    """
    Translate the ENSEMBL IDs of a gene annotation file into gene symbols.

    The ENSEMBL ID column of the annotation file is sent to mygene
    (``querymany`` with scope ``ensembl.gene``, requesting the ``symbol``
    field). The answer is merged back onto the annotation table, producing a
    table with the original columns plus ``GeneSymbol``, ``Score`` and
    ``Selected``. ``Selected`` is "Yes" for the first row of every ENSEMBL ID
    and "No" for any further row with the same ID, which is useful in cases
    of multiple matches.

    Arguments (read from the command line through ``getOptions``):
        :param species: Species to download information from mygene
        :type species: string

        :param geneAnnot: Gene Expression Annotation file with ENSEMBL IDs
            column (tab separated, with header)
        :type geneAnnot: file

        :param uniqId: Name of the column with the unique identifiers
        :type uniqId: string

        :param ensemblId: Name of the column with ENSEMBL IDs
        :type ensemblId: string

    Returns:
        :return output: Tab separated table written to ``args.output``. If
            nothing matched, the file contains only an explanatory message.
        :rtype output: file
    """
    args = getOptions()
    logger = logging.getLogger()
    sl.setLogger(logger)
    logger.info(
        u"""Importing data with following parameters: \
        \n\tSpecies: {0}\
        \n\tGene Annotation File: {1}\
        \n\tUnique ID column: {2}\
        \n\tENSEMBL ID Column: {3}""".format(
            args.species, args.geneAnnot, args.uniqId, args.ensemblId
        )
    )

    modules.checkForDuplicates(args.geneAnnot, args.uniqId)

    # Original Gene Expression Annotation Dataset with ENSEMBL IDs
    genesTable = pd.read_table(args.geneAnnot, delimiter="\t", header=0)

    # Find Gene Symbol
    mg = mygene.MyGeneInfo()
    genes = genesTable[args.ensemblId].tolist()
    genesTransformed = mg.querymany(
        genes,
        scopes="ensembl.gene",
        fields="symbol",
        species=args.species,
        verbose=False,
        returnall=True,
        as_dataframe=True,
        df_index=False,
    )

    genesTransformedTable = genesTransformed["out"]

    # As many result rows as missing queries is treated as "nothing matched":
    # leave a message in the output file instead of a table.
    if genesTransformedTable.shape[0] == genesTransformed["missing"].shape[0]:
        with open(args.output, "w") as f:
            f.write("no matching result! Please check the selected species and input files.")
        return

    # Keep only the columns of interest and give them their output names
    genesTransformedTable = genesTransformedTable[["query", "symbol", "_score"]].rename(
        columns={"query": args.ensemblId, "symbol": "GeneSymbol", "_score": "Score"}
    )

    # Merge Both datasets
    newGenesTable = pd.merge(genesTable, genesTransformedTable, on=args.ensemblId)

    # In case of duplicates, select the first one.
    # NOTE(review): presumably the first row is the highest-scoring hit; this
    # relies on the order in which mygene returns its results - confirm.
    isDup = newGenesTable.duplicated(subset=args.ensemblId, keep="first")
    newGenesTable["Selected"] = "Yes"
    # Single .loc assignment: chained indexing (df["col"][mask] = ...) may
    # write to a temporary copy and leave the table unchanged.
    newGenesTable.loc[isDup, "Selected"] = "No"

    # Write table
    newGenesTable.to_csv(args.output, sep="\t", index=False)
def main():
    """
    Add binary flags (0/1) to a differential expression dataset depending on
    p-value thresholds.

    For every threshold a ``Flag_<threshold>`` column is produced; a row gets
    1 when its p-value is smaller than or equal to the threshold and 0
    otherwise.

    Arguments (read from the command line through ``getOptions``):
        :param deaDataset: Matrix with Differential Expression Analysis
            information (tab separated, with header)
        :type deaDataset: file

        :param pvalue: Name of the column with the p-value information
        :type pvalue: string

        :param uniqID: Name of the column with the unique identifier
        :type uniqID: string

        :param thresholds: Desired flag thresholds. Must be separated with
            ",", no spaces allowed.
        :type thresholds: string

    Returns:
        :return output: Table with input and added correspondent flags columns
        :rtype output: file

        :return flags: Table with only the correspondent flags columns
        :rtype flags: file

        The parsed command line arguments are returned to the caller.

    Raises:
        ValueError: if the p-value or unique ID column is not in the header,
            or if a threshold or p-value cannot be converted to float.
    """
    args = getOptions()
    logger = logging.getLogger()
    sl.setLogger(logger)
    logger.info(
        u"""Importing data with following parameters: \
        \n\tDEA Dataset: {0}\
        \n\tUnique ID: {1}\
        \n\tPvalues: {2}\
        \n\tThresholds: {3}""".format(
            args.deaDataset, args.uniqID, args.pvalue, args.thresholds
        )
    )

    modules.checkForDuplicates(args.deaDataset, args.uniqID)

    thresholds = args.thresholds.split(",")
    # Convert once; the textual form is still used for the column names
    cutoffs = [float(threshold) for threshold in thresholds]
    flagNames = ["Flag_" + threshold for threshold in thresholds]

    # All three files are handled by one context manager so that they are
    # closed (and the outputs flushed) even if a row fails to parse.
    with open(args.deaDataset, "r") as data, \
            open(args.output, "w") as output, \
            open(args.flags, "w") as flags:
        header = data.readline().strip().split("\t")
        # Empty column names become "NA"; quotes are removed from the others
        header_list = [
            "NA" if word == "" else word.replace('"', "") for word in header
        ]
        pvalueCol = header_list.index(args.pvalue)
        uniqCol = header_list.index(args.uniqID)

        output.write("\t".join(header_list + flagNames) + "\n")
        flags.write("\t".join([str(args.uniqID)] + flagNames) + "\n")

        # Add 1/0 if smaller/greater than threshold.
        # The p-value and the unique ID are taken from the very same line, so
        # blank lines or '#' characters cannot shift one against the other.
        for rawLine in data:
            # Tabs are kept on purpose: empty first/last fields must survive
            # so the columns stay aligned with the header.
            line = rawLine.strip(" \r\n")
            if not line.strip():
                continue
            fields = line.split("\t")
            pvalue = float(fields[pvalueCol].strip())
            marks = "".join(
                "\t1" if pvalue <= cutoff else "\t0" for cutoff in cutoffs
            )
            flags.write(fields[uniqCol] + marks + "\n")
            output.write(line + marks + "\n")

    return args
ror_df.to_csv(args.table, sep="\t", float_format="%.4f", index_label=args.uniqID, columns=["pval", "rsq", "slope"]) ror_flags.df_flags.to_csv(args.flags, sep="\t", index_label=args.uniqID) if __name__ == "__main__": # Command line options args = getOptions() # Setting up logger logger = logging.getLogger() if args.debug: sl.setLogger(logger, logLevel="debug") DEBUG = True else: sl.setLogger(logger) # Print logger info logger.info(u"""Importing data with following parameters: \tWide: {0} \tDesign: {1} \tUnique ID: {2} \tGroup: {3} \tRun Order: {4} \tLevels: {5} """.format(args.input, args.design, args.uniqID, args.group, args.order, args.levels))
def _loadRDataFrame(datasetPath, idColumn, annotPath, nameColumn):
    """Read a wide dataset and return it as an R data frame.

    When an annotation file is given the conversion is delegated to
    ``modules.Ids2Names``; otherwise the table is indexed by ``idColumn`` and
    converted with ``pandas2ri.py2rpy``.
    """
    table = pd.read_table(datasetPath, sep="\t", header=0)
    if annotPath:
        return modules.Ids2Names(table, idColumn, annotPath, nameColumn)
    return pandas2ri.py2rpy(table.set_index(idColumn))


def main():
    """
    Perform a correlation analysis of a Gene Expression Dataset and a
    Metabolomic Dataset.

    The analysis itself is carried out by the ``corr_main_func`` function of
    the packaged R script ``all_by_all_correlation.R``; this function only
    prepares the inputs and hands over the output paths.

    Arguments (read from the command line through ``getOptions``):
        :param geneDataset metDataset: Gene expression/Metabolomics wide
            dataset, respectively.
        :type geneDataset metDataset: files

        :param geneId metId: Name of the Genes/metabolites unique identifier
            column, respectively.
        :type geneId metId: strings

        :param geneAnnot metAnnot: Gene Expression/Metabolomics Annotation
            Datasets, respectively. Optional.
        :type geneAnnot metAnnot: files

        :param geneName metName: Name of the column of the Annotation file
            that contains genes/metabolites names respectively.
        :type geneName metName: strings

        :param meth: Methodology for the correlation function. One of
            'pearson', 'spearman' or 'kendall'.
        :type meth: string

        :param thres: PValue Threshold to cut the correlations for the output
            table.
        :type thres: float

    Returns:
        :return output: Output table with the following tab separated
            information: Metabolite, Gene, Correlation, pvalue
        :rtype output: file

        :return corMat: Correlation Matrix
        :rtype corMat: file

        :return fig: Network-like output figure
        :rtype fig: pdf
    """
    warnings.filterwarnings("ignore", category=RRuntimeWarning)
    args = getOptions()
    logger = logging.getLogger()
    sl.setLogger(logger)
    logger.info(u"Importing data with the following parameters: "
                "\n\tGene Dataset: {}"
                "\n\tGene UniqueID: {}"
                "\n\tMet Dataset:{}"
                "\n\tMet UniqueID: {}"
                "\n\tMethod: {}"
                "\n\tThreshold: {}".format(
                    args.geneDataset,
                    args.geneId,
                    args.metDataset,
                    args.metId,
                    args.meth,
                    args.thres,
                ))

    modules.checkForDuplicates(args.geneDataset, args.geneId)
    modules.checkForDuplicates(args.metDataset, args.metId)

    pandas2ri.activate()
    # Read the packaged R script; the handle is closed as soon as it is read
    with ires.path("gaitGM.data", "all_by_all_correlation.R") as my_r_script_path:
        with open(my_r_script_path, "r") as f:
            rFile = f.read()
    allByAllCorrScript = STAP(rFile, "corr_main_func")

    # Prepare Gene Expression Data
    R_gene_df = _loadRDataFrame(
        args.geneDataset, args.geneId, args.geneAnnot, args.geneName)

    # Prepare Metabolomics Data
    R_met_df = _loadRDataFrame(
        args.metDataset, args.metId, args.metAnnot, args.metName)

    allByAllCorrScript.corr_main_func(
        x=R_gene_df,
        y=R_met_df,
        meth=args.meth,
        thres=args.thres,
        corrMatPath=args.corMat,
        outputPath=args.output,
        figurePath=args.fig,
    )
def main():
    """
    Performs a Sparse Partial Least Squares (sPLS) analysis over subsets of
    gene expression and metabolomic data.

    To perform this subsetting, three different methodologies can be used for
    the metabolites:
        - By generic metabolite (sphingomyelin, ...)
        - By MMC cluster
        - By generic metabolite and then by MMC cluster
    and four for the genes:
        - All the genes
        - Genes contained in a list with interesting genes for the analysis
        - Pathway related genes for an specific generic metabolite
        - Metagenes (PANA approach)

    The outputs depend on the inputs. The subsetting is done by
    ``modules.prepareSPLSData`` and the model fitting, plotting and
    correlation matrix by the packaged R script ``sPLS.R``.

    Arguments (read from the command line through ``getOptions``):
        :param geneDataset metDataset: Gene expression/Metabolomics wide
            dataset, respectively.
        :type geneDataset metDataset: files

        :param geneId metId: Name of the Genes/metabolites unique identifier
            column, respectively.
        :type geneId metId: strings

        :param geneAnnot metAnnot: Gene Expression/Metabolomics Annotation
            Dataset.
        :type geneAnnot metAnnot: files

        :param geneAnnotName metAnnotName: annotation file column with
            gene/metabolite names.
        :type geneAnnotName metAnnotName: strings

        :param design: Design File
        :type design: file

        :param keepX: Number of genes to keep in the sPLS model
        :type keepX: integer

        :param thres: Threshold handed to the R ``corrMat`` function when
            building the correlation matrix
        :type thres: float

        :param geneOption metOption: Options for metabolite subsetting (one
            of 'generic', 'mmc' or 'both') and for gene expression subsetting
            (one of 'all', 'geneList', 'path' or 'pana')
        :type geneOption metOption: strings

        :param geneKeggAnno metKeggAnno: KEGG Annotation files for gene
            expression and metabolomics, respectively. From Add KEGG Anno
            Info Tool
        :type geneKeggAnno metKeggAnno: files

        :param geneKeggPath metKeggPath: KEGG Pathway files for gene
            expression and metabolomics, respectively. From Add KEGG Pathway
            Info Tool
        :type geneKeggPath metKeggPath: files

        :param path2genes: Downloaded KEGG file with two tab separated
            columns: pathway_ID, geneKEGG_ID
        :type path2genes: file

    Returns:
        :return figure1: sPLS heatmaps
        :rtype figure1: pdf

        :return splsOut: sif-like correlation matrix including a column
            describing the comparison.
        :rtype splsOut: file

        :return figure2: MMC plots if mmc or both metabolite subsetting
            option is selected.
        :rtype figure2: pdf

        :return mmcOut: MMC Output table if mmc or both metabolite subsetting
            option is selected.
        :rtype mmcOut: file

        :return panaOut: Table describing genes that forms the metagenes (1/0)
        :rtype panaOut: file
    """
    args = getOptions()
    logger = logging.getLogger()
    sl.setLogger(logger)
    logger.info(u"Importing data with the following parameters: "
                "\n\tGene Dataset: {}"
                "\n\tGene UniqueID: {}"
                "\n\tGene Option: {}"
                "\n\tMetabolite Dataset:{}"
                "\n\tMetabolite UniqueID: {}"
                "\n\tMetabolite Option: {}".format(
                    args.geneDataset,
                    args.geneId,
                    args.geneOption,
                    args.metDataset,
                    args.metId,
                    args.metOption,
                ))

    pandas2ri.activate()
    # Read the packaged R script; the handle is closed as soon as it is read
    with ires.path("gaitGM.data", "sPLS.R") as my_r_script_path:
        with open(my_r_script_path, "r") as f:
            rFile = f.read()
    sPLSScript = STAP(rFile, "sPLS")

    rGeneData, rMetData, multipleNames, multipleNamesId = modules.prepareSPLSData(
        args)

    # One sPLS model per metabolite subset. Only with the "path" gene option
    # does the gene dataset advance together with the metabolite subset;
    # otherwise the first gene dataset is reused for every model.
    advanceGenes = args.geneOption == "path"
    rData = []
    for metIndex, R_met_df in enumerate(rMetData):
        R_gene_df = rGeneData[metIndex if advanceGenes else 0]
        rData.append(
            sPLSScript.sPLS(geneData=R_gene_df,
                            metData=R_met_df,
                            keepX=args.keepX))

    # The "both" metabolite option labels the models with multipleNamesId,
    # every other option with multipleNames; the rest is identical.
    modelNames = multipleNamesId if args.metOption == "both" else multipleNames

    sPLSScript.plotInPdf(splsObjects=rData,
                         figurePath=args.figure1,
                         multipleNames=modelNames)

    # Correlation Matrix
    corMatrix = sPLSScript.corrMat(splsObjects=rData,
                                   multipleNames=modelNames,
                                   threshold=args.thres)
    robjects.r["write.table"](
        corMatrix,
        file=args.splsOut,
        sep="\t",
        quote=False,
        row_names=False,
        col_names=True,
    )