def main(): """ 1: ... """ desc = """... ask me later! I'm on a deadline! ...""" parser = argparse.ArgumentParser(description=desc) logger = logging.getLogger(sys.argv[0].split('/')[-1]) parser.add_argument('--hfile', type=str, required=True, nargs='+', help="""Quoted ;-delimited list containing info about the files containing homology relationships: [--hfile "path;header1;header2"]. At LEAST one hfile is required and all MUST have all three trailing data.""") parser.add_argument('--xprnfile', type=str, required=True, nargs='+', help="""Quoted ;-delimited list containing info about the files containing expression data: [--xprnfile "path;nameHeader;conditionHeader1;...;conditionHeaderN"]. At LEAST one xprnfile is required and all MUST have exactly one <path>, exactly one <nameHeader>, and at LEAST one <conditionHeader>. It is VERY important that you list the same number of conditions for each expnfile and that the order reflects which condition values are to be compared.""") parser.add_argument('--cmap', type=str, required=True, nargs='+', help="""A list of species-prefix:color combinations to set the node colors: [--cmap <AAEL:b ...>]. the number of combinations should match the number of files given to --xprnfile.""") parser.add_argument('--log', action='store_true', help="""Plot the points on a log:log scale. (Default: %(default)s)""") parser.add_argument('--show', action='store_true', help="""Plot the image for interactive manipulation, otherwise just write the file. (Default: %(default)s)""") parser.add_argument('--pdf', action='store_true', help="""Plot the image as a pdf: png otherwise. Png is preferable when data size is large. (Default: %(default)s)""") parser.add_argument('--out', type=str, default='', help="""Base path for output. (Default: current working directory)""") parser.add_argument('--load-pickle', type=str, default=False, help="""Load graph from a gpickle. 
(Default: %(default)s)""") args = parser.parse_args() # some manual arg set-up and checking for i in range(len(args.hfile)): args.hfile[i] = args.hfile[i].split(';') if len(args.hfile[i]) != 3: raise SanityCheckError('EXACTLY 3 values must follow --hfile: you gave %s' % (args.hfile[i])) xLen = set() for i in range(len(args.xprnfile)): args.xprnfile[i] = args.xprnfile[i].split(';') if not len(args.xprnfile[i]) >= 3: raise SanityCheckError('At LEAST 3 values must follow --xprnfile: you gave %s' % (args.xprnfile[i])) else: xLen.add(len(args.xprnfile[i])) if not len(xLen) == 1: raise SanityCheckError('The same number of values must follow every --xprnfile flag.') if not len(args.xprnfile) == len(args.cmap): raise SanityCheckError('The length of values following --xprnfile and --cmap must be the same.') cDict = {} for combo in args.cmap: try: prefix,color = combo.split(':') except: raise cDict[prefix] = color # read in the expression vector data tmpDict = {} xDict = {} for xfile in args.xprnfile: tmpDict.update(mangle_expn_vectors(expnPath=xfile[0],txNameHeader=xfile[1],condHeaders=xfile[2:],manualHeaders=False)) # convert -RX into -PX for k,v in tmpDict.iteritems(): xDict[k.replace('-R','-P')] = v del(tmpDict) if args.load_pickle: subgraphs = nx.read_gpickle('/tmp/ortho_weighted_subgraphs.gpickle') else: # lets get started: init the graph graph = nx.Graph() for f in args.hfile: import_edges(graphObj=graph,edgeTablePath=f[0],startNodeHeader=f[1],endNodeHeader=f[2]) # remove the '' node caused by unpaired relationships graph.remove_node('') # weight the edges in each graph by the pearsonr between their expression vectors weight_edges_with_pearsonr(graphObj=graph,dataVectors=xDict,uni=False) # if the edge length is imposible to graph (inf or nan) kill the edge #badEdges = [] #edgesMissingNodes = [] #for i,j in graph.edges_iter(): #try: #if math.isnan(graph[i][j]['rVal']) or math.isinf(graph[i][j]['rVal']): #badEdges.append((i,j)) #except KeyError: 
#edgesMissingNodes.append((i,j)) #graph.remove_edges_from(badEdges) #graph.remove_edges_from(edgesMissingNodes) # Get all subgraphs subgraphs = nx.connected_component_subgraphs(graph) nx.write_gpickle(subgraphs,"/tmp/ortho_weighted_subgraphs.gpickle") print "I layed a pickle!!" args.galaxy = False #args.label2 = "Pct w/ significant positive corr (r >= 0.5, p <= 0.05)" args.label2 = "Pct with significant correlation" for prefix in cDict: args.label1 = "Usable paralogs per subgraph within %s" % (prefix) #args.label1 = "%s x" % (prefix) pearsonStats,data = get_within_data(prefix,subgraphs) plotScatter(pearsonStats,data,args,color=cDict[prefix]) args.label1 = "Usable orthologs per subgraph between AGAP and CPIJ" #args.label1 = "both x" pearsonStats,data = get_between_data(prefixes=cDict.keys(),subgraphs=subgraphs) plotScatter(pearsonStats,data,args,color='green') print "Done."
def main():
    """Extract the subgraph connected to the requested target symbols from a
    homology graph, weight its edges by expression-vector correlation, and
    draw it with matplotlib/networkx.

    Node colors come from --cmap (species prefix -> color); edge colors come
    from a custom diverging colormap over the Pearson r weight; significant
    edges (p <= 0.05) are solid, non-significant dashed, and un-computable
    edges grey.
    """
    desc = """... ask me later! I'm on a deadline! ..."""
    parser = argparse.ArgumentParser(description=desc)
    logger = logging.getLogger(sys.argv[0].split('/')[-1])
    parser.add_argument('--hfile', type=str, required=True, nargs='+',
                        help="""Quoted ;-delimited list containing info about the files containing homology relationships: [--hfile "path;header1;header2"]. At LEAST one hfile is required and all MUST have all three trailing data.""")
    parser.add_argument('--xprnfile', type=str, required=True, nargs='+',
                        help="""Quoted ;-delimited list containing info about the files containing expression data: [--xprnfile "path;nameHeader;conditionHeader1;...;conditionHeaderN"]. At LEAST one xprnfile is required and all MUST have exactly one <path>, exactly one <nameHeader>, and at LEAST one <conditionHeader>. It is VERY important that you list the same number of conditions for each expnfile and that the order reflects which condition values are to be compared.""")
    parser.add_argument('--targets', type=str, required=True, nargs='+',
                        help="""A list of the gene/tx/protein symbols to use for pulling out all connected nodes.""")
    parser.add_argument('--cmap', type=str, required=True, nargs='+',
                        help="""A list of species-prefix:color combinations to set the node colors: [--cmap <AAEL:b ...>]. the number of combinations should match the number of files given to --xprnfile.""")
    parser.add_argument('--out', type=str, required=True,
                        help="""Path to outfile. Its file extention chooses the file type.""")
    parser.add_argument('--graphml', type=str, required=False,
                        help="""Include a file path if you would like a graphML version of the final graph. 
(optional)""")
    parser.add_argument('--nonames', action='store_true',
                        help="""If used: gene/tx names will NOT be displayed.""")
    parser.add_argument('--noshow', action='store_true',
                        help="""If used: the graph NOT be displayed interactively.""")

    args = parser.parse_args()

    # --- manual arg set-up and sanity checking ---
    # each --hfile spec must split into exactly: path;startHeader;endHeader
    for i in range(len(args.hfile)):
        args.hfile[i] = args.hfile[i].split(';')
        if len(args.hfile[i]) != 3:
            raise SanityCheckError('EXACTLY 3 values must follow --hfile: you gave %s' % (args.hfile[i]))
    # each --xprnfile spec must split into: path;nameHeader;cond1[;condN...],
    # and every spec must carry the same number of conditions
    xLen = set()
    for i in range(len(args.xprnfile)):
        args.xprnfile[i] = args.xprnfile[i].split(';')
        if not len(args.xprnfile[i]) >= 3:
            raise SanityCheckError('At LEAST 3 values must follow --xprnfile: you gave %s' % (args.xprnfile[i]))
        else:
            xLen.add(len(args.xprnfile[i]))
    if not len(xLen) == 1:
        raise SanityCheckError('The same number of values must follow every --xprnfile flag.')
    if not len(args.xprnfile) == len(args.cmap):
        raise SanityCheckError('The length of values following --xprnfile and --cmap must be the same.')

    # map species prefix -> matplotlib color; each combo must be "prefix:color"
    cDict = {}
    for combo in args.cmap:
        fields = combo.split(':')
        if len(fields) != 2:
            raise SanityCheckError('Each --cmap entry must look like <prefix:color>: you gave %s' % (combo))
        prefix, color = fields
        cDict[prefix] = color

    # --- read in the expression vector data ---
    tmpDict = {}
    xDict = {}
    for xfile in args.xprnfile:
        tmpDict.update(mangle_expn_vectors(expnPath=xfile[0], txNameHeader=xfile[1],
                                           condHeaders=xfile[2:], manualHeaders=False))
    # convert transcript suffix -RX into protein suffix -PX to match graph node names
    for k, v in tmpDict.iteritems():
        xDict[k.replace('-R', '-P')] = v
    del tmpDict

    # --- build the full homology graph ---
    graph = nx.Graph()
    for f in args.hfile:
        import_edges(graphObj=graph, edgeTablePath=f[0], startNodeHeader=f[1], endNodeHeader=f[2])
    # remove the '' node caused by unpaired relationships
    graph.remove_node('')
    # for debugging
    nx.write_gpickle(graph, "/tmp/ortho1.gpickle")

    # cut out a subgraph using the provided targets, then weight its edges
    # by the pearsonr between their expression vectors
    subgraph = graph_connected_nodes(graphObj=graph, nodeList=args.targets)
    weight_edges_with_pearsonr(graphObj=subgraph, dataVectors=xDict, uni=False)

    # if the edge weight is impossible to graph (inf or nan) kill the edge;
    # edges whose nodes lack expression data raise KeyError and are culled too
    badEdges = []
    edgesMissingNodes = []
    for i, j in subgraph.edges_iter():
        try:
            if math.isnan(subgraph[i][j]['rVal']) or math.isinf(subgraph[i][j]['rVal']):
                badEdges.append((i, j))
        except KeyError:
            edgesMissingNodes.append((i, j))
    subgraph.remove_edges_from(badEdges)
    subgraph.remove_edges_from(edgesMissingNodes)

    # --- drawing: node positions via graphviz ---
    pos = nx.graphviz_layout(subgraph, args='-LC1000000000')

    # per-node colors, looked up by species prefix, in nodelist order
    nodelist = subgraph.nodes()
    node_colors = []
    prefixes = cDict.keys()
    # edge labels (r/p values); currently only used by the commented-out
    # draw_networkx_edge_labels call below
    eLab = {}
    for i, j in subgraph.edges_iter():
        eLab[(i, j)] = 'r = %s\np = %s' % (round(subgraph[i][j]['rVal'], 3), round(subgraph[i][j]['pVal'], 3))
    for n in nodelist:
        node_colors.extend([cDict[x] for x in prefixes if n.startswith(x)])
    # BUGFIX: keyword was misspelled 'aplha' and silently ignored
    nx.draw_networkx_nodes(subgraph, pos, nodelist, node_color=node_colors,
                           node_size=1000, node_shape='o', alpha=.7)

    # split edges by significance of the correlation p-value
    sigEdges = []
    nonSigEdges = []
    for e in subgraph.edges_iter():
        if float(subgraph[e[0]][e[1]]['pVal']) <= 0.05:
            sigEdges.append(e)
        else:
            nonSigEdges.append(e)

    # custom blue->green->yellow->orange->red diverging colormap for edge 'heats'
    b2g2y2o2r = {'red':   ((0.0, 0.0, 0.0),
                           (1.0, 1.0, 1.0)),
                 'green': ((0.0, 0.0, 0.0),
                           (0.4, 1.0, 1.0),
                           (0.6, 1.0, 1.0),
                           (1.0, 0.0, 0.0)),
                 'blue':  ((0.0, 1.0, 1.0),
                           (1.0, 0.0, 0.0))}
    plt.register_cmap(name='corrMap', data=b2g2y2o2r)
    corrMap = plt.get_cmap('corrMap')

    # non-significant edges: dashed, semi-transparent, colored by weight
    nx.draw_networkx_edges(subgraph, pos,
                           edgelist=nonSigEdges,
                           width=2.0,
                           edge_cmap=corrMap,
                           edge_vmin=-1,
                           edge_vmax=1,
                           edge_color=[subgraph[e[0]][e[1]]['weight'] for e in nonSigEdges],
                           style='dashed',
                           alpha=.7)
    # significant edges: solid, opaque, colored by weight
    nx.draw_networkx_edges(subgraph, pos,
                           edgelist=sigEdges,
                           width=2.0,
                           edge_cmap=corrMap,
                           edge_vmin=-1,
                           edge_vmax=1,
                           edge_color=[subgraph[e[0]][e[1]]['weight'] for e in sigEdges],
                           style='solid',
                           alpha=1)
    # NOTE(review): badEdges were removed from subgraph above, so this draws
    # edges no longer in the graph -- confirm intended ghost-edge rendering
    nx.draw_networkx_edges(subgraph, pos,
                           edgelist=badEdges,
                           width=1.0,
                           edge_color='grey',
                           style='solid',
                           alpha=.3)

    # add color bar as key to heats
    plt.colorbar()
    #nx.draw_networkx_edge_labels(subgraph,pos,edge_labels=eLab)
    if not args.nonames:
        nx.draw_networkx_labels(subgraph, pos, font_weight='bold', font_size=8)
    plt.axis('off')

    # write out the file(s); fall back to png when the extension is unknown
    try:
        plt.savefig(args.out)
    except ValueError:
        plt.savefig('%s.png' % (args.out))
    if args.graphml:
        # BUGFIX: 'raise NotImplemented' raises a non-exception singleton
        raise NotImplementedError('graphML export is not implemented yet')
        #nx.write_graphml(subgraph,args.graphml)
    if not args.noshow:
        plt.show()
def main(): """ 1: Collect Tx from one or more species that are within at least some r value of similarity to a provided example Tx or a submitted hypothetical expression vector. 2: Use GTFs, BEDtools, and genome FASTAs to extract the upstream flanking sequences into a new FASTA for use in motif discovery. """ desc = """(1) Collect Tx from one or more species that are within at least some r value of similarity to a provided example Tx or a submitted hypothetical expression vector. (2) Use GTFs, BEDtools, and genome FASTAs to extract the upstream flanking sequences into a new FASTA for use in motif discovery.""" parser = argparse.ArgumentParser(description=desc) FileType = argparse.FileType logger = logging.getLogger(sys.argv[0].split('/')[-1]) parser.add_argument('--expn-path', type=str, required=True, help="""Path to expression table file. \n(default: %(default)s)""") parser.add_argument('--tx-name', type=str, required=True, help="""Name of the Tx you want to use as a model. (default: %(default)s)""") parser.add_argument('--pearson-filter-type', type=str, default='>=', choices=['>=','<='], help="""Use >= to find similar expn profiles or <= to find opposite profiles. (default: %(default)s)""") parser.add_argument('--pearson-filter-thresh', type=float, default=0.7, help="""Set the threshold of the Pearson r value for the filter. (default: %(default)s)""") parser.add_argument('--pval-filter-thresh', type=float, default=0.05, help="""Set the upper threshold for the p-value of the Pearson r values to keep. (default: %(default)s)""") parser.add_argument('--tx-name-header', type=str, required=True, help="""The text of the header in the expn table where tx names are stored. (default: %(default)s)""") parser.add_argument('--cond-headers', type=str, required=True, nargs='+', help="""A list of the text of the headers in the expn table where the values for each condition are stored (--cond-headers cond1 cond2 ...). 
(default: %(default)s)""") parser.add_argument('--manual-headers', type=str, required=False, nargs='?', help="""If the expn table does not have headers, provide a list of ordered names for them here. (default: %(default)s)""") parser.add_argument('--gtf', type=str, required=True, help="""The path to the gtf file that you want to use for your annotation. (default: %(default)s)""") parser.add_argument('--gtf-index', type=str, required=True, help="""The path to the gtf index file generated from "gtf_to_genes". (default: %(default)s)""") parser.add_argument('--genome-fastas', type=str, required=True, nargs='+', help="""A list of paths to genomic fasta files or directories where they are stored. (default: %(default)s)""") parser.add_argument('--flank-len', type=int, default=2000, help="""The length in bp that should be harvested from the 5' end of the tx. (default: %(default)s)""") parser.add_argument('--out-dir', type=str, default='.', help="""A path to a directory where you would like the output files to be stored. (default: %(default)s)""") parser.add_argument('--dump-megafasta', action='store_true', help="""Save concatonated fasta file for debugging. (default: %(default)s)""") parser.add_argument('--dump-stats', action='store_true', help="""Print a list of Tx/gene names and the r- p-values that passed the filter and exit without getting fastas. 
(default: %(default)s)""") args = parser.parse_args() # tmp files will be stored here tmp_files = Bag() # 1: Use a correlation filter to pull out any Tx that is sufficiently similar to the model Tx vectDict = mangle_expn_vectors(expnPath=args.expn_path,txNameHeader=args.tx_name_header,condHeaders=args.cond_headers,manualHeaders=args.manual_headers) filterFunc = eval("lambda x: x %s %f" % (args.pearson_filter_type, args.pearson_filter_thresh)) filterDict = pearsonExpnFilter(modelVector=vectDict[args.tx_name], targetVectors=vectDict, filterFunc=filterFunc) # remove vectors whose r's pVal is not significant (<=0.05) sigVectors = {} for key in filterDict: if key[1] <= args.pval_filter_thresh: sigVectors[key] = filterDict[key] matchVectors = sigVectors ## Impose a distance filter to further refine the gene set ## incorperating magnitudes of the absolute levels of gene expression ## set the boundries of acceptable deviation for the target gene mean expression ## mangitude by bootstrapping. The metric for comparison will be the average of ## the differences of each point in remaining vectors against the target ## vector. ## 1) calc the metrics for each remaining gene's vector ## PS: numpy rocks. ##avgDists = {} ##for key in sigVectors: ##avgDist_i = np.mean(np.subtract(vectDict[args.tx_name], ##sigVectors[key])) ##avgDists[key] = avgDist_i ### 2) bootstrap that bitch and give me a stdErr! 
##medianEst,stdErrEst,lo95,hi95 = basic_bootstrap_est(avgDists.values()) ### 3) recover keys that fall within +/- 1 SE ##matchVectors = {} ##for key in avgDists: ##avgDist = avgDists[key] ##if (avgDist >= -stdErrEst) and (avgDist <= stdErrEst): ##matchVectors[key] = sigVectors[key] # Sort txList so that the highest r values are at the top # and save vectors and this info out to file txList = sorted(matchVectors.keys(),key=lambda x: x[0], reverse=True) sortedTxListFile = NamedTemporaryFile(mode='w+t',prefix='txExpnVectFilteredBy_r.',suffix=".tsv",delete=False) for row in txList: if args.dump_stats: sys.stdout.write('%s\t%s\n' % ('\t'.join(map(str,row)),'\t'.join(map(str,matchVectors[row])))) else: sortedTxListFile.write('%s\t%s\n' % ('\t'.join(map(str,row)),'\t'.join(map(str,matchVectors[row])))) if args.dump_stats: sortedTxListFile.close() exit(0) tmp_files['sortedTxListFile'] = sortedTxListFile sortedTxListFile.close() g2gObj = gtf_to_genes.get_indexed_genes_matching_gtf_file_name(index_file_name=args.gtf_index, logger=logger, regex_str=args.gtf)[-1] txDict = filter_GTF_4_Tx(txList=[x[2] for x in txList],g2gObj=g2gObj) tmp_files['txBedFile'] = convert_2_bed(txDict=txDict) # 2: Use GTFs, BEDtools, and genome FASTAs to extract the upstream flanking sequences into a new FASTA fastaRecLengths,fastaSeqs = fastaRec_length_indexer(fastaFiles=args.genome_fastas) tmpFastaRecLengthFile = NamedTemporaryFile(mode='w+b',prefix='tmpFastaRecLengthFile.',suffix=".txt") for seqRec in fastaRecLengths: tmpFastaRecLengthFile.write("%s\t%s\n" % (seqRec,fastaRecLengths[seqRec])) tmpFastaRecLengthFile.flush() # TODO: concatonate fasta files megaFastaFile = NamedTemporaryFile(mode='w+b',prefix='tmpMegaFastaFile.',suffix=".fas") for fasta in fastaSeqs: megaFastaFile.write('>%s\n%s\n' % (fasta,fastaSeqs[fasta])) megaFastaFile.flush() tmp_files['flankBed'] = get_fastas(txBed=tmp_files.txBedFile.name,genomeFasta=megaFastaFile.name,lenIndex=tmpFastaRecLengthFile.name,lenFlanks=args.flank_len) 
# CLEAN UP: # TODO: Close all tmp_files, and move to args.outDir mkdirp(args.out_dir) for f in tmp_files: try: tmp_files[f].delete = False except AttributeError: pass try: tmp_files[f].close() except AttributeError: pass # ['sortedTxListFile', 'flankBed', 'txBedFile', 'flankFasta'] sortedTxListFile = "%s/sortedTxList.tsv" % (args.out_dir) flankBed = "%s/flankBed.bed" % (args.out_dir) txBedFile = "%s/txBed.bed" % (args.out_dir) flankFasta = "%s/flankFasta.fas" % (args.out_dir) shutil.move(tmp_files.sortedTxListFile.name, sortedTxListFile) os.chmod(sortedTxListFile,0775) tmp_files.flankBed.saveas(flankBed) os.chmod(flankBed,0775) shutil.move(tmp_files.txBedFile.name, txBedFile) os.chmod(txBedFile,0775) shutil.move(tmp_files.flankBed.seqfn, flankFasta) os.chmod(flankFasta,0775) if args.dump_megafasta: megaFasta = "%s/megaFasta.fas" % (args.out_dir) megaFastaFile.delete = False megaFastaFile.close() shutil.move(megaFastaFile.name, megaFasta) os.chmod(megaFasta,0775)