def RandSequential(opts, initPath, allProts, sampleMap, potentialSteinerMap,
                   dummyNeighborMap, lastForestMap, countMap, weightedPrizes,
                   negativePrizes, degPenalties):
    """Learn forests for iterations 2..opts.iterations in random sequential mode.

    Within every group the samples are visited in a freshly shuffled order at
    each iteration.  The artificial prizes of a sample are built from the
    forests of the other samples in its group, and the sample's own forest is
    replaced as soon as it has been re-learned.

    Args:
        opts: parsed options; this function reads iterations, resultPath,
            lambda1, lambda2, workers, W and depth.
        initPath: directory holding the initial stp and dummy neighbor files.
        allProts: passed through to the artificial prize constructors.
        sampleMap: group -> list of sample names.
        potentialSteinerMap: group -> per-sample potential Steiner node entries.
        dummyNeighborMap: group -> per-sample dummy neighbor filenames.
        lastForestMap: group -> per-sample forests from the previous round;
            the lists are updated in place and stored back per group.
        countMap: group -> expected number of samples in the group.
        weightedPrizes: use CreateWgtPrizes if true, else CreateUnwgtPrizes.
        negativePrizes: passed through to the artificial prize constructors.
        degPenalties: passed through to UpdateStp.

    Returns:
        The directory of the last iteration that was run, or initPath when
        opts.iterations is less than 2.

    Raises:
        RuntimeError: if the per-sample lists of a group do not all have
            countMap[group] entries.
    """
    print("Learning forests in random sequential mode")

    # Rounds 2+; itrPath stays at initPath when there is nothing to iterate
    itrPath = initPath
    for itr in range(2, opts.iterations + 1):
        itrPath = os.path.join(opts.resultPath, "itr%d" % itr)
        if not os.path.exists(itrPath):
            os.makedirs(itrPath)

        # Forests are only constrained to resemble other samples of the same group
        for group in sampleMap:
            sampleNames = sampleMap[group]
            numSamples = countMap[group]
            potentialSteiner = potentialSteinerMap[group]
            dummyNeighborFiles = dummyNeighborMap[group]
            lastForests = lastForestMap[group]

            perSampleLists = (sampleNames, potentialSteiner,
                              dummyNeighborFiles, lastForests)
            if any(len(entries) != numSamples for entries in perSampleLists):
                raise RuntimeError(
                    "Must have the same number of samples in group %s" % group)

            # Draw the visiting order for this group at this iteration
            order = list(range(numSamples))
            random.shuffle(order)

            # Record the order that was drawn
            orderFile = os.path.join(itrPath, "sampleOrder_%s.txt" % group)
            with open(orderFile, "w") as f:
                f.writelines("%d\t%s\n" % (index, sampleNames[index])
                             for index in order)

            for index in order:
                sampleName = sampleNames[index]
                dnFile = dummyNeighborFiles[index]

                # Leave-one-out: the N-1 forests of all other samples
                otherLastForests = lastForests[:index] + lastForests[index + 1:]
                if weightedPrizes:
                    # lambda2 is used as the alpha parameter
                    artificialPrizes = CreateWgtPrizes(
                        allProts, otherLastForests, opts.lambda1,
                        opts.lambda2, negativePrizes)
                else:
                    # The N-1 other sets of potential Steiner nodes
                    otherPotentialSteiner = (potentialSteiner[:index] +
                                             potentialSteiner[index + 1:])
                    artificialPrizes = CreateUnwgtPrizes(
                        allProts, otherPotentialSteiner, otherLastForests,
                        opts.lambda1, opts.lambda2, negativePrizes)
                NetworkUtil.WriteDict(
                    os.path.join(itrPath,
                                 "%s_artificialPrizes.txt" % sampleName),
                    artificialPrizes)

                # Refresh the stp file with the artificial prizes and degree
                # penalties, and place the dummy neighbors next to it
                UpdateStp(artificialPrizes, degPenalties,
                          potentialSteiner[index], initPath, itrPath,
                          sampleName)
                shutil.copyfile(os.path.join(initPath, dnFile),
                                os.path.join(itrPath, dnFile))

                # Re-learn this sample's forest and swap it into lastForests.
                # Apart from the first and last sample in the random order,
                # every sample therefore sees a mix of forests from this
                # iteration and the previous one.
                LearnSteiner(opts, itrPath, itrPath, sampleName, dnFile,
                             opts.workers)
                lastForests[index] = LoadForestNodes(
                    "%s/symbol_%s_%s_1.0_%d.txt" %
                    (itrPath, sampleName, str(opts.W), opts.depth))

            # Keep this group's forests so the next iteration can retrieve them
            lastForestMap[group] = lastForests

    return itrPath
def Batch(opts, pool, initPath, allProts, sampleMap, potentialSteinerMap,
          dummyNeighborMap, lastForestMap, countMap, weightedPrizes,
          negativePrizes, degPenalties):
    """Learn forests for iterations 2..opts.iterations in parallel batch mode.

    At each iteration one set of artificial prizes is computed per group from
    all of that group's forests of the previous iteration, every sample's stp
    file is updated with it, and the forests of the group are then learned
    through the worker pool.

    Args:
        opts: parsed options; this function reads iterations, resultPath,
            lambda1, lambda2, W and depth.
        pool: worker pool whose map() runs LearnSteinerHelper.
        initPath: directory holding the initial stp and dummy neighbor files.
        allProts: passed through to the artificial prize constructors.
        sampleMap: group -> list of sample names.
        potentialSteinerMap: group -> per-sample potential Steiner node entries.
        dummyNeighborMap: group -> per-sample dummy neighbor filenames.
        lastForestMap: group -> per-sample forests from the previous round;
            each group's entry is replaced with the newly learned forests.
        countMap: group -> expected number of samples in the group.
        weightedPrizes: use CreateWgtPrizes if true, else CreateUnwgtPrizes.
        negativePrizes: passed through to the artificial prize constructors.
        degPenalties: passed through to UpdateStp.

    Returns:
        The directory of the last iteration that was run, or initPath when
        opts.iterations is less than 2.

    Raises:
        RuntimeError: if the per-sample lists of a group do not all have
            countMap[group] entries.
    """
    print("Learning forests in parallel batch mode")

    # Rounds 2+; itrPath stays at initPath when there is nothing to iterate
    itrPath = initPath
    for itr in range(2, opts.iterations + 1):
        itrPath = os.path.join(opts.resultPath, "itr%d" % itr)
        if not os.path.exists(itrPath):
            os.makedirs(itrPath)

        # Forests are only constrained to resemble other samples of the same group
        for group in sampleMap:
            sampleNames = sampleMap[group]
            numSamples = countMap[group]
            potentialSteiner = potentialSteinerMap[group]
            dummyNeighborFiles = dummyNeighborMap[group]
            lastForests = lastForestMap[group]

            perSampleLists = (sampleNames, potentialSteiner,
                              dummyNeighborFiles, lastForests)
            if any(len(entries) != numSamples for entries in perSampleLists):
                raise RuntimeError(
                    "Must have the same number of samples in group %s" % group)

            # Artificial prizes derived from the previous iteration's forests
            if weightedPrizes:
                # lambda2 is used as the alpha parameter
                artificialPrizes = CreateWgtPrizes(
                    allProts, lastForests, opts.lambda1, opts.lambda2,
                    negativePrizes)
            else:
                artificialPrizes = CreateUnwgtPrizes(
                    allProts, potentialSteiner, lastForests, opts.lambda1,
                    opts.lambda2, negativePrizes)
            NetworkUtil.WriteDict(
                os.path.join(itrPath, "artificialPrizes_%s.txt" % group),
                artificialPrizes)
            print("%d artificial prizes in group %s at iteration %d" %
                  (len(artificialPrizes), group, itr))

            # Refresh each stp file with the new artificial prizes and degree
            # penalties, and copy the dummy neighbor files into itrPath.
            # All three lists were verified above to hold numSamples entries.
            for sampleName, steinerNodes, dnFile in zip(
                    sampleNames, potentialSteiner, dummyNeighborFiles):
                UpdateStp(artificialPrizes, degPenalties, steinerNodes,
                          initPath, itrPath, sampleName)
                shutil.copyfile(os.path.join(initPath, dnFile),
                                os.path.join(itrPath, dnFile))

            # Learn the new forests through the pool; the trailing 1 is the
            # last positional argument handed to LearnSteinerHelper
            learnArgs = [(opts, itrPath, itrPath, sampleName, dnFile, 1)
                         for sampleName, dnFile in zip(sampleNames,
                                                       dummyNeighborFiles)]
            pool.map(LearnSteinerHelper, learnArgs)

            # Load the freshly learned forests for use at the next iteration
            lastForestMap[group] = [
                LoadForestNodes("%s/symbol_%s_%s_1.0_%d.txt" %
                                (itrPath, sampleName, str(opts.W), opts.depth))
                for sampleName in sampleNames
            ]

    return itrPath
def main(argList):
    """Run the constrained multi-sample Steiner forest workflow.

    Parses the options, writes the initial stp files, learns the first
    iteration forests independently for every sample, runs the remaining
    iterations in batch or random sequential mode and, when positive
    artificial prizes were used over more than one iteration, learns the
    final pruned forests.

    The worker pool is always released: it is closed after a successful run,
    terminated if any step raises, and joined in both cases so that no worker
    processes are left behind.

    Args:
        argList: the arguments to parse, which either come from the command
            line or a list provided by the Python code calling this function.

    Raises:
        RuntimeError: if fewer than 1 iteration is requested.
    """
    parser = CreateParser()
    (opts, args) = parser.parse_args(argList)

    print("Starting constrained multi-sample Steiner forest %s" %
          time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()))
    print("Multi-PCSF version %s" % __version__)
    print("Parameters: %s" % opts)

    # TODO Add error checking of inputs
    if opts.iterations < 1:
        raise RuntimeError("Must have at least 1 iteration")
    # TODO Should allow the option to run serially without the pool because a
    # pool with 1 worker is not efficient
    if opts.workers < 1:
        opts.workers = multiprocessing.cpu_count()

    # Negative prizes implement the common set unless positive common set
    # prizes were requested
    negativePrizes = "positive" not in opts.artificialPrizes
    # Prizes are unweighted unless weighted prizes were requested
    weightedPrizes = "Weighted" in opts.artificialPrizes
    # Batch mode unless random sequential mode was requested
    batchMode = opts.iterMode != "random"

    # Load all of the proteins in the interactome, ignoring genes.  The
    # artificial prizes will be created for a subset of these nodes.
    allProts = LoadProteins(opts.interactomePath, opts.undirectedFile,
                            opts.directedFile, opts.tfdnaFile)

    # Load the negative prizes for the degree penalties or an empty dictionary
    # if they aren't being used
    directedFile = "None"
    if opts.directedFile != "None":
        directedFile = os.path.join(opts.interactomePath, opts.directedFile)
    degPenalties = NetworkUtil.DegreePenalties(
        opts.mu, os.path.join(opts.interactomePath, opts.undirectedFile),
        directedFile)

    # New directory to hold the original data before the iterations begin.
    # These stp files will be read and updated at subsequent iterations.
    initPath = os.path.join(opts.resultPath, "initial")
    if not os.path.exists(initPath):
        os.makedirs(initPath)

    # Load the list of terminal files and the sample-to-group mapping
    terminalMap, sampleMap, countMap = LoadTerminalFiles(
        opts.terminalPath, opts.masterTerminalFile)
    # Store the groups in a fixed order
    groups = sorted(terminalMap)
    for group in groups:
        print("%d samples in group %s" % (countMap[group], group))

    forestPattern = "%s/symbol_%s_%s_1.0_%d.txt"

    # Create a pool for creating .stp files and learning Steiner forests in
    # parallel using the specified number of workers.  Even when running the
    # subsequent iterations in random sequential order, the pool is used to
    # learn the initial trees and final pruned trees (if applicable).
    print("Creating a pool with %d workers" % opts.workers)
    pool = multiprocessing.Pool(opts.workers)
    completed = False
    try:
        # Create the initial .stp files.  opts and initPath are invariant
        # arguments for each sample.
        initialStpMap = dict()
        for group in groups:
            stpArgs = [(opts, initPath, terminalFile, sampleName)
                       for terminalFile, sampleName in zip(terminalMap[group],
                                                           sampleMap[group])]
            # Blocks until all are finished
            initialStpMap[group] = pool.map(CreateStpHelper, stpArgs)

        # Store which proteins don't have prizes for each patient.  These are
        # the nodes that could potentially be Steiner nodes for each sample.
        # This can't be recovered from the stp files at later iterations
        # because both original prizes and artificial prizes will exist.
        # Also track how the dummy node will be connected to the networks,
        # either all prizes or all non-prizes (potential Steiner nodes).
        potentialSteinerMap = dict()
        dummyNeighborMap = dict()
        for group in groups:
            numSamples = countMap[group]
            sampleNames = sampleMap[group]
            initialStps = initialStpMap[group]
            potentialSteiner = []  # A list of sets
            dummyNeighborFiles = []  # A list of filenames
            for i in range(numSamples):
                dnFile = sampleNames[i] + "_dummyNeighbors.txt"
                dummyNeighborFiles.append(dnFile)
                potentialSteiner.append(
                    DummyNeighbors(allProts, initPath, initialStps[i], dnFile,
                                   opts.dummyNeighbors))
            potentialSteinerMap[group] = potentialSteiner
            dummyNeighborMap[group] = dummyNeighborFiles

        itrPath = os.path.join(opts.resultPath, "itr1")
        if not os.path.exists(itrPath):
            os.makedirs(itrPath)

        # Initialize the artificial prizes to be an empty dictionary so that
        # we learn the initial trees independently
        artificialPrizes = dict()
        # Write the unused itr1 artificial prizes so that the files exist for
        # post-processing
        for group in groups:
            NetworkUtil.WriteDict(
                os.path.join(itrPath, "artificialPrizes_%s.txt" % group),
                artificialPrizes)
        print("%d artificial prizes at iteration 1" % len(artificialPrizes))

        # Add the degree penalties to the initial stp files.  Pass in the
        # empty artificial prize dictionary, which won't have an effect.
        for group in groups:
            sampleNames = sampleMap[group]
            potentialSteiner = potentialSteinerMap[group]
            dummyNeighborFiles = dummyNeighborMap[group]
            for i in range(countMap[group]):
                UpdateStp(artificialPrizes, degPenalties, potentialSteiner[i],
                          initPath, itrPath, sampleNames[i])
                # Copy the dummy neighbors, which must be in the same
                # directory as the stp file
                shutil.copyfile(os.path.join(initPath, dummyNeighborFiles[i]),
                                os.path.join(itrPath, dummyNeighborFiles[i]))

        # Learn the first iteration Steiner forests in parallel.
        # Run single-threaded belief propagation when using the worker pool.
        lastForestMap = dict()
        for group in groups:
            sampleNames = sampleMap[group]
            learnArgs = [(opts, itrPath, itrPath, sampleName, dnFile, 1)
                         for sampleName, dnFile in zip(sampleNames,
                                                       dummyNeighborMap[group])]
            pool.map(LearnSteinerHelper, learnArgs)
            # A list of sets, where each set contains the Steiner forest nodes
            lastForestMap[group] = [
                LoadForestNodes(forestPattern % (itrPath, sampleNames[i],
                                                 str(opts.W), opts.depth))
                for i in range(countMap[group])
            ]

        # Learn the forests at all remaining iterations and keep the directory
        # that contains the forests from the last iteration
        if opts.iterations > 1:
            learnRemaining = Batch if batchMode else RandSequential
            iterationArgs = (initPath, allProts, sampleMap,
                             potentialSteinerMap, dummyNeighborMap,
                             lastForestMap, countMap, weightedPrizes,
                             negativePrizes, degPenalties)
            if batchMode:
                itrPath = learnRemaining(opts, pool, *iterationArgs)
            else:
                itrPath = learnRemaining(opts, *iterationArgs)

        # Prune Steiner nodes from the forests that are not used to reach any
        # prizes and are only present because they were in the common set.
        # This is not necessary if only 1 iteration was run because in that
        # case there is no common set.  It is also not necessary if negative
        # prizes were used.
        if opts.iterations > 1 and (not negativePrizes):
            print("Learning final forests")
            print("Pruning forests from %s" % itrPath)
            finalPath = os.path.join(opts.resultPath, "final")
            if not os.path.exists(finalPath):
                os.makedirs(finalPath)

            # Nothing is returned by these operations so they can be performed
            # simultaneously independent of the groupings
            sampleNames = FlattenDict(sampleMap, groups)
            dummyNeighborFiles = FlattenDict(dummyNeighborMap, groups)
            potentialSteiner = FlattenDict(potentialSteinerMap, groups)
            for i, sampleName in enumerate(sampleNames):
                forestFile = forestPattern % (itrPath, sampleName,
                                              str(opts.W), opts.depth)
                FilterStpEdges(forestFile, initPath, finalPath, sampleName,
                               degPenalties, potentialSteiner[i])
                shutil.copyfile(os.path.join(initPath, dummyNeighborFiles[i]),
                                os.path.join(finalPath, dummyNeighborFiles[i]))

            finalArgs = [(opts, finalPath, finalPath, sampleName, dnFile, 1)
                         for sampleName, dnFile in zip(sampleNames,
                                                       dummyNeighborFiles)]
            pool.map(LearnSteinerHelper, finalArgs)

        print("Finishing constrained multi-sample Steiner forest %s" %
              time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()))
        completed = True
    finally:
        # Always release the workers: a clean shutdown after success, an
        # immediate stop when a step failed so the error is not delayed by
        # outstanding tasks
        if completed:
            pool.close()
        else:
            pool.terminate()
        pool.join()