示例#1
0
 def waitForJobCount(self,
                     targetCount=0,
                     pollIntervalSeconds=60,
                     verbose=True):
     """
     Block until self.getNumJobs() reports at most targetCount jobs.

     @type targetCount: int
     @param targetCount: the wait ends once the job count is at or below
                         this value; -1 disables waiting altogether
     @type pollIntervalSeconds: number
     @param pollIntervalSeconds: minimum time between two job count queries.
                                 The wait sleeps in 5 second slices, so the
                                 real interval can overshoot by one slice
     @type verbose: Boolean
     @param verbose: if True, a one-line progress display is redrawn on
                     stderr while waiting
     """
     if targetCount == -1:
         return
     numJobs = self.getNumJobs()
     if numJobs <= targetCount:
         return
     waitTimer = Timer()
     # The account name is the same for every poll, so resolve it only once.
     accountName = self.account
     if accountName is None:
         accountName = "local"

     def reportProgress(sleepString):
         # "\r" plus the trailing comma redraw the same terminal line.
         message = "\rWaiting for " + str(numJobs) + " on " + accountName + \
             " (limit=" + str(targetCount) + ")"
         print >> sys.stderr, message, waitTimer.elapsedTimeToString(
         ) + sleepString,

     while numJobs > targetCount:
         sleepTimer = Timer()
         if verbose:
             reportProgress(" [          ]     ")
         while sleepTimer.getElapsedTime() < pollIntervalSeconds:
             if verbose:
                 # Ten-step bar showing how much of the poll interval passed
                 steps = int(10 * sleepTimer.getElapsedTime() /
                             pollIntervalSeconds) + 1
                 reportProgress(" [" + steps * "." + (10 - steps) * " " +
                                "]     ")
             time.sleep(5)
         numJobs = self.getNumJobs()
     # Printed even when verbose is False
     print >> sys.stderr, "\nAll jobs done"
示例#2
0
 def waitForJobs(self, scriptNames, timeout=None):
     """
     Poll the status of every script until none is queued or running.

     @type scriptNames: list
     @param scriptNames: names passed one by one to self.getLouhiStatus(),
                         which must return "FINISHED", "QUEUED", "FAILED"
                         or "RUNNING"
     @type timeout: number or None
     @param timeout: give up once this many seconds have passed; None waits
                     indefinitely. It is checked once per polling round
                     (about every 60 seconds)
     @rtype: Boolean
     @return: True when no job is queued or running anymore, False if the
              timeout expired first
     """
     # NOTE(review): outputFileNames, combinations and cscConnection are not
     # defined in this method -- presumably module-level names; confirm they
     # exist, otherwise this method fails with a NameError.
     assert len(scriptNames) == len(outputFileNames)
     print >> sys.stderr, "Waiting for results"
     louhiTimer = Timer()

     def reportProgress(sleepString):
         # "\r" plus the trailing comma redraw the same terminal line.
         message = "\rWaiting for " + str(len(combinations)) + " on " + \
             cscConnection.machineName + "(" + processStatusString + "),"
         print >> sys.stderr, message, louhiTimer.elapsedTimeToString(
         ) + sleepString,

     while True:
         # Count the jobs in each state
         processStatus = {
             "FINISHED": 0,
             "QUEUED": 0,
             "FAILED": 0,
             "RUNNING": 0
         }
         for scriptName in scriptNames:
             processStatus[self.getLouhiStatus(scriptName)] += 1
         processStatusString = str(processStatus["QUEUED"]) + " queued, " + \
             str(processStatus["RUNNING"]) + " running, " + \
             str(processStatus["FINISHED"]) + " finished, " + \
             str(processStatus["FAILED"]) + " failed"
         if processStatus["QUEUED"] + processStatus["RUNNING"] == 0:
             print >> sys.stderr
             print >> sys.stderr, "All jobs done (" + processStatusString + ")"
             break
         # Decide whether to keep waiting
         if timeout is not None and louhiTimer.getElapsedTime() >= timeout:
             print >> sys.stderr
             print >> sys.stderr, "Timed out, ", louhiTimer.elapsedTimeToString(
             )
             return False
         reportProgress(" [          ]     ")
         # Sleep one minute in 5 second slices, updating the progress bar
         sleepTimer = Timer()
         while sleepTimer.getElapsedTime() < 60:
             steps = int(10 * sleepTimer.getElapsedTime() / 60) + 1
             reportProgress(" [" + steps * "." + (10 - steps) * " " +
                            "]     ")
             time.sleep(5)
     return True
示例#3
0
def optimizeLocal(Classifier,
                  Evaluator,
                  trainExamples,
                  testExamples,
                  classIds,
                  combinations,
                  workDir=None,
                  timeout=None):
    """
    Train, test and evaluate every parameter combination and select the best.

    @param Classifier: object providing train(examples, combination, output)
                       and test(examples, model, output)
    @param Evaluator: object providing evaluate(examples, predictions,
                      classIds, output); the returned evaluator must
                      implement compare(otherEvaluator)
    @param trainExamples: passed unchanged to Classifier.train
    @param testExamples: passed unchanged to Classifier.test and
                         Evaluator.evaluate
    @param classIds: passed unchanged to Evaluator.evaluate
    @type combinations: list
    @param combinations: the parameter combinations to try
    @type workDir: string or None
    @param workDir: directory for the model, classification and evaluation
                    files; if None their bare names are used as paths
    @param timeout: currently unused
    @rtype: list or None
    @return: [evaluator, trainOutput, testOutput, evaluationOutput,
             combination] of the best combination, or None when
             combinations is empty
    """

    def inWorkDir(fileName):
        # Place an output file inside workDir when one was given.
        if workDir is None:
            return fileName
        return os.path.join(workDir, fileName)

    bestResult = None
    for index, combination in enumerate(combinations):
        Stream.setIndent(" ")
        print >> sys.stderr, "Parameters " + str(index + 1) + "/" + str(
            len(combinations)) + ":", str(combination)
        Stream.setIndent("  ")
        combinationId = getCombinationString(combination)
        # Train
        trainOutput = inWorkDir("model-" + combinationId)
        print >> sys.stderr, "Training..."
        timer = Timer()
        Classifier.train(trainExamples, combination, trainOutput)
        print >> sys.stderr, "Training Complete, time:", timer.toString()
        # Test
        testOutput = inWorkDir("classifications-" + combinationId)
        print >> sys.stderr, "Testing..."
        timer = Timer()
        Classifier.test(testExamples, trainOutput, testOutput)
        print >> sys.stderr, "Testing Complete, time:", timer.toString()
        # Evaluate
        evaluationOutput = inWorkDir("evaluation-" + combinationId + ".csv")
        Stream.setIndent("   ")
        evaluator = Evaluator.evaluate(testExamples, testOutput, classIds,
                                       evaluationOutput)

        # Keep the first result, then any result that compares as better
        if bestResult is None or evaluator.compare(bestResult[0]) > 0:
            bestResult = [
                evaluator, trainOutput, testOutput, evaluationOutput,
                combination
            ]
    Stream.setIndent()
    if bestResult is None:
        # No combination was given, so there is nothing to select; indexing
        # the missing result would raise a TypeError.
        print >> sys.stderr, "No parameter combinations to evaluate"
        return None
    print >> sys.stderr, "Selected parameters", bestResult[-1]
    return bestResult
示例#4
0
 def __init__(self, style=None, classSet=None, featureSet=None):
     """
     Initialize the id sets and the timers of the example builder.

     @param style: stored unchanged in self.styles
     @param classSet: id set for the classes; when None a new IdSet(1) is
                      created. Its id for "neg" must be 1
     @param featureSet: id set for the features; when None a new IdSet() is
                        created
     """
     # Identity tests: "== None" would go through any __eq__ of the id sets
     if classSet is None:
         classSet = IdSet(1)
     assert classSet.getId("neg") == 1
     if featureSet is None:
         featureSet = IdSet()

     ExampleBuilder.__init__(self, classSet, featureSet)
     self.styles = style
     # One timer per processing stage.
     # NOTE(review): Timer(False) presumably creates a stopped timer -- confirm
     self.timerBuildExamples = Timer(False)
     self.timerCrawl = Timer(False)
     self.timerCrawlPrecalc = Timer(False)
     self.timerMatrix = Timer(False)
     self.timerMatrixPrecalc = Timer(False)
示例#5
0
 def waitForJobs(self,
                 jobs,
                 pollIntervalSeconds=60,
                 timeout=None,
                 verbose=True):
     """
     Poll the given jobs until none is queued or running, or until the
     timeout expires.

     @type jobs: list
     @param jobs: job handles, passed one by one to self.getJobStatus(),
                  which must return "FINISHED", "QUEUED", "FAILED" or
                  "RUNNING"
     @type pollIntervalSeconds: number
     @param pollIntervalSeconds: minimum time between two status polls. The
                                 wait sleeps in 5 second slices, so the real
                                 interval can overshoot by one slice
     @type timeout: number or None
     @param timeout: stop waiting once this many seconds have passed since
                     the call started; None waits indefinitely. It is
                     checked once per polling round
     @type verbose: Boolean
     @param verbose: if True, progress and result messages are written to
                     stderr
     @rtype: dictionary
     @return: the number of jobs per state ("FINISHED", "QUEUED", "FAILED",
              "RUNNING") at the last poll
     """
     # Printed even when verbose is False
     print >> sys.stderr, "Waiting for results"
     waitTimer = Timer()
     # The account name is the same for every poll, so resolve it only once.
     accountName = self.account
     if accountName is None:
         accountName = "local"

     def reportProgress(sleepString):
         # "\r" plus the trailing comma redraw the same terminal line.
         message = "\rWaiting for " + str(len(jobs)) + " on " + \
             accountName + "(" + jobStatusString + "),"
         print >> sys.stderr, message, waitTimer.elapsedTimeToString(
         ) + sleepString,

     while True:
         jobStatus = {"FINISHED": 0, "QUEUED": 0, "FAILED": 0, "RUNNING": 0}
         for job in jobs:
             jobStatus[self.getJobStatus(job)] += 1
         jobStatusString = str(jobStatus["QUEUED"]) + " queued, " + str(
             jobStatus["RUNNING"]) + " running, " + str(
                 jobStatus["FINISHED"]) + " finished, " + str(
                     jobStatus["FAILED"]) + " failed"
         if jobStatus["QUEUED"] + jobStatus["RUNNING"] == 0:
             if verbose:
                 print >> sys.stderr, "\nAll runs done (" + jobStatusString + ")"
             break
         # The timeout is measured with waitTimer, the only timer that runs
         # for the whole wait.
         if timeout is not None and waitTimer.getElapsedTime() >= timeout:
             if verbose:
                 print >> sys.stderr, "\nTimed out, ", waitTimer.elapsedTimeToString(
                 )
             break
         sleepTimer = Timer()
         if verbose:
             reportProgress(" [          ]     ")
         while sleepTimer.getElapsedTime() < pollIntervalSeconds:
             if verbose:
                 # Ten-step bar showing how much of the poll interval passed
                 steps = int(10 * sleepTimer.getElapsedTime() /
                             pollIntervalSeconds) + 1
                 reportProgress(" [" + steps * "." + (10 - steps) * " " +
                                "]     ")
             time.sleep(5)
     return jobStatus
示例#6
0
    def laplacian(self):
        """Build the hypergraph laplacian from the theta matrix.

        Delta = I - Theta,
        Theta = Dv^-1/2 H W De^-1 H^T Dv^-1/2

        The ``secs`` value of the timer wrapped around the computation is
        stored in ``self.laplacian_timer``.

        Returns
        -------
        Delta: sparse matrix
            hypergraph laplacian
        """
        with Timer() as stopwatch:
            theta = self.theta_matrix()
            # Identity with the same dimensions as theta
            dims = sp.shape(theta)
            identity = spsp.eye(*dims)
            delta = identity - theta

        self.laplacian_timer = stopwatch.secs
        return delta
示例#7
0
    def incidence_matrix(self):
        """Build the |V|*|E| incidence matrix from ``self.edge_list``.

        h(v,e)=1 if v in e
        h(v,e)=0 if v not in e

        The row count is the number of distinct values in ``edge_list``, the
        column count is the length of its first axis. The ``secs`` value of
        the timer wrapped around the computation is stored in
        ``self.incidence_matrix_timer``.

        Returns
        -------
        H: sparse incidence matrix
            sparse incidence matrix of size |V|*|E|
        """
        with Timer() as stopwatch:
            distinct_vertices = sp.unique(self.edge_list.flatten())
            n_vertices = sp.shape(distinct_vertices)[0]
            n_edges = sp.shape(self.edge_list)[0]
            incidence = spsp.lil_matrix((n_vertices, n_edges))

            # Visit every entry of edge_list; the first component of the
            # multi index is the edge (row of edge_list) the entry belongs to.
            cursor = sp.nditer(self.edge_list,
                               flags=['multi_index', 'refs_ok'])
            for vertex in cursor:
                incidence[vertex, cursor.multi_index[0]] = 1.0

        self.incidence_matrix_timer = stopwatch.secs
        return incidence
示例#8
0
    def train(cls, examples, parameters, outputFile=None):
        """
        Train a classifier on a set of examples by running the external
        liblinear "train" program.

        @type examples: string (filename) or list of examples
        @param examples: a list or file containing examples in SVM-format.
                         A list is filtered with cls.filterTrainingSet and
                         written to cls.tempDir + "/train.dat"; a filename is
                         passed through cls.stripComments
        @type parameters: a dictionary or string
        @param parameters: parameters for the classifier
        @type outputFile: string
        @param outputFile: the name of the model file to be written. If None,
                           the model is written to "model" and the log is
                           appended to "svmmulticlass.log"; otherwise the log
                           is written to outputFile + ".log"
        @rtype: int
        @return: the return code of the training subprocess
        """
        timer = Timer()
        parameters = cls.getParams(parameters)

        # If examples are in a list, they will be written to a file for the
        # external program. The receiver of this method is cls, so class
        # members are reached through it (there is no "self" here).
        if isinstance(examples, list):
            print >> sys.stderr, "Training SVM-MultiClass on", len(
                examples), "examples"
            trainPath = cls.tempDir + "/train.dat"
            examples = cls.filterTrainingSet(examples)
            Example.writeExamples(examples, trainPath)
        else:
            print >> sys.stderr, "Training SVM-MultiClass on file", examples
            trainPath = cls.stripComments(examples)
        # NOTE(review): hard-coded, machine-specific path to the trainer
        args = ["/home/jari/Programs/liblinear-1.5-poly2/train"]
        cls.__addParametersToSubprocessCall(args, parameters)
        if outputFile is None:
            args += [trainPath, "model"]
            logFile = open("svmmulticlass.log", "at")
        else:
            args += [trainPath, outputFile]
            logFile = open(outputFile + ".log", "wt")
        # Close the log even if the subprocess cannot be started
        try:
            rv = subprocess.call(args, stdout=logFile)
        finally:
            logFile.close()
        print >> sys.stderr, timer.toString()
        return rv
示例#9
0
def buildExamples(exampleBuilder, sentences, outfilename):
    """
    Build examples for every sentence and append them to a file.

    @param exampleBuilder: builder providing styles, buildExamples(),
                           preProcessExamples() and featureSet
    @type sentences: list
    @param sentences: sequences whose first element is the sentence object
                      handed to the builder
    @type outfilename: string
    @param outfilename: file the examples are written to (overwritten)
    """
    timer = Timer()
    if "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")

    calculatePredictedRange(exampleBuilder, sentences)

    exampleCount = 0
    outfile = open(outfilename, "wt")
    # Make sure the file is closed even if building an example fails
    try:
        for sentence in sentences:
            counter.update(
                1,
                "Building examples (" + sentence[0].getSentenceId() + "): ")
            examples = exampleBuilder.buildExamples(sentence[0])
            # Counted before preprocessing
            exampleCount += len(examples)
            examples = exampleBuilder.preProcessExamples(examples)
            Example.appendExamples(examples, outfile)
    finally:
        outfile.close()

    print >> sys.stderr, "Examples built:", str(exampleCount)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Elapsed", timer.toString()
示例#10
0
    def __init__(self, view_origin, view_dim, screen):
        """Create the view plane, the action timers and the key bindings.

        view_origin, view_dim: forwarded to Plane() to build the view plane.
        screen: stored unchanged as self.screen.
        """
        self.view_plane = Plane(view_origin, view_dim)
        self.screen = screen

        # One timer per zoom / move action
        self.zoom_in, self.zoom_out = Timer(), Timer()
        self.move_left, self.move_right = Timer(), Timer()
        self.move_up, self.move_down = Timer(), Timer()

        # Zoom limits and sensitivities
        self.zoom_max, self.zoom_min = 10, 1
        self.move_sensitivity, self.zoom_sensitivity = 50, 1

        # Key name -> bound toggle method
        self.key_events = dict(
            r=self.toggle_zoom_in,
            e=self.toggle_zoom_out,
            right=self.toggle_move_right,
            left=self.toggle_move_left,
            up=self.toggle_move_up,
            down=self.toggle_move_down,
        )
示例#11
0
    def laplacian_eigs(self,
                       k=6,
                       type='SM',
                       filename=None,
                       minTol=1e-23,
                       **kwargs):
        """Computes eigenpairs of the hypergraph laplacian

        The laplacian is converted to CSC format and handed to the ``eigs``
        function of ``spla``. The returned eigenpairs are sorted by
        eigenvalue. The ``secs`` value of the timer wrapped around the solver
        call is stored in ``self.eigs_timer``.

        Parameters
        ----------
        k: int, optional
            number of eigenpairs to request; if k is not smaller than the
            number of rows n of the laplacian, n - 2 is used instead
        type: str, optional
            type of eigenpairs, as specified in scipy.sparse.linalg.eigs documentation, or 'LNZ' for lowest non zero.
            'LNZ' requests 'SM' eigenpairs and keeps only those whose
            eigenvalue is greater than minTol
        filename: str, optional
            if given, save min eigenvalue, min eigenvector, all used eigenvalues, all used eigenvectors in json format
            (real parts only) through json_writer
        minTol: float, optional
            threshold above which an eigenvalue counts as non zero; only used
            for type 'LNZ'
        kwargs: named arguments to pass to scipy.sparse.linalg.eigs function

        Returns
        -------
        eigenvals: ndarray
            array of eigenvalues; k of them, or for 'LNZ' only those above
            minTol
        eigenvecs: ndarray
            array of the matching eigenvectors, one per column
        """
        min_dict = {}

        lap = self.laplacian().tocsc()
        # Clamp k when it is not smaller than the matrix dimension
        if k >= lap.shape[0]:
            k = lap.shape[0] - 2
        if type == 'LNZ':

            with Timer() as t_eig:
                vals, vecs = spla.eigs(lap, k=k, which='SM', **kwargs)

            # Debug output: eigenvalues in solver order
            print(vals)
            # Sort the eigenvalues and reorder the eigenvectors to match;
            # the vectors are collected as rows and transposed back so that
            # each column is again one eigenvector
            sorted_eigenvals_indices = sp.argsort(vals)
            vals = sp.array([vals[i] for i in sorted_eigenvals_indices])
            vecs = sp.array([vecs[:, i] for i in sorted_eigenvals_indices]).T

            # Debug output: sorted eigenvalues and the sort permutation
            print(vals)
            print(sorted_eigenvals_indices)

            self.eigs_timer = t_eig.secs

            # Positions of the eigenvalues that count as non zero
            vals_lnz_indices = [
                i for i in range(len(vals)) if vals[i] > minTol
            ]

            used_vals = sp.array([vals[i] for i in vals_lnz_indices])
            used_vecs = sp.array([vecs[:, i] for i in vals_lnz_indices]).T

            # Debug output. NOTE(review): min() raises ValueError here when
            # no eigenvalue exceeds minTol -- confirm that cannot happen
            print('******eigendata:')
            print(used_vals)
            print(min(used_vals))
            if filename:
                min_dict['min_eigenval_used'] = sp.real(
                    min(used_vals)).tolist()
                min_dict['min_eigenvec_used'] = sp.real(
                    used_vecs[:, sp.argmin(used_vals)]).tolist()
                print('-----------eigenvec_len:')
                print(
                    sp.shape(
                        sp.real(used_vecs[:, sp.argmin(used_vals)]).tolist()))
                min_dict['eigenvals_used'] = sp.real(used_vals).tolist()
                min_dict['eigenvecs_used'] = sp.real(used_vecs).tolist()
                json_writer(min_dict, filename)

            # The results of both calls are ignored here.
            # NOTE(review): presumably sanity checks of the laplacian and of
            # all sorted eigenpairs -- confirm against their definitions
            self.__isPSD(lap, k)
            self.__test_eigenpairs(vals, vecs, lap)

            return used_vals, used_vecs
        else:
            with Timer() as t_eig:
                vals, vecs = spla.eigs(lap, k=k, which=type, **kwargs)

            # Sort the eigenvalues and reorder the eigenvector columns to match
            sorted_eigenvals_indices = sp.argsort(vals)
            vals = sp.array([vals[i] for i in sorted_eigenvals_indices])
            vecs = sp.array([vecs[:, i] for i in sorted_eigenvals_indices]).T

            self.eigs_timer = t_eig.secs

            if filename:
                min_dict['min_eigenval_used'] = sp.real(min(vals)).tolist()
                min_dict['min_eigenvec_used'] = sp.real(
                    vecs[:, sp.argmin(vals)]).tolist()
                min_dict['eigenvals_used'] = sp.real(vals).tolist()
                min_dict['eigenvecs_used'] = sp.real(vecs).tolist()
                json_writer(min_dict, filename)

            return vals, vecs
示例#12
0
    def spectral_clustering(self,
                            clusters_n,
                            k=6,
                            type='SM',
                            embed_type='custom',
                            **kwargs):
        """Performing k-means spectral clustering on laplacian eigenvectors via scikit-learn kmeans algo

        The ``secs`` value of the timer wrapped around the k-means call is
        stored in ``self.clustering_timer``.

        Parameters
        ----------
        clusters_n: int
            number of clusters; for the 'sklearn_*' embeddings it is also
            passed to the embedding function as its second argument
        k: int, optional
            num of eigenvectors to base kmeans; only used by the 'custom'
            embedding
        type: str, optional
            type of eigenvectors to use for kmeans, as specified in laplacian_eigs;
            only used by the 'custom' embedding
        embed_type: str, optional
            choices: 'custom' - perform embedding using hypergraph laplacian and custom implemented embedding
                     'sklearn_laplacian' - perform embedding using modified sklearn.spectral.embedding using the hypergraph laplacian
                     'sklearn_adjacency' - perform embedding using original sklearn.spectral.embedding using the hypergraph adjacency matrix
            default is 'custom'; any other value behaves like 'custom'
        kwargs: named arguments to pass to laplacian eigs function.
            For the 'sklearn_*' embeddings 'filename', 'minTol' and 'maxiter'
            are removed and the rest is passed to the embedding function; a
            given 'filename' receives the embedding in json format

        Returns
        -------
        centroid: ndarray
            cluster centres, as returned by sklearn.cluster.k_means
        label: ndarray of shape (n_samples,)
        label_dict: dictionary
            dictionary containing {partiteName: { id: communityId , ...} , ... }
        node_tags: list of str
            order of partites, as found in hyperedges
        inertia: float
        """
        if embed_type in ('sklearn_laplacian', 'sklearn_adjacency'):
            # These options belong to laplacian_eigs and are not forwarded
            # to the embedding functions.
            f = kwargs.pop('filename', None)
            kwargs.pop('minTol', None)
            kwargs.pop('maxiter', None)

            if embed_type == 'sklearn_laplacian':
                eigenvecs = spectral_embedding(self.laplacian(), clusters_n,
                                               **kwargs)
            else:
                eigenvecs = skmanifold.spectral_embedding(
                    self.adjacency_matrix(), clusters_n, **kwargs)

            if f:
                json_writer({'eigenvecs_used': sp.real(eigenvecs).tolist()}, f)
        else:
            eigenvecs = self.laplacian_eigs(k, type, **kwargs)[1]

        with Timer() as t_cl:
            cen, lab, inert = sklearn.cluster.k_means(eigenvecs, clusters_n)

        self.clustering_timer = t_cl.secs

        label_dict = self._community_vector_match(lab)

        return cen, lab, label_dict, self.node_tags, inert
示例#13
0
    def perform_rsa(self,
                    draw="NONE",
                    print_times="NONE",
                    save_summary=False,
                    save_data=None):
        """Run RSA iterations until no voxels remain and return a run summary.

        Each iteration generates figures, rejects them against existing and
        new figures, optionally splits the voxels and rejects voxels. Every
        step is timed with a fresh ``Timer``.

        Parameters
        ----------
        draw: str
            "NONE", "ITERATION" (call ``draw_func`` after every iteration) or
            "END" (call it once after the loop)
        print_times: str
            "NONE", "ALL" (print step timers and data for every iteration
            plus the totals) or "TOTAL" (print only the totals)
        save_summary: bool
            if true, the summary is passed to ``record_run``
        save_data: optional
            if not None, passed to ``save_output`` together with the figures

        Returns
        -------
        dict
            "configuration" (run settings), "iterations" (per-iteration
            timers and data) and "summary" (final figures, density, time)

        Raises
        ------
        ValueError
            if ``draw`` or ``print_times`` is not one of the allowed values
        """
        if draw not in ("NONE", "ITERATION", "END"):
            raise ValueError("draw must be either: NONE, ITERATION or END")

        if print_times not in ("NONE", "ALL", "TOTAL"):
            raise ValueError("print_times must be either: NONE, ALL or TOTAL")

        print_times_all = print_times == "ALL"
        iter_timers = []

        summary_dict = {
            "configuration": {
                "fig_radiuses": self.fig_radiuses.astype(float).tolist(),
                "fig_positions": self.fig_xys.astype(float).tolist(),
                "cell_num_world_size":
                [int(self.cell_num_x),
                 int(self.cell_num_y)],
                "cell_size": float(self.cell_size),
                "added_fig_num": int(self.added_fig_num),
                "voxel_removal_treshold": float(self.voxel_removal_treshold),
                "voxel_num_treshold": int(self.voxel_num_treshold),
                "fig_area": float(self.fig_area),
                "fig_radius": float(self.fig_radius),
                "version": float(self.version)
            }
        }

        iterations_data = []
        voxel_fraction = 1.0

        def print_data():
            # Current state of the run; reads voxel_fraction at call time.
            print("DATA: figures:", self.fig_num)
            print("DATA: voxels:", self.voxel_num)
            print("DATA: voxel_fraction:", voxel_fraction)
            print("DATA: density:", self.calculate_density())

        self.initialise_rsa()
        while self.voxel_num > 0:

            timer_iter = Timer()
            timer_iter.start_timer("iteration")

            timer_iter.start_timer("generation")
            self.generate_figs()
            g_t = timer_iter.stop_timer("generation", print_times_all)

            timer_iter.start_timer("reject_vs_existing")
            self.reject_figs_vs_existing()
            re_t = timer_iter.stop_timer("reject_vs_existing", print_times_all)

            timer_iter.start_timer("reject_vs_new")
            self.reject_figs_vs_new()
            rn_t = timer_iter.stop_timer("reject_vs_new", print_times_all)

            timer_iter.start_timer("split_voxels")
            # Split when the share of rejected figures is above the removal
            # threshold and either nothing was added or few voxels remain.
            voxel_added_cond = (
                1.0 - (self.successfully_added_figs_num /
                       self.added_fig_num)) > self.voxel_removal_treshold
            voxel_num_cond = (self.successfully_added_figs_num == 0
                              or self.voxel_num < self.voxel_num_treshold)
            if voxel_added_cond and voxel_num_cond:
                self.split_voxels()
                voxel_fraction = 0.5 * voxel_fraction
            s_t = timer_iter.stop_timer("split_voxels", print_times_all)

            timer_iter.start_timer("reject_voxels")
            self.reject_voxels()
            rv_t = timer_iter.stop_timer("reject_voxels", print_times_all)

            i_t = timer_iter.stop_timer("iteration", print_times_all)
            iter_timers.append(timer_iter.get_timers())

            iterations_data.append({
                "timers": {
                    "generation": g_t,
                    "reject_vs_existing": re_t,
                    "reject_vs_new": rn_t,
                    "split_voxels": s_t,
                    "reject_voxels": rv_t,
                    "iteration": i_t
                },
                "data": {
                    "voxel_num": int(self.voxel_num),
                    "voxel_fraction": voxel_fraction,
                    "fig_num": int(self.fig_num),
                    "density": self.calculate_density()
                }
            })

            if draw == "ITERATION":
                draw_func(self)

            if print_times_all:
                print_data()
                print("===================")

            self.iteration += 1

        # NOTE(review): index 2 of a timer entry is presumably its elapsed
        # time -- confirm against Timer.get_timers
        total_time = sum(t["iteration"][2] for t in iter_timers)
        if print_times in ("ALL", "TOTAL"):
            print_data()

            name = "total"
            print(f'TIMER: {name:20s} {total_time:.20f}')

        if draw == "END":
            draw_func(self)

        final_dict = {
            "voxel_fraction": voxel_fraction,
            "fig_num": int(self.fig_num),
            "density": self.calculate_density(),
            "total_time": total_time
        }

        summary_dict["iterations"] = iterations_data
        summary_dict["summary"] = final_dict
        if save_summary:
            record_run(summary_dict)
        if save_data is not None:
            save_output(self.figs, self.fig_num, save_data)

        self.finalise()
        return summary_dict
示例#14
0
    def optimize(self,
                 trainSets,
                 classifySets,
                 parameters=defaultOptimizationParameters,
                 evaluationClass=None,
                 evaluationArgs={},
                 combinationsThatTimedOut=None):
        """
        Grid search over all parameter combinations: train and classify each
        fold, evaluate the predictions and keep the best pooled result.

        @param trainSets: list of example lists, or a list whose first
                          element is a string that is used as the training
                          set of every fold
        @param classifySets: one set of examples to classify per fold; with
                             list training sets, every training set not
                             equal to the fold's classify set is trained on
        @type parameters: dictionary
        @param parameters: maps each parameter name to the list of values to
                           try. With a "predefined" key the search is skipped
        @param evaluationClass: called as evaluationClass(predictions,
                                **evaluationArgs); must also provide
                                average() and pool(). The default None only
                                works if no training run succeeds
        @type evaluationArgs: dictionary
        @param evaluationArgs: extra keyword arguments for evaluationClass
        @type combinationsThatTimedOut: list or None
        @param combinationsThatTimedOut: combinations to skip. Combinations
                                         whose training returns a non-zero
                                         value are appended to this list
        @return: {"predefined": ...} for a predefined model; otherwise the
                 tuple (None, pooled result, combination) of the best
                 combination, or None if no combination produced results
        """
        # NOTE(review): evaluationArgs={} is a shared mutable default; it is
        # only unpacked here, never modified.
        if parameters.has_key("predefined"):
            print >> sys.stderr, "Predefined model, skipping parameter estimation"
            return {"predefined": parameters["predefined"]}

        print >> sys.stderr, "Optimizing parameters"
        parameterNames = parameters.keys()
        parameterNames.sort()
        #        for p in self.notOptimizedParameters:
        #            if p in parameterNames:
        #                parameterNames.remove(p)
        # One list of (name, value) pairs per parameter, in name order
        parameterValues = []
        for parameterName in parameterNames:
            parameterValues.append([])
            for value in parameters[parameterName]:
                parameterValues[-1].append((parameterName, value))
        # NOTE(review): combine.combine presumably returns every selection
        # of one pair per parameter -- confirm
        combinationLists = combine.combine(*parameterValues)
        # Turn each list of (name, value) pairs into a dictionary
        combinations = []
        for combinationList in combinationLists:
            combinations.append({})
            for value in combinationList:
                combinations[-1][value[0]] = value[1]
        if combinationsThatTimedOut == None:
            combinationsThatTimedOut = []


#        # re-add non-optimized parameters to combinations
#        for p in self.notOptimizedParameters:
#            if parameters.has_key(p):
#                for combination in combinations:
#                    combination[p] = parameters[p]

        bestResult = None
        combinationCount = 1
        # tempDir and debugFile are redirected per fold below and restored
        # at the end of the method
        if hasattr(self, "tempDir"):
            mainTempDir = self.tempDir
            mainDebugFile = self.debugFile
        for combination in combinations:
            print >> sys.stderr, " Parameters " + str(
                combinationCount) + "/" + str(
                    len(combinations)) + ":", str(combination),
            skip = False
            #print combinationsThatTimedOut
            for discarded in combinationsThatTimedOut:
                if self._dictIsIdentical(combination, discarded):
                    print >> sys.stderr
                    print >> sys.stderr, "  Discarded before, skipping"
                    skip = True
                    break
            # NOTE(review): this continue also skips the combinationCount
            # increment at the end of the loop body, so the next combination
            # reuses this number (output label and temp directory) -- confirm
            # that this is intended
            if skip:
                continue
            # Make copies of examples in case they are modified
            fold = 1
            foldResults = []
            for classifyExamples in classifySets:
                if type(trainSets[0]) == types.StringType:
                    trainExamples = trainSets[0]
                else:
                    # Train on every set except the one being classified
                    # (compared with !=, not by identity)
                    trainExamples = []
                    for trainSet in trainSets:
                        if trainSet != classifyExamples:
                            trainExamples.extend(trainSet)
                # The copying is disabled: both branches keep the original
                trainExamplesCopy = trainExamples
                if type(trainExamples) == types.ListType:
                    trainExamplesCopy = trainExamples  #ExampleUtils.copyExamples(trainExamples)
                classifyExamplesCopy = classifyExamples
                if type(classifyExamples) == types.ListType:
                    classifyExamplesCopy = classifyExamples  #ExampleUtils.copyExamples(classifyExamples)
                if hasattr(self, "tempDir"):
                    # Separate working directory and debug file per fold
                    self.tempDir = mainTempDir + "/parameters" + str(
                        combinationCount) + "/optimization" + str(fold)
                    if not os.path.exists(self.tempDir):
                        os.makedirs(self.tempDir)
                    # NOTE(review): a new file is opened for every fold but
                    # debugFile is closed only once per combination (below),
                    # so earlier folds' files are not closed explicitly
                    self.debugFile = open(self.tempDir + "/debug.txt", "wt")

                timer = Timer()
                #trainStartTime = time.time()
                trainRV = self.train(trainExamplesCopy, combination)
                #trainTime = time.time() - trainStartTime
                #print >> sys.stderr, " Time spent:", trainTime, "s"
                print >> sys.stderr, " Time spent:", timer.elapsedTimeToString(
                )
                # Any non-zero return value of train() is treated as a timeout
                if trainRV == 0:
                    predictions = self.classify(classifyExamplesCopy)
                    evaluation = evaluationClass(predictions, **evaluationArgs)
                    if len(classifySets) == 1:
                        print >> sys.stderr, evaluation.toStringConcise("  ")
                    else:
                        print >> sys.stderr, evaluation.toStringConcise(
                            indent="  ", title="Fold " + str(fold))
                    foldResults.append(evaluation)
                    if hasattr(self, "tempDir"):
                        evaluation.saveCSV(self.tempDir + "/results.csv")
                else:
                    combinationsThatTimedOut.append(combination)
                    print >> sys.stderr, "  Timed out"
                fold += 1
            if len(foldResults) > 0:
                averageResult = evaluationClass.average(foldResults)
                poolResult = evaluationClass.pool(foldResults)
                if hasattr(self, "tempDir"):
                    TableUtils.writeCSV(
                        combination, mainTempDir + "/parameters" +
                        str(combinationCount) + ".csv")
                    averageResult.saveCSV(mainTempDir + "/parameters" +
                                          str(combinationCount) +
                                          "/resultsAverage.csv")
                    poolResult.saveCSV(mainTempDir + "/parameters" +
                                       str(combinationCount) +
                                       "/resultsPooled.csv")
                if len(classifySets) > 1:
                    print >> sys.stderr, averageResult.toStringConcise(
                        "  Avg: ")
                    print >> sys.stderr, poolResult.toStringConcise("  Pool: ")
                # Combinations are ranked by their pooled result
                if bestResult == None or poolResult.compare(
                        bestResult[1]
                ) > 0:  #: averageResult.fScore > bestResult[1].fScore:
                    #bestResult = (predictions, averageResult, combination)
                    bestResult = (None, poolResult, combination)
                    # Make sure memory is released, especially important since some of the previous steps
                    # copy examples
                    bestResult[1].classifications = None
                    bestResult[1].predictions = None
            combinationCount += 1
            if hasattr(self, "tempDir"):
                self.debugFile.close()
        # Restore the attributes that were redirected for the folds
        if hasattr(self, "tempDir"):
            self.tempDir = mainTempDir
            self.debugFile = mainDebugFile
        return bestResult
示例#15
0
    def test(cls,
             examples,
             modelPath,
             output=None,
             parameters=None,
             forceInternal=False,
             classIds=None):  # , timeout=None):
        """
        Classify examples with a pre-trained model.
        
        @type examples: string (filename) or list (or iterator) of examples
        @param examples: a list or file containing examples in SVM-format
        @type modelPath: string
        @param modelPath: filename of the pre-trained model file
        @type parameters: a dictionary or string
        @param parameters: parameters for the classifier
        @type output: string
        @param output: the name of the predictions file to be written
        @type forceInternal: Boolean
        @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
        """
        if type(parameters) == types.StringType:
            parameters = splitParameters(parameters)
        timer = Timer()
        if type(examples) == types.ListType:
            print >> sys.stderr, "Classifying", len(
                examples), "with SVM-MultiClass model", modelPath
            examples, predictions = self.filterClassificationSet(
                examples, False)
            testPath = self.tempDir + "/test.dat"
            Example.writeExamples(examples, testPath)
        else:
            print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
            testPath = examples
            examples = Example.readExamples(examples, False)
        if parameters != None:
            parameters = copy.copy(parameters)
            if parameters.has_key("c"):
                del parameters["c"]
            if parameters.has_key("predefined"):
                parameters = copy.copy(parameters)
                modelPath = os.path.join(parameters["predefined"][0],
                                         "classifier/model")
                del parameters["predefined"]
        # Read model
        if modelPath == None:
            modelPath = "model-multilabel"
        classModels = {}
        if modelPath.endswith(".gz"):
            f = gzip.open(modelPath, "rt")
        else:
            f = open(modelPath, "rt")
        thresholds = {}
        for line in f:
            key, value, threshold = line.split()
            classModels[key] = value
            if threshold != "None":
                thresholds[key] = float(threshold)
            else:
                thresholds[key] = 0.0
        f.close()
        mergedPredictions = []
        if type(classIds) == types.StringType:
            classIds = IdSet(filename=classIds)
        #print classModels
        print "Thresholds", thresholds
        classifierBin = Settings.SVMMultiClassDir + "/svm_multiclass_classify"
        print parameters
        if "classifier" in parameters and "svmperf" in parameters["classifier"]:
            classifierBin = Settings.SVMPerfDir + "/svm_perf_classify"
            parameters = copy.copy(parameters)
            del parameters["classifier"]
        for className in classIds.getNames():
            if className != "neg" and not "---" in className:
                classId = classIds.getId(className)
                if thresholds[str(className)] != 0.0:
                    print >> sys.stderr, "Classifying", className, "with threshold", thresholds[
                        str(className)]
                else:
                    print >> sys.stderr, "Classifying", className
                args = [classifierBin]
                #self.__addParametersToSubprocessCall(args, parameters)
                classOutput = "predictions" + ".cls-" + className
                logFile = open("svmmulticlass" + ".cls-" + className + ".log",
                               "at")
                args += [testPath, classModels[str(className)], classOutput]
                print args
                subprocess.call(args, stdout=logFile, stderr=logFile)
                cls.addPredictions(classOutput,
                                   mergedPredictions,
                                   classId,
                                   len(classIds.Ids),
                                   threshold=thresholds[str(className)])
        print >> sys.stderr, timer.toString()

        predFileName = output
        f = open(predFileName, "wt")
        for mergedPred in mergedPredictions:
            if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
                mergedPred[0].remove("1")
            mergedPred[1] = str(mergedPred[1])
            mergedPred[0] = ",".join(sorted(list(mergedPred[0])))
            f.write(" ".join(mergedPred) + "\n")
        f.close()

        return mergedPredictions
示例#16
0
    optparser.add_option(
        "-d",
        "--paramOptData",
        default=None,
        dest="paramOptData",
        help=
        "The fraction of the corpus to be always used for parameter optimization"
    )
    optparser.add_option("-m",
                         "--resultsToXML",
                         default=None,
                         dest="resultsToXML",
                         help="Output interaction xml-file")
    (options, args) = optparser.parse_args()

    timer = Timer()
    print >> sys.stderr, timer.toString()

    # options.folds is either a single count "N" or a comma-separated pair
    # "N,M". The second number is the fold count for parameter optimization;
    # the first is presumably the outer cross-validation fold count.
    # The original test was find(",") != 0, but str.find returns -1 when the
    # separator is absent, so a single count took the pair branch and failed
    # the length assertion.
    if "," in options.folds:
        options.folds = options.folds.split(",")
        assert (len(options.folds) == 2)
        options.folds[0] = int(options.folds[0])
        options.folds[1] = int(options.folds[1])
        if options.paramOptData != None:
            print >> sys.stderr, "Parameter optimization set defined, parameter " + str(
                options.folds[1]
            ) + "-fold cross validation will not be performed."
    else:
        # A single count is used for both fold numbers
        options.folds = (int(options.folds), int(options.folds))
    if options.output != None:
示例#17
0
        "--visualization",
        default=None,
        dest="visualization",
        help=
        "Visualization output directory. NOTE: If the directory exists, it will be deleted!"
    )
    # Register the -m/--resultsToXML option, then parse the command line.
    optparser.add_option(
        "-m",
        "--resultsToXML",
        default=None,
        dest="resultsToXML",
        help="Results in analysis xml. NOTE: for edges, pairs, not interactions"
    )
    (options, args) = optparser.parse_args()

    # Create the main timer and log the start of the script on stderr.
    mainTimer = Timer()
    print >> sys.stderr, __file__ + " start, " + mainTimer.toString()

    # Always start from an empty output directory: an existing directory is
    # deleted recursively (destructive!) and then recreated together with its
    # "classifier" subdirectory.
    if options.output != None:
        if os.path.exists(options.output):
            print >> sys.stderr, "Output directory exists, removing", options.output
            shutil.rmtree(options.output)
        os.mkdir(options.output)
        # NOTE(review): the directory was created just above, so this check
        # looks like it is always true - confirm before simplifying.
        if not os.path.exists(options.output + "/classifier"):
            os.mkdir(options.output + "/classifier")

    # Convert the classifier parameter string given on the command line with
    # the project helper splitParameters.
    classifierParamDict = splitParameters(options.parameters)

    print >> sys.stderr, "Importing modules"
    # The example builder and classifier classes are chosen by name on the
    # command line and bound to the fixed names ExampleBuilder and Classifier.
    # SECURITY NOTE: the option strings are pasted unchecked into source code
    # that is run with exec, so a crafted option value can execute arbitrary
    # Python. Only run this script with trusted arguments.
    exec "from ExampleBuilders." + options.exampleBuilder + " import " + options.exampleBuilder + " as ExampleBuilder"
    exec "from Classifiers." + options.classifier + " import " + options.classifier + " as Classifier"
示例#18
0
 def test(cls,
          examples,
          modelPath,
          output=None,
          parameters=None,
          forceInternal=False):  # , timeout=None):
     """
     Classify examples with a pre-trained model.

     Runs the external "predict" binary on the test file and parses the
     predictions file it writes. The output of the binary is redirected to a
     log file: "<output>.log" (overwritten) when output is given, otherwise
     "svmmulticlass.log" (appended).

     @type examples: string (filename) or list (or iterator) of examples
     @param examples: a list or file containing examples in SVM-format
     @type modelPath: string
     @param modelPath: filename of the pre-trained model file; "model" is
                       used if None. Overridden by parameters["predefined"].
     @type parameters: a dictionary or string
     @param parameters: parameters for the classifier
     @type output: string
     @param output: the name of the predictions file to be written;
                    "predictions" is used if None
     @type forceInternal: Boolean
     @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
     @return: one list per line of the predictions file: the first field
              converted to int, followed by the remaining fields as strings
     """
     if forceInternal or Settings.SVMMultiClassDir is None:
         return cls.testInternal(examples, modelPath, output)
     timer = Timer()
     if isinstance(examples, types.ListType):
         print >> sys.stderr, "Classifying", len(
             examples), "with SVM-MultiClass model", modelPath
         # The receiver of this method is named cls; the name "self" is not
         # defined here and raised a NameError.
         # NOTE(review): assumes tempDir is reachable through cls - confirm.
         examples, predictions = cls.filterClassificationSet(
             examples, False)
         testPath = cls.tempDir + "/test.dat"
         Example.writeExamples(examples, testPath)
     else:
         print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
         testPath = cls.stripComments(examples)
         examples = Example.readExamples(examples, False)
     # NOTE(review): machine-specific absolute path to the classifier binary;
     # it should probably come from Settings like the other binaries.
     args = ["/home/jari/Programs/liblinear-1.5-poly2/predict"]
     if modelPath is None:
         modelPath = "model"
     if parameters is not None:
         # Work on a private copy so the caller's dictionary is not modified
         parameters = copy.copy(parameters)
         if "c" in parameters:
             del parameters["c"]
         if "predefined" in parameters:
             modelPath = os.path.join(parameters["predefined"][0],
                                      "classifier/model")
             del parameters["predefined"]
         cls.__addParametersToSubprocessCall(args, parameters)
     if output is None:
         output = "predictions"
         logFile = open("svmmulticlass.log", "at")
     else:
         logFile = open(output + ".log", "wt")
     try:
         args += [testPath, modelPath, output]
         #if timeout == None:
         #    timeout = -1
         #print args
         subprocess.call(args, stdout=logFile, stderr=logFile)
     finally:
         # The log file handle was previously never closed
         logFile.close()
     predictionsFile = open(output, "rt")
     try:
         lines = predictionsFile.readlines()
     finally:
         predictionsFile.close()
     predictions = []
     for line in lines:
         fields = line.split()
         predictions.append([int(fields[0])] + fields[1:])
         #predictions.append( (examples[i],int(lines[i].split()[0]),"multiclass",lines[i].split()[1:]) )
     print >> sys.stderr, timer.toString()
     return predictions
示例#19
0
def optimizeCSC(Classifier,
                Evaluator,
                trainExamples,
                testExamples,
                classIds,
                combinations,
                workDir=None,
                timeout=None,
                cscConnection=None,
                downloadAllModels=False,
                steps="BOTH",
                threshold=False):
    bestResult = None
    combinationCount = 1
    combinationIds = []
    assert steps in ["BOTH", "SUBMIT", "RESULTS"], steps

    if type(classIds) == types.StringType:
        classIds = IdSet(filename=classIds)
    if Classifier.__name__ == "MultiLabelClassifier":
        negClass1 = True
        if "classifier" in combinations[0] and combinations[0][
                "classifier"] == "svmperf":
            negClass1 = False
        print "negclass1", negClass1
        Classifier.makeClassFiles(trainExamples,
                                  testExamples,
                                  classIds,
                                  negClass1=negClass1)

    if steps in ["BOTH", "SUBMIT"]:
        print >> sys.stderr, "Initializing runs"
        for combination in combinations:
            Stream.setIndent(" ")
            print >> sys.stderr, "Parameters " + str(
                combinationCount) + "/" + str(
                    len(combinations)) + ":", str(combination)
            # Train
            combinationIds.append(
                Classifier.initTrainAndTestOnLouhi(trainExamples, testExamples,
                                                   combination, cscConnection,
                                                   workDir, classIds))
            combinationCount += 1
    else:
        for combination in combinations:
            idStr = ""
            for key in sorted(combination.keys()):
                idStr += "-" + str(key) + "_" + str(combination[key])
            combinationIds.append(idStr)
    Stream.setIndent()

    if steps in ["BOTH", "RESULTS"]:
        Stream.setIndent(" ")
        print >> sys.stderr, "Waiting for results"
        finished = 0
        louhiTimer = Timer()
        #combinationStatus = {}
        while (True):
            # count finished
            finished = 0
            processStatus = {
                "FINISHED": 0,
                "QUEUED": 0,
                "FAILED": 0,
                "RUNNING": 0
            }
            for id in combinationIds:
                #status = Classifier.getLouhiStatus(id, cscConnection)
                #combinationStatus[id] = status
                #processStatus[status] += 1
                Classifier.getLouhiStatus(id, cscConnection, processStatus,
                                          classIds)
            p = processStatus
            processStatusString = str(p["QUEUED"]) + " queued, " + str(
                p["RUNNING"]) + " running, " + str(
                    p["FINISHED"]) + " finished, " + str(
                        p["FAILED"]) + " failed"
            if processStatus["QUEUED"] + processStatus["RUNNING"] == 0:
                print >> sys.stderr
                print >> sys.stderr, "All runs done (" + processStatusString + ")"
                break
            # decide what to do
            if timeout == None or louhiTimer.getElapsedTime() < timeout:
                sleepString = " [          ]     "
                print >> sys.stderr, "\rWaiting for " + str(
                    len(combinations)
                ) + " on " + cscConnection.machineName + "(" + processStatusString + "),", louhiTimer.elapsedTimeToString(
                ) + sleepString,
                #time.sleep(60)
                sleepTimer = Timer()
                while sleepTimer.getElapsedTime() < 60:
                    steps = int(10 * sleepTimer.getElapsedTime() / 60) + 1
                    sleepString = " [" + steps * "." + (10 -
                                                        steps) * " " + "]     "
                    print >> sys.stderr, "\rWaiting for " + str(
                        len(combinations)
                    ) + " on " + cscConnection.machineName + "(" + processStatusString + "),", louhiTimer.elapsedTimeToString(
                    ) + sleepString,
                    time.sleep(5)
            else:
                print >> sys.stderr
                print >> sys.stderr, "Timed out, ", louhiTimer.elapsedTimeToString(
                )
                break

        print >> sys.stderr, "Evaluating results"
        #if type(testExamples) != types.ListType:
        #    print >> sys.stderr, "Loading examples from file", testExamples
        #    testExamples = ExampleUtils.readExamples(testExamples,False)
        bestCombinationId = None
        for i in range(len(combinationIds)):
            id = combinationIds[i]
            Stream.setIndent(" ")
            # Evaluate
            predictions = Classifier.getLouhiPredictions(
                id, cscConnection, workDir, classIds)
            if predictions == None:
                print >> sys.stderr, "No results for combination" + id
            else:
                if downloadAllModels:
                    modelFileName = Classifier.downloadModel(
                        id, cscConnection, workDir)
                    if workDir != None:
                        modelFileName = os.path.join(workDir, modelFileName)
                        subprocess.call("gzip -fv " + modelFileName,
                                        shell=True)
                print >> sys.stderr, "Evaluating results for combination" + id
                evaluationOutput = "evaluation" + id + ".csv"
                if workDir != None:
                    evaluationOutput = os.path.join(workDir, evaluationOutput)
                evaluator = Evaluator.evaluate(testExamples, predictions,
                                               classIds, evaluationOutput)
                if threshold:
                    print >> sys.stderr, "Thresholding"
                    evaluator.determineThreshold(testExamples, predictions)
                if Classifier.__name__ != "MultiLabelClassifier":
                    if bestResult == None or evaluator.compare(
                            bestResult[0]
                    ) > 0:  #: averageResult.fScore > bestResult[1].fScore:
                        bestResult = [
                            evaluator, None, predictions, evaluationOutput,
                            combinations[i]
                        ]
                        bestCombinationId = id
                else:
                    assert Evaluator.__name__ == "MultiLabelEvaluator", Evaluator.__name__
                    if bestResult == None:
                        bestResult = [{}, None]
                        for className in classIds.Ids:
                            if className != "neg" and "---" not in className:
                                bestResult[0][className] = [
                                    -1, None,
                                    classIds.getId(className), None
                                ]
                    for className in classIds.Ids:
                        if className != "neg" and "---" not in className:
                            fscore = evaluator.dataByClass[classIds.getId(
                                className)].fscore
                            if fscore > bestResult[0][className][0]:
                                bestResult[0][className] = [
                                    fscore, id, bestResult[0][className][2]
                                ]
                                if threshold:
                                    classId = classIds.getId(className, False)
                                    if classId in evaluator.thresholds:
                                        bestResult[0][className].append(
                                            evaluator.thresholds[classId])
                                    else:
                                        bestResult[0][className].append(0.0)
                                else:
                                    bestResult[0][className].append(None)
                    bestCombinationId = bestResult
                os.remove(predictions)  # remove predictions to save space
        Stream.setIndent()
        print >> sys.stderr, "Selected parameters", bestResult[-1]
        #if Classifier.__name__ == "MultiLabelClassifier":
        #    evaluator = Evaluator.evaluate(testExamples, predictions, classIds, evaluationOutput)

        # Download best model and predictions
        modelFileName = Classifier.downloadModel(bestCombinationId,
                                                 cscConnection, workDir)
        if workDir != None:
            modelFileName = os.path.join(workDir, modelFileName)
        subprocess.call("gzip -fv " + modelFileName, shell=True)
        modelFileName = modelFileName + ".gz"
        #if Classifier.__name__ != "MultiLabelClassifier":
        #bestResult = [None, None]
        bestResult[1] = modelFileName
        return bestResult