Example #1
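The SkRanker class below wraps a scikit-learn learner as a pairwise ranker: train() converts a dataset into instances, optionally scales them, runs feature selection and fits the learner, while get_ranked_sentence() decomposes a multi-ranked parallel sentence into pairwise comparisons, classifies each pair and reconstructs a full ranking from the pairwise decisions.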
import logging
from collections import OrderedDict

import numpy as np
from sklearn.preprocessing import StandardScaler

# Ranker, SkLearner, dataset_to_instances, parallelsentence_to_instance and
# CompactPairwiseParallelSentenceSet are assumed to be provided by the
# surrounding package.

log = logging.getLogger(__name__)


class SkRanker(Ranker, SkLearner):
    '''
    Basic ranker wrapping scikit-learn functions
    '''
    
    def train(self, dataset_filename, 
              scale=True, 
              feature_selector=None, 
              feature_selection_params={},
              feature_selection_threshold=.25, 
              learning_params={}, 
              optimize=True, 
              optimization_params={}, 
              scorers=['f1_score'],
              attribute_set=None,
              class_name=None,
              metaresults_prefix="./0-",
              **kwargs):
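        '''
        Train the wrapped scikit-learn learner on the given dataset: optionally
        scale the features, run feature selection and parameter optimization,
        then fit the model. Returns the metadata gathered during feature selection.
        '''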
        
        plot_filename = "{}{}".format(metaresults_prefix, "featureselection.pdf")
        data, labels = dataset_to_instances(dataset_filename, attribute_set, class_name,  **kwargs)
        learner = self.learner
        
        #the class must remember the attribute_set and the class_name in order to reproduce the vectors
        self.attribute_set = attribute_set
        self.class_name = class_name

 
        #scale data to the mean
        if scale:
            log.info("Scaling datasets...")
            log.debug("Data shape before scaling: {}".format(data.shape))
            self.scaler = StandardScaler()
            data = self.scaler.fit_transform(data)
            log.debug("Data shape after scaling: {}".format(data.shape))
            log.debug("Mean: {} , Std: {}".format(self.scaler.mean_, self.scaler.scale_))
        else:
            #remember that no scaler was fitted, so that prediction can skip scaling
            self.scaler = None

        #avoid any NaNs and Infs that may have occurred due to the scaling
        data = np.nan_to_num(data)
        
        #feature selection
        if isinstance(feature_selection_params, str):
            #the parameters may be passed as the string representation of a dict
            feature_selection_params = eval(feature_selection_params)
        self.featureselector, data, metadata = self.run_feature_selection(data, labels, feature_selector, feature_selection_params, feature_selection_threshold, plot_filename) 
        
        #initialize learning method and scoring functions and optimize
        self.learner, self.scorers = self.initialize_learning_method(learner, data, labels, learning_params, optimize, optimization_params, scorers)

        log.info("Data shape before fitting: {}".format(data.shape))

        self.learner.fit(data, labels)
        self.fit = True
        return metadata
    
    def get_model_description(self):
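        '''
        Collect a dictionary describing the trained model, e.g. the RBF kernel
        parameters and support vector counts of an SVC, the per-attribute
        coefficients of a linear model, or the numeric parameters of other learners.
        '''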
        params = {}
        
        if self.scaler:
            params = self.scaler.get_params(deep=True)
        try: #these are for SVC
            if self.learner.kernel == "rbf":
                params["gamma"] = self.learner.gamma
                params["C"] = self.learner.C
                for i, n_support in enumerate(self.learner.n_support_):
                    params["n_{}".format(i)] = n_support
                log.debug(len(self.learner.dual_coef_))
                return params
            elif self.learner.kernel == "linear":
                coefficients = self.learner.coef_
                att_coefficients = {}
                for attname, coeff in zip(self.attribute_set.get_names_pairwise(), coefficients[0]):
                    att_coefficients[attname] = coeff
                return att_coefficients
        except AttributeError:
            pass
        try: #adaboost etc
            params = self.learner.get_params()
            numeric_params = OrderedDict()
            for key, value in params.items():
                try:
                    value = float(value)
                except (ValueError, TypeError):
                    continue
                numeric_params[key] = value
            return numeric_params
        except Exception:
            pass
        return {}
    
    
    def get_ranked_sentence(self, parallelsentence, critical_attribute="rank_predicted", 
                            new_rank_name="rank_hard", 
                            del_orig_class_att=False, 
                            bidirectional_pairs=False, 
                            ties=True,
                            reconstruct='hard'):
        """
        """
        if type(self.learner) == str:
            if self.classifier:
                self.learner = self.classifier
                # this is to provide backwards compatibility for old models
                # whose classes used different attribute names
                try:
                    self.learner._dual_coef_ = self.learner.dual_coef_
                    self.learner._intercept_ = self.learner.intercept_
                except AttributeError:
                    # it's ok if the model doesn't have these variables
                    pass

                if not hasattr(self.learner, "classes_"):
                    # backwards compatibility for old LogisticRegression models
                    self.learner.classes_ = [-1, 1]

        #de-compose multiranked sentence into pairwise comparisons
        pairwise_parallelsentences = parallelsentence.get_pairwise_parallelsentences(bidirectional_pairs=bidirectional_pairs,
                                                                                     class_name=self.class_name,
                                                                                     ties=ties)        
        if len(parallelsentence.get_translations()) == 1:
            log.warning("Parallelsentence has only one target sentence")
            parallelsentence.tgt[0].add_attribute(new_rank_name, 1)
            return parallelsentence, {}
        elif len(parallelsentence.get_translations()) == 0:
            return parallelsentence, {}
        #list that will hold the pairwise parallel sentences including the learner's decision
        classified_pairwise_parallelsentences = []
        resultvector = {}
        
        for pairwise_parallelsentence in pairwise_parallelsentences:
            #convert pairwise parallel sentence into an orange instance
            instance = parallelsentence_to_instance(pairwise_parallelsentence, attribute_set=self.attribute_set)
            #scale data instance to mean, based on trained scaler
            if self.scaler:
                try:
                    instance = np.nan_to_num(instance)
                    instance = self.scaler.transform(instance)
                except ValueError as e:
                    log.error("Could not transform instance: {}, scikit replied: {}".format(instance, e))
                    #raise ValueError(e)
                    pass
            try:
                if self.featureselector:
                    instance = np.nan_to_num(instance)
                    instance = self.featureselector.transform(instance)
            except AttributeError:
                pass
            log.debug('Instance = {}'.format(instance)) 
            #make sure no NaN or inf appears in the instance
            instance = np.nan_to_num(instance)
            #run learner for this instance
            predicted_value = self.learner.predict(instance)
            try:
                distribution = dict(zip(self.learner.classes_, self.learner.predict_proba(instance)[0]))
            except AttributeError: 
                #if learner does not support per-class probability (e.g. LinearSVC) assign 0.5
                distribution = dict([(cl, 0.5) for cl in self.learner.classes_])
            log.debug("Distribution: {}".format(distribution))
            log.debug("Predicted value: {}".format(predicted_value))
            #even if we have a binary learner, it may be that it cannot decide between two classes
            #for us, this means a tie
            if not bidirectional_pairs and distribution and len(distribution)==2 and float(distribution[1])==0.5:
                predicted_value = 0
                distribution[predicted_value] = 0.5
                
            log.debug("{}, {}, {}".format(pairwise_parallelsentence.get_system_names(), predicted_value, distribution))
            
            
            #gather metadata from the classification, which may be needed later
            resultvector.update({'systems' : pairwise_parallelsentence.get_system_names(),
                                 'value' : predicted_value,
                                 'distribution': distribution,
                                 'confidence': distribution[int(predicted_value)],
#                                 'instance' : instance,
                                 })
            
            #add the new predicted ranks as attributes of the new pairwise sentence
            pairwise_parallelsentence.add_attributes({"rank_predicted":predicted_value,
                                                       "prob_-1":distribution[-1],
                                                       "prob_1":distribution[1]
                                                       })
            
            classified_pairwise_parallelsentences.append(pairwise_parallelsentence)

        
        #gather all classified pairwise comparisons into one parallel sentence again
        sentenceset = CompactPairwiseParallelSentenceSet(classified_pairwise_parallelsentences)
        if reconstruct == 'hard':
            log.debug("Applying hard reconstruction to produce rank {}".format(new_rank_name))
            ranked_sentence = sentenceset.get_multiranked_sentence(critical_attribute=critical_attribute, 
                                                               new_rank_name=new_rank_name, 
                                                               del_orig_class_att=del_orig_class_att)
        else:
            attribute1 = "prob_-1"
            attribute2 = "prob_1"
            log.debug("Applying soft reconstruction to produce rank {}".format(new_rank_name))
            try:
                ranked_sentence = sentenceset.get_multiranked_sentence_with_soft_ranks(attribute1, attribute2, 
                        critical_attribute, new_rank_name, normalize_ranking=False)
            except Exception:
                raise ValueError("Sentenceset {} from {} caused exception".format(classified_pairwise_parallelsentences, parallelsentence))
        return ranked_sentence, resultvector
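
A minimal usage sketch follows; the constructor call, the dataset file name and the attribute set are assumptions for illustration and are not part of the excerpt above.

# Hypothetical usage: the constructor arguments, the dataset file name and the
# attribute set below are assumed, not taken from the excerpt.
ranker = SkRanker(learner="SVC")
metadata = ranker.train("train.dataset",
                        attribute_set=my_attribute_set,
                        class_name="rank",
                        optimize=False)
ranked_sentence, resultvector = ranker.get_ranked_sentence(parallelsentence,
                                                           reconstruct="hard")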