class SkRanker(Ranker, SkLearner):
    '''
    Basic ranker wrapping scikit-learn functions
    '''

    def train(self, dataset_filename,
              scale=True,
              feature_selector=None,
              feature_selection_params=None,
              feature_selection_threshold=.25,
              learning_params=None,
              optimize=True,
              optimization_params=None,
              scorers=None,
              attribute_set=None,
              class_name=None,
              metaresults_prefix="./0-",
              **kwargs):
        """
        Train the wrapped scikit-learn learner on the instances read from a
        dataset file, optionally scaling the data, running feature selection
        and optimizing hyper-parameters.

        @param dataset_filename: path of the dataset to load via
            dataset_to_instances()
        @param scale: whether to standardize the data with a StandardScaler
        @param feature_selector: name of the feature selection method, passed
            through to run_feature_selection()
        @param feature_selection_params: dict (or its string repr) of
            parameters for the feature selector; defaults to {}
        @param feature_selection_threshold: threshold for feature selection
        @param learning_params: dict of parameters for the learner; defaults
            to {}
        @param optimize: whether to run hyper-parameter optimization
        @param optimization_params: dict of parameters for the optimizer;
            defaults to {}
        @param scorers: list of scorer names; defaults to ['f1_score']
        @param attribute_set: attribute set used to vectorize instances; also
            remembered on self for prediction time
        @param class_name: name of the class attribute; also remembered on
            self for prediction time
        @param metaresults_prefix: filename prefix for meta-result artifacts
            (e.g. the feature selection plot)
        @return: metadata produced by the feature selection step
        """
        # avoid the shared-mutable-default-argument pitfall: fresh objects
        # per call instead of module-lifetime defaults
        if feature_selection_params is None:
            feature_selection_params = {}
        if learning_params is None:
            learning_params = {}
        if optimization_params is None:
            optimization_params = {}
        if scorers is None:
            scorers = ['f1_score']

        plot_filename = "{}{}".format(metaresults_prefix, "featureselection.pdf")
        data, labels = dataset_to_instances(dataset_filename, attribute_set,
                                            class_name, **kwargs)
        learner = self.learner

        # the class must remember the attribute_set and the class_name in
        # order to reproduce the vectors at prediction time
        self.attribute_set = attribute_set
        self.class_name = class_name

        # scale data to the mean
        if scale:
            log.info("Scaling datasets...")
            log.debug("Data shape before scaling: {}".format(data.shape))
            self.scaler = StandardScaler()
            data = self.scaler.fit_transform(data)
            log.debug("Data shape after scaling: {}".format(data.shape))
            # old scikit-learn exposes the scale as std_; newer versions
            # renamed it to scale_ — support both so the debug log can't crash
            scaler_std = getattr(self.scaler, "std_",
                                 getattr(self.scaler, "scale_", None))
            log.debug("Mean: {} , Std: {}".format(self.scaler.mean_, scaler_std))

        # avoid any NaNs and Infs that may have occurred due to the scaling
        data = np.nan_to_num(data)

        # feature selection
        if isinstance(feature_selection_params, basestring):
            # SECURITY: eval() executes arbitrary code; only acceptable here
            # because the string comes from a trusted local configuration.
            # Do not feed it untrusted input (ast.literal_eval would be safer
            # if the value is always a literal dict).
            feature_selection_params = eval(feature_selection_params)
        self.featureselector, data, metadata = self.run_feature_selection(
            data, labels, feature_selector, feature_selection_params,
            feature_selection_threshold, plot_filename)

        # initialize learning method and scoring functions and optimize
        self.learner, self.scorers = self.initialize_learning_method(
            learner, data, labels, learning_params, optimize,
            optimization_params, scorers)

        log.info("Data shape before fitting: {}".format(data.shape))
        self.learner.fit(data, labels)
        self.fit = True
        return metadata

    def get_model_description(self):
        """
        Return a dict describing the fitted model's parameters.

        For an SVC with an RBF kernel it returns gamma, C and the per-class
        support counts; for a linear kernel it returns the coefficient of
        each pairwise attribute; for other learners (e.g. AdaBoost) it
        returns whatever get_params() values can be coerced to float.
        Returns {} when nothing can be extracted.
        """
        params = {}
        if self.scaler:
            params = self.scaler.get_params(deep=True)
        try:
            # these are for SVC
            if self.learner.kernel == "rbf":
                params["gamma"] = self.learner.gamma
                params["C"] = self.learner.C
                for i, n_support in enumerate(self.learner.n_support_):
                    params["n_{}".format(i)] = n_support
                log.debug(len(self.learner.dual_coef_))
                return params
            elif self.learner.kernel == "linear":
                coefficients = self.learner.coef_
                att_coefficients = {}
                # pair every pairwise attribute name with its learned weight
                for attname, coeff in zip(
                        self.attribute_set.get_names_pairwise(),
                        coefficients[0]):
                    att_coefficients[attname] = coeff
                return att_coefficients
        except AttributeError:
            # not an SVC-like learner; fall through to the generic path
            pass
        try:
            # adaboost etc: keep only the parameters with numeric values
            params = self.learner.get_params()
            numeric_params = OrderedDict()
            for key, value in params.iteritems():
                try:
                    # float(None) / float(dict) raise TypeError, not
                    # ValueError — catching only ValueError used to let the
                    # exception escape and discard ALL collected parameters
                    value = float(value)
                except (TypeError, ValueError):
                    continue
                numeric_params[key] = value
            return numeric_params
        except Exception:
            # best-effort extraction: log instead of silently masking
            # unexpected failures with a bare except
            log.debug("Could not extract numeric learner parameters",
                      exc_info=True)
        return {}

    def get_ranked_sentence(self, parallelsentence,
                            critical_attribute="rank_predicted",
                            new_rank_name="rank_hard",
                            del_orig_class_att=False,
                            bidirectional_pairs=False,
                            ties=True,
                            reconstruct='hard'):
        """
        Rank the translations of a multi-translation parallel sentence.

        The sentence is decomposed into pairwise comparisons, each pair is
        classified by the trained learner (after applying the stored scaler
        and feature selector), and the pairwise decisions are recomposed into
        a single ranking.

        @param parallelsentence: the sentence whose translations are ranked
        @param critical_attribute: pairwise attribute holding the learner's
            decision, used by hard reconstruction
        @param new_rank_name: name of the rank attribute written onto the
            result
        @param del_orig_class_att: whether to delete the original class
            attribute during reconstruction
        @param bidirectional_pairs: whether pairs were produced in both
            directions; when False, an undecided 0.5/0.5 distribution is
            treated as a tie (predicted value 0)
        @param ties: whether ties are allowed when decomposing into pairs
        @param reconstruct: 'hard' for majority-style reconstruction from
            the predicted labels, anything else for soft reconstruction from
            the per-class probabilities
        @return: tuple (ranked parallel sentence, result metadata dict)
        """
        if isinstance(self.learner, str):
            if self.classifier:
                self.learner = self.classifier

        # this is to provide backwards compatibility for old models
        # whose classes used different attribute names
        try:
            self.learner._dual_coef_ = self.learner.dual_coef_
            self.learner._intercept_ = self.learner.intercept_
        except AttributeError:
            # it's ok if the model doesn't have these variables
            pass
        try:
            # backwards compatibility for old LogisticRegression: probe for
            # classes_ and synthesize it when missing
            self.learner.classes_
        except AttributeError:
            self.learner.classes_ = [-1, 1]

        # de-compose multiranked sentence into pairwise comparisons
        pairwise_parallelsentences = parallelsentence.get_pairwise_parallelsentences(
            bidirectional_pairs=bidirectional_pairs,
            class_name=self.class_name,
            ties=ties)

        # degenerate cases: nothing to compare
        if len(parallelsentence.get_translations()) == 1:
            log.warning("Parallelsentence has only one target sentence")
            parallelsentence.tgt[0].add_attribute(new_rank_name, 1)
            return parallelsentence, {}
        elif len(parallelsentence.get_translations()) == 0:
            return parallelsentence, {}

        # list that will hold the pairwise parallel sentences including the
        # learner's decision
        classified_pairwise_parallelsentences = []
        resultvector = {}

        for pairwise_parallelsentence in pairwise_parallelsentences:
            # convert pairwise parallel sentence into an orange instance
            instance = parallelsentence_to_instance(
                pairwise_parallelsentence, attribute_set=self.attribute_set)

            # scale data instance to mean, based on trained scaler
            if self.scaler:
                try:
                    instance = np.nan_to_num(instance)
                    instance = self.scaler.transform(instance)
                except ValueError as e:
                    # best effort: log and continue with the unscaled instance
                    log.error("Could not transform instance: {}, scikit replied: {}".format(instance, e))
            try:
                if self.featureselector:
                    instance = np.nan_to_num(instance)
                    instance = self.featureselector.transform(instance)
            except AttributeError:
                # older models have no featureselector attribute
                pass
            log.debug('Instance = {}'.format(instance))

            # make sure no NaN or inf appears in the instance
            instance = np.nan_to_num(instance)

            # run learner for this instance
            predicted_value = self.learner.predict(instance)
            try:
                distribution = dict(zip(self.learner.classes_,
                                        self.learner.predict_proba(instance)[0]))
            except AttributeError:
                # if learner does not support per-class probability
                # (e.g. LinearSVC) assign 0.5
                distribution = dict([(cl, 0.5) for cl in self.learner.classes_])
            log.debug("Distribution: {}".format(distribution))
            log.debug("Predicted value: {}".format(predicted_value))

            # even if we have a binary learner, it may be that it cannot
            # decide between two classes; for us, this means a tie
            if (not bidirectional_pairs and distribution
                    and len(distribution) == 2
                    and float(distribution[1]) == 0.5):
                predicted_value = 0
                distribution[predicted_value] = 0.5

            log.debug("{}, {}, {}".format(
                pairwise_parallelsentence.get_system_names(),
                predicted_value, distribution))

            # gather several metadata from the classification, which may be
            # needed
            resultvector.update({
                'systems': pairwise_parallelsentence.get_system_names(),
                'value': predicted_value,
                'distribution': distribution,
                'confidence': distribution[int(predicted_value)],
            })

            # add the new predicted ranks as attributes of the new pairwise
            # sentence
            pairwise_parallelsentence.add_attributes({
                "rank_predicted": predicted_value,
                "prob_-1": distribution[-1],
                "prob_1": distribution[1],
            })
            classified_pairwise_parallelsentences.append(pairwise_parallelsentence)

        # gather all classified pairwise comparisons into one parallel
        # sentence again
        sentenceset = CompactPairwiseParallelSentenceSet(
            classified_pairwise_parallelsentences)
        if reconstruct == 'hard':
            log.debug("Applying hard reconstruction to produce rank {}".format(new_rank_name))
            ranked_sentence = sentenceset.get_multiranked_sentence(
                critical_attribute=critical_attribute,
                new_rank_name=new_rank_name,
                del_orig_class_att=del_orig_class_att)
        else:
            attribute1 = "prob_-1"
            attribute2 = "prob_1"
            log.debug("Applying soft reconstruction to produce rank {}".format(new_rank_name))
            try:
                ranked_sentence = sentenceset.get_multiranked_sentence_with_soft_ranks(
                    attribute1, attribute2, critical_attribute, new_rank_name,
                    normalize_ranking=False)
            except Exception:
                # re-raise with enough context to identify the offending set
                raise ValueError("Sentenceset {} from {} caused exception".format(
                    classified_pairwise_parallelsentences, parallelsentence))
        return ranked_sentence, resultvector