def fit(self, X, y=None, n_estimators=9, max_features="auto",
        sample_weight=False, random_state=None):
    """Train a Random Forest classifier on relations extracted from X.

    Args:
        X: Documents handed to the feature extractor.
        y: Ignored; present for scikit-learn estimator API compatibility.
        n_estimators: Number of trees in the forest.
        max_features: Either an int, or one of "auto", "log", "sqrt"
            selecting how the per-split feature count is derived from the
            extracted feature matrix via ``self._get_max_features``.
        sample_weight: If truthy, weight training samples by label
            frequency via ``to_weights``; otherwise train unweighted.
        random_state: Seed forwarded to ``RandomForestClassifier``.

    Returns:
        self (also returned early when extraction yields no examples).

    Raises:
        ValueError: If ``max_features`` is an unrecognized string.
        TypeError: If ``max_features`` resolves to neither str nor int.
    """
    # Extract features and labels first so we can bail out early when
    # there is nothing to train on.
    features, labels = self.feature_extractor.transform(
        X, relation_labels=self.relation_labels)
    if is_empty(features):
        log.error("No examples to train, quitting...")
        return self

    log.info("Checking parameters...")
    # isinstance() rather than type() comparison: idiomatic and accepts
    # subclasses of str/int.
    if isinstance(max_features, str):
        if max_features == "auto":
            max_features = self._get_max_features(features)
        elif max_features == "log":
            max_features = self._get_max_features(features, method=log2)
        elif max_features == "sqrt":
            max_features = self._get_max_features(features, method=sqrt)
        else:
            raise ValueError(
                "Unknown method '{}' for feature selection in Random Forest"
                .format(max_features))
    if not isinstance(max_features, int):
        raise TypeError(
            "The parameter 'max_features' must be either a string or integer."
        )

    self.config.set_parameters({
        "n_estimators": n_estimators,
        "max_features": max_features,
        "random_state": random_state
    })

    # Build the model from the stored configuration so that the config
    # object remains the single source of truth for hyperparameters.
    self.rf = RandomForestClassifier(
        n_estimators=self.config.get_parameter("n_estimators"),
        max_features=self.config.get_parameter("max_features"),
        random_state=self.config.get_parameter("random_state"))

    log.info("Training Random Forest...")
    weights = to_weights(labels) if sample_weight else None
    self.rf.fit(features, labels, sample_weight=weights)
    return self
def fit(self, X, y=None, sample_weight=False):
    """Train a Multinomial Naive Bayes classifier on relations from X.

    Args:
        X: Documents handed to the feature extractor.
        y: Ignored; present for scikit-learn estimator API compatibility.
        sample_weight: If truthy, weight training samples by label
            frequency via ``to_weights``; otherwise train unweighted.

    Returns:
        self (also returned early when extraction yields no examples).
    """
    # Extract features and labels first and bail out early when there is
    # nothing to train on — avoids needlessly validating config and
    # constructing a model (and matches the Random Forest fit's order).
    features, labels = self.feature_extractor.transform(
        X, relation_labels=self.relation_labels)
    if is_empty(features):
        log.error("No examples to train, quitting...")
        return self

    log.info("Checking parameters...")
    self.config.validate()

    # create a model
    self.nb = MultinomialNB()

    log.info("Training Naive Bayes...")
    weights = to_weights(labels) if sample_weight else None
    self.nb.fit(features, labels, sample_weight=weights)
    return self
def fit(self, X, y=None, max_iterations=100, C=1, sample_weight=False):
    """Train a calibrated linear SVM on relations extracted from X.

    Wraps ``LinearSVC`` in ``CalibratedClassifierCV`` so the model can
    later emit class probabilities via ``predict_proba``.

    Args:
        X: Documents handed to the feature extractor.
        y: Ignored; present for scikit-learn estimator API compatibility.
        max_iterations: Iteration cap forwarded to ``LinearSVC``.
        C: Regularization strength forwarded to ``LinearSVC``.
        sample_weight: If truthy, weight training samples by label
            frequency via ``to_weights``; otherwise train unweighted.

    Returns:
        self (also returned early when extraction yields no examples).
    """
    # Extract features and labels first and bail out early when there is
    # nothing to train on — avoids needlessly validating config and
    # constructing a model (and matches the Random Forest fit's order).
    features, labels = self.feature_extractor.transform(
        X, relation_labels=self.relation_labels)
    if is_empty(features):
        log.error("No examples to train, quitting...")
        return self

    log.info("Checking parameters...")
    self.config.set_parameters({"max_iterations": max_iterations, "C": C})
    self.config.validate()

    # Read hyperparameters back from config so it stays the single
    # source of truth for what the model was built with.
    self.svm = CalibratedClassifierCV(
        LinearSVC(max_iter=self.config.get_parameter("max_iterations"),
                  C=self.config.get_parameter("C")))

    log.info("Training SVM...")
    weights = to_weights(labels) if sample_weight else None
    self.svm.fit(features, labels, sample_weight=weights)
    return self
def transform(self, X, y=None):
    """Predict relation labels for the documents in X with the trained SVM.

    Args:
        X: Documents handed to the feature extractor.
        y: Ignored; present for scikit-learn transformer API compatibility.

    Returns:
        Annotated documents built from the per-document best label and
        its probability, or X unchanged when no features were extracted.
    """
    # get features (labels are ignored)
    features, _ = self.feature_extractor.transform(
        X, relation_labels=self.relation_labels)
    if is_empty(features):
        return X

    # make predictions
    log.info("Predicting relations in {} documents with SVM...".format(
        len(X)))
    probs = self.svm.predict_proba(features)
    # the order of labels corresponds to the order of probabilities
    labels = self.svm.classes_

    # argmax yields the index of the highest-probability class directly,
    # replacing the fragile float-equality where() lookup; ties resolve
    # to the first occurrence, exactly as numpy.where(...)[0][0] did.
    predicted_labels = []
    for prob in probs:
        best = int(numpy.argmax(prob))
        predicted_labels.append({labels[best]: prob[best]})

    # make annotated documents
    return self.feature_extractor.predictions_to_annotated_documents(
        predicted_labels)