def get_training_data_matrix(self, normalize, ablation_features=(), toExclude=()):
    """Build the training feature matrix from the ML data file.

    Args:
        normalize: if True, the feature matrix is normalized.
        ablation_features: features to exclude during the ablation study.
        toExclude: paradigms that must not enter the training data
            (used for cross-validation).

    Returns:
        A tuple ``(headlines, matrix, targets)``:
        - headlines: the feature headlines;
        - matrix: a sparse scipy training matrix;
        - targets: a list of target values, one per training sample.

    Raises:
        TypeError: if ``normalize`` is not a bool.
    """
    # `assert` is stripped under -O, so validate input explicitly.
    if not isinstance(normalize, bool):
        raise TypeError("normalize must be a bool, got %r" % type(normalize))
    self._check_if_ablation_appropriate(ablation_features)

    # Additional data initialization:
    self._read_category_val_alternations(self.categoryPath)  # sets self.categoryDescription
    self._read_paradigm_lengths()  # sets self.pLengths

    excluded = set(toExclude)  # O(1) membership tests inside the loop
    seenParadigms = set()
    with codecs.open(self.MLDataPath, 'r', 'utf-8-sig') as f:
        data = json.load(f)

    processedData = []
    targets = []
    for lexeme in data:
        if lexeme["paradigm"] in excluded:
            continue
        seenParadigms.add(lexeme["paradigm"])
        processedData.append(
            self._convert_lexeme_to_feature_dic(lexeme, ablation_features))
        targets.append(FeatureExtractor.is_positive_example(lexeme))

    headlines, matrix = self._dic_list_to_matrix(processedData, normalize)
    if seenParadigms:
        # Sort for deterministic, reproducible log output.
        logging.info("Training set paradigms: %s", u" ".join(sorted(seenParadigms)))
    else:
        logging.critical("Training set is empty.")
    return headlines, matrix, targets
def _category_entropy_variance(self, lexeme):
    """Thin wrapper: compute category-entropy variance for *lexeme*
    using this instance's category description."""
    description = self.categoryDescription
    return FeatureExtractor.category_entropy_variance(lexeme, description)
def _number_of_one_value_categories(self, lexeme):
    """Delegate to FeatureExtractor.number_of_one_value_categories,
    supplying the instance's category description."""
    return FeatureExtractor.number_of_one_value_categories(
        lexeme, self.categoryDescription)
def _entropy_to_paradigm_length(self, lexeme):
    """Thin wrapper: entropy-to-paradigm-length feature for *lexeme*,
    based on the instance's paradigm lengths table."""
    lengths = self.pLengths
    return FeatureExtractor.entropy_to_paradigm_length(lexeme, lengths)
def _part_of_found_gramm(self, lexeme):
    """Delegate to FeatureExtractor.part_of_found_grammars with the
    instance's paradigm lengths table.

    NOTE(review): the method name says "gramm" while the target is
    "grammars" — presumably intentional shorthand; confirm against callers.
    """
    lengths = self.pLengths
    return FeatureExtractor.part_of_found_grammars(lexeme, lengths)
def _part_of_found_flex(self, lexeme):
    """Thin wrapper: part-of-found-flex feature for *lexeme*, using the
    instance's paradigm lengths table."""
    return FeatureExtractor.part_of_found_flex(lexeme, self.pLengths)
def _min_category_entropy(self, lexeme):
    """Delegate to FeatureExtractor.min_category_entropy, supplying the
    instance's category description."""
    description = self.categoryDescription
    return FeatureExtractor.min_category_entropy(lexeme, description)