def make_feature_maker(algo_name, training_data_dir, train_set_name, dev_set_name):
  """Constructs a `feature_lib.FeatureMaker` for the given data directory.

  Args:
    algo_name: Name of the algorithm, used only for log messages.
    training_data_dir: Directory containing the train/dev CSV files.
    train_set_name: Basename (without extension) of the training CSV.
    dev_set_name: Basename (without extension) of the development CSV.

  Returns:
    A `feature_lib.FeatureMaker` wired to the train/dev CSVs and the
    data-info file for this directory.
  """
  logging.info("[%s] Initializing feature maker ...", algo_name)
  csv_paths = [
      os.path.join(training_data_dir, name + ".csv")
      for name in (train_set_name, dev_set_name)
  ]
  info_path = data_info_lib.data_info_path_for_testing(training_data_dir)
  return feature_lib.FeatureMaker(csv_paths[0], csv_paths[1], info_path)
def main(unused_argv):
  """Trains/evaluates models, or cross-validates to find best configurations.

  Validates the required command-line flags, assembles the list of target
  features (either a single --target_feature or one per line from
  --target_feature_file), builds a FeatureMaker over the training/dev data,
  and then either runs cross-validation (writing the winning configurations
  to --best_configurations_file as JSON) or a plain train-and-evaluate pass.

  Raises:
    ValueError: If any required flag is missing.
  """
  if not FLAGS.dev_data_file:
    raise ValueError("Specify --dev_data_file")
  if not FLAGS.training_data_file:
    raise ValueError("Specify --training_data_file")
  if not FLAGS.data_info_file:
    raise ValueError("Specify --data_info_file")
  if not (FLAGS.target_feature or FLAGS.target_feature_file):
    raise ValueError("Specify --target_feature or --target_feature_file")
  if FLAGS.cross_validate and not FLAGS.best_configurations_file:
    raise ValueError(
        "Specify --best_configurations_file in cross-validation "
        "mode")
  if FLAGS.target_feature_file:
    # BUGFIX: read with the project-wide encoding, matching the explicit
    # encoding used for the JSON write below; previously this relied on the
    # platform default encoding.
    with open(FLAGS.target_feature_file, encoding=const.ENCODING) as s:
      features = [line.strip() for line in s]
  else:
    features = [FLAGS.target_feature]

  # Process features.
  feature_maker = feature_lib.FeatureMaker(FLAGS.training_data_file,
                                           FLAGS.dev_data_file,
                                           FLAGS.data_info_file)

  # Perform cross-validation to establish the best configurations of models
  # and features or simply train and evaluate.
  if FLAGS.cross_validate:
    best_configs = _cross_validation_training(feature_maker, features)
    logging.info("Saving best configs to \"%s\" ...",
                 FLAGS.best_configurations_file)
    with open(FLAGS.best_configurations_file, "w",
              encoding=const.ENCODING) as f:
      json.dump(best_configs, f)
  else:
    _train_and_evaluate(feature_maker, features)
def _implicational_nonzero_values(df):
  """Returns the truthy values found in the plain implicational columns.

  Scans every "majval" column of `df` except the genus/family/neighborhood
  majority-value columns, and collects each non-zero (non-NA) entry. A
  non-empty result shows the implicational features were actually populated.
  """
  values = []
  for fname in df.columns:
    if ("majval" in fname and "genus" not in fname and
        "family" not in fname and "neighborhood" not in fname):
      values.extend(v for v in df[fname] if v)
  return values


def main(unused_argv):
  """Smoke-checks `lib.FeatureMaker` column generation and selection.

  Processes two target features, asserting that expected majority-value,
  count, and implicational columns appear in both the training and dev
  frames, and that `select_columns` discards exactly the requested groups.
  """
  feature_maker = lib.FeatureMaker(
      FLAGS.training_data, FLAGS.dev_data, FLAGS.data_info)
  training_df, dev_df = feature_maker.process_data(
      "Order_of_Subject,_Object_and_Verb")
  long_implicational = (
      "The_Position_of_Negative_Morphemes_in_SOV_Languages"
      "@18 SV&OV&NegV@Order_of_Subject,_Object_and_Verb_majval")
  assert "family_majval" in dev_df.columns
  assert "family_count" in dev_df.columns
  assert long_implicational in dev_df.columns
  assert long_implicational in training_df.columns
  # Show that there are non-zero (non-NA) entries for implicationals.
  # (Previously this scan was duplicated inline for both frames.)
  assert _implicational_nonzero_values(training_df)
  assert _implicational_nonzero_values(dev_df)

  # Remove some of the columns.
  #
  # Obviously if you use this make sure you do the same thing to both training
  # and dev.
  smaller_dev_df = feature_maker.select_columns(
      dev_df, discard_counts=True, discard_implicationals=True)
  assert "wals_code" in smaller_dev_df.columns
  assert "target_value" in smaller_dev_df.columns
  assert long_implicational not in smaller_dev_df.columns
  assert "family_count" not in smaller_dev_df.columns

  # Remove some different columns
  smaller_dev_df = feature_maker.select_columns(
      dev_df, discard_counts=True, discard_implicationals=False)
  assert "wals_code" in smaller_dev_df.columns
  assert "target_value" in smaller_dev_df.columns
  assert long_implicational in smaller_dev_df.columns
  assert "family_count" not in smaller_dev_df.columns

  # Try another feature
  training_df, dev_df = feature_maker.process_data("Hand_and_Arm")
  assert "family_majval" in dev_df.columns
  assert "family_count" in dev_df.columns
  long_implicational = (
      "Number_of_Cases@9 Exclusively borderline case-marking"
      "@Hand_and_Arm_majval")
  assert long_implicational in dev_df.columns
  for c in dev_df.columns:
    assert c in training_df.columns