# Consolidated imports assumed by the snippets below; the SKLL- and
# sklearn-related module paths reflect common versions and may need
# adjusting for yours.
import argparse
import glob
import logging
import os
import sys
from collections import defaultdict
from os import remove
from os.path import basename, dirname, exists, join, normpath, splitext

import numpy as np
import pandas as pd
from six import iteritems
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC

from skll import Learner
from skll.version import __version__


def main():
    '''
    Handles command line arguments and gets things started.
    '''
    parser = argparse.ArgumentParser(
        description="Prints out the weights of a given model.",
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int, default=50)
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args()

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    weights = learner.model_params

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    for feat, val in sorted(iteritems(weights), key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    parser = argparse.ArgumentParser(
        description="Prints out the weights of a given model.",
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int, default=50)
    parser.add_argument('--sign',
                        choices=['positive', 'negative', 'all'],
                        default='all',
                        help='show only positive, only negative, '
                             'or all weights')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    weight_items = iteritems(weights)
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            # Some learners (e.g. LinearSVR) may return a list of intercepts
            if isinstance(intercept['_intercept_'], np.ndarray):
                intercept_list = ["%.12f" % i
                                  for i in intercept['_intercept_']]
                print("intercept = {}".format(intercept_list))
            else:
                print("intercept = {:.12f}".format(intercept['_intercept_']))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{:.12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
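
# A minimal sketch of exercising the entry point above without a shell:
# since main() accepts an argv list, the arguments can be passed directly.
# 'model.pkl' is a hypothetical path to a trained SKLL model file.
main(['model.pkl', '--k', '10', '--sign', 'positive'])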
def update_model(model_file):
    """Read in the model file and save it again."""
    model_dir = dirname(model_file)

    # get the list of current files so that we can remove them later
    # to ensure there are no stranded .npy files
    npy_files = glob.glob(join(model_dir, '*.npy'))

    # now load the SKLL model
    model = Learner.from_file(model_file)

    # delete the existing .npy files; the model file will get overwritten,
    # but we do not know the exact number of current .npy files
    for npy_file in npy_files:
        remove(npy_file)

    model.save(model_file)
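
# A minimal usage sketch for update_model(): re-serialize a model in place,
# clearing out any stale companion .npy files next to it first. The path
# below is hypothetical.
update_model('models/english.model')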
def __init__(self, model_path, threshold=None, positive_class=1):
    '''
    Initialize the predictor.

    :param model_path: Path to use when loading trained model.
    :type model_path: str
    :param threshold: If the model we're using is generating probabilities
                      of the positive class, return 1 if it meets/exceeds
                      the given threshold and 0 otherwise.
    :type threshold: float
    :param positive_class: If the model is only being used to predict the
                           probability of a particular class, this
                           specifies the index of the class we're
                           predicting. 1 = second class, which is default
                           for binary classification.
    :type positive_class: int
    '''
    self._learner = Learner.from_file(model_path)
    self._pos_index = positive_class
    self.threshold = threshold
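
# A minimal usage sketch for the initializer above, assuming it belongs to
# a class named Predictor (as in SKLL); the model path is hypothetical.
# With threshold=0.5, probability outputs for the positive class would be
# binarized at 0.5.
predictor = Predictor('models/english.model', threshold=0.5)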
# NOTE: the helpers used below (read_json_file, check_main_config,
# locate_file, check_flag_column, rename_default_columns, filter_on_column,
# preprocess_feature, predict_with_model, trim) are rsmtool internals whose
# import paths vary across rsmtool versions, so no import lines are given
# for them here.
def compute_and_save_predictions(config_file, output_file, feats_file):
    """
    Generate predictions using the information in the config file
    and save them into the given output file.
    """
    logger = logging.getLogger(__name__)

    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmpredict')

    # get the directory where the config file lives;
    # if this is the 'expm' directory, then go up one level.
    configpath = dirname(config_file)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = locate_file(config_obj['input_features_file'],
                                      configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not '
                                'exist'.format(config_obj['input_features_file']))

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the column name that will hold the ID
    id_column = config_obj['id_column']

    # get the column name for human score (if any)
    human_score_column = config_obj['human_score_column']

    # get the column name for second human score (if any)
    second_human_score_column = config_obj['second_human_score_column']

    # get the column names for subgroups (if any)
    subgroups = config_obj['subgroups']

    # get the column names for flag columns (if any)
    flag_column_dict = check_flag_column(config_obj)

    # get the name for the candidate_column (if any)
    candidate_column = config_obj['candidate_column']

    # get the directory of the experiment
    experiment_dir = locate_file(config_obj['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not '
                                'exist.'.format(config_obj['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError('The directory {} does not contain '
                                    'the output of an rsmtool '
                                    'experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any '
                                'rsmtool models.'.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the '
                                'experiment "{}". The following experiments '
                                'are contained in this directory: '
                                '{}'.format(experiment_output_dir,
                                            experiment_id,
                                            experiment_ids))

    # check that the directory contains the other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model '
                                    'training'.format(experiment_output_dir,
                                                      expected_file_name))

    # read in the given features but make sure that the `id_column`,
    # `candidate_column` and subgroups are read in as strings
    logger.info('Reading features from {}'.format(input_features_file))
    string_columns = [id_column, candidate_column] + subgroups
    converter_dict = dict([(column, str)
                           for column in string_columns if column])

    df_input = pd.read_csv(input_features_file, converters=converter_dict)

    # make sure that the columns specified in the config file actually exist
    columns_to_check = [id_column] + subgroups + list(flag_column_dict.keys())

    # add subgroups and the flag columns to the list of columns
    # that will be added to the final file
    columns_to_copy = subgroups + list(flag_column_dict.keys())

    # human_score_column will be set to sc1 by default;
    # we only raise an error if it's set to something else.
    # However, since we cannot distinguish whether the column was set
    # to sc1 by default or specified as such in the config file,
    # we append it to the output anyway as long as it is in the input file
    if human_score_column != 'sc1' or 'sc1' in df_input.columns:
        columns_to_check.append(human_score_column)
        columns_to_copy.append('sc1')

    if candidate_column:
        columns_to_check.append(candidate_column)
        columns_to_copy.append('candidate')

    if second_human_score_column:
        columns_to_check.append(second_human_score_column)
        columns_to_copy.append('sc2')

    missing_columns = set(columns_to_check).difference(df_input.columns)
    if missing_columns:
        raise KeyError("Columns {} from the config file do not exist "
                       "in the data.".format(missing_columns))

    # rename all columns
    df_input = rename_default_columns(df_input,
                                      [],
                                      id_column,
                                      human_score_column,
                                      second_human_score_column,
                                      None,
                                      None,
                                      candidate_column=candidate_column)

    # check that the id_column contains unique values
    if df_input['spkitemid'].size != df_input['spkitemid'].unique().size:
        raise ValueError("The data contains repeated response IDs in {}. "
                         "Please make sure all response IDs are unique and "
                         "re-run the tool.".format(id_column))

    # now we need to pre-process these features using the parameters
    # that are already stored in the _feature.csv file.
    df_feature_info = pd.read_csv(join(experiment_output_dir,
                                       '{}_feature.csv'.format(experiment_id)),
                                  index_col=0)
    required_features = df_feature_info.index.tolist()

    # ensure that all the features that are needed by the model
    # are present in the input file
    input_feature_columns = [c for c in df_input if c != id_column]
    missing_features = set(required_features).difference(input_feature_columns)
    if missing_features:
        raise KeyError('{} is missing the following features: '
                       '{}'.format(feats_file, missing_features))
    extra_features = set(input_feature_columns).difference(required_features +
                                                           [id_column])
    if extra_features:
        logger.warning('The following extraneous features will be '
                       'ignored: {}'.format(extra_features))

    # keep the required features plus the id
    features_to_keep = ['spkitemid'] + required_features

    # check if we actually have the human scores for this data and add
    # sc1 to the pre-processed features for consistency with other tools
    has_human_scores = 'sc1' in df_input
    if has_human_scores:
        features_to_keep.append('sc1')

    df_features = df_input[features_to_keep]

    # preprocess the feature values
    logger.info('Pre-processing input features')

    # first we need to filter out NaNs and any other weird feature
    # values, the same way we did for rsmtool.
    df_filtered = df_features.copy()
    df_excluded = pd.DataFrame(columns=df_filtered.columns)
    for feature_name in required_features:
        newdf, newdf_excluded = filter_on_column(df_filtered,
                                                 feature_name,
                                                 'spkitemid',
                                                 exclude_zeros=False,
                                                 exclude_zero_sd=False)
        del df_filtered
        df_filtered = newdf
        df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("There are no responses left after filtering out "
                         "non-numeric feature values. No analysis will "
                         "be run.")

    df_features = df_filtered.copy()
    df_features_preprocessed = df_features.copy()
    for feature_name in required_features:

        feature_values = df_features[feature_name].values

        feature_transformation = df_feature_info.loc[feature_name]['transform']
        feature_weight = df_feature_info.loc[feature_name]['sign']

        train_feature_mean = df_feature_info.loc[feature_name]['train_mean']
        train_feature_sd = df_feature_info.loc[feature_name]['train_sd']

        train_transformed_mean = \
            df_feature_info.loc[feature_name]['train_transformed_mean']
        train_transformed_sd = \
            df_feature_info.loc[feature_name]['train_transformed_sd']

        # transform the feature values and remove outliers
        df_features_preprocessed[feature_name] = \
            preprocess_feature(feature_values,
                               feature_name,
                               feature_transformation,
                               train_feature_mean,
                               train_feature_sd,
                               exclude_zero_sd=False)

        # now standardize the feature values
        df_features_preprocessed[feature_name] = \
            (df_features_preprocessed[feature_name] -
             train_transformed_mean) / train_transformed_sd

        # Multiply features by weight. Within the current SR timeline,
        # the mean of the transformed train feature used to standardize
        # test features has to be computed before multiplying the train
        # feature by the weight.
        df_features_preprocessed[feature_name] = \
            df_features_preprocessed[feature_name] * feature_weight

    # save the pre-processed features to disk if we were asked to
    if feats_file:
        logger.info('Saving pre-processed feature values '
                    'to {}'.format(feats_file))

        # create any directories needed for the output file
        os.makedirs(dirname(feats_file), exist_ok=True)

        df_features_preprocessed.to_csv(feats_file, index=False)

    # now load the SKLL model to generate the predictions
    model = Learner.from_file(join(experiment_output_dir,
                                   '{}.model'.format(experiment_id)))

    # now generate the predictions for the features using this model
    logger.info('Generating predictions')
    df_predictions = predict_with_model(model, df_features_preprocessed)

    # read in the post-processing parameters from disk
    df_postproc_params = pd.read_csv(
        join(experiment_output_dir,
             '{}_postprocessing_params.csv'.format(experiment_id)))
    trim_min = df_postproc_params['trim_min'].values[0]
    trim_max = df_postproc_params['trim_max'].values[0]
    h1_mean = df_postproc_params['h1_mean'].values[0]
    h1_sd = df_postproc_params['h1_sd'].values[0]
    train_predictions_mean = \
        df_postproc_params['train_predictions_mean'].values[0]
    train_predictions_sd = \
        df_postproc_params['train_predictions_sd'].values[0]

    # now scale the predictions
    logger.info('Rescaling predictions')
    scaled_predictions = (df_predictions['raw'] -
                          train_predictions_mean) / train_predictions_sd
    scaled_predictions = scaled_predictions * h1_sd + h1_mean
    df_predictions['scale'] = scaled_predictions

    # trim and round the predictions
    logger.info('Trimming and rounding predictions')
    df_predictions['raw_trim'] = trim(df_predictions['raw'],
                                      trim_min, trim_max)
    df_predictions['raw_trim_round'] = \
        np.rint(df_predictions['raw_trim']).astype('int64')
    df_predictions['scale_trim'] = trim(df_predictions['scale'],
                                        trim_min, trim_max)
    df_predictions['scale_trim_round'] = \
        np.rint(df_predictions['scale_trim']).astype('int64')

    # add back the columns that we were requested to copy, if any
    if columns_to_copy:
        df_predictions_with_metadata = \
            pd.merge(df_predictions,
                     df_input[['spkitemid'] + columns_to_copy])
        assert len(df_predictions) == len(df_predictions_with_metadata)
    else:
        df_predictions_with_metadata = df_predictions.copy()

    # create any directories needed for the output file
    os.makedirs(dirname(output_file), exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions to {}'.format(output_file))
    df_predictions_with_metadata.to_csv(output_file, index=False)

    # save excluded responses to disk
    if not df_excluded.empty:
        excluded_output_file = \
            '{}_excluded_responses{}'.format(*splitext(output_file))
        logger.info('Saving excluded responses '
                    'to {}'.format(excluded_output_file))
        df_excluded.to_csv(excluded_output_file, index=False)
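
# A minimal usage sketch for compute_and_save_predictions(); all three
# paths are hypothetical. Passing a falsy feats_file (e.g. '') skips
# saving the pre-processed feature values.
compute_and_save_predictions('rsmpredict.json',
                             'predictions.csv',
                             'preprocessed_features.csv')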
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """
    parser = argparse.ArgumentParser(
        description="Prints out the weights of a given model.",
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--k',
                       help='number of top features to print (0 for all)',
                       type=int, default=50)
    group.add_argument('--sort_by_labels', '-s',
                       action='store_true',
                       default=False,
                       help='order the features by classes')
    parser.add_argument('--sign',
                        choices=['positive', 'negative', 'all'],
                        default='all',
                        help='show only positive, only negative, '
                             'or all weights')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    multiclass = False
    model = learner._model
    if (isinstance(model, LinearSVC) or
            (isinstance(model, LogisticRegression) and
             len(learner.label_list) > 2) or
            (isinstance(model, SVC) and model.kernel == 'linear')):
        multiclass = True

    weight_items = weights.items()
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            # Some learners (e.g. LinearSVR) may return an array of
            # intercepts, but sometimes that array is of length 1, in which
            # case we don't need to print it as an array/list. First,
            # let's normalize these cases.
            model_intercepts = intercept['_intercept_']
            intercept_is_array = isinstance(model_intercepts, np.ndarray)
            num_intercepts = len(model_intercepts) if intercept_is_array else 1
            if intercept_is_array and num_intercepts == 1:
                model_intercepts = model_intercepts[0]
                intercept_is_array = False

            # now print out the intercept(s); a length-1 array was already
            # collapsed to a scalar above
            if intercept_is_array:
                intercept_list = ["%.12f" % i for i in model_intercepts]
                print("intercept = {}".format(intercept_list))
            else:
                print("intercept = {:.12f}".format(model_intercepts))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{: .12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    weight_by_class = defaultdict(dict)
    if multiclass and args.sort_by_labels:
        # group the weights by label before printing
        for label_feature, weight in weight_items:
            label, feature = label_feature.split()
            weight_by_class[label][feature] = weight
        for label in sorted(weight_by_class):
            for feat, val in sorted(weight_by_class[label].items(),
                                    key=lambda x: -abs(x[1])):
                print("{: .12f}\t{}\t{}".format(val, label, feat))
    else:
        for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
            print("{: .12f}\t{}".format(val, feat))
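
# A sketch of the two output modes of the entry point above, using a
# hypothetical model path: the default top-k listing versus per-label
# ordering (only meaningful for multiclass linear models).
main(['model.pkl', '--k', '25'])         # top 25 weights by magnitude
main(['model.pkl', '--sort_by_labels'])  # weights grouped by class label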