def generate_data_for_significant_nei_utm_ids(): output_file = GeneralMethods.get_method_id()+'.json' so_hashtags, mf_utm_id_to_valid_nei_utm_ids = set(), {} for utm_object in \ FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True): for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems(): if hashtag!='total_num_of_occurrences': so_hashtags.add(hashtag) mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] =\ utm_object['mf_nei_utm_id_to_common_h_count'].keys() hashtags = sorted(list(so_hashtags)) mf_utm_id_to_vector = {} for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True): # print i, utm_object['utm_id'] utm_id_vector = map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0), hashtags) mf_utm_id_to_vector[utm_object['utm_id']] = robjects.FloatVector(utm_id_vector) for i, (utm_id, vector) in enumerate(mf_utm_id_to_vector.iteritems()): print '%s of %s'%(i+1, len(mf_utm_id_to_vector)) ltuo_utm_id_and_vector = [(utm_id, vector)] for valid_nei_utm_id in mf_utm_id_to_valid_nei_utm_ids[utm_id]: if valid_nei_utm_id in mf_utm_id_to_vector and valid_nei_utm_id!=utm_id: ltuo_utm_id_and_vector.append((valid_nei_utm_id, mf_utm_id_to_vector[valid_nei_utm_id])) od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0))) df_utm_vectors = robjects.DataFrame(od) df_utm_vectors_json = R_Helper.get_json_for_data_frame(df_utm_vectors) dfm_dict = cjson.decode(df_utm_vectors_json) mf_utm_ids_to_utm_colnames = dict(zip(zip(*ltuo_utm_id_and_vector)[0], df_utm_vectors.colnames)) utm_id_colname = mf_utm_ids_to_utm_colnames[utm_id] dfm_dict['prediction_variable'] = utm_id_colname dfm_dict['predictor_variables'] = filter(lambda colname: colname!=utm_id_colname, df_utm_vectors.colnames) dfm_dict['mf_utm_colnames_to_utm_ids'] = dict(zip(df_utm_vectors.colnames, zip(*ltuo_utm_id_and_vector)[0])) FileIO.writeToFileAsJson(dfm_dict, output_file)
def mapper(self, key, line):
    '''
    MapReduce mapper: decode one data-frame JSON record produced by
    generate_data_for_significant_nei_utm_ids(), run backward-elimination
    variable selection over the neighbor columns, and emit
    (utm_id, {'utm_id': ..., 'nei_utm_ids': [...]}).
    '''
    data_for_df = cjson.decode(line)
    # Strip the metadata fields out of the record so that only the raw
    # column data is left for the R data frame.
    prediction_variable = data_for_df.pop('prediction_variable')
    predictor_variables = data_for_df.pop('predictor_variables')
    mf_utm_colnames_to_utm_ids = data_for_df.pop('mf_utm_colnames_to_utm_ids')
    df_utm_vectors = R_Helper.get_data_frame_from_json(cjson.encode(data_for_df))
    selected_utm_colnames = R_Helper.variable_selection_using_backward_elimination(
        df_utm_vectors,
        prediction_variable,
        predictor_variables
    )
    # Translate the surviving R column names back into UTM ids.
    utm_id = mf_utm_colnames_to_utm_ids[prediction_variable]
    nei_utm_ids = map(
        lambda selected_utm_colname: mf_utm_colnames_to_utm_ids[selected_utm_colname],
        selected_utm_colnames
    )
    yield utm_id, {'utm_id': utm_id, 'nei_utm_ids': nei_utm_ids}
def _get_parameter_names_to_values(train_feature_vectors):
    '''
    Fit a linear regression of 'value_to_predict' on the LIST_OF_MODELS
    feature columns over the training feature vectors, and return the fitted
    parameter (name, value) pairs via R_Helper.get_parameter_values().
    Returns None (implicitly) when there is no usable training data.

    Fix: feature vectors with a falsy or absent 'value_to_predict' previously
    still appended rows to every predictor column but not to the target
    column, producing columns of unequal length (an error when constructing
    the R DataFrame) — and an absent key raised KeyError. Such vectors are
    now skipped entirely, keeping all columns aligned row-for-row.
    '''
    mf_column_name_to_column_data = defaultdict(list)
    train_feature_vectors = map(itemgetter('feature_vector'), train_feature_vectors)
    for feature_vector in train_feature_vectors:
        # Skip vectors with no target value; .get() also tolerates a missing
        # 'value_to_predict' key instead of raising KeyError.
        if not feature_vector.get('value_to_predict'): continue
        mf_column_name_to_column_data['value_to_predict'].append(feature_vector['value_to_predict'])
        # Missing model features default to 0.0 so every column stays the
        # same length as the target column.
        for column_name in LIST_OF_MODELS:
            mf_column_name_to_column_data[column_name].append(feature_vector.get(column_name, 0.0))
    data = {}
    for column_name, column_data in mf_column_name_to_column_data.iteritems():
        data[column_name] = robjects.FloatVector(column_data)
    if data:
        data_frame = robjects.DataFrame(data)
        prediction_variable = 'value_to_predict'
        predictor_variables = LIST_OF_MODELS
        model = R_Helper.linear_regression_model(
            data_frame,
            prediction_variable,
            predictor_variables,
            # with_variable_selection=True
        )
        return R_Helper.get_parameter_values(model)
def significant_nei_utm_ids():
    '''
    For every UTM id, select its statistically-significant neighboring UTM
    ids by running backward-elimination variable selection on hashtag-count
    vectors, and append {'utm_id': ..., 'nei_utm_ids': [...]} to the output
    file, one JSON object per UTM id.
    '''
    # utm_id -> candidate neighbor utm_ids; populated as a side effect of
    # get_utm_vectors() below.
    mf_utm_id_to_valid_nei_utm_ids = {}
    def get_utm_vectors():
        # Builds an R data frame with one column of hashtag counts per UTM
        # id, columns ordered by utm_id (via the sorted OrdDict).
        so_hashtags = set()
        for utm_object in \
                FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
            for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems():
                # 'total_num_of_occurrences' is a bookkeeping key, not a hashtag.
                if hashtag!='total_num_of_occurrences': so_hashtags.add(hashtag)
            mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] =\
                utm_object['mf_nei_utm_id_to_common_h_count'].keys()
        hashtags, ltuo_utm_id_and_vector = sorted(list(so_hashtags)), []
        for i, utm_object in enumerate(FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True)):
            # print i, utm_object['utm_id']
            # Count vector aligned with the sorted vocabulary; missing
            # hashtags count as 0.0.
            utm_id_vector = map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0),
                                hashtags)
            ltuo_utm_id_and_vector.append((utm_object['utm_id'], robjects.FloatVector(utm_id_vector)))
        od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0)))
        df_utm_vectors = robjects.DataFrame(od)
        return df_utm_vectors
    output_file = fld_google_drive_data_analysis%GeneralMethods.get_method_id()
    df_utm_vectors = get_utm_vectors()
    # print df_utm_vectors.nrow
    # exit()
    # Columns were built from the utm_ids in sorted order, so zipping the
    # sorted ids against colnames pairs each id with its R column name.
    # NOTE(review): this assumes every utm_id seen in pass 2 also appears as
    # a key of mf_utm_id_to_valid_nei_utm_ids — holds because both passes
    # iterate the same file.
    utm_colnames = df_utm_vectors.colnames
    mf_utm_id_to_utm_colnames = dict(zip(sorted(mf_utm_id_to_valid_nei_utm_ids), utm_colnames))
    mf_utm_colnames_to_utm_id = dict(zip(utm_colnames, sorted(mf_utm_id_to_valid_nei_utm_ids)))
    for i, utm_colname in enumerate(utm_colnames):
        utm_id = mf_utm_colnames_to_utm_id[utm_colname]
        prediction_variable = utm_colname
        print i, utm_id
        # Predictors: columns of the valid neighbors that exist in the data
        # frame, excluding the focal UTM id itself.
        predictor_variables = [mf_utm_id_to_utm_colnames[valid_nei_utm_ids]
                               for valid_nei_utm_ids in mf_utm_id_to_valid_nei_utm_ids[utm_id]
                               if valid_nei_utm_ids in mf_utm_id_to_utm_colnames and
                               valid_nei_utm_ids != utm_id
                               ]
        selected_utm_colnames = R_Helper.variable_selection_using_backward_elimination(
            df_utm_vectors,
            prediction_variable,
            predictor_variables,
            debug=True
        )
        # Map the surviving column names back to UTM ids.
        nei_utm_ids = [mf_utm_colnames_to_utm_id[selected_utm_colname]
                       for selected_utm_colname in selected_utm_colnames]
        print 'Writing to: ', output_file
        FileIO.writeToFileAsJson({'utm_id': utm_id, 'nei_utm_ids': nei_utm_ids}, output_file)
def get_performance_metrics(feature_vectors, *args, **kwargs):
    '''
    Split the feature vectors into train/test, fit regression parameters on
    the training portion, score each test time unit's hashtags, and collect
    accuracy and impact metrics per NUM_OF_HASHTAGS cutoff.

    Returns a pair of defaultdicts:
    (num_of_hashtags -> [accuracy, ...], num_of_hashtags -> [impact, ...]),
    or ({}, {}) when there is no usable training/test data or no model.
    '''
    train_feature_vectors, test_feature_vectors = split_feature_vectors_into_test_and_training(feature_vectors)
    # Only feature vectors with more than one entry carry enough signal;
    # filter both splits the same way.
    filtered_train_feature_vectors = filter(lambda fv: len(fv['feature_vector'])>1, train_feature_vectors)
    filtered_test_feature_vectors = filter(lambda fv: len(fv['feature_vector'])>1, test_feature_vectors)
    if filtered_train_feature_vectors and filtered_test_feature_vectors:
        parameter_names_to_values = LearningToRank._get_parameter_names_to_values(filtered_train_feature_vectors)
        if parameter_names_to_values:
            accuracy_mf_num_of_hashtags_to_metric_values = defaultdict(list)
            impact_mf_num_of_hashtags_to_metric_values = defaultdict(list)
            mf_parameter_names_to_values = dict(parameter_names_to_values)
            # NOTE: evaluation iterates the *unfiltered* test vectors even
            # though the emptiness guard above used the filtered list —
            # presumably intentional (evaluate on everything); confirm.
            # Sort by time unit before groupby, which requires sorted input.
            test_feature_vectors.sort(key=itemgetter('tu'))
            ltuo_tu_and_ltuo_hashtag_and_actual_score_and_feature_vector =\
                [(tu, map(
                          itemgetter('hashtag', 'actual_score', 'feature_vector'),
                          it_feature_vectors)
                  )
                 for tu, it_feature_vectors in
                    groupby(test_feature_vectors, key=itemgetter('tu'))
                 ]
            for tu, ltuo_hashtag_and_actual_score_and_feature_vector in \
                    ltuo_tu_and_ltuo_hashtag_and_actual_score_and_feature_vector:
                # The target must not leak into prediction input; this
                # mutates the feature-vector dicts in place.
                for _, __, fv in ltuo_hashtag_and_actual_score_and_feature_vector:
                    del fv['value_to_predict']
                # Predicted score for every hashtag in this time unit.
                ltuo_hashtag_and_actual_score_and_predicted_score =\
                    map(lambda (hashtag, actual_score, feature_vector): (
                             hashtag,
                             actual_score,
                             R_Helper.get_predicted_value(mf_parameter_names_to_values, feature_vector)
                        ),
                        ltuo_hashtag_and_actual_score_and_feature_vector)
                # Actual ranking only keeps hashtags with a known score;
                # predicted ranking keeps everything.
                ltuo_hashtag_and_actual_score = [
                    (hashtag, actual_score)
                    for hashtag, actual_score, _ in ltuo_hashtag_and_actual_score_and_predicted_score
                    if actual_score!=None]
                ltuo_hashtag_and_predicted_score = map(
                    itemgetter(0,2),
                    ltuo_hashtag_and_actual_score_and_predicted_score
                )
                if ltuo_hashtag_and_actual_score and ltuo_hashtag_and_predicted_score:
                    # Rank both lists best-first by score.
                    ltuo_hashtag_and_actual_score = sorted(
                        ltuo_hashtag_and_actual_score,
                        key=itemgetter(1),
                        reverse=True
                    )
                    ltuo_hashtag_and_predicted_score = sorted(
                        ltuo_hashtag_and_predicted_score,
                        key=itemgetter(1),
                        reverse=True
                    )
                    # Compare top-k actual vs predicted orderings for every
                    # requested cutoff.
                    for num_of_hashtags in NUM_OF_HASHTAGS:
                        hashtags_dist = dict(ltuo_hashtag_and_actual_score)
                        actual_ordering_of_hashtags = zip(*ltuo_hashtag_and_actual_score)[0]
                        predicted_ordering_of_hashtags = zip(*ltuo_hashtag_and_predicted_score)[0]
                        accuracy = EvaluationMetric.accuracy(
                            actual_ordering_of_hashtags[:num_of_hashtags],
                            predicted_ordering_of_hashtags[:num_of_hashtags],
                            num_of_hashtags
                        )
                        impact = EvaluationMetric.impact(
                            actual_ordering_of_hashtags[:num_of_hashtags],
                            predicted_ordering_of_hashtags[:num_of_hashtags],
                            hashtags_dist
                        )
                        accuracy_mf_num_of_hashtags_to_metric_values[num_of_hashtags].append(accuracy)
                        impact_mf_num_of_hashtags_to_metric_values[num_of_hashtags].append(impact)
            return (accuracy_mf_num_of_hashtags_to_metric_values,
                    impact_mf_num_of_hashtags_to_metric_values)
    return {}, {}