def fetch_from_vertica_to_df(self, data_set, query): data_set_query = self.get_data_set_attribute(data_set, 'query') if data_set_query != query: connection = connect(self.connection_details) cursor = connection.cursor() print 'Executing ', data_set, 'Query...' print query columns = get_column_names_from_sql_query(query) cursor.execute(query) data = [] while True: rows = cursor.fetchmany(10000) data.extend([[str(ele) for ele in row] for row in rows]) if len(rows) <= 1: break df = MetadataDataFrame(data=data, columns=columns, meta_info={ 'query': query, 'built_features': [], 'aggregate_values': {}, 'columns': columns }) cursor.close() if len(df) == 0: raise (ValueError('SQL result in empty fetch!!')) else: self.set_data_set_attribute(data_set, 'data', df) self.set_data_set_attribute(data_set, 'query', query) self.set_data_set_attribute(data_set, 'columns', columns) self.set_data_set_attribute(data_set, 'built_features', [])
def load_results(self):
    """Load previous results into ``self.result``.

    Prefers the pickle at ``self.datafile``; falls back to an empty
    MetadataDataFrame built from ``self.meta_info`` when no data file
    exists yet.

    Raises:
        RuntimeError: when neither a data file nor meta_info exists.
    """
    if os.path.isfile(self.datafile):
        # Close the file handle deterministically (original leaked it).
        with open(self.datafile, 'rb') as handle:
            self.result = Pickle.load(handle)
    elif self.meta_info is not None:
        # NOTE(review): features are de-duplicated via set(), so their
        # column order is non-deterministic — confirm downstream code
        # does not depend on column ordering.
        columns = ['id', 'model', 'learner'] + list(
            set(self.meta_info['features'])) + self.meta_info[
                'metrics'] + self.meta_info['learner_parameters']
        self.result = MetadataDataFrame(columns=columns,
                                        meta_info=self.meta_info)
    else:
        raise RuntimeError('No results exist to Summarize!!')
def add_metrics(self, model_name, learner_name, features, params_list,
                metrics_list):
    """Append one result row per (params, metrics) pair and persist.

    Pairs already recorded (same model/learner/features/params) are
    skipped via ``check_if_exists``. The accumulated result frame is
    re-pickled to ``self.datafile`` after every call.

    Args:
        model_name: model identifier for the new rows.
        learner_name: learner identifier for the new rows.
        features: feature description passed through to get_row_dict.
        params_list: hyper-parameter dicts, zipped with metrics_list.
        metrics_list: metric dicts, one per entry of params_list.
    """
    rows = defaultdict(list)
    for params, metrics in zip(params_list, metrics_list):
        if not self.check_if_exists(model_name, learner_name, features,
                                    params):
            row = self.get_row_dict(model_name, learner_name, features,
                                    params, metrics)
            # Plain loop instead of a side-effect list comprehension.
            for key, value in row.items():
                rows[key].append(value)
    self.result = self.result.append(MetadataDataFrame(data=rows),
                                     ignore_index=True)
    # Close the file handle deterministically (original leaked it).
    with open(self.datafile, 'wb') as handle:
        Pickle.dump(self.result, handle)
def plot_results(self):
    """Render a colour-grid summary of the best AUC per configuration.

    For each (model, learner, feature-set) group only the max-AUC row is
    kept; rows whose AUC does not beat the previous kept row for the
    same model by more than 1e-4 are pruned. The survivors are drawn as
    one coloured cell per key with AUC values on the y-axis, and saved
    as a PNG next to ``self.datafile``.
    """
    # Grouping keys: model, learner plus the de-duplicated feature names.
    group_by_keys = ['model', 'learner'] + list(
        set(self.result.meta_info['features']))
    result = MetadataDataFrame(columns=group_by_keys)
    # Keep only the max-AUC row(s) of each group.
    for grouped_by, data_frame in self.result.groupby(group_by_keys):
        result = result.append(data_frame[group_by_keys + ['AUC']][
            data_frame['AUC'] == data_frame['AUC'].max()],
            ignore_index=True)
    # NOTE(review): DataFrame.sort() is the pre-0.17 pandas API
    # (removed in modern pandas in favour of sort_values()).
    result = result.sort(['AUC'], ascending=[1]).reset_index(drop=True)
    result_list = []
    # Per model: drop rows whose AUC improves on the previous kept row
    # by 0.0001 or less (prunes near-identical configurations).
    for model in result['model'].unique():
        result_model = result[result['model'] == model]
        rows_to_keep = [True]
        index = result_model.index.tolist()
        if len(result_model) > 1:
            base = result_model['AUC'][index[0]]
            for i in range(1, len(result_model)):
                if result_model['AUC'][index[i]] > base + 0.0001:
                    rows_to_keep.append(True)
                    base = result_model['AUC'][index[i]]
                else:
                    rows_to_keep.append(False)
        result_list.append(result_model[rows_to_keep])
    # Re-assemble the pruned per-model frames into a single frame.
    result = result_list[0]
    if len(result_list) > 1:
        for i in range(1, len(result_list)):
            result = result.append(result_list[i], ignore_index=True)
    result = result.sort(['AUC'], ascending=[1]).reset_index(drop=True)
    if len(result) == 1:
        return  # a single surviving row is not worth plotting
    # Drop keys that never vary across all results; for the rest record
    # the frequency of their most common value (used to order columns).
    key_freq = defaultdict(int)
    for key in result.keys():
        distinct_values = self.result[key].unique()
        if len(distinct_values) == 1:
            del result[key]
        else:
            for val in distinct_values:
                key_freq[key] = max(
                    key_freq[key],
                    len(self.result[key][self.result[key] == val]))
    sorted_keys = [
        key[0]
        for key in sorted(key_freq.items(), key=operator.itemgetter(1))
    ]
    group_by_keys = [key for key in sorted_keys if key in group_by_keys]
    default_annotation = {'weight': 'bold', 'ha': 'center', 'va': 'center'}
    fig = plt.figure(figsize=(len(result.keys()) / 2, len(result) / 2))
    plt.subplots_adjust(left=0.1, right=1.0, bottom=0.0, top=1.0,
                        wspace=0.2, hspace=0.0)
    index = 0  # running x-offset of the current column
    for key in group_by_keys:
        sorted_values = result[key].tolist()
        # NOTE(review): .next() is the Python 2 iterator protocol;
        # Python 3 would require next(COLOR_MAP).
        color_map, color_map_r = COLOR_MAP.next()
        # model/learner columns are drawn three cells wide.
        width = 3 if key in ['model', 'learner'] else 1
        for i in range(len(result)):
            if result[key][i] == 'present':
                # Feature present in this configuration: green cell.
                plt.gca().add_patch(
                    Rectangle(xy=(index, i), facecolor='green',
                              width=width, height=1, alpha=0.5))
            elif result[key][i] == 'absent':
                # Feature absent: red cell.
                plt.gca().add_patch(
                    Rectangle(xy=(index, i), facecolor='red',
                              width=width, height=1, alpha=0.5))
            else:
                # Other values: colour by the value's position within
                # the column; annotate in the reversed-colormap colour
                # so the text contrasts with the cell fill.
                color_value = color_map(
                    sorted_values.index(result[key][i]) /
                    float(len(result) - 1))
                color_value_r = color_map_r(
                    sorted_values.index(result[key][i]) /
                    float(len(result) - 1))
                default_annotation.update({
                    'rotation': '00',
                    'fontsize': 8
                })
                plt.gca().add_patch(
                    Rectangle(xy=(index, i), facecolor=color_value,
                              width=width, height=1))
                plt.gca().annotate(sorted_values[i],
                                   (index + +float(width) / 2, i + 0.5),
                                   color=color_value_r,
                                   **default_annotation)
        # Column header cell drawn above the grid, label rotated 90°.
        default_annotation.update({'rotation': '90', 'fontsize': 9})
        plt.gca().add_patch(
            Rectangle(xy=(index, len(result)), facecolor=color_map(1),
                      width=width, height=15))
        plt.gca().annotate(key, (index + float(width) / 2,
                                 len(result) + 7.5),
                           color='black', **default_annotation)
        index += width
    plt.ylim([0, len(result) + 15])
    plt.xlim([0, index])
    # NOTE(review): np.linspace's 'endpoint' parameter expects a bool;
    # len(result) - 0.5 is always truthy here — confirm the intent.
    ticks = np.linspace(start=0.5, stop=len(result) - 0.5,
                        endpoint=len(result) - 0.5, num=len(result))
    labels = [round(value, 4) for value in result['AUC'].tolist()]
    plt.gca().get_yaxis().set_ticks(ticks)
    plt.ylabel('AUC')
    plt.gca().get_yaxis().set_ticklabels(labels)
    # Save next to the data file, swapping its extension for .png.
    png_path = '.'.join(self.datafile.split('.')[0:-1]) + '.png'
    fig.savefig(png_path, dpi=fig.dpi)
    print 'Summary saved as', png_path