def __get_stop_coords():
    """Return a dict mapping str(stopId) -> (str(lat), str(lon)).

    Reads the stops CSV configured in ``Cfg.stops_file``; column 4 holds the
    stop id and columns 7/8 hold the latitude/longitude.  (The previous
    docstring claimed list values, but tuples were always stored.)
    """
    stops = read_csv(Cfg.stops_file)
    # Plain dict comprehension: every key is assigned exactly once, so the
    # old defaultdict(list) default factory was never exercised and only
    # masked missing-key bugs in callers.
    return {str(r[4]): (str(r[7]), str(r[8])) for r in stops}
def __get_weather_data():
    """Return weather readings keyed by date, then by time bin.

    Shape: {DD/MM/YYYY: {TIME_BIN: [cloud, rain, temp, wind]}}
    """
    table = defaultdict(lambda: defaultdict(list))
    reader = read_csv(Cfg.weather_file)
    next(reader)  # the first row is the column headers
    for record in reader:
        date_key, time_bin = record[0], record[1]
        table[date_key][time_bin] = record[2:]
    return table
def get_bert_generate_label(model, context_filename, money_filename, tmp_json_path, tags_list):
    """Run the latest BERT checkpoint over *context_filename*, dump the
    predictions to *tmp_json_path*, then merge them with the money CSV.

    Returns the (filenames, ahs, money, labels, attrs) tuple produced by
    ``get_nlp_result``.
    """
    ckpt_path = model.output_dir + "/" + get_lastest_ckpt(model.output_dir + "/checkpoint")
    labels_pred, probs = model.predict(context_filename, ckpt_path)
    model.generate_pred_file(tags_list, context_filename, tmp_json_path,
                             labels_pred, probs)
    csv_path = model.data_dir + '/' + money_filename
    names, ahs, money, labels, _ = read_csv(csv_path)
    return get_nlp_result(tmp_json_path, names, ahs, money, labels)
def main():
    """Enrich raw SIRI bus-position CSV rows with weather and nearest-stop
    data, then stream them per journey into an improved CSV file.
    """
    rows = read_csv(expanduser('~/datasets/datafiles/siri.20121106.csv'))
    # Lazily map each raw CSV row onto a named-field dict.
    # NOTE(review): row[0][:-6] drops the last 6 digits of the timestamp --
    # presumably converting microseconds to seconds; confirm against the data.
    datas = ({
        'dt': datetime.fromtimestamp(int(row[0][:-6])),
        'line': row[1],
        'jpId': row[3],
        'timeframe': row[4],
        'vjId': row[5],
        'lat': row[9],
        'lon': row[8],
        'delay': row[10],
        'blockId': row[11],
        'stopId': row[13],
        'atStop': row[14]
    } for row in rows)
    # A nifty way of merging dictionaries. Python3.5+
    datas = ({**d, **get_weather(d, WEATHER_DATA)} for d in datas)
    datas = ({**d, **nearest_stop(d, get_stops(d))} for d in datas)
    # writer = coro_print() # prints to screen for debugging.
    writer = coro_csv_writer(
        expanduser('~/datasets/datafiles/busdata_improved.csv'))
    columns = [
        'dt', 'line', 'jpId', 'timeframe', 'vjId', 'lat', 'lon', 'delay',
        'blockId', 'stopId', 'atStop', 'running_dist', 'running_time',
        'closest_stop', 'closest_stop_dist', 'cloud', 'rain', 'temp', 'wind'
    ]
    writer.send(columns)  # write column headers
    # seperate datas by journeyPatternId, vehicleJourneyId and timeframe
    journey_parsers = {}
    for d in datas:
        key = ':'.join([d['jpId'], d['vjId'], d['timeframe']])
        try:
            journey_parsers[key].send(d)
        except KeyError:
            # First record for this journey: spin up a new parser coroutine.
            journey_parsers[key] = journey_parser(writer)
            journey_parsers[key].send(d)
    # TODO: Clean up handlers. Remove old ones once the journey has ended.
    try:
        writer.send(StopIteration)  # sentinel telling the writer to finish up
    except StopIteration:
        print('Done')
def main(h5_file_name, vid_name, dir_name, target_gest, gest_list_h5, gest_corr_csv):
    """Cut gesture clips out of *vid_name* and save them under *dir_name*.

    Frame ranges come from one of three sources, in priority order:
      1. ``gest_list_h5 is None``        -> annotations read from *h5_file_name*;
      2. ``gest_corr_csv`` is non-empty  -> corrected labels from that CSV;
      3. otherwise                       -> per-label gesture lists in *gest_list_h5*.

    ``target_gest < 0`` means "all labels 0..10".
    """
    if gest_list_h5 is None:
        gests = get_annotations(h5_file_name)
        gest_seq = get_gesture_sequence(gests, target_gest)
        gest_timestamps = convert_frame_to_timestamp(gest_seq)
        save_videos(vid_name, dir_name, gest_timestamps)
    elif gest_corr_csv is not None and len(gest_corr_csv) > 0:
        # Extract all gestures for a particular user
        user_name = os.path.basename(h5_file_name)  # corresponds to a CSV row
        csv_data = global_utils.read_csv(gest_corr_csv)
        # BUG FIX: materialize the filter.  A lazy `filter` iterator would be
        # exhausted after the first label of the loop below, silently
        # producing empty frame lists for every other label.
        csv_data = [x for x in csv_data if x['filename'] == user_name]
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        labels = [target_gest] if target_gest >= 0 else range(11)
        for label in labels:
            frames = [[int(d['start_frame']), int(d['end_frame'])]
                      for d in csv_data if int(d['old_label']) == label]
            label_dir = os.path.join(dir_name, str(label))
            if not os.path.exists(label_dir):
                os.makedirs(label_dir)
            timestamps = convert_frame_to_timestamp(frames)
            save_videos(vid_name, label_dir, timestamps)
    else:
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        labels = [target_gest] if target_gest >= 0 else range(11)
        file_name = os.path.basename(h5_file_name)
        # Open the gesture-list file once (and close it on exit) instead of
        # re-opening it, unclosed, for every label.
        with h5py.File(gest_list_h5, 'r') as gest_h5:
            group = 'test' if file_name in gest_h5['test'].keys() else 'train'
            for label in labels:
                all_gests = np.array(gest_h5[group][file_name][str(label)])
                label_dir = os.path.join(dir_name, str(label))
                if not os.path.exists(label_dir):
                    os.makedirs(label_dir)
                if len(all_gests.shape) == 1:
                    # A 1-D dataset means no (start, end) pairs were stored.
                    print("No videos found")
                else:
                    gest_timestamps = convert_frame_to_timestamp(
                        all_gests.tolist())
                    save_videos(vid_name, label_dir, gest_timestamps)
def get_nlp_label(csv_file, json_file):
    """Merge NLP attribute vectors from *json_file* into the rows of *csv_file*.

    Each dict in the JSON maps "<filename>&<ah>" -> attribute vector.  Rows
    whose key matches get their attribute vector replaced, and the matched
    rows are returned as parallel lists: (ahs, data, labels, attrs).
    """
    attr_vecs = getVecFromJson(json_file)
    filenames30, ahs30, data30, labels30, attrs30 = read_csv(csv_file)
    ahs_new, data_new, labels_new, attrs_new = [], [], [], []
    # Materialize the (filename, ah) pairs once instead of re-zipping the two
    # source lists for every JSON dict.
    rows = list(zip(filenames30, ahs30))
    for dic in attr_vecs:
        for idx, (fname, ah) in enumerate(rows):
            name = fname + "&" + ah
            for k, v in dic.items():
                if k == name:
                    # The original assigned attrs30[idx] = v twice (once
                    # conditionally, once unconditionally); a single
                    # unconditional assignment is equivalent.  The unused
                    # `count` and `filenames_new` locals were dropped.
                    attrs30[idx] = v
                    ahs_new.append(ah)
                    data_new.append(data30[idx])
                    labels_new.append(labels30[idx])
                    attrs_new.append(v)
    return ahs_new, data_new, labels_new, attrs_new
def execute_statement(
    self,
    bql_statement_ast,
    pretty=True,
    timing=False,
    plots=None,
    yes=False,
    debug=False,
    pandas_df=None,
    pandas_output=True,
    key_column=None,
):
    """
    Accepts a SINGLE BQL STATEMENT as input, parses it, and executes it if it
    was parsed successfully. If pretty=True, then the command output will be
    pretty-printed as a string. If pretty=False, then the command output will
    be returned as a python object.

    timing=True prints out how long the command took to execute.

    For commands that have visual results, plots=True will cause those to be
    displayed by matplotlib as graphics rather than being pretty-printed as
    text. (Note that the graphics will also be saved if the user added
    SAVE TO <filename> to the BQL.)
    """
    if timing:
        start_time = time.time()

    parser_out = None
    ##TODO move pyparsing objects out of client into parser
    if debug:
        parser_out = self.parser.parse_single_statement(bql_statement_ast)
    else:
        try:
            parser_out = self.parser.parse_single_statement(bql_statement_ast)
        except Exception as e:
            # Any parse failure is surfaced uniformly as a BayesDBParseError.
            raise utils.BayesDBParseError(str(e))
    if parser_out is None:
        print "Could not parse command. Try typing 'help' for a list of all commands."
        return
    elif not parser_out:
        # Parsed to something falsy (e.g. an empty statement): nothing to do.
        return

    method_name, args_dict, client_dict = parser_out
    if client_dict is None:
        client_dict = {}

    ## Do stuff now that you know the user's command, but before passing it to engine.
    if method_name == "execute_file":
        return dict(message="execute_file", bql_string=open(args_dict["filename"], "r").read())
    elif (method_name == "drop_btable") and (not yes):
        ## If dropping something, ask for confirmation.
        print "Are you sure you want to permanently delete this btable, and all associated models, without any way to get them back? Enter 'y' if yes."
        user_confirmation = raw_input()
        if "y" != user_confirmation.strip():
            return dict(message="Operation canceled by user.")
    elif (method_name == "drop_models") and (not yes):
        ## If dropping something, ask for confirmation.
        print "Are you sure you want to permanently delete model(s), without any way to get them back? Enter 'y' if yes."
        user_confirmation = raw_input()
        if "y" != user_confirmation.strip():
            return dict(message="Operation canceled by user.")
    elif method_name == "load_models":
        pklpath = client_dict["pkl_path"]
        try:
            models = pickle.load(gzip.open(self.parser.get_absolute_path(pklpath), "rb"))
        except IOError as e:
            # Retry with the likely filename extensions before giving up.
            if pklpath[-7:] != ".pkl.gz":
                if pklpath[-4:] == ".pkl":
                    models = pickle.load(open(self.parser.get_absolute_path(pklpath), "rb"))
                else:
                    pklpath = pklpath + ".pkl.gz"
                    models = pickle.load(gzip.open(self.parser.get_absolute_path(pklpath), "rb"))
            else:
                raise utils.BayesDBError("Models file %s could not be found." % pklpath)
        args_dict["models"] = models
    elif method_name == "create_btable":
        if pandas_df is None:
            header, rows = data_utils.read_csv(client_dict["csv_path"])
        else:
            header, rows = data_utils.read_pandas_df(pandas_df)
        args_dict["header"] = header
        args_dict["raw_T_full"] = rows
        args_dict["key_column"] = key_column
        args_dict["subsample"] = False

        # Display warning messages and get confirmation if btable is too large.
        # Ask user if they want to turn on subsampling.
        max_columns = 200
        max_rows = 1000
        max_cells = 100000
        message = None
        if not yes:
            # NOTE: later checks overwrite `message`, so only the last
            # triggered warning is shown to the user.
            if len(rows[0]) > max_columns:
                message = (
                    "The btable you are uploading has %d columns, but BayesDB is currently designed to support only %d columns. If you proceed, performance may suffer unless you set many columns' datatypes to 'ignore'. Would you like to continue? Enter 'y' if yes."
                    % (len(rows[0]), max_columns)
                )
            if len(rows) > max_rows:
                message = (
                    "The btable you are uploading has %d rows, but BayesDB is currently designed to support only %d rows. If you proceed, performance may suffer. Would you like to continue? Enter 'y' to continue without subsampling, 'n' to abort, 's' to continue by subsampling %d rows, or a positive integer to specify the number of rows to be subsampled."
                    % (len(rows), max_rows, max_rows)
                )
            if len(rows[0]) * len(rows) > max_cells:
                message = (
                    "The btable you are uploading has %d cells, but BayesDB is currently designed to support only %d cells. If you proceed, performance may suffer unless you enable subsampling. Enter 'y' to continue without subsampling, 'n' to abort, 's' to continue by subsampling %d rows, or a positive integer to specify the number of rows to be subsampled."
                    % (len(rows) * len(rows[0]), max_cells, max_rows)
                )
            if message is not None:
                print message
                user_confirmation = raw_input()
                if "y" == user_confirmation.strip():
                    pass
                elif "n" == user_confirmation.strip():
                    return dict(message="Operation canceled by user.")
                elif "s" == user_confirmation.strip():
                    args_dict["subsample"] = min(max_rows, len(rows))
                elif utils.is_int(user_confirmation.strip()):
                    args_dict["subsample"] = int(user_confirmation.strip())
                else:
                    return dict(message="Operation canceled by user.")
    elif method_name in ["label_columns", "update_metadata"]:
        if client_dict["source"] == "file":
            header, rows = data_utils.read_csv(client_dict["csv_path"])
            args_dict["mappings"] = {key: value for key, value in rows}

    ## Call engine.
    result = self.call_bayesdb_engine(method_name, args_dict, debug)

    ## If error occurred, exit now.
    if "error" in result and result["error"]:
        if pretty:
            print result["message"]
            return result["message"]
        else:
            return result

    ## Do stuff now that engine has given you output, but before printing the result.
    result = self.callback(method_name, args_dict, client_dict, result)

    assert type(result) != int

    if timing:
        end_time = time.time()
        print "Elapsed time: %.2f seconds." % (end_time - start_time)

    if plots is None:
        plots = "DISPLAY" in os.environ.keys()

    if "matrix" in result and (plots or client_dict["filename"]):
        # Plot matrices
        plotting_utils.plot_matrix(
            result["matrix"], result["column_names"], result["title"], client_dict["filename"]
        )
        if pretty:
            if "column_lists" in result:
                print self.pretty_print(dict(column_lists=result["column_lists"]))
            return self.pretty_print(result)
        else:
            return result

    if "plot" in client_dict and client_dict["plot"]:
        if plots or client_dict["filename"]:
            # Plot generalized histograms or scatterplots
            plot_remove_key = method_name in ["select", "infer"]
            plotting_utils.plot_general_histogram(
                result["columns"],
                result["data"],
                result["M_c"],
                client_dict["filename"],
                client_dict["scatter"],
                remove_key=plot_remove_key,
            )
            return self.pretty_print(result)
        else:
            if "message" not in result:
                result["message"] = ""
            result["message"] = (
                "Your query indicates that you would like to make a plot, but in order to do so, you must either enable plotting in a window or specify a filename to save to by appending 'SAVE TO <filename>' to this command.\n"
                + result["message"]
            )

    if pretty:
        pp = self.pretty_print(result)
        print pp

    if pandas_output and "data" in result and "columns" in result:
        result_pandas_df = data_utils.construct_pandas_df(result)
        return result_pandas_df
    else:
        return result
def execute_statement(self,
                      bql_statement_ast,
                      pretty=True,
                      timing=False,
                      plots=None,
                      yes=False,
                      debug=False,
                      pandas_df=None,
                      pandas_output=True,
                      key_column=None,
                      return_raw_result=False,
                      force_output=False):
    """
    Accepts a SINGLE BQL STATEMENT as input, parses it, and executes it if it
    was parsed successfully. If pretty=True, then the command output will be
    pretty-printed as a string. If pretty=False, then the command output will
    be returned as a python object.

    If force_output=True, then results will be returned regardless of pretty

    timing=True prints out how long the command took to execute.

    For commands that have visual results, plots=True will cause those to be
    displayed by matplotlib as graphics rather than being pretty-printed as
    text. (Note that the graphics will also be saved if the user added
    SAVE TO <filename> to the BQL.)
    """
    if timing:
        start_time = time.time()

    parser_out = None
    # TODO move pyparsing objects out of client into parser
    if debug:
        parser_out = self.parser.parse_single_statement(bql_statement_ast)
    else:
        try:
            parser_out = self.parser.parse_single_statement(
                bql_statement_ast)
        except Exception as e:
            # Any parse failure is surfaced uniformly as a BayesDBParseError.
            raise utils.BayesDBParseError(str(e))
    if parser_out is None:
        print(
            "Could not parse command. Try typing 'help' for a list of all commands."
        )
        return
    elif not parser_out:
        # Parsed to something falsy (e.g. an empty statement): nothing to do.
        return

    method_name, args_dict, client_dict = parser_out
    if client_dict is None:
        client_dict = {}

    # Do stuff now that you know the user's command, but before passing it to engine.
    if method_name == 'execute_file':
        return dict(message='execute_file',
                    bql_string=open(args_dict['filename'], 'r').read())
    elif method_name == 'update_codebook':
        _, codebook_rows = data_utils.read_csv(
            client_dict['codebook_path'], has_header=True)
        # TODO: require specific codebook_header values? Or don't require a header,
        # and if the first value in the header is actually a data column name, assume
        # the first row is codebook data, not a header.

        # Create a dict indexed by column name
        codebook = dict()
        for codebook_row in codebook_rows:
            codebook[codebook_row[0]] = dict(
                zip(['short_name', 'description', 'value_map'],
                    codebook_row[1:]))
        args_dict['codebook'] = codebook
    elif (method_name == 'drop_btable') and (not yes):
        # If dropping something, ask for confirmation.
        print(
            "Are you sure you want to permanently delete this btable, and all associated "
            "models, without any way to get them back? Enter 'y' if yes.")
        user_confirmation = raw_input()
        if 'y' != user_confirmation.strip():
            return dict(message="Operation canceled by user.")
    elif (method_name == 'drop_models') and (not yes):
        # If dropping something, ask for confirmation.
        print(
            "Are you sure you want to permanently delete model(s), without any way to get "
            "them back? Enter 'y' if yes.")
        user_confirmation = raw_input()
        if 'y' != user_confirmation.strip():
            return dict(message="Operation canceled by user.")
    elif method_name == 'load_models':
        pklpath = client_dict['pkl_path']
        try:
            model_data = pickle.load(
                gzip.open(self.parser.get_absolute_path(pklpath), 'rb'))
        except IOError as e:
            # Retry with the likely filename extensions before giving up.
            if pklpath[-7:] != '.pkl.gz':
                if pklpath[-4:] == '.pkl':
                    model_data = pickle.load(
                        open(self.parser.get_absolute_path(pklpath), 'rb'))
                else:
                    pklpath = pklpath + ".pkl.gz"
                    model_data = pickle.load(
                        gzip.open(self.parser.get_absolute_path(pklpath),
                                  'rb'))
            else:
                raise utils.BayesDBError(
                    'Models file %s could not be found.' % pklpath)
        # This is the more recent version, where schema is stored with models.
        if 'schema' in model_data.keys():
            args_dict['models'] = model_data['models']
            args_dict['model_schema'] = model_data['schema']
        # This support older saved models, where only the model info was stored.
        else:
            args_dict['models'] = model_data
            args_dict['model_schema'] = None

        # Older versions of model_schema just had a str cctype as the dict items.
        # Newest version has a dict of cctype and parameters. Use this values to
        # test the recency of the models.
        model_schema = args_dict['model_schema']
        if model_schema:
            model_schema_itemtype = type(
                model_schema[model_schema.keys()[0]])
        else:
            model_schema_itemtype = None

        if model_schema is None or model_schema_itemtype != dict:
            args_dict['model_schema'] = None
            if not yes:
                # NOTE(review): Python 2 print statement (consistent with the
                # raw_input calls in this method). Original line breaks of this
                # triple-quoted message were lost; reconstructed best-effort.
                print """WARNING! The models you are currently importing were saved without a schema
or without detailed column parameters (probably from a previous version).

If you are loading models into the same table from which you created them,
problems are unlikely, unless you have dropped models and then updated the
schema. If you are loading models into a different table from which you
created them, you should verify that the table schemas are the same.

Please use "SAVE MODELS FROM <btable> TO <filename.pkl.gz>" to create an
updated copy of your models.

Are you sure you want to load these model(s)?
"""
                user_confirmation = raw_input()
                if 'y' != user_confirmation.strip():
                    return dict(message="Operation canceled by user.")
    elif method_name == 'create_btable':
        if pandas_df is None:
            header, rows = data_utils.read_csv(client_dict['csv_path'])
        else:
            header, rows = data_utils.read_pandas_df(pandas_df)
        args_dict['header'] = header
        args_dict['raw_T_full'] = rows
        args_dict['key_column'] = key_column
        args_dict['subsample'] = False
        if 'codebook_path' in client_dict:
            _, codebook_rows = data_utils.read_csv(
                client_dict['codebook_path'], has_header=True)
            # TODO: require specific codebook_header values? Or don't require a header,
            # and if the first value in the header is actually a data column name, assume
            # the first row is codebook data, not a header.

            # Create a dict indexed by column name
            codebook = dict()
            for codebook_row in codebook_rows:
                codebook[codebook_row[0]] = dict(
                    zip(['short_name', 'description', 'value_map'],
                        codebook_row[1:]))
            args_dict['codebook'] = codebook
        else:
            # NOTE(review): original line breaks inside this dedent() literal
            # were lost; reconstructed best-effort.
            warning = dedent("""
            WARNING!

            You are creating a btable without a codebook, which will make
            interpretation of results more difficult. Codebooks should be in
            CSV format with each row corresponding to one column of the
            original data. The codebook should have four columns:

            1. actual column name
            2. short column description
            3. long column description
            4. value map (optional, only used for categorical columns -
               should be in JSON format)
            """)
            print(warning)

        # Display warning messages and get confirmation if btable is too large.
        # Ask user if they want to turn on subsampling.
        max_columns = 200
        max_rows = 1000
        max_cells = 100000
        message = None
        if not yes:
            # NOTE: later checks overwrite `message`, so only the last
            # triggered warning is shown to the user.
            if len(rows[0]) > max_columns:
                message = "The btable you are uploading has %d columns, but BayesDB is " \
                          "currently designed to support only %d columns. If you proceed, " \
                          "performance may suffer unless you set many columns' datatypes to " \
                          "'ignore'. Would you like to continue? Enter 'y' if yes." \
                          % (len(rows[0]), max_columns)
            if len(rows) > max_rows:
                message = "The btable you are uploading has %d rows, but BayesDB is currently "\
                          "designed to support only %d rows. If you proceed, performance may "\
                          "suffer. Would you like to continue? Enter 'y' to continue without "\
                          "subsampling, 'n' to abort, 's' to continue by subsampling %d rows, "\
                          "or a positive integer to specify the number of rows to be "\
                          "subsampled." % (len(rows), max_rows, max_rows)
            if len(rows[0]) * len(rows) > max_cells:
                message = "The btable you are uploading has %d cells, but BayesDB is currently"\
                          " designed to support only %d cells. If you proceed, performance may"\
                          " suffer unless you enable subsampling. Enter 'y' to continue "\
                          " without subsampling, 'n' to abort, 's' to continue by subsampling "\
                          "%d rows, or a positive integer to specify the number of rows to be "\
                          "subsampled." \
                          % (len(rows)*len(rows[0]), max_cells, max_rows)
            if message is not None:
                print(message)
                user_confirmation = raw_input()
                if 'y' == user_confirmation.strip():
                    pass
                elif 'n' == user_confirmation.strip():
                    return dict(message="Operation canceled by user.")
                elif 's' == user_confirmation.strip():
                    args_dict['subsample'] = min(max_rows, len(rows))
                elif utils.is_int(user_confirmation.strip()):
                    args_dict['subsample'] = int(user_confirmation.strip())
                else:
                    return dict(message="Operation canceled by user.")
    elif method_name in ['label_columns', 'update_metadata']:
        if client_dict['source'] == 'file':
            header, rows = data_utils.read_csv(client_dict['csv_path'])
            args_dict['mappings'] = {key: value for key, value in rows}

    # Call engine.
    result = self.call_bayesdb_engine(method_name, args_dict, debug)

    # If error occurred, exit now.
    if 'error' in result and result['error']:
        if pretty:
            print(result['message'])
            if force_output:
                return result
            else:
                return result['message']
        else:
            return result

    # Do stuff now that engine has given you output, but before printing the result.
    result = self.callback(method_name, args_dict, client_dict, result)

    if return_raw_result:
        raw_result = {
            'result': result,
            'method_name': method_name,
            'client_dict': client_dict
        }
        print("returning raw result for %s" % (method_name))
        return raw_result

    assert type(result) != int

    if timing:
        end_time = time.time()
        print('Elapsed time: %.2f seconds.'
              % (end_time - start_time))

    if plots is None:
        plots = 'DISPLAY' in os.environ.keys()

    if 'matrix' in result and (plots or client_dict['filename']):
        # Plot matrices
        plotting_utils.plot_matrix(result['matrix'],
                                   result['column_names'], result['title'],
                                   client_dict['filename'])
        if pretty:
            if 'column_lists' in result:
                print(
                    self.pretty_print(
                        dict(column_lists=result['column_lists'])))
            if force_output:
                return result
            else:
                return self.pretty_print(result)
        else:
            return result

    if ('plot' in client_dict and client_dict['plot']):
        if (plots or client_dict['filename']):
            # Plot generalized histograms or scatterplots
            # Newer result payloads carry full metadata; fall back to M_c.
            try:
                plotting_M_c = result['metadata_full']['M_c_full']
            except KeyError:
                plotting_M_c = result['M_c']
            plot_remove_key = method_name in ['select', 'infer']
            plotting_utils.plot_general_histogram(
                result['column_names'], result['data'], plotting_M_c,
                result['schema_full'], client_dict['filename'],
                client_dict['scatter'], remove_key=plot_remove_key)
            return self.pretty_print(result)
        else:
            if 'message' not in result:
                result['message'] = ""
            result['message'] = "Your query indicates that you would like to make a plot, but "\
                                "in order to do so, you must either enable plotting in a "\
                                "window or specify a filename to save to by appending 'SAVE "\
                                "TO <filename>' to this command.\n" + result['message']

    if pretty:
        pp = self.pretty_print(result)
        print(pp)

    # Print warnings last so they're readable without scrolling backwards.
    if 'warnings' in result:
        """ Pretty-print warnings. """
        for warning in result['warnings']:
            print('WARNING: %s' % warning)

    if pandas_output and 'data' in result and 'column_labels' in result:
        result_pandas_df = data_utils.construct_pandas_df(result)
        return result_pandas_df
    else:
        return result
def execute_statement(self, bql_statement_ast, pretty=True, timing=False, plots=None,
                      yes=False, debug=False, pandas_df=None, pandas_output=True,
                      key_column=None, return_raw_result=False):
    """
    Accepts a SINGLE BQL STATEMENT as input, parses it, and executes it if it
    was parsed successfully. If pretty=True, then the command output will be
    pretty-printed as a string. If pretty=False, then the command output will
    be returned as a python object.

    timing=True prints out how long the command took to execute.

    For commands that have visual results, plots=True will cause those to be
    displayed by matplotlib as graphics rather than being pretty-printed as
    text. (Note that the graphics will also be saved if the user added
    SAVE TO <filename> to the BQL.)
    """
    if timing:
        start_time = time.time()

    parser_out = None
    # TODO move pyparsing objects out of client into parser
    if debug:
        parser_out = self.parser.parse_single_statement(bql_statement_ast)
    else:
        try:
            parser_out = self.parser.parse_single_statement(bql_statement_ast)
        except Exception as e:
            # Any parse failure is surfaced uniformly as a BayesDBParseError.
            raise utils.BayesDBParseError(str(e))
    if parser_out is None:
        print("Could not parse command. Try typing 'help' for a list of all commands.")
        return
    elif not parser_out:
        # Parsed to something falsy (e.g. an empty statement): nothing to do.
        return

    method_name, args_dict, client_dict = parser_out
    if client_dict is None:
        client_dict = {}

    # Do stuff now that you know the user's command, but before passing it to engine.
    if method_name == 'execute_file':
        return dict(message='execute_file',
                    bql_string=open(args_dict['filename'], 'r').read())
    elif method_name == 'update_codebook':
        _, codebook_rows = data_utils.read_csv(client_dict['codebook_path'], has_header=True)
        # TODO: require specific codebook_header values? Or don't require a header,
        # and if the first value in the header is actually a data column name, assume
        # the first row is codebook data, not a header.

        # Create a dict indexed by column name
        codebook = dict()
        for codebook_row in codebook_rows:
            codebook[codebook_row[0]] = dict(zip(['short_name', 'description', 'value_map'],
                                                 codebook_row[1:]))
        args_dict['codebook'] = codebook
    elif (method_name == 'drop_btable') and (not yes):
        # If dropping something, ask for confirmation.
        print("Are you sure you want to permanently delete this btable, and all associated "
              "models, without any way to get them back? Enter 'y' if yes.")
        user_confirmation = raw_input()
        if 'y' != user_confirmation.strip():
            return dict(message="Operation canceled by user.")
    elif (method_name == 'drop_models') and (not yes):
        # If dropping something, ask for confirmation.
        print("Are you sure you want to permanently delete model(s), without any way to get "
              "them back? Enter 'y' if yes.")
        user_confirmation = raw_input()
        if 'y' != user_confirmation.strip():
            return dict(message="Operation canceled by user.")
    elif method_name == 'load_models':
        pklpath = client_dict['pkl_path']
        try:
            model_data = pickle.load(gzip.open(self.parser.get_absolute_path(pklpath), 'rb'))
        except IOError as e:
            # Retry with the likely filename extensions before giving up.
            if pklpath[-7:] != '.pkl.gz':
                if pklpath[-4:] == '.pkl':
                    model_data = pickle.load(open(self.parser.get_absolute_path(pklpath), 'rb'))
                else:
                    pklpath = pklpath + ".pkl.gz"
                    model_data = pickle.load(gzip.open(self.parser.get_absolute_path(pklpath),
                                                       'rb'))
            else:
                raise utils.BayesDBError('Models file %s could not be found.' % pklpath)
        # This is the more recent version, where schema is stored with models.
        if 'schema' in model_data.keys():
            args_dict['models'] = model_data['models']
            args_dict['model_schema'] = model_data['schema']
        # This support older saved models, where only the model info was stored.
        else:
            args_dict['models'] = model_data
            args_dict['model_schema'] = None
    elif method_name == 'create_btable':
        if pandas_df is None:
            header, rows = data_utils.read_csv(client_dict['csv_path'])
        else:
            header, rows = data_utils.read_pandas_df(pandas_df)
        args_dict['header'] = header
        args_dict['raw_T_full'] = rows
        args_dict['key_column'] = key_column
        args_dict['subsample'] = False
        if 'codebook_path' in client_dict:
            _, codebook_rows = data_utils.read_csv(client_dict['codebook_path'],
                                                   has_header=True)
            # TODO: require specific codebook_header values? Or don't require a header,
            # and if the first value in the header is actually a data column name, assume
            # the first row is codebook data, not a header.

            # Create a dict indexed by column name
            codebook = dict()
            for codebook_row in codebook_rows:
                codebook[codebook_row[0]] = dict(zip(['short_name', 'description', 'value_map'],
                                                     codebook_row[1:]))
            args_dict['codebook'] = codebook
        else:
            # NOTE(review): original line breaks inside this dedent() literal
            # were lost; reconstructed best-effort.
            warning = dedent("""
            WARNING!

            You are creating a btable without a codebook, which will make
            interpretation of results more difficult. Codebooks should be in
            CSV format with each row corresponding to one column of the
            original data. The codebook should have four columns:

            1. actual column name
            2. short column description
            3. long column description
            4. value map (optional, only used for categorical columns -
               should be in JSON format)
            """)
            print(warning)

        # Display warning messages and get confirmation if btable is too large.
        # Ask user if they want to turn on subsampling.
        max_columns = 200
        max_rows = 1000
        max_cells = 100000
        message = None
        if not yes:
            # NOTE: later checks overwrite `message`, so only the last
            # triggered warning is shown to the user.
            if len(rows[0]) > max_columns:
                message = "The btable you are uploading has %d columns, but BayesDB is " \
                          "currently designed to support only %d columns. If you proceed, " \
                          "performance may suffer unless you set many columns' datatypes to " \
                          "'ignore'. Would you like to continue? Enter 'y' if yes." \
                          % (len(rows[0]), max_columns)
            if len(rows) > max_rows:
                message = "The btable you are uploading has %d rows, but BayesDB is currently "\
                          "designed to support only %d rows. If you proceed, performance may "\
                          "suffer. Would you like to continue? Enter 'y' to continue without "\
                          "subsampling, 'n' to abort, 's' to continue by subsampling %d rows, "\
                          "or a positive integer to specify the number of rows to be "\
                          "subsampled." % (len(rows), max_rows, max_rows)
            if len(rows[0])*len(rows) > max_cells:
                message = "The btable you are uploading has %d cells, but BayesDB is currently"\
                          " designed to support only %d cells. If you proceed, performance may"\
                          " suffer unless you enable subsampling. Enter 'y' to continue "\
                          " without subsampling, 'n' to abort, 's' to continue by subsampling "\
                          "%d rows, or a positive integer to specify the number of rows to be "\
                          "subsampled." % (len(rows)*len(rows[0]), max_cells, max_rows)
            if message is not None:
                print(message)
                user_confirmation = raw_input()
                if 'y' == user_confirmation.strip():
                    pass
                elif 'n' == user_confirmation.strip():
                    return dict(message="Operation canceled by user.")
                elif 's' == user_confirmation.strip():
                    args_dict['subsample'] = min(max_rows, len(rows))
                elif utils.is_int(user_confirmation.strip()):
                    args_dict['subsample'] = int(user_confirmation.strip())
                else:
                    return dict(message="Operation canceled by user.")
    elif method_name in ['label_columns', 'update_metadata']:
        if client_dict['source'] == 'file':
            header, rows = data_utils.read_csv(client_dict['csv_path'])
            args_dict['mappings'] = {key: value for key, value in rows}

    # Call engine.
    result = self.call_bayesdb_engine(method_name, args_dict, debug)

    # If error occurred, exit now.
    if 'error' in result and result['error']:
        if pretty:
            print(result['message'])
            return result['message']
        else:
            return result

    # Do stuff now that engine has given you output, but before printing the result.
    result = self.callback(method_name, args_dict, client_dict, result)

    if return_raw_result:
        raw_result = {
            'result': result,
            'method_name': method_name,
            'client_dict': client_dict}
        print("returning raw result for %s" % (method_name))
        return raw_result

    assert type(result) != int

    if timing:
        end_time = time.time()
        print('Elapsed time: %.2f seconds.' % (end_time - start_time))

    if plots is None:
        plots = 'DISPLAY' in os.environ.keys()

    if 'matrix' in result and (plots or client_dict['filename']):
        # Plot matrices
        plotting_utils.plot_matrix(result['matrix'], result['column_names'],
                                   result['title'], client_dict['filename'])
        if pretty:
            if 'column_lists' in result:
                print(self.pretty_print(dict(column_lists=result['column_lists'])))
            return self.pretty_print(result)
        else:
            return result

    if ('plot' in client_dict and client_dict['plot']):
        if (plots or client_dict['filename']):
            # Plot generalized histograms or scatterplots
            # Newer result payloads carry full metadata; fall back to M_c.
            try:
                plotting_M_c = result['metadata_full']['M_c_full']
            except KeyError:
                plotting_M_c = result['M_c']
            plot_remove_key = method_name in ['select', 'infer']
            plotting_utils.plot_general_histogram(result['column_names'], result['data'],
                                                  plotting_M_c, result['schema_full'],
                                                  client_dict['filename'],
                                                  client_dict['scatter'],
                                                  remove_key=plot_remove_key)
            return self.pretty_print(result)
        else:
            if 'message' not in result:
                result['message'] = ""
            result['message'] = "Your query indicates that you would like to make a plot, but "\
                                "in order to do so, you must either enable plotting in a "\
                                "window or specify a filename to save to by appending 'SAVE "\
                                "TO <filename>' to this command.\n" + result['message']

    if pretty:
        pp = self.pretty_print(result)
        print(pp)

    # Print warnings last so they're readable without scrolling backwards.
    if 'warnings' in result:
        """ Pretty-print warnings. """
        for warning in result['warnings']:
            print('WARNING: %s' % warning)

    if pandas_output and 'data' in result and 'column_labels' in result:
        result_pandas_df = data_utils.construct_pandas_df(result)
        return result_pandas_df
    else:
        return result
abl_times = args.abl_times
print("Rule file is :", rule_file_path)
# The CLI passes the literal string "None" to disable the rule file.
if rule_file_path == "None":
    rule_file_path = None

# Data file names for the pretrain / abductive-training / test splits.
pretrain_filename = "0_0.10.json"
pretrain_money_filename = "./data/0_0.10.csv"
abl_train_filename = "1_0.90.json"
abl_train_money_filename = "1_0.90.csv"
test_filename = "10.json"
test_money_filename = "10.csv"

recorder.write_pair("Method", "ABL")

pretrain_filenames, pretrain_ahs, pretrain_money, pretrain_labels, pretrain_attrs = read_csv(
    pretrain_money_filename)

# Invoke BERT
perception = BERT(bert_path="./chinese_L-12_H-768_A-12",
                  data_dir="./data",
                  output_dir="./abl_model_0",
                  num_train_epochs=pretrain_bert_train_epochs)
sentence = SentenceModel()
abductor = SentenceAbduction(sentence, rule_file_path, True)

# Pretrain the perception model, then evaluate the newest checkpoint.
perception.train(pretrain_filename)
lastest_ckpt = get_lastest_ckpt(perception.output_dir + "/checkpoint")
print("Test BERT:")
bert_eval_info = perception.eval(
    test_filename, perception.output_dir + "/" + lastest_ckpt)
def execute_statement(
    self,
    bql_statement_ast,
    pretty=True,
    timing=False,
    plots=None,
    yes=False,
    debug=False,
    pandas_df=None,
    pandas_output=True,
    key_column=None,
    return_raw_result=False,
    force_output=False,
):
    """
    Accepts a SINGLE BQL STATEMENT as input, parses it, and executes it if it
    was parsed successfully.

    If pretty=True, then the command output will be pretty-printed as a string.
    If pretty=False, then the command output will be returned as a python object.
    If force_output=True, then results will be returned regardless of pretty

    timing=True prints out how long the command took to execute.

    For commands that have visual results, plots=True will cause those to be
    displayed by matplotlib as graphics rather than being pretty-printed as
    text. (Note that the graphics will also be saved if the user added
    SAVE TO <filename> to the BQL.)

    Other parameters:
      yes: skip all interactive confirmation prompts (drop/load/size warnings).
      debug: let parser exceptions propagate instead of wrapping them.
      pandas_df: pre-loaded DataFrame to use for create_btable instead of CSV.
      pandas_output: return tabular results as a pandas DataFrame when possible.
      key_column: forwarded to the engine as the btable key column.
      return_raw_result: return the raw engine result dict without formatting.
    """
    if timing:
        start_time = time.time()

    # --- Parse the statement ---
    parser_out = None
    # TODO move pyparsing objects out of client into parser
    if debug:
        parser_out = self.parser.parse_single_statement(bql_statement_ast)
    else:
        try:
            parser_out = self.parser.parse_single_statement(bql_statement_ast)
        except Exception as e:
            raise utils.BayesDBParseError(str(e))
    if parser_out is None:
        print ("Could not parse command. Try typing 'help' for a list of all commands.")
        return
    elif not parser_out:
        # Falsy-but-not-None parser output: nothing to execute (e.g. empty input).
        return

    method_name, args_dict, client_dict = parser_out
    if client_dict is None:
        client_dict = {}

    # --- Pre-engine handling ---
    # Do stuff now that you know the user's command, but before passing it to engine.
    if method_name == "execute_file":
        return dict(message="execute_file", bql_string=open(args_dict["filename"], "r").read())
    elif method_name == "update_codebook":
        _, codebook_rows = data_utils.read_csv(client_dict["codebook_path"], has_header=True)
        # TODO: require specific codebook_header values? Or don't require a header,
        # and if the first value in the header is actually a data column name, assume
        # the first row is codebook data, not a header.

        # Create a dict indexed by column name
        codebook = dict()
        for codebook_row in codebook_rows:
            codebook[codebook_row[0]] = dict(zip(["short_name", "description", "value_map"],
                                                 codebook_row[1:]))
        args_dict["codebook"] = codebook
    elif (method_name == "drop_btable") and (not yes):
        # If dropping something, ask for confirmation.
        print (
            "Are you sure you want to permanently delete this btable, and all associated "
            "models, without any way to get them back? Enter 'y' if yes."
        )
        user_confirmation = raw_input()
        if "y" != user_confirmation.strip():
            return dict(message="Operation canceled by user.")
    elif (method_name == "drop_models") and (not yes):
        # If dropping something, ask for confirmation.
        print (
            "Are you sure you want to permanently delete model(s), without any way to get "
            "them back? Enter 'y' if yes."
        )
        user_confirmation = raw_input()
        if "y" != user_confirmation.strip():
            return dict(message="Operation canceled by user.")
    elif method_name == "load_models":
        pklpath = client_dict["pkl_path"]
        try:
            model_data = pickle.load(gzip.open(self.parser.get_absolute_path(pklpath), "rb"))
        except IOError as e:
            # Fallbacks: the user may have omitted the .pkl.gz extension, or the
            # file may be an uncompressed .pkl.
            if pklpath[-7:] != ".pkl.gz":
                if pklpath[-4:] == ".pkl":
                    model_data = pickle.load(open(self.parser.get_absolute_path(pklpath), "rb"))
                else:
                    pklpath = pklpath + ".pkl.gz"
                    model_data = pickle.load(gzip.open(self.parser.get_absolute_path(pklpath), "rb"))
            else:
                raise utils.BayesDBError("Models file %s could not be found." % pklpath)
        # This is the more recent version, where schema is stored with models.
        if "schema" in model_data.keys():
            args_dict["models"] = model_data["models"]
            args_dict["model_schema"] = model_data["schema"]
        # This support older saved models, where only the model info was stored.
        else:
            args_dict["models"] = model_data
            args_dict["model_schema"] = None

        # Older versions of model_schema just had a str cctype as the dict items.
        # Newest version has a dict of cctype and parameters. Use this values to
        # test the recency of the models.
        model_schema = args_dict["model_schema"]
        if model_schema:
            model_schema_itemtype = type(model_schema[model_schema.keys()[0]])
        else:
            model_schema_itemtype = None

        if model_schema is None or model_schema_itemtype != dict:
            args_dict["model_schema"] = None
            if not yes:
                print """WARNING! The models you are currently importing were saved without a
schema or without detailed column parameters (probably from a previous version).

If you are loading models into the same table from which you created them,
problems are unlikely, unless you have dropped models and then updated the
schema.

If you are loading models into a different table from which you created them,
you should verify that the table schemas are the same.

Please use "SAVE MODELS FROM <btable> TO <filename.pkl.gz>" to create an
updated copy of your models.

Are you sure you want to load these model(s)?
"""
                user_confirmation = raw_input()
                if "y" != user_confirmation.strip():
                    return dict(message="Operation canceled by user.")
    elif method_name == "create_btable":
        # Source the table either from a CSV file or a caller-supplied DataFrame.
        if pandas_df is None:
            header, rows = data_utils.read_csv(client_dict["csv_path"])
        else:
            header, rows = data_utils.read_pandas_df(pandas_df)
        args_dict["header"] = header
        args_dict["raw_T_full"] = rows
        args_dict["key_column"] = key_column
        args_dict["subsample"] = False

        if "codebook_path" in client_dict:
            _, codebook_rows = data_utils.read_csv(client_dict["codebook_path"], has_header=True)
            # TODO: require specific codebook_header values? Or don't require a header,
            # and if the first value in the header is actually a data column name, assume
            # the first row is codebook data, not a header.

            # Create a dict indexed by column name
            codebook = dict()
            for codebook_row in codebook_rows:
                codebook[codebook_row[0]] = dict(zip(["short_name", "description", "value_map"],
                                                     codebook_row[1:]))
            args_dict["codebook"] = codebook
        else:
            warning = dedent(
                """
            WARNING!

            You are creating a btable without a codebook, which will make
            interpretation of results more difficult. Codebooks should be in
            CSV format with each row corresponding to one column of the
            original data. The codebook should have four columns:

            1. actual column name
            2. short column description
            3. long column description
            4. value map (optional, only used for categorical columns -
            should be in JSON format)
            """
            )
            print (warning)

        # Display warning messages and get confirmation if btable is too large.
        # Ask user if they want to turn on subsampling.
        max_columns = 200
        max_rows = 1000
        max_cells = 100000
        message = None
        if not yes:
            # Later checks overwrite `message`, so only the last exceeded limit
            # is reported to the user.
            if len(rows[0]) > max_columns:
                message = (
                    "The btable you are uploading has %d columns, but BayesDB is "
                    "currently designed to support only %d columns. If you proceed, "
                    "performance may suffer unless you set many columns' datatypes to "
                    "'ignore'. Would you like to continue? Enter 'y' if yes."
                    % (len(rows[0]), max_columns)
                )
            if len(rows) > max_rows:
                message = (
                    "The btable you are uploading has %d rows, but BayesDB is currently "
                    "designed to support only %d rows. If you proceed, performance may "
                    "suffer. Would you like to continue? Enter 'y' to continue without "
                    "subsampling, 'n' to abort, 's' to continue by subsampling %d rows, "
                    "or a positive integer to specify the number of rows to be "
                    "subsampled." % (len(rows), max_rows, max_rows)
                )
            if len(rows[0]) * len(rows) > max_cells:
                message = (
                    "The btable you are uploading has %d cells, but BayesDB is currently"
                    " designed to support only %d cells. If you proceed, performance may"
                    " suffer unless you enable subsampling. Enter 'y' to continue "
                    " without subsampling, 'n' to abort, 's' to continue by subsampling "
                    "%d rows, or a positive integer to specify the number of rows to be "
                    "subsampled."
                    % (len(rows) * len(rows[0]), max_cells, max_rows)
                )
        if message is not None:
            print (message)
            user_confirmation = raw_input()
            if "y" == user_confirmation.strip():
                pass
            elif "n" == user_confirmation.strip():
                return dict(message="Operation canceled by user.")
            elif "s" == user_confirmation.strip():
                args_dict["subsample"] = min(max_rows, len(rows))
            elif utils.is_int(user_confirmation.strip()):
                args_dict["subsample"] = int(user_confirmation.strip())
            else:
                return dict(message="Operation canceled by user.")
    elif method_name in ["label_columns", "update_metadata"]:
        if client_dict["source"] == "file":
            # File rows are (key, value) pairs mapping column names to labels.
            header, rows = data_utils.read_csv(client_dict["csv_path"])
            args_dict["mappings"] = {key: value for key, value in rows}

    # Call engine.
    result = self.call_bayesdb_engine(method_name, args_dict, debug)

    # If error occurred, exit now.
    if "error" in result and result["error"]:
        if pretty:
            print (result["message"])
            if force_output:
                return result
            else:
                return result["message"]
        else:
            return result

    # --- Post-engine handling ---
    # Do stuff now that engine has given you output, but before printing the result.
    result = self.callback(method_name, args_dict, client_dict, result)

    if return_raw_result:
        raw_result = {"result": result, "method_name": method_name, "client_dict": client_dict}
        print ("returning raw result for %s" % (method_name))
        return raw_result

    assert type(result) != int

    if timing:
        end_time = time.time()
        print ("Elapsed time: %.2f seconds."
               % (end_time - start_time))

    # Default: only plot interactively when a display is available.
    if plots is None:
        plots = "DISPLAY" in os.environ.keys()

    if "matrix" in result and (plots or client_dict["filename"]):
        # Plot matrices
        plotting_utils.plot_matrix(
            result["matrix"], result["column_names"], result["title"], client_dict["filename"]
        )
        if pretty:
            if "column_lists" in result:
                print (self.pretty_print(dict(column_lists=result["column_lists"])))
            if force_output:
                return result
            else:
                return self.pretty_print(result)
        else:
            return result
    if "plot" in client_dict and client_dict["plot"]:
        if plots or client_dict["filename"]:
            # Plot generalized histograms or scatterplots
            try:
                plotting_M_c = result["metadata_full"]["M_c_full"]
            except KeyError:
                plotting_M_c = result["M_c"]
            # select/infer results carry a leading key column that the plotter
            # must drop.
            plot_remove_key = method_name in ["select", "infer"]
            plotting_utils.plot_general_histogram(
                result["column_names"],
                result["data"],
                plotting_M_c,
                result["schema_full"],
                client_dict["filename"],
                client_dict["scatter"],
                remove_key=plot_remove_key,
            )
            return self.pretty_print(result)
        else:
            if "message" not in result:
                result["message"] = ""
            result["message"] = (
                "Your query indicates that you would like to make a plot, but "
                "in order to do so, you must either enable plotting in a "
                "window or specify a filename to save to by appending 'SAVE "
                "TO <filename>' to this command.\n" + result["message"]
            )

    if pretty:
        pp = self.pretty_print(result)
        print (pp)

    # Print warnings last so they're readable without scrolling backwards.
    if "warnings" in result:
        """ Pretty-print warnings. """
        for warning in result["warnings"]:
            print ("WARNING: %s" % warning)

    # Tabular results are optionally converted to a pandas DataFrame.
    if pandas_output and "data" in result and "column_labels" in result:
        result_pandas_df = data_utils.construct_pandas_df(result)
        return result_pandas_df
    else:
        return result
# Script: serialize hydronephrosis (HN) study images + clinical data into a
# single TFRecord file of SequenceExamples.
import os
import os.path

# NOTE(review): `subj_images` is imported but not used in this chunk — verify
# it is needed, or used elsewhere in the file.
from data_utils import subj_images, read_csv, make_seq_example

# Dataset root and relative locations of the two image classes plus the
# clinical-data CSV.
BASE_PATH = '/home/yasaman/HN/HN-sample-to-start-with-otherdata-imgs'
SURG_PATH = 'Images/Pyeloplasty/'
CONS_PATH = 'Images/No Surgery/'
CLINC_PATH = 'RAW_PHN_DATA_Sample.csv'

# Resolve the class directories against the dataset root (intentionally
# rebinding the relative values above).
SURG_PATH = os.path.join(BASE_PATH, SURG_PATH)
CONS_PATH = os.path.join(BASE_PATH, CONS_PATH)

# NOTE(review): `surgery_names` is computed but never used below — possibly
# leftover; confirm before removing.
surgery_names = os.listdir(SURG_PATH)
surgery_names.sort()

# path to save hydronephrosis
TFREC_HN_PATH = '/home/yasaman/HN/hn_data.tfrecords'

# Label 0 = conservative management (no surgery), 1 = pyeloplasty surgery;
# each label maps to the directory holding that class's images.
label_map = dict({0:CONS_PATH, 1:SURG_PATH})

# {study_no: clinical info} parsed from the CSV.
all_examples = read_csv(os.path.join(BASE_PATH, CLINC_PATH))

# NOTE(review): `tf` is not imported in this chunk — presumably
# `import tensorflow as tf` appears elsewhere in the file; verify, otherwise
# this script fails with NameError. (tf.python_io is the TF 1.x API.)
with tf.python_io.TFRecordWriter(TFREC_HN_PATH) as writer:
    for study_no,info in all_examples.items():
        example = make_seq_example(study_no, info, label_map)
        writer.write(example.SerializeToString())