def i_check_output_file(step, output=None, check_file=None):
    """Step: compare the generated reify.py script with the expected file.

    Reads the expected contents from ``check_file`` (resolved through
    ``res_filename``) and the generated contents from ``reify.py`` in the
    world directory, normalizes both, and asserts they match. On Python 3
    an alternative expectation file is tried before failing.
    """
    if check_file is None or output is None:
        assert False, "Both output and check_file are required"
    check_file = res_filename(check_file)
    try:
        generated_file = os.path.join(world.directory, "reify.py")
        with open(check_file, open_mode("r")) as check_handler:
            check_contents = check_handler.read().strip("\n")
        # remove unicode mark for strings if Python3
        if PYTHON3:
            check_contents = check_contents.replace(
                " u'", " '").replace("{u'", "{'").replace(' u"', ' "')
        # distinct handler name: the original shadowed the path variable
        with open(generated_file, open_mode("r")) as output_handler:
            output_file_contents = output_handler.read()
        # strip comments at the beginning of the file
        output_file_contents = re.sub(
            r'""".*"""', '', output_file_contents, flags=re.S).strip("\n")
        if check_contents != output_file_contents and PYTHON3:
            # look for an alternative in PYTHON3
            check_contents = python3_contents(check_file, check_contents)
        assert check_contents == output_file_contents, \
            ("File contents:\n%s\nExpected contents:\n%s" %
             (output_file_contents, check_contents))
    except Exception as exc:
        # any failure (including the asserts above) surfaces as a step
        # failure with the exception text, mirroring the original behavior
        assert False, str(exc)
def i_check_output_file(step, output=None, check_file=None):
    """Step: compare the generated reify.py script with the expected file.

    Normalizes both sides (unicode marks, leading docstring comment and
    the internally added project id) before comparing. On Python 3 an
    alternative expectation file is tried before failing.
    """
    if check_file is None or output is None:
        assert False, "Both output and check_file are required"
    check_file = res_filename(check_file)
    try:
        generated_file = os.path.join(world.directory, "reify.py")
        with open(check_file, open_mode("r")) as check_handler:
            check_contents = check_handler.read().strip("\n")
        # remove unicode mark for strings if Python3
        if PYTHON3:
            check_contents = check_contents.replace(
                " u'", " '").replace("{u'", "{'").replace(' u"', ' "')
        # distinct handler name: the original shadowed the path variable
        with open(generated_file, open_mode("r")) as output_handler:
            output_file_contents = output_handler.read()
        # strip comments at the beginning of the file
        output_file_contents = re.sub(
            r'""".*"""', '', output_file_contents, flags=re.S).strip("\n")
        # strip internally added project id information; the generated
        # code only carries a u'' prefix on Python 2
        prefix = "" if PYTHON3 else "u"
        p_str = r',\s\\\n \{\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
            % prefix
        output_file_contents = re.sub(p_str, ')', output_file_contents,
                                      flags=re.S).strip("\n")
        p_str = r',\s\\\n \s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
            % prefix
        output_file_contents = re.sub(p_str, ')', output_file_contents,
                                      flags=re.S).strip("\n")
        p_str = r',\n \s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
            % prefix
        output_file_contents = re.sub(p_str, '})', output_file_contents,
                                      flags=re.S).strip("\n")
        p_str = r',\s\'project\':\s%s\'project/[a-f0-9]{24}\'' % prefix
        output_file_contents = re.sub(p_str, '', output_file_contents,
                                      flags=re.S).strip("\n")
        # NOTE: a leftover debug ``print output_file_contents`` (Python-2
        # only print statement) was removed here
        if check_contents != output_file_contents and PYTHON3:
            # look for an alternative in PYTHON3
            check_contents = python3_contents(check_file, check_contents)
        assert check_contents == output_file_contents, \
            ("File contents:\n%s\nExpected contents:\n%s" %
             (output_file_contents, check_contents))
    except Exception as exc:
        assert False, str(exc)
def i_check_output_file(step, output=None, check_file=None):
    """Step: compare the generated reify.py script with the expected file.

    The expected contents are indented with INDENT to match the generated
    script, both sides are normalized (unicode marks, main() scaffolding,
    internally added project ids) and equality is asserted with ``eq_``.
    """
    if check_file is None or output is None:
        assert False, "Both output and check_file are required"
    check_file = res_filename(check_file)
    output_file = os.path.join(world.directory, "reify.py")
    with open(check_file, open_mode("r")) as check_file_handler:
        check_contents = check_file_handler.read().strip("\n")
    # indent every non-empty expected line to match the generated script
    check_contents_lines = check_contents.split("\n")
    for index, line in enumerate(check_contents_lines):
        if line:
            check_contents_lines[index] = INDENT + line
    check_contents = "\n".join(check_contents_lines)
    # remove unicode mark for strings if Python3
    if PYTHON3:
        check_contents = check_contents.replace(
            " u'", " '").replace("{u'", "{'").replace(
            ' u"', ' "').replace('u\\\'', '\\\'')
    # distinct handler name: the original shadowed the path variable
    with open(output_file, open_mode("r")) as output_handler:
        output_file_contents = output_handler.read()
    # strip comments at the beginning of the file
    output_file_contents = re.sub(r'#!.*def\smain\(\):\n', '',
                                  output_file_contents,
                                  flags=re.S).strip("\n")
    output_file_contents = output_file_contents.replace(
        '\n\nif __name__ == "__main__":\n main()', '')
    # strip internally added project id information
    prefix = "" if PYTHON3 else "u"
    p_str = r',\s\\\n%s\{\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
        % (INDENT * 2, prefix)
    output_file_contents = re.sub(p_str, ')', output_file_contents,
                                  flags=re.S).strip("\n")
    p_str = r',\s\\\n%s\s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
        % (INDENT * 2, prefix)
    output_file_contents = re.sub(p_str, ')', output_file_contents,
                                  flags=re.S).strip("\n")
    p_str = r',\n%s\s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
        % (INDENT * 2, prefix)
    output_file_contents = re.sub(p_str, '})', output_file_contents,
                                  flags=re.S).strip("\n")
    p_str = r',\s\'project\':\s%s\'project/[a-f0-9]{24}\'' % prefix
    output_file_contents = re.sub(p_str, '', output_file_contents,
                                  flags=re.S).strip("\n")
    if check_contents != output_file_contents:
        if PYTHON3:
            # look for an alternative in PYTHON3
            check_contents = python3_contents(check_file, check_contents)
        if check_contents != output_file_contents:
            check_contents = python3_contents(check_file, check_contents,
                                              alternative="_1")
    eq_(check_contents, output_file_contents)
def i_check_output_file(step, output=None, check_file=None):
    """Checks the reify.py script generated in the world directory against
       the expected contents stored in ``check_file``.
    """
    if check_file is None or output is None:
        assert False
    check_file = res_filename(check_file)
    generated_path = os.path.join(world.directory, "reify.py")
    with open(check_file, open_mode("r")) as expected_handler:
        check_contents = expected_handler.read().strip("\n")
    # indent every non-empty expected line to match the generated script
    indented = [INDENT + text if text else text
                for text in check_contents.split("\n")]
    check_contents = "\n".join(indented)
    if PYTHON3:
        # drop the u'' unicode marks that only appear on Python 2
        for mark, plain in ((" u'", " '"), ("{u'", "{'"),
                            (' u"', ' "'), ('u\\\'', '\\\'')):
            check_contents = check_contents.replace(mark, plain)
    with open(generated_path, open_mode("r")) as generated_handler:
        output_file_contents = generated_handler.read()
    # drop the header comments and the main() scaffolding
    output_file_contents = re.sub(r'#!.*def\smain\(\):\n', '',
                                  output_file_contents,
                                  flags=re.S).strip("\n")
    output_file_contents = output_file_contents.replace(
        '\n\nif __name__ == "__main__":\n main()', '')
    # drop the internally added project id information
    prefix = "" if PYTHON3 else "u"
    project_patterns = [
        (r',\s\\\n%s\{\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)'
         % (INDENT * 2, prefix), ')'),
        (r',\s\\\n%s\s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)'
         % (INDENT * 2, prefix), ')'),
        (r',\n%s\s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)'
         % (INDENT * 2, prefix), '})'),
        (r',\s\'project\':\s%s\'project/[a-f0-9]{24}\'' % prefix, ''),
    ]
    for pattern, replacement in project_patterns:
        output_file_contents = re.sub(pattern, replacement,
                                      output_file_contents,
                                      flags=re.S).strip("\n")
    if check_contents != output_file_contents:
        if PYTHON3:
            # try the Python 3 specific expectation files
            check_contents = python3_contents(check_file, check_contents)
        if check_contents != output_file_contents:
            check_contents = python3_contents(check_file, check_contents,
                                              alternative="_1")
    eq_(check_contents, output_file_contents)
def best_first_search(datasets_file, api, args, common_options,
                      staleness=None, penalty=None, objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction

       Reads the first dataset id from ``datasets_file``, retrieves the
       dataset (preferring the locally stored copy over an API call) and
       builds the initial feature set, excluding the user-provided
       features and the objective field.
    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(
        os.path.join(args.output_dir, FEATURES_LOG))
    with open(features_file, u.open_mode("w")) as features_handler:
        features_writer = csv.writer(features_handler, lineterminator="\n")
        features_writer.writerow(
            ["step", "state", "score", "metric_value", "best_score"])
        features_handler.flush()
        if staleness is None:
            staleness = DEFAULT_STALENESS
        if penalty is None:
            penalty = DEFAULT_PENALTY
        # retrieving the first dataset in the file
        try:
            with open(datasets_file, u.open_mode("r")) as datasets_handler:
                dataset_id = datasets_handler.readline().strip()
        except IOError as exc:
            # ``as`` syntax replaces the Python-2-only ``except X, exc``
            sys.exit("Could not read the generated datasets file: %s" %
                     str(exc))
        try:
            # prefer the locally stored dataset to avoid an API round-trip
            stored_dataset = u.storage_file_name(args.output_dir, dataset_id)
            with open(stored_dataset, u.open_mode("r")) as dataset_handler:
                dataset = json.loads(dataset_handler.read())
        except IOError:
            dataset = api.check_resource(dataset_id,
                                         query_string=ALL_FIELDS_QS)
        # initial feature set
        fields = Fields(dataset)
        excluded_features = ([] if args.exclude_features is None else
                             args.exclude_features.split(
                                 args.args_separator))
        try:
            excluded_ids = [fields.field_id(feature)
                            for feature in excluded_features]
            objective_id = fields.field_id(objective_name)
        except ValueError as exc:
            sys.exit(exc)
def create_kfold_json(args, kfold_field=DEFAULT_KFOLD_FIELD,
                      objective_field=None, resume=False):
    """Write one filter file per fold.

       For each fold index a new-field expression (random integer in
       [0, k)) is written to its TEST_DATASET file under args.output_dir.
       Returns the list of generated file paths and the (possibly reset)
       resume flag.
    """
    folds = args.k_folds if args.k_folds else DEFAULT_KFOLDS
    selecting_file_list = []
    try:
        for fold_index in range(folds):
            field_expr = NEW_FIELD % (fold_index, folds, kfold_field,
                                      fold_index, objective_field)
            fold_path = os.path.normpath(
                os.path.join(args.output_dir, TEST_DATASET % fold_index))
            selecting_file_list.append(fold_path)
            # When resuming, only rewrite the file if it is missing
            if not resume or not os.path.isfile(fold_path):
                resume = False
                with open(fold_path, u.open_mode("w")) as fold_handler:
                    fold_handler.write(field_expr)
        return selecting_file_list, resume
    except IOError:
        sys.exit("Could not create the necessary files.")
def best_first_search(datasets_file, api, args, command_obj,
                      staleness=None, penalty=None, objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction

       Opens the features log writer and reads the first dataset id from
       ``datasets_file``.
    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(
        os.path.join(args.output_dir, FEATURES_LOG))
    features_writer = UnicodeWriter(features_file).open_writer()
    features_header = FEATURES_HEADER
    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_PENALTY
    # retrieving the first dataset in the file
    try:
        with open(datasets_file, u.open_mode("r")) as datasets_handler:
            dataset_id = datasets_handler.readline().strip()
    except IOError as exc:
        # ``as`` syntax replaces the Python-2-only ``except X, exc``
        sys.exit("Could not read the generated datasets file: %s" %
                 str(exc))
def create_kfold_json(args, kfold_field=DEFAULT_KFOLD_FIELD,
                      objective_field=None, resume=False):
    """Create the files to generate a new field with a random integer from
       0 to k-1, and a filter file for each of these indexes.

       Returns (selecting_file_list, resume): the generated file paths and
       the resume flag, which is reset to False as soon as one file has to
       be (re)written.
    """
    output_dir = args.output_dir
    # number of folds: fall back to the default when not set on args
    k = args.k_folds if args.k_folds else DEFAULT_KFOLDS
    try:
        selecting_file_list = []
        for index in range(0, k):
            # NEW_FIELD is presumably a template for the fold filter
            # expression — TODO confirm its placeholders match this order
            new_field = NEW_FIELD % (index, k, kfold_field, index,
                                     objective_field)
            selecting_file = TEST_DATASET % index
            selecting_file = os.path.normpath(os.path.join(output_dir,
                                                           selecting_file))
            selecting_file_list.append(selecting_file)
            # When resuming, check if the file already exists
            if not resume or not os.path.isfile(selecting_file):
                resume = False
                with open(selecting_file, u.open_mode("w")) as test_dataset:
                    test_dataset.write(new_field)
        return selecting_file_list, resume
    except IOError:
        sys.exit("Could not create the necessary files.")
def i_check_sample_file(step, check_sample_file=None):
    """Step: check that the generated sample.csv matches the expectation.

    Compares the raw contents of ``sample.csv`` in the world directory
    with the expected file resolved through ``res_filename``.
    """
    if check_sample_file is None:
        assert False, "A check_sample_file is required"
    check_sample_file = res_filename(check_sample_file)
    try:
        sample_file = os.path.join(world.directory, "sample.csv")
        # distinct handler names: the original shadowed both path variables
        with open(check_sample_file, open_mode("r")) as check_handler:
            check_sample_contents = check_handler.read()
        with open(sample_file, open_mode("r")) as sample_handler:
            sample_file_contents = sample_handler.read()
        assert check_sample_contents == sample_file_contents, \
            ("File contents:\n%s\nExpected contents:\n%s" %
             (sample_file_contents, check_sample_contents))
    except Exception as exc:
        # any failure (including the assert above) surfaces as a step
        # failure with the exception text, mirroring the original behavior
        assert False, str(exc)
def multi_label_expansion(training_set, training_set_header, args,
                          output_path, labels=None, session_file=None,
                          input_flag=False):
    """Splitting the labels in a multi-label objective field to create
       a source with column per label

       :param training_set: the input source; if it has no basename a
           default file name is used (see the AttributeError branch)
       :param training_set_header: whether the source has a header row
       :param args: parsed options; reads objective_field, separators and
           multi-label options
       :param output_path: directory where the extended file is written
       :param labels: user-given labels (None: read them from the file)
       :param session_file: log file for progress messages
       :param input_flag: True when expanding a test set (no objective)
       :return: (output_file, multi_label_data) tuple
    """
    objective_field = args.objective_field
    input_reader = TrainReader(training_set, training_set_header,
                               objective_field, multi_label=True,
                               labels=labels,
                               label_separator=args.label_separator,
                               training_separator=args.training_separator,
                               multi_label_fields=args.multi_label_fields_list,
                               label_aggregates=args.label_aggregates_list,
                               objective=not input_flag)
    # read file to get all the different labels if no --labels flag is given
    # or use labels given in --labels and generate the new field names
    new_headers = input_reader.get_label_headers()
    try:
        file_name = os.path.basename(training_set)
    except AttributeError:
        # training_set has no path (e.g. a file-like object): pick a name
        file_name = "test_set.csv" if input_flag else "training_set.csv"
    output_file = "%s%sextended_%s" % (output_path, os.sep, file_name)
    message = u.dated("Transforming to extended source.\n")
    u.log_message(message, log_file=session_file, console=args.verbosity)
    with open(output_file, u.open_mode('w')) as output_handler:
        output = csv.writer(output_handler, lineterminator="\n")
        output.writerow(new_headers)
        # read to write new source file with column per label
        input_reader.reset()
        if training_set_header:
            # skip the original header row; new_headers was written above
            input_reader.get_next()
        while True:
            try:
                row = input_reader.get_next(extended=True)
                output.writerow(row)
            except StopIteration:
                break
    # training sources are zipped to minimize upload time and resources
    if not input_flag:
        output_file_zip = "%s%sextended_%s.zip" % (output_path,
                                                   os.sep, file_name)
        with ZipFile(output_file_zip, 'w', ZIP_DEFLATED) as output_zipped_file:
            output_zipped_file.write(output_file, file_name)
        output_file = output_file_zip
    # the objective may have moved while expanding: read it back by column
    objective_field = input_reader.headers[input_reader.objective_column]
    input_reader.close()
    return (output_file, input_reader.get_multi_label_data())
def best_first_search(datasets_file, api, args, common_options,
                      staleness=None, penalty=None, objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction

       Reads the first dataset id from ``datasets_file``, retrieves the
       dataset (locally stored copy first, API as fallback) and computes
       the excluded feature ids and the objective id.
    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(
        os.path.join(args.output_dir, FEATURES_LOG))
    with open(features_file, u.open_mode("w")) as features_handler:
        features_writer = csv.writer(features_handler, lineterminator="\n")
        features_writer.writerow([
            "step", "state", "score", "metric_value", "best_score"])
        features_handler.flush()
        if staleness is None:
            staleness = DEFAULT_STALENESS
        if penalty is None:
            penalty = DEFAULT_PENALTY
        # retrieving the first dataset in the file
        try:
            with open(datasets_file, u.open_mode("r")) as datasets_handler:
                dataset_id = datasets_handler.readline().strip()
        except IOError as exc:
            # ``as`` syntax replaces the Python-2-only ``except X, exc``
            sys.exit("Could not read the generated datasets file: %s" %
                     str(exc))
        try:
            # prefer the locally stored dataset to avoid an API round-trip
            stored_dataset = u.storage_file_name(args.output_dir, dataset_id)
            with open(stored_dataset, u.open_mode("r")) as dataset_handler:
                dataset = json.loads(dataset_handler.read())
        except IOError:
            dataset = api.check_resource(dataset_id,
                                         query_string=ALL_FIELDS_QS)
        # initial feature set
        fields = Fields(dataset)
        excluded_features = ([] if args.exclude_features is None else
                             args.exclude_features.split(
                                 args.args_separator))
        try:
            excluded_ids = [fields.field_id(feature)
                            for feature in excluded_features]
            objective_id = fields.field_id(objective_name)
        except ValueError as exc:
            sys.exit(exc)
def i_check_sample_json(step, check_sample_file=None):
    """Step: check that the generated stat_info.json matches expectations.

    Both files are parsed as JSON and compared structurally, so formatting
    and key order differences do not matter.
    """
    if check_sample_file is None:
        assert False, "A check_sample_file is required"
    check_sample_file = res_filename(check_sample_file)
    try:
        sample_file = os.path.join(world.directory, "stat_info.json")
        # distinct handler names: the original shadowed both path variables
        with open(check_sample_file, open_mode("r")) as check_handler:
            check_sample_json = json.loads(check_handler.read())
        with open(sample_file, open_mode("r")) as sample_handler:
            sample_file_json = json.loads(sample_handler.read())
        assert check_sample_json == sample_file_json, \
            ("File contents:\n%s\nExpected contents:\n%s" %
             (sample_file_json, check_sample_json))
    except Exception as exc:
        # any failure (including the assert above) surfaces as a step
        # failure with the exception text, mirroring the original behavior
        assert False, str(exc)
def retrieve_subcommands():
    """Retrieves the executed subcommands in inverse order

       Loads the subcommand log into the module-global ``subcommand_list``
       and reverses it so the most recent subcommand can be popped first.
    """
    global subcommand_list
    # close the log file deterministically instead of leaking the handle
    with open(subcommand_file, u.open_mode("r")) as subcommands_handler:
        subcommand_list = subcommands_handler.readlines()
    if not u.PYTHON3:
        # on Python 2 the lines are bytes: decode with the system encoding
        subcommand_list = [subcommand.decode(u.SYSTEM_ENCODING)
                           for subcommand in subcommand_list]
    subcommand_list.reverse()
def python3_contents(filename, prior_contents, alternative=""):
    """Check for a file that has alternative contents for Python3 and return
       its contents

       Looks for ``<name>_py3<alternative><ext>`` next to ``filename`` and
       returns its stripped contents, or ``prior_contents`` if the file
       does not exist.
    """
    directory = os.path.dirname(filename)
    basename = os.path.basename(filename)
    # os.path.splitext is robust to extra dots in the name, whereas
    # basename.split(".") raised ValueError for names like "a.b.csv";
    # splitext keeps the dot in the extension, so no "." is added below
    basename_name, basename_ext = os.path.splitext(basename)
    filename = os.path.join(directory, "%s_py3%s%s" % (
        basename_name, alternative, basename_ext))
    try:
        with open(filename, open_mode("r")) as file_handler:
            return file_handler.read().strip("\n")
    except IOError:
        # no Python 3 specific alternative: keep the original expectation
        return prior_contents
def create_candidates_evaluations(datasets_file, args, command_obj,
                                  resume=False,
                                  random_candidates=DEFAULT_MIN_CANDIDATES):
    """ Create random candidates ensembles evaluations

        Builds the subcommand for the given number of random candidates,
        dispatches it (honoring the --resume bookkeeping in the global
        ``subcommand_list``) and reads back the resulting evaluation.json.
    """
    global subcommand_list
    output_dir = os.path.normpath(
        u.check_dir(
            os.path.join(u"%s%s" % (args.output_dir, random_candidates),
                         "evaluation.json")))
    command = COMMANDS["random_candidates"] % (datasets_file,
                                               random_candidates,
                                               output_dir)
    command_args = command.split()
    # NOTE: a dead, commented-out common-options propagation block was
    # removed here; options are propagated via command_obj.propagate below
    command_args.append("--objective")
    command_args.append(args.objective_field)
    command_args = add_model_options(command_args, args)
    command_obj.propagate(
        command_args, exclude=["--dataset", "--datasets", "--dataset-file"])
    command = rebuild_command(command_args)
    if resume:
        # compare against the next logged subcommand to decide whether
        # resuming can continue or the command must actually be run
        next_command = subcommand_list.pop()
        if different_command(next_command, command):
            resume = False
            u.sys_log_message(command, log_file=subcommand_file)
            main_dispatcher(args=command_args)
        elif not subcommand_list:
            # last logged subcommand: resume the main dispatcher itself
            main_dispatcher(args=['main', '--resume'])
            resume = False
    else:
        u.sys_log_message(command, log_file=subcommand_file)
        main_dispatcher(args=command_args)
    evaluation_file = os.path.normpath(
        os.path.join(output_dir, "evaluation.json"))
    try:
        with open(evaluation_file, u.open_mode("r")) as evaluation_handler:
            evaluation = json.loads(evaluation_handler.read())
        return evaluation, resume
    except (ValueError, IOError):
        sys.exit("Failed to retrieve evaluation.")
def create_candidates_evaluations(datasets_file, args, command_obj,
                                  resume=False,
                                  random_candidates=DEFAULT_MIN_CANDIDATES):
    """Run one evaluation for a random-candidates ensemble.

       The subcommand is built from the COMMANDS template, dispatched
       (respecting --resume bookkeeping) and its evaluation.json is read
       back and returned along with the resume flag.
    """
    global subcommand_list
    evaluation_dir = os.path.normpath(u.check_dir(os.path.join(
        u"%s%s" % (args.output_dir, random_candidates), "evaluation.json")))
    command_args = (COMMANDS["random_candidates"]
                    % (datasets_file, random_candidates,
                       evaluation_dir)).split()
    # common options propagation is handled by command_obj.propagate below
    # (the explicit u.get_options_list call was commented out upstream):
    # common_options_list = u.get_options_list(
    #     args, command_obj.common_options, prioritary=command_args)
    # command_args.extend(common_options_list)
    command_args.extend(["--objective", args.objective_field])
    command_args = add_model_options(command_args, args)
    command_obj.propagate(command_args,
                          exclude=["--dataset", "--datasets",
                                   "--dataset-file"])
    command = rebuild_command(command_args)
    if not resume:
        u.sys_log_message(command, log_file=subcommand_file)
        main_dispatcher(args=command_args)
    else:
        next_command = subcommand_list.pop()
        if different_command(next_command, command):
            resume = False
            u.sys_log_message(command, log_file=subcommand_file)
            main_dispatcher(args=command_args)
        elif not subcommand_list:
            main_dispatcher(args=['main', '--resume'])
            resume = False
    evaluation_file = os.path.normpath(os.path.join(evaluation_dir,
                                                    "evaluation.json"))
    try:
        with open(evaluation_file, u.open_mode("r")) as evaluation_handler:
            return json.loads(evaluation_handler.read()), resume
    except (ValueError, IOError):
        sys.exit("Failed to retrieve evaluation.")
def create_node_th_evaluations(datasets_file, args, common_options,
                               resume=False,
                               node_threshold=DEFAULT_MIN_NODES):
    """Run one evaluation for a given node_threshold value.

       Builds and dispatches the node_threshold subcommand (respecting
       --resume bookkeeping) and reads back its evaluation.json.
    """
    global subcommand_list
    evaluation_dir = os.path.normpath(u.check_dir(os.path.join(
        u"%s%s" % (args.output_dir, node_threshold), "evaluation.json")))
    command_args = (COMMANDS["node_threshold"]
                    % (datasets_file, node_threshold, evaluation_dir)).split()
    extra_options = u.get_options_list(args, common_options,
                                       prioritary=command_args)
    command_args.extend(extra_options)
    command_args.extend(["--objective", args.objective_field])
    command_args = add_model_options(command_args, args)
    command = rebuild_command(command_args)
    if not resume:
        u.sys_log_message(command, log_file=subcommand_file)
        main_dispatcher(args=command_args)
    else:
        next_command = subcommand_list.pop()
        if different_command(next_command, command):
            resume = False
            u.sys_log_message(command, log_file=subcommand_file)
            main_dispatcher(args=command_args)
        elif not subcommand_list:
            main_dispatcher(args=['main', '--resume'])
            resume = False
    evaluation_file = os.path.normpath(os.path.join(evaluation_dir,
                                                    "evaluation.json"))
    try:
        with open(evaluation_file, u.open_mode("r")) as evaluation_handler:
            return json.loads(evaluation_handler.read()), resume
    except (ValueError, IOError):
        sys.exit("Failed to retrieve evaluation.")
def create_node_th_evaluations(datasets_file, args, common_options,
                               resume=False,
                               node_threshold=DEFAULT_MIN_NODES):
    """ Create node_threshold evaluations

        Builds the node_threshold subcommand, dispatches it (honoring the
        --resume bookkeeping kept in the global ``subcommand_list``) and
        returns the parsed evaluation.json together with the resume flag.
    """
    global subcommand_list
    output_dir = os.path.normpath(u.check_dir(
        os.path.join(u"%s%s" % (args.output_dir, node_threshold),
                     "evaluation.json")))
    command = COMMANDS["node_threshold"] % (
        datasets_file, node_threshold, output_dir)
    command_args = command.split()
    common_options_list = u.get_options_list(args, common_options,
                                             prioritary=command_args)
    command_args.extend(common_options_list)
    command_args.append("--objective")
    command_args.append(args.objective_field)
    command_args = add_model_options(command_args, args)
    command = rebuild_command(command_args)
    if resume:
        # compare against the next logged subcommand to decide whether
        # resuming can continue or the command must actually be run
        next_command = subcommand_list.pop()
        if different_command(next_command, command):
            resume = False
            u.sys_log_message(command, log_file=subcommand_file)
            main_dispatcher(args=command_args)
        elif not subcommand_list:
            # last logged subcommand: resume the main dispatcher itself
            main_dispatcher(args=['main', '--resume'])
            resume = False
    else:
        u.sys_log_message(command, log_file=subcommand_file)
        main_dispatcher(args=command_args)
    evaluation_file = os.path.normpath(os.path.join(output_dir,
                                                    "evaluation.json"))
    try:
        with open(evaluation_file, u.open_mode("r")) as evaluation_handler:
            evaluation = json.loads(evaluation_handler.read())
        return evaluation, resume
    except (ValueError, IOError):
        sys.exit("Failed to retrieve evaluation.")
def best_first_search(datasets_file, api, args, common_options,
                      staleness=None, penalty=None, objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction

       Opens the features log writer and reads the first dataset id from
       ``datasets_file``.
    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(
        os.path.join(args.output_dir, FEATURES_LOG))
    features_writer = UnicodeWriter(features_file).open_writer()
    features_header = FEATURES_HEADER
    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_PENALTY
    # retrieving the first dataset in the file
    try:
        with open(datasets_file, u.open_mode("r")) as datasets_handler:
            dataset_id = datasets_handler.readline().strip()
    except IOError as exc:
        # ``as`` syntax replaces the Python-2-only ``except X, exc``
        sys.exit("Could not read the generated datasets file: %s" %
                 str(exc))
def i_check_output_file(step, output=None, check_file=None):
    """Step: compare a generated output file with its expectation.

    Both sides are aggressively normalized (unicode marks, main()
    scaffolding, project ids, resource ids, ";;"/"created by" comment
    lines, api instantiation, per-line leading whitespace) before
    asserting equality with ``eq_``. On a persisting mismatch a
    "<check_file>_bck" copy of the normalized output is written to ease
    refreshing the expectations.
    """
    if check_file is None or output is None:
        assert False, "Both output and check_file are required"
    check_file = res_filename(check_file)
    output_file = os.path.join(world.directory, os.path.basename(output))
    with open(check_file, open_mode("r")) as check_file_handler:
        check_contents = check_file_handler.read().strip("\n")
    # NOTE: a dead, commented-out INDENT loop over the expected lines was
    # removed here; leading whitespace is normalized below instead
    # remove unicode mark for strings if Python3
    if PYTHON3:
        check_contents = check_contents.replace(
            " u'", " '").replace("{u'", "{'").replace(
            ' u"', ' "').replace('u\\\'', '\\\'')
        check_contents = re.sub(r'\n\s*', '\n', check_contents)
    # distinct handler name: the original shadowed the path variable
    with open(output_file, open_mode("r")) as output_handler:
        output_file_contents = output_handler.read()
    # strip comments at the beginning of the file
    output_file_contents = re.sub(r'#!.*def\smain\(\):\n', '',
                                  output_file_contents,
                                  flags=re.S).strip("\n")
    output_file_contents = output_file_contents.replace(
        '\nif __name__ == "__main__":\n main()', '')
    # strip internally added project id information
    prefix = "" if PYTHON3 else "u"
    p_str = r'%s\'project\':\s%s\'project/[a-f0-9]{24}\',?\s?' \
        % (prefix, prefix)
    output_file_contents = re.sub(p_str, '', output_file_contents,
                                  flags=re.S).strip("\n")
    # strip resource ids, ";;" and "created by" comment lines, which
    # change between runs, from both sides
    p_str = r'/[a-f0-9]{24}'
    output_file_contents = re.sub(p_str, '', output_file_contents,
                                  flags=re.S)
    check_contents = re.sub(p_str, '', check_contents, flags=re.S)
    p_str = r';;.*\n'
    output_file_contents = re.sub(p_str, '', output_file_contents,
                                  flags=re.S)
    check_contents = re.sub(p_str, '', check_contents, flags=re.S)
    p_str = r'created by.*\n'
    output_file_contents = re.sub(p_str, '', output_file_contents,
                                  flags=re.S)
    check_contents = re.sub(p_str, '', check_contents, flags=re.S)
    # normalize the api instantiation line (credentials differ per run)
    p_str = r' api = .*?\n'
    output_file_contents = re.sub(p_str, ' api = BigML()\n',
                                  output_file_contents,
                                  flags=re.S).strip("\n")
    # normalize leading whitespace on every line on both sides
    output_file_contents = re.sub(r'\n\s*', '\n', output_file_contents)
    check_contents = re.sub(r'\n\s*', '\n', check_contents)
    output_file_contents = output_file_contents.strip("\n").strip()
    check_contents = check_contents.strip("\n").strip()
    if check_contents != output_file_contents:
        if PYTHON3:
            # look for an alternative in PYTHON3
            check_contents = python3_contents( \
                check_file, check_contents)
        if check_contents != output_file_contents:
            check_contents = python3_contents(check_file, check_contents,
                                              alternative="_1")
            # keep a backup of the normalized output to ease refreshing
            # the expectation file
            with open("%s_bck" % check_file, "w") as bck_file:
                bck_file.write(output_file_contents)
    eq_(check_contents, output_file_contents)
def best_candidates_number(datasets_file, args, common_options,
                           penalty=None, resume=False):
    """Selecting the best number of random candidates to be used in the
       ensemble construction

       Sweeps random_candidates from args.min_candidates up to
       args.max_candidates, evaluating each value and logging the scores
       to CANDIDATES_LOG. Returns the candidates number with the best
       penalized score.
    """
    loop_counter = 0
    candidates_file = os.path.normpath(os.path.join(args.output_dir,
                                                    CANDIDATES_LOG))
    with open(candidates_file, u.open_mode("w")) as candidates_handler:
        candidates_writer = csv.writer(candidates_handler,
                                       lineterminator="\n")
        candidates_writer.writerow([
            "step", "random_candidates", "score", "metric_value",
            "best_score"])
        candidates_handler.flush()
        # evaluations for each candidates number go under a "random" subdir
        args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                        "random"))
        max_candidates = args.max_candidates + 1
        if args.nodes_step is None:
            args.nodes_step = DEFAULT_CANDIDATES_STEP
        random_candidates = args.min_candidates
        if penalty is None:
            penalty = DEFAULT_CANDIDATES_PENALTY
        best_score = - float('inf')
        metric = args.optimize
        score = best_score
        while random_candidates < max_candidates:
            loop_counter += 1
            (score, metric_value, metric,
             resume) = candidates_evaluate(datasets_file, args,
                                           random_candidates, common_options,
                                           penalty=penalty, resume=resume,
                                           metric=metric)
            candidates_writer.writerow([
                loop_counter, random_candidates, score, metric_value,
                best_score])
            candidates_handler.flush()
            # EPSILON tolerance avoids switching on float-noise ties
            if (score - EPSILON) > best_score:
                best_candidates = random_candidates
                best_score = score
                message = 'New best random candidates number is: %s\n' % \
                    best_candidates
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
                if metric in PERCENT_EVAL_METRICS:
                    message = '%s = %0.2f%% (score = %s)\n' % (
                        metric.capitalize(), metric_value * 100, score)
                else:
                    message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                          metric_value, score)
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            # NOTE(review): the increment uses DEFAULT_CANDIDATES_STEP even
            # though args.nodes_step was set above — looks like args.nodes_step
            # is never used here; confirm whether the step should be
            # configurable
            random_candidates += DEFAULT_CANDIDATES_STEP
        message = ('The best random candidates number is: %s \n'
                   % best_candidates)
        u.log_message(message, log_file=session_file, console=1)
        if metric in PERCENT_EVAL_METRICS:
            message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                           (best_score * 100)))
        else:
            message = ('%s = %f\n' % (metric.capitalize(), best_score))
        u.log_message(message, log_file=session_file, console=1)
        return best_candidates
os.path.join(args.output_dir, FEATURES_LOG)) features_writer = UnicodeWriter(features_file).open_writer() features_header = FEATURES_HEADER if staleness is None: staleness = DEFAULT_STALENESS if penalty is None: penalty = DEFAULT_PENALTY # retrieving the first dataset in the file try: with open(datasets_file, u.open_mode("r")) as datasets_handler: dataset_id = datasets_handler.readline().strip() except IOError, exc: sys.exit("Could not read the generated datasets file: %s" % str(exc)) try: stored_dataset = u.storage_file_name(args.output_dir, dataset_id) with open(stored_dataset, u.open_mode("r")) as dataset_handler: dataset = json.loads(dataset_handler.read()) except IOError: dataset = api.check_resource(dataset_id, query_string=ALL_FIELDS_QS) # initial feature set fields = Fields(dataset) excluded_features = ([] if args.exclude_features is None else args.exclude_features.split(args.args_separator)) try: excluded_ids = [ fields.field_id(feature) for feature in excluded_features ] objective_id = fields.field_id(objective_name) except ValueError, exc: sys.exit(exc) field_ids = [
def i_check_output_file(step, output=None, check_file=None):
    """Step: compare a generated output file with its expectation.

    Normalizes both sides (unicode marks, main() scaffolding, project
    ids, resource ids, ";;"/"created by" comment lines, api
    instantiation, per-line leading whitespace) and asserts equality with
    ``eq_``; on a persisting mismatch a "<check_file>_bck" copy of the
    normalized output is written to ease refreshing the expectations.
    """
    if check_file is None or output is None:
        assert False, "Both output and check_file are required"
    check_file = res_filename(check_file)
    output_file = os.path.join(world.directory, os.path.basename(output))
    with open(check_file, open_mode("r")) as check_file_handler:
        check_contents = check_file_handler.read().strip("\n")
    # NOTE: a dead, commented-out INDENT loop over the expected lines was
    # removed here; leading whitespace is normalized below instead
    # remove unicode mark for strings if Python3
    if PYTHON3:
        check_contents = check_contents.replace(
            " u'", " '").replace("{u'", "{'").replace(
            ' u"', ' "').replace('u\\\'', '\\\'')
        check_contents = re.sub(r'\n\s*', '\n', check_contents)
    # distinct handler name: the original shadowed the path variable
    with open(output_file, open_mode("r")) as output_handler:
        output_file_contents = output_handler.read()
    # strip comments at the beginning of the file
    output_file_contents = re.sub(r'#!.*def\smain\(\):\n', '',
                                  output_file_contents,
                                  flags=re.S).strip("\n")
    output_file_contents = output_file_contents.replace(
        '\nif __name__ == "__main__":\n main()', '')
    # strip internally added project id information
    prefix = "" if PYTHON3 else "u"
    p_str = r'%s\'project\':\s%s\'project/[a-f0-9]{24}\',?\s?' \
        % (prefix, prefix)
    output_file_contents = re.sub(p_str, '', output_file_contents,
                                  flags=re.S).strip("\n")
    # strip resource ids, ";;" and "created by" comment lines, which
    # change between runs, from both sides
    p_str = r'/[a-f0-9]{24}'
    output_file_contents = re.sub(p_str, '', output_file_contents,
                                  flags=re.S)
    check_contents = re.sub(p_str, '', check_contents, flags=re.S)
    p_str = r';;.*\n'
    output_file_contents = re.sub(p_str, '', output_file_contents,
                                  flags=re.S)
    check_contents = re.sub(p_str, '', check_contents, flags=re.S)
    p_str = r'created by.*\n'
    output_file_contents = re.sub(p_str, '', output_file_contents,
                                  flags=re.S)
    check_contents = re.sub(p_str, '', check_contents, flags=re.S)
    # normalize the api instantiation line (credentials differ per run)
    p_str = r' api = .*?\n'
    output_file_contents = re.sub(p_str, ' api = BigML()\n',
                                  output_file_contents,
                                  flags=re.S).strip("\n")
    # normalize leading whitespace on every line on both sides
    output_file_contents = re.sub(r'\n\s*', '\n', output_file_contents)
    check_contents = re.sub(r'\n\s*', '\n', check_contents)
    output_file_contents = output_file_contents.strip("\n").strip()
    check_contents = check_contents.strip("\n").strip()
    if check_contents != output_file_contents:
        if PYTHON3:
            # look for an alternative in PYTHON3
            check_contents = python3_contents( \
                check_file, check_contents)
        if check_contents != output_file_contents:
            check_contents = python3_contents(check_file, check_contents,
                                              alternative="_1")
            # keep a backup of the normalized output to ease refreshing
            # the expectation file
            with open("%s_bck" % check_file, "w") as bck_file:
                bck_file.write(output_file_contents)
    eq_(check_contents, output_file_contents)
features_writer = UnicodeWriter(features_file).open_writer() features_header = FEATURES_HEADER if staleness is None: staleness = DEFAULT_STALENESS if penalty is None: penalty = DEFAULT_PENALTY # retrieving the first dataset in the file try: with open(datasets_file, u.open_mode("r")) as datasets_handler: dataset_id = datasets_handler.readline().strip() except IOError, exc: sys.exit("Could not read the generated datasets file: %s" % str(exc)) try: stored_dataset = u.storage_file_name(args.output_dir, dataset_id) with open(stored_dataset, u.open_mode("r")) as dataset_handler: dataset = json.loads(dataset_handler.read()) except IOError: dataset = api.check_resource(dataset_id, query_string=ALL_FIELDS_QS) # initial feature set fields = Fields(dataset) excluded_features = ([] if args.exclude_features is None else args.exclude_features.split( args.args_separator)) try: excluded_ids = [fields.field_id(feature) for feature in excluded_features] objective_id = fields.field_id(objective_name) except ValueError, exc: sys.exit(exc)
def best_node_threshold(datasets_file, args, common_options,
                        staleness=None, penalty=None, resume=False):
    """Selecting the node_limit to be used in the model construction

       Sweeps node_threshold from args.min_nodes upward by args.nodes_step,
       evaluating each value and logging to NODES_LOG, and stops when the
       score has not improved for ``staleness`` consecutive steps or
       args.max_nodes is reached. Returns the best threshold found.
    """
    loop_counter = 0
    nodes_file = os.path.normpath(os.path.join(args.output_dir, NODES_LOG))
    with open(nodes_file, u.open_mode("w")) as nodes_handler:
        nodes_writer = csv.writer(nodes_handler, lineterminator="\n")
        nodes_writer.writerow([
            "step", "node_threshold", "score", "metric_value", "best_score"])
        nodes_handler.flush()
        # evaluations for each threshold go under a "node_th" subdir
        args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                        "node_th"))
        max_nodes = args.max_nodes + 1
        if args.min_nodes is None:
            args.min_nodes = DEFAULT_MIN_NODES
        if args.nodes_step is None:
            args.nodes_step = DEFAULT_NODES_STEP
        node_threshold = args.min_nodes
        if staleness is None:
            staleness = DEFAULT_STALENESS
        if penalty is None:
            penalty = DEFAULT_NODES_PENALTY
        best_score = - float('inf')
        best_unchanged_count = 0
        metric = args.optimize
        score = best_score
        while best_unchanged_count < staleness and node_threshold < max_nodes:
            loop_counter += 1
            (score, metric_value, metric,
             resume) = node_threshold_evaluate(datasets_file, args,
                                               node_threshold, common_options,
                                               penalty=penalty, resume=resume,
                                               metric=metric)
            nodes_writer.writerow([
                loop_counter, node_threshold, score, metric_value,
                best_score])
            nodes_handler.flush()
            # EPSILON tolerance avoids switching on float-noise ties
            if (score - EPSILON) > best_score:
                best_threshold = node_threshold
                best_score = score
                best_unchanged_count = 0
                message = 'New best node threshold: %s\n' % (best_threshold)
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
                if metric in PERCENT_EVAL_METRICS:
                    message = '%s = %0.2f%% (score = %s)\n' % (
                        metric.capitalize(), metric_value * 100, score)
                else:
                    message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                          metric_value, score)
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            else:
                # no improvement: count towards the staleness stop criterion
                best_unchanged_count += 1
            node_threshold += args.nodes_step
        message = ('The best node threshold is: %s \n' % best_threshold)
        u.log_message(message, log_file=session_file, console=1)
        if metric in PERCENT_EVAL_METRICS:
            message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                           (best_score * 100)))
        else:
            message = ('%s = %f\n' % (metric.capitalize(), best_score))
        u.log_message(message, log_file=session_file, console=1)
        return best_threshold