示例#1
0
    logger.debug("Using output_dir: " + output_dir)
    logger.debug("Using ingestion_program_dir: " + ingestion_program_dir)
    logger.debug("Using code_dir: " + code_dir)

    # Our libraries
    path.append(ingestion_program_dir)
    path.append(code_dir)
    #IG: to allow submitting the starting kit as sample submission
    path.append(code_dir + '/sample_code_submission')
    import data_io
    from dataset import AutoSpeechDataset # THE class of AutoNLP datasets

    data_io.mkdir(output_dir)

    #### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(dataset_dir)
    #### Delete zip files and metadata file
    datanames = [x for x in datanames if x.endswith('.data')]

    if len(datanames) != 1:
        raise ValueError("{} datasets found in dataset_dir={}!\n"\
                        .format(len(datanames), dataset_dir) +
                        "Please put only ONE dataset under dataset_dir.")

    basename = datanames[0]
    D = AutoSpeechDataset(os.path.join(dataset_dir, basename))
    metadata = D.get_metadata()
    time_budget = metadata.get("time_budget", time_budget)
    logger.info("Time budget: {}".format(time_budget))

    write_start_file(output_dir, start_time=start, time_budget=time_budget,
示例#2
0
     input_dir = default_input_dir
     output_dir = default_output_dir
 else:
     input_dir = argv[1]
     output_dir = os.path.abspath(argv[2]);
     
 vprint( verbose,  "Using input_dir: " + input_dir)
 vprint( verbose,  "Using output_dir: " + output_dir)
     
 # Move old results and create a new output directory 
 if not(running_on_codalab) and save_previous_results:
     data_io.mvdir(output_dir, output_dir+'_'+the_date) 
 data_io.mkdir(output_dir) 
 
 #### INVENTORY DATA (and sort dataset names alphabetically)
 datanames = data_io.inventory_data(input_dir)
 # Overwrite the "natural" order
 
 #### DEBUG MODE: Show dataset list and STOP
 if debug_mode>=3:
     data_io.show_io(input_dir, output_dir)
     print('\n****** Sample code version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n')        	
     data_io.write_list(datanames)      
     datanames = [] # Do not proceed with learning and testing
     
 # ==================== @RESULT SUBMISSION (KEEP THIS) =====================
 # Always keep this code to enable result submission of pre-calculated results
 # deposited in the res/ subdirectory.
 if len(datanames)>0:
     vprint( verbose,  "************************************************************************")
     vprint( verbose,  "****** Attempting to copy files (from res/) for RESULT submission ******")
示例#3
0
from data_io import vprint           # print only in verbose mode
from data_manager import DataManager # load/save data and get info about them

# Make the local "libs" directory importable.
sys.path.append("libs")

# Fallback directories used when the script is started without arguments.
# NOTE(review): these are absolute, machine-specific Windows paths.
default_input_dir="C:\\Users\\vmkocheg\\Documents\\MLContest\\Phase2\\input"
default_output_dir="C:\\Users\\vmkocheg\\Documents\\MLContest\\Phase2\\output"
if len(argv)==1: # Use the default input and output directories if no arguments are provided
    input_dir = default_input_dir
    output_dir = default_output_dir
else:
    # First argument is the input directory, second the output directory
    # (the latter is converted to an absolute path).
    input_dir = argv[1]
    output_dir = os.path.abspath(argv[2]);

#### INVENTORY DATA (and sort dataset names alphabetically)
datanames = data_io.inventory_data(input_dir)
#### DEBUG MODE: Show dataset list and STOP
if debug_mode>=3:
    data_io.show_io(input_dir, output_dir)
    data_io.write_list(datanames)
    datanames = [] # Do not proceed with learning and testing


for basename in datanames: # Loop over datasets
    # Only the dataset named "robert" is processed; every other one is skipped.
    if basename not in ["robert"]:
        continue

    # Banner announcing the dataset (printed through vprint, gated by `verbose`).
    vprint( verbose,  "************************************************")
    vprint( verbose,  "******** Processing dataset " + basename.capitalize() + " ********")
    vprint( verbose,  "************************************************")
def ingestion_fn(dataset_dir,
                 code_dir,
                 time_budget,
                 time_budget_approx,
                 output_dir,
                 score_dir,
                 model_config_name=None,
                 model_config=None):
    """Run the train/predict ingestion loop for a single dataset.

    The function locates the single ``*.data`` entry under ``dataset_dir``,
    loads its ``train`` and ``test`` parts, instantiates the participant's
    ``Model`` and then alternates ``M.train`` / ``M.test`` calls, writing one
    prediction file per iteration into ``output_dir``.  When the loop ends an
    ``end.txt`` summary is written and the content of ``output_dir`` is
    copied to ``score_dir``.

    Args:
        dataset_dir: Directory containing exactly one ``*.data`` dataset.
        code_dir: Directory holding the participant's ``model.py``.
        time_budget: Budget in seconds, counted from the start of the core
            loop; the remaining part is passed to ``M.train`` / ``M.test``.
        time_budget_approx: Budget in seconds used to decide whether a
            prediction is still written and to report the time left.
        output_dir: Directory receiving start/end files and predictions.
        score_dir: Directory the content of ``output_dir`` is copied to.
        model_config_name: Forwarded unchanged to ``Model``.
        model_config: Forwarded unchanged to ``Model``.

    Raises:
        ValueError: If ``dataset_dir`` does not hold exactly one ``*.data``
            entry.

    Exceptions raised inside the train/predict loop are logged and recorded
    as ``ingestion_success: 0`` in ``end.txt``; they are not re-raised.
    Exceptions raised while loading data or creating the model propagate.
    """
    #### Check whether everything went well
    ingestion_success = True

    # Parse directories
    root_dir = _HERE(os.pardir)
    ingestion_program_dir = join(root_dir, "ingestion_program")

    if dataset_dir.endswith("run/input") and code_dir.endswith("run/program"):
        logger.debug(
            "Since dataset_dir ends with 'run/input' and code_dir "
            "ends with 'run/program', suppose running on " +
            "CodaLab platform. Modify dataset_dir to 'run/input_data' "
            "and code_dir to 'run/submission'. " +
            "Directory parsing should be more flexible in the code of " +
            "compute worker: we need explicit directories for " +
            "dataset_dir and code_dir.")
        dataset_dir = dataset_dir.replace("run/input", "run/input_data")
        code_dir = code_dir.replace("run/program", "run/submission")

    # Show directories for debugging
    logger.debug("sys.argv = " + str(sys.argv))
    logger.debug("Using dataset_dir: " + dataset_dir)
    logger.debug("Using output_dir: " + output_dir)
    logger.debug("Using ingestion_program_dir: " + ingestion_program_dir)
    logger.debug("Using code_dir: " + code_dir)

    # Our libraries: these imports only resolve after the search path has
    # been extended, which is why they are local to the function.
    path.append(ingestion_program_dir)
    path.append(code_dir)
    # IG: to allow submitting the starting kit as sample submission
    path.append(code_dir + "/sample_code_submission")
    import data_io
    from dataset import AutoDLDataset  # THE class of AutoDL datasets

    data_io.mkdir(output_dir)

    #### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(dataset_dir)
    #### Delete zip files and metadata file
    datanames = [x for x in datanames if x.endswith(".data")]

    if len(datanames) != 1:
        raise ValueError("{} datasets found in dataset_dir={}!\n".format(
            len(datanames), dataset_dir) +
                         "Please put only ONE dataset under dataset_dir.")

    basename = datanames[0]
    # Dataset name without the trailing ".data" (5 characters).
    dataset_name = basename[:-5]

    logger.info("************************************************")
    logger.info("******** Processing dataset " + dataset_name.capitalize() +
                " ********")
    logger.info("************************************************")
    logger.debug("Version: {}. Description: {}".format(VERSION, DESCRIPTION))

    ##### Begin creating training set and test set #####
    logger.info("Reading training set and test set...")
    D_train = AutoDLDataset(os.path.join(dataset_dir, basename, "train"))
    D_test = AutoDLDataset(os.path.join(dataset_dir, basename, "test"))
    ##### End creating training set and test set #####

    ## Get correct prediction shape
    num_examples_test = D_test.get_metadata().size()
    output_dim = D_test.get_metadata().get_output_size()
    correct_prediction_shape = (num_examples_test, output_dim)

    ##### Begin creating model #####
    # NOTE: the 20 min limit announced below is not enforced by this function.
    logger.info("Creating model...this process should not exceed 20min.")
    from model import Model  # in participants' model.py

    # The metadata of D_train and D_test only differ in sample_count
    M = Model(D_train.get_metadata(),
              model_config_name=model_config_name,
              model_config=model_config)
    ###### End creating model ######

    # Mark starting time of ingestion
    start = time.time()
    logger.info("=" * 5 + " Start core part of ingestion program. " +
                "Version: {} ".format(VERSION) + "=" * 5)

    write_start_file(output_dir,
                     start_time=start,
                     time_budget=time_budget,
                     task_name=basename.split(".")[0])

    try:
        # Check if the model has methods `train` and `test`.
        for attr in ["train", "test"]:
            if not hasattr(M, attr):
                # Name the missing method in the message.
                raise ModelApiError(
                    "Your model object doesn't have the method " +
                    "`{}`. Please implement it in model.py.".format(attr))

        # Check if model.py uses new done_training API instead of marking
        # stopping by returning None
        use_done_training_api = hasattr(M, "done_training")
        if not use_done_training_api:
            logger.warning(
                "Your model object doesn't have an attribute " +
                "`done_training`. But this is necessary for ingestion " +
                "program to know whether the model has done training " +
                "and to decide whether to proceed more training. " +
                "Please add this attribute to your model.")

        # Keeping track of how many predictions are made
        prediction_order_number = 0

        # Start the CORE PART: train/predict process
        while not (use_done_training_api and M.done_training):
            remaining_time_budget = start + time_budget - time.time()
            # Train the model
            logger.info("Begin training the model...")
            M.train(D_train.get_dataset(),
                    remaining_time_budget=remaining_time_budget)
            logger.info("Finished training the model.")
            # Make predictions using the trained model
            logger.info("Begin testing the model by making predictions " +
                        "on test set...")
            remaining_time_budget = start + time_budget - time.time()
            Y_pred = M.test(D_test.get_dataset(),
                            remaining_time_budget=remaining_time_budget)
            logger.info("Finished making predictions.")
            if Y_pred is None:  # Stop train/predict process if Y_pred is None
                logger.info("The method model.test returned `None`. " +
                            "Stop train/predict process.")
                break
            # Check if the prediction has good shape
            prediction_shape = tuple(Y_pred.shape)
            if prediction_shape != correct_prediction_shape:
                raise BadPredictionShapeError(
                    "Bad prediction shape! Expected {} but got {}.".format(
                        correct_prediction_shape, prediction_shape))
            # A prediction finished after the approximate budget is dropped.
            remaining_time_budget = start + time_budget_approx - time.time()
            if remaining_time_budget < 0:
                break
            # Write timestamp to 'start.txt'
            write_timestamp(output_dir,
                            predict_idx=prediction_order_number,
                            timestamp=time.time())
            # Prediction files: adult.predict_0, adult.predict_1, ...
            filename_test = dataset_name + ".predict_" + str(
                prediction_order_number)
            # Write predictions to output_dir
            data_io.write(os.path.join(output_dir, filename_test), Y_pred)
            prediction_order_number += 1
            logger.info(
                "[+] {0:d} predictions made, time spent so far {1:.2f} sec".
                format(prediction_order_number,
                       time.time() - start))
            remaining_time_budget = start + time_budget_approx - time.time()
            logger.info(
                "[+] Time left {0:.2f} sec".format(remaining_time_budget))

    except Exception as e:
        # Best effort: record the failure in end.txt instead of crashing.
        ingestion_success = False
        logger.info("Failed to run ingestion.")
        logger.error("Encountered exception:\n" + str(e), exc_info=True)

    # Finishing ingestion program
    end_time = time.time()
    overall_time_spent = end_time - start

    # Write overall_time_spent to a end.txt file
    end_filename = "end.txt"
    with open(os.path.join(output_dir, end_filename), "w") as f:
        f.write("ingestion_duration: " + str(overall_time_spent) + "\n")
        f.write("ingestion_success: " + str(int(ingestion_success)) + "\n")
        f.write("end_time: " + str(end_time) + "\n")
        logger.info("Wrote the file {} marking the end of ingestion.".format(
            end_filename))
        if ingestion_success:
            logger.info("[+] Done. Ingestion program successfully terminated.")
            logger.info("[+] Overall time spent %5.2f sec " %
                        overall_time_spent)
        else:
            logger.info(
                "[-] Done, but encountered some errors during ingestion.")
            logger.info("[-] Overall time spent %5.2f sec " %
                        overall_time_spent)

    # Copy all files in output_dir to score_dir
    # NOTE(review): the paths are interpolated into a shell command without
    # quoting, so directories containing spaces or shell metacharacters
    # will not be copied correctly.
    os.system("cp -R {} {}".format(os.path.join(output_dir, "*"), score_dir))
    logger.debug("Copied all ingestion output to scoring output directory.")

    logger.info("[Ingestion terminated]")
示例#5
0
def _main(args):
    """Run the ingestion program for the single dataset under ``args.dataset_dir``.

    Reads the dataset, creates the participant's ``Model`` under a 20 minute
    timer, then alternates ``M.train`` / ``M.test`` under a timer set to the
    time budget.  After each round the arg-max over axis 1 of the prediction
    is written to ``output_dir``.  Whatever happens in the train/predict
    phase, ``end.txt`` is written and the content of ``output_dir`` is copied
    to ``score_dir``.

    Args:
        args: Object exposing the attributes ``dataset_dir``, ``output_dir``,
            ``ingestion_program_dir``, ``code_dir``, ``score_dir`` and
            ``time_budget``.  A ``"time_budget"`` entry in the dataset
            metadata takes precedence over ``args.time_budget``.

    Raises:
        ValueError: If ``dataset_dir`` does not hold exactly one ``*.data``
            entry.
        ModelApiError: If the model could not be created, or lacks a
            ``train`` or ``test`` method.
        Exception: Any other error raised during the train/predict phase is
            logged and re-raised after ``end.txt`` has been written.  A
            ``TimeoutException`` in that phase is only logged.
    """
    # Mark starting time of ingestion
    start = time.time()
    logger.info("=" * 5 + " Start ingestion program. ")

    #### Check whether everything went well
    ingestion_success = True

    dataset_dir = args.dataset_dir
    output_dir = args.output_dir
    ingestion_program_dir = args.ingestion_program_dir
    code_dir = args.code_dir
    score_dir = args.score_dir
    time_budget = args.time_budget

    if dataset_dir.endswith('run/input') and\
        code_dir.endswith('run/program'):
        logger.debug(
            "Since dataset_dir ends with 'run/input' and code_dir "
            "ends with 'run/program', suppose running on " +
            "CodaLab platform. Modify dataset_dir to 'run/input_data' "
            "and code_dir to 'run/submission'. " +
            "Directory parsing should be more flexible in the code of " +
            "compute worker: we need explicit directories for " +
            "dataset_dir and code_dir.")
        dataset_dir = dataset_dir.replace('run/input', 'run/input_data')
        code_dir = code_dir.replace('run/program', 'run/submission')

    # Show directories for debugging
    logger.debug("sys.argv = " + str(sys.argv))
    logger.debug("Using dataset_dir: " + dataset_dir)
    logger.debug("Using output_dir: " + output_dir)
    logger.debug("Using ingestion_program_dir: " + ingestion_program_dir)
    logger.debug("Using code_dir: " + code_dir)

    # Our libraries: these imports only resolve after the search path has
    # been extended, which is why they are local to the function.
    path.append(ingestion_program_dir)
    path.append(code_dir)
    #IG: to allow submitting the starting kit as sample submission
    path.append(code_dir + '/sample_code_submission')
    import data_io
    from dataset import AutoSpeechDataset  # THE class of AutoSpeech datasets

    #### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(dataset_dir)
    #### Delete zip files and metadata file
    datanames = [x for x in datanames if x.endswith('.data')]

    if len(datanames) != 1:
        raise ValueError("{} datasets found in dataset_dir={}!\n"\
                        .format(len(datanames), dataset_dir) +
                        "Please put only ONE dataset under dataset_dir.")

    basename = datanames[0]
    D = AutoSpeechDataset(os.path.join(dataset_dir, basename))
    metadata = D.get_metadata()
    # The dataset's own budget, when present, overrides the one from `args`.
    time_budget = metadata.get("time_budget", time_budget)
    logger.info("Time budget: {}".format(time_budget))

    write_start_file(output_dir,
                     start_time=start,
                     time_budget=time_budget,
                     task_name=basename.split('.')[0])

    logger.info("************************************************")
    logger.info("******** Processing dataset " + basename[:-5].capitalize() +
                " ********")
    logger.info("************************************************")

    ##### Begin creating training set and test set #####
    logger.info("Reading training set and test set...")
    D.read_dataset()
    ##### End creating training set and test set #####

    ## Get correct prediction shape
    num_examples_test = D.get_test_num()
    output_dim = D.get_class_num()
    correct_prediction_shape = (num_examples_test, output_dim)

    # Bound up front so that the train/predict phase (which runs in the
    # `finally` clause below) can tell that initialization did not produce a
    # model instead of failing with an UnboundLocalError.
    M = None
    try:
        # ========= Creating a model
        timer = Timer()
        timer.set(
            20 * 60
        )  # 20 min for participants to initializing and install other packages
        with timer.time_limit("Importing model"):
            from model import Model  # in participants' model.py

        ##### Begin creating model #####
        logger.info("Creating model...")
        with timer.time_limit('Initialization'):
            M = Model(metadata)
        ###### End creating model ######
    except TimeoutException:
        logger.info(
            "[-] Initialization phase exceeded time budget. Move to train/predict phase"
        )
    except Exception as e:
        logger.info("Failed to initializing model.")
        logger.error("Encountered exception:\n" + str(e), exc_info=True)
        raise
    finally:
        # The train/predict phase lives in `finally` so that end.txt is
        # written and outputs are copied even when initialization failed.
        try:
            timer = Timer()
            timer.set(time_budget)
            if M is None:
                raise ModelApiError(
                    "The model could not be created during initialization, "
                    "so the train/predict process cannot run.")
            # Check if the model has methods `train` and `test`.
            for attr in ['train', 'test']:
                if not hasattr(M, attr):
                    # Name the missing method in the message.
                    raise ModelApiError(
                        "Your model object doesn't have the method " +
                        "`{}`. Please implement it in model.py.".format(attr))

            # Check if model.py uses new done_training API instead of marking
            # stopping by returning None
            use_done_training_api = hasattr(M, 'done_training')
            if not use_done_training_api:
                logger.warning(
                    "Your model object doesn't have an attribute " +
                    "`done_training`. But this is necessary for ingestion " +
                    "program to know whether the model has done training " +
                    "and to decide whether to proceed more training. " +
                    "Please add this attribute to your model.")

            # Keeping track of how many predictions are made
            prediction_order_number = 0

            # Start the CORE PART: train/predict process
            while (not (use_done_training_api and M.done_training)):

                # Train the model
                logger.info("Begin training the model...")
                remaining_time_budget = timer.remain
                with timer.time_limit('training'):
                    M.train(D.get_train(), remaining_time_budget=timer.remain)
                logger.info("Finished training the model.")

                # Make predictions using the trained model
                logger.info("Begin testing the model by making predictions " +
                            "on test set...")
                remaining_time_budget = timer.remain
                with timer.time_limit('predicting'):
                    Y_pred = M.test(
                        D.get_test(),
                        remaining_time_budget=remaining_time_budget)
                logger.info("Finished making predictions.")

                if Y_pred is None:  # Stop train/predict process if Y_pred is None
                    logger.info("The method model.test returned `None`. " +
                                "Stop train/predict process.")
                    break
                else:  # Check if the prediction has good shape
                    prediction_shape = tuple(Y_pred.shape)
                    if prediction_shape != correct_prediction_shape:
                        raise BadPredictionShapeError(
                            "Bad prediction shape! Expected {} but got {}."\
                            .format(correct_prediction_shape, prediction_shape)
                        )
                # Write timestamp to 'start.txt'
                write_timestamp(output_dir,
                                predict_idx=prediction_order_number,
                                timestamp=timer.exec)
                # Prediction files: adult.predict_0, adult.predict_1, ...
                filename_test = basename[:-5] + '.predict_' +\
                    str(prediction_order_number)
                # Write predictions to output_dir: only the arg-max over
                # axis 1 is stored, not the full prediction array.
                tmp_pred = np.argmax(Y_pred, axis=1)
                data_io.write(os.path.join(output_dir, filename_test),
                              tmp_pred)
                prediction_order_number += 1
                logger.info("[+] {0:d} predictions made, time spent so far {1:.2f} sec"\
                            .format(prediction_order_number, time.time() - start))
                logger.info("[+] Time left {0:.2f} sec".format(timer.remain))
        except TimeoutException:
            # Running out of time is not a failure: ingestion_success stays True.
            logger.info(
                "[-] Ingestion program exceeded time budget. Predictions "
                "made so far will be used for evaluation.")
        except Exception as e:
            ingestion_success = False
            logger.info("Failed to run ingestion.")
            logger.error("Encountered exception:\n" + str(e), exc_info=True)
            raise
        finally:
            # Finishing ingestion program
            end_time = time.time()
            overall_time_spent = end_time - start

            # Write overall_time_spent to a end.txt file
            end_filename = 'end.txt'
            with open(os.path.join(output_dir, end_filename), 'w') as f:
                f.write('ingestion_duration: ' + str(overall_time_spent) +
                        '\n')
                f.write('ingestion_success: ' + str(int(ingestion_success)) +
                        '\n')
                f.write('end_time: ' + str(end_time) + '\n')
                logger.info("Wrote the file {} marking the end of ingestion."\
                            .format(end_filename))
                if ingestion_success:
                    logger.info(
                        "[+] Done. Ingestion program successfully terminated.")
                    logger.info("[+] Overall time spent %5.2f sec " %
                                overall_time_spent)
                else:
                    logger.info(
                        "[-] Done, but encountered some errors during ingestion."
                    )
                    logger.info("[-] Overall time spent %5.2f sec " %
                                overall_time_spent)

            # Copy all files in output_dir to score_dir
            # NOTE(review): the paths are interpolated into a shell command
            # without quoting, so directories containing spaces or shell
            # metacharacters will not be copied correctly.
            os.system("cp -R {} {}".format(os.path.join(output_dir, '*'),
                                           score_dir))
            logger.debug(
                "Copied all ingestion output to scoring output directory.")

            logger.info("[Ingestion terminated]")
示例#6
0
# When a "program" sub-directory exists under run_dir, use it as run_dir
# and flag the run as a Codalab run.
codalab_run_dir = os.path.join(run_dir, "program")
if os.path.isdir(codalab_run_dir): 
    run_dir=codalab_run_dir
    running_on_codalab = True
    # Function-call form of print: valid on both Python 2 and Python 3.
    print("Running on Codalab!")
lib_dir = os.path.join(run_dir, "lib")
res_dir = os.path.join(run_dir, "res")

# Our libraries (importable only after run_dir and lib_dir are on the path)
path.append (run_dir)
path.append (lib_dir)
import data_io                       # general purpose input/output functions
from data_io import vprint           # print only in verbose mode
from data_manager import DataManager # load/save data and get info about them

# Script-level settings; dataset names are inventoried from default_input_dir.
datanames = data_io.inventory_data(default_input_dir)
verbose = True
debug_mode = 0
zipme = True
max_time = 90
max_cycle = 1
execution_success = True
# Minute-resolution timestamp used to build the submission file name.
the_date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M")
submission_filename = '../automl_sample_submission_' + the_date


overall_start = time.time()
if len(datanames)>0:
    vprint( verbose,  "************************************************************************")
    vprint( verbose,  "****** Attempting to copy files (from res/) for RESULT submission ******")
    vprint( verbose,  "************************************************************************")
示例#7
0
# When a "program" sub-directory exists under run_dir, use it as run_dir
# and flag the run as a Codalab run.
codalab_run_dir = os.path.join(run_dir, "program")
if os.path.isdir(codalab_run_dir):
    run_dir = codalab_run_dir
    running_on_codalab = True
    # Function-call form of print: valid on both Python 2 and Python 3.
    print("Running on Codalab!")
lib_dir = os.path.join(run_dir, "lib")
res_dir = os.path.join(run_dir, "res")

# Our libraries (importable only after run_dir and lib_dir are on the path)
path.append(run_dir)
path.append(lib_dir)
import data_io  # general purpose input/output functions
from data_io import vprint  # print only in verbose mode
from data_manager import DataManager  # load/save data and get info about them

# Script-level settings; dataset names are inventoried from default_input_dir.
datanames = data_io.inventory_data(default_input_dir)
verbose = True
debug_mode = 0
zipme = True
max_time = 90
max_cycle = 1
execution_success = True
# Minute-resolution timestamp used to build the submission file name.
the_date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M")
submission_filename = '../automl_sample_submission_' + the_date

overall_start = time.time()
if len(datanames) > 0:
    vprint(
        verbose,
        "************************************************************************"
    )