示例#1
0
 def test_exists_s3(self):
     """Check ``exists`` on two ``s3://`` paths in ``analytics-zoo-data``.

     One path is asserted to exist and the other is asserted not to.
     The assertions only run when both AWS credential environment
     variables are set; otherwise the method returns without checking
     anything.
     """
     credentials = (os.getenv("AWS_ACCESS_KEY_ID"),
                    os.getenv("AWS_SECRET_ACCESS_KEY"))
     if not all(credentials):
         # No credentials available: nothing can be verified against S3.
         return

     present_object = "s3://analytics-zoo-data/nyc_taxi.csv"
     missing_object = "s3://analytics-zoo-data/abc.csv"
     assert exists(present_object)
     assert not exists(missing_object)
示例#2
0
 def test_mkdirs_local_2(self):
     """Create single-level and nested local directories via ``makedirs``.

     Both targets are addressed with a ``file://`` prefix and live inside a
     fresh temporary directory. The temporary directory is removed in a
     ``finally`` clause so that a failing assertion (or an exception raised
     by ``makedirs``) does not leave it behind on disk.
     """
     temp = tempfile.mkdtemp()
     try:
         path = os.path.join(temp, "dir1")
         makedirs("file://" + path)
         assert exists("file://" + path)
         # "dir2" does not exist yet, so this checks that the intermediate
         # directory is created together with "dir3".
         path = os.path.join(temp, "dir2/dir3")
         makedirs("file://" + path)
         assert exists("file://" + path)
     finally:
         # mkdtemp() leaves deletion to the caller; always clean up.
         shutil.rmtree(temp)
示例#3
0
 def test_mkdirs_s3(self):
     """Create a directory key on S3 with ``makedirs`` and check it exists.

     The test body only runs when both AWS credential environment variables
     are set; otherwise the method returns without checking anything.
     The created key is deleted in a ``finally`` clause so that a failing
     assertion does not leave ``temp/abc/`` behind in the bucket.
     """
     access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
     secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
     if not (access_key_id and secret_access_key):
         return

     file_path = "s3://analytics-zoo-data/temp/abc/"
     try:
         makedirs(file_path)
         assert exists(file_path)
     finally:
         # Clean up with a direct boto3 client rather than the helpers
         # under test, whether or not the checks above succeeded.
         import boto3
         # NOTE(review): verify=False turns off SSL certificate
         # verification for this client -- confirm this is intentional.
         s3_client = boto3.Session(
             aws_access_key_id=access_key_id,
             aws_secret_access_key=secret_access_key).client('s3',
                                                             verify=False)
         s3_client.delete_object(Bucket='analytics-zoo-data',
                                 Key='temp/abc/')
示例#4
0
 def test_exists_local(self):
     """Check ``exists`` on two ``file://`` paths under ``self.resource_path``.

     ``orca/data/random.npy`` is asserted to exist and ``orca/data/abc.npy``
     is asserted not to.
     """
     present_url = "file://" + os.path.join(self.resource_path,
                                            "orca/data/random.npy")
     missing_url = "file://" + os.path.join(self.resource_path,
                                            "orca/data/abc.npy")
     assert exists(present_url)
     assert not exists(missing_url)
    idx_list = tbl.gen_string_idx(CAT_COLS, freq_limit=args.frequency_limit)
    cat_sizes = [idx.size() for idx in idx_list]

    cross_sizes = args.cross_sizes

    tbl_all_data = tbl.encode_string(CAT_COLS, idx_list)\
        .fillna(0, INT_COLS + CAT_COLS)\
        .normalize(INT_COLS)\
        .cross_columns(crossed_columns=[CAT_COLS[0:2], CAT_COLS[2:4]],
                       bucket_sizes=cross_sizes)
    tbl_all_data.compute()
    time_end = time()
    print("Train data loading and preprocessing time: ", time_end - time_start)

    # save meta
    if not exists(os.path.join(args.output_folder, "meta")):
        makedirs(os.path.join(args.output_folder, "meta"))
    cate_sizes_text = ""
    for i in cat_sizes:
        cate_sizes_text += str(i) + '\n'
    write_text(os.path.join(args.output_folder, "meta/categorical_sizes.txt"),
               cate_sizes_text)

    cross_sizes_text = ""
    for i in cross_sizes:
        cross_sizes_text += str(i) + '\n'
    write_text(os.path.join(args.output_folder, "meta/cross_sizes.txt"),
               cross_sizes_text)

    tbl_all_data.show(5)
    print("Finished")