Example #1

def assert_dims(mr, mc, xl, xd, T):
    # (the enclosing def was cut off in the source; the name and signature
    # here are assumed, mirroring assert_other below)
    # the dimension recorded in T must agree with the number of columns in M_c
    assert T["dimensions"][0] == len(mc["name_to_idx"])

def assert_other(mr, mc, xl, xd, T):
    # is the number of views in xd equal to their cached counts in xl?
    assert len(xl["column_partition"]["counts"]) == len(xd)
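
# a further check in the same spirit (hypothetical, not part of the original
# script): the per-view column counts cached in X_L should sum to the total
# number of column assignments
def assert_counts(mr, mc, xl, xd, T):
    assert sum(xl["column_partition"]["counts"]) == \
        len(xl["column_partition"]["assignments"])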

if __name__ == '__main__':
    import argparse
    import json
    parser = argparse.ArgumentParser(
        description='A script to validate a json file\'s compliance with the predictive-DB spec')
    parser.add_argument('filename', type=str)
    args = parser.parse_args()
    filename = args.filename
    #
    if filename.endswith('.pkl.gz'):
        parsed_sample = fu.unpickle(filename)
        parsed_sample['M_r'] = strify_M_r(parsed_sample['M_r'])
        parsed_sample['M_c'] = strify_M_c(parsed_sample['M_c'])
        parsed_sample['X_L'] = convert_X_L(parsed_sample['X_L'])
        parsed_sample['T'] = convert_T(parsed_sample['T'])
    else:
        with open(filename) as fh:
            # json.load copes with whitespace on its own; stripping it by hand
            # would also mangle spaces inside string values
            parsed_sample = json.load(fh)

    M_c = parsed_sample["M_c"]
    M_r = parsed_sample["M_r"]
    X_L = parsed_sample["X_L"]
    X_D = parsed_sample["X_D"]
    T = parsed_sample["T"]
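
    # --- usage sketch (not in the original): run the structural checks defined
    # above on the parsed sample; assert_dims and assert_counts are the
    # hypothetical helpers introduced earlier in this example ---
    assert_dims(M_r, M_c, X_L, X_D, T)
    assert_other(M_r, M_c, X_L, X_D, T)
    assert_counts(M_r, M_c, X_L, X_D, T)
    print('spec checks passed for %s' % filename)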

Example #2

# parse some arguments
import argparse

import numpy

# module aliases used below (import paths assumed from the CrossCat utils layout)
import crosscat.utils.file_utils as fu
import crosscat.utils.sample_utils as su

parser = argparse.ArgumentParser()
parser.add_argument('pkl_name', type=str)
parser.add_argument('--inf_seed', default=0, type=int)
args = parser.parse_args(['/usr/local/crosscat/cython_code/iter_90_pickled_state.pkl.gz'])
pkl_name = args.pkl_name
inf_seed = args.inf_seed

random_state = numpy.random.RandomState(inf_seed)
# FIXME: getting weird error on conversion to int: too large from inside pyx
def get_next_seed(max_val=32767): # sys.maxint):
    return random_state.randint(max_val)

# resume from saved name
save_dict = fu.unpickle(pkl_name)
M_c = save_dict['M_c']
X_L = save_dict['X_L']
X_D = save_dict['X_D']
T = save_dict['T']
num_cols = len(X_L['column_partition']['assignments'])
row_idx = 205
col_idx = 13
Q = [(row_idx, col_idx)]
imputed, confidence = su.impute_and_confidence(
    M_c, X_L, X_D, Y=None, Q=Q, n=400, get_next_seed=get_next_seed)

T_array = numpy.array(T)
which_view_idx = X_L['column_partition']['assignments'][col_idx]
X_D_i = numpy.array(X_D[which_view_idx])
which_cluster_idx = X_D_i[row_idx]
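
The quantities computed above are enough to sanity-check the imputation against simple baselines. A brief sketch follows; it assumes column col_idx is numeric and that missing cells in T are stored as NaN, neither of which is stated in this snippet.

# compare the imputed cell against the column mean and its cluster's mean
cluster_rows = numpy.nonzero(X_D_i == which_cluster_idx)[0]
column_mean = numpy.nanmean(T_array[:, col_idx])
cluster_mean = numpy.nanmean(T_array[cluster_rows, col_idx])
print('imputed=%s (confidence=%s)' % (imputed, confidence))
print('column mean=%s, cluster mean=%s' % (column_mean, cluster_mean))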

Example #3

    n_steps = args.n_steps
    chunk_size = args.chunk_size
    chunk_filename_prefix = args.chunk_filename_prefix
    chunk_dest_dir = args.chunk_dest_dir
    max_time = args.max_time
    table_filename = args.table_filename
    resume_filename = args.resume_filename
    pkl_filename = args.pkl_filename
    #
    command = args.command
    # assert command in set(gu.get_method_names(HadoopEngine))
    #
    cctypes_filename = args.cctypes_filename
    cctypes = None
    if cctypes_filename is not None:
      cctypes = fu.unpickle(cctypes_filename)

    hdfs_uri, jobtracker_uri = hu.get_uris(base_uri, hdfs_uri, jobtracker_uri)
    T, M_r, M_c = du.read_model_data_from_csv(table_filename, gen_seed=0,
                                              cctypes=cctypes)
    he = HadoopEngine(which_engine_binary=which_engine_binary,
                      which_hadoop_binary=which_hadoop_binary,
                      which_hadoop_jar=which_hadoop_jar,
                      hdfs_dir=hdfs_dir, hdfs_uri=hdfs_uri,
                      jobtracker_uri=jobtracker_uri)

    X_L_list, X_D_list = None, None
    if command == 'initialize':
        hadoop_output = he.initialize(M_c, M_r, T,
                                      initialization='from_the_prior',
                                      n_chains=n_chains)
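
        # --- sketch (not in the original): persist whatever initialize returned
        # so the run can be inspected later; this uses the standard library
        # rather than assuming anything about fu's write-side API ---
        if pkl_filename is not None:
            import gzip
            import pickle
            with gzip.open(pkl_filename, 'wb') as pkl_fh:
                pickle.dump(hadoop_output, pkl_fh)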

Example #6

import argparse

import numpy

# module aliases used below (import paths assumed from the CrossCat utils layout)
import crosscat.utils.file_utils as fu
import crosscat.utils.sample_utils as su

parser = argparse.ArgumentParser()
parser.add_argument('pkl_name', type=str)
parser.add_argument('--inf_seed', default=0, type=int)
parser.add_argument('--hostname', default='127.0.0.1', type=str)
args = parser.parse_args()
pkl_name = args.pkl_name
inf_seed = args.inf_seed
hostname = args.hostname


# FIXME: getting weird error on conversion to int: too large from inside pyx
def get_next_seed(max_val=32767):  # sys.maxint):
    return random_state.randint(max_val)


# resume from saved name
save_dict = fu.unpickle(pkl_name)
random_state = numpy.random.RandomState(inf_seed)
M_c = save_dict['M_c']
X_L = save_dict['X_L']
X_D = save_dict['X_D']

# FIXME: test constraints
# Y = [su.Bunch(index=2,value=2.3), su.Bunch(index=0,value=-4.)]
Y = None

# test simple_predictive_sample_observed
views_replicating_samples_params = su.determine_replicating_samples_params(
    X_L, X_D)
views_samples = []
for replicating_samples_params in views_replicating_samples_params:
    this_view_samples = []

# (a separate snippet begins here: the Hadoop streaming line processor; its
# fu, xu and hs imports were cut off in the source)
import sys

# dispatch table mapping each streaming command name to its helper function
# (the opening entries of this dict were cut off and are reconstructed)
method_lookup = dict(
    initialize=initialize_helper,
    analyze=analyze_helper,
    time_analyze=time_analyze_helper,
    convergence_analyze=convergence_analyze_helper,
    chunk_analyze=chunk_analyze_helper,
    mi_analyze=mi_analyze_helper,
    )


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--table_data_filename', type=str,
                        default=hs.default_table_data_filename)
    parser.add_argument('--command_dict_filename', type=str,
                        default=hs.default_command_dict_filename)
    args = parser.parse_args()
    table_data_filename = args.table_data_filename
    command_dict_filename = args.command_dict_filename
    
    
    table_data = fu.unpickle(table_data_filename)
    command_dict = fu.unpickle(command_dict_filename)
    command = command_dict['command']
    method = method_lookup[command]
    #
    from signal import signal, SIGPIPE, SIG_DFL
    signal(SIGPIPE, SIG_DFL)
    for line in sys.stdin:
        key, data_dict = xu.parse_hadoop_line(line)
        ret_dict = method(table_data, data_dict, command_dict)
        xu.write_hadoop_line(sys.stdout, key, ret_dict)


# (a separate snippet: the same per-line processing driven by Spark rather
# than Hadoop streaming)
from pyspark import SparkContext

def process_line(line, table_data):
    key, dict_in = xu.parse_hadoop_line(line)
    if dict_in is None:
        return None, None
    command = dict_in['command']
    method = method_lookup[command]
    ret_dict = method(table_data, dict_in)
    return key, ret_dict


if __name__ == '__main__':
    pass

# read the files
table_data = fu.unpickle('table_data.pkl.gz')
with open('hadoop_input') as fh:
    lines = [line for line in fh]

sc = SparkContext("local", "Simple job")
broadcasted_table_data = sc.broadcast(table_data)
parallelized = sc.parallelize(lines)
map_result = parallelized.map(
    lambda line: process_line(line, broadcasted_table_data.value)).collect()

print(map_result)
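
The collected results can be written back out in the same line format the streaming path uses. A short sketch, assuming xu.write_hadoop_line accepts any writable file object (above it is only ever given sys.stdout) and using an arbitrary output filename:

with open('hadoop_output', 'w') as out_fh:
    for key, ret_dict in map_result:
        # process_line returns (None, None) for unparseable lines; skip those
        if key is not None:
            xu.write_hadoop_line(out_fh, key, ret_dict)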
