def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output, classifier_name, classifier_extra, local_labels, classifier, **kw):
    """
    TODO Finish docstring
    Args:
        hdfs_output: Path to hdfs temporary output or None if execution should be performed locally using hadoopy.launch_local.
    """
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    if hdfs_output is None:
        j = hadoopy.launch_local(hdfs_input_pos, None, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in j['output']), [])
        j = hadoopy.launch_local(hdfs_input_neg, None, _lf('collect_keys.py'))
        neg_keys = sum((x[1] for x in j['output']), [])
    else:
        hdfs_output_pos = hdfs_output + '/pos'
        hdfs_output_neg = hdfs_output + '/neg'
        picarus._launch_frozen(hdfs_input_pos, hdfs_output_pos, _lf('collect_keys.py'))
        picarus._launch_frozen(hdfs_input_neg, hdfs_output_neg, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
        neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])

    labels[classifier_name] = {'labels': {'1': pos_keys, '-1': neg_keys},
                               'classifier': classifier,
                               'classifier_extra': classifier_extra}
    file_parse.dump(labels, local_labels)
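
For illustration, a hypothetical call to the function above (every path, name, and value below is a placeholder, not something taken from the original project):

run_classifier_labels('hdfs://namenode/user/me/features_pos',   # placeholder HDFS input of positive examples
                      'hdfs://namenode/user/me/features_neg',   # placeholder HDFS input of negative examples
                      None,                                     # None => collect keys locally via hadoopy.launch_local
                      'example_classifier',                     # name the label set is stored under
                      {},                                       # classifier_extra
                      'labels.js',                              # placeholder local labels file handled by file_parse
                      'linear_svm')                             # placeholder classifier value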
Example #2
def test_local(self):
    out_path = '%s/local_test/%f' % (self.data_path, time.time())
    hadoopy.put('wc-input-alice.tb', out_path + '/wc-input-alice.tb')
    hadoopy.launch_local(out_path + '/wc-input-alice.tb', out_path + '/out', 'local.py', max_input=1000,
                         cmdenvs=['TEST_ENV=10'],
                         files=['wc-input-alice.tb'])  # Just bring this along to test the files
    hadoopy.launch_local(((1000 * 'a', 10000000 * 'b') for x in range(100)), None, 'local.py', max_input=10000,
                         cmdenvs=['TEST_ENV=10'],
                         files=['wc-input-alice.tb'])
Example #3
    def test_name(self):
        kv_sizes = [(1, 1), (1024 ** 2, 1024 ** 2), (50 * 1024 ** 2, 50 * 1024 ** 2)]

        a = hadoopy.launch_local((x for x in kv_sizes for y in range(20)), None, 'size_job.py')
        kvs = list(((len(x), len(y)) for x, y in a['output']))
        print(len(kvs))

        a = hadoopy.launch_local(kv_sizes, None, 'size_job.py')
        kvs = list(a['output'])
        print(len(kvs))

        a = hadoopy.launch_local(kvs, None, 'null_job.py')
        kvs = list(a['output'])
        print(len(kvs))
Example #4
def launch(in_name, out_name, script_path, **kw):
    # If local kv cache doesn't exist, then copy the correct number of values there
    # (`files`, `max_input`, and `_local_iter` come from the enclosing scope in the
    # original project; this wrapper merges the extra files into the caller's kwargs.)
    try:
        kw['files'] = list(kw['files']) + list(files)
    except KeyError:
        kw['files'] = files
    return hadoopy.launch_local(_local_iter(in_name, max_input), None, script_path, **kw)['output']
Example #5
def test_local(self):
    out_path = "%s/local_test/%f" % (self.data_path, time.time())
    hadoopy.put("wc-input-alice.tb", out_path + "/wc-input-alice.tb")
    hadoopy.launch_local(
        out_path + "/wc-input-alice.tb",
        out_path + "/out",
        "local.py",
        max_input=1000,
        cmdenvs=["TEST_ENV=10"],
        files=["wc-input-alice.tb"],
    )  # Just bring this along to test the files
    hadoopy.launch_local(
        ((1000 * "a", 10000000 * "b") for x in range(100)),
        None,
        "local.py",
        max_input=10000,
        cmdenvs=["TEST_ENV=10"],
        files=["wc-input-alice.tb"],
    )
Example #6
def test_local(self):
    out_path = '%s/local_test/%f' % (self.data_path, time.time())
    hadoopy.mkdir(out_path)
    hadoopy.put('wc-input-alice.tb', out_path + '/wc-input-alice.tb')
    hadoopy.launch_local(out_path + '/wc-input-alice.tb',
                         out_path + '/out_list_cmdenvs',
                         'local.py',
                         max_input=1000,
                         cmdenvs=['TEST_ENV=10'],  # cmdenvs as a list of 'KEY=value' strings
                         files=['wc-input-alice.tb'])  # Just bring this along to test the files
    hadoopy.launch_local(out_path + '/wc-input-alice.tb',
                         out_path + '/out',
                         'local.py',
                         max_input=1000,
                         cmdenvs={'TEST_ENV': '10'},  # cmdenvs as a dict works as well
                         files=['wc-input-alice.tb'])  # Just bring this along to test the files
    hadoopy.launch_local(
        ((1000 * 'a', 10000000 * 'b') for x in range(100)),  # iterator input instead of an HDFS path
        None,
        'local.py',
        max_input=10000,
        cmdenvs=['TEST_ENV=10'],
        files=['wc-input-alice.tb'])
Example #7
def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output,
                          classifier_name, classifier_extra, local_labels,
                          classifier, **kw):
    """
    TODO Finish docstring
    Args:
        hdfs_output: Path to hdfs temporary output or None if execution should be performed locally using hadoopy.launch_local.
    """
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    if hdfs_output is None:
        j = hadoopy.launch_local(hdfs_input_pos, None, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in j['output']), [])
        j = hadoopy.launch_local(hdfs_input_neg, None, _lf('collect_keys.py'))
        neg_keys = sum((x[1] for x in j['output']), [])
    else:
        hdfs_output_pos = hdfs_output + '/pos'
        hdfs_output_neg = hdfs_output + '/neg'
        picarus._launch_frozen(hdfs_input_pos, hdfs_output_pos,
                               _lf('collect_keys.py'))
        picarus._launch_frozen(hdfs_input_neg, hdfs_output_neg,
                               _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
        neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])

    labels[classifier_name] = {
        'labels': {
            '1': pos_keys,
            '-1': neg_keys
        },
        'classifier': classifier,
        'classifier_extra': classifier_extra
    }
    file_parse.dump(labels, local_labels)
Example #8
def launch_zmq(flow_controller, script_path, cleanup_func=None, outputs=None, **kw):

    def _kvs():
        while True:
            yield flow_controller.recv()

    kvs = hadoopy.launch_local(_kvs(), None, script_path, poll=flow_controller.poll, **kw)['output']
    if outputs is None:
        for k, v in kvs:
            # k is the node number, v is a k/v tuple
            flow_controller.send(k, v)
    else:
        for kv in kvs:
            for s in outputs:
                flow_controller.send(s, kv)
Example #9
File: driver.py  Project: bjzu/hadoopy
#    and then run.  This is the most common way to use Hadoopy as it avoids having to install
#    anything on the cluster, including Python, dependencies, and your code.
# 3. launch_local: This is intended for unit tests, debugging, education, and very small jobs.
#    It emulates the behavior of launch/launch_frozen as closely as possible but on the local
#    machine.  Read its docstring for compatibility and details.
#
# The first argument is the input; for launch_local it can be an HDFS path or an iterator
# of (key, value) pairs.  The second argument is the output; it can be an HDFS path or None
# if the output shouldn't be written to HDFS (as in this case).  The third argument is the script
# path.  The return value of launch_local is a dictionary (see its docstring), and we want 'output',
# which is an iterator of the output (key, value) pairs.
#
# By default Hadoopy talks to Hadoop Streaming using a simple serialization format called TypedBytes.
# The alternative is line-oriented records like key0<tab>value0<newline>key1<tab>value1<newline>, which
# are 1) less efficient and 2) more cumbersome to work with, since everything has to be a string and
# that string can't contain <tab> or <newline> characters.
#
# Note that the types of the (key, value) pairs can be any serializable Python type when using the
# TypedBytes interface (recommended and the default); they are presented to your program in the same
# form they are provided.  All base types are serialized very efficiently, with a fallback to Pickle
# for types not supported by TypedBytes.  If this is confusing, just know that you can input/output
# anything you can pickle and Hadoopy handles it efficiently.
output_kvs = hadoopy.launch_local(get_lines(input_path), None, 'wc.py')['output']

# Analyze the output.  The output is an iterator of (word, count) where word is a string and count
# is an integer.
word_counts = dict(output_kvs)
for probe_word, expected_count in [('the', 1664), ('Alice', 221), ('tree', 3)]:
    print('word_counts[%s] = %d' % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]
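
The comments above point out that, with the TypedBytes interface, keys and values can be any picklable Python objects. A minimal sketch of that point, assuming a hypothetical job script identity.py that simply re-emits whatever it reads:

import hadoopy

# The key is a tuple and the value is a dict rather than strings; with TypedBytes they
# reach the mapper (and come back in 'output') in the same form they were provided.
kvs = [((1, 'a'), {'count': 3}), ((2, 'b'), {'count': 7})]
out = list(hadoopy.launch_local(kvs, None, 'identity.py')['output'])  # 'identity.py' is hypothetical
print(out)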
Example #10
    return s

##############################################################################

if hadoopy.exists(temp_vector_path):
    hadoopy.rmr("-skipTrash %s" % temp_vector_path)
copy(eigen_vector_tb_path, temp_vector_path)

diff = 1.0  # start above the threshold so the loop runs at least once
while diff > 0.01:

    eigen_vector_before = load_eigen_vector(temp_vector_path)

    if hadoopy.exists(temp_vector_path):
        hadoopy.rmr("-skipTrash %s" % temp_vector_path)

    hadoopy.launch_local(data_tb_path, temp_vector_path, 'PageRank.py')

    eigen_vector_after = load_eigen_vector(temp_vector_path)

    if hadoopy.exists(eigen_vector_tb_path):
        hadoopy.rmr("-skipTrash %s" % eigen_vector_tb_path)

    copy(temp_vector_path, eigen_vector_tb_path)

    diff = calcul_delta(eigen_vector_before, eigen_vector_after)

    print(diff)
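
The loop above relies on helpers defined earlier in the original script and not shown here (copy, load_eigen_vector, calcul_delta). A rough sketch of what two of them might look like, under the assumption that the rank vector is stored as (node, rank) pairs in TypedBytes files:

import hadoopy

def load_eigen_vector(tb_path):
    # Read the (node, rank) pairs from HDFS into a dict.
    return dict(hadoopy.readtb(tb_path))

def calcul_delta(before, after):
    # L1 distance between two rank vectors; nodes missing from one side count as 0.
    keys = set(before) | set(after)
    return sum(abs(before.get(k, 0.0) - after.get(k, 0.0)) for k in keys)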
    

Example #11
File: driver.py  Project: Jeffliu/hadoopy
#    and then run.  This is the most common way to use Hadoopy as it avoids having to install
#    anything on the cluster, including Python, dependencies, and your code.
# 3. launch_local: This is intended for unit tests, debugging, education, and very small jobs.
#    It emulates the behavior of launch/launch_frozen as closely as possible but on the local
#    machine.  Read its docstring for compatibility and details.
#
# The first argument is the input; for launch_local it can be an HDFS path or an iterator
# of (key, value) pairs.  The second argument is the output; it can be an HDFS path or None
# if the output shouldn't be written to HDFS (as in this case).  The third argument is the script
# path.  The return value of launch_local is a dictionary (see its docstring), and we want 'output',
# which is an iterator of the output (key, value) pairs.
#
# By default Hadoopy talks to Hadoop Streaming using a simple serialization format called TypedBytes.
# The alternative is line-oriented records like key0<tab>value0<newline>key1<tab>value1<newline>, which
# are 1) less efficient and 2) more cumbersome to work with, since everything has to be a string and
# that string can't contain <tab> or <newline> characters.
#
# Note that the types of the (key, value) pairs can be any serializable Python type when using the
# TypedBytes interface (recommended and the default); they are presented to your program in the same
# form they are provided.  All base types are serialized very efficiently, with a fallback to Pickle
# for types not supported by TypedBytes.  If this is confusing, just know that you can input/output
# anything you can pickle and Hadoopy handles it efficiently.
output_kvs = hadoopy.launch_local(get_lines(input_path), None, wc_py)['output']

# Analyze the output.  The output is an iterator of (word, count) where word is a string and count
# is an integer.
word_counts = dict(output_kvs)
for probe_word, expected_count in [('the', 1664), ('Alice', 221), ('tree', 3)]:
    print('word_counts[%s] = %d' % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]
Example #12
#    and then run.  This is the most common way to use Hadoopy as it avoids having to install
#    anything on the cluster, including Python, dependencies, and your code.
# 3. launch_local: This is intended for unit tests, debugging, education, and very small jobs.
#    It emulates the behavior of launch/launch_frozen as closely as possible but on the local
#    machine.  Read its docstring for compatibility and details.
#
# The first argument is the input; for launch_local it can be an HDFS path or an iterator
# of (key, value) pairs.  The second argument is the output; it can be an HDFS path or None
# if the output shouldn't be written to HDFS (as in this case).  The third argument is the script
# path.  The return value of launch_local is a dictionary (see its docstring), and we want 'output',
# which is an iterator of the output (key, value) pairs.
#
# By default Hadoopy talks to Hadoop Streaming using a simple serialization format called TypedBytes.
# The alternative is line-oriented records like key0<tab>value0<newline>key1<tab>value1<newline>, which
# are 1) less efficient and 2) more cumbersome to work with, since everything has to be a string and
# that string can't contain <tab> or <newline> characters.
#
# Note that the types of the (key, value) pairs can be any serializable Python type when using the
# TypedBytes interface (recommended and the default); they are presented to your program in the same
# form they are provided.  All base types are serialized very efficiently, with a fallback to Pickle
# for types not supported by TypedBytes.  If this is confusing, just know that you can input/output
# anything you can pickle and Hadoopy handles it efficiently.
output_kvs = hadoopy.launch_local(get_lines(input_path), None, wc_py)['output']

# Analyze the output.  The output is an iterator of (word, count) where word is a string and count
# is an integer.
word_counts = dict(output_kvs)
for probe_word, expected_count in [('the', 1664), ('Alice', 221), ('tree', 3)]:
    print('word_counts[%s] = %d' % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]