Example #1
import os

# Import paths assume pydoop's module layout; adjust to your installed version.
from pydoop.mapreduce.pipes import InputSplit
from pydoop.mapreduce.simulator import HadoopSimulatorNetwork


def main():
    program_name = './avro_pyrw.py'
    data_in = './users.avro'
    path = os.path.realpath(data_in)
    length = os.stat(path).st_size
    # Serialize a split covering the whole input file.
    input_split = InputSplit.to_string('file://' + path, 0, length)
    out_path = os.path.realpath('.')
    conf = {
        "mapreduce.task.partition": "0",
        "mapreduce.task.output.dir": 'file://%s' % out_path,
    }
    # Run the pipes program through pydoop's network simulator.
    hsn = HadoopSimulatorNetwork(program=program_name)
    hsn.run(None, None, conf, input_split=input_split)
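All of these examples serialize the split with InputSplit.to_string(uri, offset, length); the reader side parses it back. A minimal round-trip sketch, assuming only the attributes exercised by the test in Example #5:

s = InputSplit.to_string('file:///tmp/users.avro', 0, 1024)
split = InputSplit(s)
# The parsed split exposes the values that to_string packed in.
assert split.filename == 'file:///tmp/users.avro'
assert split.offset == 0 and split.length == 1024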
Example #2
File: run.py Project: crs4/pydoop
import os
import tempfile

# Import path assumes pydoop's module layout; adjust to your installed version.
from pydoop.mapreduce.pipes import InputSplit

# EXAMPLES_DIR (the root of pydoop's examples tree) is defined elsewhere in run.py.


def create_configuration():
    data_in = os.path.join(EXAMPLES_DIR, 'input', 'alice_1.txt')
    data_out = 'results.txt'
    data_in_uri = 'file://%s' % data_in
    data_in_size = os.stat(data_in).st_size
    output_dir = tempfile.mkdtemp(prefix="pydoop_")
    output_dir_uri = 'file://%s' % output_dir
    conf = {
        "mapred.map.tasks": "2",
        "mapred.reduce.tasks": "1",
        "mapred.job.name": "wordcount",
        "mapred.work.output.dir": output_dir_uri,
        "mapred.task.partition": "0",
    }
    # One split covering the whole input file.
    input_split = InputSplit.to_string(data_in_uri, 0, data_in_size)
    return data_in, data_out, conf, input_split, output_dir
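A hedged usage sketch for the configuration above: feed it to pydoop's simulator the way Examples #1 and #3 do (the script name wordcount.py is hypothetical; the run() call is copied from those examples):

data_in, data_out, conf, input_split, output_dir = create_configuration()
hsn = HadoopSimulatorNetwork(program='./wordcount.py')  # hypothetical script
hsn.run(None, None, conf, input_split=input_split)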
Example #3
import os
import shutil
import sys

# Import paths assume pydoop's module layout; adjust to your installed version.
from pydoop.mapreduce.pipes import InputSplit
from pydoop.mapreduce.simulator import HadoopSimulatorNetwork


def main(argv):
    try:
        data_in = argv[1]
    except IndexError:
        sys.exit("Usage: python %s AVRO_FILE" % argv[0])
    # Copy the Avro schema next to the script; cp_script (a helper defined
    # elsewhere in this example) copies the pipes program into place.
    shutil.copy('../schemas/stats.avsc', 'stats.avsc')
    program_name = cp_script('./avro_pyrw.py')
    path = os.path.realpath(data_in)
    length = os.stat(path).st_size
    input_split = InputSplit.to_string('file://' + path, 0, length)
    out_path = os.path.realpath('.')
    conf = {
        "mapreduce.task.partition": "0",
        "mapreduce.task.output.dir": 'file://%s' % out_path,
    }
    hsn = HadoopSimulatorNetwork(program=program_name)
    hsn.run(None, None, conf, input_split=input_split)
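A plausible entry point for the script above; the original file's guard is not shown, so this is an assumption:

if __name__ == '__main__':
    main(sys.argv)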
Example #4
def get_areader(offset, length):
    # url, FunkyCtx and AvroReader are defined in the enclosing test module.
    isplit = InputSplit(InputSplit.to_string(url, offset, length))
    ctx = FunkyCtx(isplit)
    return AvroReader(ctx)
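FunkyCtx itself is not shown here. A minimal stand-in, assuming pydoop's AvroReader only reads the split from its context's input_split attribute:

class FunkyCtx(object):
    def __init__(self, isplit):
        # Expose the split the way a real task context would.
        self.input_split = isplit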
Example #5
def test_input_split(self):
    # Each fixture is a (serialized_split, filename, offset, length) tuple;
    # InputSplit must round-trip the values that to_string packed in.
    for s in example_input_splits:
        i = InputSplit(s[0])
        self.assertEqual(i.filename, s[1])
        self.assertEqual(i.offset, s[2])
        self.assertEqual(i.length, s[3])
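The example_input_splits fixture is not shown. A self-consistent sketch, built with to_string itself so each tuple pairs a serialized split with the values it was packed from (paths are hypothetical):

example_input_splits = [
    (InputSplit.to_string('file:///data/a.txt', 0, 100),
     'file:///data/a.txt', 0, 100),
    (InputSplit.to_string('file:///data/b.txt', 50, 200),
     'file:///data/b.txt', 50, 200),
]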