Example #1
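This driver uploads the local input to HDFS, runs a Pipes script through hadut.PipesRunner, parses the per-IP counts from the job output with pts.parse_mr_output, sorts them by count in descending order, and writes the (optionally truncated) list to the chosen output stream.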
def main(argv):
    parser = make_parser()
    opt, _ = parser.parse_args(argv)
    if opt.output is not sys.stdout:
        opt.output = open(opt.output, 'w')
    logger = logging.getLogger("main")
    logger.setLevel(logging.DEBUG)
    runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
    with open(LOCAL_MR_SCRIPT) as f:
        pipes_code = pts.add_sys_path(f.read())
    runner.set_input(opt.input, put=True)
    runner.set_exe(pipes_code)
    mr_options = get_mr_options(opt, runner.wd)
    runner.run(properties=mr_options,
               hadoop_conf_dir=HADOOP_CONF_DIR,
               logger=logger)
    mr_output = runner.collect_output()
    runner.clean()
    d = pts.parse_mr_output(mr_output, vtype=int)
    ip_list = sorted(d.iteritems(), key=operator.itemgetter(1), reverse=True)
    if opt.n_top:
        ip_list = ip_list[:opt.n_top]
    for ip, count in ip_list:
        opt.output.write("%s\t%d\n" % (ip, count))
    if opt.output is not sys.stdout:
        opt.output.close()
Example #2
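A word-count driver: it pushes the local input to HDFS, runs the given pipes executable with the configured properties, then checks the collected job output against a local word count built with pts.LocalWordCount.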
def main(argv):
    parser = make_parser()
    args = parser.parse_args(argv)
    update_conf(args)
    logger = logging.getLogger("main")
    logger.setLevel(logging.INFO)
    runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
    with open(args.pipes_exe) as f:
        pipes_code = pts.add_sys_path(f.read())
    runner.set_input(args.local_input, put=True)
    runner.set_exe(pipes_code)
    runner.run(properties=CONF, hadoop_conf_dir=HADOOP_CONF_DIR, logger=logger)
    res = runner.collect_output()
    runner.clean()
    local_wc = pts.LocalWordCount(args.local_input)
    logging.info(local_wc.check(res))
Example #3
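A minimal variant that takes the input path from the command line, runs the job with default settings, removes the HDFS working directory, and compares the collected output with a locally computed result.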
def main(argv):
  logger = logging.getLogger("main")
  logger.setLevel(logging.INFO)
  local_input = argv[1]
  with open(MR_SCRIPT) as f:
    pipes_code = pts.add_sys_path(f.read())
  runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
  runner.set_input(local_input, put=True)
  runner.set_exe(pipes_code)
  runner.run()
  res = runner.collect_output()
  runner.clean()
  hdfs.rmr(HDFS_WD)
  logger.info("checking results")
  expected_res = local_vc(local_input)
  logger.info(check(res, expected_res))
Example #4
File: run_wc.py  Project: ZEMUSHKA/pydoop
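Essentially the same word-count driver as Example #2, taken from run_wc.py, with DEBUG-level logging.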
def main(argv):
  parser = make_parser()
  args = parser.parse_args(argv)
  update_conf(args)
  logger = logging.getLogger("main")
  logger.setLevel(logging.DEBUG)
  runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
  with open(args.pipes_exe) as f:
    pipes_code = pts.add_sys_path(f.read())
  runner.set_input(args.local_input, put=True)
  runner.set_exe(pipes_code)
  runner.run(properties=CONF, hadoop_conf_dir=HADOOP_CONF_DIR, logger=logger)
  res = runner.collect_output()
  runner.clean()
  local_wc = pts.LocalWordCount(args.local_input)
  logging.info(local_wc.check(res))
Example #5
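A small benchmark: it optionally generates a dataset, uploads the size-labelled inputs (files ending in "MB") to HDFS when missing, runs the pipes script once per input while timing each run, verifies every result with pts.LocalWordCount, and prints a summary of the execution times.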
def main(argv):

    logger = logging.getLogger("main")
    logger.setLevel(logging.DEBUG)

    with Timer() as total_time:

        parser = make_parser()
        args = parser.parse_args(argv)
        if args.dataset:
            print args.dataset
            create_dataset(logger, args.dataset)

        if args.script:
            piped_code_file = args.script
        else:
            piped_code_file = DEFAULT_SCRIPT

        if not os.path.exists(piped_code_file):
            raise IOError("script {0} not found !!!".format(piped_code_file))

        with open(piped_code_file) as f:
            pipes_code = pts.add_sys_path(f.read())

        dataset = [d for d in os.listdir("dataset") if d.endswith("MB")]
        dataset.sort(cmp=lambda x, y: cmp(int(x.replace("MB", "")),
                                          int(y.replace("MB", ""))))

        logger.info(" Uploading dataset: { %s }", ', '.join(dataset))
        if not hadut.path_exists(os.path.join(DATASET_DIR)):
            logger.info("  dataset folder created")
            hdfs.mkdir(DATASET_DIR)

        for data_filename in dataset:
            source_path = os.path.join(DATASET_DIR, data_filename)
            dest_path = os.path.join(DATASET_DIR, data_filename)

            if not hadut.path_exists(os.path.join(DATASET_DIR, data_filename)):
                logger.info(" -> uploading %s...", source_path)
                hdfs.put(source_path, dest_path)

        update_conf(args)

        results = dict()
        for data_input in dataset:

            with Timer() as t:
                runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
                logger.info("Running the script %s with data input %s..",
                            piped_code_file, data_input)
                data_input_path = os.path.join(DATASET_DIR, data_input)
                runner.set_input(data_input_path, put=False)
                runner.set_exe(pipes_code)
                runner.run(properties=CONF,
                           hadoop_conf_dir=HADOOP_CONF_DIR,
                           logger=logger)
                res = runner.collect_output()
                print data_input_path
                local_wc = pts.LocalWordCount(data_input_path)
                logging.info(local_wc.check(res))
                # print res
                # runner.clean()
            results[data_input] = (t.secs, t.msecs)

    print "\n\n RESULTs"
    print "=" * (len(piped_code_file) + 15)
    print " *  script: {0}".format(piped_code_file)
    print " *  mappers: {0}".format(CONF["mapred.map.tasks"])
    print " *  reducers: {0}".format(CONF["mapred.reduce.tasks"])
    print " *  dataset: [{0}]".format(",".join(dataset))
    print " *  times (input -> secs):"
    for data_input in dataset:
        print "    - {0} -> {1} secs.".format(data_input,
                                              results[data_input][0])
    print "\n => Total execution time: {0}".format(total_time.secs)
    print "=" * (len(piped_code_file) + 15)
    print "\n"
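All of the examples above follow the same PipesRunner workflow: read the pipes script, prepend the caller's sys.path, stage the input, run the job, collect the output, and clean up. Below is a minimal sketch of that pattern. It assumes pts is pydoop.test_support and hadut is pydoop.hadut, as the snippets' names suggest, and PREFIX, HADOOP_CONF_DIR and MR_SCRIPT are placeholders to replace with your own values.

import logging

import pydoop.hadut as hadut
import pydoop.test_support as pts

PREFIX = "pydoop_pipes_"              # HDFS working-dir prefix (placeholder)
HADOOP_CONF_DIR = "/etc/hadoop/conf"  # Hadoop configuration dir (placeholder)
MR_SCRIPT = "wordcount.py"            # local pipes script to run (placeholder)


def run_pipes_job(local_input, properties=None):
    logger = logging.getLogger("main")
    logger.setLevel(logging.INFO)
    # Read the MapReduce script and prepend the local sys.path so the
    # remote tasks can import the same modules.
    with open(MR_SCRIPT) as f:
        pipes_code = pts.add_sys_path(f.read())
    runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
    runner.set_input(local_input, put=True)  # put=True uploads local data to HDFS
    runner.set_exe(pipes_code)
    runner.run(properties=properties or {}, hadoop_conf_dir=HADOOP_CONF_DIR,
               logger=logger)
    res = runner.collect_output()  # collect the job output for parsing/checking
    runner.clean()                 # drop the HDFS working directory
    return res

In the word-count examples the value returned by collect_output() is then passed to pts.LocalWordCount(local_input).check(res) for verification, while Example #1 feeds it to pts.parse_mr_output() instead.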