Exemplo n.º 1
0
Arquivo: check.py Projeto: crs4/pydoop
def check_grep(mr_out_dir):
    """Check grep job output against a local scan of the input files.

    The text returned by ``hadut.collect_output(mr_out_dir)`` is split
    into lines and compared with the stripped lines containing the
    substring "March" found in the files of ``DEFAULT_INPUT_DIR``
    (visited in sorted name order).

    Returns True when both line lists are equal, False otherwise.
    """
    actual_lines = hadut.collect_output(mr_out_dir).splitlines()
    expected_lines = []
    for fname in sorted(os.listdir(DEFAULT_INPUT_DIR)):
        path = os.path.join(DEFAULT_INPUT_DIR, fname)
        with open(path) as stream:
            for raw in stream:
                # Only matching lines are kept; surrounding whitespace
                # (including the trailing newline) is removed.
                if "March" in raw:
                    expected_lines.append(raw.strip())
    return actual_lines == expected_lines
Exemplo n.º 2
0
def check_grep(mr_out_dir):
    """Return True if the job output equals the locally computed matches.

    Expected lines are built by reading every file in
    ``DEFAULT_INPUT_DIR`` in sorted name order and keeping, stripped,
    each line that contains "March". They are compared with the lines of
    the text returned by ``hadut.collect_output(mr_out_dir)``.
    """
    collected = hadut.collect_output(mr_out_dir).splitlines()
    matches = []
    for entry in sorted(os.listdir(DEFAULT_INPUT_DIR)):
        entry_path = os.path.join(DEFAULT_INPUT_DIR, entry)
        with open(entry_path) as handle:
            matches += [text.strip() for text in handle if "March" in text]
    return collected == matches
Exemplo n.º 3
0
def check_caseswitch(mr_out_dir, switch="upper"):
    """Check case-switch job output against locally converted input.

    Each file in ``DEFAULT_INPUT_DIR`` (sorted name order) is read in
    full and transformed by calling the string method named by
    ``switch`` on its content. The concatenated result is compared line
    by line with the text returned by
    ``hadut.collect_output(mr_out_dir)``.

    Args:
        mr_out_dir: passed to ``hadut.collect_output``.
        switch: name of a no-argument ``str`` method to apply
            (default "upper").

    Returns True when the two line lists are equal.
    """
    actual_text = hadut.collect_output(mr_out_dir)
    converted = []
    for fname in sorted(os.listdir(DEFAULT_INPUT_DIR)):
        path = os.path.join(DEFAULT_INPUT_DIR, fname)
        with open(path) as stream:
            content = stream.read()
        transform = getattr(content, switch)
        converted.append(transform())
    expected_text = "".join(converted)
    return actual_text.splitlines() == expected_text.splitlines()
Exemplo n.º 4
0
Arquivo: check.py Projeto: crs4/pydoop
def check_caseswitch(mr_out_dir, switch="upper"):
    """Return True if the job output matches the case-converted input.

    The expected text is the concatenation, in sorted file-name order,
    of every file in ``DEFAULT_INPUT_DIR`` after calling the ``str``
    method named ``switch`` (with no arguments) on its content. Both
    texts are compared after ``splitlines()``.
    """
    collected = hadut.collect_output(mr_out_dir)
    pieces = []
    for entry in sorted(os.listdir(DEFAULT_INPUT_DIR)):
        with open(os.path.join(DEFAULT_INPUT_DIR, entry)) as handle:
            text = handle.read()
            # Look up the conversion method by name on the file content.
            convert = getattr(text, switch)
            pieces.append(convert())
    expected = "".join(pieces)
    return collected.splitlines() == expected.splitlines()
Exemplo n.º 5
0
def check_base_histogram(mr_out_dir):
    """Check base-histogram job output against a local character count.

    The job output (text from ``hadut.collect_output(mr_out_dir)``) is
    parsed as one ``key<TAB>integer`` record per line. The expected
    histogram counts every character of the tenth tab-separated field of
    each line in the files under ``THIS_DIR/data/base_histogram_input``.

    Returns True when the two counters are equal.
    """
    observed = Counter()
    for record in hadut.collect_output(mr_out_dir).splitlines():
        base, count = record.split("\t")
        observed[base] = int(count)

    input_dir = os.path.join(THIS_DIR, "data", "base_histogram_input")
    expected = Counter()
    for fname in os.listdir(input_dir):
        with open(os.path.join(input_dir, fname)) as stream:
            for row in stream:
                fields = row.rstrip().split("\t", 10)
                # Counter.update on a string tallies each character once.
                expected.update(fields[9])
    return observed == expected
Exemplo n.º 6
0
def main():
    """Run the word count and filter steps, then log the check result.

    Options come from the parser built by ``make_parser``. The value
    returned by ``run_wc`` is fed to ``run_filter``; the filter result is
    collected with ``hadut.collect_output`` and checked by a
    ``pts.LocalWordCount`` built from ``opt.input`` and ``opt.threshold``.
    Whatever ``check`` returns is logged at INFO level.
    """
    options, _ = make_parser().parse_args()
    log = logging.getLogger("main")
    log.setLevel(logging.INFO)

    log.info("running word count")
    wc_result = run_wc(options)

    log.info("running filter")
    filtered_result = run_filter(options, wc_result)

    log.info("checking results")
    collected = hadut.collect_output(filtered_result)
    reference = pts.LocalWordCount(
        options.input, min_occurrence=options.threshold
    )
    report = reference.check(collected)
    log.info(report)
Exemplo n.º 7
0
Arquivo: check.py Projeto: crs4/pydoop
def check_base_histogram(mr_out_dir):
    """Return True if the job's histogram equals the locally computed one.

    Observed counts are read from the text returned by
    ``hadut.collect_output(mr_out_dir)``, one tab-separated
    ``key``/``integer`` pair per line. Expected counts tally each
    character of the tenth tab-separated field of every line found in
    the files of ``THIS_DIR/data/base_histogram_input``.
    """
    observed = Counter()
    records = (
        ln.split("\t")
        for ln in hadut.collect_output(mr_out_dir).splitlines()
    )
    for key, value in records:
        observed[key] = int(value)

    expected = Counter()
    data_dir = os.path.join(THIS_DIR, "data", "base_histogram_input")
    for entry in os.listdir(data_dir):
        with open(os.path.join(data_dir, entry)) as handle:
            for text in handle:
                # At most 10 splits are needed to isolate field index 9.
                tenth_field = text.rstrip().split("\t", 10)[9]
                for symbol in tenth_field:
                    expected[symbol] += 1
    return observed == expected
Exemplo n.º 8
0
def main():
    """Run the dst counter and top50 filter, then save the collected output.

    Options come from the parser built by ``make_parser``. The value
    returned by ``run_dst`` is fed to ``run_filter``; the filter result is
    collected with ``hadut.collect_output`` and written to
    ``results/result_tf.txt``.
    """
    options, _ = make_parser().parse_args()
    log = logging.getLogger("main")
    log.setLevel(logging.INFO)

    log.info("running dst counter")
    dst_result = run_dst(options)

    log.info("running top50 filter")
    filtered_result = run_filter(options, dst_result)

    log.info("checking results")
    collected = hadut.collect_output(filtered_result)

    with open("results/result_tf.txt", "w") as sink:
        sink.write(collected)
Exemplo n.º 9
0
def check_wordcount(mr_out_dir, stop_words=None):
    """Check word count job output with ``pts.LocalWordCount``.

    Args:
        mr_out_dir: passed to ``hadut.collect_output``.
        stop_words: forwarded to ``pts.LocalWordCount`` (default None).

    Returns True when the string returned by ``LocalWordCount.check``
    starts with "OK".
    """
    collected = hadut.collect_output(mr_out_dir)
    reference = pts.LocalWordCount(DEFAULT_INPUT_DIR, stop_words=stop_words)
    verdict = reference.check(collected)
    # FIXME: change local_wc to raise an exception
    return verdict.startswith("OK")
Exemplo n.º 10
0
Arquivo: check.py Projeto: crs4/pydoop
def check_wordcount(mr_out_dir, stop_words=None):
    """Return True if the local word count check reports success.

    The text from ``hadut.collect_output(mr_out_dir)`` is passed to the
    ``check`` method of a ``pts.LocalWordCount`` built over
    ``DEFAULT_INPUT_DIR`` with the given ``stop_words``. Success is
    detected by the returned string starting with "OK".
    """
    job_text = hadut.collect_output(mr_out_dir)
    checker = pts.LocalWordCount(DEFAULT_INPUT_DIR, stop_words=stop_words)
    message = checker.check(job_text)
    # FIXME: change local_wc to raise an exception
    passed = message.startswith("OK")
    return passed
Exemplo n.º 11
0
def get_res(output_dir):
    """Collect the output under *output_dir* and parse it.

    Returns whatever ``pts.parse_mr_output`` produces for the collected
    text when called with ``vtype=int``.
    """
    collected = hadut.collect_output(output_dir)
    return pts.parse_mr_output(collected, vtype=int)