Пример #1
0
def ts_rank(job_id, iter_count, input_path, top_n):
    """Run the TsRank Hadoop pipeline for a job and return its ranked results.

    If ``monitor`` reports the job as ``'finished'``, the existing results
    are returned without recomputation.  Otherwise the job directory is
    recreated, ``input_path`` is put into it as ``hat_init``, ``TsRankIter``
    is run ``iter_count`` times in total, ``TsRankSorter`` writes
    ``hat_results``, and the init/temp files are removed.

    Args:
        job_id: Identifier used as the job's directory name on HadoopFS.
        iter_count: Total number of ``TsRankIter`` passes; must be truthy.
        input_path: Path of the input file; must exist on the local
            filesystem (checked with ``os.path.exists``).
        top_n: Forwarded unchanged to ``hadoop_results``.

    Returns:
        The value of ``hadoop_results(job_id, top_n)``, or an empty list
        when argument validation fails.
    """
    if not (job_id and iter_count and input_path
            and os.path.exists(input_path)):
        print('error')
        # NOTE(review): every other path returns a 2-tuple; the bare list is
        # kept here so existing callers that test for an empty result still
        # work.
        return []

    # Query the job state once so the printed and the tested status agree
    # (and the HDFS listing is not fetched twice).
    status = monitor(job_id)
    print('job_id: %s' % status)
    if status == 'finished':
        print('hadoop_results start')
        return hadoop_results(job_id, top_n)

    # Start from a clean job directory.
    job_dir = '%s' % job_id
    fs = HadoopFS()
    fs.rmr(job_dir)
    fs.mkdir(job_dir)
    fs.put(input_path, '%s/hat_init' % job_id)  # path of the input file

    # First pass: hat_init -> hat_tmp1.
    TsRankIter(input_path='%s/hat_init' % job_id,
               output_path='%s/hat_tmp1' % job_id).run()

    # Remaining passes: hat_tmp<k> -> hat_tmp<k+1> for k = 1 .. iter_count-1.
    for step in range(1, iter_count):
        TsRankIter(input_path='%s/hat_tmp%s' % (job_id, step),
                   output_path='%s/hat_tmp%s' % (job_id, step + 1)).run()

    # Sort the output of the last pass.  The final pass above writes
    # hat_tmp<iter_count>, so that is the correct sorter input.
    TsRankSorter(input_path='%s/hat_tmp%s' % (job_id, iter_count),
                 output_path='%s/hat_results' % job_id).run()

    # Clean init and temp files.
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)

    return hadoop_results(job_id, top_n)
Пример #2
0
def hadoop_results(job_id, top_n):
    """Read a job's ``hat_results`` output and return the top-ranked uids.

    Each non-blank line of the output is expected to be ``uid<TAB>value``.
    The last ``top_n`` lines are returned in reverse order, i.e. the final
    line of the output comes first (presumably the sorter writes ascending
    rank order -- confirm against ``TsRankSorter``).

    Args:
        job_id: Identifier used as the job's directory name on HadoopFS.
        top_n: Maximum number of uids to return; values <= 0 yield an
            empty ranking.

    Returns:
        A tuple ``(sorted_uids, all_uid_r)`` where ``sorted_uids`` is the
        list of at most ``top_n`` uids and ``all_uid_r`` maps every uid in
        the output to its value (kept as the raw string).  Returns
        ``([], {})`` when the output is empty.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    fs = HadoopFS()
    outputs = fs.cat('%s/hat_results/*' % job_id)
    if not outputs:
        return [], {}

    # Parse every line exactly once, preserving file order.
    uids_in_order = []
    all_uid_r = {}
    for line in outputs:
        record = line.strip()
        if not record:
            # Tolerate blank lines instead of failing on unpack.
            continue
        uid, r = record.split('\t')
        uids_in_order.append(uid)
        all_uid_r[uid] = r

    # Guard top_n explicitly: seq[-0:] is the whole sequence, not an empty
    # one, and a negative top_n would silently drop a prefix instead.
    if top_n > 0:
        sorted_uids = uids_in_order[-top_n:]
        sorted_uids.reverse()
    else:
        sorted_uids = []
    return sorted_uids, all_uid_r
Пример #3
0
def monitor(job_id):
    """Report the progress of a job by inspecting its HadoopFS directory.

    Args:
        job_id: Identifier used as the job's directory name on HadoopFS.

    Returns:
        ``'data_not_prepared'`` if the directory listing is empty,
        ``'finished'`` if a listing entry contains ``'results'`` and no
        entry contains ``'tmp'``, otherwise ``'stage<N>'`` where ``N`` is
        the number of listing entries containing ``'tmp'``.
    """
    fs = HadoopFS()
    outputs = fs.ls('%s' % job_id)
    if not outputs:
        return 'data_not_prepared'

    tmp_count = 0
    has_results = False
    for line in outputs:
        if 'tmp' in line:
            tmp_count += 1
        if 'results' in line:
            has_results = True

    # Decide only after scanning the whole listing, so the answer does not
    # depend on whether the results entry is listed before or after the
    # temp entries.
    if has_results and tmp_count == 0:
        return 'finished'
    return 'stage%s' % tmp_count