示例#1
0
def ts_rank(job_id, iter_count, input_path, top_n):
    if not(job_id and iter_count and input_path and os.path.exists(input_path)):
        print 'error'
        return []
    print 'job_id:', monitor(job_id)
    if monitor(job_id) == 'finished':
        print 'hadoop_results start'
        return hadoop_results(job_id, top_n)
    fs = HadoopFS()
    fs.rmr('%s' % job_id)
    fs.mkdir('%s' % job_id)
    fs.put(input_path, '%s/hat_init' % job_id) # input文件的路径
    #init
    ts_rank_iter = TsRankIter(input_path='%s/hat_init' % job_id, output_path='%s/hat_tmp1' % job_id)
    ts_rank_iter.run()
    #iter
    for i in range(iter_count-1):
        ts_rank_iter = TsRankIter(input_path='%s/hat_tmp%s' % (job_id, (i+1)), output_path='%s/hat_tmp%s' % (job_id, (i+2)))
        ts_rank_iter.run()
    #sort
    ts_rank_sorter = TsRankSorter(input_path='%s/hat_tmp%s' % (job_id, iter_count), output_path='%s/hat_results' % job_id) # 这里的input_path是不是错了?
    ts_rank_sorter.run()
    # clean init and temp files
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)
    sorted_uids, all_uid_tr = hadoop_results(job_id, top_n)

    return sorted_uids, all_uid_tr
示例#2
0
def ts_rank(job_id, iter_count, input_path, top_n):
    if not (job_id and iter_count and input_path
            and os.path.exists(input_path)):
        print 'error'
        return []
    print 'job_id:', monitor(job_id)
    if monitor(job_id) == 'finished':
        print 'hadoop_results start'
        return hadoop_results(job_id, top_n)
    fs = HadoopFS()
    fs.rmr('%s' % job_id)
    fs.mkdir('%s' % job_id)
    fs.put(input_path, '%s/hat_init' % job_id)  # input文件的路径
    #init
    ts_rank_iter = TsRankIter(input_path='%s/hat_init' % job_id,
                              output_path='%s/hat_tmp1' % job_id)
    ts_rank_iter.run()
    #iter
    for i in range(iter_count - 1):
        ts_rank_iter = TsRankIter(input_path='%s/hat_tmp%s' % (job_id,
                                                               (i + 1)),
                                  output_path='%s/hat_tmp%s' % (job_id,
                                                                (i + 2)))
        ts_rank_iter.run()
    #sort
    ts_rank_sorter = TsRankSorter(
        input_path='%s/hat_tmp%s' % (job_id, iter_count),
        output_path='%s/hat_results' % job_id)  # 这里的input_path是不是错了?
    ts_rank_sorter.run()
    # clean init and temp files
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)
    sorted_uids, all_uid_tr = hadoop_results(job_id, top_n)

    return sorted_uids, all_uid_tr
示例#3
0
def pagerank(job_id, iter_count, input_path, top_n):
    if not (job_id and iter_count and input_path and os.path.exists(input_path)):
        return []

    if monitor(job_id) == 'finished':
        return hadoop_results(job_id, top_n)

    #set work dir and put input temp file into file system
    fs = HadoopFS()
    fs.rmr('%s' % job_id)
    fs.mkdir('%s' % job_id)
    fs.put(input_path, '%s/hat_init' % job_id)

    #init
    pr_iter = PageRankIter(input_path='%s/hat_init' % job_id, output_path='%s/hat_tmp1' % job_id)
    pr_iter.run()

    #iter
    for i in range(iter_count-1):
        pr_iter = PageRankIter(input_path='%s/hat_tmp%s' % (job_id, (i+1)), output_path='%s/hat_tmp%s' % (job_id, (i+2)))
        pr_iter.run()

    #sort
    pr_sorter = PageRankSorter(input_path='%s/hat_tmp%s' % (job_id, iter_count), output_path='%s/hat_results' % job_id)
    pr_sorter.run()

    #clean init and temp files
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)

    sorted_uids = hadoop_results(job_id, top_n)

    return sorted_uids