def ts_rank(job_id, iter_count, input_path, top_n):
    """Run the multi-pass TsRank Hadoop pipeline for ``job_id`` and return its results.

    If ``monitor(job_id)`` already reports ``'finished'`` the stored results
    are returned without re-running anything.  Otherwise the job directory is
    recreated through ``HadoopFS``, ``input_path`` is uploaded as
    ``<job_id>/hat_init``, ``iter_count`` ``TsRankIter`` passes are chained
    (``hat_tmp1`` .. ``hat_tmp<iter_count>``), ``TsRankSorter`` turns the last
    pass output into ``<job_id>/hat_results``, and the intermediate files are
    removed.

    NOTE(review): this file contains a second ``ts_rank`` definition after this
    one; if both live at module level the later one is the one callers get.

    Args:
        job_id: Identifier of the job; also used as the name of its working
            directory passed to ``HadoopFS``.
        iter_count: Number of ``TsRankIter`` passes to run; must be >= 1.
        input_path: Local path of the input file; must exist.
        top_n: Passed through to ``hadoop_results`` (presumably the number of
            top entries to return -- confirm against that helper).

    Returns:
        ``[]`` when the arguments are invalid; whatever
        ``hadoop_results(job_id, top_n)`` returns when the job was already
        finished; otherwise the ``(sorted_uids, all_uid_tr)`` pair unpacked
        from ``hadoop_results``.
    """
    # A negative iter_count is truthy, so it must be rejected explicitly:
    # otherwise the job directory is wiped and the sorter is pointed at a
    # nonexistent "hat_tmp<negative>" path.
    valid = bool(job_id and iter_count and input_path)
    valid = valid and iter_count >= 1 and os.path.exists(input_path)
    if not valid:
        # Single-argument parenthesised print emits the same text on Python 2
        # and also parses on Python 3.
        print('error')
        return []

    # Query the job status once so the value printed is the value tested.
    status = monitor(job_id)
    print('job_id: %s' % (status,))
    if status == 'finished':
        print('hadoop_results start')
        return hadoop_results(job_id, top_n)

    # Start from a clean job directory and upload the input file.
    job_dir = '%s' % job_id
    init_path = '%s/hat_init' % job_id
    fs = HadoopFS()
    fs.rmr(job_dir)
    fs.mkdir(job_dir)
    fs.put(input_path, init_path)

    # Pass 1 reads the uploaded input; every later pass reads the previous
    # pass's output.  Outputs are numbered hat_tmp1 .. hat_tmp<iter_count>.
    source = init_path
    for step in range(1, iter_count + 1):
        target = '%s/hat_tmp%s' % (job_id, step)
        TsRankIter(input_path=source, output_path=target).run()
        source = target

    # The original author asked whether the sorter's input_path is wrong.
    # It is not: after the loop `source` is hat_tmp<iter_count>, i.e. the
    # output of the final iteration pass.
    ts_rank_sorter = TsRankSorter(input_path=source,
                                  output_path='%s/hat_results' % job_id)
    ts_rank_sorter.run()

    # Clean up the uploaded input and the intermediate pass outputs.
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)

    # Unpacking (rather than returning the call directly) keeps the original
    # requirement that hadoop_results yields exactly two values here.
    sorted_uids, all_uid_tr = hadoop_results(job_id, top_n)
    return sorted_uids, all_uid_tr
def ts_rank(job_id, iter_count, input_path, top_n):
    """Run the multi-pass TsRank Hadoop pipeline for ``job_id`` and return its results.

    If ``monitor(job_id)`` already reports ``'finished'`` the stored results
    are returned without re-running anything.  Otherwise the job directory is
    recreated through ``HadoopFS``, ``input_path`` is uploaded as
    ``<job_id>/hat_init``, ``iter_count`` ``TsRankIter`` passes are chained
    (``hat_tmp1`` .. ``hat_tmp<iter_count>``), ``TsRankSorter`` turns the last
    pass output into ``<job_id>/hat_results``, and the intermediate files are
    removed.

    NOTE(review): an earlier ``ts_rank`` definition precedes this one in the
    file; if both live at module level this later one is the one callers get.

    Args:
        job_id: Identifier of the job; also used as the name of its working
            directory passed to ``HadoopFS``.
        iter_count: Number of ``TsRankIter`` passes to run; must be >= 1.
        input_path: Local path of the input file; must exist.
        top_n: Passed through to ``hadoop_results`` (presumably the number of
            top entries to return -- confirm against that helper).

    Returns:
        ``[]`` when the arguments are invalid; whatever
        ``hadoop_results(job_id, top_n)`` returns when the job was already
        finished; otherwise the ``(sorted_uids, all_uid_tr)`` pair unpacked
        from ``hadoop_results``.
    """
    # A negative iter_count is truthy, so it must be rejected explicitly:
    # otherwise the job directory is wiped and the sorter is pointed at a
    # nonexistent "hat_tmp<negative>" path.
    valid = bool(job_id and iter_count and input_path)
    valid = valid and iter_count >= 1 and os.path.exists(input_path)
    if not valid:
        # Single-argument parenthesised print emits the same text on Python 2
        # and also parses on Python 3.
        print('error')
        return []

    # Query the job status once so the value printed is the value tested.
    status = monitor(job_id)
    print('job_id: %s' % (status,))
    if status == 'finished':
        print('hadoop_results start')
        return hadoop_results(job_id, top_n)

    # Start from a clean job directory and upload the input file.
    job_dir = '%s' % job_id
    init_path = '%s/hat_init' % job_id
    fs = HadoopFS()
    fs.rmr(job_dir)
    fs.mkdir(job_dir)
    fs.put(input_path, init_path)

    # Pass 1 reads the uploaded input; every later pass reads the previous
    # pass's output.  Outputs are numbered hat_tmp1 .. hat_tmp<iter_count>.
    source = init_path
    for step in range(1, iter_count + 1):
        target = '%s/hat_tmp%s' % (job_id, step)
        TsRankIter(input_path=source, output_path=target).run()
        source = target

    # The original author asked whether the sorter's input_path is wrong.
    # It is not: after the loop `source` is hat_tmp<iter_count>, i.e. the
    # output of the final iteration pass.
    ts_rank_sorter = TsRankSorter(input_path=source,
                                  output_path='%s/hat_results' % job_id)
    ts_rank_sorter.run()

    # Clean up the uploaded input and the intermediate pass outputs.
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)

    # Unpacking (rather than returning the call directly) keeps the original
    # requirement that hadoop_results yields exactly two values here.
    sorted_uids, all_uid_tr = hadoop_results(job_id, top_n)
    return sorted_uids, all_uid_tr
def pagerank(job_id, iter_count, input_path, top_n):
    """Run the multi-pass PageRank Hadoop pipeline for ``job_id`` and return its results.

    If ``monitor(job_id)`` already reports ``'finished'`` the stored results
    are returned without re-running anything.  Otherwise the job directory is
    recreated through ``HadoopFS``, ``input_path`` is uploaded as
    ``<job_id>/hat_init``, ``iter_count`` ``PageRankIter`` passes are chained
    (``hat_tmp1`` .. ``hat_tmp<iter_count>``), ``PageRankSorter`` turns the
    last pass output into ``<job_id>/hat_results``, and the intermediate files
    are removed.

    Args:
        job_id: Identifier of the job; also used as the name of its working
            directory passed to ``HadoopFS``.
        iter_count: Number of ``PageRankIter`` passes to run; must be >= 1.
        input_path: Local path of the input file; must exist.
        top_n: Passed through to ``hadoop_results`` (presumably the number of
            top entries to return -- confirm against that helper).

    Returns:
        ``[]`` when the arguments are invalid; otherwise whatever
        ``hadoop_results(job_id, top_n)`` returns.
    """
    # A negative iter_count is truthy, so it must be rejected explicitly:
    # otherwise the job directory is wiped and the sorter is pointed at a
    # nonexistent "hat_tmp<negative>" path.
    valid = bool(job_id and iter_count and input_path)
    valid = valid and iter_count >= 1 and os.path.exists(input_path)
    if not valid:
        return []

    # Nothing to recompute when the job has already completed.
    if monitor(job_id) == 'finished':
        return hadoop_results(job_id, top_n)

    # Set up the work dir and put the input file into the file system.
    job_dir = '%s' % job_id
    init_path = '%s/hat_init' % job_id
    fs = HadoopFS()
    fs.rmr(job_dir)
    fs.mkdir(job_dir)
    fs.put(input_path, init_path)

    # Pass 1 reads the uploaded input; every later pass reads the previous
    # pass's output.  Outputs are numbered hat_tmp1 .. hat_tmp<iter_count>.
    source = init_path
    for step in range(1, iter_count + 1):
        target = '%s/hat_tmp%s' % (job_id, step)
        PageRankIter(input_path=source, output_path=target).run()
        source = target

    # Sort the output of the final pass (hat_tmp<iter_count>) into hat_results.
    pr_sorter = PageRankSorter(input_path=source,
                               output_path='%s/hat_results' % job_id)
    pr_sorter.run()

    # Clean up the uploaded input and the intermediate pass outputs.
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)

    return hadoop_results(job_id, top_n)