def ts_rank(job_id, iter_count, input_path, top_n):
    """Run the iterative TsRank job on Hadoop and return the ranked results.

    Parameters:
        job_id: HDFS working-directory name for this job (also its job key).
        iter_count: number of rank iterations to run (must be >= 1).
        input_path: local path of the initial input file to upload.
        top_n: number of top-ranked uids to return.

    Returns:
        (sorted_uids, all_uid_tr): the top uids in descending rank order and
        a dict mapping every uid to its rank value.  Returns ([], {}) on
        invalid arguments or a missing input file.
    """
    if not (job_id and iter_count and input_path and os.path.exists(input_path)):
        print('error')
        # Keep the error return shaped like the success path so callers can
        # always unpack two values (the original returned a bare []).
        return [], {}
    # Query the job state once instead of twice; monitor() does an HDFS ls.
    status = monitor(job_id)
    print('job_id: %s' % status)
    if status == 'finished':
        print('hadoop_results start')
        return hadoop_results(job_id, top_n)
    fs = HadoopFS()
    fs.rmr('%s' % job_id)
    fs.mkdir('%s' % job_id)
    # Upload the local input file into the job's HDFS directory.
    fs.put(input_path, '%s/hat_init' % job_id)
    # First iteration: hat_init -> hat_tmp1.
    ts_rank_iter = TsRankIter(input_path='%s/hat_init' % job_id,
                              output_path='%s/hat_tmp1' % job_id)
    ts_rank_iter.run()
    # Remaining iterations: hat_tmp(i+1) -> hat_tmp(i+2), ending at
    # hat_tmp{iter_count}.
    for i in range(iter_count - 1):
        ts_rank_iter = TsRankIter(
            input_path='%s/hat_tmp%s' % (job_id, (i + 1)),
            output_path='%s/hat_tmp%s' % (job_id, (i + 2)))
        ts_rank_iter.run()
    # Sort the final iteration's output.  NOTE: hat_tmp{iter_count} is the
    # correct input here — the init step wrote tmp1 and the loop wrote
    # tmp2..tmp{iter_count} (this resolves the original review question).
    ts_rank_sorter = TsRankSorter(
        input_path='%s/hat_tmp%s' % (job_id, iter_count),
        output_path='%s/hat_results' % job_id)
    ts_rank_sorter.run()
    # Clean up the init and temp files so monitor() can report 'finished'.
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)
    sorted_uids, all_uid_tr = hadoop_results(job_id, top_n)
    return sorted_uids, all_uid_tr
def hadoop_results(job_id, top_n):
    """Read the sorted TsRank output for *job_id* from HDFS.

    Returns (sorted_uids, all_uid_r): the top_n uids in descending rank
    order, plus a dict mapping every uid to its rank value (kept as the
    raw string from the output file).  Returns ([], {}) when the results
    directory is empty or missing.
    """
    fs = HadoopFS()
    lines = fs.cat('%s/hat_results/*' % job_id)
    if not lines:
        return [], {}
    # Each line is "uid<TAB>rank"; record the rank for every uid.
    all_uid_r = {}
    for entry in lines:
        uid, rank = entry.strip().split('\t')
        all_uid_r[uid] = rank
    # The sorter emits ascending rank order, so the last top_n lines are
    # the winners; walk them backwards to produce descending order.
    # (Slicing with [-top_n:] is a no-op when there are fewer lines.)
    top_lines = lines[-top_n:] if len(lines) > top_n else lines
    sorted_uids = [entry.strip().split('\t')[0]
                   for entry in reversed(top_lines)]
    return sorted_uids, all_uid_r
def monitor(job_id):
    """Report the progress of a TsRank job by inspecting its HDFS directory.

    Returns one of:
        'data_not_prepared' - the job directory is missing or empty.
        'stageN'            - N temp iteration outputs exist (still running,
                              or results written but temps not yet cleaned).
        'finished'          - results exist and all temp files were removed.
    """
    fs = HadoopFS()
    listing = fs.ls('%s' % job_id)
    if not listing:
        return 'data_not_prepared'
    # Scan the whole listing before deciding.  The original compared
    # 'results' against only the tmp entries seen *earlier* in the listing;
    # since 'hat_results' sorts before 'hat_tmp*', that reported 'finished'
    # while temp files still existed.
    tmp_count = sum(1 for line in listing if 'tmp' in line)
    has_results = any('results' in line for line in listing)
    if has_results and tmp_count == 0:
        return 'finished'
    return 'stage%s' % tmp_count