def ts_rank(job_id, iter_count, input_path, top_n):
    if not (job_id and iter_count and input_path and os.path.exists(input_path)):
        print 'error: invalid arguments or missing input file'
        return [], {}
    print 'job_id:', monitor(job_id)
    if monitor(job_id) == 'finished':
        print 'hadoop_results start'
        return hadoop_results(job_id, top_n)
    fs = HadoopFS()
    fs.rmr('%s' % job_id)
    fs.mkdir('%s' % job_id)
    fs.put(input_path, '%s/hat_init' % job_id)  # path of the input file
    # init
    ts_rank_iter = TsRankIter(input_path='%s/hat_init' % job_id,
                              output_path='%s/hat_tmp1' % job_id)
    ts_rank_iter.run()
    # iter
    for i in range(iter_count - 1):
        ts_rank_iter = TsRankIter(input_path='%s/hat_tmp%s' % (job_id, (i + 1)),
                                  output_path='%s/hat_tmp%s' % (job_id, (i + 2)))
        ts_rank_iter.run()
    # sort
    # Is this input_path wrong? It looks correct: the final iteration writes hat_tmp<iter_count>.
    ts_rank_sorter = TsRankSorter(
        input_path='%s/hat_tmp%s' % (job_id, iter_count),
        output_path='%s/hat_results' % job_id)
    ts_rank_sorter.run()
    # clean init and temp files
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)
    sorted_uids, all_uid_tr = hadoop_results(job_id, top_n)
    return sorted_uids, all_uid_tr
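# A minimal usage sketch for ts_rank() above, assuming the function is importable
# in the current scope and a TsRank-formatted input file already exists locally.
# The job id 'tr_demo' and the path '/tmp/tsrank_input.txt' are hypothetical.
def _ts_rank_demo():
    job_id = 'tr_demo'
    sorted_uids, all_uid_tr = ts_rank(job_id, 5, '/tmp/tsrank_input.txt', 100)
    # sorted_uids: top-100 uids ordered by rank; all_uid_tr: uid -> rank value for every uid
    for uid in sorted_uids[:10]:
        print uid, all_uid_tr[uid]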
def test_pagerank(self):
    size = 1000
    g = nx.DiGraph(nx.powerlaw_cluster_graph(size, 3, 0.001))
    N = len(g.nodes())
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    for node in g.nodes():
        outlinks = g.out_edges(nbunch=[node])
        outlinks = map(str, [n2 for n1, n2 in outlinks])
        if not outlinks:
            value = 'pr_results,%s,%s' % (1.0 / N, N)
            tmp_file.write('%s\t%s\n' % (node, value))
        else:
            outlinks_str = ','.join(outlinks)
            value = 'pr_results,%s,%s,' % (1.0 / N, N)
            value += outlinks_str
            tmp_file.write('%s\t%s\n' % (node, value))
    tmp_file.flush()
    input_path = tmp_file.name
    job_id = 'unittest'
    sorted_ids = pagerank(job_id, self.iter_count, input_path, self.top_n)
    fs = HadoopFS()
    fs.rmr('%s/hat_results' % job_id)
    if self.top_n <= size:
        self.assertEqual(len(sorted_ids), self.top_n, 'some ids are missing')
    id_ranges = range(0, size)
    for _id in sorted_ids:
        self.assertIn(int(_id), id_ranges, 'node should be in graph')
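# Sketch of the input record format that test_pagerank() above writes, for reference.
# Each line is "<node>\t<value>" where value is "pr_results,<initial_pr>,<node_count>[,<outlink>,...]";
# nodes without outlinks simply omit the trailing id list. The node ids below are illustrative only.
example_lines = [
    '0\tpr_results,0.001,1000,1,2',   # node 0 links to nodes 1 and 2
    '1\tpr_results,0.001,1000,2',     # node 1 links to node 2
    '2\tpr_results,0.001,1000',       # node 2 has no outlinks
]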
def pagerank(job_id, iter_count, input_path, top_n):
    if not (job_id and iter_count and input_path and os.path.exists(input_path)):
        return []
    if monitor(job_id) == 'finished':
        return hadoop_results(job_id, top_n)
    # set work dir and put input temp file into file system
    fs = HadoopFS()
    fs.rmr('%s' % job_id)
    fs.mkdir('%s' % job_id)
    fs.put(input_path, '%s/hat_init' % job_id)
    # init
    pr_iter = PageRankIter(input_path='%s/hat_init' % job_id,
                           output_path='%s/hat_tmp1' % job_id)
    pr_iter.run()
    # iter
    for i in range(iter_count - 1):
        pr_iter = PageRankIter(input_path='%s/hat_tmp%s' % (job_id, (i + 1)),
                               output_path='%s/hat_tmp%s' % (job_id, (i + 2)))
        pr_iter.run()
    # sort
    pr_sorter = PageRankSorter(input_path='%s/hat_tmp%s' % (job_id, iter_count),
                               output_path='%s/hat_results' % job_id)
    pr_sorter.run()
    # clean init and temp files
    fs.rmr('%s/hat_tmp*' % job_id)
    fs.rmr('%s/hat_init' % job_id)
    # expected to be the flat sorted uid list here (cf. read_from_hdfs below)
    sorted_uids = hadoop_results(job_id, top_n)
    return sorted_uids
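# A minimal driver sketch for pagerank() above, assuming the function and HadoopFS
# are available in scope. The job id 'pr_demo' and the local input path are hypothetical;
# the input file must use the "<node>\tpr_results,..." line format shown earlier.
def _pagerank_demo():
    job_id = 'pr_demo'
    results = pagerank(job_id, iter_count=10, input_path='/tmp/pr_input.txt', top_n=50)
    print results
    # clean up the results directory once the output has been consumed
    HadoopFS().rmr('%s/hat_results' % job_id)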
def read_from_hdfs(job_id, top_n):
    fs = HadoopFS()
    outputs = fs.cat('%s/hat_results/*' % job_id)
    if not outputs:
        return []
    if len(outputs) > top_n:
        outputs = outputs[-top_n:]
    outputs.reverse()
    sorted_uids = []
    for line in outputs:
        uid, value = line.strip().split('\t')
        sorted_uids.append(uid)
    return sorted_uids
def pagerank_simulation_test(input_path):
    iter_count = 5
    top_n = 500
    job_id = 1
    sorted_ids = pagerank(job_id, iter_count, input_path, top_n)
    if sorted_ids:
        # print at most the first ten ranked ids
        for uid in sorted_ids[:10]:
            print uid
    fs = HadoopFS()
    fs.rmr('%s/hat_results' % job_id)
def hadoop_results(job_id, top_n):
    fs = HadoopFS()
    outputs = fs.cat('%s/hat_results/*' % job_id)
    if not outputs:
        return [], {}
    all_outputs = outputs
    if len(outputs) > top_n:
        outputs = outputs[-top_n:]
    outputs.reverse()
    sorted_uids = []
    all_uid_r = {}
    for line in all_outputs:
        uid, r = line.strip().split('\t')
        all_uid_r[uid] = r
    for line in outputs:
        uid, r = line.strip().split('\t')
        sorted_uids.append(uid)
    return sorted_uids, all_uid_r
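# Consumption sketch for hadoop_results() above, assuming a job whose sorted output
# already sits under '<job_id>/hat_results'; 'pr_demo' is a hypothetical job id.
top_uids, all_ranks = hadoop_results('pr_demo', 20)
for uid in top_uids:            # the top-20 uids, highest rank first
    print uid, all_ranks[uid]   # all_ranks maps every uid (not just the top 20) to its rank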
def monitor(job_id):
    fs = HadoopFS()
    finished = False
    has_tmps = False
    outputs = fs.ls('%s' % job_id)
    if not outputs:
        return 'data_not_prepared'
    count = 0
    for line in outputs:
        if 'tmp' in line:
            count += 1
            has_tmps = True
        if 'results' in line:
            if not has_tmps:
                finished = True
    if not finished:
        return 'stage%s' % count
    else:
        return 'finished'
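# A small sketch showing how monitor() above can be used to decide whether a job
# needs to be (re)started, assuming monitor() is in scope. 'pr_demo' is a hypothetical job id.
state = monitor('pr_demo')
if state == 'data_not_prepared':
    print 'no work dir yet, pagerank() will create it'
elif state == 'finished':
    print 'results are ready under pr_demo/hat_results'
else:
    print 'job is still iterating at', state   # e.g. 'stage3' means three hat_tmp dirs exist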
def save_to_tmp(job_id, data):
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    for key, value in data.iteritems():
        tmp_file.write('%s\t%s\n' % (key, value))
    tmp_file.flush()
    fs = HadoopFS()
    fs.rmr('%s' % job_id)
    fs.mkdir('%s' % job_id)
    fs.put(tmp_file.name, '%s/hat_init' % job_id)
    return tmp_file.name
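# Usage sketch for save_to_tmp() above: it serialises a dict as tab-separated
# "<key>\t<value>" lines and uploads the file to '<job_id>/hat_init'.
# The job id and the initial values below are purely illustrative.
initial = {
    '0': 'pr_results,0.001,1000,1,2',
    '1': 'pr_results,0.001,1000,2',
}
local_copy = save_to_tmp('pr_demo', initial)
print 'local copy kept at', local_copy   # NamedTemporaryFile(delete=False) leaves the file on disk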
def main():
    job_id = 'hat_1'
    if len(sys.argv) < 3:
        print 'Usage: python pagerank.py input_file iter_count'
        sys.exit()
    else:
        iter_count = int(sys.argv[2])
        input_file_name = sys.argv[1]
    fs = HadoopFS()
    # set work dir and put input file into file system
    fs.mkdir('%s' % job_id)
    fs.put(input_file_name, '%s/hat_init' % job_id)
    # init
    pr_iter = PageRankIter(input_path='%s/hat_init' % job_id,
                           output_path='%s/hat_tmp1' % job_id)
    pr_iter.run()
    # iter
    for i in range(iter_count - 1):
        pr_iter = PageRankIter(input_path='%s/hat_tmp%s' % (job_id, (i + 1)),
                               output_path='%s/hat_tmp%s' % (job_id, (i + 2)))
        pr_iter.run()
    # sort
    pr_sorter = PageRankSorter(input_path='%s/hat_tmp%s' % (job_id, iter_count),
                               output_path='%s/hat_results' % job_id)
    pr_sorter.run()
    # output and clean work dir
    try:
        outputs = fs.cat('%s/hat_results/*' % job_id)
        if len(outputs) > 100:
            outputs = outputs[-100:]
        for line in outputs:
            print line
    except Exception:
        raise
    finally:
        fs.rmr('%s' % job_id)
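# Invocation sketch for the pagerank.py entry point above; the input file name is
# hypothetical and must follow the "<node>\tpr_results,..." line format:
#
#   python pagerank.py graph.txt 10
#
# This runs 10 PageRank iterations under the fixed work dir 'hat_1', prints at most
# the last 100 lines of the sorted results, then removes the work dir.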
# -*- coding: utf-8 -*-
from hat.fs import HadoopFS

fs = HadoopFS(debug=True)
print fs.mkdir('test')
print fs.put('test.txt', 'test/test_for_fs')
for line in fs.cat('test/test_for_fs/*'):
    print line
print fs.rmr('test')
for line in fs.ls():
    print line