def run(self):
    """Build the user data set on HDFS and write the version tag file.

    Two modes, selected by whether ``self.local_version`` exists on the
    local filesystem:

    * missing: reset ``self.user_root``, run the ``pr.user`` job and rename
      ``self.merge`` to the ``"user"`` output path;
    * present: compute the dates returned by ``self.version_date()`` that
      are not listed in ``self.local_version``, copy those date directories
      from the first input into ``self.increamental_archive`` and run the
      ``pr.latest.user`` job.

    In both modes every date from ``self.version_date()`` is finally written,
    sorted and one per line, to the ``"version"`` output.

    Raises:
        Exception: if there are no dates at all, if there are no dates newer
            than the ones already recorded, or if the job exits non-zero or
            ``check_mr_success`` rejects its output path.
    """
    # Validate first so an empty date list fails before any client is built.
    dates = self.version_date()
    if not dates:
        raise Exception("These's no user data in[%s]" % self.input()[0].fn)

    hdfs = luigi.contrib.hdfs.hadoopcli_clients.create_hadoopcli_client()

    # Install the user-specific properties file next to the job binary.
    mr_path = os.path.dirname(self.bin)
    copyfile("%s/mapreduce.properties.user" % mr_path,
             "%s/mapreduce.properties" % mr_path)

    if not os.path.exists(self.local_version):
        # No local version file: full rebuild.
        hdfs.remove(self.user_root)
        hdfs.mkdir(self.user_root)
        exit_code = mr_cmd(self.bin, "pr.user")
        if exit_code != 0 or not check_mr_success(self.merge):
            raise Exception("GetExternalUser failed")
        hdfs.rename(self.merge, self.output()["user"].path)
    else:
        # Incremental run: only dates absent from the local version file.
        with open(self.local_version) as local_fd:
            local_dates = set(line.strip() for line in local_fd)
        latest_dates = set(dates) - local_dates
        if not latest_dates:
            raise Exception("These's no new arrival user data")

        hdfs.remove(self.user_root)
        hdfs.mkdir(self.user_root)
        hdfs.mkdir(self.increamental_archive)
        for d in latest_dates:
            hdfs.copy("%s/%s" % (self.input()[0].path, d),
                      "%s/%s" % (self.increamental_archive, d))

        exit_code = mr_cmd(self.bin, "pr.latest.user")
        if exit_code != 0 or not check_mr_success(self.output()["user"].path):
            raise Exception("GetExternalUser failed")

    # make version tag
    with self.output()["version"].open("w") as version_fd:
        # sorted() leaves the list returned by version_date() untouched.
        for d in sorted(dates):
            version_fd.write("%s\n" % (d,))
def run(self):
    """Rebuild the paper data set under ``self.paper_root``.

    Recreates ``self.paper_root`` on HDFS, copies
    ``mapreduce.properties.doc`` over ``mapreduce.properties`` in the
    directory of ``self.bin`` and invokes ``mr_cmd`` with ``"pr.paper"``.

    Raises:
        Exception: if ``mr_cmd`` returns non-zero or ``check_mr_success``
            rejects the output path.
    """
    client = luigi.contrib.hdfs.hadoopcli_clients.create_hadoopcli_client()

    # Drop and recreate the output root.
    client.remove(self.paper_root)
    client.mkdir(self.paper_root)

    # Activate the doc-specific properties file next to the job binary.
    bin_dir = os.path.dirname(self.bin)
    doc_properties = "%s/mapreduce.properties.doc" % bin_dir
    active_properties = "%s/mapreduce.properties" % bin_dir
    copyfile(doc_properties, active_properties)

    status = mr_cmd(self.bin, "pr.paper")
    # The success check is only consulted when the command exited cleanly.
    job_ok = status == 0 and check_mr_success(self.output().path)
    if not job_ok:
        raise Exception("GetExternalPaper failed")
def infer_topic(in_fn, model_fn, out_fn, conf):
    """Run the "mr plda+ infer" Hadoop streaming job and fetch its output.

    Uploads ``in_fn`` to a per-call HDFS input directory, writes a reducer
    wrapper script that calls ``infer_reducer`` with the configured
    parameters, launches the streaming job and, when ``check_mr_success``
    accepts the output directory, writes the job output to ``out_fn`` via
    ``get_mr_dir``. The wrapper script and both HDFS directories are removed
    afterwards whether or not the job succeeded.

    Args:
        in_fn: local file uploaded as the job input.
        model_fn: model file shipped with the job; the reducer opens it by
            its basename in the task working directory.
        out_fn: local path that receives the job output.
        conf: path of the config file providing the ``basic`` and ``plda+``
            sections.

    Raises:
        Exception: if ``check_mr_success`` rejects the output directory.
    """
    parser = SafeConfigParser()
    parser.read(conf)
    root = parser.get("basic", "root")
    hadoop_stream = parser.get('basic', 'hadoop_stream')
    topic_num = parser.getint('plda+', 'topic_num')
    alpha = 50.0 / topic_num

    # A fresh UUID per call gives every run its own HDFS directories.
    task_id = uuid.uuid4()
    infer_in_path = "%s/%s" % (parser.get('plda+', 'infer_in_path'), task_id)
    infer_out_path = "%s/%s" % (parser.get('plda+', 'infer_out_path'), task_id)
    infer_burn_in_iter = parser.getint('plda+', 'infer_burn_in_iter')
    infer_total_iter = parser.getint('plda+', 'infer_total_iter')
    infer_reduce_tasks = parser.getint('plda+', 'infer_reduce_tasks')
    infer_reducer_mb = parser.getint('plda+', 'infer_reducer_mb')

    mapper = '%s/plda/infer_mapper' % root
    reducer = '%s/plda/infer_reducer' % root
    # NOTE(review): unlike the HDFS paths, this file name does not include
    # task_id, so overlapping calls with the same root would share it --
    # confirm callers never run concurrently.
    reducer_wrapper = '%s/data/temp/reducer_wrapper.sh' % root

    hdfs = luigi.contrib.hdfs.hadoopcli_clients.create_hadoopcli_client()
    hdfs.mkdir(infer_in_path)
    hdfs.put(in_fn, infer_in_path)

    reducer_cmd = (
        "./infer_reducer --alpha %f --beta 0.01 --model_file ./%s "
        "--burn_in_iterations %d --total_iterations %d -sparse true"
    ) % (alpha, os.path.basename(model_fn), infer_burn_in_iter, infer_total_iter)
    with open(reducer_wrapper, 'w') as wrapper_fd:
        wrapper_fd.write("#!/bin/bash\n")
        wrapper_fd.write(reducer_cmd + "\n")

    cmd = (
        'hadoop jar %s'
        ' -D mapred.job.name="mr plda+ infer"'
        ' -D mapred.job.map.memory.mb=32'
        ' -D mapred.job.reduce.memory.mb=%d'
        ' -D io.compression.codecs=org.apache.hadoop.io.compress.DefaultCodec'
        ' -input %s'
        ' -output %s'
        ' -file %s'
        ' -file %s'
        ' -file %s'
        ' -file %s'
        ' -mapper ./infer_mapper'
        ' -reducer ./reducer_wrapper.sh'
        ' -numReduceTasks %d'
    ) % (hadoop_stream, infer_reducer_mb, infer_in_path, infer_out_path,
         model_fn, mapper, reducer, reducer_wrapper, infer_reduce_tasks)

    try:
        # The exit status is not inspected; success is decided below by
        # check_mr_success on the output directory.
        os.system(cmd)
    finally:
        # The wrapper is only needed while the job is being submitted/run.
        os.remove(reducer_wrapper)

    try:
        succeeded = check_mr_success(infer_out_path)
        if succeeded:
            with open(out_fn, 'w') as out_fd:
                get_mr_dir(infer_out_path, out_fd)
    finally:
        # Always drop the per-call HDFS directories, including when fetching
        # the result fails.
        hdfs.remove(infer_in_path)
        hdfs.remove(infer_out_path)

    if not succeeded:
        raise Exception("failed to infer topic")