Example #1
    def run(self):
        hdfs = luigi.contrib.hdfs.hadoopcli_clients.create_hadoopcli_client()
        # dated partitions of user data available from the upstream task
        dates = self.version_date()
        if len(dates) == 0:
            raise Exception("There's no user data in [%s]" % self.input()[0].fn)

        exit_code = -1
        # switch the MapReduce properties to the user-specific profile
        mr_path = os.path.dirname(self.bin)
        copyfile("%s/mapreduce.properties.user" % mr_path, "%s/mapreduce.properties" % mr_path)
        if not os.path.exists(self.local_version):
            # first run: no local version file yet, so rebuild the full user dataset
            hdfs.remove(self.user_root)
            hdfs.mkdir(self.user_root)
            exit_code = mr_cmd(self.bin, "pr.user")
            if exit_code != 0 or not check_mr_success(self.merge):
                raise Exception("GetExternalUser failed")
            hdfs.rename(self.merge, self.output()["user"].path)
        else:
            # incremental run: process only dates not yet recorded locally
            with open(self.local_version) as fd:
                local_dates = [line.strip() for line in fd]
            latest_dates = set(dates) - set(local_dates)
            if len(latest_dates) == 0:
                raise Exception("There's no newly arrived user data")
            hdfs.remove(self.user_root)
            hdfs.mkdir(self.user_root)
            hdfs.mkdir(self.increamental_archive)
            for d in latest_dates:
                hdfs.copy("%s/%s" % (self.input()[0].path, d), "%s/%s" % (self.increamental_archive, d))
            exit_code = mr_cmd(self.bin, "pr.latest.user")
            if exit_code != 0 or not check_mr_success(self.output()["user"].path):
                raise Exception("GetExternalUser failed")
        # make version tag
        with self.output()["version"].open("w") as version_fd:
            dates.sort()
            for d in dates:
                print >> version_fd, d
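
mr_cmd is defined elsewhere in the project; these snippets only rely on it returning a process exit code. A minimal sketch under that assumption (the real signature and behavior are unknown):

import subprocess

def mr_cmd(bin_path, job_name):
    # assumed behavior: launch the MapReduce driver binary for the named job
    # and hand back its exit code, which the callers compare against 0
    return subprocess.call([bin_path, job_name])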
Example #2
    def run(self):
        hdfs = luigi.contrib.hdfs.hadoopcli_clients.create_hadoopcli_client()
        # rebuild the paper dataset from scratch, then run the pr.paper MR job
        hdfs.remove(self.paper_root)
        hdfs.mkdir(self.paper_root)
        mr_path = os.path.dirname(self.bin)
        copyfile("%s/mapreduce.properties.doc" % mr_path, "%s/mapreduce.properties" % mr_path)
        exit_code = mr_cmd(self.bin, "pr.paper")
        if exit_code != 0 or not check_mr_success(self.output().path):
            raise Exception("GetExternalPaper failed")
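
check_mr_success is another shared project helper. A plausible sketch, assuming it simply tests for the _SUCCESS marker file that Hadoop writes into an output directory when a job completes cleanly:

import luigi.contrib.hdfs.hadoopcli_clients

def check_mr_success(hdfs_dir):
    # assumption: a finished job leaves an empty _SUCCESS marker behind
    hdfs = luigi.contrib.hdfs.hadoopcli_clients.create_hadoopcli_client()
    return hdfs.exists("%s/_SUCCESS" % hdfs_dir)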
Example #3
File: rec.py Project: lqleeqee/avatar
	def run(self):
		hdfs = luigi.contrib.hdfs.hadoopcli_clients.create_hadoopcli_client()
		# keep only the id, history, and rlist columns of the recommendation SFrame
		df = sf.load_sframe(self.input()[0]['rec'].fn)
		delete_cols = [col for col in df.column_names() if col not in ("id", "history", "rlist")]
		df.remove_columns(delete_cols)
		# dump to a local TSV, stage it on HDFS, bulk-load into HBase, then clean up
		df.export_csv(self.local_csv, quote_level=csv.QUOTE_NONE, delimiter="\t", header=False)
		hbase_input_csv = "%s/user.rec.csv" % self.hbase_input_path
		hdfs.mkdir(self.hbase_input_path)
		hdfs.put(self.local_csv, hbase_input_csv)
		os.remove(self.local_csv)
		to_hbase(self.hbase_input_path, self.bin)
		hdfs.remove(self.hbase_input_path)
Example #4
def infer_topic(in_fn, model_fn, out_fn, conf):
	parser = SafeConfigParser()
	parser.read(conf)
	root = parser.get("basic", "root")
	hadoop_stream = parser.get('basic', 'hadoop_stream')
	topic_num = parser.getint('plda+', 'topic_num')
	alpha = 50.0 / topic_num  # common LDA heuristic: alpha = 50 / K
	task_id = uuid.uuid4()
	infer_in_path = "%s/%s" % (parser.get('plda+', 'infer_in_path'), task_id)
	infer_out_path = "%s/%s" % (parser.get('plda+', 'infer_out_path'), task_id)
	infer_burn_in_iter = parser.getint('plda+', 'infer_burn_in_iter')
	infer_total_iter = parser.getint('plda+', 'infer_total_iter')
	infer_reduce_tasks = parser.getint('plda+', 'infer_reduce_tasks')
	infer_reducer_mb = parser.getint('plda+', 'infer_reducer_mb')
	mapper = '%s/plda/infer_mapper' % root
	reducer = '%s/plda/infer_reducer' % root
	reducer_wrapper = '%s/data/temp/reducer_wrapper.sh' % root

	hdfs = luigi.contrib.hdfs.hadoopcli_clients.create_hadoopcli_client()
	hdfs.mkdir(infer_in_path)
	hdfs.put(in_fn, infer_in_path)

	# generate a one-off shell wrapper so the reducer runs with its fixed CLI flags
	with open(reducer_wrapper, 'w') as wrapper_fd:
		print >> wrapper_fd, "#!/bin/bash"
		print >> wrapper_fd, "./infer_reducer --alpha %f --beta 0.01 --model_file ./%s --burn_in_iterations %d --total_iterations %d -sparse true" % \
			(alpha, os.path.basename(model_fn), infer_burn_in_iter, infer_total_iter)
	cmd = '''hadoop jar %s \
	      -D mapred.job.name="mr plda+ infer" \
	      -D mapred.job.map.memory.mb=32 \
	      -D mapred.job.reduce.memory.mb=%d \
	      -D io.compression.codecs=org.apache.hadoop.io.compress.DefaultCodec \
	      -input %s \
	      -output %s \
	      -file %s \
	      -file %s \
	      -file %s \
	      -file %s \
	      -mapper ./infer_mapper \
	      -reducer ./reducer_wrapper.sh \
	      -numReduceTasks %d
	      '''
	cmd = cmd % (hadoop_stream, infer_reducer_mb,
		infer_in_path, infer_out_path,
		model_fn, mapper, reducer, reducer_wrapper,
		infer_reduce_tasks)
	os.system(cmd)
	os.remove(reducer_wrapper)
	# collect the output if the job succeeded; either way, clean up the staging dirs
	succeeded = check_mr_success(infer_out_path)
	if succeeded:
		with open(out_fn, 'w') as out_fd:
			get_mr_dir(infer_out_path, out_fd)
	hdfs.remove(infer_in_path)
	hdfs.remove(infer_out_path)
	if not succeeded:
		raise Exception("failed to infer topic")
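
Design note: os.system above discards the streaming job's exit status, so a failed launch is only caught afterwards by check_mr_success. A stricter wrapper (a suggestion, not part of the original code) would fail fast:

import subprocess

def run_streaming(cmd):
    # hypothetical helper: same shell invocation as os.system(cmd),
    # but raises CalledProcessError if the launch itself fails
    subprocess.check_call(cmd, shell=True)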