def setupjob(job, args):
    """
    Set up a job to run on a date range of directories.

    Jobs expect two arguments, startdate and enddate, both in yyyy-MM-dd format.
    """
    import java.text.SimpleDateFormat as SimpleDateFormat
    import java.util.Date as Date
    import java.util.Calendar as Calendar
    import com.mozilla.util.DateUtil as DateUtil
    import com.mozilla.util.DateIterator as DateIterator
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat as FileInputFormat
    import org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat as MyInputFormat

    if len(args) != 3:
        raise Exception("Usage: <testpilot_study> <startdate-YYYY-MM-DD> <enddate-YYYY-MM-DD>")

    # Used to collect each date in the given range.
    class MyDateIterator(DateIterator):
        def __init__(self):
            self._list = []

        def get(self):
            return self._list

        def see(self, aTime):
            self._list.append(aTime)

    # 'dateformat' and 'pathformat' are module-level constants defined
    # elsewhere in this file.
    sdf = SimpleDateFormat(dateformat)
    study = args[0]
    startdate = Calendar.getInstance()
    startdate.setTime(sdf.parse(args[1]))
    enddate = Calendar.getInstance()
    enddate.setTime(sdf.parse(args[2]))

    dates = MyDateIterator()
    DateUtil.iterateByDay(startdate.getTimeInMillis(), enddate.getTimeInMillis(), dates)

    # Build one input directory path per day in the range.
    paths = []
    for d in dates.get():
        paths.append(pathformat % (study, sdf.format(Date(d))))

    job.setInputFormatClass(MyInputFormat)
    FileInputFormat.setInputPaths(job, ",".join(paths))
    job.getConfiguration().set("org.mozilla.jydoop.mappertype", "TEXT")
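# Illustrative sketch (not part of the job setup): the loop above expands a
# date range into one input directory per day. This plain-Python equivalent
# shows that expansion outside Jython/Hadoop. The path layout, study name,
# and dates below are hypothetical examples only; the real values come from
# the module-level 'pathformat' and 'dateformat' constants and the job args.
def _example_expand_paths():
    from datetime import date, timedelta

    example_pathformat = "/testpilot/%s/%s"  # hypothetical layout
    study = "mystudy"                        # hypothetical study name
    day = date(2012, 1, 1)
    end = date(2012, 1, 3)

    paths = []
    while day <= end:
        # Mirrors: paths.append(pathformat % (study, sdf.format(Date(d))))
        paths.append(example_pathformat % (study, day.strftime("%Y-%m-%d")))
        day += timedelta(days=1)
    return paths  # ['/testpilot/mystudy/2012-01-01', ..., '.../2012-01-03']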
def hdfs_setupjob(job, args):
    """
    Similar to the above, but runs on telemetry data that has already been
    exported to HDFS.

    Jobs expect two arguments, startdate and enddate, both in yyyyMMdd format.
    """
    import java.text.SimpleDateFormat as SimpleDateFormat
    import java.util.Date as Date
    import java.util.Calendar as Calendar
    import java.util.concurrent.TimeUnit as TimeUnit
    import com.mozilla.util.DateUtil as DateUtil
    import com.mozilla.util.DateIterator as DateIterator
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat as FileInputFormat
    import org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat as MyInputFormat

    if len(args) != 2:
        raise Exception("Usage: <startdate-YYYYMMDD> <enddate-YYYYMMDD>")

    # Used to collect each date in the given range.
    class MyDateIterator(DateIterator):
        def __init__(self):
            self._list = []

        def get(self):
            return self._list

        def see(self, aTime):
            self._list.append(aTime)

    # 'dateformat', 'hdfs_dateformat', and 'hdfs_pathformat' are module-level
    # constants defined elsewhere in this file.
    sdf = SimpleDateFormat(dateformat)
    sdf_hdfs = SimpleDateFormat(hdfs_dateformat)
    startdate = Calendar.getInstance()
    startdate.setTime(sdf.parse(args[0]))
    enddate = Calendar.getInstance()
    enddate.setTime(sdf.parse(args[1]))
    nowdate = Calendar.getInstance()

    # HDFS only contains the last two weeks of data (up to yesterday).
    startMillis = startdate.getTimeInMillis()
    endMillis = enddate.getTimeInMillis()
    nowMillis = nowdate.getTimeInMillis()

    startDiff = nowMillis - startMillis
    if TimeUnit.DAYS.convert(startDiff, TimeUnit.MILLISECONDS) > 14:
        raise Exception("HDFS data only includes the past 14 days of history. "
                        "Try again with more recent dates or use the HBase data directly.")

    endDiff = nowMillis - endMillis
    if TimeUnit.DAYS.convert(endDiff, TimeUnit.MILLISECONDS) < 1:
        raise Exception("HDFS data only includes data up to yesterday. "
                        "For (partial) data for today, use the HBase data directly.")

    dates = MyDateIterator()
    DateUtil.iterateByDay(startMillis, endMillis, dates)

    # Build one input directory path per day in the range.
    paths = []
    for d in dates.get():
        paths.append(hdfs_pathformat % (sdf_hdfs.format(Date(d))))

    job.setInputFormatClass(MyInputFormat)
    FileInputFormat.setInputPaths(job, ",".join(paths))
    job.getConfiguration().set("org.mozilla.jydoop.mappertype", "TEXT")
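# Illustrative sketch (not part of the job setup): hdfs_setupjob rejects any
# range outside the 14-day HDFS retention window, which holds data up to and
# including yesterday only. This plain-Python restatement of that check is a
# sketch for clarity; the function and parameter names here are hypothetical.
def _example_check_hdfs_window(start, end, today):
    # 'start', 'end', and 'today' are datetime.date values.
    if (today - start).days > 14:
        raise Exception("start date falls outside the 14-day HDFS window")
    if (today - end).days < 1:
        raise Exception("end date must be yesterday or earlier")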