Example #1
 def __init__(self):
     self.database_name = glb('dbName')
     self.hostname = glb('pgHost')
     self.username = glb('pgUser')
     self.password = glb('pgPasswd')
     self.url_connect = "jdbc:postgresql://" + self.hostname + \
                        ":5432/" + self.database_name
     self.properties = {"user"     : self.username,
                        "password" : self.password,
                        "driver"   : "org.postgresql.Driver"}
 def __init__(self, year, month):
     self.year = year
     self.month = month
     self.weatherBucket = glb('s3WeatherBucket')
     self.taxiBucket = glb('s3TaxiBucket')
     self.ylwTaxiPrefix = glb('ylwTaxiPrefix')
     self.weatherS3Key = []
     self.weatherFile = []
     self.wthrRep = []
     self.getYlwTaxiFilename()
     self.getWeatherFilename(
         ['Central-Park', 'La-Guardia', 'JFK-Intl', 'Newark-Intl'])
 def writeToPostgres(self, prefix):
     # There are two different kinds of schema:
     # prior to 2017 trips use coordinates; from 2017 on they use a location ID
     if 'pULocId' in self.ylwTaxi.columns:
         keepCols = glb('pgKeepCols1')
     else:
         keepCols = glb('pgKeepCols2')
     dropCols = [clm for clm in self.ylwTaxi.columns if clm not in keepCols]
     for clm in dropCols:
         self.ylwTaxi = self.ylwTaxi.drop(clm)
     self.ylwTaxi = self.ylwTaxi.select(keepCols)
     self.pgTableName = prefix + '_' + self.year + '_' + self.month
     connector = postgres.PostgresConnector()
     connector.write(self.ylwTaxi, self.pgTableName, glb('pgWriteMode'))
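
# Note: PostgresConnector.write is not shown in this example. A minimal sketch of
# what it might look like, assuming it wraps Spark's DataFrameWriter.jdbc; the db
# keyword and the URL rebuild are assumptions inferred from how write() is called
# in Example #6 below.
 def write(self, df, tableName, writeMode, db=None):
     # Rebuild the JDBC URL when a different database is requested (assumed behaviour)
     url = self.url_connect
     if db is not None:
         url = "jdbc:postgresql://" + self.hostname + ":5432/" + db
     # Hand the DataFrame to Spark's JDBC writer with the stored credentials
     df.write.jdbc(url=url, table=tableName, mode=writeMode, properties=self.properties)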
Example #4
def parseIsdLine(line):

    weatherFields = glb('weatherFields')
    
    # Initialize all readings to None
    readings = {key: None for key in weatherFields}
    
    # First 4 fields are year, month, day and hour.
    # Convert them into a time-stamp:
    date_hr = []
    fields = line.split()
    for cnt in range(4):
        date_hr.append(fields.pop(0))
            
    date_hr_str = date_hr[0]+'-'+date_hr[1]+'-'+date_hr[2]+' '+date_hr[3]+':00:00'
    timeType    = dtt.getTimeType(date_hr_str)
    timeStmp    = dtt.getTimeStamp(timeType)
    fields.insert(0,timeStmp)
       
    # Load values into readings and remove scaling:
    for cnt, val in enumerate(fields):
        if not isNone(val):
            key_val = getActualReadings(cnt, val)
            readings[weatherFields[key_val[0]]] = key_val[1]
    
    return readings  
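
# Note: isNone and getActualReadings are defined elsewhere in parseISD.py. A minimal
# sketch of what they might do, assuming the ISD-Lite conventions (missing values
# coded as -9999, several fields stored in tenths of their physical unit); the
# scaledFields positions below are an assumption, not the project's actual list.
def isNone(val):
    # ISD-Lite encodes missing observations as -9999
    return int(val) == -9999

def getActualReadings(cnt, val):
    # cnt indexes weatherFields (index 0 is the timestamp inserted above);
    # undo the tenths scaling for the fields that carry it
    scaledFields = {1, 2, 3, 5, 7, 8}  # assumed positions of tenths-scaled fields
    reading = int(val) / 10.0 if cnt in scaledFields else int(val)
    return (cnt, reading)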
def main():
    conf  = SparkConf()
    conf.set('spark.executorEnv.PGHOST'    , os.environ['PGHOST'])
    conf.set('spark.executorEnv.PGUSER'    , os.environ['PGUSER'])
    conf.set('spark.executorEnv.PGPASSWORD', os.environ['PGPASSWORD'])
    spark = SparkSession.builder                 \
                        .appName("batchProcess") \
                        .config(conf=conf)       \
                        .getOrCreate()
    
    spark.sparkContext.addPyFile("postgres.py")
    spark.sparkContext.addPyFile("globalVar.py")
    spark.sparkContext.addPyFile("datetimeTools.py")
    spark.sparkContext.addPyFile("batchProcessing.py")

    sqlc = SQLContext(sparkContext=spark.sparkContext, sparkSession=spark)
    
    # Years and months of interest: n years back from the current year
    nOfYears = glb('nOfPassYears')
    currYear = dt.now().year
    yearList = [str(cnt + currYear - nOfYears + 1) for cnt in range(nOfYears)]
    #yearList = ['2017','2018','2019']
    months   = [str(val + 1).zfill(2) for val in range(12)]
    
    # Create an object for every taxi table
    # and remove the object again if the table does not exist
    ptr = 0
    tableObj = []
    for yr in yearList:
        for mn in months:
            tableObj.append(batchProcess(yr, mn))
            if not tableObj[ptr].hasTable(sqlc):
                del tableObj[ptr]
            else:
                ptr = ptr + 1

    # Start calling methods in batchProcessing.py
    for table in tableObj:
        table.readTable(sqlc)                # Read table
        table.addDatetime()                  # Add year, month, day and hour to table
        table.addMetrics()                   # Add vehicle speed and fare per mile
        table.fixPrecip()                    # Fix precipitation values
        for ind, station in enumerate(glb('noaaStations')):
            table.setStation(ind)
            table.aggByHour()                    # Aggregate data by the hour
            table.writeResults('hourly_yellow_') # Write to DB with prefix 'hourly_yellow_'

    spark.stop()
Example #6
 def writeResults(self, prefix):
     pgTableName = prefix + self.year + '_' + self.month + '_' + 'st' + str(
         self.station)
     conn = postgres.PostgresConnector()
     conn.write(self.hr,
                pgTableName,
                glb('pgWriteMode'),
                db='taxi_aggregates')
 def getWeatherFilename(self, stationNames):
     # Create a list of weather filepaths for every station
     s3Prefix = glb('s3Prefix')
     for stationName in stationNames:
         keyName  = stationName + '/' + getStationID(stationName) + '-' + \
                      self.year + '-' + self.month + '.csv'
         filepath = s3Prefix + self.weatherBucket + "/" + keyName
         self.weatherS3Key.append(keyName)
         self.weatherFile.append(filepath)
 def readData(self, spark):
     # Read both weather reports (all stations) and taxi data
     s3ReadMode = glb('s3ReadMode')
     if self.year == '2016' and self.month in [
             '07', '08', '09', '10', '11', '12'
     ]:
         badSchmType = '1'  # Known bad schema
     else:
         badSchmType = '0'
     fields, dTypes     = \
         gtf.getColNamesAndTypes(self.taxiBucket, self.ylwTaxiS3Key, badSchmType)
     self.ylwTaxiSchema = makeSchema(fields, dTypes)
     self.ylwTaxi = spark.read.csv(self.ylwTaxiFile, header=True, \
                    mode=s3ReadMode, schema=self.ylwTaxiSchema)
     self.weatherSchema = makeSchema(glb('weatherFields'),
                                     glb('weatherDataType'))
     for wFile in self.weatherFile:
         self.wthrRep.append(spark.read.csv(wFile, header=True, \
                             mode=s3ReadMode, schema=self.weatherSchema))
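
# Note: makeSchema is not shown in these examples. A minimal sketch, assuming the
# column names and Spark type labels arrive as parallel lists of strings; the
# typeMap keys below are assumptions about the labels used in globalVar.py.
from pyspark.sql.types import StructType, StructField, \
    StringType, IntegerType, DoubleType, TimestampType

def makeSchema(fields, dTypes):
    # Build a Spark schema from parallel lists of column names and type labels
    typeMap = {'string': StringType(), 'int': IntegerType(),
               'double': DoubleType(), 'timestamp': TimestampType()}
    return StructType([StructField(name, typeMap[dtype], True)
                       for name, dtype in zip(fields, dTypes)])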
def noaaGzipToCsv(filename):
    # Translate the NOAA ISD-Lite format to CSV
    readings = []
    with gzip.open(filename, "rt") as text:  # text mode so each line is a str
        for line in text:
            readings.append(parseISD.parseIsdLine(line))  # See parseISD.py

    outFName = filename.replace('.gz', '.csv')
    with open(outFName, 'w') as outFile:
        writer = csv.DictWriter(outFile, fieldnames=glb('weatherFields'))
        writer.writeheader()
        for row in readings:
            writer.writerow(row)
    return outFName
def fixBadNoaaData():
    # Central-Park readings for APR-AUG 2012 are bad; use La-Guardia data instead
    print("Central-Park 2012 APR-AUG is bad; substituting La-Guardia data.")
    weatherBucket = glb('s3WeatherBucket')
    ovr = True  # Overwrite
    for month in ['04', '05', '06', '07', '08']:
        S3Tools.copy_among_buckets(weatherBucket,
                                   'La-Guardia/725030-14732-2012-' + month + '.csv',
                                   weatherBucket,
                                   'Central-Park/725053-94728-2012-' + month + '.csv',
                                   ovr)
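
# Note: S3Tools.copy_among_buckets is defined elsewhere. A minimal sketch of the
# idea with boto3; the exact signature and the overwrite handling are assumptions.
import boto3

def copy_among_buckets(srcBucket, srcKey, dstBucket, dstKey, overwrite=False):
    # Server-side copy of a single object between S3 buckets
    s3 = boto3.client('s3')
    if not overwrite:
        # Skip the copy if the destination object already exists (assumed behaviour)
        existing = s3.list_objects_v2(Bucket=dstBucket, Prefix=dstKey)
        if existing.get('KeyCount', 0) > 0:
            return
    s3.copy_object(CopySource={'Bucket': srcBucket, 'Key': srcKey},
                   Bucket=dstBucket, Key=dstKey)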
def splitIntoMonth(csvFiles, years):
    # Partition into months
    # Need to read the years before and after to account for
    # bad taxi data (some entries appear a couple of days late)
    outFiles = []
    for ind, csvFile in enumerate(csvFiles):
        curYearRec = []
        with open(csvFile) as curFile:
            dfCur = csv.DictReader(curFile)
            for row in dfCur:
                curYearRec.append(row)
        if ind != 0:  # Previous year data
            ptr = 0
            with open(csvFiles[ind - 1]) as preFile:
                dfPre = csv.DictReader(preFile)
                for row in dfPre:
                    curYearRec.insert(ptr, row)
                    ptr = ptr + 1
        if ind != len(csvFiles) - 1:  # Following year data
            with open(csvFiles[ind + 1]) as posFile:
                dfPost = csv.DictReader(posFile)
                for row in dfPost:
                    curYearRec.append(row)

        # Create a time stamp for each beginning of month
        months = [str(val + 1).zfill(2) for val in range(12)]
        monStr = [str(years[ind]) + '-' + mn + '-01 00:00:00' for mn in months]
        monTS = dtt.strArrayToTimeStamps(monStr)
        monTS.append(monTS[-1] + 31 * 24 * 3600)

        for cnt, month in enumerate(months):
            indices = [idx for idx, val in enumerate(curYearRec) \
                if int(val['timeStamp']) >= monTS[cnt] - 48*3600 and \
                   int(val['timeStamp']) <= monTS[cnt + 1] + 49*3600]
            # 48 hr overlap; 49 on the upper bound because of how the slice below works

            if indices:
                monthRec = curYearRec[indices[0]:indices[-1]]
                outFName = csvFile.replace('.csv', '-' + month + '.csv')
                with open(outFName, 'w') as outFile:
                    writer = csv.DictWriter(outFile,
                                            fieldnames=glb('weatherFields'))
                    writer.writeheader()
                    for row in monthRec:
                        writer.writerow(row)
                outFiles.append(outFName)

    return outFiles
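
# Worked example of the windowing above: for January, monTS[cnt] is Jan 1 00:00 and
# monTS[cnt + 1] is Feb 1 00:00, so records from 48 hours before the month starts
# through 49 hours after it ends are kept, which lets late-arriving taxi entries
# still find a matching weather row in the monthly file.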
def main():
    # One spark session to join them all
    conf = SparkConf()
    conf.set('spark.executorEnv.PGHOST', os.environ['PGHOST'])
    conf.set('spark.executorEnv.PGUSER', os.environ['PGUSER'])
    conf.set('spark.executorEnv.PGPASSWORD', os.environ['PGPASSWORD'])
    spark = SparkSession.builder             \
                        .appName("timeJoin") \
                        .config(conf=conf)   \
                        .getOrCreate()

    spark.sparkContext.addPyFile("postgres.py")
    spark.sparkContext.addPyFile("globalVar.py")
    spark.sparkContext.addPyFile("getTaxiFields.py")
    spark.sparkContext.addPyFile("datetimeTools.py")
    spark.sparkContext.addPyFile("appendWeatherData.py")
    spark.sparkContext.addPyFile("dataProcessing.py")

    # Years and months of interest: n years back from the current year
    nOfYears = glb('nOfPassYears')
    currYear = datetime.now().year
    yearList = [str(cnt + currYear - nOfYears + 1) for cnt in range(nOfYears)]
    months = [str(val + 1).zfill(2) for val in range(12)]

    # Create an object for every taxi data file
    # and remove the object again if the file does not exist
    ptr = 0
    dataObj = []
    for yr in yearList:
        for mn in months:
            dataObj.append(dataProcess(yr, mn))
            if not dataObj[ptr].hasData():
                del dataObj[ptr]
            else:
                ptr = ptr + 1

    # Start calling methods in dataProcessing.py
    for dProp in dataObj:
        dProp.readData(spark)  # Read data
        dProp.addTimestamp()  # Convert string to timestamp
        dProp.addWthrStationID()  # Add weather station ID
        dProp.joinTables(spark)  # Main join process
        dProp.writeToPostgres('yellow')  # Write to DB with prefix 'yellow'
        #dProp.printCheck()

    spark.stop()
def main():
    # These variables are sourced from globalVar.py
    noaaFtpDomain = glb('noaaFtpDomain')
    noaaFtpPath = glb('noaaFtpPath')
    noaaStations = glb('noaaStations')
    noaaLogin = glb('noaaLogin')
    noaaPassword = glb('noaaPassword')
    nOfYears = glb('nOfPassYears')
    weatherBucket = glb('s3WeatherBucket')

    # Back up the current weather data
    print('Moving current data to back-up bucket.')
    backupBucket = weatherBucket + '-bak'
    S3Tools.duplicateBucket(origBucket=weatherBucket, newBucket=backupBucket)

    # Years of interest: n-years back from current year
    currYear = datetime.now().year
    yearList = [cnt + currYear - nOfYears + 1 for cnt in range(nOfYears)]

    # Loop over a few stations of interest
    for station in noaaStations:
        csvFiles = []
        for yrInd, year in enumerate(yearList):
            pathname = noaaFtpPath + str(year)  # According to the NOAA directory structure
            filename = station + '-' + str(year) + '.gz'
            print("Processing: %s" % filename)
            downloadFromFtp(noaaFtpDomain, pathname, noaaLogin, noaaPassword,
                            [filename])
            csvFiles.append(noaaGzipToCsv(filename))
            cleanupLocal([filename])  # Remove *.gz files

        # Partition by month
        print("Partitioning by month...")
        outFiles = splitIntoMonth(csvFiles, yearList)
        uploadToS3(outFiles, weatherBucket, noaaStations.get(station))
        cleanupLocal(csvFiles)
        cleanupLocal(outFiles)

    fixBadNoaaData()
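
# Note: downloadFromFtp is a small helper defined elsewhere. A minimal sketch of
# what it might look like with ftplib; the signature is inferred from the call
# above and the details are assumptions.
from ftplib import FTP

def downloadFromFtp(domain, pathname, login, password, filenames):
    # Fetch each requested file from the NOAA FTP server into the working directory
    ftp = FTP(domain)
    ftp.login(user=login, passwd=password)
    ftp.cwd(pathname)
    for fname in filenames:
        with open(fname, 'wb') as fh:
            ftp.retrbinary('RETR ' + fname, fh.write)
    ftp.quit()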
 def getYlwTaxiFilename(self):
     # Define taxi data filepath
     s3Prefix = glb('s3Prefix')
     self.ylwTaxiS3Key = self.ylwTaxiPrefix + self.year + '-' + self.month + '.csv'
     self.ylwTaxiFile = s3Prefix + self.taxiBucket + "/" + self.ylwTaxiS3Key
def getStationID(stationName):
    # Reverse lookup: noaaStations maps station ID -> station name
    noaaStations = glb('noaaStations')
    return list(noaaStations.keys())[list(noaaStations.values()).index(stationName)]
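
# For reference, glb('noaaStations') appears to map ISD station IDs to the names
# used as S3 prefixes (inferred from fixBadNoaaData above); shown here only as an
# illustration with the two stations whose IDs appear in these examples:
# noaaStations = {'725053-94728': 'Central-Park', '725030-14732': 'La-Guardia'}
# getStationID('Central-Park') would then return '725053-94728'.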