Exemplo n.º 1
0
 def GetNewFiles(self, lastModifiedDatetime):
     '''
     Get the list of new files on S3.

     lastModifiedDatetime: datetime of the previous load, or None.
         When it is not None, S3Utilities.GetFilesSinceGivenDatetime is
         asked for the files newer than it; when it is None,
         S3Utilities.GetFilesNModifiedDatetimeFromS3 is used instead.

     Returns a tuple (files, maxModifiedDatetime):
         files - the "fileName" of every entry returned by S3Utilities
         maxModifiedDatetime - the greatest "datetime" seen among those
             entries; equals lastModifiedDatetime when nothing newer was
             found (so it is None when there is no previous load and no
             files were returned)
     '''
     bucketName = self.job["bucketName"]
     s3SrcDirectory = self.job["s3SrcDirectory"]

     if lastModifiedDatetime is not None:
         newFiles = S3Utilities.GetFilesSinceGivenDatetime(
             bucketName, s3SrcDirectory, lastModifiedDatetime)
     else:
         newFiles = S3Utilities.GetFilesNModifiedDatetimeFromS3(
             bucketName, s3SrcDirectory)

     files = []
     maxModifiedDatetime = lastModifiedDatetime
     for newFile in newFiles:
         dt = DatetimeUtilities.ConvertToDT(newFile["datetime"])
         if maxModifiedDatetime is None or dt > maxModifiedDatetime:
             maxModifiedDatetime = dt
         # Every returned file is kept. Appending only when the file raises
         # the running maximum would drop files that are listed after a
         # newer one, or that share a timestamp with an earlier entry.
         files.append(newFile["fileName"])
     return (files, maxModifiedDatetime)
Exemplo n.º 2
0
    def GetFilesSinceGivenDatetime(s3Bucket, s3Path, lastModifiedDatetime):
        '''
        Returns the list of files on S3 whose modified datetime is greater than
        the parameter lastModifiedDatetime.

        s3Bucket to be passed in the format (string): "ihs-bda-data"
        s3Path to be passed in the format (string): "/projects/Pgcr_WindDashboard/ERCOT/"
        lastModifiedDatetime must be comparable with the value returned by
        DatetimeUtilities.ConvertToUTC.

        Returns a list of dicts, one per matching file:
            {"fileName": <key as printed by "aws s3 ls">, "datetime": <str of the converted datetime>}

        Raises OSError if the "aws" executable cannot be started.
        '''
        import subprocess

        # Normalise the prefix so that it always starts and ends with "/".
        s3Path = s3Path.strip()
        if not s3Path.startswith("/"):
            s3Path = "/" + s3Path
        if not s3Path.endswith("/"):
            s3Path = s3Path + "/"

        # Argument list instead of a shell string: a bucket or path that
        # contains spaces or shell metacharacters is passed through verbatim.
        cmd = ["aws", "s3", "ls", "s3://" + s3Bucket + s3Path, "--recursive"]
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                universal_newlines=True)
        output, _ = proc.communicate()

        modifiedFiles = []
        for line in output.split("\n"):
            line = line.strip()
            if not line:
                continue  # remove blank lines
            # Each listing row is "<date> <time> <size> <key>"; limiting the
            # split to three cuts keeps keys that contain spaces intact.
            parts = line.split(None, 3)
            if len(parts) < 4:
                continue  # not a file row
            dt = DatetimeUtilities.ConvertToUTC(parts[0] + " " + parts[1])
            if dt > lastModifiedDatetime:
                modifiedFiles.append({
                    "fileName": parts[3],
                    "datetime": str(dt),
                })
        return modifiedFiles
Exemplo n.º 3
0
 def GetLastModifiedDatetime(self, filelocs):
     '''
     Look up the "lastModifiedDatetime" recorded by the last good run.

     Registers a new run id for this module in self.currProcId, then reads
     the last good run from the ETL table named by filelocs["tblEtl"]["table"].
     The run's "params" field is expected to hold json such as
     {"lastModifiedDatetime": "2017-06-07 19:51:06"}.

     Returns DatetimeUtilities.ConvertToUTC applied to that value, or to None
     when there is no last good run, no params, or no such key.
     '''
     etlTable = filelocs["tblEtl"]["table"]
     self.currProcId = self.etlUtilities.GetRunID(etlTable, self.moduleName)
     previousRun = self.etlUtilities.GetLastGoodRun(etlTable, self.moduleName)

     runParams = {}
     if previousRun is not None:
         rawParams = previousRun.get("params")
         if rawParams is not None:
             runParams = json.loads(rawParams)

     # dict.get yields None both for a missing key and for a stored null
     storedValue = runParams.get("lastModifiedDatetime")
     return DatetimeUtilities.ConvertToUTC(storedValue)
Exemplo n.º 4
0
 def Start(self, logger, moduleName, filelocs):
     '''
     Entry point of the routine.

     Runs the base-class start, prepares the packed folder, processes the
     files modified since the last recorded datetime, uploads and loads the
     result, records the new maximum modified datetime, empties the packed
     folder and finally runs the post-load step.

     Any exception is logged through logger and re-raised.
     '''
     try:
         ApplicationBase.Start(self, logger, moduleName, filelocs)

         # working area for the packed output
         self.packedFolder = self.localTempDirectory + "/packed/"
         self.CreateFolders()

         previousWatermark = self.GetLastModifiedDatetime(filelocs)
         newWatermark = self.ProcessFiles(previousWatermark)

         self.UploadPackedToS3()
         self.LoadErcotTables()

         # the new datetime is only recorded after upload and load succeeded
         self.SetLastModifiedDatetime(
             filelocs, DatetimeUtilities.ConvertToSTR(newWatermark))

         self.EmptyPackedFolder()
         self.PostLoadETL()
     except Exception:
         logger.exception(moduleName + " - Exception!")
         raise
Exemplo n.º 5
0
 def FillDatesArray(self, xl, sh, rndx):
     '''
     Take a row and filter the values so that we only keep the dates, each
     paired with its position in the row.

     xl: helper exposing ExcelFillRowToArray(sh, rndx)
     sh, rndx: passed straight through to xl.ExcelFillRowToArray
         (presumably the sheet and the row index - confirm against xl)

     Returns a list of self.CellDataClass objects, one per value for which
     DatetimeUtilities.IsDate is true, with .val set to the value and .ndx
     set to its index within the array returned by ExcelFillRowToArray.

     Any exception is logged through self.logger and re-raised.
     '''
     retValArray = []
     try:
         datesArray = xl.ExcelFillRowToArray(sh, rndx)
         for cvNdx, cv in enumerate(datesArray):
             if DatetimeUtilities.IsDate(cv):
                 dObj = self.CellDataClass()
                 dObj.val = cv
                 dObj.ndx = cvNdx
                 retValArray.append(dObj)
     except Exception:
         # Exception rather than a bare except, so KeyboardInterrupt and
         # SystemExit pass through without being logged as errors.
         self.logger.exception(self.moduleName +
                               " - we had an error in : FillDatesArray ")
         raise
     return retValArray