Example #1
    def CreateMigrationScript(self):
        '''
        takes the template for the migration SQL script and customizes it
        '''
        sqlMigrateScript = None
        try:
            ###
            #  make sure that we have a place to put the sql script
            ###
            sqlScript = 'PopulateHistoryTemplate.sql'
            self.logger.debug(self.moduleName + " -- " +
                              "CreateMigrationScript" + " starting ")
            sqlMigrateTemplate = self.location + '/sql/' + sqlScript
            sqlMigrateScript = self.commonParams["sqlFolder"] + re.sub(
                'Template.sql$', '.sql', sqlScript)
            FileUtilities.CreateFolder(self.commonParams["sqlFolder"])
            FileUtilities.RemoveFileIfItExists(sqlMigrateScript)
            ###
            #  gather variables needed
            ###
            schemaName = None
            attrSrc = None
            dataSrc = None
            attrDest = None
            dataDest = None
            orderByFields = None
            partByFields = None

            for table in self.commonParams["cat"]["tables"]:
                schemaName = table["schemaName"]
                if table["type"] == "attributes":
                    attrSrc = table["srctable"]
                    attrDest = table["table"]
                    if "partition" in table:
                        orderByFields = table["partition"]["order"]
                        partByFields = table["partition"]["over"]
                elif table["type"] == "series":
                    dataSrc = table["srctable"]
                    dataDest = table["table"]

            with open(sqlMigrateTemplate) as infile, open(
                    sqlMigrateScript, 'w') as outfile:
                for line in infile:
                    line = line.replace('{schemaName}', schemaName)
                    line = line.replace('{attrSrc}', attrSrc)
                    line = line.replace('{dataSrc}', dataSrc)
                    line = line.replace('{attrDest}', attrDest)
                    line = line.replace('{dataDest}', dataDest)
                    # the partition fields are optional; str.replace would fail
                    # on None, so only substitute them when they were supplied
                    if orderByFields is not None:
                        line = line.replace('{orderByFields}', orderByFields)
                    if partByFields is not None:
                        line = line.replace('{partByFields}', partByFields)
                    outfile.write(line)
            self.logger.debug(self.moduleName + " -- " +
                              "CreateMigrationScript" + " finished ")
        except Exception:
            self.logger.exception(
                self.moduleName +
                " - we had an error in CreateMigrationScript")
            raise
        return sqlMigrateScript
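Note: the method above is a line-by-line template-substitution pattern. The sketch below restates it as a standalone helper; render_sql_template and the usage values are illustrative, not from the original codebase:

def render_sql_template(templatePath, outputPath, replacements):
    '''Copy templatePath to outputPath, filling {token} placeholders.'''
    with open(templatePath) as infile, open(outputPath, 'w') as outfile:
        for line in infile:
            for token, value in replacements.items():
                line = line.replace('{' + token + '}', value)
            outfile.write(line)

# Hypothetical usage:
# render_sql_template('sql/PopulateHistoryTemplate.sql', 'sql/PopulateHistory.sql',
#                     {'schemaName': 'eaa_dev', 'attrSrc': 'GEForcast_giif_attributes_working'})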
Example #2
 def CreateSQLFiles(self, proc, dest):
     '''
     Routine to create the SQL files used to create tables in Redshift
     '''
     try:
         for table in proc["tables"]:
             fname = self.commonParams["sqlFolder"] + "Create_" + table[
                 "name"] + ".sql"
             self.logger.info(fname)
             with open(fname, "w") as outfile:
                 outLine = "DROP TABLE IF EXISTS {}.{};".format(
                     dest, table["name"])
                 outLine = FileUtilities.PutLine(outLine, outfile)
                 outLine = "CREATE TABLE {}.{} (".format(dest, table["name"])
                 outLine = FileUtilities.PutLine(outLine, outfile)
                 ndx = 0
                 for fld in table["fields"]:
                     # comma-prefix every column after the first
                     outLine = ',' if ndx > 0 else ''
                     ndx = ndx + 1
                     outLine = outLine + fld["name"] + " " + fld["type"]
                     if fld["type"] == "VARCHAR":
                         outLine = outLine + "(" + fld["size"] + ")  ENCODE LZO"
                     outLine = FileUtilities.PutLine(outLine, outfile)
                 outfile.write(");")
     except Exception:
         self.logger.exception(self.moduleName +
                               "- we had an error in CreateSQLFiles ")
         raise
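Note: a minimal sketch of the DDL text this routine emits, given a table config shaped like the cat nugget in Example #30; build_create_table and the sample values are illustrative, not from the source:

def build_create_table(schema, table):
    '''Return the DROP/CREATE statements CreateSQLFiles would write.'''
    lines = ["DROP TABLE IF EXISTS {}.{};".format(schema, table["name"]),
             "CREATE TABLE {}.{} (".format(schema, table["name"])]
    cols = []
    for fld in table["fields"]:
        col = fld["name"] + " " + fld["type"]
        if fld["type"] == "VARCHAR":
            col += "(" + fld["size"] + ")  ENCODE LZO"
        cols.append(col)
    lines.append("\n,".join(cols))
    lines.append(");")
    return "\n".join(lines)

# print(build_create_table("eaa_dev", {"name": "demo", "fields": [
#     {"name": "object_id", "type": "VARCHAR", "size": "30"},
#     {"name": "value", "type": "FLOAT8"}]}))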
Example #3
 def ProcessFiles(self, lastModifiedDatetime):
     '''
     Start processing the ERCOT files
     '''
     maxModifiedDatetime = None
     try:
         filesOnS3, maxModifiedDatetime = self.GetNewFiles(lastModifiedDatetime)
         filesOnS3 = [fl.replace(self.job["s3SrcDirectory"][1:], "") for fl in filesOnS3]
         for fileConfig in self.job["files"]:
             zipFiles = list(filter(re.compile(fileConfig["FileRegex"]).match, filesOnS3))
             for zipFileName in zipFiles:  # eg DAM_Hr_LMP_2011.zip
                 self.DownloadFile(self.job["s3SrcDirectory"][1:] + zipFileName, self.localTempDirectory + "/raw/")
                 self.fileUtilities.UnzipUsing7z(self.localTempDirectory + "/raw/" + zipFileName, self.localTempDirectory + "/output/")
                 level2Files = self.fileUtilities.ScanFolder(self.localTempDirectory + "/output/")
                 level2Files = [l2File for l2File in level2Files if l2File.lower().endswith("_csv.zip")]  # exclude all non-csv files
                 for l2File in level2Files:  # eg cdr.00012328.0000000000000000.20110101.131852.DAMHRLMPNP4183_csv.zip
                     l2zip = self.localTempDirectory + "/output/" + l2File
                     self.fileUtilities.UnzipUsing7z(l2zip, self.localTempDirectory + "/csvs/")
                     FileUtilities.RemoveFileIfItExists(l2zip)  # delete the file after unzipping
                 FileUtilities.RemoveFileIfItExists(self.localTempDirectory + "/raw/" + zipFileName)  # delete the parent file
                 self.AddColumnSkipHeader(self.localTempDirectory + "/csvs/")  # Add column DSTFlag if it doesn't exist and skip header
                 self.PackFiles(self.localTempDirectory + "/csvs/", self.localTempDirectory + "/packed/")
     except Exception:
         self.logger.exception("Error while processing ERCOT files")
         raise
     return maxModifiedDatetime
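Note: UnzipUsing7z shells out to 7-Zip; where the archives are ordinary zips, the same two-level unpack can be sketched with the standard library alone. The helper name and paths are hypothetical:

import os
import zipfile

def unpack_nested_zip(outerZipPath, outputDir, csvDir):
    '''Extract an outer zip, then extract and delete every *_csv.zip inside it.'''
    with zipfile.ZipFile(outerZipPath) as outer:
        outer.extractall(outputDir)
    for name in os.listdir(outputDir):
        if name.lower().endswith("_csv.zip"):
            innerPath = os.path.join(outputDir, name)
            with zipfile.ZipFile(innerPath) as inner:
                inner.extractall(csvDir)
            os.remove(innerPath)  # delete the inner archive after unzipping, as ProcessFiles does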
Example #4
    def __init__(self):
        '''
        Initial settings
        '''
        super(VantageAthenaSpark, self).__init__()

        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))
Example #5
    def CheckWorkingFolders(self):
        '''
        Remove the working folders if they exist, then re-create them
        '''
        self.logger.info(self.moduleName + " - Checking on working folders...")

        FileUtilities.RemoveFolder(self.rawFolder)
        FileUtilities.RemoveFolder(self.csvFolder)
        FileUtilities.CreateFolder(self.rawFolder)
        FileUtilities.CreateFolder(self.csvFolder)
Example #6
    def __init__(self):
        '''
        Initial settings
        '''
        super(AutoInsight, self).__init__()

        self.awsParams = ""
        self.processingFile = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(os.path.dirname(os.path.abspath(__file__)))
Example #7
    def __init__(self):
        '''
        Initial settings
        '''
        super(ECRConnectAthenaSpark, self).__init__()

        self.jDataPulled = False
        self.xRefPulled = False
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))
Example #8
    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process execution.
        '''
        self.logger.info("Cleaning local working folders...")

        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)

        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)
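Note: the FileUtilities folder helpers used throughout these examples are not shown anywhere in the listing. Assuming they wrap shutil and os, they behave roughly like this sketch (not the actual implementation):

import os
import shutil

def RemoveFolder(path):
    '''Delete a folder tree; do nothing if the folder is missing.'''
    shutil.rmtree(path, ignore_errors=True)

def CreateFolder(path):
    '''Create a folder, including parents, if it does not already exist.'''
    if not os.path.exists(path):
        os.makedirs(path)

def EmptyFolderContents(path):
    '''Remove and re-create a folder so it exists and is empty.'''
    RemoveFolder(path)
    CreateFolder(path)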
Example #9
    def __init__(self):
        '''
        Initial settings
        '''
        super(ECRConnect, self).__init__()

        self.awsParams = ""
        self.csvFile = None
        self.csvFileHistory = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))
Example #10
    def __init__(self):
        '''
        Initial settings
        '''
        super(Vantage, self).__init__()

        self.awsParams = ""
        self.packedFolder = None
        self.rawFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))
Example #11
 def CreateFolders(self):
     '''
     Creates folder if it doesn't exist
     If it already exists, empties the folder contents
     '''
     FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                       self.job["downloadPath"])
     FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                       "/cleaned/")
     FileUtilities.EmptyFolderContents(self.localTempDirectory + "/packed/")
     for fp in self.job["foxpro_files"]:
         FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                           "/packed/" + fp["Name"] + "/")
Example #12
    def __init__(self):
        '''
        Initial settings
        '''
        super(PGCRAirMarketsAthenaSpark, self).__init__()

        self.awsParams = ""
        self.tempFolder = None
        self.packedFolder = None
        self.rawDataFolder = None
        self.toPackFiles = []

        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(os.path.dirname(os.path.abspath(__file__)))
Example #13
 def CreateFolders(self):
     '''
     Creates the working folders, emptying any existing contents
     '''
     FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                       self.job["folderPath"]["raw"])
     FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                       self.job["folderPath"]["ident"])
     FileUtilities.EmptyFolderContents(
         self.localTempDirectory + self.job["folderPath"]["transactions"])
     FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                       self.job["folderPath"]["contracts"])
     FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                       self.job["folderPath"]["indexPub"])
Example #14
 def __init__(self):
     '''
     Initial settings
     '''
     super(HistoricalBrentAthenaSpark, self).__init__()
     self.location = FileUtilities.PathToForwardSlash(
         os.path.dirname(os.path.abspath(__file__)))
Example #15
 def CleanFiles(self, iso):
     '''
     Generic cleaning wrapper
     '''
     self.logger.info("Cleaning data")
     ignoreLines = None
     columnCount = None
     inputPath = self.localTempDirectory + "/" + iso["Name"] + "/"
     outputPath = self.localTempDirectory + "/" + iso["Name"] + "/cleaned/"
     FileUtilities.CreateFolder(
         outputPath)  # create the cleaned folder if it doesn't already exist
     if iso.get("IgnoreLines") is not None:
         ignoreLines = iso.get("IgnoreLines")
     if iso.get("column_count") is not None:
         columnCount = iso.get("column_count")
     listOfFiles = self.fileUtilities.GetListOfFiles(
         inputPath, self.job["input_file_type"])  # get all the CSV files
     self.logger.info("Files found: {}".format(str(len(listOfFiles))))
     for fp in listOfFiles:
         try:
             self.fileUtilities.CleanFile(inputPath + fp,
                                          outputPath + fp,
                                          IgnoreLines=ignoreLines,
                                          ColumnCount=columnCount,
                                          Delimiter=self.job["delimiter"])
         except Exception as ex:
             self.logger.exception(
                 "Error while cleaning the MISO file: {}".format(fp))
             self.logger.exception("{}".format(str(ex)))
             raise
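Note: FileUtilities.CleanFile is not shown; a plausible minimal reading of its IgnoreLines, ColumnCount and Delimiter parameters, offered as a sketch rather than the real implementation:

def clean_file(inputPath, outputPath, IgnoreLines=None, ColumnCount=None, Delimiter=","):
    '''Copy a delimited file, skipping the first IgnoreLines lines and
    dropping rows whose field count differs from ColumnCount.'''
    with open(inputPath) as infile, open(outputPath, 'w') as outfile:
        for lineNumber, line in enumerate(infile, start=1):
            if IgnoreLines is not None and lineNumber <= IgnoreLines:
                continue
            fields = line.rstrip("\n").split(Delimiter)
            if ColumnCount is not None and len(fields) != ColumnCount:
                continue
            outfile.write(line)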
Example #16
 def __init__(self, logger):
     '''
     Constructor
     '''
     self.logger = logger
     self.location = FileUtilities.PathToForwardSlash(
         os.path.dirname(os.path.abspath(__file__)))
Example #17
 def __init__(self):
     '''
     Initial settings
     '''
     super(AthenaSparkIncrementalExample, self).__init__()
     self.location = FileUtilities.PathToForwardSlash(
         os.path.dirname(os.path.abspath(__file__)))
Example #18
    def CreatePullScript(self):
        '''
        takes the template for the pull script and customizes it for the data we need
        '''
        sqlPullDataScript = None
        try:
            self.logger.debug(self.moduleName + " -- " + "CreatePullScript" +
                              " starting ")
            sqlPullDataTemplate = self.location + '/sql/' + self.job[
                "sqlPullDataScriptTemplate"]
            sqlPullDataScript = self.localTempDirectory + "/sql/" + re.sub(
                'Template.sql$', '.sql', self.job["sqlPullDataScriptTemplate"])
            FileUtilities.RemoveFileIfItExists(sqlPullDataScript)

            with open(sqlPullDataTemplate) as infile, open(
                    sqlPullDataScript, 'w') as outfile:
                for line in infile:
                    outfile.write(line)
            self.logger.debug(self.moduleName + " -- " + "CreatePullScript" +
                              " finished ")
        except Exception:
            self.logger.exception(self.moduleName +
                                  " - we had an error in CreatePullScript")
            raise
        return sqlPullDataScript
Example #19
 def __init__(self):
     '''
     Constructor
     '''
     super(MagellanODATAAthenaSpark, self).__init__()
     self.location = FileUtilities.PathToForwardSlash(
         os.path.dirname(os.path.abspath(__file__)))
Example #20
 def __init__(self):
     '''
     Initial settings
     '''
     super(FHWA, self).__init__()
     self.location = FileUtilities.PathToForwardSlash(
         os.path.dirname(os.path.abspath(__file__)))
Example #21
 def __init__(self):
     '''
     Constructor for this class
     '''
     self.location = FileUtilities.PathToForwardSlash(
         os.path.dirname(os.path.abspath(__file__)))
     self.awsParams = ""
Example #22
 def CreateUpdateScript(self, pSchema, pTable):
     '''
     takes the template for the update SQL script and customizes it
     '''
     sqlUpdateScript = None
     try:
         self.logger.debug(self.moduleName + " -- " +
                           "UpDate Table Script" + " starting ")
         sqlUpdateTemplate = self.location + '/' + self.job[
             "sqlUpdateScript"]
         sqlUpdateScript = self.localTempDirectory + "/" + re.sub(
             'Template.sql$', '.sql', self.job["sqlUpdateScript"])
         FileUtilities.RemoveFileIfItExists(sqlUpdateScript)
         with open(sqlUpdateTemplate) as infile, open(sqlUpdateScript,
                                                      'w') as outfile:
             for line in infile:
                 line = line.replace('{schemaName}',
                                     self.job["destinationSchema"])
                 line = line.replace('{tbname}', 'Totem')
                 line = line.replace('{tbtotem}', self.sourceTableName)
                 line = line.replace('{tbstats}', pSchema + "." + pTable)
                 line = line.replace('{procid}', str(self.currProcId))
                 outfile.write(line)
         self.logger.debug(self.moduleName + " -- " +
                           "UpDate Table Script" + " finished ")
     except:
         self.logger.exception(self.moduleName +
                               " - we had an error in UpDate Table Script")
         raise
     return sqlUpdateScript
Example #23
def ProcessApps(logger, processArray, folderlocs):
    '''
    ProcessApps processes all the applications that are turned on.
    '''
    FileUtilities.CreateFolder(folderlocs["relativeOutputfolder"])
    try:
        ab = ApplicationBase()
        ev = ab.LoadEnvironmentVariables(logger)
        if "tblEtl" in folderlocs:
            etlUtilities = EtlLoggingUtilities(logger)
            etlUtilities.awsParams = ev.awsParams
            etlUtilities.appschema = folderlocs["tblEtl"]["appschema"]
            etlUtilities.etlSchema = folderlocs["tblEtl"]["schemaName"]

        for proc in processArray:
            module = proc["module"]
            baseName = module.rsplit('.', 1)[1]

            logger.info(baseName + " - Starting module.")
            moduleName = importlib.import_module(module)
            className = getattr(moduleName, baseName)()
            className.Start(logger, baseName, folderlocs) # For single threading

            if "tblEtl" in folderlocs:
                procid = etlUtilities.GetRunID(folderlocs["tblEtl"]["table"], str(baseName))
                if procid > -1:
                    etlUtilities.CompleteInstance(folderlocs["tblEtl"]["table"], procid, 'C')

    except:
        logger.exception("Exception processing application modules!")
        raise
    logger.info(baseName + " - module COMPLETED.")
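Note: the dynamic-loading step above (module string to class instance) can be isolated into a small sketch; the example module path below is hypothetical:

import importlib

def instantiate_module_class(modulePath):
    '''Import the module at modulePath and instantiate the class that
    shares its name with the last path segment, as ProcessApps does.'''
    baseName = modulePath.rsplit('.', 1)[1]
    module = importlib.import_module(modulePath)
    return getattr(module, baseName)()

# app = instantiate_module_class("Applications.Ercot.Ercot")  # hypothetical path
# app.Start(logger, "Ercot", folderlocs)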
Example #24
 def __init__(self):
     '''
     Initial settings
     '''
     super(AutoLightVehiclesAthena, self).__init__()
     self.location = FileUtilities.PathToForwardSlash(
         os.path.dirname(os.path.abspath(__file__)))
Example #25
    def __init__(self):
        '''
        Constructor
@summary: Explanation of the SQL
        The primary table is PeriodValues; all the other tables are used as supporting tables:
        Product --> to get the definition of the product if needed
        Location --> to obtain the description of the location. Although I am against hardcoding, because of the layout of the data:
                1)  a LocationTypeID of 2 is regional and that data could be double dipped, so we are omitting those rows
                2)  the exceptions to the rule are USR and DDR, which also have the chance to be double dipped
        Category -->  to obtain the description of the category ID
                1)  We are only interested in specific categories; based on our Chemical expert they are:
                    a.  10  --> Production
                    b.  12  --> Total Supply
                    c.  15  --> Total Demand
                    d.  19  --> Domestic Demand
        SubCategory -->  although we are not using this table, it is important to note it, since we are only looking at the summary level
                         where the value = 0 rather than at each broken-down level from the PeriodValues table

        One last comment: we are storing the name of the database in the config file since it could change
        periodically depending on what is provided;
        for instance, currently the last good database is WASP_2015t.
        There are two pulls of the database, each six months apart.  The first will be in the form WASP2015
        and the next one six months later will have a t at the end, such as WASP2015t.
        '''
        super(ChemicalsAthenaSpark, self).__init__()
        self.location = FileUtilities.PathToForwardSlash(os.path.dirname(os.path.abspath(__file__)))
Example #26
 def SkipHeader(self, inputFolderPath):
     '''
     Skips the Header
     '''
     csvFiles = self.fileUtilities.ScanFolder(inputFolderPath)
     for csvFile in csvFiles:
         FileUtilities.RemoveLines(inputFolderPath + csvFile, [1])
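Note: FileUtilities.RemoveLines is assumed here to rewrite a file without the given 1-based line numbers; a minimal sketch of that behavior, not the actual implementation:

def remove_lines(filePath, lineNumbers):
    '''Rewrite filePath in place, dropping the given 1-based line numbers.'''
    with open(filePath) as infile:
        lines = infile.readlines()
    with open(filePath, 'w') as outfile:
        for number, line in enumerate(lines, start=1):
            if number not in lineNumbers:
                outfile.write(line)

# remove_lines("data.csv", [1])  # skip the header row, as SkipHeader does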
Example #27
 def __init__(self):
     '''
     Constructor
     '''
     super(Counter, self).__init__()
     self.location = FileUtilities.PathToForwardSlash(
         os.path.dirname(os.path.abspath(__file__)))
Example #28
 def ProcessFiles(self, downloadedFiles):
     '''
     Process the XML files
     '''
     counter = 1
     for xmlFile in downloadedFiles:
         collection = xml.dom.minidom.parse(xmlFile).documentElement
         fileFullPath = self.localTempDirectory + "/kml" + str(counter) + ".csv"
         self.fileUtilities.CreateFile(fileFullPath)
         for folder in collection.getElementsByTagName("Folder"):
             if folder.getElementsByTagName("name")[0].childNodes[0].data == "LMP Points":  # the folder's <name> tag
                 for placemark in folder.getElementsByTagName("Placemark"):
                     plantDetails = self.GetPlantDetails(placemark)
                     line = "{}|{}|{}|{}|{}|{}|{}|{}\n".format(plantDetails["name"],
                                                               plantDetails["settlementPoint"],
                                                               plantDetails["plantName"],
                                                               plantDetails["plantAddress"],
                                                               plantDetails["county"],
                                                               plantDetails["utility"],
                                                               plantDetails["latitude"],
                                                               plantDetails["longitude"])
                     FileUtilities.WriteToFile(fileFullPath, line)
         self.PackFile(fileFullPath)
         self.fileUtilities.DeleteFile(xmlFile)
         counter = counter + 1
Example #29
 def MoveToS3(self, localFolderName, folderName, subFolder):
     '''
     move gzip files to s3 and clean local instance
      localFolderName --> local folder name
      folderName --> folder name on s3
      subFolder --> date
     '''
     try:
         self.logger.debug(self.moduleName + " -- " + "MoveToS3 " +
                           localFolderName + " starting ")
         ###
         #  move any gzip files to the s3 server
         ###
         s3folder = "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"] +\
                     "/" + folderName + '/' + subFolder
         localFolder = self.fileUtilities.gzipFolder + localFolderName
         S3Utilities.SyncFolderAWSCli(localFolder,
                                      s3folder,
                                      args='''--quiet --include "*.gz"''',
                                      dbug="Y")
         # Cleanup local files
         FileUtilities.EmptyFolderContents(localFolder)
         self.logger.debug(self.moduleName + " -- " + "MoveToS3 " +
                           localFolderName + " finished ")
     except Exception:
         self.logger.exception(self.moduleName +
                               " - we had an error in MoveToS3")
         raise
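Note: S3Utilities.SyncFolderAWSCli evidently wraps the AWS CLI. A hedged sketch of the equivalent call, assuming the aws binary is on PATH; note that --include only re-includes files after an --exclude, so an --include on its own (as in the args above) does not filter anything out:

import subprocess

def sync_folder_to_s3(localFolder, s3Folder):
    '''Mirror localFolder to s3Folder, uploading only *.gz files.'''
    subprocess.check_call([
        "aws", "s3", "sync", localFolder, s3Folder,
        "--quiet", "--exclude", "*", "--include", "*.gz"])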
Example #30
    def __init__(self):
        '''
        Initial settings
                    commonParams = {}
                    commonParams["cat"] = cat { see below }
                    commonParams["moduleName"] = self.moduleName
                    commonParams["loggerParams"] = "log"
                    commonParams["sqlFolder"] = self.sqlFolder

        cat is a json nugget that defines the tables required
             cat = {
                "name": "giif",
                "tables": [
                {
                    "schemaName": "eaa_dev",
                    "srctable": "GEForcast_giif_attributes_working",
                    "table": "GEForcastHistory_giif_attributes",
                    "new": "Y",
                    "type": "attributes",
                    "partition": {
                        "over": "mnemonic, frequencychar",
                        "order": "publisheddate"
                    },
                    "fields": [
                        { "name": "object_id", "type": "VARCHAR", "size": "30" },
                        { "name": "name", "type": "VARCHAR", "size": "100" },
                        { "name": "mnemonic", "type": "VARCHAR", "size": "100" },
                        { "name": "frequencyChar", "type": "VARCHAR", "size": "2" },
                        { "name": "geo", "type": "VARCHAR", "size": "20" },
                        { "name": "startDate", "type": "Date"},
                        { "name": "endDate", "type": "Date"},
                        { "name": "updatedDate", "type": "Date"},
                        { "name": "publishedDate", "type": "Date"},
                        { "name": "longLabel", "type": "VARCHAR", "size": "2000" },
                        { "name": "dataEdge", "type": "VARCHAR", "size": "100" }
                    ],
                    "sortkey":"object_id"
                },
                {
                    "schemaName": "eaa_dev",
                    "srctable": "GEForcast_giif_data_working",
                    "table": "GEForcastHistory_giif_data",
                    "new": "Y",
                    "type": "series",
                    "fields": [
                        { "name": "object_id", "type": "VARCHAR", "size": "30" },
                        { "name": "date", "type": "DATE" },
                        { "name": "value", "type": "FLOAT8" }
                    ],
                    "sortkey":"object_id, date"
                }]
            }
        '''
        self.commonParams = {}
        self.fileUtilities = None
        self.logger = None
        self.moduleName = None
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))
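Note: per the docstring above, a caller would assemble commonParams along these lines before handing it to this class; the values are the documented giif sample, trimmed, and the moduleName and sqlFolder values are hypothetical:

cat = {
    "name": "giif",
    "tables": [
        {"schemaName": "eaa_dev",
         "srctable": "GEForcast_giif_attributes_working",
         "table": "GEForcastHistory_giif_attributes",
         "type": "attributes",
         "partition": {"over": "mnemonic, frequencychar",
                       "order": "publisheddate"}},
        {"schemaName": "eaa_dev",
         "srctable": "GEForcast_giif_data_working",
         "table": "GEForcastHistory_giif_data",
         "type": "series"}]}

commonParams = {}
commonParams["cat"] = cat
commonParams["moduleName"] = "GEForcastHistory"  # hypothetical
commonParams["loggerParams"] = "log"
commonParams["sqlFolder"] = "/tmp/sql/"          # hypothetical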