Example #1
   def __init__(self, yaml_file): 
      # Bootstrap environment
      self.config = yaml.load(open(yaml_file))
      self.env    = yaml.load(open(self.config['ENV']))

      self.db = OnlineDB( self.env['DSN'] )
      set_schema_sql = self.env['SET_SCHEMA_SQL']
      self.db.executeSQL(set_schema_sql, True)
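
The bootstrap above leaves both file handles open and calls yaml.load without an explicit loader, which recent PyYAML versions reject as unsafe. A minimal sketch of the same two-level config load using safe_load and context managers (only the ENV key is carried over from the snippet):

import yaml

def load_layered_config(yaml_file):
    # Load the job config, then the environment config it points to.
    # Context managers close the handles; safe_load refuses arbitrary
    # Python object construction from the YAML stream.
    with open(yaml_file) as f:
        config = yaml.safe_load(f)
    with open(config['ENV']) as f:
        env = yaml.safe_load(f)
    return config, env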
Example #2
class timestamp_dim_loader:
    def __init__(self):
        #  Reading configuration file ( YAML file )
        self.logger = EtlLogger.get_logger(
            self.__class__.__name__)  # use class name as the log name
        self.lock = JobLock(
            self.__class__.__name__)  # use class name as the lock name
        self.env = yaml.load(open(config['ENV']))  # 'config' here is expected to be a module-level dict
        self.db = OnlineDB(self.env['DSN'], logger=self.logger)
        set_schema_sql = self.env['SET_SCHEMA_SQL']
        self.db.executeSQL(set_schema_sql)
        self.config = config

    def loadTimestamp(self, date_sid):
        row_sql = self.config['CHECK_COUNT'] % (date_sid)
        existing_rows = self.db.retrieveSQL(row_sql)
        if not existing_rows:
            # date_sid is YYYYMMDD; reformat it as YYYY-MM-DD
            date_part = date_sid[0:4] + "-" + date_sid[4:6] + "-" + date_sid[6:8]
            self.logger.info("Starting to load hourly timestamp")
            sql = self.config['INSERT_NEW_RECORD'] % (date_sid, date_part, date_part)
            self.db.executeSQL(sql, False)
            self.logger.info("inserted records for %s" % (date_part))

    def updateLoadState(self):
        # update load_state and commit
        self.db.updateLoadStateWithCurrentTime(self.config['LOAD_STATE_VAR'])
        self.db.commit()
        self.logger.info("Updated load_state variable %s and  committed txn" %
                         (self.config['LOAD_STATE_VAR']))
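
loadTimestamp slices an eight-character date_sid (YYYYMMDD) into a date string by hand, which silently accepts malformed input. A hedged alternative sketch that produces the same YYYY-MM-DD string but raises on bad input (standard library only):

from datetime import datetime

def date_part_from_sid(date_sid):
    # Parse YYYYMMDD and re-emit as YYYY-MM-DD; malformed input
    # raises ValueError instead of yielding a garbled date string.
    return datetime.strptime(date_sid, "%Y%m%d").strftime("%Y-%m-%d")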
Example #4
 def __init__(self):
     #  Reading configuration file ( YAML file )
     self.logger = EtlLogger.get_logger(
         self.__class__.__name__)  # use class name as the log name
     self.lock = JobLock(
         self.__class__.__name__)  # use class name as the lock name
     self.env = yaml.load(open(config['ENV']))
     self.db = OnlineDB(self.env['DSN'], logger=self.logger)
     set_schema_sql = self.env['SET_SCHEMA_SQL']
     self.db.executeSQL(set_schema_sql)
     self.config = config
   def __init__(self, yaml_file, intrvl, feed_hour, data_files, schema_file): 
      # Bootstrap environment
      self.config = yaml.load(open(yaml_file))
      self.env    = yaml.load(open(self.config['ENV']))
      self.rollup_config = yaml.load(open(self.config['ROLLUP_CONFIG']))

      self.db = OnlineDB( self.env['DSN'] )
      self.intrvl = intrvl
      self.feed_hour = feed_hour
      self.data_files = data_files
      self.schema_file = schema_file
      set_schema_sql = self.env['SET_SCHEMA_SQL']
      self.db.executeSQL(set_schema_sql, True)
    def __init__(self, table_name, config, logger):
        self.logger = logger  # logger is injected by the caller
        #  Reading configuration file ( YAML file )
        self.config = config
        self.env = yaml.load(open(config['ENV']))
        self.db = OnlineDB(self.env['DSN'], self.logger)
        set_schema_sql = self.env['SET_SCHEMA_SQL']
        self.db.executeSQL(set_schema_sql)
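
Every constructor in these examples repeats the same bootstrap: read the ENV YAML, open an OnlineDB on its DSN, and run SET_SCHEMA_SQL. A hedged sketch of factoring that into a shared base class; EtlLogger, JobLock, and OnlineDB are the helpers the snippets already use, while EtlJobBase is a hypothetical name:

import yaml

class EtlJobBase(object):
    # Hypothetical base class for the bootstrap shared by the snippets
    # above; subclasses add job-specific state on top of this.
    def __init__(self, config):
        self.config = config
        self.logger = EtlLogger.get_logger(self.__class__.__name__)
        self.lock = JobLock(self.__class__.__name__)
        with open(config['ENV']) as f:
            self.env = yaml.safe_load(f)
        self.db = OnlineDB(self.env['DSN'], logger=self.logger)
        self.db.executeSQL(self.env['SET_SCHEMA_SQL'])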
Example #8
class schema_mismatch:

   def __init__(self, yaml_file): 
      # Bootstrap environment
      self.config = yaml.load(open(yaml_file))
      self.env    = yaml.load(open(self.config['ENV']))

      self.db = OnlineDB( self.env['DSN'] )
      set_schema_sql = self.env['SET_SCHEMA_SQL']
      self.db.executeSQL(set_schema_sql, True)


   def find_mismatch(self):
      #table_mismatch = self.db.retrieveSQL()
      self.db.executeSQL(self.config['INSERT_TABLE_MISMATCH'], True)
      self.db.executeSQL(self.config['INSERT_COLUMN_MISMATCH'], True)
      #[(res,)] = self.db.retrieveSQLArgs(check_sql, args[0])

   def transfer_schema_data(self):
    first_host = self.config['FIRST_HOST']
    second_host = self.config['SECOND_HOST']   
    self.db.executeSQL(self.config['TRUNCATE_TEMP_COLUMNS'], True)
    self.db.executeSQL(self.config['TRUNCATE_TABLE_MISMATCH'], True)
    self.db.executeSQL(self.config['TRUNCATE_COLUMN_MISMATCH'], True)
     	
    query="time /opt/vertica/bin/vsql -h "+second_host+" -U "+self.env['VERTICA_USER']+" -w "+self.env['VERTICA_PASSWORD']+" -c "
    query=query+" \"CONNECT TO VERTICA DW USER "+self.env['VERTICA_USER']+" PASSWORD '"+self.env['VERTICA_PASSWORD']+"' ON '"+first_host+"',"+self.env['TCP_PORT']+";"
    query=query+"EXPORT TO VERTICA DW.schema.temp_columns AS select * from DW.v_catalog.columns;\""
    print query
    subprocess.call(query,shell=True)
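
Building the vsql invocation as one shell string requires delicate quoting and exposes the password on the command line and in the printed output. A hedged sketch of the same call with an argument list, so no shell parsing is involved (flags as used above):

import subprocess

def run_vsql(host, user, password, sql):
    # Each argument is its own argv entry; the SQL text needs no
    # shell escaping and the command never passes through a shell.
    cmd = ["/opt/vertica/bin/vsql", "-h", host, "-U", user,
           "-w", password, "-c", sql]
    return subprocess.call(cmd)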
class ox_openrtb_sum_hourly:

   def __init__(self, yaml_file, intrvl, feed_hour, data_files, schema_file): 
      # Bootstrap environment
      self.config = yaml.load(open(yaml_file))
      self.env    = yaml.load(open(self.config['ENV']))
      self.rollup_config = yaml.load(open(self.config['ROLLUP_CONFIG']))

      self.db = OnlineDB( self.env['DSN'] )
      self.intrvl = intrvl
      self.feed_hour = feed_hour
      self.data_files = data_files
      self.schema_file = schema_file
      set_schema_sql = self.env['SET_SCHEMA_SQL']
      self.db.executeSQL(set_schema_sql, True)

   def load_temporary_table(self):
      tmp_table_name = self.config['FEED_TMP_TABLE_NAME']
      drop_stmt = "DROP TABLE IF EXISTS %s" % (tmp_table_name)
      print "Removing temporary table %s" % (tmp_table_name)
      self.db.executeSQL(drop_stmt, True)
    
      feed_columns = self.config['FEED_COLUMNS'] 
      orderby = self.config['FEED_TMP_TABLE_ORDER_BY_CLAUSE']
      segmentation = self.config['FEED_TMP_TABLE_SEGMENTATION_CLAUSE']

      create_stmt = vertica_create_sql_detailed(self.schema_file,
         tmp_table_name, feed_columns, temp=True, pretty=False, orderby=orderby, segmentation=segmentation)
      print "Creating temporary table %s using column mapping: %s" % ( tmp_table_name, feed_columns) 
      self.db.executeSQL(create_stmt, commit=True)

      load_stmt = vertica_multifile_load_sql_detailed(self.data_files, self.schema_file,
         tmp_table_name, feed_columns, local=True, pretty=True)

      print "Loading table from data file %s" % ( self.data_files )
      result_count = self.db.executeSQL(load_stmt, commit=True)

      # Sanity check: the data should be for the hour ODFI claims it's for
      wrong_hour_result = self.db.retrieveSQLArgs(self.config['FEED_TMP_CHECK_HOUR'], self.feed_hour)
      if wrong_hour_result:
         wrong_hour_count = wrong_hour_result[0][0]
         if wrong_hour_count > 0:
            raise Exception("ERROR: File(s) %s contain %d rows not for the current hour (%s)" % 
              (self.data_files, wrong_hour_count, self.feed_hour))

      return result_count
   def max_element(self, itemList):
      # Return the value that occurs most often; each item is a result
      # set whose value sits at item[0][0].
      counter = {}
      maxItemCount = 0
      mostPopularItem = itemList[0][0][0]
      for item in itemList:
         if item[0][0] in counter:
            counter[item[0][0]] += 1
         else:
            counter[item[0][0]] = 1
         if counter[item[0][0]] > maxItemCount:
            maxItemCount = counter[item[0][0]]
            mostPopularItem = item[0][0]
      return mostPopularItem
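
max_element is a hand-rolled plurality vote over nested result rows. A hedged, self-contained sketch of the same selection with collections.Counter (the item[0][0] indexing mirrors the result-set shape assumed above):

from collections import Counter

def max_element(itemList):
    # Tally each value and return the most common one.
    counts = Counter(item[0][0] for item in itemList)
    return counts.most_common(1)[0][0]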

   def update_carrier_data(self):
      code=self.db.retrieveSQLArgs(self.config['GET_MISSING_CARRIER_CODES'],())
      code_part=[x[0].encode('UTF-8') for x in code if x[0] is not None]
      print code_part 
      for codes in code_part:         
        if codes.find(',')>0:
          single_codes=codes.split(',')
          final_code={}            
          for single_code in single_codes:
            print "Single code from collection : %s" %(single_code)
            try:
                code_name=self.db.retrieveSQL(self.config['GET_SINGLE_CARRIER']+"'"+single_code+"'")
            except Exception:
                print 'Problem retrieving carrier by single code: %s' %(single_code)
                code_name=None
            print '%s (Single Code) -> %s (Code Name)' %(single_code, code_name)
            if code_name: 
              final_code[single_code]=code_name
          final_code_name="Unknown - Unknown"
          if final_code:
            final_code_name=self.max_element(final_code.values())
          print "Missing code_name inserting into carrier_dim",final_code_name
          self.db.executeSQLArgs(self.config['INSERT_CARRIER_DIM'], ((final_code_name,codes,datetime.now().strftime('%Y-%m-%d %H:%M:%S'),datetime.now().strftime('%Y-%m-%d %H:%M:%S'))))
                 
        else:
          single_code=codes
          print "Single code : %s" % (single_code)
          try:
            code_name=self.db.retrieveSQL(self.config['GET_SINGLE_CARRIER']+"'"+single_code+"'")
          except Exception:
            print 'Problem retrieving carrier by single code: %s' %(single_code)
            code_name=None
          print '%s (Single Code) -> %s (Code Name)' %(single_code, code_name)
          if not code_name or not code_name[0][0]:
            self.db.executeSQLArgs(self.config['INSERT_CARRIER_DIM'], (("Unknown - Unknown",codes,datetime.now().strftime('%Y-%m-%d %H:%M:%S'),datetime.now().strftime('%Y-%m-%d %H:%M:%S'))))    
          else:
            self.db.executeSQLArgs(self.config['INSERT_CARRIER_DIM'], ((code_name[0][0],codes,datetime.now().strftime('%Y-%m-%d %H:%M:%S'),datetime.now().strftime('%Y-%m-%d %H:%M:%S'))))    
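
Both branches above rebuild the same timestamp string twice per inserted row. A hedged sketch that hoists it; INSERT_CARRIER_DIM and executeSQLArgs are the corpus's own, only the helper is hypothetical:

from datetime import datetime

def insert_carrier_row(db, insert_sql, code_name, codes):
    # Format the timestamp once and bind it for both audit columns.
    now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    db.executeSQLArgs(insert_sql, (code_name, codes, now, now))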
      
   def remove_existing_data(self):
      print "DELETE_EXISTING_HOUR: Removing data for hour %s " % (self.feed_hour)
      self.db.executeSQLArgs(self.config['DELETE_EXISTING_HOUR'], ((self.feed_hour,self.feed_hour)))

   def insert_merged_data(self):
      row_count = self.db.executeSQL(self.config['INSERT_DATA'], False)
      print "INSERT_DATA: Inserted %d rows for hour %s" % ( row_count, self.feed_hour )

   def updateLoadState(self):
      # update load_state and commit
      self.db.updateLoadStateWithFeedIntrvl(self.config['LOAD_STATE_VAR'], self.intrvl)
      self.db.commit()
      print "Updated and Committed load_state variable %s for %s" % ( self.config['LOAD_STATE_VAR'], self.intrvl)
   
   def queueDailyRollupJobsForRepublishing(self):
      queue_hour(self.db, self.rollup_config, 'ox_openrtb_sum_hourly', 'day', self.feed_hour)

   def run_rollups(self):
      run_rollups(self.db, self.rollup_config, 'ox_openrtb_sum_hourly', self.feed_hour)
class sqoop_table:
    def __init__(self, table_name, config, logger):
        self.logger = logger  # logger is injected by the caller
        #  Reading configuration file ( YAML file )
        self.config = config
        self.env = yaml.load(open(config['ENV']))
        self.db = OnlineDB(self.env['DSN'], self.logger)
        set_schema_sql = self.env['SET_SCHEMA_SQL']
        self.db.executeSQL(set_schema_sql)

    def drop_table(self, table):
        sql = "drop table if exists " + table
        self.logger.info(sql)
        self.db.executeSQL(sql)

    def check_table(self, table):
        sql = "select table_name from v_catalog.tables where table_name= ?"
        result = self.db.retrieveSQLArgs(sql, (table,))
        if not result:
            return False
        elif result[0][0] == table:
            return True

    def get_primary_column(self, table_name):
        # Select the month/day/primaryid which is not backed up yet
        sql = "select primary_partition_column from sqoop_etl_status where is_complete='f' and table_name='"+table_name+"' order by primary_partition_value LIMIT 1"
        primary_id = self.db.retrieveSQL(sql)
        if primary_id and primary_id[0][0]:
            print "SELECT: %s value selected for backup is %s" % (primary_id, primary_id[0][0])
        else:
            print "No new data available for copying"
        return primary_id
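
get_primary_column splices table_name directly into the SQL text. A hedged sketch of the same lookup with a bound parameter, using the retrieveSQLArgs helper this class already calls elsewhere:

def get_primary_column(db, table_name):
    # Bind table_name rather than concatenating it into the statement.
    sql = ("select primary_partition_column from sqoop_etl_status "
           "where is_complete='f' and table_name=? "
           "order by primary_partition_value LIMIT 1")
    return db.retrieveSQLArgs(sql, (table_name,))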

    def get_primary_value(self, table_name):
        sql = "select primary_partition_value from sqoop_etl_status where is_complete='f' and table_name='"+table_name+"' order by primary_partition_value LIMIT 1"
        primary_value = self.db.retrieveSQL(sql)
        return primary_value

    def get_row_count(self, table_name, primary_id, primary_value):
        sql = "select count(*) from "+table_name+" where "+ primary_id +" = "+ primary_value
        row_count = self.db.retrieveSQL(sql)
        return row_count

    def update_backup_status(self, table_name, primary_id, primary_value):
        # Insert records to the secondary table for the given month
        if primary_id:
            sql = "update sqoop_etl_status set is_complete='t', end_date = '" + str(datetime.datetime.now())+"' where primary_partition_column=? and primary_partition_value=? and table_name=?"
            self.db.executeSQLArgs(sql, (primary_id, primary_value, table_name))
            self.db.commit()
            self.logger.info("updated sqoop_etl status for table_name=%s", table_name)
            print "COMPLETED: Backed up the data for the %s = %s" % (primary_id, primary_value)

    def deleteCurrentSecondaryId(self, secTable, secID):
        # Since the sqoop job for this secondary id is completed we are deleting it
        sql = "delete from " + secTable + " where secondary_id= ?"
        self.db.executeSQLArgs(sql, (secID,))
        self.db.commit()
        print "DELETE: Completed backup for the  secondary id %s" % (secID)

    def create_temp_table(self, tempTable, table_name):
        # Creating the temporary table with additional rowId column and partitioned on rowID
        createSQL = "create table IF NOT EXISTS " + self.config['DW_DB']+"."+tempTable + " like " + self.config['DW_DB']+"."+table_name
        self.db.executeSQL(createSQL)
        alterSQL = "alter table " +self.config['DW_DB']+"."+ tempTable + " add column rowId int not null"
        self.db.executeSQL(alterSQL)
        partitionSQL = "ALTER TABLE " + self.config['DW_DB']+"."+tempTable + "  PARTITION BY rowId REORGANIZE"
        self.db.executeSQL(partitionSQL)
        self.logger.info("CREATE TEMP TABLE: %s", self.config['DW_DB']+"."+tempTable)

        print "CREATE: Created temporary table with row id... "

    def truncate_table(self, table_name):
        # Clean up the temp table for the next iteration...
        sql = "truncate table  " + self.config['DW_DB']+"."+table_name
        self.logger.info("TRUNCATE TABLE: %s", table_name)
        self.db.executeSQL(sql)
        print "TRUNCATE: Truncated temp table... "

    def populate_temp_table(self, tempTable, sequence, numMappers, table_name, primary_id, primary_value):
        columns = self.get_temp_table_column(table_name)
        if primary_id:
            sql = "insert into " + self.config['DW_DB']+"."+ tempTable + "(rowId, " + columns + ") select " + sequence + ".nextval % ? ," + columns + " from " + self.config['DW_DB']+"."+table_name + " where " + primary_id + "= ? ;"
            num_rows = self.db.executeSQLArgs(sql, (numMappers, primary_value))
        else:
            sql = "insert into " + self.config['DW_DB']+"."+ tempTable + "(rowId, " + columns + ") select " + sequence + ".nextval % ? ," + columns + " from " + self.config['DW_DB']+"."+table_name
            num_rows = self.db.executeSQLArgs(sql, (numMappers,))
        self.logger.info("RUNNING: %s", sql)
        self.db.commit()
    
    def backup_status(self, table_name):
        sql = "select 1 from sqoop_etl_status where is_complete='f' and table_name='"+table_name+"' order by primary_partition_value LIMIT 1"
        value = self.db.retrieveSQL(sql)
        # returns the strings "true"/"false" (not booleans), as callers expect
        if not value:
            return "false"
        else:
            return "true"

    def get_table_column(self, table_name):
        # get the column names of the given table
        sql = "select column_name from v_catalog.columns where table_name = ? and table_schema='"+self.config['DW_DB']+"'"
        cols = self.db.retrieveSQLArgs(sql, (table_name,))
        columns = ','.join(str(v[0]) for v in cols)
        print columns
        return columns

    def get_table_column_with_type(self, table_name):
        # get the column names and types of the given table
        sql = "select column_name || ' ' || data_type from v_catalog.columns where table_name = ? and table_schema='"+self.config['DW_DB']+"'"
        cols = self.db.retrieveSQLArgs(sql, (table_name,))
        columns = ','.join(str(v[0]) for v in cols)
        print columns
        return columns
    
    def generate_status_table(self, table_name):
        sql = "select export_objects('','"+self.config['DW_DB']+"."+table_name+"')"
        table_statement = self.db.retrieveSQL(sql)
        table_statement = table_statement[0][0]
        table_statement = table_statement.split(";")[0]
        primary_id = self.config['PRIMARY_ID']
        skip_last_months = self.config['SKIP_LATEST_MONTHS']
        # get the column names of the given table
        #sql = "select  distinct "+primary_id+" from " + table_name
        sql = "insert into sqoop_etl_status "
        if primary_id == "":
            primary_id = "NULL"
            sql = sql+" select '"+table_name+"','"+self.config['HIVE_DB']+"',CAST('"+str(datetime.datetime.now())+"' AS timestamp),"+"null"+","+primary_id+","+primary_id+",'"+'f'+"',"+self.config['NUM_MAPPERS'] +" WHERE not exists(select 1 from sqoop_etl_status where table_name='"+table_name+"')"
        else:
            sql = sql+" select '"+table_name+"','"+self.config['HIVE_DB']+"','"+str(datetime.datetime.now())+"',"+"null"+",'"+primary_id+"',"+primary_id+",'"+'f'+"',"+self.config['NUM_MAPPERS']+" from (select  "+primary_id+", row_number() over(order by "+primary_id+" desc) rn from (select distinct "+primary_id+" from "+self.config['DW_DB']+"."+table_name+" ) C group by 1) A where rn>"+skip_last_months+" and not exists(select 1 from sqoop_etl_status where primary_partition_value="+primary_id+");"
        self.logger.info("RUNNING: %s", sql)
        self.db.executeSQL(sql)
        self.db.commit()

    def get_temp_table_column(self, table_name):
        # get the column names of the given table
        sql = "select column_name from v_catalog.columns where table_name = ? and table_schema='"+self.config['DW_DB']+"'"
        cols = self.db.retrieveSQLArgs(sql, (table_name,))
        columns = ','.join(str(v[0]) for v in cols)
        return columns

    def create_external_table(self, table_name):
        table_statement = self.get_table_column_with_type(table_name) +" )"
        table_statement = table_statement+" as  copy  from '"+self.env['HDFS_STORAGE_HOST']
        table_statement = table_statement+table_name+"/*' ON ANY NODE ORC"
        table_statement = "CREATE EXTERNAL TABLE " + self.config['DW_DB']+"."+ table_name+"_ext ( "+table_statement
        self.db.executeSQL(table_statement)
        self.logger.info("RUNNING: %s", table_statement)
        print "CREATE: Created external table %s_ext... " % (table_name)

    def get_secondary_value(self, sec_table_name):
        sql = "select secondary_id from " + sec_table_name + " limit 1"
        secondary_value = self.db.retrieveSQL(sql)
        if secondary_value:
            print "SELECT: The secondary id that will be backed up next is %s" % (secondary_value[0][0])
        else:
            print "SELECT: Completed all secondary ids"

        return secondary_value
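
A hedged sketch of how these methods compose into a single backup iteration; the method names and result shapes are the class's own, while the driver itself is hypothetical:

def run_one_backup(job, table_name, sequence, num_mappers):
    # One iteration: pick the next incomplete partition, stage it into
    # a rowId-partitioned temp table, then mark the partition complete.
    primary_id = job.get_primary_column(table_name)
    if not primary_id or not primary_id[0][0]:
        return False
    primary_value = job.get_primary_value(table_name)
    temp_table = table_name + "_tmp"
    job.create_temp_table(temp_table, table_name)
    job.populate_temp_table(temp_table, sequence, num_mappers,
                            table_name, primary_id[0][0], primary_value[0][0])
    job.update_backup_status(table_name, primary_id[0][0], primary_value[0][0])
    return True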