Example #1
from datetime import timedelta

# object_list, dt_timestamp, table_exists, dt_move, job_wait, and the global
# project are assumed to come from the surrounding module ( this is an excerpt ).
def dt():
  jobs = []

  if project.verbose: print("DT To BigQuery")

  # legacy deprecated ( do not use )
  if 'path' in project.task: project.task['paths'] = [project.task['path']]

  # loop all dt files to match pattern or match any pattern
  if project.verbose: print('PATHS', project.task['paths'])

  for path in (project.task['paths'] or ['']):
    for dt_object in object_list(project.task['auth'], '%s:%s' % (project.task['bucket'], path), raw=True):
      dt_size = dt_object['size']
      dt_file = dt_object['name']
      dt_time = dt_timestamp(dt_file)

      dt_partition = dt_file.split('.', 1)[0]
      if ((project.task.get('days') is None and project.task.get('hours') is None)
          or (dt_time > project.now - timedelta(
              days=project.task.get('days', 60),
              hours=project.task.get('hours', 0)))):
        if not table_exists(project.task['to']['auth'], project.id, project.task['to']['dataset'], dt_partition):
          dt_move(dt_object, dt_partition, jobs)
        else:
          if project.verbose: print('DT Partition Exists:', dt_partition)

  for count, job in enumerate(jobs):
    print('Waiting For Job: %d of %d' % (count + 1, len(jobs)))
    job_wait(project.task['to']['auth'], job)
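
A sketch of the task dictionary dt() reads, reconstructed from the keys accessed above; every value is an illustrative placeholder, not taken from the source:

# Hypothetical task payload ( shape inferred from the code, values are placeholders )
project.task = {
  'auth': 'service',
  'bucket': 'dt-transfer-bucket',
  'paths': ['dcm_account_1234'],
  'days': 7,   # optional lookback window
  'hours': 0,  # optional lookback window
  'to': {'auth': 'service', 'dataset': 'dt_dataset'}
}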
Example #2
from datetime import datetime, timedelta

# object_list, object_move, object_delete, and the global project are assumed
# to come from the surrounding module ( this is an excerpt ).
def archive():
    if project.verbose:
        print('ARCHIVE')

    day = project.date - timedelta(days=abs(project.task['days']))

    for object in object_list(project.task['auth'],
                              project.task['storage']['bucket'] + ':' +
                              project.task['storage']['path'],
                              files_only=True,
                              raw=True):
        object_day = datetime.strptime(object['updated'],
                                       '%Y-%m-%dT%H:%M:%S.%fZ').date()
        if object_day <= day:
            if not project.task.get('delete', False):
                if project.verbose:
                    print('ARCHIVING FILE:', object['name'])
                object_move(
                    project.task['auth'],
                    '%s:%s' % (object['bucket'], object['name']),
                    '%s:archive/%s' % (object['bucket'], object['name']))
            else:
                if project.verbose:
                    print('DELETING FILE:', object['name'])
                object_delete(project.task['auth'],
                              '%s:%s' % (object['bucket'], object['name']))
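
The same reconstruction for archive(): a sketch of the task dictionary it expects, inferred from the keys read above, with placeholder values:

# Hypothetical task payload ( values are placeholders )
project.task = {
    'auth': 'user',
    'days': 30,       # objects updated more than this many days ago are processed
    'delete': False,  # False moves matches under archive/, True deletes them
    'storage': {'bucket': 'report-bucket', 'path': 'reports/'}
}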
Example #3
def storage_list(account):
    path = '%s:' % account.get_bucket(full_path=False)
    project.initialize(_project=CLOUD_PROJECT, _service=CLOUD_SERVICE)

    try:
        for filename_storage in object_list('service', path, files_only=True):
            yield Storage(filename_storage)
    except Exception:
        pass  # if no bucket then skip ( maybe not set up yet )
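
A minimal usage sketch, assuming an account object that exposes get_bucket() as used above:

# Hypothetical usage ( account comes from the surrounding application )
for storage in storage_list(account):
    print(storage)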
Example #4
from datetime import timedelta

# object_list, object_move, object_delete, parse_yyyymmdd, and the global
# project are assumed to come from the surrounding module ( this is an excerpt ).
def archive():
    if project.verbose: print('ARCHIVE')

    day = project.date - timedelta(days=abs(project.task['days']))

    if 'storage' in project.task:
        for file_name in object_list(project.task['auth'],
                                     project.task['storage']['bucket'] + ':' +
                                     project.task['storage']['path'],
                                     files_only=True):
            file_day = parse_yyyymmdd(file_name)
            if file_day and file_day <= day:
                if not project.task.get('delete', False):
                    if project.verbose: print('ARCHIVING FILE:', file_name)
                    object_move(project.task['auth'], file_name,
                                file_name.replace(':', ':archive/'))
                else:
                    if project.verbose: print('DELETING FILE:', file_name)
                    object_delete(project.task['auth'], file_name)
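
parse_yyyymmdd is not shown in the source; a minimal sketch of what it presumably does, assuming a YYYYMMDD date is embedded in the file name:

import re
from datetime import datetime

def parse_yyyymmdd(file_name):
    # Hypothetical helper: return the first YYYYMMDD run in the name as a date, else None.
    match = re.search(r'\d{8}', file_name)
    if match:
        try:
            return datetime.strptime(match.group(), '%Y%m%d').date()
        except ValueError:
            return None
    return None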
Example #5
import zlib
from io import BytesIO

# object_list, object_get_chunks, dt_schema, io_to_table, and the global
# project are assumed to come from the surrounding module ( this is an excerpt ).
def dt():
    if project.verbose: print('DT TO TABLE', project.task['to']['table'])

    delimiter = b'\n'  # the stream is handled as bytes end to end
    disposition = 'WRITE_TRUNCATE'

    # loop all dt files to match pattern
    path = '%s:%s' % (project.task['from']['bucket'],
                      project.task['from']['path'])
    for dt_file in object_list(project.task['auth'], path, files_only=True):

        # decompression handler for gzip ( must be outside of chunks as it keeps track of stream across multiple calls )
        gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)

        # sliding view of data flowing out of decompression, used to buffer and delimit rows
        first_row = True
    view = b''

        # loop all chunks of file, decompress, and find row delimiter
        for data_gz in object_get_chunks(project.task['auth'], dt_file):

            view += gz_handler.decompress(data_gz.read())

            if first_row:
                end = view.find(delimiter)
                schema = dt_schema(view[:end].decode().split(','))
                view = view[(end + 1):]
                first_row = False

            end = view.rfind(delimiter)

            io_to_table(project.task['auth'], project.id,
                        project.task['to']['dataset'],
                        project.task['to']['table'], BytesIO(view[:end]),
                        'CSV', schema, 0, disposition, False)
            disposition = 'WRITE_APPEND'
            view = view[min(end + 1, len(view)):]
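
The decompressobj pattern is the core trick here: 32 + zlib.MAX_WBITS makes zlib auto-detect the gzip header, and keeping a single decompressor per file carries stream state across chunk boundaries. A self-contained sketch of the same pattern:

import gzip
import zlib

# Compress a small CSV, then decompress it again in arbitrary 8-byte chunks;
# the single decompressobj carries the gzip stream state across calls.
payload = gzip.compress(b'id,name\n1,alpha\n2,beta\n')
handler = zlib.decompressobj(32 + zlib.MAX_WBITS)
view = b''
for i in range(0, len(payload), 8):
    view += handler.decompress(payload[i:i + 8])
print(view.decode())  # original CSV restored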