def dt():
  jobs = []

  if project.verbose:
    print('DT To BigQuery')

  # legacy deprecated ( do not use )
  if 'path' in project.task:
    project.task['paths'] = [project.task['path']]

  # loop all dt files to match pattern or match any pattern
  print('PATHS', project.task['paths'])
  for path in (project.task['paths'] or ['']):
    print(path)
    for dt_object in object_list(project.task['auth'], '%s:%s' % (project.task['bucket'], path), raw=True):
      dt_size = dt_object['size']
      dt_file = dt_object['name']
      dt_time = dt_timestamp(dt_file)

      # the file name up to the first '.' becomes the destination partition table name
      dt_partition = dt_file.split('.', 1)[0]

      # load the file if no time window is configured, or if it falls inside the days / hours window
      if ((project.task.get('days') is None and project.task.get('hours') is None)
          or (dt_time > project.now - timedelta(days=project.task.get('days', 60), hours=project.task.get('hours', 0)))):
        if not table_exists(project.task['to']['auth'], project.id, project.task['to']['dataset'], dt_partition):
          dt_move(dt_object, dt_partition, jobs)
        else:
          if project.verbose:
            print('DT Partition Exists:', dt_partition)

  # block until all load jobs started by dt_move have completed
  for count, job in enumerate(jobs):
    print('Waiting For Job: %d of %d' % (count + 1, len(jobs)))
    job_wait(project.task['to']['auth'], job)
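# A minimal, self-contained sketch (not a project helper) of the recency window test
# used in dt() above: a file is loaded when no 'days'/'hours' window is configured,
# or when its timestamp falls inside that window. Names and values are illustrative.
from datetime import datetime, timedelta

def within_window(file_time, now, days=None, hours=None):
  if days is None and hours is None:
    return True
  return file_time > now - timedelta(days=60 if days is None else days, hours=hours or 0)

now = datetime(2024, 6, 1)
stamp = now - timedelta(days=3)
print(within_window(stamp, now))          # True: no window configured
print(within_window(stamp, now, days=7))  # True: inside a 7 day window
print(within_window(stamp, now, days=1))  # False: outside a 1 day window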
def archive():
  if project.verbose:
    print('ARCHIVE')

  # files updated on or before this date are archived or deleted
  day = project.date - timedelta(days=abs(project.task['days']))

  for file_object in object_list(project.task['auth'], project.task['storage']['bucket'] + ':' + project.task['storage']['path'], files_only=True, raw=True):
    object_day = datetime.strptime(file_object['updated'], '%Y-%m-%dT%H:%M:%S.%fZ').date()
    if object_day <= day:
      if not project.task.get('delete', False):
        if project.verbose:
          print('ARCHIVING FILE:', file_object['name'])
        object_move(
          project.task['auth'],
          '%s:%s' % (file_object['bucket'], file_object['name']),
          '%s:archive/%s' % (file_object['bucket'], file_object['name'])
        )
      else:
        if project.verbose:
          print('DELETING FILE:', file_object['name'])
        object_delete(project.task['auth'], '%s:%s' % (file_object['bucket'], file_object['name']))
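# A self-contained sketch of the cutoff test above, assuming the object's 'updated'
# metadata uses the '%Y-%m-%dT%H:%M:%S.%fZ' format shown in archive(). The run date,
# day count, and timestamp here are illustrative.
from datetime import datetime, timedelta

run_date = datetime(2024, 6, 1).date()
cutoff = run_date - timedelta(days=abs(30))

updated = '2024-04-15T08:30:00.000Z'
object_day = datetime.strptime(updated, '%Y-%m-%dT%H:%M:%S.%fZ').date()

print(object_day <= cutoff)  # True: older than the cutoff, so it would be archived (or deleted)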
def storage_list(account):
  path = '%s:' % account.get_bucket(full_path=False)
  project.initialize(_project=CLOUD_PROJECT, _service=CLOUD_SERVICE)
  try:
    for filename_storage in object_list('service', path, files_only=True):
      yield Storage(filename_storage)
  except Exception:
    pass  # if no bucket then skip ( maybe not set up yet )
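# A minimal sketch of the pattern in storage_list(): a generator that wraps each listed
# file and yields nothing when listing fails (e.g. the bucket is not set up yet).
# safe_listing, list_files, and wrap are illustrative stand-ins, not project helpers.
def safe_listing(list_files, wrap):
  try:
    for name in list_files():
      yield wrap(name)
  except Exception:
    return  # no bucket: behave like an empty listing

def missing_bucket():
  raise FileNotFoundError('bucket not set up yet')

print(list(safe_listing(lambda: ['a.csv', 'b.csv'], str.upper)))  # ['A.CSV', 'B.CSV']
print(list(safe_listing(missing_bucket, str.upper)))              # []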
def archive():
  if project.verbose:
    print('ARCHIVE')

  # files dated on or before this date are archived or deleted
  day = project.date - timedelta(days=abs(project.task['days']))

  if 'storage' in project.task:
    for file_name in object_list(project.task['auth'], project.task['storage']['bucket'] + ':' + project.task['storage']['path'], files_only=True):
      file_day = parse_yyyymmdd(file_name)
      if file_day and file_day <= day:
        if not project.task.get('delete', False):
          if project.verbose:
            print('ARCHIVING FILE:', file_name)
          # insert 'archive/' right after the 'bucket:' prefix to move the file into an archive folder
          object_move(project.task['auth'], file_name, file_name.replace(':', ':archive/'))
        else:
          if project.verbose:
            print('DELETING FILE:', file_name)
          object_delete(project.task['auth'], file_name)
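# parse_yyyymmdd() is not shown in this file; a plausible sketch (an assumption, not
# the project's implementation) pulls a YYYYMMDD group out of the file name and
# returns it as a date, or None when the name carries no date.
import re
from datetime import datetime

def parse_yyyymmdd_sketch(file_name):
  match = re.search(r'(20\d{2})(\d{2})(\d{2})', file_name)
  if not match:
    return None
  try:
    return datetime.strptime(''.join(match.groups()), '%Y%m%d').date()
  except ValueError:
    return None

print(parse_yyyymmdd_sketch('bucket:reports/report_20240415.csv'))  # 2024-04-15
print(parse_yyyymmdd_sketch('bucket:reports/no_date.csv'))          # None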
def dt():
  if project.verbose:
    print('DT TO TABLE', project.task['to']['table'])

  # work in bytes so decompressed chunks can be buffered directly
  delimiter = b'\n'
  disposition = 'WRITE_TRUNCATE'

  # loop all dt files matching the bucket / path pattern
  path = '%s:%s' % (project.task['from']['bucket'], project.task['from']['path'])
  for dt_file in object_list(project.task['auth'], path, files_only=True):

    # decompression handler for gzip ( must be outside of chunks as it keeps track of stream state across multiple calls )
    gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)

    # sliding view of data flowing out of decompression, used to buffer and delimit rows
    first_row = True
    view = b''

    # loop all chunks of the file, decompress, and find the row delimiter
    for data_gz in object_get_chunks(project.task['auth'], dt_file):
      view += gz_handler.decompress(data_gz.read())

      # the first complete row is the CSV header, used to derive the schema
      # ( assumes the first chunk contains at least the full header row )
      if first_row:
        end = view.find(delimiter)
        schema = dt_schema(view[:end].decode().split(','))
        view = view[(end + 1):]
        first_row = False

      # load only complete rows, keep the trailing partial row for the next chunk
      end = view.rfind(delimiter)

      io_to_table(project.task['auth'], project.id, project.task['to']['dataset'], project.task['to']['table'], BytesIO(view[:end]), 'CSV', schema, 0, disposition, False)
      disposition = 'WRITE_APPEND'
      view = view[min(end + 1, len(view)):]
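# A self-contained sketch of the streaming decompression used in dt(): a single
# zlib decompressor (wbits = 32 + MAX_WBITS auto-detects the gzip header) keeps
# stream state across chunks, and only complete newline-delimited rows are emitted
# while the trailing partial row stays buffered. The sample data is illustrative.
import gzip
import zlib

raw = b'Event Time,User ID,Advertiser ID\n1717200000,abc,123\n1717200001,def,123\n'
compressed = gzip.compress(raw)
chunks = [compressed[i:i + 16] for i in range(0, len(compressed), 16)]

gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)
buffer = b''
rows = []

for chunk in chunks:
  buffer += gz_handler.decompress(chunk)
  end = buffer.rfind(b'\n')
  if end != -1:
    rows.extend(buffer[:end].split(b'\n'))
    buffer = buffer[end + 1:]

print(rows)  # header row followed by the two data rows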