def main(): """watch a specific directory, logging changes and running python scripts when they are written to disk""" home_dir = Path(environ.get('HOME')) run_logfile = home_dir / 'pyrun.log' watchdog_logfile = home_dir / 'pydir.log' run_log = FileHandler(str(run_logfile), level='NOTICE', bubble=True, mode='w', delay=True) file_log = FileHandler(str(watchdog_logfile), level='INFO', bubble=True) with run_log.applicationbound(): with file_log.applicationbound(): watched_dir = home_dir / 'code' / 'pyrep' / 'coderunner' / 'snippets' handler = MyEventHandler(run_logfile, run_log) obs = InotifyObserver() obs.schedule(handler, str(watched_dir), False) obs.start() try: while True: sleep(1) except: # pylint: disable=bare-except obs.stop() obs.join()
def main(): """ The main routine which kicks everything off :return: """ # Setup the command line arguments flags = argparse.ArgumentParser(description="Tool to validate and fix errors in CSV files for TADC imports") flags.add_argument('csv_file', type=str, help="Path to a CSV file to validate") flags.add_argument('header_rows', type=str, help="Number of header rows") flags.add_argument('--fix-missing', '-f', action='store_true', help="Fix missing fields by inserting the value 'unknown'") flags.add_argument('--output-dir', '-o', type=str, help='Where to put output files', default=os.getcwd()) flags.add_argument('--log-dir', '-l', type=str, help='Where to put log files', default='/tmp') flags.add_argument('--log-level', type=str, help='Choose a log level', default='INFO') flags.add_argument('--old-date-format', type=str, help="the format of dates that will be fixed", default='%d/%m/%Y') args = flags.parse_args() log_filename = os.path.join( args.log_dir, 'tadc_import_validator_{}.log'.format(os.path.basename(time.strftime('%Y%m%d-%H%M%S'))) ) # register some logging handlers log_handler = FileHandler( log_filename, mode='w', level=args.log_level, bubble=True ) stdout_handler = StreamHandler(sys.stdout, level=args.log_level, bubble=True) with stdout_handler.applicationbound(): with log_handler.applicationbound(): log.info("Arguments: {}".format(args)) start = time.time() log.info("starting at {}".format(time.strftime('%l:%M%p %Z on %b %d, %Y'))) with CSVFileValidator( csv_file=args.csv_file, header_rows=args.header_rows, output_dir=args.output_dir, old_date_format=args.old_date_format, fix_missing=args.fix_missing) as validator: validator.validate_file() log.info("Running time: {}".format(str(datetime.timedelta(seconds=(round(time.time() - start, 3)))))) log.info("Log written to {}:".format(log_filename)) log.info("Fixed data is in: {}".format(validator.get_fixed_filename()))
class Fibratus(): """Fibratus entrypoint. Setup the core components including the kernel event stream collector and the tracing controller. At this point the system handles are also being enumerated. """ def __init__(self, filament): self.logger = Logger(Fibratus.__name__) self.file_handler = FileHandler(os.path.join(os.path.abspath(__file__), '..', '..', '..', 'fibratus.log'), mode='w+') self.kevt_streamc = KEventStreamCollector(etw.KERNEL_LOGGER_NAME.encode()) self.kcontroller = KTraceController() self.ktrace_props = KTraceProps() self.ktrace_props.enable_kflags() self.ktrace_props.logger_name = etw.KERNEL_LOGGER_NAME self.handle_repository = HandleRepository() self._handles = [] # query for handles on the # start of kernel trace with self.file_handler.applicationbound(): self.logger.info('Starting fibratus...') self.logger.info('Enumerating system handles...') self._handles = self.handle_repository.query_handles() self.logger.info('%s handles found' % len(self._handles)) self.handle_repository.free_buffers() self.thread_registry = ThreadRegistry(self.handle_repository, self._handles) self.kevent = KEvent(self.thread_registry) self._filament = filament self.fsio = FsIO(self.kevent, self._handles) self.hive_parser = HiveParser(self.kevent, self.thread_registry) self.tcpip_parser = TcpIpParser(self.kevent) self.dll_repository = DllRepository(self.kevent) self.requires_render = {} self.filters_count = 0 def run(self): @atexit.register def _exit(): self.stop_ktrace() self.kcontroller.start_ktrace(etw.KERNEL_LOGGER_NAME, self.ktrace_props) def on_kstream_open(): if self._filament is None: IO.write_console('Done! ') self.kevt_streamc.set_kstream_open_callback(on_kstream_open) self._open_kstream() def _open_kstream(self): try: self.kevt_streamc.open_kstream(self._on_next_kevent) except Exception as e: with self.file_handler.applicationbound(): self.logger.error(e) except KeyboardInterrupt: self.stop_ktrace() def stop_ktrace(self): IO.write_console('Stopping fibratus...') if self._filament: self._filament.close() self.kcontroller.stop_ktrace(self.ktrace_props) self.kevt_streamc.close_kstream() def add_filters(self, kevent_filters): if len(kevent_filters) > 0: self.filters_count = len(kevent_filters) # include the basic filters # that are essential to the # rest of kernel events self.kevt_streamc.add_kevent_filter(ENUM_PROCESS) self.kevt_streamc.add_kevent_filter(ENUM_THREAD) self.kevt_streamc.add_kevent_filter(ENUM_IMAGE) self.kevt_streamc.add_kevent_filter(REG_CREATE_KCB) self.kevt_streamc.add_kevent_filter(REG_DELETE_KCB) # these kevents are necessary for consistent state # of the trace. 
            # If the user doesn't include them in the filter list, we add
            # them anyway but mark the kernel event type as not eligible
            # for rendering
            if KEvents.CREATE_PROCESS not in kevent_filters:
                self.kevt_streamc.add_kevent_filter(CREATE_PROCESS)
                self.requires_render[CREATE_PROCESS] = False
            else:
                self.requires_render[CREATE_PROCESS] = True

            if KEvents.CREATE_THREAD not in kevent_filters:
                self.kevt_streamc.add_kevent_filter(CREATE_THREAD)
                self.requires_render[CREATE_THREAD] = False
            else:
                self.requires_render[CREATE_THREAD] = True

            if KEvents.CREATE_FILE not in kevent_filters:
                self.kevt_streamc.add_kevent_filter(CREATE_FILE)
                self.requires_render[CREATE_FILE] = False
            else:
                self.requires_render[CREATE_FILE] = True

            for kevent_filter in kevent_filters:
                ktuple = kname_to_tuple(kevent_filter)
                if isinstance(ktuple, list):
                    for kt in ktuple:
                        self.kevt_streamc.add_kevent_filter(kt)
                        if kt not in self.requires_render:
                            self.requires_render[kt] = True
                else:
                    self.kevt_streamc.add_kevent_filter(ktuple)
                    if ktuple not in self.requires_render:
                        self.requires_render[ktuple] = True

    def _on_next_kevent(self, ktype, cpuid, ts, kparams):
        """Callback which fires when a new kernel event arrives.

        This callback is invoked for every new kernel event
        forwarded from the kernel stream collector.

        Parameters
        ----------
        ktype: tuple
            Kernel event type.
        cpuid: int
            Identifies the CPU core where the event has been captured.
        ts: str
            Temporal reference of the kernel event.
        kparams: dict
            Kernel event's parameters.
        """
        # initialize kernel event properties
        self.kevent.ts = ts
        self.kevent.cpuid = cpuid
        self.kevent.name = ktuple_to_name(ktype)
        kparams = ddict(kparams)

        # thread / process kernel events
        if ktype in [CREATE_PROCESS, CREATE_THREAD, ENUM_PROCESS, ENUM_THREAD]:
            self.thread_registry.add_thread(ktype, kparams)
            if ktype in [CREATE_PROCESS, CREATE_THREAD]:
                self.thread_registry.init_thread_kevent(self.kevent, ktype, kparams)
                self._render(ktype)
        elif ktype in [TERMINATE_PROCESS, TERMINATE_THREAD]:
            self.thread_registry.init_thread_kevent(self.kevent, ktype, kparams)
            self._render(ktype)
            self.thread_registry.remove_thread(ktype, kparams)

        # file system / disk kernel events
        elif ktype in [CREATE_FILE, DELETE_FILE, CLOSE_FILE, READ_FILE, WRITE_FILE]:
            self.fsio.parse_fsio(ktype, kparams)
            self._render(ktype)

        # dll kernel events
        elif ktype in [LOAD_IMAGE, ENUM_IMAGE]:
            self.dll_repository.register_dll(kparams)
            if ktype == LOAD_IMAGE:
                self._render(ktype)
        elif ktype == UNLOAD_IMAGE:
            self.dll_repository.unregister_dll(kparams)
            self._render(ktype)

        # registry kernel events
        elif ktype == REG_CREATE_KCB:
            self.hive_parser.add_kcb(kparams)
        elif ktype == REG_DELETE_KCB:
            self.hive_parser.remove_kcb(kparams.key_handle)
        elif ktype in [REG_CREATE_KEY, REG_DELETE_KEY, REG_OPEN_KEY, REG_QUERY_KEY,
                       REG_SET_VALUE, REG_DELETE_VALUE, REG_QUERY_VALUE]:
            self.hive_parser.parse_hive(ktype, kparams)
            self._render(ktype)

        # network kernel events
        elif ktype in [SEND_SOCKET_TCPV4, SEND_SOCKET_UDPV4,
                       RECV_SOCKET_TCPV4, RECV_SOCKET_UDPV4,
                       ACCEPT_SOCKET_TCPV4, CONNECT_SOCKET_TCPV4,
                       DISCONNECT_SOCKET_TCPV4, RECONNECT_SOCKET_TCPV4]:
            self.tcpip_parser.parse_tcpip(ktype, kparams)
            self._render(ktype)

        if self._filament:
            # let the filament process the next
            # kernel event from the stream
            if ktype not in [ENUM_PROCESS, ENUM_THREAD, ENUM_IMAGE]:
                if self.kevent.name:
                    self._filament.process(self.kevent)

    def _render(self, ktype):
        """Renders the kevent to the standard output stream.

        Parameters
        ----------
        ktype: tuple
            Identifier of the kernel event
        """
        if not self._filament:
            if ktype in self.requires_render:
                rr = self.requires_render[ktype]
                if rr:
                    self.kevent.render()
            elif self.filters_count == 0:
                self.kevent.render()
    elif soort == 'event':
        p.events.append((dt.datetime.today().isoformat(' ')[:19], data))
    elif soort == 'statuscode':
        p.status = data
    elif soort == 'arch':
        p.set_arch(data)
    list(p)
    if update:
        p.write()
    return p


if __name__ == "__main__":
    fnm = "afrift"
    log_handler = FileHandler('get_acties_sql_1.log', mode='w')
    with log_handler.applicationbound():
        test_get_acties(fnm, {}, "")
        test_get_acties(fnm, {"idlt": "2010"}, "")
        test_get_acties(fnm, {"idlt": "2010", "id": "and", "idgt": "2007-0003"}, "")
        test_get_acties(fnm, {"idgt": "2010", "id": "or", "idlt": "2007-0003"}, "")
        test_get_acties(fnm, {"idgt": "2007-0003"}, "")
        test_get_acties(fnm, {"status": ["1"]}, "")
        test_get_acties(fnm, {"status": ["1", "2"]}, "")
from logbook import FileHandler
from logbook import Logger
from argparse import ArgumentParser
import sys

parser = ArgumentParser()
logpath = './log/'
parser.add_argument('--log', nargs=1, help='log path')
parser.add_argument('--version', nargs=1, help='maintain version')
args = parser.parse_args(sys.argv[1:])

logfilepath = logpath + args.log[0]
maintain_version = args.version[0]

log_handler = FileHandler(logfilepath)
logbk = Logger('Token Maintain')

with log_handler.applicationbound():
    logbk.info('maintain prepare')

    at_least = AT_LEAST_TOKEN_COUNT
    max_tokens_redis_limit = MAX_TOKENS_IN_REDIS

    logbk.info('maintain begin')

    # Authorize new users and add their access_token to MongoDB; Redis then imports
    # the new tokens from MongoDB without resetting the req_count of existing tokens
    if maintain_version == 'addatoken':
        print 'generate new token, write to mongo, push to redis without reset request count'
        generate_api_access_token(logbk)
        add_without_reset_req_count(max_tokens_redis_limit, logbk)

    # Add all access_tokens from MongoDB to Redis and reset the req_count of existing tokens
    if maintain_version == 'addalltoken':
""" import os import sys from logbook import Processor, StreamHandler, DEBUG, Logger, FileHandler my_handler = FileHandler("test.log", encoding="utf-8", level=DEBUG) # my_handler = StreamHandler(sys.stdout, level=DEBUG) def log_other_info(record): """ a) 通过 with.processor可以让在其中的日志拥有共同的逻辑,相当于一个切面注入 比如这里的例子是 在每条日志中记录一些额外的信息(额外的信息是通过在日志对象(logRecord)的extra(字典对象)属性中添加 一些其他的信息),这样每条日志都会有这里添加的额外的信息。 b) 有个疑问就是,这些额外的信息怎么运用呢,比如这些信息如何能和日志一块记录在文件中呢 c) 关于日志的属性,见 logrecord.py """ record.extra['myname'] = 'kute' record.extra['mycwd'] = os.getcwd() # update myname propertiy record.extra.update(myname="lisa") print(record.to_dict()) if __name__ == "__main__": with my_handler.applicationbound(): with Processor(log_other_info).applicationbound(): mylog = Logger("processor") mylog.notice("notice msg.")
def main(): """Shows basic usage of the Google Drive API. Creates a Google Drive API service object and outputs the names and IDs for up to 10 files. """ log_filename = os.path.join( args.log_dir, 'google-drive-to-s3-{}.log'.format(os.path.basename(time.strftime('%Y%m%d-%H%M%S'))) ) # register some logging handlers log_handler = FileHandler( log_filename, mode='w', level=args.log_level, bubble=True ) stdout_handler = StreamHandler(sys.stdout, level=args.log_level, bubble=True) with stdout_handler.applicationbound(): with log_handler.applicationbound(): log.info("Arguments: {}".format(args)) start = time.time() log.info("starting at {}".format(time.strftime('%l:%M%p %Z on %b %d, %Y'))) credentials = get_credentials() http = credentials.authorize(httplib2.Http()) drive_service = discovery.build('drive', 'v3', http=http) s3 = boto3.resource('s3') # load up a match file if we have one. if args.match_file: with open(args.match_file, 'r') as f: match_filenames = f.read().splitlines() else: match_filenames = None # get the files in the specified folder. files = drive_service.files() request = files.list( pageSize=args.page_size, q="'{}' in parents".format(args.folder_id), fields="nextPageToken, files(id, name)" ) # make sure our S3 Key prefix has a trailing slash key_prefix = ensure_trailing_slash(args.key_prefix) page_counter = 0 file_counter = 0 while request is not None: file_page = request.execute(http=http) page_counter += 1 page_file_counter = 0 # reset the paging file counter # determine the page at which to start processing. if page_counter >= args.start_page: log.info(u"######## Page {} ########".format(page_counter)) for this_file in file_page['files']: file_counter += 1 page_file_counter += 1 if we_should_process_this_file(this_file['name'], match_filenames): log.info(u"#== Processing {} file number {} on page {}. {} files processed.".format( this_file['name'], page_file_counter, page_counter, file_counter )) # download the file download_request = drive_service.files().get_media(fileId=this_file['id']) fh = io.BytesIO() # Using an in memory stream location downloader = MediaIoBaseDownload(fh, download_request) done = False pbar = InitBar(this_file['name']) while done is False: status, done = downloader.next_chunk() pbar(int(status.progress()*100)) # print("\rDownload {}%".format(int(status.progress() * 100))) del pbar # upload to bucket log.info(u"Uploading to S3") s3.Bucket(args.bucket).put_object( Key="{}{}".format(key_prefix, this_file['name']), Body=fh.getvalue(), ACL='public-read' ) log.info(u"Uploaded to S3") fh.close() # close the file handle to release memory else: log.info(u"Do not need to process {}".format(this_file['name'])) # stop if we have come to the last user specified page if args.end_page and page_counter == args.end_page: log.info(u"Finished paging at page {}".format(page_counter)) break # request the next page of files request = files.list_next(request, file_page) log.info("Running time: {}".format(str(datetime.timedelta(seconds=(round(time.time() - start, 3)))))) log.info("Log written to {}:".format(log_filename))
class Fibratus(): """Fibratus entrypoint. Setup the core components including the kernel event stream collector and the tracing controller. At this point the system handles are also being enumerated. """ def __init__(self, filament): self.logger = Logger(Fibratus.__name__) self.file_handler = FileHandler(os.path.join(os.path.abspath(__file__), '..', '..', '..', 'fibratus.log'), mode='w+') self.kevt_streamc = KEventStreamCollector( etw.KERNEL_LOGGER_NAME.encode()) self.kcontroller = KTraceController() self.ktrace_props = KTraceProps() self.ktrace_props.enable_kflags() self.ktrace_props.logger_name = etw.KERNEL_LOGGER_NAME self.handle_repository = HandleRepository() self._handles = [] # query for handles on the # start of kernel trace with self.file_handler.applicationbound(): self.logger.info('Starting fibratus...') self.logger.info('Enumerating system handles...') self._handles = self.handle_repository.query_handles() self.logger.info('%s handles found' % len(self._handles)) self.handle_repository.free_buffers() self.thread_registry = ThreadRegistry(self.handle_repository, self._handles) self.kevent = KEvent(self.thread_registry) self._filament = filament self.fsio = FsIO(self.kevent, self._handles) self.hive_parser = HiveParser(self.kevent, self.thread_registry) self.tcpip_parser = TcpIpParser(self.kevent) self.dll_repository = DllRepository(self.kevent) self.requires_render = {} self.filters_count = 0 def run(self): @atexit.register def _exit(): self.stop_ktrace() self.kcontroller.start_ktrace(etw.KERNEL_LOGGER_NAME, self.ktrace_props) def on_kstream_open(): if self._filament is None: IO.write_console('Done! ') self.kevt_streamc.set_kstream_open_callback(on_kstream_open) self._open_kstream() def _open_kstream(self): try: self.kevt_streamc.open_kstream(self._on_next_kevent) except Exception as e: with self.file_handler.applicationbound(): self.logger.error(e) except KeyboardInterrupt: self.stop_ktrace() def stop_ktrace(self): IO.write_console('Stopping fibratus...') if self._filament: self._filament.close() self.kcontroller.stop_ktrace(self.ktrace_props) self.kevt_streamc.close_kstream() def add_filters(self, kevent_filters): if len(kevent_filters) > 0: self.filters_count = len(kevent_filters) # include the basic filters # that are essential to the # rest of kernel events self.kevt_streamc.add_kevent_filter(ENUM_PROCESS) self.kevt_streamc.add_kevent_filter(ENUM_THREAD) self.kevt_streamc.add_kevent_filter(ENUM_IMAGE) self.kevt_streamc.add_kevent_filter(REG_CREATE_KCB) self.kevt_streamc.add_kevent_filter(REG_DELETE_KCB) # these kevents are necessary for consistent state # of the trace. 
If the user doesn't include them # in a filter list, then we do the job but set the # kernel event type as not eligible for rendering if not KEvents.CREATE_PROCESS in kevent_filters: self.kevt_streamc.add_kevent_filter(CREATE_PROCESS) self.requires_render[CREATE_PROCESS] = False else: self.requires_render[CREATE_PROCESS] = True if not KEvents.CREATE_THREAD in kevent_filters: self.kevt_streamc.add_kevent_filter(CREATE_THREAD) self.requires_render[CREATE_THREAD] = False else: self.requires_render[CREATE_THREAD] = True if not KEvents.CREATE_FILE in kevent_filters: self.kevt_streamc.add_kevent_filter(CREATE_FILE) self.requires_render[CREATE_FILE] = False else: self.requires_render[CREATE_FILE] = True for kevent_filter in kevent_filters: ktuple = kname_to_tuple(kevent_filter) if isinstance(ktuple, list): for kt in ktuple: self.kevt_streamc.add_kevent_filter(kt) if not kt in self.requires_render: self.requires_render[kt] = True else: self.kevt_streamc.add_kevent_filter(ktuple) if not ktuple in self.requires_render: self.requires_render[ktuple] = True def _on_next_kevent(self, ktype, cpuid, ts, kparams): """Callback which fires when new kernel event arrives. This callback is invoked for every new kernel event forwarded from the kernel stream collector. Parameters ---------- ktype: tuple Kernel event type. cpuid: int Indentifies the CPU core where the event has been captured. ts: str Temporal reference of the kernel event. kparams: dict Kernel event's parameters. """ # initialize kernel event properties self.kevent.ts = ts self.kevent.cpuid = cpuid self.kevent.name = ktuple_to_name(ktype) kparams = ddict(kparams) # thread / process kernel events if ktype in [CREATE_PROCESS, CREATE_THREAD, ENUM_PROCESS, ENUM_THREAD]: self.thread_registry.add_thread(ktype, kparams) if ktype in [CREATE_PROCESS, CREATE_THREAD]: self.thread_registry.init_thread_kevent( self.kevent, ktype, kparams) self._render(ktype) elif ktype in [TERMINATE_PROCESS, TERMINATE_THREAD]: self.thread_registry.init_thread_kevent(self.kevent, ktype, kparams) self._render(ktype) self.thread_registry.remove_thread(ktype, kparams) # file system/disk kernel events elif ktype in [ CREATE_FILE, DELETE_FILE, CLOSE_FILE, READ_FILE, WRITE_FILE ]: self.fsio.parse_fsio(ktype, kparams) self._render(ktype) # dll kernel events elif ktype in [LOAD_IMAGE, ENUM_IMAGE]: self.dll_repository.register_dll(kparams) if ktype == LOAD_IMAGE: self._render(ktype) elif ktype == UNLOAD_IMAGE: self.dll_repository.unregister_dll(kparams) self._render(ktype) # registry kernel events elif ktype == REG_CREATE_KCB: self.hive_parser.add_kcb(kparams) elif ktype == REG_DELETE_KCB: self.hive_parser.remove_kcb(kparams.key_handle) elif ktype in [ REG_CREATE_KEY, REG_DELETE_KEY, REG_OPEN_KEY, REG_QUERY_KEY, REG_SET_VALUE, REG_DELETE_VALUE, REG_QUERY_VALUE ]: self.hive_parser.parse_hive(ktype, kparams) self._render(ktype) # network kernel events elif ktype in [ SEND_SOCKET_TCPV4, SEND_SOCKET_UDPV4, RECV_SOCKET_TCPV4, RECV_SOCKET_UDPV4, ACCEPT_SOCKET_TCPV4, CONNECT_SOCKET_TCPV4, DISCONNECT_SOCKET_TCPV4, RECONNECT_SOCKET_TCPV4 ]: self.tcpip_parser.parse_tcpip(ktype, kparams) self._render(ktype) if self._filament: # call filament method # to process the next # kernel event from the stream if ktype not in [ENUM_PROCESS, ENUM_THREAD, ENUM_IMAGE]: if self.kevent.name: self._filament.process(self.kevent) def _render(self, ktype): """Renders the kevent to the standard output stream. 
Parameters ---------- ktype: tuple Identifier of the kernel event """ if not self._filament: if ktype in self.requires_render: rr = self.requires_render[ktype] if rr: self.kevent.render() elif self.filters_count == 0: self.kevent.render()
def main(): """ Copy a folder from Source to Target """ log_filename = os.path.join( args.log_dir, 'copy-google-drive-folder-{}.log'.format( os.path.basename(time.strftime('%Y%m%d-%H%M%S')))) # register some logging handlers log_handler = FileHandler(log_filename, mode='w', level=args.log_level, bubble=True) stdout_handler = StreamHandler(sys.stdout, level=args.log_level, bubble=True) with stdout_handler.applicationbound(): with log_handler.applicationbound(): log.info("Arguments: {}".format(args)) start = time.time() log.info("starting at {}".format( time.strftime('%l:%M%p %Z on %b %d, %Y'))) credentials = get_credentials() http = credentials.authorize(httplib2.Http()) drive_service = discovery.build('drive', 'v3', http=http) # get the files in the specified folder. files = drive_service.files() request = files.list( pageSize=args.page_size, q="'{}' in parents".format(args.source_folder_id), fields="nextPageToken, files(id, name, mimeType)") page_counter = 0 file_counter = 0 while request is not None: file_page = request.execute(http=http) page_counter += 1 page_file_counter = 0 # reset the paging file counter # determine the page at which to start processing. if page_counter >= args.start_page: log.info(u"######## Page {} ########".format(page_counter)) for this_file in file_page['files']: file_counter += 1 page_file_counter += 1 log.info( u"#== Processing {} {} file number {} on page {}. {} files processed." .format(this_file['mimeType'], this_file['name'], page_file_counter, page_counter, file_counter)) # if not a folder if this_file[ 'mimeType'] != 'application/vnd.google-apps.folder': # Copy the file new_file = {'title': this_file['name']} copied_file = drive_service.files().copy( fileId=this_file['id'], body=new_file).execute() # move it to it's new location drive_service.files().update( fileId=copied_file['id'], addParents=args.target_folder_id, removeParents=args.source_folder_id).execute() else: log.info(u"Skipped Folder") else: log.info(u"Skipping Page {}".format(page_counter)) # stop if we have come to the last user specified page if args.end_page and page_counter == args.end_page: log.info( u"Finished paging at page {}".format(page_counter)) break # request the next page of files request = files.list_next(request, file_page) log.info("Running time: {}".format( str(datetime.timedelta(seconds=(round(time.time() - start, 3)))))) log.info("Log written to {}:".format(log_filename))
def main(): """ Copy a folder from Source to Target """ log_filename = os.path.join( args.log_dir, 'copy-google-drive-folder-{}.log'.format(os.path.basename(time.strftime('%Y%m%d-%H%M%S'))) ) # register some logging handlers log_handler = FileHandler( log_filename, mode='w', level=args.log_level, bubble=True ) stdout_handler = StreamHandler(sys.stdout, level=args.log_level, bubble=True) with stdout_handler.applicationbound(): with log_handler.applicationbound(): log.info("Arguments: {}".format(args)) start = time.time() log.info("starting at {}".format(time.strftime('%l:%M%p %Z on %b %d, %Y'))) credentials = get_credentials() http = credentials.authorize(httplib2.Http()) drive_service = discovery.build('drive', 'v3', http=http) # get the files in the specified folder. files = drive_service.files() request = files.list( pageSize=args.page_size, q="'{}' in parents".format(args.source_folder_id), fields="nextPageToken, files(id, name, mimeType)" ) page_counter = 0 file_counter = 0 while request is not None: file_page = request.execute(http=http) page_counter += 1 page_file_counter = 0 # reset the paging file counter # determine the page at which to start processing. if page_counter >= args.start_page: log.info(u"######## Page {} ########".format(page_counter)) for this_file in file_page['files']: file_counter += 1 page_file_counter += 1 log.info(u"#== Processing {} {} file number {} on page {}. {} files processed.".format( this_file['mimeType'], this_file['name'], page_file_counter, page_counter, file_counter )) # if not a folder if this_file['mimeType'] != 'application/vnd.google-apps.folder': # Copy the file new_file = {'title': this_file['name']} copied_file = drive_service.files().copy(fileId=this_file['id'], body=new_file).execute() # move it to it's new location drive_service.files().update( fileId=copied_file['id'], addParents=args.target_folder_id, removeParents=args.source_folder_id ).execute() else: log.info(u"Skipped Folder") else: log.info(u"Skipping Page {}".format(page_counter)) # stop if we have come to the last user specified page if args.end_page and page_counter == args.end_page: log.info(u"Finished paging at page {}".format(page_counter)) break # request the next page of files request = files.list_next(request, file_page) log.info("Running time: {}".format(str(datetime.timedelta(seconds=(round(time.time() - start, 3)))))) log.info("Log written to {}:".format(log_filename))
        for k, v in headers.iteritems():
            self.req.add_header(k, v)

    def set_req(self):
        self.req = urllib2.Request(self.url, urllib.urlencode(self.data))
        # self.req = urllib2.Request(self.url)

    def send(self):
        self.set_req()
        return urllib2.urlopen(self.req)


if __name__ == "__main__":
    logger = Logger("TicketchangeToInfluxdb")
    logfile = "ticketchangetoinfluxdb.log"
    fh = FileHandler(logfile, "a")
    fh.applicationbound()
    fh.push_application()

    client = Client()
    client.test()
    adapter = Adapter()
    client.set_adapter(adapter)
    a = client.get_adapter()
    a.test()

    print("This is just a test.")
    logger.info("Testing logging.")
#!/bin/env python

import os, sys, urllib, urllib2, time
from logbook import Logger, FileHandler

user = "******"
token = "password"

message = bytes(user).encode('utf-8')
secret = bytes(token).encode('utf-8')

logger = Logger("Cache Purge")
logfile = "cache-purge.log"
fh = FileHandler(logfile, "a")
fh.applicationbound()
fh.push_application()

api_root = "https://api.ccu.akamai.com"
get_call = "/ccu/v2/queues/default"
# data = {}

try:
    req = None
    url = api_root + get_call
    mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    mgr.add_password(None, api_root, user, token)
    handler = urllib2.HTTPBasicAuthHandler(mgr)
    opener = urllib2.build_opener(handler)
    urllib2.install_opener(opener)
    req = urllib2.Request(url)
    # req = urllib2.Request(api_root, urllib.urlencode(data))
class Pipeline(object):
    """Represents the abstraction of a pipeline of jobs to be run
    distributed over machines
    """

    def __init__(self, workdir, jobs, total_cores, scheduler=None,
                 queue=None, local=False, retries=None):
        """Initialize a pipeline.

        :param workdir: Name of a directory to use for scratch space
        and results. This needs to be visible to all nodes over NFS or
        similar.

        :param jobs: A list of jobs, which are just dicts. The only
        required key for now is "description", which will be used for
        the directory that holds all this job's output.

        :param total_cores: The total number of cores you want to use
        for processing.

        :returns: A Pipeline object, which has methods that invoke
        various kinds of distributed work.
        """
        # validate things
        for job in jobs:
            if type(job) is not dict:
                raise ValueError("job is not a dict: {}".format(job))
            if not job.get("description"):
                raise ValueError("job {} has no description".format(job))
        workdir = os.path.abspath(os.path.expanduser(workdir))
        if not os.path.exists(workdir):
            raise ValueError("workdir: {} appears not to exist".format(workdir))
        self.workdir = workdir
        self.jobs = jobs
        self.total_cores = total_cores
        self.scheduler = scheduler
        self.queue = queue
        self.local = local
        self.retries = retries
        # setup default cluster_view
        self._cluster_view = cluster_view

    def start(self):
        """Initialize workdir, logging, etc. in preparation for running jobs.
        """
        # make a working directory for each job
        for job in self.jobs:
            job["workdir"] = os.path.join(self.workdir, job["description"])
            fs.maybe_mkdir(job["workdir"])
        # temporary ipython profile directory
        self.ipythondir = os.path.join(self.workdir, ".ipython")
        fs.maybe_mkdir(self.ipythondir)
        # log dir
        self.logdir = os.path.join(self.workdir, "log")
        fs.maybe_mkdir(self.logdir)

        # determine which IP we are going to listen on for logging
        try:
            self.listen_ip = localinterfaces.public_ips()[0]
        except:
            raise ValueError("This machine appears not to have"
                             " any publicly visible IP addresses")

        # setup ZMQ logging
        self.handler = FileHandler(os.path.join(self.logdir, "dish.log"))
        self.listen_port = str(randint(5000, 10000))
        self.subscriber = ZeroMQPullSubscriber("tcp://" + self.listen_ip +
                                               ":" + self.listen_port)
        self.controller = self.subscriber.dispatch_in_background(self.handler)
        self.logger = Logger("dish_master")

    def stop(self):
        """Gracefully shutdown the Pipeline, cleaning up threads, sockets,
        etc. Leaves the working directory intact so everything can in
        principle be picked up again where we left off.
        """
        self.controller.stop()
        self.subscriber.close()

    def _compute_resources(self, cores_per_engine, mem_per_engine, max_engines):
        if cores_per_engine > self.total_cores:
            raise ValueError("A job requested {0} cores but only {1}"
                             " are available.".format(cores_per_engine,
                                                      self.total_cores))
        num_engines = self.total_cores // cores_per_engine
        if len(self.jobs) < num_engines:
            # we don't even need this many engines
            num_engines = len(self.jobs)
        if max_engines:
            num_engines = min(num_engines, max_engines)
        # TODO in the future, should maybe validate that requested
        # cores and memory are actually going to be available. This
        # would unfortunately have to be specialized for each
        # scheduler probably.
        return num_engines, cores_per_engine, mem_per_engine

    @contextmanager
    def group(self, cores=1, mem="0.1", max=None):
        """Context manager for "grouping" a set of pipeline operations.

        A group of operations is run on the same ipython cluster and has
        its resources specified in the group as opposed to in each
        individual job.
        This is useful if there is some small amount of setup work that
        isn't worth spinning up a new cluster for but which needs to be
        done before a resource intensive task. For example::

            with p.group(cores=8, mem=12):
                p.run("setup.sh . . .")  # do some data munging or other setup
                p.run("main_work -n 8 . . .")  # call an expensive program

        """
        # TODO this duplicates some code from p.map and is a bit
        # clunky, there is probably a better abstraction here
        engines, cores, mem = self._compute_resources(cores, mem, max)
        extra_params = {"run_local": self.local, "mem": mem}
        old_view_factory = self._cluster_view
        cm = self._cluster_view(self.scheduler, self.queue,
                                engines, profile=self.ipythondir,
                                cores_per_job=cores,
                                extra_params=extra_params,
                                retries=self.retries)
        view = cm.gen.next()

        @contextmanager
        def reuse_view(*args, **kwargs):
            yield view

        # everything done in the block will use the view we just made
        self._cluster_view = reuse_view
        try:
            yield
        finally:
            # restore the normal cluster_view context manager on exit
            self._cluster_view = old_view_factory
            try:
                cm.gen.next()  # clean up the view we've been using
            except StopIteration:
                pass

    def _transaction_filter(self, targets):
        """Filter the `jobs` appropriately based on whether `targets` is a
        function, str, or list of str"""
        # TODO there has got to be a better way to do this -____-
        to_run = []
        dont_run = []
        if callable(targets):
            f = targets
            for job in self.jobs:
                if f(job):
                    dont_run.append(job)
                else:
                    to_run.append(job)
            return to_run, dont_run
        elif isinstance(targets, str):
            targets = [targets]
        elif not isinstance(targets, list):
            raise TypeError("transaction targets must be list, str, or callable")
        for job in self.jobs:
            canonical_targets = fs.canonicalize(job, targets)
            if all((os.path.exists(target) for target in canonical_targets)):
                info = ("Skipping transaction for job {}; targets {} are "
                        "already present")
                with self.handler.applicationbound():
                    self.logger.info(info.format(job["description"],
                                                 canonical_targets))
                dont_run.append(job)
            else:
                # targets not present for this job
                to_run.append(job)
        return to_run, dont_run

    @contextmanager
    def transaction(self, targets):
        """Do some work "transactionally", in the sense that nothing done
        inside a ``transaction`` block will be "committed" to the workdir
        unless it all succeeds without error. The work done inside a
        transaction is also idempotent in that you must specify a ``target``
        file or files for the transaction and it will not be run if the
        target exists already. This is perhaps best illustrated by a simple
        example::

            with p.transaction("{workdir}/example.txt"):
                p.run("{tmpdir}/touch example.txt")

        This will result in a file ``example.txt`` in each job's ``workdir``.
        The creation of this file will be skipped if the code is run again
        and the file already exists. This is obviously a silly example, but
        the code inside the `with` block can be any arbitrarily complex
        series of operations which produces a set of target output files at
        the end. This is a powerful feature in that it allows pipelines to
        be restartable: if a pipeline crashes for some reason but you have
        its major sections wrapped in ``transaction`` blocks, you can simply
        run it again and pick up where you left off without redoing any
        work. The transaction blocks guarantee that the ``workdir`` for each
        job is never in an inconsistent state and that work that's already
        been completed isn't redone.

        Inside a transaction, each job has a special ``tmpdir`` key, whose
        value is the path to a unique temporary directory for the job.
        You can do work that produces files inside the ``tmpdir`` and expect
        everything in it to be moved to the job's ``workdir`` if the
        transaction completes without error. The ``tmpdir`` will be removed
        at the end of the transaction regardless of whether or not it
        succeeds.

        We change directories to the ``tmpdir`` before doing anything else
        and implicitly consider targets to be relative to a job's
        ``workdir``, so the above example could also be written::

            with p.transaction("example.txt"):
                p.run("touch example.txt")

        which sacrifices explicitness for brevity.

        :param targets: a string or list of strings describing files that
        must exist in order for the transaction to be skipped.
        """
        to_run, dont_run = self._transaction_filter(targets)
        for job in to_run:
            job["tmpdir"] = tempfile.mkdtemp(dir=job["workdir"])
        self.jobs = to_run
        try:
            yield
        finally:
            for job in self.jobs:
                if not os.path.exists(os.path.join(job["tmpdir"], ".error")):
                    fs.liftdir(job["tmpdir"], job["workdir"])
                shutil.rmtree(job["tmpdir"])
                del job["tmpdir"]
            self.jobs = dont_run + self.jobs

    def localmap(self, f):
        """Just like ``map``, but work locally rather than launching an
        ipython cluster. This is useful for tasks where the cluster launch
        overhead would swamp the cost of the actual work to be done.

        :param f: function of ``(job, logger)`` to be mapped over all jobs.
        """
        self.jobs = map(logging_wrapper,
                        self.jobs,
                        (f for j in self.jobs),
                        (self.listen_ip for j in self.jobs),
                        (self.listen_port for j in self.jobs))

    def map(self, f, cores=1, mem="0.1", max=None):
        """Map the function ``f`` over all of the ``jobs`` in this pipeline.

        ``f`` must be a function of two arguments, the job and a logger. It
        should modify the job it is passed, which will then be returned over
        the wire. A silly example::

            def f(job, logger):
                job["capitalized_description"] = job["description"].upper()
            p.map(f)

        will give each ``job`` in the pipeline a ``capitalized_description``
        attribute, which can then be used in future pipeline operations.

        ``cores`` and ``mem`` are used to specify the cores and memory
        required by this step; they will be passed to the underlying
        scheduler. ``max`` can be used as a hard limit on the number of jobs
        to run. This is useful if, for example, a particular task puts
        pressure on some sort of storage system (a distributed file system,
        object store, etc.) that you know will fail under too much load.

        :param f: function of ``(job, logger)`` to be mapped over all jobs.
        :param cores: cores required by this call.
        :param mem: memory required by this call.
        :param max: maximum number of jobs to submit.
        """
        if not self.jobs:
            # this looks very odd; it's necessary because sometimes being in
            # a transaction causes self.jobs to be empty, and IPython throws
            # errors if you try to map over the empty list. It might be
            # cleaner to catch the error after letting IPython do the map;
            # will have to think about it.
            return
        engines, cores, mem = self._compute_resources(cores, mem, max)
        extra_params = {"run_local": self.local, "mem": mem}
        with self._cluster_view(self.scheduler, self.queue,
                                engines, profile=self.ipythondir,
                                cores_per_job=cores,
                                extra_params=extra_params,
                                retries=self.retries) as view:
            # using cloudpickle allows us to serialize all sorts of things
            # we wouldn't otherwise be able to
            dview = view.client.direct_view()
            use_cloudpickle()
            dview.apply(use_cloudpickle)
            self.jobs = view.map_sync(logging_wrapper,
                                      self.jobs,
                                      (f for j in self.jobs),
                                      (self.listen_ip for j in self.jobs),
                                      (self.listen_port for j in self.jobs))

    def run(self, template, capture_in=None, **kwargs):
        """Run the ``template`` formatted with the contents of each job.

        Example::

            p.run("touch {workdir}/example.txt")

        will make an example.txt file in each job's workdir. ``cores`` and
        ``mem`` mean the same thing they do in the ``map`` method. If a
        string is passed for ``capture_in``, the stdout of the command will
        be captured in ``job[capture_in]`` for each job.
        """
        runner = cmdrunner(template, capture_in)
        self.map(runner, **kwargs)
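# A hedged usage sketch of the Pipeline class above; not part of the original
# snippet. The workdir, job descriptions, input file, and shell command are
# invented for illustration, and it assumes the fs/cluster_view helpers the
# class relies on are importable from the same package.
jobs = [{"description": "sample1"}, {"description": "sample2"}]
p = Pipeline(workdir="~/pipeline_scratch", jobs=jobs, total_cores=8, local=True)
p.start()                                  # create per-job workdirs and ZMQ logging
try:
    with p.transaction("counts.txt"):      # skipped when counts.txt already exists
        # each job's template is formatted with its own workdir/tmpdir
        p.run("wc -l {workdir}/input.txt > {tmpdir}/counts.txt", cores=1)
finally:
    p.stop()                               # shut down the logging subscriber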