def __init__(self, config, label, site, login, password, folder, dest_root):
    self.config = config
    self._label = label
    self._site = site
    self._login = login
    self._password = password
    self._folder = folder
    self._dest_root = dest_root
    u.ensure_path(self._dest_root)
    self._ftp = None
    self._splt = re.compile(r'\s+')
    self._list_file_name = '_ftp_dirlist.txt'
    self._dir_list = self.config.admin + '/' + self._label + self._list_file_name
    self._download_count = 0
    self._download_limit = self.config.get_int('input', 'download_limit',
                                               default=0)
    # this is used to store/persist metadata about directly downloaded files.
    # key has no path.
    self._local_files = {}
    # this is populated by TreeProcessor
    self.rules_run_files = []
    self._re_include = []
    self._re_exclude = []
    re_include_files = self.config.get_multi('input', 'include_files')
    for pattern in re_include_files:
        self._re_include.append(re.compile(pattern))
    re_exclude_files = self.config.get_multi('input', 'exclude_files')
    for pattern in re_exclude_files:
        self._re_exclude.append(re.compile(pattern))
def preprocess_ast(self, force_rewrite=False, path=None):
    '''
    :param force_rewrite: if True, remove any previously extracted sources and
        regenerate them; if False, existing files on disk are kept
    :param path: target folder (defaults to ./data/code/tmp)
    :return: preprocessed path contexts
    '''
    if not path:
        path = Path.cwd() / "data" / "code" / "tmp"
    if force_rewrite:
        remove_folder(path / "sources")
    sources_path = path / "sources"
    ensure_path(sources_path)
    result_path = path / "contexts"
    ensure_path(result_path)
    # TODO: check weird race condition
    # print(os.path.exists(path / "sources"))
    # print(os.path.exists(path / "contexts"))
    self._extract_ast(sources_path=sources_path, result_path=result_path,
                      force_rewrite=force_rewrite)
    result_path /= "cpp"
    return self._preprocess_path_contexts(result_path)
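# Hedged usage sketch for preprocess_ast: the owning class and its constructor
# are not shown in this snippet, so `pre` and its arguments are hypothetical.
#
#   pre = SomePreprocessor(args, data)
#   contexts = pre.preprocess_ast(force_rewrite=True)   # regenerate from scratch
#   contexts = pre.preprocess_ast(path=Path.cwd() / "data" / "code" / "run2")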
def copy_with_metadata(self, finfo, dest):
    (dest_dir, dest_file) = os.path.split(dest)
    # prohibit destinations outside our output root
    rel_path = u.make_rel_path(self.config.output, dest_dir, strict=False)
    if not rel_path:
        logging.warning("disallowing copy dest '%s', outside of output_root '%s'" %
                        (dest_dir, self.config.output))
        return None
    # no self-copy
    if dest == finfo['full']:
        logging.warning("not copying file '%s' onto itself!" % dest)
        return None
    u.ensure_path(dest_dir)
    Config.log("%s to %s" % (finfo['full'], dest), tag='COPY_WITH_METADATA')
    if not os.path.exists(finfo['full']):
        msg = "file '%s' does not exist" % finfo['full']
        Config.log(msg, tag='COPY_WITH_METADATA_ERROR')
        return None
    shutil.copyfile(finfo['full'], dest)
    # set metadata for new file
    local = u.local_metadata(dest_dir, dest_file)
    newfi = copy.deepcopy(finfo)
    # TODO replace with unified metadata-copy system
    newfi['name'] = dest_file
    newfi['path'] = dest_dir
    newfi['full'] = dest
    newfi['size'] = local['size']
    newfi['modified'] = local['modified']
    # clear transient metadata not applicable to new file
    u.remove_no_copy_metadata(newfi)
    newfi['rules_run'] = False
    return newfi
def captionize(interp, finfo, argstr):
    try:
        interpret = u.interpret_method(interp)
        argstr = u.debracket(argstr, interpret, finfo=finfo)
        args = u.parse_arg_string(argstr)
        src = finfo['full']
        Config.log("src '%s', argstr '%s'" % (src, argstr), tag='CAPTIONIZE')
        if not u.have_required(args, 'dest', 'where', 'font_size', 'bar_size',
                               'pad_x', 'pad_y', 'text'):
            raise Exception("captionize incomplete args '%s'" % argstr)
        dest = args['dest']
        (dest_dir, dest_file) = os.path.split(dest)
        u.ensure_path(dest_dir)
        params = []
        if args['where'] == 'top':
            params.extend(['-gravity', 'northwest'])
        elif args['where'] == 'bottom':
            params.extend(['-gravity', 'southwest'])
        else:
            raise Exception("captionize invalid 'where' arg in " + argstr)
        # TODO: validate colors. See https://www.imagemagick.org/script/color.php
        if 'background_color' in args:
            params.extend(['-background', args['background_color']])
        else:
            params.extend(['-background', 'white'])
        if 'text_color' in args:
            params.extend(['-fill', args['text_color']])
        else:
            params.extend(['-fill', 'black'])
        # TODO: validate font name. "convert -list font" will list them. system dependent.
        if 'font' in args:
            params.extend(['-font', args['font']])
        else:
            params.extend(['-font', 'Helvetica'])
        params.extend(['-pointsize', args['font_size']])
        params.extend(['-splice', '0x' + args['bar_size']])
        x = u.force_sign(args['pad_x'])
        y = u.force_sign(args['pad_y'])
        params.extend(['-annotate', x + y])
        fixed_text = args['text'].replace("'", "\\'")
        params.append('"' + fixed_text + '"')
        call_args = ['convert', src] + params + [dest]
        (returncode, stdout, stderr) = u.run_command(call_args)
        # logging.debug("returncode: %s\nstdout: %s\nstderr: %s" % (returncode, stdout, stderr))
        if returncode == 0:
            # we know the file that was created, so make its metadata now
            newfi = {}
            newfi['parent_full'] = finfo['full']  # provenance
            newfi['name'] = dest_file
            newfi['path'] = dest_dir
            newfi['full'] = dest
            newfi['rules_run'] = False
            newfi.pop('groups', None)
            return {'new_finfo': newfi}
        else:
            logging.error("captionize failed with rc %i, stderr = '%s'" %
                          (returncode, stderr))
    except Exception as exc:
        logging.error("captionize exception '%s'" % str(exc))
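# Standalone sketch of the ImageMagick pipeline captionize builds: splice an
# empty bar onto the image, then annotate text into it. Assumes the `convert`
# CLI is installed; function name and file names here are hypothetical.
import subprocess

def caption_top(src, dest, text, font_size='14', bar_size='24', pad='+4+4'):
    cmd = ['convert', src,
           '-gravity', 'northwest',     # the 'top' placement in captionize
           '-background', 'white',
           '-fill', 'black',
           '-pointsize', font_size,
           '-splice', '0x' + bar_size,  # insert the caption bar above the image
           '-annotate', pad, text,
           dest]
    return subprocess.run(cmd, capture_output=True).returncode == 0

# caption_top('photo.png', 'photo_captioned.png', 'Daily forecast')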
def upload_finfo(self, finfo, content_type=None, extra_args=None):
    key = finfo['key']
    rel_path, _ = os.path.split(key)
    if extra_args is None:
        extra_args = {'Metadata': {'md5': finfo['md5']}}
    if not content_type:
        content_type = mimetypes.MimeTypes().guess_type(finfo['name'])[0]
    extra_args['ContentType'] = content_type
    dest_full = self._file_dest_root + '/' + key
    dest_dir, dest_name = os.path.split(dest_full)
    info = {
        'new': True,
        'name': finfo['name'],
        'rel_path': rel_path,
        'key': key,
        'size': finfo['size'],
        'modified': finfo['modified'],
        'md5': finfo['md5']
    }
    # transfer other metadata
    self.transfer_metadata(finfo, local_root=self.local_root, dest=info)
    self.tree_info[key] = info
    u.ensure_path(dest_dir)
    shutil.copyfile(finfo['full'], dest_full)
def get_screenshot(interp, finfo, argstr):
    try:
        interpret = u.interpret_method(interp)
        argstr = u.debracket(argstr, interpret, finfo=finfo)
        args = u.parse_arg_string(argstr)
        Config.log(argstr, tag='GET_SCREENSHOT')
        if not u.have_required(args, 'url', 'dest', 'height', 'width'):
            raise Exception("get_screenshot incomplete args '%s'" % argstr)
        dest = u.debracket(args['dest'], interpret, finfo=finfo)
        (dest_dir, dest_file) = os.path.split(dest)
        u.ensure_path(dest_dir)
        puppeteer_templates = Config.main.template_root + '/puppeteer'
        template_file = puppeteer_templates + '/get_screenshot.js'
        try:
            template = open(template_file).read()
        except Exception as exc:
            Config.log("'%s' opening template file '%s'" % (str(exc), template_file),
                       tag='GET_SCREENSHOT_ERROR')
            raise
        symbols = {
            'url': args['url'],
            'width': args['width'],
            'height': args['height'],
            'dest': dest
        }
        script = u.debracket(template, interpret, symbols=symbols)
        script_path = Config.main.output + '/tmp'
        u.ensure_path(script_path)
        script_file = script_path + '/get_screenshot.js'
        open(script_file, 'w').write(script)
        working_dir = Config.main.get('tools', 'node_workdir')
        if not os.path.isdir(working_dir):
            err = "get_screenshot: invalid node_workdir '%s'" % working_dir
            logging.error(err)
            raise Exception(err)
        call_args = ['node', script_file]
        (returncode, stdout, stderr) = u.run_command(call_args, working_dir)
        # logging.debug("returncode: %s\nstdout: %s\nstderr: %s" % (returncode, stdout, stderr))
        if returncode == 0:
            # we know the file that was created, so make its metadata now
            newfi = {}
            newfi['parent_full'] = finfo['full']  # provenance
            newfi['name'] = dest_file
            newfi['path'] = dest_dir
            newfi['full'] = dest
            newfi['rules_run'] = False
            newfi.pop('groups', None)
            return {'new_finfo': newfi}
        else:
            Config.log("rc %i, stderr = '%s'" % (returncode, stderr),
                       tag='GET_SCREENSHOT_ERROR')
    except Exception as exc:
        Config.log(str(exc), tag='GET_SCREENSHOT_ERROR')
def test_age_file():
    test = '/home/floeuser/test_age_file'
    dest = test + '/' + 'aged'
    u.ensure_path(dest)
    src = test + '/' + 'foo.bar'
    with open(src, 'w') as fh:
        fh.write('foo.bar')
    for i in range(1, 510):
        start = u.timestamp_now()
        u.age_file(src, dest)
        print("iter %i took %f seconds" % (i, u.timestamp_now() - start))
def archive(self, options=None):
    if options is None:
        options = {}
    if not self.config.archive:
        logging.error("can't archive, no archive_root configured")
        return
    try:
        u.ensure_path(self.config.archive + '/output')
        u.deploy_tree(self.config.output, self.config.archive + '/output',
                      options=options)
    except Exception as exc:
        err = "archive: exception '%s' running rsync" % str(exc)
        logging.error(err)
        raise
    Config.log('', tag='TP_ARCHIVE_COMPLETE')
def _upload(self, src_path, src_name, key, bucket=None, content_type=None,
            extra_args=None):
    full = u.pathify(src_path, src_name)
    if extra_args is None:
        extra_args = {'Metadata': {'md5': u.md5(full)}}
    if not content_type:
        content_type = mimetypes.MimeTypes().guess_type(src_name)[0]
    extra_args['ContentType'] = content_type
    dest_full = self._file_dest_root + '/' + key
    dest_dir, dest_name = os.path.split(dest_full)
    u.ensure_path(dest_dir)
    shutil.copyfile(full, dest_full)
def scale_and_copy(interp, finfo, argstr):
    try:
        interpret = u.interpret_method(interp)
        argstr = u.debracket(argstr, interpret, finfo=finfo)
        args = u.parse_arg_string(argstr)
        src = finfo['full']
        Config.log("src '%s', argstr '%s'" % (src, argstr), tag='SCALE_AND_COPY')
        # image magick infers format from extension
        if not u.have_required(args, 'dest', 'size'):
            raise Exception("scale_and_copy incomplete args '%s'" % argstr)
        if 'larger_dim' in args:
            # TODO: this alternative to 'size' requires getting dims of original
            raise Exception("scale_and_copy: 'larger_dim' not yet supported")
        dest = args['dest']
        size = int(args['size'])
        size_str = "%ix%i" % (size, size)
        # to convert only first frame of animated gif, specify 'file[0]'
        if 'single_frame' in args and args['single_frame']:
            src += '[0]'
        ### TODO more copied stuff from copy_with_metadata!
        (dest_dir, dest_file) = os.path.split(dest)
        u.ensure_path(dest_dir)
        call_args = ['convert', src, '-resize', size_str, dest]
        (returncode, stdout, stderr) = u.run_command(call_args)
        # logging.debug("returncode: %s\nstdout: %s\nstderr: %s" % (returncode, stdout, stderr))
        if returncode == 0:
            # we know the file that was created, so make its metadata now
            newfi = {}
            newfi['parent_full'] = finfo['full']  # provenance
            newfi['name'] = dest_file
            newfi['path'] = dest_dir
            newfi['full'] = dest
            newfi['rules_run'] = False
            newfi.pop('groups', None)
            # add thumb dimensions to metadata
            newfi['width'], newfi['height'] = get_image_size(dest)
            return {'new_finfo': newfi}
        else:
            logging.error("scale_and_copy failed with rc %i, stderr = '%s'" %
                          (returncode, stderr))
            return {}
    except Exception as exc:
        logging.error("scale_and_copy exception '%s'" % str(exc))
        return {}
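# Standalone sketch of the resize command scale_and_copy issues (assumes the
# ImageMagick `convert` CLI is on PATH; file names are illustrative). The
# '[0]' suffix selects only the first frame of an animated GIF, matching the
# 'single_frame' option above.
import subprocess

src, dest, size = 'anim.gif[0]', 'thumb.png', 300
result = subprocess.run(['convert', src, '-resize', '%ix%i' % (size, size), dest],
                        capture_output=True)
# result.returncode == 0 on success, as checked via u.run_command above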
def _extract_frames(src, dest_dir):
    try:
        u.ensure_path(dest_dir)
        (src_path, src_name) = os.path.split(src)
        destspec = dest_dir + '/' + src_name + '_frame_%05d.gif'
        runargs = ['convert', '-coalesce', src, destspec]
        (returncode, stdout, stderr) = u.run_command(runargs)
        logging.debug("convert returncode: %s\nstdout: %s\nstderr: %s" %
                      (returncode, stdout, stderr))
        if returncode == 0:
            return True
        else:
            logging.error("_extract_frames failed with rc %i, stderr = '%s'" %
                          (returncode, stderr))
    except Exception as exc:
        logging.error("_extract_frames src '%s' exception '%s'" % (src, str(exc)))
        raise
def __init__(self, config, dest_mgr=None):
    self.config = config
    # key is rule type, value is array of rules.
    # only rule types so far:
    #   self_tree: rules run against TP's own trees (input and output)
    #   dest: rules run against tree of files deployed on dest
    self._rule_funcs = {'self_tree': [], 'dest': []}
    self.file_info = {}
    self.symbols = {}
    self._dest_mgr = dest_mgr
    self._track_file_callback = None
    if self._dest_mgr:
        self._track_file_callback = self._dest_mgr.track_file
    self.input_mgrs = []
    self._files_processed = 0
    self.PASSES = 20  # max iterations to process all new files
    self._pass = 0
    self._root_file_limit = config.get_int('process', 'root_file_limit',
                                           default=1000000)
    self._root_files_processed = 0
    # rule parsing and other regexes
    self._re = {}
    self._re['regex'] = re.compile(r'(.*) like (.*)')
    self._re['copy'] = re.compile(r'copy to (.*)')
    self._re['group'] = re.compile(r'^\$(\d+)$')
    self._re['cond'] = re.compile(r'^(\s*)if\s+(.*):\s*$')
    self._re['action'] = re.compile(r'^(\s*)(.*)$')
    self._re['header'] = re.compile(r'^(\s*)\[(.*)\]$')
    self._re['comment'] = re.compile(r'^\s*(#.*)?$')
    self._re['arb_fn'] = re.compile(r'(\S+)\s+(.*)')
    self._re['define'] = re.compile(r'^\s*([a-zA-Z][_a-zA-Z0-9]*)\s*=\s*(.*)$')
    unpack_filter = config.get('process', 'unpack_files_wanted', return_none=True)
    if unpack_filter:
        self._re['unpack_files_wanted'] = re.compile(unpack_filter)
    self._always_unpack = self.config.is_true('process', 'always_unpack',
                                              absent_means_yes=True)
    self.unpack_root = self.config.input + u.unpack_marker()
    u.ensure_path(self.unpack_root)
    self.clear_first = self.config.is_true('process', 'clear_first',
                                           absent_means_yes=True)
    self.make_md5 = self.config.is_true('process', 'make_md5',
                                        absent_means_no=True)
    self._parse_rules()
def restore_from_archive(self, wanted, options=None):
    if options is None:
        options = {}
    verbose = options.get('verbose', False)
    if verbose:
        logging.info("restore_from_archive starting")
    re_wanted = re.compile(wanted)
    archive_root = self.config.archive + '/output'
    for dir_name, subdirs, files in os.walk(archive_root):
        for file_name in files:
            full_src = dir_name + '/' + file_name
            if re_wanted.search(full_src):
                full_dest = u.reroot_file(full_src, archive_root, self.config.output)
                (dest_full_path, dest_name) = os.path.split(full_dest)
                u.ensure_path(dest_full_path)
                shutil.copyfile(full_src, full_dest)
                if verbose:
                    logging.info("restore_from_archive %s -> %s" %
                                 (full_src, full_dest))
    if verbose:
        logging.info("restore_from_archive completed")
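# Hedged usage sketch for restore_from_archive: `tp` stands in for whatever
# object owns this method (hypothetical name). 'wanted' is a regex matched
# against full source paths under <archive_root>/output.
#
#   tp.restore_from_archive(r'\.png$', options={'verbose': True})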
def process(self, do_clear_info=True):
    logging.info("starting tree processing")
    start = u.timestamp_now()
    u.ensure_path(self.config.output)
    self._root_files_processed = 0
    if self.clear_first and do_clear_info:
        u.clear_folder(self.config.output)
    # do this at start in case last run didn't clean up properly
    self.remove_unpacked_files()
    if do_clear_info:
        self.file_info.clear()
    self._pass = 0  # pass number
    self._files_processed = 0
    # make one pass over the input files. if you need to know whether this is
    # the input pass, check for self._pass == 0.
    self._walk_files(self.config.input)
    if self.config.signalled():
        logging.info("signal set, leaving tp.process")
        return False
    # then make passes over the output files until no new files are encountered
    work_done = self._files_processed > 0
    Config.log('tp._files_processed = %i' % self._files_processed,
               tag='WORK_DONE_PASS_0')
    # do NOT look at _root_files_processed after pass 0 - we want to fully
    # process any files created during pass 0
    while self._pass < self.PASSES:
        self._files_processed = 0
        self._pass += 1
        self._walk_files(self.config.output)
        if self.config.signalled():
            logging.info("signal set, leaving tp.process after pass %i" % self._pass)
            work_done = False
            break
        Config.log('tp._files_processed = %i' % self._files_processed,
                   tag='WORK_DONE_PASS_%i' % self._pass)
        if self._files_processed > 0:
            work_done = True
        else:
            break
    if self._pass >= self.PASSES:
        raise Exception("completed %i passes and still not done. failing" %
                        self.PASSES)
    self.update_input_mgr_metadata()
    elapsed = u.timestamp_now() - start
    Config.log("tp completed in %i passes, %f seconds, work_done %s" %
               (self._pass, elapsed, work_done), tag='WORK_DONE')
    return work_done
def __init__(self, args, data):
    super().__init__(args, data)
    self.params = self.args["prepare"]["source"]
    self.tmp_path = Path.cwd() / "data" / "tmp"
    ensure_path(self.tmp_path)
    self.original = self.tmp_path / "original.cpp"
    self.source_path = self.tmp_path / "solution.cpp"
    self.log_path = self.tmp_path / "cppcheck_log.xml"
    self.tokens_path = self.tmp_path / "solution.tokens"
    self.dest_path = Path.cwd() / "data" / "datasets" / "source"
    self.compiler = self.params["compiler"]
    if self.compiler != "g++":
        raise NotImplementedError("Unsupported compiler {}".format(self.compiler))
    self.tokenizer = self._ensure_tokenizer_exists()
    self.parallel = args["prepare"]["parallel"]
def default_template_file_action(self, dir_name, file_name, dest_rel_path=None,
                                 dest_name=None):
    template_full = dir_name + '/' + file_name
    Config.log("default_template_file_action '%s'" % template_full,
               tag='DEFAULT_TEMPLATE_FILE_ACTION')
    if dest_name:
        rel_path = dest_rel_path
        dest_path = u.pathify(self.output_root, dest_rel_path)
    else:
        rel_path = u.make_rel_path(self.site_root, dir_name)
        dest_path = u.pathify(self.output_root, rel_path)
        dest_name = file_name
    u.ensure_path(dest_path)
    dest_full = u.pathify(dest_path, dest_name)
    info = {
        'name': dest_name,
        'path': dest_path,
        'rel_path': rel_path,
        'full': dest_full,
        'key': u.make_key(rel_path, dest_name)
    }
    if self.config.is_template_type(file_name):
        template = open(template_full).read()
        output = u.debracket(template, self.interpret)
        if not self.config.is_special_file(info['key']):
            open(dest_full, 'w').write(output)
    else:
        shutil.copyfile(template_full, dest_full)
    # both branches record the same metadata for the new file
    local = u.local_metadata(dest_path, dest_name)
    info['size'] = local['size']
    info['modified'] = local['modified']
    info['md5'] = u.md5(dest_full)
    self.track_file(info)
def _parse_url_patterns(self):
    for ar in self._assemble_url_patterns():
        tag, pat, dest, start, idle, mode = ar
        if tag in self._url_patterns:
            msg = "ignoring duplicate url tag '%s'" % tag
            Config.log(msg, tag='URL_DUPLICATE')
            continue
        if not self._re['tag'].match(tag):
            Config.log(tag, tag='URL_INVALID_TAG')
            raise Exception('URL_INVALID_TAG')
        if not self._valid_pattern(pat):
            Config.log(pat, tag='URL_INVALID_PATTERN')
            raise Exception('URL_INVALID_PATTERN')
        daynum = None
        try:
            daynum = u.datestr_to_daynum(start)
        except Exception as exc:
            Config.log(start, tag='URL_INVALID_START')
            raise
        try:
            int(idle)  # TODO check for nonneg and not too huge
        except ValueError:
            Config.log(idle, tag='URL_INVALID_IDLE')
            raise
        if mode != 'snap' and mode != 'fetch':
            Config.log(mode, tag='URL_INVALID_MODE')
            raise Exception('URL_INVALID_MODE')
        # this is the base path under which all files for this pattern are stored -
        # in this dir if no dest, but if dest is nonempty they may go in a subdir
        path = os.path.join(self.config.input, tag)
        u.ensure_path(path)
        self._url_patterns[tag] = {
            'tag': tag,
            'pattern': pat,
            'start': daynum,
            'dest': dest,
            'current_daynum': daynum,
            'idle': int(idle),
            'mode': mode,
            'path': path
        }
def __init__(self, config, dest_mgr=None, track_file_callback=None):
    self.config = config
    self.dest_mgr = dest_mgr
    self._track_file_callback = track_file_callback
    # site root is the tree of fixed assets that are always built and replicated
    # on the remote site (e.g. index.html)
    self.site_root = config.template_root + '/site'
    self.generate_root = config.template_root + '/generated'
    u.ensure_path(self.site_root)
    self.output_root = self.config.output
    self._worklists = {}
    self._re = {}
    self._re['worklist'] = re.compile(r'^worklist\s+(.*)$', re.MULTILINE | re.DOTALL)
    self._re['get_config'] = re.compile(r'^get_config\s+([a-z_]+)\s*:\s*(.*)')
    self._re['part'] = re.compile(r'^part\s+(.*)$')
    self._files_created = []
    self.page_context = None
    self._default_worklist = None
    self._mode_plugins = {}
def combine_frames(frame_list, dest_dir, dest_name, delay=None):
    try:
        u.ensure_path(dest_dir)
        runargs = ['convert']
        if delay is not None:
            runargs = runargs + ['-delay', delay]
        runargs = runargs + ['@' + frame_list, '-loop', '0',
                             dest_dir + '/' + dest_name]
        (returncode, stdout, stderr) = u.run_command(runargs)
        logging.debug("convert returncode: %s\nstdout: %s\nstderr: %s" %
                      (returncode, stdout, stderr))
        if returncode == 0:
            return True
        else:
            logging.error("combine_frames failed with rc %i, stderr = '%s'" %
                          (returncode, stderr))
    except Exception as exc:
        logging.error("combine_frames frame_list '%s' exception '%s'" %
                      (frame_list, str(exc)))
        raise
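# Sketch of the extract/recombine round trip these helpers wrap (ImageMagick
# `convert` assumed; paths are illustrative). combine_frames reads frame paths
# from a list file via ImageMagick's '@file' syntax, one path per line.
frames = ['work/anim.gif_frame_%05d.gif' % i for i in range(10)]
with open('work/anim.frames', 'w') as fh:
    fh.write('\n'.join(frames))
# _extract_frames('anim.gif', 'work')                              # writes frames like the above
# combine_frames('work/anim.frames', 'out', 'anim.gif', delay='25')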
def send_request(self, url, useragent, protocol=None):
    parsed = urlparse(url)
    scheme = parsed.scheme
    host = parsed.netloc
    resource = parsed.path or ""  # if there is no path, use an empty string instead of None
    port = parsed.port
    dir_queue = resource.split('/')
    # filter out every element equal to ""
    dir_queue = list(filter("".__ne__, dir_queue))
    dir_queue = [host] + dir_queue
    # the directory structure will be:
    # scheme - /
    #   | - host - /
    #     | - path1
    #     | - path2
    #     | - path3
    #     | - path4 etc
    filename = dir_queue[-1]
    dir_queue.pop()
    # create the directory structure and return the deepest directory
    location = ensure_path("output", scheme, dir_queue)
    self.new_get_request(resource, protocol)
    self.add_header("Host", host)
    self.add_header("User-Agent", useragent)
    self.end_message()
    write_flag = False
    filename = location + "/" + filename + ".html"
    with open(filename, "w") as output:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.connect((host, 80))
            s.send(self.message.encode())
            while True:
                data = s.recv(1024).decode()
                if write_flag:
                    output.write(data)
                else:
                    # skip the response headers: start writing at the doctype
                    try:
                        idx = data.index("<!DOCTYPE")
                        write_flag = True
                    except ValueError:
                        try:
                            idx = data.index("<!doctype")
                            write_flag = True
                        except ValueError:
                            idx = None
                    print(data[0:idx])
                    if idx is not None:
                        output.write(data[idx:])
                if "</html>" in data or "</HTML>" in data:
                    break
def __init__(self, config, url_patterns):
    self.config = config
    u.ensure_path(config.input)
    self._list_file_name = '_url_list.txt'
    # TODO should be able to separately configure dir for persist files (e.g. for git convenience)
    self._list_file = self.config.admin + '/' + self._list_file_name
    self._download_count = 0
    self._download_limit = self.config.get_int('input', 'url_download_limit',
                                               default=0)
    self._raw_url_patterns = url_patterns
    self._url_patterns = {}
    self._daynum_today = u.daynum_now()
    # this is used to store/persist metadata about directly downloaded files
    # (key is product_tag/file_name).
    self._local_files = {}
    self._re = {}
    self._re['tag'] = re.compile(r'^[a-zA-Z0-9_]+')
    self._parse_url_patterns()
    # this is populated by TreeProcessor
    self.rules_run_files = []
def __init__(self, configFile):
    Config.main = self  # singleton
    self.configFile = configFile
    self._config = None
    self.plugin = None
    self.start_time = datetime.utcnow()
    self.special_mode_args = []
    self.final_summary = ''
    self.iteration = None
    self._re = {}
    self._re['n'] = re.compile(r'\n+')
    self._re['symbol'] = re.compile(r'\s*=\s*')
    self._re['comment'] = re.compile(r'^\s*(#.*)?$')
    self.symbols = {}
    self._rules_mask = {}
    self._debug_tags = {}
    self._template_extensions = {}
    self.log_to_console = False
    # special modes available on command line, and info about them
    self._special_modes = {
        'ftp_catchup': {},
        'ftp_remove': {},
        'im_rerun': {},
        'ftp_clear': {},
        'im_meta_from_local': {},
        'restore_from_archive': {},
        'test_deploy_test': {}
    }
    # TODO allow override via config
    self._special_file_tags = {'run_post_tree': True, 'run_pre_web': True}
    if configFile:
        self.load_config()
    self.input = self.get('local', 'input')
    u.ensure_path(self.input)
    self.output = self.get('local', 'output')
    u.ensure_path(self.output)
    self.admin = self.get('local', 'admin')
    u.ensure_path(self.admin)
    self.archive = self.get('local', 'archive')
    u.ensure_path(self.archive)
    # template root is where all templates, of all types, are stored
    self.template_root = self.get('process', 'template_root', return_none=True)
    if self.template_root is None:
        self.template_root = self.admin + '/templates'
    # Config is in charge of logging options
    loglevel = logging.INFO
    self._logfile = self.get('local', 'logfile', return_none=True)
    if self._logfile:
        log_dir = self.admin + '/logs'
        u.ensure_path(log_dir)
        full = log_dir + '/' + self._logfile
        u.age_file(full, log_dir, move=True)
        # see https://stackoverflow.com/questions/1943747/python-logging-before-you-run-logging-basicconfig:
        # if someone tried to log something before basicConfig is called, Python creates a default handler that
        # goes to the console and will ignore further basicConfig calls. Remove the handler if there is one.
        root = logging.getLogger()
        if root.handlers:
            for handler in root.handlers:
                root.removeHandler(handler)
        logging.basicConfig(filemode='w', filename=full,
                            format='%(asctime)s %(message)s',
                            datefmt='(%b %d %Y %H:%M:%S)', level=loglevel)
    logging.info("starting at %s" % self.display_times(self.start_time))
    logging.info("configFile is %s" % self.configFile)
    logging.info("working directory is %s" % os.getcwd())
    if self.is_true('local', 'log_config', absent_means_no=True):
        self.log_config()
    self._load_symbols()
    self._load_rules_to_run()
    self._load_debug_tags()
    self._load_template_extensions()
    self.log_to_console = self.is_true('local', 'log_to_console',
                                       absent_means_no=True)
    # allow user to bail out on run by creating a signal file
    self._signal_file = self.get('actions', 'signal_file', return_none=True)
    if self._signal_file:
        logging.info("will watch for signal file '%s'" % self._signal_file)
    logging.info("sys.path:\n" + '\n'.join([p for p in sys.path]))
def get_model_path(args):
    res_path = Path.cwd() / "data" / "models" / args["model"] / "data"
    ensure_path(res_path)
    return res_path
def panoply(interp, finfo, argstr):
    Config.log("file '%s' argstr '%s'" % (finfo['full'], argstr), tag='PANOPLY')
    try:
        interpret = u.interpret_method(interp)
        argstr = u.debracket(argstr, interpret, finfo=finfo)
        args = u.parse_arg_string(argstr)
        if 'action' not in args:
            raise Exception("panoply: 'action' arg is required")
        if 'dest' not in args:
            raise Exception("panoply: 'dest' arg is required")
        action = args['action']
        dest = u.debracket(args['dest'], interpret, finfo=finfo)
        panoply_templates = Config.main.template_root + '/panoply'
        src = finfo['full']
        if not src.endswith('.nc'):
            logging.error("panoply command: '%s' is not a dataset file" % src)
            return
        ### TODO more copied stuff from copy_with_metadata!
        (dest_dir, dest_file) = os.path.split(dest)
        u.ensure_path(dest_dir)
        jar = 'PanoplyCL.jar'
        size = None
        if 'size' in args:
            size = int(args['size'])
        size_factor = _panoply_size_to_size_factor(size)
        template_file = panoply_templates + '/' + action + '.pclt'
        try:
            template = open(template_file).read()
        except Exception as exc:
            logging.error("panoply command: error '%s' opening template file '%s'" %
                          (str(exc), template_file))
            raise
        symbols = {
            'dataset': src,
            'output_file': dest,
            'size_factor': size_factor
        }
        script = u.debracket(template, interpret, symbols=symbols)
        script_path = Config.main.output + '/tmp'
        u.ensure_path(script_path)
        script_file = script_path + '/' + action + '.pcl'
        open(script_file, 'w').write(script)
        working_dir = Config.main.get('tools', 'panoply_workdir')
        if not os.path.isdir(working_dir):
            err = "panoply: invalid panoply_workdir '%s'" % working_dir
            logging.error(err)
            raise Exception(err)
        call_args = ['java', '-jar', jar, script_file]
        (returncode, stdout, stderr) = u.run_command(call_args, working_dir)
        logging.debug("returncode: %s\nstdout: %s\nstderr: %s" %
                      (returncode, stdout, stderr))
        if returncode == 0:
            # we know the file that was created, so make its metadata now
            newfi = {}
            newfi['name'] = dest_file
            newfi['path'] = dest_dir
            newfi['full'] = dest
            newfi['rules_run'] = False
            tmp = u.local_metadata(newfi['path'], newfi['name'])
            newfi['size'] = tmp['size']
            newfi['modified'] = tmp['modified']
            return {'new_finfo': newfi}
        else:
            logging.error("panoply failed with rc '%i', stderr = '%s'" %
                          (returncode, stderr))
    except Exception as exc:
        # py2: logging.error("panoply exception '%s'" % exc.message)
        logging.error("panoply exception '%s'" % str(exc))
def download_page(page, level):
    mapped = map_local_path(page)
    clean = mapped[0]
    target = join("doc", mapped[1])
    src = join("doc/orig", mapped[1])
    if page in htmls:
        return mapped[1]
    if level > 10:
        pages.append(page)
        return mapped[1]
    if "//foundationdb.com" not in page:
        return page
    if ".pdf" in page:
        ensure_file(page, join("doc", mapped[1]))
        return mapped[1]
    if ".png" in page:
        ensure_file(page, join("doc", mapped[1]))
        return mapped[1]
    if "javadoc" in page:
        print(" skip javadoc")
        return page
    if "courses" in page:
        return page
    ensure_file(page, src)
    # follow an archived redirect if the page contains one
    with open(src, 'r') as r:
        lines = list(r)
    for i, l in enumerate(lines):
        r = redirect_regex.search(l)
        if r:
            new_url = r.groupdict()["url"].replace(r"\/", "/")
            return download_page("https://web.archive.org" + new_url, level + 1)
    htmls.add(page)
    links = set()

    def match(m):
        groups = m.groupdict()
        url = groups["url"]
        kind = groups["kind"]
        name = map_local_path(url)[0]
        full_url = "https://web.archive.org" + url
        if kind == "":
            full_local = download_page(full_url, level + 1)
        else:
            full_local = kind + "/" + name
            ensure_file(full_url, "doc/" + full_local)
        # matching local path
        return relpath(full_local, dirname(mapped[1]))

    with open(src, 'r') as r:
        lines = list(r)
    clean_wb(lines)
    result = []
    for i, l in enumerate(lines):
        result.append(ref_regex.sub(match, l))
    print("Saving to ", target)
    ensure_path(target)
    with open(target, 'w') as w:
        w.writelines(result)
    return mapped[1]
def send_request(self, url, useragent, protocol):
    self.num += 1
    print(self.num, url)
    self.links[url] = True
    parsed = urlparse(url)
    scheme = parsed.scheme
    if scheme == "http":
        implicit_port = 80
    elif scheme == "https":
        implicit_port = 443
    else:
        implicit_port = 80
    host = parsed.netloc
    resource = parsed.path or ""  # if there is no path, use an empty string instead of None
    port = parsed.port
    dir_queue = resource.split('/')
    # filter out every element equal to ""
    dir_queue = list(filter("".__ne__, dir_queue))
    if len(dir_queue) == 0:
        filename = "index.html"
        dir_queue.append(filename)
    else:
        filename = dir_queue[-1]
    dir_queue = [host] + dir_queue
    # the directory structure will be:
    # scheme - /
    #   | - host - /
    #     | - path1
    #     | - path2
    #     | - path3
    #     | - path4 etc
    if filename:
        dir_queue.pop()
    # create the directory structure and return the deepest directory
    location = ensure_path("output", scheme, dir_queue)
    message = RequestCreator.new_get_request(resource, protocol)
    message = RequestCreator.add_header("Host", host, message)
    message = RequestCreator.add_header("User-Agent", useragent, message)
    message = RequestCreator.end_message(message)
    write_flag = False
    if filename.endswith(".html"):
        filename = location + "/" + filename
    else:
        filename = location + "/" + filename + ".html"
    file_data = ""
    # if the host is cached, just take its IP; otherwise fetch robots.txt
    # and record what is allowed
    if not self.cache.is_cached(host):
        # the domain is not cached yet, so check robots.txt
        # (get_ip caches the domain)
        domain_ip = self.cache.get_ip(host)
        if domain_ip is None:
            return
        self.robots[host] = RoboFile(host, domain_ip)
        self.robots[host].obtain(host, useragent, protocol, implicit_port)
        if not self.robots[host].ok:
            return
        if self.robots[host].match_link(parsed.path):
            return  # excluded by robots.txt
    else:
        domain_ip = self.cache.get_ip(host)
        if domain_ip is None:
            return
        if not self.robots[host].ok:
            return
        if self.robots[host].match_link(parsed.path):
            return
    headers = ""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        if implicit_port == 443:
            context = ssl.create_default_context()
            s = context.wrap_socket(s, server_hostname=host)
        s.connect((domain_ip, implicit_port))
        s.send(message.encode())
        former_first = ""
        data = bytes(0)
        while True:
            s.settimeout(1)
            try:
                try:
                    data += s.recv(1024)
                    data = data.decode()
                except UnicodeDecodeError:
                    # partial multi-byte character: keep accumulating bytes
                    continue
                # bail out if the stream appears to repeat itself
                firstline = data.splitlines()[0]
                if firstline == former_first:
                    break
                former_first = firstline
            except socket.timeout:
                break
            if write_flag is False:
                # find where the document starts; everything before it is headers
                idx = None
                for marker in ("<!DOCTYPE", "<!doctype", "<html"):
                    pos = data.find(marker)
                    if pos != -1:
                        idx = pos
                        write_flag = True
                        break
                headers += data[0:idx]
                if idx is not None:
                    file_data += data[idx:]
            else:
                file_data += data
            if "</html>" in data or "</HTML>" in data:
                break
            data = bytes(0)
    # CHECK THE HEADERS
    code = RequestCreator.get_response_code(headers)
    redirected = False
    if code.startswith("3"):
        # try to follow the redirect, up to 5 hops
        tries = 5
        location = RequestCreator.extract_location(headers)
        if location is not None:
            while not redirected and tries > 0:
                data, rheaders = self.redirect(location, domain_ip, protocol,
                                               useragent)
                code = RequestCreator.get_response_code(rheaders)
                if code.startswith("2"):
                    redirected = True
                elif code.startswith("3"):
                    location = RequestCreator.extract_location(rheaders)
                    if location is None:
                        break
                else:
                    # error
                    break
                tries -= 1
    elif code.startswith("4") or code.startswith("5"):
        # error
        return
    if code.startswith("2") or redirected:
        soup = BeautifulSoup(file_data, 'html.parser')
        metas = soup.find_all("meta")
        # robots meta tag: allow indexing/following only if permitted (or absent)
        ok1 = False
        ok2 = False
        found = False
        for meta in metas:
            if meta.get("name", "").lower() == "robots":
                found = True
                content = meta.get("content", "").lower()
                if content == "all" or content == "index":
                    ok1 = True
                if content == "all" or content == "follow":
                    ok2 = True
        if not found:
            ok1 = True
            ok2 = True
        if ok1:
            with open(filename, "w", encoding="utf-8") as output:
                output.write(file_data)
        if ok2:
            links = soup.find_all("a", href=True)
            for link in links:
                if link["href"] in self.links:
                    continue
                if "http" in link["href"] or "https" in link["href"]:
                    to_add = link["href"]
                else:
                    to_add = urljoin(url, link["href"])
                if "#" in to_add:
                    to_add = to_add[:to_add.rfind("#")]
                if to_add not in self.queue:
                    self.queue.append(to_add)
def send_request(self, url, useragent, protocol=None):
    self.links[url] = True
    parsed = urlparse(url)
    scheme = parsed.scheme
    host = parsed.netloc
    resource = parsed.path or ""  # if there is no path, use an empty string instead of None
    port = parsed.port
    dir_queue = resource.split('/')
    if dir_queue[-1] == "":
        filename = None
    else:
        filename = dir_queue[-1]
    # filter out every element equal to ""
    dir_queue = list(filter("".__ne__, dir_queue))
    dir_queue = [host] + dir_queue
    # the directory structure will be:
    # scheme - /
    #   | - host - /
    #     | - path1
    #     | - path2
    #     | - path3
    #     | - path4 etc
    if filename:
        dir_queue.pop()
    # create the directory structure and return the deepest directory
    location = ensure_path("output", scheme, dir_queue)
    if not filename:
        return
    self.new_get_request(resource, protocol)
    self.add_header("Host", host)
    self.add_header("User-Agent", useragent)
    self.end_message()
    write_flag = False
    if filename.endswith(".html"):
        filename = location + "/" + filename
    else:
        filename = location + "/" + filename + ".html"
    file_data = ""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.connect((host, 80))
        s.send(self.message.encode())
        while True:
            data = s.recv(1024).decode()
            if write_flag:
                file_data += data
            else:
                # find where the document starts and skip the response headers
                idx = None
                for marker in ("<!DOCTYPE", "<!doctype", "<html"):
                    pos = data.find(marker)
                    if pos != -1:
                        idx = pos
                        write_flag = True
                        break
                if idx is not None:
                    file_data += data[idx:]
            if "</html>" in data or "</HTML>" in data:
                break
    soup = BeautifulSoup(file_data, 'html.parser')
    metas = soup.find_all("meta")
    # robots meta tag: allow indexing/following only if permitted (or absent)
    ok1 = False
    ok2 = False
    found = False
    for meta in metas:
        if meta.get("name") == "robots":
            found = True
            if meta["content"] == "all" or meta["content"] == "index":
                ok1 = True
            if meta["content"] == "all" or meta["content"] == "follow":
                ok2 = True
    if not found:
        ok1 = True
        ok2 = True
    if ok1:
        with open(filename, "w") as output:
            output.write(file_data)
    if ok2:
        links = soup.find_all("a", href=True)
        for link in links:
            if link["href"] in self.links:
                continue
            if "http" in link["href"] or "https" in link["href"]:
                self.queue.append(link["href"])
            else:
                if port:
                    to_add = scheme + "://" + host + ":" + str(port)
                else:
                    to_add = scheme + "://" + host
                if link["href"].startswith('/'):
                    to_add += link["href"]
                else:
                    to_add += "/" + link["href"]
                self.queue.append(to_add)
def get_model_path(self):
    path = Path.cwd() / "data" / "models" / self.model_name
    ensure_path(path)
    return path
if args.out is None:
    log_info("Didn't specify the output path. Using the default one: %s" % path)
else:
    path = args.out
if args.debug:
    log_debug("Debug mode enabled")
if args.fine:
    log_fine("Displaying fine-grained logging")
if args.re_meta:
    log_info("Redownloading all metadata")

# === Ensure the output directories exist ===
meta_dir_path = path + "/metadata"
ensure_path(meta_dir_path)
price_dir_path = path + "/price"
ensure_path(price_dir_path)

# === Retrieve the ids of the target applications ===
data = retrieve_data(APP_LIST_URL)
if data is None:
    sys.exit(1)
apps = []
for app in data['applist']['apps']['app']:
    # According to CT's theory, only the apps with id divisible by 10 are games
    if app['appid'] % 10 == 0:
        apps.append((app['appid'], app['name']))
def make_hybrid_forecast(webmaker, dest_key, work_item, list_args):
    try:
        Config.log("dest_key = '%s'" % dest_key,
                   tag='HYBRID_FORECAST_START_' + dest_key)
        (dest_path, dest_name) = os.path.split(dest_key)
        finfos = work_item['roles']['default']
        if not len(finfos):
            msg = "logic error, no finfos on default list for dest_key '%s'" % dest_key
            raise Exception(msg)
        archive_root = webmaker.config.get('local', 'archive', return_none=True)
        if not archive_root:
            logging.error("make_hybrid_forecast: fail, no archive_root configured")
            return 'todo error1'
        local_store_start = u.datestr_to_daynum(
            webmaker.config.get('local', 'local_store_start'))
        history_days = webmaker.config.get_int('tools',
                                               'hybrid_forecast_history_days',
                                               default=20)
        history_frame0s = []
        extract_frames = webmaker.config.plugin.get('extract_frames')
        combine_frames = webmaker.config.plugin.get('combine_frames')
        get_frame_count = webmaker.config.plugin.get('get_frame_count')
        # we need the current image for forecast frames, should be finfo
        current_image = finfos[0]['full']
        if not current_image or not os.path.isfile(current_image):
            msg = "make_hybrid_forecast can't find expected current_image '%s'" % current_image
            logging.error(msg)
            return 'todo error2'
        frame_ct = get_frame_count(current_image)
        if frame_ct < 2:
            msg = "make_hybrid_forecast: current_image '%s' not multi-frame, taking no action" % current_image
            logging.warning(msg)
            return ''
        logging.info("got current_image file '%s'" % current_image)
        (current_path, current_name) = os.path.split(current_image)
        # look for frame0 files first in output, then archive
        roots = [webmaker.config.output + '/static',
                 archive_root + '/output/static']
        # find history files going back from date of current file
        if 'item_args' in work_item and 'date_string' in work_item['item_args']:
            date_str = work_item['item_args']['date_string']
        else:
            msg = "make_hybrid_forecast can't find date_string"
            logging.error(msg)
            return 'todo error3'
        testdate = u.datestr_to_daynum(date_str) - 1
        while True:
            date_str = u.daynum_to_datestr(testdate)
            # TODO what if it isn't a gif? also hard-coded convention
            fname = u.find_in_paths(current_name + '_frame_00000.gif',
                                    'esrl-daily-forecasts/' + date_str + '/frame0',
                                    roots, options={'full': True})
            if fname:
                # logging.info("got frame0 file '%s'" % fname)
                Config.log(fname, tag='HYBRID_FORECAST_FRAME_GOT0')
                history_frame0s.insert(0, fname)
            else:
                Config.log("frames not contiguous by date, missing '%s'" % date_str,
                           tag='HYBRID_FORECAST_FRAME_ERR')
            if len(history_frame0s) >= history_days:
                break
            testdate -= 1
            if testdate < local_store_start:
                msg = "make_hybrid_forecast only found %i history frames" % len(history_frame0s)
                break
        hist_frame_ct = len(history_frame0s)
        if not hist_frame_ct:
            msg = "key '%s': no history frames, can't proceed" % dest_key
            Config.log(msg, tag='HYBRID_FORECAST_ERROR')
            webmaker.config.add_to_final_summary(msg)
            return 'todo error4'
        # extract frames from current image
        work_path = webmaker.config.output + '/tmp/' + dest_name
        u.ensure_path(work_path)
        extract_frames(webmaker.interpret, finfos[0], 'out_dir:' + work_path)
        # make list of frame files for new image (TODO review path of tmp file)
        flist = work_path + '/' + dest_name + '.frames'
        content = "\n".join(history_frame0s)
        # add forecast frames
        for frame_num in range(1, frame_ct - 1):
            fname = (work_path + '/' + current_name + '_frame_' +
                     '{:05d}'.format(frame_num) + '.gif')
            content += '\n' + fname
        open(flist, 'w').write(content)
        # make output animation
        if 'delay' in list_args:
            delay = list_args['delay']
        else:
            delay = webmaker.config.get('tools', 'default_frame_delay',
                                        return_none=True)
        combine_frames(flist, webmaker.output_root + '/' + dest_path, dest_name,
                       delay=delay)
        # TODO use utility method??
        display_name = dest_name
        if 'display_name' in finfos[0]:
            display_name = finfos[0]['display_name']
        item_args = work_item['item_args']
        if 'display_name' in item_args:
            display_name = u.parse_subargs(item_args['display_name'],
                                           webmaker.interpret, webmaker.config,
                                           finfos[0])
        # TODO REVIEW: we do this for panoply, should we here also?
        info = {
            'name': dest_name,
            'path': webmaker.output_root + '/' + dest_path,
            'rel_path': dest_path,
            'full': webmaker.output_root + '/' + dest_key,
            'key': dest_key,
            'display_name': display_name
        }
        tmp = u.local_metadata(info['path'], info['name'])
        info['size'] = tmp['size']
        info['modified'] = tmp['modified']
        webmaker.track_file(info)
        work_item['roles']['output'] = info
        os.remove(flist)
        Config.log("key '%s'" % dest_key, tag='HYBRID_FORECAST_SUCCESS')
        # TODO review - returning empty string because invoking from html
        return ""
    except Exception as exc:
        Config.log(str(exc), tag='HYBRID_FORECAST_EXCEPTION')
        raise