Example #1
    def __init__(self, config, label, site, login, password, folder,
                 dest_root):
        self.config = config
        self._label = label
        self._site = site
        self._login = login
        self._password = password
        self._folder = folder
        self._dest_root = dest_root
        u.ensure_path(self._dest_root)
        self._ftp = None
        self._splt = re.compile(r'\s+')
        self._list_file_name = '_ftp_dirlist.txt'
        self._dir_list = self.config.admin + '/' + self._label + self._list_file_name
        self._download_count = 0
        self._download_limit = self.config.get_int('input',
                                                   'download_limit',
                                                   default=0)

        # this is used to store/persist metadata about directly downloaded files.
        # key has no path.
        self._local_files = {}
        # this is populated by TreeProcessor
        self.rules_run_files = []

        self._re_include = []
        self._re_exclude = []
        re_include_files = self.config.get_multi('input', 'include_files')
        for pattern in re_include_files:
            self._re_include.append(re.compile(pattern))
        re_exclude_files = self.config.get_multi('input', 'exclude_files')
        for pattern in re_exclude_files:
            self._re_exclude.append(re.compile(pattern))
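
Note: every example on this page calls an ensure_path (or u.ensure_path) helper whose implementation is not shown here. Judging by the call sites, it appears to create a directory, including any missing parents, and to tolerate the directory already existing. A minimal sketch under that assumption:

import os

# Hypothetical sketch of the one-argument ensure_path used by most of the
# examples below; "mkdir -p" semantics are an assumption inferred from the
# call sites, not the actual implementation.
def ensure_path(path):
    os.makedirs(path, exist_ok=True)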
Example #2
    def preprocess_ast(self, force_rewrite=False, path=None):
        '''
        :param force_rewrite: if True, delete any previously extracted
        sources so that every sample is rewritten; if False, files that
        already exist on disk are kept as-is
        :param path: target folder (defaults to data/code/tmp under the
        current working directory)
        :return: the preprocessed path contexts for the extracted ASTs
        '''

        if not path:
            path = Path.cwd() / "data" / "code" / "tmp"

        if force_rewrite:
            remove_folder(path / "sources")

        sources_path = path / "sources"
        ensure_path(sources_path)

        result_path = path / "contexts"
        ensure_path(result_path)

        # TODO: check weird race condition
        # print(os.path.exists(path / "sources"))
        # print(os.path.exists(path / "contexts"))

        self._extract_ast(sources_path=sources_path,
                          result_path=result_path,
                          force_rewrite=force_rewrite)

        result_path /= "cpp"
        return self._preprocess_path_contexts(result_path)
Example #3
 def copy_with_metadata(self, finfo, dest):
     (dest_dir, dest_file) = os.path.split(dest)
     # prohibit destinations outside our output root
     rel_path = u.make_rel_path(self.config.output, dest_dir, strict=False)
     if not rel_path:
         logging.warning(
             "disallowing copy dest '%s', outside of output_root '%s'" %
             (dest_dir, self.config.output))
         return None
     # no self-copy
     if dest == finfo['full']:
         logging.warning("not copying file '%s' onto itself!" % dest)
         return None
     u.ensure_path(dest_dir)
     Config.log("%s to %s" % (finfo['full'], dest), tag='COPY_WITH_METADATA')
     if not os.path.exists(finfo['full']):
         msg = "file '%s' does not exist" % finfo['full']
         Config.log(msg, tag='COPY_WITH_METADATA_ERROR')
         return None
     shutil.copyfile(finfo['full'], dest)
     # set metadata for new file
     local = u.local_metadata(dest_dir, dest_file)
     newfi = copy.deepcopy(finfo) # TODO replace with unified metadata-copy system
     newfi['name'] = dest_file
     newfi['path'] = dest_dir
     newfi['full'] = dest
     newfi['size'] = local['size']
     newfi['modified'] = local['modified']
     # clear transient metadata not applicable to new file
     u.remove_no_copy_metadata(newfi)
     newfi['rules_run'] = False
     return newfi
Example #4
def captionize(interp, finfo, argstr):
    try:
        interpret = u.interpret_method(interp)
        argstr = u.debracket(argstr, interpret, finfo=finfo)
        args = u.parse_arg_string(argstr)
        src = finfo['full']
        Config.log("src '%s', argstr '%s'" % (src, argstr), tag='CAPTIONIZE')
        if not u.have_required(args, 'dest', 'where', 'font_size', 'bar_size',
                               'pad_x', 'pad_y', 'text'):
            raise Exception("captionize incomplete args '%s'" % argstr)
        dest = args['dest']
        (dest_dir, dest_file) = os.path.split(dest)
        u.ensure_path(dest_dir)
        params = []
        if args['where'] == 'top':
            params.extend(['-gravity', 'northwest'])
        elif args['where'] == 'bottom':
            params.extend(['-gravity', 'southwest'])
        else:
            raise Exception("captionize invalid 'where' arg in " + argstr)
        # TODO: validate colors. See https://www.imagemagick.org/script/color.php
        if 'background_color' in args:
            params.extend(['-background', args['background_color']])
        else:
            params.extend(['-background', 'white'])
        if 'text_color' in args:
            params.extend(['-fill', args['text_color']])
        else:
            params.extend(['-fill', 'black'])
        # TODO: validate font name. "convert -list font" will list them. system dependent.
        if 'font' in args:
            params.extend(['-font', args['font']])
        else:
            params.extend(['-font', 'Helvetica'])
        params.extend(['-pointsize', args['font_size']])
        params.extend(['-splice', '0x' + args['bar_size']])
        x = u.force_sign(args['pad_x'])
        y = u.force_sign(args['pad_y'])
        params.extend(['-annotate', x + y])
        fixed_text = args['text'].replace("'", "\\'")
        params.append('"' + fixed_text + '"')

        call_args = ['convert', src] + params + [dest]
        (returncode, stdout, stderr) = u.run_command(call_args)
        # logging.debug("returncode: %s\nstdout: %s\nstderr: %s" % (returncode, stdout, stderr))
        if returncode == 0:
            # we know the file that was created, so make its metadata now
            newfi = {}
            newfi['parent_full'] = finfo['full']  # provenance
            newfi['name'] = dest_file
            newfi['path'] = dest_dir
            newfi['full'] = dest
            newfi['rules_run'] = False
            newfi.pop('groups', None)
            return {'new_finfo': newfi}
        else:
            logging.error("captionize failed with rc %i, stderr = '%s'" %
                          (returncode, stderr))
    except Exception as exc:
        logging.error("captionize exception '%s'" % str(exc))
Example #5
 def upload_finfo(self, finfo, content_type=None, extra_args=None):
     key = finfo['key']
     rel_path, _ = os.path.split(key)
     if extra_args is None:
         extra_args = {
             'Metadata': {
                 'md5': finfo['md5']
             }
         }
     if not content_type:
         content_type = mimetypes.MimeTypes().guess_type(finfo['name'])[0]
     if content_type:
         extra_args['ContentType'] = content_type
     dest_full = self._file_dest_root + '/' + key
     dest_dir, dest_name = os.path.split(dest_full)
     info = {
         'new': True,
         'name': finfo['name'],
         'rel_path': rel_path,
         'key': key,
         'size': finfo['size'],
         'modified': finfo['modified'],
         'md5': finfo['md5']
     }
     # transfer other metadata
     self.transfer_metadata(finfo, local_root=self.local_root, dest=info)
     self.tree_info[key] = info
     u.ensure_path(dest_dir)
     shutil.copyfile(finfo['full'], dest_full)
Example #6
def get_screenshot(interp, finfo, argstr):
    try:
        interpret = u.interpret_method(interp)
        argstr = u.debracket(argstr, interpret, finfo=finfo)
        args = u.parse_arg_string(argstr)
        Config.log(argstr, tag='GET_SCREENSHOT')
        if not u.have_required(args, 'url', 'dest', 'height', 'width'):
            raise Exception("get_screenshot incomplete args '%s'" % argstr)
        dest = u.debracket(args['dest'], interpret, finfo=finfo)
        (dest_dir, dest_file) = os.path.split(dest)
        u.ensure_path(dest_dir)
        puppeteer_templates = Config.main.template_root + '/puppeteer'
        template_file = puppeteer_templates + '/get_screenshot.js'
        try:
            template = open(template_file).read()
        except Exception as exc:
            Config.log("'%s' opening template file '%s" %
                          (str(exc), template_file), tag='GET_SCREENSHOT_ERROR')
            raise
        symbols = {
            'url': args['url'],
            'width': args['width'],
            'height': args['height'],
            'dest': dest
        }
        script = u.debracket(template, interpret, symbols=symbols)
        script_path = Config.main.output + '/tmp'
        u.ensure_path(script_path)
        script_file = script_path + '/get_screenshot.js'
        open(script_file, 'w').write(script)

        working_dir = Config.main.get('tools', 'node_workdir')
        if not os.path.isdir(working_dir):
            err = "get_screenshot: invalid node_workdir '%s'" % working_dir
            logging.error(err)
            raise Exception(err)

        call_args = ['node', script_file]
        (returncode, stdout, stderr) = u.run_command(
            call_args, working_dir)
        # logging.debug("returncode: %s\nstdout: %s\nstderr: %s" % (returncode, stdout, stderr))
        if returncode == 0:
            # we know the file that was created, so make its metadata now
            newfi = {}
            newfi['parent_full'] = finfo['full'] # provenance
            newfi['name'] = dest_file
            newfi['path'] = dest_dir
            newfi['full'] = dest
            newfi['rules_run'] = False
            newfi.pop('groups', None)
            return {'new_finfo': newfi}
        else:
            Config.log("rc %i, stderr = '%s'" % (returncode, stderr),
                tag='GET_SCREENSHOT_ERROR')
    except Exception as exc:
        Config.log(str(exc), tag='GET_SCREENSHOT_ERROR')
Example #7
def test_age_file():
    test = '/home/floeuser/test_age_file'
    dest = test + '/' + 'aged'
    u.ensure_path(dest)
    src = test + '/' + 'foo.bar'
    with open(src, 'w') as fh:
        fh.write('foo.bar')
    for i in range(1, 510):
        start = u.timestamp_now()
        u.age_file(src, dest)
        print("iter %i took %f seconds" % (i, u.timestamp_now() - start))
Example #8
 def archive(self, options=None):
     if options is None:
         options = {}
     if not self.config.archive:
         logging.error("can't archive, no archive_root configured")
         return
     try:
         u.ensure_path(self.config.archive + '/output')
         u.deploy_tree(self.config.output, self.config.archive + '/output', options=options)
     except Exception as exc:
         err = "archive: exception '%s' running rsync" % str(exc)
         logging.error(err)
         raise
     Config.log('', tag='TP_ARCHIVE_COMPLETE')
Example #9
 def _upload(self, src_path, src_name, key, bucket=None, content_type=None, extra_args=None):
     full = u.pathify(src_path, src_name)
     if extra_args is None:
         extra_args = {
             'Metadata': {
                 'md5': u.md5(full)
             }
         }
     if not content_type:
         content_type = mimetypes.MimeTypes().guess_type(src_name)[0]
     if content_type:
         extra_args['ContentType'] = content_type
     dest_full = self._file_dest_root + '/' + key
     dest_dir, dest_name = os.path.split(dest_full)
     u.ensure_path(dest_dir)
     shutil.copyfile(full, dest_full)
Example #10
def scale_and_copy(interp, finfo, argstr):
    try:
        interpret = u.interpret_method(interp)
        argstr = u.debracket(argstr, interpret, finfo=finfo)
        args = u.parse_arg_string(argstr)
        src = finfo['full']
        Config.log("src '%s', argstr '%s'" % (src, argstr),
                   tag='SCALE_AND_COPY')
        # image magick infers format from extension
        if not u.have_required(args, 'dest', 'size'):
            raise Exception("scale_and_copy incomplete args '%s'" % argstr)
        if 'larger_dim' in args:
            # TODO: this alternative to 'size' requires getting dims of original
            raise Exception("scale_and_copy: 'larger_dim' not yet supported")
        dest = args['dest']
        size = int(args['size'])
        size_str = "%ix%i" % (size, size)
        # to convert only first frame of animated gif, specify 'file[0]'
        if 'single_frame' in args and args['single_frame']:
            src += '[0]'
        ### TODO more copied stuff from copy_with_metadata!
        (dest_dir, dest_file) = os.path.split(dest)
        u.ensure_path(dest_dir)
        call_args = ['convert', src, '-resize', size_str, dest]
        (returncode, stdout, stderr) = u.run_command(call_args)
        # logging.debug("returncode: %s\nstdout: %s\nstderr: %s" % (returncode, stdout, stderr))
        if returncode == 0:
            # we know the file that was created, so make its metadata now
            newfi = {}
            newfi['parent_full'] = finfo['full']  # provenance
            newfi['name'] = dest_file
            newfi['path'] = dest_dir
            newfi['full'] = dest
            newfi['rules_run'] = False
            newfi.pop('groups', None)
            # add thumb dimensions to metadata
            newfi['width'], newfi['height'] = get_image_size(dest)
            return {'new_finfo': newfi}
        else:
            logging.error("scale_and_copy failed with rc %i, stderr = '%s'" %
                          (returncode, stderr))
            return {}
    except Exception as exc:
        logging.error("scale_and_copy exception '%s'" % str(exc))
        return {}
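
The resulting command is a plain ImageMagick box resize. For size=400 it comes out roughly as follows (paths are hypothetical):

# argv built by scale_and_copy for size=400; ImageMagick fits the image
# within a 400x400 box while preserving the aspect ratio.
call_args = ['convert', 'src.png', '-resize', '400x400', 'dest.png']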
Example #11
def _extract_frames(src, dest_dir):
    try:
        u.ensure_path(dest_dir)
        (src_path, src_name) = os.path.split(src)
        destspec = dest_dir + '/' + src_name + '_frame_%05d.gif'
        runargs = ['convert', '-coalesce', src, destspec]
        (returncode, stdout, stderr) = u.run_command(runargs)
        logging.debug("convert returncode: %s\nstdout: %s\nstderr: %s" %
                      (returncode, stdout, stderr))
        if returncode == 0:
            return True
        else:
            logging.error("get_frame_count failed with rc %i, stderr = '%s'" %
                          (returncode, stderr))
    except Exception as exc:
        logging.error("_extract_frames src '%s' exception '%s'" %
                      (src, str(exc)))
        raise
Example #12
    def __init__(self, config, dest_mgr=None):
        self.config = config
        # key is rule type, value is array of rules.
        # only rule types so far:
        # self_tree: rules run against TP's own trees (input and output)
        # dest: rules run against tree of files deployed on dest
        self._rule_funcs = {'self_tree': [], 'dest': []}
        self.file_info = {}
        self.symbols = {}
        self._dest_mgr = dest_mgr
        self._track_file_callback = None
        if self._dest_mgr:
            self._track_file_callback = self._dest_mgr.track_file
        self.input_mgrs = []
        self._files_processed = 0
        self.PASSES = 20  # max iterations to process all new files
        self._pass = 0
        self._root_file_limit = config.get_int('process', 'root_file_limit', default=1000000)
        self._root_files_processed = 0

        # rule parsing and other regexes
        self._re = {}
        self._re['regex'] = re.compile(r'(.*) like (.*)')
        self._re['copy'] = re.compile(r'copy to (.*)')
        self._re['group'] = re.compile(r'^\$(\d+)$')
        self._re['cond'] = re.compile(r'^(\s*)if\s+(.*):\s*$')
        self._re['action'] = re.compile(r'^(\s*)(.*)$')
        self._re['header'] = re.compile(r'^(\s*)\[(.*)\]$')
        self._re['comment'] = re.compile(r'^\s*(#.*)?$')
        self._re['arb_fn'] = re.compile(r'(\S+)\s+(.*)')
        self._re['define'] = re.compile(r'^\s*([a-zA-Z][_a-zA-Z0-9]*)\s*=\s*(.*)$')

        unpack_filter = config.get('process', 'unpack_files_wanted', return_none=True)
        if unpack_filter:
            self._re['unpack_files_wanted'] = re.compile(unpack_filter)

        self._always_unpack = self.config.is_true('process', 'always_unpack', absent_means_yes=True)
        self.unpack_root = self.config.input + u.unpack_marker()
        u.ensure_path(self.unpack_root)
        self.clear_first = self.config.is_true('process', 'clear_first',
                                               absent_means_yes=True)
        self.make_md5 = self.config.is_true('process', 'make_md5',
                                            absent_means_no=True)
        self._parse_rules()
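
A quick illustration of the rule syntax those patterns accept (the sample lines are hypothetical, not taken from a real rules file):

import re

# Section header such as "[self_tree]"
print(re.compile(r'^(\s*)\[(.*)\]$').match('[self_tree]').group(2))              # self_tree
# Condition line such as "if name like big:"
print(re.compile(r'^(\s*)if\s+(.*):\s*$').match('if name like big:').group(2))   # name like big
# Action line such as "copy to thumbs/"
print(re.compile(r'copy to (.*)').match('copy to thumbs/').group(1))             # thumbs/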
Example #13
 def restore_from_archive(self, wanted, options=None):
     if options is None:
         options = {}
     if 'verbose' in options and options['verbose']:
         logging.info("restore_from_archive starting")
     re_wanted = re.compile(wanted)
     archive_root = self.config.archive + '/output'
     for dir_name, subdirs, files in os.walk(archive_root):
         for file_name in files:
             full_src = dir_name + '/' + file_name
             if re_wanted.search(full_src):
                 full_dest = u.reroot_file(full_src, archive_root, self.config.output)
                 (dest_full_path, dest_name) = os.path.split(full_dest)
                 u.ensure_path(dest_full_path)
                 shutil.copyfile(full_src, full_dest)
                 if 'verbose' in options and options['verbose']:
                     logging.info("restore_from_archive %s -> %s" % (full_src, full_dest))
     if 'verbose' in options and options['verbose']:
         logging.info("restore_from_archive completed")
Example #14
 def process(self, do_clear_info=True):
     logging.info("starting tree processing")
     start = u.timestamp_now()
     u.ensure_path(self.config.output)
     self._root_files_processed = 0
     if self.clear_first and do_clear_info:
         u.clear_folder(self.config.output)
     # do this at start in case last run didn't clean up properly
     self.remove_unpacked_files()
     if do_clear_info:
         self.file_info.clear()
     self._pass = 0  # pass number
     self._files_processed = 0
     # make one pass over the input files. if you need to know whether this is
     # the input pass, check for self._pass == 0.
     self._walk_files(self.config.input)
     if self.config.signalled():
         logging.info("signal set, leaving tp.process")
         return False
     # then make passes over the output files until no new files are encountered
     work_done = self._files_processed > 0
     Config.log('tp._files_processed = %i' % self._files_processed, tag='WORK_DONE_PASS_0')
     # do NOT look at _root_files_processed after pass 0 - we want to fully
     # process any files created during pass 0
     while self._pass < self.PASSES:
         self._files_processed = 0
         self._pass += 1
         self._walk_files(self.config.output)
         if self.config.signalled():
             logging.info("signal set, leaving tp.process after pass %i" % self._pass)
             work_done = False
             break
         Config.log('tp._files_processed = %i' % self._files_processed, tag='WORK_DONE_PASS_%i' % self._pass)
         if self._files_processed > 0:
             work_done = True
         else:
             break
     if self._pass >= self.PASSES:
         raise Exception("completed %i passes and still not done. failing" % self.PASSES)
     self.update_input_mgr_metadata()
     elapsed = u.timestamp_now() - start
     Config.log("tp completed in %i passes, %f seconds, work_done %s" % (self._pass, elapsed, work_done), tag='WORK_DONE')
     return work_done
Example #15
    def __init__(self, args, data):
        super().__init__(args, data)
        self.params = self.args["prepare"]["source"]

        self.tmp_path = Path.cwd() / "data" / "tmp"
        ensure_path(self.tmp_path)

        self.original = self.tmp_path / "original.cpp"
        self.source_path = self.tmp_path / "solution.cpp"
        self.log_path = self.tmp_path / "cppcheck_log.xml"
        self.tokens_path = self.tmp_path / "solution.tokens"
        self.dest_path = Path.cwd() / "data" / "datasets" / "source"

        self.compiler = self.params["compiler"]
        if self.compiler != "g++":
            raise NotImplementedError("Unsupported compiler {}".format(
                self.compiler))

        self.tokenizer = self._ensure_tokenizer_exists()
        self.parallel = args["prepare"]["parallel"]
Example #16
 def default_template_file_action(self,
                                  dir_name,
                                  file_name,
                                  dest_rel_path=None,
                                  dest_name=None):
     template_full = dir_name + '/' + file_name
     Config.log("default_template_file_action '%s'" % template_full,
                tag='DEFAULT_TEMPLATE_FILE_ACTION')
     if dest_name:
         rel_path = dest_rel_path
         dest_path = u.pathify(self.output_root, dest_rel_path)
     else:
         rel_path = u.make_rel_path(self.site_root, dir_name)
         dest_path = u.pathify(self.output_root, rel_path)
         dest_name = file_name
     u.ensure_path(dest_path)
     dest_full = u.pathify(dest_path, dest_name)
     info = {
         'name': dest_name,
         'path': dest_path,
         'rel_path': rel_path,
         'full': dest_full,
         'key': u.make_key(rel_path, dest_name)
     }
     if self.config.is_template_type(file_name):
         template = open(template_full).read()
         output = u.debracket(template, self.interpret)
         if not self.config.is_special_file(info['key']):
             open(dest_full, 'w').write(output)
             local = u.local_metadata(dest_path, dest_name)
             info['size'] = local['size']
             info['modified'] = local['modified']
             info['md5'] = u.md5(dest_full)
             self.track_file(info)
     else:
         shutil.copyfile(template_full, dest_full)
         local = u.local_metadata(dest_path, dest_name)
         info['size'] = local['size']
         info['modified'] = local['modified']
         info['md5'] = u.md5(dest_full)
         self.track_file(info)
Example #17
 def _parse_url_patterns(self):
     for ar in self._assemble_url_patterns():
         tag, pat, dest, start, idle, mode = ar
         if tag in self._url_patterns:
             msg = "ignoring duplicate url tag '%s'" % tag
             Config.log(msg, tag='URL_DUPLICATE')
             continue
         if not self._re['tag'].match(tag):
             Config.log(tag, tag='URL_INVALID_TAG')
             raise Exception('URL_INVALID_TAG')
         if not self._valid_pattern(pat):
             Config.log(pat, tag='URL_INVALID_PATTERN')
             raise Exception('URL_INVALID_PATTERN')
         daynum = None
         try:
             daynum = u.datestr_to_daynum(start)
         except Exception as exc:
             Config.log(start, tag='URL_INVALID_START')
             raise
         try:
             int(idle) # TODO check for nonneg and not too huge
         except ValueError:
             Config.log(idle, tag='URL_INVALID_IDLE')
             raise
         if mode != 'snap' and mode != 'fetch':
             Config.log(mode, tag='URL_INVALID_MODE')
             raise Exception('URL_INVALID_MODE')
         # this is the base path under which all files for this pattern are stored -
         # in this dir if no dest, but if dest is nonempty they may go in a subdir
         path = os.path.join(self.config.input, tag)
         u.ensure_path(path)
         self._url_patterns[tag] = {
             'tag': tag,
             'pattern': pat,
             'start': daynum,
             'dest': dest,
             'current_daynum': daynum,
             'idle': int(idle),
             'mode': mode,
             'path': path
         }
Example #18
 def __init__(self, config, dest_mgr=None, track_file_callback=None):
     self.config = config
     self.dest_mgr = dest_mgr
     self._track_file_callback = track_file_callback
     # site root is the tree of fixed assets that are always built and replicated
     # on the remote site (e.g. index.html)
     self.site_root = config.template_root + '/site'
     self.generate_root = config.template_root + '/generated'
     u.ensure_path(self.site_root)
     self.output_root = self.config.output
     self._worklists = {}
     self._re = {}
     self._re['worklist'] = re.compile(r'^worklist\s+(.*)$',
                                       re.MULTILINE | re.DOTALL)
     self._re['get_config'] = re.compile(
         r'^get_config\s+([a-z_]+)\s*:\s*(.*)')
     self._re['part'] = re.compile(r'^part\s+(.*)$')
     self._files_created = []
     self.page_context = None
     self._default_worklist = None
     self._mode_plugins = {}
Example #19
def combine_frames(frame_list, dest_dir, dest_name, delay=None):
    try:
        u.ensure_path(dest_dir)
        runargs = ['convert']
        if delay is not None:
            runargs = runargs + ['-delay', delay]
        runargs = runargs + [
            '@' + frame_list, '-loop', '0', dest_dir + '/' + dest_name
        ]
        (returncode, stdout, stderr) = u.run_command(runargs)
        logging.debug("convert returncode: %s\nstdout: %s\nstderr: %s" %
                      (returncode, stdout, stderr))
        if returncode == 0:
            return True
        else:
            logging.error("combine_frames failed with rc %i, stderr = '%s'" %
                          (returncode, stderr))
    except Exception as exc:
        logging.error("combine_frames frame_list '%s' exception '%s'" %
                      (frame_list, str(exc)))
        raise
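
The '@' prefix tells ImageMagick to read the frame file names from frame_list. For delay='20' the assembled argv is roughly (paths hypothetical):

# argv built by combine_frames for delay='20'; '-loop 0' makes the
# resulting GIF loop forever.
runargs = ['convert', '-delay', '20', '@frames.txt', '-loop', '0', 'out_dir/out.gif']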
Example #20
 def send_request(self, url, useragent, protocol=None):
     parsed = urlparse(url)
     scheme = parsed.scheme
     host = parsed.netloc
     resource = parsed.path or ""  # if there is no path, use an empty string instead of None
     port = parsed.port
     dir_queue = resource.split('/')
     dir_queue = list(
         filter("".__ne__, dir_queue)
     )  # keep only the elements that are not equal to ""
     dir_queue = [host] + dir_queue
     # the directory structure will be:
     # scheme - /
     #          | - host - /
     #                     | - path1
     #                     | - path2
     #                     | - path3
     #                     | - path4 etc
     filename = dir_queue[-1]
     dir_queue.pop()
     location = ensure_path(
         "output", scheme,
         dir_queue)  # creates the directory structure and returns the deepest path
     self.new_get_request(resource, protocol)
     self.add_header("Host", host)
     self.add_header("User-Agent", useragent)
     self.end_message()
     write_flag = False
     filename = location + "/" + filename + ".html"
     with open(filename, "w") as output:
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
             s.connect((host, 80))
             s.send(self.message.encode())
             while True:
                 data = s.recv(1024).decode()
                 if write_flag is False:
                     try:
                         idx = data.index("<!DOCTYPE")
                         write_flag = True
                     except ValueError:
                         try:
                             idx = data.index("<!doctype")
                             write_flag = True
                         except ValueError:
                             idx = None
                     print(data[0:idx])
                     if idx is not None:
                         output.write(data[idx:])
                 else:
                     output.write(data)
                 if "</html>" in data or "</HTML>" in data:
                     break
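
This crawler (like Examples #26 and #27 below) calls a different, three-argument ensure_path; per the inline comment it creates the nested directory structure and returns the deepest path. A sketch under that reading (the join order is an assumption):

import os

# Hypothetical sketch of the crawler's ensure_path(root, scheme, dir_queue):
# builds root/<scheme>/<host>/<path parts...>, creating every level, and
# returns the deepest directory so the caller can write the file into it.
def ensure_path(root, scheme, dir_queue):
    location = os.path.join(root, scheme, *dir_queue)
    os.makedirs(location, exist_ok=True)
    return location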
Example #21
    def __init__(self, config, url_patterns):
        self.config = config
        u.ensure_path(config.input)
        self._list_file_name = '_url_list.txt'
        # TODO should be able to separately configure dir for persist files (e.g. for git convenience)
        self._list_file = self.config.admin + '/' + self._list_file_name
        self._download_count = 0
        self._download_limit = self.config.get_int('input', 'url_download_limit', default=0)
        self._raw_url_patterns = url_patterns
        self._url_patterns = {}
        self._daynum_today = u.daynum_now()
        # metadata for downloaded files (key is product_tag/file_name):
        self._local_files = {}
        self._re = {}
        self._re['tag'] = re.compile(r'^[a-zA-Z0-9_]+')

        self._parse_url_patterns()

        # this is populated by TreeProcessor
        self.rules_run_files = []
Example #22
    def __init__(self, configFile):
        Config.main = self  # singleton
        self.configFile = configFile
        self._config = None
        self.plugin = None
        self.start_time = datetime.utcnow()
        self.special_mode_args = []
        self.final_summary = ''
        self.iteration = None
        self._re = {}
        self._re['n'] = re.compile(r'\n+')
        self._re['symbol'] = re.compile(r'\s*=\s*')
        self._re['comment'] = re.compile(r'^\s*(#.*)?$')
        self.symbols = {}
        self._rules_mask = {}
        self._debug_tags = {}
        self._template_extensions = {}
        self.log_to_console = False
        # special modes available on command line, and info about them
        self._special_modes = {
            'ftp_catchup': {},
            'ftp_remove': {},
            'im_rerun': {},
            'ftp_clear': {},
            'im_meta_from_local': {},
            'restore_from_archive': {},
            'test_deploy_test': {}
        }
        # TODO allow override via config
        self._special_file_tags = {'run_post_tree': True, 'run_pre_web': True}

        if configFile:
            self.load_config()
        self.input = self.get('local', 'input')
        u.ensure_path(self.input)
        self.output = self.get('local', 'output')
        u.ensure_path(self.output)
        self.admin = self.get('local', 'admin')
        u.ensure_path(self.admin)
        self.archive = self.get('local', 'archive')
        u.ensure_path(self.archive)

        # template root is where all templates, of all types, are stored
        self.template_root = self.get('process',
                                      'template_root',
                                      return_none=True)
        if self.template_root is None:
            self.template_root = self.admin + '/templates'

        # Config is in charge of logging options
        loglevel = logging.INFO
        self._logfile = self.get('local', 'logfile', return_none=True)
        if self._logfile:
            log_dir = self.admin + '/logs'
            u.ensure_path(log_dir)
            full = log_dir + '/' + self._logfile
            u.age_file(full, log_dir, move=True)
            # see https://stackoverflow.com/questions/1943747/python-logging-before-you-run-logging-basicconfig:
            # if someone tried to log something before basicConfig is called, Python creates a default handler that
            # goes to the console and will ignore further basicConfig calls. Remove the handler if there is one.
            root = logging.getLogger()
            if root.handlers:
                for handler in root.handlers:
                    root.removeHandler(handler)
            logging.basicConfig(filemode='w',
                                filename=full,
                                format='%(asctime)s %(message)s',
                                datefmt='(%b %d %Y %H:%M:%S)',
                                level=loglevel)
        logging.info("starting at %s" % self.display_times(self.start_time))
        logging.info("configFile is %s" % self.configFile)
        logging.info("working directory is %s" % os.getcwd())
        if self.is_true('local', 'log_config', absent_means_no=True):
            self.log_config()
        self._load_symbols()
        self._load_rules_to_run()
        self._load_debug_tags()
        self._load_template_extensions()
        self.log_to_console = self.is_true('local',
                                           'log_to_console',
                                           absent_means_no=True)

        # allow user to bail out on run by creating a signal file
        self._signal_file = self.get('actions',
                                     'signal_file',
                                     return_none=True)
        if self._signal_file:
            logging.info("will watch for signal file '%s'" % self._signal_file)

        logging.info("sys.path:\n" + '\n'.join([p for p in sys.path]))
Example #23
def get_model_path(args):
    res_path = Path.cwd() / "data" / "models" / args["model"] / "data"
    ensure_path(res_path)
    return res_path
Example #24
def panoply(interp, finfo, argstr):
    Config.log("file '%s' argstr '%s'" % (finfo['full'], argstr),
               tag='PANOPLY')
    try:
        interpret = u.interpret_method(interp)
        argstr = u.debracket(argstr, interpret, finfo=finfo)
        args = u.parse_arg_string(argstr)
        if 'action' not in args:
            raise Exception("panoply: 'action' arg is required")
        if 'dest' not in args:
            raise Exception("panoply: 'dest' arg is required")
        action = args['action']
        dest = u.debracket(args['dest'], interpret, finfo=finfo)
        panoply_templates = Config.main.template_root + '/panoply'
        src = finfo['full']
        if not src.endswith('.nc'):
            logging.error("panoply command: '%s' is not a dataset file" % src)
            return

        ### TODO more copied stuff from copy_with_metadata!
        (dest_dir, dest_file) = os.path.split(dest)
        u.ensure_path(dest_dir)
        jar = 'PanoplyCL.jar'

        size = None
        if 'size' in args:
            size = int(args['size'])
        size_factor = _panoply_size_to_size_factor(size)

        template_file = panoply_templates + '/' + action + '.pclt'
        try:
            template = open(template_file).read()
        except Exception as exc:
            logging.error(
                "panoply command: error '%s' opening template file '%s'" %
                (str(exc), template_file))
            raise
        symbols = {
            'dataset': src,
            'output_file': dest,
            'size_factor': size_factor
        }
        script = u.debracket(template, interpret, symbols=symbols)
        script_path = Config.main.output + '/tmp'
        u.ensure_path(script_path)
        script_file = script_path + '/' + action + '.pcl'
        open(script_file, 'w').write(script)

        working_dir = Config.main.get('tools', 'panoply_workdir')
        if not os.path.isdir(working_dir):
            err = "panoply: invalid panoply_workdir '%s'" % working_dir
            logging.error(err)
            raise Exception(err)

        call_args = ['java', '-jar', jar, script_file]
        (returncode, stdout, stderr) = u.run_command(call_args, working_dir)
        logging.debug("returncode: %s\nstdout: %s\nstderr: %s" %
                      (returncode, stdout, stderr))
        if returncode == 0:
            # we know the file that was created, so make its metadata now
            newfi = {}
            newfi['name'] = dest_file
            newfi['path'] = dest_dir
            newfi['full'] = dest
            newfi['rules_run'] = False
            tmp = u.local_metadata(newfi['path'], newfi['name'])
            newfi['size'] = tmp['size']
            newfi['modified'] = tmp['modified']
            return {'new_finfo': newfi}
        else:
            logging.error("panoply failed with rc '%i', stderr = '%s'" %
                          (returncode, stderr))
    except Exception as exc:
        # py2 logging.error("panoply exception '%s'" % exc.message)
        logging.error("panoply exception '%s'" % str(exc))
Example #25
def download_page(page, level):

    mapped = map_local_path(page)
    clean = mapped[0]
    target = join("doc", mapped[1])

    src = join("doc/orig", mapped[1])

    if page in htmls:
        return mapped[1]

    if level > 10:
        pages.append(page)
        return mapped[1]

    if "//foundationdb.com" not in page:
        return page
    if ".pdf" in page:
        ensure_file(page, join("doc", mapped[1]))
        return mapped[1]
    if ".png" in page:
        ensure_file(page, join("doc", mapped[1]))
        return mapped[1]
    if "javadoc" in page:
        print(" skip javadoc")
        return page
    if "courses" in page:
        return page

    ensure_file(page, src)

    with open(src, 'r') as fh:
        for line in fh:
            m = redirect_regex.search(line)
            if m:
                new_url = m.groupdict()["url"].replace("\\/", "/")

                return download_page("https://web.archive.org" + new_url,
                                     level + 1)

    htmls.add(page)

    links = set()

    def match(m):
        groups = m.groupdict()
        url = groups["url"]
        kind = groups["kind"]

        name = map_local_path(url)[0]
        full_url = "https://web.archive.org" + url

        if kind == "":
            full_local = download_page(full_url, level + 1)
        else:
            full_local = kind + "/" + name
            ensure_file(full_url, "doc/" + full_local)

        # matching local path
        return relpath(full_local, dirname(mapped[1]))

    with open(src, 'r') as fh:
        lines = list(fh)
        clean_wb(lines)
        result = []
        for l in lines:
            result.append(ref_regex.sub(match, l))

        print("Saving to ", target)
        ensure_path(target)

        with open(target, 'w') as w:
            w.writelines(result)
        return mapped[1]
Example #26
 def send_request(self, url, useragent, protocol):
     self.num += 1
     print(self.num, url)
     self.links[url] = True
     parsed = urlparse(url)
     scheme = parsed.scheme
     if scheme == "http":
         implicit_port = 80
     elif scheme == "https":
         implicit_port = 443
     else:
         implicit_port = 80
     host = parsed.netloc
     resource = parsed.path or ""        # if there is no path, use an empty string instead of None
     port = parsed.port
     dir_queue = resource.split('/')
     dir_queue = list(filter("".__ne__, dir_queue))  # keep only the elements that are not equal to ""
     if len(dir_queue) == 0:
         filename = "index.html"
         dir_queue.append(filename)
     else:
         filename = dir_queue[-1]
     dir_queue = [host] + dir_queue
     # the directory structure will be:
     # scheme - /
     #          | - host - /
     #                     | - path1
     #                     | - path2
     #                     | - path3
     #                     | - path4 etc
     if filename:
         dir_queue.pop()
     location = ensure_path("output", scheme, dir_queue)     # creates the directory structure and returns the deepest path
     if filename:
         message = RequestCreator.new_get_request(resource, protocol)
         message = RequestCreator.add_header("Host", host, message)
         message = RequestCreator.add_header("User-Agent", useragent, message)
         message = RequestCreator.end_message(message)
         write_flag = False
         if filename.endswith(".html"):
             filename = location + "/" + filename
         else:
             filename = location + "/" + filename + ".html"
         # with open(filename, "w") as output:
         file_data = ""
         # if the host is cached, just take its IP; otherwise fetch robots.txt and record what is allowed
         if not self.cache.is_cached(host):
             # if the domain is not in the cache, check robots.txt
             # get_ip caches the domain
             domain_ip = self.cache.get_ip(host)
             if domain_ip is not None:
                 self.robots[host] = RoboFile(host, domain_ip)
                 self.robots[host].obtain(host, useragent, protocol, implicit_port)
                 if not self.robots[host].ok:
                     return
                 else:
                     # excluded by robots.txt
                     if self.robots[host].match_link(parsed.path):
                         return
             else:
                 return
         else:
             domain_ip = self.cache.get_ip(host)
             if domain_ip is None:
                 return
             else:
                 if not self.robots[host].ok:
                     return
                 else:
                     if self.robots[host].match_link(parsed.path):
                         return
         if domain_ip is not None:
             headers = ""
             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                 if implicit_port == 443:
                     context = ssl.create_default_context()
                     s = context.wrap_socket(s, server_hostname=host)
                 s.connect((domain_ip, implicit_port))
                 s.send(message.encode())
                 former_first = ""
                 data = bytes(0)
                 while True:
                     s.settimeout(1)
                     try:
                         try:
                             data += s.recv(1024)
                             data = data.decode()
                         except UnicodeDecodeError:
                             continue
                         firstline = data.splitlines()[0]
                         if firstline == former_first:
                             break
                         former_first = firstline
                     except socket.timeout:
                         break
                     if write_flag is False:
                         try:
                             idx = data.index("<!DOCTYPE")
                             write_flag = True
                         except ValueError:
                             try:
                                 idx = data.index("<!doctype")
                                 write_flag = True
                             except ValueError:
                                 try:
                                     idx = data.index("<html")
                                     write_flag = True
                                 except ValueError:
                                     idx = None
                         headers += data[0:idx]
                         if idx is not None:
                             file_data += data[idx:]
                     else:
                         file_data += data
                     if "</html>" in data or "</HTML>" in data:
                         break
                     data = bytes(0)
             # CHECK THE HEADERS
             code = RequestCreator.get_response_code(headers)
             redirected = False
             if code.startswith("3"):
                 # try redirect
                 tries = 5
                 location = RequestCreator.extract_location(headers)
                 if location is not None:
                     while not redirected and tries > 0:
                         data, rheaders = self.redirect(location, domain_ip, protocol, useragent)
                         code = RequestCreator.get_response_code(rheaders)
                         if code.startswith("2"):
                             redirected = True
                         elif code.startswith("3"):
                             location = RequestCreator.extract_location(rheaders)
                             if location is None:
                                 break
                             tries -= 1
                         else:   # error
                             break
             elif code.startswith("4") or code.startswith("5"):
                 # error
                 return
             if code.startswith("2") or redirected:
                 soup = BeautifulSoup(file_data, 'html.parser')
                 metas = soup.find_all("meta")
                 ok1 = False
                 ok2 = False
                 found = False
                 for meta in metas:
                     if "name" in meta:
                         if meta["name"].lower() == "robots":
                             found = True
                             if meta["content"].lower() == "all" or meta["content"].lower() == "index":
                                 ok1 = True
                             if meta["content"].lower() == "all" or meta["content"].lower() == "follow":
                                 ok2 = True
                 if not found:
                     ok1 = True
                     ok2 = True
                 if ok1:
                     with open(filename, "w", encoding="utf-8") as output:
                         output.write(file_data)
                 if ok2:
                     links = soup.find_all("a", href=True)
                     for link in links:
                         if link["href"] in self.links:
                             continue
                         if "http" in link["href"] or "https" in link["href"]:
                             to_add = link["href"]
                             if "#" in to_add:
                                 to_add = to_add[:to_add.rfind("#")]
                             if to_add not in self.queue:
                                 self.queue.append(to_add)
                         else:
                             to_add = urljoin(url, link["href"])
                             if "#" in to_add:
                                 to_add = to_add[:to_add.rfind("#")]
                             if to_add not in self.queue:
                                 self.queue.append(to_add)
Example #27
 def send_request(self, url, useragent, protocol=None):
     self.links[url] = True
     parsed = urlparse(url)
     scheme = parsed.scheme
     host = parsed.netloc
     resource = parsed.path or ""  # if there is no path, use an empty string instead of None
     port = parsed.port
     dir_queue = resource.split('/')
     if dir_queue[-1] == "":
         filename = None
     else:
         filename = dir_queue[-1]
     dir_queue = list(
         filter("".__ne__, dir_queue)
     )  # keep only the elements that are not equal to ""
     dir_queue = [host] + dir_queue
     # the directory structure will be:
     # scheme - /
     #          | - host - /
     #                     | - path1
     #                     | - path2
     #                     | - path3
     #                     | - path4 etc
     if filename:
         dir_queue.pop()
     location = ensure_path(
         "output", scheme,
         dir_queue)  # creates the directory structure and returns the deepest path
     if filename:
         self.new_get_request(resource, protocol)
         self.add_header("Host", host)
         self.add_header("User-Agent", useragent)
         self.end_message()
         write_flag = False
         if filename.endswith(".html"):
             filename = location + "/" + filename
         else:
             filename = location + "/" + filename + ".html"
         # with open(filename, "w") as output:
         file_data = ""
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
             s.connect((host, 80))
             s.send(self.message.encode())
             while True:
                 data = s.recv(1024).decode()
                 if write_flag is False:
                     try:
                         idx = data.index("<!DOCTYPE")
                         write_flag = True
                     except ValueError:
                         try:
                             idx = data.index("<!doctype")
                             write_flag = True
                         except ValueError:
                             try:
                                 idx = data.index("<html")
                                 write_flag = True
                             except ValueError:
                                 idx = None
                     # print(data[0:idx])
                     if idx is not None:
                         file_data += data[idx:]
                 else:
                     file_data += data
                 if "</html>" in data or "</HTML>" in data:
                     break
         soup = BeautifulSoup(file_data, 'html.parser')
         metas = soup.find_all("meta")
         ok1 = False
         ok2 = False
         found = False
         for meta in metas:
             if meta.get("name", "").lower() == "robots":
                 found = True
                 content = meta.get("content", "").lower()
                 if content in ("all", "index"):
                     ok1 = True
                 if content in ("all", "follow"):
                     ok2 = True
         if not found:
             ok1 = True
             ok2 = True
         if ok1:
             with open(filename, "w") as output:
                 output.write(file_data)
         if ok2:
             links = soup.find_all("a", href=True)
             for link in links:
                 if link["href"] in self.links:
                     continue
                 if "http" in link["href"] or "https" in link["href"]:
                     self.queue.append(link["href"])
                 else:
                     if port:
                         to_add = scheme + "://" + host + ":" + str(port)
                     else:
                         to_add = scheme + "://" + host
                     if link["href"].startswith('/'):
                         to_add += link["href"]
                     else:
                         to_add += "/" + link["href"]
                     self.queue.append(to_add)
Example #28
 def get_model_path(self):
     path = Path.cwd() / "data" / "models" / self.model_name
     ensure_path(path)
     return path
Example #29
if args.out is None:
    log_info("Output path not specified; using the default: %s" % path)
else:
    path = args.out

if args.debug:
    log_debug("Debug mode enabled")
if args.fine:
    log_fine("Displaying fine-grained logging")
if args.re_meta:
    log_info("Redownload all metadata")


# === Ensure the path to the output directories
meta_dir_path = path + "/metadata"
ensure_path(meta_dir_path)
price_dir_path = path + "/price"
ensure_path(price_dir_path)


# === Retrieve the ids of the target applications ===
data = retrieve_data(APP_LIST_URL)
if data is None:
    sys.exit(1)

apps = []
for app in data['applist']['apps']['app']:
    # According to CT's theory, only the apps with id divisible by 10 are games
    if app['appid'] % 10 == 0:
        apps.append((app['appid'], app['name']))
Example #30
def make_hybrid_forecast(webmaker, dest_key, work_item, list_args):
    try:
        Config.log("dest_key = '%s'" % dest_key, tag='HYBRID_FORECAST_START_' + dest_key)
        (dest_path, dest_name) = os.path.split(dest_key)
        finfos = work_item['roles']['default']
        if not len(finfos):
            msg = "logic error, no finfos on default list for dest_key '%s'" % dest_key
            raise Exception(msg)
        archive_root = webmaker.config.get('local', 'archive', return_none=True)
        if not archive_root:
            logging.error("make_hybrid_forecast: fail, no archive_root configured")
            return 'todo error1'

        local_store_start = u.datestr_to_daynum(webmaker.config.get('local', 'local_store_start'))
        history_days = webmaker.config.get_int('tools', 'hybrid_forecast_history_days', default=20)
        history_frame0s = []

        extract_frames = webmaker.config.plugin.get('extract_frames')
        combine_frames = webmaker.config.plugin.get('combine_frames')
        get_frame_count = webmaker.config.plugin.get('get_frame_count')

        # we need the current image for forecast frames, should be finfo
        current_image = finfos[0]['full']
        if not current_image or not os.path.isfile(current_image):
            msg = "make_hybrid_forecast can't find expected current_image '%s'" % current_image
            logging.error(msg)
            return 'todo error2'
        frame_ct = get_frame_count(current_image)
        if frame_ct < 2:
            msg = "make_hybrid_forecast: current_image '%s' not multi-frame, taking no action" % current_image
            logging.warning(msg)
            return ''
        logging.info("got current_image file '%s'" % current_image)
        (current_path, current_name) = os.path.split(current_image)

        # look for frame0 files first in output, then archive
        roots = [webmaker.config.output + '/static', archive_root + '/output/static']

        # find history files going back from date of current file
        if 'item_args' in work_item and 'date_string' in work_item['item_args']:
            date_str = work_item['item_args']['date_string']
        else:
            msg = "make_hybrid_forecast can't find date_string"
            logging.error(msg)
            return 'todo error3'

        testdate = u.datestr_to_daynum(date_str) - 1
        while True:
            date_str = u.daynum_to_datestr(testdate)
            # TODO what if it isn't a gif? also hard-coded convention
            fname = u.find_in_paths(
                current_name + '_frame_00000.gif',
                'esrl-daily-forecasts/' + date_str + '/frame0',
                roots,
                options={'full': True}
            )
            if fname:
                # logging.info("got frame0 file '%s'" % fname)
                Config.log(fname, tag='HYBRID_FORECAST_FRAME_GOT0')
                history_frame0s.insert(0, fname)
            else:
                Config.log("frames not contiguous by date, missing '%s'" % date_str, 'HYBRID_FORECAST_FRAME_ERR')
            if len(history_frame0s) >= history_days:
                break
            testdate -= 1
            if testdate < local_store_start:
                msg = "make_hybrid_forecast only found %i history frames" % len(history_frame0s)
                logging.warning(msg)
                break
        hist_frame_ct = len(history_frame0s)
        if not hist_frame_ct:
            msg = "key '%s': no history frames, can't proceed" % dest_key
            Config.log(msg, tag='HYBRID_FORECAST_ERROR')
            webmaker.config.add_to_final_summary(msg)
            return 'todo error4'
        # extract frames from current image
        work_path = webmaker.config.output + '/tmp/' + dest_name
        u.ensure_path(work_path)
        extract_frames(webmaker.interpret, finfos[0], 'out_dir:' + work_path)

        # make list of frame files for new image (TODO review path of tmp file)
        flist = work_path + '/' + dest_name + '.frames'
        content = "\n".join(history_frame0s)
        # add forecast frames
        for frame_num in range(1, frame_ct - 1):
            fname = work_path + '/' + current_name + '_frame_' + '{:05d}'.format(frame_num) + '.gif'
            content += '\n' + fname
        open(flist, 'w').write(content)
        # make output animation
        if 'delay' in list_args:
            delay = list_args['delay']
        else:
            delay = webmaker.config.get('tools', 'default_frame_delay', return_none=True)
        combine_frames(flist, webmaker.output_root + '/' + dest_path, dest_name, delay=delay)
        # TODO use utility method??
        display_name = dest_name
        if 'display_name' in finfos[0]:
            display_name = finfos[0]['display_name']
        item_args = work_item['item_args']
        if 'display_name' in item_args:
            display_name = u.parse_subargs(
                item_args['display_name'],
                webmaker.interpret,
                webmaker.config,
                finfos[0]
            )
        # TODO REVIEW: we do this for panoply, should we here also?
        info = {
            'name': dest_name,
            'path': webmaker.output_root + '/' + dest_path,
            'rel_path': dest_path,
            'full': webmaker.output_root + '/' + dest_key,
            'key': dest_key,
            'display_name': display_name
        }
        tmp = u.local_metadata(info['path'], info['name'])
        info['size'] = tmp['size']
        info['modified'] = tmp['modified']
        webmaker.track_file(info)
        work_item['roles']['output'] = info
        os.remove(flist)
        Config.log("key '%s'" % dest_key, tag='HYBRID_FORECAST_SUCCESS')
        # TODO review - returning empty string because invoking from html
        return ""
    except Exception as exc:
        Config.log(str(exc), tag='HYBRID_FORECAST_EXCEPTION')
        raise