Example #1
 def dump_templates(self, data):
     if not data.get('templates'):
         path = '/'.join(strip_json(self.context['path']).split('/')[:2])
         storage = self.context['storage']
         try:
             names = OrderedDict((strip_json(fname), 1)
                                 for fname in storage.listdir(path)[1])
             data['samples'] = list(names)
             return data
         except OSError:
             # Directory does not exist
             data['samples'] = []
             return data
     templates = []
     for template in data['templates']:
         # Only migrate item templates
         if template.get('page_type') != 'item':
             continue
         template['id'] = template.get('page_id') or template.get('name')
         templates.append(template['id'])
         path = self.context['path']
         path = '/'.join((strip_json(path).strip('/'),
                         '{}.json'.format(template['id'])))
         sample = json.dumps(template, sort_keys=True, indent=4)
         self.context['storage'].save(path, ContentFile(sample, path))
     data['samples'] = templates
     path, storage = self.context['path'], self.context['storage']
     spider = json.dumps(data, indent=4, sort_keys=True)
     storage.save(path, ContentFile(spider, path))
     return data
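
This and the following examples all follow the same Django storage pattern: serialize the data to a string, wrap it in ContentFile (django.core.files.base.ContentFile), and hand it to Storage.save() under an explicit path. A minimal, self-contained sketch of that pattern, using Django's default_storage and an illustrative path and payload rather than anything taken from the examples:

    import json

    from django.core.files.base import ContentFile
    from django.core.files.storage import default_storage

    # ContentFile wraps an in-memory string/bytes object so Storage.save()
    # can treat it exactly like a file read from disk.
    payload = json.dumps({'id': 'example-template'}, sort_keys=True, indent=4)
    path = 'spiders/example/example-template.json'  # hypothetical path
    default_storage.save(path, ContentFile(payload, path))
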
Example #2
 def _update_sample(self, sample=None, project=None, data=None):
     """Recompile sample with latest annotations"""
     if sample is None:
         sample = self._load_sample(data, project)
         path = 'spiders/{}/templates/{}/{{}}.html'.format(
             self.data['spider'], self.data['sample'])
     else:
         path = _html_path(sample)
     if hasattr(sample, 'dump'):
         sample = sample.dump()
     html_path = path.format
     for name, type_ in (('original_body', 'raw'), ('rendered_body', None)):
         try:
             path = html_path(name)
             html = decode(self.storage.open(path).read())
         except IOError:
             if not self.tab:
                 six.reraise(*sys.exc_info())
             html = decoded_html(self.tab, type_)
             if html:
                 self.storage.save(path, ContentFile(html, path))
             else:
                 html = '<html></html>'
         sample[name] = html
     return sample
Example #3
 def open_with_default(name, default=None, *args, **kwargs):
     try:
         return open_(name, *args, **kwargs)
     except IOError as error:
         if error.errno == errno.ENOENT:
             return ContentFile(json.dumps(default), name)
         raise error
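
When the file is missing (errno.ENOENT), the helper above returns an in-memory ContentFile holding the JSON-encoded default instead of raising, so callers can treat missing and existing files the same way. A hypothetical usage sketch (the file name and default are illustrative):

    # json.load() works on the returned object in either case, because
    # ContentFile supports read() just like a regular file.
    extractors = json.load(open_with_default('extractors.json', default={}))
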
Example #4
    def _commit_delete(self, collector, saved_paths=None, deleted_paths=None):
        if saved_paths is None:
            saved_paths = set()
        if deleted_paths is None:
            deleted_paths = set()

        for model, fields in iteritems(collector.save):
            model.resolve_attributes(snapshots=('committed',))
            model._stage_changes(fields)

        for model in collector.delete:
            path = model.storage_path(model, snapshots=('committed',))
            if model.opts.owner:
                if path and path not in saved_paths and path not in deleted_paths:
                    to_save = self._get_object_to_dump(
                        model, parent_snapshots=('committed',))
                    model.storage.save(path, ContentFile(
                        to_save.dumps(state='staged'), path))
                    saved_paths.add(path)
            else:
                if path not in deleted_paths:
                    model.storage.delete(path)
                    deleted_paths.add(path)

        for model, fields in iteritems(collector.save):
            model._commit_changes(saved_paths, deleted_paths)

        for model in collector.delete:
            store = model.data_store
            store.update_snapshot('working', ('working', 'staged', 'committed'))
            store.clear_snapshot('staged')
            store.clear_snapshot('committed')
Example #5
 def dump_templates_to_file(self, data):
     if 'template_names' in data or 'templates' not in data:
         return data
     template_names = []
     for template in data['templates']:
         # Only migrate item templates
         if template.get('page_type') != 'item':
             continue
         template['id'] = template.get('page_id') or template.get('name')
         template_names.append(template['id'])
         path = self.context['path']
         path = '/'.join((path[:-len('.json')].strip('/'),
                         '{}.json'.format(template['id'])))
         sample = json.dumps(template, sort_keys=True, indent=4)
         self.context['storage'].save(path, ContentFile(sample, path))
     data['template_names'] = template_names
     del data['templates']
     path, storage = self.context['path'], self.context['storage']
     spider = json.dumps(data, indent=4, sort_keys=True)
     storage.save(path, ContentFile(spider, path))
     return data
Example #6
 def create_project(self, name):
     self.validate_project_name(name)
     project_filename = self.project_filename(name)
     project_files = {
         'project.json': templates['PROJECT'],
         'scrapy.cfg': templates['SCRAPY'],
         'setup.py': templates['SETUP'] % str(name),
         'items.json': templates['ITEMS'],
         join('spiders', '__init__.py'): '',
         join('spiders', 'settings.py'): templates['SETTINGS'],
     }
     for filename, template in project_files.items():
         path = join(project_filename, filename)
         self.storage.save(path, ContentFile(template, path))
Example #7
    def dump_actions(self, data):
        if not data.get('actions'):
            path = '/'.join(strip_json(self.context['path']).split('/')[:2])
            path = path + '/actions'

            storage = self.context['storage']
            try:
                names = OrderedDict((strip_json(fname), 1)
                                    for fname in storage.listdir(path)[1])
                data['actions'] = list(names)
                return data
            except OSError:
                # Directory does not exist
                data['actions'] = []
                return data

        actions = []
        for action in data['actions']:
            # Unlike the template migration, actions are not filtered by
            # page_type: every action is written out to its own file.

            action['id'] = action.get('id') or action.get('name')
            actions.append(action['id'])
            path = self.context['path'] + '/actions'
            path = '/'.join(
                (strip_json(path).strip('/'), '{}.json'.format(action['id'])))
            action_content = json.dumps(action, sort_keys=True, indent=4)
            self.context['storage'].save(path,
                                         ContentFile(action_content, path))
        data['actions'] = actions

        path, storage = self.context['path'], self.context['storage']
        spider = json.dumps(data, indent=4, sort_keys=True)
        storage.save(path, ContentFile(spider, path))

        return data
Example #8
 def _migrate_html(self, sample):
     base_path = strip_json(self.context['path']).strip('/')
     # Clean and use annotated body if there is no original body present
     if 'annotated_body' in sample and not sample.get('original_body'):
         sample['original_body'] = self._clean(sample['annotated_body'])
     storage = self.context['storage']
     for key, value in sample.items():
         if (not value or not key.endswith('_body') or
                 key == 'annotated_body'):
             continue
         path = '/'.join((base_path, '{}.html'.format(key)))
         html = value
         if hasattr(html, 'encode') and isinstance(html, six.text_type):
             html = encode(html).decode('utf-8')
         if not storage.exists(path):
             storage.save(path, ContentFile(html, path))
     return sample
Example #9
    def _commit_changes(self, saved_paths=None, deleted_paths=None):
        if saved_paths is None:
            saved_paths = set()
        if deleted_paths is None:
            deleted_paths = set()

        for model in chain([self],
                           (model
                            for model, _ in self._staged_model_references())):
            store = model.data_store
            dirty = (model._file_fields.intersection(iterkeys(store['staged']))
                     or 'project' in store.dirty_fields(
                         'working', ('committed', )))
            path = model.storage_path(model, snapshots=('staged', 'committed'))
            old_path = model.storage_path(model,
                                          snapshots=('committed', 'staged'))
            if dirty or old_path != path:
                if path not in saved_paths and path not in deleted_paths:
                    to_save = self._get_object_to_dump(
                        model, parent_snapshots=('staged', 'committed'))
                    model.storage.save(
                        path, ContentFile(to_save.dumps(state='staged'), path))
                    saved_paths.add(path)
                if old_path != path and old_path not in deleted_paths:
                    try:
                        model.storage.delete(old_path)
                    except IOError as ex:
                        # Assume missing files are already deleted
                        if ex.errno != errno.ENOENT:
                            six.reraise(*sys.exc_info())
                    deleted_paths.add(old_path)
        for model in chain([self],
                           (model
                            for model, _ in self._staged_model_references())):
            store = model.data_store
            dirty = set(iterkeys(store['staged']))
            if dirty:
                store.update_snapshot('committed', ('staged', ), fields=dirty)
                store.clear_snapshot('staged')
                store.clear_snapshot('working',
                                     fields=dirty.intersection(
                                         iterkeys(store['working'])))
Example #10
    def _commit_delete(self, collector, saved_paths=None, deleted_paths=None):
        if saved_paths is None:
            saved_paths = set()
        if deleted_paths is None:
            deleted_paths = set()

        for model, fields in iteritems(collector.save):
            model.resolve_attributes(snapshots=('committed', ))
            model._stage_changes(fields)

        for model in collector.delete:
            path = model.storage_path(model,
                                      snapshots=('committed', 'staged',
                                                 'working'))
            if model.opts.owner:
                if path and path not in saved_paths and path not in deleted_paths:
                    to_save = self._get_object_to_dump(
                        model, parent_snapshots=('committed', ))
                    model.storage.save(
                        path, ContentFile(to_save.dumps(state='staged'), path))
                    saved_paths.add(path)
            else:
                if path not in deleted_paths:
                    try:
                        model.storage.delete(path)
                    except IOError as ex:
                        # Assume missing files are already deleted
                        if ex.errno != errno.ENOENT:
                            six.reraise(*sys.exc_info())
                    deleted_paths.add(path)

        for model, fields in iteritems(collector.save):
            model._commit_changes(saved_paths, deleted_paths)

        for model in collector.delete:
            store = model.data_store
            store.update_snapshot('working',
                                  ('working', 'staged', 'committed'))
            store.clear_snapshot('staged')
            store.clear_snapshot('committed')
Example #11
def _update_sample(data, socket, sample=None, project=None):
    """Recompile sample with latest annotations"""
    if sample is None:
        project = project or socket.spiderspec.project
        spiders = project.spiders
        spider = spiders[data['spider']]
        samples = spider.samples
        sample = samples[data['sample']]
        path = 'spiders/{}/{}/{{}}.html'.format(data['spider'], data['sample'])
    else:
        path = _html_path(sample)
    if hasattr(sample, 'dump'):
        sample = sample.dump()
    html_path = path.format
    for name, type_ in (('original_body', 'raw'), ('rendered_body', None)):
        try:
            path = html_path(name)
            html = decode(socket.storage.open(path).read())
        except IOError:
            html = decoded_html(socket.tab, type_)
            socket.storage.save(path, ContentFile(html, path))
        sample[name] = html
    return sample
Example #12
 def save_file(self, name, file_path, file_contents):
     self._open_repo(name)
     self.storage.save(
         file_path,
         ContentFile(json.dumps(file_contents, sort_keys=True, indent=4),
                     file_path))
Example #13
 def save_raw(serializer, data):
     context = serializer.context
     path, storage = context['path'], context['storage']
     # Drop the *_body fields so the raw HTML is not serialized into the JSON
     data = {k: v for k, v in data.items() if not k.endswith('_body')}
     sample = json.dumps(data, indent=4, sort_keys=True)
     storage.save(path, ContentFile(sample, path))
Example #14
 def open_(name, *args, **kwargs):
     try:
         data = files[name]
     except KeyError:
         raise IOError(2, 'No file or directory', name)
     return ContentFile(data, name)
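
This open_ looks like a test double: it serves content from a pre-seeded files mapping instead of touching disk, and raises IOError with errno 2 (ENOENT) for unknown names to mimic a missing file. A hypothetical test setup (the path and contents are illustrative):

    # `files` maps storage paths to raw file contents; any other path
    # behaves like a missing file.
    files = {
        'spiders/example.json': json.dumps({'templates': []}),
    }
    spider = json.load(open_('spiders/example.json'))
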
Example #15
 def savejson(self, obj, resources):
     # convert to json in a way that will make sense in diffs
     fname = self._rfilename(*resources)
     self.storage.save(
         fname, ContentFile(json.dumps(obj, sort_keys=True, indent=4),
                            fname))