def test_extract_repeated_field(self):
     sample = {
         'plugins': {'annotations-plugin': {}},
         'url': 'https://stackoverflow.com',
         'original_body': re.sub(
             'data-scrapy-annotate=".*"', '', html_page._body),
         'scrapes': 'default',
         'page_id': '507f520c3bf361f4c5cd55c44307a271bccb2218',
         'version': '0.13.0'
     }
     data = open_spec('so_annotations.json')
     annos, items, results = data['annos'], data['items'], data['results']
     sample['plugins']['annotations-plugin']['extracts'] = annos
     spider = IblSpider('so', make_spider(sample=sample),
                        items, {}, Settings())
     page = HtmlResponse('http://url', body=sample['original_body'],
                         encoding='utf-8')
     items = [i for i in spider.parse(page) if not isinstance(i, Request)]
     keys = {(u'_index', u'_template', u'_type', u'answered', u'tags',
              u'title', 'url')}
     self.assertEqual({tuple(sorted(i.keys())) for i in items}, keys)
     self.assertEqual([items[0], items[52], items[-1]], results)
     self.assertEqual(len(items), 96)
     spider, page, results = open_spider_page_and_results('autoevolution.json')
     items = [i for i in spider.parse(page) if not isinstance(i, Request)]
     self.assertEqual(items, results)
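Across these examples the pattern is the same: assemble a spider spec (inline or loaded from a project), instantiate IblSpider(name, spec, item_schemas, extractors, settings), then feed it an HtmlResponse and separate the extracted items from the follow-up Request objects it yields. A minimal sketch of that flow, assuming slybot's IblSpider (the import path is an assumption) and Scrapy's HtmlResponse, Request and Settings; minimal_spec and page_html are hypothetical placeholders:

from scrapy.http import HtmlResponse, Request
from scrapy.settings import Settings
from slybot.spider import IblSpider  # assumed import path

# Hypothetical spec mirroring the fields used in the tests above; real specs
# carry annotated samples in 'templates'.
minimal_spec = {
    'start_urls': ['http://example.com'],
    'links_to_follow': 'auto',
    'respect_nofollow': False,
    'follow_patterns': [],
    'exclude_patterns': [],
    'init_requests': [],
    'templates': []
}
page_html = b'<html><body><h1>placeholder</h1></body></html>'

spider = IblSpider('example', minimal_spec, {}, {}, Settings())
response = HtmlResponse('http://example.com', body=page_html, encoding='utf-8')
output = list(spider.parse(response))
items = [o for o in output if not isinstance(o, Request)]
requests = [o for o in output if isinstance(o, Request)]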
Example #3
 def test_trained(self):
     base = 'http://www.daft.ie/ireland/houses-for-sale/?offset={}'.format
     daft_url = base(10)
     spec = {
         'start_urls': [daft_url],
         'links_to_follow': 'auto',
         'respect_nofollow': False,
         'follow_patterns': [],
         'exclude_patterns': [],
         'init_requests': [],
         'templates': [daft_sample]
     }
     settings = Settings()
     settings.set('LOADED_PLUGINS', load_plugins(settings))
     spider = IblSpider('hn', spec, {}, {}, settings=settings)
     request = Request(daft_url)
     response = HtmlResponse(url=daft_url,
                             body=daft_body,
                             request=request,
                             encoding="utf-8")
     data = {
         r.url
         for r in spider.handle_html(response) if isinstance(r, Request)
     }
     self.assertEqual({base(i) for i in (90, 80, 70)}, data)
 def test_extract_multiple_item_types(self):
     spider = IblSpider('xceed', xceed_spider, xceed_spider['items'], {},
                        Settings())
     data = list(spider.parse(
         HtmlResponse('http://url',
                      body=xceed_spider['templates'][0]['original_body'],
                      encoding='utf-8')
     ))
     self.assertEqual(data[:6], xceed_spider['results'])
 def test_extract_multiple_item_types(self):
     spider = IblSpider('xceed', xceed_spider, xceed_spider['items'], {},
                        Settings())
     data = list(spider.parse(
         HtmlResponse('http://url',
                      body=xceed_spider['templates'][0]['original_body'],
                      encoding='utf-8')
     ))
     items = [d for d in data if not isinstance(d, Request)]
     self.assertEqual(items, xceed_spider['results'])
 def test_extract_multiple_item_types(self):
     spider = IblSpider('xceed', xceed_spider, xceed_spider['items'], {},
                        Settings())
     data = list(spider.parse(
         HtmlResponse('http://url',
                      body=xceed_spider['templates'][0]['original_body'],
                      encoding='utf-8')
     ))
     items = sorted([d for d in data if not isinstance(d, Request)],
                    key=lambda x: ('ticket', 'venue', 'event').index(x['_type']))
     self.assertEqual(items, xceed_spider['results'])
Example #10
 def create_spider(self, project, auth_info, params, **kwargs):
     spider = params.get('spider')
     if spider is None:
         return None, None
     pspec = self.bot.spec_manager.project_spec(project, auth_info)
     try:
         spider_spec = pspec.resource('spiders', spider)
         spider_spec['templates'] = []
         for template in spider_spec.get('template_names', []):
             try:
                 spider_spec['templates'].append(
                     pspec.resource('spiders', spider, template))
             except TypeError:
                 # Template names not consistent with templates
                 pspec.remove_template(spider, template)
         items_spec = pspec.resource('items')
         extractors = pspec.resource('extractors')
         return (IblSpider(spider, spider_spec, items_spec, extractors,
                           self.bot.runner.settings,
                           **kwargs), spider_spec['templates'])
     except IOError as ex:
         if ex.errno == errno.ENOENT:
             log.msg("skipping extraction, no spec: %s" % ex.filename)
             return None, None
         else:
             raise
Example #11
    def open_spider(self, meta, storage=None, project=None):
        if not (meta.get('project') and meta.get('spider')):
            return {'error': 4005, 'reason': 'No project specified'}

        if (self.user.authorized_projects is not None
                and meta['project'] not in self.user.authorized_projects
                and not self.user.staff):
            return {
                'error': 4004,
                'reason': 'Project "%s" not found' % meta['project']
            }
        spider_name = meta['spider']

        if project is None:
            project = Project(storage, id=meta.get('project'))

        try:
            spider_model = project.spiders[spider_name]
        except (IOError, KeyError):
            return {
                'error': 4004,
                'reason': 'Spider "%s" not found' % spider_name
            }
        spider_name, spider, items, extractors = load_spider_data(spider_model)
        if not self.settings.get('SPLASH_URL'):
            self.settings.set('SPLASH_URL', 'portia')
        self.factory[self].spider = IblSpider(spider_name, spider, items,
                                              extractors, self.settings)
        self.factory[self].spiderspec = SpiderSpec(project, spider_name,
                                                   spider, items, extractors)
Example #12
def load_spider(storage, model):
    items = json.load(storage.open_with_default('items.json', '{}'))
    extractors = json.load(storage.open_with_default('extractors.json', '{}'))
    spider = json.loads(model.dumps())
    samples = [json.loads(sample.dumps()) for sample in model.samples]
    spider['templates'] = samples
    return IblSpider(model.id, spider, items, extractors, Settings())
Example #13
 def update_spider(self,
                   meta,
                   spider=None,
                   template=None,
                   items=None,
                   extractors=None):
     if not hasattr(self.factory[self], 'spiderspec'):
         return self.open_spider(meta)
     spec = self.factory[self].spiderspec
     if spec is None or spec.name != meta.get('spider'):
         return self.open_spider(meta)
     items = items or spec.items
     extractors = extractors or spec.extractors
     if spider:
         spider['templates'] = spec.spider['templates']
     else:
         spider = spec.spider
     if template:
         for idx, tmpl in enumerate(spider['templates']):
             if template['original_body'] == tmpl['original_body']:
                 spider['templates'][idx] = template
                 break
         else:
             spider['templates'].append(template)
         spider['template_names'] = [t['name'] for t in spider['templates']]
     self.factory[self].spider = IblSpider(meta['spider'], spider, items,
                                           extractors, self.settings)
     self.factory[self].spiderspec = SpiderSpec(meta['spider'], spider,
                                                items, extractors)
Example #14
 def open_spider(self, meta):
     if ('project' not in meta or 'spider' not in meta or
             (self.user.authorized_projects is not None and
              meta['project'] not in self.user.authorized_projects and
              not self.user.staff)):
         return {'error': 4004,
                  'reason': 'Project "%s" not found' % meta.get('project')}
     spider_name = meta['spider']
     spec = self.spec_manager.project_spec(meta['project'], self.user.auth)
     spider = spec.resource('spiders', spider_name)
     items = spec.resource('items')
     extractors = spec.resource('extractors')
     templates = []
     for template in spider.get('template_names', []):
         try:
             templates.append(spec.resource('spiders', spider_name,
                                            template))
         except TypeError:
             # Template names not consistent with templates
             spec.remove_template(spider_name, template)
     spider['templates'] = templates
     if not self.settings.get('SPLASH_URL'):
         self.settings.set('SPLASH_URL', 'portia')
     self.factory[self].spider = IblSpider(spider_name, spider, items,
                                           extractors, self.settings)
     self.factory[self].spiderspec = SpiderSpec(spider_name, spider, items,
                                                extractors)
Example #15
def load_project_data(open_func, spiders_list_func, project_dir):
    """Load project data using provided open_func and project directory."""
    # Load items and extractors from project
    schemas = open_func(project_dir, 'items')
    extractors = open_func(project_dir, 'extractors')

    # Load spiders and templates
    spiders = {}
    spiders_list = spiders_list_func(project_dir)
    for spider_name in spiders_list:
        spider = open_func(project_dir, 'spiders', spider_name)
        if not spider:
            log.warning('Skipping "%s" spider as there is no data',
                        spider_name)
            continue
        if 'template_names' in spider:
            samples = spider.get('template_names', [])
            spider['templates'] = []
            for sample_name in samples:
                sample = open_func(project_dir, 'spiders', spider_name,
                                   sample_name)
                _build_sample(sample)
                spider['templates'].append(sample)
        else:
            for sample in spider.get('templates', []):
                _build_sample(sample)
        spiders[spider_name] = (IblSpider(spider_name, spider, schemas,
                                          extractors, Settings()), spider)
    return schemas, extractors, spiders
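A possible way to consume the returned triple; open_spec_file and list_spider_names are hypothetical stand-ins for the open_func and spiders_list_func callables the loader expects:

schemas, extractors, spiders = load_project_data(
    open_spec_file, list_spider_names, '/path/to/project')
for name, (crawler, spec) in spiders.items():
    # each entry pairs a ready IblSpider with the raw spec it was built from
    print(name, len(spec.get('templates', [])))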
Example #16
def open_spider_page_and_results(name):
    sample_spec = open_spec(name)
    schemas = sample_spec['schemas']
    results = sample_spec['results']
    page = UTF8HtmlResponse('http://url', body=sample_spec['original_body'])
    spider = IblSpider(name, make_spider(sample=sample_spec), schemas, {},
                       Settings())
    return spider, page, results
Example #17
def open_spider_page_and_results(name):
    sample_spec = open_spec(name)
    schemas = sample_spec['schemas']
    results = sample_spec['results']
    if 'original_body' not in sample_spec:
        sample_spec['original_body'] = open_spec('{}.html'.format(
            name[:-len('.json')]))
    page = UTF8HtmlResponse('http://url', body=sample_spec['original_body'])
    spider = IblSpider(name, make_spider(sample=sample_spec), schemas, {},
                       Settings())
    return spider, page, results
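UTF8HtmlResponse itself is not shown in these snippets; since other examples call HtmlResponse(..., encoding='utf-8') directly, it is presumably a thin UTF-8-bound wrapper along these lines (an assumption, not the project's actual definition):

from functools import partial
from scrapy.http import HtmlResponse

# Assumed helper: an HtmlResponse whose body is always decoded as UTF-8.
UTF8HtmlResponse = partial(HtmlResponse, encoding='utf-8')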
Example #18
 def test_trained(self):
     base = 'http://www.daft.ie/ireland/houses-for-sale/?offset={}'.format
     daft_url = base(10)
     spec = {
         'start_urls': [daft_url],
         'links_to_follow': 'auto',
         'respect_nofollow': False,
         'follow_patterns': [],
         'exclude_patterns': [],
         'init_requests': [],
         'templates': [daft_sample]
     }
     settings = Settings()
     settings.set('LOADED_PLUGINS', load_plugins(settings))
     spider = IblSpider('hn', spec, {}, {}, settings=settings)
     request = Request(daft_url)
     response = UTF8HtmlResponse(url=daft_url, body=daft_body,
                                 request=request)
     data = {r.url for r in spider.handle_html(response)
             if isinstance(r, Request)}
     self.assertEqual({base(i) for i in (90, 80, 70)}, data)
Example #19
def load_spider(spec, spider_name):
    try:
        spider = spec.spider_with_templates(spider_name)
    except (TypeError, ValueError):
        raise BadRequest('The spider %s could not be found' % spider_name)
    try:
        items = spec.resource('items')
    except (TypeError, ValueError):
        items = {}
    try:
        extractors = spec.resource('extractors')
    except (TypeError, ValueError):
        extractors = {}
    return IblSpider(spider_name, spider, items, extractors, Settings())
Example #20
 def create_spider(self, project, params, **kwargs):
     spider = params.get('spider')
     if spider is None:
         return
     pspec = self.bot.spec_manager.project_spec(project)
     try:
         spider_spec = pspec.resource('spiders', spider)
         items_spec = pspec.resource('items')
         extractors = pspec.resource('extractors')
          return IblSpider(spider, spider_spec, items_spec, extractors,
                           **kwargs)
     except IOError as ex:
         if ex.errno == errno.ENOENT:
             log.msg("skipping extraction, no spec: %s" % ex.filename)
         else:
             raise
Example #21
def load_project_data(storage):
    """Load project data using provided open_func and project directory."""
    # Load items and extractors from project

    schemas = storage.open('items.json')
    extractors = storage.open('extractors.json')

    # Load spiders and templates
    spider_loader = SpiderLoader(storage)
    spiders = {}
    for spider_name in spider_loader.spider_names:
        spider = spider_loader[spider_name]
        crawler = IblSpider(spider_name, spider, schemas, extractors,
                            Settings())
        spiders[spider_name] = (crawler, spider)
    return schemas, extractors, spiders
Example #22
 def create_spider(self, project, auth_info, params, **kwargs):
     spider = params.get('spider')
     if spider is None:
         return None, None
     pspec = self.bot.spec_manager.project_spec(project, auth_info)
     try:
         spider_spec = pspec.spider_with_templates(spider)
         items_spec = pspec.resource('items')
         extractors = pspec.resource('extractors')
         return (IblSpider(spider, spider_spec, items_spec, extractors,
                           self.bot.runner.settings,
                           **kwargs), spider_spec['templates'])
     except IOError as ex:
         if ex.errno == errno.ENOENT:
             log.msg("skipping extraction, no spec: %s" % ex.filename)
             return None, None
         else:
             raise
Example #23
    def open_spider(self, meta):
        if ('project' not in meta or 'spider' not in meta or
                (self.user.authorized_projects is not None and
                 meta['project'] not in self.user.authorized_projects and
                 not self.user.staff)):
            return {'error': 4004,
                    'reason': 'Project "%s" not found' % meta.get('project')}
        spider_name = meta['spider']
        spec = self.spec_manager.project_spec(meta['project'], self.user.auth)

        spider = spec.spider_with_templates(spider_name)
        items = spec.resource('items')
        extractors = spec.resource('extractors')
        if not self.settings.get('SPLASH_URL'):
            self.settings.set('SPLASH_URL', 'portia')
        self.factory[self].spider = IblSpider(spider_name, spider, items,
                                              extractors, self.settings)
        self.factory[self].spiderspec = SpiderSpec(spider_name, spider, items,
                                                   extractors)
Example #24
    def open_spider(self, meta, project=None):
        if not (meta.get('project') and meta.get('spider')):
            return {'error': 4005, 'reason': 'No project specified'}

        if (self.user.authorized_projects is not None
                and meta['project'] not in self.user.authorized_projects
                and not self.user.staff):
            return {
                'error': 4004,
                'reason': 'Project "%s" not found' % meta['project']
            }
        spider_name = meta['spider']

        # project_meta = meta.get('project')
        # project_id = (project_meta if isinstance(project_meta, six.string_types)
        #               else project_meta.id)
        # project = Project(self.storage, id=project_id)

        if project is None:
            project = Project(self.storage, id=meta.get('project'))

        try:
            spider_model = project.spiders[spider_name]
        except IOError:
            return {
                'error': 4003,
                'reason': 'Spider "%s" not found' % spider_name
            }
        spider = spider_model.dump()
        spider['templates'] = []
        for sample in spider_model.samples:
            sample = sample.dump()
            for key in ('original_body', 'rendered_body'):
                if not (sample.get(key) or '').strip():
                    sample[key] = u'<html></html>'
            spider['templates'].append(sample)
        items, extractors = project.schemas.dump(), project.extractors.dump()
        if not self.settings.get('SPLASH_URL'):
            self.settings.set('SPLASH_URL', 'portia')
        self.factory[self].spider = IblSpider(spider_name, spider, items,
                                              extractors, self.settings)
        self.factory[self].spiderspec = SpiderSpec(project, spider_name,
                                                   spider, items, extractors)
Example #25
def load_spider(spec, spider_name):
    spider = spec.spider_with_templates(spider_name)
    items = spec.resource('items')
    extractors = spec.resource('extractors')
    return IblSpider(spider_name, spider, items, extractors, Settings())
Example #26
def load_spider(model):
    name, spider, items, extractors = load_spider_data(model)
    return IblSpider(name, spider, items, extractors, Settings())