Python best_match示例，scrapely.template.best_match Python示例

示例#1

0

显示文件

文件： test_template.py 项目： scrapy/scrapely

 def test_selected_data(self):
     """Check that selecting 'text to annotate' yields both matching <p> fragments."""
     maker = TemplateMaker(self.PAGE)
     selected = []
     for index in maker.select(best_match('text to annotate')):
         selected.append(maker.selected_data(index))
     expected = [
         u'<p>Some text to annotate here</p>',
         u'<p>Another text to annotate there</p>',
     ]
     self.assertEqual(selected, expected)

示例#2

0

显示文件

文件： test_template.py 项目： scrapy/scrapely

 def test_annotate_multiple(self):
     """Check that annotating 'field1' with best_match=False extracts both matching texts."""
     maker = TemplateMaker(self.PAGE)
     matcher = best_match('text to annotate')
     maker.annotate('field1', matcher, best_match=False)
     template = maker.get_template()
     extractor = InstanceBasedLearningExtractor([(template, None)])
     extracted = extractor.extract(self.PAGE)[0]
     expected = [{
         u'field1': [
             u'Some text to annotate here',
             u'Another text to annotate there',
         ],
     }]
     self.assertEqual(extracted, expected)

示例#3

0

显示文件

文件： test_template.py 项目： xyb/scrapely

 def test_annotations(self):
     """Check that two 'field1' content annotations are reported after annotating."""
     maker = TemplateMaker(self.PAGE)
     maker.annotate('field1', best_match('text to annotate'), best_match=False)
     # Only the first element of each entry returned by annotations() is compared.
     found = [entry[0] for entry in maker.annotations()]
     expected = [{u'annotations': {u'content': u'field1'}}] * 2
     self.assertEqual(found, expected)

示例#4

0

显示文件

文件： test_template.py 项目： scrapy/scrapely

 def test_annotations(self):
     """Annotate 'field1' on every match and verify the reported annotations."""
     template_maker = TemplateMaker(self.PAGE)
     matcher = best_match('text to annotate')
     template_maker.annotate('field1', matcher, best_match=False)
     # Keep only the first element of each entry yielded by annotations().
     heads = []
     for item in template_maker.annotations():
         heads.append(item[0])
     expected = [
         {u'annotations': {u'content': u'field1'}}
         for _ in range(2)
     ]
     self.assertEqual(heads, expected)

示例#5

0

显示文件

文件： test_template.py 项目： xyb/scrapely

 def test_selected_data(self):
     """Verify selected_data() for every index selected by 'text to annotate'."""
     template_maker = TemplateMaker(self.PAGE)
     matcher = best_match('text to annotate')
     fragments = [
         template_maker.selected_data(position)
         for position in template_maker.select(matcher)
     ]
     first = u'<p>Some text to annotate here</p>'
     second = u'<p>Another text to annotate there</p>'
     self.assertEqual(fragments, [first, second])

示例#6

0

显示文件

文件： test_template.py 项目： xyb/scrapely

 def test_annotate_ignore_unpaired(self):
     """Check extraction of the fragment matching "and that's", which contains an <img /> tag."""
     maker = TemplateMaker(self.PAGE)
     matcher = best_match("and that's")
     maker.annotate('field1', matcher, best_match=False)
     extractor = InstanceBasedLearningExtractor([(maker.get_template(), None)])
     extracted = extractor.extract(self.PAGE)[0]
     expected = [
         {u'field1': [u"More text with unpaired tag <img />and that's it"]},
     ]
     self.assertEqual(extracted, expected)

示例#7

0

显示文件

文件： test_template.py 项目： netconstructor/scrapely

 def test_annotations(self):
     """Verify that annotating 'field1' on all matches records two content annotations."""
     maker = TemplateMaker(self.PAGE)
     maker.annotate("field1", best_match("text to annotate"), best_match=False)
     # The comparison uses the first element of each annotations() entry.
     recorded = [pair[0] for pair in maker.annotations()]
     field_annotation = {u"annotations": {u"content": u"field1"}}
     self.assertEqual(recorded, [field_annotation, dict(field_annotation)])

示例#8

0

显示文件

文件： test_template.py 项目： scrapy/scrapely

 def test_annotate_ignore_unpaired(self):
     """Annotate the "and that's" fragment and verify the text extracted for 'field1'."""
     template_maker = TemplateMaker(self.PAGE)
     template_maker.annotate('field1', best_match("and that's"), best_match=False)
     template = template_maker.get_template()
     extractor = InstanceBasedLearningExtractor([(template, None)])
     result = extractor.extract(self.PAGE)
     wanted_text = u"More text with unpaired tag <img />and that's it"
     self.assertEqual(result[0], [{u'field1': [wanted_text]}])

示例#9

0

显示文件

文件： test_template.py 项目： xyb/scrapely

 def test_annotate_multiple(self):
     """Verify that both annotated texts are extracted into 'field1'."""
     template_maker = TemplateMaker(self.PAGE)
     template_maker.annotate('field1', best_match('text to annotate'), best_match=False)
     templates = [(template_maker.get_template(), None)]
     extractor = InstanceBasedLearningExtractor(templates)
     result = extractor.extract(self.PAGE)
     wanted_texts = [
         u'Some text to annotate here',
         u'Another text to annotate there',
     ]
     self.assertEqual(result[0], [{u'field1': wanted_texts}])

示例#10

0

显示文件

文件： test_template.py 项目： netconstructor/scrapely

 def test_annotate_multiple(self):
     """Check that a template annotated on all matches extracts both texts for 'field1'."""
     maker = TemplateMaker(self.PAGE)
     matcher = best_match("text to annotate")
     maker.annotate("field1", matcher, best_match=False)
     # This variant passes the bare template (not a tuple) to the extractor.
     extractor = InstanceBasedLearningExtractor([maker.get_template()])
     extracted = extractor.extract(self.PAGE)[0]
     expected = [
         {u"field1": [u"Some text to annotate here", u"Another text to annotate there"]},
     ]
     self.assertEqual(extracted, expected)

示例#11

0

显示文件

文件： tool.py 项目： wangsouc/scrapely

def apply_criteria(criteria, tm):
    """Apply the given criteria object to the given template.

    ``criteria.text`` (when truthy) is turned into a matcher with
    ``best_match``; otherwise a matcher that always returns False is used.
    The matcher is passed to ``tm.select``.

    If ``criteria.number`` is None the whole selection is returned.
    Otherwise a one-element list holding the selection at that index is
    returned, or an empty list when the index is out of range. Negative
    indexes count from the end; an out-of-range negative index now also
    yields an empty list instead of raising IndexError.
    """
    matcher = best_match(criteria.text) if criteria.text else lambda x, y: False
    sel = tm.select(matcher)
    if criteria.number is None:
        return sel
    try:
        return [sel[criteria.number]]
    except IndexError:
        # Out of range in either direction means "nothing selected".
        return []

示例#12

0

显示文件

文件： tool.py 项目： esimionato/scrapely

def apply_criteria(criteria, tm):
    """Apply the given criteria object to the given template.

    A matcher is built with ``best_match(criteria.text)`` when
    ``criteria.text`` is truthy; otherwise a matcher that always returns
    False is used. The result of ``tm.select(matcher)`` is then narrowed by
    ``criteria.number``:

    * None: the full selection is returned.
    * an index inside the selection (negative indexes count from the end):
      a one-element list with that entry.
    * any other index: an empty list. This includes negative indexes below
      ``-len(selection)``, which previously raised IndexError.
    """
    func = best_match(criteria.text) if criteria.text else lambda x, y: False
    sel = tm.select(func)
    number = criteria.number
    if number is None:
        return sel
    # Accept exactly the indexes that sel[number] can resolve.
    if -len(sel) <= number < len(sel):
        return [sel[number]]
    return []

示例#13

0

显示文件

文件： __init__.py 项目： CodeOps/scrapely

 def train_from_htmlpage(self, htmlpage, data):
     """Build a template from ``htmlpage`` annotated with ``data`` and add it.

     ``data`` maps field names to a single value or an iterable of values.
     Byte/text strings and objects without ``__iter__`` count as a single
     value. Each value is converted with ``str_to_unicode`` using
     ``htmlpage.encoding`` and annotated via ``best_match``.

     Raises AssertionError when ``data`` is empty.
     """
     assert data, "Cannot train with empty data"
     maker = TemplateMaker(htmlpage)
     for field, values in data.items():
         is_single = (isinstance(values, (bytes, str))
                      or not hasattr(values, '__iter__'))
         candidates = [values] if is_single else values
         for candidate in candidates:
             text = str_to_unicode(candidate, htmlpage.encoding)
             maker.annotate(field, best_match(text))
     self.add_template(maker.get_template())

示例#14

0

显示文件

文件： __init__.py 项目： bopopescu/vinalo

 def train_from_htmlpage(self, htmlpage, data):
     """Annotate ``htmlpage`` with every field value in ``data`` and add the template.

     ``data`` maps field names to one value or an iterable of values; an
     object without ``__iter__`` is treated as a single value. ``str``
     values are decoded with ``htmlpage.encoding`` (falling back to
     'utf-8') before being matched with ``best_match``.

     NOTE(review): calling ``.decode`` on ``str`` implies Python 2 byte
     strings -- confirm the target interpreter.

     Raises AssertionError when ``data`` is empty.
     """
     assert data, "Cannot train with empty data"
     maker = TemplateMaker(htmlpage)
     for field, values in data.items():
         candidates = values if hasattr(values, '__iter__') else [values]
         for candidate in candidates:
             if isinstance(candidate, str):
                 text = candidate.decode(htmlpage.encoding or 'utf-8')
             else:
                 text = candidate
             maker.annotate(field, best_match(text))
     self.add_template(maker.get_template())

示例#15

0

显示文件

 def train_from_htmlpage(self, htmlpage, data):
     """Create a template for ``htmlpage`` from ``data`` and register it.

     For each ``field -> values`` entry, ``values`` is wrapped in a list
     when it is a bytes/str object or has no ``__iter__``. Every value is
     passed through ``str_to_unicode`` with ``htmlpage.encoding`` and then
     annotated on the page using ``best_match``. The finished template is
     handed to ``self.add_template``.

     Raises AssertionError when ``data`` is empty.
     """
     assert data, "Cannot train with empty data"
     template_maker = TemplateMaker(htmlpage)
     for field, values in data.items():
         iterable = hasattr(values, '__iter__')
         if isinstance(values, (bytes, str)) or not iterable:
             values = [values]
         for raw in values:
             template_maker.annotate(
                 field, best_match(str_to_unicode(raw, htmlpage.encoding)))
     template = template_maker.get_template()
     self.add_template(template)

示例#16

0

显示文件

文件： scraper.py 项目： bry0n969/scrapely-hack

 def train(self, url=None, data=None, html=None, encoding='utf-8'):
     """Train on a page obtained via ``self._get_page`` and store the template.

     ``data`` maps field names to one value or an iterable of values; an
     object without ``__iter__`` is treated as a single value. ``str``
     values are decoded with ``encoding`` before being annotated through
     ``best_match``. The resulting template is appended to
     ``self.templates``.

     NOTE(review): calling ``.decode`` on ``str`` implies Python 2 byte
     strings -- confirm the target interpreter.

     Raises AssertionError when ``data`` is empty.
     """
     assert data, "Cannot train with empty data"
     page = self._get_page(url, encoding, html)
     maker = TemplateMaker(page)
     for field, values in data.items():
         candidates = values if hasattr(values, '__iter__') else [values]
         for candidate in candidates:
             text = candidate.decode(encoding) if isinstance(candidate, str) else candidate
             maker.annotate(field, best_match(text))
     self.templates.append(maker.get_template())

示例#17

0

显示文件

文件： scrapely-hack.py 项目： carriercomm/scraperwiki-scraper-vault

 def train(self, url=None, data=None, html=None, encoding='utf-8'):
     """Annotate a page with ``data`` and append the template to ``self.templates``.

     The page comes from ``self._get_page(url, encoding, html)``. Each
     entry of ``data`` maps a field name to a value or an iterable of
     values (anything lacking ``__iter__`` is wrapped in a list). ``str``
     values are decoded using ``encoding`` before matching with
     ``best_match``.

     NOTE(review): ``str.decode`` only exists on Python 2 -- confirm the
     target interpreter.

     Raises AssertionError when ``data`` is empty.
     """
     assert data, "Cannot train with empty data"
     template_maker = TemplateMaker(self._get_page(url, encoding, html))
     for field, values in data.items():
         if not hasattr(values, '__iter__'):
             values = [values]
         for raw in values:
             needs_decoding = isinstance(raw, str)
             decoded = raw.decode(encoding) if needs_decoding else raw
             template_maker.annotate(field, best_match(decoded))
     template = template_maker.get_template()
     self.templates.append(template)

示例#18

0

显示文件

文件： scrape.py 项目： I-TREND/SASF

def annotate(url, site_id, items):
    """Interactively annotate the page at ``url`` and save the templates.

    For every ``(name, text)`` pair in ``items`` the fragments selected by
    ``best_match(text)`` on the most recent TemplateMaker are printed. A
    single candidate is picked automatically; otherwise the user is
    prompted, and a non-integer answer falls back to the first candidate.
    The chosen fragment is annotated on the first TemplateMaker whose
    ``annotate_fragment`` returns a truthy value; if none does, a new
    TemplateMaker is created for it.

    The templates are passed to ``save_templates`` under 'scraper.json'
    with ``site_id`` and also returned as a list.

    NOTE(review): relies on ``raw_input``, i.e. Python 2.
    """
    page = url_to_page(url)
    makers = [TemplateMaker(page)]

    for name, text in items:
        latest = makers[-1]
        candidates = latest.select(best_match(text))
        print('ATTRIBUTE: %s' % name)
        for index in candidates:
            print(u'[%d] %s' % (index, latest.selected_data(index)))

        if len(candidates) == 1:
            row = candidates[0]
        else:
            answer = raw_input('? ')
            try:
                row = int(answer)
            except ValueError:
                # Unparseable answer: default to the first candidate.
                row = candidates[0]
        print('SELECTED: %d' % row)
        print('')

        for maker in makers:
            try:
                if maker.annotate_fragment(row, name):
                    break
            except FragmentAlreadyAnnotated:
                continue
        else:
            # No existing maker accepted the fragment; start a new one.
            fresh = TemplateMaker(page)
            makers.append(fresh)
            fresh.annotate_fragment(row, name)

    save_templates('scraper.json', site_id, (maker.get_template() for maker in makers))
    return [maker.get_template() for maker in makers]

示例#19

0

显示文件

文件： scrape.py 项目： I-TREND/SASF

def annotate(url, site_id, items):
    """Walk through ``items`` and annotate the page at ``url`` interactively.

    ``items`` yields ``(attribute, wanted_text)`` pairs. For each pair the
    fragments chosen by ``best_match(wanted_text)`` on the newest
    TemplateMaker are listed on stdout. When exactly one fragment is
    listed it is used directly; otherwise the user is asked, and input
    that is not an integer selects the first listed fragment. The fragment
    is then annotated on the first TemplateMaker that accepts it (truthy
    ``annotate_fragment`` result, no FragmentAlreadyAnnotated), or on a
    newly appended TemplateMaker.

    Finally the templates are handed to ``save_templates`` ('scraper.json',
    ``site_id``) and returned as a list.

    NOTE(review): relies on ``raw_input``, i.e. Python 2.
    """
    def pick_row(choices):
        # Exactly one choice needs no prompt; otherwise ask the user.
        if len(choices) == 1:
            return choices[0]
        reply = raw_input('? ')
        try:
            return int(reply)
        except ValueError:
            return choices[0]

    page = url_to_page(url)
    makers = [TemplateMaker(page)]

    for attribute, wanted in items:
        current = makers[-1]
        choices = current.select(best_match(wanted))
        print('ATTRIBUTE: %s' % attribute)
        for idx in choices:
            print(u'[%d] %s' % (idx, current.selected_data(idx)))
        row = pick_row(choices)
        print('SELECTED: %d' % row)
        print('')

        placed = False
        for maker in makers:
            try:
                placed = bool(maker.annotate_fragment(row, attribute))
            except FragmentAlreadyAnnotated:
                placed = False
            if placed:
                break
        if not placed:
            makers.append(TemplateMaker(page))
            makers[-1].annotate_fragment(row, attribute)

    pending_templates = (maker.get_template() for maker in makers)
    save_templates('scraper.json', site_id, pending_templates)
    return [maker.get_template() for maker in makers]

示例#20

0

显示文件

文件： test_template.py 项目： netconstructor/scrapely

 def _matches(self, text):
     """Return fragment data for page fragments with a truthy ``best_match(text)`` score.

     Results are ordered by descending ``(score, fragment)`` tuple.
     """
     scorer = best_match(text)
     scored = []
     for fragment in self.PAGE.parsed_body:
         score = scorer(fragment, self.PAGE)
         if score:
             scored.append((score, fragment))
     scored.sort(reverse=True)
     return [self.PAGE.fragment_data(fragment) for _, fragment in scored]

示例#21

0

显示文件

文件： test_template.py 项目： xyb/scrapely

 def test_annotate_fragment_not_found(self):
     """Check that annotating with text absent from the page raises FragmentNotFound."""
     maker = TemplateMaker(self.PAGE)
     # Build the matcher outside the assertion so only annotate() is checked.
     matcher = best_match("missing text")
     with self.assertRaises(FragmentNotFound):
         maker.annotate('field1', matcher)

示例#22

0

显示文件

文件： test_template.py 项目： netconstructor/scrapely

 def test_selected_data(self):
     """Check the data returned for each fragment selected by "text to annotate"."""
     maker = TemplateMaker(self.PAGE)
     selection = maker.select(best_match("text to annotate"))
     fragments = []
     for position in selection:
         fragments.append(maker.selected_data(position))
     expected = [
         u"<p>Some text to annotate here</p>",
         u"<p>Another text to annotate there</p>",
     ]
     self.assertEqual(fragments, expected)

示例#23

0

显示文件

文件： test_template.py 项目： netconstructor/scrapely

 def test_annotate_fragment_already_annotated(self):
     """Check that annotating the same match twice raises FragmentAlreadyAnnotated."""
     maker = TemplateMaker(self.PAGE)
     maker.annotate("field1", best_match("text to annotate"))
     # A fresh matcher is built outside the assertion, so only annotate() is checked.
     second_matcher = best_match("text to annotate")
     with self.assertRaises(FragmentAlreadyAnnotated):
         maker.annotate("field1", second_matcher)

示例#24

0

显示文件

文件： test_template.py 项目： netconstructor/scrapely

 def test_annotate_fragment_not_found(self):
     """Verify that annotate() raises FragmentNotFound for "missing text"."""
     template_maker = TemplateMaker(self.PAGE)
     matcher = best_match("missing text")
     annotate = template_maker.annotate
     self.assertRaises(FragmentNotFound, annotate, "field1", matcher)

示例#25

0

显示文件

文件： test_template.py 项目： xyb/scrapely

 def test_annotate_fragment_already_annotated(self):
     """Verify that a second annotate() on the same match raises FragmentAlreadyAnnotated."""
     template_maker = TemplateMaker(self.PAGE)
     annotate = template_maker.annotate
     annotate('field1', best_match('text to annotate'))
     repeat_matcher = best_match("text to annotate")
     self.assertRaises(FragmentAlreadyAnnotated, annotate, 'field1', repeat_matcher)

示例#26

0

显示文件

文件： test_template.py 项目： xyb/scrapely

 def _matches(self, text):
     """Score every parsed fragment of the page against ``text`` and return the hits.

     Fragments with a falsy score are dropped. The remaining
     ``(score, fragment)`` pairs are sorted in descending order and mapped
     to their fragment data.
     """
     score_of = best_match(text)
     pairs = [(score_of(fragment, self.PAGE), fragment)
              for fragment in self.PAGE.parsed_body]
     hits = sorted((pair for pair in pairs if pair[0]), reverse=True)
     return [self.PAGE.fragment_data(pair[1]) for pair in hits]