Example #1
    def parse_detail(self, response):
        item = None
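        # skip the first three sibling elements; an element whose first
        # child is a <span> starts a new person entry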
        for each in response.xpath(
            './/div[@class="tiInherit"]/parent::div/*'
        )[3:]:
            content = each.xpath('child::node()')
            if content and content[0].xpath('local-name()').extract() == ['span']:
                if item:
                    yield self.finalize(item)

                item = WebSourcesCorpusItem(
                    url=response.url,
                    name=' '.join(self.text_from_node(c) for c in content[:3]),
                    bio=text.clean_extract(each, './/text()', sep=' '),
                )

                if each.xpath('./i'):
                    item['other'] = {
                        'profession': text.clean_extract(each, './i//text()')
                    }

                assert item['name'] and len(item['name']) > 3
            elif item:
                item['bio'] += '\n' + text.clean_extract(each, './/text()', sep=' ')

        if item:
            yield self.finalize(item)
Example #2
    def parse_detail(self, response):
        for each in response.xpath(
                './/div[@id="headerContainer"]/following-sibling::div//p'):
            yield WebSourcesCorpusItem(
                url=response.url,
                name=text.clean_extract(each, './span//text()'),
                bio=text.clean_extract(each, './/text()', sep=' '),
            )
Example #3
    def refine_item(self, response, item):
        item['other'] = {}
        for section in response.xpath(
                './/div[@id="content"]//div[@class="featured"][child::h2]'):
            title = text.clean_extract(section, 'h2//text()')
            content = [text.clean_extract(p, './/text()')
                       for p in section.xpath('p')]
            item['other'][title] = content

        return super(SculptureUkSpider, self).refine_item(response, item)
Example #4
    def parse_bio(self, response):
        table = response.xpath('.//table[@align="center"]')
        fields = [text.clean_extract(field, './/text()', sep=' ')
                  for field in table.xpath('./tr[1]/th')]
        bio = []
        for table_row in table.xpath('./tr[position()>1]'):
            values = [text.clean_extract(val, './/text()', sep=' ')
                      for val in table_row.xpath('./td')]
            bio.append(dict(zip(fields, values)))
        return bio
Example #5
    def parse_detail(self, response):
        for each in response.xpath(
                './/div[@id="headerContainer"]/following-sibling::p'):
            item = WebSourcesCorpusItem(
                url=response.url,
                bio=text.clean_extract(each, './/text()', sep=' '),
            )

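            # when the paragraph contains links, the first one holds the name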
            if each.xpath('./a'):
                item['name'] = text.clean_extract(each, './a[1]//text()')

            if 'name' in item or item['bio']:
                yield item
Example #6
    def parse_detail(self, response):
        artist_id = response.url.split('/')[-1]

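        # the member_info box is a definition list: <dt> labels paired
        # with <dd> values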
        keys = response.xpath('.//div[@id="member_info"]//dt')
        values = response.xpath('.//div[@id="member_info"]//dd')
        info = dict((text.clean_extract(k, './/text()'),
                     text.clean_extract(v, './/text()'))
                    for k, v in zip(keys, values))

        item = WebSourcesCorpusItem(
            url=response.url,
            name=info.pop('Real/full name:'),
            other=info,
        )

        yield Request(
            'http://www.metal-archives.com/artist/read-more/id/' + artist_id,
            self.parse_extern,
            meta={'item': item, 'field': 'bio', 'aid': artist_id})
Example #7
    def parse_person(self, response):
        item = WebSourcesCorpusItem()
        item['url'] = response.url
        name = clean_extract(response,
                             "//h1[contains(@class, 'header')]//text()")
        if name:
            item['name'] = name
        else:
            logging.debug("No name found for item with URL '%s'" % item['url'])
        bio_nodes = response.xpath("//li[contains(., 'BIOGRAPHY')]").extract()
        if bio_nodes:
            item['bio'] = fromstring(
                '\n'.join(bio_nodes)).text_content().strip()
        else:
            logging.debug("No raw text biography found for %s" %
                          item.get('name', item['url']))
        item['other'] = {}
        keys = response.css('li#info td.fieldnameback')
        if keys:
            for key_node in keys:
                key_text = key_node.xpath('.//text()').extract_first()
                # Take the first sibling of the key node as the value
                value = key_node.xpath('./following-sibling::td[1]')
                if value:
                    people_links = value.xpath(
                        ".//a[contains(@href, 'getperson')]")
                    if people_links:
                        logging.debug("Values with links found for key '%s'" %
                                      key_text)
                        item['other'][key_text] = []
                        for person in people_links:
                            name = person.xpath('.//text()').extract_first()
                            link = person.xpath('@href').extract_first()
                            item['other'][key_text].append(
                                {name: response.urljoin(link)})
                    else:
                        literal_value = clean_extract(value, './/text()')
                        item['other'][key_text] = literal_value
                else:
                    logging.debug("No value found for key '%s'" % key_text)
        else:
            logging.debug("No semi-structured data found for '%s'" %
                          item.get('name', item['url']))
        yield item
Example #8
    def extract_dl_key_value(self, dl_pairs, item):
        """ Feed the item with key-value pairs extracted from <dl> tags """
        for pair in dl_pairs:
            key = pair.xpath('./dt/text()').extract_first()
            value = clean_extract(pair, './dd//text()')
            if key and value:
                item['other'][key.replace(' ', '_').lower()] = value
            else:
                logging.debug(
                    "Couldn't extract key or value from pair node '%s'" % pair)
        return item
Example #9
    def parse(self, response):
        current_item = None

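        # a paragraph whose first child is a link starts a new entry for the
        # linked page; a "Name (birth - death)" text pattern starts an entry
        # for the current page instead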
        for p in response.xpath('.//div[@id="mw-content-text"]/p'):
            content = p.xpath('child::node()')
            if content and content[0].xpath('local-name()').extract() == ['a']:
                if current_item is not None:
                    if 'other' in current_item:
                        current_item['other'] = json.dumps(
                            current_item['other'])
                    yield current_item

                current_item = WebSourcesCorpusItem(
                    url=text.clean_extract(content[0], '@href'),
                    name=text.clean_extract(content[0], 'text()'),
                    bio=' '.join(
                        text.clean_extract(c, './/text()')
                        for c in content[1:]))
            else:
                txt = p.xpath('text()').extract_first() or ''
                m = re.match(
                    ur'([^(]{,50})\((about )?(B\.C\. )?(\d+| ) ?- ?(\d+| )\)',
                    txt)
                if m:
                    if current_item is not None:
                        if 'other' in current_item:
                            current_item['other'] = json.dumps(
                                current_item['other'])
                        yield current_item
                    current_item = WebSourcesCorpusItem(
                        url=response.url,
                        name=m.group(1).strip(),
                        birth=(m.group(3) or '') + m.group(4),
                        death=(m.group(3) or '') + m.group(5),
                        bio=text.clean_extract(p, './/text()'),
                    )
                elif current_item is not None:
                    current_item['bio'] += text.clean_extract(p, './/text()')

        if current_item is not None:
            if 'other' in current_item:
                current_item['other'] = json.dumps(current_item['other'])
            yield current_item
Example #10
    def parse_other(self, response):
        table = response.xpath('.//table')[1]
        ul_with_caption = table.xpath(
            './/text()[string-length(.) > 1]/following-sibling::ul')

        res = {}
        for ul in ul_with_caption:
            caption = text.clean_extract(ul, 'preceding-sibling::text()')
            entries = ul.xpath('li/child::*').extract()
            res[caption] = entries
        res.pop('', None)
        return res
Example #11
    def refine_item(self, response, item):
        item['other'] = {}
        for ul in response.xpath(
            './/div[@id="stammdaten"]/div[contains(@class, "text")]//ul'
        ):
            field = ul.xpath('preceding-sibling::h4/text()').extract()[-1]
            value = [
                text.clean_extract(li, './/text()', sep=' ') for li in ul.xpath('li')
            ]
            item['other'][field] = value

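        # each section div holds a title in its first child and the list of
        # values in its second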
        for section in response.xpath('.//div[@class="section"]'):
            title = text.clean_extract(section, 'div[1]//text()')
            values = [text.clean_extract(li, './/text()')
                      for li in section.xpath('div[2]/ul/li')]
            if values:
                item['other'][title] = values

        item['name'] = text.clean(item['name'].replace('\t', ' '))

        return super(AcademiaNetSpider, self).refine_item(response, item)
Example #12
    def refine_item(self, response, item):
        data = {}
        for section in response.xpath('.//div[@class="biography-item-container"]'):
            title = text.clean_extract(section, 'div[1]//h3//text()')

            keys = [
                text.clean_extract(td, './/text()')
                for td in section.xpath('./div[2]//table//td[contains(@class, "post")]')
            ]

            values = [
                text.clean_extract(td, './/text()')
                for td in section.xpath('./div[2]//table//td[contains(@class, "date")]')
            ]

            content = zip(keys, values)
            if content:
                data[title] = content

        item['other'] = data

        return super(ParliamentUkSpider, self).refine_item(response, item)
Example #13
    def refine_item(self, response, item):
        content = response.xpath('.//div[@id="mw-content-text"]/div[2]')
        children = content.xpath('./p/child::node()')
        if len(children) < 3 or children[2].xpath(
                'local-name()').extract() != ['span']:
            return None
        else:
            name = children[2].xpath('.//text()').extract()
            if name:
                item['bio'] = text.clean_extract(content, './/text()')
                item['name'] = text.clean(children[1].extract() + ' ' + name[0])
            else:
                return None
        return super(MusiciansSpider, self).refine_item(response, item)
Example #14
    def refine_item(self, response, item):
        try:
            title = text.clean_extract(response.selector,
                                       './/h1[@id="firstHeading"]//text()')
            name, birth, death = self.parse_title(title)
        except (IndexError, ValueError):
            # not a person (could be a place or whatever else)
            logging.debug('Not a person at ' + response.url)
            return None
        else:
            item['name'] = name
            item['birth'] = birth
            item['death'] = death
            item['other'] = json.dumps({'title': title})
            return item
Example #15
    def refine_item(self, response, item):
        birth_death = text.clean_extract(
            response, './/div[@id="maincontent"]/p[1]/em').split('<br>')[0]

        birth_death = re.subn(r'<[^>]+>', '', birth_death)[0].split('d.')
        if len(birth_death) == 2:
            birth, death = birth_death
            birth = birth[len('b.'):].strip()
            death = death.strip()

            item['birth'] = birth if birth != '?' else None
            item['death'] = death if death != '?' else None

        return super(MunksrollSpider, self).refine_item(response, item)
Example #16
    def parse_extern(self, response):
        meta = response.meta
        txt = text.clean_extract(response.selector, './/text()')
        if meta['field'] == 'bio':
            meta['item']['bio'] = txt
            meta['field'] = 'trivia'
            yield Request(
                'http://www.metal-archives.com/artist/read-more/id/%s/field/trivia'
                % meta['aid'],
                self.parse_extern, meta=meta
            )
        else:  # trivia
            meta['item']['other']['trivia'] = txt
            meta['item']['other'] = json.dumps(meta['item']['other'])
            yield meta['item']
Example #17
    def refine_item(self, response, item):
        try:
            dates = text.clean_extract(
                response, './/div[@id="text"]/span[@class="article_header"]//text()'
            ).split('(')[1].split(')')[0]
        except IndexError:
            pass
        else:
            birth, death = text.parse_birth_death(dates.replace('\n', ''))
            if birth or death:
                item['birth'] = birth or None
                item['death'] = death or None

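        # rebuild the display name from the structured forename and surname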
        item['name'] = '%s, %s' % (item['other'].pop('forename'),
                                   item['other'].pop('surname'))

        return super(YbaLlgcOrgUkSpider, self).refine_item(response, item)
Example #18
    def refine_item(self, response, item):
        base = ".//table//td[1]//table//tr[3]//table//td"

        data = {}
        item["bio"] = ""
        for paragraph in response.xpath(base + "//p"):
            fields = paragraph.xpath("./b/text()").extract()
            if not fields:
                inner_text = text.clean_extract(paragraph, ".//text()",
                                                sep=" ", unicode=False)
                item["bio"] += inner_text + "\n"
            else:
                contents = paragraph.xpath(".//text()").extract()
                for field, values in text.split_at(contents, fields):
                    if field is not None:
                        key = field.lower().strip().replace(":", "")
                        data[key] = " ".join(values).strip()

        item["birth"] = data.get("born")
        item["death"] = data.get("died")
        item["other"] = data
        if not item["bio"]:
            del item["bio"]

        return super(NndbComSpider, self).refine_item(response, item)
Example #19
    def refine_item(self, response, item):
        item['other'] = {}
        name = item['name']
        # Alias
        alias = clean_extract(
            response,
            './/div[@class="expandable-header"][contains(., "Name variations")]'
            '/following-sibling::div[@class="expandable-content"]//dd/text()')
        if alias:
            item['other']['alias'] = alias
        else:
            logging.debug("No alias found for '%s'" % name)
        # Relevant left key-value pairs
        left = response.xpath(
            './/div[@class="left"]/div[@class="fieldGroup"]/dl')
        if left:
            item = self.extract_dl_key_value(left, item)
        else:
            logging.debug(
                "No relevant key-value pairs found on the left box for '%s'"
                % name)
        # Relevant right key-value pairs
        right = response.xpath(
            './/div[@class="right"]/div[@class="fieldGroup split"]/dl')
        if right:
            item = self.extract_dl_key_value(right, item)
        else:
            logging.debug(
                "No relevant key-value pairs found on the right box for '%s'"
                % name)
        return super(RKDArtistsSpider, self).refine_item(response, item)
Example #20
    def parse_fellow(self, response):
        yield WebSourcesCorpusItem(
            url=response.url,
            bio=text.clean_extract(response,
                                   './/div[@class="expandableBio"]//text()'),
            other=json.dumps(response.meta)
        )
Example #21
    def text_from_node(self, node):
        return (text.clean_extract(node, './/text()', sep=' ')
                if node.xpath('local-name()').extract()
                else text.clean(node.extract()))