def parse_detail(self, response):
    item = None
    for each in response.xpath(
            './/div[@class="tiInherit"]/parent::div/*')[3:]:
        content = each.xpath('child::node()')
        if content and content[0].xpath('local-name()').extract() == ['span']:
            # A leading <span> starts a new person entry: flush the previous one.
            if item:
                yield self.finalize(item)
            item = WebSourcesCorpusItem(
                url=response.url,
                name=' '.join(self.text_from_node(c) for c in content[:3]),
                bio=text.clean_extract(each, './/text()', sep=' '),
            )
            if each.xpath('./i'):
                item['other'] = {
                    'profession': text.clean_extract(each, './i//text()')
                }
            assert item['name'] and len(item['name']) > 3
        elif item:
            # Continuation paragraph: append it to the current biography.
            item['bio'] += '\n' + text.clean_extract(each, './/text()', sep=' ')
    if item:
        yield self.finalize(item)
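# Every callback in this section populates ``WebSourcesCorpusItem``, which is
# defined elsewhere in the project. The following is a minimal sketch of what
# its declaration presumably looks like, inferred from the fields the spiders
# actually set (url, name, birth, death, bio, other); it is an assumption,
# not the project's actual item definition.
import scrapy


class WebSourcesCorpusItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    birth = scrapy.Field()
    death = scrapy.Field()
    bio = scrapy.Field()
    other = scrapy.Field()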
def parse_detail(self, response):
    for each in response.xpath(
            './/div[@id="headerContainer"]/following-sibling::div//p'):
        yield WebSourcesCorpusItem(
            url=response.url,
            name=text.clean_extract(each, './span//text()'),
            bio=text.clean_extract(each, './/text()', sep=' '),
        )
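# The spiders also rely on a shared ``text`` utility module (some callbacks
# import ``clean_extract`` directly). It is not part of this section; the
# sketch below is an assumption about its behavior, based purely on how it is
# called: join the matched text nodes and normalize whitespace.
import re


def clean(s):
    # Collapse whitespace runs and trim the result.
    return re.sub(r'\s+', ' ', s).strip()


def clean_extract(selector, xpath, sep=' ', unicode=True):
    # Join everything the XPath matches, then clean it up. The ``unicode``
    # flag mirrors the ``unicode=False`` call seen further below; its exact
    # meaning in the real helper is guessed here (unicode vs. byte string).
    result = clean(sep.join(selector.xpath(xpath).extract()))
    return result if unicode else result.encode('utf-8')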
def refine_item(self, response, item):
    item['other'] = {}
    for section in response.xpath(
            './/div[@id="content"]//div[@class="featured"][child::h2]'):
        title = text.clean_extract(section, 'h2//text()')
        content = [text.clean_extract(p, './/text()')
                   for p in section.xpath('p')]
        item['other'][title] = content
    return super(SculptureUkSpider, self).refine_item(response, item)
def parse_bio(self, response):
    table = response.xpath('.//table[@align="center"]')
    fields = [text.clean_extract(field, './/text()', sep=' ')
              for field in table.xpath('./tr[1]/th')]
    bio = []
    for table_row in table.xpath('./tr[position()>1]'):
        values = [text.clean_extract(val, './/text()', sep=' ')
                  for val in table_row.xpath('./td')]
        bio.append(dict(zip(fields, values)))
    return bio
def parse_detail(self, response):
    for each in response.xpath(
            './/div[@id="headerContainer"]/following-sibling::p'):
        item = WebSourcesCorpusItem(
            url=response.url,
            bio=text.clean_extract(each, './/text()', sep=' '),
        )
        if each.xpath('./a'):
            item['name'] = text.clean_extract(each, './a[1]//text()')
        if 'name' in item or item['bio']:
            yield item
def parse_detail(self, response):
    artist_id = response.url.split('/')[-1]
    keys = response.xpath('.//div[@id="member_info"]//dt')
    values = response.xpath('.//div[@id="member_info"]//dd')
    info = dict((text.clean_extract(k, './/text()'),
                 text.clean_extract(v, './/text()'))
                for k, v in zip(keys, values))
    item = WebSourcesCorpusItem(
        url=response.url,
        name=info.pop('Real/full name:'),
        other=info,
    )
    yield Request(
        'http://www.metal-archives.com/artist/read-more/id/' + artist_id,
        self.parse_extern,
        meta={'item': item, 'field': 'bio', 'aid': artist_id})
def parse_person(self, response):
    item = WebSourcesCorpusItem()
    item['url'] = response.url
    name = clean_extract(response, "//h1[contains(@class, 'header')]//text()")
    if name:
        item['name'] = name
    else:
        logging.debug("No name found for item with URL '%s'" % item['url'])
    bio_nodes = response.xpath("//li[contains(., 'BIOGRAPHY')]").extract()
    if bio_nodes:
        item['bio'] = fromstring('\n'.join(bio_nodes)).text_content().strip()
    else:
        # item.get avoids a KeyError when no name was found above
        logging.debug("No raw text biography found for %s" % item.get('name'))
    item['other'] = {}
    keys = response.css('li#info td.fieldnameback')
    if keys:
        for key_node in keys:
            key_text = key_node.xpath('.//text()').extract_first()
            # Take the first sibling of the key node as the value
            value = key_node.xpath('./following-sibling::td[1]')
            if value:
                people_links = value.xpath(".//a[contains(@href, 'getperson')]")
                if people_links:
                    logging.debug("Values with links found for key '%s'" % key_text)
                    item['other'][key_text] = []
                    for person in people_links:
                        name = person.xpath('.//text()').extract_first()
                        link = person.xpath('@href').extract_first()
                        item['other'][key_text].append(
                            {name: response.urljoin(link)})
                else:
                    literal_value = clean_extract(value, './/text()')
                    item['other'][key_text] = literal_value
            else:
                logging.debug("No value found for key '%s'" % key_text)
    else:
        logging.debug("No semi-structured data found for '%s'" % item.get('name'))
    yield item
def extract_dl_key_value(self, dl_pairs, item):
    """ Feed the item with key-value pairs extracted from <dl> tags """
    for pair in dl_pairs:
        # extract_first may return None; default to '' to avoid a crash
        key = (pair.xpath('./dt/text()').extract_first() or '') \
            .replace(' ', '_').lower()
        value = clean_extract(pair, './dd//text()')
        if key or value:
            item['other'][key] = value
        else:
            logging.debug("Couldn't extract key or value from pair node '%s'" % pair)
    return item
def parse(self, response):
    current_item = None
    for p in response.xpath('.//div[@id="mw-content-text"]/p'):
        content = p.xpath('child::node()')
        if content and content[0].xpath('local-name()').extract() == ['a']:
            # A leading link starts a new entry: flush the previous one.
            if current_item is not None:
                if 'other' in current_item:
                    current_item['other'] = json.dumps(current_item['other'])
                yield current_item
            current_item = WebSourcesCorpusItem(
                url=text.clean_extract(content[0], '@href'),
                name=text.clean_extract(content[0], 'text()'),
                bio=' '.join(text.clean_extract(c, './/text()')
                             for c in content[1:]),
            )
        else:
            # extract_first avoids an IndexError on paragraphs without text
            txt = p.xpath('text()').extract_first() or ''
            m = re.match(ur'([^(]{,50})\((about )?(B\.C\. )?(\d+| ) ?- ?(\d+| )\)', txt)
            if m:
                # "Name (dates)" paragraph: this also starts a new entry.
                # Guard against current_item being None on the first match.
                if current_item is not None:
                    if 'other' in current_item:
                        current_item['other'] = json.dumps(current_item['other'])
                    yield current_item
                current_item = WebSourcesCorpusItem(
                    url=response.url,
                    name=m.group(1).strip(),
                    birth=(m.group(3) or '') + m.group(4),
                    death=(m.group(3) or '') + m.group(5),
                    bio=text.clean_extract(p, './/text()'),
                )
            elif current_item is not None:
                current_item['bio'] += text.clean_extract(p, './/text()')
    if current_item is not None:
        if 'other' in current_item:
            current_item['other'] = json.dumps(current_item['other'])
        yield current_item
def parse_other(self, response):
    table = response.xpath('.//table')[1]
    ul_with_caption = table.xpath(
        './/text()[string-length(.) > 1]/following-sibling::ul')
    res = {}
    for ul in ul_with_caption:
        caption = text.clean_extract(ul, 'preceding-sibling::text()')
        entries = ul.xpath('li/child::*').extract()
        res[caption] = entries
    res.pop('', None)
    return res
def refine_item(self, response, item):
    item['other'] = {}
    for ul in response.xpath(
            './/div[@id="stammdaten"]/div[contains(@class, "text")]//ul'):
        field = ul.xpath('preceding-sibling::h4/text()').extract()[-1]
        value = [text.clean_extract(li, './/text()', sep=' ')
                 for li in ul.xpath('li')]
        item['other'][field] = value
    for section in response.xpath('.//div[@class="section"]'):
        title = text.clean_extract(section, 'div[1]//text()')
        values = [text.clean_extract(li, './/text()')
                  for li in section.xpath('div[2]/ul/li')]
        if values:
            item['other'][title] = values
    item['name'] = text.clean(item['name'].replace('\t', ' '))
    return super(AcademiaNetSpider, self).refine_item(response, item)
def refine_item(self, response, item):
    data = {}
    for section in response.xpath('.//div[@class="biography-item-container"]'):
        title = text.clean_extract(section, 'div[1]//h3//text()')
        keys = [text.clean_extract(td, './/text()')
                for td in section.xpath(
                    './div[2]//table//td[contains(@class, "post")]')]
        values = [text.clean_extract(td, './/text()')
                  for td in section.xpath(
                      './div[2]//table//td[contains(@class, "date")]')]
        content = zip(keys, values)
        if content:
            data[title] = content
    item['other'] = data
    return super(ParliamentUkSpider, self).refine_item(response, item)
def refine_item(self, response, item):
    content = response.xpath('.//div[@id="mw-content-text"]/div[2]')
    children = content.xpath('./p/child::node()')
    if len(children) < 3 or children[2].xpath('local-name()').extract() != ['span']:
        return None
    else:
        name = children[2].xpath('.//text()').extract()
        if name:
            item['bio'] = text.clean_extract(content, './/text()')
            item['name'] = text.clean(children[1].extract() + ' ' + name[0])
        else:
            return None
    return super(MusiciansSpider, self).refine_item(response, item)
def refine_item(self, response, item):
    try:
        title = text.clean_extract(response.selector,
                                   './/h1[@id="firstHeading"]//text()')
        name, birth, death = self.parse_title(title)
    except (IndexError, ValueError):
        # not a person (could be a place or whatever else)
        logging.debug('Not a person at ' + response.url)
        return None
    else:
        item['name'] = name
        item['birth'] = birth
        item['death'] = death
        item['other'] = json.dumps({'title': title})
    return item
def refine_item(self, response, item):
    birth_death = text.clean_extract(
        response, './/div[@id="maincontent"]/p[1]/em').split('<br>')[0]
    birth_death = re.subn(r'<[^>]+>', '', birth_death)[0].split('d.')
    if len(birth_death) == 2:
        birth, death = birth_death
        birth = birth[len('b.'):].strip()
        death = death.strip()
        item['birth'] = birth if birth != '?' else None
        item['death'] = death if death != '?' else None
    return super(MunksrollSpider, self).refine_item(response, item)
def parse_extern(self, response):
    meta = response.meta
    txt = text.clean_extract(response.selector, './/text()')
    if meta['field'] == 'bio':
        meta['item']['bio'] = txt
        meta['field'] = 'trivia'
        yield Request(
            'http://www.metal-archives.com/artist/read-more/id/%s/field/trivia'
            % meta['aid'],
            self.parse_extern,
            meta=meta)
    else:  # trivia
        meta['item']['other']['trivia'] = txt
        meta['item']['other'] = json.dumps(meta['item']['other'])
        yield meta['item']
def refine_item(self, response, item):
    try:
        dates = text.clean_extract(
            response,
            './/div[@id="text"]/span[@class="article_header"]//text()'
        ).split('(')[1].split(')')[0]
    except IndexError:
        pass
    else:
        birth, death = text.parse_birth_death(dates.replace('\n', ''))
        if birth or death:
            item['birth'] = birth or None
            item['death'] = death or None
    item['name'] = '%s, %s' % (item['other'].pop('forename'),
                               item['other'].pop('surname'))
    return super(YbaLlgcOrgUkSpider, self).refine_item(response, item)
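# ``text.parse_birth_death`` is another helper that is not shown in this
# section. Judging from the call above, it splits a "birth - death" string
# taken from parentheses into its two halves. A hedged sketch covering only
# the plain numeric case (the real helper likely handles more formats):
import re


def parse_birth_death(dates):
    # "1900 - 1980" -> ('1900', '1980'); anything unparseable -> ('', '')
    match = re.match(r'\s*(\d*)\s*-\s*(\d*)\s*$', dates)
    if not match:
        return '', ''
    return match.group(1), match.group(2)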
def refine_item(self, response, item):
    base = './/table//td[1]//table//tr[3]//table//td'
    data = {}
    item['bio'] = ''
    for paragraph in response.xpath(base + '//p'):
        fields = paragraph.xpath('./b/text()').extract()
        if not fields:
            # Paragraph without bold labels: plain biography text.
            inner_text = text.clean_extract(paragraph, './/text()',
                                            sep=' ', unicode=False)
            item['bio'] += inner_text + '\n'
        else:
            # Bold labels act as keys; split the text nodes at each label.
            contents = paragraph.xpath('.//text()').extract()
            for field, values in text.split_at(contents, fields):
                if field is not None:
                    data[field.lower().strip().replace(':', '')] = \
                        ' '.join(values).strip()
    item['birth'] = data.get('born')
    item['death'] = data.get('died')
    item['other'] = data
    if not item['bio']:
        del item['bio']
    return super(NndbComSpider, self).refine_item(response, item)
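# ``text.split_at`` is likewise assumed rather than shown: given the loop
# above, it appears to partition a flat list of text nodes into (label,
# values) groups, one group per bold field label, plus a leading
# (None, values) group for any text before the first label. A sketch of that
# presumed behavior:
def split_at(sequence, delimiters):
    current, group = None, []
    for element in sequence:
        if element in delimiters:
            # Close the running group and start a new one under this label.
            yield current, group
            current, group = element, []
        else:
            group.append(element)
    yield current, group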
def refine_item(self, response, item):
    item['other'] = {}
    name = item['name']
    # Alias
    alias = clean_extract(
        response,
        './/div[@class="expandable-header"][contains(., "Name variations")]'
        '/following-sibling::div[@class="expandable-content"]//dd/text()')
    if alias:
        item['other']['alias'] = alias
    else:
        logging.debug("No alias found for '%s'" % name)
    # Relevant left key-value pairs
    left = response.xpath('.//div[@class="left"]/div[@class="fieldGroup"]/dl')
    if left:
        item = self.extract_dl_key_value(left, item)
    else:
        logging.debug("No relevant key-value pairs found on the left box for '%s'" % name)
    # Relevant right key-value pairs
    right = response.xpath('.//div[@class="right"]/div[@class="fieldGroup split"]/dl')
    if right:
        item = self.extract_dl_key_value(right, item)
    else:
        logging.debug("No relevant key-value pairs found on the right box for '%s'" % name)
    return super(RKDArtistsSpider, self).refine_item(response, item)
def parse_fellow(self, response):
    yield WebSourcesCorpusItem(
        url=response.url,
        bio=text.clean_extract(response, './/div[@class="expandableBio"]//text()'),
        other=json.dumps(response.meta),
    )
def text_from_node(self, node):
    # Element nodes have a local name; plain text nodes do not.
    return (text.clean_extract(node, './/text()', sep=' ')
            if node.xpath('local-name()').extract()
            else text.clean(node.extract()))