def parse_related_documents(self, response):
    """Yield one legal act item carrying the document's relations.

    The item's ``_id`` is read from ``td[2]/b/text()`` of the rows matched
    by the first XPath. When that yields nothing, a ``NONUMBER-<p_id>``
    placeholder is used, where ``p_id`` comes from
    ``self._get_query_attr(response.url, 'p_id')``.

    Each row matched by the second XPath contributes its document id
    (``td[4]``) to the list named by its relation label (``td[6]``).
    Rows with a missing or unrecognised label are ignored. The
    ``relations`` field is only set when at least one row matched.
    """
    # Lower-cased relation label as it appears on the page -> key used in
    # the item's ``relations`` dict. Each label is listed exactly once.
    relation_keys = {
        u'pakeistas dokumentas': 'amends',
        u'ankstesnė dokumento redakcija': 'amends',
        u'priimtas dokumentas': 'adopts',
        u'ryšys su taikymą nusakančiu dokumentu': 'defines_applicability',
        u'ryšys su galiojimą nusakančiu dokumentu': 'defines_validity',
        u'negalioja de jure': 'defines_as_no_longer_valid',
        u'kitas projekto variantas': 'new_draft_version',
        u'ryšys su ratifikavimo dokumentu': 'ratification',
    }

    # Build the selector once; both tables are looked up on the same page.
    page = HtmlXPathSelector(response)

    xpath = '/html/body/div/table/tr[3]/td/table/tr/td/table/tr'
    hxs = page.select(xpath)
    act = Loader(self, response, LegalAct(), hxs, required=('_id', ))
    act.add_xpath('_id', 'td[2]/b/text()')
    if not act.get_output_value('_id'):
        p_id = unicode(self._get_query_attr(response.url, 'p_id'))
        act.replace_value('_id', u'NONUMBER-%s' % p_id)

    relations = defaultdict(list)
    # NOTE(review): the ``align`` step is kept verbatim from the original
    # expression; presumably it mirrors how the page markup gets parsed --
    # confirm against a live page before changing it.
    xpath = '/html/body/div/table/tr[3]/td/table/tr/td/align/table/tr'
    for row in page.select(xpath):
        docid = get_all(row, 'td[4]/span//text()')
        rel_type = row.select('td[6]/span/text()')
        if not rel_type:
            # No relation label in this row, nothing to classify.
            continue
        label = rel_type.extract()[0].strip().lower()
        key = relation_keys.get(label)
        if key is not None:
            relations[key].append(docid)

    if relations:
        # Hand over a plain dict, not the defaultdict.
        act.add_value('relations', dict(relations))

    yield act.load_item()
def _parse_law_act(self, response, hxs, base=False):
    """
    Build a loader pre-filled with the basic fields of a law act.

    Only Lithuanian documents are processed: for any other language
    ``None`` is returned, and a language outside the known set is also
    reported through ``self.error``.

    Parameters:

    base
        When true, return the loader as soon as ``_id`` has been set,
        i.e. with base information only. Used when a single law act is
        filled in from several law act documents.

    """
    known_languages = (u'lietuvių', u'rusų', u'anglų', u'ispanų')

    language_texts = hxs.select('tr[1]/td[4]/b/text()').extract()
    language = language_texts[0].strip().lower()
    if language not in known_languages:
        self.error(response, 'Unknown language: %s' % language)
    if language != u'lietuvių':
        return None

    loader = Loader(self, response, LegalAct(), hxs,
                    required=REQUIRED_FIELDS)
    loader.add_xpath('_id', 'tr[1]/td[2]/b/text()')

    source = self._get_source(response.url, 'p_id')
    if not loader.get_output_value('_id'):
        # No number on the page: fall back to an id derived from the source.
        loader.replace_value('_id', u'NONUMBER-%s' % source['id'])

    if not base:
        # Remaining fields, in the order they are added to the loader.
        field_xpaths = (
            ('name', 'caption/text()'),
            ('kind', 'tr[1]/td[1]/b/text()'),
            ('number', 'tr[1]/td[2]/b/text()'),
            ('date', 'tr[1]/td[3]/b/text()'),
        )
        for field, field_xpath in field_xpaths:
            loader.add_xpath(field, field_xpath)
        loader.add_value('source', source)
        self._fix_name_case(loader)

    return loader