def parse_check_numbers(params, verbose=False):
    """
    Checks if a list contains interval info.

    This function exists only to keep read_timeline() short enough that
    Python won't throw warnings about it; there is no need to call it
    outside that function.

    :param params: the list to check.
    :param verbose: optional flag. When true, extra parsing information is
        printed to the console. Defaults to false.
    """
    # make sure it's a set of 3 numbers
    if len(params) != 3:
        if verbose:
            print(magenta("\t\tinvalid parameter count:"), len(params))
        return False

    # make sure each of the 3 numbers is valid
    for param in params:
        try:
            num = int(param, 10)  # 10 is the number base
        # catch parsing errors
        except ValueError:
            if verbose:
                print(red("\t\tinvalid integer > "), end='')
                print(add_quotes(param))
            return False
        else:
            # int() allows negatives, but we don't want those
            if num < 0:
                if verbose:
                    print(red("\t\tinvalid integer range > "), end='')
                    print(add_quotes(param))
                return False

    return True
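# Illustrative usage sketch (not part of the original module; assumes the
# magenta/red color helpers and add_quotes() used above are available here):
#
#   parse_check_numbers(['1', '2', '3'])                  # -> True
#   parse_check_numbers(['1', '2'])                       # -> False (wrong count)
#   parse_check_numbers(['1', '-2', '3'], verbose=True)   # -> False (negative value)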
def prompt(msg, minor=False, default_yes=False):
    if default_yes:
        choices = '[Yn]'
    else:
        choices = '[yN]'

    if minor:
        ansicolor.write_out(ansicolor.magenta('-> %s %s ' % (msg, choices)))
    else:
        ansicolor.write_out(ansicolor.magenta('> %s %s ' % (msg, choices)))

    inp = raw_input()

    if default_yes:
        return False if 'n' in inp else True
    else:
        return True if 'y' in inp else False
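# Illustrative usage sketch (not part of the original code; assumes ansicolor
# is imported at module level, as the calls above imply; the called helpers
# clear_cache() and run_next_step() are hypothetical):
#
#   if prompt("Delete all cached files?"):            # defaults to "no"
#       clear_cache()
#   if prompt("Continue?", default_yes=True):         # defaults to "yes"
#       run_next_step()
#
# With default_yes=True, any answer that does not contain 'n' counts as yes;
# otherwise only an answer containing 'y' counts as yes.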
def parse_opinion(self, response):
    """
    Parse one pre-law opinion
    """
    op_data = response.meta['op_data']
    parl_id = LAW.PARL_ID.xt(response)
    description = LAW.DESCRIPTION.xt(response)
    docs = self.parse_docs(response)
    category = self.parse_category(response)
    keywords = self.parse_keywords(response)

    entity = OPINION.ENTITY.xt(response)
    entity['title'] = op_data['title'] or entity['title_detail']
    entity['email'] = entity['email'] or op_data['email']

    entity_item, created = Entity.objects.get_or_create(
        title=entity['title'],
        title_detail=entity['title_detail'])

    if entity['phone'] and not entity_item.phone:
        entity_item.phone = entity['phone']
    if entity['email'] and not entity_item.email:
        entity_item.email = entity['email']

    opinion_item, created = Opinion.objects.get_or_create(
        parl_id=parl_id,
        defaults={
            'date': op_data['date'],
            'description': description,
            'source_link': response.url,
            'entity': entity_item,
            'prelaw': response.meta['law_item'],
            'category': category
        })

    # Foreign Keys
    opinion_item.documents = docs
    opinion_item.keywords = keywords

    response.meta['opinion'] = opinion_item
    step_num = self.parse_op_steps(response)

    entity_str = u"{} / {} / {} [{}]".format(
        green(entity_item.title_detail),
        entity_item.phone,
        entity_item.email,
        'new' if created else 'updated')
    log.msg(
        u"Opinion: {} by {}".format(
            magenta(opinion_item.parl_id),
            entity_str))
def send_request(url, headers, redirects, data=None):
    if data is None:
        try:
            r = requests.get(url, headers=headers, timeout=timeout_,
                             verify=False, allow_redirects=redirects)
        except requests.exceptions.ConnectionError:
            print "Error. Connection refused."
            sys.exit(1)
        except requests.exceptions.Timeout:
            print magenta("[!]") + " Response time exceeded {} seconds!".format(timeout_)
            sys.exit(1)
    else:
        try:
            r = requests.post(url, headers=headers, data=data, timeout=timeout_,
                              verify=False, allow_redirects=redirects)
        except requests.exceptions.ConnectionError:
            print "Error. Connection refused."
            sys.exit(1)
        except requests.exceptions.Timeout:
            print magenta("[!]") + " Response time exceeded {} seconds!".format(timeout_)
            sys.exit(1)

    if r.status_code not in (200, 302):
        print "Error with HTTP code", r.status_code
        print r.text
        sys.exit(-1)

    return r
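# Illustrative usage sketch (not part of the original code; assumes timeout_
# and the magenta() helper are module-level globals, as the function body
# implies, and uses a placeholder URL):
#
#   headers = {'User-Agent': 'scanner/1.0'}
#   resp = send_request('http://example.com/login', headers, redirects=True)   # GET
#   resp = send_request('http://example.com/login', headers, False,
#                       data={'user': 'admin', 'pass': 'admin'})               # POST when data is given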
def f_verbose(value):
    if ("[X]" in value) or ("[+]" in value):
        f_save(value + '\n')

    # text between backticks (the odd-indexed chunks after the split) is
    # highlighted in blue
    col_cred = value.split('`')
    neutrino = ''
    for index, item in enumerate(col_cred):
        if index & 1:
            neutrino += blue(item)
        else:
            neutrino += item

    if "[X]" in neutrino:
        print neutrino.replace("[X]", red("[X]"))
    elif "[+]" in neutrino:
        print neutrino.replace("[+]", yellow("[+]"))
    elif args.verbose:
        print neutrino.replace("[*]", green("[*]")).replace("[!]", magenta("[!]"))

    return
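# Illustrative usage sketch (not part of the original code; assumes f_save(),
# args and the color helpers are defined elsewhere in this script):
#
#   f_verbose("[+] Found credentials `admin:admin`")   # saved via f_save(), printed
#                                                      # with the backtick part in blue
#   f_verbose("[*] Probing next host")                 # printed only when args.verbose is set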
def parse(self, response):
    # Extract fields
    title = LAW.TITLE.xt(response)
    parl_id = LAW.PARL_ID.xt(response)
    ts = GENERIC.TIMESTAMP.xt(response)

    if not (u'BI' in parl_id or u'PET' in parl_id):
        # VBG have their parl_id only in the url
        parl_id = response.url.split('/')[-2]

    status = LAW.STATUS.xt(response)

    raw_llp = response.url.split('/')[-4]
    if raw_llp != u'BR':
        LLP = LegislativePeriod.objects.get(roman_numeral=raw_llp)
    else:
        LLP = None

    if not self.IGNORE_TIMESTAMP and not self.has_changes(
            parl_id, LLP, response.url, ts):
        self.logger.info(
            green(u"Skipping Petition, no changes: {}".format(title)))
        return

    # save ids and stuff for internals
    if LLP not in self.idlist:
        self.idlist[LLP] = {}
    self.idlist[LLP][response.url] = [parl_id, LLP]

    # Extract foreign keys
    category = self.parse_category(response)
    description = LAW.DESCRIPTION.xt(response)

    signing_url, signable = PETITION.SIGNING.xt(response)
    signature_count = PETITION.SIGNATURE_COUNT.xt(response)

    # Parse reference
    reference = self.parse_reference(response)

    # Log our progress
    logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
        red(title),
        magenta(u"[{}]".format(parl_id)),
        green(str(LLP)),
        blue(response.url))
    log.msg(logtext, level=log.INFO)

    # Create and save Petition
    petition_item, petition_item_created = Petition.objects.update_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults={
            'title': title,
            'status': status,
            'source_link': response.url,
            'description': description,
            'signable': signable,
            'signing_url': signing_url,
            'signature_count': signature_count,
            'reference': reference,
            'ts': ts
        })

    if not petition_item_created:
        petition_item.save()

    # Attach foreign keys
    petition_item.keywords = self.parse_keywords(response)
    petition_item.category = category
    petition_item.documents = self.parse_docs(response)

    petition_item.save()

    # Parse creators
    petition_creators = self.parse_creators(response)
    for petition_creator in petition_creators:
        petition_creator.created_petitions.add(petition_item)

    callback_requests = []

    # is the tab 'Parlamentarisches Verfahren' available?
    if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
        response.meta['petition_item'] = petition_item
        self.parse_parliament_steps(response)

    # Parse opinions
    opinions = PETITION.OPINIONS.xt(response)
    if opinions:
        for op in opinions:
            if Opinion.objects.filter(parl_id=op['parl_id']).exists():
                continue
            post_req = scrapy.Request("{}/{}".format(BASE_HOST, op['url']),
                                      callback=self.parse_opinion,
                                      dont_filter=True)
            post_req.meta['petition_item'] = petition_item
            post_req.meta['op_data'] = op
            callback_requests.append(post_req)

    # Only BI or PET (but not PET-BR) have online signatures
    if (u'BI' in parl_id or u'PET' in parl_id) and u'PET-BR' not in parl_id:
        signatures_base_url = (
            '{}/PAKT/VHG/{}/{}/{}/filter.psp?xdocumentUri=/PAKT/VHG/{}/{}/{}/'
            'index.shtml&GP_CODE={}&ITYP={}&INR={}&FBEZ=BI_001&R_1000=ALLE&STEP=&pageNumber=')

        raw_parl_id = petition_item.parl_id[1:-1].split('/')
        petition_type = raw_parl_id[1]
        petition_number = int(raw_parl_id[0])
        url_parl_id = '{}_{}'.format(petition_type, petition_number)

        signatures_url = signatures_base_url.format(
            BASE_HOST, LLP.roman_numeral, petition_type, url_parl_id,
            LLP.roman_numeral, petition_type, url_parl_id,
            LLP.roman_numeral, petition_type, petition_number)

        post_req = scrapy.Request(signatures_url,
                                  callback=self.parse_signatures,
                                  dont_filter=True)
        post_req.meta['petition_item'] = petition_item
        callback_requests.append(post_req)

    log.msg(green("Open Callback requests: {}".format(
        len(callback_requests))), level=log.INFO)

    return callback_requests
def parse(self, response):
    # Parse
    parl_id = COMITTEE.url_to_parlid(response.url)[1]
    ts = GENERIC.TIMESTAMP.xt(response)

    llp = COMITTEE.LLP.xt(response)
    name = COMITTEE.NAME.xt(response)

    if llp is not None:
        nrbr = 'Nationalrat'
        legislative_period = LegislativePeriod.objects.get(
            roman_numeral=llp)
        # NR comittees are "active" if they are in the current LLP
        active = (
            legislative_period == LegislativePeriod.objects.get_current())
    else:
        nrbr = 'Bundesrat'
        legislative_period = None
        # BR comittees are active if they are not "aufgelöst"
        active = COMITTEE.ACTIVE.xt(response)

    # main-comittee parl_id starts with the number 1
    # sub-comittees parl_id start with the number 2
    if not parl_id.startswith(u'(1/'):
        try:
            parent_parl_id = u'(1/{}'.format(parl_id.split('/')[1])
            parent_comitee = Comittee.objects.get(
                parl_id=parent_parl_id,
                legislative_period=legislative_period)
        except Comittee.DoesNotExist:
            parent_comitee = None
    else:
        parent_comitee = None

    if not self.IGNORE_TIMESTAMP and not self.has_changes(
            parl_id, legislative_period, nrbr, response.url, ts):
        self.logger.info(
            green(u"Skipping Comittee, no changes: {}".format(name)))
        return

    # Log our progress
    logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
        red(name),
        magenta(u"[{}]".format(parl_id)),
        green(unicode(llp)),
        blue(response.url)
    )
    log.msg(logtext, level=log.INFO)

    description = COMITTEE.DESCRIPTION.xt(response)

    comittee_data = {
        'description': description,
        'name': name,
        'source_link': response.url,
        'parent_comittee': parent_comitee,
        'active': active,
        'ts': ts
    }

    try:
        comittee_item, created_comittee = Comittee.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=legislative_period,
            nrbr=nrbr,
            defaults=comittee_data
        )
    except:
        log.msg(
            u"Could not update/create Comittee {}".format(name),
            level=log.ERROR)
        return

    meetings = COMITTEE.MEETINGS.xt(response)

    comittee_laws = []

    for meeting in meetings:
        agenda_data = meeting['agenda']
        if agenda_data is not None:
            agenda_item, agenda_created = Document.objects.get_or_create(
                **agenda_data)
        else:
            agenda_item = None

        meeting_data = {
            'agenda': agenda_item
        }

        # Log our progress
        logtext = u"Scraping meeting no. {} of {} on {}".format(
            red(meeting['number']),
            magenta(name),
            green(str(meeting['date'].date())),
        )
        log.msg(logtext, level=log.INFO)

        meeting_item, meeting_created = ComitteeMeeting.objects.update_or_create(
            number=meeting['number'],
            date=meeting['date'],
            comittee=comittee_item,
            defaults=meeting_data
        )

        for topic in meeting['topics']:
            if topic['law'] is not None:
                law = topic['law']
                law_item = self.parse_law(law)
                if law_item is not None:
                    comittee_laws.append(law_item)
            else:
                law_item = None

            agenda_topic_data = {
                'comment': topic['comment'],
                'law': law_item,
            }

            agenda_topic_item, agenda_topic_created = ComitteeAgendaTopic.objects.update_or_create(
                number=topic['number'],
                meeting=meeting_item,
                text=topic['text'],
                defaults=agenda_topic_data,
            )

    # parse Verhandlungsgegenstaende and Veroeffentlichungen
    laws_and_reports = COMITTEE.LAWS.xt(response)

    for law in laws_and_reports:
        # Log our progress
        logtext = u"Adding law with id {}, LLP {} to {}".format(
            magenta(u"[{}]".format(law['parl_id'])),
            green(law['llp']),
            blue(name)
        )
        log.msg(logtext, level=log.INFO)

        law_item = self.parse_law(law)
        if law_item is not None:
            comittee_laws.append(law_item)

    comittee_item.laws.add(*comittee_laws)
    comittee_item.save()
def suggest(msg, minor=False):
    if minor:
        ansicolor.write_out(ansicolor.magenta('-> %s\n' % msg))
    else:
        ansicolor.write_out(ansicolor.magenta('> %s\n' % msg))
def parse(self, response):
    # Extract fields
    ts = GENERIC.TIMESTAMP.xt(response)
    title = LAW.TITLE.xt(response)
    parl_id = LAW.PARL_ID.xt(response)
    LLP = LegislativePeriod.objects.get(
        roman_numeral=response.url.split('/')[-4])

    if not self.IGNORE_TIMESTAMP and not self.has_changes(
            parl_id, LLP, response.url, ts):
        self.logger.info(
            green(u"Skipping Law, no changes: {}".format(title)))
        return

    # save ids and stuff for internals
    if LLP not in self.idlist:
        self.idlist[LLP] = {}
    self.idlist[LLP][response.url] = [parl_id, LLP]

    # Extract foreign keys
    category = self.parse_category(response)
    description = PRELAW.DESCRIPTION.xt(response)

    # Log our progress
    logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
        red(title),
        magenta(u"[{}]".format(parl_id)),
        green(str(LLP)),
        blue(response.url)
    )
    log.msg(logtext, level=log.INFO)

    # Create and save Law
    pre_law_data = {
        'title': title,
        'description': description,
        'ts': ts
    }
    law_item, created = Law.objects.get_or_create(
        parl_id=parl_id,
        source_link=response.url,
        legislative_period=LLP,
        defaults=pre_law_data)

    if not created:
        law_item.save()

    # Attach foreign keys
    law_item.keywords = self.parse_keywords(response)
    law_item.category = category
    law_item.documents = self.parse_docs(response)

    law_item.save()

    # Parse opinions
    opinions = PRELAW.OPINIONS.xt(response)

    callback_requests = []

    if opinions:
        skipped_ops = 0
        for op in opinions:
            if Opinion.objects.filter(parl_id=op['parl_id']).exists():
                skipped_ops += 1
                continue
            post_req = scrapy.Request("{}/{}".format(BASE_HOST, op['url']),
                                      callback=self.parse_opinion,
                                      dont_filter=True)
            post_req.meta['law_item'] = law_item
            post_req.meta['op_data'] = op
            callback_requests.append(post_req)

        log.msg(green("Open/Skipped Callback requests: {}/{}".format(
            len(callback_requests), skipped_ops)), level=log.INFO)

    return callback_requests
def parse(self, response):
    # Extract fields
    ts = GENERIC.TIMESTAMP.xt(response)
    title = LAW.TITLE.xt(response)
    parl_id = LAW.PARL_ID.xt(response)
    LLP = LegislativePeriod.objects.get(
        roman_numeral=response.url.split('/')[-4])

    if not self.IGNORE_TIMESTAMP and not self.has_changes(
            parl_id, LLP, response.url, ts):
        self.logger.info(
            green(u"Skipping Law, no changes: {}".format(title)))
        return

    # save ids and stuff for internals
    if LLP not in self.idlist:
        self.idlist[LLP] = {}
    self.idlist[LLP][response.url] = [parl_id, LLP]

    # Extract foreign keys
    category = self.parse_category(response)
    description = PRELAW.DESCRIPTION.xt(response)

    # Log our progress
    logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
        red(title),
        magenta(u"[{}]".format(parl_id)),
        green(unicode(LLP)),
        blue(response.url))
    log.msg(logtext, level=log.INFO)

    # Create and save Law
    pre_law_data = {
        'title': title,
        'description': description,
        'source_link': response.url,
        'ts': ts
    }
    law_item, created = Law.objects.get_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults=pre_law_data)

    if not created:
        law_item.save()

    # Attach foreign keys
    law_item.keywords = self.parse_keywords(response)
    law_item.category = category
    law_item.documents = self.parse_docs(response)

    law_item.save()

    # Parse opinions
    opinions = PRELAW.OPINIONS.xt(response)

    callback_requests = []

    if opinions:
        skipped_ops = 0
        for op in opinions:
            if Opinion.objects.filter(parl_id=op['parl_id']).exists():
                skipped_ops += 1
                continue
            post_req = scrapy.Request("{}/{}".format(BASE_HOST, op['url']),
                                      callback=self.parse_opinion,
                                      dont_filter=True)
            post_req.meta['law_item'] = law_item
            post_req.meta['op_data'] = op
            callback_requests.append(post_req)

        log.msg(green("Open/Skipped Callback requests: {}/{}".format(
            len(callback_requests), skipped_ops)), level=log.INFO)

    return callback_requests
def parse(self, response):
    # Extract fields
    title = LAW.TITLE.xt(response)
    parl_id = LAW.PARL_ID.xt(response)
    ts = GENERIC.TIMESTAMP.xt(response)

    if not (u'BI' in parl_id or u'PET' in parl_id):
        # VBG have their parl_id only in the url
        parl_id = response.url.split('/')[-2]

    status = LAW.STATUS.xt(response)

    raw_llp = response.url.split('/')[-4]
    if raw_llp != u'BR':
        LLP = LegislativePeriod.objects.get(
            roman_numeral=raw_llp)
    else:
        LLP = None

    if not self.IGNORE_TIMESTAMP and not self.has_changes(
            parl_id, LLP, response.url, ts):
        self.logger.info(
            green(u"Skipping Petition, no changes: {}".format(title)))
        return

    # save ids and stuff for internals
    if LLP not in self.idlist:
        self.idlist[LLP] = {}
    self.idlist[LLP][response.url] = [parl_id, LLP]

    # Extract foreign keys
    category = self.parse_category(response)
    description = LAW.DESCRIPTION.xt(response)

    signing_url, signable = PETITION.SIGNING.xt(response)
    signature_count = PETITION.SIGNATURE_COUNT.xt(response)

    # Parse reference
    reference = self.parse_reference(response)

    # Log our progress
    logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
        red(title),
        magenta(u"[{}]".format(parl_id)),
        green(str(LLP)),
        blue(response.url)
    )
    log.msg(logtext, level=log.INFO)

    # Create and save Petition
    petition_item, petition_item_created = Petition.objects.update_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults={
            'title': title,
            'status': status,
            'source_link': response.url,
            'description': description,
            'signable': signable,
            'signing_url': signing_url,
            'signature_count': signature_count,
            'reference': reference,
            'ts': ts
        }
    )

    if not petition_item_created:
        petition_item.save()

    # Attach foreign keys
    petition_item.keywords = self.parse_keywords(response)
    petition_item.category = category
    petition_item.documents = self.parse_docs(response)

    petition_item.save()

    # Parse creators
    petition_creators = self.parse_creators(response)
    for petition_creator in petition_creators:
        petition_creator.created_petitions.add(petition_item)

    callback_requests = []

    # is the tab 'Parlamentarisches Verfahren' available?
    if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
        response.meta['petition_item'] = petition_item
        self.parse_parliament_steps(response)

    # Parse opinions
    opinions = PETITION.OPINIONS.xt(response)
    if opinions:
        for op in opinions:
            if Opinion.objects.filter(parl_id=op['parl_id']).exists():
                continue
            post_req = scrapy.Request("{}/{}".format(BASE_HOST, op['url']),
                                      callback=self.parse_opinion,
                                      dont_filter=True)
            post_req.meta['petition_item'] = petition_item
            post_req.meta['op_data'] = op
            callback_requests.append(post_req)

    # Only BI or PET (but not PET-BR) have online signatures
    if (u'BI' in parl_id or u'PET' in parl_id) and u'PET-BR' not in parl_id:
        signatures_base_url = (
            '{}/PAKT/VHG/{}/{}/{}/filter.psp?xdocumentUri=/PAKT/VHG/{}/{}/{}/'
            'index.shtml&GP_CODE={}&ITYP={}&INR={}&FBEZ=BI_001&R_1000=ALLE&STEP=&pageNumber=')

        raw_parl_id = petition_item.parl_id[1:-1].split('/')
        petition_type = raw_parl_id[1]
        petition_number = int(raw_parl_id[0])
        url_parl_id = '{}_{}'.format(petition_type, petition_number)

        signatures_url = signatures_base_url.format(
            BASE_HOST, LLP.roman_numeral, petition_type, url_parl_id,
            LLP.roman_numeral, petition_type, url_parl_id,
            LLP.roman_numeral, petition_type, petition_number)

        post_req = scrapy.Request(signatures_url,
                                  callback=self.parse_signatures,
                                  dont_filter=True)
        post_req.meta['petition_item'] = petition_item
        callback_requests.append(post_req)

    log.msg(green("Open Callback requests: {}".format(
        len(callback_requests))), level=log.INFO)

    return callback_requests
def parse(self, response):
    # Parse
    parl_id = COMITTEE.url_to_parlid(response.url)[1]
    ts = GENERIC.TIMESTAMP.xt(response)

    LLP = COMITTEE.LLP.xt(response)
    name = COMITTEE.NAME.xt(response)

    if LLP is not None:
        nrbr = 'Nationalrat'
        legislative_period = LegislativePeriod.objects.get(
            roman_numeral=LLP)
        # NR comittees are always "active", only BR comittees are either
        # active or inactive
        active = True
    else:
        nrbr = 'Bundesrat'
        legislative_period = None
        # BR comittees are active if they are not "aufgelöst"
        active = COMITTEE.ACTIVE.xt(response)

    # main-comittee parl_id starts with the number 1
    # sub-comittees parl_id start with the number 2
    if not parl_id.startswith(u'(1/'):
        try:
            parent_parl_id = u'(1/{}'.format(parl_id.split('/')[1])
            parent_comitee = Comittee.objects.get(
                parl_id=parent_parl_id,
                legislative_period=legislative_period)
        except Comittee.DoesNotExist:
            parent_comitee = None
    else:
        parent_comitee = None

    if not self.IGNORE_TIMESTAMP and not self.has_changes(
            parl_id, legislative_period, nrbr, response.url, ts):
        self.logger.info(
            green(u"Skipping Comittee, no changes: {}".format(name)))
        return

    # Log our progress
    logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
        red(name),
        magenta(u"[{}]".format(parl_id)),
        green(unicode(LLP)),
        blue(response.url)
    )
    log.msg(logtext, level=log.INFO)

    description = COMITTEE.DESCRIPTION.xt(response)

    comittee_data = {
        'description': description,
        'name': name,
        'source_link': response.url,
        'parent_comittee': parent_comitee,
        'ts': ts
    }

    try:
        comittee_item, created_comittee = Comittee.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=legislative_period,
            nrbr=nrbr,
            active=active,
            defaults=comittee_data
        )
    except:
        log.msg(
            u"Could not update/create Comittee {}".format(name),
            level=log.ERROR)
        return

    meetings = COMITTEE.MEETINGS.xt(response)

    comittee_laws = []

    for meeting in meetings:
        agenda_data = meeting['agenda']
        if agenda_data is not None:
            agenda_item, agenda_created = Document.objects.get_or_create(
                **agenda_data)
        else:
            agenda_item = None

        meeting_data = {
            'agenda': agenda_item
        }

        # Log our progress
        logtext = u"Scraping meeting no. {} of {} on {}".format(
            red(meeting['number']),
            magenta(name),
            green(str(meeting['date'].date())),
        )
        log.msg(logtext, level=log.INFO)

        meeting_item, meeting_created = ComitteeMeeting.objects.update_or_create(
            number=meeting['number'],
            date=meeting['date'],
            comittee=comittee_item,
            defaults=meeting_data
        )

        for topic in meeting['topics']:
            if topic['law'] is not None:
                law = topic['law']
                law_item = self.parse_law(law)
                if law_item is not None:
                    comittee_laws.append(law_item)
            else:
                law_item = None

            agenda_topic_data = {
                'comment': topic['comment'],
                'law': law_item,
            }

            agenda_topic_item, agenda_topic_created = ComitteeAgendaTopic.objects.update_or_create(
                number=topic['number'],
                meeting=meeting_item,
                text=topic['text'],
                defaults=agenda_topic_data,
            )

    # parse Verhandlungsgegenstaende and Veroeffentlichungen
    laws_and_reports = COMITTEE.LAWS.xt(response)

    for law in laws_and_reports:
        # Log our progress
        logtext = u"Adding law with id {}, LLP {} to {}".format(
            magenta(u"[{}]".format(law['parl_id'])),
            green(law['llp']),
            blue(name)
        )
        log.msg(logtext, level=log.INFO)

        law_item = self.parse_law(law)
        if law_item is not None:
            comittee_laws.append(law_item)

    comittee_item.laws.add(*comittee_laws)
    comittee_item.save()
def f_verbose(value):
    if args.verbose:
        print value.replace("[X]", red("[X]")) \
                   .replace("[*]", green("[*]")) \
                   .replace("[!]", magenta("[!]")) \
                   .replace("safe", blue("safe"))
    return