Exemplo n.º 1
0
def print_answer(p):
    """Log the flag, opcode, opcode error and key of an SAPMS answer packet.

    Each numeric field is translated through its lookup table
    (``ms_flag_values``, ``ms_opcode_values``, ``ms_opcode_error_values``).
    When a translation fails, a fallback value is used instead of raising.

    :param p: packet carrying an ``SAPMS`` layer and a ``key`` field.
    """
    # ``except Exception`` (not a bare ``except``) keeps the best-effort
    # fallbacks while letting KeyboardInterrupt / SystemExit propagate.
    try:
        flag = ms_flag_values[p[SAPMS].flag]
    except Exception:
        flag = "0"
    try:
        opcode = str(ms_opcode_values[p[SAPMS].opcode])
    except Exception:
        # Unknown opcode: show the raw value instead of a symbolic name.
        opcode = str(p[SAPMS].opcode)
    try:
        opcode_err = str(ms_opcode_error_values[p[SAPMS].opcode_error])
    except Exception:
        opcode_err = 'None'

    # Success is shown in green, everything else in bold red.
    if opcode_err == 'MSOP_OK':
        opcode_err = green(opcode_err)
    else:
        opcode_err = red(opcode_err, bold=True)

    if p.key != null_key:
        mskey_parse_print(p.key)
        key = p.key.encode('hex')
    else:
        key = "NULL"

    # NOTE: opcode_err is already colourised above; the outer green() call is
    # kept so the emitted log line stays exactly as it was.
    logger.debug("flag: " + cyan(flag) + " opcode:" + cyan(opcode) +
                 " opcode_error: " + green(opcode_err) + " key: %s" % key)
Exemplo n.º 2
0
    def parse_inquiry_response(self, response):
        """
        Callback function for parsing the inquiry responses.

        Reads the parent inquiry from ``response.meta['inquiry_item']``,
        creates or updates the matching ``InquiryResponse`` and links it to
        that inquiry. Nothing is stored when the responding person cannot be
        looked up.
        """
        inquiry_item = response.meta['inquiry_item']
        source_link = response.url
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
        LLP = inquiry_item.legislative_period
        category = INQUIRY.CATEGORY.xt(response)

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        try:
            sender_object = Person.objects.get(
                parl_id=INQUIRY.RESPONSESENDER.xt(response))
        except Exception:
            # Catch Exception rather than everything, so KeyboardInterrupt /
            # SystemExit are not mistaken for a missing person.
            log.msg(
                red(u'Receiver was not found in database, skipping Inquiry {} in LLP {}'
                    .format(parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiryresponse_item, inquiryresponse_created = InquiryResponse.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'sender': sender_object
            })

        # Attach foreign keys
        inquiryresponse_item.documents = self.parse_response_docs(response)
        inquiryresponse_item.category = cat

        # Save InquiryResponse object
        inquiryresponse_item.save()

        if inquiryresponse_created:
            logtext = u"[{} of {}] Created InquiryResponse {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated InquiryResponse {} with ID {}, LLP {} @ {}"

        logtext = logtext.format(self.SCRAPED_COUNTER, self.TOTAL_COUNTER,
                                 cyan(title), cyan(u"{}".format(parl_id)),
                                 green(str(LLP)), blue(response.url))
        log.msg(logtext, level=log.INFO)

        # Link the stored response back to the inquiry it answers.
        inquiry_item.response = inquiryresponse_item
        inquiry_item.save()
Exemplo n.º 3
0
def handle_answer(s, p):
    """Print a summary of an SAPMS answer and reply to server requests.

    :param s: connection the packet arrived on; used to send the reply.
    :param p: received packet carrying an ``SAPMS`` layer and a ``key`` field.
    """
    # ``except Exception`` (not a bare ``except``) keeps the best-effort
    # fallbacks while letting KeyboardInterrupt / SystemExit propagate.
    try:
        flag = ms_flag_values[p[SAPMS].flag]
    except Exception:
        flag = "0"
    try:
        opcode = str(ms_opcode_values[p[SAPMS].opcode])
    except Exception:
        # Unknown opcode: show the raw value instead of a symbolic name.
        opcode = str(p[SAPMS].opcode)
    try:
        opcode_err = str(ms_opcode_error_values[p[SAPMS].opcode_error])
    except Exception:
        opcode_err = 'None'

    # Success is shown in green, everything else in bold red.
    if opcode_err == 'MSOP_OK':
        opcode_err = green(opcode_err)
    else:
        opcode_err = red(opcode_err, bold=True)

    if p.key != null_key:
        p.show()
        key = " key: " + yellow('NOT NULL', bold=True)
        # Single-argument parenthesised print behaves the same as the print
        # statement on Python 2 and is also valid on Python 3.
        print("[!] Out of order packets, reload this script.")
        #s.close()
        #exit(0)
    else:
        key = ""

    print("flag: " + cyan(flag) + " opcode:" + cyan(opcode) +
          " opcode_error: " + green(opcode_err) + key)

    # Identify request from the server?
    if key != "" and flag == 'MS_REQUEST' and opcode == '0':
        s.send(ms_adm_nilist(p, 1))
def handle_answer(s, p):
    """Log a summary of an SAPMS answer packet.

    :param s: connection the packet arrived on (not used by this variant).
    :param p: received packet carrying an ``SAPMS`` layer and a ``key`` field.
    """
    # ``except Exception`` (not a bare ``except``) keeps the best-effort
    # fallbacks while letting KeyboardInterrupt / SystemExit propagate.
    try:
        flag = ms_flag_values[p[SAPMS].flag]
    except Exception:
        flag = "0"
    try:
        opcode = str(ms_opcode_values[p[SAPMS].opcode])
    except Exception:
        # Unknown opcode: show the raw value instead of a symbolic name.
        opcode = str(p[SAPMS].opcode)
    try:
        opcode_err = str(ms_opcode_error_values[p[SAPMS].opcode_error])
    except Exception:
        opcode_err = 'None'

    # Success is shown in green, everything else in bold red.
    if opcode_err == 'MSOP_OK':
        opcode_err = green(opcode_err)
    else:
        opcode_err = red(opcode_err, bold=True)

    if p.key != null_key:
        key = " key: " + yellow('NOT NULL', bold=True)
        logger.error("[!] Out of order packets, reload this script.")
        #s.close()
        #exit(0)
    else:
        key = ""

    logger.info("flag: " + cyan(flag) + " opcode:" + cyan(opcode) +
                " opcode_error: " + green(opcode_err) + key)
Exemplo n.º 5
0
def format_field_as_txt(field_name: str,
                        field_doc: FieldDoc,
                        second_column: int,
                        field_prefix: str = '') -> str:
    """Render one documented field as a two-column plain-text entry.

    The first column holds the coloured field name, the second the wrapped
    description. An optional ``Examples:`` line follows when ``field_doc``
    has an ``'examples'`` key. The entry ends with two blank lines.

    :param field_name: name of the field.
    :param field_doc: mapping with a ``'description'`` entry and optionally
        an ``'examples'`` entry.
    :param second_column: column at which description lines start.
    :param field_prefix: text put in front of ``field_name``.
    :returns: the formatted entry.
    """
    # Visible width of the name column; measured before colouring because
    # the colour codes add characters that take no space on screen.
    field_name_length = \
        INDENT + \
        len(field_prefix + field_name + FIELD_SUFFIX) + \
        INDENT

    field_name = \
        ' ' * INDENT + \
        ansicolor.cyan(field_prefix + field_name) + FIELD_SUFFIX + \
        ' ' * INDENT

    description_indent = ' ' * second_column

    # The first wrapped line is padded and the padding is then swapped for the
    # field name. The padding must be at least as wide as the name column,
    # otherwise the slice below would cut into the description itself.
    first_line_indent = ' ' * max(second_column, field_name_length)

    description = field_doc['description']
    wrapped = textwrap.fill(
        description,
        width=78,
        initial_indent=first_line_indent,
        subsequent_indent=description_indent
    )
    output = field_name + wrapped[field_name_length:] + '\n'

    if 'examples' in field_doc:
        output += description_indent + \
            ansicolor.yellow('Examples:') + ' ' + \
            str(field_doc['examples']) + '\n'
    output += '\n\n'

    return output
Exemplo n.º 6
0
    def parse_steps(self, response):
        """
        Store the steps of the Pre-Law attached to ``response.meta``.

        All steps are filed under the phase titled 'default', which is
        created on first use.
        """
        parent_law = response.meta['law_item']

        # Make sure the default phase exists
        default_phase, phase_is_new = Phase.objects.get_or_create(
            title='default')
        if phase_is_new:
            phase_label = green(u'[{}]'.format(default_phase.title))
            log.msg(u"Created Phase {}".format(phase_label))

        extracted_steps = PRELAW.STEPS.xt(response)
        if extracted_steps:
            count_label = cyan(u'[{}]'.format(len(extracted_steps)))
            log.msg(u"Creating {} steps".format(count_label))

        # One Step row per extracted entry
        for entry in extracted_steps:
            lookup = {
                'title': entry['title'],
                'sortkey': entry['sortkey'],
                'date': entry['date'],
                'protocol_url': entry['protocol_url'],
                'law': parent_law,
                'phase': default_phase,
                'source_link': response.url,
            }
            saved_step, _step_is_new = Step.objects.update_or_create(**lookup)
            saved_step.save()
Exemplo n.º 7
0
    def parse_steps(self, response):
        """
        Persist every step extracted for the Pre-Law in ``response.meta``.

        Steps are attached to the law and to the phase titled 'default'; the
        phase is created if it does not exist yet.
        """
        law = response.meta['law_item']

        # Fetch the default phase, creating it when missing
        phase, phase_created = Phase.objects.get_or_create(title='default')
        if phase_created:
            log.msg(u"Created Phase {}".format(
                green(u'[{}]'.format(phase.title))))

        step_rows = PRELAW.STEPS.xt(response)
        if step_rows:
            log.msg(u"Creating {} steps".format(
                cyan(u'[{}]'.format(len(step_rows)))))

        # Write the steps
        for row in step_rows:
            step_fields = dict(
                title=row['title'],
                sortkey=row['sortkey'],
                date=row['date'],
                protocol_url=row['protocol_url'],
                law=law,
                phase=phase,
                source_link=response.url,
            )
            step_obj, _step_created = Step.objects.update_or_create(
                **step_fields)
            step_obj.save()
    def parse(self, response):
        """
        Scrape a single law page.

        Stores (or refreshes) the Law together with its category, keywords
        and documents, then calls the step parsers for whichever procedure
        tabs are present on the page.
        """
        # Plain fields read straight from the page
        law_title = LAW.TITLE.xt(response)
        law_parl_id = LAW.PARL_ID.xt(response)
        law_status = LAW.STATUS.xt(response)

        # The legislative period is the fourth-to-last URL segment
        period_numeral = response.url.split('/')[-4]
        LLP = LegislativePeriod.objects.get(roman_numeral=period_numeral)

        category_title = LAW.CATEGORY.xt(response)
        law_description = LAW.DESCRIPTION.xt(response)

        # Make sure the category exists
        category_obj, category_is_new = Category.objects.get_or_create(
            title=category_title)
        if category_is_new:
            category_label = green(u'[{}]'.format(category_title))
            log.msg(u"Created category {}".format(category_label))

        # Store the law itself
        law_item, law_created = Law.objects.update_or_create(
            parl_id=law_parl_id,
            legislative_period=LLP,
            source_link=response.url,
            defaults={
                'title': law_title,
                'status': law_status,
                'description': law_description,
            })

        # Related objects
        law_item.keywords = self.parse_keywords(response)
        law_item.category = category_obj
        law_item.documents = self.parse_docs(response)

        law_item.save()

        # Report what happened
        template = (u"Created {} with id {}, LLP {} @ {}" if law_created
                    else u"Updated {} with id {}, LLP {} @ {}")
        log.msg(
            template.format(
                red(law_title),
                cyan(u"[{}]".format(law_parl_id)),
                green(str(LLP)),
                blue(response.url)),
            level=log.INFO)

        response.meta['law_item'] = law_item

        # Is the tab 'Parlamentarisches Verfahren' available?
        has_parliament_tab = response.xpath(
            '//h2[@id="tab-ParlamentarischesVerfahren"]')
        if has_parliament_tab:
            self.parse_parliament_steps(response)

        # Same check for the tab 'Vorparlamentarisches Verfahren'
        has_pre_parliament_tab = response.xpath(
            '//h2[@id="tab-VorparlamentarischesVerfahren"]')
        if has_pre_parliament_tab:
            self.parse_pre_parliament_steps(response)
Exemplo n.º 9
0
def inform(msg, minor=False, major=False):
    """Write ``msg`` via ``ansicolor.write_out`` with a level-dependent marker.

    ``major`` takes precedence over ``minor``: major messages are yellow with
    a ``>>>`` marker, minor ones cyan with ``->``, all others green with ``>``.
    """
    if major:
        colour, template = ansicolor.yellow, '>>> %s\n'
    elif minor:
        colour, template = ansicolor.cyan, '-> %s\n'
    else:
        colour, template = ansicolor.green, '> %s\n'
    ansicolor.write_out(colour(template % msg))
Exemplo n.º 10
0
def inform(msg, minor=False, major=False):
    """Emit a coloured, newline-terminated status line for ``msg``.

    The marker and colour depend on the level: ``>>>`` in yellow when
    ``major`` is set, otherwise ``->`` in cyan when ``minor`` is set,
    otherwise ``>`` in green.
    """
    if major:
        text = ansicolor.yellow('>>> %s\n' % msg)
    elif minor:
        text = ansicolor.cyan('-> %s\n' % msg)
    else:
        text = ansicolor.green('> %s\n' % msg)
    ansicolor.write_out(text)
Exemplo n.º 11
0
    def write_progress(self, rate=None, prestart=None, wait=None, complete=False, error=None):
        """
        Write one progress line for this download through ``ioutils.write_err``.

        The line shows the action, a status column, the url and a size
        column. The status is chosen in this order: ``error`` text,
        "starting" for ``prestart``, the retry wait for ``wait``, "done" for
        ``complete``, otherwise the formatted ``rate``. Errors and completion
        are also passed to ``self.log_url``.
        """
        action_col = self.action.rjust(self.actionwidth)

        # Status text and its colour are decided together
        if error:
            status, paint = error, ansicolor.red
        elif prestart:
            status, paint = "starting", ansicolor.cyan
        elif wait:
            status, paint = ("%s" % self.retry_wait) + "s...", ansicolor.cyan
        elif complete:
            status, paint = "done", ansicolor.green
        else:
            status, paint = "%s/s" % self.format_size(rate), ansicolor.yellow
        status = status.ljust(self.ratewidth)

        url_col = self.url_fmt

        # Prefer the total size, fall back to what has been downloaded so far
        known_size = self.totalsize or self.download_size
        if known_size:
            size_col = self.format_size(known_size)
        else:
            size_col = "????? B"
        size_col = ("  %s" % size_col).ljust(self.sizewidth)

        # Colour is applied only after padding
        status = paint(status)

        # While a transfer with known total is running, part of the url is
        # drawn in reverse video to act as a progress bar
        if self.totalsize and not error and not prestart and not complete:
            filled = int(self.urlwidth * self.download_size / self.totalsize)
            url_col = ansicolor.wrap_string(self.url_fmt, filled, None, reverse=True)

        if not self.totalsize:
            size_col = ansicolor.yellow(size_col)

        prefix = "%s ::  %s  " % (action_col, status)

        # Final states and DEBUG_FETCH runs end the line; otherwise the
        # carriage return lets the next update overwrite it
        if error or complete or os.environ.get("DEBUG_FETCH"):
            terminator = "\n"
        else:
            terminator = "\r"
        ioutils.write_err("%s%s%s%s" % (prefix, url_col, size_col, terminator))

        # Record the outcome
        if error:
            self.log_url(error, error=True)
        elif complete:
            self.log_url("done")
Exemplo n.º 12
0
def zoom_print_facets(result):
    """Print every facet in ``result`` followed by its count/name rows.

    ``result`` maps a facet name to a list of entries, each with a
    ``'count'`` and a ``'name'``. Nothing is printed for an empty mapping.
    """
    facet_names = result.keys()
    if not facet_names:
        return
    #count = str(result['device'][0]['count'])
    #facets_k.remove("device")
    #print "Total: %s" % red(count, bold=True)
    for facet in facet_names:
        print("- %s" % green(facet, bold=True))
        for entry in result[facet]:
            hits = str(entry['count'])
            label = entry['name']
            if isinstance(label, int):
                # integer names need converting before they can be coloured
                label = str(label)
            print(hits.ljust(9) + cyan(label).ljust(20))
        # blank line between facets
        print("")
Exemplo n.º 13
0
    def print_debug(self):
        """
        Build the spider's debug banner and print it.
        """
        banner_fields = {
            'bar': cyan(
                '############################################################'),
            'title': red(self.title),
            'llps': self.LLP or "Not applicable",
            'url': self.BASE_URL,
        }
        template = """
    {bar}

    {title}

      Scraping LLPs: {llps}
      Base URL:      {url}

    {bar}
        """
        print(template.format(**banner_fields))
Exemplo n.º 14
0
    def print_debug(self):
        """
        Print a banner with the spider title, the LLPs and the base URL.
        """
        separator = cyan(
            '############################################################')
        heading = red(self.title)
        periods = self.LLP or "Not applicable"
        layout = """
    {bar}

    {title}

      Scraping LLPs: {llps}
      Base URL:      {url}

    {bar}
        """
        banner = layout.format(
            bar=separator, title=heading, llps=periods, url=self.BASE_URL)
        print(banner)
Exemplo n.º 15
0
    def print_debug(self):
        """
        Print a banner describing this spider run: title, LLPs, whether
        timestamps are ignored, and the base URL.
        """
        values = {
            'bar': cyan(
                '############################################################'
            ),
            'title': red(self.title),
            'llps': self.LLP or "Not applicable",
            'url': self.BASE_URL,
            'IGNORE_TIMESTAMP': self.IGNORE_TIMESTAMP,
        }
        layout = """
    {bar}

    {title}

      Scraping LLPs: {llps}
      Ignoring Timestamps: {IGNORE_TIMESTAMP}
      Base URL:      {url}

    {bar}
        """
        print(layout.format(**values))
Exemplo n.º 16
0
    def parse(self, response):
        """
        Parse a person list page.

        Creates or updates every listed Person together with the mandates
        for the legislative period (``GP``) and function (``NRBR``) named in
        the URL's query string.

        Returns a list with one detail-page request per person that has not
        been seen before.
        """

        # rss = feedparser.parse(response.url)

        persons = PERSON.LIST.xt(response)

        callback_requests = []

        # which llp are we in?
        urloptions = response.url.split('?')[1]

        llp_roman = [
            opt.split('=')[1] for opt in urloptions.split('&')
            if opt.split('=')[0] == 'GP'
        ]
        llp_item = LegislativePeriod.objects.get(roman_numeral=llp_roman[0])

        # function string
        function = [
            opt.split('=')[1] for opt in urloptions.split('&')
            if opt.split('=')[0] == 'NRBR'
        ]
        function_str = self.RSS_TO_FUNCTION[function[0]]
        function_item, f_created = Function.objects.get_or_create(
            title=function_str)

        self.logger.info(u"Scraping {} persons for LLP {}".format(
            len(persons), llp_roman))

        # Iterate all persons
        for p in persons:
            # Extract basic data
            parl_id = p['source_link'].split('/')[-2]
            p['source_link'] = "{}{}".format(BASE_HOST, p['source_link'])

            changed = False
            # Create or update simple person's item
            try:
                person_data = {
                    'reversed_name': p['reversed_name'],
                    'source_link': p['source_link']
                }
                person_item, created_person = Person.objects.update_or_create(
                    parl_id=parl_id, defaults=person_data)
            except Exception as e:
                self.logger.warning("Error saving Person {}: {}".format(
                    green(u'[{}]'.format(p['reversed_name'])), e))
                continue
            if created_person:
                self.logger.info(u"Created Person {}".format(
                    green(u'[{}]'.format(p['reversed_name']))))
            else:
                self.logger.info(u"Updated Person {}".format(
                    green(u"[{}]".format(p['reversed_name']))))
            for mandate in p['mandates']:
                party_item = self.get_party_item(mandate)
                state_item = self.get_state_item(p['electoral_state'])
                # Create and append mandate
                try:
                    mandate_item, m_created = Mandate.objects.update_or_create(
                        function=function_item,
                        legislative_period=llp_item,
                        party=party_item,
                        state=state_item)
                except Exception:
                    self.logger.info(
                        red(u"Error saving Mandate {} ({})".format(
                            function_item, party_item)))
                    # Skip this mandate: no debugger breakpoint in an
                    # unattended crawl, and mandate_item would otherwise be
                    # unbound or left over from the previous iteration.
                    continue
                if mandate_item not in person_item.mandates.all():
                    changed = True
                    person_item.mandates.add(mandate_item)
            if changed:
                # In case we added/modified a mandate now,
                latest_mandate_item = person_item.get_latest_mandate()
                person_item.latest_mandate = latest_mandate_item
                self.logger.info(
                    cyan(u"Latest mandate for {} is now {}".format(
                        person_item, latest_mandate_item)))
                person_item.save()

            # First time we encounter a person, we scan her detail page too
            if parl_id not in self.persons_scraped:
                # Create Detail Page request
                req = scrapy.Request(p['source_link'],
                                     callback=self.parse_person_detail)
                req.meta['person'] = {
                    'reversed_name': p['reversed_name'],
                    'source_link': p['source_link'],
                    'parl_id': parl_id
                }
                callback_requests.append(req)
                self.persons_scraped.append(parl_id)
        return callback_requests
Exemplo n.º 17
0
    def parse(self, response):
        """
        Parse a single inquiry page.

        Creates or updates the Inquiry with its category, keywords,
        documents and senders. The page is skipped when ``has_changes``
        reports no changes, or when a sender or the receiver cannot be
        looked up.

        Returns a list of follow-up requests (the inquiry response page, if
        one was found), or ``None`` when the page was skipped.
        """
        source_link = response.url
        category = INQUIRY.CATEGORY.xt(response)
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.DESCRIPTION.xt(response)
        sender_objects = []
        callback_requests = []
        ts = GENERIC.TIMESTAMP.xt(response)

        # Inquiries from Bundesrat don't have an LLP => set None
        if "BR" in category:
            LLP = None
        else:
            LLP = LegislativePeriod.objects.get(
                roman_numeral=response.url.split('/')[-4])
        if not self.has_changes(parl_id, LLP, response.url, ts):
            self.logger.info(
                green(u"Skipping Inquiry, no changes: {}".format(
                    title)))
            return

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # An inquiry can have multiple senders, but only a single recipient.
        # Try/except in case a person does not exist in the database. Only
        # Exception is caught so KeyboardInterrupt / SystemExit still stop
        # the crawl.
        try:
            for sender_object in INQUIRY.SENDER.xt(response):
                sender_objects.append(Person.objects.get(
                    parl_id=sender_object))
        except Exception:
            log.msg(red(u'Sender was not found in database, skipping Inquiry {} in LLP {}'.format(
                parl_id, LLP)))
            return
        try:
            receiver_object = Person.objects.get(
                parl_id=INQUIRY.RECEIVER.xt(response))
        except Exception:
            log.msg(red(u'Receiver was not found in database, skipping Inquiry {} in LLP {}'.format(
                parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'receiver': receiver_object,
                'ts': ts
            }
        )

        # Attach foreign keys
        inquiry_item.keywords = self.parse_keywords(response)
        inquiry_item.documents = self.parse_docs(response)
        inquiry_item.category = cat
        inquiry_item.sender = sender_objects

        response.meta['inquiry_item'] = inquiry_item

        # Dringliche / Urgent inquiries have a different structure for steps
        # and history. This case distinction accommodates these different
        # structures.
        if any("Dringliche" in '{}'.format(s) for s in inquiry_item.keywords.all()):
            if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
                self.parse_parliament_steps(response)
        else:
            response_link = self.parse_steps(response)
            if response_link:
                post_req = scrapy.Request("{}{}".format(BASE_HOST, response_link),
                                          callback=self.parse_inquiry_response,
                                          dont_filter=True)
                post_req.meta['inquiry_item'] = inquiry_item

                callback_requests.append(post_req)

        # Save Inquiry item and log to terminal if created or updated.
        inquiry_item.save()

        if inquiry_created:
            logtext = u"Created Inquiry {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"Updated Inquiry {} with ID {}, LLP {} @ {}"

        # The templates have four placeholders; a fifth argument (the
        # keywords) used to be passed here but never appeared in the output.
        logtext = logtext.format(
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(str(LLP)),
            blue(response.url)
        )
        log.msg(logtext, level=log.INFO)

        log.msg(green("Open Callback requests: {}".format(
            len(callback_requests))), level=log.INFO)

        return callback_requests
Exemplo n.º 18
0
    def parse_inquiry_response(self, response):
        """
        Callback function for parsing the inquiry responses.

        Reads the parent inquiry from ``response.meta['inquiry_item']``,
        creates or updates the matching ``InquiryResponse`` and links it to
        that inquiry. Nothing is stored when the responding person cannot be
        looked up.
        """
        inquiry_item = response.meta['inquiry_item']
        source_link = response.url
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
        LLP = inquiry_item.legislative_period
        category = INQUIRY.CATEGORY.xt(response)

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        try:
            sender_object = Person.objects.get(
                parl_id=INQUIRY.RESPONSESENDER.xt(response))
        except Exception:
            # Catch Exception rather than everything, so KeyboardInterrupt /
            # SystemExit are not mistaken for a missing person.
            log.msg(red(u'Receiver was not found in database, skipping Inquiry {} in LLP {}'.format(
                parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiryresponse_item, inquiryresponse_created = InquiryResponse.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'sender': sender_object
            }
        )

        # Attach foreign keys
        inquiryresponse_item.documents = self.parse_docs(response)
        inquiryresponse_item.category = cat

        # Save InquiryResponse object
        inquiryresponse_item.save()

        if inquiryresponse_created:
            logtext = u"Created InquiryResponse {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"Updated InquiryResponse {} with ID {}, LLP {} @ {}"

        logtext = logtext.format(
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(str(LLP)),
            blue(response.url)
        )
        log.msg(logtext, level=log.INFO)

        # Link the stored response back to the inquiry it answers.
        inquiry_item.response = inquiryresponse_item
        inquiry_item.save()
Exemplo n.º 19
0
    def parse(self, response):
        """
        Parse a person list page.

        Creates or updates every listed Person together with the mandates
        for the legislative period (``GP``) and function (``NRBR``) named in
        the URL's query string.

        Returns a list with one detail-page request per person that has not
        been seen before.
        """

        # rss = feedparser.parse(response.url)

        persons = PERSON.LIST.xt(response)

        callback_requests = []

        # which llp are we in?
        urloptions = response.url.split('?')[1]

        llp_roman = [opt.split('=')[1]
                     for opt in urloptions.split('&') if opt.split('=')[0] == 'GP']
        llp_item = LegislativePeriod.objects.get(roman_numeral=llp_roman[0])

        # function string
        function = [opt.split('=')[1]
                    for opt in urloptions.split('&') if opt.split('=')[0] == 'NRBR']
        function_str = self.RSS_TO_FUNCTION[function[0]]
        function_item, f_created = Function.objects.get_or_create(
            title=function_str)

        self.logger.info(
            "Scraping {} persons for LLP {}".format(len(persons), llp_roman))

        # Iterate all persons
        for p in persons:
            # Extract basic data
            parl_id = p['source_link'].split('/')[-2]
            p['source_link'] = "{}{}".format(BASE_HOST, p['source_link'])

            # Create or update simple person's item
            person_data = {
                'reversed_name': p['reversed_name']
            }
            person_item, created_person = Person.objects.update_or_create(
                source_link=p['source_link'],
                parl_id=parl_id,
                defaults=person_data
            )
            if created_person:
                self.logger.info(u"Created Person {}".format(
                    green(u'[{}]'.format(p['reversed_name']))))
            else:
                self.logger.info(u"Updated Person {}".format(
                    green(u"[{}]".format(p['reversed_name']))
                ))

            for mandate in p['mandates']:
                party_item = self.get_party_item(mandate)
                state_item = self.get_state_item(p['electoral_state'])
                # Create and append mandate
                try:
                    mandate_item, m_created = Mandate.objects.update_or_create(
                        function=function_item,
                        legislative_period=llp_item,
                        party=party_item,
                        state=state_item)
                except Exception:
                    self.logger.info(
                        red("Error saving Mandate {} ({})".format(function_item, party_item)))
                    # Skip this mandate: no debugger breakpoint in an
                    # unattended crawl, and mandate_item would otherwise be
                    # unbound or left over from the previous iteration.
                    continue

                person_item.mandates.add(mandate_item)

            # Do a save to update the db models
            person_item.save()

            # In case we added/modified a mandate now,
            if p['mandates']:
                latest_mandate_item = person_item.get_latest_mandate()
                person_item.latest_mandate = latest_mandate_item
                self.logger.info(
                    cyan("Latest mandate for {} is now {}".format(person_item, latest_mandate_item)))
                person_item.save()

            # First time we encounter a person, we scan her detail page too
            if parl_id not in self.persons_scraped:

                # Create Detail Page request
                req = scrapy.Request(p['source_link'],
                                     callback=self.parse_person_detail)
                req.meta['person'] = {
                    'reversed_name': p['reversed_name'],
                    'source_link': p['source_link'],
                    'parl_id': parl_id
                }
                callback_requests.append(req)
                self.persons_scraped.append(parl_id)
        return callback_requests
Exemplo n.º 20
0
class InquiriesSpider(BaseSpider):
    """Spider that scrapes parliamentary inquiries and their responses."""

    # RSS filter endpoint; start_requests() appends the query string to it.
    BASE_URL = "{}/{}".format(BASE_HOST, "PAKT/JMAB/filter.psp")

    # Default query parameters for the RSS filter request. start_requests()
    # works on a copy of this dict and sets 'GP' and 'NRBR' per request.
    URLOPTIONS = {
        'view': 'RSS',
        'jsMode': 'RSS',
        'xdocumentUri': '/PAKT/JMAB/index.shtml',
        'NRBR': 'NR',
        'anwenden': 'Anwenden',
        'JMAB': 'J_JPR_M',
        'VHG2': 'ALLE',
        'SUCH': '',
        'listeId': '105',
        'FBEZ': 'FP_005'
    }

    name = "inquiries"
    # NOTE(review): mutable class-level list, shared between instances; none
    # of the methods visible in this class reads or writes it.
    inquiries_scraped = []

    def __init__(self, **kw):
        """
        Set up the spider from its keyword arguments.

        All keyword arguments are forwarded to the base class. Two optional
        keys are interpreted here:

        llp -- a single legislative period number; when it converts to an
               integer, self.LLP is replaced by a one-element list with it.
        url -- a single URL to scrape instead of the RSS-derived URL list.
        """
        super(InquiriesSpider, self).__init__(**kw)

        if 'llp' in kw:
            try:
                self.LLP = [int(kw['llp'])]
            except (TypeError, ValueError):
                # Unusable value: keep whatever self.LLP already holds, but
                # say so instead of ignoring the argument silently.
                self.logger.warning(
                    u"Ignoring invalid llp argument: {!r}".format(kw['llp']))

        self.cookies_seen = set()
        self.idlist = {}
        self.url_override = kw.get('url', None)

    def start_requests(self):
        """
        Yield one request for every inquiry URL that should be scraped.

        When a URL override is set, only that URL is requested. Otherwise the
        RSS filter feed is fetched for each legislative period in self.LLP,
        once for 'NR' and once for 'BR', and the link of every feed entry is
        requested. self.TOTAL_COUNTER is set to the number of URLs.
        """
        # This predefined list of URLs is chosen to include all types of
        # inquiries possible in the Austrian parliament in order to provide a
        # suitable testing surface for new functions.
        # urls = ["https://www.parlament.gv.at/PAKT/VHG/XXV/JPR/JPR_00019/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/XXV/JPR/JPR_00016/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/XXV/J/J_06954/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/XXV/M/M_00178/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/XXV/JEU/JEU_00003/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/XXV/J/J_06758/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/BR/J-BR/J-BR_03089/index.shtml",
        #         "https://www.parlament.gv.at/PAKT/VHG/BR/J-BR/J-BR_03091/index.shtml", "http://www.parlament.gv.at/PAKT/VHG/BR/J-BR/J-BR_01155/index.shtml", "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_06110/index.shtml", "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_06651/index.shtml", "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_04024/index.shtml", "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_04025/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/XX/M/M_00178/index.shtml"]
        urls = [self.url_override] if self.url_override else []

        if self.LLP and not self.url_override:
            for period in self.LLP:
                for chamber in ('NR', 'BR'):
                    numeral = roman.toRoman(period)

                    # Per-request copy so the class-level defaults stay intact.
                    query = self.URLOPTIONS.copy()
                    query['GP'] = numeral
                    query['NRBR'] = chamber
                    feed_url = "{}?{}".format(self.BASE_URL, urlencode(query))
                    entries = feedparser.parse(feed_url)['entries']

                    self.logger.info("GP {}: {} inquiries from {}".format(
                        numeral, len(entries), chamber))
                    urls.extend([item['link'] for item in entries])

        self.TOTAL_COUNTER = len(urls)
        for target in urls:
            yield self.make_requests_from_url(target)

    def parse(self, response):
        """
        Parse one inquiry page and create or update its Inquiry record.

        The page is skipped (nothing is written) when its timestamp equals
        the stored one and self.IGNORE_TIMESTAMP is not set, or when a sender
        or the receiver cannot be resolved to a Person.

        Returns a list of follow-up requests (at most one, for the written
        response page), or None when the inquiry is skipped.
        """
        self.SCRAPED_COUNTER += 1

        source_link = response.url
        category = INQUIRY.CATEGORY.xt(response)
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.DESCRIPTION.xt(response)
        sender_objects = []
        callback_requests = []
        ts = GENERIC.TIMESTAMP.xt(response)

        # Inquiries from Bundesrat don't have an LLP => set None
        if "BR" in category:
            LLP = None
        else:
            LLP = LegislativePeriod.objects.get(
                roman_numeral=response.url.split('/')[-4])
        if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, LLP, response.url, ts):
            self.logger.debug(
                green(u"[{} of {}] Skipping Inquiry, no changes: {}".format(
                    self.SCRAPED_COUNTER,
                    self.TOTAL_COUNTER,
                    title)))
            return

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            self.logger.debug(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # An inquiry can have multiple senders, but only a single recipient.
        # Any lookup failure skips the inquiry; Exception (rather than a bare
        # except) keeps KeyboardInterrupt/SystemExit propagating.
        try:
            for sender_object in INQUIRY.SENDER.xt(response):
                sender_objects.append(Person.objects.get(
                    parl_id=sender_object))
        except Exception:
            self.logger.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.SENDER.xt(response), parl_id, LLP)))
            return
        try:
            receiver_object = Person.objects.get(
                parl_id=INQUIRY.RECEIVER.xt(response))
        except Exception:
            self.logger.warning(red(u'Receiver "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.RECEIVER.xt(response), parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'receiver': receiver_object,
                'ts': ts
            }
        )

        if inquiry_created:
            inquiry_item.status = 'offen'

        # Attach foreign keys
        inquiry_item.keywords = self.parse_keywords(response)
        inquiry_item.documents = self.parse_docs(response)
        inquiry_item.category = cat
        inquiry_item.sender = sender_objects

        # The step parsers read the inquiry from response.meta.
        response.meta['inquiry_item'] = inquiry_item

        # Dringliche / Urgent inquiries have a different structure for steps
        # and history. This case distinction accommodates these different
        # structures.
        if any("Dringliche" in '{}'.format(s) for s in inquiry_item.keywords.all()):
            if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
                self.parse_parliament_steps(response)
        else:
            response_link = self.parse_steps(response)
            if response_link:
                post_req = scrapy.Request("{}{}".format(BASE_HOST, response_link),
                                          callback=self.parse_inquiry_response,
                                          dont_filter=True)
                post_req.meta['inquiry_item'] = inquiry_item

                callback_requests.append(post_req)

        # Save Inquiry item and log to terminal if created or updated.
        inquiry_item.save()

        if inquiry_created:
            logtext = u"[{} of {}] Created Inquiry {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated Inquiry {} with ID {}, LLP {} @ {}"

        # Exactly one argument per placeholder.
        logtext = logtext.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(unicode(LLP)),
            blue(response.url)
        )
        # Every 1000th item is logged at INFO, all others at DEBUG.
        log.msg(logtext, level=log.INFO if self.SCRAPED_COUNTER % 1000 == 0 else log.DEBUG)

        return callback_requests

    def has_changes(self, parl_id, legislative_period, source_link, ts):
        """
        Tell whether an inquiry page has to be (re-)scraped.

        Returns True when no Inquiry with the given parl_id, legislative
        period and source link is stored, or when the stored timestamp
        differs from ts (tagged as UTC); returns False otherwise.
        """
        lookup = {
            'parl_id': parl_id,
            'legislative_period': legislative_period,
            'source_link': source_link,
        }
        if not Inquiry.objects.filter(**lookup).exists():
            return True

        # Attach UTC to ts before comparing it with the stored value.
        ts = ts.replace(tzinfo=pytz.utc)
        stored_ts = Inquiry.objects.get(**lookup).ts
        return stored_ts != ts

    def parse_keywords(self, response):
        """
        Resolve the keywords of a page to Keyword objects.

        Keywords that are not in the database yet are created and logged at
        debug level. Returns the Keyword objects in extraction order.
        """
        resolved = []
        for label in INQUIRY.KEYWORDS.xt(response):
            keyword_obj, is_new = Keyword.objects.get_or_create(title=label)
            if is_new:
                message = u"Created keyword {}".format(
                    green(u'[{}]'.format(label)))
                log.msg(message, level=log.DEBUG)
            resolved.append(keyword_obj)
        return resolved

    def parse_docs(self, response):
        """
        Resolve the documents listed on an inquiry page to Document objects.

        Every extracted entry is looked up by title, HTML link and PDF link
        (with stripped_html=None) and created when no such row exists yet.
        Returns the Document objects in extraction order.
        """
        return [
            Document.objects.get_or_create(
                title=entry['title'],
                html_link=entry['html_url'],
                pdf_link=entry['pdf_url'],
                stripped_html=None
            )[0]
            for entry in INQUIRY.DOCS.xt(response)
        ]

    def parse_response_docs(self, response):
        """
        Resolve the documents listed on a response page to Document objects.

        Every extracted entry is looked up by title, HTML link and PDF link
        (with stripped_html=None) and created when no such row exists yet.
        Returns the Document objects in extraction order.
        """
        return [
            Document.objects.get_or_create(
                title=entry['title'],
                html_link=entry['html_url'],
                pdf_link=entry['pdf_url'],
                stripped_html=None
            )[0]
            for entry in INQUIRY.RESPONSEDOCS.xt(response)
        ]

    def parse_steps(self, response):
        """
        Store the single-page history of a normal (non-urgent) inquiry.

        All steps are attached to the inquiry in response.meta['inquiry_item']
        under one shared 'default_inqu' phase.

        Returns the extracted response link when a step whose title contains
        "Schriftliche Beantwortung" exists and the link is non-empty,
        otherwise None.
        """
        answer_link = []
        inquiry = response.meta['inquiry_item']

        # Simple inquiries have no phases of their own, so their steps are
        # all filed under one placeholder phase.
        default_phase, phase_is_new = Phase.objects.get_or_create(
            title='default_inqu')
        if phase_is_new:
            log.msg(u"Created Phase {}".format(
                green(u'[{}]'.format(default_phase.title))), level=log.DEBUG)

        step_entries = INQUIRY.STEPS.xt(response)

        # A written answer among the steps means there is a response page.
        for entry in step_entries:
            if "Schriftliche Beantwortung" not in entry["title"]:
                continue
            answer_link = INQUIRY.RESPONSE_LINK.xt(response)

        for entry in step_entries:
            stored_step, step_is_new = Step.objects.update_or_create(
                title=entry['title'],
                sortkey=entry['sortkey'],
                date=entry['date'],
                protocol_url=entry['protocol_url'],
                law=inquiry,
                phase=default_phase,
                source_link=response.url
            )
            stored_step.save()

        return answer_link or None

    def parse_parliament_steps(self, response):
        """
        Parse the additional 'Parlamentarisches Verfahren' section.

        For every extracted phase the Phase is fetched or created, its steps
        are stored against the inquiry in response.meta['inquiry_item'], and
        the statements of each step are stored when exactly one Person
        matches the statement's source link. Statements with no match or
        with several matches are skipped with a warning.
        """
        inquiry_item = response.meta['inquiry_item']

        phases = INQUIRY.PHASES.xt(response)

        for phase in phases:
            # Create phase if we don't have it yet
            phase_item, created = Phase.objects.get_or_create(
                title=phase['title'])
            if created:
                log.msg(u"Created Phase {}".format(
                    green(u'[{}]'.format(phase_item.title))), level=log.DEBUG)

            # Create steps
            for step in phase['steps']:
                step_item, created = Step.objects.update_or_create(
                    title=step['title']['text'],
                    sortkey=step['sortkey'],
                    date=step['date'],
                    protocol_url=step['protocol_url'],
                    law=inquiry_item,
                    phase=phase_item,
                    source_link=response.url
                )
                step_item.save()
                if created:
                    log.msg(u"Created Step {}".format(
                        green(u'[{}]'.format(step_item.title))), level=log.DEBUG)

                # Save statements for this step, if applicable
                if 'statements' not in step['title']:
                    continue

                for stmnt in step['title']['statements']:
                    # Find the person; count once and reuse the number.
                    pq = Person.objects.filter(
                        source_link__endswith=stmnt['person_source_link'])
                    match_count = pq.count()

                    if match_count == 1:
                        person_item = pq.first()
                        st_data = {
                            'speech_type': stmnt['statement_type'],
                            'protocol_url': stmnt['protocol_link']
                        }
                        st_item, st_created = Statement.objects.update_or_create(
                            index=stmnt['index'],
                            person=person_item,
                            step=step_item,
                            defaults=st_data)
                        if st_created:
                            log.msg(u"Created Statement by {} on {}".format(
                                green(
                                    u'[{}]'.format(person_item.full_name)),
                                step_item.date), level=log.DEBUG)
                        else:
                            log.msg(u"Updated Statement by {} on {}".format(
                                green(
                                    u'[{}]'.format(person_item.full_name)),
                                step_item.date), level=log.DEBUG)
                    else:
                        # We can't save statements unless exactly one Person
                        # matches. No match reads "does not exist"; several
                        # matches read "does exist, but N persons matching
                        # found!".
                        self.logger.warning(
                            red(u"Skipping Statement by {}: Person with source_link {} does{} exist{}").format(
                                green(
                                    u'[{}]'.format(stmnt['person_name'])),
                                blue(
                                    u"[{}]".format(stmnt['person_source_link'])),
                                red(u"{}").format(
                                    u"" if match_count else u" not"),
                                u", but {} persons matching found!".format(
                                    match_count) if match_count > 1 else u""
                            ))

    def parse_inquiry_response(self, response):
        """
        Callback that parses the written response to an inquiry.

        Creates or updates the InquiryResponse, attaches its documents and
        category, links it to the inquiry found in
        response.meta['inquiry_item'] and sets that inquiry's status to
        'response_received'.

        Nothing is stored when the response sender cannot be resolved to a
        Person. When no inquiry is attached to the response (single-URL test
        runs), the parsed values are printed and nothing is stored.
        """
        # allow testing single urls for parsing errors
        inquiry_item = response.meta.get('inquiry_item', None)
        source_link = response.url
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
        LLP = inquiry_item.legislative_period if inquiry_item else None
        category = INQUIRY.CATEGORY.xt(response)

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))), level=log.DEBUG)

        try:
            sender_object = Person.objects.get(
                parl_id=INQUIRY.RESPONSESENDER.xt(response))
        except Exception:
            self.logger.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.RESPONSESENDER.xt(response), parl_id, LLP)))
            return

        if not inquiry_item:
            # Single-URL test run: dump what was parsed and stop here.
            print(locals())
            return

        # Create or update InquiryResponse item
        inquiryresponse_item, inquiryresponse_created = InquiryResponse.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'sender': sender_object
            }
        )

        # Attach foreign Keys
        inquiryresponse_item.documents = self.parse_response_docs(response)
        inquiryresponse_item.category = cat

        # Save InquiryResponse object
        inquiryresponse_item.save()

        if inquiryresponse_created:
            logtext = u"[{} of {}] Created InquiryResponse {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated InquiryResponse {} with ID {}, LLP {} @ {}"

        logtext = logtext.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(unicode(LLP)),
            blue(response.url)
        )
        log.msg(logtext, level=log.DEBUG if self.SCRAPED_COUNTER != 0 else log.INFO)

        # Link the response to its inquiry and record the new status.
        inquiry_item.response = inquiryresponse_item
        inquiry_item.status = 'response_received'
        inquiry_item.save()
Exemplo n.º 21
0
from ansicolor import cyan
from ansicolor import green
from ansicolor import red
from ansicolor import white

# Two plain foreground colours in one line.
print("Let's try two colors: %s and %s!" % (red("red"), green("green")))

# The same helpers accept keyword flags for text attributes.
print("It's also easy to produce text in %s," % red("bold", bold=True))
print("...%s," % green("reverse", reverse=True))
print("...and %s." % cyan("bold and reverse", bold=True, reverse=True))
Exemplo n.º 22
0
    def parse_list(self, response):
        """
        Parse one person list and create or update persons and mandates.

        The legislative period ('GP') and the chamber ('NRBR') are taken from
        the query string of response.url. For every listed person the Person
        row is created or updated, a mandate is created for each listed
        mandate that does not exist yet, and the person's latest mandate is
        refreshed when a mandate was added.

        Returns a list of detail-page requests, one for each person whose
        parl_id is not yet in self.persons_scraped.
        """
        persons = PERSON.LIST.xt(response)
        logger.info(u"parsing list: {}, {} persons".format(
            green(u'[{}]'.format(response.url)), len(persons)))

        callback_requests = []

        # which llp are we in?
        urloptions = response.url.split('?')[1]
        opts = dict(urlparse.parse_qsl(urloptions))

        llp_roman = opts['GP']
        llp_item = LegislativePeriod.objects.get(roman_numeral=llp_roman)

        # function string
        function = opts['NRBR']
        function_str = self.RSS_TO_FUNCTION[function]
        function_item, f_created = Function.objects.get_or_create(
            title=function_str)

        logger.info(
            u"Scraping {} persons for LLP {}, {}".format(len(persons), llp_roman, function))

        # Existing mandates are matched by chamber name, not exact function.
        if 'Nationalrat' in function_item.title:
            chamber = 'Nationalrat'
        else:
            chamber = 'Bundesrat'

        # Iterate all persons
        for p in persons:
            # Extract basic data
            parl_id = p['source_link'].split('/')[-2]
            p['source_link'] = "{}{}".format(BASE_HOST, p['source_link'])

            # Becomes True as soon as a mandate is added for this person.
            changed = False

            # Create or update simple person's item
            try:
                person_data = {
                    'reversed_name': p['reversed_name'],
                    'source_link': p['source_link']
                }
                person_item, created_person = Person.objects.update_or_create(
                    parl_id=parl_id,
                    defaults=person_data)
            except Exception as e:
                # Unicode template: names may contain non-ASCII characters.
                logger.warning(u"Error saving Person {}: {}".format(
                    green(u'[{}]'.format(p['reversed_name'])),
                    e
                ))
                continue
            if created_person:
                logger.debug(u"Created Person {}".format(
                    green(u'[{}]'.format(p['reversed_name']))))
            else:
                logger.debug(u"Updated Person {}".format(
                    green(u"[{}]".format(p['reversed_name']))
                ))

            for mandate in p['mandates']:
                party_item = self.get_party_item(mandate)
                state_item = self.get_state_item(p['electoral_state'])
                # Create the mandate unless the person already has one for
                # this chamber, legislative period and party.
                try:
                    already_stored = person_item.mandate_set.filter(
                        function__title__contains=chamber,
                        legislative_period=llp_item,
                        party=party_item
                    ).exists()
                    if not already_stored:
                        person_item.mandate_set.create(
                            function=function_item,
                            legislative_period=llp_item,
                            party=party_item,
                            state=state_item
                        )
                        changed = True
                except Exception:
                    # Log with traceback and carry on with the next mandate;
                    # an interactive debugger would hang unattended runs.
                    logger.warning(
                        red(u"Error saving Mandate {} ({}) / Person {}".format(function_item, party_item, person_item.pk)),
                        exc_info=True)

            if changed:
                # In case we added/modified a mandate now,
                latest_mandate_item = person_item.get_latest_mandate()
                person_item.latest_mandate = latest_mandate_item
                logger.debug(
                    cyan(u"Latest mandate for {} is now {}".format(person_item, latest_mandate_item)))
                person_item.save()

            # First time we encounter a person, we scan her detail page too
            if parl_id not in self.persons_scraped:
                # Create Detail Page request
                req = scrapy.Request(p['source_link'],
                                     callback=self.parse_person_detail)
                req.meta['person'] = {
                    'reversed_name': p['reversed_name'],
                    'source_link': p['source_link'],
                    'parl_id': parl_id
                }
                callback_requests.append(req)
                self.persons_scraped.append(parl_id)

        return callback_requests
Exemplo n.º 23
0
    def parse(self, response):
        """
        Parse one inquiry page and create or update its Inquiry record.

        The page is skipped (nothing is written) when its timestamp equals
        the stored one and self.IGNORE_TIMESTAMP is not set, or when a sender
        or the receiver cannot be resolved to a Person.

        Returns a list of follow-up requests (at most one, for the written
        response page), or None when the inquiry is skipped.
        """
        self.SCRAPED_COUNTER += 1

        source_link = response.url
        category = INQUIRY.CATEGORY.xt(response)
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.DESCRIPTION.xt(response)
        sender_objects = []
        callback_requests = []
        ts = GENERIC.TIMESTAMP.xt(response)

        # Inquiries from Bundesrat don't have an LLP => set None
        if "BR" in category:
            LLP = None
        else:
            LLP = LegislativePeriod.objects.get(
                roman_numeral=response.url.split('/')[-4])
        if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, LLP, response.url, ts):
            self.logger.debug(
                green(u"[{} of {}] Skipping Inquiry, no changes: {}".format(
                    self.SCRAPED_COUNTER,
                    self.TOTAL_COUNTER,
                    title)))
            return

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            self.logger.debug(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # An inquiry can have multiple senders, but only a single recipient.
        # Any lookup failure skips the inquiry; Exception (rather than a bare
        # except) keeps KeyboardInterrupt/SystemExit propagating.
        try:
            for sender_object in INQUIRY.SENDER.xt(response):
                sender_objects.append(Person.objects.get(
                    parl_id=sender_object))
        except Exception:
            self.logger.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.SENDER.xt(response), parl_id, LLP)))
            return
        try:
            receiver_object = Person.objects.get(
                parl_id=INQUIRY.RECEIVER.xt(response))
        except Exception:
            self.logger.warning(red(u'Receiver "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.RECEIVER.xt(response), parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'receiver': receiver_object,
                'ts': ts
            }
        )

        if inquiry_created:
            inquiry_item.status = 'offen'

        # Attach foreign keys
        inquiry_item.keywords = self.parse_keywords(response)
        inquiry_item.documents = self.parse_docs(response)
        inquiry_item.category = cat
        inquiry_item.sender = sender_objects

        # The step parsers read the inquiry from response.meta.
        response.meta['inquiry_item'] = inquiry_item

        # Dringliche / Urgent inquiries have a different structure for steps
        # and history. This case distinction accommodates these different
        # structures.
        if any("Dringliche" in '{}'.format(s) for s in inquiry_item.keywords.all()):
            if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
                self.parse_parliament_steps(response)
        else:
            response_link = self.parse_steps(response)
            if response_link:
                post_req = scrapy.Request("{}{}".format(BASE_HOST, response_link),
                                          callback=self.parse_inquiry_response,
                                          dont_filter=True)
                post_req.meta['inquiry_item'] = inquiry_item

                callback_requests.append(post_req)

        # Save Inquiry item and log to terminal if created or updated.
        inquiry_item.save()

        if inquiry_created:
            logtext = u"[{} of {}] Created Inquiry {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated Inquiry {} with ID {}, LLP {} @ {}"

        # Exactly one argument per placeholder.
        logtext = logtext.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(unicode(LLP)),
            blue(response.url)
        )
        # Every 1000th item is logged at INFO, all others at DEBUG.
        log.msg(logtext, level=log.INFO if self.SCRAPED_COUNTER % 1000 == 0 else log.DEBUG)

        return callback_requests
Exemplo n.º 24
0
    def parse(self, response):
        """
        Parse one person list and create or update persons and mandates.

        The legislative period ('GP') and the chamber ('NRBR') are taken from
        the query string of response.url. For every listed person the Person
        row is created or updated, each listed mandate is attached to the
        person if it is not attached yet, and the person's latest mandate is
        refreshed when a mandate was attached.

        Returns a list of detail-page requests, one for each person whose
        parl_id is not yet in self.persons_scraped.
        """
        persons = PERSON.LIST.xt(response)

        callback_requests = []

        # which llp are we in?
        urloptions = response.url.split("?")[1]

        llp_roman = [opt.split("=")[1] for opt in urloptions.split("&") if opt.split("=")[0] == "GP"]
        llp_item = LegislativePeriod.objects.get(roman_numeral=llp_roman[0])

        # function string
        function = [opt.split("=")[1] for opt in urloptions.split("&") if opt.split("=")[0] == "NRBR"]
        function_str = self.RSS_TO_FUNCTION[function[0]]
        function_item, f_created = Function.objects.get_or_create(title=function_str)

        self.logger.info(u"Scraping {} persons for LLP {}".format(len(persons), llp_roman))

        # Iterate all persons
        for p in persons:
            # Extract basic data
            parl_id = p["source_link"].split("/")[-2]
            p["source_link"] = "{}{}".format(BASE_HOST, p["source_link"])

            # Becomes True as soon as a mandate is attached to this person.
            changed = False
            # Create or update simple person's item
            person_item, created_person = Person.objects.update_or_create(
                source_link=p["source_link"], parl_id=parl_id, reversed_name=p["reversed_name"]
            )
            if created_person:
                self.logger.info(u"Created Person {}".format(green(u"[{}]".format(p["reversed_name"]))))
            else:
                self.logger.info(u"Updated Person {}".format(green(u"[{}]".format(p["reversed_name"]))))

            for mandate in p["mandates"]:
                party_item = self.get_party_item(mandate)
                state_item = self.get_state_item(p["electoral_state"])
                # Create and append mandate
                try:
                    mandate_item, m_created = Mandate.objects.update_or_create(
                        function=function_item, legislative_period=llp_item, party=party_item, state=state_item
                    )
                except Exception:
                    # Log with traceback and skip this mandate. Falling
                    # through would use an undefined mandate_item, or the one
                    # left over from the previous iteration.
                    self.logger.info(
                        red(u"Error saving Mandate {} ({})".format(function_item, party_item)),
                        exc_info=True)
                    continue
                if mandate_item not in person_item.mandates.all():
                    changed = True
                    person_item.mandates.add(mandate_item)

            if changed:
                # In case we added/modified a mandate now,
                latest_mandate_item = person_item.get_latest_mandate()
                person_item.latest_mandate = latest_mandate_item
                self.logger.info(cyan(u"Latest mandate for {} is now {}".format(person_item, latest_mandate_item)))
                person_item.save()

            # First time we encounter a person, we scan her detail page too
            if parl_id not in self.persons_scraped:
                # Create Detail Page request
                req = scrapy.Request(p["source_link"], callback=self.parse_person_detail)
                req.meta["person"] = {
                    "reversed_name": p["reversed_name"],
                    "source_link": p["source_link"],
                    "parl_id": parl_id,
                }
                callback_requests.append(req)
                self.persons_scraped.append(parl_id)
        return callback_requests
Exemplo n.º 25
0
    def parse(self, response):
        """
        Parse one law page and create or update its Law record.

        The page is skipped when its timestamp equals the stored one and
        self.IGNORE_TIMESTAMP is not set. Otherwise the Law is stored with
        its category, keywords and documents, and the parliamentary and
        pre-parliamentary step sections are parsed when the page has them.
        """
        self.SCRAPED_COUNTER += 1

        # The legislative period is the fourth-to-last URL path segment.
        period_numeral = response.url.split('/')[-4]
        LLP = LegislativePeriod.objects.get(roman_numeral=period_numeral)

        # Extract fields
        ts = GENERIC.TIMESTAMP.xt(response)
        title = LAW.TITLE.xt(response)
        parl_id = LAW.PARL_ID.xt(response)
        status = LAW.STATUS.xt(response)

        if not (self.IGNORE_TIMESTAMP
                or self.has_changes(parl_id, LLP, response.url, ts)):
            skip_text = u"[{} of {}] Skipping Law, no changes: {}".format(
                self.SCRAPED_COUNTER,
                self.TOTAL_COUNTER,
                title)
            self.logger.info(green(skip_text))
            return

        # Extract foreign keys
        category = LAW.CATEGORY.xt(response)
        description = LAW.DESCRIPTION.xt(response)

        # Create category if we don't have it yet
        cat, cat_is_new = Category.objects.get_or_create(title=category)
        if cat_is_new:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # Create and save Law
        law_item, law_created = Law.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'status': status,
                'description': description,
                'ts': ts,
                'source_link': response.url,
            })

        # Attach foreign keys
        law_item.keywords = self.parse_keywords(response)
        law_item.category = cat
        law_item.documents = self.parse_docs(response)

        law_item.save()

        # Log our progress
        template = (u"[{} of {}] Created {} with id {}, LLP {} @ {}"
                    if law_created else
                    u"[{} of {}] Updated {} with id {}, LLP {} @ {}")
        progress_text = template.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            red(title),
            cyan(u"[{}]".format(parl_id)),
            green(unicode(LLP)),
            blue(response.url)
        )
        log.msg(progress_text, level=log.INFO)

        # The step parsers read the law from response.meta.
        response.meta['law_item'] = law_item

        # Parse each optional tab only if the page actually has it.
        if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
            self.parse_parliament_steps(response)

        if response.xpath('//h2[@id="tab-VorparlamentarischesVerfahren"]'):
            self.parse_pre_parliament_steps(response)
Exemplo n.º 26
0
    def parse(self, response):
        """
        Callback function for parsing a single law page

        Increments the scraped counter, looks up the legislative period
        from the fourth-to-last segment of the response URL and creates or
        updates the Law identified by parl_id and that period. Unless
        IGNORE_TIMESTAMP is set, the page is skipped when has_changes()
        returns a falsy value. The saved law is stored in
        response.meta['law_item'] before the step parsers are run for
        whichever procedure tabs are present on the page.

        Nothing is returned on any path.
        """
        self.SCRAPED_COUNTER += 1

        period_numeral = response.url.split('/')[-4]
        llp = LegislativePeriod.objects.get(roman_numeral=period_numeral)

        # Scalar fields pulled straight from the page
        timestamp = GENERIC.TIMESTAMP.xt(response)
        law_title = LAW.TITLE.xt(response)
        law_parl_id = LAW.PARL_ID.xt(response)
        law_status = LAW.STATUS.xt(response)

        # Bail out early when timestamps are honoured and nothing changed;
        # has_changes() is only consulted if IGNORE_TIMESTAMP is falsy.
        if not (self.IGNORE_TIMESTAMP or self.has_changes(
                law_parl_id, llp, response.url, timestamp)):
            skip_text = u"[{} of {}] Skipping Law, no changes: {}".format(
                self.SCRAPED_COUNTER, self.TOTAL_COUNTER, law_title)
            self.logger.info(green(skip_text))
            return

        # Values that end up on related objects / the defaults dict
        category_title = LAW.CATEGORY.xt(response)
        law_description = LAW.DESCRIPTION.xt(response)

        # Make sure the category exists; announce it when newly created
        category_obj, category_is_new = Category.objects.get_or_create(
            title=category_title)
        if category_is_new:
            category_tag = green(u'[{}]'.format(category_title))
            log.msg(u"Created category {}".format(category_tag))

        # Create the Law or refresh its plain fields
        law_item, was_created = Law.objects.update_or_create(
            parl_id=law_parl_id,
            legislative_period=llp,
            defaults={
                'title': law_title,
                'status': law_status,
                'description': law_description,
                'ts': timestamp,
                'source_link': response.url,
            })

        # Relations are attached after the row exists, then persisted
        law_item.keywords = self.parse_keywords(response)
        law_item.category = category_obj
        law_item.documents = self.parse_docs(response)
        law_item.save()

        # Progress report
        template = (
            u"[{} of {}] Created {} with id {}, LLP {} @ {}"
            if was_created else
            u"[{} of {}] Updated {} with id {}, LLP {} @ {}")
        progress = template.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            red(law_title),
            cyan(u"[{}]".format(law_parl_id)),
            green(unicode(llp)),
            blue(response.url),
        )
        log.msg(progress, level=log.INFO)

        # The step parsers receive only the response, so hand the law over
        # through its meta dict.
        response.meta['law_item'] = law_item

        # Tab 'Parlamentarisches Verfahren' present?
        parliament_tab = response.xpath(
            '//h2[@id="tab-ParlamentarischesVerfahren"]')
        if parliament_tab:
            self.parse_parliament_steps(response)

        # Tab 'Vorparlamentarisches Verfahren' present?
        pre_parliament_tab = response.xpath(
            '//h2[@id="tab-VorparlamentarischesVerfahren"]')
        if pre_parliament_tab:
            self.parse_pre_parliament_steps(response)