Example #1
    def __init__(self):
        # login and start session with pmg website
        logger.info("LOGGING IN")
        self.session = requests.Session()
        headers = {'user-agent': 'Mozilla/4.0 (compatible; MSIE 6.0)'}
        try:
            data = {
                'name': config['name'],
                'pass': config['pass'],
                'form_id': 'user_login',
                'form_build_id': 'form-ee72095493d7ed912673b8a83219772c',
                'op': 'Log in'
            }
            r = self.session.post('http://www.pmg.org.za/user/login', headers=headers, data=data)

            if not "Welcome back." in r.content:
                logger.error("Login was not successful")
                raise Exception

        except Exception as e:
            import traceback
            traceback.print_exc()
            logger.error("Configuration Error:")
            logger.error("Please ensure that a file called 'scraper_config.json' exists in the scraper directory, and that it contains" \
                  "valid 'username' and 'password' parameters for logging in to the PMG website. This is needed for accessing " \
                  "much of the content.")
            raise e
        self.stats = {}
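The login above reads its credentials from a config dict, and the error message points at a 'scraper_config.json' file in the scraper directory. A minimal sketch of how that file might be loaded, assuming it simply maps the 'name' and 'pass' keys used in the POST data (the exact schema is an assumption):

import json

# Sketch only: the real project may load its configuration differently.
with open("scraper_config.json") as f:
    config = json.load(f)
# expected shape, inferred from the login data above:
# {"name": "<pmg username>", "pass": "<pmg password>"}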
Example #2
    def run_scraper(self):

        committees = Agent.query.filter(Agent.type == "committee").filter(Agent.url != None).all()
        shuffle(committees)  # randomize the order, just to keep things interesting
        for i, committee in enumerate(committees):
            self.current_committee = committee
            self.current_url = committee.url
            try:
                self.current_page = scrapertools.URLFetcher(self.current_url, self.session).html
                logger.debug("Committee: " + str(committee.name))

                self.scrape_committee()
                # give some progress feedback
                logger.info(str(i + 1) + " out of " + str(len(committees)) + " committees' reports have been scraped.")
                logger.info(json.dumps(self.stats, indent=4))

                # commit entries to database, once per committee
                logger.debug("SAVING TO DATABASE")
                db.session.commit()
            except Exception as e:
                msg = "Error scraping committee's reports."
                self.stats["errors"].append(msg)
                logger.error(msg)
                logger.exception(str(e))
        return
Example #3
    def run_scraper(self):
        """
        Iterate through bill pages, and run the state machine for each page.
        """
        pager = Pager()

        # iterate through bill pages
        for url in pager.next_page:
            logger.info(url)

            # initiate parser for this page
            self.state_fn = self.start_state
            html = scrapertools.URLFetcher(url, self.session).html
            soup = BeautifulSoup(html,
                                 convertEntities=BeautifulSoup.HTML_ENTITIES)
            table = soup.find("tbody")
            rows = table.findAll("tr")

            # feed rows into state machine
            for row in rows:
                while not self.state_fn(row):
                    pass
            # commit to database after each page
            # db.session.commit()
        return
Example #4
    def run_scraper(self):

        committees = Agent.query.filter(Agent.type == "committee").filter(
            Agent.url != None).all()
        shuffle(
            committees)  # randomize the order, just to keep things interesting
        for i, committee in enumerate(committees):
            self.current_committee = committee
            self.current_url = committee.url
            try:
                self.current_page = scrapertools.URLFetcher(
                    self.current_url, self.session).html
                logger.debug("Committee: " + str(committee.name))

                self.scrape_committee()
                # give some progress feedback
                logger.info(
                    str(i + 1) + " out of " + str(len(committees)) +
                    " committees' reports have been scraped.")
                logger.info(json.dumps(self.stats, indent=4))

                # commit entries to database, once per committee
                logger.debug("SAVING TO DATABASE")
                db.session.commit()
            except Exception as e:
                msg = "Error scraping committee's reports."
                self.stats["errors"].append(msg)
                logger.error(msg)
                logger.exception(str(e))
        return
Example #5
    def scrape_bills(self):

        logger.info("\n ----------- SCRAPING BILLS ---------------")

        bill_scraper = bills.BillScraper(self.session)
        bill_scraper.run_scraper()
        logger.info(json.dumps(bill_scraper.stats, indent=4))
        return
Example #6
    def scrape_committees(self):

        logger.info("\n ----------- SCRAPING COMMITTEES ---------------")

        committee_scraper = committees.CommitteeScraper(self.session)
        committee_scraper.run_scraper()
        logger.info(json.dumps(committee_scraper.stats, indent=4))
        return
Example #7
    def scrape_hansards(self):

        logger.info("\n ----------- SCRAPING HANSARDS ---------------")

        hansard_scraper = hansards.HansardScraper(self.session)
        hansard_scraper.run_scraper()
        logger.info(json.dumps(hansard_scraper.stats, indent=4))
        return
Example #8
    def scrape_committee_reports(self):

        logger.info("\n ----------- SCRAPING COMMITTEE REPORTS ---------------")

        report_scraper = committee_reports.ReportScraper(self.session)
        report_scraper.run_scraper()
        logger.info(json.dumps(report_scraper.stats, indent=4))
        return
Example #9
def find_enacted_bills():
    """
    Set status of bills that have already been enacted.
    """

    for bill in Bill.query.all():
        for entry in bill.entries:
            if entry.type == "act":
                bill.status = "enacted"
                db.session.add(bill)
                logger.info("enacted: " + bill.name)
                break
    db.session.commit()
    return
Example #10
def find_current_bills():
    """
    Update status of most recent set of bills from http://pmg.org.za/billsstatus/proceedings, via
    the csv at /data/current_status.csv
    """

    data = []
    with open("../data/current_status.csv", 'Ur') as f:
        reader = csv.reader(f)
        headers = next(reader)  # read past the column-title row

        for i, row in enumerate(reader):

            entry = row

            # fix bill types
            if entry[0].startswith("PM"):
                entry[0] = "PMB" + entry[0][2::]
            elif not entry[0].startswith("B"):
                entry[0] = "B" + entry[0]
            tmp_code = entry[0]
            tmp_status = entry[1].lower()

            # clean bill code
            tmp = analyze_bill_code(tmp_code)
            code = tmp["code"]

            logger.info(code + " " + str(entry))

            bill = Bill.query.filter(Bill.code == code).first()
            if bill is None:
                logger.error("Error finding bill " + code)
                continue
            available_status = {
                "act": "enacted",
                "": None,
                "pc": "na",
                "sc": "ncop",
                "intro": "na",
            }

            if available_status.get(tmp_status):
                tmp_status = available_status[tmp_status]
            bill.status = tmp_status
            db.session.add(bill)
    db.session.commit()
    return
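For reference, a sketch of the row layout that find_current_bills appears to expect in ../data/current_status.csv; the column meanings are assumptions inferred from how entry[0] and entry[1] are used above:

# current_status.csv (assumed layout)
# bill_code,status
# B1-2013,act      -> status stored as "enacted"
# PM2-2013,sc      -> code rewritten to "PMB2-2013", status stored as "ncop"
# 3-2013,intro     -> code rewritten to "B3-2013", status stored as "na"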
Example #11
    def add_or_update(self):
        """
        Add current_report to database, or update the record if it already exists.
        """

        report = Entry.query.filter_by(agent_id=self.current_committee.agent_id) \
            .filter_by(url=self.current_report['url'])\
            .filter_by(is_deleted=False).first()
        if report is None:
            report = Entry()
            self.stats["new_committee_reports"] += 1

        tmp_bills = None
        if self.current_report.get('bills'):
            tmp_bills = self.current_report['bills']
            logger.info(str(tmp_bills))
        report = scrapertools.populate_entry(report, self.current_report, tmp_bills)
        db.session.add(report)
        self.stats["total_committee_reports"] += 1
        self.current_report = {}
        return
Example #12
    def add_or_update(self):
        """
        Add current_report to database, or update the record if it already exists.
        """

        report = Entry.query.filter_by(agent_id=self.current_committee.agent_id) \
            .filter_by(url=self.current_report['url'])\
            .filter_by(is_deleted=False).first()
        if report is None:
            report = Entry()
            self.stats["new_committee_reports"] += 1

        tmp_bills = None
        if self.current_report.get('bills'):
            tmp_bills = self.current_report['bills']
            logger.info(str(tmp_bills))
        report = scrapertools.populate_entry(report, self.current_report,
                                             tmp_bills)
        db.session.add(report)
        self.stats["total_committee_reports"] += 1
        self.current_report = {}
        return
Example #13
    def run_scraper(self):
        """
        Iterate through bill pages, and run the state machine for each page.
        """
        pager = Pager()

        # iterate through bill pages
        for url in pager.next_page:
            logger.info(url)

            # initiate parser for this page
            self.state_fn = self.start_state
            html = scrapertools.URLFetcher(url, self.session).html
            soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
            table = soup.find("tbody")
            rows = table.findAll("tr")

            # feed rows into state machine
            for row in rows:
                while not self.state_fn(row):
                    pass
            # commit to database after each page
            # db.session.commit()
        return
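The run_scraper above drives a small state machine: each state function receives one table row, may switch self.state_fn to another state, and returns True once the row has been handled (returning False re-dispatches the same row to the newly selected state). A minimal sketch of that contract; start_state matches the name used above, while detail_state is a hypothetical second state:

class BillPageStates(object):
    """Sketch only: illustrates the state-function contract run_scraper relies on."""

    def start_state(self, row):
        # a header row marks the start of a new bill section
        if row.find("th") is not None:
            self.state_fn = self.detail_state  # hypothetical next state
            return True                        # row consumed
        # otherwise hand this same row to detail_state on the next pass
        self.state_fn = self.detail_state
        return False

    def detail_state(self, row):
        # parse the data row here, then look for the next header row
        self.state_fn = self.start_state
        return True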
Example #14
    def run(self, rebuild_db=False, set_status=False):

        start_time = datetime.datetime.now()
        logger.info("Started at " + str(start_time))

        # start with a clean db if needed
        if rebuild_db:
            self.rebuild_db()

        # scrape content, and add to db
        self.scrape_bills()
        # self.scrape_hansards()
        self.scrape_committees()
        self.scrape_committee_reports()

        # update historic bill status data
        if set_status:
            bill_status.find_current_bills()
            bill_status.find_enacted_bills()
            bill_status.handle_assent()

        logger.info("Finished scraping at " + str(datetime.datetime.now()))
        logger.info("Duration: " + str(datetime.datetime.now() - start_time))
        return
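A usage sketch of the run entry point above; PMGScraper is a hypothetical name, since the enclosing class is not shown in these excerpts:

scraper = PMGScraper()  # hypothetical class name
scraper.run(rebuild_db=False, set_status=True)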
Example #15
            agent.location = self.current_committee['location']
            self.stats["new_committees"] += 1
        agent.url = self.current_committee['url']
        db.session.add(agent)
        self.stats["total_committees"] += 1
        self.current_committee = {}
        return

    @property
    def next_committee(self):
        html = scrapertools.URLFetcher("http://www.pmg.org.za/committees", self.session).html
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        container = soup.find(id="committees-all")
        committee_lists = container.findAll("div", {"class": "item-list"})
        for committee_list in committee_lists:
            list_name = committee_list.find('h3').contents[0]
            logger.debug("\n" + list_name + ":")
            committees = committee_list.findAll('li')
            for committee in committees:
                href = "http://www.pmg.org.za" + committee.find('a').attrs[0][1]
                name = committee.find('a').contents[0]
                logger.debug("\t" + name)
                yield list_name, href, name


if __name__ == "__main__":

    committee_scraper = CommitteeScraper()
    committee_scraper.run_scraper()
    logger.info(json.dumps(committee_scraper.stats, indent=4))
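The next_committee property above is a generator that yields (list_name, href, name) tuples scraped from the committees index page. A small usage sketch:

for list_name, href, name in CommitteeScraper().next_committee:
    print(list_name + " | " + name + " | " + href)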
Example #16
                break
        return

    def add_or_update(self):
        """
        Add current_hansard to database, or update the record if it already exists.
        """

        self.current_hansard['entry_type'] = "hansard"
        bills = []
        if self.current_hansard.get('bills'):
            bills = self.current_hansard["bills"]
            # TODO: improve filtering
        hansard = Entry.query.filter(Entry.type == "hansard").filter(
            Entry.title == self.current_hansard['title']).first()
        if hansard is None:
            hansard = Entry()
            self.stats["new_hansards"] += 1
        hansard = scrapertools.populate_entry(hansard, self.current_hansard,
                                              bills)
        db.session.add(hansard)
        self.stats["total_hansards"] += 1
        return


if __name__ == "__main__":

    hansard_scraper = HansardScraper()
    hansard_scraper.run_scraper()
    logger.info(json.dumps(hansard_scraper.stats, indent=4))
Example #17
                break
        return

    def add_or_update(self):
        """
        Add current_hansard to database, or update the record if it already exists.
        """

        self.current_hansard['entry_type'] = "hansard"
        bills = []
        if self.current_hansard.get('bills'):
            bills = self.current_hansard["bills"]
            # TODO: improve filtering
        hansard = Entry.query.filter(Entry.type == "hansard").filter(
            Entry.title == self.current_hansard['title']).first()
        if hansard is None:
            hansard = Entry()
            self.stats["new_hansards"] += 1
        hansard = scrapertools.populate_entry(hansard, self.current_hansard, bills)
        db.session.add(hansard)
        self.stats["total_hansards"] += 1
        return


if __name__ == "__main__":

    hansard_scraper = HansardScraper()
    hansard_scraper.run_scraper()
    logger.info(json.dumps(hansard_scraper.stats, indent=4))


Example #18
    def scrape_committee(self):
        """
        Scrape all meeting reports for a particular committee.
        """

        for (j, (date, title, href_report)) in enumerate(self.next_report):
            logger.debug("\t\t" + str(date) + " - " +
                         (title[0:45]) if len(title) > 45 else title)
            tmp_url = href_report
            html = scrapertools.URLFetcher(tmp_url, self.session).html
            soup = BeautifulSoup(html)
            content = soup.find(id="content")
            bills = scrapertools.find_bills(str(content))
            # only save report entries that can be tagged to bills
            if bills:
                self.current_report = {
                    "entry_type": "committee-meeting",
                    "bills": bills,
                    "url": tmp_url,
                    "date": date,
                    "title": title,
                    "agent": self.current_committee,
                }

                # report URL may have changed after editing on pmg.org.za, check for this
                possible_duplicates = Entry.query.filter(Entry.agent == self.current_committee)\
                    .filter(Entry.url != None)\
                    .filter(Entry.url != tmp_url)\
                    .filter(Entry.type == "committee-meeting")\
                    .filter(Entry.is_deleted == False)\
                    .filter(Entry.date == date)\
                    .order_by(Entry.entry_id).all()
                deletion_flag = False
                if possible_duplicates:
                    logger.debug(
                        str(len(possible_duplicates)) +
                        " possible duplicates found")
                    for possible_duplicate in possible_duplicates:
                        redirect_url = scrapertools.URLFetcher(
                            possible_duplicate.url,
                            self.session).follow_redirect()
                        if possible_duplicate.url != redirect_url:
                            logger.debug('redirect encountered')
                            if redirect_url == tmp_url:
                                logger.info("Updating entry URL")
                                # update the existing record's URL
                                possible_duplicate.url = tmp_url
                                # # delete all but one entry, if there are multiple duplicates
                                # if deletion_flag:
                                #     logger.info('duplicate entry deleted')
                                #     possible_duplicate.is_deleted = True
                                db.session.add(possible_duplicate)
                                db.session.commit()
                                deletion_flag = True

                if self.current_committee.location:
                    self.current_report[
                        "location"] = self.current_committee.location
                try:
                    self.add_or_update()
                except Exception as e:
                    msg = "Could not add committee report to database: "
                    if self.current_report.get("title"):
                        msg += self.current_report["title"]
                    self.stats["errors"].append(msg)
                    logger.error(msg)
                    logger.exception(str(e))
                self.current_report = {}
            else:
                logger.debug('no bills found in committee meeting report')
Example #19
    def add_or_update(self):
        """
        Add current_report to database, or update the record if it already exists.
        """

        report = Entry.query.filter_by(agent_id=self.current_committee.agent_id) \
            .filter_by(url=self.current_report['url'])\
            .filter_by(is_deleted=False).first()
        if report is None:
            report = Entry()
            self.stats["new_committee_reports"] += 1

        tmp_bills = None
        if self.current_report.get('bills'):
            tmp_bills = self.current_report['bills']
            logger.info(str(tmp_bills))
        report = scrapertools.populate_entry(report, self.current_report,
                                             tmp_bills)
        db.session.add(report)
        self.stats["total_committee_reports"] += 1
        self.current_report = {}
        return


if __name__ == "__main__":

    report_scraper = ReportScraper()
    report_scraper.run_scraper()
    logger.info(json.dumps(report_scraper.stats, indent=4))
Example #20
    def scrape_committee(self):
        """
        Scrape all meeting reports for a particular committee.
        """

        for (j, (date, title, href_report)) in enumerate(self.next_report):
            logger.debug("\t\t" + str(date) + " - " + (title[0:45]) if len(title) > 45 else title)
            tmp_url = href_report
            html = scrapertools.URLFetcher(tmp_url, self.session).html
            soup = BeautifulSoup(html)
            content = soup.find(id="content")
            bills = scrapertools.find_bills(str(content))
            # only save report entries that can be tagged to bills
            if bills:
                self.current_report = {
                    "entry_type": "committee-meeting",
                    "bills": bills,
                    "url": tmp_url,
                    "date": date,
                    "title": title,
                    "agent": self.current_committee,
                    }

                # report URL may have changed after editing on pmg.org.za, check for this
                possible_duplicates = Entry.query.filter(Entry.agent == self.current_committee)\
                    .filter(Entry.url != None)\
                    .filter(Entry.url != tmp_url)\
                    .filter(Entry.type == "committee-meeting")\
                    .filter(Entry.is_deleted == False)\
                    .filter(Entry.date == date)\
                    .order_by(Entry.entry_id).all()
                deletion_flag = False
                if possible_duplicates:
                    logger.debug(str(len(possible_duplicates)) + " possible duplicates found")
                    for possible_duplicate in possible_duplicates:
                        redirect_url = scrapertools.URLFetcher(possible_duplicate.url, self.session).follow_redirect()
                        if possible_duplicate.url != redirect_url:
                            logger.debug('redirect encountered')
                            if redirect_url == tmp_url:
                                logger.info("Updating entry URL")
                                # update the existing record's URL
                                possible_duplicate.url = tmp_url
                                # # delete all but one entry, if there are multiple duplicates
                                # if deletion_flag:
                                #     logger.info('duplicate entry deleted')
                                #     possible_duplicate.is_deleted = True
                                db.session.add(possible_duplicate)
                                db.session.commit()
                                deletion_flag = True

                if self.current_committee.location:
                    self.current_report["location"] = self.current_committee.location
                try:
                    self.add_or_update()
                except Exception as e:
                    msg = "Could not add committee report to database: "
                    if self.current_report.get("title"):
                        msg += self.current_report["title"]
                    self.stats["errors"].append(msg)
                    logger.error(msg)
                    logger.exception(str(e))
                self.current_report = {}
            else:
                logger.debug('no bills found in committee meeting report')
Example #21
                raise

            # set entry_type appropriately if this bill has already been enacted
            if "as enacted" in link.text:
                version['entry_type'] = "act"
            versions.append(version)
            self.state_fn = self.version_state
            return True
        else:
            self.state_fn = self.header_state
            return False


class Pager(object):
    """
    Return an iterable containing URLs to each of the available bills pages.
    """
    @property
    def next_page(self):
        current_year = datetime.today().year
        for current_year in range(current_year, 2005, -1):
            url = "http://www.pmg.org.za/print/bill?year=%d" % current_year
            yield url


if __name__ == "__main__":

    bill_scraper = BillScraper()
    bill_scraper.run_scraper()
    logger.info(json.dumps(bill_scraper.stats, indent=4))
Example #22
            # set entry_type appropriately if this bill has already been enacted
            if "as enacted" in link.text:
                version['entry_type'] = "act"
            versions.append(version)
            self.state_fn = self.version_state
            return True
        else:
            self.state_fn = self.header_state
            return False


class Pager(object):
    """
    Return an iterable containing URLs to each of the available bills pages.
    """

    @property
    def next_page(self):
        current_year = datetime.today().year
        for current_year in range(current_year, 2005, -1):
            url = "http://www.pmg.org.za/print/bill?year=%d" % current_year
            yield url


if __name__ == "__main__":

    bill_scraper = BillScraper()
    bill_scraper.run_scraper()
    logger.info(json.dumps(bill_scraper.stats, indent=4))
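Pager.next_page simply walks the yearly bill listing pages backwards from the current year down to 2006. A small usage sketch:

pager = Pager()
for url in pager.next_page:
    print(url)  # http://www.pmg.org.za/print/bill?year=<current year> ... ?year=2006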
Example #23
def handle_assent():
    """
    Add entries relating to a bill's assent from http://pmg.org.za/billsstatus/proceedings, via
    the csv at /data/bill_assent_dates.csv
    """

    with open("../data/bill_assent_dates.csv", 'Ur') as f:
        data = list(list(rec) for rec in csv.reader(f, delimiter=','))

    president = Agent.query.filter(Agent.name == "The President").first()

    for i in range(len(data)):

        # ignore column title row
        if i == 0:
            continue

        entry = data[i]

        # fix bill types
        if entry[0].startswith("PM"):
            entry[0] = "PMB" + entry[0][2::]
        elif not entry[0].startswith("B"):
            entry[0] = "B" + entry[0]
        tmp_code = entry[0]

        # clean bill code
        tmp = analyze_bill_code(tmp_code)
        if tmp:
            code = tmp["code"]
        else:
            logger.error("Error analyzing bill code " + tmp_code)
            continue

        logger.info(code + " " + str(entry))

        bill = Bill.query.filter(Bill.code == code).first()
        if bill is None:
            logger.error("Error finding bill " + code)
            continue

        try:
            act_no = unicode(entry[1])
            assent_date = unicode(entry[2])
            # convert date to python date object
            try:
                assent_date = date_parser.parse(assent_date).date()
            except Exception:
                logger.error("Error parsing date " + entry[2])
                continue
            gazette = None
            if entry[3] and len(entry[3]) > 2:
                gazette = unicode(entry[3])
        except UnicodeDecodeError:
            logger.error("Unicode error: " + str(entry))
            continue

        # update bill record
        bill.status = "enacted"
        if gazette:
            bill.gazette = gazette
        db.session.add(bill)

        # add relevant entry in bill history
        tmp_entry = Entry.query.join(Entry.bills).filter(Bill.code == code)\
            .filter(Entry.type == "assent").first()
        if not tmp_entry:
            tmp_entry = Entry()
            tmp_entry.bills.append(bill)
        tmp_entry.date = assent_date
        tmp_entry.type = "assent"
        tmp_entry.location = 3
        tmp_entry.title = "Signed into law by the President."
        tmp_entry.agent = president
        if act_no and gazette:
            tmp_entry.description = "Enacted as Act " + act_no + ". Refer to Government Gazette " + gazette + "."
        db.session.add(tmp_entry)
    db.session.commit()
    return
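A sketch of the row layout handle_assent appears to expect in ../data/bill_assent_dates.csv; the column meanings are assumptions inferred from how entry[0] through entry[3] are used above:

# bill_assent_dates.csv (assumed layout; the first row holds column titles and is skipped)
# bill_code,act_no,assent_date,gazette
# B1-2013,22 of 2013,18 July 2013,36752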
Example #24
        return

    def add_or_update(self):
        """
        Add current_report to database, or update the record if it already exists.
        """

        report = Entry.query.filter_by(agent_id=self.current_committee.agent_id) \
            .filter_by(url=self.current_report['url'])\
            .filter_by(is_deleted=False).first()
        if report is None:
            report = Entry()
            self.stats["new_committee_reports"] += 1

        tmp_bills = None
        if self.current_report.get('bills'):
            tmp_bills = self.current_report['bills']
            logger.info(str(tmp_bills))
        report = scrapertools.populate_entry(report, self.current_report, tmp_bills)
        db.session.add(report)
        self.stats["total_committee_reports"] += 1
        self.current_report = {}
        return


if __name__ == "__main__":

    report_scraper = ReportScraper()
    report_scraper.run_scraper()
    logger.info(json.dumps(report_scraper.stats, indent=4))