def committee_summaries(year=2013):
    """Scrape committee summary PDFs for *year* and upsert them.

    Records are keyed on ``pdf_url`` so re-running for the same year
    updates existing rows instead of duplicating them.

    :param year: parliamentary year to fetch summaries for.
    """
    from mptracker.scraper.committee_summaries import SummaryScraper

    patcher = TablePatcher(models.CommitteeSummary,
                           models.db.session,
                           key_columns=['pdf_url'])
    summary_scraper = SummaryScraper(get_cached_session(),
                                     get_cached_session('question-pdf'))
    records = summary_scraper.fetch_summaries(year, get_pdf_text=True)
    patcher.update(records)
    # Bug fix: without an explicit commit the patched rows were lost when
    # the session was discarded (the sibling copy of this function in this
    # file does commit).
    models.db.session.commit()
def committee_summaries(year=2013):
    """Fetch committee summaries for *year* and persist them.

    Rows are matched on ``pdf_url``, so repeated runs update in place.
    """
    from mptracker.scraper.committee_summaries import SummaryScraper

    table_patcher = TablePatcher(
        models.CommitteeSummary,
        models.db.session,
        key_columns=['pdf_url'],
    )
    scraper = SummaryScraper(
        get_cached_session(),
        get_cached_session('question-pdf'),
    )
    table_patcher.update(scraper.fetch_summaries(year, get_pdf_text=True))
    models.db.session.commit()
def import_steno_day(day):
    """Scrape the parliamentary stenogram for *day* and store it.

    Creates one ``StenoChapter`` per chapter and one ``StenoParagraph``
    per spoken paragraph, resolving each speaker through
    ``models.PersonMatcher``; commits at the end.
    """
    from mptracker.scraper.common import get_cached_session
    from mptracker.scraper.steno import StenogramScraper

    db_session = models.db.session
    matcher = models.PersonMatcher()
    scraper = StenogramScraper(get_cached_session())
    steno_day = scraper.fetch_day(day)

    paragraph_count = 0
    for chapter in steno_day.chapters:
        chapter_row = models.StenoChapter(
            date=steno_day.date,
            headline=chapter.headline,
            serial=chapter.serial,
        )
        db_session.add(chapter_row)
        for para in chapter.paragraphs:
            speaker = matcher.get_person(para['speaker_name'],
                                         para['speaker_cdep_id'])
            db_session.add(models.StenoParagraph(
                text=para['text'],
                chapter=chapter_row,
                person=speaker,
                serial=para['serial'],
            ))
            paragraph_count += 1

    print('added', paragraph_count, 'stenogram paragraphs')
    db_session.commit()
def get_people(year=2012):
    """Yield scraped person records with ``county_name`` resolved to a
    ``models.County`` row (stored under the ``county`` key).

    Bug fix: ``year`` was an undefined free variable, so every call
    raised ``NameError``. It is now a parameter defaulting to 2012,
    matching ``PersonScraper.fetch_people``'s own default.

    :param year: parliamentary term year passed through to the scraper.
    """
    person_scraper = PersonScraper(get_cached_session())
    for row in person_scraper.fetch_people(year):
        county_name = row.pop('county_name')
        if county_name:
            ok_name = fix_local_chars(county_name.title())
            # The scraped spelling is hyphenated; the DB name is not.
            if ok_name == "Bistrița-Năsăud":
                ok_name = "Bistrița Năsăud"
            county = models.County.query.filter_by(name=ok_name).first()
            if county is None:
                # logger.warn is a deprecated alias; use warning().
                logger.warning("Can't match county name %r", ok_name)
            else:
                row['county'] = county
        yield row
def import_people():
    """Import from the people scraper anyone not already in the DB.

    Existing people are detected by ``cdep_id``; newcomers are added and
    the session is committed once at the end.
    """
    from mptracker.scraper.common import get_cached_session
    from mptracker.scraper.people import PersonScraper

    session = models.db.session
    scraper = PersonScraper(get_cached_session())
    known_ids = {person.cdep_id for person in models.Person.query}

    added = 0
    for info in scraper.fetch_people():
        if info['cdep_id'] in known_ids:
            continue
        print('adding person:', info)
        new_person = models.Person(**info)
        session.add(new_person)
        known_ids.add(new_person.cdep_id)
        added += 1

    print('added', added, 'people')
    session.commit()
def expand_minority_names():
    """Expand minority root names into all inflected forms.

    For each root name, fetches its inflection paradigm from
    dexonline.ro, collects every form, stores the sorted result under
    ``search_names`` and prints the document as JSON.
    """
    from mptracker.scraper.common import Scraper, get_cached_session, pqitems

    scraper = Scraper(get_cached_session(), use_cdep_opener=False)
    doc = get_minority_names()

    inflected = set()
    for root in doc['root_names']:
        url = ('http://dexonline.ro/definitie'
               '/{root}/paradigma'.format(root=root))
        page = scraper.fetch_url(url)
        for cell in pqitems(page, 'table.lexem td.form'):
            inflected.add(cell.text().replace(' ', ''))

    # dexonline uses an em-dash for "no such form"; drop it if present.
    inflected.discard('—')
    doc['search_names'] = sorted(inflected)
    print(flask.json.dumps(doc, indent=2, sort_keys=True))
class PersonScraper(Scraper):
    """Scrape the list of MPs for a parliamentary term from cdep.ro."""

    people_url = 'http://www.cdep.ro/pls/parlam/structura.de?leg={year}'

    def fetch_people(self, year=2012):
        """Yield one dict per MP found on the term's structure page.

        Each dict has ``cdep_id``, ``name``, ``county_name`` and
        ``minority`` keys; for minority-seat MPs ``county_name`` is None
        and ``minority`` is True.

        Bug fix: ``a.attr('href')`` returns None for anchors without an
        href attribute, which made ``'structura.mp' in href`` raise
        TypeError; such anchors are now skipped.

        :param year: parliamentary term year to scrape.
        """
        people_page = self.fetch_url(self.people_url.format(year=year))
        for tr in pqitems(people_page, 'tr'):
            for a in pqitems(tr, 'a'):
                href = a.attr('href')
                if not href or 'structura.mp' not in href:
                    continue
                name = a.text()
                cdep_id = get_cdep_id(href)
                # The county name sits in the 4th cell of the anchor's
                # grandparent <tr>.
                row = a.parents('tr')[2]
                county_name = pq(row[3]).text()
                minority = False
                if county_name in ("Mino.", "Minoritati"):
                    # Minority-representation seats have no county.
                    county_name = None
                    minority = True
                yield {
                    'cdep_id': cdep_id,
                    'name': name,
                    'county_name': county_name,
                    'minority': minority,
                }


if __name__ == '__main__':
    person_scraper = PersonScraper(get_cached_session())
    print(list(person_scraper.fetch_people()))
continue # still looking for first speaker text = paragraph.text() steno_paragraph['text_buffer'].append(text) if steno_paragraph: save_paragraph() return steno_chapter def fetch_day(self, day): self.day = day self.chapter_serial = 0 steno_day = StenoDay() steno_day.date = day for link, headline in self.chapters_for_day(day): self.chapter_serial += 1 self.paragraph_serial = 0 steno_chapter = self.parse_steno_page(link) steno_chapter.headline = headline steno_chapter.serial = self.get_chapter_serial() steno_day.chapters.append(steno_chapter) return steno_day if __name__ == '__main__': steno_scraper = StenogramScraper(get_cached_session()) steno_day = steno_scraper.fetch_day(date(2013, 6, 10)) for steno_chapter in steno_day.chapters: for paragraph in steno_chapter.paragraphs: print(paragraph)