Пример #1
0
def committee_summaries(year=2013):
    """Scrape committee summary PDFs for *year* and upsert them into the DB.

    Rows are keyed on ``pdf_url`` so re-running the import updates existing
    records instead of duplicating them.
    """
    from mptracker.scraper.committee_summaries import SummaryScraper

    patcher = TablePatcher(models.CommitteeSummary,
                           models.db.session,
                           key_columns=['pdf_url'])

    summary_scraper = SummaryScraper(get_cached_session(),
                                     get_cached_session('question-pdf'))
    records = summary_scraper.fetch_summaries(year, get_pdf_text=True)

    patcher.update(records)

    # Persist the patched rows. Without this commit the session's pending
    # changes are discarded when the context ends (the sibling variant of
    # this function commits explicitly).
    models.db.session.commit()
Пример #2
0
def committee_summaries(year=2013):
    """Scrape committee summaries for *year* and upsert them by pdf_url."""
    from mptracker.scraper.committee_summaries import SummaryScraper

    scraper = SummaryScraper(get_cached_session(),
                             get_cached_session('question-pdf'))
    summaries = scraper.fetch_summaries(year, get_pdf_text=True)

    # Upsert keyed on pdf_url so repeated imports stay idempotent.
    patcher = TablePatcher(models.CommitteeSummary,
                           models.db.session,
                           key_columns=['pdf_url'])
    patcher.update(summaries)

    models.db.session.commit()
Пример #3
0
def import_steno_day(day):
    """Scrape the stenogram for *day* and persist its chapters/paragraphs."""
    from mptracker.scraper.common import get_cached_session
    from mptracker.scraper.steno import StenogramScraper

    session = models.db.session
    matcher = models.PersonMatcher()
    scraper = StenogramScraper(get_cached_session())
    steno_day = scraper.fetch_day(day)

    paragraph_count = 0
    for chapter in steno_day.chapters:
        chapter_row = models.StenoChapter(date=steno_day.date,
                                          headline=chapter.headline,
                                          serial=chapter.serial)
        session.add(chapter_row)
        for para in chapter.paragraphs:
            # Resolve the speaker to a Person record (by name + cdep id).
            speaker = matcher.get_person(para['speaker_name'],
                                         para['speaker_cdep_id'])
            session.add(models.StenoParagraph(text=para['text'],
                                              chapter=chapter_row,
                                              person=speaker,
                                              serial=para['serial']))
            paragraph_count += 1

    print('added', paragraph_count, 'stenogram paragraphs')
    session.commit()
Пример #4
0
    def get_people():
        """Yield scraped person rows, resolving county_name to a County."""
        scraper = PersonScraper(get_cached_session())
        for row in scraper.fetch_people(year):
            raw_name = row.pop('county_name')
            if raw_name:
                name = fix_local_chars(raw_name.title())
                # The scraper's spelling differs from the DB's for this county.
                if name == "Bistrița-Năsăud":
                    name = "Bistrița Năsăud"
                county = models.County.query.filter_by(name=name).first()
                if county is not None:
                    row['county'] = county
                else:
                    logger.warn("Can't match county name %r", name)

            yield row
Пример #5
0
 def import_people():
     """Add any scraped people whose cdep_id is not yet in the DB."""
     from mptracker.scraper.common import get_cached_session
     from mptracker.scraper.people import PersonScraper

     session = models.db.session
     scraper = PersonScraper(get_cached_session())
     known_ids = {p.cdep_id for p in models.Person.query}
     added = 0
     for info in scraper.fetch_people():
         if info['cdep_id'] in known_ids:
             continue
         print('adding person:', info)
         person = models.Person(**info)
         session.add(person)
         known_ids.add(person.cdep_id)
         added += 1
     print('added', added, 'people')
     session.commit()
Пример #6
0
def expand_minority_names():
    """Expand root minority names into all inflected forms via dexonline."""
    from mptracker.scraper.common import Scraper, get_cached_session, pqitems
    scraper = Scraper(get_cached_session(), use_cdep_opener=False)
    doc = get_minority_names()
    names = set()
    for root in doc['root_names']:
        page = scraper.fetch_url(
            'http://dexonline.ro/definitie'
            '/{root}/paradigma'.format(root=root))
        for cell in pqitems(page, 'table.lexem td.form'):
            names.add(cell.text().replace(' ', ''))
    # dexonline renders missing paradigm forms as an em-dash; drop it.
    names.discard('—')
    doc['search_names'] = sorted(names)
    print(flask.json.dumps(doc, indent=2, sort_keys=True))
Пример #7
0
def expand_minority_names():
    """Collect every inflected form of the minority root names."""
    from mptracker.scraper.common import Scraper, get_cached_session, pqitems
    scraper = Scraper(get_cached_session(), use_cdep_opener=False)
    doc = get_minority_names()
    url_template = 'http://dexonline.ro/definitie/{root}/paradigma'
    forms = set()
    for root in doc['root_names']:
        page = scraper.fetch_url(url_template.format(root=root))
        forms.update(td.text().replace(' ', '')
                     for td in pqitems(page, 'table.lexem td.form'))
    # An em-dash marks a missing form on dexonline; exclude it.
    forms.discard('—')
    doc['search_names'] = sorted(forms)
    print(flask.json.dumps(doc, indent=2, sort_keys=True))
Пример #8
0
class PersonScraper(Scraper):
    """Scrapes the MP listing from cdep.ro for a given legislature year."""

    people_url = 'http://www.cdep.ro/pls/parlam/structura.de?leg={year}'

    def fetch_people(self, year=2012):
        """Yield one dict per MP: cdep_id, name, county_name, minority."""
        listing = self.fetch_url(self.people_url.format(year=year))
        for row in pqitems(listing, 'tr'):
            for link in pqitems(row, 'a'):
                href = link.attr('href')
                if 'structura.mp' not in href:
                    continue
                tr = link.parents('tr')[2]
                county = pq(tr[3]).text()
                # Minority-seat MPs have a placeholder instead of a county.
                is_minority = county in ["Mino.", "Minoritati"]
                if is_minority:
                    county = None

                yield {
                    'cdep_id': get_cdep_id(href),
                    'name': link.text(),
                    'county_name': county,
                    'minority': is_minority,
                }


if __name__ == '__main__':
    # Ad-hoc smoke test: dump the full scraped people list.
    scraper = PersonScraper(get_cached_session())
    print(list(scraper.fetch_people()))
Пример #9
0
                            continue  # still looking for first speaker
                        text = paragraph.text()
                        steno_paragraph['text_buffer'].append(text)

        if steno_paragraph:
            save_paragraph()

        return steno_chapter

    def fetch_day(self, day):
        """Build a StenoDay for *day* by scraping each chapter page."""
        self.day = day
        self.chapter_serial = 0
        steno_day = StenoDay()
        steno_day.date = day
        for url, title in self.chapters_for_day(day):
            # The serial counters are shared state consumed by the parse
            # helpers while building chapter/paragraph identifiers.
            self.chapter_serial += 1
            self.paragraph_serial = 0
            chapter = self.parse_steno_page(url)
            chapter.headline = title
            chapter.serial = self.get_chapter_serial()
            steno_day.chapters.append(chapter)
        return steno_day


if __name__ == '__main__':
    # Ad-hoc smoke test: print every paragraph from one day's stenogram.
    scraper = StenogramScraper(get_cached_session())
    day = scraper.fetch_day(date(2013, 6, 10))
    for chapter in day.chapters:
        for paragraph in chapter.paragraphs:
            print(paragraph)