def get_questions(
    year='2015',
    reimport_existing=False,
    cache_name=None,
    throttle=None,
    autoanalyze=False,
):
    """Scrape parliamentary questions for *year* and sync them into the DB.

    :param year: legislative year to scrape, as a string (converted with
        ``int(year)`` before being passed to the scraper).
    :param reimport_existing: when true, re-scrape questions whose URLs are
        already in the database instead of skipping them.
    :param cache_name: HTTP cache name; falls back to the configured default.
    :param throttle: optional delay between requests (coerced to ``float``;
        ``None``/falsy disables throttling).
    :param autoanalyze: when true, schedule OCR / policy-classification jobs
        for the questions and answers that changed during this run.

    Side effects: inserts/updates ``Question``, ``Ask`` and ``Answer`` rows
    and commits the session; logs progress and HTTP transfer counters.
    """
    # Imports are function-local in this task module to avoid import cycles
    # and keep worker startup cheap.
    from mptracker.scraper.questions import QuestionScraper
    from mptracker.questions import ocr_question, ocr_answer
    from mptracker.policy import calculate_question

    if reimport_existing:
        known_urls = set()
    else:
        # Skip URLs we already imported; a set gives O(1) membership tests.
        url_query = models.db.session.query(models.Question.url)
        known_urls = set(row[0] for row in url_query)

    def skip_question(url):
        # Predicate handed to the scraper so it can avoid re-fetching pages.
        return url in known_urls

    http_session = create_session(
        cache_name=cache_name or _get_config_cache_name(),
        # `throttle and float(throttle)` keeps None/falsy as "no throttle".
        throttle=throttle and float(throttle),
        counters=True,
    )
    questions_scraper = QuestionScraper(session=http_session,
                                        skip=skip_question)
    mandate_lookup = models.MandateLookup()

    question_patcher = TablePatcher(models.Question,
                                    models.db.session,
                                    key_columns=['number', 'date'])
    answer_patcher = TablePatcher(models.Answer,
                                  models.db.session,
                                  key_columns=['question_id'])

    new_ask_rows = 0
    changed_questions = []
    changed_answers = []
    with question_patcher.process() as add, \
         answer_patcher.process() as add_answer:
        for question in questions_scraper.run(int(year)):
            person_list = question.pop('person')
            question['addressee'] = '; '.join(question['addressee'])
            answer_data = question.pop('answer', None)
            result = add(question)
            q = result.row

            # Reconcile the question's `asked` links with the scraped list:
            # start from the existing Ask rows keyed by mandate, drop the
            # ones we see again, create the ones that are missing; whatever
            # is left over afterwards is stale and gets deleted.
            old_asked = {ask.mandate_id: ask for ask in q.asked}
            for name, person_year, person_number in person_list:
                mandate = mandate_lookup.find(name, person_year,
                                              person_number)
                if mandate.id in old_asked:
                    old_asked.pop(mandate.id)
                else:
                    ask = models.Ask(mandate=mandate)
                    q.asked.append(ask)
                    ask.set_meta('new', True)
                    logger.info("Adding ask for %s: %s", q, mandate)
                    new_ask_rows += 1

            if result.is_changed:
                changed_questions.append(q)

            if old_asked:
                # logger.warning, not the deprecated logger.warn alias.
                logger.warning("Removing %d old 'ask' records",
                               len(old_asked))
                for ask in old_asked.values():
                    models.db.session.delete(ask)

            if answer_data:
                # The patcher must have flushed the question row so we have
                # a primary key to attach the answer to.
                assert q.id is not None
                answer_data['question_id'] = q.id
                answer_result = add_answer(answer_data)
                if answer_result.is_changed:
                    changed_answers.append(answer_result.row)

    models.db.session.commit()

    if new_ask_rows:
        logger.info("Added %d ask records", new_ask_rows)

    counters = http_session.counters
    logger.info("HTTP: %d kb in %s requests, %.2f seconds",
                counters['bytes'] / 1024,
                counters['requests'],
                counters['download_time'].total_seconds())

    if autoanalyze:
        # Fan out async jobs only for rows that actually changed this run.
        logger.info("Scheduling jobs for %d questions",
                    len(changed_questions))
        for question in changed_questions:
            if question.pdf_url:
                ocr_question.delay(question.id, autoanalyze=True)
            if question.policy_domain_id is None:
                calculate_question.delay(question.id)

        logger.info("Scheduling jobs for %d answers", len(changed_answers))
        for answer in changed_answers:
            ocr_answer.delay(answer.id)