def test_retrieve_all_arxiv_rows_returns_all_rows(self, mocked_batch):
    """Rows from every mocked batch come back as one flat, ordered sequence.

    The mocked batch function hands back three (rows, token) pairs; the
    last pair carries ``None`` in the token position.
    """
    expected_rows = ['row1', 'row2', 'row3', 'row4', 'row5', 'row6']
    first_page = (expected_rows[:2], 'mytoken|2')
    second_page = (expected_rows[2:4], 'mytoken|4')
    last_page = (expected_rows[4:], None)
    mocked_batch.side_effect = [first_page, second_page, last_page]

    collected = list(retrieve_all_arxiv_rows())

    assert collected == expected_rows
def test_retrieve_all_arxiv_rows_calls_arxiv_batch_correctly(self, mocked_batch):
    """The batch function is called first with None, then with each returned token."""
    all_rows = ['row1', 'row2', 'row3', 'row4', 'row5', 'row6']
    mocked_batch.side_effect = [
        (all_rows[:2], 'mytoken|2'),
        (all_rows[2:4], 'mytoken|4'),
        (all_rows[4:], None),
    ]

    # Consume everything so that every batch request is actually issued.
    list(retrieve_all_arxiv_rows())

    expected_calls = [
        mock.call(token) for token in (None, 'mytoken|2', 'mytoken|4')
    ]
    assert mocked_batch.mock_calls == expected_calls
def test_retrieve_all_arxiv_rows_handles_no_records_to_collect(self, mocked_batch):
    """A ValueError from the batch function results in no rows rather than an error."""
    # The mocked batch function raises on its very first call.
    mocked_batch.side_effect = ValueError

    collected = list(retrieve_all_arxiv_rows())

    assert collected == []
def run(self):
    """Collect arXiv rows from ``articles_from_date`` onwards and store them.

    Steps:
      1. Validate ``self.articles_from_date`` as a YYYY-MM-DD string.
      2. Connect to the 'dev' database when ``self.test`` is truthy,
         otherwise 'production'.
      3. Load existing categories, article ids and the ``updated`` value
         of articles already at or after the from-date.
      4. Iterate over the retrieved rows, skipping rows that are not newer
         than the stored copy, creating any missing Category records, and
         batching new articles and updates to existing ones separately.
      5. Flush any remaining batches and touch the task output.

    Raises:
        ValueError: if ``self.articles_from_date`` cannot be parsed with
            the format ``%Y-%m-%d``.
    """
    try:
        datetime.strptime(self.articles_from_date, '%Y-%m-%d')
    except ValueError as err:
        # Chain the parsing error so the underlying cause stays visible.
        raise ValueError(f"From date for articles is invalid or not in YYYY-MM-DD format: {self.articles_from_date}") from err

    # database setup
    database = 'dev' if self.test else 'production'
    logging.warning(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

    with db_session(self.engine) as session:
        # create lookup for categories (less than 200) and set of article ids
        all_categories_lookup = {cat.id: cat
                                 for cat in session.query(Category).all()}
        logging.info(f"{len(all_categories_lookup)} existing categories")

        all_article_ids = {article.id
                           for article in session.query(Article.id).all()}
        logging.info(f"{len(all_article_ids)} existing articles")

        # Only id and updated are needed here, so select just those columns
        # instead of loading complete Article entities.
        recent_articles = (session
                           .query(Article.id, Article.updated)
                           .filter(Article.updated >= self.articles_from_date)
                           .all())
        already_updated = {article.id: article.updated
                           for article in recent_articles}
        logging.info(f"{len(already_updated)} records exist in the database with a date on or after the update date.")

        new_count = 0
        existing_count = 0
        new_articles_batch = BatchWriter(self.insert_batch_size,
                                         add_new_articles, session)
        existing_articles_batch = BatchWriter(self.insert_batch_size,
                                              update_existing_articles,
                                              self.engine)

        # retrieve and process, while inserting any missing categories
        for row in retrieve_all_arxiv_rows(**{'from': self.articles_from_date}):
            try:
                # update only newer data
                if row['updated'] <= already_updated[row['id']]:
                    continue
            except KeyError:
                # No stored copy to compare against: process the row.
                pass

            # check for missing categories
            for cat_id in row.get('categories', []):
                if cat_id in all_categories_lookup:
                    continue
                logging.warning(
                    f"Missing category: '{cat_id}' for article {row['id']}. "
                    "Adding to Category table"
                )
                new_cat = Category(id=cat_id)
                session.add(new_cat)
                session.commit()
                # Keep the Category object itself in the lookup: new
                # articles below are built from the lookup values, so a
                # placeholder string would end up in Article.categories.
                all_categories_lookup[cat_id] = new_cat

            if row['id'] not in all_article_ids:
                # create new Article and append to batch,
                # converting category ids to Category objects
                row['categories'] = [all_categories_lookup[cat_id]
                                     for cat_id in row.get('categories', [])]
                new_articles_batch.append(Article(**row))
                new_count += 1
            else:
                # append to existing articles batch
                existing_articles_batch.append(row)
                existing_count += 1

            count = new_count + existing_count
            if not count % 1000:
                logging.info(f"Processed {count} articles")
            if self.test and count == 1600:
                logging.warning("Limiting to 1600 rows while in test mode")
                break

        # insert any remaining new and existing articles
        logging.info("Processing final batches")
        if new_articles_batch:
            new_articles_batch.write()
        if existing_articles_batch:
            existing_articles_batch.write()

        logging.info(f"Total {new_count} new articles added")
        logging.info(f"Total {existing_count} existing articles updated")

    # mark as done
    logging.warning("Task complete")
    self.output().touch()