Exemplo n.º 1
0
    def test_retrieve_all_arxiv_rows_returns_all_rows(self, mocked_batch):
        """Rows from every mocked batch are yielded, in order, as one sequence.

        `mocked_batch` is presumably injected by a patch decorator outside this
        view; here it serves three pages of two rows each, the last page
        carrying `None` in place of a token.
        """
        expected = ['row1', 'row2', 'row3', 'row4', 'row5', 'row6']
        mocked_batch.side_effect = [
            (expected[:2], 'mytoken|2'),
            (expected[2:4], 'mytoken|4'),
            (expected[4:], None),
        ]

        collected = list(retrieve_all_arxiv_rows())

        assert collected == expected
Exemplo n.º 2
0
    def test_retrieve_all_arxiv_rows_calls_arxiv_batch_correctly(self, mocked_batch):
        """The batch function is called with None first, then with each token it returned.

        Three pages are served; the token paired with each page is expected to
        be the argument of the following call, and the final `None` token ends
        the sequence of calls.
        """
        mocked_batch.side_effect = [
            (['row1', 'row2'], 'mytoken|2'),
            (['row3', 'row4'], 'mytoken|4'),
            (['row5', 'row6'], None),
        ]

        # exhaust the iterable so that every batch request is actually made
        for _ in retrieve_all_arxiv_rows():
            pass

        expected_calls = [mock.call(token)
                          for token in (None, 'mytoken|2', 'mytoken|4')]
        assert mocked_batch.mock_calls == expected_calls
Exemplo n.º 3
0
    def test_retrieve_all_arxiv_rows_handles_no_records_to_collect(
            self, mocked_batch):
        """A ValueError raised by the batch call results in no rows being yielded."""
        # the mocked batch function raises ValueError as soon as it is called
        mocked_batch.side_effect = ValueError

        assert list(retrieve_all_arxiv_rows()) == []
Exemplo n.º 4
0
    def run(self):
        """Collect arXiv rows from `self.articles_from_date` and store them.

        Validates the from-date, connects to the 'dev' database when
        `self.test` is set (otherwise 'production'), then iterates over the
        rows yielded by `retrieve_all_arxiv_rows`. Rows that are not newer
        than the stored copy are skipped; rows whose id is not yet in the
        Article table are batched for insertion and all other rows are batched
        for update. Category ids missing from the Category table are inserted
        as they are encountered. The output target is touched on completion.

        Raises:
            ValueError: if `self.articles_from_date` is not a valid date in
                YYYY-MM-DD format.
        """
        try:
            datetime.strptime(self.articles_from_date, '%Y-%m-%d')
        except ValueError as err:
            raise ValueError(f"From date for articles is invalid or not in YYYY-MM-DD format: {self.articles_from_date}") from err

        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        with db_session(self.engine) as session:
            # create lookup for categories (less than 200) and set of article ids
            all_categories_lookup = {cat.id: cat for cat in session.query(Category).all()}
            logging.info(f"{len(all_categories_lookup)} existing categories")
            all_article_ids = {article.id for article in session.query(Article.id).all()}
            logging.info(f"{len(all_article_ids)} existing articles")
            # id -> stored `updated` value, only for articles dated on or
            # after the from-date
            already_updated = {article.id: article.updated for article
                               in (session.query(Article)
                                   .filter(Article.updated >= self.articles_from_date)
                                   .all())}
            logging.info(f"{len(already_updated)} records exist in the database with a date on or after the update date.")

            new_count = 0
            existing_count = 0
            # new articles are written through the session, updates through
            # the engine
            new_articles_batch = BatchWriter(self.insert_batch_size,
                                             add_new_articles,
                                             session)
            existing_articles_batch = BatchWriter(self.insert_batch_size,
                                                  update_existing_articles,
                                                  self.engine)

            # retrieve and process, while inserting any missing categories
            # ('from' is a Python keyword, so it has to be passed via ** unpacking)
            for row in retrieve_all_arxiv_rows(**{'from': self.articles_from_date}):
                try:
                    # update only newer data
                    if row['updated'] <= already_updated[row['id']]:
                        continue
                except KeyError:
                    # nothing stored to compare against: process the row
                    pass

                # check for missing categories
                for cat_id in row.get('categories', []):
                    if cat_id in all_categories_lookup:
                        continue
                    logging.warning(f"Missing category: '{cat_id}' for article {row['id']}.  Adding to Category table")
                    new_cat = Category(id=cat_id)
                    session.add(new_cat)
                    session.commit()
                    # keep the Category object itself in the lookup, so a new
                    # article using this category is linked to a real Category
                    # rather than a placeholder value
                    all_categories_lookup[cat_id] = new_cat

                # create new Article and append to batch
                if row['id'] not in all_article_ids:
                    # convert category ids to Category objects
                    row['categories'] = [all_categories_lookup[cat_id]
                                         for cat_id in row.get('categories', [])]
                    new_articles_batch.append(Article(**row))
                    new_count += 1
                else:
                    # append to existing articles batch
                    existing_articles_batch.append(row)
                    existing_count += 1

                count = new_count + existing_count
                if not count % 1000:
                    logging.info(f"Processed {count} articles")
                if self.test and count == 1600:
                    logging.warning("Limiting to 1600 rows while in test mode")
                    break

            # insert any remaining new and existing articles
            logging.info("Processing final batches")
            if new_articles_batch:
                new_articles_batch.write()
            if existing_articles_batch:
                existing_articles_batch.write()

        logging.info(f"Total {new_count} new articles added")
        logging.info(f"Total {existing_count} existing articles updated")

        # mark as done
        logging.warning("Task complete")
        self.output().touch()