def clear_database(data_collection_type):
    """Clear the specified data collection in the "goodReads" database.

    An unknown collection name only prints an error message; no
    DataCollection is created in that case.

    Args:
        data_collection_type (str): Name of data collection, either
            'book' or 'author'
    """
    if data_collection_type in ('book', 'author'):
        target = DataCollection(MONGO_CONNECTION_STRING, "goodReads",
                                data_collection_type)
        target.empty_data_collection()
        return

    print("Error: no collection named " + data_collection_type
          + ", please enter 'book' or 'author' ")
def export(data_collection_type, file_path):
    """Export data from the database into a json file.

    All entries of the chosen collection are read, serialized with an
    indent of 2 and written to ``file_path`` (opened in 'w' mode, so an
    existing file is overwritten). An unknown collection name only prints
    an error message and nothing is written.

    Args:
        data_collection_type (str): Name of data collection, either
            'book' or 'author'
        file_path (str): Path of json file to export data into
    """
    allowed_collections = ('book', 'author')
    if data_collection_type not in allowed_collections:
        print("Error: no collection named " + data_collection_type
              + ", please enter 'book' or 'author' ")
        return

    source = DataCollection(MONGO_CONNECTION_STRING, 'goodReads',
                            data_collection_type)
    # Materialize the entries before serializing them.
    entries = list(source.get_all_entries())
    serialized = dumps(entries, indent=2)

    with open(file_path, 'w') as out_file:
        out_file.write(serialized)
def _is_goodreads_url(kind, url):
    """Return True when url is a www.goodreads.com '<kind>/show/...' url.

    The scheme (http:// or https://) is optional, the match is anchored at
    the start of the string and the dots of the host name are literal, so a
    goodreads-looking fragment buried inside another url is rejected.
    """
    pattern = r'(https?://)?www\.goodreads\.com/' + kind + r'/show/.+'
    return re.match(pattern, url) is not None


def scrape(data_collection_type, start_url, target_number):
    """Scrape data from goodreads starting with the starting url.

    Args:
        data_collection_type (str): Name of data collection, either
            'book' or 'author'
        start_url (str): The url to start scraping from; must point to a
            www.goodreads.com book (or author) "show" page, with or
            without the http(s) scheme
        target_number (int): Number of books/authors to scrape, at most
            200 for books and 50 for authors

    Raises:
        SystemExit: With exit code 1 when start_url is not a valid url for
            the requested collection or target_number exceeds the limit.
            An unknown collection name only prints an error and returns.
    """
    if data_collection_type == "book":
        if not _is_goodreads_url("book", start_url):
            print("Please provide a valid url pointing to a book in goodReads")
            sys.exit(1)
        if target_number > 200:
            print("Cannot scrape more than 200 books at once")
            sys.exit(1)
        data_collection = DataCollection(MONGO_CONNECTION_STRING,
                                         "goodReads", "book")
        book_scraper = BookScraper(data_collection)
        book_scraper.scrapeBooks(start_url, target_number)
    elif data_collection_type == "author":
        if not _is_goodreads_url("author", start_url):
            print(
                "Please provide a valid url pointing to an author in goodReads"
            )
            sys.exit(1)
        if target_number > 50:
            print("Cannot scrape more than 50 authors at once")
            sys.exit(1)
        data_collection = DataCollection(MONGO_CONNECTION_STRING,
                                         "goodReads", "author")
        author_scraper = AuthorScraper(data_collection)
        author_scraper.scrapeAuthors(start_url, target_number)
    else:
        print("Error: no collection named " + data_collection_type
              + ", please enter 'book' or 'author' ")
        return
def import_json(data_collection_type, file_path):
    """Import information in a json file to the database.

    The file is iterated entry by entry. Any "_id" key is removed from an
    entry before it is checked against the collection, and only entries
    the collection does not already contain are pushed. An unknown
    collection name only prints an error message and returns.

    Args:
        data_collection_type (str): Name of data collection, either
            'book' or 'author'
        file_path (str): Path of json file to extract info from
    """
    if data_collection_type not in ('book', 'author'):
        print("Error: no collection named " + data_collection_type
              + ", please enter 'book' or 'author' ")
        return

    datacollection = DataCollection(MONGO_CONNECTION_STRING, 'goodReads',
                                    data_collection_type)

    # JSON text is UTF-8; do not rely on the platform default encoding,
    # which would mis-decode non-ASCII data where the locale is not UTF-8.
    with open(file_path, encoding='utf-8') as file:
        file_data = json.load(file)

    # The file is fully parsed at this point, so it is closed before the
    # database writes begin.
    for entry in file_data:
        if "_id" in entry:
            del entry["_id"]
        if not datacollection.document_already_exist(entry):
            datacollection.push_to_collection(entry)
class TestScraper(unittest.TestCase):
    """Checks that scraping one page adds exactly one entry to the test collection.

    Both tests work on the "testCollection" collection of "testDatabase",
    using the connection string from the MONGO_CONNECTION_STRING
    environment variable.
    """

    def setUp(self):
        """Create the test collection wrapper and one scraper of each kind."""
        connection_string = os.getenv('MONGO_CONNECTION_STRING')
        self.testDB = DataCollection(connection_string, "testDatabase",
                                     "testCollection")
        self.bookScraper = BookScraper(self.testDB)
        self.authroScraper = AuthorScraper(self.testDB)

    def testBookScraper(self):
        """Scraping one book url leaves a single entry in the emptied collection."""
        book_page = "https://www.goodreads.com/book/show/6185.Wuthering_Heights"
        self.testDB.empty_data_collection()
        self.bookScraper.scrape_one_book(book_page)
        stored_entries = self.testDB.get_collection_size()
        self.assertEqual(1, stored_entries)

    def testAuthorScraper(self):
        """Scraping one author url leaves a single entry in the emptied collection."""
        author_page = "https://www.goodreads.com/author/show/6485178.Fredrik_Backman"
        self.testDB.empty_data_collection()
        self.authroScraper.scrape_one_author(author_page)
        stored_entries = self.testDB.get_collection_size()
        self.assertEqual(1, stored_entries)
import pytest
import requests
import os
from dataCollection import DataCollection
from dotenv import load_dotenv

# Root url of the API exercised by these tests; a server must be listening
# on 127.0.0.1:5000 for any of them to pass.
# NOTE(review): "BAES" looks like a typo for "BASE"; the name is kept because
# code outside this view may reference it.
BAES_URL = "http://127.0.0.1:5000/api"

# presumably loads variables from a local .env file into the environment so
# that MONGO_CONNECTION_STRING is available below -- TODO confirm
load_dotenv()
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')

# The tests write directly into the 'book' and 'author' collections of the
# "goodReads" database; the visible tests never remove what they push.
book_data_collection = DataCollection(MONGO_CONNECTION_STRING, "goodReads", 'book')
author_data_collection = DataCollection(MONGO_CONNECTION_STRING, "goodReads", 'author')


def test_connect_to_api():
    """The API root answers a GET request with status 200."""
    response = requests.get(BAES_URL)
    assert response.status_code == 200


def test_get_book():
    """A book pushed to the collection can be fetched through /book?id=..."""
    book = {"url": "testurl2", "id":"testid2"}
    book_data_collection.push_to_collection(book)
    response = requests.get(BAES_URL + '/book?id=testid2')
    assert response.status_code == 200


def test_get_author():
    """An author pushed to the collection can be fetched through /author?id=..."""
    author = {"url": "testurl2", "id":"testid2"}
    author_data_collection.push_to_collection(author)
    response = requests.get(BAES_URL + '/author?id=testid2')
    assert response.status_code == 200


# NOTE(review): the definition below is cut off in this view; its body is not
# visible here and is left untouched.
def test_put_author():
def setUp(self):
    """Prepare the fixtures: connection string, test collection and a start url.

    The connection string is read from the MONGO_CONNECTION_STRING
    environment variable and the collection wrapper targets
    "testCollection" in "testDatabase".
    """
    mongo_uri = os.getenv('MONGO_CONNECTION_STRING')
    self.connection_string = mongo_uri
    self.testDB = DataCollection(mongo_uri, "testDatabase", "testCollection")
    self.start_url = "https://www.goodreads.com/book/show/53175355-many-points-of-me"
class TestDataCollection(unittest.TestCase):
    """Tests for the DataCollection wrapper, run against a test collection.

    Every test starts by emptying "testCollection" in "testDatabase" so
    that its expectations do not depend on earlier tests.
    """

    def setUp(self):
        """Prepare the fixtures: connection string, test collection and a start url."""
        mongo_uri = os.getenv('MONGO_CONNECTION_STRING')
        self.connection_string = mongo_uri
        self.testDB = DataCollection(mongo_uri, "testDatabase",
                                     "testCollection")
        self.start_url = "https://www.goodreads.com/book/show/53175355-many-points-of-me"

    def testPushToBookCollection(self):
        """A pushed document is afterwards reported as already existing."""
        document = {"url": 1, "test": 2}
        self.testDB.empty_data_collection()
        self.testDB.push_to_collection(document)
        found = self.testDB.document_already_exist(document)
        self.assertEqual(True, found)

    def testempty_data_collection(self):
        """An emptied collection reports a size of 0."""
        self.testDB.empty_data_collection()
        size_after_clear = self.testDB.get_collection_size()
        self.assertEqual(0, size_after_clear)

    def testget_collection_size(self):
        """Two pushed documents give a collection size of 2."""
        self.testDB.empty_data_collection()
        first_doc = {"url": 3, "test": 2}
        second_doc = {"url": 1, "test": 1}
        for document in (first_doc, second_doc):
            self.testDB.push_to_collection(document)
        self.assertEqual(2, self.testDB.get_collection_size())

    def testdocument_already_exist(self):
        """Only the document that was pushed is reported as existing."""
        self.testDB.empty_data_collection()
        stored_doc = {"url": 3, "test": 2}
        missing_doc = {"url": 1, "test": 1}
        self.testDB.push_to_collection(stored_doc)
        stored_found = self.testDB.document_already_exist(stored_doc)
        self.assertEqual(True, stored_found)
        missing_found = self.testDB.document_already_exist(missing_doc)
        self.assertEqual(False, missing_found)
def setUp(self):
    """Create the test collection wrapper and one scraper of each kind.

    Both scrapers are handed the same "testCollection" wrapper of
    "testDatabase", which is also kept on the instance as ``testDB``.
    """
    collection = DataCollection(os.getenv('MONGO_CONNECTION_STRING'),
                                "testDatabase", "testCollection")
    self.testDB = collection
    self.bookScraper = BookScraper(collection)
    self.authroScraper = AuthorScraper(collection)