예제 #1
0
def clear_database(data_collection_type):
    """Clear the specified data collection in the database.

    When ``data_collection_type`` is not a supported collection name, an
    error message is printed and the function returns without creating a
    ``DataCollection``.

    Args:
        data_collection_type (str):  Name of data collection, either 'book' or 'author'
    """
    if data_collection_type not in ('book', 'author'):
        # Formatting (rather than ``str +``) keeps this error path from
        # raising TypeError when a non-string such as None is passed in.
        print(f"Error: no collection named {data_collection_type}"
              ", please enter 'book' or 'author' ")
        return
    database = DataCollection(MONGO_CONNECTION_STRING, "goodReads",
                              data_collection_type)
    database.empty_data_collection()
예제 #2
0
def export(data_collection_type, file_path):
    """Export data from the database into json file.

    When ``data_collection_type`` is not a supported collection name, an
    error message is printed and nothing is read or written.

    Args:
        data_collection_type (str): Name of data collection, either 'book' or 'author'
        file_path (str): Path of json file to export data into
    """
    if data_collection_type not in ('book', 'author'):
        # Formatting (rather than ``str +``) keeps this error path from
        # raising TypeError when a non-string such as None is passed in.
        print(f"Error: no collection named {data_collection_type}"
              ", please enter 'book' or 'author' ")
        return
    datacollection = DataCollection(MONGO_CONNECTION_STRING, 'goodReads',
                                    data_collection_type)
    entries = list(datacollection.get_all_entries())
    # Serialize before opening the target so a serialization failure does not
    # leave behind a truncated file.
    json_data = dumps(entries, indent=2)

    # Explicit encoding: the default depends on the platform locale.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(json_data)
예제 #3
0
def scrape(data_collection_type, start_url, target_number):
    """Scrape data from goodreads starting with the starting url.

    The process exits with status 1 when ``start_url`` does not point to
    the requested kind of goodreads page, or when ``target_number`` is above
    the per-run limit (200 books / 50 authors). For an unknown collection
    name an error is printed and the function returns.

    Args:
        data_collection_type (str):  Name of data collection, either 'book' or 'author'
        start_url (str): The url to start scraping from
        target_number (int): Number of books/authors to scrape
    """

    if data_collection_type == "book":
        # re.match anchors at the start of the string; the scheme is an
        # optional "http://" / "https://" group (the previous "[https://]?"
        # was a one-character class) and the dots are escaped, so a
        # goodreads path embedded in some other host's url is rejected.
        if not re.match(r'(https?://)?www\.goodreads\.com/book/show/.+',
                        start_url):
            print("Please provide a valid url pointing to a book in goodReads")
            sys.exit(1)
        if target_number > 200:
            print("Cannot scrape more than 200 books at once")
            sys.exit(1)
        data_collection = DataCollection(MONGO_CONNECTION_STRING, "goodReads",
                                         "book")
        book_scraper = BookScraper(data_collection)
        book_scraper.scrapeBooks(start_url, target_number)
    elif data_collection_type == "author":
        # Same validation as for books, against the author path.
        if not re.match(r'(https?://)?www\.goodreads\.com/author/show/.+',
                        start_url):
            print(
                "Please provide a valid url pointing to an author in goodReads"
            )
            sys.exit(1)
        if target_number > 50:
            print("Cannot scrape more than 50 authors at once")
            sys.exit(1)
        data_collection = DataCollection(MONGO_CONNECTION_STRING, "goodReads",
                                         "author")
        author_scraper = AuthorScraper(data_collection)
        author_scraper.scrapeAuthors(start_url, target_number)
    else:
        # Formatting (rather than ``str +``) keeps this error path from
        # raising TypeError when a non-string such as None is passed in.
        print(f"Error: no collection named {data_collection_type}"
              ", please enter 'book' or 'author' ")
        return
예제 #4
0
def import_json(data_collection_type, file_path):
    """Import information in a json file to the database.

    The "_id" key of every entry is discarded before insertion, and an entry
    is skipped when ``document_already_exist`` reports it as present.
    When ``data_collection_type`` is not a supported collection name, an
    error message is printed and nothing is read or written.

    Args:
        data_collection_type (str): Name of data collection, either 'book' or 'author'
        file_path (str): Path of json file to extract info from
    """
    if data_collection_type not in ('book', 'author'):
        # Formatting (rather than ``str +``) keeps this error path from
        # raising TypeError when a non-string such as None is passed in.
        print(f"Error: no collection named {data_collection_type}"
              ", please enter 'book' or 'author' ")
        return
    datacollection = DataCollection(MONGO_CONNECTION_STRING, 'goodReads',
                                    data_collection_type)

    # Explicit encoding: the default depends on the platform locale.
    with open(file_path, encoding='utf-8') as file:
        file_data = json.load(file)
    for entry in file_data:
        # Drop any "_id" carried in the file before the duplicate check
        # and the insert.
        entry.pop("_id", None)
        if not datacollection.document_already_exist(entry):
            datacollection.push_to_collection(entry)
예제 #5
0
class TestScraper(unittest.TestCase):
    """Integration tests for BookScraper and AuthorScraper.

    Each test scrapes a single goodreads url into the "testCollection"
    collection of "testDatabase" and checks that the collection size is 1.
    """

    def setUp(self):
        """Open the test collection, build both scrapers, and empty the collection."""
        self.testDB = DataCollection(os.getenv('MONGO_CONNECTION_STRING'),
                                     "testDatabase", "testCollection")
        self.bookScraper = BookScraper(self.testDB)
        self.authorScraper = AuthorScraper(self.testDB)
        # Reset once here instead of at the top of every test, so each test
        # (including ones added later) starts from an empty collection.
        self.testDB.empty_data_collection()

    def testBookScraper(self):
        """Scraping one book url leaves exactly one document in the collection."""
        testurl = "https://www.goodreads.com/book/show/6185.Wuthering_Heights"
        self.bookScraper.scrape_one_book(testurl)
        self.assertEqual(1, self.testDB.get_collection_size())

    def testAuthorScraper(self):
        """Scraping one author url leaves exactly one document in the collection."""
        testurl = "https://www.goodreads.com/author/show/6485178.Fredrik_Backman"
        self.authorScraper.scrape_one_author(testurl)
        self.assertEqual(1, self.testDB.get_collection_size())
예제 #6
0
import pytest
import requests
import os
from dataCollection import DataCollection
from dotenv import load_dotenv
# Root URL of the API these tests send requests to (a server on localhost:5000).
# NOTE(review): the name looks like a typo for BASE_URL; the test functions in
# this module reference it, so any rename has to change them in the same edit.
BAES_URL = "http://127.0.0.1:5000/api"

# load_dotenv() is called before the environment lookup; presumably it makes
# values from a local .env file visible to os.getenv -- confirm against the
# python-dotenv documentation.
load_dotenv()
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')


# Module-level DataCollection objects for the 'book' and 'author' collections
# of "goodReads". They are constructed at import time, so importing this module
# already requires whatever the DataCollection constructor needs.
book_data_collection = DataCollection(MONGO_CONNECTION_STRING, "goodReads", 'book')
author_data_collection = DataCollection(MONGO_CONNECTION_STRING, "goodReads", 'author')

def test_connect_to_api():
    """A GET on the API root answers with HTTP 200."""
    # Without a timeout requests waits indefinitely, so a stalled server
    # would hang the whole test run instead of failing this test.
    response = requests.get(BAES_URL, timeout=10)
    assert response.status_code == 200

def test_get_book():
    """A book present in the collection is served by GET /book?id=<id> with HTTP 200."""
    book = {"url": "testurl2", "id": "testid2"}
    # Insert only when missing so repeated runs do not pile up duplicate
    # fixture documents in the collection.
    if not book_data_collection.document_already_exist(book):
        book_data_collection.push_to_collection(book)
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(BAES_URL + '/book', params={'id': 'testid2'},
                            timeout=10)
    assert response.status_code == 200

def test_get_author():
    """An author present in the collection is served by GET /author?id=<id> with HTTP 200."""
    author = {"url": "testurl2", "id": "testid2"}
    # Insert only when missing so repeated runs do not pile up duplicate
    # fixture documents in the collection.
    if not author_data_collection.document_already_exist(author):
        author_data_collection.push_to_collection(author)
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(BAES_URL + '/author', params={'id': 'testid2'},
                            timeout=10)
    assert response.status_code == 200
    
def test_put_author():
 """Placeholder: defines a nested helper and returns without asserting anything.

 NOTE(review): despite its name this test sends no PUT request, and the
 nested ``setUp`` below is never called, so the test passes vacuously.
 The nested function duplicates ``TestDataCollection.setUp`` and looks like
 a merge/paste artifact -- confirm the intended test body.
 """
 # Never invoked: a function nested inside a test function is not a fixture.
 def setUp(self):
     self.start_url = "https://www.goodreads.com/book/show/53175355-many-points-of-me"
     self.connection_string = os.getenv('MONGO_CONNECTION_STRING')
     self.testDB = DataCollection(self.connection_string, "testDatabase",
                                  "testCollection")
class TestDataCollection(unittest.TestCase):
    """Exercises DataCollection against "testDatabase" / "testCollection"."""

    def setUp(self):
        """Record the connection string and sample url, then open the test collection."""
        self.connection_string = os.getenv('MONGO_CONNECTION_STRING')
        self.start_url = "https://www.goodreads.com/book/show/53175355-many-points-of-me"
        self.testDB = DataCollection(
            self.connection_string,
            "testDatabase",
            "testCollection",
        )

    def testPushToBookCollection(self):
        """A document pushed into an emptied collection is reported as existing."""
        collection = self.testDB
        collection.empty_data_collection()
        document = dict(url=1, test=2)
        collection.push_to_collection(document)
        found = collection.document_already_exist(document)
        self.assertEqual(True, found)

    def testempty_data_collection(self):
        """After emptying, the collection size is 0."""
        collection = self.testDB
        collection.empty_data_collection()
        size = collection.get_collection_size()
        self.assertEqual(0, size)

    def testget_collection_size(self):
        """Two pushes into an emptied collection give a size of 2."""
        collection = self.testDB
        collection.empty_data_collection()
        documents = (dict(url=3, test=2), dict(url=1, test=1))
        for document in documents:
            collection.push_to_collection(document)
        size = collection.get_collection_size()
        self.assertEqual(2, size)

    def testdocument_already_exist(self):
        """Only the pushed document is reported as existing."""
        collection = self.testDB
        collection.empty_data_collection()
        stored = dict(url=3, test=2)
        absent = dict(url=1, test=1)
        collection.push_to_collection(stored)
        stored_found = collection.document_already_exist(stored)
        self.assertEqual(True, stored_found)
        absent_found = collection.document_already_exist(absent)
        self.assertEqual(False, absent_found)
예제 #9
0
 def setUp(self):
     """Open the test collection and build a book and an author scraper on it."""
     connection_string = os.getenv('MONGO_CONNECTION_STRING')
     database = DataCollection(connection_string, "testDatabase",
                               "testCollection")
     self.testDB = database
     self.bookScraper = BookScraper(database)
     self.authroScraper = AuthorScraper(database)