Пример #1
0
def convert_wildcard_query_value_query_key(query_unit):
    """
    Handle the case where we got a wildcard operator in query value.
    :param query_unit: A parsed query unit of quadra-tuple
                       (db_type, attr, comp_op, query_value)
    :return: key of query dict to be used in mongo query
    """
    db_type, attr, comp_op, query_value = query_unit
    # NOTE: these asserts validate parser output; they are stripped under
    # `python -O`, so the parser must guarantee the invariants itself.
    assert "*" in query_value
    assert "*" not in attr, "Sorry, only ONE wildcard operator at a time."
    assert isinstance(query_value, str)
    assert query_value[0] == '"' and query_value[-1] == '"'
    assert comp_op == "", "Sorry, combined usage of comparison and" \
                          " wildcard operators is not allowed."
    query_value = query_value[1:-1]  # strip the surrounding quotes
    # BUG FIX: escape regex metacharacters in the literal pieces so values
    # such as "C++" or "a.b" match literally; only "*" acts as a wildcard.
    value_regex = ("^"
                   + ".*".join(re.escape(part)
                               for part in query_value.split("*"))
                   + "$")
    book_db, author_db = connect_to_mongo()
    target_db = book_db if db_type == "book" else author_db
    match_ids = []
    #  #### "$where" not supported in MongoDB Atlas free tier
    # in {"$where": f"this.amount.toString().match({value_regex})"}
    # so we will have to manually extract id of matching objects.
    for dic in target_db.find():
        instance_val = str(dic[attr])
        # re.search suffices for a yes/no match; findall built a list
        # only to test its truthiness.
        if re.search(value_regex, instance_val):
            match_ids.append(dic["_id"])
    return {"_id": {"$in": match_ids}}
    def test_valid_one_dict_update(self):
        """
        Test valid json file of a single dict can be used to
        update value of existing object in database.
        First modify one author's url and then recover it.
        """
        path = JSON_PATH + "legal_one_author.json"
        _, author_db = connect_to_mongo()
        # open read-only: "r+" needlessly demanded write permission
        with open(path, "r") as file:
            author_dic = json.load(file)

        author_id = author_dic["_id"]
        true_author_url = author_dic["author_url"]
        query_key = {"_id": author_id}
        update_val = {"$set": {"author_url": "duckduckgo.com"}}
        # the target author must already exist in the database
        self.assertTrue(list(author_db.find(query_key)) != [])

        # modify author url to duckduckgo
        author_db.update_one(query_key, update_val)
        author_url_modified = author_db.find_one(query_key)["author_url"]
        self.assertEqual(author_url_modified, "duckduckgo.com")

        # recover author url
        insert_into_db(path, db_type="author")
        author_url_recovered = author_db.find_one(query_key)["author_url"]
        self.assertEqual(author_url_recovered, true_author_url)
    def test_valid_many_dict_update(self):
        """
        Test valid json file of a list of dicts can be used to
        update value of existing object in database.
        First modify books' url and then recover them.
        """
        path = JSON_PATH + "legal_many_books.json"
        book_db, _ = connect_to_mongo()
        # open read-only: "r+" needlessly demanded write permission
        with open(path, "r") as file:
            book_dics = json.load(file)

        true_book_urls = []
        # first modify all target book_urls in db
        for book_dic in book_dics:
            book_id = book_dic["_id"]
            true_book_urls.append(book_dic["book_url"])
            query_key = {"_id": book_id}
            update_val = {"$set": {"book_url": "duckduckgo.com"}}
            book_db.update_one(query_key, update_val)
            self.assertTrue(list(book_db.find(query_key)) != [])

        # recover book url
        insert_into_db(path, db_type="book")
        for i, book_dic in enumerate(book_dics):
            book_id = book_dic["_id"]
            query_key = {"_id": book_id}
            # BUG FIX: the original assertTrue(x, msg) treated the expected
            # url as the assertion *message*, so it never compared anything.
            self.assertEqual(book_db.find_one(query_key)["book_url"],
                             true_book_urls[i])
Пример #4
0
def api_author():
    """
    Users can send GET/POST/PUT/DELETE requests to
    https://host/api/author?{_id, update_attr*}
    This function handles the backend behavior to response to these requests.
    """
    _, author_db = connect_to_mongo()
    if request.method == "GET":
        try:
            query_id = request.args["_id"]
        except KeyError:  # narrow catch; bare except also hid DB errors
            abort(400, "ID not provided.")
        # the find() call is outside the try so a DB failure is no longer
        # misreported as "ID not provided."
        result = list(author_db.find({"_id": query_id}))
        if not result:  # no matching
            abort(400, "No matching results in author database.")
        return jsonify(result)

    elif request.method == "POST":
        check_json_in_body(request)  # check json is properly passed
        dict_list = request.json  # This must be valid json as sanity test is done.
        if len(dict_list) > 1:
            abort(
                400, "Please send POST request"
                " to /api/authors to upload many authors.")
        updater.write_given_dict_list_to_db(dict_list, author_db)
        return """Status Code [200] : Upload succeeded."""

    elif request.method == "PUT":
        check_json_in_body(request)
        try:
            update_key = {"_id": request.json["_id"]}
        except (KeyError, TypeError):  # key missing, or body not a dict
            abort(400, "_id not provided.")

        matches = list(author_db.find(update_key))
        if not matches:
            abort(400, "Target instance ID not found in DB.")
        update_val = {k: v for k, v in request.json.items() if k != "_id"}
        for key in update_val.keys():  # check to-be-updated attribute exists
            if key not in updater.AUTHOR_ATTRS:
                abort(400, "Bad attempt to update non-existing attributes.")
        if update_val == {}:
            abort(400, "Empty update value.")
        author_db.update_one(update_key, {"$set": update_val})
        return """Status code [200] : Update succeeded."""

    elif request.method == "DELETE":
        try:
            query_id = request.args["_id"]
        except KeyError:  # narrow catch replaces bare except
            abort(400, "ID not provided.")
        if not list(author_db.find({"_id": query_id})):
            abort(400, "Target ID not in author_DB.")
        author_db.delete_one({"_id": query_id})
        return """Status code [200] : Delete succeeded."""

    else:
        abort(404, "Related resource not found")
Пример #5
0
def api_authors():
    """
    Handles clients requests to upload multiple authors at a time.
    """
    _, author_db = connect_to_mongo()
    check_json_in_body(request)  # reject requests without a proper json body
    dict_list = request.json  # sanity check above guarantees valid json
    # a single-author payload belongs to the /api/author endpoint instead
    if len(dict_list) == 1:
        abort(400,
              "Please send POST request"
              " to /api/author to upload a single author.")
    updater.write_given_dict_list_to_db(dict_list, author_db)
    return """Status Code [200] : Upload succeeded."""
Пример #6
0
    def test_client_search_with_id(self):
        """
        Test server can response to existing id correctly.
        :return:
        """
        book_db, author_db = connect_to_mongo()
        # exercise both endpoints with a known-existing id each
        cases = (("api/book", book_db, "58128"),
                 ("api/author", author_db, "45372"))
        for endpoint, db, target_id in cases:
            response = requests.get(HOST + endpoint,
                                    params={"_id": target_id})
            expected = list(db.find({"_id": target_id}))
            self.assertEqual(response.json(), expected)
Пример #7
0
def build_graph():
    """
    The enter interface for main-subcommand draw.

    Loads every book and author from MongoDB, builds url<->index lookup
    tables, constructs the adjacency matrix and draws the graph.
    :return: None
    """
    book_db, author_db = connect_to_mongo()
    books, authors = list(book_db.find({})), list(author_db.find({}))
    index2book = {i: book["book_url"] for i, book in enumerate(books)}
    book2index = {book: i for i, book in index2book.items()}
    # BUG FIX: author indices previously read "author_url" out of the
    # *books* list (wrong collection; IndexError when there are more
    # authors than books).
    index2author = {i: author["author_url"]
                    for i, author in enumerate(authors)}
    author2index = {author: i for i, author in index2author.items()}
    index_dict_wrapper = (index2book, book2index, index2author,
                          author2index)
    adjacency_matrix = build_adjacency_matrix(books, authors,
                                              index_dict_wrapper)
    draw_graph(adjacency_matrix, books, authors)
    def test_valid_single_dict_create(self):
        """
        Test valid json file that contains a single dictionary could be
        inserted, by deleting one existing author and reinsert it.
        """
        path = JSON_PATH + "legal_one_author.json"
        _, author_db = connect_to_mongo()
        with open(path, "r+") as file:
            author_dic = json.load(file)

        query_key = {"_id": author_dic["_id"]}
        # the author must exist before the test starts
        self.assertTrue(list(author_db.find(query_key)) != [])
        author_db.delete_one(query_key)
        self.assertTrue(list(author_db.find(query_key)) == [])

        # re-insert from the file and confirm the author is back
        insert_into_db(path, db_type="author")
        self.assertTrue(list(author_db.find(query_key)) != [])
Пример #9
0
    def test_server_upload_and_delete(self):
        """
        Test that server can handle upload and delete requests correctly.
        :return:
        """
        book_db, _ = connect_to_mongo()
        book_id = "3735293"
        query_key = {"_id": book_id}
        # the book starts out present in the database
        self.assertTrue(list(book_db.find(query_key)))

        # a delete request removes it
        requests.delete(HOST + "api/book", params={"_id": book_id})
        self.assertFalse(list(book_db.find(query_key)))

        # uploading the same book restores it
        post_body = load_json_file("legal_one_book.json")
        requests.post(HOST + "api/book", json=post_body)
        self.assertTrue(list(book_db.find(query_key)))
    def test_valid_many_dict_create(self):
        """
        Test valid json file that contains a list of dictionaries could be
        inserted, by deleting many existing books and reinserting.
        """
        path = JSON_PATH + "legal_many_books.json"
        book_db, _ = connect_to_mongo()
        # open read-only: "r+" needlessly demanded write permission
        with open(path, "r") as file:
            book_dics = json.load(file)

        # delete every target book, verifying state before and after
        for book_dic in book_dics:  # enumerate index was unused; dropped
            book_id = book_dic["_id"]
            self.assertTrue(list(book_db.find({"_id": book_id})) != [])
            book_db.delete_one({"_id": book_id})
            self.assertTrue(list(book_db.find({"_id": book_id})) == [])

        insert_into_db(path, "book")
        for book_dic in book_dics:
            book_id = book_dic["_id"]
            self.assertTrue(list(book_db.find({"_id": book_id})) != [])
Пример #11
0
def execute_parsed_query(query_units, logic_op):
    """
    Given a set of parsed query units and logic connection,
    search through the MongoDB and return matches as a list.
    :param query_units: One or two unit(s) of parsed query.
    :param logic_op: logic connection for query units
    :return: matching results as a list
    """
    assert len(query_units) != 0
    book_db, author_db = connect_to_mongo()
    first_unit = query_units[0]
    db_type = first_unit[0]
    target_db = book_db if db_type == "book" else author_db

    # single-unit query: no logic operator involved
    if len(query_units) == 1:
        return list(target_db.find(convert_to_mongo_query_key(first_unit)))

    # two units must be joined by an explicit AND / OR
    assert logic_op != "NA"
    unit_keys = [convert_to_mongo_query_key(unit)
                 for unit in query_units[:2]]
    joiner = "$and" if logic_op == "AND" else "$or"
    return list(target_db.find({joiner: unit_keys}))
Пример #12
0
def insert_into_db(src_json, db_type):
    """
    Safely insert the entities from json file into remote mongoDB.
    Json files are required to:
    (1) The json file exists;
    (2) Be in good shape, parse-able (either one dic or list of dic);
    (3) Every single stored object has no missing keys as stated above;
    (4) Not existed in the database;
    to be inserted into the database.
    :param src_json: the NAME json file, stored in the ../JSON directory.
                     Do not add path!
    :param db_type: either book or author
    """
    is_book = db_type == "book"
    book_db, author_db = connect_to_mongo()
    required_attrs = BOOK_ATTRS if is_book else AUTHOR_ATTRS
    target_db = book_db if is_book else author_db

    dictionary_list = load_json_file(src_json)
    # reject payloads with missing attributes before touching the DB
    check_missing_attributes(dictionary_list, required_attrs)
    # safe to write now
    write_given_dict_list_to_db(dictionary_list, target_db)
Пример #13
0
    def test_client_elastic_search(self):
        """
        Test server can handle valid elastic search requests correctly.
        :return:
        """
        book_db, _ = connect_to_mongo()

        # AND query over two bounds on the same attribute
        query1 = "book.rating_count : > 500 AND book.rating_count : < 1000"
        response1 = requests.get(HOST + "api/search", params={"q": query1})
        mongo_query1 = {"$and": [{"rating_count": {"$gt": 500}},
                                 {"rating_count": {"$lt": 1000}}]}
        self.assertEqual(response1.json(), list(book_db.find(mongo_query1)))

        # OR query across two different attributes
        query2 = "book.review_count : < 30 OR book.rating_value : < 3.5"
        response2 = requests.get(HOST + "api/search", params={"q": query2})
        mongo_query2 = {"$or": [{"review_count": {"$lt": 30}},
                                {"rating_value": {"$lt": 3.5}}]}
        self.assertEqual(response2.json(), list(book_db.find(mongo_query2)))
def scrape_start(is_new,
                 start_url,
                 max_book=200,
                 max_author=50,
                 progress_dir=None):
    """
    Scraping either from new url or continue last progress.
    :param is_new: whether there is a new starting url
    :param start_url: if is_new, then start url should be provided
    :param max_book: max number of books to scrape
    :param max_author: max number of author to scrape
    :param progress_dir: the directory of previously saved progress
    """
    bfs_queue, visited_books, visited_authors =\
        construct_bfs_info(is_new, start_url, progress_dir)
    book_db, author_db = connect_to_mongo()
    continuous_failure = 0
    while len(bfs_queue) != 0:
        # NOTE(review): Collection.count() is deprecated in newer pymongo
        # (count_documents({}) is the replacement) — confirm driver version.
        book_recorded = book_db.count()
        author_recorded = author_db.count()
        if book_recorded >= max_book and author_recorded >= max_author:
            print(f"currently there are {book_recorded} books,"
                  f" and {author_recorded} authors recorded."
                  f" Both max criterions are reached.\n"
                  f" Set larger max_author or max_book"
                  f" to continue scraping")
            break  # scraping done
        book_url = bfs_queue.pop(0)
        if book_url in visited_books:
            continue  # book already visited

        try:
            # scrape the information of the current book
            print(f"\n\n\n\n Working on {book_recorded + 1} / {max_book} -"
                  f" Scraping {book_url}\n" + SEP)
            book_dict = book_scraper.scrape_book(book_url)
            assert book_dict is not None

            book_id = book_dict.get("book_id")
            assert book_id is not None  # use book_id as storage key
            assert book_dict.get("book_title") is not None
            author_url = book_dict.get("author_url")
            assert author_url is not None  # make sure author is found

            book_dict["_id"] = book_id
            if not list(book_db.find({"_id": book_id})):
                book_db.insert_one(book_dict)  # make sure no duplicate

            visited_books.add(book_url)
            bfs_queue.extend(book_dict["similar_book_urls"])

            # scrape the information of author of the book
            if author_url in visited_authors:
                print("Author already recorded.")
                print(f"Currently recorded {book_recorded + 1} books,"
                      f"and {author_recorded} authors.")
                continuous_failure = 0
                continue

            sleep(3)  # don't make the IP got blocked
            author_dict = author_scraper.scrape_author(author_url)
            assert author_dict is not None, "author_dict is None"
            author_id = author_dict.get("author_id")
            assert author_dict.get("author_name") is not None,\
                "author_name is None"  # indicates scraping failed
            author_dict["_id"] = author_id

            if not list(author_db.find({"_id": author_id
                                        })):  # make sure no duplicate
                author_db.insert_one(author_dict)
            visited_authors.add(author_url)

            # update bfs queue and visited records only if scraping succeeded
            save_progress(bfs_queue, visited_books, visited_authors,
                          progress_dir)
            continuous_failure = 0  # reset failure count
            sleep(3)
            print(f"Currently recorded {book_recorded + 1} books, "
                  f"and {author_recorded + 1} authors.")

        # BUG FIX: the bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit (making Ctrl-C unable to stop the scraper) and hid the
        # failure reason entirely.
        except Exception as exc:
            continuous_failure += 1
            if continuous_failure >= 5:
                break  # out IP are likely to be blocked
            print("Scraping failed ...")
            print(f"Reason: {exc!r}")