def test_insert_document_and_check_conflict():
    """
    Insert a conflicting document and verify the conflict is reported.

    Loads the in_progress test data, changes the first text of the first
    document so it differs from what is stored in the prod collection, and
    runs insert_documents. The call is expected to report the document as a
    conflict. Afterwards the test checks that the changed document ended up
    in the prod collection and that util.check_manually_changed hands back
    the manual document instead of the prod one.
    """
    document_id = "295cc564fe771fbb92b3278a6eee2d5cbcae2606-3"
    correct_conflicts = [{
        "conflict_id": document_id,
        "title": " Velkommen til Trondheim kommune"
    }]

    # Use a context manager so the fixture file is closed even if parsing fails.
    with open("model/test/test_data/test_data_in_progress.json") as f:
        serialized_data = json.load(f)

    # Just to make sure the new document has changed we add a random number
    # here.
    random_text = "Inserted_document_website_change: " + str(
        random.randint(1, 100000))
    serialized_data[0]["content"]["texts"][0] = random_text

    # Check if the document was a conflict.
    conflict_ids = insert_documents(serialized_data)
    assert correct_conflicts[0]["conflict_id"] == conflict_ids[0][
        "conflict_id"]
    assert correct_conflicts[0]["title"] == conflict_ids[0]["title"]

    # Fetch the document from the prod collection.
    factory = ModelFactory.get_instance()
    util.set_db(factory)
    document = next(
        factory.get_database().get_collection("prod").find(
            {"id": document_id}),
        None)
    # Fail with a readable message instead of a TypeError on None below.
    assert document is not None, "document was not found in prod collection"
    assert document["content"]["texts"][0] == random_text

    # Get the manually changed document.
    manually_changed_doc = util.check_manually_changed(factory, document)

    # Check if we actually got the manually changed document.
    assert manually_changed_doc["content"]["texts"][0] == "El manual changos"
def insert_documents(data):
    """
    Insert scraped documents, record conflicts and promote them to prod.

    How we use MongoDB: we have 3 different collections:
      * "manual" for manual entries,
      * "prod" for production,
      * "in_progress" for the data that was just scraped.

    After scraping, all scraped data is added to "in_progress". Then, for
    every entry in "manual", that entry's id is used to query both "prod"
    and "in_progress". If the two "content" values differ, something changed
    since the last run and the manual entry needs to be updated, so the id
    is recorded as a conflict. When this is done, "in_progress" becomes the
    new "prod" (the previous "prod" is kept as "prod2").

    :param data: a list of serialized documents that should be inserted.
    :return: a list of {"conflict_id": ..., "title": ...} dicts, one per
        manual document whose prod and in_progress content differ.
    """
    factory = ModelFactory.get_instance()
    util.set_db(factory)
    database = factory.get_database()

    # Start from a clean slate; in_progress only ever holds the current run.
    database.drop_collection("in_progress")

    print('Starting insertion of {} documents'.format(len(data)))
    pbar = ProgressBar()
    for doc in pbar(data):
        factory.post_document(doc, "in_progress")
    # len(data) instead of the loop index: the index is unbound (NameError)
    # when data is empty.
    print('Successfully inserted {} documents'.format(len(data)))

    manual_documents = factory.get_collection("manual").find()

    # These are the IDs of the documents that are changed in manual and have
    # been changed since last time.
    conflict_ids = []
    for manual_document in manual_documents:
        if "id" not in manual_document:
            continue
        doc_id = manual_document["id"]

        # Flag the scraped document as having a manual counterpart.
        # NOTE(review): Collection.update is deprecated in PyMongo 3 and
        # removed in PyMongo 4; switch to update_one once the installed
        # PyMongo version is confirmed.
        database.get_collection("in_progress").update(
            {"id": doc_id}, {"$set": {
                "manually_changed": True
            }})

        prod_match_doc = next(
            factory.get_collection("prod").find({"id": doc_id}), None)
        in_progress_doc = next(
            factory.get_collection("in_progress").find({"id": doc_id}), None)

        # A conflict needs both versions to exist and to differ in content.
        if not (prod_match_doc and in_progress_doc):
            continue
        if prod_match_doc['content'] != in_progress_doc['content']:
            conflict_ids.append({
                "conflict_id": doc_id,
                "title": in_progress_doc["content"]["title"]
            })

    print("Conflict IDs are", conflict_ids)

    # Set ID to be unique.
    factory.get_collection("conflict_ids").create_index([("conflict_id", 1)],
                                                        unique=True)

    # Insert all the conflict ids into our collection.
    for conflict in conflict_ids:
        try:
            factory.post_document(conflict, "conflict_ids")
        except pymongo.errors.DuplicateKeyError:
            # Then we already know this is a conflict ID and should not be
            # added again to the list.
            pass

    # Delete the backup prod and rename prod to prod2 and then rename
    # in_progress to prod.
    database.drop_collection("prod2")
    try:
        database.get_collection("prod").rename("prod2")
    except pymongo.errors.OperationFailure:
        # Deliberately ignored; presumably raised when there is no prod
        # collection yet (e.g. on the very first run) - TODO confirm.
        pass
    database.get_collection("in_progress").rename("prod")

    # NOTE(review): in_progress has just been renamed to prod, so this first
    # call targets a collection that no longer holds the inserted documents;
    # kept as in the original - confirm it is intended.
    util.set_index("in_progress", factory)
    util.set_index("prod", factory)
    util.set_index("manual", factory)

    # Set query_text to be unique.
    factory.get_collection("unknown_queries").create_index(
        [("query_text", 1)], unique=True)

    return conflict_ids
import model.db_util as db_util import api.flask.flask_util as flask_util from model.ModelFactory import ModelFactory from model.keyword_gen import lemmatize_content_keywords import json from flask import request, Blueprint web_api = Blueprint('Website API', __name__, template_folder='templates') factory = ModelFactory.get_instance() db_util.set_db(factory) @web_api.route("/v1/web/conflict_ids", methods=["GET"]) def get_all_conflict_ids(): """ :return: a list of {"title" "...", "id": "..."} """ conflict_ids_docs = factory.get_collection("conflict_ids").find() conflict_ids = [] for conflict_id_doc in conflict_ids_docs: conflict_ids.append({"id": conflict_id_doc["conflict_id"], "title": conflict_id_doc["title"]}) return json.dumps(conflict_ids) @web_api.route("/v1/web/content/", methods=["GET"]) def get_content(): """ :return: the content of the prod document and manual document (if we have it)