def compare_t1_t2(crawl):
    """Reconcile the two non-indexed snapshots (T1 and T2) stored in Mongo.

    Ids found only in ``COLLECTIONS_NONE_INDEXED_T2`` are saved with
    ``MODE_ALL`` and logged as ``'new'``. Ids found only in
    ``COLLECTIONS_NONE_INDEXED_T1`` are downloaded again one by one and
    updated from the ``<doc>`` element of the response. URLs whose response
    contains no ``<doc>`` element are appended to ``urlsError.txt`` inside
    ``BVSALUD_DOWNLOADS_PATH``.

    Args:
        crawl: Object exposing ``get_base_url(key)``. The value returned for
            ``"url_for_id"`` is prefixed to the document id to build the
            download URL.

    Returns:
        Always ``True``.
    """
    path_url_error = os.path.join(BVSALUD_DOWNLOADS_PATH, "urlsError.txt")

    # The context manager closes (and flushes) the error log even when a
    # Mongo, parsing or network call raises half-way through the run.
    with open(path_url_error, 'w', encoding='utf-8') as error_file:
        error_file.write("No documents in urls")

        list_ids_t1 = Mongo.get_all_ids_list(COLLECTIONS_NONE_INDEXED_T1)
        list_ids_t2 = Mongo.get_all_ids_list(COLLECTIONS_NONE_INDEXED_T2)
        list_new_ids = list(set(list_ids_t2) - set(list_ids_t1))
        list_modified_ids = list(set(list_ids_t1) - set(list_ids_t2))

        print("\nNew records: ", len(list_new_ids), "\n")
        for new_id in list_new_ids:
            document_t2 = Mongo.get_document(COLLECTIONS_NONE_INDEXED_T2, new_id)
            try:
                print("New Document <<", document_t2['_id'],
                      ">>\tmh: ", document_t2['mh'])
                print()
                Mongo.save_dict_to_mongo(document_t2, MODE_ALL)
                Mongo.save_to_mongo_updated_info(new_id, 'new', document_t2['db'])
            except (TypeError, AttributeError) as e:
                Mongo.save_exception_to_mongo(
                    new_id, 'Saveing new none indexed document into mongo',
                    new_id, str(e))

        print("\nRecords to modify: ", len(list_modified_ids), "\n")
        for old_id in list_modified_ids:
            document_t1 = Mongo.get_document(COLLECTIONS_NONE_INDEXED_T1, old_id)
            print("\nDocument to modify: ", document_t1['_id'])

            if document_t1['db'] == 'IBECS':
                doc_id = Parse.find_id_by_alternate_id(old_id)
            else:
                doc_id = old_id
            base_url = crawl.get_base_url("url_for_id")
            url = base_url + doc_id

            # NOTE(review): this retries forever on any error, as the original
            # did; a permanently invalid URL will never leave this loop.
            while True:
                try:
                    # Read the payload inside the ``with`` so the HTTP
                    # response is closed instead of being leaked.
                    with urlopen(url) as response:
                        xml = response.read()
                except Exception as err:
                    print("Error: ", err)
                    time.sleep(90)
                else:
                    time.sleep(2)
                    break

            bsObj = BeautifulSoup(xml, features='lxml')
            document_xml = bsObj.find('doc')

            if document_xml is None:
                print(f"Error: << id >> \n{url}")
                try:
                    error_file.write("\n" + url)
                except (OSError, ValueError) as err:
                    # Logging the URL is best-effort; report and carry on.
                    print("Error: ", err)
                continue

            # Reset per iteration so the handler below never reports the id of
            # a previously processed document (or hits an unbound name).
            document_dict = None
            try:
                document_dict = Parse.xml_to_dictionary(document_xml)
                print("Updating document: ", doc_id)
                Mongo.update_modified(doc_id, document_dict['_id'],
                                      document_dict['alternate_id'],
                                      document_dict['mh'], document_dict['sh'])
                Mongo.save_to_mongo_updated_info(
                    document_dict['_id'], 'update', document_dict['db'])
                print("Updated\n")
            except Exception as e:
                print("Error: ", e)
                failed_id = doc_id
                if isinstance(document_dict, dict):
                    failed_id = document_dict.get('_id', doc_id)
                Mongo.save_exception_to_mongo(
                    failed_id, 'Update information from single <doc>',
                    url, str(e))

    return True
def compare_t1_t2():
    """Reconcile the two non-indexed snapshots (T1 and T2) stored in Mongo.

    The base download URL is read from the ``"url_for_id"`` key of the JSON
    file at ``PATH_URL_JSON``; a hard-coded BVSalud URL is used when the key
    is missing.

    Ids found only in ``COLLECTIONS_NONE_INDEXED_T2`` are saved with
    ``MODE_INDEXED`` and logged as ``'new'``. Ids found only in
    ``COLLECTIONS_NONE_INDEXED_T1`` are downloaded again (two attempts) and
    the stored document is replaced with the parsed ``<doc>`` element. When
    no ``<doc>`` can be obtained, a stub ``{'_id': doc_id}`` is saved with
    ``MODE_PENDING``.

    Returns:
        Always ``True``.
    """
    max_attempts = 2

    # Close the config file as soon as it is parsed instead of leaking it.
    with open(PATH_URL_JSON, "r", encoding="utf-8") as json_data:
        base_dictionary = json.load(json_data)
    try:
        base_url = base_dictionary["url_for_id"]
    except (KeyError, TypeError):
        base_url = "http://pesquisa.bvsalud.org/portal/?output=xml&lang=es&from=&sort=&format=&count=&fb=&page=1&index=tw&q=id%3A"

    list_ids_t1 = Mongo.get_all_ids_list(COLLECTIONS_NONE_INDEXED_T1)
    list_ids_t2 = Mongo.get_all_ids_list(COLLECTIONS_NONE_INDEXED_T2)
    list_new_ids = list(set(list_ids_t2) - set(list_ids_t1))
    list_modified_ids = list(set(list_ids_t1) - set(list_ids_t2))

    new_records_len = len(list_new_ids)
    print("\nNew records: ", new_records_len, "\n")
    for i, new_id in enumerate(list_new_ids):
        document_t2 = Mongo.get_document(COLLECTIONS_NONE_INDEXED_T2, new_id)
        try:
            # The counter shows how many new records are still left.
            print("\n", new_records_len - i, ") New Document <<",
                  document_t2['_id'], ">>\tmh: ", document_t2['mh'])
            print()
            Mongo.save_dict_to_mongo(document_t2, MODE_INDEXED)
            Mongo.save_to_mongo_updated_info(new_id, 'new', document_t2['db'])
        except (TypeError, AttributeError) as e:
            Mongo.save_exception_to_mongo(
                new_id, 'Saveing new none indexed document into mongo',
                new_id, str(e))

    modify_records_len = len(list_modified_ids)
    print("\nRecords to modify: ", modify_records_len, "\n")
    for i, old_id in enumerate(list_modified_ids):
        document_t1 = Mongo.get_document(COLLECTIONS_NONE_INDEXED_T1, old_id)
        print("\n", modify_records_len - i, "-> Document to modify: ",
              document_t1['_id'])

        doc_id = old_id
        if document_t1['db'] == 'IBECS':
            try:
                doc_id = Parse.find_id_by_alternate_id(document_t1['_id'])
            except Exception:
                # Fall back to the stored id when the lookup fails.
                doc_id = old_id
                print("Error: <<Finding id by alternate id >>")
        url = base_url + doc_id

        # ``xml`` is reset for every document: if both attempts fail we must
        # not parse the response left over from the previous iteration.
        xml = None
        for attempt in range(1, max_attempts + 1):
            try:
                # Read inside the ``with`` so the HTTP response gets closed.
                with urlopen(url) as response:
                    xml = response.read()
            except Exception as err:
                print(attempt, ") Error: xml = urlopen(url) 170: ", err)
                print("Sleeping: ", SLEEP_TIME2, "seconds")
                time.sleep(SLEEP_TIME2)
            else:
                time.sleep(2)
                break

        document_xml = None
        if xml is not None:
            bsObj = BeautifulSoup(xml, features='lxml')
            document_xml = bsObj.find('doc')

        if document_xml is None:
            # Covers both "download failed" and "response had no <doc>".
            tmp_dict = {'_id': doc_id}
            Mongo.save_dict_to_mongo(tmp_dict, MODE_PENDING)
            print(f"Error: No Document Found: {url}")
            continue

        # Reset per iteration so the handler below never reports the id of a
        # previously processed document (or hits an unbound name).
        document_dict = None
        try:
            document_dict = Parse.xml_to_dictionary(document_xml)
            print("Updating document: ", doc_id)
            Mongo.replace_doc_to_mongo(document_dict, document_t1['_id'])
            Mongo.save_to_mongo_updated_info(
                document_dict['_id'], 'update', document_dict['db'])
            print("->> Updated!\n")
        except Exception as e:
            print("Error (while Mongo.replace_do_to_mongo(document_dict,document_t1['_id'])): ", e)
            failed_id = doc_id
            if isinstance(document_dict, dict):
                failed_id = document_dict.get('_id', doc_id)
            Mongo.save_exception_to_mongo(
                failed_id, 'Update information from single <doc>',
                url, str(e))

    return True