Exemplo n.º 1
0
    def get_pending_documents():
        """Retry every document whose id is stored in the pending collection.

        For each id returned by ``Mongo.get_all_ids_list(COLLECTION_PENDING)``
        the id is resolved through ``Parse.find_id_by_alternate_id`` (falling
        back to the stored id when that call raises), ``base_url + id`` is
        downloaded (at most two attempts) and the response is searched for a
        ``<doc>`` element.

        * When a ``<doc>`` is found it is converted with
          ``Parse.xml_to_dictionary``, written with
          ``Mongo.replace_doc_to_mongo``, recorded as an ``'update'`` and the
          id is deleted from the pending collection. Any exception in this
          step is printed and stored with ``Mongo.save_exception_to_mongo``.
        * When no ``<doc>`` is found, or every download attempt failed, the
          id is saved again with ``MODE_PENDING``.

        The base URL is read from the ``"url_for_id"`` key of the JSON file at
        ``PATH_URL_JSON``; a hard-coded default is used when the key cannot be
        read.
        """
        print("Finding all pending documents")
        # Context manager so the config file handle is always released.
        with open(PATH_URL_JSON, "r") as json_data:
            base_dictionary = json.load(json_data)
        try:
            base_url = base_dictionary["url_for_id"]
        except (KeyError, TypeError):
            # Key missing, or the JSON root is not a mapping: use the default.
            base_url = "http://pesquisa.bvsalud.org/portal/?output=xml&lang=es&from=&sort=&format=&count=&fb=&page=1&index=tw&q=id%3A"

        ids_list = Mongo.get_all_ids_list(COLLECTION_PENDING)

        modify_records_len = len(ids_list)
        print("Total pending documents: ", modify_records_len)
        for i, old_id in enumerate(ids_list):
            print("\n", modify_records_len - i, "-> Document to modify: ", old_id)
            try:
                new_id = Parse.find_id_by_alternate_id(old_id)
            except Exception as err:
                new_id = old_id
                print(f"Error ({err}): <<Finding id by alternate id >>")

            url = base_url + new_id

            # Reset for every document: if both attempts fail there must be no
            # leftover payload from the previous iteration (and no NameError
            # on the very first one).
            xml = None
            for attempt in range(1, 3):
                try:
                    # Read inside the context manager so the HTTP response is
                    # closed even when reading fails.
                    with urlopen(url) as response:
                        xml = response.read()
                except Exception as err:
                    print(attempt, ") Error: xml = urlopen(url) 170: ", err)
                    print("Sleeping: ", SLEEP_TIME2, "seconds")
                    time.sleep(SLEEP_TIME2)
                else:
                    # Short pause after a successful request.
                    time.sleep(2)
                    break

            # A failed download is handled like "no <doc> in the response".
            document_xml = None
            if xml is not None:
                document_xml = BeautifulSoup(xml, features='lxml').find('doc')

            if document_xml is not None:
                document_dict = None
                try:
                    document_dict = Parse.xml_to_dictionary(document_xml)
                    print("Updating document: ", new_id)
                    Mongo.replace_doc_to_mongo(document_dict, old_id)
                    Mongo.save_to_mongo_updated_info(document_dict['_id'], 'update', document_dict['db'])
                    print("->> Updated!\n")
                    Mongo.delete_document_in_pending_coll(old_id)
                except Exception as e:
                    print("Error: (while in Parse_file.py >> Mongo.replace_do_to_mongo(document_dict,old_id): ", e)
                    # The parsed document may not exist (parsing itself
                    # failed) or may lack '_id'; report the pending id then.
                    failed_id = old_id
                    if isinstance(document_dict, dict):
                        failed_id = document_dict.get('_id', old_id)
                    Mongo.save_exception_to_mongo(failed_id, "while in Parse_file.py >> Mongo.replace_do_to_mongo(document_dict,old_id)", url, str(e))
            else:
                tmp_dict = {'_id': old_id}
                Mongo.save_dict_to_mongo(tmp_dict, MODE_PENDING)
                print(f"Error: No Document Found: {url}")
Exemplo n.º 2
0
    def compare_t1_t2(crawl):
        """Compare the T1 and T2 non-indexed collections and sync the results.

        Ids present in ``COLLECTIONS_NONE_INDEXED_T2`` but not in
        ``COLLECTIONS_NONE_INDEXED_T1`` are treated as new: the T2 document is
        saved with ``MODE_ALL`` and recorded as ``'new'``.

        Ids present in T1 but not in T2 are treated as records to modify: the
        document is downloaded from ``crawl.get_base_url("url_for_id") + id``
        (for ``'IBECS'`` documents the id is first resolved with
        ``Parse.find_id_by_alternate_id``), parsed, and passed to
        ``Mongo.update_modified``. URLs whose response contains no ``<doc>``
        element are appended to ``urlsError.txt`` inside
        ``BVSALUD_DOWNLOADS_PATH``.

        Args:
            crawl: object providing ``get_base_url(key)``.

        Returns:
            ``True`` once both id lists have been processed.
        """
        path_url_error = os.path.join(BVSALUD_DOWNLOADS_PATH, "urlsError.txt")

        # Context manager so the error file is flushed and closed even if a
        # Mongo or parsing call raises part-way through.
        with open(path_url_error, 'w') as error_file:
            error_file.write("No documents in urls")
            list_ids_t1 = Mongo.get_all_ids_list(COLLECTIONS_NONE_INDEXED_T1)
            list_ids_t2 = Mongo.get_all_ids_list(COLLECTIONS_NONE_INDEXED_T2)

            list_new_ids = list(set(list_ids_t2) - set(list_ids_t1))
            list_modified_ids = list(set(list_ids_t1) - set(list_ids_t2))

            print("\nNew records: ", len(list_new_ids), "\n")

            for new_id in list_new_ids:
                document_t2 = Mongo.get_document(COLLECTIONS_NONE_INDEXED_T2,
                                                 new_id)
                try:
                    print("New Document <<", document_t2['_id'], ">>\tmh: ",
                          document_t2['mh'])
                    print()
                    Mongo.save_dict_to_mongo(document_t2, MODE_ALL)
                    Mongo.save_to_mongo_updated_info(new_id, 'new',
                                                     document_t2['db'])
                except (TypeError, AttributeError) as e:
                    Mongo.save_exception_to_mongo(
                        new_id, 'Saveing new none indexed document into mongo',
                        new_id, str(e))

            print("\nRecords to modify: ", len(list_modified_ids), "\n")
            for modified_id in list_modified_ids:
                document_t1 = Mongo.get_document(COLLECTIONS_NONE_INDEXED_T1,
                                                 modified_id)

                print("\nDocument to modify: ", document_t1['_id'])
                doc_id = modified_id
                if document_t1['db'] == 'IBECS':
                    try:
                        doc_id = Parse.find_id_by_alternate_id(modified_id)
                    except Exception as err:
                        # One failed lookup should not abort the whole run;
                        # fall back to the stored id.
                        doc_id = modified_id
                        print(f"Error ({err}): <<Finding id by alternate id >>")
                base_url = crawl.get_base_url("url_for_id")
                url = base_url + doc_id

                # NOTE(review): this retries forever (90 s between attempts);
                # a permanently failing URL blocks the run. Kept as-is because
                # callers may rely on it - consider bounding the retries.
                while True:
                    try:
                        # Read inside the context manager so the HTTP response
                        # is closed even when reading fails.
                        with urlopen(url) as response:
                            xml = response.read()
                    except Exception as err:
                        print("Error: ", err)
                        time.sleep(90)
                    else:
                        # Short pause after a successful request.
                        time.sleep(2)
                        break
                document_xml = BeautifulSoup(xml, features='lxml').find('doc')

                if document_xml is not None:
                    document_dict = None
                    try:
                        document_dict = Parse.xml_to_dictionary(document_xml)
                        print("Updating document: ", doc_id)
                        Mongo.update_modified(doc_id, document_dict['_id'],
                                              document_dict['alternate_id'],
                                              document_dict['mh'],
                                              document_dict['sh'])
                        Mongo.save_to_mongo_updated_info(document_dict['_id'],
                                                         'update',
                                                         document_dict['db'])
                        print("Updated\n")
                    except Exception as e:
                        print("Error: ", e)
                        # The parsed document may not exist (parsing itself
                        # failed) or may lack '_id'; report the looked-up id.
                        failed_id = doc_id
                        if isinstance(document_dict, dict):
                            failed_id = document_dict.get('_id', doc_id)
                        Mongo.save_exception_to_mongo(
                            failed_id,
                            'Update information from single <doc>', url,
                            str(e))
                else:
                    # The original literal lacked the f prefix, so the URL was
                    # never interpolated into the message.
                    print(f"Error: << id >> {url}")
                    try:
                        error_file.write("\n" + url)
                    except (OSError, ValueError) as err:
                        # Best-effort log: report the failure, keep going.
                        print("Error: could not write to urlsError.txt: ", err)
        return True
Exemplo n.º 3
0
    def compare_t1_t2():
        """Compare the T1 and T2 non-indexed collections and sync the results.

        Ids present in ``COLLECTIONS_NONE_INDEXED_T2`` but not in
        ``COLLECTIONS_NONE_INDEXED_T1`` are treated as new: the T2 document is
        saved with ``MODE_INDEXED`` and recorded as ``'new'``.

        Ids present in T1 but not in T2 are treated as records to modify: the
        document is downloaded from ``base_url + id`` (at most two attempts;
        for ``'IBECS'`` documents the id is first resolved with
        ``Parse.find_id_by_alternate_id``), parsed, and written with
        ``Mongo.replace_doc_to_mongo``. When the response has no ``<doc>``
        element, or every download attempt failed, the id is saved with
        ``MODE_PENDING`` instead.

        The base URL is read from the ``"url_for_id"`` key of the JSON file at
        ``PATH_URL_JSON``; a hard-coded default is used when the key cannot be
        read.

        Returns:
            ``True`` once both id lists have been processed.
        """
        # Context manager so the config file handle is always released.
        with open(PATH_URL_JSON, "r") as json_data:
            base_dictionary = json.load(json_data)
        try:
            base_url = base_dictionary["url_for_id"]
        except (KeyError, TypeError):
            # Key missing, or the JSON root is not a mapping: use the default.
            base_url = "http://pesquisa.bvsalud.org/portal/?output=xml&lang=es&from=&sort=&format=&count=&fb=&page=1&index=tw&q=id%3A"

        list_ids_t1 = Mongo.get_all_ids_list(COLLECTIONS_NONE_INDEXED_T1)
        list_ids_t2 = Mongo.get_all_ids_list(COLLECTIONS_NONE_INDEXED_T2)

        list_new_ids = list(set(list_ids_t2) - set(list_ids_t1))
        list_modified_ids = list(set(list_ids_t1) - set(list_ids_t2))
        new_records_len = len(list_new_ids)
        print("\nNew records: ", new_records_len, "\n")

        for i, new_id in enumerate(list_new_ids):
            document_t2 = Mongo.get_document(COLLECTIONS_NONE_INDEXED_T2, new_id)
            try:
                print("\n", new_records_len - i, ") New Document <<", document_t2['_id'], ">>\tmh: ", document_t2['mh'])
                print()
                Mongo.save_dict_to_mongo(document_t2, MODE_INDEXED)
                Mongo.save_to_mongo_updated_info(new_id, 'new', document_t2['db'])
            except (TypeError, AttributeError) as e:
                Mongo.save_exception_to_mongo(new_id, 'Saveing new none indexed document into mongo', new_id, str(e))

        modify_records_len = len(list_modified_ids)
        print("\nRecords to modify: ", modify_records_len, "\n")
        for i, modified_id in enumerate(list_modified_ids):
            document_t1 = Mongo.get_document(COLLECTIONS_NONE_INDEXED_T1, modified_id)

            print("\n", modify_records_len - i, "-> Document to modify: ", document_t1['_id'])
            doc_id = modified_id
            if document_t1['db'] == 'IBECS':
                try:
                    doc_id = Parse.find_id_by_alternate_id(document_t1['_id'])
                except Exception:
                    # Lookup failed: fall back to the stored id.
                    doc_id = modified_id
                    print("Error: <<Finding id by alternate id >>")
            url = base_url + doc_id

            # Reset for every document: if both attempts fail there must be no
            # leftover payload from the previous iteration (and no NameError
            # on the very first one).
            xml = None
            for attempt in range(1, 3):
                try:
                    # Read inside the context manager so the HTTP response is
                    # closed even when reading fails.
                    with urlopen(url) as response:
                        xml = response.read()
                except Exception as err:
                    print(attempt, ") Error: xml = urlopen(url) 170: ", err)
                    print("Sleeping: ", SLEEP_TIME2, "seconds")
                    time.sleep(SLEEP_TIME2)
                else:
                    # Short pause after a successful request.
                    time.sleep(2)
                    break

            # A failed download is handled like "no <doc> in the response".
            document_xml = None
            if xml is not None:
                document_xml = BeautifulSoup(xml, features='lxml').find('doc')

            if document_xml is not None:
                document_dict = None
                try:
                    document_dict = Parse.xml_to_dictionary(document_xml)
                    print("Updating document: ", doc_id)
                    Mongo.replace_doc_to_mongo(document_dict, document_t1['_id'])
                    Mongo.save_to_mongo_updated_info(document_dict['_id'], 'update', document_dict['db'])
                    print("->> Updated!\n")
                except Exception as e:
                    print("Error (while Mongo.replace_do_to_mongo(document_dict,document_t1['_id'])): ", e)
                    # The parsed document may not exist (parsing itself
                    # failed) or may lack '_id'; report the looked-up id then.
                    failed_id = doc_id
                    if isinstance(document_dict, dict):
                        failed_id = document_dict.get('_id', doc_id)
                    Mongo.save_exception_to_mongo(failed_id, 'Update information from single <doc>', url, str(e))
            else:
                tmp_dict = {'_id': doc_id}
                Mongo.save_dict_to_mongo(tmp_dict, MODE_PENDING)
                print(f"Error: No Document Found: {url}")

        return True