Пример #1
0
    def compare_t1_t2(crawl):
        """Reconcile the T1 and T2 non-indexed collections stored in Mongo.

        Ids found only in ``COLLECTIONS_NONE_INDEXED_T2`` are handled as new
        records: the T2 document is saved with ``MODE_ALL`` and logged as
        ``'new'``.

        Ids found only in ``COLLECTIONS_NONE_INDEXED_T1`` are handled as
        records to modify: each one is downloaded again from the
        ``"url_for_id"`` base URL, parsed, and passed to
        ``Mongo.update_modified``. When the response contains no ``<doc>``
        element, the URL is appended to ``urlsError.txt`` inside
        ``BVSALUD_DOWNLOADS_PATH``.

        Args:
            crawl: Object whose ``get_base_url("url_for_id")`` returns the
                URL prefix that a document id is appended to.

        Returns:
            bool: Always ``True``.
        """
        path_url_error = os.path.join(BVSALUD_DOWNLOADS_PATH, "urlsError.txt")

        # The context manager closes the error log even when a Mongo, network
        # or parsing call raises part-way through the run.
        with open(path_url_error, 'w') as error_log:
            error_log.write("No documents in urls")

            ids_t1 = set(Mongo.get_all_ids_list(COLLECTIONS_NONE_INDEXED_T1))
            ids_t2 = set(Mongo.get_all_ids_list(COLLECTIONS_NONE_INDEXED_T2))

            new_ids = ids_t2 - ids_t1
            modified_ids = ids_t1 - ids_t2

            print("\nNew records: ", len(new_ids), "\n")

            for record_id in new_ids:
                document_t2 = Mongo.get_document(
                    COLLECTIONS_NONE_INDEXED_T2, record_id)
                try:
                    print("New Document <<", document_t2['_id'], ">>\tmh: ",
                          document_t2['mh'])
                    print()
                    Mongo.save_dict_to_mongo(document_t2, MODE_ALL)
                    Mongo.save_to_mongo_updated_info(
                        record_id, 'new', document_t2['db'])
                except (TypeError, AttributeError) as e:
                    Mongo.save_exception_to_mongo(
                        record_id,
                        'Saveing new none indexed document into mongo',
                        record_id, str(e))

            print("\nRecords to modify: ", len(modified_ids), "\n")
            for record_id in modified_ids:
                document_t1 = Mongo.get_document(
                    COLLECTIONS_NONE_INDEXED_T1, record_id)

                print("\nDocument to modify: ", document_t1['_id'])
                if document_t1['db'] == 'IBECS':
                    doc_id = Parse.find_id_by_alternate_id(record_id)
                else:
                    doc_id = record_id
                base_url = crawl.get_base_url("url_for_id")
                url = base_url + doc_id

                # NOTE: retries forever, pausing 90 s after every failure; a
                # permanently unreachable URL blocks the whole run.
                while True:
                    try:
                        xml = urlopen(url)
                        time.sleep(2)
                        break
                    except Exception as err:
                        print("Error: ", err)
                        time.sleep(90)
                bsObj = BeautifulSoup(xml, features='lxml')
                document_xml = bsObj.find('doc')

                if document_xml is not None:
                    # Id used when reporting a failure. Starts as the id we
                    # requested so the handler never touches a dictionary
                    # that was not built (or belongs to the previous record).
                    failed_id = doc_id
                    try:
                        document_dict = Parse.xml_to_dictionary(document_xml)
                        print("Updating document: ", doc_id)
                        failed_id = document_dict['_id']
                        Mongo.update_modified(doc_id, document_dict['_id'],
                                              document_dict['alternate_id'],
                                              document_dict['mh'],
                                              document_dict['sh'])
                        Mongo.save_to_mongo_updated_info(document_dict['_id'],
                                                         'update',
                                                         document_dict['db'])
                        print("Updated\n")
                    except Exception as e:
                        print("Error: ", e)
                        Mongo.save_exception_to_mongo(
                            failed_id,
                            'Update information from single <doc>', url,
                            str(e))
                else:
                    print(f"Error: << {doc_id} >> {url}")
                    # Logging the URL is best-effort: a failed write is
                    # reported but must not abort the remaining records.
                    try:
                        error_log.write("\n" + url)
                    except OSError as err:
                        print("Error: ", err)
        return True
Пример #2
0
    def compare_t1_t2():
        """Reconcile the T1 and T2 non-indexed collections stored in Mongo.

        The base URL for single-document downloads is read from the
        ``"url_for_id"`` key of the JSON file at ``PATH_URL_JSON``; a
        hard-coded BVSalud search URL is used when that key is unavailable.

        Ids found only in ``COLLECTIONS_NONE_INDEXED_T2`` are handled as new
        records: the T2 document is saved with ``MODE_INDEXED`` and logged as
        ``'new'``.

        Ids found only in ``COLLECTIONS_NONE_INDEXED_T1`` are handled as
        records to modify: each one is downloaded again (at most two
        attempts), parsed, and written with ``Mongo.replace_doc_to_mongo``.
        When the response contains no ``<doc>`` element, ``{'_id': doc_id}``
        is saved with ``MODE_PENDING``. When every download attempt fails,
        the failure is recorded with ``Mongo.save_exception_to_mongo`` and
        the record is skipped.

        Returns:
            bool: Always ``True``.
        """
        # Close the configuration file as soon as it has been parsed.
        with open(PATH_URL_JSON, "r") as json_data:
            base_dictionary = json.load(json_data)
        try:
            base_url = base_dictionary["url_for_id"]
        except (KeyError, TypeError):
            base_url = "http://pesquisa.bvsalud.org/portal/?output=xml&lang=es&from=&sort=&format=&count=&fb=&page=1&index=tw&q=id%3A"

        ids_t1 = set(Mongo.get_all_ids_list(COLLECTIONS_NONE_INDEXED_T1))
        ids_t2 = set(Mongo.get_all_ids_list(COLLECTIONS_NONE_INDEXED_T2))

        new_ids = ids_t2 - ids_t1
        modified_ids = ids_t1 - ids_t2

        new_records_len = len(new_ids)
        print("\nNew records: ", new_records_len, "\n")

        for i, record_id in enumerate(new_ids):
            document_t2 = Mongo.get_document(
                COLLECTIONS_NONE_INDEXED_T2, record_id)
            try:
                print("\n", new_records_len - i, ") New Document <<",
                      document_t2['_id'], ">>\tmh: ", document_t2['mh'])
                print()
                Mongo.save_dict_to_mongo(document_t2, MODE_INDEXED)
                Mongo.save_to_mongo_updated_info(
                    record_id, 'new', document_t2['db'])
            except (TypeError, AttributeError) as e:
                Mongo.save_exception_to_mongo(
                    record_id,
                    'Saveing new none indexed document into mongo',
                    record_id, str(e))

        modify_records_len = len(modified_ids)
        print("\nRecords to modify: ", modify_records_len, "\n")

        max_attempts = 2
        for i, record_id in enumerate(modified_ids):
            document_t1 = Mongo.get_document(
                COLLECTIONS_NONE_INDEXED_T1, record_id)

            print("\n", modify_records_len - i, "-> Document to modify: ",
                  document_t1['_id'])
            if document_t1['db'] == 'IBECS':
                try:
                    doc_id = Parse.find_id_by_alternate_id(document_t1['_id'])
                except Exception:
                    # Fall back to the stored id when the lookup fails.
                    doc_id = record_id
                    print("Error: <<Finding id by alternate id >>")
            else:
                doc_id = record_id
            url = base_url + doc_id

            last_error = None
            for attempt in range(1, max_attempts + 1):
                try:
                    xml = urlopen(url)
                    time.sleep(2)
                    break
                except Exception as err:
                    # ``err`` is unbound once the handler exits, so keep a
                    # reference for the failure report below.
                    last_error = err
                    print(attempt, ") Error: xml = urlopen(url) 170: ", err)
                    print("Sleeping: ", SLEEP_TIME2, "seconds")
                    time.sleep(SLEEP_TIME2)
            else:
                # Every attempt failed, so there is no response for this
                # record. Skip it instead of parsing a missing (or the
                # previous record's) response.
                Mongo.save_exception_to_mongo(
                    doc_id, 'Downloading single <doc>', url, str(last_error))
                continue

            bsObj = BeautifulSoup(xml, features='lxml')
            document_xml = bsObj.find('doc')

            if document_xml is not None:
                # Id used when reporting a failure. Starts as the id we
                # requested so the handler never touches a dictionary that
                # was not built (or belongs to the previous record).
                failed_id = doc_id
                try:
                    document_dict = Parse.xml_to_dictionary(document_xml)
                    print("Updating document: ", doc_id)
                    Mongo.replace_doc_to_mongo(
                        document_dict, document_t1['_id'])
                    failed_id = document_dict['_id']
                    Mongo.save_to_mongo_updated_info(
                        document_dict['_id'], 'update', document_dict['db'])
                    print("->> Updated!\n")
                except Exception as e:
                    print("Error (while Mongo.replace_do_to_mongo(document_dict,document_t1['_id'])): ", e)
                    Mongo.save_exception_to_mongo(
                        failed_id, 'Update information from single <doc>',
                        url, str(e))
            else:
                tmp_dict = {'_id': doc_id}
                Mongo.save_dict_to_mongo(tmp_dict, MODE_PENDING)
                print(f"Error: No Document Found: {url}")

        return True