import datetime
import logging
import os

# Project-local modules used below; the exact import paths depend on the
# package layout of the repository.
import chrono
import constants
import corpus_service
import io_service
import oaipmh_harvester
import repository_service
from metajson import Target


def conf_corpus(corpus, corpus_conf_dir_name):
    if not corpus:
        logging.error("Error: empty corpus")
    else:
        logging.info("init corpus: {}".format(corpus))
        if not corpus_conf_dir_name:
            corpus_conf_dir_name = corpus
        date_begin = datetime.datetime.now()

        # types
        results_types_common = conf_types(corpus, "common")
        results_types_corpus = conf_types(corpus, corpus_conf_dir_name)
        date_types = datetime.datetime.now()
        total_count = 0
        logging.info("# Import common types:")
        if results_types_common:
            for entry in results_types_common:
                total_count += 1
                logging.info("type_id: {}, _id: {}".format(entry["type_id"], entry["_id"]))
        else:
            logging.info("Empty common types")
        logging.info("# Import {} types:".format(corpus))
        if results_types_corpus:
            for entry in results_types_corpus:
                total_count += 1
                logging.info("type_id: {}, _id: {}".format(entry["type_id"], entry["_id"]))
        else:
            logging.info("Empty {} types".format(corpus))
        chrono.chrono_trace("conf_types", date_begin, date_types, total_count)

        # datafields
        results_fields_common = conf_fields(corpus, "common")
        # use the same conf dir fallback as for the types above
        results_fields_corpus = conf_fields(corpus, corpus_conf_dir_name)
        date_fields = datetime.datetime.now()
        total_count = 0
        logging.info("# Import common fields:")
        if results_fields_common:
            for entry in results_fields_common:
                total_count += 1
                logging.info("rec_type: {}, _id: {}".format(entry["rec_type"], entry["_id"]))
        else:
            logging.info("Empty common fields")
        logging.info("# Import {} fields:".format(corpus))
        if results_fields_corpus:
            for entry in results_fields_corpus:
                total_count += 1
                logging.info("rec_type: {}, _id: {}".format(entry["rec_type"], entry["_id"]))
        else:
            logging.info("Empty {} fields".format(corpus))
        chrono.chrono_trace("conf_fields", date_types, date_fields, total_count)
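# A hypothetical usage sketch of conf_corpus: the second argument names the
# conf directory to read types and fields from, and falls back to the corpus
# name when empty. The "aime" directory mirrors the calls made in the
# __main__ blocks below.
#
# conf_corpus("perio", "aime")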
def harvest_by_set(corpus, target, target_set):
    logging.info("harvest_by_set: {}".format(target_set))
    date_begin = datetime.datetime.now()
    # harvest
    metajson_list = oaipmh_harvester.list_records(target, None, None, target_set)
    date_harvest = datetime.datetime.now()
    chrono.chrono_trace("harvest spire and convert to metajson", date_begin, date_harvest, len(metajson_list))
    # import
    result_import = corpus_service.import_metajson_list(corpus, metajson_list, True, None)
    date_import = datetime.datetime.now()
    chrono.chrono_trace("harvest spire, convert metadata and save to MongoDB", date_harvest, date_import, len(result_import[0]))
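# A hypothetical usage sketch of harvest_by_set: the set name "publications"
# is illustrative, and the target is built as in the __main__ block below.
#
# harvest_by_set("spire", target, "publications")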
def clean_corpus(corpus):
    if not corpus:
        logging.error("Error: empty corpus")
    else:
        logging.info("clean corpus: {}".format(corpus))
        date_begin = datetime.datetime.now()
        repository_service.create_corpus(corpus)
        repository_service.empty_corpus(corpus)
        repository_service.init_corpus_indexes(corpus)
        date_end = datetime.datetime.now()
        chrono.chrono_trace("clean_corpus", date_begin, date_end, None)
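# The chrono.chrono_trace helper is not shown in this file; a minimal sketch,
# assuming it only logs the step label, the elapsed time and an optional item
# count (the real implementation in the chrono module may differ):
#
# def chrono_trace(label, date_begin, date_end, count):
#     elapsed = (date_end - date_begin).total_seconds()
#     if count is not None:
#         logging.info("{}: {} items in {:.3f} s".format(label, count, elapsed))
#     else:
#         logging.info("{}: {:.3f} s".format(label, elapsed))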
def harvest_by_ids(corpus, target, ids):
    logging.info("harvest_by_ids: {}".format(ids))
    date_begin = datetime.datetime.now()
    # harvest
    metajson_list = []
    for identifier in ids:
        metajson_list.append(oaipmh_harvester.get_record(target, identifier))
    date_harvest = datetime.datetime.now()
    chrono.chrono_trace("harvest spire and convert to metajson", date_begin, date_harvest, len(ids))
    # import
    result_import = corpus_service.import_metajson_list(corpus, metajson_list, True, None)
    date_import = datetime.datetime.now()
    chrono.chrono_trace("import", date_harvest, date_import, len(result_import))
chrono.chrono_trace("harvest spire and convert to metajson", date_begin, date_harvest, len(ids)) # import result_import = corpus_service.import_metajson_list(corpus, metajson_list, True, None) date_import = datetime.datetime.now() chrono.chrono_trace("import", date_harvest, date_import, len(result_import)) if __name__ == "__main__": date_begin = datetime.datetime.now() # conf corpus corpus = "spire" corpus_service.clean_corpus(corpus) date_clean = datetime.datetime.now() chrono.chrono_trace("Initialize corpus", date_begin, date_clean, None) target = Target() target['identifier'] = 'spire' target['title'] = 'Sciences Po Institutional Repository' target['type'] = 'oaipmh' target['url'] = 'http://spire.sciencespo.fr/dissemination/oaipmh2-no-prefix-publications.xml' target['metadata_prefix'] = 'didl' ids = [ "oai:spire.sciencespo.fr:2441/dambferfb7dfprc9m26c8c8o3", "oai:spire.sciencespo.fr:2441/eo6779thqgm5r489makgoai85", "oai:spire.sciencespo.fr:2441/5l6uh8ogmqildh09h6m8hj429", "oai:spire.sciencespo.fr:2441/3fm4jv3k2s99lms9jb5i5asil", "oai:spire.sciencespo.fr:2441/f4rshpf3v1umfa09lb0joe5g5", "oai:spire.sciencespo.fr:2441/dambferfb7dfprc9m2h2og5ig",
source = "FNSP" rec_id_prefix = "sc_" input_dir_path = os.path.join("data", "num", "input") input_format = constants.FORMAT_UNIMARC output_dir_path = os.path.join("data", "num", "output") if not os.path.exists(output_dir_path): os.mkdir(output_dir_path) error_file_name = "".join(["validation-", corpus, ".txt"]) error_file_path = os.path.join(output_dir_path, error_file_name) #logging.debug("error_file_path: {}".format(error_file_path)) # conf corpus corpus_service.clean_corpus(corpus) corpus_service.conf_corpus(corpus, "aime") date_clean = datetime.datetime.now() chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None) # import input_file_paths = io_service.get_relevant_file_list_by_format( input_dir_path, input_format) results = corpus_service.import_metadata_files(corpus, input_file_paths, input_format, source, rec_id_prefix, True, None) date_import = datetime.datetime.now() chrono.chrono_trace("Import corpus", date_clean, date_import, None) # Validate corpus_service.validate_corpus(corpus, error_file_path) date_validate = datetime.datetime.now() chrono.chrono_trace("Validate corpus", date_import, date_validate, None)
if __name__ == "__main__":
    date_begin = datetime.datetime.now()

    # conf params
    corpus = "perio"
    source = "Sciences Po | la bibliothèque"
    rec_id_prefix = ""
    input_file_path = os.path.join("data", "unimarc", "periouni.mrc")
    input_format = constants.FORMAT_UNIMARC
    csv_file_name = "".join(["validation-", corpus, ".csv"])
    csv_file_path = os.path.join("data", "result", csv_file_name)

    # conf corpus
    corpus_service.clean_corpus(corpus)
    corpus_service.conf_corpus(corpus, "aime")
    date_clean = datetime.datetime.now()
    chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None)

    # import
    corpus_service.import_metadata_file(corpus, input_file_path, input_format, source, rec_id_prefix, True, None)
    date_import = datetime.datetime.now()
    chrono.chrono_trace("Import corpus", date_clean, date_import, None)

    # Validate perio
    documents = repository_service.get_documents(corpus)
    validate_perios(documents, csv_file_path)
    date_validate = datetime.datetime.now()
    chrono.chrono_trace("Validate perio", date_import, date_validate, None)
source = "FNSP" rec_id_prefix = "sc" input_dir_path = os.path.join("data", "num", "input") input_format = constants.FORMAT_UNIMARC output_dir_path = os.path.join("data", "num", "output") if not os.path.exists(output_dir_path): os.mkdir(output_dir_path) error_file_name = "".join(["validation-", corpus, ".txt"]) error_file_path = os.path.join(output_dir_path, error_file_name) #logging.debug("error_file_path: {}".format(error_file_path)) # conf corpus corpus_service.clean_corpus(corpus) corpus_service.conf_corpus(corpus, "aime") date_clean = datetime.datetime.now() chrono.chrono_trace("Clean and conf corpus", date_begin, date_clean, None) # import input_file_paths = io_service.get_relevant_file_list_by_format(input_dir_path, input_format) results = corpus_service.import_metadata_files(corpus, input_file_paths, input_format, source, rec_id_prefix, True, None) date_import = datetime.datetime.now() chrono.chrono_trace("Import corpus", date_clean, date_import, None) # Validate corpus_service.validate_corpus(corpus, error_file_path) date_validate = datetime.datetime.now() chrono.chrono_trace("Validate corpus", date_import, date_validate, None) # Export mods corpus_service.export_corpus(corpus, output_dir_path, constants.FORMAT_MODS, False, True) date_export_mods = datetime.datetime.now()