Example #1
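All five examples appear to be successive versions of the main() function of an OpenAPC OAI harvesting script; the surrounding module is not shown. A minimal sketch of the imports and module-level names they rely on, assuming oat is the OpenAPC toolkit module and that ARG_HELP_STRINGS and integrate_changes are helpers defined elsewhere in that script:

import argparse
import datetime
import os
from csv import DictReader

import openapc_toolkit as oat  # assumed to provide oai_harvest, UnicodeDictReader, print_g, print_y, OpenAPCUnicodeWriter and OAI_COLLECTION_CONTENT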
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--selective_harvesting", action="store_true",
                        help=ARG_HELP_STRINGS["selective_harvesting"])
    args = parser.parse_args()
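    # harvest_list.csv lists one OAI-PMH source per row (basic_url, metadata_prefix, oai_set, processing, directory, active)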
    with open("harvest_list.csv", "r") as harvest_list:
        reader = oat.UnicodeDictReader(harvest_list, encoding="utf-8")
        for line in reader:
            basic_url = line["basic_url"]
            if line["active"] == "TRUE":
                oat.print_g("Starting harvest from source " + basic_url)
                processing = None
                if len(line["processing"]) > 0:
                    processing = line["processing"]
                oat.oai_harvest(basic_url,
                                line["metadata_prefix"],
                                line["oai_set"],
                                processing,
                                args.selective_harvesting)
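                # oai_harvest() apparently writes its results to out.csv; move that file into the source's directory under a dated name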
                now = datetime.datetime.now()
                date_string = now.strftime("%Y_%m_%d")
                file_name = "oai_harvest_" + date_string + ".csv"
                target = os.path.join("..", line["directory"], file_name)
                os.rename("out.csv", target)
            else:
                oat.print_y("Skipping inactive source " + basic_url)
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s",
                        "--selective_harvesting",
                        action="store_true",
                        help=ARG_HELP_STRINGS["selective_harvesting"])
    args = parser.parse_args()
    with open("harvest_list.csv", "r") as harvest_list:
        reader = oat.UnicodeDictReader(harvest_list, encoding="utf-8")
        for line in reader:
            basic_url = line["basic_url"]
            if line["active"] == "TRUE":
                oat.print_g("Starting harvest from source " + basic_url)
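                # empty CSV fields are passed on as None, presumably so oai_harvest falls back to its defaults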
                oai_set = line["oai_set"] if len(line["oai_set"]) > 0 else None
                prefix = line["metadata_prefix"] if len(
                    line["metadata_prefix"]) > 0 else None
                processing = line["processing"] if len(
                    line["processing"]) > 0 else None
                oat.oai_harvest(basic_url, prefix, oai_set, processing,
                                args.selective_harvesting)
                now = datetime.datetime.now()
                date_string = now.strftime("%Y_%m_%d")
                file_name = "oai_harvest_" + date_string + ".csv"
                target = os.path.join("..", line["directory"], file_name)
                os.rename("out.csv", target)
            else:
                oat.print_y("Skipping inactive source " + basic_url)
Example #3
def main():
    with open("harvest_list.csv", "r") as harvest_list:
        reader = DictReader(harvest_list)
        for line in reader:
            basic_url = line["basic_url"]
            if line["active"] == "TRUE":
                oat.print_g("Starting harvest from source " + basic_url)
                oai_set = line["oai_set"] if len(line["oai_set"]) > 0 else None
                prefix = line["metadata_prefix"] if len(line["metadata_prefix"]) > 0 else None
                processing = line["processing"] if len(line["processing"]) > 0 else None
                directory = os.path.join("..", line["directory"])
                articles = oat.oai_harvest(basic_url, prefix, oai_set, processing)
                harvest_file_path = os.path.join(directory, "all_harvested_articles.csv")
                enriched_file_path = os.path.join(directory, "all_harvested_articles_enriched.csv")
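                # judging by its use here, integrate_changes() merges the harvested articles into the
                # cumulative file and returns the previously unseen articles plus the existing CSV header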
                new_article_dicts, header = integrate_changes(articles, harvest_file_path, False)
                integrate_changes(articles, enriched_file_path, True)
                if header is None:
                    # if no header was returned, an "all_harvested" file doesn't exist yet
                    header = oat.OAI_COLLECTION_CONTENT.values()
                new_articles = [header]
                for article_dict in new_article_dicts:
                    new_articles.append([article_dict[key] for key in header])
                now = datetime.datetime.now()
                date_string = now.strftime("%Y_%m_%d")
                file_name = "new_articles_" + date_string + ".csv"
                target = os.path.join(directory, file_name)
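                # write only the newly harvested articles to a dated CSV, applying OpenAPC quoting rules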
                with open(target, "w") as t:
                    writer = oat.OpenAPCUnicodeWriter(t, openapc_quote_rules=True, has_header=True)
                    writer.write_rows(new_articles)
            else:
                oat.print_y("Skipping inactive source " + basic_url)
Example #4
def main():
    with open("harvest_list.csv", "r") as harvest_list:
        reader = DictReader(harvest_list)
        for line in reader:
            basic_url = line["basic_url"]
            if line["active"] == "TRUE":
                oat.print_g("Starting harvest from source " + basic_url)
                oai_set = line["oai_set"] if len(line["oai_set"]) > 0 else None
                prefix = line["metadata_prefix"] if len(line["metadata_prefix"]) > 0 else None
                processing = line["processing"] if len(line["processing"]) > 0 else None
                directory = os.path.join("..", line["directory"])
                articles = oat.oai_harvest(basic_url, prefix, oai_set, processing)
                harvest_file_path = os.path.join(directory, "all_harvested_articles.csv")
                enriched_file_path = os.path.join(directory, "all_harvested_articles_enriched.csv")
                new_article_dicts, header = integrate_changes(articles, harvest_file_path, False)
                integrate_changes(articles, enriched_file_path, True)
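                # also update the optional DEAL Wiley variant of the enriched file, but only if it exists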
                deal_wiley_path = os.path.join(directory, "all_harvested_articles_enriched_deal_wiley.csv")
                if os.path.isfile(deal_wiley_path):
                    integrate_changes(articles, deal_wiley_path, True)
                if header is None:
                    # if no header was returned, an "all_harvested" file doesn't exist yet
                    header = list(oat.OAI_COLLECTION_CONTENT.keys())
                new_articles = [header]
                for article_dict in new_article_dicts:
                    new_articles.append([article_dict[key] for key in header])
                now = datetime.datetime.now()
                date_string = now.strftime("%Y_%m_%d")
                file_name = "new_articles_" + date_string + ".csv"
                target = os.path.join(directory, file_name)
                with open(target, "w") as t:
                    writer = oat.OpenAPCUnicodeWriter(t, openapc_quote_rules=True, has_header=True)
                    writer.write_rows(new_articles)
            else:
                oat.print_y("Skipping inactive source " + basic_url)
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--integrate",
                        help=ARG_HELP_STRINGS["integrate"],
                        action="store_true")
    parser.add_argument("-o",
                        "--output",
                        help=ARG_HELP_STRINGS["output"],
                        action="store_true")
    args = parser.parse_args()

    with open("harvest_list.csv", "r") as harvest_list:
        reader = DictReader(harvest_list)
        for line in reader:
            basic_url = line["basic_url"]
            if line["active"] == "TRUE":
                oat.print_g("Starting harvest from source " + basic_url)
                oai_set = line["oai_set"] if len(line["oai_set"]) > 0 else None
                prefix = line["metadata_prefix"] if len(
                    line["metadata_prefix"]) > 0 else None
                processing = line["processing"] if len(
                    line["processing"]) > 0 else None
                directory = os.path.join("..", line["directory"])
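                # with --output, pass oai_harvest a per-source file suffix derived from the data directory name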
                out_file_suffix = os.path.basename(
                    line["directory"]) if args.output else None
                articles = oat.oai_harvest(basic_url, prefix, oai_set,
                                           processing, out_file_suffix)
                harvest_file_path = os.path.join(directory,
                                                 "all_harvested_articles.csv")
                enriched_file_path = os.path.join(
                    directory, "all_harvested_articles_enriched.csv")
                new_article_dicts, header = integrate_changes(
                    articles, harvest_file_path, False, not args.integrate)
                integrate_changes(articles, enriched_file_path, True,
                                  not args.integrate)
                if header is None:
                    # if no header was returned, an "all_harvested" file doesn't exist yet
                    header = list(oat.OAI_COLLECTION_CONTENT.keys())
                new_articles = [header]
                for article_dict in new_article_dicts:
                    new_articles.append([article_dict[key] for key in header])
                now = datetime.datetime.now()
                date_string = now.strftime("%Y_%m_%d")
                file_name = "new_articles_" + date_string + ".csv"
                target = os.path.join(directory, file_name)
                with open(target, "w") as t:
                    writer = oat.OpenAPCUnicodeWriter(t,
                                                      openapc_quote_rules=True,
                                                      has_header=True)
                    writer.write_rows(new_articles)
            else:
                oat.print_y("Skipping inactive source " + basic_url)