Пример #1
0
def main():
    """Harvest all active OAI sources listed in harvest_list.csv.

    For every row whose "active" column is "TRUE", oat.oai_harvest is
    called with the row's settings and the file "out.csv" is afterwards
    moved to ../<directory>/oai_harvest_<YYYY_MM_DD>.csv. Rows with any
    other "active" value are reported and skipped.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-s", "--selective_harvesting", action="store_true",
                            help=ARG_HELP_STRINGS["selective_harvesting"])
    args = arg_parser.parse_args()

    def _value_or_none(text):
        # Empty CSV cells are passed to the harvester as None.
        return text if len(text) > 0 else None

    with open("harvest_list.csv", "r") as harvest_list:
        for source in oat.UnicodeDictReader(harvest_list, encoding="utf-8"):
            basic_url = source["basic_url"]
            if source["active"] != "TRUE":
                oat.print_y("Skipping inactive source " + basic_url)
                continue
            oat.print_g("Starting harvest from source " + basic_url)
            oai_set = _value_or_none(source["oai_set"])
            prefix = _value_or_none(source["metadata_prefix"])
            processing = _value_or_none(source["processing"])
            oat.oai_harvest(basic_url, prefix, oai_set, processing,
                            args.selective_harvesting)
            # presumably out.csv is produced by oat.oai_harvest -- it is
            # moved into the source's directory under a dated name.
            stamp = datetime.datetime.now().strftime("%Y_%m_%d")
            target = os.path.join("..", source["directory"],
                                  "oai_harvest_" + stamp + ".csv")
            os.rename("out.csv", target)
Пример #2
0
def main():
    """Run an OAI harvest for each active row of harvest_list.csv.

    The metadata prefix and OAI set are forwarded to oat.oai_harvest
    exactly as found in the row; only an empty "processing" cell is
    replaced by None. After each harvest "out.csv" is renamed to
    ../<directory>/oai_harvest_<YYYY_MM_DD>.csv.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-s",
        "--selective_harvesting",
        action="store_true",
        help=ARG_HELP_STRINGS["selective_harvesting"]
    )
    options = cli.parse_args()
    with open("harvest_list.csv", "r") as harvest_list:
        for entry in oat.UnicodeDictReader(harvest_list, encoding="utf-8"):
            basic_url = entry["basic_url"]
            if entry["active"] != "TRUE":
                oat.print_y("Skipping inactive source " + basic_url)
                continue
            oat.print_g("Starting harvest from source " + basic_url)
            # An empty "processing" cell means no processing mode at all.
            processing = entry["processing"] if len(entry["processing"]) > 0 else None
            oat.oai_harvest(basic_url, entry["metadata_prefix"], entry["oai_set"],
                            processing, options.selective_harvesting)
            date_string = datetime.datetime.now().strftime("%Y_%m_%d")
            destination = os.path.join("..", entry["directory"],
                                       "oai_harvest_" + date_string + ".csv")
            os.rename("out.csv", destination)
def get_hybrid_status(line, username):
    """Look up the hybrid status of a journal via journaltocs.

    The values at indices 7, 8, 9 and 10 of ``line`` (reported as ISSNs
    in the log output) are tried in order. The first one that yields a
    journaltocs entry and a journal type determines the result.

    Args:
        line: Indexable record holding the candidate ISSN values.
        username: Passed through to get_jtoc_metadata.

    Returns:
        The second element of the value returned by
        get_jtoc_journal_type, or None if no candidate produced a result.
    """
    for column in (7, 8, 9, 10):
        candidate = line[column]
        if not oat.has_value(candidate):
            continue
        oat.print_y('Looking up ISSN {}...'.format(candidate))
        jtoc_metadata = get_jtoc_metadata(candidate, username)
        sleep(1)
        if jtoc_metadata["jtoc_id"] is None:
            # No journaltocs entry for this value, try the next column.
            continue
        found_msg = ('Entry found (publisher: {}, title: {}, jtoc_ID: {}, '
                     'obtaining hybrid status...)')
        oat.print_g(found_msg.format(jtoc_metadata["jtoc_publisher"],
                                     jtoc_metadata["jtoc_title"],
                                     jtoc_metadata["jtoc_id"]))
        journal_type = get_jtoc_journal_type(jtoc_metadata["jtoc_id"])
        if not journal_type:
            oat.print_r("Error while obtaining hybrid status!")
            continue
        sleep(1)
        type_msg = "journaltocs type is '{}' , mapped to is_hybrid = {}"
        oat.print_g(type_msg.format(journal_type[0], journal_type[1]))
        return journal_type[1]
    oat.print_r("None of the ISSN values found in journaltocs!")
    return None
Пример #4
0
def main():
    """Harvest active sources and record newly found articles.

    For each row of harvest_list.csv marked active, the harvested
    articles are handed to integrate_changes for both the plain and the
    enriched collection file of the source directory. The article dicts
    returned for the plain file are then written to
    new_articles_<YYYY_MM_DD>.csv in the same directory.
    """
    def _or_none(cell):
        # Empty CSV cells are handed on as None.
        return cell if len(cell) > 0 else None

    with open("harvest_list.csv", "r") as harvest_list:
        for source in DictReader(harvest_list):
            basic_url = source["basic_url"]
            if source["active"] != "TRUE":
                oat.print_y("Skipping inactive source " + basic_url)
                continue
            oat.print_g("Starting harvest from source " + basic_url)
            oai_set = _or_none(source["oai_set"])
            prefix = _or_none(source["metadata_prefix"])
            processing = _or_none(source["processing"])
            directory = os.path.join("..", source["directory"])
            articles = oat.oai_harvest(basic_url, prefix, oai_set, processing)

            harvest_file_path = os.path.join(directory, "all_harvested_articles.csv")
            enriched_file_path = os.path.join(directory,
                                              "all_harvested_articles_enriched.csv")
            new_article_dicts, header = integrate_changes(articles, harvest_file_path,
                                                          False)
            integrate_changes(articles, enriched_file_path, True)
            if header is None:
                # No header returned: an "all_harvested" file doesn't exist yet.
                header = oat.OAI_COLLECTION_CONTENT.values()

            new_articles = [header]
            new_articles.extend([article_dict[key] for key in header]
                                for article_dict in new_article_dicts)

            date_string = datetime.datetime.now().strftime("%Y_%m_%d")
            target = os.path.join(directory, "new_articles_" + date_string + ".csv")
            with open(target, "w") as t:
                writer = oat.OpenAPCUnicodeWriter(t, openapc_quote_rules=True,
                                                  has_header=True)
                writer.write_rows(new_articles)
Пример #5
0
def main():
    """Harvest active sources, update collection files, list new articles.

    Each active row of harvest_list.csv is harvested via oat.oai_harvest.
    The result is passed to integrate_changes for the plain and enriched
    collection files, and also for the "deal_wiley" enriched file if that
    file exists. Articles reported as new for the plain file are written
    to new_articles_<YYYY_MM_DD>.csv in the source directory.
    """
    with open("harvest_list.csv", "r") as harvest_list:
        for row in DictReader(harvest_list):
            basic_url = row["basic_url"]
            if row["active"] != "TRUE":
                oat.print_y("Skipping inactive source " + basic_url)
                continue
            oat.print_g("Starting harvest from source " + basic_url)

            # Empty cells become None before being handed to the harvester.
            oai_set = row["oai_set"] if len(row["oai_set"]) > 0 else None
            prefix = row["metadata_prefix"] if len(row["metadata_prefix"]) > 0 else None
            processing = row["processing"] if len(row["processing"]) > 0 else None

            directory = os.path.join("..", row["directory"])
            articles = oat.oai_harvest(basic_url, prefix, oai_set, processing)

            harvest_file_path = os.path.join(directory, "all_harvested_articles.csv")
            enriched_file_path = os.path.join(
                directory, "all_harvested_articles_enriched.csv")
            new_article_dicts, header = integrate_changes(
                articles, harvest_file_path, False)
            integrate_changes(articles, enriched_file_path, True)

            # The deal_wiley file is optional and only updated when present.
            deal_wiley_path = os.path.join(
                directory, "all_harvested_articles_enriched_deal_wiley.csv")
            if os.path.isfile(deal_wiley_path):
                integrate_changes(articles, deal_wiley_path, True)

            if header is None:
                # No header returned: an "all_harvested" file doesn't exist yet.
                header = list(oat.OAI_COLLECTION_CONTENT.keys())

            new_articles = [header]
            new_articles.extend([article_dict[key] for key in header]
                                for article_dict in new_article_dicts)

            date_string = datetime.datetime.now().strftime("%Y_%m_%d")
            target = os.path.join(directory, "new_articles_" + date_string + ".csv")
            with open(target, "w") as t:
                writer = oat.OpenAPCUnicodeWriter(
                    t, openapc_quote_rules=True, has_header=True)
                writer.write_rows(new_articles)
def main():
    """Count how often each value occurs in one column of a CSV file.

    Command line arguments: the source file, the zero-based index of the
    column to count, an optional encoding (-e) and an optional flag (-s)
    to sort the result by descending count. Without -s the values are
    printed in order of first appearance.

    The function works under both Python 2 and Python 3: print is only
    used in its single-argument call form and the exception text is
    obtained from the exception itself rather than from the Python 2 only
    ``message`` attribute.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("count_column",
                        type=int,
                        help=ARG_HELP_STRINGS["count_column"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-s",
                        "--sort",
                        action="store_true",
                        help=ARG_HELP_STRINGS["sort"])

    args = parser.parse_args()

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            # Format first, then print: calling .format() on the result of
            # print() only works with the Python 2 print statement.
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'")
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            oat.print_r(
                "Error: '" + args.encoding + "' not found Python's " +
                "codec collection. Either look for a valid name here " +
                "(https://docs.python.org/2/library/codecs.html#standard-" +
                "encodings) or omit this argument to enable automated " +
                "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc)

    # Fall back to a generic name if the file has no header.
    column_name = "column " + str(args.count_column)
    if header:
        header_line = header[0]
        column_name = header_line[args.count_column]

    oat.print_g("Performing occurence count in column '" + column_name + "'")
    occurence_dict = OrderedDict()

    for line in content:
        try:
            value = line[args.count_column]
        except IndexError as ie:
            # Lines that are too short are reported and skipped.
            oat.print_y("IndexError ({}) at line {}, skipping...".format(
                ie, line))
            continue
        occurence_dict[value] = occurence_dict.get(value, 0) + 1

    if args.sort:
        occurence_dict = OrderedDict(
            sorted(occurence_dict.items(), key=lambda x: x[1], reverse=True))

    for item in occurence_dict.items():
        print(item[0] + ": " + str(item[1]))
def main():
    """Unify publisher and journal names in a CSV file and write out.csv.

    Column 5 (publisher) and column 6 (journal title) of every content
    line are passed through oat.get_unified_publisher_name and
    oat.get_unified_journal_title. Changed values are replaced in place
    and reported. The result (header plus content) is written to
    "out.csv" using the optional quote mask (-q) and quote rules (-o).

    The -q mask must consist only of the letters 't' and 'f'; each letter
    becomes one boolean entry of the mask handed to the writer.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)

    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            # Format before printing so this also works with the Python 3
            # print function (which returns None).
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'")
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    mask = None
    if args.quotemask:
        # Anything left after removing 't' and 'f' is an illegal character.
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [x == "t" for x in args.quotemask]

    header, content = oat.get_csv_file_content(args.csv_file, enc)

    # Line numbers in the messages count content lines, starting at 1.
    for line_num, line in enumerate(content, 1):
        publisher = line[5]
        journal = line[6]
        journal_new = oat.get_unified_journal_title(journal)
        publisher_new = oat.get_unified_publisher_name(publisher)
        if publisher_new != publisher:
            line[5] = publisher_new
            msg = u"Line {}: Updated publisher name ({} -> {})"
            oat.print_g(msg.format(line_num, publisher, publisher_new))
        if journal_new != journal:
            line[6] = journal_new
            msg = u"Line {}: Updated journal_full_title ({} -> {})"
            oat.print_g(msg.format(line_num, journal, journal_new))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows(header + content)
Пример #8
0
def main():
    """Harvest active sources with optional integration and output flags.

    Command line flags:
        -i / --integrate: its negation is forwarded as the fourth
            argument of integrate_changes.
        -o / --output: when set, the base name of the source directory is
            passed to oat.oai_harvest as output file suffix (else None).

    For every active row of harvest_list.csv the harvested articles are
    passed to integrate_changes for the plain and the enriched collection
    file, and the article dicts returned for the plain file are written
    to new_articles_<YYYY_MM_DD>.csv in the source directory.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-i", "--integrate",
                            help=ARG_HELP_STRINGS["integrate"], action="store_true")
    arg_parser.add_argument("-o", "--output",
                            help=ARG_HELP_STRINGS["output"], action="store_true")
    args = arg_parser.parse_args()

    def _or_none(cell):
        # Empty CSV cells are handed on as None.
        return cell if len(cell) > 0 else None

    with open("harvest_list.csv", "r") as harvest_list:
        for source in DictReader(harvest_list):
            basic_url = source["basic_url"]
            if source["active"] != "TRUE":
                oat.print_y("Skipping inactive source " + basic_url)
                continue
            oat.print_g("Starting harvest from source " + basic_url)
            oai_set = _or_none(source["oai_set"])
            prefix = _or_none(source["metadata_prefix"])
            processing = _or_none(source["processing"])
            directory = os.path.join("..", source["directory"])
            if args.output:
                out_file_suffix = os.path.basename(source["directory"])
            else:
                out_file_suffix = None
            articles = oat.oai_harvest(basic_url, prefix, oai_set, processing,
                                       out_file_suffix)

            harvest_file_path = os.path.join(directory, "all_harvested_articles.csv")
            enriched_file_path = os.path.join(directory,
                                              "all_harvested_articles_enriched.csv")
            new_article_dicts, header = integrate_changes(
                articles, harvest_file_path, False, not args.integrate)
            integrate_changes(articles, enriched_file_path, True, not args.integrate)
            if header is None:
                # No header returned: an "all_harvested" file doesn't exist yet.
                header = list(oat.OAI_COLLECTION_CONTENT.keys())

            new_articles = [header]
            new_articles.extend([article_dict[key] for key in header]
                                for article_dict in new_article_dicts)

            date_string = datetime.datetime.now().strftime("%Y_%m_%d")
            target = os.path.join(directory, "new_articles_" + date_string + ".csv")
            with open(target, "w") as t:
                writer = oat.OpenAPCUnicodeWriter(t, openapc_quote_rules=True,
                                                  has_header=True)
                writer.write_rows(new_articles)
Пример #9
0
def main():
    """Count how often each value occurs in one column of a CSV file.

    Command line arguments: the source file, the zero-based index of the
    column to count, an optional encoding (-e) and an optional flag (-s)
    to sort the result by descending count. Without -s the values are
    printed in order of first appearance.

    Lines that do not have the requested column are reported and skipped.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("count_column", type=int, help=ARG_HELP_STRINGS["count_column"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-s", "--sort", action="store_true", help=ARG_HELP_STRINGS["sort"])

    args = parser.parse_args()

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            oat.print_r("Error: '" + args.encoding + "' not found Python's " +
                        "codec collection. Either look for a valid name here " +
                        "(https://docs.python.org/2/library/codecs.html#standard-" +
                        "encodings) or omit this argument to enable automated " +
                        "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc)

    # Fall back to a generic name if the file has no header.
    column_name = "column " + str(args.count_column)
    if header:
        header_line = header[0]
        column_name = header_line[args.count_column]

    oat.print_g("Performing occurence count in column '" + column_name + "'")
    occurence_dict = OrderedDict()

    for line in content:
        try:
            value = line[args.count_column]
        except IndexError as ie:
            # Exceptions have no "message" attribute in Python 3; formatting
            # the exception itself yields the same text on both versions.
            oat.print_y("IndexError ({}) at line {}, skipping...".format(ie, line))
            continue
        occurence_dict[value] = occurence_dict.get(value, 0) + 1

    if args.sort:
        occurence_dict = OrderedDict(sorted(occurence_dict.items(), key=lambda x: x[1],
                                            reverse=True))

    for item in occurence_dict.items():
        print(item[0] + ": " + str(item[1]))
Пример #10
0
def main():
    """Check landing pages of articles in a CSV file for a PDF link.

    Only lines within the optional -start/-end range (1-based, header
    lines included in the count) whose column 4 equals "TRUE" are
    examined. For every entry of lpl_list whose publisher_matches()
    accepts the line's publisher (column 5), the landing page for the DOI
    (column 3) is fetched via get_landingpage_content and searched with
    search_for_oa().

    Errors and warnings go through the logging module. A buffered handler
    collects them so they are printed again as a summary when it is
    closed at the end of the run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-start", type=int, default=1, help=ARG_HELP_STRINGS["start"])
    # NOTE(review): "-end" reuses the help text of "start" -- confirm whether
    # ARG_HELP_STRINGS has a dedicated entry for it.
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["start"])
    args = parser.parse_args()

    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            oat.print_g(msg)
            enc = args.encoding
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    head, content = oat.get_csv_file_content(args.csv_file, enc)
    content = head + content

    for line_num, line in enumerate(content, 1):
        if args.start and args.start > line_num:
            continue
        if args.end and args.end < line_num:
            # Line numbers only grow, so no later line can be in range.
            break
        # Check hybrid status
        if line[4] != "TRUE":
            continue
        institution = line[0]
        period = line[1]
        doi = line[3]
        publisher = line[5]
        journal = line[6]
        for lpl in lpl_list:
            if not lpl.publisher_matches(publisher):
                continue
            init_msg = (u"Line {}: Checking {} article from {}, published in '" +
                        "{}'...").format(line_num, institution, period, journal)
            oat.print_b(init_msg)
            page_content = get_landingpage_content(doi, lpl)
            if page_content is None:
                continue
            pdf_link = lpl.search_for_oa(page_content)
            if pdf_link is None:
                error_msg = (u"No PDF link found! (line {}, DOI: " +
                             "http://doi.org/{})").format(line_num, doi)
                logging.error(error_msg)
            elif pdf_link == "":
                warning_msg = (u"A RegexGroup matched, but no PDF " +
                               "link was found! (line {}, DOI: " +
                               "http://doi.org/{})").format(line_num, doi)
                logging.warning(warning_msg)
            else:
                oat.print_g(u"PDF link found: " + pdf_link)
        # Pause once per examined line before moving on to the next one.
        time.sleep(1)

    if not bufferedHandler.buffer:
        oat.print_g("\nLookup finished, all articles were accessible on sciencedirect")
    else:
        oat.print_r("\nLookup finished, not all articles could be accessed on sciencedirect:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
Пример #11
0
def _cost_deviation(new_cost, target_cost):
    """Return the deviation between two costs relative to the larger one."""
    if new_cost == target_cost:
        # Also covers two zero costs, which would otherwise divide by zero.
        return 0.0
    if new_cost >= target_cost:
        larger, smaller = new_cost, target_cost
    else:
        larger, smaller = target_cost, new_cost
    if larger == 0:
        # Different costs with a zero denominator: no meaningful ratio, so
        # report a deviation that exceeds every tolerance.
        return float("inf")
    return (larger - smaller) / larger


def main():
    """Resolve DOI duplicates between a new file and a target file.

    Every line of the new file whose DOI (column 3, not "NA") also occurs
    in the target file is treated as a duplicate and handled as follows:

    * Different institution (column 0): both lines are removed and
      appended to the unresolved duplicates file (UD_FILE). The matching
      line in the enriched files is blanked as well.
    * Cost deviation (column 2) within ``cost_tolerance``: only the line
      in the new file is removed.
    * Otherwise: both lines are removed and the matching line in the
      enriched files is blanked.

    Removed lines of the new file and of enriched files are replaced by a
    copy of EMPTY_LINE, removed target lines are dropped entirely. With
    -b/--batch, processing stops after that many duplicates. All affected
    files are rewritten at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("new_file", help=ARG_HELP_STRINGS["new_file"])
    # NOTE(review): "target_file" reuses the help text of "new_file" --
    # confirm whether ARG_HELP_STRINGS has a dedicated entry for it.
    parser.add_argument("target_file", help=ARG_HELP_STRINGS["new_file"])
    parser.add_argument('cost_tolerance', type=float, help=ARG_HELP_STRINGS["cost_tolerance"])
    parser.add_argument('enriched_files', nargs='+', help=ARG_HELP_STRINGS["enriched_files"])
    parser.add_argument('-b', '--batch', type=int, help=ARG_HELP_STRINGS["batch"])

    args = parser.parse_args()

    target_file_name = get_filename(args.target_file)
    new_file_name = get_filename(args.new_file)

    for path in args.enriched_files:
        if not os.path.isfile(path):
            oat.print_r('Error: "' + path + '" is no valid file path!')
            sys.exit()
        ENRICHED_FILES[path] = {"modified": False, "file_name": get_filename(path)}
        enriched_header, enriched_content = oat.get_csv_file_content(
            path, enc="utf-8", force_header=True)
        ENRICHED_FILES[path]["header"] = enriched_header
        ENRICHED_FILES[path]["content"] = enriched_content

    target_header, target_content = oat.get_csv_file_content(
        args.target_file, enc="utf-8", force_header=True)
    new_header, new_content = oat.get_csv_file_content(
        args.new_file, enc="utf-8", force_header=True)
    ud_header, ud_content = oat.get_csv_file_content(
        UD_FILE, enc="utf-8", force_header=True)

    # A set gives constant-time membership tests for the DOI lookup below.
    target_dois = set(line[3] for line in target_content)

    duplicates = []
    for new_index, line in enumerate(new_content):
        doi = line[3]
        if doi == "NA" or doi not in target_dois:
            continue
        target_index = get_duplicate_index(target_content, doi)
        duplicates.append((new_index, target_index))

    count = 0
    for new_index, target_index in duplicates:
        new_line = new_content[new_index]
        target_line = target_content[target_index]
        doi = target_line[3]
        new_cost = float(new_line[2])
        target_cost = float(target_line[2])
        deviation = _cost_deviation(new_cost, target_cost)
        oat.print_b("Duplicate found:")
        print("In new file " + new_file_name + ":")
        print(",".join(new_line))
        print("In target file " + target_file_name + ":")
        print(",".join(target_line))
        if new_line[0] != target_line[0]:
            msg = 'Institutional mismatch "{}"/"{}". Lines will be deleted and added to the unresolved duplicates file.'
            oat.print_r(msg.format(new_line[0], target_line[0]))
            new_content[new_index] = list(EMPTY_LINE)
            # Target lines are only marked here and removed after the loop,
            # so the indices collected above stay valid.
            target_content[target_index] = REPLACEMENT
            ud_content += [target_line]
            ud_content += [new_line]
            path, index = find_in_enriched_files(doi)
            ENRICHED_FILES[path]["content"][index] = list(EMPTY_LINE)
            ENRICHED_FILES[path]["modified"] = True
        elif deviation <= args.cost_tolerance:
            msg = "Cost deviation between {} and {} is below tolerance threshold ({} <= {}). Entries are treated as equal, only the new one will be deleted."
            oat.print_g(msg.format(new_cost, target_cost, deviation, args.cost_tolerance))
            new_content[new_index] = list(EMPTY_LINE)
        else:
            msg = "Cost deviation between {} and {} exceeds tolerance threshold ({} > {}). Entries are treated as different, both will be deleted."
            oat.print_y(msg.format(new_cost, target_cost, deviation, args.cost_tolerance))
            new_content[new_index] = list(EMPTY_LINE)
            target_content[target_index] = REPLACEMENT
            path, index = find_in_enriched_files(doi)
            ENRICHED_FILES[path]["content"][index] = list(EMPTY_LINE)
            ENRICHED_FILES[path]["modified"] = True
        count += 1
        if args.batch and count >= args.batch:
            break

    # Drop all target lines that were marked for removal.
    while REPLACEMENT in target_content:
        target_content.remove(REPLACEMENT)

    with open(args.target_file, 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
        writer.write_rows(target_header + target_content)
    with open(args.new_file, 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
        writer.write_rows(new_header + new_content)
    with open(UD_FILE, 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
        writer.write_rows(ud_header + ud_content)
    # Enriched files are only rewritten if a line in them was blanked.
    for path, enriched_file in ENRICHED_FILES.items():
        if enriched_file["modified"]:
            with open(path, 'w') as out:
                writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
                writer.write_rows(enriched_file["header"] + enriched_file["content"])
def main():
    """Analyze a CSV file with publication cost data and enrich it with metadata.

    Command line entry point. The function works in these stages:

    1. Parse the command line and attach two handlers to the root logger: a
       plain stderr handler and an ``oat.BufferedErrorHandler`` wrapping it.
    2. Optionally set the locale and validate the encoding given by the user.
    3. Let ``oat.analyze_csv_file`` inspect the file (dialect, encoding,
       header detection).
    4. Build a map of known column types. Column positions come from, in
       order: explicit command line arguments, header names recognised by
       ``oat.get_column_type_from_whitelist`` and, for 'doi', 'period' and
       'euro' only, a heuristic look at the first non-empty data row.
    5. Print a summary, then ask the user for confirmation on stdin.
    6. Pass every data row to ``oat.process_row`` and write one
       ``out_<record_type>.csv`` file per record type that received at least
       one row.

    The process is ended via ``sys.exit()`` on invalid arguments, on a failed
    file analysis, on a missing mandatory data set (unless ``--force`` is
    given) and when the user answers 'n' to the confirmation prompt.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-O", "--offsetting_mode", help=ARG_HELP_STRINGS["offsetting"])
    parser.add_argument("-b", "--bypass-cert-verification", action="store_true",
                        help=ARG_HELP_STRINGS["bypass"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-f", "--force", action="store_true",
                        help=ARG_HELP_STRINGS["force"])
    parser.add_argument("-i", "--ignore-header", action="store_true",
                        help=ARG_HELP_STRINGS["ignore_header"])
    parser.add_argument("-j", "--force-header", action="store_true",
                        help=ARG_HELP_STRINGS["force_header"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-a", "--add-unknown-columns", action="store_true",
                        help=ARG_HELP_STRINGS["unknown_columns"])
    parser.add_argument("-d", "--dialect", choices=["excel", "excel-tab", "unix"],
                        help=ARG_HELP_STRINGS["dialect"])
    parser.add_argument("-v", "--verbose", action="store_true",
                        help=ARG_HELP_STRINGS["verbose"])
    parser.add_argument("-o", "--overwrite", action="store_true",
                        help=ARG_HELP_STRINGS["overwrite"])
    parser.add_argument("-u", "--update", action="store_true",
                        help=ARG_HELP_STRINGS["update"])
    parser.add_argument("-r", "--round_monetary", action="store_true",
                        help=ARG_HELP_STRINGS["round_monetary"])
    parser.add_argument("--no-crossref", action="store_true",
                        help=ARG_HELP_STRINGS["no_crossref"])
    parser.add_argument("--no-pubmed", action="store_true",
                        help=ARG_HELP_STRINGS["no_pubmed"])
    parser.add_argument("--no-doaj", action="store_true",
                        help=ARG_HELP_STRINGS["no_doaj"])
    parser.add_argument("-institution", "--institution_column", type=int,
                        help=ARG_HELP_STRINGS["institution"])
    parser.add_argument("-period", "--period_column", type=int,
                        help=ARG_HELP_STRINGS["period"])
    parser.add_argument("-doi", "--doi_column", type=int,
                        help=ARG_HELP_STRINGS["doi"])
    parser.add_argument("-euro", "--euro_column", type=int,
                        help=ARG_HELP_STRINGS["euro"])
    parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int,
                        help=ARG_HELP_STRINGS["is_hybrid"])
    parser.add_argument("-publisher", "--publisher_column", type=int,
                        help=ARG_HELP_STRINGS["publisher"])
    parser.add_argument("-journal_full_title", "--journal_full_title_column",
                        type=int, help=ARG_HELP_STRINGS["journal_full_title"])
    parser.add_argument("-book_title", "--book_title_column",
                        type=int, help=ARG_HELP_STRINGS["book_title"])
    parser.add_argument("-issn", "--issn_column",
                        type=int, help=ARG_HELP_STRINGS["issn"])
    parser.add_argument("-isbn", "--isbn_column",
                        type=int, help=ARG_HELP_STRINGS["isbn"])
    parser.add_argument("-backlist_oa", "--backlist_oa_column",
                        type=int, help=ARG_HELP_STRINGS["backlist_oa"])
    parser.add_argument("-additional_isbns", "--additional_isbn_columns", type=int, nargs='+',
                        help=ARG_HELP_STRINGS["additional_isbns"])
    parser.add_argument("-url", "--url_column",
                        type=int, help=ARG_HELP_STRINGS["url"])
    parser.add_argument("-start", type=int, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"])

    args = parser.parse_args()

    # Two handlers on the root logger: one writes to stderr directly, the
    # buffered one is inspected and closed at the very end of this function.
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                  args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # Python 3 exceptions have no 'message' attribute; formatting the
            # exception itself yields its text.
            msg = "Setting locale to {} failed: {}".format(norm, loce)
            oat.print_r(msg)
            sys.exit()

    enc = None # CSV file encoding
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            oat.print_g(msg)
            enc = args.encoding
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    result = oat.analyze_csv_file(args.csv_file, enc=enc)
    if result["success"]:
        csv_analysis = result["data"]
        print(csv_analysis)
    else:
        print(result["error_msg"])
        sys.exit()

    if args.dialect:
        dialect = args.dialect
        oat.print_g('Dialect sniffing results ignored, using built-in CSV dialect "' + dialect + '"')
    else:
        dialect = csv_analysis.dialect

    # An explicitly given encoding wins over the detected one.
    if enc is None:
        enc = csv_analysis.enc
    has_header = csv_analysis.has_header or args.force_header

    if enc is None:
        print("Error: No encoding given for CSV file and automated " +
              "detection failed. Please set the encoding manually via the " +
              "--enc argument")
        sys.exit()

    csv_file = open(args.csv_file, "r", encoding=enc)
    reader = csv.reader(csv_file, dialect=dialect)

    # The width of the first row defines the number of columns for the rest
    # of the analysis.
    first_row = next(reader)
    num_columns = len(first_row)
    print("\nCSV file has {} columns.".format(num_columns))

    csv_file.seek(0)
    reader = csv.reader(csv_file, dialect=dialect)

    if args.update and args.overwrite:
        oat.print_r("Error: Either use the -u or the -o option, not both.")
        sys.exit()

    # -o: always overwrite, neither -o nor -u: always ask. With -u the
    # module-level defaults in OVERWRITE_STRATEGY stay untouched.
    if args.overwrite:
        for column in OVERWRITE_STRATEGY:
            OVERWRITE_STRATEGY[column] = CSVColumn.OW_ALWAYS
    elif not args.update:
        for column in OVERWRITE_STRATEGY:
            OVERWRITE_STRATEGY[column] = CSVColumn.OW_ASK

    additional_isbn_columns = []
    if args.additional_isbn_columns:
        for index in args.additional_isbn_columns:
            # Column indexes are 0-based (they are compared against
            # enumerate() positions below), so num_columns itself is already
            # out of range.
            if index >= num_columns:
                msg = "Error: Additional ISBN column index {} exceeds number of columns ({})."
                oat.print_r(msg.format(index, num_columns))
                sys.exit()
            else:
                additional_isbn_columns.append(index)

    # Known column types. The third argument is the column index, which is
    # None unless given on the command line (or never user-assignable).
    column_map = {
        "institution": CSVColumn("institution", {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY}, args.institution_column, overwrite=OVERWRITE_STRATEGY["institution"]),
        "period": CSVColumn("period",{"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY}, args.period_column, overwrite=OVERWRITE_STRATEGY["period"]),
        "euro": CSVColumn("euro", {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY}, args.euro_column, overwrite=OVERWRITE_STRATEGY["euro"]),
        "doi": CSVColumn("doi", {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY}, args.doi_column, overwrite=OVERWRITE_STRATEGY["doi"]),
        "is_hybrid": CSVColumn("is_hybrid", {"articles": CSVColumn.MANDATORY, "books": CSVColumn.NONE}, args.is_hybrid_column, overwrite=OVERWRITE_STRATEGY["is_hybrid"]),
        "publisher": CSVColumn("publisher", {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE}, args.publisher_column, overwrite=OVERWRITE_STRATEGY["publisher"]),
        "journal_full_title": CSVColumn("journal_full_title", {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE}, args.journal_full_title_column, overwrite=OVERWRITE_STRATEGY["journal_full_title"]),
        "issn": CSVColumn("issn", {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE}, args.issn_column, overwrite=OVERWRITE_STRATEGY["issn"]),
        "issn_print": CSVColumn("issn_print", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["issn_print"]),
        "issn_electronic": CSVColumn("issn_electronic", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["issn_electronic"]),
        "issn_l": CSVColumn("issn_l", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["issn_l"]),
        "license_ref": CSVColumn("license_ref", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE} , None, overwrite=OVERWRITE_STRATEGY["license_ref"]),
        "indexed_in_crossref": CSVColumn("indexed_in_crossref", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["indexed_in_crossref"]),
        "pmid": CSVColumn("pmid", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["pmid"]),
        "pmcid": CSVColumn("pmcid", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["pmcid"]),
        "ut": CSVColumn("ut", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["ut"]),
        "url": CSVColumn("url", {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE}, args.url_column, overwrite=OVERWRITE_STRATEGY["url"]),
        "doaj": CSVColumn("doaj", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["doaj"]),
        "agreement": CSVColumn("agreement", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["agreement"]),
        "book_title": CSVColumn("book_title", {"articles": CSVColumn.NONE, "books": CSVColumn.RECOMMENDED}, args.book_title_column, overwrite=OVERWRITE_STRATEGY["book_title"]),
        "backlist_oa": CSVColumn("backlist_oa", {"articles": CSVColumn.NONE, "books": CSVColumn.MANDATORY}, args.backlist_oa_column, overwrite=OVERWRITE_STRATEGY["backlist_oa"]),
        "isbn": CSVColumn("isbn", {"articles": CSVColumn.NONE, "books": CSVColumn.BACKUP}, args.isbn_column, overwrite=OVERWRITE_STRATEGY["isbn"]),
        "isbn_print": CSVColumn("isbn_print", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["isbn_print"]),
        "isbn_electronic": CSVColumn("isbn_electronic", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["isbn_electronic"])
    }

    header = None
    if has_header:
        for row in reader:
            if not row: # Skip empty lines
                continue
            header = row # First non-empty row should be the header
            if args.ignore_header:
                print("Skipping header analysis due to command line argument.")
                break
            else:
                print("\n    *** Analyzing CSV header ***\n")
            for (index, item) in enumerate(header):
                if index in additional_isbn_columns:
                    msg = "Column named '{}' at index {} is designated as additional ISBN column"
                    print(msg.format(item, index))
                    continue
                column_type = oat.get_column_type_from_whitelist(item)
                # Indexes set on the command line are never replaced by a
                # header match.
                if column_type is not None and column_map[column_type].index is None:
                    column_map[column_type].index = index
                    column_map[column_type].column_name = item
                    found_msg = ("Found column named '{}' at index {}, " +
                                 "assuming this to be the '{}' column.")
                    print(found_msg.format(item, index, column_type))
            break


    print("\n    *** Starting heuristical analysis ***\n")
    for row in reader:
        if not row: # Skip empty lines
            # We analyze the first non-empty line, a possible header should
            # have been processed by now.
            continue
        column_candidates = {
            "doi": [],
            "period": [],
            "euro": []
        }
        found_msg = "The entry in column {} looks like a potential {}: {}"
        # Assignments do not change while the cells of this row are scanned,
        # so the lookup collection is built once instead of once per cell.
        assigned_indexes = {csvcolumn.index for csvcolumn in column_map.values()}
        assigned_indexes.update(additional_isbn_columns)
        for (index, entry) in enumerate(row):
            if index in assigned_indexes:
                # Skip columns already assigned
                continue
            entry = entry.strip()
            # Search for a DOI
            if column_map['doi'].index is None:
                if oat.DOI_RE.match(entry):
                    column_id = str(index)
                    # identify column either numerically or by column header
                    if header:
                        column_id += " ('" + header[index] + "')"
                    print(found_msg.format(column_id, "DOI", entry))
                    column_candidates['doi'].append(index)
                    continue
            # Search for a potential year string
            if column_map['period'].index is None:
                try:
                    maybe_period = int(entry)
                    now = datetime.date.today().year
                    # Should be a wide enough margin
                    if maybe_period >= 2000 and maybe_period <= now + 2:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print(found_msg.format(column_id, "year", entry))
                        column_candidates['period'].append(index)
                        continue
                except ValueError:
                    pass
            # Search for a potential monetary amount
            if column_map['euro'].index is None:
                try:
                    maybe_euro = locale.atof(entry)
                    if maybe_euro >= 10 and maybe_euro <= 10000:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print(found_msg.format(column_id, "euro amount", entry))
                        column_candidates['euro'].append(index)
                        continue
                except ValueError:
                    pass
        # A heuristic result is only accepted if it is unambiguous.
        for column_type, candidates in column_candidates.items():
            if column_map[column_type].index is not None:
                continue
            if len(candidates) > 1:
                print("Could not reliably identify the '" + column_type +
                      "' column - more than one possible candiate!")
            elif len(candidates) < 1:
                print("No candidate found for column '" + column_type + "'!")
            else:
                index = candidates.pop()
                column_map[column_type].index = index
                if header:
                    column_id = header[index]
                    column_map[column_type].column_name = column_id
                else:
                    column_id = index
                msg = "Assuming column '{}' to be the '{}' column."
                print(msg.format(column_id, column_type))
        # Only the first non-empty data row is inspected.
        break

    print("\n    *** CSV file analysis summary ***\n")

    # Reverse lookup: column index -> CSVColumn. Built before unknown columns
    # are added to column_map below.
    index_dict = {csvc.index: csvc for csvc in column_map.values()}

    for index in range(num_columns):
        column_name = ""
        if header:
            column_name = header[index]
        if index in index_dict:
            column = index_dict[index]
            msg = u"column number {} ({}) is the '{}' column ({})".format(
                index, column_name, column.column_type, column.get_req_description())
            print(msg)
        elif index in additional_isbn_columns:
            msg = u"column number {} ({}) is an additional ISBN column".format(index, column_name)
            oat.print_c(msg)
        else:
            if args.add_unknown_columns:
                msg = (u"column number {} ({}) is an unknown column, it will be " +
                       "appended to the generated CSV file")
                print(msg.format(index, column_name))
                if not column_name:
                    # Use a generic name
                    column_name = "unknown"
                while column_name in column_map:
                    # TODO: Replace by a numerical, increasing suffix
                    column_name += "_"
                # The requirement is given per publication type like for
                # every other entry, because the checks below look up
                # requirement["articles"] / requirement["books"] on all
                # columns in the map.
                column_map[column_name] = CSVColumn(
                    column_name,
                    {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                    index)
            else:
                msg = (u"column number {} ({}) is an unknown column, it will be " +
                       "ignored")
                print(msg.format(index, column_name))

    print()
    for column in column_map.values():
        if column.index is None:
            msg = "The '{}' column could not be identified ({})"
            print(msg.format(column.column_type, column.get_req_description()))
    print()

    # Collect the column types that are required for articles/books but
    # could not be located in the file.
    article_mand_missing = [x.column_type for x in column_map.values() if x.requirement["articles"] == CSVColumn.MANDATORY and x.index is None]
    article_back_missing = [x.column_type for x in column_map.values() if x.requirement["articles"] == CSVColumn.BACKUP and x.index is None]
    book_mand_missing = [x.column_type for x in column_map.values() if x.requirement["books"] == CSVColumn.MANDATORY and x.index is None]
    book_back_missing = [x.column_type for x in column_map.values() if x.requirement["books"] == CSVColumn.BACKUP and x.index is None]

    if article_mand_missing:
        msg = "Article enrichment is not possible - mandatory columns are missing ({})"
        oat.print_y(msg.format(", ".join(article_mand_missing)))
    elif article_back_missing:
        msg = "Article enrichment is possible, but backup columns are missing ({}) - each record will need a valid DOI"
        oat.print_b(msg.format(", ".join(article_back_missing)))
    else:
        oat.print_g("Article enrichment is possible with all backup columns in place")
    if book_mand_missing:
        msg = "Book enrichment is not possible - mandatory columns are missing ({})"
        oat.print_y(msg.format(", ".join(book_mand_missing)))
    elif book_back_missing:
        msg = "Book enrichment is possible, but backup columns are missing ({}) - each record will need a valid DOI"
        oat.print_b(msg.format(", ".join(book_back_missing)))
    else:
        oat.print_g("Book enrichment is possible with all backup columns in place")
    print()

    # Abort only if neither publication type can be processed.
    if article_mand_missing and book_mand_missing:
        if not args.force:
            oat.print_r("ERROR: Could not detect the minimum mandatory data set for any " +
                  "publication type. There are 2 ways to fix this:")
            if not header:
                print("1) Add a header row to your file and identify the " +
                      "column(s) by assigning them an appropiate column name.")
            else:
                print("1) Identify the missing column(s) by assigning them " +
                      "a different column name in the CSV header (You can " +
                      "use the column name(s) mentioned in the message above)")
            print("2) Use command line parameters when calling this script " +
                  "to identify the missing columns (use -h for help) ")
            sys.exit()
        else:
            oat.print_y("WARNING: Could not detect the minimum mandatory data set for any " +
                  "publication type - forced to continue.")

    start = input("\nStart metadata aggregation? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    print("\n    *** Starting metadata aggregation ***\n")

    # One result table per record type, each starting with its header row.
    enriched_content = {}
    for record_type, fields in oat.COLUMN_SCHEMAS.items():
        # add headers
        enriched_content[record_type] = {
            "count": 0,
            "content": [list(fields)]
        }

    if not os.path.isdir("tempfiles"):
        os.mkdir("tempfiles")
    isbn_handling = oat.ISBNHandling("tempfiles/ISBNRangeFile.xml")
    doab_analysis = oat.DOABAnalysis(isbn_handling, "tempfiles/DOAB.csv", verbose=False)
    doaj_analysis = oat.DOAJAnalysis("tempfiles/DOAJ.csv")

    # Rewind and read the whole file again, this time for processing.
    csv_file.seek(0)
    reader = csv.reader(csv_file, dialect=dialect)
    header_processed = False
    row_num = 0

    for row in reader:
        row_num += 1
        if not row:
            continue # skip empty lines
        if not header_processed:
            header_processed = True
            if has_header:
                # If the CSV file has a header, we are currently there - skip it
                # to get to the first data row
                continue
        if args.start and args.start > row_num:
            continue
        if args.end and args.end < row_num:
            continue
        print("---Processing line number " + str(row_num) + "---")
        result_type, enriched_row = oat.process_row(row, row_num, column_map, num_columns, additional_isbn_columns, doab_analysis, doaj_analysis,
                                                    args.no_crossref, args.no_pubmed,
                                                    args.no_doaj, args.round_monetary,
                                                    args.offsetting_mode)
        # The row goes into the table of its record type; every other table
        # receives an empty line of matching width instead, so all tables
        # keep the same number of lines.
        for record_type, value in enriched_content.items():
            if record_type == result_type:
                value["content"].append(enriched_row)
                value["count"] += 1
            else:
                empty_line = ["" for x in value["content"][0]]
                value["content"].append(empty_line)
    csv_file.close()

    # Only record types that actually received data get an output file.
    for record_type, value in enriched_content.items():
        if value["count"] > 0:
            with open('out_' + record_type + '.csv', 'w') as out:
                writer = oat.OpenAPCUnicodeWriter(out, oat.OPENAPC_STANDARD_QUOTEMASK,
                                                  True, True, True)
                writer.write_rows(value["content"])

    if not bufferedHandler.buffer:
        oat.print_g("Metadata enrichment successful, no errors occured")
    else:
        oat.print_r("There were errors during the enrichment process:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
def main():
    """Convert monetary values in a CSV file to EUR and write 'out.csv'.

    Command line entry point. For every line of the source file the value in
    ``source_column`` is parsed with ``locale.atof``, divided by the exchange
    rate for the currency in ``currency_column`` and the period in
    ``period_column``, rounded to two decimals and stored in
    ``target_column``. Rates are cached in the module-level
    ``EXCHANGE_RATES`` mapping and fetched through
    ``oat.get_euro_exchange_rates`` when a currency is not cached yet.

    Lines without a usable source value, currency or period are copied
    unchanged with a warning. Lines whose currency is already "EUR" get the
    source value copied to the target column.

    The process is ended via ``sys.exit()`` on invalid arguments, when the
    user answers 'n' to the confirmation prompt and when no exchange rate can
    be determined for a line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # A quotemask is a string of 't'/'f' characters, turned into a list of
    # booleans for the CSV writer.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    enc = None

    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                  args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # Python 3 exceptions have no 'message' attribute; formatting the
            # exception itself yields its text.
            msg = "Setting locale to {} failed: {}".format(norm, loce)
            oat.print_r(msg)
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc, True)
    fieldnames = header.pop()

    modified_content = []
    line_num = 0

    # Echo the column selection so the user can verify it before confirming.
    for column_type in ["source_column", "currency_column", "period_column", "target_column"]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))

    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " + str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        try:
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if currency == "EUR":
            # Nothing to convert, the source value is carried over as is.
            msg = "WARNING: Currency in line {} is already EUR, skipping..."
            oat.print_y(msg.format(line_num))
            line[args.target_column] = line[args.source_column]
            modified_content.append(line)
            continue
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        # NOTE(review): the values "A" and "D" checked below presumably stand
        # for annual and daily rates - confirm against get_frequency().
        frequency = get_frequency(period)
        if frequency is None:
            msg = "WARNING: Could not extract a valid date string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        if currency not in EXCHANGE_RATES[frequency]:
            msg = 'No exchange rates ({}) found for currency "{}", querying ECB data warehouse...'
            oat.print_b(msg.format(frequency, currency))
            rates = oat.get_euro_exchange_rates(currency, frequency)
            EXCHANGE_RATES[frequency][currency] = rates
        rate = EXCHANGE_RATES[frequency][currency].get(period)
        if rate is None and frequency == "A":
            # Fall back to a preliminary average and cache it if available.
            rate = _calulate_preliminary_annual_average(period, currency)
            if rate:
                EXCHANGE_RATES[frequency][currency][period] = rate
        if rate is None:
            if frequency != "D":
                msg = "Error: No conversion rate found for currency {} for period {} (line {}), aborting..."
                oat.print_r(msg.format(currency, period, line_num))
                sys.exit()
            # For frequency "D" the following days are tried before giving up.
            day_retries = 0
            while rate is None:
                msg = "Warning: No conversion rate found for currency {} for period {} (line {}), trying next day..."
                oat.print_y(msg.format(currency, period, line_num))
                period = get_next_day(period)
                rate = EXCHANGE_RATES[frequency][currency].get(period)
                day_retries += 1
                if day_retries > 5:
                    msg = "Error: Look-ahead limit for days exceeded, aborting..."
                    oat.print_r(msg)
                    sys.exit()

        euro_value = round(monetary_value/float(rate), 2)
        line[args.target_column] = str(euro_value)

        msg = "Line {}: {} exchange rate ({}) for date {} is {} -> {} / {} = {} EUR"
        msg = msg.format(line_num, currency, frequency, period, rate, monetary_value, rate, euro_value)
        oat.print_g(msg)

        modified_content.append(line)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
Пример #14
0
def main():
    """Fill in the ISSN-L column of an APC CSV file and write 'out.csv'.

    Command line entry point. A mapping ISSN -> ISSN-L is built from
    ``issn_l_file``, using every line that consists of two ISSNs separated by
    a tab. Each non-empty line of ``apc_file`` is then looked up by the
    values in columns 7, 8 and 9 (after ``reformat_issn``), in that order.
    The first hit is passed through ``oat.get_corrected_issn_l`` and written
    to column 10. Lines without a hit are left unchanged.

    Statistics on the mapping table and on the enrichment are printed to
    stdout. The process is ended via ``sys.exit()`` on an invalid quotemask
    or an unknown encoding.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("apc_file", help=ARG_HELP_STRINGS["apc_file"])
    parser.add_argument("issn_l_file", help=ARG_HELP_STRINGS["issn_l_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q",
                        "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o",
                        "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true",
                        default=False)

    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # A quotemask is a string of 't'/'f' characters, turned into a list of
    # booleans for the CSV writer.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    enc = None

    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.apc_file, enc)

    oat.print_g("Preparing mapping table...")
    itself = other = 0
    # Raw string, so the regex escapes are not read as (invalid) string
    # escapes. '\t' is resolved to a tab by the re module itself.
    issn_l_re = re.compile(
        r"^(?P<issn>\d{4}-\d{3}[\dxX])\t(?P<issn_l>\d{4}-\d{3}[\dxX])$")
    issn_l_dict = {}
    # The context manager closes the mapping file once the table is built.
    with open(args.issn_l_file, "r") as issn_l_file:
        for i, line in enumerate(issn_l_file):
            if i % 100000 == 0:
                print(str(i) + " lines processed.")
            match = issn_l_re.match(line)
            if match:
                match_dict = match.groupdict()
                issn_l_dict[match_dict['issn']] = match_dict['issn_l']
                if match_dict['issn'] == match_dict['issn_l']:
                    itself += 1
                else:
                    other += 1
    print(
        str(itself) + " ISSNs pointing to itself as ISSN-L, " + str(other) +
        " to another value.")
    oat.print_g("Starting enrichment...")

    # Number of lines mapped via each of the three ISSN columns.
    matches = {"issn": 0, "issn_p": 0, "issn_e": 0}
    unmatched = different = corrections = 0
    enriched_lines = []
    for line in content:
        if len(line) == 0:
            enriched_lines.append(line)
            continue
        issn = reformat_issn(line[7])
        issn_p = reformat_issn(line[8])
        issn_e = reformat_issn(line[9])
        target = None
        # Try the three ISSN values in order of priority, the first one
        # found in the mapping table wins.
        for source, candidate in (("issn", issn), ("issn_p", issn_p),
                                  ("issn_e", issn_e)):
            if candidate in issn_l_dict:
                target = issn_l_dict[candidate]
                corrected_target = oat.get_corrected_issn_l(target)
                if corrected_target != target:
                    corrections += 1
                line[10] = corrected_target
                matches[source] += 1
                break
        else:
            unmatched += 1
        if target is not None and target not in [issn, issn_p, issn_e]:
            different += 1
        enriched_lines.append(line)

    msg = ("{} issn_l values mapped by issn, {} by issn_p, {} by issn_e. {} " +
           "could not be assigned.\n{} issn_l values were corrected during " +
           "the process.\n In {} cases the ISSN-L was different from all " +
           "existing ISSN values")
    print(
        msg.format(matches["issn"], matches["issn_p"], matches["issn_e"],
                   unmatched, corrections, different))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows(header + enriched_lines)
Пример #15
0
def main():
    """Unify publisher names and journal titles in an OpenAPC CSV file.

    Command line arguments select the input file, an optional encoding, an
    optional quote mask and whether OpenAPC quote rules are used for output.
    The first header row is compared against the entries of
    ``oat.COLUMN_SCHEMAS``; the ``CORRECTION_SCHEMAS`` entry registered under
    the matching schema type lists ``(column type, column index, ...)``
    tuples that say which columns are normalised. All rows, modified or not,
    are written to ``out.csv`` in the current working directory.

    The script terminates via ``sys.exit()`` when the encoding is unknown,
    the quote mask contains letters other than 't'/'f', or the header does
    not match any known schema.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q",
                        "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o",
                        "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true",
                        default=False)

    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    enc = None
    if args.encoding:
        # Only the lookup itself can raise LookupError, so keep the try
        # body down to that single call.
        try:
            codec = codecs.lookup(args.encoding)
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()
        codec_msg = "Encoding '{}' found in Python's codec collection as '{}'"
        oat.print_g(codec_msg.format(args.encoding, codec.name))
        enc = args.encoding

    mask = None
    if args.quotemask:
        if set(args.quotemask) - {"t", "f"}:
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [flag == "t" for flag in args.quotemask]

    header, content = oat.get_csv_file_content(args.csv_file, enc)
    correction_schema = None
    for schema_type, schema in oat.COLUMN_SCHEMAS.items():
        # An empty header cannot match any schema; fall through to the
        # error branch instead of failing on header[0].
        if header and header[0] == schema:
            oat.print_g("Schema autodetection: " + schema_type)
            correction_schema = CORRECTION_SCHEMAS[schema_type]
            break
    else:
        oat.print_r(
            "Error: CSV header does not match any known OpenAPC data schema")
        # Without a schema there is nothing to iterate over below, so stop
        # here rather than crash on ``for tup in None``.
        sys.exit()

    # Column type -> (normalisation function, message reported on change).
    normalisers = {
        "publisher": (oat.get_unified_publisher_name,
                      u"Line {}: Updated publisher name ({} -> {})"),
        "journal_full_title": (
            oat.get_unified_journal_title,
            u"Line {}: Updated journal_full_title ({} -> {})"),
    }

    for line_num, line in enumerate(content, start=1):
        if not line:
            # Empty rows carry no columns to normalise; they are still
            # counted and written back unchanged.
            continue
        for tup in correction_schema:
            if tup[0] not in normalisers:
                continue
            unify, msg = normalisers[tup[0]]
            index = tup[1]
            old_value = line[index]
            new_value = unify(old_value)
            if new_value != old_value:
                line[index] = new_value
                oat.print_g(msg.format(line_num, old_value, new_value))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows(header + content)
Пример #16
0
def main():
    """Convert monetary values of a CSV column to euro.

    The four positional column arguments are list indices into each CSV row:
    the column holding the foreign amount, the column holding the currency
    indicator, the column holding the period (a year given as a digit
    string) and the column that receives the converted value. Amounts are
    parsed with ``locale.atof`` (so ``--locale`` controls the accepted number
    format), divided by ``AVG_YEARLY_CONVERSION_RATES[currency][period]`` and
    rounded to two decimals.

    Rows with a missing/unparsable amount, period or currency are reported
    and copied through unchanged. A missing conversion rate aborts the run.
    The result is written to ``out.csv`` with the field name row as its only
    header row.

    The script terminates via ``sys.exit()`` on invalid arguments, a locale
    that cannot be set, a file without header row, a missing conversion rate,
    or when the user answers 'n' to the confirmation prompt.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    # NOTE(review): --force_overwrite is accepted but not consulted anywhere
    # in this function; the target column is always overwritten.
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q",
                        "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o",
                        "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true",
                        default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    mask = None
    if args.quotemask:
        if set(args.quotemask) - {"t", "f"}:
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [flag == "t" for flag in args.quotemask]

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()
        msg = "Encoding '{}' found in Python's codec collection as '{}'"
        oat.print_g(msg.format(args.encoding, codec.name))
        enc = args.encoding

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
        except locale.Error as loce:
            # Exceptions have no ``message`` attribute on Python 3; format
            # the exception itself to obtain its text.
            msg = "Setting locale to {} failed: {}".format(norm, loce)
            oat.print_r(msg)
            sys.exit()
        oat.print_g("Using locale " + loc)

    header, content = oat.get_csv_file_content(args.source_file, enc)
    if not header:
        # The column names shown in the confirmation below and the header
        # row of the output both come from the header, so it is required.
        oat.print_r("ERROR: No header row found in the source file, " +
                    "aborting...")
        sys.exit()
    fieldnames = header.pop()

    for column_type in [
            "source_column", "currency_column", "period_column",
            "target_column"
    ]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))

    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    modified_content = []
    for line_num, line in enumerate(content, start=1):
        # Every row ends up in the output; rows failing a check below are
        # appended untouched and the loop moves on.
        source_value = line[args.source_column]
        if not oat.has_value(source_value):
            oat.print_y("WARNING: No source value found in line " +
                        str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        try:
            monetary_value = locale.atof(source_value)
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, source_value))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        if not oat.has_value(period) or not period.isdigit():
            msg = "WARNING: Could not extract a valid year string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        try:
            rate = AVG_YEARLY_CONVERSION_RATES[currency][period]
        except KeyError:
            msg = "ERROR: No conversion rate found for currency {} in year {} (line {}), aborting..."
            oat.print_r(msg.format(currency, period, line_num))
            sys.exit()

        euro_value = round(monetary_value / rate, 2)
        line[args.target_column] = str(euro_value)
        modified_content.append(line)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
Пример #17
0
def main():
    """Copy values from one CSV file into another via a shared key column.

    A mapping ``key -> value`` is built from the key and value columns of
    the source file (rows whose key is 'NA' are ignored). If a key occurs
    with different values, the later value replaces the earlier one and a
    warning is printed; with ``--strict`` such keys are dropped from the
    mapping instead. Every row of the target file whose key is in the
    mapping then receives the mapped value in its value column, provided the
    old value is empty or 'NA', or ``--force_overwrite`` is given. The
    result is written to ``out.csv`` in the current working directory.

    The column arguments are used as list indices into the CSV rows. The
    script terminates via ``sys.exit()`` on an unknown encoding or a quote
    mask containing letters other than 't'/'f'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("source_file_key_column", type=int,
                        help=ARG_HELP_STRINGS["source_file_key_column"])
    parser.add_argument("source_file_value_column", type=int,
                        help=ARG_HELP_STRINGS["source_file_value_column"])
    parser.add_argument("target_file", help=ARG_HELP_STRINGS["target_file"])
    parser.add_argument("target_file_key_column", type=int,
                        help=ARG_HELP_STRINGS["target_file_key_column"])
    parser.add_argument("target_file_value_column", type=int,
                        help=ARG_HELP_STRINGS["target_file_value_column"])
    parser.add_argument("-s", "--strict", action="store_true",
                        help=ARG_HELP_STRINGS["strict"])
    parser.add_argument("-f", "--force_overwrite", action="store_true",
                        help=ARG_HELP_STRINGS["force_overwrite"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-e2", "--other_encoding",
                        help=ARG_HELP_STRINGS["other_encoding"])
    parser.add_argument("-q", "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)

    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # CSV file encodings: encs[0] is used for the source file, encs[1] for
    # the target file. An entry stays None if the option was not given.
    encs = []
    for encoding in [args.encoding, args.other_encoding]:
        if encoding:
            try:
                codec = codecs.lookup(encoding)
            except LookupError:
                print("Error: '" + encoding + "' not found Python's " +
                      "codec collection. Either look for a valid name here " +
                      "(https://docs.python.org/2/library/codecs.html#standard-" +
                      "encodings) or omit this argument to enable automated " +
                      "guessing.")
                sys.exit()
            # Format first, then print: calling .format() on the result of
            # print() fails on Python 3 because print() returns None.
            found_msg = ("Encoding '{}' found in Python's codec collection " +
                         "as '{}'")
            print(found_msg.format(encoding, codec.name))
        encs.append(encoding)

    mask = None
    if args.quotemask:
        if set(args.quotemask) - {"t", "f"}:
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [flag == "t" for flag in args.quotemask]

    source_header, source_content = oat.get_csv_file_content(args.source_file,
                                                             enc=encs[0])

    # Fall back to generic names when the source file has no header.
    key_column_name = "column " + str(args.source_file_key_column)
    value_column_name = "column " + str(args.source_file_value_column)
    if source_header:
        header = source_header[0]
        key_column_name = header[args.source_file_key_column]
        value_column_name = header[args.source_file_value_column]
    msg = u"Creating mapping table ({} -> {}) for source file {}...".format(
        key_column_name, value_column_name, args.source_file)
    oat.print_g(msg)

    mapping_table = {}
    ambiguous_keys = []
    for line in source_content:
        if not line:
            continue
        key = line[args.source_file_key_column]
        if key == 'NA':
            continue
        value = line[args.source_file_value_column]
        if key not in mapping_table:
            mapping_table[key] = value
        elif mapping_table[key] != value:
            if not args.strict:
                msg = u"WARNING: Replacing existing value '{}' for key '{}' with new value '{}'".format(
                    mapping_table[key], key, value)
                mapping_table[key] = value
                oat.print_y(msg)
            elif key not in ambiguous_keys:
                ambiguous_keys.append(key)
    if args.strict:
        for key in ambiguous_keys:
            del mapping_table[key]
            msg = u"INFO: Ambiguous key '{}' dropped from mapping table".format(key)
            oat.print_b(msg)

    oat.print_g("mapping table created, contains " + str(len(mapping_table)) +
                " entries")

    target_header, target_content = oat.get_csv_file_content(args.target_file,
                                                             enc=encs[1])

    # Reported line numbers start at 1 when a header precedes the content.
    first_line_num = 0 if not target_header else 1

    replace_msg = u"Line {}: Found matching key '{}', replaced old value '{}' by '{}'"
    modified_content = []
    for line_num, line in enumerate(target_content, start=first_line_num):
        # Empty rows have no key column; pass them through untouched, as
        # the source loop above does.
        if line:
            key = line[args.target_file_key_column]
            if key in mapping_table:
                new_value = mapping_table[key]
                old_value = line[args.target_file_value_column]
                if old_value != new_value:
                    if len(old_value) == 0 or old_value == "NA":
                        # Filling a gap is reported in green ...
                        report = oat.print_g
                    elif args.force_overwrite:
                        # ... replacing real data (only on request) in yellow.
                        report = oat.print_y
                    else:
                        report = None
                    if report is not None:
                        line[args.target_file_value_column] = new_value
                        report(replace_msg.format(line_num, key, old_value,
                                                  new_value))
        modified_content.append(line)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(target_header + modified_content)
Пример #18
0
        return "ElementTree ParseError: {}".format(str(etpe))


# Script entry point: the single positional argument is either the path of
# an existing CSV file or a DOI. For a file, get_prefix() is called on the
# fourth column of every row and the result is printed next to the row
# number; otherwise get_prefix() is called on the argument itself.
parser = argparse.ArgumentParser()
parser.add_argument(
    "doi_or_file",
    help=
    "An OpenAPC-compatible CSV file or a single DOI to look up in crossref.")
args = parser.parse_args()

arg = args.doi_or_file
if os.path.isfile(arg):
    # The context manager closes the file even if a lookup raises;
    # newline="" lets the csv module handle line endings inside quoted
    # fields itself.
    with open(arg, "r", encoding="utf8", newline="") as csv_file:
        reader = csv.reader(csv_file)
        for line_number, line in enumerate(reader):
            # Empty rows and rows too short to have a fourth column yield
            # an empty prefix instead of an IndexError.
            if len(line) > 3:
                prefix = get_prefix(line[3])
            else:
                prefix = ""
            result = str(line_number) + ": " + prefix
            if prefix == "Springer (Biomed Central Ltd.)":
                oat.print_g(result)
            elif prefix == "Nature Publishing Group":
                oat.print_r(result)
            else:
                print(result)
else:
    print(get_prefix(arg))
Пример #19
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        help=ARG_HELP_STRINGS["verbose"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-i",
                        "--ignore-header",
                        action="store_true",
                        help=ARG_HELP_STRINGS["headers"])
    parser.add_argument("-f",
                        "--force",
                        action="store_true",
                        help=ARG_HELP_STRINGS["force"])
    parser.add_argument("-b",
                        "--bypass-cert-verification",
                        action="store_true",
                        help=ARG_HELP_STRINGS["bypass"])
    parser.add_argument("-institution",
                        "--institution_column",
                        type=int,
                        help=ARG_HELP_STRINGS["institution"])
    parser.add_argument("-period",
                        "--period_column",
                        type=int,
                        help=ARG_HELP_STRINGS["period"])
    parser.add_argument("-doi",
                        "--doi_column",
                        type=int,
                        help=ARG_HELP_STRINGS["doi"])
    parser.add_argument("-euro",
                        "--euro_column",
                        type=int,
                        help=ARG_HELP_STRINGS["euro"])
    parser.add_argument("-is_hybrid",
                        "--is_hybrid_column",
                        type=int,
                        help=ARG_HELP_STRINGS["is_hybrid"])
    parser.add_argument("-publisher",
                        "--publisher_column",
                        type=int,
                        help=ARG_HELP_STRINGS["publisher"])
    parser.add_argument("-journal_full_title",
                        "--journal_full_title_column",
                        type=int,
                        help=ARG_HELP_STRINGS["journal_full_title"])
    parser.add_argument("-issn",
                        "--issn_column",
                        type=int,
                        help=ARG_HELP_STRINGS["issn"])
    parser.add_argument("-url",
                        "--url_column",
                        type=int,
                        help=ARG_HELP_STRINGS["url"])

    args = parser.parse_args()
    enc = None  # CSV file encoding

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            print "locale '{}' not found, normalized to '{}'".format(
                args.locale, norm)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            print "Using locale", loc
        except locale.Error as loce:
            print "Setting locale to " + norm + " failed: " + loce.message
            sys.exit()

    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            print("Encoding '{}' found in Python's codec collection " +
                  "as '{}'").format(args.encoding, codec.name)
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    result = oat.analyze_csv_file(args.csv_file)
    if result["success"]:
        csv_analysis = result["data"]
        print csv_analysis
    else:
        print result["error_msg"]
        sys.exit()

    if enc is None:
        enc = csv_analysis.enc
    dialect = csv_analysis.dialect
    has_header = csv_analysis.has_header

    if enc is None:
        print("Error: No encoding given for CSV file and automated " +
              "detection failed. Please set the encoding manually via the " +
              "--enc argument")
        sys.exit()

    csv_file = open(args.csv_file, "r")
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    first_row = reader.next()
    num_columns = len(first_row)
    print "\nCSV file has {} columns.".format(num_columns)

    csv_file.seek(0)
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    column_map = OrderedDict([
        ("institution",
         CSVColumn("institution", CSVColumn.MANDATORY,
                   args.institution_column)),
        ("period", CSVColumn("period", CSVColumn.MANDATORY,
                             args.period_column)),
        ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column)),
        ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column)),
        ("is_hybrid",
         CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column)),
        ("publisher",
         CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column)),
        ("journal_full_title",
         CSVColumn("journal_full_title", CSVColumn.OPTIONAL,
                   args.journal_full_title_column)),
        ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column)),
        ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None)),
        ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE,
                                      None)),
        ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None)),
        ("indexed_in_crossref",
         CSVColumn("indexed_in_crossref", CSVColumn.NONE, None)),
        ("pmid", CSVColumn("pmid", CSVColumn.NONE, None)),
        ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None)),
        ("ut", CSVColumn("ut", CSVColumn.NONE, None)),
        ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column)),
        ("doaj", CSVColumn("doaj", CSVColumn.NONE, None))
    ])

    # Do not quote the values in the 'period' and 'euro' columns
    quotemask = [
        True,
        False,
        False,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
    ]

    header = None
    if has_header:
        for row in reader:
            if not row:  # Skip empty lines
                continue
            header = row  # First non-empty row should be the header
            if args.ignore_header:
                print "Skipping header analysis due to command line argument."
                break
            else:
                print "\n    *** Analyzing CSV header ***\n"
            for (index, item) in enumerate(header):
                column_type = oat.get_column_type_from_whitelist(item)
                if column_type is not None and column_map[
                        column_type].index is None:
                    column_map[column_type].index = index
                    column_map[column_type].column_name = item
                    print("Found column named '{}' at index {}, " +
                          "assuming this to be the {} column.").format(
                              item, index, column_type)
            break

    print "\n    *** Starting heuristical analysis ***\n"
    for row in reader:
        if not row:  # Skip empty lines
            # We analyze the first non-empty line, a possible header should
            # have been processed by now.
            continue
        column_candidates = {"doi": [], "period": [], "euro": []}
        for (index, entry) in enumerate(row):
            if index in [csvcolumn.index for csvcolumn in column_map.values()]:
                # Skip columns already assigned
                continue
            entry = entry.strip()
            # Search for a DOI
            if column_map['doi'].index is None:
                if oat.DOI_RE.match(entry):
                    column_id = str(index)
                    # identify column either numerical or by column header
                    if header:
                        column_id += " ('" + header[index] + "')"
                    print("The entry in column {} looks like a " +
                          "DOI: {}").format(column_id, entry)
                    column_candidates['doi'].append(index)
                    continue
            # Search for a potential year string
            if column_map['period'].index is None:
                try:
                    maybe_period = int(entry)
                    now = datetime.date.today().year
                    # Should be a wide enough margin
                    if maybe_period >= 2000 and maybe_period <= now + 2:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print("The entry in column {} looks like a " +
                              "potential period: {}").format(column_id, entry)
                        column_candidates['period'].append(index)
                        continue
                except ValueError:
                    pass
            # Search for a potential monetary amount
            if column_map['euro'].index is None:
                try:
                    maybe_euro = locale.atof(entry)
                    # Are there APCs above 6000€ ??
                    if maybe_euro >= 10 and maybe_euro <= 6000:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print("The entry in column {} looks like a " +
                              "potential euro amount: {}").format(
                                  column_id, entry)
                        column_candidates['euro'].append(index)
                        continue
                except ValueError:
                    pass
        for column_type, candidates in column_candidates.iteritems():
            if column_map[column_type].index is not None:
                continue
            if len(candidates) > 1:
                print("Could not reliably identify the '" + column_type +
                      "' column - more than one possible candiate!")
            elif len(candidates) < 1:
                print "No candidate found for column '" + column_type + "'!"
            else:
                index = candidates.pop()
                column_map[column_type].index = index
                if header:
                    column_id = header[index]
                    column_map[column_type].column_name = column_id
                else:
                    column_id = index
                print("Assuming column '{}' to be the '{}' " +
                      "column.").format(column_id, column_type)
                column_map[column_type].index = index
        break

    # Wrap up: Check if there any mandatory column types left which have not
    # yet been identified - we cannot continue in that case (unless forced).
    unassigned = filter(
        lambda
        (k, v): v.requirement == CSVColumn.MANDATORY and v.index is None,
        column_map.iteritems())
    if unassigned:
        for item in unassigned:
            print "The {} column is still unidentified.".format(item[0])
        if header:
            print "The CSV header is:\n" + dialect.delimiter.join(header)
        if not args.force:
            print("ERROR: We cannot continue because not all mandatory " +
                  "column types in the CSV file could be automatically " +
                  "identified. There are 2 ways to fix this:")
            if not header:
                print(
                    "1) Add a header row to your file and identify the " +
                    "column(s) by assigning them an appropiate column name.")
            else:
                print(
                    "1) Identify the missing column(s) by assigning them " +
                    "a different column name in the CSV header (You can " +
                    "use the column name(s) mentioned in the message above)")
            print("2) Use command line parameters when calling this script " +
                  "to identify the missing columns (use -h for help) ")
            sys.exit()
        else:
            print("WARNING: Not all mandatory column types in the CSV file " +
                  "could be automatically identified - forced to continue.")

    print "\n    *** CSV file analysis summary ***\n"

    index_dict = {csvc.index: csvc for csvc in column_map.values()}

    for index in range(num_columns):
        column_name = ""
        if header:
            column_name = header[index]
        if index in index_dict:
            column = index_dict[index]
            msg = "column number {} ({}) is the {} column '{}'".format(
                index, column_name, column.requirement, column.column_type)
            if column.requirement in [CSVColumn.MANDATORY, CSVColumn.OPTIONAL]:
                oat.print_g(msg)
            else:
                oat.print_b(msg)
        else:
            msg = ("column number {} ({}) is an unknown column, it will be " +
                   "appended to the generated CSV file")
            oat.print_y(msg.format(index, column_name))
            if not column_name:
                # Use a generic name
                column_name = "unknown"
            while column_name in column_map.keys():
                # TODO: Replace by a numerical, increasing suffix
                column_name += "_"
            column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE,
                                                index)

    print ""
    for column in column_map.values():
        if column.index is None:
            msg = "The {} column '{}' could not be identified."
            print msg.format(column.requirement, column.column_type)

    # Check for unassigned optional column types. We can continue but should
    # issue a warning as all entries will need a valid DOI in this case.
    unassigned = filter(
        lambda (k, v): v.requirement == CSVColumn.OPTIONAL and v.index is None,
        column_map.iteritems())
    if unassigned:
        print("\nWARNING: Not all optional column types could be " +
              "identified. Metadata aggregation is still possible, but " +
              "every entry in the CSV file will need a valid DOI.")

    start = raw_input("\nStart metadata aggregation? (y/n):")
    while start not in ["y", "n"]:
        start = raw_input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    print "\n    *** Starting metadata aggregation ***\n"

    enriched_content = []

    error_messages = []

    csv_file.seek(0)
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)
    header_processed = False
    row_num = 0

    for row in reader:
        row_num += 1
        if not row:
            continue  # skip empty lines
        if not header_processed:
            header_processed = True
            enriched_content.append(column_map.keys())
            if has_header:
                # If the CSV file has a header, we are currently there - skip it
                # to get to the first data row
                continue
        print "---Processing line number " + str(row_num) + "---"
        if len(row) != num_columns:
            error_msg = (
                "Syntax: the number of values in line {} ({}) " +
                "differs from the number of columns ({}). Line left " +
                "unchanged, please correct the error in the result " +
                "file and re-run.")
            error_msg_fmt = error_msg.format(row_num, len(row), num_columns)
            error_messages.append("Line {}: {}".format(row_num, error_msg_fmt))
            oat.print_r(error_msg_fmt)
            enriched_content.append(row)
            continue

        doi = row[column_map["doi"].index]

        current_row = OrderedDict()
        # Copy content of identified columns
        for csv_column in column_map.values():
            if csv_column.index is not None and len(row[csv_column.index]) > 0:
                if csv_column.column_type == "euro":
                    # special case for monetary values: Cast to float to ensure
                    # the decimal point is a dot (instead of a comma)
                    euro_value = row[csv_column.index]
                    try:
                        euro = locale.atof(euro_value)
                        if euro.is_integer():
                            euro = int(euro)
                        current_row[csv_column.column_type] = str(euro)
                    except ValueError:
                        msg = ERROR_MSGS["locale"].format(
                            euro_value, csv_column.index)
                        oat.print_r(msg)
                        sys.exit()
                else:
                    current_row[csv_column.column_type] = row[csv_column.index]
            else:
                current_row[csv_column.column_type] = "NA"

        # include crossref metadata
        crossref_result = oat.get_metadata_from_crossref(doi)
        if crossref_result["success"]:
            print "Crossref: DOI resolved: " + doi
            current_row["indexed_in_crossref"] = "TRUE"
            data = crossref_result["data"]
            for key, value in data.iteritems():
                if value is not None:
                    if key == "journal_full_title":
                        unified_value = oat.get_unified_journal_title(value)
                        if unified_value != value:
                            msg = INFO_MSGS["unify"].format(
                                "journal title", value, unified_value)
                            oat.print_b(msg)
                        new_value = unified_value
                    elif key == "publisher":
                        unified_value = oat.get_unified_publisher_name(value)
                        if unified_value != value:
                            msg = INFO_MSGS["unify"].format(
                                "publisher name", value, unified_value)
                            oat.print_b(msg)
                        new_value = unified_value
                    else:
                        new_value = value
                else:
                    new_value = "NA"
                    if args.verbose:
                        print(u"WARNING: Element '{}' not found in in " +
                              "response for doi {}.").format(key, doi)
                old_value = current_row[key]
                current_row[key] = column_map[key].check_overwrite(
                    old_value, new_value)
        else:
            error_msg = ("Crossref: Error while trying to resolve DOI " + doi +
                         ": " + crossref_result["error_msg"])
            oat.print_r(error_msg)
            error_messages.append("Line {}: {}".format(row_num, error_msg))
            current_row["indexed_in_crossref"] = "FALSE"

        # include pubmed metadata
        pubmed_result = oat.get_metadata_from_pubmed(doi)
        if pubmed_result["success"]:
            print "Pubmed: DOI resolved: " + doi
            data = pubmed_result["data"]
            for key, value in data.iteritems():
                if value is not None:
                    new_value = value
                else:
                    new_value = "NA"
                    if args.verbose:
                        print(u"WARNING: Element '{}' not found in in " +
                              "response for doi {}.").format(key, doi)
                old_value = current_row[key]
                current_row[key] = column_map[key].check_overwrite(
                    old_value, new_value)
        else:
            error_msg = ("Pubmed: Error while trying to resolve DOI " + doi +
                         ": " + pubmed_result["error_msg"])
            oat.print_r(error_msg)
            error_messages.append("Line {}: {}".format(row_num, error_msg))

        # lookup in DOAJ. try the EISSN first, then ISSN and finally print ISSN
        if current_row["doaj"] != "TRUE":
            issns = []
            if current_row["issn_electronic"] != "NA":
                issns.append(current_row["issn_electronic"])
            if current_row["issn"] != "NA":
                issns.append(current_row["issn"])
            if current_row["issn_print"] != "NA":
                issns.append(current_row["issn_print"])
            for issn in issns:
                doaj_res = oat.lookup_journal_in_doaj(
                    issn, args.bypass_cert_verification)
                if doaj_res["data_received"]:
                    if doaj_res["data"]["in_doaj"]:
                        msg = "DOAJ: Journal ISSN ({}) found in DOAJ ('{}')."
                        print msg.format(issn, doaj_res["data"]["title"])
                        current_row["doaj"] = "TRUE"
                        break
                    else:
                        msg = "DOAJ: Journal ISSN ({}) not found in DOAJ."
                        current_row["doaj"] = "FALSE"
                        print msg.format(issn)
                else:
                    msg = "DOAJ: Error while trying to look up ISSN {}: {}"
                    msg_fmt = msg.format(issn, doaj_res["error_msg"])
                    oat.print_r(msg_fmt)
                    error_messages.append("Line {}: {}".format(
                        row_num, msg_fmt))

        enriched_content.append(current_row.values())

    csv_file.close()

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, quotemask, True, True)
        writer.write_rows(enriched_content)

    if not error_messages:
        oat.print_g("Metadata enrichment successful, no errors occured")
    else:
        oat.print_r("There were errors during the enrichment process:\n")
        for msg in error_messages:
            print msg + "\n"
Пример #20
0
def main():
    """
    Rearrange the content rows of a CSV file and write the result to 'out.csv'.

    Without a second CSV file the content rows are sorted by the value found
    at index 'column'. If a second CSV file is given, the rows of the first
    file are arranged in the order in which their 'column' values occur in the
    second file (at 'other_column', which defaults to 'column'). Rows without
    a counterpart in the second file are appended at the end in their original
    order. The header returned by oat.get_csv_file_content() is always put
    first.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("column", type=int, help=ARG_HELP_STRINGS["column"])
    parser.add_argument("other_csv_file", nargs="?", help=ARG_HELP_STRINGS["other_csv_file"])
    parser.add_argument("other_column", type=int, nargs="?", help=ARG_HELP_STRINGS["other_column"])
    parser.add_argument("-e2", "--other_encoding", help=ARG_HELP_STRINGS["other_encoding"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-i", "--ignore_case", action="store_true", default=False,
                        help=ARG_HELP_STRINGS["ignore_case"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)

    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # Encodings for the first and second CSV file (None = not specified).
    encs = []

    for encoding in [args.encoding, args.other_encoding]:
        if encoding:
            try:
                codec = codecs.lookup(encoding)
                msg = "Encoding '{}' found in Python's codec collection as '{}'"
                print(msg.format(encoding, codec.name))
            except LookupError:
                print("Error: '" + encoding + "' not found Python's " +
                      "codec collection. Either look for a valid name here " +
                      "(https://docs.python.org/2/library/codecs.html#standard-" +
                      "encodings) or omit this argument to enable automated " +
                      "guessing.")
                sys.exit()
        encs.append(encoding)

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if reduced:
            print("Error: A quotemask may only contain the letters 't' and 'f'!")
            sys.exit()
        mask = [x == "t" for x in args.quotemask]

    header, content = oat.get_csv_file_content(args.csv_file, enc=encs[0])
    column = args.column

    if not args.other_csv_file:
        rearranged_content = header + sorted(content, key=lambda x: x[column])
    else:
        rearranged_content = []
        _, second_content = oat.get_csv_file_content(args.other_csv_file, enc=encs[1])
        # Default: use the same column index as in the first file. The check
        # must be "is not None" since 0 is a valid (but falsy) column index.
        other_column = column
        if args.other_column is not None:
            other_column = args.other_column

        for other_row in second_content:
            # The comparison value is constant for the whole scan of 'content'
            wanted = other_row[other_column]
            if args.ignore_case:
                wanted = wanted.lower()
                matching_rows = [row for row in content if row[column].lower() == wanted]
            else:
                matching_rows = [row for row in content if row[column] == wanted]
            rearranged_content += matching_rows
            # Matched rows are consumed, so whatever remains in 'content'
            # afterwards had no counterpart in the second file.
            for matching_row in matching_rows:
                content.remove(matching_row)
        unmatched_msg = ("{} rows could not be rearranged (unmatched in second csv file) " +
                         "and were appended to the end of the result file " +
                         "in original order.")
        if content:
            oat.print_y(unmatched_msg.format(len(content)))
        else:
            oat.print_g("All rows matched.")
        rearranged_content = header + rearranged_content + content # append any unmatched rows

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(rearranged_content)
Пример #21
0
def main():
    """
    Integrate the values of an "update" CSV file into an "original" CSV file.

    Both files are analysed (encoding, dialect, doi and euro columns), then
    columns with the same name (compared lower-cased and stripped) are mapped
    onto each other, either automatically or after asking the user. The update
    file is read into a dict keyed by normalised DOI, the original file is
    walked line by line and all differing values are reported and replaced.
    Entries which only occur in the update file are appended as new lines.
    The result is written to 'out.csv' using the encoding and dialect of the
    original file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("original_file", help=ARG_HELP_STRINGS["original_file"])
    parser.add_argument("update_file", help=ARG_HELP_STRINGS["update_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-eu", "--update_encoding", help=ARG_HELP_STRINGS["update_encoding"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-lu", "--update_locale", help=ARG_HELP_STRINGS["update_locale"])
    parser.add_argument("-a", "--autocreate_mappings", action="store_true", help=ARG_HELP_STRINGS["autocreate_mappings"])
    parser.add_argument("-g", "--grouping", action="store_true", help=ARG_HELP_STRINGS["grouping"])
    
    args = parser.parse_args()
    
    # Per-file state. The None/empty entries are filled in by the analysis
    # loop and the mapping loop below. The two "mappings" lists are parallel:
    # entry i of the update list is mapped to entry i of the original list.
    params = {
        "original": {
            "file": args.original_file,
            "encoding": args.encoding,
            "locale": args.locale,
            "csv_analysis": None,
            "fieldnames": None,
            "doi_field": None,
            "euro_field": None,
            "mappings": []
        },
        "update": {
            "file": args.update_file,
            "encoding": args.update_encoding,
            "locale": args.update_locale,
            "csv_analysis": None,
            "fieldnames": None,
            "doi_field": None,
            "euro_field": None,
            "mappings": []
        }
    }
    
    def field_mapped(file_type, field_name):
        """
        Return True if field_name is already in use for the given file type
        ("original" or "update"), i.e. it is the euro field, the doi field or
        part of a mapping created earlier.
        """
        if field_name == params[file_type]["euro_field"]:
            return True
        if field_name == params[file_type]["doi_field"]:
            return True
        if field_name in params[file_type]["mappings"]:
            return True
        return False
    
    # Step 1: Analyse both files (encoding, locale, CSV dialect, doi/euro columns)
    for file_type in params.keys():
        msg = "*** Performing analysis for {} file ***"
        oat.print_b(msg.format(file_type))
        encoding = params[file_type]["encoding"]
        if encoding is not None:
            try:
                codec = codecs.lookup(encoding)
                msg = "Encoding '{}' found in Python's codec collection as '{}'"
                print(msg.format(encoding, codec.name))
                params[file_type]["encoding"] = encoding
            except LookupError:
                print ("Error: '" + encoding + "' not found Python's " +
                       "codec collection. Either look for a valid name here " +
                       "(https://docs.python.org/2/library/codecs.html#standard-" +
                       "encodings) or omit this argument to enable automated " +
                       "guessing.")
                sys.exit()

        loc = params[file_type]["locale"]
        if loc is not None:
            # Replace the given locale name by its normalised form if they differ
            norm = locale.normalize(loc)
            if norm != loc:
                msg = "locale '{}' not found, normalised to '{}'".format(loc, norm)
                oat.print_y(msg)
                params[file_type]["locale"] = norm

        csv_analysis = oat.analyze_csv_file(params[file_type]["file"], enc=params[file_type]["encoding"])
        if not csv_analysis["success"]:
            oat.print_r(csv_analysis["error_msg"])
            sys.exit()
        params[file_type]["csv_analysis"] = csv_analysis["data"]
        print(params[file_type]["csv_analysis"])
        
        # No encoding given by the user: use the one stored in the analysis result
        if params[file_type]["encoding"] is None:
            guessed_enc = params[file_type]["csv_analysis"].enc
            params[file_type]["encoding"] = guessed_enc

        locale_name = "default locale"
        if params[file_type]["locale"] is not None:
            locale_name = "locale " + params[file_type]["locale"]
        msg = "{} file will be opened with encoding {} and {}"
        oat.print_g(msg.format(file_type, params[file_type]["encoding"], locale_name))
        
        # Read the header only to identify the doi and euro columns. If several
        # columns are classified with the same type, the last one wins.
        with open(params[file_type]["file"], "r", encoding=params[file_type]["encoding"]) as f:
            reader = csv.DictReader(f, dialect=params[file_type]["csv_analysis"].dialect)
            params[file_type]["fieldnames"] = list(reader.fieldnames)
            for index, name in enumerate(params[file_type]["fieldnames"]):
                field_type = oat.get_column_type_from_whitelist(name)
                found = False
                if field_type == "doi":
                    params[file_type]["doi_field"] = name
                    found = True
                elif field_type == "euro":
                    params[file_type]["euro_field"] = name
                    found = True
                if found:
                    msg = '{} file: Found {} column at index {} ("{}")'
                    msg = msg.format(file_type, field_type, index, name)
                    oat.print_b(msg)
            # Both a doi and a euro column are mandatory in each file
            for field_type in ["doi_field", "euro_field"]:
                if params[file_type][field_type] is None:
                    msg = "Error: No {} found in {} file"
                    oat.print_r(msg.format(field_type, file_type))
                    sys.exit()
    
    # Step 2: Create mappings between the remaining columns of both files.
    # Column names are compared lower-cased and stripped; doi, euro and already
    # mapped columns are skipped on both sides.
    for orig_index, orig_field in enumerate(params["original"]["fieldnames"]):
        if field_mapped("original", orig_field):
            continue
        norm_orig_field = orig_field.lower().strip()
        for update_index, update_field in enumerate(params["update"]["fieldnames"]):
            if field_mapped("update", update_field):
                continue
            norm_update_field = update_field.lower().strip()
            if norm_orig_field == norm_update_field:
                if args.autocreate_mappings:
                    params["original"]["mappings"].append(orig_field)
                    params["update"]["mappings"].append(update_field)
                    msg = 'Auto-created mapping "{}" (update file, index {}) -> "{}" (original file, index {})'
                    oat.print_b(msg.format(update_field, update_index, orig_field, orig_index))
                else:
                    # Interactive mode: ask until the user answers 'y' or 'n'
                    msg = 'Possible mapping found: "{}" (update file, index {}) -> "{}" (original file, index {}). Create mapping (y/n)?'
                    msg = msg.format(update_field, update_index, orig_field, orig_index)
                    create = input(msg)
                    while create not in ["y", "n"]:
                        create = input("Please type 'y' or 'n':")
                    if create == "y":
                        params["original"]["mappings"].append(orig_field)
                        params["update"]["mappings"].append(update_field)
                    
    # Step 3: Read the update file. Result: normalised DOI -> dict of new values,
    # keyed by the field names of the ORIGINAL file.
    update_mappings = {}
    with open(params["update"]["file"], "r", encoding=params["update"]["encoding"]) as f:
        doi_field = params["update"]["doi_field"]
        euro_field = params["update"]["euro_field"]
        reader = csv.DictReader(f, dialect=params["update"]["csv_analysis"].dialect)
        # NOTE(review): if no update locale was given, the second argument is
        # None here, which according to the locale module documentation only
        # queries the current setting instead of changing it - confirm intended.
        locale.setlocale(locale.LC_ALL, params["update"]["locale"])
        for line in reader:
            doi = oat.get_normalised_DOI(line[doi_field])
            if doi is None:
                msg = 'Warning: Empty or invalid DOI in update file (line {}): "{}"'
                oat.print_y(msg.format(reader.line_num, line[doi_field]))
                continue
            if doi in update_mappings:
                msg = "Error: Duplicate doi in update file ({})".format(line[doi_field])
                oat.print_r(msg)
                sys.exit()
            update_mappings[doi] = {}
            # The euro value is parsed with the update file's locale and stored
            # as a number under the original file's euro field name
            euro_value = locale.atof(line[euro_field])
            orig_euro_field = params["original"]["euro_field"]
            update_mappings[doi][orig_euro_field] = euro_value
            for index, update_field_name in enumerate(params["update"]["mappings"]):
                orig_field_name = params["original"]["mappings"][index]
                update_mappings[doi][orig_field_name] = line[update_field_name]
            
    #print(json.dumps(update_mappings, sort_keys=False, indent=4))
    
    # Step 4: Walk through the original file and apply the collected updates
    modified_content = []
    fieldnames = None
    with open(params["original"]["file"], "r", encoding=params["original"]["encoding"]) as f:
        doi_field = params["original"]["doi_field"]
        euro_field = params["original"]["euro_field"]
        reader = csv.DictReader(f, dialect=params["original"]["csv_analysis"].dialect)
        fieldnames = list(reader.fieldnames)
        # Switch to the original file's locale for parsing and formatting
        # monetary values (see the note on a None locale above).
        locale.setlocale(locale.LC_ALL, params["original"]["locale"])
        for line in reader:
            doi = oat.get_normalised_DOI(line[doi_field])
            if doi not in update_mappings:
                # NOTE(review): such a line is only reported and then skipped,
                # it is not added to modified_content and will therefore be
                # missing in out.csv. Since handled DOIs are deleted from
                # update_mappings below, this also applies to a repeated DOI
                # in the original file - confirm this is intended.
                msg = "line {}: DOI {} not found in update file!"
                oat.print_r(msg.format(reader.line_num, doi))
                continue
            changes = []
            # Monetary values are compared numerically, all others as strings
            old_euro_value = locale.atof(line[euro_field])
            new_euro_value = update_mappings[doi][euro_field]
            if old_euro_value != new_euro_value:
                changes.append(Change(euro_field, old_euro_value, new_euro_value, monetary=True))
            for field in update_mappings[doi].keys():
                if field == euro_field:
                    continue
                if line[field] != update_mappings[doi][field]:
                    changes.append(Change(field, line[field], update_mappings[doi][field]))
            if not changes:
                msg = "line {}: DOI {} found in update file, but nothing changed."
                oat.print_g(msg.format(reader.line_num, doi))
            else:
                msg = "line {}: DOI {} found in update file with the following updates:"
                oat.print_y(msg.format(reader.line_num, doi))
                for change in changes:
                    oat.print_y(str(change))
                    if change.monetary:
                        # Write monetary values back as locale-formatted strings
                        line[change.field_name] = locale.currency(change.new_value,symbol=False, grouping=args.grouping)
                    else:
                        line[change.field_name] = change.new_value
            # Remove handled entries, so only the unmatched ones remain afterwards
            del(update_mappings[doi])
            modified_content.append(line)
        if update_mappings:
            oat.print_y("{} entries in update file not contained in original file:".format(len(update_mappings)))
        # Everything still left in update_mappings is appended as a new line.
        # Such a line only contains the doi, the euro value and the mapped fields.
        for doi, changes in update_mappings.items():
            oat.print_y(doi)
            new_line = changes
            new_line[params["original"]["doi_field"]] = doi
            formatted_euro = locale.currency(new_line[params["original"]["euro_field"]], symbol=False, grouping=args.grouping)
            new_line[params["original"]["euro_field"]] = formatted_euro
            modified_content.append(new_line)
    
    # Step 5: Write the result using the original file's encoding, dialect and
    # column order
    with open("out.csv", "w", encoding=params["original"]["encoding"]) as out:
        writer = csv.DictWriter(out, fieldnames, dialect=params["original"]["csv_analysis"].dialect)
        writer.writeheader()
        for line in modified_content:
            writer.writerow(line)
Пример #22
0
def integrate_changes(articles, file_path, enriched_file=False, dry_run=False):
    '''
    Update existing entries in a previously created harvest file.

    Lines of the file are matched to harvested articles by their "url" value
    (harvested articles use OAI record IDs in the url field as PID). Lines
    without a value in the "institution" column are kept unchanged, lines
    whose url does not occur in the harvest data are reported and removed.

    Args:
        articles: A list of article dicts, as returned by openapc_toolkit.oai_harvest()
        file_path: Path to the CSV file the new values should be integrated into.
        enriched_file: If true, columns which are overwritten during enrichment
                       will not be updated
        dry_run: Do not make any changes to the file (but still report changes and
                 return the list of unencountered articles)
    Returns:
        A tuple. The first element is a reduced list of article dicts, containing
        those which did not find a matching url in the file (Order preserved).
        Articles for which oat.has_value() rejects the url are not part of
        that list. The second element is the list of column headers
        encountered in the harvest file.
        If file_path is not an existing file, the tuple (articles, None) is
        returned.
    '''

    # Both 'line_change' templates take the same five arguments (line number,
    # url, column, old value, new value), since they share one format() call.
    messages = {
        'wet': {
            'start':
            'Integrating changes in harvest data into existing file {}',
            'line_change':
            'Line {} ({}): Updating value in column {} ("{}" -> "{}")',
            'remove':
            'PID {} no longer found in harvest data, removing article',
        },
        'dry': {
            'start':
            'Dry Run: Comparing harvest data to existing file {}',
            'line_change':
            'Line {} ({}): Change in column {} ("{}" -> "{}")',
            'remove':
            'PID {} no longer found in harvest data, article would be removed',
        }
    }

    messages = messages['dry'] if dry_run else messages['wet']

    if not os.path.isfile(file_path):
        return (articles, None)
    enriched_blacklist = [
        "institution", "publisher", "journal_full_title", "issn",
        "license_ref", "pmid"
    ]
    article_dict = OrderedDict()
    for article in articles:
        # Harvested articles use OAI record IDs in the url field as PID.
        url = article["url"]
        if oat.has_value(url):
            article_dict[url] = article
    updated_lines = []
    fieldnames = None
    with open(file_path, "r") as f:
        reader = DictReader(f)
        fieldnames = reader.fieldnames
        updated_lines.append(list(fieldnames))  #header
        oat.print_y(messages["start"].format(file_path))
        for line in reader:
            url = line["url"]
            if not oat.has_value(line["institution"]):
                # Do not change empty lines
                updated_lines.append([line[key] for key in fieldnames])
                continue
            line_num = reader.reader.line_num
            if url in article_dict:
                for key, value in article_dict[url].items():
                    if enriched_file and key in enriched_blacklist:
                        continue
                    if key in line and value != line[key]:
                        oat.print_g(messages["line_change"].format(
                            line_num, url, key, line[key], value))
                        line[key] = value
                # Matched articles are consumed, the rest is returned to the caller
                del article_dict[url]
                updated_lines.append([line[key] for key in fieldnames])
            else:
                # Not appending the line is what removes it from the result
                oat.print_r(messages["remove"].format(url))
    if not dry_run:
        with open(file_path, "w") as f:
            mask = oat.OPENAPC_STANDARD_QUOTEMASK if enriched_file else None
            writer = oat.OpenAPCUnicodeWriter(f,
                                              quotemask=mask,
                                              openapc_quote_rules=True,
                                              has_header=True)
            writer.write_rows(updated_lines)
    # Return a real list (as documented) instead of a dict view
    return (list(article_dict.values()), fieldnames)
Пример #23
0
def integrate_changes(articles, file_path, enriched_file=False):
    '''
    Merge freshly harvested article data into an existing harvest CSV file.

    Lines of the file are matched to the harvested articles via their "url"
    value. Differing values are replaced, lines whose url is not part of the
    harvest data are dropped, and the file is rewritten afterwards.

    Args:
        articles: A list of article dicts, as returned by openapc_toolkit.oai_harvest()
        file_path: Path to the CSV file which should receive the new values.
        enriched_file: If true, columns which are overwritten during enrichment
                       are left untouched
    Returns:
        A tuple. The first element holds the article dicts which had no
        matching url in the file (Order preserved), the second element is the
        list of column headers found in the harvest file. If file_path is no
        existing file, (articles, None) is returned.
    '''
    if not os.path.isfile(file_path):
        return (articles, None)
    protected_columns = [
        "institution", "publisher", "journal_full_title", "issn",
        "license_ref", "pmid"
    ]
    # Index the articles by url. This works because currently all repos use a
    # local ID/record url, but it is only a workaround - a switch to OAI record
    # IDs might become necessary later.
    pending = OrderedDict()
    for article in articles:
        record_url = article["url"]
        if oat.has_value(record_url):
            pending[record_url] = article
    rows_out = []
    fieldnames = None
    with open(file_path, "r") as harvest_file:
        csv_reader = DictReader(harvest_file)
        fieldnames = csv_reader.fieldnames
        rows_out.append(list(fieldnames))  # header row
        oat.print_g(
            "Integrating changes in harvest data into existing file {}".format(file_path))
        for row in csv_reader:
            row_url = row["url"]
            current_line = csv_reader.reader.line_num
            oat.print_b(
                "Line {}: Checking for changes ({})".format(current_line, row_url))
            if row_url not in pending:
                # Skipping the row here is what removes it from the file
                oat.print_r(
                    "URL {} no longer found in harvest data, removing article".format(row_url))
                continue
            for column, harvested in pending[row_url].items():
                if enriched_file and column in protected_columns:
                    continue
                if column in row and harvested != row[column]:
                    oat.print_g(
                        'Updating value in column {} ("{}" -> "{}")'.format(
                            column, row[column], harvested))
                    row[column] = harvested
            # Matched articles are consumed, the remainder goes back to the caller
            del pending[row_url]
            rows_out.append([row[name] for name in fieldnames])
    with open(file_path, "w") as harvest_file:
        if enriched_file:
            mask = oat.OPENAPC_STANDARD_QUOTEMASK
        else:
            mask = None
        csv_writer = oat.OpenAPCUnicodeWriter(harvest_file,
                                              quotemask=mask,
                                              openapc_quote_rules=True,
                                              has_header=True)
        csv_writer.write_rows(rows_out)
    return (pending.values(), fieldnames)
Пример #24
0
def main():
    """
    Delete lines from a CSV file depending on the value in a given column.

    A line is deleted if its value at the given column index is contained in
    a set of deletion values, which is built from the -v option and/or the
    lines of a file (-f option). Deleted lines are replaced by lines of empty
    strings unless the full_delete option is set. The result is written to
    'out.csv'; with the results_file option, the deleted lines are
    additionally written to 'del.csv'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("index", type=int, help=ARG_HELP_STRINGS["index"])
    parser.add_argument("-v", "--value", help=ARG_HELP_STRINGS["value"])
    parser.add_argument("-f", "--file", help=ARG_HELP_STRINGS["file"])
    parser.add_argument("-d", "--full_delete", action="store_true", help=ARG_HELP_STRINGS["full_delete"])
    parser.add_argument("-i", "--ignore_case", action="store_true", help=ARG_HELP_STRINGS["ignore_case"])
    parser.add_argument("-r", "--results_file", action="store_true", help=ARG_HELP_STRINGS["results_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)

    args = parser.parse_args()
    if args.value is None and args.file is None:
        parser.error("Either a single value (-v option) or a file of " +
                     "multiple values (-f option) must be given.")

    values = []
    if args.file:
        if not os.path.isfile(args.file):
            print("Error: '" + args.file + "' is no valid file!")
            sys.exit()
        with open(args.file, "r") as f:
            for line in f:
                # Strip the line break first: a raw line is never empty, so
                # blank lines would otherwise end up as an empty deletion
                # value matching every line with an empty cell.
                value = line.strip("\r\n")
                if not value:
                    continue
                values.append(value.lower() if args.ignore_case else value)
        oat.print_g(str(len(values)) + " values read from file")

    if args.value is not None:
        if args.ignore_case:
            values.append(args.value.lower())
        else:
            values.append(args.value)
        if args.file:
            oat.print_y("Value argument given in addition to file " +
                        "argument, adding value to file imports...")

    quote_rules = args.openapc_quote_rules

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.csv_file, enc)

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if reduced:
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [x == "t" for x in args.quotemask]

    # The replacement for deleted lines has as many (empty) cells as the
    # header or, without a header, as the first content line. A file without
    # any content must not lead to an IndexError here.
    column_name = "column " + str(args.index)
    if header:
        header_line = header[0]
        column_name = header_line[args.index]
        empty_line = [''] * len(header_line)
    elif content:
        empty_line = [''] * len(content[0])
    else:
        empty_line = []
    msg = u"Performing line deletion on condition '{}' in {}".format(column_name, values)
    oat.print_g(msg)

    # Set for fast membership tests ('values' keeps the order for the message)
    deletion_values = set(values)

    modified_content = []
    deleted_lines = []
    num_total_lines = num_deleted_lines = 0
    for line in content:
        if not line:
            continue
        num_total_lines += 1
        current_value = line[args.index]
        if args.ignore_case:
            current_value = current_value.lower()
        if current_value not in deletion_values:
            modified_content.append(line)
        else:
            num_deleted_lines += 1
            if not args.full_delete:
                modified_content.append(list(empty_line))
            if args.results_file:
                deleted_lines.append(line)

    msg = u"Process complete, deleted {} out of {} total lines"
    oat.print_g(msg.format(num_deleted_lines, num_total_lines))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(copy.deepcopy(header) + modified_content)

    if args.results_file and deleted_lines:
        with open('del.csv', 'w') as out:
            writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
            writer.write_rows(copy.deepcopy(header) + deleted_lines)
Пример #25
0
def main():
    """Look for PDF links on the landing pages of hybrid articles in a CSV file.

    Command line arguments: the CSV file, an optional encoding (``-e``) and an
    optional 1-based line window (``-start`` / ``-end``). Every row inside the
    window whose column 4 equals "TRUE" is checked against all entries of
    ``lpl_list`` whose publisher matches column 5; for each match the landing
    page of the DOI in column 3 is fetched and searched for a PDF link.

    Errors and warnings go through ``logging``; a buffered handler collects
    them so that they are printed again when it is closed at the end.
    Exits via ``sys.exit()`` if the given encoding is unknown to Python.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-start", type=int, default=1, help=ARG_HELP_STRINGS["start"])
    # NOTE(review): "-end" reuses the "start" help text - confirm whether
    # ARG_HELP_STRINGS offers a dedicated entry for it.
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["start"])
    args = parser.parse_args()

    # Messages are shown immediately (handler) and additionally collected
    # (bufferedHandler) for the summary at the end of the run.
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()
        else:
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding

    head, content = oat.get_csv_file_content(args.csv_file, enc)

    # Columns 0..6 are read below, so a usable row needs at least 7 values.
    required_columns = 7

    # Header rows are included so that line numbers match the file.
    for line_num, line in enumerate(head + content, 1):
        if args.start and args.start > line_num:
            continue
        if args.end and args.end < line_num:
            continue
        if len(line) < required_columns:
            # Empty or truncated rows used to raise an IndexError here.
            if line:
                logging.warning(u"Line {} has only {} columns, skipping".format(
                    line_num, len(line)))
            continue
        # Check hybrid status
        if line[4] != "TRUE":
            continue
        institution = line[0]
        period = line[1]
        doi = line[3]
        publisher = line[5]
        journal = line[6]
        for lpl in lpl_list:
            if not lpl.publisher_matches(publisher):
                continue
            init_msg = (u"Line {}: Checking {} article from {}, published in '" +
                        "{}'...").format(line_num, institution, period, journal)
            oat.print_b(init_msg)
            page_content = get_landingpage_content(doi, lpl)
            if page_content is None:
                continue
            pdf_link = lpl.search_for_oa(page_content)
            if pdf_link is None:
                error_msg = (u"No PDF link found! (line {}, DOI: " +
                             "http://doi.org/{})").format(line_num, doi)
                logging.error(error_msg)
            elif pdf_link == "":
                warning_msg = (u"A RegexGroup matched, but no PDF " +
                               "link was found! (line {}, DOI: " +
                               "http://doi.org/{})").format(line_num, doi)
                logging.warning(warning_msg)
            else:
                oat.print_g(u"PDF link found: " + pdf_link)
        # Pause between hybrid rows so landing pages are not requested back to back.
        time.sleep(1)

    if not bufferedHandler.buffer:
        oat.print_g("\nLookup finished, all articles were accessible")
    else:
        oat.print_r("\nLookup finished, not all articles could be accessed:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
Пример #26
0
def main():
    """Enrich a CSV file with metadata from Crossref, Pubmed and DOAJ.

    Workflow, in order:

    1. Parse the command line (file name, encoding, locale, explicit column
       indexes, flags).
    2. Optionally set the locale and validate the encoding; analyze the CSV
       file with ``oat.analyze_csv_file`` to obtain dialect, header flag and
       (if not given) the encoding.
    3. Assign CSV columns to the known column types: explicit command line
       indexes first, then header names (via the ``oat`` whitelist), then a
       heuristic look at the first row following the header.
    4. Print a summary, ask the user for confirmation on stdin.
    5. Process every row: copy known values, merge metadata returned by
       Crossref and Pubmed for the row's DOI, look up the journal's ISSNs in
       DOAJ.
    6. Write all rows to ``out.csv`` and report collected error messages.

    The function terminates the process with ``sys.exit()`` on invalid
    arguments, failed analysis, unidentified mandatory columns (unless
    ``--force`` is given), a monetary value that cannot be parsed, or when
    the user answers "n".
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-v", "--verbose", action="store_true",
                        help=ARG_HELP_STRINGS["verbose"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-i", "--ignore-header", action="store_true",
                        help=ARG_HELP_STRINGS["headers"])
    parser.add_argument("-f", "--force", action="store_true",
                        help=ARG_HELP_STRINGS["force"])
    parser.add_argument("-b", "--bypass-cert-verification", action="store_true",
                        help=ARG_HELP_STRINGS["bypass"])
    # The following options let the user pin a column type to a column index.
    parser.add_argument("-institution", "--institution_column", type=int,
                        help=ARG_HELP_STRINGS["institution"])
    parser.add_argument("-period", "--period_column", type=int,
                        help=ARG_HELP_STRINGS["period"])
    parser.add_argument("-doi", "--doi_column", type=int,
                        help=ARG_HELP_STRINGS["doi"])
    parser.add_argument("-euro", "--euro_column", type=int,
                        help=ARG_HELP_STRINGS["euro"])
    parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int,
                        help=ARG_HELP_STRINGS["is_hybrid"])
    parser.add_argument("-publisher", "--publisher_column", type=int,
                        help=ARG_HELP_STRINGS["publisher"])
    parser.add_argument("-journal_full_title", "--journal_full_title_column",
                        type=int, help=ARG_HELP_STRINGS["journal_full_title"])
    parser.add_argument("-issn", "--issn_column",
                        type=int, help=ARG_HELP_STRINGS["issn"])
    parser.add_argument("-url", "--url_column",
                        type=int, help=ARG_HELP_STRINGS["url"])

    args = parser.parse_args()
    enc = None # CSV file encoding

    # The locale matters later on: locale.atof() is used to parse monetary
    # values.
    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            print "locale '{}' not found, normalized to '{}'".format(
                args.locale, norm)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            print "Using locale", loc
        except locale.Error as loce:
            print "Setting locale to " + norm + " failed: " + loce.message
            sys.exit()

    # Only accept an encoding name that Python's codec registry knows.
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            print ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            enc = args.encoding
        except LookupError:
            print ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            sys.exit()

    result = oat.analyze_csv_file(args.csv_file)
    if result["success"]:
        csv_analysis = result["data"]
        print csv_analysis
    else:
        print result["error_msg"]
        sys.exit()
    
    # A user-supplied encoding takes precedence over the detected one.
    if enc is None:
        enc = csv_analysis.enc
    dialect = csv_analysis.dialect
    has_header = csv_analysis.has_header

    if enc is None:
        print ("Error: No encoding given for CSV file and automated " +
               "detection failed. Please set the encoding manually via the " +
               "--enc argument")
        sys.exit()

    csv_file = open(args.csv_file, "r")
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    # The length of the first row defines the expected column count for the
    # whole file.
    first_row = reader.next()
    num_columns = len(first_row)
    print "\nCSV file has {} columns.".format(num_columns)

    # Rewind and start over with a fresh reader for the column analysis.
    csv_file.seek(0)
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    # Known column types in output order. Each entry carries its requirement
    # level and the column index given on the command line (None if absent).
    column_map = OrderedDict([
        ("institution", CSVColumn("institution", CSVColumn.MANDATORY, args.institution_column)),  
        ("period", CSVColumn("period", CSVColumn.MANDATORY, args.period_column)),
        ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column)),
        ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column)),
        ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column)),
        ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column)),
        ("journal_full_title", CSVColumn("journal_full_title", CSVColumn.OPTIONAL,
                                        args.journal_full_title_column)),
        ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column)),
        ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None)),
        ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE, None)),
        ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None)),
        ("indexed_in_crossref", CSVColumn("indexed_in_crossref", CSVColumn.NONE, None)),
        ("pmid", CSVColumn("pmid", CSVColumn.NONE, None)),
        ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None)),
        ("ut", CSVColumn("ut", CSVColumn.NONE, None)),
        ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column)),
        ("doaj", CSVColumn("doaj", CSVColumn.NONE, None))
    ])

    # Do not quote the values in the 'period' and 'euro' columns
    # (positions 1 and 2 in the column_map order above).
    quotemask = [
        True,
        False,
        False,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
    ]

    # Step 1 of the column identification: match header names against the
    # whitelist. Only the first non-empty row is consumed from the reader.
    header = None
    if has_header:
        for row in reader:
            if not row: # Skip empty lines
                continue
            header = row # First non-empty row should be the header
            if args.ignore_header:
                print "Skipping header analysis due to command line argument."
                break
            else:
                print "\n    *** Analyzing CSV header ***\n"
            for (index, item) in enumerate(header):
                column_type = oat.get_column_type_from_whitelist(item)
                # Indexes given on the command line are never overwritten.
                if column_type is not None and column_map[column_type].index is None:
                    column_map[column_type].index = index
                    column_map[column_type].column_name = item
                    print ("Found column named '{}' at index {}, " +
                           "assuming this to be the {} column.").format(
                               item, index, column_type)
            break


    # Step 2: guess the still missing doi/period/euro columns from the
    # values of a single row (the next non-empty one the reader yields).
    print "\n    *** Starting heuristical analysis ***\n"
    for row in reader:
        if not row: # Skip empty lines
            # We analyze the first non-empty line, a possible header should
            # have been processed by now.
            continue
        column_candidates = {
            "doi": [],
            "period": [],
            "euro": []
        }
        for (index, entry) in enumerate(row):
            if index in [csvcolumn.index for csvcolumn in column_map.values()]:
                # Skip columns already assigned
                continue
            entry = entry.strip()
            # Search for a DOI
            if column_map['doi'].index is None:
                if oat.DOI_RE.match(entry):
                    column_id = str(index)
                    # identify column either numerical or by column header
                    if header:
                        column_id += " ('" + header[index] + "')"
                    print ("The entry in column {} looks like a " +
                           "DOI: {}").format(column_id, entry)
                    column_candidates['doi'].append(index)
                    continue
            # Search for a potential year string
            if column_map['period'].index is None:
                try:
                    maybe_period = int(entry)
                    now = datetime.date.today().year
                    # Should be a wide enough margin
                    if maybe_period >= 2000 and maybe_period <= now + 2:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print ("The entry in column {} looks like a " +
                               "potential period: {}").format(column_id, entry)
                        column_candidates['period'].append(index)
                        continue
                except ValueError:
                    pass
            # Search for a potential monetary amount
            if column_map['euro'].index is None:
                try:
                    maybe_euro = locale.atof(entry)
                    # Are there APCs above 6000€ ??
                    if maybe_euro >= 10 and maybe_euro <= 6000:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print ("The entry in column {} looks like a " +
                               "potential euro amount: {}").format(column_id,
                                                                   entry)
                        column_candidates['euro'].append(index)
                        continue
                except ValueError:
                    pass
        # A column type is only assigned if exactly one candidate was found.
        for column_type, candidates in column_candidates.iteritems():
            if column_map[column_type].index is not None:
                continue
            if len(candidates) > 1:
                print ("Could not reliably identify the '" + column_type +
                       "' column - more than one possible candiate!")
            elif len(candidates) < 1:
                print "No candidate found for column '" + column_type + "'!"
            else:
                index = candidates.pop()
                column_map[column_type].index = index
                if header:
                    column_id = header[index]
                    column_map[column_type].column_name = column_id
                else:
                    column_id = index
                print ("Assuming column '{}' to be the '{}' " +
                       "column.").format(column_id, column_type)
                # Repeats the assignment made right after candidates.pop().
                column_map[column_type].index = index
        # Only one row is inspected.
        break

    # Wrap up: Check if there any mandatory column types left which have not
    # yet been identified - we cannot continue in that case (unless forced).
    unassigned = filter(lambda (k, v): v.requirement == CSVColumn.MANDATORY and v.index is None,
                        column_map.iteritems())
    if unassigned:
        for item in unassigned:
            print "The {} column is still unidentified.".format(item[0])
        if header:
            print "The CSV header is:\n" + dialect.delimiter.join(header)
        if not args.force:
            print ("ERROR: We cannot continue because not all mandatory " +
                   "column types in the CSV file could be automatically " +
                   "identified. There are 2 ways to fix this:")
            if not header:
                print ("1) Add a header row to your file and identify the " +
                       "column(s) by assigning them an appropiate column name.")
            else:
                print ("1) Identify the missing column(s) by assigning them " +
                       "a different column name in the CSV header (You can " +
                       "use the column name(s) mentioned in the message above)")
            print ("2) Use command line parameters when calling this script " +
                   "to identify the missing columns (use -h for help) ")
            sys.exit()
        else:
            print ("WARNING: Not all mandatory column types in the CSV file " +
                   "could be automatically identified - forced to continue.")

    print "\n    *** CSV file analysis summary ***\n"

    # Reverse lookup: column index -> CSVColumn. Unassigned columns all share
    # the key None, which never equals an index from range(num_columns).
    index_dict = {csvc.index: csvc for csvc in column_map.values()}

    for index in range(num_columns):
        column_name = ""
        if header:
            column_name = header[index]
        if index in index_dict:
            column = index_dict[index]
            msg = "column number {} ({}) is the {} column '{}'".format(
                index, column_name, column.requirement, column.column_type)
            if column.requirement in [CSVColumn.MANDATORY, CSVColumn.OPTIONAL]:
                oat.print_g(msg)
            else:
                oat.print_b(msg)
        else:
            # Columns of unknown type are registered under their header name
            # (or "unknown") so that their content is carried over.
            msg = ("column number {} ({}) is an unknown column, it will be " +
                   "appended to the generated CSV file")
            oat.print_y(msg.format(index, column_name))
            if not column_name:
                # Use a generic name
                column_name = "unknown"
            # Append underscores until the name does not clash with an
            # existing key.
            while column_name in column_map.keys():
                # TODO: Replace by a numerical, increasing suffix
                column_name += "_"
            column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE, index)

    print ""
    for column in column_map.values():
        if column.index is None:
            msg = "The {} column '{}' could not be identified."
            print msg.format(column.requirement, column.column_type)


    # Check for unassigned optional column types. We can continue but should
    # issue a warning as all entries will need a valid DOI in this case.
    unassigned = filter(lambda (k, v): v.requirement == CSVColumn.OPTIONAL and v.index is None,
                        column_map.iteritems())
    if unassigned:
        print ("\nWARNING: Not all optional column types could be " +
               "identified. Metadata aggregation is still possible, but " +
               "every entry in the CSV file will need a valid DOI.")

    # Interactive confirmation; anything but "y" or "n" re-prompts.
    start = raw_input("\nStart metadata aggregation? (y/n):")
    while start not in ["y", "n"]:
        start = raw_input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    print "\n    *** Starting metadata aggregation ***\n"

    # Rows of the output file, starting with the generated header row.
    enriched_content = []
    
    # Collected for the final report, each prefixed with its line number.
    error_messages = []

    # Third pass over the file: this time every row is processed.
    csv_file.seek(0)
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)
    header_processed = False
    row_num = 0

    for row in reader:
        row_num += 1
        if not row:
            continue # skip empty lines
        if not header_processed:
            header_processed = True
            # The output header consists of the column_map keys, regardless
            # of whether the input had a header.
            enriched_content.append(column_map.keys())
            if has_header:
                # If the CSV file has a header, we are currently there - skip it
                # to get to the first data row
                continue
        print "---Processing line number " + str(row_num) + "---"
        if len(row) != num_columns:
            # Malformed rows are passed through unchanged and reported.
            error_msg = ("Syntax: the number of values in line {} ({}) " +
                         "differs from the number of columns ({}). Line left " +
                         "unchanged, please correct the error in the result " +
                         "file and re-run.")
            error_msg_fmt = error_msg.format(row_num, len(row), num_columns)
            error_messages.append("Line {}: {}".format(row_num, error_msg_fmt))
            oat.print_r(error_msg_fmt)
            enriched_content.append(row)
            continue

        # NOTE(review): the doi index can still be None here if --force was
        # used while the doi column was unidentified - confirm how that case
        # is meant to be handled.
        doi = row[column_map["doi"].index]
        
        current_row = OrderedDict()
        # Copy content of identified columns
        for csv_column in column_map.values():
            if csv_column.index is not None and len(row[csv_column.index]) > 0:
                if csv_column.column_type == "euro":
                    # special case for monetary values: Cast to float to ensure
                    # the decimal point is a dot (instead of a comma)
                    euro_value = row[csv_column.index]
                    try:
                        euro = locale.atof(euro_value)
                        # Whole amounts are written without a fractional part.
                        if euro.is_integer():
                            euro = int(euro)
                        current_row[csv_column.column_type] = str(euro)
                    except ValueError:
                        msg = ERROR_MSGS["locale"].format(euro_value,
                                                          csv_column.index)
                        oat.print_r(msg)
                        sys.exit()
                else:
                    current_row[csv_column.column_type] = row[csv_column.index]
            else:
                # Unassigned columns and empty cells are filled with "NA".
                current_row[csv_column.column_type] = "NA"

        # include crossref metadata
        crossref_result = oat.get_metadata_from_crossref(doi)
        if crossref_result["success"]:
            print "Crossref: DOI resolved: " + doi
            current_row["indexed_in_crossref"] = "TRUE"
            data = crossref_result["data"]
            for key, value in data.iteritems():
                if value is not None:
                    # Journal titles and publisher names are passed through
                    # the oat unification helpers; a change is reported.
                    if key == "journal_full_title":
                        unified_value = oat.get_unified_journal_title(value)
                        if unified_value != value:
                            msg = INFO_MSGS["unify"].format("journal title",
                                                            value,
                                                            unified_value)
                            oat.print_b(msg)
                        new_value = unified_value
                    elif key == "publisher":
                        unified_value = oat.get_unified_publisher_name(value)
                        if unified_value != value:
                            msg = INFO_MSGS["unify"].format("publisher name",
                                                            value,
                                                            unified_value)
                            oat.print_b(msg)
                        new_value = unified_value
                    else:
                        new_value = value
                else:
                    new_value = "NA"
                    if args.verbose:
                        print (u"WARNING: Element '{}' not found in in " +
                               "response for doi {}.").format(key, doi)
                # The column object decides which of the two values is kept.
                old_value = current_row[key]
                current_row[key] = column_map[key].check_overwrite(old_value, new_value)
        else:
            error_msg = ("Crossref: Error while trying to resolve DOI " + doi +
                         ": " + crossref_result["error_msg"])
            oat.print_r(error_msg)
            error_messages.append("Line {}: {}".format(row_num, error_msg))
            current_row["indexed_in_crossref"] = "FALSE"

        # include pubmed metadata
        pubmed_result = oat.get_metadata_from_pubmed(doi)
        if pubmed_result["success"]:
            print "Pubmed: DOI resolved: " + doi
            data = pubmed_result["data"]
            for key, value in data.iteritems():
                if value is not None:
                    new_value = value
                else:
                    new_value = "NA"
                    if args.verbose:
                        print (u"WARNING: Element '{}' not found in in " +
                               "response for doi {}.").format(key, doi)
                old_value = current_row[key]
                current_row[key] = column_map[key].check_overwrite(old_value, new_value)
        else:
            error_msg = ("Pubmed: Error while trying to resolve DOI " + doi +
                         ": " + pubmed_result["error_msg"])
            oat.print_r(error_msg)
            error_messages.append("Line {}: {}".format(row_num, error_msg))

        # lookup in DOAJ. try the EISSN first, then ISSN and finally print ISSN
        if current_row["doaj"] != "TRUE":
            issns = []
            if current_row["issn_electronic"] != "NA":
                issns.append(current_row["issn_electronic"])
            if current_row["issn"] != "NA":
                issns.append(current_row["issn"])
            if current_row["issn_print"] != "NA":
                issns.append(current_row["issn_print"])
            for issn in issns:
                doaj_res = oat.lookup_journal_in_doaj(issn, args.bypass_cert_verification)
                if doaj_res["data_received"]:
                    if doaj_res["data"]["in_doaj"]:
                        msg = "DOAJ: Journal ISSN ({}) found in DOAJ ('{}')."
                        print msg.format(issn, doaj_res["data"]["title"])
                        current_row["doaj"] = "TRUE"
                        # The first hit settles the question.
                        break
                    else:
                        # Keep trying the remaining ISSNs; a later hit
                        # overrides this "FALSE".
                        msg = "DOAJ: Journal ISSN ({}) not found in DOAJ."
                        current_row["doaj"] = "FALSE"
                        print msg.format(issn)
                else:
                    msg = "DOAJ: Error while trying to look up ISSN {}: {}"
                    msg_fmt = msg.format(issn, doaj_res["error_msg"])
                    oat.print_r(msg_fmt)
                    error_messages.append("Line {}: {}".format(row_num, msg_fmt))


        enriched_content.append(current_row.values())

    csv_file.close()

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, quotemask, True, True)
        writer.write_rows(enriched_content)

    if not error_messages:
        oat.print_g("Metadata enrichment successful, no errors occured")
    else:
        oat.print_r("There were errors during the enrichment process:\n")
        for msg in error_messages:
            print msg + "\n"
Пример #27
0
def main():
    """Fill column 10 of a CSV file with ISSN-L values from a mapping file.

    Command line arguments: the CSV file to enrich, a tab-separated
    ISSN -> ISSN-L mapping file, and optional encoding / quoting options.

    For every non-empty row the values in columns 7, 8 and 9 (named issn,
    issn_p and issn_e here) are normalised with ``reformat_issn`` and looked
    up in the mapping, in that order. The first hit is passed through
    ``oat.get_corrected_issn_l`` and written to column 10. All rows,
    including empty ones, are written to ``out.csv``; statistics are printed
    along the way.

    Exits via ``sys.exit()`` on an invalid quotemask, an unknown encoding, a
    failed CSV analysis or an undetectable encoding.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("apc_file", help=ARG_HELP_STRINGS["apc_file"])
    parser.add_argument("issn_l_file", help=ARG_HELP_STRINGS["issn_l_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)

    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # Translate a string like "tfft" into a list of per-column booleans.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [x == "t" for x in args.quotemask]

    enc = None

    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()
        else:
            print(("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name))
            enc = args.encoding

    result = oat.analyze_csv_file(args.apc_file, 500)
    if result["success"]:
        csv_analysis = result["data"]
        print(csv_analysis)
    else:
        print(result["error_msg"])
        sys.exit()

    # A user-supplied encoding takes precedence over the detected one.
    if enc is None:
        enc = csv_analysis.enc

    if enc is None:
        print("Error: No encoding given for CSV file and automated " +
              "detection failed. Please set the encoding manually via the " +
              "--enc argument")
        sys.exit()

    dialect = csv_analysis.dialect
    has_header = csv_analysis.has_header

    issn_matches = issn_p_matches = issn_e_matches = 0
    unmatched = different = corrections = 0
    enriched_lines = []

    # Both input files are opened in with-blocks so the handles are released
    # even if processing fails half-way.
    with open(args.apc_file, "r") as csv_file:
        reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

        oat.print_g("Preparing mapping table...")
        itself = other = 0
        # One mapping per line: "<ISSN><TAB><ISSN-L>"; other lines are ignored.
        issn_l_re = re.compile(
            r"^(?P<issn>\d{4}-\d{3}[\dxX])\t(?P<issn_l>\d{4}-\d{3}[\dxX])$")
        issn_l_dict = {}
        with open(args.issn_l_file, "r") as issn_l_file:
            for i, line in enumerate(issn_l_file):
                if i % 100000 == 0:
                    print(str(i) + " lines processed.")
                match = issn_l_re.match(line)
                if not match:
                    continue
                match_dict = match.groupdict()
                issn_l_dict[match_dict['issn']] = match_dict['issn_l']
                if match_dict['issn'] == match_dict['issn_l']:
                    itself += 1
                else:
                    other += 1
        print(str(itself) + " ISSNs pointing to itself as ISSN-L, " + str(other) + " to another value.")
        oat.print_g("Starting enrichment...")

        for line in reader:
            if len(line) == 0:
                enriched_lines.append(line)
                continue
            issn = reformat_issn(line[7])
            issn_p = reformat_issn(line[8])
            issn_e = reformat_issn(line[9])
            candidates = [issn, issn_p, issn_e]
            target = None
            # Lookup priority: issn, then issn_p, then issn_e.
            for position, candidate in enumerate(candidates):
                if candidate not in issn_l_dict:
                    continue
                target = issn_l_dict[candidate]
                corrected_target = oat.get_corrected_issn_l(target)
                if corrected_target != target:
                    corrections += 1
                line[10] = corrected_target
                if position == 0:
                    issn_matches += 1
                elif position == 1:
                    issn_p_matches += 1
                else:
                    issn_e_matches += 1
                break
            else:
                unmatched += 1
            if target is not None and target not in candidates:
                different += 1
            enriched_lines.append(line)

    print("{} issn_l values mapped by issn, {} by issn_p, {} by issn_e. {} could not be assigned.\n{} issn_l values were corrected during the process.\n In {} cases the ISSN-L was different from all existing ISSN values".format(issn_matches, issn_p_matches, issn_e_matches, unmatched, corrections, different))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, has_header)
        writer.write_rows(enriched_lines)
Пример #28
0
        return "HTTPError: {} - {}".format(code, httpe.reason)
    except urllib2.URLError as urle:
        return "URLError: {}".format(urle.reason)
    except ET.ParseError as etpe:
        return "ElementTree ParseError: {}".format(str(etpe))

# Script entry: the single argument is either the path of a CSV file whose
# column 3 is passed to get_prefix() row by row, or a single DOI that is
# passed to get_prefix() directly.
parser = argparse.ArgumentParser()
parser.add_argument("doi_or_file", help="An OpenAPC-compatible CSV file or a single DOI to look up in crossref.")
args = parser.parse_args()

arg = args.doi_or_file
if os.path.isfile(arg):
    # The with-block closes the file once all rows have been processed.
    with open(arg, "r") as csv_file:
        reader = oat.UnicodeReader(csv_file)
        # Numbering is 0-based and counts empty rows as well.
        for line_number, line in enumerate(reader):
            prefix = get_prefix(line[3]) if line else ""
            result = str(line_number) + ": " + prefix
            # Two prefixes are highlighted in colour, everything else is
            # printed plainly.
            if prefix == "Springer (Biomed Central Ltd.)":
                oat.print_g(result)
            elif prefix == "Nature Publishing Group":
                oat.print_r(result)
            else:
                print(result)
else:
    print(get_prefix(arg))
Пример #29
0
def main():
    """Convert a monetary CSV column to euro and write the result to 'out.csv'.

    Command line arguments select the source file and the integer indexes of
    the source value, currency, period and target columns. For every content
    row the source value is parsed with ``locale.atof`` and divided by the
    rate found in ``EXCHANGE_RATES[frequency][currency][period]``; the result,
    rounded to two decimals, is stored as a string in the target column.
    Rates for a (frequency, currency) pair that is not cached yet are fetched
    with ``oat.get_euro_exchange_rates``.

    Rows lacking a usable source value, currency or period are copied
    unchanged after a warning. A missing exchange rate for a row's period, an
    invalid quotemask/encoding/locale, or the user answering 'n' at the
    confirmation prompt terminates the script via ``sys.exit``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    # NOTE(review): force_overwrite is parsed but never read in this function.
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # Translate the 't'/'f' quotemask string into a list of booleans.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # The trailing space after "and" was missing, which rendered the
            # message as "... 't' and'f'!".
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [x == "t" for x in args.quotemask]

    # Validate a user-supplied encoding name; None is passed on otherwise.
    enc = None

    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    # The locale determines how locale.atof() parses the monetary values.
    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                  args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # Exceptions have no '.message' attribute on Python 3; formatting
            # the exception itself yields its message text.
            msg = "Setting locale to {} failed: {}".format(norm, loce)
            oat.print_r(msg)
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc, True)
    fieldnames = header.pop()

    modified_content = []
    line_num = 0

    # Echo the column assignment so the user can verify it before converting.
    for column_type in ["source_column", "currency_column", "period_column", "target_column"]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))

    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " + str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try:
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        frequency = get_frequency(period)
        if frequency is None:
            msg = "WARNING: Could not extract a valid date string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        # Fetch and cache the rates on first use of a (frequency, currency)
        # combination.
        if currency not in EXCHANGE_RATES[frequency]:
            msg = 'No exchange rates ({}) found for currency "{}", querying ECB data warehouse...'
            oat.print_b(msg.format(frequency, currency))
            rates = oat.get_euro_exchange_rates(currency, frequency)
            EXCHANGE_RATES[frequency][currency] = rates
        try:
            rate = EXCHANGE_RATES[frequency][currency][period]
        except KeyError:
            msg = "ERROR: No conversion rate found for currency {} for period {} (line {}), aborting..."
            oat.print_r(msg.format(currency, period, line_num))
            sys.exit()

        euro_value = round(monetary_value/float(rate), 2)
        line[args.target_column] = str(euro_value)

        msg = "Line {}: {} exchange rate ({}) for date {} is {} -> {} / {} = {} EUR"
        msg = msg.format(line_num, currency, frequency, period, rate, monetary_value, rate, euro_value)
        oat.print_g(msg)

        modified_content.append(line)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
Пример #30
0
def main():
    """Enrich a CSV file with metadata and write the result to 'out.csv'.

    The function runs these stages in order:

    1. Parse the command line, install logging handlers, and validate the
       locale, encoding, quotemask and offline-DOAJ options.
    2. Analyse the CSV file (``oat.analyze_csv_file``) to obtain its dialect,
       encoding and header status.
    3. Map CSV columns to known column types: first from explicit command
       line indexes, then from the header row, finally by a heuristic scan
       of the first non-empty data row (DOI, year and euro amount only).
    4. Print a summary, ask the user for confirmation, then pass every data
       row in the optional [-start, -end] range to ``oat.process_row``.
    5. Write the header plus the enriched rows to 'out.csv' and flush any
       buffered error messages.

    Invalid arguments, a failed analysis, unidentified mandatory columns
    (unless --force is given) or a 'n' at the prompt terminate the script via
    ``sys.exit``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-O", "--offsetting_mode", help=ARG_HELP_STRINGS["offsetting"])
    parser.add_argument("-b", "--bypass-cert-verification", action="store_true",
                        help=ARG_HELP_STRINGS["bypass"])
    parser.add_argument("-d", "--offline_doaj",
                        help=ARG_HELP_STRINGS["offline_doaj"])
    parser.add_argument("-D", "--offline_doaj_download",
                        help=ARG_HELP_STRINGS["offline_doaj_download"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-f", "--force", action="store_true",
                        help=ARG_HELP_STRINGS["force"])
    parser.add_argument("-i", "--ignore-header", action="store_true",
                        help=ARG_HELP_STRINGS["ignore_header"])
    parser.add_argument("-j", "--force-header", action="store_true",
                        help=ARG_HELP_STRINGS["force_header"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-u", "--add-unknown-columns", action="store_true",
                        help=ARG_HELP_STRINGS["unknown_columns"])
    parser.add_argument("-v", "--verbose", action="store_true",
                        help=ARG_HELP_STRINGS["verbose"])
    parser.add_argument("-o", "--overwrite", action="store_true",
                        help=ARG_HELP_STRINGS["overwrite"])
    parser.add_argument("-r", "--round_monetary", action="store_true",
                        help=ARG_HELP_STRINGS["round_monetary"])
    parser.add_argument("--no-crossref", action="store_true",
                        help=ARG_HELP_STRINGS["no_crossref"])
    parser.add_argument("--no-pubmed", action="store_true",
                        help=ARG_HELP_STRINGS["no_pubmed"])
    parser.add_argument("--no-doaj", action="store_true",
                        help=ARG_HELP_STRINGS["no_doaj"])
    parser.add_argument("-institution", "--institution_column", type=int,
                        help=ARG_HELP_STRINGS["institution"])
    parser.add_argument("-period", "--period_column", type=int,
                        help=ARG_HELP_STRINGS["period"])
    parser.add_argument("-doi", "--doi_column", type=int,
                        help=ARG_HELP_STRINGS["doi"])
    parser.add_argument("-euro", "--euro_column", type=int,
                        help=ARG_HELP_STRINGS["euro"])
    parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int,
                        help=ARG_HELP_STRINGS["is_hybrid"])
    parser.add_argument("-publisher", "--publisher_column", type=int,
                        help=ARG_HELP_STRINGS["publisher"])
    parser.add_argument("-journal_full_title", "--journal_full_title_column",
                        type=int, help=ARG_HELP_STRINGS["journal_full_title"])
    parser.add_argument("-issn", "--issn_column",
                        type=int, help=ARG_HELP_STRINGS["issn"])
    parser.add_argument("-url", "--url_column",
                        type=int, help=ARG_HELP_STRINGS["url"])
    parser.add_argument("-start", type=int, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"])
    parser.add_argument("-q", "--quotemask", default="tfftttttttttttttttt",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-n", "--no-openapc-quote-rules",
                        help=ARG_HELP_STRINGS["no_openapc_quote_rules"],
                        action="store_true", default=False)

    args = parser.parse_args()

    # Two handlers on the root logger: one writes to stderr directly, the
    # other wraps it in an oat.BufferedErrorHandler whose buffer is inspected
    # and closed at the end of this function.
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    if args.offline_doaj and args.offline_doaj_download:
        oat.print_r("Error: Either use the -d or the -D option, not both.")
        sys.exit()

    # The locale determines how locale.atof() parses euro candidates below.
    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                  args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # Exceptions have no '.message' attribute on Python 3; formatting
            # the exception itself yields its message text.
            msg = "Setting locale to {} failed: {}".format(norm, loce)
            oat.print_r(msg)
            sys.exit()

    enc = None # CSV file encoding
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            oat.print_g(msg)
            enc = args.encoding
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    result = oat.analyze_csv_file(args.csv_file, enc=enc)
    if result["success"]:
        csv_analysis = result["data"]
        print(csv_analysis)
    else:
        print(result["error_msg"])
        sys.exit()

    # Fall back to the detected encoding when none was given explicitly.
    if enc is None:
        enc = csv_analysis.enc
    dialect = csv_analysis.dialect
    has_header = csv_analysis.has_header or args.force_header

    if enc is None:
        print("Error: No encoding given for CSV file and automated " +
              "detection failed. Please set the encoding manually via the " +
              "--enc argument")
        sys.exit()

    # Translate the 't'/'f' quotemask string into a list of booleans.
    reduced = args.quotemask.replace("f", "").replace("t", "")
    if len(reduced) > 0:
        print("Error: A quotemask may only contain the letters 't' and "  +
              "'f'!")
        sys.exit()
    mask = [x == "t" for x in args.quotemask]

    doaj_offline_analysis = None
    if args.offline_doaj:
        if os.path.isfile(args.offline_doaj):
            doaj_offline_analysis = oat.DOAJOfflineAnalysis(args.offline_doaj)
        else:
            oat.print_r("Error: " + args.offline_doaj + " does not seem "
                        "to be a file!")
            sys.exit()
    elif args.offline_doaj_download:
        # Refuse to overwrite an existing download target.
        if os.path.isfile(args.offline_doaj_download):
            oat.print_r("Error: Target file '" + args.offline_doaj_download + "' already exists!")
            sys.exit()
        doaj_offline_analysis = oat.DOAJOfflineAnalysis(args.offline_doaj_download, download=True)

    csv_file = open(args.csv_file, "r", encoding=enc)
    reader = csv.reader(csv_file, dialect=dialect)

    # The column count is taken from the very first row of the file.
    first_row = next(reader)
    num_columns = len(first_row)
    print("\nCSV file has {} columns.".format(num_columns))

    # Rewind so the header/heuristic analysis starts at the first row again.
    csv_file.seek(0)
    reader = csv.reader(csv_file, dialect=dialect)

    if args.overwrite:
        ow_strategy = CSVColumn.OW_ALWAYS
    else:
        ow_strategy = CSVColumn.OW_ASK

    openapc_column_map = OrderedDict([
        ("institution", CSVColumn("institution", CSVColumn.MANDATORY, args.institution_column, overwrite=ow_strategy)),
        ("period", CSVColumn("period", CSVColumn.MANDATORY, args.period_column, overwrite=ow_strategy)),
        ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column, overwrite=ow_strategy)),
        ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column, overwrite=ow_strategy)),
        ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column, overwrite=ow_strategy)),
        ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column, overwrite=ow_strategy)),
        ("journal_full_title", CSVColumn("journal_full_title", CSVColumn.OPTIONAL,
                                         args.journal_full_title_column, overwrite=ow_strategy)),
        ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column, overwrite=ow_strategy)),
        ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("issn_l", CSVColumn("issn_l", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("indexed_in_crossref", CSVColumn("indexed_in_crossref", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("pmid", CSVColumn("pmid", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("ut", CSVColumn("ut", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column, overwrite=ow_strategy)),
        ("doaj", CSVColumn("doaj", CSVColumn.NONE, None, overwrite=ow_strategy))
    ])

    # Same layout as the OpenAPC map, except that 'euro' is not mandatory and
    # an additional 'agreement' column is present.
    offsetting_column_map = OrderedDict([
        ("institution", CSVColumn("institution", CSVColumn.MANDATORY, args.institution_column, overwrite=ow_strategy)),
        ("period", CSVColumn("period", CSVColumn.MANDATORY, args.period_column, overwrite=ow_strategy)),
        ("euro", CSVColumn("euro", CSVColumn.NONE, args.euro_column, overwrite=ow_strategy)),
        ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column, overwrite=ow_strategy)),
        ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column, overwrite=ow_strategy)),
        ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column, overwrite=ow_strategy)),
        ("journal_full_title", CSVColumn("journal_full_title", CSVColumn.OPTIONAL,
                                         args.journal_full_title_column, overwrite=ow_strategy)),
        ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column, overwrite=ow_strategy)),
        ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("issn_l", CSVColumn("issn_l", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("indexed_in_crossref", CSVColumn("indexed_in_crossref", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("pmid", CSVColumn("pmid", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("ut", CSVColumn("ut", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column, overwrite=ow_strategy)),
        ("doaj", CSVColumn("doaj", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("agreement", CSVColumn("agreement", CSVColumn.NONE, None, overwrite=ow_strategy)),
    ])

    if args.offsetting_mode:
        column_map = offsetting_column_map
    else:
        column_map = openapc_column_map

    header = None
    if has_header:
        for row in reader:
            if not row: # Skip empty lines
                continue
            header = row # First non-empty row should be the header
            if args.ignore_header:
                print("Skipping header analysis due to command line argument.")
                break
            else:
                print("\n    *** Analyzing CSV header ***\n")
            for (index, item) in enumerate(header):
                column_type = oat.get_column_type_from_whitelist(item)
                # Indexes given on the command line take precedence over
                # header names: only still-unassigned types are filled in.
                if column_type is not None and column_map[column_type].index is None:
                    column_map[column_type].index = index
                    column_map[column_type].column_name = item
                    found_msg = ("Found column named '{}' at index {}, " +
                                 "assuming this to be the {} column.")
                    print(found_msg.format(item, index, column_type))
            break


    print("\n    *** Starting heuristical analysis ***\n")
    for row in reader:
        if not row: # Skip empty lines
            # We analyze the first non-empty line, a possible header should
            # have been processed by now.
            continue
        column_candidates = {
            "doi": [],
            "period": [],
            "euro": []
        }
        found_msg = "The entry in column {} looks like a potential {}: {}"
        # Column indexes are not modified while the cells of this row are
        # scanned, so the set of assigned indexes can be computed once.
        assigned_indexes = {csvcolumn.index for csvcolumn in column_map.values()}
        for (index, entry) in enumerate(row):
            if index in assigned_indexes:
                # Skip columns already assigned
                continue
            entry = entry.strip()
            # Search for a DOI
            if column_map['doi'].index is None:
                if oat.DOI_RE.match(entry):
                    column_id = str(index)
                    # identify column either numerically or by column header
                    if header:
                        column_id += " ('" + header[index] + "')"
                    print(found_msg.format(column_id, "DOI", entry))
                    column_candidates['doi'].append(index)
                    continue
            # Search for a potential year string
            if column_map['period'].index is None:
                try:
                    maybe_period = int(entry)
                    now = datetime.date.today().year
                    # Should be a wide enough margin
                    if maybe_period >= 2000 and maybe_period <= now + 2:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print(found_msg.format(column_id, "year", entry))
                        column_candidates['period'].append(index)
                        continue
                except ValueError:
                    pass
            # Search for a potential monetary amount
            if column_map['euro'].index is None:
                try:
                    maybe_euro = locale.atof(entry)
                    if maybe_euro >= 10 and maybe_euro <= 10000:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print(found_msg.format(column_id, "euro amount", entry))
                        column_candidates['euro'].append(index)
                        continue
                except ValueError:
                    pass
        # Only accept a heuristic match when it is unambiguous.
        for column_type, candidates in column_candidates.items():
            if column_map[column_type].index is not None:
                continue
            if len(candidates) > 1:
                print("Could not reliably identify the '" + column_type +
                      "' column - more than one possible candiate!")
            elif len(candidates) < 1:
                print("No candidate found for column '" + column_type + "'!")
            else:
                index = candidates.pop()
                column_map[column_type].index = index
                if header:
                    column_id = header[index]
                    column_map[column_type].column_name = column_id
                else:
                    column_id = index
                msg = "Assuming column '{}' to be the '{}' column."
                print(msg.format(column_id, column_type))
                column_map[column_type].index = index
        break

    # Wrap up: Check if there any mandatory column types left which have not
    # yet been identified - we cannot continue in that case (unless forced).
    unassigned = [x for x in column_map.items() if x[1].requirement == CSVColumn.MANDATORY and x[1].index is None]
    if unassigned:
        for item in unassigned:
            print("The {} column is still unidentified.".format(item[0]))
        if header:
            print("The CSV header is:\n" + dialect.delimiter.join(header))
        if not args.force:
            print("ERROR: We cannot continue because not all mandatory " +
                  "column types in the CSV file could be automatically " +
                  "identified. There are 2 ways to fix this:")
            if not header:
                print("1) Add a header row to your file and identify the " +
                      "column(s) by assigning them an appropiate column name.")
            else:
                print("1) Identify the missing column(s) by assigning them " +
                      "a different column name in the CSV header (You can " +
                      "use the column name(s) mentioned in the message above)")
            print("2) Use command line parameters when calling this script " +
                  "to identify the missing columns (use -h for help) ")
            sys.exit()
        else:
            print("WARNING: Not all mandatory column types in the CSV file " +
                  "could be automatically identified - forced to continue.")

    print("\n    *** CSV file analysis summary ***\n")

    # Reverse lookup: CSV column index -> CSVColumn object.
    index_dict = {csvc.index: csvc for csvc in column_map.values()}

    for index in range(num_columns):
        column_name = ""
        if header:
            column_name = header[index]
        if index in index_dict:
            column = index_dict[index]
            msg = u"column number {} ({}) is the {} column '{}'".format(
                index, column_name, column.requirement, column.column_type)
            if column.requirement in [CSVColumn.MANDATORY, CSVColumn.OPTIONAL]:
                oat.print_g(msg)
            else:
                oat.print_b(msg)
        else:
            if args.add_unknown_columns:
                msg = (u"column number {} ({}) is an unknown column, it will be " +
                       "appended to the generated CSV file")
                oat.print_y(msg.format(index, column_name))
                if not column_name:
                    # Use a generic name
                    column_name = "unknown"
                while column_name in column_map.keys():
                    # TODO: Replace by a numerical, increasing suffix
                    column_name += "_"
                column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE, index)
            else:
                msg = (u"column number {} ({}) is an unknown column, it will be " +
                       "ignored")
                oat.print_y(msg.format(index, column_name))

    print()
    for column in column_map.values():
        if column.index is None:
            msg = "The {} column '{}' could not be identified."
            print(msg.format(column.requirement, column.column_type))


    # Check for unassigned optional column types. We can continue but should
    # issue a warning as all entries will need a valid DOI in this case.
    # A list is built deliberately: on Python 3 filter() returns a lazy
    # iterator that is always truthy, which made the warning unconditional.
    unassigned_optional = [
        name for name, column in column_map.items()
        if column.requirement == CSVColumn.OPTIONAL and column.index is None
    ]
    if unassigned_optional:
        print("\nWARNING: Not all optional column types could be " +
              "identified. Metadata aggregation is still possible, but " +
              "every entry in the CSV file will need a valid DOI.")

    start = input("\nStart metadata aggregation? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    print("\n    *** Starting metadata aggregation ***\n")

    enriched_content = []

    # Third pass over the file: the actual row processing.
    csv_file.seek(0)
    reader = csv.reader(csv_file, dialect=dialect)
    header_processed = False
    row_num = 0

    for row in reader:
        row_num += 1
        if not row:
            continue # skip empty lines
        if not header_processed:
            header_processed = True
            # The output header consists of the column type names.
            enriched_content.append(list(column_map.keys()))
            if has_header:
                # If the CSV file has a header, we are currently there - skip it
                # to get to the first data row
                continue
        if args.start and args.start > row_num:
            continue
        if args.end and args.end < row_num:
            continue
        print("---Processing line number " + str(row_num) + "---")
        enriched_row = oat.process_row(row, row_num, column_map, num_columns,
                                       args.no_crossref, args.no_pubmed,
                                       args.no_doaj, doaj_offline_analysis, args.round_monetary,
                                       args.offsetting_mode)
        enriched_content.append(enriched_row)

    csv_file.close()

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask,
                                          not args.no_openapc_quote_rules, True,
                                          True)
        writer.write_rows(enriched_content)

    if not bufferedHandler.buffer:
        oat.print_g("Metadata enrichment successful, no errors occured")
    else:
        oat.print_r("There were errors during the enrichment process:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
Пример #31
0
def main():
    """Delete CSV lines whose value in a given column matches a set of values.

    The values to match come from the -v option, from a file with one value
    per line (-f option), or both. Matching lines are replaced by a line of
    empty fields, or dropped entirely when --full_delete is set. The result
    is written to 'out.csv'; with --results_file the removed lines are
    additionally written to 'del.csv'. With --ignore_case both the given
    values and the CSV values are lower-cased before comparison.

    An invalid values file, encoding or quotemask terminates the script via
    ``sys.exit``; missing -v and -f options are reported via
    ``parser.error``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("index", type=int, help=ARG_HELP_STRINGS["index"])
    parser.add_argument("-v", "--value", help=ARG_HELP_STRINGS["value"])
    parser.add_argument("-f", "--file", help=ARG_HELP_STRINGS["file"])
    parser.add_argument("-d",
                        "--full_delete",
                        action="store_true",
                        help=ARG_HELP_STRINGS["full_delete"])
    parser.add_argument("-i",
                        "--ignore_case",
                        action="store_true",
                        help=ARG_HELP_STRINGS["ignore_case"])
    parser.add_argument("-r",
                        "--results_file",
                        action="store_true",
                        help=ARG_HELP_STRINGS["results_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q",
                        "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o",
                        "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true",
                        default=False)

    args = parser.parse_args()
    if args.value is None and args.file is None:
        parser.error("Either a single value (-v option) or a file of " +
                     "multiple values (-f option) must be given.")

    values = []
    if args.file:
        if not os.path.isfile(args.file):
            print("Error: '" + args.file + "' is no valid file!")
            sys.exit()
        with open(args.file, "r") as f:
            for line in f:
                value = line.strip("\r\n")
                # Lines read from a file still carry their newline, so the
                # emptiness check has to happen after stripping. Otherwise a
                # blank line would add '' to the values and cause every CSV
                # line with an empty field in the column to be deleted.
                if not value:
                    continue
                if args.ignore_case:
                    values.append(value.lower())
                else:
                    values.append(value)
        oat.print_g(str(len(values)) + " values read from file")

    if args.value is not None:
        if args.ignore_case:
            values.append(args.value.lower())
        else:
            values.append(args.value)
        if args.file:
            oat.print_y("Value argument given in addition to file " +
                        "argument, adding value to file imports...")

    quote_rules = args.openapc_quote_rules

    # Validate a user-supplied encoding name; None is passed on otherwise.
    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.csv_file, enc)

    # Translate the 't'/'f' quotemask string into a list of booleans.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # The trailing space after "and" was missing, which rendered the
            # message as "... 't' and'f'!".
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [x == "t" for x in args.quotemask]

    # The replacement line has as many empty fields as the header line, or as
    # the first content line when there is no header. Guard against a file
    # without content lines, where content[0] would raise an IndexError.
    column_name = "column " + str(args.index)
    if header:
        header_line = header[0]
        column_name = header_line[args.index]
        empty_line = ['' for element in header_line]
    elif content:
        empty_line = ['' for element in content[0]]
    else:
        empty_line = []
    msg = u"Performing line deletion on condition '{}' in {}".format(
        column_name, values)
    oat.print_g(msg)

    # Set for O(1) membership tests; the list is kept for the message above.
    lookup = set(values)

    modified_content = []
    deleted_lines = []
    num_total_lines = num_deleted_lines = 0
    for line in content:
        if len(line) == 0:
            continue
        num_total_lines += 1
        current_value = line[args.index]
        if args.ignore_case:
            current_value = current_value.lower()
        if current_value not in lookup:
            modified_content.append(line)
        else:
            num_deleted_lines += 1
            if not args.full_delete:
                # Keep a placeholder line of empty fields (a fresh copy each
                # time, so the lines do not share one list object).
                modified_content.append(list(empty_line))
            if args.results_file:
                deleted_lines.append(line)

    msg = u"Process complete, deleted {} out of {} total lines"
    oat.print_g(msg.format(num_deleted_lines, num_total_lines))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(copy.deepcopy(header) + modified_content)

    if args.results_file and len(deleted_lines) > 0:
        with open('del.csv', 'w') as out:
            writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
            writer.write_rows(copy.deepcopy(header) + deleted_lines)
Пример #32
0
def main():
    """
    Command line entry point: analyse a CSV file, identify its columns and
    run metadata enrichment on every data row.

    The function works in several phases:

    1. Parse the command line, install logging handlers and optionally set
       a locale and validate a user-supplied encoding.
    2. Let oat.analyze_csv_file() determine encoding, dialect and header
       presence of the input file.
    3. Assign CSV columns to the entries of 'column_map', first from
       explicit command line indexes, then by looking up header names in a
       whitelist, finally by heuristics (DOI regex, plausible year,
       plausible euro amount) on the first non-empty data row.
    4. Print a summary and ask the user for confirmation.
    5. Pass every non-empty data row to oat.process_row() and write the
       collected results to 'out.csv' in the current working directory.

    The script terminates via sys.exit() on invalid locale/encoding, failed
    CSV analysis, unidentified mandatory columns (unless --force is given)
    or when the user answers 'n' at the confirmation prompt.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-b",
                        "--bypass-cert-verification",
                        action="store_true",
                        help=ARG_HELP_STRINGS["bypass"])
    parser.add_argument("-d",
                        "--offline_doaj",
                        help=ARG_HELP_STRINGS["offline_doaj"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-f",
                        "--force",
                        action="store_true",
                        help=ARG_HELP_STRINGS["force"])
    parser.add_argument("-i",
                        "--ignore-header",
                        action="store_true",
                        help=ARG_HELP_STRINGS["ignore_header"])
    parser.add_argument("-j",
                        "--force-header",
                        action="store_true",
                        help=ARG_HELP_STRINGS["force_header"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-u",
                        "--add-unknown-columns",
                        action="store_true",
                        help=ARG_HELP_STRINGS["unknown_columns"])
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        help=ARG_HELP_STRINGS["verbose"])
    # The following options let the user pin a column type to a fixed,
    # numerical column index instead of relying on automatic detection.
    parser.add_argument("-institution",
                        "--institution_column",
                        type=int,
                        help=ARG_HELP_STRINGS["institution"])
    parser.add_argument("-period",
                        "--period_column",
                        type=int,
                        help=ARG_HELP_STRINGS["period"])
    parser.add_argument("-doi",
                        "--doi_column",
                        type=int,
                        help=ARG_HELP_STRINGS["doi"])
    parser.add_argument("-euro",
                        "--euro_column",
                        type=int,
                        help=ARG_HELP_STRINGS["euro"])
    parser.add_argument("-is_hybrid",
                        "--is_hybrid_column",
                        type=int,
                        help=ARG_HELP_STRINGS["is_hybrid"])
    parser.add_argument("-publisher",
                        "--publisher_column",
                        type=int,
                        help=ARG_HELP_STRINGS["publisher"])
    parser.add_argument("-journal_full_title",
                        "--journal_full_title_column",
                        type=int,
                        help=ARG_HELP_STRINGS["journal_full_title"])
    parser.add_argument("-issn",
                        "--issn_column",
                        type=int,
                        help=ARG_HELP_STRINGS["issn"])
    parser.add_argument("-url",
                        "--url_column",
                        type=int,
                        help=ARG_HELP_STRINGS["url"])
    # Optional row number window, evaluated in the aggregation loop below.
    parser.add_argument("-start", type=int, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"])

    args = parser.parse_args()
    enc = None  # CSV file encoding

    # Two handlers are attached to the root logger: one writing to stderr
    # directly and a BufferedErrorHandler wrapping it. The buffer of the
    # latter is inspected at the very end of this function to decide whether
    # the run is reported as successful.
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(ANSIColorFormatter())
    bufferedHandler = BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    # The locale matters for locale.atof() in the euro heuristic further
    # down, which parses monetary amounts according to the active locale.
    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            print "locale '{}' not found, normalized to '{}'".format(
                args.locale, norm)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            print "Using locale", loc
        except locale.Error as loce:
            print "Setting locale to " + norm + " failed: " + loce.message
            sys.exit()

    # A user-supplied encoding is only accepted if Python knows the codec.
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            print("Encoding '{}' found in Python's codec collection " +
                  "as '{}'").format(args.encoding, codec.name)
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    result = oat.analyze_csv_file(args.csv_file)
    if result["success"]:
        csv_analysis = result["data"]
        print csv_analysis
    else:
        print result["error_msg"]
        sys.exit()

    # Fall back to the detected encoding if none was given explicitly.
    if enc is None:
        enc = csv_analysis.enc
    dialect = csv_analysis.dialect
    has_header = csv_analysis.has_header or args.force_header

    if enc is None:
        print("Error: No encoding given for CSV file and automated " +
              "detection failed. Please set the encoding manually via the " +
              "--enc argument")
        sys.exit()

    doaj_offline_analysis = None
    if args.offline_doaj:
        if os.path.isfile(args.offline_doaj):
            doaj_offline_analysis = oat.DOAJOfflineAnalysis(args.offline_doaj)
        else:
            # NOTE(review): this error is not fatal - processing continues
            # with doaj_offline_analysis left at None.
            oat.print_r("Error: " + args.offline_doaj + " does not seem "
                        "to be a file!")

    # NOTE(review): the file is closed explicitly after the aggregation loop;
    # the sys.exit() paths in between leave it to interpreter shutdown.
    csv_file = open(args.csv_file, "r")
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    # The first row is only read to determine the number of columns.
    first_row = reader.next()
    num_columns = len(first_row)
    print "\nCSV file has {} columns.".format(num_columns)

    # Rewind and start over with a fresh reader for the actual analysis.
    csv_file.seek(0)
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    # Maps each known column type to a CSVColumn holding its requirement
    # level and its index in the input file (None = not identified yet).
    # The insertion order is also the column order of the generated header
    # row (see column_map.keys() in the aggregation loop).
    column_map = OrderedDict([
        ("institution",
         CSVColumn("institution", CSVColumn.MANDATORY,
                   args.institution_column)),
        ("period", CSVColumn("period", CSVColumn.MANDATORY,
                             args.period_column)),
        ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column)),
        ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column)),
        ("is_hybrid",
         CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column)),
        ("publisher",
         CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column)),
        ("journal_full_title",
         CSVColumn("journal_full_title", CSVColumn.OPTIONAL,
                   args.journal_full_title_column)),
        ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column)),
        ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None)),
        ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE,
                                      None)),
        ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None)),
        ("indexed_in_crossref",
         CSVColumn("indexed_in_crossref", CSVColumn.NONE, None)),
        ("pmid", CSVColumn("pmid", CSVColumn.NONE, None)),
        ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None)),
        ("ut", CSVColumn("ut", CSVColumn.NONE, None)),
        ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column)),
        ("doaj", CSVColumn("doaj", CSVColumn.NONE, None))
    ])

    # Do not quote the values in the 'period' and 'euro' columns
    # (the mask has 17 entries, one per initial column_map entry; presumably
    # it is applied positionally by the writer - confirm against
    # oat.OpenAPCUnicodeWriter)
    quotemask = [
        True,
        False,
        False,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
        True,
    ]

    # Phase 1: header analysis. Column names are looked up in a whitelist;
    # indexes already set via command line arguments are never overwritten.
    header = None
    if has_header:
        for row in reader:
            if not row:  # Skip empty lines
                continue
            header = row  # First non-empty row should be the header
            if args.ignore_header:
                print "Skipping header analysis due to command line argument."
                break
            else:
                print "\n    *** Analyzing CSV header ***\n"
            for (index, item) in enumerate(header):
                column_type = oat.get_column_type_from_whitelist(item)
                if column_type is not None and column_map[
                        column_type].index is None:
                    column_map[column_type].index = index
                    column_map[column_type].column_name = item
                    print("Found column named '{}' at index {}, " +
                          "assuming this to be the {} column.").format(
                              item, index, column_type)
            # Only the first non-empty row is treated as header.
            break

    # Phase 2: heuristics. The reader continues where the header analysis
    # stopped, so the next non-empty row is the first data row.
    print "\n    *** Starting heuristical analysis ***\n"
    for row in reader:
        if not row:  # Skip empty lines
            # We analyze the first non-empty line, a possible header should
            # have been processed by now.
            continue
        column_candidates = {"doi": [], "period": [], "euro": []}
        for (index, entry) in enumerate(row):
            if index in [csvcolumn.index for csvcolumn in column_map.values()]:
                # Skip columns already assigned
                continue
            entry = entry.strip()
            # Search for a DOI
            if column_map['doi'].index is None:
                if oat.DOI_RE.match(entry):
                    column_id = str(index)
                    # identify column either numerical or by column header
                    if header:
                        column_id += " ('" + header[index] + "')"
                    print("The entry in column {} looks like a " +
                          "DOI: {}").format(column_id, entry)
                    column_candidates['doi'].append(index)
                    continue
            # Search for a potential year string
            if column_map['period'].index is None:
                try:
                    maybe_period = int(entry)
                    now = datetime.date.today().year
                    # Should be a wide enough margin
                    if maybe_period >= 2000 and maybe_period <= now + 2:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print("The entry in column {} looks like a " +
                              "potential period: {}").format(column_id, entry)
                        column_candidates['period'].append(index)
                        continue
                except ValueError:
                    # Not an integer, so not a year - try the next heuristic
                    pass
            # Search for a potential monetary amount
            if column_map['euro'].index is None:
                try:
                    maybe_euro = locale.atof(entry)
                    # Are there APCs above 6000€ ??
                    if maybe_euro >= 10 and maybe_euro <= 6000:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print("The entry in column {} looks like a " +
                              "potential euro amount: {}").format(
                                  column_id, entry)
                        column_candidates['euro'].append(index)
                        continue
                except ValueError:
                    # Not parseable as a number in the active locale
                    pass
        # A column type is only assigned if exactly one candidate was found.
        for column_type, candidates in column_candidates.iteritems():
            if column_map[column_type].index is not None:
                continue
            if len(candidates) > 1:
                print("Could not reliably identify the '" + column_type +
                      "' column - more than one possible candiate!")
            elif len(candidates) < 1:
                print "No candidate found for column '" + column_type + "'!"
            else:
                index = candidates.pop()
                column_map[column_type].index = index
                if header:
                    column_id = header[index]
                    column_map[column_type].column_name = column_id
                else:
                    column_id = index
                print("Assuming column '{}' to be the '{}' " +
                      "column.").format(column_id, column_type)
                # NOTE(review): redundant, the index was already assigned a
                # few lines above.
                column_map[column_type].index = index
        # Only the first non-empty data row is analysed.
        break

    # Wrap up: Check if there any mandatory column types left which have not
    # yet been identified - we cannot continue in that case (unless forced).
    unassigned = filter(
        lambda
        (k, v): v.requirement == CSVColumn.MANDATORY and v.index is None,
        column_map.iteritems())
    if unassigned:
        for item in unassigned:
            print "The {} column is still unidentified.".format(item[0])
        if header:
            print "The CSV header is:\n" + dialect.delimiter.join(header)
        if not args.force:
            print("ERROR: We cannot continue because not all mandatory " +
                  "column types in the CSV file could be automatically " +
                  "identified. There are 2 ways to fix this:")
            if not header:
                print(
                    "1) Add a header row to your file and identify the " +
                    "column(s) by assigning them an appropiate column name.")
            else:
                print(
                    "1) Identify the missing column(s) by assigning them " +
                    "a different column name in the CSV header (You can " +
                    "use the column name(s) mentioned in the message above)")
            print("2) Use command line parameters when calling this script " +
                  "to identify the missing columns (use -h for help) ")
            sys.exit()
        else:
            print("WARNING: Not all mandatory column types in the CSV file " +
                  "could be automatically identified - forced to continue.")

    print "\n    *** CSV file analysis summary ***\n"

    # Reverse lookup: CSV column index -> CSVColumn. All still unidentified
    # columns share the key None, which is never queried below because the
    # loop only looks up integer indexes.
    index_dict = {csvc.index: csvc for csvc in column_map.values()}

    for index in range(num_columns):
        column_name = ""
        if header:
            column_name = header[index]
        if index in index_dict:
            column = index_dict[index]
            msg = "column number {} ({}) is the {} column '{}'".format(
                index, column_name, column.requirement, column.column_type)
            if column.requirement in [CSVColumn.MANDATORY, CSVColumn.OPTIONAL]:
                oat.print_g(msg)
            else:
                oat.print_b(msg)
        else:
            if args.add_unknown_columns:
                msg = (
                    "column number {} ({}) is an unknown column, it will be " +
                    "appended to the generated CSV file")
                oat.print_y(msg.format(index, column_name))
                if not column_name:
                    # Use a generic name
                    column_name = "unknown"
                # Make the name unique so it does not replace an existing
                # column_map entry.
                while column_name in column_map.keys():
                    # TODO: Replace by a numerical, increasing suffix
                    column_name += "_"
                column_map[column_name] = CSVColumn(column_name,
                                                    CSVColumn.NONE, index)
            else:
                msg = (
                    "column number {} ({}) is an unknown column, it will be " +
                    "ignored")
                oat.print_y(msg.format(index, column_name))

    print ""
    for column in column_map.values():
        if column.index is None:
            msg = "The {} column '{}' could not be identified."
            print msg.format(column.requirement, column.column_type)

    # Check for unassigned optional column types. We can continue but should
    # issue a warning as all entries will need a valid DOI in this case.
    unassigned = filter(
        lambda (k, v): v.requirement == CSVColumn.OPTIONAL and v.index is None,
        column_map.iteritems())
    if unassigned:
        print("\nWARNING: Not all optional column types could be " +
              "identified. Metadata aggregation is still possible, but " +
              "every entry in the CSV file will need a valid DOI.")

    # Interactive confirmation: loops until the user types exactly y or n.
    start = raw_input("\nStart metadata aggregation? (y/n):")
    while start not in ["y", "n"]:
        start = raw_input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    print "\n    *** Starting metadata aggregation ***\n"

    enriched_content = []

    # Third pass over the file, this time processing every data row.
    csv_file.seek(0)
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)
    header_processed = False
    # Counts every row the reader yields, including empty ones and the
    # header; the -start/-end window refers to this number.
    row_num = 0

    for row in reader:
        row_num += 1
        if not row:
            continue  # skip empty lines
        if not header_processed:
            header_processed = True
            # The output always starts with a header row built from the
            # column_map keys, whether or not the input file had one.
            enriched_content.append(column_map.keys())
            if has_header:
                # If the CSV file has a header, we are currently there - skip it
                # to get to the first data row
                continue
        # Rows outside the requested window are dropped from the output. A
        # value of 0 is falsy and therefore treated like "not set".
        if args.start and args.start > row_num:
            continue
        if args.end and args.end < row_num:
            continue
        print "---Processing line number " + str(row_num) + "---"
        enriched_row = oat.process_row(row, row_num, column_map, num_columns,
                                       doaj_offline_analysis,
                                       args.bypass_cert_verification)
        enriched_content.append(enriched_row)

    csv_file.close()

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, quotemask, True, True)
        writer.write_rows(enriched_content)

    # An empty buffer means the buffered handler did not collect any records
    # during the run.
    if not bufferedHandler.buffer:
        oat.print_g("Metadata enrichment successful, no errors occured")
    else:
        oat.print_r("There were errors during the enrichment process:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
Пример #33
0
def main():
    """
    Command line entry point: add ISSN-L values to an APC CSV file.

    An ISSN -> ISSN-L lookup table is built from 'issn_l_file', which is
    expected to contain one "<ISSN><TAB><ISSN-L>" pair per line (lines not
    matching that pattern are ignored). Afterwards every row of 'apc_file'
    is inspected: the values of columns 7, 8 and 9 (read as issn, issn_p
    and issn_e) are looked up in that order and the first hit is written to
    column 10. All rows, including empty ones, are written to 'out.csv' in
    the current working directory.

    The script terminates via sys.exit() if the quotemask contains other
    letters than 't' and 'f', the requested encoding is unknown, the CSV
    analysis fails or no encoding could be determined.

    All print calls use a single parenthesised argument so the function
    behaves the same under Python 2 and Python 3.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("apc_file", help=ARG_HELP_STRINGS["apc_file"])
    parser.add_argument("issn_l_file", help=ARG_HELP_STRINGS["issn_l_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q",
                        "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o",
                        "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true",
                        default=False)

    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # Translate a quotemask like "tff" into a list of booleans.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print("Error: A quotemask may only contain the letters 't' and" +
                  "'f'!")
            sys.exit()
        mask = [x == "t" for x in args.quotemask]

    enc = None

    # A user-supplied encoding is only accepted if Python knows the codec.
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            print(("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    result = oat.analyze_csv_file(args.apc_file, 500)
    if result["success"]:
        csv_analysis = result["data"]
        print(csv_analysis)
    else:
        print(result["error_msg"])
        sys.exit()

    # Fall back to the detected encoding if none was given explicitly.
    if enc is None:
        enc = csv_analysis.enc

    if enc is None:
        print("Error: No encoding given for CSV file and automated " +
              "detection failed. Please set the encoding manually via the " +
              "--enc argument")
        sys.exit()

    dialect = csv_analysis.dialect

    oat.print_g("Preparing mapping table...")
    itself = other = 0
    # Raw string: the regex engine interprets \d and \t itself, which avoids
    # invalid escape sequences in the Python literal.
    issn_l_re = re.compile(
        r"^(?P<issn>\d{4}-\d{3}[\dxX])\t(?P<issn_l>\d{4}-\d{3}[\dxX])$")
    issn_l_dict = {}
    # The context manager closes the mapping file as soon as the table is
    # complete.
    with open(args.issn_l_file, "r") as issn_l_file:
        for i, line in enumerate(issn_l_file):
            if i % 100000 == 0:
                print(str(i) + " lines processed.")
            match = issn_l_re.match(line)
            if match:
                match_dict = match.groupdict()
                issn_l_dict[match_dict['issn']] = match_dict['issn_l']
                if match_dict['issn'] == match_dict['issn_l']:
                    itself += 1
                else:
                    other += 1
    print(str(itself) + " ISSNs pointing to itself as ISSN-L, " + str(
        other) + " to another value.")
    oat.print_g("Starting enrichment...")

    issn_matches = issn_p_matches = issn_e_matches = unmatched = different = 0
    enriched_lines = []
    with open(args.apc_file, "r") as csv_file:
        reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)
        for line in reader:
            if len(line) == 0:
                # Empty rows are passed through unchanged.
                enriched_lines.append(line)
                continue
            # NOTE(review): the column positions are hard-coded, every
            # non-empty row needs at least 11 columns - confirm that the
            # input always follows this layout.
            issn = reformat_issn(line[7])
            issn_p = reformat_issn(line[8])
            issn_e = reformat_issn(line[9])
            target = None
            # First hit wins, in the order issn, issn_p, issn_e.
            if issn in issn_l_dict:
                target = issn_l_dict[issn]
                issn_matches += 1
            elif issn_p in issn_l_dict:
                target = issn_l_dict[issn_p]
                issn_p_matches += 1
            elif issn_e in issn_l_dict:
                target = issn_l_dict[issn_e]
                issn_e_matches += 1
            else:
                unmatched += 1
            if target is not None:
                line[10] = target
                if target not in [issn, issn_p, issn_e]:
                    different += 1
            enriched_lines.append(line)

    print("{} issn_l values mapped by issn, {} by issn_p, {} by issn_e. {} could not be assigned.\n In {} cases the ISSN-L was different from all existing ISSN values".format(
        issn_matches, issn_p_matches, issn_e_matches, unmatched, different))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(enriched_lines)
Пример #34
0
def integrate_changes(articles, file_path, enriched_file=False):
    '''
    Update existing entries in a previously created harvest file.

    Every line of the file which carries a DOI is compared against the
    harvested article with the same DOI. Differing values are overwritten,
    lines whose DOI does not occur in the harvest data any more are dropped
    and lines without a DOI are kept unchanged. The file is rewritten in
    place afterwards.

    Args:
        articles: A list of article dicts, as returned by
                  openapc_toolkit.oai_harvest(). Each dict needs a "doi" key.
        file_path: Path to the CSV file the new values should be integrated
                   into.
        enriched_file: If true, columns which are overwritten during
                       enrichment will not be updated.
    Returns:
        A tuple. The first element is a reduced list of article dicts,
        containing those which did not find a matching DOI in the file (Order
        preserved). Articles without a DOI value are not part of that list,
        and of several articles sharing a DOI only the last one is
        considered. The second element is the list of column headers
        encountered in the harvest file.
        If file_path is not an existing file, nothing is read or written and
        the tuple (articles, None) is returned, with 'articles' being the
        unmodified input object.
    '''
    if not os.path.isfile(file_path):
        return (articles, None)
    # Columns which must keep their values in an enriched file.
    enriched_blacklist = ["institution", "publisher", "journal_full_title", "issn", "license_ref", "pmid"]
    # DOI -> article lookup. Entries are removed as soon as they have been
    # matched, so only unmatched articles are left over at the end.
    article_dict = OrderedDict()
    for article in articles:
        doi = article["doi"]
        if oat.has_value(doi):
            article_dict[doi] = article
    updated_lines = []
    fieldnames = None
    with open(file_path, "r") as f:
        reader = DictReader(f)
        fieldnames = reader.fieldnames
        updated_lines.append(list(fieldnames)) #header
        start_msg = "Integrating changes in harvest data into existing file {}"
        oat.print_g(start_msg.format(file_path))
        for line in reader:
            doi = line["doi"]
            line_num = reader.reader.line_num
            if not oat.has_value(doi):
                msg = "Line {}: No DOI found, change check not possible"
                oat.print_y(msg.format(line_num))
            else:
                msg = "Line {}: Checking for changes ({})"
                oat.print_b(msg.format(line_num, doi))
                if doi not in article_dict:
                    # Not appending the line removes the article from the
                    # rewritten file.
                    remove_msg = "DOI {} no longer found in harvest data, removing article"
                    oat.print_r(remove_msg.format(doi))
                    continue
                for key, value in article_dict[doi].items():
                    if enriched_file and key in enriched_blacklist:
                        continue
                    # Only columns already present in the file are updated.
                    if key in line and value != line[key]:
                        update_msg = 'Updating value in column {} ("{}" -> "{}")'
                        oat.print_g(update_msg.format(key, line[key], value))
                        line[key] = value
                del article_dict[doi]
            # Serialise the (possibly updated) line in header order.
            updated_lines.append([line[key] for key in fieldnames])
    with open(file_path, "w") as f:
        mask = oat.OPENAPC_STANDARD_QUOTEMASK if enriched_file else None
        writer = oat.OpenAPCUnicodeWriter(f, quotemask=mask, openapc_quote_rules=True, has_header=True)
        writer.write_rows(updated_lines)
    # Materialise the remaining articles: under Python 3 values() is only a
    # view, whereas callers are promised a list.
    return (list(article_dict.values()), fieldnames)
Пример #35
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("original_file",
                        help=ARG_HELP_STRINGS["original_file"])
    parser.add_argument("update_file", help=ARG_HELP_STRINGS["update_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-eu",
                        "--update_encoding",
                        help=ARG_HELP_STRINGS["update_encoding"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-lu",
                        "--update_locale",
                        help=ARG_HELP_STRINGS["update_locale"])
    parser.add_argument("-a",
                        "--autocreate_mappings",
                        action="store_true",
                        help=ARG_HELP_STRINGS["autocreate_mappings"])
    parser.add_argument("-g",
                        "--grouping",
                        action="store_true",
                        help=ARG_HELP_STRINGS["grouping"])

    args = parser.parse_args()

    params = {
        "original": {
            "file": args.original_file,
            "encoding": args.encoding,
            "locale": args.locale,
            "csv_analysis": None,
            "fieldnames": None,
            "doi_field": None,
            "euro_field": None,
            "mappings": []
        },
        "update": {
            "file": args.update_file,
            "encoding": args.update_encoding,
            "locale": args.update_locale,
            "csv_analysis": None,
            "fieldnames": None,
            "doi_field": None,
            "euro_field": None,
            "mappings": []
        }
    }

    def field_mapped(file_type, field_name):
        if field_name == params[file_type]["euro_field"]:
            return True
        if field_name == params[file_type]["doi_field"]:
            return True
        if field_name in params[file_type]["mappings"]:
            return True
        return False

    for file_type in params.keys():
        msg = "*** Performing analysis for {} file ***"
        oat.print_b(msg.format(file_type))
        encoding = params[file_type]["encoding"]
        if encoding is not None:
            try:
                codec = codecs.lookup(encoding)
                msg = "Encoding '{}' found in Python's codec collection as '{}'"
                print(msg.format(encoding, codec.name))
                params[file_type]["encoding"] = encoding
            except LookupError:
                print(
                    "Error: '" + encoding + "' not found Python's " +
                    "codec collection. Either look for a valid name here " +
                    "(https://docs.python.org/2/library/codecs.html#standard-"
                    + "encodings) or omit this argument to enable automated " +
                    "guessing.")
                sys.exit()

        loc = params[file_type]["locale"]
        if loc is not None:
            norm = locale.normalize(loc)
            if norm != loc:
                msg = "locale '{}' not found, normalised to '{}'".format(
                    loc, norm)
                oat.print_y(msg)
                params[file_type]["locale"] = norm

        csv_analysis = oat.analyze_csv_file(params[file_type]["file"],
                                            enc=params[file_type]["encoding"])
        if not csv_analysis["success"]:
            oat.print_r(csv_analysis["error_msg"])
            sys.exit()
        params[file_type]["csv_analysis"] = csv_analysis["data"]
        print(params[file_type]["csv_analysis"])

        if params[file_type]["encoding"] is None:
            guessed_enc = params[file_type]["csv_analysis"].enc
            params[file_type]["encoding"] = guessed_enc

        locale_name = "default locale"
        if params[file_type]["locale"] is not None:
            locale_name = "locale " + params[file_type]["locale"]
        msg = "{} file will be opened with encoding {} and {}"
        oat.print_g(
            msg.format(file_type, params[file_type]["encoding"], locale_name))

        with open(params[file_type]["file"],
                  "r",
                  encoding=params[file_type]["encoding"]) as f:
            reader = csv.DictReader(
                f, dialect=params[file_type]["csv_analysis"].dialect)
            params[file_type]["fieldnames"] = list(reader.fieldnames)
            for index, name in enumerate(params[file_type]["fieldnames"]):
                field_type = oat.get_column_type_from_whitelist(name)
                found = False
                if field_type == "doi":
                    params[file_type]["doi_field"] = name
                    found = True
                elif field_type == "euro":
                    params[file_type]["euro_field"] = name
                    found = True
                if found:
                    msg = '{} file: Found {} column at index {} ("{}")'
                    msg = msg.format(file_type, field_type, index, name)
                    oat.print_b(msg)
            for field_type in ["doi_field", "euro_field"]:
                if params[file_type][field_type] is None:
                    msg = "Error: No {} found in {} file"
                    oat.print_r(msg.format(field_type, file_type))
                    sys.exit()

    for orig_index, orig_field in enumerate(params["original"]["fieldnames"]):
        if field_mapped("original", orig_field):
            continue
        norm_orig_field = orig_field.lower().strip()
        for update_index, update_field in enumerate(
                params["update"]["fieldnames"]):
            if field_mapped("update", update_field):
                continue
            norm_update_field = update_field.lower().strip()
            if norm_orig_field == norm_update_field:
                if args.autocreate_mappings:
                    params["original"]["mappings"].append(orig_field)
                    params["update"]["mappings"].append(update_field)
                    msg = 'Auto-created mapping "{}" (update file, index {}) -> "{}" (original file, index {})'
                    oat.print_b(
                        msg.format(update_field, update_index, orig_field,
                                   orig_index))
                else:
                    msg = 'Possible mapping found: "{}" (update file, index {}) -> "{}" (original file, index {}). Create mapping (y/n)?'
                    msg = msg.format(update_field, update_index, orig_field,
                                     orig_index)
                    create = input(msg)
                    while create not in ["y", "n"]:
                        create = input("Please type 'y' or 'n':")
                    if create == "y":
                        params["original"]["mappings"].append(orig_field)
                        params["update"]["mappings"].append(update_field)

    update_mappings = {}
    with open(params["update"]["file"],
              "r",
              encoding=params["update"]["encoding"]) as f:
        doi_field = params["update"]["doi_field"]
        euro_field = params["update"]["euro_field"]
        reader = csv.DictReader(
            f, dialect=params["update"]["csv_analysis"].dialect)
        locale.setlocale(locale.LC_ALL, params["update"]["locale"])
        for line in reader:
            doi = oat.get_normalised_DOI(line[doi_field])
            if doi is None:
                msg = 'Warning: Empty or invalid DOI in update file (line {}): "{}"'
                oat.print_y(msg.format(reader.line_num, line[doi_field]))
                continue
            if doi in update_mappings:
                msg = "Error: Duplicate doi in update file ({})".format(
                    line[doi_field])
                oat.print_r(msg)
                sys.exit()
            update_mappings[doi] = {}
            euro_value = locale.atof(line[euro_field])
            orig_euro_field = params["original"]["euro_field"]
            update_mappings[doi][orig_euro_field] = euro_value
            for index, update_field_name in enumerate(
                    params["update"]["mappings"]):
                orig_field_name = params["original"]["mappings"][index]
                update_mappings[doi][orig_field_name] = line[update_field_name]

    #print(json.dumps(update_mappings, sort_keys=False, indent=4))

    modified_content = []
    fieldnames = None
    with open(params["original"]["file"],
              "r",
              encoding=params["original"]["encoding"]) as f:
        doi_field = params["original"]["doi_field"]
        euro_field = params["original"]["euro_field"]
        reader = csv.DictReader(
            f, dialect=params["original"]["csv_analysis"].dialect)
        fieldnames = list(reader.fieldnames)
        locale.setlocale(locale.LC_ALL, params["original"]["locale"])
        for line in reader:
            doi = oat.get_normalised_DOI(line[doi_field])
            if doi not in update_mappings:
                msg = "line {}: DOI {} not found in update file!"
                oat.print_r(msg.format(reader.line_num, doi))
                continue
            changes = []
            old_euro_value = locale.atof(line[euro_field])
            new_euro_value = update_mappings[doi][euro_field]
            if old_euro_value != new_euro_value:
                changes.append(
                    Change(euro_field,
                           old_euro_value,
                           new_euro_value,
                           monetary=True))
            for field in update_mappings[doi].keys():
                if field == euro_field:
                    continue
                if line[field] != update_mappings[doi][field]:
                    changes.append(
                        Change(field, line[field],
                               update_mappings[doi][field]))
            if not changes:
                msg = "line {}: DOI {} found in update file, but nothing changed."
                oat.print_g(msg.format(reader.line_num, doi))
            else:
                msg = "line {}: DOI {} found in update file with the following updates:"
                oat.print_y(msg.format(reader.line_num, doi))
                for change in changes:
                    oat.print_y(str(change))
                    if change.monetary:
                        line[change.field_name] = locale.currency(
                            change.new_value,
                            symbol=False,
                            grouping=args.grouping)
                    else:
                        line[change.field_name] = change.new_value
            del (update_mappings[doi])
            modified_content.append(line)
        if update_mappings:
            oat.print_y(
                "{} entries in update file not contained in original file:".
                format(len(update_mappings)))
        for doi, changes in update_mappings.items():
            oat.print_y(doi)
            new_line = changes
            new_line[params["original"]["doi_field"]] = doi
            formatted_euro = locale.currency(
                new_line[params["original"]["euro_field"]],
                symbol=False,
                grouping=args.grouping)
            new_line[params["original"]["euro_field"]] = formatted_euro
            modified_content.append(new_line)

    with open("out.csv", "w", encoding=params["original"]["encoding"]) as out:
        writer = csv.DictWriter(
            out,
            fieldnames,
            dialect=params["original"]["csv_analysis"].dialect)
        writer.writeheader()
        for line in modified_content:
            writer.writerow(line)
Пример #36
0
def find_significant_apc_differences(apc_content, institution, verbose=False):
    """
    Find an institution's articles whose APC is far from the journal mean.

    Rows are read by position: index 0 is compared with *institution*,
    index 2 is converted to float as the APC, index 3 is reported as the DOI
    and index 6 is used as the journal title.

    For every journal the institution has published in, the APCs of all rows
    for that journal (from any institution) are aggregated. An article is
    flagged when its journal has at least 20 rows and its APC differs from
    the journal mean by more than two standard deviations.

    Flagged rows are extended IN PLACE with four values: rounded mean,
    rounded standard deviation, difference to the rounded mean and the
    number of rows for the journal.

    Returns a tuple (flagged rows, stats dict) where the stats dict has the
    keys "articles", "not_checked", "within_limits" and "significant".
    With verbose=True a message is printed per article plus a summary.
    """
    journals = {}
    own_articles = []
    # Pass 1: the institution's own articles and the journals they appeared in.
    for row in apc_content:
        if row[0] == institution:
            journals.setdefault(row[6], {"lines": []})
            own_articles.append(row)
    # Pass 2: every row published in one of those journals.
    for row in apc_content:
        journal = journals.get(row[6])
        if journal is not None:
            journal["lines"].append(row)
    for journal in journals.values():
        costs = [float(row[2]) for row in journal["lines"]]
        journal["count"] = len(costs)
        journal["stddev"] = stddev(costs)
        journal["mean"] = mean(costs)

    stats = {
        "articles": len(own_articles),
        "not_checked": 0,
        "within_limits": 0,
        "significant": 0
    }
    sig_articles = []
    for article in own_articles:
        apc, doi, title = article[2], article[3], article[6]
        journal = journals[title]
        occurrences = journal["count"]
        if occurrences < 20:
            if verbose:
                oat.print_b(
                    'Article {}, journal "{}": Could not check costs, too few occurences ({})'
                    .format(doi, title, occurrences))
            stats["not_checked"] += 1
            continue

        journal_mean = journal["mean"]
        journal_stddev = journal["stddev"]
        deviates = abs(float(apc) - journal_mean) > 2 * journal_stddev
        if not deviates:
            if verbose:
                template = ('Article {}, journal "{}": No significant cost difference ({}€, mean '
                            'APC is {}€)')
                oat.print_g(template.format(doi, title, apc, round(journal_mean, 2)))
            stats["within_limits"] += 1
            continue

        rounded_mean = round(journal_mean, 2)
        rounded_stddev = round(journal_stddev, 2)
        diff = round(float(apc) - rounded_mean, 2)
        if verbose:
            template = ('Article {}, journal "{}": Cost ({}€) differs more than 2 standard '
                        'deviations (2 * {}€) from mean APC ({}€)')
            oat.print_y(template.format(doi, title, apc, rounded_stddev, rounded_mean))
        stats["significant"] += 1
        # The row itself is extended, so the caller's data gains the extra columns.
        article.extend([rounded_mean, rounded_stddev, diff, occurrences])
        sig_articles.append(article)

    if verbose:
        oat.print_g("\nAnalysis finished, results:")
        for name, number in stats.items():
            oat.print_g(name + ": " + str(number))
    return sig_articles, stats
def main():
    """
    Check that hybrid Elsevier articles offer a PDF link on sciencedirect.

    Command line arguments: the CSV file to check, an optional encoding and
    an optional 1-based line range (-start / -end). The rows returned as
    header by oat.get_csv_file_content are counted as well.

    For every row in range whose publisher column (index 5) is "Elsevier" and
    whose hybrid column (index 4) is "TRUE", the DOI (index 3) is resolved
    via doi.org. If the landing page is on sciencedirect.com it is searched
    for a PDF link; a missing link is logged as an error. HTTP and URL errors
    are printed and the run continues with the next row.

    Exits via sys.exit() if the given encoding is unknown to Python.
    """
    # Local import: only this entry point needs it.
    from contextlib import closing

    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-start", type=int, default=1, help=ARG_HELP_STRINGS["start"])
    # NOTE(review): "-end" reuses the "start" help text - confirm whether
    # ARG_HELP_STRINGS has a dedicated entry before changing it.
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["start"])
    args = parser.parse_args()

    # Errors are shown immediately (handler) and additionally collected
    # (bufferedHandler) so they can be repeated once the run is over.
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()
        msg = ("Encoding '{}' found in Python's codec collection " +
               "as '{}'").format(args.encoding, codec.name)
        oat.print_g(msg)
        enc = args.encoding

    head, content = oat.get_csv_file_content(args.csv_file, enc)
    content = head + content

    header = {"User-Agent": "Mozilla/5.0 Firefox/45.0"}

    for line_num, line in enumerate(content, 1):
        if args.start and args.start > line_num:
            continue
        if args.end and args.end < line_num:
            # Line numbers only grow, so no later row can be in range.
            break
        institution = line[0]
        period = line[1]
        doi = line[3]
        is_hybrid = line[4]
        publisher = line[5]
        journal = line[6]
        if publisher != "Elsevier" or is_hybrid != "TRUE":
            continue
        init_msg = (u"Line {}: Checking {} article from {}, published in " +
                    "{}...").format(line_num, institution, period, journal)
        oat.print_b(init_msg)
        url = 'http://doi.org/' + doi
        req = urllib2.Request(url, None, header)
        try:
            # closing() releases the connection on every path out of the
            # block, including the early "continue" and read errors.
            with closing(urllib2.urlopen(req)) as response:
                target = response.geturl()
                resolve_msg = u"DOI {} resolved, led us to {}".format(doi, target)
                if "sciencedirect.com" not in target:
                    oat.print_y(resolve_msg)
                    oat.print_y("Journal not located at sciencedirect, skipping...")
                    continue
                oat.print_b(resolve_msg)
                content_string = response.read()
            single_match = pdflink_re.search(content_string)
            if single_match:
                link_url = single_match.groups()[0]
                oat.print_g(u"PDF link found: " + link_url)
            else:
                multi_match = pdflink_multi_re.search(content_string)
                if multi_match:
                    link_url = multi_match.groups()[0]
                    link_url = link_url.replace("&amp;", "&")
                    oat.print_g(u"PDF link found (more than one document): " + link_url)
                else:
                    error_msg = (u"No PDF link found! (line {}, DOI: {}, " +
                                 "landing page: {})").format(line_num, doi, target)
                    logging.error(error_msg)
            # Pause between landing page requests.
            time.sleep(1)
        except urllib2.HTTPError as httpe:
            code = str(httpe.getcode())
            oat.print_r("HTTPError: {} - {}".format(code, httpe.reason))
        except urllib2.URLError as urle:
            oat.print_r("URLError: {}".format(urle.reason))

    if not bufferedHandler.buffer:
        oat.print_g("\nLookup finished, all articles were accessible on sciencedirect")
    else:
        oat.print_r("\nLookup finished, not all articles could be accessed on sciencedirect:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
Пример #38
0
def main():
    """
    Rearrange the rows of a CSV file and write the result to out.csv.

    Two modes, selected by the command line arguments:

    * Only csv_file and column given: the content rows are sorted by the
      values in that column.
    * other_csv_file given as well: the rows are ordered like the rows of
      the second file, matching csv_file[column] against
      other_csv_file[other_column] (other_column defaults to column,
      --ignore_case makes the comparison case-insensitive). Rows without a
      match are appended at the end in their original order.

    The header of csv_file is always written first. Exits via sys.exit() on
    an unknown encoding or an invalid quotemask.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("column", type=int, help=ARG_HELP_STRINGS["column"])
    parser.add_argument("other_csv_file",
                        nargs="?",
                        help=ARG_HELP_STRINGS["other_csv_file"])
    parser.add_argument("other_column",
                        type=int,
                        nargs="?",
                        help=ARG_HELP_STRINGS["other_column"])
    parser.add_argument("-e2",
                        "--other_encoding",
                        help=ARG_HELP_STRINGS["other_encoding"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-i",
                        "--ignore_case",
                        action="store_true",
                        default=False,
                        help=ARG_HELP_STRINGS["ignore_case"])
    parser.add_argument("-q",
                        "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o",
                        "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true",
                        default=False)

    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    encs = []  # CSV file encodings, index 0: csv_file, index 1: other_csv_file

    for encoding in [args.encoding, args.other_encoding]:
        if encoding:
            try:
                codec = codecs.lookup(encoding)
                msg = "Encoding '{}' found in Python's codec collection as '{}'"
                print(msg.format(encoding, codec.name))
            except LookupError:
                print(
                    "Error: '" + encoding + "' not found Python's " +
                    "codec collection. Either look for a valid name here " +
                    "(https://docs.python.org/2/library/codecs.html#standard-"
                    + "encodings) or omit this argument to enable automated " +
                    "guessing.")
                sys.exit()
        # Appended even when None so that the positions stay fixed.
        encs.append(encoding)

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if reduced:
            print(
                "Error: A quotemask may only contain the letters 't' and 'f'!")
            sys.exit()
        mask = [x == "t" for x in args.quotemask]

    header, content = oat.get_csv_file_content(args.csv_file, enc=encs[0])
    column = args.column

    if not args.other_csv_file:
        rearranged_content = header + sorted(content, key=lambda x: x[column])
    else:
        rearranged_content = []
        _, second_content = oat.get_csv_file_content(args.other_csv_file,
                                                     enc=encs[1])
        # Default: use the same column index as in the first file. Test for
        # None explicitly - 0 is a valid column index but falsy.
        other_column = column
        if args.other_column is not None:
            other_column = args.other_column

        def normalise(value):
            return value.lower() if args.ignore_case else value

        for other_row in second_content:
            if not content:
                # Every row has been placed already, nothing left to match.
                break
            wanted = normalise(other_row[other_column])
            # Split the remaining rows in one pass instead of removing the
            # matches from the list one by one.
            matching_rows = []
            unmatched_rows = []
            for row in content:
                if normalise(row[column]) == wanted:
                    matching_rows.append(row)
                else:
                    unmatched_rows.append(row)
            rearranged_content += matching_rows
            content = unmatched_rows
        unmatched_msg = (
            "{} rows could not be rearranged (unmatched in second csv file) " +
            "and were appended to the end of the result file " +
            "in original order.")
        if content:
            oat.print_y(unmatched_msg.format(len(content)))
        else:
            oat.print_g("All rows matched.")
        rearranged_content = header + rearranged_content + content  # append any unmatched rows

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(rearranged_content)
Пример #39
0
def main():
    """
    Look up journals from the OpenAPC core data file in JournalTOCs.

    Journals already present in JOURNALTOC_RESULTS_FILE are skipped. For the
    remaining ones the ISSNs found in APC_DE_FILE are looked up one after
    another until JournalTOCs knows one of them. At most BATCH_SIZE journals
    are processed per run, with a 2 second pause between journals.
    Afterwards the results file is rewritten with old and new entries.
    """
    # Results of earlier runs; the first row seen for a title wins.
    analysed_journals = {}
    if os.path.isfile(JOURNALTOC_RESULTS_FILE):
        with open(JOURNALTOC_RESULTS_FILE) as results:
            for row in DictReader(results):
                analysed_journals.setdefault(row["journal_full_title"], row)

    # Journals still to be analysed, with all well-formed ISSNs seen for them.
    remaining_journals = {}
    with open(APC_DE_FILE) as apc_de:
        for row in DictReader(apc_de):
            title = row["journal_full_title"]
            if title in analysed_journals:
                continue
            journal = remaining_journals.get(title)
            if journal is None:
                journal = {
                    "journal_full_title": title,
                    "publisher": row["publisher"],
                    "is_hybrid": row["is_hybrid"],
                    "issns": []
                }
                remaining_journals[title] = journal
            known_issns = journal["issns"]
            for issn_type in ISSN_TYPES:
                issn = row[issn_type]
                if issn in known_issns:
                    continue
                if oat.is_wellformed_ISSN(issn):
                    known_issns.append(issn)
            # A journal occurring with differing TRUE/FALSE hybrid values is
            # marked as FLIPPED.
            hybrid_status = row["is_hybrid"]
            if hybrid_status in ("TRUE", "FALSE") and hybrid_status != journal["is_hybrid"]:
                journal["is_hybrid"] = "FLIPPED"

    total = len(remaining_journals) + len(analysed_journals)
    msg = "{} unique journals found in OpenAPC core data file, {} already analysed, {} remaining."
    oat.print_g(msg.format(total, len(analysed_journals), len(remaining_journals)))

    for count, (title, fields) in enumerate(remaining_journals.items(), 1):
        entry = dict.fromkeys(RESULTS_FILE_FIELDNAMES)
        entry["journal_full_title"] = title
        entry["publisher"] = fields["publisher"]
        entry["is_hybrid"] = fields["is_hybrid"]
        entry["issns"] = "|".join(fields["issns"])
        oat.print_b(
            'Analysing journal "{}" ({}), OpenAPC hybrid status is {}...'.format(
                entry["journal_full_title"], entry["issns"], entry["is_hybrid"]))
        for issn in fields["issns"]:
            oat.print_y("Looking up ISSN " + issn + "...")
            jtoc_metadata = get_jtoc_metadata(issn)
            if jtoc_metadata["jtoc_id"] is None:
                continue
            entry["in_jtoc"] = "TRUE"
            entry["jtoc_publisher"] = jtoc_metadata["jtoc_publisher"]
            entry["jtoc_title"] = jtoc_metadata["jtoc_title"]
            entry["jtoc_type"] = get_jtoc_journal_type(jtoc_metadata["jtoc_id"])
            oat.print_g(
                'Journal found ("{}"), JournalTOCs type is {}'.format(
                    entry["jtoc_title"], entry["jtoc_type"]))
            # The first ISSN known to JournalTOCs is sufficient.
            break
        else:
            # Reached only when no ISSN led to a hit (loop ended without break).
            oat.print_r("None of the associated ISSNS found in JTOCs!")
        analysed_journals[title] = entry
        if count >= BATCH_SIZE:
            break
        sleep(2)

    with open(JOURNALTOC_RESULTS_FILE, "w") as res_file:
        writer = DictWriter(res_file, fieldnames=RESULTS_FILE_FIELDNAMES)
        writer.writeheader()
        for entry in analysed_journals.values():
            writer.writerow(entry)