Example #1
def check_apc_field_content(row_object):
    __tracebackhide__ = True
    row = row_object.row
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    if not oat.has_value(row['journal_full_title']):
        fail(line_str + 'the column "journal_full_title" must not be empty')
    if len(row['journal_full_title']) != len(row['journal_full_title'].strip()):
        fail(line_str + 'journal title (' + row['journal_full_title'] + ') has leading or trailing whitespaces')
    if not oat.has_value(row['issn']):
        fail(line_str + 'the column "issn" must not be empty')
    if row['doaj'] not in ["TRUE", "FALSE"]:
        fail(line_str + 'value in row "doaj" must either be TRUE or FALSE')
    if row['is_hybrid'] not in ["TRUE", "FALSE"]:
        fail(line_str + 'value in row "is_hybrid" must either be TRUE or FALSE')
    
    if row_object.origin == "ta":
        if not oat.has_value(row['agreement']):
            fail(line_str + 'the column "agreement" must not be empty')
    if not row_object.origin == "ta":
        try:
            euro = float(row['euro'])
            if euro <= 0:
                fail(line_str + 'value in row "euro" (' + row['euro'] + ') must be larger than 0')
        except ValueError:
            fail(line_str + 'value in row "euro" (' + row['euro'] + ') is no valid number')
Example #2
def check_name_consistency(row_object):
    __tracebackhide__ = True
    row = row_object.row
    issn = row["issn"] if oat.has_value(row["issn"]) else None
    issn_p = row["issn_print"] if oat.has_value(row["issn_print"]) else None
    issn_e = row["issn_electronic"] if oat.has_value(row["issn_electronic"]) else None
    hybrid_status_changed = len({issn, issn_p, issn_e}.intersection(JOURNAL_HYBRID_STATUS_CHANGED)) > 0
    journal = row["journal_full_title"]
    publ = row["publisher"]
    hybrid = row["is_hybrid"]
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    msg = (line_str + 'Two entries share a common {}ISSN ({}), but the ' +
           '{} differs ("{}" vs "{}")')
    if issn is not None:
        same_issn_rows = issn_dict[issn]
        for other_row in same_issn_rows:
            other_publ = other_row["publisher"]
            other_journal = other_row["journal_full_title"]
            other_hybrid = other_row["is_hybrid"]
            if other_publ != publ and not in_whitelist(issn, publ, other_publ):
                ret = msg.format("", issn, "publisher name", publ, other_publ)
                fail(ret)
            if other_journal != journal:
                ret = msg.format("", issn, "journal title", journal, other_journal)
                fail(ret)
            if other_hybrid != hybrid and not hybrid_status_changed:
                ret = msg.format("", issn, "hybrid status", hybrid, other_hybrid)
                fail(ret)
    if issn_p is not None:
        same_issn_p_rows = issn_p_dict[issn_p]
        for other_row in same_issn_p_rows:
            other_publ = other_row["publisher"]
            other_journal = other_row["journal_full_title"]
            other_hybrid = other_row["is_hybrid"]
            if other_publ != publ and not in_whitelist(issn_p, publ, other_publ):
                ret = msg.format("Print ", issn_p, "publisher name", publ, other_publ)
                fail(ret)
            if other_journal != journal:
                ret = msg.format("Print ", issn_p, "journal title", journal, other_journal)
                fail(ret)
            if other_hybrid != hybrid and not hybrid_status_changed:
                ret = msg.format("Print ", issn_p, "hybrid status", hybrid, other_hybrid)
                fail(ret)
    if issn_e is not None:
        same_issn_e_rows = issn_e_dict[issn_e]
        for other_row in same_issn_e_rows:
            other_publ = other_row["publisher"]
            other_journal = other_row["journal_full_title"]
            other_hybrid = other_row["is_hybrid"]
            if other_publ != publ and not in_whitelist(issn_e, publ, other_publ):
                ret = msg.format("Electronic ", issn_e, "publisher name", publ, other_publ)
                fail(ret)
            if other_journal != journal:
                ret = msg.format("Electronic ", issn_e, "journal title", journal, other_journal)
                fail(ret)
            if other_hybrid != hybrid and not hybrid_status_changed:
                ret = msg.format("Electronic ", issn_e, "hybrid status", hybrid, other_hybrid)
                fail(ret)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("journaltocs_user",
                        help=ARG_HELP_STRINGS["journaltocs_user"])
    parser.add_argument("-i",
                        "--integrate",
                        action="store_true",
                        help=ARG_HELP_STRINGS["integrate"])
    parser.add_argument("-m",
                        "--max_lookups",
                        type=int,
                        default=100,
                        help=ARG_HELP_STRINGS["max_lookups"])
    args = parser.parse_args()

    analysed_journals = {}

    modified_content = []

    lookups = 0
    header, content = oat.get_csv_file_content(args.source_file,
                                               enc="utf-8",
                                               force_header=True)
    header_line = header[0]
    modified_content = [list(header_line)]
    for line in content:
        if not oat.has_value(line[6]):  #journal_full_title
            modified_content.append(line)
            continue
        if not oat.has_value(line[4]):  #is_hybrid
            title = line[6]
            oat.print_y('Looking up journal {}'.format(title))
            if title not in analysed_journals:
                if lookups < args.max_lookups:
                    hybrid_status = get_hybrid_status(line,
                                                      args.journaltocs_user)
                    if hybrid_status is not None:
                        analysed_journals[title] = hybrid_status
                    else:
                        analysed_journals[title] = "NA"
                    lookups += 1
                    line[4] = analysed_journals[title]
                else:
                    oat.print_r("Maximum number of lookups reached!")
            else:
                line[4] = analysed_journals[title]
        modified_content.append(line)

    with open("out.csv", "w") as out:
        if args.integrate:
            writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
            writer.write_rows(modified_content)
        else:
            out.write("journal_full_title,is_hybrid\n")
            for key, value in analysed_journals.items():
                out.write(key + "," + value + "\n")
Example #4
def check_name_consistency(row_object):
    __tracebackhide__ = True
    row = row_object.row
    issn = row["issn"] if oat.has_value(row["issn"]) else None
    issn_p = row["issn_print"] if oat.has_value(row["issn_print"]) else None
    issn_e = row["issn_electronic"] if oat.has_value(row["issn_electronic"]) else None
    journal = row["journal_full_title"]
    publ = row["publisher"]
    hybrid = row["is_hybrid"]
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    msg = (line_str + 'Two entries share a common {}ISSN ({}), but the ' +
           '{} differs ("{}" vs "{}")')
    if issn is not None:
        same_issn_rows = issn_dict[issn]
        for other_row in same_issn_rows:
            other_publ = other_row["publisher"]
            other_journal = other_row["journal_full_title"]
            other_hybrid = other_row["is_hybrid"]
            if other_publ != publ and not in_whitelist(issn, publ, other_publ):
                ret = msg.format("", issn, "publisher name", publ, other_publ)
                pytest.fail(ret)
            if other_journal != journal:
                ret = msg.format("", issn, "journal title", journal, other_journal)
                pytest.fail(ret)
            if other_hybrid != hybrid and issn not in JOURNAL_HYBRID_STATUS_CHANGED:
                ret = msg.format("", issn, "hybrid status", hybrid, other_hybrid)
                pytest.fail(ret)
    if issn_p is not None:
        same_issn_p_rows = issn_p_dict[issn_p]
        for other_row in same_issn_p_rows:
            other_publ = other_row["publisher"]
            other_journal = other_row["journal_full_title"]
            other_hybrid = other_row["is_hybrid"]
            if other_publ != publ and not in_whitelist(issn_p, publ, other_publ):
                ret = msg.format("Print ", issn_p, "publisher name", publ, other_publ)
                pytest.fail(ret)
            if other_journal != journal:
                ret = msg.format("Print ", issn_p, "journal title", journal, other_journal)
                pytest.fail(ret)
            if other_hybrid != hybrid and issn_p not in JOURNAL_HYBRID_STATUS_CHANGED:
                ret = msg.format("Print ", issn_p, "hybrid status", hybrid, other_hybrid)
                pytest.fail(ret)
    if issn_e is not None:
        same_issn_e_rows = issn_e_dict[issn_e]
        for other_row in same_issn_e_rows:
            other_publ = other_row["publisher"]
            other_journal = other_row["journal_full_title"]
            other_hybrid = other_row["is_hybrid"]
            if other_publ != publ and not in_whitelist(issn_e, publ, other_publ):
                ret = msg.format("Electronic ", issn_e, "publisher name", publ, other_publ)
                pytest.fail(ret)
            if other_journal != journal:
                ret = msg.format("Electronic ", issn_e, "journal title", journal, other_journal)
                pytest.fail(ret)
            if other_hybrid != hybrid and issn_e not in JOURNAL_HYBRID_STATUS_CHANGED:
                ret = msg.format("Electronic ", issn_e, "hybrid status", hybrid, other_hybrid)
                pytest.fail(ret)
Example #5
def check_field_content(row_object):
    __tracebackhide__ = True
    row = row_object.row
    line_str = '{}, line {}: '.format(row_object.file_name,
                                      row_object.line_number)
    if not oat.has_value(row['publisher']):
        fail(line_str + 'the column "publisher" must not be empty')
    if not oat.has_value(row['journal_full_title']):
        fail(line_str + 'the column "journal_full_title" must not be empty')
    if not oat.has_value(row['issn']):
        fail(line_str + 'the column "issn" must not be empty')
    if row['doaj'] not in ["TRUE", "FALSE"]:
        fail(line_str + 'value in row "doaj" must either be TRUE or FALSE')
    if row['indexed_in_crossref'] not in ["TRUE", "FALSE"]:
        fail(line_str +
             'value in row "indexed_in_crossref" must either be TRUE or FALSE')
    if row['is_hybrid'] not in ["TRUE", "FALSE"]:
        fail(line_str +
             'value in row "is_hybrid" must either be TRUE or FALSE')
    if not row['doi'] == "NA":
        doi_norm = oat.get_normalised_DOI(row['doi'])
        if doi_norm is None:
            fail(
                line_str +
                'value in row "doi" must either be NA or represent a valid DOI'
            )
        elif doi_norm != row['doi']:
            fail(
                line_str +
                'value in row "doi" contains a valid DOI, but the format ' +
                'is not correct. It should be the simple DOI name, not ' +
                'handbook notation (doi:...) or an HTTP URI (http://dx.doi.org/...)'
            )
    if len(row['publisher']) != len(row['publisher'].strip()):
        fail(line_str + 'publisher name (' + row['publisher'] +
             ') has leading or trailing whitespaces')
    if len(row['journal_full_title']) != len(
            row['journal_full_title'].strip()):
        fail(line_str + 'journal title (' + row['journal_full_title'] +
             ') has leading or trailing whitespaces')

    if row_object.transformative_agreements:
        if not oat.has_value(row['agreement']):
            fail(line_str + 'the column "agreement" must not be empty')

    if not row_object.transformative_agreements:
        try:
            euro = float(row['euro'])
            if euro <= 0:
                fail(line_str + 'value in row "euro" (' + row['euro'] +
                     ') must be larger than 0')
        except ValueError:
            fail(line_str + 'value in row "euro" (' + row['euro'] +
                 ') is no valid number')
Example #6
def generate_metadata_section(institution, ins_content, stats, lang):
    markdown = LANG[lang]["md_header"]
    ins_line = None
    for line in ins_content:
        if line[0] == institution:
            ins_line = line
            break
    else:
        oat.print_r("ERROR: Entry " + institution +
                    " not found in institutions file!")
        sys.exit()
    locale_date = format_date(date.today(), locale=lang)
    markdown += "* " + LANG[lang]["md_date"] + ": " + locale_date + "\n"
    git_rev = run(["git", "describe", "--tags", "--abbrev=0"],
                  capture_output=True).stdout.decode()
    git_rev = git_rev.replace("\n", "")
    rev_url = "https://github.com/OpenAPC/openapc-de/tree/" + git_rev
    markdown += "* " + LANG[lang][
        "md_rev"] + ": [" + git_rev + "](" + rev_url + ")\n"
    markdown += "* " + LANG[lang]["md_ins"] + ": " + ins_line[2] + "\n"
    if oat.has_value(ins_line[7]):
        grid_id = ins_line[7]
        grid_url = "https://www.grid.ac/institutes/" + grid_id
        markdown += "* " + LANG[lang][
            "md_grid"] + ": [" + grid_id + "](" + grid_url + ")\n"
    if oat.has_value(ins_line[8]):
        ror_id = ins_line[8]
        markdown += "* " + LANG[lang][
            "md_ror"] + ": [" + ror_id + "](" + ror_id + ")\n"
    markdown += "* " + LANG[lang]["md_ins_apc"] + ": " + ins_line[0] + "\n"
    url = "https://treemaps.openapc.net/apcdata/"
    treemap_url = "<" + url + ins_line[1].replace("_", "-") + ">"
    markdown += "* " + LANG[lang]["md_treemap"] + ": " + treemap_url + "\n"
    data_dir = ins_line[6]
    if oat.has_value(data_dir):
        stats = get_data_dir_stats(data_dir)
        data_url = "https://github.com/OpenAPC/openapc-de/tree/master/data/" + data_dir
        markdown += "* " + LANG[lang][
            "md_data_dir"] + ": [" + data_dir + "](" + data_url + ")\n"
        markdown += "* " + LANG[lang]["md_num_files"] + ": " + str(
            stats["orig_files"]) + "\n"
        markdown += "* " + LANG[lang]["md_readme"] + ": "
        if stats["readme"]:
            markdown += LANG[lang]["md_readme_yes"]
        else:
            markdown += LANG[lang]["md_readme_no"]
        markdown += "\n"
    else:
        oat.print_y("WARNING: No data dir entry found for " + institution +
                    "!")
    markdown += "\n"
    return markdown
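
The LANG table consulted above maps a language code to localized text fragments for the report. A hedged sketch of its expected shape (keys taken from the code above, wording illustrative; the real table carries more keys following the same pattern):

LANG = {
    "en": {
        "md_header": "## Metadata\n\n",
        "md_date": "Date",
        "md_rev": "Data revision",
        "md_ins": "Institution",
        "md_grid": "GRID ID",
        "md_ror": "ROR ID",
    },
}
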
def get_hybrid_status(line, username):
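    # Columns 7-10 hold the ISSN variants in OpenAPC's column layout
    # (issn, issn_print, issn_electronic, issn_l); try each in turn.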
    for issn in [7, 8, 9, 10]:
        if not oat.has_value(line[issn]):
            continue
        msg = 'Looking up ISSN {}...'
        oat.print_y(msg.format(line[issn]))
        jtoc_metadata = get_jtoc_metadata(line[issn], username)
        sleep(1)
        if jtoc_metadata["jtoc_id"] is not None:
            msg = ('Entry found (publisher: {}, title: {}, jtoc_ID: {}, ' +
                   'obtaining hybrid status...)')
            oat.print_g(
                msg.format(jtoc_metadata["jtoc_publisher"],
                           jtoc_metadata["jtoc_title"],
                           jtoc_metadata["jtoc_id"]))
            journal_type = get_jtoc_journal_type(jtoc_metadata["jtoc_id"])
            if not journal_type:
                oat.print_r("Error while obtaining hybrid status!")
                continue
            sleep(1)
            msg = "journaltocs type is '{}' , mapped to is_hybrid = {}"
            oat.print_g(msg.format(journal_type[0], journal_type[1]))
            return journal_type[1]
    oat.print_r("None of the ISSN values found in journaltocs!")
    return None
Example #8
def generate_apc_deviaton_section(institution,
                                  articles,
                                  stats,
                                  lang,
                                  csv_output=False):
    if csv_output:
        csv_content = [[
            "Journal", "Publisher", "Journal Articles in OpenAPC", "Period",
            "DOI", "Reported Costs", "OpenAPC Mean Value",
            "OpenAPC Standard Deviation", "Difference (absolute)",
            "Difference (Standard Deviations)"
        ]]
    md_content = ""
    journal_dict = {}
    for article in articles:
        journal = article[6]
        if journal not in journal_dict:
            journal_dict[journal] = [article]
        else:
            journal_dict[journal].append(article)
    journals = list(journal_dict.keys())
    journals.sort()
    md_content += LANG[lang]["ad_header"]
    md_content += LANG[lang]["ad_intro"]
    md_content += LANG[lang]["ad_disc"]
    for journal in journals:
        publisher = journal_dict[journal][0][5]
        num_articles = journal_dict[journal][0][22]
        md_content += LANG[lang]["ad_table_header"].format(
            journal, publisher, num_articles)
        md_content += LANG[lang]["ad_th"]
        for article in journal_dict[journal]:
            row = "|"
            for index in [1, 3, 2, 18, 19, 20]:
                elem = str(article[index]).replace("|", "\\|")
                if index == 3:  # doi
                    if oat.has_value(elem):
                        elem = "[" + elem + "](https://doi.org/" + elem + ")"
                    else:  # No doi, use url instead
                        elem = "[Link](" + article[16] + ")"
                if index in [2, 18, 19, 20]:  # monetary
                    elem = elem + "€"
                row += elem + "|"
            row += "\n"
            md_content += row
            if csv_output:
                line = []
                for index in [6, 5, 22, 1, 3, 2, 18, 19, 20, 21]:
                    line.append(str(article[index]))
                csv_content.append(line)
        md_content += "\n\n"
    md_content += LANG[lang]["ad_stats_header"].format(institution)
    for stat in ["articles", "not_checked", "within_limits", "significant"]:
        md_content += "* " + LANG[lang]["ad_stats_" + stat]
        md_content += ": " + str(stats[stat]) + "\n"
    if csv_output:
        with open("report.csv", "w") as out:
            csv_writer = csv.writer(out)
            csv_writer.writerows(csv_content)
    return md_content
Example #9
def check_isbns(row_object):
    __tracebackhide__ = True
    row = row_object.row
    line_str = '{}, line {}: '.format(row_object.file_name,
                                      row_object.line_number)
    isbn = row["isbn"]
    publisher = row["publisher"]
    if not oat.has_value(isbn):
        fail(line_str + 'The isbn column may not be empty')
        return
    test_result = ISBNHANDLING.test_and_normalize_isbn(isbn)
    if not test_result["valid"]:
        error = ISBNHANDLING.ISBN_ERRORS[test_result["error_type"]]
        fail(line_str + 'The isbn is invalid: ' + error)
        return
    group_and_publisher = _get_isbn_group_publisher(isbn)
    for other_publisher in isbn_dict[group_and_publisher]:
        if other_publisher != publisher and not wl.publisher_identity(publisher, other_publisher):
            msg = line_str + ('Two book entries share a common group-publisher combination in ' +
                              'their ISBNs ({}), but the publisher name differs ("{}" vs "{}")')
            fail(msg.format(group_and_publisher, publisher, other_publisher))
Example #10
def check_common_field_content(row_object):
    __tracebackhide__ = True
    row = row_object.row
    line_str = '{}, line {}: '.format(row_object.file_name,
                                      row_object.line_number)
    if not oat.has_value(row['publisher']):
        fail(line_str + 'the column "publisher" must not be empty')
    if row['indexed_in_crossref'] not in ["TRUE", "FALSE"]:
        fail(line_str +
             'value in row "indexed_in_crossref" must either be TRUE or FALSE')
    if not row['doi'] == "NA":
        doi_norm = oat.get_normalised_DOI(row['doi'])
        if doi_norm is None:
            fail(
                line_str +
                'value in row "doi" must either be NA or represent a valid DOI'
            )
        elif doi_norm != row['doi']:
            fail(
                line_str +
                'value in row "doi" contains a valid DOI, but the format ' +
                'is not correct. It should be the simple DOI name, not ' +
                'handbook notation (doi:...) or an HTTP URI (http://dx.doi.org/...)'
            )
    if len(row['publisher']) != len(row['publisher'].strip()):
        fail(line_str + 'publisher name (' + row['publisher'] +
             ') has leading or trailing whitespaces')
Example #11
def check_bpc_field_content(row_object):
    __tracebackhide__ = True
    row = row_object.row
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    if not oat.has_value(row['book_title']):
        fail(line_str + 'the column "book_title" must not be empty')
    if len(row['book_title']) != len(row['book_title'].strip()):
        fail(line_str + 'book title (' + row['book_title'] + ') has leading or trailing whitespaces')
Example #12
def check_optional_fields(row_object):
    __tracebackhide__ = True
    row = row_object.row
    if row['doi'] == "NA":
        line_str = '{}, line {}: '.format(row_object.file_name,
                                          row_object.line_number)
        if not oat.has_value(row['publisher']):
            pytest.fail(line_str + 'if no DOI is given, the column ' +
                        '"publisher" must not be empty')
        if not oat.has_value(row['journal_full_title']):
            pytest.fail(line_str + 'if no DOI is given, the column ' +
                        '"journal_full_title" must not be empty')
        if not oat.has_value(row['issn']):
            pytest.fail(line_str + 'if no DOI is given, the column "issn" ' +
                        'must not be empty')
        if not oat.has_value(row['url']):
            pytest.fail(line_str + 'if no DOI is given, the column "url" ' +
                        'must not be empty')
Example #14
def check_optional_identifier(row_object):
    __tracebackhide__ = True
    row = row_object.row
    if row['doi'] == "NA":
        line_str = '{}, line {}: '.format(row_object.file_name,
                                          row_object.line_number)
        if not oat.has_value(row['url']):
            fail(line_str + 'if no DOI is given, the column "url" ' +
                        'must not be empty')
Example #15
def check_for_isbn_duplicates(row_object):
    __tracebackhide__ = True
    isbn_list = []
    # prepare a deduplicated list
    for isbn_type in ["isbn", "isbn_print", "isbn_electronic"]:
        isbn = row_object.row[isbn_type]
        if oat.has_value(isbn) and isbn not in isbn_list and isbn not in wl.NON_DUPLICATE_ISBNS:
            isbn_list.append(isbn)
    for isbn in isbn_list:
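        # Remove one occurrence first: the global duplicate list still
        # contains this row's own ISBNs, so any remaining occurrence
        # means another row shares the ISBN.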
        isbn_duplicate_list.remove(isbn)
        if isbn in isbn_duplicate_list:
            line_str = '{}, line {}: '.format(row_object.file_name,
                                              row_object.line_number)
            fail(line_str + 'Duplicate: ISBN "' + isbn + '" was ' +
                 'encountered more than one time')
Example #16
def is_whitelisted(field_type, new_value, established_value, issn, issn_p,
                   issn_e, issn_l):
    # The JOURNAL_HYBRID_STATUS_CHANGED wl only lists one of the issns,
    # so we have to compare all issn types for a match.
    if field_type == "is_hybrid":
        if len({issn, issn_p, issn_e, issn_l}.intersection(
                wl.JOURNAL_HYBRID_STATUS_CHANGED)) > 0:
            return True
        return False
    # The publisher wls, on the other hand, list all issn types, so we have
    # to implement a different kind of logic
    if field_type == "publisher":
        for issn_type in [issn, issn_p, issn_e, issn_l]:
            if oat.has_value(issn_type) and not wl.in_whitelist(
                    issn_type, established_value, new_value):
                return False
        return True
    return False
def generate_apc_deviaton_section(institution, articles, stats, lang):
    md_content = ""
    journal_dict = {}
    for article in articles:
        journal = article[6]
        if journal not in journal_dict:
            journal_dict[journal] = [article]
        else:
            journal_dict[journal].append(article)
    journals = list(journal_dict.keys())
    journals.sort()
    md_content += LANG[lang]["ad_header"]
    md_content += LANG[lang]["ad_intro"]
    md_content += LANG[lang]["ad_disc"]
    for journal in journals:
        publisher = journal_dict[journal][0][5]
        num_articles = journal_dict[journal][0][21]
        md_content += LANG[lang]["ad_table_header"].format(
            journal, publisher, num_articles)
        md_content += LANG[lang]["ad_th"]
        for article in journal_dict[journal]:
            row = "|"
            for index in [1, 3, 2, 18, 19, 20]:
                elem = str(article[index]).replace("|", "\\|")
                if index == 3:  # doi
                    if oat.has_value(elem):
                        elem = "[" + elem + "](https://doi.org/" + elem + ")"
                    else:  # No doi, use url instead
                        elem = "[Link](" + article[16] + ")"
                if index in [2, 18, 19, 20]:  # monetary
                    elem = elem + "€"
                row += elem + "|"
            row += "\n"
            md_content += row
        md_content += "\n\n"
    md_content += LANG[lang]["ad_stats_header"].format(institution)
    for stat in ["articles", "not_checked", "within_limits", "significant"]:
        md_content += "* " + LANG[lang]["ad_stats_" + stat]
        md_content += ": " + str(stats[stat]) + "\n"
    return md_content
Example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    args = parser.parse_args()

    for file_path in [APC_DE_FILE, TA_FILE]:
        with open(file_path, "r") as path:
            reader = csv.DictReader(path)
            oat.print_b("Preparing mapping tables from " + file_path + "...")
            for line in reader:
                data = {
                    "journal_full_title": line["journal_full_title"],
                    "publisher": line["publisher"],
                    "is_hybrid": line["is_hybrid"],
                    "count": 1
                }
                for issn_type in ISSN_DICTS.keys():
                    issn = line[issn_type]
                    if issn not in ISSN_DICTS[issn_type]:
                        ISSN_DICTS[issn_type][issn] = data
                    else:
                        ISSN_DICTS[issn_type][issn]["count"] += 1
                if reader.line_num % 10000 == 0:
                    oat.print_b(str(reader.line_num) + " lines processed")

    modified_content = []
    header = None
    with open(args.csv_file) as csv_file:
        reader = csv.DictReader(csv_file)
        header = list(reader.fieldnames)
        stopped = False
        for line in reader:
            if stopped:
                modified_content.append(line)
                continue
            for issn_type in ISSN_DICTS.keys():
                issn = line[issn_type]
                if not oat.has_value(issn):
                    continue
                if issn in ISSN_DICTS[issn_type]:
                    for field_type in [
                            "is_hybrid", "publisher", "journal_full_title"
                    ]:
                        new_value = line[field_type]
                        established_value = ISSN_DICTS[issn_type][issn][
                            field_type]
                        if new_value != established_value and not is_whitelisted(
                                field_type, new_value, established_value,
                                line["issn"], line["issn_print"],
                                line["issn_electronic"], line["issn_l"]):
                            msg = MISMATCH_MSG.format(
                                reader.line_num,
                                oat.colorize(field_type, "cyan"), issn_type,
                                issn, line["is_hybrid"], line["publisher"],
                                line["journal_full_title"],
                                oat.colorize(
                                    str(ISSN_DICTS[issn_type][issn]["count"]),
                                    "cyan"),
                                ISSN_DICTS[issn_type][issn]["is_hybrid"],
                                ISSN_DICTS[issn_type][issn]["publisher"],
                                ISSN_DICTS[issn_type][issn]
                                ["journal_full_title"])
                            print(msg)
                            ask_msg = CORRECT_MSG.format(
                                field_type,
                                oat.colorize(established_value, "green"),
                                field_type,
                                oat.colorize(established_value, "green"))
                            ezb_msg = None
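                            # 1 = apply correction, 2 = apply & stop,
                            # 3 = keep value, 4 = keep & stop, 5 = show EZB info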
                            ret = input(ask_msg)
                            while ret not in ["1", "2", "3", "4"]:
                                if ret == "5":
                                    if ezb_msg is None:
                                        ezb_msg = _prepare_ezb_info(issn)
                                    print(ezb_msg)
                                ret = input(
                                    "Please select an option from 1 to 5 > ")
                            print("\n\n\n\n")
                            if ret in ["1", "2"]:
                                line[field_type] = established_value
                            if ret in ["2", "4"]:
                                stopped = True
                                break
            modified_content.append(line)
    modified_lines = [header]
    for line in modified_content:
        modified_lines.append(list(line.values()))
    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True, False)
        writer.write_rows(modified_lines)
Example #19
line = 2
for row in reader:
    for field in UNUSED_FIELDS:
        del row[field]
    transformative_agreements = False
    if file_name == TRANSAGREE_FILE_PATH:
        transformative_agreements = True
    apc_data.append(RowObject(file_name, line, row, transformative_agreements))
    doi_duplicate_list.append(row["doi"])

    reduced_row = {}
    for field in ISSN_DICT_FIELDS:
        reduced_row[field] = row[field]

    issn = row["issn"]
    if oat.has_value(issn):
        if issn not in issn_dict:
            issn_dict[issn] = [reduced_row]
        elif reduced_row not in issn_dict[issn]:
            issn_dict[issn].append(reduced_row)
    issn_p = row["issn_print"]
    if oat.has_value(issn_p):
        if issn_p not in issn_p_dict:
            issn_p_dict[issn_p] = [reduced_row]
        elif reduced_row not in issn_p_dict[issn_p]:
            issn_p_dict[issn_p].append(reduced_row)
    issn_e = row["issn_electronic"]
    if oat.has_value(issn_e):
        if issn_e not in issn_e_dict:
            issn_e_dict[issn_e] = [reduced_row]
        elif reduced_row not in issn_e_dict[issn_e]:
            issn_e_dict[issn_e].append(reduced_row)
    line += 1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules", 
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()
    
    quote_rules = args.openapc_quote_rules

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print ("Error: A quotemask may only contain the letters 't' and"  +
                   "'f'!")
            sys.exit()
        mask = [x == "t" for x in args.quotemask]
    
    enc = None
    
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            sys.exit()
    
    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                  args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            msg = "Setting locale to {} failed: {}".format(norm, loce.message)
            oat.print_r(msg)
            sys.exit()
        
    header, content = oat.get_csv_file_content(args.source_file, enc, True)
    fieldnames = header.pop()
    
    modified_content = []
    line_num = 0
    
    for column_type in ["source_column", "currency_column", "period_column", "target_column"]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))
    
    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()
    
    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " + str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try: 
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if currency == "EUR":
            msg = "WARNING: Currency in line {} is already EUR, skipping..."
            oat.print_y(msg.format(line_num))
            line[args.target_column] = line[args.source_column]
            modified_content.append(line)
            continue
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        frequency = get_frequency(period)
        if frequency is None:
            msg = "WARNING: Could not extract a valid date string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        if currency not in EXCHANGE_RATES[frequency]:
            msg = 'No exchange rates ({}) found for currency "{}", querying ECB data warehouse...'
            oat.print_b(msg.format(frequency, currency))
            rates = oat.get_euro_exchange_rates(currency, frequency)
            EXCHANGE_RATES[frequency][currency] = rates
        rate = EXCHANGE_RATES[frequency][currency].get(period)
        if rate is None and frequency == "A":
            rate = _calulate_preliminary_annual_average(period, currency)
            if rate:
                EXCHANGE_RATES[frequency][currency][period] = rate
        if rate is None:
            if frequency != "D":
                msg = "Error: No conversion rate found for currency {} for period {} (line {}), aborting..."
                oat.print_r(msg.format(currency, period, line_num))
                sys.exit()
            day_retries = 0
            while rate is None:
                msg = "Warning: No conversion rate found for currency {} for period {} (line {}), trying next day..."
                oat.print_y(msg.format(currency, period, line_num))
                period = get_next_day(period)
                rate = EXCHANGE_RATES[frequency][currency].get(period)
                day_retries += 1
                if day_retries > 5:
                    msg = "Error: Look-ahead limit for days exceeded, aborting..."
                    oat.print_r(msg)
                    sys.exit()

        euro_value = round(monetary_value/float(rate), 2)
        line[args.target_column] = str(euro_value)
        
        msg = "Line {}: {} exchange rate ({}) for date {} is {} -> {} / {} = {} EUR"
        msg = msg.format(line_num, currency, frequency, period, rate, monetary_value, rate, euro_value)
        oat.print_g(msg)
        
        modified_content.append(line)
    
    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
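
This variant works with daily, monthly, and annual ECB rates; get_frequency and get_next_day are helpers not shown in the snippet. A hedged sketch of what they plausibly look like (the period formats are inferred from the ECB frequency codes used above; the original implementations may differ):

import re
from datetime import datetime, timedelta

def get_frequency(period):
    # Map a period string to an ECB frequency code.
    if re.match(r"^\d{4}$", period):
        return "A"  # annual average, e.g. "2020"
    if re.match(r"^\d{4}-\d{2}$", period):
        return "M"  # monthly average, e.g. "2020-05"
    if re.match(r"^\d{4}-\d{2}-\d{2}$", period):
        return "D"  # daily rate, e.g. "2020-05-14"
    return None

def get_next_day(period):
    # Advance a daily period by one day, keeping the ISO date format.
    day = datetime.strptime(period, "%Y-%m-%d") + timedelta(days=1)
    return day.strftime("%Y-%m-%d")
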
Example #21
    with open(metadata["file_path"], "r") as csv_file:
        reader = DictReader(csv_file)
        line = 2
        for row in reader:
            for field in metadata["unused_fields"]:
                del row[field]
            metadata["target_file"].append(RowObject(metadata["file_path"], line, row, data_file))
            doi_duplicate_list.append(row["doi"])

            if metadata["has_issn"]:
                reduced_row = {}
                for field in ISSN_DICT_FIELDS:
                    reduced_row[field] = row[field]

                issn = row["issn"]
                if oat.has_value(issn):
                    if issn not in issn_dict:
                        issn_dict[issn] = [reduced_row]
                    elif reduced_row not in issn_dict[issn]:
                        issn_dict[issn].append(reduced_row)
                issn_p = row["issn_print"]
                if oat.has_value(issn_p):
                    if issn_p not in issn_p_dict:
                        issn_p_dict[issn_p] = [reduced_row]
                    elif reduced_row not in issn_p_dict[issn_p]:
                        issn_p_dict[issn_p].append(reduced_row)
                issn_e = row["issn_electronic"]
                if oat.has_value(issn_e):
                    if issn_e not in issn_e_dict:
                        issn_e_dict[issn_e] = [reduced_row]
                    elif reduced_row not in issn_e_dict[issn_e]:
                        issn_e_dict[issn_e].append(reduced_row)
Example #22
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q",
                        "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o",
                        "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true",
                        default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print("Error: A quotemask may only contain the letters 't' and" +
                  "'f'!")
            sys.exit()
        mask = [x == "t" for x in args.quotemask]

    enc = None

    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            msg = "Setting locale to {} failed: {}".format(norm, loce.message)
            oat.print_r(msg)
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc)
    fieldnames = header.pop()

    modified_content = []
    line_num = 0

    for column_type in [
            "source_column", "currency_column", "period_column",
            "target_column"
    ]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))

    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " +
                        str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try:
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        if not oat.has_value(period) or not period.isdigit():
            msg = "WARNING: Could not extract a valid year string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        try:
            rate = AVG_YEARLY_CONVERSION_RATES[currency][period]
        except KeyError:
            msg = "ERROR: No conversion rate found for currency {} in year {} (line {}), aborting..."
            oat.print_r(msg.format(currency, period, line_num))
            sys.exit()

        euro_value = round(monetary_value / rate, 2)
        line[args.target_column] = str(euro_value)

        modified_content.append(line)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
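
Unlike the ECB-querying variant above, this converter reads pre-computed annual averages from AVG_YEARLY_CONVERSION_RATES. The lookup implies a nested mapping from currency to year string to rate; a hedged sketch with illustrative numbers:

AVG_YEARLY_CONVERSION_RATES = {
    "USD": {"2019": 1.1195, "2020": 1.1422},
    "GBP": {"2019": 0.8778, "2020": 0.8897},
}
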
Example #23
def integrate_changes(articles, file_path, enriched_file=False):
    '''
    Update existing entries in a previously created harvest file.
    
    Args:
        articles: A list of article dicts, as returned by openapc_toolkit.oai_harvest()
        file_path: Path to the CSV file the new values should be integrated into.
        enriched_file: If true, columns which are overwritten during enrichment
                       will not be updated
    Returns:
        A tuple. The first element is a reduced list of article dicts, containing
        those which did not find a matching URL in the file (order preserved).
        The second element is the list of column headers encountered in the harvest 
        file.
    '''
    if not os.path.isfile(file_path):
        return (articles, None)
    enriched_blacklist = [
        "institution", "publisher", "journal_full_title", "issn",
        "license_ref", "pmid"
    ]
    article_dict = OrderedDict()
    for article in articles:
        # This is possible because currently all repos use a local ID/record url, but it's just
        # a workaround. We might have to change to OAI record IDs later.
        url = article["url"]
        if oat.has_value(url):
            article_dict[url] = article
    updated_lines = []
    fieldnames = None
    with open(file_path, "r") as f:
        reader = DictReader(f)
        fieldnames = reader.fieldnames
        updated_lines.append(list(fieldnames))  #header
        start_msg = "Integrating changes in harvest data into existing file {}"
        oat.print_g(start_msg.format(file_path))
        for line in reader:
            url = line["url"]
            line_num = reader.reader.line_num
            msg = "Line {}: Checking for changes ({})"
            oat.print_b(msg.format(line_num, url))
            if url in article_dict:
                for key, value in article_dict[url].items():
                    if enriched_file and key in enriched_blacklist:
                        continue
                    if key in line and value != line[key]:
                        update_msg = 'Updating value in column {} ("{}" -> "{}")'
                        oat.print_g(update_msg.format(key, line[key], value))
                        line[key] = value
                del article_dict[url]
                updated_line = [line[key] for key in fieldnames]
                updated_lines.append(updated_line)
            else:
                remove_msg = "URL {} no longer found in harvest data, removing article"
                oat.print_r(remove_msg.format(url))
    with open(file_path, "w") as f:
        mask = oat.OPENAPC_STANDARD_QUOTEMASK if enriched_file else None
        writer = oat.OpenAPCUnicodeWriter(f,
                                          quotemask=mask,
                                          openapc_quote_rules=True,
                                          has_header=True)
        writer.write_rows(updated_lines)
    return (article_dict.values(), fieldnames)
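
A hedged usage sketch (hypothetical file name): after an OAI harvest, fold the results into the existing CSV; the articles returned in the first tuple element were not found in the file and still have to be appended as new entries:

remaining, fieldnames = integrate_changes(harvested_articles, "harvest.csv",
                                          enriched_file=False)
for article in remaining:
    print("New article, not yet in file: " + article["url"])
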
Example #24
def integrate_changes(articles, file_path, enriched_file=False, dry_run=False):
    '''
    Update existing entries in a previously created harvest file.
    
    Args:
        articles: A list of article dicts, as returned by openapc_toolkit.oai_harvest()
        file_path: Path to the CSV file the new values should be integrated into.
        enriched_file: If true, columns which are overwritten during enrichment
                       will not be updated
        dry_run: Do not make any changes to the file (but still report changes and
                 return the list of unencountered articles)
    Returns:
        A tuple. The first element is a reduced list of article dicts, containing
        those which did not find a matching PID in the file (order preserved).
        The second element is the list of column headers encountered in the harvest 
        file.
    '''

    messages = {
        'wet': {
            'start':
            'Integrating changes in harvest data into existing file {}',
            'line_change':
            'Line {}: Updating value in column {} ("{}" -> "{}")',
            'remove':
            'PID {} no longer found in harvest data, removing article',
        },
        'dry': {
            'start':
            'Dry Run: Comparing harvest data to existing file {}',
            'line_change':
            'Line {} ({}): Change in column {} ("{}" -> "{}")',
            'remove':
            'PID {} no longer found in harvest data, article would be removed',
        }
    }

    messages = messages['dry'] if dry_run else messages['wet']

    if not os.path.isfile(file_path):
        return (articles, None)
    enriched_blacklist = [
        "institution", "publisher", "journal_full_title", "issn",
        "license_ref", "pmid"
    ]
    article_dict = OrderedDict()
    for article in articles:
        # Harvested articles use OAI record IDs in the url field as PID.
        url = article["url"]
        if oat.has_value(url):
            article_dict[url] = article
    updated_lines = []
    fieldnames = None
    with open(file_path, "r") as f:
        reader = DictReader(f)
        fieldnames = reader.fieldnames
        updated_lines.append(list(fieldnames))  #header
        oat.print_y(messages["start"].format(file_path))
        for line in reader:
            url = line["url"]
            if not oat.has_value(line["institution"]):
                # Do not change empty lines
                updated_lines.append([line[key] for key in fieldnames])
                continue
            line_num = reader.reader.line_num
            if url in article_dict:
                for key, value in article_dict[url].items():
                    if enriched_file and key in enriched_blacklist:
                        continue
                    if key in line and value != line[key]:
                        oat.print_g(messages["line_change"].format(
                            line_num, line["url"], key, line[key], value))
                        line[key] = value
                del article_dict[url]
                updated_line = [line[key] for key in fieldnames]
                updated_lines.append(updated_line)
            else:
                oat.print_r(messages["remove"].format(url))
    if not dry_run:
        with open(file_path, "w") as f:
            mask = oat.OPENAPC_STANDARD_QUOTEMASK if enriched_file else None
            writer = oat.OpenAPCUnicodeWriter(f,
                                              quotemask=mask,
                                              openapc_quote_rules=True,
                                              has_header=True)
            writer.write_rows(updated_lines)
    return (article_dict.values(), fieldnames)
Example #25
grid_list = json_dict["institutes"]

for index, ins in enumerate(grid_list):
    deciles = {
        round((len(grid_list) / 10) * i): str(i * 10) + "%"
        for i in range(1, 10)
    }
    if index in deciles:
        print(deciles[index])
    if ins["status"] != "active":
        continue
    grid_names = [ins["name"]]
    if "aliases" in ins:
        grid_names += ins["aliases"]
    for institutions_row in ins_content:
        if oat.has_value(institutions_row[7]):
            continue
        institutions_name = institutions_row[2]
        grid_name, highest_ratio = get_best_match(grid_names,
                                                  institutions_name)
        match_type = get_match_type(highest_ratio)
        if match_type is not None:
            grid_id = ins["id"]
            msg = '{} match: "{}" might be Grid institution "{}" ({}).'
            question = 'Assign Grid ID {} ({})  (y/n/q)?'
            msg = msg.format(match_type["name"], institutions_name, grid_name,
                             highest_ratio)
            question = question.format(grid_id, ins["name"])
            match_type["print_func"](msg)
            start = input(question)
            while start not in ["y", "n", "q"]:
Example #26
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules", 
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()
    
    quote_rules = args.openapc_quote_rules

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print ("Error: A quotemask may only contain the letters 't' and"  +
                   "'f'!")
            sys.exit()
        mask = [x == "t" for x in args.quotemask]
    
    enc = None
    
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            sys.exit()
    
    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                  args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            msg = "Setting locale to {} failed: {}".format(norm, loce.message)
            oat.print_r(msg)
            sys.exit()
        
    header, content = oat.get_csv_file_content(args.source_file, enc, True)
    fieldnames = header.pop()
    
    modified_content = []
    line_num = 0
    
    for column_type in ["source_column", "currency_column", "period_column", "target_column"]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))
    
    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()
    
    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " + str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try: 
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        frequency = get_frequency(period)
        if frequency is None:
            msg = "WARNING: Could not extract a valid date string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        if currency not in EXCHANGE_RATES[frequency]:
            msg = 'No exchange rates ({}) found for currency "{}", querying ECB data warehouse...'
            oat.print_b(msg.format(frequency, currency))
            rates = oat.get_euro_exchange_rates(currency, frequency)
            EXCHANGE_RATES[frequency][currency] = rates
        try:
            rate = EXCHANGE_RATES[frequency][currency][period]
        except KeyError:
            msg = "ERROR: No conversion rate found for currency {} for period {} (line {}), aborting..."
            oat.print_r(msg.format(currency, period, line_num))
            sys.exit()
        
        euro_value = round(monetary_value/float(rate), 2)
        line[args.target_column] = str(euro_value)
        
        msg = "Line {}: {} exchange rate ({}) for date {} is {} -> {} / {} = {} EUR"
        msg = msg.format(line_num, currency, frequency, period, rate, monetary_value, rate, euro_value)
        oat.print_g(msg)
        
        modified_content.append(line)
    
    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
Example #27
File: do_harvest.py  Project: MPDL/unibiAPC
def integrate_changes(articles, file_path, enriched_file=False):
    '''
    Update existing entries in a previously created harvest file.
    
    Args:
        articles: A list of article dicts, as returned by openapc_toolkit.oai_harvest()
        file_path: Path to the CSV file the new values should be integrated into.
        enriched_file: If true, columns which are overwritten during enrichment
                       will not be updated
    Returns:
        A tuple. The first element is a reduced list of article dicts, containing
        those which did not find a matching DOI in the file (Order preserved).
        The second element is the list of column headers encountered in the harvest 
        file.
    '''
    if not os.path.isfile(file_path):
        return (articles, None)
    enriched_blacklist = ["institution", "publisher", "journal_full_title", "issn", "license_ref", "pmid"]
    article_dict = OrderedDict()
    for article in articles:
        doi = article["doi"]
        if oat.has_value(doi):
            article_dict[doi] = article
    updated_lines = []
    fieldnames = None
    with open(file_path, "r") as f:
        reader = DictReader(f)
        fieldnames = reader.fieldnames
        updated_lines.append(list(fieldnames)) #header
        start_msg = "Integrating changes in harvest data into existing file {}"
        oat.print_g(start_msg.format(file_path))
        for line in reader:
            doi = line["doi"]
            line_num = reader.reader.line_num
            if not oat.has_value(doi):
                msg = "Line {}: No DOI found, change check not possible"
                oat.print_y(msg.format(line_num))
                updated_line = [line[key] for key in fieldnames]
                updated_lines.append(updated_line)
            else:
                msg = "Line {}: Checking for changes ({})"
                oat.print_b(msg.format(line_num, doi))
                if doi in article_dict:
                    for key, value in article_dict[doi].items():
                        if enriched_file and key in enriched_blacklist:
                            continue
                        if key in line and value != line[key]:
                            update_msg = 'Updating value in column {} ("{}" -> "{}")'
                            oat.print_g(update_msg.format(key, line[key], value))
                            line[key] = value
                    del article_dict[doi]
                    updated_line = [line[key] for key in fieldnames]
                    updated_lines.append(updated_line)
                else:
                    remove_msg = "DOI {} no longer found in harvest data, removing article"
                    oat.print_r(remove_msg.format(doi))
    with open(file_path, "w") as f:
        mask = oat.OPENAPC_STANDARD_QUOTEMASK if enriched_file else None
        writer = oat.OpenAPCUnicodeWriter(f, quotemask=mask, openapc_quote_rules=True, has_header=True)
        writer.write_rows(updated_lines)
    return (article_dict.values(), fieldnames)
Example #28
issn_dict = {}
issn_p_dict = {}
issn_e_dict = {}

for file_name in ["data/apc_de.csv", "data/offsetting/offsetting.csv"]:
    csv_file = open(file_name, "r")
    reader = oat.UnicodeDictReader(csv_file)
    line = 2
    for row in reader:
        test_apc = True
        if file_name == "data/offsetting/offsetting.csv":
            test_apc = False
        apc_data.append(RowObject(file_name, line, row, test_apc))
        doi_duplicate_list.append(row["doi"])
        issn = row["issn"]
        if oat.has_value(issn):
            if issn not in issn_dict:
                issn_dict[issn] = [row]
            else:
                issn_dict[issn].append(row)
        issn_p = row["issn_print"]
        if oat.has_value(issn_p):
            if issn_p not in issn_p_dict:
                issn_p_dict[issn_p] = [row]
            else:
                issn_p_dict[issn_p].append(row)
        issn_e = row["issn_electronic"]
        if oat.has_value(issn_e):
            if issn_e not in issn_e_dict:
                issn_e_dict[issn_e] = [row]
            else:
                issn_e_dict[issn_e].append(row)