def _process_springer_catalogue(max_lookups=None):
    global COVERAGE_CACHE, LOOKUPS_PERFORMED
    current_year = datetime.datetime.now().year
    years = [str(year) for year in range(2015, current_year + 1)]
    for year in years:
        # Perform a simple check before wasting any time on processing
        catalogue_file = os.path.join(SPRINGER_JOURNAL_LISTS_DIR,
                                      year + ".csv")
        if not os.path.isfile(catalogue_file):
            raise IOError("Catalogue file " + catalogue_file + " not found!")
    for year in years:
        msg = "Looking up coverage stats for Open Choice journals in " + year
        print(colorise("--- " + msg + " ---", "green"))
        catalogue_file = os.path.join(SPRINGER_JOURNAL_LISTS_DIR,
                                      year + ".csv")
        reader = csv.DictReader(open(catalogue_file, "r"))
        for line in reader:
            if max_lookups is not None and LOOKUPS_PERFORMED >= max_lookups:
                return
            title = line["Title"]
            oa_option = line["Open Access Option"]
            if oa_option != "Hybrid (Open Choice)":
                msg = u'Journal "{}" is not an Open Choice journal (oa_option={}), skipping...'
                print(colorise(msg.format(title, oa_option), "yellow"))
                continue
            journal_id = line["product_id"]
            already_cached = True
            try:
                _ = COVERAGE_CACHE[journal_id]['years'][year][
                    "num_journal_total_articles"]
                _ = COVERAGE_CACHE[journal_id]['years'][year][
                    "num_journal_oa_articles"]
            except KeyError:
                try:
                    _update_journal_stats(title, journal_id, year)
                except ValueError as ve:
                    error_msg = (
                        'Journal "{}" ({}): ValueError while obtaining journal '
                        + 'stats, annual stats not added to cache.')
                    error_msg = colorise(error_msg.format(title, journal_id),
                                         "red")
                    print(error_msg)
                    ERROR_MSGS.append(error_msg)
                    continue
                LOOKUPS_PERFORMED += 1
                already_cached = False
            if already_cached:
                msg = 'Stats for journal "{}" in {} already cached.'
                print(colorise(msg.format(title, year), "yellow"))
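# Note: the colorise(text, colour) helper used throughout these examples is defined
# elsewhere in the project. A minimal sketch of what it might look like, inferred only
# from how it is called here (an assumption, not the project's actual implementation):
ANSI_COLOURS = {
    "red": "\033[91m",
    "green": "\033[92m",
    "yellow": "\033[93m",
}

def colorise(text, colour):
    # Wrap the text in an ANSI colour code, falling back to plain text for unknown names.
    code = ANSI_COLOURS.get(colour)
    if code is None:
        return text
    return code + text + "\033[0m"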
def _get_springer_journal_stats(journal_id, period, oa=False):
    if not journal_id.isdigit():
        raise ValueError("Invalid journal id " + journal_id + " (not a number)")
    url = SPRINGER_FULL_SEARCH.format(journal_id, period, period)
    if oa:
        url = SPRINGER_OA_SEARCH.format(journal_id, period, period)
    print(url)
    try:
        req = Request(url, None)
        response = urlopen(req)
        content = response.read()
        content = content.decode("utf-8")
        results = {}
    except HTTPError as httpe:
        if httpe.code == 503: # retry on timeout
            print(colorise("Timeout (HTTP 503), retrying...", "yellow"))
            return _get_springer_journal_stats(journal_id, period, oa)
        else:
            raise httpe
    count_match = SEARCH_RESULTS_COUNT_RE.search(content)
    if count_match:
        count = count_match.groupdict()['count']
        count = count.replace(",", "")
        results['count'] = int(count)
    else:
        raise ValueError("Regex could not detect a results count at " + url)
    title_match = SEARCH_RESULTS_TITLE_RE.search(content)
    if title_match:
        title = (title_match.groupdict()['title'])
        htmlparser = HTMLParser()
        results['title'] = htmlparser.unescape(title)
    else:
        raise ValueError("Regex could not detect a journal title at " + url)
    return results
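# Note: _get_springer_journal_stats relies on module-level constants defined elsewhere --
# two SpringerLink search URL templates and two regular expressions that scrape the result
# count and the journal title from the returned HTML. The values below are illustrative
# assumptions about their shape (three positional placeholders, named groups 'count' and
# 'title'), not the project's actual definitions:
import re

SPRINGER_FULL_SEARCH = ("https://link.springer.com/search?facet-journal-id={}" +
                        "&date-facet-mode=between&facet-start-year={}&facet-end-year={}")
SPRINGER_OA_SEARCH = SPRINGER_FULL_SEARCH + "&package=openaccessarticles"

# The count pattern has to tolerate thousands separators ("1,234"), which is why the
# calling code strips commas before converting to int.
SEARCH_RESULTS_COUNT_RE = re.compile(r'total-results"[^>]*>\s*(?P<count>[\d,]+)')
SEARCH_RESULTS_TITLE_RE = re.compile(r'<h1[^>]*id="title"[^>]*>(?P<title>[^<]+)</h1>')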
def _update_journal_stats(title, journal_id, year, verbose=True):
    global COVERAGE_CACHE
    total = _get_springer_journal_stats(journal_id, year, oa=False)
    oa = _get_springer_journal_stats(journal_id, year, oa=True)
    if verbose:
        msg = 'Obtained stats for journal "{}" in {}: {} OA, {} Total'
        print(colorise(msg.format(title, year, oa["count"], total["count"]), "green"))
    if journal_id not in COVERAGE_CACHE:
        COVERAGE_CACHE[journal_id] = {'title': title, 'years': {}}
    if year not in COVERAGE_CACHE[journal_id]['years']:
        COVERAGE_CACHE[journal_id]['years'][year] = {}
    COVERAGE_CACHE[journal_id]['years'][year]["num_journal_total_articles"] = total["count"]
    COVERAGE_CACHE[journal_id]['years'][year]["num_journal_oa_articles"] = oa["count"]
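# Note: after _update_journal_stats has run, COVERAGE_CACHE maps a Springer journal id to
# its title plus per-year article counts -- the structure that _shutdown (below) serialises
# to COVERAGE_CACHE_FILE. An illustrative sample (journal id, title and numbers invented):
COVERAGE_CACHE = {
    "12345": {
        "title": "Some Hybrid Journal",
        "years": {
            "2016": {"num_journal_total_articles": 437, "num_journal_oa_articles": 12},
            "2017": {"num_journal_total_articles": 451, "num_journal_oa_articles": 19},
        },
    },
}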
def _shutdown():
    """
    Write cache content back to disk before terminating and display collected error messages.
    """
    print("Updating cache files..")
    with open(COVERAGE_CACHE_FILE, "w") as f:
        f.write(
            json.dumps(COVERAGE_CACHE,
                       sort_keys=True,
                       indent=4,
                       separators=(',', ': ')))
        f.flush()
    with open(PUBDATES_CACHE_FILE, "w") as f:
        f.write(
            json.dumps(PERSISTENT_PUBDATES_CACHE,
                       sort_keys=True,
                       indent=4,
                       separators=(',', ': ')))
        f.flush()
    with open(JOURNAL_ID_CACHE_FILE, "w") as f:
        f.write(
            json.dumps(JOURNAL_ID_CACHE,
                       sort_keys=True,
                       indent=4,
                       separators=(',', ': ')))
        f.flush()
    print("Done.")
    num_articles = 0
    for _, dois in PERSISTENT_PUBDATES_CACHE.items():
        num_articles += len(dois)
    print(
        "The article cache now contains publication dates for {} DOIs".format(
            num_articles))
    if ERROR_MSGS:
        print(
            colorise("There were errors during the lookup process:", "yellow"))
        for msg in ERROR_MSGS:
            print(msg)
    sys.exit()
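# Note: PERSISTENT_PUBDATES_CACHE, whose DOIs are counted in the summary above, maps a
# Springer journal id to a dict of DOI -> publication year (stored as a string so it can
# be compared against the 'period' column); see update_coverage_stats further below.
# Illustrative sample only -- ids, DOIs and years are invented placeholders:
PERSISTENT_PUBDATES_CACHE = {
    "12345": {
        "10.1007/s12345-aaa": "2017",
        "10.1007/s12345-bbb": "2016",
    },
}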
Example No. 9
def create_cubes_tables(connectable,
                        apc_file_name,
                        transformative_agreements_file_name,
                        schema="openapc_schema"):

    apc_fields = [("institution", "string"), ("period", "string"),
                  ("euro", "float"), ("doi", "string"),
                  ("is_hybrid", "string"), ("publisher", "string"),
                  ("journal_full_title", "string"), ("issn", "string"),
                  ("issn_print", "string"), ("issn_electronic", "string"),
                  ("issn_l", "string"), ("license_ref", "string"),
                  ("indexed_in_crossref", "string"), ("pmid", "string"),
                  ("pmcid", "string"), ("ut", "string"), ("url", "string"),
                  ("doaj", "string"), ("country", "string")]

    transformative_agreements_fields = [("institution", "string"),
                                        ("period", "string"),
                                        ("doi", "string"),
                                        ("is_hybrid", "string"),
                                        ("publisher", "string"),
                                        ("journal_full_title", "string"),
                                        ("issn", "string"),
                                        ("issn_print", "string"),
                                        ("issn_electronic", "string"),
                                        ("issn_l", "string"),
                                        ("license_ref", "string"),
                                        ("indexed_in_crossref", "string"),
                                        ("pmid", "string"),
                                        ("pmcid", "string"), ("ut", "string"),
                                        ("url", "string"), ("doaj", "string"),
                                        ("country", "string"),
                                        ("agreement", "string")]

    springer_compact_coverage_fields = [
        ("period", "string"), ("publisher", "string"),
        ("journal_full_title", "string"), ("is_hybrid", "string"),
        ("num_springer_compact_articles", "float"),
        ("num_journal_total_articles", "float"),
        ("num_journal_oa_articles", "float")
    ]

    metadata = sqlalchemy.MetaData(bind=connectable)

    openapc_table = sqlalchemy.Table("openapc",
                                     metadata,
                                     autoload=False,
                                     schema=schema)
    if openapc_table.exists():
        openapc_table.drop(checkfirst=False)
    init_table(openapc_table, apc_fields)
    openapc_insert_command = openapc_table.insert()

    transformative_agreements_table = sqlalchemy.Table(
        "transformative_agreements", metadata, autoload=False, schema=schema)
    if transformative_agreements_table.exists():
        transformative_agreements_table.drop(checkfirst=False)
    init_table(transformative_agreements_table,
               transformative_agreements_fields)
    transformative_agreements_insert_command = transformative_agreements_table.insert()

    combined_table = sqlalchemy.Table("combined",
                                      metadata,
                                      autoload=False,
                                      schema=schema)
    if combined_table.exists():
        combined_table.drop(checkfirst=False)
    init_table(combined_table, apc_fields)
    combined_insert_command = combined_table.insert()

    springer_compact_coverage_table = sqlalchemy.Table(
        "springer_compact_coverage", metadata, autoload=False, schema=schema)
    if springer_compact_coverage_table.exists():
        springer_compact_coverage_table.drop(checkfirst=False)
    init_table(springer_compact_coverage_table,
               springer_compact_coverage_fields)
    springer_compact_coverage_insert_command = springer_compact_coverage_table.insert()

    # a dict to store individual insert commands for every table
    tables_insert_commands = {
        "openapc": openapc_insert_command,
        "transformative_agreements": transformative_agreements_insert_command,
        "combined": combined_insert_command,
        "springer_compact_coverage": springer_compact_coverage_insert_command
    }

    transformative_agreements_institution_countries = {}

    reader = csv.DictReader(
        open("static/institutions_transformative_agreements.csv", "r"))
    for row in reader:
        institution_name = row["institution"]
        country = row["country"]
        transformative_agreements_institution_countries[institution_name] = country

    journal_coverage = None
    article_pubyears = None
    try:
        cache_file = open(scc.COVERAGE_CACHE_FILE, "r")
        journal_coverage = json.loads(cache_file.read())
        cache_file.close()
        cache_file = open(scc.PUBDATES_CACHE_FILE, "r")
        article_pubyears = json.loads(cache_file.read())
        cache_file.close()
    except IOError as ioe:
        msg = "Error while trying to access cache file: {}"
        print(msg.format(ioe))
        sys.exit()
    except ValueError as ve:
        msg = "Error while trying to decode cache structure in: {}"
        print(msg.format(str(ve)))
        sys.exit()

    summarised_transformative_agreements = {}

    journal_id_title_map = {}

    reader = csv.DictReader(open(transformative_agreements_file_name, "r"))
    institution_key_errors = []
    for row in reader:
        institution = row["institution"]
        publisher = row["publisher"]
        issn = row["issn"]
        doi = row["doi"]
        # colons cannot be escaped in URL queries to the cubes server, so we have
        # to remove them here
        row["journal_full_title"] = row["journal_full_title"].replace(":", "")
        title = row["journal_full_title"]
        try:
            row["country"] = transformative_agreements_institution_countries[
                institution]
        except KeyError:
            if institution not in institution_key_errors:
                institution_key_errors.append(institution)
        tables_insert_commands["transformative_agreements"].execute(row)
        if row["euro"] != "NA":
            tables_insert_commands["combined"].execute(row)

        if publisher != "Springer Nature":
            continue

        journal_id = scc._get_springer_journal_id_from_doi(doi, issn)
        journal_id_title_map[journal_id] = title
        try:
            pub_year = article_pubyears[journal_id][doi]
        except KeyError:
            msg = (
                u"Publication year entry not found in article cache for {}. " +
                "You might have to update the article cache with 'python " +
                "assets_generator.py coverage_stats'. Using the 'period' " +
                "column for now.")
            print(colorise(msg.format(doi), "yellow"))
            pub_year = row["period"]

        if journal_id not in summarised_transformative_agreements:
            summarised_transformative_agreements[journal_id] = {}
        if pub_year not in summarised_transformative_agreements[journal_id]:
            summarised_transformative_agreements[journal_id][pub_year] = 1
        else:
            summarised_transformative_agreements[journal_id][pub_year] += 1
    if institution_key_errors:
        print("KeyError: The following institutions were not found in the " +
              "institutions_transformative_agreements file:")
        for institution in institution_key_errors:
            print(institution)
        sys.exit()
    for journal_id, info in journal_coverage.items():
        for year, stats in info["years"].items():
            row = {
                "publisher": "Springer Nature",
                "journal_full_title": info["title"],
                "period": year,
                "is_hybrid": "TRUE",
                "num_journal_total_articles":
                stats["num_journal_total_articles"],
                "num_journal_oa_articles": stats["num_journal_oa_articles"]
            }
            try:
                row["num_springer_compact_articles"] = summarised_transformative_agreements[
                    journal_id][year]
            except KeyError:
                row["num_springer_compact_articles"] = 0
            tables_insert_commands["springer_compact_coverage"].execute(row)

    institution_countries = {}

    reader = csv.DictReader(open("static/institutions.csv", "r"))
    for row in reader:
        cubes_name = row["institution_cubes_name"]
        institution_name = row["institution"]
        country = row["country"]
        institution_countries[institution_name] = country
        if institution_name not in tables_insert_commands:
            table = sqlalchemy.Table(cubes_name,
                                     metadata,
                                     autoload=False,
                                     schema=schema)
            if table.exists():
                table.drop(checkfirst=False)
            init_table(table, apc_fields)
            insert_command = table.insert()
            tables_insert_commands[institution_name] = insert_command

    reader = csv.DictReader(open(apc_file_name, "r"))
    for row in reader:
        institution = row["institution"]
        # colons cannot be escaped in URL queries to the cubes server, so we have
        # to remove them here
        row["journal_full_title"] = row["journal_full_title"].replace(":", "")
        row["country"] = institution_countries[institution]
        tables_insert_commands[institution].execute(row)
        tables_insert_commands["openapc"].execute(row)
        tables_insert_commands["combined"].execute(row)
Example No. 10
def create_cubes_tables(connectable, apc_file_name, offsetting_file_name, schema="openapc_schema"):

    apc_fields = [
        ("institution", "string"),
        ("period", "string"),
        ("euro", "float"),
        ("doi", "string"),
        ("is_hybrid", "string"),
        ("publisher", "string"),
        ("journal_full_title", "string"),
        ("issn", "string"),
        ("issn_print", "string"),
        ("issn_electronic", "string"),
        ("issn_l", "string"),
        ("license_ref", "string"),
        ("indexed_in_crossref", "string"),
        ("pmid", "string"),
        ("pmcid", "string"),
        ("ut", "string"),
        ("url", "string"),
        ("doaj", "string"),
        ("country", "string")
    ]

    offsetting_fields = [
        ("institution", "string"),
        ("period", "string"),
        ("doi", "string"),
        ("is_hybrid", "string"),
        ("publisher", "string"),
        ("journal_full_title", "string"),
        ("issn", "string"),
        ("issn_print", "string"),
        ("issn_electronic", "string"),
        ("issn_l", "string"),
        ("license_ref", "string"),
        ("indexed_in_crossref", "string"),
        ("pmid", "string"),
        ("pmcid", "string"),
        ("ut", "string"),
        ("url", "string"),
        ("doaj", "string"),
        ("country", "string"),
    ]

    offsetting_coverage_fields = [
        ("period", "string"),
        ("publisher", "string"),
        ("journal_full_title", "string"),
        ("is_hybrid", "string"),
        ("num_offsetting_articles", "float"),
        ("num_journal_total_articles", "float"),
        ("num_journal_oa_articles", "float")
    ]

    metadata = sqlalchemy.MetaData(bind=connectable)

    openapc_table = sqlalchemy.Table("openapc", metadata, autoload=False, schema=schema)
    if openapc_table.exists():
        openapc_table.drop(checkfirst=False)
    init_table(openapc_table, apc_fields)
    openapc_insert_command = openapc_table.insert()

    offsetting_table = sqlalchemy.Table("offsetting", metadata, autoload=False, schema=schema)
    if offsetting_table.exists():
        offsetting_table.drop(checkfirst=False)
    init_table(offsetting_table, offsetting_fields)
    offsetting_insert_command = offsetting_table.insert()

    combined_table = sqlalchemy.Table("combined", metadata, autoload=False, schema=schema)
    if combined_table.exists():
        combined_table.drop(checkfirst=False)
    init_table(combined_table, apc_fields)
    combined_insert_command = combined_table.insert()

    offsetting_coverage_table = sqlalchemy.Table("offsetting_coverage", metadata,
                                                 autoload=False, schema=schema)
    if offsetting_coverage_table.exists():
        offsetting_coverage_table.drop(checkfirst=False)
    init_table(offsetting_coverage_table, offsetting_coverage_fields)
    offsetting_coverage_insert_command = offsetting_coverage_table.insert()

    # a dict to store individual insert commands for every table
    tables_insert_commands = {
        "openapc": openapc_insert_command,
        "offsetting": offsetting_insert_command,
        "combined": combined_insert_command,
        "offsetting_coverage": offsetting_coverage_insert_command
    }

    offsetting_institution_countries = {}

    reader = csv.DictReader(open("static/institutions_offsetting.csv", "r"))
    for row in reader:
        institution_name = row["institution"]
        country = row["country"]
        offsetting_institution_countries[institution_name] = country

    journal_coverage = None
    article_pubyears = None
    try:
        cache_file = open(oc.COVERAGE_CACHE_FILE, "r")
        journal_coverage = json.loads(cache_file.read())
        cache_file.close()
        cache_file = open(oc.PUBDATES_CACHE_FILE, "r")
        article_pubyears = json.loads(cache_file.read())
        cache_file.close()
    except IOError as ioe:
        msg = "Error while trying to access cache file: {}"
        print(msg.format(ioe))
        sys.exit()
    except ValueError as ve:
        msg = "Error while trying to decode cache structure in: {}"
        print(msg.format(str(ve)))
        sys.exit()

    summarised_offsetting = {}

    journal_id_title_map = {}

    reader = csv.DictReader(open(offsetting_file_name, "r"))
    institution_key_errors = []
    for row in reader:
        institution = row["institution"]
        publisher = row["publisher"]
        issn = row["issn"]
        doi = row["doi"]
        # colons cannot be escaped in URL queries to the cubes server, so we have
        # to remove them here
        row["journal_full_title"] = row["journal_full_title"].replace(":", "")
        title = row["journal_full_title"]
        try:
            row["country"] = offsetting_institution_countries[institution]
        except KeyError:
            if institution not in institution_key_errors:
                institution_key_errors.append(institution)
        tables_insert_commands["offsetting"].execute(row)
        if row["euro"] != "NA":
            tables_insert_commands["combined"].execute(row)

        if publisher != "Springer Nature":
            continue

        journal_id = oc._get_springer_journal_id_from_doi(doi, issn)
        journal_id_title_map[journal_id] = title
        try:
            pub_year = article_pubyears[journal_id][doi]
        except KeyError:
            msg = (u"Publication year entry not found in article cache for {}. " +
                   "You might have to update the article cache with 'python " +
                   "assets_generator.py coverage_stats'. Using the 'period' " +
                   "column for now.")
            print(colorise(msg.format(doi), "yellow"))
            pub_year = row["period"]

        if journal_id not in summarised_offsetting:
            summarised_offsetting[journal_id] = {}
        if pub_year not in summarised_offsetting[journal_id]:
            summarised_offsetting[journal_id][pub_year] = 1
        else:
            summarised_offsetting[journal_id][pub_year] += 1
    if institution_key_errors:
        print("KeyError: The following institutions were not found in the " +
              "institutions_offsetting file:")
        for institution in institution_key_errors:
            print(institution)
        sys.exit()
    for journal_id, info in journal_coverage.items():
        for year, stats in info["years"].items():
            row = {
                "publisher": "Springer Nature",
                "journal_full_title": info["title"],
                "period": year,
                "is_hybrid": "TRUE",
                "num_journal_total_articles": stats["num_journal_total_articles"],
                "num_journal_oa_articles": stats["num_journal_oa_articles"]
            }
            try:
                row["num_offsetting_articles"] = summarised_offsetting[journal_id][year]
            except KeyError:
                row["num_offsetting_articles"] = 0
            tables_insert_commands["offsetting_coverage"].execute(row)

    institution_countries = {}

    reader = csv.DictReader(open("static/institutions.csv", "r"))
    for row in reader:
        cubes_name = row["institution_cubes_name"]
        institution_name = row["institution"]
        country = row["country"]
        institution_countries[institution_name] = country
        if institution_name not in tables_insert_commands:
            table = sqlalchemy.Table(cubes_name, metadata, autoload=False, schema=schema)
            if table.exists():
                table.drop(checkfirst=False)
            init_table(table, apc_fields)
            insert_command = table.insert()
            tables_insert_commands[institution_name] = insert_command

    reader = csv.DictReader(open(apc_file_name, "r"))
    for row in reader:
        institution = row["institution"]
        # colons cannot be escaped in URL queries to the cubes server, so we have
        # to remove them here
        row["journal_full_title"] = row["journal_full_title"].replace(":", "")
        row["country"] = institution_countries[institution]
        tables_insert_commands[institution].execute(row)
        tables_insert_commands["openapc"].execute(row)
        tables_insert_commands["combined"].execute(row)
Example No. 11
def update_coverage_stats(transformative_agreements_file, max_lookups, refetch=True):
    global COVERAGE_CACHE, JOURNAL_ID_CACHE, PERSISTENT_PUBDATES_CACHE, LOOKUPS_PERFORMED
    LOOKUPS_PERFORMED = 0
    if os.path.isfile(COVERAGE_CACHE_FILE):
        with open(COVERAGE_CACHE_FILE, "r") as f:
            try:
                COVERAGE_CACHE = json.loads(f.read())
                print("coverage cache file sucessfully loaded.")
            except ValueError:
                print("Could not decode a cache structure from " + COVERAGE_CACHE_FILE + ", starting with an empty coverage cache.")
    else:
        print("No cache file (" + COVERAGE_CACHE_FILE + ") found, starting with an empty coverage cache.")
    if os.path.isfile(PUBDATES_CACHE_FILE):
        with open(PUBDATES_CACHE_FILE, "r") as f:
            try:
                PERSISTENT_PUBDATES_CACHE = json.loads(f.read())
                print("Pub dates cache file sucessfully loaded.")
            except ValueError:
                print("Could not decode a cache structure from " + PUBDATES_CACHE_FILE + ", starting with an empty pub date cache.")
    else:
        print("No cache file (" + PUBDATES_CACHE_FILE + ") found, starting with an empty pub date cache.")
        
    if not os.path.isdir(JOURNAL_CSV_DIR):
        raise IOError("Journal CSV directory " + JOURNAL_CSV_DIR + " not found!")

    _process_springer_catalogue(max_lookups)

    reader = csv.DictReader(open(transformative_agreements_file, "r"))
    for line in reader:
        if max_lookups is not None and LOOKUPS_PERFORMED >= max_lookups:
            print("maximum number of lookups performed.")
            _shutdown()
        lookup_performed = False
        found = True
        publisher = line["publisher"]
        if publisher != "Springer Nature":
            continue
        issn = line["issn"]
        period = line["period"]
        title = line["journal_full_title"]
        doi = line["doi"]
        journal_id = _get_springer_journal_id_from_doi(doi, issn)
        # Retrieve publication dates for articles from CSV summaries on SpringerLink.
        # Employ a multi-level cache structure to minimize IO:
        #  1. try to look up the DOI in the persistent publication dates cache
        #  2. if the journal is not present, repopulate the local cache segment from a CSV file in the journal CSV dir
        #  3a. if no CSV for the journal could be found, fetch it from SpringerLink
        #  3b. alternative to 3a: if a CSV was found but it does not contain the DOI, re-fetch it from SpringerLink
        try:
            _ = PERSISTENT_PUBDATES_CACHE[journal_id][doi]
            print("Journal {} ('{}'): DOI {} already cached.".format(journal_id, title, doi))
        except KeyError:
            if journal_id not in TEMP_JOURNAL_CACHE:
                msg = "Journal {} ('{}'): Not found in temp cache, repopulating..."
                print(msg.format(journal_id, title))
                TEMP_JOURNAL_CACHE[journal_id] = _get_journal_cache_from_csv(journal_id, refetch=False)
            if doi not in TEMP_JOURNAL_CACHE[journal_id]:
                if refetch:
                    msg = u"Journal {} ('{}'): DOI {} not found in cache, re-fetching csv file..."
                    print(msg.format(journal_id, title, doi))
                    TEMP_JOURNAL_CACHE[journal_id] = _get_journal_cache_from_csv(journal_id, refetch=True)
                if doi not in TEMP_JOURNAL_CACHE[journal_id]:
                    msg = u"Journal {} ('{}'): DOI {} NOT FOUND in SpringerLink data!"
                    msg = colorise(msg.format(journal_id, title, doi), "red")
                    print(msg)
                    ERROR_MSGS.append(msg)
                    found = False
            lookup_performed = True
            if journal_id not in PERSISTENT_PUBDATES_CACHE:
                PERSISTENT_PUBDATES_CACHE[journal_id] = {}
            if found:
                PERSISTENT_PUBDATES_CACHE[journal_id][doi] = TEMP_JOURNAL_CACHE[journal_id][doi]
                pub_year = PERSISTENT_PUBDATES_CACHE[journal_id][doi]
                compare_msg = u"DOI {} found in Springer data, Pub year is {} ".format(doi, pub_year)
                if pub_year == period:
                    compare_msg += colorise("(same as transformative_agreements period)", "green")
                else:
                    compare_msg += colorise("(DIFFERENT from transformative_agreements period, which is {})".format(period), "yellow")
                msg = u"Journal {} ('{}'): ".format(journal_id, title)
                print(msg.ljust(80) + compare_msg)
        if found:
            pub_year = PERSISTENT_PUBDATES_CACHE[journal_id][doi]
        else:
            # If a lookup error occurred we will retrieve coverage stats for the period year instead, since
            # the aggregation process will make use of this value.
            pub_year = period
        # Test if journal stats are present
        try:
            _ = COVERAGE_CACHE[journal_id]['years'][pub_year]["num_journal_total_articles"]
            _ = COVERAGE_CACHE[journal_id]['years'][pub_year]["num_journal_oa_articles"]
        except KeyError:
            try:
                _update_journal_stats(title, journal_id, pub_year)
                lookup_performed = True
                error_msg = ('No stats found for journal "{}" ({}) in {} despite having ' +
                             'downloaded the full Open Choice catalogue. Stats were ' +
                             'obtained retroactively.')
                error_msg = colorise(error_msg.format(title, journal_id, pub_year), "red")
                print(error_msg)
                ERROR_MSGS.append(error_msg)
            except ValueError as ve:
                error_msg = ('Critical Error while processing DOI {}: No stats found ' +
                             'for journal "{}" ({}) in {} despite having downloaded the ' +
                             'full Open Choice catalogue and stats could not be obtained ' +
                             'retroactively (ValueError: {}).')
                error_msg = colorise(error_msg.format(doi, title, journal_id, pub_year, str(ve)), "red")
                print(error_msg)
                ERROR_MSGS.append(error_msg)
                _shutdown()
        if lookup_performed:
            LOOKUPS_PERFORMED += 1
    _shutdown()
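# Note: _get_springer_journal_id_from_doi is defined elsewhere. Springer DOIs usually embed
# the journal id in their suffix (the "s<id>-" part of "10.1007/s<id>-..."), so a plausible
# sketch -- an assumption, not the project's actual logic -- extracts it with a regex and
# falls back to a cached ISSN lookup:
import re

JOURNAL_ID_CACHE = {}  # assumed here to be a persisted ISSN -> journal id mapping
SPRINGER_DOI_RE = re.compile(r"10\.\d{4,9}/s(?P<journal_id>\d+)-")

def _get_springer_journal_id_from_doi(doi, issn):
    match = SPRINGER_DOI_RE.match(doi)
    if match:
        return match.groupdict()["journal_id"]
    # Otherwise fall back to a previously resolved mapping keyed by ISSN.
    return JOURNAL_ID_CACHE[issn]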
Example No. 12
def create_cubes_tables(connectable,
                        apc_file_name,
                        transformative_agreements_file_name,
                        schema="openapc_schema"):

    apc_fields = [("institution", "string"), ("period", "string"),
                  ("euro", "float"), ("doi", "string"),
                  ("is_hybrid", "string"), ("publisher", "string"),
                  ("journal_full_title", "string"), ("issn", "string"),
                  ("issn_print", "string"), ("issn_electronic", "string"),
                  ("issn_l", "string"), ("license_ref", "string"),
                  ("indexed_in_crossref", "string"), ("pmid", "string"),
                  ("pmcid", "string"), ("ut", "string"), ("url", "string"),
                  ("doaj", "string"), ("country", "string")]

    deal_fields = apc_fields + [("opt_out", "string")]

    transformative_agreements_fields = [("institution", "string"),
                                        ("period", "string"),
                                        ("doi", "string"),
                                        ("is_hybrid", "string"),
                                        ("publisher", "string"),
                                        ("journal_full_title", "string"),
                                        ("issn", "string"),
                                        ("issn_print", "string"),
                                        ("issn_electronic", "string"),
                                        ("issn_l", "string"),
                                        ("license_ref", "string"),
                                        ("indexed_in_crossref", "string"),
                                        ("pmid", "string"),
                                        ("pmcid", "string"), ("ut", "string"),
                                        ("url", "string"), ("doaj", "string"),
                                        ("country", "string"),
                                        ("agreement", "string")]

    bpc_fields = [("institution", "string"), ("period", "string"),
                  ("euro", "float"), ("doi", "string"),
                  ("backlist_oa", "string"), ("publisher", "string"),
                  ("book_title", "string"), ("isbn", "string"),
                  ("isbn_print", "string"), ("isbn_electronic", "string"),
                  ("license_ref", "string"), ("indexed_in_crossref", "string"),
                  ("doab", "string"), ("country", "string")]

    springer_compact_coverage_fields = [
        ("period", "string"), ("publisher", "string"),
        ("journal_full_title", "string"), ("is_hybrid", "string"),
        ("num_springer_compact_articles", "float"),
        ("num_journal_total_articles", "float"),
        ("num_journal_oa_articles", "float")
    ]

    metadata = sqlalchemy.MetaData(bind=connectable)

    openapc_table = sqlalchemy.Table("openapc",
                                     metadata,
                                     autoload=False,
                                     schema=schema)
    if openapc_table.exists():
        openapc_table.drop(checkfirst=False)
    init_table(openapc_table, apc_fields)
    openapc_insert_command = openapc_table.insert()

    transformative_agreements_table = sqlalchemy.Table(
        "transformative_agreements", metadata, autoload=False, schema=schema)
    if transformative_agreements_table.exists():
        transformative_agreements_table.drop(checkfirst=False)
    init_table(transformative_agreements_table,
               transformative_agreements_fields)
    transformative_agreements_insert_command = transformative_agreements_table.insert()

    bpc_table = sqlalchemy.Table("bpc",
                                 metadata,
                                 autoload=False,
                                 schema=schema)
    if bpc_table.exists():
        bpc_table.drop(checkfirst=False)
    init_table(bpc_table, bpc_fields)
    bpc_insert_command = bpc_table.insert()

    combined_table = sqlalchemy.Table("combined",
                                      metadata,
                                      autoload=False,
                                      schema=schema)
    if combined_table.exists():
        combined_table.drop(checkfirst=False)
    init_table(combined_table, apc_fields)
    combined_insert_command = combined_table.insert()

    springer_compact_coverage_table = sqlalchemy.Table(
        "springer_compact_coverage", metadata, autoload=False, schema=schema)
    if springer_compact_coverage_table.exists():
        springer_compact_coverage_table.drop(checkfirst=False)
    init_table(springer_compact_coverage_table,
               springer_compact_coverage_fields)
    springer_compact_coverage_insert_command = springer_compact_coverage_table.insert()

    deal_table = sqlalchemy.Table("deal",
                                  metadata,
                                  autoload=False,
                                  schema=schema)
    if deal_table.exists():
        deal_table.drop(checkfirst=False)
    init_table(deal_table, deal_fields)
    deal_insert_command = deal_table.insert()

    # a dict to store individual insert commands for every table
    tables_insert_commands = {
        "openapc": openapc_insert_command,
        "transformative_agreements": transformative_agreements_insert_command,
        "bpc": bpc_insert_command,
        "combined": combined_insert_command,
        "springer_compact_coverage": springer_compact_coverage_insert_command,
        "deal": deal_insert_command
    }

    bpcs_institution_countries = {}

    reader = csv.DictReader(open(INSTITUTIONS_BPC_FILE, "r"))
    for row in reader:
        institution_name = row["institution"]
        country = row["country"]
        bpcs_institution_countries[institution_name] = country

    reader = csv.DictReader(open(BPC_FILE, "r"))
    for row in reader:
        row["book_title"] = row["book_title"].replace(":", "")
        institution = row["institution"]
        row["country"] = bpcs_institution_countries[institution]
        tables_insert_commands["bpc"].execute(row)

    transformative_agreements_institution_countries = {}

    reader = csv.DictReader(
        open(INSTITUTIONS_TRANSFORMATIVE_AGREEMENTS_FILE, "r"))
    for row in reader:
        institution_name = row["institution"]
        country = row["country"]
        transformative_agreements_institution_countries[institution_name] = country

    journal_coverage = None
    article_pubyears = None
    try:
        cache_file = open(scc.COVERAGE_CACHE_FILE, "r")
        journal_coverage = json.loads(cache_file.read())
        cache_file.close()
        cache_file = open(scc.PUBDATES_CACHE_FILE, "r")
        article_pubyears = json.loads(cache_file.read())
        cache_file.close()
    except IOError as ioe:
        msg = "Error while trying to access cache file: {}"
        print(msg.format(ioe))
        sys.exit()
    except ValueError as ve:
        msg = "Error while trying to decode cache structure in: {}"
        print(msg.format(str(ve)))
        sys.exit()

    summarised_transformative_agreements = {}

    journal_id_title_map = {}

    institution_key_errors = []

    reader = csv.DictReader(open(DEAL_WILEY_OPT_OUT_FILE, "r"))
    for row in reader:
        # work on a deep copy since we make some DEAL-specific changes
        row_copy = deepcopy(row)
        row_copy["opt_out"] = "TRUE"
        if row_copy["publisher"] in DEAL_IMPRINTS["Wiley-Blackwell"]:
            row_copy["publisher"] = "Wiley-Blackwell"
        institution = row_copy["institution"]
        try:
            row_copy["country"] = transformative_agreements_institution_countries[institution]
        except KeyError:
            if institution not in institution_key_errors:
                institution_key_errors.append(institution)
        if row_copy["period"] == "2019":
            # Special rule: Halve 2019 costs since DEAL only started in 07/19
            halved = round(float(row_copy["euro"]) / 2, 2)
            row_copy["euro"] = str(halved)
        tables_insert_commands["deal"].execute(row_copy)

    reader = csv.DictReader(open(DEAL_SPRINGER_OPT_OUT_FILE, "r"))
    for row in reader:
        # work on a deep copy since we make some DEAL-specific changes
        row_copy = deepcopy(row)
        row_copy["opt_out"] = "TRUE"
        if row_copy["publisher"] in DEAL_IMPRINTS["Springer Nature"]:
            row_copy["publisher"] = "Springer Nature"
        institution = row_copy["institution"]
        try:
            row_copy["country"] = transformative_agreements_institution_countries[institution]
        except KeyError:
            if institution not in institution_key_errors:
                institution_key_errors.append(institution)
        tables_insert_commands["deal"].execute(row_copy)

    reader = csv.DictReader(open(transformative_agreements_file_name, "r"))

    for row in reader:
        institution = row["institution"]
        publisher = row["publisher"]
        issn = row["issn"]
        doi = row["doi"]
        # colons cannot be escaped in URL queries to the cubes server, so we have
        # to remove them here
        row["journal_full_title"] = row["journal_full_title"].replace(":", "")
        title = row["journal_full_title"]
        try:
            row["country"] = transformative_agreements_institution_countries[
                institution]
        except KeyError:
            if institution not in institution_key_errors:
                institution_key_errors.append(institution)
        tables_insert_commands["transformative_agreements"].execute(row)
        if row["euro"] != "NA":
            tables_insert_commands["combined"].execute(row)
        if row["agreement"] == "DEAL Wiley Germany":
            # DEAL Wiley
            row_copy = deepcopy(row)
            row_copy["opt_out"] = "FALSE"
            if row_copy["period"] == "2019":
                # Special rule: Halve 2019 costs since DEAL only started in 07/19
                halved = round(float(row["euro"]) / 2, 2)
                row_copy["euro"] = str(halved)
            if row_copy["publisher"] in DEAL_IMPRINTS["Wiley-Blackwell"]:
                row_copy["publisher"] = "Wiley-Blackwell"
            tables_insert_commands["deal"].execute(row_copy)

        if row["agreement"] == "DEAL Springer Nature Germany":
            row_copy = deepcopy(row)
            # DEAL SN
            row_copy["opt_out"] = "FALSE"
            if row_copy["publisher"] in DEAL_IMPRINTS["Springer Nature"]:
                row_copy["publisher"] = "Springer Nature"
            tables_insert_commands["deal"].execute(row_copy)

        if publisher != "Springer Nature":
            continue

        journal_id = scc._get_springer_journal_id_from_doi(doi, issn)
        journal_id_title_map[journal_id] = title
        try:
            pub_year = article_pubyears[journal_id][doi]
        except KeyError:
            msg = (
                u"Publication year entry not found in article cache for {}. " +
                "You might have to update the article cache with 'python " +
                "assets_generator.py coverage_stats'. Using the 'period' " +
                "column for now.")
            print(colorise(msg.format(doi), "yellow"))
            pub_year = row["period"]

        if journal_id not in summarised_transformative_agreements:
            summarised_transformative_agreements[journal_id] = {}
        if pub_year not in summarised_transformative_agreements[journal_id]:
            summarised_transformative_agreements[journal_id][pub_year] = 1
        else:
            summarised_transformative_agreements[journal_id][pub_year] += 1
    if institution_key_errors:
        print("KeyError: The following institutions were not found in the " +
              "institutions_transformative_agreements file:")
        for institution in institution_key_errors:
            print(institution)
        sys.exit()
    for journal_id, info in journal_coverage.items():
        for year, stats in info["years"].items():
            row = {
                "publisher": "Springer Nature",
                "journal_full_title": info["title"],
                "period": year,
                "is_hybrid": "TRUE",
                "num_journal_total_articles":
                stats["num_journal_total_articles"],
                "num_journal_oa_articles": stats["num_journal_oa_articles"]
            }
            try:
                row["num_springer_compact_articles"] = summarised_transformative_agreements[
                    journal_id][year]
            except KeyError:
                row["num_springer_compact_articles"] = 0
            tables_insert_commands["springer_compact_coverage"].execute(row)

    institution_countries = {}

    reader = csv.DictReader(open(INSTITUTIONS_FILE, "r"))
    for row in reader:
        cubes_name = row["institution_cubes_name"]
        institution_name = row["institution"]
        country = row["country"]
        institution_countries[institution_name] = country
        if institution_name not in tables_insert_commands:
            table = sqlalchemy.Table(cubes_name,
                                     metadata,
                                     autoload=False,
                                     schema=schema)
            if table.exists():
                table.drop(checkfirst=False)
            init_table(table, apc_fields)
            insert_command = table.insert()
            tables_insert_commands[institution_name] = insert_command

    reader = csv.DictReader(open(apc_file_name, "r"))
    for row in reader:
        institution = row["institution"]
        # colons cannot be escaped in URL queries to the cubes server, so we have
        # to remove them here
        row["journal_full_title"] = row["journal_full_title"].replace(":", "")
        row["country"] = institution_countries[institution]
        tables_insert_commands[institution].execute(row)
        tables_insert_commands["openapc"].execute(row)
        tables_insert_commands["combined"].execute(row)
        # DEAL Wiley
        if row["publisher"] in DEAL_IMPRINTS["Wiley-Blackwell"] and row[
                "country"] == "DEU" and row["is_hybrid"] == "FALSE":
            if row["period"] in ["2019", "2020", "2021", "2022"]:
                row["publisher"] = "Wiley-Blackwell"  # Imprint normalization
                tables_insert_commands["deal"].execute(row)
        # DEAL Springer
        if row["publisher"] in DEAL_IMPRINTS["Springer Nature"] and row[
                "country"] == "DEU" and row["is_hybrid"] == "FALSE":
            if row["period"] in ["2020", "2021", "2022"]:
                row["publisher"] = "Springer Nature"
                tables_insert_commands["deal"].execute(row)
Example No. 13
def update_coverage_stats(offsetting_file, max_lookups, refetch=True):
    global COVERAGE_CACHE, JOURNAL_ID_CACHE, PERSISTENT_PUBDATES_CACHE, LOOKUPS_PERFORMED
    LOOKUPS_PERFORMED = 0
    if os.path.isfile(COVERAGE_CACHE_FILE):
        with open(COVERAGE_CACHE_FILE, "r") as f:
            try:
                COVERAGE_CACHE = json.loads(f.read())
                print("coverage cache file sucessfully loaded.")
            except ValueError:
                print("Could not decode a cache structure from " + COVERAGE_CACHE_FILE + ", starting with an empty coverage cache.")
    else:
        print("No cache file (" + COVERAGE_CACHE_FILE + ") found, starting with an empty coverage cache.")
    if os.path.isfile(PUBDATES_CACHE_FILE):
        with open(PUBDATES_CACHE_FILE, "r") as f:
            try:
                PERSISTENT_PUBDATES_CACHE = json.loads(f.read())
                print("Pub dates cache file sucessfully loaded.")
            except ValueError:
                print("Could not decode a cache structure from " + PUBDATES_CACHE_FILE + ", starting with an empty pub date cache.")
    else:
        print("No cache file (" + PUBDATES_CACHE_FILE + ") found, starting with an empty pub date cache.")
        
    if not os.path.isdir(JOURNAL_CSV_DIR):
        raise IOError("Journal CSV directory " + JOURNAL_CSV_DIR + " not found!")

    _process_springer_catalogue(max_lookups)

    reader = csv.DictReader(open(offsetting_file, "r"))
    for line in reader:
        if max_lookups is not None and LOOKUPS_PERFORMED >= max_lookups:
            print("maximum number of lookups performed.")
            _shutdown()
        lookup_performed = False
        found = True
        publisher = line["publisher"]
        if publisher != "Springer Nature":
            continue
        issn = line["issn"]
        period = line["period"]
        title = line["journal_full_title"]
        doi = line["doi"]
        journal_id = _get_springer_journal_id_from_doi(doi, issn)
        # Retrieve publication dates for articles from CSV summaries on SpringerLink.
        # Employ a multi-level cache structure to minimize IO:
        #  1. try to look up the DOI in the persistent publication dates cache
        #  2. if the journal is not present, repopulate the local cache segment from a CSV file in the journal CSV dir
        #  3a. if no CSV for the journal could be found, fetch it from SpringerLink
        #  3b. alternative to 3a: if a CSV was found but it does not contain the DOI, re-fetch it from SpringerLink
        try:
            _ = PERSISTENT_PUBDATES_CACHE[journal_id][doi]
            print("Journal {} ('{}'): DOI {} already cached.".format(journal_id, title, doi))
        except KeyError:
            if journal_id not in TEMP_JOURNAL_CACHE:
                msg = "Journal {} ('{}'): Not found in temp cache, repopulating..."
                print(msg.format(journal_id, title))
                TEMP_JOURNAL_CACHE[journal_id] = _get_journal_cache_from_csv(journal_id, refetch=False)
            if doi not in TEMP_JOURNAL_CACHE[journal_id]:
                if refetch:
                    msg = u"Journal {} ('{}'): DOI {} not found in cache, re-fetching csv file..."
                    print(msg.format(journal_id, title, doi))
                    TEMP_JOURNAL_CACHE[journal_id] = _get_journal_cache_from_csv(journal_id, refetch=True)
                if doi not in TEMP_JOURNAL_CACHE[journal_id]:
                    msg = u"Journal {} ('{}'): DOI {} NOT FOUND in SpringerLink data!"
                    msg = colorise(msg.format(journal_id, title, doi), "red")
                    print(msg)
                    ERROR_MSGS.append(msg)
                    found = False
            lookup_performed = True
            if journal_id not in PERSISTENT_PUBDATES_CACHE:
                PERSISTENT_PUBDATES_CACHE[journal_id] = {}
            if found:
                PERSISTENT_PUBDATES_CACHE[journal_id][doi] = TEMP_JOURNAL_CACHE[journal_id][doi]
                pub_year = PERSISTENT_PUBDATES_CACHE[journal_id][doi]
                compare_msg = u"DOI {} found in Springer data, Pub year is {} ".format(doi, pub_year)
                if pub_year == period:
                    compare_msg += colorise("(same as offsetting period)", "green")
                else:
                    compare_msg += colorise("(DIFFERENT from offsetting period, which is {})".format(period), "yellow")
                msg = u"Journal {} ('{}'): ".format(journal_id, title)
                print(msg.ljust(80) + compare_msg)
        if found:
            pub_year = PERSISTENT_PUBDATES_CACHE[journal_id][doi]
        else:
            # If a lookup error occurred we will retrieve coverage stats for the period year instead, since
            # the aggregation process will make use of this value.
            pub_year = period
        # Test if journal stats are present
        try:
            _ = COVERAGE_CACHE[journal_id]['years'][pub_year]["num_journal_total_articles"]
            _ = COVERAGE_CACHE[journal_id]['years'][pub_year]["num_journal_oa_articles"]
        except KeyError:
            try:
                _update_journal_stats(title, journal_id, pub_year)
                lookup_performed = True
                error_msg = ('No stats found for journal "{}" ({}) in {} despite having ' +
                             'downloaded the full Open Choice catalogue. Stats were ' +
                             'obtained retroactively.')
                error_msg = colorise(error_msg.format(title, journal_id, pub_year), "red")
                print(error_msg)
                ERROR_MSGS.append(error_msg)
            except ValueError as ve:
                error_msg = ('Critical Error while processing DOI {}: No stats found ' +
                             'for journal "{}" ({}) in {} despite having downloaded the ' +
                             'full Open Choice catalogue and stats could not be obtained ' +
                             'retroactively (ValueError: {}).')
                error_msg = colorise(error_msg.format(doi, title, journal_id, pub_year, str(ve)), "red")
                print(error_msg)
                ERROR_MSGS.append(error_msg)
                _shutdown()
        if lookup_performed:
            LOOKUPS_PERFORMED += 1
    _shutdown()