def main(): parser = argparse.ArgumentParser() parser.add_argument("-s", "--selective_harvesting", action="store_true", help=ARG_HELP_STRINGS["selective_harvesting"]) args = parser.parse_args() with open("harvest_list.csv", "r") as harvest_list: reader = oat.UnicodeDictReader(harvest_list, encoding="utf-8") for line in reader: basic_url = line["basic_url"] if line["active"] == "TRUE": oat.print_g("Starting harvest from source " + basic_url) oai_set = line["oai_set"] if len(line["oai_set"]) > 0 else None prefix = line["metadata_prefix"] if len( line["metadata_prefix"]) > 0 else None processing = line["processing"] if len( line["processing"]) > 0 else None oat.oai_harvest(basic_url, prefix, oai_set, processing, args.selective_harvesting) now = datetime.datetime.now() date_string = now.strftime("%Y_%m_%d") file_name = "oai_harvest_" + date_string + ".csv" target = os.path.join("..", line["directory"], file_name) os.rename("out.csv", target) else: oat.print_y("Skipping inactive source " + basic_url)
def main(): parser = argparse.ArgumentParser() parser.add_argument("-s", "--selective_harvesting", action="store_true", help=ARG_HELP_STRINGS["selective_harvesting"]) args = parser.parse_args() with open("harvest_list.csv", "r") as harvest_list: reader = oat.UnicodeDictReader(harvest_list, encoding="utf-8") for line in reader: basic_url = line["basic_url"] if line["active"] == "TRUE": oat.print_g("Starting harvest from source " + basic_url) processing = None if len(line["processing"]) > 0: processing = line["processing"] oat.oai_harvest(basic_url, line["metadata_prefix"], line["oai_set"], processing, args.selective_harvesting) now = datetime.datetime.now() date_string = now.strftime("%Y_%m_%d") file_name = "oai_harvest_" + date_string + ".csv" target = os.path.join("..", line["directory"], file_name) os.rename("out.csv", target) else: oat.print_y("Skipping inactive source " + basic_url)
def get_hybrid_status(line, username):
    for issn in [7, 8, 9, 10]:
        if not oat.has_value(line[issn]):
            continue
        msg = 'Looking up ISSN {}...'
        oat.print_y(msg.format(line[issn]))
        jtoc_metadata = get_jtoc_metadata(line[issn], username)
        sleep(1)
        if jtoc_metadata["jtoc_id"] is not None:
            msg = ('Entry found (publisher: {}, title: {}, jtoc_ID: {}, ' +
                   'obtaining hybrid status...)')
            oat.print_g(msg.format(jtoc_metadata["jtoc_publisher"],
                                   jtoc_metadata["jtoc_title"],
                                   jtoc_metadata["jtoc_id"]))
            journal_type = get_jtoc_journal_type(jtoc_metadata["jtoc_id"])
            if not journal_type:
                oat.print_r("Error while obtaining hybrid status!")
                continue
            sleep(1)
            msg = "journaltocs type is '{}', mapped to is_hybrid = {}"
            oat.print_g(msg.format(journal_type[0], journal_type[1]))
            return journal_type[1]
    oat.print_r("None of the ISSN values found in journaltocs!")
    return None
def main(): with open("harvest_list.csv", "r") as harvest_list: reader = DictReader(harvest_list) for line in reader: basic_url = line["basic_url"] if line["active"] == "TRUE": oat.print_g("Starting harvest from source " + basic_url) oai_set = line["oai_set"] if len(line["oai_set"]) > 0 else None prefix = line["metadata_prefix"] if len(line["metadata_prefix"]) > 0 else None processing = line["processing"] if len(line["processing"]) > 0 else None directory = os.path.join("..", line["directory"]) articles = oat.oai_harvest(basic_url, prefix, oai_set, processing) harvest_file_path = os.path.join(directory, "all_harvested_articles.csv") enriched_file_path = os.path.join(directory, "all_harvested_articles_enriched.csv") new_article_dicts, header = integrate_changes(articles, harvest_file_path, False) integrate_changes(articles, enriched_file_path, True) if header is None: # if no header was returned, an "all_harvested" file doesn't exist yet header = oat.OAI_COLLECTION_CONTENT.values() new_articles = [header] for article_dict in new_article_dicts: new_articles.append([article_dict[key] for key in header]) now = datetime.datetime.now() date_string = now.strftime("%Y_%m_%d") file_name = "new_articles_" + date_string + ".csv" target = os.path.join(directory, file_name) with open(target, "w") as t: writer = oat.OpenAPCUnicodeWriter(t, openapc_quote_rules=True, has_header=True) writer.write_rows(new_articles) else: oat.print_y("Skipping inactive source " + basic_url)
def main(): with open("harvest_list.csv", "r") as harvest_list: reader = DictReader(harvest_list) for line in reader: basic_url = line["basic_url"] if line["active"] == "TRUE": oat.print_g("Starting harvest from source " + basic_url) oai_set = line["oai_set"] if len(line["oai_set"]) > 0 else None prefix = line["metadata_prefix"] if len(line["metadata_prefix"]) > 0 else None processing = line["processing"] if len(line["processing"]) > 0 else None directory = os.path.join("..", line["directory"]) articles = oat.oai_harvest(basic_url, prefix, oai_set, processing) harvest_file_path = os.path.join(directory, "all_harvested_articles.csv") enriched_file_path = os.path.join(directory, "all_harvested_articles_enriched.csv") new_article_dicts, header = integrate_changes(articles, harvest_file_path, False) integrate_changes(articles, enriched_file_path, True) deal_wiley_path = os.path.join(directory, "all_harvested_articles_enriched_deal_wiley.csv") if os.path.isfile(deal_wiley_path): integrate_changes(articles, deal_wiley_path, True) if header is None: # if no header was returned, an "all_harvested" file doesn't exist yet header = list(oat.OAI_COLLECTION_CONTENT.keys()) new_articles = [header] for article_dict in new_article_dicts: new_articles.append([article_dict[key] for key in header]) now = datetime.datetime.now() date_string = now.strftime("%Y_%m_%d") file_name = "new_articles_" + date_string + ".csv" target = os.path.join(directory, file_name) with open(target, "w") as t: writer = oat.OpenAPCUnicodeWriter(t, openapc_quote_rules=True, has_header=True) writer.write_rows(new_articles) else: oat.print_y("Skipping inactive source " + basic_url)
def main(): parser = argparse.ArgumentParser() parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"]) parser.add_argument("count_column", type=int, help=ARG_HELP_STRINGS["count_column"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-s", "--sort", action="store_true", help=ARG_HELP_STRINGS["sort"]) args = parser.parse_args() enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) print("Encoding '{}' found in Python's codec collection " + "as '{}'").format(args.encoding, codec.name) enc = args.encoding except LookupError: oat.print_r( "Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() header, content = oat.get_csv_file_content(args.source_file, enc) column_name = "column " + str(args.count_column) if header: header_line = header[0] column_name = header_line[args.count_column] oat.print_g("Performing occurence count in column '" + column_name + "'") occurence_dict = OrderedDict() for line in content: try: value = line[args.count_column] except IndexError as ie: oat.print_y("IndexError ({}) at line {}, skipping...".format( ie.message, line)) continue if value not in occurence_dict: occurence_dict[value] = 1 else: occurence_dict[value] += 1 if args.sort: occurence_dict = OrderedDict( sorted(occurence_dict.items(), key=lambda x: x[1], reverse=True)) for item in occurence_dict.items(): print item[0] + ": " + str(item[1])
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() quote_rules = args.openapc_quote_rules enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) print ("Encoding '{}' found in Python's codec collection " + "as '{}'").format(args.encoding, codec.name) enc = args.encoding except LookupError: print ("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print ("Error: A quotemask may only contain the letters 't' and" + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] header, content = oat.get_csv_file_content(args.csv_file, enc) line_num = 1 for line in content: publisher = line[5] journal = line[6] journal_new = oat.get_unified_journal_title(journal) publisher_new = oat.get_unified_publisher_name(publisher) if publisher_new != publisher: line[5] = publisher_new msg = u"Line {}: Updated publisher name ({} -> {})" oat.print_g(msg.format(line_num, publisher, publisher_new)) if journal_new != journal: line[6] = journal_new msg = u"Line {}: Updated journal_full_title ({} -> {})" oat.print_g(msg.format(line_num, journal, journal_new)) line_num += 1 with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True) writer.write_rows(header + content)
def main(): parser = argparse.ArgumentParser() parser.add_argument("-i", "--integrate", help=ARG_HELP_STRINGS["integrate"], action="store_true") parser.add_argument("-o", "--output", help=ARG_HELP_STRINGS["output"], action="store_true") args = parser.parse_args() with open("harvest_list.csv", "r") as harvest_list: reader = DictReader(harvest_list) for line in reader: basic_url = line["basic_url"] if line["active"] == "TRUE": oat.print_g("Starting harvest from source " + basic_url) oai_set = line["oai_set"] if len(line["oai_set"]) > 0 else None prefix = line["metadata_prefix"] if len( line["metadata_prefix"]) > 0 else None processing = line["processing"] if len( line["processing"]) > 0 else None directory = os.path.join("..", line["directory"]) out_file_suffix = os.path.basename( line["directory"]) if args.output else None articles = oat.oai_harvest(basic_url, prefix, oai_set, processing, out_file_suffix) harvest_file_path = os.path.join(directory, "all_harvested_articles.csv") enriched_file_path = os.path.join( directory, "all_harvested_articles_enriched.csv") new_article_dicts, header = integrate_changes( articles, harvest_file_path, False, not args.integrate) integrate_changes(articles, enriched_file_path, True, not args.integrate) if header is None: # if no header was returned, an "all_harvested" file doesn't exist yet header = list(oat.OAI_COLLECTION_CONTENT.keys()) new_articles = [header] for article_dict in new_article_dicts: new_articles.append([article_dict[key] for key in header]) now = datetime.datetime.now() date_string = now.strftime("%Y_%m_%d") file_name = "new_articles_" + date_string + ".csv" target = os.path.join(directory, file_name) with open(target, "w") as t: writer = oat.OpenAPCUnicodeWriter(t, openapc_quote_rules=True, has_header=True) writer.write_rows(new_articles) else: oat.print_y("Skipping inactive source " + basic_url)
def main(): parser = argparse.ArgumentParser() parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"]) parser.add_argument("count_column", type=int, help=ARG_HELP_STRINGS["count_column"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-s", "--sort", action="store_true", help=ARG_HELP_STRINGS["sort"]) args = parser.parse_args() enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) msg = "Encoding '{}' found in Python's codec collection as '{}'" print(msg.format(args.encoding, codec.name)) enc = args.encoding except LookupError: oat.print_r("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() header, content = oat.get_csv_file_content(args.source_file, enc) column_name = "column " + str(args.count_column) if header: header_line = header[0] column_name = header_line[args.count_column] oat.print_g("Performing occurence count in column '" + column_name + "'") occurence_dict = OrderedDict() for line in content: try: value = line[args.count_column] except IndexError as ie: oat.print_y("IndexError ({}) at line {}, skipping...".format(ie.message, line)) continue if value not in occurence_dict: occurence_dict[value] = 1 else: occurence_dict[value] += 1 if args.sort: occurence_dict = OrderedDict(sorted(occurence_dict.items(), key=lambda x: x[1], reverse=True)) for item in occurence_dict.items(): print(item[0] + ": " + str(item[1]))
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-start", type=int, default=1, help=ARG_HELP_STRINGS["start"]) parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["start"]) args = parser.parse_args() handler = logging.StreamHandler(sys.stderr) handler.setFormatter(oat.ANSIColorFormatter()) bufferedHandler = oat.BufferedErrorHandler(handler) bufferedHandler.setFormatter(oat.ANSIColorFormatter()) logging.root.addHandler(handler) logging.root.addHandler(bufferedHandler) logging.root.setLevel(logging.INFO) enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) msg = ("Encoding '{}' found in Python's codec collection " + "as '{}'").format(args.encoding, codec.name) oat.print_g(msg) enc = args.encoding except LookupError: msg = ("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") oat.print_r(msg) sys.exit() head, content = oat.get_csv_file_content(args.csv_file, enc) content = head + content line_num = 0 for line in content: line_num += 1 if args.start and args.start > line_num: continue if args.end and args.end < line_num: continue # Check hybrid status if line[4] != "TRUE": continue institution = line[0] period = line[1] doi = line[3] publisher = line[5] journal = line[6] for lpl in lpl_list: if lpl.publisher_matches(publisher): init_msg = (u"Line {}: Checking {} article from {}, published in '" + "{}'...").format(line_num, institution, period, journal) oat.print_b(init_msg) page_content = get_landingpage_content(doi, lpl) if page_content is None: continue pdf_link = lpl.search_for_oa(page_content) if pdf_link is None: error_msg = (u"No PDF link found! (line {}, DOI: " + "http://doi.org/{}").format(line_num, doi) logging.error(error_msg) elif pdf_link == "": warning_msg = (u"A RegexGroup matched, but no PDF " + "link was found! (line {}, DOI: " + "http://doi.org/{}").format(line_num, doi) logging.warning(warning_msg) else: oat.print_g(u"PDF link found: " + pdf_link) time.sleep(1) if not bufferedHandler.buffer: oat.print_g("\nLookup finished, all articles were accessible on sciencedirect") else: oat.print_r("\nLookup finished, not all articles could be accessed on sciencedirect:\n") # closing will implicitly flush the handler and print any buffered # messages to stderr bufferedHandler.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument("new_file", help=ARG_HELP_STRINGS["new_file"]) parser.add_argument("target_file", help=ARG_HELP_STRINGS["new_file"]) parser.add_argument('cost_tolerance', type=float, help=ARG_HELP_STRINGS["cost_tolerance"]) parser.add_argument('enriched_files', nargs='+', help=ARG_HELP_STRINGS["enriched_files"]) parser.add_argument('-b', '--batch', type=int, help=ARG_HELP_STRINGS["batch"]) args = parser.parse_args() target_file_name = get_filename(args.target_file) new_file_name = get_filename(args.new_file) for path in args.enriched_files: if not os.path.isfile(path): oat.print_r('Error: "' + path + '" is no valid file path!') sys.exit() ENRICHED_FILES[path] = {"modified": False, "file_name": get_filename(path)} ENRICHED_FILES[path]["header"], ENRICHED_FILES[path]["content"] = oat.get_csv_file_content(path, enc="utf-8", force_header=True) target_header, target_content = oat.get_csv_file_content(args.target_file, enc="utf-8", force_header=True) new_header, new_content = oat.get_csv_file_content(args.new_file, enc="utf-8", force_header=True) ud_header, ud_content = oat.get_csv_file_content(UD_FILE, enc="utf-8", force_header=True) duplicates = [] target_dois = [line[3] for line in target_content] for new_index, line in enumerate(new_content): doi = line[3] if doi == "NA" or doi not in target_dois: continue else: target_index = get_duplicate_index(target_content, doi) duplicates.append((new_index, target_index)) count = 0 for pair in duplicates: new_line = new_content[pair[0]] target_line = target_content[pair[1]] doi = target_line[3] new_cost = float(new_line[2]) target_cost = float(target_line[2]) if new_cost >= target_cost: deviation = (new_cost - target_cost) / new_cost else: deviation = (target_cost - new_cost) / target_cost oat.print_b("Duplicate found:") print("In new file " + new_file_name + ":") print(",".join(new_line)) print("In target file " + target_file_name + ":") print(",".join(target_line)) if new_line[0] != target_line[0]: msg = 'Institutional mismatch "{}"/"{}". Lines will be deleted and added to the unresolved duplicates file.' oat.print_r(msg.format(new_line[0],target_line[0])) new_content[pair[0]] = list(EMPTY_LINE) target_content[pair[1]] = REPLACEMENT ud_content += [target_line] ud_content += [new_line] path, index = find_in_enriched_files(doi) ENRICHED_FILES[path]["content"][index] = list(EMPTY_LINE) ENRICHED_FILES[path]["modified"] = True elif deviation <= args.cost_tolerance: msg = "Cost deviation between {} and {} is below tolerance threshold ({} <= {}). Entries are treated as equal, only the new one will be deleted." oat.print_g(msg.format(new_cost, target_cost, deviation, args.cost_tolerance)) new_content[pair[0]] = list(EMPTY_LINE) else: msg = "Cost deviation between {} and {} exceeds tolerance threshold ({} > {}). Entries are treated as different, both will be deleted." 
oat.print_y(msg.format(new_cost, target_cost, deviation, args.cost_tolerance)) new_content[pair[0]] = list(EMPTY_LINE) target_content[pair[1]] = REPLACEMENT path, index = find_in_enriched_files(doi) ENRICHED_FILES[path]["content"][index] = list(EMPTY_LINE) ENRICHED_FILES[path]["modified"] = True count += 1 if args.batch and count >= args.batch: break while REPLACEMENT in target_content: target_content.remove(REPLACEMENT) with open(args.target_file, 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True) writer.write_rows(target_header + target_content) with open(args.new_file, 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True) writer.write_rows(new_header + new_content) with open(UD_FILE, 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True) writer.write_rows(ud_header + ud_content) for path, enriched_file in ENRICHED_FILES.items(): if enriched_file["modified"]: with open(path, 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True) writer.write_rows(enriched_file["header"] + enriched_file["content"])
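# The duplicate-resolution script above relies on a few helpers that are not part of
# this excerpt (get_filename, get_duplicate_index, find_in_enriched_files). As one
# example, get_duplicate_index() locates the row in the target file that carries a
# given DOI; judging from the call sites (DOIs live in column 3), a minimal version
# could look like this - a sketch under that assumption, not the original helper:

def get_duplicate_index_sketch(content, doi):
    for index, line in enumerate(content):
        if line and line[3] == doi:
            return index
    return None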
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("-O", "--offsetting_mode", help=ARG_HELP_STRINGS["offsetting"]) parser.add_argument("-b", "--bypass-cert-verification", action="store_true", help=ARG_HELP_STRINGS["bypass"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-f", "--force", action="store_true", help=ARG_HELP_STRINGS["force"]) parser.add_argument("-i", "--ignore-header", action="store_true", help=ARG_HELP_STRINGS["ignore_header"]) parser.add_argument("-j", "--force-header", action="store_true", help=ARG_HELP_STRINGS["force_header"]) parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"]) parser.add_argument("-a", "--add-unknown-columns", action="store_true", help=ARG_HELP_STRINGS["unknown_columns"]) parser.add_argument("-d", "--dialect", choices=["excel", "excel-tab", "unix"], help=ARG_HELP_STRINGS["dialect"]) parser.add_argument("-v", "--verbose", action="store_true", help=ARG_HELP_STRINGS["verbose"]) parser.add_argument("-o", "--overwrite", action="store_true", help=ARG_HELP_STRINGS["overwrite"]) parser.add_argument("-u", "--update", action="store_true", help=ARG_HELP_STRINGS["update"]) parser.add_argument("-r", "--round_monetary", action="store_true", help=ARG_HELP_STRINGS["round_monetary"]) parser.add_argument("--no-crossref", action="store_true", help=ARG_HELP_STRINGS["no_crossref"]) parser.add_argument("--no-pubmed", action="store_true", help=ARG_HELP_STRINGS["no_pubmed"]) parser.add_argument("--no-doaj", action="store_true", help=ARG_HELP_STRINGS["no_doaj"]) parser.add_argument("-institution", "--institution_column", type=int, help=ARG_HELP_STRINGS["institution"]) parser.add_argument("-period", "--period_column", type=int, help=ARG_HELP_STRINGS["period"]) parser.add_argument("-doi", "--doi_column", type=int, help=ARG_HELP_STRINGS["doi"]) parser.add_argument("-euro", "--euro_column", type=int, help=ARG_HELP_STRINGS["euro"]) parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int, help=ARG_HELP_STRINGS["is_hybrid"]) parser.add_argument("-publisher", "--publisher_column", type=int, help=ARG_HELP_STRINGS["publisher"]) parser.add_argument("-journal_full_title", "--journal_full_title_column", type=int, help=ARG_HELP_STRINGS["journal_full_title"]) parser.add_argument("-book_title", "--book_title_column", type=int, help=ARG_HELP_STRINGS["book_title"]) parser.add_argument("-issn", "--issn_column", type=int, help=ARG_HELP_STRINGS["issn"]) parser.add_argument("-isbn", "--isbn_column", type=int, help=ARG_HELP_STRINGS["isbn"]) parser.add_argument("-backlist_oa", "--backlist_oa_column", type=int, help=ARG_HELP_STRINGS["backlist_oa"]) parser.add_argument("-additional_isbns", "--additional_isbn_columns", type=int, nargs='+', help=ARG_HELP_STRINGS["additional_isbns"]) parser.add_argument("-url", "--url_column", type=int, help=ARG_HELP_STRINGS["url"]) parser.add_argument("-start", type=int, help=ARG_HELP_STRINGS["start"]) parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"]) args = parser.parse_args() handler = logging.StreamHandler(sys.stderr) handler.setFormatter(oat.ANSIColorFormatter()) bufferedHandler = oat.BufferedErrorHandler(handler) bufferedHandler.setFormatter(oat.ANSIColorFormatter()) logging.root.addHandler(handler) logging.root.addHandler(bufferedHandler) logging.root.setLevel(logging.INFO) if args.locale: norm = locale.normalize(args.locale) if norm != args.locale: msg = "locale '{}' not found, normalised to 
'{}'".format( args.locale, norm) oat.print_y(msg) try: loc = locale.setlocale(locale.LC_ALL, norm) oat.print_g("Using locale " + loc) except locale.Error as loce: msg = "Setting locale to {} failed: {}".format(norm, loce.message) oat.print_r(msg) sys.exit() enc = None # CSV file encoding if args.encoding: try: codec = codecs.lookup(args.encoding) msg = ("Encoding '{}' found in Python's codec collection " + "as '{}'").format(args.encoding, codec.name) oat.print_g(msg) enc = args.encoding except LookupError: msg = ("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") oat.print_r(msg) sys.exit() result = oat.analyze_csv_file(args.csv_file, enc=enc) if result["success"]: csv_analysis = result["data"] print(csv_analysis) else: print(result["error_msg"]) sys.exit() if args.dialect: dialect = args.dialect oat.print_g('Dialect sniffing results ignored, using built-in CSV dialect "' + dialect + '"') else: dialect = csv_analysis.dialect if enc is None: enc = csv_analysis.enc has_header = csv_analysis.has_header or args.force_header if enc is None: print("Error: No encoding given for CSV file and automated " + "detection failed. Please set the encoding manually via the " + "--enc argument") sys.exit() csv_file = open(args.csv_file, "r", encoding=enc) reader = csv.reader(csv_file, dialect=dialect) first_row = next(reader) num_columns = len(first_row) print("\nCSV file has {} columns.".format(num_columns)) csv_file.seek(0) reader = csv.reader(csv_file, dialect=dialect) if args.update and args.overwrite: oat.print_r("Error: Either use the -u or the -o option, not both.") sys.exit() if args.overwrite: for column in OVERWRITE_STRATEGY.keys(): OVERWRITE_STRATEGY[column] = CSVColumn.OW_ALWAYS elif not args.update: for column in OVERWRITE_STRATEGY.keys(): OVERWRITE_STRATEGY[column] = CSVColumn.OW_ASK additional_isbn_columns = [] if args.additional_isbn_columns: for index in args.additional_isbn_columns: if index > num_columns: msg = "Error: Additional ISBN column index {} exceeds number of columns ({})." 
oat.print_r(msg.format(index, num_columns)) sys.exit() else: additional_isbn_columns.append(index) column_map = { "institution": CSVColumn("institution", {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY}, args.institution_column, overwrite=OVERWRITE_STRATEGY["institution"]), "period": CSVColumn("period",{"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY}, args.period_column, overwrite=OVERWRITE_STRATEGY["period"]), "euro": CSVColumn("euro", {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY}, args.euro_column, overwrite=OVERWRITE_STRATEGY["euro"]), "doi": CSVColumn("doi", {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY}, args.doi_column, overwrite=OVERWRITE_STRATEGY["doi"]), "is_hybrid": CSVColumn("is_hybrid", {"articles": CSVColumn.MANDATORY, "books": CSVColumn.NONE}, args.is_hybrid_column, overwrite=OVERWRITE_STRATEGY["is_hybrid"]), "publisher": CSVColumn("publisher", {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE}, args.publisher_column, overwrite=OVERWRITE_STRATEGY["publisher"]), "journal_full_title": CSVColumn("journal_full_title", {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE}, args.journal_full_title_column, overwrite=OVERWRITE_STRATEGY["journal_full_title"]), "issn": CSVColumn("issn", {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE}, args.issn_column, overwrite=OVERWRITE_STRATEGY["issn"]), "issn_print": CSVColumn("issn_print", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["issn_print"]), "issn_electronic": CSVColumn("issn_electronic", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["issn_electronic"]), "issn_l": CSVColumn("issn_l", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["issn_l"]), "license_ref": CSVColumn("license_ref", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE} , None, overwrite=OVERWRITE_STRATEGY["license_ref"]), "indexed_in_crossref": CSVColumn("indexed_in_crossref", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["indexed_in_crossref"]), "pmid": CSVColumn("pmid", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["pmid"]), "pmcid": CSVColumn("pmcid", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["pmcid"]), "ut": CSVColumn("ut", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["ut"]), "url": CSVColumn("url", {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE}, args.url_column, overwrite=OVERWRITE_STRATEGY["url"]), "doaj": CSVColumn("doaj", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["doaj"]), "agreement": CSVColumn("agreement", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["agreement"]), "book_title": CSVColumn("book_title", {"articles": CSVColumn.NONE, "books": CSVColumn.RECOMMENDED}, args.book_title_column, overwrite=OVERWRITE_STRATEGY["book_title"]), "backlist_oa": CSVColumn("backlist_oa", {"articles": CSVColumn.NONE, "books": CSVColumn.MANDATORY}, args.backlist_oa_column, overwrite=OVERWRITE_STRATEGY["backlist_oa"]), "isbn": CSVColumn("isbn", {"articles": CSVColumn.NONE, "books": CSVColumn.BACKUP}, args.isbn_column, overwrite=OVERWRITE_STRATEGY["isbn"]), "isbn_print": CSVColumn("isbn_print", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["isbn_print"]), 
"isbn_electronic": CSVColumn("isbn_electronic", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["isbn_electronic"]) } header = None if has_header: for row in reader: if not row: # Skip empty lines continue header = row # First non-empty row should be the header if args.ignore_header: print("Skipping header analysis due to command line argument.") break else: print("\n *** Analyzing CSV header ***\n") for (index, item) in enumerate(header): if index in additional_isbn_columns: msg = "Column named '{}' at index {} is designated as additional ISBN column" print(msg.format(item, index)) continue column_type = oat.get_column_type_from_whitelist(item) if column_type is not None and column_map[column_type].index is None: column_map[column_type].index = index column_map[column_type].column_name = item found_msg = ("Found column named '{}' at index {}, " + "assuming this to be the '{}' column.") print(found_msg.format(item, index, column_type)) break print("\n *** Starting heuristical analysis ***\n") for row in reader: if not row: # Skip empty lines # We analyze the first non-empty line, a possible header should # have been processed by now. continue column_candidates = { "doi": [], "period": [], "euro": [] } found_msg = "The entry in column {} looks like a potential {}: {}" for (index, entry) in enumerate(row): if index in [csvcolumn.index for csvcolumn in column_map.values()] + additional_isbn_columns: # Skip columns already assigned continue entry = entry.strip() # Search for a DOI if column_map['doi'].index is None: if oat.DOI_RE.match(entry): column_id = str(index) # identify column either numerically or by column header if header: column_id += " ('" + header[index] + "')" print(found_msg.format(column_id, "DOI", entry)) column_candidates['doi'].append(index) continue # Search for a potential year string if column_map['period'].index is None: try: maybe_period = int(entry) now = datetime.date.today().year # Should be a wide enough margin if maybe_period >= 2000 and maybe_period <= now + 2: column_id = str(index) if header: column_id += " ('" + header[index] + "')" print(found_msg.format(column_id, "year", entry)) column_candidates['period'].append(index) continue except ValueError: pass # Search for a potential monetary amount if column_map['euro'].index is None: try: maybe_euro = locale.atof(entry) if maybe_euro >= 10 and maybe_euro <= 10000: column_id = str(index) if header: column_id += " ('" + header[index] + "')" print (found_msg.format(column_id, "euro amount", entry)) column_candidates['euro'].append(index) continue except ValueError: pass for column_type, candidates in column_candidates.items(): if column_map[column_type].index is not None: continue if len(candidates) > 1: print("Could not reliably identify the '" + column_type + "' column - more than one possible candiate!") elif len(candidates) < 1: print("No candidate found for column '" + column_type + "'!") else: index = candidates.pop() column_map[column_type].index = index if header: column_id = header[index] column_map[column_type].column_name = column_id else: column_id = index msg = "Assuming column '{}' to be the '{}' column." 
print(msg.format(column_id, column_type)) column_map[column_type].index = index break print("\n *** CSV file analysis summary ***\n") index_dict = {csvc.index: csvc for csvc in column_map.values()} for index in range(num_columns): column_name = "" if header: column_name = header[index] if index in index_dict: column = index_dict[index] msg = u"column number {} ({}) is the '{}' column ({})".format( index, column_name, column.column_type, column.get_req_description()) print(msg) elif index in additional_isbn_columns: msg = u"column number {} ({}) is an additional ISBN column".format(index, column_name) oat.print_c(msg) else: if args.add_unknown_columns: msg = (u"column number {} ({}) is an unknown column, it will be " + "appended to the generated CSV file") print(msg.format(index, column_name)) if not column_name: # Use a generic name column_name = "unknown" while column_name in column_map.keys(): # TODO: Replace by a numerical, increasing suffix column_name += "_" column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE, index) else: msg = (u"column number {} ({}) is an unknown column, it will be " + "ignored") print(msg.format(index, column_name)) print() for column in column_map.values(): if column.index is None: msg = "The '{}' column could not be identified ({})" print(msg.format(column.column_type, column.get_req_description())) print() article_mand_missing = [x.column_type for x in column_map.values() if x.requirement["articles"] == CSVColumn.MANDATORY and x.index is None] article_back_missing = [x.column_type for x in column_map.values() if x.requirement["articles"] == CSVColumn.BACKUP and x.index is None] book_mand_missing = [x.column_type for x in column_map.values() if x.requirement["books"] == CSVColumn.MANDATORY and x.index is None] book_back_missing = [x.column_type for x in column_map.values() if x.requirement["books"] == CSVColumn.BACKUP and x.index is None] if article_mand_missing: msg = "Article enrichment is not possible - mandatory columns are missing ({})" oat.print_y(msg.format(", ".join(article_mand_missing))) elif article_back_missing: msg = "Article enrichment is possible, but backup columns are missing ({}) - each record will need a valid DOI" oat.print_b(msg.format(", ".join(article_back_missing))) else: oat.print_g("Article enrichment is possible with all backup columns in place") if book_mand_missing: msg = "Book enrichment is not possible - mandatory columns are missing ({})" oat.print_y(msg.format(", ".join(book_mand_missing))) elif book_back_missing: msg = "Book enrichment is possible, but backup columns are missing ({}) - each record will need a valid DOI" oat.print_b(msg.format(", ".join(book_back_missing))) else: oat.print_g("Book enrichment is possible with all backup columns in place") print() if article_mand_missing and book_mand_missing: if not args.force: oat.print_r("ERROR: Could not detect the minimum mandatory data set for any " + "publication type. 
There are 2 ways to fix this:") if not header: print("1) Add a header row to your file and identify the " + "column(s) by assigning them an appropiate column name.") else: print("1) Identify the missing column(s) by assigning them " + "a different column name in the CSV header (You can " + "use the column name(s) mentioned in the message above)") print("2) Use command line parameters when calling this script " + "to identify the missing columns (use -h for help) ") sys.exit() else: oat.print_y("WARNING: Could not detect the minimum mandatory data set for any " + "publication type - forced to continue.") start = input("\nStart metadata aggregation? (y/n):") while start not in ["y", "n"]: start = input("Please type 'y' or 'n':") if start == "n": sys.exit() print("\n *** Starting metadata aggregation ***\n") enriched_content = {} for record_type, fields in oat.COLUMN_SCHEMAS.items(): # add headers enriched_content[record_type] = { "count": 0, "content": [list(fields)] } if not os.path.isdir("tempfiles"): os.mkdir("tempfiles") isbn_handling = oat.ISBNHandling("tempfiles/ISBNRangeFile.xml") doab_analysis = oat.DOABAnalysis(isbn_handling, "tempfiles/DOAB.csv", verbose=False) doaj_analysis = oat.DOAJAnalysis("tempfiles/DOAJ.csv") csv_file.seek(0) reader = csv.reader(csv_file, dialect=dialect) header_processed = False row_num = 0 for row in reader: row_num += 1 if not row: continue # skip empty lines if not header_processed: header_processed = True if has_header: # If the CSV file has a header, we are currently there - skip it # to get to the first data row continue if args.start and args.start > row_num: continue if args.end and args.end < row_num: continue print("---Processing line number " + str(row_num) + "---") result_type, enriched_row = oat.process_row(row, row_num, column_map, num_columns, additional_isbn_columns, doab_analysis, doaj_analysis, args.no_crossref, args.no_pubmed, args.no_doaj, args.round_monetary, args.offsetting_mode) for record_type, value in enriched_content.items(): if record_type == result_type: value["content"].append(enriched_row) value["count"] += 1 else: empty_line = ["" for x in value["content"][0]] value["content"].append(empty_line) csv_file.close() for record_type, value in enriched_content.items(): if value["count"] > 0: with open('out_' + record_type + '.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, oat.OPENAPC_STANDARD_QUOTEMASK, True, True, True) writer.write_rows(value["content"]) if not bufferedHandler.buffer: oat.print_g("Metadata enrichment successful, no errors occured") else: oat.print_r("There were errors during the enrichment process:\n") # closing will implicitly flush the handler and print any buffered # messages to stderr bufferedHandler.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument("source_file") parser.add_argument("source_column", type=int) parser.add_argument("currency_column", type=int) parser.add_argument("period_column", type=int) parser.add_argument("target_column", type=int) parser.add_argument("-f", "--force_overwrite", action="store_true") parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() quote_rules = args.openapc_quote_rules mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print ("Error: A quotemask may only contain the letters 't' and" + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) msg = "Encoding '{}' found in Python's codec collection as '{}'" oat.print_g(msg.format(args.encoding, codec.name)) enc = args.encoding except LookupError: print ("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() if args.locale: norm = locale.normalize(args.locale) if norm != args.locale: msg = "locale '{}' not found, normalised to '{}'".format( args.locale, norm) oat.print_y(msg) try: loc = locale.setlocale(locale.LC_ALL, norm) oat.print_g("Using locale " + loc) except locale.Error as loce: msg = "Setting locale to {} failed: {}".format(norm, loce.message) oat.print_r(msg) sys.exit() header, content = oat.get_csv_file_content(args.source_file, enc, True) fieldnames = header.pop() modified_content = [] line_num = 0 for column_type in ["source_column", "currency_column", "period_column", "target_column"]: index = getattr(args, column_type) msg = "Column {} ('{}') is the {}." oat.print_g(msg.format(index, fieldnames[index], column_type)) start = input("\nStart conversion? (y/n):") while start not in ["y", "n"]: start = input("Please type 'y' or 'n':") if start == "n": sys.exit() for line in content: line_num += 1 if not oat.has_value(line[args.source_column]): oat.print_y("WARNING: No source value found in line " + str(line_num) + ", skipping...") modified_content.append(line) continue monetary_value = None try: monetary_value = locale.atof(line[args.source_column]) except ValueError: msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..." oat.print_y(msg.format(line_num, line[args.source_column])) modified_content.append(line) continue currency = line[args.currency_column] if currency == "EUR": msg = "WARNING: Currency in line {} is already EUR, skipping..." oat.print_y(msg.format(line_num)) line[args.target_column] = line[args.source_column] modified_content.append(line) continue if not oat.has_value(currency): msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..." 
oat.print_y(msg.format(line_num, currency)) modified_content.append(line) continue period = line[args.period_column] frequency = get_frequency(period) if frequency is None: msg = "WARNING: Could not extract a valid date string from period column in line {} ('{}'), skipping..." oat.print_y(msg.format(line_num, period)) modified_content.append(line) continue if currency not in EXCHANGE_RATES[frequency]: msg = 'No exchange rates ({}) found for currency "{}", querying ECB data warehouse...' oat.print_b(msg.format(frequency, currency)) rates = oat.get_euro_exchange_rates(currency, frequency) EXCHANGE_RATES[frequency][currency] = rates rate = EXCHANGE_RATES[frequency][currency].get(period) if rate is None and frequency == "A": rate = _calulate_preliminary_annual_average(period, currency) if rate: EXCHANGE_RATES[frequency][currency][period] = rate if rate is None: if frequency != "D": msg = "Error: No conversion rate found for currency {} for period {} (line {}), aborting..." oat.print_r(msg.format(currency, period, line_num)) sys.exit() day_retries = 0 while rate is None: msg = "Warning: No conversion rate found for currency {} for period {} (line {}), trying next day..." oat.print_y(msg.format(currency, period, line_num)) period = get_next_day(period) rate = EXCHANGE_RATES[frequency][currency].get(period) day_retries += 1 if day_retries > 5: msg = "Error: Look-ahead limit for days exceeded, aborting..." oat.print_r(msg) sys.exit() euro_value = round(monetary_value/float(rate), 2) line[args.target_column] = str(euro_value) msg = "Line {}: {} exchange rate ({}) for date {} is {} -> {} / {} = {} EUR" msg = msg.format(line_num, currency, frequency, period, rate, monetary_value, rate, euro_value) oat.print_g(msg) modified_content.append(line) with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True) writer.write_rows([fieldnames] + modified_content)
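# get_frequency() (not shown above) maps the value of the period column to an ECB
# data frequency before exchange rates are requested via oat.get_euro_exchange_rates().
# Judging from the usage - annual averages for plain years, day-level look-ahead via
# get_next_day() for full dates - a sketch might look like this; the exact accepted
# period formats are an assumption:

import re

def get_frequency_sketch(period):
    if re.match(r"^\d{4}$", period):
        return "A"   # annual average rate
    if re.match(r"^\d{4}-\d{2}$", period):
        return "M"   # monthly average rate
    if re.match(r"^\d{4}-\d{2}-\d{2}$", period):
        return "D"   # daily reference rate
    return None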
def main(): parser = argparse.ArgumentParser() parser.add_argument("apc_file", help=ARG_HELP_STRINGS["apc_file"]) parser.add_argument("issn_l_file", help=ARG_HELP_STRINGS["issn_l_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() quote_rules = args.openapc_quote_rules mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print("Error: A quotemask may only contain the letters 't' and" + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) msg = "Encoding '{}' found in Python's codec collection as '{}'" oat.print_g(msg.format(args.encoding, codec.name)) enc = args.encoding except LookupError: print("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() header, content = oat.get_csv_file_content(args.apc_file, enc) oat.print_g("Preparing mapping table...") itself = other = 0 issn_l_re = re.compile( "^(?P<issn>\d{4}-\d{3}[\dxX])\t(?P<issn_l>\d{4}-\d{3}[\dxX])$") issn_l_file = open(args.issn_l_file, "r") issn_l_dict = {} for i, line in enumerate(issn_l_file): if i % 100000 == 0: print(str(i) + " lines processed.") match = issn_l_re.match(line) if match: match_dict = match.groupdict() issn_l_dict[match_dict['issn']] = match_dict['issn_l'] if match_dict['issn'] == match_dict['issn_l']: itself += 1 else: other += 1 print( str(itself) + " ISSNs pointing to itself as ISSN-L, " + str(other) + " to another value.") oat.print_g("Starting enrichment...") issn_matches = issn_p_matches = issn_e_matches = unmatched = different = corrections = 0 enriched_lines = [] for line in content: if len(line) == 0: enriched_lines.append(line) continue issn = reformat_issn(line[7]) issn_p = reformat_issn(line[8]) issn_e = reformat_issn(line[9]) target = None if issn in issn_l_dict: target = issn_l_dict[issn] corrected_target = oat.get_corrected_issn_l(target) if corrected_target != target: corrections += 1 line[10] = corrected_target issn_matches += 1 elif issn_p in issn_l_dict: target = issn_l_dict[issn_p] corrected_target = oat.get_corrected_issn_l(target) if corrected_target != target: corrections += 1 line[10] = corrected_target issn_p_matches += 1 elif issn_e in issn_l_dict: target = issn_l_dict[issn_e] corrected_target = oat.get_corrected_issn_l(target) if corrected_target != target: corrections += 1 line[10] = corrected_target issn_e_matches += 1 else: unmatched += 1 if target is not None and target not in [issn, issn_p, issn_e]: different += 1 enriched_lines.append(line) msg = ("{} issn_l values mapped by issn, {} by issn_p, {} by issn_e. {} " + "could not be assigned.\n{} issn_l values were corrected during " + "the process.\n In {} cases the ISSN-L was different from all " + "existing ISSN values") print( msg.format(issn_matches, issn_p_matches, issn_e_matches, unmatched, corrections, different)) with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True) writer.write_rows(header + enriched_lines)
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() quote_rules = args.openapc_quote_rules enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) codec_msg = "Encoding '{}' found in Python's codec collection as '{}'" oat.print_g(codec_msg.format(args.encoding, codec.name)) enc = args.encoding except LookupError: print("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print("Error: A quotemask may only contain the letters 't' and" + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] header, content = oat.get_csv_file_content(args.csv_file, enc) correction_schema = None for schema_type, schema in oat.COLUMN_SCHEMAS.items(): if header[0] == schema: oat.print_g("Schema autodetection: " + schema_type) correction_schema = CORRECTION_SCHEMAS[schema_type] break else: oat.print_r( "Error: CSV header does not match any known OpenAPC data schema") line_num = 1 for line in content: for tup in correction_schema: if tup[0] == "publisher": index = tup[1] publisher = line[index] publisher_new = oat.get_unified_publisher_name(publisher) if publisher_new != publisher: line[index] = publisher_new msg = u"Line {}: Updated publisher name ({} -> {})" oat.print_g(msg.format(line_num, publisher, publisher_new)) if tup[0] == "journal_full_title": index = tup[1] journal = line[index] journal_new = oat.get_unified_journal_title(journal) if journal_new != journal: line[index] = journal_new msg = u"Line {}: Updated journal_full_title ({} -> {})" oat.print_g(msg.format(line_num, journal, journal_new)) line_num += 1 with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True) writer.write_rows(header + content)
def main(): parser = argparse.ArgumentParser() parser.add_argument("source_file") parser.add_argument("source_column", type=int) parser.add_argument("currency_column", type=int) parser.add_argument("period_column", type=int) parser.add_argument("target_column", type=int) parser.add_argument("-f", "--force_overwrite", action="store_true") parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() quote_rules = args.openapc_quote_rules mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print("Error: A quotemask may only contain the letters 't' and" + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) msg = "Encoding '{}' found in Python's codec collection as '{}'" oat.print_g(msg.format(args.encoding, codec.name)) enc = args.encoding except LookupError: print("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() if args.locale: norm = locale.normalize(args.locale) if norm != args.locale: msg = "locale '{}' not found, normalised to '{}'".format( args.locale, norm) oat.print_y(msg) try: loc = locale.setlocale(locale.LC_ALL, norm) oat.print_g("Using locale " + loc) except locale.Error as loce: msg = "Setting locale to {} failed: {}".format(norm, loce.message) oat.print_r(msg) sys.exit() header, content = oat.get_csv_file_content(args.source_file, enc) fieldnames = header.pop() modified_content = [] line_num = 0 for column_type in [ "source_column", "currency_column", "period_column", "target_column" ]: index = getattr(args, column_type) msg = "Column {} ('{}') is the {}." oat.print_g(msg.format(index, fieldnames[index], column_type)) start = input("\nStart conversion? (y/n):") while start not in ["y", "n"]: start = input("Please type 'y' or 'n':") if start == "n": sys.exit() for line in content: line_num += 1 if not oat.has_value(line[args.source_column]): oat.print_y("WARNING: No source value found in line " + str(line_num) + ", skipping...") modified_content.append(line) continue monetary_value = None try: monetary_value = locale.atof(line[args.source_column]) except ValueError: msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..." oat.print_y(msg.format(line_num, line[args.source_column])) modified_content.append(line) continue period = line[args.period_column] if not oat.has_value(period) or not period.isdigit(): msg = "WARNING: Could not extract a valid year string from period column in line {} ('{}'), skipping..." oat.print_y(msg.format(line_num, period)) modified_content.append(line) continue currency = line[args.currency_column] if not oat.has_value(currency): msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..." 
oat.print_y(msg.format(line_num, currency)) modified_content.append(line) continue try: rate = AVG_YEARLY_CONVERSION_RATES[currency][period] except KeyError: msg = "ERROR: No conversion rate found for currency {} in year {} (line {}), aborting..." oat.print_r(msg.format(currency, period, line_num)) sys.exit() euro_value = round(monetary_value / rate, 2) line[args.target_column] = str(euro_value) modified_content.append(line) with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True) writer.write_rows([fieldnames] + modified_content)
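# AVG_YEARLY_CONVERSION_RATES, used above, is expected to be a nested dict keyed first
# by the ISO currency code and then by the year string from the period column, holding
# the average exchange rate against the euro for that year. Illustrative shape only -
# the numbers below are placeholders, not real reference rates:

AVG_YEARLY_CONVERSION_RATES_EXAMPLE = {
    "USD": {"2019": 1.12, "2020": 1.14},
    "GBP": {"2019": 0.88, "2020": 0.89},
}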
def main(): parser = argparse.ArgumentParser() parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"]) parser.add_argument("source_file_key_column", type=int, help=ARG_HELP_STRINGS["source_file_key_column"]) parser.add_argument("source_file_value_column", type=int, help=ARG_HELP_STRINGS["source_file_value_column"]) parser.add_argument("target_file", help=ARG_HELP_STRINGS["target_file"]) parser.add_argument("target_file_key_column", type=int, help=ARG_HELP_STRINGS["target_file_key_column"]) parser.add_argument("target_file_value_column", type=int, help=ARG_HELP_STRINGS["target_file_value_column"]) parser.add_argument("-s", "--strict", action="store_true", help=ARG_HELP_STRINGS["strict"]) parser.add_argument("-f", "--force_overwrite", action="store_true", help=ARG_HELP_STRINGS["force_overwrite"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-e2", "--other_encoding", help=ARG_HELP_STRINGS["other_encoding"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() quote_rules = args.openapc_quote_rules encs = [] #CSV file encodings for encoding in [args.encoding, args.other_encoding]: if encoding: try: codec = codecs.lookup(encoding) print ("Encoding '{}' found in Python's codec collection " + "as '{}'").format(encoding, codec.name) enc = args.encoding except LookupError: print ("Error: '" + encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() encs.append(encoding) mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print ("Error: A quotemask may only contain the letters 't' and" + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] source_header, source_content = oat.get_csv_file_content(args.source_file, enc=encs[0]) key_column_name = "column " + str(args.source_file_key_column) value_column_name = "column " + str(args.source_file_value_column) if source_header: header = source_header[0] key_column_name = header[args.source_file_key_column] value_column_name = header[args.source_file_value_column] msg = u"Creating mapping table ({} -> {}) for source file {}...".format(key_column_name, value_column_name, args.source_file) oat.print_g(msg) mapping_table = {} ambiguous_keys = [] for line in source_content: if line: key = line[args.source_file_key_column] if key == 'NA': continue value = line[args.source_file_value_column] if key not in mapping_table: mapping_table[key] = value else: if mapping_table[key] != value: if not args.strict: msg = u"WARNING: Replacing existing value '{}' for key '{}' with new value '{}'".format(mapping_table[key], key, value) mapping_table[key] = value oat.print_y(msg) else: if key not in ambiguous_keys: ambiguous_keys.append(key) if args.strict: for key in ambiguous_keys: del(mapping_table[key]) msg = u"INFO: Ambiguous key '{}' dropped from mapping table".format(key) oat.print_b(msg) oat.print_g("mapping table created, contains " + str(len(mapping_table)) + " entries") target_header, target_content = oat.get_csv_file_content(args.target_file, enc=encs[1]) line_num = 0 if not target_header else 1 replace_msg = u"Line {}: Found matching key '{}', replaced old 
value '{}' by '{}'" modified_content = [] for line in target_content: key = line[args.target_file_key_column] if key in mapping_table: new_value = mapping_table[key] old_value = line[args.target_file_value_column] if old_value != new_value: if len(old_value) == 0 or old_value == "NA": line[args.target_file_value_column] = new_value msg = replace_msg.format(line_num, key, old_value, new_value) oat.print_g(msg) else: if args.force_overwrite: line[args.target_file_value_column] = new_value msg = replace_msg.format(line_num, key, old_value, new_value) oat.print_y(msg) modified_content.append(line) line_num += 1 with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False) writer.write_rows(target_header + modified_content)
return "ElementTree ParseError: {}".format(str(etpe)) parser = argparse.ArgumentParser() parser.add_argument( "doi_or_file", help= "An OpenAPC-compatible CSV file or a single DOI to look up in crossref.") args = parser.parse_args() arg = args.doi_or_file if os.path.isfile(arg): csv_file = open(arg, "r", encoding="utf8") reader = csv.reader(csv_file) line_number = 0 for line in reader: if not line: prefix = "" else: prefix = get_prefix(line[3]) result = str(line_number) + ": " + prefix if prefix == "Springer (Biomed Central Ltd.)": oat.print_g(result) elif prefix == "Nature Publishing Group": oat.print_r(result) else: print(result) line_number += 1 else: print(get_prefix(arg))
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-v", "--verbose", action="store_true", help=ARG_HELP_STRINGS["verbose"]) parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"]) parser.add_argument("-i", "--ignore-header", action="store_true", help=ARG_HELP_STRINGS["headers"]) parser.add_argument("-f", "--force", action="store_true", help=ARG_HELP_STRINGS["force"]) parser.add_argument("-b", "--bypass-cert-verification", action="store_true", help=ARG_HELP_STRINGS["bypass"]) parser.add_argument("-institution", "--institution_column", type=int, help=ARG_HELP_STRINGS["institution"]) parser.add_argument("-period", "--period_column", type=int, help=ARG_HELP_STRINGS["period"]) parser.add_argument("-doi", "--doi_column", type=int, help=ARG_HELP_STRINGS["doi"]) parser.add_argument("-euro", "--euro_column", type=int, help=ARG_HELP_STRINGS["euro"]) parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int, help=ARG_HELP_STRINGS["is_hybrid"]) parser.add_argument("-publisher", "--publisher_column", type=int, help=ARG_HELP_STRINGS["publisher"]) parser.add_argument("-journal_full_title", "--journal_full_title_column", type=int, help=ARG_HELP_STRINGS["journal_full_title"]) parser.add_argument("-issn", "--issn_column", type=int, help=ARG_HELP_STRINGS["issn"]) parser.add_argument("-url", "--url_column", type=int, help=ARG_HELP_STRINGS["url"]) args = parser.parse_args() enc = None # CSV file encoding if args.locale: norm = locale.normalize(args.locale) if norm != args.locale: print "locale '{}' not found, normalized to '{}'".format( args.locale, norm) try: loc = locale.setlocale(locale.LC_ALL, norm) print "Using locale", loc except locale.Error as loce: print "Setting locale to " + norm + " failed: " + loce.message sys.exit() if args.encoding: try: codec = codecs.lookup(args.encoding) print("Encoding '{}' found in Python's codec collection " + "as '{}'").format(args.encoding, codec.name) enc = args.encoding except LookupError: print("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() result = oat.analyze_csv_file(args.csv_file) if result["success"]: csv_analysis = result["data"] print csv_analysis else: print result["error_msg"] sys.exit() if enc is None: enc = csv_analysis.enc dialect = csv_analysis.dialect has_header = csv_analysis.has_header if enc is None: print("Error: No encoding given for CSV file and automated " + "detection failed. 
Please set the encoding manually via the " + "--enc argument") sys.exit() csv_file = open(args.csv_file, "r") reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc) first_row = reader.next() num_columns = len(first_row) print "\nCSV file has {} columns.".format(num_columns) csv_file.seek(0) reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc) column_map = OrderedDict([ ("institution", CSVColumn("institution", CSVColumn.MANDATORY, args.institution_column)), ("period", CSVColumn("period", CSVColumn.MANDATORY, args.period_column)), ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column)), ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column)), ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column)), ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column)), ("journal_full_title", CSVColumn("journal_full_title", CSVColumn.OPTIONAL, args.journal_full_title_column)), ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column)), ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None)), ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE, None)), ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None)), ("indexed_in_crossref", CSVColumn("indexed_in_crossref", CSVColumn.NONE, None)), ("pmid", CSVColumn("pmid", CSVColumn.NONE, None)), ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None)), ("ut", CSVColumn("ut", CSVColumn.NONE, None)), ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column)), ("doaj", CSVColumn("doaj", CSVColumn.NONE, None)) ]) # Do not quote the values in the 'period' and 'euro' columns quotemask = [ True, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, ] header = None if has_header: for row in reader: if not row: # Skip empty lines continue header = row # First non-empty row should be the header if args.ignore_header: print "Skipping header analysis due to command line argument." break else: print "\n *** Analyzing CSV header ***\n" for (index, item) in enumerate(header): column_type = oat.get_column_type_from_whitelist(item) if column_type is not None and column_map[ column_type].index is None: column_map[column_type].index = index column_map[column_type].column_name = item print("Found column named '{}' at index {}, " + "assuming this to be the {} column.").format( item, index, column_type) break print "\n *** Starting heuristical analysis ***\n" for row in reader: if not row: # Skip empty lines # We analyze the first non-empty line, a possible header should # have been processed by now. 
continue column_candidates = {"doi": [], "period": [], "euro": []} for (index, entry) in enumerate(row): if index in [csvcolumn.index for csvcolumn in column_map.values()]: # Skip columns already assigned continue entry = entry.strip() # Search for a DOI if column_map['doi'].index is None: if oat.DOI_RE.match(entry): column_id = str(index) # identify column either numerical or by column header if header: column_id += " ('" + header[index] + "')" print("The entry in column {} looks like a " + "DOI: {}").format(column_id, entry) column_candidates['doi'].append(index) continue # Search for a potential year string if column_map['period'].index is None: try: maybe_period = int(entry) now = datetime.date.today().year # Should be a wide enough margin if maybe_period >= 2000 and maybe_period <= now + 2: column_id = str(index) if header: column_id += " ('" + header[index] + "')" print("The entry in column {} looks like a " + "potential period: {}").format(column_id, entry) column_candidates['period'].append(index) continue except ValueError: pass # Search for a potential monetary amount if column_map['euro'].index is None: try: maybe_euro = locale.atof(entry) # Are there APCs above 6000€ ?? if maybe_euro >= 10 and maybe_euro <= 6000: column_id = str(index) if header: column_id += " ('" + header[index] + "')" print("The entry in column {} looks like a " + "potential euro amount: {}").format( column_id, entry) column_candidates['euro'].append(index) continue except ValueError: pass for column_type, candidates in column_candidates.iteritems(): if column_map[column_type].index is not None: continue if len(candidates) > 1: print("Could not reliably identify the '" + column_type + "' column - more than one possible candiate!") elif len(candidates) < 1: print "No candidate found for column '" + column_type + "'!" else: index = candidates.pop() column_map[column_type].index = index if header: column_id = header[index] column_map[column_type].column_name = column_id else: column_id = index print("Assuming column '{}' to be the '{}' " + "column.").format(column_id, column_type) column_map[column_type].index = index break # Wrap up: Check if there any mandatory column types left which have not # yet been identified - we cannot continue in that case (unless forced). unassigned = filter( lambda (k, v): v.requirement == CSVColumn.MANDATORY and v.index is None, column_map.iteritems()) if unassigned: for item in unassigned: print "The {} column is still unidentified.".format(item[0]) if header: print "The CSV header is:\n" + dialect.delimiter.join(header) if not args.force: print("ERROR: We cannot continue because not all mandatory " + "column types in the CSV file could be automatically " + "identified. 
There are 2 ways to fix this:") if not header: print( "1) Add a header row to your file and identify the " + "column(s) by assigning them an appropiate column name.") else: print( "1) Identify the missing column(s) by assigning them " + "a different column name in the CSV header (You can " + "use the column name(s) mentioned in the message above)") print("2) Use command line parameters when calling this script " + "to identify the missing columns (use -h for help) ") sys.exit() else: print("WARNING: Not all mandatory column types in the CSV file " + "could be automatically identified - forced to continue.") print "\n *** CSV file analysis summary ***\n" index_dict = {csvc.index: csvc for csvc in column_map.values()} for index in range(num_columns): column_name = "" if header: column_name = header[index] if index in index_dict: column = index_dict[index] msg = "column number {} ({}) is the {} column '{}'".format( index, column_name, column.requirement, column.column_type) if column.requirement in [CSVColumn.MANDATORY, CSVColumn.OPTIONAL]: oat.print_g(msg) else: oat.print_b(msg) else: msg = ("column number {} ({}) is an unknown column, it will be " + "appended to the generated CSV file") oat.print_y(msg.format(index, column_name)) if not column_name: # Use a generic name column_name = "unknown" while column_name in column_map.keys(): # TODO: Replace by a numerical, increasing suffix column_name += "_" column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE, index) print "" for column in column_map.values(): if column.index is None: msg = "The {} column '{}' could not be identified." print msg.format(column.requirement, column.column_type) # Check for unassigned optional column types. We can continue but should # issue a warning as all entries will need a valid DOI in this case. unassigned = filter( lambda (k, v): v.requirement == CSVColumn.OPTIONAL and v.index is None, column_map.iteritems()) if unassigned: print("\nWARNING: Not all optional column types could be " + "identified. Metadata aggregation is still possible, but " + "every entry in the CSV file will need a valid DOI.") start = raw_input("\nStart metadata aggregation? (y/n):") while start not in ["y", "n"]: start = raw_input("Please type 'y' or 'n':") if start == "n": sys.exit() print "\n *** Starting metadata aggregation ***\n" enriched_content = [] error_messages = [] csv_file.seek(0) reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc) header_processed = False row_num = 0 for row in reader: row_num += 1 if not row: continue # skip empty lines if not header_processed: header_processed = True enriched_content.append(column_map.keys()) if has_header: # If the CSV file has a header, we are currently there - skip it # to get to the first data row continue print "---Processing line number " + str(row_num) + "---" if len(row) != num_columns: error_msg = ( "Syntax: the number of values in line {} ({}) " + "differs from the number of columns ({}). 
Line left " + "unchanged, please correct the error in the result " + "file and re-run.") error_msg_fmt = error_msg.format(row_num, len(row), num_columns) error_messages.append("Line {}: {}".format(row_num, error_msg_fmt)) oat.print_r(error_msg_fmt) enriched_content.append(row) continue doi = row[column_map["doi"].index] current_row = OrderedDict() # Copy content of identified columns for csv_column in column_map.values(): if csv_column.index is not None and len(row[csv_column.index]) > 0: if csv_column.column_type == "euro": # special case for monetary values: Cast to float to ensure # the decimal point is a dot (instead of a comma) euro_value = row[csv_column.index] try: euro = locale.atof(euro_value) if euro.is_integer(): euro = int(euro) current_row[csv_column.column_type] = str(euro) except ValueError: msg = ERROR_MSGS["locale"].format( euro_value, csv_column.index) oat.print_r(msg) sys.exit() else: current_row[csv_column.column_type] = row[csv_column.index] else: current_row[csv_column.column_type] = "NA" # include crossref metadata crossref_result = oat.get_metadata_from_crossref(doi) if crossref_result["success"]: print "Crossref: DOI resolved: " + doi current_row["indexed_in_crossref"] = "TRUE" data = crossref_result["data"] for key, value in data.iteritems(): if value is not None: if key == "journal_full_title": unified_value = oat.get_unified_journal_title(value) if unified_value != value: msg = INFO_MSGS["unify"].format( "journal title", value, unified_value) oat.print_b(msg) new_value = unified_value elif key == "publisher": unified_value = oat.get_unified_publisher_name(value) if unified_value != value: msg = INFO_MSGS["unify"].format( "publisher name", value, unified_value) oat.print_b(msg) new_value = unified_value else: new_value = value else: new_value = "NA" if args.verbose: print(u"WARNING: Element '{}' not found in in " + "response for doi {}.").format(key, doi) old_value = current_row[key] current_row[key] = column_map[key].check_overwrite( old_value, new_value) else: error_msg = ("Crossref: Error while trying to resolve DOI " + doi + ": " + crossref_result["error_msg"]) oat.print_r(error_msg) error_messages.append("Line {}: {}".format(row_num, error_msg)) current_row["indexed_in_crossref"] = "FALSE" # include pubmed metadata pubmed_result = oat.get_metadata_from_pubmed(doi) if pubmed_result["success"]: print "Pubmed: DOI resolved: " + doi data = pubmed_result["data"] for key, value in data.iteritems(): if value is not None: new_value = value else: new_value = "NA" if args.verbose: print(u"WARNING: Element '{}' not found in in " + "response for doi {}.").format(key, doi) old_value = current_row[key] current_row[key] = column_map[key].check_overwrite( old_value, new_value) else: error_msg = ("Pubmed: Error while trying to resolve DOI " + doi + ": " + pubmed_result["error_msg"]) oat.print_r(error_msg) error_messages.append("Line {}: {}".format(row_num, error_msg)) # lookup in DOAJ. try the EISSN first, then ISSN and finally print ISSN if current_row["doaj"] != "TRUE": issns = [] if current_row["issn_electronic"] != "NA": issns.append(current_row["issn_electronic"]) if current_row["issn"] != "NA": issns.append(current_row["issn"]) if current_row["issn_print"] != "NA": issns.append(current_row["issn_print"]) for issn in issns: doaj_res = oat.lookup_journal_in_doaj( issn, args.bypass_cert_verification) if doaj_res["data_received"]: if doaj_res["data"]["in_doaj"]: msg = "DOAJ: Journal ISSN ({}) found in DOAJ ('{}')." 
print msg.format(issn, doaj_res["data"]["title"]) current_row["doaj"] = "TRUE" break else: msg = "DOAJ: Journal ISSN ({}) not found in DOAJ." current_row["doaj"] = "FALSE" print msg.format(issn) else: msg = "DOAJ: Error while trying to look up ISSN {}: {}" msg_fmt = msg.format(issn, doaj_res["error_msg"]) oat.print_r(msg_fmt) error_messages.append("Line {}: {}".format( row_num, msg_fmt)) enriched_content.append(current_row.values()) csv_file.close() with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, quotemask, True, True) writer.write_rows(enriched_content) if not error_messages: oat.print_g("Metadata enrichment successful, no errors occured") else: oat.print_r("There were errors during the enrichment process:\n") for msg in error_messages: print msg + "\n"
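The heuristic used above to locate unidentified columns boils down to three tests per cell: does the entry look like a DOI, a plausible publication year, or a plausible APC amount? A condensed standalone version of that test (the DOI regex here is a generic assumption, not necessarily oat.DOI_RE):

import datetime
import locale
import re

DOI_RE = re.compile(r"^10\.\d{4,9}/\S+$")   # generic DOI shape, an assumption

def guess_column_type(entry):
    entry = entry.strip()
    if DOI_RE.match(entry):
        return "doi"
    try:
        year = int(entry)
        # Same plausibility window as the script above
        if 2000 <= year <= datetime.date.today().year + 2:
            return "period"
    except ValueError:
        pass
    try:
        amount = locale.atof(entry)      # honours the currently set locale
        if 10 <= amount <= 6000:         # same euro window as the script above
            return "euro"
    except ValueError:
        pass
    return None

print(guess_column_type("10.1000/xyz123"))   # -> "doi"
print(guess_column_type("2015"))             # -> "period"
print(guess_column_type("1400.00"))          # -> "euro"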
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("column", type=int, help=ARG_HELP_STRINGS["column"]) parser.add_argument("other_csv_file", nargs="?", help=ARG_HELP_STRINGS["other_csv_file"]) parser.add_argument("other_column", type=int, nargs="?", help=ARG_HELP_STRINGS["other_column"]) parser.add_argument("-e2", "--other_encoding", help=ARG_HELP_STRINGS["other_encoding"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-i", "--ignore_case", action="store_true", default=False, help=ARG_HELP_STRINGS["ignore_case"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() quote_rules = args.openapc_quote_rules encs = [] #CSV file encodings for encoding in [args.encoding, args.other_encoding]: if encoding: try: codec = codecs.lookup(encoding) msg = "Encoding '{}' found in Python's codec collection as '{}'" print(msg.format(encoding, codec.name)) except LookupError: print("Error: '" + encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() encs.append(encoding) mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if reduced: print("Error: A quotemask may only contain the letters 't' and 'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] header, content = oat.get_csv_file_content(args.csv_file, enc=encs[0]) column = args.column if not args.other_csv_file: rearranged_content = header + sorted(content, key=lambda x: x[column]) else: rearranged_content = [] _, second_content = oat.get_csv_file_content(args.other_csv_file, enc=encs[1]) other_column = column # default: use same column index as in first file if args.other_column: other_column = args.other_column for other_row in second_content: if args.ignore_case: matching_rows = [row for row in content if row[column].lower() == other_row[other_column].lower()] else: matching_rows = [row for row in content if row[column] == other_row[other_column]] rearranged_content += matching_rows for matching_row in matching_rows: content.remove(matching_row) unmatched_msg = ("{} rows could not be rearranged (unmatched in second csv file) " + "and were appended to the end of the result file " + "in original order.") if content: oat.print_y(unmatched_msg.format(len(content))) else: oat.print_g("All rows matched.") rearranged_content = header + rearranged_content + content # append any unmatched rows with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False) writer.write_rows(rearranged_content)
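The rearrangement logic can be tried out on toy data: rows from the first file are reordered to follow the key order of the second file, and anything unmatched is appended at the end in its original order (all values below are purely illustrative):

content = [["b", "2"], ["a", "1"], ["c", "3"]]   # first file, no header
second_content = [["c"], ["a"]]                   # ordering reference
column = other_column = 0

rearranged = []
for other_row in second_content:
    matching = [row for row in content if row[column] == other_row[other_column]]
    rearranged += matching
    for row in matching:
        content.remove(row)

rearranged += content     # unmatched rows keep their original order
print(rearranged)         # [['c', '3'], ['a', '1'], ['b', '2']]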
def main(): parser = argparse.ArgumentParser() parser.add_argument("original_file", help=ARG_HELP_STRINGS["original_file"]) parser.add_argument("update_file", help=ARG_HELP_STRINGS["update_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-eu", "--update_encoding", help=ARG_HELP_STRINGS["update_encoding"]) parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"]) parser.add_argument("-lu", "--update_locale", help=ARG_HELP_STRINGS["update_locale"]) parser.add_argument("-a", "--autocreate_mappings", action="store_true", help=ARG_HELP_STRINGS["autocreate_mappings"]) parser.add_argument("-g", "--grouping", action="store_true", help=ARG_HELP_STRINGS["grouping"]) args = parser.parse_args() params = { "original": { "file": args.original_file, "encoding": args.encoding, "locale": args.locale, "csv_analysis": None, "fieldnames": None, "doi_field": None, "euro_field": None, "mappings": [] }, "update": { "file": args.update_file, "encoding": args.update_encoding, "locale": args.update_locale, "csv_analysis": None, "fieldnames": None, "doi_field": None, "euro_field": None, "mappings": [] } } def field_mapped(file_type, field_name): if field_name == params[file_type]["euro_field"]: return True if field_name == params[file_type]["doi_field"]: return True if field_name in params[file_type]["mappings"]: return True return False for file_type in params.keys(): msg = "*** Performing analysis for {} file ***" oat.print_b(msg.format(file_type)) encoding = params[file_type]["encoding"] if encoding is not None: try: codec = codecs.lookup(encoding) msg = "Encoding '{}' found in Python's codec collection as '{}'" print(msg.format(encoding, codec.name)) params[file_type]["encoding"] = encoding except LookupError: print ("Error: '" + encoding + "' not found Python's " + "codec collection. 
Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() loc = params[file_type]["locale"] if loc is not None: norm = locale.normalize(loc) if norm != loc: msg = "locale '{}' not found, normalised to '{}'".format(loc, norm) oat.print_y(msg) params[file_type]["locale"] = norm csv_analysis = oat.analyze_csv_file(params[file_type]["file"], enc=params[file_type]["encoding"]) if not csv_analysis["success"]: oat.print_r(csv_analysis["error_msg"]) sys.exit() params[file_type]["csv_analysis"] = csv_analysis["data"] print(params[file_type]["csv_analysis"]) if params[file_type]["encoding"] is None: guessed_enc = params[file_type]["csv_analysis"].enc params[file_type]["encoding"] = guessed_enc locale_name = "default locale" if params[file_type]["locale"] is not None: locale_name = "locale " + params[file_type]["locale"] msg = "{} file will be opened with encoding {} and {}" oat.print_g(msg.format(file_type, params[file_type]["encoding"], locale_name)) with open(params[file_type]["file"], "r", encoding=params[file_type]["encoding"]) as f: reader = csv.DictReader(f, dialect=params[file_type]["csv_analysis"].dialect) params[file_type]["fieldnames"] = list(reader.fieldnames) for index, name in enumerate(params[file_type]["fieldnames"]): field_type = oat.get_column_type_from_whitelist(name) found = False if field_type == "doi": params[file_type]["doi_field"] = name found = True elif field_type == "euro": params[file_type]["euro_field"] = name found = True if found: msg = '{} file: Found {} column at index {} ("{}")' msg = msg.format(file_type, field_type, index, name) oat.print_b(msg) for field_type in ["doi_field", "euro_field"]: if params[file_type][field_type] is None: msg = "Error: No {} found in {} file" oat.print_r(msg.format(field_type, file_type)) sys.exit() for orig_index, orig_field in enumerate(params["original"]["fieldnames"]): if field_mapped("original", orig_field): continue norm_orig_field = orig_field.lower().strip() for update_index, update_field in enumerate(params["update"]["fieldnames"]): if field_mapped("update", update_field): continue norm_update_field = update_field.lower().strip() if norm_orig_field == norm_update_field: if args.autocreate_mappings: params["original"]["mappings"].append(orig_field) params["update"]["mappings"].append(update_field) msg = 'Auto-created mapping "{}" (update file, index {}) -> "{}" (original file, index {})' oat.print_b(msg.format(update_field, update_index, orig_field, orig_index)) else: msg = 'Possible mapping found: "{}" (update file, index {}) -> "{}" (original file, index {}). Create mapping (y/n)?' 
msg = msg.format(update_field, update_index, orig_field, orig_index) create = input(msg) while create not in ["y", "n"]: create = input("Please type 'y' or 'n':") if create == "y": params["original"]["mappings"].append(orig_field) params["update"]["mappings"].append(update_field) update_mappings = {} with open(params["update"]["file"], "r", encoding=params["update"]["encoding"]) as f: doi_field = params["update"]["doi_field"] euro_field = params["update"]["euro_field"] reader = csv.DictReader(f, dialect=params["update"]["csv_analysis"].dialect) locale.setlocale(locale.LC_ALL, params["update"]["locale"]) for line in reader: doi = oat.get_normalised_DOI(line[doi_field]) if doi is None: msg = 'Warning: Empty or invalid DOI in update file (line {}): "{}"' oat.print_y(msg.format(reader.line_num, line[doi_field])) continue if doi in update_mappings: msg = "Error: Duplicate doi in update file ({})".format(line[doi_field]) oat.print_r(msg) sys.exit() update_mappings[doi] = {} euro_value = locale.atof(line[euro_field]) orig_euro_field = params["original"]["euro_field"] update_mappings[doi][orig_euro_field] = euro_value for index, update_field_name in enumerate(params["update"]["mappings"]): orig_field_name = params["original"]["mappings"][index] update_mappings[doi][orig_field_name] = line[update_field_name] #print(json.dumps(update_mappings, sort_keys=False, indent=4)) modified_content = [] fieldnames = None with open(params["original"]["file"], "r", encoding=params["original"]["encoding"]) as f: doi_field = params["original"]["doi_field"] euro_field = params["original"]["euro_field"] reader = csv.DictReader(f, dialect=params["original"]["csv_analysis"].dialect) fieldnames = list(reader.fieldnames) locale.setlocale(locale.LC_ALL, params["original"]["locale"]) for line in reader: doi = oat.get_normalised_DOI(line[doi_field]) if doi not in update_mappings: msg = "line {}: DOI {} not found in update file!" oat.print_r(msg.format(reader.line_num, doi)) continue changes = [] old_euro_value = locale.atof(line[euro_field]) new_euro_value = update_mappings[doi][euro_field] if old_euro_value != new_euro_value: changes.append(Change(euro_field, old_euro_value, new_euro_value, monetary=True)) for field in update_mappings[doi].keys(): if field == euro_field: continue if line[field] != update_mappings[doi][field]: changes.append(Change(field, line[field], update_mappings[doi][field])) if not changes: msg = "line {}: DOI {} found in update file, but nothing changed." 
oat.print_g(msg.format(reader.line_num, doi)) else: msg = "line {}: DOI {} found in update file with the following updates:" oat.print_y(msg.format(reader.line_num, doi)) for change in changes: oat.print_y(str(change)) if change.monetary: line[change.field_name] = locale.currency(change.new_value,symbol=False, grouping=args.grouping) else: line[change.field_name] = change.new_value del(update_mappings[doi]) modified_content.append(line) if update_mappings: oat.print_y("{} entries in update file not contained in original file:".format(len(update_mappings))) for doi, changes in update_mappings.items(): oat.print_y(doi) new_line = changes new_line[params["original"]["doi_field"]] = doi formatted_euro = locale.currency(new_line[params["original"]["euro_field"]], symbol=False, grouping=args.grouping) new_line[params["original"]["euro_field"]] = formatted_euro modified_content.append(new_line) with open("out.csv", "w", encoding=params["original"]["encoding"]) as out: writer = csv.DictWriter(out, fieldnames, dialect=params["original"]["csv_analysis"].dialect) writer.writeheader() for line in modified_content: writer.writerow(line)
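The update loop above relies on a small Change helper (field_name, old/new value, a monetary flag and a printable form) that is defined elsewhere; one plausible shape for it, written here purely as an assumption to make the loop easier to follow:

class Change(object):
    # Hypothetical reconstruction - the real class is not shown in this file.
    def __init__(self, field_name, old_value, new_value, monetary=False):
        self.field_name = field_name
        self.old_value = old_value
        self.new_value = new_value
        self.monetary = monetary

    def __str__(self):
        msg = 'Change in field "{}": "{}" -> "{}"'
        return msg.format(self.field_name, self.old_value, self.new_value)

print(Change("euro", 1200.0, 1250.5, monetary=True))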
def integrate_changes(articles, file_path, enriched_file=False, dry_run=False): ''' Update existing entries in a previously created harvest file. Args: articles: A list of article dicts, as returned by openapc_toolkit.oai_harvest() file_path: Path to the CSV file the new values should be integrated into. enriched_file: If True, columns which are overwritten during enrichment will not be updated dry_run: Do not make any changes to the file (but still report changes and return the list of articles that were not encountered) Returns: A tuple. The first element is a reduced list of article dicts, containing those for which no matching PID (record URL) was found in the file (order preserved). The second element is the list of column headers encountered in the harvest file. ''' messages = { 'wet': { 'start': 'Integrating changes in harvest data into existing file {}', 'line_change': 'Line {}: Updating value in column {} ("{}" -> "{}")', 'remove': 'PID {} no longer found in harvest data, removing article', }, 'dry': { 'start': 'Dry Run: Comparing harvest data to existing file {}', 'line_change': 'Line {} ({}): Change in column {} ("{}" -> "{}")', 'remove': 'PID {} no longer found in harvest data, article would be removed', } } messages = messages['dry'] if dry_run else messages['wet'] if not os.path.isfile(file_path): return (articles, None) enriched_blacklist = [ "institution", "publisher", "journal_full_title", "issn", "license_ref", "pmid" ] article_dict = OrderedDict() for article in articles: # Harvested articles use OAI record IDs in the url field as PID. url = article["url"] if oat.has_value(url): article_dict[url] = article updated_lines = [] fieldnames = None with open(file_path, "r") as f: reader = DictReader(f) fieldnames = reader.fieldnames updated_lines.append(list(fieldnames)) #header oat.print_y(messages["start"].format(file_path)) for line in reader: url = line["url"] if not oat.has_value(line["institution"]): # Do not change empty lines updated_lines.append([line[key] for key in fieldnames]) continue line_num = reader.reader.line_num if url in article_dict: for key, value in article_dict[url].items(): if enriched_file and key in enriched_blacklist: continue if key in line and value != line[key]: oat.print_g(messages["line_change"].format( line_num, line["url"], key, line[key], value)) line[key] = value del (article_dict[url]) updated_line = [line[key] for key in fieldnames] updated_lines.append(updated_line) else: oat.print_r(messages["remove"].format(url)) if not dry_run: with open(file_path, "w") as f: mask = oat.OPENAPC_STANDARD_QUOTEMASK if enriched_file else None writer = oat.OpenAPCUnicodeWriter(f, quotemask=mask, openapc_quote_rules=True, has_header=True) writer.write_rows(updated_lines) return (article_dict.values(), fieldnames)
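A hedged usage sketch for the dry-run variant above: compare freshly harvested article dicts against an existing harvest file first, and only write once the reported changes look right. The article values and file path are illustrative; field names follow the columns used in the harvest files.

articles = [{"url": "oai:repo.example.org:1234", "doi": "10.1000/example",
             "euro": "1500", "institution": "Example University", "period": "2017"}]

# Preview: report changes and removals without touching the file
remaining, header = integrate_changes(articles, "all_harvested_articles.csv",
                                      enriched_file=False, dry_run=True)

# Apply for real once the dry-run output looks sane
remaining, header = integrate_changes(articles, "all_harvested_articles.csv",
                                      enriched_file=False, dry_run=False)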
def integrate_changes(articles, file_path, enriched_file=False): ''' Update existing entries in a previously created harvest file. Args: articles: A list of article dicts, as retured by openapc_toolkit.oai_harvest() file_path: Path to the CSV file the new values should be integrated into. enriched_file: If true, columns which are overwritten during enrichment will not be updated Returns: A tuple. The first element is a reduced list of article dicts, containing those which did not find a matching DOI in the file (Order preserved). The second element is the list of column headers encountered in the harvest file. ''' if not os.path.isfile(file_path): return (articles, None) enriched_blacklist = [ "institution", "publisher", "journal_full_title", "issn", "license_ref", "pmid" ] article_dict = OrderedDict() for article in articles: # This is possible because currently all repos use a local ID/record url, but it's just # a workaround. We might have to change to OAI record IDs later. url = article["url"] if oat.has_value(url): article_dict[url] = article updated_lines = [] fieldnames = None with open(file_path, "r") as f: reader = DictReader(f) fieldnames = reader.fieldnames updated_lines.append(list(fieldnames)) #header start_msg = "Integrating changes in harvest data into existing file {}" oat.print_g(start_msg.format(file_path)) for line in reader: url = line["url"] line_num = reader.reader.line_num msg = "Line {}: Checking for changes ({})" oat.print_b(msg.format(line_num, url)) if url in article_dict: for key, value in article_dict[url].items(): if enriched_file and key in enriched_blacklist: continue if key in line and value != line[key]: update_msg = 'Updating value in column {} ("{}" -> "{}")' oat.print_g(update_msg.format(key, line[key], value)) line[key] = value del (article_dict[url]) updated_line = [line[key] for key in fieldnames] updated_lines.append(updated_line) else: remove_msg = "URL {} no longer found in harvest data, removing article" oat.print_r(remove_msg.format(url)) with open(file_path, "w") as f: mask = oat.OPENAPC_STANDARD_QUOTEMASK if enriched_file else None writer = oat.OpenAPCUnicodeWriter(f, quotemask=mask, openapc_quote_rules=True, has_header=True) writer.write_rows(updated_lines) return (article_dict.values(), fieldnames)
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("index", type=int, help=ARG_HELP_STRINGS["index"]) parser.add_argument("-v", "--value", help=ARG_HELP_STRINGS["value"]) parser.add_argument("-f", "--file", help=ARG_HELP_STRINGS["file"]) parser.add_argument("-d", "--full_delete", action="store_true", help=ARG_HELP_STRINGS["full_delete"]) parser.add_argument("-i", "--ignore_case", action="store_true", help=ARG_HELP_STRINGS["ignore_case"]) parser.add_argument("-r", "--results_file", action="store_true", help=ARG_HELP_STRINGS["results_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() if args.value is None and args.file is None: parser.error("Either a single value (-v option) or a file of " + "multiple values (-f option) must be given.") values = [] if args.file: if not os.path.isfile(args.file): print("Error: '" + args.file + "' is no valid file!") sys.exit() with open(args.file, "r") as f: for line in f: if len(line) > 0: value = line.strip("\r\n") if args.ignore_case: values.append(value.lower()) else: values.append(value) oat.print_g(str(len(values)) + " values read from file") if args.value is not None: if args.ignore_case: values.append(args.value.lower()) else: values.append(args.value) if args.file: oat.print_y("Value argument given in addition to file " + "argument, adding value to file imports...") quote_rules = args.openapc_quote_rules enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) msg = "Encoding '{}' found in Python's codec collection as '{}'" print (msg.format(args.encoding, codec.name)) enc = args.encoding except LookupError: print ("Error: '" + args.encoding + "' not found Python's " + "codec collection. 
Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() header, content = oat.get_csv_file_content(args.csv_file, enc) mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print ("Error: A quotemask may only contain the letters 't' and" + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] empty_line = ['' for element in content[0]] column_name = "column " + str(args.index) if header: header_line = header[0] column_name = header_line[args.index] empty_line = ['' for element in header_line] msg = u"Performing line deletion on condition '{}' in {}".format(column_name, values) oat.print_g(msg) modified_content = [] deleted_lines = [] num_total_lines = num_deleted_lines = 0 for line in content: if len(line) == 0: continue num_total_lines += 1 current_value = line[args.index] if args.ignore_case: current_value = current_value.lower() if current_value not in values: modified_content.append(line) else: num_deleted_lines += 1 if not args.full_delete: modified_content.append(list(empty_line)) if args.results_file: deleted_lines.append(line) msg = u"Process complete, deleted {} out of {} total lines" oat.print_g(msg.format(num_deleted_lines, num_total_lines)) with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False) writer.write_rows(copy.deepcopy(header) + modified_content) if args.results_file and len(deleted_lines) > 0: with open('del.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False) writer.write_rows(copy.deepcopy(header) + deleted_lines)
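Several of these scripts accept a -q/--quotemask argument and turn it into a list of booleans in the same way; a small validated helper capturing that recurring idiom (the function name is only illustrative):

def parse_quotemask(quotemask):
    # "ttf" -> [True, True, False]; any character other than 't'/'f' is rejected.
    if quotemask.replace("f", "").replace("t", ""):
        raise ValueError("A quotemask may only contain the letters 't' and 'f'!")
    return [char == "t" for char in quotemask]

print(parse_quotemask("tfft"))   # -> [True, False, False, True]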
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-start", type=int, default=1, help=ARG_HELP_STRINGS["start"]) parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["start"]) args = parser.parse_args() handler = logging.StreamHandler(sys.stderr) handler.setFormatter(oat.ANSIColorFormatter()) bufferedHandler = oat.BufferedErrorHandler(handler) bufferedHandler.setFormatter(oat.ANSIColorFormatter()) logging.root.addHandler(handler) logging.root.addHandler(bufferedHandler) logging.root.setLevel(logging.INFO) enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) msg = "Encoding '{}' found in Python's codec collection as '{}'" oat.print_g(msg.format(args.encoding, codec.name)) enc = args.encoding except LookupError: msg = ("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") oat.print_r(msg) sys.exit() head, content = oat.get_csv_file_content(args.csv_file, enc) content = head + content line_num = 0 for line in content: line_num += 1 if args.start and args.start > line_num: continue if args.end and args.end < line_num: continue # Check hybrid status if line[4] != "TRUE": continue institution = line[0] period = line[1] doi = line[3] publisher = line[5] journal = line[6] for lpl in lpl_list: if lpl.publisher_matches(publisher): init_msg = (u"Line {}: Checking {} article from {}, published in '" + "{}'...").format(line_num, institution, period, journal) oat.print_b(init_msg) page_content = get_landingpage_content(doi, lpl) if page_content is None: continue pdf_link = lpl.search_for_oa(page_content) if pdf_link is None: error_msg = (u"No PDF link found! (line {}, DOI: " + "http://doi.org/{}").format(line_num, doi) logging.error(error_msg) elif pdf_link == "": warning_msg = (u"A RegexGroup matched, but no PDF " + "link was found! (line {}, DOI: " + "http://doi.org/{}").format(line_num, doi) logging.warning(warning_msg) else: oat.print_g(u"PDF link found: " + pdf_link) time.sleep(1) if not bufferedHandler.buffer: oat.print_g("\nLookup finished, all articles were accessible") else: oat.print_r("\nLookup finished, not all articles could be accessed:\n") # closing will implicitly flush the handler and print any buffered # messages to stderr bufferedHandler.close()
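oat.BufferedErrorHandler is not defined in this file; the effect it is used for here (echo errors as they happen, then repeat them in one block when the handler is closed at the end) can be approximated with the standard library's MemoryHandler. The sketch below is only an assumption about how to get a similar result, not the toolkit's implementation:

import logging
import logging.handlers
import sys

# 'handler' prints records immediately; 'buffered' collects them and replays
# them via its target when it is closed at the end of the run.
handler = logging.StreamHandler(sys.stderr)
buffered = logging.handlers.MemoryHandler(capacity=10000,
                                          flushLevel=logging.CRITICAL + 1,
                                          target=handler)
logging.root.addHandler(handler)
logging.root.addHandler(buffered)
logging.root.setLevel(logging.INFO)

logging.error("No PDF link found! (line 12, DOI: https://doi.org/10.1000/example)")
# ... main loop ...
if buffered.buffer:
    print("\nLookup finished, not all articles could be accessed:\n", file=sys.stderr)
buffered.close()   # flushes the collected records to stderr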
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-v", "--verbose", action="store_true", help=ARG_HELP_STRINGS["verbose"]) parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"]) parser.add_argument("-i", "--ignore-header", action="store_true", help=ARG_HELP_STRINGS["headers"]) parser.add_argument("-f", "--force", action="store_true", help=ARG_HELP_STRINGS["force"]) parser.add_argument("-b", "--bypass-cert-verification", action="store_true", help=ARG_HELP_STRINGS["bypass"]) parser.add_argument("-institution", "--institution_column", type=int, help=ARG_HELP_STRINGS["institution"]) parser.add_argument("-period", "--period_column", type=int, help=ARG_HELP_STRINGS["period"]) parser.add_argument("-doi", "--doi_column", type=int, help=ARG_HELP_STRINGS["doi"]) parser.add_argument("-euro", "--euro_column", type=int, help=ARG_HELP_STRINGS["euro"]) parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int, help=ARG_HELP_STRINGS["is_hybrid"]) parser.add_argument("-publisher", "--publisher_column", type=int, help=ARG_HELP_STRINGS["publisher"]) parser.add_argument("-journal_full_title", "--journal_full_title_column", type=int, help=ARG_HELP_STRINGS["journal_full_title"]) parser.add_argument("-issn", "--issn_column", type=int, help=ARG_HELP_STRINGS["issn"]) parser.add_argument("-url", "--url_column", type=int, help=ARG_HELP_STRINGS["url"]) args = parser.parse_args() enc = None # CSV file encoding if args.locale: norm = locale.normalize(args.locale) if norm != args.locale: print "locale '{}' not found, normalized to '{}'".format( args.locale, norm) try: loc = locale.setlocale(locale.LC_ALL, norm) print "Using locale", loc except locale.Error as loce: print "Setting locale to " + norm + " failed: " + loce.message sys.exit() if args.encoding: try: codec = codecs.lookup(args.encoding) print ("Encoding '{}' found in Python's codec collection " + "as '{}'").format(args.encoding, codec.name) enc = args.encoding except LookupError: print ("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() result = oat.analyze_csv_file(args.csv_file) if result["success"]: csv_analysis = result["data"] print csv_analysis else: print result["error_msg"] sys.exit() if enc is None: enc = csv_analysis.enc dialect = csv_analysis.dialect has_header = csv_analysis.has_header if enc is None: print ("Error: No encoding given for CSV file and automated " + "detection failed. 
Please set the encoding manually via the " + "--enc argument") sys.exit() csv_file = open(args.csv_file, "r") reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc) first_row = reader.next() num_columns = len(first_row) print "\nCSV file has {} columns.".format(num_columns) csv_file.seek(0) reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc) column_map = OrderedDict([ ("institution", CSVColumn("institution", CSVColumn.MANDATORY, args.institution_column)), ("period", CSVColumn("period", CSVColumn.MANDATORY, args.period_column)), ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column)), ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column)), ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column)), ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column)), ("journal_full_title", CSVColumn("journal_full_title", CSVColumn.OPTIONAL, args.journal_full_title_column)), ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column)), ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None)), ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE, None)), ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None)), ("indexed_in_crossref", CSVColumn("indexed_in_crossref", CSVColumn.NONE, None)), ("pmid", CSVColumn("pmid", CSVColumn.NONE, None)), ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None)), ("ut", CSVColumn("ut", CSVColumn.NONE, None)), ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column)), ("doaj", CSVColumn("doaj", CSVColumn.NONE, None)) ]) # Do not quote the values in the 'period' and 'euro' columns quotemask = [ True, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, ] header = None if has_header: for row in reader: if not row: # Skip empty lines continue header = row # First non-empty row should be the header if args.ignore_header: print "Skipping header analysis due to command line argument." break else: print "\n *** Analyzing CSV header ***\n" for (index, item) in enumerate(header): column_type = oat.get_column_type_from_whitelist(item) if column_type is not None and column_map[column_type].index is None: column_map[column_type].index = index column_map[column_type].column_name = item print ("Found column named '{}' at index {}, " + "assuming this to be the {} column.").format( item, index, column_type) break print "\n *** Starting heuristical analysis ***\n" for row in reader: if not row: # Skip empty lines # We analyze the first non-empty line, a possible header should # have been processed by now. 
continue column_candidates = { "doi": [], "period": [], "euro": [] } for (index, entry) in enumerate(row): if index in [csvcolumn.index for csvcolumn in column_map.values()]: # Skip columns already assigned continue entry = entry.strip() # Search for a DOI if column_map['doi'].index is None: if oat.DOI_RE.match(entry): column_id = str(index) # identify column either numerical or by column header if header: column_id += " ('" + header[index] + "')" print ("The entry in column {} looks like a " + "DOI: {}").format(column_id, entry) column_candidates['doi'].append(index) continue # Search for a potential year string if column_map['period'].index is None: try: maybe_period = int(entry) now = datetime.date.today().year # Should be a wide enough margin if maybe_period >= 2000 and maybe_period <= now + 2: column_id = str(index) if header: column_id += " ('" + header[index] + "')" print ("The entry in column {} looks like a " + "potential period: {}").format(column_id, entry) column_candidates['period'].append(index) continue except ValueError: pass # Search for a potential monetary amount if column_map['euro'].index is None: try: maybe_euro = locale.atof(entry) # Are there APCs above 6000€ ?? if maybe_euro >= 10 and maybe_euro <= 6000: column_id = str(index) if header: column_id += " ('" + header[index] + "')" print ("The entry in column {} looks like a " + "potential euro amount: {}").format(column_id, entry) column_candidates['euro'].append(index) continue except ValueError: pass for column_type, candidates in column_candidates.iteritems(): if column_map[column_type].index is not None: continue if len(candidates) > 1: print ("Could not reliably identify the '" + column_type + "' column - more than one possible candiate!") elif len(candidates) < 1: print "No candidate found for column '" + column_type + "'!" else: index = candidates.pop() column_map[column_type].index = index if header: column_id = header[index] column_map[column_type].column_name = column_id else: column_id = index print ("Assuming column '{}' to be the '{}' " + "column.").format(column_id, column_type) column_map[column_type].index = index break # Wrap up: Check if there any mandatory column types left which have not # yet been identified - we cannot continue in that case (unless forced). unassigned = filter(lambda (k, v): v.requirement == CSVColumn.MANDATORY and v.index is None, column_map.iteritems()) if unassigned: for item in unassigned: print "The {} column is still unidentified.".format(item[0]) if header: print "The CSV header is:\n" + dialect.delimiter.join(header) if not args.force: print ("ERROR: We cannot continue because not all mandatory " + "column types in the CSV file could be automatically " + "identified. 
There are 2 ways to fix this:") if not header: print ("1) Add a header row to your file and identify the " + "column(s) by assigning them an appropiate column name.") else: print ("1) Identify the missing column(s) by assigning them " + "a different column name in the CSV header (You can " + "use the column name(s) mentioned in the message above)") print ("2) Use command line parameters when calling this script " + "to identify the missing columns (use -h for help) ") sys.exit() else: print ("WARNING: Not all mandatory column types in the CSV file " + "could be automatically identified - forced to continue.") print "\n *** CSV file analysis summary ***\n" index_dict = {csvc.index: csvc for csvc in column_map.values()} for index in range(num_columns): column_name = "" if header: column_name = header[index] if index in index_dict: column = index_dict[index] msg = "column number {} ({}) is the {} column '{}'".format( index, column_name, column.requirement, column.column_type) if column.requirement in [CSVColumn.MANDATORY, CSVColumn.OPTIONAL]: oat.print_g(msg) else: oat.print_b(msg) else: msg = ("column number {} ({}) is an unknown column, it will be " + "appended to the generated CSV file") oat.print_y(msg.format(index, column_name)) if not column_name: # Use a generic name column_name = "unknown" while column_name in column_map.keys(): # TODO: Replace by a numerical, increasing suffix column_name += "_" column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE, index) print "" for column in column_map.values(): if column.index is None: msg = "The {} column '{}' could not be identified." print msg.format(column.requirement, column.column_type) # Check for unassigned optional column types. We can continue but should # issue a warning as all entries will need a valid DOI in this case. unassigned = filter(lambda (k, v): v.requirement == CSVColumn.OPTIONAL and v.index is None, column_map.iteritems()) if unassigned: print ("\nWARNING: Not all optional column types could be " + "identified. Metadata aggregation is still possible, but " + "every entry in the CSV file will need a valid DOI.") start = raw_input("\nStart metadata aggregation? (y/n):") while start not in ["y", "n"]: start = raw_input("Please type 'y' or 'n':") if start == "n": sys.exit() print "\n *** Starting metadata aggregation ***\n" enriched_content = [] error_messages = [] csv_file.seek(0) reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc) header_processed = False row_num = 0 for row in reader: row_num += 1 if not row: continue # skip empty lines if not header_processed: header_processed = True enriched_content.append(column_map.keys()) if has_header: # If the CSV file has a header, we are currently there - skip it # to get to the first data row continue print "---Processing line number " + str(row_num) + "---" if len(row) != num_columns: error_msg = ("Syntax: the number of values in line {} ({}) " + "differs from the number of columns ({}). 
Line left " + "unchanged, please correct the error in the result " + "file and re-run.") error_msg_fmt = error_msg.format(row_num, len(row), num_columns) error_messages.append("Line {}: {}".format(row_num, error_msg_fmt)) oat.print_r(error_msg_fmt) enriched_content.append(row) continue doi = row[column_map["doi"].index] current_row = OrderedDict() # Copy content of identified columns for csv_column in column_map.values(): if csv_column.index is not None and len(row[csv_column.index]) > 0: if csv_column.column_type == "euro": # special case for monetary values: Cast to float to ensure # the decimal point is a dot (instead of a comma) euro_value = row[csv_column.index] try: euro = locale.atof(euro_value) if euro.is_integer(): euro = int(euro) current_row[csv_column.column_type] = str(euro) except ValueError: msg = ERROR_MSGS["locale"].format(euro_value, csv_column.index) oat.print_r(msg) sys.exit() else: current_row[csv_column.column_type] = row[csv_column.index] else: current_row[csv_column.column_type] = "NA" # include crossref metadata crossref_result = oat.get_metadata_from_crossref(doi) if crossref_result["success"]: print "Crossref: DOI resolved: " + doi current_row["indexed_in_crossref"] = "TRUE" data = crossref_result["data"] for key, value in data.iteritems(): if value is not None: if key == "journal_full_title": unified_value = oat.get_unified_journal_title(value) if unified_value != value: msg = INFO_MSGS["unify"].format("journal title", value, unified_value) oat.print_b(msg) new_value = unified_value elif key == "publisher": unified_value = oat.get_unified_publisher_name(value) if unified_value != value: msg = INFO_MSGS["unify"].format("publisher name", value, unified_value) oat.print_b(msg) new_value = unified_value else: new_value = value else: new_value = "NA" if args.verbose: print (u"WARNING: Element '{}' not found in in " + "response for doi {}.").format(key, doi) old_value = current_row[key] current_row[key] = column_map[key].check_overwrite(old_value, new_value) else: error_msg = ("Crossref: Error while trying to resolve DOI " + doi + ": " + crossref_result["error_msg"]) oat.print_r(error_msg) error_messages.append("Line {}: {}".format(row_num, error_msg)) current_row["indexed_in_crossref"] = "FALSE" # include pubmed metadata pubmed_result = oat.get_metadata_from_pubmed(doi) if pubmed_result["success"]: print "Pubmed: DOI resolved: " + doi data = pubmed_result["data"] for key, value in data.iteritems(): if value is not None: new_value = value else: new_value = "NA" if args.verbose: print (u"WARNING: Element '{}' not found in in " + "response for doi {}.").format(key, doi) old_value = current_row[key] current_row[key] = column_map[key].check_overwrite(old_value, new_value) else: error_msg = ("Pubmed: Error while trying to resolve DOI " + doi + ": " + pubmed_result["error_msg"]) oat.print_r(error_msg) error_messages.append("Line {}: {}".format(row_num, error_msg)) # lookup in DOAJ. try the EISSN first, then ISSN and finally print ISSN if current_row["doaj"] != "TRUE": issns = [] if current_row["issn_electronic"] != "NA": issns.append(current_row["issn_electronic"]) if current_row["issn"] != "NA": issns.append(current_row["issn"]) if current_row["issn_print"] != "NA": issns.append(current_row["issn_print"]) for issn in issns: doaj_res = oat.lookup_journal_in_doaj(issn, args.bypass_cert_verification) if doaj_res["data_received"]: if doaj_res["data"]["in_doaj"]: msg = "DOAJ: Journal ISSN ({}) found in DOAJ ('{}')." 
print msg.format(issn, doaj_res["data"]["title"]) current_row["doaj"] = "TRUE" break else: msg = "DOAJ: Journal ISSN ({}) not found in DOAJ." current_row["doaj"] = "FALSE" print msg.format(issn) else: msg = "DOAJ: Error while trying to look up ISSN {}: {}" msg_fmt = msg.format(issn, doaj_res["error_msg"]) oat.print_r(msg_fmt) error_messages.append("Line {}: {}".format(row_num, msg_fmt)) enriched_content.append(current_row.values()) csv_file.close() with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, quotemask, True, True) writer.write_rows(enriched_content) if not error_messages: oat.print_g("Metadata enrichment successful, no errors occured") else: oat.print_r("There were errors during the enrichment process:\n") for msg in error_messages: print msg + "\n"
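One enrichment step above is worth isolating: monetary values are parsed with the active locale and re-serialised with a dot as decimal separator, collapsing whole amounts to integers. A minimal standalone sketch, assuming a German locale purely as an example (it must be installed on the system, otherwise locale.Error is raised):

import locale

def normalise_euro(value, loc="de_DE.UTF-8"):
    # 'de_DE.UTF-8' is only an example locale for comma decimal separators.
    locale.setlocale(locale.LC_ALL, loc)
    euro = locale.atof(value)          # "1.234,56" -> 1234.56 under de_DE
    return str(int(euro)) if euro.is_integer() else str(euro)

print(normalise_euro("1.234,56"))      # -> "1234.56"
print(normalise_euro("1.200,00"))      # -> "1200"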
def main(): parser = argparse.ArgumentParser() parser.add_argument("apc_file", help=ARG_HELP_STRINGS["apc_file"]) parser.add_argument("issn_l_file", help=ARG_HELP_STRINGS["issn_l_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() quote_rules = args.openapc_quote_rules mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print ("Error: A quotemask may only contain the letters 't' and" + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) print ("Encoding '{}' found in Python's codec collection " + "as '{}'").format(args.encoding, codec.name) enc = args.encoding except LookupError: print ("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() result = oat.analyze_csv_file(args.apc_file, 500) if result["success"]: csv_analysis = result["data"] print csv_analysis else: print result["error_msg"] sys.exit() if enc is None: enc = csv_analysis.enc if enc is None: print ("Error: No encoding given for CSV file and automated " + "detection failed. Please set the encoding manually via the " + "--enc argument") sys.exit() dialect = csv_analysis.dialect has_header = csv_analysis.has_header csv_file = open(args.apc_file, "r") reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc) oat.print_g("Preparing mapping table...") itself = other = 0 issn_l_re = re.compile("^(?P<issn>\d{4}-\d{3}[\dxX])\t(?P<issn_l>\d{4}-\d{3}[\dxX])$") issn_l_file = open(args.issn_l_file, "r") issn_l_dict = {} for i, line in enumerate(issn_l_file): if i % 100000 == 0: print str(i) + " lines processed." match = issn_l_re.match(line) if match: match_dict = match.groupdict() issn_l_dict[match_dict['issn']] = match_dict['issn_l'] if match_dict['issn'] == match_dict['issn_l']: itself += 1 else: other += 1 print str(itself) + " ISSNs pointing to itself as ISSN-L, " + str(other) + " to another value." 
oat.print_g("Starting enrichment...") issn_matches = issn_p_matches = issn_e_matches = unmatched = different = corrections = 0 enriched_lines = [] for line in reader: if len(line) == 0: enriched_lines.append(line) continue issn = reformat_issn(line[7]) issn_p = reformat_issn(line[8]) issn_e = reformat_issn(line[9]) target = None if issn in issn_l_dict: target = issn_l_dict[issn] corrected_target = oat.get_corrected_issn_l(target) if corrected_target != target: corrections += 1 line[10] = corrected_target issn_matches += 1 elif issn_p in issn_l_dict: target = issn_l_dict[issn_p] corrected_target = oat.get_corrected_issn_l(target) if corrected_target != target: corrections += 1 line[10] = corrected_target issn_p_matches += 1 elif issn_e in issn_l_dict: target = issn_l_dict[issn_e] corrected_target = oat.get_corrected_issn_l(target) if corrected_target != target: corrections += 1 line[10] = corrected_target issn_e_matches += 1 else: unmatched += 1 if target is not None and target not in [issn, issn_p, issn_e]: different += 1 enriched_lines.append(line) print "{} issn_l values mapped by issn, {} by issn_p, {} by issn_e. {} could not be assigned.\n{} issn_l values were corrected during the process.\n In {} cases the ISSN-L was different from all existing ISSN values".format(issn_matches, issn_p_matches, issn_e_matches, unmatched, corrections, different) with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, has_header) writer.write_rows(enriched_lines)
return "HTTPError: {} - {}".format(code, httpe.reason) except urllib2.URLError as urle: return "URLError: {}".format(urle.reason) except ET.ParseError as etpe: return "ElementTree ParseError: {}".format(str(etpe)) parser = argparse.ArgumentParser() parser.add_argument("doi_or_file", help="An OpenAPC-compatible CSV file or a single DOI to look up in crossref.") args = parser.parse_args() arg = args.doi_or_file if os.path.isfile(arg): csv_file = open(arg, "r") reader = oat.UnicodeReader(csv_file) line_number = 0 for line in reader: if not line: prefix = "" else: prefix = get_prefix(line[3]) result = str(line_number) + ": " + prefix if prefix == "Springer (Biomed Central Ltd.)": oat.print_g(result) elif prefix == "Nature Publishing Group": oat.print_r(result) else: print result line_number += 1 else: print get_prefix(arg)
def main(): parser = argparse.ArgumentParser() parser.add_argument("source_file") parser.add_argument("source_column", type=int) parser.add_argument("currency_column", type=int) parser.add_argument("period_column", type=int) parser.add_argument("target_column", type=int) parser.add_argument("-f", "--force_overwrite", action="store_true") parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() quote_rules = args.openapc_quote_rules mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print ("Error: A quotemask may only contain the letters 't' and" + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) msg = "Encoding '{}' found in Python's codec collection as '{}'" oat.print_g(msg.format(args.encoding, codec.name)) enc = args.encoding except LookupError: print ("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() if args.locale: norm = locale.normalize(args.locale) if norm != args.locale: msg = "locale '{}' not found, normalised to '{}'".format( args.locale, norm) oat.print_y(msg) try: loc = locale.setlocale(locale.LC_ALL, norm) oat.print_g("Using locale " + loc) except locale.Error as loce: msg = "Setting locale to {} failed: {}".format(norm, loce.message) oat.print_r(msg) sys.exit() header, content = oat.get_csv_file_content(args.source_file, enc, True) fieldnames = header.pop() modified_content = [] line_num = 0 for column_type in ["source_column", "currency_column", "period_column", "target_column"]: index = getattr(args, column_type) msg = "Column {} ('{}') is the {}." oat.print_g(msg.format(index, fieldnames[index], column_type)) start = input("\nStart conversion? (y/n):") while start not in ["y", "n"]: start = input("Please type 'y' or 'n':") if start == "n": sys.exit() for line in content: line_num += 1 if not oat.has_value(line[args.source_column]): oat.print_y("WARNING: No source value found in line " + str(line_num) + ", skipping...") modified_content.append(line) continue monetary_value = None try: monetary_value = locale.atof(line[args.source_column]) except ValueError: msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..." oat.print_y(msg.format(line_num, line[args.source_column])) modified_content.append(line) continue currency = line[args.currency_column] if not oat.has_value(currency): msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..." oat.print_y(msg.format(line_num, currency)) modified_content.append(line) continue period = line[args.period_column] frequency = get_frequency(period) if frequency is None: msg = "WARNING: Could not extract a valid date string from period column in line {} ('{}'), skipping..." 
oat.print_y(msg.format(line_num, period)) modified_content.append(line) continue if currency not in EXCHANGE_RATES[frequency]: msg = 'No exchange rates ({}) found for currency "{}", querying ECB data warehouse...' oat.print_b(msg.format(frequency, currency)) rates = oat.get_euro_exchange_rates(currency, frequency) EXCHANGE_RATES[frequency][currency] = rates try: rate = EXCHANGE_RATES[frequency][currency][period] except KeyError: msg = "ERROR: No conversion rate found for currency {} for period {} (line {}), aborting..." oat.print_r(msg.format(currency, period, line_num)) sys.exit() euro_value = round(monetary_value/float(rate), 2) line[args.target_column] = str(euro_value) msg = "Line {}: {} exchange rate ({}) for date {} is {} -> {} / {} = {} EUR" msg = msg.format(line_num, currency, frequency, period, rate, monetary_value, rate, euro_value) oat.print_g(msg) modified_content.append(line) with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True) writer.write_rows([fieldnames] + modified_content)
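# A hedged sketch of the conversion step above, assuming exchange rates have already
# been fetched into the same nested structure the script uses
# (frequency -> currency -> period -> rate against the Euro). The rate below is made
# up, and the locale name in the usage comment is an assumption (it must be installed
# on the system).
import locale

EXCHANGE_RATES = {"D": {"USD": {"2020-01-02": "1.1193"}}}   # dummy daily rate

def convert_to_euro(value_str, currency, period, frequency="D"):
    monetary_value = locale.atof(value_str)                 # parse using the active locale
    rate = EXCHANGE_RATES[frequency][currency][period]
    return round(monetary_value / float(rate), 2)

# locale.setlocale(locale.LC_ALL, "en_US.UTF-8")            # assumption: locale is available
# convert_to_euro("1,500.00", "USD", "2020-01-02")          # -> 1340.12 with the dummy rate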
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("-O", "--offsetting_mode", help=ARG_HELP_STRINGS["offsetting"]) parser.add_argument("-b", "--bypass-cert-verification", action="store_true", help=ARG_HELP_STRINGS["bypass"]) parser.add_argument("-d", "--offline_doaj", help=ARG_HELP_STRINGS["offline_doaj"]) parser.add_argument("-D", "--offline_doaj_download", help=ARG_HELP_STRINGS["offline_doaj_download"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-f", "--force", action="store_true", help=ARG_HELP_STRINGS["force"]) parser.add_argument("-i", "--ignore-header", action="store_true", help=ARG_HELP_STRINGS["ignore_header"]) parser.add_argument("-j", "--force-header", action="store_true", help=ARG_HELP_STRINGS["force_header"]) parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"]) parser.add_argument("-u", "--add-unknown-columns", action="store_true", help=ARG_HELP_STRINGS["unknown_columns"]) parser.add_argument("-v", "--verbose", action="store_true", help=ARG_HELP_STRINGS["verbose"]) parser.add_argument("-o", "--overwrite", action="store_true", help=ARG_HELP_STRINGS["overwrite"]) parser.add_argument("-r", "--round_monetary", action="store_true", help=ARG_HELP_STRINGS["round_monetary"]) parser.add_argument("--no-crossref", action="store_true", help=ARG_HELP_STRINGS["no_crossref"]) parser.add_argument("--no-pubmed", action="store_true", help=ARG_HELP_STRINGS["no_pubmed"]) parser.add_argument("--no-doaj", action="store_true", help=ARG_HELP_STRINGS["no_doaj"]) parser.add_argument("-institution", "--institution_column", type=int, help=ARG_HELP_STRINGS["institution"]) parser.add_argument("-period", "--period_column", type=int, help=ARG_HELP_STRINGS["period"]) parser.add_argument("-doi", "--doi_column", type=int, help=ARG_HELP_STRINGS["doi"]) parser.add_argument("-euro", "--euro_column", type=int, help=ARG_HELP_STRINGS["euro"]) parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int, help=ARG_HELP_STRINGS["is_hybrid"]) parser.add_argument("-publisher", "--publisher_column", type=int, help=ARG_HELP_STRINGS["publisher"]) parser.add_argument("-journal_full_title", "--journal_full_title_column", type=int, help=ARG_HELP_STRINGS["journal_full_title"]) parser.add_argument("-issn", "--issn_column", type=int, help=ARG_HELP_STRINGS["issn"]) parser.add_argument("-url", "--url_column", type=int, help=ARG_HELP_STRINGS["url"]) parser.add_argument("-start", type=int, help=ARG_HELP_STRINGS["start"]) parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"]) parser.add_argument("-q", "--quotemask", default="tfftttttttttttttttt", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-n", "--no-openapc-quote-rules", help=ARG_HELP_STRINGS["no_openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() handler = logging.StreamHandler(sys.stderr) handler.setFormatter(oat.ANSIColorFormatter()) bufferedHandler = oat.BufferedErrorHandler(handler) bufferedHandler.setFormatter(oat.ANSIColorFormatter()) logging.root.addHandler(handler) logging.root.addHandler(bufferedHandler) logging.root.setLevel(logging.INFO) if args.offline_doaj and args.offline_doaj_download: oat.print_r("Error: Either use the -d or the -D option, not both.") sys.exit() if args.locale: norm = locale.normalize(args.locale) if norm != args.locale: msg = "locale '{}' not found, normalised to '{}'".format( args.locale, norm) oat.print_y(msg) try: loc = 
locale.setlocale(locale.LC_ALL, norm) oat.print_g("Using locale " + loc) except locale.Error as loce: msg = "Setting locale to {} failed: {}".format(norm, loce.message) oat.print_r(msg) sys.exit() enc = None # CSV file encoding if args.encoding: try: codec = codecs.lookup(args.encoding) msg = ("Encoding '{}' found in Python's codec collection " + "as '{}'").format(args.encoding, codec.name) oat.print_g(msg) enc = args.encoding except LookupError: msg = ("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") oat.print_r(msg) sys.exit() result = oat.analyze_csv_file(args.csv_file, enc=enc) if result["success"]: csv_analysis = result["data"] print(csv_analysis) else: print(result["error_msg"]) sys.exit() if enc is None: enc = csv_analysis.enc dialect = csv_analysis.dialect has_header = csv_analysis.has_header or args.force_header if enc is None: print("Error: No encoding given for CSV file and automated " + "detection failed. Please set the encoding manually via the " + "--enc argument") sys.exit() reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print("Error: A quotemask may only contain the letters 't' and " + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] doaj_offline_analysis = None if args.offline_doaj: if os.path.isfile(args.offline_doaj): doaj_offline_analysis = oat.DOAJOfflineAnalysis(args.offline_doaj) else: oat.print_r("Error: " + args.offline_doaj + " does not seem " "to be a file!") sys.exit() elif args.offline_doaj_download: if os.path.isfile(args.offline_doaj_download): oat.print_r("Error: Target file '" + args.offline_doaj_download + "' already exists!") sys.exit() doaj_offline_analysis = oat.DOAJOfflineAnalysis(args.offline_doaj_download, download=True) csv_file = open(args.csv_file, "r", encoding=enc) reader = csv.reader(csv_file, dialect=dialect) first_row = next(reader) num_columns = len(first_row) print("\nCSV file has {} columns.".format(num_columns)) csv_file.seek(0) reader = csv.reader(csv_file, dialect=dialect) if args.overwrite: ow_strategy = CSVColumn.OW_ALWAYS else: ow_strategy = CSVColumn.OW_ASK openapc_column_map = OrderedDict([ ("institution", CSVColumn("institution", CSVColumn.MANDATORY, args.institution_column, overwrite=ow_strategy)), ("period", CSVColumn("period", CSVColumn.MANDATORY, args.period_column, overwrite=ow_strategy)), ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column, overwrite=ow_strategy)), ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column, overwrite=ow_strategy)), ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column, overwrite=ow_strategy)), ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column, overwrite=ow_strategy)), ("journal_full_title", CSVColumn("journal_full_title", CSVColumn.OPTIONAL, args.journal_full_title_column, overwrite=ow_strategy)), ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column, overwrite=ow_strategy)), ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None, overwrite=ow_strategy)), ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE, None, overwrite=ow_strategy)), ("issn_l", CSVColumn("issn_l", CSVColumn.NONE, None, overwrite=ow_strategy)), ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None, overwrite=ow_strategy)), ("indexed_in_crossref", 
CSVColumn("indexed_in_crossref", CSVColumn.NONE, None, overwrite=ow_strategy)), ("pmid", CSVColumn("pmid", CSVColumn.NONE, None, overwrite=ow_strategy)), ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None, overwrite=ow_strategy)), ("ut", CSVColumn("ut", CSVColumn.NONE, None, overwrite=ow_strategy)), ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column, overwrite=ow_strategy)), ("doaj", CSVColumn("doaj", CSVColumn.NONE, None, overwrite=ow_strategy)) ]) offsetting_column_map = OrderedDict([ ("institution", CSVColumn("institution", CSVColumn.MANDATORY, args.institution_column, overwrite=ow_strategy)), ("period", CSVColumn("period", CSVColumn.MANDATORY, args.period_column, overwrite=ow_strategy)), ("euro", CSVColumn("euro", CSVColumn.NONE, args.euro_column, overwrite=ow_strategy)), ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column, overwrite=ow_strategy)), ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column, overwrite=ow_strategy)), ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column, overwrite=ow_strategy)), ("journal_full_title", CSVColumn("journal_full_title", CSVColumn.OPTIONAL, args.journal_full_title_column, overwrite=ow_strategy)), ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column, overwrite=ow_strategy)), ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None, overwrite=ow_strategy)), ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE, None, overwrite=ow_strategy)), ("issn_l", CSVColumn("issn_l", CSVColumn.NONE, None, overwrite=ow_strategy)), ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None, overwrite=ow_strategy)), ("indexed_in_crossref", CSVColumn("indexed_in_crossref", CSVColumn.NONE, None, overwrite=ow_strategy)), ("pmid", CSVColumn("pmid", CSVColumn.NONE, None, overwrite=ow_strategy)), ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None, overwrite=ow_strategy)), ("ut", CSVColumn("ut", CSVColumn.NONE, None, overwrite=ow_strategy)), ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column, overwrite=ow_strategy)), ("doaj", CSVColumn("doaj", CSVColumn.NONE, None, overwrite=ow_strategy)), ("agreement", CSVColumn("agreement", CSVColumn.NONE, None, overwrite=ow_strategy)), ]) if args.offsetting_mode: column_map = offsetting_column_map else: column_map = openapc_column_map header = None if has_header: for row in reader: if not row: # Skip empty lines continue header = row # First non-empty row should be the header if args.ignore_header: print("Skipping header analysis due to command line argument.") break else: print("\n *** Analyzing CSV header ***\n") for (index, item) in enumerate(header): column_type = oat.get_column_type_from_whitelist(item) if column_type is not None and column_map[column_type].index is None: column_map[column_type].index = index column_map[column_type].column_name = item found_msg = ("Found column named '{}' at index {}, " + "assuming this to be the {} column.") print(found_msg.format(item, index, column_type)) break print("\n *** Starting heuristical analysis ***\n") for row in reader: if not row: # Skip empty lines # We analyze the first non-empty line, a possible header should # have been processed by now. 
continue column_candidates = { "doi": [], "period": [], "euro": [] } found_msg = "The entry in column {} looks like a potential {}: {}" for (index, entry) in enumerate(row): if index in [csvcolumn.index for csvcolumn in column_map.values()]: # Skip columns already assigned continue entry = entry.strip() # Search for a DOI if column_map['doi'].index is None: if oat.DOI_RE.match(entry): column_id = str(index) # identify column either numerically or by column header if header: column_id += " ('" + header[index] + "')" print(found_msg.format(column_id, "DOI", entry)) column_candidates['doi'].append(index) continue # Search for a potential year string if column_map['period'].index is None: try: maybe_period = int(entry) now = datetime.date.today().year # Should be a wide enough margin if maybe_period >= 2000 and maybe_period <= now + 2: column_id = str(index) if header: column_id += " ('" + header[index] + "')" print(found_msg.format(column_id, "year", entry)) column_candidates['period'].append(index) continue except ValueError: pass # Search for a potential monetary amount if column_map['euro'].index is None: try: maybe_euro = locale.atof(entry) if maybe_euro >= 10 and maybe_euro <= 10000: column_id = str(index) if header: column_id += " ('" + header[index] + "')" print (found_msg.format(column_id, "euro amount", entry)) column_candidates['euro'].append(index) continue except ValueError: pass for column_type, candidates in column_candidates.items(): if column_map[column_type].index is not None: continue if len(candidates) > 1: print("Could not reliably identify the '" + column_type + "' column - more than one possible candiate!") elif len(candidates) < 1: print("No candidate found for column '" + column_type + "'!") else: index = candidates.pop() column_map[column_type].index = index if header: column_id = header[index] column_map[column_type].column_name = column_id else: column_id = index msg = "Assuming column '{}' to be the '{}' column." print(msg.format(column_id, column_type)) column_map[column_type].index = index break # Wrap up: Check if there any mandatory column types left which have not # yet been identified - we cannot continue in that case (unless forced). unassigned = [x for x in iter(column_map.items()) if x[1].requirement == CSVColumn.MANDATORY and x[1].index is None] if unassigned: for item in unassigned: print("The {} column is still unidentified.".format(item[0])) if header: print("The CSV header is:\n" + dialect.delimiter.join(header)) if not args.force: print("ERROR: We cannot continue because not all mandatory " + "column types in the CSV file could be automatically " + "identified. 
There are 2 ways to fix this:") if not header: print("1) Add a header row to your file and identify the " + "column(s) by assigning them an appropriate column name.") else: print("1) Identify the missing column(s) by assigning them " + "a different column name in the CSV header (You can " + "use the column name(s) mentioned in the message above)") print("2) Use command line parameters when calling this script " + "to identify the missing columns (use -h for help) ") sys.exit() else: print("WARNING: Not all mandatory column types in the CSV file " + "could be automatically identified - forced to continue.") print("\n *** CSV file analysis summary ***\n") index_dict = {csvc.index: csvc for csvc in column_map.values()} for index in range(num_columns): column_name = "" if header: column_name = header[index] if index in index_dict: column = index_dict[index] msg = u"column number {} ({}) is the {} column '{}'".format( index, column_name, column.requirement, column.column_type) if column.requirement in [CSVColumn.MANDATORY, CSVColumn.OPTIONAL]: oat.print_g(msg) else: oat.print_b(msg) else: if args.add_unknown_columns: msg = (u"column number {} ({}) is an unknown column, it will be " + "appended to the generated CSV file") oat.print_y(msg.format(index, column_name)) if not column_name: # Use a generic name column_name = "unknown" while column_name in column_map.keys(): # TODO: Replace by a numerical, increasing suffix column_name += "_" column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE, index) else: msg = (u"column number {} ({}) is an unknown column, it will be " + "ignored") oat.print_y(msg.format(index, column_name)) print() for column in column_map.values(): if column.index is None: msg = "The {} column '{}' could not be identified." print(msg.format(column.requirement, column.column_type)) # Check for unassigned optional column types. We can continue but should # issue a warning as all entries will need a valid DOI in this case. unassigned = [x for x in iter(column_map.items()) if x[1].requirement == CSVColumn.OPTIONAL and x[1].index is None] if unassigned: print ("\nWARNING: Not all optional column types could be " + "identified. Metadata aggregation is still possible, but " + "every entry in the CSV file will need a valid DOI.") start = input("\nStart metadata aggregation? 
(y/n):") while start not in ["y", "n"]: start = input("Please type 'y' or 'n':") if start == "n": sys.exit() print("\n *** Starting metadata aggregation ***\n") enriched_content = [] csv_file.seek(0) reader = csv.reader(csv_file, dialect=dialect) header_processed = False row_num = 0 for row in reader: row_num += 1 if not row: continue # skip empty lines if not header_processed: header_processed = True enriched_content.append(list(column_map.keys())) if has_header: # If the CSV file has a header, we are currently there - skip it # to get to the first data row continue if args.start and args.start > row_num: continue if args.end and args.end < row_num: continue print("---Processing line number " + str(row_num) + "---") enriched_row = oat.process_row(row, row_num, column_map, num_columns, args.no_crossref, args.no_pubmed, args.no_doaj, doaj_offline_analysis, args.round_monetary, args.offsetting_mode) enriched_content.append(enriched_row) csv_file.close() with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, not args.no_openapc_quote_rules, True, True) writer.write_rows(enriched_content) if not bufferedHandler.buffer: oat.print_g("Metadata enrichment successful, no errors occured") else: oat.print_r("There were errors during the enrichment process:\n") # closing will implicitly flush the handler and print any buffered # messages to stderr bufferedHandler.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("index", type=int, help=ARG_HELP_STRINGS["index"]) parser.add_argument("-v", "--value", help=ARG_HELP_STRINGS["value"]) parser.add_argument("-f", "--file", help=ARG_HELP_STRINGS["file"]) parser.add_argument("-d", "--full_delete", action="store_true", help=ARG_HELP_STRINGS["full_delete"]) parser.add_argument("-i", "--ignore_case", action="store_true", help=ARG_HELP_STRINGS["ignore_case"]) parser.add_argument("-r", "--results_file", action="store_true", help=ARG_HELP_STRINGS["results_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() if args.value is None and args.file is None: parser.error("Either a single value (-v option) or a file of " + "multiple values (-f option) must be given.") values = [] if args.file: if not os.path.isfile(args.file): print("Error: '" + args.file + "' is no valid file!") sys.exit() with open(args.file, "r") as f: for line in f: if len(line) > 0: value = line.strip("\r\n") if args.ignore_case: values.append(value.lower()) else: values.append(value) oat.print_g(str(len(values)) + " values read from file") if args.value is not None: if args.ignore_case: values.append(args.value.lower()) else: values.append(args.value) if args.file: oat.print_y("Value argument given in addition to file " + "argument, adding value to file imports...") quote_rules = args.openapc_quote_rules enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) msg = "Encoding '{}' found in Python's codec collection as '{}'" print(msg.format(args.encoding, codec.name)) enc = args.encoding except LookupError: print("Error: '" + args.encoding + "' not found Python's " + "codec collection. 
Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() header, content = oat.get_csv_file_content(args.csv_file, enc) mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print("Error: A quotemask may only contain the letters 't' and" + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] empty_line = ['' for element in content[0]] column_name = "column " + str(args.index) if header: header_line = header[0] column_name = header_line[args.index] empty_line = ['' for element in header_line] msg = u"Performing line deletion on condition '{}' in {}".format( column_name, values) oat.print_g(msg) modified_content = [] deleted_lines = [] num_total_lines = num_deleted_lines = 0 for line in content: if len(line) == 0: continue num_total_lines += 1 current_value = line[args.index] if args.ignore_case: current_value = current_value.lower() if current_value not in values: modified_content.append(line) else: num_deleted_lines += 1 if not args.full_delete: modified_content.append(list(empty_line)) if args.results_file: deleted_lines.append(line) msg = u"Process complete, deleted {} out of {} total lines" oat.print_g(msg.format(num_deleted_lines, num_total_lines)) with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False) writer.write_rows(copy.deepcopy(header) + modified_content) if args.results_file and len(deleted_lines) > 0: with open('del.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False) writer.write_rows(copy.deepcopy(header) + deleted_lines)
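# Minimal sketch of the deletion logic above: drop (or blank out) every row whose
# value in a given column matches one of the unwanted values.
def delete_matching_rows(rows, index, values, full_delete=True, ignore_case=False):
    targets = {value.lower() for value in values} if ignore_case else set(values)
    kept, deleted = [], []
    for row in rows:
        if not row:
            continue
        cell = row[index].lower() if ignore_case else row[index]
        if cell in targets:
            deleted.append(row)
            if not full_delete:
                kept.append(["" for _ in row])   # keep an empty placeholder line
        else:
            kept.append(row)
    return kept, deleted

# kept, deleted = delete_matching_rows(content, 3, {"10.1234/example-doi"})   # values are illustrative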
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("-b", "--bypass-cert-verification", action="store_true", help=ARG_HELP_STRINGS["bypass"]) parser.add_argument("-d", "--offline_doaj", help=ARG_HELP_STRINGS["offline_doaj"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-f", "--force", action="store_true", help=ARG_HELP_STRINGS["force"]) parser.add_argument("-i", "--ignore-header", action="store_true", help=ARG_HELP_STRINGS["ignore_header"]) parser.add_argument("-j", "--force-header", action="store_true", help=ARG_HELP_STRINGS["force_header"]) parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"]) parser.add_argument("-u", "--add-unknown-columns", action="store_true", help=ARG_HELP_STRINGS["unknown_columns"]) parser.add_argument("-v", "--verbose", action="store_true", help=ARG_HELP_STRINGS["verbose"]) parser.add_argument("-institution", "--institution_column", type=int, help=ARG_HELP_STRINGS["institution"]) parser.add_argument("-period", "--period_column", type=int, help=ARG_HELP_STRINGS["period"]) parser.add_argument("-doi", "--doi_column", type=int, help=ARG_HELP_STRINGS["doi"]) parser.add_argument("-euro", "--euro_column", type=int, help=ARG_HELP_STRINGS["euro"]) parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int, help=ARG_HELP_STRINGS["is_hybrid"]) parser.add_argument("-publisher", "--publisher_column", type=int, help=ARG_HELP_STRINGS["publisher"]) parser.add_argument("-journal_full_title", "--journal_full_title_column", type=int, help=ARG_HELP_STRINGS["journal_full_title"]) parser.add_argument("-issn", "--issn_column", type=int, help=ARG_HELP_STRINGS["issn"]) parser.add_argument("-url", "--url_column", type=int, help=ARG_HELP_STRINGS["url"]) parser.add_argument("-start", type=int, help=ARG_HELP_STRINGS["start"]) parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"]) args = parser.parse_args() enc = None # CSV file encoding handler = logging.StreamHandler(sys.stderr) handler.setFormatter(ANSIColorFormatter()) bufferedHandler = BufferedErrorHandler(handler) bufferedHandler.setFormatter(ANSIColorFormatter()) logging.root.addHandler(handler) logging.root.addHandler(bufferedHandler) logging.root.setLevel(logging.INFO) if args.locale: norm = locale.normalize(args.locale) if norm != args.locale: print "locale '{}' not found, normalized to '{}'".format( args.locale, norm) try: loc = locale.setlocale(locale.LC_ALL, norm) print "Using locale", loc except locale.Error as loce: print "Setting locale to " + norm + " failed: " + loce.message sys.exit() if args.encoding: try: codec = codecs.lookup(args.encoding) print("Encoding '{}' found in Python's codec collection " + "as '{}'").format(args.encoding, codec.name) enc = args.encoding except LookupError: print("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() result = oat.analyze_csv_file(args.csv_file) if result["success"]: csv_analysis = result["data"] print csv_analysis else: print result["error_msg"] sys.exit() if enc is None: enc = csv_analysis.enc dialect = csv_analysis.dialect has_header = csv_analysis.has_header or args.force_header if enc is None: print("Error: No encoding given for CSV file and automated " + "detection failed. 
Please set the encoding manually via the " + "--enc argument") sys.exit() doaj_offline_analysis = None if args.offline_doaj: if os.path.isfile(args.offline_doaj): doaj_offline_analysis = oat.DOAJOfflineAnalysis(args.offline_doaj) else: oat.print_r("Error: " + args.offline_doaj + " does not seem " "to be a file!") csv_file = open(args.csv_file, "r") reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc) first_row = reader.next() num_columns = len(first_row) print "\nCSV file has {} columns.".format(num_columns) csv_file.seek(0) reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc) column_map = OrderedDict([ ("institution", CSVColumn("institution", CSVColumn.MANDATORY, args.institution_column)), ("period", CSVColumn("period", CSVColumn.MANDATORY, args.period_column)), ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column)), ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column)), ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column)), ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column)), ("journal_full_title", CSVColumn("journal_full_title", CSVColumn.OPTIONAL, args.journal_full_title_column)), ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column)), ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None)), ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE, None)), ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None)), ("indexed_in_crossref", CSVColumn("indexed_in_crossref", CSVColumn.NONE, None)), ("pmid", CSVColumn("pmid", CSVColumn.NONE, None)), ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None)), ("ut", CSVColumn("ut", CSVColumn.NONE, None)), ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column)), ("doaj", CSVColumn("doaj", CSVColumn.NONE, None)) ]) # Do not quote the values in the 'period' and 'euro' columns quotemask = [ True, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, ] header = None if has_header: for row in reader: if not row: # Skip empty lines continue header = row # First non-empty row should be the header if args.ignore_header: print "Skipping header analysis due to command line argument." break else: print "\n *** Analyzing CSV header ***\n" for (index, item) in enumerate(header): column_type = oat.get_column_type_from_whitelist(item) if column_type is not None and column_map[ column_type].index is None: column_map[column_type].index = index column_map[column_type].column_name = item print("Found column named '{}' at index {}, " + "assuming this to be the {} column.").format( item, index, column_type) break print "\n *** Starting heuristical analysis ***\n" for row in reader: if not row: # Skip empty lines # We analyze the first non-empty line, a possible header should # have been processed by now. 
continue column_candidates = {"doi": [], "period": [], "euro": []} for (index, entry) in enumerate(row): if index in [csvcolumn.index for csvcolumn in column_map.values()]: # Skip columns already assigned continue entry = entry.strip() # Search for a DOI if column_map['doi'].index is None: if oat.DOI_RE.match(entry): column_id = str(index) # identify column either numerical or by column header if header: column_id += " ('" + header[index] + "')" print("The entry in column {} looks like a " + "DOI: {}").format(column_id, entry) column_candidates['doi'].append(index) continue # Search for a potential year string if column_map['period'].index is None: try: maybe_period = int(entry) now = datetime.date.today().year # Should be a wide enough margin if maybe_period >= 2000 and maybe_period <= now + 2: column_id = str(index) if header: column_id += " ('" + header[index] + "')" print("The entry in column {} looks like a " + "potential period: {}").format(column_id, entry) column_candidates['period'].append(index) continue except ValueError: pass # Search for a potential monetary amount if column_map['euro'].index is None: try: maybe_euro = locale.atof(entry) # Are there APCs above 6000€ ?? if maybe_euro >= 10 and maybe_euro <= 6000: column_id = str(index) if header: column_id += " ('" + header[index] + "')" print("The entry in column {} looks like a " + "potential euro amount: {}").format( column_id, entry) column_candidates['euro'].append(index) continue except ValueError: pass for column_type, candidates in column_candidates.iteritems(): if column_map[column_type].index is not None: continue if len(candidates) > 1: print("Could not reliably identify the '" + column_type + "' column - more than one possible candiate!") elif len(candidates) < 1: print "No candidate found for column '" + column_type + "'!" else: index = candidates.pop() column_map[column_type].index = index if header: column_id = header[index] column_map[column_type].column_name = column_id else: column_id = index print("Assuming column '{}' to be the '{}' " + "column.").format(column_id, column_type) column_map[column_type].index = index break # Wrap up: Check if there any mandatory column types left which have not # yet been identified - we cannot continue in that case (unless forced). unassigned = filter( lambda (k, v): v.requirement == CSVColumn.MANDATORY and v.index is None, column_map.iteritems()) if unassigned: for item in unassigned: print "The {} column is still unidentified.".format(item[0]) if header: print "The CSV header is:\n" + dialect.delimiter.join(header) if not args.force: print("ERROR: We cannot continue because not all mandatory " + "column types in the CSV file could be automatically " + "identified. 
There are 2 ways to fix this:") if not header: print( "1) Add a header row to your file and identify the " + "column(s) by assigning them an appropiate column name.") else: print( "1) Identify the missing column(s) by assigning them " + "a different column name in the CSV header (You can " + "use the column name(s) mentioned in the message above)") print("2) Use command line parameters when calling this script " + "to identify the missing columns (use -h for help) ") sys.exit() else: print("WARNING: Not all mandatory column types in the CSV file " + "could be automatically identified - forced to continue.") print "\n *** CSV file analysis summary ***\n" index_dict = {csvc.index: csvc for csvc in column_map.values()} for index in range(num_columns): column_name = "" if header: column_name = header[index] if index in index_dict: column = index_dict[index] msg = "column number {} ({}) is the {} column '{}'".format( index, column_name, column.requirement, column.column_type) if column.requirement in [CSVColumn.MANDATORY, CSVColumn.OPTIONAL]: oat.print_g(msg) else: oat.print_b(msg) else: if args.add_unknown_columns: msg = ( "column number {} ({}) is an unknown column, it will be " + "appended to the generated CSV file") oat.print_y(msg.format(index, column_name)) if not column_name: # Use a generic name column_name = "unknown" while column_name in column_map.keys(): # TODO: Replace by a numerical, increasing suffix column_name += "_" column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE, index) else: msg = ( "column number {} ({}) is an unknown column, it will be " + "ignored") oat.print_y(msg.format(index, column_name)) print "" for column in column_map.values(): if column.index is None: msg = "The {} column '{}' could not be identified." print msg.format(column.requirement, column.column_type) # Check for unassigned optional column types. We can continue but should # issue a warning as all entries will need a valid DOI in this case. unassigned = filter( lambda (k, v): v.requirement == CSVColumn.OPTIONAL and v.index is None, column_map.iteritems()) if unassigned: print("\nWARNING: Not all optional column types could be " + "identified. Metadata aggregation is still possible, but " + "every entry in the CSV file will need a valid DOI.") start = raw_input("\nStart metadata aggregation? 
(y/n):") while start not in ["y", "n"]: start = raw_input("Please type 'y' or 'n':") if start == "n": sys.exit() print "\n *** Starting metadata aggregation ***\n" enriched_content = [] csv_file.seek(0) reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc) header_processed = False row_num = 0 for row in reader: row_num += 1 if not row: continue # skip empty lines if not header_processed: header_processed = True enriched_content.append(column_map.keys()) if has_header: # If the CSV file has a header, we are currently there - skip it # to get to the first data row continue if args.start and args.start > row_num: continue if args.end and args.end < row_num: continue print "---Processing line number " + str(row_num) + "---" enriched_row = oat.process_row(row, row_num, column_map, num_columns, doaj_offline_analysis, args.bypass_cert_verification) enriched_content.append(enriched_row) csv_file.close() with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, quotemask, True, True) writer.write_rows(enriched_content) if not bufferedHandler.buffer: oat.print_g("Metadata enrichment successful, no errors occured") else: oat.print_r("There were errors during the enrichment process:\n") # closing will implicitly flush the handler and print any buffered # messages to stderr bufferedHandler.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument("apc_file", help=ARG_HELP_STRINGS["apc_file"]) parser.add_argument("issn_l_file", help=ARG_HELP_STRINGS["issn_l_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() quote_rules = args.openapc_quote_rules mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print("Error: A quotemask may only contain the letters 't' and" + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) print("Encoding '{}' found in Python's codec collection " + "as '{}'").format(args.encoding, codec.name) enc = args.encoding except LookupError: print("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() result = oat.analyze_csv_file(args.apc_file, 500) if result["success"]: csv_analysis = result["data"] print csv_analysis else: print result["error_msg"] sys.exit() if enc is None: enc = csv_analysis.enc if enc is None: print("Error: No encoding given for CSV file and automated " + "detection failed. Please set the encoding manually via the " + "--enc argument") sys.exit() dialect = csv_analysis.dialect csv_file = open(args.apc_file, "r") reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc) oat.print_g("Preparing mapping table...") itself = other = 0 issn_l_re = re.compile( "^(?P<issn>\d{4}-\d{3}[\dxX])\t(?P<issn_l>\d{4}-\d{3}[\dxX])$") issn_l_file = open(args.issn_l_file, "r") issn_l_dict = {} for i, line in enumerate(issn_l_file): if i % 100000 == 0: print str(i) + " lines processed." match = issn_l_re.match(line) if match: match_dict = match.groupdict() issn_l_dict[match_dict['issn']] = match_dict['issn_l'] if match_dict['issn'] == match_dict['issn_l']: itself += 1 else: other += 1 print str(itself) + " ISSNs pointing to itself as ISSN-L, " + str( other) + " to another value." oat.print_g("Starting enrichment...") issn_matches = issn_p_matches = issn_e_matches = unmatched = different = 0 enriched_lines = [] for line in reader: if len(line) == 0: enriched_lines.append(line) continue issn = reformat_issn(line[7]) issn_p = reformat_issn(line[8]) issn_e = reformat_issn(line[9]) target = None if issn in issn_l_dict: target = issn_l_dict[issn] line[10] = target issn_matches += 1 elif issn_p in issn_l_dict: target = issn_l_dict[issn_p] line[10] = target issn_p_matches += 1 elif issn_e in issn_l_dict: target = issn_l_dict[issn_e] line[10] = target issn_e_matches += 1 else: unmatched += 1 if target is not None and target not in [issn, issn_p, issn_e]: different += 1 enriched_lines.append(line) print "{} issn_l values mapped by issn, {} by issn_p, {} by issn_e. {} could not be assigned.\n In {} cases the ISSN-L was different from all existing ISSN values".format( issn_matches, issn_p_matches, issn_e_matches, unmatched, different) with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False) writer.write_rows(enriched_lines)
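# Hedged sketch of the "quotemask" convention used throughout these scripts: a string
# of 't'/'f' flags, one per column, converted to a list of booleans and passed to
# OpenAPCUnicodeWriter to decide which columns get quoted in the output CSV.
def parse_quotemask(mask_string):
    if mask_string.replace("t", "").replace("f", ""):
        raise ValueError("A quotemask may only contain the letters 't' and 'f'!")
    return [char == "t" for char in mask_string]

# parse_quotemask("tfftt") -> [True, False, False, True, True]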
def integrate_changes(articles, file_path, enriched_file=False): ''' Update existing entries in a previously created harvest file. Args: articles: A list of article dicts, as returned by openapc_toolkit.oai_harvest() file_path: Path to the CSV file the new values should be integrated into. enriched_file: If true, columns which are overwritten during enrichment will not be updated Returns: A tuple. The first element is a reduced list of article dicts, containing those which did not find a matching DOI in the file (Order preserved). The second element is the list of column headers encountered in the harvest file. ''' if not os.path.isfile(file_path): return (articles, None) enriched_blacklist = ["institution", "publisher", "journal_full_title", "issn", "license_ref", "pmid"] article_dict = OrderedDict() for article in articles: doi = article["doi"] if oat.has_value(doi): article_dict[doi] = article updated_lines = [] fieldnames = None with open(file_path, "r") as f: reader = DictReader(f) fieldnames = reader.fieldnames updated_lines.append(list(fieldnames)) #header start_msg = "Integrating changes in harvest data into existing file {}" oat.print_g(start_msg.format(file_path)) for line in reader: doi = line["doi"] line_num = reader.reader.line_num if not oat.has_value(doi): msg = "Line {}: No DOI found, change check not possible" oat.print_y(msg.format(line_num)) updated_line = [line[key] for key in fieldnames] updated_lines.append(updated_line) else: msg = "Line {}: Checking for changes ({})" oat.print_b(msg.format(line_num, doi)) if doi in article_dict: for key, value in article_dict[doi].items(): if enriched_file and key in enriched_blacklist: continue if key in line and value != line[key]: update_msg = 'Updating value in column {} ("{}" -> "{}")' oat.print_g(update_msg.format(key, line[key], value)) line[key] = value del(article_dict[doi]) updated_line = [line[key] for key in fieldnames] updated_lines.append(updated_line) else: remove_msg = "DOI {} no longer found in harvest data, removing article" oat.print_r(remove_msg.format(doi)) with open(file_path, "w") as f: mask = oat.OPENAPC_STANDARD_QUOTEMASK if enriched_file else None writer = oat.OpenAPCUnicodeWriter(f, quotemask=mask, openapc_quote_rules=True, has_header=True) writer.write_rows(updated_lines) return (article_dict.values(), fieldnames)
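# Self-contained sketch of the merge idea behind integrate_changes(): index the
# harvested articles by DOI, update matching rows of the existing file, drop rows
# whose DOI no longer appears in the harvest, and return the leftover (new) articles.
def merge_by_doi(harvested_articles, existing_rows):
    by_doi = {article["doi"]: article for article in harvested_articles if article.get("doi")}
    updated = []
    for row in existing_rows:
        doi = row.get("doi")
        if not doi:                       # no DOI -> keep row unchanged, cannot compare
            updated.append(row)
            continue
        fresh = by_doi.pop(doi, None)
        if fresh is None:                 # DOI vanished from harvest data -> drop the row
            continue
        row.update({key: value for key, value in fresh.items() if key in row})
        updated.append(row)
    return updated, list(by_doi.values())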
def main(): parser = argparse.ArgumentParser() parser.add_argument("original_file", help=ARG_HELP_STRINGS["original_file"]) parser.add_argument("update_file", help=ARG_HELP_STRINGS["update_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-eu", "--update_encoding", help=ARG_HELP_STRINGS["update_encoding"]) parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"]) parser.add_argument("-lu", "--update_locale", help=ARG_HELP_STRINGS["update_locale"]) parser.add_argument("-a", "--autocreate_mappings", action="store_true", help=ARG_HELP_STRINGS["autocreate_mappings"]) parser.add_argument("-g", "--grouping", action="store_true", help=ARG_HELP_STRINGS["grouping"]) args = parser.parse_args() params = { "original": { "file": args.original_file, "encoding": args.encoding, "locale": args.locale, "csv_analysis": None, "fieldnames": None, "doi_field": None, "euro_field": None, "mappings": [] }, "update": { "file": args.update_file, "encoding": args.update_encoding, "locale": args.update_locale, "csv_analysis": None, "fieldnames": None, "doi_field": None, "euro_field": None, "mappings": [] } } def field_mapped(file_type, field_name): if field_name == params[file_type]["euro_field"]: return True if field_name == params[file_type]["doi_field"]: return True if field_name in params[file_type]["mappings"]: return True return False for file_type in params.keys(): msg = "*** Performing analysis for {} file ***" oat.print_b(msg.format(file_type)) encoding = params[file_type]["encoding"] if encoding is not None: try: codec = codecs.lookup(encoding) msg = "Encoding '{}' found in Python's codec collection as '{}'" print(msg.format(encoding, codec.name)) params[file_type]["encoding"] = encoding except LookupError: print( "Error: '" + encoding + "' not found Python's " + "codec collection. 
Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() loc = params[file_type]["locale"] if loc is not None: norm = locale.normalize(loc) if norm != loc: msg = "locale '{}' not found, normalised to '{}'".format( loc, norm) oat.print_y(msg) params[file_type]["locale"] = norm csv_analysis = oat.analyze_csv_file(params[file_type]["file"], enc=params[file_type]["encoding"]) if not csv_analysis["success"]: oat.print_r(csv_analysis["error_msg"]) sys.exit() params[file_type]["csv_analysis"] = csv_analysis["data"] print(params[file_type]["csv_analysis"]) if params[file_type]["encoding"] is None: guessed_enc = params[file_type]["csv_analysis"].enc params[file_type]["encoding"] = guessed_enc locale_name = "default locale" if params[file_type]["locale"] is not None: locale_name = "locale " + params[file_type]["locale"] msg = "{} file will be opened with encoding {} and {}" oat.print_g( msg.format(file_type, params[file_type]["encoding"], locale_name)) with open(params[file_type]["file"], "r", encoding=params[file_type]["encoding"]) as f: reader = csv.DictReader( f, dialect=params[file_type]["csv_analysis"].dialect) params[file_type]["fieldnames"] = list(reader.fieldnames) for index, name in enumerate(params[file_type]["fieldnames"]): field_type = oat.get_column_type_from_whitelist(name) found = False if field_type == "doi": params[file_type]["doi_field"] = name found = True elif field_type == "euro": params[file_type]["euro_field"] = name found = True if found: msg = '{} file: Found {} column at index {} ("{}")' msg = msg.format(file_type, field_type, index, name) oat.print_b(msg) for field_type in ["doi_field", "euro_field"]: if params[file_type][field_type] is None: msg = "Error: No {} found in {} file" oat.print_r(msg.format(field_type, file_type)) sys.exit() for orig_index, orig_field in enumerate(params["original"]["fieldnames"]): if field_mapped("original", orig_field): continue norm_orig_field = orig_field.lower().strip() for update_index, update_field in enumerate( params["update"]["fieldnames"]): if field_mapped("update", update_field): continue norm_update_field = update_field.lower().strip() if norm_orig_field == norm_update_field: if args.autocreate_mappings: params["original"]["mappings"].append(orig_field) params["update"]["mappings"].append(update_field) msg = 'Auto-created mapping "{}" (update file, index {}) -> "{}" (original file, index {})' oat.print_b( msg.format(update_field, update_index, orig_field, orig_index)) else: msg = 'Possible mapping found: "{}" (update file, index {}) -> "{}" (original file, index {}). Create mapping (y/n)?' 
msg = msg.format(update_field, update_index, orig_field, orig_index) create = input(msg) while create not in ["y", "n"]: create = input("Please type 'y' or 'n':") if create == "y": params["original"]["mappings"].append(orig_field) params["update"]["mappings"].append(update_field) update_mappings = {} with open(params["update"]["file"], "r", encoding=params["update"]["encoding"]) as f: doi_field = params["update"]["doi_field"] euro_field = params["update"]["euro_field"] reader = csv.DictReader( f, dialect=params["update"]["csv_analysis"].dialect) locale.setlocale(locale.LC_ALL, params["update"]["locale"]) for line in reader: doi = oat.get_normalised_DOI(line[doi_field]) if doi is None: msg = 'Warning: Empty or invalid DOI in update file (line {}): "{}"' oat.print_y(msg.format(reader.line_num, line[doi_field])) continue if doi in update_mappings: msg = "Error: Duplicate doi in update file ({})".format( line[doi_field]) oat.print_r(msg) sys.exit() update_mappings[doi] = {} euro_value = locale.atof(line[euro_field]) orig_euro_field = params["original"]["euro_field"] update_mappings[doi][orig_euro_field] = euro_value for index, update_field_name in enumerate( params["update"]["mappings"]): orig_field_name = params["original"]["mappings"][index] update_mappings[doi][orig_field_name] = line[update_field_name] #print(json.dumps(update_mappings, sort_keys=False, indent=4)) modified_content = [] fieldnames = None with open(params["original"]["file"], "r", encoding=params["original"]["encoding"]) as f: doi_field = params["original"]["doi_field"] euro_field = params["original"]["euro_field"] reader = csv.DictReader( f, dialect=params["original"]["csv_analysis"].dialect) fieldnames = list(reader.fieldnames) locale.setlocale(locale.LC_ALL, params["original"]["locale"]) for line in reader: doi = oat.get_normalised_DOI(line[doi_field]) if doi not in update_mappings: msg = "line {}: DOI {} not found in update file!" oat.print_r(msg.format(reader.line_num, doi)) continue changes = [] old_euro_value = locale.atof(line[euro_field]) new_euro_value = update_mappings[doi][euro_field] if old_euro_value != new_euro_value: changes.append( Change(euro_field, old_euro_value, new_euro_value, monetary=True)) for field in update_mappings[doi].keys(): if field == euro_field: continue if line[field] != update_mappings[doi][field]: changes.append( Change(field, line[field], update_mappings[doi][field])) if not changes: msg = "line {}: DOI {} found in update file, but nothing changed." oat.print_g(msg.format(reader.line_num, doi)) else: msg = "line {}: DOI {} found in update file with the following updates:" oat.print_y(msg.format(reader.line_num, doi)) for change in changes: oat.print_y(str(change)) if change.monetary: line[change.field_name] = locale.currency( change.new_value, symbol=False, grouping=args.grouping) else: line[change.field_name] = change.new_value del (update_mappings[doi]) modified_content.append(line) if update_mappings: oat.print_y( "{} entries in update file not contained in original file:". 
format(len(update_mappings))) for doi, changes in update_mappings.items(): oat.print_y(doi) new_line = changes new_line[params["original"]["doi_field"]] = doi formatted_euro = locale.currency( new_line[params["original"]["euro_field"]], symbol=False, grouping=args.grouping) new_line[params["original"]["euro_field"]] = formatted_euro modified_content.append(new_line) with open("out.csv", "w", encoding=params["original"]["encoding"]) as out: writer = csv.DictWriter( out, fieldnames, dialect=params["original"]["csv_analysis"].dialect) writer.writeheader() for line in modified_content: writer.writerow(line)
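# Hedged sketch of the per-field change detection used above: monetary values are
# compared numerically and re-formatted via the locale, all other mapped fields are
# compared as strings. The locale name in the usage comment is an assumption (it
# must exist on the system).
import locale

def detect_changes(original_row, update_values, euro_field, grouping=False):
    changed = {}
    for field, new_value in update_values.items():
        if field == euro_field:
            if locale.atof(original_row[field]) != float(new_value):
                changed[field] = locale.currency(float(new_value), symbol=False, grouping=grouping)
        elif original_row.get(field) != new_value:
            changed[field] = new_value
    return changed

# locale.setlocale(locale.LC_ALL, "de_DE.UTF-8")                              # assumption
# detect_changes({"euro": "1200,00", "period": "2020"}, {"euro": 1250.0, "period": "2020"}, "euro")
# -> {"euro": "1250,00"} under a German locale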
def find_significant_apc_differences(apc_content, institution, verbose=False): titles = {} articles = [] # 1st run: Find all journals the institution has published articles in for line in apc_content: if line[0] != institution: continue title = line[6] if title not in titles: titles[title] = {"lines": []} articles.append(line) # 2nd run: Aggregate all articles for the journals found in 1 for line in apc_content: title = line[6] if title in titles: titles[title]["lines"].append(line) for title in titles: apc_values = [float(line[2]) for line in titles[title]["lines"]] titles[title]["count"] = len(apc_values) titles[title]["stddev"] = stddev(apc_values) titles[title]["mean"] = mean(apc_values) stats = { "articles": len(articles), "not_checked": 0, "within_limits": 0, "significant": 0 } sig_articles = [] for article in articles: apc = article[2] doi = article[3] title = article[6] if titles[title]["count"] < 20: if verbose: msg = 'Article {}, journal "{}": Could not check costs, too few occurences ({})' oat.print_b(msg.format(doi, title, titles[title]["count"])) stats["not_checked"] += 1 continue if abs(float(apc) - titles[title]["mean"]) > 2 * titles[title]["stddev"]: rounded_mean = round(titles[title]["mean"], 2) rounded_stddev = round(titles[title]["stddev"], 2) diff = round(float(apc) - rounded_mean, 2) if verbose: msg = ('Article {}, journal "{}": Cost ({}€) differs more than 2 standard ' + 'deviations (2 * {}€) from mean APC ({}€)') oat.print_y(msg.format(doi, title, apc, rounded_stddev, rounded_mean)) stats["significant"] += 1 article.append(rounded_mean) article.append(rounded_stddev) article.append(diff) article.append(titles[title]["count"]) sig_articles.append(article) else: if verbose: msg = ('Article {}, journal "{}": No significant cost difference ({}€, mean ' + 'APC is {}€)') oat.print_g(msg.format(doi, title, apc, round(titles[title]["mean"], 2))) stats["within_limits"] += 1 if verbose: oat.print_g("\nAnalysis finished, results:") for key, value in stats.items(): oat.print_g(key + ": " + str(value)) return sig_articles, stats
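# Self-contained sketch of the outlier rule above: an APC is flagged as significant
# when it lies more than two standard deviations from the journal's mean APC, and
# journals with fewer than 20 observations are skipped. Note that statistics.pstdev
# is used here, while the script relies on its own mean()/stddev() helpers.
from statistics import mean, pstdev

def is_significant_difference(apc, journal_apcs, min_count=20, sigmas=2):
    if len(journal_apcs) < min_count:
        return None                                # too few data points to judge
    return abs(apc - mean(journal_apcs)) > sigmas * pstdev(journal_apcs)

# is_significant_difference(4100.0, [1500.0] * 19 + [1600.0] * 6)   # -> True (far above the mean)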
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-start", type=int, default=1, help=ARG_HELP_STRINGS["start"]) parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["start"]) args = parser.parse_args() handler = logging.StreamHandler(sys.stderr) handler.setFormatter(oat.ANSIColorFormatter()) bufferedHandler = oat.BufferedErrorHandler(handler) bufferedHandler.setFormatter(oat.ANSIColorFormatter()) logging.root.addHandler(handler) logging.root.addHandler(bufferedHandler) logging.root.setLevel(logging.INFO) enc = None if args.encoding: try: codec = codecs.lookup(args.encoding) msg = ("Encoding '{}' found in Python's codec collection " + "as '{}'").format(args.encoding, codec.name) oat.print_g(msg) enc = args.encoding except LookupError: msg = ("Error: '" + args.encoding + "' not found in Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") oat.print_r(msg) sys.exit() head, content = oat.get_csv_file_content(args.csv_file, enc) content = head + content header = {"User-Agent": "Mozilla/5.0 Firefox/45.0"} line_num = 0 for line in content: line_num += 1 if args.start and args.start > line_num: continue if args.end and args.end < line_num: continue institution = line[0] period = line[1] doi = line[3] is_hybrid = line[4] publisher = line[5] journal = line[6] if publisher != "Elsevier" or is_hybrid != "TRUE": continue init_msg = (u"Line {}: Checking {} article from {}, published in " + "{}...").format(line_num, institution, period, journal) oat.print_b(init_msg) url = 'http://doi.org/' + doi req = urllib2.Request(url, None, header) ret_value = {'success': True} try: response = urllib2.urlopen(req) target = response.geturl() resolve_msg = u"DOI {} resolved, led us to {}".format(doi, target) if "sciencedirect.com" not in target: oat.print_y(resolve_msg) oat.print_y("Journal not located at sciencedirect, skipping...") continue oat.print_b(resolve_msg) content_string = response.read() single_match = pdflink_re.search(content_string) if single_match: link_url = single_match.groups()[0] oat.print_g(u"PDF link found: " + link_url) else: multi_match = pdflink_multi_re.search(content_string) if multi_match: link_url = multi_match.groups()[0] link_url = link_url.replace("&amp;", "&") oat.print_g(u"PDF link found (more than one document): " + link_url) else: error_msg = (u"No PDF link found! (line {}, DOI: {}, " + "landing page: {})").format(line_num, doi, target) logging.error(error_msg) time.sleep(1) except urllib2.HTTPError as httpe: code = str(httpe.getcode()) oat.print_r("HTTPError: {} - {}".format(code, httpe.reason)) except urllib2.URLError as urle: oat.print_r("URLError: {}".format(urle.reason)) if not bufferedHandler.buffer: oat.print_g("\nLookup finished, all articles were accessible on sciencedirect") else: oat.print_r("\nLookup finished, not all articles could be accessed on sciencedirect:\n") # closing will implicitly flush the handler and print any buffered # messages to stderr bufferedHandler.close()
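# Hedged sketch of the DOI resolution step above, using Python 3's urllib instead of
# urllib2: follow the doi.org redirect chain and inspect the final landing-page URL.
from urllib.request import Request, urlopen

def resolve_doi(doi, user_agent="Mozilla/5.0 Firefox/45.0"):
    req = Request("https://doi.org/" + doi, headers={"User-Agent": user_agent})
    with urlopen(req) as response:
        return response.geturl()                   # final URL after all redirects

# target = resolve_doi("10.1016/j.jalgebra.2015.09.002")   # DOI is only an illustrative value
# print("sciencedirect.com" in target)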
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("column", type=int, help=ARG_HELP_STRINGS["column"]) parser.add_argument("other_csv_file", nargs="?", help=ARG_HELP_STRINGS["other_csv_file"]) parser.add_argument("other_column", type=int, nargs="?", help=ARG_HELP_STRINGS["other_column"]) parser.add_argument("-e2", "--other_encoding", help=ARG_HELP_STRINGS["other_encoding"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-i", "--ignore_case", action="store_true", default=False, help=ARG_HELP_STRINGS["ignore_case"]) parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-o", "--openapc_quote_rules", help=ARG_HELP_STRINGS["openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() quote_rules = args.openapc_quote_rules encs = [] #CSV file encodings for encoding in [args.encoding, args.other_encoding]: if encoding: try: codec = codecs.lookup(encoding) msg = "Encoding '{}' found in Python's codec collection as '{}'" print(msg.format(encoding, codec.name)) except LookupError: print( "Error: '" + encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") sys.exit() encs.append(encoding) mask = None if args.quotemask: reduced = args.quotemask.replace("f", "").replace("t", "") if reduced: print( "Error: A quotemask may only contain the letters 't' and 'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] header, content = oat.get_csv_file_content(args.csv_file, enc=encs[0]) column = args.column if not args.other_csv_file: rearranged_content = header + sorted(content, key=lambda x: x[column]) else: rearranged_content = [] _, second_content = oat.get_csv_file_content(args.other_csv_file, enc=encs[1]) other_column = column # default: use same column index as in first file if args.other_column: other_column = args.other_column for other_row in second_content: if args.ignore_case: matching_rows = [ row for row in content if row[column].lower() == other_row[other_column].lower() ] else: matching_rows = [ row for row in content if row[column] == other_row[other_column] ] rearranged_content += matching_rows for matching_row in matching_rows: content.remove(matching_row) unmatched_msg = ( "{} rows could not be rearranged (unmatched in second csv file) " + "and were appended to the end of the result file " + "in original order.") if content: oat.print_y(unmatched_msg.format(len(content))) else: oat.print_g("All rows matched.") rearranged_content = header + rearranged_content + content # append any unmatched rows with open('out.csv', 'w') as out: writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False) writer.write_rows(rearranged_content)
def main():
    analysed_journals = {}
    if os.path.isfile(JOURNALTOC_RESULTS_FILE):
        with open(JOURNALTOC_RESULTS_FILE) as results:
            reader = DictReader(results)
            for line in reader:
                title = line["journal_full_title"]
                if title not in analysed_journals:
                    analysed_journals[title] = line
    remaining_journals = {}
    with open(APC_DE_FILE) as apc_de:
        reader = DictReader(apc_de)
        for line in reader:
            title = line["journal_full_title"]
            if title in analysed_journals:
                continue
            if title not in remaining_journals:
                remaining_journals[title] = {
                    "journal_full_title": line["journal_full_title"],
                    "publisher": line["publisher"],
                    "is_hybrid": line["is_hybrid"],
                    "issns": []
                }
            for issn_type in ISSN_TYPES:
                issn = line[issn_type]
                if issn not in remaining_journals[title]["issns"] and oat.is_wellformed_ISSN(issn):
                    remaining_journals[title]["issns"].append(issn)
            is_hybrid = line["is_hybrid"]
            if is_hybrid in ["TRUE", "FALSE"] and is_hybrid != remaining_journals[title]["is_hybrid"]:
                remaining_journals[title]["is_hybrid"] = "FLIPPED"
    msg = "{} unique journals found in OpenAPC core data file, {} already analysed, {} remaining."
    oat.print_g(msg.format(len(remaining_journals) + len(analysed_journals),
                           len(analysed_journals),
                           len(remaining_journals)))
    count = 0
    for title, fields in remaining_journals.items():
        count += 1
        entry = {field: None for field in RESULTS_FILE_FIELDNAMES}
        entry["journal_full_title"] = title
        for key in ["publisher", "is_hybrid"]:
            entry[key] = fields[key]
        entry["issns"] = "|".join(fields["issns"])
        msg = 'Analysing journal "{}" ({}), OpenAPC hybrid status is {}...'
        msg = msg.format(entry["journal_full_title"], entry["issns"], entry["is_hybrid"])
        oat.print_b(msg)
        for issn in fields["issns"]:
            oat.print_y("Looking up ISSN " + issn + "...")
            jtoc_metadata = get_jtoc_metadata(issn)
            if jtoc_metadata["jtoc_id"] is not None:
                entry["in_jtoc"] = "TRUE"
                for key in ["jtoc_publisher", "jtoc_title"]:
                    entry[key] = jtoc_metadata[key]
                journal_type = get_jtoc_journal_type(jtoc_metadata["jtoc_id"])
                entry["jtoc_type"] = journal_type
                msg = 'Journal found ("{}"), JournalTOCs type is {}'
                oat.print_g(msg.format(entry["jtoc_title"], entry["jtoc_type"]))
                break
        else:
            # for/else: only reached if no ISSN lookup succeeded
            oat.print_r("None of the associated ISSNs found in JTOCs!")
        analysed_journals[title] = entry
        if count < BATCH_SIZE:
            sleep(2)
        else:
            break
    with open(JOURNALTOC_RESULTS_FILE, "w") as res_file:
        writer = DictWriter(res_file, fieldnames=RESULTS_FILE_FIELDNAMES)
        writer.writeheader()
        for _, entry in analysed_journals.items():
            writer.writerow(entry)
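# Assumed module-level constants (sketch): the main() above references several names
# that are not defined in this excerpt. The values below illustrate one plausible
# configuration; the file paths and batch size in particular are hypothetical and
# would need to match the local repository layout.
JOURNALTOC_RESULTS_FILE = "journaltocs_hybrid_status.csv"   # hypothetical path
APC_DE_FILE = "apc_de.csv"                                   # hypothetical path
BATCH_SIZE = 100                                             # journals analysed per run

# ISSN columns of the OpenAPC core data file (assumed column names)
ISSN_TYPES = ["issn", "issn_print", "issn_electronic", "issn_l"]

# Result keys used when building entries above
RESULTS_FILE_FIELDNAMES = ["journal_full_title", "publisher", "is_hybrid", "issns",
                           "in_jtoc", "jtoc_publisher", "jtoc_title", "jtoc_type"]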