def main():
    """Build a per-institution APC report and render it to PDF via pandoc.

    Reads the OpenAPC core data, the institutions table and the unresolved
    duplicates list, assembles a markdown report from several section
    generators and converts it with pandoc/xelatex.
    """
    args = parse()
    # Relative paths: the script is expected to run from its own directory.
    _, apc_content = oat.get_csv_file_content("../data/apc_de.csv", "utf-8", True)
    _, ins_content = oat.get_csv_file_content("../data/institutions.csv", "utf-8", True)
    _, dup_content = oat.get_csv_file_content(
        "../data/unresolved_duplicates.csv", "utf-8", True)
    sig_articles, stats = find_significant_apc_differences(
        apc_content, args.institution, args.verbose)
    # Concatenate the markdown sections in fixed order.
    report = ""
    report += generate_header(args.lang)
    report += generate_metadata_section(args.institution, ins_content, stats,
                                        args.lang)
    report += generate_duplicates_section(args.institution, dup_content,
                                          ins_content, args.lang)
    if not args.no_doi_resolve_test:
        # DOI resolution tests are slow (network), hence the opt-out flag.
        report += generate_nonresolving_dois_section(args.institution,
                                                     apc_content, args.lang)
    # NOTE(review): "deviaton" is a typo, but the name must match the
    # generator function defined elsewhere in this file.
    report += generate_apc_deviaton_section(args.institution, sig_articles,
                                            stats, args.lang, args.csv_output)
    # Derive the output file name from institution and current date.
    ins = args.institution.lower().replace(" ", "_")
    today = format_date(date.today(), format="dd_MM_yy")
    file_name = "report_" + ins + "_" + today + ".pdf"
    with open("report.md", "w") as out:
        out.write(report)
    run(["pandoc", "report.md", "-f", "markdown", "-o", file_name,
         "--pdf-engine=xelatex"])
def main():
    """Split an enriched OpenAPC file into core data and TA (DEAL Wiley) data.

    Every line flagged hybrid (column 4 == "TRUE") whose publisher (column 5)
    is in PUBLISHER_LIST is moved to the transformative-agreement output with
    the agreement name appended; all other lines stay in the core output.
    Both outputs keep identical line numbering by inserting empty placeholder
    rows, and are written to out_orig.csv / out_deal_wiley.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("enriched_file", help=ARG_HELP_STRINGS["enriched_file"])
    args = parser.parse_args()
    header, content = oat.get_csv_file_content(args.enriched_file, enc="utf-8",
                                               force_header=True)
    header_line = header[0]
    core_content = [list(header_line)]
    # The TA file carries one extra column for the agreement name.
    ta_content = [list(header_line) + ["agreement"]]
    # BUGFIX: removed two leftover debug prints which dumped the freshly
    # initialised header lists to stdout on every run.
    for line in content:
        if line[4] == "TRUE" and line[5] in PUBLISHER_LIST:
            # TA article: placeholder in core, real line (+ agreement) in TA.
            core_content.append(list(EMPTY_LINE_CORE))
            ta_content.append(line + [AGREEMENT_NAME])
        else:
            # Core article: real line in core, placeholder in TA.
            core_content.append(line)
            ta_content.append(list(EMPTY_LINE_TA))
    with open("out_orig.csv", "w") as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
        writer.write_rows(core_content)
    with open("out_deal_wiley.csv", "w") as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
        writer.write_rows(ta_content)
def main():
    """Unify publisher and journal names in an OpenAPC CSV file.

    Looks up every publisher / journal_full_title value in the OpenAPC
    mapping tables and replaces known variants with their canonical form.
    The result is written to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            codec_msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(codec_msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: added the missing "in" to the error message.
            print("Error: '" + args.encoding + "' not found in Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # BUGFIX: there was no space between "and" and "'f'" in the
            # concatenated message.
            print("Error: A quotemask may only contain the letters 't' " +
                  "and 'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    header, content = oat.get_csv_file_content(args.csv_file, enc)
    line_num = 1
    for line in content:
        # Column 5: publisher, column 6: journal_full_title.
        publisher = line[5]
        journal = line[6]
        journal_new = oat.get_unified_journal_title(journal)
        publisher_new = oat.get_unified_publisher_name(publisher)
        if publisher_new != publisher:
            line[5] = publisher_new
            msg = u"Line {}: Updated publisher name ({} -> {})"
            oat.print_g(msg.format(line_num, publisher, publisher_new))
        if journal_new != journal:
            line[6] = journal_new
            msg = u"Line {}: Updated journal_full_title ({} -> {})"
            oat.print_g(msg.format(line_num, journal, journal_new))
        line_num += 1
    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows(header + content)
def main():
    """Count and print value occurrences in one column of a CSV file.

    Optionally (-s) prints the results sorted by descending frequency.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("count_column", type=int,
                        help=ARG_HELP_STRINGS["count_column"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-s", "--sort", action="store_true",
                        help=ARG_HELP_STRINGS["sort"])
    args = parser.parse_args()
    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            # BUGFIX: ".format()" was previously applied to print()'s return
            # value (None), raising AttributeError. Format first, then print.
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: added the missing "in" to the error message.
            oat.print_r(
                "Error: '" + args.encoding + "' not found in Python's " +
                "codec collection. Either look for a valid name here " +
                "(https://docs.python.org/2/library/codecs.html#standard-" +
                "encodings) or omit this argument to enable automated " +
                "guessing.")
            sys.exit()
    header, content = oat.get_csv_file_content(args.source_file, enc)
    column_name = "column " + str(args.count_column)
    if header:
        header_line = header[0]
        column_name = header_line[args.count_column]
    oat.print_g("Performing occurence count in column '" + column_name + "'")
    occurence_dict = OrderedDict()
    for line in content:
        try:
            value = line[args.count_column]
        except IndexError as ie:
            # BUGFIX: exceptions have no ".message" attribute in Python 3;
            # use the exception object itself.
            oat.print_y("IndexError ({}) at line {}, skipping...".format(
                ie, line))
            continue
        if value not in occurence_dict:
            occurence_dict[value] = 1
        else:
            occurence_dict[value] += 1
    if args.sort:
        occurence_dict = OrderedDict(
            sorted(occurence_dict.items(), key=lambda x: x[1], reverse=True))
    for item in occurence_dict.items():
        # BUGFIX: converted Python-2-only print statement to a function call.
        print(item[0] + ": " + str(item[1]))
def main():
    """Fill in missing is_hybrid values via JournalTOCs lookups.

    Results per journal title are memoised so each title is queried at most
    once; lookups stop counting against -m/--max_lookups once the cache hits.
    With -i/--integrate the full file is rewritten, otherwise only a
    title/is_hybrid mapping is exported.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("journaltocs_user",
                        help=ARG_HELP_STRINGS["journaltocs_user"])
    parser.add_argument("-i", "--integrate", action="store_true",
                        help=ARG_HELP_STRINGS["integrate"])
    parser.add_argument("-m", "--max_lookups", type=int, default=100,
                        help=ARG_HELP_STRINGS["max_lookups"])
    args = parser.parse_args()

    analysed_journals = {}  # journal title -> hybrid status ("NA" on failure)
    modified_content = []
    lookups = 0

    header, content = oat.get_csv_file_content(args.source_file, enc="utf-8",
                                               force_header=True)
    header_line = header[0]
    modified_content = [list(header_line)]
    for line in content:
        if not oat.has_value(line[6]):  #journal_full_title
            # Without a title there is nothing to look up.
            modified_content.append(line)
            continue
        if not oat.has_value(line[4]):  #is_hybrid
            title = line[6]
            oat.print_y('Looking up journal {}'.format(title))
            if title not in analysed_journals:
                if lookups < args.max_lookups:
                    hybrid_status = get_hybrid_status(line,
                                                      args.journaltocs_user)
                    if hybrid_status is not None:
                        analysed_journals[title] = hybrid_status
                    else:
                        # Lookup failed -> mark as not available.
                        analysed_journals[title] = "NA"
                    lookups += 1
                    line[4] = analysed_journals[title]
                else:
                    # Budget exhausted: leave is_hybrid empty for this line.
                    oat.print_r("Maximum number of lookups reached!")
            else:
                # Cache hit: reuse previous result without a new lookup.
                line[4] = analysed_journals[title]
        modified_content.append(line)
    with open("out.csv", "w") as out:
        if args.integrate:
            writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
            writer.write_rows(modified_content)
        else:
            out.write("journal_full_title,is_hybrid\n")
            for key, value in analysed_journals.items():
                out.write(key + "," + value + "\n")
def main():
    """Count and print value occurrences in one column of a CSV file.

    Optionally (-s) prints the results sorted by descending frequency.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("count_column", type=int,
                        help=ARG_HELP_STRINGS["count_column"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-s", "--sort", action="store_true",
                        help=ARG_HELP_STRINGS["sort"])
    args = parser.parse_args()
    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            oat.print_r("Error: '" + args.encoding + "' not found Python's " +
                        "codec collection. Either look for a valid name here " +
                        "(https://docs.python.org/2/library/codecs.html#standard-" +
                        "encodings) or omit this argument to enable automated " +
                        "guessing.")
            sys.exit()
    header, content = oat.get_csv_file_content(args.source_file, enc)
    column_name = "column " + str(args.count_column)
    if header:
        header_line = header[0]
        column_name = header_line[args.count_column]
    oat.print_g("Performing occurence count in column '" + column_name + "'")
    occurence_dict = OrderedDict()
    for line in content:
        try:
            value = line[args.count_column]
        except IndexError as ie:
            # BUGFIX: exceptions have no ".message" attribute in Python 3;
            # format the exception object itself instead.
            oat.print_y("IndexError ({}) at line {}, skipping...".format(ie, line))
            continue
        if value not in occurence_dict:
            occurence_dict[value] = 1
        else:
            occurence_dict[value] += 1
    if args.sort:
        # Re-create the OrderedDict sorted by count, descending.
        occurence_dict = OrderedDict(sorted(occurence_dict.items(),
                                            key=lambda x: x[1], reverse=True))
    for item in occurence_dict.items():
        print(item[0] + ": " + str(item[1]))
def main():
    """Check hybrid articles for an accessible OA full text on their landing page.

    Resolves each hybrid article's DOI via a matching landing page locator
    (lpl) and searches the page for a PDF link.  Misses are buffered by a
    BufferedErrorHandler and dumped to stderr at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-start", type=int, default=1,
                        help=ARG_HELP_STRINGS["start"])
    # NOTE(review): help text reuses the "start" string — probably meant "end".
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["start"])
    args = parser.parse_args()

    # Two handlers: one prints immediately, the buffered one collects
    # errors/warnings for a summary at the end of the run.
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            oat.print_g(msg)
            enc = args.encoding
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    head, content = oat.get_csv_file_content(args.csv_file, enc)
    # Header is prepended so line numbers match the physical file.
    content = head + content
    line_num = 0
    for line in content:
        line_num += 1
        # -start/-end restrict processing to a line range.
        if args.start and args.start > line_num:
            continue
        if args.end and args.end < line_num:
            continue
        # Check hybrid status
        if line[4] != "TRUE":
            continue
        institution = line[0]
        period = line[1]
        doi = line[3]
        publisher = line[5]
        journal = line[6]
        for lpl in lpl_list:
            if lpl.publisher_matches(publisher):
                init_msg = (u"Line {}: Checking {} article from {}, " +
                            "published in '" +
                            "{}'...").format(line_num, institution, period,
                                             journal)
                oat.print_b(init_msg)
                page_content = get_landingpage_content(doi, lpl)
                if page_content is None:
                    # Landing page could not be fetched; already reported.
                    continue
                pdf_link = lpl.search_for_oa(page_content)
                if pdf_link is None:
                    error_msg = (u"No PDF link found! (line {}, DOI: " +
                                 "http://doi.org/{}").format(line_num, doi)
                    logging.error(error_msg)
                elif pdf_link == "":
                    # Empty string: the locator's regex matched but the
                    # capture group was empty.
                    warning_msg = (u"A RegexGroup matched, but no PDF " +
                                   "link was found! (line {}, DOI: " +
                                   "http://doi.org/{}").format(line_num, doi)
                    logging.warning(warning_msg)
                else:
                    oat.print_g(u"PDF link found: " + pdf_link)
                # Be polite to the publisher's servers.
                time.sleep(1)
    if not bufferedHandler.buffer:
        oat.print_g("\nLookup finished, all articles were accessible on sciencedirect")
    else:
        oat.print_r("\nLookup finished, not all articles could be accessed on sciencedirect:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
def main():
    """Check hybrid Elsevier articles for an accessible PDF on sciencedirect.

    Resolves each DOI, follows it to sciencedirect and searches the page
    source for a PDF download link (single- and multi-document patterns).
    Misses are buffered by a BufferedErrorHandler and summarised at the end.
    NOTE: uses the Python-2 urllib2 API, matching the rest of this script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-start", type=int, default=1,
                        help=ARG_HELP_STRINGS["start"])
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["start"])
    args = parser.parse_args()

    # Immediate handler plus a buffering one for the end-of-run summary.
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            oat.print_g(msg)
            enc = args.encoding
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    head, content = oat.get_csv_file_content(args.csv_file, enc)
    content = head + content
    # Plain browser UA — some publishers reject the default urllib UA.
    header = {"User-Agent": "Mozilla/5.0 Firefox/45.0"}
    line_num = 0
    for line in content:
        line_num += 1
        if args.start and args.start > line_num:
            continue
        if args.end and args.end < line_num:
            continue
        institution = line[0]
        period = line[1]
        doi = line[3]
        is_hybrid = line[4]
        publisher = line[5]
        journal = line[6]
        if publisher != "Elsevier" or is_hybrid != "TRUE":
            continue
        init_msg = (u"Line {}: Checking {} article from {}, published in " +
                    "{}...").format(line_num, institution, period, journal)
        oat.print_b(init_msg)
        url = 'http://doi.org/' + doi
        req = urllib2.Request(url, None, header)
        ret_value = {'success': True}
        try:
            response = urllib2.urlopen(req)
            target = response.geturl()
            resolve_msg = u"DOI {} resolved, led us to {}".format(doi, target)
            if "sciencedirect.com" not in target:
                oat.print_y(resolve_msg)
                oat.print_y("Journal not located at sciencedirect, skipping...")
                continue
            oat.print_b(resolve_msg)
            content_string = response.read()
            single_match = pdflink_re.search(content_string)
            if single_match:
                link_url = single_match.groups()[0]
                oat.print_g(u"PDF link found: " + link_url)
            else:
                multi_match = pdflink_multi_re.search(content_string)
                if multi_match:
                    link_url = multi_match.groups()[0]
                    # BUGFIX: was link_url.replace("&", "&") — a no-op.
                    # The intent is to unescape HTML entities in the URL.
                    link_url = link_url.replace("&amp;", "&")
                    oat.print_g(u"PDF link found (more than one document): " +
                                link_url)
                else:
                    error_msg = (u"No PDF link found! (line {}, DOI: {}, " +
                                 "landing page: {})").format(line_num, doi,
                                                             target)
                    logging.error(error_msg)
            # Be polite to the publisher's servers.
            time.sleep(1)
        except urllib2.HTTPError as httpe:
            code = str(httpe.getcode())
            oat.print_r("HTTPError: {} - {}".format(code, httpe.reason))
        except urllib2.URLError as urle:
            oat.print_r("URLError: {}".format(urle.reason))
    if not bufferedHandler.buffer:
        oat.print_g("\nLookup finished, all articles were accessible on sciencedirect")
    else:
        oat.print_r("\nLookup finished, not all articles could be accessed on sciencedirect:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
def main():
    """Delete (or blank) CSV lines whose column matches one or more values.

    Match values come from -v/--value and/or -f/--file.  By default matched
    lines are replaced by empty placeholder rows to preserve numbering;
    -d/--full_delete removes them entirely.  Survivors go to out.csv,
    deleted lines optionally (-r) to del.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("index", type=int, help=ARG_HELP_STRINGS["index"])
    parser.add_argument("-v", "--value", help=ARG_HELP_STRINGS["value"])
    parser.add_argument("-f", "--file", help=ARG_HELP_STRINGS["file"])
    parser.add_argument("-d", "--full_delete", action="store_true",
                        help=ARG_HELP_STRINGS["full_delete"])
    parser.add_argument("-i", "--ignore_case", action="store_true",
                        help=ARG_HELP_STRINGS["ignore_case"])
    parser.add_argument("-r", "--results_file", action="store_true",
                        help=ARG_HELP_STRINGS["results_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    if args.value is None and args.file is None:
        parser.error("Either a single value (-v option) or a file of " +
                     "multiple values (-f option) must be given.")

    values = []
    if args.file:
        if not os.path.isfile(args.file):
            print("Error: '" + args.file + "' is no valid file!")
            sys.exit()
        with open(args.file, "r") as f:
            for line in f:
                if len(line) > 0:
                    value = line.strip("\r\n")
                    if args.ignore_case:
                        values.append(value.lower())
                    else:
                        values.append(value)
        oat.print_g(str(len(values)) + " values read from file")
    if args.value is not None:
        if args.ignore_case:
            values.append(args.value.lower())
        else:
            values.append(args.value)
        if args.file:
            oat.print_y("Value argument given in addition to file " +
                        "argument, adding value to file imports...")

    quote_rules = args.openapc_quote_rules

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: added the missing "in" to the error message.
            print("Error: '" + args.encoding + "' not found in Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.csv_file, enc)

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # BUGFIX: there was no space between "and" and "'f'".
            print("Error: A quotemask may only contain the letters 't' " +
                  "and 'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    # Placeholder row used when full delete is not requested.
    empty_line = ['' for element in content[0]]
    column_name = "column " + str(args.index)
    if header:
        header_line = header[0]
        column_name = header_line[args.index]
        empty_line = ['' for element in header_line]
    msg = u"Performing line deletion on condition '{}' in {}".format(
        column_name, values)
    oat.print_g(msg)
    modified_content = []
    deleted_lines = []
    num_total_lines = num_deleted_lines = 0
    for line in content:
        if len(line) == 0:
            continue
        num_total_lines += 1
        current_value = line[args.index]
        if args.ignore_case:
            current_value = current_value.lower()
        if current_value not in values:
            modified_content.append(line)
        else:
            num_deleted_lines += 1
            if not args.full_delete:
                modified_content.append(list(empty_line))
            if args.results_file:
                deleted_lines.append(line)
    msg = u"Process complete, deleted {} out of {} total lines"
    oat.print_g(msg.format(num_deleted_lines, num_total_lines))
    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(copy.deepcopy(header) + modified_content)
    if args.results_file and len(deleted_lines) > 0:
        with open('del.csv', 'w') as out:
            writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
            writer.write_rows(copy.deepcopy(header) + deleted_lines)
    # NOTE(review): this loop is the tail of a matching function whose `def`
    # line lies before this chunk; it selects the GRID name with the highest
    # fuzzy-match ratio against the institution name.
    for name in grid_names:
        current_ratio = ratio(name, institutions_name)
        if current_ratio > highest_ratio:
            highest_ratio = current_ratio
            grid_name = name
    return grid_name, highest_ratio


def write_out_file(ins_header, ins_content):
    """Write the institutions table (header + rows) to out.csv, unquoted."""
    with open("out.csv", "w") as out_file:
        quote_mask = [False for x in range(7)]
        writer = oat.OpenAPCUnicodeWriter(out_file, quote_mask, False, False)
        writer.write_rows(ins_header + ins_content)


# Script body: load the institutions table and the GRID dump.
ins_header, ins_content = oat.get_csv_file_content("../data/institutions.csv",
                                                   "utf-8", True, False)

with open("grid.json") as grid_file:
    content = grid_file.read()
json_dict = json.loads(content)
grid_list = json_dict["institutes"]

# Iterate all GRID institutes, printing a progress message at each decile.
# NOTE(review): the loop body continues beyond this chunk.
for index, ins in enumerate(grid_list):
    deciles = {
        round((len(grid_list) / 10) * i): str(i * 10) + "%"
        for i in range(1, 10)
    }
    if index in deciles:
        print(deciles[index])
    if ins["status"] != "active":
        continue
def main():
    """Resolve DOI duplicates between a new data file and the target file.

    For each DOI present in both files, compares the reported costs:
    institutional mismatches and cost deviations above the tolerance delete
    both entries and record them in the unresolved-duplicates file; small
    deviations delete only the new entry.  All affected files (target, new,
    unresolved duplicates and any enriched source files) are rewritten in
    place.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("new_file", help=ARG_HELP_STRINGS["new_file"])
    # NOTE(review): help string reuses "new_file" — likely a copy/paste slip.
    parser.add_argument("target_file", help=ARG_HELP_STRINGS["new_file"])
    parser.add_argument('cost_tolerance', type=float,
                        help=ARG_HELP_STRINGS["cost_tolerance"])
    parser.add_argument('enriched_files', nargs='+',
                        help=ARG_HELP_STRINGS["enriched_files"])
    parser.add_argument('-b', '--batch', type=int,
                        help=ARG_HELP_STRINGS["batch"])
    args = parser.parse_args()

    target_file_name = get_filename(args.target_file)
    new_file_name = get_filename(args.new_file)
    # Load every enriched source file into the module-level registry.
    for path in args.enriched_files:
        if not os.path.isfile(path):
            oat.print_r('Error: "' + path + '" is no valid file path!')
            sys.exit()
        ENRICHED_FILES[path] = {"modified": False,
                                "file_name": get_filename(path)}
        ENRICHED_FILES[path]["header"], ENRICHED_FILES[path]["content"] = \
            oat.get_csv_file_content(path, enc="utf-8", force_header=True)

    target_header, target_content = oat.get_csv_file_content(
        args.target_file, enc="utf-8", force_header=True)
    new_header, new_content = oat.get_csv_file_content(
        args.new_file, enc="utf-8", force_header=True)
    ud_header, ud_content = oat.get_csv_file_content(
        UD_FILE, enc="utf-8", force_header=True)

    # Collect (new_index, target_index) pairs of duplicate DOIs (column 3).
    duplicates = []
    target_dois = [line[3] for line in target_content]
    for new_index, line in enumerate(new_content):
        doi = line[3]
        if doi == "NA" or doi not in target_dois:
            continue
        else:
            target_index = get_duplicate_index(target_content, doi)
            duplicates.append((new_index, target_index))

    count = 0
    for pair in duplicates:
        new_line = new_content[pair[0]]
        target_line = target_content[pair[1]]
        doi = target_line[3]
        new_cost = float(new_line[2])
        target_cost = float(target_line[2])
        # Relative deviation, normalised by the larger of the two costs.
        if new_cost >= target_cost:
            deviation = (new_cost - target_cost) / new_cost
        else:
            deviation = (target_cost - new_cost) / target_cost
        oat.print_b("Duplicate found:")
        print("In new file " + new_file_name + ":")
        print(",".join(new_line))
        print("In target file " + target_file_name + ":")
        print(",".join(target_line))
        if new_line[0] != target_line[0]:
            # Different institutions claim the same DOI: unresolvable here.
            msg = 'Institutional mismatch "{}"/"{}". Lines will be deleted and added to the unresolved duplicates file.'
            oat.print_r(msg.format(new_line[0],target_line[0]))
            new_content[pair[0]] = list(EMPTY_LINE)
            # REPLACEMENT marks target lines for removal after the loop, so
            # the stored indices stay valid while iterating.
            target_content[pair[1]] = REPLACEMENT
            ud_content += [target_line]
            ud_content += [new_line]
            path, index = find_in_enriched_files(doi)
            ENRICHED_FILES[path]["content"][index] = list(EMPTY_LINE)
            ENRICHED_FILES[path]["modified"] = True
        elif deviation <= args.cost_tolerance:
            msg = "Cost deviation between {} and {} is below tolerance threshold ({} <= {}). Entries are treated as equal, only the new one will be deleted."
            oat.print_g(msg.format(new_cost, target_cost, deviation,
                                   args.cost_tolerance))
            new_content[pair[0]] = list(EMPTY_LINE)
        else:
            msg = "Cost deviation between {} and {} exceeds tolerance threshold ({} > {}). Entries are treated as different, both will be deleted."
            oat.print_y(msg.format(new_cost, target_cost, deviation,
                                   args.cost_tolerance))
            new_content[pair[0]] = list(EMPTY_LINE)
            target_content[pair[1]] = REPLACEMENT
            path, index = find_in_enriched_files(doi)
            ENRICHED_FILES[path]["content"][index] = list(EMPTY_LINE)
            ENRICHED_FILES[path]["modified"] = True
        count += 1
        # -b/--batch limits how many duplicates are processed per run.
        if args.batch and count >= args.batch:
            break

    # Physically drop all lines marked for removal.
    while REPLACEMENT in target_content:
        target_content.remove(REPLACEMENT)

    with open(args.target_file, 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
        writer.write_rows(target_header + target_content)
    with open(args.new_file, 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
        writer.write_rows(new_header + new_content)
    with open(UD_FILE, 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
        writer.write_rows(ud_header + ud_content)
    # Only rewrite enriched files that were actually touched.
    for path, enriched_file in ENRICHED_FILES.items():
        if enriched_file["modified"]:
            with open(path, 'w') as out:
                writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
                writer.write_rows(enriched_file["header"] +
                                  enriched_file["content"])
def main():
    """Enrich an APC file with ISSN-L values from an ISSN-to-ISSN-L mapping.

    Builds a lookup dict from the tab-separated issn_l_file, then fills
    column 10 (issn_l) of each line by matching issn, issn_p or issn_e
    (columns 7-9, in that order of preference), applying OpenAPC's ISSN-L
    corrections on the way.  The result is written to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("apc_file", help=ARG_HELP_STRINGS["apc_file"])
    parser.add_argument("issn_l_file", help=ARG_HELP_STRINGS["issn_l_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # BUGFIX: there was no space between "and" and "'f'".
            print("Error: A quotemask may only contain the letters 't' " +
                  "and 'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: added the missing "in" to the error message.
            print("Error: '" + args.encoding + "' not found in Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.apc_file, enc)
    oat.print_g("Preparing mapping table...")
    itself = other = 0
    # BUGFIX: made the pattern a raw string — "\d" in a normal string is an
    # invalid escape sequence (DeprecationWarning since Python 3.6).
    issn_l_re = re.compile(
        r"^(?P<issn>\d{4}-\d{3}[\dxX])\t(?P<issn_l>\d{4}-\d{3}[\dxX])$")
    issn_l_file = open(args.issn_l_file, "r")
    issn_l_dict = {}
    for i, line in enumerate(issn_l_file):
        # Progress output: the mapping file has millions of lines.
        if i % 100000 == 0:
            print(str(i) + " lines processed.")
        match = issn_l_re.match(line)
        if match:
            match_dict = match.groupdict()
            issn_l_dict[match_dict['issn']] = match_dict['issn_l']
            if match_dict['issn'] == match_dict['issn_l']:
                itself += 1
            else:
                other += 1
    print(str(itself) + " ISSNs pointing to itself as ISSN-L, " +
          str(other) + " to another value.")

    oat.print_g("Starting enrichment...")
    issn_matches = issn_p_matches = issn_e_matches = unmatched = different = corrections = 0
    enriched_lines = []
    for line in content:
        if len(line) == 0:
            enriched_lines.append(line)
            continue
        issn = reformat_issn(line[7])
        issn_p = reformat_issn(line[8])
        issn_e = reformat_issn(line[9])
        target = None
        # Preference order: issn, then issn_p, then issn_e.
        if issn in issn_l_dict:
            target = issn_l_dict[issn]
            corrected_target = oat.get_corrected_issn_l(target)
            if corrected_target != target:
                corrections += 1
            line[10] = corrected_target
            issn_matches += 1
        elif issn_p in issn_l_dict:
            target = issn_l_dict[issn_p]
            corrected_target = oat.get_corrected_issn_l(target)
            if corrected_target != target:
                corrections += 1
            line[10] = corrected_target
            issn_p_matches += 1
        elif issn_e in issn_l_dict:
            target = issn_l_dict[issn_e]
            corrected_target = oat.get_corrected_issn_l(target)
            if corrected_target != target:
                corrections += 1
            line[10] = corrected_target
            issn_e_matches += 1
        else:
            unmatched += 1
        if target is not None and target not in [issn, issn_p, issn_e]:
            different += 1
        enriched_lines.append(line)

    msg = ("{} issn_l values mapped by issn, {} by issn_p, {} by issn_e. {} " +
           "could not be assigned.\n{} issn_l values were corrected during " +
           "the process.\n In {} cases the ISSN-L was different from all " +
           "existing ISSN values")
    print(msg.format(issn_matches, issn_p_matches, issn_e_matches, unmatched,
                     corrections, different))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows(header + enriched_lines)
def main():
    """Convert a monetary column to EUR using ECB exchange rates.

    Parses values with the configured locale, looks up the exchange rate for
    each line's currency and period (cached in EXCHANGE_RATES, fetched from
    the ECB data warehouse on demand), and writes the converted amount to
    the target column.  For daily rates, up to 5 following days are tried
    when a date has no rate.  Result goes to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    # NOTE(review): -f is parsed but never read in this function; kept for
    # interface compatibility.
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # BUGFIX: there was no space between "and" and "'f'".
            print("Error: A quotemask may only contain the letters 't' " +
                  "and 'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: added the missing "in" to the error message.
            print("Error: '" + args.encoding + "' not found in Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # BUGFIX: loce.message raised AttributeError on Python 3 —
            # exceptions no longer have a "message" attribute.
            msg = "Setting locale to {} failed: {}".format(norm, loce)
            oat.print_r(msg)
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc, True)
    fieldnames = header.pop()
    modified_content = []
    line_num = 0

    # Show the user which columns were selected before doing anything.
    for column_type in ["source_column", "currency_column", "period_column",
                        "target_column"]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))
    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " +
                        str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try:
            # Locale-aware parse (handles e.g. "1.234,56" under de_DE).
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if currency == "EUR":
            # Already EUR: copy the value over without conversion.
            msg = "WARNING: Currency in line {} is already EUR, skipping..."
            oat.print_y(msg.format(line_num))
            line[args.target_column] = line[args.source_column]
            modified_content.append(line)
            continue
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        frequency = get_frequency(period)
        if frequency is None:
            msg = "WARNING: Could not extract a valid date string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        if currency not in EXCHANGE_RATES[frequency]:
            # Fetch and cache the full rate table for this currency/frequency.
            msg = 'No exchange rates ({}) found for currency "{}", querying ECB data warehouse...'
            oat.print_b(msg.format(frequency, currency))
            rates = oat.get_euro_exchange_rates(currency, frequency)
            EXCHANGE_RATES[frequency][currency] = rates
        rate = EXCHANGE_RATES[frequency][currency].get(period)
        if rate is None and frequency == "A":
            # Current year: annual average not published yet, estimate it.
            rate = _calulate_preliminary_annual_average(period, currency)
            if rate:
                EXCHANGE_RATES[frequency][currency][period] = rate
        if rate is None:
            if frequency != "D":
                msg = "Error: No conversion rate found for currency {} for period {} (line {}), aborting..."
                oat.print_r(msg.format(currency, period, line_num))
                sys.exit()
            # Daily rates: holidays/weekends have no rate, look ahead.
            day_retries = 0
            while rate is None:
                msg = "Warning: No conversion rate found for currency {} for period {} (line {}), trying next day..."
                oat.print_y(msg.format(currency, period, line_num))
                period = get_next_day(period)
                rate = EXCHANGE_RATES[frequency][currency].get(period)
                day_retries += 1
                if day_retries > 5:
                    msg = "Error: Look-ahead limit for days exceeded, aborting..."
                    oat.print_r(msg)
                    sys.exit()
        euro_value = round(monetary_value/float(rate), 2)
        line[args.target_column] = str(euro_value)
        msg = "Line {}: {} exchange rate ({}) for date {} is {} -> {} / {} = {} EUR"
        msg = msg.format(line_num, currency, frequency, period, rate,
                         monetary_value, rate, euro_value)
        oat.print_g(msg)
        modified_content.append(line)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
def main():
    """Delete (or blank out) rows of a CSV file whose cell in a given column
    matches one of a set of values.

    Values may be given directly (-v) and/or read from a file (-f, one value
    per line). Matching rows are either removed completely (-d) or replaced
    by empty lines, and can optionally be collected in a results file (-r).
    Output is written to out.csv (and del.csv for collected rows).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("index", type=int, help=ARG_HELP_STRINGS["index"])
    parser.add_argument("-v", "--value", help=ARG_HELP_STRINGS["value"])
    parser.add_argument("-f", "--file", help=ARG_HELP_STRINGS["file"])
    parser.add_argument("-d", "--full_delete", action="store_true",
                        help=ARG_HELP_STRINGS["full_delete"])
    parser.add_argument("-i", "--ignore_case", action="store_true",
                        help=ARG_HELP_STRINGS["ignore_case"])
    parser.add_argument("-r", "--results_file", action="store_true",
                        help=ARG_HELP_STRINGS["results_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    if args.value is None and args.file is None:
        parser.error("Either a single value (-v option) or a file of " +
                     "multiple values (-f option) must be given.")

    values = []
    if args.file:
        if not os.path.isfile(args.file):
            print("Error: '" + args.file + "' is no valid file!")
            sys.exit()
        with open(args.file, "r") as f:
            for line in f:
                value = line.strip("\r\n")
                # BUGFIX: test emptiness *after* stripping the line break.
                # The old check (len(line) > 0) was always true for lines
                # read from a file and could add empty strings to the value
                # list, unintentionally deleting every row with an empty
                # cell in the target column.
                if len(value) > 0:
                    if args.ignore_case:
                        values.append(value.lower())
                    else:
                        values.append(value)
        oat.print_g(str(len(values)) + " values read from file")

    if args.value is not None:
        if args.ignore_case:
            values.append(args.value.lower())
        else:
            values.append(args.value)
        if args.file:
            oat.print_y("Value argument given in addition to file " +
                        "argument, adding value to file imports...")

    quote_rules = args.openapc_quote_rules

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: added the missing "in" to the error message.
            print("Error: '" + args.encoding + "' not found in Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.csv_file, enc)

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # BUGFIX: added the missing space before 'f' in the message.
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    # Template for a blanked-out row; prefer the header for the column count.
    empty_line = ['' for element in content[0]]
    column_name = "column " + str(args.index)
    if header:
        header_line = header[0]
        column_name = header_line[args.index]
        empty_line = ['' for element in header_line]

    msg = u"Performing line deletion on condition '{}' in {}".format(column_name, values)
    oat.print_g(msg)

    modified_content = []
    deleted_lines = []
    num_total_lines = num_deleted_lines = 0
    for line in content:
        if len(line) == 0:
            continue
        num_total_lines += 1
        current_value = line[args.index]
        if args.ignore_case:
            current_value = current_value.lower()
        if current_value not in values:
            modified_content.append(line)
        else:
            num_deleted_lines += 1
            if not args.full_delete:
                # Keep row count stable by writing an empty placeholder row.
                modified_content.append(list(empty_line))
            if args.results_file:
                deleted_lines.append(line)
    msg = u"Process complete, deleted {} out of {} total lines"
    oat.print_g(msg.format(num_deleted_lines, num_total_lines))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(copy.deepcopy(header) + modified_content)
    if args.results_file and len(deleted_lines) > 0:
        with open('del.csv', 'w') as out:
            writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
            writer.write_rows(copy.deepcopy(header) + deleted_lines)
def main():
    """Perform a column operation (delete, insert, move or copy) on a CSV
    file and write the result to out.csv.

    The concrete operation is selected via a subcommand; set_defaults()
    stores the matching handler function in args.func.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    # BUGFIX: make the subcommand mandatory. Without this, invoking the
    # script with no operation raised an AttributeError on args.func
    # instead of printing a proper usage error.
    subparsers = parser.add_subparsers(help='The column operation to perform',
                                       dest='operation', required=True)
    delete_parser = subparsers.add_parser("delete", help="delete help")
    delete_parser.add_argument("column_index", type=int, help='bar help')
    delete_parser.set_defaults(func=delete_column)
    insert_parser = subparsers.add_parser("insert", help="insert help")
    insert_parser.add_argument("target_index", type=int, help='bar help')
    insert_parser.add_argument("column_name", help='bar help')
    insert_parser.add_argument("default_value", help='bar help')
    insert_parser.set_defaults(func=insert_column)
    move_parser = subparsers.add_parser("move", help="move help")
    move_parser.add_argument("column_index", type=int, help='bar help')
    move_parser.add_argument("target_index", type=int, help='bar help')
    move_parser.set_defaults(func=move_column)
    copy_parser = subparsers.add_parser("copy", help="copy help")
    copy_parser.set_defaults(func=copy)
    args = parser.parse_args()
    quote_rules = args.openapc_quote_rules

    enc = None  # CSV file encoding
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: added the missing "in" to the error message.
            print("Error: '" + args.encoding + "' not found in Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.csv_file, enc)

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # BUGFIX: added the missing space before 'f' in the message.
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    # Dispatch to the handler registered for the chosen subcommand.
    new_rows = args.func(header, content, args)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows(new_rows)
def main():
    """Sort a CSV file by a column, or — when a second CSV file is given —
    rearrange its rows to match the row order of that file, matching rows
    on a key column. Unmatched rows are appended at the end in their
    original order. The result is written to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("column", type=int, help=ARG_HELP_STRINGS["column"])
    parser.add_argument("other_csv_file", nargs="?",
                        help=ARG_HELP_STRINGS["other_csv_file"])
    parser.add_argument("other_column", type=int, nargs="?",
                        help=ARG_HELP_STRINGS["other_column"])
    parser.add_argument("-e2", "--other_encoding",
                        help=ARG_HELP_STRINGS["other_encoding"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-i", "--ignore_case", action="store_true",
                        default=False, help=ARG_HELP_STRINGS["ignore_case"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()
    quote_rules = args.openapc_quote_rules

    encs = []  # CSV file encodings, one slot per input file (may be None)
    for encoding in [args.encoding, args.other_encoding]:
        if encoding:
            try:
                codec = codecs.lookup(encoding)
                msg = "Encoding '{}' found in Python's codec collection as '{}'"
                print(msg.format(encoding, codec.name))
            except LookupError:
                # BUGFIX: added the missing "in" to the error message.
                print("Error: '" + encoding + "' not found in Python's " +
                      "codec collection. Either look for a valid name here " +
                      "(https://docs.python.org/2/library/codecs.html#standard-" +
                      "encodings) or omit this argument to enable automated " +
                      "guessing.")
                sys.exit()
        encs.append(encoding)

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if reduced:
            print("Error: A quotemask may only contain the letters 't' and 'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    header, content = oat.get_csv_file_content(args.csv_file, enc=encs[0])
    column = args.column
    if not args.other_csv_file:
        # Simple case: sort by the key column.
        rearranged_content = header + sorted(content, key=lambda x: x[column])
    else:
        rearranged_content = []
        _, second_content = oat.get_csv_file_content(args.other_csv_file,
                                                     enc=encs[1])
        other_column = column  # default: use same column index as in first file
        # BUGFIX: compare against None. The old truthiness test
        # ("if args.other_column:") silently ignored an explicit column
        # index of 0 and fell back to the first file's column.
        if args.other_column is not None:
            other_column = args.other_column
        for other_row in second_content:
            if args.ignore_case:
                matching_rows = [row for row in content
                                 if row[column].lower() == other_row[other_column].lower()]
            else:
                matching_rows = [row for row in content
                                 if row[column] == other_row[other_column]]
            rearranged_content += matching_rows
            # Remove consumed rows so later keys cannot match them again.
            for matching_row in matching_rows:
                content.remove(matching_row)
        unmatched_msg = ("{} rows could not be rearranged (unmatched in second csv file) " +
                         "and were appended to the end of the result file " +
                         "in original order.")
        if content:
            oat.print_y(unmatched_msg.format(len(content)))
        else:
            oat.print_g("All rows matched.")
        rearranged_content = header + rearranged_content + content  # append any unmatched rows

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(rearranged_content)
def main():
    """Convert monetary values in a CSV file to Euros using the fixed
    average yearly conversion rates in AVG_YEARLY_CONVERSION_RATES.

    The user confirms the detected column mapping interactively before the
    conversion starts. Rows with missing/unparseable values are kept
    unchanged; an unknown currency/year combination aborts the run.
    Output is written to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()
    quote_rules = args.openapc_quote_rules

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # BUGFIX: added the missing space before 'f' in the message.
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: added the missing "in" to the error message.
            print("Error: '" + args.encoding + "' not found in Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # BUGFIX: locale.Error has no "message" attribute in Python 3 —
            # accessing it raised an AttributeError instead of reporting the
            # actual problem. str(loce) yields the error text.
            msg = "Setting locale to {} failed: {}".format(norm, str(loce))
            oat.print_r(msg)
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc)
    fieldnames = header.pop()
    modified_content = []
    line_num = 0

    # Show the detected column mapping and ask for confirmation.
    for column_type in ["source_column", "currency_column", "period_column",
                        "target_column"]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))
    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " +
                        str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try:
            # locale.atof honours the locale chosen above (decimal comma etc).
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        if not oat.has_value(period) or not period.isdigit():
            msg = "WARNING: Could not extract a valid year string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        try:
            rate = AVG_YEARLY_CONVERSION_RATES[currency][period]
        except KeyError:
            msg = "ERROR: No conversion rate found for currency {} in year {} (line {}), aborting..."
            oat.print_r(msg.format(currency, period, line_num))
            sys.exit()
        euro_value = round(monetary_value / rate, 2)
        line[args.target_column] = str(euro_value)
        modified_content.append(line)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
def main():
    """Build a key -> value mapping table from one CSV file and use it to
    fill in or replace values in a second CSV file.

    In strict mode (-s) keys that map to more than one distinct value are
    dropped from the table; otherwise later values overwrite earlier ones.
    Existing non-empty target values are only overwritten with -f. The
    modified target file is written to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("source_file_key_column", type=int,
                        help=ARG_HELP_STRINGS["source_file_key_column"])
    parser.add_argument("source_file_value_column", type=int,
                        help=ARG_HELP_STRINGS["source_file_value_column"])
    parser.add_argument("target_file", help=ARG_HELP_STRINGS["target_file"])
    parser.add_argument("target_file_key_column", type=int,
                        help=ARG_HELP_STRINGS["target_file_key_column"])
    parser.add_argument("target_file_value_column", type=int,
                        help=ARG_HELP_STRINGS["target_file_value_column"])
    parser.add_argument("-s", "--strict", action="store_true",
                        help=ARG_HELP_STRINGS["strict"])
    parser.add_argument("-f", "--force_overwrite", action="store_true",
                        help=ARG_HELP_STRINGS["force_overwrite"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-e2", "--other_encoding",
                        help=ARG_HELP_STRINGS["other_encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()
    quote_rules = args.openapc_quote_rules

    encs = []  # CSV file encodings, one slot per input file (may be None)
    for encoding in [args.encoding, args.other_encoding]:
        if encoding:
            try:
                codec = codecs.lookup(encoding)
                # BUGFIX: the old code chained .format() onto the return
                # value of print() (a Python-2 print-statement leftover),
                # which raises AttributeError on None in Python 3. The
                # dead "enc = args.encoding" assignment was removed too.
                msg = "Encoding '{}' found in Python's codec collection as '{}'"
                print(msg.format(encoding, codec.name))
            except LookupError:
                # BUGFIX: added the missing "in" to the error message.
                print("Error: '" + encoding + "' not found in Python's " +
                      "codec collection. Either look for a valid name here " +
                      "(https://docs.python.org/2/library/codecs.html#standard-" +
                      "encodings) or omit this argument to enable automated " +
                      "guessing.")
                sys.exit()
        encs.append(encoding)

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # BUGFIX: added the missing space before 'f' in the message.
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    source_header, source_content = oat.get_csv_file_content(args.source_file,
                                                             enc=encs[0])
    key_column_name = "column " + str(args.source_file_key_column)
    value_column_name = "column " + str(args.source_file_value_column)
    if source_header:
        header = source_header[0]
        key_column_name = header[args.source_file_key_column]
        value_column_name = header[args.source_file_value_column]
    msg = u"Creating mapping table ({} -> {}) for source file {}...".format(
        key_column_name, value_column_name, args.source_file)
    oat.print_g(msg)

    mapping_table = {}
    ambiguous_keys = []
    for line in source_content:
        if line:
            key = line[args.source_file_key_column]
            if key == 'NA':
                continue
            value = line[args.source_file_value_column]
            if key not in mapping_table:
                mapping_table[key] = value
            else:
                if mapping_table[key] != value:
                    if not args.strict:
                        msg = u"WARNING: Replacing existing value '{}' for key '{}' with new value '{}'".format(
                            mapping_table[key], key, value)
                        mapping_table[key] = value
                        oat.print_y(msg)
                    else:
                        if key not in ambiguous_keys:
                            ambiguous_keys.append(key)
    if args.strict:
        # Strict mode: ambiguous keys are unusable, drop them entirely.
        for key in ambiguous_keys:
            del mapping_table[key]
            msg = u"INFO: Ambiguous key '{}' dropped from mapping table".format(key)
            oat.print_b(msg)
    oat.print_g("mapping table created, contains " +
                str(len(mapping_table)) + " entries")

    target_header, target_content = oat.get_csv_file_content(args.target_file,
                                                             enc=encs[1])
    # Line numbering in messages accounts for a possible header row.
    line_num = 0 if not target_header else 1
    replace_msg = u"Line {}: Found matching key '{}', replaced old value '{}' by '{}'"
    modified_content = []
    for line in target_content:
        key = line[args.target_file_key_column]
        if key in mapping_table:
            new_value = mapping_table[key]
            old_value = line[args.target_file_value_column]
            if old_value != new_value:
                if len(old_value) == 0 or old_value == "NA":
                    # Empty/NA values are always safe to fill in.
                    line[args.target_file_value_column] = new_value
                    msg = replace_msg.format(line_num, key, old_value, new_value)
                    oat.print_g(msg)
                else:
                    if args.force_overwrite:
                        line[args.target_file_value_column] = new_value
                        msg = replace_msg.format(line_num, key, old_value, new_value)
                        oat.print_y(msg)
        modified_content.append(line)
        line_num += 1

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(target_header + modified_content)
def main():
    """Check OA availability for hybrid articles in an OpenAPC CSV file by
    fetching each DOI's landing page and searching it for a PDF link.

    Errors and warnings are buffered (oat.BufferedErrorHandler) and
    re-printed at the end so the outcome of a long run can be reviewed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-start", type=int, default=1,
                        help=ARG_HELP_STRINGS["start"])
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["start"])
    args = parser.parse_args()

    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: added the missing "in" to the error message.
            msg = ("Error: '" + args.encoding + "' not found in Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    head, content = oat.get_csv_file_content(args.csv_file, enc)
    content = head + content

    line_num = 0
    for line in content:
        line_num += 1
        # Honour the optional -start/-end line window.
        if args.start and args.start > line_num:
            continue
        if args.end and args.end < line_num:
            continue
        # Check hybrid status
        if line[4] != "TRUE":
            continue
        institution = line[0]
        period = line[1]
        doi = line[3]
        publisher = line[5]
        journal = line[6]
        for lpl in lpl_list:
            if lpl.publisher_matches(publisher):
                init_msg = (u"Line {}: Checking {} article from {}, published in '" +
                            "{}'...").format(line_num, institution, period, journal)
                oat.print_b(init_msg)
                page_content = get_landingpage_content(doi, lpl)
                if page_content is None:
                    continue
                pdf_link = lpl.search_for_oa(page_content)
                if pdf_link is None:
                    # BUGFIX: added the missing closing parenthesis to the
                    # log message.
                    error_msg = (u"No PDF link found! (line {}, DOI: " +
                                 "http://doi.org/{})").format(line_num, doi)
                    logging.error(error_msg)
                elif pdf_link == "":
                    warning_msg = (u"A RegexGroup matched, but no PDF " +
                                   "link was found! (line {}, DOI: " +
                                   "http://doi.org/{})").format(line_num, doi)
                    logging.warning(warning_msg)
                else:
                    oat.print_g(u"PDF link found: " + pdf_link)
                # Be polite to the publisher's server between requests.
                time.sleep(1)

    if not bufferedHandler.buffer:
        oat.print_g("\nLookup finished, all articles were accessible")
    else:
        oat.print_r("\nLookup finished, not all articles could be accessed:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
def main():
    """Convert monetary values in a CSV file to Euros using ECB exchange
    rates, fetched on demand and cached in the module-level EXCHANGE_RATES.

    The rate frequency (e.g. daily vs yearly) is derived from the format of
    the period column via get_frequency(). Rows with missing/unparseable
    values are kept unchanged; a missing rate aborts the run. Output is
    written to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()
    quote_rules = args.openapc_quote_rules

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # BUGFIX: added the missing space before 'f' in the message.
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: added the missing "in" to the error message.
            print("Error: '" + args.encoding + "' not found in Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # BUGFIX: locale.Error has no "message" attribute in Python 3;
            # accessing it raised an AttributeError instead of reporting the
            # actual problem. str(loce) yields the error text.
            msg = "Setting locale to {} failed: {}".format(norm, str(loce))
            oat.print_r(msg)
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc, True)
    fieldnames = header.pop()
    modified_content = []
    line_num = 0

    # Show the detected column mapping and ask for confirmation.
    for column_type in ["source_column", "currency_column", "period_column",
                        "target_column"]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))
    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " +
                        str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try:
            # locale.atof honours the locale chosen above (decimal comma etc).
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        frequency = get_frequency(period)
        if frequency is None:
            msg = "WARNING: Could not extract a valid date string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        if currency not in EXCHANGE_RATES[frequency]:
            # Lazily fetch and cache the rate table for this currency.
            msg = 'No exchange rates ({}) found for currency "{}", querying ECB data warehouse...'
            oat.print_b(msg.format(frequency, currency))
            rates = oat.get_euro_exchange_rates(currency, frequency)
            EXCHANGE_RATES[frequency][currency] = rates
        try:
            rate = EXCHANGE_RATES[frequency][currency][period]
        except KeyError:
            msg = "ERROR: No conversion rate found for currency {} for period {} (line {}), aborting..."
            oat.print_r(msg.format(currency, period, line_num))
            sys.exit()
        euro_value = round(monetary_value / float(rate), 2)
        line[args.target_column] = str(euro_value)
        msg = "Line {}: {} exchange rate ({}) for date {} is {} -> {} / {} = {} EUR"
        msg = msg.format(line_num, currency, frequency, period, rate,
                         monetary_value, rate, euro_value)
        oat.print_g(msg)
        modified_content.append(line)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
def main():
    """Compute pairwise Levenshtein ratios between all unique values of a
    CSV column and write every pair at or above a minimum ratio to out.csv,
    sorted by descending similarity.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("index", type=int, help=ARG_HELP_STRINGS["index"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-m", "--min_ratio", type=float,
                        help=ARG_HELP_STRINGS["min_ratio"], default=0.0)
    args = parser.parse_args()

    if args.min_ratio < 0.0 or args.min_ratio > 1.0:
        oat.print_r("Error: min_ratio parameter must be a float between 0.0 and 1.0")
        sys.exit()

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: added the missing "in" to the error message.
            print("Error: '" + args.encoding + "' not found in Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.csv_file, enc)
    header = header.pop()
    entities = []
    seen = set()  # PERF: O(1) membership test instead of scanning the list
    line_num = 0
    msg = "Processed {} entries in column '{}', {} unique entities found."
    last_msg = None
    for line in content:
        line_num += 1
        value = line[args.index]
        if value not in seen:
            seen.add(value)
            entities.append(value)  # list preserves first-seen order
        if line_num == len(content) or line_num % 100 == 0:
            last_msg = msg.format(line_num, header[args.index], len(entities))
            print(last_msg, end="\r")
    print(last_msg)

    sim_pairs = []
    # Number of unordered pairs: n*(n-1)/2 for n entities.
    n = len(entities) - 1
    num_pairs = int((n * n + n) / 2)
    msg = ("Calculated Levenshtein ratio for {} out of {} possible entity combinations ({}%), " +
           "{} have passed the minimum ratio so far.")
    last_msg = None
    num_calcs = 0
    while entities:
        first_part = entities.pop(0)
        for second_part in entities:
            lev_ratio = ratio(first_part, second_part)
            num_calcs += 1
            if lev_ratio >= args.min_ratio:
                sim_pairs.append([first_part, second_part, str(lev_ratio)])
            if num_calcs == num_pairs or num_calcs % 100 == 0:
                last_msg = msg.format(num_calcs, num_pairs,
                                      round(num_calcs / num_pairs * 100, 1),
                                      len(sim_pairs))
                print(last_msg, end="\r")
    print(last_msg)
    # BUGFIX: sort by the numeric ratio, not its string representation.
    # Lexicographic comparison misorders values rendered in scientific
    # notation (e.g. '5e-05' would sort above '0.9').
    sim_pairs.sort(key=lambda pair: float(pair[2]), reverse=True)
    sim_pairs.insert(0, ["first_item", "second_item", "levenshtein_ratio"])
    with open("out.csv", "w") as out_file:
        writer = oat.OpenAPCUnicodeWriter(out_file)
        writer.write_rows(sim_pairs)
def main():
    """Sort a CSV file by a given column, or — if a second CSV file is
    supplied — rearrange its rows to mirror the row order of that file
    (matching rows on a key column). Rows without a match in the second
    file are appended at the end in original order. Writes out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("column", type=int, help=ARG_HELP_STRINGS["column"])
    parser.add_argument("other_csv_file", nargs="?",
                        help=ARG_HELP_STRINGS["other_csv_file"])
    parser.add_argument("other_column", type=int, nargs="?",
                        help=ARG_HELP_STRINGS["other_column"])
    parser.add_argument("-e2", "--other_encoding",
                        help=ARG_HELP_STRINGS["other_encoding"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-i", "--ignore_case", action="store_true",
                        default=False, help=ARG_HELP_STRINGS["ignore_case"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()
    quote_rules = args.openapc_quote_rules

    encs = []  # CSV file encodings, one slot per input file (may be None)
    for encoding in [args.encoding, args.other_encoding]:
        if encoding:
            try:
                codec = codecs.lookup(encoding)
                msg = "Encoding '{}' found in Python's codec collection as '{}'"
                print(msg.format(encoding, codec.name))
            except LookupError:
                # BUGFIX: added the missing "in" to the error message.
                print("Error: '" + encoding + "' not found in Python's " +
                      "codec collection. Either look for a valid name here " +
                      "(https://docs.python.org/2/library/codecs.html#standard-" +
                      "encodings) or omit this argument to enable automated " +
                      "guessing.")
                sys.exit()
        encs.append(encoding)

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if reduced:
            print("Error: A quotemask may only contain the letters 't' and 'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    header, content = oat.get_csv_file_content(args.csv_file, enc=encs[0])
    column = args.column
    if not args.other_csv_file:
        # Simple case: sort by the key column.
        rearranged_content = header + sorted(content, key=lambda x: x[column])
    else:
        rearranged_content = []
        _, second_content = oat.get_csv_file_content(args.other_csv_file,
                                                     enc=encs[1])
        other_column = column  # default: use same column index as in first file
        # BUGFIX: compare against None. The old truthiness test
        # ("if args.other_column:") silently ignored an explicit column
        # index of 0 and fell back to the first file's column.
        if args.other_column is not None:
            other_column = args.other_column
        for other_row in second_content:
            if args.ignore_case:
                matching_rows = [
                    row for row in content
                    if row[column].lower() == other_row[other_column].lower()
                ]
            else:
                matching_rows = [
                    row for row in content
                    if row[column] == other_row[other_column]
                ]
            rearranged_content += matching_rows
            # Remove consumed rows so later keys cannot match them again.
            for matching_row in matching_rows:
                content.remove(matching_row)
        unmatched_msg = (
            "{} rows could not be rearranged (unmatched in second csv file) " +
            "and were appended to the end of the result file " +
            "in original order.")
        if content:
            oat.print_y(unmatched_msg.format(len(content)))
        else:
            oat.print_g("All rows matched.")
        rearranged_content = header + rearranged_content + content  # append any unmatched rows

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(rearranged_content)
def main():
    """Enrich an OpenAPC APC file with ISSN-L values.

    Builds an ISSN -> ISSN-L lookup table from a tab-separated ISSN-L
    mapping file, then fills the issn_l column (index 10) of the APC
    file by trying the issn (7), issn_print (8) and issn_electronic (9)
    columns in that order. Mapped values are passed through
    oat.get_corrected_issn_l before being stored. The enriched table is
    written to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("apc_file", help=ARG_HELP_STRINGS["apc_file"])
    parser.add_argument("issn_l_file", help=ARG_HELP_STRINGS["issn_l_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # Fixed message: the original concatenation lacked a space,
            # producing "and'f'!".
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            # sys.exit() without an argument exits 0 (success) — report a
            # real error status instead.
            sys.exit(1)
        mask = [x == "t" for x in args.quotemask]

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # Fixed message (was "not found Python's") and updated the doc
            # link from the obsolete Python 2 page to the 3.x one.
            print("Error: '" + args.encoding + "' not found in Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/3/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit(1)

    header, content = oat.get_csv_file_content(args.apc_file, enc)

    oat.print_g("Preparing mapping table...")
    itself = other = 0
    # Raw string: "\d" inside a plain literal is an invalid escape sequence
    # (SyntaxWarning since Python 3.12).
    issn_l_re = re.compile(
        r"^(?P<issn>\d{4}-\d{3}[\dxX])\t(?P<issn_l>\d{4}-\d{3}[\dxX])$")
    issn_l_dict = {}
    # "with" guarantees the mapping file is closed; the original left the
    # handle open for the rest of the process.
    with open(args.issn_l_file, "r") as issn_l_file:
        for i, line in enumerate(issn_l_file):
            if i % 100000 == 0:
                print(str(i) + " lines processed.")
            match = issn_l_re.match(line)
            if match:
                match_dict = match.groupdict()
                issn_l_dict[match_dict['issn']] = match_dict['issn_l']
                if match_dict['issn'] == match_dict['issn_l']:
                    itself += 1
                else:
                    other += 1
    print(str(itself) + " ISSNs pointing to itself as ISSN-L, " + str(other) +
          " to another value.")

    oat.print_g("Starting enrichment...")
    issn_matches = issn_p_matches = issn_e_matches = unmatched = different = corrections = 0
    enriched_lines = []
    for line in content:
        if len(line) == 0:
            enriched_lines.append(line)
            continue
        issn = reformat_issn(line[7])
        issn_p = reformat_issn(line[8])
        issn_e = reformat_issn(line[9])
        target = None
        # Try the ISSN variants in order of preference; count which column
        # produced the match. (The correction/assignment logic was
        # duplicated across all three branches — consolidated below.)
        if issn in issn_l_dict:
            target = issn_l_dict[issn]
            issn_matches += 1
        elif issn_p in issn_l_dict:
            target = issn_l_dict[issn_p]
            issn_p_matches += 1
        elif issn_e in issn_l_dict:
            target = issn_l_dict[issn_e]
            issn_e_matches += 1
        else:
            unmatched += 1
        if target is not None:
            # Apply known manual corrections to the mapped ISSN-L.
            corrected_target = oat.get_corrected_issn_l(target)
            if corrected_target != target:
                corrections += 1
            line[10] = corrected_target
            if target not in [issn, issn_p, issn_e]:
                different += 1
        enriched_lines.append(line)

    msg = ("{} issn_l values mapped by issn, {} by issn_p, {} by issn_e. {} " +
           "could not be assigned.\n{} issn_l values were corrected during " +
           "the process.\n In {} cases the ISSN-L was different from all " +
           "existing ISSN values")
    print(msg.format(issn_matches, issn_p_matches, issn_e_matches, unmatched,
                     corrections, different))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows(header + enriched_lines)