def diffAlign(s1, s2):
    """Align two strings so that their matching blocks start at the same offset.

    The matching blocks of ``s1`` and ``s2`` are derived from their edit
    operations. For every block, the text in front of it is copied through
    unchanged, the shorter of the two outputs is padded with ``"$"`` until
    both outputs have the same length, and then the block itself is copied
    to both outputs.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A tuple ``(s1n, s2n)`` with the padded versions of ``s1`` and ``s2``.
    """
    matching = matching_blocks(editops(s1, s2), s1, s2)
    s1n = []
    s2n = []
    # Read cursors into s1 / s2. Slicing from a cursor replaces the repeated
    # list.pop(0) calls, each of which had to shift the whole remaining list.
    pos1 = 0
    pos2 = 0
    for mb in matching:
        index1, index2, seqLen = mb[0], mb[1], mb[2]

        # Index alignment check: copy the unmatched text preceding the block.
        if pos1 < index1:
            s1n.extend(s1[pos1:index1])
            pos1 = index1
        if pos2 < index2:
            s2n.extend(s2[pos2:index2])
            pos2 = index2

        # Pad whichever output is shorter so the block starts at the same
        # offset in both of them.
        gap = len(s2n) - len(s1n)
        if gap > 0:
            s1n.extend("$" * gap)
        elif gap < 0:
            s2n.extend("$" * -gap)

        # Copy the matching block itself to both outputs.
        s1n.extend(s1[pos1:pos1 + seqLen])
        s2n.extend(s2[pos2:pos2 + seqLen])
        pos1 += seqLen
        pos2 += seqLen

    return "".join(s1n), "".join(s2n)
def extractPair(correctw, incorrectw):
    """Return the parts of both words that lie outside every matching block.

    Args:
        correctw: The correctly spelled word.
        incorrectw: The misspelled word.

    Returns:
        A tuple ``(correct_rest, incorrect_rest)`` holding, for each word,
        the characters not covered by any matching block, in original order.
    """
    blocks = matching_blocks(editops(correctw, incorrectw), correctw, incorrectw)
    correct_chars = list(correctw)
    incorrect_chars = list(incorrectw)

    # Blank out every position covered by a matching block.
    for correct_start, incorrect_start, size in blocks:
        for offset in range(size):
            correct_chars[correct_start + offset] = None
        for offset in range(size):
            incorrect_chars[incorrect_start + offset] = None

    # Whatever is still set is the differing remainder of each word.
    correct_rest = "".join(filter(None, correct_chars))
    incorrect_rest = "".join(filter(None, incorrect_chars))
    return (correct_rest, incorrect_rest)
def get_matching_blocks(self):
    """Return the matching blocks, computing and caching them on first use.

    The blocks are built from ``self.get_opcodes()`` together with
    ``self._str1`` and ``self._str2`` and stored in
    ``self._matching_blocks``; a non-empty cached value is returned as is.
    """
    if self._matching_blocks:
        return self._matching_blocks
    self._matching_blocks = matching_blocks(
        self.get_opcodes(), self._str1, self._str2
    )
    return self._matching_blocks
def lcs(s1, s2):
    """Return the size of the largest matching block between s1 and s2.

    The matching blocks are derived from the edit operations of the two
    strings; the result is the maximum of their third (size) components.
    """
    blocks = matching_blocks(editops(s1, s2), s1, s2)
    block_sizes = [block[2] for block in blocks]
    return np.max(block_sizes)
def main():
    """Look up DOIs for the titles in a CSV file and write the result to out.csv.

    Steps:
      1. Parse the command line (title file, match/ask thresholds, colour
         flag, first and last line number to process).
      2. Pick the first column whose lower-cased name is in
         ``TITLE_HEADER_WL`` as the title column; exit with an error message
         if there is none.
      3. For every row inside the ``--start``/``--end`` line range, query
         CrossRef via ``crossref_query_title`` (retrying up to
         ``MAX_RETRIES_ON_ERROR`` times) and classify the answer by its
         similarity: perfect match, good match, possible match (needs
         confirmation) or no match.
      4. Ask the user interactively to confirm every possible match.
      5. Write the processed rows, extended by the ``doi`` and
         ``similarity`` columns, to ``out.csv`` with all fields quoted.

    Rows outside the requested line range are not written to the output.
    """

    def parse_bool_flag(value):
        # argparse's type=bool calls bool() on the raw string, so "False"
        # or "0" would still evaluate to True. Treat the usual "off"
        # spellings as False and everything else as True, exactly as before.
        return value.lower() not in ("", "0", "false", "f", "no", "n", "off")

    parser = argparse.ArgumentParser()
    parser.add_argument("title_file", help=ARG_HELP_STRINGS["title_file"])
    parser.add_argument("-m", "--match_threshold", type=float, default=0.9,
                        help=ARG_HELP_STRINGS["match_threshold"])
    parser.add_argument("-a", "--ask_threshold", type=float, default=0.8,
                        help=ARG_HELP_STRINGS["ask_threshold"])
    parser.add_argument("-c", "--colors", type=parse_bool_flag,
                        default=COLORS_DEFAULT,
                        help=ARG_HELP_STRINGS["ansi_colors"])
    parser.add_argument("--start", type=int, default=0,
                        help=ARG_HELP_STRINGS["start"])
    parser.add_argument("--end", type=int, default=inf,
                        help=ARG_HELP_STRINGS["end"])
    args = parser.parse_args()

    additional_fields = ["doi", "similarity"]

    # newline="" lets the csv module do its own newline handling, as the
    # csv documentation requires for file objects passed to reader/writer.
    with open(args.title_file, "r", newline="") as f:
        reader = csv.DictReader(f)
        title_field = None
        # fieldnames is None for an empty file; fall through to the regular
        # "no title column" error instead of raising a TypeError.
        for field in reader.fieldnames or []:
            if field.lower() in TITLE_HEADER_WL:
                print(
                    colorise("Using column '" + field + "' as title column",
                             "green"))
                title_field = field
                break
        else:
            print(
                colorise(
                    "ERROR: Could not find a column name which might denote a title column",
                    "red"))
            sys.exit()

        # The output header is the reader's own field list, extended in
        # place by the result columns.
        header = reader.fieldnames
        for field in additional_fields:
            if field not in header:
                header.append(field)

        modified_lines = []
        ask_count = 0
        for line in reader:
            line["ask"] = False
            if reader.line_num < args.start or reader.line_num > args.end:
                continue
            print(BREAK)
            title = line[title_field]
            head = "line " + str(reader.line_num) + ", query title:"
            print(colorise(head.ljust(L_JUST) + "'" + title + "'", "blue"))

            ret = crossref_query_title(title)
            retries = 0
            while not ret['success'] and retries < MAX_RETRIES_ON_ERROR:
                retries += 1
                msg = "Error while querying CrossRef API ({}), retrying ({})...".format(
                    ret["exception"], retries)
                print(colorise(msg, "red"))
                ret = crossref_query_title(title)
            # NOTE(review): "result" is read even when every retry failed;
            # confirm crossref_query_title always provides it.
            result = ret["result"]

            msg_tail = "'{}' [{}]"
            msg_tail = msg_tail.format(result["crossref_title"], result["doi"])
            similarity = round(result["similarity"], 2)
            if result["similarity"] == 1.0:
                msg_head = "Perfect match found ({}):"
                msg_head = msg_head.format(similarity).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "cyan"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.match_threshold:
                msg_head = "Good match found ({}):"
                msg_head = msg_head.format(similarity).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "green"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.ask_threshold:
                msg_head = "Possible match found ({}):"
                msg_head = msg_head.format(similarity).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "yellow"))
                line.update(result)
                # Remember the source line for the confirmation prompt.
                line["line_num"] = reader.line_num
                line["ask"] = True
                ask_count += 1
            else:
                msg_head = "No match found, most similar was ({}):"
                msg_head = msg_head.format(similarity).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "red"))
                line.update(EMPTY_RESULT)
                line["ask"] = False
            modified_lines.append(line)

    if ask_count > 0:
        print(BREAK)
        ask_msg = "{} matches found with a similarity between {} and {} will need manual confirmation:"
        ask_msg = ask_msg.format(ask_count, args.ask_threshold,
                                 args.match_threshold)
        print(colorise(ask_msg, "green"))
        for line in modified_lines:
            if not line["ask"]:
                continue
            print(BREAK)
            query_t = line[title_field]
            xref_t = line["crossref_title"]
            # display matching segments in identical colors for easier recognition
            diff = matching_blocks(
                editops(query_t.lower(), xref_t.lower()), query_t, xref_t)
            query_print = query_t
            xref_print = xref_t
            # ANSI codes increase string length, so we need an offset to compensate
            offset = 0
            for i, (a, b, c) in enumerate(diff):
                a += offset
                b += offset
                offset += 9
                color = CMP_COLORS[i % len(CMP_COLORS)]
                query_print = colorise_text_segment(
                    query_print, a, a + c, color)
                xref_print = colorise_text_segment(xref_print, b, b + c, color)
            query_head = colorise(
                "line {}, query title:".format(line["line_num"]), "blue")
            xref_head = colorise(
                "Possible match ({}):".format(round(line["similarity"], 2)),
                "yellow")
            print(query_head.ljust(L_JUST) + query_print)
            print(xref_head.ljust(L_JUST) + xref_print)
            answer = input(
                "Do you want to accept the DOI for the match title? (y/n):"
            )
            while answer not in ["y", "n"]:
                answer = input("Please type 'y' or 'n':")
            if answer == "n":
                # Rejected by the user: discard the CrossRef data again.
                line.update(EMPTY_RESULT)

    with open("out.csv", "w", newline="") as out:
        # Pass the quoting mode as a format parameter; assigning it to
        # csv.excel would change that shared dialect class for every other
        # csv user in the process.
        writer = csv.DictWriter(out, header, extrasaction='ignore',
                                dialect=csv.excel, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(modified_lines)
def main():
    """Look up DOIs for the titles in a CSV file and write the result to out.csv.

    Steps:
      1. Parse the command line (title file, match/ask thresholds, colour
         flag, first and last line number to process).
      2. Pick the first column whose lower-cased name is in
         ``TITLE_HEADER_WL`` as the title column; exit with an error message
         if there is none.
      3. For every row inside the ``--start``/``--end`` line range, query
         CrossRef via ``crossref_query_title`` (retrying up to
         ``MAX_RETRIES_ON_ERROR`` times) and classify the answer by its
         similarity: perfect match, good match, possible match (needs
         confirmation) or no match.
      4. Ask the user interactively to confirm every possible match.
      5. Write the processed rows, extended by the ``doi`` and
         ``similarity`` columns, to ``out.csv`` with all fields quoted.

    Rows outside the requested line range are not written to the output.
    """

    def parse_bool_flag(value):
        # argparse's type=bool calls bool() on the raw string, so "False"
        # or "0" would still evaluate to True. Treat the usual "off"
        # spellings as False and everything else as True, exactly as before.
        return value.lower() not in ("", "0", "false", "f", "no", "n", "off")

    parser = argparse.ArgumentParser()
    parser.add_argument("title_file", help=ARG_HELP_STRINGS["title_file"])
    parser.add_argument("-m", "--match_threshold", type=float, default=0.9, help=ARG_HELP_STRINGS["match_threshold"])
    parser.add_argument("-a", "--ask_threshold", type=float, default=0.8, help=ARG_HELP_STRINGS["ask_threshold"])
    parser.add_argument("-c", "--colors", type=parse_bool_flag, default=COLORS_DEFAULT, help=ARG_HELP_STRINGS["ansi_colors"])
    parser.add_argument("--start", type=int, default=0, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("--end", type=int, default=inf, help=ARG_HELP_STRINGS["end"])
    args = parser.parse_args()

    additional_fields = ["doi", "similarity"]

    # newline="" lets the csv module do its own newline handling, as the
    # csv documentation requires for file objects passed to reader/writer.
    with open(args.title_file, "r", newline="") as f:
        reader = csv.DictReader(f)
        title_field = None
        # fieldnames is None for an empty file; fall through to the regular
        # "no title column" error instead of raising a TypeError.
        for field in reader.fieldnames or []:
            if field.lower() in TITLE_HEADER_WL:
                print(colorise("Using column '" + field + "' as title column", "green"))
                title_field = field
                break
        else:
            print(colorise("ERROR: Could not find a column name which might denote a title column", "red"))
            sys.exit()

        # The output header is the reader's own field list, extended in
        # place by the result columns.
        header = reader.fieldnames
        for field in additional_fields:
            if field not in header:
                header.append(field)

        modified_lines = []
        ask_count = 0
        for line in reader:
            line["ask"] = False
            if reader.line_num < args.start or reader.line_num > args.end:
                continue
            print(BREAK)
            title = line[title_field]
            head = "line " + str(reader.line_num) + ", query title:"
            print(colorise(head.ljust(L_JUST) + "'" + title + "'", "blue"))

            ret = crossref_query_title(title)
            retries = 0
            while not ret['success'] and retries < MAX_RETRIES_ON_ERROR:
                retries += 1
                msg = "Error while querying CrossRef API ({}), retrying ({})...".format(ret["exception"], retries)
                print(colorise(msg, "red"))
                ret = crossref_query_title(title)
            # NOTE(review): "result" is read even when every retry failed;
            # confirm crossref_query_title always provides it.
            result = ret["result"]

            msg_tail = "'{}' [{}]"
            msg_tail = msg_tail.format(result["crossref_title"], result["doi"])
            similarity = round(result["similarity"], 2)
            if result["similarity"] == 1.0:
                msg_head = "Perfect match found ({}):"
                msg_head = msg_head.format(similarity).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "cyan"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.match_threshold:
                msg_head = "Good match found ({}):"
                msg_head = msg_head.format(similarity).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "green"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.ask_threshold:
                msg_head = "Possible match found ({}):"
                msg_head = msg_head.format(similarity).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "yellow"))
                line.update(result)
                # Remember the source line for the confirmation prompt.
                line["line_num"] = reader.line_num
                line["ask"] = True
                ask_count += 1
            else:
                msg_head = "No match found, most similar was ({}):"
                msg_head = msg_head.format(similarity).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "red"))
                line.update(EMPTY_RESULT)
                line["ask"] = False
            modified_lines.append(line)

    if ask_count > 0:
        print(BREAK)
        ask_msg = "{} matches found with a similarity between {} and {} will need manual confirmation:"
        ask_msg = ask_msg.format(ask_count, args.ask_threshold, args.match_threshold)
        print(colorise(ask_msg, "green"))
        for line in modified_lines:
            if not line["ask"]:
                continue
            print(BREAK)
            query_t = line[title_field]
            xref_t = line["crossref_title"]
            # display matching segments in identical colors for easier recognition
            diff = matching_blocks(editops(query_t.lower(), xref_t.lower()), query_t, xref_t)
            query_print = query_t
            xref_print = xref_t
            # ANSI codes increase string length, so we need an offset to compensate
            offset = 0
            for i, (a, b, c) in enumerate(diff):
                a += offset
                b += offset
                offset += 9
                color = CMP_COLORS[i % len(CMP_COLORS)]
                query_print = colorise_text_segment(query_print, a, a + c, color)
                xref_print = colorise_text_segment(xref_print, b, b + c, color)
            query_head = colorise("line {}, query title:".format(line["line_num"]), "blue")
            xref_head = colorise("Possible match ({}):".format(round(line["similarity"], 2)), "yellow")
            print(query_head.ljust(L_JUST) + query_print)
            print(xref_head.ljust(L_JUST) + xref_print)
            answer = input("Do you want to accept the DOI for the match title? (y/n):")
            while answer not in ["y", "n"]:
                answer = input("Please type 'y' or 'n':")
            if answer == "n":
                # Rejected by the user: discard the CrossRef data again.
                line.update(EMPTY_RESULT)

    with open("out.csv", "w", newline="") as out:
        # Pass the quoting mode as a format parameter; assigning it to
        # csv.excel would change that shared dialect class for every other
        # csv user in the process.
        writer = csv.DictWriter(out, header, extrasaction='ignore', dialect=csv.excel, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(modified_lines)
def _get_matching_blocks(query, text):
    """Return the matching blocks of ``query`` and ``text``.

    The blocks are derived from the edit operations between the two inputs.
    """
    edit_operations = editops(query, text)
    blocks = matching_blocks(edit_operations, query, text)
    return blocks