Пример #1
0
def diffAlign(s1, s2, pad_char="$"):
    """Align two strings on their matching blocks by inserting padding.

    The matching blocks of ``s1`` and ``s2`` are obtained from
    ``matching_blocks(editops(s1, s2), s1, s2)``.  For every block the
    characters that precede it in each input are copied to the output, the
    shorter output is padded with ``pad_char`` until both outputs have the
    same length, and then the block itself is copied to both outputs.  As a
    result every matching block starts at the same offset in both returned
    strings.

    Args:
        s1: First string.
        s2: Second string.
        pad_char: Filler inserted to even out the two outputs (default
            ``"$"``, the value that used to be hard-coded).

    Returns:
        A ``(aligned_s1, aligned_s2)`` tuple of strings.

    Note:
        Characters located after the last reported block are only emitted
        when the block list ends with an entry pointing at the end of both
        inputs.
        NOTE(review): python-Levenshtein is expected to append such a
        zero-length terminating block -- confirm for the version in use.
    """
    matching = matching_blocks(
        editops(s1, s2), s1, s2)
    chars1 = list(s1)
    chars2 = list(s2)
    aligned1 = []
    aligned2 = []
    # Cursors into chars1/chars2; slicing from a cursor replaces the former
    # repeated list.pop(0), which shifted the whole list on every call.
    pos1 = 0
    pos2 = 0
    for mb in matching:
        index1 = mb[0]
        index2 = mb[1]
        seq_len = mb[2]

        # Copy the unmatched characters in front of this block.
        if pos1 < index1:
            aligned1.extend(chars1[pos1:index1])
            pos1 = index1
        if pos2 < index2:
            aligned2.extend(chars2[pos2:index2])
            pos2 = index2

        # Even out the outputs so the block starts at the same offset.
        gap = len(aligned2) - len(aligned1)
        if gap > 0:
            aligned1.extend([pad_char] * gap)
        elif gap < 0:
            aligned2.extend([pad_char] * -gap)

        # Copy the matching block itself to both outputs.
        aligned1.extend(chars1[pos1:pos1 + seq_len])
        aligned2.extend(chars2[pos2:pos2 + seq_len])
        pos1 += seq_len
        pos2 += seq_len

    return "".join(aligned1), "".join(aligned2)
Пример #2
0
def extractPair(correctw, incorrectw):
    """Return the parts of two words that are not covered by a matching block.

    Every position that belongs to a block reported by
    ``matching_blocks(editops(correctw, incorrectw), ...)`` is blanked out in
    both words; the characters that remain are joined back together.

    Returns:
        A ``(correct_rest, incorrect_rest)`` tuple of strings.
    """
    blocks = matching_blocks(editops(correctw, incorrectw), correctw,
                             incorrectw)
    left = list(correctw)
    right = list(incorrectw)
    for mb in blocks:
        left_start, right_start, length = mb[0], mb[1], mb[2]
        # Mark the matched positions so they are dropped when joining.
        for offset in range(length):
            left[left_start + offset] = None
        for offset in range(length):
            right[right_start + offset] = None
    left_rest = "".join(filter(None, left))
    right_rest = "".join(filter(None, right))
    return (left_rest, right_rest)
Пример #3
0
 def get_matching_blocks(self):
     """Return the matching blocks of the two stored strings, computing them once.

     The result of ``matching_blocks(self.get_opcodes(), self._str1,
     self._str2)`` is stored in ``self._matching_blocks``; while that
     attribute is truthy it is returned without recomputation.
     """
     if self._matching_blocks:
         return self._matching_blocks
     # Nothing cached yet (or the cached value is falsy): compute and store.
     self._matching_blocks = matching_blocks(self.get_opcodes(),
                                             self._str1, self._str2)
     return self._matching_blocks
Пример #4
0
def lcs(s1, s2):
    """Return the length of the longest matching block between s1 and s2.

    The blocks come from ``matching_blocks(editops(s1, s2), s1, s2)``; the
    third field of each block is taken as its length and the maximum is
    computed with ``np.max``.
    """
    blocks = matching_blocks(editops(s1, s2), s1, s2)
    # Transpose the block list: columns[2] collects the third field of
    # every block.
    columns = list(zip(*blocks))
    block_lengths = columns[2]
    return np.max(block_lengths)
Пример #5
0
def main():
    """Look up DOIs for the titles of a CSV file and write ``out.csv``.

    Command-line arguments: the input CSV (``title_file``), the
    ``--match_threshold`` / ``--ask_threshold`` similarity limits, a
    ``--colors`` switch and an optional ``--start`` / ``--end`` line range.

    The first column whose lower-cased name appears in ``TITLE_HEADER_WL``
    is used as the title column; if none is found an error is printed and
    the process exits with status 1.

    Every row inside the line range is passed to ``crossref_query_title``
    (retried up to ``MAX_RETRIES_ON_ERROR`` times while the call reports
    failure) and classified by the returned similarity:

    * ``== 1.0`` or ``>= match_threshold``: the result is accepted,
    * ``>= ask_threshold``: the result is kept but flagged for confirmation,
    * otherwise: the row is filled with ``EMPTY_RESULT``.

    Flagged rows are then shown with their matching segments coloured and
    the user is asked to accept or reject each one; rejected rows are reset
    to ``EMPTY_RESULT``.  Finally the processed rows are written to
    ``out.csv`` with every field quoted.  Rows outside the line range are
    skipped and do not appear in the output.
    """

    def parse_bool(value):
        # argparse's type=bool calls bool() on the raw string, so any
        # non-empty value -- including "False" -- used to enable the
        # option.  Only the usual "off" spellings are treated as False;
        # every other value stays True as before.
        return value.strip().lower() not in ("", "0", "false", "no", "n",
                                             "off")

    parser = argparse.ArgumentParser()
    parser.add_argument("title_file", help=ARG_HELP_STRINGS["title_file"])
    parser.add_argument("-m",
                        "--match_threshold",
                        type=float,
                        default=0.9,
                        help=ARG_HELP_STRINGS["match_threshold"])
    parser.add_argument("-a",
                        "--ask_threshold",
                        type=float,
                        default=0.8,
                        help=ARG_HELP_STRINGS["ask_threshold"])
    parser.add_argument("-c",
                        "--colors",
                        type=parse_bool,
                        default=COLORS_DEFAULT,
                        help=ARG_HELP_STRINGS["ansi_colors"])
    parser.add_argument("--start",
                        type=int,
                        default=0,
                        help=ARG_HELP_STRINGS["start"])
    parser.add_argument("--end",
                        type=int,
                        default=inf,
                        help=ARG_HELP_STRINGS["end"])
    args = parser.parse_args()

    header = None
    additional_fields = ["doi", "similarity"]

    # newline="" lets the csv module do its own newline handling, as the
    # csv documentation requires for files passed to reader/writer.
    with open(args.title_file, "r", newline="") as f:
        reader = csv.DictReader(f)
        title_field = None
        for field in reader.fieldnames:
            if field.lower() in TITLE_HEADER_WL:
                print(
                    colorise("Using column '" + field + "' as title column",
                             "green"))
                title_field = field
                break
        else:
            print(
                colorise(
                    "ERROR: Could not find a column name which might denote a title column",
                    "red"))
            # Non-zero status so callers can detect the failure.
            sys.exit(1)
        header = reader.fieldnames
        for field in additional_fields:
            if field not in header:
                header.append(field)
        modified_lines = []
        ask_count = 0
        for line in reader:
            line["ask"] = False
            if reader.line_num < args.start or reader.line_num > args.end:
                continue
            print(BREAK)
            title = line[title_field]
            head = "line " + str(reader.line_num) + ", query title:"
            print(colorise(head.ljust(L_JUST) + "'" + title + "'", "blue"))
            ret = crossref_query_title(title)
            retries = 0
            while not ret['success'] and retries < MAX_RETRIES_ON_ERROR:
                retries += 1
                msg = "Error while querying CrossRef API ({}), retrying ({})...".format(
                    ret["exception"], retries)
                print(colorise(msg, "red"))
                ret = crossref_query_title(title)
            # NOTE(review): if every retry failed, ret is still the failed
            # response here; this assumes it nevertheless carries a
            # "result" entry -- confirm against crossref_query_title.
            result = ret["result"]
            msg_tail = "'{}' [{}]"
            msg_tail = msg_tail.format(result["crossref_title"], result["doi"])
            if result["similarity"] == 1.0:
                msg_head = "Perfect match found ({}):"
                msg_head = msg_head.format(round(result["similarity"],
                                                 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "cyan"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.match_threshold:
                msg_head = "Good match found ({}):"
                msg_head = msg_head.format(round(result["similarity"],
                                                 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "green"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.ask_threshold:
                msg_head = "Possible match found ({}):"
                msg_head = msg_head.format(round(result["similarity"],
                                                 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "yellow"))
                line.update(result)
                # Remembered for the confirmation prompt further down.
                line["line_num"] = reader.line_num
                line["ask"] = True
                ask_count += 1
            else:
                msg_head = "No match found, most similar was ({}):"
                msg_head = msg_head.format(round(result["similarity"],
                                                 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "red"))
                line.update(EMPTY_RESULT)
                line["ask"] = False
            modified_lines.append(line)
        if ask_count > 0:
            print(BREAK)
            ask_msg = "{} matches found with a similarity between {} and {} will need manual confirmation:"
            ask_msg = ask_msg.format(ask_count, args.ask_threshold,
                                     args.match_threshold)
            print(colorise(ask_msg, "green"))
        for line in modified_lines:
            if line["ask"]:
                print(BREAK)
                query_t = line[title_field]
                xref_t = line["crossref_title"]
                # display matching segments in identical colors for easier recognition
                diff = matching_blocks(
                    editops(query_t.lower(), xref_t.lower()), query_t, xref_t)
                query_print = query_t
                xref_print = xref_t
                # ANSI codes increase string length, so we need an offset to compensate
                offset = 0
                for i, (a, b, c) in enumerate(diff):
                    a += offset
                    b += offset
                    offset += 9
                    color = CMP_COLORS[i % len(CMP_COLORS)]
                    query_print = colorise_text_segment(
                        query_print, a, a + c, color)
                    xref_print = colorise_text_segment(xref_print, b, b + c,
                                                       color)
                query_head = colorise(
                    "line {}, query title:".format(line["line_num"]), "blue")
                xref_head = colorise(
                    "Possible match ({}):".format(round(line["similarity"],
                                                        2)), "yellow")
                print(query_head.ljust(L_JUST) + query_print)
                print(xref_head.ljust(L_JUST) + xref_print)
                answer = input(
                    "Do you want to accept the DOI for the match title? (y/n):"
                )
                while answer not in ["y", "n"]:
                    answer = input("Please type 'y' or 'n':")
                if answer == "n":
                    line.update(EMPTY_RESULT)

        with open("out.csv", "w", newline="") as out:
            # Pass the quoting mode as a format parameter instead of
            # assigning it on csv.excel, which would change the shared
            # dialect class for every other user in the process.
            writer = csv.DictWriter(out,
                                    header,
                                    extrasaction='ignore',
                                    dialect=csv.excel,
                                    quoting=csv.QUOTE_ALL)
            writer.writeheader()
            writer.writerows(modified_lines)
Пример #6
0
def main():
    """Look up DOIs for the titles of a CSV file and write ``out.csv``.

    Command-line arguments: the input CSV (``title_file``), the
    ``--match_threshold`` / ``--ask_threshold`` similarity limits, a
    ``--colors`` switch and an optional ``--start`` / ``--end`` line range.

    The first column whose lower-cased name appears in ``TITLE_HEADER_WL``
    is used as the title column; if none is found an error is printed and
    the process exits with status 1.

    Every row inside the line range is passed to ``crossref_query_title``
    (retried up to ``MAX_RETRIES_ON_ERROR`` times while the call reports
    failure) and classified by the returned similarity:

    * ``== 1.0`` or ``>= match_threshold``: the result is accepted,
    * ``>= ask_threshold``: the result is kept but flagged for confirmation,
    * otherwise: the row is filled with ``EMPTY_RESULT``.

    Flagged rows are then shown with their matching segments coloured and
    the user is asked to accept or reject each one; rejected rows are reset
    to ``EMPTY_RESULT``.  Finally the processed rows are written to
    ``out.csv`` with every field quoted.  Rows outside the line range are
    skipped and do not appear in the output.
    """

    def parse_bool(value):
        # argparse's type=bool calls bool() on the raw string, so any non-empty value --
        # including "False" -- used to enable the option.  Only the usual "off" spellings
        # are treated as False; every other value stays True as before.
        return value.strip().lower() not in ("", "0", "false", "no", "n", "off")

    parser = argparse.ArgumentParser()
    parser.add_argument("title_file", help=ARG_HELP_STRINGS["title_file"])
    parser.add_argument("-m", "--match_threshold", type=float, default=0.9, help=ARG_HELP_STRINGS["match_threshold"])
    parser.add_argument("-a", "--ask_threshold", type=float, default=0.8, help=ARG_HELP_STRINGS["ask_threshold"])
    parser.add_argument("-c", "--colors", type=parse_bool, default=COLORS_DEFAULT, help=ARG_HELP_STRINGS["ansi_colors"])
    parser.add_argument("--start", type=int, default=0, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("--end", type=int, default=inf, help=ARG_HELP_STRINGS["end"])
    args = parser.parse_args()

    header = None
    additional_fields = ["doi", "similarity"]

    # newline="" lets the csv module do its own newline handling, as the csv
    # documentation requires for files passed to reader/writer.
    with open(args.title_file, "r", newline="") as f:
        reader = csv.DictReader(f)
        title_field = None
        for field in reader.fieldnames:
            if field.lower() in TITLE_HEADER_WL:
                print(colorise("Using column '" + field + "' as title column", "green"))
                title_field = field
                break
        else:
            print(colorise("ERROR: Could not find a column name which might denote a title column", "red"))
            # Non-zero status so callers can detect the failure.
            sys.exit(1)
        header = reader.fieldnames
        for field in additional_fields:
            if field not in header:
                header.append(field)
        modified_lines = []
        ask_count = 0
        for line in reader:
            line["ask"] = False
            if reader.line_num < args.start or reader.line_num > args.end:
                continue
            print(BREAK)
            title = line[title_field]
            head = "line " + str(reader.line_num) + ", query title:"
            print(colorise(head.ljust(L_JUST) + "'" + title + "'", "blue"))
            ret = crossref_query_title(title)
            retries = 0
            while not ret['success'] and retries < MAX_RETRIES_ON_ERROR:
                retries += 1
                msg = "Error while querying CrossRef API ({}), retrying ({})...".format(ret["exception"], retries)
                print(colorise(msg, "red"))
                ret = crossref_query_title(title)
            # NOTE(review): if every retry failed, ret is still the failed response here;
            # this assumes it nevertheless carries a "result" entry -- confirm against
            # crossref_query_title.
            result = ret["result"]
            msg_tail = "'{}' [{}]"
            msg_tail = msg_tail.format(result["crossref_title"], result["doi"])
            if result["similarity"] == 1.0:
                msg_head = "Perfect match found ({}):"
                msg_head = msg_head.format(round(result["similarity"], 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "cyan"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.match_threshold:
                msg_head = "Good match found ({}):"
                msg_head = msg_head.format(round(result["similarity"], 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "green"))
                line.update(result)
                line["ask"] = False
            elif result["similarity"] >= args.ask_threshold:
                msg_head = "Possible match found ({}):"
                msg_head = msg_head.format(round(result["similarity"], 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "yellow"))
                line.update(result)
                # Remembered for the confirmation prompt further down.
                line["line_num"] = reader.line_num
                line["ask"] = True
                ask_count += 1
            else:
                msg_head = "No match found, most similar was ({}):"
                msg_head = msg_head.format(round(result["similarity"], 2)).ljust(L_JUST)
                print(colorise(msg_head + msg_tail, "red"))
                line.update(EMPTY_RESULT)
                line["ask"] = False
            modified_lines.append(line)
        if ask_count > 0:
            print(BREAK)
            ask_msg = "{} matches found with a similarity between {} and {} will need manual confirmation:"
            ask_msg = ask_msg.format(ask_count, args.ask_threshold, args.match_threshold)
            print(colorise(ask_msg, "green"))
        for line in modified_lines:
            if line["ask"]:
                print(BREAK)
                query_t = line[title_field]
                xref_t = line["crossref_title"]
                # display matching segments in identical colors for easier recognition
                diff = matching_blocks(editops(query_t.lower(), xref_t.lower()), query_t, xref_t)
                query_print = query_t
                xref_print = xref_t
                # ANSI codes increase string length, so we need an offset to compensate
                offset = 0
                for i, (a, b, c) in enumerate(diff):
                    a += offset
                    b += offset
                    offset += 9
                    color = CMP_COLORS[i % len(CMP_COLORS)]
                    query_print = colorise_text_segment(query_print, a, a + c, color)
                    xref_print = colorise_text_segment(xref_print, b, b + c, color)
                query_head = colorise("line {}, query title:".format(line["line_num"]), "blue")
                xref_head = colorise("Possible match ({}):".format(round(line["similarity"], 2)), "yellow")
                print(query_head.ljust(L_JUST) + query_print)
                print(xref_head.ljust(L_JUST) + xref_print)
                answer = input("Do you want to accept the DOI for the match title? (y/n):")
                while answer not in ["y", "n"]:
                    answer = input("Please type 'y' or 'n':")
                if answer == "n":
                    line.update(EMPTY_RESULT)

        with open("out.csv", "w", newline="") as out:
            # Pass the quoting mode as a format parameter instead of assigning it on
            # csv.excel, which would change the shared dialect class process-wide.
            writer = csv.DictWriter(out, header, extrasaction='ignore', dialect=csv.excel, quoting=csv.QUOTE_ALL)
            writer.writeheader()
            writer.writerows(modified_lines)
Пример #7
0
 def _get_matching_blocks(query, text):
     """Return the matching blocks between ``query`` and ``text``.

     The blocks are derived from the edit operations that ``editops``
     reports for turning ``query`` into ``text``.
     """
     operations = editops(query, text)
     blocks = matching_blocks(operations, query, text)
     return blocks