示例#1
0
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, force=False):
    from billtext import load_bill_text, compare_xml_text, get_current_version
    import lxml
    
    left_bill = Bill.objects.get(id = left_bill)
    right_bill = Bill.objects.get(id = right_bill)
    
    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)
    
    btc = None
    try:
        btc = BillTextComparison.objects.get(
            bill1 = left_bill,
            ver1 = left_version,
            bill2 = right_bill,
            ver2 = right_version)
        btc.decompress()
        if not force: return btc.data
    except BillTextComparison.DoesNotExist:
        pass
    
    # Try with the bills swapped.
    try:
        btc2 = BillTextComparison.objects.get(
            bill2 = left_bill,
            ver2 = left_version,
            bill1 = right_bill,
            ver1 = right_version)
        btc2.decompress()
        data = btc2.data
        return {
            "left_meta": data["right_meta"],
            "right_meta": data["left_meta"],
            "left_text": data["right_text"],
            "right_text": data["left_text"],
        }
    except BillTextComparison.DoesNotExist:
        pass
    
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)
    
    doc1 = lxml.etree.parse(left["basename"] + ".html")
    doc2 = lxml.etree.parse(right["basename"] + ".html")
    compare_xml_text(doc1, doc2, timelimit=timelimit) # revises DOMs in-place
    
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }
    
    if not btc:
        btc = BillTextComparison(
            bill1 = left_bill,
            ver1 = left_version,
            bill2 = right_bill,
            ver2 = right_version,
            data = dict(ret)) # clone before compress()
    else:
        btc.data = dict(ret) # clone before compress()
        
    btc.compress()
    btc.save()
    
    return ret
示例#2
0
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True, force_update=False):
    from .billtext import load_bill_text, get_current_version
    from xml_diff import compare
    import lxml

    left_bill = Bill.objects.get(id = left_bill)
    right_bill = Bill.objects.get(id = right_bill)

    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)

    if use_cache:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(
                bill1 = left_bill,
                ver1 = left_version,
                bill2 = right_bill,
                ver2 = right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass

        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(
                bill2 = left_bill,
                ver2 = left_version,
                bill1 = right_bill,
                ver1 = right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")
    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)
    def make_tag_func(ins_del):
        import lxml.etree
        elem = lxml.etree.Element("comparison-change")
        return elem
    def differ(text1, text2):
        # ensure we use the C++ Google DMP and can specify the time limit
        import diff_match_patch
        for x in diff_match_patch.diff(text1, text2, timelimit=timelimit):
            yield x
    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func, differ=differ)

    # Prepare JSON response data.
        # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1, encoding=str),
        "right_text": lxml.etree.tostring(doc2, encoding=str),
    }

    if use_cache or force_update:
        # For force_update, or race conditions, delete any existing record.
        fltr = { "bill1": left_bill,
            "ver1": left_version,
            "bill2": right_bill,
            "ver2": right_version }
        BillTextComparison.objects.filter(**fltr).delete()

        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            data = dict(ret), # clone before compress()
            **fltr)
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
示例#3
0
def load_comparison(left_bill,
                    left_version,
                    right_bill,
                    right_version,
                    timelimit=10):
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    import lxml

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)

    use_cache = True

    if use_cache:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(bill1=left_bill,
                                                 ver1=left_version,
                                                 bill2=right_bill,
                                                 ver2=right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass

        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(bill2=left_bill,
                                                  ver2=left_version,
                                                  bill1=right_bill,
                                                  ver1=right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")

    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)

    def make_tag_func(ins_del):
        import lxml.etree
        elem = lxml.etree.Element("comparison-change")
        return elem

    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if use_cache:
        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(bill1=left_bill,
                                 ver1=left_version,
                                 bill2=right_bill,
                                 ver2=right_version,
                                 data=dict(ret))  # clone before compress()
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
示例#4
0
def load_comparison(left_bill,
                    left_version,
                    right_bill,
                    right_version,
                    timelimit=10,
                    force=False):
    from billtext import load_bill_text, compare_xml_text, get_current_version
    import lxml

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)

    btc = None
    try:
        btc = BillTextComparison.objects.get(bill1=left_bill,
                                             ver1=left_version,
                                             bill2=right_bill,
                                             ver2=right_version)
        btc.decompress()
        if not force: return btc.data
    except BillTextComparison.DoesNotExist:
        pass

    # Try with the bills swapped.
    try:
        btc2 = BillTextComparison.objects.get(bill2=left_bill,
                                              ver2=left_version,
                                              bill1=right_bill,
                                              ver1=right_version)
        btc2.decompress()
        data = btc2.data
        return {
            "left_meta": data["right_meta"],
            "right_meta": data["left_meta"],
            "left_text": data["right_text"],
            "right_text": data["left_text"],
        }
    except BillTextComparison.DoesNotExist:
        pass

    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    try:
        doc1 = lxml.etree.parse(left["html_file"])
        doc2 = lxml.etree.parse(right["html_file"])
    except KeyError:
        raise IOError(
            "The HTML bill text format is not available for one of the bills.")

    compare_xml_text(doc1, doc2, timelimit=timelimit)  # revises DOMs in-place

    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")

    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if not btc:
        btc = BillTextComparison(bill1=left_bill,
                                 ver1=left_version,
                                 bill2=right_bill,
                                 ver2=right_version,
                                 data=dict(ret))  # clone before compress()
    else:
        btc.data = dict(ret)  # clone before compress()

    btc.compress()
    btc.save()

    return ret
示例#5
0
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True, force_update=False):
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    import lxml

    left_bill = Bill.objects.get(id = left_bill)
    right_bill = Bill.objects.get(id = right_bill)

    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)

    if use_cache:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(
                bill1 = left_bill,
                ver1 = left_version,
                bill2 = right_bill,
                ver2 = right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass

        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(
                bill2 = left_bill,
                ver2 = left_version,
                bill1 = right_bill,
                ver1 = right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")
    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)
    def make_tag_func(ins_del):
        import lxml.etree
        elem = lxml.etree.Element("comparison-change")
        return elem
    def differ(text1, text2):
        # ensure we use the C++ Google DMP and can specify the time limit
        import diff_match_patch
        for x in diff_match_patch.diff_unicode(text1, text2, timelimit=timelimit):
            yield x
    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func, differ=differ)

    # Prepare JSON response data.
        # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if use_cache or force_update:
        # For force_update, or race conditions, delete any existing record.
        fltr = { "bill1": left_bill,
            "ver1": left_version,
            "bill2": right_bill,
            "ver2": right_version }
        BillTextComparison.objects.filter(**fltr).delete()

        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            data = dict(ret), # clone before compress()
            **fltr)
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret