def g(options, filename, *args, **kwargs):
    """Read ``filename`` row by row and pass each row to ``func``.

    The file is read through ``BT50FileReader`` together with a companion
    file whose name is ``filename`` with ".txt" replaced by "FD.txt". Files
    with "BillSubject" in their name are read as comma-separated with a
    double-quote quote character; all others as tab-separated with no quote
    character. Each row gets a "_hash" entry before it is handed to ``func``.

    NOTE(review): ``func`` is not defined in this block; presumably it is
    supplied by an enclosing scope (e.g. a decorator) -- confirm.

    Args:
        options: passed through to ``func`` unchanged.
        filename: path of the data file to read.
        *args, **kwargs: passed through to ``func`` unchanged.

    Returns:
        None.

    Raises:
        Whatever ``func`` raises; the file name and line number are printed
        before the exception propagates.
    """
    # Both files are opened in ``with`` blocks so the handles are closed even
    # when ``func`` raises part-way through the file.
    with open(filename, "rb") as f:
        progress = Progress(name='rows [%s]' % filename, step=2000)

        # Measure the file size once so progress can be reported as
        # (current offset, total size).
        f.seek(0, os.SEEK_END)
        total = f.tell()
        f.seek(0, os.SEEK_SET)

        fmt = 'tsv'
        if "BillSubject" in filename:
            fmt = 'csv'

        with open(filename.replace(".txt", "FD.txt"), "rb") as fd:
            reader = BT50FileReader(
                f, fd,
                "cp1252",
                delimiter=',' if fmt == 'csv' else '\t',
                quotechar='"' if fmt == 'csv' else None)

            for rownum, row in reader:
                # make a hash of the row so we can tell quickly if it's been changed, but
                # skip the LastUpdated column because it might be spuriously updated and
                # we don't care if the other fields didn't change anyway.
                row["_hash"] = hashlib.sha1(repr(sorted(kv for kv in row.items() if kv[0] != 'LastUpdated'))).hexdigest()

                # The number reported on error is the reader's row number plus one.
                rownum += 1
                progress.tick(x=f.tell(), y=total)

                try:
                    func(row, options, filename, *args, **kwargs)
                except:
                    # Deliberately broad: report where we were, then re-raise
                    # the original exception untouched.
                    print "Error in %s line %d." % (filename, rownum)
                    raise
    return None
示例#2
0
    def g(options, filename, *args, **kwargs):
        """Read ``filename`` row by row and pass each row to ``func``.

        The file is read through ``BT50FileReader`` together with a companion
        file whose name is ``filename`` with ".txt" replaced by "FD.txt".
        Files with "BillSubject" in their name are read as comma-separated
        with a double-quote quote character; all others as tab-separated with
        no quote character. Each row gets a "_hash" entry before it is handed
        to ``func``.

        NOTE(review): ``func`` is not defined in this block; presumably it
        comes from the enclosing scope whose header is not visible here --
        confirm.

        Args:
            options: passed through to ``func`` unchanged.
            filename: path of the data file to read.
            *args, **kwargs: passed through to ``func`` unchanged.

        Returns:
            None.

        Raises:
            Whatever ``func`` raises; the file name and line number are
            printed before the exception propagates.
        """
        # Both files are opened in ``with`` blocks so the handles are closed
        # even when ``func`` raises part-way through the file.
        with open(filename, "rb") as f:
            progress = Progress(name='rows [%s]' % filename, step=2000)

            # Measure the file size once so progress can be reported as
            # (current offset, total size).
            f.seek(0, os.SEEK_END)
            total = f.tell()
            f.seek(0, os.SEEK_SET)

            fmt = 'tsv'
            if "BillSubject" in filename:
                fmt = 'csv'

            with open(filename.replace(".txt", "FD.txt"), "rb") as fd:
                reader = BT50FileReader(
                    f,
                    fd,
                    "cp1252",
                    delimiter=',' if fmt == 'csv' else '\t',
                    quotechar='"' if fmt == 'csv' else None)

                for rownum, row in reader:
                    # make a hash of the row so we can tell quickly if it's been changed, but
                    # skip the LastUpdated column because it might be spuriously updated and
                    # we don't care if the other fields didn't change anyway.
                    row["_hash"] = hashlib.sha1(
                        repr(sorted(kv for kv in row.items()
                                    if kv[0] != 'LastUpdated'))).hexdigest()

                    # The number reported on error is the reader's row number
                    # plus one.
                    rownum += 1
                    progress.tick(x=f.tell(), y=total)

                    try:
                        func(row, options, filename, *args, **kwargs)
                    except:
                        # Deliberately broad: report where we were, then
                        # re-raise the original exception untouched.
                        print "Error in %s line %d." % (filename, rownum)
                        raise
        return None
def parse_committee_names(options):
    """Bring Committee records in line with committees-current.yaml.

    Does nothing beyond logging when the file is unchanged and
    ``options.force`` is false. Otherwise every committee and subcommittee in
    the file is created or updated, any other non-obsolete Committee row is
    marked obsolete, and the file is recorded via ``File.objects.save_file``.

    Args:
        options: object with a ``force`` attribute.
    """
    log.info('Processing committees')
    COMMITTEES_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committees-current.yaml'

    # Guard clause: skip all work for an unchanged file unless forced.
    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
        return

    def _get_or_new(code, label):
        # Fetch the Committee with this code, or announce and build an
        # unsaved one when it is not in the database yet.
        try:
            return Committee.objects.get(code=code)
        except Committee.DoesNotExist:
            print(label, code)
            return Committee(code=code)

    tree = yaml_load(COMMITTEES_FILE)
    progress = Progress(total=len(tree))
    seen_ids = set()

    for entry in tree:
        parent = _get_or_new(entry["thomas_id"], "New committee:")

        parent.committee_type = TYPE_MAPPING[entry["type"]]
        parent.name = entry["name"]
        parent.url = entry.get("url", None)
        parent.obsolete = False
        parent.committee = None
        parent.jurisdiction = entry.get("jurisdiction")
        parent.jurisdiction_link = entry.get("jurisdiction_source")
        parent.save()
        seen_ids.add(parent.id)

        for sub_entry in entry.get('subcommittees', []):
            # A subcommittee's code is the parent's id followed by its own.
            child = _get_or_new(entry["thomas_id"] + sub_entry["thomas_id"],
                                "New subcommittee:")

            child.name = sub_entry["name"]
            child.url = sub_entry.get("url", None)
            # NOTE(review): the parent sets ``committee_type`` but this sets
            # ``type`` -- confirm which attribute is intended.
            child.type = None
            child.committee = parent
            child.obsolete = False
            child.save()
            seen_ids.add(child.id)

        progress.tick()

    # Anything still flagged non-obsolete in the database but absent from
    # the file gets marked obsolete.
    leftover = Committee.objects.filter(obsolete=False).exclude(
        id__in=seen_ids)
    if len(leftover) > 0:
        print("Marking obsolete:",
              ", ".join(c.code for c in leftover))
        leftover.update(obsolete=True)

    File.objects.save_file(COMMITTEES_FILE)
def parse_committee_names(options):
    """Bring Committee records in line with committees-current.yaml.

    Does nothing beyond logging when the file is unchanged and
    ``options.force`` is false. Otherwise every committee and subcommittee in
    the file is created or updated, any other non-obsolete Committee row is
    marked obsolete, and the file is recorded via ``File.objects.save_file``.

    Args:
        options: object with a ``force`` attribute.
    """
    log.info('Processing committees')
    COMMITTEES_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committees-current.yaml'

    # Guard clause: skip all work for an unchanged file unless forced.
    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
        return

    def _get_or_new(code, label):
        # Fetch the Committee with this code, or announce and build an
        # unsaved one when it is not in the database yet.
        try:
            return Committee.objects.get(code=code)
        except Committee.DoesNotExist:
            print(label, code)
            return Committee(code=code)

    tree = yaml_load(COMMITTEES_FILE)
    progress = Progress(total=len(tree))
    seen_ids = set()

    for entry in tree:
        parent = _get_or_new(entry["thomas_id"], "New committee:")

        parent.committee_type = TYPE_MAPPING[entry["type"]]
        parent.name = entry["name"]
        parent.url = entry.get("url", None)
        parent.obsolete = False
        parent.committee = None
        parent.jurisdiction = entry.get("jurisdiction")
        parent.jurisdiction_link = entry.get("jurisdiction_source")
        parent.save()
        seen_ids.add(parent.id)

        for sub_entry in entry.get('subcommittees', []):
            # A subcommittee's code is the parent's id followed by its own.
            child = _get_or_new(entry["thomas_id"] + sub_entry["thomas_id"],
                                "New subcommittee:")

            child.name = sub_entry["name"]
            child.url = sub_entry.get("url", None)
            # NOTE(review): the parent sets ``committee_type`` but this sets
            # ``type`` -- confirm which attribute is intended.
            child.type = None
            child.committee = parent
            child.obsolete = False
            child.save()
            seen_ids.add(child.id)

        progress.tick()

    # Anything still flagged non-obsolete in the database but absent from
    # the file gets marked obsolete.
    leftover = Committee.objects.filter(obsolete=False).exclude(id__in=seen_ids)
    if len(leftover) > 0:
        print("Marking obsolete:", ", ".join(c.code for c in leftover))
        leftover.update(obsolete=True)

    File.objects.save_file(COMMITTEES_FILE)
def parse_committee_members(options):
    """Rebuild all CommitteeMember rows from committee-membership-current.yaml.

    Does nothing beyond logging when the file is unchanged and
    ``options.force`` is false. Otherwise every existing CommitteeMember is
    deleted and a new one is saved for each member listed under each
    committee code that exists in the database; finally the file is recorded
    via ``File.objects.save_file``.

    Args:
        options: object with a ``force`` attribute.
    """
    log.info('Processing committee members')
    MEMBERS_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committee-membership-current.yaml'

    # Guard clause: skip all work for an unchanged file unless forced.
    if not File.objects.is_changed(MEMBERS_FILE) and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
        return

    # Bioguide id -> GovTrack id, taken from entries that carry both.
    legislators = yaml_load(settings.CONGRESS_PROJECT_PATH +
                            "/congress-legislators/legislators-current.yaml")
    govtrack_by_bioguide = dict(
        (entry["id"]["bioguide"], entry["id"]["govtrack"])
        for entry in legislators
        if "id" in entry and "govtrack" in entry["id"] and "bioguide" in entry["id"])

    membership = yaml_load(MEMBERS_FILE)
    progress = Progress(total=len(membership), name='committees')

    # We can delete CommitteeMember objects because we don't have
    # any foreign keys to them.
    CommitteeMember.objects.all().delete()

    for code, roster in membership.items():
        try:
            committee_obj = Committee.objects.get(code=code)
        except Committee.DoesNotExist:
            # Unknown committees are reported and skipped (no progress tick).
            print("Committee not found:", code)
            continue

        for entry in roster:
            record = CommitteeMember()
            record.person = Person.objects.get(
                id=govtrack_by_bioguide[entry["bioguide"]])
            record.committee = committee_obj
            # The role is only set when the entry carries a title.
            if "title" in entry:
                record.role = ROLE_MAPPING[entry["title"]]
            record.save()

        progress.tick()

    File.objects.save_file(MEMBERS_FILE)
def parse_committee_members(options):
    """Rebuild all CommitteeMember rows from committee-membership-current.yaml.

    Does nothing beyond logging when the file is unchanged and
    ``options.force`` is false. Otherwise every existing CommitteeMember is
    deleted and a new one is saved for each member listed under each
    committee code that exists in the database; finally the file is recorded
    via ``File.objects.save_file``.

    Args:
        options: object with a ``force`` attribute.
    """
    log.info('Processing committee members')
    MEMBERS_FILE = settings.CONGRESS_PROJECT_PATH + '/congress-legislators/committee-membership-current.yaml'

    # Guard clause: skip all work for an unchanged file unless forced.
    if not File.objects.is_changed(MEMBERS_FILE) and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
        return

    # Bioguide id -> GovTrack id, taken from entries that carry both.
    legislators = yaml_load(settings.CONGRESS_PROJECT_PATH + "/congress-legislators/legislators-current.yaml")
    govtrack_by_bioguide = dict(
        (entry["id"]["bioguide"], entry["id"]["govtrack"])
        for entry in legislators
        if "id" in entry and "govtrack" in entry["id"] and "bioguide" in entry["id"])

    membership = yaml_load(MEMBERS_FILE)
    progress = Progress(total=len(membership), name='committees')

    # We can delete CommitteeMember objects because we don't have
    # any foreign keys to them.
    CommitteeMember.objects.all().delete()

    for code, roster in membership.items():
        try:
            committee_obj = Committee.objects.get(code=code)
        except Committee.DoesNotExist:
            # Unknown committees are reported and skipped (no progress tick).
            print("Committee not found:", code)
            continue

        for entry in roster:
            record = CommitteeMember()
            record.person = Person.objects.get(id=govtrack_by_bioguide[entry["bioguide"]])
            record.committee = committee_obj
            # The role is only set when the entry carries a title.
            if "title" in entry:
                record.role = ROLE_MAPPING[entry["title"]]
            record.save()

        progress.tick()

    File.objects.save_file(MEMBERS_FILE)
def main(options):
    """
    Process amendments
    """

    if options.congress:
        files = glob.glob('data/us/%s/bills.amdt/*.xml' % options.congress)
        log.info('Parsing amendments of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills.amdt/*.xml')
        
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
        
    log.info('Processing amendments: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    amendment_processor = AmendmentProcessor()
    seen_amdt_ids = []
    for fname in files:
        progress.tick()
        
        if not File.objects.is_changed(fname) and not options.force:
            m = re.match(r"data/us/(\d+)/bills.amdt/([sh])(\d+).xml", fname)
            if not m:
                print "Invalid file name", fname
            else:
                amdt = Amendment.objects.get(congress=m.group(1), amendment_type=AmendmentType.by_slug(m.group(2)), number=m.group(3))
                seen_amdt_ids.append(amdt.id) # don't delete me later
            continue
            
        tree = etree.parse(fname)
        node = tree.xpath('/amendment')[0]
        
        try:
            amdt = amendment_processor.process(Amendment(), node)
        except:
            print fname
            raise

        if not amdt:
            # Amendments to treaties. Can't process.
            continue
            
        # update if already in db
        try:
            amdt.id = Amendment.objects.get(congress=amdt.congress, amendment_type=amdt.amendment_type, number=amdt.number).id
        except Amendment.DoesNotExist:
            pass # a new amendment
       
        seen_amdt_ids.append(amdt.id) # don't delete me later
        
        try:
            amdt.save()
        except:
            print amdt
            raise
            
        # For House votes on amendments, the only way to associate the vote with the
        # amendment is to use the THOMAS/Congress.gov action lines. The House vote XML
        # has an amendment-num field but its meaning is ambiguous, so it is useless.
        # When we parse a House amendment with an action line referencing a roll call vote,
        # save this amendment as that vote's related_amendment, then mark the vote as
        # 'missing data' (below) so that on the next parse of votes its title gets updated.
        if amdt.amendment_type == AmendmentType.house_amendment:
            for vote in node.xpath("actions/vote[@how='roll']"):
                v_congress, v_session = get_session_from_date(XmlProcessor.parse_datetime(vote.get('datetime')).date())
                v_roll = int(vote.get("roll"))
                try:
                    vote = Vote.objects.get(congress=v_congress, chamber=CongressChamber.house, session=v_session, number=v_roll)
                    vote.related_amendment = amdt
                    vote.save()
                except Vote.DoesNotExist:
                    print "Missing vote data in", fname
            
        # If this amendment is related to a vote, mark the vote as missing data because
        # we may need to update the vote title if the amendment title has changed.
        Vote.objects.filter(related_amendment=amdt).update(missing_data=True)

        File.objects.save_file(fname)
        
    # Are any amendments in the database no longer on disk?
    if options.congress and not options.filter:
        missing = Amendment.objects.filter(congress=options.congress).exclude(id__in = seen_amdt_ids)
        if missing.exists():
            print "Amendments should be deleted: ", missing
示例#8
0
def main(options):
    """
    Parse rolls.

    Collects roll call XML files (by ``options.filter`` glob, else by
    ``options.congress``, else all congresses). For each file that is
    changed, forced, not yet in the database, or whose existing Vote is
    flagged ``missing_data``, the Vote, its VoteOptions and its Voters are
    (re)built inside one database transaction, reusing existing primary keys
    where possible and deleting records that are no longer present in the
    file. Any exception raised while processing a file is logged and
    processing continues with the next file.

    Args:
        options: object with ``filter``, ``congress``, ``force`` and
            ``disable_events`` attributes.
    """

    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    # Preload every Person keyed by primary key for the voter processor.
    voter_processor.PERSON_CACHE = dict(
        (x.pk, x) for x in Person.objects.all())

    # The pattern which the roll file matches
    # Filename contains info which should be placed to DB
    # along with info extracted from the XML file
    # Groups: (1) congress, (2) chamber letter, (3) session, (4) roll number.
    re_path = re.compile('data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml')

    chamber_mapping = {'s': CongressChamber.senate, 'h': CongressChamber.house}

    if options.filter:
        files = glob.glob(options.filter)
        log.info('Parsing rolls matching %s' % options.filter)
    elif options.congress:
        files = glob.glob('data/us/%s/rolls/*.xml' % options.congress)
        log.info('Parsing rolls of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/rolls/*.xml')
    log.info('Processing votes: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=10)

    def log_delete_qs(qs):
        # Print and then delete every record in the queryset; no-op when the
        # queryset is empty.
        if qs.count() == 0: return
        print "Deleting obsoleted records: ", qs
        #if qs.count() > 3:
        #    print "Delete skipped..."
        #    return
        qs.delete()

    # Ids of Vote records that correspond to a file seen in this run.
    seen_obj_ids = set()
    # Set when any file fails to process. It is not read anywhere in the
    # visible part of this function.
    had_error = False

    for fname in files:
        progress.tick()

        # NOTE(review): re.search returns None when the path does not match
        # the pattern (possible with an arbitrary options.filter glob); the
        # .group() calls below would then raise outside the try block --
        # confirm this cannot happen.
        match = re_path.search(fname)

        try:
            existing_vote = Vote.objects.get(
                congress=match.group(1),
                chamber=chamber_mapping[match.group(2)],
                session=match.group(3),
                number=match.group(4))
        except Vote.DoesNotExist:
            existing_vote = None

        # Skip files that are unchanged, unless forced, not yet loaded, or
        # previously flagged as having missing data.
        if not File.objects.is_changed(
                fname
        ) and not options.force and existing_vote != None and not existing_vote.missing_data:
            seen_obj_ids.add(existing_vote.id)
            continue

        try:
            tree = etree.parse(fname)

            ## Look for votes with VP tie breakers.
            #if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue

            # Process roll object
            roll_node = tree.xpath('/roll')[0]

            # Sqlite is much faster when lots of saves are wrapped in a transaction,
            # and we do a lot of saves because it's a lot of voters.
            from django.db import transaction
            with transaction.atomic():

                vote = vote_processor.process(Vote(), roll_node)
                # Reuse the existing primary key so save() updates in place.
                if existing_vote: vote.id = existing_vote.id
                match = re_path.search(fname)
                vote.congress = int(match.group(1))
                vote.chamber = chamber_mapping[match.group(2)]
                vote.session = match.group(3)
                vote.number = int(match.group(4))

                # Get related bill & amendment.

                for bill_node in roll_node.xpath("bill"):
                    related_bill_num = bill_node.get("number")
                    if 9 <= vote.congress <= 42 and vote.session in ('1', '2'):
                        # Bill numbering from the American Memory collection is different. The number combines
                        # the session, bill number, and a 0 or 5 for regular or 'half' numbering. Prior to
                        # the 9th congress numbering seems to be wholly assigned by us and not related to
                        # actual numbering, so we skip matching those bills.
                        related_bill_num = "%d%04d%d" % (int(
                            vote.session), int(bill_node.get("number")), 0)
                    try:
                        vote.related_bill = Bill.objects.get(
                            congress=bill_node.get("session"),
                            bill_type=BillType.by_xml_code(
                                bill_node.get("type")),
                            number=related_bill_num)
                    except Bill.DoesNotExist:
                        # A missing bill only counts as missing data from the
                        # 93rd congress onward.
                        if vote.congress >= 93:
                            vote.missing_data = True

                for amdt_node in roll_node.xpath("amendment"):
                    if amdt_node.get(
                            "ref"
                    ) == "regular" and vote.related_bill is not None:
                        try:
                            # The "number" attribute is split into its first
                            # character (amendment type slug) and the rest
                            # (amendment number).
                            vote.related_amendment = Amendment.objects.get(
                                congress=vote.related_bill.congress,
                                amendment_type=AmendmentType.by_slug(
                                    amdt_node.get("number")[0]),
                                number=amdt_node.get("number")[1:])
                        except Amendment.DoesNotExist:
                            if vote.congress >= 93:
                                print "Missing amendment", fname
                                vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        #vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote:
                            vote.related_amendment = existing_vote.related_amendment

                # clean up some question text and use the question_details field

                if vote.category in (
                        VoteCategory.passage, VoteCategory.passage_suspension,
                        VoteCategory.veto_override) and vote.related_bill:
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = vote.related_bill.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display(
                    )

                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display(
                    )

                elif vote.related_bill and vote.question.startswith(
                        "On the Cloture Motion " +
                        vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                elif vote.related_bill and vote.question.startswith(
                        "On Cloture on the Motion to Proceed " +
                        vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                    vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display(
                    )
                elif vote.related_bill and vote.question.startswith(
                        "On the Motion to Proceed " +
                        vote.related_bill.display_number):
                    vote.question = "Motion to Proceed on " + vote.related_bill.title

                elif vote.related_amendment and vote.question.startswith(
                        "On the Cloture Motion " +
                        vote.related_amendment.get_amendment_type_display() +
                        " " + str(vote.related_amendment.number)):
                    vote.question = "Cloture on " + vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display(
                    )

                # weird House formatting of bill numbers ("H RES 123 Blah blah")
                # NOTE(review): display_number is inserted into the pattern
                # without re.escape -- confirm it can never contain regex
                # metacharacters after the replacements below.
                if vote.related_bill:
                    vote.question = re.sub(
                        "(On [^:]+): " +
                        vote.related_bill.display_number.replace(
                            ". ", " ").replace(".", " ").upper() + " .*",
                        r"\1: " + truncatewords(vote.related_bill.title, 15),
                        vote.question)

                vote.save()

                seen_obj_ids.add(vote.id)  # don't delete me later

                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                # Maps option key -> saved VoteOption; handed to the voter
                # processor below.
                roll_options = {}
                for option_node in roll_node.xpath('./option'):
                    option = option_processor.process(VoteOption(),
                                                      option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(
                                vote=vote, key=option.key
                            )[0].id  # get is better, but I had the database corruption problem
                        except IndexError:
                            # No existing option with this key; a new row
                            # will be inserted.
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(
                    VoteOption.objects.filter(vote=vote).exclude(
                        id__in=seen_option_ids)
                )  # may cascade and delete the Voters too?

                # Process roll voters, overwriting existing voters where possible.
                # existing_voters is only bound when existing_vote is truthy;
                # its single use below is guarded by the same condition.
                if existing_vote:
                    existing_voters = dict(
                        Voter.objects.filter(vote=vote).values_list(
                            "person", "id"))
                seen_voter_ids = set()
                voters = list()
                for voter_node in roll_node.xpath('./voter'):
                    voter = voter_processor.process(roll_options, Voter(),
                                                    voter_node)
                    voter.vote = vote
                    voter.created = vote.created

                    # for VP votes, load the actual person & role...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(
                                role_type=RoleType.vicepresident,
                                startdate__lte=vote.created,
                                enddate__gte=vote.created)
                            voter.person_role = r
                            voter.person = r.person
                        except PersonRole.DoesNotExist:
                            # overlapping roles? missing data?
                            log.error(
                                'Could not resolve vice president in %s' %
                                fname)

                    # Reuse the primary key of this person's existing Voter
                    # row for this vote, if there is one.
                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass

                    voters.append(voter)

                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()

                # pre-fetch the role of each voter
                load_roles_at_date(
                    [x.person for x in voters if x.person != None],
                    vote.created, vote.congress)
                # Iterate over a copy because records may be removed from
                # ``voters`` inside the loop.
                for voter in list(voters):
                    # NOTE(review): the filter above shows voter.person can
                    # be None; voter.person.role would then raise
                    # AttributeError here -- confirm how that case is meant
                    # to be handled.
                    if voter.voter_type != VoterType.vice_president:
                        voter.person_role = voter.person.role
                    # If we couldn't match a role for this person on the date of the vote, and if the voter was Not Voting,
                    # and we're looking at historical data, then this is probably a data error --- the voter wasn't even in office.
                    if voter.person_role is None:
                        if vote.source == VoteSource.keithpoole and voter.option.key == "0":
                            # Drop this record.
                            voters.remove(voter)
                        else:
                            log.error("%s: Could not find role for %s on %s." %
                                      (fname, voter.person, vote.created))
                            vote.missing_data = True
                            vote.save()

                # save all of the records (inserting/updating)
                for voter in voters:
                    voter.save()
                    seen_voter_ids.add(voter.id)

                # remove obsolete voter records
                log_delete_qs(
                    Voter.objects.filter(vote=vote).exclude(
                        id__in=seen_voter_ids)
                )  # possibly already deleted by cascade above

                # pre-calculate totals
                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()

            # Only reached when the whole transaction block completed
            # without raising.
            File.objects.save_file(fname)

        except Exception, ex:
            # Log the failure for this file and carry on with the next one.
            log.error('Error in processing %s' % fname, exc_info=ex)
            had_error = True
示例#9
0
def main(options):
    """
    Process bill terms and bills.

    The work is done in four stages:

    1. Subject terms: load the old vocabulary (``data/us/liv.xml``) and the
       new one (``data/us/liv111.xml``, ``data/us/crsnet.xml``), creating
       ``BillTerm`` records that are not cached yet and deleting cached
       terms that were not seen in any of the files.
    2. Bills: parse the bill XML files (restricted by ``options.congress``
       and ``options.filter`` when set), save each bill together with its
       major actions, update the search index and create events.
    3. docs.house.gov: record the post date of bills listed for the floor.
    4. senate.gov: record when a bill shows up in the "Floor Schedule" blurb.

    Attributes read from ``options``: ``disable_indexing``, ``congress``,
    ``filter``, ``force``, ``slow`` and ``disable_events``.
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    # The old and the new vocabularies are loaded by exactly the same steps;
    # only the term type and the source files differ.
    term_sources = (
        ('Processing old bill terms', TermType.old, ('data/us/liv.xml',)),
        ('Processing new bill terms', TermType.new,
         ('data/us/liv111.xml', 'data/us/crsnet.xml')),
    )
    for message, term_type, term_files in term_sources:
        log.info(message)
        for term_file in term_files:
            tree = etree.parse(term_file)
            for node in tree.xpath('/liv/top-term'):
                term = term_processor.process(BillTerm(), node)
                term.term_type = term_type
                try:
                    # No need to update an existing term because there are no other attributes.
                    term = existing_terms[(int(term.term_type), term.name)]
                except KeyError:
                    # Not cached yet: store it and start with no subterms.
                    log.debug("Created %s" % term)
                    term.save()
                    term.subterms.clear()
                else:
                    terms_parsed.add(term.id)

                for subnode in node.xpath('./term'):
                    subterm = term_processor.process(BillTerm(), subnode)
                    subterm.term_type = term_type
                    subterm_key = (int(subterm.term_type), subterm.name)
                    try:
                        # No need to update an existing term because there are no other attributes.
                        subterm = existing_terms[subterm_key]
                    except KeyError:
                        try:
                            log.debug("Created %s" % subterm)
                            subterm.save()
                            term.subterms.add(subterm)

                            # Remember the new subterm so that a later
                            # occurrence re-uses it and the clean-up pass
                            # below does not delete it.
                            existing_terms[subterm_key] = subterm
                            terms_parsed.add(subterm.id)
                        except IntegrityError:
                            log.error('Duplicated term %s' % term_processor.display_node(subnode))
                    else:
                        term.subterms.add(subterm)
                        terms_parsed.add(subterm.id)

    # Drop cached terms that did not show up in any of the term files.
    for term in existing_terms.values():
        if term.id not in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    # Extracts (congress, bill type code, number) from a bill file path.
    bill_path_re = re.compile(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$")

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        if not File.objects.is_changed(fname) and not options.force:
            m = bill_path_re.search(fname)

            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = "data/us/bills.text/%s/%s/%s%s.txt" % (m.group(1), m.group(2), m.group(2), m.group(3))
                if (bill_index and not options.disable_events) and os.path.exists(textfile) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill") # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)

                continue
            except Bill.DoesNotExist:
                pass # just parse as normal

        if options.slow:
            time.sleep(1)

        # Manual switch: when set to True the bill is loaded from the database
        # instead of being re-parsed, and the file is not marked as processed.
        skip_stuff = False

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            if not skip_stuff:
                try:
                    bill = bill_processor.process(Bill(), node)
                except:
                    # Say which file failed, then let the error propagate.
                    print(fname)
                    raise
            else:
                m = bill_path_re.search(fname)
                bill = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))

            seen_bill_ids.append(bill.id) # don't delete me later

            # Collect the actions that carry a state, and pick up the slip law
            # number from an enactment action if there is one.
            actions = []
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/*[@state]"):
                action_when = repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)")))
                action_state = BillStatus.by_xml_code(axn.xpath("string(@state)"))
                action_text = axn.xpath("string(text)")
                actions.append((action_when, action_state, action_text))

                if action_state in (BillStatus.enacted_signed, BillStatus.enacted_veto_override):
                    bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                    bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                # Say which bill failed, then let the error propagate.
                print(bill)
                raise
            if bill_index: bill_index.update_object(bill, using="bill")

            if not options.disable_events:
                bill.create_events()

        if not skip_stuff:
            File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    # (Disabled on purpose by the trailing "and False".)
    if options.congress and not options.filter and False:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids).delete()

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error('No docs.house.gov download link found at http://docs.house.gov.')
    else:
        def bt_re(bt):
            # Regex for a bill type label in which every period is optional
            # and may be followed by whitespace.
            return re.escape(bt[1]).replace(r"\.", r"\.?\s*")

        dhg_url = "http://docs.house.gov/floor/" + m.group(1)
        try:
            dhg = etree.parse(urllib.urlopen(dhg_url)).getroot()
        except:
            # Report the URL that was actually requested, then re-raise.
            print(dhg_url)
            raise

        # The pattern does not depend on the floor item, so build it once.
        legis_num_re = re.compile(
            r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
            + "|".join(bt_re(bt) for bt in BillType)
            + r")(\d+)\s*(\[Conference Report\]\s*)?$", re.I)

        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            m = legis_num_re.match(billname)
            if not m:
                # "H.R. __" is a known placeholder and is ignored silently.
                if billname.strip() != "H.R. __":
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                # Find out which bill type the matched label belongs to.
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1)):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS, bill_type=bt[0], number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index: bill_index.update_object(bill, using="bill")

                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % billname)

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen("http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm").read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            # Only stamp the bill if it has no post date yet or the last one
            # is more than a week old.
            if bill.senate_floor_schedule_postdate is None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index: bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        # Best effort: a layout change on senate.gov must not abort the run.
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
示例#10
0
def main(options):
    """
    Update Person and PersonRole models.

    Do safe update: touch only those records
    which have been changed.

    The YAML source files are merged per GovTrack ID, then for every person
    the Person record is created/updated, the terms are turned into
    PersonRole objects, matched against the roles already stored, and saved.
    Nothing is parsed unless at least one source file changed or
    ``options.force`` is set. Persons and roles are never deleted here: the
    code paths that would delete raise instead.

    Attributes read from ``options``: ``force`` and ``disable_events``.
    """
    import itertools

    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = [
        'legislators-current', 'legislators-historical',
        'legislators-social-media', 'executive'
    ]  # order matters

    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.

    had_error = False

    # Get combined data.
    legislator_data = {}
    leg_id_map = {}
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p == "legislators-current":
                # We know all terms but the last are non-current and the last is.
                for r in m["terms"]:
                    r["current"] = False
                m["terms"][-1]["current"] = True
            elif p == "legislators-historical":
                # We know all terms are non-current.
                for r in m["terms"]:
                    r["current"] = False

            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")

                # For the benefit of the social media file, make a mapping of IDs.
                for k, v in m["id"].items():
                    if not isinstance(v, list):
                        leg_id_map[(k, v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file.
                govtrack_id = None
                for k, v in m["id"].items():
                    if not isinstance(v, list) and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k, v)]
                        break

            if not govtrack_id:
                print("No GovTrack ID:")
                pprint.pprint(m)
                had_error = True
                continue

            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                legislator_data[govtrack_id]["terms"].extend(m["terms"])
            else:
                raise ValueError("Duplication in an unexpected way (%d, %s)." %
                                 (govtrack_id, p))

    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')

    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)

            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person,
                                            person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()

            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Parse all of the roles.
            new_roles = []
            for termnode in node['terms']:
                role = role_processor.process(PersonRole(), termnode)
                role.person = person
                role.extra = filter_yaml_term_structure(
                    termnode)  # copy in the whole YAML structure

                # Is this role current? For legislators, same as whether it came from legislators-current, which eases Jan 3 transitions when we can't distinguish by date.
                if "current" in termnode:
                    role.current = termnode["current"]

                # But executives...
                else:
                    now = datetime.now().date()
                    role.current = role.startdate <= now and role.enddate >= now
                    # Because of date overlaps at noon transition dates, ensure that only the last term that covers
                    # today is current --- reset past roles to not current. Doesn't handle turning off retiring people tho.
                    # The reset only applies when this term itself covers
                    # today; a past executive term must not clear the current
                    # flag of a role parsed before it.
                    if role.current:
                        for r in new_roles:
                            r.current = False

                # Scan for most recent leadership role within the time period of this term,
                # which isn't great for Senators because it's likely it changed a few times
                # within a term, especially if there was a party switch.
                role.leadership_title = None
                for leadership_node in node.get("leadership_roles", []):
                    # must match on date and chamber
                    # (dates are compared as ISO-formatted strings)
                    if leadership_node["start"] >= role.enddate.isoformat():
                        continue  # might start on the same day but is for the next Congress
                    if "end" in leadership_node and leadership_node[
                            "end"] <= role.startdate.isoformat():
                        continue  # might start on the same day but is for the previous Congress
                    if leadership_node["chamber"] != RoleType.by_value(
                            role.role_type).congress_chamber.lower():
                        continue
                    role.leadership_title = leadership_node["title"]

                new_roles.append(role)

            # Try matching the new roles to existing db records. Since we don't have a primary key
            # in the source data, we have to match on the record values. But because of errors in data,
            # term start/end dates can change, so matching has to be a little fuzzy.
            existing_roles = list(PersonRole.objects.filter(person=person))
            matches = []

            def run_match_rule(rule):
                # Pair up still-unmatched roles for which rule(new, existing)
                # holds; matched roles are moved from the two work lists into
                # `matches`. itertools.product reads both lists up front, so
                # removing items inside the loop is safe.
                for new_role, existing_role in itertools.product(
                        new_roles, existing_roles):
                    if new_role not in new_roles or existing_role not in existing_roles:
                        continue  # already matched on a previous iteration
                    if new_role.role_type != existing_role.role_type: continue
                    if new_role.state != existing_role.state: continue
                    if rule(new_role, existing_role):
                        matches.append((new_role, existing_role))
                        new_roles.remove(new_role)
                        existing_roles.remove(existing_role)

            # First match exactly, then exact on just one date, then on contractions and expansions.
            run_match_rule(lambda new_role, existing_role: new_role.startdate
                           == existing_role.startdate and new_role.enddate ==
                           existing_role.enddate)
            run_match_rule(lambda new_role, existing_role: new_role.startdate
                           == existing_role.startdate or new_role.enddate ==
                           existing_role.enddate)
            run_match_rule(lambda new_role, existing_role: new_role.startdate
                           >= existing_role.startdate and new_role.enddate <=
                           existing_role.enddate)
            run_match_rule(lambda new_role, existing_role: new_role.startdate
                           <= existing_role.startdate and new_role.enddate >=
                           existing_role.enddate)

            # Update the database entries that correspond with records in the data file.
            did_update_any = False
            for new_role, existing_role in matches:
                # Taking over the id makes save() update the stored record.
                new_role.id = existing_role.id
                if role_processor.changed(existing_role,
                                          new_role) or options.force:
                    new_role.save()
                    did_update_any = True
                    if not options.force:
                        log.warn("Updated %s" % new_role)

            # If we have multiple records on disk that didn't match and multiple records in the database
            # that didn't match, then we don't know how to align them.
            if len(new_roles) > 0 and len(existing_roles) > 0:
                print(new_roles)
                print(existing_roles)
                raise Exception("There is an unmatched role.")

            # Otherwise if there are any unmatched new roles, we can just add them.
            for role in new_roles:
                log.warn("Created %s" % role)
                role.save()
                did_update_any = True

            # And likewise for any existing roles that are left over.
            # Deleting is disabled on purpose: the raise below aborts this
            # person before the delete statements that follow it can run.
            for pr in existing_roles:
                print(pr.person.id, pr)
                raise ValueError("Deleted role??")
                log.warn("Deleted %s" % pr)
                pr.delete()

            if did_update_any and not options.disable_events:
                # Create the events for the roles after all have been loaded
                # because we don't create events for ends of terms and
                # starts of terms that are adjacent. Refresh the list to get
                # the roles in order.
                role_list = list(
                    PersonRole.objects.filter(
                        person=person).order_by('startdate'))
                last_index = len(role_list) - 1
                for i, stored_role in enumerate(role_list):
                    # Pass the previous and the next role (None at the ends).
                    stored_role.create_events(
                        role_list[i - 1] if i > 0 else None,
                        role_list[i + 1] if i < last_index else None)

            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"):
                delattr(person, "role")  # clear the cached info
            person._most_recent_role = None  # clear cache here too
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()

        except Exception as ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()

    log.info('Processed persons: %d' % len(processed_persons))
    log.info('Created persons: %d' % len(created_persons))

    if not had_error:
        # Report persons which were not found in the source files
        removed_persons = existing_persons - processed_persons
        for pk in removed_persons:
            p = Person.objects.get(pk=pk)
            if p.roles.all().count() > 0:
                log.warn("Missing? Deleted? %d: %s" % (p.id, p))
            else:
                log.warn("Deleting... %d: %s (remember to prune_index!)" %
                         (p.id, p))
                # Deleting is disabled on purpose: the raise aborts the run
                # before the delete call below is reached.
                raise Exception("Won't delete!")
                p.delete()
        log.info('Missing/deleted persons: %d' % len(removed_persons))

        # Mark the files as processed.
        for p in SRC_FILES:
            f = BASE_PATH + p + ".yaml"
            File.objects.save_file(f)

    update_twitter_list()
示例#11
0
def main(options):
    """
    Process committees, subcommittees and
    members of current congress committees.

    Three inputs are handled one after another, each only when its file
    changed or ``options.force`` is set:

    * ``committees-current.yaml``: create/update Committee records for
      committees and subcommittees, and mark committees missing from the
      file as obsolete.
    * ``committee-membership-current.yaml``: delete all CommitteeMember
      records and re-create them from the file.
    * ``data/congress/committee_meetings_<chamber>.json``: create/update
      CommitteeMeeting records, link them to bills and create committee
      events unless ``options.disable_events`` is set.
    """

    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH

    meeting_processor = CommitteeMeetingProcessor()

    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print("New committee: %s" % committee["thomas_id"])
                cobj = Committee(code=committee["thomas_id"])

            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None  # a top-level committee has no parent
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                # A subcommittee code is the parent code followed by its own.
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print("New subcommittee: %s" % code)
                    sobj = Committee(code=code)

                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                # NOTE(review): the parent above sets `committee_type`, while
                # this sets `type` -- confirm against the Committee model
                # which attribute is meant here.
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)

            progress.tick()

        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(
            id__in=seen_committees)
        if len(other_committees) > 0:
            print("Marking obsolete: %s" % ", ".join(
                c.code for c in other_committees))
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)

    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = {}
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]

        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')

        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print("Committee not found: %s" % committee)
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(
                    id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()

            progress.tick()

        File.objects.save_file(MEMBERS_FILE)

    log.info('Processing committee schedule')
    for chamber in ("house", "senate"):
        meetings_file = 'data/congress/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)

        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
        else:
            # Close the file as soon as it has been read.
            with open(meetings_file) as meetings_fp:
                meetings = json.load(meetings_fp)

            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID is already known.
                    # Must get it like this, vs just assigning the ID as we do in other parsers,
                    # because of the auto_now_add created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(
                            guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()

                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)

                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(
                            code=mobj.committee.code + mobj.subcommittee)

                    mobj.save()

                    # Rebuild the list of bills linked to the meeting.
                    mobj.bills.clear()
                    for bill in meeting["bill_ids"]:
                        try:
                            bill_type, bill_num, bill_cong = re.match(
                                r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(
                                congress=bill_cong,
                                bill_type=BillType.by_slug(bill_type),
                                number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass  # regex failed
                        except common.enum.NotFound:
                            pass  # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass  # we don't know about bill yet
                except Committee.DoesNotExist:
                    log.error(
                        'Could not load Committee object for meeting %s' %
                        meeting_processor.display_node(meeting))

            # The flag does not change per committee, so test it once.
            if not options.disable_events:
                for committee in Committee.objects.all():
                    committee.create_events()

            File.objects.save_file(meetings_file)
示例#12
0
def main(options):
    """
    Update Person and PersonRole models.

    Do safe update: touch only those records
    which have been changed.

    The YAML source files are merged into one record per GovTrack ID. Each
    record is turned into a Person (saved when it is new, when it differs
    from the stored row, or when forced), and its terms are reconciled
    against the PersonRole rows already stored for that person.

    Args:
        options: object providing the boolean attributes ``force`` (treat
            every file and record as changed) and ``disable_events`` (skip
            ``create_events`` on roles).

    Returns:
        None. Returns before any parsing when no source file has changed
        and ``options.force`` is not set.

    Raises:
        ValueError: when a GovTrack ID is duplicated across source files in
            a way other than the social-media / executive merges. This is
            raised while combining the files, i.e. outside the per-person
            error handling, so it aborts the run.

    Exceptions raised while processing a single person are caught, logged
    and recorded in the local ``had_error`` flag so that the remaining
    people are still processed.
    """

    import itertools  # used by the role-matching helper below

    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters

    # Only do any work if at least one source file changed (or we are forced).
    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.

    had_error = False

    # Get combined data.
    legislator_data = { }   # GovTrack ID -> merged YAML record
    leg_id_map = { }        # (id scheme, id value) -> GovTrack ID
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p == "legislators-current":
                # We know all terms but the last are non-current and the last is.
                for r in m["terms"]: r["current"] = False
                m["terms"][-1]["current"] = True
            elif p == "legislators-historical":
                # We know all terms are non-current.
                for r in m["terms"]: r["current"] = False

            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")

                # For the benefit of the social media file, make a mapping of IDs.
                # List-valued IDs are skipped.
                for k, v in m["id"].items():
                    if not isinstance(v, list):
                        leg_id_map[(k,v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file, so look the
                # person up by any other ID we saw in the earlier files.
                govtrack_id = None
                for k, v in m["id"].items():
                    if not isinstance(v, list) and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k,v)]
                        break

            if not govtrack_id:
                print("No GovTrack ID:")
                pprint.pprint(m)
                had_error = True
                continue

            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                # Executive terms are appended after any legislative terms.
                legislator_data[govtrack_id]["terms"].extend( m["terms"] )
            else:
                raise ValueError("Duplication in an unexpected way (%d, %s)." % (govtrack_id, p))

    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    # Bookkeeping sets. Within this function they are only written to.
    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')

    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)

            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person, person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()

            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Parse all of the roles.
            new_roles = []
            for termnode in node['terms']:
                role = role_processor.process(PersonRole(), termnode)
                role.person = person
                role.extra = filter_yaml_term_structure(termnode) # copy in the whole YAML structure

                # Is this role current? For legislators, same as whether it came from legislators-current, which eases Jan 3 transitions when we can't distinguish by date.
                if "current" in termnode:
                    role.current = termnode["current"]

                # But executives...
                else:
                    now = datetime.now().date()
                    role.current = role.startdate <= now and role.enddate >= now
                    # Because of date overlaps at noon transition dates, ensure that only the last term that covers
                    # today is current --- reset past roles to not current. Doesn't handle turning off retiring people tho.
                    # Only do this when this term really is current: a past or future executive term
                    # must not switch off an earlier role that is genuinely current.
                    if role.current:
                        for r in new_roles: r.current = False

                # Scan for most recent leadership role within the time period of this term,
                # which isn't great for Senators because it's likely it changed a few times
                # within a term, especially if there was a party switch.
                role.leadership_title = None
                for leadership_node in node.get("leadership_roles", []):
                    # must match on date and chamber (dates are compared as ISO-format strings)
                    if leadership_node["start"] >= role.enddate.isoformat(): continue # might start on the same day but is for the next Congress
                    if "end" in leadership_node and leadership_node["end"] <= role.startdate.isoformat(): continue # might start on the same day but is for the previous Congress
                    if leadership_node["chamber"] != RoleType.by_value(role.role_type).congress_chamber.lower(): continue
                    role.leadership_title = leadership_node["title"]

                new_roles.append(role)

            # Try matching the new roles to existing db records. Since we don't have a primary key
            # in the source data, we have to match on the record values. But because of errors in data,
            # term start/end dates can change, so matching has to be a little fuzzy.
            existing_roles = list(PersonRole.objects.filter(person=person))
            matches = []
            def run_match_rule(rule):
                # Pair up still-unmatched new/existing roles of the same type and state
                # for which rule(new_role, existing_role) holds. Matched roles are moved
                # out of new_roles/existing_roles and into matches.
                for new_role, existing_role in itertools.product(new_roles, existing_roles):
                    if new_role not in new_roles or existing_role not in existing_roles: continue # already matched on a previous iteration
                    if new_role.role_type != existing_role.role_type: continue
                    if new_role.state != existing_role.state: continue
                    if rule(new_role, existing_role):
                        matches.append((new_role, existing_role))
                        new_roles.remove(new_role)
                        existing_roles.remove(existing_role)

            # First match exactly, then exact on just one date, then on contractions and expansions.
            run_match_rule(lambda new_role, existing_role : new_role.startdate == existing_role.startdate and new_role.enddate == existing_role.enddate)
            run_match_rule(lambda new_role, existing_role : new_role.startdate == existing_role.startdate or new_role.enddate == existing_role.enddate)
            run_match_rule(lambda new_role, existing_role : new_role.startdate >= existing_role.startdate and new_role.enddate <= existing_role.enddate)
            run_match_rule(lambda new_role, existing_role : new_role.startdate <= existing_role.startdate and new_role.enddate >= existing_role.enddate)

            # Update the database entries that correspond with records in the data file.
            did_update_any = False
            for new_role, existing_role in matches:
                # Taking over the id makes save() update the existing row.
                new_role.id = existing_role.id
                if role_processor.changed(existing_role, new_role) or options.force:
                    new_role.save()
                    did_update_any = True
                    if not options.force:
                        log.warn("Updated %s" % new_role)

            # If we have multiple records on disk that didn't match and multiple records in the database
            # that didn't match, then we don't know how to align them.
            if len(new_roles) > 0 and len(existing_roles) > 0:
                print(new_roles)
                print(existing_roles)
                raise Exception("There is an unmatched role.")

            # Otherwise if there are any unmatched new roles, we can just add them.
            for role in new_roles:
                log.warn("Created %s" % role)
                role.save()
                did_update_any = True

            # And likewise for any existing roles that are left over. Deleting a role is
            # deliberately disabled: report the first leftover role and abort this person
            # (the exception is caught and logged by the handler below).
            for pr in existing_roles:
                print("%s %s" % (pr.person.id, pr))
                raise ValueError("Deleted role??")

            if did_update_any and not options.disable_events:
                # Create the events for the roles after all have been loaded
                # because we don't create events for ends of terms and
                # starts of terms that are adjacent. Refresh the list to get
                # the roles in order.
                role_list = list(PersonRole.objects.filter(person=person).order_by('startdate'))
                last_index = len(role_list) - 1
                for i, r in enumerate(role_list):
                    r.create_events(
                        role_list[i-1] if i > 0 else None,
                        role_list[i+1] if i < last_index else None
                        )

            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"): delattr(person, "role") # clear the cached info
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()

        except Exception as ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()
示例#13
0
def main(options):
    """
    Parse rolls.

    Loads roll call vote XML files into Vote, VoteOption and Voter records.
    A file is skipped when it is unchanged, a Vote already exists for it and
    that Vote is not flagged ``missing_data`` (unless ``options.force``).

    Args:
        options: object providing ``filter`` (glob pattern selecting files),
            ``congress`` (restrict to one Congress), ``force`` (reparse
            unchanged files) and ``disable_events`` (skip ``create_event``).

    Returns:
        None.

    Errors while processing one file are logged and do not stop the run.
    Files whose path does not have the expected
    ``data/congress/<congress>/votes/<session>/<h|s><number>/data.xml``
    layout are logged and skipped. Any error suppresses the final cleanup
    of Vote records that are no longer on disk.
    """

    # Sqlite is much faster when lots of saves are wrapped in a transaction,
    # and we do a lot of saves because it's a lot of voters.
    from django.db import transaction

    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    voter_processor.PERSON_CACHE = dict((x.pk, x) for x in Person.objects.all())

    chamber_mapping = {'s': CongressChamber.senate,
                       'h': CongressChamber.house}

    if options.filter:
        files = glob.glob(options.filter)
        log.info('Parsing rolls matching %s' % options.filter)
    elif options.congress:
        files = glob.glob(settings.CONGRESS_DATA_PATH + '/%s/votes/*/*/data.xml' % options.congress)
        log.info('Parsing rolls of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/congress/*/votes/*/*/data.xml')
    log.info('Processing votes: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=10)

    def log_delete_qs(qs):
        # Print and then delete every record in the queryset (no-op when empty).
        if qs.count() == 0: return
        print("Deleting obsoleted records: ", qs)
        #if qs.count() > 3:
        #    print "Delete skipped..."
        #    return
        qs.delete()

    seen_obj_ids = set()  # ids of Votes that correspond to a file on disk
    had_error = False

    for fname in files:
        progress.tick()

        match = re.match(r"data/congress/(?P<congress>\d+)/votes/(?P<session>[ABC0-9]+)/(?P<chamber>[hs])(?P<number>\d+)/data.xml$", fname)
        if match is None:
            # The vote's identity is taken from the path, so a file outside the
            # expected layout cannot be loaded. Skip it rather than crash the
            # whole run, and flag the error so nothing is deleted at the end.
            log.error('Error in processing %s: unexpected file path' % fname)
            had_error = True
            continue

        vote_congress = int(match.group("congress"))
        vote_chamber = chamber_mapping[match.group("chamber")]
        vote_session = match.group("session")
        vote_number = int(match.group("number"))

        try:
            existing_vote = Vote.objects.get(congress=vote_congress, chamber=vote_chamber, session=vote_session, number=vote_number)
        except Vote.DoesNotExist:
            existing_vote = None

        if not File.objects.is_changed(fname) and not options.force and existing_vote is not None and not existing_vote.missing_data:
            seen_obj_ids.add(existing_vote.id)
            continue

        try:
            tree = etree.parse(fname)

            ## Look for votes with VP tie breakers.
            #if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue

            # Process role object
            roll_node = tree.xpath('/roll')[0]

            # Everything written for one vote is committed (or rolled back) together.
            with transaction.atomic():

                vote = vote_processor.process(Vote(), roll_node)
                if existing_vote: vote.id = existing_vote.id  # update in place
                vote.congress = vote_congress
                vote.chamber = vote_chamber
                vote.session = vote_session
                vote.number = vote_number

                # Get related bill & amendment.

                for bill_node in roll_node.xpath("bill"):
                    related_bill_num = bill_node.get("number")
                    if 9 <= vote.congress <= 42 and vote.session in ('1', '2'):
                         # Bill numbering from the American Memory collection is different. The number combines
                         # the session, bill number, and a 0 or 5 for regular or 'half' numbering. Prior to
                         # the 9th congress numbering seems to be wholly assigned by us and not related to
                         # actual numbering, so we skip matching those bills.
                         related_bill_num = "%d%04d%d" % (int(vote.session), int(bill_node.get("number")), 0)
                    try:
                        vote.related_bill = Bill.objects.get(congress=bill_node.get("session"), bill_type=BillType.by_xml_code(bill_node.get("type")), number=related_bill_num)
                    except Bill.DoesNotExist:
                        if vote.congress >= 93:
                            vote.missing_data = True

                for amdt_node in roll_node.xpath("amendment"):
                    if amdt_node.get("ref") == "regular" and vote.related_bill is not None:
                        try:
                            vote.related_amendment = Amendment.objects.get(congress=vote.related_bill.congress, amendment_type=AmendmentType.by_slug(amdt_node.get("number")[0]+"amdt"), number=amdt_node.get("number")[1:])
                        except Amendment.DoesNotExist:
                            if vote.congress >= 93:
                                print("Missing amendment", fname)
                                vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        #vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote: vote.related_amendment = existing_vote.related_amendment

                # clean up some question text and use the question_details field

                if vote.category in (VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override) and vote.related_bill:
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = vote.related_bill.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                elif vote.related_bill and vote.question.startswith("On the Cloture Motion " + vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                elif vote.related_bill and vote.question.startswith("On Cloture on the Motion to Proceed " + vote.related_bill.display_number):
                    vote.question = "Cloture on " + vote.related_bill.title
                    vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith("On the Motion to Proceed " + vote.related_bill.display_number):
                    vote.question = "Motion to Proceed on " + vote.related_bill.title

                elif vote.related_amendment and vote.question.startswith("On the Cloture Motion " + vote.related_amendment.get_amendment_type_display() + " " + str(vote.related_amendment.number)):
                    vote.question = "Cloture on " + vote.related_amendment.title
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                # weird House formatting of bill numbers ("H RES 123 Blah blah")
                if vote.related_bill:
                    house_style_number = vote.related_bill.display_number.replace(". ", " ").replace(".", " ").upper()
                    short_title = truncatewords(vote.related_bill.title, 15)
                    # The bill number is escaped and the title is inserted through a
                    # function so that neither is interpreted as regex syntax (a
                    # backslash in a title would otherwise be read as an escape).
                    vote.question = re.sub(
                        "(On [^:]+): " + re.escape(house_style_number) + " .*",
                        lambda m: m.group(1) + ": " + short_title,
                        vote.question)

                vote.save()

                seen_obj_ids.add(vote.id) # don't delete me later

                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                roll_options = {}  # option key -> VoteOption
                for option_node in roll_node.xpath('./option'):
                    option = option_processor.process(VoteOption(), option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(vote=vote, key=option.key)[0].id # get is better, but I had the database corruption problem
                        except IndexError:
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(VoteOption.objects.filter(vote=vote).exclude(id__in=seen_option_ids)) # may cascade and delete the Voters too?

                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    # person id -> Voter id, consulted only when existing_vote is set
                    existing_voters = dict(Voter.objects.filter(vote=vote).values_list("person", "id"))
                seen_voter_ids = set()
                voters = list()
                for voter_node in roll_node.xpath('./voter'):
                    voter = voter_processor.process(roll_options, Voter(), voter_node)
                    voter.vote = vote
                    voter.created = vote.created

                    # for VP votes, load the actual person & role...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created)
                            voter.person_role = r
                            voter.person = r.person
                        except PersonRole.DoesNotExist:
                            # overlapping roles? missing data?
                            log.error('Could not resolve vice president in %s' % fname)

                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass

                    voters.append(voter)

                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()

                # pre-fetch the role of each voter
                load_roles_at_date([x.person for x in voters if x.person is not None], vote.created, vote.congress)
                for voter in list(voters):  # iterate over a copy: records may be dropped below
                    if voter.voter_type != VoterType.vice_president:
                        voter.person_role = voter.person.role
                    # If we couldn't match a role for this person on the date of the vote, and if the voter was Not Voting,
                    # and we're looking at historical data, then this is probably a data error --- the voter wasn't even in office.
                    # At the start of each Congress, the House does a Call by States and Election of the Speaker, before swearing
                    # in. In the 116th Congress, these votes had a Not Voting for Walter Jones who had not yet made it to DC, and
                    # then omitted Jones in the votes after swearing in. In those cases, look for a role coming up.
                    if voter.person_role is None and voter.option.key == "0" and vote.question in ("Call by States", "Election of the Speaker"):
                        voter.person_role = voter.person.roles.filter(startdate__gt=vote.created, startdate__lt=vote.created+timedelta(days=30)).first()
                    if voter.person_role is None:
                        if vote.source == VoteSource.keithpoole and voter.option.key == "0":
                            # Drop this record.
                            voters.remove(voter)
                        else:
                            log.error("%s: Could not find role for %s on %s." % (fname, voter.person, vote.created))
                            vote.missing_data = True
                            vote.save()

                # save all of the records (inserting/updating)
                for voter in voters:
                    voter.save()
                    seen_voter_ids.add(voter.id)

                # remove obsolete voter records
                log_delete_qs(Voter.objects.filter(vote=vote).exclude(id__in=seen_voter_ids)) # possibly already deleted by cascade above

                # pre-calculate totals
                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()

            # Only mark the file as processed once the transaction has completed.
            File.objects.save_file(fname)

        except Exception as ex:
            log.error('Error in processing %s' % fname, exc_info=ex)
            had_error = True

    # delete vote objects that are no longer represented on disk
    if options.congress and not options.filter and not had_error:
        log_delete_qs(Vote.objects.filter(congress=options.congress).exclude(id__in = seen_obj_ids))
示例#14
0
def main(options):
    """
    Update Person and PersonRole models.

    Do safe update: touch only those records
    which have been changed.

    The YAML source files are merged into one record per GovTrack ID. Each
    record is turned into a Person (saved when it is new, when it differs
    from the stored row, or when forced), and each of its terms is matched
    against the PersonRole rows already stored for that person.

    Args:
        options: object providing the boolean attributes ``force`` (treat
            every file and record as changed) and ``disable_events`` (skip
            ``create_events`` on roles).

    Returns:
        None. Returns before any parsing when no source file has changed
        and ``options.force`` is not set.

    Raises:
        ValueError: when a GovTrack ID is duplicated across source files in
            a way other than the social-media / executive merges. This is
            raised while combining the files, i.e. outside the per-person
            error handling, so it aborts the run.

    Exceptions raised while processing a single person are caught, logged
    and recorded in the local ``had_error`` flag so that the remaining
    people are still processed.
    """

    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters

    # Only do any work if at least one source file changed (or we are forced).
    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.

    had_error = False

    # Get combined data.
    legislator_data = { }   # GovTrack ID -> merged YAML record
    leg_id_map = { }        # (id scheme, id value) -> GovTrack ID
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")

                # For the benefit of the social media file, make a mapping of IDs.
                # List-valued IDs are skipped.
                for k, v in m["id"].items():
                    if not isinstance(v, list):
                        leg_id_map[(k,v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file, so look the
                # person up by any other ID we saw in the earlier files.
                govtrack_id = None
                for k, v in m["id"].items():
                    if not isinstance(v, list) and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k,v)]
                        break

            if not govtrack_id:
                print("No GovTrack ID:")
                pprint.pprint(m)
                had_error = True
                continue

            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                # Executive terms are appended after any legislative terms.
                legislator_data[govtrack_id]["terms"].extend( m["terms"] )
            else:
                raise ValueError("Duplication in an unexpected way (%d, %s)." % (govtrack_id, p))

    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    # Bookkeeping sets. Within this function they are only written to.
    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')

    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)

            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person, person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()

            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Process roles of the person
            roles = list(PersonRole.objects.filter(person=person))  # stored roles not yet matched to a term
            existing_roles = set(r.pk for r in roles)               # pks of all stored roles
            processed_roles = set()                                 # pks of stored roles matched to a term
            did_update_any = False
            for termnode in node['terms']:
                role = role_processor.process(PersonRole(), termnode)
                role.person = person

                # Read the clock once so both comparisons use the same date.
                today = datetime.now().date()
                role.current = role.startdate <= today and role.enddate >= today # \
                        #and CURRENT_CONGRESS in role.congress_numbers()

                # Scan for most recent leadership role within the time period of this term,
                # which isn't great for Senators because it's likely it changed a few times
                # within a term, especially if there was a party switch.
                role.leadership_title = None
                for leadership_node in node.get("leadership_roles", []):
                    # must match on date and chamber (dates are compared as ISO-format strings)
                    if leadership_node["start"] >= role.enddate.isoformat(): continue # might start on the same day but is for the next Congress
                    if "end" in leadership_node and leadership_node["end"] <= role.startdate.isoformat(): continue # might start on the same day but is for the previous Congress
                    if leadership_node["chamber"] != RoleType.by_value(role.role_type).congress_chamber.lower(): continue
                    role.leadership_title = leadership_node["title"]

                # Try to match this role with one already in the database.
                # First search for an exact match on type/start/end.
                ex_role = None
                for r in roles:
                    if role.role_type == r.role_type and r.startdate == role.startdate and r.enddate == role.enddate:
                        ex_role = r
                        break

                # Otherwise match on type/start only.
                if not ex_role:
                    for r in roles:
                        if role.role_type == r.role_type and r.startdate == role.startdate:
                            ex_role = r
                            break

                if ex_role:
                    # These roles correspond. Taking over the id makes save()
                    # update the existing row.
                    processed_roles.add(ex_role.id)
                    role.id = ex_role.id
                    if role_processor.changed(ex_role, role) or options.force:
                        role.save()
                        did_update_any = True
                        if not options.force:
                            log.warn("Updated %s" % role)
                    roles.remove(ex_role) # don't need to try matching this to any other node
                else:
                    # Didn't find a matching role. If a stored role of the same type is
                    # still unmatched we can't tell whether this term is new or an
                    # edited version of it, so give up on this person.
                    if any(r.role_type == role.role_type for r in roles):
                        print("%s is one of these?" % (role,))
                        for candidate in roles:
                            print("\t %s" % (candidate,))
                        raise Exception("There is an unmatched role.")
                    log.warn("Created %s" % role)
                    role.save()
                    did_update_any = True

            # create the events for the roles after all have been loaded
            # because we don't create events for ends of terms and
            # starts of terms that are adjacent. The neighbours passed to
            # create_events must therefore come from the person's complete
            # list of roles in date order, not just from the roles saved in
            # this run, so reload the list from the database.
            if did_update_any and not options.disable_events:
                role_list = list(PersonRole.objects.filter(person=person).order_by('startdate'))
                last_index = len(role_list) - 1
                for i, r in enumerate(role_list):
                    r.create_events(
                        role_list[i-1] if i > 0 else None,
                        role_list[i+1] if i < last_index else None
                        )

            # Stored roles that no term matched. Deleting a role is deliberately
            # disabled: report the first such role and abort this person (the
            # exception is caught and logged by the handler below).
            removed_roles = existing_roles - processed_roles
            for pk in removed_roles:
                pr = PersonRole.objects.get(pk=pk)
                print("%s %s" % (pr.person.id, pr))
                raise ValueError("Deleted role??")

            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"): delattr(person, "role") # clear the cached info
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()

        except Exception as ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()
示例#15
0
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info("Processing old bill terms")
    TERMS_FILE = "data/us/liv.xml"
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath("/liv/top-term"):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath("./term"):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)

                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error("Duplicated term %s" % term_processor.display_node(subnode))

    log.info("Processing new bill terms")
    for FILE in ("data/us/liv111.xml", "data/us/crsnet.xml"):
        tree = etree.parse(FILE)
        for node in tree.xpath("/liv/top-term"):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath("./term"):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)

                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error("Duplicated term %s" % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex

        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob("data/congress/%s/bills/*/*/*.xml" % options.congress)
        log.info("Parsing unitedstates/congress bills of only congress#%s" % options.congress)
    elif options.congress:
        files = glob.glob("data/us/%s/bills/*.xml" % options.congress)
        log.info("Parsing bills of only congress#%s" % options.congress)
    else:
        files = glob.glob("data/us/*/bills/*.xml")

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info("Processing bills: %d files" % len(files))
    total = len(files)
    progress = Progress(total=total, name="files", step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (
            (not options.congress or options.congress > 42)
            and (bill_index and not options.disable_events)
            and not File.objects.is_changed(fname)
            and not options.force
        ):
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now() - timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill")  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)

                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath("/bill"):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise

            seen_bill_ids.append(bill.id)  # don't delete me later

            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                if bill.current_status == BillStatus.enacted_signed:
                    bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append(
                    (
                        repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                        BillStatus.by_xml_code(axn.xpath("string(@state)")),
                        axn.xpath("string(text)"),
                        etree.tostring(axn),
                    )
                )

            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index:
                bill_index.update_object(bill, using="bill")

            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in=seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b

    # The rest is for current only...

    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601

    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error("No docs.house.gov download link found at http://docs.house.gov.")
    else:

        def bt_re(bt):
            return re.escape(bt[1]).replace(r"\.", r"\.?\s*")

        try:
            dhg = etree.parse(urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/floor/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            if billname is None:
                continue  # weird but OK
            m = re.match(
                r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                + "|".join(bt_re(bt) for bt in BillType)
                + r")(\d+)\s*(\[Conference Report\]\s*)?$",
                billname,
                re.I,
            )
            if not m:
                if not billname.strip().endswith(" __"):
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1), re.I):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS, bill_type=bt[0], number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")

                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % m.group(1))

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen(
        "http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm"
    ).read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(
            r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs
        ):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(
                days=7
            ):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index:
                    bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error("Could not parse Senate Floor Schedule: " + repr(e))
示例#16
0
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type),
                                          subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)

                    existing_terms[(int(subterm.term_type),
                                    subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' %
                              term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type),
                                              subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)

                        existing_terms[(int(subterm.term_type),
                                        subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' %
                                  term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' %
                          options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' %
                 options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or options.congress > 42) and (
                bill_index and not options.disable_events
        ) and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1),
                                     bill_type=BillType.by_xml_code(
                                         m.group(2)),
                                     number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (
                            datetime.now() - timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(
                        textfile):
                    bill_index.update_object(
                        b, using="bill")  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)

                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise

            seen_bill_ids.append(bill.id)  # don't delete me later

            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                if bill.current_status == BillStatus.enacted_signed:
                    bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append((
                    repr(
                        bill_processor.parse_datetime(
                            axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn),
                ))

            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get(
                    "type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index: bill_index.update_object(bill, using="bill")

            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(
                id__in=seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b

    # The rest is for current only...

    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"",
                  dhg_html)
    if not m:
        log.error(
            'No docs.house.gov download link found at http://docs.house.gov.')
    else:

        def bt_re(bt):
            return re.escape(bt[1]).replace(r"\.", r"\.?\s*")

        try:
            dhg = etree.parse(
                urllib.urlopen("http://docs.house.gov/floor/" +
                               m.group(1))).getroot()
        except:
            print "http://docs.house.gov/floor/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            if billname is None: continue  # weird but OK
            m = re.match(
                r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                + "|".join(bt_re(bt) for bt in BillType) +
                r")(\d+)\s*(\[Conference Report\]\s*)?$", billname, re.I)
            if not m:
                if not billname.strip().endswith(" __"):
                    log.error(
                        'Could not parse legis-num "%s" in docs.house.gov.' %
                        billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1), re.I):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS,
                                                    bill_type=bt[0],
                                                    number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(
                                item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")

                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error(
                                'Could not find bill "%s" in docs.house.gov.' %
                                billname)
                        break
                else:
                    log.error(
                        'Could not parse legis-num bill type "%s" in docs.house.gov.'
                        % m.group(1))

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen(
        "http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm"
    ).read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting",
                        sfs).group(1)
        for congress, bill_type, number in re.findall(
                r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)",
                sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress,
                                    bill_type=bill_type,
                                    number=number)
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(
                    days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index: bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
示例#17
0
def main(options):
    """
    Parse rolls.
    """
    
    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    voter_processor.PERSON_CACHE = dict((x.pk, x) for x in Person.objects.all())

    # The pattern which the roll file matches
    # Filename contains info which should be placed to DB
    # along with info extracted from the XML file
    re_path = re.compile('data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml')

    chamber_mapping = {'s': CongressChamber.senate,
                       'h': CongressChamber.house}

    if options.filter:
        files = glob.glob(options.filter)
        log.info('Parsing rolls matching %s' % options.filter)
    elif options.congress:
        files = glob.glob('data/us/%s/rolls/*.xml' % options.congress)
        log.info('Parsing rolls of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/rolls/*.xml')
    log.info('Processing votes: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=10)

    def log_delete_qs(qs):
        if qs.count() > 0:
            try:
                print "Deleting: ", qs
            except Exception as e:
                print "Deleting [%s]..." % str(e)
            if qs.count() > 3:
                print "Delete skipped..."
                return
            qs.delete()

    seen_obj_ids = set()
    had_error = False

    for fname in files:
        progress.tick()

        match = re_path.search(fname)
        
        try:
            existing_vote = Vote.objects.get(congress=match.group(1), chamber=chamber_mapping[match.group(2)], session=match.group(3), number=match.group(4))
        except Vote.DoesNotExist:
            existing_vote = None
        
        if not File.objects.is_changed(fname) and not options.force and existing_vote != None and not existing_vote.missing_data:
            seen_obj_ids.add(existing_vote.id)
            continue
            
        try:
            tree = etree.parse(fname)
            
            ## Look for votes with VP tie breakers.
            #if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue
            
            # Process role object
            for roll_node in tree.xpath('/roll'):
                vote = vote_processor.process(Vote(), roll_node)
                if existing_vote: vote.id = existing_vote.id
                match = re_path.search(fname)
                vote.congress = int(match.group(1))
                vote.chamber = chamber_mapping[match.group(2)]
                vote.session = match.group(3)
                vote.number = int(match.group(4))
                
                for bill_node in roll_node.xpath("bill"):
                    try:
                        vote.related_bill = Bill.objects.get(congress=bill_node.get("session"), bill_type=BillType.by_xml_code(bill_node.get("type")), number=bill_node.get("number"))
                        
                        # for votes on passage, reverse the order of the title so that the
                        # name of the bill comes first, but keep the vote_type at the end
                        # to distinguish suspension votes etc. also, the title that comes
                        # from the upstream source is not formatted in our style.
                        if vote.category in (VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override):
                            vote.question = truncatewords(vote.related_bill.title, 12) + " (" + vote.vote_type + ")"
                        
                    except Bill.DoesNotExist:
                        vote.missing_data = True
                
                vote.save()
                
                seen_obj_ids.add(vote.id) # don't delete me later
                
                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                roll_options = {}
                for option_node in roll_node.xpath('./option'):
                    option = option_processor.process(VoteOption(), option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(vote=vote, key=option.key)[0].id # get is better, but I had the database corruption problem
                        except IndexError:
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(VoteOption.objects.filter(vote=vote).exclude(id__in=seen_option_ids)) # may cascade and delete the Voters too?

                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    existing_voters = dict(Voter.objects.filter(vote=vote).values_list("person", "id"))
                seen_voter_ids = set()
                for voter_node in roll_node.xpath('./voter'):
                    voter = voter_processor.process(roll_options, Voter(), voter_node)
                    voter.vote = vote
                    voter.created = vote.created
                        
                    # for VP votes, load the actual person...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created)
                            voter.person = r.person
                        except:
                            # overlapping roles? missing data?
                            log.error('Could not resolve vice president in %s' % fname, exc_info=ex)
                        
                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass
                        
                    voter.save()
                    
                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()
                        
                    seen_voter_ids.add(voter.id)
                    
                log_delete_qs(Voter.objects.filter(vote=vote).exclude(id__in=seen_voter_ids)) # possibly already deleted by cascade above

                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()
                    
            File.objects.save_file(fname)

        except Exception, ex:
            log.error('Error in processing %s' % fname, exc_info=ex)
            had_error = True
示例#18
0
def main(options):
    """
    Update Person and PersonRole models.

    Do safe update: touch only those records
    which have been changed.

    The YAML files named in SRC_FILES are merged into one record per
    GovTrack ID. Each record is then run through PersonProcessor and each
    of its terms through PersonRoleProcessor; rows are saved only when they
    are new, when the processor reports a change, or when options.force is
    set.

    options -- parsed command-line options. This function reads
        options.force and options.disable_events.

    Returns None. If none of the source files changed (and options.force is
    not set) the function returns before parsing anything. An exception
    raised while handling one legislator is logged and the loop moves on to
    the next legislator.
    """

    BASE_PATH = CONGRESS_LEGISLATORS_PATH
    SRC_FILES = ['legislators-current', 'legislators-historical', 'legislators-social-media', 'executive'] # order matters

    for p in SRC_FILES:
        f = BASE_PATH + p + ".yaml"
        if not File.objects.is_changed(f) and not options.force:
            log.info('File %s was not changed' % f)
        else:
            # file modified...
            break
    else:
        # no 'break' ==> no files modified
        return

    # Start parsing.

    # NOTE(review): had_error is set on failures but is never read or
    # returned within this function.
    had_error = False

    # Get combined data.
    legislator_data = { }
    leg_id_map = { }
    for p in SRC_FILES:
        log.info('Opening %s...' % p)
        f = BASE_PATH + p + ".yaml"
        y = yaml_load(f)
        for m in y:
            if p != 'legislators-social-media':
                govtrack_id = m["id"].get("govtrack")

                # For the benefit of the social media file, make a mapping of IDs.
                for k, v in m["id"].items():
                    if not isinstance(v, list):
                        leg_id_map[(k,v)] = govtrack_id
            else:
                # GovTrack IDs are not always listed in this file, so look
                # the record up through any other ID we have already seen.
                # Reset first: otherwise a record with no known ID would
                # silently inherit the previous record's GovTrack ID.
                govtrack_id = None
                for k, v in m["id"].items():
                    if not isinstance(v, list) and (k, v) in leg_id_map:
                        govtrack_id = leg_id_map[(k,v)]
                        break

            if not govtrack_id:
                print("No GovTrack ID:")
                pprint.pprint(m)
                had_error = True
                continue

            if govtrack_id not in legislator_data:
                legislator_data[govtrack_id] = m
            elif p == "legislators-social-media":
                legislator_data[govtrack_id]["social"] = m["social"]
            elif p == "executive":
                legislator_data[govtrack_id]["terms"].extend( m["terms"] )
            else:
                raise ValueError("Duplication in an unexpected way.")

    person_processor = PersonProcessor()
    role_processor = PersonRoleProcessor()

    # NOTE(review): these three sets are filled in but not read anywhere in
    # this function.
    existing_persons = set(Person.objects.values_list('pk', flat=True))
    processed_persons = set()
    created_persons = set()

    progress = Progress(total=len(legislator_data))
    log.info('Processing persons')

    # Use one "today" for the whole run so that every role is judged
    # against the same date, even if the run crosses midnight.
    today = datetime.now().date()

    for node in legislator_data.values():
        # Wrap each iteration in try/except
        # so that if some node breaks the parsing process
        # then other nodes could be parsed
        try:
            person = person_processor.process(Person(), node)

            # Create cached name strings. This is done again later
            # after the roles are updated.
            person.set_names()

            # Now try to load the person with such ID from
            # database. If found it then just update it
            # else create new Person object
            try:
                ex_person = Person.objects.get(pk=person.pk)
                if person_processor.changed(ex_person, person) or options.force:
                    # If the person has PK of existing record,
                    # coming in via the YAML-specified GovTrack ID,
                    # then Django ORM will update existing record
                    if not options.force:
                        log.warn("Updated %s" % person)
                    person.save()

            except Person.DoesNotExist:
                created_persons.add(person.pk)
                person.save()
                log.warn("Created %s" % person)

            processed_persons.add(person.pk)

            # Process roles of the person. `roles` is consumed as roles are
            # matched, so take the set of existing primary keys up front.
            roles = list(PersonRole.objects.filter(person=person))
            existing_roles = set(r.pk for r in roles)
            processed_roles = set()
            role_list = []
            for role in node['terms']:
                role = role_processor.process(PersonRole(), role)
                role.person = person

                role.current = role.startdate <= today and role.enddate >= today # \
                        #and CURRENT_CONGRESS in role.congress_numbers()

                # Try to match this role with one already in the database.
                # First search for an exact match on type/start/end.
                ex_role = None
                for r in roles:
                    if role.role_type == r.role_type and r.startdate == role.startdate and r.enddate == role.enddate:
                        ex_role = r
                        break

                # Otherwise match on type/start only.
                if not ex_role:
                    for r in roles:
                        if role.role_type == r.role_type and r.startdate == role.startdate:
                            ex_role = r
                            break

                if ex_role:
                    # These roles correspond.
                    processed_roles.add(ex_role.id)
                    role.id = ex_role.id
                    if role_processor.changed(ex_role, role) or options.force:
                        role.save()
                        role_list.append(role)
                        if not options.force:
                            log.warn("Updated %s" % role)
                    roles.remove(ex_role) # don't need to try matching this to any other node
                else:
                    # Didn't find a matching role. If an unmatched database
                    # role of the same type remains, refuse to guess.
                    if any(r.role_type == role.role_type for r in roles):
                        print("%s is one of these?" % role)
                        for ex_role in roles:
                            print("\t%s" % ex_role)
                        raise Exception("There is an unmatched role.")
                    log.warn("Created %s" % role)
                    role.save()
                    role_list.append(role)

            # create the events for the roles after all have been loaded
            # because we don't create events for ends of terms and
            # starts of terms that are adjacent.
            if not options.disable_events:
                last = len(role_list) - 1
                for i, saved_role in enumerate(role_list):
                    saved_role.create_events(
                        role_list[i-1] if i > 0 else None,
                        role_list[i+1] if i < last else None
                        )

            # A role that exists in the database but not in the source data
            # is reported and treated as an error; it is not deleted.
            removed_roles = existing_roles - processed_roles
            for pk in removed_roles:
                pr = PersonRole.objects.get(pk=pk)
                print("%s %s" % (pr.person.id, pr))
                raise ValueError("Deleted role??")

            # The name can't be determined until all of the roles are set. If
            # it changes, re-save. Unfortunately roles are cached so this actually
            # doesn't work yet. Re-run the parser to fix names.
            nn = (person.name, person.sortname)
            if hasattr(person, "role"): delattr(person, "role") # clear the cached info
            person.set_names()
            if nn != (person.name, person.sortname):
                log.warn("%s is now %s." % (nn[0], person.name))
                person.save()

        except Exception as ex:
            # Catch unexpected exceptions and log them
            pprint.pprint(node)
            log.error('', exc_info=ex)
            had_error = True

        progress.tick()
def main(options):
    """
    Process amendments

    Globs the amendment data.xml files under CONGRESS_DATA_PATH (optionally
    limited to options.congress and to paths matching the regular
    expression options.filter), and for each file that changed (or every
    file when options.force is set) parses it with AmendmentProcessor and
    saves the resulting Amendment, re-using the id of a matching existing
    row. House amendments are also linked to the roll call votes named in
    their action lines.

    options -- parsed command-line options. This function reads
        options.congress, options.filter and options.force.

    Raises ValueError if a globbed path does not have the expected
    <congress>/amendments/<type>/<type><number>/data.xml shape.
    Returns None.
    """

    if options.congress:
        files = glob.glob(CONGRESS_DATA_PATH + '/{congress}/amendments/*/*/data.xml'.format(congress=options.congress))
        log.info('Parsing amendments of only congress#%s' % options.congress)
    else:
        files = glob.glob(CONGRESS_DATA_PATH + '/*/amendments/*/*/data.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing amendments: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    amendment_processor = AmendmentProcessor()

    # The congress, amendment type and number are taken from the path.
    path_re = re.compile(re.escape(CONGRESS_DATA_PATH) + r'/(?P<congress>\d+)/amendments/(?P<amendment_type>[a-z]+)/(?P<amendment_type2>[a-z]+)(?P<number>[0-9]+)/data.xml')

    seen_amdt_ids = set()
    for fname in files:
        progress.tick()

        # Validate the path before either branch below uses the match, so a
        # bad name fails the same way whether or not the file has changed.
        m = path_re.match(fname)
        if not m:
            raise ValueError("Invalid file name", fname)

        if not File.objects.is_changed(fname) and not options.force:
            amdt = Amendment.objects.get(congress=int(m.group("congress")), amendment_type=AmendmentType.by_slug(m.group("amendment_type")), number=int(m.group("number")))
            seen_amdt_ids.add(amdt.id) # don't delete me later
            continue

        tree = etree.parse(fname)
        node = tree.xpath('/amendment')[0]
        node.set("amendment_type", m.group("amendment_type")) # move from the filename to a place where we can see it in the XML

        try:
            amdt = amendment_processor.process(Amendment(), node)
        except:
            # Say which file failed, then let the error propagate.
            print(fname)
            raise

        if not amdt:
            # Amendments to treaties. Can't process.
            continue

        # update if already in db
        try:
            amdt.id = Amendment.objects.get(congress=amdt.congress, amendment_type=amdt.amendment_type, number=amdt.number).id
        except Amendment.DoesNotExist:
            pass # a new amendment

        try:
            amdt.save()
        except:
            print(amdt)
            raise

        # Record the id only after saving: a brand-new amendment has no id
        # until then, and would otherwise be reported as missing below.
        seen_amdt_ids.add(amdt.id) # don't delete me later

        # For House votes on amendments, the only way to associate the vote with the
        # amendment is to use the THOMAS/Congress.gov action lines. The House vote XML
        # has an amendment-num field but its meaning is ambiguous, so it is useless.
        # When we parse a House amendment with an action line referencing a roll call vote,
        # save this amendment as that vote's related_amendment, then mark the vote as
        # 'missing data' (below) so that on the next parse of votes its title gets updated.
        if amdt.amendment_type == AmendmentType.house_amendment:
            for vote_node in node.xpath("actions/vote[@how='roll']"):
                v_congress, v_session = get_session_from_date(XmlProcessor.parse_datetime(vote_node.get('datetime')).date())
                v_roll = int(vote_node.get("roll"))
                try:
                    vote = Vote.objects.get(congress=v_congress, chamber=CongressChamber.house, session=v_session, number=v_roll)
                    vote.related_amendment = amdt
                    vote.save()
                except Vote.DoesNotExist:
                    print("Missing vote data in", fname)

        # If this amendment is related to a vote, mark the vote as missing data because
        # we may need to update the vote title if the amendment title has changed.
        Vote.objects.filter(related_amendment=amdt).update(missing_data=True)

        File.objects.save_file(fname)

    # Are any amendments in the database no longer on disk?
    if options.congress and not options.filter:
        missing = Amendment.objects.filter(congress=options.congress).exclude(id__in = seen_amdt_ids)
        if missing.exists():
            print("Amendments should be deleted: ", missing)
示例#20
0
def main(options):
    """
    Process bill terms and bills

    First synchronises BillTerm rows with the subject-term XML files
    (creating terms and subterm links that are missing and deleting cached
    terms that no longer appear), then parses the bill XML files selected
    by options.congress / options.filter, saving each Bill and, unless
    disabled, updating the search index and creating events. Finally, when
    not restricted to a non-current congress, loads the upcoming House and
    Senate floor schedules.

    options -- parsed command-line options. This function reads
        options.congress, options.filter, options.force, options.slow,
        options.disable_indexing and options.disable_events.

    Raises ValueError if an unchanged metadata file's path does not have
    the expected <congress>/bills/<type><number>.xml shape.
    Returns None.
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    def load_terms(path, term_type):
        # Sync the top-terms and their subterms found in one XML file.
        tree = etree.parse(path)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = term_type
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
            except KeyError:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()
            else:
                terms_parsed.add(term.id)

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = term_type
                key = (int(subterm.term_type), subterm.name)
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[key]
                except KeyError:
                    try:
                        log.debug("Created %s" % subterm)
                        subterm.save()
                        term.subterms.add(subterm)

                        existing_terms[key] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))
                else:
                    term.subterms.add(subterm)
                    terms_parsed.add(subterm.id)

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    load_terms(TERMS_FILE, TermType.old)

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        load_terms(FILE, TermType.new)

    # Anything cached (or created as a subterm) that was not seen in the
    # files above is removed.
    for term in existing_terms.values():
        if term.id not in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    fname_re = re.compile(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$")
    seen_bill_ids = set()
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        # options.congress is converted with int() before comparing, as in
        # the file selection above; comparing the raw option value to 42
        # does not give a numeric comparison when it is a string.
        if (not options.congress or int(options.congress) > 42) and (bill_index and not options.disable_events) and not File.objects.is_changed(fname) and not options.force:
            m = fname_re.search(fname)
            if m is None:
                raise ValueError("Invalid file name", fname)

            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.add(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now()-timedelta(days=14)).date():
                        print("No bill text? %s %s" % (fname, b.introduced_date))
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    b.update_index(bill_index) # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)

                continue
            except Bill.DoesNotExist:
                print("Unchanged metadata file but bill doesn't exist: %s" % fname)
                pass # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                # Say which file failed, then let the error propagate.
                print(fname)
                raise

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append( (
                    repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn),
                    ) )

            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print(bill)
                raise

            # Record the id only after saving, when a newly created bill
            # is guaranteed to have one.
            seen_bill_ids.add(bill.id) # don't delete me later

            if bill_index:
                bill.update_index(bill_index)

            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids):
            print("Bill is no longer on disk:  %s %s" % (b.id, b))

    # The rest is for current only...

    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Find what might be coming up this week.
    load_docs_house_gov(options, bill_index)
    load_senate_floor_schedule(options, bill_index)
def main(options):
    """
    Process committees, subcommittees and
    members of current congress committees.

    Runs three independent steps, each skipped when its input file is
    unchanged and options.force is not set:

    1. committees-current.yaml: create or update Committee rows for every
       committee and subcommittee, and mark as obsolete any non-obsolete
       committee that is not listed.
    2. committee-membership-current.yaml: delete all CommitteeMember rows
       and recreate them from the file.
    3. data/congress/committee_meetings_<chamber>.json for the house and
       the senate: create or update CommitteeMeeting rows, attach the
       referenced bills, and (unless options.disable_events is set) create
       events for every committee.

    options -- parsed command-line options. This function reads
        options.force and options.disable_events.

    Returns None.
    """

    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH

    meeting_processor = CommitteeMeetingProcessor()

    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                print("New committee: %s" % committee["thomas_id"])
                cobj = Committee(code=committee["thomas_id"])

            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None
            cobj.jurisdiction = committee.get("jurisdiction")
            cobj.jurisdiction_link = committee.get("jurisdiction_source")
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                # A subcommittee's code is its parent's code followed by its own.
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print("New subcommittee: %s" % code)
                    sobj = Committee(code=code)

                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                # NOTE(review): the parent committee above sets
                # `committee_type`; confirm `type` is the intended attribute.
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)

            progress.tick()

        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print("Marking obsolete: %s" % ", ".join(c.code for c in other_committees))
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)

    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = { }
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]

        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')

        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print("Committee not found: %s" % committee)
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()

            progress.tick()

        File.objects.save_file(MEMBERS_FILE)

    log.info('Processing committee schedule')
    for chamber in ("house", "senate"):
        meetings_file = 'data/congress/committee_meetings_%s.json' % chamber
        file_changed = File.objects.is_changed(meetings_file)

        if not file_changed and not options.force:
            log.info('File %s was not changed' % meetings_file)
        else:
            # Close the file as soon as it has been parsed.
            with open(meetings_file) as meetings_fp:
                meetings = json.load(meetings_fp)

            # Process committee event nodes
            for meeting in meetings:
                try:
                    # Associate it with an existing meeting object if GUID is already known.
                    # Must get it like this, vs just assigning the ID as we do in other parsers,
                    # because of the auto_now_add created field, which otherwise misbehaves.
                    try:
                        mobj = CommitteeMeeting.objects.get(guid=meeting['guid'])
                    except CommitteeMeeting.DoesNotExist:
                        mobj = CommitteeMeeting()

                    # Parse.
                    mobj = meeting_processor.process(mobj, meeting)

                    # Attach the meeting to the subcommittee if set.
                    if mobj.subcommittee:
                        mobj.committee = Committee.objects.get(code=mobj.committee.code + mobj.subcommittee)

                    mobj.save()

                    mobj.bills.clear()
                    for bill in meeting["bill_ids"]:
                        try:
                            bill_type, bill_num, bill_cong = re.match(r"([a-z]+)(\d+)-(\d+)$", bill).groups()
                            bill = Bill.objects.get(congress=bill_cong, bill_type=BillType.by_slug(bill_type), number=int(bill_num))
                            mobj.bills.add(bill)
                        except AttributeError:
                            pass # regex failed
                        except common.enum.NotFound:
                            pass # invalid bill type code in source data
                        except Bill.DoesNotExist:
                            pass # we don't know about bill yet
                except Committee.DoesNotExist:
                    log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))

            # Only walk the committee table when events are wanted.
            if not options.disable_events:
                for committee in Committee.objects.all():
                    committee.create_events()

            File.objects.save_file(meetings_file)
def main(options):
    """
    Process bill terms and bills

    First synchronises BillTerm rows with the subject-term XML files
    (creating terms and subterm links that are missing and deleting cached
    terms that no longer appear), then parses the bill data.xml files under
    settings.CONGRESS_DATA_PATH selected by options.congress /
    options.filter, saving each Bill and, unless disabled, updating the
    search index and creating events. Finally, when not restricted to a
    non-current congress, loads the upcoming House and Senate floor
    schedules.

    options -- parsed command-line options. This function reads
        options.congress, options.filter, options.force, options.slow,
        options.disable_indexing and options.disable_events.

    Raises ValueError if an unchanged metadata file's path does not have
    the expected <congress>/bills/<type>/<type><number>/data.xml shape.
    Returns None.
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    def load_terms(path, term_type):
        # Sync the top-terms and their subterms found in one XML file.
        tree = etree.parse(path)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = term_type
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
            except KeyError:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()
            else:
                terms_parsed.add(term.id)

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = term_type
                key = (int(subterm.term_type), subterm.name)
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[key]
                except KeyError:
                    try:
                        log.debug("Created %s" % subterm)
                        subterm.save()
                        term.subterms.add(subterm)

                        existing_terms[key] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' %
                                  term_processor.display_node(subnode))
                else:
                    term.subterms.add(subterm)
                    terms_parsed.add(subterm.id)

    log.info('Processing old bill terms')
    TERMS_FILE = 'bill/liv.xml'
    load_terms(TERMS_FILE, TermType.old)

    log.info('Processing new bill terms')
    for FILE in ('bill/liv111.xml', 'bill/crsnet.xml'):
        load_terms(FILE, TermType.new)

    # Anything cached (or created as a subterm) that was not seen in the
    # files above is removed.
    for term in existing_terms.values():
        if term.id not in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob(settings.CONGRESS_DATA_PATH +
                          '/%s/bills/*/*/data.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' %
                 options.congress)
    else:
        files = glob.glob(settings.CONGRESS_DATA_PATH +
                          '/*/bills/*/*/data.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()

    # The congress, bill type and number are taken from the path.
    path_re = re.compile(
        re.escape(settings.CONGRESS_DATA_PATH) +
        r'/(?P<congress>\d+)/bills/(?P<bill_type>[a-z]+)/(?P<bill_type_2>[a-z]+)(?P<number>\d+)/data.xml')

    seen_bill_ids = set()
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or int(options.congress) > 42) and (
                bill_index and not options.disable_events
        ) and not File.objects.is_changed(fname) and not options.force:
            m = path_re.match(fname)
            if m is None:
                raise ValueError("Invalid file name", fname)

            try:
                b = Bill.objects.get(congress=int(m.group("congress")),
                                     bill_type=BillType.by_slug(
                                         m.group("bill_type")),
                                     number=m.group("number"))
                seen_bill_ids.add(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (
                            datetime.now() - timedelta(days=14)).date():
                        print("No bill text?", fname, b.introduced_date)
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(
                        textfile):
                    b.update_index(bill_index)  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)

                continue
            except Bill.DoesNotExist:
                print("Unchanged metadata file but bill doesn't exist:", fname)
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                # Say which file failed, then let the error propagate.
                print(fname)
                raise

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = str(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                if axn.xpath("string(@state)") == "REFERRED":
                    continue  # we don't track this state
                actions.append((
                    repr(
                        bill_processor.parse_datetime(
                            axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn, encoding=str),
                ))

            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get(
                    "type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print(bill)
                raise

            # Record the id only after saving, when a newly created bill
            # is guaranteed to have one.
            seen_bill_ids.add(bill.id)  # don't delete me later

            if bill_index:
                bill.update_index(bill_index)

            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(
                id__in=seen_bill_ids):
            print("Bill is no longer on disk: ", b.id, b)

    # The rest is for current only...

    if options.congress and int(options.congress) != settings.CURRENT_CONGRESS:
        return

    # Find what might be coming up this week.
    load_docs_house_gov(options, bill_index)
    load_senate_floor_schedule(options, bill_index)
示例#23
0
def main(options):
    """
    Parse roll call vote XML files and load them into the database.

    For every selected roll file this creates or updates the Vote record,
    its VoteOption records and its Voter records, deletes option/voter rows
    that are no longer present in the file, recalculates the vote totals
    and, unless ``options.disable_events`` is set, creates the vote's event.

    Attributes read from ``options``:
      filter          -- glob pattern of roll files to parse (takes precedence)
      congress        -- otherwise, only parse rolls of this congress
      force           -- parse a file even if File.objects.is_changed() is false
      disable_events  -- skip vote.create_event()

    Any exception raised while processing a single file is logged and
    recorded in ``had_error``; the loop then moves on to the next file.
    """

    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    voter_processor.PERSON_CACHE = dict((x.pk, x) for x in Person.objects.all())

    # The pattern which the roll file matches
    # Filename contains info which should be placed to DB
    # along with info extracted from the XML file
    # Groups: (1) congress, (2) chamber letter, (3) session, (4) roll number.
    re_path = re.compile(r"data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml")

    chamber_mapping = {"s": CongressChamber.senate, "h": CongressChamber.house}

    if options.filter:
        files = glob.glob(options.filter)
        log.info("Parsing rolls matching %s" % options.filter)
    elif options.congress:
        files = glob.glob("data/us/%s/rolls/*.xml" % options.congress)
        log.info("Parsing rolls of only congress#%s" % options.congress)
    else:
        files = glob.glob("data/us/*/rolls/*.xml")
    log.info("Processing votes: %d files" % len(files))
    total = len(files)
    progress = Progress(total=total, name="files", step=10)

    def log_delete_qs(qs):
        """Print and then delete the rows of queryset ``qs``; no-op when empty."""
        if qs.count() == 0:
            return
        # Single pre-formatted argument so the output is the same whether
        # print is a statement (Python 2) or a function (Python 3).
        print("Deleting obsoleted records:  %s" % (qs,))
        # if qs.count() > 3:
        #    print "Delete skipped..."
        #    return
        qs.delete()

    seen_obj_ids = set()
    had_error = False

    for fname in files:
        progress.tick()

        match = re_path.search(fname)

        try:
            existing_vote = Vote.objects.get(
                congress=match.group(1),
                chamber=chamber_mapping[match.group(2)],
                session=match.group(3),
                number=match.group(4),
            )
        except Vote.DoesNotExist:
            existing_vote = None

        # Skip files that have not changed, unless forced or unless the
        # stored vote was flagged as having missing data.
        if (
            not File.objects.is_changed(fname)
            and not options.force
            and existing_vote is not None
            and not existing_vote.missing_data
        ):
            seen_obj_ids.add(existing_vote.id)
            continue

        try:
            tree = etree.parse(fname)

            ## Look for votes with VP tie breakers.
            # if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue

            # Process roll object
            for roll_node in tree.xpath("/roll"):
                vote = vote_processor.process(Vote(), roll_node)
                if existing_vote:
                    # Reuse the primary key so save() updates the existing row.
                    vote.id = existing_vote.id
                vote.congress = int(match.group(1))
                vote.chamber = chamber_mapping[match.group(2)]
                vote.session = match.group(3)
                vote.number = int(match.group(4))

                # Get related bill & amendment.

                for bill_node in roll_node.xpath("bill"):
                    related_bill_num = bill_node.get("number")
                    if 9 <= vote.congress <= 42 and vote.session in ("1", "2"):
                        # Bill numbering from the American Memory collection is different. The number combines
                        # the session, bill number, and a 0 or 5 for regular or 'half' numbering. Prior to
                        # the 9th congress numbering seems to be wholly assigned by us and not related to
                        # actual numbering, so we skip matching those bills.
                        related_bill_num = "%d%04d%d" % (int(vote.session), int(bill_node.get("number")), 0)
                    try:
                        vote.related_bill = Bill.objects.get(
                            congress=bill_node.get("session"),
                            bill_type=BillType.by_xml_code(bill_node.get("type")),
                            number=related_bill_num,
                        )
                    except Bill.DoesNotExist:
                        if vote.congress >= 93:
                            vote.missing_data = True

                for amdt_node in roll_node.xpath("amendment"):
                    if amdt_node.get("ref") == "regular" and vote.related_bill is not None:
                        try:
                            # The first character of "number" selects the amendment
                            # type; the remainder is the amendment number.
                            vote.related_amendment = Amendment.objects.get(
                                congress=vote.related_bill.congress,
                                amendment_type=AmendmentType.by_slug(amdt_node.get("number")[0]),
                                number=amdt_node.get("number")[1:],
                            )
                        except Amendment.DoesNotExist:
                            if vote.congress >= 93:
                                print("Missing amendment %s" % (fname,))
                                vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        # vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote:
                            vote.related_amendment = existing_vote.related_amendment

                # clean up some question text and use the question_details field

                if (
                    vote.category in (VoteCategory.passage, VoteCategory.passage_suspension, VoteCategory.veto_override)
                    and vote.related_bill
                ):
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = truncatewords(vote.related_bill.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = truncatewords(vote.related_amendment.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                elif vote.related_bill and vote.question.startswith(
                    "On the Cloture Motion " + vote.related_bill.display_number
                ):
                    vote.question = "Cloture on " + truncatewords(vote.related_bill.title, 20)
                elif vote.related_bill and vote.question.startswith(
                    "On Cloture on the Motion to Proceed " + vote.related_bill.display_number
                ):
                    vote.question = "Cloture on " + truncatewords(vote.related_bill.title, 20)
                    vote.question_details = "On Cloture on the Motion to Proceed in the " + vote.get_chamber_display()
                elif vote.related_bill and vote.question.startswith(
                    "On the Motion to Proceed " + vote.related_bill.display_number
                ):
                    vote.question = "Motion to Proceed on " + truncatewords(vote.related_bill.title, 20)

                elif vote.related_amendment and vote.question.startswith(
                    "On the Cloture Motion "
                    + vote.related_amendment.get_amendment_type_display()
                    + " "
                    + str(vote.related_amendment.number)
                ):
                    vote.question = "Cloture on " + truncatewords(vote.related_amendment.title, 20)
                    vote.question_details = vote.vote_type + " in the " + vote.get_chamber_display()

                # weird House formatting of bill numbers ("H RES 123 Blah blah")
                if vote.related_bill:
                    vote.question = re.sub(
                        "(On [^:]+): "
                        + vote.related_bill.display_number.replace(". ", " ").replace(".", " ").upper()
                        + " .*",
                        r"\1: " + truncatewords(vote.related_bill.title, 15),
                        vote.question,
                    )

                vote.save()

                seen_obj_ids.add(vote.id)  # don't delete me later

                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                roll_options = {}
                for option_node in roll_node.xpath("./option"):
                    option = option_processor.process(VoteOption(), option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(vote=vote, key=option.key)[
                                0
                            ].id  # get is better, but I had the database corruption problem
                        except IndexError:
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(
                    VoteOption.objects.filter(vote=vote).exclude(id__in=seen_option_ids)
                )  # may cascade and delete the Voters too?

                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    # Maps person id -> existing Voter id for this vote.
                    existing_voters = dict(Voter.objects.filter(vote=vote).values_list("person", "id"))
                seen_voter_ids = set()
                voters = list()
                for voter_node in roll_node.xpath("./voter"):
                    voter = voter_processor.process(roll_options, Voter(), voter_node)
                    voter.vote = vote
                    voter.created = vote.created

                    # for VP votes, load the actual person...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(
                                role_type=RoleType.vicepresident, startdate__lte=vote.created, enddate__gte=vote.created
                            )
                            voter.person_role = r
                            voter.person = r.person
                        except Exception:
                            # overlapping roles? missing data?
                            # exc_info=True logs the exception being handled here;
                            # the previous code passed `ex`, which is only bound by
                            # the outer handler and so was unbound or stale.
                            log.error("Could not resolve vice president in %s" % fname, exc_info=True)

                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass

                    voters.append(voter)

                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()

                # pre-fetch the role of each voter
                load_roles_at_date([x.person for x in voters if x.person is not None], vote.created)
                for voter in voters:
                    # A voter may have no person (see the filter above); only
                    # dereference the person when there is one.
                    if voter.person is not None:
                        voter.person_role = voter.person.role
                    if voter.person_role is None:
                        log.error("%s: Could not find role for %s on %s." % (fname, voter.person, vote.created))
                        vote.missing_data = True
                        vote.save()

                # save all of the records (inserting/updating)
                for voter in voters:
                    voter.save()
                    seen_voter_ids.add(voter.id)

                # remove obsolete voter records
                log_delete_qs(
                    Voter.objects.filter(vote=vote).exclude(id__in=seen_voter_ids)
                )  # possibly already deleted by cascade above

                # pre-calculate totals
                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()

            File.objects.save_file(fname)

        except Exception as ex:
            # Keep going with the remaining files, but remember the failure.
            log.error("Error in processing %s" % fname, exc_info=ex)
            had_error = True
def main(options):
    """
    Process committees, subcommittees and
    members of current congress committees.

    Reads ``committees-current.yaml`` and
    ``committee-membership-current.yaml`` (plus ``legislators-current.yaml``
    for the THOMAS -> GovTrack id mapping) from
    ``settings.CONGRESS_LEGISLATORS_PATH``. Each file is only processed when
    File.objects.is_changed() reports a change or ``options.force`` is set.

    Committees that are not marked obsolete in the database but are missing
    from the committees file are marked obsolete. All CommitteeMember rows
    are deleted and re-created from the membership file.

    The function returns after the membership section; the committee
    schedule code that follows the ``return`` is currently unreachable.
    """

    BASE_PATH = settings.CONGRESS_LEGISLATORS_PATH

    meeting_processor = CommitteeMeetingProcessor()

    log.info('Processing committees')
    COMMITTEES_FILE = BASE_PATH + 'committees-current.yaml'

    if not File.objects.is_changed(COMMITTEES_FILE) and not options.force:
        log.info('File %s was not changed' % COMMITTEES_FILE)
    else:
        tree = yaml_load(COMMITTEES_FILE)
        total = len(tree)
        progress = Progress(total=total)
        seen_committees = set()
        for committee in tree:
            try:
                cobj = Committee.objects.get(code=committee["thomas_id"])
            except Committee.DoesNotExist:
                # Single pre-formatted argument: same output whether print is
                # a statement (Python 2) or a function (Python 3).
                print("New committee: %s" % (committee["thomas_id"],))
                cobj = Committee(code=committee["thomas_id"])

            cobj.committee_type = TYPE_MAPPING[committee["type"]]
            cobj.name = committee["name"]
            cobj.url = committee.get("url", None)
            cobj.obsolete = False
            cobj.committee = None  # top-level committee: no parent
            cobj.save()
            seen_committees.add(cobj.id)

            for subcom in committee.get('subcommittees', []):
                # Subcommittee code = parent thomas_id + subcommittee thomas_id.
                code = committee["thomas_id"] + subcom["thomas_id"]
                try:
                    sobj = Committee.objects.get(code=code)
                except Committee.DoesNotExist:
                    print("New subcommittee: %s" % (code,))
                    sobj = Committee(code=code)

                sobj.name = subcom["name"]
                sobj.url = subcom.get("url", None)
                # NOTE(review): the parent committee sets `committee_type`;
                # `type` here may be a stale attribute name -- confirm
                # against the Committee model before changing.
                sobj.type = None
                sobj.committee = cobj
                sobj.obsolete = False
                sobj.save()
                seen_committees.add(sobj.id)

            progress.tick()

        # Check for non-obsolete committees in the database that aren't in our
        # file.
        other_committees = Committee.objects.filter(obsolete=False).exclude(id__in=seen_committees)
        if len(other_committees) > 0:
            print("Marking obsolete: %s" % ", ".join(c.code for c in other_committees))
            other_committees.update(obsolete=True)

        File.objects.save_file(COMMITTEES_FILE)

    log.info('Processing committee members')
    MEMBERS_FILE = BASE_PATH + 'committee-membership-current.yaml'
    file_changed = File.objects.is_changed(MEMBERS_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % MEMBERS_FILE)
    else:
        # map THOMAS IDs to GovTrack IDs
        y = yaml_load(BASE_PATH + "legislators-current.yaml")
        person_id_map = {}
        for m in y:
            if "id" in m and "govtrack" in m["id"] and "thomas" in m["id"]:
                person_id_map[m["id"]["thomas"]] = m["id"]["govtrack"]

        # load committee members
        tree = yaml_load(MEMBERS_FILE)
        total = len(tree)
        progress = Progress(total=total, name='committees')

        # We can delete CommitteeMember objects because we don't have
        # any foreign keys to them.
        CommitteeMember.objects.all().delete()

        # Process committee nodes
        for committee, members in tree.items():
            if committee[0] == "H":
                continue  # House data is out of date

            try:
                cobj = Committee.objects.get(code=committee)
            except Committee.DoesNotExist:
                print("Committee not found: %s" % (committee,))
                continue

            # Process members of current committee node
            for member in members:
                mobj = CommitteeMember()
                mobj.person = Person.objects.get(id=person_id_map[member["thomas"]])
                mobj.committee = cobj
                if "title" in member:
                    mobj.role = ROLE_MAPPING[member["title"]]
                mobj.save()

            progress.tick()

        File.objects.save_file(MEMBERS_FILE)

    # NOTE: this unconditional return leaves the committee schedule
    # processing below unreachable. It is kept as found.
    return

    log.info('Processing committee schedule')
    SCHEDULE_FILE = 'data/us/112/committeeschedule.xml'
    file_changed = File.objects.is_changed(SCHEDULE_FILE)

    if not file_changed and not options.force:
        log.info('File %s was not changed' % SCHEDULE_FILE)
    else:
        tree = etree.parse(SCHEDULE_FILE)

        # We have to clear out all CommitteeMeeting objects when we refresh because
        # we have no unique identifier in the upstream data for a meeting. We might use
        # the meeting's committee & date as an identifier, but since meeting times can
        # change this might have awkward consequences for the end user if we even
        # attempted to track that.

        CommitteeMeeting.objects.all().delete()

        # Process committee event nodes
        for meeting in tree.xpath('/committee-schedule/meeting'):
            try:
                mobj = meeting_processor.process(CommitteeMeeting(), meeting)
                mobj.save()

                mobj.bills.clear()
                for bill in meeting.xpath('bill'):
                    bill = Bill.objects.get(congress=bill.get("session"), bill_type=BillType.by_xml_code(bill.get("type")), number=int(bill.get("number")))
                    mobj.bills.add(bill)
            except Committee.DoesNotExist:
                log.error('Could not load Committee object for meeting %s' % meeting_processor.display_node(meeting))

        for committee in Committee.objects.all():
            if not options.disable_events:
                committee.create_events()

        File.objects.save_file(SCHEDULE_FILE)
示例#25
0
def main(options):
    """
    Parse roll call vote XML files and load them into the database.

    For every selected roll file this creates or updates the Vote record,
    its VoteOption records and its Voter records, deletes option/voter rows
    that are no longer present in the file (at most three at a time, see
    ``log_delete_qs``), recalculates the vote totals and, unless
    ``options.disable_events`` is set, creates the vote's event.

    Attributes read from ``options``:
      filter          -- glob pattern of roll files to parse (takes precedence)
      congress        -- otherwise, only parse rolls of this congress
      force           -- parse a file even if File.objects.is_changed() is false
      disable_events  -- skip vote.create_event()

    Any exception raised while processing a single file is logged and
    recorded in ``had_error``; the loop then moves on to the next file.
    """

    # Setup XML processors
    vote_processor = VoteProcessor()
    option_processor = VoteOptionProcessor()
    voter_processor = VoterProcessor()
    voter_processor.PERSON_CACHE = dict(
        (x.pk, x) for x in Person.objects.all())

    # The pattern which the roll file matches
    # Filename contains info which should be placed to DB
    # along with info extracted from the XML file
    # Groups: (1) congress, (2) chamber letter, (3) session, (4) roll number.
    re_path = re.compile(r'data/us/(\d+)/rolls/([hs])(\w+)-(\d+)\.xml')

    chamber_mapping = {'s': CongressChamber.senate, 'h': CongressChamber.house}

    if options.filter:
        files = glob.glob(options.filter)
        log.info('Parsing rolls matching %s' % options.filter)
    elif options.congress:
        files = glob.glob('data/us/%s/rolls/*.xml' % options.congress)
        log.info('Parsing rolls of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/rolls/*.xml')
    log.info('Processing votes: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=10)

    def log_delete_qs(qs):
        """Print queryset ``qs`` and delete its rows, unless it has more than 3."""
        if qs.count() > 0:
            # Single pre-formatted arguments so the output is the same whether
            # print is a statement (Python 2) or a function (Python 3).
            try:
                print("Deleting:  %s" % (qs,))
            except Exception as e:
                print("Deleting [%s]..." % str(e))
            if qs.count() > 3:
                # Safety valve: refuse to delete a large number of rows.
                print("Delete skipped...")
                return
            qs.delete()

    seen_obj_ids = set()
    had_error = False

    for fname in files:
        progress.tick()

        match = re_path.search(fname)

        try:
            existing_vote = Vote.objects.get(
                congress=match.group(1),
                chamber=chamber_mapping[match.group(2)],
                session=match.group(3),
                number=match.group(4))
        except Vote.DoesNotExist:
            existing_vote = None

        # Skip files that have not changed, unless forced or unless the
        # stored vote was flagged as having missing data.
        if (not File.objects.is_changed(fname)
                and not options.force
                and existing_vote is not None
                and not existing_vote.missing_data):
            seen_obj_ids.add(existing_vote.id)
            continue

        try:
            tree = etree.parse(fname)

            ## Look for votes with VP tie breakers.
            #if len(tree.xpath("/roll/voter[@VP='1']")) == 0:
            #    had_error = True # prevent delete at the end
            #    continue

            # Process roll object
            for roll_node in tree.xpath('/roll'):
                vote = vote_processor.process(Vote(), roll_node)
                if existing_vote:
                    # Reuse the primary key so save() updates the existing row.
                    vote.id = existing_vote.id
                vote.congress = int(match.group(1))
                vote.chamber = chamber_mapping[match.group(2)]
                vote.session = match.group(3)
                vote.number = int(match.group(4))

                # Get related bill & amendment.

                for bill_node in roll_node.xpath("bill"):
                    try:
                        vote.related_bill = Bill.objects.get(
                            congress=bill_node.get("session"),
                            bill_type=BillType.by_xml_code(
                                bill_node.get("type")),
                            number=bill_node.get("number"))
                    except Bill.DoesNotExist:
                        vote.missing_data = True

                for amdt_node in roll_node.xpath("amendment"):
                    # Only look up the amendment when a bill was resolved;
                    # otherwise vote.related_bill.congress would raise
                    # AttributeError and abort the whole file.
                    if amdt_node.get("ref") == "regular" and vote.related_bill is not None:
                        try:
                            # The first character of "number" selects the amendment
                            # type; the remainder is the amendment number.
                            vote.related_amendment = Amendment.objects.get(
                                congress=vote.related_bill.congress,
                                amendment_type=AmendmentType.by_slug(
                                    amdt_node.get("number")[0]),
                                number=amdt_node.get("number")[1:])
                        except Amendment.DoesNotExist:
                            print("Missing amendment %s" % (fname,))
                            vote.missing_data = True
                    elif amdt_node.get("ref") == "bill-serial":
                        # It is impossible to associate House votes with amendments just from the House
                        # vote XML because the amendment-num might correspond either with the A___ number
                        # or with the "An amendment, numbered ____" number from the amendment purpose,
                        # and there's really no way to figure out which. Maybe we can use the amendment
                        # sponsor instead?
                        #vote.related_amendment = Amendment.objects.get(bill=vote.related_bill, sequence=amdt_node.get("number"))
                        # Instead, we set related_amendment from the amendment parser. Here, we have to
                        # preserve the related_amendment if it is set.
                        if existing_vote:
                            vote.related_amendment = existing_vote.related_amendment

                # clean up some question text and use the question_details field

                if vote.category in (
                        VoteCategory.passage, VoteCategory.passage_suspension,
                        VoteCategory.veto_override) and vote.related_bill:
                    # For passage votes, set the question to the bill title and put the question
                    # details in the details field.
                    vote.question = truncatewords(vote.related_bill.title, 20)
                    vote.question_details = (
                        vote.vote_type + " in the " + vote.get_chamber_display())

                elif vote.category == VoteCategory.amendment and vote.related_amendment:
                    # For votes on amendments, make a better title/explanation.
                    vote.question = truncatewords(vote.related_amendment.title,
                                                  20)
                    vote.question_details = (
                        vote.vote_type + " in the " + vote.get_chamber_display())

                elif vote.related_bill and vote.question.startswith(
                        "On the Cloture Motion " +
                        vote.related_bill.display_number):
                    vote.question = "Cloture on " + truncatewords(
                        vote.related_bill.title, 20)
                elif vote.related_bill and vote.question.startswith(
                        "On Cloture on the Motion to Proceed " +
                        vote.related_bill.display_number):
                    vote.question = "Cloture on " + truncatewords(
                        vote.related_bill.title, 20)
                    vote.question_details = (
                        "On Cloture on the Motion to Proceed in the "
                        + vote.get_chamber_display())
                elif vote.related_bill and vote.question.startswith(
                        "On the Motion to Proceed " +
                        vote.related_bill.display_number):
                    vote.question = "Motion to Proceed on " + truncatewords(
                        vote.related_bill.title, 20)

                elif vote.related_amendment and vote.question.startswith(
                        "On the Cloture Motion " +
                        vote.related_amendment.get_amendment_type_display() +
                        " " + str(vote.related_amendment.number)):
                    vote.question = "Cloture on " + truncatewords(
                        vote.related_amendment.title, 20)
                    vote.question_details = (
                        vote.vote_type + " in the " + vote.get_chamber_display())

                # weird House formatting of bill numbers ("H RES 123 Blah blah")
                if vote.related_bill:
                    vote.question = re.sub(
                        "(On [^:]+): " +
                        vote.related_bill.display_number.replace(
                            ". ", " ").replace(".", " ").upper() + " .*",
                        r"\1: " + truncatewords(vote.related_bill.title, 15),
                        vote.question)

                vote.save()

                seen_obj_ids.add(vote.id)  # don't delete me later

                # Process roll options, overwrite existing options where possible.
                seen_option_ids = set()
                roll_options = {}
                for option_node in roll_node.xpath('./option'):
                    option = option_processor.process(VoteOption(),
                                                      option_node)
                    option.vote = vote
                    if existing_vote:
                        try:
                            option.id = VoteOption.objects.filter(
                                vote=vote, key=option.key
                            )[0].id  # get is better, but I had the database corruption problem
                        except IndexError:
                            pass
                    option.save()
                    roll_options[option.key] = option
                    seen_option_ids.add(option.id)
                log_delete_qs(
                    VoteOption.objects.filter(vote=vote).exclude(
                        id__in=seen_option_ids)
                )  # may cascade and delete the Voters too?

                # Process roll voters, overwriting existing voters where possible.
                if existing_vote:
                    # Maps person id -> existing Voter id for this vote.
                    existing_voters = dict(
                        Voter.objects.filter(vote=vote).values_list(
                            "person", "id"))
                seen_voter_ids = set()
                for voter_node in roll_node.xpath('./voter'):
                    voter = voter_processor.process(roll_options, Voter(),
                                                    voter_node)
                    voter.vote = vote
                    voter.created = vote.created

                    # for VP votes, load the actual person...
                    if voter.voter_type == VoterType.vice_president:
                        try:
                            r = PersonRole.objects.get(
                                role_type=RoleType.vicepresident,
                                startdate__lte=vote.created,
                                enddate__gte=vote.created)
                            voter.person = r.person
                        except Exception:
                            # overlapping roles? missing data?
                            # exc_info=True logs the exception being handled here;
                            # the previous code passed `ex`, which is only bound by
                            # the outer handler and so was unbound or stale.
                            log.error(
                                'Could not resolve vice president in %s' %
                                fname,
                                exc_info=True)

                    if existing_vote and voter.person:
                        try:
                            voter.id = existing_voters[voter.person.id]
                        except KeyError:
                            pass

                    voter.save()

                    if voter.voter_type == VoterType.unknown and not vote.missing_data:
                        vote.missing_data = True
                        vote.save()

                    seen_voter_ids.add(voter.id)

                log_delete_qs(
                    Voter.objects.filter(vote=vote).exclude(
                        id__in=seen_voter_ids)
                )  # possibly already deleted by cascade above

                vote.calculate_totals()

                if not options.disable_events:
                    vote.create_event()

            File.objects.save_file(fname)

        except Exception as ex:
            # Keep going with the remaining files, but remember the failure.
            log.error('Error in processing %s' % fname, exc_info=ex)
            had_error = True