示例#1
0
    def scrape_senators(self, year):
        """Scrape senators and their committee memberships.

        Reads the member index on www.senate.state.tx.us, builds one
        ``Legislator`` per "senator identification" table, follows the
        senator's link to a detail page to collect committee roles, and saves
        the legislator.

        :param year: not used by this method; the session is hard-coded
            to ``'81'``.
        """
        session = '81'
        base_url = 'http://www.senate.state.tx.us/75r/senate/'
        senator_url = base_url + 'senmem.htm'
        with self.urlopen_context(senator_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            for el in root.xpath('//table[@summary="senator identification"]'):
                sen_link = el.xpath('tr/td[@headers="senator"]/a')[0]
                full_name = sen_link.text
                district = el.xpath('string(tr/td[@headers="district"])')
                party = el.xpath('string(tr/td[@headers="party"])')

                # The name prefix is not needed; keep first/last/suffixes.
                first, last, suffixes = name_tools.split(full_name)[1:]

                leg = Legislator(session, 'upper', district, full_name,
                                 first, last, '', party,
                                 suffix=suffixes)
                leg.add_source(senator_url)

                details_url = base_url + sen_link.attrib['href']
                with self.urlopen_context(details_url) as details_page:
                    details = lxml.etree.fromstring(details_page,
                                                    lxml.etree.HTMLParser())

                    heading = details.xpath(
                        "//h2[contains(text(), 'Committee Membership')]")[0]
                    # The committee list is the element right after the
                    # heading.
                    comm_list = heading.getnext()
                    for comm in comm_list.xpath('li/a'):
                        # lxml reports missing text/tail as None, so guard
                        # both before concatenating.
                        comm_name = ((comm.text or '') +
                                     (comm.tail or '')).strip()
                        if not comm_name:
                            # Nothing usable in this link; do not record an
                            # empty committee name.
                            continue

                        leg.add_role('committee member', session,
                                     committee=comm_name)

                self.save_legislator(leg)
示例#2
0
    def _scrape_lower_special_committees(self):
        """Scrape the special committees listed on the House site.

        Each ``<h3>`` inside the accordion names a committee. Member rows
        (``tr.linkStyle2``) are read from the ``div.pane`` that follows the
        heading. A committee whose normalized name starts with ``Joint`` is
        saved with chamber ``'joint'``, otherwise ``'lower'``.
        """
        url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
        page = self.lxmlize(url)

        committee_list = page.xpath('//table[@id="table106"]//div[@class='
            '"exBody1A"]/div[@class="accordion"]')[0]

        for header in committee_list.xpath('./h3'):
            committee_name = self._normalize_committee_name(
                header.xpath('string()').strip())

            chamber = 'joint' if committee_name.startswith('Joint') else 'lower'

            committee = Committee(chamber, committee_name)
            committee.add_source(url)

            # "[1]" limits the match to the nearest following pane. Without
            # it the following-sibling axis matches every later pane, so a
            # committee would also pick up the members of all committees
            # listed after it.
            member_rows = header.xpath('./following-sibling::div['
                '@class="pane"][1]//tr[@class="linkStyle2"]')

            for row in member_rows:
                raw_name = row.xpath('normalize-space(string(./td[1]))')
                # Re-join the non-empty name parts with single spaces.
                member_name = ' '.join(filter(None, name_tools.split(raw_name)))
                member_role = self._normalize_member_role(
                    row.xpath('normalize-space(string(./td[2]))'))

                committee.add_member(member_name, member_role)

            self.save_committee(committee)
示例#3
0
    def scrape_reps(self, year):
        """Scrape House members and their committee assignments.

        Reads the member table on www.house.state.tx.us, skips rows whose
        name starts with ``District`` (empty seats), and builds a
        ``Legislator`` for each remaining row. Each member link leads to a
        page that points at the real detail page through a meta-refresh tag;
        that target is fetched to collect committee roles.

        :param year: not used by this method; the session is hard-coded
            to ``'81'``.
        """
        session = '81'
        members_base = 'http://www.house.state.tx.us/members/'
        rep_url = members_base + 'welcome.php'
        with self.urlopen_context(rep_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            # The first row of the table is skipped.
            for el in root.xpath('//form[@name="frmMembers"]/table/tr')[1:]:
                full_name = el.xpath('string(td/a/font/span)')
                district = el.xpath('string(td[2]/span)')

                if full_name.startswith('District'):
                    # Ignore empty seats
                    continue

                first, last, suffixes = name_tools.split(full_name)[1:]
                party = ''

                leg = Legislator(session, 'lower', district,
                                 full_name, first, last,
                                 '', party, suffix=suffixes)
                leg.add_source(rep_url)

                # The member link does not serve the details directly; it
                # redirects via <meta http-equiv="refresh">.
                redirect_url = members_base + el.xpath('td/a')[0].attrib['href']
                with self.urlopen_context(redirect_url) as redirect_page:
                    redirect = lxml.etree.fromstring(redirect_page,
                                                     lxml.etree.HTMLParser())

                    try:
                        refresh = redirect.xpath(
                            "//meta[@http-equiv='refresh']"
                            )[0].attrib['content']
                        filename = refresh.split('0;URL=')[1]
                    except (IndexError, KeyError):
                        # No usable meta refresh (missing tag, missing
                        # "content", or no "0;URL=" part).
                        # The Speaker's member page does not redirect.
                        # The Speaker is not on any committees
                        # so we can just continue with the next member.
                        self.save_legislator(leg)
                        continue

                    details_url = redirect_url.replace('welcome.htm',
                                                       filename)

                with self.urlopen_context(details_url) as details_page:
                    details = lxml.etree.fromstring(details_page,
                                                    lxml.etree.HTMLParser())

                    comms = details.xpath(
                        "//b[contains(text(), 'Committee Assignments')]/"
                        "..//a")
                    for comm in comms:
                        # lxml reports a link without direct text as None.
                        comm_name = (comm.text or '').strip()
                        if comm_name:
                            leg.add_role('committee member', session,
                                         committee=comm_name)

                self.save_legislator(leg)
示例#4
0
    def scrape_legislators(self, chamber, year):
        """Scrape the legislator list for one chamber from legis.wi.gov.

        :param chamber: ``"upper"`` selects the senate list; any other value
            selects the assembly list.
        :param year: year (int or numeric string) used to look up the session
            in ``self.internal_sessions``.
        :raises NoDataForYear: for even years (see the review note below).
        """
        year = int(year)
        session = self.internal_sessions[year][0][1]
        # iterating through subsessions would be a better way to do this..
        if year % 2 == 0:
            this_year = dt.date.today().year
            # NOTE(review): this test can never be false (year cannot equal
            # both this_year and this_year - 1), so every even year raises.
            # Possibly "and" was intended - confirm before changing.
            if year != this_year or year + 1 != this_year:
                raise NoDataForYear(year)

        list_urls = {
            "upper": "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=senate",
        }
        url = list_urls.get(
            chamber,
            "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=assembly")

        page = lxml.html.fromstring(unicode(self.urlopen(url), "latin-1"))

        for row in page.cssselect("#ctl00_C_dgLegData tr"):
            # Only rows that contain a link describe a legislator.
            if not row.cssselect("td a"):
                continue

            cells = list(row)
            name_cell = cells[0]
            rep_url = name_cell.cssselect("a[href]")[0].get("href")
            # The first cell reads "<name> (<one-character party code>)".
            name_and_party = re.findall(r"([\w\-\,\s\.]+)\s+\(([\w])\)",
                                        name_cell.text_content())
            full_name, party = name_and_party[0]

            pre, first, last, suffixes = name_tools.split(full_name)

            # Round-trip through int to normalize the district text.
            district = str(int(cells[2].text_content()))

            leg = Legislator(session, chamber, district, full_name,
                             first, last, "", party, suffix=suffixes)
            leg.add_source(rep_url)

            leg = self.add_committees(leg, rep_url, session)
            self.save_legislator(leg)
示例#5
0
    def scrape_upper_offices(self, legislator):
        """Add office details from a senator's oksenate.gov biography page.

        The biography URL is guessed from the last-name part of the
        legislator's full name; on an HTTP error the first-name part is tried
        instead. The page's "Office" table supplies the e-mail address, the
        capitol office and, when present, a district office.

        :param legislator: legislator object; gains a source, an ``email``
            entry and one or two offices.
        """
        url_template = ('http://www.oksenate.gov/Senators/'
                        'biographies/%s_bio.html')
        name_parts = name_tools.split(legislator['full_name'])

        guessed_url = url_template % name_parts[2].replace(' ', '_')
        try:
            html = self.urlopen(guessed_url)
        except scrapelib.HTTPError:
            # The name was backwards; retry with first name (i.e., last name)
            fallback = name_parts[1].replace(' ', '_').strip(',')
            guessed_url = url_template % fallback
            html = self.urlopen(guessed_url)

        legislator.add_source(guessed_url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(guessed_url)

        # The office table is the first sibling after the "Office" heading.
        heading = doc.xpath('//h3[contains(., "Office")]')[0]
        table = next(heading.itersiblings())
        col1, col2 = table.xpath('tr[2]/td')

        # Add the capitol office.
        capitol_lines = [scrub(text) for text in col1.itertext()]
        while True:
            # Throw away anything after the email address: drop trailing
            # lines until one looks like an e-mail or a phone number.
            tail = capitol_lines[-1]
            if '@' in tail or re.search(r'[\d\-\(\) ]{7,}', tail):
                break
            print(capitol_lines.pop())

        # Set email on the leg object.
        legislator['email'] = capitol_lines.pop()

        # Next line is the phone number.
        phone = capitol_lines.pop()
        legislator.add_office(
            name='Capitol Office',
            type='capitol',
            address='\n'.join(capitol_lines),
            fax=None, email=None, phone=phone)

        district_lines = [scrub(text) for text in col2.itertext()]
        if len(district_lines) < 2:
            return

        # NOTE(review): the district office is stored with the capitol
        # phone number - confirm this is intended.
        legislator.add_office(
            name='District Office',
            type='district',
            address='\n'.join(district_lines),
            fax=None, email=None, phone=phone)
def breakout_names(apps, schema_editor):
    """Fill ``first_name`` and ``last_name`` for every faceitweb user.

    Each user's ``full_name`` is split with ``name_tools``; the prefix and
    suffix parts are discarded and the user is saved.
    """
    user_model = apps.get_model("faceitweb", "User")
    for account in user_model.objects.all():
        (_prefix, account.first_name,
         account.last_name, _suffix) = name_tools.split(account.full_name)
        account.save()
示例#7
0
def split_name(obj):
    """
    Ensure a person/legislator object has name parts.

    When 'first_name' or 'last_name' is missing or empty, 'first_name',
    'last_name' and 'suffixes' are all (re)derived from 'full_name' with
    name_tools. Objects of any other '_type' are returned untouched.
    The object is modified in place and also returned.
    """
    if obj['_type'] not in ('person', 'legislator'):
        return obj

    needs_split = any(key not in obj or not obj[key]
                      for key in ('first_name', 'last_name'))
    if needs_split:
        # Drop the prefix; keep (first, last, suffixes).
        first, last, suffixes = name_tools.split(obj['full_name'])[1:]
        obj['first_name'] = first
        obj['last_name'] = last
        obj['suffixes'] = suffixes

    return obj
示例#8
0
文件: forms.py 项目: 4bic/open_county
def name_forms(name):
    """
    Return the set of lower-cased spellings under which ``name`` may appear.

    The name is split with name_tools into prefix, first, last and suffix,
    then combined in several orders (with and without prefix/suffix,
    "last, first", initials). Every form is lower-cased, stripped of periods
    and has its whitespace collapsed.

    >>> forms = name_forms("Michael Stephens")
    >>> 'michael stephens' in forms
    True
    >>> 'stephens, michael' in forms
    True
    >>> 'm stephens' in forms
    True
    >>> 'stephens' in forms
    True
    """
    parts = {}
    (parts['pre'], parts['first'],
     parts['last'], parts['post']) = name_tools.split(name)

    # Derived pieces are stored as mapping keys so that every form is
    # %-formatted exactly once; a literal "%" in a name can then never be
    # mistaken for a format directive.
    parts['pre_first'] = ("%(pre)s %(first)s" % parts).strip(', \t\r\n')
    # Slicing instead of indexing: an empty first name gives '' rather than
    # raising IndexError.
    parts['initial'] = parts['first'][:1]
    parts['initials'] = ' '.join(w[0] for w in parts['first'].split())

    templates = (
        "%(first)s %(last)s",
        "%(last)s",
        "%(pre)s %(first)s %(last)s",
        "%(first)s %(last)s %(post)s",
        "%(pre)s %(first)s %(last)s %(post)s",
        "%(last)s, %(first)s",
        "%(last)s, %(pre_first)s",
        # "M Stephens"; the "M. Stephens" variant is identical once periods
        # are removed, so it is not listed separately.
        "%(initial)s %(last)s",
        "%(initials)s %(last)s",
        "%(last)s, %(initials)s",
    )

    forms = set()
    for template in templates:
        form = (template % parts).strip(', \t\r\n').lower()
        form = form.replace('.', '')

        # Collapse all whitespace segments into single space characters
        forms.add(' '.join(form.split()))

    return forms
    def handle(self, *args, **options):
        """Load legislators for one congress from the bioguide search form.

        Posts the search form on bioguide.congress.gov for the requested
        congress, parses the result table and creates a ``Legislator`` row
        for every six-cell table row that parses cleanly.

        :param args: optional; ``args[0]`` is the congress number
            (defaults to 112).
        """
        congress = args[0] if args else 112

        post_body = 'lastname=&firstname=&position=&state=&party=&congress=%s' % str(congress)
        url = 'http://bioguide.congress.gov/biosearch/biosearch1.asp'
        req = urllib2.Request(url, post_body)
        response = urllib2.urlopen(req).read()

        soup = BeautifulSoup(response)

        # name / bioguide_id intentionally persist across rows: a row whose
        # first cell has no link reuses the values of the previous row
        # (presumably a further entry for the same person - verify against
        # the page).
        name = None
        bioguide_id = None

        for row in soup.findAll('tr')[2:]:
            cells = row.findAll('td')
            if len(cells) != 6:
                continue

            try:
                link = cells[0].find('a')
                if link is not None:
                    name = link.renderContents()
                    bioguide_id = link['href'].split('=')[-1]

                birth_death, position, party, state, row_congress = [
                    x.renderContents() for x in cells[1:]]
                # Keep only the part before the first line break.
                row_congress = row_congress.split('<br />')[0]

                data = {'bioguide_id': bioguide_id,
                        'birth_death': birth_death,
                        'position': position,
                        'party': party,
                        'state': state,
                        'congress': row_congress, }

                data['prefix'], data['first'], data['last'], data['suffix'] = name_tools.split(name)
                print(data)
            except Exception as e:
                print("%s %s" % (Exception, e))
                # Skip the row: falling through would call get_or_create
                # with data left over from an earlier row.
                continue

            try:
                Legislator.objects.get_or_create(**data)
            except IntegrityError:
                continue
示例#10
0
def import_committees(state, data_dir):
    """
    Add committee-member roles to legislators from scraped committee files.

    Reads every ``<data_dir>/<state>/committees/*.json`` file. Each listed
    member is matched to a legislator by first and last name within the
    state's latest term; when exactly one legislator matches and does not
    already hold the role, the role is appended and the legislator saved.
    """
    committee_glob = os.path.join(data_dir, state, 'committees', '*.json')

    for path in glob.iglob(committee_glob):
        with open(path) as f:
            data = prepare_obj(json.load(f))

        meta = db.metadata.find_one({'_id': state})
        current_term = meta['terms'][-1]['name']

        for member in data['members']:
            member_name = member['legislator']
            if not member_name:
                continue

            pre, first, last, suff = name_tools.split(member_name)

            in_current_term = {'$elemMatch': {'term': current_term,
                                              'state': state}}
            found = db.legislators.find({'first_name': first,
                                         'last_name': last,
                                         'roles': in_current_term})

            match_count = found.count()
            if match_count == 0:
                print("No matches for %s" % member_name)
                continue
            if match_count > 1:
                print("Too many matches for %s" % member_name)
                continue

            legislator = found[0]

            already_member = any(
                role['type'] == 'committee member' and
                role['term'] == current_term and
                role['committee'] == data['name']
                for role in legislator['roles'])
            if already_member:
                continue

            legislator['roles'].append({
                'type': 'committee member',
                'committee': data['name'],
                'term': current_term,
                'chamber': data['chamber']})
            legislator['updated_at'] = datetime.datetime.now()
            db.legislators.save(legislator)
示例#11
0
    def _scrape_lower_standing_committee(self, committee_name, url):
        """Scrape one lower-chamber standing committee page and save it.

        :param committee_name: name given to the saved committee.
        :param url: committee page; also recorded as the source.
        """
        committee = Committee('lower', committee_name)
        committee.add_source(url)

        page = self.lxmlize(url)
        member_rows = page.xpath('//table[@id="body_ListView1_itemPlaceholder'
            'Container"]/tr[@class="linkStyle2"]')

        for member_row in member_rows:
            raw_name = member_row.xpath('normalize-space(string(./td[1]/a))')
            raw_role = member_row.xpath('normalize-space(string(./td[2]))')

            # Re-join the non-empty name parts with single spaces.
            name_parts = filter(None, name_tools.split(raw_name))
            committee.add_member(' '.join(name_parts),
                                 self._normalize_member_role(raw_role))

        self.save_committee(committee)
示例#12
0
    def getVirtualSets(element, source):
        """Return one cleaned last name per non-blank dc:creator of a record.

        :param element: XML element evaluated with the oai_dc / dc
            namespaces.
        :param source: not used by this function.
        :returns: list of lower-cased last names, cleaned with the
            ``OAIDCLastnameExtractor`` regular expressions.
        """
        evaluator = etree.XPathEvaluator(element, namespaces={
            'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
        })

        last_names = []
        for creator in evaluator.evaluate('oai_dc:dc/dc:creator/text()'):
            if creator.strip() == "":
                continue

            # Parse the creator text as HTML, take the element text, then
            # pass it through unidecode.
            full_name = unidecode(unicode(html.fromstring(creator).text))
            pre, first, last, post = name_tools.split(full_name)

            cleaned = last.lower().strip()
            cleaned = OAIDCLastnameExtractor.escaping_chars_re.sub('', cleaned)
            cleaned = OAIDCLastnameExtractor.final_nontext_re.sub('', cleaned)
            cleaned = OAIDCLastnameExtractor.nontext_re.sub('-', cleaned)
            last_names.append(cleaned)

        return last_names
示例#13
0
def prepare_obj(obj):
    """
    Convert timestamps in the scraper output to datetimes so that they
    will be saved as Mongo datetimes, and standardize some other fields.

    The object is modified in place and also returned. Converted fields:
    ``retrieved`` on each source, ``date`` on each action and vote, truthy
    ``start_date`` / ``end_date`` on each role, and a top-level ``date``.
    Each role is also stamped with the object's ``state``.
    """
    for source in obj.get('sources', []):
        source['retrieved'] = timestamp_to_dt(source['retrieved'])

    for action in obj.get('actions', []):
        action['date'] = timestamp_to_dt(action['date'])

    for role in obj.get('roles', []):
        # Dates that are unset (falsy) are left as they are.
        for field in ('start_date', 'end_date'):
            if role[field]:
                role[field] = timestamp_to_dt(role[field])

        role['state'] = obj['state']

    for vote in obj.get('votes', []):
        vote['date'] = timestamp_to_dt(vote['date'])

    if 'date' in obj:
        obj['date'] = timestamp_to_dt(obj['date'])

    # If we are handling a legislator and the scraped data
    # includes both 'first_name' and 'last_name' fields, then use them.
    # If one or both of these fields is missing, then run the name_tools
    # splitting code to generate them.
    if obj['_type'] in ('person', 'legislator'):
        if not (obj.get('first_name') and obj.get('last_name')):
            # Drop the prefix; keep (first, last, suffixes).
            (obj['first_name'], obj['last_name'],
             obj['suffixes']) = name_tools.split(obj['full_name'])[1:]

    return obj
示例#14
0
def parse_comma_name(name):
    """
    Split a name into a ``(first name, last name)`` pair.

    Names containing a comma ("Last name, First name") are handed to
    name_tools. Otherwise the words are classified as initials and/or fully
    capitalized and the split point is chosen from those hints, falling back
    on name_tools when no hint applies. Returns ``('', '')`` when the name
    has no words.
    """
    if ',' in name:
        # In this case name_tools does it well
        prefix, first_name, last_name, suffix = name_tools.split(name)
    else:
        words, separators = split_name_words(name)
        if not words:
            return ('', '')

        # Per-word flags: does the word contain initials / is it all caps?
        initial = [contains_initials(word) for word in words]
        capitalized = [is_fully_capitalized(word) for word in words]
        mixed_case = not all(capitalized)

        used_name_tools = False

        if not initial[0] and capitalized[0] and mixed_case:
            # CASE 1: the first word is capitalized but not all of them are
            # we assume that it is the first word of the last name
            last, first = predsplit_forward(
                lambda i: capitalized[i] and not initial[i], words)
        elif not initial[-1] and capitalized[-1] and mixed_case:
            # CASE 2: the last word is capitalized but not all of them are
            # we assume that it is the last word of the last name
            first, last = predsplit_forward(
                lambda i: (not capitalized[i]) or initial[i], words)
        elif initial[0]:
            # CASE 3: the first word is an initial
            first, last = predsplit_forward(lambda i: initial[i], words)
        elif initial[-1]:
            # CASE 4: the last word is an initial
            # this is trickier, we know that the last name comes first
            # but we don't really know where it stops.
            # For simplicity we assume that all the words in the first
            # name are initials
            last, first = predsplit_backwards(lambda i: initial[i], words)
        elif True in initial:
            # CASE 5: there are initials in the name, but neither
            # at the beginning nor at the end: cut after the last initial
            cut = max(idx for idx, flag in enumerate(initial) if flag) + 1
            first = words[:cut]
            last = words[cut:]
        else:
            # CASE 6: we have no clue
            # We fall back on name_tools, where wise things are done
            # to parse correctly names such as "Colin de la Higuera"
            prefix, first_name, last_name, suffix = name_tools.split(name)
            used_name_tools = True

        if not used_name_tools:
            first_name = ' '.join(first)
            last_name = ' '.join(last)

    first_name, last_name = first_name.strip(), last_name.strip()
    first_name, last_name = (normalize_name_words(first_name),
                             normalize_name_words(last_name))

    if not last_name:
        # Only one part was found: report it as the last name.
        return (last_name, first_name)

    return (first_name, last_name)
示例#15
0
def import_committees(state, data_dir):
    """
    Import committee data for ``state`` and link it to legislator records.

    With no ``<data_dir>/<state>/committees/*.json`` files, committees are
    derived from the committee-member roles already stored on legislators of
    the latest term. Otherwise each file is inserted or merged into
    ``db.committees`` and every named member is resolved to a legislator,
    who gains a matching role if missing. Finishes by calling
    ``link_parents`` and ``ensure_indexes``.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, "committees", "*.json")

    meta = db.metadata.find_one({"_id": state})
    current_term = meta["terms"][-1]["name"]
    current_session = meta["terms"][-1]["sessions"][-1]

    paths = glob.glob(pattern)

    if not paths:
        # Not standalone committees
        in_current_term = {"roles": {"$elemMatch": {"term": current_term, "state": state}}}
        for legislator in db.legislators.find(in_current_term):

            for role in legislator["roles"]:
                # Only committee roles not yet tied to a committee document.
                if role["type"] != "committee member" or "committee_id" in role:
                    continue

                spec = {"state": role["state"], "chamber": role["chamber"], "committee": role["committee"]}
                if "subcommittee" in role:
                    spec["subcommittee"] = role["subcommittee"]

                committee = db.committees.find_one(spec)

                if not committee:
                    # No such committee yet: create it from the role.
                    committee = spec
                    committee["_type"] = "committee"
                    committee["members"] = []
                    committee["sources"] = []
                    insert_with_id(committee)

                for member in committee["members"]:
                    if member["leg_id"] == legislator["leg_id"]:
                        break
                else:
                    committee["members"].append(
                        {"name": legislator["full_name"], "leg_id": legislator["leg_id"], "role": "member"}
                    )
                    db.committees.save(committee, safe=True)

                    role["committee_id"] = committee["_id"]

            db.legislators.save(legislator, safe=True)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        spec = {"state": state, "chamber": data["chamber"], "committee": data["committee"]}
        if "subcommittee" in data:
            spec["subcommittee"] = data["subcommittee"]

        committee = db.committees.find_one(spec)

        if not committee:
            insert_with_id(data)
            committee = data
        else:
            update(committee, data, db.committees)

        for member in committee["members"]:
            if not member["name"]:
                continue

            leg_id = get_legislator_id(state, current_session, data["chamber"], member["name"])

            if not leg_id:
                print("No matches for %s" % member["name"].encode("ascii", "ignore"))
                member["leg_id"] = None
                continue

            legislator = db.legislators.find_one({"_id": leg_id})

            member["leg_id"] = leg_id

            for role in legislator["roles"]:
                # .get(): a committee role may not carry a committee_id
                # (the branch above handles exactly such roles), and
                # indexing it would raise KeyError.
                if (
                    role["type"] == "committee member"
                    and role["term"] == current_term
                    and role.get("committee_id") == committee["_id"]
                ):
                    break
            else:
                new_role = {
                    "type": "committee member",
                    "committee": committee["committee"],
                    "term": current_term,
                    "chamber": committee["chamber"],
                    "committee_id": committee["_id"],
                    "state": state,
                }
                if "subcommittee" in committee:
                    new_role["subcommittee"] = committee["subcommittee"]
                legislator["roles"].append(new_role)
                legislator["updated_at"] = datetime.datetime.utcnow()
                db.legislators.save(legislator, safe=True)

        db.committees.save(committee, safe=True)

    print("imported %s committee files" % len(paths))

    link_parents(state)

    ensure_indexes()
示例#16
0
def import_committees(state, data_dir):
    """
    Import committee data for ``state`` and link it to legislator records.

    The member lists of all the state's stored committees are cleared
    first. With no ``<data_dir>/<state>/committees/*.json`` files,
    committees are then derived from the committee-member roles already
    stored on legislators of the latest term. Otherwise each file is
    inserted or merged into ``db.committees`` and every named member is
    resolved to a legislator, who gains a matching role if missing.
    Finishes by calling ``link_parents`` and ``ensure_indexes``.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'committees', '*.json')

    meta = db.metadata.find_one({'_id': state})
    current_term = meta['terms'][-1]['name']
    current_session = meta['terms'][-1]['sessions'][-1]

    paths = glob.glob(pattern)

    # Start from empty member lists; they are rebuilt below.
    for committee in db.committees.find({'state': state}):
        committee['members'] = []
        db.committees.save(committee)

    if not paths:
        # Not standalone committees
        in_current_term = {'roles': {'$elemMatch': {'term': current_term,
                                                    'state': state}}}
        for legislator in db.legislators.find(in_current_term):

            for role in legislator['roles']:
                # Only committee roles not yet tied to a committee document.
                if (role['type'] != 'committee member' or
                        'committee_id' in role):
                    continue

                spec = {'state': role['state'],
                        'chamber': role['chamber'],
                        'committee': role['committee']}
                if 'subcommittee' in role:
                    spec['subcommittee'] = role['subcommittee']

                committee = db.committees.find_one(spec)

                if not committee:
                    # No such committee yet: create it from the role.
                    committee = spec
                    committee['_type'] = 'committee'
                    committee['members'] = []
                    committee['sources'] = []
                    if 'subcommittee' not in committee:
                        committee['subcommittee'] = None
                    insert_with_id(committee)

                for member in committee['members']:
                    if member['leg_id'] == legislator['leg_id']:
                        break
                else:
                    committee['members'].append(
                        {'name': legislator['full_name'],
                         'leg_id': legislator['leg_id'],
                         'role': role.get('position') or 'member'})
                    db.committees.save(committee, safe=True)

                    role['committee_id'] = committee['_id']

            db.legislators.save(legislator, safe=True)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        spec = {'state': state,
                'chamber': data['chamber'],
                'committee': data['committee']}
        if 'subcommittee' in data:
            spec['subcommittee'] = data['subcommittee']

        committee = db.committees.find_one(spec)

        if not committee:
            insert_with_id(data)
            committee = data
        else:
            update(committee, data, db.committees)

        for member in committee['members']:
            if not member['name']:
                continue

            leg_id = get_legislator_id(state, current_session,
                                       data['chamber'],
                                       member['name'])

            if not leg_id:
                print("No matches for %s" % member['name'].encode(
                    'ascii', 'ignore'))
                member['leg_id'] = None
                continue

            legislator = db.legislators.find_one({'_id': leg_id})

            member['leg_id'] = leg_id

            for role in legislator['roles']:
                # .get(): a committee role may not carry a committee_id
                # (the branch above handles exactly such roles), and
                # indexing it would raise KeyError.
                if (role['type'] == 'committee member' and
                        role['term'] == current_term and
                        role.get('committee_id') == committee['_id']):
                    break
            else:
                new_role = {'type': 'committee member',
                            'committee': committee['committee'],
                            'term': current_term,
                            'chamber': committee['chamber'],
                            'committee_id': committee['_id'],
                            'state': state}
                if 'subcommittee' in committee:
                    new_role['subcommittee'] = committee['subcommittee']
                legislator['roles'].append(new_role)
                legislator['updated_at'] = datetime.datetime.utcnow()
                db.legislators.save(legislator, safe=True)

        db.committees.save(committee, safe=True)

    print('imported %s committee files' % len(paths))

    link_parents(state)

    ensure_indexes()
示例#17
0
 def last_first(self):
     """Return the name formatted as "last suffix, first"."""
     prefix, first, last, suffix = name_tools.split(self.__unicode__())
     combined = '%s %s, %s' % (last, suffix, first)
     # Drop whitespace that sits directly before a non-word character,
     # e.g. the gap left in front of the comma when the suffix is empty.
     return re.sub(r'\s+([^\w])', r'\1', combined)
示例#18
0
def import_committees(state, data_dir):
    """
    Import committee data for ``state`` and link it to legislator records.

    The member lists of all the state's stored committees are cleared
    first. With no ``<data_dir>/<state>/committees/*.json`` files,
    committees are then derived from the committee-member roles already
    stored on legislators of the latest term. Otherwise each file is
    inserted or merged into ``db.committees`` and every named member is
    resolved to a legislator, who gains a matching role if missing.
    Finishes by calling ``link_parents`` and ``ensure_indexes``.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'committees', '*.json')

    meta = db.metadata.find_one({'_id': state})
    current_term = meta['terms'][-1]['name']
    current_session = meta['terms'][-1]['sessions'][-1]

    paths = glob.glob(pattern)

    # Start from empty member lists; they are rebuilt below.
    for committee in db.committees.find({'state': state}):
        committee['members'] = []
        db.committees.save(committee)

    if not paths:
        # Not standalone committees
        in_current_term = {'roles': {'$elemMatch': {'term': current_term,
                                                    'state': state}}}
        for legislator in db.legislators.find(in_current_term):

            for role in legislator['roles']:
                # Only committee roles not yet tied to a committee document.
                if (role['type'] != 'committee member' or
                        'committee_id' in role):
                    continue

                spec = {'state': role['state'],
                        'chamber': role['chamber'],
                        'committee': role['committee']}
                if 'subcommittee' in role:
                    spec['subcommittee'] = role['subcommittee']

                committee = db.committees.find_one(spec)

                if not committee:
                    # No such committee yet: create it from the role.
                    committee = spec
                    committee['_type'] = 'committee'
                    committee['members'] = []
                    committee['sources'] = []
                    if 'subcommittee' not in committee:
                        committee['subcommittee'] = None
                    insert_with_id(committee)

                for member in committee['members']:
                    if member['leg_id'] == legislator['leg_id']:
                        break
                else:
                    committee['members'].append(
                        {'name': legislator['full_name'],
                         'leg_id': legislator['leg_id'],
                         'role': role.get('position') or 'member'})
                    db.committees.save(committee, safe=True)

                    role['committee_id'] = committee['_id']

            db.legislators.save(legislator, safe=True)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        spec = {'state': state,
                'chamber': data['chamber'],
                'committee': data['committee']}
        if 'subcommittee' in data:
            spec['subcommittee'] = data['subcommittee']

        committee = db.committees.find_one(spec)

        if not committee:
            insert_with_id(data)
            committee = data
        else:
            update(committee, data, db.committees)

        for member in committee['members']:
            if not member['name']:
                continue

            leg_id = get_legislator_id(state, current_session,
                                       data['chamber'],
                                       member['name'])

            if not leg_id:
                print("No matches for %s" % member['name'].encode(
                    'ascii', 'ignore'))
                member['leg_id'] = None
                continue

            legislator = db.legislators.find_one({'_id': leg_id})

            member['leg_id'] = leg_id

            for role in legislator['roles']:
                # .get(): a committee role may not carry a committee_id
                # (the branch above handles exactly such roles), and
                # indexing it would raise KeyError.
                if (role['type'] == 'committee member' and
                        role['term'] == current_term and
                        role.get('committee_id') == committee['_id']):
                    break
            else:
                new_role = {'type': 'committee member',
                            'committee': committee['committee'],
                            'term': current_term,
                            'chamber': committee['chamber'],
                            'committee_id': committee['_id'],
                            'state': state}
                if 'subcommittee' in committee:
                    new_role['subcommittee'] = committee['subcommittee']
                legislator['roles'].append(new_role)
                legislator['updated_at'] = datetime.datetime.utcnow()
                db.legislators.save(legislator, safe=True)

        db.committees.save(committee, safe=True)

    print('imported %s committee files' % len(paths))

    link_parents(state)

    ensure_indexes()
示例#19
0
文件: name.py 项目: davidar/proaixy
def parse_comma_name(name):
    """
    Parse a name of the form "Last name, First name" to (first name, last name)
    Try to do something reasonable if there is no comma.

    When the name contains a comma, or when none of the heuristics below
    applies, the split is delegated to ``name_tools.split``. Otherwise the
    name is cut into words and the position of initials and of fully
    capitalized words is used to guess where the first name stops and the
    last name starts.

    Returns a ``(first_name, last_name)`` tuple. Both parts are stripped and
    passed through ``normalize_name_words``. If the last name comes out
    empty, the two parts are swapped so that the only available part is
    reported as the last name. A name with no words yields ``('', '')``.
    """
    if ',' in name:
        # In this case name_tools does it well
        _prefix, first_name, last_name, _suffix = name_tools.split(name)
    else:
        words, _separators = split_name_words(name)
        if not words:
            return ('', '')
        first_name = None
        last_name = None
        from_lists = True

        # Per-word flags. They are indexed and scanned several times below,
        # so they must be real lists: a lazy ``map`` object (Python 3) is
        # neither subscriptable nor re-iterable.
        initial = [contains_initials(word) for word in words]
        capitalized = [is_fully_capitalized(word) for word in words]

        # CASE 1: the first word is capitalized but not all of them are
        # we assume that it is the first word of the last name
        if not initial[0] and capitalized[0] and not all(capitalized):
            (last, first) = predsplit_forward(
                    (lambda i: capitalized[i] and not initial[i]),
                    words)

        # CASE 2: the last word is capitalized but not all of them are
        # we assume that it is the last word of the last name
        elif not initial[-1] and capitalized[-1] and not all(capitalized):
            (first, last) = predsplit_forward(
                    (lambda i: (not capitalized[i]) or initial[i]),
                    words)

        # CASE 3: the first word is an initial
        elif initial[0]:
            (first, last) = predsplit_forward(
                    (lambda i: initial[i]),
                    words)

        # CASE 4: the last word is an initial
        # this is trickier, we know that the last name comes first
        # but we don't really know where it stops.
        # For simplicity we assume that all the words in the first
        # name are initials
        elif initial[-1]:
            (last, first) = predsplit_backwards(
                    (lambda i: initial[i]),
                    words)

        # CASE 5: there are initials in the name, but neither
        # at the beginning nor at the end
        elif True in initial:
            # Everything up to and including the last initial is the
            # first name; the remainder is the last name.
            last_initial_idx = max(
                idx for idx, is_initial in enumerate(initial) if is_initial)
            first = words[:last_initial_idx + 1]
            last = words[last_initial_idx + 1:]

        # CASE 6: we have no clue
        # We fall back on name_tools, where wise things are done
        # to parse correctly names such as "Colin de la Higuera"
        else:
            _prefix, first_name, last_name, _suffix = name_tools.split(name)
            from_lists = False

        if from_lists:
            first_name = ' '.join(first)
            last_name = ' '.join(last)

    first_name = first_name.strip()
    last_name = last_name.strip()
    first_name = normalize_name_words(first_name)
    last_name = normalize_name_words(last_name)

    # A lone name part is always reported as the last name.
    if not last_name:
        first_name, last_name = last_name, first_name

    return (first_name, last_name)
示例#20
0
def convert_legislator(leg):
    """Build the legacy-format dictionary describing one legislator.

    ``leg`` is read through its ``given_name``, ``family_name``, ``name``,
    ``image``, ``current_role``, ``created_at`` and ``updated_at``
    attributes and through the ``identifiers``, ``contact_details``,
    ``links`` and ``sources`` related collections (each queried with
    ``.all()``).

    The first/last name come from ``given_name``/``family_name`` when both
    are set, otherwise from ``name_tools.split(leg.name)``. The legislator
    is considered active when the current role has both a chamber and a
    district; only then is a single "member" role emitted.
    """
    # Prefer the explicitly stored name parts; otherwise split the full name.
    if leg.given_name and leg.family_name:
        first_name, last_name, suffixes = leg.given_name, leg.family_name, ""
    else:
        _, first_name, last_name, suffixes = name_tools.split(leg.name)

    # Fall back to a placeholder so "id"/"leg_id" below always have a value.
    legacy_ids = [
        ident.identifier
        for ident in leg.identifiers.all()
        if ident.scheme == "legacy_openstates"
    ] or ["~not available~"]

    current = leg.current_role
    party = current["party"]
    chamber = current["chamber"]
    district = current["district"]
    state = current["state"]

    # Group contact details by their note (the office label) and remember
    # the first e-mail address encountered.
    email = None
    offices = defaultdict(dict)
    for contact in leg.contact_details.all():
        offices[contact.note][contact.type] = contact.value
        if contact.type == "email" and not email:
            email = contact.value

    active = bool(chamber and district)

    try:
        url = leg.links.all()[0].url
    except IndexError:
        # No links recorded for this legislator.
        url = ""

    roles = []
    if active:
        roles.append({
            "term": static.TERMS[state][-1]["name"],
            "district": district,
            "chamber": chamber,
            "state": state,
            "party": party,
            "type": "member",
            "start_date": None,
            "end_date": None,
        })

    office_list = []
    for label, details in offices.items():
        office_list.append({
            "name": label,
            "fax": details.get("fax"),
            "phone": details.get("voice"),
            "email": details.get("email"),
            "address": details.get("address"),
            "type": "capitol" if "capitol" in label.lower() else "district",
        })

    source_list = [{"url": src.url} for src in leg.sources.all()]

    return {
        "id": legacy_ids[0],
        "leg_id": legacy_ids[0],
        "all_ids": legacy_ids,
        "full_name": leg.name,
        "first_name": first_name,
        "last_name": last_name,
        "suffix": suffixes,
        "photo_url": leg.image,
        "url": url,
        "email": email,
        "party": party,
        "chamber": chamber,
        "district": district,
        "state": state,
        "sources": source_list,
        "active": active,
        "roles": roles,
        "offices": office_list,
        "old_roles": {},
        "middle_name": "",
        "country": "us",
        "level": "state",
        "created_at": leg.created_at.strftime(DATE_FORMAT),
        "updated_at": leg.updated_at.strftime(DATE_FORMAT),
    }
示例#21
0
def import_committees(state, data_dir):
    """Import committees for ``state`` and link their members to legislators.

    Committee files are looked up as ``<data_dir>/<state>/committees/*.json``.

    * If no such file exists, committees are derived from the
      'committee member' roles of legislators that have a role in the
      current term (the last entry of the state's metadata ``terms``):
      a committee document is created on demand, the legislator is added
      to its members, and the role is tagged with the committee's ``_id``.
    * Otherwise each file is loaded, inserted or merged into the
      ``committees`` collection, and every member with a ``legislator``
      name is matched by first/last name against legislators of the
      current term. Exactly one match is required; otherwise a message is
      printed and the member is left without a ``leg_id``. Matched
      legislators receive a 'committee member' role for the current term
      unless they already have one for that committee.

    Indexes are ensured at the end. Nothing is returned.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'committees', '*.json')

    meta = db.metadata.find_one({'_id': state})
    current_term = meta['terms'][-1]['name']

    paths = glob.glob(pattern)

    if not paths:
        # Not standalone committees
        for legislator in db.legislators.find({
            'roles': {'$elemMatch': {'term': current_term,
                                     'state': state}}}):

            for role in legislator['roles']:
                # Only committee roles that are not linked yet need work.
                if (role['type'] != 'committee member' or
                        'committee_id' in role):
                    continue

                spec = {'state': role['state'],
                        'chamber': role['chamber'],
                        'committee': role['committee']}
                if 'subcommittee' in role:
                    spec['subcommittee'] = role['subcommittee']

                committee = db.committees.find_one(spec)

                if not committee:
                    committee = spec
                    committee['_type'] = 'committee'
                    committee['members'] = []
                    insert_with_id(committee)

                # Members stored by the file-based import below may lack
                # a 'leg_id' key (unmatched names), hence .get().
                already_listed = any(
                    member.get('leg_id') == legislator['leg_id']
                    for member in committee['members'])

                if not already_listed:
                    committee['members'].append(
                        {'name': legislator['full_name'],
                         'leg_id': legislator['leg_id'],
                         'role': 'member'})
                    db.committees.save(committee, safe=True)

                # Link the role even when the legislator was already
                # listed; otherwise such a role would stay unlinked.
                role['committee_id'] = committee['_id']

            db.legislators.save(legislator, safe=True)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        spec = {'state': state,
                'committee': data['committee']}
        if 'subcommittee' in data:
            spec['subcommittee'] = data['subcommittee']

        committee = db.committees.find_one(spec)

        if not committee:
            insert_with_id(data)
            committee = data
        else:
            update(committee, data, db.committees)

        for member in committee['members']:
            if not member['legislator']:
                continue

            (pre, first, last, suff) = name_tools.split(member['legislator'])

            found = db.legislators.find({
                    'first_name': first,
                    'last_name': last,
                    'roles': {'$elemMatch': {'term': current_term,
                                             'state': state}}})

            # Count once; each count() call is a separate query.
            match_count = found.count()
            if match_count > 1:
                print("Too many matches for %s" % member['legislator'].encode(
                    'ascii', 'ignore'))
                continue
            elif match_count == 0:
                print("No matches for %s" % member['legislator'].encode(
                    'ascii', 'ignore'))
                continue

            legislator = found[0]

            member['leg_id'] = legislator['_id']

            for role in legislator['roles']:
                # .get(): committee roles that were never linked carry no
                # 'committee_id' key and must not abort the import.
                if (role['type'] == 'committee member' and
                        role['term'] == current_term and
                        role.get('committee_id') == committee['_id']):
                    break
            else:
                new_role = {'type': 'committee member',
                            'committee': committee['committee'],
                            'term': current_term,
                            'chamber': committee['chamber'],
                            'committee_id': committee['_id'],
                            'state': state}
                if 'subcommittee' in committee:
                    new_role['subcommittee'] = committee['subcommittee']
                legislator['roles'].append(new_role)
                legislator['updated_at'] = datetime.datetime.now()
                db.legislators.save(legislator, safe=True)

        db.committees.save(committee, safe=True)

    ensure_indexes()
示例#22
0
def convert_legislator(leg):
    """Serialize a legislator into the legacy API dictionary.

    Name parts are taken from ``leg.given_name``/``leg.family_name`` when
    both are present and otherwise obtained from
    ``name_tools.split(leg.name)``. Party, chamber, district and state are
    read from ``get_current_role(leg)``. Contact details are grouped per
    note into office entries, and the first e-mail address found is used
    as the top-level ``email``. A "member" role is included only when both
    chamber and district are truthy.
    """
    has_name_parts = leg.given_name and leg.family_name
    if not has_name_parts:
        _, first_name, last_name, suffixes = name_tools.split(leg.name)
    else:
        first_name = leg.given_name
        last_name = leg.family_name
        suffixes = ''

    legacy_ids = []
    for identifier in leg.identifiers.all():
        if identifier.scheme == 'legacy_openstates':
            legacy_ids.append(identifier.identifier)
    if not legacy_ids:
        # Placeholder keeps 'id' and 'leg_id' populated.
        legacy_ids = ['~not available~']

    role_info = get_current_role(leg)
    party = role_info['party']
    chamber = role_info['chamber']
    district = role_info['district']
    state = role_info['state']

    # office label -> {contact type: value}
    offices = defaultdict(dict)
    email = None
    for detail in leg.contact_details.all():
        offices[detail.note][detail.type] = detail.value
        if detail.type == 'email' and not email:
            email = detail.value

    active = bool(chamber and district)

    try:
        first_link = leg.links.all()[0]
        url = first_link.url
    except IndexError:
        # The legislator has no links.
        url = ""

    if active:
        roles = [{
            "term": static.TERMS[state][-1]['name'],
            "district": district,
            "chamber": chamber,
            "state": state,
            "party": party,
            "type": "member",
            "start_date": None,
            "end_date": None,
        }]
    else:
        roles = []

    primary_id = legacy_ids[0]

    return {
        'id': primary_id,
        'leg_id': primary_id,
        'all_ids': legacy_ids,
        'full_name': leg.name,
        'first_name': first_name,
        'last_name': last_name,
        'suffix': suffixes,
        'photo_url': leg.image,
        'url': url,
        'email': email,
        'party': party,
        'chamber': chamber,
        'district': district,
        'state': state,
        'sources': [{'url': source.url} for source in leg.sources.all()],
        'active': active,
        'roles': roles,
        'offices': [
            {
                'name': label,
                'fax': info.get('fax'),
                'phone': info.get('voice'),
                'email': info.get('email'),
                'address': info.get('address'),
                'type': 'capitol' if 'capitol' in label.lower() else 'district',
            }
            for label, info in offices.items()
        ],
        'old_roles': {},
        'middle_name': '',
        'country': 'us',
        'level': 'state',
        'created_at': leg.created_at.strftime(DATE_FORMAT),
        'updated_at': leg.updated_at.strftime(DATE_FORMAT),
    }