  def sweep():
    to_check = []

    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      possibles = [bioguide]
    else:
      possibles = current_bioguide.keys()

    for bioguide in possibles:
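      # needs checking if the member has no social entry at all, or none yet for this service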
      if media_bioguide.get(bioguide, None) is None:
        to_check.append(bioguide)
      elif media_bioguide[bioguide]["social"].get(service, None) is None:
        to_check.append(bioguide)
      else:
        pass

    utils.mkdir_p("cache/social_media")
    writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w'))
    writer.writerow(["bioguide", "official_full", "website", "service", "candidate"])

    for bioguide in to_check:
      candidate = candidate_for(bioguide)
      if candidate:
        url = current_bioguide[bioguide]["terms"][-1].get("url", None)
        writer.writerow([bioguide, current_bioguide[bioguide]['name']['official_full'], url, service, candidate])
        print("\tWrote: %s" % candidate)
  def resolveig():
    # in order to preserve the comment block at the top of the file,
    # copy it over into a new RtYamlList instance. We do this because
    # Python list instances can't hold other random attributes.
    import rtyaml
    updated_media = rtyaml.RtYamlList()
    if hasattr(media, '__initial_comment_block'):
      updated_media.__initial_comment_block = getattr(media, '__initial_comment_block')

    client_id_file = open('cache/instagram_client_id','r')
    client_id = client_id_file.read()
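    # assumes cache/instagram_client_id holds only the client ID string (stray whitespace would end up in the query URL)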

    bioguide = utils.flags().get('bioguide', None)

    for m in media:
      if bioguide and (m['id']['bioguide'] != bioguide):
        updated_media.append(m)
        continue

      social = m['social']
      if 'instagram' not in social and 'instagram_id' not in social:
        updated_media.append(m)
        continue

      instagram_handle = social['instagram']
      query_url = "https://api.instagram.com/v1/users/search?q={query}&client_id={client_id}".format(query=instagram_handle,client_id=client_id)
      instagram_user_search = requests.get(query_url).json()
      for user in instagram_user_search['data']:
        time.sleep(0.5)
        if user['username'] == instagram_handle:
          m['social']['instagram_id'] = int(user['id'])
          print("matched instagram_id {instagram_id} to {instagram_handle}".format(instagram_id=social['instagram_id'],instagram_handle=instagram_handle))
      updated_media.append(m)

    save_data(updated_media, "legislators-social-media.yaml")
  def verify():
    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      to_check = [bioguide]
    else:
      to_check = media_bioguide.keys()

    for bioguide in to_check:
      entry = media_bioguide[bioguide]
      current = entry['social'].get(service, None)
      if not current:
        continue

      bioguide = entry['id']['bioguide']

      candidate = candidate_for(bioguide)
      if not candidate:
        # if current is in whitelist, and none is on the page, that's okay
        if current.lower() in whitelist[service]:
          continue
        else:
          candidate = ""

      url = current_bioguide[bioguide]['terms'][-1].get('url')

      if current.lower() != candidate.lower():
        print("[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate))
Example #4
  def sweep():
    to_check = []

    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      possibles = [bioguide]
    else:
      possibles = current_bioguide.keys()

    for bioguide in possibles:
      if media_bioguide.get(bioguide, None) is None:
        to_check.append(bioguide)
      elif media_bioguide[bioguide]["social"].get(service, None) is None:
        to_check.append(bioguide)
      else:
        pass

    utils.mkdir_p("cache/social_media")
    writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w'))
    writer.writerow(["bioguide", "official_full", "website", "service", "candidate", "candidate_url"])

    if len(to_check) > 0:
      email_body = "Social media leads found:\n\n"
      for bioguide in to_check:
        candidate = candidate_for(bioguide)
        if candidate:
          url = current_bioguide[bioguide]["terms"][-1].get("url", None)
          candidate_url = "https://%s.com/%s" % (service, candidate)
          row = [bioguide, current_bioguide[bioguide]['name']['official_full'].encode('utf-8'), url, service, candidate, candidate_url]
          writer.writerow(row)
          print("\tWrote: %s" % candidate)
          email_body += ("%s\n" % row)

      if email_enabled:
        utils.send_email(email_body)
def run():
    options = utils.flags()
    debug = options.get('debug', False)

    filename = "legislators-current.yaml"
    args = utils.args()
    legislators = load_data(filename)

    if len(args) != 0:
        bioguides = args
        print("Fetching contact forms for %s..." % ', '.join(bioguides))
    else:
        bioguides = [member['id']['bioguide'] for member in legislators]
        print("Fetching contact forms for all current members...")

    for legislator in legislators:
        bioguide = legislator['id']['bioguide']
        if bioguide not in bioguides: continue
        if bioguide in SKIP_BIOGUIDES: continue

        if debug: print("Downloading form for %s" % bioguide, flush=True)

        try:
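            # contact_steps_for() is assumed to raise LegislatorNotFoundError
            # when the member has no entry in the contact-congress data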
            steps = contact_steps_for(bioguide)
        except LegislatorNotFoundError as e:
            if debug: print("skipping, %s..." % e, flush=True)
            continue

        legislator['terms'][-1]['contact_form'] = steps['contact_form']['steps'][0]['visit']

    print("Saving data to %s..." % filename)
    save_data(legislators, filename)
def run():

  # pick either current or historical
  # order is important here, since current defaults to true
  if utils.flags().get('historical', False):
    filename = "legislators-historical.yaml"
  elif utils.flags().get('current', True):
    filename = "legislators-current.yaml"
  else:
    print("No legislators selected.")
    exit(0)

  print("Loading %s..." % filename)
  legislators = load_data(filename)

  # reoriented cache to access by bioguide ID
  by_bioguide = { }
  for m in legislators:
    if "bioguide" in m["id"]:
      by_bioguide[m["id"]["bioguide"]] = m

  count = 0

  for id in range(8245,21131):
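    # each ID maps to a history.house.gov person page; redirects and pages without a bioguide link are skipped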
    print(id)
    url = "http://history.house.gov/People/Detail/%s" % id
    r = requests.get(url, allow_redirects=False)
    if r.status_code == 200:
        dom = lxml.html.parse(io.StringIO(r.text)).getroot()
        try:
            bioguide_link = dom.cssselect("a.view-in-bioguide")[0].get('href')
            bioguide_id = bioguide_link.split('=')[1]
            by_bioguide[bioguide_id]["id"]["house_history"] = id
            count = count + 1
        except:
            continue
    else:
        continue

  print("Saving data to %s..." % filename)
  save_data(legislators, filename)

  print("Saved %d legislators to %s" % (count, filename))
def run():
  house_labels = "labels-113.csv"

  names = utils.flags().get('names', False)

  y = load_data("legislators-current.yaml")
  by_district = { }
  for m in y:
    last_term = m['terms'][-1]
    if last_term['type'] != 'sen':
      full_district = "%s%02d" % (last_term['state'], int(last_term['district']))
      by_district[full_district] = m


  for rec in csv.DictReader(open(house_labels)):
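    # rows are keyed by the Clerk's "113 ST/DIS" column, matching the "%s%02d" district keys built above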
    full_district = rec['113 ST/DIS']

    # empty seat - IL-02
    if full_district not in by_district:
      if full_district == "IL02":
        continue
      else:
        raise Exception("no current member matched district %s" % full_district)

    rec["MIDDLE"] = rec["MIDDLE"].strip()
    rec["NICK"] = None
    m = re.match('^(.*) \u201c(.*)\u201d$', rec["MIDDLE"])
    if m:
      rec["MIDDLE"] = m.group(1)
      rec["NICK"] = m.group(2)

    by_district[full_district]['terms'][-1]['office'] = rec["ADDRESS"].strip()

    # only set name fields if we've been asked to (as a stopgap)
    if names:
      by_district[full_district]["name"]["first"] = rec["FIRST"].strip()
      if rec["MIDDLE"]:
        by_district[full_district]["name"]["middle"] = rec["MIDDLE"]
      if rec["NICK"]:
        by_district[full_district]["name"]["nickname"] = rec["NICK"]
      by_district[full_district]["name"]["last"] = rec["LAST"].strip()

    if rec["BIOGUIDE ID"] == "G000574":
      # The Clerk has the wrong ID for Alan Grayson!
      rec["BIOGUIDE ID"] = "G000556"

    by_district[full_district]["id"]["bioguide"] = rec["BIOGUIDE ID"]

    print("[%s] Saved" % full_district)

  save_data(y, "legislators-current.yaml")
Example #8
def main():
    regexes = {
        "youtube": [
            "(?:https?:)?//(?:www\\.)?youtube.com/embed/?\?(list=[^\\s\"/\\?#&']+)",
            "(?:https?:)?//(?:www\\.)?youtube.com/channel/([^\\s\"/\\?#']+)",
            "(?:https?:)?//(?:www\\.)?youtube.com/(?:subscribe_widget\\?p=)?(?:subscription_center\\?add_user=)?(?:user/)?([^\\s\"/\\?#']+)"
        ],
        "facebook": [
            "\\('facebook.com/([^']+)'\\)",
            "(?:https?:)?//(?:www\\.)?facebook.com/(?:home\\.php)?(?:business/dashboard/#/)?(?:government)?(?:#!/)?(?:#%21/)?(?:#/)?pages/[^/]+/(\\d+)",
            "(?:https?:)?//(?:www\\.)?facebook.com/(?:profile.php\\?id=)?(?:home\\.php)?(?:#!)?/?(?:people)?/?([^/\\s\"#\\?&']+)"
        ],
        "twitter": [
            "(?:https?:)?//(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/?]+)",
            "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)"
        ],
        "instagram": ["instagram.com/(\w{3,})"]
    }
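    # each pattern captures the account name (or numeric page ID) as group 1;
    # candidate_for() below runs them against a member's official website HTML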

    email_enabled = utils.flags().get('email', False)
    debug = utils.flags().get('debug', False)
    do_update = utils.flags().get('update', False)
    do_clean = utils.flags().get('clean', False)
    do_verify = utils.flags().get('verify', False)
    do_resolveyt = utils.flags().get('resolveyt', False)
    do_resolveig = utils.flags().get('resolveig', False)
    do_resolvetw = utils.flags().get('resolvetw', False)

    # default to not caching
    cache = utils.flags().get('cache', False)
    force = not cache

    if do_resolveyt:
        service = "youtube"
    elif do_resolveig:
        service = "instagram"
    elif do_resolvetw:
        service = "twitter"
    else:
        service = utils.flags().get('service', None)
    if service not in ["twitter", "youtube", "facebook", "instagram"]:
        print(
            "--service must be one of twitter, youtube, facebook, or instagram"
        )
        exit(0)

    # load in members, orient by bioguide ID
    print("Loading current legislators...")
    current = load_data("legislators-current.yaml")

    current_bioguide = {}
    for m in current:
        if "bioguide" in m["id"]:
            current_bioguide[m["id"]["bioguide"]] = m

    print("Loading blacklist...")
    blacklist = {'twitter': [], 'facebook': [], 'youtube': [], 'instagram': []}
    for rec in csv.DictReader(open("data/social_media_blacklist.csv")):
        blacklist[rec["service"]].append(rec["pattern"])

    print("Loading whitelist...")
    whitelist = {'twitter': [], 'facebook': [], 'youtube': []}
    for rec in csv.DictReader(open("data/social_media_whitelist.csv")):
        whitelist[rec["service"]].append(rec["account"].lower())

    # reorient currently known social media by ID
    print("Loading social media...")
    media = load_data("legislators-social-media.yaml")
    media_bioguide = {}
    for m in media:
        media_bioguide[m["id"]["bioguide"]] = m

    def resolveyt():
        # To avoid hitting quota limits, register for a YouTube 2.0 API key at
        # https://code.google.com/apis/youtube/dashboard
        # and put it below
        api_file = open('cache/youtube_api_key', 'r')
        api_key = api_file.read()

        bioguide = utils.flags().get('bioguide', None)

        updated_media = []
        for m in media:
            if bioguide and (m['id']['bioguide'] != bioguide):
                updated_media.append(m)
                continue

            social = m['social']

            if ('youtube' in social) or ('youtube_id' in social):

                if 'youtube' not in social:
                    social['youtube'] = social['youtube_id']

                ytid = social['youtube']

                profile_url = ("https://gdata.youtube.com/feeds/api/users/%s"
                               "?v=2&prettyprint=true&alt=json&key=%s" %
                               (ytid, api_key))

                try:
                    print("Resolving YT info for %s" % social['youtube'])
                    ytreq = requests.get(profile_url)
                    # print "\tFetched with status code %i..." % ytreq.status_code

                    if ytreq.status_code == 404:
                        # If the account name isn't valid, it's probably a redirect.
                        try:
                            # Try to scrape the real YouTube username
                            print("\tScraping YouTube username")
                            search_url = ("https://www.youtube.com/%s" %
                                          social['youtube'])
                            # keep the response as text so the str regex below can match it
                            csearch = requests.get(search_url).text

                            u = re.search(
                                r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>',
                                csearch)

                            if u:
                                print("\t%s maps to %s" %
                                      (social['youtube'], u.group(1)))
                                social['youtube'] = u.group(1)
                                profile_url = (
                                    "https://gdata.youtube.com/feeds/api/users/%s"
                                    "?v=2&prettyprint=true&alt=json" %
                                    social['youtube'])

                                print("\tFetching GData profile...")
                                ytreq = requests.get(profile_url)
                                print("\tFetched GData profile")

                            else:
                                raise Exception(
                                    "Couldn't figure out the username format for %s"
                                    % social['youtube'])

                        except:
                            print("\tCouldn't locate YouTube account")
                            raise

                    ytobj = ytreq.json()
                    social['youtube_id'] = ytobj['entry']['yt$channelId']['$t']
                    print("\tResolved youtube_id to %s" % social['youtube_id'])

                    # even though we have their channel ID, do they also have a username?
                    if ytobj['entry']['yt$username']['$t'] != ytobj['entry'][
                            'yt$userId']['$t']:
                        if social['youtube'].lower(
                        ) != ytobj['entry']['yt$username']['$t'].lower():
                            # YT accounts are case-insensitive.  Preserve capitalization if possible.
                            social['youtube'] = ytobj['entry']['yt$username'][
                                '$t']
                            print("\tAdded YouTube username of %s" %
                                  social['youtube'])
                    else:
                        print(
                            "\tYouTube says they do not have a separate username"
                        )
                        del social['youtube']
                except:
                    print("Unable to get YouTube Channel ID for: %s" %
                          social['youtube'])

            updated_media.append(m)

        print("Saving social media...")
        save_data(updated_media, "legislators-social-media.yaml")

    def resolveig():
        # in order to preserve the comment block at the top of the file,
        # copy it over into a new RtYamlList instance. We do this because
        # Python list instances can't hold other random attributes.
        import rtyaml
        updated_media = rtyaml.RtYamlList()
        if hasattr(media, '__initial_comment_block'):
            updated_media.__initial_comment_block = getattr(
                media, '__initial_comment_block')

        client_id_file = open('cache/instagram_client_id', 'r')
        client_id = client_id_file.read()

        bioguide = utils.flags().get('bioguide', None)

        for m in media:
            if bioguide and (m['id']['bioguide'] != bioguide):
                updated_media.append(m)
                continue

            social = m['social']
            if 'instagram' not in social and 'instagram_id' not in social:
                updated_media.append(m)
                continue

            instagram_handle = social['instagram']
            query_url = "https://api.instagram.com/v1/users/search?q={query}&client_id={client_id}".format(
                query=instagram_handle, client_id=client_id)
            instagram_user_search = requests.get(query_url).json()
            for user in instagram_user_search['data']:
                time.sleep(0.5)
                if user['username'] == instagram_handle:
                    m['social']['instagram_id'] = int(user['id'])
                    print(
                        "matched instagram_id {instagram_id} to {instagram_handle}"
                        .format(instagram_id=social['instagram_id'],
                                instagram_handle=instagram_handle))
            updated_media.append(m)

        save_data(updated_media, "legislators-social-media.yaml")

    def resolvetw():
        """
    Does two batch lookups:

    1. All entries with `twitter_id`: Checks to see if the corresponding Twitter profile has the same screen_name
        as found in the entry's `twitter`. If not, the `twitter` value is updated.
    2. All entries with `twitter` (but not `twitter_id`): fetches the corresponding Twitter profile by screen_name and
        inserts ID. If no profile is found, the `twitter` value is deleted.

    Note: cache/twitter_client_id must be a formatted JSON dict:
        {
        "consumer_secret": "xyz",
        "access_token": "abc",
        "access_token_secret": "def",
        "consumer_key": "jk"
       }
    """
        import rtyaml
        from social.twitter import get_api, fetch_profiles
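        # fetch_profiles is assumed to return a list of Twitter user dicts exposing at least 'id' and 'screen_name'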
        updated_media = rtyaml.RtYamlList()
        if hasattr(media, '__initial_comment_block'):
            updated_media.__initial_comment_block = getattr(
                media, '__initial_comment_block')

        client_id_file = open('cache/twitter_client_id', 'r')
        _c = json.load(client_id_file)
        api = get_api(_c['access_token'], _c['access_token_secret'],
                      _c['consumer_key'], _c['consumer_secret'])
        bioguide = utils.flags().get('bioguide', None)
        lookups = {
            'screen_names': [],
            'ids': []
        }  # store members that have `twitter` or `twitter_id` info
        for m in media:
            # we start with appending to updated_media so that we keep the same order of entries
            # as found in the loaded file
            updated_media.append(m)
            if bioguide and (m['id']['bioguide'] != bioguide):
                continue
            social = m['social']
            # now we add entries to either the `ids` or the `screen_names` list to batch lookup
            if 'twitter_id' in social:
                # add to the queue to be batched-looked-up
                lookups['ids'].append(m)
                # append
            elif 'twitter' in social:
                lookups['screen_names'].append(m)

        #######################################
        # perform Twitter batch lookup for ids:
        if lookups['screen_names']:
            arr = lookups['screen_names']
            print("Looking up Twitter ids for", len(arr), "names.")
            tw_names = [m['social']['twitter'] for m in arr]
            tw_profiles = fetch_profiles(api, screen_names=tw_names)
            for m in arr:
                social = m['social']
                # find profile that corresponds to a given screen_name
                twitter_handle = social['twitter']
                twp = next(
                    (p for p in tw_profiles
                     if p['screen_name'].lower() == twitter_handle.lower()),
                    None)
                if twp:
                    m['social']['twitter_id'] = int(twp['id'])
                    print("Matched twitter_id `%s` to `%s`" %
                          (social['twitter_id'], twitter_handle))
                else:
                    # Remove errant Twitter entry for now
                    print("No Twitter user profile for:", twitter_handle)
                    m['social'].pop('twitter')
                    print("\t ! removing Twitter handle:", twitter_handle)
        ##########################################
        # perform Twitter batch lookup for names by id, to update any renamings:
        if lookups['ids']:
            arr = lookups['ids']
            print("Looking up Twitter screen_names for", len(arr), "ids.")
            tw_ids = [m['social']['twitter_id'] for m in arr]
            tw_profiles = fetch_profiles(api, ids=tw_ids)
            any_renames_needed = False
            for m in arr:
                social = m['social']
                # find profile that corresponds to a given screen_name
                t_id = social['twitter_id']
                t_name = social.get('twitter')
                twp = next((p for p in tw_profiles if int(p['id']) == t_id),
                           None)
                if twp:
                    # Be silent if there is no change to screen name
                    if t_name and (twp['screen_name'].lower()
                                   == t_name.lower()):
                        pass
                    else:
                        any_renames_needed = True
                        m['social']['twitter'] = twp['screen_name']
                        print("For twitter_id `%s`, renamed `%s` to `%s`" %
                              (t_id, t_name, m['social']['twitter']))
                else:
                    # No entry found for this twitter id
                    print("No Twitter user profile for %s, %s" %
                          (t_id, t_name))
                    m['social'].pop('twitter_id')
                    print("\t ! removing Twitter id:", t_id)
            if not any_renames_needed:
                print("No renames needed")
        # all done with Twitter
        save_data(updated_media, "legislators-social-media.yaml")

    def sweep():
        to_check = []

        bioguide = utils.flags().get('bioguide', None)
        if bioguide:
            possibles = [bioguide]
        else:
            possibles = list(current_bioguide.keys())

        for bioguide in possibles:
            if media_bioguide.get(bioguide, None) is None:
                to_check.append(bioguide)
            elif (media_bioguide[bioguide]["social"].get(service, None) is None) and \
              (media_bioguide[bioguide]["social"].get(service + "_id", None) is None):
                to_check.append(bioguide)
            else:
                pass

        utils.mkdir_p("cache/social_media")
        writer = csv.writer(
            open("cache/social_media/%s_candidates.csv" % service, 'w'))
        writer.writerow([
            "bioguide", "official_full", "website", "service", "candidate",
            "candidate_url"
        ])

        if len(to_check) > 0:
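            # write one CSV row per member whose site yields a candidate, and optionally email the new leads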
            rows_found = []
            for bioguide in to_check:
                candidate = candidate_for(bioguide)
                if candidate:
                    url = current_bioguide[bioguide]["terms"][-1].get(
                        "url", None)
                    candidate_url = "https://%s.com/%s" % (service, candidate)
                    row = [
                        bioguide, current_bioguide[bioguide]['name']
                        ['official_full'].encode('utf-8'), url, service,
                        candidate, candidate_url
                    ]
                    writer.writerow(row)
                    print("\tWrote: %s" % candidate)
                    rows_found.append(row)

            if email_enabled and len(rows_found) > 0:
                email_body = "Social media leads found:\n\n"
                for row in rows_found:
                    email_body += ("%s\n" % row)
                utils.send_email(email_body)

    def verify():
        bioguide = utils.flags().get('bioguide', None)
        if bioguide:
            to_check = [bioguide]
        else:
            to_check = list(media_bioguide.keys())

        for bioguide in to_check:
            entry = media_bioguide[bioguide]
            current = entry['social'].get(service, None)
            if not current:
                continue

            bioguide = entry['id']['bioguide']

            candidate = candidate_for(bioguide, current)
            if not candidate:
                # if current is in whitelist, and none is on the page, that's okay
                if current.lower() in whitelist[service]:
                    continue
                else:
                    candidate = ""

            url = current_bioguide[bioguide]['terms'][-1].get('url')

            if current.lower() != candidate.lower():
                print("[%s] mismatch on %s - %s -> %s" %
                      (bioguide, url, current, candidate))

    def update():
        for rec in csv.DictReader(
                open("cache/social_media/%s_candidates.csv" % service)):
            bioguide = rec["bioguide"]
            candidate = rec["candidate"]

            if bioguide in media_bioguide:
                media_bioguide[bioguide]['social'][service] = candidate
            else:
                new_media = {'id': {}, 'social': {}}

                new_media['id']['bioguide'] = bioguide
                thomas_id = current_bioguide[bioguide]['id'].get(
                    "thomas", None)
                govtrack_id = current_bioguide[bioguide]['id'].get(
                    "govtrack", None)
                if thomas_id:
                    new_media['id']['thomas'] = thomas_id
                if govtrack_id:
                    new_media['id']['govtrack'] = govtrack_id

                new_media['social'][service] = candidate
                media.append(new_media)

        print("Saving social media...")
        save_data(media, "legislators-social-media.yaml")

        # if it's a youtube update, always do the resolve
        # if service == "youtube":
        #   resolveyt()

    def clean():
        print("Loading historical legislators...")
        historical = load_data("legislators-historical.yaml")

        count = 0
        for m in historical:
            if m["id"]["bioguide"] in media_bioguide:
                media.remove(media_bioguide[m["id"]["bioguide"]])
                count += 1
        print(
            "Removed %i out of office legislators from social media file..." %
            count)

        print("Saving historical legislators...")
        save_data(media, "legislators-social-media.yaml")

    def candidate_for(bioguide, current=None):
        """find the most likely candidate account from the URL.
    If current is passed, the candidate will match it if found
    otherwise, the first candidate match is returned
    """
        url = current_bioguide[bioguide]["terms"][-1].get("url", None)
        if not url:
            if debug:
                print("[%s] No official website, skipping" % bioguide)
            return None

        if debug:
            print("[%s] Downloading..." % bioguide)
        cache = "congress/%s.html" % bioguide
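        # utils.download is assumed to cache under this relative path and to honor the force flag for re-fetching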
        body = utils.download(url, cache, force, {'check_redirects': True})
        if not body:
            return None

        all_matches = []
        for regex in regexes[service]:
            matches = re.findall(regex, body, re.I)
            if matches:
                all_matches.extend(matches)

        if current is not None and current in all_matches:
            return current

        if all_matches:
            for candidate in all_matches:
                passed = True
                for blacked in blacklist[service]:
                    if re.search(blacked, candidate, re.I):
                        passed = False

                if not passed:
                    if debug:
                        print("\tBlacklisted: %s" % candidate)
                    continue

                return candidate
            return None

    if do_update:
        update()
    elif do_clean:
        clean()
    elif do_verify:
        verify()
    elif do_resolveyt:
        resolveyt()
    elif do_resolveig:
        resolveig()
    elif do_resolvetw:
        resolvetw()

    else:
        sweep()
Example #9
def run():

  # default to not caching
  cache = utils.flags().get('cache', False)
  force = not cache


  states = []
  current = load_data("legislators-current.yaml")
  by_district = { }
  for m in current:
    last_term = m['terms'][-1]
    if last_term['type'] != 'sen':
      state = last_term['state']

      full_district = "%s%02d" % (state, int(last_term['district']))
      by_district[full_district] = m

      if not state in states:
        # house lists AS (American Samoa) as AQ, awesome
        if state == "AS":
          state = "AQ"
        states.append(state)

  destination = "legislators/house.html"
  url = "https://www.house.gov/representatives/"
  body = utils.download(url, destination, force)
  if not body:
    print("Couldn't download House listing!")
    exit(0)

  try:
    dom = lxml.html.parse(io.StringIO(body)).getroot()
  except lxml.etree.XMLSyntaxError:
    print("Error parsing House listing!")
    exit(0)


  # process:
  #   go through every state in our records, fetching that state's table
  #   go through every row after the first, pick the district to isolate the member
  #   pluck out the URL, update that member's last term's URL
  count = 0
  for state in states:
    rows = dom.cssselect("h2#state_%s+table tr" % state.lower())
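    # the House directory renders one table per state, immediately after an h2 with id "state_<abbrev>"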

    for row in rows:
      cells = row.cssselect("td")
      if not cells:
        continue

      district = str(cells[0].text_content())
      if district == "At Large":
        district = 0

      url = cells[1].cssselect("a")[0].get("href")

      # The House uses subdomains now, and occasionally the directory
      # uses URLs with some trailing redirected-to page, like /home.
      # We can safely use the subdomain as the root, to be future-proof
      # against redirects changing mid-session.

      # We should still follow any redirects, and not just trust the
      # directory to have the current active subdomain. As an example,
      # the directory lists randyforbes.house.gov, which redirects to
      # forbes.house.gov.
      resp = urllib.request.urlopen(url)
      url = resp.geturl()

      # kill everything after the domain
      url = re.sub(".gov/.*$", ".gov", url)

      if state == "AQ":
        state = "AS"
      full_district = "%s%02d" % (state, int(district))
      if full_district in by_district:
        print("[%s] %s" % (full_district, url))
        by_district[full_district]['terms'][-1]['url'] = url
      else:
        print("[%s] No current legislator" % full_district)

      count += 1

  print("Processed %i people rows on House listing." % count)

  print("Saving data...")
  save_data(current, "legislators-current.yaml")
Example #10
  def resolvetw():
    """
    Does two batch lookups:

    1. All entries with `twitter_id`: Checks to see if the corresponding Twitter profile has the same screen_name
        as found in the entry's `twitter`. If not, the `twitter` value is updated.
    2. All entries with `twitter` (but not `twitter_id`): fetches the corresponding Twitter profile by screen_name and
        inserts ID. If no profile is found, the `twitter` value is deleted.

    Note: cache/twitter_client_id must be a formatted JSON dict:
        {
        "consumer_secret": "xyz",
        "access_token": "abc",
        "access_token_secret": "def",
        "consumer_key": "jk"
       }
    """
    import rtyaml
    from social.twitter import get_api, fetch_profiles
    updated_media = rtyaml.RtYamlList()
    if hasattr(media, '__initial_comment_block'):
      updated_media.__initial_comment_block = getattr(media, '__initial_comment_block')

    client_id_file = open('cache/twitter_client_id', 'r')
    _c = json.load(client_id_file)
    api = get_api(_c['access_token'], _c['access_token_secret'], _c['consumer_key'], _c['consumer_secret'])
    bioguide = utils.flags().get('bioguide', None)
    lookups = {'screen_names': [], 'ids': []} # store members that have `twitter` or `twitter_id` info
    for m in media:
      # we start with appending to updated_media so that we keep the same order of entries
      # as found in the loaded file
      updated_media.append(m)
      if bioguide and (m['id']['bioguide'] != bioguide):
        continue
      social = m['social']
      # now we add entries to either the `ids` or the `screen_names` list to batch lookup
      if 'twitter_id' in social:
        # add to the queue to be batched-looked-up
        lookups['ids'].append(m)
        # append
      elif 'twitter' in social:
        lookups['screen_names'].append(m)

    #######################################
    # perform Twitter batch lookup for ids:
    if lookups['screen_names']:
      arr = lookups['screen_names']
      print("Looking up Twitter ids for", len(arr), "names.")
      tw_names = [m['social']['twitter'] for m in arr]
      tw_profiles = fetch_profiles(api, screen_names = tw_names)
      for m in arr:
        social = m['social']
        # find profile that corresponds to a given screen_name
        twitter_handle = social['twitter']
        twp = next((p for p in tw_profiles if p['screen_name'].lower() == twitter_handle.lower()), None)
        if twp:
          m['social']['twitter_id'] = int(twp['id'])
          print("Matched twitter_id `%s` to `%s`" % (social['twitter_id'], twitter_handle))
        else:
          # Remove errant Twitter entry for now
          print("No Twitter user profile for:", twitter_handle)
          m['social'].pop('twitter')
          print("\t ! removing Twitter handle:", twitter_handle)
    ##########################################
    # perform Twitter batch lookup for names by id, to update any renamings:
    if lookups['ids']:
      arr = lookups['ids']
      print("Looking up Twitter screen_names for", len(arr), "ids.")
      tw_ids = [m['social']['twitter_id'] for m in arr]
      tw_profiles = fetch_profiles(api, ids = tw_ids)
      any_renames_needed = False
      for m in arr:
        social = m['social']
        # find profile that corresponds to a given screen_name
        t_id = social['twitter_id']
        t_name = social.get('twitter')
        twp = next((p for p in tw_profiles if int(p['id']) == t_id), None)
        if twp:
          # Be silent if there is no change to screen name
          if t_name and (twp['screen_name'].lower() == t_name.lower()):
            pass
          else:
            any_renames_needed = True
            m['social']['twitter'] = twp['screen_name']
            print("For twitter_id `%s`, renamed `%s` to `%s`" % (t_id, t_name, m['social']['twitter']))
        else:
          # No entry found for this twitter id
          print("No Twitter user profile for %s, %s" % (t_id, t_name))
          m['social'].pop('twitter_id')
          print("\t ! removing Twitter id:", t_id)
      if not any_renames_needed:
        print("No renames needed")
    # all done with Twitter
    save_data(updated_media, "legislators-social-media.yaml")
Example #11
  def resolveyt():
    # To avoid hitting quota limits, register for a YouTube 2.0 API key at
    # https://code.google.com/apis/youtube/dashboard
    # and put it below
    api_file = open('cache/youtube_api_key','r')
    api_key = api_file.read()

    bioguide = utils.flags().get('bioguide', None)

    updated_media = []
    for m in media:
      if bioguide and (m['id']['bioguide'] != bioguide):
        updated_media.append(m)
        continue

      social = m['social']

      if ('youtube' in social) or ('youtube_id' in social):

        if 'youtube' not in social:
          social['youtube'] = social['youtube_id']

        ytid = social['youtube']

        profile_url = ("https://gdata.youtube.com/feeds/api/users/%s"
        "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key))

        try:
          print("Resolving YT info for %s" % social['youtube'])
          ytreq = requests.get(profile_url)
          # print "\tFetched with status code %i..." % ytreq.status_code

          if ytreq.status_code == 404:
            # If the account name isn't valid, it's probably a redirect.
            try:
              # Try to scrape the real YouTube username
              print("\tScraping YouTube username")
              search_url = ("https://www.youtube.com/%s" % social['youtube'])
              csearch = requests.get(search_url).text

              u = re.search(r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>',csearch)

              if u:
                print("\t%s maps to %s" % (social['youtube'],u.group(1)))
                social['youtube'] = u.group(1)
                profile_url = ("https://gdata.youtube.com/feeds/api/users/%s"
                "?v=2&prettyprint=true&alt=json" % social['youtube'])

                print("\tFetching GData profile...")
                ytreq = requests.get(profile_url)
                print("\tFetched GData profile")

              else:
                raise Exception("Couldn't figure out the username format for %s" % social['youtube'])

            except:
              print("\tCouldn't locate YouTube account")
              raise

          ytobj = ytreq.json()
          social['youtube_id'] = ytobj['entry']['yt$channelId']['$t']
          print("\tResolved youtube_id to %s" % social['youtube_id'])

          # even though we have their channel ID, do they also have a username?
          if ytobj['entry']['yt$username']['$t'] != ytobj['entry']['yt$userId']['$t']:
            if social['youtube'].lower() != ytobj['entry']['yt$username']['$t'].lower():
              # YT accounts are case-insensitive.  Preserve capitalization if possible.
              social['youtube'] = ytobj['entry']['yt$username']['$t']
              print("\tAdded YouTube username of %s" % social['youtube'])
          else:
            print("\tYouTube says they do not have a separate username")
            del social['youtube']
        except:
          print("Unable to get YouTube Channel ID for: %s" % social['youtube'])

      updated_media.append(m)

    print("Saving social media...")
    save_data(updated_media, "legislators-social-media.yaml")
Example #12
def main():
  regexes = {
    "youtube": [
      "https?://(?:www\\.)?youtube.com/channel/([^\\s\"/\\?#']+)",
      "https?://(?:www\\.)?youtube.com/(?:subscribe_widget\\?p=)?(?:subscription_center\\?add_user=)?(?:user/)?([^\\s\"/\\?#']+)"
    ],
    "facebook": [
      "\\('facebook.com/([^']+)'\\)",
      "https?://(?:www\\.)?facebook.com/(?:home\\.php)?(?:business/dashboard/#/)?(?:government)?(?:#!/)?(?:#%21/)?(?:#/)?pages/[^/]+/(\\d+)",
      "https?://(?:www\\.)?facebook.com/(?:profile.php\\?id=)?(?:home\\.php)?(?:#!)?/?(?:people)?/?([^/\\s\"#\\?&']+)"
    ],
    "twitter": [
      "https?://(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/]+)",
      "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)"
    ]
  }

  email_enabled = utils.flags().get('email', False)
  debug = utils.flags().get('debug', False)
  do_update = utils.flags().get('update', False)
  do_clean = utils.flags().get('clean', False)
  do_verify = utils.flags().get('verify', False)
  do_resolvefb = utils.flags().get('resolvefb', False)
  do_resolveyt = utils.flags().get('resolveyt', False)

  # default to not caching
  cache = utils.flags().get('cache', False)
  force = not cache

  if do_resolvefb:
    service = "facebook"
  elif do_resolveyt:
    service = "youtube"
  else:
    service = utils.flags().get('service', None)
  if service not in ["twitter", "youtube", "facebook"]:
    print("--service must be one of twitter, youtube, or facebook")
    exit(0)

  # load in members, orient by bioguide ID
  print("Loading current legislators...")
  current = load_data("legislators-current.yaml")

  current_bioguide = { }
  for m in current:
    if "bioguide" in m["id"]:
      current_bioguide[m["id"]["bioguide"]] = m

  print("Loading blacklist...")
  blacklist = {
    'twitter': [], 'facebook': [], 'youtube': []
  }
  for rec in csv.DictReader(open("data/social_media_blacklist.csv")):
    blacklist[rec["service"]].append(rec["pattern"])

  print("Loading whitelist...")
  whitelist = {
    'twitter': [], 'facebook': [], 'youtube': []
  }
  for rec in csv.DictReader(open("data/social_media_whitelist.csv")):
    whitelist[rec["service"]].append(rec["account"].lower())

  # reorient currently known social media by ID
  print("Loading social media...")
  media = load_data("legislators-social-media.yaml")
  media_bioguide = { }
  for m in media:
    media_bioguide[m["id"]["bioguide"]] = m


  def resolvefb():
    # in order to preserve the comment block at the top of the file,
    # copy it over into a new RtYamlList instance. We do this because
    # Python list instances can't hold other random attributes.
    import rtyaml
    updated_media = rtyaml.RtYamlList()
    if hasattr(media, '__initial_comment_block'):
      updated_media.__initial_comment_block = getattr(media, '__initial_comment_block')

    for m in media:
      social = m['social']

      if ('facebook' in social and social['facebook']) and ('facebook_id' not in social):
        graph_url = "https://graph.facebook.com/%s" % social['facebook']
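        # the Graph API is used in both directions: handles that start with digits are treated as
        # Facebook IDs needing a username lookup, anything else as a username needing an ID lookup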

        if re.match('\d+', social['facebook']):
          social['facebook_id'] = social['facebook']
          print("Looking up graph username for %s" % social['facebook'])
          fbobj = requests.get(graph_url).json()
          if 'username' in fbobj:
            print("\tGot graph username of %s" % fbobj['username'])
            social['facebook'] = fbobj['username']
          else:
            print("\tUnable to get graph username")

        else:
          try:
            print("Looking up graph ID for %s" % social['facebook'])
            fbobj = requests.get(graph_url).json()
            if 'id' in fbobj:
              print("\tGot graph ID of %s" % fbobj['id'])
              social['facebook_id'] = fbobj['id']
            else:
              print("\tUnable to get graph ID")
          except:
            print("\tUnable to get graph ID for: %s" % social['facebook'])
            social['facebook_id'] = None

      updated_media.append(m)

    print("Saving social media...")
    save_data(updated_media, "legislators-social-media.yaml")


  def resolveyt():
    # To avoid hitting quota limits, register for a YouTube 2.0 API key at
    # https://code.google.com/apis/youtube/dashboard
    # and put it below
    api_file = open('cache/youtube_api_key','r')
    api_key = api_file.read()

    bioguide = utils.flags().get('bioguide', None)

    updated_media = []
    for m in media:
      if bioguide and (m['id']['bioguide'] != bioguide):
        updated_media.append(m)
        continue

      social = m['social']

      if ('youtube' in social) or ('youtube_id' in social):

        if 'youtube' not in social:
          social['youtube'] = social['youtube_id']

        ytid = social['youtube']

        profile_url = ("http://gdata.youtube.com/feeds/api/users/%s"
        "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key))

        try:
          print("Resolving YT info for %s" % social['youtube'])
          ytreq = requests.get(profile_url)
          # print "\tFetched with status code %i..." % ytreq.status_code

          if ytreq.status_code == 404:
            # If the account name isn't valid, it's probably a redirect.
            try:
              # Try to scrape the real YouTube username
              print("\tScraping YouTube username")
              search_url = ("http://www.youtube.com/%s" % social['youtube'])
              csearch = requests.get(search_url).text

              u = re.search(r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>',csearch)

              if u:
                print("\t%s maps to %s" % (social['youtube'],u.group(1)))
                social['youtube'] = u.group(1)
                profile_url = ("http://gdata.youtube.com/feeds/api/users/%s"
                "?v=2&prettyprint=true&alt=json" % social['youtube'])

                print("\tFetching GData profile...")
                ytreq = requests.get(profile_url)
                print("\tFetched GData profile")

              else:
                raise Exception("Couldn't figure out the username format for %s" % social['youtube'])

            except:
              print("\tCouldn't locate YouTube account")
              raise

          ytobj = ytreq.json()
          social['youtube_id'] = ytobj['entry']['yt$channelId']['$t']
          print("\tResolved youtube_id to %s" % social['youtube_id'])

          # even though we have their channel ID, do they also have a username?
          if ytobj['entry']['yt$username']['$t'] != ytobj['entry']['yt$userId']['$t']:
            if social['youtube'].lower() != ytobj['entry']['yt$username']['$t'].lower():
              # YT accounts are case-insensitive.  Preserve capitalization if possible.
              social['youtube'] = ytobj['entry']['yt$username']['$t']
              print("\tAdded YouTube username of %s" % social['youtube'])
          else:
            print("\tYouTube says they do not have a separate username")
            del social['youtube']
        except:
          print("Unable to get YouTube Channel ID for: %s" % social['youtube'])

      updated_media.append(m)

    print("Saving social media...")
    save_data(updated_media, "legislators-social-media.yaml")


  def sweep():
    to_check = []

    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      possibles = [bioguide]
    else:
      possibles = list(current_bioguide.keys())

    for bioguide in possibles:
      if media_bioguide.get(bioguide, None) is None:
        to_check.append(bioguide)
      elif (media_bioguide[bioguide]["social"].get(service, None) is None) and \
        (media_bioguide[bioguide]["social"].get(service + "_id", None) is None):
        to_check.append(bioguide)
      else:
        pass

    utils.mkdir_p("cache/social_media")
    writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w'))
    writer.writerow(["bioguide", "official_full", "website", "service", "candidate", "candidate_url"])

    if len(to_check) > 0:
      rows_found = []
      for bioguide in to_check:
        candidate = candidate_for(bioguide)
        if candidate:
          url = current_bioguide[bioguide]["terms"][-1].get("url", None)
          candidate_url = "https://%s.com/%s" % (service, candidate)
          row = [bioguide, current_bioguide[bioguide]['name']['official_full'].encode('utf-8'), url, service, candidate, candidate_url]
          writer.writerow(row)
          print("\tWrote: %s" % candidate)
          rows_found.append(row)

      if email_enabled and len(rows_found) > 0:
        email_body = "Social media leads found:\n\n"
        for row in rows_found:
          email_body += ("%s\n" % row)
        utils.send_email(email_body)

  def verify():
    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      to_check = [bioguide]
    else:
      to_check = list(media_bioguide.keys())

    for bioguide in to_check:
      entry = media_bioguide[bioguide]
      current = entry['social'].get(service, None)
      if not current:
        continue

      bioguide = entry['id']['bioguide']

      candidate = candidate_for(bioguide)
      if not candidate:
        # if current is in whitelist, and none is on the page, that's okay
        if current.lower() in whitelist[service]:
          continue
        else:
          candidate = ""

      url = current_bioguide[bioguide]['terms'][-1].get('url')

      if current.lower() != candidate.lower():
        print("[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate))

  def update():
    for rec in csv.DictReader(open("cache/social_media/%s_candidates.csv" % service)):
      bioguide = rec["bioguide"]
      candidate = rec["candidate"]

      if bioguide in media_bioguide:
        media_bioguide[bioguide]['social'][service] = candidate
      else:
        new_media = {'id': {}, 'social': {}}

        new_media['id']['bioguide'] = bioguide
        thomas_id = current_bioguide[bioguide]['id'].get("thomas", None)
        govtrack_id = current_bioguide[bioguide]['id'].get("govtrack", None)
        if thomas_id:
          new_media['id']['thomas'] = thomas_id
        if govtrack_id:
          new_media['id']['govtrack'] = govtrack_id


        new_media['social'][service] = candidate
        media.append(new_media)

    print("Saving social media...")
    save_data(media, "legislators-social-media.yaml")

    # if it's a youtube update, always do the resolve
    # if service == "youtube":
    #   resolveyt()


  def clean():
    print("Loading historical legislators...")
    historical = load_data("legislators-historical.yaml")

    count = 0
    for m in historical:
      if m["id"]["bioguide"] in media_bioguide:
        media.remove(media_bioguide[m["id"]["bioguide"]])
        count += 1
    print("Removed %i out of office legislators from social media file..." % count)

    print("Saving historical legislators...")
    save_data(media, "legislators-social-media.yaml")

  def candidate_for(bioguide):
    url = current_bioguide[bioguide]["terms"][-1].get("url", None)
    if not url:
      if debug:
        print("[%s] No official website, skipping" % bioguide)
      return None

    if debug:
      print("[%s] Downloading..." % bioguide)
    cache = "congress/%s.html" % bioguide
    body = utils.download(url, cache, force, {'check_redirects': True})

    all_matches = []
    for regex in regexes[service]:
      matches = re.findall(regex, body, re.I)
      if matches:
        all_matches.extend(matches)

    if all_matches:
      for candidate in all_matches:
        passed = True
        for blacked in blacklist[service]:
          if re.search(blacked, candidate, re.I):
            passed = False

        if not passed:
          if debug:
            print("\tBlacklisted: %s" % candidate)
          continue

        return candidate
      return None

  if do_update:
    update()
  elif do_clean:
    clean()
  elif do_verify:
    verify()
  elif do_resolvefb:
    resolvefb()
  elif do_resolveyt:
    resolveyt()
  else:
    sweep()
def run():

	# Field mapping. And which fields should be turned into integers.
	# See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available.
	fieldmap = {
		"congbio": "bioguide",
		#"fec": "fec", # handled specially...
		"govtrack": "govtrack", # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
		"opensecrets": "opensecrets",
		"votesmart": "votesmart",
		"cspan": "cspan",
	}
	int_fields = ("govtrack", "votesmart", "cspan")

	# default to not caching
	cache = utils.flags().get('cache', False)

	# Load legislator files and map bioguide IDs.
	y1 = utils.load_data("legislators-current.yaml")
	y2 = utils.load_data("legislators-historical.yaml")
	bioguides = { }
	for y in y1+y2:
	  bioguides[y["id"]["bioguide"]] = y

	# Okay now the Wikipedia stuff...

	def get_matching_pages():
		# Does a Wikipedia API search for pages containing either of the
		# two templates. Returns the pages.

		page_titles = set()

		for template in ("CongLinks", "CongBio"):
			eicontinue = ""
			while True:
				# construct query URL, using the "eicontinue" of the last query to get the next batch
				url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template
				if eicontinue: url += "&eicontinue=" + eicontinue

				# load the XML
				print("Getting %s pages (%d...)" % (template, len(page_titles)))
				dom = lxml.etree.fromstring(utils.download(url, None, True)) # can't cache eicontinue probably

				for pgname in dom.xpath("query/embeddedin/ei/@title"):
					page_titles.add(pgname)

				# get the next eicontinue value and loop
				eicontinue = dom.xpath("string(query-continue/embeddedin/@eicontinue)")
				if not eicontinue: break

		return page_titles

	# Get the list of Wikipedia pages that use any of the templates we care about.
	page_list_cache_file = os.path.join(utils.cache_dir(), "legislators/wikipedia/page_titles")
	if cache and os.path.exists(page_list_cache_file):
		# Load from cache.
		matching_pages = open(page_list_cache_file).read().split("\n")
	else:
		# Query Wikipedia API and save to cache.
		matching_pages = get_matching_pages()
		utils.write(("\n".join(matching_pages)), page_list_cache_file)

	# Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon).
	matching_pages = [p for p in matching_pages if ":" not in p]

	# Load each page's content and parse the template.
	for p in sorted(matching_pages):
		if " campaign" in p: continue
		if " (surname)" in p: continue
		if "career of " in p: continue
		if "for Congress" in p: continue
		if p.startswith("List of "): continue
		if p in ("New York in the American Civil War", "Upper Marlboro, Maryland"): continue

		# Query the Wikipedia API to get the raw page content in XML,
		# and then use XPath to get the raw page text.
		url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(p.encode("utf8")) + "&export&exportnowrap"
		cache_path = "legislators/wikipedia/pages/" + p
		dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache))
		page_content = dom.xpath("string(mw:page/mw:revision/mw:text)", namespaces={ "mw": "http://www.mediawiki.org/xml/export-0.8/" })

		# Build a dict for the IDs that we want to insert into our files.
		new_ids = {
			"wikipedia": p # Wikipedia page name, with spaces for spaces (not underscores)
		}

		if "CongLinks" in page_content:
			# Parse the key/val pairs in the template.
			m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
			if not m: continue # no template?
			for arg in m.group(1).split("|"):
				if "=" not in arg: continue
				key, val = arg.split("=", 1)
				key = key.strip()
				val = val.strip()
				if val and key in fieldmap:
					try:
						if fieldmap[key] in int_fields: val = int(val)
					except ValueError:
						print("invalid value", key, val)
						continue

					if key == "opensecrets": val = val.replace("&newMem=Y", "").replace("&newmem=Y", "").replace("&cycle=2004", "").upper()
					new_ids[fieldmap[key]] = val

			if "bioguide" not in new_ids: continue
			new_ids["bioguide"] = new_ids["bioguide"].upper() # hmm
			bioguide = new_ids["bioguide"]

		else:
			m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content)
			if not m: continue # no template?
			bioguide = m.group(1).upper()


		if not bioguide in bioguides:
			print("Member not found: " + bioguide, p.encode("utf8"), "(Might have been a delegate to the Constitutional Convention.)")
			continue

		# handle FEC ids specially because they are stored in an array...
		fec_id = new_ids.get("fec")
		if fec_id: del new_ids["fec"]

		member = bioguides[bioguide]
		member["id"].update(new_ids)

		# ...finish the FEC id.
		if fec_id:
			if fec_id not in bioguides[bioguide]["id"].get("fec", []):
				bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id)

		#print p.encode("utf8"), new_ids

	utils.save_data(y1, "legislators-current.yaml")
	utils.save_data(y2, "legislators-historical.yaml")
Example #14
    for n in chunks:
      if isinstance(n, tree.Tree) and n.node == "MATCH":
        people = []
        relationship = None
        for piece in n:
          if piece.node == "RELATIONSHIP":
            relationship = " ".join([x[0] for x in piece])
          elif piece.node == "NAMES":
            for name in [x for x in piece if isinstance(x, tree.Tree)]:
              people.append(" ".join([x[0] for x in name]))
        for person in people:
          relationships.append({ "relation": relationship, "name": person})
  return relationships


debug = utils.flags().get('debug', False)

# default to caching
cache = utils.flags().get('cache', True)
force = not cache

# pick either current or historical
# order is important here, since current defaults to true
if utils.flags().get('historical', False):
  filename = "legislators-historical.yaml"
elif utils.flags().get('current', True):
  filename = "legislators-current.yaml"
else:
  print("No legislators selected.")
  exit(0)
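
# The snippets in this collection lean on a utils.flags() helper from the project's
# utils module. The function below is only a rough sketch of the kind of
# "--key=value" / "--key" parsing such a helper might do, so the flag handling above
# is easier to follow in isolation; it is not the project's actual implementation.
import sys

def flags_sketch(argv=None):
    parsed = {}
    for arg in (argv if argv is not None else sys.argv[1:]):
        if not arg.startswith("--"):
            continue
        body = arg[2:]
        if "=" in body:
            key, value = body.split("=", 1)
            parsed[key] = value
        else:
            parsed[body] = True  # bare flags like --historical act as booleans
    return parsed

if __name__ == "__main__":
    print(flags_sketch(["--historical", "--bioguide=K000105"]))
    # -> {'historical': True, 'bioguide': 'K000105'}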
Example #15
def run():
    committees_historical = load_data("committees-historical.yaml")

    # default to not caching
    flags = utils.flags()
    cache = flags.get('cache', False)

    if cache:
        from scrapelib.cache import FileCache
        scraper.cache_storage = FileCache('cache')
        scraper.cache_write_only = False
    else:
        # this scraper expects to be run with --cache
        raise RuntimeError("re-run with --cache to enable the scrapelib FileCache")

    # map thomas_id's to their dicts
    committees_historical_ref = {}
    for cx in committees_historical:
        committees_historical_ref[cx["thomas_id"]] = cx

    # pick the range of committees to get
    single_congress = flags.get('congress', False)
    if single_congress:
        start_congress = int(single_congress)
        end_congress = int(single_congress) + 1
    else:
        start_congress = 113
        end_congress = CURRENT_CONGRESS + 1

    urls = {
        'senate':
        'https://www.govinfo.gov/bulkdata/BILLSTATUS/{congress}/s/BILLSTATUS-{congress}-s.zip',
        'house':
        'https://www.govinfo.gov/bulkdata/BILLSTATUS/{congress}/hr/BILLSTATUS-{congress}-hr.zip'
    }

    all_committees = {'house': {}, 'senate': {}}

    for congress in range(start_congress, end_congress):
        for chamber, bill_status_url in urls.items():
            chamber_committees = all_committees[chamber]

            url = bill_status_url.format(congress=congress)
            response = scraper.get(url)

            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
                for name in z.namelist():
                    if name.startswith('BILLSTATUS'):
                        with z.open(name) as xml_file:
                            bill_status = lxml.etree.parse(xml_file)
                            committees = bill_status.xpath('//billCommittees/item')
                            for committee in committees:
                                code = str(committee.xpath('./systemCode/text()')[0])
                                name = str(committee.xpath('./name/text()')[0])
                                if name.endswith(' Committee'):
                                    name = name[:-10]
                                if code not in chamber_committees:
                                    chamber_committees[code] = {
                                        'names': {congress: name},
                                        'subcommittees': {}
                                    }
                                elif congress not in chamber_committees[code]['names']:
                                    chamber_committees[code]['names'][congress] = name

                                subcommittees_d = chamber_committees[code]['subcommittees']
                                for subcommittee in committee.xpath('./subcommittees/item'):
                                    code = str(subcommittee.xpath('./systemCode/text()')[0])
                                    name = str(subcommittee.xpath('./name/text()')[0])
                                    if name.endswith(' Subcommittee'):
                                        name = name[:-13]
                                    if code not in subcommittees_d:
                                        subcommittees_d[code] = {congress: name}
                                    elif congress not in subcommittees_d[code]:
                                        subcommittees_d[code][congress] = name

            import pprint
            pprint.pprint(chamber_committees)
            print(len(chamber_committees))

    for chamber, committees in all_committees.items():
        for code, committee in committees.items():
            id = str(code).upper()

            id = id[:-2]

            if id in committees_historical_ref:
                # Update existing record.
                cx = committees_historical_ref[id]

            else:
                # Create a new record.
                cx = OrderedDict()
                committees_historical_ref[id] = cx
                cx['type'] = chamber.lower()
                # Joint committees already carry their full name; House and Senate
                # committees carry only a partial name, so prefix it with the chamber.
                earliest_name = committee['names'][min(committee['names'])]
                if id[0] != "J":
                    cx['name'] = chamber.capitalize() + " Committee on " + earliest_name
                else:
                    cx['name'] = earliest_name
                cx['thomas_id'] = id
                committees_historical.append(cx)

            for code, subcommittee in committee['subcommittees'].items():

                for sx in cx.setdefault('subcommittees', []):
                    if sx['thomas_id'] == code[-2:]:
                        # found existing record
                        break
                else:
                    # 'break' not executed, so create a new record
                    sx = OrderedDict()
                    sx['name'] = subcommittee[min(subcommittee)]
                    sx['thomas_id'] = code[-2:]
                    cx['subcommittees'].append(sx)

                # track the congresses this subcommittee appeared in, and its name in
                # each congress, for new and existing records alike (this mirrors the
                # full-committee bookkeeping just below)
                sx.setdefault('congresses', [])
                sx.setdefault('names', {})

                for congress, name in subcommittee.items():
                    if congress not in sx['congresses']:
                        sx['congresses'].append(congress)
                        sx['names'][congress] = name

            cx.setdefault('congresses', [])
            cx.setdefault('names', {})

            for congress, name in committee['names'].items():
                if congress not in cx['congresses']:
                    cx['congresses'].append(congress)
                    cx['names'][congress] = name

    # TODO
    # after checking diff on first commit, we should re-sort
    #committees_historical.sort(key = lambda c : c["thomas_id"])
    #for c in committees_historical:
    #  c.get("subcommittees", []).sort(key = lambda s : s["thomas_id"])

    save_data(committees_historical, "committees-historical.yaml")
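
# run() above uses Python's for/else "find or create" idiom twice: the else block
# only executes when the loop finishes without hitting break. A standalone
# illustration with made-up committee codes:
def find_or_create(records, thomas_id):
    for rec in records:
        if rec["thomas_id"] == thomas_id:
            break                       # found an existing record
    else:
        rec = {"thomas_id": thomas_id}  # 'break' never ran, so create a new record
        records.append(rec)
    return rec

if __name__ == "__main__":
    records = [{"thomas_id": "01"}]
    print(find_or_create(records, "01") is records[0])  # True: existing record reused
    find_or_create(records, "02")
    print([r["thomas_id"] for r in records])            # ['01', '02']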
Example #16
def run():
    committee_membership = {}

    committees_current = load_data("committees-current.yaml")
    memberships_current = load_data("committee-membership-current.yaml")

    # default to not caching
    cache = utils.flags().get('cache', False)
    force = not cache

    # map house/senate committee IDs to their dicts
    house_ref = {}
    for cx in committees_current:
        if "house_committee_id" in cx:
            house_ref[cx["house_committee_id"]] = cx
    senate_ref = {}
    for cx in committees_current:
        if "senate_committee_id" in cx:
            senate_ref[cx["senate_committee_id"]] = cx

    # map state/district to current representatives and state/lastname to current senators
    # since the House/Senate pages do not provide IDs for Members of Congress
    today = datetime.datetime.now().date()
    legislators_current = load_data("legislators-current.yaml")
    congressmen = {}
    senators = {}
    for moc in legislators_current:
        term = moc["terms"][-1]
        if today < parse_date(term["start"]) or today > parse_date(
                term["end"]):
            raise ValueError("Member's last listed term is not current: " +
                             repr(moc) + " / " + term["start"])
        if term["type"] == "rep":
            congressmen["%s%02d" % (term["state"], term["district"])] = moc
        elif term["type"] == "sen":
            for n in [moc["name"]] + moc.get("other_names", []):
                senators[(term["state"], n["last"])] = moc

    # Scrape clerk.house.gov...

    def scrape_house_alt():
        for id, cx in list(house_ref.items()):
            scrape_house_committee(cx, cx["thomas_id"], id + "00")

    def scrape_house():
        """The old way of scraping House committees was to start with the committee list
    at the URL below, but this page no longer has links to the committee info pages
    even though those pages exist. Preserving this function in case we need it later."""
        url = "http://clerk.house.gov/committee_info/index.aspx"
        body = download(url, "committees/membership/house.html", force)
        for id, name in re.findall(
                r'<a href="/committee_info/index.aspx\?comcode=(..)00">(.*)</a>',
                body, re.I):
            if id not in house_ref:
                print("Unrecognized committee:", id, name)
                continue
            cx = house_ref[id]
            scrape_house_committee(cx, cx["thomas_id"], id + "00")

    def scrape_house_committee(cx, output_code, house_code):
        # load the House Clerk's committee membership page for the committee
        # (it is encoded in utf-8 even though the page indicates otherwise, and
        # while we don't really care, it helps our sanity check that compares
        # names)
        url = "http://clerk.house.gov/committee_info/index.aspx?%s=%s" % (
            'comcode' if house_code[-2:] == '00' else 'subcomcode', house_code)
        body = download(url,
                        "committees/membership/house/%s.html" % house_code,
                        force)
        dom = lxml.html.parse(io.StringIO(body)).getroot()

        # update official name metadata
        if house_code[-2:] == "00":
            cx["name"] = "House " + str(
                dom.cssselect("#com_display h3")[0].text_content())
        else:
            cx["name"] = str(
                dom.cssselect("#subcom_title h4")[0].text_content())

        # update address/phone metadata
        address_info = re.search(
            r"""Mailing Address:\s*(.*\S)\s*Telephone:\s*(\(202\) .*\S)""",
            dom.cssselect("#address")[0].text_content(), re.I | re.S)
        if not address_info:
            raise Exception("Failed to parse address info in %s." % house_code)
        cx["address"] = address_info.group(1)
        cx["address"] = re.sub(r"\s+", " ", cx["address"])
        cx["address"] = re.sub(
            r"(.*\S)(Washington, DC \d+)\s*(-\d+)?",
            lambda m: m.group(1) + "; " + m.group(2) +
            (m.group(3) if m.group(3) else ""), cx["address"])
        cx["phone"] = address_info.group(2)

        # get the ratio line to use in a sanity check later
        ratio = dom.cssselect("#ratio")
        if len(ratio):  # some committees are missing
            ratio = re.search(r"Ratio (\d+)/(\d+)", ratio[0].text_content())
        else:
            ratio = None

        # scan the membership, which is listed by party
        for i, party, nodename in ((1, 'majority', 'primary'), (2, 'minority',
                                                                'secondary')):
            ctr = 0
            for rank, node in enumerate(
                    dom.cssselect("#%s_group li" % nodename)):
                ctr += 1
                lnk = node.cssselect('a')
                if len(lnk) == 0:
                    if node.text_content() == "Vacancy": continue
                    raise ValueError("Failed to parse a <li> node.")
                moc = lnk[0].get('href')
                m = re.search(r"statdis=([A-Z][A-Z]\d\d)", moc)
                if not m:
                    raise ValueError("Failed to parse member link: " + moc)
                if not m.group(1) in congressmen:
                    print("Vacancy discrepancy? " + m.group(1))
                    continue

                moc = congressmen[m.group(1)]

                # Sanity check that the name matches the name in our data.
                found_name = node.cssselect('a')[0].text_content()
                found_name = re.sub(r"\s+", " ", found_name)  # fix whitespace
                found_name = found_name.replace("'", "’")  # fix smart apos
                if moc['name'].get("official_full", None) is None:
                    print("No official_full field for %s" % found_name)
                    continue
                if found_name != moc['name']['official_full']:
                    print(
                        "Name mismatch: %s (in our file) vs %s (on the Clerk page)"
                        % (moc['name']['official_full'], found_name))

                entry = OrderedDict()
                entry["name"] = moc['name']['official_full']
                entry["party"] = party
                entry["rank"] = rank + 1
                if rank == 0:
                    entry["title"] = "Chair" if entry[
                        "party"] == "majority" else "Ranking Member"  # not explicit, frown
                entry.update(ids_from(moc["id"]))

                committee_membership.setdefault(output_code, []).append(entry)

                # the .tail attribute has the text to the right of the link
                m = re.match(r", [A-Z][A-Z](,\s*)?(.*\S)?", lnk[0].tail)
                if m.group(2):
                    # Chairman, Vice Chair, etc. (all but Ex Officio) started appearing on subcommittees around Feb 2014.
                    # For the chair, this should overwrite the implicit title given for the rank 0 majority party member.
                    if m.group(2) in ("Chair", "Chairman", "Chairwoman"):
                        entry["title"] = "Chair"
                    elif m.group(2) in ("Vice Chair", "Vice Chairman"):
                        entry["title"] = "Vice Chair"

                    elif m.group(2) == "Ex Officio":
                        entry["title"] = m.group(2)

                    else:
                        raise ValueError(
                            "Unrecognized title information '%s' in %s." %
                            (m.group(2), url))

            # sanity check we got the right number of nodes
            if ratio and ctr != int(ratio.group(i)):
                raise ValueError(
                    "Parsing didn't get the right count of members.")

        # scan for subcommittees
        for subcom in dom.cssselect("#subcom_list li a"):
            m = re.search("subcomcode=(..(\d\d))", subcom.get('href'))
            if not m: raise ValueError("Failed to parse subcommittee link.")

            for sx in cx['subcommittees']:
                if sx["thomas_id"] == m.group(2):
                    break
            else:
                print("Subcommittee not found, creating it", output_code,
                      m.group(1))
                sx = OrderedDict()
                sx['name'] = "[not initialized]"  # will be set inside of scrape_house_committee
                sx['thomas_id'] = m.group(2)
                cx['subcommittees'].append(sx)
            scrape_house_committee(sx, cx["thomas_id"] + sx["thomas_id"],
                                   m.group(1))

    # Scrape senate.gov....
    def scrape_senate():
        url = "https://www.senate.gov/pagelayout/committees/b_three_sections_with_teasers/membership.htm"
        body = download(url, "committees/membership/senate.html", force)

        for id, name in re.findall(
                r'value="/general/committee_membership/committee_memberships_(....).htm">(.*?)</option>',
                body, re.I | re.S):
            if id not in senate_ref:
                print("Unrecognized committee:", id, name)
                continue

            cx = senate_ref[id]
            is_joint = (id[0] == "J")

            # Scrape some metadata on the HTML page first.

            committee_url = "https://www.senate.gov/general/committee_membership/committee_memberships_%s.htm" % id
            print("[%s] Fetching members for %s (%s)" %
                  (id, name, committee_url))
            body2 = download(committee_url,
                             "committees/membership/senate/%s.html" % id,
                             force)

            if not body2:
                print("\tcommittee page not good:", committee_url)
                continue

            m = re.search(
                r'<span class="contenttext"><a href="(http://(.*?).senate.gov/)">',
                body2, re.I)
            if m:
                cx["url"] = m.group(1)

            # Use the XML for the rest.

            print("\tDownloading XML...")
            committee_url = "https://www.senate.gov/general/committee_membership/committee_memberships_%s.xml" % id

            body3 = download(committee_url,
                             "committees/membership/senate/%s.xml" % id, force)
            dom = lxml.etree.fromstring(
                body3.encode("utf8")
            )  # must be bytes to parse if there is an encoding declaration inside the string

            cx["name"] = dom.xpath("committees/committee_name")[0].text
            if id[0] != "J" and id[0:2] != 'SC':
                cx["name"] = "Senate " + cx["name"]

            majority_party = dom.xpath("committees/majority_party")[0].text

            # update full committee members
            committee_membership[id] = []
            for member in dom.xpath("committees/members/member"):
                scrape_senate_member(committee_membership[id], member,
                                     majority_party, is_joint)

            # update subcommittees
            for subcom in dom.xpath("committees/subcommittee"):
                scid = subcom.xpath("committee_code")[0].text[4:]
                for sx in cx.get('subcommittees', []):
                    if sx["thomas_id"] == scid:
                        break
                else:
                    print("Subcommittee not found, creating it", scid, name)
                    sx = OrderedDict()
                    sx['thomas_id'] = scid
                    cx.setdefault('subcommittees', []).append(sx)

                # update metadata
                name = subcom.xpath("subcommittee_name")[0].text
                sx["name"] = name.strip()
                sx["name"] = re.sub(r"^\s*Subcommittee on\s*", "", sx["name"])
                sx["name"] = re.sub(r"\s+", " ", sx["name"])

                committee_membership[id + scid] = []
                for member in subcom.xpath("members/member"):
                    scrape_senate_member(committee_membership[id + scid],
                                         member, majority_party, is_joint)

    def scrape_senate_member(output_list, membernode, majority_party,
                             is_joint):
        last_name = membernode.xpath("name/last")[0].text
        state = membernode.xpath("state")[0].text
        party = "majority" if membernode.xpath(
            "party")[0].text == majority_party else "minority"
        title = membernode.xpath("position")[0].text
        if title == "Member": title = None
        if title == "Ranking": title = "Ranking Member"

        # look up senator by state and last name
        if (state, last_name) not in senators:
            print("\t[%s] Unknown member: %s" % (state, last_name))
            return None

        moc = senators[(state, last_name)]

        entry = OrderedDict()
        if 'official_full' in moc['name']:
            entry["name"] = moc['name']['official_full']
        else:
            print("missing name->official_full field for",
                  moc['id']['bioguide'])
        entry["party"] = party
        entry["rank"] = len([
            e for e in output_list if e["party"] == entry["party"]
        ]) + 1  # how many have we seen so far in this party, +1
        if title: entry["title"] = title
        entry.update(ids_from(moc["id"]))
        if is_joint: entry["chamber"] = "senate"

        output_list.append(entry)

        # sort by party, then by rank, since we get the nodes in the XML in a rough seniority order that ignores party
        # should be done once at the end, but cleaner to do it here
        output_list.sort(key=lambda e: (e["party"] != "majority", e["rank"]))

    # stick to a specific small set of official IDs to cross-link members
    # this limits the IDs from going out of control in this file, while
    # preserving us flexibility to be inclusive of IDs in the main leg files
    def ids_from(moc):
        ids = OrderedDict()
        for id in ["thomas", "bioguide"]:
            if id in moc:
                ids[id] = moc[id]
        if len(ids) == 0:
            raise ValueError(
                "Missing an official ID for this legislator, won't be able to link back"
            )
        return ids

    def restore_house_members_on_joint_committees():
        # The House doesn't publish joint committee members, but we're manually gathering
        # that. Add them back into the output from whatever we have on disk. Put them after
        # Senate members.
        for c, mbrs in list(memberships_current.items()):
            if c[0] != "J": continue
            for m in mbrs:
                if m["chamber"] != "house": continue
                committee_membership[c].append(m)

    # MAIN

    scrape_house()
    scrape_senate()
    restore_house_members_on_joint_committees()

    save_data(committee_membership, "committee-membership-current.yaml")
    save_data(committees_current, "committees-current.yaml")
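
# The membership lists above are ordered with key=(e["party"] != "majority", e["rank"]).
# Because False sorts before True, majority members come first and each party is
# ordered by rank. A standalone check with made-up entries:
members = [
    {"name": "C", "party": "minority", "rank": 1},
    {"name": "B", "party": "majority", "rank": 2},
    {"name": "A", "party": "majority", "rank": 1},
]
members.sort(key=lambda e: (e["party"] != "majority", e["rank"]))
print([m["name"] for m in members])  # ['A', 'B', 'C']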
def main():
  regexes = {
    "youtube": [
      "https?://(?:www\\.)?youtube.com/(channel/[^\\s\"/\\?#']+)",
      "https?://(?:www\\.)?youtube.com/(?:subscribe_widget\\?p=)?(?:subscription_center\\?add_user=)?(?:user/)?([^\\s\"/\\?#']+)"
    ],
    "facebook": [
      "\\('facebook.com/([^']+)'\\)",
      "https?://(?:www\\.)?facebook.com/(?:home\\.php)?(?:business/dashboard/#/)?(?:government)?(?:#!/)?(?:#%21/)?(?:#/)?pages/[^/]+/(\\d+)",
      "https?://(?:www\\.)?facebook.com/(?:profile.php\\?id=)?(?:home\\.php)?(?:#!)?/?(?:people)?/?([^/\\s\"#\\?&']+)"
    ],
    "twitter": [
      "https?://(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/]+)",
      "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)"
    ]
  }

  debug = utils.flags().get('debug', False)
  do_update = utils.flags().get('update', False)
  do_clean = utils.flags().get('clean', False)
  do_verify = utils.flags().get('verify', False)
  do_resolvefb = utils.flags().get('resolvefb', False)
  do_resolveyt = utils.flags().get('resolveyt', False)

  # default to not caching
  cache = utils.flags().get('cache', False)
  force = not cache

  if do_resolvefb:
    service = "facebook"
  elif do_resolveyt:
    service = "youtube"
  else:
    service = utils.flags().get('service', None)
  if service not in ["twitter", "youtube", "facebook"]:
    print "--service must be one of twitter, youtube, or facebook"
    exit(0)

  # load in members, orient by bioguide ID
  print "Loading current legislators..."
  current = load_data("legislators-current.yaml")

  current_bioguide = { }
  for m in current:
    if m["id"].has_key("bioguide"):
      current_bioguide[m["id"]["bioguide"]] = m

  print "Loading blacklist..."
  blacklist = {
    'twitter': [], 'facebook': [], 'youtube': []
  }
  for rec in csv.DictReader(open("data/social_media_blacklist.csv")):
    blacklist[rec["service"]].append(rec["pattern"])

  print "Loading whitelist..."
  whitelist = {
    'twitter': [], 'facebook': [], 'youtube': []
  }
  for rec in csv.DictReader(open("data/social_media_whitelist.csv")):
    whitelist[rec["service"]].append(rec["account"].lower())

  # reorient currently known social media by ID
  print "Loading social media..."
  media = load_data("legislators-social-media.yaml")
  media_bioguide = { }
  for m in media:
    media_bioguide[m["id"]["bioguide"]] = m


  def resolvefb():
    updated_media = []
    for m in media:
      social = m['social']

      if 'facebook' in social and social['facebook']:
        graph_url = "https://graph.facebook.com/%s" % social['facebook']

        if re.match('\d+', social['facebook']):
          social['facebook_id'] = social['facebook']
          fbobj = requests.get(graph_url).json()
          if 'username' in fbobj:
            social['facebook'] = fbobj['username']

        else:
          try:
            social['facebook_id'] = requests.get(graph_url).json()['id']
          except:
            print "Unable to get graph ID for: %s" % social['facebook']
            social['facebook_id'] = None

      updated_media.append(m)

    print "Saving social media..."
    save_data(updated_media, "legislators-social-media.yaml")


  def resolveyt():
    # To avoid hitting quota limits, register for a YouTube 2.0 API key at
    # https://code.google.com/apis/youtube/dashboard
    # and put it below
    api_file = open('cache/youtube_api_key','r')
    api_key = api_file.read()

    updated_media = []
    for m in media:
      social = m['social']

      if 'youtube' in social and (social['youtube'] or social.get('youtube_id')):

        if not social['youtube']:
          social['youtube'] = social['youtube_id']

        if re.match('^channel/',social['youtube']):
          ytid = social['youtube'][8:]
        else:
          ytid = social['youtube']

        profile_url = ("http://gdata.youtube.com/feeds/api/users/%s"
        "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key))

        try:
          ytreq = requests.get(profile_url)
          if ytreq.status_code == 404:
            # If the account name isn't valid, it's probably a redirect.
            try:
              # Try to scrape the real YouTube username
              search_url = ("http://www.youtube.com/%s" % social['youtube'])
              csearch = requests.get(search_url).text.encode('ascii','ignore')
              u = re.search(r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>',csearch)

              if u:
                print "%s maps to %s" % (social['youtube'],u.group(1))
                social['youtube'] = u.group(1)
                profile_url = ("http://gdata.youtube.com/feeds/api/users/%s"
                "?v=2&prettyprint=true&alt=json" % social['youtube'])
                ytreq = requests.get(profile_url)

              else:
                raise Exception("Couldn't figure out the username format for %s" % social['youtube'])

            except:
              print "Search couldn't locate YouTube account for %s" % social['youtube']
              raise

          ytobj = ytreq.json()
          social['youtube_id'] = ytobj['entry']['yt$channelId']['$t']

          if ytobj['entry']['yt$username']['$t'] != ytobj['entry']['yt$userId']['$t']:
            if social['youtube'].lower() != ytobj['entry']['yt$username']['$t']:
              # YT accounts are case-insensitive.  Preserve capitalization if possible.
              social['youtube'] = ytobj['entry']['yt$username']['$t']

          else:
            del social['youtube']
        except:
          print "Unable to get YouTube Channel ID for: %s" % social['youtube']
      updated_media.append(m)

    print "Saving social media..."
    save_data(updated_media, "legislators-social-media.yaml")


  def sweep():
    to_check = []

    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      possibles = [bioguide]
    else:
      possibles = current_bioguide.keys()

    for bioguide in possibles:
      if media_bioguide.get(bioguide, None) is None:
        to_check.append(bioguide)
      elif media_bioguide[bioguide]["social"].get(service, None) is None:
        to_check.append(bioguide)
      else:
        pass

    utils.mkdir_p("cache/social_media")
    writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w'))
    writer.writerow(["bioguide", "official_full", "website", "service", "candidate", "candidate_url"])

    for bioguide in to_check:
      candidate = candidate_for(bioguide)
      if candidate:
        url = current_bioguide[bioguide]["terms"][-1].get("url", None)
        candidate_url = "https://%s.com/%s" % (service, candidate)
        writer.writerow([bioguide, current_bioguide[bioguide]['name']['official_full'].encode('utf-8'), url, service, candidate, candidate_url])
        print "\tWrote: %s" % candidate

  def verify():
    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      to_check = [bioguide]
    else:
      to_check = media_bioguide.keys()

    for bioguide in to_check:
      entry = media_bioguide[bioguide]
      current = entry['social'].get(service, None)
      if not current:
        continue

      bioguide = entry['id']['bioguide']

      candidate = candidate_for(bioguide)
      if not candidate:
        # if current is in whitelist, and none is on the page, that's okay
        if current.lower() in whitelist[service]:
          continue
        else:
          candidate = ""

      url = current_bioguide[bioguide]['terms'][-1].get('url')

      if current.lower() != candidate.lower():
        print "[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate)

  def update():
    for rec in csv.DictReader(open("cache/social_media/%s_candidates.csv" % service)):
      bioguide = rec["bioguide"]
      candidate = rec["candidate"]

      if media_bioguide.has_key(bioguide):
        media_bioguide[bioguide]['social'][service] = candidate
      else:
        new_media = {'id': {}, 'social': {}}

        new_media['id']['bioguide'] = bioguide
        thomas_id = current_bioguide[bioguide]['id'].get("thomas", None)
        if thomas_id:
          new_media['id']['thomas'] = thomas_id

        new_media['social'][service] = candidate
        media.append(new_media)

    print "Saving social media..."
    save_data(media, "legislators-social-media.yaml")

  def clean():
    print "Loading historical legislators..."
    historical = load_data("legislators-historical.yaml")

    count = 0
    for m in historical:
      if media_bioguide.has_key(m["id"]["bioguide"]):
        media.remove(media_bioguide[m["id"]["bioguide"]])
        count += 1
    print "Removed %i out of office legislators from social media file..." % count

    print "Saving social media..."
    save_data(media, "legislators-social-media.yaml")

  def candidate_for(bioguide):
    url = current_bioguide[bioguide]["terms"][-1].get("url", None)
    if not url:
      if debug:
        print "[%s] No official website, skipping" % bioguide
      return None

    if debug:
      print "[%s] Downloading..." % bioguide
    cache = "congress/%s.html" % bioguide
    body = utils.download(url, cache, force)

    all_matches = []
    for regex in regexes[service]:
      matches = re.findall(regex, body, re.I)
      if matches:
        all_matches.extend(matches)

    if all_matches:
      for candidate in all_matches:
        passed = True
        for blacked in blacklist[service]:
          if re.search(blacked, candidate, re.I):
            passed = False

        if not passed:
          if debug:
            print "\tBlacklisted: %s" % candidate
          continue

        return candidate
      return None

  if do_update:
    update()
  elif do_clean:
    clean()
  elif do_verify:
    verify()
  elif do_resolvefb:
    resolvefb()
  elif do_resolveyt:
    resolveyt()
  else:
    sweep()
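
# A standalone sketch of how candidate_for() above applies a service regex to an
# official site's HTML and then drops blacklisted matches. The pattern is a trimmed
# version of the twitter regex above, and the HTML body and blacklist entry are
# made up; the real script downloads the member's site and loads
# data/social_media_blacklist.csv.
import re

TWITTER_REGEX = r"https?://(?:www\.)?twitter.com/(?:#!/)?@?([^\s\"'/]+)"

def candidate_from(body, blacklist):
    for handle in re.findall(TWITTER_REGEX, body, re.I):
        if any(re.search(pattern, handle, re.I) for pattern in blacklist):
            continue  # blacklisted, e.g. a generic share-widget account
        return handle
    return None

if __name__ == "__main__":
    html = '<a href="https://twitter.com/ExampleRep">Follow</a> <a href="https://twitter.com/share">Share</a>'
    print(candidate_from(html, blacklist=["^share$"]))  # ExampleRep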
Example #18
#  --bioguide: do *only* a single legislator

import lxml.html, lxml.etree, io
import datetime
import re
import utils
from utils import download, load_data, save_data, parse_date

def birthday_for(string):
  pattern = "born(.+?)((?:January|February|March|April|May|June|July|August|September|October|November|December),? \\d{1,2},? \\d{4})"
  match = re.search(pattern, string, re.I)
  if match:
    if len(re.findall(";", match.group(1))) <= 1:
      return match.group(2).strip()

debug = utils.flags().get('debug', False)

# default to caching
cache = utils.flags().get('cache', True)
force = not cache

# pick either current or historical
# order is important here, since current defaults to true
if utils.flags().get('historical', False):
  filename = "legislators-historical.yaml"
elif utils.flags().get('current', True):
  filename = "legislators-current.yaml"
else:
  print("No legislators selected.")
  exit(0)
def run():
    today = datetime.datetime.now().date()

    # default to not caching
    cache = utils.flags().get('cache', False)
    force = not cache

    y = load_data("legislators-current.yaml")

    for moc in y:
        try:
            term = moc["terms"][-1]
        except IndexError:
            print("Member has no terms", moc)
            continue

        if term["type"] != "rep": continue

        if today < parse_date(term["start"]) or today > parse_date(
                term["end"]):
            print("Member's last listed term is not current", moc,
                  term["start"])
            continue

        # Specify districts e.g. WA-02 on the command line to only update those.
        # if len(sys.argv) > 1 and ("%s-%02d" % (term["state"], term["district"])) not in sys.argv: continue

        if "class" in term: del term["class"]

        url = "http://clerk.house.gov/member_info/mem_contact_info.aspx?statdis=%s%02d" % (
            term["state"], term["district"])
        cache = "legislators/house/%s%02d.html" % (term["state"],
                                                   term["district"])
        try:
            # the meta tag say it's iso-8859-1, but... names are actually in utf8...
            body = download(url, cache, force)
            dom = lxml.html.parse(io.StringIO(body)).getroot()
        except lxml.etree.XMLSyntaxError:
            print("Error parsing: ", url)
            continue

        name = str(dom.cssselect("#results h3")[0].text_content())
        addressinfo = str(dom.cssselect("#results p")[0].text_content())

        # Sanity check that the name is similar.
        if name != moc["name"].get("official_full", ""):
            cfname = moc["name"]["first"] + " " + moc["name"]["last"]
            print("Warning: Are these the same people?", name.encode("utf8"),
                  "|", cfname.encode("utf8"))

        # Parse the address out of the address p tag.
        addressinfo = "; ".join(line.strip()
                                for line in addressinfo.split("\n")
                                if line.strip() != "")
        m = re.match(
            r"[\w\s]+-(\d+(st|nd|rd|th)|At Large|Delegate|Resident Commissioner), ([A-Za-z]*)(.+); Phone: (.*)",
            addressinfo, re.DOTALL)
        if not m:
            print("Error parsing address info: ", name.encode("utf8"), ":",
                  addressinfo.encode("utf8"))
            continue

        address = m.group(4)
        phone = re.sub(
            "^\((\d\d\d)\) ", lambda m: m.group(1) + "-", m.group(5)
        )  # replace (XXX) area code with XXX- for compatibility w/ existing format

        office = address.split(";")[0].replace("HOB", "House Office Building")

        moc["name"]["official_full"] = name
        term["address"] = address
        term["office"] = office
        term["phone"] = phone

    save_data(y, "legislators-current.yaml")
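
# The phone handling in run() above rewrites a leading "(XXX) " area code as "XXX-"
# so the value matches the format already used in the data files. A standalone check
# with a made-up number:
import re

def normalize_phone(phone):
    return re.sub(r"^\((\d\d\d)\) ", lambda m: m.group(1) + "-", phone)

if __name__ == "__main__":
    print(normalize_phone("(202) 225-0000"))  # 202-225-0000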
Example #20
# Scrape house.gov and senate.gov for current committee membership,
# and updates the committees-current.yaml file with metadata including
# name, url, address, and phone number.

import re, lxml.html, lxml.etree, StringIO, datetime
from collections import OrderedDict
import utils
from utils import download, load_data, save_data, parse_date, CURRENT_CONGRESS

committee_membership = {}

committees_current = load_data("committees-current.yaml")
memberships_current = load_data("committee-membership-current.yaml")

# default to not caching
cache = utils.flags().get('cache', False)
force = not cache

# map house/senate committee IDs to their dicts
house_ref = {}
for cx in committees_current:
    if "house_committee_id" in cx:
        house_ref[cx["house_committee_id"]] = cx
senate_ref = {}
for cx in committees_current:
    if "senate_committee_id" in cx:
        senate_ref[cx["senate_committee_id"]] = cx

# map state/district to current representatives and state/lastname to current senators
# since the House/Senate pages do not provide IDs for Members of Congress
today = datetime.datetime.now().date()
예제 #21
0
def run(options):
  cache = utils.flags().get('cache', False)
  force = not cache
  scrape(options)
예제 #22
0
def run():

    # Field mapping. And which fields should be turned into integers.
    # See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available.
    fieldmap = {
        "congbio": "bioguide",
        #"fec": "fec", # handled specially...
        "govtrack":
        "govtrack",  # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
        "opensecrets": "opensecrets",
        "votesmart": "votesmart",
        "cspan": "cspan",
    }
    int_fields = ("govtrack", "votesmart", "cspan")

    # default to not caching
    cache = utils.flags().get('cache', False)

    # Load legislator files and map bioguide IDs.
    y1 = utils.load_data("legislators-current.yaml")
    y2 = utils.load_data("legislators-historical.yaml")
    bioguides = {}
    for y in y1 + y2:
        bioguides[y["id"]["bioguide"]] = y

    # Okay now the Wikipedia stuff...

    def get_matching_pages():
        # Does a Wikipedia API search for pages containing either of the
        # two templates. Returns the pages.

        page_titles = set()

        for template in ("CongLinks", "CongBio"):
            eicontinue = ""
            while True:
                # construct query URL, using the "eicontinue" of the last query to get the next batch
                url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template
                if eicontinue: url += "&eicontinue=" + eicontinue

                # load the XML
                print("Getting %s pages (%d...)" %
                      (template, len(page_titles)))
                dom = lxml.etree.fromstring(utils.download(
                    url, None, True))  # can't cache eicontinue probably

                for pgname in dom.xpath("query/embeddedin/ei/@title"):
                    page_titles.add(pgname)

                # get the next eicontinue value and loop
                eicontinue = dom.xpath(
                    "string(query-continue/embeddedin/@eicontinue)")
                if not eicontinue: break

        return page_titles

    # Get the list of Wikipedia pages that use any of the templates we care about.
    page_list_cache_file = os.path.join(utils.cache_dir(),
                                        "legislators/wikipedia/page_titles")
    if cache and os.path.exists(page_list_cache_file):
        # Load from cache.
        matching_pages = open(page_list_cache_file).read().split("\n")
    else:
        # Query Wikipedia API and save to cache.
        matching_pages = get_matching_pages()
        utils.write(("\n".join(matching_pages)), page_list_cache_file)

    # Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon).
    matching_pages = [p for p in matching_pages if ":" not in p]

    # Load each page's content and parse the template.
    for p in sorted(matching_pages):
        if " campaign" in p: continue
        if " (surname)" in p: continue
        if "career of " in p: continue
        if "for Congress" in p: continue
        if p.startswith("List of "): continue
        if p in ("New York in the American Civil War",
                 "Upper Marlboro, Maryland"):
            continue

        # Query the Wikipedia API to get the raw page content in XML,
        # and then use XPath to get the raw page text.
        url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(
            p.encode("utf8")) + "&export&exportnowrap"
        cache_path = "legislators/wikipedia/pages/" + p
        dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache))
        page_content = dom.xpath(
            "string(mw:page/mw:revision/mw:text)",
            namespaces={"mw": "http://www.mediawiki.org/xml/export-0.8/"})

        # Build a dict for the IDs that we want to insert into our files.
        new_ids = {
            "wikipedia":
            p  # Wikipedia page name, with spaces for spaces (not underscores)
        }

        if "CongLinks" in page_content:
            # Parse the key/val pairs in the template.
            m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
            if not m: continue  # no template?
            for arg in m.group(1).split("|"):
                if "=" not in arg: continue
                key, val = arg.split("=", 1)
                key = key.strip()
                val = val.strip()
                if val and key in fieldmap:
                    try:
                        if fieldmap[key] in int_fields: val = int(val)
                    except ValueError:
                        print("invalid value", key, val)
                        continue

                    if key == "opensecrets":
                        val = val.replace("&newMem=Y", "").replace(
                            "&newmem=Y", "").replace("&cycle=2004",
                                                     "").upper()
                    new_ids[fieldmap[key]] = val

            if "bioguide" not in new_ids: continue
            new_ids["bioguide"] = new_ids["bioguide"].upper()  # hmm
            bioguide = new_ids["bioguide"]

        else:
            m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content)
            if not m: continue  # no template?
            bioguide = m.group(1).upper()

        if not bioguide in bioguides:
            print(
                "Member not found: " + bioguide, p,
                "(Might have been a delegate to the Constitutional Convention.)"
            )
            continue

        # handle FEC ids specially because they are stored in an array...
        fec_id = new_ids.get("fec")
        if fec_id: del new_ids["fec"]

        member = bioguides[bioguide]
        member["id"].update(new_ids)

        # ...finish the FEC id.
        if fec_id:
            if fec_id not in bioguides[bioguide]["id"].get("fec", []):
                bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id)

        #print p.encode("utf8"), new_ids

    utils.save_data(y1, "legislators-current.yaml")
    utils.save_data(y2, "legislators-historical.yaml")
def run():

  # default to not caching
  cache = utils.flags().get('cache', False)
  force = not cache


  states = []
  current = load_data("legislators-current.yaml")
  by_district = { }
  for m in current:
    last_term = m['terms'][-1]
    if last_term['type'] != 'sen':
      state = last_term['state']

      full_district = "%s%02d" % (state, int(last_term['district']))
      by_district[full_district] = m

      if not state in states:
        # house lists AS (American Samoa) as AQ, awesome
        if state == "AS":
          state = "AQ"
        states.append(state)

  destination = "legislators/house.html"
  url = "http://house.gov/representatives/"
  body = utils.download(url, destination, force)
  if not body:
    print("Couldn't download House listing!")
    exit(0)

  try:
    dom = lxml.html.parse(io.StringIO(body)).getroot()
  except lxml.etree.XMLSyntaxError:
    print("Error parsing House listing!")
    exit(0)


  # process:
  #   go through every state in our records, fetching that state's table
  #   go through every row after the first, pick the district to isolate the member
  #   pluck out the URL, update that member's last term's URL
  count = 0
  for state in states:
    rows = dom.cssselect("h2#state_%s+table tr" % state.lower())

    for row in rows:
      cells = row.cssselect("td")
      if not cells:
        continue

      district = str(cells[0].text_content())
      if district == "At Large":
        district = 0

      url = cells[1].cssselect("a")[0].get("href")

      # hit the URL to resolve any redirects to get the canonical URL,
      # since the listing on house.gov sometimes gives URLs that redirect.
      resp = urllib.request.urlopen(url)
      url = resp.geturl()

      # kill trailing slashes
      url = re.sub("/$", "", url)

      if state == "AQ":
        state = "AS"
      full_district = "%s%02d" % (state, int(district))
      if full_district in by_district:
        by_district[full_district]['terms'][-1]['url'] = url
      else:
        print("[%s] No current legislator" % full_district)

      count += 1

  print("Processed %i people rows on House listing." % count)

  print("Saving data...")
  save_data(current, "legislators-current.yaml")
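
# Both House scrapers in this collection key members by state plus a zero-padded
# district, with "At Large" seats treated as district 0. A standalone sketch of that
# key building, using made-up districts:
def district_key(state, district):
    if district == "At Large":
        district = 0
    return "%s%02d" % (state, int(district))

if __name__ == "__main__":
    print(district_key("WA", "2"))         # WA02
    print(district_key("AK", "At Large"))  # AK00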
Example #24
def run():

  def update_birthday(bioguide, person, main):

    birthday = birthday_for(main)
    if not birthday:
      print("[%s] NO BIRTHDAY :(\n\n%s" % (bioguide, main))
      warnings.append(bioguide)
      return
    if birthday == "UNKNOWN":
      return

    try:
      birthday = datetime.datetime.strptime(birthday.replace(",", ""), "%B %d %Y")
    except ValueError:
      print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide, main))
      warnings.append(bioguide)
      return

    birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day)
    person.setdefault("bio", {})["birthday"] = birthday


  def birthday_for(string):
    # exceptions for not-nicely-placed semicolons
    string = string.replace("born in Cresskill, Bergen County, N. J.; April", "born April")
    string = string.replace("FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802")
    string = string.replace("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967")
    string = string.replace("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962")
    string = string.replace("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947")
    string = string.replace('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968")

    # look for a date
    pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})"
    match = re.search(pattern, string, re.I)
    if not match or not match.group(1):
      # specifically detect cases that we can't handle to avoid unnecessary warnings
      if re.search("birth dates? unknown|date of birth is unknown", string, re.I): return "UNKNOWN"
      if re.search("born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", string, re.I): return "UNKNOWN"
      return None
    return match.group(1).strip()

  def relationships_of(string):
    # relationship data is stored in a parenthetical immediately after the end of the </font> tag in the bio
    # e.g. "(son of Joseph Patrick Kennedy, II, and great-nephew of Edward Moore Kennedy and John Fitzgerald Kennedy)"
    pattern = "^\((.*?)\)"
    match = re.search(pattern, string, re.I)

    relationships = []

    if match and len(match.groups()) > 0:
      relationship_text = match.group(1).encode("ascii", "replace").decode("ascii")

      # since some relationships refer to multiple people--great-nephew of Edward Moore Kennedy AND John Fitzgerald Kennedy--we need a special grammar
      from nltk import tree, pos_tag, RegexpParser
      tokens = re.split("[ ,;]+|-(?![0-9])", relationship_text)
      pos = pos_tag(tokens)

      grammar = r"""
        NAME: {<NNP>+}
        NAMES: { <IN><NAME>(?:<CC><NAME>)* }
        RELATIONSHIP: { <JJ|NN|RB|VB|VBD|VBN|IN|PRP\$>+ }
        MATCH: { <RELATIONSHIP><NAMES> }
        """
      cp = RegexpParser(grammar)
      chunks = cp.parse(pos)

      # iterate through the Relationship/Names pairs
      for n in chunks:
        if isinstance(n, tree.Tree) and n.node == "MATCH":
          people = []
          relationship = None
          for piece in n:
            if piece.node == "RELATIONSHIP":
              relationship = " ".join([x[0] for x in piece])
            elif piece.node == "NAMES":
              for name in [x for x in piece if isinstance(x, tree.Tree)]:
                people.append(" ".join([x[0] for x in name]))
          for person in people:
            relationships.append({ "relation": relationship, "name": person})
    return relationships

  # default to caching
  cache = utils.flags().get('cache', True)
  force = not cache

  # pick either current or historical
  # order is important here, since current defaults to true
  if utils.flags().get('historical', False):
    filename = "legislators-historical.yaml"
  elif utils.flags().get('current', True):
    filename = "legislators-current.yaml"
  else:
    print("No legislators selected.")
    exit(0)

  print("Loading %s..." % filename)
  legislators = load_data(filename)


  # reoriented cache to access by bioguide ID
  by_bioguide = { }
  for m in legislators:
    if "bioguide" in m["id"]:
      by_bioguide[m["id"]["bioguide"]] = m


  # optionally focus on one legislator

  bioguide = utils.flags().get('bioguide', None)
  if bioguide:
    bioguides = [bioguide]
  else:
    bioguides = list(by_bioguide.keys())

  warnings = []
  missing = []
  count = 0
  families = 0

  for bioguide in bioguides:
    # Download & parse the HTML of the bioguide page.
    try:
      dom = fetch_bioguide_page(bioguide, force)
    except Exception as e:
      print(e)
      missing.append(bioguide)
      continue

    # Extract the member's name and the biography paragraph (main).

    try:
      name = dom.cssselect("p font")[0]
      main = dom.cssselect("p")[0]
    except IndexError:
      print("[%s] Missing name or content!" % bioguide)
      exit(0)

    name = name.text_content().strip()
    main = main.text_content().strip().replace("\n", " ").replace("\r", " ")
    main = re.sub("\s+", " ", main)

    # Extract the member's birthday.

    update_birthday(bioguide, by_bioguide[bioguide], main)

    # Extract relationships with other Members of Congress.

    if utils.flags().get("relationships", False):
      #relationship information, if present, is in a parenthetical immediately after the name.
      #should always be present if we passed the IndexError catch above
      after_name = dom.cssselect("p font")[0].tail.strip()
      relationships = relationships_of(after_name)
      if len(relationships):
        families = families + 1
        by_bioguide[bioguide]["family"] = relationships

    count = count + 1


  print()
  if warnings:
    print("Missed %d birthdays: %s" % (len(warnings), str.join(", ", warnings)))

  if missing:
    print("Missing a page for %d bioguides: %s" % (len(missing), str.join(", ", missing)))

  print("Saving data to %s..." % filename)
  save_data(legislators, filename)

  print("Saved %d legislators to %s" % (count, filename))

  if utils.flags().get("relationships", False):
    print("Found family members for %d of those legislators" % families)
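
# A standalone sketch of the birthday handling above: pull the "born <Month> <day>,
# <year>" phrase out of a bioguide-style sentence, then normalize it to YYYY-MM-DD.
# The sample sentence is invented; the real script parses the downloaded bioguide
# biography text.
import datetime
import re

BORN_PATTERN = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})"

def birthday_iso(sentence):
    match = re.search(BORN_PATTERN, sentence, re.I)
    if not match:
        return None
    parsed = datetime.datetime.strptime(match.group(1).replace(",", ""), "%B %d %Y")
    return "%04d-%02d-%02d" % (parsed.year, parsed.month, parsed.day)

if __name__ == "__main__":
    print(birthday_iso("a Representative from Example; born in Springfield, January 5, 1950; ..."))
    # -> 1950-01-05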
#!/usr/bin/env python

# Uses http://house.gov/representatives/ to scrape official member websites.
# Only known source.

# Assumptions:
#  member's state and district fields are present and accurate.
#  member's most recent term in the terms field is their current one.

import lxml.html, StringIO, urllib2
import re
import utils
from utils import download, load_data, save_data, parse_date

# default to not caching
cache = utils.flags().get("cache", False)
force = not cache


states = []
current = load_data("legislators-current.yaml")
by_district = {}
for m in current:
    last_term = m["terms"][-1]
    if last_term["type"] != "sen":
        state = last_term["state"]

        full_district = "%s%02d" % (state, int(last_term["district"]))
        by_district[full_district] = m

        if not state in states:
Example #26
def run():
    CONGRESS_ID = "113th Congress (2013-2014)"  # the query string parameter

    # constants
    state_names = {
        "Alabama": "AL",
        "Alaska": "AK",
        "American Samoa": "AS",
        "Arizona": "AZ",
        "Arkansas": "AR",
        "California": "CA",
        "Colorado": "CO",
        "Connecticut": "CT",
        "Delaware": "DE",
        "District of Columbia": "DC",
        "Florida": "FL",
        "Georgia": "GA",
        "Guam": "GU",
        "Hawaii": "HI",
        "Idaho": "ID",
        "Illinois": "IL",
        "Indiana": "IN",
        "Iowa": "IA",
        "Kansas": "KS",
        "Kentucky": "KY",
        "Louisiana": "LA",
        "Maine": "ME",
        "Maryland": "MD",
        "Massachusetts": "MA",
        "Michigan": "MI",
        "Minnesota": "MN",
        "Mississippi": "MS",
        "Missouri": "MO",
        "Montana": "MT",
        "Nebraska": "NE",
        "Nevada": "NV",
        "New Hampshire": "NH",
        "New Jersey": "NJ",
        "New Mexico": "NM",
        "New York": "NY",
        "North Carolina": "NC",
        "North Dakota": "ND",
        "Northern Mariana Islands": "MP",
        "Ohio": "OH",
        "Oklahoma": "OK",
        "Oregon": "OR",
        "Pennsylvania": "PA",
        "Puerto Rico": "PR",
        "Rhode Island": "RI",
        "South Carolina": "SC",
        "South Dakota": "SD",
        "Tennessee": "TN",
        "Texas": "TX",
        "Utah": "UT",
        "Vermont": "VT",
        "Virgin Islands": "VI",
        "Virginia": "VA",
        "Washington": "WA",
        "West Virginia": "WV",
        "Wisconsin": "WI",
        "Wyoming": "WY"
    }

    # default to not caching
    cache = utils.flags().get('cache', False)
    force = not cache

    # load in current members
    y = load_data("legislators-current.yaml")
    by_district = {}
    existing_senator_ids = set()
    for m in y:
        last_term = m['terms'][-1]
        if last_term['type'] == 'rep':
            full_district = "%s%02d" % (last_term['state'],
                                        int(last_term['district']))
            by_district[full_district] = m
        elif last_term['type'] == 'sen':
            if "thomas" in m["id"]:
                existing_senator_ids.add(m["id"]["thomas"])

    seen_ids = set()
    for chamber in ("House of Representatives", "Senate"):
        url = "http://beta.congress.gov/members?pageSize=500&Legislative_Source=Member+Profiles&Congress=%s&Chamber_of_Congress=%s" % (
            urllib.parse.quote_plus(CONGRESS_ID),
            urllib.parse.quote_plus(chamber))
        cache = "congress.gov/members/%s-%s.html" % (CONGRESS_ID, chamber)
        try:
            body = download(url, cache, force)
            dom = lxml.html.parse(io.StringIO(body)).getroot()
        except lxml.etree.XMLSyntaxError:
            print("Error parsing: ", url)
            continue

        for node in dom.xpath("//ul[@class='results_list']/li"):
            thomas_id = "%05d" % int(
                re.search("/member/.*/(\d+)$",
                          node.xpath('h2/a')[0].get('href')).group(1))

            # THOMAS misassigned these 'new' IDs to existing individuals.
            if thomas_id in ('02139', '02132'):
                continue

            name = node.xpath('h2/a')[0].text

            state = node.xpath(
                'div[@class="memberProfile"]/table/tbody/tr[1]/td'
            )[0].text.strip()
            state = state_names[state]

            if chamber == "House of Representatives":
                # There's enough information to easily pick out which Member this refers to, so write it
                # directly to the file.
                district = node.xpath(
                    'div[@class="memberProfile"]/table/tbody/tr[2]/td'
                )[0].text.strip()
                if district == "At Large": district = 0
                district = "%02d" % int(district)

                if state + district not in by_district:
                    print(
                        state + district + "'s", name,
                        "appears on Congress.gov but the office is vacant in our data."
                    )
                    continue

                if state + district in seen_ids:
                    print("Congress.gov lists two people for %s%s!" %
                          (state, district))
                seen_ids.add(state + district)

                by_district[state + district]["id"]["thomas"] = thomas_id

            elif chamber == "Senate":
                # For senators we'd have to match on name or something else, so that's too difficult.
                # Just look for new IDs.
                if thomas_id not in existing_senator_ids:
                    print("Please manually set", thomas_id, "for", name,
                          "from", state)

    save_data(y, "legislators-current.yaml")
Example #27
#  --cache: load from cache if present on disk (default: true)
#  --bioguide: load only one legislator, by his/her bioguide ID
#  --congress: do *only* updates for legislators serving in a specific congress
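#
# Illustrative invocation (the script name is a placeholder, not taken from the source):
#   python some_update_script.py --cache --congress=113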

import datetime
import re
import utils
import urllib2
import requests
from utils import download, load_data, save_data, parse_date, states, congress_from_legislative_year, legislative_year
import json
import string
import csv
import unicodedata

debug = utils.flags().get("debug", False)

# default to caching
cache = utils.flags().get("cache", True)
force = not cache


only_bioguide = utils.flags().get("bioguide", None)
congress = utils.flags().get("congress", None)


filename_historical = "legislators-historical.yaml"
filename_current = "legislators-current.yaml"
data_files = []

print "Loading %s..." % "legislators-current.yaml"
Example #28
def run():
  CONGRESS_ID = "113th Congress (2013-2014)" # the query string parameter

  # constants
  state_names = {
    "Alabama": "AL", "Alaska": "AK", "American Samoa": "AS", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "District of Columbia": "DC",
    "Florida": "FL", "Georgia": "GA", "Guam": "GU", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS", "Kentucky": "KY",
    "Louisiana": "LA", "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI",
    "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE",
    "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
    "North Carolina": "NC", "North Dakota": "ND", "Northern Mariana Islands": "MP", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Puerto Rico": "PR", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT",
    "Virgin Islands": "VI", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI",
    "Wyoming": "WY"
  }

  # default to not caching
  cache = utils.flags().get('cache', False)
  force = not cache

  # load in current members
  y = load_data("legislators-current.yaml")
  by_district = { }
  existing_senator_ids = set()
  for m in y:
    last_term = m['terms'][-1]
    if last_term['type'] == 'rep':
      full_district = "%s%02d" % (last_term['state'], int(last_term['district']))
      by_district[full_district] = m
    elif last_term['type'] == 'sen':
      if "thomas" in m["id"]:
        existing_senator_ids.add(m["id"]["thomas"])

  seen_ids = set()
  for chamber in ("House of Representatives", "Senate"):
    url = "http://beta.congress.gov/members?pageSize=500&Legislative_Source=Member+Profiles&Congress=%s&Chamber_of_Congress=%s" % (
      urllib.parse.quote_plus(CONGRESS_ID), urllib.parse.quote_plus(chamber))
    cache = "congress.gov/members/%s-%s.html" % (CONGRESS_ID, chamber)
    try:
      body = download(url, cache, force)
      dom = lxml.html.parse(io.StringIO(body)).getroot()
    except lxml.etree.XMLSyntaxError:
      print("Error parsing: ", url)
      continue

    for node in dom.xpath("//ul[@class='results_list']/li"):
      thomas_id = "%05d" % int(re.search("/member/.*/(\d+)$", node.xpath('h2/a')[0].get('href')).group(1))

      # THOMAS misassigned these 'new' IDs to existing individuals.
      if thomas_id in ('02139', '02132'):
        continue

      name = node.xpath('h2/a')[0].text

      state = node.xpath('div[@class="memberProfile"]/table/tbody/tr[1]/td')[0].text.strip()
      state = state_names[state]

      if chamber == "House of Representatives":
        # There's enough information to easily pick out which Member this refers to, so write it
        # directly to the file.
        district = node.xpath('div[@class="memberProfile"]/table/tbody/tr[2]/td')[0].text.strip()
        if district == "At Large": district = 0
        district = "%02d" % int(district)

        if state + district not in by_district:
          print(state + district + "'s", name, "appears on Congress.gov but the office is vacant in our data.")
          continue

        if state + district in seen_ids:
          print("Congress.gov lists two people for %s%s!" % (state, district))
        seen_ids.add(state+district)

        by_district[state + district]["id"]["thomas"] = thomas_id

      elif chamber == "Senate":
        # For senators we'd have to match on name or something else, so that's too difficult.
        # Just look for new IDs.
        if thomas_id not in existing_senator_ids:
          print("Please manually set", thomas_id, "for", name, "from", state)

  save_data(y, "legislators-current.yaml")
# Parse the THOMAS advanced search page for a list of all committees
# and subcommittees from the 93rd Congress forward and store them in
# the committees-historical.yaml file. It will include current committees
# as well.

import re, itertools
from collections import OrderedDict
import utils
from utils import download, load_data, save_data, CURRENT_CONGRESS

committees_historical = load_data("committees-historical.yaml")


# default to not caching
flags = utils.flags()
cache = flags.get('cache', False)
force = not cache


# map thomas_id's to their dicts
committees_historical_ref = { }
for cx in committees_historical: committees_historical_ref[cx["thomas_id"]] = cx


# pick the range of committees to get
single_congress = flags.get('congress', False)
if single_congress:
  start_congress = int(single_congress)
  end_congress = int(single_congress) + 1
else:
# and updates the committees-current.yaml file with metadata including
# name, url, address, and phone number.

import re, lxml.html, lxml.etree, StringIO, datetime
from collections import OrderedDict
import utils
from utils import download, load_data, save_data, parse_date, CURRENT_CONGRESS


committee_membership = { }

committees_current = load_data("committees-current.yaml")
memberships_current = load_data("committee-membership-current.yaml")

# default to not caching
cache = utils.flags().get('cache', False)
force = not cache


# map house/senate committee IDs to their dicts
house_ref = { }
for cx in committees_current:
  if "house_committee_id" in cx:
    house_ref[cx["house_committee_id"]] = cx
senate_ref = { }
for cx in committees_current:
  if "senate_committee_id" in cx:
    senate_ref[cx["senate_committee_id"]] = cx


# map state/district to current representatives and state/lastname to current senators
Example #31
    def resolveyt():
        # To avoid hitting quota limits, register for a YouTube 2.0 API key at
        # https://code.google.com/apis/youtube/dashboard
        # and put it below
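        # (The key file below is read verbatim, so it should contain only the key text.)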
        api_file = open('cache/youtube_api_key', 'r')
        api_key = api_file.read()

        bioguide = utils.flags().get('bioguide', None)

        updated_media = []
        for m in media:
            if bioguide and (m['id']['bioguide'] != bioguide):
                updated_media.append(m)
                continue

            social = m['social']

            if ('youtube' in social) or ('youtube_id' in social):

                if 'youtube' not in social:
                    social['youtube'] = social['youtube_id']

                ytid = social['youtube']

                profile_url = ("https://gdata.youtube.com/feeds/api/users/%s"
                               "?v=2&prettyprint=true&alt=json&key=%s" %
                               (ytid, api_key))

                try:
                    print("Resolving YT info for %s" % social['youtube'])
                    ytreq = requests.get(profile_url)
                    # print "\tFetched with status code %i..." % ytreq.status_code

                    if ytreq.status_code == 404:
                        # If the account name isn't valid, it's probably a redirect.
                        try:
                            # Try to scrape the real YouTube username
                            print("\Scraping YouTube username")
                            search_url = ("https://www.youtube.com/%s" %
                                          social['youtube'])
                            csearch = requests.get(search_url).text.encode(
                                'ascii', 'ignore').decode('ascii')

                            u = re.search(
                                r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>',
                                csearch)

                            if u:
                                print("\t%s maps to %s" %
                                      (social['youtube'], u.group(1)))
                                social['youtube'] = u.group(1)
                                profile_url = (
                                    "https://gdata.youtube.com/feeds/api/users/%s"
                                    "?v=2&prettyprint=true&alt=json" %
                                    social['youtube'])

                                print("\tFetching GData profile...")
                                ytreq = requests.get(profile_url)
                                print("\tFetched GData profile")

                            else:
                                raise Exception(
                                    "Couldn't figure out the username format for %s"
                                    % social['youtube'])

                        except:
                            print("\tCouldn't locate YouTube account")
                            raise

                    ytobj = ytreq.json()
                    social['youtube_id'] = ytobj['entry']['yt$channelId']['$t']
                    print("\tResolved youtube_id to %s" % social['youtube_id'])

                    # even though we have their channel ID, do they also have a username?
                    if ytobj['entry']['yt$username']['$t'] != ytobj['entry'][
                            'yt$userId']['$t']:
                        if social['youtube'].lower(
                        ) != ytobj['entry']['yt$username']['$t'].lower():
                            # YT accounts are case-insensitive.  Preserve capitalization if possible.
                            social['youtube'] = ytobj['entry']['yt$username'][
                                '$t']
                            print("\tAdded YouTube username of %s" %
                                  social['youtube'])
                    else:
                        print(
                            "\tYouTube says they do not have a separate username"
                        )
                        del social['youtube']
                except:
                    print("Unable to get YouTube Channel ID for: %s" %
                          social['youtube'])

            updated_media.append(m)

        print("Saving social media...")
        save_data(updated_media, "legislators-social-media.yaml")
Example #32
def main():
    regexes = {
        "youtube": [
            "https?://(?:www\\.)?youtube.com/channel/([^\\s\"/\\?#']+)",
            "https?://(?:www\\.)?youtube.com/(?:subscribe_widget\\?p=)?(?:subscription_center\\?add_user=)?(?:user/)?([^\\s\"/\\?#']+)"
        ],
        "facebook": [
            "\\('facebook.com/([^']+)'\\)",
            "https?://(?:www\\.)?facebook.com/(?:home\\.php)?(?:business/dashboard/#/)?(?:government)?(?:#!/)?(?:#%21/)?(?:#/)?pages/[^/]+/(\\d+)",
            "https?://(?:www\\.)?facebook.com/(?:profile.php\\?id=)?(?:home\\.php)?(?:#!)?/?(?:people)?/?([^/\\s\"#\\?&']+)"
        ],
        "twitter": [
            "https?://(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/]+)",
            "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)"
        ]
    }
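    # Illustrative sanity check (not part of the original script) of the first Twitter
    # pattern against the kind of markup these regexes are meant to catch:
    #   >>> re.findall(regexes["twitter"][0],
    #   ...            '<a href="https://twitter.com/intent/user?screen_name=SenExample">', re.I)
    #   ['SenExample']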

    email_enabled = utils.flags().get('email', False)
    debug = utils.flags().get('debug', False)
    do_update = utils.flags().get('update', False)
    do_clean = utils.flags().get('clean', False)
    do_verify = utils.flags().get('verify', False)
    do_resolvefb = utils.flags().get('resolvefb', False)
    do_resolveyt = utils.flags().get('resolveyt', False)

    # default to not caching
    cache = utils.flags().get('cache', False)
    force = not cache

    if do_resolvefb:
        service = "facebook"
    elif do_resolveyt:
        service = "youtube"
    else:
        service = utils.flags().get('service', None)
    if service not in ["twitter", "youtube", "facebook"]:
        print("--service must be one of twitter, youtube, or facebook")
        exit(0)

    # load in members, orient by bioguide ID
    print("Loading current legislators...")
    current = load_data("legislators-current.yaml")

    current_bioguide = {}
    for m in current:
        if "bioguide" in m["id"]:
            current_bioguide[m["id"]["bioguide"]] = m

    print("Loading blacklist...")
    blacklist = {'twitter': [], 'facebook': [], 'youtube': []}
    for rec in csv.DictReader(open("data/social_media_blacklist.csv")):
        blacklist[rec["service"]].append(rec["pattern"])

    print("Loading whitelist...")
    whitelist = {'twitter': [], 'facebook': [], 'youtube': []}
    for rec in csv.DictReader(open("data/social_media_whitelist.csv")):
        whitelist[rec["service"]].append(rec["account"].lower())

    # reorient currently known social media by ID
    print("Loading social media...")
    media = load_data("legislators-social-media.yaml")
    media_bioguide = {}
    for m in media:
        media_bioguide[m["id"]["bioguide"]] = m

    def resolvefb():
        # in order to preserve the comment block at the top of the file,
        # copy it over into a new RtYamlList instance. We do this because
        # Python list instances can't hold other random attributes.
        import rtyaml
        updated_media = rtyaml.RtYamlList()
        if hasattr(media, '__initial_comment_block'):
            updated_media.__initial_comment_block = getattr(
                media, '__initial_comment_block')

        for m in media:
            social = m['social']

            if ('facebook' in social
                    and social['facebook']) and ('facebook_id' not in social):
                graph_url = "https://graph.facebook.com/%s" % social['facebook']

                if re.match('\d+', social['facebook']):
                    social['facebook_id'] = social['facebook']
                    print("Looking up graph username for %s" %
                          social['facebook'])
                    fbobj = requests.get(graph_url).json()
                    if 'username' in fbobj:
                        print("\tGot graph username of %s" % fbobj['username'])
                        social['facebook'] = fbobj['username']
                    else:
                        print("\tUnable to get graph username")

                else:
                    try:
                        print("Looking up graph ID for %s" %
                              social['facebook'])
                        fbobj = requests.get(graph_url).json()
                        if 'id' in fbobj:
                            print("\tGot graph ID of %s" % fbobj['id'])
                            social['facebook_id'] = fbobj['id']
                        else:
                            print("\tUnable to get graph ID")
                    except:
                        print("\tUnable to get graph ID for: %s" %
                              social['facebook'])
                        social['facebook_id'] = None

            updated_media.append(m)

        print("Saving social media...")
        save_data(updated_media, "legislators-social-media.yaml")

    def resolveyt():
        # To avoid hitting quota limits, register for a YouTube 2.0 API key at
        # https://code.google.com/apis/youtube/dashboard
        # and put it below
        api_file = open('cache/youtube_api_key', 'r')
        api_key = api_file.read()

        bioguide = utils.flags().get('bioguide', None)

        updated_media = []
        for m in media:
            if bioguide and (m['id']['bioguide'] != bioguide):
                updated_media.append(m)
                continue

            social = m['social']

            if ('youtube' in social) or ('youtube_id' in social):

                if 'youtube' not in social:
                    social['youtube'] = social['youtube_id']

                ytid = social['youtube']

                profile_url = ("http://gdata.youtube.com/feeds/api/users/%s"
                               "?v=2&prettyprint=true&alt=json&key=%s" %
                               (ytid, api_key))

                try:
                    print("Resolving YT info for %s" % social['youtube'])
                    ytreq = requests.get(profile_url)
                    # print "\tFetched with status code %i..." % ytreq.status_code

                    if ytreq.status_code == 404:
                        # If the account name isn't valid, it's probably a redirect.
                        try:
                            # Try to scrape the real YouTube username
                            print("\Scraping YouTube username")
                            search_url = ("http://www.youtube.com/%s" %
                                          social['youtube'])
                            csearch = requests.get(search_url).text.encode(
                                'ascii', 'ignore').decode('ascii')

                            u = re.search(
                                r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>',
                                csearch)

                            if u:
                                print("\t%s maps to %s" %
                                      (social['youtube'], u.group(1)))
                                social['youtube'] = u.group(1)
                                profile_url = (
                                    "http://gdata.youtube.com/feeds/api/users/%s"
                                    "?v=2&prettyprint=true&alt=json" %
                                    social['youtube'])

                                print("\tFetching GData profile...")
                                ytreq = requests.get(profile_url)
                                print("\tFetched GData profile")

                            else:
                                raise Exception(
                                    "Couldn't figure out the username format for %s"
                                    % social['youtube'])

                        except:
                            print("\tCouldn't locate YouTube account")
                            raise

                    ytobj = ytreq.json()
                    social['youtube_id'] = ytobj['entry']['yt$channelId']['$t']
                    print("\tResolved youtube_id to %s" % social['youtube_id'])

                    # even though we have their channel ID, do they also have a username?
                    if ytobj['entry']['yt$username']['$t'] != ytobj['entry'][
                            'yt$userId']['$t']:
                        if social['youtube'].lower(
                        ) != ytobj['entry']['yt$username']['$t'].lower():
                            # YT accounts are case-insensitive.  Preserve capitalization if possible.
                            social['youtube'] = ytobj['entry']['yt$username'][
                                '$t']
                            print("\tAdded YouTube username of %s" %
                                  social['youtube'])
                    else:
                        print(
                            "\tYouTube says they do not have a separate username"
                        )
                        del social['youtube']
                except:
                    print("Unable to get YouTube Channel ID for: %s" %
                          social['youtube'])

            updated_media.append(m)

        print("Saving social media...")
        save_data(updated_media, "legislators-social-media.yaml")

    def sweep():
        to_check = []

        bioguide = utils.flags().get('bioguide', None)
        if bioguide:
            possibles = [bioguide]
        else:
            possibles = list(current_bioguide.keys())

        for bioguide in possibles:
            if media_bioguide.get(bioguide, None) is None:
                to_check.append(bioguide)
            elif (media_bioguide[bioguide]["social"].get(service, None) is None) and \
              (media_bioguide[bioguide]["social"].get(service + "_id", None) is None):
                to_check.append(bioguide)
            else:
                pass

        utils.mkdir_p("cache/social_media")
        writer = csv.writer(
            open("cache/social_media/%s_candidates.csv" % service, 'w'))
        writer.writerow([
            "bioguide", "official_full", "website", "service", "candidate",
            "candidate_url"
        ])

        if len(to_check) > 0:
            rows_found = []
            for bioguide in to_check:
                candidate = candidate_for(bioguide)
                if candidate:
                    url = current_bioguide[bioguide]["terms"][-1].get(
                        "url", None)
                    candidate_url = "https://%s.com/%s" % (service, candidate)
                    row = [
                        bioguide,
                        current_bioguide[bioguide]['name']['official_full'],
                        url, service, candidate, candidate_url
                    ]
                    writer.writerow(row)
                    print("\tWrote: %s" % candidate)
                    rows_found.append(row)

            if email_enabled and len(rows_found) > 0:
                email_body = "Social media leads found:\n\n"
                for row in rows_found:
                    email_body += ("%s\n" % row)
                utils.send_email(email_body)

    def verify():
        bioguide = utils.flags().get('bioguide', None)
        if bioguide:
            to_check = [bioguide]
        else:
            to_check = list(media_bioguide.keys())

        for bioguide in to_check:
            entry = media_bioguide[bioguide]
            current = entry['social'].get(service, None)
            if not current:
                continue

            bioguide = entry['id']['bioguide']

            candidate = candidate_for(bioguide)
            if not candidate:
                # if current is in whitelist, and none is on the page, that's okay
                if current.lower() in whitelist[service]:
                    continue
                else:
                    candidate = ""

            url = current_bioguide[bioguide]['terms'][-1].get('url')

            if current.lower() != candidate.lower():
                print("[%s] mismatch on %s - %s -> %s" %
                      (bioguide, url, current, candidate))

    def update():
        for rec in csv.DictReader(
                open("cache/social_media/%s_candidates.csv" % service)):
            bioguide = rec["bioguide"]
            candidate = rec["candidate"]

            if bioguide in media_bioguide:
                media_bioguide[bioguide]['social'][service] = candidate
            else:
                new_media = {'id': {}, 'social': {}}

                new_media['id']['bioguide'] = bioguide
                thomas_id = current_bioguide[bioguide]['id'].get(
                    "thomas", None)
                govtrack_id = current_bioguide[bioguide]['id'].get(
                    "govtrack", None)
                if thomas_id:
                    new_media['id']['thomas'] = thomas_id
                if govtrack_id:
                    new_media['id']['govtrack'] = govtrack_id

                new_media['social'][service] = candidate
                media.append(new_media)

        print("Saving social media...")
        save_data(media, "legislators-social-media.yaml")

        # if it's a youtube update, always do the resolve
        # if service == "youtube":
        #   resolveyt()

    def clean():
        print("Loading historical legislators...")
        historical = load_data("legislators-historical.yaml")

        count = 0
        for m in historical:
            if m["id"]["bioguide"] in media_bioguide:
                media.remove(media_bioguide[m["id"]["bioguide"]])
                count += 1
        print(
            "Removed %i out of office legislators from social media file..." %
            count)

        print("Saving historical legislators...")
        save_data(media, "legislators-social-media.yaml")

    def candidate_for(bioguide):
        url = current_bioguide[bioguide]["terms"][-1].get("url", None)
        if not url:
            if debug:
                print("[%s] No official website, skipping" % bioguide)
            return None

        if debug:
            print("[%s] Downloading..." % bioguide)
        cache = "congress/%s.html" % bioguide
        body = utils.download(url, cache, force, {'check_redirects': True})

        all_matches = []
        for regex in regexes[service]:
            matches = re.findall(regex, body, re.I)
            if matches:
                all_matches.extend(matches)

        if all_matches:
            for candidate in all_matches:
                passed = True
                for blacked in blacklist[service]:
                    if re.search(blacked, candidate, re.I):
                        passed = False

                if not passed:
                    if debug:
                        print("\tBlacklisted: %s" % candidate)
                    continue

                return candidate
            return None

    if do_update:
        update()
    elif do_clean:
        clean()
    elif do_verify:
        verify()
    elif do_resolvefb:
        resolvefb()
    elif do_resolveyt:
        resolveyt()
    else:
        sweep()
Example #33
def main():
  regexes = {
    "youtube": [
      "(?:https?:)?//(?:www\\.)?youtube.com/embed/?\?(list=[^\\s\"/\\?#&']+)",
      "(?:https?:)?//(?:www\\.)?youtube.com/channel/([^\\s\"/\\?#']+)",
      "(?:https?:)?//(?:www\\.)?youtube.com/(?:subscribe_widget\\?p=)?(?:subscription_center\\?add_user=)?(?:user/)?([^\\s\"/\\?#']+)"
    ],
    "facebook": [
      "\\('facebook.com/([^']+)'\\)",
      "(?:https?:)?//(?:www\\.)?facebook.com/(?:home\\.php)?(?:business/dashboard/#/)?(?:government)?(?:#!/)?(?:#%21/)?(?:#/)?pages/[^/]+/(\\d+)",
      "(?:https?:)?//(?:www\\.)?facebook.com/(?:profile.php\\?id=)?(?:home\\.php)?(?:#!)?/?(?:people)?/?([^/\\s\"#\\?&']+)"
    ],
    "twitter": [
      "(?:https?:)?//(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/]+)",
      "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)"
    ],
    "instagram": [
      "instagram.com/(\w{3,})"
    ]
  }
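  # Illustrative check (not part of the original script) of the Instagram pattern:
  #   >>> re.findall(regexes["instagram"][0], 'https://www.instagram.com/senexample/', re.I)
  #   ['senexample']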

  email_enabled = utils.flags().get('email', False)
  debug = utils.flags().get('debug', False)
  do_update = utils.flags().get('update', False)
  do_clean = utils.flags().get('clean', False)
  do_verify = utils.flags().get('verify', False)
  do_resolveyt = utils.flags().get('resolveyt', False)
  do_resolveig = utils.flags().get('resolveig', False)
  do_resolvetw = utils.flags().get('resolvetw', False)


  # default to not caching
  cache = utils.flags().get('cache', False)
  force = not cache

  if do_resolveyt:
    service = "youtube"
  elif do_resolveig:
    service = "instagram"
  elif do_resolvetw:
    service = "twitter"
  else:
    service = utils.flags().get('service', None)
  if service not in ["twitter", "youtube", "facebook", "instagram"]:
    print("--service must be one of twitter, youtube, facebook, or instagram")
    exit(0)

  # load in members, orient by bioguide ID
  print("Loading current legislators...")
  current = load_data("legislators-current.yaml")

  current_bioguide = { }
  for m in current:
    if "bioguide" in m["id"]:
      current_bioguide[m["id"]["bioguide"]] = m

  print("Loading blacklist...")
  blacklist = {
    'twitter': [], 'facebook': [], 'youtube': [], 'instagram': []
  }
  for rec in csv.DictReader(open("data/social_media_blacklist.csv")):
    blacklist[rec["service"]].append(rec["pattern"])

  print("Loading whitelist...")
  whitelist = {
    'twitter': [], 'facebook': [], 'youtube': []
  }
  for rec in csv.DictReader(open("data/social_media_whitelist.csv")):
    whitelist[rec["service"]].append(rec["account"].lower())

  # reorient currently known social media by ID
  print("Loading social media...")
  media = load_data("legislators-social-media.yaml")
  media_bioguide = { }
  for m in media:
    media_bioguide[m["id"]["bioguide"]] = m


  def resolveyt():
    # To avoid hitting quota limits, register for a YouTube 2.0 API key at
    # https://code.google.com/apis/youtube/dashboard
    # and put it below
    api_file = open('cache/youtube_api_key','r')
    api_key = api_file.read()

    bioguide = utils.flags().get('bioguide', None)

    updated_media = []
    for m in media:
      if bioguide and (m['id']['bioguide'] != bioguide):
        updated_media.append(m)
        continue

      social = m['social']

      if ('youtube' in social) or ('youtube_id' in social):

        if 'youtube' not in social:
          social['youtube'] = social['youtube_id']

        ytid = social['youtube']

        profile_url = ("https://gdata.youtube.com/feeds/api/users/%s"
        "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key))

        try:
          print("Resolving YT info for %s" % social['youtube'])
          ytreq = requests.get(profile_url)
          # print "\tFetched with status code %i..." % ytreq.status_code

          if ytreq.status_code == 404:
            # If the account name isn't valid, it's probably a redirect.
            try:
              # Try to scrape the real YouTube username
              print("\Scraping YouTube username")
              search_url = ("https://www.youtube.com/%s" % social['youtube'])
              csearch = requests.get(search_url).text.encode('ascii','ignore').decode('ascii')

              u = re.search(r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>',csearch)

              if u:
                print("\t%s maps to %s" % (social['youtube'],u.group(1)))
                social['youtube'] = u.group(1)
                profile_url = ("https://gdata.youtube.com/feeds/api/users/%s"
                "?v=2&prettyprint=true&alt=json" % social['youtube'])

                print("\tFetching GData profile...")
                ytreq = requests.get(profile_url)
                print("\tFetched GData profile")

              else:
                raise Exception("Couldn't figure out the username format for %s" % social['youtube'])

            except:
              print("\tCouldn't locate YouTube account")
              raise

          ytobj = ytreq.json()
          social['youtube_id'] = ytobj['entry']['yt$channelId']['$t']
          print("\tResolved youtube_id to %s" % social['youtube_id'])

          # even though we have their channel ID, do they also have a username?
          if ytobj['entry']['yt$username']['$t'] != ytobj['entry']['yt$userId']['$t']:
            if social['youtube'].lower() != ytobj['entry']['yt$username']['$t'].lower():
              # YT accounts are case-insensitive.  Preserve capitalization if possible.
              social['youtube'] = ytobj['entry']['yt$username']['$t']
              print("\tAdded YouTube username of %s" % social['youtube'])
          else:
            print("\tYouTube says they do not have a separate username")
            del social['youtube']
        except:
          print("Unable to get YouTube Channel ID for: %s" % social['youtube'])

      updated_media.append(m)

    print("Saving social media...")
    save_data(updated_media, "legislators-social-media.yaml")


  def resolveig():
    # in order to preserve the comment block at the top of the file,
    # copy it over into a new RtYamlList instance. We do this because
    # Python list instances can't hold other random attributes.
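    # (Aside, illustrative: attribute assignment on a plain list, e.g. `[].foo = 1`,
    # raises AttributeError, which is why a list subclass is needed to carry the comment block.)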
    import rtyaml
    updated_media = rtyaml.RtYamlList()
    if hasattr(media, '__initial_comment_block'):
      updated_media.__initial_comment_block = getattr(media, '__initial_comment_block')

    client_id_file = open('cache/instagram_client_id','r')
    client_id = client_id_file.read()

    bioguide = utils.flags().get('bioguide', None)

    for m in media:
      if bioguide and (m['id']['bioguide'] != bioguide):
        updated_media.append(m)
        continue

      social = m['social']
      if 'instagram' not in social and 'instagram_id' not in social:
        updated_media.append(m)
        continue

      instagram_handle = social['instagram']
      query_url = "https://api.instagram.com/v1/users/search?q={query}&client_id={client_id}".format(query=instagram_handle,client_id=client_id)
      instagram_user_search = requests.get(query_url).json()
      for user in instagram_user_search['data']:
        time.sleep(0.5)
        if user['username'] == instagram_handle:
          m['social']['instagram_id'] = int(user['id'])
          print("matched instagram_id {instagram_id} to {instagram_handle}".format(instagram_id=social['instagram_id'],instagram_handle=instagram_handle))
      updated_media.append(m)

    save_data(updated_media, "legislators-social-media.yaml")


  def resolvetw():
    """
    Does two batch lookups:

    1. All entries with `twitter_id`: Checks to see if the corresponding Twitter profile has the same screen_name
        as found in the entry's `twitter`. If not, the `twitter` value is updated.
    2. All entries with `twitter` (but not `twitter_id`): fetches the corresponding Twitter profile by screen_name and
        inserts ID. If no profile is found, the `twitter` value is deleted.

    Note: cache/twitter_client_id must be a formatted JSON dict:
        {
        "consumer_secret": "xyz",
        "access_token": "abc",
        "access_token_secret": "def",
        "consumer_key": "jk"
       }
    """
    import rtyaml
    from social.twitter import get_api, fetch_profiles
    updated_media = rtyaml.RtYamlList()
    if hasattr(media, '__initial_comment_block'):
      updated_media.__initial_comment_block = getattr(media, '__initial_comment_block')

    client_id_file = open('cache/twitter_client_id', 'r')
    _c = json.load(client_id_file)
    api = get_api(_c['access_token'], _c['access_token_secret'], _c['consumer_key'], _c['consumer_secret'])
    bioguide = utils.flags().get('bioguide', None)
    lookups = {'screen_names': [], 'ids': []} # store members that have `twitter` or `twitter_id` info
    for m in media:
      # we start with appending to updated_media so that we keep the same order of entries
      # as found in the loaded file
      updated_media.append(m)
      if bioguide and (m['id']['bioguide'] != bioguide):
        continue
      social = m['social']
      # now we add entries to either the `ids` or the `screen_names` list to batch lookup
      if 'twitter_id' in social:
        # add to the queue to be batched-looked-up
        lookups['ids'].append(m)
        # append
      elif 'twitter' in social:
        lookups['screen_names'].append(m)

    #######################################
    # perform Twitter batch lookup for ids:
    if lookups['screen_names']:
      arr = lookups['screen_names']
      print("Looking up Twitter ids for", len(arr), "names.")
      tw_names = [m['social']['twitter'] for m in arr]
      tw_profiles = fetch_profiles(api, screen_names = tw_names)
      for m in arr:
        social = m['social']
        # find profile that corresponds to a given screen_name
        twitter_handle = social['twitter']
        twp = next((p for p in tw_profiles if p['screen_name'].lower() == twitter_handle.lower()), None)
        if twp:
          m['social']['twitter_id'] = int(twp['id'])
          print("Matched twitter_id `%s` to `%s`" % (social['twitter_id'], twitter_handle))
        else:
          # Remove errant Twitter entry for now
          print("No Twitter user profile for:", twitter_handle)
          m['social'].pop('twitter')
          print("\t ! removing Twitter handle:", twitter_handle)
    ##########################################
    # perform Twitter batch lookup for names by id, to update any renamings:
    if lookups['ids']:
      arr = lookups['ids']
      print("Looking up Twitter screen_names for", len(arr), "ids.")
      tw_ids = [m['social']['twitter_id'] for m in arr]
      tw_profiles = fetch_profiles(api, ids = tw_ids)
      any_renames_needed = False
      for m in arr:
        social = m['social']
        # find profile that corresponds to a given screen_name
        t_id = social['twitter_id']
        t_name = social.get('twitter')
        twp = next((p for p in tw_profiles if int(p['id']) == t_id), None)
        if twp:
          # Be silent if there is no change to screen name
          if t_name and (twp['screen_name'].lower() == t_name.lower()):
            pass
          else:
            any_renames_needed = True
            m['social']['twitter'] = twp['screen_name']
            print("For twitter_id `%s`, renamed `%s` to `%s`" % (t_id, t_name, m['social']['twitter']))
        else:
          # No entry found for this twitter id
          print("No Twitter user profile for %s, %s" % (t_id, t_name))
          m['social'].pop('twitter_id')
          print("\t ! removing Twitter id:", t_id)
      if not any_renames_needed:
        print("No renames needed")
    # all done with Twitter
    save_data(updated_media, "legislators-social-media.yaml")


  def sweep():
    to_check = []

    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      possibles = [bioguide]
    else:
      possibles = list(current_bioguide.keys())

    for bioguide in possibles:
      if media_bioguide.get(bioguide, None) is None:
        to_check.append(bioguide)
      elif (media_bioguide[bioguide]["social"].get(service, None) is None) and \
        (media_bioguide[bioguide]["social"].get(service + "_id", None) is None):
        to_check.append(bioguide)
      else:
        pass

    utils.mkdir_p("cache/social_media")
    writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w'))
    writer.writerow(["bioguide", "official_full", "website", "service", "candidate", "candidate_url"])

    if len(to_check) > 0:
      rows_found = []
      for bioguide in to_check:
        candidate = candidate_for(bioguide)
        if candidate:
          url = current_bioguide[bioguide]["terms"][-1].get("url", None)
          candidate_url = "https://%s.com/%s" % (service, candidate)
          row = [bioguide, current_bioguide[bioguide]['name']['official_full'], url, service, candidate, candidate_url]
          writer.writerow(row)
          print("\tWrote: %s" % candidate)
          rows_found.append(row)

      if email_enabled and len(rows_found) > 0:
        email_body = "Social media leads found:\n\n"
        for row in rows_found:
          email_body += ("%s\n" % row)
        utils.send_email(email_body)

  def verify():
    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      to_check = [bioguide]
    else:
      to_check = list(media_bioguide.keys())

    for bioguide in to_check:
      entry = media_bioguide[bioguide]
      current = entry['social'].get(service, None)
      if not current:
        continue

      bioguide = entry['id']['bioguide']

      candidate = candidate_for(bioguide, current)
      if not candidate:
        # if current is in whitelist, and none is on the page, that's okay
        if current.lower() in whitelist[service]:
          continue
        else:
          candidate = ""

      url = current_bioguide[bioguide]['terms'][-1].get('url')

      if current.lower() != candidate.lower():
        print("[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate))

  def update():
    for rec in csv.DictReader(open("cache/social_media/%s_candidates.csv" % service)):
      bioguide = rec["bioguide"]
      candidate = rec["candidate"]

      if bioguide in media_bioguide:
        media_bioguide[bioguide]['social'][service] = candidate
      else:
        new_media = {'id': {}, 'social': {}}

        new_media['id']['bioguide'] = bioguide
        thomas_id = current_bioguide[bioguide]['id'].get("thomas", None)
        govtrack_id = current_bioguide[bioguide]['id'].get("govtrack", None)
        if thomas_id:
          new_media['id']['thomas'] = thomas_id
        if govtrack_id:
          new_media['id']['govtrack'] = govtrack_id


        new_media['social'][service] = candidate
        media.append(new_media)

    print("Saving social media...")
    save_data(media, "legislators-social-media.yaml")

    # if it's a youtube update, always do the resolve
    # if service == "youtube":
    #   resolveyt()


  def clean():
    print("Loading historical legislators...")
    historical = load_data("legislators-historical.yaml")

    count = 0
    for m in historical:
      if m["id"]["bioguide"] in media_bioguide:
        media.remove(media_bioguide[m["id"]["bioguide"]])
        count += 1
    print("Removed %i out of office legislators from social media file..." % count)

    print("Saving historical legislators...")
    save_data(media, "legislators-social-media.yaml")


  def candidate_for(bioguide, current = None):
    """find the most likely candidate account from the URL.
    If current is passed, the candidate will match it if found
    otherwise, the first candidate match is returned
    """
    url = current_bioguide[bioguide]["terms"][-1].get("url", None)
    if not url:
      if debug:
        print("[%s] No official website, skipping" % bioguide)
      return None

    if debug:
      print("[%s] Downloading..." % bioguide)
    cache = "congress/%s.html" % bioguide
    body = utils.download(url, cache, force, {'check_redirects': True})
    if not body:
      return None

    all_matches = []
    for regex in regexes[service]:
      matches = re.findall(regex, body, re.I)
      if matches:
        all_matches.extend(matches)

    if current is not None and current in all_matches:
      return current

    if all_matches:
      for candidate in all_matches:
        passed = True
        for blacked in blacklist[service]:
          if re.search(blacked, candidate, re.I):
            passed = False

        if not passed:
          if debug:
            print("\tBlacklisted: %s" % candidate)
          continue

        return candidate
      return None

  if do_update:
    update()
  elif do_clean:
    clean()
  elif do_verify:
    verify()
  elif do_resolveyt:
    resolveyt()
  elif do_resolveig:
    resolveig()
  elif do_resolvetw:
    resolvetw()

  else:
    sweep()
def run():
	today = datetime.now().date()

	# default to not caching
	cache = utils.flags().get('cache', False)
	force = not cache

	y = load_data("legislators-current.yaml")

	for moc in y:
		try:
			term = moc["terms"][-1]
		except IndexError:
			print("Member has no terms", moc)
			continue

		if term["type"] != "rep": continue

		if today < parse_date(term["start"]) or today > parse_date(term["end"]):
			print("Member's last listed term is not current", moc, term["start"])
			continue

		# Specify districts e.g. WA-02 on the command line to only update those.
		# if len(sys.argv) > 1 and ("%s-%02d" % (term["state"], term["district"])) not in sys.argv: continue

		if "class" in term: del term["class"]

		url = "http://clerk.house.gov/member_info/mem_contact_info.aspx?statdis=%s%02d" % (term["state"], term["district"])
		cache = "legislators/house/%s%02d.html" % (term["state"], term["district"])
		try:
			# the meta tag says it's iso-8859-1, but... names are actually in utf8...
			body = download(url, cache, force)
			dom = lxml.html.parse(io.StringIO(body)).getroot()
		except lxml.etree.XMLSyntaxError:
			print("Error parsing: ", url)
			continue

		name = str(dom.cssselect("#results h3")[0].text_content())
		addressinfo = str(dom.cssselect("#results p")[0].text_content())

		# Sanity check that the name is similar.
		if name != moc["name"].get("official_full", ""):
			cfname = moc["name"]["first"] + " " + moc["name"]["last"]
			print("Warning: Are these the same people?", name.encode("utf8"), "|", cfname.encode("utf8"))

		# Parse the address out of the address p tag.
		addressinfo = "; ".join(line.strip() for line in addressinfo.split("\n") if line.strip() != "")
		m = re.match(r"[\w\s]+-(\d+(st|nd|rd|th)|At Large|Delegate|Resident Commissioner), ([A-Za-z]*)(.+); Phone: (.*)", addressinfo, re.DOTALL)
		if not m:
			print("Error parsing address info: ", name.encode("utf8"), ":", addressinfo.encode("utf8"))
			continue

		address = m.group(4)
		phone = re.sub("^\((\d\d\d)\) ", lambda m : m.group(1) + "-", m.group(5)) # replace (XXX) area code with XXX- for compatibility w/ existing format
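		# e.g. "(202) 225-0000" becomes "202-225-0000"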

		office = address.split(";")[0].replace("HOB", "House Office Building")

		moc["name"]["official_full"] = name
		term["address"] = address
		term["office"] = office
		term["phone"] = phone

	save_data(y, "legislators-current.yaml")
Example #35
def run(options):
  cache = utils.flags().get('cache', False)
  force = not cache
  scrape(options)
Example #36
    def resolvetw():
        """
    Does two batch lookups:

    1. All entries with `twitter_id`: Checks to see if the corresponding Twitter profile has the same screen_name
        as found in the entry's `twitter`. If not, the `twitter` value is updated.
    2. All entries with `twitter` (but not `twitter_id`): fetches the corresponding Twitter profile by screen_name and
        inserts ID. If no profile is found, the `twitter` value is deleted.

    Note: cache/twitter_client_id must be a formatted JSON dict:
        {
        "consumer_secret": "xyz",
        "access_token": "abc",
        "access_token_secret": "def",
        "consumer_key": "jk"
       }
    """
        import rtyaml
        from social.twitter import get_api, fetch_profiles
        updated_media = rtyaml.RtYamlList()
        if hasattr(media, '__initial_comment_block'):
            updated_media.__initial_comment_block = getattr(
                media, '__initial_comment_block')

        client_id_file = open('cache/twitter_client_id', 'r')
        _c = json.load(client_id_file)
        api = get_api(_c['access_token'], _c['access_token_secret'],
                      _c['consumer_key'], _c['consumer_secret'])
        bioguide = utils.flags().get('bioguide', None)
        lookups = {
            'screen_names': [],
            'ids': []
        }  # store members that have `twitter` or `twitter_id` info
        for m in media:
            # we start with appending to updated_media so that we keep the same order of entries
            # as found in the loaded file
            updated_media.append(m)
            if bioguide and (m['id']['bioguide'] != bioguide):
                continue
            social = m['social']
            # now we add entries to either the `ids` or the `screen_names` list to batch lookup
            if 'twitter_id' in social:
                # add to the queue to be batched-looked-up
                lookups['ids'].append(m)
                # append
            elif 'twitter' in social:
                lookups['screen_names'].append(m)

        #######################################
        # perform Twitter batch lookup for ids:
        if lookups['screen_names']:
            arr = lookups['screen_names']
            print("Looking up Twitter ids for", len(arr), "names.")
            tw_names = [m['social']['twitter'] for m in arr]
            tw_profiles = fetch_profiles(api, screen_names=tw_names)
            for m in arr:
                social = m['social']
                # find profile that corresponds to a given screen_name
                twitter_handle = social['twitter']
                twp = next(
                    (p for p in tw_profiles
                     if p['screen_name'].lower() == twitter_handle.lower()),
                    None)
                if twp:
                    m['social']['twitter_id'] = int(twp['id'])
                    print("Matched twitter_id `%s` to `%s`" %
                          (social['twitter_id'], twitter_handle))
                else:
                    # Remove errant Twitter entry for now
                    print("No Twitter user profile for:", twitter_handle)
                    m['social'].pop('twitter')
                    print("\t ! removing Twitter handle:", twitter_handle)
        ##########################################
        # perform Twitter batch lookup for names by id, to update any renamings:
        if lookups['ids']:
            arr = lookups['ids']
            print("Looking up Twitter screen_names for", len(arr), "ids.")
            tw_ids = [m['social']['twitter_id'] for m in arr]
            tw_profiles = fetch_profiles(api, ids=tw_ids)
            any_renames_needed = False
            for m in arr:
                social = m['social']
                # find profile that corresponds to a given screen_name
                t_id = social['twitter_id']
                t_name = social.get('twitter')
                twp = next((p for p in tw_profiles if int(p['id']) == t_id),
                           None)
                if twp:
                    # Be silent if there is no change to screen name
                    if t_name and (twp['screen_name'].lower()
                                   == t_name.lower()):
                        pass
                    else:
                        any_renames_needed = True
                        m['social']['twitter'] = twp['screen_name']
                        print("For twitter_id `%s`, renamed `%s` to `%s`" %
                              (t_id, t_name, m['social']['twitter']))
                else:
                    # No entry found for this twitter id
                    print("No Twitter user profile for %s, %s" %
                          (t_id, t_name))
                    m['social'].pop('twitter_id')
                    print("\t ! removing Twitter id:", t_id)
            if not any_renames_needed:
                print("No renames needed")
        # all done with Twitter
        save_data(updated_media, "legislators-social-media.yaml")
def run():

    options = utils.flags()
    options['urllib'] = True # disable scrapelib for this

    debug = options.get('debug', False)

    # default to NOT caching
    cache = options.get('cache', False)
    force = not cache


    only_bioguide = options.get('bioguide', None)


    # pick either current or historical
    # order is important here, since current defaults to true
    if utils.flags().get('historical', False):
      filename = "legislators-historical.yaml"
    elif utils.flags().get('current', True):
      filename = "legislators-current.yaml"
    else:
      print("No legislators selected.")
      exit(0)


    print("Loading %s..." % filename)
    legislators = load_data(filename)


    api_file = open('cache/sunlight_api_key.txt','r')
    api_key = api_file.read()


    for m in legislators:

        # this can't run unless we've already collected a bioguide for this person
        bioguide = m["id"].get("bioguide", None)
        if not bioguide:
            continue
        # if we've limited this to just one bioguide, skip over everyone else
        if only_bioguide and (bioguide != only_bioguide):
            continue

        url_BG = "http://transparencydata.com/api/1.0/entities/id_lookup.json?bioguide_id="
        url_BG += bioguide
        url_BG += "&apikey="+api_key


        destination = "legislators/influence_explorer/lookups/%s.json" % bioguide
        if debug: print("[%s] Looking up ID..." % bioguide)
        body = utils.download(url_BG, destination, force, options)

        if not body:
            print("[%s] Bad request, skipping" % bioguide)
            continue

        jsondata = json.loads(body)
        if (jsondata != []):
            IE_ID = jsondata[0]['id']
            url_CRP = "http://transparencydata.com/api/1.0/entities/"
            url_CRP += IE_ID
            url_CRP += ".json?apikey=" + api_key

            destination = "legislators/influence_explorer/entities/%s.json" % IE_ID
            body = utils.download(url_CRP, destination, force, options)

            jsondata = json.loads(body)

            opensecrets_id = None
            fec_ids = []
            for external in jsondata['external_ids']:
                if external["namespace"].startswith("urn:crp"):
                    opensecrets_id = external['id']
                elif external["namespace"].startswith("urn:fec"):
                    fec_ids.append(external['id'])

            if opensecrets_id:
                m["id"]["opensecrets"] = opensecrets_id

            # preserve existing FEC IDs, but don't duplicate them
            if len(fec_ids) > 0:
                if m["id"].get("fec", None) is None: m["id"]["fec"] = []
                for fec_id in fec_ids:
                    if fec_id not in m["id"]["fec"]:
                        m["id"]["fec"].append(fec_id)

            print("[%s] Added opensecrets ID of %s" % (bioguide, opensecrets_id))
        else:
            print("[%s] NO DATA" % bioguide)




    print("Saving data to %s..." % filename)
    save_data(legislators, filename)
Example #38
def run():

    today = datetime.now().date()

    # default to not caching
    cache = utils.flags().get('cache', False)
    force = not cache

    y = load_data("legislators-current.yaml")

    # Map bioguide IDs to dicts. Reference the same dicts
    # in y so we are updating y when we update bioguide.
    bioguide = {}
    by_name = {}
    for m in y:
        if "bioguide" in m["id"]:
            bioguide[m["id"]["bioguide"]] = m
        party = m["terms"][-1]["party"][0]
        state = m["terms"][-1]["state"]
        last_name = m["name"]["last"]
        member_full = "%s (%s-%s)" % (last_name, party, state)
        by_name[member_full] = m

    print("Fetching general Senate information from senators_cfm.xml...")

    url = "http://www.senate.gov/general/contact_information/senators_cfm.xml"
    body = download(url, "legislators/senate.xml", force)
    dom = lxml.etree.parse(
        io.BytesIO(body.encode("utf8"))
    )  # file has an <?xml declaration and so must be parsed as a bytes array
    for node in dom.xpath("member"):
        bioguide_id = str(node.xpath("string(bioguide_id)")).strip()
        member_full = node.xpath("string(member_full)")

        if bioguide_id == "":
            print("Someone has an empty bioguide ID!")
            print(lxml.etree.tostring(node))
            continue

        print("[%s] Processing Senator %s..." % (bioguide_id, member_full))

        # find member record in our YAML, either by bioguide_id or member_full
        if bioguide_id in bioguide:
            member = bioguide[bioguide_id]
        else:
            if member_full in by_name:
                member = by_name[member_full]
            else:
                print("Bioguide ID '%s' and full name '%s' not recognized." %
                      (bioguide_id, member_full))
                exit(0)

        try:
            term = member["terms"][-1]
        except IndexError:
            print("Member has no terms", bioguide_id, member_full)
            continue

        if today < parse_date(term["start"]) or today > parse_date(
                term["end"]):
            print("Member's last listed term is not current", bioguide_id,
                  member_full, term["start"])
            continue

        if term["type"] != "sen":
            print("Member's last listed term is not a Senate term",
                  bioguide_id, member_full)
            continue

        if term["state"] != str(node.xpath("string(state)")):
            print("Member's last listed term has the wrong state", bioguide_id,
                  member_full)
            continue

        if "district" in term: del term["district"]

        full_name = str(node.xpath("string(first_name)"))
        suffix = None
        if ", " in full_name: full_name, suffix = full_name.split(", ")
        full_name += " " + str(node.xpath("string(last_name)"))
        if suffix: full_name += ", " + suffix
        member["name"]["official_full"] = full_name

        member["id"]["bioguide"] = bioguide_id

        term["class"] = {
            "Class I": 1,
            "Class II": 2,
            "Class III": 3
        }[node.xpath("string(class)")]
        term["party"] = {
            "D": "Democrat",
            "R": "Republican",
            "I": "Independent",
            "ID": "Independent"
        }[node.xpath("string(party)")]

        url = str(node.xpath("string(website)")).strip()
        if not url.startswith("/"):
            # temporary home pages for new senators are relative links?

            # hit the URL to resolve any redirects to get the canonical URL,
            # since the senate.gov listing sometimes gives URLs that redirect.
            try:
                resp = urllib.request.urlopen(url)
                url = resp.geturl()
            except Exception as e:
                print(url, e)

            # kill trailing slash
            url = re.sub("/$", "", url)

            term["url"] = url

        term["address"] = str(node.xpath("string(address)")).strip().replace(
            "\n      ", " ")
        term["office"] = string.capwords(
            term["address"].upper().split(" WASHINGTON ")[0])

        phone = str(node.xpath("string(phone)")).strip()
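        # e.g. a raw value of "(202) 224-3121" is normalized to "202-224-3121" below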
        term["phone"] = phone.replace("(", "").replace(")",
                                                       "").replace(" ", "-")

        #contact_form = str(node.xpath("string(email)")).strip().replace(".Senate.gov", ".senate.gov")
        #if contact_form: # can be blank
        #	term["contact_form"] = contact_form

    print(
        "\n\nUpdating Senate stateRank and LIS ID from cvc_member_data.xml...")

    url = "http://www.senate.gov/legislative/LIS_MEMBER/cvc_member_data.xml"
    body = download(url, "legislators/senate_cvc.xml", force)
    dom = lxml.etree.parse(io.StringIO(body))
    for node in dom.getroot():
        if node.tag == "lastUpdate":
            date, time = node.getchildren()
            print("Last updated: %s, %s" % (date.text, time.text))
            continue

        bioguide_id = str(node.xpath("string(bioguideId)")).strip()
        if bioguide_id == "":
            print("Someone has an empty bioguide ID!")
            print(lxml.etree.tostring(node))
            continue

        last_name = node.xpath("string(name/last)")
        party = node.xpath("string(party)")
        state = node.xpath("string(state)")
        member_full = "%s (%s-%s)" % (last_name, party, state)

        print("[%s] Processing Senator %s..." % (bioguide_id, member_full))

        # find member record in our YAML, either by bioguide_id or member_full
        if bioguide_id in bioguide:
            member = bioguide[bioguide_id]
        else:
            if member_full in by_name:
                member = by_name[member_full]
            else:
                print(
                    "Bioguide ID '%s' and synthesized official name '%s' not recognized."
                    % (bioguide_id, member_full))
                exit(0)

        try:
            term = member["terms"][-1]
        except IndexError:
            print("Member has no terms", bioguide_id, member_full)
            continue

        if "id" not in member:
            member["id"] = {}

        member["id"]["lis"] = node.attrib["lis_member_id"]
        state_rank = node.xpath("string(stateRank)")
        if state_rank == '1':
            term["state_rank"] = "senior"
        elif state_rank == '2':
            term["state_rank"] = "junior"

    print("Saving data...")
    save_data(y, "legislators-current.yaml")
def main():
  regexes = {
    "youtube": [
      "https?://(?:www\\.)?youtube.com/(?:user/)?([^\\s\"']+)"
    ],
    "facebook": [
      "https?://(?:www\\.)?facebook.com/(?:home\\.php#!)?(?:#!)?(?:people/)?/?([^\\s\"']+)"
    ],
    "twitter": [
      "https?://(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/]+)",
      "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)"
    ]
  }
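  # For illustration (not part of the original script): the first twitter pattern
  # extracts the handle from a profile link found in page HTML, e.g.
  #   re.findall(regexes["twitter"][0], '<a href="https://twitter.com/SenSchumer">', re.I)
  # returns ['SenSchumer'].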

  debug = utils.flags().get('debug', False)
  do_update = utils.flags().get('update', False)
  do_clean = utils.flags().get('clean', False)
  do_verify = utils.flags().get('verify', False)

  # default to not caching
  cache = utils.flags().get('cache', False)
  force = not cache

  service = utils.flags().get('service', None)
  if service not in ["twitter", "youtube", "facebook"]:
    print "--service must be one of twitter, youtube, or facebook"
    exit(0)

  # load in members, orient by bioguide ID
  print "Loading current legislators..."
  current = load_data("legislators-current.yaml")
  
  current_bioguide = { }
  for m in current:
    if "bioguide" in m["id"]:
      current_bioguide[m["id"]["bioguide"]] = m

  print "Loading blacklist..."
  blacklist = {
    'twitter': [], 'facebook': [], 'services': []
  }
  for rec in csv.DictReader(open("data/social_media_blacklist.csv")):
    blacklist[rec["service"]].append(rec["pattern"])

  print "Loading whitelist..."
  whitelist = {
    'twitter': [], 'facebook': [], 'services': []
  }
  for rec in csv.DictReader(open("data/social_media_whitelist.csv")):
    whitelist[rec["service"]].append(rec["account"].lower())

  # reorient currently known social media by ID
  print "Loading social media..."
  media = load_data("legislators-social-media.yaml")
  media_bioguide = { }
  for m in media:
    media_bioguide[m["id"]["bioguide"]] = m

  def sweep():
    to_check = []

    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      possibles = [bioguide]
    else:
      possibles = current_bioguide.keys()

    for bioguide in possibles:
      if media_bioguide.get(bioguide, None) is None:
        to_check.append(bioguide)
      elif media_bioguide[bioguide]["social"].get(service, None) is None:
        to_check.append(bioguide)
      else:
        pass

    utils.mkdir_p("cache/social_media")
    writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w'))
    writer.writerow(["bioguide", "official_full", "website", "service", "candidate"])

    for bioguide in to_check:
      candidate = candidate_for(bioguide)
      if candidate:
        url = current_bioguide[bioguide]["terms"][-1].get("url", None)
        writer.writerow([bioguide, current_bioguide[bioguide]['name']['official_full'], url, service, candidate])
        print "\tWrote: %s" % candidate

  def verify():
    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      to_check = [bioguide]
    else:
      to_check = media_bioguide.keys()

    for bioguide in to_check:
      entry = media_bioguide[bioguide]
      current = entry['social'].get(service, None)
      if not current:
        continue

      bioguide = entry['id']['bioguide']

      candidate = candidate_for(bioguide)
      if not candidate:
        # if current is in whitelist, and none is on the page, that's okay
        if current.lower() in whitelist[service]:
          continue
        else:
          candidate = ""

      url = current_bioguide[bioguide]['terms'][-1].get('url')

      if current.lower() != candidate.lower():
        print "[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate)

  def update():
    for rec in csv.DictReader(open("cache/social_media/%s_candidates.csv" % service)):
      bioguide = rec["bioguide"]
      candidate = rec["candidate"]

      if bioguide in media_bioguide:
        media_bioguide[bioguide]['social'][service] = candidate
      else:
        new_media = {'id': {}, 'social': {}}

        new_media['id']['bioguide'] = bioguide
        thomas_id = current_bioguide[bioguide]['id'].get("thomas", None)
        if thomas_id:
          new_media['id']['thomas'] = thomas_id

        new_media['social'][service] = candidate
        media.append(new_media)

    print "Saving social media..."
    save_data(media, "legislators-social-media.yaml")

  def clean():
    print "Loading historical legislators..."
    historical = load_data("legislators-historical.yaml")

    count = 0
    for m in historical:
      if m["id"]["bioguide"] in media_bioguide:
        media.remove(media_bioguide[m["id"]["bioguide"]])
        count += 1
    print "Removed %i out of office legislators from social media file..." % count

    print "Saving historical legislators..."
    save_data(media, "legislators-social-media.yaml")

  def candidate_for(bioguide):
    url = current_bioguide[bioguide]["terms"][-1].get("url", None)
    if not url:
      if debug:
        print "[%s] No official website, skipping" % bioguide
      return None

    if debug:
      print "[%s] Downloading..." % bioguide
    cache = "congress/%s.html" % bioguide
    body = utils.download(url, cache, force)

    all_matches = []
    for regex in regexes[service]:
      matches = re.findall(regex, body, re.I)
      if matches:
        all_matches.extend(matches)

    if all_matches:
      for candidate in all_matches:
        passed = True
        for blacked in blacklist[service]:
          if re.search(blacked, candidate, re.I):
            passed = False
        
        if not passed:
          if debug:
            print "\tBlacklisted: %s" % candidate
          continue

        return candidate
      return None

  if do_update:
    update()
  elif do_clean:
    clean()
  elif do_verify:
    verify()
  else:
    sweep()
def run():
  committee_membership = { }

  committees_current = load_data("committees-current.yaml")
  memberships_current = load_data("committee-membership-current.yaml")

  # default to not caching
  cache = utils.flags().get('cache', False)
  force = not cache


  # map house/senate committee IDs to their dicts
  house_ref = { }
  for cx in committees_current:
    if "house_committee_id" in cx:
      house_ref[cx["house_committee_id"]] = cx
  senate_ref = { }
  for cx in committees_current:
    if "senate_committee_id" in cx:
      senate_ref[cx["senate_committee_id"]] = cx


  # map state/district to current representatives and state/lastname to current senators
  # since the House/Senate pages do not provide IDs for Members of Congress
  today = datetime.datetime.now().date()
  legislators_current = load_data("legislators-current.yaml")
  congressmen = { }
  senators = { }
  for moc in legislators_current:
    term = moc["terms"][-1]
    if today < parse_date(term["start"]) or today > parse_date(term["end"]):
      raise ValueError("Member's last listed term is not current: " + repr(moc) + " / " + term["start"])
    if term["type"] == "rep":
      congressmen["%s%02d" % (term["state"], term["district"])] = moc
    elif term["type"] == "sen":
      for n in [moc["name"]] + moc.get("other_names", []):
        senators[(term["state"], n["last"])] = moc
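
  # lookup keys end up shaped like congressmen["NY10"] (state + zero-padded
  # district) and senators[("NY", "Schumer")] (state, last name)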


  # Scrape clerk.house.gov...

  def scrape_house_alt():
    for id, cx in list(house_ref.items()):
      scrape_house_committee(cx, cx["thomas_id"], id + "00")

  def scrape_house():
    """The old way of scraping House committees was to start with the committee list
    at the URL below, but this page no longer has links to the committee info pages
    even though those pages exist. Preserving this function in case we need it later."""
    url = "http://clerk.house.gov/committee_info/index.aspx"
    body = download(url, "committees/membership/house.html", force)
    for id, name in re.findall(r'<a href="/committee_info/index.aspx\?comcode=(..)00">(.*)</a>', body, re.I):
      if id not in house_ref:
        print("Unrecognized committee:", id, name)
        continue
      cx = house_ref[id]
      scrape_house_committee(cx, cx["thomas_id"], id + "00")

  def scrape_house_committee(cx, output_code, house_code):
    # load the House Clerk's committee membership page for the committee
    # (it is encoded in utf-8 even though the page indicates otherwise, and
    # while we don't really care, it helps our sanity check that compares
    # names)
    url = "http://clerk.house.gov/committee_info/index.aspx?%s=%s" % ('comcode' if house_code[-2:] == '00' else 'subcomcode', house_code)
    body = download(url, "committees/membership/house/%s.html" % house_code, force)
    dom = lxml.html.parse(io.StringIO(body)).getroot()

    # update official name metadata
    if house_code[-2:] == "00":
      cx["name"] = "House " + str(dom.cssselect("#com_display h3")[0].text_content())
    else:
      cx["name"] = str(dom.cssselect("#subcom_title h4")[0].text_content())

    # update address/phone metadata
    address_info = re.search(r"""Mailing Address:\s*(.*\S)\s*Telephone:\s*(\(202\) .*\S)""", dom.cssselect("#address")[0].text_content(), re.I | re.S)
    if not address_info: raise Exception("Failed to parse address info in %s." % house_code)
    cx["address"] = address_info.group(1)
    cx["address"] = re.sub(r"\s+", " ", cx["address"])
    cx["address"] = re.sub(r"(.*\S)(Washington, DC \d+)\s*(-\d+)?", lambda m : m.group(1) + "; " + m.group(2) + (m.group(3) if m.group(3) else ""), cx["address"])
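    # the substitution above splices "; " between the office line and the
    # "Washington, DC ..." portion when the two run together in the page text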
    cx["phone"] = address_info.group(2)

    # get the ratio line to use in a sanity check later
    ratio = dom.cssselect("#ratio")
    if len(ratio): # some committees are missing
      ratio = re.search(r"Ratio (\d+)/(\d+)", ratio[0].text_content())
    else:
      ratio = None
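    # when present, ratio.group(1) and ratio.group(2) are the majority and
    # minority seat counts, checked against the per-party tallies below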

    # scan the membership, which is listed by party
    for i, party, nodename in ((1, 'majority', 'primary'), (2, 'minority', 'secondary')):
      ctr = 0
      for rank, node in enumerate(dom.cssselect("#%s_group li" % nodename)):
        ctr += 1
        lnk = node.cssselect('a')
        if len(lnk) == 0:
          if node.text_content() == "Vacancy": continue
          raise ValueError("Failed to parse a <li> node.")
        moc = lnk[0].get('href')
        m = re.search(r"statdis=([A-Z][A-Z]\d\d)", moc)
        if not m: raise ValueError("Failed to parse member link: " + moc)
        if not m.group(1) in congressmen:
          print("Vacancy discrepancy? " + m.group(1))
          continue

        moc = congressmen[m.group(1)]
        found_name = node.cssselect('a')[0].text_content().replace(", ", "")

        if moc['name'].get("official_full", None) is None:
          print("No official_full field for %s" % found_name)
          continue

        if found_name != moc['name']['official_full']:
          print("Name mismatch: %s (in our file) vs %s (on the Clerk page)" % (moc['name']['official_full'], node.cssselect('a')[0].text_content()))

        entry = OrderedDict()
        entry["name"] = moc['name']['official_full']
        entry["party"] = party
        entry["rank"] = rank+1
        if rank == 0:
          entry["title"] = "Chair" if entry["party"] == "majority" else "Ranking Member" # not explicit, frown
        entry.update(ids_from(moc["id"]))

        committee_membership.setdefault(output_code, []).append(entry)

        # the .tail attribute has the text to the right of the link
        m = re.match(r", [A-Z][A-Z](,\s*)?(.*\S)?", lnk[0].tail)
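        # e.g. a tail of ", TX, Chairman" leaves "Chairman" in m.group(2),
        # while a plain ", TX" leaves group(2) empty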
        if m.group(2):
          # Chairman, Vice Chair, etc. (all but Ex Officio) started appearing on subcommittees around Feb 2014.
          # For the chair, this should overwrite the implicit title given for the rank 0 majority party member.
          if m.group(2) in ("Chair", "Chairman", "Chairwoman"):
            entry["title"] = "Chair"
          elif m.group(2) in ("Vice Chair", "Vice Chairman"):
            entry["title"] = "Vice Chair"

          elif m.group(2) == "Ex Officio":
            entry["title"] = m.group(2)

          else:
            raise ValueError("Unrecognized title information '%s' in %s." % (m.group(2), url))

      # sanity check we got the right number of nodes
      if ratio and ctr != int(ratio.group(i)): raise ValueError("Parsing didn't get the right count of members.")

    # scan for subcommittees
    for subcom in dom.cssselect("#subcom_list li a"):
      m = re.search("subcomcode=(..(\d\d))", subcom.get('href'))
      if not m: raise ValueError("Failed to parse subcommittee link.")

      for sx in cx['subcommittees']:
        if sx["thomas_id"] == m.group(2):
          break
      else:
        print("Subcommittee not found, creating it", output_code, m.group(1))
        sx = OrderedDict()
        sx['name'] = "[not initialized]" # will be set inside of scrape_house_committee
        sx['thomas_id'] = m.group(2)
        cx['subcommittees'].append(sx)
      scrape_house_committee(sx, cx["thomas_id"] + sx["thomas_id"], m.group(1))

  # Scrape senate.gov....
  def scrape_senate():
    url = "https://www.senate.gov/pagelayout/committees/b_three_sections_with_teasers/membership.htm"
    body = download(url, "committees/membership/senate.html", force)

    for id, name in re.findall(r'value="/general/committee_membership/committee_memberships_(....).htm">(.*?)</option>', body, re.I |  re.S):
      if id not in senate_ref:
        print("Unrecognized committee:", id, name)
        continue

      cx = senate_ref[id]
      is_joint = (id[0] == "J")

      # Scrape some metadata on the HTML page first.

      committee_url = "https://www.senate.gov/general/committee_membership/committee_memberships_%s.htm" % id
      print("[%s] Fetching members for %s (%s)" % (id, name, committee_url))
      body2 = download(committee_url, "committees/membership/senate/%s.html" % id, force)

      if not body2:
        print("\tcommittee page not good:", committee_url)
        continue

      m = re.search(r'<span class="contenttext"><a href="(http://(.*?).senate.gov/)">', body2, re.I)
      if m:
        cx["url"] = m.group(1)

      # Use the XML for the rest.

      print("\tDownloading XML...")
      committee_url = "https://www.senate.gov/general/committee_membership/committee_memberships_%s.xml" % id

      body3 = download(committee_url, "committees/membership/senate/%s.xml" % id, force)
      dom = lxml.etree.fromstring(body3.encode("utf8")) # must be bytes to parse if there is an encoding declaration inside the string

      cx["name"] = dom.xpath("committees/committee_name")[0].text
      if id[0] != "J" and id[0:2] != 'SC':
        cx["name"] = "Senate " + cx["name"]

      majority_party = dom.xpath("committees/majority_party")[0].text

      # update full committee members
      committee_membership[id] = []
      for member in dom.xpath("committees/members/member"):
        scrape_senate_member(committee_membership[id], member, majority_party, is_joint)

      # update subcommittees
      for subcom in dom.xpath("committees/subcommittee"):
        scid = subcom.xpath("committee_code")[0].text[4:]
        for sx in cx.get('subcommittees', []):
          if sx["thomas_id"] == scid:
            break
        else:
          print("Subcommittee not found, creating it", scid, name)
          sx = OrderedDict()
          sx['thomas_id'] = scid
          cx.setdefault('subcommittees', []).append(sx)

        # update metadata
        name = subcom.xpath("subcommittee_name")[0].text
        sx["name"] = name.strip()
        sx["name"] = re.sub(r"^\s*Subcommittee on\s*", "", sx["name"])
        sx["name"] = re.sub(r"\s+", " ", sx["name"])

        committee_membership[id + scid] = []
        for member in subcom.xpath("members/member"):
          scrape_senate_member(committee_membership[id + scid], member, majority_party, is_joint)

  def scrape_senate_member(output_list, membernode, majority_party, is_joint):
    last_name = membernode.xpath("name/last")[0].text
    state = membernode.xpath("state")[0].text
    party = "majority" if membernode.xpath("party")[0].text == majority_party else "minority"
    title = membernode.xpath("position")[0].text
    if title == "Member": title = None
    if title == "Ranking": title = "Ranking Member"

    # look up senator by state and last name
    if (state, last_name) not in senators:
      print("\t[%s] Unknown member: %s" % (state, last_name))
      return None

    moc = senators[(state, last_name)]

    entry = OrderedDict()
    if 'official_full' in moc['name']:
      entry["name"] = moc['name']['official_full']
    else:
      print("missing name->official_full field for", moc['id']['bioguide'])
    entry["party"] = party
    entry["rank"] = len([e for e in output_list if e["party"] == entry["party"]]) + 1 # how many have we seen so far in this party, +1
    if title: entry["title"] = title
    entry.update(ids_from(moc["id"]))
    if is_joint: entry["chamber"] = "senate"

    output_list.append(entry)

    # sort by party, then by rank, since we get the nodes in the XML in a rough seniority order that ignores party
    # should be done once at the end, but cleaner to do it here
    output_list.sort(key = lambda e : (e["party"] != "majority", e["rank"]))

  # stick to a specific small set of official IDs to cross-link members;
  # this keeps the IDs in this file from proliferating, while preserving
  # our flexibility to be inclusive of IDs in the main legislator files
  def ids_from(moc):
    ids = {}
    for id in ["bioguide", "thomas"]:
      if id in moc:
        ids[id] = moc[id]
    if len(ids) == 0:
      raise ValueError("Missing an official ID for this legislator, won't be able to link back")
    return ids

  def restore_house_members_on_joint_committees():
    # The House doesn't publish joint committee members, but we're manually gathering
    # that. Add them back into the output from whatever we have on disk. Put them after
    # Senate members.
    for c, mbrs in list(memberships_current.items()):
      if c[0] != "J": continue
      for m in mbrs:
        if m["chamber"] != "house": continue
        committee_membership[c].append(m)

  # MAIN

  scrape_house()
  scrape_senate()
  restore_house_members_on_joint_committees()

  save_data(committee_membership, "committee-membership-current.yaml")
  save_data(committees_current, "committees-current.yaml")
#!/usr/bin/env python

# Use the House's member labels file to update some basic info, including bioguide IDs, for members.

# Assumes state and district are already present.

import csv, re
import utils
from utils import download, load_data, save_data, parse_date


house_labels = "labels-113.csv"

# default to not caching
cache = utils.flags().get("cache", False)
force = not cache
names = utils.flags().get("names", False)

y = load_data("legislators-current.yaml")
by_district = {}
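# by_district will be keyed like "IL02" (state + zero-padded district number)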
for m in y:
    last_term = m["terms"][-1]
    if last_term["type"] != "sen":
        full_district = "%s%02d" % (last_term["state"], int(last_term["district"]))
        by_district[full_district] = m


for rec in csv.DictReader(open(house_labels)):
    full_district = rec["113 ST/DIS"]

    # empty seat - IL-02
def run():

	today = datetime.now().date()

	# default to not caching
	cache = utils.flags().get('cache', False)
	force = not cache

	y = load_data("legislators-current.yaml")

	# Map bioguide IDs to dicts. Reference the same dicts
	# in y so we are updating y when we update bioguide.
	bioguide = { }
	by_name = { }
	for m in y:
		if "bioguide" in m["id"]:
			bioguide[m["id"]["bioguide"]] = m
		party = m["terms"][-1]["party"][0]
		state = m["terms"][-1]["state"]
		last_name = m["name"]["last"]
		member_full = "%s (%s-%s)" % (last_name, party, state)
		by_name[member_full] = m


	print("Fetching general Senate information from senators_cfm.xml...")

	url = "http://www.senate.gov/general/contact_information/senators_cfm.xml"
	body = download(url, "legislators/senate.xml", force)
	dom = lxml.etree.parse(io.StringIO(body))
	for node in dom.xpath("member"):
		bioguide_id = str(node.xpath("string(bioguide_id)")).strip()
		member_full = node.xpath("string(member_full)")

		if bioguide_id == "":
			print("Someone has an empty bioguide ID!")
			print(lxml.etree.tostring(node))
			continue

		print("[%s] Processing Senator %s..." % (bioguide_id, member_full))

		# find member record in our YAML, either by bioguide_id or member_full
		if bioguide_id in bioguide:
			member = bioguide[bioguide_id]
		else:
			if member_full in by_name:
				member = by_name[member_full]
			else:
				print("Bioguide ID '%s' and full name '%s' not recognized." % (bioguide_id, member_full))
				exit(0)

		try:
			term = member["terms"][-1]
		except IndexError:
			print("Member has no terms", bioguide_id, member_full)
			continue

		if today < parse_date(term["start"]) or today > parse_date(term["end"]):
			print("Member's last listed term is not current", bioguide_id, member_full, term["start"])
			continue

		if term["type"] != "sen":
			print("Member's last listed term is not a Senate term", bioguide_id, member_full)
			continue


		if term["state"] != str(node.xpath("string(state)")):
			print("Member's last listed term has the wrong state", bioguide_id, member_full)
			continue

		if "district" in term: del term["district"]

		full_name = str(node.xpath("string(first_name)"))
		suffix = None
		if ", " in full_name: full_name, suffix = full_name.split(", ")
		full_name += " " + str(node.xpath("string(last_name)"))
		if suffix: full_name += ", " + suffix
		member["name"]["official_full"] = full_name

		member["id"]["bioguide"] = bioguide_id

		term["class"] = { "Class I": 1, "Class II": 2, "Class III": 3}[ node.xpath("string(class)") ]
		term["party"] = { "D": "Democrat", "R": "Republican", "I": "Independent", "ID": "Independent"}[ node.xpath("string(party)") ]

		url = str(node.xpath("string(website)")).strip()

		# kill trailing slashes and force hostname to lowercase since around December 2013 they started uppercasing "Senate.gov"
		url = re.sub("/$", "", url).replace(".Senate.gov", ".senate.gov")

		if not url.startswith("/"): term["url"] = url # temporary home pages for new senators
		term["address"] = str(node.xpath("string(address)")).strip().replace("\n      ", " ")
		term["office"] = string.capwords(term["address"].upper().split(" WASHINGTON ")[0])

		phone = str(node.xpath("string(phone)")).strip()
		term["phone"] = phone.replace("(", "").replace(")", "").replace(" ", "-")

		contact_form = str(node.xpath("string(email)")).strip().replace(".Senate.gov", ".senate.gov")
		if contact_form: # can be blank
			term["contact_form"] = contact_form



	print("\n\nUpdating Senate stateRank and LIS ID from cvc_member_data.xml...")

	url = "http://www.senate.gov/legislative/LIS_MEMBER/cvc_member_data.xml"
	body = download(url, "legislators/senate_cvc.xml", force)
	dom = lxml.etree.parse(io.StringIO(body))
	for node in dom.getroot():
		if node.tag == "lastUpdate":
			date, time = node.getchildren()
			print("Last updated: %s, %s" % (date.text, time.text))
			continue

		bioguide_id = str(node.xpath("string(bioguideId)")).strip()
		if bioguide_id == "":
			print("Someone has an empty bioguide ID!")
			print(lxml.etree.tostring(node))
			continue

		last_name = node.xpath("string(name/last)")
		party = node.xpath("string(party)")
		state = node.xpath("string(state)")
		member_full = "%s (%s-%s)" % (last_name, party, state)

		print("[%s] Processing Senator %s..." % (bioguide_id, member_full))

		# find member record in our YAML, either by bioguide_id or member_full
		if bioguide_id in bioguide:
			member = bioguide[bioguide_id]
		else:
			if member_full in by_name:
				member = by_name[member_full]
			else:
				print("Bioguide ID '%s' and synthesized official name '%s' not recognized." % (bioguide_id, member_full))
				exit(0)

		try:
			term = member["terms"][-1]
		except IndexError:
			print("Member has no terms", bioguide_id, member_full)
			continue

		if "id" not in member:
			member["id"] = {}

		member["id"]["lis"] = node.attrib["lis_member_id"]
		state_rank = node.xpath("string(stateRank)")
		if state_rank == '1':
			term["state_rank"] = "senior"
		elif state_rank == '2':
			term["state_rank"] = "junior"


	print("Saving data...")
	save_data(y, "legislators-current.yaml")
# gets CRP id for every member with a bioguide ID:

# options:
#  --cache: load from cache if present on disk (default: false)
#  --current: do *only* current legislators (default: true)
#  --historical: do *only* historical legislators (default: false)

import datetime
import re
import utils
import urllib2
import requests
from utils import download, load_data, save_data, parse_date
import json

options = utils.flags()
options['urllib'] = True # disable scrapelib for this

debug = options.get('debug', False)

# default to NOT caching
cache = options.get('cache', False)
force = not cache


only_bioguide = options.get('bioguide', None)


# pick either current or historical
# order is important here, since current defaults to true
if utils.flags().get('historical', False):
Example #44
def run():
    def update_birthday(bioguide, person, main):

        birthday = birthday_for(main)
        if not birthday:
            print("[%s] NO BIRTHDAY :(\n\n%s" %
                  (bioguide, main.encode("utf8")))
            warnings.append(bioguide)
            return
        if birthday == "UNKNOWN":
            return

        try:
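            # strip commas so e.g. "January 5, 1962" becomes "January 5 1962",
            # which matches the "%B %d %Y" format below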
            birthday = datetime.datetime.strptime(birthday.replace(",", ""),
                                                  "%B %d %Y")
        except ValueError:
            print("[%s] BAD BIRTHDAY :(\n\n%s" %
                  (bioguide, main.encode("utf8")))
            warnings.append(bioguide)
            return

        birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month,
                                       birthday.day)
        person.setdefault("bio", {})["birthday"] = birthday

    def birthday_for(string):
        # exceptions for not-nicely-placed semicolons
        string = string.replace(
            "born in Cresskill, Bergen County, N. J.; April", "born April")
        string = string.replace(
            "FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;",
            "born September 17, 1802")
        string = string.replace(
            "CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967",
            "born March 13, 1967")
        string = string.replace(
            "CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;",
            "born January 5, 1962")
        string = string.replace(
            "SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947",
            "born March 18, 1947")
        string = string.replace(
            'KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968',
            "born May 29, 1968")

        # look for a date
        pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})"
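        # group(1) captures just the date, e.g. "born March 13, 1967" yields
        # "March 13, 1967"; the [^;] class is why the semicolon exceptions
        # above are rewritten first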
        match = re.search(pattern, string, re.I)
        if not match or not match.group(1):
            # specifically detect cases that we can't handle to avoid unnecessary warnings
            if re.search("birth dates? unknown|date of birth is unknown",
                         string, re.I):
                return "UNKNOWN"
            if re.search(
                    "born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}",
                    string, re.I):
                return "UNKNOWN"
            return None
        return match.group(1).strip()

    def relationships_of(string):
        # relationship data is stored in a parenthetical immediately after the end of the </font> tag in the bio
        # e.g. "(son of Joseph Patrick Kennedy, II, and great-nephew of Edward Moore Kennedy and John Fitzgerald Kennedy)"
        pattern = "^\((.*?)\)"
        match = re.search(pattern, string, re.I)

        relationships = []

        if match and len(match.groups()) > 0:
            # normalize to plain ASCII text (unencodable chars become "?") before tokenizing
            relationship_text = match.group(1).encode("ascii", "replace").decode("ascii")

            # since some relationships refer to multiple people--great-nephew of Edward Moore Kennedy AND John Fitzgerald Kennedy--we need a special grammar
            from nltk import tree, pos_tag, RegexpParser
            tokens = re.split("[ ,;]+|-(?![0-9])", relationship_text)
            pos = pos_tag(tokens)

            grammar = r"""
        NAME: {<NNP>+}
        NAMES: { <IN><NAME>(?:<CC><NAME>)* }
        RELATIONSHIP: { <JJ|NN|RB|VB|VBD|VBN|IN|PRP\$>+ }
        MATCH: { <RELATIONSHIP><NAMES> }
        """
            cp = RegexpParser(grammar)
            chunks = cp.parse(pos)

            # iterate through the Relationship/Names pairs
            for n in chunks:
                if isinstance(n, tree.Tree) and n.label() == "MATCH":
                    people = []
                    relationship = None
                    for piece in n:
                        if piece.label() == "RELATIONSHIP":
                            relationship = " ".join([x[0] for x in piece])
                        elif piece.label() == "NAMES":
                            for name in [
                                    x for x in piece
                                    if isinstance(x, tree.Tree)
                            ]:
                                people.append(" ".join([x[0] for x in name]))
                    for person in people:
                        relationships.append({
                            "relation": relationship,
                            "name": person
                        })
        return relationships

    # default to caching
    cache = utils.flags().get('cache', True)
    force = not cache

    # pick either current or historical
    # order is important here, since current defaults to true
    if utils.flags().get('historical', False):
        filename = "legislators-historical.yaml"
    elif utils.flags().get('current', True):
        filename = "legislators-current.yaml"
    else:
        print("No legislators selected.")
        exit(0)

    print("Loading %s..." % filename)
    legislators = load_data(filename)

    # reorient the legislators so records can be looked up by bioguide ID
    by_bioguide = {}
    for m in legislators:
        if "bioguide" in m["id"]:
            by_bioguide[m["id"]["bioguide"]] = m

    # optionally focus on one legislator

    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
        bioguides = [bioguide]
    else:
        bioguides = list(by_bioguide.keys())

    warnings = []
    missing = []
    count = 0
    families = 0

    for bioguide in bioguides:
        # Download & parse the HTML of the bioguide page.
        try:
            dom = fetch_bioguide_page(bioguide, force)
        except Exception as e:
            print(e)
            missing.append(bioguide)
            continue

        # Extract the member's name and the biography paragraph (main).

        try:
            name = dom.cssselect("p font")[0]
            main = dom.cssselect("p")[0]
        except IndexError:
            print("[%s] Missing name or content!" % bioguide)
            exit(0)

        name = name.text_content().strip()
        main = main.text_content().strip().replace("\n",
                                                   " ").replace("\r", " ")
        main = re.sub("\s+", " ", main)

        # Extract the member's birthday.

        update_birthday(bioguide, by_bioguide[bioguide], main)

        # Extract relationships with other Members of Congress.

        if utils.flags().get("relationships", False):
            # relationship information, if present, is in a parenthetical immediately after the name.
            # should always be present if we passed the IndexError catch above
            after_name = dom.cssselect("p font")[0].tail.strip()
            relationships = relationships_of(after_name)
            if len(relationships):
                families = families + 1
                by_bioguide[bioguide]["family"] = relationships

        count = count + 1

    print()
    if warnings:
        print("Missed %d birthdays: %s" %
              (len(warnings), str.join(", ", warnings)))

    if missing:
        print("Missing a page for %d bioguides: %s" %
              (len(missing), str.join(", ", missing)))

    print("Saving data to %s..." % filename)
    save_data(legislators, filename)

    print("Saved %d legislators to %s" % (count, filename))

    if utils.flags().get("relationships", False):
        print("Found family members for %d of those legislators" % families)