def main():
    """Generates public data dump files from the latest prod data."""

    # Connect to the latest schemas.
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    schema_profil = db.get_latest_schema('source_internal_profil_')
    db.execute('SET search_path="' + schema + '", "' + schema_profil + '";')
    timestamp = schema[schema.rfind('_') + 1:]
    print('[OK] Dumping from schemas "%s" and "%s"...' %
          (schema, schema_profil))

    # Read YAML configuration file.
    config = yaml_load('public_dumps.yaml')
    dir_save = config['save_directory']
    dumps = config['dumps']

    # Process all dumps.
    for dump_name in dumps:
        save_path = os.path.join(dir_save,
                                 '%s_%s.csv' % (dump_name, timestamp))
        db.dump_to_CSV(dumps[dump_name]['query'], save_path)
        print('[OK] Saved dump "%s" to %s' % (dump_name, save_path))

        stage_path = os.path.join(dir_save, dump_name + '.csv')
        shutil.copyfile(save_path, stage_path)
        print('[OK] Copied dump "%s" to %s' % (dump_name, stage_path))

    # Close database connection.
    db.close()
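The snippets on this page all lean on a small yaml_load helper defined in each repo's own utils module. A minimal sketch of what such a helper might look like, assuming PyYAML; the exact name and signature vary by repo (Examples #13-14 pass a data string as a second argument, and the comment in Examples #11 and #15 says one variant returns False on a parse error):

# A minimal sketch of the assumed yaml_load helper; each source repo
# ships its own variant in utils.py, so this signature is an assumption.
import yaml

def yaml_load(path, data=None):
    try:
        if data is not None:
            # Some variants accept a YAML string directly
            # (see Examples #13-14, which call yaml_load("", defaults)).
            return yaml.safe_load(data)
        with open(path) as f:
            return yaml.safe_load(f)
    except yaml.YAMLError:
        # One variant signals invalid YAML by returning False
        # (see the isinstance(..., bool) check in Examples #11 and #15).
        return False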
Example #2
    def gather_meta(self):
        """
        Return the meta file.
        """
        if not os.path.exists(self.paths["meta"]):
            return ""

        meta_dict = utils.yaml_load(self.paths["meta"])

        # gather the dependencies
        if meta_dict and "dependencies" in meta_dict:
            # create a simple list of each role that is a dependency
            dep_list = []

            for dependency in meta_dict["dependencies"]:
                if type(dependency) is dict:
                    dep_list.append(dependency["role"])
                else:
                    dep_list.append(dependency)

            # unique set of dependencies
            meta_dict["dependencies"] = list(set(dep_list))

            self.dependencies = meta_dict["dependencies"]
        else:
            self.dependencies = []

        return utils.file_to_string(self.paths["meta"])
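For context, the two branches in the dependency loop correspond to the two ways an Ansible meta file can list dependencies. A hypothetical meta_dict, in the form yaml_load would return it:

# Hypothetical parsed meta/main.yml showing both dependency forms the
# loop above handles: a bare role name, and a dict with a "role" key.
meta_dict = {
    "dependencies": [
        "common",
        {"role": "nginx", "nginx_port": 8080},
    ]
}
# After the loop: meta_dict["dependencies"] == ["common", "nginx"]
# (order not guaranteed, since the list round-trips through a set).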
Example #3
def run(options):
    # Load the committee metadata from the congress-legislators repository and make a
    # mapping from thomas_id and house_id to the committee dict. For each committee,
    # replace the subcommittees list with a dict from thomas_id to the subcommittee.
    utils.require_congress_legislators_repo()
    committees = {}
    for c in utils.yaml_load("congress-legislators/committees-current.yaml"):
        committees[c["thomas_id"]] = c
        if "house_committee_id" in c:
            committees[c["house_committee_id"] + "00"] = c
        c["subcommittees"] = dict(
            (s["thomas_id"], s) for s in c.get("subcommittees", []))

    for chamber in ("house", "senate"):
        # Load any existing meetings file so we can recycle GUIDs generated for Senate meetings.
        existing_meetings = []
        output_file = utils.data_dir(
        ) + "/committee_meetings_%s.json" % chamber
        if os.path.exists(output_file):
            existing_meetings = json.load(open(output_file))

        # Scrape for meeting info.
        if chamber == "senate":
            meetings = fetch_senate_committee_meetings(existing_meetings,
                                                       committees, options)
        else:
            meetings = fetch_house_committee_meetings(existing_meetings,
                                                      committees, options)

        # Write out.
        utils.write(
            json.dumps(meetings,
                       sort_keys=True,
                       indent=2,
                       default=utils.format_datetime), output_file)
Example #4
def run(options):
  # Load the committee metadata from the congress-legislators repository and make a
  # mapping from thomas_id and house_id to the committee dict. For each committee,
  # replace the subcommittees list with a dict from thomas_id to the subcommittee.
  utils.require_congress_legislators_repo()
  committees = { }
  for c in utils.yaml_load("congress-legislators/committees-current.yaml"):
    committees[c["thomas_id"]] = c
    if "house_committee_id" in c: committees[c["house_committee_id"] + "00"] = c
    c["subcommittees"] = dict((s["thomas_id"], s) for s in c.get("subcommittees", []))

  for chamber in ("house", "senate"):
    # Load any existing meetings file so we can recycle GUIDs generated for Senate meetings.
    existing_meetings = []
    output_file = utils.data_dir() + "/committee_meetings_%s.json" % chamber
    if os.path.exists(output_file):
      existing_meetings = json.load(open(output_file))

    # Scrape for meeting info.
    if chamber == "senate":
      meetings = fetch_senate_committee_meetings(existing_meetings, committees, options)
    else:
      meetings = fetch_house_committee_meetings(existing_meetings, committees, options)

    # Write out.
    utils.write(json.dumps(meetings, sort_keys=True, indent=2, default=utils.format_datetime),
      output_file)
Example #5
        def _gather_included_roles_recursive(tasks):
            if not tasks:
                return []

            included = []
            for task in tasks:
                # reset each iteration so a stale include path is not reused
                include_file = None
                if "include_role" in task:
                    included.append(task.get("include_role")['name'])
                elif "include" in task:
                    include_file = task.get("include")
                elif "include_tasks" in task:
                    include_file = task.get("include_tasks")
                elif "import_tasks" in task:
                    include_file = task.get("import_tasks")
                elif "block" in task:
                    included.extend(
                        _gather_included_roles_recursive(task['block']))
                if include_file:
                    # TODO: check playbooks dir, and role dir
                    include_path = os.path.join(
                        os.path.dirname(self.paths["tasks"]), include_file)
                    if not os.path.exists(include_path):
                        include_path = os.path.join(
                            os.path.dirname(self.paths["role"]), include_file)
                    if not os.path.exists(include_path):
                        continue

                    # load into a new name so the list being iterated
                    # over is left untouched
                    sub_tasks = utils.yaml_load(include_path)
                    included.extend(
                        _gather_included_roles_recursive(sub_tasks))
            return included
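A hypothetical task list, in the form utils.yaml_load would return it, that exercises each branch of the recursive walk:

# Hypothetical input for _gather_included_roles_recursive: a direct
# include_role, a nested block, and an include_tasks file whose tasks
# would be loaded from disk and walked recursively.
tasks = [
    {"include_role": {"name": "firewall"}},
    {"block": [{"include_role": {"name": "logging"}}]},
    {"include_tasks": "extra.yml"},
]
# _gather_included_roles_recursive(tasks) would return ["firewall",
# "logging"] plus any roles included from extra.yml.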
Example #6
    def init(self):
        # load record
        if not os.path.exists('.cache'):
            os.mkdir('.cache')
        if os.path.exists('.cache/record.yaml'):
            record = utils.yaml_load('.cache/record.yaml')
            logger.info('record found, load record')
        else:
            record = {'hash': []}
            utils.yaml_dump(record, '.cache/record.yaml')
            logger.info('no record found, create new record')
        cache_flag = False
        # get file list
        file_list = self._get_file_list()
        for path, type_ in file_list:
            fhash = utils.file_md5(path)
            assert len(fhash) > 0
            if fhash not in record['hash']:
                # add new data to cache
                cache_flag = True
                record['hash'].append(fhash)
                logger.info('New file {0} {1} found, add to cache'.format(
                    path, type_))
                prep_sym(path, type_)
                logger.info('{0} {1} add to cache success'.format(path, type_))
                utils.yaml_dump(record, '.cache/record.yaml')
        if cache_flag:
            logger.info('Cache data update success')
            prep_wm()
            prep_vec()
        else:
            logger.info('Data up to date, use cache data')
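The utils.file_md5 helper used above is not shown in the snippet; a plausible sketch (the name and behavior are inferred from the call site):

# A sketch of the assumed utils.file_md5 helper: hash the file in
# chunks so large data files do not have to fit in memory.
import hashlib

def file_md5(path):
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest()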
Example #7
def run():

	# Use the House History Website's Women in Congress search results to get a list of IDs.
	# Because this requires a POST, our utils.download() function won't work.
	querystring = b"Command=Next&Term=Search&TermType=Last&ShowNonMember=true&ShowNonMember=false&Office=&Leadership=&State=&Party=&ContinentalCongress=false&BlackAmericansInCongress=false&WomenInCongress=true&WomenInCongress=false&CongressNumber=65&CongressNumber=66&CongressNumber=67&CongressNumber=68&CongressNumber=69&CongressNumber=70&CongressNumber=71&CongressNumber=72&CongressNumber=73&CongressNumber=74&CongressNumber=75&CongressNumber=76&CongressNumber=77&CongressNumber=78&CongressNumber=79&CongressNumber=80&CongressNumber=81&CongressNumber=82&CongressNumber=83&CongressNumber=84&CongressNumber=85&CongressNumber=86&CongressNumber=87&CongressNumber=88&CongressNumber=89&CongressNumber=90&CongressNumber=91&CongressNumber=92&CongressNumber=93&CongressNumber=94&CongressNumber=95&CongressNumber=96&CongressNumber=97&CongressNumber=98&CongressNumber=99&CongressNumber=100&CongressNumber=101&CongressNumber=102&CongressNumber=103&CongressNumber=104&CongressNumber=105&CongressNumber=106&CongressNumber=107&CongressNumber=108&CongressNumber=109&CongressNumber=110&CongressNumber=111&CongressNumber=112&CongressNumber=113&CurrentPage=__PAGE__&SortOrder=LastName&ResultType=Grid&PreviousSearch=Search%2CLast%2C%2C%2C%2C%2CFalse%2CFalse%2CTrue%2C65%2C66%2C67%2C68%2C69%2C70%2C71%2C72%2C73%2C74%2C75%2C76%2C77%2C78%2C79%2C80%2C81%2C82%2C83%2C84%2C85%2C86%2C87%2C88%2C89%2C90%2C91%2C92%2C93%2C94%2C95%2C96%2C97%2C98%2C99%2C100%2C101%2C102%2C103%2C104%2C105%2C106%2C107%2C108%2C109%2C110%2C111%2C112%2C113%2CLastName&X-Requested-With=XMLHttpRequest"
	women_house_history_ids = set()
	for pagenum in range(0, 25+1):
		body = urllib.request.urlopen(
			"http://history.house.gov/People/Search?Length=6",
			querystring.replace(b"__PAGE__", str(pagenum).encode("ascii"))
			).read().decode("utf8")
		for match in re.findall(r"/People/Detail/(\d+)\?ret=True", body):
			women_house_history_ids.add(int(match))

	# Now check and update the gender of all legislators.
	missing_ids = set()
	for fn in ("../legislators-current.yaml", "../legislators-historical.yaml"):
		legislators = yaml_load(fn)
		for p in legislators:
			house_history_id = p.get("id", {}).get("house_history")

			if not house_history_id:
				missing_ids.add(p.get("id", {}).get("bioguide"))
				continue

			p.setdefault("bio", {})["gender"] = "F" if house_history_id in women_house_history_ids else "M"

			if house_history_id in women_house_history_ids:
				women_house_history_ids.remove(house_history_id)

		yaml_dump(legislators, fn)

	print("%d women in Congress were not found in our files." % len(women_house_history_ids))
	print("%d legislators are missing house_history IDs:" % len(missing_ids))
Example #8
    def gather_meta(self):
        """
        Return the meta file.
        """
        if not os.path.exists(self.paths["meta"]):
            self.dependencies = []
            return ""

        meta_dict = utils.yaml_load(self.paths["meta"])

        # gather the dependencies
        if meta_dict and "dependencies" in meta_dict:
            # create a simple list of each role that is a dependency
            dep_list = []

            for dependency in meta_dict["dependencies"]:
                if type(dependency) is dict:
                    dep_list.append(dependency["role"])
                else:
                    dep_list.append(dependency)

            # unique set of dependencies
            meta_dict["dependencies"] = list(set(dep_list))

            self.dependencies = meta_dict["dependencies"]
        else:
            self.dependencies = []

        return utils.file_to_string(self.paths["meta"])
Example #9
def get_public_dumps_info():
    # Read public dumps YAML configuration file
    config = yaml_load('../data/public_dumps.yaml')
    dir_save = config['save_directory']
    dumps = config['dumps']

    # Iterate through the dumps
    result = []
    for dump_name in dumps:
        # Find dump file with the latest timestamp (inherited from prod data)
        filenames = [
            n for n in os.listdir(dir_save)
            if n.startswith(dump_name + '_') and n.endswith('.csv')
        ]
        if len(filenames) == 0:
            print('[WARNING] Could not find dump file for dump "%s"' %
                  (dump_name))
            continue
        filename = sorted(filenames, reverse=True)[0]

        # Append dump info to results
        result.append({
            'name': dump_name,
            'notebook_url': dumps[dump_name]['notebook_url'],
            'query': dumps[dump_name]['query'].strip(),
            'url': 'https://verejne.digital/data/%s' % (filename)
        })
    return result
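Examples #1, #9 and #18 all read the same public_dumps.yaml configuration. Its shape, inferred from the key lookups above (all values here are hypothetical):

# Hypothetical public_dumps.yaml, as yaml_load would return it:
config = {
    'save_directory': '/data/dumps/',
    'dumps': {
        'entities': {
            'query': 'SELECT id, name FROM entities;',
            'notebook_url': 'https://verejne.digital/...',  # placeholder
        },
    },
}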
Example #10
    def test_hearing(self):
        committees = {}
        for c in utils.yaml_load("test/fixtures/committees-current.yaml"):
            committees[c["thomas_id"]] = c
            if "house_committee_id" in c:
                committees[c["house_committee_id"] + "00"] = c
            c["subcommittees"] = dict(
                (s["thomas_id"], s) for s in c.get("subcommittees", []))

        hearing_xml = "test/fixtures/hearings/sample_hearing.xml"
        file_xml = open(hearing_xml, "r")
        dom = lxml.etree.parse(file_xml)
        test_output = committee_meetings.parse_house_committee_meeting(
            '102252', dom, [], committees, {"debug": False}, None,
            ["BILLS-113hr4435ih.pdf", "BILLS-113hr4435ih.xml"])

        #          event_id, dom, existing_meetings, committees, options, witnesses, uploaded_documents
        self.assertEqual(test_output['bill_ids'], ['hr4435-113'])
        self.assertEqual(test_output['chamber'], 'house')
        self.assertEqual(test_output['committee'], 'HSRU')
        self.assertEqual(test_output['congress'], 113)
        self.assertEqual(test_output['house_meeting_type'], 'HMTG')
        self.assertEqual(test_output['meeting_documents'][0]['description'],
                         'H.R. 4435 (as introduced)')
        self.assertEqual(test_output['meeting_documents'][0]['bill_id'],
                         'hr4435-113')
        self.assertEqual(test_output['meeting_documents'][0]['version_code'],
                         'ih')
        self.assertEqual(test_output['meeting_documents'][0]['type'], 'BR')
        self.assertEqual(test_output['meeting_documents'][0]['urls'], [
            {
                'url':
                'http://beta.congress.gov/113/bills/hr4435/BILLS-113hr4435ih.pdf',
                'file_found': True
            },
            {
                'url':
                'http://beta.congress.gov/113/bills/hr4435/BILLS-113hr4435ih.xml',
                'file_found': True
            },
        ])
        self.assertEqual(test_output['occurs_at'], '2014-05-19T17:00:00')
        self.assertEqual(test_output['room'], 'CAPITOL H-313')
        self.assertEqual(test_output['subcommittee'], None)
        self.assertEqual(
            test_output['topic'],
            u'H.R. 4435\u2014National Defense Authorization Act for Fiscal Year 2015 [General Debate]; H.R. 4660\u2014Commerce, Justice, Science, and Related Agencies Appropriations Act, 2015'
        )
        self.assertEqual(
            test_output['url'],
            'http://docs.house.gov/Committee/Calendar/ByEvent.aspx?EventID=102252'
        )
Example #11
    def read_and_valid_meta(self, role):
        """
        Read the meta files and return whether or not the meta file being read
        is valid.
        """
        if os.path.exists(self.paths["meta"]):
            self.meta_dict = utils.yaml_load(self.paths["meta"])
            if os.path.exists(self.paths["ansigenome"]):
                self.meta_dict['ansigenome_info'] = utils.yaml_load(
                    self.paths["ansigenome"])['ansigenome_info']
        else:
            self.report["state"]["missing_meta_role"] += 1
            self.report["roles"][role]["state"] = "missing_meta"

            return False

        is_valid = True

        # utils.yaml_load returns False when the file is invalid
        if isinstance(self.meta_dict, bool):
            is_valid = False
            sys.exit(1)

        return is_valid
Example #12
def run():

    # Use the House History Website's Women in Congress search results to get a list of IDs.
    # Because this requires a POST, our utils.download() function won't work.
    querystring = b"Command=Next&Term=Search&SearchIn=LastName&ShowNonMember=true&ShowNonMember=false&Office=&Leadership=&State=&Party=&ContinentalCongress=false&BlackAmericansInCongress=false&WomenInCongress=true&WomenInCongress=false&HispanicAmericansInCongress=false&CongressNumber=65&CongressNumber=66&CongressNumber=67&CongressNumber=68&CongressNumber=69&CongressNumber=70&CongressNumber=71&CongressNumber=72&CongressNumber=73&CongressNumber=74&CongressNumber=75&CongressNumber=76&CongressNumber=77&CongressNumber=78&CongressNumber=79&CongressNumber=80&CongressNumber=81&CongressNumber=82&CongressNumber=83&CongressNumber=84&CongressNumber=85&CongressNumber=86&CongressNumber=87&CongressNumber=88&CongressNumber=89&CongressNumber=90&CongressNumber=91&CongressNumber=92&CongressNumber=93&CongressNumber=94&CongressNumber=95&CongressNumber=96&CongressNumber=97&CongressNumber=98&CongressNumber=99&CongressNumber=100&CongressNumber=101&CongressNumber=102&CongressNumber=103&CongressNumber=104&CongressNumber=105&CongressNumber=106&CongressNumber=107&CongressNumber=108&CongressNumber=109&CongressNumber=110&CongressNumber=111&CongressNumber=112&CongressNumber=113&CongressNumber=114&CurrentPage=__PAGE__&SortOrder=LastName&ResultType=Grid&PreviousSearch=Search%2CLastName%2C%2C%2C%2C%2CFalse%2CFalse%2CTrue%2C65%2C66%2C67%2C68%2C69%2C70%2C71%2C72%2C73%2C74%2C75%2C76%2C77%2C78%2C79%2C80%2C81%2C82%2C83%2C84%2C85%2C86%2C87%2C88%2C89%2C90%2C91%2C92%2C93%2C94%2C95%2C96%2C97%2C98%2C99%2C100%2C101%2C102%2C103%2C104%2C105%2C106%2C107%2C108%2C109%2C110%2C111%2C112%2C113%2C114%2CLastName&X-Requested-With=XMLHttpRequest"
    women_house_history_ids = set()
    for pagenum in range(0, 30 + 1):
        body = urllib.request.urlopen(
            "http://history.house.gov/People/Search?Length=6",
            querystring.replace(
                b"__PAGE__",
                str(pagenum).encode("ascii"))).read().decode("utf8")
        for match in re.findall(r"/People/Detail/(\d+)\?ret=True", body):
            women_house_history_ids.add(int(match))

    # Now check and update the gender of all legislators.
    matched_women_house_history_ids = set()
    missing_ids = set()
    for fn in ("../legislators-current.yaml",
               "../legislators-historical.yaml"):
        legislators = yaml_load(fn)
        for p in legislators:
            house_history_id = p.get("id", {}).get("house_history")

            if not house_history_id:
                # We have all of the women, so anyone left must be a man.
                p.setdefault("bio", {})["gender"] = "M"
                missing_ids.add(p.get("id", {}).get("bioguide"))
                continue

            p.setdefault(
                "bio", {}
            )["gender"] = "F" if house_history_id in women_house_history_ids else "M"

            if house_history_id in women_house_history_ids:
                matched_women_house_history_ids.add(house_history_id)

        yaml_dump(legislators, fn)

    print("%d women in Congress reported by the House History website" %
          len(women_house_history_ids))
    print("%d women in Congress were not found in our files." %
          len(women_house_history_ids - matched_women_house_history_ids))
    print(
        " ", " ".join((str(x) for x in (women_house_history_ids -
                                        matched_women_house_history_ids))))
    print("%d legislators are missing house_history IDs, set to male." %
          len(missing_ids))
Example #13
    def export_roles(self):
        """
        Export the roles to one of the export types.
        """
        # prepare the report by removing unnecessary fields
        del self.report["state"]
        del self.report["stats"]
        for role in self.report["roles"]:
            del self.report["roles"][role]["state"]

            defaults_path = os.path.join(self.roles_path, role, "defaults",
                                         "main.yml")
            if os.path.exists(defaults_path):
                defaults = self.report["roles"][role]["defaults"]
                self.report["roles"][role]["defaults"] = \
                    utils.yaml_load("", defaults)

        Export(self.roles_path, self.report, self.config, self.options)
Example #14
    def export_roles(self):
        """
        Export the roles to one of the export types.
        """
        # prepare the report by removing unnecessary fields
        del self.report["state"]
        del self.report["stats"]
        for role in self.report["roles"]:
            del self.report["roles"][role]["state"]

            defaults_path = os.path.join(self.roles_path, role,
                                         "defaults", "main.yml")
            if os.path.exists(defaults_path):
                defaults = self.report["roles"][role]["defaults"]
                self.report["roles"][role]["defaults"] = \
                    utils.yaml_load("", defaults)

        Export(self.roles_path, self.report, self.config, self.options)
Example #15
    def valid_meta(self, role):
        """
        Return whether or not the meta file being read is valid.
        """
        if os.path.exists(self.paths["meta"]):
            self.meta_dict = utils.yaml_load(self.paths["meta"])
        else:
            self.report["state"]["missing_meta_role"] += 1
            self.report["roles"][role]["state"] = "missing_meta"

            return False

        is_valid = True

        # utils.yaml_load returns False when the file is invalid
        if isinstance(self.meta_dict, bool):
            is_valid = False
            sys.exit(1)

        return is_valid
Example #16
def initialise_app(serving_directory):
    """ Procedure for initialising the app with precomputed values that
        are shared across different requests. The registry property is
        intended for this purpose, in order to avoid global variables.
    """

    # database
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    app.registry['db'] = db

    # data_sources
    data_sources = yaml_load('datasources.yaml')
    app.registry['data_sources'] = data_sources

    # entities
    entities = Entities()
    entities.loadFromDirectory(serving_directory)
    app.registry['entities'] = entities
Example #17
    def test_hearing(self):
        committees = {}
        for c in utils.yaml_load("test/fixtures/committees-current.yaml"):
            committees[c["thomas_id"]] = c
            if "house_committee_id" in c:
                committees[c["house_committee_id"] + "00"] = c
            c["subcommittees"] = dict((s["thomas_id"], s) for s in c.get("subcommittees", []))

        hearing_xml = "test/fixtures/hearings/sample_hearing.xml"
        file_xml = open(hearing_xml, "r")
        dom = lxml.etree.parse(file_xml)
        test_output = committee_meetings.parse_house_committee_meeting(
            "102252", dom, [], committees, {"debug": False}, None, ["BILLS-113hr4435ih.pdf", "BILLS-113hr4435ih.xml"]
        )

        #          event_id, dom, existing_meetings, committees, options, witnesses, uploaded_documents
        self.assertEqual(test_output["bill_ids"], ["hr4435-113"])
        self.assertEqual(test_output["chamber"], "house")
        self.assertEqual(test_output["committee"], "HSRU")
        self.assertEqual(test_output["congress"], 113)
        self.assertEqual(test_output["house_meeting_type"], "HMTG")
        self.assertEqual(test_output["meeting_documents"][0]["description"], "H.R. 4435 (as introduced)")
        self.assertEqual(test_output["meeting_documents"][0]["bill_id"], "hr4435-113")
        self.assertEqual(test_output["meeting_documents"][0]["version_code"], "ih")
        self.assertEqual(test_output["meeting_documents"][0]["type"], "BR")
        self.assertEqual(
            test_output["meeting_documents"][0]["urls"],
            [
                {"url": "http://beta.congress.gov/113/bills/hr4435/BILLS-113hr4435ih.pdf", "file_found": True},
                {"url": "http://beta.congress.gov/113/bills/hr4435/BILLS-113hr4435ih.xml", "file_found": True},
            ],
        )
        self.assertEqual(test_output["occurs_at"], "2014-05-19T17:00:00")
        self.assertEqual(test_output["room"], "CAPITOL H-313")
        self.assertEqual(test_output["subcommittee"], None)
        self.assertEqual(
            test_output["topic"],
            u"H.R. 4435\u2014National Defense Authorization Act for Fiscal Year 2015 [General Debate]; H.R. 4660\u2014Commerce, Justice, Science, and Related Agencies Appropriations Act, 2015",
        )
        self.assertEqual(test_output["url"], "http://docs.house.gov/Committee/Calendar/ByEvent.aspx?EventID=102252")
Example #18
def generate_public_data_dumps(limit=None, verbose=False):
    """ Generates the public data dump files from the latest production data """

    # Connect to the latest production data schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path="' + schema + '";')
    timestamp = schema[schema.rfind('_') + 1:]
    if verbose:
        print('[OK] Dumping from schema "%s"...' % (schema))
    if limit is not None:
        print('[WARNING] Dumping with row limit %d!' % (limit))

    # Read YAML configuration file
    config = yaml_load('public_dumps.yaml')
    dir_save = config['save_directory']
    dumps = config['dumps']

    # Process all dumps
    for dump_name in dumps:
        # Construct dump query
        q = dumps[dump_name]['query']
        q = q.rstrip().rstrip(';')  # possibly remove ; ending
        if limit is not None:
            q += ' LIMIT %d' % (limit)

        # Dump to CSV without timestamp
        path_output = '%s%s.csv' % (dir_save, dump_name)
        db.dump_to_CSV(q, path_output)
        if verbose:
            print('[OK] Created dump "%s" in %s' % (dump_name, path_output))

        # Dump to CSV with timestamp
        path_output = '%s%s_%s.csv' % (dir_save, dump_name, timestamp)
        db.dump_to_CSV(q, path_output)
        if verbose:
            print('[OK] Created dump "%s" in %s' % (dump_name, path_output))

    # Close database connection
    db.close()
Example #19
def run(options):
    # can limit it to one chamber
    chamber = options.get("chamber", None)
    if chamber and (chamber in ("house", "senate")):
        chambers = (chamber,)  # one-element tuple; bare parentheses would leave a string
    else:
        chambers = ("house", "senate")

    load_by = options.get("load_by", None)

    # Load the committee metadata from the congress-legislators repository and make a
    # mapping from thomas_id and house_id to the committee dict. For each committee,
    # replace the subcommittees list with a dict from thomas_id to the subcommittee.
    utils.require_congress_legislators_repo()
    committees = {}
    for c in utils.yaml_load("congress-legislators/committees-current.yaml"):
        committees[c["thomas_id"]] = c
        if "house_committee_id" in c:
            committees[c["house_committee_id"] + "00"] = c
        c["subcommittees"] = dict(
            (s["thomas_id"], s) for s in c.get("subcommittees", []))

    if "senate" in chambers:
        print("Fetching Senate meetings...")
        meetings = fetch_senate_committee_meetings(committees, options)
        print("Writing Senate meeting data to disk.")
        utils.write_json(meetings, output_for("senate"))

    if "house" in chambers:
        if load_by == None:
            print("Fetching House meetings...")
            meetings = fetch_house_committee_meetings(committees, options)
        else:
            print("Fetching House meetings by event_id...")
            meetings = fetch_meeting_from_event_id(committees, options,
                                                   load_by)

        print("Writing House meeting data to disk.")
        utils.write_json(meetings, output_for("house"))
Example #20
def get_numpy_word_embed():
    row = 0

    config = yaml_load("config.yaml")
    model_cfg = config.get("model", {})
    data_cfg = config.get("data", {})
    glove_path = model_cfg["glove_path"]
    glove_length = model_cfg["glove_length"]
    vocab_file = data_cfg["vocab_file"]

    words_embed = {}
    with open(glove_path, mode='r') as f:
        lines = f.readlines()
        for line in lines:
            line_list = line.split()
            word = line_list[0]
            embed = line_list[1:]
            embed = [float(num) for num in embed]
            words_embed[word] = embed
            if row > 20000:
                break
            row += 1

    word2idx = {}
    with open(vocab_file, 'rb') as handle:
        word2idx = pickle.load(handle)
    idx2word = {idx: w for w, idx in word2idx.items()}
    id2emb = {}
    id2emb[0] = [0.0] * glove_length
    for (_, idx) in word2idx.items():
        if idx2word[idx] in words_embed:
            id2emb[idx] = words_embed[idx2word[idx]]
        else:
            id2emb[idx] = [0.0] * glove_length
    data = [id2emb[idx] for idx in range(len(word2idx) + 1)]

    return data
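One plausible way the returned matrix gets consumed downstream (not shown in this snippet; the call below is a sketch):

# Sketch: feed the GloVe matrix into a trainable embedding layer.
import torch
import torch.nn as nn

weights = torch.tensor(get_numpy_word_embed(), dtype=torch.float)
embedding = nn.Embedding.from_pretrained(weights, freeze=False)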
Example #21
def run(options):
    # can limit it to one chamber
    chamber = options.get("chamber", None)
    if chamber and (chamber in ("house", "senate")):
        chambers = (chamber,)  # one-element tuple so the "in" checks below test membership
    else:
        chambers = ("house", "senate")

    load_by = options.get("load_by", None)

    # Load the committee metadata from the congress-legislators repository and make a
    # mapping from thomas_id and house_id to the committee dict. For each committee,
    # replace the subcommittees list with a dict from thomas_id to the subcommittee.
    utils.require_congress_legislators_repo()
    committees = {}
    for c in utils.yaml_load("congress-legislators/committees-current.yaml"):
        committees[c["thomas_id"]] = c
        if "house_committee_id" in c:
            committees[c["house_committee_id"] + "00"] = c
        c["subcommittees"] = dict((s["thomas_id"], s) for s in c.get("subcommittees", []))

    if "senate" in chambers:
        print "Fetching Senate meetings..."
        meetings = fetch_senate_committee_meetings(committees, options)
        print "Writing Senate meeting data to disk."
        utils.write_json(meetings, output_for("senate"))

    if "house" in chambers:
        if load_by == None:
            print "Fetching House meetings..."
            meetings = fetch_house_committee_meetings(committees, options)
        else:
            print "Fetching House meetings by event_id..."
            meetings = fetch_meeting_from_event_id(committees, options, load_by)

        print "Writing House meeting data to disk."
        utils.write_json(meetings, output_for("house"))
Example #22
        passages_mask.append(
            np.concatenate((np.ones(len(passage_ids)),
                            np.zeros(max_passage_length - len(passage_ids)))))
        questions_mask.append(
            np.concatenate(
                (np.ones(len(question_ids)),
                 np.zeros(max_question_length - len(question_ids)))))

        answers.append(answer)

    return passages_ids, questions_ids, passages_length, questions_length, passages_mask, questions_mask, answers


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = yaml_load("config.yaml")
model_cfg = config.get("model", {})
preprocess_cfg = config.get("preprocess", {})
data_cfg = config.get("data", {})
dev_cfg = config.get("dev", {})
train_cfg = config.get("train", {})
eval_cfg = config.get("eval", {})

vocab = {}
with open(data_cfg['vocab_file'], 'rb') as handle:
    vocab = pickle.load(handle)
handle.close()

dev_data = get_dev_data(
    data_cfg['data_path'], data_cfg['dev_data'], vocab,
    [preprocess_cfg['pa_max_sent_len'], preprocess_cfg['qu_max_sent_len']])
Example #23
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as f_w:
            for key in sorted(results.keys()):
                f_w.write("{} = {}\n".format(key, str(results[key])))


if __name__ == "__main__":
    cli_parser = argparse.ArgumentParser()

    cli_parser.add_argument("--config_file", type=str, required=True)
    cli_parser.add_argument("--do_nni", action="store_true")
    cli_parser.add_argument("--do_reinit", action="store_true")

    cli_args = cli_parser.parse_args()

    logger.setLevel(logging.INFO)

    with open(cli_args.config_file) as f:
        train_conf = yaml_load(f)

    if cli_args.do_nni:
        nni_params = nni.get_next_parameter()
        tuned_params = make_flat_dict(nni_params)
        train_conf = update_nested(train_conf, tuned_params)

    train_conf.update(vars(cli_args))

    main(train_conf)
Example #24
    def get_inputs(self, yaml_path):
        """Read yaml input file"""
        self.inputs = utils.yaml_load(yaml_path)
        self.set_name(yaml_path)
        self.init_search_keywords()
Example #25
    def get_conf(self):
        """Read yaml config file"""
        return utils.yaml_load(self.conf_file)
Example #26
# Just loads and saves each .yaml file to normalize serialization syntax.
#
# python lint.py
# ... will lint every .yaml file in the data directory.
#
# python lint.py file1.yaml file2.yaml ...
# ... will lint the specified files.

import glob, sys

from utils import yaml_load, yaml_dump, data_dir

for fn in glob.glob(data_dir() + "/*.yaml") if len(sys.argv) == 1 else sys.argv[1:]:
	print fn + "..."
	data = yaml_load(fn, use_cache=False)
	yaml_dump(data, fn)
	
Example #27
    def __init__(self, config_file):
        self.config = yaml_load(config_file)
        print("Config file loaded successfully: {}".format(config_file))
        terminal_break()
        pprint(self.config)
        terminal_break()
Example #28
# Converts the specified YAML file to an equivalent-ish CSV file
# (on standard output).
#
# python export_csv.py ../legislators-current.yaml

import sys, csv
from collections import OrderedDict

from utils import yaml_load

if len(sys.argv) < 2:
	print "Usage: python export_csv.py ../legislators-current.yaml > legislators-current.csv"
	sys.exit(0)

data = yaml_load(sys.argv[1])

###############################################

def flatten_object(obj, path, ret):
	"""Takes an object obj and flattens it into a dictionary ret.

	For instance { "x": { "y": 123 } } is turned into { "x__y": 123 }.
	"""
	for k, v in obj.items():
		if isinstance(v, dict):
			flatten_object(v, (path + "__" if path else "") + k + "__", ret)
		elif isinstance(v, list):
			# don't peek inside lists
			pass
		else:
			ret[path + k] = v
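A quick illustration of what flatten_object produces (hypothetical record; note this Python 2 version mutates ret in place rather than returning it, unlike the variants in Examples #31 and #34):

# Hypothetical record flattened in place:
record = {"id": {"bioguide": "A000055"}, "name": {"first": "Jo", "last": "Doe"}}
ret = {}
flatten_object(record, "", ret)
# ret == {"id__bioguide": "A000055", "name__first": "Jo", "name__last": "Doe"}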
Example #29
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    parser = argparse.ArgumentParser(description='eval or test')
    parser.add_argument("--model_path",
                        default=" ",
                        type=str,
                        help="the model path")
    parser.add_argument('--eval', action='store_true', help="eval")

    # get command line parameter
    args = parser.parse_args()
    model_path = args.model_path
    mode = args.eval

    # get config from config.yaml
    config = yaml_load("./config.yaml")
    base_cfg = config.get("base", {})
    model_cfg = config.get("model", {})

    init_input = model_cfg["init_input"]

    # get data path
    eval_data_path = base_cfg.get("eval_data")
    test_data_path = base_cfg.get("test_data")

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # load model
    check_point = torch.load(model_path)
    model = eval(check_point["model_name"])(*init_input, True).to(device)
Example #30
def main(args_dict):
    test_mode = not args_dict['disable_test_mode']
    if test_mode:
        print "======================="
        print "=======TEST MODE======="
        print "======================="

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    # Write output into prod_schema_name
    prod_schema_name = "prod_" + timestamp
    print "prod_schema_name", prod_schema_name

    # Create database connections:
    db_source = DatabaseConnection(path_config='db_config_update_source.yaml')
    db_address_cache = DatabaseConnection(
        path_config='db_config_update_source.yaml',
        search_path='address_cache')
    db_prod = DatabaseConnection(path_config='db_config_update_source.yaml')
    CreateAndSetProdSchema(db_prod, prod_schema_name)

    # Initialize geocoder
    geocoder = geocoder_lib.Geocoder(db_address_cache, db_prod, test_mode)
    # Initialize entity lookup
    entities_lookup = entities.Entities(db_prod)

    # The file prod_tables.yaml defines the SQL selects used to read the
    # source data and describes additional tables to be created.
    config = utils.yaml_load('prod_tables.yaml')
    # This is where all the population happens!!!
    # Go through all the specified data sources and process them, adding data
    # as needed. We process them in lexicographic order!
    for key in sorted(config.keys()):
        config_per_source = config[key]
        print "Working on source:", key
        ProcessSource(db_source, db_prod, geocoder, entities_lookup,
                      config_per_source, test_mode)
        geocoder.PrintStats()
        entities_lookup.print_statistics()

    # Process yaml-free sources:
    process_source_rpvs(db_source, db_prod, geocoder, entities_lookup,
                        test_mode)
    db_source.close()

    # Run post processing.
    # TODO: For now post processing requires access to the profil
    # source schema. Remove this when fixed.
    schema_profil = db_prod.get_latest_schema('source_internal_profil_')
    db_prod.execute('SET search_path="' + prod_schema_name + '", "' +
                    schema_profil + '", public;')
    post_process.do_post_processing(db_prod, test_mode)

    # Create materialized view for entity search after all entities
    # have been created.
    db_prod.execute("""
        CREATE MATERIALIZED VIEW entities_search AS
          SELECT
            id,
            to_tsvector('simple', unaccent(name)) as search_vector
          FROM entities;
          CREATE INDEX ON entities_search(search_vector);
          CREATE INDEX ON entities_search USING gin(search_vector);
    """)

    # Grant apps read-only access to the newly created schema and tables within
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'data')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'verejne')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'kataster')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'prepojenia')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'obstaravania')

    # Commit database changes and close database connections
    db_address_cache.commit()
    db_address_cache.close()
    if test_mode:
        db_prod.conn.rollback()
        print('[OK] Rolled back database changes (test mode)')
    else:
        db_prod.commit()
    db_prod.close()
Example #31
def run():

	if len(sys.argv) < 2:
		print("Usage: python export_csv.py ../legislators-current.yaml > legislators-current.csv")
		sys.exit(0)

	data = yaml_load(sys.argv[1])

	###############################################

	def flatten_object(obj, path, ret):
		"""Takes an object obj and flattens it into a dictionary ret.

		For instance { "x": { "y": 123 } } is turned into { "x__y": 123 }.
		"""
		for k, v in list(obj.items()):
			if isinstance(v, dict):
				flatten_object(v, (path + "__" if path else "") + k + "__", ret)
			elif isinstance(v, list):
				# don't peek inside lists
				pass
			else:
				ret[path + k] = v
		return ret

	# Scan through the records recursively to get a list of column names.
	# Attempt to preserve the field order as found in the YAML file. Since
	# any field may be absent, no one record can provide the complete field
	# order. Build the best field order by looking at what each field tends
	# to be preceded by.
	fields = set()
	preceding_keys = dict() # maps keys to a dict of *previous* keys and how often they occurred
	for record in data:
		prev_key = None
		for key in flatten_object(record, "", OrderedDict()):
			fields.add(key)

			preceding_keys.setdefault(key, {}).setdefault(prev_key, 0)
			preceding_keys[key][prev_key] += 1
			prev_key = key

	# Convert to relative frequencies.
	for k, v in list(preceding_keys.items()):
		s = float(sum(v.values()))
		for k2 in v:
			v[k2] /= s

	# Get a good order for the fields. Greedily add keys from left to right
	# maximizing the conditional probability that the preceding key would
	# precede the key on the right.
	field_order = [None]
	prev_key = None
	while len(field_order) < len(fields):
		# Which key is such that prev_key is its most likely predecessor?
		# We do it this way (and not what is prev_key's most likely follower)
		# because we should be using a probability (of sorts) that is
		# conditional on the key being present. Otherwise we lose infrequent
		# keys.
		next_key = max([f for f in fields if f not in field_order], key =
			lambda k :
				max(preceding_keys[k].get(pk, 0) for pk in field_order))
		field_order.append(next_key)
		prev_key = next_key
	field_order = field_order[1:] # remove the None at the start

	# Write CSV header.
	w = csv.writer(sys.stdout)
	w.writerow(field_order)

	# Write the objects.
	for record in data:
		obj = flatten_object(record, "", {})
		w.writerow([
			obj.get(f, "")
			for f in field_order
			])
Example #32
# Converts the specified YAML file to an equivalent-ish CSV file
# (on standard output).
#
# python export_csv.py ../legislators-current.yaml

import sys, csv
from collections import OrderedDict

from utils import yaml_load

if len(sys.argv) < 2:
    print "Usage: python export_csv.py ../legislators-current.yaml > legislators-current.csv"
    sys.exit(0)

data = yaml_load(sys.argv[1])

###############################################


def flatten_object(obj, path, ret):
    """Takes an object obj and flattens it into a dictionary ret.

	For instance { "x": { "y": 123 } } is turned into { "x__y": 123 }.
	"""
    for k, v in obj.items():
        if isinstance(v, dict):
            flatten_object(v, (path + "__" if path else "") + k + "__", ret)
        elif isinstance(v, list):
            # don't peek inside lists
            pass
        else:
            ret[path + k] = v
Example #33
def run():
    for fn in glob.glob(data_dir() + "/*.yaml") if len(sys.argv) == 1 else sys.argv[1:]:
        print(fn + "...")
        data = yaml_load(fn, use_cache=False)
        yaml_dump(data, fn)
Example #34
def run():

    if len(sys.argv) < 2:
        print(
            "Usage: python export_csv.py ../legislators-current.yaml > legislators-current.csv"
        )
        sys.exit(0)

    data = yaml_load(sys.argv[1])

    ###############################################

    def flatten_object(obj, path, ret):
        """Takes an object obj and flattens it into a dictionary ret.

		For instance { "x": { "y": 123 } } is turned into { "x__y": 123 }.
		"""
        for k, v in list(obj.items()):
            if isinstance(v, dict):
                flatten_object(v, (path + "__" if path else "") + k + "__",
                               ret)
            elif isinstance(v, list):
                # don't peek inside lists
                pass
            else:
                ret[path + k] = v
        return ret

    # Scan through the records recursively to get a list of column names.
    # Attempt to preserve the field order as found in the YAML file. Since
    # any field may be absent, no one record can provide the complete field
    # order. Build the best field order by looking at what each field tends
    # to be preceded by.
    fields = set()
    preceding_keys = dict(
    )  # maps keys to a dict of *previous* keys and how often they occurred
    for record in data:
        prev_key = None
        for key in flatten_object(record, "", OrderedDict()):
            fields.add(key)

            preceding_keys.setdefault(key, {}).setdefault(prev_key, 0)
            preceding_keys[key][prev_key] += 1
            prev_key = key

    # Convert to relative frequencies.
    for k, v in list(preceding_keys.items()):
        s = float(sum(v.values()))
        for k2 in v:
            v[k2] /= s

    # Get a good order for the fields. Greedily add keys from left to right
    # maximizing the conditional probability that the preceding key would
    # precede the key on the right.
    field_order = [None]
    prev_key = None
    while len(field_order) < len(fields):
        # Which key is such that prev_key is its most likely predecessor?
        # We do it this way (and not what is prev_key's most likely follower)
        # because we should be using a probability (of sorts) that is
        # conditional on the key being present. Otherwise we lose infrequent
        # keys.
        next_key = max([f for f in fields if f not in field_order],
                       key=lambda k: max(preceding_keys[k].get(pk, 0)
                                         for pk in field_order))
        field_order.append(next_key)
        prev_key = next_key
    field_order = field_order[1:]  # remove the None at the start

    # Write CSV header.
    w = csv.writer(sys.stdout)
    w.writerow(field_order)

    # Write the objects.
    for record in data:
        obj = flatten_object(record, "", {})
        w.writerow([obj.get(f, "") for f in field_order])
Example #35
    def __init__(self, path_config='db_config.yaml', search_path=None):
        config = utils.yaml_load(path_config)
        self.conn = psycopg2.connect(user=config['user'], dbname=config['db'])
        if search_path is not None:
            self.execute('SET search_path = %s', (search_path, ))
Example #36
def run():
    parser = argparse.ArgumentParser(description='handle legis/exec')
    parser.add_argument('destdir')
    parser.add_argument('--inpType')
    args = parser.parse_args()
    if len(sys.argv) < 2:
        print("Usage: python everypolitician.py outputbasename/")
        sys.exit(0)

    # Load current legislators.
    if args.inpType == 'leg':
        data = yaml_load("{0}/legislators-current.yaml".format(govtrackdir))
    else:
        data = yaml_load("{0}/executive.yaml".format(govtrackdir))
    data_social_media = {}
    for legislator in yaml_load("{0}/legislators-social-media.yaml".format(govtrackdir)):
        data_social_media[legislator['id']['bioguide']] = legislator

    # Create output files.
    if args.inpType == 'leg':
        writers = {
            "rep": csv.writer(open(args.destdir + "house.csv", "w")),
            "sen": csv.writer(open(args.destdir + "senate.csv", "w")),
        }
    else:
        writers = {
            "prez": csv.writer(open(args.destdir + "prez.csv", "w")),
            "viceprez": csv.writer(open(args.destdir + "viceprez.csv", "w"))
        }

    for w in writers.values():
        w.writerow([
            "id",
            # "name",
            "postal_code",
            "state",
            # "group",
            "class_district",
            "start_date",
            "end_date",
            "num_terms",
            "party",
            "given_name",
            "middle_name",
            "family_name",
            "suffix",
            # "sort_name",
            # "phone",
            "gender",
            # "birth_date",
            "image",
            # "twitter",
            # "facebook",
            # "instagram",
            # "wikipedia",
            # "website",
            "office_code",
            "office_name"
        ])

    # Write out one row per legislator for their current term.
    for legislator in data:
        genRow(legislator, writers)
Example #37
    def ec2metadata(self):
        if self._instance_cache:
            return self._instance_cache
        output = subprocess.check_output(["ec2metadata"])
        self._instance_cache = yaml_load(output)
        return self._instance_cache
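This works because ec2metadata prints one "key: value" pair per line, which already parses as a YAML mapping; this repo's yaml_load variant evidently accepts the raw output string. A hypothetical illustration:

# Hypothetical ec2metadata output; yaml_load turns it into a dict.
output = b"ami-id: ami-0abcdef\ninstance-type: t3.micro\n"
meta = yaml_load(output)
# meta == {'ami-id': 'ami-0abcdef', 'instance-type': 't3.micro'}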
Example #38
    def init(self, path):
        config = utils.yaml_load(path)
        self.Data = config['Data']
        self.Model = config['Model']
Example #39
def run():
    parser = argparse.ArgumentParser(description='handle legis/exec')
    parser.add_argument('destdir')
    parser.add_argument('--inpType')
    args = parser.parse_args()
    if len(sys.argv) < 2:
        print("Usage: python everypolitician.py outputbasename/")
        sys.exit(0)

    # Load current legislators.
    if args.inpType == 'leg':
        data = yaml_load("{0}/legislators-current.yaml".format(govtrackdir))
    else:
        data = yaml_load("{0}/executive.yaml".format(govtrackdir))
    data_social_media = {}
    for legislator in yaml_load(
            "{0}/legislators-social-media.yaml".format(govtrackdir)):
        data_social_media[legislator['id']['bioguide']] = legislator

    # Create output files.
    if args.inpType == 'leg':
        writers = {
            "rep": csv.writer(open(args.destdir + "house.csv", "w")),
            "sen": csv.writer(open(args.destdir + "senate.csv", "w")),
        }
    else:
        writers = {
            "prez": csv.writer(open(args.destdir + "prez.csv", "w")),
            "viceprez": csv.writer(open(args.destdir + "viceprez.csv", "w"))
        }

    for w in writers.values():
        w.writerow([
            "id",
            # "name",
            "postal_code",
            "state",
            # "group",
            "class_district",
            "start_date",
            "end_date",
            "num_terms",
            "party",
            "given_name",
            "middle_name",
            "family_name",
            "suffix",
            # "sort_name",
            # "phone",
            "gender",
            # "birth_date",
            "image",
            # "twitter",
            # "facebook",
            # "instagram",
            # "wikipedia",
            # "website",
            "office_code",
            "office_name"
        ])

    # Write out one row per legislator for their current term.
    for legislator in data:
        genRow(legislator, writers)
Example #40
def run():
	if len(sys.argv) < 2:
		print("Usage: python everypolitician.py outputbasename/")
		sys.exit(0)

	# Load current legislators.
	data = yaml_load("../legislators-current.yaml")
	data_social_media = { }
	for legislator in yaml_load("../legislators-social-media.yaml"):
		data_social_media[legislator['id']['bioguide']] = legislator

	# Create output files.
	writers = {
		"rep": csv.writer(open(sys.argv[1] + "house.csv", "w")),
		"sen": csv.writer(open(sys.argv[1] + "senate.csv", "w")),
	}
	for w in writers.values():
		w.writerow([
			"id",
			"name",
			"area",
			"group",
			"term",
			"start_date",
			"end_date",
			"given_name",
			"family_name",
			"honorific_suffix",
			"sort_name",
			"phone",
			"gender",
			"birth_date",
			"image",
			"twitter",
			"facebook",
			"instagram",
			"wikipedia",
			"website",
		])

	# Write out one row per legislator for their current term.
	for legislator in data:
		term = legislator['terms'][-1]

		# TODO: "If someone changed party/faction affilation in the middle of the term, you should include two entries, with the relevant start/end dates set."

		w = writers[term['type']]
		w.writerow([
			legislator['id']['bioguide'],
			build_name(legislator, term, 'full'),
			build_area(term),
			term['party'],
			CURRENT_CONGRESS,
			term['start'],
			term['end'],
			legislator['name'].get('first'),
			legislator['name'].get('last'),
			legislator['name'].get('suffix'),
			build_name(legislator, term, 'sort'),
			term.get('phone'),
			legislator['bio'].get('gender'),
			legislator['bio'].get('birthday'),
			"https://theunitedstates.io/images/congress/original/%s.jpg" % legislator['id']['bioguide'],
			data_social_media.get(legislator['id']['bioguide'], {}).get("social", {}).get("twitter"),
			data_social_media.get(legislator['id']['bioguide'], {}).get("social", {}).get("facebook"),
			data_social_media.get(legislator['id']['bioguide'], {}).get("social", {}).get("instagram"),
			legislator['id'].get('wikipedia', '').replace(" ", "_"),
			term['url'],
		])