def main(): """Generates public data dump files from the latest prod data.""" # Connect to the latest schemas. db = DatabaseConnection(path_config='db_config.yaml') schema = db.get_latest_schema('prod_') schema_profil = db.get_latest_schema('source_internal_profil_') db.execute('SET search_path="' + schema + '", "' + schema_profil + '";') timestamp = schema[schema.rfind('_') + 1:] print('[OK] Dumping from schemas "%s" and "%s"...' % (schema, schema_profil)) # Read YAML configuration file. config = yaml_load('public_dumps.yaml') dir_save = config['save_directory'] dumps = config['dumps'] # Process all dumps. for dump_name in dumps: save_path = os.path.join(dir_save, '%s_%s.csv' % (dump_name, timestamp)) db.dump_to_CSV(dumps[dump_name]['query'], save_path) print('[OK] Saved dump "%s" to %s' % (dump_name, save_path)) stage_path = os.path.join(dir_save, dump_name + '.csv') shutil.copyfile(save_path, stage_path) print('[OK] Copied dump "%s" to %s' % (dump_name, stage_path)) # Close database connection. db.close()
def gather_meta(self): """ Return the meta file. """ if not os.path.exists(self.paths["meta"]): return "" meta_dict = utils.yaml_load(self.paths["meta"]) # gather the dependencies if meta_dict and "dependencies" in meta_dict: # create a simple list of each role that is a dependency dep_list = [] for dependency in meta_dict["dependencies"]: if type(dependency) is dict: dep_list.append(dependency["role"]) else: dep_list.append(dependency) # unique set of dependencies meta_dict["dependencies"] = list(set(dep_list)) self.dependencies = meta_dict["dependencies"] else: self.dependencies = [] return utils.file_to_string(self.paths["meta"])
def run(options):
    # Load the committee metadata from the congress-legislators repository and make a
    # mapping from thomas_id and house_id to the committee dict. For each committee,
    # replace the subcommittees list with a dict from thomas_id to the subcommittee.
    utils.require_congress_legislators_repo()
    committees = {}
    for c in utils.yaml_load("congress-legislators/committees-current.yaml"):
        committees[c["thomas_id"]] = c
        if "house_committee_id" in c:
            committees[c["house_committee_id"] + "00"] = c
        c["subcommittees"] = dict(
            (s["thomas_id"], s) for s in c.get("subcommittees", []))

    for chamber in ("house", "senate"):
        # Load any existing meetings file so we can recycle GUIDs generated for Senate meetings.
        existing_meetings = []
        output_file = utils.data_dir() + "/committee_meetings_%s.json" % chamber
        if os.path.exists(output_file):
            existing_meetings = json.load(open(output_file))

        # Scrape for meeting info.
        if chamber == "senate":
            meetings = fetch_senate_committee_meetings(existing_meetings, committees, options)
        else:
            meetings = fetch_house_committee_meetings(existing_meetings, committees, options)

        # Write out.
        utils.write(
            json.dumps(meetings, sort_keys=True, indent=2, default=utils.format_datetime),
            output_file)
def run(options):
    # Load the committee metadata from the congress-legislators repository and make a
    # mapping from thomas_id and house_id to the committee dict. For each committee,
    # replace the subcommittees list with a dict from thomas_id to the subcommittee.
    utils.require_congress_legislators_repo()
    committees = {}
    for c in utils.yaml_load("congress-legislators/committees-current.yaml"):
        committees[c["thomas_id"]] = c
        if "house_committee_id" in c:
            committees[c["house_committee_id"] + "00"] = c
        c["subcommittees"] = dict((s["thomas_id"], s) for s in c.get("subcommittees", []))

    for chamber in ("house", "senate"):
        # Load any existing meetings file so we can recycle GUIDs generated for Senate meetings.
        existing_meetings = []
        output_file = utils.data_dir() + "/committee_meetings_%s.json" % chamber
        if os.path.exists(output_file):
            existing_meetings = json.load(open(output_file))

        # Scrape for meeting info.
        if chamber == "senate":
            meetings = fetch_senate_committee_meetings(existing_meetings, committees, options)
        else:
            meetings = fetch_house_committee_meetings(existing_meetings, committees, options)

        # Write out.
        utils.write(json.dumps(meetings, sort_keys=True, indent=2, default=utils.format_datetime), output_file)
def _gather_included_roles_recursive(tasks):
    if not tasks:
        return []

    included = []
    include_file = None

    for task in tasks:
        if "include_role" in task:
            included.append(task.get("include_role")['name'])
        elif "include" in task:
            include_file = task.get("include")
        elif "include_tasks" in task:
            include_file = task.get("include_tasks")
        elif "import_tasks" in task:
            include_file = task.get("import_tasks")
        elif "block" in task:
            included.extend(_gather_included_roles_recursive(task['block']))

        if include_file:
            # TODO: check playbooks dir, and role dir
            include_path = os.path.join(
                os.path.dirname(self.paths["tasks"]), include_file)
            if not os.path.exists(include_path):
                include_path = os.path.join(
                    os.path.dirname(self.paths["role"]), include_file)
            if not os.path.exists(include_path):
                continue
            tasks = utils.yaml_load(include_path)
            included.extend(_gather_included_roles_recursive(tasks))

    return included
def init(self):
    # load record
    if not os.path.exists('.cache'):
        os.mkdir('.cache')
    if os.path.exists('.cache/record.yaml'):
        record = utils.yaml_load('.cache/record.yaml')
        logger.info('record found, load record')
    else:
        record = {'hash': []}
        utils.yaml_dump(record, '.cache/record.yaml')
        logger.info('no record found, create new record')
    cache_flag = False

    # get file list
    file_list = self._get_file_list()
    for path, type_ in file_list:
        fhash = utils.file_md5(path)
        assert len(fhash) > 0
        if fhash not in record['hash']:
            # add new data to cache
            cache_flag = True
            record['hash'].append(fhash)
            logger.info('New file {0} {1} found, add to cache'.format(path, type_))
            prep_sym(path, type_)
            logger.info('{0} {1} add to cache success'.format(path, type_))
            utils.yaml_dump(record, '.cache/record.yaml')

    if cache_flag:
        logger.info('Cache data update success')
        prep_wm()
        prep_vec()
    else:
        logger.info('Data up to date, use cache data')
def run():
    # Use the House History Website's Women in Congress search results to get a list of IDs.
    # Because this requires a POST, our utils.download() function won't work.
    querystring = b"Command=Next&Term=Search&TermType=Last&ShowNonMember=true&ShowNonMember=false&Office=&Leadership=&State=&Party=&ContinentalCongress=false&BlackAmericansInCongress=false&WomenInCongress=true&WomenInCongress=false&CongressNumber=65&CongressNumber=66&CongressNumber=67&CongressNumber=68&CongressNumber=69&CongressNumber=70&CongressNumber=71&CongressNumber=72&CongressNumber=73&CongressNumber=74&CongressNumber=75&CongressNumber=76&CongressNumber=77&CongressNumber=78&CongressNumber=79&CongressNumber=80&CongressNumber=81&CongressNumber=82&CongressNumber=83&CongressNumber=84&CongressNumber=85&CongressNumber=86&CongressNumber=87&CongressNumber=88&CongressNumber=89&CongressNumber=90&CongressNumber=91&CongressNumber=92&CongressNumber=93&CongressNumber=94&CongressNumber=95&CongressNumber=96&CongressNumber=97&CongressNumber=98&CongressNumber=99&CongressNumber=100&CongressNumber=101&CongressNumber=102&CongressNumber=103&CongressNumber=104&CongressNumber=105&CongressNumber=106&CongressNumber=107&CongressNumber=108&CongressNumber=109&CongressNumber=110&CongressNumber=111&CongressNumber=112&CongressNumber=113&CurrentPage=__PAGE__&SortOrder=LastName&ResultType=Grid&PreviousSearch=Search%2CLast%2C%2C%2C%2C%2CFalse%2CFalse%2CTrue%2C65%2C66%2C67%2C68%2C69%2C70%2C71%2C72%2C73%2C74%2C75%2C76%2C77%2C78%2C79%2C80%2C81%2C82%2C83%2C84%2C85%2C86%2C87%2C88%2C89%2C90%2C91%2C92%2C93%2C94%2C95%2C96%2C97%2C98%2C99%2C100%2C101%2C102%2C103%2C104%2C105%2C106%2C107%2C108%2C109%2C110%2C111%2C112%2C113%2CLastName&X-Requested-With=XMLHttpRequest"

    women_house_history_ids = set()
    for pagenum in range(0, 25+1):
        body = urllib.request.urlopen(
            "http://history.house.gov/People/Search?Length=6",
            querystring.replace(b"__PAGE__", str(pagenum).encode("ascii"))
        ).read().decode("utf8")
        for match in re.findall(r"/People/Detail/(\d+)\?ret=True", body):
            women_house_history_ids.add(int(match))

    # Now check and update the gender of all legislators.
    missing_ids = set()
    for fn in ("../legislators-current.yaml", "../legislators-historical.yaml"):
        legislators = yaml_load(fn)
        for p in legislators:
            house_history_id = p.get("id", {}).get("house_history")
            if not house_history_id:
                missing_ids.add(p.get("id", {}).get("bioguide"))
                continue
            p.setdefault("bio", {})["gender"] = "F" if house_history_id in women_house_history_ids else "M"
            if house_history_id in women_house_history_ids:
                women_house_history_ids.remove(house_history_id)
        yaml_dump(legislators, fn)

    print("%d women in Congress were not found in our files." % len(women_house_history_ids))
    print("%d legislators are missing house_history IDs:" % len(missing_ids))
def gather_meta(self): """ Return the meta file. """ if not os.path.exists(self.paths["meta"]): self.dependencies = [] return "" meta_dict = utils.yaml_load(self.paths["meta"]) # gather the dependencies if meta_dict and "dependencies" in meta_dict: # create a simple list of each role that is a dependency dep_list = [] for dependency in meta_dict["dependencies"]: if type(dependency) is dict: dep_list.append(dependency["role"]) else: dep_list.append(dependency) # unique set of dependencies meta_dict["dependencies"] = list(set(dep_list)) self.dependencies = meta_dict["dependencies"] else: self.dependencies = [] return utils.file_to_string(self.paths["meta"])
def get_public_dumps_info():
    # Read public dumps YAML configuration file
    config = yaml_load('../data/public_dumps.yaml')
    dir_save = config['save_directory']
    dumps = config['dumps']

    # Iterate through the dumps
    result = []
    for dump_name in dumps:
        # Find dump file with the latest timestamp (inherited from prod data)
        filenames = [
            n for n in os.listdir(dir_save)
            if n.startswith(dump_name + '_') and n.endswith('.csv')
        ]
        if len(filenames) == 0:
            print('[WARNING] Could not find dump file for dump "%s"' % (dump_name))
            continue
        filename = sorted(filenames, reverse=True)[0]

        # Append dump info to results
        result.append({
            'name': dump_name,
            'notebook_url': dumps[dump_name]['notebook_url'],
            'query': dumps[dump_name]['query'].strip(),
            'url': 'https://verejne.digital/data/%s' % (filename)
        })
    return result
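For orientation, this is roughly the structure that yaml_load('public_dumps.yaml') is expected to return for the dump helpers above, sketched as the equivalent Python dict. The key names ('save_directory', 'dumps', 'query', 'notebook_url') come from the accesses in the code; the dump name, query, and URLs are hypothetical placeholders.

# Hypothetical illustration of the loaded public_dumps.yaml structure.
# Key names mirror the code above; all values are made-up placeholders.
example_config = {
    'save_directory': '/data/dumps/',    # where the CSV dump files are written
    'dumps': {
        'entities': {                    # hypothetical dump name
            'query': 'SELECT id, name FROM entities;',                 # SQL passed to db.dump_to_CSV
            'notebook_url': 'https://example.org/notebooks/entities',  # surfaced by get_public_dumps_info
        },
    },
}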
def test_hearing(self):
    committees = {}
    for c in utils.yaml_load("test/fixtures/committees-current.yaml"):
        committees[c["thomas_id"]] = c
        if "house_committee_id" in c:
            committees[c["house_committee_id"] + "00"] = c
        c["subcommittees"] = dict(
            (s["thomas_id"], s) for s in c.get("subcommittees", []))

    hearing_xml = "test/fixtures/hearings/sample_hearing.xml"
    file_xml = open(hearing_xml, "r")
    dom = lxml.etree.parse(file_xml)

    test_output = committee_meetings.parse_house_committee_meeting(
        '102252', dom, [], committees, {"debug": False}, None,
        ["BILLS-113hr4435ih.pdf", "BILLS-113hr4435ih.xml"])
    # event_id, dom, existing_meetings, committees, options, witnesses, uploaded_documents

    self.assertEqual(test_output['bill_ids'], ['hr4435-113'])
    self.assertEqual(test_output['chamber'], 'house')
    self.assertEqual(test_output['committee'], 'HSRU')
    self.assertEqual(test_output['congress'], 113)
    self.assertEqual(test_output['house_meeting_type'], 'HMTG')
    self.assertEqual(test_output['meeting_documents'][0]['description'],
                     'H.R. 4435 (as introduced)')
    self.assertEqual(test_output['meeting_documents'][0]['bill_id'], 'hr4435-113')
    self.assertEqual(test_output['meeting_documents'][0]['version_code'], 'ih')
    self.assertEqual(test_output['meeting_documents'][0]['type'], 'BR')
    self.assertEqual(test_output['meeting_documents'][0]['urls'], [
        {
            'url': 'http://beta.congress.gov/113/bills/hr4435/BILLS-113hr4435ih.pdf',
            'file_found': True
        },
        {
            'url': 'http://beta.congress.gov/113/bills/hr4435/BILLS-113hr4435ih.xml',
            'file_found': True
        },
    ])
    self.assertEqual(test_output['occurs_at'], '2014-05-19T17:00:00')
    self.assertEqual(test_output['room'], 'CAPITOL H-313')
    self.assertEqual(test_output['subcommittee'], None)
    self.assertEqual(
        test_output['topic'],
        u'H.R. 4435\u2014National Defense Authorization Act for Fiscal Year 2015 [General Debate]; H.R. 4660\u2014Commerce, Justice, Science, and Related Agencies Appropriations Act, 2015'
    )
    self.assertEqual(
        test_output['url'],
        'http://docs.house.gov/Committee/Calendar/ByEvent.aspx?EventID=102252'
    )
def read_and_valid_meta(self, role):
    """
    Read the meta files and return whether or not the meta file
    being read is valid.
    """
    if os.path.exists(self.paths["meta"]):
        self.meta_dict = utils.yaml_load(self.paths["meta"])
        if os.path.exists(self.paths["ansigenome"]):
            self.meta_dict['ansigenome_info'] = utils.yaml_load(
                self.paths["ansigenome"])['ansigenome_info']
    else:
        self.report["state"]["missing_meta_role"] += 1
        self.report["roles"][role]["state"] = "missing_meta"
        return False

    is_valid = True

    # utils.yaml_load returns False when the file is invalid
    if isinstance(self.meta_dict, bool):
        is_valid = False
        sys.exit(1)

    return is_valid
def run():
    # Use the House History Website's Women in Congress search results to get a list of IDs.
    # Because this requires a POST, our utils.download() function won't work.
    querystring = b"Command=Next&Term=Search&SearchIn=LastName&ShowNonMember=true&ShowNonMember=false&Office=&Leadership=&State=&Party=&ContinentalCongress=false&BlackAmericansInCongress=false&WomenInCongress=true&WomenInCongress=false&HispanicAmericansInCongress=false&CongressNumber=65&CongressNumber=66&CongressNumber=67&CongressNumber=68&CongressNumber=69&CongressNumber=70&CongressNumber=71&CongressNumber=72&CongressNumber=73&CongressNumber=74&CongressNumber=75&CongressNumber=76&CongressNumber=77&CongressNumber=78&CongressNumber=79&CongressNumber=80&CongressNumber=81&CongressNumber=82&CongressNumber=83&CongressNumber=84&CongressNumber=85&CongressNumber=86&CongressNumber=87&CongressNumber=88&CongressNumber=89&CongressNumber=90&CongressNumber=91&CongressNumber=92&CongressNumber=93&CongressNumber=94&CongressNumber=95&CongressNumber=96&CongressNumber=97&CongressNumber=98&CongressNumber=99&CongressNumber=100&CongressNumber=101&CongressNumber=102&CongressNumber=103&CongressNumber=104&CongressNumber=105&CongressNumber=106&CongressNumber=107&CongressNumber=108&CongressNumber=109&CongressNumber=110&CongressNumber=111&CongressNumber=112&CongressNumber=113&CongressNumber=114&CurrentPage=__PAGE__&SortOrder=LastName&ResultType=Grid&PreviousSearch=Search%2CLastName%2C%2C%2C%2C%2CFalse%2CFalse%2CTrue%2C65%2C66%2C67%2C68%2C69%2C70%2C71%2C72%2C73%2C74%2C75%2C76%2C77%2C78%2C79%2C80%2C81%2C82%2C83%2C84%2C85%2C86%2C87%2C88%2C89%2C90%2C91%2C92%2C93%2C94%2C95%2C96%2C97%2C98%2C99%2C100%2C101%2C102%2C103%2C104%2C105%2C106%2C107%2C108%2C109%2C110%2C111%2C112%2C113%2C114%2CLastName&X-Requested-With=XMLHttpRequest"

    women_house_history_ids = set()
    for pagenum in range(0, 30 + 1):
        body = urllib.request.urlopen(
            "http://history.house.gov/People/Search?Length=6",
            querystring.replace(b"__PAGE__", str(pagenum).encode("ascii"))
        ).read().decode("utf8")
        for match in re.findall(r"/People/Detail/(\d+)\?ret=True", body):
            women_house_history_ids.add(int(match))

    # Now check and update the gender of all legislators.
    matched_women_house_history_ids = set()
    missing_ids = set()
    for fn in ("../legislators-current.yaml", "../legislators-historical.yaml"):
        legislators = yaml_load(fn)
        for p in legislators:
            house_history_id = p.get("id", {}).get("house_history")
            if not house_history_id:
                # We have all of the women, so anyone left must be a man.
                p.setdefault("bio", {})["gender"] = "M"
                missing_ids.add(p.get("id", {}).get("bioguide"))
                continue
            p.setdefault("bio", {})["gender"] = \
                "F" if house_history_id in women_house_history_ids else "M"
            if house_history_id in women_house_history_ids:
                matched_women_house_history_ids.add(house_history_id)
        yaml_dump(legislators, fn)

    print("%d women in Congress reported by the House History website"
          % len(women_house_history_ids))
    print("%d women in Congress were not found in our files."
          % len(women_house_history_ids - matched_women_house_history_ids))
    print("  ", " ".join(str(x) for x in (women_house_history_ids - matched_women_house_history_ids)))
    print("%d legislators are missing house_history IDs, set to male." % len(missing_ids))
def export_roles(self):
    """
    Export the roles to one of the export types.
    """
    # prepare the report by removing unnecessary fields
    del self.report["state"]
    del self.report["stats"]

    for role in self.report["roles"]:
        del self.report["roles"][role]["state"]

        defaults_path = os.path.join(self.roles_path, role, "defaults", "main.yml")
        if os.path.exists(defaults_path):
            defaults = self.report["roles"][role]["defaults"]
            self.report["roles"][role]["defaults"] = \
                utils.yaml_load("", defaults)

    Export(self.roles_path, self.report, self.config, self.options)
def valid_meta(self, role):
    """
    Return whether or not the meta file being read is valid.
    """
    if os.path.exists(self.paths["meta"]):
        self.meta_dict = utils.yaml_load(self.paths["meta"])
    else:
        self.report["state"]["missing_meta_role"] += 1
        self.report["roles"][role]["state"] = "missing_meta"
        return False

    is_valid = True

    # utils.yaml_load returns False when the file is invalid
    if isinstance(self.meta_dict, bool):
        is_valid = False
        sys.exit(1)

    return is_valid
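The meta-validation methods above rely on the convention, noted in their comments, that utils.yaml_load returns False when a file cannot be parsed. Below is a minimal sketch of a helper with that behaviour, assuming PyYAML is available; the real utils.yaml_load in these projects takes extra arguments in places (a data string in export_roles, use_cache in the lint script), which are omitted here.

import yaml

def yaml_load(path):
    # Sketch only: parse a YAML file and return False when it is invalid,
    # matching the convention the meta-validation code above depends on.
    try:
        with open(path) as f:
            return yaml.safe_load(f)
    except yaml.YAMLError:
        return False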
def initialise_app(serving_directory):
    """ Procedure for initialising the app with precomputed values that
    are shared across different requests. The registry property is
    intended for this purpose, in order to avoid global variables. """

    # database
    db = DatabaseConnection(path_config='db_config.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path to ' + schema + ';')
    app.registry['db'] = db

    # data_sources
    data_sources = yaml_load('datasources.yaml')
    app.registry['data_sources'] = data_sources

    # entities
    entities = Entities()
    entities.loadFromDirectory(serving_directory)
    app.registry['entities'] = entities
def test_hearing(self):
    committees = {}
    for c in utils.yaml_load("test/fixtures/committees-current.yaml"):
        committees[c["thomas_id"]] = c
        if "house_committee_id" in c:
            committees[c["house_committee_id"] + "00"] = c
        c["subcommittees"] = dict((s["thomas_id"], s) for s in c.get("subcommittees", []))

    hearing_xml = "test/fixtures/hearings/sample_hearing.xml"
    file_xml = open(hearing_xml, "r")
    dom = lxml.etree.parse(file_xml)

    test_output = committee_meetings.parse_house_committee_meeting(
        "102252", dom, [], committees, {"debug": False}, None,
        ["BILLS-113hr4435ih.pdf", "BILLS-113hr4435ih.xml"]
    )  # event_id, dom, existing_meetings, committees, options, witnesses, uploaded_documents

    self.assertEqual(test_output["bill_ids"], ["hr4435-113"])
    self.assertEqual(test_output["chamber"], "house")
    self.assertEqual(test_output["committee"], "HSRU")
    self.assertEqual(test_output["congress"], 113)
    self.assertEqual(test_output["house_meeting_type"], "HMTG")
    self.assertEqual(test_output["meeting_documents"][0]["description"], "H.R. 4435 (as introduced)")
    self.assertEqual(test_output["meeting_documents"][0]["bill_id"], "hr4435-113")
    self.assertEqual(test_output["meeting_documents"][0]["version_code"], "ih")
    self.assertEqual(test_output["meeting_documents"][0]["type"], "BR")
    self.assertEqual(
        test_output["meeting_documents"][0]["urls"],
        [
            {"url": "http://beta.congress.gov/113/bills/hr4435/BILLS-113hr4435ih.pdf", "file_found": True},
            {"url": "http://beta.congress.gov/113/bills/hr4435/BILLS-113hr4435ih.xml", "file_found": True},
        ],
    )
    self.assertEqual(test_output["occurs_at"], "2014-05-19T17:00:00")
    self.assertEqual(test_output["room"], "CAPITOL H-313")
    self.assertEqual(test_output["subcommittee"], None)
    self.assertEqual(
        test_output["topic"],
        u"H.R. 4435\u2014National Defense Authorization Act for Fiscal Year 2015 [General Debate]; H.R. 4660\u2014Commerce, Justice, Science, and Related Agencies Appropriations Act, 2015",
    )
    self.assertEqual(test_output["url"], "http://docs.house.gov/Committee/Calendar/ByEvent.aspx?EventID=102252")
def generate_public_data_dumps(limit=None, verbose=False):
    """ Generates the public data dump files from the latest production data """

    # Connect to the latest production data schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = db.get_latest_schema('prod_')
    db.execute('SET search_path="' + schema + '";')
    timestamp = schema[schema.rfind('_') + 1:]
    if verbose:
        print('[OK] Dumping from schema "%s"...' % (schema))
    if limit is not None:
        print('[WARNING] Dumping with row limit %d!' % (limit))

    # Read YAML configuration file
    config = yaml_load('public_dumps.yaml')
    dir_save = config['save_directory']
    dumps = config['dumps']

    # Process all dumps
    for dump_name in dumps:
        # Construct dump query
        q = dumps[dump_name]['query']
        q = q.rstrip().rstrip(';')  # possibly remove ; ending
        if limit is not None:
            q += ' LIMIT %d' % (limit)

        # Dump to CSV without timestamp
        path_output = '%s%s.csv' % (dir_save, dump_name)
        db.dump_to_CSV(q, path_output)
        if verbose:
            print('[OK] Created dump "%s" in %s' % (dump_name, path_output))

        # Dump to CSV with timestamp
        path_output = '%s%s_%s.csv' % (dir_save, dump_name, timestamp)
        db.dump_to_CSV(q, path_output)
        if verbose:
            print('[OK] Created dump "%s" in %s' % (dump_name, path_output))

    # Close database connection
    db.close()
def run(options):
    # can limit it to one chamber
    chamber = options.get("chamber", None)
    if chamber and (chamber in ("house", "senate")):
        chambers = (chamber,)
    else:
        chambers = ("house", "senate")

    load_by = options.get("load_by", None)

    # Load the committee metadata from the congress-legislators repository and make a
    # mapping from thomas_id and house_id to the committee dict. For each committee,
    # replace the subcommittees list with a dict from thomas_id to the subcommittee.
    utils.require_congress_legislators_repo()
    committees = {}
    for c in utils.yaml_load("congress-legislators/committees-current.yaml"):
        committees[c["thomas_id"]] = c
        if "house_committee_id" in c:
            committees[c["house_committee_id"] + "00"] = c
        c["subcommittees"] = dict(
            (s["thomas_id"], s) for s in c.get("subcommittees", []))

    if "senate" in chambers:
        print("Fetching Senate meetings...")
        meetings = fetch_senate_committee_meetings(committees, options)
        print("Writing Senate meeting data to disk.")
        utils.write_json(meetings, output_for("senate"))

    if "house" in chambers:
        if load_by == None:
            print("Fetching House meetings...")
            meetings = fetch_house_committee_meetings(committees, options)
        else:
            print("Fetching House meetings by event_id...")
            meetings = fetch_meeting_from_event_id(committees, options, load_by)

        print("Writing House meeting data to disk.")
        utils.write_json(meetings, output_for("house"))
def get_numpy_word_embed():
    row = 0
    config = yaml_load("config.yaml")
    model_cfg = config.get("model", {})
    data_cfg = config.get("data", {})
    glove_path = model_cfg["glove_path"]
    glove_length = model_cfg["glove_length"]
    vocab_file = data_cfg["vocab_file"]

    words_embed = {}
    with open(glove_path, mode='r') as f:
        lines = f.readlines()
        for line in lines:
            line_list = line.split()
            word = line_list[0]
            embed = line_list[1:]
            embed = [float(num) for num in embed]
            words_embed[word] = embed
            if row > 20000:
                break
            row += 1

    word2idx = {}
    with open(vocab_file, 'rb') as handle:
        word2idx = pickle.load(handle)
    idx2word = {idx: w for w, idx in word2idx.items()}

    id2emb = {}
    id2emb[0] = [0.0] * glove_length
    for (_, idx) in word2idx.items():
        if idx2word[idx] in words_embed:
            id2emb[idx] = words_embed[idx2word[idx]]
        else:
            id2emb[idx] = [0.0] * glove_length
    data = [id2emb[idx] for idx in range(len(word2idx) + 1)]

    return data
def run(options):
    # can limit it to one chamber
    chamber = options.get("chamber", None)
    if chamber and (chamber in ("house", "senate")):
        chambers = chamber
    else:
        chambers = ("house", "senate")

    load_by = options.get("load_by", None)

    # Load the committee metadata from the congress-legislators repository and make a
    # mapping from thomas_id and house_id to the committee dict. For each committee,
    # replace the subcommittees list with a dict from thomas_id to the subcommittee.
    utils.require_congress_legislators_repo()
    committees = {}
    for c in utils.yaml_load("congress-legislators/committees-current.yaml"):
        committees[c["thomas_id"]] = c
        if "house_committee_id" in c:
            committees[c["house_committee_id"] + "00"] = c
        c["subcommittees"] = dict((s["thomas_id"], s) for s in c.get("subcommittees", []))

    if "senate" in chambers:
        print "Fetching Senate meetings..."
        meetings = fetch_senate_committee_meetings(committees, options)
        print "Writing Senate meeting data to disk."
        utils.write_json(meetings, output_for("senate"))

    if "house" in chambers:
        if load_by == None:
            print "Fetching House meetings..."
            meetings = fetch_house_committee_meetings(committees, options)
        else:
            print "Fetching House meetings by event_id..."
            meetings = fetch_meeting_from_event_id(committees, options, load_by)

        print "Writing House meeting data to disk."
        utils.write_json(meetings, output_for("house"))
        passages_mask.append(
            np.concatenate((np.ones(len(passage_ids)),
                            np.zeros(max_passage_length - len(passage_ids)))))
        questions_mask.append(
            np.concatenate((np.ones(len(question_ids)),
                            np.zeros(max_question_length - len(question_ids)))))
        answers.append(answer)
    return passages_ids, questions_ids, passages_length, questions_length, passages_mask, questions_mask, answers


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = yaml_load("config.yaml")
model_cfg = config.get("model", {})
preprocess_cfg = config.get("preprocess", {})
data_cfg = config.get("data", {})
dev_cfg = config.get("dev", {})
train_cfg = config.get("train", {})
eval_cfg = config.get("eval", {})

vocab = {}
with open(data_cfg['vocab_file'], 'rb') as handle:
    vocab = pickle.load(handle)
    handle.close()

dev_data = get_dev_data(
    data_cfg['data_path'], data_cfg['dev_data'], vocab,
    [preprocess_cfg['pa_max_sent_len'], preprocess_cfg['qu_max_sent_len']])
        results.update(result)

    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as f_w:
        for key in sorted(results.keys()):
            f_w.write("{} = {}\n".format(key, str(results[key])))


if __name__ == "__main__":
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument("--config_file", type=str, required=True)
    cli_parser.add_argument("--do_nni", action="store_true")
    cli_parser.add_argument("--do_reinit", action="store_true")
    cli_args = cli_parser.parse_args()

    logger.setLevel(logging.INFO)

    with open(cli_args.config_file) as f:
        train_conf = yaml_load(f)

    if cli_args.do_nni:
        nni_params = nni.get_next_parameter()
        tuned_params = make_flat_dict(nni_params)
        train_conf = update_nested(train_conf, tuned_params)

    train_conf.update(vars(cli_args))
    main(train_conf)
def get_inputs(self, yaml_path):
    """Read yaml input file"""
    self.inputs = utils.yaml_load(yaml_path)
    self.set_name(yaml_path)
    self.init_search_keywords()
def get_conf(self):
    """Read yaml config file"""
    return utils.yaml_load(self.conf_file)
# Just loads and saves each .yaml file to normalize serialization syntax.
#
# python lint.py
# ... will lint every .yaml file in the data directory.
#
# python lint.py file1.yaml file2.yaml ...
# ... will lint the specified files.

import glob, sys
from utils import yaml_load, yaml_dump, data_dir

for fn in glob.glob(data_dir() + "/*.yaml") if len(sys.argv) == 1 else sys.argv[1:]:
    print fn + "..."
    data = yaml_load(fn, use_cache=False)
    yaml_dump(data, fn)
def __init__(self, config_file):
    self.config = yaml_load(config_file)
    print("Config file loaded successfully: {}".format(config_file))
    terminal_break()
    pprint(self.config)
    terminal_break()
# Converts the specified YAML file to an equivalent-ish CSV file
# (on standard output).
#
# python export_csv.py ../legislators-current.yaml

import sys, csv
from collections import OrderedDict
from utils import yaml_load

if len(sys.argv) < 2:
    print "Usage: python export_csv.py ../legislators-current.yaml > legislators-current.csv"
    sys.exit(0)

data = yaml_load(sys.argv[1])

###############################################

def flatten_object(obj, path, ret):
    """Takes an object obj and flattens it into a dictionary ret.
    For instance { "x": { "y": 123 } } is turned into { "x__y": 123 }."""
    for k, v in obj.items():
        if isinstance(v, dict):
            flatten_object(v, (path + "__" if path else "") + k + "__", ret)
        elif isinstance(v, list):
            # don't peek inside lists
            pass
        else:
            ret[path + k] = v
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

parser = argparse.ArgumentParser(description='eval or test')
parser.add_argument("--model_path", default=" ", type=str, help="the model path")
parser.add_argument('--eval', action='store_true', help="eval")

# get command line parameter
args = parser.parse_args()
model_path = args.model_path
mode = args.eval

# get config from config.yaml
config = yaml_load("./config.yaml")
base_cfg = config.get("base", {})
model_cfg = config.get("model", {})
init_input = model_cfg["init_input"]

# get data path
eval_data_path = base_cfg.get("eval_data")
test_data_path = base_cfg.get("test_data")

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# load model
check_point = torch.load(model_path)
model = eval(check_point["model_name"])(*init_input, True).to(device)
def main(args_dict):
    test_mode = not args_dict['disable_test_mode']
    if test_mode:
        print "======================="
        print "=======TEST MODE======="
        print "======================="

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    # Write output into prod_schema_name
    prod_schema_name = "prod_" + timestamp
    print "prod_schema_name", prod_schema_name

    # Create database connections:
    db_source = DatabaseConnection(path_config='db_config_update_source.yaml')
    db_address_cache = DatabaseConnection(
        path_config='db_config_update_source.yaml',
        search_path='address_cache')
    db_prod = DatabaseConnection(path_config='db_config_update_source.yaml')
    CreateAndSetProdSchema(db_prod, prod_schema_name)

    # Initialize geocoder
    geocoder = geocoder_lib.Geocoder(db_address_cache, db_prod, test_mode)
    # Initialize entity lookup
    entities_lookup = entities.Entities(db_prod)

    # Table prod_tables.yaml defines a specification of SQL selects to read
    # source data and a description of additional tables to be created.
    config = utils.yaml_load('prod_tables.yaml')

    # This is where all the population happens!!!
    # Go through all the specified data sources and process them, adding data
    # as needed. We process them in lexicographic order!
    for key in sorted(config.keys()):
        config_per_source = config[key]
        print "Working on source:", key
        ProcessSource(db_source, db_prod, geocoder, entities_lookup,
                      config_per_source, test_mode)
    geocoder.PrintStats()
    entities_lookup.print_statistics()

    # Process yaml-free sources:
    process_source_rpvs(db_source, db_prod, geocoder, entities_lookup, test_mode)
    db_source.close()

    # Run post processing.
    # TODO: For now post processing requires access to the profil
    # source schema. Remove this when fixed.
    schema_profil = db_prod.get_latest_schema('source_internal_profil_')
    db_prod.execute('SET search_path="' + prod_schema_name + '", "' +
                    schema_profil + '", public;')
    post_process.do_post_processing(db_prod, test_mode)

    # Create materialized view for entity search after all entities
    # have been created.
    db_prod.execute("""
        CREATE MATERIALIZED VIEW entities_search AS
            SELECT id, to_tsvector('simple', unaccent(name)) as search_vector
            FROM entities;
        CREATE INDEX ON entities_search(search_vector);
        CREATE INDEX ON entities_search USING gin(search_vector);
    """)

    # Grant apps read-only access to the newly created schema and tables within
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'data')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'verejne')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'kataster')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'prepojenia')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'obstaravania')

    # Commit database changes and close database connections
    db_address_cache.commit()
    db_address_cache.close()
    if test_mode:
        db_prod.conn.rollback()
        print('[OK] Rolled back database changes (test mode)')
    else:
        db_prod.commit()
    db_prod.close()
def run():
    if len(sys.argv) < 2:
        print("Usage: python export_csv.py ../legislators-current.yaml > legislators-current.csv")
        sys.exit(0)

    data = yaml_load(sys.argv[1])

    ###############################################

    def flatten_object(obj, path, ret):
        """Takes an object obj and flattens it into a dictionary ret.
        For instance { "x": { "y": 123 } } is turned into { "x__y": 123 }."""
        for k, v in list(obj.items()):
            if isinstance(v, dict):
                flatten_object(v, (path + "__" if path else "") + k + "__", ret)
            elif isinstance(v, list):
                # don't peek inside lists
                pass
            else:
                ret[path + k] = v
        return ret

    # Scan through the records recursively to get a list of column names.
    # Attempt to preserve the field order as found in the YAML file. Since
    # any field may be absent, no one record can provide the complete field
    # order. Build the best field order by looking at what each field tends
    # to be preceded by.
    fields = set()
    preceding_keys = dict()  # maps keys to a dict of *previous* keys and how often they occurred
    for record in data:
        prev_key = None
        for key in flatten_object(record, "", OrderedDict()):
            fields.add(key)
            preceding_keys.setdefault(key, {}).setdefault(prev_key, 0)
            preceding_keys[key][prev_key] += 1
            prev_key = key

    # Convert to relative frequencies.
    for k, v in list(preceding_keys.items()):
        s = float(sum(v.values()))
        for k2 in v:
            v[k2] /= s

    # Get a good order for the fields. Greedily add keys from left to right
    # maximizing the conditional probability that the preceding key would
    # precede the key on the right.
    field_order = [None]
    prev_key = None
    while len(field_order) < len(fields):
        # Which key is such that prev_key is its most likely predecessor?
        # We do it this way (and not what is prev_key's most likely follower)
        # because we should be using a probability (of sorts) that is
        # conditional on the key being present. Otherwise we lose infrequent
        # keys.
        next_key = max([f for f in fields if f not in field_order],
                       key=lambda k: max(preceding_keys[k].get(pk, 0) for pk in field_order))
        field_order.append(next_key)
        prev_key = next_key
    field_order = field_order[1:]  # remove the None at the start

    # Write CSV header.
    w = csv.writer(sys.stdout)
    w.writerow(field_order)

    # Write the objects.
    for record in data:
        obj = flatten_object(record, "", {})
        w.writerow([obj.get(f, "") for f in field_order])
# Converts the specified YAML file to an equivalent-ish CSV file
# (on standard output).
#
# python export_csv.py ../legislators-current.yaml

import sys, csv
from collections import OrderedDict
from utils import yaml_load

if len(sys.argv) < 2:
    print "Usage: python export_csv.py ../legislators-current.yaml > legislators-current.csv"
    sys.exit(0)

data = yaml_load(sys.argv[1])

###############################################

def flatten_object(obj, path, ret):
    """Takes an object obj and flattens it into a dictionary ret.
    For instance { "x": { "y": 123 } } is turned into { "x__y": 123 }."""
    for k, v in obj.items():
        if isinstance(v, dict):
            flatten_object(v, (path + "__" if path else "") + k + "__", ret)
        elif isinstance(v, list):
            # don't peek inside lists
            pass
        else:
            ret[path + k] = v
def run():
    for fn in glob.glob(data_dir() + "/*.yaml") if len(sys.argv) == 1 else sys.argv[1:]:
        print(fn + "...")
        data = yaml_load(fn, use_cache=False)
        yaml_dump(data, fn)
def run():
    if len(sys.argv) < 2:
        print(
            "Usage: python export_csv.py ../legislators-current.yaml > legislators-current.csv"
        )
        sys.exit(0)

    data = yaml_load(sys.argv[1])

    ###############################################

    def flatten_object(obj, path, ret):
        """Takes an object obj and flattens it into a dictionary ret.
        For instance { "x": { "y": 123 } } is turned into { "x__y": 123 }."""
        for k, v in list(obj.items()):
            if isinstance(v, dict):
                flatten_object(v, (path + "__" if path else "") + k + "__", ret)
            elif isinstance(v, list):
                # don't peek inside lists
                pass
            else:
                ret[path + k] = v
        return ret

    # Scan through the records recursively to get a list of column names.
    # Attempt to preserve the field order as found in the YAML file. Since
    # any field may be absent, no one record can provide the complete field
    # order. Build the best field order by looking at what each field tends
    # to be preceded by.
    fields = set()
    preceding_keys = dict(
    )  # maps keys to a dict of *previous* keys and how often they occurred
    for record in data:
        prev_key = None
        for key in flatten_object(record, "", OrderedDict()):
            fields.add(key)
            preceding_keys.setdefault(key, {}).setdefault(prev_key, 0)
            preceding_keys[key][prev_key] += 1
            prev_key = key

    # Convert to relative frequencies.
    for k, v in list(preceding_keys.items()):
        s = float(sum(v.values()))
        for k2 in v:
            v[k2] /= s

    # Get a good order for the fields. Greedily add keys from left to right
    # maximizing the conditional probability that the preceding key would
    # precede the key on the right.
    field_order = [None]
    prev_key = None
    while len(field_order) < len(fields):
        # Which key is such that prev_key is its most likely predecessor?
        # We do it this way (and not what is prev_key's most likely follower)
        # because we should be using a probability (of sorts) that is
        # conditional on the key being present. Otherwise we lose infrequent
        # keys.
        next_key = max([f for f in fields if f not in field_order],
                       key=lambda k: max(preceding_keys[k].get(pk, 0)
                                         for pk in field_order))
        field_order.append(next_key)
        prev_key = next_key
    field_order = field_order[1:]  # remove the None at the start

    # Write CSV header.
    w = csv.writer(sys.stdout)
    w.writerow(field_order)

    # Write the objects.
    for record in data:
        obj = flatten_object(record, "", {})
        w.writerow([obj.get(f, "") for f in field_order])
def __init__(self, path_config='db_config.yaml', search_path=None):
    config = utils.yaml_load(path_config)
    self.conn = psycopg2.connect(user=config['user'], dbname=config['db'])
    if search_path is not None:
        self.execute('SET search_path = %s', (search_path, ))
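A short usage sketch of this constructor, mirroring how the other snippets in this section drive the class; it assumes db_config.yaml provides the user and db keys read above, and that get_latest_schema, execute, and close behave as they are used elsewhere in this section.

# Sketch of typical use, following the patterns in the dump functions above.
db = DatabaseConnection(path_config='db_config.yaml')
schema = db.get_latest_schema('prod_')           # e.g. a prod_<timestamp> schema
db.execute('SET search_path="' + schema + '";')
db.close()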
def run():
    parser = argparse.ArgumentParser(description='handle legis/exec')
    parser.add_argument('destdir')
    parser.add_argument('--inpType')
    args = parser.parse_args()

    if len(sys.argv) < 2:
        print("Usage: python everypolitician.py outputbasename/")
        sys.exit(0)

    # Load current legislators.
    if args.inpType == 'leg':
        data = yaml_load("{0}/legislators-current.yaml".format(govtrackdir))
    else:
        data = yaml_load("{0}/executive.yaml".format(govtrackdir))

    data_social_media = {}
    for legislator in yaml_load("{0}/legislators-social-media.yaml".format(govtrackdir)):
        data_social_media[legislator['id']['bioguide']] = legislator

    # Create output files.
    if args.inpType == 'leg':
        writers = {
            "rep": csv.writer(open(args.destdir + "house.csv", "w")),
            "sen": csv.writer(open(args.destdir + "senate.csv", "w")),
        }
    else:
        writers = {
            "prez": csv.writer(open(args.destdir + "prez.csv", "w")),
            "viceprez": csv.writer(open(args.destdir + "viceprez.csv", "w"))
        }
    for w in writers.values():
        w.writerow([
            "id",
            # "name",
            "postal_code",
            "state",
            # "group",
            "class_district",
            "start_date",
            "end_date",
            "num_terms",
            "party",
            "given_name",
            "middle_name",
            "family_name",
            "suffix",
            # "sort_name",
            # "phone",
            "gender",
            # "birth_date",
            "image",
            # "twitter",
            # "facebook",
            # "instagram",
            # "wikipedia",
            # "website",
            "office_code",
            "office_name"
        ])

    # Write out one row per legislator for their current term.
    for legislator in data:
        genRow(legislator, writers)
def ec2metadata(self):
    if self._instance_cache:
        return self._instance_cache
    output = subprocess.check_output(["ec2metadata"])
    self._instance_cache = yaml_load(output)
    return self._instance_cache
def init(self, path):
    config = utils.yaml_load(path)
    self.Data = config['Data']
    self.Model = config['Model']
def run():
    parser = argparse.ArgumentParser(description='handle legis/exec')
    parser.add_argument('destdir')
    parser.add_argument('--inpType')
    args = parser.parse_args()

    if len(sys.argv) < 2:
        print("Usage: python everypolitician.py outputbasename/")
        sys.exit(0)

    # Load current legislators.
    if args.inpType == 'leg':
        data = yaml_load("{0}/legislators-current.yaml".format(govtrackdir))
    else:
        data = yaml_load("{0}/executive.yaml".format(govtrackdir))

    data_social_media = {}
    for legislator in yaml_load(
            "{0}/legislators-social-media.yaml".format(govtrackdir)):
        data_social_media[legislator['id']['bioguide']] = legislator

    # Create output files.
    if args.inpType == 'leg':
        writers = {
            "rep": csv.writer(open(args.destdir + "house.csv", "w")),
            "sen": csv.writer(open(args.destdir + "senate.csv", "w")),
        }
    else:
        writers = {
            "prez": csv.writer(open(args.destdir + "prez.csv", "w")),
            "viceprez": csv.writer(open(args.destdir + "viceprez.csv", "w"))
        }
    for w in writers.values():
        w.writerow([
            "id",
            # "name",
            "postal_code",
            "state",
            # "group",
            "class_district",
            "start_date",
            "end_date",
            "num_terms",
            "party",
            "given_name",
            "middle_name",
            "family_name",
            "suffix",
            # "sort_name",
            # "phone",
            "gender",
            # "birth_date",
            "image",
            # "twitter",
            # "facebook",
            # "instagram",
            # "wikipedia",
            # "website",
            "office_code",
            "office_name"
        ])

    # Write out one row per legislator for their current term.
    for legislator in data:
        genRow(legislator, writers)
def run():
    if len(sys.argv) < 2:
        print("Usage: python everypolitician.py outputbasename/")
        sys.exit(0)

    # Load current legislators.
    data = yaml_load("../legislators-current.yaml")
    data_social_media = {}
    for legislator in yaml_load("../legislators-social-media.yaml"):
        data_social_media[legislator['id']['bioguide']] = legislator

    # Create output files.
    writers = {
        "rep": csv.writer(open(sys.argv[1] + "house.csv", "w")),
        "sen": csv.writer(open(sys.argv[1] + "senate.csv", "w")),
    }
    for w in writers.values():
        w.writerow([
            "id",
            "name",
            "area",
            "group",
            "term",
            "start_date",
            "end_date",
            "given_name",
            "family_name",
            "honorific_suffix",
            "sort_name",
            "phone",
            "gender",
            "birth_date",
            "image",
            "twitter",
            "facebook",
            "instagram",
            "wikipedia",
            "website",
        ])

    # Write out one row per legislator for their current term.
    for legislator in data:
        term = legislator['terms'][-1]

        # TODO: "If someone changed party/faction affiliation in the middle of the term,
        # you should include two entries, with the relevant start/end dates set."

        w = writers[term['type']]
        w.writerow([
            legislator['id']['bioguide'],
            build_name(legislator, term, 'full'),
            build_area(term),
            term['party'],
            CURRENT_CONGRESS,
            term['start'],
            term['end'],
            legislator['name'].get('first'),
            legislator['name'].get('last'),
            legislator['name'].get('suffix'),
            build_name(legislator, term, 'sort'),
            term.get('phone'),
            legislator['bio'].get('gender'),
            legislator['bio'].get('birthday'),
            "https://theunitedstates.io/images/congress/original/%s.jpg" % legislator['id']['bioguide'],
            data_social_media.get(legislator['id']['bioguide'], {}).get("social", {}).get("twitter"),
            data_social_media.get(legislator['id']['bioguide'], {}).get("social", {}).get("facebook"),
            data_social_media.get(legislator['id']['bioguide'], {}).get("social", {}).get("instagram"),
            legislator['id'].get('wikipedia', '').replace(" ", "_"),
            term['url'],
        ])