def test_string_date_pattern(self):
    """Dates spelled with month names come back as ISO ``YYYY-MM-DD``.

    The cases cover abbreviated and full month names, month-first and
    day-first ordering, a trailing comma after the month, and inputs with
    no day (or no month), for which ``01`` is expected in the missing
    position.
    """
    expected_by_input = {
        'Sep 21 2017': '2017-09-21',
        'Mar 1 2011': '2011-03-01',
        'Apr 7 2009': '2009-04-07',
        'January 2016': '2016-01-01',
        'Oct 2014': '2014-10-01',
        '2015': '2015-01-01',
        '6 April 2018': '2018-04-06',
        '8 Dec, 2010': '2010-12-08',
    }
    for raw, iso in expected_by_input.items():
        assert _extract_date(raw) == iso
def run():
    """Load one batch of NIH projects from MySQL and index it in Elasticsearch.

    All configuration comes from ``BATCHPAR_*`` environment variables:

    * ``start_index`` / ``end_index``: inclusive ``application_id`` bounds
      of the batch.
    * ``db``: name of the MySQL database holding the projects table.
    * ``outinfo``, ``out_port``, ``out_index``, ``out_type``,
      ``entity_type``, ``aws_auth_region``: Elasticsearch target settings.

    The selected rows are geocoded, given country ISO codes, have their
    start/end dates cleaned with ``_extract_date`` and are then written to
    Elasticsearch one document per row, keyed by ``application_id``.

    Raises:
        KeyError: if a required environment variable is missing, if a row
            has no ``application_id``, or if a state / continent code is
            absent from the corresponding lookup.
    """
    start_index = os.environ["BATCHPAR_start_index"]
    end_index = os.environ["BATCHPAR_end_index"]
    es_host = os.environ["BATCHPAR_outinfo"]
    es_port = os.environ["BATCHPAR_out_port"]
    es_index = os.environ["BATCHPAR_out_index"]
    es_type = os.environ["BATCHPAR_out_type"]
    entity_type = os.environ["BATCHPAR_entity_type"]
    db = os.environ["BATCHPAR_db"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # Read in the US states
    static_engine = get_mysql_engine("BATCHPAR_config", "mysqldb",
                                     "static_data")
    states_lookup = {
        row['state_code']: row['state_name']
        for _, row in pd.read_sql_table('us_states_lookup',
                                        static_engine).iterrows()
    }
    # Missing or blank state codes resolve to "no state name"
    states_lookup[None] = None
    states_lookup[''] = None

    # Get continent lookup
    continent_lookup = get_continent_lookup()

    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db)
    Session = sessionmaker(bind=engine)
    session = Session()
    cols = [
        "application_id", "full_project_num", "fy", "org_city",
        "org_country", "org_state", "org_zipcode", "org_name",
        "project_start", "project_end", "project_terms", "project_title",
        "total_cost", "phr", "ic_name"
    ]
    try:
        cols_attrs = [getattr(Projects, c) for c in cols]
        batch_selection = session.query(*cols_attrs).filter(
            Projects.application_id >= start_index,
            Projects.application_id <= end_index).selectable
        df = pd.read_sql(batch_selection, session.bind)
    finally:
        # The session is only needed to build and run the batch query
        session.close()

    # remove the 'nih_projects_' prefix
    prefix_length = len('nih_projects_')
    df.columns = [c[prefix_length:] for c in df.columns]

    # geocode the dataframe
    df = df.rename(columns={'org_city': 'city', 'org_country': 'country'})
    df = geocode_dataframe(df)

    # append iso codes for country
    # (presumably also supplies the 'continent' column read below -- confirm)
    df = country_iso_code_dataframe(df)

    # clean start and end dates
    for col in ["project_start", "project_end"]:
        df[col] = df[col].apply(_extract_date)

    # currency is the same for the whole dataset
    df['total_cost_currency'] = 'USD'

    # output to elasticsearch
    field_null_mapping = load_json_from_pathstub(
        "tier_1/field_null_mappings/", "health_scanner.json")
    strans_kwargs = {
        'filename': 'nih.json',
        'from_key': 'tier_0',
        'to_key': 'tier_1',
        'ignore': ['application_id']
    }
    es = ElasticsearchPlus(
        hosts=es_host,
        port=es_port,
        aws_auth_region=aws_auth_region,
        no_commit=("AWSBATCHTEST" in os.environ),
        entity_type=entity_type,
        strans_kwargs=strans_kwargs,
        field_null_mapping=field_null_mapping,
        null_empty_str=True,
        coordinates_as_floats=True,
        country_detection=True,
        listify_terms=True,
        terms_delimiters=(";", ","),
        caps_to_camel_case=True,
        null_pairs={"currency_total_cost": "cost_total_project"})

    for _, row in df.iterrows():
        # Null-valued fields are dropped, so any key may be absent from doc
        doc = dict(row.loc[~pd.isnull(row)])
        # NOTE(review): state/continent enrichment is applied only to rows
        # that carry a 'country' value -- confirm this nesting is intended.
        if 'country' in doc:
            # org_state may have been dropped as null; treat it as "no state"
            state_code = doc.get('org_state')
            # Try to patch broken US data
            if doc['country'] == '' and state_code not in (None, ''):
                doc['country'] = "United States"
                doc['continent'] = "NA"
            doc['placeName_state_organisation'] = states_lookup[state_code]
            continent_code = doc.get('continent')
            doc['placeName_continent_organisation'] = continent_lookup[
                continent_code]
        if 'ic_name' in doc:
            doc['ic_name'] = [doc['ic_name']]
        uid = doc.pop("application_id")
        es.index(index=es_index, doc_type=es_type, id=uid, body=doc)
def test_invalid_year_returns_none(self):
    """Inputs containing no recognisable year yield ``None``."""
    for raw in ('no year', 'nan', '-'):
        assert _extract_date(raw) is None
def test_valid_year_extract(self):
    """A year found in the text maps to 1 January of that year.

    When several years appear, the first one is expected.
    """
    cases = [
        ('2019', '2019-01-01'),
        ('sometime in 2011', '2011-01-01'),
        ('maybe 2019 or 2020', '2019-01-01'),
    ]
    for raw, iso in cases:
        assert _extract_date(raw) == iso
def test_invalid_day_returns_year(self):
    """An out-of-range day falls back to 1 January of the given year."""
    cases = (
        ('Mar 38 2001', '2001-01-01'),
        ('2000-09-40', '2000-01-01'),
        ('5/32/2017', '2017-01-01'),
    )
    for raw, iso in cases:
        assert _extract_date(raw) == iso
def test_invalid_month_returns_year(self):
    """An unrecognised month falls back to 1 January of the given year."""
    expected_by_input = {
        'Cat 12 2009': '2009-01-01',
        '2000-19-09': '2000-01-01',
        '20/4/2009': '2009-01-01',
    }
    for raw, iso in expected_by_input.items():
        assert _extract_date(raw) == iso
def test_slash_date_pattern(self):
    """Slash-separated dates are parsed in both M/D/YYYY and YYYY/MM/DD form."""
    month_first = [
        ('5/31/2020', '2020-05-31'),
        ('11/1/2012', '2012-11-01'),
        ('1/1/2010', '2010-01-01'),
    ]
    year_first = [
        ('2000/12/01', '2000-12-01'),
        ('1999/04/20', '1999-04-20'),
    ]
    for raw, iso in month_first + year_first:
        assert _extract_date(raw) == iso
def test_dash_date_pattern(self):
    """Dates already in ISO ``YYYY-MM-DD`` form are returned unchanged."""
    for iso in ('2016-07-31', '2010-12-01', '2020-01-04'):
        assert _extract_date(iso) == iso