def test_crappy_text_insert(engine, schema):
    """Upsert DataFrames whose column names and cell values contain
    problematic characters (quotes, percent signs, brackets, backslashes...)
    to verify the upsert machinery survives them.

    Parameters
    ----------
    engine : sqlalchemy engine (test fixture)
    schema : target schema name (test fixture)
    """
    is_mysql = 'mysql' in engine.dialect.dialect_description
    # MySQL requires an explicit length for a VARCHAR primary key
    dtype = {'profileid': VARCHAR(10)} if is_mysql else None

    # mix crappy letters with a few normal ones
    crap_char_seq = """/_- ?§$&"',:;*()%[]{}|<>=!+#""" + "\\" + "sknalji"

    def random_crap(length):
        # random string of `length` characters drawn from the crappy pool
        # (was a duplicated join expression / lambda assignment, PEP8 E731)
        return ''.join(random.choice(crap_char_seq) for _ in range(length))

    # add columns with crappy names
    # don't do this for MySQL which has more strict rules for column names
    if not is_mysql:
        for _ in range(5):
            random_crappy_col_name = random_crap(50)
            df_test = (pd.DataFrame({random_crappy_col_name: ['test', None]})
                       .rename_axis(['profileid'], axis='index', inplace=False))
            # psycopg2 can't process columns with "%" or "(" or ")"
            df_test = fix_psycopg2_bad_cols(df_test)
            upsert(engine=engine, schema=schema, df=df_test,
                   if_row_exists='update', dtype=dtype, **default_args)

    # add crappy text in a column named 'text'
    df_test = (pd.DataFrame({'text': [random_crap(10) for _ in range(10)]})
               .rename_axis(['profileid'], axis='index', inplace=False))
    upsert(engine=engine, schema=schema, df=df_test,
           if_row_exists='update', dtype=dtype, **default_args)
def florida():
    """Fetch Florida COVID-19 case counts per ZIP code from the state's
    ArcGIS feature service and upsert them into the Postgres database.
    """
    # Gets url to download file.
    endpoint = "https://services1.arcgis.com/CY1LXxl9zlJeBuRZ/arcgis/rest/services/Florida_Cases_Zips_COVID19/FeatureServer/0/query?where=0%3D0&outFields=*&outFields=ZIP,COUNTYNAME,POName,Cases_1&returnGeometry=false&f=json"
    response = requests.get(endpoint)

    # Retrieves filename.
    payload = json.loads(response.text)
    records = [feature["attributes"] for feature in payload["features"]]
    frame = pd.DataFrame(records)

    # Keep only the columns of interest, selected by position.
    keep_positions = [0, 1, 4, 6, 7, 11]
    frame = frame[frame.columns[keep_positions]]

    # Uploads dataframe to Postgres database.
    db_engine = create_engine(os.getenv('SQLALCHEMY_DATABASE_URI'))
    frame = frame.sort_values(by=['ZIP'])
    # Some ZIPs span multiple counties, but data is reported separately
    # for each county, so ZIP cannot be used as index column.
    frame = frame.set_index("OBJECTID")
    frame = pangres.fix_psycopg2_bad_cols(frame)
    pangres.upsert(engine=db_engine, df=frame,
                   table_name='florida', if_row_exists='update')
def arizona():
    """Download Arizona's confirmed-cases-by-ZIP Excel export and upsert
    it into the Postgres database.
    """
    file = requests.get(
        "https://adhsgis.maps.arcgis.com/sharing/rest/content/items/8a2c089c866940bbac0ee70a41ea27bd/data",
        allow_redirects=True)

    # Saves file locally and loads into dataframe. READ_EXCEL REQUIRES XLRD DEPENDENCY.
    local_path = "COVID19CONFIRMED_BYZIP_excel.xls"
    # use a context manager: the original `open(...).write(...)` leaked the handle
    with open(local_path, 'wb') as fh:
        fh.write(file.content)
    df = pd.read_excel(local_path, usecols="A,C")

    # Suppressed counts are not numeric, so those rows are dropped.
    df.drop(df[df['ConfirmedCaseCount'] == "Data Suppressed"].index,
            inplace=True)

    # The code below is used to populate the database on first run with city names for each ZIP code.
    # citynames = [get_city_names(row) for row in df['POSTCODE']]
    # df.insert(loc=1, column="City Name", value=citynames)

    # Uploads dataframe to Postgres database.
    table_name = 'arizona'
    engine = create_engine(os.getenv('SQLALCHEMY_DATABASE_URI'))
    df.set_index('POSTCODE', inplace=True)
    df = pangres.fix_psycopg2_bad_cols(df)
    pangres.upsert(engine=engine, df=df, table_name=table_name,
                   if_row_exists='update')
def massachusetts():
    """Scrape the Massachusetts weekly public-health report page for the
    raw-data download link, fetch the Excel file, clean the City_town
    sheet and upsert it into the Postgres database.
    """
    # Gets url to download file.
    domain = "https://www.mass.gov"
    source = requests.get(
        "https://www.mass.gov/info-details/covid-19-response-reporting")
    soup = BeautifulSoup(source.content, 'html.parser')
    element = soup.find(text=re.compile(
        'Raw data used to create the Weekly Public Health Report'))
    element = element.find_next_sibling('a')
    download_url = domain + element['href']
    # NOTE(review): the original built a browser-like `headers` dict here but
    # never passed it to requests.get, so the dead local has been removed.

    # Retrieves filename from the Content-Disposition header.
    file = requests.get(download_url, allow_redirects=True)
    disposition = file.headers.get('content-disposition')
    candidates = re.findall('filename=(.+)', disposition)
    quoted = re.findall(
        r'"([^"]*)"', candidates[0])  #regex to find text within double quotes
    filename = quoted[0]

    # Saves file locally and loads into dataframe. READ_EXCEL REQUIRES XLRD DEPENDENCY.
    # use a context manager: the original `open(...).write(...)` leaked the handle
    with open(filename, 'wb') as fh:
        fh.write(file.content)
    df = pd.read_excel(filename, sheet_name="City_town",
                       usecols="A:D,H,I", na_values="*")
    df['Percent positivity'] = df['Percent positivity'].multiply(100)
    # Where a cell holds the suppressed marker "<5", fall back to the
    # "Positive Tests Last 14 days" value; that helper column is then dropped.
    df = df.where(df != "<5", df["Positive Tests Last 14 days"], axis=0)
    df = df.drop(columns=["Positive Tests Last 14 days"])

    # Uploads dataframe to Postgres database.
    table_name = 'massachusetts'
    engine = create_engine(os.getenv('SQLALCHEMY_DATABASE_URI'))
    df.set_index('City/Town', inplace=True)
    df = pangres.fix_psycopg2_bad_cols(df)
    pangres.upsert(engine=engine, df=df, table_name=table_name,
                   if_row_exists='update')
def test_bad_column_names(engine, schema, iteration):
    """Upsert a DataFrame whose single column has a randomly generated,
    SQL-unfriendly name, to verify bad column names are handled.
    """
    dialect = engine.dialect.dialect_description

    # add columns with bad names
    # don't do this for MySQL which has more strict rules for column names
    if 'mysql' in dialect:
        pytest.skip(
            'MySQL has very strict rules for column names so we do not even test it'
        )

    bad_chars = [random.choice(bad_char_seq) for _ in range(50)]
    weird_column = ''.join(bad_chars)
    frame = pd.DataFrame({weird_column: ['test', None]})
    frame = frame.rename_axis(['profileid'], axis='index', inplace=False)

    # psycopg2 can't process columns with "%" or "(" or ")" so we will need `fix_psycopg2_bad_cols`
    if 'postgres' in dialect:
        frame = fix_psycopg2_bad_cols(frame)

    upsert_or_aupsert(con=engine,
                      schema=schema,
                      df=frame,
                      table_name=TableNames.BAD_COLUMN_NAMES,
                      if_row_exists='update')