Example #1
def test_crappy_text_insert(engine, schema):
    is_mysql = 'mysql' in engine.dialect.dialect_description
    dtype = {'profileid':VARCHAR(10)} if is_mysql else None
    
    # mix crappy letters with a few normal ones
    crap_char_seq = """/_- ?§$&"',:;*()%[]{}|<>=!+#""" + "\\" + "sknalji"  

    # add columns with crappy names
    # don't do this for MySQL, which has stricter rules for column names
    if not is_mysql:
        for i in range(5):
            random_crappy_col_name = ''.join([random.choice(crap_char_seq)
                                              for i in range(50)])

            df_test = (pd.DataFrame({random_crappy_col_name: ['test', None]})
                       .rename_axis(['profileid'], axis='index', inplace=False))

            # psycopg2 can't process columns with "%" or "(" or ")"
            df_test = fix_psycopg2_bad_cols(df_test)
            upsert(engine=engine, schema=schema, df=df_test, if_row_exists='update', dtype=dtype, **default_args)

    # add crappy text in a column named 'text'
    create_random_text = lambda: ''.join([random.choice(crap_char_seq)
                                          for i in range(10)])

    df_test = (pd.DataFrame({'text': [create_random_text() for i in range(10)]})
               .rename_axis(['profileid'], axis='index', inplace=False))
    upsert(engine=engine, schema=schema, df=df_test, if_row_exists='update', dtype=dtype, **default_args)
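Example #1 comes from pangres' test suite and relies on names defined elsewhere in the test module: the `engine` and `schema` pytest fixtures, `default_args` (which presumably supplies at least `table_name` to `upsert`), and the imports. A rough, purely illustrative sketch of that surrounding setup follows; the connection string and the contents of `default_args` are guesses, and note that `upsert` is called with `engine=` here but with `con=` in Example #5, which points at different pangres versions.

import random

import pandas as pd
import pytest
from sqlalchemy import create_engine, VARCHAR
from pangres import upsert, fix_psycopg2_bad_cols

# assumed contents; the real test module defines this differently
default_args = {'table_name': 'test_crappy_text_insert'}

@pytest.fixture
def engine():
    # placeholder connection string for a local test database
    return create_engine('postgresql://user:password@localhost:5432/testdb')

@pytest.fixture
def schema():
    return None  # use the default schema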
Example #2
def florida():
    # Queries the ArcGIS REST endpoint for Florida COVID-19 case counts by ZIP code.
    source = requests.get(
        "https://services1.arcgis.com/CY1LXxl9zlJeBuRZ/arcgis/rest/services/Florida_Cases_Zips_COVID19/FeatureServer/0/query?where=0%3D0&outFields=*&outFields=ZIP,COUNTYNAME,POName,Cases_1&returnGeometry=false&f=json"
    )

    # Parses the JSON response.
    data = json.loads(source.text)

    rows = []
    for element in data["features"]:
        rows.append(element["attributes"])

    df = pd.DataFrame(rows)
    # Keeps only the needed attribute columns, selected by position.
    cols = [0, 1, 4, 6, 7, 11]
    df = df[df.columns[cols]]

    # Uploads dataframe to Postgres database.
    table_name = 'florida'
    engine = create_engine(os.getenv('SQLALCHEMY_DATABASE_URI'))

    df.sort_values(by=['ZIP'], inplace=True)

    # Some ZIPs span multiple counties, but data is reported separately
    # for each county, so ZIP cannot be used as index column.
    df.set_index("OBJECTID", inplace=True)

    df = pangres.fix_psycopg2_bad_cols(df)
    pangres.upsert(engine=engine,
                   df=df,
                   table_name=table_name,
                   if_row_exists='update')
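Examples #2–#4 come from the same scraper module and share their imports and database configuration; only the `SQLALCHEMY_DATABASE_URI` environment variable is read at run time. A sketch of that shared setup, inferred from what the three functions visibly use (the connection string below is a placeholder):

import json
import os
import re

import pandas as pd
import pangres
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

# e.g. export SQLALCHEMY_DATABASE_URI='postgresql://user:password@localhost:5432/covid'
# each function is then called on its own to refresh the corresponding table:
# florida()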
Example #3
def arizona():
    file = requests.get(
        "https://adhsgis.maps.arcgis.com/sharing/rest/content/items/8a2c089c866940bbac0ee70a41ea27bd/data",
        allow_redirects=True)

    # Saves file locally and loads into dataframe. READ_EXCEL REQUIRES XLRD DEPENDENCY.
    open("COVID19CONFIRMED_BYZIP_excel.xls", 'wb').write(file.content)
    df = pd.read_excel("COVID19CONFIRMED_BYZIP_excel.xls", usecols="A,C")

    df.drop(df[df['ConfirmedCaseCount'] == "Data Suppressed"].index,
            inplace=True)
    # The code below is used to populate the database on first run with city names for each ZIP code.
    # citynames = [get_city_names(row) for row in df['POSTCODE']]
    # df.insert(loc=1, column="City Name", value=citynames)

    # Uploads dataframe to Postgres database.
    table_name = 'arizona'
    engine = create_engine(os.getenv('SQLALCHEMY_DATABASE_URI'))

    df.set_index('POSTCODE', inplace=True)
    df = pangres.fix_psycopg2_bad_cols(df)
    pangres.upsert(engine=engine,
                   df=df,
                   table_name=table_name,
                   if_row_exists='update')
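Each of the scraper functions, like the tests, runs the dataframe through `pangres.fix_psycopg2_bad_cols` before upserting, because psycopg2 cannot handle column names containing "%", "(" or ")". A tiny self-contained illustration of that cleanup step; the column name is made up, and the exact replacement characters may vary between pangres versions:

import pandas as pd
import pangres

# a column name containing characters psycopg2 cannot process
raw = pd.DataFrame({'Percent positivity (%)': [3.2]},
                   index=pd.Index(['02108'], name='ZIP'))
clean = pangres.fix_psycopg2_bad_cols(raw)
print(list(clean.columns))  # the offending characters have been replaced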
Example #4
def massachusetts():
    # Gets url to download file.
    domain = "https://www.mass.gov"
    source = requests.get(
        "https://www.mass.gov/info-details/covid-19-response-reporting")
    soup = BeautifulSoup(source.content, 'html.parser')
    element = soup.find(text=re.compile(
        'Raw data used to create the Weekly Public Health Report'))
    element = element.find_next_sibling('a')
    download_url = domain + element['href']
    headers = {
        "Host": "www.mass.gov",
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Referer":
        "https://www.mass.gov/info-details/covid-19-response-reporting"
    }

    # Retrieves filename.
    file = requests.get(download_url, allow_redirects=True)
    filename = file.headers.get('content-disposition')
    filename = re.findall('filename=(.+)', filename)
    filename = re.findall(
        r'"([^"]*)"', filename[0])  #regex to find text within double quotes
    # At this stage, filename[0] gives filename.

    # Saves file locally and loads into dataframe. READ_EXCEL REQUIRES XLRD DEPENDENCY.
    with open(filename[0], 'wb') as f:
        f.write(file.content)
    df = pd.read_excel(filename[0],
                       sheet_name="City_town",
                       usecols="A:D,H,I",
                       na_values="*")
    df['Percent positivity'] = df['Percent positivity'].multiply(100)
    df = df.where(df != "<5", df["Positive Tests Last 14 days"], axis=0)
    df = df.drop(columns=["Positive Tests Last 14 days"])
    # Uploads dataframe to Postgres database.
    table_name = 'massachusetts'
    engine = create_engine(os.getenv('SQLALCHEMY_DATABASE_URI'))

    df.set_index('City/Town', inplace=True)
    df = pangres.fix_psycopg2_bad_cols(df)
    pangres.upsert(engine=engine,
                   df=df,
                   table_name=table_name,
                   if_row_exists='update')
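The `df.where(..., axis=0)` call above is the least obvious step: cells equal to the suppression marker "<5" are replaced by the same row's value from "Positive Tests Last 14 days", and that column is then dropped. A toy reproduction of the behaviour with made-up data:

import pandas as pd

toy = pd.DataFrame({'cases': ['<5', 12],
                    'Positive Tests Last 14 days': [3, 4]})
# keep cells where the condition holds; otherwise take the row's value from the Series
fixed = toy.where(toy != '<5', toy['Positive Tests Last 14 days'], axis=0)
print(fixed.drop(columns=['Positive Tests Last 14 days']))  # 'cases' becomes [3, 12]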
Example #5
def test_bad_column_names(engine, schema, iteration):
    # add columns with bad names
    # don't do this for MySQL, which has stricter rules for column names
    if 'mysql' in engine.dialect.dialect_description:
        pytest.skip(
            'MySQL has very strict rules for column names so we do not even test it'
        )

    random_bad_col_name = ''.join(
        random.choice(bad_char_seq) for i in range(50))
    df_test = (pd.DataFrame({
        random_bad_col_name: ['test', None]
    }).rename_axis(['profileid'], axis='index', inplace=False))

    # psycopg2 can't process columns with "%" or "(" or ")" so we will need `fix_psycopg2_bad_cols`
    if 'postgres' in engine.dialect.dialect_description:
        df_test = fix_psycopg2_bad_cols(df_test)
    upsert_or_aupsert(con=engine,
                      schema=schema,
                      df=df_test,
                      table_name=TableNames.BAD_COLUMN_NAMES,
                      if_row_exists='update')
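Like Example #1, this last snippet comes from pangres' test suite and references helpers defined elsewhere: `bad_char_seq`, the `TableNames` constants, the `upsert_or_aupsert` wrapper and the `engine`/`schema`/`iteration` fixtures. Purely illustrative stand-ins are sketched below; the real implementations differ, notably because `upsert_or_aupsert` also exercises pangres' asynchronous `aupsert`:

from pangres import upsert

# assumed character pool; the real test suite defines its own sequence
bad_char_seq = """/_- ?§$&"',:;*()%[]{}|<>=!+#""" + "\\"

class TableNames:
    BAD_COLUMN_NAMES = 'test_bad_column_names'  # assumed table name

def upsert_or_aupsert(con, df, table_name, if_row_exists, schema=None):
    # only the synchronous path is sketched here
    upsert(con=con, df=df, table_name=table_name, schema=schema,
           if_row_exists=if_row_exists)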