def load_userstats():
    """Rebuild the ``userstats`` table from the JSON file at ``USERSTATS_PATH``.

    Each record parsed from the file is copied into a fresh dict that starts
    with a ``bbl`` entry taken from the record's ``pad_bbl`` value. The table
    is then dropped, re-created from ``USERSTATS_COLUMNS`` and filled with
    the resulting rows.
    """
    parsed_records = json.loads(USERSTATS_PATH.read_text())

    rows = []
    for record in parsed_records:
        # 'bbl' is seeded first; merging the record afterwards means a 'bbl'
        # key already present in the record overrides the seeded value.
        row = {'bbl': record['pad_bbl']}
        row.update(record)
        rows.append(row)

    db.drop_and_create_table('userstats', USERSTATS_COLUMNS)
    column_names = list(USERSTATS_COLUMNS.keys())
    db.insert_many('userstats', column_names, rows)
def download_census_data_into_db():
    """Recreate the ``census`` table and fill it one county at a time.

    For every entry of ``ALL_COUNTIES`` a progress line is printed, the rows
    returned by ``get_county_tract_data`` are fetched, and those rows are
    inserted using the column names of ``CENSUS_COLUMNS``.
    """
    # Imported inside the function, exactly as the original code does.
    from db import db

    db.drop_and_create_table("census", CENSUS_COLUMNS)

    for county_key in ALL_COUNTIES:
        print(f"Downloading data for {COUNTY_NAMES[county_key]} county.")
        tract_rows = get_county_tract_data(county_key)
        column_names = list(CENSUS_COLUMNS.keys())
        db.insert_many('census', column_names, tract_rows)
def load_oil_data(file, rename, marker):
    """Load oil price rows from a CSV file and insert them through ``db``.

    Args:
        file: Path or buffer handed to ``pd.read_csv``.
        rename: Column mapping passed to ``DataFrame.rename``; after renaming
            the frame must contain ``date`` and ``price`` columns.
        marker: Value written to the ``marker`` column of every row.

    Rows are kept only if they have no missing value in any column and
    ``row_filter`` returns a truthy value for them. Nothing is inserted when
    no row survives.
    """
    # Wrapping in a tuple keeps the %-format safe if ``file`` is itself a tuple.
    print('load %s' % (file,))

    frame = pd.read_csv(file)
    frame = frame.rename(index=str, columns=rename)
    frame['marker'] = marker

    # Unparseable dates become NaT so that dropna() below discards them.
    frame['date'] = pd.to_datetime(frame['date'], errors='coerce')

    # NOTE: dropna() runs on the full frame (before column selection), so a
    # missing value in *any* CSV column drops the row.
    frame = frame.dropna()

    required_fields = ['date', 'price', 'marker']
    frame = frame[required_fields]
    frame = frame[frame.apply(row_filter, axis=1)]

    records = frame.to_dict('records')
    # Skip the insert when nothing survived filtering, mirroring the guard in
    # performEachDay. NOTE(review): presumably ``db`` is a PyMongo-style
    # collection, which rejects an empty batch -- confirm.
    if records:
        db.insert_many(records)
def _last_number(text, pattern, cast, default):
    """Return the last token of *text* matching *pattern* that *cast* accepts.

    *default* is returned when *text* is a missing value, when no token can
    be converted, or when the converted number is zero.
    """
    # Missing CSV cells arrive as NaN (a float), which re.findall cannot scan.
    if not isinstance(text, str) and pd.isna(text):
        return default
    for token in reversed(re.findall(pattern, text)):
        try:
            return cast(token) or default
        except ValueError:
            # e.g. a lone "." matched by r'[\d.]+'; fall back to the
            # previous token instead of failing the whole load.
            continue
    return default


def load_data(file, rename_maps, year, map_lambdas=None):
    """Load vacancy rows from a CSV file, normalise them and insert them.

    Args:
        file: Path or buffer handed to ``pd.read_csv``.
        rename_maps: Column mapping passed to ``DataFrame.rename``; the
            renamed frame must contain ``company_size`` and ``experience``.
        year: Calendar year (int) bounding the generated publication dates.
        map_lambdas: Optional mapping of column name to a callable that
            receives a row and returns the value for that column. Defaults
            to no extra columns.

    Processing steps:
        * ``company_size`` becomes the last integer found in the text
          (100 when there is none or it is zero).
        * ``experience`` becomes the last number found in the text
          (1 when there is none or it is zero).
        * ``skills`` is computed per row by ``get_skills``.
        * ``publication_date`` is midnight of a date produced by
          ``fake.date_between`` between 1 January and 31 December of *year*.
        * Only ``REQUIRED_FIELDS`` columns and rows accepted by
          ``vacancy_row_filter`` are inserted.
    """
    # Wrapping in a tuple keeps the %-format safe if ``file`` is itself a tuple.
    print('load %s' % (file,))

    df = pd.read_csv(file)
    df = df.rename(index=str, columns=rename_maps)

    df['company_size'] = df['company_size'].map(
        lambda text: _last_number(text, r'\d+', int, 100))
    df['experience'] = df['experience'].map(
        lambda text: _last_number(text, r'[\d.]+', float, 1))
    df['skills'] = df.apply(get_skills, axis=1)

    # ``None`` replaces the former mutable ``{}`` default.
    for key, cb in (map_lambdas or {}).items():
        df[key] = df.apply(cb, axis=1)

    # Loop-invariant bounds are built once instead of once per row.
    year_start = date(year, 1, 1)
    year_end = date(year, 12, 31)
    midnight = datetime.min.time()
    df['publication_date'] = df.apply(
        lambda row: datetime.combine(
            fake.date_between(start_date=year_start, end_date=year_end),
            midnight),
        axis=1)

    df = df[REQUIRED_FIELDS]
    df = df[df.apply(vacancy_row_filter, axis=1)]

    records = df.to_dict('records')
    # NOTE(review): presumably ``db`` is a PyMongo-style collection, which
    # rejects an empty batch -- skip the insert when nothing survived.
    if records:
        db.insert_many(records)
def performEachDay():
    """Fetch Brent prices from Quandl starting today and insert new rows.

    The ``FRED/DCOILBRENTEU`` dataset is requested with today's timestamp as
    the start date. Its index is copied into a ``date`` column, ``Value`` is
    renamed to ``price`` and every row is tagged with the ``Brent`` marker.
    Rows with missing values or rejected by ``row_filter`` are discarded;
    whatever remains is printed and, if non-empty, inserted through ``db``.
    """
    now = datetime.today()
    print("Checking brent oil price", now)

    quotes = quandl.get("FRED/DCOILBRENTEU", start_date=now)
    # Expose the frame's index as a regular column before renaming.
    quotes['date'] = quotes.index

    prices = quotes.rename(columns={'Value': 'price'})
    prices['marker'] = "Brent"
    # Values that cannot be parsed become NaT and are removed by dropna().
    prices['date'] = pd.to_datetime(prices['date'], errors='coerce')
    prices = prices.dropna()

    wanted_columns = ['date', 'price', 'marker']
    prices = prices[wanted_columns]
    keep_mask = prices.apply(row_filter, axis=1)
    prices = prices[keep_mask]

    records = prices.to_dict('records')
    print(prices)

    # Nothing to store: leave without touching the database.
    if not records:
        return

    print('new records available, inserting...')
    db.insert_many(records)