def update_salaries(jobs=None, cities=None, df=None, skiprows=0,
                    table='salary', verbose_=True):
    # use None sentinels instead of mutable [] defaults
    columns = ['job', 'city', 'state', 'salary']
    #columns += ['n_postings', 'state_name']
    #columns += ['relative_salary', 'salaries_max', 'salaries_median',
    #            'trend_last2first', 'trend_median', 'trend_max']
    # get jobs from text file
    if jobs is None:
        jobs = pd.read_csv(PATH + 'jobs.txt')[skiprows:]
        #jobs.job = jobs.job.str.title()
    # get unique cities from postings
    if cities is None:
        cities = db.get_cities_from_db()
    if df is None:
        df = pd.DataFrame(columns=columns)
    for job, location in itertools.product(jobs.job.values, cities.values):
        city, state = location
        # only scrape (job, city, state) triples not already in the table
        if db.queryNotInDb(job, city, state, table):
            df = scrape_indeed(job, city, state, df)
            if verbose_:
                print df.tail(1)
            db.to_sql(df.tail(1), table, 'append', null=0)
    return df
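# Usage sketch (assumption: PATH, db and scrape_indeed are configured as
# above). Run a full update, then resume one skipping the first two jobs
# in jobs.txt:
#
#   df = update_salaries(verbose_=False)
#   df = update_salaries(skiprows=2, df=df)
#   print df.groupby('job').salary.median()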
def get_data_scientist_salary_world_from_kd():
    name = 'data_scientist_salary_world'
    url = ('http://www.kdnuggets.com/2013/02/'
           'salary-analytics-data-mining-data-science-professionals.html')
    df = pd.read_html(url)[3]
    # the first row of the scraped table is the header
    df.columns = df.iloc[0]
    df = df.iloc[1:]
    df = df.fillna(mdb.NULL)
    db.to_sql(df, name, 'replace')
    return df
def get_us_millionaires():
    name = 'phoenix_millionaires_2013'
    url = ('http://blogs.wsj.com/public/resources/documents/'
           'st_millionaires20140116.html')
    df = pd.read_html(url)[0]
    df.drop(5, axis=1, inplace=True)
    df.columns = ['State', 'Households', 'Millionaires', '2013 Ratio',
                  'Ratio change from 2012']
    db.to_sql(df, name, 'replace')
    return df
def get_state_codes_from_wikipedia():
    name = 'state_codes'
    #url = 'https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations'
    fname = 'List_of_U.S._state_abbreviations.xlsx'
    df = pd.read_excel(PATH + fname, 'Sheet1', header=0)
    # columns = name of state, code of state
    df = df[['Name', 'ANSI[2]']]
    df.columns = ['Name', 'Code']
    # MySQL
    db.to_sql(df, name, 'replace')
    return df
def create_top_cities_from_db(n=20):
    jobs = db.get_jobs_from_db()
    cities = []
    # every ordered pair of jobs contributes one top-n city list
    for job1, job2 in itertools.product(jobs.job.values, repeat=2):
        df = get_cities(job1, job2).head(n)
        cities.extend((df.city + ', ' + df.state).values)
    c = collections.Counter(cities)
    city = [s.split(', ')[0] for s in c.keys()]
    state = [s.split(', ')[1] for s in c.keys()]
    d = {'city': city, 'state': state, 'top20': c.values()}
    top_cities = pd.DataFrame(d)
    db.to_sql(top_cities, 'top20')
    return top_cities
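# Sketch of the counting step above: collections.Counter tallies how often
# each 'city, state' string appears across the top-n lists (toy data; the
# order of .items() is unspecified):
#
#   c = collections.Counter(['Austin, TX', 'Austin, TX', 'Boston, MA'])
#   print c.items()   # -> [('Austin, TX', 2), ('Boston, MA', 1)]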
def get_salaries_for_job(job, table='salary', verbose_=True, save_=True):
    columns = ['job', 'city', 'state', 'salary']
    df = pd.DataFrame(columns=columns)
    # get only the cities that actually have postings for the job
    cities = db.get_cities_for_job(job)
    for city, state in cities.values:
        if verbose_:
            print city, state
        if db.queryNotInDb(job, city, state, table):
            df = scrape_indeed(job, city, state, df)
            if verbose_:
                print df.tail(1)
            if save_:
                db.to_sql(df.tail(1), table, 'append', null=0)
    return df
def get_cnn_money_best_small_towns():
    name = 'cnn_money_best_small_towns_2012'
    url = 'http://money.cnn.com/magazines/moneymag/best-places/2013/full_list/'
    # NOTE: dict is not ordered..
    classes = {"rank listCol1": 'rank',
               "town listCol2": 'town',
               "population-rounded listCol3": 'population',
               "medhomeprice-visible": 'medhomeprice',
               "medfaminc-visible": 'medfaminc',
               "jobgrowth-visible": 'jobgrowth',
               "readingTestScores-visible": 'readingTestScores',
               "mathTestScores-visible": 'mathTestScores',
               "personalCrime-visible": 'personalCrime',
               "propertyCrime-visible": 'propertyCrime',
               "state": 'state',
               "region": 'region'}
    session = requests.session()
    response = session.get(url)
    soup = BeautifulSoup(response.text)
    d = {}
    for class_, column in classes.items():
        tags = soup.findAll('span', {'class': class_})
        matches = re.findall(r'"%s">(.*?)</span>' % class_, str(tags))
        d[column] = matches
    # pandas dataframe
    df = pd.DataFrame(d)
    # strip symbols
    df.jobgrowth = df.jobgrowth.str.strip('%')
    df.medfaminc = df.medfaminc.str.strip('$').str.replace(',', '')
    df.medhomeprice = df.medhomeprice.str.strip('$').str.replace(',', '')
    df.population = df.population.str.replace(',', '')
    # strip state (df already has column 'state')
    df.town = df.town.str.replace(r',(.*)', '')
    # re-order columns
    columns = ['rank', 'town', 'state', 'population', 'medhomeprice',
               'medfaminc', 'jobgrowth', 'readingTestScores', 'mathTestScores',
               'personalCrime', 'propertyCrime', 'region']
    df = df[columns]
    # MySQL
    db.to_sql(df, name, 'replace')
    return df
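def _demo_span_scrape():
    # Minimal sketch (toy markup, not from the CNN page) of the scraping
    # technique above: find spans by class, then capture the text between
    # the serialized class attribute and the closing </span>.
    html = ('<span class="rank listCol1">1</span>'
            '<span class="rank listCol1">2</span>')
    tags = BeautifulSoup(html).findAll('span', {'class': 'rank listCol1'})
    return re.findall(r'"%s">(.*?)</span>' % 'rank listCol1', str(tags))
    # -> ['1', '2']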
def scrape_payscale_for_salary(url, save_=False):
    session = requests.session()
    response = session.get(url)
    # TODO: solve 'latin' to utf problems
    if save_:
        d = {'url': [url],
             'text': [response.text],
             'date': [time.strftime('%Y-%m-%d %H:%M:%S')]}
        db.to_sql(pd.DataFrame(d), 'payscale', 'append')
    soup = BeautifulSoup(response.text)
    attrs = ["you_label", "median_only_you_label"]
    tags = soup.findAll('div', attrs)
    print tags  # debug
    if tags:
        # keep every other match (the dollar figures before a '\r')
        matches = re.findall(r'([0-9]*,*[0-9]*)\r', str(tags))[1::2]
        return [locale.atoi(match) for match in matches]
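# Note (assumption about the runtime locale): locale.atoi above only parses
# '85,000'-style strings once a locale with a thousands separator has been
# set; under the default 'C' locale it raises ValueError:
#
#   locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
#   print locale.atoi('85,000')   # -> 85000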
def scrape_and_write_visa_table(route, table, year=2013, url=VISA, write_=True):
    url = '-'.join((url, str(year)))
    url = '/'.join((url, route))
    df = pd.read_html(url, header=0, infer_types=False)[0]
    df.iloc[:, 0] = df.iloc[:, 0].astype(str)
    df.iloc[:, 1] = df.iloc[:, 1].apply(mk_int)
    #jobs.iloc[:,1] = jobs.iloc[:,1].astype(int)
    # clean data
    if 'jobs' in table:
        df = clean_visa_jobs_names(df)
    df['Year'] = year
    if write_:
        db.to_sql(df, table, 'replace')
    return df
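# Usage sketch (the route and table names are hypothetical; VISA is assumed
# to be a base url that takes '-<year>' and '/<route>' suffixes as above):
#
#   df = scrape_and_write_visa_table('top-employers', 'visa_jobs',
#                                    year=2013, write_=False)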
def get_country_codes_from_wiki():
    name = 'country_codes'
    #url = 'https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2'
    fname = 'ISO_3166.xlsx'
    df = pd.read_excel(fname, 'Sheet1', header=0, skiprows=0)
    # select columns
    df = df[['Code', 'Country name', 'ccTLD']]
    df.columns = [u'Code', u'Country', u'ccTLD']
    # replace 'GB' with 'UK' and '.gb' with '.uk'
    # (use .loc to avoid chained assignment on a copy)
    df.loc[81, 'Code'] = u'UK'      #df.ccTLD[82]
    df.loc[81, 'ccTLD'] = u'.co.uk' #df.ccTLD[82]
    df = df[~df.Country.isnull()]
    # restore the code at row 159 ('NA' is read from Excel as NaN)
    df.loc[df.index[159], 'Code'] = 'NA'
    db.to_sql(df, name, 'replace')
    return df
def get_mercer_quality_of_living(sheetname='quality_of_living'):
    # url = 'http://www.mercer.com/press-releases/quality-of-living-report-2012'
    fname = 'mercer-quality-of-living-report-2012.xlsx'
    name = '_'.join(('mercer', sheetname, '2012'))
    df = pd.read_excel(fname, sheetname, header=1, skiprows=1)
    # create new column 'State'
    df['State'] = pd.Series([mdb.NULL] * len(df), index=df.index)
    # get american states and split city, state
    us_i = df.query('Country == "United States"').index
    states = list(df.City[us_i].str.split(', ').apply(lambda x: x[1]).values)
    #states = ['HI', 'CA', 'MA', 'IL', 'DC', 'NY', 'WA', 'PA', 'MI']
    # use .loc to avoid chained assignment on a copy
    df.loc[us_i, 'State'] = states
    df.loc[us_i, 'City'] = df.City[us_i].str.replace(r', (..)', '')
    # replace NaN with MySQL NULL
    #mdb.FIELD_TYPE.NULL
    #df.State.fillna(mdb.NULL, inplace=True)
    # MySQL
    db.to_sql(df, name, 'replace')
    return df
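# Sketch (toy data) of the City/State split used above: the text after
# ', ' becomes the state, then the ', XX' suffix is stripped from the city:
#
#   s = pd.Series(['Honolulu, HI', 'San Francisco, CA'])
#   print s.str.split(', ').apply(lambda x: x[1]).tolist()  # -> ['HI', 'CA']
#   print s.str.replace(r', (..)', '').tolist()  # -> ['Honolulu', 'San Francisco']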
def indeed_api(job, city, state, table='postings', maxResults=1001,
               source='backend', params=params, save_=True):
    # columns to keep
    columns = [u'jobkey', u'job', u'jobtitle', u'company', u'city', u'state',
               u'formattedLocation', u'country', u'date',
               u'formattedRelativeTime', u'latitude', u'longitude', u'url']
    # columns to add
    columns += ['indeed_city', 'source', 'timestamp']

    def annotate(df):
        # tag results with the query metadata and normalize the
        # 'x days ago' column to an integer (shared by every page)
        df['job'] = job
        df['indeed_city'] = df['city']
        df['city'] = city
        df['source'] = source
        df['timestamp'] = pd.tslib.Timestamp.utcnow()
        df.formattedRelativeTime = map(
            lambda x: int(x[0]), df.formattedRelativeTime.str.findall(r'[0-9].'))
        return df

    # query
    location = ', '.join((city, state))
    params['l'] = location
    params['q'] = job
    # first page of results (n=25)
    params['start'] = 0
    session = requests.session()
    response = session.get(API, params=params)
    df = pd.DataFrame(response.json()['results'])
    if df.empty:
        return None
    df = annotate(df)
    if save_:
        db.to_sql(df[columns], table, 'append')
    totalResults = response.json()['totalResults']
    print totalResults
    print map(response.json().get, [u'start', u'end', u'pageNumber'])
    # next pages of results
    limit = params['limit']
    if totalResults <= limit:
        return None
    for start in range(limit, totalResults + limit + 1, limit):
        if start > maxResults:
            return None
        params['start'] = start
        response = session.get(API, params=params)
        df = pd.DataFrame(response.json()['results'])
        if df.empty:
            return None
        df = annotate(df)
        if save_:
            db.to_sql(df[columns], table, 'append')
        print map(response.json().get, [u'start', u'end', u'pageNumber'])
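# Pagination sketch: page 0 (25 results) is fetched before the loop; the
# loop then requests start offsets limit, 2*limit, ... past totalResults,
# stopping early at maxResults or on an empty page:
#
#   limit, totalResults = 25, 110
#   print range(limit, totalResults + limit + 1, limit)
#   # -> [25, 50, 75, 100, 125]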