Example #1
File: indeed.py Project: rpinho/happy.li
def update_salaries(jobs=None, cities=None, df=None, skiprows=0,
                    table='salary', verbose_=True):
    # None defaults avoid the shared-mutable-default-argument pitfall

    columns = ['job', 'city', 'state', 'salary']
    #columns += ['n_postings', 'state_name']
    #columns += ['relative_salary', 'salaries_max', 'salaries_median',
    #            'trend_last2first', 'trend_median', 'trend_max']

    # get jobs from text file
    if jobs is None:
        jobs = pd.read_csv(PATH + 'jobs.txt')[skiprows:]
        #jobs.job = jobs.job.str.title()

    # get unique cities from postings
    if cities is None:
        cities = db.get_cities_from_db()

    if df is None:
        df = pd.DataFrame(columns=columns)

    for job, location in itertools.product(jobs.job.values, cities.values):
        city, state = location
        if db.queryNotInDb(job, city, state, table):
            df = scrape_indeed(job, city, state, df)
            if verbose_:
                print(df.tail(1))
            db.to_sql(df.tail(1), table, 'append', null=0)

    return df
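A minimal call sketch for update_salaries, assuming the module-level globals it relies on (PATH, db, scrape_indeed, pd, itertools) are set up as in indeed.py; the call itself is hypothetical:

# hypothetical usage -- 'indeed' is happy.li's indeed.py module
import indeed

# fill the 'salary' table for every (job, city) pair not yet scraped,
# skipping the first two rows of jobs.txt
df = indeed.update_salaries(skiprows=2, verbose_=False)
print(df.tail())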
Example #2
def get_data_scientist_salary_world_from_kd():
    name = 'data_scientist_salary_world'
    url = 'http://www.kdnuggets.com/2013/02/salary-analytics-data-mining-data-science-professionals.html'
    df = pd.read_html(url)[3]  # the target table is the fourth on the page
    # promote the first row to the header, then drop it from the data
    df.columns = df.iloc[0]
    df = df.iloc[1:]
    df = df.fillna(mdb.NULL)  # use the MySQL NULL sentinel for missing cells
    db.to_sql(df, name, 'replace')
    return df
Example #3
def get_us_millionaires():
    name = 'phoenix_millionaires_2013'
    url = 'http://blogs.wsj.com/public/resources/documents/st_millionaires20140116.html'
    df = pd.read_html(url)[0]
    df.drop(5, axis=1, inplace=True)  # drop the unused sixth column (label 5)
    df.columns = ['State', 'Households', 'Millionaires', '2013 Ratio',
                  'Ratio change from 2012']

    db.to_sql(df, name, 'replace')

    return df
Example #4
def get_state_codes_from_wikipedia():
    name = 'state_codes'
    #url = 'https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations'
    fname = 'List_of_U.S._state_abbreviations.xlsx'
    df = pd.read_excel(PATH + fname, 'Sheet1', header=0)
    # keep state name and ANSI code ('ANSI[2]' carries Wikipedia's footnote marker)
    df = df[['Name', 'ANSI[2]']]
    df.columns = ['Name', 'Code']
    # MySQL
    db.to_sql(df, name, 'replace')
    return df
Example #5
def create_top_cities_from_db(n=20):
    jobs = db.get_jobs_from_db()
    cities = []
    for job1, job2 in itertools.product(jobs.job.values, repeat=2):
        df = get_cities(job1, job2).head(n)
        cities.extend((df.city + ', ' + df.state).values)

    c = collections.Counter(cities)
    city = [s.split(', ')[0] for s in c]
    state = [s.split(', ')[1] for s in c]
    d = {'city': city, 'state': state, 'top20': list(c.values())}
    top_cities = pd.DataFrame(d)
    db.to_sql(top_cities, 'top20')
    return top_cities
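The Counter step above tallies how often each 'city, state' string lands in a job pair's top-n list. A self-contained sketch of just that step, with made-up data:

import collections

# made-up input: each string is one top-n appearance
cities = ['Austin, TX', 'Boston, MA', 'Austin, TX']
c = collections.Counter(cities)
city = [s.split(', ')[0] for s in c]   # ['Austin', 'Boston']
state = [s.split(', ')[1] for s in c]  # ['TX', 'MA']
counts = list(c.values())              # [2, 1]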
Example #6
File: indeed.py Project: rpinho/happy.li
def get_salaries_for_job(job, table='salary', verbose_=True, save_=True):

    columns = ['job', 'city', 'state', 'salary']
    df = pd.DataFrame(columns=columns)

    # get only the cities that actually have postings for the job
    cities = db.get_cities_for_job(job)

    for city, state in cities.values:
        if verbose_:
            print(city, state)
        if db.queryNotInDb(job, city, state, table):
            df = scrape_indeed(job, city, state, df)
            if verbose_:
                print(df.tail(1))
            if save_:
                db.to_sql(df.tail(1), table, 'append', null=0)

    return df
Example #7
def get_cnn_money_best_small_towns():
    name = 'cnn_money_best_small_towns_2012'
    url = 'http://money.cnn.com/magazines/moneymag/best-places/2013/full_list/'
    # NOTE: dict ordering is not guaranteed on older Pythons; columns are re-ordered below
    classes = {"rank listCol1": 'rank',
               "town listCol2": 'town',
               "population-rounded listCol3": 'population',
               "medhomeprice-visible": 'medhomeprice',
               "medfaminc-visible": 'medfaminc',
               "jobgrowth-visible": 'jobgrowth',
               "readingTestScores-visible": 'readingTestScores',
               "mathTestScores-visible": 'mathTestScores',
               "personalCrime-visible": 'personalCrime',
               "propertyCrime-visible": 'propertyCrime',
               "state": 'state',
               "region": 'region'}
    session = requests.session()
    response = session.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')  # explicit parser avoids bs4's warning
    d = {}
    for class_, column in classes.items():
        tags = soup.findAll('span', {'class':class_})
        matches = re.findall(r'"%s">(.*?)</span>'%class_, str(tags))
        d[column] = matches

    # pandas dataframe
    df = pd.DataFrame(d)

    # strip symbols
    df.jobgrowth = df.jobgrowth.str.strip('%')
    df.medfaminc = df.medfaminc.str.strip('$').str.replace(',', '')
    df.medhomeprice = df.medhomeprice.str.strip('$').str.replace(',', '')
    df.population = df.population.str.replace(',', '')
    # strip the trailing ', ST' from town (df already has a 'state' column)
    df.town = df.town.str.replace(r',(.*)', '', regex=True)

    # re-order columns
    columns = ['rank','town','state','population','medhomeprice','medfaminc',
               'jobgrowth','readingTestScores','mathTestScores','personalCrime',
               'propertyCrime','region']
    df = df[columns]

    # MySQL
    db.to_sql(df, name, 'replace')

    return df
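The class-to-column loop above regexes over str(tags) to pull out the span text. A self-contained sketch with made-up HTML, showing that tag.get_text() recovers the same strings more directly:

import re
from bs4 import BeautifulSoup

html = '<span class="rank listCol1">1</span><span class="rank listCol1">2</span>'
soup = BeautifulSoup(html, 'html.parser')
tags = soup.findAll('span', {'class': 'rank listCol1'})
print(re.findall(r'"rank listCol1">(.*?)</span>', str(tags)))  # ['1', '2']
print([tag.get_text() for tag in tags])                        # ['1', '2']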
Example #8
def scrape_payscale_for_salary(url, save_=False):
    session = requests.session()
    response = session.get(url)

    # TODO: solve 'latin' to utf problems
    if save_:
        d = {'url':[url],
             'text':[response.text],
             'date':[time.strftime('%Y-%m-%d %H:%M:%S')]}
        db.to_sql(pd.DataFrame(d), 'payscale', 'append')

    soup = BeautifulSoup(response.text, 'html.parser')
    # divs whose CSS class matches either label variant
    attrs = ["you_label", "median_only_you_label"]
    tags = soup.findAll('div', attrs)
    print(tags)
    if tags:
        matches = re.findall(r'([0-9]*,*[0-9]*)\r', str(tags))[1::2]
        # locale.atoi drops thousands separators (needs a locale that defines them)
        return [locale.atoi(match) for match in matches]
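locale.atoi in the return above only strips thousands separators once a locale that defines them is active. A minimal illustration, assuming the en_US.UTF-8 locale is installed on the system:

import locale

# assumes en_US.UTF-8 is available; atoi then parses '52,000' -> 52000
locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8')
print(locale.atoi('52,000'))  # 52000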
Example #9
File: visas.py Project: rpinho/happy.li
def scrape_and_write_visa_table(route, table, year=2013, url=VISA, write_=True):
    url = '-'.join((url, str(year)))
    url = '/'.join((url, route))
    df = pd.read_html(url, header=0)[0]  # infer_types was removed from pandas; types are cast below
    df.iloc[:,0] = df.iloc[:,0].astype(str)
    df.iloc[:,1] = df.iloc[:,1].apply(mk_int)
    #jobs.iloc[:,1] = jobs.iloc[:,1].astype(int)

    # clean data
    if 'jobs' in table:
        df = clean_visa_jobs_names(df)

    df['Year'] = year
    if write_:
        db.to_sql(df, table, 'replace')

    return df
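A hedged call sketch for the function above; the route string is a placeholder, and VISA, mk_int, clean_visa_jobs_names, and db come from visas.py's module globals:

# hypothetical usage -- 'h-1b-jobs' is a made-up route segment
import visas

df = visas.scrape_and_write_visa_table('h-1b-jobs', 'visa_jobs', year=2013,
                                       write_=False)
print(df.head())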
Example #10
def get_country_codes_from_wiki():
    name = 'country_codes'
    #url = 'https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2'
    fname = 'ISO_3166.xlsx'
    df = pd.read_excel(fname, 'Sheet1', header=0, skiprows=0)
    # select columns
    df = df[['Code', 'Country name', 'ccTLD']]
    df.columns = [u'Code', u'Country', u'ccTLD']
    # replace 'GB'/'.gb' with 'UK'/'.co.uk' via .loc (chained indexing writes to a copy)
    df.loc[81, 'Code'] = u'UK'
    df.loc[81, 'ccTLD'] = u'.co.uk'
    df = df[~df.Country.isnull()]
    # Namibia's code is the string 'NA'; df.iloc[159].Code = 'NA' would assign to a
    # temporary row copy and be silently lost, so index into the frame directly
    df.iloc[159, df.columns.get_loc('Code')] = 'NA'

    db.to_sql(df, name, 'replace')

    return df
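The .iloc fix above matters because chained indexing like df.iloc[159].Code = ... goes through a temporary row copy. A self-contained demonstration with a toy frame:

import pandas as pd

df = pd.DataFrame({'Code': ['AA', 'BB'], 'n': [1, 2]})
# chained form: df.iloc[1].Code = 'NA' writes to a row copy and is not
# guaranteed to reach df (a silent no-op under copy-on-write)
df.iloc[1, df.columns.get_loc('Code')] = 'NA'  # always writes into the frame
print(df.Code.tolist())  # ['AA', 'NA']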
Example #11
def get_mercer_quality_of_living(sheetname='quality_of_living'):
    # url = 'http://www.mercer.com/press-releases/quality-of-living-report-2012'
    fname = 'mercer-quality-of-living-report-2012.xlsx'
    name = '_'.join(('mercer', sheetname, '2012'))
    df = pd.read_excel(fname, sheetname, header=1, skiprows=1)
    # create new column 'State'
    df['State'] = pd.Series([mdb.NULL]*len(df), index=df.index)
    # get american states and split city, state
    us_i = df.query('Country == "United States"').index
    states = list(df.City[us_i].str.split(', ').apply(lambda x: x[1]).values)
    #states = ['HI', 'CA', 'MA', 'IL', 'DC', 'NY', 'WA', 'PA', 'MI']
    # .loc avoids chained-assignment writes to a copy
    df.loc[us_i, 'State'] = states
    df.loc[us_i, 'City'] = df.City[us_i].str.replace(r', (..)', '', regex=True)
    # replace NaN with MySQL NULL
    #mdb.FIELD_TYPE.NULL
    #df.State.fillna(mdb.NULL, inplace=True)

    # MySQL
    db.to_sql(df, name, 'replace')

    return df
Example #12
File: indeed.py Project: rpinho/happy.li
def indeed_api(job, city, state, table='postings', maxResults=1001,
               source='backend', params=params, save_=True):

    # columns to keep
    columns = [u'jobkey', u'job', u'jobtitle', u'company',
               u'city', u'state', u'formattedLocation', u'country',
               u'date', u'formattedRelativeTime',
               u'latitude', u'longitude', u'url']

    # columns to add
    columns += ['indeed_city', 'source', 'timestamp']

    # query
    location = ', '.join((city, state))
    params['l'] = location
    params['q'] = job

    # first page of results (n=25)
    params['start'] = 0
    session = requests.session()
    response = session.get(API, params=params)
    df = pd.DataFrame(response.json()['results'])

    if df.empty:
        return None

    df['job'] = job
    df['indeed_city'] = df['city']
    df['city'] = city
    df['source'] = source
    df['timestamp'] = pd.Timestamp.utcnow()  # pd.tslib was removed; Timestamp lives on pd

    # pull the leading day count out of strings like '30+ days ago'
    df.formattedRelativeTime = [
        int(x[0]) for x in df.formattedRelativeTime.str.findall(r'[0-9].')]

    if save_:
        db.to_sql(df[columns], table, 'append')

    totalResults = response.json()['totalResults']
    print(totalResults)

    print([response.json().get(k) for k in (u'start', u'end', u'pageNumber')])

    # next pages of results
    limit = params['limit']
    if totalResults <= limit:
        return None

    for start in range(limit, totalResults+limit+1, limit):

        if start > maxResults:
            return None

        params['start'] = start
        response = session.get(API, params=params)
        df = pd.DataFrame(response.json()['results'])

        if df.empty:
            return None

        df['job'] = job
        df['indeed_city'] = df['city']
        df['city'] = city
        df['source'] = source
        df['timestamp'] = pd.Timestamp.utcnow()

        df.formattedRelativeTime = [
            int(x[0]) for x in df.formattedRelativeTime.str.findall(r'[0-9].')]

        if save_:
            db.to_sql(df[columns], table, 'append')

        print([response.json().get(k) for k in (u'start', u'end', u'pageNumber')])
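The pagination loop above advances 'start' in steps of params['limit'] until it passes totalResults or maxResults. A standalone sketch of just the offset arithmetic, with made-up numbers:

# made-up numbers: 25 results per page, 110 total results, generous cap
limit, totalResults, maxResults = 25, 110, 1001
starts = [s for s in range(limit, totalResults + limit + 1, limit)
          if s <= maxResults]
print(starts)  # [25, 50, 75, 100, 125]; the request past 110 comes back empty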