Example #1
def fetch_production(country_code='CR', session=None):
    # Do not reuse an existing session, as some amount of caching takes place
    r = requests.session()
    url = 'https://appcenter.grupoice.com/CenceWeb/CencePosdespachoNacional.jsf'
    response = r.get(url)
    df_yesterday = pd.read_html(response.text, skiprows=1, index_col=0, header=0)[0]

    soup = BeautifulSoup(response.text, 'html.parser')
    yesterday_date = soup.select('#formPosdespacho:pickFechaInputDate')[0]['value']
    jsf_view_state = soup.select('#javax.faces.ViewState')[0]['value']

    yesterday = arrow.get(yesterday_date, 'DD/MM/YYYY', tzinfo=TIMEZONE)
    today = yesterday.shift(days=+1)

    data = [
        ('formPosdespacho', 'formPosdespacho'),
        ('formPosdespacho:pickFechaInputDate', today.format(DATE_FORMAT)),
        ('formPosdespacho:pickFechaInputCurrentDate', today.format(MONTH_FORMAT)),
        ('formPosdespacho:j_id35.x', ''),
        ('formPosdespacho:j_id35.y', ''),
        ('javax.faces.ViewState', jsf_view_state),
    ]
    response = r.post(url, cookies={}, data=data)
    df_today = pd.read_html(response.text, skiprows=1, index_col=0)[0]

    ydata = df_to_data(country_code, yesterday, df_yesterday)
    tdata = df_to_data(country_code, today, df_today)
    production = ydata + tdata
    unknown_plants()

    return production
Example #2
def get_stats(year, level='pro'): #TODO Switch to regex patterns
    '''Scrapes draftexpress.com/stats for a given level and year'''
    front = 'http://www.draftexpress.com/stats.php?sort=8&q='
    pages = 2
    frontb = '&league=NBA&year=20'
    if level == 'col':
        frontb = '&league=NCAA&year=20'
        pages = 13
    midA = '&per=per40pace&qual=prospects&sort2=DESC&pos=all&stage=all&min=10&conference=All&pageno='
    back = '&sort=8'
    url = front + frontb + year + midA+ '0' + back
    reg = pd.DataFrame()
    eff = pd.DataFrame()
    for n in range(pages):
        url = front + frontb + year + midA+ str(n) + back
        eff_url = front + 'eff'+ frontb + year + midA+ str(n) + back
        reg_temps = pd.read_html(url, header=0)
        reg_temp = reg_temps[5]
        eff_temps = pd.read_html(eff_url)
        eff_temp = eff_temps[5]
        eff_temp.to_csv('temp.csv')
        eff_temp = pd.read_csv('temp.csv', header=3) #im ashamed
        reg = reg.append(reg_temp)
        eff = eff.append(eff_temp)
    reg['year'] = 2000 + float(year)
    eff['year'] = 2000 + float(year)
    df = reg.merge(eff, how='inner', on='Name', suffixes=('', '_y'))
    df = df.drop(['Cmp', 'Team_y', 'year_y', 'Min_y', 'Cmp_y', 'GP_y'], 1)
    print(df.shape)
    return df
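The temp-file round trip above (flagged by the '#im ashamed' comment) exists only so the table can be re-read with its header taken from a later row. pandas can usually do this directly through the skiprows and header arguments of read_html. A minimal sketch of that alternative; the skiprows value and the table index 5 are assumptions carried over from the example and should be verified against the live page:

import pandas as pd

def read_eff_table(eff_url):
    # Hypothetical replacement for the CSV round trip: skip the leading
    # decoration rows and take the next row as the header.
    tables = pd.read_html(eff_url, skiprows=2, header=0)
    return tables[5]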
Example #3
def get_html_dfs(stryear, strmonth):
    year = int(stryear)
    month = int(strmonth)
    monthly_file = "./" + stryear + "_" + strmonth + ".html"
    try:
        with open (monthly_file, 'r') as mf:
            dfs = pd.read_html(monthly_file, encoding='utf-8')
            print ("read html file successfully")
            return dfs
    except Exception as e:
        print(e)
        if year > 1990:
            year -= 1911
    
        url = 'http://mops.twse.com.tw/nas/t21/sii/t21sc03_'+str(year)+'_'+str(month)+'_0.html'
        if year <= 98:
            url = 'http://mops.twse.com.tw/nas/t21/sii/t21sc03_'+str(year)+'_'+str(month)+'.html'
    
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    
        r = requests.get(url, headers=headers)
        r.encoding = 'big5'
        print ("fetch html file successfully")
    
        with codecs.open( monthly_file, mode='wb') as writefile:
            writefile.write(r.text.encode('utf8'))
        dfs = pd.read_html(StringIO(r.text), encoding='big-5')
        return dfs
Example #4
    def get_box_stats(self, url):
        """
        INPUT: NCAAScraper, STRING
        OUTPUT: DATAFRAME, DATAFRAME, DATAFRAME

        Extract html from box stats page and convert to dataframes

        url is a string linking to the box stats page
        """
        soup = self.page_opener.open_and_soup(url)
        tables = soup.findAll('table', {'class': 'mytable'})
        if len(tables) != 3:
            print('Incorrect number of tables')
            return None

        htable = pd.read_html(str(tables[0]), header=0)[0]
        table1 = pd.read_html(str(tables[1]), skiprows=1, header=0, infer_types=False)[0]
        table2 = pd.read_html(str(tables[2]), skiprows=1, header=0, infer_types=False)[0]

        team1 = htable.iloc[0, 0]
        team2 = htable.iloc[1, 0]
        table1['Team'] = [team1] * table1.shape[0]
        table2['Team'] = [team2] * table2.shape[0]
        table1['game_id'] = [self.game_id(url)] * table1.shape[0]
        table2['game_id'] = [self.game_id(url)] * table2.shape[0]

        # older box stat page versions use different column names so
        # we must map them all to common column names (e.g. MIN vs. Min)
        table1 = self.rename_box_table(table1)
        table2 = self.rename_box_table(table2)
        table1 = self.format_box_table(table1)
        table2 = self.format_box_table(table2)
        
        return htable, table1, table2
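The comment in get_box_stats notes that older box-score pages use different column headers (e.g. MIN vs. Min), which rename_box_table maps onto a common set. That helper is not part of this example; a minimal sketch of what such a normalization step could look like, with a purely hypothetical column map:

def rename_box_table_sketch(table):
    # Hypothetical mapping only; the real NCAAScraper map is not shown here.
    column_map = {'MIN': 'Min', 'Tot Reb': 'Reb', 'PTS': 'Pts'}
    return table.rename(columns=column_map)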
Example #5
def make_USrepresentative_df():
    representative_df = pd.DataFrame()

    df = pd.read_html(URLS['dem_USrepresentative'])[0]
    df.columns = ['county', 'candidate1', 'candidate2',
                  'candidate3', 'candidate4', 'candidate5', 'candidate6']
    df['county'] = df['county'].fillna('') 
    splits = df[df.county.str.startswith('DISTRICT')].index.tolist()
    splits.append(df.shape[0])
    
    for split in range(len(splits) - 1):
        df_ = df.iloc[splits[split]:splits[split+1]]
        df_ = df_.drop(df_.index[0])
        df_.columns = df_.iloc[0]
        df_ = df_.drop(df_.index[0])
        df_.columns = ['county'] + list(df_.columns[1:])
        df_ = df_.dropna(subset=[df_.columns.values[1]])
        df_ = df_.dropna(axis=1)
        
        df_ = pd.melt(df_, id_vars=['county'], value_vars=list(df_.columns[1:]))
        df_.columns = ['county', 'candidate', 'votes']
        df_ = df_[df_['county'] != '']


        df_['party'] = 'Democratic'
        df_['candidate'] = df_['candidate'].str.lstrip('*')
        df_['candidate'] = df_['candidate'].str.replace(r'\((.*?)\)', '', regex=True)
        df_['candidate'] = df_['candidate'].str.rstrip('()')
        df_['office'] = 'US Representative'
        representative_df = representative_df.append(df_)

    df = pd.read_html(URLS['rep_USrepresentative'])[0]
    df.columns = ['county', 'candidate1', 'candidate2',
                  'candidate3', 'candidate4', 'candidate5']
    df['county'] = df['county'].fillna('') 
    splits = df[df.county.str.startswith('DISTRICT')].index.tolist()
    splits.append(df.shape[0])
    
    for split in range(len(splits) - 1):
        df_ = df.iloc[splits[split]:splits[split+1]]
        df_ = df_.drop(df_.index[0])
        df_.columns = df_.iloc[0]
        df_ = df_.drop(df_.index[0])
        df_.columns = ['county'] + list(df_.columns[1:])
        df_ = df_.dropna(subset=[df_.columns.values[1]])
        df_ = df_.dropna(axis=1)
        
        df_ = pd.melt(df_, id_vars=['county'], value_vars=list(df_.columns[1:]))
        df_.columns = ['county', 'candidate', 'votes']
        df_ = df_[df_['county'] != '']


        df_['party'] = 'Republican'
        df_['candidate'] = df_['candidate'].str.lstrip('*')
        df_['candidate'] = df_['candidate'].str.replace(r'\((.*?)\)', '', regex=True)
        df_['candidate'] = df_['candidate'].str.rstrip('()')
        df_['office'] = 'US Representative'
        representative_df = representative_df.append(df_)

    return representative_df
Example #6
def read_ema_ings():
    with open('all_ingredients.html', 'rb') as f_in:
        text_ings = f_in.read()
    df_sort = pd.read_html(text_ings)[0]
    df_sort.columns = ['name', 'sort_name', 'form']
    df_sort = df_sort.sort_values('name')
    df_sort.reset_index(drop=True, inplace=True)

    cols_ings = ['ingredient', 'function', 'form', 'id_test', 'assay']
    df_type = ['high_suscept', 'low_suscept', 'high_suscept_id', 'pending']

    with open('ema_ingredients.html', 'rb') as f_in:
        text_ings = f_in.read()

    dfs = pd.read_html(text_ings)
    for i, df_ in enumerate(dfs):
        df_.columns = cols_ings
        df_['type'] = df_type[i]
    all_dfs = pd.concat(dfs)
    all_dfs = all_dfs.sort_values('ingredient')
    all_dfs = all_dfs.drop_duplicates('ingredient')
    all_dfs.reset_index(drop=True, inplace=True)
    all_dfs['sort_name'] = df_sort['sort_name']
    all_dfs = all_dfs.sort_values('sort_name')
    all_dfs.reset_index(drop=True, inplace=True)
    return all_dfs
Example #7
File: robot.py  Project: gsjo/RoboAlbert
def pulldata():
    #  Schedule, odds and TV listing DataFrame
    da = pd.read_html('https://www.teamrankings.com/ncaa-basketball/team/florida-gators')[1]
    db = pd.read_html('http://stats.gatorsports.com/cbk/teamstats.asp?team=210&report=schedule',
                  header=0, parse_dates=False, attrs={'class': 'shsTable shsBorderTable'})[0]['TV']
    df = pd.concat([da, db], axis=1)
    df = df.set_index('Date')
    return df
Example #8
    def test_processed_table(self):
        pd.testing.assert_series_equal(
            pd.read_html(test_two_d.get_result().table)[0]["X-Coordinate"],
            two_d_result.table["X-Coordinate"]
        )

        pd.testing.assert_series_equal(
            pd.read_html(test_two_d.get_result().table)[0]["Y-Coordinate"],
            two_d_result.table["Y-Coordinate"]
        )
Example #9
def get_fangraph_pitchers():
    # get al pitchers
    al = pd.read_html('http://www.fangraphs.com/dailyprojections.aspx?pos=all&stats=pit&type=sabersim&team=0&lg=al&players=0')
    fgpal = al[15]
    sleep(2)
    # get nl pitchers
    nl = pd.read_html('http://www.fangraphs.com/dailyprojections.aspx?pos=all&stats=pit&type=sabersim&team=0&lg=nl&players=0')
    fgpnl = nl[15]
    # merge and return
    fgp = fgpal.append(fgpnl)
    return(fgp)
Example #10
def get_cruz():
    cruzh = pd.read_html('https://rotogrinders.com/pages/c-r-u-z-mlb-model-792518')
    czh = cruzh[1]   
    
    cruzp = pd.read_html('https://rotogrinders.com/pages/c-r-u-z-mlb-model-792521')
    czp = cruzp[1]
    
    czh['cruz'] = czh['\tRating\t']
    czp['cruz'] = czp['\tRating\t']
    
    return(czh, czp)
Example #11
def get_household_income_from_orlando_sentinel():
    name = 'city_median_income'
    url = 'http://databases.sun-sentinel.com/Orlando/ftlaudOS2011income/income2011_list.php'
    df = pd.read_html(url)[6][:-1]
    for i in range(2,28):
        print(i)
        df = df.append(pd.read_html(url + '?goto=%d'%i)[6][:-1])
    df.columns = ['0', 'City', 'State', 'Median_Income', '4']
    df = df[['City', 'State', 'Median_Income']]
    df.Median_Income = df.Median_Income.str.strip('$').apply(locale.atof)
    return df
Example #12
def get_fangraph_batters():
    poslist = ['c','1b','2b','ss','3b','rf','cf','lf','dh']   
    df = pd.DataFrame()
    for pos in poslist:
        tmp = pd.read_html('http://www.fangraphs.com/dailyprojections.aspx?pos='+pos+'&stats=bat&type=sabersim&team=0&lg=al&players=0')
        df = df.append(tmp[15])
        sleep(2)
        tmp2 = pd.read_html('http://www.fangraphs.com/dailyprojections.aspx?pos='+pos+'&stats=bat&type=sabersim&team=0&lg=nl&players=0')
        df = df.append(tmp2[15])
        sleep(2)
    return(df)
Example #13
def main():
    """Main execution."""

    # Determine command line arguments.
    try:
        rawopts, _ = getopt.getopt(sys.argv[1:], 'i:o:')
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    opts = {}

    # Process each command line argument.
    for o, a in rawopts:
        opts[o[1]] = a

    # The following arguments are required in all cases.
    for opt in ['i', 'o']:
        if not opt in opts:
            usage()
            sys.exit(2)

    # Make sure the output directory exists.
    if not os.path.exists(opts['o']):
        os.makedirs(opts['o'])

    # Traverse the root folder that contains sub folders
    #  that represent each pitcher.
    for root, dirs, _ in os.walk(opts['i']):
        # Traverse each folder in the root.
        for pid in dirs:
            outfile = os.path.join(opts['o'], pid + ".csv")

            # Check if this pitcher was already processed.
            if os.path.isfile(outfile):
                continue

            for proot, _, files in os.walk(os.path.join(root, pid)):
                try:
                    # Read in the first game for this pitcher.
                    with open(os.path.join(proot, files[0]), 'r') as f:
                        df = pd.read_html(f.read(), header=0)[0]
                    # Read in the subsequent games and append to the
                    #  running DataFrame.
                    for file in files[1:]:
                        with open(os.path.join(proot, file), 'r') as f:
                            df = df.append(pd.read_html(f.read(), header=0)[0])
                    # Save to disk as a csv file.
                    df.to_csv(outfile)
                except ValueError:
                    print("Error processing " + pid)
                    continue
Example #14
File: nba_predict.py  Project: siyu/pbox
def predict_by_stats(games=[]):
    scores = get_team_scores(team_scores_url)
    num_scores = len(scores)
    team_stats = pd.read_html(team_misc_stats_url, header=1)[0].iloc[:-1, :]
    team_stats['Team'] = [t.strip('*') for t in team_stats['Team'].values]

    scores['home-away'] = scores['PTS.1'] - scores['PTS'] - home_court_advantage # home court adv = 2 pts

    param_columns = team_stats.columns[13:21].tolist()  # starts from column eFG%
    param_columns.remove('FT/FGA')
    param_columns.remove('FT/FGA.1')
    num_params = len(param_columns)
    x = np.zeros([num_scores, num_params])

    for idx, row in scores.iterrows():
        home = row['Home/Neutral']
        away = row['Visitor/Neutral']
        x[idx] = team_stats.loc[team_stats['Team'] == home][param_columns].values - \
                 team_stats.loc[team_stats['Team'] == away][param_columns].values

    x = pd.DataFrame(x, columns=param_columns)
    y = scores['home-away']
    model = sm.OLS(y, x)
    result = model.fit()
    print(result.summary())
    print()

    team_ranking = pd.read_html(team_ranking_url, header=1)[0]
    game_spreads = {} #get_game_spreads()

    print('{:22s} - {:22s} =  {:7s} | {:7s} | {:6s} | {:6s} | {:6s}'.format('home', 'away', 'fit mov', 'ref mov',
                                                                            'spread', 'vs fit', 'vs mov'))
    for [home, away] in games:
        fit_mov = sum(result.params * (
            team_stats.loc[team_stats['Team'] == home][param_columns].values -
            team_stats.loc[team_stats['Team'] == away][
                param_columns].values)[0]) + home_court_advantage
        mov = team_ranking.loc[team_stats['Team'] == home]['MOV/A'].values - \
              team_ranking.loc[team_stats['Team'] == away]['MOV/A'].values + 2

        home_spread = -999
        for k, v in game_spreads.items():
            if home.find(k) > -1:
                home_spread = v * -1

        print('{:22s} - {:22s} =  {:7.1f} | {:7.1f} | {:6.1f} | {:>6s} | {:>6s}'.format(home, away, fit_mov, mov[0],
                                                                                      home_spread,
                                                                                      'home' if fit_mov > home_spread else 'away',
                                                                                      'home' if mov > home_spread else 'away'
                                                                                      ))
Example #15
def getpe_google(stocks):
    pe_list = dict()  
    pe_list['ticker'] = []
    pe_list['value'] = []
    for ticker in stocks:
        try:
            key_statistics = pd.read_html('https://www.google.com/finance?q=' + str(ticker) + '&ei')
        except:
            key_statistics = pd.read_html('https://www.google.com/finance?q=NYSEARCA%3A' + str(ticker) + '&ei')
        convert = key_statistics[0][1][5:6] 
        input = convert.tolist()  
        pe_list['ticker'].append(ticker) 
        pe_list['value'].extend(input) 
    return(pe_list)
Example #16
def scrape_mock(year):
    '''Scrapes a mock draft off the web in a weird format'''
    url = MOCK_URL + str(year) + '/list/'
    crap = pd.read_html(url, header=0, match='First Round')
    first_round = crap[-1]
    crap = pd.read_html(url, header=0, match='Second Round')
    second_round = crap[-1]
    first_round.columns = ['pick', 'year', 'details']
    second_round.columns = ['pick', 'year', 'details']
    second_round['pick'] = second_round['pick'] + 30
    mock_draft = first_round.append(second_round)
    mock_draft['year'] = year
    mock_draft = mock_draft.set_index('pick')
    mock_draft['pick'] = mock_draft.index
    return mock_draft
Example #17
def crawler(start, difference, maximum):
    try:
        result = pd.DataFrame()
        origin = "http://fraunhofer-repdose.de/repdose/"
        parameter1 = start
        parameter2 = parameter1 + difference
        if parameter2 > maximum:
            parameter2 = maximum
        with tqdm(total=math.ceil((maximum-start)/difference)) as pbar:              
            while parameter1 < maximum:
                target = 'http://fraunhofer-repdose.de/repdose/query.php?cas_where=&cas_string=&cas_show=on&species=' \
                         '&species_show=on&organ=&organ_show=on&name=&name_show=on&s_sex=&ssex_show=on&effect=&effect_show=' \
                         'on&route=&route_show=on&e_sex=&esex_show=on&boilingpoint_c=&boilingpoint_show=on&duration_from=' \
                         '&duration_to=&duration_show=on&eloel_mg_from=&eloel_mg_to=&eloel_mg_show=on&watersolubility_c=&watersolubility_show' \
                         '=on&noel_mg_from=&noel_mg_to=&noel_mg_show=on&logpow_c=&logpow_show=on&loel_mg_from=&loel_mg_to=&loel_mg_show' \
                         '=on&pressure_c=&pressure_show=on&reliabilityA=on&reliabilityB=on&mol_from='+str(parameter1)+'&mol_to='+str(parameter2)+'&molweight_show=on&reference_show=0'
                page = requests.get(target).text
                if "Please restrict query conditions." in page:
                    print(str(parameter1)+":error")
                elif "Page" in page:
                    lists = []
                    bsObj = BeautifulSoup(page, 'lxml')
                    found_a = bsObj.find_all('a')
                    for item in found_a:
                        found_href = item.get('href')
                        if "query.php" in found_href:
                            lists.append(found_href)
                    for i in lists:
                        html = origin + i
                        r_page = requests.get(html).text
                        table = pd.read_html(r_page)[0]
                        table.drop([0,1], inplace=True)
                        result = pd.concat([result,table])
                else:
                    table = pd.read_html(page)[0]
                    table.drop([0,1], inplace=True)
                    result = pd.concat([result,table])
                parameter1 = parameter2
                parameter2 += difference
                if parameter2 > maximum:
                    parameter2 = maximum
                time.sleep(0.5)
                pbar.update(1)
    finally:
        get_c_name = pd.read_html(page)[0]
        c_name = get_c_name.iloc[1,:]
        result.rename(columns=c_name, inplace=True)
        result.to_csv("result_" + str(maximum) + ".csv", index=False)
Example #18
def dfmaker(pagetitle):

    # Get page HTML and find tables
    wikipage = wikipedia.page(title=pagetitle)
    alltables = BeautifulSoup(wikipage.html(), 'html.parser').find_all('table')

    # Keep tables that have show results
    rtabs = [pd.read_html(str(t), header=0, encoding='utf-8')[0]
             for t in alltables if t.th.text == ' No.\n']
    yeares = pd.concat(rtabs)

    # Clean up dataframe
    newnames = {'Original air date[3]': 'airdate',
                'Original air date': 'airdate',
                'Runner-up': 'Runnerup',
                'Last place': 'Lastplace',
                'No.': 'shownum'}
    yeares.rename(columns=newnames,
                  inplace=True)

    qrem = ['Winner', 'Runnerup', 'Lastplace']
    yeares[qrem] = yeares[qrem].replace(regex='["]', value='')
    yeares.airdate = yeares.airdate.str[:-13]

    results = yeares[pd.notnull(yeares['Winner'])]  # drop rows that are not results

    return results
Example #19
def _today_ticks(symbol, tdate, pageNo, retry_count, pause):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            html = lxml.html.parse(ct.TODAY_TICKS_URL % (ct.P_TYPE['http'],
                                                         ct.DOMAINS['vsf'], ct.PAGES['t_ticks'],
                                                         symbol, tdate, pageNo
                                ))  
            res = html.xpath('//table[@id=\"datatbl\"]/tbody/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            sarr = sarr.replace('--', '0')
            df = pd.read_html(StringIO(sarr), parse_dates=False)[0]
            df.columns = ct.TODAY_TICK_COLUMNS
            df['pchange'] = df['pchange'].map(lambda x : x.replace('%', ''))
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
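Examples #19 through #23 (and #28) all follow the same pattern: select table rows with XPath, serialize them back to markup, wrap the fragment in a <table> element, and hand the string to pd.read_html. A standalone sketch of that pattern; the URL argument and the default row XPath below are placeholders, not part of any specific example:

import pandas as pd
from io import StringIO
from lxml import etree
from lxml import html as lxml_html

def table_fragment_to_df(url, row_xpath='//table[@id="datatbl"]/tbody/tr'):
    # Parse the page, keep only the rows of interest, and rebuild a minimal
    # <table> so pandas can parse the fragment.
    doc = lxml_html.parse(url)
    rows = doc.xpath(row_xpath)
    fragment = ''.join(etree.tostring(row).decode('utf-8') for row in rows)
    return pd.read_html(StringIO('<table>%s</table>' % fragment))[0]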
Example #20
def _get_report_data(year, quarter, pageNo, dataArr, orderby):
    ct._write_console()
    try:
        request = Request(ct.REPORT_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'],
                                           year, quarter, pageNo, ct.PAGE_NUM[1], orderby))
        # The default sort order returned duplicate and missing records, so the orderby parameter was added
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df = df.drop(11, axis=1)
        df.columns = ct.REPORT_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_report_data(year, quarter, pageNo, dataArr,orderby)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #21
def _parse_fq_data(url, index, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows = [0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == np.object:
                df['date'] = df['date'].astype(np.datetime64)
            df = df.drop_duplicates('date')
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example #22
def _get_forecast_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        gparser = etree.HTMLParser(encoding='GBK')
        html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], 
                                                ct.PAGES['fd'], year, quarter, pageNo,
                                                ct.PAGE_NUM[1]),
                               parser=gparser)
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('--', '0')
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+',nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #23
def _newstocks(data, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_STOCKS_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'],
                         ct.PAGES['newstock'], pageNo))
            res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
            if len(res) == 0:
                return data
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = sarr.replace('<font color="red">*</font>', '')
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
            df = df.drop([df.columns[idx] for idx in [12, 13, 14]], axis=1)
            df.columns = rv.NEW_STOCKS_COLS
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            df['xcode'] = df['xcode'].map(lambda x : str(x).zfill(6))
            res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()')
            tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8')
            hasNext = True if tag in res else False 
            data = data.append(df, ignore_index=True)
            pageNo += 1
            if hasNext:
                data = _newstocks(data, pageNo, retry_count, pause)
        except Exception as ex:
            print(ex)
        else:
            return data 
Example #24
 def extract_coinmarketcap(self, coin, coin_col=False):
     """Retrieve basic historical information for a specific cryptocurrency from coinmarketcap.com
     
     Parameters
     ----------
     coin : the name of the cryptocurrency (e.g. 'bitcoin', 'ethereum', 'dentacoin')
     coin_col : whether to include the coin name as a column
         (default is False i.e. the column is not included)
         
     Returns
     -------
     pandas Dataframe
     """
     try:
         output = pd.read_html("https://coinmarketcap.com/currencies/{}/historical-data/?start={}&end={}".format(
             coin, self.from_date.replace("-", ""), self.to_date.replace("-", "")))[0]
     except Exception as e:
         return pd.DataFrame({"error":e}, index=[0])
     output = output.assign(Date=pd.to_datetime(output['Date']))
     for col in output.columns:
         if output[col].dtype == np.dtype('O'):
             output.loc[output[col]=="-",col]=0
             output[col] = output[col].astype('int64')
     output.columns = [col.lower() for col in output.columns]
     if coin_col:
         output['coin'] = coin
     return output
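extract_coinmarketcap is a method, but its host class is not shown in this example; the body only relies on from_date and to_date attributes formatted as 'YYYY-MM-DD'. A minimal usage sketch with a stand-in class, assuming the function above is available at module level (the class name and the dates are assumptions):

import numpy as np   # numpy / pandas are needed by the method above
import pandas as pd

class _CoinScraper:
    # Stand-in host object exposing only the attributes the method reads.
    def __init__(self, from_date, to_date):
        self.from_date = from_date
        self.to_date = to_date

_CoinScraper.extract_coinmarketcap = extract_coinmarketcap
btc = _CoinScraper('2018-01-01', '2018-06-30').extract_coinmarketcap('bitcoin', coin_col=True)
print(btc.head())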
Example #25
def get_quote_yahoojp(code, start=None, end=None, interval='d'):
    base = 'http://info.finance.yahoo.co.jp/history/?code={0}.T&{1}&{2}&tm={3}&p={4}'
    start, end = web._sanitize_dates(start, end)
    start = 'sy={0}&sm={1}&sd={2}'.format(start.year, start.month, start.day)
    end = 'ey={0}&em={1}&ed={2}'.format(end.year, end.month, end.day)
    p = 1
    results = []

    if interval not in ['d', 'w', 'm', 'v']:
        raise ValueError("Invalid interval: valid values are 'd', 'w', 'm' and 'v'")

    while True:
        url = base.format(code, start, end, interval, p)
        tables = pd.read_html(url, header=0)
        if len(tables) < 2 or len(tables[1]) == 0:
            break
        results.append(tables[1])
        p += 1
    result = pd.concat(results, ignore_index=True)
    result.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
    if interval == 'm':
        result['Date'] = pd.to_datetime(result['Date'], format='%Y年%m月')
    else:
        result['Date'] = pd.to_datetime(result['Date'], format='%Y年%m月%d日')
    result = result.set_index('Date')
    result = result.sort_index()
    return result
Example #26
def scrape_nba_results():
    ''' Scrape recent NBA results'''
    url = 'http://www.betexplorer.com/basketball/usa/nba/results/'
    df = pd.read_html(get(url).text)[0]
    homeTeam = df[0].apply(lambda r: str.split(r, sep='-')[0].strip())
    homeTeam[homeTeam == 'Portland Trail Blazers'] = 'Portland Trailblazers'
    homePoints = df[1].apply(lambda r: str.split(r, sep=':')[0].strip())    
    awayTeam = df[0].apply(lambda r: str.split(r, sep='-')[1].strip())
    awayTeam[awayTeam == 'Portland Trail Blazers'] = 'Portland Trailblazers'
    awayPoints = df[1].apply(lambda r: str.split(r, sep=':')[1].strip())
    awayPoints = awayPoints.apply(lambda r: str.split(r, sep='ET')[0].strip())
    dates = df[4].apply(lambda r: datetime.strptime(r, '%d.%m.%Y'))
    # The dates on this website are GMT, so they are one day ahead.
    dates = dates.apply(lambda r: datetime.strftime(r - timedelta(days=1), 
                                                    '%d/%m/%Y'))
    df['Date'] = dates
    df['HomeTeam'] = homeTeam
    df['AwayTeam'] = awayTeam
    df['HomePoints'] = homePoints
    df['AwayPoints'] = awayPoints
    df['HomeWin'] = homePoints > awayPoints
    df = df.ix[:, 5:11]
    
    teams = lookup_teams()
    df['HomeId'] = df.merge(df.merge(teams, left_on='HomeTeam', 
                            right_on='Franchise', sort=False))['TeamId']
    df['AwayId']= df.merge(df.merge(teams, left_on='AwayTeam', 
                            right_on='Franchise', sort=False))['TeamId']
    return df
Example #27
def scrape_best_odds():
    ''' Scrape best odds offered for next round of matches'''
    url = 'http://www.oddschecker.com/basketball/nba'
    df = pd.read_html(get(url).text)[0]
    df = df[pd.notnull(df.ix[:,1])].ix[:,1:3]
    
    def parse_team(string):
        return str.split(string, sep='(')[0].strip()
        
    def parse_odds(string):
        s = string.split(sep='(')[1]
        s = s.replace(')', '')
        f = s.split(sep='/')
        d = 1
        if(len(f) > 1):
            d = float(s.split(sep='/')[1])
        return (float(f[0])/d)+1
        
    df['Date'] = datetime.today().strftime('%d/%m/%Y')
    df['AwayTeam'] = df['1'].apply(parse_team)
    df['HomeTeam'] = df['2'].apply(parse_team)
    df['AwayOdds'] =  df['1'].apply(parse_odds)
    df['HomeOdds'] =  df['2'].apply(parse_odds)
    df['AwayOddsProb'] = 1/df['AwayOdds']
    df['HomeOddsProb'] = 1/df['HomeOdds']
    df = df[['Date', 'HomeTeam', 'AwayTeam', 'HomeOdds', 'HomeOddsProb', 
             'AwayOdds', 'AwayOddsProb']]
    return df
Example #28
def _get_cashflow_data(year, quarter, pageNo, dataArr,
                       retry_count=3, pause=0.001):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(ct.CASHFLOW_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                    ct.PAGES['fd'], year,
                                                    quarter, pageNo, ct.PAGE_NUM[1]))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            text = text.replace('--', '')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@class=\"list_table\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = ct.CASHFLOW_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _get_cashflow_data(year, quarter, pageNo, dataArr)
            else:
                return dataArr
        except Exception as e:
            pass
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example #29
def scrape_model_and_odds():
    model = scrape_model_probs()
    
    for i in range(len(model)):
        home_team = model.HomeTeam[i]
        away_team = model.AwayTeam[i]
        url = _odds_checker_page(home_team, away_team)
        df = pd.read_html(get(url).text)[0]
        
        df0 = df.loc[0:1, bookies.keys()]
        df0.columns = bookies.keys()
        df0.index = [_adjust_portland(df[2][0]), _adjust_portland(df[2][1])]
          
        odds = pd.DataFrame(data = df0.loc[home_team, :].apply(_parse_odds), 
                               index = bookies.keys())  
        odds[away_team] = pd.DataFrame(data = df0.loc[away_team, :].apply(_parse_odds), 
                                    index = bookies.keys())
    
        model.loc[i, 'HomeOdds'] = odds[home_team].max()
        model.loc[i, 'HomeBookie'] = bookies[odds.sort_values(by=home_team).index[-1]]
        model['HomeOddsProb'] = 1 / model.HomeOdds
        model.loc[i, 'AwayOdds'] = odds[away_team].max()
        model.loc[i, 'AwayBookie'] = bookies[odds.sort_values(by=away_team).index[-1]]
        model['AwayOddsProb'] = 1 / model.AwayOdds    
    return model
Example #30
File: jpstock.py  Project: misper/finance
    def get(self, code, start=None, end=None, interval='d'):
        if code in {'N225', 'GSPC', 'IXIC', 'DJI'}:
            start = datetime.datetime.strptime(start, '%Y-%m-%d')
            result = data.DataReader("".join(['^', code]),
                                     'yahoo', start, end)
            return result.asfreq('B')

        base = self._base_url()
        start, end = self._sanitize_dates(start, end)
        start = 'sy={0}&sm={1}&sd={2}'.format(
            start.year, start.month, start.day)
        end = 'ey={0}&em={1}&ed={2}'.format(end.year, end.month, end.day)
        p = 1
        results = []

        if interval not in ['d', 'w', 'm', 'v']:
            raise ValueError(
                "Invalid interval: valid values are 'd', 'w', 'm' and 'v'")

        while True:
            url = base.format(int(code), start, end, interval, p)
            tables = pd.read_html(url, header=0)
            if len(tables) < 2 or len(tables[1]) == 0:
                break
            results.append(tables[1])
            p += 1

        result = pd.concat(results, ignore_index=True)

        result.columns = [
            'Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
        result['Date'] = pd.to_datetime(result['Date'], format='%Y年%m月%d日')
        result = result.set_index('Date')
        result = result.sort_index()
        return result.asfreq('B')
Example #31
#  To avoid unwanted float values
pd.options.display.float_format = '{:,.0f}'.format

url = 'https://www.worldometers.info/coronavirus/'

#  To avoid 403 Error
header = {
    "User-Agent":
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}

r = requests.get(url, headers=header)

df = pd.read_html(r.text)
df = df[0]
df = df[1:212]

df.columns = [
    'Country', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths',
    'TotalRecovered', 'ActiveCases', 'Critical', 'Tot Cases/1M pop',
    'Deaths/1M pop', 'TotalTests', 'Tests/ 1M pop'
]

#  Replace few countries names

df = df.replace(to_replace="UK", value="United Kingdom")
df = df.replace(to_replace='S. Korea', value="South Korea")
df = df.replace(to_replace='UAE', value="United Arab Emirates")
df = df.replace(to_replace='0', value=0)
Example #32
# Add player links to player_links list.
for key, letter_soup in soups.items():
    # First row is a header row, all the rest are players. Create a players list.
    player_rows=letter_soup.table.find_all("tr")[1:]
    counter=0
    
    #Choose only centers that began playing after 1980
    for idx, player in enumerate(player_rows):
        if int(player.td.string) >= 1980 and fnmatch.fnmatch(player.find_all("td")[2].string, "*C*"):
            player_links.append(base_link+player.th.a["href"])
            counter+=1
            
"""Collection of individual player URLs is done, now we begin creating the 'player data' dataframe."""         
        
player_data_df = player_df = pd.read_html(str(BeautifulSoup(urlopen(player_links[0])).table))[0].iloc[:-1, 0:30]
dataframe_index=1

countries=np.array(["albania","andorra","armenia","austria","azerbaijan","belarus","belgium",
                    " bosnia and herzegovina","bulgaria","croatia","cyprus","czech republic",
                    "denmark","estonia","finland","france","georgia","germany","greece","hungary",
                    "iceland","ireland","italy","kazakhstan","kosovo","latvia","liechtenstein",
                    "lithuania","luxembourg","macedonia","malta","moldova","monaco","montenegro",
                    "netherlands","norway","poland","portugal","romania","russia","san marino",
                    "serbia","slovakia","slovenia","spain","sweden","switzerland","turkey",
                    "ukraine","united kingdom","vatican city"])
states=np.empty(shape=len(us_states), dtype=object)
for index, state in enumerate(us_states):
    states[index]=str(state).lower()

for player_link in player_links[1:]: # Creating and adding player dataframes together.
Example #33
for link in soup_level1.find_all(
        'a', id=re.compile("^MainContent_uxLevel2_JobTitles_uxJobTitleBtn_")):

    #Selenium visits each Job Title page
    python_button = driver.find_element_by_id(
        'MainContent_uxLevel2_JobTitles_uxJobTitleBtn_' + str(x))
    python_button.click()  #click link

    #Selenium hands off the source of the specific job page to Beautiful Soup
    soup_level2 = BeautifulSoup(driver.page_source, 'html.parser')

    #Beautiful Soup grabs the HTML table on the page
    table = soup_level2.find_all('table')[0]

    #Giving the HTML table to pandas to put in a dataframe object
    df = pd.read_html(str(table), header=0)

    #Store the dataframe in a list
    datalist.append(df[0])

    #Ask Selenium to click the back button
    driver.execute_script("window.history.go(-1)")

    #increment the counter variable before starting the loop over
    x += 1

    #end loop block

#loop has completed

#end the Selenium browser session
Example #34
def scrape():
    # URL of page to be scraped
    url = "https://mars.nasa.gov/news/"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # print(response.text)


    # Find latest news title about Mars
    news_title = soup.find('div', class_="content_title").text
    news_title


    # Find latest news blurb
    news_p = soup.find('div', class_="rollover_description_inner").text
    news_p

    # * Use splinter to navigate the site and find the image url for the current Featured Mars Image
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    featured_image = browser.find_by_id('full_image')
    featured_image.click()
    time.sleep(5)

    more_info = browser.find_link_by_partial_text('more info')
    more_info.click()

    # Pull featured image url
    html = browser.html
    soupsearch = BeautifulSoup(html, 'html.parser')

    part_image_url = soupsearch.find('img', class_='main_image').get('src')
    featured_image_url = 'https://www.jpl.nasa.gov' + part_image_url
    featured_image_url
    
    # Exit browser
    browser.quit()
    
    # Visit the Mars Weather twitter account [here](https://twitter.com/marswxreport?lang=en) 
    # and scrape the latest Mars weather tweet from the page.
    url = "https://twitter.com/marswxreport?lang=en"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    mars_weather = soup.find('div', class_='js-tweet-text-container').text
    mars_weather

    # # Pull Mars facts table from Space-Facts
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path)
    url = 'https://space-facts.com/mars/'
    marsFacts_df = pd.read_html(url)
    marsFacts_df = marsFacts_df[0]
    marsFacts_df

    # # * Use Pandas to convert the data to a HTML table string.
    # marsFacts_df.to_html('mars_facts.html', index=False)
    marsHTML = marsFacts_df.to_html()
    print(marsHTML)

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    cerberus = browser.find_link_by_partial_text('Cerberus')
    cerberus.click()

    html = browser.html
    soupsearch = BeautifulSoup(html, 'html.parser')

    astrogeology_url = 'https://astrogeology.usgs.gov/'
    #---------------------------------------
    cerberus_url = soupsearch.find('img', class_='wide-image').get('src')
    cerberus_img_url = astrogeology_url + cerberus_url
    print('cerberus image')
    print(cerberus_img_url)

    back = browser.find_link_by_partial_text('Back')
    back.click()
    #---------------------------------------
    schiaparelli = browser.find_link_by_partial_text('Schiaparelli')
    schiaparelli.click()

    html = browser.html
    soupsearch = BeautifulSoup(html, 'html.parser')

    schiaparelli_url = soupsearch.find('img', class_='wide-image').get('src')
    schiaparelli_img_url = astrogeology_url + schiaparelli_url

    back = browser.find_link_by_partial_text('Back')
    back.click()
    print('schiaparelli image')
    print(schiaparelli_img_url)
    #---------------------------------------

    syrtis = browser.find_link_by_partial_text('Syrtis')
    syrtis.click()

    html = browser.html
    soupsearch = BeautifulSoup(html, 'html.parser')

    syrtis_url = soupsearch.find('img', class_='wide-image').get('src')
    syrtis_img_url = astrogeology_url + syrtis_url

    back = browser.find_link_by_partial_text('Back')
    back.click()

    valles = browser.find_link_by_partial_text('Valles')
    valles.click()

    html = browser.html
    soupsearch = BeautifulSoup(html, 'html.parser')

    valles_url = soupsearch.find('img', class_='wide-image').get('src')
    valles_img_url = astrogeology_url + valles_url
    valles_img_url


    print(cerberus_img_url, schiaparelli_img_url, syrtis_img_url, valles_img_url)




    # # Scrape Hemisphere image urls
    # executable_path = {'executable_path': 'chromedriver.exe'}
    # browser = Browser('chrome', **executable_path)
    # url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    # browser.visit(url)

    # cerberus = browser.find_link_by_partial_text('Cerberus')
    # cerberus.click()

    # html = browser.html
    # soupsearch = BeautifulSoup(html, 'html.parser')

    # astrogeology_url = 'https://astrogeology.usgs.gov/'
    # #---------------------------------------
    # cerberus_url = soupsearch.find('img', class_='wide-image').get('src')
    # cerberus_img_url = astrogeology_url + cerberus_url


    # back = browser.find_link_by_partial_text('Back')
    # # back.click()

    # #---------------------------------------
    # url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    # browser.visit(url)
    
    # schiaparelli = browser.find_link_by_partial_text('Schiaparelli')
    # schiaparelli.click()
    # time.sleep(2)

    # schiaparelli_url = soupsearch.find('img', class_='wide-image').get('src')
    # schiaparelli_img_url = astrogeology_url + schiaparelli_url

    # back = browser.find_link_by_partial_text('Back')
    # back.click()

    # #---------------------------------------

    # syrtis = browser.find_link_by_partial_text('Syrtis')
    # syrtis.click()
    # time.sleep(2)
    # syrtis_url = soupsearch.find('img', class_='wide-image').get('src')
    # syrtis_img_url = astrogeology_url + syrtis_url

    # back = browser.find_link_by_partial_text('Back')
    # back.click()

    # valles = browser.find_link_by_partial_text('Valles')
    # valles.click()
    # time.sleep(2)
    # valles_url = soupsearch.find('img', class_='wide-image').get('src')
    # valles_img_url = astrogeology_url + valles_url
    # valles_img_url

    # # Exit browser
    # browser.quit()
    # print(cerberus_img_url, schiaparelli_img_url, syrtis_img_url, valles_img_url)

    # Save hemisphere image urls in a dictionary.
    hemisphere_image_urls = [
        {"title": "Valles Marineris Hemisphere", "img_url": valles_img_url},
        {"title": "Cerberus Hemisphere", "img_url": cerberus_img_url},
        {"title": "Schiaparelli Hemisphere", "img_url": schiaparelli_img_url},
        {"title": "Syrtis Major Hemisphere", "img_url": syrtis_img_url},
    ]
    print(hemisphere_image_urls)
    # Save all variables in a dictionary
    mars_data = {
        "hemisphere_image_urls": hemisphere_image_urls,
        "news_p" : news_p,
        "news_title" : news_title,
        "featured_image_url": featured_image_url,
        "mars_weather": mars_weather,
        "mars_facts": marsHTML
    }

    return mars_data
Example #35
html = browser.html
image_soup = BeautifulSoup(html, 'html.parser')
# use CSS selector in BeautifulSoup to extract img_url
image_relative_url = image_soup.select_one('figure.lede a img').get('src')

#combine with base URL to create an absolute img URL
img_url = f'https://www.jpl.nasa.gov{image_relative_url}'
img_url

# check if a valid img_url
#browser.visit(img_url)
# %% [markdown]
# ## Web Scrape TABLE from Mars facts website
# - (use Pandas functions to parse HTML Table)
# - (No BeautifulSoup used)
# - (No auto browser used)
# %%
fact_url = 'http://space-facts.com/mars/' # the site only accepts http, not https
# extract the first one in list of DFs
df = pandas.read_html(fact_url)[0]
df.columns = ['description', 'value']
df.set_index('description', inplace=True)
df
# convert back to HTML string
fact_table_html = df.to_html()
fact_table_html
# %%
browser.quit()

# %%
Example #36
def scrape_all():

    browser = init_browser()

    url = 'https://mars.nasa.gov/news'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')

    titles = soup.find_all('div', class_='content_title')
    texts = soup.find_all('div', class_='article_teaser_body')

    title_text = []
    text_only = []

    #keep only the text
    for x in titles:
        title_text.append(x.text.strip())

    for x in texts:
        text_only.append(x.text.strip())

    # JPL Mars Space Image

    #These lines of code are needed to navigate to the next page

    image_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
    browser.visit(image_url)
    html = browser.html
    soup = bs(html, 'html.parser')

    image_url_src = soup.find('img', class_='headerimage fade-in')['src']

    url_short = image_url.split('/')

    #rearrange and concatenate URL
    featured_image_url = url_short[0] + '//' + url_short[1] + url_short[
        2] + '/' + url_short[3] + '/' + image_url_src

    # Mars Facts
    facts_url = 'https://space-facts.com/mars/'
    df = pd.read_html(facts_url)[0]
    mars_facts = df.to_html()

    # Mars Hemispheres
    pic_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(pic_url)
    hemi_url = []

    links = browser.find_by_css('a.product-item h3')

    for i in range(len(links)):
        hemi = {}
        browser.find_by_css('a.product-item h3')[i].click()
        sample_image = browser.links.find_by_text('Sample').first
        hemi['img_url'] = sample_image['href']
        hemi['title'] = browser.find_by_css('h2.title').text

        hemi_url.append(hemi)

        browser.back()

    browser.quit()

    # Store data in one dictionary
    mars_data = {
        "news_title": title_text,
        "news_paragraph": text_only,
        "featured_image": featured_image_url,
        "mars_facts": mars_facts,
        "hemispheres": hemi_url
    }

    return mars_data
Example #37
import requests
import pandas as pd

wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wikipedia_page = requests.get(wiki)

df_raw = pd.read_html(wikipedia_page.content, header=0)[0]
df_new = df_raw[df_raw.Borough != 'Not assigned']

df_new.head()
df_new.loc[df_new.Neighborhood == 'Not assigned']
df_new.Neighborhood.replace('Not assigned', df_new.Borough, inplace=True)
df_new.head(8)
df_toronto = df_new.groupby(['Postal Code', 'Borough'
                             ])['Neighborhood'].apply(lambda x: ', '.join(x))
df_toronto = df_toronto.reset_index()
df_toronto.rename(columns={'Postal Code': 'PostCode'}, inplace=True)
df_toronto.rename(columns={'Neighborhood': 'Neighbourhood'}, inplace=True)
df_toronto.head()
df_toronto.shape
Example #38
@author: Brad
"""
#Importing libraries
import sqlalchemy
import pandas as pd
import time

#Loading login credentials into Python
credentials = pd.read_csv('credentials.csv')
SQL_user = credentials.loc[0, 'SQL_user']
SQL_pass = credentials.loc[0, 'SQL_pass']
API_key = credentials.loc[0, 'API_key']

#Creating list of stocks for table
asx_200 = pd.read_html('https://en.wikipedia.org/wiki/S%26P/ASX_200')
asx_200 = asx_200[0][0]
asx_200 = asx_200[1:]
asx_200.columns = ['Symbol']
asx_200 = asx_200.str.lower()

#Setting up database
engine = sqlalchemy.create_engine('mysql+mysqlconnector://' + str(SQL_user) +
                                  ':' + str(SQL_pass) + '@localhost:3306')
con = engine.connect()
con.execute('CREATE database ASX_API')
con.close()

#Creating connection with MySQL server for API upload
engine = sqlalchemy.create_engine('mysql+mysqlconnector://' + str(SQL_user) +
                                  ':' + str(SQL_pass) +
Example #39
def scrape():
    browser = init_browser()
    # mars_scrapped_data = {}

    url= 'https://mars.nasa.gov/news/'
    browser.visit(url)

    html= browser.html
    soup = BeautifulSoup(html, 'html.parser') 
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text
    
    # just to check the output
    print(f"Title: {news_title}")
    print(f"Paragraph: {news_p}")

    url_image= 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_image)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(3)
    browser.click_link_by_partial_text('more info')
    new_html= browser.html
    imgsoup = BeautifulSoup(new_html, 'html.parser')
    temp_img = imgsoup.find('img', class_='main_image')['src']

    featured_image_url= 'https://www.jpl.nasa.gov/' + temp_img
    print(featured_image_url)

    mars_twitter_url= 'https://twitter.com/marswxreport?lang=en'
    browser.visit(mars_twitter_url)

    mars_twitter= browser.html
    soup = BeautifulSoup(mars_twitter, 'html.parser') 
    find_tweet = soup.find('p', class_='TweetTextSize').text
    mars_weather= find_tweet
    print(f"Latest Tweet: {mars_weather}")

    
    
    mars_facts_url = 'https://space-facts.com/mars/'
    
    tables = pd.read_html(mars_facts_url)
    tables
    

    df = tables[0]
    df.columns = ['Profile', 'Details']
    df.head()
    df.set_index('Profile', inplace=True)
    df.head()
    html_table = df.to_html()
    html_table
    html_table.replace('\n', '')

    df.to_html('table.html')


    url_mars= 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_mars)
    
    hemi_dicts = []

    for i in range(1,9,2):
        hemi_dict = {}
    
        browser.visit(url_mars)
    #     time.sleep(1)
        hemispheres_html = browser.html
        hemispheres_soup = BeautifulSoup(hemispheres_html, 'html.parser')
        hemi_name_links = hemispheres_soup.find_all('a', class_='product-item')
        hemi_name = hemi_name_links[i].text.strip('Enhanced')
    
        detail_links = browser.find_by_css('a.product-item')
        detail_links[i].click()
        time.sleep(1)
        browser.find_link_by_text('Sample').first.click()
        time.sleep(1)
        browser.windows.current = browser.windows[-1]
        hemi_img_html = browser.html
        browser.windows.current = browser.windows[0]
        browser.windows[-1].close()
    
        hemi_img_soup = BeautifulSoup(hemi_img_html, 'html.parser')
        hemi_img_path = hemi_img_soup.find('img')['src']

        print(hemi_name)
        hemi_dict['title'] = hemi_name.strip()
    
        print(hemi_img_path)
        hemi_dict['img_url'] = hemi_img_path

        hemi_dicts.append(hemi_dict)

    mars_scrapped_data ={"news_title": news_title, 
                        "news_paragraph": news_p,
                        "featured_image": featured_image_url,
                        "Latest_Tweet": mars_weather,
                        "Hemispheres_details": hemi_dicts
                        # "Table": html_table
                        }
    return mars_scrapped_data
Example #40
def init_browser():
    executable_path = {'executable_path': 'chromedriver.exe'}
    return Browser('chrome', **executable_path, headless=False)

def scrape_info():
    browser = init_browser()
    

    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(url)
    time.sleep(5)

    # HTML
    html_news = browser.html

    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html_news, 'html.parser')

    news_title = soup.find("div", class_="content_title").text
    news_p = soup.find("div", class_="article_teaser_body").text

    news_p

    browser.quit()

# Feature Image

    featured_img_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(featured_img_url)

    browser.links.find_by_partial_text('FULL IMAGE')
    browser.links.find_by_partial_text('more info')
    browser.links.find_by_partial_text('jpg')

    # HTML Object
    html_image = browser.html

    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html_image, 'html.parser')

    featured_img_url = soup.find('img')['src']

    featured_img_url

# Mars Weather

    weather_url = 'https://twitter.com/marswxreport?lang=en'

    browser.visit(weather_url)

    time.sleep(5)

    html_weather = browser.html

    weather_soup = BeautifulSoup(html_weather, "html.parser")

    mars_weather = weather_soup.find('div', class_='css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0').text

    mars_weather


# Mars Facts


    mars_df = pd.read_html("https://space-facts.com/mars/")[0]

    #Panda DataFrame
    #mars_df.columns=["Des", "MARS PLANET PROFILE"]
    #mars_df.set_index("Des", inplace=True)
    #mars_df

    mars_df.columns=["Description", "Value"]
    mars_df.set_index("Description", inplace = True)
    mars_facts = mars_df.to_html(classes="table")
    mars_facts = mars_facts.replace("'", "")
    mars_facts

    

# MARS HEMISPHERES

    # Cerberus Hemisphere 

    mars_hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_hemi_url)

    # Click to URL of page to be scraped and extract data
    browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')

    html_mars_hemi = browser.html
    soup_mars_hemi = BeautifulSoup(html_mars_hemi, 'html.parser')

    #Pull specific data from webpage
    hemi_title1_url = soup_mars_hemi.find('h2', class_ ='title').text
    img1_url =soup_mars_hemi.find('a', text ='Sample').get("href")

    #Put data into a dictionary
    one ={'title':hemi_title1_url, "img_url":img1_url}
    one


    # Schiaparelli Hemisphere 

    mars_hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_hemi_url)

    # Click to URL of page to be scraped and extract data
    browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced')
    html_mars_hemi = browser.html
    soup_mars_hemi = BeautifulSoup(html_mars_hemi, 'html.parser')

    #Pull specific data from webpage
    hemi_title2_url = soup_mars_hemi.find('h2', class_ ='title').text
    img2_url =soup_mars_hemi.find('a', text ='Sample').get("href")

    #Put data into a dictionary
    two ={'title':hemi_title2_url, "img_url":img2_url}
    two
                                      

    # Syrtis Major Hemisphere 

    mars_hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_hemi_url)

    # Click to URL of page to be scraped and extract data
    browser.click_link_by_partial_text('Syrtis Major Hemisphere Enhanced')
    html_mars_hemi = browser.html
    soup_mars_hemi = BeautifulSoup(html_mars_hemi, 'html.parser')

    # Pull specific data from webpage
    hemi_title3_url = soup_mars_hemi.find('h2', class_ ='title').text
    img3_url =soup_mars_hemi.find('a', text ='Sample').get("href")

    #Put data into a dictionary
    three ={'title':hemi_title3_url, "img_url":img3_url}
    three

    # Valles Marineris Hemisphere 

    mars_hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_hemi_url)

    browser.click_link_by_partial_text('Valles Marineris Hemisphere Enhanced')
    html_mars_hemi = browser.html
    soup_mars_hemi = BeautifulSoup(html_mars_hemi, 'html.parser')

    #Pull specific data from webpage
    hemi_title4_url = soup_mars_hemi.find('h2', class_ ='title').text
    img4_url =soup_mars_hemi.find('a', text ='Sample').get("href")

    #Put data into a dictionary
    four ={'title':hemi_title4_url, "img_url":img4_url}
    four


    # Summary

    hemisphere_url = [one, two, three, four]

    # Store data in a dictionary
    mars_data = {
        "News_Title": news_title,
        "News_Paragraph": news_p,
        "Featured_Image": featured_img_url,
        "Mars_Weather": mars_weather,
        "Mars_Facts": mars_facts,
        "Hemisphere_Images": hemisphere_url
    }

    browser.quit()

    # Return results
    return mars_data
示例#41
0
def scrape_info():
    # Making space soup
    browser = init_browser()
    url = 'https://www.nasa.gov/missions/'
    html = browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Stirring the soup
    links = [
        a for a in soup.find('div', class_="static-landing-page").find_all(
            'a', href=True)
    ]

    # Pouring out the list and title
    link_list = []
    title_list = []
    for i in range(len(links)):
        if links[i].get('href').find("mission_pages") == 1:
            link_list.append(links[i]['href'])
            title_list.append(links[i].text)
        else:
            print('No Mission Page')
    mission_dict = {}
    mission_dict["Mission"] = title_list
    mission_dict["Mission Link"] = link_list

    url = 'https://www.cdscc.nasa.gov/Pages/trackingtoday.html'
    html = browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    abv_table = pd.read_html(url)[3]

    abv_table.columns = ['ABV', 'Name']
    abv_dict = abv_table.to_dict("records")

    MADRID = {}
    GOLDSTONE = {}
    CANBERRA = {}
    browser.visit('https://eyes.jpl.nasa.gov/dsn/dsn.html')
    time.sleep(.3)
    for i in browser.find_by_tag('a'):
        if i['class'] == 'inactive' or i['class'] is None:
            pass
        elif i.text == '':
            pass
        elif i['id'] == '' or i['id'] is None:
            pass
        else:
            if i['id'][:2] == 'sp':
                ABV = i.text
                if i['id'][-5] == '0':
                    MADRID[ABV] = {}
                elif i['id'][-5] == '1':
                    GOLDSTONE[ABV] = {}
                elif i['id'][-5] == '2':
                    CANBERRA[ABV] = {}

    # Plating the soup
    mission_data = {
        "Madrid": MADRID,
        "Goldstone": GOLDSTONE,
        "Canberra": CANBERRA,
        "Mission_titles": mission_dict,
        "Mission_Code": abv_dict
    }
    browser.quit()
    return mission_data
def get_family_info_from_mysqldb(save_dataset=True):
    # Establish the database connection
    conn = pymysql.connect(host='172.16.201.103',
                           port=3306,
                           user='******',
                           password='******',
                           db='resource',
                           charset='utf8')

    # Get a dictionary cursor
    cursor = conn.cursor(pymysql.cursors.DictCursor)

    # Execute the SQL queries
    # Fetch every family in the component library (across all disciplines)
    sql = 'select i.id, i.name, i.resource_desc ' \
          'from resource_item i, resource_parameter_value pv ' \
          'where i.parameter_code=pv.code and i.resource_lib_id=54 and i.could_be_shown_in_front=1'
    rows = cursor.execute(sql)
    items = cursor.fetchall()
    # Fetch the parameters of every family
    sql = 'select ip.resource_item_id , i.name, ip.resource_parameter_id, p.name as parameter, pv.id as param_id, pv.value ' \
          'from resource_item i, resource_item_parameter ip, resource_parameter p, resource_parameter_value pv ' \
          'where i.id=ip.resource_item_id and p.id=ip.resource_parameter_id and ip.resource_parameter_value_id=pv.id ' \
          'and (pv.resource_parameter_id=1 or pv.resource_parameter_id=51 or pv.resource_parameter_id=52 ' \
          'or pv.resource_parameter_id=53 or pv.resource_parameter_id=10004) '
    rows = cursor.execute(sql)
    item_params = cursor.fetchall()
    # Fetch the structured parameter values
    sql = 'select pv.id, pv.code, pv.`value` from resource_parameter_value pv ' \
          'where (pv.resource_parameter_id=1 or pv.resource_parameter_id=51 or pv.resource_parameter_id=52 ' \
          'or pv.resource_parameter_id=53 or pv.resource_parameter_id=10004) ' \
          'order by code'
    rows = cursor.execute(sql)
    params = cursor.fetchall()
    # Close the cursor
    cursor.close()
    # Close the connection
    conn.close()

    items_dict = {}
    for idx in range(len(items)):
        # if items[idx]["id"] not in items_dict:
        try:
            items[idx]["resource_desc"] = pd.read_html(
                items[idx]["resource_desc"],
                header=0)[0].to_dict(orient="records")
        except:
            pass
        if items[idx]["name"][0] not in ["A", "S"]:  # 过滤土建专业
            continue
        item_id = items[idx]["id"]
        items[idx].pop("id", None)
        items[idx].pop("resource_desc", None)
        items_dict[item_id] = items[idx]

    tmp1 = {c["code"]: c["id"] for c in params if c["code"]}
    tmp2 = {c["id"]: c["value"] for c in params}
    pid2v = {}
    for c in params:
        if not c["code"]:
            pid2v[c["id"]] = [c["value"]]
        else:
            ks = c["code"].split("_")
            pid2v[c["id"]] = []
            for i in range(len(ks)):
                tmp_k = "_".join(ks[:i + 1])
                if tmp1[tmp_k] in pid2v:
                    pid2v[c["id"]].append(tmp2[tmp1[tmp_k]])

    for idx in range(len(item_params)):
        p = item_params[idx]
        iid = p["resource_item_id"]
        if iid in items_dict:
            if p["parameter"] not in items_dict[iid]:
                items_dict[iid][item_params[idx]["parameter"]] = []
            items_dict[iid][item_params[idx]["parameter"]] += pid2v[
                p["param_id"]]

    # Save the results
    if save_dataset:
        with open(
                os.path.join(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))),
                    "data/standard_vocab.json"), "w") as f:
            json.dump(list(items_dict.values()), f)
    if save_dataset:
        with open(
                os.path.join(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))),
                    "data/standard_param.json"), "w") as f:
            json.dump(params, f)

    return items_dict
示例#43
0
             'Visitors': [43,34,65,56,29,76],
             'Bounce Rate': [65,67,78,65,45,52]
}

df = pd.DataFrame(web_stats)
df.set_index('Day', inplace=True)
'''
##read write csv
dfx = pd.read_csv('/Users/zhangsicai/Desktop/Panda/grade.csv')
dfx.set_index('Date', inplace=True)
dfx.rename(columns={'math': 'shuxue'}, inplace=True)
print(dfx['shuxue'])
#df['math'].to_csv('/Users/zhangsicai/Desktop/Panda/grade1.csv')

df = quandl.get('FMAC/HPI_TX', authtoken='HUg-EPXknoSxzbk26DMu')
fiddy_states = pd.read_html(
    'https://simple.wikipedia.org/wiki/List_of_U.S._states')
print(df.head())

print(df[0:2])
print(df['NSA Value'])
df.plot()
plt.show()

print(fiddy_states[0]['Name'])
'''
for dd in df['NSA Value'][1:]:
    print(dd)
'''

示例#44
0
def scrape():

    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # NASA MARS NEWS
    news_url = (
        'https://mars.nasa.gov/news/?page=0&per_page=40'
        '&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    )

    browser.visit(news_url)
    time.sleep(2)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    news_title = soup.body.find_all('div',
                                    class_="content_title")[1].text.strip()

    news_p = soup.body.find_all('div',
                                class_="article_teaser_body")[0].text.strip()

    #JPL MARS SPACE IMAGES - FEATURED IMAGE

    image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

    browser.visit(image_url)
    time.sleep(2)

    my_xpath = '/html/body/div[1]/div/div[3]/section[1]/div/div/article/div[1]/footer/a'

    results = browser.find_by_xpath(my_xpath)
    img = results[0]
    img.click()

    browser.click_link_by_partial_text('more info')

    html1 = browser.html
    soup = BeautifulSoup(html1, 'html.parser')

    feat_img = soup.find_all('figure', class_='lede')
    feat_img_result = feat_img[0].a['href']

    featured_image_url = 'https://www.jpl.nasa.gov' + feat_img_result

    # MARS FACTS

    facts_url = 'https://space-facts.com/mars/'

    facts_table = pd.read_html(facts_url)

    table_df = facts_table[0]

    # mars_table_df = table_df.rename(columns={0: 'Mars: Measurement', 1: 'Measurement: Value'})
    mars_table_df = table_df.to_html(header=False, index=False)

    # mars_table_df.to_html(classes="table table-striped")

    print(mars_table_df)

    # MARS HEMISPHERES

    #Note the inconsistent url
    hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    #alternate site, if previous site is unavailable
    # hemispheres_url = 'https://web.archive.org/web/20181114171728/https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemispheres_url)

    time.sleep(2)

    hemisphere_image_urls = []

    url_links = browser.find_by_css('a.product-item h3')

    for i in range(len(url_links)):
        # create an empty dictionary for each hemisphere
        hemisphere = {}
        browser.find_by_css('a.product-item h3')[i].click()
        #get hemisphere title
        hemisphere['title'] = browser.find_by_css("h2.title").text
        #next find the sample image anchor tag and get href
        sample_elem = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample_elem['href']
        #Append hemisphere object to list
        hemisphere_image_urls.append(hemisphere)
        #Finally navigate back to start again on loop
        browser.back()

    #*************** CREATE A DICTIONARY *********************
    mars_info = {}
    mars_info['news_title'] = news_title
    mars_info['news_detail'] = news_p
    mars_info['featured_img_url'] = featured_image_url
    mars_info['mars_facts_html'] = mars_table_df
    mars_info['hemisphere_image_urls'] = hemisphere_image_urls
    # Close the browser
    browser.quit()
    # Return results
    return mars_info


#

#     xpaths = [
#             '/html/body/div[1]/div[1]/div[2]/section/div/div[2]/div[1]/div/a',
#             '/html/body/div[1]/div[1]/div[2]/section/div/div[2]/div[2]/div/a',
#             '/html/body/div[1]/div[1]/div[2]/section/div/div[2]/div[3]/div/a',
#             '/html/body/div[1]/div[1]/div[2]/section/div/div[2]/div[4]/div/a'
#              ]

#     hem_title = []

#     hem_url = []

#     mars_hem_title_url = []

#     for path in xpaths :
#         results = browser.find_by_xpath(path)
#         img = results[0]
#         img.click()

#         html = browser.html
#         soup = BeautifulSoup(html, 'html.parser')

#         title = soup.find('h2', class_ = 'title').text
#         hem_title.append(title)

#         hem = soup.find('div', class_='downloads')
#         hem_result = hem
#         img_url = hem_result.find('a')['href']
#         hem_url.append(img_url)

#         mars_hem_title_url.append({'title': title, 'img_url': img_url})

#         browser.visit(hemispheres_url)

#     browser.quit()

# #Store results in dictionary
#     notebook_dict = {}

#     notebook_dict = {
#                 'article_title': news_title,
#                 'article_paragraph': news_p,
#                 'mars_image': featured_image_url,
#                 'mars_data_table': mars_table_df,
#                 'hemisphere_image_urls': mars_hem_title_url}

#     print(f"index 0 {notebook_dict['article_title']}")
#     print(f"index 1 {notebook_dict['article_paragraph']}")
#     print(f"index 2 {notebook_dict['mars_image']}")
#     print(f"index 3 {notebook_dict['mars_data_table']}")
#     print(f"index 4 {notebook_dict['hemisphere_image_urls']}")

# return notebook_dict
示例#45
0
def scrape():

    browser = init_browser()
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    response = requests.get(url)
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'lxml')
    #grabbing the 'slide' class elements from the url
    results = soup.find_all(class_="slide")
    #creating a list to hold scraped data
    news_data = []
    for result in results:
        # Error handling
        try:  #loop thru and get the text within these classes, replace \n with blank space
            news_p = result.find(
                class_="rollover_description_inner").text.replace('\n', '')
            news_title = result.find(class_="content_title").text.replace(
                '\n', '')

            post = {"news_title": news_title, "news_p": news_p}

            news_data.append(post)
            print(post)

        except Exception as e:
            print(e)
    browser = Browser('chrome', headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(1)
    #use splinter to click the "Full Image" button
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(1)
    #HTML Object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    #find the class where pic is stored
    results = soup.find(class_='fancybox-image')
    #retrieve source attribute, i.e. the path
    url = results['src']
    #attach the path to the main site link, this is the full image link
    featured_image_url = 'https://www.jpl.nasa.gov' + url
    post_two = {'featured_image': featured_image_url}
    news_data.append(post_two)
    print(post_two)
    #visit the mars twitter page to get the Weather
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    time.sleep(1)
    response = requests.get(url)
    #parse HTML with Beautiful soup, get the text
    soup = BeautifulSoup(response.text, 'html.parser')
    #get the text from the first p tag with appropriate class (from inspecting the site)
    mars_weather = soup.find(
        'p',
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    post_three = {'mars_weather': mars_weather}
    print(post_three)
    news_data.append(post_three)
    browser = Browser('chrome', headless=False)
    #visit the mars space facts site
    url = 'https://space-facts.com/mars/'
    #read the table, put into list variable
    tables = pd.read_html(url)
    #convert the list to a dataframe
    mars_df = tables[0]
    #put column headers on
    mars_df.columns = ["Characteristic", "Value"]
    #convert the dataframe to a dictionary using 'records' orientation; do not append this to news_data,
    #since that would nest a list inside the list and it could not be inserted into MongoDB
    mars_dict = mars_df.to_dict('records')
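    # e.g. to_dict('records') produces a list of row dicts such as
    # [{'Characteristic': 'Equatorial Diameter:', 'Value': '6,792 km'}, ...]
    # (illustrative values; the actual rows depend on what the page serves)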
    print(mars_dict)
    #Visit the site to get images of Mars Hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    results = soup.find_all(class_='item')

    #loop through the item class
    for result in results:
        #find the first a tag
        link = result.find('a')
        #assign the href to variable 'links'
        links = link['href']
        #assign the link h3 title text to variable 'title'
        title = result.find('h3').text
        #concatenate the path with the main site link, assign to variable 'url'
        url = 'https://astrogeology.usgs.gov' + links
        #open brower, chromedriver
        browser = Browser('chrome', headless=False)
        #visit the concatenated url
        browser.visit(url)
        time.sleep(1)
        html = browser.html
        #parse the html with beautiful soup
        soup = BeautifulSoup(html, 'html.parser')
        #find all elemenst with class 'downloads', assign results to variable list 'infos'
        infos = soup.find_all(class_='downloads')
        #loop thru infos, pull out links to images, assign with title to dictionary post, and then append to list
        #mars_images
        for info in infos:
            link_two = info.find('a')
            img_url = link_two['href']
            post_four = {'img_url': img_url, 'title': title}
            news_data.append(post_four)
            print(post_four)

#return your data, so it can be accessed by flask app (where the insertion into mongodb will occur)
    return news_data + mars_dict
示例#46
0
browser = Browser('chrome', headless=False)
weather_url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(weather_url)

html = browser.html
weather_soup = BeautifulSoup(html, 'html.parser')

weather = weather_soup.find('div', class_='js-tweet-text-container')

mars_weather= weather.p.text.lstrip()
print(mars_weather)

facts_url = 'http://space-facts.com/mars/'

fact_table = pd.read_html(facts_url)
fact_table

df = fact_table[0]
df.columns = ['Mars', 'Value']
df

html_table = df.to_html()
df.to_html('table.html')

mars_facts=df.to_dict('records')
mars_facts

tem=list(mars_facts[0].values())
tem
示例#47
0
    def _read_change_from_url(self, url: str) -> pd.DataFrame:
        """read change from url

        Parameters
        ----------
        url : str
            change url

        Returns
        -------
            pd.DataFrame:
                symbol      date        type
                SH600000  2019-11-11    add
                SH600000  2020-11-10    remove
            dtypes:
                symbol: str
                date: pd.Timestamp
                type: str, value from ["add", "remove"]
        """
        resp = retry_request(url)
        _text = resp.text
        date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text)
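        # The pattern above pulls (year, month, day) groups out of Chinese-format
        # announcement text, e.g. "2019年11月22日" yields ("2019", "11", "22").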
        if len(date_list) >= 2:
            add_date = pd.Timestamp("-".join(date_list[0]))
        else:
            _date = pd.Timestamp("-".join(re.findall(r"(\d{4}).*?年.*?(\d+).*?月", _text)[0]))
            add_date = get_trading_date_by_shift(self.calendar_list, _date, shift=0)
        remove_date = get_trading_date_by_shift(self.calendar_list, add_date, shift=-1)
        logger.info(f"get {add_date} changes")
        try:
            excel_url = re.findall('.*href="(.*?xls.*?)".*', _text)[0]
            content = retry_request(f"http://www.csindex.com.cn{excel_url}", exclude_status=[404]).content
            _io = BytesIO(content)
            df_map = pd.read_excel(_io, sheet_name=None)
            with self.cache_dir.joinpath(
                f"{self.index_name.lower()}_changes_{add_date.strftime('%Y%m%d')}.{excel_url.split('.')[-1]}"
            ).open("wb") as fp:
                fp.write(content)
            tmp = []
            for _s_name, _type, _date in [("调入", self.ADD, add_date), ("调出", self.REMOVE, remove_date)]:
                _df = df_map[_s_name]
                _df = _df.loc[_df["指数代码"] == self.index_code, ["证券代码"]]
                _df = _df.applymap(self.normalize_symbol)
                _df.columns = [self.SYMBOL_FIELD_NAME]
                _df["type"] = _type
                _df[self.DATE_FIELD_NAME] = _date
                tmp.append(_df)
            df = pd.concat(tmp)
        except Exception as e:
            df = None
            _tmp_count = 0
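            # Fallback when the Excel attachment cannot be fetched or parsed:
            # read the 4-column add/remove table embedded in the announcement HTML.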
            for _df in pd.read_html(resp.content):
                if _df.shape[-1] != 4:
                    continue
                _tmp_count += 1
                if self.html_table_index + 1 > _tmp_count:
                    continue
                tmp = []
                for _s, _type, _date in [
                    (_df.iloc[2:, 0], self.REMOVE, remove_date),
                    (_df.iloc[2:, 2], self.ADD, add_date),
                ]:
                    _tmp_df = pd.DataFrame()
                    _tmp_df[self.SYMBOL_FIELD_NAME] = _s.map(self.normalize_symbol)
                    _tmp_df["type"] = _type
                    _tmp_df[self.DATE_FIELD_NAME] = _date
                    tmp.append(_tmp_df)
                df = pd.concat(tmp)
                df.to_csv(
                    str(
                        self.cache_dir.joinpath(
                            f"{self.index_name.lower()}_changes_{add_date.strftime('%Y%m%d')}.csv"
                        ).resolve()
                    )
                )
                break
        return df
示例#48
0
def state_list():
    fiddy_states = pd.read_html(
        'https://simple.wikipedia.org/wiki/List_of_U.S._states')
    return fiddy_states[0][0][1:]
示例#49
0
# In[1]:


import os
import glob
import pandas as pd
import numpy as np

import time
from datetime import datetime


# In[2]:


dep_df, = pd.read_html("https://www.dublinairport.com/flight-information/live-departures", header=0)
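# The trailing comma unpacks the one-element list that read_html returns here,
# so this line fails loudly if the page ever exposes more (or fewer) than one table.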
dep_df.tail(1)


# In[3]:


# Initial Cleaning
dep1_df = dep_df.dropna()
dep1_df = dep1_df.drop('Status', axis=1)
dep1_df.columns = ['Terminal', 'Destination', 'Airline', 'Flight No.', 'Scheduled DateTime', 'Actual Departure']

# Month Column
new2 = dep1_df["Scheduled DateTime"].str.split(" ", n = 2, expand = True) 
dep1_df["Month"]= new2[1] 
示例#50
0
def scrape_eng_pages(filename, sheet, check):
    print("SHEETNAME********:", sheet)
    book = load_workbook(filename)
    old_sheet = sheet
    sheet = book[sheet]
    dictionary = {}
    msg = ""


    # filling up dictionary with city and corresponding website values
    first = False
    for row in sheet.rows:
        dictionary[row[2].value] = row[7].value

    keywords = ['Design', 'Professional ', 'Consult', 'Civil', 'Transportation', 'Bike', 'Pedestrian', 'Sidewalk', 'Street',
                'Road', 'Boulevard', 'Blvd', 'Way', 'On-call']

    keywords_track = {}
    pattern = re.compile(
      r"(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|"
      r"Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|"
      r"Dec(ember)?)\s+\d{1,2},\s+\d{4}")

    regexp = re.compile(pattern)
    dates = []
    #list of websites that either have no current RFPS, or are broken - Can't check the checked/not working field because not all are updated so program will break
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    for each in dictionary.keys():
        if each == "City":
            continue
        if each is not None and dictionary[each] is not None and each not in check:
            user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'

            url = dictionary[each]
            headers = {'User-Agent': user_agent, }

            request = urllib.request.Request(url, None, headers)  # The assembled request
            response = urllib.request.urlopen(request)

            html = response.read()
            soup = BeautifulSoup(html, 'html.parser')
            tables = soup.find_all('table')
            final_dates = []
            for table in tables:
                # do your stuff for every table

                try:
                    df = pd.read_html(str(table))
                    if len(df) == 0:
                        continue
                    else:
                        #convert table from website into string paragraphs
                        a = tabulate(df[0], headers='keys', tablefmt='psql')

                        # run through keywords
                        for key in keywords:
                            if key in a:
                                #print("EACH IS IN KEY: ", each, key)
                                if each not in keywords_track:
                                    keywords_track[each] = [key]
                                else:
                                    num_occ = a.count(key)
                                    if not len(keywords_track[each]) == num_occ:
                                        for i in range(num_occ - 1):
                                            keywords_track[each].append(key)

                        if regexp.search(a):
                            print("FOUND DATE!")
                        dates.append((each, re.findall(r"[\d]{1,2}[/.-][\d]{1,2}[/.-][\d]{4}", a), dictionary[each], a))
                except:
                    continue
    print("KEY WORD DICT AFTER FILLING: ", keywords_track)
    array = build_dates(dates)
    print("Array", array)
    email_msg = build_email_msg(array, msg, keywords_track)

    return email_msg

示例#51
0
#Visit the Mars Weather twitter account here and scrape the latest Mars weather tweet from the page.
#Save the tweet text for the weather report as a variable called mars_weather.
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

mars_weather_element = soup.find('div', class_="content")
mars_weather = mars_weather_element.p.text
print(mars_weather)

#Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
#Use Pandas to convert the data to a HTML table string.
url = 'https://space-facts.com/mars/'
tables = pd.read_html(url)
tables

#Mars - Earth Comparison
mars_earth_comparison = tables[0]
mars_earth_comparison

#Mars Planet Profile
mars_planet_profile = tables[1].rename(columns={0:"Mars Category", 1:"Value"})
mars_planet_profile
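# A minimal sketch of the "HTML table string" conversion the comment above asks for
# (the variable name mars_profile_html is illustrative, not from the source):
mars_profile_html = mars_planet_profile.to_html(index=False, classes="table table-striped")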

#Visit the USGS Astrogeology site here to obtain high resolution images for each of Mar's hemispheres.
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
示例#52
0
AIRTABLE_API_KEY = 'keybBQGNdYeJkRwcs'
base_key = 'appmCQ7CzGefKPdmu'
url = 'http://www.espn.com/golf/leaderboard?tournamentId=' + str(tournament_id)
table_name = 'PGA_Field'
entries_table = 'PGA2019Entries'

pd.options.mode.chained_assignment = None

page = urlopen(url)
soup = BeautifulSoup(page, "html.parser")
html = soup.find(
    "table",
    attrs={
        "class": "Table2__table-scroller Table2__right-aligned Table2__table"
    })
table = pd.read_html(html.prettify())
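# prettify() re-serialises just the matched <table> element, giving read_html a
# clean standalone HTML string instead of the whole page.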
df = table[0]
df['PLAYER'] = df['PLAYER'].str.replace("'", "")
df = df.set_index("PLAYER")
df.to_csv('espnfield.csv')

airtable = Airtable(base_key, table_name, AIRTABLE_API_KEY)
field = airtable.get_all()
field = pd.DataFrame.from_dict(field)

field_data = [0] * len(field)
for (i, entry) in enumerate(field_data):
    entry = field.loc[i]['fields']
    field_data[i] = entry

field_data = pd.DataFrame.from_dict(field_data)
示例#53
0
def scrape():
    browser = init_browser()
    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client.mars_db

    # Retrieving news title and teaser
    browser.visit("https://mars.nasa.gov/news/")
    time.sleep(2)

    soup = bs(browser.html, "html.parser")
    items = soup.find("ul", class_="item_list")
    slides = items.find_all("li", class_="slide")

    news_titles = []
    news_paragraphs = []
    for slide in slides:
        news_title = slide.find("div", class_="content_title").text
        news_p = slide.find("div", class_="article_teaser_body").text
        news_titles.append(news_title)
        news_paragraphs.append(news_p)

    # Retrieving featured image url
    browser.visit(
        "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
    browser.find_by_id("full_image").click()
    time.sleep(2)

    soup = bs(browser.html, "html.parser")
    image_src = soup.find("img", class_="fancybox-image")["src"]

    featured_image_url = f"https://jpl.nasa.gov{image_src}"

    # Retriving mars facts table
    browser.visit("https://space-facts.com/mars/")
    df = pd.read_html(browser.html)[1]
    mars_facts_table_html = df.to_html(index=False, justify="center")
    mars_facts_table_html = mars_facts_table_html.replace("\n", "")

    browser.visit(
        "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    )
    time.sleep(1)
    soup = bs(browser.html, "html.parser")

    # Retrieving hemishere page's urls
    hemisphere_urls = []
    hemispheres = soup.find_all("div", class_="description")
    for hemisphere in hemispheres:
        url = hemisphere.find("a")["href"]
        url = f"https://astrogeology.usgs.gov{url}"
        hemisphere_urls.append(url)

    # Retrieving titles and image links of different hemispheres
    hemisphere_list = []
    for hemisphere_url in hemisphere_urls:
        browser.visit(hemisphere_url)
        time.sleep(2)
        soup = bs(browser.html, "html.parser")
        title = soup.find("h2", class_="title").text
        title = re.sub(" Enhanced", "", title)
        image_url = soup.find_all("li")[0].find("a")["href"]
        hemisphere_list.append({"title": title, "image_url": image_url})

    return_dict = {}
    return_dict["news_titles"] = news_titles
    return_dict["news_paragraphs"] = news_paragraphs
    return_dict["featured_image_url"] = featured_image_url
    return_dict["mars_facts_table_html"] = mars_facts_table_html
    return_dict["hemisphere_list"] = hemisphere_list
    return_dict["date"] = datetime.datetime.utcnow()

    db.mission_to_mars.update({}, return_dict, upsert=True)

    browser.quit()

    return return_dict
示例#54
0
def scrape():
    """ Scrapes all websites for Mars data """
    
    # Create a python dictionary to store all data
    scrape_mars_dict = {}
    
    # Use requests and BeautifulSoup to scrape Nasa News for latest news
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    response = requests.get(url)
    soup = bs(response.text, 'lxml')

    results = soup.find('div', class_='features')
    news_title = results.find('div', class_='content_title').text
    newsp = results.find('div', class_='rollover_description').text
    
    # Store scraped data into dictionary
    scrape_mars_dict['news_title'] = news_title
    scrape_mars_dict['newsp'] = newsp
    
    # Scrape Mars Weather twitter for latest weather report
    twitter_url = 'https://twitter.com/marswxreport?lang=en'
    twitter_response = requests.get(twitter_url)
    twitter_soup = bs(twitter_response.text, 'lxml')
    
    twitter_result = twitter_soup.find('div', class_='js-tweet-text-container')
    mars_weather = twitter_result.find('p', class_='js-tweet-text').text
    
    # Store scraped data into dictionary
    scrape_mars_dict['mars_weather'] = mars_weather

    # Scrape facts about Mars from space-facts.com using Pandas read_html function
    mars_facts_url = 'https://space-facts.com/mars/'
    tables = pd.read_html(mars_facts_url)
    df = tables[0]
     # Cleanup the Index
    df.rename({0:"Mars - Earth Comparison", 1:"Mars", 2: "Earth"}, axis=1, inplace=True)
    df.set_index("Mars - Earth Comparison", inplace=True)
    
    # Export scraped table into an html script    
    mars_facts = df.to_html()
    mars_facts = mars_facts.replace("\n", "")
    df.to_html('mars_facts.html')

    # Store html file to dictionary
    scrape_mars_dict['mars_facts'] = mars_facts

    # Call on chromedriver function to use for splinter
    browser = init_browser()
    
    # Scrape Nasa for url of latest featured image of Mars
    nasa_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(nasa_url)

    nasa_html = browser.html
    nasa_soup = bs(nasa_html, "lxml")

    featured_image = nasa_soup.find('div', class_='default floating_text_area ms-layer').find('footer')
    featured_image_url = 'https://www.jpl.nasa.gov'+ featured_image.find('a')['data-fancybox-href']
    
    # Store url to dictionary
    scrape_mars_dict['featured_image_url'] = featured_image_url

    # Scrape astrogeology.usgs.gov for urls of hemisphere images of Mars
    hemisphere_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemisphere_url)

    hemisphere_html = browser.html
    hemisphere_soup = bs(hemisphere_html, 'lxml')
    base_url ="https://astrogeology.usgs.gov"

    image_list = hemisphere_soup.find_all('div', class_='item')

    # Create a list to store dictionary of urls and image titles
    hemisphere_image_urls = []

    # Loop through list of hemispheres and click on each one to find large resolution image
    for image in image_list:

        # Create a dicitonary to store urls and titles
        hemisphere_dict = {}
        
        # Find link to large image
        href = image.find('a', class_='itemLink product-item')
        link = base_url + href['href']

        # Visit the link
        browser.visit(link)

        # Wait 1 second 
        time.sleep(1)
        
        # Parse the html of the new page
        hemisphere_html2 = browser.html
        hemisphere_soup2 = bs(hemisphere_html2, 'lxml')

        # Find the title
        img_title = hemisphere_soup2.find('div', class_='content').find('h2', class_='title').text
        
        # Append to dict
        hemisphere_dict['title'] = img_title
    
        # Find image url
        img_url = hemisphere_soup2.find('div', class_='downloads').find('a')['href']
        
        # Append to dict
        hemisphere_dict['url_img'] = img_url
        
        # Append dict to list
        hemisphere_image_urls.append(hemisphere_dict)
    
    # Store hemisphere image urls to dictionary
    scrape_mars_dict['hemisphere_image_urls'] = hemisphere_image_urls

    return scrape_mars_dict
示例#55
0
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
# BeautifulSoup, Pandas, and Requests/Splinter
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

# In[2]:

# EIA 2014 archive page that holds the state electricity table
url = 'https://www.eia.gov/electricity/state/archive/2014/'

# In[3]:

# Grab table with pandas

table_list = pd.read_html(url)
table = table_list[0]
table.head()

# In[4]:

# Remove total row from list

remove_list = ['U.S. Total']
states_table = table[~table.Name.isin(remove_list)]

# remove non-price columns
ecost_df = states_table.iloc[:, [0, 1]]

# push to csv
ecost_df.to_csv('csv/eia_2014_scrape.csv')
示例#56
0
def scrape():
    browser = init_browser()
    listings = {}

    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text

    url_base = "https://www.jpl.nasa.gov"
    url_add = '/spaceimages/?search=&category=Mars'
    browser.visit(url_base + url_add)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    bttn_image_url = soup.find('article', class_='carousel_item').get('style')
    start = bttn_image_url.find("url('")
    end = bttn_image_url.find("');")
    featured_image_url = url_base + bttn_image_url[start + 3 + len("('"):end]
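    # the slice keeps only the path between "url('" and "');" in the inline style,
    # e.g. background-image: url('/spaceimages/images/wallpaper/....jpg');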

    url_base = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_base)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    mars_weather = soup.find('p', class_='TweetTextSize').text

    url_base = "https://space-facts.com/mars/"
    browser.visit(url_base)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    table = pd.read_html(url_base)
    htmltable = table[0].to_html()

    link2 = []
    link3 = []
    link4 = []
    url_base = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_base)
    time.sleep(2)
    links = browser.find_link_by_partial_text('Hemisphere')
    [link2.append(link['href']) for link in links]
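    # (the list comprehension above is used only for its side effect of
    # collecting each matching link's href into link2)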
    for link in link2:
        browser.visit(link)
        time.sleep(2)
        url_link = browser.find_link_by_partial_text('Sample')
        title_text = browser.find_by_css('.title')
        link3.append(url_link['href'])
        link4.append(title_text.html)
    hemisphere_image_urls = []
    for i in range(len(link3)):
        hemisphere_image_urls.append({"title": link4[i], "img_url": link3[i]})

    listings["news_p"] = news_p
    listings["news_title"] = news_title
    listings["featured_image_url"] = featured_image_url
    listings["mars_weather"] = mars_weather
    listings["html_table"] = htmltable
    listings["hemisphere_img_dict"] = hemisphere_image_urls

    return listings
示例#57
0
        except Exception as err:
            ## proxies that do not work are removed from the list
            print(f"{pick} did not work")
            proxies_list.remove(pick)
            print(f"{pick} removed")
            print(len(proxies_list))
            print(err)
else:  ## if proxies_list is empty, we get our proxies without configuring urllib for using proxies
    req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
    sauce = urllib.request.urlopen(req).read()
    soup = bs.BeautifulSoup(sauce, 'lxml')
    print(soup)

## use pandas to get tables and choose columns

df = pd.read_html(sauce)  ## pandas read_html parses the fetched HTML (it accepts raw bytes/str as well as a URL)
print(df, len(df))
df = df[0]  ## df is a list of tables. We want first table.

df.to_csv("proxiesraw.csv", index=True)  ## saving df to csv

## print(df[0].columns) ## choosing dataframe with the index 0 and checking all the columns. You may need to check columns if name of column have weird spacing and etc.
## df = df[0] ## setting df as the df[0] the data frame with the ip, port, and etc.

df = pd.read_csv("proxiesraw.csv")
df = df[['IP Address', "Port",
         "Https"]]  ## making df only show the columns we want to see.
df = df.dropna(axis=0, how='any', thresh=None, subset=None,
               inplace=False)  ## dropping all rows with missing values

示例#58
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import math


url = 'http://www.pangoo.it/coronavirus/?t=region&r=Lombardia&data=y#table'
data = pd.read_html(url)[0]
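# read_html returns every <table> on the page as a list of DataFrames;
# index [0] keeps the first one, which holds the 'Totale casi' column used below.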
# data = pd.read_csv('cov19_data.csv')
count = data['Totale casi'][:-1]
n = len(count)
temp = np.array([])
for i in range(n):
    temp = np.append(temp, int(count[i]))
y = np.diff(temp)

time = np.linspace(1, n-1, n-1)
plt.figure(figsize=[10, 5])
plt.plot(time, y)
plt.xlabel('Index')
plt.ylabel('y')
plt.title('New Covid19 cases in Italy')
plt.show()


obs = len(y)-5
x = y[:obs]
a, p, k, v = [np.ones(n) for _ in range(4)]
a[0], p[0], v[0] = x[1], 10000, 0
示例#59
0
def getWorksheetsListFromExcelURL(downloadURL, isXLSX):
	print('downloading file : ' + downloadURL)

	# handles well-formatted .xlsx files with the pandas Excel reader
	if isXLSX:
		urllib.request.urlretrieve(downloadURL, "temp.xlsx")

		xlsxFile = pd.ExcelFile("temp.xlsx")

		worksheets = []

		print('file downloaded, transforming into sheets')

		for xlsxSheet in xlsxFile.sheet_names:
			worksheets.append(pd.read_excel(xlsxFile, xlsxSheet))

		print('file transformed, cleaning up')
		os.remove("temp.xlsx")

	# handles .xls files that are really HTML stored as text and parses them by hand
	else:
		urllib.request.urlretrieve(downloadURL, "temp.xls")

		file1 = open('temp.xls', 'r')
		lines = file1.readlines()

		worksheets = []
		worksheet = []
		isWorksheet = False
		isFirstWorkSheet = True
		count = 0

		print('file downloaded, transforming into sheets')

		for line in lines:
			if '<html' in line:
				isWorksheet = True
			if '</html' in line:
				isWorksheet = False

			if isWorksheet:
				worksheet.append(line)
			else:
				if len(worksheet) > 0:
					worksheet.append(line)
					if not isFirstWorkSheet:
						temp = '\n'.join(worksheet)
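						# collapse the quoted-printable "=3D" escapes (e.g. colspan=3D2 -> colspan=2)
						# so read_html can parse the markup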
						temp = temp.replace('3D1', '1')
						temp = temp.replace('3D2', '2')
						temp = temp.replace('3D3', '3')
						temp = temp.replace('3D4', '4')
						temp = temp.replace('3D5', '5')
						temp = temp.replace('3D6', '6')
						temp = temp.replace('3D7', '7')
						temp = temp.replace('3D8', '8')
						temp = temp.replace('3D9', '9')
						temp = temp.replace('3D10', '10')
						temp = pd.read_html(temp)
						temp = formatDF(temp, count)
						count += 1
						worksheets.append(temp)
						worksheet = []
					else:
						worksheet = []
						isFirstWorkSheet = False
				else:
					worksheet = []

		print('file transformed, cleaning up')

		os.remove("temp.xls")

	return worksheets
示例#60
0
names = [
    'ultrasound nerve segmentation',
    'pneumothorax',
    '2021 prostate',
    '2021 pneumonia',
    '2021 intracrancial, INVERTED METRIC',  #NOTE: performance metric is inverted
    '2021 covid19',
    '2021 chest xray'
]

# Load the data

data = dict()
interesting_columns = ['Team Name', 'Score', 'Entries']

for i, name in enumerate(names):
    public = pd.read_html('kaggle/' + name +
                          '_public.html')[0][interesting_columns]
    private = pd.read_html('kaggle/' + name +
                           '_private.html')[0][interesting_columns]
    # Select teams who did two or more submissions (to avoid people who
    # didn't really participate)
    public = public.query('Entries >= 2')
    private = private.query('Entries >= 2')

    print(public.head())

    # Merge the two
    public = public.drop(columns='Entries').rename(columns=dict(
        Score='public'))
    private = private.drop(columns='Entries').rename(columns=dict(
        Score='private'))
    scores = pd.merge(public, private)