def test_invalid_flavor(): url = "google.com" flavor = "invalid flavor" msg = r"\{" + flavor + r"\} is not a valid set of flavors" with pytest.raises(ValueError, match=msg): read_html(url, "google", flavor=flavor)
def test_parse_failure_rewinds():
    # Issue #17975
    _skip_if_no('lxml')
    _skip_if_no('bs4')

    class MockFile(object):
        def __init__(self, data):
            self.data = data
            self.at_end = False

        def read(self, size=None):
            data = '' if self.at_end else self.data
            self.at_end = True
            return data

        def seek(self, offset):
            self.at_end = False

        def seekable(self):
            return True

    good = MockFile('<table><tr><td>spam<br />eggs</td></tr></table>')
    bad = MockFile('<table><tr><td>spam<foobr />eggs</td></tr></table>')

    assert read_html(good)
    assert read_html(bad)
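# A minimal standalone sketch (not part of the test above) of the behavior
# the rewind test exercises: when the first parser flavor fails, read_html
# seeks a seekable buffer back to 0 and retries the next flavor, so markup
# that lxml rejects can still be parsed by bs4.
from io import StringIO
from pandas.io.html import read_html

buf = StringIO('<table><tr><td>spam<foobr />eggs</td></tr></table>')
dfs = read_html(buf)  # if lxml fails, the buffer is rewound and bs4 retried
print(dfs[0])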
def test_invalid_flavor(): url = "google.com" flavor = "invalid flavor" msg = r"\{" + flavor + r"\} is not a valid set of flavors" with tm.assert_raises_regex(ValueError, msg): read_html(url, "google", flavor=flavor)
def test_parse_dates_combine(self):
    raw_dates = Series(date_range('1/1/2001', periods=10))
    df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())),
                    'time': raw_dates.map(lambda x: str(x.time()))})
    res = read_html(df.to_html(), parse_dates={'datetime': [1, 2]},
                    index_col=1)
    newdf = DataFrame({'datetime': raw_dates})
    tm.assert_frame_equal(newdf, res[0])
def test_multiple_header_rows(self):
    # Issue #13434
    expected_df = DataFrame(data=[("Hillary", 68, "D"),
                                  ("Bernie", 74, "D"),
                                  ("Donald", 69, "R")])
    expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"],
                           ["Name", "Unnamed: 1_level_1",
                            "Unnamed: 2_level_1"]]
    html = expected_df.to_html(index=False)
    html_df = read_html(html)[0]
    tm.assert_frame_equal(expected_df, html_df)
def test_parse_failure_unseekable():
    # Issue #17975
    _skip_if_no('lxml')

    class UnseekableStringIO(StringIO):
        def seekable(self):
            return False

    good = UnseekableStringIO('''
        <table><tr><td>spam<br />eggs</td></tr></table>''')
    bad = UnseekableStringIO('''
        <table><tr><td>spam<foobr />eggs</td></tr></table>''')

    assert read_html(good)
    assert read_html(bad, flavor='bs4')

    bad.seek(0)
    with pytest.raises(ValueError,
                       match='passed a non-rewindable file object'):
        read_html(bad)
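# A hedged workaround sketch for the non-rewindable case above: when a
# stream cannot seek, buffer its contents first so the parser fallback can
# rewind. 'stream' is a hypothetical non-seekable file-like object.
from io import StringIO

dfs = read_html(StringIO(stream.read()))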
def getBasic(cls, stk_id):
    # stk_id can be either a stock id or a stock name
    try:
        res = urlopen(cls.getUrlStock(stk_id))
    except Exception as ex:
        print('error: %s' % ex)
        return None
    content = res.read().decode('utf-8')
    # Strip the table-header block that confuses the parser.
    strErase = findall("<tr class='tblHead'>.*\n.*</div>", content)[0]
    content = content.replace(strErase, '')
    df = read_html(content)[1]
    return df
def test_keep_default_na(self):
    html_data = """<table>
                    <thead>
                        <tr>
                        <th>a</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                        <td> N/A</td>
                        </tr>
                        <tr>
                        <td> NA</td>
                        </tr>
                    </tbody>
                </table>"""

    expected_df = DataFrame({'a': ['N/A', 'NA']})
    html_df = read_html(html_data, keep_default_na=False)[0]
    tm.assert_frame_equal(expected_df, html_df)

    expected_df = DataFrame({'a': [np.nan, np.nan]})
    html_df = read_html(html_data, keep_default_na=True)[0]
    tm.assert_frame_equal(expected_df, html_df)
def _option_frames_from_url(self, url):
    frames = read_html(url)
    nframes = len(frames)
    frames_req = max(self._TABLE_LOC.values())
    if nframes < frames_req:
        raise RemoteDataError("%s options tables found (%s expected)"
                              % (nframes, frames_req))

    if not hasattr(self, 'underlying_price'):
        try:
            self.underlying_price, self.quote_time = \
                self._get_underlying_price(url)
        except IndexError:
            self.underlying_price, self.quote_time = np.nan, np.nan

    calls = self._process_data(frames[self._TABLE_LOC['calls']], 'call')
    puts = self._process_data(frames[self._TABLE_LOC['puts']], 'put')

    return {'calls': calls, 'puts': puts}
def test_na_values(self):
    # GH 13461
    html_data = """<table>
                    <thead>
                        <tr>
                        <th>a</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                        <td> 0.763</td>
                        </tr>
                        <tr>
                        <td> 0.244</td>
                        </tr>
                    </tbody>
                </table>"""

    expected_df = DataFrame({'a': [0.763, np.nan]})
    html_df = read_html(html_data, na_values=[0.244])[0]
    tm.assert_frame_equal(expected_df, html_df)
def test_converters(self):
    # GH 13461
    html_data = """<table>
                    <thead>
                        <tr>
                        <th>a</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                        <td> 0.763</td>
                        </tr>
                        <tr>
                        <td> 0.244</td>
                        </tr>
                    </tbody>
                </table>"""

    expected_df = DataFrame({'a': ['0.763', '0.244']})
    html_df = read_html(html_data, converters={'a': str})[0]
    tm.assert_frame_equal(expected_df, html_df)
def grab_result(self):
    print('grabbing results...')
    # self.inspect("results_aftersleep")
    # form = self.driver.find_element_by_xpath('//*[@id="simOutput"]')
    # stattab = WebDriverWait(form, 1000).until(
    #     EC.presence_of_element_located((By.ID, "test-statsBtn")))
    # stattab = form.find_element_by_xpath('//*[@id="test-statsBtn"]')

    # Poll (up to ~1000 s) until the stats button appears, then open it.
    stattab = None
    time0 = 0
    while not stattab and time0 < 1000:
        try:
            stattab = self.driver.find_element_by_xpath(
                '//*[@id="test-statsBtn"]')
        except Exception:
            sleep(1)
            time0 += 1
    stattab.click()
    sleep(1)

    form = self.driver.find_element_by_xpath('//*[@id="statsTab"]')
    table = form.find_element_by_xpath(
        '//*[@id="pnlStats"]/div/div/div/div/div/table'
    ).get_attribute('outerHTML')
    self.data = read_html(table)[0]

    # The parsed table carries one extra leading column; drop it and shift
    # the remaining headers left.
    col = self.data.columns[:-1]
    self.data.drop(self.data.columns[0], axis=1, inplace=True)
    self.data.columns = col
    print(self.data)
def _option_frames_from_url(self, url):
    frames = read_html(url)
    nframes = len(frames)
    frames_req = max(self._TABLE_LOC.values())
    if nframes < frames_req:
        raise RemoteDataError("%s options tables found (%s expected)"
                              % (nframes, frames_req))

    if not hasattr(self, 'underlying_price'):
        try:
            self.underlying_price, self.quote_time = \
                self._underlying_price_and_time_from_url(url)
        except IndexError:
            self.underlying_price, self.quote_time = np.nan, np.nan

    calls = frames[self._TABLE_LOC['calls']]
    puts = frames[self._TABLE_LOC['puts']]

    if len(calls) == 0 or len(puts) == 0:
        raise RemoteDataError('Received no data from Yahoo at url: %s' % url)

    calls = self._process_data(calls, 'call')
    puts = self._process_data(puts, 'put')

    return {'calls': calls, 'puts': puts}
def table_to_dataframe(bs_table, **kwargs):
    df = read_html(str(bs_table), header=0, infer_types=True)[0]
    # Strip the annoying <sup> footnote numbers from the column names.
    df.columns = list(map(lambda x: regex.findall(x)[0], df.columns))

    # Data cleaning: keep only the date portion (characters 1-10) of the
    # last and third-from-last columns.
    df[df.columns[-1]] = df[df.columns[-1]].apply(lambda x: x[1:11])
    df[df.columns[-3]] = df[df.columns[-3]].apply(lambda x: x[1:11])

    for key, item in kwargs.items():
        df[key] = item

    # Normalise the various "end of term" column names.
    try:
        df['End of Term'] = df['Mandatory retirement']
        del df['Mandatory retirement']
    except KeyError:
        pass
    try:
        df['End of Term'] = df['Retired']
        del df['Retired']
    except KeyError:
        pass
    try:
        del df['']
    except KeyError:
        pass
    return df
"MaximalMonthOf18", "Mean5yr", "Mean12/13-17/18"]) nextpagebutton = driver.find_element_by_xpath('//button[@id="nextLot"]') backyearbutton = driver.find_element_by_xpath('//span[@id="wr_backInTime"]') finalyearbutton = driver.find_element_by_xpath('//span[@id="wr_forwardToEnd"]') muteswanpage = 1 totalmuteswanpages = int(driver.find_element_by_xpath('//select[@id="pageNo"]' '/option[last()]').get_attribute("value")) while True: print(" - Processing page {}...".format(muteswanpage)) finalyearbutton.click() table = driver.find_element_by_xpath('//table[@class="maintable"]' '/tbody[@id="wr_webs_report"]' '/..' '/..') table = read_html(table.get_attribute("innerHTML"))[0] for i in range(3): for j in range(5): backyearbutton.click() table = driver.find_element_by_xpath('//table[@class="maintable"]' '/tbody[@id="wr_webs_report"]' '/..' '/..') table = read_html(table.get_attribute("innerHTML"))[0] muteswantable = pd.concat([table, table[table.columns[2:7]]], axis=1) cols = muteswantable.columns.tolist() cols = [cols[0]] + cols[12:] + cols[2:7] + cols[8:11] muteswantable = muteswantable[cols] muteswantable = np.object_(muteswantable) for row in muteswantable:
def test_same_ordering():
    _skip_if_none_of(['bs4', 'lxml', 'html5lib'])
    filename = os.path.join(DATA_PATH, 'valid_markup.html')
    dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
    dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
    assert_framelist_equal(dfs_lxml, dfs_bs4)
def read_html(self, *args, **kwargs):
    kwargs['flavor'] = self.flavor
    return read_html(*args, **kwargs)
# Extract the table with the matching class attribute.
from pandas.io.html import read_html

page = 'https://www.ft.com/content/691390ca-53d9-11ea-90ad-25e377c0ee1f?fbclid=IwAR3TfgUYCgwsZLN-ad-GnFN7lUcUEurB86SHRHVJewO6ZkL3XrwMGjxzJm4'
tables = read_html(page, attrs={"class": "o-table"})

file_name = './my_file.csv'
tables[0].to_csv(file_name, sep=',')
print("Extracted {num} tables".format(num=len(tables)))
print(tables)

# Example with BeautifulSoup, then conversion to a DataFrame:
#
# import requests
# from bs4 import BeautifulSoup
# import pandas as pd
#
# website_url = requests.get('https://www.ft.com/content/691390ca-53d9-11ea-90ad-25e377c0ee1f?fbclid=IwAR3TfgUYCgwsZLN-ad-GnFN7lUcUEurB86SHRHVJewO6ZkL3XrwMGjxzJm4').text
# soup = BeautifulSoup(website_url, 'lxml')
# My_table = soup.find('table', {'class': 'o-table o-table--row-stripes o-table--compact o-table--responsive-overflow o-table--sortable'})
# links = My_table.findAll('a')
# events = []
def test_same_ordering(datapath):
    filename = datapath('io', 'data', 'valid_markup.html')
    dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
    dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
    assert_framelist_equal(dfs_lxml, dfs_bs4)
def read_file_like(self, f, encoding):
    with open(f, 'rb') as fobj:
        return read_html(BytesIO(fobj.read()), encoding=encoding,
                         index_col=0)
#!/usr/bin/env python3
# date: 2020.01.18

from pandas.io.html import read_html

url = 'https://en.wikipedia.org/wiki/List_of_Game_of_Thrones_episodes'
wikitables = read_html(
    url,
    index_col=0,
    attrs={"class": "wikitable plainrowheaders wikiepisodetable"})

print("Extracted {num} wikitables".format(num=len(wikitables)))

for i, dataframe in enumerate(wikitables):
    dataframe.to_csv('file{}.csv'.format(i))
def test_bs4_version_fails(monkeypatch):
    import bs4
    monkeypatch.setattr(bs4, '__version__', '4.2')
    with tm.assert_raises_regex(ValueError, "minimum version"):
        read_html(os.path.join(DATA_PATH, "spam.html"), flavor='bs4')
# break
# else:
#     continue

driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

x = True
j = 2
i = 1
master_df = pd.DataFrame()

from selenium.common.exceptions import NoSuchElementException

while x:
    try:
        if driver.find_element_by_xpath(
                fr'/html/body/div[1]/section/div[{j}]/h2[{i}]'
                ).text == seven_days_ago:
            print('end')
            break
        table = driver.find_element_by_xpath(
            fr'/html/body/div[1]/section/div[{j}]')
        table_html = table.get_attribute('innerHTML')
        df = read_html(table_html)[i - 1]
        df['DATE'] = driver.find_element_by_xpath(
            fr'/html/body/div[1]/section/div[{j}]/h2[{i}]').text
        i += 1
    except NoSuchElementException:
        j += 1
        if i != 1:
            i = 1
            table = driver.find_element_by_xpath(
                fr'/html/body/div[1]/section/div[{j}]')
            # note: table_html here is still from the previous grab
            df = read_html(table_html)[i - 1]
            df['DATE'] = driver.find_element_by_xpath(
                fr'/html/body/div[1]/section/div[{j}]/h2[{i}]').text
    master_df = pd.concat([df, master_df])
from pandas.io.html import read_html
import pandas
import csv

page = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
wikitables = read_html(page, attrs={"class": "wikitable"})
# Write the scraped table out so it can be re-read below.
wikitables[0].to_csv('snp500.csv')
# print(wikitables)
# print("extracted {} number of tables".format(len(wikitables)))
# print(wikitables[0].shape)
# .shape gives the (rows, columns) dimensions of the table; .tail() shows
# the last rows and .head() the first rows.

with open('snp500.csv') as S:
    reader = csv.DictReader(S)
    # print(reader)
    candles = list(reader)

csv_file = open('snp500.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['ticker', 'name'])
for item in candles:
    csv_writer.writerow([item['Symbol'], item['Security']])
csv_file.close()
from bs4 import BeautifulSoup
import requests
import html5lib
from pandas.io.html import read_html
import unicodedata

# Initialize empty lists for dataframes of both stat types
traditional_stats = []
advanced_stats = []

# Loop through the two stat pages (traditional and advanced) for the
# relevant years
for year in range(2012, 2021):
    # Get traditional and advanced urls
    trad_url = f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
    adv_url = f"https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html"

    # Read in both stat tables
    trad_table = read_html(trad_url, attrs={"class": "stats_table"})[0]
    adv_table = read_html(adv_url, attrs={"class": "stats_table"})[0]

    # Add year column to discern season by season
    trad_table['Date'] = year
    adv_table['Date'] = year

    traditional_stats.append(trad_table)
    advanced_stats.append(adv_table)

# Initiate list of empty dataframes for salaries
salaries = []

for year in range(2012, 2021):
    # Get base web page
    salary_url = f"http://www.espn.com/nba/salaries/_/year/{year}/seasontype/1"
    page = requests.get(salary_url).text
    soup = BeautifulSoup(page, 'html5lib')
    # Get the number of pages for each year
def test_same_ordering():
    _skip_if_none_of(["bs4", "lxml", "html5lib"])
    filename = os.path.join(DATA_PATH, "valid_markup.html")
    dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"])
    dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"])
    assert_framelist_equal(dfs_lxml, dfs_bs4)
try:
    makedirs(dp)
except OSError:
    pass  # dir exists

# output files
hiphen_affix_path = '%s/medaffix_with_hiphens.txt' % dp
affix_path = '%s/medaffix.txt' % dp
suffix_path = '%s/medsuffix.txt' % dp
prefix_path = '%s/medprefix.txt' % dp

# Wikipedia URL
url = 'http://en.wikipedia.org/wiki/List_of_medical_roots,_' \
      'suffixes_and_prefixes'

# parsed tables at this URL
tables = read_html(url, attrs={'class': 'wikitable'}, header=0)

# names of interesting columns in the tables
regular_keys = [
    'Affix',
    'Greek root in English',
    'Latin root in English',
    'Other root in English'
]

# former names of interesting columns,
# in case they are restored in the future
ignoramus_keys = [
    'Preffix or suffix',
    'Preffix/suffix'
]
def run_read_html(self, *args, **kwargs):
    self.flavor = ['lxml']
    self.try_skip()
    kwargs['flavor'] = kwargs.get('flavor', self.flavor)
    return read_html(*args, **kwargs)
def read_html(self, *args, **kwargs):
    kwargs['flavor'] = kwargs.get('flavor', self.flavor)
    return read_html(*args, **kwargs)
def read_filename(self, f, encoding):
    return read_html(f, encoding=encoding, index_col=0)
def read_string(self, f, encoding):
    with open(f, 'rb') as fobj:
        return read_html(fobj.read(), encoding=encoding, index_col=0)
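# A small sketch (assuming a local 'page.html' containing one <table>)
# tying the helpers above together: read_html accepts a file path, an
# open buffer, or a raw HTML string.
from io import BytesIO
from pandas.io.html import read_html

dfs_from_path = read_html('page.html', encoding='utf-8', index_col=0)
with open('page.html', 'rb') as fobj:
    dfs_from_buffer = read_html(BytesIO(fobj.read()), encoding='utf-8',
                                index_col=0)
with open('page.html', encoding='utf-8') as fobj:
    dfs_from_string = read_html(fobj.read(), index_col=0)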
def to_dataframe(r):
    return read_html(r.text, infer_types=False, header=0)[0]
data = db['data']
counter = 0

# Capturing data for male dogs
categories = ['Origin', 'Height', 'Weight', 'Color', 'Coat', 'AKC', 'FCI',
              'ANKC', 'CKC', 'KC (UK)', 'NZKC', 'UKC']

with open('dogdata/urls.txt') as f:
    lst = f.read().splitlines()

for url in lst:
    d = defaultdict(str)
    print(url)
    counter += 1
    try:
        infobox = read_html(url, index_col=0,
                            attrs={"class": "infobox biota"})
        dog_breed = (url.replace('https://en.wikipedia.org/wiki/', '')
                        .replace('_(dog)', '')
                        .replace('_', ' '))
        d['Breed'] = dog_breed
        success = True
    except Exception:
        print("Error: ", sys.exc_info()[0])
        success = False
        continue
    if success:
        for cat in categories:
            try:
                if cat == 'Color':
                    d[cat] = infobox[0].xs(cat).values[0][0]
                elif cat == 'Weight':
                    # unnecessary due to additional dataset on height and
                    # weight
                    if infobox[0].xs(cat).values[0][0] == 'Male':
                        d[cat] = infobox[0].xs(cat).values[0][1]
def GetAccountDataFromSinaBase(self, code, year, dataArr, table_type='zcfzb'):
    '''
    table_type: "zcfzb", "lrb", "llb", "fhpg"

    Base function for getting account data from Sina. Callers should wrap
    it in a try/except block to avoid exceptions.

    http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_BalanceSheet/stockid/600519/ctrl/part/displaytype/4.phtml
    http://money.finance.sina.com.cn/corp/go.php/vFD_BalanceSheet/stockid/600519/ctrl/2019/displaytype/4.phtml
    '''
    if table_type == '':
        table_type = 'zcfzb'
    Id = "BalanceSheetNewTable0"
    FINIANCE_SINA_URL = ('http://vip.stock.finance.sina.com.cn/corp/go.php/'
                         'vFD_BalanceSheet/stockid/%s/ctrl/%s/displaytype/4.phtml')
    furl = FINIANCE_SINA_URL % (code, year)

    # Table id and URL template for each statement type:
    #   zcfzb (balance sheet):       id="BalanceSheetNewTable0"
    #   lrb (income statement):      id="ProfitStatementNewTable0"
    #   llb (cash-flow statement):   id="ProfitStatementNewTable0"
    #   fhpg (dividends/placement):  id="sharebonus_1"
    if table_type == 'zcfzb':
        Id = "BalanceSheetNewTable0"
        FINIANCE_SINA_URL = ('http://vip.stock.finance.sina.com.cn/corp/go.php/'
                             'vFD_BalanceSheet/stockid/%s/ctrl/%s/displaytype/4.phtml')
        furl = FINIANCE_SINA_URL % (code, year)
    if table_type == 'lrb':
        Id = "ProfitStatementNewTable0"
        FINIANCE_SINA_URL = ('http://vip.stock.finance.sina.com.cn/corp/go.php/'
                             'vFD_ProfitStatement/stockid/%s/ctrl/%s/displaytype/4.phtml')
        furl = FINIANCE_SINA_URL % (code, year)
    if table_type == 'llb':
        Id = "ProfitStatementNewTable0"
        FINIANCE_SINA_URL = ('http://vip.stock.finance.sina.com.cn/corp/go.php/'
                             'vFD_CashFlow/stockid/%s/ctrl/%s/displaytype/4.phtml')
        furl = FINIANCE_SINA_URL % (code, year)
    if table_type == 'fhpg':
        Id = "sharebonus_1"
        FINIANCE_SINA_URL = ('http://money.finance.sina.com.cn/corp/go.php/'
                             'vISSUE_ShareBonus/stockid/%s.phtml')
        furl = FINIANCE_SINA_URL % code

    # Fetch the page with a randomized header (standard handling).
    getH = RandomHeader()
    headers = getH.GetHeader()
    request = urllib2.Request(furl, headers=headers)
    text = urllib2.urlopen(request, timeout=5).read()
    text = text.decode('gbk')
    html = lxml.html.parse(StringIO(text))

    # Extract the target table node by its id
    # (e.g. BalanceSheetNewTable0, ProfitStatementNewTable0).
    res = html.xpath("//table[@id=\"%s\"]" % Id)
    sarr = [etree.tostring(node).decode('gbk') for node in res]
    sarr = ''.join(sarr)
    sarr = '<table>%s</table>' % sarr
    # To roll back one year:
    # year -= 1
    # The last page is detected by whether any data remains.

    # Read the table into a DataFrame and concatenate it onto dataArr.
    df = read_html(sarr)[0]
    df.columns = range(0, df.shape[1])
    df = df.set_index(df.columns[0])
    dataArr = [dataArr, df]
    # dataArr = pd.concat(dataArr, axis=1, join='inner')
    dataArr = pd.concat(dataArr, axis=1)
    return dataArr
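# A self-contained sketch (toy HTML, not Sina's real page) of the core
# steps above: isolate a table node by id with lxml XPath, serialize it,
# and hand it to read_html.
from io import StringIO
import lxml.html
from lxml import etree
from pandas.io.html import read_html

text = ('<html><body><table id="BalanceSheetNewTable0">'
        '<tr><td>item</td><td>1</td></tr></table></body></html>')
html = lxml.html.parse(StringIO(text))
res = html.xpath('//table[@id="BalanceSheetNewTable0"]')
sarr = ''.join(etree.tostring(node).decode() for node in res)
df = read_html(sarr)[0]
print(df)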
def test_bool_header_arg(self):
    # GH 6114
    for arg in [True, False]:
        with pytest.raises(TypeError):
            read_html(self.spam_data, header=arg)
def _run_read_html(*args, **kwargs):
    _skip_if_no_parser()
    return read_html(*args, **kwargs)
def test_invalid_flavor():
    url = 'google.com'
    with pytest.raises(ValueError):
        read_html(url, 'google', flavor='not a* valid**++ flaver')
def game(hometeam, awayteam, week, year):
    """Download, parse, and clean the spreads & over-under tables for one game.

    The columns are pinnacle, betonline, bookmaker, each with suffix
    _spread or _over_under; datetime; hometeam, awayteam, favored; week.
    The first three are the bookies and give the spreads from the point of
    view of the favored team (so they're generally nonpositive).
    """
    with urlopen(spread_url(hometeam, awayteam, week, year)) as connection:
        spreads_page = connection.read()

    # Note that infer_types is deprecated and won't work starting in
    # Pandas 0.14.
    LOG.debug('Getting game %s', (hometeam, awayteam, week, year))
    sp = read_html(io=spreads_page.decode('utf-8'),
                   match="History",
                   attrs={'id': 'table-000'},
                   infer_types=False,
                   header=0,
                   skiprows=[1, 2, 3])
    if len(sp) != 1:
        raise CantFindTheRightTable
    sp = sp.pop()

    # Get the over-under page.
    ou = read_html(io=over_under_url(hometeam, awayteam, week, year),
                   match="History",
                   attrs={'cellspacing': 0},
                   infer_types=False,
                   header=0,
                   skiprows=[1, 2, 3])
    if len(ou) != 1:
        raise CantFindTheRightTable
    ou = ou.pop()

    # Cleaning.
    for t, name, date_col in ((sp, 'spread', 'Unnamed: 0'),
                              (ou, 'over_under', '\xa0')):
        datetime = pd.to_datetime(
            t[date_col]
            .replace(r'(\d\d?/\d\d?)', r'\1/%d' % year, regex=True)
            .replace(r'(01|02)/(\d\d?)/\d{4}', r'\1/\2/%d' % (year + 1),
                     regex=True))
        del t[date_col]

        # Replace all the '--' as missing so we can convert numbers to
        # floats.
        for column in t.keys():
            t[column] = (t[column]
                         .replace('--', 'nan')
                         .replace('(Pick)', 0)
                         .apply(float))

        # Add datetime back in after the str-to-float conversion so we
        # don't do it for the datetime.
        t['datetime'] = datetime

        # Lowercase column names for ease of programming later.
        t.columns = [h.lower() for h in t.columns]

        # Give spreads/over-under their suffixes.
        for col in 'pinnacle', 'betonline', 'bookmaker':
            t[col + '_' + name] = t[col]
            del t[col]

    data = sp.merge(ou, on=['datetime'], how='outer')
    assert set(data.datetime) == (set(sp.datetime) | set(ou.datetime))

    # Add this function's arguments to the table.
    data['hometeam'] = hometeam
    data['awayteam'] = awayteam
    data['week'] = week

    # Get the favored team from the big "Odds: Washington by 4" that shows
    # up at the top of the page.
    soup = BeautifulSoup(spreads_page)
    subheader = soup.find('p', attrs={'class': 'h1-sub'}).find('strong')
    m = _FAVORED_RE.search(subheader.contents[0])
    if m is None or not m.group('city'):
        raise ValueError("Couldn't figure out who was favored: %r"
                         % (subheader.contents,))
    city = m.group('city').replace(' ', '-').replace('.', '').lower()

    # city will be something like 'san-francisco' after the transformations
    # above. Find what team that is by looking for the links to the teams
    # that are also in that subheader.
    for link in subheader.findAll('a'):
        link = link['href']
        if city in link:
            data['favored'] = link.split('-')[-1]
            break
    else:
        raise ValueError("couldn't figure out who %s is" % city)

    return data
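# A self-contained sketch (toy column) of the cleaning step above: the
# bookie cells use '--' for missing and '(Pick)' for an even spread, so
# both are mapped before the float conversion.
import pandas as pd

col = pd.Series(['-3.5', '--', '(Pick)'])
col = col.replace('--', 'nan').replace('(Pick)', 0).apply(float)
print(col.tolist())  # [-3.5, nan, 0.0]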
def read_html(self, *args, **kwargs):
    kwargs.setdefault('flavor', self.flavor)
    return read_html(*args, **kwargs)
for x in list:
    try:
        matchup_df = matchup_df.append(matchup_scraper(str(x)))
    except Exception:
        break

# Calculate season stats and record from matchup stats in R
driver.get(
    'https://www.fantrax.com/fantasy/league/8i8nwftijzzq6mwq/standings?startDate=2019-10-02&endDate=2020-04-04&hideGoBackDays=true&period=5&timeStartType=PERIOD_ONLY&timeframeType=YEAR_TO_DATE&view=SEASON_STATS&pageNumber=1'
)
time.sleep(5)

table = driver.find_element_by_xpath(
    '/html/body/app-root/div/div[1]/div/app-league-standings/div/section/league-standings-tables/div/div[2]/ultimate-table/div/section/div'
)
table_html = table.get_attribute('innerHTML')
season_df = read_html(table_html)[0]

teams = driver.find_element_by_xpath(
    '/html/body/app-root/div/div[1]/div/app-league-standings/div/section/league-standings-tables/div/div[2]/ultimate-table/div'
)
teams_html = teams.get_attribute('innerHTML')
teams = re.findall("</figure>.*?<!---->", teams_html)
# categories = re.findall('">.*?</a></th>', teams_html)

# Strip the HTML wrappers around each team name.
for x in range(0, len(teams)):
    teams[x] = teams[x][10:]
    teams[x] = teams[x][:-8]

# for x in range(0, len(categories)):
#     categories[x] = re.findall(';">.*?</a></th>', categories[x])
#     categories[x] = str(categories[x])[6:]
#     categories[x] = categories[x][:-11]
# season_df.columns = categories
def test_invalid_flavor():
    url = "google.com"
    with tm.assertRaises(ValueError):
        read_html(url, "google", flavor="not a* valid**++ flaver")
def season_games(year):
    """Download, parse, and clean a table of games and scores for a given season.

    The columns are week; hometeam; awayteam; winner; date; points, yards,
    and turnovers for the winning team; points, yards, and turnovers for
    the losing team; and season.
    """
    LOG.debug('Getting season %d', year)
    data = read_html(io=season_games_url(year),
                     attrs={'id': 'games'},
                     infer_types=False,
                     header=0)
    if len(data) != 1:
        raise CantFindTheRightTable
    data = data.pop()

    # Cleaning.
    del data["Unnamed: 3"]

    # The filtering below issues a UserWarning, so we suppress it.
    with warnings.catch_warnings():
        warnings.filterwarnings(
            action='ignore', category=UserWarning,
            module=r'pandas\.core\.frame',
            message=(r"Boolean Series key will be reindexed"
                     r" to match DataFrame index\."))
        # These rows are mid-table header rows.
        data = data[data.Week != "Week"][data.Week != "nan"]

    data['week'] = (data.Week
                    .replace("WildCard", "wild-card")
                    .replace("Division", "divisional")
                    .replace("ConfChamp", "conference")
                    .replace("SuperBowl", "super-bowl")
                    .apply(lambda s: (int(s)
                                      if all(c in '1234567890' for c in s)
                                      else s)))
    del data['Week']

    data['season'] = year
    data['game_date'] = pd.to_datetime(
        data.Date
        .replace(r"$", r", %d" % year, regex=True)
        .replace(r"^(January|February) (\d+), \d+$",
                 r"\1 \2, %d" % (year + 1), regex=True))
    del data['Date']

    for column in "PtsW", "PtsL", "YdsW", "TOW", "YdsL", "TOL":
        data[column] = data[column].apply(int)

    data['WatL'] = data['Unnamed: 5'].apply(lambda x: x == '@')
    del data['Unnamed: 5']
    data['hometeam'] = (~data.WatL * data['Winner/tie']
                        + data.WatL * data['Loser/tie'])
    data['awayteam'] = (data.WatL * data['Winner/tie']
                        + ~data.WatL * data['Loser/tie'])
    data['winner'] = data['Winner/tie']
    for column in 'Winner/tie', 'Loser/tie', "WatL":
        del data[column]
    for column in 'hometeam', 'awayteam', 'winner':
        data[column] = data[column].apply(lambda s: s.split()[-1].lower())

    return data
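# A self-contained sketch (toy rows) of the mid-table header cleanup
# above: pro-football-reference repeats its header row inside the body,
# so rows whose Week cell is the literal string "Week" (or "nan") get
# dropped before the numeric conversion.
import pandas as pd

raw = pd.DataFrame({'Week': ['1', 'Week', '2', 'nan'],
                    'PtsW': ['20', 'PtsW', '17', 'nan']})
clean = raw[(raw.Week != 'Week') & (raw.Week != 'nan')]
print(clean)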
# finalamt = usd_to_pkr(float(rate), float(gbp))
# print(str(rupees) + " USD is equivalent to " + str(finalamt) + " PKR")

# url = "https://www.lme.com/"
# page = requests.get(url)
# tables = read_html(page.text, attrs={"class": "ring-times"})
# print(tables[0].head())
# tables[0].to_excel("df.xlsx")

url = "https://www.lme.com/en-GB/Metals/Non-ferrous#tabIndex=0"
page = requests.get(url)
tables = read_html(page.text)
print(tables[0].head())
# Convert the Zinc column using the current exchange rate.
tables[0]['Zinc'] = tables[0]['Zinc'].apply(lambda x: x * float(rate))
tables[0].to_excel("df.xlsx")

# soup = bs(url.content, 'html.parser')
# filename = 'test.csv'
# csv_writer = csv.writer(open(filename, 'w'))
# heading = soup.find('h2')
# table = soup.find_all("table")
# for tr in soup.find_all('tr'):
#     data = []
def test_invalid_flavor():
    url = 'google.com'
    with tm.assertRaises(ValueError):
        read_html(url, 'google', flavor='not a* valid**++ flaver')
def import_yahoo(symbol):
    equity_data = pd.DataFrame()
    if symbol == 'Brent':
        i = 0
        for i in range(0, 2):
            try:
                # driver = webdriver.Chrome(options=chrome_options)
                # driver = webdriver.Chrome(ChromeDriverManager().install(),
                #                           options=chrome_options)
                # driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH,
                #                           chrome_options=chrome_options)
                driver = webdriver.Chrome(
                    executable_path=os.environ.get("CHROMEDRIVER_PATH"),
                    chrome_options=chrome_options)
                url = ("https://markets.businessinsider.com/commodities/"
                       "historical-prices/oil-price/usd?type=brent")
                driver.get(url)
                time.sleep(3)
                table = driver.find_element_by_xpath(
                    '//*[@id="historic-price-list"]/div/div[2]/table/..')
                # table = driver.find_element_by_xpath(
                #     '//*[@id="historic-price-list"]/div/div[2]/table/')
                table_html = table.get_attribute('innerHTML')
                equity_data = read_html(table_html)[0]
                equity_data = (equity_data
                               .set_index(pd.DatetimeIndex(equity_data['Date']))
                               .drop(['Date'], axis=1)
                               .rename_axis('trade_date'))
                equity_data = equity_data.rename(
                    columns={"Closing Price": "close"})
                equity_data = (equity_data['close']
                               .reset_index()
                               .set_index('trade_date'))
                driver.quit()
                break  # success, so stop retrying
            except Exception:
                driver.quit()
                i += 1
                print(f"Still trying {2 - i} more times.")
    else:
        equity_data = yf.download(
            symbol,
            start=(datetime.today() -
                   dateutil.relativedelta.relativedelta(months=28)
                   ).strftime('%Y-%m-%d'),
            end=datetime.today().strftime('%Y-%m-%d')).rename_axis('trade_date')
        equity_data = equity_data.rename(columns={"Close": "close"})
        equity_data = (equity_data['close']
                       .reset_index()
                       .set_index('trade_date'))

    if not equity_data.empty:
        # First part of the insert statement
        insert_init = ("insert into equity_history "
                       "(trade_date, ticker, close) values ")

        # Add values for all days to the insert statement
        if symbol == 'BZ':
            symbol = 'Brent'
        vals = ",".join([
            "('{}', '{}', '{}')".format(str(trade_date), symbol, row.close)
            for trade_date, row in equity_data.iterrows()
        ])

        # Handle duplicates - avoids errors if some data is already in
        # the table.
        insert_end = " on duplicate key update close=close;"

        # Put the parts together
        query = insert_init + vals + insert_end

        # Fire the insert statement
        engine.execute(query)
def test_bool_header_arg(self):
    # GH 6114
    for arg in [True, False]:
        with tm.assertRaises(TypeError):
            read_html(self.spam_data, header=arg)
def test_bs4_version_fails(monkeypatch, datapath):
    import bs4
    monkeypatch.setattr(bs4, '__version__', '4.2')
    with tm.assert_raises_regex(ValueError, "minimum version"):
        read_html(datapath("io", "data", "spam.html"), flavor='bs4')
# Import packages
import requests
from bs4 import BeautifulSoup
import lxml.html as lh
import pandas as pd
import openpyxl
from selenium import webdriver
from pandas.io.html import read_html

# Start getting tables

# Overall table
driver = webdriver.Safari()
driver.get('https://fbref.com/en/comps/9/Premier-League-Stats')
table = driver.find_element_by_id('div_results32321_overall')
table_html = table.get_attribute('innerHTML')
lt_df = read_html(table_html)[0]
# lt_df.columns = lt_df.columns.get_level_values(1)
# driver.close()

# Home/away table
# driver = webdriver.Safari()
# driver.get('https://fbref.com/en/comps/9/Premier-League-Stats')
table = driver.find_element_by_id('div_results32321_home_away')
table_html = table.get_attribute('innerHTML')
homeaway_df = read_html(table_html)[0]
homeaway_df.columns = homeaway_df.columns.get_level_values(1)
# driver.close()

# Squads table
# driver = webdriver.Safari()
# driver.get('https://fbref.com/en/comps/9/Premier-League-Stats')
table = driver.find_element_by_id('div_stats_standard_squads')
table_html = table.get_attribute('innerHTML')
squads_df = read_html(table_html)[0]
import wikipedia as wiki
import string
import pandas as pd
from pandas.io.html import read_html

seedpage = wiki.page("List of United States cities by population")
url = seedpage.url
wikitables = read_html(url, attrs={"class": "wikitable sortable"})
print("Extracted {num} wikitables".format(num=len(wikitables)))
print(url)
table_df = wikitables[0]


def removecite(x):
    '''Simple function to remove citations from a scraped table.'''
    if type(x) == str:
        return x.partition('[')[0]
    else:
        return x


def getsummary(x):
    summaries = []
    for city in list(x):
        try:
            summary = wiki.page(city, auto_suggest=True,
                                redirect=True).summary
        except wiki.exceptions.DisambiguationError as e:
            summary = "Summary not fetched due to disambiguation"
def test_bs4_version_fails(monkeypatch, datapath):
    import bs4
    monkeypatch.setattr(bs4, "__version__", "4.2")
    with pytest.raises(ImportError, match="Pandas requires version"):
        read_html(datapath("io", "data", "html", "spam.html"), flavor="bs4")
def test_bs4_version_fails(monkeypatch, datapath):
    import bs4
    monkeypatch.setattr(bs4, '__version__', '4.2')
    with pytest.raises(ValueError, match="minimum version"):
        read_html(datapath("io", "data", "spam.html"), flavor='bs4')
def test_same_ordering(datapath):
    filename = datapath("io", "data", "html", "valid_markup.html")
    dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"])
    dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"])
    assert_framelist_equal(dfs_lxml, dfs_bs4)