#!/usr/bin/env python3
import pandas as pd

# read_html needs a full URL (including the scheme) and returns a list of
# DataFrames, one per <table> found on the page
tables = pd.read_html("http://www.chico.com/gasprices")
print(tables[0])
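Because read_html grabs every table on the page, it can return more than you want. A minimal sketch of narrowing the list with the match argument (the "Price" string here is made up for illustration):

import pandas as pd

# hypothetical: only keep tables whose text contains "Price"
tables = pd.read_html("http://www.chico.com/gasprices", match="Price")
print(len(tables))       # how many tables matched
print(tables[0].head())  # first rows of the first matching table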
# Scrape Mars data from several NASA-related pages with Splinter and
# BeautifulSoup, returning all results as one dictionary.
import time

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser


def init_browser():
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    return Browser('chrome', **executable_path, headless=False)


def scrape():
    browser = init_browser()

    # latest news headline and teaser paragraph
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    soup = BeautifulSoup(browser.html, 'html.parser')
    latest_news = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="article_teaser_body").text

    # featured image: click through to the full-size view, then grab its src
    url_2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url_2)
    browser.find_by_id("full_image").click()
    time.sleep(5)  # give the fancybox viewer time to load
    soup2 = BeautifulSoup(browser.html, "html.parser")
    img_url = soup2.find("img", class_="fancybox-image")["src"]
    featured_image_url = "https://www.jpl.nasa.gov" + img_url

    # most recent weather report from the Mars Weather Twitter account
    url_3 = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_3)
    soup3 = BeautifulSoup(browser.html, 'html.parser')
    mars_weather = soup3.find(
        "p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text

    # Mars profile table, rendered back to HTML for embedding
    url_4 = "http://space-facts.com/mars/"
    df_marsdata = pd.read_html(url_4)[0]
    df_marsdata.columns = ["Mars_Profile", "Mars_ProfileValue"]
    df_marsdata.set_index("Mars_Profile", inplace=True)
    marsdata_html = df_marsdata.to_html(justify="left")

    # title and full-resolution image URL for each of the four hemispheres
    url_5 = ("https://astrogeology.usgs.gov/search/results"
             "?q=hemisphere+enhanced&k1=target&v1=Mars")
    browser.visit(url_5)
    mars_hemisphere = []
    for i in range(4):
        time.sleep(5)
        browser.find_by_tag("h3")[i].click()
        soup5 = BeautifulSoup(browser.html, 'html.parser')
        partial = soup5.find("img", class_="wide-image")["src"]
        image_title = soup5.find("h2", class_="title").text
        mars_hemisphere.append({
            "title": image_title,
            "img_url": "https://astrogeology.usgs.gov" + partial,
        })
        browser.back()

    browser.quit()

    return {
        "latest_news": latest_news,
        "paragraph_text": paragraph_text,
        "featured_image_url": featured_image_url,
        "mars_weather": mars_weather,
        "marsdata_html": marsdata_html,
        "mars_hemisphere": mars_hemisphere,
    }
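A minimal sketch of driving the function above, assuming chromedriver is installed at the path given in init_browser:

# hypothetical driver snippet: run the scrape and dump the results
if __name__ == "__main__":
    results = scrape()
    for key, value in results.items():
        print(key, "->", value)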
np.sin(arr)  # universal functions apply elementwise
np.log(arr)
# NumPy arrays support the regular operators elementwise: * / + - < > <= >= !=

# pandas converts data of different types into tables and offers higher-level
# functions to filter and remap the data
import numpy as np
import pandas as pd

# read and write CSV files
df = pd.read_csv('file_name')
df.to_csv('example', index=False)  # writes the file; does not return a DataFrame

# Excel input and output; beware of images in the Excel file, they may crash the read
df = pd.read_excel('Excel_Sample.xlsx', sheet_name='Sheet1')
df.to_excel('excelname.xlsx', sheet_name='Sheet1')  # to_excel is a DataFrame method

# HTML input: returns a list of DataFrames, one per table on the page
tables = pd.read_html('http://.....html')

# read a database via SQLAlchemy
from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')
df.to_sql('data', engine)
sql_df = pd.read_sql('data', con=engine)

# building DataFrames
df = pd.DataFrame(np.random.randn(5, 4),
                  index='A B C D E'.split(),
                  columns='W X Y Z'.split())
df['W']          # select one column
df[['W', 'Z']]   # select several columns
# creating new columns
df['new'] = df['W'] + df['Y']  # e.g. derive a new column from existing ones
# removing columns; pass inplace=True to change df itself
df.drop('nameofthecolumn', axis=1)
# selecting rows by label
df.loc['A']
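Since the notes say comparison operators work elementwise, here is a minimal sketch (column names reuse the toy frame above) of how a comparison yields a boolean mask that filters a DataFrame:

import numpy as np
import pandas as pd

# toy frame just for illustration
df = pd.DataFrame(np.random.randn(5, 4),
                  index='A B C D E'.split(),
                  columns='W X Y Z'.split())

mask = df['W'] > 0  # elementwise comparison -> boolean Series
print(df[mask])     # keep only the rows where W is positive
print(df[(df['W'] > 0) & (df['Y'] < 1)])  # combine conditions with & and |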
# Scrape Wyckoff site data for selected space groups from the online
# International Tables for Crystallography and save each group as JSON.
import json
import re

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()  # assumes a local chromedriver; any Selenium driver works

letter = 'abcdefghijklmnopqrstuvwxyz'
for num in (146, 148, 155, 160, 161, 166, 167):
    Wyckoff_positions_list = []
    Wyckoff_positions = {}
    driver.get('https://it.iucr.org/Ac/ch2o3v0001/sgtable2o3o{}/'.format(num))
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # the last 'genpos' table holds the general-position coordinates
    table = soup.find_all('table', {'class': 'genpos'})[-1]
    table = table.find('table', {'class': 'genposcoords'})
    genpos = table.find('td', {'class': 'genposcoords'})
    genpos = re.findall(r"<i><i>(.*?)</i></i>", str(genpos))
    Wyckoff_positions_list.append(genpos)

    # the last 'specpos' table holds the special-position coordinates
    table = soup.find_all('table', {'class': 'specpos'})[-1]
    table = table.find_all('table', {'class': 'specposcoords'})
    for line in table:
        pos = (pd.read_html(str(line))[0]
               .iloc[0, 0]
               .replace(u'\xa0', u'')
               .split(','))
        Wyckoff_positions_list.append(pos)

    # the general position gets the highest Wyckoff letter, so assign
    # letters in reverse order of appearance
    n = len(Wyckoff_positions_list)
    for i in range(n):
        Wyckoff_positions[letter[n - 1 - i]] = Wyckoff_positions_list[i]

    with open('space_group_{}_Wyckoff_site_data.json'.format(num), 'w') as f:
        json.dump(Wyckoff_positions, f)
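A quick sanity check, assuming the loop above has already run and written its files: read one JSON back and list the letter-to-coordinates mapping.

import json

# load one of the files written above and inspect its Wyckoff letters
with open('space_group_146_Wyckoff_site_data.json') as f:
    positions = json.load(f)
for wyckoff_letter, coords in sorted(positions.items()):
    print(wyckoff_letter, coords)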
import pandas as pd  # pd.read_html parses the tables via lxml under the hood

# Pull every page of ESPNcricinfo innings-by-innings batting results and
# save each page of the stats table as its own CSV.
for x in range(1, 4009):
    url1 = ("http://stats.espncricinfo.com/ci/engine/stats/index.html"
            "?class=11;page=" + str(x) + ";template=results;type=batting;view=innings")
    data1 = pd.read_html(url1)
    df1 = data1[2]  # the third table on the page holds the innings list
    df1.to_csv(
        path_or_buf='/Users/mohiulalamprince/work/python/cricstat/player-info-%s.csv'
        % '{:05d}'.format(x),
        sep=',')
    print(str(x) + " => [DONE]")
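A sketch of how the per-page CSVs might be stitched back together afterwards (the glob pattern mirrors the output path above; the combined filename is made up):

import glob

import pandas as pd

# concatenate every per-page CSV into one frame; sorted() preserves page
# order because the filenames are zero-padded
files = sorted(glob.glob('/Users/mohiulalamprince/work/python/cricstat/player-info-*.csv'))
all_innings = pd.concat((pd.read_csv(f, index_col=0) for f in files), ignore_index=True)
all_innings.to_csv('/Users/mohiulalamprince/work/python/cricstat/all-innings.csv', index=False)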