from bs4 import BeautifulSoup
from splinter import Browser


def scrape(url):
    """Open a Chrome browser, fetch the page at `url`, and return its parsed HTML."""
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    browser.quit()  # release the driver once the HTML has been captured
    return soup
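# A minimal usage sketch for scrape(); the URL is a placeholder, and
# chromedriver.exe must sit next to the script (or be on the PATH):
#
#     soup = scrape("https://example.com")
#     print(soup.title.text)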
from time import sleep

from splinter import Browser

# switchIP() and get_temp_email() are helpers assumed to be defined elsewhere
# in the original project (Tor circuit switch / disposable-email provider).


def insta_account():
    """Create an Instagram account through a local Tor SOCKS proxy."""
    proxyIP = "127.0.0.1"
    proxyPort = 9150
    proxy_settings = {
        "network.proxy.type": 1,
        "network.proxy.socks": proxyIP,
        "network.proxy.socks_port": proxyPort,
        "network.proxy.socks_remote_dns": True,
    }
    browser_sp = Browser(
        'firefox',
        profile_preferences=proxy_settings,
        executable_path='/home/alex/Documents/Coder/geckodriver')
    browser_sp.visit("http://www.icanhazip.com")  # check the current exit IP
    switchIP()  # request a fresh Tor circuit
    sleep(2)
    browser_sp.visit("http://www.instagram.com/accounts/login/?hl=it")
    sleep(2)
    # The UI is in Italian: "Iscriviti" is the "Sign up" link/button.
    a_tags = browser_sp.find_by_tag("a")
    for a in a_tags:
        if a.text == "Iscriviti":
            a.click()
    email = get_temp_email()
    user = email.split("@")[0] + "linky"
    browser_sp.fill("emailOrPhone", email)
    browser_sp.fill("fullName", "Pebblor El Munchy")
    browser_sp.fill("username", user)
    browser_sp.fill("password", "fuckthepoliS")
    buttons = browser_sp.find_by_tag("button")
    for button in buttons:
        if button.text == "Iscriviti":
            button.click()
    # Keep a record of the accounts that were created.
    with open("users.txt", "a") as file_txt:
        file_txt.write(user + "\n")
    browser_sp.quit()
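# The source does not include switchIP(); a plausible sketch using the stem
# library is shown below, assuming Tor Browser's control port on 9151 (to
# match the SOCKS port 9150 above). get_temp_email() would wrap whatever
# disposable-mail API the project uses, so it is not sketched here.
from stem import Signal
from stem.control import Controller


def switchIP():
    """Ask the local Tor controller for a new circuit (hence a new exit IP)."""
    with Controller.from_port(port=9151) as controller:
        controller.authenticate()  # assumes cookie auth or no password
        controller.signal(Signal.NEWNYM)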
import logging
import os
import random
import re
import sys
import time
from urllib.parse import quote_plus, urlparse

import requests
from bs4 import BeautifulSoup
from splinter import Browser

# Optional legacy backend; only needed when browser="spynner" is requested.
try:
    import spynner
    from pyquery import PyQuery
except ImportError:
    spynner = None

# `private` is expected to be a (user, password) tuple supplied by the
# surrounding project (e.g. from a local credentials module).


class WOS(object):
    """
    A little module for exporting Web of Science search results into a txt file.
    """

    def __init__(self, **kwargs):
        """
        Construct a new WOS object given a query, an export file (without
        ".isi"), and a username and password for authentication, e.g.:

            WOS(query="TS=(epigenetic*)", outfile="epigenetic",
                user="******", passw="mypassw")
        """
        # Defining params.
        self.query = kwargs["query"]
        self.outfile = kwargs["outfile"] + ".isi"
        try:
            self.user = kwargs["user"]
            self.passw = kwargs["passw"]
        except KeyError:
            self.user, self.passw = private
        try:
            self.browser_app = kwargs["browser"]
        except KeyError:
            self.browser_app = "splinter"
        # Using the MLV authentication server.
        self.auth_url = ("https://apps-webofknowledge-com.fennec.u-pem.fr/"
                         "WOS_AdvancedSearch_input.do?&product=WOS"
                         "&search_mode=AdvancedSearch")
        # Firefox browser by default; spynner as a legacy alternative.
        if self.browser_app == "splinter":
            self.browser = Browser("firefox")
        else:
            self.browser = spynner.Browser()
            self.browser.set_html_parser(PyQuery)
        # Session params.
        self.session = None
        self.cookies = {}
        if self.query is None:
            sys.exit("No query provided")
        if "=" not in self.query:
            logging.warning("Syntax is not WOS compliant. Check query syntax.")
            sys.exit("Query syntax error")
        if self.outfile is None:
            self.outfile = re.sub(r"[^0-9a-zA-Z]+", "_", self.query) + ".isi"
        if self.user is None and self.passw is None:
            self.user, self.passw = private
        logging.info("WOS search parameters:\n\t- query: %s\n\t- outfile: %s"
                     "\n\t- user: %s\n\t- password: %s",
                     self.query, self.outfile, self.user, self.passw)
        self.run()

    def auth(self):
        """Authenticate through auth_url to obtain the session id (SID)."""
        if self.browser_app == "splinter":
            self.browser.visit(self.auth_url)
            self.browser.fill('username', self.user)
            self.browser.fill('password', self.passw)
            self.browser.find_by_name("submit").click()
            self.cookies = self.browser.cookies.all()
        else:
            self.browser = self.browser.load(self.auth_url)
            self.browser.wk_fill('input[id="username"]', self.user)
            self.browser.wk_fill('input[id="password"]', self.passw)
            self.browser.click('input[name="submit"]')
if "SessionError" in self.session.url : self.session.click('a[target="_top"]') self.session.wait(random.uniform(1, 3)) p_url = urlparse(self.browser.url) if p_url.netloc == "apps-webofknowledge-com.fennec.u-pem.fr": #print p_url.scheme+"//"+p_url.netloc+"/WOS_GeneralSearch_input.do?"+p_url.query match = re.match(re.compile("product\=(?P<product>.*?)\&search_mode\=(?P<search_mode>.*?)\&SID=(?P<ssid>.*?)\&preferencesSaved\="), str(p_url.query)) if match is not None: self.product = match.group("product") self.ssid = match.group("ssid") self.search_mode = re.sub("General", "Advanced", match.group("search_mode")) #self.search_mode = match.group("search_mode") self.search_url = "%s://%s/%s_%s_input.do?product=%s&search_mode=%s&SID=%s" %(p_url.scheme, p_url.netloc, self.product,self.search_mode,self.product,self.search_mode,self.ssid) if self.browser_app == "splinter": self.browser.visit(self.search_url) print self.browser.url else: self.browser.load(self.search_url) print self.browser.url return self else: return sys.exit("Session Id could not be found") else: logging.info("No redirection to service") return sys.exit("Invalid credentials") def launch_search(self): """ Filling the query form found into advanced search page """ logging.info("Launching search") if self.browser_app == "splinter": self.browser.fill("value(input1)", self.query) self.browser.find_by_xpath("/html/body/div[1]/form/div[1]/table/tbody/tr/td[1]/div[2]/div[1]/table/tbody/tr/td[1]/span[1]/input").click() bs = BeautifulSoup(self.browser.html) else: self.session.wk_fill('textarea[id="value(input1)"]', self.query) self.session.click('input[title="Search"]') self.session.wait(random.randint(2,5)) bs = BeautifulSoup(self.browser.html.encode("utf-8")) query_history = bs.find_all("div", {"class":"historyResults"}) self.nb_search = len(query_history) try: self.nb_results = int(re.sub(",", "", query_history[0].text)) except IndexError: self.nb_results = int(re.sub(",", "", query_history.text)) print self.nb_results logging.warning("Your search \"%s\" gave %i results"%(self.query, self.nb_results)) logging.info("Your SSID is : %s" %self.ssid) if self.browser_app == "splinter": self.browser.click_link_by_partial_href('/summary.do?') else: self.session.click('a[title="Click to view the results"]',wait_load=True) print urlparse(self.browser.url).query match = re.search(re.compile("product=WOS&doc\=(?P<doc>.*?)\&qid\=(?P<qid>.*?)&SID"), urlparse(self.browser.url).query) if match is not None: print match.group() self.doc, self.qid = match.group("doc"), match.group('qid') print self.doc, self.qid return self else: self.doc, self.qid = self.parse_params() return self def load_results(self, markFrom, markTo, i): """ Load_results(markFrom, markTo) 500 by 500 given the nb of results """ logging.info("loading results") #print "exporting" #p_url0= "http://apps.webofknowledge.com/AutoSave_UA_output.do?action=saveForm&SID=%s&product=UA&search_mode=output" %self.ssid #r0 = requests.post(p_url0, headers= headers, cookies=self.cookies) # print p_url0 #print r0 #p_url1= "http://apps.webofknowledge.com/AutoSave_UA_output.do?action=saveForm&SID=%s&product=UA&search_mode=results" %self.ssid # print p_url1 #r1 = requests.post(p_url1, headers= headers, cookies=self.cookies) #print r1 r_url = "https://apps-webofknowledge-com.fennec.u-pem.fr/summary.do?product=WOS&doc=1&qid="+self.qid+"&SID="+self.ssid+"&search_mode=AdvancedSearch" post_url = "https://apps-webofknowledge-com.fennec.u-pem.fr/OutboundService.do?action=go&&" #r2 = requests.post() header={ 'Host': 
'apps-webofknowledge-com.fennec.u-pem.fr', 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3', 'Accept-Encoding': 'gzip, deflate', 'DNT': 1, 'Referer': 'https://apps-webofknowledge-com.fennec.u-pem.fr/summary.do?product=WOS&doc=1&qid=%s&SID=%s&search_mode=AdvancedSearch'%(self.qid, self.ssid), 'Connection': 'keep-alive' } # markTo = 500 # markFrom = 1 data = { 'SID': self.ssid, 'colName':'WOS', 'count_new_items_marked':0, 'displayCitedRefs':'true', 'displayTimesCited':'true', 'fields_selection':'USAGEIND AUTHORSIDENTIFIERS ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE CITREF ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS', 'filters':'USAGEIND AUTHORSIDENTIFIERS ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE CITREF ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS', 'format':'saveToFile', 'locale':'en_US', 'markFrom':1, 'markTo':markTo, 'mark_from':markFrom, 'product':'WOS', 'mark_to':markTo, 'mode':'OpenOutputService', 'product':'WOS', 'qid':self.qid, 'startYear':'2015', 'endYear':'2014', #rurl:'http%3A%2F%2Fapps.webofknowledge.com%2Fsummary.do%3FSID%3DT1WYtnvIngPkHzI4ShI%26product%3DWOS%26doc%3D1%26qid%3D1%26search_mode%3DAd 'rurl':urllib.quote_plus(r_url), 'save_options':'othersoftware', 'search_mode':'AdvancedSearch', 'selectedIds':'', 'sortBy':'PY.D;LD.D;SO.A;VL.D;PG.A;AU.A', 'value(record_select_type)':'range', 'viewType':'summary', 'view_name':'WOS-summary', } r = requests.get(post_url, params=data,headers=header, cookies=self.cookies) #redirects to #url = "http://ets.webofknowledge.com/ETS/ets.do?" data_directory = self.outfile.split('.isi')[0] try: os.mkdir("exported_data") print "creating directory exported_data" except: print "exported_data already exists" pass try: os.mkdir("exported_data/"+data_directory) print "creating directory "+data_directory except: print data_directory +" already exists" pass final_r = requests.get(r.url, cookies=self.cookies, stream=True) with open( "exported_data/"+data_directory+'/'+data_directory+'_'+str(i) +'.isi' , 'w') as f: final_r.text f.write(final_r.text.encode('utf-8')) return self.outfile def export(self): """Writing results into outfile (defaut is normalized query)""" start_time = time.time() #open(self.outfile, 'w').close() l = list(range(0, self.nb_results, 500)) l.append(self.nb_results) logging.info("Exporting %s 500 by 500..." %self.nb_results) for i,n in enumerate(l): if l[i]+1 < self.nb_results: self.load_results(l[i]+1, l[i+1],str(l[i]+1)+'-'+str(l[i+1])) total = time.time() - start_time, "seconds" # raw_file = open(self.outfile, 'r') # raw_file_data = raw_file.read().decode("utf-8-sig").encode("utf-8") # nb_occurence = len(raw_file_data.split("\n\n"))-1 logging.info("Query \"%s\" had %d results: %d has been exported" %(self.query, self.nb_results)) logging.info("Sucessfully stored in directory : %s\n" %(self.outfile)) #logging.info("Execution total time:"+str(" ".join(total))) return def run(self): """ Generic method that encapsulates the WOS extract process """ self.auth() self.launch_search() self.export() self.browser.close() return
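# A minimal usage sketch for the WOS class, mirroring the constructor
# docstring; the credentials are placeholders:
#
#     WOS(query="TS=(epigenetic*)", outfile="epigenetic",
#         user="myuser", passw="mypassw")
#
# The constructor calls run() itself, which chains auth(), launch_search(),
# and export(), so instantiation alone performs the full extraction.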
import time

from bs4 import BeautifulSoup as bs

# `browser` is assumed to be a splinter Browser opened earlier in the script.

# Scrape the latest Mars weather tweet from the mission's Twitter account.
twitter_weather = 'https://twitter.com/marswxreport?lang=en'
browser.visit(twitter_weather)
time.sleep(1)
weather = bs(browser.html, 'html.parser')
# Twitter's generated CSS class names change frequently, so this selector
# (and the hard-coded index below) is fragile.
weather_tweet_text = weather.findAll(
    'span',
    class_="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0")
mars_weather = weather_tweet_text[23].text.strip()

# Scrape the full-resolution hemisphere image URLs from Astrogeology.
hemis_url = ('https://astrogeology.usgs.gov/search/results'
             '?q=hemisphere+enhanced&k1=target&v1=Mars')
browser.visit(hemis_url)
hemis_soup = bs(browser.html, 'html.parser')
time.sleep(2)
results = hemis_soup.find_all('div', class_='item')
base = 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/'
image_urls = []
for result in results:
    title = (result.find('div', class_='description')
                   .find('a', class_='product-item')
                   .find('h3').text)
    # Derive the full-size image URL from the thumbnail's filename:
    # the part between the first "_" of the tag and "_thumb.png" names
    # the image directory under `base`.
    thumbnail = result.find('a', class_='product-item').find('img',
                                                             class_='thumb')
    thumbstring = str(thumbnail)
    splitthumb = thumbstring.split('_', 1)
    splitAGAIN = splitthumb[1].split('_thumb.png"/>')
    full_url = base + splitAGAIN[0] + '/full.jpeg'
    image_urls.append({'title': title, 'urls': full_url})
browser.quit()
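# The splitting above parses the tag's string repr; a sketch of a slightly
# sturdier variant reads the img tag's src attribute instead (the filename
# layout is an assumption about the page, as in the original):
#
#     src = thumbnail['src']  # e.g. ".../<name>_thumb.png"
#     name = src.rsplit('/', 1)[-1].rsplit('_thumb.png', 1)[0]
#     full_url = base + name + '/full.jpeg'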
""" This class is a browser class, which can setting your browser type """ def __init__(self): self.browser = Browser(settings.get('BROWSER_TYPE')) logger.info('Splinter browser is init!') def close(self): self.browser.quit() logger.info('Splinter browser is close success!') def visit(self, url): self.browser.visit(url) logger.info('Splinter browser is visiting : %s' % url) def get_html(self): return self.browser.html def get_title(self): return self.browser.title def reload(self): self.browser.reload() if __name__ == '__main__': MyBrowser = Browser() MyBrowser.visit('http://www.baidu.com') print MyBrowser.get_title() MyBrowser.close()