def submit(self):
    last_id, b, c, d, e = Submit.get_latest_verdict(self.username)

    browser = RoboBrowser(parser='html.parser')
    browser.open('http://codeforces.com/enter')  # todo check if it takes time

    enter_form = browser.get_form('enterForm')
    enter_form['handleOrEmail'] = self.username
    enter_form['password'] = self.password
    browser.submit_form(enter_form)

    try:
        checks = list(map(lambda x: x.getText()[1:].strip(),
                          browser.select('div.caption.titled')))
        if self.username not in checks:
            Colour.print('Login Failed.. Wrong password.', Colour.RED)
            return
    except Exception as e:
        Colour.print('Login Failed.. Maybe wrong id/password.', Colour.RED)
        return

    # todo check if it takes time
    browser.open('http://codeforces.com/contest/' + self.c_name + '/submit')

    submit_form = browser.get_form(class_='submit-form')
    submit_form['submittedProblemIndex'].value = self.p_name
    submit_form['sourceFile'] = self.inputfile
    browser.submit_form(submit_form)

    print(browser.url)
    # if browser.url[-6:] != 'status':  # it was used when submitting from problemset
    if 'my' not in browser.url:
        Colour.print('Failed submission, probably you have submitted the same file before', Colour.RED)
        return

    Submit.print_verdict(last_id, self.username, 100)
    Colour.print('[{0}] submitted ...'.format(self.inputfile), Colour.GREEN)
def __init__(self, artist, song, p_proxy):
    self.artist = self.remove_unwanted_chars(artist)
    self.song = self.remove_unwanted_chars(song)
    session = None

    # setting proxy
    if p_proxy is not None:
        arr = p_proxy.split(',')
        for ent in arr:
            m = re.match(r'(.+)=([\d\.\:]+)', ent)
            if m:
                site = m.group(1)
                proxy = m.group(2)
                if site == self.site:
                    logging.info(self.log_msg("use proxy:" + proxy))
                    session = Session()
                    session.proxies = {'http': proxy, 'https': proxy}
                    break

    self.browser = RoboBrowser(parser="html.parser", session=session,
                               user_agent='Mozilla Firefox', tries=5)
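# The p_proxy string parsed above is a comma-separated list of site=host:port
# entries; a minimal sketch of that format with invented labels and addresses:
import re

p_proxy = 'lyrics=10.0.0.5:8080,chords=10.0.0.6:3128'
for ent in p_proxy.split(','):
    m = re.match(r'(.+)=([\d\.\:]+)', ent)
    if m:
        print(m.group(1), '->', m.group(2))
# lyrics -> 10.0.0.5:8080
# chords -> 10.0.0.6:3128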
def zdz_post010(uid, unam, upas, chdr, ctxt, uhost='http://ziwang.com/'):
    brow = RoboBrowser(history=True, cache=True)
    uexit = uhost + 'member.php?action=logout'
    brow.open(uexit)
    zt.wait(1)
    #
    ulog = uhost + 'forum.php'  # ,'58'  # off-topic ("water posting") board
    brow.open(ulog)
    zt.wait(2)
    # print('ulog,', ulog)
    xact = "member.php?mod=logging&action=login&loginsubmit=yes&infloat=yes&lssubmit=yes"
    xlog = brow.get_form(action=xact)
    # print('xlog', xlog)
    if xlog is None:
        return False

    print('@xlog,', unam, upas)
    #
    xlog['username'].value = unam
    xlog['password'].value = upas
    brow.submit_form(xlog)
    #
    # http://ziwang.com/forum.php?mod=post&action=newthread&fid=67
    upost0 = 'http://ziwang.com/forum.php?mod=post&action=newthread&fid='
    upost = upost0 + uid
    # print('@xpost, ', uid, upost)
    x = brow.open(upost)
    zt.wait(1)
    xact_post = 'forum.php?mod=post&action=newthread&fid=' + uid + '&extra=&topicsubmit=yes'
    xpost = brow.get_form(action=xact_post)
    # print('@xpost, ', xpost)
    #
    xpost['subject'].value, xpost['message'].value = chdr, ctxt
    brow.submit_form(xpost)
    # print('@xpost, ', upost)
    #
    # re_brow, upost, chk_post
    return True
def get_fb_token(login, password):
    fb_auth_url = 'https://www.facebook.com/v2.6/dialog/oauth?redirect_uri=fb464891386855067%3A%2F%2Fauthorize%2F&display=touch&state=%7B%22challenge%22%3A%22IUUkEUqIGud332lfu%252BMJhxL4Wlc%253D%22%2C%220_auth_logger_id%22%3A%2230F06532-A1B9-4B10-BB28-B29956C71AB1%22%2C%22com.facebook.sdk_client_state%22%3Atrue%2C%223_method%22%3A%22sfvc_auth%22%7D&scope=user_birthday%2Cuser_photos%2Cuser_education_history%2Cemail%2Cuser_relationship_details%2Cuser_friends%2Cuser_work_history%2Cuser_likes&response_type=token%2Csigned_request&default_audience=friends&return_scopes=true&auth_type=rerequest&client_id=464891386855067&ret=login&sdk=ios&logger_id=30F06532-A1B9-4B10-BB28-B29956C71AB1&ext=1470840777&hash=AeZqkIcf-NEW6vBd'
    s = RoboBrowser(parser="lxml")
    s.open(fb_auth_url)

    # fill in the login form
    f = s.get_form()
    f["pass"] = password
    f["email"] = login
    s.submit_form(f)

    # confirm the OAuth dialog and pull the token out of the response
    f = s.get_form()
    try:
        import re
        s.submit_form(f, submit=f.submit_fields['__CONFIRM__'])
        access_token = re.search(
            r"access_token=([\w\d]+)", s.response.content.decode()).groups()[0]
        return access_token
    except Exception as ex:
        print("access token could not be retrieved. Check your username and password.")
        print("Official error: %s" % ex)
        return {
            "error": "access token could not be retrieved. Check your username and password."
        }
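# A usage sketch for get_fb_token() above. The credentials are placeholders;
# on success a token string comes back, otherwise the error dict defined in
# the function (Facebook may also block the login with an extra challenge).
token = get_fb_token('user@example.com', 'hunter2')
if isinstance(token, str):
    print('token:', token[:8] + '...')
else:
    print(token['error'])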
def connections(handle):
    session = requests.Session()
    browser = RoboBrowser(session=session,
                          user_agent=random.choice(HEADERS_LIST),
                          parser="lxml")
    client = MongoClient("mongodb://*****:*****@35.185.118.72:27017/")
    db = client['sixdos']
    # ttweets = total_tweets(handle)
    if db.data.find({'_id': handle}).count() == 0:
        print(handle)
        updatestats.update_last(handle)
        min_position, links = get_tweets(handle)
        with tqdm(total=10000) as pbar:  # total=10000 assumed; the original passed 10000 positionally
            while True:
                min_position1, links1 = get_tweets(handle, min_position)
                links = links + links1
                pbar.update(len(links1))
                if min_position1 is None:
                    break
                min_position = min_position1
        people_list = []
        for link in tqdm(links):
            if handle in link:
                people_list = people_list + get_people(link, handle)
        print("Handle: ", handle, "Length: ", str(len(people_list)), people_list)
        t = datetime.datetime.now()
        # t = datetime.datetime(year, month, day)
        s = t.strftime('%Y-%m-%d %H:%M:%S.%f')
        result = {"_id": handle,
                  "Length": str(len(people_list)),
                  "Connections": str(people_list),
                  "date": s[:-3]}
        update = db.data.update({'_id': handle}, {"$set": result}, upsert=True)
        people_list = []
        return update
def authenticate_user(self, user_email, password):
    # Set the user's input values and read the home page
    webpage = RoboBrowser()
    goodreads_page = 'http://www.goodreads.com'  # best practice: page can be changed whenever needed
    webpage.open(goodreads_page)

    # load and submit the login form using the get_form function
    login_form = webpage.get_form(id='sign_in')
    login_form['user[email]'].value = user_email
    login_form['user[password]'].value = password
    webpage.submit_form(login_form)

    # read the web page again and check for certain tags only visible after login to verify the user;
    # another method could be hitting the database for verification using dynamic query building
    home_page = str(webpage.parsed)  # .parsed is a property, not a method
    if "Currently Reading" in home_page:
        print("User Authenticated")
        return True
    else:
        print("Invalid user credentials")
        return False
def get_hitran_molecules():
    """
    Accesses http://hitran.org/lbl/# and reads its table

    Returns:
        tuple: table (list of lists), header (list of strings)
    """
    data, header = [], []
    browser = RoboBrowser(history=True, parser="lxml")
    browser.open("http://hitran.org/lbl/#")
    table = browser.find("table")

    hh = table.find_all("th")
    for h in hh:
        # Skips cells whose class starts with "meta" (they are not of interest)
        cls = h.get("class")
        if isinstance(cls, list) and cls[0].startswith("meta"):
            continue
        header.append(h.text)

    rr = table.find_all("tr")
    for r in rr:
        dd = r.find_all("td")
        if len(dd) == 0:
            continue
        row = []
        data.append(row)
        for d in dd:
            # Skips cells whose class starts with "meta" (they are not of interest)
            cls = d.get("class")
            if isinstance(cls, list) and cls[0].startswith("meta"):
                continue
            row.append(d.text)

    return data, header
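# A minimal usage sketch for the scraper above; the actual column titles depend
# on the live HITRAN page, so the printed values are only illustrative.
data, header = get_hitran_molecules()
print(header)         # list of column titles scraped from the table
for row in data[:3]:  # first three molecules
    print(row)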
def set_login(handle=None):
    if handle is None:
        handle = input("Handle: ")
    password = getpass.getpass("Password: ")
    browser = RoboBrowser(parser="lxml")
    browser.open("http://codeforces.com/enter")
    enter_form = browser.get_form("enterForm")
    enter_form["handleOrEmail"] = handle
    enter_form["password"] = password
    browser.submit_form(enter_form)

    checks = list(map(lambda x: x.getText()[1:].strip(),
                      browser.select("div.caption.titled")))
    if handle not in checks:
        print("Login Failed.")
        return
    else:
        secret_loc = os.path.join(os.path.dirname(__file__), "secret")
        secretfile = open(secret_loc, "w")
        secretfile.write(encode(handle) + " " + encode(password))
        secretfile.close()
        print("Successfully logged in as " + handle)
def pushedbutton(self, b):
    account = self.lineEdit.text()
    pasw = self.lineEdit_3.text()
    # use robobrowser module to manipulate the web page
    browser = RoboBrowser(history=True)
    browser.open('http://web1.cmu.edu.tw/stdinfo/login.asp')
    form1 = browser.get_form(id='form1')
    form1['f_id'].value = account
    form1['f_pwd'].value = pasw
    browser.submit_form(form1)
    if browser.state.url == "http://web1.cmu.edu.tw/stdinfo/loginerr.asp":
        self.lineEdit_2.setText('帳號密碼錯了?')  # "Wrong account or password?"
    else:
        self.lineEdit_2.setText('成功登入,填寫中....')  # "Logged in, filling out forms..."
        link_one = browser.get_link(text='教師教學意見調查')  # teaching evaluation survey
        browser.follow_link(link_one)
        pending = []  # avoid shadowing the built-in name `list`
        for l in browser.get_links(text='填寫'):  # "fill in"
            pending.append(l)
        pending.pop(0)
        for li in pending:
            browser.follow_link(li)
            form2 = browser.get_form(id='thisform')
            for i in range(1, 11):  # CH_1 .. CH_10 all set to '3'
                form2['CH_' + str(i)].value = '3'
            browser.submit_form(form2)
        self.lineEdit_2.setText('Done!')
def get_portal_auth() -> str:
    """
    Attempts login to the Club1909 page and retrieves the cookie FortressPortalAuth
    :return:
    """
    browser = RoboBrowser(session, history=True)
    browser.open(LOGIN_FORM_URL)
    login_form = browser.get_forms()[0]
    login_form['email'] = os.environ['club1909_username']
    login_form['password'] = os.environ['club1909_password']
    # TODO: check get_forms returns one value
    # TODO: check login errors / exceptions
    logging.debug(
        f"Attempt to login with {os.environ['club1909_username']} and {os.environ['club1909_password']} "
    )
    browser.submit_form(login_form)
    logging.info(
        f"Found portal Auth code {browser.session.cookies['.FortressPortalAuth']}"
    )
    return browser.session.cookies['.FortressPortalAuth']
def show(self):
    browser = RoboBrowser(parser='html.parser')
    browser.open('http://codeforces.com/enter')
    enter_form = browser.get_form('enterForm')
    enter_form['handleOrEmail'] = self.username
    enter_form['password'] = self.password
    browser.submit_form(enter_form)

    try:
        checks = list(map(lambda x: x.getText()[1:].strip(),
                          browser.select('div.caption.titled')))
        if self.username not in checks:
            click.secho('Login Failed.. Wrong password.', fg='red')
            return
    except Exception as e:
        click.secho('Login Failed.. Maybe wrong id/password.', fg='red')
        return

    browser.open('http://codeforces.com/contest/' + self.c_name +
                 '/standings/friends/true')
    soup = browser.parsed  # .parsed is a property; no separate soup needed

    ftable = soup.findAll('table', {'class': 'standings'})[0].findAll('tr')[1:-1]
    tableh = soup.findAll('table', {'class': 'standings'})[0].findAll('tr')[0].findAll('th')

    table_data = [[x.getText().strip() for x in tableh]]
    for friend in ftable:
        row = [x.getText().strip() for x in friend.findAll('td')]
        table_data += [row]

    tt = texttable.Texttable()
    tt.add_rows(table_data)
    tt.set_cols_valign(["b"] * len(tableh))
    print(tt.draw())
def submit(self):
    # get the latest submission id, so the new submission should have a different id
    last_id, b, c, d, e = Submit.get_latest_verdict(self.username)

    browser = RoboBrowser(parser='html.parser')
    browser.open('http://codeforces.com/enter')
    enter_form = browser.get_form('enterForm')
    enter_form['handleOrEmail'] = self.username
    enter_form['password'] = self.password
    browser.submit_form(enter_form)

    try:
        checks = list(map(lambda x: x.getText()[1:].strip(),
                          browser.select('div.caption.titled')))
        if self.username not in checks:
            click.secho('Login Failed.. Wrong password.', fg='red')
            return
    except Exception as e:
        click.secho('Login Failed.. Maybe wrong id/password.', fg='red')
        return

    browser.open('http://codeforces.com/problemset/submit')
    submit_form = browser.get_form(class_='submit-form')
    submit_form['submittedProblemCode'] = self.prob_id
    submit_form['sourceFile'] = self.inputfile
    browser.submit_form(submit_form)

    if browser.url[-6:] != 'status':
        click.secho('Failed submission, probably you have submitted the same file before',
                    fg='red')
        return

    Submit.print_verdict(last_id, self.username, 100)
    click.secho('[{0}] submitted ...'.format(self.inputfile), fg='green')
def rslt(user, password):
    url = 'http://erp.iitbbs.ac.in'
    browser = RoboBrowser(history=False, parser='html.parser')
    response = browser.open(url)
    form = browser.get_form(action='login.php')
    form['email'].value = user
    form['password'].value = password
    browser.submit_form(form)
    if browser.url != 'http://erp.iitbbs.ac.in/home.php':
        return False

    attendance_link = 'http://erp.iitbbs.ac.in/Result/results.php'
    browser.open(attendance_link)
    soup = BeautifulSoup(browser.response.text, 'html.parser')
    content1 = soup.find('div', attrs={'class': 'inner2'})
    table0 = content1.find('table', attrs={'class': 'marks_list'})
    table1 = table0.find_all('table')
    heading = table1[0].find_all('tr')
    result = dict()
    lst = []
    lst1 = []
    for i in range(len(table1)):
        num = 0
        for row in table1[i].find_all('tr'):
            td = row.find_all('td')
            for j in td:
                lst.append(j.text.strip())
            result['table' + str(num)] = lst
            lst = []
            num = num + 1
    return result
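# A usage sketch for rslt() above. The credentials are placeholders; the function
# returns False on a failed login, otherwise a dict of flattened table rows.
marks = rslt('student@example.com', 'secret')
if marks is False:
    print('Login failed')
else:
    for key, cells in marks.items():
        print(key, cells)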
def __init__(self):
    self.HEADERS_LIST = [
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
        'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
        'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
    ]
    self.session = requests.Session()
    self.browser = RoboBrowser(session=self.session,
                               user_agent=random.choice(self.HEADERS_LIST))
    self.handle = ''
    self.id_url = "https://twitter.com/intent/user?user_id="
    self.prof_url = "https://twitter.com/"
    self.TWITTER_AUTH = tweepy.OAuthHandler("", "")
    self.TWITTER_AUTH.set_access_token("", "")
    self.api = tweepy.API(self.TWITTER_AUTH,
                          parser=tweepy.parsers.JSONParser(),
                          wait_on_rate_limit=True,
                          wait_on_rate_limit_notify=True,
                          compression=True)
import re
from urllib import request
from robobrowser import RoboBrowser


def scrap(url):
    browser = RoboBrowser(user_agent='i am tool')
    browser.open(url)
    a = browser.find(class_='captcha')  # machine learning would be great for class prediction
    fullsrc = url[:-1] + a['src']
    request.urlretrieve(fullsrc, "captcha.jpg")
    # tesseract goes here
    # right here, exactly
    form = browser.get_form(action=re.compile(r'.'))
    # Fill it out
    form['name'].value = 'namaaaeee'
    form['password'].value = '*****@*****.**'
    form['password2'].value = 'teambeaver'
    form['captcha_1'].value = '1234'
    # Submit the form
    browser.submit_form(form)
    print(browser.response)
def desktop(keyword, sitename, device, useragent):
    parser = 'html.parser'
    browser = RoboBrowser(history=False, user_agent=useragent, parser=parser)
    browser.open('https://www.google.com/search?num=100&q=' + keyword)
    # links = browser.find_all("div", {"class": "KJDcUb"})  # desktop div where URLs are
    links = browser.find_all("div", {"class": "g"})
    counter = 0
    print('The user agent you used was ----> ' + useragent)
    d = []
    for i in links:
        counter = counter + 1
        if sitename in str(i):
            url = i.find_all('a', href=True)
            position = "%d" % (counter)
            rank = "%s" % (url[0]['href'])
            now = datetime.date.today().strftime("%d-%m-%Y")
            d.append(keyword)
            d.append(position)
            d.append(rank)
            d.append(device)
            d.append(now)
            print(keyword, position, rank, device, now)
    csv_export(d, keyword, device)
def getData(self):
    # temp
    sqlString = "DELETE FROM coin_ktoon WHERE date='%s'" % (self.todayString)
    self.dbconn.cur.execute(sqlString, )
    # temp
    browser = RoboBrowser(
        history=True,
        user_agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/33.0.1750.152 Chrome/33.0.1750.152 Safari/537.36'
    )

    # ktoon login ---------------------------------------------------------------- start
    auth_url = 'https://www.myktoon.com/web/loginprc.kt'
    data = {'email': 'email value', 'passwd': 'password value'}
    browser.open(auth_url, method='post', data=data)
    # ktoon login ---------------------------------------------------------------- end

    url_content = 'https://www.myktoon.com/web/payment/payment.kt'
    browser.open(url_content)
    self.coinTopElements = browser.find_all('a', {"name": "payamountkey"})
    self.product_title_list, self.product_price_list = [], []
    for idx, cel in enumerate(self.coinTopElements):
        for pt in cel.find_all('span'):
            if '베리' in pt.get_text():  # '베리' ("berry") is the site's coin unit
                print(pt.get_text())
                self.product_title_list.append(pt.get_text())
        for pp in cel.find_all('span', class_='won'):
            print(pp.get_text())
            self.product_price_list.append(pp.get_text())
def main():
    parser = parse_args()
    args = parser.parse_args()
    if not args.link or not args.output:
        parser.print_help()
        exit()

    if os.path.isfile("config.json"):
        config_file = "config.json"
    else:
        config_file = ".example.config.json"

    # Read config file
    with open(config_file, 'r') as f:
        config = json.load(f)

    # browser instance
    browser = RoboBrowser(history=True, parser="html.parser")

    firmware_file_path = args.output[0]
    firmware_file_url = args.link[0]

    if os.path.isfile(config['cookiefile']):
        with open(config['cookiefile'], 'r') as cookie_file:
            # set session cookies
            add_dict_to_cookiejar(browser.session.cookies, json.load(cookie_file))
        download_firmware(browser, firmware_file_url, firmware_file_path)
    else:
        # Browse to sammobile login page
        browser.open("https://www.sammobile.com/login/")
        form = browser.get_form(id="loginform-custom")
        form["log"] = config['username']
        form["pwd"] = config['password']
        browser.submit_form(form)
        if browser.url == "https://www.sammobile.com":
            _save_session_cookie(browser.session.cookies, config['cookiefile'])
            download_firmware(browser, firmware_file_url, firmware_file_path)
warnings.filterwarnings('ignore')

filename = 'Q12005'
filepath = './zipfile'
# creating the folder for zipfiles
pathlib.Path(filepath).mkdir(parents=True, exist_ok=True)

# account parameters
username = '******'
password = '******'
# username = input('Please enter your username: ')
# password = input('Please enter your password: ')

br = RoboBrowser()
br.open('https://freddiemac.embs.com/FLoan/secure/login.php?pagename=download')

# Getting form from browser
form = br.get_form()
form['username'] = username
form['password'] = password
br.submit_form(form)

filename = input('Please enter filename: ')

# accept the form
form1 = br.get_form()
form1['accept'] = 'Yes'
br.submit_form(form1)
def open_page(url):
    """Opens the goodreads homepage for login"""
    br = RoboBrowser(history=True, parser="html.parser")
    br.open(url)
    return br
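# A one-line usage sketch for the helper above, with the Goodreads URL its
# docstring refers to:
br = open_page('https://www.goodreads.com')
print(br.url)  # confirm the page loaded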
def scrape_bio_and_albums(keywords):
    """
    scrapes artists' data on https://www.lyrics.com/, and stores text (bio and albums)
    from an HTML page source of each artist's page in file (text & csv files).
    :param keywords: list of keywords that should represent artists' names (list)
    :return:
    """
    if keywords and isinstance(keywords, list):
        # builds new object of RoboBrowser with given params
        browser = RoboBrowser(
            parser='html.parser',
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            history=True,
            timeout=10)
        # Open a URL (using 'RoboBrowser' library).
        browser.open(BASE_URL)
        for keyword in keywords:
            if keyword and len(keyword) > 1:
                # get browser url (should be 'old' after searching a term - if browser goes to new url)
                old_url = browser.url
                # trying to search keyword on 'lyrics.com' (using RoboBrowser's methods to handle forms)
                form = browser.get_form(id='search-frm')  # Find form by ID 'search-frm'
                form['st'].value = keyword  # sets query value 'st' with given keyword
                browser.submit_form(form)  # Submit a form - to search given keyword
                # check if the url is changed (after searching a keyword)
                if old_url != browser.url:
                    # select required <a> tags, using CSS Selectors (see BeautifulSoup's documentation)
                    a_tags = browser.select('body p[class~=serp-flat-list] a[href^=artist/]')
                    if a_tags:
                        # browser.follow_link(a_tags[0])
                        # builds base url with href - to open required url using 'open()' method,
                        # and avoid including the "/lyrics/" part in url, when using 'follow_link()' method
                        first_artist_url = a_tags[0]['href'].replace("artist", BASE_URL + "artist")
                        # Open URL (should get url of the first suggested artist's page in results)
                        browser.open(first_artist_url)
                        # parse response content (bs4 obj), using HTML parser specified by the browser
                        soup = browser.parsed
                        if soup:
                            artist_bio_tag = soup.find(class_='artist-bio')  # find tag by class
                            if artist_bio_tag:
                                # save parsed text (artist bio) from page source to a text file
                                save_source(keyword + " - bio",
                                            artist_bio_tag.get_text(),
                                            dir_path=os.path.join(ARTISTS_PATH, keyword))
                                # parse albums & songs from html tables, and save the data to a csv file
                                albums_to_csv(soup,
                                              keyword + " - albums",
                                              dir_path=os.path.join(ARTISTS_PATH, keyword))
                        browser.back()  # Go back in browser history.
                    browser.back()  # Go back in browser history.
import re
from robobrowser import RoboBrowser

# Browse to Rap Genius
browser = RoboBrowser(history=True)
browser.open('http://rapgenius.com/')

# Search for Queen
form = browser.get_form(action='/search')
form                # <RoboForm q=>
form['q'].value = 'queen'
browser.submit_form(form)

# Look up the first song
songs = browser.select('.song_name')
browser.follow_link(songs[0])
lyrics = browser.select('.lyrics')
lyrics[0].text      # \n[Intro]\nIs this the real life...

# Back to results page
browser.back()

# Look up my favorite song
browser.follow_link('death on two legs')

# Can also search HTML using regex patterns
lyrics = browser.find(class_=re.compile(r'\blyrics\b'))
lyrics.text
#!/usr/bin/env python3
import os
import re
from collections import defaultdict
from datetime import datetime

from robobrowser import RoboBrowser
from ccf.config import LoadSettings
import pandas as pd

browser = RoboBrowser(history=True, timeout=6000, parser="lxml")
config = LoadSettings()["KSADS"]
download_dir = config["download_dir"]


def main():
    login()
    download_all()
    generate_snapshot_from_raw_excel_files()


def login():
    browser.open("https://ksads.net/Login.aspx")
    form = browser.get_form("form1")
    form["txtUsername"].value = config["user"]
    form["txtPassword"].value = config["password"]
    browser.submit_form(form)

    if browser.response.url == "https://ksads.net/Login.aspx":
        # a `return False` originally followed this raise, but it was unreachable
        raise Exception("Incorrect credentials provided")
session = requests.Session()
# duplicate 'http' keys collapse to the last entry, so only 159.203.118.239:8080 takes effect
session.proxies = {'http': '91.214.70.99:3128',
                   'http': '159.203.118.239:8080'}

while votes <= 4094:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    text = str(soup.findAll('tr')[58])
    for i in range(len(text)):
        if text[i].isdigit() and count > 0:
            ini += str(text[i])
        if text[i].isdigit() and text[i + 1].isspace() and count == 0:
            count += 1
        elif text[i].isdigit() and text[i + 1].isspace() and count > 0:
            break
    votes = int(ini)
    print(votes)
    ini = ''
    count = 0
    br = RoboBrowser(history=True, session=session)
    br.open(url)
    form = br.get_form()
    form['id'].value = 733
    br.submit_form(form)
    tmp += 1
    if tmp == 30:
        time.sleep(1)
        tmp = 0
print("Finish! {} votes".format(votes + 1))
class CommenHen:
    "Contains common methods"
    LOCK = threading.Lock()
    TIME_RAND = app_constants.GLOBAL_EHEN_TIME
    QUEUE = []
    COOKIES = {}
    LAST_USED = time.time()
    HEADERS = {
        'user-agent': "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0"
    }
    _QUEUE_LIMIT = 25
    _browser = RoboBrowser(user_agent=HEADERS['user-agent'], parser='html.parser')

    def begin_lock(self):
        log_d('locked')
        self.LOCK.acquire()
        t1 = time.time()
        while int(time.time() - self.LAST_USED) < self.TIME_RAND:
            t = random.randint(3, self.TIME_RAND)
            time.sleep(t)
        t2 = time.time() - t1
        log_d("Slept for {}".format(t2))

    def end_lock(self):
        log_d('unlocked')
        self.LAST_USED = time.time()
        self.LOCK.release()

    def add_to_queue(self, url='', proc=False, parse=True):
        """Add url to the queue; when the queue has reached _QUEUE_LIMIT entries it will auto process
        :proc -> process queue
        :parse -> return parsed metadata
        """
        if url:
            self.QUEUE.append(url)
            log_i("Status on queue: {}/{}".format(len(self.QUEUE), self._QUEUE_LIMIT))
        try:
            if proc:
                if parse:
                    return self.parse_metadata(*self.process_queue())
                return self.process_queue()
            if len(self.QUEUE) >= self._QUEUE_LIMIT:
                if parse:
                    return self.parse_metadata(*self.process_queue())
                return self.process_queue()
            else:
                return 1
        except TypeError:
            return None

    def process_queue(self):
        """
        Process the queue if entries exist, deletes entries.
        Note: Will only process _QUEUE_LIMIT entries (first come first out)
        while additional entries will get deleted.
        """
        log_i("Processing queue...")
        if len(self.QUEUE) < 1:
            return None

        try:
            if len(self.QUEUE) >= self._QUEUE_LIMIT:
                api_data, galleryid_dict = self.get_metadata(self.QUEUE[:self._QUEUE_LIMIT])
            else:
                api_data, galleryid_dict = self.get_metadata(self.QUEUE)
        except TypeError:
            return None
        finally:
            log_i("Flushing queue...")
            self.QUEUE.clear()
        return api_data, galleryid_dict

    @classmethod
    def login(cls, user, password):
        pass

    @classmethod
    def check_login(cls, cookies):
        pass

    def check_cookie(self, cookie):
        cookies = self.COOKIES.keys()
        present = []
        for c in cookie:
            if c in cookies:
                present.append(True)
            else:
                present.append(False)
        if not all(present):
            log_i("Updating cookies...")
            try:
                self.COOKIES.update(cookie)
            except requests.cookies.CookieConflictError:
                pass

    def handle_error(self, response):
        pass

    @classmethod
    def parse_metadata(cls, metadata_json, dict_metadata):
        """
        :metadata_json <- raw data provided by site
        :dict_metadata <- a dict with gallery id's as keys and url as value

        returns a dict with url as key and gallery metadata as value
        """
        pass

    def get_metadata(self, list_of_urls, cookies=None):
        """
        Fetches the metadata from the provided list of urls
        returns raw api data and a dict with gallery id as key and url as value
        """
        pass

    @classmethod
    def apply_metadata(cls, gallery, data, append=True):
        """
        Applies fetched metadata to gallery
        """
        pass

    def search(self, search_string, **kwargs):
        """
        Searches for the provided string or list of hashes,
        returns a dict with search_string:[list of title & url tuples] of hits found
        or empty dict if no hits are found.
        """
        pass
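# The class above is an abstract base: concrete scrapers are expected to override
# get_metadata/parse_metadata. A minimal sketch of the intended queue flow with a
# hypothetical DummyHen subclass (all names and return shapes below are invented
# for illustration, not part of the original code):
class DummyHen(CommenHen):
    def get_metadata(self, list_of_urls, cookies=None):
        # pretend each URL maps to one gallery id
        api_data = [{"id": i} for i, _ in enumerate(list_of_urls)]
        galleryid_dict = {i: u for i, u in enumerate(list_of_urls)}
        return api_data, galleryid_dict

    @classmethod
    def parse_metadata(cls, metadata_json, dict_metadata):
        return {url: meta for meta, url in zip(metadata_json, dict_metadata.values())}

hen = DummyHen()
hen.add_to_queue('http://example.com/g/1')                     # queued, returns 1
print(hen.add_to_queue('http://example.com/g/2', proc=True))   # forces processing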
import os
from robobrowser import RoboBrowser
from string import capwords

url = "http://www.guitarcats.com/realbook-jazz-standards/A"

# Open up HTML
browser = RoboBrowser(parser="html.parser", history=True)
browser.open(url)
if browser.response.status_code != 200:
    print("Server responded with code " + str(browser.response.status_code) + " for " + url)
    print("Exiting...")
    quit()

links = browser.find_all("a", {"class": "realbook_letter_link"})
for link in links:
    # Open A-Z categories 1 by 1
    browser.open(link["href"])

    # Check proper response
    if browser.response.status_code != 200:
        print("Server responded with code " + str(browser.response.status_code) + " for " + link["href"])
        continue

    # Get current category and find all its songs
    letter = link["href"][-1]
def remove_prefix(text, prefix):
    if text.startswith(prefix):
        return text[len(prefix):]
    return text

# Fix from: https://github.com/jmcarp/robobrowser/issues/93#issuecomment-650640171
import werkzeug
werkzeug.cached_property = werkzeug.utils.cached_property

# Start scraping
from robobrowser import RoboBrowser
import re
from urllib.parse import urlparse  # Python 3 replacement for the Python 2 `urlparse` module

courseModulesUrl = 'https://myuni.adelaide.edu.au/courses/' + args.course + '/modules'
browser = RoboBrowser(history=True, parser='html.parser')
browser.open(courseModulesUrl)

# Handle login page
form = browser.get_form(id='fm1')
form["username"] = args.username
form["password"] = args.password
browser.session.headers['Referer'] = args.course
browser.submit_form(form)

# Get course name (no special characters)
courseTitle = browser.find("title").text
courseTitle = remove_prefix(courseTitle, 'Course Modules: ')
courseTitle = "".join([x if x.isalnum() else "_" for x in courseTitle])

print('Course Url: ' + courseModulesUrl)
print('Course Title: ' + courseTitle)
print('Finding file links of type: ' + args.downloadOnly)

# Make output dir
from urllib import request
import csv
import sys

from robobrowser import RoboBrowser

"""
Script for the unipa site, Sicily region
first parameter: averages/extremes (1 for hourly, 2 for daily, 3 for monthly)
second parameter: time span
third parameter: start
fourth parameter: end
"""

url = "http://meteo.astropa.unipa.it/public/"
br = RoboBrowser(parser="html.parser")


def query(avgtype, timespan, day, month, year, dayend, monthend, yearend):
    br.open(url)
    form = br.get_form()         # grabs the form from the web page
    form['avgtype'] = avgtype    # averages
    form['timespan'] = timespan  # time span
    form['day'] = day
    form['month'] = month        # start month
    form['year'] = year
    form['dayend'] = dayend
    form['monthend'] = monthend  # end month
    form['yearend'] = yearend
    br.submit_form(form)         # performs the POST
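# A usage sketch for query() above. The argument values are hypothetical and follow
# the parameter notes (1 = hourly averages); the field formats the site expects are
# an assumption here.
query(avgtype=1, timespan=1,
      day=1, month=1, year=2020,
      dayend=31, monthend=1, yearend=2020)
print(br.response.status_code)  # inspect the result of the POST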
def def_vs_scraper(credentials, bucket_name, obj_path, years=default_years, weeks=default_weeks):
    client = boto3.client('s3')
    browser = RoboBrowser()
    browser.open(login_url)
    login_form = browser.get_forms()[0]

    # Set login credentials
    login_form['ctl00$Body$EmailTextbox'].value = credentials['email']
    login_form['ctl00$Body$PasswordTextbox'].value = credentials['password']
    login_form.serialize()

    # Submit login form
    browser.submit_form(login_form)

    # Open the previously hidden page
    for yearIdx, year in enumerate(years):
        year_dict = years[yearIdx]
        year_key = list(year_dict.keys())[0]
        sn = year_dict[year_key]
        for week in weeks:
            for position_ranking in default_position_rankings:
                w = week
                ew = week
                pts_vs_url = 'https://fantasydata.com/nfl-stats/nfl-fantasy-football-points-allowed-defense-by-position.aspx?fs={}&stype=0&sn={}&scope={}&w={}&ew={}&s=&t=0&p=0&st={}&d=1&ls={}&live=false&pid=true&minsnaps=4'.format(
                    fs, sn, scope, w, ew, position_ranking['url'], position_ranking['url'])

                # Delay before retrieving next set of data
                time.sleep(0.5)
                browser.open(pts_vs_url)
                content = browser.find_all('tr')

                # Initialize the data to be written to the file
                formatted_data = ''
                for idx, line in enumerate(content):
                    # Only add the header once per year
                    if idx == 0 and week == 0:
                        formatted_data = headers + '\n'
                    elif idx != 0:
                        parsed_data = ','.join(line.find_all(text=True))
                        stripped_line = parsed_data.strip('\n').strip(',')
                        year_value = str(list(year.keys())[0])
                        next_line = stripped_line + ',' + year_value + '\n'
                        formatted_data = formatted_data + next_line

                # Make the directory for each year of CSV Data
                file_path = '{}/{}/{}/{}.csv'.format(obj_path, year_key, week + 1, position_ranking['file'])
                try:
                    # Upload object to the S3 bucket
                    client.put_object(Bucket=bucket_name, Body=formatted_data, Key=file_path)
                except RuntimeError as err:
                    print('Failed to write to file: ', err)
                    raise err
                print('Success! Uploaded data: {}'.format(file_path))
from robobrowser import RoboBrowser

import login   # local module
import parser  # local module

if __name__ == '__main__':
    shared_browser = RoboBrowser(parser='html.parser', timeout=10)
    login_gui = login.Login(shared_browser)
    user_data, is_logged_in = login_gui.user_data, login_gui.is_logged_in
    del login_gui  # free memory reserved by the login gui
    if is_logged_in:
        parser_gui = parser.Parser(shared_browser, user_data)