def get_location(id):
    HEADERS_LIST = [
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
        'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
        'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
    ]
    session = requests.Session()
    browser = RoboBrowser(session=session, user_agent=random.choice(HEADERS_LIST), parser='lxml')
    first_url = "https://twitter.com/intent/user?user_id=" + str(id)
    browser.open(first_url)
    results = browser.find_all("span", {"class": "nickname"})
    # find_all always returns a list; a plain truthiness check replaces the
    # original `is not 0`, which tested identity rather than value
    if results:
        handle = " ".join(str(results[0].text).split())
        url = "https://twitter.com/" + handle
        browser.open(url)
        results = browser.find_all("span", {"class": "ProfileHeaderCard-locationText u-dir"})
        if results:
            return " ".join(str(results[0].text).split())
    return None
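# Minimal, hypothetical usage sketch for get_location (assumes `requests`, `random` and
# `RoboBrowser` are imported as above, and that the legacy twitter.com/intent/user and
# profile pages are still served with these span classes):
if __name__ == "__main__":
    location = get_location(783214)  # 783214 is the id of the official @Twitter account
    print(location if location else "no public location found")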
def scrape_cosmo_exam(url, email, password):
    browser = RoboBrowser()
    browser.open(url)  # the original opened an undefined `tlink`; use the url argument
    search = browser.get_form()
    search['user[email]'] = email
    search['user[password]'] = password
    browser.submit_form(search, submit=search.submit_fields['commit'])

    # main page
    browser.follow_link(browser.find_all('a')[2])

    # follow the "Announcements" link
    all_links = browser.find_all('a')
    announcements_key = list(filter(lambda x: 'Announcements' in x.text, all_links))[0]
    announcement_ind = all_links.index(announcements_key)
    browser.follow_link(browser.find_all('a')[announcement_ind])

    # helper function 2: pull the child at position `ind` out of every <h2> title tag
    def date_extract(ind):
        return list(mapper(lambda x: list(x.children)[ind], browser.find_all('h2')))

    # helper function 3: keep only entries mentioning any of the given keywords
    def matcher(lst, *matches):
        matches = matches[0] if matches else ['exam', 'reminder']
        return filterer(lambda x: any(string.lower() in str(x).lower() for string in matches), lst)

    # obtaining title objects - tags
    titles = date_extract(1)
    return titles
class Question(object):
    """Zhihu parser, question obj"""

    def __init__(self, page_url):
        self.url = page_url
        self.browser = RoboBrowser(history=True, user_agent='nemo1')
        self.browser.open(self.url)

    def get_answer_count(self):
        if self.browser.find("h3", id="zh-question-answer-num") is not None:
            return int(self.browser.find("h3", id="zh-question-answer-num")["data-num"])

    def get_all_answer_url_list(self):
        results = []
        # get_answer_count() may return None when the header is missing; treat that as 0
        count = self.get_answer_count() or 0
        if count <= 10:
            for answer_div in self.browser.find_all("div", class_="zm-item-answer zm-item-expanded"):
                results.append(URL_PREFIX + answer_div.find("link")["href"])
        else:
            # Answers are paged 10 at a time: page 0 is already in the DOM, the rest
            # come from the QuestionAnswerListV2 endpoint. Use floor division so
            # range() gets an int under Python 3.
            for i in range(0, (count // 10) + 1):
                offset = i * 10
                if i == 0:
                    for answer_div in self.browser.find_all("div", class_="zm-item-answer zm-item-expanded"):
                        results.append(URL_PREFIX + answer_div.find("link")["href"])
                else:
                    post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
                    _xsrf = self.browser.find("input", attrs={'name': '_xsrf'})["value"]
                    params = json.dumps({
                        "url_token": int(self.url[-8:]),  # question id: last 8 characters of the url
                        "pagesize": 10,
                        "offset": offset
                    })
                    data = {'_xsrf': _xsrf, 'method': "next", 'params': params}
                    header = {
                        'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
                        'Host': "www.zhihu.com",
                        'Referer': self.url
                    }
                    r = requests.post(post_url, data=data, headers=header, verify=False)
                    answers = r.json()["msg"]
                    for ans in answers:
                        soup = BeautifulSoup(ans, 'html.parser')
                        results.append(URL_PREFIX + soup.find("link")["href"])
        return results
class Downloader(): def __init__(self, proxy=None, worker_num=0): self.worker_num = worker_num session = Session() if proxy is not None: session.proxies = {'http': proxy, 'https': proxy} self.browser = RoboBrowser(history=True, parser='html.parser', session=session) def get_download_link(self, book_url): self.browser.open(book_url) for link in self.browser.find_all("a"): if "download.php?t=1" in str(link): return f"https://www.lectulandia.cc{link['href']}" def download_book(self, download_url): self.browser.open(download_url) pattern = re.compile("var linkCode = \"(.*?)\";") section = pattern.findall(str(self.browser.parsed)) bee_url = f'https://www.beeupload.net/file/{section[0]}' self.browser.open(bee_url) try: filename = self.browser.find( "div", id="fileDescription").find_all("p")[1].text.replace( "Name: ", "") size = self.browser.find( "div", id="fileDescription").find_all("p")[2].text file_url = self.browser.find("a", id="downloadB") time.sleep(2) self.browser.follow_link(file_url) with open(f"books/{filename}", "wb") as epub_file: epub_file.write(self.browser.response.content) return filename, size except: print(self.browser.parsed) def get_book_page_list(self, page): self.browser.open(f'https://www.lectulandia.cc/book/page/{page}/') return [ f"https://www.lectulandia.cc{book['href']}" for book in self.browser.find_all("a", class_="card-click-target") ] def download_full_page(self, page): print(f"Downloading page: {page} ") books = self.get_book_page_list(page) for book in books: time.sleep(2) download_url = self.get_download_link(book) print(f"Worker: {self.worker_num} on page: {page}", self.download_book(download_url))
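# Hypothetical usage sketch for Downloader (assumes the lectulandia.cc markup targeted
# above is still current and that a local "books/" directory exists for download_book
# to write into):
if __name__ == "__main__":
    downloader = Downloader(proxy=None, worker_num=0)
    book_urls = downloader.get_book_page_list(1)  # first index page of books
    if book_urls:
        download_url = downloader.get_download_link(book_urls[0])
        print(downloader.download_book(download_url))  # -> (filename, size) on success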
def scrape_cs2040s(url, email, password): browser = RoboBrowser(parser='html.parser') browser.open(url) search = browser.get_form() search['user[email]'] = str(email) search['user[password]'] = str(password) browser.submit_form(search, submit=search.submit_fields['commit']) # main page browser.follow_link(browser.find_all('a')[2]) # missions browser.follow_link(browser.find_all('a')[11]) # find names reduced = filterer(lambda x: len(list(x.children)) >= 1, browser.find_all('th')) reduced = filterer(lambda x: 'colspan' in x.attrs, reduced) # unsure of object structure so convert to list type and assess last element names = mapper(lambda x: list(list(x.children)[-1])[-1], reduced) # find deadlines deadlines_tags = list( filter(lambda x: x['class'] == ['table-end-at'], browser.find_all('td'))) deadlines = list( map(lambda x: (list(x))[0] if list(x) else 'not yet', deadlines_tags)) curr_yr = datetime.now().year #returns a list of datetime objects dates = mapper( lambda x: str(datetime.strptime(f"{curr_yr} {x}", '%Y %d %b %H:%M')) if x != 'not yet' else 'Not yet', deadlines) array = [] for n, d in zip(names, dates): dic1 = {} dic1['title'] = n dic1['datetime'] = d array.append(dic1) dic = {} dic['data'] = array #scrape exam details with open( '/Users/sherrywu1999/Desktop/untitled/callie/python/deadlines/data.json', 'w') as json_file: json.dump(dic, json_file)
def getData(self): # temp sqlString = "DELETE FROM coin_toomics WHERE date='%s'" % (self.todayString) self.dbconn.cur.execute(sqlString, ) # temp browser = RoboBrowser(history=True, user_agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/33.0.1750.152 Chrome/33.0.1750.152 Safari/537.36') # toomics login ---------------------------------------------------------------- start auth_url = 'http://m.toomics.com/auth/layer_login' data= {'user_id': 'userid value', 'user_pw': 'password value', 'iSaveUserId': 'true', 'iKeepCookie': 'true', 'returnUrl': '', 'direction': '' } browser.open(auth_url, method='post', data=data) # # toomics login ---------------------------------------------------------------- end url_content = 'http://m.toomics.com/mypage/charge' browser.open(url_content) self.coinTopElements = browser.find_all('ul', class_='list-charge') self.product_title_list, self.product_price_list = [], [] for idx, cel in enumerate(self.coinTopElements): # print cel for pt in cel.find_all('div', class_='coin-item'): print pt.get_text() self.product_title_list.append(pt.get_text()) for pp in cel.find_all('span', class_='price'): print pp.get_text() self.product_price_list.append(pp.get_text())
def processPage(pageVal): ageValues = [] noAgeCount = 0 usersProcessed = 0 browser = RoboBrowser(parser='html.parser') browser.open(baseURL + str(pageVal) + options) #Find all the links on the page, if the href in a link has member.php in it then #it's a member link, so pull out the uid from it and add that uid to the list userIDs = [ link['href'].split('u=')[1] for link in browser.find_all('a', href=True) if 'member.php?' in link['href'] ] usersProcessed = len(userIDs) for user in userIDs: age = getUserAge(user) time.sleep(1) noAgeCount += age is None if age is not None: ageValues.append(age) return [ageValues, noAgeCount, usersProcessed]
def GetTranslations(wordToFind):
    print("\nGet translations for " + wordToFind)

    # Browser
    browser = RoboBrowser(history=True, parser="html5lib")

    # Open the Ord.se search page for the word
    print("Connecting to Ord.se ...")
    browser.open('http://www.ord.se/oversattning/engelska/?s=' + wordToFind + '&l=SVEENG')

    definitions = browser.find_all(class_="search-result-word-wrapper")
    Translations = []

    for definition in definitions:
        wordClasses = definition.find_all(class_="word-class")
        # Skip definitions without a word class (the original removed them from the list
        # while iterating over it, which silently skips the next element)
        if not wordClasses:
            continue
        wordClass = wordClasses[-1].getText()

        actualSearchWord = definition.find(class_="search-result-head-word").getText()
        actualSearchWord = actualSearchWord.rstrip()

        htmlDefinitions = definition.find_all(class_="normal font1 readable")
        translations = [translation.getText() for translation in htmlDefinitions]

        Translations.append(Translation(actualSearchWord, wordClass, translations))

    return Translations
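# Hypothetical usage sketch for GetTranslations (the Translation class comes from the
# surrounding script; dumping __dict__ avoids assuming its attribute names):
if __name__ == "__main__":
    for entry in GetTranslations("hund"):
        print(entry.__dict__)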
def HentaiMS(keyword, PageNum): all_comic_links = [] base_URL = 'http://search.hentai.ms/?tag=' + keyword + '&num=' + PageNum + '8&related=&pages=&box=14&es=14' print(base_URL) browser = RoboBrowser(history=True, parser='html.parser', user_agent='Chrome/41.0.2228.0') browser.open(base_URL) td = browser.find_all('td', {'id': 'search_gallery_item'}) print(len(td)) for line in td: this_link = line.find_all('a', href=True) try: if 'tags' not in this_link[1]['href']: print(this_link[1]['href']) all_comic_links.append(this_link[1]['href']) except: pass return all_comic_links
def get_source_code(commitId, project): import random import requests from robobrowser import RoboBrowser HEADERS_LIST = [ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13', 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201', 'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16', 'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre' ] link = [] session = requests.Session() browser = RoboBrowser(session=session, user_agent=random.choice(HEADERS_LIST), parser="lxml") url = "https://github.com/" + project.replace("-", "/") + "/commit/" + commitId browser.open(url + "?diff=unified") results = browser.find_all("a") for item in results: if ".java" in str(item): second_url = "https://raw.githubusercontent.com/" + project.replace( "-", "/") + "/" + commitId + "/" + item.string browser.open(second_url) return browser.find().text
def FenHen(keyword): result_urls = [] base_URL = 'http://fenhentai.blogspot.co.uk/search?q=' + keyword browser = RoboBrowser(history=True, parser='html.parser', user_agent='Chrome/41.0.2228.0') while True: browser.open(base_URL) post_body_list = browser.find_all('div', {'class': 'post-body entry-content'}) for post in post_body_list: this_image = post.find('img', src=True) print(this_image['src']) result_urls.append(this_image['src']) Next_Post_Link = browser.find('a', {'class': 'blog-pager-older-link'}, href=True) if (Next_Post_Link == None): break else: base_URL = Next_Post_Link['href'] return result_urls
def desktop(keyword, sitename, useragent): parser = 'html.parser' browser = RoboBrowser(history=False, user_agent=useragent, parser=parser) browser.open('https://www.google.com/search?num=100&q=' + keyword) # links = browser.find_all("div", {"class": "KJDcUb"}) #desktop div where URLs are links = browser.find_all("div", {"class": "g"}) counter = 0 print('The user Agent you used was ----> ' + useragent) d = [] for i in links: counter = counter + 1 if sitename in str(i): url = i.find_all('a', href=True) position = "%d" % (counter) rank = "%s" % (url[0]['href']) now = datetime.date.today().strftime("%d-%m-%Y") keyword = keyword d.append(keyword) d.append(position) d.append(rank) d.append(now) print(keyword, position, rank, now) csv_export(d)
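# Hypothetical usage sketch for the desktop() rank checker (assumes csv_export from the
# surrounding script is defined; the user agent string here is just an example value):
if __name__ == "__main__":
    ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36')
    desktop('robobrowser tutorial', 'github.com', ua)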
def get_residences(): """ Gets raw residence data from Columbia housing website, standardizes and cleans the data, and uploads to the database """ browser = RoboBrowser() residences_list = [] # makes list of links to each residence hall page browser.open(home_url) table_headers = browser.find_all(class_='views-field-title')[1:] residence_links = list(map(lambda x: x.find('a')['href'], table_headers)) for link in residence_links: browser.open(base_url + link) residence_json = parse_residence_info(browser) if residence_json: if type(residence_json) == list: residences_list.extend(residence_json) else: residences_list.append(residence_json) if not os.path.isfile('app/data.sqlite'): print("Creating database") db.create_all() collate_data(residences_list) # here just for data analysis print("Uploading residences to database") upload_residences_to_db(residences_list)
def get_residences(): """ Gets raw residence data from Columbia housing website, standardizes and cleans the data, and uploads to the database """ browser = RoboBrowser() residences_list = [] # makes list of links to each residence hall page browser.open(HOME_URL) table_headers = browser.find_all(class_='views-field-title')[1:] residence_links = list(map(lambda x: x.find('a')['href'], table_headers)) for link in residence_links: browser.open(BASE_URL + link) residence_json = parse_residence_info(browser) residences_list.extend(residence_json) if not os.path.isfile('app/data.sqlite'): print("Creating database") db.create_all() # uncomment to see collected data # collate_data(residences_list) print("Uploading residences to database") for res in residences_list: upload_object_to_db(Residence, res)
def ExtractONEPAGE(page): final_res = '' browser = RoboBrowser(history=True, parser='html.parser', user_agent='Chrome/41.0.2228.0') while True: print('loop') browser.open( 'http://tools.prowebguru.com/free-online-image-extractor/free_online_image_extractor_tool.php' ) form = browser.get_forms({'class': 'form-horizontal'}) if len(form) != 0: print('broke') break this_form = form[0] this_form["website"] = page browser.submit_form(this_form) img_links = browser.find_all('img', src=True) for line in img_links: if '/tbn/' not in line['src'] and '.wp.com' in line['src']: final_res = line['src'] print(final_res) if final_res != '': with open('HenRUniqueComic.txt', 'a') as f: f.write(final_res + '\n')
def GetWordForms(word, wordClass): print("\nGetting the forms of " + word + " (" + wordClass + ")") #Convert the ord class to wiktionary table class form wordClassDict = {'SUBSTANTIV':'subst', 'VERB':'verb', 'TRANSITIVT VERB':'verb', 'INTRANSITIVT VERB':'verb', 'INTRANSITIVT DEPONENSVERB':'verb', 'ADVERB':'adverb', 'ADJEKTIV':'adj'} browser = RoboBrowser(history=True, parser="html5lib") browser.open('http://sv.wiktionary.org/wiki/'+word) tableClass = "template-sv-" + wordClassDict[wordClass.upper()] #wordFormTable = browser.find_all(class_=re.compile(r"grammar\s+")) wordFormTable = browser.find_all(class_=re.compile(tableClass)) forms = [word] if wordFormTable: forms = [] for table in wordFormTable: tableheader = table.find("tbody") tableSiblings = tableheader.find_all(class_=re.compile("b-")) for sibling in tableSiblings: if not sibling.getText() in forms and sibling.getText().isalpha(): forms.append(sibling.getText()) formIndex = 1 print ("Choose the forms to use; ") for form in forms: print(str(formIndex) + ". " + form) formIndex = formIndex + 1 keepIndexes = list(map(int, input().split())) if len(keepIndexes) > 0: finalForms = [] for index in keepIndexes: finalForms.append(forms[index-1]) forms = finalForms return forms
def praca_shopping(): from robobrowser import RoboBrowser browser = RoboBrowser(parser="html.parser") not_finded = 0 n = 0 names = set() while not_finded < 20: # print(f'Página {n}') finded = False url = f"http://www.pracauberabashopping.com.br/filtro_loja_tipo.asp?tipo=vlojas.asp?busca1={n}" browser.open(url) item = browser.find("strong") if item: name = item.text if name != "Busca sem resultado.": names.add(fixed(name)) finded = True else: items = browser.find_all("a") if len(items) > 1: for item in items[1:]: if item.text != "Resultado da Busca": names.add(fixed(item.text)) finded = True if not finded: not_finded += 1 n += 1 return names
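# Hypothetical usage sketch for praca_shopping (assumes fixed() from the surrounding
# script normalizes the store names it collects):
if __name__ == "__main__":
    stores = praca_shopping()
    print("%d stores found" % len(stores))
    for store in sorted(stores):
        print(store)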
def get_kicktipp_content(browser: RoboBrowser):
    """ Get the content view area from the kicktipp page. """
    content = browser.find_all(id='kicktipp-content')
    # find_all returns a (possibly empty) list; indexing it unguarded would raise IndexError
    if content:
        return content[0]
    return None
def get_bracket_data(year): url = 'http://espn.go.com/mens-college-basketball/tournament/bracket/_/id/{}22/'.format(year) b = RoboBrowser() b.open(url) data = [] for item in b.find_all(attrs={'class': 'match'}): t1, t2 = [(get_id(a['href']), a['title']) for a in item('a')] s1, s2 = ' '.join(item.find('dd').stripped_strings).split() data.append([t1, t2, s1, s2]) return data
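# Sketch of persisting get_bracket_data output to CSV (hypothetical helper and file name;
# assumes get_id and the ESPN bracket markup used above still resolve):
import csv

def save_bracket(year, path='bracket.csv'):
    rows = get_bracket_data(year)
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['id1', 'team1', 'id2', 'team2', 's1', 's2'])
        for t1, t2, s1, s2 in rows:
            # each t is an (espn_id, team_name) pair built by get_bracket_data
            writer.writerow([t1[0], t1[1], t2[0], t2[1], s1, s2])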
def FetchImagesForOneComic(BASE_URL): with open('LUSCIO_One_Comic.txt', 'w') as f: print(f) Split_URL = BASE_URL.split('/') Craft_Input_Url = 'https://luscious.net/c/hentai_manga/pictures/album/' + Split_URL[ 4] + '/page/' print(Craft_Input_Url) browser = RoboBrowser(history=True, parser='html.parser', user_agent='Chrome/41.0.2228.0') jobs = [] counter = 1 while True: index = 0 print('Searching for number of pages') Working_URL = Craft_Input_Url + str(counter) + '/' print(Working_URL) browser.open(Working_URL) Thumbnail_Container = browser.find_all( 'div', {'class': 'item thumbnail ic_container'}) if (len(Thumbnail_Container) == 0): break for item in Thumbnail_Container: img_link = item.find('a', href=True)['href'] p = multiprocessing.Process(target=MProcess, args=( index, counter, img_link, )) index += 1 jobs.append(p) p.start() counter += 1 for proc in jobs: proc.join() with open('LUSCIO_One_Comic.txt', 'r') as f: data = f.read().splitlines() result = (sorted(data, key=ExtractNumberFromURLforLuscio)) return result
def get_data(): url = 'https://domo.ayy.fi/customers/sign_in' br = RoboBrowser() br.open(url) form = br.get_form() form['customer[email]'].value = config.email form['customer[password]'].value = config.password br.submit_form(form) data = br.find_all("li") #remove 2 first li elements return data[2:]
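# Hypothetical usage sketch for get_data (assumes config.email / config.password are set
# and that the <li> elements returned above contain plain-text rows worth printing):
if __name__ == "__main__":
    for item in get_data():
        print(item.get_text(strip=True))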
def gatherData(user, password): baseURL = 'https://sigarra.up.pt/feup/pt/' browser = RoboBrowser(history=True, parser='html.parser') browser.open(baseURL + 'web_page.Inicial') # Gets the login form form = browser.get_form(action=re.compile(r'validacao')) # Updates the login form with the user credentials form['p_user'].value = 'up' + user form['p_pass'].value = password browser.submit_form(form) # Goes to the user profile browser.open(baseURL + 'fest_geral.cursos_list?pv_num_unico=' + user) # Opens the extended view extended = browser.find(title='Visualizar informações no contexto do curso') browser.follow_link(extended) credits = [] grades = [] # For each html class containing grades ("i", "p" and "o"), gather data for i in browser.find_all(class_='i'): if i.find(class_='n aprovado'): credits.append(i.find(class_='k n').text) grades.append(i.find(class_='n aprovado').text) for j in browser.find_all(class_='p'): if j.find(class_='n aprovado'): credits.append(j.find(class_='k n').text) grades.append(j.find(class_='n aprovado').text) for k in browser.find_all(class_='o'): if k.find(class_='n aprovado'): credits.append(k.find(class_='k n').text) grades.append(k.find(class_='n aprovado').text) return credits, grades
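# Sketch of a credit-weighted grade average on top of gatherData (hypothetical helper;
# assumes the scraped credit/grade strings are numeric -- if the site uses comma decimals,
# replace ',' with '.' before converting):
def weighted_average(credits, grades):
    pairs = [(float(c), float(g)) for c, g in zip(credits, grades)]
    total_credits = sum(c for c, _ in pairs)
    # avoid division by zero when no approved courses were found
    if total_credits == 0:
        return None
    return sum(c * g for c, g in pairs) / total_credits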
class BadooApi: def __init__(self): self.browser = RoboBrowser(history=True, parser='html.parser') self.browser.open( BASE_URL.format("/es/contactos/spain/zaragoza/zaragoza/")) def next_page(self): btns = self.browser.find_all(class_=re.compile( r".*btn.*btn--xsm.*btn--transparent.*js-pages.*")) try: print(BASE_URL.format(btns[1]['href'])) self.browser.open(BASE_URL.format(btns[1]['href'])) except: print(BASE_URL.format(btns[0]['href'])) self.browser.open(BASE_URL.format(btns[0]['href'])) def extract_users(self): profiles = [] for a in self.browser.find_all("a", {"rel": "profile-view"}): profiles.append({"url": a["href"], "name": a["title"]}) return profiles def get_public_profile(self, url): self.browser.open(BASE_URL.format(url)) photos = [] for img in self.browser.find_all( class_=re.compile(r'.*photo-list__img.*js-gallery-img.*')): photos.append(img['src']) info = self.browser.find("title").text.split("|") personal_info = info[0].split(",") name = personal_info[0] sex = personal_info[1] age = personal_info[2] location = info[1] return ({ "name": name, "sex": sex, "age": age, "location": location, "photos": photos })
def ExtractBigImage( url='http://pururin.us/view/32338/1/kimi-wa-kanojo-no-kanrika-ni-iru.html' ): browser = RoboBrowser(history=True, parser='html.parser', user_agent='Chrome/41.0.2228.0') browser.open(url) All_img = browser.find_all('img', src=True) for line in All_img: print(fix_bad_unicode(line))
def scrape_cosmo(url, email, password):
    browser = RoboBrowser()
    browser.open(url)  # the original opened an undefined `tlink`; use the url argument
    search = browser.get_form()
    search['user[email]'] = str(email)
    search['user[password]'] = str(password)
    browser.submit_form(search, submit=search.submit_fields['commit'])

    # main page
    browser.follow_link(browser.find_all('a')[2])
    # missions
    browser.follow_link(browser.find_all('a')[17])

    # find deadlines
    deadlines_tags = list(filter(lambda x: x['class'] == ['table-end-at'], browser.find_all('td')))
    deadlines = list(map(lambda x: (list(x))[0] if list(x) else 'not yet', deadlines_tags))
    curr_yr = datetime.now().year

    # returns a list of datetime objects
    return mapper(
        lambda x: datetime.strptime(f"{curr_yr} {x}", '%Y %d %b %H:%M') if x != 'not yet' else 'Not yet',
        deadlines)
async def fullwidth(arg1, *args): print('fullwidth') await torgo.send_typing(channel) query = arg1 for arg in args: query += ("+" + arg) print(query) browser = RoboBrowser() browser.open('http://qaz.wtf/u/convert.cgi?text=' + query) cells = browser.find_all('td') content = cells[5].text.strip() print(content) await torgo.say(str(content))
def follower(self, count): tc = requests.session() tc.verify = False tbrowser = RoboBrowser(session=tc) tbrowser.open('https://www.tumblr.com/tagged/trending-topics') links = tbrowser.find_all("a", {"class": "post_info_link"}) for link in links: try: self.client.post('user/follow', params={'url': link['href']}) print("following " + link['href'] + "On account: " + self.blog_url) except: print("boo")
def scrape_snotel_sites(url=None): if not url: url = "http://www.wcc.nrcs.usda.gov/nwcc/yearcount?network=sntl&counttype=statelist&state=" browser = RoboBrowser(parser="html5lib") browser.open(url) browser.response.raise_for_status() table = browser.find_all("table")[4] sites = [] # list of sites with name and code cols = [t.text.strip() for t in table.tr.find_all("th")] for row in table.find_all("tr"): if row.td and row.td.text.strip() == 'SNTL': items = [i.text.strip() for i in row.find_all("td")] sites.append(dict(zip(cols, items))) return sites
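# Hypothetical usage sketch for scrape_snotel_sites (the dict keys come straight from the
# scraped table header, so they depend on the NRCS page layout at the time of scraping):
if __name__ == "__main__":
    sites = scrape_snotel_sites()
    print("found %d SNOTEL sites" % len(sites))
    if sites:
        print(sorted(sites[0].keys()))  # inspect which columns were captured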
def get_people(self, link): session = requests.Session() people_list = [] browser = RoboBrowser(session=session, user_agent=random.choice(self.HEADERS_LIST), parser="lxml") url = "https://twitter.com" + link try: browser.open(url) results = browser.find_all("a", { "class": "account-group js-account-group js-action-profile js-user-profile-link js-nav"}) for link in results: people_list.append(str(link.get('href')).replace("/", "")) except: pass return people_list
def fetch_from_the_unicode_website(work_location): bs = RoboBrowser(history=True, parser="html.parser") bs.open( 'https://web.archive.org/web/20161205225113/http://unicode.org/emoji/charts/full-emoji-list.html' ) table_rows = bs.find_all("tr") print "Number of Rows %d" % len(table_rows) meta_data_dictionary = {} for row in table_rows[12:]: row_cols = row.find_all("td") if len(row_cols) == 0: continue unicode_name = row_cols[1].find("a").attrs["name"] meta_data_dictionary[unicode_name] = { 'actual_name': row_cols[16].contents[0], 'year_introduced': row_cols[17].contents[0][:4], 'key_words': map(lambda x: x.contents[0].encode('ascii', 'ignore'), row_cols[18].find_all("a")) } for i in range(2, len(row_cols)): images = row_cols[i].find_all("img") if len(images) == 0: continue img = images[0] base64_value = img.attrs['src'].split(",")[1] process_base_64_file( "%s/%s/%s.png" % (work_location, get_company_name(i), unicode_name), base64_value) # Write Cross Company MetaData to file. with open("%s/MetaDataInfo.json" % work_location, "w") as f_meta: f_meta.write(json.dumps(meta_data_dictionary, sort_keys=True, indent=4)) # Convert all Images to RGB .jpg. for company in company_names(): os.system("mogrify -flatten -format jpg %s/%s/*.png -quality 99" % (work_location, company)) os.system("rm %s/%s/*.png" % (work_location, company)) os.system("mogrify -colorspace sRGB -type truecolor %s/%s/*.jpg" % (work_location, company))
def get_hot_videos(Type): hot_videos = {} br = RoboBrowser(history=True, parser='lxml') for i in range(1, 4): url = 'http://91porn.com/v.php?category={}&viewtype=basic&page={}'.format( Type, i) br.open(url) # get every video's information videos = br.find_all('div', {'class': 'listchannel'}) # get their titles and urls videos_dict = dict([(i.find('a').find('img')['title'], i.find('a')['href']) for i in videos]) hot_videos.update(videos_dict) return hot_videos
def ituv_info_covid(self, folder="Ituverava", path=None): if folder not in os.listdir(path): os.mkdir(folder) browser = RoboBrowser(parser="html.parser") url = "http://www.ituverava.sp.gov.br/" browser.open(url) banner = browser.find_all(class_="slidehomecropimg1 wp-post-image")[0] img = browser.session.get(banner["src"]) filepath = f"{folder}/ituv_info_covid.png" if path: if path[-1] == "/": path = path[:-1] filepath = f"{path}/{filepath}" with open(filepath, "wb") as f: f.write(img.content)
def ExtractImageURL(url): browser = RoboBrowser(history=True, parser='html.parser', user_agent='Chrome/41.0.2228.0') browser.open(url) center = browser.find_all('center') nested_center = center[1].find('center') Img_SRC = nested_center.find('img', src=True) print(Img_SRC['src']) return Img_SRC['src']
def main(): args = docopt(__doc__, version="dailyprogrammer-dl v{}".format(__version__)) # Configure logging logLevel = logging.INFO #default if args['--verbose']: logLevel = logging.DEBUG elif args['--quiet']: logLevel = logging.ERROR logging.basicConfig(format='%(levelname)s: %(message)s', level=logLevel) logging.debug(args) # Process command line arguments challengeURL = args['<challengeurl>'] # Parse project page for title and description logging.info("Parsing daily challenge: {}".format(challengeURL)) browser = RoboBrowser() browser.session.headers['User-Agent'] = "dailyprogrammer-dl v{} by /u/zod77".format(__version__) browser.open(challengeURL) title = browser.find('a',class_='title').string description = browser.find_all('div',class_="md") description = description[1] descriptionHTML = "".join(str(t) for t in description.contents) # remove outer <div> projectName = generateProjectName(title) # Init project skeleton logging.info("Generating project") projectPath = os.path.abspath(projectName) os.mkdir(projectPath) # Write out project files pyTemplate = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"boilerplate.txt")) shutil.copy(pyTemplate, os.path.join(projectPath,"{}.py".format(projectName))) # Generate README.md h = html2text.HTML2Text() descriptionMD = h.handle(descriptionHTML) readme = os.path.join(projectPath,"README.md") with open(readme, "w") as f: f.write(descriptionMD) return
def getTorrents(search,limit=20): base = "http://thepiratebay.se/search/" opts = "/0/7/0" search = search.replace(' ','%20') url = base+search+opts browser = RoboBrowser() browser.open(url) rows = browser.find_all('tr') torrents = [] for row in rows: title = row.find('a',{'class':'detLink'}) if title: tor = {} tor.update( scrapeTitle(title) ) tor.update( scrapeDetails(row) ) torrents.append(tor) if len(torrents) >= limit: break return torrents
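# Hypothetical usage sketch for getTorrents (assumes scrapeTitle/scrapeDetails from the
# surrounding script populate the dict keys; here we just dump whatever was collected):
if __name__ == "__main__":
    for torrent in getTorrents("ubuntu iso", limit=5):
        print(torrent)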
#!/usr/bin/env python # -*- encoding: utf-8 -*- from robobrowser import RoboBrowser from requests import Session from fake_useragent import UserAgent import re url = 'http://www.baidu.com' ua = UserAgent() keyword = 'sp68' s = Session() br = RoboBrowser(session=s, history=True, user_agent=ua.chrome) br.parser = 'lxml' br.timeout = 1 br.open(url) form = br.get_form(action='/s') form['wd'].value = keyword br.submit_form(form) print br.url for link in br.find_all('a', href=re.compile("^http://www.baidu.com/baidu.php")): print link['href'] s.close()
def main(): """This loops through every account in accounts.csv. Appending all their orders into 1 local html. That html file uses css pulled from amazon.com so it looks the excat same, and all of the links work, except the ones that requre login. """ if not os.path.isfile("history.html"): makeHistoryFile() if not os.path.isfile("accounts.csv"): makeAccountFile() print "accounts.csv file made. Fill in email/passwords and run again." return 1 with open("accounts.csv", "rU") as csvFile: reader = csv.reader(csvFile) for row in reader: email = str(row[0]) password = str(row[1]) update = str(row[2]) if update.lower() == "true": # html5lib parser required for broken html on gameSplits s = requests.Session() s.headers[ "User-Agent" ] = "Mozilla (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7" browser = RoboBrowser(history=True, parser="html5lib", session=s) browser.open( "https://www.amazon.com/ap/signin?_encoding=UTF8&openid.assoc_handle=usflex&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.ns.pape=http%3A%2F%2Fspecs.openid.net%2Fextensions%2Fpape%2F1.0&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.com%2F%3Fref_%3Dnav_ya_signin" ) form_signIn = browser.get_forms()[0] form_signIn["email"] = email form_signIn["password"] = password browser.submit_form(form_signIn) browser.open( "https://www.amazon.com/gp/css/history/orders/view.html?orderFilter=year-%s&startAtIndex=1000" ) orders = browser.find_all(class_="a-box-group a-spacing-base order") with open(r"./history.html", "a+") as historyFile: historyFile.seek(0) storedOrderIds = [] tempOrder = "" storeLine = False print "Collected orders from history.html" for line in historyFile: if line == "<!-- Start Order -->\n": storeLine = True continue if line == "<!-- End Order -->\n": storedOrderIds.append(getOrderId(cStringIO.StringIO(tempOrder))) tempOrder = "" storeLine = False if storeLine: tempOrder += line print "Orders stored", len(storedOrderIds) print "Find/Adding new orders for", email for order in orders: orderId = getOrderId(cStringIO.StringIO(order.__str__())) if not orderId in storedOrderIds: print "adding order", orderId historyFile.write("\n<!-- Start Order -->\n") historyFile.write(getAccountHtml(email)) historyFile.write(order.__str__()) historyFile.write("\n<!-- End Order -->\n") print "Done"
# Submit form
browser.session.headers['Referer'] = url
signin_form.serialize()
browser.submit_form(signin_form)

url = 'https://bitbucket.org/dashboard/pullrequests?section=teams'
browser.open(url)

links = browser.select('tr.iterable-item')
for link in links:
    print "Repository: " + link.select('td.repo')[0].text.encode("utf-8").strip()
    print "User: " + link.select('td.user')[0].text.encode("utf-8").strip()
    print "Title: " + link.select('td.title')[0].select('a.execute')[0].text.encode("utf-8").strip()
    print "Updated " + link.select('td.date')[0].text.encode("utf-8").strip()
    print "\n----------------------"

# obtain links with beautifulSoup
links = browser.find_all('a')
for link in links:
    try:
        # print(link.get('href'))
        if not link['href'].startswith("https"):
            link['href'] = 'https://bitbucket.org' + link['href'].encode("utf-8").strip()
            # link['href'] = '/odigeoteam/frontend-html5'
        print link['href']
        # print link
        browser.follow_link(link)
        branches = browser.select('li.branches')
        if len(branches) > 0:
            print 'branches ' + branches[0].select('span.value')[0].text
        tags = browser.select('li.tags')
#coding: utf-8
import re
from robobrowser import RoboBrowser

url = 'http://itest.info/courses/2'

b = RoboBrowser(history=True)
b.open(url)

# every <a> on the page
all_links = b.find_all('a')
for link in all_links:
    print link.text

# every div with class "container" on the page
divs = b.find_all(class_='container')
print divs

# the limit parameter caps how many elements are returned
# first two <p> tags on the page
first_two_p = b.find_all('p', limit=2)
print first_two_p

# if the first argument is a list, the matching set for all of them is returned
# every meta and img tag on the page
print b.find_all(['meta', 'img'])
form["username"] = args.username form["password"] = args.password browser.session.headers['Referer'] = args.course browser.submit_form(form) # Get course name (no special characters) courseTitle = browser.find("title").text courseTitle = remove_prefix(courseTitle, 'Course Modules: ') courseTitle = "".join([x if x.isalnum() else "_" for x in courseTitle]) print('Course Url: ' + courseModulesUrl) print('Course Title: ' + courseTitle) print('Finding file links of type: ' + args.downloadOnly) # Make output dir outputDir = os.path.join('output/', courseTitle) make_path(outputDir) # Get modules links with lecture in title moduleLinks = browser.find_all("a", { "class" : "for-nvda" }) print('Found ' + str(len(moduleLinks)) + ' links, (not all will be valid)') # Process each lecture link for moduleLink in moduleLinks: print('Opening: ' + moduleLink['aria-label']) browser.follow_link(moduleLink) try: # Find link - containing words "download" downloadLinkRel = browser.find('a', href = re.compile(r'.*download*')) # If failed, find link - containing reference to file "****.XXX" if downloadLinkRel is None: downloadLinkRel = browser.find('a', href = re.compile(r'.*\.[a-z]{3,4}$')) fileNameWithExtension = downloadLinkRel.text.strip() # Check the link is the right filetype
class ESPN_Scrape: def __init__(self): self.logged_in = False self.espn_header = {'1/0': 'H/AB'} self.br = RoboBrowser(history=True) def loginToESPN(self, leagueID, year): if not self.logged_in: link = 'http://games.espn.go.com/flb/leagueoffice?leagueId=' + str(leagueID) + '&seasonId=' + str(year) self.br = RoboBrowser(history=True) self.br.open(link) try: form = self.br.get_form(action="https://r.espn.go.com/espn/memberservices/pc/login") username = input('ESPN Username: \n') password = input('ESPN Password: \n') form['username'].value = username form['password'].value = password self.br.submit_form(form) self.logged_in = True print('\nLogging In\n') except: print('\nLogin FailedS!\n') def is_number(self, s): try: float(s) return True except ValueError: return False def nameToBatPos(self, d): # BatPos = ['Catcher', 'First Base', 'Second Base', 'Third Base', 'Shortstop', 'Left Field', 'Center Field', 'Right Field', 'Designated Hitter'] s = d.text.format('ascii') name = self.getPlayerName(s) s = s[s.find(',') + 2:] pID = self.getPlayerID(d) team = s[:s.find('\xa0')] pos = s[s.find('\xa0') + 1:] posOut = self.getBatPositions(pos) return [pID, name, team] + posOut def nameToPlayer(self, d): s = d.text.format('ascii') name = self.getPlayerName(s) s = s[s.find(',') + 2:] pID = self.getPlayerID(d) team = self.getPlayerTeam(s) return [pID, name, team] def getPlayerName(self, s): return s[:s.find(',')] def getPlayerID(self, d): return d.find_all('a')[0]['playerid'] def getPlayerTeam(self, s): return s[:s.find('\xa0')] def getBatPositions(self, s): posOut = [None] * 9 if 'SSPD' in s: s = s.replace('SSPD', '') if '1B' in s: posOut[1] = 1 s = s.replace('1B', '') if '2B' in s: posOut[2] = 1 s = s.replace('2B', '') if '3B' in s: posOut[3] = 1 s = s.replace('3B', '') if 'SS' in s: posOut[4] = 1 s = s.replace('SS', '') if 'LF' in s: posOut[5] = 1 s = s.replace('LF', '') if 'CF' in s: posOut[6] = 1 s = s.replace('CF', '') if 'RF' in s: posOut[7] = 1 s = s.replace('RF', '') if 'DH' in s: posOut[8] = 1 s = s.replace('DH', '') if 'C' in s: posOut[0] = 1 s = s.replace('C', '') return posOut def splitHAB(self, s): hits = s[:s.find('/')] ab = s[s.find('/') + 1:] if self.is_number(hits): hits = float(hits) else: hits = 0 if self.is_number(ab): ab = float(ab) else: ab = 0 return [hits, ab] def nameToPitchPos(self, d): # ['Starting Pitcher', 'Relief Pitcher'] s = d.text.format('ascii') name = s[:s.find(',')] s = str(s[s.find(',') + 2:]) pID = d.find_all('a')[0]['playerid'] team = s[:s.find('\xa0')] pos = s[s.find('\xa0') + 1:] posOut = self.getPitchPositions(pos) return [pID, name, team] + posOut def getPitchPositions(self, s): posOut = [None] * 2 if 'SSPD' in s: s = s.replace('SSPD', '') if 'SP' in s: posOut[0] = 1 s = s.replace('SP', '') if 'RP' in s: posOut[1] = 1 s = s.replace('RP', '') return posOut def tableToBatters(self, table): Hitters = pd.DataFrame() rows = table.find_all('tr') rows = rows[2:] for r in rows: data = r.find_all('td') data = [data[0]] + data[8:20] row_data = [] for i, d in enumerate(data): if i == 0: row_data = self.nameToBatPos(d) elif '/' in d.text: row_data += self.splitHAB(d.text) else: if self.is_number(d.text): row_data.append(float(d.text)) else: row_data.append(0) Hitters = Hitters.append(pd.Series(row_data), ignore_index=True) return Hitters def tableToPitchers(self, table): Pitchers = pd.DataFrame() rows = table.find_all('tr') rows = rows[2:] for r in rows: data = r.find_all('td') data = [data[0]] + data[8:24] row_data = [] for i, d in enumerate(data): if i == 0: row_data = 
self.nameToPitchPos(d) else: if self.is_number(d.text): row_data.append(float(d.text)) else: row_data.append(0) Pitchers = Pitchers.append(pd.Series(row_data), ignore_index=True) return Pitchers def scrapePlayerProjections(self, leagueID, year): self.loginToESPN(leagueID, year) Hitters = pd.DataFrame() HitPos = ['Catcher', 'First Base', 'Second Base', 'Third Base', 'Shortstop', 'Left Field', 'Center Field', 'Right Field', 'Designated Hitter'] Pitchers = pd.DataFrame() PitchPos = ['Starting Pitcher', 'Relief Pitcher'] thead = [] index = 0 # get batter values self.br.open('http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str( year) + '&context=freeagency&view=stats&version=projections&startIndex=0&avail=-1&startIndex=' + str(index)) table = self.br.find_all('table', class_='playerTableTable tableBody')[0] rows = table.find_all('tr') # get the column headers header = rows[1] data = header.find_all('td') data = [data[0]] + data[8:20] for d in data: txt = d.text.replace('\xa0', '') thead.append(txt.format('ascii')) thead[0] = 'PlayerId' if 'H/AB' in thead: ind = thead.index('H/AB') thead[ind] = 'AB' # AB stored in ind+1 thead.insert(ind, 'H') # H stored in ind thead.insert(1, 'Team') thead.insert(1, 'Name') thead = thead[0:3] + HitPos + thead[3:] # get player projections while index < 250: self.br.open( 'http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str( year) + '&context=freeagency&view=stats&version=projections&avail=-1&startIndex=' + str(index)) table = self.br.find_all('table', class_='playerTableTable tableBody')[0] Hitters = Hitters.append(self.tableToBatters(table), ignore_index=True) index += 50 Hitters.columns = thead index = 0 # get Pitchers self.br.open('http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str( year) + '&context=freeagency&view=stats&version=projections&avail=-1&slotCategoryGroup=2&startIndex=' + str( index)) table = self.br.find_all('table', class_='playerTableTable tableBody')[0] rows = table.find_all('tr') # get the column headers thead = [] header = rows[1] data = header.find_all('td') data = [data[0]] + data[8:24] for d in data: txt = d.text.replace('\xa0', '') thead.append(txt.format('ascii')) thead[0] = 'PlayerId' thead.insert(1, 'Team') thead.insert(1, 'Name') thead = thead[0:3] + PitchPos + thead[3:] #get player projections while index < 250: self.br.open( 'http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str( year) + '&context=freeagency&view=stats&version=projections&avail=-1&slotCategoryGroup=2&startIndex=' + str( index)) table = self.br.find_all('table', class_='playerTableTable tableBody')[0] Pitchers = Pitchers.append(self.tableToPitchers(table), ignore_index=True) index += 50 Pitchers.columns = thead return Hitters, Pitchers def scrapePlayerSeason(self, leagueID, year): self.loginToESPN(leagueID, year) Hitters = pd.DataFrame() HitPos = ['Catcher', 'First Base', 'Second Base', 'Third Base', 'Shortstop', 'Left Field', 'Center Field', 'Right Field', 'Designated Hitter'] Pitchers = pd.DataFrame() PitchPos = ['Starting Pitcher', 'Relief Pitcher'] thead = [] index = 0 # get batter values self.br.open('http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str( year) + '&context=freeagency&view=stats&version=currSeason&startIndex=0&avail=-1&startIndex=' + str(index)) table = self.br.find_all('table', class_='playerTableTable tableBody')[0] rows = 
table.find_all('tr') # get the column headers header = rows[1] data = header.find_all('td') data = [data[0]] + data[8:20] for d in data: txt = d.text.replace('\xa0', '') thead.append(txt.format('ascii')) thead[0] = 'PlayerId' if 'H/AB' in thead: ind = thead.index('H/AB') thead[ind] = 'AB' # AB stored in ind+1 thead.insert(ind, 'H') # H stored in ind thead.insert(1, 'Team') thead.insert(1, 'Name') thead = thead[0:3] + HitPos + thead[3:] # get player projections while index < 250: self.br.open( 'http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str( year) + '&context=freeagency&view=stats&version=currSeason&avail=-1&startIndex=' + str(index)) table = self.br.find_all('table', class_='playerTableTable tableBody')[0] Hitters = Hitters.append(self.tableToBatters(table), ignore_index=True) index += 50 Hitters.columns = thead index = 0 # get Pitchers self.br.open('http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str( year) + '&context=freeagency&view=stats&version=currSeason&avail=-1&slotCategoryGroup=2&startIndex=' + str( index)) table = self.br.find_all('table', class_='playerTableTable tableBody')[0] rows = table.find_all('tr') # get the column headers thead = [] header = rows[1] data = header.find_all('td') data = [data[0]] + data[8:24] for d in data: txt = d.text.replace('\xa0', '') thead.append(txt.format('ascii')) thead[0] = 'PlayerId' thead.insert(1, 'Team') thead.insert(1, 'Name') thead = thead[0:3] + PitchPos + thead[3:] # get player projections while index < 250: self.br.open( 'http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str( year) + '&context=freeagency&view=stats&version=currSeason&avail=-1&slotCategoryGroup=2&startIndex=' + str( index)) table = self.br.find_all('table', class_='playerTableTable tableBody')[0] Pitchers = Pitchers.append(self.tableToPitchers(table), ignore_index=True) index += 50 Pitchers.columns = thead return Hitters, Pitchers def scrapeTeamPlayers(self, leagueID, year, teams): self.loginToESPN(leagueID, year) teamBatters = pd.DataFrame() teamPitchers = pd.DataFrame() urls = list(teams['Link']) for u in urls: self.br.open('http://games.espn.go.com' + u) teamId = teams[teams['Link'] == u].iloc[0]['teamId'] # batters Btable = self.br.find_all('table', class_='playerTableTable tableBody')[0] rows = Btable.find_all('tr') rows = rows[2:] for r in rows: d = r.find_all('td')[1] if d.find_all('a'): pID = int(self.getPlayerID(d)) teamBatters = teamBatters.append(pd.Series([teamId, pID]), ignore_index=True) # pitchers Ptable = self.br.find_all('table', class_="playerTableTable tableBody playerTableMoreTable")[0] rows = Ptable.find_all('tr') rows = rows[2:] for r in rows: d = r.find_all('td')[1] if d.find_all('a'): pID = int(self.getPlayerID(d)) teamPitchers = teamPitchers.append(pd.Series([teamId, pID]), ignore_index=True) teamBatters.columns = ['teamId', 'playerId'] teamPitchers.columns = ['teamId', 'playerId'] return teamBatters, teamPitchers # data frame containing all of te results for each weeks matchups # [weekID, gameID, teamID, H, R, 2B, 3B, HR, XBH, RBI, BB, SB, AVG, OBP, SLG, # K, QS, CG, SO, W, L, SV, HD, BAA, ERA, WHIP, K/9, Wins, Losses, Ties, H/A] def scrapeMatchupResults(self, leagueId, year): matchups = pd.DataFrame() week = self.currentWeek() weeks = [i for i in range(1, week + 1)] for w in weeks: matchups = matchups.append(self.scrapeMatchUpWeek(leagueId, year, w), ignore_index=True) return matchups # data frame containing 
all of the results for one weeks matchups # [weekID, gameID, teamID, H, R, 2B, 3B, HR, XBH, RBI, BB, SB, AVG, OBP, SLG, # K, QS, CG, SO, W, L, SV, HD, BAA, ERA, WHIP, K/9, Wins, Losses, Ties, H/A] def scrapeMatchUpWeek(self, leagueId, year, weekId): matchupWeek = pd.DataFrame() self.loginToESPN(leagueId, year) link = 'http://games.espn.go.com/flb/scoreboard?leagueId=' + str(leagueId) + '&seasonId=' + str( year) + '&matchupPeriodId=' + str(weekId) self.br.open(link) table = self.br.find_all('table', class_='tableBody') table = table[0] rows = table.find_all('tr') head = rows[1].find_all('th') header = [h.text for h in head] while '' in header: header.remove('') header = header[1:-1] header.insert(0, 'Name') header.insert(0, 'teamId') header.insert(0, 'gameId') header.insert(0, 'weekId') header.append('Wins') header.append('Losses') header.append('Ties') header.append('H/A') stats = rows[2:] count = 0 for r in stats: data_row = [] teamRow = r.find_all('td', class_='teamName') if teamRow: name = self.teamNameToRow(teamRow[0]) data = r.find_all('td') for d in data: if self.is_number(d.text): data_row.append(float(d.text)) score = self.scoreToList(data[-1].text) out = [weekId, 6 * (weekId - 1) + math.floor(count / 2)] + name[:2] + data_row + score + [count % 2] matchupWeek = matchupWeek.append(pd.Series(out), ignore_index=True) count += 1 matchupWeek.columns = header return matchupWeek def scoreToList(self, s): wins = float(s[:s.find('-')]) s = s[s.find('-') + 1:] losses = float(s[:s.find('-')]) ties = float(s[s.find('-') + 1:]) return [wins, losses, ties] # takes current date and find the current week def currentWeek(self): weekIds = pd.read_csv('Data/weekId.csv', index_col=0) now = datetime.datetime.now() weekEnds = list(weekIds['end']) for i, w in enumerate(weekEnds): dt = datetime.datetime.strptime(w, '%m/%d/%y') if dt > now: return i + 1 return i + 1 # data frame containing all of the matchups # [weekID, gameID, teamID, H/A] def scrapeLeagueSchedule(self, leagueId, year): schedule = pd.DataFrame() self.loginToESPN(leagueId, year) weekId = 0 gameId = 0 while weekId < 22: link = 'http://games.espn.go.com/flb/scoreboard?leagueId=' + str(leagueId) + '&seasonId=' + str( year) + '&matchupPeriodId=' + str(weekId) self.br.open(link) table = self.br.find_all('table', class_='tableBody') table = table[0] rows = table.find_all('tr') count = 0 for r in rows: data = r.find_all('td', class_='teamName') for d in data: name_row = self.teamNameToRow(d) homeAway = count % 2 schedule = schedule.append(pd.Series([weekId, gameId, name_row[0], homeAway]), ignore_index=True) count += 1 if count % 2 == 0: gameId += 1 weekId += 1 schedule.columns = ['weekId', 'gameId', 'teamId', 'H/A'] return schedule # return all matchups so far def scrapeMatchupPlayers(self, leagueId, year): batters = pd.DataFrame() pitchers = pd.DataFrame() week = self.currentWeek() - 1 weeks = [i for i in range(1, week + 1)] for w in weeks: B, P = self.scrapeMatchupPlayersWeek(leagueId, year, w) batters = batters.append(B, ignore_index=True) pitchers = pitchers.append(P, ignore_index=True) return batters, pitchers # data frame containing player results for each matchup # both hitters and pitchers and their catagories def scrapeMatchupPlayersWeek(self, leagueId, year, week): matchupBatters = pd.DataFrame() matchupPitchers = pd.DataFrame() link = 'http://games.espn.go.com/flb/scoreboard?leagueId=' + str(leagueId) + '&seasonId=' + str( year) + '&matchupPeriodId=' + str(week) base = 'http://games.espn.go.com' self.loginToESPN(leagueId, year) 
self.br.open(link) links = self.br.find_all('a') bscores = [] for l in links: if l.text == 'Full Box Score': bscores.append(base + l['href']) for bs in bscores: self.br.open(bs) tables = self.br.find_all('table', class_="playerTableTable tableBody") for i, t in enumerate(tables): if i % 2: # Pitchers matchupPitchers = matchupPitchers.append(self.scrapeMatchupPitchers(t), ignore_index=True) else: # Batters matchupBatters = matchupBatters.append(self.scrapeMatchupBatters(t), ignore_index=True) matchupBatters['weekId'] = week matchupPitchers['weekId'] = week return matchupBatters, matchupPitchers def scrapeMatchupBatters(self, table): batters = pd.DataFrame() rows = table.find_all('tr') head = rows[2].find_all('td') header = [h.text for h in head] header = header[2:] header[0] = 'PlayerId' if 'H/AB' in header: ind = header.index('H/AB') header[ind] = 'AB' # AB stored in ind+1 header.insert(ind, 'H') # H stored in ind header.insert(1, 'Team') header.insert(1, 'Name') rows = rows[3:-1] for r in rows: data_row = r.find_all('td') data_row = [data_row[0]] + data_row[3:] row_data = [] for i, d in enumerate(data_row): if i == 0: row_data = self.nameToPlayer(d) elif '/' in d.text: row_data += self.splitHAB(d.text) else: if self.is_number(d.text): row_data.append(float(d.text)) else: row_data.append(0) batters = batters.append(pd.Series(row_data), ignore_index=True) batters.columns = header return batters def scrapeMatchupPitchers(self, table): pitchers = pd.DataFrame() rows = table.find_all('tr') head = rows[1].find_all('td') header = [h.text for h in head] header = header[2:] header[0] = 'PlayerId' header.insert(1, 'Team') header.insert(1, 'Name') rows = rows[3:-1] for r in rows: data_row = r.find_all('td') data_row = [data_row[0]] + data_row[3:] row_data = [] for i, d in enumerate(data_row): if i == 0: row_data = self.nameToPlayer(d) else: if d.text == 'INF': row_data.append(13.5) elif self.is_number(d.text): row_data.append(float(d.text)) else: row_data.append(0) pitchers = pitchers.append(pd.Series(row_data), ignore_index=True) pitchers.columns = header return pitchers # returns data frame containing # [teamID, teamName, shortName, wins, losses, draws] def scrapeLeagueTeams(self, leagueId, year): self.loginToESPN(leagueId, year) # dataframe will have the following columns: # [teamID, teamName, wins, losses, draws] teams = pd.DataFrame() self.br.open('http://games.espn.go.com/flb/standings?leagueId=' + str(leagueId) + '&seasonId=' + str(year)) tables = self.br.find_all('table', class_='tableBody') tables = tables[:-1] for t in tables: row = t.find_all('tr')[2:] for r in row: data = r.find_all('td') name = data[0] name_row = self.teamNameToRow(name) wins = float(data[1].text) losses = float(data[2].text) draw = float(data[3].text) out = name_row + [wins, losses, draw] teams = teams.append(pd.Series(out), ignore_index=True) teams.columns = ['teamId', 'Name', 'Link', 'W', 'L', 'T'] return teams def teamNameToRow(self, name): link = name.find_all('a')[0]['href'] ID = link.split('&')[1] teamID = int(ID[ID.find('=') + 1:]) teamName = name.text if teamName.find(' (') != -1: teamName = teamName[:teamName.find(' (')] return [teamID, teamName, link] def scrapeTeamStats(self, leagueID, year): self.loginToESPN(leagueID, year) # dataframe will have the following columns: # [teamID, teamName, wins, losses, draws] teamStats = pd.DataFrame() self.br.open('http://games.espn.go.com/flb/standings?leagueId=' + str(leagueID) + '&seasonId=' + str(year)) tables = self.br.find_all('table', class_='tableBody') table = 
tables[-1] rows = table.find_all('tr') head = rows[2].find_all('td') header = [h.text for h in head] while '' in header: header.remove('') header.insert(0, 'Name') header.insert(0, 'teamId') stats = rows[3:] for r in stats: data_row = [] data = r.find_all('td') name = self.teamNameToRow(data[1]) data = data[2:-2] for d in data: if self.is_number(d.text): data_row.append(float(d.text)) out = name[:2] + data_row teamStats = teamStats.append(pd.Series(out), ignore_index=True) teamStats.columns = header return teamStats
challenge_count = 1
while True:  # loop forever
    browser = RoboBrowser(parser='lxml')
    browser.open(SITE_URL)

    # try/except this
    signin_form = browser.get_forms()[0]
    signin_form['login'].value = username
    signin_form['password'].value = password
    browser.submit_form(signin_form)

    # get the leaderboard list
    browser.follow_link(browser.get_link(text='Leaderboard'))
    bot_name_tags = browser.find_all('div', {'class': 'bot-name'})
    bot_name_extracter = lambda tag: tag.string.replace('\t', '').replace('\n', '').lower()
    bot_names = list(map(bot_name_extracter, bot_name_tags))  # list() so len() and index() work under Python 3
    no_bots = len(bot_names)
    our_rank = bot_names.index('cbteamname') + 1
    print("[INFO] CBTeamName is ranked " + str(our_rank))

    random.seed(os.urandom(8))
    opponent_queue = []
    # three bots with lower rank
    opponent_queue += [bot_names[random.randint(our_rank + 1, no_bots - 1)],
                       bot_names[random.randint(our_rank + 1, no_bots - 1)],
                       bot_names[random.randint(our_rank + 1, no_bots - 1)]]
    # one bot with a higher rank
    opponent_queue += [bot_names[random.randint(0, our_rank - 1)]]
br.open(url) fp = br.parsed #f0 = open('f1.html', 'w') #f0.write(str(fp)) #login form=br.get_form(id='mod_loginform') form['username'].value= 'pygather' form['passwd'].value= '1324354657687980' br.submit_form(form) sp = br.parsed #f2 = open('f2.html','w') #f2.write(str(sp)) #navigate to quick submit for a in br.find_all('a', href=True, text = re.compile('Quick Submit')): br.follow_link(a) tp = br.parsed form = br.get_form(action = re.compile('Itemid=25')) # print(form) #form.new_control('text','code',{'value':''}) #form.fixup() form['localid'].value=str(curProgram) form['language'].value='2' form['code'].value='import java.util.*;class Main{public static void main(String[]args) throws Exception{Scanner in = new Scanner(System.in);StringBuilder sb = new StringBuilder();while(in.hasNextLine()){sb.append(in.nextLine());}byte b=(byte)sb.charAt('+str(curByte)+');if((b>>'+str(shift)+'&0x01)==0){throw new Exception("Error");}}}' br.submit_form(form) #f3 = open('f3.html','w') #f3.write(str(tp))
# pip3 install robobrowser
from robobrowser import RoboBrowser

start_url = 'https://cdn.hackerrank.com/hackerrank/static/contests/capture-the-flag/infinite/qds.html'
browser = RoboBrowser()
browser.open(start_url)

history = {start_url}
font = browser.find_all('font')
links = browser.get_links()

# Breadth-first crawl: keep following unvisited links until at least 50000 <font> tags
# have been collected or there are no links left. The original mutated `links` while
# iterating over it and compared link tags against URL strings, so it never skipped
# visited pages; tracking hrefs in a set fixes both problems.
while len(font) < 50000 and links:
    link = links.pop(0)
    href = link.get('href')
    if href is None or href in history:
        continue
    history.add(href)
    browser.follow_link(link)
    font.extend(browser.find_all('font'))
    links.extend(browser.get_links())

print(font)
path = os.path.join(path, '暴走GIF')
if not os.path.exists(path):
    os.mkdir(path)  # create the folder

url = "http://baozoumanhua.com/gif/month/page/"  # base URL
headers = {  # pretend to be a regular browser
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/32.0.1700.76 Safari/537.36'
}
browser = RoboBrowser(history=True, user_agent='Mozilla/5.0 ... Safari/537.36')

for count in range(page_sum):
    try:
        browser.open(url + str(count + 1), method='get', headers=headers)
    except Exception:
        continue
    if browser.response.status_code != 200:  # `is not 200` checked identity, not equality
        continue
    else:
        img_content = browser.find_all('img', attrs={'style': 'width:460px'})
        url_list = [img['src'] for img in img_content]    # image URLs (list comprehension)
        title_list = [img['alt'] for img in img_content]  # image titles
        print("count:" + str(count))
        for i in range(url_list.__len__()):
            imgurl = url_list[i]
            filename = path.decode('utf-8') + os.sep.decode('utf-8') + title_list[i] + ".gif"
            print(filename + ":" + imgurl)        # log what is being downloaded
            urllib.urlretrieve(imgurl, filename)  # download the image
# Browser #br = mechanize.Browser() br = RoboBrowser(history=True, user_agent='Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2') # The site we will navigate into, handling it's session br.open('http://heroes-wow.com/wotlk/index.php?page=login') login_form = br.get_form(action="http://heroes-wow.com/wotlk/execute.php?take=login") login_form['username'].value = 'anathk2' login_form['password'].value = 'wow123456' login_form['rememberme'].value = '1' br.submit_form(login_form) br.open('http://topg.org/server-heroes-wow-id347987') links = br.find_all('a', href=True) br.follow_link(links[22]) result = br.parsed new_links = br.find_all('a', href=True) br.follow_link(new_links[1])
def parseWeek(year, week):
    """Parses a specific week at http://nflweather.com/week/{}/Week-{}

    Follows all detail links, which is where most of the data is scraped.
    Scrapes weather and stadium info per week, and stores them in their
    respective collections.
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_nflWeather/')

    startTime = datetime.now()
    logger.debug('Starting %d %d', year, week)

    weather_list = []
    stadium_list = []

    if col_weather_info.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://nflweather.com/week/{}/Week-{}".format(year, week))

    data = browser.find(class_="footable")
    rows = data.find_all('tr')

    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        weatherInfo = {'year': year, 'week': week}
        stadiumInfo = {'year': year, 'week': week}

        try:
            columns = row.find_all('td')
            if columns:
                weatherInfo['weatherPicAlt'] = columns[8].find('img')['alt']
                weatherInfo['weatherText'] = columns[9].text.strip()
                weatherInfo['shortWind'] = columns[10].text
                details = columns[12]
                detailsLink = 'http://nflweather.com' + details.find('a')['href']

                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)
                logger.debug('Following link')
                browser = open_or_follow_link(logger, browser, 'open', detailsLink)

                gameTime = browser.find('strong').text.split('-')[0].split(':', 1)[1].strip()
                awayTeam = browser.find_all(class_='g-away')[1].find('a').text.replace(' ', ' ').strip()
                homeTeam = browser.find_all(class_='g-home')[1].find('a').text.replace(' ', ' ').strip()
                spans = browser.find_all(class_='span5')
                if len(spans) != 2:
                    raise Exception('too many spans')
                weatherItems = spans[0].find_all('p')
                stadiumItems = spans[1].find_all('p')
                index = spans[0].text.find('Temperature:')
                weatherCondition = spans[0].text[:index].strip()

                for each in weatherItems:
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        weatherInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())

                for index, each in enumerate(stadiumItems):
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        if split[0] == 'Surface':
                            stadiumInfo['stadium'] = stadiumItems[index - 1].text.strip()
                        stadiumInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())

                # find nfl_schedule, update gameTime, hopefully get result as id,
                # insert id into both info dicts, append to the _list
                schedule_query = {'year': year, 'week': week, 'homeTeam': homeTeam, 'awayTeam': awayTeam}
                schedule_doc = col_schedule.find(schedule_query)
                if schedule_doc.count() != 1:
                    error_docs = str(schedule_query) + ' | ' + str(weatherInfo) + ' | ' + str(stadiumInfo)
                    raise Exception("nfl_schedule doc not found " + error_docs)
                result = col_schedule.update_one(schedule_query, {'$set': {'dateTime': gameTime}})
                schedule_id = schedule_doc[0]['_id']
                weatherInfo['schedule_id'] = schedule_id
                stadiumInfo['schedule_id'] = schedule_id
                weather_list.append(weatherInfo)
                stadium_list.append(stadiumInfo)
        except:
            logger.exception(row)

    try:
        logger.debug('Bulk Creating weather_list')
        col_weather_info.insert_many(weather_list)
        logger.debug('Bulk Creating stadium_list')
        col_stadium_info.insert_many(stadium_list)
    except:
        logger.exception('insert_many error')

    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(str(year) + '_' + str(week))
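# A minimal usage sketch, assuming the MongoDB collections (col_schedule,
# col_weather_info, col_stadium_info) and the helper functions referenced above
# are already configured; the year and week values are illustrative only.
if __name__ == '__main__':
    for week in range(1, 18):   # a full 17-week regular season
        parseWeek(2015, week)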
def spider(update=False, daysago=30, name=None, path=_DIR_WISEREP, include_type=[], force=False): start_time = time.time() incl_type_str = 'supernovae' if not include_type else '-'.join( include_type) if not os.path.exists(_PATH + path): os.mkdir(_PATH + path) # dig up lists of known non-supernovae and completed events, or create if # it does not exist if os.path.exists(_PATH + path + 'lists.json'): with open(_PATH + path + 'lists.json', 'r') as json_in: list_dict = json.load(json_in) else: list_dict = {'non_SN': [], 'completed': []} with open(_PATH + path + 'lists.json', 'w') as fp: json.dump(list_dict, fp, indent=4) # collect metadata for the few available host spectra and # build a dictionary that will be used below to # remove by SNname and "Spectrum Type" obj_host_dict = {} if daysago: browser = RoboBrowser(history=False, parser='lxml') browser.open(_WISEREP_SPECTRA_URL) form = browser.get_form(action='/spectra/list') form['spectypeid'] = "2" # 2 for Host spectrum form['rowslimit'] = "10000" browser.submit_form(form) print('\tHost page received') obj_host_headers = (browser.find("tr", {"style": "font-weight:bold"}) .findChildren("td")) for i, header in enumerate(obj_host_headers): # if header.text == 'Obj. Name': # host_obj_name_idx = i if header.text == 'Spec.Program': host_program_idx = i if header.text == 'Instrument': host_instrument_idx = i if header.text == 'Observer': host_observer_idx = i if header.text == 'Obs. Date': host_obsdate_idx = i if header.text == 'Reducer': host_reducer_idx = i if header.text == 'Ascii FileFits File': host_filename_idx = i obj_host_list = browser.find_all( "a", {"title": "Click to show/update object"}) for i, obj in enumerate(obj_host_list): print('\tParsing', i + 1, 'of', len(obj_host_list), 'host spectra') obj_name = obj.text host_children = obj.parent.parent.findChildren("td") host_program = host_children[host_program_idx].text host_instrument = host_children[host_instrument_idx].text host_observer = host_children[host_observer_idx].text host_obsdate = host_children[host_obsdate_idx].text host_reducer = host_children[host_reducer_idx].text host_filename = host_children[host_filename_idx].text host_filename = host_filename.strip().split('\n')[0] obj_host_dict[obj_name] = OrderedDict([ ("Type", "Host spectrum"), ("Filename", host_filename), ("Obs. 
Date", host_obsdate), ("Program", host_program), ("Instrument", host_instrument), ("Observer", host_observer), ("Reducer", host_reducer), ]) # begin scraping WISeREP OBJECTS page for supernovae browser = RoboBrowser(history=False, parser='lxml') browser.open(_WISEREP_OBJECTS_URL) form = browser.get_form(action='/objects/list') # ready search form with field entries to submit, depending on --update if browser and update: if daysago: daysstr = str(daysago) # set "Added within the last args.daysago days" print('Collecting new spectra from the last', daysstr, 'days') form['daysago'] = daysstr if name: form['name'] = name form['rowslimit'] = "10000" browser.submit_form(form) try: new_objs = (browser.find("tr", {"style": "font-weight:bold"}) .parent.findChildren("tr", {"valign": "top"})) except AttributeError: if daysago: print('Nothing to collect since ' + daysstr + ' days ago') else: print('Nothing to collect!') return new_objs = (browser.find("tr", {"style": "font-weight:bold"}) .parent.findChildren("tr", {"valign": "top"})) SN_list_tags = [] for obj in new_objs: obj_name_tag = obj.find("a", {"title": "Click to show/update"}) SN_list_tags.append(obj_name_tag) SN_list_tags = [i for i in SN_list_tags if i is not None] elif browser and not update: # grab object name list, and remove `Select Option' from list [1:] print('Grabbing list of events from WISeREP') SN_list_tags = browser.find("select", {"name": "objid"}).find_all("option")[1:] # Begin by selecting event, visiting page, and scraping. # SN_list = ['SN2009ip'] # for item in SN_list: for item in SN_list_tags: SNname = item.get_text() # SNname = item if not force and SNname in list_dict['non_SN']: print(SNname, 'is not a ' + incl_type_str + ' -- Skipping') continue elif SNname in list_dict['completed']: print(SNname, 'already done') continue print('Searching for', SNname, '...') # reset for every event -- change if needed SN_dict = {} # if in update mode and SNname directory exists, remove it if update: rmSNdir(SNname, path) # set Obj Name to SNname and retrieve results page form['name'] = SNname browser.submit_form(form) print('\tPage received') # locate object header indecies (_idx) try: headers = browser.find( "tr", {"style": "font-weight:bold"}).findChildren("td") except AttributeError: if update: updateListsJson(SNname, list_dict['completed'], list_dict, path) print('\t', 'No spectra to collect') break else: updateListsJson(SNname, list_dict['completed'], list_dict, path) print('\t', SNname, 'has no available spectra') with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write('From statement 1: ' + SNname + ' has no spectra to collect' + '\n') continue for i, header in enumerate(headers): if header.text == 'Obj. Name': obj_name_idx = i if header.text == 'IAUName': iau_name_idx = i if header.text == 'Redshift': redshift_idx = i if header.text == 'Type': type_idx = i if header.text == 'No. 
of publicSpectra': # publicSpectra not a typo num_total_spec_idx = i # locate objects returned -- it's not always one obj_list = browser.find_all("form", {"target": "new"}) num_objs = len(obj_list) if num_objs >= 1 and update: print('\tNew data available for', num_objs, 'objects.') if num_objs != 1: with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write( str(num_objs) + ' objects returned for ' + SNname + '\n') # locate darkred text ``Potential matching IAU-Name'' if it exists # the location of html table rows (tr) changes if it exists darkred = browser.find( "span", text=" Potential matching IAU-Name/s:", attrs={"style": "color:darkred; font-size:small"}) # parse obj_list, match to SNname, and find its spectra target = '' for obj in obj_list: obj_header = obj.parent.findChildren("td") obj_name = obj_header[obj_name_idx].text if SNname == obj_name: target = obj_header # this checks for spurious page element that changes layout if darkred: try: target_spectra = ( obj.parent.nextSibling.nextSibling.findChildren( "tr", {"valign": "top"})) except AttributeError: print('\t', SNname, 'has no spectra to collect') with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write('From statement 2: ' + SNname + ' has no spectra to collect' + '\n') continue elif darkred is None: try: target_spectra = obj.parent.nextSibling.findChildren( "tr", {"valign": "top"}) except AttributeError: print('\t', SNname, 'has no spectra to collect') with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write('From statement 3: ' + SNname + ' has no spectra to collect' + '\n') continue # No match found, skip this event if not target: continue # exclude non-SN SNtype = target[type_idx].text if not force: if ((include_type and SNtype not in include_type) or (not include_type and SNtype in exclude_type)): updateListsJson(SNname, list_dict['non_SN'], list_dict, path) updateListsJson(SNname, list_dict['completed'], list_dict, path) print('\t', SNname, 'is a', SNtype) with open(_PATH + path + 'non-' + incl_type_str + '.txt', 'a') as f: f.write(SNname + ' is a ' + SNtype + '\n') continue elif SNtype == '': # SNtype = 'Unspecified by WISeREP' print('\tType not specified by WISeREP.', 'Check the Open Supernova Catalog for type.') with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write('Type not specified by WISeREP.' + 'Check the Open Supernova Catalog for type.') # create a directory even if the SN event has no spectra. # find other instances of mkSNdir to revert this. mkSNdir(SNname, path) # second chance to exclude events without spectra num_total_spec = target[num_total_spec_idx].text num_total_spec = unicodedata.normalize("NFKD", num_total_spec) if num_total_spec == u' ' or num_total_spec == u' 0 ': updateListsJson(SNname, list_dict['completed'], list_dict, path) print('\t', SNname, 'has no spectra to collect') with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write('From statement 4: ' + SNname + ' has no spectra to collect' + '\n') continue redshift = target[redshift_idx].text SN_dict[SNname] = OrderedDict() # number of publicly available spectra num_pub_spectra = 0 spec_header = browser.find( "tr", {"style": "color:black; font-size:x-small"}).findChildren("td") for i, header in enumerate(spec_header): if header.text == 'Spec. 
Prog.': program_idx = i if header.text == 'Instrument': instrument_idx = i if header.text == 'Observer': observer_idx = i if header.text == 'Obs.date': obsdate_idx = i if header.text == 'Reducer': reducer_idx = i if header.text == 'Ascii/Fits Files': filename_idx = i if header.text == 'Publish': publish_idx = i if header.text == 'Contrib': contrib_idx = i if header.text == 'Last-modified': last_mod_idx = i if header.text == 'Modified-by': modified_by_idx = i # build SN_dict and locate ascii files on search results page # associated with SNname spectrum_haul = OrderedDict() for spec in target_spectra: spec_link = spec.find("a", href=re.compile(_ASCII_URL)) try: dat_url = quote(spec_link.attrs['href'], "http://") except AttributeError: # handles a return of 'None' continue children = spec.findChildren("td") filename = spec_link.text program = children[program_idx].text if not force and program in exclude_program: print('\tSkipping', program, 'spectrum') # but still count it as public num_pub_spectra += 1 continue # list of duplicate file prefixes to be excluded # list not shorted to ['t', 'f', 'PHASE'] for sanity regexes = [ 't' + SNname, 'tPSN', 'tPS', 'tLSQ', 'tGaia', 'tATLAS', 'tASASSN', 'tSMT', 'tCATA', 'tSNhunt', 'tSNHunt', 'fSNhunt', 'tSNHiTS', 'tCSS', 'tSSS', 'tCHASE', 'tSN', 'tAT', 'fPSN', 'PHASE' ] regexes = "(" + ")|(".join(regexes) + ")" if re.match(regexes, filename): status = 'rapid' else: status = 'final' instrument = children[instrument_idx].text observer = children[observer_idx].text obsdate = children[obsdate_idx].text reducer = children[reducer_idx].text last_modified = children[last_mod_idx].text modified_by = children[modified_by_idx].text contrib = children[contrib_idx].text bibcode = children[publish_idx].text bibcode = unicodedata.normalize("NFKD", bibcode) if (contrib == ('Ruiz-Lapuente, et al. 1997, Thermonuclear ' 'Supernovae. Dordrecht: Kluwer')): bibcode = '1997Obs...117..312R' contrib = 'Ruiz-Lapuente et al. 1997' elif '%26' in bibcode: bibcode = bibcode.replace('%26', '&') SN_dict[SNname][filename] = OrderedDict([ ("Type", SNtype), ("Redshift", redshift), ("Obs. Date", obsdate), ("Program", program), ("Contributor", contrib), ("Bibcode", bibcode), ("Instrument", instrument), ("Observer", observer), ("Reducer", reducer), ("Reduction Status", status), ("Last Modified", last_modified), ("Modified By", modified_by) ]) spectrum_haul[filename] = dat_url num_pub_spectra += 1 # Metadata for SNname is now available. # The following filters cases by the number of # spectra that appear on the WISeREP page. 
if len(spectrum_haul) == 0: print('\tNot collecting spectra at this time') with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write('Not collecting spectra of ' + SNname + ' at this time' + '\n') with open(_PATH + path + SNname + '/README.json', 'w') as fp: json.dump(SN_dict[SNname], fp, indent=4) updateListsJson(SNname, list_dict['completed'], list_dict, path) continue elif len(spectrum_haul) == 1: # remove host spectrum if it exists if SNname in obj_host_dict.keys(): if obj_host_dict[SNname]['Filename'] in SN_dict[SNname].keys(): filename = obj_host_dict[SNname]['Filename'] del SN_dict[SNname][filename] print('\tPurging host galaxy spectrum --', filename) print('\tNot collecting spectra at this time') with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write('Not collecting spectra of ' + SNname + ' at this time' + '\n') updateListsJson(SNname, list_dict['completed'], list_dict, path) continue print('\tDownloading 1 public spectrum') # make SNname subdirectory # os.mkdir(_PATH+path+SNname) # mkSNdir(SNname, path) for filename, url in spectrum_haul.items(): if filename in wiserep_spectrum_ignore: print('\tIgnoring spectrum for', SNname, '-- see sne-external-spectra/donations') continue else: rq = Request(url) res = urlopen(rq) dat = open(_PATH + path + SNname + "/" + filename, 'wb') dat.write(res.read()) dat.close() # add README for basic metadata to SNname subdirectory print('\tWriting README') with open(_PATH + path + SNname + '/README.json', 'w') as fp: json.dump(SN_dict[SNname], fp, indent=4) updateListsJson(SNname, list_dict['completed'], list_dict, path) elif len(spectrum_haul) > 1: # make SNname subdirectory # os.mkdir(_PATH+path+SNname) # mkSNdir(SNname, path) SN_files = deepcopy(SN_dict[SNname]) for filename, metadata in SN_files.items(): if metadata['Reduction Status'] == 'rapid': del SN_dict[SNname][filename] del spectrum_haul[filename] print('\tRemoving duplicate spectrum for', SNname, '--', filename) with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write('Removing duplicate spectrum for ' + SNname + ' -- ' + filename + '\n') # remove host spectrum if it exists if SNname in obj_host_dict.keys(): if obj_host_dict[SNname]['Filename'] in SN_dict[SNname].keys(): filename = obj_host_dict[SNname]['Filename'] del SN_dict[SNname][filename] print('\tPurging host galaxy spectrum --', filename) # need to continue to next supernova if host spectrum was only one if len(SN_dict[SNname].keys()) == 0: print('\tNot collecting spectra at this time') with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write('Not collecting spectra of ' + SNname + ' at this time' + '\n') updateListsJson(SNname, list_dict['completed'], list_dict, path) continue last_modified = {} SN_files = deepcopy(SN_dict[SNname]) for k, d in SN_files.items(): for l, e in SN_files.items(): aa = d['Obs. Date'] == e['Obs. 
Date'] bb = d['Instrument'] == e['Instrument'] cc = d['Observer'] == e['Observer'] dd = d['Modified By'] == 'ofer-UploadSet' ee = d['Modified By'] == e['Modified By'] if aa and bb and cc and dd and ee and k != l: # see 2012fs date = SN_dict[SNname][k]['Last Modified'] newdate = time.strptime(date, '%Y-%m-%d') last_modified[k] = newdate elif aa and bb and cc and k != l: # see 2016bau date = SN_dict[SNname][k]['Last Modified'] newdate = time.strptime(date, '%Y-%m-%d') last_modified[k] = newdate if len(last_modified) <= 1: print('\tPresumably no other duplicate files found for', SNname) with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write('Presumably no other duplicate files found for ' + SNname + '\n') elif len(last_modified) == 2: duplicate = min(last_modified, key=last_modified.get) del SN_dict[SNname][duplicate] del spectrum_haul[duplicate] print('\tRemoving duplicate spectrum for', SNname, '--', duplicate) with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write('Removing duplicate spectrum for ' + SNname + ' -- ' + duplicate + '\n') count = 1 for filename, url in spectrum_haul.items(): print('\tDownloading', count, 'of', len(SN_dict[SNname]), 'public spectra') if filename in wiserep_spectrum_ignore: print('\tIgnoring spectrum for', SNname, '-- see sne-external-spectra/donations') continue else: rq = Request(url) res = urlopen(rq) dat = open(_PATH + path + SNname + "/" + filename, 'wb') dat.write(res.read()) dat.close() count += 1 # add README for basic metadata to SNname subdirectory print('\tWriting README') with open(_PATH + path + SNname + '/README.json', 'w') as fp: json.dump(SN_dict[SNname], fp, indent=4) updateListsJson(SNname, list_dict['completed'], list_dict, path) # reset completed to 0 once all done list_dict['completed'] = [] with open(_PATH + path + 'lists.json', 'w') as fp: json.dump(list_dict, fp, indent=4) # execution time in minutes minutes = (time.time() - start_time) / 60.0 print("Runtime: %s minutes" % minutes) with open(_PATH + path + 'scraper-log.txt', 'a') as f: f.write('Runtime: ' + str(minutes) + ' minutes')
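# A minimal usage sketch for the WISeREP spider above; the flag values are
# illustrative, and _DIR_WISEREP plus the module-level URL constants are assumed
# to be configured as in the rest of the script.
if __name__ == '__main__':
    # incremental run: only events added in the last 30 days
    spider(update=True, daysago=30)
    # full run restricted to one event, ignoring the type filter
    # spider(update=False, name='SN2009ip', force=True)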
def scrapeDetails(details):
    regex = r'Uploaded (?P<date>([-:\d\s]+)), Size (?P<sizeNum>(\d+\.\d+)) (?P<sizeMeas>(\w+)), ULed by (?P<uploader>(\w+))'
    # regex = r'Uploaded (?P<date>([-:\d\s]+)),'
    ret = re.search(regex, details.replace(u'\xa0', u' '))
    return ret.groupdict() if ret else {}

base = "http://thepiratebay.se/search/"
search = "the green mile".replace(' ', '%20')
opts = "/0/7/0"
url = base + search + opts

browser = RoboBrowser()
browser.open(url)

rows = browser.find_all('tr')
for row in rows:
    # print(row)
    elem = row.find('font', {'class': 'detDesc'})
    if elem:
        print(elem.text.replace('\n', ''))
        print(scrapeDetails(elem.text.replace('\n', '')))
# browser.follow_link('generateLink')
# browser.follow_link('HERE')
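# A quick illustrative check of scrapeDetails on a made-up description row;
# the text mirrors the format the detDesc cell usually contains.
sample = u'Uploaded 03-21 2013, Size 1.37 GiB, ULed by somebody'
print(scrapeDetails(sample))
# expected (key order may vary):
# {'date': '03-21 2013', 'sizeNum': '1.37', 'sizeMeas': 'GiB', 'uploader': 'somebody'}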
## search terms
terms = ['bush', 'carson', 'christie', 'cruz', 'fiorina', 'gilmore', 'graham',
         'huckabee', 'jindal', 'kasich', 'pataki', 'paul', 'perry', 'rubio',
         'santorum', 'trump', 'walker', 'romney', 'election', 'presidential',
         'cycle', 'primary', 'primaries', 'candidate', 'race']

## dates to search in 2015
months, days = range(1, 9), range(1, 32)
dates = itertools.product(months, days)

## search the archives for potentially relevant material
browser = RoboBrowser(history=True)
relevant_urls = []
bad_urls = []
for date in dates:
    m, d = date[0], date[1]
    archive_url = 'http://www.wsj.com/public/page/archive-2015-' + str(m) + '-' + str(d) + '.html'
    try:
        browser.open(archive_url)
        articles = browser.find_all('h2')
        for article in articles:
            if any(word in article.get_text().lower() for word in terms):
                relevant_urls.append(article.find('a').get('href'))
    except:
        bad_urls.append(archive_url)

## save the urls ('w', not 'rb': the file is being written, and the with block closes it)
with open('wsj_article_urls.txt', 'w') as f:
    f.write(json.dumps(relevant_urls))
class StitchBot(object): def __init__(self, output_path=None, username=None, password=None): self.browser = RoboBrowser(history=True) self.output_path = output_path or tempfile.TemporaryDirectory().name self.username = username or os.environ['STITCHBOT_USERNAME'] self.password = password or os.environ['STITCHBOT_PASSWORD'] self.logger = logger.getChild('StitchBot') def log(self, level, method_name, message, *args, **kwargs): child_logger = self.logger.getChild(method_name) child_logger.log(level, message, *args, **kwargs) def scrape(self): self.log(logging.INFO, 'scrape', 'Starting scrape') self.log_in() self.navigate_to_free_pattern() scraped_filenames = self.download_pattern() self.log(logging.INFO, 'scrape', 'Scrape complete') return scraped_filenames def log_in(self): self.log(logging.INFO, 'log_in', 'Logging in') self.browser.open('http://dailycrossstitch.com/my-account/') form = self.browser.get_form(class_='login') form['username'] = self.username form['password'] = self.password self.browser.submit_form(form) self.log(logging.INFO, 'log_in', 'Logged in') def navigate_to_free_pattern(self): self.log( logging.INFO, 'navigate_to_free_pattern', 'Finding free pattern') self.browser.open('http://dailycrossstitch.com/') free_button = self.browser.find('a', class_='button', string='FREE') self.browser.follow_link(free_button) self.log( logging.INFO, 'navigate_to_free_pattern', 'Found free pattern') def download_pattern(self): self.log(logging.INFO, 'download_pattern', 'Downloading pattern') download_buttons = self.browser.find_all( 'a', class_='single_add_to_cart_button') download_urls = list(map(itemgetter('href'), download_buttons)) local_filenames = [ self.download_pattern_file(url) for url in download_urls] self.log(logging.INFO, 'download_pattern', 'Downloaded pattern') return local_filenames def download_pattern_file(self, url): self.log( logging.INFO, 'download_pattern_file', 'Downloading pattern file at {0}'.format(url)) self.browser.open(url) download_script = self.browser.find( 'script', string=re.compile(r'^\s*function startDownload')) if not download_script: return pdf_url_match = re.search(r'(http.+\.pdf)', download_script.string) if not pdf_url_match: return pdf_url = pdf_url_match.group(1) self.browser.open(pdf_url) output_filename = self.save_pattern(self.browser.response) self.log( logging.INFO, 'download_pattern_file', 'Downloaded pattern file at {0}'.format(url)) return output_filename def save_pattern(self, response): self.log(logging.INFO, 'save_pattern', 'Saving pattern') try: os.makedirs(self.output_path) except OSError: pass filename = self.get_filename(response.headers) output_filename = os.path.join(self.output_path, filename) with open(output_filename, 'wb') as output_file: output_file.write(response.content) self.log( logging.INFO, 'save_pattern', 'Saved pattern to {0}'.format(output_filename)) return output_filename def get_filename(self, headers, default_filename='pattern.pdf'): filename_match = re.search( r'filename="?([^"]+)"?', headers.get('Content-Disposition', '')) if not filename_match: return default_filename return filename_match.group(1)
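# A minimal usage sketch for StitchBot, assuming the STITCHBOT_USERNAME and
# STITCHBOT_PASSWORD environment variables (or explicit arguments) are set;
# the output directory shown is illustrative.
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    bot = StitchBot(output_path='./patterns')
    saved_files = bot.scrape()
    print(saved_files)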
# Actually do the search (Python 2: raw_input / print statements)
searchList = raw_input('What would you like to search for? Separate your search queries with spaces: ')
sum = 0

# Initialize the link-holder array and the query array
linksToVisit = []
searchQuery = searchList.split(' ')

# Iterate through all the search values
for searchVal in searchQuery:
    browser.open('https://poshmark.com/search?query=' + searchVal + '&type=people')
    # Compile list of users to go to
    print 'Here are all users you are going to visit from search ' + searchVal + ':'
    for link in browser.find_all('a'):
        if (str(link.get('href'))[:6] == '/user/' and
                str(link.get('href'))[-12:] == '/follow_user'):
            linksToVisit.append(str(link.get('href')))
            print link.get('href')

# Actually visits them. And sleeps a lot too.
if len(linksToVisit) == 0:
    print 'No users were returned'
else:
    for url in linksToVisit:
        timetosleep = int(random.random() * 10)
        print 'sleeping for ' + str(timetosleep) + ' seconds.'
        time.sleep(timetosleep)
        print 'followed user ' + url
        browser.open('https://poshmark.com' + url)
        time.sleep(int(random.random() * 2))
from robobrowser import RoboBrowser

my_url = 'http://yifyhdtorrent.com/'
file_name = 'yify.csv'
f = open(file_name, 'w')
headers = 'MOVIE_NAME,RATINGS\n'
f.write(headers)

browser = RoboBrowser(history=True)
browser.open(my_url)
browser.parsed('html')

containers = browser.find_all('div', attrs={'class': 'smp-view'})
for container in containers:
    name_container = container.find('div', attrs={'class': 'title-video'})
    movie_name = name_container.a
    name = movie_name.text
    ratings = container.div.text.strip()
    # print('NAME : ' + name + ' \nRatings :' + ratings[0] + '\n')
    f.write(name + ',' + ratings[0] + '\n')
f.close()
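# A small alternative sketch using the csv module instead of manual string
# concatenation, so movie titles containing commas stay in one column; the
# URL and selectors are the same ones used above.
import csv
from robobrowser import RoboBrowser

browser = RoboBrowser(history=True)
browser.open('http://yifyhdtorrent.com/')
with open('yify.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['MOVIE_NAME', 'RATINGS'])
    for container in browser.find_all('div', attrs={'class': 'smp-view'}):
        title = container.find('div', attrs={'class': 'title-video'}).a.text
        rating = container.div.text.strip()
        writer.writerow([title, rating])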
def get_custom(year, month):
    with open('elevens_list_noname.txt', encoding='utf-8', mode='r') as file:
        elevens = eval(file.read())
    filename = str(year + 1911) + '-' + str(month).zfill(2) + '.txt'
    with open(filename, encoding='utf-8', mode='w') as output:
        # column header: country | commodity code | Chinese name | English name |
        # quantity | quantity unit | weight | weight unit | value
        header = '國家|貨品分類|中文貨名|英文貨品|數量|數量單位|重量|重量單位|價值\n'
        output.write(header)
        for good in range(0, len(elevens) // 250 * 250 - 250 + 1, 250):
            goodsGroup = ','.join(elevens[good: good + 250])
            payload = [('minYear', '92'), ('maxYear', '105'),
                       ('maxMonth', '6'), ('minMonth', '1'),
                       ('maxYearByYear', '104'),
                       # 3: total imports (incl. re-imports), 6: total exports (incl. re-exports)
                       ('searchInfo.TypePort', '6'),
                       # data period: 0 = monthly, 1 = yearly
                       ('searchInfo.TypeTime', '0'),
                       # year range: 92-105 (ROC calendar)
                       ('searchInfo.StartYear', str(year)),
                       ('searchInfo.StartMonth', str(month)),
                       ('searchInfo.EndMonth', str(month)),
                       # 11-digit tariff codes
                       ('searchInfo.goodsType', '11'),
                       ('searchInfo.goodsCodeGroup', goodsGroup),
                       # country/region selector: all countries
                       ('searchInfo.CountryName', '請點選國家地區'),
                       # rbMoney1: New Taiwan dollars, rbMoney2: US dollars
                       ('searchInfo.Type', 'rbMoney2'),
                       # rbByGood: grouped by commodity, rbByCountry: grouped by country
                       ('searchInfo.GroupType', 'rbByCountry'),
                       ('Search', '開始查詢')]
            while True:
                try:
                    browser = RoboBrowser()
                    browser.open(url + urllib.parse.urlencode(payload), verify=False)
                    if browser.response.status_code == 200:
                        break
                except:
                    print('An error has occurred. Retrying.')
                    print(browser.response.text)
                    sleep(60)
            dataListNumber = 'dataList_' + str(month)
            table = browser.find_all('table', {'id': dataListNumber})
            tds = []
            for table_element in table:
                rows = table_element.find_all('tr')
                for row in rows:
                    td = row.find_all('td')
                    tds.append(td)
            data = ''
            for index in range(1, len(tds)):
                row_data = tds[index]
                if row_data[1].text == '合計':  # skip the "total" row
                    continue
                for data_index in range(8):
                    data += row_data[data_index].text + '|'
                data += row_data[8].text + '\n'
            output.write(data)
            terminal_size = shutil.get_terminal_size()[0]
            print('Data for', goodsGroup[0:10] + '-' + goodsGroup[(len(goodsGroup) - 10):len(goodsGroup)],
                  calendar.month_name[month] + ', %s' % (year + 1911),
                  'written on', strftime("%Y-%m-%d %H:%M:%S"))

        # remaining codes that did not fill a full batch of 250
        goodsGroup2 = ','.join(elevens[len(elevens) // 250 * 250: len(elevens)])
        payload = [('minYear', '92'), ('maxYear', '105'),
                   ('maxMonth', '6'), ('minMonth', '1'),
                   ('maxYearByYear', '104'),
                   # 3: total imports (incl. re-imports), 6: total exports (incl. re-exports)
                   ('searchInfo.TypePort', '6'),
                   # data period: 0 = monthly, 1 = yearly
                   ('searchInfo.TypeTime', '0'),
                   # year range: 92-105 (ROC calendar)
                   ('searchInfo.StartYear', str(year)),
                   ('searchInfo.StartMonth', str(month)),
                   ('searchInfo.EndMonth', str(month)),
                   # 11-digit tariff codes
                   ('searchInfo.goodsType', '11'),
                   ('searchInfo.goodsCodeGroup', goodsGroup2),
                   # country/region selector: all countries
                   ('searchInfo.CountryName', '請點選國家地區'),
                   # rbMoney1: New Taiwan dollars, rbMoney2: US dollars
                   ('searchInfo.Type', 'rbMoney2'),
                   # rbByGood: grouped by commodity, rbByCountry: grouped by country
                   ('searchInfo.GroupType', 'rbByCountry'),
                   ('Search', '開始查詢')]
        while True:
            try:
                browser = RoboBrowser()
                browser.open(url + urllib.parse.urlencode(payload), verify=False)
                if browser.response.status_code == 200:
                    break
            except:
                print('An error has occurred. Retrying.')
                print(browser.response.text)
                sleep(60)
        dataListNumber = 'dataList_' + str(month)
        table = browser.find_all('table', {'id': dataListNumber})
        tds = []
        for table_element in table:
            rows = table_element.find_all('tr')
            for row in rows:
                td = row.find_all('td')
                tds.append(td)
        data = ''
        for index in range(1, len(tds)):
            row_data = tds[index]
            if row_data[1].text == '合計':  # skip the "total" row
                continue
            for data_index in range(8):
                data += row_data[data_index].text + '|'
            if index != len(tds) - 1:
                data += row_data[8].text + '\n'
            else:
                data += row_data[8].text
        output.write(data)
        print('Data for', goodsGroup2[0:10] + '-' + goodsGroup2[(len(goodsGroup2) - 10):len(goodsGroup2)],
              calendar.month_name[month] + ', %s' % (year + 1911),
              'written on', strftime("%Y-%m-%d %H:%M:%S"))
    print('=' * terminal_size + calendar.month_name[month] + ', %s' % (year + 1911),
          'data successfully downloaded on', strftime("%Y-%m-%d %H:%M:%S") + '\n' + '=' * terminal_size + '\n')
    return()
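# A minimal usage sketch: ROC year 104 (2015 CE), June. Assumes
# elevens_list_noname.txt and the module-level `url` endpoint are in place.
if __name__ == '__main__':
    get_custom(104, 6)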
browser = RoboBrowser(session=session)

## DVD Queue
browser.open('http://dvd.netflix.com/Queue?prioritized=true&qtype=DD', cookies=cj)

# get the form
queue_form = browser.get_form(class_='hasAwaitingRelease')
queue_submit = queue_form.submit_fields['updateQueue2']

predictions = []
skip_keys = ["authURL", "updateQueueBtn", "updateQueue1", "queueHeader", "updateQueue2"]
for key in queue_form.keys():
    if key in skip_keys:
        continue
    if 'OP' in key:
        continue
    spans = browser.find_all("input", {"name": key})[0].findAllNext("span")
    for s in spans:
        if s is not None:
            for k in s.findChildren():
                for c in k['class']:
                    if 'sbmf-' in c:
                        predicted_rating = c.strip("sbmf-")
                        if key not in (item[0] for item in predictions):
                            predictions.append((key, predicted_rating))

sorted_preds = sorted(predictions, key=lambda x: float(x[1]), reverse=True)

# for i in xrange(len(sorted_preds)):
#     in_arg =
#     in_target
#     queue_form[sorted_preds[i][0]].value = i
#
## form submit not actually working here, it doesn't seem to take
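# A small follow-up sketch: since submitting the reordered queue form does not
# seem to take (per the note above), one option is simply to write the sorted
# predictions out for manual review. sorted_preds is the (field key, predicted
# rating) list built above; the output filename is illustrative.
import csv

with open('netflix_queue_predictions.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['queue_field', 'predicted_rating'])
    writer.writerows(sorted_preds)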