def login():
    """Log in to Airbnb via the email flow using the module-level
    USERNAME / PASSWORD credentials.

    NOTE(review): the final click on the reCAPTCHA anchor is best-effort;
    the original author marked CAPTCHA handling as abandoned ("GG CAPTCHA").
    """
    # Open the login entry in the site header navigation.
    r.click('//header[@role="banner"]/div/div/div[3]/div/div/nav/ul/li[6]')
    r.wait(10)
    # Airbnb sometimes shows an extra "Log in" interstitial (anti-RPA measure);
    # click through it only when it is actually present.
    if r.present('//div[@aria-label="Log in"]/div[2]/div[4]/button'):
        r.click('//div[@aria-label="Log in"]/div[2]/div[4]/button')
    # Likewise the social-auth chooser may or may not appear.
    if r.present('//button[@data-testid="social-auth-button-email"]'):
        r.click('//button[@data-testid="social-auth-button-email"]')
    r.type('//*[@id="email"]', USERNAME)
    r.type('//*[@id="password"]', PASSWORD)
    r.click('//button[@data-veloute="submit-btn-cypress"]')
    # GG CAPTCHA (abandoned ship): clicking the anchor rarely passes the check.
    r.click('//*[@id="recaptcha-anchor"]/div[1]')
def click(xpath, s):
    """Click the element at *xpath* and advance the state counter.

    Args:
        xpath: XPath of the element to click.
        s: current state-machine counter.

    Returns:
        s + 1 when the element existed and was clicked, otherwise s
        unchanged (so the caller retries the same state).
    """
    # FIX: use logical `and` instead of bitwise `&` on the boolean results.
    if r.exist(xpath) and r.present(xpath):
        r.click(xpath)
        return s + 1
    # FIX: original message lacked a space after "find".
    print(f"Couldn't find {xpath} component")
    return s
def close_cookie_popup():
    """Dismiss the cookie-consent banner if it is currently displayed."""
    # Single source of truth for the banner button xpath (was duplicated);
    # also dropped the redundant `== True` comparison.
    accept_btn = "//button[@class='optanon-allow-all accept-cookies-button']"
    if r.present(accept_btn):
        r.click(accept_btn)
def extract_stay_info_as_data():  #Generates URL/text in dict instead, shorten time for upload/download, more unified
    """Scrape text and photo metadata for the top 5 Airbnb stays.

    Visits each URL returned by get_stay_url() and fills a dict keyed
    "0".."4" with name, description, inventory, price, rating, up to 10
    photo URLs/captions, and the listing URL.  Listings whose page layout
    does not match the standard template ("whales", e.g. Airbnb Plus) are
    skipped and an extra iteration is added to compensate.

    Returns:
        dict[str, dict]: the populated data dictionary.
    """
    # Pre-sized result skeleton; picurl/pictext hold at most 10 entries each.
    data = {
        "0": { "name": "", "description": "", "inventory": "", "price": "", "rating": "", "picurl": [None] * 10, "pictext": [None] * 10, "url": "" },
        "1": { "name": "", "description": "", "inventory": "", "price": "", "rating": "", "picurl": [None] * 10, "pictext": [None] * 10, "url": "" },
        "2": { "name": "", "description": "", "inventory": "", "price": "", "rating": "", "picurl": [None] * 10, "pictext": [None] * 10, "url": "" },
        "3": { "name": "", "description": "", "inventory": "", "price": "", "rating": "", "picurl": [None] * 10, "pictext": [None] * 10, "url": "" },
        "4": { "name": "", "description": "", "inventory": "", "price": "", "rating": "", "picurl": [None] * 10, "pictext": [None] * 10, "url": "" }
    }
    print('Extracting Top 5 Stay Picture Information (10 Image Max)..')
    url = []
    url = get_stay_url()
    i = 0  # index into data ("0".."4")
    k = 0  # compensation offset: advances past skipped "whale" listings
    while (i < 5):
        data[str(i)]["url"] = url[i + k]
        r.url(url[i + k])
        print(f'Extracting Text Data - Homestay {i+1}')
        # The title xpath only resolves on the standard listing template;
        # its absence signals a non-standard ("whale") layout handled below.
        if (r.exist(
                '//*[@data-plugin-in-point-id="TITLE_DEFAULT"]/div/div/section/div/div/h1'
        ) == True):
            data[str(i)]["name"] = r.read(
                '//*[@data-plugin-in-point-id="TITLE_DEFAULT"]/div/div/section/div/div/h1'
            )
            data[str(i)]["description"] = r.read(
                '//*[@data-plugin-in-point-id="OVERVIEW_DEFAULT"]/div/div/div/section/div/div/div/div/div'
            )
            # Strip non-breaking spaces the page embeds in the description.
            data[str(i)]["description"] = data[str(i)]["description"].replace(
                "\xa0", " ")
            data[str(i)]["inventory"] = r.read(
                '//*[@data-plugin-in-point-id="OVERVIEW_DEFAULT"]/div/div/div/section/div/div/div/div/div[2]'
            )
            data[str(i)]["price"] = r.read(
                '//*[@data-plugin-in-point-id="BOOK_IT_SIDEBAR"]/div/div[2]/div/ul[2]/li/span[2]'
            )  #Total Price
            if r.present(
                    '//*[@data-plugin-in-point-id="REVIEWS_DEFAULT"]/div/div/section/div/div/div/h2/span[2]/span'
            ):
                data[str(i)]["rating"] = r.read(
                    '//*[@data-plugin-in-point-id="REVIEWS_DEFAULT"]/div/div/section/div/div/div/h2/span[2]/span'
                )
            else:
                data[str(i)]["rating"] = "No Reviews Yet"
            # Open the photo viewer, then page through up to 10 slides.
            r.click('//*[@id="FMP-target"]')
            j = 0
            while (1):
                j = j + 1
                print(f'Extracting Picture Data - Homestay {i+1} Photo {j}')
                r.wait(0.4)  # let the slideshow image load before reading
                #r.snap('//div[@data-testid="photo-viewer-slideshow-desktop"]/div/div/div/div/div/img',f"data/{i+1}/{j}.jpg") #fastest but not perfect
                if (r.exist(
                        '//div[@data-testid="photo-viewer-slideshow-desktop"]/div/div/div/div/div/img/@src'
                ) == True):
                    data[str(i)]["picurl"][j - 1] = r.read(
                        '//div[@data-testid="photo-viewer-slideshow-desktop"]/div/div/div/div/div/img/@src'
                    )
                # Caption is optional per photo.
                if (r.present(
                        '//div[@data-testid="photo-viewer-slideshow-desktop"]/div/div/div/div[2]/div/span/div/span'
                ) == True):
                    data[str(i)]["pictext"][j - 1] = r.read(
                        '//div[@data-testid="photo-viewer-slideshow-desktop"]/div/div/div/div[2]/div/span/div/span'
                    )
                #r.download(dl_link,f'data/{i+1}/{j}.jpg')
                print(f'Homestay {i+1} Photo {j} extracted!')
                # Stop on the last slide or after the 10-photo cap.
                if (r.exist('//*[@aria-label="Next"]') == False or j >= 10):
                    break
                r.click('//*[@aria-label="Next"]')
        else:
            i = i - 1  #Detects Whales (Airbnb Plus spoils the format alot)
            k = k + 1  #Compensating Constant k
            print("WHALE detected, adding one more loop..")
        i = i + 1
        #r.click('/html/body/div[9]/div/div/div/section/div/div[1]/div/button')
    print('Done.')
    return data
def extract_stay_info_as_data():  #Generates URL/text in dict instead, shorten time for upload/download, more unified
    """Scrape text, coordinates and photo metadata for the top 5 stays.

    Variant of the extractor targeting an older Airbnb page layout
    (itemprop/veloute selectors); also captures map coordinates.

    NOTE(review): this redefines a function of the same name — if both
    versions live in one module, this later definition shadows the earlier
    one.  Confirm which layout the target site currently serves.

    Returns:
        dict[str, dict]: the populated data dictionary.
    """
    # Pre-sized result skeleton; this variant adds a "coordinates" field.
    data = {
        "0": { "name": "", "description": "", "inventory": "", "price": "", "rating": "", "picurl": [None] * 10, "pictext": [None] * 10, "url": "", "coordinates": "" },
        "1": { "name": "", "description": "", "inventory": "", "price": "", "rating": "", "picurl": [None] * 10, "pictext": [None] * 10, "url": "", "coordinates": "" },
        "2": { "name": "", "description": "", "inventory": "", "price": "", "rating": "", "picurl": [None] * 10, "pictext": [None] * 10, "url": "", "coordinates": "" },
        "3": { "name": "", "description": "", "inventory": "", "price": "", "rating": "", "picurl": [None] * 10, "pictext": [None] * 10, "url": "", "coordinates": "" },
        "4": { "name": "", "description": "", "inventory": "", "price": "", "rating": "", "picurl": [None] * 10, "pictext": [None] * 10, "url": "", "coordinates": "" }
    }
    print('Extracting Top 5 Stay Picture Information (10 Image Max)..')
    url = []
    url = get_stay_url()
    i = 0  # index into data ("0".."4")
    k = 0  # compensation offset: advances past skipped "whale" listings
    while (i < 5):
        data[str(i)]["url"] = url[i + k]
        r.url(url[i + k])
        print(f'Extracting Text Data - Homestay {i+1}')
        # Standard-template marker; absence means a "whale" layout (skipped).
        if (r.exist('//*[@itemprop="name"]/span/h1/span') == True):
            # Coordinates are parsed out of the Google Maps link href
            # (text between the first "=" and the following "&").
            data[str(i)]["coordinates"] = r.read(
                '//*[@data-veloute="map/GoogleMap"]/div/div/div/div[2]/a/@href'
            ).split("=", 1)[1].split("&", 1)[0]
            data[str(i)]["name"] = r.read('//*[@itemprop="name"]/span/h1/span')
            data[str(i)]["description"] = r.read(
                '//*[@href="#neighborhood"]/div')
            #data[str(i)]["description"]=data[str(i)]["description"].replace("\xa0"," ")
            # Inventory is spread over four sibling cells; join with spaces.
            data[str(i)]["inventory"] = r.read(
                '//*[@id="room"]/div[2]/div/div[2]/div/div/div[3]/div/div/div[1]/div/div/div[1]/div'
            ) + " " + r.read(
                '//*[@id="room"]/div[2]/div/div[2]/div/div/div[3]/div/div/div[1]/div/div/div[2]/div'
            ) + " " + r.read(
                '//*[@id="room"]/div[2]/div/div[2]/div/div/div[3]/div/div/div[1]/div/div/div[3]/div'
            ) + " " + r.read(
                '//*[@id="room"]/div[2]/div/div[2]/div/div/div[3]/div/div/div[1]/div/div/div[4]/div'
            )
            # The booking form renders the total in one of two positions.
            if (r.present('//*[@id="book_it_form"]/div[4]/div[2]') == True):
                data[str(i)]["price"] = r.read(
                    '//*[@id="book_it_form"]/div[4]/div[2]').split("Total", 1)[1]
            else:
                data[str(i)]["price"] = r.read(
                    '//*[@id="book_it_form"]/div[2]').split("Total", 1)[1]  #Total Price
            if r.present('//*[@data-heading-focus="review header"]/div'):
                # e.g. "4.8 stars (123)" assembled from aria-label + count span.
                data[str(i)]["rating"] = r.read(
                    '//*[@data-heading-focus="review header"]/div/div/@aria-label'
                ) + " (" + r.read(
                    '//*[@data-heading-focus="review header"]/div/span') + ")"
            else:
                data[str(i)]["rating"] = "No Reviews Yet"
            # Open the photo viewer, then page through up to 10 slides.
            r.click('//*[@data-veloute="hero-view-photos-button"]')
            j = 0
            while (1):
                j = j + 1
                print(f'Extracting Picture Data - Homestay {i+1} Photo {j}')
                r.wait(0.4)  # let the slideshow image load before reading
                #r.snap('//div[@data-testid="photo-viewer-slideshow-desktop"]/div/div/div/div/div/img',f"data/{i+1}/{j}.jpg") #fastest but not perfect
                if (r.exist('//img[@data-veloute="slideshow-image"]/@src') == True):
                    data[str(i)]["picurl"][j - 1] = r.read(
                        '//img[@data-veloute="slideshow-image"]/@src')
                # Caption is optional per photo.
                if (r.present(
                        '//*[@data-veloute="slideshow-modal"]/div/div/div[2]/div[2]/div[2]/div[2]/div'
                ) == True):
                    data[str(i)]["pictext"][j - 1] = r.read(
                        '//*[@data-veloute="slideshow-modal"]/div/div/div[2]/div[2]/div[2]/div[2]/div'
                    )
                #r.download(dl_link,f'data/{i+1}/{j}.jpg')
                print(f'Homestay {i+1} Photo {j} extracted!')
                # Stop on the last slide or after the 10-photo cap.
                if (r.exist('//button[@aria-label="Next"]') == False or j >= 10):
                    break
                r.click('//button[@aria-label="Next"]')
        else:
            i = i - 1  #Detects Whales (Airbnb Plus spoils the format alot)
            k = k + 1  #Compensating Constant k
            print("WHALE detected, adding one more loop..")
        i = i + 1
        #r.click('/html/body/div[9]/div/div/div/section/div/div[1]/div/button')
    print('Done.')
    return data
# Yelp search scraper: for each Singapore area in area_list, walk every
# results page and probe the first 30 listing links.
# NOTE(review): this chunk appears truncated — the inner listing loop only
# breaks when a link is missing; the append of the link href presumably
# follows on lines outside this view.  Confirm against the full file.
for a in range(size):
    URL = f'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Singapore&l=p%3ASG-SG%3ASingapore%3A%3A{area_list[a]}'
    r.url(URL)
    time.sleep(10)  # allow the results page to finish rendering
    URL_list = []
    # Pagination widget reads "1 of N"; strip the prefix to get N.
    maxpage = int(
        r.read(
            '//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[2]/span'
        ).replace("1 of ", ""))
    for j in range(0, maxpage):
        if j != 0:
            # The "next page" arrow is the 2nd match when a "previous"
            # arrow is also present, otherwise the 1st.
            if r.present(
                    f'(//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[1]/div/div/span/a/span)[2]'
            ):
                r.click(
                    f'(//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[1]/div/div/span/a/span)[2]'
                )
            else:
                r.click(
                    f'(//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[1]/div/div/span/a/span)[1]'
                )
            time.sleep(10)  # wait out the page transition
        # Probe up to 30 listing slots; stop at the first missing link.
        for i in range(1, 31):
            if r.exist(
                    f'//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/ul/li[{i}]/div/div/div/div[2]/div[1]/div/div[1]/div/div[1]/div/div/h4/span/a/@href'
            ) == False:
                break
import rpa as r
import pandas as pd
import time

# Read all restaurant names in csv.
yelp = pd.read_csv("yelpindex.csv")
yelp["URL"].head  # NOTE(review): missing () — this is a no-op; kept as-is.
r.init()

# Fill up table with URLs: for each row, load its search URL and read the
# first organic Google result link (falls back to the 2nd block when the
# 1st is an ad/feature box).
# FIX: original looped `for i in (0, len(yelp["URL"]))`, which iterates the
# 2-tuple (0, N) — visiting only row 0 and then an out-of-range index —
# instead of every row.
for i in range(len(yelp["URL"])):
    r.url(yelp["URL"][i])
    if r.present("//*[@id='rso']/div[1]/div/div[1]/a/@href"):
        name = r.read("//*[@id='rso']/div[1]/div/div[1]/a/@href")
    else:
        name = r.read("//*[@id='rso']/div[2]/div/div[1]/a/@href")
    print(name)
    # FIX: use .loc instead of chained indexing (yelp["URL"][i] = ...),
    # which pandas may apply to a temporary copy (SettingWithCopyWarning).
    yelp.loc[i, "URL"] = name
    time.sleep(5)  # throttle to avoid being rate-limited

yelp.to_csv("yelpindex_updated.csv")
yelp2TA = pd.read_csv("yelpindex_updated.csv")
yelp2TA.reset_index(drop=True)  # NOTE(review): result discarded — no-op; kept as-is.
yelp2TA["URL"].head  # NOTE(review): missing () — no-op; kept as-is.

# Get Reviews — resume-state for the review-scraping loop that follows.
author_loc = ""
reviews_df = pd.DataFrame()
#if y == 9:
init = 33  # presumably a manual resume bookmark — TODO confirm against caller
import pickle #Get URLs r.init() URL = f'https://www.tripadvisor.com.sg/Restaurants-g294265-Singapore.html' r.url(URL) time.sleep(10) maxpage = int(r.read('//*[@id="EATERY_LIST_CONTENTS"]/div[2]/div/div/a[6]/@data-page-number')) for j in range (0,maxpage): URL_list = [] if j!=0: if r.present(f'(//*[@id="EATERY_LIST_CONTENTS"]/div[2]/div/a)[2]'): r.click(f'(//*[@id="EATERY_LIST_CONTENTS"]/div[2]/div/a)[2]') else: r.click(f'(//*[@id="EATERY_LIST_CONTENTS"]/div[2]/div/a)[1]') time.sleep(10) for i in range (1,100): if r.exist(f'(//*[@id="component_2"]/div/div[*]/span/div[1]/div[2]/div[1]/div/span/a/@href)[{i}]') == False: break URL_list.append("https://www.tripadvisor.com.sg" + r.read(f'(//*[@id="component_2"]/div/div[*]/span/div[1]/div[2]/div[1]/div/span/a/@href)[{i}]')) #print(URL_list) with open(f'url_list_{j}.txt', 'wb') as filehandle: pickle.dump(URL_list, filehandle) #Load Bookmark listnumber = 0 iteminlist = 0
def sigaaRPA(self):
    """Drive the SIGAA student portal as a state machine.

    States 1-8 log in and navigate to the partial-grades page for
    ``self.semester``; state 9 iterates every course row, opens its grade
    detail and screenshots it to ./notas/; setting ``self.terminateBot``
    closes the browser and exits.

    NOTE(review): credentials are hardcoded below — move them to config
    or environment variables.
    """
    r.init()
    r.timeout(30)
    while(self.state > 0):
        if self.terminateBot:
            r.close()
            break
        elif self.state == 1:
            # use url('your_url') to go to web page, url() returns current URL
            r.url('https://sigaa.upb.edu.co/ssomanager/c/SSB')
            self.state = self.state + 1
        elif self.state == 2:
            # use type() to use the keyboard to write something
            if r.exist(X.username) & r.present(X.username):
                # NOTE(review): hardcoded credentials (security risk).
                r.type(X.username, '000290164')
                r.type(X.password, 'Tandres1997_')
                self.state = self.state + 1
            else:
                # Retry from the login page when the form isn't found.
                print("Couldn\'t find Username and Password Components")
                self.state = 1
        elif self.state == 3:
            # use click() to click on an UI element or x, y location
            self.state = click(X.login, self.state)
        elif self.state == 4:
            ## click on "Estudiantes" (Students)
            self.state = click(X.estudiantes, self.state)
        elif self.state == 5:
            ## click on "Seguimiento a la formación" (academic tracking)
            self.state = click(X.seguimieto, self.state)
        elif self.state == 6:
            ## click on "Calificaciones parciales" (partial grades)
            self.state = click(X.calif, self.state)
        elif self.state == 7:
            ## select the semester whose grades should be inspected
            r.select(X.semester, self.semester)
            self.state = self.state + 1
        elif self.state == 8:
            ## submit the semester form
            r.click(X.enviar)
            self.state = self.state + 1
        elif self.state == 9:
            # Iterate every course link in the grades table (rows start at 2),
            # open its detail page and screenshot the grade breakdown.
            tablexpath = ''
            r.wait(2)
            numCursos = r.count('//*[@class="datadisplaytable"][2]/tbody/tr/td/a')
            for i in range(2,numCursos+2):
                tablexpath = '//*[@class="datadisplaytable"][2]/tbody/tr['+ str(i) +']/td/a'
                if r.exist(tablexpath):
                    r.click(tablexpath)
                    r.wait(1)
                    pagetitle = r.read('//div[@id="pagetitle"]')
                    # Only the component-grade detail page gets captured.
                    if pagetitle == 'Detalle de Calificación de Componente':
                        materia = r.read('//*[@class="datadisplaytable"][1]/tbody/tr[5]/td[2]')
                        print(materia)
                        r.snap('page', './notas/s'+self.semester+'/'+ materia +'.png')
                        # r.table('//table[@class="datadisplaytable"][2]', './csv/table'+str(i-1)+'.csv')
                    # Return to the course list before the next row.
                    r.dom('history.back()')
            # use wait() to wait for a number of seconds
            # default wait() is 5 seconds
            r.wait(5)
            self.terminateBot = True
        elif self.state == 10:
            r.dom('history.back()')
account2box = account2boxid searchbutton = searchbuttonid if not r.exist('//*[@id="'+account1box+'"]'): i+=1 continue #enter account number r.type('//*[@id="'+account1box+'"]', "[clear]") r.type('//*[@id="'+account1box+'"]', acc[0]) r.type('//*[@id="'+account2box+'"]', "[clear]") r.type('//*[@id="'+account2box+'"]', acc[1]) #click search r.click('//*[@id="'+searchbutton+'"]') while r.present('//*[@class="busy-load-container"'): r.wait(2) r.wait(2) #account exist? txt = r.read('body') if "No accounts were found matching your search criteria" in txt: runningLog(row[0] + ': Account does not exist') break #click last bill if r.present('Latest bill'): #Billing history has no this button, download link displayed already r.click('Latest bill') r.wait(1) while r.present('//*[@class="busy-load-container"'): r.wait(2)