def read_content(page_num, url_prefix, i, today): t.url(url_prefix + str(page_num) + '.html') # 启动很慢 t.wait(2) if t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//span[@class = "hui12"]') < today: t.close() return '', '', '', '' if '' == t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']'): print("no here") raise Exception("an exception") file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']') file_name = file_name[:-10] + str("_") + file_name[-10:] + str('.txt') time = file_name[-14:-4] prefix = 'http://www.pbc.gov.cn' content_url = prefix + t.read( element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href') if '' == t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href'): print("no here") raise Exception("an exception") flag = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href') # 判断是否需要下载 return flag, time, content_url, file_name
def rpa_process(to_date, preferred_time, from_date, phone_number, token):
    """Drive the test website's 'change appointment' form, then sync the calendar.

    NOTE(review): *preferred_time* is never used here — hour/minute are taken
    from *to_date*; confirm that is intended.
    """
    t.init()
    t.url("https://sangam-test-website.herokuapp.com/change_input")
    util.wait_for_pageload('//button[@id="btnsubmit"]')
    # Identify the account by phone number.
    t.click('//input[@id="txtHandNo"]')
    t.type('//input[@name="txtHandNo"]', phone_number)
    t.click('//button[@id="btnsubmit"]')
    util.wait_for_pageload('//button[@id="btnsubmit"]')
    # Keep the datetime object for the calendar call; the form wants DD/MM/YYYY.
    from_date_obj = from_date
    from_date = from_date.strftime("%d/%m/%Y")
    t.click('//label[contains(.,"' + str(from_date) + '")]')
    to_date_obj = to_date
    hour = to_date.hour
    minute = to_date.minute
    to_date = to_date.strftime("%d/%m/%Y")
    t.click('//input[@name="txtDateTimePicker"]')
    t.type('//input[@name="txtDateTimePicker"]', to_date)
    # Pick hour and minute from the dropdowns.
    t.click('//div[@class="filter-option-inner-inner"]')
    t.click('//a[@role= "option"][.=' + str(hour) + ']')
    t.click('//select[@id="ddlMin"]')
    t.click('//a[@role= "option"][.=' + str(minute) + ']')
    t.click('//button[@id="btnsubmit"]')
    t.close()
    change_appointment_slot(from_date_obj, to_date_obj, token)
def catchContent():
    """Scrape today's article list from Today Online and summarize each article.

    Returns:
        pandas.DataFrame with columns Sno, Title, URL, Summary, Img_URL
        (rows without an image are dropped and renumbered).
    """
    number_to = t.count(
        '(//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")])'
    )
    df_to = pd.DataFrame(index=range(0, number_to),
                         columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])
    # Hover near the footer so lazy-loaded article entries render.
    t.hover('//div[@class="container footer-main"]')
    t.wait(2)
    for n in range(1, number_to):
        title = t.read(
            '//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")][{}]//div[contains(@class, "article-listing_content")]//h2'
            .format(n))
        URL_o = t.read(
            '//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")][{}]//@href'
            .format(n))
        URL = "https://www.todayonline.com" + str(URL_o)
        Img_link = t.read(
            '//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")][{}]//img/@src'
            .format(n))
        df_to.iloc[n - 1, 0] = n
        # NOTE(review): .decode implies a Python 2 byte string from t.read —
        # confirm the runtime; under Python 3 this would raise AttributeError.
        df_to.iloc[n - 1, 1] = title.decode('utf-8')
        df_to.iloc[n - 1, 2] = URL
        df_to.iloc[n - 1, 4] = Img_link
    # Blank image URLs become NaN so the rows can be dropped, then renumber.
    for i in range(0, df_to.shape[0]):
        if df_to['Img_URL'][i] == "":
            df_to['Img_URL'][i] = np.nan
    df_to.dropna(subset=['Img_URL'], inplace=True, how='any')
    df_to = df_to.reset_index(drop=True)
    df_to['Sno'] = df_to.index
    df_to = util.fixImgLink(
        df_to,
        "https://cf-templates-fghyux9ggb7t-ap-southeast-1.s3-ap-southeast-1.amazonaws.com/todayOnline.png"
    )
    # Visit each article and build its summary.
    for n in range(0, df_to.shape[0]):
        t.url(df_to.URL[n])
        t.wait(4)
        t.hover('//div[@class="article-detail_subscription"]')
        t.wait(2)
        number_p = t.count('//div/p[not(@class)]')
        Content = ""
        for i in range(1, number_p - 2):
            cont = t.read('//div/p[not(@class)][{}]'.format(i))
            Content = Content + "" + cont
        summaries = Summarize(df_to.Title[n], unicode(str(Content), "utf-8"))
        # BUG FIX: was df_to.iloc[n - 1, 3] — on the first pass (n == 0) that
        # is iloc[-1], writing the first summary into the LAST row and leaving
        # every summary misaligned by one row.
        df_to.iloc[n, 3] = summaries[0]
    return df_to
def get_max_page(url_prefix):
    """Open page 1 of a listing, read its 'current/total' pager, persist the
    total page count to a per-listing text file, and return that file's name.
    """
    # Page 1 is enough: the pager shows "current/total" on every page.
    t.url(url_prefix + '1.html')
    pager_text = t.read(element_identifier='//td[@class = "Normal"]')
    max_page = int(pager_text.split('/')[1])  # total page count
    # One record file per listing, keyed by the listing's directory name.
    listing_key = url_prefix.split('/')[-2]
    record_file = 'max_page_' + str(listing_key) + '.txt'
    with open(record_file, 'w', encoding='utf-8') as f:
        f.write(str(max_page))
    return record_file
def login(self):
    """Log into Outlook (live.com) using this object's account/password."""
    t.url(
        'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&ct=1586073207&rver=7.0.6737.0&wp=MBI_SSL&wreply=https%3a%2f%2foutlook.live.com%2fowa%2f%3fnlp%3d1%26RpsCsrfState%3d6590c65e-2e3f-b1ed-bda9-2c5e901a9000&id=292841&aadredir=1&whr=outlook.sg&CBCXT=out&lw=1&fl=dob%2cflname%2cwld&cobrandid=90015'
    )
    t.wait(1)
    # '[enter]' submits each field, advancing the two-step login flow.
    type_into('//*[@type="email"]', self.account + '[enter]')
    t.wait(1)
    type_into('//*[@name="passwd"]', self.password + '[enter]')
def direct_download(content_url, time, i):
    """Download the attachment linked from listing row *i* and date-stamp it.

    content_url: target URL (a duplicated 'cnhttp...' prefix is stripped first)
    time: date string inserted into the saved file's name
    i: 1-based index of the table row on the listing page

    Raises Exception when the row has no link.
    """
    # Some scraped URLs arrive with the site prefix duplicated (cause unknown
    # upstream); strip the first 21 characters to recover the real URL.
    if 'cnhttp' in content_url:
        content_url = content_url[21:]
    print("文件{} 是直接下载文件。".format(i))
    # NOTE: the original repeated this whole body verbatim in both branches of
    # the prefix check — deduplicated here.
    href = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
    if '' == href:
        print("no here")
        raise Exception("an exception")
    suffix = href.split('.')[-1]      # file extension (no dot)
    file_name = href.split('/')[-1]   # name the browser saves the file under
    t.url(content_url)  # navigating to the URL triggers the download (slow)
    # Poll for the downloaded file, giving up after MAX_WAIT seconds.
    wait_seconds = 1
    total_seconds = 0
    while not os.path.exists(file_name):
        t.wait(wait_seconds)
        total_seconds += wait_seconds
        if total_seconds > MAX_WAIT:
            print('download fails')
            break
    if os.path.exists(file_name):
        # BUG FIX: the original produced "name_date..ext" (doubled dot) by
        # prepending '.' to a slice that already began with '.'.
        os.rename(file_name,
                  file_name[:-(len(suffix) + 1)] + "_" + time + "." + suffix)
def get_count_values(page_num, url_prefix):
    """Count the items on listing page *page_num*, record the count in a
    'count_items_<page>_<listing>.txt' file, and return that file's name.
    """
    t.url(url_prefix + str(page_num) + '.html')
    print("现在所在页面 {}".format(page_num))
    t.wait(5)
    # Each item on the page is one <table> inside the two-column cell.
    count_values = t.count(element_identifier='//td[@colspan = "2"]//table')
    print("页面有{}个文件".format(count_values))
    # Build the record filename once instead of twice.
    record_file = ('count_items_' + str(page_num) + '_'
                   + str(url_prefix.split('/')[-2]) + '.txt')
    with open(record_file, 'w', encoding='utf-8') as f:
        # Colon-separated record: page number and total item count.
        f.write('page:' + str(page_num) + ':' + str(count_values))
    return record_file
def propertydata(project_name):
    """Search PropertyGuru for *project_name*, scrape the first 3 listings'
    title/id/pdf-link, and write them to 'Property Monitor.xlsx'.
    """
    t.close()
    t.init()
    project_url = f'https://www.propertyguru.com.sg/property-for-sale?market=residential&freetext={project_name}&newProject=all'
    t.url(project_url)
    wait_for_pageload('//div[@class="header-wrapper"]')
    num_result_ad = 3
    # load main page, get detail page url link
    url = [''] * num_result_ad
    # NOTE: with num_result_ad == 3 the x != 4 / x != 8 filters are no-ops;
    # they only matter if the count is raised again.
    for n in [x for x in range(1, num_result_ad + 1)
              if x != 4 and x != 8]:  # skip 4th and 8th advertisement
        wait_for_pageload(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        url[n - 1] = read_if_present(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        print(f"{n}. url = " + url[n - 1])
    property_title = [''] * num_result_ad
    id = [''] * num_result_ad  # NOTE(review): shadows the builtin `id`
    pdf = [''] * num_result_ad
    pdf_link = [''] * num_result_ad
    # Visit each detail page and pull title, listing id and brochure link.
    for n in [x for x in range(1, num_result_ad + 1) if x != 4 and x != 8]:
        t.url("https://www.propertyguru.com.sg" + url[n - 1])
        wait_for_pageload('//h1[@class="h2"]')
        property_title[n - 1] = read_if_present('//h1[@class="h2"]')
        print(f"{n}. property_title = " + property_title[n - 1])
        id[n - 1] = read_if_present(
            '//*[@id="details"]/div/div[1]/div[2]/div[10]/div/div[2]')
        print(f"{n}. id = " + id[n - 1])
        pdf[n - 1] = read_if_present(
            '//*[@id="sticky-right-col"]/div[3]/a[2]/@href')
        # NOTE(review): if the brochure link is absent, pdf is '' and this
        # still yields the bare site prefix.
        pdf_link[n - 1] = 'https://www.propertyguru.com.sg' + pdf[n - 1]
        print(f"{n}. pdf_link = " + pdf_link[n - 1])
    property_info = {
        'property_title': property_title,
        'url': ['https://www.propertyguru.com.sg' + x for x in url],
        'id': id,
        'pdf_link': pdf_link,
    }
    df = DataFrame(property_info,
                   columns=['property_title', 'id', 'url', 'pdf_link'])
    df.to_excel('Property Monitor.xlsx', encoding='utf8', index=None)
    print('======== Property Monitor.xlsx saved ==========')
    # NOTE(review): `interval` is not defined in this function — presumably a
    # module-level constant; verify it exists at runtime.
    print(f'======== Monitoring every {interval} second ==========')
def login_stackoverflow(account, password):
    """Log into Stack Overflow via the Google OAuth provider.

    If the Google chooser already lists *account*, click it; otherwise pick
    the last list entry ("use another account") and type the credentials.
    """
    t.url('https://stackoverflow.com/users/login')
    click('//button[@data-provider="google"]')
    if wait_element('//div[@data-identifier="{}"]'.format(account)):
        click('//div[@data-identifier="{}"]'.format(account))
    else:
        # Last option in the account chooser list.
        c = t.count('//div[@jsslot=""]//li')
        click('(//div[@jsslot=""]//li)[{}]'.format(c))
        type_into('//*[@type="email"]', account + '[enter]')
        type_into('//*[@name="password"]', password + '[enter]')
    return
def rpa_process(lmp_date, doctor_name, preferred_time, phone_number,
                patient_name, symptoms, email, sub_id):
    """Book the standard series of pregnancy checkups and sync the calendar.

    Computes checkup dates from *lmp_date* (last menstrual period) at fixed
    day offsets, books each slot on the test website, then fetches the booked
    appointments and pushes them to the calendar.

    NOTE(review): the *symptoms* parameter is overwritten inside the loop with
    a generated "Pregnancy checkup after week N" string — confirm intended.
    """
    hour = preferred_time.hour
    minute = preferred_time.minute
    checkup_dates = []
    # Day offsets from LMP and the corresponding gestation weeks.
    day_list = [
        45, 75, 105, 135, 165, 195, 210, 225, 240, 255, 262, 269, 275, 280
    ]
    week_list = [6, 10, 14, 18, 22, 26, 28, 30, 32, 34, 36, 37, 38, 39]
    for day in day_list:
        checkup = lmp_date + timedelta(days=day)
        # D/M/YYYY without zero padding, as the site's picker expects.
        checkup = str(checkup.day) + "/" + str(checkup.month) + "/" + str(
            checkup.year)
        checkup_dates.append(checkup)
    t.init()
    # Book one appointment per checkup date.
    for index, i in enumerate(checkup_dates):
        t.url("https://sangam-test-website.herokuapp.com/")
        util.wait_for_pageload('//button[@id="btnsubmit"]')
        t.click('//input[@class="form-control"]')
        t.type('//input[@name="name"]', patient_name)
        t.click('//input[@id="email"]')
        t.type('//input[@name="email"]', email)
        symptoms = "Pregnancy checkup after week " + str(week_list[index])
        t.type('//textarea', symptoms)
        t.click('//input[@id="txtHandNo"]')
        t.type('//input[@name="txtHandNo"]', phone_number)
        # Hour / minute dropdowns.
        t.click('//div[@class="filter-option-inner-inner"]')
        t.click('//a[@role= "option"][.=' + str(hour) + ']')
        t.click('//select[@id="ddlMin"]')
        t.click('//a[@role= "option"][.=' + str(minute) + ']')
        t.click('//input[@name="txtDateTimePicker"]')
        t.type('//input[@name="txtDateTimePicker"]', i)
        t.click('//select[@id="txtSpecificDoc"]')
        t.click('//a[@role= "option"][.="' + str(doctor_name) + '"]')
        t.click('//button[@id="btnsubmit"]')
    t.close()
    request_url = "https://sangam-test-website.herokuapp.com/get_future_appointments?email=" + str(
        email)
    future_appointments = requests.get(request_url)
    book_calendar_slot(future_appointments.json()['data'], sub_id)
def flight_search(info):
    """Fill Expedia's flight search form from *info* and launch the search.

    info: dict with at least 'adult' (count as str/int) and 'child_age'
    (list of child ages) — TODO confirm full schema against fill_search().
    """
    t.url('https://www.expedia.com.sg/')
    tu.wait_for_pageload('//button[@id="tab-flight-tab-hp"]')
    t.click('//button[@id="tab-flight-tab-hp"]')
    fill_search(info)
    # Advanced options host the traveller-count widgets.
    tu.wait_for_pageload('//button[@id="flights-advanced-options-toggle"]')
    t.click('//button[@id="flights-advanced-options-toggle"]')
    tu.wait_for_pageload('//select[@id="child-count"]')
    adult_pax = int(info['adult'])
    children_pax = len(info['child_age'])
    children_age = info['child_age']
    number_of_travellers(adult_pax, children_pax, children_age)
    t.click('//*[@id="flight-wizard-search-button"]')
def get_shoe(shoe, g, email): gender = g # print('[nike]',gender) t.init(visual_automation=True) t.url('https://www.nike.com/sg/') t.type('//input[@id = "TypeaheadSearchInput"]', shoe + " shoes") t.click('//button[@class = "btn-search z2 bg-transparent"]') t.wait(3) if gender == " men": t.click('(//span[contains(@class,"filter-item")])[1]') elif gender == " women": t.click('(//span[contains(@class,"filter-item")])[2]') t.wait(1) count = t.count('//a[@class ="product-card__link-overlay"]') # print('[nike]',count) details = [] if count != 0: for i in range(0, min(count, 3)): k = i + 1 name = t.read(f'(//a[@class = "product-card__link-overlay"])[{k}]') price = t.read(f'(//div[@data-test="product-price"])[{k}]') img = t.read( f'(//div[contains(@class, "product-card__hero")]/picture/img)[{k}]/@src' ) link = t.read(f'(//a[contains(@class,"product-card")])[{k}]/@href') # print('[nike]',name , price, img) details.append({ "email": email, "name": name, "price": price, "img": img, "Company": "Nike", "link": link }) else: details.append({ "email": email, "name": "NA", "price": "NA", "img": "NA", "Company": "Nike", "link": "NA" }) # print(details) return details
def check_availability(reservation_date, reservation_time, party_size,
                       restaurant_name):
    """Check table availability on chope.co.

    reservation_date: 'DD/MM/YYYY'
    reservation_time: 'HHMM' 24-hour string (e.g. '1930')
    party_size: option text for the adults dropdown
    restaurant_name: free-text restaurant name

    Returns 1 if a slot is available, 0 if not, or an apology string when the
    automation itself fails.
    """
    try:
        # --- convert user values into the widget's formats ---
        reservation_day = reservation_date.split('/')[0]
        reservation_month = reservation_date.split('/')[1]
        reservation_month = int(reservation_month) - 1  # datepicker months are 0-based
        reservation_year = reservation_date.split('/')[2]
        reservation_time_int = int(reservation_time)
        start_time_hr = reservation_time[:2]
        if reservation_time_int > 1159:
            if start_time_hr != "12":  # 12xx stays '12', 13xx+ becomes 1..11 pm
                start_time_hr = int(start_time_hr) - 12
            start_time_option = str(start_time_hr) + ":" + reservation_time[2:4] + " pm"
        else:
            start_time_option = str(start_time_hr) + ":" + reservation_time[2:4] + " am"
        # --- drive the booking form ---
        chope_url = 'https://www.chope.co/singapore-restaurants/category/restaurant/'
        t.init()
        t.url(chope_url)
        t.wait(10)
        # Date field: page forward month by month until the target day shows.
        t.click(f"(//span[contains(@class,'input-group-addon icon-calendar')])[1]")
        t.wait(7)
        boolean_flag = 1
        while boolean_flag:
            if t.present(f"//td[@data-handler='selectDay'and @data-year='{reservation_year}' and @data-month='{reservation_month}']/a[text()='{reservation_day}']"):
                t.click(f"//td[@data-handler='selectDay'and @data-year='{reservation_year}' and @data-month='{reservation_month}']/a[text()='{reservation_day}']")
                boolean_flag = 0
            else:
                t.click('//a[@title="Next"]')
                # NOTE(review): this fallback click omits @data-year — it may
                # select the same day/month in the wrong year; confirm.
                t.click(f"//td[@data-handler='selectDay'and @data-month='{reservation_month}']/a[text()='{reservation_day}']")
        # Time field
        t.select(f"//select[contains(@id,'time-field')]", start_time_option)
        # Number of diners field
        t.click(f"(//span[contains(@class,'input-group-addon icon-person')])[1]")
        t.select(f"//select[contains(@id,'adults')]", party_size)
        # Restaurant field
        t.type(f"//select[contains(@id,'sb-sel-restaurant')]", restaurant_name)
        t.click('//button[@id="btn-search"]')
        t.wait(5)
        if t.present(f"//div[@class='alert alert-danger']"):
            print('Not Available')
            return 0
        else:
            print('Available')
            return 1
    except Exception as e:
        # BUG FIX: was a bare `except:` returning a single-quoted string
        # literal broken across a physical line (a SyntaxError) with typos
        # ("Unforunately", "accomodate"); also surface the failure cause.
        print('Error:', e)
        return ('Reservation Unsuccessful. Unfortunately, the restaurant '
                'was not able to accommodate your reservation.')
def run():
    """Scrape the New York Times site registered in the news DB and store
    the summarized articles back via util.updateNews()."""
    conn = util.create_connection("./db/news.db")
    site = util.getSiteByName(conn, "New York Times")
    site_url = site[0][2]  # row layout: (id, ..., url) per getSiteByName
    site_id = site[0][0]
    t.init(visual_automation = True, chrome_browser = True)
    t.url(site_url)
    t.wait(10)
    df = catchContent()
    df = util.fixImgLink(df,"https://cf-templates-fghyux9ggb7t-ap-southeast-1.s3-ap-southeast-1.amazonaws.com/NewYorkTimes.png")
    df = util.fixSummary(df)
    t.wait(20)
    t.close()
    util.updateNews(conn, site_id, df)
def run():
    """Scrape the Today Online site registered in the news DB and store the
    summarized articles back via util.updateNews().

    NOTE(review): if this file also defines the NYT run() at module level,
    the later definition shadows the earlier one — confirm they live in
    separate modules.
    """
    conn = util.create_connection("./db/news.db")
    site = util.getSiteByName(conn, "Today Online")
    site_url = site[0][2]  # row layout: (id, ..., url)
    site_id = site[0][0]
    t.init(visual_automation=True, chrome_browser=True)
    t.url(site_url)
    t.wait(2)
    # Hover near the footer to trigger lazy loading before scraping.
    t.hover('//div[@class="container footer-main"]')
    t.wait(6)
    df = catchContent()
    t.wait(20)
    t.close()
    util.updateNews(conn, site_id, df)
def extract_global(date_stamp):
    """Scrape worldometers' global COVID counters and insert them into the DB.

    Returns whatever insert_db() returns for the assembled record.
    """
    data = {}
    data['date_stamp'] = date_stamp
    # World data
    data['country_name'] = 'Global'
    t.url('https://www.worldometers.info/coronavirus/')
    wait_for_pageload('//div[@class="maincounter-number"]')
    # The three main counters appear in a fixed order: cases, deaths, recovered.
    region_detail = {}
    counter_keys = ('total_cases', 'total_deaths', 'total_recovered')
    for position, key in enumerate(counter_keys, start=1):
        raw_value = t.read('(//div[@class="maincounter-number"])[' + str(position) + ']/span')
        region_detail[key] = int(raw_value.replace(',', ''))  # strip thousands separators
    data['conv_info_str'] = json.dumps(region_detail)
    return insert_db(data)
def rpa_process(from_date, phone_number, token):
    """Cancel the appointment on *from_date* via the test website, then
    remove the matching calendar slot."""
    t.init()
    t.url("https://sangam-test-website.herokuapp.com/cancel_input")
    util.wait_for_pageload('//button[@id="btnsubmit"]')
    # Identify the account by phone number.
    t.click('//input[@id="txtHandNo"]')
    t.type('//input[@name="txtHandNo"]', phone_number)
    t.click('//button[@id="btnsubmit"]')
    util.wait_for_pageload('//button[@id="btnsubmit"]')
    # Keep the datetime for the calendar call; the page shows DD/MM/YYYY.
    from_date_obj = from_date
    from_date = from_date.strftime("%d/%m/%Y")
    t.click('//label[contains(.,"' + str(from_date) + '")]')
    t.click('//button[@id="btnsubmit"]')
    t.close()
    cancel_appointment_slot(from_date_obj, token)
def flight_search(flight_request):
    """Run a Skyscanner search for one flight request, persist the deal, and
    export it.

    flight_request: dict with 'Request_ID' and 'Request_Details' (the search
    criteria consumed by fill_search / getFlightExcel).
    Returns the export file produced by dbf.export_FlightDeals().
    """
    search_dt = dt.today()  # NOTE(review): assumes `dt` is the datetime class
    request_id = flight_request['Request_ID']
    info = flight_request['Request_Details']
    t.init()
    t.url('https://www.skyscanner.com.sg/')
    tu.wait_for_pageload('//input[@id="fsc-trip-type-selector-return"]')
    fill_search(info)
    ind = 0
    flight_main = getFlightExcel(info, ind)
    t.wait(10.0)
    t.close()
    # Tag the scraped deal with its request and search timestamp.
    flight_main.update({
        'Request_ID': request_id,
        'Search_Datetime': search_dt
    })
    dbf.newFlightDeals(flight_main)
    outFile = dbf.export_FlightDeals(request_id, search_dt)
    return outFile
def get_count_values(page_num, url_prefix, today):
    """Count the items on listing page *page_num* and record the count.

    Returns the record file's name, or the literal string '今日无增量' when
    the newest item on the page is older than *today* (no new content).
    """
    t.url(url_prefix + str(page_num) + '.html')
    print("现在所在页面 {}".format(page_num))
    t.wait(5)
    # Each item is one <table> inside the two-column cell.
    count_values = t.count(element_identifier='//td[@colspan = "2"]//table')
    # today = '2018-04-24'
    # Dates are 'YYYY-MM-DD', so string comparison is chronological.
    newest_item_date = t.read(
        element_identifier=
        '//td[@colspan = "2"]//table[1]//span[@class = "hui12"]')
    if newest_item_date < today:
        return '今日无增量'
    print("页面有{}个文件".format(count_values))
    record_file = ('count_items_' + str(page_num) + '_'
                   + str(url_prefix.split('/')[-2]) + '.txt')
    with open(record_file, 'w', encoding='utf-8') as f:
        # Colon-separated record: page number and total item count.
        f.write('page:' + str(page_num) + ':' + str(count_values))
    return record_file
def get_news_using_crawler():
    """Crawl the WHO COVID news list, summarize up to 5 articles, and insert
    each into the DB; always closes the browser on exit.

    Returns the last insert_db() status (None if an exception was printed).
    """
    try:
        t.url(
            'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/news'
        )
        wait_for_pageload('//p[@class="heading text-underline"]')
        num_news = t.count('//p[@class="heading text-underline"]')
        if num_news > 5:  # cap the crawl at 5 articles
            num_news = 5
        delete_news_data_db()  # replace previous crawl's rows
        date_stamp = datetime.datetime.now(
            pytz.timezone('Singapore')).strftime('%Y-%m-%d')
        for n in range(1, num_news + 1):
            data = {}
            data['date_stamp'] = date_stamp
            news_link = t.read(
                f'(//p[@class="heading text-underline"])[{n}]/ancestor-or-self::a/@href'
            )
            data['news_link'] = news_link
            news_title = t.read(
                f'(//p[@class="heading text-underline"])[{n}]/ancestor-or-self::a/@aria-label'
            )
            data['news_title'] = news_title
            print('Article', n, ":", news_title)
            print('')
            news_summaries = SummarizeUrl(news_link)
            data['news_summary'] = str(news_summaries)
            print(news_summaries)
            status = insert_db(data)
        # NOTE(review): reconstructed placement — if no article was processed
        # (num_news == 0) `status` is unbound and this raises NameError.
        return status
    except Exception as e:
        print(e)
    finally:
        t.close()
# Sample script to search on Yahoo, take screenshot of results and visit DuckDuckgo # TagUI for Python's simple and powerful API makes digital process automation fun! # pip install tagui to install, pip install tagui --upgrade for latest version # to use in Jupyter notebook, Python script or interactive shell import tagui as t # use init() to start TagUI, it autoruns setup() to download TagUI # default init(visual_automation = False, chrome_browser = True) t.init() # use url('your_url') to go to web page, url() returns current URL t.url('https://ca.yahoo.com') # use type() to enter text into an UI element or x, y location # '[enter]' = enter key, '[clear]' = clear field t.type('search-box', 'github') # use read() to fetch and return text from UI element search_text = t.read('search-box') # use echo() to print to output, same as Python print() t.echo(search_text) # use click() to click on an UI element or x, y location # rclick() = right-click, dclick() = double-click t.click('search-button') # use wait() to wait for a number of seconds # default wait() is 5 seconds
def propertydata_update(project_name):
    """Re-scrape the PropertyGuru search for *project_name*, detect listings
    not yet in 'Property Monitor.xlsx', append them, download their PDFs and
    mail the subscriber. Returns early when nothing new is found.
    """
    df1 = pd.read_excel('Property Monitor.xlsx')
    t.close()
    t.init()
    project_url = f'https://www.propertyguru.com.sg/property-for-sale?market=residential&freetext={project_name}&newProject=all'
    print(project_url)
    t.url(project_url)
    wait_for_pageload('//div[@class="header-wrapper"]')
    num_result_ad = 3
    # load main page, get detail page url link
    url = [''] * num_result_ad
    id = [''] * num_result_ad  # NOTE(review): shadows the builtin `id`
    for n in [x for x in range(1, num_result_ad + 1)
              if x != 4 and x != 8]:  # skip 4th and 8th advertisement
        wait_for_pageload(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        url[n - 1] = read_if_present(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        print(f"{n}. url = " + url[n - 1])
        id[n - 1] = read_if_present(
            f'(//*[@id="wrapper-inner"]/section[1]/div[2]/div[1]/div[2]/div[2]/section/div[2]/div[{n}]/@data-listing-id)'
        )
    print(f'searching: {id}')  # ['22036842', '21725956', '20648962']
    # Compare scraped listing ids against the spreadsheet (stored as ints).
    id_int = list(df1['id'])
    id_str = list()
    for n in id_int:
        id_str.append(str(n))
    print(id_str)
    new_url = list()
    for n in id:
        if n not in id_str:
            print(f'new property found: {n}')
            u = f"https://www.propertyguru.com.sg/listing/{n}/for-sale-{project_name}"
            new_url.append(u)
    if new_url == []:
        return print(f'======== no new property found! ==========')
    print(f'======== new property found==========')
    property_title = [''] * len(new_url)
    pdf = [''] * len(new_url)
    pdf_link = [''] * len(new_url)
    # Visit each new listing's page and pull title and brochure link.
    for (n, i) in zip(new_url, range(1, len(new_url) + 1)):
        t.url(n)
        wait_for_pageload('//h1[@class="h2"]')
        property_title[i - 1] = read_if_present('//h1[@class="h2"]')
        print(f"{i}. property_title = " + property_title[i - 1])
        pdf[i - 1] = read_if_present(
            '//*[@id="sticky-right-col"]/div[3]/a[2]/@href')
        pdf_link[i - 1] = 'https://www.propertyguru.com.sg' + pdf[i - 1]
        print(f"{i}. pdf_link = " + pdf_link[i - 1])
    # NOTE(review): 'url' and 'id' have num_result_ad entries while
    # 'property_title'/'pdf_link' have len(new_url) — if those differ,
    # DataFrame construction raises a length-mismatch ValueError; confirm.
    property_info = {
        'property_title': property_title,
        'url': ['https://www.propertyguru.com.sg' + x for x in url],
        'id': id,
        'pdf_link': pdf_link,
    }
    df2 = DataFrame(property_info,
                    columns=['property_title', 'id', 'url', 'pdf_link'])
    new_df = pd.concat([df1, df2])
    new_df.to_excel('Property Monitor.xlsx', encoding='utf8', index=None)
    print('======== Property Monitor.xlsx update ==========')
    pdf_filename = download_pdf(property_title, pdf_link, id)
    # NOTE(review): input_email / input_name are module-level globals — verify.
    mail_subscription(input_email, input_name, pdf_filename)
# Minimal TagUI smoke test: start the browser and open Baidu.
import tagui as t

t.init()
t.url('https://www.baidu.com')
def get_shoe(shoe_name, g, email):
    """
    Get shoe details from jdsports.com.sg

    :param shoe_name: name of the shoe to search for
    :param g: gender of the subscriber (' men' / ' women', leading space)
    :param email: email id of the subscriber
    :return: details, list of up to 3 shoe dicts
             (email, name, price, img, Company, link); a single "NA"
             placeholder row when nothing is found.
    """
    details = []
    t.init(visual_automation=True)
    t.url('https://www.jdsports.com.sg/')
    t.wait(5)
    # '[enter]' submits the search box.
    final_command = shoe_name + " shoes" + '[enter]'
    t.keyboard('[esc]')  # dismiss any popup overlay first
    t.type('//input[@id = "srchInput"]', final_command)
    #t.click('//input[@id ="srchButton"]')
    t.wait(3)
    if g == ' men':
        # Apply the "Men" category filter when present.
        if t.read(
                '(//a[@data-e2e="plp-filterMenu-catItem"]/span)[contains(.,"Men")]'
        ):
            t.click('(//a[@data-e2e="plp-filterMenu-catItem"]/span)[1]')
        count = t.count(
            '//ul[@id="productListMain"]//li[@class="productListItem "]')
        t.wait(3)
        if count != 0:
            # range(1, min(count, 4)) caps the scrape at 3 products.
            for i in range(1, min(count, 4)):
                price = t.read(f'(//span[@class="pri"])[{i}]')
                name = t.read(f'(//span[@class="itemTitle"])[{i}]')
                img = t.read(
                    f'(//a[@class="itemImage"]/picture/img/@srcset)[{i}]')
                link = "https://www.jdsports.com.sg" + t.read(
                    f'(//span[@class = "itemTitle"])[{i}]/a/@href')
                details.append({
                    "email": email,
                    "name": name,
                    "price": price,
                    "img": img,
                    "Company": "JD",
                    "link": link
                })
        else:
            details.append({
                "email": email,
                "name": "NA",
                "price": "NA",
                "img": "NA",
                "Company": "JD",
                "link": "NA"
            })
    elif g == ' women':
        # Apply the "Women" category filter when present.
        if t.read(
                '(//a[@data-e2e="plp-filterMenu-catItem"]/span)[contains(.,"Women")]'
        ):
            t.click(
                '(//a[@data-e2e="plp-filterMenu-catItem"]/span)[.="Women"]')
        count = t.count(
            '//ul[@id="productListMain"]//li[@class="productListItem "]')
        t.wait(3)
        if count != 0:
            for i in range(1, min(count, 4)):
                price = t.read(f'(//span[@class="pri"])[{i}]')
                name = t.read(f'(//span[@class="itemTitle"])[{i}]')
                img = t.read(
                    f'(//a[@class="itemImage"]/picture/img/@srcset)[{i}]')
                link = "https://www.jdsports.com.sg" + t.read(
                    f'(//span[@class = "itemTitle"])[{i}]/a/@href')
                details.append({
                    "email": email,
                    "name": name,
                    "price": price,
                    "img": img,
                    "Company": "JD",
                    "link": link
                })
        else:
            details.append({
                "email": email,
                "name": "NA",
                "price": "NA",
                "img": "NA",
                "Company": "JD",
                "link": "NA"
            })
    else:
        # No gender filter: scrape the unfiltered result list.
        count = t.count(
            '//ul[@id="productListMain"]//li[@class="productListItem "]')
        t.wait(3)
        if count != 0:
            for i in range(1, min(count, 4)):
                price = t.read(f'(//span[@class="pri"])[{i}]')
                name = t.read(f'(//span[@class="itemTitle"])[{i}]')
                img = t.read(
                    f'(//a[@class="itemImage"]/picture/img/@srcset)[{i}]')
                link = "https://www.jdsports.com.sg" + t.read(
                    f'(//span[@class = "itemTitle"])[{i}]/a/@href')
                details.append({
                    "email": email,
                    "name": name,
                    "price": price,
                    "img": img,
                    "Company": "JD",
                    "link": link
                })
        else:
            details.append({
                "email": email,
                "name": "NA",
                "price": "NA",
                "img": "NA",
                "Company": "JD",
                "link": "NA"
            })
    #t.close()
    # Safety net: guarantee at least one row for downstream consumers.
    if len(details) == 0:
        details.append({
            "email": email,
            "name": "NA",
            "price": "NA",
            "img": "NA",
            "Company": "JD",
            "link": "NA"
        })
    # print("JD BOT",details)
    return details
def getExpFlightPrice(airline, dep_ref, dur_ref):
    """On an Expedia result page, filter by airline and select the itinerary
    matching the reference departure times/durations.

    airline: list of airline codes, one per leg.
    dep_ref: departure-time strings, one per leg (leading zero stripped in place).
    dur_ref: duration strings, one per leg (normalized to '..h ..m' in place).
    Its length selects the mode: 1 = one-way, 2 = return, >=3 = multi-city.
    Returns (price, url) on success, (0, '') on any mismatch.
    NOTE: dep_ref and dur_ref are mutated by this function.
    """
    print(airline)
    print(dep_ref)
    print(dur_ref)
    util.wait_for_pageload('//input[@classes="filter-checkbox"]')
    t.wait(3)
    t.click(f'//a[@data-content-id="airlineToggleContainer"]')
    for i in range(len(dep_ref)):
        # Tick each distinct airline's filter checkbox; bail out if absent.
        if i == 0:
            if t.present(f'//input[@id="airlineRowContainer_{airline[i]}"]'):
                t.wait(3)
                t.click(f'//input[@id="airlineRowContainer_{airline[i]}"]')
            else:
                print('Not match')
                return 0, ''
        elif airline[i] != airline[i-1]:
            if t.present(f'//input[@id="airlineRowContainer_{airline[i]}"]'):
                t.wait(1)
                t.click(f'//input[@id="airlineRowContainer_{airline[i]}"]')
            else:
                print('Not match')
                return 0, ''
        # Normalize the reference values to Expedia's display format.
        if dep_ref[i][0] == '0':
            dep_ref[i] = dep_ref[i][1:]  # drop leading zero from the hour
        if dur_ref[i][-1:] == 'h':
            dur_ref[i] = dur_ref[i] + ' 0m'  # whole hours show as 'Nh 0m'
        else:
            dur_ref[i] = dur_ref[i] + 'm'
    print(airline)
    print(dep_ref)
    print(dur_ref)
    util.wait_for_pageload('//button[@data-test-id="select-button"]')
    t.wait(5)
    for i in range(t.count(f'//ul[@id="flightModuleList"]//li')):
        i = i + 1  # XPath positions are 1-based
        print(i)
        dep = t.read(f'(//span[@class="medium-bold"]//span[@data-test-id="departure-time"])[{i}]')
        if len(dur_ref) == 1:
            # --- one-way trip ---
            if dep == dep_ref[0]:
                print('dep OK')
                dur = t.read(f'(//span[@data-test-id="duration"])[{i}]')  # NOTE(review): read but never compared
                t.click(f'(//button[@data-test-id="select-button"])[{i}]')
                t.wait(5)
                if t.present('//a[@id="forcedChoiceNoThanks"]'):
                    t.click(f'//a[@id="forcedChoiceNoThanks"]')
                    t.wait(5)
                # Poll up to 5 times for the flight-information popup.
                for x in range(5):
                    print(x)
                    if t.popup('Flight-Information?'):
                        break
                    else:
                        t.wait(5)
                price = t.read(f'(//span[@class="packagePriceTotal"])[2]')
                price = float(price.replace(',', '').replace('SG', '').replace('$', '').replace(' ', ''))
                print(price)
                url = t.url()
                return price, url
            else:
                # NOTE(review): returns on the FIRST non-matching row, so later
                # rows are never examined — confirm this is intended.
                return 0, ''
        elif len(dur_ref) == 2:
            # --- return trip: pick the outbound, then scan the return list ---
            print('trip', len(dur_ref))
            if dep == dep_ref[0]:
                print('dep OK')
                dur = t.read(f'(//span[@data-test-id="duration"])[{i}]')
                t.click(f'(//button[@data-test-id="select-button"])[{i}]')
                t.wait(5)
                util.wait_for_pageload('//button[@data-test-id="select-button"]')
                t.click(f'//input[@id="airlineRowContainer_{airline[1]}"]')
                t.wait(2)
                for j in range(t.count(f'//ul[@id="flightModuleList"]//li')):
                    j = j + 1
                    print(j)
                    # '[{j}+1]' renders as e.g. '[3+1]' — valid XPath arithmetic,
                    # effectively offsetting the position by one.
                    dep = t.read(f'(//span[@data-test-id="departure-time"])[{j}+1]')
                    if dep == dep_ref[1]:
                        print('return dep ok')
                        dur = t.read(f'(//span[@data-test-id="duration"])[{j}+1]')
                        if dur == dur_ref[1]:
                            t.click(f'(//button[@data-test-id="select-button"])[{j}]')
                            t.wait(5)
                            if t.present('//a[@id="forcedChoiceNoThanks"]'):
                                t.click(f'//a[@id="forcedChoiceNoThanks"]')
                                t.wait(5)
                            for x in range(5):
                                print(x)
                                if t.popup('Flight-Information?'):
                                    break
                                else:
                                    t.wait(5)
                            util.wait_for_pageload('//h1[@class="section-header-main"]')
                            price = t.read(f'(//span[@class="packagePriceTotal"])[2]')
                            price = float(price.replace(',', '').replace('SG', '').replace('$', '').replace(' ', ''))
                            print(price)
                            url = t.url()
                            print(url)
                            return price, url
            else:
                return 0, ''
        elif len(dur_ref) >= 3:
            # --- multi-city trip: compare all legs of row block i at once ---
            dep_lst = []
            dur_lst = []
            print('multi-trip ', len(dur_ref))
            for k in range(len(dur_ref)):
                dep_lst.append(t.read(f'(//span[@data-test-id="departure-time"])[{3*i+k+1}]'))
                dur_lst.append(t.read(f'(//span[@data-test-id="duration"])[{3*i+k+1}]'))
            print(dep_lst)
            print(dep_ref)
            if dep_lst == dep_ref:
                print(dur_lst)
                print(dur_ref)
                if dur_lst == dur_ref:
                    # NOTE(review): 'j' is undefined in this branch — reaching
                    # this line raises NameError; it likely should be 'i'.
                    t.click(f'(//button[@data-test-id="select-button"])[{j}]')
                    t.wait(5)
                    if t.present('//a[@id="forcedChoiceNoThanks"]'):
                        t.click(f'//a[@id="forcedChoiceNoThanks"]')
                        t.wait(5)
                    for x in range(5):
                        print(x)
                        if t.popup('Flight-Information?'):
                            break
                        else:
                            t.wait(5)
                    price = t.read(f'(//span[@class="packagePriceTotal"])[2]')
                    price = float(price.replace(',', '').replace('SG', '').replace('$', '').replace(' ', ''))
                    print(price)
                    url = t.url()
                    print(url)
                    return price, url
            else:
                return 0, ''
# Smoke test: verify TagUI can start a browser session and load a page.
import tagui as t

t.init()
t.url('https://google.com')
print("test_tagui ok!")
def read_text_content(content_url, file_name, page_num, i, time, url_prefix):
    """Open the article page at *content_url*, save its text to *file_name*,
    and download any attachments linked inside the '#zoom' body, renaming
    each with the article date *time*.

    page_num / i identify the listing position for failure-log entries.
    """
    # Open the page (some scraped URLs carry a duplicated prefix; strip it).
    if 'cnhttp' in content_url:
        content_url = content_url[21:]  # cause of the duplication unknown
        t.url(content_url)  # slow to load
    else:
        t.url(content_url)  # slow to load
    # Count attachment links; also derive the expected file names below.
    t.wait(2)
    pdf_count = t.count(element_identifier='//div[@id = "zoom"]//a/@href')
    if pdf_count == 0:
        # Plain text article, no attachments.
        print("文件{} 是文档。".format(i))
        # Extract the text body.
        if t.read(element_identifier='//div[@id = "zoom"]') != '':
            text = t.read(element_identifier='//div[@id = "zoom"]')
            try:
                with open(file_name, 'w', encoding='utf-8') as f:
                    f.write(text)
            except:
                # Fallback for a file name the OS rejects (observed case).
                with open('实施《全国企业兼并破产和职工再就业工作计划》银行呆、坏帐准备金核销办法_1997-10-01.txt',
                          'w', encoding='utf-8') as f:
                    f.write(text)
        elif t.read(element_identifier='//td[@class = "p1"]') != '':
            text = t.read(element_identifier='//td[@class = "p1"]')
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
        else:
            # Neither known text container found — log and move on.
            with open('wrong_log.txt', 'a', encoding='utf-8') as f:
                string = 'page {} doc {} didnt write in '.format(page_num, i)
                f.write(string)
                f.write("\n")
                print("write files fails...")
    else:
        # Article with attachments: save the text first, then fetch each link.
        if t.read(element_identifier='//div[@id = "zoom"]') != '':
            text = t.read(element_identifier='//div[@id = "zoom"]')
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
        elif t.read(element_identifier='//td[@class = "p1"]') != '':
            text = t.read(element_identifier='//td[@class = "p1"]')
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
        else:
            with open('wrong_log.txt', 'a', encoding='utf-8') as f:
                string = 'page {} doc {} didnt write in '.format(page_num, i)
                f.write(string)
                f.write("\n")
                print("write files fails...")
        print("文件{} 含有 {} 个文件要下载。".format(i, pdf_count))
        pdf_count += 1  # Python ranges start at 0, so ensure at least one pass
        current_count = 0
        for j in range(1, pdf_count):
            # Attachment vs. embedded web page.
            if '.htm' not in t.read(
                    element_identifier='//div[@id = "zoom"]//p//a/@href'):
                print("当前是第{}个文件。。".format(j))
                p_count = t.count(element_identifier='//div[@id = "zoom"]//p')
                # Scan paragraphs from the end until one contains a link.
                while current_count <= p_count:
                    try:
                        if t.read(element_identifier=
                                  '//div[@id = "zoom"]//p[last()-'
                                  + str(current_count) + ']//a') != '':
                            print("这个p有!")
                            pdf_name = t.read(
                                element_identifier=
                                '//div[@id = "zoom"]//p[last()-'
                                + str(current_count) + ']//a/@href')
                            # Human-readable name shown as the link's text.
                            pdf_name_to_change = t.read(
                                element_identifier=
                                '//div[@id = "zoom"]//p[last()-'
                                + str(current_count) + ']//a')
                            # Download.
                            suffix = pdf_name.split('.')[-1]
                            pdf_name = pdf_name.split('/')[-1]
                            prefix = 'http://www.pbc.gov.cn'
                            download_link = prefix + t.read(
                                element_identifier=
                                '//div[@id = "zoom"]//p[last()-'
                                + str(current_count) + ']//a/@href')
                            if 'cnhttp' in download_link:
                                # href was already absolute — use it as-is.
                                t.url(
                                    t.read(element_identifier=
                                           '//div[@id = "zoom"]//p[last()-'
                                           + str(current_count) + ']//a/@href'))  # slow to load
                            else:
                                t.url(download_link)  # slow to load
                            # Poll for the download under either name.
                            wait_seconds = 1
                            total_seconds = 0
                            while os.path.exists(pdf_name) == False:
                                t.wait(wait_seconds)
                                total_seconds += wait_seconds
                                if os.path.exists(pdf_name_to_change):
                                    break
                                if total_seconds > MAX_WAIT:
                                    print('download fails')
                                    with open('download_log.txt', 'a',
                                              encoding='utf-8') as f:
                                        string = 'page {} doc {} file {} didnt download '.format(
                                            page_num, i, j)
                                        f.write(string)
                                        f.write("\n")
                                    break
                            if os.path.exists(pdf_name_to_change):
                                pass
                            else:
                                os.rename(pdf_name, pdf_name_to_change)  # rename to the link text
                            # Append '_<date>' before the extension.
                            # NOTE(review): placement reconstructed from a
                            # collapsed line — confirm it runs in both cases.
                            os.rename(
                                pdf_name_to_change,
                                pdf_name_to_change[:-(len(suffix) + 1)] + '_'
                                + time
                                + pdf_name_to_change[-(len(suffix) + 1):])
                            t.url(content_url)  # back to the article page (slow)
                            current_count += 1
                            break
                        else:
                            current_count += 1
                            print("这个p没有")
                    except:
                        print('some error occurs, nvm')
                        continue
            else:
                # Linked page is HTML — treat it as another text document.
                print("是个网页,当文档处理!")
                prefix = 'http://www.pbc.gov.cn'
                download_link = prefix + t.read(
                    element_identifier='//div[@id = "zoom"]//p[' + str(j)
                    + ']//a/@href')
                if 'cnhttp' in download_link:
                    t.url(
                        t.read(element_identifier='//div[@id = "zoom"]//p['
                               + str(j) + ']//a/@href'))  # slow to load
                else:
                    t.url(download_link)  # slow to load
                # Extract and save the linked page's text.
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w', encoding='utf-8') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w', encoding='utf-8') as f:
                        f.write(text)
                else:
                    with open('wrong_log.txt', 'a', encoding='utf-8') as f:
                        string = 'page {} doc {} didnt write in '.format(
                            page_num, i)
                        f.write(string)
                        f.write("\n")
                        print("write files fails...")
def login(self):
    """Log into Stack Overflow with this object's credentials, then open Gmail."""
    login_stackoverflow(self.account, self.password)
    t.url('https://www.gmail.com')
    return
import tagui as t

# Contract/account numbers to process, one browser session each.
login = ['7038157994']
cont = len(login)
aux = 0
while aux != cont:
    t.init()
    t.url(
        'http://servicos.coelba.com.br/servicos-ao-cliente/Pages/login-av.aspx?UrlUc=http://servicos.coelba.com.br/servicos-ao-cliente/Pages/2-via-de-conta-coelba.aspx'
    )
    # Identifiers are raw ASP.NET control names (TagUI matches them directly).
    t.click(
        'ctl00$m$g_2d0a0930_51e9_4b08_addf_fccd4023f2e8$ctl00$txtContaContrato'
    )
    t.type(
        'ctl00$m$g_2d0a0930_51e9_4b08_addf_fccd4023f2e8$ctl00$txtContaContrato',
        login[aux])
    # NOTE(review): assumes the captcha is rendered as readable text — confirm.
    captcha = t.read('textCaptcha')
    t.click('ctl00$m$g_2d0a0930_51e9_4b08_addf_fccd4023f2e8$ctl00$txtCaptcha')
    t.type('ctl00$m$g_2d0a0930_51e9_4b08_addf_fccd4023f2e8$ctl00$txtCaptcha',
           captcha)
    t.click(
        'ctl00$m$g_2d0a0930_51e9_4b08_addf_fccd4023f2e8$ctl00$btnAutenticar')
    t.close()
    aux += 1
import tagui as t import datetime import pandas as pd import os import s3_function #超参数 try: str_to_append = str(datetime.datetime.today().date()) # 初始化页面 t.init() # 输入url进入 t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1") # 等15秒反应 t.wait(15) # 鼠标放上去,点击精简选项 t.hover(element_identifier='//*[@id="fxr"]') t.click(element_identifier='//*[@id="zksq"]') # 鼠标移动到发行日期上,点击文本栏,输入发行日日期为今日,点击搜索 t.hover(element_identifier='//*[@id="fxr"]') t.click(element_identifier='//*[@id="fxr"]') t.type(element_identifier='//*[@id="fxr"]', text_to_type=str_to_append) # 再点击,确保日期不会遮住底下的搜索按钮 t.click(element_identifier='//*[@id="fxr"]') t.hover(element_identifier='//*[@class="ipf01"]') t.click(element_identifier='//*[@class="ipf01"]') # 把展示的尺寸设置为50个产品每页: t.hover(element_identifier='//*[@data-pagesize="50"]') t.click(element_identifier='//*[@data-pagesize="50"]') # 当下一页没有被disable的时候,有以下超参数 page_curr = 1 # 当前页面index