def catchContent():
    number_to = t.count('(//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")])')
    df_to = pd.DataFrame(index=range(0, number_to),
                         columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])
    t.hover('//div[@class="container footer-main"]')
    t.wait(2)
    for n in range(1, number_to + 1):
        title = t.read('//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")][{}]//div[contains(@class, "article-listing_content")]//h2'.format(n))
        URL_o = t.read('//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")][{}]//@href'.format(n))
        URL = "https://www.todayonline.com" + str(URL_o)
        Img_link = t.read('//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")][{}]//img/@src'.format(n))
        df_to.iloc[n - 1, 0] = n
        df_to.iloc[n - 1, 1] = title
        df_to.iloc[n - 1, 2] = URL
        df_to.iloc[n - 1, 4] = Img_link
    # Drop rows whose image link is empty, then renumber.
    for i in range(0, df_to.shape[0]):
        if df_to.loc[i, 'Img_URL'] == "":
            df_to.loc[i, 'Img_URL'] = np.nan
    df_to.dropna(subset=['Img_URL'], inplace=True, how='any')
    df_to = df_to.reset_index(drop=True)
    df_to['Sno'] = df_to.index
    df_to = util.fixImgLink(df_to, "https://cf-templates-fghyux9ggb7t-ap-southeast-1.s3-ap-southeast-1.amazonaws.com/todayOnline.png")
    # Visit each article and summarise its body paragraphs.
    for n in range(0, df_to.shape[0]):
        t.url(df_to.URL[n])
        t.wait(4)
        t.hover('//div[@class="article-detail_subscription"]')
        t.wait(2)
        number_p = t.count('//div/p[not(@class)]')
        Content = ""
        for i in range(1, number_p - 2):
            cont = t.read('//div/p[not(@class)][{}]'.format(i))
            Content += cont
        summaries = Summarize(df_to.Title[n], Content)
        df_to.iloc[n, 3] = summaries[0]
    return df_to
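# A minimal setup sketch for the scraper snippets in this file; the exact
# imports are assumptions, not shown in the snippets themselves. `t` is
# TagUI's Python wrapper (PyPI package `tagui`, later renamed `rpa`),
# Summarize/SummarizeUrl come from a text-summariser package, and `util`
# is the project's own helper module (fixImgLink).
import tagui as t                               # assumed; newer releases: import rpa as t
import pandas as pd
import numpy as np
import util                                     # assumed project-local helpers
from summarize import Summarize, SummarizeUrl   # assumed summariser package

# Hypothetical driver: open the listing page, then scrape it.
t.init()
t.url('https://www.todayonline.com')
df = catchContent()
print(df.head())
t.close()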
def catchContent():
    number_bb = t.count('(//div[contains(@data-vr-zone, "Top Stories")]//span[contains(@class, "story-headline")])')
    df_bb = pd.DataFrame(index=range(0, number_bb - 2),
                         columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])
    for n in range(0, number_bb - 2):
        title = t.read('//div[contains(@data-vr-zone, "Top Stories {}")]//span[contains(@class, "story-headline")]'.format(n))
        URL_b = t.read('//div[contains(@data-vr-zone, "Top Stories {}")]//span[contains(@class, "story-headline")]//@href'.format(n))
        URL = "https://www.straitstimes.com/" + str(URL_b)
        Img_URL = t.read('//div[contains(@data-vr-zone, "Top Stories {}")]//span[contains(@class, "story-headline")]/ancestor::div[contains(@class, "body")]/..//img/@src'.format(n))
        summaries = SummarizeUrl(URL)
        df_bb.iloc[n, 0] = n
        df_bb.iloc[n, 1] = title
        df_bb.iloc[n, 2] = URL
        df_bb.iloc[n, 3] = summaries
        df_bb.iloc[n, 4] = Img_URL
    return df_bb
def multi_city_trip(enquiry): t.click('//input[@id="fsc-trip-type-selector-multi-destination"]') travel_dates = enquiry["dates"] numDep = len(travel_dates) cities = enquiry["city"] numCity = len(cities) form_flightleg = t.count( '//*[@id="flights-search-controls-root"]/div/div/form/div[2]/ol/li') if numDep < form_flightleg: for cnt in range(form_flightleg - numDep): t.click( f'//*[@id="flights-search-controls-root"]/div/div/form/div[2]/ol/li[{form_flightleg-cnt}]/div[4]/button' ) elif numDep > form_flightleg: for cnt in range(numDep - form_flightleg): t.click( '//div[starts-with(@class,"MulticityControls_MulticityControls__add-leg-wrapper__2arYh")]/button' ) t.wait(0.5) for num in range(0, numDep): start_date = dt.strptime(travel_dates[num], '%d/%m/%Y') start_month = start_date.strftime('%Y-%m') orig_city = cities[num] if numCity == numDep: if num < numDep - 1: dest_city = cities[num + 1] else: dest_city = cities[0] else: dest_city = cities[num + 1] t.type(f'//input[@id="fsc-origin-search-{num}"]', orig_city) t.wait(0.5) t.type(f'//input[@id="fsc-destination-search-{num}"]', dest_city) t.wait(0.5) t.click( f'//button[@id="fsc-leg-date-{num}-fsc-datepicker-button"]//span[starts-with(@class,"DateInput")]' ) t.click( f'//select[@id="fsc-leg-date-{num}-calendar__bpk_calendar_nav_select"]' ) t.select( f'//select[@id="fsc-leg-date-{num}-calendar__bpk_calendar_nav_select"]', f'{start_month}') t.click( f'//button[starts-with(@class,"BpkCalendarDate") and contains(@aria-label,"{start_date.strftime("%d %B %Y").lstrip("0")}")]' ) t.click('//button[starts-with(@id,"CabinClassTravellersSelector")]') t.click('//select[@id="search-controls-cabin-class-dropdown"]') t.select('//select[@id="search-controls-cabin-class-dropdown"]', lookup_cabin_class(enquiry["cabin_class"])) adult_pax = int(enquiry['adult']) child_pax = len(enquiry['child_age']) child_age = enquiry['child_age'] number_of_travellers(adult_pax, child_pax, child_age) t.click('//button[@type="submit"][@aria-label="Search flights"]')
def get_count_values(page_num, url_prefix):
    t.url(url_prefix + str(page_num) + '.html')
    print("Current page: {}".format(page_num))
    t.wait(5)
    # Count the items on this page.
    count_values = t.count(element_identifier='//td[@colspan = "2"]//table')
    print("This page has {} files".format(count_values))
    out_name = 'count_items_' + str(page_num) + '_' + str(url_prefix.split('/')[-2]) + '.txt'
    # Record the current page and its item count, ':'-separated.
    with open(out_name, 'w', encoding='utf-8') as f:
        f.write('page:' + str(page_num) + ':' + str(count_values))
    return out_name
def login_stackoverflow(account, password):
    t.url('https://stackoverflow.com/users/login')
    click('//button[@data-provider="google"]')
    # If the Google account chooser lists this account, pick it; otherwise
    # pick the last entry ("Use another account") and type the email.
    if wait_element('//div[@data-identifier="{}"]'.format(account)):
        click('//div[@data-identifier="{}"]'.format(account))
    else:
        c = t.count('//div[@jsslot=""]//li')
        click('(//div[@jsslot=""]//li)[{}]'.format(c))
        type_into('//*[@type="email"]', account + '[enter]')
    type_into('//*[@name="password"]', password + '[enter]')
    return
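# The snippet above calls click/wait_element/type_into wrappers that are not
# defined here. A minimal sketch of what they might look like on top of the
# TagUI calls used elsewhere in this file (the names and the retry count are
# assumptions, not the original project's code):
def wait_element(xpath, retries=10):
    # Poll for an element instead of failing immediately.
    for _ in range(retries):
        if t.present(xpath):
            return True
        t.wait(1)
    return False

def click(xpath):
    return t.click(xpath)

def type_into(xpath, text):
    return t.type(xpath, text)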
def multi_city_trip(enquiry): t.click('//input[@id="flight-type-multi-dest-hp-flight"]') travel_dates = enquiry["dates"] numDep = len(travel_dates) cities = enquiry["city"] numCity = len(cities) form_flightleg = (t.count( '//div[@class="cols-nested gcw-multidest-flights-container"]/div/fieldset' )) print(form_flightleg) if numDep < form_flightleg: for cnt in range(form_flightleg - numDep): t.click( f'//*[@id="flightlegs-list-fieldset-{form_flightleg-cnt}-hp-flight"]/div/a' ) elif numDep > form_flightleg: for cnt in range(numDep - form_flightleg): t.click('//a[@id="add-flight-leg-hp-flight"]') t.wait(0.5) t.type('//input[@id="flight-origin-hp-flight"]', cities[0]) t.type('//input[@id="flight-destination-hp-flight"]', cities[1]) t.type('//input[@id="flight-departing-single-hp-flight"]', '[clear]') t.type('//input[@id="flight-departing-single-hp-flight"]', (dt.strptime(travel_dates[0], '%d/%m/%Y')).strftime("%d/%m/%Y")) for num in range(1, numDep): print(f"num:{num} and form_flightleg:{form_flightleg}") start_date = dt.strptime(travel_dates[num], '%d/%m/%Y') orig_city = cities[num] if numCity == numDep: if num < numDep - 1: dest_city = cities[num + 1] else: dest_city = cities[0] else: dest_city = cities[num + 1] t.type(f'//input[@id="flight-{num+1}-origin-hp-flight"]', orig_city) t.wait(0.5) t.type(f'//input[@id="flight-{num+1}-destination-hp-flight"]', dest_city) t.wait(0.5) t.type(f'//input[@id="flight-{num+1}-departing-hp-flight"]', '[clear]') t.type(f'//input[@id="flight-{num+1}-departing-hp-flight"]', start_date.strftime("%d/%m/%Y")) t.click('//a[@id="flight-advanced-options-hp-flight"]') t.select('//select[@id="flight-advanced-preferred-class-hp-flight"]', lookup_cabin_class(enquiry["cabin_class"])) t.click('//*[@id="gcw-flights-form-hp-flight"]/div[8]/label/button')
def get_shoe(shoe, g, email):
    gender = g
    # print('[nike]', gender)
    t.init(visual_automation=True)
    t.url('https://www.nike.com/sg/')
    t.type('//input[@id = "TypeaheadSearchInput"]', shoe + " shoes")
    t.click('//button[@class = "btn-search z2 bg-transparent"]')
    t.wait(3)
    if gender == " men":
        t.click('(//span[contains(@class,"filter-item")])[1]')
    elif gender == " women":
        t.click('(//span[contains(@class,"filter-item")])[2]')
    t.wait(1)
    count = t.count('//a[@class ="product-card__link-overlay"]')
    # print('[nike]', count)
    details = []
    if count != 0:
        for i in range(0, min(count, 3)):
            k = i + 1
            name = t.read(f'(//a[@class = "product-card__link-overlay"])[{k}]')
            price = t.read(f'(//div[@data-test="product-price"])[{k}]')
            img = t.read(f'(//div[contains(@class, "product-card__hero")]/picture/img)[{k}]/@src')
            link = t.read(f'(//a[contains(@class,"product-card")])[{k}]/@href')
            # print('[nike]', name, price, img)
            details.append({
                "email": email,
                "name": name,
                "price": price,
                "img": img,
                "Company": "Nike",
                "link": link
            })
    else:
        details.append({
            "email": email,
            "name": "NA",
            "price": "NA",
            "img": "NA",
            "Company": "Nike",
            "link": "NA"
        })
    # print(details)
    return details
def get_count_values(page_num, url_prefix, today):
    t.url(url_prefix + str(page_num) + '.html')
    print("Current page: {}".format(page_num))
    t.wait(5)
    # Count the items on this page.
    count_values = t.count(element_identifier='//td[@colspan = "2"]//table')
    # String comparison works because both sides are 'YYYY-MM-DD',
    # e.g. today = '2018-04-24'.
    if t.read(element_identifier='//td[@colspan = "2"]//table[1]//span[@class = "hui12"]') < today:
        return 'No increment today'
    print("This page has {} files".format(count_values))
    out_name = 'count_items_' + str(page_num) + '_' + str(url_prefix.split('/')[-2]) + '.txt'
    # Record the current page and its item count, ':'-separated.
    with open(out_name, 'w', encoding='utf-8') as f:
        f.write('page:' + str(page_num) + ':' + str(count_values))
    return out_name
def extract_all_countries(date_stamp, status):
    # The table lists each country twice: the first half is for today,
    # the second half is for yesterday.
    num_country = int(t.count('(//a[@class="mt_a"])') / 2)
    for n in range(1, num_country + 1):
        data = {}
        region_detail = {}
        data['date_stamp'] = date_stamp
        country_row_xpath = f'(//a[@class="mt_a"])[{n}]'
        country_total_cases_xpath = country_row_xpath + '/../following-sibling::td[1]'
        country_new_cases_xpath = country_row_xpath + '/../following-sibling::td[2]'
        country_total_deaths_xpath = country_row_xpath + '/../following-sibling::td[3]'
        country_new_deaths_xpath = country_row_xpath + '/../following-sibling::td[4]'
        country_total_recovered_xpath = country_row_xpath + '/../following-sibling::td[5]'
        country_active_cases_xpath = country_row_xpath + '/../following-sibling::td[6]'
        country_serious_cases_xpath = country_row_xpath + '/../following-sibling::td[7]'
        region_detail['total_cases'] = convert_extracted_numbers(t.read(country_total_cases_xpath))
        region_detail['new_cases'] = convert_extracted_numbers(t.read(country_new_cases_xpath))
        region_detail['total_deaths'] = convert_extracted_numbers(t.read(country_total_deaths_xpath))
        region_detail['new_deaths'] = convert_extracted_numbers(t.read(country_new_deaths_xpath))
        region_detail['total_recovered'] = convert_extracted_numbers(t.read(country_total_recovered_xpath))
        region_detail['active_cases'] = convert_extracted_numbers(t.read(country_active_cases_xpath))
        region_detail['serious_cases'] = convert_extracted_numbers(t.read(country_serious_cases_xpath))
        conv_info_str = json.dumps(region_detail)
        data['conv_info_str'] = conv_info_str
        country_name = t.read(country_row_xpath)
        data['country_name'] = country_name
        status = insert_db(data)
    return status
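# convert_extracted_numbers is not defined alongside this snippet. A
# plausible sketch, assuming Worldometers-style cells such as '1,234',
# '+56' or blank/'N/A' (the original project's handling may differ):
def convert_extracted_numbers(text):
    # Normalise a scraped cell to an int, defaulting to 0 when empty.
    cleaned = text.replace(',', '').replace('+', '').strip()
    if cleaned in ('', 'N/A'):
        return 0
    return int(cleaned)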
def get_news_using_crawler():
    try:
        t.url('https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/news')
        wait_for_pageload('//p[@class="heading text-underline"]')
        num_news = t.count('//p[@class="heading text-underline"]')
        if num_news > 5:
            num_news = 5
        delete_news_data_db()
        date_stamp = datetime.datetime.now(pytz.timezone('Singapore')).strftime('%Y-%m-%d')
        for n in range(1, num_news + 1):
            data = {}
            data['date_stamp'] = date_stamp
            news_link = t.read(f'(//p[@class="heading text-underline"])[{n}]/ancestor-or-self::a/@href')
            data['news_link'] = news_link
            news_title = t.read(f'(//p[@class="heading text-underline"])[{n}]/ancestor-or-self::a/@aria-label')
            data['news_title'] = news_title
            print('Article', n, ":", news_title)
            print('')
            news_summaries = SummarizeUrl(news_link)
            data['news_summary'] = str(news_summaries)
            print(news_summaries)
            status = insert_db(data)
        return status
    except Exception as e:
        print(e)
    finally:
        t.close()
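# insert_db and delete_news_data_db are not defined in these snippets. A
# minimal sqlite3 sketch of what they might do; the db file, table name and
# columns are assumptions, not the original project's schema:
import sqlite3

def insert_db(data):
    # Store one scraped record; data is a dict of column -> value.
    conn = sqlite3.connect('news.db')
    conn.execute('CREATE TABLE IF NOT EXISTS news '
                 '(date_stamp TEXT, news_link TEXT, news_title TEXT, news_summary TEXT)')
    conn.execute('INSERT INTO news VALUES (?, ?, ?, ?)',
                 (data.get('date_stamp'), data.get('news_link'),
                  data.get('news_title'), data.get('news_summary')))
    conn.commit()
    conn.close()
    return 'ok'

def delete_news_data_db():
    # Clear the previous run's rows before inserting today's.
    conn = sqlite3.connect('news.db')
    conn.execute('CREATE TABLE IF NOT EXISTS news '
                 '(date_stamp TEXT, news_link TEXT, news_title TEXT, news_summary TEXT)')
    conn.execute('DELETE FROM news')
    conn.commit()
    conn.close()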
def catchContent():
    number = t.count('(//li[contains(@class, "css-1iski2w")]/a)')
    df = pd.DataFrame(index=range(0, number),
                      columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])
    for n in range(1, number + 1):
        title = t.read('//li[contains(@class, "css-1iski2w")][{}]/a/div'.format(n))
        URL = t.read('//li[contains(@class, "css-1iski2w")][{}]//@href'.format(n))
        Img_link = t.read('//li[contains(@class, "css-1iski2w")][{}]//img/@src'.format(n))
        summaries = SummarizeUrl(URL)
        df.iloc[n - 1, 0] = n
        df.iloc[n - 1, 1] = title
        df.iloc[n - 1, 2] = URL
        df.iloc[n - 1, 3] = summaries
        df.iloc[n - 1, 4] = Img_link
    # Drop articles that could not be summarised, then renumber.
    df['Summary'].replace('None', np.nan, inplace=True)
    df.dropna(subset=['Summary'], inplace=True, how='any')
    df = df.reset_index(drop=True)
    df['Sno'] = df.index
    return df
# Columns: No., product name, issuing bank, entrusted currency, issue date,
# end-of-sale date, tenor (days), expected yield, realised yield,
# vs. same-tenor savings, overall rating, url.
name_list = ['序号', '产品名称', '发行银行', '委托货币', '发行日', '停售日',
             '管理期(天)', '预期收益率', '到期收益率', '与同期储蓄比', '综合评级', 'url']
for col_name in name_list:
    value_dict.setdefault(col_name, [])  # initialise empty dataset
# Loop while the page can still be turned, or when there is only one page.
stop_flag = False
while (t.read(element_identifier='//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]') == str(page_curr)) or (page_curr == 1):
    if stop_flag == True:  # no data left for this year, so stop paging
        break
    # Number of rows on this page (Python counts from 0).
    count_values = int(t.count(element_identifier='//tbody[@id = "content"]//tr')) + 1
    # Scrape the current page.
    for i in range(1, count_values):
        # Keep every row issued within this year (on or before Dec 31).
        if str(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//td[@class = "px"]')) <= date_end:
            # serial number
            value_dict[name_list[0]].append(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']/td[2]'))
            # product name
            value_dict[name_list[1]].append(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']/td[3]'))
            # issuing bank
            value_dict[name_list[2]].append(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']/td[4]'))
            # entrusted currency
print(f'\n-----start batch {batch_count}-----\n')
# start date
t.select('//select[@id="searchForm_selectedFromPeriodProjectName"]', START_DATE)
# end date
t.select('//select[@id="searchForm_selectedToPeriodProjectName"]', END_DATE)
# type of sale
t.click('//label[@for="checkbox1"]')
t.click('//label[@for="checkbox2"]')
t.click('//label[@for="checkbox3"]')
project_total = t.count('//div[@id="projectContainerBox"]/a')
# select projects
for _ in range(SELECTION_LIMIT):
    if project_count > project_total - 1:
        PROCEED = False
        break
    selected = t.read(f'//*[@id="addToProject_{project_count}"]')
    print(f'select {selected}')
    t.click(f'//*[@id="addToProject_{project_count}"]')
    logging.info(f'batch: {batch_count}, project: {selected}, id: {project_count}')
    project_count += 1
def compliance_data(url_prefix):
    t.init()
    init_url = url_prefix + '1.html'
    t.url(init_url)  # initial url
    # Maximum number of pages.
    max_page = int(t.read(element_identifier='//td[@class = "Normal"]').split('/')[1]) + 1
    for page_num in range(1, max_page):
        t.url(url_prefix + str(page_num) + '.html')
        print("Current page: {}".format(page_num))
        t.wait(5)
        # Number of items on this page.
        count_values = t.count(element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())  # e.g. '2018-04-24'
        if t.read(element_identifier='//td[@colspan = "2"]//table[1]//span[@class = "hui12"]') < today:
            print("No increment today")
            break
        print("This page has {} files".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')
            if t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//span[@class = "hui12"]') < today:
                t.close()
                exit(1)
            file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']') + str('.txt')
            prefix = 'http://www.pbc.gov.cn'
            content_url = prefix + t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
            if 'cnhttp' in content_url:
                content_url = content_url[21:]  # href was already absolute; not sure why this happens
                t.url(content_url)
                text = t.read(element_identifier='//div[@id = "zoom"]')
                with open(file_name, 'w') as f:
                    f.write(text)
                print("File {} is a document.".format(i))
                continue
            t.url(content_url)  # enter the second-level page
            # Count the PDFs; get each PDF's name and the name it should have.
            t.wait(2)
            pdf_count = t.count(element_identifier='//div[@id = "zoom"]//a/@href')
            if pdf_count == 0:
                # A normal text page.
                print("File {} is a document.".format(i))
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
            elif 'pdf' in t.read(element_identifier='//div[@id = "zoom"]//a/@href'):
                print("File {} has {} pdf(s).".format(i, pdf_count))
                pdf_count += 1  # Python counts from 0, so there is at least one pdf
                for j in range(1, pdf_count):
                    # Get the pdf's file name.
                    if t.read(element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a/@href') != '':
                        print("Now at pdf {}..".format(j))
                        pdf_name = t.read(element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a/@href').split('/')[-1]
                        # The compliant name to rename to.
                        pdf_name_to_change = t.read(element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a')
                        # Download.
                        prefix = 'http://www.pbc.gov.cn'
                        t.url(prefix + t.read(element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a/@href'))
                        wait_seconds = 1
                        total_seconds = 0
                        while os.path.exists(pdf_name) == False:
                            t.wait(wait_seconds)
                            total_seconds += wait_seconds
                            if total_seconds > 30:
                                print('download fails')
                                break
                        os.rename(pdf_name, pdf_name_to_change)  # rename
                        t.url(content_url)  # back to the second-level page
                    else:
                        print("Non-compliant link; treat as a document and stop reading!")
                        if t.read(element_identifier='//div[@id = "zoom"]') != '':
                            text = t.read(element_identifier='//div[@id = "zoom"]')
                            with open(file_name, 'w') as f:
                                f.write(text)
                        elif t.read(element_identifier='//td[@class = "p1"]') != '':
                            text = t.read(element_identifier='//td[@class = "p1"]')
                            with open(file_name, 'w') as f:
                                f.write(text)
                        else:
                            print("write files fails...")
                        t.url(url_prefix + str(page_num) + '.html')
                        break
            else:
                print("File {} has {} pdf(s).".format(i, pdf_count))
                print("Href with another format; treat as a document and stop reading!")
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
                t.url(url_prefix + str(page_num) + '.html')
                break
    t.close()
def history_data(url_prefix):
    t.init()
    init_url = url_prefix + '1.html'
    t.url(init_url)
    max_page = int(t.read(element_identifier='//td[@class = "Normal"]').split('/')[1]) + 1
    for page_num in range(1, max_page):
        # Main listing page.
        t.url(url_prefix + str(page_num) + '.html')
        print("Current page: {}".format(page_num))
        t.wait(5)
        # Number of items on this page.
        count_values = t.count(element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())
        if t.read(element_identifier='//td[@colspan = "2"]//table[1]//span[@class = "hui12"]') < today:
            print("No increment today")
            break
        print("This page has {} files".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')
            if t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//span[@class = "hui12"]') < today:
                break
            if '.html' in t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href'):
                # An ordinary article page.
                print("File {} is a document.".format(i))
                file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']') + str('.txt')
                prefix = 'http://www.pbc.gov.cn'
                content_url = prefix + t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//td//a/@href')
                # Visit the url; one known href comes back already absolute.
                if content_url == 'http://www.pbc.gov.cnhttp://www.pbc.gov.cn/goutongjiaoliu/113456/113469/3487563/index.html':
                    content_url = 'http://www.pbc.gov.cn/goutongjiaoliu/113456/113469/3487563/index.html'
                t.url(content_url)
                # Extract the text.
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
            elif '.doc' in t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href'):
                # A direct .doc download.
                print("File {} is a doc download.".format(i))
                file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href').split('/')[-1]
                prefix = 'http://www.pbc.gov.cn'
                content_url = prefix + t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
                t.url(content_url)
                wait_seconds = 1
                total_seconds = 0
                while os.path.exists(file_name) == False:
                    t.wait(wait_seconds)
                    total_seconds += wait_seconds
                    if total_seconds > 30:
                        print('download fails')
                        break
            else:
                print("unknown format..")
        print("Finished one pass; back to page {}".format(page_num))
    # Close out.
    t.close()
def getblanklist():
    # Initialise the page.
    t.init()
    # Open the url.
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # Search straight away, without any filter conditions.
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # Show 50 products per page.
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # Sort ascending by issue date ("take it in reverse"), so rows with a
    # blank issue date come first.
    t.hover(element_identifier='//*[@data-sort = "sell_org_date"]')
    t.click(element_identifier='//*[@data-sort = "sell_org_date"]')
    page_curr = 1  # current page index
    max_page = 1  # records the largest page number seen
    value_dict = {}  # holds the data
    # Column names.
    name_list = ['序号', '综合评级', 'url']
    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialise empty dataset
    # Loop while the page can still be turned, or when there is only one page.
    stop_flag = False  # True once all the blank-date rows have been taken
    while (t.read(element_identifier='//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]') == str(page_curr)) or (page_curr == 1):
        if stop_flag == True:  # no blank rows left, so stop paging
            break
        max_page = page_curr
        # Number of rows on this page (Python counts from 0).
        count_values = int(t.count(element_identifier='//tbody[@id = "content"]//tr')) + 1
        # Dump the whole table on this page.
        filename = str(page_curr) + "blank_date.csv"
        t.wait(1)  # wait a second in case the page loaded badly
        t.table(element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table', filename_to_save=filename)
        # Scrape the current page (title and href only).
        for i in range(1, count_values):
            # Keep the row only when its issue date is blank ('--').
            if str(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//td[@class = "px"]')) == '--':
                # print("number {} is running".format(str(i)))
                # serial number
                value_dict[name_list[0]].append(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']/td[2]'))
                # overall rating
                value_dict[name_list[1]].append(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//td[12]//i/@title'))
                # url
                value_dict[name_list[2]].append(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//a/@href'))
            else:
                # No more blank dates: set the flag and end the while loop.
                stop_flag = True
                # print("thread stops here..")
                break
        # Turn the page.
        page_curr += 1
        # print("turn the page..")
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
    # Close the tagui flow.
    t.close()
    # Output file: "blank_date.csv".
    hist_data = pd.DataFrame(value_dict)
    hist_data.to_csv("blank_date.csv", index=False, encoding='UTF-8')
    return max_page
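# getblanklist() above leaves one "<page>blank_date.csv" table dump per page
# next to the combined "blank_date.csv". A small follow-up sketch (file names
# as above; pandas and the standard glob module assumed) that stitches the
# per-page dumps into a single frame:
import glob

frames = [pd.read_csv(path) for path in sorted(glob.glob('[0-9]*blank_date.csv'))]
if frames:
    all_pages = pd.concat(frames, ignore_index=True)
    all_pages.to_csv('blank_date_all_pages.csv', index=False, encoding='UTF-8')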
def wait_for_pageload(selector):
    # Poll up to about 60 seconds for the selector to appear.
    wait_status = 0
    for loop_wait in range(1, 60):
        print(f"{loop_wait}. waiting for page to appear. wait for 1s...")
        if t.present(selector):
            wait_status = 1
            break
        else:
            t.wait(1)
    print("Covid wait_status = {}".format(wait_status))
    return wait_status

t.init()
t.url('https://www.worldometers.info/coronavirus/')
wait_for_pageload('//div[@class="maincounter-number"]')
# Each country appears twice: today's half and yesterday's half.
num_country = int(t.count('(//a[@class="mt_a"])') / 2)
print("Number of Countries found: " + str(num_country))
country_list = []
link_list = []
for n in range(1, num_country + 1):
    try:
        country_row_xpath = f'(//a[@class="mt_a"])[{n}]'
        country_link_xpath = country_row_xpath + '/@href'
        country_link = 'https://www.worldometers.info/coronavirus/' + t.read(country_link_xpath)
        link_list.append(country_link)
        country_name = t.read(country_row_xpath)
        country_list.append(country_name)
    except Exception as e:
        print(e)
def get_property(input_email, input_name, prefer1, prefer2, prefer3,
                 input_loc, input_size, input_price, input_bed, input_floor):
    """
    :param input_email: user email
    :param input_name: user name
    :param prefer1:
    :param prefer2:
    :param prefer3:
    :param input_loc: location name, e.g. input_loc = ['Orchard', 'River Valley', 'Eunos']
    :param input_size: square feet
    :param input_price:
    :param input_bed:
    :param input_floor:
    :return:
    """
    # chatbot input: map the location name to its postal district code
    district_map = {
        'D01': ['Cecil', 'Raffles Place', 'Marina'],
        'D02': ['Chinatown', 'Tanjong Pagar'],
        'D03': ['Alexandra', 'Queenstown', 'Tiong Bahru'],
        'D04': ['Harbourfront', 'Telok Blangah', 'Mount Faber'],
        'D05': ['Buona Vista', 'Pasir Panjang', 'Clementi'],
        'D06': ['City Hall', 'Clarke Quay'],
        'D07': ['Beach Road', 'Bugis', 'Golden Mile'],
        'D08': ['Farrer Park', 'Little India'],
        'D09': ['Orchard', 'River Valley'],
        'D10': ['Balmoral', 'Holland', 'Bukit Timah'],
        'D11': ['Newton', 'Novena', 'Thomson'],
        'D12': ['Balestier', 'Toa Payoh', 'Serangoon'],
        'D13': ['Macpherson', 'Braddell'],
        'D14': ['Sims', 'Geylang', 'Paya Lebar'],
        'D15': ['Joo Chiat', 'Marine Parade', 'Katong'],
        'D16': ['Bedok', 'Upper East Coast', 'Siglap'],
        'D17': ['Flora', 'Changi', 'Loyang'],
        'D18': ['Pasir Ris', 'Tampines'],
        'D19': ['Serangoon Gardens', 'Punggol', 'Sengkang'],
        'D20': ['Ang Mo Kio', 'Bishan', 'Thomson'],
        'D21': ['Clementi Park', 'Upper Bukit Timah', 'Ulu Pandan'],
        'D22': ['Boon Lay', 'Jurong', 'Tuas'],
        'D23': ['Dairy Farm', 'Bukit Panjang', 'Choa Chu Kang', 'Hillview', 'Bukit Batok'],
        'D24': ['Lim Chu Kang', 'Tengah', 'Kranji'],
        'D25': ['Admiralty', 'Woodlands'],
        'D26': ['Mandai', 'Upper Thomson'],
        'D27': ['Sembawang', 'Yishun'],
        'D28': ['Seletar', 'Yio Chu Kang'],
    }
    input_area = []
    for district, areas in district_map.items():
        if input_loc in areas:
            input_area.append(district)
            break
    print(input_area)
    input_type = ['condo']  # HDB, condo, landed (only single choice is supported in propertyguru)
    input_minsize = [str(input_size * 0.8)]  # square feet @ modified
    input_maxsize = [str(input_size * 1.2)]  # square feet @ modified
    input_minprice = [str(input_price * 0.5)]  # $ @ modified
    input_maxprice = [str(input_price * 1.5)]  # $ @ modified
    input_bed = [str(input_bed)]  # 0 to 5 bedrooms, 0 stands for studio
    input_floor = [str(input_floor)]  # ground, low, mid, high, penthouse (only single choice is supported in propertyguru)

    # url transfer
    def url_area():
        url_area = ''
        for n in input_area:
            url_area += f'district_code%5B%5D={n}&'
        return url_area

    def url_type():
        if 'HDB' in input_type:
            url_type = 'property_type=H&'
        if 'condo' in input_type:
            url_type = 'property_type=N&'
        if 'landed' in input_type:
            url_type = 'property_type=L&'
        return url_type

    def url_minsize():
        url_minsize = ''.join(input_minsize)
        return f'minsize={url_minsize}&'

    def url_maxsize():
        url_maxsize = ''.join(input_maxsize)
        return f'maxsize={url_maxsize}&'

    def url_minprice():
        url_minprice = ''.join(input_minprice)
        return f'minprice={url_minprice}&'

    def url_maxprice():
        url_maxprice = ''.join(input_maxprice)
        return f'maxprice={url_maxprice}&'

    def url_bed():
        url_bed = ''
        for n in input_bed:
            url_bed += f'beds%5B%5D={n}&'
        return url_bed

    def url_floor():
        url_floor = ''
        if 'ground' in input_floor:
            url_floor = 'floor_level=GND&'
        if 'low' in input_floor:
            url_floor = 'floor_level=LOW&'
        if 'mid' in input_floor:
            url_floor = 'floor_level=MID&'
        if 'high' in input_floor:
            url_floor = 'floor_level=HIGH&'
        if 'penthouse' in input_floor:
            url_floor = 'floor_level=PENT&'
        return url_floor

    url_main = (f'https://www.propertyguru.com.sg/property-for-sale?market=residential&'
                f'{url_type()}{url_area()}{url_minprice()}{url_maxprice()}'
                f'{url_bed()}{url_minsize()}{url_maxsize()}{url_floor()}newProject=all')
    print('main page url link: ' + url_main)

    # tagui scrape
    t.init()
    t.url(url_main)
    result = wait_for_mainpageload('//div[@class="header-wrapper"]')
    if result == 0:
        print(' no result found')
        mail_notfound(input_email, input_name, input_loc, input_size,
                      input_price, input_bed, input_floor)
        # restart BuyerAgent.py
        python = sys.executable
        os.execl(python, python, *sys.argv)
    num_result = t.count('//div[@class="header-wrapper"]')
    num_result_ad = num_result + 2
    # num_result_ad = 6  # for test
    print("num of property in this page without ad = ", num_result)
    print("num of property in this page including ad = ", num_result_ad)

    # load main page, get detail page url link; skip the 4th and 8th entries (advertisements)
    url = [''] * num_result_ad
    for n in [x for x in range(1, num_result_ad + 1) if x != 4 and x != 8]:
        wait_for_pageload(f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)')
        url[n - 1] = read_if_present(f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)')
        print(f"{n}. url = " + url[n - 1])

    # load detail page
    property_title = [''] * num_result_ad
    type = [''] * num_result_ad
    area = [''] * num_result_ad
    bedroom = [''] * num_result_ad
    bathroom = [''] * num_result_ad
    price = [''] * num_result_ad
    total = [''] * num_result_ad
    address = [''] * num_result_ad
    postcode = [''] * num_result_ad
    region = [''] * num_result_ad
    floor = [''] * num_result_ad
    furnish = [''] * num_result_ad
    description = [''] * num_result_ad
    feature = [''] * num_result_ad
    image1 = [''] * num_result_ad
    image2 = [''] * num_result_ad
    image3 = [''] * num_result_ad
    id = [''] * num_result_ad
    pdf = [''] * num_result_ad
    pdf_link = [''] * num_result_ad
    for n in [x for x in range(1, num_result_ad + 1) if x != 4 and x != 8]:
        t.url("https://www.propertyguru.com.sg" + url[n - 1])
        wait_for_pageload('//h1[@class="h2"]')
        property_title[n - 1] = read_if_present('//h1[@class="h2"]')
        print(f"{n}. property_title = " + property_title[n - 1])
        type[n - 1] = read_if_present('//*[@id="condo-profile"]/div/div/div/div/div[1]/div/div/div[1]/div/div[2]')
        print(f"{n}. type = " + type[n - 1])
        area[n - 1] = read_if_present('//*[@id="details"]/div/div[1]/div[2]/div[3]/div/div[2]')
        print(f"{n}. area = " + area[n - 1])
        bedroom[n - 1] = read_if_present('//*[@id="overview"]/div/div/div/section/div[1]/div[2]/div[1]/span')
        print(f"{n}. bedroom = " + bedroom[n - 1])
        bathroom[n - 1] = read_if_present('//*[@id="overview"]/div/div/div/section/div[1]/div[2]/div[2]/span')
        print(f"{n}. bathroom = " + bathroom[n - 1])
        total[n - 1] = read_if_present('//*[@id="overview"]/div/div/div/section/div[1]/div[1]/div[1]/span[2]')
        print(f"{n}. total price = " + total[n - 1])
        price[n - 1] = read_if_present('//*[@id="overview"]/div/div/div/section/div[1]/div[2]/div[4]/div/span[2]')
        print(f"{n}. price = " + price[n - 1])
        address[n - 1] = read_if_present('//*[@id="overview"]/div/div/div/section/div[1]/div[3]/div/div[2]/div[1]/span[1]')
        print(f"{n}. address = " + address[n - 1])
        postcode[n - 1] = read_if_present('//*[@id="overview"]/div/div/div/section/div[1]/div[3]/div/div[2]/div[1]/span[2]')
        print(f"{n}. postalcode = " + postcode[n - 1])
        region[n - 1] = read_if_present('//*[@id="overview"]/div/div/div/section/div[1]/div[3]/div/div[2]/div[1]/span[3]')
        print(f"{n}. region = " + region[n - 1])
        floor[n - 1] = read_if_present('//*[@id="details"]/div/div[1]/div[2]/div[9]/div/div[2]')
        print(f"{n}. floor = " + floor[n - 1])
        furnish[n - 1] = read_if_present('//*[@id="details"]/div/div[1]/div[2]/div[7]/div/div[2]')
        print(f"{n}. furnish = " + furnish[n - 1])
        description[n - 1] = read_if_present('//*[@id="details"]/div/div[2]')
        print(f"{n}. description = " + description[n - 1])
        feature[n - 1] = read_if_present('//*[@id="facilities"]')
        print(f"{n}. feature = " + feature[n - 1])
        image1[n - 1] = read_if_present('//*[@id="carousel-photos"]/div[2]/div/div[1]/span/img/@src')
        print(f"{n}. image1 = " + image1[n - 1])
        image2[n - 1] = read_if_present('//*[@id="carousel-photos"]/div[2]/div/div[2]/span/img/@src')
        print(f"{n}. image2 = " + image2[n - 1])
        image3[n - 1] = read_if_present('//*[@id="carousel-photos"]/div[2]/div/div[3]/span/img/@src')
        print(f"{n}. image3 = " + image3[n - 1])
        pdf[n - 1] = read_if_present('//*[@id="sticky-right-col"]/div[3]/a[2]/@href')
        pdf_link[n - 1] = 'https://www.propertyguru.com.sg' + pdf[n - 1]
        print(f"{n}. pdf_link = " + pdf_link[n - 1])
        id[n - 1] = read_if_present('//*[@id="details"]/div/div[1]/div[2]/div[10]/div/div[2]')
        print(f"{n}. id = " + id[n - 1])

    property_info = {
        'property_title': property_title,
        'url': ['https://www.propertyguru.com.sg' + x for x in url],
        'type': type,
        'area': area,
        'total price': total,
        'price': price,
        'bedroom': bedroom,
        'bathroom': bathroom,
        'address': address,
        'postcode': postcode,
        'region': region,
        'floor': floor,
        'furnish': furnish,
        'description': description,
        'feature': feature,
        'image1': image1,
        'image2': image2,
        'image3': image3,
        'id': id,
        'pdf_link': pdf_link,
    }
    df = DataFrame(property_info,
                   columns=['property_title', 'id', 'pdf_link', 'type', 'area',
                            'total price', 'price', 'bedroom', 'bathroom',
                            'address', 'postcode', 'region', 'floor', 'furnish',
                            'description', 'feature', 'url', 'image1', 'image2',
                            'image3'])
    df.to_excel('property_info.xlsx', encoding='utf8', index=None)
    print('======== property_info.xlsx saved ==========')

    # from propertySearcher_util import download_image
    download_image(id, image1, image2, image3)
    # from propertySearcher_util import classify_image
    filtered_id, filtered_cluster = classify_image(df, prefer1, prefer2, prefer3)
    print(df)
    # generate image-filtered df, sorted by filtered_id
    filtered_df = df[df['id'].isin(filtered_id)]
    # write image cluster column into df
    filtered_df['image'] = filtered_cluster
    print(filtered_df)
    # save to excel
    filtered_df.to_excel('property_info_image.xlsx', encoding='utf8', index=None)

    print('======== generate data for pdf downloader ==========')
    property_title = filtered_df['property_title']  # filtered_df = pd.read_excel('property_info_filtered.xlsx')
    print(list(property_title))
    pdf_link = filtered_df['pdf_link']
    print(list(pdf_link))
    pdf_id = filtered_df['id']
    print(list(pdf_id))
    # from propertySearcher_util import download_pdf
    pdf_filename = download_pdf(property_title, pdf_link, pdf_id)  # pdf_filename = property_title + pdf_id, used for the email attachment
    # from propertySearcher_util import classify_text
    features_selected = classify_text(filtered_df, 3, 6)
    # edit dataframe
    filtered_df['Key Features'] = features_selected
    filtered_df = filtered_df.drop(columns=['pdf_link', 'description', 'feature',
                                            'image1', 'image2', 'image3'])
    # save to excel
    filtered_df.to_excel('Property_info_text.xlsx', encoding='utf8', index=None)
    # from propertySearcher_util import edit_excel
    edit_excel('Property_info_text.xlsx')
    print('============ excel saved ============')
    # from propertySearcher_util import mail_shortlist
    mail_shortlist(input_email, input_name, pdf_filename)
logging.basicConfig(filename="log.txt")
srcDirectory = "OrgImage"
t.init(visual_automation=True)
for target in findAllFile(srcDirectory):
    target_image = 'OrgImage/' + target
    t.url('https://www.bing.com')
    t.click('//div[@id="sb_sbi"]/img')
    t.upload("input.fileinput", target_image)
    t.wait(3)
    succDownload = False
    image_nums = t.count('//a[@class="richImgLnk"]')
    print(image_nums)
    if t.click('//li[contains(string(),"Pages")]') == False:
        image_nums = 0
    t.wait(3)
    for i in range(1, image_nums):
        if t.click(f'(//a[@class="richImgLnk"])[{i}]'):
            t.wait(3)
            t.keyboard('[ctrl]l')
            t.keyboard('[ctrl]c')
            imgUrl = t.clipboard()
            print(imgUrl)
def history_data_daily(url_prefix):
    t.init()
    init_url = url_prefix + '1.html'
    t.url(init_url)  # initial url
    # Maximum number of pages.
    max_page = int(t.read(element_identifier='//td[@class = "Normal"]').split('/')[1]) + 1
    for page_num in range(1, max_page):
        t.url(url_prefix + str(page_num) + '.html')
        print("Current page: {}".format(page_num))
        t.wait(5)
        # Number of items on this page.
        count_values = t.count(element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())  # e.g. '2018-04-24'
        if t.read(element_identifier='//td[@colspan = "2"]//table[1]//span[@class = "hui12"]') < today:
            print("No increment today")
            break
        print("This page has {} files".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')
            if t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//span[@class = "hui12"]') < today:
                t.close()
                exit(1)
            file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']')
            file_name = file_name[:-10] + str("_") + file_name[-10:] + str('.txt')
            time = file_name[-14:-4]
            prefix = 'http://www.pbc.gov.cn'
            content_url = prefix + t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
            if '.html' not in t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href'):
                # The link goes straight to a downloadable file.
                if 'cnhttp' in content_url:
                    content_url = content_url[21:]  # href was already absolute; not sure why this happens
                print("File {} is a direct download.".format(i))
                file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
                suffix = file_name.split('.')[-1]
                file_name = file_name.split('/')[-1]
                t.url(content_url)
                wait_seconds = 1
                total_seconds = 0
                while os.path.exists(file_name) == False:
                    t.wait(wait_seconds)
                    total_seconds += wait_seconds
                    if total_seconds > 30:
                        print('download fails')
                        break
                os.rename(file_name, file_name[:-(len(suffix) + 1)] + "_" + time + '.' + file_name[-(len(suffix) + 1):])
            else:
                # No direct download; open the second-level page.
                if 'cnhttp' in content_url:
                    content_url = content_url[21:]  # href was already absolute; not sure why this happens
                t.url(content_url)
                # Count the attachments; get each one's name and the name it should have.
                t.wait(2)
                pdf_count = t.count(element_identifier='//div[@id = "zoom"]//a/@href')
                if pdf_count == 0:
                    # A normal text page.
                    print("File {} is a document.".format(i))
                    if t.read(element_identifier='//div[@id = "zoom"]') != '':
                        text = t.read(element_identifier='//div[@id = "zoom"]')
                        with open(file_name, 'w') as f:
                            f.write(text)
                    elif t.read(element_identifier='//td[@class = "p1"]') != '':
                        text = t.read(element_identifier='//td[@class = "p1"]')
                        with open(file_name, 'w') as f:
                            f.write(text)
                    else:
                        print("write files fails...")
                else:
                    # Save the page text first, then fetch the attachments.
                    if t.read(element_identifier='//div[@id = "zoom"]') != '':
                        text = t.read(element_identifier='//div[@id = "zoom"]')
                        with open(file_name, 'w') as f:
                            f.write(text)
                    elif t.read(element_identifier='//td[@class = "p1"]') != '':
                        text = t.read(element_identifier='//td[@class = "p1"]')
                        with open(file_name, 'w') as f:
                            f.write(text)
                    else:
                        print("write files fails...")
                    print("File {} has {} file(s) to download.".format(i, pdf_count))
                    pdf_count += 1  # Python counts from 0, so there is at least one attachment
                    current_count = 0
                    for j in range(1, pdf_count):
                        # Get the attachment's file name.
                        if '.htm' not in t.read(element_identifier='//div[@id = "zoom"]//p//a/@href'):
                            print("Now at file {}..".format(j))
                            p_count = t.count(element_identifier='//div[@id = "zoom"]//p')
                            while current_count <= p_count:
                                if t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a') != '':
                                    # This <p> has a link.
                                    print("This <p> has one!")
                                    pdf_name = t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a/@href')
                                    # The compliant name to rename to.
                                    pdf_name_to_change = t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a')
                                    # Download.
                                    suffix = pdf_name.split('.')[-1]
                                    pdf_name = pdf_name.split('/')[-1]
                                    prefix = 'http://www.pbc.gov.cn'
                                    download_link = prefix + t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a/@href')
                                    if 'cnhttp' in download_link:
                                        t.url(t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a/@href'))
                                    else:
                                        t.url(download_link)
                                    wait_seconds = 1
                                    total_seconds = 0
                                    while os.path.exists(pdf_name) == False:
                                        t.wait(wait_seconds)
                                        total_seconds += wait_seconds
                                        if total_seconds > 30:
                                            print('download fails')
                                            break
                                    os.rename(pdf_name, pdf_name_to_change)  # rename
                                    os.rename(pdf_name_to_change, pdf_name_to_change[:-(len(suffix) + 1)] + '_' + time + pdf_name_to_change[-(len(suffix) + 1):])
                                    t.url(content_url)  # back to the second-level page
                                    current_count += 1
                                    break
                                else:
                                    current_count += 1
                                    print("This <p> has none")
                        else:
                            print("It is a web page; treat it as a document!")
                            prefix = 'http://www.pbc.gov.cn'
                            download_link = prefix + t.read(element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a/@href')
                            if 'cnhttp' in download_link:
                                t.url(t.read(element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a/@href'))
                            else:
                                t.url(download_link)
                            # Extract the text.
                            if t.read(element_identifier='//div[@id = "zoom"]') != '':
                                text = t.read(element_identifier='//div[@id = "zoom"]')
                                with open(file_name, 'w') as f:
                                    f.write(text)
                            elif t.read(element_identifier='//td[@class = "p1"]') != '':
                                text = t.read(element_identifier='//td[@class = "p1"]')
                                with open(file_name, 'w') as f:
                                    f.write(text)
                            else:
                                print("write files fails...")
    t.close()
def getdailyincrement(str_to_append):
    # Initialise the page.
    t.init()
    # Open the url.
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # Give the page time to load.
    t.wait(15)
    # Hover, then open the condensed-options panel.
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    # Move to the issue-date field, type today's date, then search.
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=str_to_append)
    # Click again so the date picker does not cover the search button.
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # Show 50 products per page.
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    page_curr = 1  # current page index
    value_dict = {}  # holds the data
    count = 1  # used for csv naming
    # Column names.
    name_list = ['序号', '综合评级', 'url']
    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialise empty dataset
    # Loop while the page can still be turned, or when there is only one page.
    while (t.read(element_identifier='//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]') == str(page_curr)) or (page_curr == 1):
        # Number of rows on this page (Python counts from 0).
        count_values = int(t.count(element_identifier='//tbody[@id = "content"]//tr')) + 1
        # If even the last row on the page is newer than today, turn the page directly.
        if str(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(count_values - 1) + ']//td[@class = "px"]')) > str_to_append:
            # print("direct continue..")
            page_curr += 1
            t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
            t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
            continue
        filename = str(count) + "daily_data.csv"
        count += 1
        t.wait(1)  # wait a second in case the page loaded badly
        # Dump the whole table on this page.
        t.table(element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table', filename_to_save=filename)
        count_values = int(t.count(element_identifier='//tbody[@id = "content"]//tr')) + 1
        for i in range(1, count_values):
            # Keep rows issued today; if even the bottom row is past today, stop.
            if str(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(count_values - 1) + ']//td[@class = "px"]')) > str_to_append:
                # print("direct break..")
                break
            else:
                if str(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//td[@class = "px"]')) == str_to_append:
                    # serial number
                    value_dict[name_list[0]].append(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']/td[2]'))
                    # overall rating
                    value_dict[name_list[1]].append(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//td[12]//i/@title'))
                    # url
                    value_dict[name_list[2]].append(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//a/@href'))
                else:
                    # Not part of today's increment; do nothing.
                    pass
        # print("turn the page..")
        # Turn the page.
        page_curr += 1
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
    # Close the tagui flow.
    t.close()
    # Output file named "<today's date>.csv".
    today_data = pd.DataFrame(value_dict)
    today_data.to_csv(str_to_append + ".csv", index=False, encoding='UTF-8')
    return count - 1
def main_operation(url, mode='txt'):
    # Current page.
    curr_page = int(t.read(element_identifier='//div[@class = "ng-binding"][last()]').split('/')[0])
    # Number of lists on the page.
    list_count = t.count(element_identifier='//div[@class = "list caidan-right-list"]')
    # Resume from the checkpoint file if one exists.
    if os.path.exists('baojianhui_log.txt'):
        with open('baojianhui_log.txt', 'r') as f:
            params = f.read().split(',')
        curr_page = params[0]
        start_i = params[1]
        start_j = params[2]
    else:
        # First run: start from the beginning.
        start_i = 1
        start_j = 1
    # Regular operation.
    for i in range(1, list_count + 1):
        t.wait(5)
        if i < int(start_i):
            continue
        # Number of regulation entries in this list.
        item_count = t.count(element_identifier='//div[@class = "list caidan-right-list"][' + str(i) + ']//div[@class = "panel-row ng-scope"]')
        print('Now at list {}, which has {} items'.format(str(i), str(item_count)))
        t.wait(5)
        for j in range(1, item_count + 1):
            if j < int(start_j):
                continue
            item_title = t.read(element_identifier='//div[@class = "list caidan-right-list"][' + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) + ']//a')
            time_suffix = t.read(element_identifier='//div[@class = "list caidan-right-list"][' + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) + ']//span[@class = "date ng-binding"]')
            # If the item is not dated today, the day's increment is done.
            if str(time_suffix) != str((datetime.datetime.today()).date()):
                print("Today's increment has all been fetched")
                return True, 'No increment today'
            file_name = item_title + '_' + time_suffix + '.txt'
            if '/' in file_name:
                file_name = file_name.replace('/', ' ')
            if mode == 'txt':
                # Open the detail page.
                link = t.read(element_identifier='//div[@class = "list caidan-right-list"][' + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) + ']//a/@ng-href')
                prefix = 'http://www.cbirc.gov.cn/cn/view/pages/'
                final_link = prefix + link
                t.url(final_link)
                t.wait(1)
                while not os.path.exists(file_name):
                    # The article body sits in one of three container classes:
                    # Section0/Section1, WordSection1, or wenzhang-content.
                    if t.read(element_identifier='//div[@class = "Section0"]') + t.read(element_identifier='//div[@class = "Section1"]') != '':
                        # Section0 paragraphs.
                        p_counts_section0 = t.count(element_identifier='//div[@class = "Section0"]//p')
                        content_list = []
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(file_name.split("_")[0] + "\n")
                        for p in range(1, p_counts_section0 + 1):
                            content_list.append(t.read(element_identifier='//div[@class = "Section0"]//p[' + str(p) + ']'))
                        with open(file_name, 'a', encoding='utf-8') as f:
                            f.writelines([content + "\n" for content in content_list])
                        # Section1 paragraphs.
                        p_counts_section1 = t.count(element_identifier='//div[@class = "Section1"]//p')
                        content_list = []
                        for p in range(1, p_counts_section1 + 1):
                            content_list.append(t.read(element_identifier='//div[@class = "Section1"]//p[' + str(p) + ']'))
                        with open(file_name, 'a', encoding='utf-8') as f:
                            f.writelines([content + "\n" for content in content_list])
                        break
                    elif t.read(element_identifier='//div[@class = "WordSection1"]') != '':
                        p_counts = t.count(element_identifier='//div[@class = "WordSection1"]//p')
                        if p_counts <= 1:
                            content_list = t.read(element_identifier='//div[@class = "WordSection1"]//p')
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([content + "\n" for content in content_list.split(" ")])
                        else:
                            content_list = []
                            for p in range(1, p_counts + 1):
                                content_list.append(t.read(element_identifier='//div[@class = "WordSection1"]//p[' + str(p) + ']'))
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([content + "\n" for content in content_list])
                        break
                    elif t.read(element_identifier='//div[@class = "wenzhang-content ng-binding"]') != '':
                        # With <p> tags when p_counts > 1; otherwise split on spaces.
                        p_counts = t.count(element_identifier='//div[@class = "wenzhang-content ng-binding"]//p')
                        if p_counts <= 1:
                            content_list = t.read(element_identifier='//div[@class = "wenzhang-content ng-binding"]//p')
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([content + "\n" for content in content_list.split(" ")])
                        else:
                            content_list = []
                            for p in range(1, p_counts + 1):
                                content_list.append(t.read(element_identifier='//div[@class = "wenzhang-content ng-binding"]//p[' + str(p) + ']'))
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([content + "\n" for content in content_list])
                        break
                    else:
                        content = ' '
                        with open(file_name, 'w') as f:
                            f.write(content)
                        break
            elif mode == 'doc':
                t.click(element_identifier='//div[@class = "list caidan-right-list"][' + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) + ']//a[@ng-click = "fileDownload(x.docFileUrl)"]')
                doc_id = t.read(element_identifier='//div[@class = "list caidan-right-list"][' + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) + ']//a/@ng-href').split('=')[1][:-7]
                doc_name = doc_id + '.doc'
                curr_clock = 5
                while not os.path.exists(doc_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(5)
                os.rename(doc_name, item_title + '_' + time_suffix + '.doc')
            elif mode == 'pdf':
                t.click(element_identifier='//div[@class = "list caidan-right-list"][' + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) + ']//a[@ng-click = "fileDownload(x.pdfFileUrl)"]')
                pdf_id = t.read(element_identifier='//div[@class = "list caidan-right-list"][' + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) + ']//a/@ng-href').split('=')[1][:-7]
                pdf_name = pdf_id + '.pdf'
                curr_clock = 5
                while not os.path.exists(pdf_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(5)
                os.rename(pdf_name, item_title + '_' + time_suffix + '.pdf')
            else:
                print('unknown format..')
                t.close()
                raise Exception("unknown input mode")
            # Back to the main page.
            t.url(url + str(curr_page))
            t.wait(5)
            # Checkpoint the current position.
            with open('baojianhui_log.txt', 'w') as f:
                f.write(str(curr_page) + ',' + str(i) + ',' + str(j))
        # This list is done; reset j for the next list.
        with open('baojianhui_log.txt', 'w') as f:
            f.write(str(curr_page) + ',' + str(i) + ',' + str(1))
def getExpFlightPrice(airline, dep_ref, dur_ref):
    print(airline)
    print(dep_ref)
    print(dur_ref)
    util.wait_for_pageload('//input[@classes="filter-checkbox"]')
    t.wait(3)
    t.click(f'//a[@data-content-id="airlineToggleContainer"]')
    # Tick the airline filter checkbox for each distinct airline, and
    # normalise the reference formats: strip a leading zero from the
    # departure time and make every duration read "Xh Ym".
    for i in range(len(dep_ref)):
        if i == 0:
            if t.present(f'//input[@id="airlineRowContainer_{airline[i]}"]'):
                t.wait(3)
                t.click(f'//input[@id="airlineRowContainer_{airline[i]}"]')
            else:
                print('Not match')
                return 0, ''
        elif airline[i] != airline[i - 1]:
            if t.present(f'//input[@id="airlineRowContainer_{airline[i]}"]'):
                t.wait(1)
                t.click(f'//input[@id="airlineRowContainer_{airline[i]}"]')
            else:
                print('Not match')
                return 0, ''
        if dep_ref[i][0] == '0':
            dep_ref[i] = dep_ref[i][1:]
        if dur_ref[i][-1:] == 'h':
            dur_ref[i] = dur_ref[i] + ' 0m'
        else:
            dur_ref[i] = dur_ref[i] + 'm'
    print(airline)
    print(dep_ref)
    print(dur_ref)
    util.wait_for_pageload('//button[@data-test-id="select-button"]')
    t.wait(5)
    for i in range(t.count(f'//ul[@id="flightModuleList"]//li')):
        i = i + 1
        print(i)
        dep = t.read(f'(//span[@class="medium-bold"]//span[@data-test-id="departure-time"])[{i}]')
        if len(dur_ref) == 1:
            # One-way trip.
            if dep == dep_ref[0]:
                print('dep OK')
                dur = t.read(f'(//span[@data-test-id="duration"])[{i}]')
                t.click(f'(//button[@data-test-id="select-button"])[{i}]')
                t.wait(5)
                if t.present('//a[@id="forcedChoiceNoThanks"]'):
                    t.click(f'//a[@id="forcedChoiceNoThanks"]')
                t.wait(5)
                for x in range(5):
                    print(x)
                    if t.popup('Flight-Information?'):
                        break
                    else:
                        t.wait(5)
                price = t.read(f'(//span[@class="packagePriceTotal"])[2]')
                price = float(price.replace(',', '').replace('SG', '').replace('$', '').replace(' ', ''))
                print(price)
                url = t.url()
                return price, url
            else:
                return 0, ''
        elif len(dur_ref) == 2:
            # Return trip: pick the outbound leg, then match the return leg.
            print('trip', len(dur_ref))
            if dep == dep_ref[0]:
                print('dep OK')
                dur = t.read(f'(//span[@data-test-id="duration"])[{i}]')
                t.click(f'(//button[@data-test-id="select-button"])[{i}]')
                t.wait(5)
                util.wait_for_pageload('//button[@data-test-id="select-button"]')
                t.click(f'//input[@id="airlineRowContainer_{airline[1]}"]')
                t.wait(2)
                for j in range(t.count(f'//ul[@id="flightModuleList"]//li')):
                    j = j + 1
                    print(j)
                    dep = t.read(f'(//span[@data-test-id="departure-time"])[{j}+1]')
                    if dep == dep_ref[1]:
                        print('return dep ok')
                        dur = t.read(f'(//span[@data-test-id="duration"])[{j}+1]')
                        if dur == dur_ref[1]:
                            t.click(f'(//button[@data-test-id="select-button"])[{j}]')
                            t.wait(5)
                            if t.present('//a[@id="forcedChoiceNoThanks"]'):
                                t.click(f'//a[@id="forcedChoiceNoThanks"]')
                            t.wait(5)
                            for x in range(5):
                                print(x)
                                if t.popup('Flight-Information?'):
                                    break
                                else:
                                    t.wait(5)
                            util.wait_for_pageload('//h1[@class="section-header-main"]')
                            price = t.read(f'(//span[@class="packagePriceTotal"])[2]')
                            price = float(price.replace(',', '').replace('SG', '').replace('$', '').replace(' ', ''))
                            print(price)
                            url = t.url()
                            print(url)
                            return price, url
            else:
                return 0, ''
        elif len(dur_ref) >= 3:
            # Multi-city trip: the result card shows three spans per leg.
            dep_lst = []
            dur_lst = []
            print('multi-trip ', len(dur_ref))
            for k in range(len(dur_ref)):
                dep_lst.append(t.read(f'(//span[@data-test-id="departure-time"])[{3*i+k+1}]'))
                dur_lst.append(t.read(f'(//span[@data-test-id="duration"])[{3*i+k+1}]'))
            print(dep_lst)
            print(dep_ref)
            if dep_lst == dep_ref:
                print(dur_lst)
                print(dur_ref)
                if dur_lst == dur_ref:
                    t.click(f'(//button[@data-test-id="select-button"])[{i}]')
                    t.wait(5)
                    if t.present('//a[@id="forcedChoiceNoThanks"]'):
                        t.click(f'//a[@id="forcedChoiceNoThanks"]')
                    t.wait(5)
                    for x in range(5):
                        print(x)
                        if t.popup('Flight-Information?'):
                            break
                        else:
                            t.wait(5)
                    price = t.read(f'(//span[@class="packagePriceTotal"])[2]')
                    price = float(price.replace(',', '').replace('SG', '').replace('$', '').replace(' ', ''))
                    print(price)
                    url = t.url()
                    print(url)
                    return price, url
            else:
                return 0, ''
def gethistorylist(input):
    # Fetch the data for a given year.
    input = str(input)
    date_start = input + '-08-01'  # start date of the year's window
    date_end = input + '-12-31'    # end date of the year's window
    # Initialise the page.
    t.init()
    # Open the url.
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # Give the page 5 seconds to load.
    t.wait(5)
    # Hover, then open the condensed-options panel.
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    # Move to the issue-date field, type the start date, then search.
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=date_start)
    # Click again so the date picker does not cover the search button.
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # Show 50 products per page.
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # Sort ascending by issue date, i.e. "take it in reverse".
    t.hover(element_identifier='//*[@data-sort = "sell_org_date"]')
    t.click(element_identifier='//*[@data-sort = "sell_org_date"]')
    page_curr = 1  # current page index
    value_dict = {}  # holds the data
    max_page = 1  # records the largest page number seen
    # Column names.
    name_list = ['序号', '综合评级', 'url']
    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialise empty dataset
    # Loop while the page can still be turned, or when there is only one page.
    stop_flag = False
    while (t.read(element_identifier='//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]') == str(page_curr)) or (page_curr == 1):
        if stop_flag == True:  # no data left for this year, so stop paging
            break
        max_page = page_curr
        # Number of rows on this page (Python counts from 0).
        count_values = int(t.count(element_identifier='//tbody[@id = "content"]//tr')) + 1
        # Dump the whole table on this page.
        filename = str(input) + str("_") + str(page_curr) + "history_data.csv"
        t.wait(1)  # wait a second in case the page loaded badly
        t.table(element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table', filename_to_save=filename)
        # Scrape the current page (title and href only).
        for i in range(1, count_values):
            # Keep every row issued within this year (on or before Dec 31).
            if str(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//td[@class = "px"]')) <= date_end:
                # print("number {} is running".format(str(i)))
                # Product serial number, used later as the primary key for joins.
                value_dict[name_list[0]].append(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']/td[2]'))
                # overall rating
                value_dict[name_list[1]].append(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//td[12]//i/@title'))
                # url
                value_dict[name_list[2]].append(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//a/@href'))
            else:
                # Past this year's window: set the flag and end the while loop.
                stop_flag = True
                # print("thread stops here..")
                break
        # Turn the page.
        page_curr += 1
        # print("turn the page..")
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
    # Close the tagui flow.
    t.close()
    # Output named "<year>.csv" (csv output).
    hist_data = pd.DataFrame(value_dict)
    hist_data.to_csv(input + ".csv", encoding='UTF-8', index=False)
    return max_page
# Fragment of the daily-update paging loop (context elided in the source:
# page_curr and str_to_append are defined by the surrounding function).
value_dict = {}  # scraped data
count = 1        # used to number the CSV files
# column names
name_list = ['序号', '综合评级', 'url']
for col_name in name_list:
    value_dict.setdefault(col_name, [])  # initialise empty columns
# loop while we can still turn pages, or when there is only one page
while (t.read(element_identifier='//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]') == str(page_curr)) or (page_curr == 1):
    # number of rows on this page (+1 because Python ranges start at 0)
    count_values = int(t.count(element_identifier='//tbody[@id = "content"]//tr')) + 1
    # if the last row on this page is already past the target date, skip straight to the next page
    if str(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(count_values - 1) + ']//td[@class = "px"]')) > str_to_append:
        # turn the page: simulate a mouse move, then click the next page link
        page_curr += 1
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
        continue
    filename = str(count) + "daily_data.csv"
    count += 1
def main_operation(url, mode='txt'):
    # current page number, read from the pager widget
    curr_page = int(
        t.read(element_identifier='//div[@class = "ng-binding"][last()]').split('/')[0])
    # how many regulation lists this page holds
    list_count = t.count(element_identifier='//div[@class = "list caidan-right-list"]')
    # if resuming from a checkpoint, load the saved position
    if os.path.exists('baojianhui_log.txt'):
        with open('baojianhui_log.txt', 'r', encoding='utf-8') as f:
            params = f.read().split(',')
        curr_page = int(params[0])
        start_i = params[1]
        start_j = params[2]
    else:
        # first run: start everything from the beginning
        start_i = 1
        start_j = 1
    # regular processing
    for i in range(1, list_count + 1):
        t.wait(3)
        if i < int(start_i):
            continue
        # how many entries this list holds
        item_count = t.count(
            element_identifier='//div[@class = "list caidan-right-list"][' + str(i) +
            ']//div[@class = "panel-row ng-scope"]')
        print('list {} contains {} items'.format(str(i), str(item_count)))
        t.wait(3)
        for j in range(1, item_count + 1):
            if j < int(start_j):
                continue
            item_title = t.read(
                element_identifier='//div[@class = "list caidan-right-list"][' + str(i) +
                ']//div[@class = "panel-row ng-scope"][' + str(j) + ']//a')
            time_suffix = t.read(
                element_identifier='//div[@class = "list caidan-right-list"][' + str(i) +
                ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//span[@class = "date ng-binding"]')
            file_name = item_title + '_' + time_suffix + '.txt'
            if '/' in file_name:
                file_name = file_name.replace('/', ' ')
            if mode == 'txt':
                # open the entry's page and save its text
                link = t.read(
                    element_identifier='//div[@class = "list caidan-right-list"][' + str(i) +
                    ']//div[@class = "panel-row ng-scope"][' + str(j) + ']//a/@ng-href')
                prefix = 'http://www.cbirc.gov.cn/cn/view/pages/'
                final_link = prefix + link
                t.url(final_link)
                t.wait(1)
                while not os.path.exists(file_name):
                    # the body sits in one of several containers depending on the page template
                    type_1 = t.read(element_identifier='//div[@class = "Section0"]') + \
                             t.read(element_identifier='//div[@class = "Section1"]')
                    type_2 = t.read(element_identifier='//div[@class = "WordSection1"]')
                    type_3 = t.read(element_identifier='//div[@class = "wenzhang-content ng-binding"]')
                    if type_1 != '':
                        content = type_1
                    elif type_2 != '':
                        content = type_2
                    elif type_3 != '':
                        content = type_3
                    else:
                        content = ' '
                    with open(file_name, 'w', encoding='utf-8') as f:
                        f.write(content)
                    break
            elif mode == 'doc':
                # trigger the .doc download, wait for it to land, then rename it
                t.click(element_identifier='//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.docFileUrl)"]')
                doc_id = t.read(element_identifier='//div[@class = "list caidan-right-list"][' + str(i) +
                                ']//div[@class = "panel-row ng-scope"][' + str(j) +
                                ']//a/@ng-href').split('=')[1][:-7]
                doc_name = doc_id + '.doc'
                curr_clock = 5
                while not os.path.exists(doc_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(2)
                os.rename(doc_name, item_title + '_' + time_suffix + '.doc')
            elif mode == 'pdf':
                # trigger the .pdf download, wait for it to land, then rename it
                t.click(element_identifier='//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.pdfFileUrl)"]')
                pdf_id = t.read(element_identifier='//div[@class = "list caidan-right-list"][' + str(i) +
                                ']//div[@class = "panel-row ng-scope"][' + str(j) +
                                ']//a/@ng-href').split('=')[1][:-7]
                pdf_name = pdf_id + '.pdf'
                curr_clock = 5
                while not os.path.exists(pdf_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(2)
                os.rename(pdf_name, item_title + '_' + time_suffix + '.pdf')
            else:
                print('unknown format..')
                t.close()
                raise Exception("unknown input mode")
            # return to the listing page and checkpoint the current position
            t.url(url + str(curr_page))
            t.wait(2)
            with open('baojianhui_log.txt', 'w', encoding='utf-8') as f:
                f.write(str(curr_page) + ',' + str(i) + ',' + str(j))
        # this list is finished: reset j in the checkpoint
        with open('baojianhui_log.txt', 'w', encoding='utf-8') as f:
            f.write(str(curr_page) + ',' + str(i) + ',' + str(1))
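# Usage sketch (the URL below is hypothetical; the real value is whatever
# paginated listing URL the page number gets appended to). Assumes t.init()
# was already called and MAX_WAIT is defined at module level, as elsewhere in
# this script.
def _example_cbirc_run():
    main_operation('http://www.cbirc.gov.cn/cn/view/pages/index.html?page=', mode='txt')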
def read_text_content(content_url, file_name, page_num, i, time, url_prefix):
    # open the article page; some scraped hrefs misbehave here (source note:
    # unclear why), so strip a duplicated 'http://www.pbc.gov.cn' prefix (21 chars)
    if 'cnhttp' in content_url:
        content_url = content_url[21:]
    t.url(content_url)  # slow to load
    # count the attachments linked inside the article body
    t.wait(2)
    pdf_count = t.count(element_identifier='//div[@id = "zoom"]//a/@href')
    if pdf_count == 0:
        # a plain text document: read the body and save it
        print("Document {} is plain text.".format(i))
        if t.read(element_identifier='//div[@id = "zoom"]') != '':
            text = t.read(element_identifier='//div[@id = "zoom"]')
            try:
                with open(file_name, 'w', encoding='utf-8') as f:
                    f.write(text)
            except:
                # fallback: write under a hardcoded name when open(file_name) fails
                with open('实施《全国企业兼并破产和职工再就业工作计划》银行呆、坏帐准备金核销办法_1997-10-01.txt',
                          'w', encoding='utf-8') as f:
                    f.write(text)
        elif t.read(element_identifier='//td[@class = "p1"]') != '':
            text = t.read(element_identifier='//td[@class = "p1"]')
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
        else:
            with open('wrong_log.txt', 'a', encoding='utf-8') as f:
                string = 'page {} doc {} didnt write in '.format(page_num, i)
                f.write(string)
                f.write("\n")
            print("failed to write file...")
    else:
        # the page has attachments: save the body text first
        if t.read(element_identifier='//div[@id = "zoom"]') != '':
            text = t.read(element_identifier='//div[@id = "zoom"]')
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
        elif t.read(element_identifier='//td[@class = "p1"]') != '':
            text = t.read(element_identifier='//td[@class = "p1"]')
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
        else:
            with open('wrong_log.txt', 'a', encoding='utf-8') as f:
                string = 'page {} doc {} didnt write in '.format(page_num, i)
                f.write(string)
                f.write("\n")
            print("failed to write file...")
        print("Document {} has {} attachment(s) to download.".format(i, pdf_count))
        pdf_count += 1  # ranges start at 0, so offset the bound by 1
        current_count = 0
        for j in range(1, pdf_count):
            # .htm links are web pages and are handled as documents below
            if '.htm' not in t.read(element_identifier='//div[@id = "zoom"]//p//a/@href'):
                print("Processing attachment {}...".format(j))
                p_count = t.count(element_identifier='//div[@id = "zoom"]//p')
                while current_count <= p_count:
                    try:
                        # walk the <p> elements from the end until one contains a link
                        if t.read(element_identifier='//div[@id = "zoom"]//p[last()-' +
                                  str(current_count) + ']//a') != '':
                            print("this <p> has a link")
                            pdf_name = t.read(element_identifier='//div[@id = "zoom"]//p[last()-' +
                                              str(current_count) + ']//a/@href')
                            # the display name the file should end up with
                            pdf_name_to_change = t.read(element_identifier='//div[@id = "zoom"]//p[last()-' +
                                                        str(current_count) + ']//a')
                            # download the attachment
                            suffix = pdf_name.split('.')[-1]
                            pdf_name = pdf_name.split('/')[-1]
                            prefix = 'http://www.pbc.gov.cn'
                            download_link = prefix + t.read(
                                element_identifier='//div[@id = "zoom"]//p[last()-' +
                                str(current_count) + ']//a/@href')
                            if 'cnhttp' in download_link:
                                # the href already carries the full site prefix
                                t.url(t.read(element_identifier='//div[@id = "zoom"]//p[last()-' +
                                             str(current_count) + ']//a/@href'))  # slow to load
                            else:
                                t.url(download_link)  # slow to load
                            wait_seconds = 1
                            total_seconds = 0
                            while not os.path.exists(pdf_name):
                                t.wait(wait_seconds)
                                total_seconds += wait_seconds
                                if os.path.exists(pdf_name_to_change):
                                    break
                                if total_seconds > MAX_WAIT:
                                    print('download fails')
                                    with open('download_log.txt', 'a', encoding='utf-8') as f:
                                        string = 'page {} doc {} file {} didnt download '.format(page_num, i, j)
                                        f.write(string)
                                        f.write("\n")
                                    break
                            if not os.path.exists(pdf_name_to_change):
                                os.rename(pdf_name, pdf_name_to_change)  # rename to the display name
                            # insert the document date before the extension
                            os.rename(pdf_name_to_change,
                                      pdf_name_to_change[:-(len(suffix) + 1)] + '_' + time +
                                      pdf_name_to_change[-(len(suffix) + 1):])
                            t.url(content_url)  # back to the article page (slow to load)
                            current_count += 1
                            break
                        else:
                            current_count += 1
                            print("this <p> has no link")
                    except:
                        print('some error occurred, never mind')
                        continue
            else:
                print("It is a web page; treat it as a document!")
                prefix = 'http://www.pbc.gov.cn'
                download_link = prefix + t.read(
                    element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a/@href')
                if 'cnhttp' in download_link:
                    t.url(t.read(element_identifier='//div[@id = "zoom"]//p[' + str(j) +
                                 ']//a/@href'))  # slow to load
                else:
                    t.url(download_link)  # slow to load
                # read and save the page text
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w', encoding='utf-8') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w', encoding='utf-8') as f:
                        f.write(text)
                else:
                    with open('wrong_log.txt', 'a', encoding='utf-8') as f:
                        string = 'page {} doc {} didnt write in '.format(page_num, i)
                        f.write(string)
                        f.write("\n")
                    print("failed to write file...")
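# Usage sketch (all argument values hypothetical; in the original flow the
# caller supplies the scraped href, a sanitised file name, the page/document
# indices and the document date). Note that url_prefix is accepted but unused
# by the body above.
def _example_read_one_doc():
    read_text_content(
        content_url='http://www.pbc.gov.cn/some/article.html',  # hypothetical
        file_name='some_regulation_2020-01-01.txt',             # hypothetical
        page_num=1, i=1, time='2020-01-01',
        url_prefix='http://www.pbc.gov.cn')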
def gethistorylist(inputyear):
    # Fetch the full product table for the given year from bank.jrj.com.cn.
    year = inputyear
    date_start = year + '-01-01'  # first day of the year
    date_end = year + '-12-31'    # last day of the year
    # initialise the browser and open the listing page
    t.init()
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # hover, then click the "collapse search options" toggle
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    # move to the issue-date field, click the text box, type the start date, then search
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=date_start)
    # click again so the date picker does not cover the search button below it
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # set the page size to 50 products per page
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # sort ascending by issue date, i.e. read the data "backwards"
    t.hover(element_identifier='//*[@data-sort = "sell_org_date"]')
    t.click(element_identifier='//*[@data-sort = "sell_org_date"]')
    # loop state used while the "next page" button is not disabled
    page_curr = 1    # current page index
    value_dict = {}  # scraped data
    # column names
    name_list = [
        '序号', '产品名称', '发行银行', '委托货币', '发行日', '停售日', '管理期(天)',
        '预期收益率', '到期收益率', '与同期储蓄比', '综合评级', 'url'
    ]
    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialise empty columns
    stop_flag = False
    # loop while the current page is not the last one, or when there is only one page
    while (t.read(element_identifier='//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]') == str(page_curr)) or (page_curr == 1):
        if stop_flag:  # no data left for this year, so no need to keep paging
            break
        # number of rows on this page (+1 because Python ranges start at 0)
        count_values = int(t.count(element_identifier='//tbody[@id = "content"]//tr')) + 1
        # scrape the current page row by row
        for i in range(1, count_values):
            # keep every row whose issue date falls within this year (<= 12-31)
            if str(t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//td[@class = "px"]')) <= date_end:
                row = '//tbody[@id = "content"]//tr[' + str(i) + ']'
                value_dict[name_list[0]].append(t.read(element_identifier=row + '/td[2]'))   # serial number
                value_dict[name_list[1]].append(t.read(element_identifier=row + '/td[3]'))   # product name
                value_dict[name_list[2]].append(t.read(element_identifier=row + '/td[4]'))   # issuing bank
                value_dict[name_list[3]].append(t.read(element_identifier=row + '/td[5]'))   # currency
                value_dict[name_list[4]].append(t.read(element_identifier=row + '/td[6]'))   # issue date
                value_dict[name_list[5]].append(t.read(element_identifier=row + '/td[7]'))   # end-of-sale date
                value_dict[name_list[6]].append(t.read(element_identifier=row + '/td[8]'))   # management period (days)
                value_dict[name_list[7]].append(t.read(element_identifier=row + '/td[9]'))   # expected yield
                value_dict[name_list[8]].append(t.read(element_identifier=row + '/td[10]'))  # yield at maturity
                value_dict[name_list[9]].append(t.read(element_identifier=row + '/td[11]'))  # vs. same-period savings
                value_dict[name_list[10]].append(t.read(element_identifier=row + '//td[12]//i/@title'))  # composite rating
                value_dict[name_list[11]].append(t.read(element_identifier=row + '//a/@href'))           # url
            else:
                # past this year's data: set the flag so the while loop ends
                stop_flag = True
                print("thread stops here..")
                break
        # turn the page: simulate a mouse move, then click the next page link
        page_curr += 1
        print("turn the page..")
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
    # close the TagUI flow
    t.close()
    # dual-format output: "<year>.csv" and "<year>.xlsx"
    hist_data = pd.DataFrame(value_dict)
    hist_data.to_csv(year + ".csv", index=False, encoding='UTF-8')
    hist_data.to_excel(year + ".xlsx", index=False, encoding='UTF-8')

# gethistorylist('2003')
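# Usage sketch extending the commented-out call above (year is hypothetical;
# pass it as a string, since it is concatenated into dates and file names):
def _example_full_history(year='2003'):
    gethistorylist(year)               # writes <year>.csv and <year>.xlsx
    return pd.read_csv(year + '.csv')  # reload for downstream analysis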
def get_shoe(shoe_name, g, email):
    """
    Get shoe details from jdsports.com.sg
    :param shoe_name: name of the shoe to search for
    :param g: gender of the subscriber
    :param email: email id of the subscriber
    :return: details, list of shoe details.
    """
    details = []

    def scrape_results():
        # read up to the first three product cards on the results page
        count = t.count('//ul[@id="productListMain"]//li[@class="productListItem "]')
        t.wait(3)
        if count != 0:
            for i in range(1, min(count, 3) + 1):
                price = t.read(f'(//span[@class="pri"])[{i}]')
                name = t.read(f'(//span[@class="itemTitle"])[{i}]')
                img = t.read(f'(//a[@class="itemImage"]/picture/img/@srcset)[{i}]')
                link = "https://www.jdsports.com.sg" + t.read(
                    f'(//span[@class = "itemTitle"])[{i}]/a/@href')
                details.append({
                    "email": email,
                    "name": name,
                    "price": price,
                    "img": img,
                    "Company": "JD",
                    "link": link
                })
        else:
            details.append({
                "email": email,
                "name": "NA",
                "price": "NA",
                "img": "NA",
                "Company": "JD",
                "link": "NA"
            })

    t.init(visual_automation=True)
    t.url('https://www.jdsports.com.sg/')
    t.wait(5)
    final_command = shoe_name + " shoes" + '[enter]'
    t.keyboard('[esc]')
    t.type('//input[@id = "srchInput"]', final_command)
    # t.click('//input[@id ="srchButton"]')
    t.wait(3)
    if g == ' men':
        # narrow the results to the men's category when the filter is present
        if t.read('(//a[@data-e2e="plp-filterMenu-catItem"]/span)[contains(.,"Men")]'):
            t.click('(//a[@data-e2e="plp-filterMenu-catItem"]/span)[1]')
            scrape_results()
    elif g == ' women':
        # narrow the results to the women's category when the filter is present
        if t.read('(//a[@data-e2e="plp-filterMenu-catItem"]/span)[contains(.,"Women")]'):
            t.click('(//a[@data-e2e="plp-filterMenu-catItem"]/span)[.="Women"]')
            scrape_results()
    else:
        scrape_results()
    # t.close()
    if len(details) == 0:
        # nothing scraped (e.g. the gender filter was missing): return an NA row
        details.append({
            "email": email,
            "name": "NA",
            "price": "NA",
            "img": "NA",
            "Company": "JD",
            "link": "NA"
        })
    # print("JD BOT", details)
    return details
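# Usage sketch (hypothetical subscriber data; note the gender strings carry a
# leading space, as the comparisons above expect):
def _example_jd_lookup():
    results = get_shoe('air max', ' men', 'subscriber@example.com')
    for item in results:
        print(item['name'], item['price'], item['link'])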
def get_shoe(shoe, gender, email):
    # Farfetch bot: same result schema as the JD bot above, scraping farfetch.com instead.
    t.init(visual_automation=True)
    t.url("https://www.farfetch.com/sg/")
    details = []

    def search_and_scrape():
        # type the query, submit it, and read up to the first three product cards
        t.type('//input[@class="js-searchboxABTest force-ltr"]', shoe + " Shoes")
        t.click('//form[@class="ff-search"]/button')
        t.wait(3)
        count = t.count('(//li[@data-test="productCard"])')
        if count != 0:
            for i in range(1, min(count, 3) + 1):
                name = t.read(
                    f'(//li[@data-test="productCard"])[{i}]//div[@data-test="information"]/p')
                price = t.read(
                    f'(//li[@data-test="productCard"])[{i}]//div[@data-test="information"]/div'
                ).replace('$', '')
                # discounted cards read "...Off<final price>"; keep only the final price
                if 'Off' in price:
                    price = price.split('Off')[1]
                img = t.read(f'(//li[@data-test="productCard"])[{i}]//img/@src')
                link = "https://www.farfetch.com" + t.read(
                    f'(//li[@data-test="productCard"])[{i}]/a/@href')
                details.append({
                    "email": email,
                    "name": name,
                    "price": price,
                    "img": img,
                    "Company": "Farfetch",
                    "link": link
                })
                # print(f"name: {name}, price: {price} img_source = {img}")
        else:
            details.append({
                "email": email,
                "name": "NA",
                "price": "NA",
                "img": "NA",
                "Company": "Farfetch",
                "link": "NA"
            })

    if gender == ' men':
        t.click('(//span[@class="tabs__span"])[.="Men"]')
        search_and_scrape()
    elif gender == ' women':
        t.click('(//span[@class="tabs__span"])[.="Women"]')
        search_and_scrape()
    else:
        search_and_scrape()
    t.close()
    return details
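# Combined usage sketch: both bots share one result schema, so their outputs
# can simply be concatenated for a combined alert. Since both functions are
# named get_shoe, this assumes a hypothetical layout with each bot in its own
# module (jd_bot.py / farfetch_bot.py are assumptions, not part of the source).
def _example_combined_lookup():
    from jd_bot import get_shoe as jd_get_shoe            # hypothetical module
    from farfetch_bot import get_shoe as ff_get_shoe      # hypothetical module
    query, g, email = 'air max', ' women', 'subscriber@example.com'
    return jd_get_shoe(query, g, email) + ff_get_shoe(query, g, email)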