Example #1
def main(url, mode='txt'):
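    # crawl every page of the CBIRC list, resuming from the page recorded
    # in baojianhui_log.txt if an earlier run was interrupted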
    if os.path.exists('baojianhui_log.txt'):
        with open('baojianhui_log.txt', 'r', encoding='utf-8') as f:
            params = f.read().split(',')
        curr_page = params[0]
    else:  # first run: start from the initial values
        curr_page = 1
    url_link = str(url) + str(curr_page)
    web_init(url_link)
    total_page = int(
        t.read(element_identifier='//div[@class = "ng-binding"][last()]').
        split('/')[-1])
    while int(curr_page) < int(total_page):  # start from page 1, turning pages as we go
        main_operation(url, mode)  # process this page; turn if more pages remain
        print('click once')
        t.click(element_identifier='//a[@ng-click = "pager.next()"]')  # next page
        t.wait(5)
        curr_page = int(
            t.read(element_identifier='//div[@class = "ng-binding"][last()]').
            split('/')[0])
        with open('baojianhui_log.txt', 'w', encoding='utf-8') as f:
            f.write(str(curr_page) + ',' + str(1) + ',' +
                    str(1))  # after turning the page, reset the item indices
    if int(curr_page) == int(total_page):
        main_operation(url, mode)  # on the last page, run main_operation once more
    t.close()
    return True
Example #2
def read_content(page_num, url_prefix, i, today):
    t.url(url_prefix + str(page_num) + '.html')
    # the page loads slowly
    t.wait(2)
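    # dates here are 'YYYY-MM-DD' strings, so lexicographic comparison
    # matches chronological order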
    if t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) +
              ']//span[@class = "hui12"]') < today:
        t.close()
        return '', '', '', ''
    if '' == t.read(element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']'):
        print("no here")
        raise Exception("an exception")
    file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' +
                       str(i) + ']')
    file_name = file_name[:-10] + str("_") + file_name[-10:] + str('.txt')
    time = file_name[-14:-4]
    prefix = 'http://www.pbc.gov.cn'
    content_url = prefix + t.read(
        element_identifier='//td[@colspan = "2"]//table[' + str(i) +
        ']//a/@href')
    if '' == t.read(element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href'):
        print("no here")
        raise Exception("an exception")
    flag = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) +
                  ']//a/@href')  # used to decide whether a download is needed
    return flag, time, content_url, file_name
Example #3
def rpa_process(to_date, preferred_time, from_date, phone_number, token):
    t.init()
    t.url("https://sangam-test-website.herokuapp.com/change_input")
    util.wait_for_pageload('//button[@id="btnsubmit"]')

    t.click('//input[@id="txtHandNo"]')
    t.type('//input[@name="txtHandNo"]', phone_number)
    t.click('//button[@id="btnsubmit"]')

    util.wait_for_pageload('//button[@id="btnsubmit"]')
    from_date_obj = from_date
    from_date = from_date.strftime("%d/%m/%Y")
    t.click('//label[contains(.,"' + str(from_date) + '")]')

    to_date_obj = to_date
    hour = to_date.hour
    minute = to_date.minute
    to_date = to_date.strftime("%d/%m/%Y")

    t.click('//input[@name="txtDateTimePicker"]')
    t.type('//input[@name="txtDateTimePicker"]', to_date)

    t.click('//div[@class="filter-option-inner-inner"]')
    t.click('//a[@role= "option"][.=' + str(hour) + ']')
    t.click('//select[@id="ddlMin"]')
    t.click('//a[@role= "option"][.=' + str(minute) + ']')

    t.click('//button[@id="btnsubmit"]')
    t.close()

    change_appointment_slot(from_date_obj, to_date_obj, token)
Example #4
def propertydata(project_name):

    t.close()
    t.init()
    project_url = f'https://www.propertyguru.com.sg/property-for-sale?market=residential&freetext={project_name}&newProject=all'
    t.url(project_url)
    wait_for_pageload('//div[@class="header-wrapper"]')
    num_result_ad = 3

    # load main page, get detail page url link
    url = [''] * num_result_ad
    for n in [x for x in range(1, num_result_ad + 1)
              if x != 4 and x != 8]:  # skip 4th and 8th advertisement
        wait_for_pageload(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        url[n - 1] = read_if_present(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        print(f"{n}. url = " + url[n - 1])

    property_title = [''] * num_result_ad
    id = [''] * num_result_ad
    pdf = [''] * num_result_ad
    pdf_link = [''] * num_result_ad

    for n in [x for x in range(1, num_result_ad + 1) if x != 4 and x != 8]:
        t.url("https://www.propertyguru.com.sg" + url[n - 1])

        wait_for_pageload('//h1[@class="h2"]')

        property_title[n - 1] = read_if_present('//h1[@class="h2"]')
        print(f"{n}. property_title = " + property_title[n - 1])

        id[n - 1] = read_if_present(
            '//*[@id="details"]/div/div[1]/div[2]/div[10]/div/div[2]')
        print(f"{n}. id = " + id[n - 1])

        pdf[n - 1] = read_if_present(
            '//*[@id="sticky-right-col"]/div[3]/a[2]/@href')
        pdf_link[n - 1] = 'https://www.propertyguru.com.sg' + pdf[n - 1]
        print(f"{n}. pdf_link = " + pdf_link[n - 1])

    property_info = {
        'property_title': property_title,
        'url': ['https://www.propertyguru.com.sg' + x for x in url],
        'id': id,
        'pdf_link': pdf_link,
    }

    df = DataFrame(property_info,
                   columns=['property_title', 'id', 'url', 'pdf_link'])
    df.to_excel('Property Monitor.xlsx', encoding='utf8', index=None)
    print('======== Property Monitor.xlsx saved ==========')
    print(f'======== Monitoring every {interval} second ==========')
Example #5
def rpa_process(lmp_date, doctor_name, preferred_time, phone_number,
                patient_name, symptoms, email, sub_id):
    hour = preferred_time.hour
    minute = preferred_time.minute
    checkup_dates = []
    day_list = [
        45, 75, 105, 135, 165, 195, 210, 225, 240, 255, 262, 269, 275, 280
    ]
    week_list = [6, 10, 14, 18, 22, 26, 28, 30, 32, 34, 36, 37, 38, 39]
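    # each offset in day_list is the day count for the checkup week at the
    # same index in week_list (e.g. day 45 ~ week 6)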
    for day in day_list:
        checkup = lmp_date + timedelta(days=day)
        checkup = str(checkup.day) + "/" + str(checkup.month) + "/" + str(
            checkup.year)
        checkup_dates.append(checkup)
    t.init()

    for index, i in enumerate(checkup_dates):

        t.url("https://sangam-test-website.herokuapp.com/")
        util.wait_for_pageload('//button[@id="btnsubmit"]')

        t.click('//input[@class="form-control"]')
        t.type('//input[@name="name"]', patient_name)

        t.click('//input[@id="email"]')
        t.type('//input[@name="email"]', email)

        symptoms = "Pregnancy checkup after week " + str(week_list[index])

        t.type('//textarea', symptoms)

        t.click('//input[@id="txtHandNo"]')
        t.type('//input[@name="txtHandNo"]', phone_number)

        t.click('//div[@class="filter-option-inner-inner"]')
        t.click('//a[@role= "option"][.=' + str(hour) + ']')
        t.click('//select[@id="ddlMin"]')
        t.click('//a[@role= "option"][.=' + str(minute) + ']')

        t.click('//input[@name="txtDateTimePicker"]')
        t.type('//input[@name="txtDateTimePicker"]', i)

        t.click('//select[@id="txtSpecificDoc"]')
        t.click('//a[@role= "option"][.="' + str(doctor_name) + '"]')

        t.click('//button[@id="btnsubmit"]')

    t.close()
    request_url = "https://sangam-test-website.herokuapp.com/get_future_appointments?email=" + str(
        email)
    future_appointments = requests.get(request_url)
    book_calendar_slot(future_appointments.json()['data'], sub_id)
Example #6
def run():
	conn = util.create_connection("./db/news.db")
	site = util.getSiteByName(conn, "New York Times")
	site_url = site[0][2]
	site_id = site[0][0]

	t.init(visual_automation = True, chrome_browser = True)
	t.url(site_url)
	t.wait(10)
	df = catchContent()
	df = util.fixImgLink(df,"https://cf-templates-fghyux9ggb7t-ap-southeast-1.s3-ap-southeast-1.amazonaws.com/NewYorkTimes.png")
	df = util.fixSummary(df)
	t.wait(20)
	t.close()

	util.updateNews(conn, site_id, df)
Example #7
def run():
    conn = util.create_connection("./db/news.db")
    site = util.getSiteByName(conn, "Today Online")
    site_url = site[0][2]
    site_id = site[0][0]

    t.init(visual_automation=True, chrome_browser=True)
    t.url(site_url)
    t.wait(2)
    t.hover('//div[@class="container footer-main"]')
    t.wait(6)
    df = catchContent()
    t.wait(20)
    t.close()

    util.updateNews(conn, site_id, df)
Example #8
def getFlightExcel(info,ind):

    flight_main, time_lst, code_lst, dur_lst, ind = getFlightInfo(info['dates'], ind)

    #print(flight_main['Details'])
    print(code_lst)
    print(dur_lst)
    print(time_lst)
    k = len(info['dates'])
    q = len(info['city'])

    flight_lst = []
    for i in range(k):
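        # the final leg of a closed multi-city loop flies back to the origin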
        if i == (k-1) and i > 0 and q == k:
            flight_lst.append(info['dates'][i])
            flight = info['city'][i] + '-' + info['city'][0]
            flight_lst.append(flight)
        else:
            flight_lst.append(info['dates'][i])
            flight = info['city'][i] + '-' + info['city'][i + 1]
            flight_lst.append(flight)
    print(flight_lst)


    ### Compare price with Expedia (hyperlink / multi-city support to be added)
    for j in range(2):
        t.close()
        t.init()
        t.wait(0.5)
        flight_search(info)
        t.wait(5)
        flight_main['Flight Info'][j] = flight_lst

        price_exp, url_exp = getExpFlightPrice(code_lst[k*j:k*(j+1)], time_lst[k*j:k*(j+1)], dur_lst[k*j:k*(j+1)])
        print(price_exp)
        print(url_exp)
        print(flight_main['Price'])
        if price_exp != 0 and price_exp < flight_main['Price'][j]:
            flight_main['Price'][j] = price_exp
            flight_main['Hyperlink'][j] = url_exp
    print(flight_main['Price'])
    print(flight_main['Hyperlink'])

    return flight_main
Example #9
File: law_RPA.py  Project: maoyuanqi/iRPA
def history_data(url_prefix, start_page=1):
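    # walk every list page under url_prefix, downloading or reading each
    # document; progress is checkpointed so an interrupted run can resume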
    curr_page = 1
    curr_doc = 1
    try:
        t.init()
        page_file = get_max_page(url_prefix)
        with open(page_file, 'r') as f:
            max_page = int(f.read()) + 1  # max page, +1 because Python ranges exclude the end
        os.remove(page_file)
        for page_num in range(start_page, max_page):
            curr_page = page_num
            count_values_file = get_count_values(page_num, url_prefix)
            with open(count_values_file, 'r') as f:  # number of items on this page
                count_values = int(f.read().split(':')[-1]) + 1
            os.remove(count_values_file)
            for i in range(1, count_values):
                if os.path.exists('complete_log'+str(url_prefix.split('/')[-2])+'.txt'):
                    with open('complete_log' + str(url_prefix.split('/')[-2]) + '.txt', 'r') as f:
                        start_doc = f.read().split(',')[1]
                    if i < int(start_doc):
                        continue
                else:
                    pass
                curr_doc = i
                flag, time, content_url, file_name = read_content(page_num, url_prefix, i)
                if '.html' not in flag:
                    # jumping straight to a downloadable file: pass the current url,
                    # the time suffix, and the current file index
                    direct_download(content_url, time, i)
                else:  # no direct download, so read the web page instead
                    read_text_content(content_url, file_name, page_num, i, time, url_prefix)
            # item loop finished cleanly: advance the completion log to the next
            # page and reset the doc index to 1; nothing to do on the last page.
            if page_num != max_page - 1:
                with open('complete_log' + str(url_prefix.split('/')[-2]) + '.txt', 'w') as f:
                    f.write(str(page_num+1) + ',' + str(1))
            else:
                pass
        t.close()
        return True

    except Exception:  # on any error, checkpoint and stop
        with open('complete_log' + str(url_prefix.split('/')[-2]) + '.txt', 'w') as f:
            f.write(str(curr_page) + ',' + str(curr_doc))  # leave a checkpoint
        t.close()
        return False
Example #10
def rpa_process(from_date, phone_number, token):
    t.init()
    t.url("https://sangam-test-website.herokuapp.com/cancel_input")
    util.wait_for_pageload('//button[@id="btnsubmit"]')

    t.click('//input[@id="txtHandNo"]')
    t.type('//input[@name="txtHandNo"]', phone_number)
    t.click('//button[@id="btnsubmit"]')

    util.wait_for_pageload('//button[@id="btnsubmit"]')
    from_date_obj = from_date
    from_date = from_date.strftime("%d/%m/%Y")

    t.click('//label[contains(.,"' + str(from_date) + '")]')
    t.click('//button[@id="btnsubmit"]')
    t.close()

    cancel_appointment_slot(from_date_obj, token)
Example #11
def flight_search(flight_request):
    search_dt = dt.today()
    request_id = flight_request['Request_ID']
    info = flight_request['Request_Details']
    t.init()
    t.url('https://www.skyscanner.com.sg/')
    tu.wait_for_pageload('//input[@id="fsc-trip-type-selector-return"]')
    fill_search(info)
    ind = 0
    flight_main = getFlightExcel(info, ind)
    t.wait(10.0)
    t.close()
    flight_main.update({
        'Request_ID': request_id,
        'Search_Datetime': search_dt
    })
    dbf.newFlightDeals(flight_main)
    outFile = dbf.export_FlightDeals(request_id, search_dt)
    return outFile
Example #12
def get_news_using_crawler():
    try:
        t.url(
            'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/news'
        )

        wait_for_pageload('//p[@class="heading text-underline"]')

        num_news = t.count('//p[@class="heading text-underline"]')
        if num_news > 5:
            num_news = 5

        delete_news_data_db()
        date_stamp = datetime.datetime.now(
            pytz.timezone('Singapore')).strftime('%Y-%m-%d')

        status = None
        for n in range(1, num_news + 1):
            data = {}
            data['date_stamp'] = date_stamp
            news_link = t.read(
                f'(//p[@class="heading text-underline"])[{n}]/ancestor-or-self::a/@href'
            )
            data['news_link'] = news_link
            news_title = t.read(
                f'(//p[@class="heading text-underline"])[{n}]/ancestor-or-self::a/@aria-label'
            )
            data['news_title'] = news_title
            print('Article', n, ":", news_title)
            print('')
            news_summaries = SummarizeUrl(news_link)
            data['news_summary'] = str(news_summaries)
            print(news_summaries)
            status = insert_db(data)

        return status
    except Exception as e:
        print(e)
    finally:
        t.close()
Example #13
def gethistorylist(input):
    # fetch the data for year xxxx
    input = str(input)
    date_start = input + '-08-01'  # start date of the window (testing with a partial year)
    date_end = input + '-12-31'  # end date of the year

    # initialize the browser
    t.init()
    # open the url
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # wait 5 seconds for the page to load
    t.wait(5)
    # hover over the form and click the collapse-options toggle
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    # move to the issue-date field, click the text box, type the start date, then search
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=date_start)
    # click again so the date picker does not cover the search button below
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # show 50 products per page:
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # sort ascending by issue date, i.e. take the list "in reverse"
    t.hover(element_identifier='//*[@data-sort = "sell_org_date"]')
    t.click(element_identifier='//*[@data-sort = "sell_org_date"]')

    # loop state while the next-page button is still enabled
    page_curr = 1  # current page index
    value_dict = {}  # scraped data
    max_page = 1  # highest page number reached

    # column names
    name_list = ['序号', '综合评级', 'url']

    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialize empty columns

    # loop while pagination works, or while there is only one page
    stop_flag = False

    # run while the current page is not the last one, or there is a single page
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):
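        # the pager marks the current page with class "cur pf-disabled";
        # while it still equals page_curr the last page turn succeeded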
        if stop_flag:  # no data left for this year, so no need to turn the page
            break
        max_page = page_curr
        # number of rows on this page
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1  # +1 because Python ranges exclude the end
        # dump the whole table on this page
        filename = str(input) + str("_") + str(page_curr) + "history_data.csv"
        t.wait(1)  # wait 1 second in case the page is still loading
        t.table(
            element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table',
            filename_to_save=filename)
        # scrape the current page (title and href only)
        for i in range(1, count_values):
            # keep every row dated on or before Dec 31 of the target year
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[@class = "px"]')) <= date_end:
                # print("number {} is running".format(str(i)))
                #爬取产品名称作为primary key,之后join用:
                # 产品序号
                value_dict[name_list[0]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[2]'))
                # overall rating
                value_dict[name_list[1]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[12]//i/@title'))
                # url
                value_dict[name_list[2]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//a/@href'))

            else:  # past the target year: set the flag so the while loop ends
                stop_flag = True
                #    print("thread stops here..")
                break
        # turn the page
        page_curr += 1
        # print("turn the page..")
        # hover and click the next-page link
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')

    # close the TagUI flow
    t.close()
    # output file: '<year>.csv'
    hist_data = pd.DataFrame(value_dict)
    # save as csv (an xlsx copy was planned but is not written)
    hist_data.to_csv(input + ".csv", encoding='UTF-8', index=False)
    return max_page
Example #14
def close(self):
    t.close()
Example #15
def getdailyincrement(str_to_append):
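    # scrape the products issued on str_to_append (a 'YYYY-MM-DD' date) from
    # bank.jrj.com.cn and save them to '<date>.csv'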

    # initialize the browser
    t.init()
    # open the url
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # wait 15 seconds for the page to load
    t.wait(15)
    # hover over the form and click the collapse-options toggle
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    # move to the issue-date field, click the text box, type the target date, then search
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=str_to_append)
    # click again so the date picker does not cover the search button below
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # show 50 products per page:
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')

    # loop state while the next-page button is still enabled
    page_curr = 1  # current page index
    value_dict = {}  # scraped data
    count = 1  # used to name the csv files
    # column names
    name_list = ['序号', '综合评级', 'url']

    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialize empty columns

    # loop while pagination works, or while there is only one page
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):

        # number of rows on this page
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1  # +1 because Python ranges exclude the end
        # scrape every value in this page's table
        if str(
                t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                       str(count_values - 1) +
                       ']//td[@class = "px"]')) > str_to_append:
            # print("direct continue..")
            # 翻页
            page_curr += 1
            # 鼠标模拟移动,并点击翻页
            t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
            t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
            continue
        filename = str(count) + "daily_data.csv"
        count += 1
        t.wait(1)  # wait 1 second in case the page is still loading
        t.table(
            element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table',
            filename_to_save=filename)
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1  # +1 because Python ranges exclude the end
        for i in range(1, count_values):
            # keep rows issued on the target date; if even the bottom row
            # is newer than that date, turn the page immediately
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(count_values - 1) +
                           ']//td[@class = "px"]')) > str_to_append:
                # print("direct break..")
                break
            else:
                if str(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) +
                            ']//td[@class = "px"]')) == str_to_append:
                    # product number
                    value_dict[name_list[0]].append(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) + ']/td[2]'))
                    # overall rating
                    value_dict[name_list[1]].append(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) + ']//td[12]//i/@title'))
                    #url
                    value_dict[name_list[2]].append(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) + ']//a/@href'))
                else:  # not part of today's increment: do nothing
                    pass
        # print("turn the page..")
        # 翻页
        page_curr += 1
        # 鼠标模拟移动,并点击翻页
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')

    # close the TagUI flow
    t.close()
    # output file: '<today's date>.csv'
    today_data = pd.DataFrame(value_dict)
    today_data.to_csv(str_to_append + ".csv", index=False, encoding='UTF-8')
    return count - 1
Example #16
def main_operation(url, mode='txt'):
    # current page
    curr_page = int(
        t.read(element_identifier='//div[@class = "ng-binding"][last()]').
        split('/')[0])
    # count the list blocks
    list_count = t.count(
        element_identifier='//div[@class = "list caidan-right-list"]'
    )  # number of list blocks to iterate over
    # if resuming from a checkpoint, read the saved position
    if os.path.exists('baojianhui_log.txt'):
        with open('baojianhui_log.txt', 'r', encoding='utf-8') as f:
            params = f.read().split(',')
        curr_page = params[0]
        start_i = params[1]
        start_j = params[2]
    else:  # first run: start from the initial values
        start_i = 1
        start_j = 1
    # main loop
    for i in range(1, list_count + 1):
        t.wait(3)
        if i < int(start_i):
            continue
        item_count = t.count(
            element_identifier='//div[@class = "list caidan-right-list"][' +
            str(i) +
            ']//div[@class = "panel-row ng-scope"]')  # number of regulation items in this list
        print('list {} has {} items'.format(str(i), str(item_count)))
        t.wait(3)
        for j in range(1, item_count + 1):
            if j < int(start_j):
                continue
            item_title = t.read(
                element_identifier='//div[@class = "list caidan-right-list"]['
                + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//a')
            time_suffix = t.read(
                element_identifier='//div[@class = "list caidan-right-list"]['
                + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//span[@class = "date ng-binding"]')
            file_name = item_title + '_' + time_suffix + '.txt'
            if '/' in file_name:
                file_name = file_name.replace('/', ' ')
            if mode == 'txt':
                # open the item
                link = t.read(element_identifier=
                              '//div[@class = "list caidan-right-list"][' +
                              str(i) +
                              ']//div[@class = "panel-row ng-scope"][' +
                              str(j) + ']//a/@ng-href')
                prefix = 'http://www.cbirc.gov.cn/cn/view/pages/'
                final_link = prefix + link
                t.url(final_link)
                t.wait(1)
                while not os.path.exists(file_name):
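                    # try each known content container in turn and write
                    # whichever is non-empty (fall back to a blank file)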
                    type_1 = t.read(
                        element_identifier='//div[@class = "Section0"]'
                    ) + t.read(element_identifier='//div[@class = "Section1"]')
                    type_2 = t.read(
                        element_identifier='//div[@class = "WordSection1"]')
                    type_3 = t.read(
                        element_identifier=
                        '//div[@class = "wenzhang-content ng-binding"]')
                    if type_1 != '':
                        content = type_1
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(content)
                        break
                    elif type_2 != '':
                        content = type_2
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(content)
                        break
                    elif type_3 != '':
                        content = type_3
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(content)
                        break
                    else:
                        content = ' '
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(content)
                        break
            elif mode == 'doc':
                t.click(element_identifier=
                        '//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.docFileUrl)"]')
                doc_id = t.read(element_identifier=
                                '//div[@class = "list caidan-right-list"][' +
                                str(i) +
                                ']//div[@class = "panel-row ng-scope"][' +
                                str(j) + ']//a/@ng-href').split('=')[1][:-7]
                doc_name = doc_id + '.doc'
                curr_clock = 5
                while not os.path.exists(doc_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(2)
                os.rename(doc_name, item_title + '_' + time_suffix + '.doc')
            elif mode == 'pdf':
                t.click(element_identifier=
                        '//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.pdfFileUrl)"]')
                pdf_id = t.read(element_identifier=
                                '//div[@class = "list caidan-right-list"][' +
                                str(i) +
                                ']//div[@class = "panel-row ng-scope"][' +
                                str(j) + ']//a/@ng-href').split('=')[1][:-7]
                pdf_name = pdf_id + '.pdf'
                curr_clock = 5
                while not os.path.exists(pdf_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(2)
                os.rename(pdf_name, item_title + '_' + time_suffix + '.pdf')
            else:
                print('unknown format..')
                t.close()
                raise Exception("unknown input mode")
            # return to the main page
            t.url(url + str(curr_page))
            t.wait(2)
            with open('baojianhui_log.txt', 'w', encoding='utf-8') as f:
                f.write(str(curr_page) + ',' + str(i) + ',' + str(j))
        with open('baojianhui_log.txt', 'w', encoding='utf-8') as f:
            f.write(str(curr_page) + ',' + str(i) + ',' +
                    str(1))  # list finished: reset j to 1
Example #17
def compliance_data(url_prefix):
    t.init()
    init_url = url_prefix + '1.html'
    t.url(init_url)  # initial url
    max_page = int(
        t.read(element_identifier='//td[@class = "Normal"]').split('/')
        [1]) + 1  # max page count
    for page_num in range(1, max_page):
        t.url(url_prefix + str(page_num) + '.html')
        print("现在所在页面 {}".format(page_num))
        t.wait(5)
        # get the row count
        count_values = t.count(
            element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())
        # today = '2018-04-24'
        if t.read(element_identifier=
                  '//td[@colspan = "2"]//table[1]//span[@class = "hui12"]'
                  ) < today:
            print("今日无增量")
            break
        print("页面有{}个文件".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')
            if t.read(element_identifier='//td[@colspan = "2"]//table[' +
                      str(i) + ']//span[@class = "hui12"]') < today:
                t.close()
                exit(1)
            file_name = t.read(
                element_identifier='//td[@colspan = "2"]//table[' + str(i) +
                ']') + str('.txt')
            prefix = 'http://www.pbc.gov.cn'
            content_url = prefix + t.read(
                element_identifier='//td[@colspan = "2"]//table[' + str(i) +
                ']//a/@href')
            if 'cnhttp' in content_url:
                content_url = content_url[21:]  # the prefix gets duplicated for unknown reasons; strip it
                t.url(content_url)
                text = t.read(element_identifier='//div[@id = "zoom"]')
                with open(file_name, 'w') as f:
                    f.write(text)
                print("文件{} 是文档。".format(i))
                continue
            t.url(content_url)  # open the detail page

            # count the pdfs and get their current and intended file names
            t.wait(2)
            pdf_count = t.count(
                element_identifier='//div[@id = "zoom"]//a/@href')
            if pdf_count == 0:  # an ordinary text document
                # got the listing
                print("file {} is a text document.".format(i))
                # read the text
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
            elif ('pdf' in t.read(
                    element_identifier='//div[@id = "zoom"]//a/@href')):
                print("文件{} 含有 {} 个pdf。".format(i, pdf_count))
                pdf_count += 1  #python从0开始,所以至少有一个pdf count
                for j in range(1, pdf_count):
                    # get the pdf's file name
                    if t.read(element_identifier='//div[@id = "zoom"]//p[' +
                              str(j) + ']//a/@href') != '':
                        print("processing pdf {}..".format(j))
                        pdf_name = t.read(
                            element_identifier='//div[@id = "zoom"]//p[' +
                            str(j) + ']//a/@href').split('/')[-1]
                        # get the compliant display name
                        pdf_name_to_change = t.read(
                            element_identifier='//div[@id = "zoom"]//p[' +
                            str(j) + ']//a')
                        # download
                        prefix = 'http://www.pbc.gov.cn'
                        t.url(prefix + t.read(
                            element_identifier='//div[@id = "zoom"]//p[' +
                            str(j) + ']//a/@href'))
                        wait_seconds = 1
                        total_seconds = 0
                        while not os.path.exists(pdf_name):
                            t.wait(wait_seconds)
                            total_seconds += wait_seconds
                            if total_seconds > 30:
                                print('download fails')
                                break
                        os.rename(pdf_name, pdf_name_to_change)  # rename
                        t.url(content_url)  # back to the detail page
                    else:
                        print("不合规,当文档处理!不读了!!!")
                        # 取text
                        if t.read(element_identifier='//div[@id = "zoom"]'
                                  ) != '':
                            text = t.read(
                                element_identifier='//div[@id = "zoom"]')
                            with open(file_name, 'w') as f:
                                f.write(text)
                        elif t.read(element_identifier='//td[@class = "p1"]'
                                    ) != '':
                            text = t.read(
                                element_identifier='//td[@class = "p1"]')
                            with open(file_name, 'w') as f:
                                f.write(text)
                        else:
                            print("write files fails...")
                        t.url(url_prefix + str(page_num) + '.html')
                        break
            else:
                print("文件{} 含有 {} 个pdf。".format(i, pdf_count))
                print("含有其他format的href,当文档处理!不读了!!!")
                # 取text
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
                t.url(url_prefix + str(page_num) + '.html')
                break
    t.close()
Example #18
def make_reservation(reservation_date, reservation_time, party_size, restaurant_name, first_name, last_name, email_address, phone_number):
    try:
        #Convert User Defined Values to System Usable Values
        reservation_day=reservation_date.split('/')[0]
        reservation_month =reservation_date.split('/')[1]
        reservation_month=int(reservation_month)-1
        reservation_year =reservation_date.split('/')[2]
        reservation_time_int=int(reservation_time)
        start_time_hr= reservation_time[:2]
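        # convert the 24-hour HHMM input into the "h:mm am/pm" text shown
        # in the site's time dropdown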
        if reservation_time_int>1159:
            if start_time_hr!="12":
                start_time_hr=int(start_time_hr)-12
            start_time_option = str(start_time_hr)+":"+reservation_time[2:4]+" pm"
        else:
            start_time_option = str(start_time_hr)+":"+reservation_time[2:4]+" am"
            
        #Booking Parameters
        chope_url ='https://www.chope.co/singapore-restaurants/category/restaurant/'
        t.init()
        t.url(chope_url)
        t.wait(10)
        #Date Field
        t.click(f"(//span[contains(@class,'input-group-addon icon-calendar')])[1]")
        t.wait(7)
        boolean_flag=1
        while boolean_flag:
            if t.present(f"//td[@data-handler='selectDay'and @data-year='{reservation_year}' and @data-month='{reservation_month}']/a[text()='{reservation_day}']"):
                t.click(f"//td[@data-handler='selectDay'and @data-year='{reservation_year}' and @data-month='{reservation_month}']/a[text()='{reservation_day}']")
                boolean_flag=0
            else:
                t.click('//a[@title="Next"]')
        t.click(f"//td[@data-handler='selectDay'and @data-month='{reservation_month}']/a[text()='{reservation_day}']")
        #Time Field
        t.select(f"//select[contains(@id,'time-field')]",start_time_option)
        #Number of Diners Field
        t.click(f"(//span[contains(@class,'input-group-addon icon-person')])[1]")
        t.select(f"//select[contains(@id,'adults')]",party_size)
        #Restaurant Field
        t.type(f"//select[contains(@id,'sb-sel-restaurant')]",restaurant_name)
        t.click('//button[@id="btn-search"]')
        t.wait(5)
        
        #Secondary Page to Confirm Timing
        t.click(f"//a[contains(@rname,'{restaurant_name}') and text()='{start_time_option}']")
        t.wait(5)
        t.click(f"//input[@id='btn_sub' and @value='Book Now']")
        t.wait(5)
        
        #Booking Confirmation
        t.popup('https://book.chope.co/')
        #First Name
        t.type('//input[@id="forename"]',first_name)
        #Last Name
        t.type('//input[@id="surname"]',last_name)
        #Email
        t.type('//input[@id="email"]',email_address)
        #Phone Number
        t.type('//input[@id="telephone"]',phone_number)
        #Agree Terms & Conditions
        if t.present(f"//input[@name='agree_term_conditions']"):
            t.click(f"//input[@name='agree_term_conditions']")
        #Confirm Booking
        t.click(f"//button[@id='check_book_now']")
        t.wait(5)
        t.close()
        print('Success')
        schedule_reservation(reservation_date,reservation_time,party_size,restaurant_name,first_name,sample_restaurant_address)
        return 'Reservation Successful'
    except:
        print('Error')
        return 'Reservation Unsuccessful. Unfortunately, the restaurant was not able to accommodate your reservation.'
Example #19
def history_data(url_prefix):
    t.init()
    init_url = url_prefix + '1.html'
    t.url(init_url)
    max_page = int(
        t.read(element_identifier='//td[@class = "Normal"]').split('/')[1]) + 1
    for page_num in range(1, max_page):
        # main page
        t.url(url_prefix + str(page_num) + '.html')
        print("现在所在页面 {}".format(page_num))
        t.wait(5)
        # get the row count
        count_values = t.count(
            element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())
        if t.read(element_identifier=
                  '//td[@colspan = "2"]//table[1]//span[@class = "hui12"]'
                  ) < today:
            print("今日无增量")
            break
        print("页面有{}个文件".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')
            if t.read(element_identifier='//td[@colspan = "2"]//table[' +
                      str(i) + ']//span[@class = "hui12"]') < today:
                break
            if '.html' in t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href'):
                # got the listing
                print("file {} is a text document.".format(i))
                file_name = t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']') + str('.txt')
                prefix = 'http://www.pbc.gov.cn'
                content_url = prefix + t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//td//a/@href')
                # open the url
                if content_url == 'http://www.pbc.gov.cnhttp://www.pbc.gov.cn/goutongjiaoliu/113456/113469/3487563/index.html':
                    content_url = 'http://www.pbc.gov.cn/goutongjiaoliu/113456/113469/3487563/index.html'  # the prefix gets duplicated for unknown reasons
                t.url(content_url)
                # read the text
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
            elif '.doc' in t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href'):
                # got the data
                print("file {} is a doc download.".format(i))
                file_name = t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href').split('/')[-1]
                prefix = 'http://www.pbc.gov.cn'
                content_url = prefix + t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href')
                t.url(content_url)
                wait_seconds = 1
                total_seconds = 0
                while not os.path.exists(file_name):
                    t.wait(wait_seconds)
                    total_seconds += wait_seconds
                    if total_seconds > 30:
                        print('download fails')
                        break
            else:
                print("unknown format..")
            print("爬好一次,返回页面 {}".format(page_num))
    #close out
    t.close()
Example #20
def main_operation(url, mode='txt'):
    # current page
    curr_page = int(
        t.read(element_identifier='//div[@class = "ng-binding"][last()]').
        split('/')[0])
    # count the list blocks
    list_count = t.count(
        element_identifier='//div[@class = "list caidan-right-list"]'
    )  # number of list blocks to iterate over
    # if resuming from a checkpoint, read the saved position
    if os.path.exists('baojianhui_log.txt'):
        with open('baojianhui_log.txt', 'r') as f:
            params = f.read().split(',')
        curr_page = params[0]
        start_i = params[1]
        start_j = params[2]
    else:  # first run: start from the initial values
        start_i = 1
        start_j = 1
    # main loop
    for i in range(1, list_count + 1):
        t.wait(5)
        if i < int(start_i):
            continue
        item_count = t.count(
            element_identifier='//div[@class = "list caidan-right-list"][' +
            str(i) +
            ']//div[@class = "panel-row ng-scope"]')  # number of regulation items in this list
        print('list {} has {} items'.format(str(i), str(item_count)))
        t.wait(5)
        for j in range(1, item_count + 1):
            if j < int(start_j):
                continue
            item_title = t.read(
                element_identifier='//div[@class = "list caidan-right-list"]['
                + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//a')
            time_suffix = t.read(
                element_identifier='//div[@class = "list caidan-right-list"]['
                + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//span[@class = "date ng-binding"]')
            if str(time_suffix) != str((datetime.datetime.today()).date(
            )):  # not today's date: the day's increment is done, so return
                print("today's increment is complete")
                return True, 'no new items today'
            file_name = item_title + '_' + time_suffix + '.txt'
            if '/' in file_name:
                file_name = file_name.replace('/', ' ')
            if mode == 'txt':
                # open the item
                link = t.read(element_identifier=
                              '//div[@class = "list caidan-right-list"][' +
                              str(i) +
                              ']//div[@class = "panel-row ng-scope"][' +
                              str(j) + ']//a/@ng-href')
                prefix = 'http://www.cbirc.gov.cn/cn/view/pages/'
                final_link = prefix + link
                t.url(final_link)
                t.wait(1)
                while not os.path.exists(file_name):
                    # type_1 = t.read(element_identifier='//div[@class = "Section0"]') + t.read(element_identifier='//div[@class = "Section1"]')
                    # type_2 = t.read(element_identifier='//div[@class = "WordSection1"]')
                    # type_3 = t.read(element_identifier='//div[@class = "wenzhang-content ng-binding"]')
                    if t.read(
                            element_identifier='//div[@class = "Section0"]'
                    ) + t.read(element_identifier='//div[@class = "Section1"]'
                               ) != '':
                        # Section0 paragraphs
                        p_counts_section0 = t.count(
                            element_identifier='//div[@class = "Section0"]//p')
                        content_list = []
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(file_name.split("_")[0] + "\n")
                        for p in range(1, p_counts_section0 + 1):
                            content_list.append(
                                t.read(element_identifier=
                                       '//div[@class = "Section0"]//p[' +
                                       str(p) + ']'))
                        with open(file_name, 'a', encoding='utf-8') as f:
                            f.writelines(
                                [content + "\n" for content in content_list])
                            # Section1 paragraphs
                        p_counts_section1 = t.count(
                            element_identifier='//div[@class = "Section1"]//p')
                        content_list = []
                        for p in range(1, p_counts_section1 + 1):
                            content_list.append(
                                t.read(element_identifier=
                                       '//div[@class = "Section1"]//p[' +
                                       str(p) + ']'))
                        with open(file_name, 'a', encoding='utf-8') as f:
                            f.writelines(
                                [content + "\n" for content in content_list])
                        break
                    elif t.read(
                            element_identifier='//div[@class = "WordSection1"]'
                    ) != '':
                        p_counts = t.count(element_identifier=
                                           '//div[@class = "WordSection1"]//p')
                        if p_counts <= 1:
                            content_list = t.read(
                                element_identifier=
                                '//div[@class = "WordSection1"]//p')
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([
                                    content + "\n"
                                    for content in content_list.split("     ")
                                ])
                        else:
                            content_list = []
                            for p in range(1, p_counts + 1):
                                content_list.append(
                                    t.read(
                                        element_identifier=
                                        '//div[@class = "WordSection1"]//p[' +
                                        str(p) + ']'))
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([
                                    content + "\n" for content in content_list
                                ])
                        break
                    elif t.read(element_identifier=
                                '//div[@class = "wenzhang-content ng-binding"]'
                                ) != '':
                        # more than one <p>: read each paragraph;
                        # at most one <p>: read the single block and split it
                        p_counts = t.count(
                            element_identifier=
                            '//div[@class = "wenzhang-content ng-binding"]//p')
                        if p_counts <= 1:
                            content_list = t.read(
                                element_identifier=
                                '//div[@class = "wenzhang-content ng-binding"]//p'
                            )
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([
                                    content + "\n"
                                    for content in content_list.split("     ")
                                ])
                        else:
                            content_list = []
                            for p in range(1, p_counts + 1):
                                content_list.append(
                                    t.read(
                                        element_identifier=
                                        '//div[@class = "wenzhang-content ng-binding"]//p['
                                        + str(p) + ']'))
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([
                                    content + "\n" for content in content_list
                                ])
                        break
                    else:
                        content = ' '
                        with open(file_name, 'w') as f:
                            f.write(content)
                        break
            elif mode == 'doc':
                t.click(element_identifier=
                        '//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.docFileUrl)"]')
                doc_id = t.read(element_identifier=
                                '//div[@class = "list caidan-right-list"][' +
                                str(i) +
                                ']//div[@class = "panel-row ng-scope"][' +
                                str(j) + ']//a/@ng-href').split('=')[1][:-7]
                doc_name = doc_id + '.doc'
                curr_clock = 5
                while not os.path.exists(doc_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(5)
                os.rename(doc_name, item_title + '_' + time_suffix + '.doc')
            elif mode == 'pdf':
                t.click(element_identifier=
                        '//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.pdfFileUrl)"]')
                pdf_id = t.read(element_identifier=
                                '//div[@class = "list caidan-right-list"][' +
                                str(i) +
                                ']//div[@class = "panel-row ng-scope"][' +
                                str(j) + ']//a/@ng-href').split('=')[1][:-7]
                pdf_name = pdf_id + '.pdf'
                curr_clock = 5
                while not os.path.exists(pdf_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(5)
                os.rename(pdf_name, item_title + '_' + time_suffix + '.pdf')
            else:
                print('unknown format..')
                t.close()
                raise Exception("unknown input mode")
            # return to the main page
            t.url(url + str(curr_page))
            t.wait(5)
            with open('baojianhui_log.txt', 'w') as f:
                f.write(str(curr_page) + ',' + str(i) + ',' + str(j))
        with open('baojianhui_log.txt', 'w') as f:
            f.write(str(curr_page) + ',' + str(i) + ',' +
                    str(1))  # list finished: reset j to 1
Example #21
def propertydata_update(project_name):

    df1 = pd.read_excel('Property Monitor.xlsx')

    t.close()
    t.init()
    project_url = f'https://www.propertyguru.com.sg/property-for-sale?market=residential&freetext={project_name}&newProject=all'
    print(project_url)
    t.url(project_url)
    wait_for_pageload('//div[@class="header-wrapper"]')
    num_result_ad = 3

    # load main page, get detail page url link
    url = [''] * num_result_ad
    id = [''] * num_result_ad

    for n in [x for x in range(1, num_result_ad + 1)
              if x != 4 and x != 8]:  # skip 4th and 8th advertisement
        wait_for_pageload(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        url[n - 1] = read_if_present(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        print(f"{n}. url = " + url[n - 1])
        id[n - 1] = read_if_present(
            f'(//*[@id="wrapper-inner"]/section[1]/div[2]/div[1]/div[2]/div[2]/section/div[2]/div[{n}]/@data-listing-id)'
        )

    print(f'searching: {id}')  # ['22036842', '21725956', '20648962']
    id_int = list(df1['id'])
    id_str = list()
    for n in id_int:
        id_str.append(str(n))
    print(id_str)

    new_url = list()
    for n in id:
        if n not in id_str:
            print(f'new property found: {n}')
            u = f"https://www.propertyguru.com.sg/listing/{n}/for-sale-{project_name}"
            new_url.append(u)
    if not new_url:
        print('======== no new property found! ==========')
        return

    print('======== new property found ==========')
    property_title = [''] * len(new_url)
    pdf = [''] * len(new_url)
    pdf_link = [''] * len(new_url)

    for (n, i) in zip(new_url, range(1, len(new_url) + 1)):
        t.url(n)
        wait_for_pageload('//h1[@class="h2"]')
        property_title[i - 1] = read_if_present('//h1[@class="h2"]')
        print(f"{i}. property_title = " + property_title[i - 1])
        pdf[i - 1] = read_if_present(
            '//*[@id="sticky-right-col"]/div[3]/a[2]/@href')
        pdf_link[i - 1] = 'https://www.propertyguru.com.sg' + pdf[i - 1]
        print(f"{i}. pdf_link = " + pdf_link[i - 1])

    property_info = {
        'property_title': property_title,
        'url': ['https://www.propertyguru.com.sg' + x for x in url],
        'id': id,
        'pdf_link': pdf_link,
    }
    df2 = DataFrame(property_info,
                    columns=['property_title', 'id', 'url', 'pdf_link'])

    new_df = pd.concat([df1, df2])
    new_df.to_excel('Property Monitor.xlsx', encoding='utf8', index=None)
    print('======== Property Monitor.xlsx update ==========')

    pdf_filename = download_pdf(property_title, pdf_link, id)
    mail_subscription(input_email, input_name, pdf_filename)
Example #22
# use snap() to save screenshot of page or UI element
# page = web page, page.png = computer screen
t.snap('page', 'results.png')
t.snap('logo', 'logo.png')

# another example of interacting with a web page
# include http:// or https:// in URL parameter
t.url('https://duckduckgo.com')
t.type('search_form_input_homepage',
       'The search engine that doesn\'t track you.')
t.snap('page', 'duckduckgo.png')
t.wait(4.4)

# use close() to close TagUI process and web browser
# if you forget to close, just close() next time
t.close()

# in above web automation example, web element identifier can be XPath selector, CSS selector or
# attributes id, name, class, title, aria-label, text(), href, in decreasing order of priority
# if you don't mind using ugly and less robust XPath, it can be copied from Chrome inspector
# otherwise recommend googling on writing XPath manually, or simply make use of attributes

# visual element identifiers are also supported, using a .png or .bmp image snapshot
# representing the UI element (works for desktop applications or the web browser)
# e.g. t.click('start_menu.png'), t.type('username_box.png', 'Sonic')

# image transparency (0% opacity) is supported, i.e. images with empty sections
# t.read('image_preview_frame.png'), t.snap('application_window_frame.png')

# visual element identifiers can also be x, y coordinates of elements on the screen
# e.g. t.click(600, 300), t.type(600, 300, 'Mario'), t.select(600, 300, 600, 400)
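
# a short sketch of the identifier styles described above; the targets are
# hypothetical and assume an open session via t.init() and import tagui as t
t.click('//button[@id="submit"]')  # XPath selector
t.click('username')                # attribute match (id, name, class, ...)
t.click('Sign in')                 # text() match on a link or button
t.click('start_menu.png')          # visual match from an image snapshot
t.click(600, 300)                  # x, y screen coordinates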
示例#23
0
def url2png(url):
    t.init()
    t.url(url)
    # t.type('q', 'decentralization[enter]')
    t.snap('page', 'results-' + str(uuid.uuid1()) + '.png')
    t.close()


def getblanklist():
    # initialize the page
    t.init()
    # open the target URL
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # start the search directly, with no filter conditions
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # set the display size to 50 products per page:
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # sort by issue date ascending, i.e. "take them in reverse", so rows with a blank issue date come first
    t.hover(element_identifier='//*[@data-sort = "sell_org_date"]')
    t.click(element_identifier='//*[@data-sort = "sell_org_date"]')

    # loop state used while the next-page button is not disabled
    page_curr = 1  # current page index
    max_page = 1  # highest page number seen

    value_dict = {}  # holds the scraped data
    # column names: serial number, overall rating, url
    name_list = ['序号', '综合评级', 'url']

    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialize empty lists

    # loop while the page can still be turned, or when there is only one page of data
    stop_flag = False  # flag = True means all the data we need has been fetched, so stop paging
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):

        if stop_flag:  # no more blank-date rows, so no need to keep paging
            break
        max_page = page_curr
        # number of data rows on this page
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1  # Python counts from 0
        # dump every value in this page's table
        filename = str(page_curr) + "blank_date.csv"
        t.wait(1)  # wait 1 second in case the page mis-loads
        t.table(
            element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table',
            filename_to_save=filename)

        # scrape the current page (title and href only)
        for i in range(1, count_values):
            # condition: enter this branch when the issue date is blank (--)
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[@class = "px"]')) == '--':
                # print("number {} is running".format(str(i)))
                # serial number
                value_dict[name_list[0]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[2]'))
                # overall rating
                value_dict[name_list[1]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[12]//i/@title'))
                # url
                value_dict[name_list[2]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//a/@href'))

            else:  # once a non-blank (--) issue date appears, this pass is done; set the flag to end the while loop
                stop_flag = True
                # print("thread stops here..")
                break
        # turn the page
        page_curr += 1
        # print("turn the page..")
        # simulate a mouse move and click the next page link
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')

    # close the TagUI flow
    t.close()

    # output file: "blank_date.csv"
    hist_data = pd.DataFrame(value_dict)
    hist_data.to_csv("blank_date.csv", index=False, encoding='UTF-8')
    return max_page
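
# A reusable sketch (not part of the original script) of the paging pattern in
# getblanklist(): yield page numbers while the pager footer still marks the
# current page as active, then hover and click the next page's link. Assumes
# the same import tagui as t and the same page structure as above.
def iter_pages():
    page = 1
    while (page == 1 or t.read(
            element_identifier='//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]'
    ) == str(page)):
        yield page
        page += 1
        t.hover(element_identifier='//*[@href="' + str(page) + '"]')
        t.click(element_identifier='//*[@href="' + str(page) + '"]')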
示例#25
0
def history_data_daily(url_prefix):
    t.init()
    init_url = url_prefix + '1.html'
    t.url(init_url)  # initial URL
    max_page = int(t.read(element_identifier='//td[@class = "Normal"]').split('/')[1]) + 1  # max number of pages
    for page_num in range(1, max_page):
        t.url(url_prefix + str(page_num) + '.html')
        print("现在所在页面 {}".format(page_num))
        t.wait(5)
        # 拿到value
        count_values = t.count(element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())
        # today = '2018-04-24'
        if t.read(element_identifier='//td[@colspan = "2"]//table[1]//span[@class = "hui12"]') < today:
            print("今日无增量")
            break
        print("页面有{}个文件".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')
            if t.read(element_identifier='//td[@colspan = "2"]//table['+str(i)+']//span[@class = "hui12"]') < today:
                # reached entries older than today: stop the whole run
                t.close()
                exit(1)
            file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']')
            file_name = file_name[:-10] + str("_") + file_name[-10:] + str('.txt')
            time = file_name[-14:-4]
            prefix = 'http://www.pbc.gov.cn'
            content_url = prefix + t.read(
                element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
            if '.html' not in t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href'):
                # the link points straight at a downloadable file
                print("File {} is a direct download.".format(i))
                if 'cnhttp' in content_url:
                    # the href was already absolute, so prepending the prefix doubled it
                    # ('http://www.pbc.gov.cnhttp://...'); strip the 21-character prefix
                    content_url = content_url[21:]
                file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
                suffix = file_name.split('.')[-1]
                file_name = file_name.split('/')[-1]
                t.url(content_url)
                # poll until the browser has saved the file, up to 30 seconds
                wait_seconds = 1
                total_seconds = 0
                while not os.path.exists(file_name):
                    t.wait(wait_seconds)
                    total_seconds += wait_seconds
                    if total_seconds > 30:
                        print('download fails')
                        break
                # stamp the date into the file name, just before the suffix
                os.rename(file_name, file_name[:-(len(suffix) + 1)] + "_" + time + file_name[-(len(suffix) + 1):])

            else:  # no direct download; open the detail page first
                if 'cnhttp' in content_url:
                    content_url = content_url[21:]  # strip the doubled URL prefix
                t.url(content_url)
                # count the attachments and work out the names they should have
                t.wait(2)
                pdf_count = t.count(element_identifier='//div[@id = "zoom"]//a/@href')
                if pdf_count == 0:  # an ordinary text document
                    print("File {} is a document.".format(i))
                # extract the article text (done whether or not there are attachments)
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("failed to write the file...")
                if pdf_count != 0:
                    print("File {} has {} attachments to download.".format(i, pdf_count))
                    pdf_count += 1  # Python counts from 0, so shift the range bound
                    current_count = 0
                    for j in range(1, pdf_count):
                        # get the attachment's file name
                        if '.htm' not in t.read(element_identifier='//div[@id = "zoom"]//p//a/@href'):
                            print("Processing attachment {}..".format(j))
                            p_count = t.count(element_identifier='//div[@id = "zoom"]//p')
                            while current_count <= p_count:

                                if t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a') != '':
                                    # found a link in this <p>
                                    print("this <p> has a link!")
                                    pdf_name = t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a/@href')
                                    # the link's display text, used as the proper file name
                                    pdf_name_to_change = t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a')
                                    # download the attachment
                                    suffix = pdf_name.split('.')[-1]

                                    pdf_name = pdf_name.split('/')[-1]
                                    prefix = 'http://www.pbc.gov.cn'
                                    download_link = prefix + t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a/@href')
                                    if 'cnhttp' in download_link:
                                        # the href was already absolute; use it as-is
                                        t.url(t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a/@href'))
                                    else:
                                        t.url(download_link)
                                    wait_seconds = 1
                                    total_seconds = 0
                                    while os.path.exists(pdf_name) == False:
                                        t.wait(wait_seconds)
                                        total_seconds += wait_seconds
                                        if total_seconds > 30:
                                            print('download fails')
                                            break
                                    os.rename(pdf_name, pdf_name_to_change)  # rename to the display name
                                    os.rename(pdf_name_to_change,
                                              pdf_name_to_change[:-(len(suffix)+1)] + '_' + time + pdf_name_to_change[-(len(suffix)+1):])
                                    t.url(content_url)  # go back to the detail page
                                    current_count += 1
                                    break
                                else:
                                    current_count += 1
                                    print("no link in this <p>")

                        else:
                            print("It is a web page; treat it as a document!")
                            prefix = 'http://www.pbc.gov.cn'
                            download_link = prefix + t.read(
                                element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a/@href')
                            if 'cnhttp' in download_link:
                                # the href was already absolute; use it directly
                                t.url(t.read(element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a/@href'))
                            else:
                                t.url(download_link)
                            # extract the text
                            if t.read(element_identifier='//div[@id = "zoom"]') != '':
                                text = t.read(element_identifier='//div[@id = "zoom"]')
                                with open(file_name, 'w') as f:
                                    f.write(text)
                            elif t.read(element_identifier='//td[@class = "p1"]') != '':
                                text = t.read(element_identifier='//td[@class = "p1"]')
                                with open(file_name, 'w') as f:
                                    f.write(text)
                            else:
                                print("write files fails...")

    t.close()
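
# The download steps above poll the filesystem until the browser finishes
# saving the file. A reusable sketch of that pattern (hypothetical helper,
# not part of the original script; assumes import os and import tagui as t):
def wait_for_file(path, timeout=30, interval=1):
    # return True once path exists, or False after roughly timeout seconds
    waited = 0
    while not os.path.exists(path):
        t.wait(interval)
        waited += interval
        if waited > timeout:
            print('download fails')
            return False
    return True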
示例#26
0
def gethistorylist(inputyear):
    # fetch the data for the given year
    date_start = inputyear + '-01-01'  # first day of the year (try a 10-day range when testing)
    date_end = inputyear + '-12-31'  # last day of the year

    # initialize the page
    t.init()
    # open the target URL
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # hover, then click to expand the condensed options
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    # move to the issue-date field, click the text box, type the start date as the issue date, then search
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=date_start)
    # click again so the date picker does not cover the search button underneath
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # set the display size to 50 products per page:
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # sort by issue date ascending, i.e. "take them in reverse"
    t.hover(element_identifier='//*[@data-sort = "sell_org_date"]')
    t.click(element_identifier='//*[@data-sort = "sell_org_date"]')
    # loop state used while the next-page button is not disabled
    page_curr = 1  # current page index
    value_dict = {}  # holds the scraped data
    # column names
    name_list = [
        '序号', '产品名称', '发行银行', '委托货币', '发行日', '停售日', '管理期(天)', '预期收益率', '到期收益率',
        '与同期储蓄比', '综合评级', 'url'
    ]
    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialize empty lists
    stop_flag = False
    # loop while the current page is not the last, or when there is only one page
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):
        if stop_flag:  # nothing left for this year, so there is no need to keep paging
            break
        # number of data rows on this page
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1  # Python counts from 0
        # scrape the current page
        for i in range(1, count_values):
            # condition: keep every row whose issue date is on or before this year's 12-31
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[@class = "px"]')) <= date_end:
                # serial number
                value_dict[name_list[0]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[2]'))
                # product name
                value_dict[name_list[1]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[3]'))
                # issuing bank
                value_dict[name_list[2]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[4]'))
                # entrusted currency
                value_dict[name_list[3]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[5]'))
                # issue date
                value_dict[name_list[4]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[6]'))
                # end-of-sale date
                value_dict[name_list[5]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[7]'))
                # management period (days)
                value_dict[name_list[6]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[8]'))
                # expected yield
                value_dict[name_list[7]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[9]'))
                # yield at maturity
                value_dict[name_list[8]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[10]'))
                # vs. savings rate over the same period
                value_dict[name_list[9]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[11]'))
                # overall rating
                value_dict[name_list[10]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[12]//i/@title'))
                # url
                value_dict[name_list[11]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//a/@href'))
            else:  # past this year's data; set the flag so the while loop ends
                stop_flag = True
                print("thread stops here..")
                break
        # turn the page
        page_curr += 1
        print("turn the page..")
        # simulate a mouse move and click the next page link
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
    # close the TagUI flow
    t.close()
    # output file: "<year>.csv"
    hist_data = pd.DataFrame(value_dict)
    # write both formats (csv + xlsx)
    hist_data.to_csv(inputyear + ".csv", index=False, encoding='UTF-8')
    hist_data.to_excel(inputyear + ".xlsx", index=False)


#gethistorylist('2003')
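
# Note: the scripts above compare dates as plain strings, e.g.
# t.read(...) <= date_end; that is safe because ISO-formatted dates
# ('YYYY-MM-DD') sort lexicographically in the same order as chronologically.
assert '2003-02-28' <= '2003-12-31'
assert not ('2004-01-01' <= '2003-12-31')
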
def get_shoe(shoe, gender, email):
    t.init(visual_automation=True)
    t.url("https://www.farfetch.com/sg/")
    # the gender values arrive with a leading space, e.g. ' men' / ' women';
    # switch to the matching tab before searching (default tab otherwise)
    if gender == ' men':
        t.click('(//span[@class="tabs__span"])[.="Men"]')
    elif gender == ' women':
        t.click('(//span[@class="tabs__span"])[.="Women"]')
    details = scrape_shoe_results(shoe, email)
    t.close()

    return details


def scrape_shoe_results(shoe, email):
    # run the search and collect up to the first 3 product cards
    t.type('//input[@class="js-searchboxABTest force-ltr"]', shoe + " Shoes")
    t.click('//form[@class="ff-search"]/button')
    t.wait(3)
    details = []
    count = t.count('(//li[@data-test="productCard"])')
    if count == 0:
        # no results: return a single NA record
        details.append({
            "email": email,
            "name": "NA",
            "price": "NA",
            "img": "NA",
            "Company": "Farfetch",
            "link": "NA"
        })
        return details
    for i in range(1, min(count, 3) + 1):  # inclusive bound, at most 3 cards
        name = t.read(
            f'(//li[@data-test="productCard"])[{i}]//div[@data-test="information"]/p'
        )
        price = t.read(
            f'(//li[@data-test="productCard"])[{i}]//div[@data-test="information"]/div'
        ).replace('$', '')
        if 'Off' in price:
            # discounted cards read as '<discount> Off <price>'; keep the price part
            price = price.split('Off')[1]
        img = t.read(f'(//li[@data-test="productCard"])[{i}]//img/@src')
        link = "https://www.farfetch.com" + t.read(
            f'(//li[@data-test="productCard"])[{i}]/a/@href')
        details.append({
            "email": email,
            "name": name,
            "price": price,
            "img": img,
            "Company": "Farfetch",
            "link": link
        })
        # print(f"name: {name}, price: {price} img_source = {img}")
    return details
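
# Example usage (hypothetical values; note the gender strings carry a leading
# space, matching how the caller passes them in):
# shoes = get_shoe('Nike Air Force 1', ' men', 'user@example.com')
# for s in shoes:
#     print(s['name'], s['price'], s['link'])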