Example #1
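These examples are usage snippets of the TagUI Python package (imported as t) collected from several RPA projects; none of them ship their imports. A reconstructed preamble, an assumption based on the calls the snippets make, would look like:

# Assumed preamble (not part of the original snippets): the imports implied
# by the calls below. t is the TagUI RPA package; dt is the datetime class.
import datetime
import json
import logging
import os
import sys
from datetime import datetime as dt

import numpy as np
import pandas as pd
import pytz
import tagui as t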
def catchContent():
    number_to = t.count(
        '(//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")])'
    )

    df_to = pd.DataFrame(index=range(0, number_to),
                         columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])

    t.hover('//div[@class="container footer-main"]')
    t.wait(2)

    for n in range(1, number_to + 1):  # XPath indices are 1-based; include the last item
        title = t.read(
            '//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")][{}]//div[contains(@class, "article-listing_content")]//h2'
            .format(n))
        URL_o = t.read(
            '//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")][{}]//@href'
            .format(n))
        URL = "https://www.todayonline.com" + str(URL_o)

        Img_link = t.read(
            '//div[@class="col"]/div[contains(@class, "today")]/ul/li[contains(@class, "col-md-12")][{}]//img/@src'
            .format(n))

        df_to.iloc[n - 1, 0] = n
        df_to.iloc[n - 1, 1] = title  # t.read() already returns str in Python 3
        df_to.iloc[n - 1, 2] = URL
        df_to.iloc[n - 1, 4] = Img_link

    for i in range(0, df_to.shape[0]):
        if df_to['Img_URL'][i] == "":
            df_to.loc[i, 'Img_URL'] = np.nan  # .loc avoids pandas chained-assignment issues

    df_to.dropna(subset=['Img_URL'], inplace=True, how='any')
    df_to = df_to.reset_index(drop=True)
    df_to['Sno'] = df_to.index

    df_to = util.fixImgLink(
        df_to,
        "https://cf-templates-fghyux9ggb7t-ap-southeast-1.s3-ap-southeast-1.amazonaws.com/todayOnline.png"
    )

    for n in range(0, df_to.shape[0]):
        t.url(df_to.URL[n])
        t.wait(4)
        t.hover('//div[@class="article-detail_subscription"]')
        t.wait(2)

        number_p = t.count('//div/p[not(@class)]')

        Content = ""

        for i in range(1, number_p - 2):
            cont = t.read('//div/p[not(@class)][{}]'.format(i))
            Content = Content + " " + cont  # join paragraphs with a space

        summaries = Summarize(df_to.Title[n], Content)
        df_to.iloc[n, 3] = summaries[0]  # this loop starts at 0, so write row n, not n - 1

    return df_to
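Summarize and SummarizeUrl are not defined in this snippet. Their call shapes match PyTeaser's API, which is presumably what these news scrapers import; a hedged sketch:

# Presumed dependency: PyTeaser's summarizer. Summarize(title, text) returns
# a list of key sentences; SummarizeUrl(url) fetches the page first and can
# return None, which Example #11 below converts to NaN before dropping rows.
from pyteaser import Summarize, SummarizeUrl

sentences = Summarize("Some headline", "Full article text ...")
url_sentences = SummarizeUrl("https://example.com/article")  # placeholder URL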
Example #2
def catchContent():
    number_bb = t.count(
        '(//div[contains(@data-vr-zone, "Top Stories")]//span[contains(@class, "story-headline")])'
    )

    df_bb = pd.DataFrame(index=range(0, number_bb - 2),
                         columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])

    for n in range(0, number_bb - 2):
        title = t.read(
            '//div[contains(@data-vr-zone, "Top Stories {}")]//span[contains(@class, "story-headline")]'
            .format(n))
        URL_b = t.read(
            '//div[contains(@data-vr-zone, "Top Stories {}")]//span[contains(@class, "story-headline")]//@href'
            .format(n))
        URL = "https://www.straitstimes.com/" + str(URL_b)
        Img_URL = t.read(
            '//div[contains(@data-vr-zone, "Top Stories {}")]//span[contains(@class, "story-headline")]/ancestor::div[contains(@class, "body")]/..//img/@src'
            .format(n))
        summaries = SummarizeUrl(URL)
        df_bb.iloc[n, 0] = n
        df_bb.iloc[n, 1] = title
        df_bb.iloc[n, 2] = URL
        df_bb.iloc[n, 3] = summaries
        df_bb.iloc[n, 4] = Img_URL

    return df_bb
Example #3
def multi_city_trip(enquiry):
    t.click('//input[@id="fsc-trip-type-selector-multi-destination"]')
    travel_dates = enquiry["dates"]
    numDep = len(travel_dates)
    cities = enquiry["city"]
    numCity = len(cities)
    form_flightleg = t.count(
        '//*[@id="flights-search-controls-root"]/div/div/form/div[2]/ol/li')
    if numDep < form_flightleg:
        for cnt in range(form_flightleg - numDep):
            t.click(
                f'//*[@id="flights-search-controls-root"]/div/div/form/div[2]/ol/li[{form_flightleg-cnt}]/div[4]/button'
            )
    elif numDep > form_flightleg:
        for cnt in range(numDep - form_flightleg):
            t.click(
                '//div[starts-with(@class,"MulticityControls_MulticityControls__add-leg-wrapper__2arYh")]/button'
            )
            t.wait(0.5)

    for num in range(0, numDep):
        start_date = dt.strptime(travel_dates[num], '%d/%m/%Y')
        start_month = start_date.strftime('%Y-%m')
        orig_city = cities[num]
        if numCity == numDep:
            if num < numDep - 1:
                dest_city = cities[num + 1]
            else:
                dest_city = cities[0]
        else:
            dest_city = cities[num + 1]
        t.type(f'//input[@id="fsc-origin-search-{num}"]', orig_city)
        t.wait(0.5)
        t.type(f'//input[@id="fsc-destination-search-{num}"]', dest_city)
        t.wait(0.5)
        t.click(
            f'//button[@id="fsc-leg-date-{num}-fsc-datepicker-button"]//span[starts-with(@class,"DateInput")]'
        )
        t.click(
            f'//select[@id="fsc-leg-date-{num}-calendar__bpk_calendar_nav_select"]'
        )
        t.select(
            f'//select[@id="fsc-leg-date-{num}-calendar__bpk_calendar_nav_select"]',
            f'{start_month}')
        t.click(
            f'//button[starts-with(@class,"BpkCalendarDate") and contains(@aria-label,"{start_date.strftime("%d %B %Y").lstrip("0")}")]'
        )

    t.click('//button[starts-with(@id,"CabinClassTravellersSelector")]')
    t.click('//select[@id="search-controls-cabin-class-dropdown"]')
    t.select('//select[@id="search-controls-cabin-class-dropdown"]',
             lookup_cabin_class(enquiry["cabin_class"]))
    adult_pax = int(enquiry['adult'])
    child_pax = len(enquiry['child_age'])
    child_age = enquiry['child_age']
    number_of_travellers(adult_pax, child_pax, child_age)

    t.click('//button[@type="submit"][@aria-label="Search flights"]')
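lookup_cabin_class and number_of_travellers are project helpers, not TagUI calls. A minimal, hypothetical sketch of lookup_cabin_class, assuming it maps the chatbot's cabin-class value onto the option text of Skyscanner's cabin-class dropdown (the real labels may differ):

# Hypothetical helper; the option labels are an assumption.
def lookup_cabin_class(cabin_class):
    options = {
        'economy': 'Economy',
        'premium economy': 'Premium Economy',
        'business': 'Business Class',
        'first': 'First Class',
    }
    return options.get(cabin_class.strip().lower(), 'Economy')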
Example #4
File: law_RPA.py Project: maoyuanqi/iRPA
def get_count_values(page_num, url_prefix):
    t.url(url_prefix + str(page_num) + '.html')
    print("现在所在页面 {}".format(page_num))
    t.wait(5)
    # 拿到value
    count_values = t.count(element_identifier='//td[@colspan = "2"]//table')
    print("页面有{}个文件".format(count_values))
    with open('count_items_' + str(page_num) + '_'+str(url_prefix.split('/')[-2]) + '.txt', 'w', encoding='utf-8') as f:
        f.write('page:' + str(page_num) + ':' + str(count_values))  # 以:为分隔符;记录当前页面和页面总共item数量
    return 'count_items_' + str(page_num) + '_'+str(url_prefix.split('/')[-2]) + '.txt'
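The function returns the name of the colon-delimited record it just wrote; a caller could parse it back like this (a sketch, assuming the page:<num>:<count> format written above):

# Sketch: read back the 'page:<num>:<count>' record written by get_count_values.
def read_count_file(path):
    with open(path, encoding='utf-8') as f:
        _, page_num, count = f.read().strip().split(':')
    return int(page_num), int(count)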
Example #5
def login_stackoverflow(account, password):
    t.url('https://stackoverflow.com/users/login')
    click('//button[@data-provider="google"]')
    if wait_element('//div[@data-identifier="{}"]'.format(account)):
        click('//div[@data-identifier="{}"]'.format(account))
    else:
        c = t.count('//div[@jsslot=""]//li')
        click('(//div[@jsslot=""]//li)[{}]'.format(c))
        type_into('//*[@type="email"]', account + '[enter]')
    type_into('//*[@name="password"]', password + '[enter]')
    return
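click, wait_element, and type_into here are thin wrappers around TagUI rather than library functions. Hedged sketches consistent with how they are used above (assuming the import tagui as t preamble):

# Assumed wrappers, reconstructed from usage; not part of the tagui package.
def click(selector):
    return t.click(selector)

def type_into(selector, text):
    return t.type(selector, text)  # a trailing '[enter]' presses Enter

def wait_element(selector, timeout=10):
    for _ in range(timeout):  # poll for presence instead of failing at once
        if t.present(selector):
            return True
        t.wait(1)
    return False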
Example #6
def multi_city_trip(enquiry):
    t.click('//input[@id="flight-type-multi-dest-hp-flight"]')
    travel_dates = enquiry["dates"]
    numDep = len(travel_dates)
    cities = enquiry["city"]
    numCity = len(cities)

    form_flightleg = (t.count(
        '//div[@class="cols-nested gcw-multidest-flights-container"]/div/fieldset'
    ))
    print(form_flightleg)
    if numDep < form_flightleg:
        for cnt in range(form_flightleg - numDep):
            t.click(
                f'//*[@id="flightlegs-list-fieldset-{form_flightleg-cnt}-hp-flight"]/div/a'
            )
    elif numDep > form_flightleg:
        for cnt in range(numDep - form_flightleg):
            t.click('//a[@id="add-flight-leg-hp-flight"]')
            t.wait(0.5)

    t.type('//input[@id="flight-origin-hp-flight"]', cities[0])
    t.type('//input[@id="flight-destination-hp-flight"]', cities[1])
    t.type('//input[@id="flight-departing-single-hp-flight"]', '[clear]')
    t.type('//input[@id="flight-departing-single-hp-flight"]',
           (dt.strptime(travel_dates[0], '%d/%m/%Y')).strftime("%d/%m/%Y"))

    for num in range(1, numDep):
        print(f"num:{num} and form_flightleg:{form_flightleg}")

        start_date = dt.strptime(travel_dates[num], '%d/%m/%Y')
        orig_city = cities[num]
        if numCity == numDep:
            if num < numDep - 1:
                dest_city = cities[num + 1]
            else:
                dest_city = cities[0]
        else:
            dest_city = cities[num + 1]

        t.type(f'//input[@id="flight-{num+1}-origin-hp-flight"]', orig_city)
        t.wait(0.5)
        t.type(f'//input[@id="flight-{num+1}-destination-hp-flight"]',
               dest_city)
        t.wait(0.5)
        t.type(f'//input[@id="flight-{num+1}-departing-hp-flight"]', '[clear]')
        t.type(f'//input[@id="flight-{num+1}-departing-hp-flight"]',
               start_date.strftime("%d/%m/%Y"))

    t.click('//a[@id="flight-advanced-options-hp-flight"]')
    t.select('//select[@id="flight-advanced-preferred-class-hp-flight"]',
             lookup_cabin_class(enquiry["cabin_class"]))
    t.click('//*[@id="gcw-flights-form-hp-flight"]/div[8]/label/button')
Example #7
def get_shoe(shoe, g, email):
    gender = g
    # print('[nike]',gender)
    t.init(visual_automation=True)
    t.url('https://www.nike.com/sg/')

    t.type('//input[@id = "TypeaheadSearchInput"]', shoe + " shoes")

    t.click('//button[@class = "btn-search z2 bg-transparent"]')
    t.wait(3)

    if gender == " men":
        t.click('(//span[contains(@class,"filter-item")])[1]')
    elif gender == " women":
        t.click('(//span[contains(@class,"filter-item")])[2]')

    t.wait(1)
    count = t.count('//a[@class ="product-card__link-overlay"]')
    #	print('[nike]',count)
    details = []

    if count != 0:
        for i in range(0, min(count, 3)):
            k = i + 1
            name = t.read(f'(//a[@class = "product-card__link-overlay"])[{k}]')
            price = t.read(f'(//div[@data-test="product-price"])[{k}]')
            img = t.read(
                f'(//div[contains(@class, "product-card__hero")]/picture/img)[{k}]/@src'
            )
            link = t.read(f'(//a[contains(@class,"product-card")])[{k}]/@href')
            # print('[nike]',name , price, img)
            details.append({
                "email": email,
                "name": name,
                "price": price,
                "img": img,
                "Company": "Nike",
                "link": link
            })
    else:
        details.append({
            "email": email,
            "name": "NA",
            "price": "NA",
            "img": "NA",
            "Company": "Nike",
            "link": "NA"
        })

#	print(details)
    return details
Example #8
def get_count_values(page_num, url_prefix, today):
    t.url(url_prefix + str(page_num) + '.html')
    print("现在所在页面 {}".format(page_num))
    t.wait(5)
    # get the item count
    count_values = t.count(element_identifier='//td[@colspan = "2"]//table')
    # today = '2018-04-24'
    if t.read(
            element_identifier=
            '//td[@colspan = "2"]//table[1]//span[@class = "hui12"]') < today:
        return 'No new items today'
    print("The page has {} files".format(count_values))
    with open('count_items_' + str(page_num) + '_' +
              str(url_prefix.split('/')[-2]) + '.txt',
              'w',
              encoding='utf-8') as f:
        f.write('page:' + str(page_num) + ':' +
                str(count_values))  # colon-delimited: record the current page and its total item count
    return 'count_items_' + str(page_num) + '_' + str(
        url_prefix.split('/')[-2]) + '.txt'
Example #9
def extract_all_countries(date_stamp, status):
    num_country = int(
        t.count('(//a[@class="mt_a"])') /
        2)  # first half is for today, second half is for yesterday
    for n in range(1, num_country + 1):
        data = {}
        region_detail = {}
        data['date_stamp'] = date_stamp
        country_row_xpath = f'(//a[@class="mt_a"])[{n}]'

        country_total_cases_xpath = country_row_xpath + '/../following-sibling::td[1]'
        country_new_cases_xpath = country_row_xpath + '/../following-sibling::td[2]'
        country_total_deaths_xpath = country_row_xpath + '/../following-sibling::td[3]'
        country_new_deaths_xpath = country_row_xpath + '/../following-sibling::td[4]'
        country_total_recovered_xpath = country_row_xpath + '/../following-sibling::td[5]'
        country_active_cases_xpath = country_row_xpath + '/../following-sibling::td[6]'
        country_serious_cases_xpath = country_row_xpath + '/../following-sibling::td[7]'

        region_detail['total_cases'] = convert_extracted_numbers(
            t.read(country_total_cases_xpath))
        region_detail['new_cases'] = convert_extracted_numbers(
            t.read(country_new_cases_xpath))
        region_detail['total_deaths'] = convert_extracted_numbers(
            t.read(country_total_deaths_xpath))
        region_detail['new_deaths'] = convert_extracted_numbers(
            t.read(country_new_deaths_xpath))
        region_detail['total_recovered'] = convert_extracted_numbers(
            t.read(country_total_recovered_xpath))
        region_detail['active_cases'] = convert_extracted_numbers(
            t.read(country_active_cases_xpath))
        region_detail['serious_cases'] = convert_extracted_numbers(
            t.read(country_serious_cases_xpath))

        conv_info_str = json.dumps(region_detail)
        data['conv_info_str'] = conv_info_str

        country_name = t.read(country_row_xpath)
        data['country_name'] = country_name

        status = insert_db(data)
    return status
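convert_extracted_numbers and insert_db are project helpers. A sketch of the former, assuming it normalizes Worldometer's formatted cells ('1,234', '+56', 'N/A', '') into plain integers:

# Hypothetical sketch: returns None for an empty or non-numeric cell.
def convert_extracted_numbers(raw):
    cleaned = raw.strip().replace(',', '').lstrip('+')
    if cleaned.lstrip('-').isdigit():
        return int(cleaned)
    return None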
Example #10
def get_news_using_crawler():
    try:
        t.url(
            'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/news'
        )

        wait_for_pageload('//p[@class="heading text-underline"]')

        num_news = t.count('//p[@class="heading text-underline"]')
        if num_news > 5:
            num_news = 5

        delete_news_data_db()
        date_stamp = datetime.datetime.now(
            pytz.timezone('Singapore')).strftime('%Y-%m-%d')

        for n in range(1, num_news + 1):
            data = {}
            data['date_stamp'] = date_stamp
            news_link = t.read(
                f'(//p[@class="heading text-underline"])[{n}]/ancestor-or-self::a/@href'
            )
            data['news_link'] = news_link
            news_title = t.read(
                f'(//p[@class="heading text-underline"])[{n}]/ancestor-or-self::a/@aria-label'
            )
            data['news_title'] = news_title
            print('Article', n, ":", news_title)
            print('')
            news_summaries = SummarizeUrl(news_link)
            data['news_summary'] = str(news_summaries)
            print(news_summaries)
            status = insert_db(data)

        return status
    except Exception as e:
        print(e)
    finally:
        t.close()
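wait_for_pageload is not shown in this snippet; Example #17 below contains the tail of one implementation, which polls t.present() once per second for up to a minute. A self-contained sketch of the same idea:

# Sketch of the presence-polling helper assumed above. Returns 1 once the
# selector appears, 0 on timeout (mirroring wait_status in Example #17).
def wait_for_pageload(selector, max_wait=60):
    for _ in range(max_wait):
        if t.present(selector):
            return 1
        t.wait(1)
    return 0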
Example #11
def catchContent():
	number = t.count('(//li[contains(@class, "css-1iski2w")]/a)')
	df = pd.DataFrame(index=range(0,number), columns = ['Sno', 'Title', 'URL', 'Summary','Img_URL'])

	for n in range(1, number+1):
		title=t.read('//li[contains(@class, "css-1iski2w")][{}]/a/div'.format(n))
		URL=t.read('//li[contains(@class, "css-1iski2w")][{}]//@href'.format(n))
		Img_link=t.read('//li[contains(@class, "css-1iski2w")][{}]//img/@src'.format(n))
		summaries = SummarizeUrl(URL)

		df.iloc[n-1, 0] = n
		df.iloc[n-1, 1] = title  # t.read() already returns str in Python 3
		df.iloc[n-1, 2] = URL
		df.iloc[n-1, 3] = summaries
		df.iloc[n-1, 4] = Img_link

	df['Summary'].replace('None', np.nan, inplace=True)
	df.dropna(subset=['Summary'], inplace=True, how='any')
	df= df.reset_index(drop=True)
	df['Sno'] = df.index

	return df
Example #12
name_list = ['序号', '产品名称', '发行银行', '委托货币', '发行日', '停售日', '管理期(天)', '预期收益率', '到期收益率', '与同期储蓄比', '综合评级','url']

for col_name in name_list:
    value_dict.setdefault(col_name, []) # initialize empty lists

# loop while we can page forward, or when there is only one page of data
stop_flag = False

# loop while the current page is not the last page, or there is only one page
while (t.read(element_identifier='//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]') == str(page_curr)) or (page_curr == 1):

    if stop_flag == True: # if there is no data for this year, no need to keep paging
        break

    # number of rows per page
    count_values = int(t.count(element_identifier='//tbody[@id = "content"]//tr')) + 1 # Python is 0-indexed
    # scrape the current page
    for i in range(1, count_values):
        # condition: take everything dated within this year (on or before 12-31)
        if str(t.read(
                element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']//td[@class = "px"]')) <= date_end:
            # serial number
            value_dict[name_list[0]].append(
                t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']/td[2]'))
            # product name
            value_dict[name_list[1]].append(
                t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']/td[3]'))
            # issuing bank
            value_dict[name_list[2]].append(
                t.read(element_identifier='//tbody[@id = "content"]//tr[' + str(i) + ']/td[4]'))
            # currency of deposit
Example #13
    print(f'\n-----start batch {batch_count}-----\n')

    # start date
    t.select('//select[@id="searchForm_selectedFromPeriodProjectName"]',
             START_DATE)

    # end date
    t.select('//select[@id="searchForm_selectedToPeriodProjectName"]',
             END_DATE)

    # type of sale
    t.click('//label[@for="checkbox1"]')
    t.click('//label[@for="checkbox2"]')
    t.click('//label[@for="checkbox3"]')

    project_total = t.count('//div[@id="projectContainerBox"]/a')

    # select projects
    for _ in range(SELECTION_LIMIT):
        if project_count > project_total - 1:
            PROCEED = False
            break

        selected = t.read(f'//*[@id="addToProject_{project_count}"]')
        print(f'select {selected}')
        t.click(f'//*[@id="addToProject_{project_count}"]')

        logging.info(
            f'batch: {batch_count}, project: {selected}, id: {project_count}')

        project_count += 1
Example #14
def compliance_data(url_prefix):
    t.init()
    init_url = url_prefix + '1.html'
    t.url(init_url)  # initial url
    max_page = int(
        t.read(element_identifier='//td[@class = "Normal"]').split('/')
        [1]) + 1  # max page count
    for page_num in range(1, max_page):
        t.url(url_prefix + str(page_num) + '.html')
        print("现在所在页面 {}".format(page_num))
        t.wait(5)
        # 拿到value
        count_values = t.count(
            element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())
        # today = '2018-04-24'
        if t.read(element_identifier=
                  '//td[@colspan = "2"]//table[1]//span[@class = "hui12"]'
                  ) < today:
            print("今日无增量")
            break
        print("页面有{}个文件".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')
            if t.read(element_identifier='//td[@colspan = "2"]//table[' +
                      str(i) + ']//span[@class = "hui12"]') < today:
                t.close()
                exit(1)
            file_name = t.read(
                element_identifier='//td[@colspan = "2"]//table[' + str(i) +
                ']') + str('.txt')
            prefix = 'http://www.pbc.gov.cn'
            content_url = prefix + t.read(
                element_identifier='//td[@colspan = "2"]//table[' + str(i) +
                ']//a/@href')
            if 'cnhttp' in content_url:
                content_url = content_url[21:]  # not sure why this happens
                t.url(content_url)
                text = t.read(element_identifier='//div[@id = "zoom"]')
                with open(file_name, 'w') as f:
                    f.write(text)
                print("文件{} 是文档。".format(i))
                continue
            t.url(content_url)  # enter the second-level directory

            # get the number of pdfs, their names, and the names they should have
            t.wait(2)
            pdf_count = t.count(
                element_identifier='//div[@id = "zoom"]//a/@href')
            if pdf_count == 0:  # if it is a normal text file
                # got the list
                print("File {} is a document.".format(i))
                # get the text
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
            elif ('pdf' in t.read(
                    element_identifier='//div[@id = "zoom"]//a/@href')):
                print("文件{} 含有 {} 个pdf。".format(i, pdf_count))
                pdf_count += 1  #python从0开始,所以至少有一个pdf count
                for j in range(1, pdf_count):
                    # get the pdf name
                    if t.read(element_identifier='//div[@id = "zoom"]//p[' +
                              str(j) + ']//a/@href') != '':
                        print("当前是第{}个pdf。。".format(j))
                        pdf_name = t.read(
                            element_identifier='//div[@id = "zoom"]//p[' +
                            str(j) + ']//a/@href').split('/')[-1]
                        # get the compliant file name
                        pdf_name_to_change = t.read(
                            element_identifier='//div[@id = "zoom"]//p[' +
                            str(j) + ']//a')
                        # download
                        prefix = 'http://www.pbc.gov.cn'
                        t.url(prefix + t.read(
                            element_identifier='//div[@id = "zoom"]//p[' +
                            str(j) + ']//a/@href'))
                        wait_seconds = 1
                        total_seconds = 0
                        while os.path.exists(pdf_name) == False:
                            t.wait(wait_seconds)
                            total_seconds += wait_seconds
                            if total_seconds > 30:
                                print('download fails')
                                break
                        os.rename(pdf_name, pdf_name_to_change)  # rename
                        t.url(content_url)  # return to the second-level directory
                    else:
                        print("不合规,当文档处理!不读了!!!")
                        # 取text
                        if t.read(element_identifier='//div[@id = "zoom"]'
                                  ) != '':
                            text = t.read(
                                element_identifier='//div[@id = "zoom"]')
                            with open(file_name, 'w') as f:
                                f.write(text)
                        elif t.read(element_identifier='//td[@class = "p1"]'
                                    ) != '':
                            text = t.read(
                                element_identifier='//td[@class = "p1"]')
                            with open(file_name, 'w') as f:
                                f.write(text)
                        else:
                            print("writing file failed...")
                        t.url(url_prefix + str(page_num) + '.html')
                        break
            else:
                print("文件{} 含有 {} 个pdf。".format(i, pdf_count))
                print("含有其他format的href,当文档处理!不读了!!!")
                # 取text
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
                t.url(url_prefix + str(page_num) + '.html')
                break
    t.close()
Example #15
def history_data(url_prefix):
    t.init()
    init_url = url_prefix + '1.html'
    t.url(init_url)
    max_page = int(
        t.read(element_identifier='//td[@class = "Normal"]').split('/')[1]) + 1
    for page_num in range(1, max_page):
        # main page
        t.url(url_prefix + str(page_num) + '.html')
        print("Now on page {}".format(page_num))
        t.wait(5)
        # get the item count
        count_values = t.count(
            element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())
        if t.read(element_identifier=
                  '//td[@colspan = "2"]//table[1]//span[@class = "hui12"]'
                  ) < today:
            print("今日无增量")
            break
        print("页面有{}个文件".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')
            if t.read(element_identifier='//td[@colspan = "2"]//table[' +
                      str(i) + ']//span[@class = "hui12"]') < today:
                break
            if '.html' in t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href'):
                # got the list
                print("File {} is a document.".format(i))
                file_name = t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']') + str('.txt')
                prefix = 'http://www.pbc.gov.cn'
                content_url = prefix + t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//td//a/@href')
                # open the url
                if content_url == 'http://www.pbc.gov.cnhttp://www.pbc.gov.cn/goutongjiaoliu/113456/113469/3487563/index.html':
                    content_url = 'http://www.pbc.gov.cn/goutongjiaoliu/113456/113469/3487563/index.html'  # not sure why this happens
                t.url(content_url)
                # get the text
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w') as f:
                        f.write(text)
                else:
                    print("write files fails...")
            elif '.doc' in t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href'):
                # got the data
                print("File {} is a downloadable doc.".format(i))
                file_name = t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href').split('/')[-1]
                prefix = 'http://www.pbc.gov.cn'
                content_url = prefix + t.read(
                    element_identifier='//td[@colspan = "2"]//table[' +
                    str(i) + ']//a/@href')
                t.url(content_url)
                wait_seconds = 1
                total_seconds = 0
                while os.path.exists(file_name) == False:
                    t.wait(wait_seconds)
                    total_seconds += wait_seconds
                    if total_seconds > 30:
                        print('download fails')
                        break
            else:
                print("unknown format..")
            print("爬好一次,返回页面 {}".format(page_num))
    #close out
    t.close()
Example #16
def getblanklist():
    # initialize the page
    t.init()
    # navigate to the url
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # start the search directly, without any filter conditions
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # set the display size to 50 products per page:
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # click to sort ascending by issue date, i.e. "take it in reverse", so blank issue dates come first
    t.hover(element_identifier='//*[@data-sort = "sell_org_date"]')
    t.click(element_identifier='//*[@data-sort = "sell_org_date"]')

    # hyperparameters for while the next page is not disabled
    page_curr = 1  # current page index
    max_page = 1  # record of the max page number

    # column names
    value_dict = {}  # holds the data
    name_list = ['序号', '综合评级', 'url']

    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialize empty lists

    # loop while we can page forward, or when there is only one page of data
    stop_flag = False  # True means the data we need has all been collected; no need to page further
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):

        if stop_flag == True:  # if there is no more blank data, no need to keep paging
            break
        max_page = page_curr
        # number of rows per page
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1  # Python is 0-indexed
        # scrape all values from the table on this page
        filename = str(page_curr) + "blank_date.csv"
        t.wait(1)  # wait 1 second in case of a loading error
        t.table(
            element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table',
            filename_to_save=filename)

        # scrape the current page (title and href only)
        for i in range(1, count_values):
            # condition: if the issue date is blank (--), enter this branch
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[@class = "px"]')) == '--':
                # print("number {} is running".format(str(i)))
                # serial number
                value_dict[name_list[0]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[2]'))
                # overall rating
                value_dict[name_list[1]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[12]//i/@title'))
                # url
                value_dict[name_list[2]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//a/@href'))

            else:  # once the value is no longer blank (--), this pass ends; set the flag so the while loop exits
                stop_flag = True
                # print("thread stops here..")
                break
        # turn the page
        page_curr += 1
        # print("turn the page..")
        # simulate a mouse move and click to turn the page
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')

    # close the tagui flow
    t.close()

    # output file: "blank_date.csv"
    hist_data = pd.DataFrame(value_dict)
    hist_data.to_csv("blank_date.csv", index=False, encoding='UTF-8')
    return max_page
Example #17
    for loop_wait in range(1, 60):
        print(f"{loop_wait}. waiting for page to appear. wait for 1s...")
        if t.present(selector):
            wait_status = 1
            break
        else:
            t.wait(1)
    print("Covid wait_status = {}".format(wait_status))


t.init()

t.url('https://www.worldometers.info/coronavirus/')

wait_for_pageload('//div[@class="maincounter-number"]')
num_country = int(t.count('(//a[@class="mt_a"])') / 2)

print("Number of Countries found: " + str(num_country))

country_list = []
link_list = []

for n in range(1, num_country + 1):
    try:
        country_row_xpath = f'(//a[@class="mt_a"])[{n}]'
        country_link_xpath = country_row_xpath + '/@href'
        country_link = 'https://www.worldometers.info/coronavirus/' + t.read(
            country_link_xpath)
        link_list.append(country_link)
        country_name = t.read(country_row_xpath)
        country_list.append(country_name)
Example #18
def get_property(input_email, input_name, prefer1, prefer2, prefer3, input_loc,
                 input_size, input_price, input_bed, input_floor):
    """
	:param input_email: user email
	:param input_name: user name
	:param prefer1:
	:param prefer2:
	:param prefer3:
	:param input_loc: location name input_loc = ['Orchard', 'River Valley','Eunos']
	:param input_size: square feet
	:param input_price:
	:param input_bed:
	:param input_floor:
	:return:
	"""
    # chatbot input
    input_area = list()
    if input_loc in ['Cecil', 'Raffles Place', 'Marina']:
        input_area.append('D01')
    elif input_loc in ['Chinatown', 'Tanjong Pagar']:
        input_area.append('D02')
    elif input_loc in ['Alexandra', 'Queenstown', 'Tiong Bahru']:
        input_area.append('D03')
    elif input_loc in ['Harbourfront', 'Telok Blangah', 'Mount Faber']:
        input_area.append('D04')
    elif input_loc in ['Buona Vista', 'Pasir Panjang', 'Clementi']:
        input_area.append('D05')
    elif input_loc in ['City Hall', 'Clarke Quay']:
        input_area.append('D06')
    elif input_loc in ['Beach Road', 'Bugis', 'Golden Mile']:
        input_area.append('D07')
    elif input_loc in ['Farrer Park', 'Little India']:
        input_area.append('D08')
    elif input_loc in ['Orchard', 'River Valley']:
        input_area.append('D09')
    elif input_loc in ['Balmoral', 'Holland', 'Bukit Timah']:
        input_area.append('D10')
    elif input_loc in ['Newton', 'Novena', 'Thomson']:
        input_area.append('D11')
    elif input_loc in ['Balestier', 'Toa Payoh', 'Serangoon']:
        input_area.append('D12')
    elif input_loc in ['Macpherson', 'Braddell']:
        input_area.append('D13')
    elif input_loc in ['Sims', 'Geylang', 'Paya Lebar']:
        input_area.append('D14')
    elif input_loc in ['Joo Chiat', 'Marine Parade', 'Katong']:
        input_area.append('D15')
    elif input_loc in ['Bedok', 'Upper East Coast', 'Siglap']:
        input_area.append('D16')
    elif input_loc in ['Flora', 'Changi', 'Loyang']:
        input_area.append('D17')
    elif input_loc in ['Pasir Ris', 'Tampines']:
        input_area.append('D18')
    elif input_loc in ['Serangoon Gardens', 'Punggol', 'Sengkang']:
        input_area.append('D19')
    elif input_loc in ['Ang Mo Kio', 'Bishan', 'Thomson']:
        input_area.append('D20')
    elif input_loc in ['Clementi Park', 'Upper Bukit Timah', 'Ulu Pandan']:
        input_area.append('D21')
    elif input_loc in ['Boon Lay', 'Jurong', 'Tuas']:
        input_area.append('D22')
    elif input_loc in [
            'Dairy Farm', 'Bukit Panjang', 'Choa Chu Kang', 'Hillview',
            'Bukit Batok'
    ]:
        input_area.append('D23')
    elif input_loc in ['Lim Chu Kang', 'Tengah', 'Kranji']:
        input_area.append('D24')
    elif input_loc in ['Admiralty', 'Woodlands']:
        input_area.append('D25')
    elif input_loc in ['Mandai', 'Upper Thomson']:
        input_area.append('D26')
    elif input_loc in ['Sembawang', 'Yishun']:
        input_area.append('D27')
    elif input_loc in ['Seletar', 'Yio Chu Kang']:
        input_area.append('D28')
    print(input_area)

    input_type = [
        'condo'
    ]  # HDB, condo, landed (only single choice is supported in propertyguru)
    input_minsize = [str(input_size * 0.8)]  # square feet   @ modified
    input_maxsize = [str(input_size * 1.2)]  # square feet   @ modified
    input_minprice = [str(input_price * 0.5)]  # $    @ modified
    input_maxprice = [str(input_price * 1.5)]  # $    @ modified
    input_bed = [str(input_bed)]  # 0 to 5 bedroom, 0 stands for studio,  @
    input_floor = [
        str(input_floor)
    ]  # ground, low, mid, high, penthouse (only single choice is supported in propertyguru   @

    # url transfer
    def url_area():
        url_area = ''
        for n in input_area:
            url_area += f'district_code%5B%5D={n}&'
        return url_area

    def url_type():
        if 'HDB' in input_type:
            url_type = 'property_type=H&'
        if 'condo' in input_type:
            url_type = 'property_type=N&'
        if 'landed' in input_type:
            url_type = 'property_type=L&'
        return url_type

    def url_minsize():
        url_minsize = ''.join(input_minsize)
        return f'minsize={url_minsize}&'

    def url_maxsize():
        url_maxsize = ''.join(input_maxsize)
        return f'maxsize={url_maxsize}&'

    def url_minprice():
        url_minprice = ''.join(input_minprice)
        return f'minprice={url_minprice}&'

    def url_maxprice():
        url_maxprice = ''.join(input_maxprice)
        return f'maxprice={url_maxprice}&'

    def url_bed():
        url_bed = ''
        for n in input_bed:
            url_bed += f'beds%5B%5D={n}&'
        return url_bed

    def url_floor():
        url_floor = ''
        if 'ground' in input_floor:
            url_floor = 'floor_level=GND&'
        if 'low' in input_floor:
            url_floor = 'floor_level=LOW&'
        if 'mid' in input_floor:
            url_floor = 'floor_level=MID&'
        if 'high' in input_floor:
            url_floor = 'floor_level=HIGH&'
        if 'penthouse' in input_floor:
            url_floor = 'floor_level=PENT&'
        return url_floor

    url_main = f'https://www.propertyguru.com.sg/property-for-sale?market=residential&{url_type()}{url_area()}{url_minprice()}{url_maxprice()}{url_bed()}{url_minsize()}{url_maxsize()}{url_floor()}newProject=all'
    print('main page url link: ' + url_main)

    # tagui scrape
    t.init()
    t.url(url_main)
    result = wait_for_mainpageload('//div[@class="header-wrapper"]')
    if result == 0:
        print(' no result found')
        mail_notfound(input_email, input_name, input_loc, input_size,
                      input_price, input_bed, input_floor)
        # restart BuyerAgent.py
        python = sys.executable
        os.execl(python, python, *sys.argv)
    num_result = t.count('//div[@class="header-wrapper"]')
    num_result_ad = num_result + 2
    # num_result_ad = 6  # for test
    print("num of property in this page without ad = ", num_result)
    print("num of property in this page including ad = ", num_result_ad)

    # load main page, get detail page url link
    url = [''] * num_result_ad
    for n in [x for x in range(1, num_result_ad + 1)
              if x != 4 and x != 8]:  # skip 4th and 8th advertisement
        wait_for_pageload(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        url[n - 1] = read_if_present(
            f'(//div[@class="listing-widget-new"]/div[{n}]/div[1]/div[2]/div[1]/div[1]/h3/a/@href)'
        )
        print(f"{n}. url = " + url[n - 1])

    # load detail page
    property_title = [''] * num_result_ad
    type = [''] * num_result_ad
    area = [''] * num_result_ad
    bedroom = [''] * num_result_ad
    bathroom = [''] * num_result_ad
    price = [''] * num_result_ad
    total = [''] * num_result_ad
    address = [''] * num_result_ad
    postcode = [''] * num_result_ad
    region = [''] * num_result_ad
    floor = [''] * num_result_ad
    furnish = [''] * num_result_ad
    description = [''] * num_result_ad
    feature = [''] * num_result_ad
    image1 = [''] * num_result_ad
    image2 = [''] * num_result_ad
    image3 = [''] * num_result_ad
    id = [''] * num_result_ad
    pdf = [''] * num_result_ad
    pdf_link = [''] * num_result_ad

    for n in [x for x in range(1, num_result_ad + 1) if x != 4 and x != 8]:

        t.url("https://www.propertyguru.com.sg" + url[n - 1])
        wait_for_pageload('//h1[@class="h2"]')
        property_title[n - 1] = read_if_present('//h1[@class="h2"]')
        print(f"{n}. property_title = " + property_title[n - 1])
        type[n - 1] = read_if_present(
            '//*[@id="condo-profile"]/div/div/div/div/div[1]/div/div/div[1]/div/div[2]'
        )
        print(f"{n}. type = " + type[n - 1])
        area[n - 1] = read_if_present(
            '//*[@id="details"]/div/div[1]/div[2]/div[3]/div/div[2]')
        print(f"{n}. area = " + area[n - 1])
        bedroom[n - 1] = read_if_present(
            '//*[@id="overview"]/div/div/div/section/div[1]/div[2]/div[1]/span'
        )
        print(f"{n}. bedroom = " + bedroom[n - 1])
        bathroom[n - 1] = read_if_present(
            '//*[@id="overview"]/div/div/div/section/div[1]/div[2]/div[2]/span'
        )
        print(f"{n}. bathroom = " + bathroom[n - 1])
        total[n - 1] = read_if_present(
            '//*[@id="overview"]/div/div/div/section/div[1]/div[1]/div[1]/span[2]'
        )
        print(f"{n}. total price = " + total[n - 1])
        price[n - 1] = read_if_present(
            '//*[@id="overview"]/div/div/div/section/div[1]/div[2]/div[4]/div/span[2]'
        )
        print(f"{n}. price = " + price[n - 1])
        address[n - 1] = read_if_present(
            '//*[@id="overview"]/div/div/div/section/div[1]/div[3]/div/div[2]/div[1]/span[1]'
        )
        print(f"{n}. address = " + address[n - 1])
        postcode[n - 1] = read_if_present(
            '//*[@id="overview"]/div/div/div/section/div[1]/div[3]/div/div[2]/div[1]/span[2]'
        )
        print(f"{n}. postalcode = " + postcode[n - 1])
        region[n - 1] = read_if_present(
            '//*[@id="overview"]/div/div/div/section/div[1]/div[3]/div/div[2]/div[1]/span[3]'
        )
        print(f"{n}. region = " + region[n - 1])
        floor[n - 1] = read_if_present(
            '//*[@id="details"]/div/div[1]/div[2]/div[9]/div/div[2]')
        print(f"{n}. floor = " + floor[n - 1])
        furnish[n - 1] = read_if_present(
            '//*[@id="details"]/div/div[1]/div[2]/div[7]/div/div[2]')
        print(f"{n}. furnish = " + furnish[n - 1])
        description[n - 1] = read_if_present('//*[@id="details"]/div/div[2]')
        print(f"{n}. description = " + description[n - 1])
        feature[n - 1] = read_if_present('//*[@id="facilities"]')
        print(f"{n}. feature = " + feature[n - 1])
        image1[n - 1] = read_if_present(
            '//*[@id="carousel-photos"]/div[2]/div/div[1]/span/img/@src')
        print(f"{n}. image1 = " + image1[n - 1])
        image2[n - 1] = read_if_present(
            '//*[@id="carousel-photos"]/div[2]/div/div[2]/span/img/@src')
        print(f"{n}. image2 = " + image2[n - 1])
        image3[n - 1] = read_if_present(
            '//*[@id="carousel-photos"]/div[2]/div/div[3]/span/img/@src')
        print(f"{n}. image3 = " + image3[n - 1])
        pdf[n - 1] = read_if_present(
            '//*[@id="sticky-right-col"]/div[3]/a[2]/@href')
        pdf_link[n - 1] = 'https://www.propertyguru.com.sg' + pdf[n - 1]
        print(f"{n}. pdf_link = " + pdf_link[n - 1])
        id[n - 1] = read_if_present(
            '//*[@id="details"]/div/div[1]/div[2]/div[10]/div/div[2]')
        print(f"{n}. id = " + id[n - 1])

    property_info = {
        'property_title': property_title,
        'url': ['https://www.propertyguru.com.sg' + x for x in url],
        'type': type,
        'area': area,
        'total price': total,
        'price': price,
        'bedroom': bedroom,
        'bathroom': bathroom,
        'address': address,
        'postcode': postcode,
        'region': region,
        'floor': floor,
        'furnish': furnish,
        'description': description,
        'feature': feature,
        'image1': image1,
        'image2': image2,
        'image3': image3,
        'id': id,
        'pdf_link': pdf_link,
    }

    df = DataFrame(property_info,
                   columns=[
                       'property_title', 'id', 'pdf_link', 'type', 'area',
                       'total price', 'price', 'bedroom', 'bathroom',
                       'address', 'postcode', 'region', 'floor', 'furnish',
                       'description', 'feature', 'url', 'image1', 'image2',
                       'image3'
                   ])

    df.to_excel('property_info.xlsx', encoding='utf8', index=None)
    print('======== property_info.xlsx saved ==========')

    # from propertySearcher_util import download_image
    download_image(id, image1, image2, image3)

    # from propertySearcher_util import classify_image
    filtered_id, filtered_cluster = classify_image(df, prefer1, prefer2,
                                                   prefer3)

    print(df)
    # generate image filtered df, sorted by filtered_id
    filtered_df = df[df['id'].isin(filtered_id)]
    # write image cluster column into df
    filtered_df['image'] = filtered_cluster
    print(filtered_df)
    # save to excel
    filtered_df.to_excel('property_info_image.xlsx',
                         encoding='utf8',
                         index=None)

    print('======== generate data for pdf downloader ==========')
    property_title = filtered_df[
        'property_title']  # filtered_df = pd.read_excel('property_info_filtered.xlsx')
    print(list(property_title))
    pdf_link = filtered_df['pdf_link']
    print(list(pdf_link))
    pdf_id = filtered_df['id']
    print(list(pdf_id))

    # from propertySearcher_util import download_pdf
    pdf_filename = download_pdf(
        property_title, pdf_link, pdf_id
    )  #  pdf_filename =  property_title + pdf_id, pdf_filename for email attachment

    # from propertySearcher_util import classify_text
    features_selected = classify_text(filtered_df, 3, 6)
    # edit dataframe
    filtered_df['Key Features'] = features_selected
    filtered_df = filtered_df.drop(columns=[
        'pdf_link', 'description', 'feature', 'image1', 'image2', 'image3'
    ])
    # save to excel
    filtered_df.to_excel('Property_info_text.xlsx',
                         encoding='utf8',
                         index=None)

    # from propertySearcher_util import edit_excel
    edit_excel('Property_info_text.xlsx')
    print('============ excel saved ============')

    # from propertySearcher_util import mail_shortlist
    mail_shortlist(input_email, input_name, pdf_filename)
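read_if_present, used throughout this example, guards against elements that are missing on some listings. A sketch consistent with its usage, always returning a string so the print concatenations above never raise:

# Assumed helper: '' for absent elements keeps '"x = " + value' safe.
def read_if_present(selector):
    return t.read(selector) if t.present(selector) else ''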
Example #19
logging.basicConfig(filename = "log.txt")
srcDirectory = "OrgImage"
t.init(visual_automation = True)


for target in findAllFile(srcDirectory):
    target_image = 'OrgImage/' + target
    t.url('https://www.bing.com')
    t.click('//div[@id="sb_sbi"]/img')
    t.upload("input.fileinput",target_image)
    t.wait(3)

    succDownload = False

    image_nums = t.count('//a[@class="richImgLnk"]')
    print(image_nums)

    if t.click('//li[contains(string(),"Pages")]') == False:
        image_nums = 0
    t.wait(3)


    for i in range(1, image_nums):
        if t.click(f'(//a[@class="richImgLnk"])[{i}]'):

            t.wait(3)
            t.keyboard('[ctrl]l')
            t.keyboard('[ctrl]c')
            imgUrl = t.clipboard()
            print(imgUrl)
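findAllFile is presumably a small directory walker that yields bare filenames, matching how the loop prefixes each result with 'OrgImage/'. A hypothetical sketch:

# Hypothetical sketch of findAllFile: yield plain filenames under base.
import os

def findAllFile(base):
    for _root, _dirs, files in os.walk(base):
        for name in files:
            yield name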
Example #20
def history_data_daily(url_prefix):
    t.init()
    init_url = url_prefix + '1.html'
    t.url(init_url)  # initial url
    max_page = int(t.read(element_identifier='//td[@class = "Normal"]').split('/')[1]) + 1  # max page count
    for page_num in range(1, max_page):
        t.url(url_prefix + str(page_num) + '.html')
        print("现在所在页面 {}".format(page_num))
        t.wait(5)
        # 拿到value
        count_values = t.count(element_identifier='//td[@colspan = "2"]//table') + 1
        today = datetime.datetime.today()
        today = str(today.date())
        # today = '2018-04-24'
        if t.read(element_identifier='//td[@colspan = "2"]//table[1]//span[@class = "hui12"]') < today:
            print("今日无增量")
            break
        print("页面有{}个文件".format(count_values - 1))
        t.wait(5)
        for i in range(1, count_values):
            t.url(url_prefix + str(page_num) + '.html')
            if t.read(element_identifier='//td[@colspan = "2"]//table['+str(i)+']//span[@class = "hui12"]') < today:
                t.close()
                exit(1)
            file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']')
            file_name = file_name[:-10] + str("_") + file_name[-10:] + str('.txt')
            time = file_name[-14:-4]
            prefix = 'http://www.pbc.gov.cn'
            content_url = prefix + t.read(
                element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
            if '.html' not in t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href'):
                # when the link jumps straight to a downloadable file
                if 'cnhttp' in content_url:
                    content_url = content_url[21:]  # not sure why this happens
                    # got the data
                    print("File {} is a direct download.".format(i))
                    file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
                    suffix = file_name.split('.')[-1]

                    file_name = file_name.split('/')[-1]

                    t.url(content_url)
                    wait_seconds = 1
                    total_seconds = 0
                    while os.path.exists(file_name) == False:
                        t.wait(wait_seconds)
                        total_seconds += wait_seconds
                        if total_seconds > 30:
                            print('download fails')
                            break

                    os.rename(file_name, file_name[:-(len(suffix)+1)] + "_" + time +'.'+file_name[-(len(suffix)+1):])
                else:
                    # got the data
                    print("File {} is a direct download.".format(i))
                    file_name = t.read(element_identifier='//td[@colspan = "2"]//table[' + str(i) + ']//a/@href')
                    suffix = file_name.split('.')[-1]
                    file_name = file_name.split('/')[-1]
                    t.url(content_url)
                    wait_seconds = 1
                    total_seconds = 0
                    while os.path.exists(file_name) == False:
                        t.wait(wait_seconds)
                        total_seconds += wait_seconds
                        if total_seconds > 30:
                            print('download fails')
                            break
                    os.rename(file_name, file_name[:-(len(suffix)+1)] + "_" + time +'.'+file_name[-(len(suffix)+1):])

            else:  # when there is no direct download
                if 'cnhttp' in content_url:
                    content_url = content_url[21:]  # not sure why this happens
                    t.url(content_url)
                else:
                    t.url(content_url)
                # get the number of pdfs, their names, and the names they should have
                t.wait(2)
                pdf_count = t.count(element_identifier='//div[@id = "zoom"]//a/@href')
                if pdf_count == 0:  # if it is a normal text file
                    # got the list
                    print("File {} is a document.".format(i))
                    # get the text
                    if t.read(element_identifier='//div[@id = "zoom"]') != '':
                        text = t.read(element_identifier='//div[@id = "zoom"]')
                        with open(file_name, 'w') as f:
                            f.write(text)
                    elif t.read(element_identifier='//td[@class = "p1"]') != '':
                        text = t.read(element_identifier='//td[@class = "p1"]')
                        with open(file_name, 'w') as f:
                            f.write(text)
                    else:
                        print("write files fails...")
                else:
                    # get the text
                    if t.read(element_identifier='//div[@id = "zoom"]') != '':
                        text = t.read(element_identifier='//div[@id = "zoom"]')
                        with open(file_name, 'w') as f:
                            f.write(text)
                    elif t.read(element_identifier='//td[@class = "p1"]') != '':
                        text = t.read(element_identifier='//td[@class = "p1"]')
                        with open(file_name, 'w') as f:
                            f.write(text)
                    else:
                        print("write files fails...")
                    print("文件{} 含有 {} 个文件要下载。".format(i, pdf_count))
                    pdf_count += 1  # python从0开始,所以至少有一个pdf count
                    current_count = 0
                    for j in range(1, pdf_count):
                        # get the pdf name
                        if '.htm' not in t.read(element_identifier='//div[@id = "zoom"]//p//a/@href'):
                            print("Currently on file {}.".format(j))
                            p_count = t.count(element_identifier='//div[@id = "zoom"]//p')
                            while current_count <= p_count:

                                if t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a') != '':
                                    # we got one
                                    print("This <p> has a link!")
                                    pdf_name = t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a/@href')
                                    # get the compliant file name
                                    pdf_name_to_change = t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a')
                                    # download
                                    suffix = pdf_name.split('.')[-1]

                                    pdf_name = pdf_name.split('/')[-1]
                                    prefix = 'http://www.pbc.gov.cn'
                                    download_link = prefix + t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a/@href')
                                    if 'cnhttp' in download_link:
                                        t.url(t.read(element_identifier='//div[@id = "zoom"]//p[last()-' + str(current_count) + ']//a/@href'))
                                    else:
                                        t.url(download_link)
                                    wait_seconds = 1
                                    total_seconds = 0
                                    while os.path.exists(pdf_name) == False:
                                        t.wait(wait_seconds)
                                        total_seconds += wait_seconds
                                        if total_seconds > 30:
                                            print('download fails')
                                            break
                                    os.rename(pdf_name, pdf_name_to_change)  # rename
                                    os.rename(pdf_name_to_change,
                                              pdf_name_to_change[:-(len(suffix)+1)] + '_' + time + pdf_name_to_change[-(len(suffix)+1):])
                                    t.url(content_url)  # return to the second-level directory
                                    current_count += 1
                                    break
                                else:
                                    current_count += 1
                                    print("这个p没有")

                        else:
                            print("是个网页,当文档处理!")
                            prefix = 'http://www.pbc.gov.cn'
                            download_link = prefix + t.read(
                                element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a/@href')
                            if 'cnhttp' in download_link:
                                t.url(t.read(element_identifier='//div[@id = "zoom"]//p[' + str(j) + ']//a/@href'))
                            else:
                                t.url(download_link)
                            # get the text
                            if t.read(element_identifier='//div[@id = "zoom"]') != '':
                                text = t.read(element_identifier='//div[@id = "zoom"]')
                                with open(file_name, 'w') as f:
                                    f.write(text)
                            elif t.read(element_identifier='//td[@class = "p1"]') != '':
                                text = t.read(element_identifier='//td[@class = "p1"]')
                                with open(file_name, 'w') as f:
                                    f.write(text)
                            else:
                                print("write files fails...")

    t.close()
Example #21
def getdailyincrement(str_to_append):

    #初始化页面
    t.init()
    #输入url进入
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    #等5秒反应
    t.wait(15)
    #鼠标放上去,点击精简选项
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    #鼠标移动到发行日期上,点击文本栏,输入发行日日期为今日,点击搜索
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=str_to_append)
    #再点击,确保日期不会遮住底下的搜索按钮
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    #把展示的尺寸设置为50个产品每页:
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')

    # while the next-page button is not disabled, these loop variables apply
    page_curr = 1  # current page index
    value_dict = {}  # holds the scraped data
    count = 1  # used to number the csv files
    # column names (serial number, overall rating, url)
    name_list = ['序号', '综合评级', 'url']

    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialise empty columns

    # loop while paging is possible, or while we are still on page 1
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):

        # number of rows on this page
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1  # +1 so range(1, count_values) covers every row
        # if even the bottom row's issue date is later than the target date, skip to the next page
        if str(
                t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                       str(count_values - 1) +
                       ']//td[@class = "px"]')) > str_to_append:
            # print("direct continue..")
            # turn the page
            page_curr += 1
            # simulate a mouse move, then click to the next page
            t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
            t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
            continue
        filename = str(count) + "daily_data.csv"
        count += 1
        t.wait(1)  # wait 1 second in case the page mis-loads
        t.table(
            element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table',
            filename_to_save=filename)
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1  # +1 so range(1, count_values) covers every row
        for i in range(1, count_values):
            # rule: if the product was issued today, take its data from the main page;
            # if even the bottom row is dated after today, just turn the page
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(count_values - 1) +
                           ']//td[@class = "px"]')) > str_to_append:
                # print("direct break..")
                break
            else:
                if str(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) +
                            ']//td[@class = "px"]')) == str_to_append:
                    # serial number
                    value_dict[name_list[0]].append(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) + ']/td[2]'))
                    # overall rating
                    value_dict[name_list[1]].append(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) + ']//td[12]//i/@title'))
                    #url
                    value_dict[name_list[2]].append(
                        t.read(
                            element_identifier='//tbody[@id = "content"]//tr['
                            + str(i) + ']//a/@href'))
                else:  # not part of today's increment; do nothing
                    pass
        # print("turn the page..")
        # turn the page
        page_curr += 1
        # simulate a mouse move, then click to the next page
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')

    # close the tagui flow
    t.close()
    # output format: "<today's date>.csv"
    today_data = pd.DataFrame(value_dict)
    today_data.to_csv(str_to_append + ".csv", index=False, encoding='UTF-8')
    return count - 1
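
# The while-condition in getdailyincrement() repeats across these jrj.com.cn
# examples: paging continues while the footer's disabled "current page" anchor
# still reads page_curr. A minimal sketch of that check and of the page turn,
# assuming the same footer markup and that t is the tagui module imported by
# these scripts; on_expected_page() and turn_page() are hypothetical helpers,
# not part of the original code.
def on_expected_page(page_curr):
    """True while the disabled footer anchor shows the page we navigated to."""
    cur = t.read(element_identifier=
                 '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
    return cur == str(page_curr)

def turn_page(page_curr):
    """Hover over and click the pager link whose @href is the next page number."""
    page_curr += 1
    t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
    t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
    return page_curr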
示例#22
0
def main_operation(url, mode='txt'):
    # current page number
    curr_page = int(
        t.read(element_identifier='//div[@class = "ng-binding"][last()]').
        split('/')[0])
    # count the list blocks
    list_count = t.count(
        element_identifier='//div[@class = "list caidan-right-list"]'
    )  # how many lists there are in total
    # if resuming from a checkpoint, read the saved state
    if os.path.exists('baojianhui_log.txt'):
        with open('baojianhui_log.txt', 'r') as f:
            params = f.read().split(',')
        curr_page = params[0]
        start_i = params[1]
        start_j = params[2]
    else:  # first run: use the initial values
        start_i = 1
        start_j = 1
    # regular processing
    for i in range(1, list_count + 1):
        t.wait(5)
        if i < int(start_i):
            continue
        item_count = t.count(
            element_identifier='//div[@class = "list caidan-right-list"][' +
            str(i) +
            ']//div[@class = "panel-row ng-scope"]')  # 取出每个list里的具体法规有几条
        print('当前是list {}, 里面的元素有 {} 个'.format(str(i), str(item_count)))
        t.wait(5)
        for j in range(1, item_count + 1):
            if j < int(start_j):
                continue
            item_title = t.read(
                element_identifier='//div[@class = "list caidan-right-list"]['
                + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//a')
            time_suffix = t.read(
                element_identifier='//div[@class = "list caidan-right-list"]['
                + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//span[@class = "date ng-binding"]')
            if str(time_suffix) != str((datetime.datetime.today()).date(
            )):  # if the item is not dated today, return immediately
                print("today's increment is complete")
                return True, 'no increment today'
            file_name = item_title + '_' + time_suffix + '.txt'
            if '/' in file_name:
                file_name = file_name.replace('/', ' ')
            if mode == 'txt':
                # follow the item link
                link = t.read(element_identifier=
                              '//div[@class = "list caidan-right-list"][' +
                              str(i) +
                              ']//div[@class = "panel-row ng-scope"][' +
                              str(j) + ']//a/@ng-href')
                prefix = 'http://www.cbirc.gov.cn/cn/view/pages/'
                final_link = prefix + link
                t.url(final_link)
                t.wait(1)
                while not os.path.exists(file_name):
                    # type_1 = t.read(element_identifier='//div[@class = "Section0"]') + t.read(element_identifier='//div[@class = "Section1"]')
                    # type_2 = t.read(element_identifier='//div[@class = "WordSection1"]')
                    # type_3 = t.read(element_identifier='//div[@class = "wenzhang-content ng-binding"]')
                    if t.read(
                            element_identifier='//div[@class = "Section0"]'
                    ) + t.read(element_identifier='//div[@class = "Section1"]'
                               ) != '':
                        # Section0 paragraphs
                        p_counts_section0 = t.count(
                            element_identifier='//div[@class = "Section0"]//p')
                        content_list = []
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(file_name.split("_")[0] + "\n")
                        for p in range(1, p_counts_section0 + 1):
                            content_list.append(
                                t.read(element_identifier=
                                       '//div[@class = "Section0"]//p[' +
                                       str(p) + ']'))
                        with open(file_name, 'a', encoding='utf-8') as f:
                            f.writelines(
                                [content + "\n" for content in content_list])
                            # Section1 paragraphs
                        p_counts_section1 = t.count(
                            element_identifier='//div[@class = "Section1"]//p')
                        content_list = []
                        for p in range(1, p_counts_section1 + 1):
                            content_list.append(
                                t.read(element_identifier=
                                       '//div[@class = "Section1"]//p[' +
                                       str(p) + ']'))
                        with open(file_name, 'a', encoding='utf-8') as f:
                            f.writelines(
                                [content + "\n" for content in content_list])
                        break
                    elif t.read(
                            element_identifier='//div[@class = "WordSection1"]'
                    ) != '':
                        p_counts = t.count(element_identifier=
                                           '//div[@class = "WordSection1"]//p')
                        if p_counts <= 1:
                            content_list = t.read(
                                element_identifier=
                                '//div[@class = "WordSection1"]//p')
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([
                                    content + "\n"
                                    for content in content_list.split("     ")
                                ])
                        else:
                            content_list = []
                            for p in range(1, p_counts + 1):
                                content_list.append(
                                    t.read(
                                        element_identifier=
                                        '//div[@class = "WordSection1"]//p[' +
                                        str(p) + ']'))
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([
                                    content + "\n" for content in content_list
                                ])
                        break
                    elif t.read(element_identifier=
                                '//div[@class = "wenzhang-content ng-binding"]'
                                ) != '':
                        # more than one <p>: read each paragraph
                        # at most one <p>: split the text instead
                        p_counts = t.count(
                            element_identifier=
                            '//div[@class = "wenzhang-content ng-binding"]//p')
                        if p_counts <= 1:
                            content_list = t.read(
                                element_identifier=
                                '//div[@class = "wenzhang-content ng-binding"]//p'
                            )
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([
                                    content + "\n"
                                    for content in content_list.split("     ")
                                ])
                        else:
                            content_list = []
                            for p in range(1, p_counts + 1):
                                content_list.append(
                                    t.read(
                                        element_identifier=
                                        '//div[@class = "wenzhang-content ng-binding"]//p['
                                        + str(p) + ']'))
                            with open(file_name, 'w', encoding='utf-8') as f:
                                f.write(file_name.split("_")[0] + "\n")
                                f.writelines([
                                    content + "\n" for content in content_list
                                ])
                        break
                    else:
                        content = ' '
                        with open(file_name, 'w') as f:
                            f.write(content)
                        break
            elif mode == 'doc':
                t.click(element_identifier=
                        '//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.docFileUrl)"]')
                doc_id = t.read(element_identifier=
                                '//div[@class = "list caidan-right-list"][' +
                                str(i) +
                                ']//div[@class = "panel-row ng-scope"][' +
                                str(j) + ']//a/@ng-href').split('=')[1][:-7]
                doc_name = doc_id + '.doc'
                curr_clock = 5
                while not os.path.exists(doc_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(5)
                os.rename(doc_name, item_title + '_' + time_suffix + '.doc')
            elif mode == 'pdf':
                t.click(element_identifier=
                        '//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.pdfFileUrl)"]')
                pdf_id = t.read(element_identifier=
                                '//div[@class = "list caidan-right-list"][' +
                                str(i) +
                                ']//div[@class = "panel-row ng-scope"][' +
                                str(j) + ']//a/@ng-href').split('=')[1][:-7]
                pdf_name = pdf_id + '.pdf'
                curr_clock = 5
                while not os.path.exists(pdf_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(5)
                os.rename(pdf_name, item_title + '_' + time_suffix + '.pdf')
            else:
                print('unknown format..')
                t.close()
                raise Exception("unknown input mode")
            # return to the main page
            t.url(url + str(curr_page))
            t.wait(5)
            with open('baojianhui_log.txt', 'w') as f:
                f.write(str(curr_page) + ',' + str(i) + ',' + str(j))
        with open('baojianhui_log.txt', 'w') as f:
            f.write(str(curr_page) + ',' + str(i) + ',' +
                    str(1))  # current list finished; reset j
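
# main_operation() persists "page,i,j" to baojianhui_log.txt so a crashed run can
# resume where it stopped. A compact sketch of that checkpoint pattern, returning
# ints to avoid the str/int mix in the original; load_checkpoint() and
# save_checkpoint() are hypothetical helpers, not part of the original code.
import os

def load_checkpoint(path='baojianhui_log.txt', default=(1, 1, 1)):
    """Return (page, i, j) from the log file, or the defaults on a first run."""
    if not os.path.exists(path):
        return default
    with open(path, 'r', encoding='utf-8') as f:
        page, i, j = f.read().split(',')
    return int(page), int(i), int(j)

def save_checkpoint(page, i, j, path='baojianhui_log.txt'):
    """Persist the current position after every processed item."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write('{},{},{}'.format(page, i, j))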
示例#23
0
def getExpFlightPrice(airline, dep_ref, dur_ref):
    print(airline)
    print(dep_ref)
    print(dur_ref)
    util.wait_for_pageload('//input[@class="filter-checkbox"]')

    t.wait(3)
    t.click('//a[@data-content-id="airlineToggleContainer"]')

    for i in range(len(dep_ref)):
        if i == 0:
            if t.present(f'//input[@id="airlineRowContainer_{airline[i]}"]'):
                t.wait(3)
                t.click(f'//input[@id="airlineRowContainer_{airline[i]}"]')
            else:
                print('Not match')
                return 0, ''

        elif airline[i] != airline[i-1]:
            if t.present(f'//input[@id="airlineRowContainer_{airline[i]}"]'):
                t.wait(1)
                t.click(f'//input[@id="airlineRowContainer_{airline[i]}"]')
            else:
                print('Not match')
                return 0, ''

        if dep_ref[i][0] == '0':
            dep_ref[i] = dep_ref[i][1:]

        if dur_ref[i][-1:] == 'h':
            dur_ref[i] = dur_ref[i] + ' 0m'
        else:
            dur_ref[i] = dur_ref[i] + 'm'


    print(airline)
    print(dep_ref)
    print(dur_ref)

    util.wait_for_pageload('//button[@data-test-id="select-button"]')
    t.wait(5)
    for i in range(t.count(f'//ul[@id="flightModuleList"]//li')):
        i = i + 1
        print(i)
        dep = t.read(f'(//span[@class="medium-bold"]//span[@data-test-id="departure-time"])[{i}]')
        if len(dur_ref) == 1:
            if dep == dep_ref[0]:
                print('dep OK')
                dur = t.read(f'(//span[@data-test-id="duration"])[{i}]')
                t.click(f'(//button[@data-test-id="select-button"])[{i}]')
                t.wait(5)
                if t.present('//a[@id="forcedChoiceNoThanks"]'):
                    t.click(f'//a[@id="forcedChoiceNoThanks"]')
                t.wait(5)
                for x in range(5):
                    print(x)
                    if t.popup('Flight-Information?'):
                        break
                    else:
                        t.wait(5)
                price = t.read(f'(//span[@class="packagePriceTotal"])[2]')
                price = float(price.replace(',', '').replace('SG', '').replace('$', '').replace(' ', ''))
                print(price)
                url = t.url()
                return price, url
            else:
                return 0, ''

        elif len(dur_ref) == 2:
            print('trip', len(dur_ref))
            if dep == dep_ref[0]:
                print('dep OK')
                dur = t.read(f'(//span[@data-test-id="duration"])[{i}]')

                t.click(f'(//button[@data-test-id="select-button"])[{i}]')
                t.wait(5)

                util.wait_for_pageload('//button[@data-test-id="select-button"]')
                t.click(f'//input[@id="airlineRowContainer_{airline[1]}"]')
                t.wait(2)
                for j in range(t.count(f'//ul[@id="flightModuleList"]//li')):
                    j = j + 1
                    print(j)
                    dep = t.read(f'(//span[@data-test-id="departure-time"])[{j}+1]')
                    if dep == dep_ref[1]:
                        print('return dep ok')
                        dur = t.read(f'(//span[@data-test-id="duration"])[{j}+1]')

                        if dur == dur_ref[1]:
                            t.click(f'(//button[@data-test-id="select-button"])[{j}]')
                            t.wait(5)
                            if t.present('//a[@id="forcedChoiceNoThanks"]'):
                                t.click(f'//a[@id="forcedChoiceNoThanks"]')
                            t.wait(5)
                            for x in range(5):
                                print(x)
                                if t.popup('Flight-Information?'):
                                    break
                                else:
                                    t.wait(5)
                            util.wait_for_pageload('//h1[@class="section-header-main"]')
                            price = t.read(f'(//span[@class="packagePriceTotal"])[2]')
                            price = float(price.replace(',', '').replace('SG', '').replace('$', '').replace(' ', ''))
                            print(price)
                            url = t.url()
                            print(url)
                            return price, url
            else:
                return 0, ''

        elif len(dur_ref) >= 3:
            dep_lst = []
            dur_lst = []
            print('multi-trip ', len(dur_ref))
            for k in range(len(dur_ref)):  # the 3*i stride assumes three legs per result card
                dep_lst.append(t.read(f'(//span[@data-test-id="departure-time"])[{3*i+k+1}]'))
                dur_lst.append(t.read(f'(//span[@data-test-id="duration"])[{3*i+k+1}]'))
            print(dep_lst)
            print(dep_ref)
            if dep_lst == dep_ref:
                print(dur_lst)
                print(dur_ref)
                if dur_lst == dur_ref:
                    t.click(f'(//button[@data-test-id="select-button"])[{j}]')
                    t.wait(5)
                    if t.present('//a[@id="forcedChoiceNoThanks"]'):
                        t.click(f'//a[@id="forcedChoiceNoThanks"]')
                    t.wait(5)
                    for x in range(5):
                        print(x)
                        if t.popup('Flight-Information?'):
                            break
                        else:
                            t.wait(5)
                    price = t.read(f'(//span[@class="packagePriceTotal"])[2]')
                    price = float(price.replace(',', '').replace('SG', '').replace('$', '').replace(' ', ''))
                    print(price)
                    url = t.url()
                    print(url)
                    return price, url
            else:
                return 0, ''
示例#24
0
def gethistorylist(input):
    # fetch the data for year xxxx
    input = str(input)
    date_start = input + '-08-01'  # start of the date range (here a partial year)
    date_end = input + '-12-31'  # end of the date range

    # initialise the page
    t.init()
    # navigate to the url
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # wait 5 seconds for the page to load
    t.wait(5)
    # hover over the issue-date field, then click the "collapse options" toggle
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    # move to the issue-date field, click the text box, type the start date, then search
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=date_start)
    # click again so the date picker does not cover the search button below
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # set the display size to 50 products per page:
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # sort ascending by issue date, i.e. fetch in reverse order
    t.hover(element_identifier='//*[@data-sort = "sell_org_date"]')
    t.click(element_identifier='//*[@data-sort = "sell_org_date"]')

    # while the next-page button is not disabled, these loop variables apply
    page_curr = 1  # current page index
    value_dict = {}  # holds the scraped data
    max_page = 1  # records the largest page number seen

    # column names (serial number, overall rating, url)
    name_list = ['序号', '综合评级', 'url']

    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialise empty columns

    # loop while paging is possible, or when there is only one page
    stop_flag = False

    # loop while the current page is not the last, or while we are still on page 1
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):
        if stop_flag:  # no data left for this year, so stop paging
            break
        max_page = page_curr
        # number of rows on this page
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1  # +1 so range(1, count_values) covers every row
        # dump the whole table on this page to csv
        filename = str(input) + str("_") + str(page_curr) + "history_data.csv"
        t.wait(1)  # wait 1 second in case the page mis-loads
        t.table(
            element_identifier='//div[@class = "table-s1 tab-s2 w100"]//table',
            filename_to_save=filename)
        # scrape the current page (title and href only)
        for i in range(1, count_values):
            # rule: keep every row dated on or before this year's 12-31
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[@class = "px"]')) <= date_end:
                # print("number {} is running".format(str(i)))
                #爬取产品名称作为primary key,之后join用:
                # 产品序号
                value_dict[name_list[0]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[2]'))
                # overall rating
                value_dict[name_list[1]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[12]//i/@title'))
                # url
                value_dict[name_list[2]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//a/@href'))

            else:  # past this year's data: set the flag and end the while loop
                stop_flag = True
                #    print("thread stops here..")
                break
        # turn the page
        page_curr += 1
        # print("turn the page..")
        # simulate a mouse move, then click to the next page
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')

    # close the tagui flow
    t.close()
    # output format: "<year>.csv"
    hist_data = pd.DataFrame(value_dict)
    # csv output
    hist_data.to_csv(input + ".csv", encoding='UTF-8', index=False)
    return max_page
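
# gethistorylist() leaves one "<year>_<page>history_data.csv" per page (from
# t.table()) plus a "<year>.csv" holding ratings and urls keyed by serial number.
# A sketch of stitching them back together, assuming the serial-number column
# '序号' appears in both files; merge_year_data() is a hypothetical helper, not
# part of the original code.
import pandas as pd

def merge_year_data(year, max_page):
    """Concatenate the per-page table dumps and join on the rating/url columns."""
    pages = [pd.read_csv('{}_{}history_data.csv'.format(year, p))
             for p in range(1, max_page + 1)]
    table = pd.concat(pages, ignore_index=True)
    ratings = pd.read_csv(str(year) + '.csv')
    return table.merge(ratings, on='序号', how='left')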
示例#25
0
    value_dict = {}  # holds the scraped data
    count = 1  # used to number the csv files
    # column names (serial number, overall rating, url)
    name_list = ['序号', '综合评级', 'url']

    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialise empty columns

    # loop while paging is possible, or while we are still on page 1
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):

        # number of rows on this page
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1  # +1 so range(1, count_values) covers every row
        # if even the bottom row's issue date is later than the target date, skip to the next page
        if str(
                t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                       str(count_values - 1) +
                       ']//td[@class = "px"]')) > str_to_append:
            # print("direct continue..")
            # turn the page
            page_curr += 1
            # simulate a mouse move, then click to the next page
            t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
            t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
            continue
        filename = str(count) + "daily_data.csv"
        count += 1
示例#26
0
def main_operation(url, mode='txt'):
    # current page number
    curr_page = int(
        t.read(element_identifier='//div[@class = "ng-binding"][last()]').
        split('/')[0])
    # count the list blocks
    list_count = t.count(
        element_identifier='//div[@class = "list caidan-right-list"]'
    )  # how many lists there are in total
    # if resuming from a checkpoint, read the saved state
    if os.path.exists('baojianhui_log.txt'):
        with open('baojianhui_log.txt', 'r', encoding='utf-8') as f:
            params = f.read().split(',')
        curr_page = params[0]
        start_i = params[1]
        start_j = params[2]
    else:  # first run: use the initial values
        start_i = 1
        start_j = 1
    # regular processing
    for i in range(1, list_count + 1):
        t.wait(3)
        if i < int(start_i):
            continue
        item_count = t.count(
            element_identifier='//div[@class = "list caidan-right-list"][' +
            str(i) +
            ']//div[@class = "panel-row ng-scope"]')  # 取出每个list里的具体法规有几条
        print('当前是list {}, 里面的元素有 {} 个'.format(str(i), str(item_count)))
        t.wait(3)
        for j in range(1, item_count + 1):
            if j < int(start_j):
                continue
            item_title = t.read(
                element_identifier='//div[@class = "list caidan-right-list"]['
                + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//a')
            time_suffix = t.read(
                element_identifier='//div[@class = "list caidan-right-list"]['
                + str(i) + ']//div[@class = "panel-row ng-scope"][' + str(j) +
                ']//span[@class = "date ng-binding"]')
            file_name = item_title + '_' + time_suffix + '.txt'
            if '/' in file_name:
                file_name = file_name.replace('/', ' ')
            if mode == 'txt':
                # follow the item link
                link = t.read(element_identifier=
                              '//div[@class = "list caidan-right-list"][' +
                              str(i) +
                              ']//div[@class = "panel-row ng-scope"][' +
                              str(j) + ']//a/@ng-href')
                prefix = 'http://www.cbirc.gov.cn/cn/view/pages/'
                final_link = prefix + link
                t.url(final_link)
                t.wait(1)
                while not os.path.exists(file_name):
                    type_1 = t.read(
                        element_identifier='//div[@class = "Section0"]'
                    ) + t.read(element_identifier='//div[@class = "Section1"]')
                    type_2 = t.read(
                        element_identifier='//div[@class = "WordSection1"]')
                    type_3 = t.read(
                        element_identifier=
                        '//div[@class = "wenzhang-content ng-binding"]')
                    if type_1 != '':
                        content = type_1
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(content)
                        break
                    elif type_2 != '':
                        content = type_2
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(content)
                        break
                    elif type_3 != '':
                        content = type_3
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(content)
                        break
                    else:
                        content = ' '
                        with open(file_name, 'w', encoding='utf-8') as f:
                            f.write(content)
                        break
            elif mode == 'doc':
                t.click(element_identifier=
                        '//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.docFileUrl)"]')
                doc_id = t.read(element_identifier=
                                '//div[@class = "list caidan-right-list"][' +
                                str(i) +
                                ']//div[@class = "panel-row ng-scope"][' +
                                str(j) + ']//a/@ng-href').split('=')[1][:-7]
                doc_name = doc_id + '.doc'
                curr_clock = 5
                while not os.path.exists(doc_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(2)
                os.rename(doc_name, item_title + '_' + time_suffix + '.doc')
            elif mode == 'pdf':
                t.click(element_identifier=
                        '//div[@class = "list caidan-right-list"][' + str(i) +
                        ']//div[@class = "panel-row ng-scope"][' + str(j) +
                        ']//a[@ng-click = "fileDownload(x.pdfFileUrl)"]')
                pdf_id = t.read(element_identifier=
                                '//div[@class = "list caidan-right-list"][' +
                                str(i) +
                                ']//div[@class = "panel-row ng-scope"][' +
                                str(j) + ']//a/@ng-href').split('=')[1][:-7]
                pdf_name = pdf_id + '.pdf'
                curr_clock = 5
                while not os.path.exists(pdf_name):
                    t.wait(curr_clock)
                    curr_clock += 5
                    if curr_clock > MAX_WAIT:
                        break
                t.wait(2)
                os.rename(pdf_name, item_title + '_' + time_suffix + '.pdf')
            else:
                print('unknown format..')
                t.close()
                raise Exception("unknown input mode")
            # return to the main page
            t.url(url + str(curr_page))
            t.wait(2)
            with open('baojianhui_log.txt', 'w', encoding='utf-8') as f:
                f.write(str(curr_page) + ',' + str(i) + ',' + str(j))
        with open('baojianhui_log.txt', 'w', encoding='utf-8') as f:
            f.write(str(curr_page) + ',' + str(i) + ',' +
                    str(1))  # current list finished; reset j
示例#27
0
def read_text_content(content_url, file_name, page_num, i, time, url_prefix):
    # load the web page

    if 'cnhttp' in content_url:
        content_url = content_url[21:]  # not sure why it errors without this
        t.url(content_url)
        # page load is slow
    else:
        t.url(content_url)
        # page load is slow
    # get the pdf count, the pdf's current name, and the name it should have
    t.wait(2)
    pdf_count = t.count(element_identifier='//div[@id = "zoom"]//a/@href')
    if pdf_count == 0:  # a normal text document
        # got the listing
        print("file {} is a plain document.".format(i))
        # read the text
        if t.read(element_identifier='//div[@id = "zoom"]') != '':
            text = t.read(element_identifier='//div[@id = "zoom"]')
            try:
                with open(file_name, 'w', encoding='utf-8') as f:
                    f.write(text)
            except:
                with open('实施《全国企业兼并破产和职工再就业工作计划》银行呆、坏帐准备金核销办法_1997-10-01.txt',
                          'w',
                          encoding='utf-8') as f:
                    f.write(text)
        elif t.read(element_identifier='//td[@class = "p1"]') != '':
            text = t.read(element_identifier='//td[@class = "p1"]')
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
        else:
            with open('wrong_log.txt', 'a', encoding='utf-8') as f:
                string = "page {} doc {} was not written ".format(page_num, i)
                f.write(string)
                f.write("\n")
            print("failed to write file...")
    else:
        # read the text
        if t.read(element_identifier='//div[@id = "zoom"]') != '':
            text = t.read(element_identifier='//div[@id = "zoom"]')
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
        elif t.read(element_identifier='//td[@class = "p1"]') != '':
            text = t.read(element_identifier='//td[@class = "p1"]')
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(text)
        else:
            with open('wrong_log.txt', 'a', encoding='utf-8') as f:
                string = "page {} doc {} was not written ".format(page_num, i)
                f.write(string)
                f.write("\n")
            print("failed to write file...")
        print("文件{} 含有 {} 个文件要下载。".format(i, pdf_count))
        pdf_count += 1  # python从0开始,所以至少有一个pdf count
        current_count = 0
        for j in range(1, pdf_count):
            # 取pdf的名字
            if '.htm' not in t.read(
                    element_identifier='//div[@id = "zoom"]//p//a/@href'):
                print("当前是第{}个文件。。".format(j))
                p_count = t.count(element_identifier='//div[@id = "zoom"]//p')
                while current_count <= p_count:
                    try:
                        if t.read(element_identifier=
                                  '//div[@id = "zoom"]//p[last()-' +
                                  str(current_count) + ']//a') != '':
                            # a link was found
                            print("this <p> has a link!")
                            pdf_name = t.read(
                                element_identifier=
                                '//div[@id = "zoom"]//p[last()-' +
                                str(current_count) + ']//a/@href')
                            # get the display name to rename to
                            pdf_name_to_change = t.read(
                                element_identifier=
                                '//div[@id = "zoom"]//p[last()-' +
                                str(current_count) + ']//a')
                            # download
                            suffix = pdf_name.split('.')[-1]

                            pdf_name = pdf_name.split('/')[-1]
                            prefix = 'http://www.pbc.gov.cn'
                            download_link = prefix + t.read(
                                element_identifier=
                                '//div[@id = "zoom"]//p[last()-' +
                                str(current_count) + ']//a/@href')
                            if 'cnhttp' in download_link:
                                t.url(
                                    t.read(element_identifier=
                                           '//div[@id = "zoom"]//p[last()-' +
                                           str(current_count) + ']//a/@href'))
                                # page load is slow
                            else:
                                t.url(download_link)
                                # page load is slow
                            wait_seconds = 1
                            total_seconds = 0
                            while os.path.exists(pdf_name) == False:
                                t.wait(wait_seconds)
                                total_seconds += wait_seconds
                                if os.path.exists(pdf_name_to_change):
                                    break
                                if total_seconds > MAX_WAIT:
                                    print('download fails')
                                    with open('download_log.txt',
                                              'a',
                                              encoding='utf-8') as f:
                                        string = 'page {} doc {} file {} didnt download '.format(
                                            page_num, i, j)
                                        f.write(string)
                                        f.write("\n")
                                    break
                            if os.path.exists(pdf_name_to_change):
                                pass
                            else:
                                os.rename(pdf_name, pdf_name_to_change)  # rename to the display name
                                os.rename(
                                    pdf_name_to_change,
                                    pdf_name_to_change[:-(len(suffix) + 1)] +
                                    '_' + time +
                                    pdf_name_to_change[-(len(suffix) + 1):])
                            t.url(content_url)  # go back to the second-level page
                            # page load is slow
                            current_count += 1
                            break
                        else:
                            current_count += 1
                            print("no link in this <p>")
                    except:
                        print('some error occurred, never mind')
                        continue

            else:
                print("It is a web page; treat it as a document!")
                prefix = 'http://www.pbc.gov.cn'
                download_link = prefix + t.read(
                    element_identifier='//div[@id = "zoom"]//p[' + str(j) +
                    ']//a/@href')
                if 'cnhttp' in download_link:
                    t.url(
                        t.read(element_identifier='//div[@id = "zoom"]//p[' +
                               str(j) + ']//a/@href'))
                    # page load is slow
                else:
                    t.url(download_link)
                    # page load is slow
                # read the text
                if t.read(element_identifier='//div[@id = "zoom"]') != '':
                    text = t.read(element_identifier='//div[@id = "zoom"]')
                    with open(file_name, 'w', encoding='utf-8') as f:
                        f.write(text)
                elif t.read(element_identifier='//td[@class = "p1"]') != '':
                    text = t.read(element_identifier='//td[@class = "p1"]')
                    with open(file_name, 'w', encoding='utf-8') as f:
                        f.write(text)
                else:
                    with open('wrong_log.txt', 'a', encoding='utf-8') as f:
                        string = "page {} doc {} was not written ".format(
                            page_num, i)
                        f.write(string)
                        f.write("\n")
                    print("failed to write file...")
示例#28
0
def gethistorylist(inputyear):
    # fetch the data for year xxxx
    input = inputyear
    date_start = input + '-01-01'  # start of the year
    date_end = input + '-12-31'  # end of the year

    # initialise the page
    t.init()
    # navigate to the url
    t.url("http://bank.jrj.com.cn/bankpro/data.shtml?type=1")
    # hover over the issue-date field, then click the "collapse options" toggle
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="zksq"]')
    # move to the issue-date field, click the text box, type the start date, then search
    t.hover(element_identifier='//*[@id="fxr"]')
    t.click(element_identifier='//*[@id="fxr"]')
    t.type(element_identifier='//*[@id="fxr"]', text_to_type=date_start)
    # click again so the date picker does not cover the search button below
    t.click(element_identifier='//*[@id="fxr"]')
    t.hover(element_identifier='//*[@class="ipf01"]')
    t.click(element_identifier='//*[@class="ipf01"]')
    # set the display size to 50 products per page:
    t.hover(element_identifier='//*[@data-pagesize="50"]')
    t.click(element_identifier='//*[@data-pagesize="50"]')
    # sort ascending by issue date, i.e. fetch in reverse order
    t.hover(element_identifier='//*[@data-sort = "sell_org_date"]')
    t.click(element_identifier='//*[@data-sort = "sell_org_date"]')
    # while the next-page button is not disabled, these loop variables apply
    page_curr = 1  # current page index
    value_dict = {}  # holds the scraped data
    # column names (serial number, product name, issuing bank, currency, issue date,
    # end-of-sale date, term in days, expected yield, yield at maturity,
    # ratio vs. savings over the same period, overall rating, url)
    name_list = [
        '序号', '产品名称', '发行银行', '委托货币', '发行日', '停售日', '管理期(天)', '预期收益率', '到期收益率',
        '与同期储蓄比', '综合评级', 'url'
    ]
    for col_name in name_list:
        value_dict.setdefault(col_name, [])  # initialise empty columns
    # loop while paging is possible, or when there is only one page
    stop_flag = False
    # loop while the current page is not the last, or while we are still on page 1
    while (t.read(element_identifier=
                  '//div[@id = "pagefoot"]//a[@class = "cur pf-disabled"]')
           == str(page_curr)) or (page_curr == 1):
        if stop_flag:  # no data left for this year, so stop paging
            break
        # number of rows on this page
        count_values = int(
            t.count(element_identifier='//tbody[@id = "content"]//tr')
        ) + 1  # +1 so range(1, count_values) covers every row
        # scrape the current page
        for i in range(1, count_values):
            # rule: keep every row dated on or before this year's 12-31
            if str(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[@class = "px"]')) <= date_end:
                # serial number
                value_dict[name_list[0]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[2]'))
                # product name
                value_dict[name_list[1]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[3]'))
                # issuing bank
                value_dict[name_list[2]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[4]'))
                # currency
                value_dict[name_list[3]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[5]'))
                # issue date
                value_dict[name_list[4]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[6]'))
                # end-of-sale date
                value_dict[name_list[5]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[7]'))
                # term (days)
                value_dict[name_list[6]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[8]'))
                # expected yield
                value_dict[name_list[7]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[9]'))
                # yield at maturity
                value_dict[name_list[8]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[10]'))
                # ratio vs. savings over the same period
                value_dict[name_list[9]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']/td[11]'))
                # overall rating
                value_dict[name_list[10]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//td[12]//i/@title'))
                # url
                value_dict[name_list[11]].append(
                    t.read(element_identifier='//tbody[@id = "content"]//tr[' +
                           str(i) + ']//a/@href'))
            else:  # past this year's data: set the flag and end the while loop
                stop_flag = True
                print("thread stops here..")
                break
        # turn the page
        page_curr += 1
        print("turn the page..")
        # simulate a mouse move, then click to the next page
        t.hover(element_identifier='//*[@href="' + str(page_curr) + '"]')
        t.click(element_identifier='//*[@href="' + str(page_curr) + '"]')
    # close the tagui flow
    t.close()
    # output format: "<year>.csv"
    hist_data = pd.DataFrame(value_dict)
    # dual-format output (csv + xlsx)
    hist_data.to_csv(input + ".csv", index=False, encoding='UTF-8')
    hist_data.to_excel(input + ".xlsx", index=False)  # to_excel no longer accepts an encoding argument


#gethistorylist('2003')
示例#29
0
def get_shoe(shoe_name, g, email):
    """
    Get shoe details from jdsports.com.sg
    :param shoe_name: name of the shoe to search for
    :param g: gender of the subscriber
    :param email: email id of the subscriber
    :return: details, list of shoe details.
    """
    details = []
    t.init(visual_automation=True)
    t.url('https://www.jdsports.com.sg/')
    t.wait(5)
    final_command = shoe_name + " shoes" + '[enter]'
    t.keyboard('[esc]')
    t.type('//input[@id = "srchInput"]', final_command)
    #t.click('//input[@id ="srchButton"]')
    t.wait(3)

    if g == ' men':
        if t.read(
                '(//a[@data-e2e="plp-filterMenu-catItem"]/span)[contains(.,"Men")]'
        ):
            t.click('(//a[@data-e2e="plp-filterMenu-catItem"]/span)[1]')
            count = t.count(
                '//ul[@id="productListMain"]//li[@class="productListItem "]')
            t.wait(3)

            if count != 0:
                for i in range(1, min(count, 4)):  # reads at most the first three results
                    price = t.read(f'(//span[@class="pri"])[{i}]')
                    name = t.read(f'(//span[@class="itemTitle"])[{i}]')
                    img = t.read(
                        f'(//a[@class="itemImage"]/picture/img/@srcset)[{i}]')
                    link = "https://www.jdsports.com.sg" + t.read(
                        f'(//span[@class = "itemTitle"])[{i}]/a/@href')
                    details.append({
                        "email": email,
                        "name": name,
                        "price": price,
                        "img": img,
                        "Company": "JD",
                        "link": link
                    })
            else:
                details.append({
                    "email": email,
                    "name": "NA",
                    "price": "NA",
                    "img": "NA",
                    "Company": "JD",
                    "link": "NA"
                })

    elif g == ' women':
        if t.read(
                '(//a[@data-e2e="plp-filterMenu-catItem"]/span)[contains(.,"Women")]'
        ):
            t.click(
                '(//a[@data-e2e="plp-filterMenu-catItem"]/span)[.="Women"]')
            count = t.count(
                '//ul[@id="productListMain"]//li[@class="productListItem "]')
            t.wait(3)

            if count != 0:

                for i in range(1, min(count, 4)):
                    price = t.read(f'(//span[@class="pri"])[{i}]')
                    name = t.read(f'(//span[@class="itemTitle"])[{i}]')
                    img = t.read(
                        f'(//a[@class="itemImage"]/picture/img/@srcset)[{i}]')
                    link = "https://www.jdsports.com.sg" + t.read(
                        f'(//span[@class = "itemTitle"])[{i}]/a/@href')
                    details.append({
                        "email": email,
                        "name": name,
                        "price": price,
                        "img": img,
                        "Company": "JD",
                        "link": link
                    })
            else:
                details.append({
                    "email": email,
                    "name": "NA",
                    "price": "NA",
                    "img": "NA",
                    "Company": "JD",
                    "link": "NA"
                })
    else:
        count = t.count(
            '//ul[@id="productListMain"]//li[@class="productListItem "]')
        t.wait(3)
        if count != 0:

            for i in range(1, min(count, 4)):
                price = t.read(f'(//span[@class="pri"])[{i}]')
                name = t.read(f'(//span[@class="itemTitle"])[{i}]')
                img = t.read(
                    f'(//a[@class="itemImage"]/picture/img/@srcset)[{i}]')
                link = "https://www.jdsports.com.sg" + t.read(
                    f'(//span[@class = "itemTitle"])[{i}]/a/@href')
                details.append({
                    "email": email,
                    "name": name,
                    "price": price,
                    "img": img,
                    "Company": "JD",
                    "link": link
                })
        else:
            details.append({
                "email": email,
                "name": "NA",
                "price": "NA",
                "img": "NA",
                "Company": "JD",
                "link": "NA"
            })
    #t.close()
    if len(details) == 0:
        details.append({
            "email": email,
            "name": "NA",
            "price": "NA",
            "img": "NA",
            "Company": "JD",
            "link": "NA"
        })
#    print("JD BOT",details)
    return details
def get_shoe(shoe, gender, email):  # Farfetch variant; shadows the JD get_shoe above if both are kept in one module
    t.init(visual_automation=True)
    t.url("https://www.farfetch.com/sg/")
    details = []
    if gender == ' men':
        t.click('(//span[@class="tabs__span"])[.="Men"]')
        t.type('//input[@class="js-searchboxABTest force-ltr"]',
               shoe + " Shoes")
        t.click('//form[@class="ff-search"]/button')
        t.wait(3)
        count = t.count('(//li[@data-test="productCard"])')
        if count != 0:
            for i in range(1, min(count, 4)):
                name = t.read(
                    f'(//li[@data-test="productCard"])[{i}]//div[@data-test="information"]/p'
                )
                price = t.read(
                    f'(//li[@data-test="productCard"])[{i}]//div[@data-test="information"]/div'
                ).replace('$', '')

                if 'Off' in price:
                    price = price.split('Off')[1]

                img = t.read(
                    f'(//li[@data-test="productCard"])[{i}]//img/@src')
                link = "https://www.farfetch.com" + t.read(
                    f'(//li[@data-test="productCard"])[{i}]/a/@href')
                details.append({
                    "email": email,
                    "name": name,
                    "price": price,
                    "img": img,
                    "Company": "Farfetch",
                    "link": link
                })
#                print(f"name: {name}, price: {price} img_source = {img}")
        else:
            details.append({
                "email": email,
                "name": "NA",
                "price": "NA",
                "img": "NA",
                "Company": "Farfetch",
                "link": "NA"
            })

    elif gender == ' women':
        t.click('(//span[@class="tabs__span"])[.="Women"]')
        t.type('//input[@class="js-searchboxABTest force-ltr"]',
               shoe + " Shoes")
        t.click('//form[@class="ff-search"]/button')
        t.wait(3)
        count = t.count('(//li[@data-test="productCard"])')
        if count != 0:
            for i in range(1, min(count, 4)):
                name = t.read(
                    f'(//li[@data-test="productCard"])[{i}]//div[@data-test="information"]/p'
                )
                price = t.read(
                    f'(//li[@data-test="productCard"])[{i}]//div[@data-test="information"]/div'
                ).replace('$', '')
                if 'Off' in price:
                    price = price.split('Off')[1]

                img = t.read(
                    f'(//li[@data-test="productCard"])[{i}]//img/@src')
                link = "https://www.farfetch.com" + t.read(
                    f'(//li[@data-test="productCard"])[{i}]/a/@href')
                details.append({
                    "email": email,
                    "name": name,
                    "price": price,
                    "img": img,
                    "Company": "Farfetch",
                    "link": link
                })
#                print(f"name: {name}, price: {price} img_source = {img}")
        else:
            details.append({
                "email": email,
                "name": "NA",
                "price": "NA",
                "img": "NA",
                "Company": "Farfetch",
                "link": "NA"
            })
    else:
        t.type('//input[@class="js-searchboxABTest force-ltr"]',
               shoe + " Shoes")
        t.click('//form[@class="ff-search"]/button')
        t.wait(3)
        count = t.count('(//li[@data-test="productCard"])')
        if count != 0:
            for i in range(1, min(count, 4)):
                name = t.read(
                    f'(//li[@data-test="productCard"])[{i}]//div[@data-test="information"]/p'
                )
                price = t.read(
                    f'(//li[@data-test="productCard"])[{i}]//div[@data-test="information"]/div'
                ).replace('$', '')
                if 'Off' in price:
                    price = price.split('Off')[1]

                img = t.read(
                    f'(//li[@data-test="productCard"])[{i}]//img/@src')
                link = "https://www.farfetch.com" + t.read(
                    f'(//li[@data-test="productCard"])[{i}]/a/@href')
                details.append({
                    "email": email,
                    "name": name,
                    "price": price,
                    "img": img,
                    "Company": "Farfetch",
                    "link": link
                })
#                print(f"name: {name}, price: {price} img_source = {img}")
        else:
            details.append({
                "email": email,
                "name": "NA",
                "price": "NA",
                "img": "NA",
                "Company": "Farfetch",
                "link": "NA"
            })

    t.close()

    return details
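
# All six branches across the two get_shoe() variants repeat the same
# read-three-products loop with different XPaths. A sketch of factoring that out,
# keeping the original indexing behaviour and assuming t is the tagui module
# imported above; scrape_cards() and the selectors dict are hypothetical, not
# part of the original scripts.
def scrape_cards(selectors, email, company, base_url, limit=4):
    """Read up to limit-1 product cards using per-site XPath templates."""
    details = []
    count = t.count(selectors['card'])
    for i in range(1, min(count, limit)):
        details.append({
            "email": email,
            "name": t.read(selectors['name'].format(i)),
            "price": t.read(selectors['price'].format(i)),
            "img": t.read(selectors['img'].format(i)),
            "Company": company,
            "link": base_url + t.read(selectors['link'].format(i)),
        })
    if not details:  # fall back to an NA row, as the originals do
        details.append({"email": email, "name": "NA", "price": "NA",
                        "img": "NA", "Company": company, "link": "NA"})
    return details

# usage sketch with the JD selectors used above:
# jd = {
#     'card': '//ul[@id="productListMain"]//li[@class="productListItem "]',
#     'name': '(//span[@class="itemTitle"])[{}]',
#     'price': '(//span[@class="pri"])[{}]',
#     'img': '(//a[@class="itemImage"]/picture/img/@srcset)[{}]',
#     'link': '(//span[@class = "itemTitle"])[{}]/a/@href',
# }
# details = scrape_cards(jd, email, "JD", "https://www.jdsports.com.sg")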