Пример #1
0
def doSearch(p_search, p_location, p_pathA, p_pathB, p_minDelay, p_maxDelay, p_distance):
    """
    Run a search on the site at ``g_url`` and scrape every company found in the results.

    The result pages are walked first to collect all company links, then each link is
    handed to ``doOneCompany``. Scraping stops at the first company for which
    ``doOneCompany`` returns a falsy value.

    Both output files and the browser are released even if an exception is raised
    while searching or scraping.

    :param p_search: search keywords, URL-quoted into the ``what`` query parameter
    :param p_location: location, URL-quoted into the ``where`` query parameter
    :param p_pathA: path of the main output CSV file (opened in ``'w'`` mode)
    :param p_pathB: path of the secondary output CSV file (opened in ``'w'`` mode)
    :param p_minDelay: first argument given to ``CommonFunctions.randomWait``
        (presumably the minimum delay -- confirm against that helper)
    :param p_maxDelay: second argument given to ``CommonFunctions.randomWait``
        (presumably the maximum delay -- confirm against that helper)
    :param p_distance: search distance; put in the URL only when strictly positive
    :return: number of companies for which ``doOneCompany`` returned a truthy value
    """
    # http://www.118218.fr/recherche?category_id=&geo_id=&distance=&category=&what=plombier&where=75013
    # the distance field stays empty unless a strictly positive distance was requested
    l_distance = p_distance if p_distance > 0 else ''
    l_urlSearch = '{0}recherche?category_id=&geo_id=&distance={3}&category=&what={1}&where={2}'.format(
        g_url,
        urllib.parse.quote(p_search, safe=''),
        urllib.parse.quote(p_location, safe=''),
        l_distance
    )

    # open both output csv files (main and secondary); the with block guarantees they
    # are closed whatever happens below
    with open(p_pathA, 'w') as l_fOutMain, open(p_pathB, 'w') as l_fOutSecondary:
        l_fOutMain.write('ID;NAME;ADDRESS;CP;CITY;CREATION;SIRET;TYPE;COUNT;OWNER;' +
                         'TEL1;TEL2;TEL3;TEL4;MAIL;WEB1;WEB2;WEB3;WEB4;HOURS;BUSINESS;ADDITIONAL\n')
        l_fOutSecondary.write('ID;TYPE;RAW;CLEAN;FLAG\n')

        # Create a new driver instance
        l_driver = CommonFunctions.getDriver()

        l_count = 0
        try:
            # get all links in the result set
            l_linksList = _collectResultLinks(l_driver, l_urlSearch, p_minDelay, p_maxDelay)

            for l_url in l_linksList:
                # Scrape one company and stops in case of failure
                if not doOneCompany(l_driver, l_url, l_fOutMain, l_fOutSecondary,
                                    p_minDelay, p_maxDelay, l_count):
                    break

                l_count += 1
                CommonFunctions.randomWait(p_minDelay, p_maxDelay)
        finally:
            # always shut the browser down, even when scraping raised
            l_driver.quit()

        print('Number of items retrieved:', l_count)

    return l_count


def _collectResultLinks(p_driver, p_urlSearch, p_minDelay, p_maxDelay):
    """
    Walk the result pages starting at ``p_urlSearch`` and return the company link URLs.

    :param p_driver: driver used to load and navigate the result pages
    :param p_urlSearch: URL of the first result page
    :param p_minDelay: first argument given to ``CommonFunctions.randomWait``
    :param p_maxDelay: second argument given to ``CommonFunctions.randomWait``
    :return: list of the ``href`` values of all ``//h2/a`` links seen on the visited pages
    """
    l_urlSearch = p_urlSearch

    # go to the base Url
    p_driver.get(l_urlSearch)

    l_finished = False
    l_linksList = []
    l_currentPage = 1

    # back-off delay (seconds) used when the abuse message shows up
    l_wait = 60
    while not l_finished:
        print('Result page:', l_currentPage)

        # Wait for the footer to appear
        if not waitFoFooter(p_driver):
            l_finished = True
            continue

        try:
            l_messageDisplay = p_driver.find_element_by_xpath(
                '//article/section[@class="staticContent ieWrapperFix"]')
            l_message = l_messageDisplay.text
            if re.match('Nos systèmes ont détecté un trafic important', l_message):
                print('Abuse message:', l_message)

                # retry the current page with a growing delay (60 s steps, up to 300 s),
                # but only while still within the first 20 pages
                if l_currentPage <= 20 and l_wait <= 300:
                    print('Waiting for {0} seconds ...'.format(l_wait))
                    time.sleep(l_wait)
                    l_wait += 60

                    p_driver.get(l_urlSearch)
                    continue

                l_finished = True
                continue

        except EX.NoSuchElementException:
            print('Ok apparently ...')

        # no abuse message on this page --> reset the back-off delay
        l_wait = 60

        try:
            l_resultCountLocation = p_driver.find_element_by_xpath('//p[@class="resultCount"]')
            l_resultCount = l_resultCountLocation.text
            print('l_resultCount:', l_resultCount)
        except EX.NoSuchElementException:
            print('No Results')
            l_finished = True
            continue

        # store every company link of the current page
        for l_link in p_driver.find_elements_by_xpath('//h2/a'):
            l_linkUrl = l_link.get_attribute('href')
            l_linksList.append(l_linkUrl)
            print('l_linkUrl:', l_linkUrl)

        try:
            l_found = False
            for l_link in p_driver.find_elements_by_xpath('//a'):
                # find next page link page
                if l_link.get_attribute('data-page') == str(l_currentPage + 1):

                    l_found = True
                    l_currentPage += 1
                    # remembered so that the page can be reloaded after an abuse message
                    l_urlSearch = l_link.get_attribute('href')
                    print('Link to next page:', l_urlSearch)

                    # scroll to it, to make it visible, and then click it
                    l_actions = ActionChains(p_driver)
                    l_actions.move_to_element(l_link)
                    l_actions.click()
                    l_actions.perform()

                    CommonFunctions.randomWait(p_minDelay, p_maxDelay)
                    break

            if not l_found:
                # if the link was not found --> Finished
                print('No More Results')
                l_finished = True

        except EX.NoSuchElementException:
            print('No More Results')
            l_finished = True
            continue

    return l_linksList
Пример #2
0
def doSearch(p_search, p_location, p_pathA, p_pathB, p_minDelay, p_maxDelay):
    """
    Submit a search through the form of the site at ``g_url`` and scrape every result.

    Each company of each result page is opened in a new browser tab and handed to
    ``doOneCompany``; the "next" pagination button is followed until it disappears.

    Both output files and the browser are released on every exit path, including the
    early error returns and unexpected exceptions.

    :param p_search: keywords typed into the ``pj_search_quoiqui`` input
    :param p_location: location typed into the ``pj_search_ou`` input
    :param p_pathA: path of the main output CSV file (opened in ``'w'`` mode)
    :param p_pathB: path of the secondary output CSV file (opened in ``'w'`` mode)
    :param p_minDelay: first argument given to ``CommonFunctions.randomWait``
        (presumably the minimum delay -- confirm against that helper)
    :param p_maxDelay: second argument given to ``CommonFunctions.randomWait``
        (presumably the maximum delay -- confirm against that helper)
    :return: number of companies for which ``doOneCompany`` returned a truthy value,
        or 0 when an expected element is missing or a wait times out
    """
    # open both output csv files (main and secondary); the with block guarantees they
    # are closed whatever the exit path
    with open(p_pathA, 'w') as l_fOutMain, open(p_pathB, 'w') as l_fOutSecondary:
        l_fOutMain.write('ID;NAME;ADDRESS;CP;CITY;CREATION;SIRET;TYPE;COUNT;OWNER;' +
                         'TEL1;TEL2;TEL3;TEL4;MAIL;WEB1;WEB2;WEB3;WEB4;HOURS;BUSINESS;ADDITIONAL\n')
        l_fOutSecondary.write('ID;TYPE;RAW;CLEAN;FLAG\n')

        # Create a new driver instance
        l_driver = CommonFunctions.getDriver()
        try:
            return _searchAndScrapePages(
                l_driver, p_search, p_location, l_fOutMain, l_fOutSecondary, p_minDelay, p_maxDelay)
        finally:
            # always shut the browser down, including on the early error returns
            l_driver.quit()


def _searchAndScrapePages(p_driver, p_search, p_location, p_fOutMain, p_fOutSecondary,
                          p_minDelay, p_maxDelay):
    """
    Fill in the search form, then scrape all companies of all result pages.

    Resource handling (files, driver) is the caller's responsibility.

    :param p_driver: driver used for the whole session
    :param p_search: keywords typed into the ``pj_search_quoiqui`` input
    :param p_location: location typed into the ``pj_search_ou`` input
    :param p_fOutMain: open main output file, passed through to ``doOneCompany``
    :param p_fOutSecondary: open secondary output file, passed through to ``doOneCompany``
    :param p_minDelay: first argument given to ``CommonFunctions.randomWait``
    :param p_maxDelay: second argument given to ``CommonFunctions.randomWait``
    :return: number of companies scraped, or 0 on a missing element / timeout

    NOTE(review): every error path returns 0 even when some companies were already
    written to the output files; kept as is because callers may rely on 0 meaning failure.
    """
    # go to the base Url
    p_driver.get(g_url)

    try:
        # locate the keyword search input text box and enter the search string
        l_quoiQui = WebDriverWait(p_driver, 10).until(EC.presence_of_element_located(
                    (By.XPATH, '//input[@id="pj_search_quoiqui"]')))
        print('l_quoiQui placeholder:', l_quoiQui.get_attribute('placeholder'))
        l_quoiQui.send_keys(p_search)

        # locate the location input text box and enter the location string
        l_ou = p_driver.find_element_by_id('pj_search_ou')
        print('l_ou placeholder:', l_ou.get_attribute('placeholder'))
        l_ou.send_keys(p_location)

        # submit the form
        p_driver.find_element_by_xpath('//button[@class="button primary icon large-button"]').click()
    except EX.NoSuchElementException:
        print('[01] Something is badly wrong (Element not found) ...')
        return 0
    except EX.TimeoutException:
        print('[02] Something is badly wrong (Timeout) ...')
        return 0

    l_finished = False
    l_count = 0
    while not l_finished:
        try:
            # wait until either at least one company name or the "no response" block is present
            WebDriverWait(p_driver, 10).until(
                lambda p_drv:
                    p_drv.find_elements(By.XPATH, '//h2[@class="company-name"]')
                    or p_drv.find_elements(By.XPATH, '//div[@class="no-response"]'))
        except EX.TimeoutException:
            print('[03] Something is badly wrong (Timeout) ...')
            return 0

        # a popup was dismissed --> start over with the wait on the same page
        if killPopup(p_driver):
            continue

        try:
            p_driver.find_element_by_xpath('//div[@class="no-response"]')
            print('No results')

            l_finished = True
            continue
        except EX.NoSuchElementException:
            print('There should be results')

        try:
            # reformulation
            l_reformulation = p_driver.find_element_by_xpath(
                '//span[@class="denombrement"]/strong[@id="SEL-nbresultat"]')

            l_resultCount = l_reformulation.text
            print('l_resultCount:', l_resultCount)

        except EX.NoSuchElementException:
            print('No reformulation ?! ...')

        # collect the ids first: the elements themselves are looked up again by id
        # for each company in the loop below
        l_articleList = []
        try:
            for l_company in p_driver.find_elements_by_xpath('//h2[@class="company-name"]/../../../..'):
                l_articleId = l_company.get_attribute('id')
                print('l_articleId:', l_articleId)
                l_articleList.append(l_articleId)

        except EX.NoSuchElementException:
            print('[04] Something is badly wrong (Element not found) ...')
            return 0

        try:
            for l_articleId in l_articleList:
                if killPopup(p_driver):
                    print('Popup Killed, waiting for 10 s.')
                    time.sleep(10)

                print('+ l_articleId:', l_articleId)
                l_company = p_driver.find_element_by_xpath(
                    '//article[@id="{0}"]//h2[@class="company-name"]/a[2]'.format(l_articleId))

                l_name = l_company.text
                print('Fetching:', l_name)

                # bring the link into view, then scroll back up by 300 px
                p_driver.execute_script("return arguments[0].scrollIntoView();", l_company)
                p_driver.execute_script("window.scrollBy(0, -300);")

                # Save the window opener (current window, do not mistaken with tab... not the same)
                l_mainWindow = p_driver.current_window_handle

                # open the company link through the context menu
                # (right click, arrow down, enter)
                l_actions = ActionChains(p_driver)
                l_actions.move_to_element(l_company)
                l_actions.context_click()
                l_actions.send_keys(Keys.ARROW_DOWN)
                l_actions.send_keys(Keys.ENTER)
                l_actions.perform()

                # Switch tab to the new tab, which we will assume is the next one on the right
                p_driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.TAB)

                # Put focus on current window which will, in fact, put focus on the current visible tab
                p_driver.switch_to_window(l_mainWindow)

                if doOneCompany(p_driver, p_fOutMain, p_fOutSecondary, l_count):
                    l_count += 1

                CommonFunctions.randomWait(p_minDelay, p_maxDelay)

                # Close current tab
                p_driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + 'w')

                # Put focus on current window which will be the window opener
                p_driver.switch_to_window(l_mainWindow)

        except EX.NoSuchElementException:
            print('[05] Something is badly wrong (Element not found) ...')
            return 0

        # locate the next button and click it
        try:
            l_next = p_driver.find_element_by_id('pagination-next')

            # scroll to it, to make it visible, and then click it
            l_actions = ActionChains(p_driver)
            l_actions.move_to_element(l_next)
            l_actions.click()
            l_actions.perform()
        except EX.NoSuchElementException:
            print('No more results')
            l_finished = True

    print('Number of items retrieved:', l_count)
    return l_count