import os

from selenium import webdriver

# project helper module referenced throughout (scraper_tools.*)
import scraper_tools


def code_pub_main(s3_bucket, s3_path, rs_table, base_loc, start_link):
    cwd = os.getcwd()
    chrome_options = webdriver.ChromeOptions()
    # set download folder
    # configure multiple file download and turn off prompt
    prefs = {
        'download.default_directory': base_loc,
        'profile.default_content_setting_values.automatic_downloads': 1,
        'download.prompt_for_download': False,
        'default_content_settings.automatic_downloads': 1,
        'profile.content_settings.exceptions.automatic_downloads': 1
    }
    chrome_options.add_experimental_option('prefs', prefs)
    keys_written = []
    city = start_link[0]
    links = start_link[1]
    print(city)
    for link in links:
        try:
            driver = webdriver.Chrome(f'{cwd}/chromedriver',
                                      options=chrome_options)
            print(link)
            driver.get(link)
            # find update date
            messy_date = get_update_date(driver)
            # find and click all necessary checkboxes
            driver = handle_checkboxes(driver, 0.4, 0.5)
            # save the document
            driver = save_doc(driver)
            update_date = scraper_tools.extract_date(messy_date[0])
            # wait for the download to finish, then move the file into a
            # dated folder for this city
            old_path = base_loc + city.replace(' ', '') + ".txt"
            scraper_tools.downloads_done(old_path, 36)
            path = scraper_tools.make_path(base_loc, city, update_date)
            new_path = path + city + ".txt"
            os.rename(old_path, new_path)
            # split document by lvl 2 sections
            lvl2_docs = split_lvl2_docs(new_path)
            # send each lvl 2 section to s3 as a separate doc
            for lvl2_header, lvl2_text in lvl2_docs.items():
                print(lvl2_header)
                key = scraper_tools.s3_file_writer(s3_bucket, s3_path,
                                                   base_loc, city, update_date,
                                                   lvl2_header, lvl2_text)
                if key and (key not in list(rs_table.s3_key)):
                    keys_written.append(key)
            driver.close()
            driver.quit()
        except Exception:
            return True, keys_written
    return False, keys_written
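# --- Hedged usage sketch (not part of the original example) ---
# The bucket, prefix, download folder, and link below are hypothetical
# placeholders. rs_table is assumed to be a pandas DataFrame with an
# `s3_key` column and start_link a (city, [urls]) pair, matching how the
# function reads them above.
if __name__ == '__main__':
    import pandas as pd

    rs_table = pd.DataFrame({'s3_key': []})
    start_link = ('Springfield', ['https://example.com/municipal-code'])
    errored, keys = code_pub_main('my-bucket', 'codes/', rs_table,
                                  '/tmp/downloads/', start_link)
    print(errored, keys)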
Example #2
def q_code_main(s3_bucket, s3_path, rs_table, base_loc, start_link):
    cwd = os.getcwd()
    chrome_options = webdriver.ChromeOptions()
    # set download folder
    # configure multiple file download and turn off prompt
    prefs = {
        'download.default_directory': base_loc,
        'profile.default_content_setting_values.automatic_downloads': 1,
        'download.prompt_for_download': False
    }
    chrome_options.add_experimental_option('prefs', prefs)
    missing_sections = 0
    keys_written = []
    my_xpath = "//div[@class='navChildren']//a"
    showall_xpath = "//a[@class='showAll']"
    high_title_xpath = "//div[@class='currentTopic']"
    low_title_xpath = "//div[@class='navTocHeading']"
    content_xpath = "//div[@class='content-fragment']"
    up_xpath = "//a[@accesskey='u']"
    city = start_link[0]
    links = start_link[1]
    print(city)
    for link in links:
        my_doc = [city]
        try:
            # level 1
            driver = webdriver.Chrome(f'{cwd}/chromedriver', options=chrome_options)
            print(link)
            driver.get(link)
            # get last updated date
            driver.switch_to.frame('LEFT')
            date_xpath = "//body[@class='preface']//p"
            scraper_tools.waiting_for_presence_of(driver, date_xpath, 3, 0.1)
            left_text = driver.find_elements_by_xpath(date_xpath)
            for p in left_text:
                if 'current' in p.text.lower():
                    update_date_messy = p.text
                    my_doc.append(update_date_messy)
            driver.switch_to.default_content()
            driver.switch_to.frame('RIGHT')
            scraper_tools.waiting_for_presence_of(driver, my_xpath, 3, 0.1)
            # level 2
            if len(driver.find_elements_by_xpath(my_xpath)) <= 4:
                for h_sec_num in range(len(driver.find_elements_by_xpath(my_xpath))):
                    h_sections = driver.find_elements_by_xpath(my_xpath)
                    level2_title = h_sections[h_sec_num].text
                    if 'code' in level2_title.lower():
                        scraper_tools.click_n_wait(driver, my_xpath, h_sections, h_sec_num, 3, 0.1)
            for h_sec_num in range(len(driver.find_elements_by_xpath(my_xpath))):
                h_sections = driver.find_elements_by_xpath(my_xpath)
                level2_title = h_sections[h_sec_num].text
                print(level2_title)
                my_doc.append(level2_title)
                if ('reserved' in level2_title.lower()) or (level2_title.lower() == 'note'):
                    continue
                scraper_tools.click_n_wait(driver, my_xpath, h_sections, h_sec_num, 3, 0.1)
                # level 3
                for l_sec_num in range(len(driver.find_elements_by_xpath(my_xpath))):
                    try:
                        l_sections = driver.find_elements_by_xpath(my_xpath)
                        my_doc.append(l_sections[l_sec_num].text)
                        # skip sections that are reserved or notes
                        if ('reserved' in l_sections[l_sec_num].text.lower()) or (l_sections[l_sec_num].text.lower() == 'note'):
                            continue
                        l_sections[l_sec_num].click()
                        # if a showAll link exists, expand the section;
                        # otherwise fall back to the branches below
                        if len(driver.find_elements_by_xpath(showall_xpath)) != 0:
                            scraper_tools.waiting_for_presence_of(driver, showall_xpath, 3, 0.1)
                            scraper_tools.find_click_n_wait(driver, showall_xpath, high_title_xpath, 0, 3, 0.1)
                            # get text
                            for content, l_title in zip(driver.find_elements_by_xpath(content_xpath), driver.find_elements_by_xpath(low_title_xpath)):
                                my_doc.append(l_title.text)
                                my_doc.append(content.text)
                            # go to previous page
                            scraper_tools.find_click_n_wait(driver, up_xpath, my_xpath, 0, 3, 0.1)
                        elif len(driver.find_elements_by_xpath(content_xpath)) != 0:
                            # get text
                            for content in driver.find_elements_by_xpath(content_xpath):
                                my_doc.append(content.text)
                            # go to previous page
                            scraper_tools.find_click_n_wait(driver, up_xpath, my_xpath, 0, 3, 0.1)
                        else:
                            driver.get(link)
                            driver.switch_to.frame('RIGHT')
                            scraper_tools.find_click_n_wait(driver, my_xpath, my_xpath, h_sec_num, 3, 0.1)
                    except Exception:
                        my_doc.append("-_-_-missing-_-_-")
                        missing_sections += 1
                scraper_tools.find_click_n_wait(driver, up_xpath, my_xpath, 0, 3, 0.1)
                update_date = scraper_tools.extract_date(update_date_messy)
                key = scraper_tools.s3_file_writer(s3_bucket, s3_path, base_loc, city, update_date, level2_title, '\n'.join(my_doc))
                if key:
                    keys_written.append(key)
                my_doc = [city]
        except Exception:
            return True, keys_written
    driver.close()
    driver.quit()
    print("-"*5)
    if missing_sections > 0:
        return True, keys_written
    return False, keys_written
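# --- Hedged usage sketch (not part of the original example) ---
# All names below are hypothetical placeholders. The first return value is
# an error flag: True when a section was missing or an exception was raised,
# so a caller might log the partial key list and retry the link set.
if __name__ == '__main__':
    import pandas as pd

    rs_table = pd.DataFrame({'s3_key': []})
    start_link = ('Shelbyville', ['https://example.com/city-code'])
    had_errors, keys = q_code_main('my-bucket', 'codes/', rs_table,
                                   '/tmp/downloads/', start_link)
    if had_errors:
        print(f'wrote {len(keys)} keys before a failure; consider re-running')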