def compare_page(old_url, new_url, browser=None, progress_var=None, step=1.0):
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return
    # deal with urls that exceed 50 characters
    new_list = new_url.split("/")
    if new_list[-1].find("televox.west.com") == -1 and len(new_list[-1]) > 50:
        new_list[-1] = new_list[-1][:50]
        new_url = "/".join(new_list)
    old_soup = get_soup(old_url)
    new_soup = get_soup(new_url, browser=browser)
    result = open("result\\site_result.txt", 'a')
    if old_soup is None:
        record_error(old_url, "soup")
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    if new_soup is None:
        record_error(new_url, "soup")
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    meta_pass = compare_meta_soup(old_soup, new_soup, old_url, new_url)
    if meta_pass == -1:
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    form_pass = compare_form_soup(old_soup, new_soup, old_url, new_url)
    content_pass = compare_content_soup(old_soup, new_soup, old_url, new_url)
    image_pass = compare_image_soup(old_soup, new_soup, old_url, new_url)
    link_pass = compare_link_soup(old_soup, new_soup, old_url, new_url,
                                  browser=browser)
    page_pass = meta_pass and form_pass and content_pass and image_pass and link_pass
    if page_pass:
        print(new_url + " PASSED!")
        result.write(new_url + " PASSED!\n")
        result.close()
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return True
    else:
        print(new_url + " FAILED! (see detail files for more information)")
        result.write(new_url + " FAILED! (see detail files for more information)\n")
        result.close()
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
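
# Illustrative sketch (not part of the original tool): driving compare_page for a
# single migrated page.  The URLs are placeholders, and the optional progress_var
# is assumed to be a tkinter-style variable exposing get()/set(), as used above.
def _example_compare_single_page():
    old = "http://www.example-practice.com/about-us"            # placeholder old URL
    new = "http://example-practice.televox.west.com/about-us"   # placeholder new URL
    passed = compare_page(old, new)  # browser=None: new page fetched without Selenium
    print("about-us: " + ("PASS" if passed else "FAIL"))
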
def get_subpages(old_url):
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return []
    old_soup = get_soup(old_url)
    nav_menu = old_soup.find('ul', class_="primary-navigation")
    try:
        drop_downs = nav_menu.find_all('ul')
    except AttributeError:
        return []
    parsed_subpages = []
    for drop_down in drop_downs:
        subpages = drop_down.find_all('a')
        if not subpages:
            continue
        else:
            url_list = subpages[0]['href'].split('/')
            root_url = '/' + url_list[1]
            parsed_subpage = []
            for subpage in subpages:
                name = subpage.get_text()
                rel_url = subpage['href'].split('/')[-1]
                parsed_subpage.append((name, rel_url))
            parsed_subpages.append((root_url, parsed_subpage))
    return parsed_subpages
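
# Illustrative sketch (placeholder URL): get_subpages returns a list of
# (root_url, [(name, relative_url), ...]) tuples scraped from the old site's
# primary navigation, which can be printed to spot-check the menu structure.
def _example_print_navigation():
    for root_url, subpages in get_subpages("http://www.example-practice.com"):
        print(root_url)
        for name, rel_url in subpages:
            print("    " + name + " -> " + rel_url)
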
def check_homepage_link(old_soup, new_soup, old_url, new_url, browser=None):
    detail = open("result\\homepage_detail.txt", 'a')
    old_hostname = urlparse(old_url).hostname
    new_hostname = urlparse(new_url).hostname
    page_pass = True
    printable = set(string.printable)
    new_content = new_soup.find('div', class_="ptl_page")
    if old_url.endswith("/"):
        old_url = old_url[:-1]
    if new_url.endswith("/"):
        new_url = new_url[:-1]
    if not old_hostname:
        old_hostname = old_url
    if not new_hostname:
        new_hostname = new_url
    if not new_content:
        record_error(new_url, "new homepage container")
        return False
    # remove banner and navigation menu from soup
    if new_content:
        for (name, kwargs) in settings["HOMEPAGE_LINK_IGNORE"]:
            for s in new_content.find_all(name, **kwargs):
                s.extract()
    new_tags = new_content.find_all('a', href=re.compile("^(?!.*(#aftermap|#)).*$"))
    # check for new links that direct to old site
    host_link = old_url.replace(urlparse(old_url).path, "")
    domain = get_domain(old_url)
    for tag in new_tags:
        href = tag['href']
        href_hostname = urlparse(href).hostname
        if href_hostname is None:
            href_hostname = ""
        if href.startswith("/"):
            continue
        if (href.startswith(host_link) and host_link != "") \
                or (href_hostname.find(domain + '.') != -1
                    and not href.startswith("mailto")
                    and href.find("televox.west.com") == -1) \
                or href.find("iapps") != -1:
            page_pass = False
            entry_print("***********************************************")
            entry_print("HOMEPAGE LINKS THAT GO BACK TO OLD SITE!")
            entry_print("New URL: " + new_url)
            detail.write("HOMEPAGE LINKS THAT GO BACK TO OLD SITE!\n")
            detail.write("New URL: " + new_url + "\n")
            entry_print("Bad tag: " + str(tag))
            entry_print("***********************************************")
            detail.write("Bad tag: " + str(tag) + "\n")
            detail.write("-----------------------------------------------\n")
        if href.find("televox.west.com") != -1:
            page_pass = False
            entry_print("***********************************************")
            entry_print("NON-FRIENDLY URL FOUND!")
            entry_print("New URL: " + new_url)
            detail.write("NON-FRIENDLY URL FOUND!\n")
            detail.write("New URL: " + new_url + "\n")
            entry_print("Bad tag: " + str(tag))
            entry_print("***********************************************")
            detail.write("Bad tag: " + str(tag) + "\n")
            detail.write("-----------------------------------------------\n")
    # check invalid links in new site
    new_invalid_links = []
    for tag in new_tags:
        url = tag.get('href')
        if url is None:
            continue
        if url.startswith("https://"):
            continue
        if url.startswith("tel:") or url.startswith("mailto:") \
                or url.find("#") != -1 or url.startswith("/common"):
            continue
        if url.startswith("/"):
            url = "http://" + new_hostname + url
        if url.find("televox.west.com") != -1:
            new_target = get_soup(url, browser)
        else:
            new_target = get_soup(url)
        new_target_title = get_meta_soup(new_target, url)['title']
        if new_target_title.find("404") != -1 or new_target_title == "Page Not Found" \
                or new_target_title == "none" \
                or new_target_title == "The resource cannot be found.":
            new_invalid_links.append((str(tag), new_target_title))
    if new_invalid_links:
        entry_print("***********************************************")
        entry_print("INVALID LINK FOUND IN HOMEPAGE!")
        entry_print("New URL: " + new_url)
        detail.write("-----------------------------------------------\n")
        detail.write("INVALID LINK FOUND IN HOMEPAGE!\n")
        detail.write("New URL: " + new_url + "\n")
        ind = 0
        for tag, target in new_invalid_links:
            ind += 1
            entry_print("Bad tag" + str(ind) + ": " + tag)
            entry_print("Target title: " + target)
            detail.write("Bad tag" + str(ind) + ": " + tag + "\n")
            detail.write("Target title: " + target + "\n")
        entry_print("***********************************************")
    # check published links for homepage
    old_publish = old_soup.find('nav', id="utility-navigation")
    new_publish = new_soup.find('nav', id="utility-navigation")
    if old_publish:
        old_published_links = old_publish.find_all(
            'a', href=re.compile("^((?!#).)*$"))
    else:
        old_published_links = []
    if new_publish:
        new_published_links = new_publish.find_all(
            'a', href=re.compile("^((?!#).)*$"))
    else:
        new_published_links = []
    if len(old_published_links) != len(new_published_links):
        entry_print("***********************************************")
        entry_print("NUMBER OF PUBLISHED LINKS DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_published_links)))
        entry_print("Number of new links: " + str(len(new_published_links)))
        entry_print("***********************************************")
        detail.write("NUMBER OF PUBLISHED LINKS DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_published_links)) + "\n")
        detail.write("Number of new links: " + str(len(new_published_links)) + "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    else:
        publish_pass = True
        # check the href and name for each published link
        for ind in range(len(new_published_links)):
            old_link = old_published_links[ind]['href']
            new_link = new_published_links[ind]['href']
            old_link_dup = old_link.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            new_link_dup = new_link.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            old_name = old_published_links[ind].get_text().replace(u"\xa0", " ")
            new_name = new_published_links[ind].get_text().replace(u"\xa0", " ")
            old_name = "".join([i for i in old_name if i in printable]).strip().upper()
            new_name = "".join([i for i in new_name if i in printable]).strip().upper()
            old_name_dup = old_name.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            new_name_dup = new_name.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            if old_link_dup != new_link_dup:
                if old_link.startswith("tel:") or old_link.startswith(
                        "mailto:") or unicode(old_link[0]).isnumeric():
                    continue
                if old_link.startswith("/"):
                    old_link = old_hostname + old_link
                if new_link.startswith("/"):
                    new_link = new_hostname + new_link
                old_target = get_soup(old_link)
                new_target = get_soup(new_link, browser=browser)
                old_target_title = get_meta_soup(old_target, old_link)['title']
                new_target_title = get_meta_soup(new_target, new_link)['title']
                if new_target_title.endswith("..."):
                    new_target_title = new_target_title[:-3]
                    old_target_title = old_target_title[:len(new_target_title)]
                if old_target_title != new_target_title:
                    if publish_pass:
                        entry_print(
                            "***********************************************")
                        entry_print("PUBLISHED LINKS DO NOT MATCH!")
                        entry_print("Old URL: " + old_url)
                        entry_print("New URL: " + new_url)
                        detail.write("PUBLISHED LINKS DO NOT MATCH!\n")
                        detail.write("Old URL: " + old_url + "\n")
                        detail.write("New URL: " + new_url + "\n")
                        publish_pass = False
                        page_pass = False
                    entry_print("Old target: " + old_target_title)
                    entry_print("New target: " + new_target_title)
                    entry_print("Old link: " + old_link)
                    entry_print("New link: " + new_link)
                    detail.write("Old target: " + old_target_title + "\n")
                    detail.write("New target: " + new_target_title + "\n")
                    detail.write("Old link: " + old_link + "\n")
                    detail.write("New link: " + new_link + "\n")
            if old_name_dup != new_name_dup:
                if publish_pass:
                    entry_print(
                        "***********************************************")
                    entry_print("PUBLISHED LINK NAMES DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("PUBLISHED LINK NAMES DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    publish_pass = False
                    page_pass = False
                entry_print("Old name: " + old_name)
                entry_print("New name: " + new_name)
                detail.write("Old name: " + old_name + "\n")
                detail.write("New name: " + new_name + "\n")
        if not publish_pass:
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
    # check social media links for homepage
    old_social = old_soup.find('nav', class_="social-navigation")
    new_social = new_soup.find('nav', class_="social-navigation")
    if old_social:
        old_social_links = old_social.find_all('a')
    else:
        old_social_links = []
    if new_social:
        new_social_links = new_social.find_all('a')
    else:
        new_social_links = []
    if len(old_social_links) != len(new_social_links):
        entry_print("***********************************************")
        entry_print("NUMBER OF SOCIAL LINKS DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_social_links)))
        entry_print("Number of new links: " + str(len(new_social_links)))
        entry_print("***********************************************")
        detail.write("NUMBER OF SOCIAL LINKS DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_social_links)) + "\n")
        detail.write("Number of new links: " + str(len(new_social_links)) + "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    else:
        social_pass = True
        # check the href and name for each social link
        for ind in range(len(new_social_links)):
            old_link = old_social_links[ind]['href']
            new_link = new_social_links[ind]['href']
            old_link_reversed = old_social_links[len(old_social_links) - ind - 1]['href']
            if old_link != new_link and old_link_reversed != new_link:
                if new_link.startswith("/"):
                    new_link = new_hostname + new_link
                if old_link.startswith("/"):
                    old_link = old_hostname + old_link
                old_target = get_soup(old_link)
                new_target = get_soup(new_link)
                old_target_title = replace_special(
                    get_meta_soup(old_target, old_link)['title'])
                new_target_title = replace_special(
                    get_meta_soup(new_target, new_link)['title'])
                if new_target_title.endswith("..."):
                    new_target_title = new_target_title[:-3]
                    old_target_title = old_target_title[:len(new_target_title)]
                if old_target_title != new_target_title:
                    if social_pass:
                        entry_print(
                            "***********************************************")
                        entry_print("SOCIAL LINKS DO NOT MATCH!")
                        entry_print("Old URL: " + old_url)
                        entry_print("New URL: " + new_url)
                        detail.write("SOCIAL LINKS DO NOT MATCH!\n")
                        detail.write("Old URL: " + old_url + "\n")
                        detail.write("New URL: " + new_url + "\n")
                        social_pass = False
                    entry_print("Old target: " + old_target_title)
                    entry_print("New target: " + new_target_title)
                    entry_print("Old link: " + old_link)
                    entry_print("New link: " + new_link)
                    detail.write("Old target: " + old_target_title + "\n")
                    detail.write("New target: " + new_target_title + "\n")
                    detail.write("Old link: " + old_link + "\n")
                    detail.write("New link: " + new_link + "\n")
        if not social_pass:
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
    detail.close()
    return page_pass
def compare_homepage(old_url, new_url, browser=None, progress_var=None, step=1.0):
    result = open("result\\site_result.txt", 'a')
    old_soup = get_soup(old_url)
    if not browser:
        new_soup = get_soup(new_url)
    else:
        try:
            new_soup = get_soup(new_url, browser)
        except CredentialError:
            if progress_var:
                progress_var.set(progress_var.get() + step)
            return -1
    if old_soup is None:
        record_error(old_url, "soup")
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    if new_soup is None:
        record_error(new_url, "soup")
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    meta_pass = compare_meta_soup(old_soup, new_soup, old_url, new_url)
    if meta_pass == -1:
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    address_pass = compare_address_soup(old_soup, new_soup, old_url, new_url)
    content_pass = compare_homepage_content(old_soup, new_soup, old_url, new_url)
    image_pass = check_homepage_image(new_soup, new_url)
    link_pass = check_homepage_link(old_soup, new_soup, old_url, new_url,
                                    browser=browser)
    page_pass = meta_pass and address_pass and content_pass and image_pass and link_pass
    if page_pass:
        print(new_url + " HOMEPAGE PASSED!")
        result.write(new_url + " HOMEPAGE PASSED!\n")
        result.close()
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return True
    else:
        result.write(
            new_url + " HOMEPAGE FAILED! (see detail files for more information)\n")
        result.close()
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
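
# Illustrative sketch (placeholder URLs): compare_homepage returns True/False for
# pass/fail and -1 when the logged-in fetch raises CredentialError, so the -1 case
# should be checked separately from an ordinary failure.
def _example_check_homepage():
    result = compare_homepage("http://www.example-practice.com",
                              "http://example-practice.televox.west.com")
    if result == -1:
        print("credential problem - rerun in login mode")
    else:
        print("homepage: " + ("PASS" if result else "FAIL"))
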
def set_meta(old_url, new_url, browser):
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return
    wait = WebDriverWait(browser, 20)
    old_soup = get_soup(old_url)
    old_meta = get_meta_soup(old_soup, old_url)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return
    if new_url.endswith('/'):
        new_url = new_url[:-1]
    # truncate url if name exceeds 50 characters
    new_path = urlparse(new_url).path
    new_path_list = new_path.split('/')
    if len(new_path_list[-1]) > 50:
        new_path_list[-1] = new_path_list[-1][:50]
        new_path_dup = "/".join(new_path_list)
        new_url_dup = new_url.replace(new_path, new_path_dup)
        browser.get(new_url_dup)
    else:
        browser.get(new_url)
    if browser.title == "Login":
        login(browser, wait)
    new_soup = BeautifulSoup(browser.page_source, "html.parser")
    login_status = new_soup.find('a', id="ctl00_lnkGateway").get_text()
    if login_status == "Login":
        login_button = browser.find_element_by_id("ctl00_lnkGateway")
        login_button.click()
        wait.until(
            EC.visibility_of_element_located(
                (By.ID, "ctl00_ContentPlaceHolder1_txtUsername")))
        login(browser, wait)
    page_options = browser.find_element_by_xpath(
        '//li[@class="optionPageOptions"]')
    page_options.click()
    metadata_option = browser.find_element_by_xpath(
        '//span[@class="AB_icn AB_icn-metadata"]').find_element_by_xpath('..')
    url = metadata_option.get_attribute('href')
    rel_url = re.search("/cms/.*Metadata", url).group(0)
    new_hostname = urlparse(new_url).hostname
    target_url = "http://" + new_hostname + rel_url
    browser.get(target_url)
    enable_custom_checkbox = browser.find_elements_by_xpath(
        '//input[@type="checkbox"]')[0]
    if not enable_custom_checkbox.is_selected():
        enable_custom_checkbox.click()
    # migrate title
    title = old_meta["title"]
    title_entry = browser.find_elements_by_xpath('//input[@type="text"]')[6]
    title_entry.clear()
    try:
        title_entry.send_keys(title)
    except UnicodeDecodeError:
        migration_print("Unable to migrate title for " + new_url)
        migration_print("Title: " + old_meta["title"])
        migration_print("Description: " + old_meta["description"])
        migration_print("Keywords: " + old_meta["keywords"])
        migration_print(
            "-----------------------------------------------------------")
        ask_continue()
        return
    # migrate description
    description = old_meta["description"]
    if description != "none" and not description.startswith("Learn more about"):
        description_entry = browser.find_elements_by_xpath(
            '//input[@type="text"]')[13]
        description_entry.clear()
        try:
            description_entry.send_keys(description)
        except UnicodeDecodeError:
            migration_print("Unable to migrate description for " + new_url)
            migration_print("Title: " + old_meta["title"])
            migration_print("Description: " + old_meta["description"])
            migration_print("Keywords: " + old_meta["keywords"])
            migration_print(
                "-----------------------------------------------------------")
            ask_continue()
            return
    # migrate keywords
    keywords = old_meta["keywords"]
    if keywords != "none":
        keywords_entry = browser.find_elements_by_xpath(
            '//input[@type="text"]')[14]
        keywords_entry.clear()
        try:
            keywords_entry.send_keys(keywords)
        except UnicodeDecodeError:
            migration_print("Unable to migrate keywords for " + new_url)
            migration_print("Title: " + old_meta["title"])
            migration_print("Description: " + old_meta["description"])
            migration_print("Keywords: " + old_meta["keywords"])
            migration_print(
                "-----------------------------------------------------------")
            ask_continue()
            return
    submit_button = browser.find_element_by_xpath('//input[@type="submit"]')
    submit_button.click()
    new_path = urlparse(new_url).path
    if not new_path:
        new_path = "/"
    else:
        ind = new_url.find(new_path)
        new_path = new_url[ind:]
    migration_print(new_path + " metadata migrated!")
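
# Illustrative sketch: migrating metadata for one page with a dedicated browser
# session.  Assumes settings["EXECUTABLE_PATH"] points at a chromedriver binary,
# as in migrate_meta below; the URLs are placeholders.
def _example_set_meta_single_page():
    browser = webdriver.Chrome(executable_path=settings["EXECUTABLE_PATH"])
    browser.maximize_window()
    try:
        set_meta("http://www.example-practice.com/contact",
                 "http://example-practice.televox.west.com/contact",
                 browser)
    finally:
        browser.quit()
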
def migrate_meta(old_url, new_url, progress_var=None, step=100.0):
    old_url = old_url.strip()
    new_url = new_url.strip()
    # remove the "/" at the end of the url
    if old_url[-1] == '/':
        old_url = old_url[:-1]
    if new_url[-1] == '/':
        new_url = new_url[:-1]
    # add "http://" before url
    if not old_url.startswith("http"):
        old_url = "http://" + old_url
    if not new_url.startswith("http"):
        new_url = "http://" + new_url
    # print out the information for old and new sites
    migration_print("-----------------------------------------------------")
    migration_print("Old URL: " + old_url)
    migration_print("New URL: " + new_url)
    migration_print("-----------------------------------------------------")
    browser = webdriver.Chrome(executable_path=settings["EXECUTABLE_PATH"])
    browser.maximize_window()
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        browser.quit()
        migration_print(
            "-----------------------------------------------------\n")
        return
    if progress_var:
        progress_var.set(progress_var.get() + step * 0.01)
    sites = get_sites(old_url)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        browser.quit()
        migration_print(
            "-----------------------------------------------------\n")
        return
    if progress_var:
        progress_var.set(progress_var.get() + step * 0.02)
    if not sites:
        migration_print("Unable to fetch subpage URLs from site map of " + old_url)
    # find blog pages
    old_blog_page = get_blog_site(old_url)
    new_blog_page = get_blog_site(new_url)
    blog_exists = True
    if not old_blog_page or not new_blog_page:
        blog_exists = False
    # calculate the step for each subpage
    step *= 0.97
    if blog_exists:
        page_step = step / 2 / (len(sites) + 1)
    else:
        page_step = step / (len(sites) + 1)
    # migrate metadata for homepage
    set_meta(old_url, new_url, browser)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        browser.quit()
        migration_print(
            "-----------------------------------------------------\n")
        return
    if progress_var:
        progress_var.set(progress_var.get() + page_step)
    # migrate all non-blog pages
    for site in sites:
        # check program status
        if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
            browser.quit()
            migration_print(
                "-----------------------------------------------------\n")
            return
        old_link = old_url + site
        new_link = new_url + site
        try:
            set_meta(old_link, new_link, browser)
        except NoSuchElementException:
            migration_print("Missing Page: " + new_link)
        if progress_var:
            progress_var.set(progress_var.get() + page_step)
    if not blog_exists:
        browser.quit()
        migration_print(
            "-----------------------------------------------------------")
        return
    step /= 2
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        browser.quit()
        migration_print(
            "-----------------------------------------------------\n")
        return
    old_blog_soup = get_soup(old_blog_page)
    new_blog_soup = get_soup(new_blog_page, browser)
    old_blogs = old_blog_soup.find_all(['h5', 'h3'])
    new_blogs = new_blog_soup.find_all('a', class_="title")
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        browser.quit()
        migration_print(
            "-----------------------------------------------------\n")
        return
    if progress_var:
        progress_var.set(progress_var.get() + step * 0.02)
    step *= 0.98
    # record blog posts as title, url pairs in dictionary
    old_list = []
    parsed_old_blogs = {}
    ind = 1
    # iterate over a copy so removing the "Categories" heading does not skip entries
    for blog in list(old_blogs):
        title = blog.get_text()
        if title == "Categories":
            old_blogs.remove(blog)
            continue
        try:
            link = blog.a.get('href')
        except AttributeError:
            migration_print("Unable to find blog metadata for " + title)
            continue
        if title in parsed_old_blogs:
            parsed_old_blogs[title + str(ind)] = link
            old_list.append((title + str(ind), link))
            ind += 1
        else:
            parsed_old_blogs[title] = link
            old_list.append((title, link))
    new_list = []
    parsed_new_blogs = {}
    ind = 1
    for blog in new_blogs:
        title = blog.get_text()
        link = new_url + blog.get('href')
        if title in parsed_new_blogs:
            parsed_new_blogs[title + str(ind)] = link
            new_list.append((title + str(ind), link))
            ind += 1
        else:
            parsed_new_blogs[title] = link
            new_list.append((title, link))
    if not old_list or not new_list:
        browser.quit()
        return
    blog_step = step / (len(old_list) + 1)
    # migrate metadata for blog index page
    set_meta(old_blog_page, new_blog_page, browser)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        browser.quit()
        migration_print(
            "-----------------------------------------------------\n")
        return
    if progress_var:
        progress_var.set(progress_var.get() + blog_step)
    # migrate metadata for blog posts
    for ind in range(len(old_list)):
        # check program status
        if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
            browser.quit()
            migration_print(
                "-----------------------------------------------------\n")
            return
        if old_list[ind][0] == new_list[ind][0]:
            set_meta(old_list[ind][1], new_list[ind][1], browser)
        else:
            try:
                set_meta(parsed_old_blogs[old_list[ind][0]],
                         parsed_new_blogs[old_list[ind][0]], browser)
            except KeyError:
                migration_print("Cannot migrate metadata for blog page " + new_list[ind][1])
                continue
        if progress_var:
            progress_var.set(progress_var.get() + blog_step)
    browser.quit()
    migration_print("-----------------------------------------------------\n")
def compare_link_soup(old_soup, new_soup, old_url, new_url, browser=None):
    detail = open("result\\site_detail.txt", 'a')
    old_hostname = urlparse(old_url).hostname
    new_hostname = urlparse(new_url).hostname
    if not old_hostname:
        old_hostname = ""
    if not new_hostname:
        new_hostname = ""
    # grab container
    old_content = old_soup.find('div', class_="right")
    new_content = new_soup.find('div', class_="right")
    if not old_content and new_content:
        if old_soup.find('div', id="content"):
            old_content = old_soup.find('div', id="content")
    if not old_content and new_content:
        record_error(old_url, "link container")
        detail.close()
        return False
    elif old_content and not new_content:
        record_error(new_url, "link container")
        detail.close()
        return False
    elif not old_content and not new_content:
        return True
    # vertical template uses different container
    if old_content.find('div', id="content"):
        old_content = old_soup.find('div', id="content")
    if new_content.find('div', id="content"):
        new_content = new_soup.find('div', id="content")
    # remove extra links from container
    if old_content:
        for (name, kwargs) in settings["COMPARE_OLD_LINK_IGNORE"]:
            for s in old_content.find_all(name, **kwargs):
                s.extract()
    if new_content:
        for (name, kwargs) in settings["COMPARE_NEW_LINK_IGNORE"]:
            for s in new_content.find_all(name, **kwargs):
                s.extract()
    if old_content is None:
        old_tags = []
    else:
        old_tags = old_content.find_all('a', href=True)
    if new_content is None:
        new_tags = []
    else:
        new_tags = new_content.find_all('a', href=True)
    # remove links that do not have any content inside
    old_tags = [
        tag for tag in old_tags
        if tag.text and not tag.text.isspace() or tag.find('img')
    ]
    new_tags = [
        tag for tag in new_tags
        if tag.text and not tag.text.isspace() or tag.find('img')
    ]
    # check for new links that direct to old site
    host_link = old_url.replace(urlparse(old_url).path, "")
    domain = get_domain(old_url)
    new_pass1 = True
    for tag in new_tags:
        href = tag['href']
        href_hostname = urlparse(href).hostname
        if not href_hostname:
            href_hostname = ""
        if href.find(host_link) != -1 \
                or (href_hostname.find(domain + '.') != -1 and href.find("televox.west.com") == -1) \
                or href.find("iapps") != -1:
            if new_pass1:
                entry_print("***********************************************")
                entry_print("LINKS THAT GO BACK TO OLD SITE!")
                entry_print("Old URL: " + old_url)
                entry_print("New URL: " + new_url)
                detail.write("LINKS THAT GO BACK TO OLD SITE!\n")
                detail.write("Old URL: " + old_url + "\n")
                detail.write("New URL: " + new_url + "\n")
                new_pass1 = False
            entry_print("Bad tag: " + str(tag))
            detail.write("Bad tag: " + str(tag) + "\n")
    if not new_pass1:
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
    # check for non-friendly urls
    new_pass2 = True
    for tag in new_tags:
        href = tag['href']
        if not href:
            continue
        if href.find("televox.west.com") != -1:
            if new_pass2:
                entry_print("***********************************************")
                entry_print("NON-FRIENDLY URL FOUND!")
                entry_print("New URL: " + new_url)
                detail.write("NON-FRIENDLY URL FOUND!\n")
                detail.write("New URL: " + new_url + "\n")
                new_pass2 = False
            entry_print("Bad tag: " + str(tag))
            detail.write("Bad tag: " + str(tag) + "\n")
    if not new_pass2:
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
    # remove file links (iterate over copies so removal does not skip elements)
    for tag in old_tags[:]:
        url = tag.get('href')
        if re.search("jpg|png|pdf|mp4", url):
            old_tags.remove(tag)
    for tag in new_tags[:]:
        url = tag.get('href')
        if re.search("jpg|png|pdf|mp4|UserFile", url):
            new_tags.remove(tag)
    bad_tags = []
    if len(old_tags) != len(new_tags):
        # remove 404 pages and file links from the old tags
        for tag in old_tags[:]:
            url = tag.get('href')
            if url is None:
                continue
            if url.startswith("https://"):
                continue
            if url.startswith("tel:") or url.startswith(
                    "mailto:") or url.find("#") != -1:
                continue
            if url.startswith("/"):
                url = "http://" + old_hostname + url
            old_target = get_soup(url)
            old_target_title = get_meta_soup(old_target, url)['title']
            if old_target_title.find("404") != -1 \
                    or re.search("page not found|the resource cannot be found",
                                 old_target_title.lower()) \
                    or old_target_title == "none":
                bad_tags.append((str(tag), old_target_title))
                old_tags.remove(tag)
        # check invalid links in new site
        new_invalid_links = []
        for tag in new_tags:
            url = tag.get('href')
            if url is None:
                continue
            if url.startswith("https://"):
                continue
            if url.startswith("tel:") or url.startswith(
                    "mailto:") or url.find("#") != -1 or url.startswith("/common"):
                continue
            if url.startswith("/"):
                url = "http://" + new_hostname + url
            if url.find("televox.west.com") != -1:
                new_target = get_soup(url, browser)
            else:
                new_target = get_soup(url)
            new_target_title = get_meta_soup(new_target, url)['title']
            if new_target_title.find("404") != -1 or new_target_title == "Page Not Found" \
                    or new_target_title == "none" \
                    or new_target_title == "The resource cannot be found.":
                new_invalid_links.append((str(tag), new_target_title))
        if new_invalid_links:
            entry_print("***********************************************")
            entry_print("INVALID LINK FOUND IN NEW SITE!")
            entry_print("New URL: " + new_url)
            detail.write("INVALID LINK FOUND IN NEW SITE!\n")
            detail.write("New URL: " + new_url + "\n")
            ind = 0
            for tag, target in new_invalid_links:
                ind += 1
                entry_print("Bad tag" + str(ind) + ": " + tag)
                entry_print("Target title: " + target)
                detail.write("Bad tag" + str(ind) + ": " + tag + "\n")
                detail.write("Target title: " + target + "\n")
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
        # check that number of links match; if not, return
        if len(new_tags) != len(old_tags):
            entry_print("***********************************************")
            entry_print(
                "NUMBER OF LINKS DIFFERENT OR 404 LINK EXISTS IN NEW PAGE!")
            entry_print("Old URL: " + old_url)
            entry_print("New URL: " + new_url)
            entry_print("Number of old links: " + str(len(old_tags)))
            entry_print("Number of new links: " + str(len(new_tags)))
            entry_print("Old tags: " + str(old_tags))
            entry_print("New tags: " + str(new_tags))
            if bad_tags:
                entry_print("404 tags in old site (removed):")
                for ind in range(len(bad_tags)):
                    entry_print("Tag" + str(ind + 1) + ": " + bad_tags[ind][0])
                    entry_print("Target title: " + bad_tags[ind][1])
            entry_print("***********************************************")
            detail.write(
                "NUMBER OF LINKS DIFFERENT OR 404 LINK EXISTS IN NEW PAGE!\n")
            detail.write("Old URL: " + old_url + "\n")
            detail.write("New URL: " + new_url + "\n")
            detail.write("Number of old links: " + str(len(old_tags)) + "\n")
            detail.write("Number of new links: " + str(len(new_tags)) + "\n")
            if bad_tags:
                detail.write("404 tag(s) in old site (removed):\n")
                for ind in range(len(bad_tags)):
                    detail.write("Tag" + str(ind + 1) + ": " + bad_tags[ind][0] + "\n")
                    detail.write("Target title: " + bad_tags[ind][1] + "\n")
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
            detail.close()
            return False
    # check that new and old links match
    new_pass3 = True
    count = 0
    for ind in range(len(new_tags)):
        old_link = old_tags[ind]['href'].replace("\\", "/").strip()
        new_link = new_tags[ind]['href'].replace("\\", "/").strip()
        if old_link == new_link:
            continue
        # take out the duplication part for old_link
        if old_link.find("#") != -1:
            old_ind = old_link.find("#")
            old_link = old_link[old_ind:]
        if new_link.find("#") != -1:
            new_ind = new_link.find("#")
            new_link = new_link[new_ind:]
        temp = old_link.split("/")
        if len(temp) > 2:
            if temp[-1] == temp[-2]:
                old_link = "/".join(temp[:-1])
        if urlparse(old_link).path == urlparse(new_link).path:
            continue
        if old_link.startswith("/"):
            old_link = "http://" + old_hostname + old_link
        # if the old link points to the homepage, then set it as "/"
        if old_link.endswith("/home") or old_link.endswith("/main"):
            old_link = "/"
        if new_link == "/home" or new_link == "/main":
            new_link = "/"
        if new_link != "/" and new_link.endswith("/"):
            new_link = new_link[:-1]
        if old_link != "/" and old_link.endswith("/"):
            old_link = old_link[:-1]
        if old_link != new_link and not new_link.startswith("/common"):
            if old_link.find("#") != -1 or new_link.find("#") != -1:
                count += 1
                if new_pass3:
                    entry_print(
                        "***********************************************")
                    entry_print("LINKS THAT DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("LINKS THAT DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    new_pass3 = False
                entry_print("Old link" + str(count) + ": " + old_link)
                entry_print("New link" + str(count) + ": " + new_link)
                entry_print("Old tag" + str(count) + ": " + str(old_tags[ind]))
                entry_print("New tag" + str(count) + ": " + str(new_tags[ind]))
                detail.write("Old tag" + str(count) + ": " + str(old_tags[ind]) + "\n")
                detail.write("New tag" + str(count) + ": " + str(new_tags[ind]) + "\n")
                continue
            if old_link.startswith("/"):
                old_link = "http://" + old_hostname + old_link.strip()
            if new_link.startswith("/"):
                new_link = "http://" + new_hostname + new_link.strip()
            old_target = get_soup(old_link)
            new_target = get_soup(new_link, browser=browser)
            old_target_title = replace_special(
                get_meta_soup(old_target, old_link)['title'])
            new_target_title = replace_special(
                get_meta_soup(new_target, new_link)['title'])
            if new_target_title.endswith("..."):
                new_target_title = new_target_title[:-3]
                old_target_title = old_target_title[:len(new_target_title)]
            if old_target_title != new_target_title:
                count += 1
                if new_pass3:
                    entry_print(
                        "***********************************************")
                    entry_print("LINKS THAT DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("LINKS THAT DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    new_pass3 = False
                entry_print("Old link" + str(count) + ": " + old_link)
                entry_print("New link" + str(count) + ": " + new_link)
                entry_print("Old target" + str(count) + ": " + old_target_title)
                entry_print("New target" + str(count) + ": " + new_target_title)
                entry_print("Old tag" + str(count) + ": " + str(old_tags[ind]))
                entry_print("New tag" + str(count) + ": " + str(new_tags[ind]))
                detail.write("Old tag" + str(count) + ": " + str(old_tags[ind]) + "\n")
                detail.write("New tag" + str(count) + ": " + str(new_tags[ind]) + "\n")
    if not new_pass3:
        detail.write("-----------------------------------------------\n")
        entry_print("***********************************************")
    detail.close()
    return new_pass1 and new_pass2 and new_pass3
def compare_site_thread(old_url, new_url, progress_var=None, step=100.0,
                        thread_pool_csv=None):
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return
    # checking multiple sites mode
    if thread_pool_csv:
        thread_pool = thread_pool_csv
    else:
        thread_pool = ThreadPool(settings["THREADPOOL_SIZE"])
        create_path()
    ind = 0
    old_url = old_url.strip()
    new_url = new_url.strip()
    # remove the "/" at the end of the url
    if old_url[-1] == '/':
        old_url = old_url[:-1]
    if new_url[-1] == '/':
        new_url = new_url[:-1]
    # add "http://" before url
    if not old_url.startswith("http"):
        old_url = "http://" + old_url
    if not new_url.startswith("http"):
        new_url = "http://" + new_url
    # print out the information for old and new sites
    entry_print("-----------------------------------------------------", True)
    entry_print("Old URL: " + old_url, True)
    entry_print("New URL: " + new_url, True)
    entry_print("-----------------------------------------------------", True)
    setup_step = step * 0.01
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)
    # check if the new site needs login
    new_test = get_soup(new_url)
    if new_test:
        title = new_test.find("title")
        if title and title.get_text().strip() == "Login":
            entry_print(
                "New site needs login. Please use login mode to check this site!\n",
                True)
            return -1
    setup_step = step * 0.01
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)
    # get the subpages of old and new sites
    try:
        sites = get_sites(old_url)
    except AttributeError:
        entry_print(
            "Can't find the site map from " + old_url +
            ". Please check if the url is valid!", True)
        thread_pool.destroy()
        return
    old_blog = get_blog_site(old_url)
    new_blog = get_blog_site(new_url)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        thread_pool.destroy()
        return
    blog_exists = False
    if old_blog and new_blog:
        blog_exists = True
    # if urls for subpages are not found
    if sites is None:
        record_error(new_url, "sites")
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    # if blog page is not found
    if old_blog is not None and new_blog is None:
        record_error(new_url, "blog")
    elif old_blog is None and new_blog is not None:
        record_error(old_url, "blog")
    setup_step = step * 0.02
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)
    # print out site information
    entry_print("Site Information: ", True)
    # calculate the step for each page
    step *= 0.96
    if blog_exists:
        page_step = step / 2 / (len(sites) + 1)
        entry_print("Old Blog: " + old_blog, True)
        entry_print("New Blog: " + new_blog, True)
    else:
        page_step = step / (len(sites) + 1)
    entry_print("Number of non-blog pages: " + str(len(sites)), True)
    # check the homepage
    thread_pool.add_task(compare_homepage, old_url=old_url, new_url=new_url,
                         progress_var=progress_var, step=page_step)
    # check all the sites in sitemap
    for site in sites:
        ind += 1
        if site.startswith("/home") or site.startswith("/main"):
            continue
        old_link = old_url + site
        new_link = new_url + site
        thread_pool.add_task(compare_page, old_url=old_link, new_url=new_link,
                             progress_var=progress_var, step=page_step)
    # check all the blog pages
    if blog_exists:
        old_blog_soup = get_soup(old_blog)
        new_blog_soup = get_soup(new_blog)
        compare_blog(old_blog_soup, new_blog_soup, old_blog, new_blog,
                     progress_var=progress_var, step=step / 2)
    # single site mode
    if not thread_pool_csv:
        thread_pool.wait_completion()
        thread_pool.destroy()
    entry_print("-----------------------------------------------------\n")
    return True
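
# Illustrative sketch (placeholder URLs): in single-site mode compare_site_thread
# builds its own ThreadPool from settings["THREADPOOL_SIZE"] and waits for
# completion; passing a shared pool via thread_pool_csv is how the multi-site
# (CSV) mode reuses workers.  A return of -1 means the new site requires login.
def _example_compare_site_threaded():
    ok = compare_site_thread("www.example-practice.com",
                             "example-practice.televox.west.com")
    if ok == -1:
        print("site requires login - use the Selenium/login mode instead")
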
def compare_site_selenium(old_url, new_url, progress_var=None, step=100.0):
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        entry_print("-----------------------------------------------------\n")
        return
    create_path()
    new_browser = webdriver.Chrome(executable_path=settings["EXECUTABLE_PATH"])
    new_browser.maximize_window()
    site_pass = True
    blog_pass = True
    ind = 0
    old_url = old_url.strip()
    new_url = new_url.strip()
    # remove the "/" at the end of url
    if old_url.endswith('/'):
        old_url = old_url[:-1]
    if new_url.endswith('/'):
        new_url = new_url[:-1]
    # add "http://" before url
    if not old_url.startswith("http"):
        old_url = "http://" + old_url
    if not new_url.startswith("http"):
        new_url = "http://" + new_url
    # print out the information for old and new sites
    entry_print("-----------------------------------------------------", True)
    entry_print("Old URL: " + old_url, True)
    entry_print("New URL: " + new_url, True)
    entry_print("-----------------------------------------------------", True)
    setup_step = step * 0.01
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)
    # get the domain name of old url and derive the new url
    sites = get_sites(old_url)
    old_blog = get_blog_site(old_url)
    new_blog = get_blog_site(new_url)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        new_browser.quit()
        entry_print("-----------------------------------------------------\n")
        return
    blog_exists = False
    if old_blog and new_blog:
        blog_exists = True
    # if subpages are not found
    if sites is None:
        record_error(new_url, "sites")
        return False
    # if blog page is not found
    if old_blog is not None and new_blog is None:
        record_error(new_url, "blog")
    elif old_blog is None and new_blog is not None:
        record_error(old_url, "blog")
    setup_step = step * 0.02
    step *= 0.97
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)
    if blog_exists:
        page_step = step / 2 / (len(sites) + 1)
    else:
        page_step = step / (len(sites) + 1)
    # check homepage
    homepage_pass = compare_homepage(old_url, new_url, browser=new_browser,
                                     progress_var=progress_var, step=page_step)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        new_browser.quit()
        entry_print("-----------------------------------------------------\n")
        return
    if homepage_pass == -1:
        return -1
    # check all the sites in sitemap
    for site in sites:
        # check program status
        if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
            new_browser.quit()
            entry_print("-----------------------------------------------------\n")
            return
        ind += 1
        if site.startswith("/home") or site.startswith("/main"):
            continue
        old_link = old_url + site
        new_link = new_url + site
        page_pass = compare_page(old_link, new_link, browser=new_browser,
                                 progress_var=progress_var, step=page_step)
        if not page_pass:
            site_pass = False
    # check all the blog entries
    if blog_exists:
        # check program status
        if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
            new_browser.quit()
            entry_print("-----------------------------------------------------\n")
            return
        old_blog_soup = get_soup(old_blog)
        new_blog_soup = get_soup_selenium(new_blog, new_browser)
        blog_pass = compare_blog(old_blog_soup, new_blog_soup, old_blog, new_blog,
                                 browser=new_browser, progress_var=progress_var,
                                 step=step / 2)
    entry_print("-----------------------------------------------------\n")
    new_browser.quit()
    return site_pass and homepage_pass and blog_pass
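
# Illustrative sketch (placeholder URLs): the Selenium variant drives a real
# Chrome session (settings["EXECUTABLE_PATH"]) so it can check sites that sit
# behind the CMS login; it returns True/False, or -1 on a credential problem.
def _example_compare_site_with_login():
    result = compare_site_selenium("www.example-practice.com",
                                   "example-practice.televox.west.com")
    print("site result: " + str(result))
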