Example 1
def get_soup_selenium(url, browser):
    wait = WebDriverWait(browser, 20)

    if re.search("pdf|jpg|png|mp4|tel:|mailto:|/UserFile", url):
        return None

    try:
        browser.get(url)
    except WebDriverException:
        return None

    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return None

    # use Selenium to log in to the page
    if browser.title == "Login":
        username = browser.find_element_by_id(
            'ctl00_ContentPlaceHolder1_txtUsername')
        username.send_keys(settings["USER_NAME"])
        password = browser.find_element_by_id(
            'ctl00_ContentPlaceHolder1_txtPassword')
        password.send_keys(settings["PASSWORD"])
        button = browser.find_element_by_name(
            'ctl00$ContentPlaceHolder1$btnLogin')
        button.click()

    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return None

    # wait until the page is fully loaded
    try:
        wait.until(lambda driver: driver.find_elements_by_xpath(
            "//span[@class='label_skin_corporation']|//li[@class='copy']"))
    except TimeoutException:
        return None

    title = browser.title
    if title == "Login":
        entry_print("Invalid credential! Please check your login settings.")
        raise CredentialError

    source_code = browser.page_source
    soup = BeautifulSoup(source_code, "html.parser")

    if soup is not None:
        if url.find("televox.west.com") != -1:
            for (name, kwargs) in settings["GET_NEW_SOUP_IGNORE"]:
                for s in soup.find_all(name, **kwargs):
                    s.extract()
        else:
            for (name, kwargs) in settings["GET_OLD_SOUP_IGNORE"]:
                for s in soup.find_all(name, **kwargs):
                    s.extract()

    return soup
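
The function above relies on module-level names defined elsewhere in the project (settings, status, entry_print, record_error, CredentialError) and on the Selenium 3 element-lookup API. As an illustration only, a minimal sketch of driving it with a Chrome WebDriver might look like this (the URL is a placeholder):

from selenium import webdriver

# Hypothetical usage sketch: open a browser, fetch one page, then quit.
browser = webdriver.Chrome()  # assumes chromedriver is available on PATH
soup = get_soup_selenium("http://example.com/", browser)
if soup is not None:
    print(soup.title.get_text() if soup.title else "no <title> found")
browser.quit()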
Example 2
def get_soup(url, browser=None):
    if not (url.startswith("http://") or url.startswith("https://")):
        url = "http://" + url

    if re.search("pdf|jpg|png|mp4|tel:|mailto:|/UserFile", url):
        return None

    # use selenium to retrieve the data
    if browser:
        return get_soup_selenium(url, browser)

    try:
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
    except ConnectionError:
        if url[7:].startswith("www."):
            return None
        new_link = url[:7] + "www." + url[7:]
        entry_print("Bad URL! Trying " + new_link)
        return get_soup(new_link)
    except ContentDecodingError:
        record_error(url, "site")
        return None
    except InvalidURL:
        record_error(url, "site")
        return None
    except OverflowError:
        record_error(url, "site")
        return None

    if soup is not None:
        if url.find("televox.west.com") != -1:
            for (name, kwargs) in settings["GET_NEW_SOUP_IGNORE"]:
                for s in soup.find_all(name, **kwargs):
                    s.extract()
        else:
            for (name, kwargs) in settings["GET_OLD_SOUP_IGNORE"]:
                for s in soup.find_all(name, **kwargs):
                    s.extract()

    return soup
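
The nested cleanup loop above (and the identical one in Example 1) strips every tag listed in the settings ignore rules. A small standalone sketch of that pattern, with a hypothetical rule list:

def strip_ignored(container, rules):
    # Remove every tag matched by a (name, kwargs) rule from the container.
    for (name, kwargs) in rules:
        for tag in container.find_all(name, **kwargs):
            tag.extract()

# e.g. strip_ignored(soup, [("script", {}), ("div", {"class_": "banner"})])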
Example 3
    def run(self):
        hour_var = 0
        minute_var = 0
        second_var = 0

        while self.controller.isAlive():
            sleep(1)
            second_var += 1

            # if second reaches 60
            if second_var == 60:
                second_var = 0
                minute_var += 1

            # if minute reaches 60
            if minute_var == 60:
                minute_var = 0
                hour_var += 1

            try:
                self.update(hour_var, minute_var, second_var)
            except TclError:
                continue
        entry_print("-----------------------------------------------------",
                    True)
        entry_print(
            "Process finished in %d h %d min %d s." %
            (hour_var, minute_var, second_var), True)
        entry_print("-----------------------------------------------------\n",
                    True)
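
The same elapsed time can also be derived from a single seconds counter with divmod rather than three manually carried variables; a minimal sketch (not part of the original class):

from time import time

start = time()
# ... work being timed ...
elapsed = int(time() - start)
minutes, seconds = divmod(elapsed, 60)
hours, minutes = divmod(minutes, 60)
print("Process finished in %d h %d min %d s." % (hours, minutes, seconds))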
Example 4
def check_homepage_link(old_soup, new_soup, old_url, new_url, browser=None):
    detail = open("result\\homepage_detail.txt", 'a')
    old_hostname = urlparse(old_url).hostname
    new_hostname = urlparse(new_url).hostname
    page_pass = True
    printable = set(string.printable)
    new_content = new_soup.find('div', class_="ptl_page")

    if old_url.endswith("/"):
        old_url = old_url[:-1]
    if new_url.endswith("/"):
        new_url = new_url[:-1]

    if not old_hostname:
        old_hostname = old_url
    if not new_hostname:
        new_hostname = new_url

    if not new_content:
        record_error(new_url, "new homepage container")
        return False

    # remove banner and navigation menu from soup
    if new_content:
        for (name, kwargs) in settings["HOMEPAGE_LINK_IGNORE"]:
            for s in new_content.find_all(name, **kwargs):
                s.extract()

    new_tags = new_content.find_all('a',
                                    href=re.compile("^(?!.*(#aftermap|#)).*$"))

    # check for new links that point back to the old site
    host_link = old_url.replace(urlparse(old_url).path, "")
    domain = get_domain(old_url)
    for tag in new_tags:
        href = tag['href']
        href_hostname = urlparse(href).hostname
        if href_hostname is None:
            href_hostname = ""
        if href.startswith("/"):
            continue
        if (href.startswith(host_link) and host_link != "") \
                or (href_hostname.find(domain + '.') != -1 and not href.startswith("mailto") and href.find("televox.west.com") == -1) \
                or href.find("iapps") != -1:
            page_pass = False
            entry_print("***********************************************")
            entry_print("HOMEPAGE LINKS THAT GO BACK TO OLD SITE!")
            entry_print("New URL: " + new_url)
            detail.write("HOMEPAGE LINKS THAT GO BACK TO OLD SITE!\n")
            detail.write("New URL: " + new_url + "\n")
            entry_print("Bad tag: " + str(tag))
            entry_print("***********************************************")
            detail.write("Bad tag: " + str(tag) + "\n")
            detail.write("-----------------------------------------------\n")
        if href.find("televox.west.com") != -1:
            page_pass = False
            entry_print("***********************************************")
            entry_print("NON-FRIENDLY URL FOUND! ")
            entry_print("New URL: " + new_url)
            detail.write("NON-FRIENDLY URL FOUND!\n")
            detail.write("New URL: " + new_url + "\n")
            entry_print("Bad tag: " + str(tag))
            entry_print("***********************************************")
            detail.write("Bad tag: " + str(tag) + "\n")
            detail.write("-----------------------------------------------\n")

    # check invalid links in new site
    new_invalid_links = []
    for tag in new_tags:
        url = tag.get('href')
        if url is None:
            continue
        if url.startswith("https://"):
            continue
        if url.startswith("tel:") or url.startswith(
                "mailto:") or url.find("#") != -1 or url.startswith("/common"):
            continue
        if url.startswith("/"):
            url = "http://" + new_hostname + url
        if url.find("televox.west.com") != -1:
            new_target = get_soup(url, browser)
        else:
            new_target = get_soup(url)
        new_target_title = get_meta_soup(new_target, url)['title']
        if new_target_title.find("404") != -1 or new_target_title == "Page Not Found" or new_target_title == "none" \
                or new_target_title == "The resource cannot be found.":
            new_invalid_links.append((str(tag), new_target_title))

    if new_invalid_links:
        entry_print("***********************************************")
        entry_print("INVALID LINK FOUND IN HOMEPAGE!")
        entry_print("New URL: " + new_url)
        detail.write("-----------------------------------------------\n")
        detail.write("INVALID LINK FOUND IN HOMEPAGE!\n")
        detail.write("New URL: " + new_url + "\n")
        ind = 0
        for tag, target in new_invalid_links:
            ind += 1
            entry_print("Bad tag" + str(ind) + ": " + tag)
            entry_print("Target title: " + target)
            detail.write("Bad tag" + str(ind) + ": " + tag + "\n")
            detail.write("Target title: " + target + "\n")
        entry_print("***********************************************")

    # check published links for homepage
    old_publish = old_soup.find('nav', id="utility-navigation")
    new_publish = new_soup.find('nav', id="utility-navigation")

    if old_publish:
        old_published_links = old_publish.find_all(
            'a', href=re.compile("^((?!#).)*$"))
    else:
        old_published_links = []
    if new_publish:
        new_published_links = new_publish.find_all(
            'a', href=re.compile("^((?!#).)*$"))
    else:
        new_published_links = []

    if len(old_published_links) != len(new_published_links):
        entry_print("***********************************************")
        entry_print("NUMBER OF PUBLISHED LINKS DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_published_links)))
        entry_print("Number of new links: " + str(len(new_published_links)))
        entry_print("***********************************************")
        detail.write("NUMBER OF PUBLISHED LINKS DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_published_links)) +
                     "\n")
        detail.write("Number of new links: " + str(len(new_published_links)) +
                     "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    else:
        publish_pass = True
        # check the href and name for each published link
        for ind in range(len(new_published_links)):
            old_link = old_published_links[ind]['href']
            new_link = new_published_links[ind]['href']
            old_link_dup = old_link.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            new_link_dup = new_link.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            old_name = old_published_links[ind].get_text().replace("  ", " ")
            new_name = new_published_links[ind].get_text().replace("  ", " ")
            old_name = "".join([i for i in old_name
                                if i in printable]).strip().upper()
            new_name = "".join([i for i in new_name
                                if i in printable]).strip().upper()
            old_name_dup = old_name.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            new_name_dup = new_name.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            if old_link_dup != new_link_dup:
                if old_link.startswith("tel:") or old_link.startswith(
                        "mailto:") or unicode(old_link[0]).isnumeric():
                    continue

                if old_link.startswith("/"):
                    old_link = old_hostname + old_link
                if new_link.startswith("/"):
                    new_link = new_hostname + new_link

                old_target = get_soup(old_link)
                new_target = get_soup(new_link, browser=browser)
                old_target_title = get_meta_soup(old_target, old_link)['title']
                new_target_title = get_meta_soup(new_target, new_link)['title']

                if new_target_title.endswith("..."):
                    new_target_title = new_target_title[:-3]
                    old_target_title = old_target_title[:len(new_target_title)]

                if old_target_title != new_target_title:
                    if publish_pass:
                        entry_print(
                            "***********************************************")
                        entry_print("PUBLISHED LINKS DO NOT MATCH!")
                        entry_print("Old URL: " + old_url)
                        entry_print("New URL: " + new_url)
                        detail.write("PUBLISHED LINKS DO NOT MATCH!\n")
                        detail.write("Old URL: " + old_url + "\n")
                        detail.write("New URL: " + new_url + "\n")
                        publish_pass = False
                        page_pass = False
                    entry_print("Old target: " + old_target_title)
                    entry_print("New target: " + new_target_title)
                    entry_print("Old link: " + old_link)
                    entry_print("New link: " + new_link)
                    detail.write("Old target: " + old_target_title + "\n")
                    detail.write("New target: " + new_target_title + "\n")
                    detail.write("Old link: " + old_link + "\n")
                    detail.write("New link: " + new_link + "\n")
            if old_name_dup != new_name_dup:
                if publish_pass:
                    entry_print(
                        "***********************************************")
                    entry_print("PUBLISHED LINK NAMES DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("PUBLISHED LINK NAMES DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    publish_pass = False
                    page_pass = False
                entry_print("Old name: " + old_name)
                entry_print("New name: " + new_name)
                detail.write("Old name: " + old_name + "\n")
                detail.write("New name: " + new_name + "\n")
        if not publish_pass:
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")

    # check social media links for homepage
    old_social = old_soup.find('nav', class_="social-navigation")
    new_social = new_soup.find('nav', class_="social-navigation")

    if old_social:
        old_social_links = old_social.find_all('a')
    else:
        old_social_links = []
    if new_social:
        new_social_links = new_social.find_all('a')
    else:
        new_social_links = []

    if len(old_social_links) != len(new_social_links):
        entry_print("***********************************************")
        entry_print("NUMBER OF SOCIAL LINKS DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_social_links)))
        entry_print("Number of new links: " + str(len(new_social_links)))
        entry_print("***********************************************")
        detail.write("NUMBER OF SOCIAL LINKS DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_social_links)) +
                     "\n")
        detail.write("Number of new links: " + str(len(new_social_links)) +
                     "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    else:
        social_pass = True
        # check the href and name for each social link
        for ind in range(len(new_social_links)):
            old_link = old_social_links[ind]['href']
            new_link = new_social_links[ind]['href']
            old_link_reversed = old_social_links[len(old_social_links) - ind -
                                                 1]['href']
            if old_link != new_link and old_link_reversed != new_link:
                if new_link.startswith("/"):
                    new_link = new_hostname + new_link
                if old_link.startswith("/"):
                    old_link = old_hostname + old_link

                old_target = get_soup(old_link)
                new_target = get_soup(new_link)
                old_target_title = replace_special(
                    get_meta_soup(old_target, old_link)['title'])
                new_target_title = replace_special(
                    get_meta_soup(new_target, new_link)['title'])

                if new_target_title.endswith("..."):
                    new_target_title = new_target_title[:-3]
                    old_target_title = old_target_title[:len(new_target_title)]

                if old_target_title != new_target_title:
                    if social_pass:
                        entry_print(
                            "***********************************************")
                        entry_print("SOCIAL LINKS DO NOT MATCH!")
                        entry_print("Old URL: " + old_url)
                        entry_print("New URL: " + new_url)
                        detail.write("SOCIAL LINKS DO NOT MATCH!\n")
                        detail.write("Old URL: " + old_url + "\n")
                        detail.write("New URL: " + new_url + "\n")
                        social_pass = False
                    entry_print("Old target: " + old_target_title)
                    entry_print("New target: " + new_target_title)
                    entry_print("Old link: " + old_link)
                    entry_print("New link: " + new_link)
                    detail.write("Old target: " + old_target_title + "\n")
                    detail.write("New target: " + new_target_title + "\n")
                    detail.write("Old link: " + old_link + "\n")
                    detail.write("New link: " + new_link + "\n")
        if not social_pass:
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
    detail.close()
    return page_pass
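
Each problem found by this function is reported twice, once through entry_print and once to homepage_detail.txt. A hypothetical helper that would collapse those repeated pairs (the name report is illustrative, not part of the original code):

def report(detail, message):
    # Write the same message to the console log and to the open detail file.
    entry_print(message)
    detail.write(message + "\n")

# e.g. report(detail, "New URL: " + new_url)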
Example 5
def check_homepage_image(new_soup, new_url):
    detail = open("result\\homepage_detail.txt", 'a')
    page_pass = True
    new_content = new_soup.find('div', class_="ptl_page")

    # remove banner and navigation menu from soup
    if new_content:
        for (name, kwargs) in settings["HOMEPAGE_IMAGE_IGNORE"]:
            for s in new_content.find_all(name, **kwargs):
                s.extract()

    # find images with missing alt text
    try:
        bad_images = new_content.find_all(
            'img',
            alt=lambda x: not x,
            src=re.compile("^((/common/)(?!(data|resource)))"))
    except AttributeError:
        bad_images = []
    if bad_images:
        entry_print("***********************************************")
        entry_print("HOMEPAGE IMAGES WITHOUT ALT TEXT FOUND!")
        entry_print("New URL: " + new_url)
        detail.write("HOMEPAGE IMAGES WITHOUT ALT TEXT FOUND!\n")
        detail.write("New URL: " + new_url + "\n")
        for image in bad_images:
            entry_print("Bad image: " + str(image))
            detail.write("Bad image: " + str(image) + "\n")
        entry_print("***********************************************")
        entry_print("***********************************************\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False

    # find images that are not locally stored
    try:
        bad_images = new_content.find_all(
            'img',
            src=re.compile(
                "^(?!.*(/common/|/UserFiles/|http://www.deardoctor.com|data|televoxsites)).*$"
            ))
    except AttributeError:
        bad_images = []
    if bad_images:
        entry_print("***********************************************")
        entry_print("HOMEPAGE IMAGES NOT LOCALLY STORED!")
        entry_print("New URL: " + new_url)
        detail.write("HOMEPAGE IMAGES NOT LOCALLY STORED!\n")
        detail.write("New URL: " + new_url + "\n")
        for image in bad_images:
            entry_print("Bad image: " + str(image))
            detail.write("Bad image: " + str(image) + "\n")
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
        page_pass = False

    # check for images with id
    try:
        bad_images = list(
            set(
                new_content.find_all('img', imagesiteid=True) +
                new_content.find_all('img', objectid=True)))
    except AttributeError:
        bad_images = []
    if bad_images:
        entry_print("***********************************************")
        entry_print("HOMEPAGE IMAGES WITH ID FOUND!")
        entry_print("New URL: " + new_url)
        detail.write("HOMEPAGE IMAGES WITH ID FOUND!\n")
        detail.write("New URL: " + new_url + "\n")
        for image in bad_images:
            entry_print("Bad image: " + str(image))
            detail.write("Bad image: " + str(image) + "\n")
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    detail.close()
    return page_pass
Example 6
def compare_homepage_content(old_soup, new_soup, old_url, new_url):
    detail = open("result\\homepage_detail.txt", 'a')
    page_pass = True
    old_container1 = old_soup.find('div', id="content")
    old_container2 = old_soup.find('div', id="features")
    new_container1 = new_soup.find('div', id="content")
    new_container2 = new_soup.find('div', id="features")

    if old_container1:
        for (name, kwargs) in settings["OLD_HOMEPAGE_CONTENT_IGNORE"]:
            for s in old_container1.find_all(name, **kwargs):
                s.extract()
    if old_container2:
        for (name, kwargs) in settings["OLD_HOMEPAGE_CONTENT_IGNORE"]:
            for s in old_container2.find_all(name, **kwargs):
                s.extract()
    if new_container1:
        for (name, kwargs) in settings["NEW_HOMEPAGE_CONTENT_IGNORE"]:
            for s in new_container1.find_all(name, **kwargs):
                s.extract()
    if new_container2:
        for (name, kwargs) in settings["NEW_HOMEPAGE_CONTENT_IGNORE"]:
            for s in new_container2.find_all(name, **kwargs):
                s.extract()

    old_content = get_homepage_content(old_container1) + get_homepage_content(
        old_container2)
    new_content = get_homepage_content(new_container1) + get_homepage_content(
        new_container2)

    if new_content.find("Read More") != -1:
        entry_print("***********************************************")
        entry_print("HOMEPAGE CONTAINS 'READ MORE'!")
        entry_print("New URL: " + new_url)
        entry_print("***********************************************")
        detail.write("HOMEPAGE CONTAINS READ MORE!\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False

    old_content = old_content.replace("Read More", "Learn More").replace(
        "Read more", "Learn More")
    old_content = old_content.replace("...", "").replace(">", "")
    new_content = new_content.replace("...", "").replace(">", "")
    old_content = old_content.replace("Learn More",
                                      "").replace("Learn more", "")
    new_content = new_content.replace("Learn More", "")
    old_content = replace_special(old_content)
    new_content = replace_special(new_content)

    if not old_content and new_content:
        record_error(old_url, "homepage container")
        detail.close()
        return False
    elif old_content and not new_content:
        record_error(new_url, "homepage container")
        detail.close()
        return False

    if old_content.replace(" ", "") != new_content.replace(" ", ""):
        entry_print("***********************************************")
        entry_print("HOMEPAGE CONTENT DIFFERENCE FOUND!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Old content: " + old_content)
        entry_print("New content: " + new_content)
        entry_print("***********************************************")
        detail.write("HOMEPAGE CONTENT DIFFERENCE FOUND!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Old content: " + old_content + "\n")
        detail.write("New content: " + new_content + "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    detail.close()
    return page_pass
Example 7
def compare_meta_soup(old_soup, new_soup, old_url, new_url):
    page_pass = True
    file = open("result\\meta_detail.txt", "a")

    old_meta = get_meta_soup(old_soup, old_url)
    new_meta = get_meta_soup(new_soup, new_url)

    old_title = replace_special(old_meta['title'])
    new_title = replace_special(new_meta['title'])
    old_desc = replace_special(old_meta['description'])
    new_desc = replace_special(new_meta['description'])
    old_key = replace_special(old_meta['keywords'])
    new_key = replace_special(new_meta['keywords'])

    # ignore the omitted content
    if new_title.endswith("..."):
        new_title = new_title[:len(new_title) - 3]
        old_title = old_title[:len(new_title)]

    if old_desc.startswith("Learn more about"):
        old_desc = "none"

    while old_title.find("  ") != -1:
        old_title = old_title.replace("  ", " ")
    while new_title.find("  ") != -1:
        new_title = new_title.replace("  ", " ")

    title_same = old_title == new_title
    desc_same = old_desc == new_desc
    key_same = old_key == new_key

    # if the old page does not exist, then skip the site
    # (a new space page will be created in the new site)
    if old_title == "The resource cannot be found." or old_title.startswith(
            "404"):
        title_same = True
        desc_same = True
        key_same = True

    if old_title.lower() != "page not found" and new_title == "Page Not Found":
        entry_print("***********************************************")
        entry_print("MISSING PAGE FOUND!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("***********************************************")
        file.write("-----------------------------------------------\n")
        file.write("MISSING PAGE FOUND!\n")
        file.write("Old URL: " + old_url + "\n")
        file.write("New URL: " + new_url + "\n")
        file.close()
        return -1

    if not (title_same and desc_same and key_same):
        # print and record the issue in meta.txt
        file.write("-----------------------------------------------\n")
        file.write("Old URL: " + old_url + "\n")
        file.write("New URL: " + new_url + "\n")
        entry_print("***********************************************")
        entry_print("METADATA DIFFERENCE FOUND!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        if not title_same:
            entry_print("Old title: " + old_title)
            entry_print("New title: " + new_title)
            file.write("Old title: " + old_title + "\n")
            file.write("New title: " + new_title + "\n")
        if not desc_same:
            entry_print("Old description: " + old_desc)
            entry_print("New description: " + new_desc)
            file.write("Old description: " + old_desc + "\n")
            file.write("New description: " + new_desc + "\n")
        if not key_same:
            entry_print("Old keywords: " + old_key)
            entry_print("New keywords: " + new_key)
            file.write("Old keywords: " + old_key + "\n")
            file.write("New keywords: " + new_key + "\n")
        entry_print("***********************************************")
        page_pass = False
    file.close()
    return page_pass
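
The two while loops that collapse doubled spaces can be expressed as a single regex substitution; a small equivalent sketch:

import re

# Collapse any run of two or more spaces into one in a single pass.
old_title = re.sub(r" {2,}", " ", old_title)
new_title = re.sub(r" {2,}", " ", new_title)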
Example 8
def compare_image_soup(old_soup, new_soup, old_url, new_url):
    detail = open("result\\site_detail.txt", 'a')
    page_pass = True
    # grab container
    old_content = old_soup.find('div', class_="right")
    new_content = new_soup.find('div', class_="right")

    # check container exists and grab images
    if (not old_content
            or old_content.get_text().isspace()) and new_content is not None:
        if old_soup.find('div', id="content"):
            old_content = old_soup.find('div', id="content")
        else:
            old_content = old_soup.find('div',
                                        id=re.compile("Overview|overview"))

    # vertical template uses different container
    if old_content and old_content.find('div', id="content"):
        old_content = old_content.find('div', id="content")

    if new_content and new_content.find('div', id="content"):
        new_content = new_content.find('div', id="content")

    if not old_content and new_content:
        record_error(old_url, "image container")
        detail.close()
        return False
    elif old_content and not new_content:
        record_error(new_url, "image container")
        detail.close()
        return False

    # remove unnecessary tags from container
    if old_content:
        for (name, kwargs) in settings["COMPARE_OLD_IMAGE_IGNORE"]:
            for s in old_content.find_all(name, **kwargs):
                s.extract()
    if new_content:
        for (name, kwargs) in settings["COMPARE_NEW_IMAGE_IGNORE"]:
            for s in new_content.find_all(name, **kwargs):
                s.extract()

    if old_content is None:
        old_images = []
    else:
        old_images = old_content.find_all('img')

    if new_content is None:
        new_images = []
    else:
        new_images = new_content.find_all('img')

    # check that the number of images is the same
    if len(new_images) != len(old_images):
        entry_print("***********************************************")
        entry_print("NUMBER OF IMAGES DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old images: " + str(len(old_images)))
        entry_print("Number of new images: " + str(len(new_images)))
        entry_print("Old images: " + str(old_images))
        entry_print("New images: " + str(new_images))
        entry_print("***********************************************")
        detail.write("NUMBER OF IMAGES DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old images: " + str(len(old_images)) + "\n")
        detail.write("Number of new images: " + str(len(new_images)) + "\n")
        detail.write("-----------------------------------------------\n")
        detail.close()
        page_pass = False
        return page_pass

    # check that images have the same class
    new_pass = True
    count = 0
    for ind in range(len(new_images)):
        old_class = old_images[ind].get('class')
        new_class = new_images[ind].get('class')

        if old_class != new_class:
            count += 1
            if new_pass:
                entry_print("***********************************************")
                entry_print("IMAGES WITH WRONG CLASS!")
                entry_print("Old URL: " + old_url)
                entry_print("New URL: " + new_url)
                detail.write("IMAGES WITH WRONG CLASS!\n")
                detail.write("Old URL: " + old_url + "\n")
                detail.write("New URL: " + new_url + "\n")
                new_pass = False
                page_pass = False
            entry_print("Old image" + str(count) + ": " + str(old_images[ind]))
            entry_print("New image" + str(count) + ": " + str(new_images[ind]))
            detail.write("Old image" + str(count) + ": " +
                         str(old_images[ind]) + "\n")
            detail.write("New image" + str(count) + ": " +
                         str(new_images[ind]) + "\n")
    if not new_pass:
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")

    if new_content:
        for (name, kwargs) in settings["CHECKING_IMAGE_IGNORE"]:
            for s in new_content.find_all(name, **kwargs):
                s.extract()

    # check for images with id
    try:
        bad_images = list(
            set(
                new_content.find_all('img', imagesiteid=True) +
                new_content.find_all('img', objectid=True)))
    except AttributeError:
        bad_images = []
    if bad_images:
        entry_print("***********************************************")
        entry_print("IMAGES WITH ID FOUND!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        detail.write("IMAGES WITH ID FOUND!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        for image in bad_images:
            entry_print("Bad image: " + str(image))
            detail.write("Bad image: " + str(image) + "\n")
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
        page_pass = False

    # find images with missing alt text
    try:
        bad_images = new_content.find_all(
            'img',
            alt=lambda x: not x,
            src=re.compile("^((/common/)(?!(data|resource)))"))
    except AttributeError:
        bad_images = []
    if bad_images:
        entry_print("***********************************************")
        entry_print("IMAGES MISSING ALT TEXT FOUND!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        detail.write("IMAGES MISSING ALT TEXT FOUND!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        for image in bad_images:
            entry_print("Bad image: " + str(image))
            detail.write("Bad image: " + str(image) + "\n")
        detail.write("-----------------------------------------------\n")
        entry_print("***********************************************")
        page_pass = False

    # find images that are not locally stored
    try:
        bad_images = new_content.find_all(
            'img',
            src=re.compile(
                "^(?!.*(/common/|/UserFiles/|deardoctor|data|televoxsites)).*$"
            ))
    except AttributeError:
        bad_images = []
    if bad_images:
        entry_print("***********************************************")
        entry_print("IMAGES NOT LOCALLY STORED!")
        entry_print("New URL: " + new_url)
        detail.write("IMAGES NOT LOCALLY STORED!\n")
        detail.write("New URL: " + new_url + "\n")
        for image in bad_images:
            entry_print("Bad image: " + str(image))
            detail.write("Bad image: " + str(image) + '\n')
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
        page_pass = False

    detail.close()
    return page_pass
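
The alt=lambda x: not x filter used above matches img tags whose alt attribute is either missing or empty. A tiny self-contained check of that behavior:

from bs4 import BeautifulSoup

html = ('<img src="/common/a.png">'
        '<img src="/common/b.png" alt="">'
        '<img src="/common/c.png" alt="Office photo">')
demo = BeautifulSoup(html, "html.parser")
# Only the first two images (missing alt, empty alt) are matched.
print(len(demo.find_all('img', alt=lambda x: not x)))  # 2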
Example 9
def compare_link_soup(old_soup, new_soup, old_url, new_url, browser=None):
    detail = open("result\\site_detail.txt", 'a')
    old_hostname = urlparse(old_url).hostname
    new_hostname = urlparse(new_url).hostname

    if not old_hostname:
        old_hostname = ""
    if not new_hostname:
        new_hostname = ""

    # grab container
    old_content = old_soup.find('div', class_="right")
    new_content = new_soup.find('div', class_="right")

    if not old_content and new_content:
        if old_soup.find('div', id="content"):
            old_content = old_soup.find('div', id="content")

    if not old_content and new_content:
        record_error(old_url, "link container")
        detail.close()
        return False
    elif old_content and not new_content:
        record_error(new_url, "link container")
        detail.close()
        return False
    elif not old_content and not new_content:
        return True

    # vertical template uses different container
    if old_content.find('div', id="content"):
        old_content = old_soup.find('div', id="content")

    if new_content.find('div', id="content"):
        new_content = new_soup.find('div', id="content")

    # remove extra links from container
    if old_content:
        for (name, kwargs) in settings["COMPARE_OLD_LINK_IGNORE"]:
            for s in old_content.find_all(name, **kwargs):
                s.extract()

    if new_content:
        for (name, kwargs) in settings["COMPARE_NEW_LINK_IGNORE"]:
            for s in new_content.find_all(name, **kwargs):
                s.extract()

    if old_content is None:
        old_tags = []
    else:
        old_tags = old_content.find_all('a', href=True)

    if new_content is None:
        new_tags = []
    else:
        new_tags = new_content.find_all('a', href=True)

    # remove links that do not have any content inside
    old_tags = [
        tag for tag in old_tags
        if tag.text and not tag.text.isspace() or tag.find('img')
    ]
    new_tags = [
        tag for tag in new_tags
        if tag.text and not tag.text.isspace() or tag.find('img')
    ]

    # check for new links that point back to the old site
    host_link = old_url.replace(urlparse(old_url).path, "")
    domain = get_domain(old_url)
    new_pass1 = True
    for tag in new_tags:
        href = tag['href']
        href_hostname = urlparse(href).hostname
        if not href_hostname:
            href_hostname = ""
        if href.find(host_link) != -1 or (href_hostname.find(domain + '.') != -1 and href.find("televox.west.com") == -1) \
                or href.find("iapps") != -1:
            if new_pass1:
                entry_print("***********************************************")
                entry_print("LINKS THAT GO BACK TO OLD SITE!")
                entry_print("Old URL: " + old_url)
                entry_print("New URL: " + new_url)
                detail.write("LINKS THAT GO BACK TO OLD SITE!\n")
                detail.write("Old URL: " + old_url + "\n")
                detail.write("New URL: " + new_url + "\n")
                new_pass1 = False
            entry_print("Bad tag: " + str(tag))
            detail.write("Bad tag: " + str(tag) + "\n")
    if not new_pass1:
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")

    # check for non-friendly urls
    new_pass2 = True
    for tag in new_tags:
        href = tag['href']
        if not href:
            continue
        if href.find("televox.west.com") != -1:
            if new_pass2:
                entry_print("***********************************************")
                entry_print("NON-FRIENDLY URL FOUND!")
                entry_print("New URL: " + new_url)
                detail.write("NON-FRIENDLY URL FOUND!\n")
                detail.write("New URL: " + new_url + "\n")
                new_pass2 = False
            entry_print("Bad tag: " + str(tag))
            detail.write("Bad tag: " + str(tag) + "\n")
    if not new_pass2:
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")

    # remove file links (filter into new lists instead of removing while iterating)
    old_tags = [
        tag for tag in old_tags
        if not re.search("jpg|png|pdf|mp4", tag.get('href'))
    ]
    new_tags = [
        tag for tag in new_tags
        if not re.search("jpg|png|pdf|mp4|UserFile", tag.get('href'))
    ]

    bad_tags = []
    if len(old_tags) != len(new_tags):
        # remove 404 pages and file links from the old tags
        for tag in list(old_tags):  # iterate over a copy so removal inside the loop is safe
            url = tag.get('href')
            if url is None:
                continue
            if url.startswith("https://"):
                continue
            if url.startswith("tel:") or url.startswith(
                    "mailto:") or url.find("#") != -1:
                continue
            if url.startswith("/"):
                url = "http://" + old_hostname + url
            old_target = get_soup(url)
            old_target_title = get_meta_soup(old_target, url)['title']
            if old_target_title.find("404") != -1 \
                    or re.search("page not found|the resource cannot be found", old_target_title.lower()) \
                    or old_target_title == "none":
                bad_tags.append((str(tag), old_target_title))
                old_tags.remove(tag)

    # check invalid links in new site
    new_invalid_links = []
    for tag in new_tags:
        url = tag.get('href')
        if url is None:
            continue
        if url.startswith("https://"):
            continue
        if url.startswith("tel:") or url.startswith(
                "mailto:") or url.find("#") != -1 or url.startswith("/common"):
            continue
        if url.startswith("/"):
            url = "http://" + new_hostname + url
        if url.find("televox.west.com") != -1:
            new_target = get_soup(url, browser)
        else:
            new_target = get_soup(url)
        new_target_title = get_meta_soup(new_target, url)['title']
        if new_target_title.find("404") != -1 or new_target_title == "Page Not Found" or new_target_title == "none" \
                or new_target_title == "The resource cannot be found.":
            new_invalid_links.append((str(tag), new_target_title))

    if new_invalid_links:
        entry_print("***********************************************")
        entry_print("INVALID LINK FOUND IN NEW SITE!")
        entry_print("New URL: " + new_url)
        detail.write("INVALID LINK FOUND IN NEW SITE!\n")
        detail.write("New URL: " + new_url + "\n")
        ind = 0
        for tag, target in new_invalid_links:
            ind += 1
            entry_print("Bad tag" + str(ind) + ": " + tag)
            entry_print("Target title: " + target)
            detail.write("Bad tag" + str(ind) + ": " + tag + "\n")
            detail.write("Target title: " + target + "\n")
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")

    # check that the number of links matches; if not, return
    if len(new_tags) != len(old_tags):
        entry_print("***********************************************")
        entry_print(
            "NUMBER OF LINKS DIFFERENT OR 404 LINK EXISTS IN NEW PAGE!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_tags)))
        entry_print("Number of new links: " + str(len(new_tags)))
        entry_print("Old tags: " + str(old_tags))
        entry_print("New tags: " + str(new_tags))
        if bad_tags:
            entry_print("404 tags in old site (removed):")
            for ind in range(len(bad_tags)):
                entry_print("Tag" + str(ind + 1) + ": " + bad_tags[ind][0])
                entry_print("Target title: " + bad_tags[ind][1])
        entry_print("***********************************************")
        detail.write(
            "NUMBER OF LINKS DIFFERENT OR 404 LINK EXISTS IN NEW PAGE!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_tags)) + "\n")
        detail.write("Number of new links: " + str(len(new_tags)) + "\n")
        if bad_tags:
            detail.write("404 tag(s) in old site (removed):\n")
            for ind in range(len(bad_tags)):
                detail.write("Tag" + str(ind + 1) + ": " + bad_tags[ind][0] +
                             "\n")
                detail.write("Target title: " + bad_tags[ind][1] + "\n")
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
        detail.close()
        return False

    # check that new and old links match
    new_pass3 = True
    count = 0
    for ind in range(len(new_tags)):
        old_link = old_tags[ind]['href'].replace("\\", "/").strip()
        new_link = new_tags[ind]['href'].replace("\\", "/").strip()
        if old_link == new_link:
            continue

        # take out the duplication part for old_link
        if old_link.find("#") != -1:
            old_ind = old_link.find("#")
            old_link = old_link[old_ind:]
        if new_link.find("#") != -1:
            new_ind = new_link.find("#")
            new_link = new_link[new_ind:]

        temp = old_link.split("/")
        if len(temp) > 2:
            if temp[-1] == temp[-2]:
                old_link = "/".join(temp[:-1])
        if urlparse(old_link).path == urlparse(new_link).path:
            continue

        if old_link.startswith("/"):
            old_link = "http://" + old_hostname + old_link
        # if the old link points to the homepage, then set it as "/"
        if old_link.endswith("/home") or old_link.endswith("/main"):
            old_link = "/"
        if new_link == "/home" or new_link == "/main":
            new_link = "/"
        if new_link != "/" and new_link.endswith("/"):
            new_link = new_link[:-1]
        if old_link != "/" and old_link.endswith("/"):
            old_link = old_link[:-1]

        if old_link != new_link and not new_link.startswith("/common"):
            if old_link.find("#") != -1 or new_link.find("#") != -1:
                count += 1
                if new_pass3:
                    entry_print(
                        "***********************************************")
                    entry_print("LINKS THAT DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("LINKS THAT DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    new_pass3 = False
                entry_print("Old link" + str(count) + ": " + old_link)
                entry_print("New link" + str(count) + ": " + new_link)
                entry_print("Old tag" + str(count) + ": " + str(old_tags[ind]))
                entry_print("New tag" + str(count) + ": " + str(new_tags[ind]))
                detail.write("Old tag" + str(count) + ": " +
                             str(old_tags[ind]) + "\n")
                detail.write("New tag" + str(count) + ": " +
                             str(new_tags[ind]) + "\n")
                continue

            if old_link.startswith("/"):
                old_link = "http://" + old_hostname + old_link.strip()
            if new_link.startswith("/"):
                new_link = "http://" + new_hostname + new_link.strip()

            old_target = get_soup(old_link)
            new_target = get_soup(new_link, browser=browser)
            old_target_title = replace_special(
                get_meta_soup(old_target, old_link)['title'])
            new_target_title = replace_special(
                get_meta_soup(new_target, new_link)['title'])

            if new_target_title.endswith("..."):
                new_target_title = new_target_title[:-3]
                old_target_title = old_target_title[:len(new_target_title)]

            if old_target_title != new_target_title:
                count += 1
                if new_pass3:
                    entry_print(
                        "***********************************************")
                    entry_print("LINKS THAT DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("LINKS THAT DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    new_pass3 = False
                entry_print("Old link" + str(count) + ": " + old_link)
                entry_print("New link" + str(count) + ": " + new_link)
                entry_print("Old target" + str(count) + ": " +
                            old_target_title)
                entry_print("New target" + str(count) + ": " +
                            new_target_title)
                entry_print("Old tag" + str(count) + ": " + str(old_tags[ind]))
                entry_print("New tag" + str(count) + ": " + str(new_tags[ind]))
                detail.write("Old tag" + str(count) + ": " +
                             str(old_tags[ind]) + "\n")
                detail.write("New tag" + str(count) + ": " +
                             str(new_tags[ind]) + "\n")
    if not new_pass3:
        detail.write("-----------------------------------------------\n")
        entry_print("***********************************************")

    detail.close()
    return new_pass1 and new_pass2 and new_pass3
Example 10
def compare_content_soup(old_soup, new_soup, old_url, new_url):
    page_pass = True

    old_content = get_content_soup_old(old_soup, old_url)
    new_content = get_content_soup_new(new_soup, new_url)

    if old_content is None:
        old_content = ""
    if new_content is None:
        new_content = ""

    if not old_content and new_content.startswith("We are currently"):
        return True

    if not old_content:
        old_content = ""
    if not new_content:
        new_content = ""

    old_content = replace_special(old_content)
    new_content = replace_special(new_content)

    if old_content.replace(" ", "") != new_content.replace(" ", ""):
        detail = open("result\\content_detail.txt", 'a')
        entry_print("***********************************************")
        entry_print("CONTENT DIFFERENCE FOUND!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Old content: " + old_content)
        entry_print("New content: " + new_content)
        entry_print("***********************************************")
        detail.write(
            "----------------------------------------------------------------------------------------------\n"
        )
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Old content: " + old_content + "\n")
        detail.write("New content: " + new_content + "\n")
        detail.close()
        page_pass = False
    return page_pass
Example 11
def compare_address_soup(old_soup, new_soup, old_url, new_url):
    old_address = get_address_soup_old(old_soup, old_url)
    new_address = get_address_soup_new(new_soup, new_url)
    file = open("result\\address_detail.txt", 'a')
    page_pass = True

    # number of lines different from old site
    if len(old_address) != len(new_address):
        entry_print("***********************************************")
        entry_print("NUMBER OF ADDRESSES DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Old lines: " + str(len(old_address)))
        entry_print("New lines: " + str(len(new_address)))
        entry_print("***********************************************")
        file.write("NUMBER OF ADDRESSES DIFFERENT!\n")
        file.write("Old URL: " + old_url + "\n")
        file.write("New URL: " + new_url + "\n")
        file.write("Old lines: " + str(len(old_address)) + "\n")
        file.write("New lines: " + str(len(new_address)) + "\n")
        file.write("-----------------------------------------------\n")
        file.close()
        page_pass = False
        return page_pass

    for line in range(len(old_address)):
        old_line = old_address[line]
        new_line = new_address[line]
        for ind in range(len(old_line)):
            old_detail = old_line[ind].encode('utf-8')
            new_detail = new_line[ind].encode('utf-8')

            if ind == 1:
                old_detail = old_detail.replace(",", "").replace(".", "")
                new_detail = new_detail.replace(",", "").replace(".", "")

            if old_detail != new_detail and check_format(
                    old_detail) and old_detail != "":
                # if this is the first difference, then print header information
                if page_pass:
                    entry_print(
                        "***********************************************")
                    entry_print("ADDRESS DIFFERENCE FOUND!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    file.write("ADDRESS DIFFERENCE FOUND!\n")
                    file.write("Old URL: " + old_url + "\n")
                    file.write("New URL: " + new_url + "\n")
                    page_pass = False
                entry_print("Old address: " + old_detail)
                entry_print("New address: " + new_detail)
                file.write("Old address: " + old_detail + "\n")
                file.write("New address: " + new_detail + "\n")

    if not page_pass:
        entry_print("***********************************************")
        file.write("-----------------------------------------------\n")
    file.close()
    return page_pass
Example 12
def compare_form_soup(old_soup, new_soup, old_url, new_url):
    detail = open("result\\form_detail.txt", 'a')
    require_pass = True
    title_pass = True
    entry_pass = True
    auth_pass = True
    old_container = old_soup.find('div', class_="form-container")
    new_container = new_soup.find('div', class_="secureform")

    if not old_container and not new_container:
        detail.close()
        return True
    elif not old_container and new_container:
        record_error(old_url, "form container")
        detail.close()
        return False
    elif old_container and not new_container:
        record_error(new_url, "form container")
        detail.close()
        return False

    # check the "required field" text in new form
    if not new_container.find(text=re.compile("required field")):
        entry_print("***********************************************")
        entry_print("FORM MISSING '* REQUIRED FIELD' TITLE!")
        entry_print("New URL: " + new_url)
        entry_print("***********************************************")
        detail.write("FORM MISSING '* REQUIRED FIELD' TITLE!\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("-----------------------------------------------\n")
        require_pass = False

    # find all the entry names and choices from old page
    if old_container:
        for (name, kwargs) in settings["OLD_FORM_ENTRY_IGNORE"]:
            for s in old_container.find_all(name, **kwargs):
                s.extract()
    old_entries = [replace_special(text) for text in old_container.stripped_strings]

    # find all the entry names and choices from new page
    if new_container:
        for (name, kwargs) in settings["NEW_FORM_ENTRY_IGNORE"]:
            for s in new_container.find_all(name, **kwargs):
                s.extract()
    new_entries = [replace_special(text) for text in new_container.stripped_strings]

    # compare entry names
    if len(old_entries) != len(new_entries):
        entry_print("***********************************************")
        entry_print("NUMBER OF FORM ENTRIES DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old entries: " + str(len(old_entries)))
        entry_print("Number of new entries: " + str(len(new_entries)))
        entry_print("Old entries: " + str(old_entries))
        entry_print("New entries: " + str(new_entries))
        detail.write("NUMBER OF FORM ENTRIES DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old entries: " + str(len(old_entries)) + "\n")
        detail.write("Number of new entries: " + str(len(new_entries)) + "\n")
        detail.write("Old entries: " + str(old_entries) + "\n")
        detail.write("New entries: " + str(new_entries) + "\n")
        entry_pass = False

        # try to track down the issue
        for ind in range(min(len(old_entries), len(new_entries))):
            if old_entries[ind] != new_entries[ind] and old_entries[ind].upper() != new_entries[ind].upper():
                entry_print("FIRST DIFFERENCE:")
                entry_print("Old entry name: " + old_entries[ind])
                entry_print("New entry name: " + new_entries[ind])
                detail.write("FIRST DIFFERENCE:\n")
                detail.write("Old entry name: " + old_entries[ind] + "\n")
                detail.write("New entry name: " + new_entries[ind] + "\n")
                break
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
    else:
        # compare each entry
        count = 0
        old_diff = []
        new_diff = []
        new_pass = True

        for ind in range(len(old_entries)):
            old_entry = old_entries[ind]
            new_entry = new_entries[ind]
            if old_entry != new_entry and old_entry.upper() != new_entry.upper():
                old_diff.append(old_entry)
                new_diff.append(new_entry)
        old_diff_dup = [i.upper() for i in old_diff]
        new_diff_dup = [i.upper() for i in new_diff]
        old_diff_dup.sort()
        new_diff_dup.sort()

        for old_entry, new_entry in zip(old_diff_dup, new_diff_dup):
            if old_entry != new_entry:
                new_pass = False
                break

        if not new_pass:
            for old_entry, new_entry in zip(old_diff, new_diff):
                count += 1
                if entry_pass:
                    entry_print("***********************************************")
                    entry_print("FORM ENTRIES DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("FORM ENTRIES DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    entry_pass = False
                entry_print("Old entry name" + str(count) + ": " + old_entry)
                entry_print("New entry name" + str(count) + ": " + new_entry)
                detail.write("Old entry name" + str(count) + ": " + old_entry + "\n")
                detail.write("New entry name" + str(count) + ": " + new_entry + "\n")
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")

    detail.close()
    return require_pass and title_pass and entry_pass and auth_pass
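
A minimal usage sketch for compare_form_soup, assuming the helpers above are importable from a module (the name site_checker below is hypothetical, as are both URLs) and that the result directory already exists:

# Hedged usage sketch: "site_checker" and both URLs are placeholders,
# not names taken from the original code.
from site_checker import get_soup, compare_form_soup

old_url = "http://old-practice.example.com/contact-us"
new_url = "http://new-practice.televox.west.com/contact-us"

old_soup = get_soup(old_url)
new_soup = get_soup(new_url)

if old_soup and new_soup:
    forms_match = compare_form_soup(old_soup, new_soup, old_url, new_url)
    print("Form check passed:", forms_match)

The function appends its findings to result\form_detail.txt and returns False when the required-field note is missing or the form entries differ.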
Example No. 13
0
def get_address_soup_new(soup, url):
    file = open("result\\address_detail.txt", 'a')
    # find the address lines
    if not soup:
        record_error(url, "address container")
        return False

    try:
        address_arr = soup.find('p', {
            'class': "companyname"
        }).next_sibling.find_all('li')
    except AttributeError:
        record_error(url, "address container")
        address_arr = []
    result_arr = []

    for address in address_arr:
        name = address.find('span', {
            'class': "locationName"
        }).get_text().replace("|", "").strip()
        if name.endswith(":") or name.endswith("-"):
            name = name[:-1].strip()

        full_address = address.find('span', {
            'class': "streetAddress"
        }).get_text().replace("|", "").strip()
        street_arr = full_address.split(',')
        city = street_arr[-2].replace("|", "").strip()
        zip_code = street_arr[-1].replace("|", "").strip()

        ind = full_address.rfind(city)
        street = full_address[:ind - 2]

        phone = address.find('span', {
            'class': "contactNumbers"
        }).get_text()[6:].replace("|", "").strip()

        if not check_format(full_address):
            entry_print("***********************************************")
            entry_print("ADDRESS FORMAT ISSUE FOUND!")
            entry_print("New URL: " + url)
            entry_print("New address: " + full_address)
            file.write("ADDRESS FORMAT ISSUE FOUND!\n")
            file.write("New URL: " + url + "\n")
            file.write("New address: " + full_address + "\n")
            file.write("-----------------------------------------------\n")

        # if there are fax number or email in the new site, print error and record
        if not address.find('span', {'class': "faxNumbers"}) is None:
            entry_print("***********************************************")
            entry_print("ADDITIONAL FAX NUMBER!")
            entry_print("New URL: " + url)
            entry_print("***********************************************")
            file.write("ADDITIONAL FAX NUMBER!\n")
            file.write("New URL: " + url + "\n")
            file.write("-----------------------------------------------\n")
        if not address.find('span', {'class': "email"}) is None:
            entry_print("***********************************************")
            entry_print("ADDITIONAL EMAIL NUMBER!")
            entry_print("New URL: " + url)
            entry_print("***********************************************")
            file.write("ADDITIONAL EMAIL NUMBER!\n")
            file.write("New URL: " + url + "\n")
            file.write("-----------------------------------------------\n")

        parsed_address = [name, street, city, zip_code, phone]

        for i in range(len(parsed_address)):
            parsed_address[i] = parsed_address[i].upper()

        result_arr.append(parsed_address)
    file.close()
    return result_arr
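
To illustrate the markup get_address_soup_new expects, here is a hedged sketch that feeds it a hand-built snippet. The HTML, module name, and URL are assumptions, and the result directory must already exist; note the <ul> has to immediately follow the companyname paragraph because the code relies on next_sibling.

# Hedged sketch: the markup below only mirrors the selectors used above
# (companyname, locationName, streetAddress, contactNumbers).
from bs4 import BeautifulSoup
from site_checker import get_address_soup_new  # hypothetical module name

html = (
    '<p class="companyname">Our Offices</p>'
    '<ul><li>'
    '<span class="locationName">Downtown Clinic:</span>'
    '<span class="streetAddress">123 Main St, Springfield, 12345</span>'
    '<span class="contactNumbers">Phone 555-0100</span>'
    '</li></ul>'
)
soup = BeautifulSoup(html, "html.parser")
addresses = get_address_soup_new(soup, "http://new-practice.example.com/contact")
# expected roughly: [['DOWNTOWN CLINIC', '123 MAIN ST', 'SPRINGFIELD', '12345', '555-0100']]
print(addresses)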
Example No. 14
0
def compare_site_thread(old_url,
                        new_url,
                        progress_var=None,
                        step=100.0,
                        thread_pool_csv=None):
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return

    # checking multiple sites mode
    if thread_pool_csv:
        thread_pool = thread_pool_csv
    else:
        thread_pool = ThreadPool(settings["THREADPOOL_SIZE"])
    create_path()
    ind = 0

    old_url = old_url.strip()
    new_url = new_url.strip()

    # remove the "/" at the end of the url
    if old_url[-1] == '/':
        old_url = old_url[:-1]
    if new_url[-1] == '/':
        new_url = new_url[:-1]

    # add "http://" before url
    if not old_url.startswith("http"):
        old_url = "http://" + old_url
    if not new_url.startswith("http"):
        new_url = "http://" + new_url

    # print out the information for old and new sites
    entry_print("-----------------------------------------------------", True)
    entry_print("Old URL: " + old_url, True)
    entry_print("New URL: " + new_url, True)
    entry_print("-----------------------------------------------------", True)

    setup_step = step * 0.01
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)

    # check if the new site needs login
    new_test = get_soup(new_url)
    if new_test:
        title = new_test.find("title")
        if title and title.get_text().strip() == "Login":
            entry_print(
                "New site needs login. Please use login mode to check this site!\n",
                True)
            return -1

    setup_step = step * 0.01
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)

    # get the subpages of old and new sites
    try:
        sites = get_sites(old_url)
    except AttributeError:
        entry_print(
            "Can't find the site map from " + old_url +
            ". Please check if the url is valid!", True)
        thread_pool.destroy()
        return
    old_blog = get_blog_site(old_url)
    new_blog = get_blog_site(new_url)

    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        thread_pool.destroy()
        return

    blog_exists = False
    if old_blog and new_blog:
        blog_exists = True

    # if urls for subpages are not found
    if sites is None:
        record_error(new_url, "sites")
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False

    # if blog page is not found
    if old_blog is not None and new_blog is None:
        record_error(new_url, "blog")
    elif old_blog is None and new_blog is not None:
        record_error(old_url, "blog")

    setup_step = step * 0.02
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)

    # print out site information
    entry_print("Site Information: ", True)

    # calculate the step for each page
    step *= 0.96
    if blog_exists:
        page_step = step / 2 / (len(sites) + 1)
        entry_print("Old Blog: " + old_blog, True)
        entry_print("New Blog: " + new_blog, True)
    else:
        page_step = step / (len(sites) + 1)

    entry_print("Number of non-blog pages: " + str(len(sites)), True)

    # check the homepage
    thread_pool.add_task(compare_homepage,
                         old_url=old_url,
                         new_url=new_url,
                         progress_var=progress_var,
                         step=page_step)

    # check all the sites in sitemap
    for site in sites:
        ind += 1
        if site.startswith("/home") or site.startswith("/main"):
            continue

        old_link = old_url + site
        new_link = new_url + site

        thread_pool.add_task(compare_page,
                             old_url=old_link,
                             new_url=new_link,
                             progress_var=progress_var,
                             step=page_step)

    # check all the blog pages
    if blog_exists:
        old_blog_soup = get_soup(old_blog)
        new_blog_soup = get_soup(new_blog)
        compare_blog(old_blog_soup,
                     new_blog_soup,
                     old_blog,
                     new_blog,
                     progress_var=progress_var,
                     step=step / 2)

    # single site mode
    if not thread_pool_csv:
        thread_pool.wait_completion()
        thread_pool.destroy()

    entry_print("-----------------------------------------------------\n")

    return True
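
A hedged sketch of how compare_site_thread might be driven for a batch of sites with one shared pool; the CSV layout (one "old_url,new_url" pair per row) and the module name site_checker are assumptions, while ThreadPool, settings, and the pool methods are the ones used above:

# Hedged sketch: sites.csv and the module name are placeholders.
import csv

from site_checker import ThreadPool, compare_site_thread, settings

shared_pool = ThreadPool(settings["THREADPOOL_SIZE"])

with open("sites.csv", newline="") as f:
    for old_url, new_url in csv.reader(f):
        compare_site_thread(old_url, new_url, thread_pool_csv=shared_pool)

# single-site mode waits inside the function; a batch driver waits here instead
shared_pool.wait_completion()
shared_pool.destroy()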
Example No. 15
0
def compare_site_selenium(old_url, new_url, progress_var=None, step=100.0):
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        entry_print("-----------------------------------------------------\n")
        return

    create_path()
    new_browser = webdriver.Chrome(executable_path=settings["EXECUTABLE_PATH"])
    new_browser.maximize_window()
    site_pass = True
    blog_pass = True
    ind = 0

    old_url = old_url.strip()
    new_url = new_url.strip()

    # remove the "/" at the end of url
    if old_url.endswith('/'):
        old_url = old_url[:-1]
    if new_url.endswith('/'):
        new_url = new_url[:-1]

    # add "http://" before the url (leave http:// and https:// urls untouched)
    if not old_url.startswith("http"):
        old_url = "http://" + old_url
    if not new_url.startswith("http"):
        new_url = "http://" + new_url

    # print out the information for old and new sites
    entry_print("-----------------------------------------------------", True)
    entry_print("Old URL: " + old_url, True)
    entry_print("New URL: " + new_url, True)
    entry_print("-----------------------------------------------------", True)

    setup_step = step * 0.01
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)

    # get the subpages and blog pages of the old and new sites
    sites = get_sites(old_url)
    old_blog = get_blog_site(old_url)
    new_blog = get_blog_site(new_url)

    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        new_browser.quit()
        entry_print("-----------------------------------------------------\n")
        return

    blog_exists = False
    if old_blog and new_blog:
        blog_exists = True

    # if subpages are not found
    if sites is None:
        record_error(new_url, "sites")
        new_browser.quit()
        return False

    # if blog page is not found
    if old_blog is not None and new_blog is None:
        record_error(new_url, "blog")
    elif old_blog is None and new_blog is not None:
        record_error(old_url, "blog")

    setup_step = step * 0.02
    step *= 0.97
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)

    if blog_exists:
        page_step = step / 2 / (len(sites) + 1)
    else:
        page_step = step / (len(sites) + 1)

    # check homepage
    homepage_pass = compare_homepage(old_url, new_url, browser=new_browser, progress_var=progress_var, step=page_step)

    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        new_browser.quit()
        entry_print("-----------------------------------------------------\n")
        return

    if homepage_pass == -1:
        new_browser.quit()
        return -1

    # check all the sites in sitemap
    for site in sites:
        # check program status
        if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
            new_browser.quit()
            entry_print("-----------------------------------------------------\n")
            return

        ind += 1
        if site.startswith("/home") or site.startswith("/main"):
            continue

        old_link = old_url + site
        new_link = new_url + site

        page_pass = compare_page(old_link, new_link, browser=new_browser, progress_var=progress_var, step=page_step)

        if not page_pass:
            site_pass = False

    # check all the blog entries
    if blog_exists:
        # check program status
        if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
            new_browser.quit()
            entry_print("-----------------------------------------------------\n")
            return

        old_blog_soup = get_soup(old_blog)
        new_blog_soup = get_soup_selenium(new_blog, new_browser)
        blog_pass = compare_blog(old_blog_soup, new_blog_soup, old_blog, new_blog, browser=new_browser,
                                 progress_var=progress_var, step=step / 2)

    entry_print("-----------------------------------------------------\n")
    new_browser.quit()

    return site_pass and homepage_pass and blog_pass
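
A hedged sketch of a single Selenium-mode run; the module name and both URLs are placeholders, settings["EXECUTABLE_PATH"] must point at a chromedriver binary, and USER_NAME/PASSWORD must be set for the login step:

# Hedged sketch: both URLs and the module name are placeholders.
from site_checker import compare_site_selenium

result = compare_site_selenium("old-practice.example.com",
                               "new-practice.televox.west.com")

if result == -1:
    print("Homepage comparison returned -1 (propagated from compare_homepage).")
elif result is None:
    print("Run was cancelled from the interface.")
else:
    print("Site passed all checks:", result)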