def compare_page(old_url, new_url, browser=None, progress_var=None, step=1.0):
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return
    # deal with urls that exceed 50 characters
    new_list = new_url.split("/")
    if new_list[-1].find("televox.west.com") == -1 and len(new_list[-1]) > 50:
        new_list[-1] = new_list[-1][:50]
        new_url = "/".join(new_list)
    old_soup = get_soup(old_url)
    new_soup = get_soup(new_url, browser=browser)
    result = open("result\\site_result.txt", 'a')
    if old_soup is None:
        record_error(old_url, "soup")
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    if new_soup is None:
        record_error(new_url, "soup")
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    meta_pass = compare_meta_soup(old_soup, new_soup, old_url, new_url)
    if meta_pass == -1:
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    form_pass = compare_form_soup(old_soup, new_soup, old_url, new_url)
    content_pass = compare_content_soup(old_soup, new_soup, old_url, new_url)
    image_pass = compare_image_soup(old_soup, new_soup, old_url, new_url)
    link_pass = compare_link_soup(old_soup, new_soup, old_url, new_url,
                                  browser=browser)
    page_pass = meta_pass and form_pass and content_pass and image_pass and link_pass
    if page_pass:
        print(new_url + " PASSED!")
        result.write(new_url + " PASSED!\n")
        result.close()
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return True
    else:
        print(new_url + " FAILED! (see detail files for more information)")
        result.write(new_url + " FAILED! (see detail files for more information)\n")
        result.close()
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
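
# Illustrative sketch (not part of the original tool): driving compare_page for a
# single migrated page.  The URLs are placeholders, and the optional progress_var
# is assumed to be a tkinter-style variable exposing get()/set(), as used above.
def _example_compare_single_page():
    old = "http://www.example-practice.com/about-us"            # placeholder old URL
    new = "http://example-practice.televox.west.com/about-us"   # placeholder new URL
    passed = compare_page(old, new)  # browser=None: new page fetched without Selenium
    print("about-us: " + ("PASS" if passed else "FAIL"))
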
def get_subpages(old_url):
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return []
    old_soup = get_soup(old_url)
    nav_menu = old_soup.find('ul', class_="primary-navigation")
    try:
        drop_downs = nav_menu.find_all('ul')
    except AttributeError:
        return []
    parsed_subpages = []
    for drop_down in drop_downs:
        subpages = drop_down.find_all('a')
        if not subpages:
            continue
        else:
            url_list = subpages[0]['href'].split('/')
            root_url = '/' + url_list[1]
            parsed_subpage = []
            for subpage in subpages:
                name = subpage.get_text()
                rel_url = subpage['href'].split('/')[-1]
                parsed_subpage.append((name, rel_url))
            parsed_subpages.append((root_url, parsed_subpage))
    return parsed_subpages
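
# Illustrative sketch (placeholder URL): get_subpages returns a list of
# (root_url, [(name, relative_url), ...]) tuples scraped from the old site's
# primary navigation, which can be printed to spot-check the menu structure.
def _example_print_navigation():
    for root_url, subpages in get_subpages("http://www.example-practice.com"):
        print(root_url)
        for name, rel_url in subpages:
            print("    " + name + " -> " + rel_url)
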
def check_homepage_link(old_soup, new_soup, old_url, new_url, browser=None):
    detail = open("result\\homepage_detail.txt", 'a')
    old_hostname = urlparse(old_url).hostname
    new_hostname = urlparse(new_url).hostname
    page_pass = True
    printable = set(string.printable)
    new_content = new_soup.find('div', class_="ptl_page")
    if old_url.endswith("/"):
        old_url = old_url[:-1]
    if new_url.endswith("/"):
        new_url = new_url[:-1]
    if not old_hostname:
        old_hostname = old_url
    if not new_hostname:
        new_hostname = new_url
    if not new_content:
        record_error(new_url, "new homepage container")
        return False
    # remove banner and navigation menu from soup
    if new_content:
        for (name, kwargs) in settings["HOMEPAGE_LINK_IGNORE"]:
            for s in new_content.find_all(name, **kwargs):
                s.extract()
    new_tags = new_content.find_all('a', href=re.compile("^(?!.*(#aftermap|#)).*$"))
    # check for new links that direct to old site
    host_link = old_url.replace(urlparse(old_url).path, "")
    domain = get_domain(old_url)
    for tag in new_tags:
        href = tag['href']
        href_hostname = urlparse(href).hostname
        if href_hostname is None:
            href_hostname = ""
        if href.startswith("/"):
            continue
        if (href.startswith(host_link) and host_link != "") \
                or (href_hostname.find(domain + '.') != -1
                    and not href.startswith("mailto")
                    and href.find("televox.west.com") == -1) \
                or href.find("iapps") != -1:
            page_pass = False
            entry_print("***********************************************")
            entry_print("HOMEPAGE LINKS THAT GO BACK TO OLD SITE!")
            entry_print("New URL: " + new_url)
            detail.write("HOMEPAGE LINKS THAT GO BACK TO OLD SITE!\n")
            detail.write("New URL: " + new_url + "\n")
            entry_print("Bad tag: " + str(tag))
            entry_print("***********************************************")
            detail.write("Bad tag: " + str(tag) + "\n")
            detail.write("-----------------------------------------------\n")
        if href.find("televox.west.com") != -1:
            page_pass = False
            entry_print("***********************************************")
            entry_print("NON-FRIENDLY URL FOUND!")
            entry_print("New URL: " + new_url)
            detail.write("NON-FRIENDLY URL FOUND!\n")
            detail.write("New URL: " + new_url + "\n")
            entry_print("Bad tag: " + str(tag))
            entry_print("***********************************************")
            detail.write("Bad tag: " + str(tag) + "\n")
            detail.write("-----------------------------------------------\n")
    # check invalid links in new site
    new_invalid_links = []
    for tag in new_tags:
        url = tag.get('href')
        if url is None:
            continue
        if url.startswith("https://"):
            continue
        if url.startswith("tel:") or url.startswith("mailto:") \
                or url.find("#") != -1 or url.startswith("/common"):
            continue
        if url.startswith("/"):
            url = "http://" + new_hostname + url
        if url.find("televox.west.com") != -1:
            new_target = get_soup(url, browser)
        else:
            new_target = get_soup(url)
        new_target_title = get_meta_soup(new_target, url)['title']
        if new_target_title.find("404") != -1 or new_target_title == "Page Not Found" \
                or new_target_title == "none" \
                or new_target_title == "The resource cannot be found.":
            new_invalid_links.append((str(tag), new_target_title))
    if new_invalid_links:
        entry_print("***********************************************")
        entry_print("INVALID LINK FOUND IN HOMEPAGE!")
        entry_print("New URL: " + new_url)
        detail.write("-----------------------------------------------\n")
        detail.write("INVALID LINK FOUND IN HOMEPAGE!\n")
        detail.write("New URL: " + new_url + "\n")
        ind = 0
        for tag, target in new_invalid_links:
            ind += 1
            entry_print("Bad tag" + str(ind) + ": " + tag)
            entry_print("Target title: " + target)
            detail.write("Bad tag" + str(ind) + ": " + tag + "\n")
            detail.write("Target title: " + target + "\n")
        entry_print("***********************************************")
    # check published links for homepage
    old_publish = old_soup.find('nav', id="utility-navigation")
    new_publish = new_soup.find('nav', id="utility-navigation")
    if old_publish:
        old_published_links = old_publish.find_all(
            'a', href=re.compile("^((?!#).)*$"))
    else:
        old_published_links = []
    if new_publish:
        new_published_links = new_publish.find_all(
            'a', href=re.compile("^((?!#).)*$"))
    else:
        new_published_links = []
    if len(old_published_links) != len(new_published_links):
        entry_print("***********************************************")
        entry_print("NUMBER OF PUBLISHED LINKS DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_published_links)))
        entry_print("Number of new links: " + str(len(new_published_links)))
        entry_print("***********************************************")
        detail.write("NUMBER OF PUBLISHED LINKS DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_published_links)) + "\n")
        detail.write("Number of new links: " + str(len(new_published_links)) + "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    else:
        publish_pass = True
        # check the href and name for each published link
        for ind in range(len(new_published_links)):
            old_link = old_published_links[ind]['href']
            new_link = new_published_links[ind]['href']
            old_link_dup = old_link.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            new_link_dup = new_link.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            old_name = old_published_links[ind].get_text().replace(u"\xa0", " ")
            new_name = new_published_links[ind].get_text().replace(u"\xa0", " ")
            old_name = "".join([i for i in old_name if i in printable]).strip().upper()
            new_name = "".join([i for i in new_name if i in printable]).strip().upper()
            old_name_dup = old_name.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            new_name_dup = new_name.replace(" ", "").replace("-", "").replace(
                "(", "").replace(")", "")
            if old_link_dup != new_link_dup:
                if old_link.startswith("tel:") or old_link.startswith(
                        "mailto:") or unicode(old_link[0]).isnumeric():
                    continue
                if old_link.startswith("/"):
                    old_link = old_hostname + old_link
                if new_link.startswith("/"):
                    new_link = new_hostname + new_link
                old_target = get_soup(old_link)
                new_target = get_soup(new_link, browser=browser)
                old_target_title = get_meta_soup(old_target, old_link)['title']
                new_target_title = get_meta_soup(new_target, new_link)['title']
                if new_target_title.endswith("..."):
                    new_target_title = new_target_title[:-3]
                    old_target_title = old_target_title[:len(new_target_title)]
                if old_target_title != new_target_title:
                    if publish_pass:
                        entry_print(
                            "***********************************************")
                        entry_print("PUBLISHED LINKS DO NOT MATCH!")
                        entry_print("Old URL: " + old_url)
                        entry_print("New URL: " + new_url)
                        detail.write("PUBLISHED LINKS DO NOT MATCH!\n")
                        detail.write("Old URL: " + old_url + "\n")
                        detail.write("New URL: " + new_url + "\n")
                        publish_pass = False
                        page_pass = False
                    entry_print("Old target: " + old_target_title)
                    entry_print("New target: " + new_target_title)
                    entry_print("Old link: " + old_link)
                    entry_print("New link: " + new_link)
                    detail.write("Old target: " + old_target_title + "\n")
                    detail.write("New target: " + new_target_title + "\n")
                    detail.write("Old link: " + old_link + "\n")
                    detail.write("New link: " + new_link + "\n")
            if old_name_dup != new_name_dup:
                if publish_pass:
                    entry_print(
                        "***********************************************")
                    entry_print("PUBLISHED LINK NAMES DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("PUBLISHED LINK NAMES DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    publish_pass = False
                    page_pass = False
                entry_print("Old name: " + old_name)
                entry_print("New name: " + new_name)
                detail.write("Old name: " + old_name + "\n")
                detail.write("New name: " + new_name + "\n")
        if not publish_pass:
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
    # check social media links for homepage
    old_social = old_soup.find('nav', class_="social-navigation")
    new_social = new_soup.find('nav', class_="social-navigation")
    if old_social:
        old_social_links = old_social.find_all('a')
    else:
        old_social_links = []
    if new_social:
        new_social_links = new_social.find_all('a')
    else:
        new_social_links = []
    if len(old_social_links) != len(new_social_links):
        entry_print("***********************************************")
        entry_print("NUMBER OF SOCIAL LINKS DIFFERENT!")
        entry_print("Old URL: " + old_url)
        entry_print("New URL: " + new_url)
        entry_print("Number of old links: " + str(len(old_social_links)))
        entry_print("Number of new links: " + str(len(new_social_links)))
        entry_print("***********************************************")
        detail.write("NUMBER OF SOCIAL LINKS DIFFERENT!\n")
        detail.write("Old URL: " + old_url + "\n")
        detail.write("New URL: " + new_url + "\n")
        detail.write("Number of old links: " + str(len(old_social_links)) + "\n")
        detail.write("Number of new links: " + str(len(new_social_links)) + "\n")
        detail.write("-----------------------------------------------\n")
        page_pass = False
    else:
        social_pass = True
        # check the href and name for each social link
        for ind in range(len(new_social_links)):
            old_link = old_social_links[ind]['href']
            new_link = new_social_links[ind]['href']
            old_link_reversed = old_social_links[len(old_social_links) - ind - 1]['href']
            if old_link != new_link and old_link_reversed != new_link:
                if new_link.startswith("/"):
                    new_link = new_hostname + new_link
                if old_link.startswith("/"):
                    old_link = old_hostname + old_link
                old_target = get_soup(old_link)
                new_target = get_soup(new_link)
                old_target_title = replace_special(
                    get_meta_soup(old_target, old_link)['title'])
                new_target_title = replace_special(
                    get_meta_soup(new_target, new_link)['title'])
                if new_target_title.endswith("..."):
                    new_target_title = new_target_title[:-3]
                    old_target_title = old_target_title[:len(new_target_title)]
                if old_target_title != new_target_title:
                    if social_pass:
                        entry_print(
                            "***********************************************")
                        entry_print("SOCIAL LINKS DO NOT MATCH!")
                        entry_print("Old URL: " + old_url)
                        entry_print("New URL: " + new_url)
                        detail.write("SOCIAL LINKS DO NOT MATCH!\n")
                        detail.write("Old URL: " + old_url + "\n")
                        detail.write("New URL: " + new_url + "\n")
                        social_pass = False
                    entry_print("Old target: " + old_target_title)
                    entry_print("New target: " + new_target_title)
                    entry_print("Old link: " + old_link)
                    entry_print("New link: " + new_link)
                    detail.write("Old target: " + old_target_title + "\n")
                    detail.write("New target: " + new_target_title + "\n")
                    detail.write("Old link: " + old_link + "\n")
                    detail.write("New link: " + new_link + "\n")
        if not social_pass:
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
    detail.close()
    return page_pass
def compare_homepage(old_url, new_url, browser=None, progress_var=None, step=1.0):
    result = open("result\\site_result.txt", 'a')
    old_soup = get_soup(old_url)
    if not browser:
        new_soup = get_soup(new_url)
    else:
        try:
            new_soup = get_soup(new_url, browser)
        except CredentialError:
            if progress_var:
                progress_var.set(progress_var.get() + step)
            return -1
    if old_soup is None:
        record_error(old_url, "soup")
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    if new_soup is None:
        record_error(new_url, "soup")
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    meta_pass = compare_meta_soup(old_soup, new_soup, old_url, new_url)
    if meta_pass == -1:
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    address_pass = compare_address_soup(old_soup, new_soup, old_url, new_url)
    content_pass = compare_homepage_content(old_soup, new_soup, old_url, new_url)
    image_pass = check_homepage_image(new_soup, new_url)
    link_pass = check_homepage_link(old_soup, new_soup, old_url, new_url,
                                    browser=browser)
    page_pass = meta_pass and address_pass and content_pass and image_pass and link_pass
    if page_pass:
        print(new_url + " HOMEPAGE PASSED!")
        result.write(new_url + " HOMEPAGE PASSED!\n")
        result.close()
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return True
    else:
        result.write(
            new_url + " HOMEPAGE FAILED! (see detail files for more information)\n")
        result.close()
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
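
# Illustrative sketch (placeholder URLs): compare_homepage returns True/False for
# pass/fail and -1 when the logged-in fetch raises CredentialError, so the -1 case
# should be checked separately from an ordinary failure.
def _example_check_homepage():
    result = compare_homepage("http://www.example-practice.com",
                              "http://example-practice.televox.west.com")
    if result == -1:
        print("credential problem - rerun in login mode")
    else:
        print("homepage: " + ("PASS" if result else "FAIL"))
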
def set_meta(old_url, new_url, browser):
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return
    wait = WebDriverWait(browser, 20)
    old_soup = get_soup(old_url)
    old_meta = get_meta_soup(old_soup, old_url)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return
    if new_url.endswith('/'):
        new_url = new_url[:-1]
    # truncate url if name exceeds 50 characters
    new_path = urlparse(new_url).path
    new_path_list = new_path.split('/')
    if len(new_path_list[-1]) > 50:
        new_path_list[-1] = new_path_list[-1][:50]
        new_path_dup = "/".join(new_path_list)
        new_url_dup = new_url.replace(new_path, new_path_dup)
        browser.get(new_url_dup)
    else:
        browser.get(new_url)
    if browser.title == "Login":
        login(browser, wait)
    new_soup = BeautifulSoup(browser.page_source, "html.parser")
    login_status = new_soup.find('a', id="ctl00_lnkGateway").get_text()
    if login_status == "Login":
        login_button = browser.find_element_by_id("ctl00_lnkGateway")
        login_button.click()
        wait.until(
            EC.visibility_of_element_located(
                (By.ID, "ctl00_ContentPlaceHolder1_txtUsername")))
        login(browser, wait)
    page_options = browser.find_element_by_xpath(
        '//li[@class="optionPageOptions"]')
    page_options.click()
    metadata_option = browser.find_element_by_xpath(
        '//span[@class="AB_icn AB_icn-metadata"]').find_element_by_xpath('..')
    url = metadata_option.get_attribute('href')
    rel_url = re.search("/cms/.*Metadata", url).group(0)
    new_hostname = urlparse(new_url).hostname
    target_url = "http://" + new_hostname + rel_url
    browser.get(target_url)
    enable_custom_checkbox = browser.find_elements_by_xpath(
        '//input[@type="checkbox"]')[0]
    if not enable_custom_checkbox.is_selected():
        enable_custom_checkbox.click()
    # migrate title
    title = old_meta["title"]
    title_entry = browser.find_elements_by_xpath('//input[@type="text"]')[6]
    title_entry.clear()
    try:
        title_entry.send_keys(title)
    except UnicodeDecodeError:
        migration_print("Unable to migrate title for " + new_url)
        migration_print("Title: " + old_meta["title"])
        migration_print("Description: " + old_meta["description"])
        migration_print("Keywords: " + old_meta["keywords"])
        migration_print(
            "-----------------------------------------------------------")
        ask_continue()
        return
    # migrate description
    description = old_meta["description"]
    if description != "none" and not description.startswith("Learn more about"):
        description_entry = browser.find_elements_by_xpath(
            '//input[@type="text"]')[13]
        description_entry.clear()
        try:
            description_entry.send_keys(description)
        except UnicodeDecodeError:
            migration_print("Unable to migrate description for " + new_url)
            migration_print("Title: " + old_meta["title"])
            migration_print("Description: " + old_meta["description"])
            migration_print("Keywords: " + old_meta["keywords"])
            migration_print(
                "-----------------------------------------------------------")
            ask_continue()
            return
    # migrate keywords
    keywords = old_meta["keywords"]
    if keywords != "none":
        keywords_entry = browser.find_elements_by_xpath(
            '//input[@type="text"]')[14]
        keywords_entry.clear()
        try:
            keywords_entry.send_keys(keywords)
        except UnicodeDecodeError:
            migration_print("Unable to migrate keywords for " + new_url)
            migration_print("Title: " + old_meta["title"])
            migration_print("Description: " + old_meta["description"])
            migration_print("Keywords: " + old_meta["keywords"])
            migration_print(
                "-----------------------------------------------------------")
            ask_continue()
            return
    submit_button = browser.find_element_by_xpath('//input[@type="submit"]')
    submit_button.click()
    new_path = urlparse(new_url).path
    if not new_path:
        new_path = "/"
    else:
        ind = new_url.find(new_path)
        new_path = new_url[ind:]
    migration_print(new_path + " metadata migrated!")
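
# Illustrative sketch: migrating metadata for one page with a dedicated browser
# session.  Assumes settings["EXECUTABLE_PATH"] points at a chromedriver binary,
# as in migrate_meta below; the URLs are placeholders.
def _example_set_meta_single_page():
    browser = webdriver.Chrome(executable_path=settings["EXECUTABLE_PATH"])
    browser.maximize_window()
    try:
        set_meta("http://www.example-practice.com/contact",
                 "http://example-practice.televox.west.com/contact",
                 browser)
    finally:
        browser.quit()
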
def migrate_meta(old_url, new_url, progress_var=None, step=100.0):
    old_url = old_url.strip()
    new_url = new_url.strip()
    # remove the "/" at the end of the url
    if old_url[-1] == '/':
        old_url = old_url[:-1]
    if new_url[-1] == '/':
        new_url = new_url[:-1]
    # add "http://" before url
    if not old_url.startswith("http"):
        old_url = "http://" + old_url
    if not new_url.startswith("http"):
        new_url = "http://" + new_url
    # print out the information for old and new sites
    migration_print("-----------------------------------------------------")
    migration_print("Old URL: " + old_url)
    migration_print("New URL: " + new_url)
    migration_print("-----------------------------------------------------")
    browser = webdriver.Chrome(executable_path=settings["EXECUTABLE_PATH"])
    browser.maximize_window()
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        browser.quit()
        migration_print(
            "-----------------------------------------------------\n")
        return
    if progress_var:
        progress_var.set(progress_var.get() + step * 0.01)
    sites = get_sites(old_url)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        browser.quit()
        migration_print(
            "-----------------------------------------------------\n")
        return
    if progress_var:
        progress_var.set(progress_var.get() + step * 0.02)
    if not sites:
        migration_print("Unable to fetch subpage URLs from site map of " + old_url)
    # find blog pages
    old_blog_page = get_blog_site(old_url)
    new_blog_page = get_blog_site(new_url)
    blog_exists = True
    if not old_blog_page or not new_blog_page:
        blog_exists = False
    # calculate the step for each subpage
    step *= 0.97
    if blog_exists:
        page_step = step / 2 / (len(sites) + 1)
    else:
        page_step = step / (len(sites) + 1)
    # migrate metadata for homepage
    set_meta(old_url, new_url, browser)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        browser.quit()
        migration_print(
            "-----------------------------------------------------\n")
        return
    if progress_var:
        progress_var.set(progress_var.get() + page_step)
    # migrate all non-blog pages
    for site in sites:
        # check program status
        if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
            browser.quit()
            migration_print(
                "-----------------------------------------------------\n")
            return
        old_link = old_url + site
        new_link = new_url + site
        try:
            set_meta(old_link, new_link, browser)
        except NoSuchElementException:
            migration_print("Missing Page: " + new_link)
        if progress_var:
            progress_var.set(progress_var.get() + page_step)
    if not blog_exists:
        browser.quit()
        migration_print(
            "-----------------------------------------------------------")
        return
    step /= 2
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        browser.quit()
        migration_print(
            "-----------------------------------------------------\n")
        return
    old_blog_soup = get_soup(old_blog_page)
    new_blog_soup = get_soup(new_blog_page, browser)
    old_blogs = old_blog_soup.find_all(['h5', 'h3'])
    new_blogs = new_blog_soup.find_all('a', class_="title")
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        browser.quit()
        migration_print(
            "-----------------------------------------------------\n")
        return
    if progress_var:
        progress_var.set(progress_var.get() + step * 0.02)
    step *= 0.98
    # record blog posts as title, url pairs in dictionary
    old_list = []
    parsed_old_blogs = {}
    ind = 1
    # iterate over a copy so removing the "Categories" heading does not skip entries
    for blog in list(old_blogs):
        title = blog.get_text()
        if title == "Categories":
            old_blogs.remove(blog)
            continue
        try:
            link = blog.a.get('href')
        except AttributeError:
            migration_print("Unable to find blog metadata for " + title)
            continue
        if title in parsed_old_blogs:
            parsed_old_blogs[title + str(ind)] = link
            old_list.append((title + str(ind), link))
            ind += 1
        else:
            parsed_old_blogs[title] = link
            old_list.append((title, link))
    new_list = []
    parsed_new_blogs = {}
    ind = 1
    for blog in new_blogs:
        title = blog.get_text()
        link = new_url + blog.get('href')
        if title in parsed_new_blogs:
            parsed_new_blogs[title + str(ind)] = link
            new_list.append((title + str(ind), link))
            ind += 1
        else:
            parsed_new_blogs[title] = link
            new_list.append((title, link))
    if not old_list or not new_list:
        browser.quit()
        return
    blog_step = step / (len(old_list) + 1)
    # migrate metadata for blog index page
    set_meta(old_blog_page, new_blog_page, browser)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        browser.quit()
        migration_print(
            "-----------------------------------------------------\n")
        return
    if progress_var:
        progress_var.set(progress_var.get() + blog_step)
    # migrate metadata for blog posts
    for ind in range(len(old_list)):
        # check program status
        if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
            browser.quit()
            migration_print(
                "-----------------------------------------------------\n")
            return
        if old_list[ind][0] == new_list[ind][0]:
            set_meta(old_list[ind][1], new_list[ind][1], browser)
        else:
            try:
                set_meta(parsed_old_blogs[old_list[ind][0]],
                         parsed_new_blogs[old_list[ind][0]], browser)
            except KeyError:
                migration_print("Cannot migrate metadata for blog page " + new_list[ind][1])
                continue
        if progress_var:
            progress_var.set(progress_var.get() + blog_step)
    browser.quit()
    migration_print("-----------------------------------------------------\n")
def compare_link_soup(old_soup, new_soup, old_url, new_url, browser=None):
    detail = open("result\\site_detail.txt", 'a')
    old_hostname = urlparse(old_url).hostname
    new_hostname = urlparse(new_url).hostname
    if not old_hostname:
        old_hostname = ""
    if not new_hostname:
        new_hostname = ""
    # grab container
    old_content = old_soup.find('div', class_="right")
    new_content = new_soup.find('div', class_="right")
    if not old_content and new_content:
        if old_soup.find('div', id="content"):
            old_content = old_soup.find('div', id="content")
    if not old_content and new_content:
        record_error(old_url, "link container")
        detail.close()
        return False
    elif old_content and not new_content:
        record_error(new_url, "link container")
        detail.close()
        return False
    elif not old_content and not new_content:
        return True
    # vertical template uses different container
    if old_content.find('div', id="content"):
        old_content = old_soup.find('div', id="content")
    if new_content.find('div', id="content"):
        new_content = new_soup.find('div', id="content")
    # remove extra links from container
    if old_content:
        for (name, kwargs) in settings["COMPARE_OLD_LINK_IGNORE"]:
            for s in old_content.find_all(name, **kwargs):
                s.extract()
    if new_content:
        for (name, kwargs) in settings["COMPARE_NEW_LINK_IGNORE"]:
            for s in new_content.find_all(name, **kwargs):
                s.extract()
    if old_content is None:
        old_tags = []
    else:
        old_tags = old_content.find_all('a', href=True)
    if new_content is None:
        new_tags = []
    else:
        new_tags = new_content.find_all('a', href=True)
    # remove links that do not have any content inside
    old_tags = [
        tag for tag in old_tags
        if tag.text and not tag.text.isspace() or tag.find('img')
    ]
    new_tags = [
        tag for tag in new_tags
        if tag.text and not tag.text.isspace() or tag.find('img')
    ]
    # check for new links that direct to old site
    host_link = old_url.replace(urlparse(old_url).path, "")
    domain = get_domain(old_url)
    new_pass1 = True
    for tag in new_tags:
        href = tag['href']
        href_hostname = urlparse(href).hostname
        if not href_hostname:
            href_hostname = ""
        if href.find(host_link) != -1 \
                or (href_hostname.find(domain + '.') != -1 and href.find("televox.west.com") == -1) \
                or href.find("iapps") != -1:
            if new_pass1:
                entry_print("***********************************************")
                entry_print("LINKS THAT GO BACK TO OLD SITE!")
                entry_print("Old URL: " + old_url)
                entry_print("New URL: " + new_url)
                detail.write("LINKS THAT GO BACK TO OLD SITE!\n")
                detail.write("Old URL: " + old_url + "\n")
                detail.write("New URL: " + new_url + "\n")
                new_pass1 = False
            entry_print("Bad tag: " + str(tag))
            detail.write("Bad tag: " + str(tag) + "\n")
    if not new_pass1:
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
    # check for non-friendly urls
    new_pass2 = True
    for tag in new_tags:
        href = tag['href']
        if not href:
            continue
        if href.find("televox.west.com") != -1:
            if new_pass2:
                entry_print("***********************************************")
                entry_print("NON-FRIENDLY URL FOUND!")
                entry_print("New URL: " + new_url)
                detail.write("NON-FRIENDLY URL FOUND!\n")
                detail.write("New URL: " + new_url + "\n")
                new_pass2 = False
            entry_print("Bad tag: " + str(tag))
            detail.write("Bad tag: " + str(tag) + "\n")
    if not new_pass2:
        entry_print("***********************************************")
        detail.write("-----------------------------------------------\n")
    # remove file links (iterate over copies so removal does not skip elements)
    for tag in old_tags[:]:
        url = tag.get('href')
        if re.search("jpg|png|pdf|mp4", url):
            old_tags.remove(tag)
    for tag in new_tags[:]:
        url = tag.get('href')
        if re.search("jpg|png|pdf|mp4|UserFile", url):
            new_tags.remove(tag)
    bad_tags = []
    if len(old_tags) != len(new_tags):
        # remove 404 pages and file links from the old tags
        for tag in old_tags[:]:
            url = tag.get('href')
            if url is None:
                continue
            if url.startswith("https://"):
                continue
            if url.startswith("tel:") or url.startswith(
                    "mailto:") or url.find("#") != -1:
                continue
            if url.startswith("/"):
                url = "http://" + old_hostname + url
            old_target = get_soup(url)
            old_target_title = get_meta_soup(old_target, url)['title']
            if old_target_title.find("404") != -1 \
                    or re.search("page not found|the resource cannot be found",
                                 old_target_title.lower()) \
                    or old_target_title == "none":
                bad_tags.append((str(tag), old_target_title))
                old_tags.remove(tag)
        # check invalid links in new site
        new_invalid_links = []
        for tag in new_tags:
            url = tag.get('href')
            if url is None:
                continue
            if url.startswith("https://"):
                continue
            if url.startswith("tel:") or url.startswith(
                    "mailto:") or url.find("#") != -1 or url.startswith("/common"):
                continue
            if url.startswith("/"):
                url = "http://" + new_hostname + url
            if url.find("televox.west.com") != -1:
                new_target = get_soup(url, browser)
            else:
                new_target = get_soup(url)
            new_target_title = get_meta_soup(new_target, url)['title']
            if new_target_title.find("404") != -1 or new_target_title == "Page Not Found" \
                    or new_target_title == "none" \
                    or new_target_title == "The resource cannot be found.":
                new_invalid_links.append((str(tag), new_target_title))
        if new_invalid_links:
            entry_print("***********************************************")
            entry_print("INVALID LINK FOUND IN NEW SITE!")
            entry_print("New URL: " + new_url)
            detail.write("INVALID LINK FOUND IN NEW SITE!\n")
            detail.write("New URL: " + new_url + "\n")
            ind = 0
            for tag, target in new_invalid_links:
                ind += 1
                entry_print("Bad tag" + str(ind) + ": " + tag)
                entry_print("Target title: " + target)
                detail.write("Bad tag" + str(ind) + ": " + tag + "\n")
                detail.write("Target title: " + target + "\n")
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
        # check that number of links match; if not, return
        if len(new_tags) != len(old_tags):
            entry_print("***********************************************")
            entry_print(
                "NUMBER OF LINKS DIFFERENT OR 404 LINK EXISTS IN NEW PAGE!")
            entry_print("Old URL: " + old_url)
            entry_print("New URL: " + new_url)
            entry_print("Number of old links: " + str(len(old_tags)))
            entry_print("Number of new links: " + str(len(new_tags)))
            entry_print("Old tags: " + str(old_tags))
            entry_print("New tags: " + str(new_tags))
            if bad_tags:
                entry_print("404 tags in old site (removed):")
                for ind in range(len(bad_tags)):
                    entry_print("Tag" + str(ind + 1) + ": " + bad_tags[ind][0])
                    entry_print("Target title: " + bad_tags[ind][1])
            entry_print("***********************************************")
            detail.write(
                "NUMBER OF LINKS DIFFERENT OR 404 LINK EXISTS IN NEW PAGE!\n")
            detail.write("Old URL: " + old_url + "\n")
            detail.write("New URL: " + new_url + "\n")
            detail.write("Number of old links: " + str(len(old_tags)) + "\n")
            detail.write("Number of new links: " + str(len(new_tags)) + "\n")
            if bad_tags:
                detail.write("404 tag(s) in old site (removed):\n")
                for ind in range(len(bad_tags)):
                    detail.write("Tag" + str(ind + 1) + ": " + bad_tags[ind][0] + "\n")
                    detail.write("Target title: " + bad_tags[ind][1] + "\n")
            entry_print("***********************************************")
            detail.write("-----------------------------------------------\n")
            detail.close()
            return False
    # check that new and old links match
    new_pass3 = True
    count = 0
    for ind in range(len(new_tags)):
        old_link = old_tags[ind]['href'].replace("\\", "/").strip()
        new_link = new_tags[ind]['href'].replace("\\", "/").strip()
        if old_link == new_link:
            continue
        # take out the duplication part for old_link
        if old_link.find("#") != -1:
            old_ind = old_link.find("#")
            old_link = old_link[old_ind:]
        if new_link.find("#") != -1:
            new_ind = new_link.find("#")
            new_link = new_link[new_ind:]
        temp = old_link.split("/")
        if len(temp) > 2:
            if temp[-1] == temp[-2]:
                old_link = "/".join(temp[:-1])
        if urlparse(old_link).path == urlparse(new_link).path:
            continue
        if old_link.startswith("/"):
            old_link = "http://" + old_hostname + old_link
        # if the old link points to the homepage, then set it as "/"
        if old_link.endswith("/home") or old_link.endswith("/main"):
            old_link = "/"
        if new_link == "/home" or new_link == "/main":
            new_link = "/"
        if new_link != "/" and new_link.endswith("/"):
            new_link = new_link[:-1]
        if old_link != "/" and old_link.endswith("/"):
            old_link = old_link[:-1]
        if old_link != new_link and not new_link.startswith("/common"):
            if old_link.find("#") != -1 or new_link.find("#") != -1:
                count += 1
                if new_pass3:
                    entry_print(
                        "***********************************************")
                    entry_print("LINKS THAT DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("LINKS THAT DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    new_pass3 = False
                entry_print("Old link" + str(count) + ": " + old_link)
                entry_print("New link" + str(count) + ": " + new_link)
                entry_print("Old tag" + str(count) + ": " + str(old_tags[ind]))
                entry_print("New tag" + str(count) + ": " + str(new_tags[ind]))
                detail.write("Old tag" + str(count) + ": " + str(old_tags[ind]) + "\n")
                detail.write("New tag" + str(count) + ": " + str(new_tags[ind]) + "\n")
                continue
            if old_link.startswith("/"):
                old_link = "http://" + old_hostname + old_link.strip()
            if new_link.startswith("/"):
                new_link = "http://" + new_hostname + new_link.strip()
            old_target = get_soup(old_link)
            new_target = get_soup(new_link, browser=browser)
            old_target_title = replace_special(
                get_meta_soup(old_target, old_link)['title'])
            new_target_title = replace_special(
                get_meta_soup(new_target, new_link)['title'])
            if new_target_title.endswith("..."):
                new_target_title = new_target_title[:-3]
                old_target_title = old_target_title[:len(new_target_title)]
            if old_target_title != new_target_title:
                count += 1
                if new_pass3:
                    entry_print(
                        "***********************************************")
                    entry_print("LINKS THAT DO NOT MATCH!")
                    entry_print("Old URL: " + old_url)
                    entry_print("New URL: " + new_url)
                    detail.write("LINKS THAT DO NOT MATCH!\n")
                    detail.write("Old URL: " + old_url + "\n")
                    detail.write("New URL: " + new_url + "\n")
                    new_pass3 = False
                entry_print("Old link" + str(count) + ": " + old_link)
                entry_print("New link" + str(count) + ": " + new_link)
                entry_print("Old target" + str(count) + ": " + old_target_title)
                entry_print("New target" + str(count) + ": " + new_target_title)
                entry_print("Old tag" + str(count) + ": " + str(old_tags[ind]))
                entry_print("New tag" + str(count) + ": " + str(new_tags[ind]))
                detail.write("Old tag" + str(count) + ": " + str(old_tags[ind]) + "\n")
                detail.write("New tag" + str(count) + ": " + str(new_tags[ind]) + "\n")
    if not new_pass3:
        detail.write("-----------------------------------------------\n")
        entry_print("***********************************************")
    detail.close()
    return new_pass1 and new_pass2 and new_pass3
def compare_site_thread(old_url, new_url, progress_var=None, step=100.0,
                        thread_pool_csv=None):
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return
    # checking multiple sites mode
    if thread_pool_csv:
        thread_pool = thread_pool_csv
    else:
        thread_pool = ThreadPool(settings["THREADPOOL_SIZE"])
        create_path()
    ind = 0
    old_url = old_url.strip()
    new_url = new_url.strip()
    # remove the "/" at the end of the url
    if old_url[-1] == '/':
        old_url = old_url[:-1]
    if new_url[-1] == '/':
        new_url = new_url[:-1]
    # add "http://" before url
    if not old_url.startswith("http"):
        old_url = "http://" + old_url
    if not new_url.startswith("http"):
        new_url = "http://" + new_url
    # print out the information for old and new sites
    entry_print("-----------------------------------------------------", True)
    entry_print("Old URL: " + old_url, True)
    entry_print("New URL: " + new_url, True)
    entry_print("-----------------------------------------------------", True)
    setup_step = step * 0.01
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)
    # check if the new site needs login
    new_test = get_soup(new_url)
    if new_test:
        title = new_test.find("title")
        if title and title.get_text().strip() == "Login":
            entry_print(
                "New site needs login. Please use login mode to check this site!\n",
                True)
            return -1
    setup_step = step * 0.01
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)
    # get the subpages of old and new sites
    try:
        sites = get_sites(old_url)
    except AttributeError:
        entry_print(
            "Can't find the site map from " + old_url +
            ". Please check if the url is valid!", True)
        thread_pool.destroy()
        return
    old_blog = get_blog_site(old_url)
    new_blog = get_blog_site(new_url)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        thread_pool.destroy()
        return
    blog_exists = False
    if old_blog and new_blog:
        blog_exists = True
    # if urls for subpages are not found
    if sites is None:
        record_error(new_url, "sites")
        if progress_var:
            progress_var.set(progress_var.get() + step)
        return False
    # if blog page is not found
    if old_blog is not None and new_blog is None:
        record_error(new_url, "blog")
    elif old_blog is None and new_blog is not None:
        record_error(old_url, "blog")
    setup_step = step * 0.02
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)
    # print out site information
    entry_print("Site Information: ", True)
    # calculate the step for each page
    step *= 0.96
    if blog_exists:
        page_step = step / 2 / (len(sites) + 1)
        entry_print("Old Blog: " + old_blog, True)
        entry_print("New Blog: " + new_blog, True)
    else:
        page_step = step / (len(sites) + 1)
    entry_print("Number of non-blog pages: " + str(len(sites)), True)
    # check the homepage
    thread_pool.add_task(compare_homepage, old_url=old_url, new_url=new_url,
                         progress_var=progress_var, step=page_step)
    # check all the sites in sitemap
    for site in sites:
        ind += 1
        if site.startswith("/home") or site.startswith("/main"):
            continue
        old_link = old_url + site
        new_link = new_url + site
        thread_pool.add_task(compare_page, old_url=old_link, new_url=new_link,
                             progress_var=progress_var, step=page_step)
    # check all the blog pages
    if blog_exists:
        old_blog_soup = get_soup(old_blog)
        new_blog_soup = get_soup(new_blog)
        compare_blog(old_blog_soup, new_blog_soup, old_blog, new_blog,
                     progress_var=progress_var, step=step / 2)
    # single site mode
    if not thread_pool_csv:
        thread_pool.wait_completion()
        thread_pool.destroy()
    entry_print("-----------------------------------------------------\n")
    return True
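
# Illustrative sketch (placeholder URLs): in single-site mode compare_site_thread
# builds its own ThreadPool from settings["THREADPOOL_SIZE"] and waits for
# completion; passing a shared pool via thread_pool_csv is how the multi-site
# (CSV) mode reuses workers.  A return of -1 means the new site requires login.
def _example_compare_site_threaded():
    ok = compare_site_thread("www.example-practice.com",
                             "example-practice.televox.west.com")
    if ok == -1:
        print("site requires login - use the Selenium/login mode instead")
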
def compare_site_selenium(old_url, new_url, progress_var=None, step=100.0):
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        entry_print("-----------------------------------------------------\n")
        return
    create_path()
    new_browser = webdriver.Chrome(executable_path=settings["EXECUTABLE_PATH"])
    new_browser.maximize_window()
    site_pass = True
    blog_pass = True
    ind = 0
    old_url = old_url.strip()
    new_url = new_url.strip()
    # remove the "/" at the end of url
    if old_url.endswith('/'):
        old_url = old_url[:-1]
    if new_url.endswith('/'):
        new_url = new_url[:-1]
    # add "http://" before url
    if not old_url.startswith("http"):
        old_url = "http://" + old_url
    if not new_url.startswith("http"):
        new_url = "http://" + new_url
    # print out the information for old and new sites
    entry_print("-----------------------------------------------------", True)
    entry_print("Old URL: " + old_url, True)
    entry_print("New URL: " + new_url, True)
    entry_print("-----------------------------------------------------", True)
    setup_step = step * 0.01
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)
    # get the domain name of old url and derive the new url
    sites = get_sites(old_url)
    old_blog = get_blog_site(old_url)
    new_blog = get_blog_site(new_url)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        new_browser.quit()
        entry_print("-----------------------------------------------------\n")
        return
    blog_exists = False
    if old_blog and new_blog:
        blog_exists = True
    # if subpages are not found
    if sites is None:
        record_error(new_url, "sites")
        return False
    # if blog page is not found
    if old_blog is not None and new_blog is None:
        record_error(new_url, "blog")
    elif old_blog is None and new_blog is not None:
        record_error(old_url, "blog")
    setup_step = step * 0.02
    step *= 0.97
    if progress_var:
        progress_var.set(progress_var.get() + setup_step)
    if blog_exists:
        page_step = step / 2 / (len(sites) + 1)
    else:
        page_step = step / (len(sites) + 1)
    # check homepage
    homepage_pass = compare_homepage(old_url, new_url, browser=new_browser,
                                     progress_var=progress_var, step=page_step)
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        new_browser.quit()
        entry_print("-----------------------------------------------------\n")
        return
    if homepage_pass == -1:
        return -1
    # check all the sites in sitemap
    for site in sites:
        # check program status
        if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
            new_browser.quit()
            entry_print("-----------------------------------------------------\n")
            return
        ind += 1
        if site.startswith("/home") or site.startswith("/main"):
            continue
        old_link = old_url + site
        new_link = new_url + site
        page_pass = compare_page(old_link, new_link, browser=new_browser,
                                 progress_var=progress_var, step=page_step)
        if not page_pass:
            site_pass = False
    # check all the blog entries
    if blog_exists:
        # check program status
        if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
            new_browser.quit()
            entry_print("-----------------------------------------------------\n")
            return
        old_blog_soup = get_soup(old_blog)
        new_blog_soup = get_soup_selenium(new_blog, new_browser)
        blog_pass = compare_blog(old_blog_soup, new_blog_soup, old_blog, new_blog,
                                 browser=new_browser, progress_var=progress_var,
                                 step=step / 2)
    entry_print("-----------------------------------------------------\n")
    new_browser.quit()
    return site_pass and homepage_pass and blog_pass
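
# Illustrative sketch (placeholder URLs): the Selenium variant drives a real
# Chrome session (settings["EXECUTABLE_PATH"]) so it can check sites that sit
# behind the CMS login; it returns True/False, or -1 on a credential problem.
def _example_compare_site_with_login():
    result = compare_site_selenium("www.example-practice.com",
                                   "example-practice.televox.west.com")
    print("site result: " + str(result))
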