def add_a_target_blank(a):
    """Make an outbound anchor open in a new tab.

    An anchor is treated as outbound when its ``href`` starts with
    ``http://`` or ``https://``; such anchors get ``target='_blank'``.
    Anchors without an ``href``, or with any other ``href``, are returned
    unchanged.
    """
    outbound_schemes = ('http://', 'https://')
    if not a.has_attr('href'):
        return a
    if a['href'].startswith(outbound_schemes):
        a['target'] = '_blank'
        status.log(NAME, "... an outbound link, add target='_blank' tag")
    return a
def make_tree(tree_segments):
    """Join ``tree_segments`` into one path and create it if it is missing.

    Returns the joined path whether or not it had to be created.
    """
    tree = os.path.join(*tree_segments)
    if os.path.exists(tree):
        status.log(NAME, ('Tree', tree, 'already exists OK'))
        return tree
    status.log(NAME, ('Making', tree))
    os.makedirs(tree)
    return tree
def make_tree(tree_segments):
    """Build a directory path from its segments, creating it when absent.

    The joined path is returned in both the "created" and the
    "already exists" case.
    """
    tree = os.path.join(*tree_segments)
    already_there = os.path.exists(tree)
    if already_there:
        status.log(NAME, ('Tree', tree, 'already exists OK'))
    else:
        status.log(NAME, ('Making', tree))
        os.makedirs(tree)
    return tree
def add_a_class(a):
    """Give the anchor the ``link--impt`` class unless it points to /static.

    Anchors without an ``href`` and anchors whose ``href`` starts with
    ``/static`` are returned unchanged.
    """
    _class = "link--impt"
    if not a.has_attr('href'):
        return a
    if a['href'].startswith('/static'):
        return a
    a['class'] = _class
    status.log(NAME, "... add class '{}' to anchor".format(_class))
    return a
def overwrite_leaves(tree, leaves):
    """Write every ``(content, filename)`` leaf into the ``tree`` directory.

    Each file is opened in ``'wb'`` mode. Leaves whose filename ends with
    ``.json`` are serialized with ``json.dump`` (indent of 4) followed by a
    newline; any other leaf is written as ``str(content)``.
    """
    for leaf in leaves:
        content, filename = leaf[0], leaf[1]
        path_leaf = os.path.join(tree, filename)
        status.log(NAME, ('Writing in', path_leaf))
        is_json = filename.endswith('.json')
        with open(path_leaf, 'wb') as f:
            if not is_json:
                f.write(str(content))
                continue
            json.dump(content, f, indent=4)
            f.write("\n")
    return
def overwrite_leaves(tree, leaves):
    """Overwrite the files described by ``leaves`` inside ``tree``.

    Every leaf is indexed as ``leaf[0]`` (content) and ``leaf[1]``
    (filename). Files are opened in ``'wb'`` mode; ``.json`` leaves are
    dumped with ``json.dump`` (indent of 4) plus a trailing newline, and
    everything else is written as ``str(content)``.
    """
    for leaf in leaves:
        content, filename = leaf[0], leaf[1]
        path_leaf = os.path.join(tree, filename)
        status.log(NAME, ('Writing in', path_leaf))
        with open(path_leaf, 'wb') as f:
            if filename.endswith('.json'):
                json.dump(content, f, indent=4)
                f.write("\n")
            else:
                f.write(str(content))
    return
def add_img_alt(img):
    """Derive an ``alt`` text for ``img`` from its ``src`` path.

    The text is the name of the directory holding the image (dashes turned
    into spaces, then capitalized), a space, and the image file name
    without its extension and with every ``'image'`` substring removed.
    """
    src = img['src']
    parent_dir = os.path.basename(os.path.dirname(src))
    stem = os.path.splitext(os.path.basename(src))[0]
    label = parent_dir.replace('-', ' ').capitalize()
    number = stem.replace('image', '')
    alt_text = ' '.join((label, number))
    img['alt'] = alt_text
    status.log(NAME, "... img, add alt='{}'".format(alt_text))
    return img
def translate_script_src(soup, dir_url, translate_static):
    """Rewrite the ``src`` of every <script> that starts with a static head.

    For each <script> with a ``src``, the first key of ``translate_static``
    that prefixes the ``src`` is replaced by the value returned from
    ``get_new``. Scripts without a ``src`` are skipped. Returns ``soup``.
    """
    for script in soup.findAll('script'):
        if not script.has_attr('src'):
            continue
        for src_head, src_target in translate_static.items():
            if not script['src'].startswith(src_head):
                continue
            status.log(NAME, ('src (static) to translate found: ',
                              script['src']))
            new = get_new(script['src'], src_target, dir_url)
            script['src'] = script['src'].replace(src_head, new)
            status.log(NAME, ('... translated to (but will get stripped): ',
                              script['src']))
            break  # only the first matching head is applied
    return soup
def translate_link_href(soup, dir_url, translate_static):
    """Rewrite the ``href`` of every <link> that starts with a static head.

    For each <link> with an ``href``, the first key of ``translate_static``
    that prefixes the ``href`` is replaced by the value returned from
    ``get_new``. Links without an ``href`` are skipped. Returns ``soup``.
    """
    for link in soup.findAll('link'):
        if not link.has_attr('href'):
            continue
        for href_head, href_target in translate_static.items():
            if not link['href'].startswith(href_head):
                continue
            status.log(NAME, ('href (static) to translate found: ',
                              link['href']))
            new = get_new(link['href'], href_target, dir_url)
            link['href'] = link['href'].replace(href_head, new)
            status.log(NAME, ('... translated to: ', link['href']))
            break  # only the first matching head is applied
    return soup
def strip(body):
    """Remove presentational attributes and all <script> tags from ``body``.

    The ``class``, ``id``, ``name`` and ``style`` attributes are deleted
    from ``body`` itself and from every tag returned by ``body()``; every
    <script> found inside ``body`` is then extracted. Returns ``body``.
    """
    attrs_to_rm = ["class", "id", "name", "style"]
    # Handle the <body> tag first, then each tag it contains.
    for tag in [body] + list(body()):
        for attr in attrs_to_rm:
            del tag[attr]
    status.log(NAME, ("Striping attributes", ' | '.join(attrs_to_rm),
                      'inside body !'))
    for script in body.findAll('script'):
        script.extract()
    status.log(NAME, "Striping all <script> inside body !")
    return body
def translate_link_href(soup, dir_url, translate_static):
    """Translate static ``href`` prefixes on all <link> tags of ``soup``.

    A <link> is rewritten with the first ``translate_static`` key that its
    ``href`` starts with; the replacement text comes from ``get_new``.
    Tags lacking an ``href`` are left alone. Returns ``soup``.
    """
    links_with_href = [link for link in soup.findAll('link')
                       if link.has_attr('href')]
    for link in links_with_href:
        for href_head in translate_static:
            if link['href'].startswith(href_head):
                status.log(NAME, ('href (static) to translate found: ',
                                  link['href']))
                new = get_new(link['href'], translate_static[href_head],
                              dir_url)
                link['href'] = link['href'].replace(href_head, new)
                status.log(NAME, ('... translated to: ', link['href']))
                break  # stop at the first matching head
    return soup
def strip(body):
    """Remove styling/table attributes and all <script> tags from ``body``.

    The attributes listed in ``attrs_to_rm`` are deleted from ``body``
    itself and from every tag returned by ``body()``; every <script> found
    inside ``body`` is then extracted. Returns ``body``.
    """
    attrs_to_rm = ["class", "id", "name", "style",
                   "colspan", "rowspan", "cellpadding", "cellspacing"]
    # Handle the <body> tag first, then each tag it contains.
    for tag in [body] + list(body()):
        for attr in attrs_to_rm:
            del tag[attr]
    status.log(NAME, ("Striping attributes", ' | '.join(attrs_to_rm),
                      'inside body !'))
    for script in body.findAll('script'):
        script.extract()
    status.log(NAME, "Striping all <script> inside body !")
    return body
def check_tree_config(tree, config, flags):
    """Prefer an already-published, hand-edited ``config.json`` if present.

    Reads ``{tree}/config.json``. When its content differs from ``config``,
    the stored content replaces ``config`` and ``flags`` is rebuilt as
    ``['show-config']`` plus one ``no-*`` flag for every empty field among
    ``tags.title``, ``tags.meta_description`` and ``tutorial_name``.

    This is a best-effort check: a missing or unreadable file, invalid
    JSON, or a stored config lacking the expected keys leaves the values
    gathered so far untouched. Only those expected failures are swallowed,
    so ``KeyboardInterrupt``/``SystemExit`` still propagate.

    Returns:
        The ``(config, flags)`` pair to use.
    """
    try:
        path_config = os.path.join(tree, 'config.json')
        with open(path_config) as f:
            config_old = json.load(f)
        if config_old != config:
            config = config_old
            status.log(NAME, (
                "Not overwriting `{}`, as modifications were found"
            ).format(path_config))
            flags = ['show-config']
            if not config['tags']['title']:
                flags += ['no-title']
            if not config['tags']['meta_description']:
                flags += ['no-meta_description']
            if not config['tutorial_name']:
                flags += ['no-tutorial_name']
    except (IOError, OSError, ValueError, KeyError, TypeError):
        # No usable previous config: keep what we have.
        pass
    return config, flags
def translate_script_src(soup, dir_url, translate_static):
    """Translate static ``src`` prefixes on all <script> tags of ``soup``.

    A <script> is rewritten with the first ``translate_static`` key that
    its ``src`` starts with; the replacement text comes from ``get_new``.
    Tags lacking a ``src`` are left alone. Returns ``soup``.
    """
    scripts_with_src = [script for script in soup.findAll('script')
                        if script.has_attr('src')]
    for script in scripts_with_src:
        for src_head in translate_static:
            if script['src'].startswith(src_head):
                status.log(NAME, ('src (static) to translate found: ',
                                  script['src']))
                new = get_new(script['src'], translate_static[src_head],
                              dir_url)
                script['src'] = script['src'].replace(src_head, new)
                status.log(NAME, (
                    '... translated to (but will get stripped): ',
                    script['src']))
                break  # stop at the first matching head
    return soup
def translate_img_src(soup, path_html, dir_url, translate_static):
    """Translate static <img> sources and collect the original image paths.

    For every <img> whose ``src`` starts with a key of ``translate_static``
    the untranslated path (relative to the folder of ``path_html``) is
    recorded, the ``src`` is rewritten using ``get_new`` and an ``alt``
    text is added with ``add_img_alt``.

    Returns:
        ``(soup, paths_image)`` where ``paths_image`` lists the recorded
        original image paths.
    """
    folder_html = os.path.dirname(path_html)
    paths_image = []
    for img in soup.findAll('img'):
        if not img.has_attr('src'):
            continue
        for src_head, src_target in translate_static.items():
            if not img['src'].startswith(src_head):
                continue
            # TODO add support for no folder
            status.log(NAME, ('src (static) to translate found: ',
                              img['src']))
            paths_image.append(os.path.join(folder_html, img['src']))
            new = get_new(img['src'], src_target, dir_url)
            img['src'] = img['src'].replace(src_head, new)
            status.log(NAME, ('... translated to: ', img['src']))
            img = add_img_alt(img)
            break  # only the first matching head is applied
    return soup, paths_image
def get_display_latex_content(body):
    """Merge display-LaTeX spread over several tags into its first <span>.

    Looks for <span> tags whose first content item contains a LaTeX start
    marker (``$$`` or ``\\begin{equation}``). From there it walks the
    following <p>/<span> tags, appending their contents to the leading
    span's ``contents`` list until a span whose first content item holds
    an end marker is reached. Every tag visited after the leading span is
    then extracted from the tree.

    ``IndexError`` and ``TypeError`` raised while processing a span are
    silently ignored and the loop moves on to the next span.

    Returns ``body``.
    """
    latex_starts = ("$$", "\\begin{equation}")
    latex_ends = ("$$", "\\end{equation}")
    Span = body.findAll('span')
    for span in Span:
        try:
            # Loop through possible latex starts
            for latex_start in latex_starts:
                if latex_start in span.contents[0]:
                    # Init. latex content and trackers
                    in_latex = True
                    # NOTE: no copy is made, so the `+=` below extends
                    # span.contents itself.
                    in_latex_content = span.contents
                    in_latex_tags = [span]
                    status.log(NAME, (
                        "<span> containing latex (i.e. {start}) found:"
                    ).format(start=latex_start))
                    # Get next tag, TODO generalize?
                    _next = span.findNext(('p', 'span'))
                    while in_latex:
                        if _next.name != 'span':
                            # If not span, find next span
                            in_latex_tags += [_next]
                            _next = _next.findChild('span')
                            continue
                        # Add content to leading span
                        in_latex_content += _next.contents
                        status.log(NAME, (
                            '... more in-line latex content:',
                            ' '.join((_next.contents)).replace('\n', '')
                        ))
                        for latex_end in latex_ends:
                            # Check if latex end is reached
                            if latex_end in _next.contents[0]:
                                in_latex = False
                        # Add in latex tag, for tracking
                        in_latex_tags += [_next]
                        _next = _next.findNext(('p', 'span'))
                    # Delete in latex tags (all but the leading span)
                    for in_latex_tag in in_latex_tags[1:]:
                        in_latex_tag.extract()
                    # TODO wrap in latex content in "$$" ?
        except (IndexError, TypeError):  # TODO generalize?
            pass
    return body
def translate_img_src(soup, path_html, dir_url, translate_static):
    """Rewrite static <img> ``src`` values and gather the source image paths.

    Each <img> whose ``src`` starts with a ``translate_static`` key has its
    original location (joined onto the folder of ``path_html``) appended to
    the returned list, its ``src`` rewritten through ``get_new`` and an
    ``alt`` attribute added by ``add_img_alt``.

    Returns:
        ``(soup, paths_image)``.
    """
    folder_html = os.path.dirname(path_html)
    paths_image = []
    images_with_src = [img for img in soup.findAll('img')
                       if img.has_attr('src')]
    for img in images_with_src:
        for src_head in translate_static:
            if img['src'].startswith(src_head):
                # TODO add support for no folder
                status.log(NAME, ('src (static) to translate found: ',
                                  img['src']))
                paths_image.append(os.path.join(folder_html, img['src']))
                new = get_new(img['src'], translate_static[src_head],
                              dir_url)
                img['src'] = img['src'].replace(src_head, new)
                status.log(NAME, ('... translated to: ', img['src']))
                img = add_img_alt(img)
                break  # stop at the first matching head
    return soup, paths_image
def check_tree_config(tree, config, flags):
    """Prefer an already-published, hand-edited ``config.json`` if present.

    Reads ``{tree}/config.json``. When its content differs from ``config``,
    the stored content replaces ``config`` and ``flags`` is rebuilt as
    ``['show-config']`` plus one ``no-*`` flag for every empty field among
    ``tags.title``, ``tags.meta_description``, ``tutorial_name`` and
    ``banner_image`` (the latter only when it is exactly ``""``).

    This is a best-effort check: a missing or unreadable file, invalid
    JSON, or a stored config lacking the expected keys leaves the values
    gathered so far untouched. Only those expected failures are swallowed,
    so ``KeyboardInterrupt``/``SystemExit`` still propagate.

    Returns:
        The ``(config, flags)`` pair to use.
    """
    try:
        path_config = os.path.join(tree, 'config.json')
        with open(path_config) as f:
            config_old = json.load(f)
        if config_old != config:
            config = config_old
            status.log(NAME, ("Not overwriting `{}`, as modifications "
                              "from default were found").format(path_config))
            flags = ['show-config']
            if not config['tags']['title']:
                flags += ['no-title']
            if not config['tags']['meta_description']:
                flags += ['no-meta_description']
            if not config['tutorial_name']:
                flags += ['no-tutorial_name']
            if config['banner_image'] == "":
                flags += ['no-banner_image']
    except (IOError, OSError, ValueError, KeyError, TypeError):
        # No usable previous config: keep what we have.
        pass
    return config, flags
def add_header_anchors(body):
    """Turn every non-empty <h1>-<h4> of ``body`` into a self-linking header.

    Headers are handled level by level (h1, then h2, h3, h4). A header with
    no text is removed. Otherwise the header gets an ``id`` built from its
    text (spaces replaced by dashes, lower-cased), an <a> pointing to
    ``#<id>`` is inserted through ``inserter`` and the header receives its
    CSS class. Returns ``body``.
    """
    header_class = "heading alpha push--ends text--center"
    anchor_class = "link--impt"
    anchor_tag = 'a'
    for level in ('h1', 'h2', 'h3', 'h4'):
        for header in body.findAll(level):
            text = header.getText(strip=True, separator=u' ')
            status.log(NAME, ('Header found! text:', text.encode('utf8')))

            # If <h{}> is empty, remove it
            if not text:
                header.extract()
                status.log(NAME, '... is empty, removing it!')
                continue

            # Add id attr to <h{}>
            header_id = text.replace(' ', '-').lower()
            header['id'] = header_id
            status.log(NAME, "... add id: '{}'".format(
                header_id.encode('utf8')))

            # Add <a href= > around text, with its class
            target = '#' + header_id
            inserter(header, anchor_tag,
                     {'href': target, 'class': anchor_class}, text)
            status.log(NAME, "... insert <a href='{}' class='{}'>".format(
                target.encode('utf8'), anchor_class.encode('utf8')))

            # Add class to <h{}>
            header['class'] = header_class
            status.log(NAME, "... add class '{}' to header".format(
                header_class))
    return body
def add_lightbox(body):
    """Wrap every <img> that is not already inside an <a> in a lightbox <a>.

    The wrapping anchor links to the image ``src`` and carries a
    ``data-lightbox`` attribute set to the image file name without its
    extension. Images that already have an <a> ancestor are left alone.
    Returns ``body``.
    """
    wrap_tag = 'a'
    for img in body.findAll('img'):
        status.log(NAME, ('Image found! src:', img['src']))
        if img.findParent('a'):
            status.log(NAME, "... <a> found around it, doing nothing")
            continue
        # No <a> around <img />: add lightbox !
        file_name = os.path.basename(img['src'])
        wrap_attrs = {
            'href': img['src'],
            'data-lightbox': os.path.splitext(file_name)[0],
        }
        wrap(img, wrap_tag, wrap_attrs)
        status.log(NAME, '... wrap with lightbox <a>')
    return body
def add_lightbox(body):
    """Add a lightbox <a> around each <img> lacking an <a> ancestor.

    The new anchor has ``href`` equal to the image ``src`` and
    ``data-lightbox`` equal to the image file name stripped of its
    extension. Returns ``body``.
    """
    wrap_tag = 'a'
    for img in body.findAll('img'):
        status.log(NAME, ('Image found! src:', img['src']))
        already_linked = img.findParent('a')
        if already_linked:
            status.log(NAME, "... <a> found around it, doing nothing")
        else:
            # TODO maybe only lightbox <a>
            stem, _ext = os.path.splitext(os.path.basename(img['src']))
            wrap(img, wrap_tag, {'href': img['src'], 'data-lightbox': stem})
            status.log(NAME, '... wrap with lightbox <a>')
    return body
def format_paragraphs(body):
    """Rebuild the inner contents of every <p> and give it its CSS class.

    For each paragraph the contents are recomputed with
    ``get_inner_contents`` (ignoring <span> tags and bare newline strings),
    the old contents are removed with ``strip_contents`` and the new ones
    inserted with ``insert_inner_contents``. The paragraph then receives
    the ``push-half--ends`` class. Returns ``body``.
    """
    p_class = 'push-half--ends'
    for paragraph in body.findAll('p'):
        status.log(NAME, 'Paragraph found!')

        # Get new inner contents, strip old and insert new
        new_contents = get_inner_contents(
            paragraph.contents, tag_ignore='span', string_ignore=u'\n')
        strip_contents(paragraph)
        insert_inner_contents(paragraph, new_contents)
        status.log(NAME, "... formatting it")

        # Add class to <p>
        paragraph['class'] = p_class
        status.log(NAME, "... add class '{}'".format(p_class))
    return body
def check_redirects(folder, translate_redirects):
    """Move published sub-directories from their old name to their new one.

    For both ``published/includes`` and ``published/static/images``, and
    for each ``new -> olds`` entry of ``translate_redirects``: when the
    directory named after the last old name exists and the new one does
    not, the new directory is created, every item of the old one is copied
    into it and the old directory is removed.
    """
    published = os.path.join(folder, 'published')
    paths_subdirs = [
        os.path.join(published, 'includes'),
        os.path.join(published, 'static', 'images'),
    ]
    for path_subdirs in paths_subdirs:
        for new, olds in translate_redirects.items():
            path_subdir_new = os.path.join(path_subdirs, new) + '/'
            path_subdir_old = os.path.join(path_subdirs, olds[-1]) + '/'
            if (not os.path.isdir(path_subdir_old)
                    or os.path.isdir(path_subdir_new)):
                continue  # nothing to move, or already moved
            status.log(NAME, ('Making', path_subdir_new))
            os.makedirs(path_subdir_new)
            for item in os.listdir(path_subdir_old):
                path_item = os.path.join(path_subdir_old, item)
                shutil.copy(path_item, path_subdir_new)
                status.log(NAME, 'Copying {} to {}'.format(
                    path_item, path_subdir_new))
            shutil.rmtree(path_subdir_old)
            status.log(NAME, 'Removing directory {}'.format(path_subdir_old))
    return
def check_redirects(folder, translate_redirects):
    """Rename published sub-directories according to ``translate_redirects``.

    Both ``published/includes`` and ``published/static/images`` are
    inspected. For each ``new -> olds`` entry, if the directory named after
    ``olds[-1]`` exists and the ``new`` one does not, the new directory is
    created, the old directory's items are copied over and the old
    directory is deleted.
    """
    roots = [
        os.path.join(folder, 'published', 'includes'),
        os.path.join(folder, 'published', 'static', 'images'),
    ]
    for root in roots:
        for new, olds in translate_redirects.items():
            path_subdir_new = os.path.join(root, new) + '/'
            path_subdir_old = os.path.join(root, olds[-1]) + '/'
            needs_move = (os.path.isdir(path_subdir_old)
                          and not os.path.isdir(path_subdir_new))
            if needs_move:
                status.log(NAME, ('Making', path_subdir_new))
                os.makedirs(path_subdir_new)
                for item in os.listdir(path_subdir_old):
                    path_item = os.path.join(path_subdir_old, item)
                    shutil.copy(path_item, path_subdir_new)
                    status.log(NAME, 'Copying {} to {}'.format(
                        path_item, path_subdir_new))
                shutil.rmtree(path_subdir_old)
                status.log(NAME, 'Removing directory {}'.format(
                    path_subdir_old))
    return
def overwrite_redirects(folder, redirects_py):
    """Write ``redirects_py`` to ``{folder}/published/redirects.py``.

    The file is opened in ``"w"`` mode, so existing content is replaced.
    """
    path_template = "{}/published/redirects.py"
    f_redirects = path_template.format(folder)
    with open(f_redirects, "w") as out:
        status.log(NAME, ('Writes in', f_redirects))
        out.write(redirects_py)
    return
def print_flags(flags, config, path_html, tree):
    """Report every flag raised while building the config of one html file.

    ``'show-config'`` logs the current config values; the ``no-*``,
    ``multiple-*`` and ``missing-*`` flags emit an important message that
    tells the user what to fix in ``{tree}/config.json``. Any other flag
    logs the title and meta description that were picked up.

    Args:
        flags: iterable of flag strings.
        config: config dictionary; ``tutorial_name``, ``banner_image`` and
            ``tags`` (``title``, ``meta_description``) are read.
        path_html: path of the html file, used in messages only.
        tree: path of the published includes tree, used in messages only.
    """
    for flag in flags:
        if flag == 'show-config':
            status.log(NAME, ("{}/config.json ['tutorial_name']:\n\t'{}'").format(
                tree, config['tutorial_name']))
            status.log(NAME, ("{}/config.json ['banner_image']:\n\t'{}'").format(
                tree, config['banner_image']))
            status.log(NAME, ("{}/config.json ['tags']['title']:\n\t'{}'").format(
                tree, config['tags']['title']))
            status.log(NAME, (
                "{}/config.json ['tags']['meta_description']:\n\t'{}'").format(
                    tree, config['tags']['meta_description']))
        elif flag == 'no-title':
            status.important(NAME, ("There is no <title>\nin `{}`.\n"
                                    "Please fill in\n`{}/config.json`").format(
                                        path_html, tree))
        elif flag == 'multiple-title':
            status.important(
                NAME, ("There is more than one <title>\nin `{}`.\n"
                       "Picking the last one for\n`{}/config.json`").format(
                           path_html, tree))
            # Fixed: the two literals used to join as "metatitle".
            status.log(NAME, ('With last <title> tag, set meta '
                              'title to "{}"').format(config['tags']['title']))
        elif flag == 'no-meta_description':
            # Fixed: this message used to claim "more than one" (copied
            # from the 'multiple-meta_descriptions' case).
            status.important(
                NAME, ("There is no <meta name='description'> in\n`{}`.\n"
                       "Please fill in\n`{}/config.json`").format(path_html, tree))
        elif flag == 'multiple-meta_descriptions':
            status.important(
                NAME, ("There is more than one <meta name='description'> in\n`{}`.\n"
                       "Picking the last one for\n`{}/config.json`").format(
                           path_html, tree))
            status.log(NAME, ('With last <meta name="description"> tag, '
                              'set meta description to "{}"').format(
                                  config['tags']['meta_description']))
        elif flag == 'no-tutorial_name':
            status.important(
                NAME, ("Please fill 'tutorial_name' in\n`{}/config.json`"
                       ).format(tree))
        elif flag == 'no-banner_image':
            status.important(
                NAME, ("Please fill 'banner_image' in\n`{tree}/config.json`:\n"
                       "- For an iframe: set 'banner_image' to the url\n"
                       "- For a static image: set 'banner_image' "
                       "to the image file name\n"
                       " AND copy the image to:\n"
                       " ``{tree_image}``/\n"
                       "- For no banner image, set 'banner_image' to false").format(
                           tree=tree,
                           tree_image=tree.replace('includes',
                                                   'static/images')))
        elif flag == 'missing-banner_image':
            status.important(
                NAME, ("The static banner image linked to 'banner_image' "
                       "({image}) in\n "
                       "`{tree}/config.json`\n "
                       "is not found in\n "
                       "`{tree_image}`/\n "
                       "Please copy it over.").format(
                           image=config['banner_image'],
                           tree=tree,
                           tree_image=tree.replace('includes', 'static/images')))
        else:
            status.log(NAME,
                       ('With <title> tag, set meta title to:\n\t"{}"').format(
                           config['tags']['title']))
            status.log(NAME, ('With <meta name="description"> tag, '
                              'set meta description to:\n\t"{}"').format(
                                  config['tags']['meta_description']))
    return
def print_flags(flags, config, path_html, tree):
    """Report every flag raised while building the config of one html file.

    ``'show-config'`` logs the current config values; the ``no-*``,
    ``multiple-*`` and ``missing-*`` flags emit an important message that
    tells the user what to fix in ``{tree}/config.json``. Any other flag
    logs the title and meta description that were picked up.

    Args:
        flags: iterable of flag strings.
        config: config dictionary; ``tutorial_name``, ``banner_image`` and
            ``tags`` (``title``, ``meta_description``) are read.
        path_html: path of the html file, used in messages only.
        tree: path of the published includes tree, used in messages only.
    """
    for flag in flags:
        if flag == 'show-config':
            status.log(NAME, (
                "{}/config.json ['tutorial_name']:\n\t'{}'"
            ).format(tree, config['tutorial_name']))
            status.log(NAME, (
                "{}/config.json ['banner_image']:\n\t'{}'"
            ).format(tree, config['banner_image']))
            status.log(NAME, (
                "{}/config.json ['tags']['title']:\n\t'{}'"
            ).format(tree, config['tags']['title']))
            status.log(NAME, (
                "{}/config.json ['tags']['meta_description']:\n\t'{}'"
            ).format(tree, config['tags']['meta_description']))
        elif flag == 'no-title':
            status.important(NAME, (
                "There is no <title>\nin `{}`.\n"
                "Please fill in\n`{}/config.json`"
            ).format(path_html, tree))
        elif flag == 'multiple-title':
            status.important(NAME, (
                "There is more than one <title>\nin `{}`.\n"
                "Picking the last one for\n`{}/config.json`"
            ).format(path_html, tree))
            # Fixed: the two literals used to join as "metatitle".
            status.log(NAME, (
                'With last <title> tag, set meta '
                'title to "{}"'
            ).format(config['tags']['title']))
        elif flag == 'no-meta_description':
            # Fixed: this message used to claim "more than one" (copied
            # from the 'multiple-meta_descriptions' case).
            status.important(NAME, (
                "There is no <meta name='description'> in\n`{}`.\n"
                "Please fill in\n`{}/config.json`"
            ).format(path_html, tree))
        elif flag == 'multiple-meta_descriptions':
            status.important(NAME, (
                "There is more than one <meta name='description'> in\n`{}`.\n"
                "Picking the last one for\n`{}/config.json`"
            ).format(path_html, tree))
            status.log(NAME, (
                'With last <meta name="description"> tag, '
                'set meta description to "{}"'
            ).format(config['tags']['meta_description']))
        elif flag == 'no-tutorial_name':
            status.important(NAME, (
                "Please fill 'tutorial_name' in\n`{}/config.json`"
            ).format(tree))
        elif flag == 'no-banner_image':
            status.important(NAME, (
                "Please fill 'banner_image' in\n`{tree}/config.json`:\n"
                "- For an iframe: set 'banner_image' to the url\n"
                "- For a static image: set 'banner_image' "
                "to the image file name\n"
                " AND copy the image to:\n"
                " ``{tree_image}``/\n"
                "- For no banner image, set 'banner_image' to false"
            ).format(tree=tree,
                     tree_image=tree.replace('includes', 'static/images')))
        elif flag == 'missing-banner_image':
            status.important(NAME, (
                "The static banner image linked to 'banner_image' "
                "({image}) in\n "
                "`{tree}/config.json`\n "
                "is not found in\n "
                "`{tree_image}`/\n "
                "Please copy it over."
            ).format(image=config['banner_image'],
                     tree=tree,
                     tree_image=tree.replace('includes', 'static/images')))
        else:
            status.log(NAME, (
                'With <title> tag, set meta title to:\n\t"{}"'
            ).format(config['tags']['title']))
            status.log(NAME, (
                'With <meta name="description"> tag, '
                'set meta description to:\n\t"{}"'
            ).format(config['tags']['meta_description']))
    return
def run_analysis(args):
    """Analyze one image or ``.mp4`` video and record the outcome.

    Args:
        args: sequence whose first item is the path of the file to analyze
            and whose second item is the batch directory used for the log
            files and the saved results.

    Videos are handed to ``analyzeVideo`` and the timed result is logged
    to the detected log. Images go through ``algorithms``; the resulting
    scores are color-mapped when they are not already a color image and
    saved under ``Analyzed`` when the maximum score is at least 50, or
    under ``Other Analyzed`` otherwise.

    The three log files are opened in ``'a+'`` mode inside a ``with``
    statement so they are closed even when the analysis raises.
    """
    img = args[0]         # Image to be analyzed
    batch_path = args[1]  # Used for save location

    with open(os.path.join(batch_path, 'batch_log.txt'), 'a+') as batch_log, \
            open(os.path.join(batch_path, 'detected_log.txt'), 'a+') as detected_log, \
            open(os.path.join(batch_path, 'other_log.txt'), 'a+') as other_log:

        # Accepted extensions
        ext = os.path.splitext(img)[-1]

        # Image name to save the heatmap under
        img_name = ".".join(os.path.split(img)[1].split(".")[:-1])

        # Handles video input
        if ext.lower() == ".mp4":
            # Create and start the timer
            t = timer.Timer()
            t.start()

            # Analyze the video
            rtn_name = analyzeVideo((img, batch_path, 30))

            # Stop the timer
            t.stop()

            # Log the results
            rtn_str = rtn_name, t.get_time(), 0.0
            status('-v-', rtn_str)
            log(detected_log, '-v-', rtn_str)
        else:
            # Call the algorithms
            final_scores, final_time, final_stats, unused_images = algorithms(img)

            # Apply colormap to the combined heatmap if it is not already
            # a color image
            if len(final_scores.shape) < 3:
                # TODO: Compare with COLORMAP_RAINBOW (First Responders
                # stated that they tend to use it instead of Jet)
                final_image = cv.applyColorMap(
                    final_scores.astype(np.uint8), cv.COLORMAP_JET)
            else:
                final_image = final_scores

            # Save heatmap in the correct folder
            # TODO: Re-Evaluate whether this is still a valid requirement
            if np.max(final_scores) >= 50:
                results_str = [img_name, final_time, final_stats]
                status('-d-', results_str)
                log(batch_log, '-d-', results_str)
                log(detected_log, '-d-', results_str)
                #cv.imwrite(os.path.join( detected_folder, img_name + ".jpg"), final_image)
                cv.imwrite(os.path.join(batch_path, "Analyzed",
                                        img_name + ".jpg"), final_image)
            else:
                results_str = [img_name, final_time, final_stats]
                status('-o-', results_str)
                log(batch_log, '-o-', results_str)
                log(other_log, '-o-', results_str)
                #cv.imwrite(os.path.join( other_folder, img_name + ".jpg"), final_image)
                cv.imwrite(os.path.join(batch_path, "Other Analyzed",
                                        img_name + ".jpg"), final_image)

            # Save any resulting images from the algorithms that couldn't
            # be used to produce the heatmap
            # ct = 1
            # for u_img in unused_images:
            #     results_str = [ img_name, final_time, "Modified Original" ]
            #     status( '-i-', "An unused image was detected. Image saved to 'Modified Original' folder.")
            #     status( '-m-', results_str)
            #     log(batch_log, '-m-', results_str)
            #     cv.imwrite(os.path.join( batch_path, "Modified Original", img_name + ".jpg"), u_img)
            #     ct += 1

    return
def add_img_alt(img):
    """Set the ``alt`` of ``img`` to ``<parent directory>/<file name>``.

    Both parts are taken from the image ``src``: the name of the directory
    that directly contains the file and the file's base name.
    """
    src = img['src']
    parent_dir = os.path.basename(os.path.dirname(src))
    file_name = os.path.basename(src)
    alt = '/'.join((parent_dir, file_name))
    img['alt'] = alt
    status.log(NAME, "... img, add alt='{}'".format(alt))
    return img
def get_body_head(soup):
    """Return the ``(body, head)`` pair of ``soup``."""
    status.log(NAME, 'Grabs <body> and <head>')
    body = soup.body
    head = soup.head
    return body, head
def overwrite_urls(folder, urls_py):
    """Write ``urls_py`` to ``{folder}/published/urls.py``.

    The file is opened in ``"w"`` mode, so existing content is replaced.
    """
    path_template = "{}/published/urls.py"
    f_urls = path_template.format(folder)
    with open(f_urls, "w") as out:
        status.log(NAME, ('Writes in', f_urls))
        out.write(urls_py)
    return
def overwrite_sitemaps(folder, sitemaps_py):
    """Write ``sitemaps_py`` to ``{folder}/published/sitemaps.py``.

    The file is opened in ``"w"`` mode, so existing content is replaced.
    The log entry is emitted after the content has been written.
    """
    path_template = "{}/published/sitemaps.py"
    f_sitemaps = path_template.format(folder)
    with open(f_sitemaps, "w") as out:
        out.write(sitemaps_py)
        status.log(NAME, ('Writes in', f_sitemaps))
    return
def run_analysis(args):
    """Analyze one image or ``.mp4`` video and record the outcome.

    Args:
        args: sequence whose first item is the path of the file to analyze
            and whose second item is the batch directory used for the log
            files and the saved results.

    Videos are handed to ``analyzeVideo`` and the timed result is logged
    to the detected log. Images go through ``algorithms``; the resulting
    scores are color-mapped with the JET colormap and saved under
    ``Detected`` when the maximum score is at least 50, or under ``Other``
    otherwise.

    The three log files are opened in ``'a+'`` mode inside a ``with``
    statement so they are closed even when the analysis raises.
    """
    img = args[0]         # Image to be analyzed
    batch_path = args[1]  # Used for save location

    with open(os.path.join(batch_path, 'batch_log.txt'), 'a+') as batch_log, \
            open(os.path.join(batch_path, 'detected_log.txt'), 'a+') as detected_log, \
            open(os.path.join(batch_path, 'other_log.txt'), 'a+') as other_log:

        # Accepted extensions
        ext = os.path.splitext(img)[-1]

        # Image name to save the heatmap under
        img_name = ".".join(os.path.split(img)[1].split(".")[:-1])

        # Handles video input
        if ext.lower() == ".mp4":
            # Create and start the timer
            t = timer.Timer()
            t.start()

            # Analyze the video
            rtn_name = analyzeVideo((img, batch_path, 30))

            # Stop the timer
            t.stop()

            # Log the results
            rtn_str = rtn_name, t.get_time(), 0.0
            status('-v-', rtn_str)
            log(detected_log, '-v-', rtn_str)
        else:
            # Call the algorithms
            final_scores, final_time, final_stats = algorithms(img)

            # Apply colormap to the combined heatmap
            final_heatmap = cv.applyColorMap(final_scores.astype(np.uint8),
                                             cv.COLORMAP_JET)

            # Save heatmap in the correct folder
            if np.max(final_scores) >= 50:
                results_str = [img_name, final_time, final_stats]
                status('-d-', results_str)
                log(batch_log, '-d-', results_str)
                log(detected_log, '-d-', results_str)
                #cv.imwrite(os.path.join( detected_folder, img_name + ".jpg"), final_heatmap)
                cv.imwrite(os.path.join(batch_path, "Detected",
                                        img_name + ".jpg"), final_heatmap)
            else:
                results_str = [img_name, final_time, final_stats]
                status('-o-', results_str)
                log(batch_log, '-o-', results_str)
                log(other_log, '-o-', results_str)
                #cv.imwrite(os.path.join( other_folder, img_name + ".jpg"), final_heatmap)
                cv.imwrite(os.path.join(batch_path, "Other",
                                        img_name + ".jpg"), final_heatmap)

    return
def copy_leaves(tree, paths_leaf):
    """Copy each path of ``paths_leaf`` into the ``tree`` directory."""
    status.log(NAME, ('Copying leaves to', tree))
    for source_path in paths_leaf:
        shutil.copy(source_path, tree)
    return
def get_soup(path_html):
    """Open ``path_html`` and return its content parsed by BeautifulSoup."""
    with open(path_html, "r") as html_file:
        status.log(NAME, ("Opening", path_html))
        # Parse while the file is still open.
        soup = BeautifulSoup(html_file)
    return soup
def get_soup(path_html):
    """Parse the html file at ``path_html`` into a BeautifulSoup object."""
    with open(path_html, "r") as html_file:
        status.log(NAME, ("Opening", path_html))
        # Parse while the file is still open.
        parsed = BeautifulSoup(html_file)
    return parsed
def get_body_head(soup):
    """Split ``soup`` into its ``body`` and ``head`` and return both."""
    status.log(NAME, 'Grabs <body> and <head>')
    parts = (soup.body, soup.head)
    return parts
def translate_a_href(soup, dir_url, translate_static, translate_filename_url):
    """Clean up and translate the ``href`` of every <a> in ``soup``.

    Anchors with neither text nor children, and anchors without an
    ``href``, are removed. For the remaining ones:

    * Case 1: an ``href`` starting with a key of ``translate_static`` has
      that head replaced by the value computed with ``get_new``.
    * Google redirects (``www.google.com/url?q=``) are unwrapped to the
      target url, with ``%3A`` and ``%2F`` decoded.
    * Case 2: an ``href`` starting with a plot.ly domain or ``/`` is either
      turned into a full ``https://plot.ly/~...`` url (2.1), made
      root-relative (2.2), and/or has a known file name replaced by its
      url from ``translate_filename_url`` (2.3).

    Finally ``add_a_class`` and ``add_a_target_blank`` are applied.
    Returns ``soup``.
    """
    google_starts = ('https://www.google.com/url?q=',
                     'http://www.google.com/url?q=')
    google_end = '&'  # TODO could this be more strict?
    href_starts = ['https://plot.ly/', 'plot.ly/', 'http://plot.ly/', '/']

    for a in soup.findAll('a'):
        is_translated = False  # to log relevant output

        # Clean up case
        if not a.getText(strip=True) and not a.findChildren():
            a.extract()
            status.log(NAME, 'Anchor with nothing in it found, removing it!')
            continue
        if not a.has_attr('href'):
            a.extract()
            status.log(NAME, 'Anchor without href found, removing it!!')
            continue

        # Now if 'real' anchor found
        status.log(NAME, ('Anchor found! href: ', a['href']))

        # Case 1: <a> to static location (translated from streambed)
        for href_head in translate_static.keys():
            if a['href'].startswith(href_head):
                status.log(NAME, ('... href has a *static* start: ',
                                  href_head))
                new = get_new(a['href'], translate_static[href_head], dir_url)
                a['href'] = a['href'].replace(href_head, new)
                is_translated = True
                break

        # Case *: handle Google redirects
        for google_start in google_starts:
            if a['href'].startswith(google_start):
                status.log(NAME, '... href has a google redirect')
                _s = len(google_start)
                _e = a['href'].find(google_end, _s)
                if _e == -1:
                    # No trailing parameters: keep everything up to the
                    # end instead of chopping off the last character.
                    _e = len(a['href'])
                a['href'] = (
                    a['href'][_s:_e].replace('%3A', ':')
                                    .replace('%2F', '/')
                )

        # Case 2: <a> to url location (translated to relative domain)
        for href_start in href_starts:
            if a['href'].startswith(href_start):

                # 2.1 href to shareplot should have full URI
                if a['href'].startswith(href_start + '~'):
                    status.log(NAME, ('... href links to shareplot:',
                                      a['href']))
                    status.log(
                        NAME,
                        '... guessing this is referring to a plot on prod')
                    a['href'] = a['href'].replace(
                        href_start, 'https://plot.ly/', 1)
                    is_translated = True
                    continue

                # 2.2 Translate href start to django root
                if not a['href'].startswith('/'):
                    status.log(NAME, ('... href *url* start: ', href_start))
                    a['href'] = a['href'].replace(href_start, '/', 1)
                    is_translated = True

                # 2.3 Translate href to other docs using
                #     translate_filename_url
                for href_tail in translate_filename_url.keys():
                    if href_tail in a['href']:
                        status.log(NAME, ('... href has tail: ', a['href']))
                        # Fixed: replace the matched tail with its url
                        # (previously used the stale `href_head` from
                        # Case 1 and looked the tail up in
                        # `translate_static`).
                        a['href'] = a['href'].replace(
                            href_tail, translate_filename_url[href_tail])
                        is_translated = True
                        break

        # Log output
        if is_translated:
            status.log(NAME, ('... translated to: ', a['href']))
        else:
            status.log(NAME, '... no translation required')

        # Add attributes
        a = add_a_class(a)
        a = add_a_target_blank(a)

    return soup
def print_flags(flags, config, path_html, tree):
    """Report every flag raised while building the config of one html file.

    ``'show-config'`` logs the current config values; the ``no-*`` and
    ``multiple-*`` flags emit an important message that tells the user
    what to fix in ``{tree}/config.json``. Any other flag logs the title
    and meta description that were picked up.

    Args:
        flags: iterable of flag strings.
        config: config dictionary; ``tutorial_name`` and ``tags``
            (``title``, ``meta_description``) are read.
        path_html: path of the html file, used in messages only.
        tree: path of the published includes tree, used in messages only.
    """
    for flag in flags:
        if flag == 'show-config':
            status.log(NAME, (
                "{}/config.json ['tutorial_name']:\n\t'{}'"
            ).format(tree, config['tutorial_name']))
            status.log(NAME, (
                "{}/config.json ['tags']['title']:\n\t'{}'"
            ).format(tree, config['tags']['title']))
            status.log(NAME, (
                "{}/config.json ['tags']['meta_description']:\n\t'{}'"
            ).format(tree, config['tags']['meta_description']))
        elif flag == 'no-title':
            status.important(NAME, (
                "There is no <title>\nin `{}`.\n"
                "Please fill in\n`{}/config.json`"
            ).format(path_html, tree))
        elif flag == 'multiple-title':
            status.important(NAME, (
                "There is more than one <title>\nin `{}`.\n"
                "Picking the last one for\n`{}/config.json`"
            ).format(path_html, tree))
            # Fixed: the two literals used to join as "metatitle".
            status.log(NAME, (
                'With last <title> tag, set meta '
                'title to "{}"'
            ).format(config['tags']['title']))
        elif flag == 'no-meta_description':
            # Fixed: this message used to claim "more than one" (copied
            # from the 'multiple-meta_descriptions' case).
            status.important(NAME, (
                "There is no <meta name='description'> in\n`{}`.\n"
                "Please fill in\n`{}/config.json`"
            ).format(path_html, tree))
        elif flag == 'multiple-meta_descriptions':
            status.important(NAME, (
                "There is more than one <meta name='description'> in\n`{}`.\n"
                "Picking the last one for\n`{}/config.json`"
            ).format(path_html, tree))
            status.log(NAME, (
                'With last <meta name="description"> tag, '
                'set meta description to "{}"'
            ).format(config['tags']['meta_description']))
        elif flag == 'no-tutorial_name':
            status.important(NAME, (
                "Please fill 'tutorial_name' in\n`{}/config.json`"
            ).format(tree))
        else:
            status.log(NAME, (
                'With <title> tag, set meta title to:\n\t"{}"'
            ).format(config['tags']['title']))
            status.log(NAME, (
                'With <meta name="description"> tag, set meta description to:\n\t"{}"'
            ).format(config['tags']['meta_description']))
    return
def main():
    """Publish every folder returned by ``get_args``.

    For each folder: load its translation tables, validate the raw html
    paths and the published sub-directories, then for each html file
    (1) write ``body.html`` and ``config.json`` under
    ``published/includes/<url>`` and (2) copy its images under
    ``published/static/images/<url>``. Finally (3) the folder-wide urls,
    redirects and sitemaps files are generated.
    """
    for folder in get_args():

        # Translation info for folder-specific files
        translate_static = translate.get_translate_static(folder)
        filename_url_and_redirects = (
            translate.get_translate_filename_url(folder))
        translate_filename_url, translate_redirects = (
            filename_url_and_redirects)

        # Paths of all html files in {folder}/raw/, checked against
        # translate_filename_url (updated if necessary)
        paths_html = get_paths_html(folder)
        paths_html = check_translate(folder, paths_html,
                                     translate_filename_url)

        # Directories to redirect, then consistency of {folder}/published/*
        check_redirects(folder, translate_redirects)
        check_published_subdirectories(folder, translate_filename_url)

        # (1) Make body.html and config.json for each html file
        for path_html in paths_html:

            # Published directory url (and name, they are the same!)
            dir_url = translate_filename_url[os.path.basename(path_html)]

            # Published tree for this html file
            includes_segments = [folder, 'published', 'includes', dir_url]
            tree_includes = make_tree(includes_segments)

            # Soup, split into <body> and <head>
            body, head = get_body_head(get_soup(path_html))

            # Translate 'href' and 'src' in body
            body, paths_image = translate.translate(
                body, path_html, dir_url,
                translate_static, translate_filename_url)

            # Config info from head
            config = make_config.make_config(head, path_html, tree_includes)

            # Update <body> !
            body = update_body.update_body(body)

            # Overwrite body.html and config.json leaves
            leaves = [(body, 'body.html'), (config, 'config.json')]
            overwrite_leaves(tree_includes, leaves)

            # (2) Copy images in the appropriate published/ subdirectories
            images_segments = [folder, 'published', 'static', 'images',
                               dir_url]
            tree_images = make_tree(images_segments)
            copy_leaves(tree_images, paths_image)

            status.log(NAME, '---- done with `{}`\n'.format(dir_url))

        # (3) Make/print folder-wide urls, redirects and sitemaps files
        make_urls.make_urls(folder, translate_filename_url)
        make_redirects.make_redirects(folder, translate_redirects)
        make_sitemaps.make_sitemaps(folder, translate_filename_url)
def copy_leaves(tree, paths_leaf):
    """Copy every file listed in ``paths_leaf`` into ``tree``."""
    status.log(NAME, ('Copying leaves to', tree))
    for leaf_path in paths_leaf:
        shutil.copy(leaf_path, tree)
    return
def main():
    """Publish every folder returned by ``get_args``.

    For each folder: load its translation tables, validate the raw html
    paths and the published sub-directories, then for each html file
    (1) strip, translate and update its <body> and write ``body.html`` and
    ``config.json`` under ``published/includes/<url>``, and (2) copy its
    images under ``published/static/images/<url>``. Finally (3) the
    folder-wide urls, redirects and sitemaps files are generated.
    """
    for folder in get_args():

        # Translation info for folder-specific files
        translate_static = translate.get_translate_static(folder)
        filename_url_and_redirects = (
            translate.get_translate_filename_url(folder))
        translate_filename_url, translate_redirects = (
            filename_url_and_redirects)

        # Paths of all html files in {folder}/raw/, checked against
        # translate_filename_url (updated if necessary)
        paths_html = get_paths_html(folder)
        paths_html = check_translate(folder, paths_html,
                                     translate_filename_url)

        # Directories to redirect, then consistency of {folder}/published/*
        check_redirects(folder, translate_redirects)
        check_published_subdirectories(folder, translate_filename_url)

        # (1) Make body.html and config.json for each html file
        for path_html in paths_html:

            # Published directory url (and name, they are the same!)
            dir_url = translate_filename_url[os.path.basename(path_html)]

            # Published tree for this html file
            includes_segments = [folder, 'published', 'includes', dir_url]
            tree_includes = make_tree(includes_segments)

            # Soup, split into <body> and <head>, then strip style
            body, head = get_body_head(get_soup(path_html))
            body = update_body.strip(body)

            # Translate 'href' and 'src' in body
            body, paths_image = translate.translate(
                body, path_html, dir_url,
                translate_static, translate_filename_url)

            # Config info from head
            config = make_config.make_config(head, path_html, tree_includes)

            # Update <body> !
            body = update_body.update_body(body)

            # Overwrite body.html and config.json leaves
            leaves = [(body, 'body.html'), (config, 'config.json')]
            overwrite_leaves(tree_includes, leaves)

            # (2) Copy images in the appropriate published/ subdirectories
            images_segments = [folder, 'published', 'static', 'images',
                               dir_url]
            tree_images = make_tree(images_segments)
            copy_leaves(tree_images, paths_image)

            status.log(NAME, '---- done with `{}`\n'.format(dir_url))

        # (3) Make/print folder-wide urls, redirects and sitemaps files
        make_urls.make_urls(folder, translate_filename_url)
        make_redirects.make_redirects(folder, translate_redirects)
        make_sitemaps.make_sitemaps(folder, translate_filename_url)
def overwrite_sitemaps(folder, sitemaps_py):
    """Replace ``{folder}/published/sitemaps.py`` with ``sitemaps_py``.

    The file is opened in ``"w"`` mode and the log entry is emitted once
    the content has been written.
    """
    target = "{}/published/sitemaps.py".format(folder)
    with open(target, "w") as out:
        out.write(sitemaps_py)
        status.log(NAME, ('Writes in', target))
    return
#Create the log files if they do not exist #TODO: Add a time stamp when the file was created b_log = open(os.path.join(batch_path, 'batch_log.txt'), 'a+') d_log = open(os.path.join(batch_path, 'detected_log.txt'), 'a+') o_log = open(os.path.join(batch_path, 'other_log.txt'), 'a+') e_log = open(os.path.join(batch_path, 'error_log.txt'), 'a+') #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- if __name__ == '__main__': total_time.start() status('-i-', 'Initialization completed') status('-i-', 'Beginning Image Analysis...') log(b_log, '-i-', 'Initialization completed') log(b_log, '-i-', 'Beginning Image Analysis...') #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- if __name__ == '__main__': #Free up the log files b_log.close() d_log.close() o_log.close() e_log.close() #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- if __name__ == '__main__': #In Windows you need to protect the thread creation froms each child thread. If not done, each child thread will create subthreads.