def CreateBody(self):
    '''Create the body tag.'''
    body = Tag(self.soup, 'body')
    totalTagsCount = random.randint(150, 400)
    # Build the template structure out of div tags.
    for _ in range(random.randint(1, 3)):
        body.append(self.CreateDiv())
    # Integer division: roughly 15-25% of the tags become divs.
    divsTotalCount = totalTagsCount * random.randint(15, 25) // 100
    while divsTotalCount > 0:
        divsLowLevelList = [item for item in body.findAll('div')
                            if len(item.findAll(True)) == 0]
        divToExtend = random.choice(divsLowLevelList)
        for _ in range(random.randint(2, 4)):
            divToExtend.append(self.CreateDiv())
        divsTotalCount -= 1
    # Collect the div tags by nesting level.
    divsList = body.findAll('div')
    divsTopLevelList = body.findAll('div', recursive=False)
    divsLowLevelList = [item for item in divsList
                        if len(item.findAll(True)) == 0]
    divsMidLevelList = [item for item in divsList
                        if item not in divsTopLevelList
                        and item not in divsLowLevelList]
    # Assign id/class attributes to them.
    for item in divsTopLevelList:
        self.AppendIds(item, 95, 1)
    for item in divsMidLevelList:
        self.AppendIds(item, 20, 75)
    for item in divsLowLevelList:
        self.AppendIds(item, 30, 65)
    # Fill the main placeholder blocks.
    divHeader = divsLowLevelList.pop(random.randint(0, 2))
    divHeader.string = '[header]'
    divMain = divsLowLevelList.pop(random.randint(1, 3))
    divMain.string = '[main]'
    divLinks = divsLowLevelList.pop(random.randint(-3, -1))
    divLinks.string = '[links]'
    divFooter = divsLowLevelList.pop(random.randint(-3, -1))
    divFooter.string = '[footer]'
    # Create menus, sidebars and forms.
    for _ in range(random.randint(1, 2)):
        menu = divsLowLevelList.pop()
        menu.append(self.CreateList(0))
    for _ in range(random.randint(1, 2)):
        sidebar = divsLowLevelList.pop()
        self.CreateSidebar(sidebar)
    for _ in range(random.randint(0, 2)):
        form = divsLowLevelList.pop()
        form.append(self.CreateForm())
    # Create the remaining filler content.
    random.shuffle(divsLowLevelList)
    for _ in range(random.randint(2, 5)):
        div = divsLowLevelList.pop()
        self.CreateOthers(div)
    self.soup.html.append(body)
def get_last_3(soup, table):
    '''Linearize everything after the first three columns of a table
    into list items, one li per row.'''
    loop = 0
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        td = tr.findAll("td")
        li = Tag(soup, "li")
        for el in td[3:]:
            if loop != 3:
                try:
                    text = ''.join(el.findAll(text=True)).strip()
                    if text:
                        el.name = "span"
                        if loop != 2:
                            el.append(' - ')
                        li.append(el)
                except Exception:
                    pass
            else:
                break
            loop += 1
        loop = 0
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    return enclose
def get_first_three(soup, table):
    '''Linearize the first three columns of a table. The very first
    non-empty cell becomes a title placed above the resulting ul.'''
    loop = 0
    first = True
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        for td in tr.findAll("td"):
            if loop != 3:
                try:
                    text = ''.join(td.findAll(text=True)).strip()
                    if text:
                        td.name = "span"
                        if first:
                            first = False
                            enclose.append(td)
                        else:
                            if loop != 2:
                                td.append(' - ')
                            li.append(td)
                except Exception:
                    pass
            else:
                break
            loop += 1
        loop = 0
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    # Pull the title span back out, turn the container into a ul,
    # and wrap both in a fresh div.
    title = enclose.find("span")
    title.replaceWith("")
    enclose.name = "ul"
    div = Tag(soup, "div")
    div.append(title)
    div.append(enclose)
    return div
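# A minimal usage sketch for the two table helpers above. The function
# name and the `html` argument are assumptions, not part of the original
# code; it only shows how a parsed table can be swapped for its
# linearized form.
def linearize_tables(html):
    soup = BeautifulSoup(html)
    for table in soup.findAll('table'):
        # get_first_three returns a new div; the original table stays
        # in the tree, so replace it explicitly.
        table.replaceWith(get_first_three(soup, table))
    return soup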
def fix_heading(heading, tags):
    '''
    Remove paragraphs with no strings.
    Remove non-special headings that don't start with a paragraph.
    Remove lists from non-special headings.
    '''
    SPECIAL = ['Books', 'Works', 'Bibliography', 'External links',
               'Further reading']
    # Drop empty paragraphs: a tag survives if it is not a <p>, or if
    # it renders non-empty contents (note the parentheses; without them
    # `and` binds tighter than `or` and None tags would crash).
    tags = [tag for tag in tags
            if tag is not None
            and (tag.name != 'p' or tag.renderContents(None).strip())]
    special = False
    heading_text = tagtext(heading)
    for word in SPECIAL:
        if word.lower() in heading_text.lower():
            special = True
    if heading_text == 'External links and references':
        set_heading_text(heading, 'External links')
    # Shorten lists (even special ones).
    # The motivation is that some pages like to list reams of crap,
    # usually in bibliographies, but in other things too.
    found_lis = 0
    MAX_ITEMS = 10  # per headed section
    for tag in list(tags):
        if tag.name in ('ul', 'ol'):
            for li in tag.findAll('li', recursive=False):
                found_lis += 1
                if found_lis > MAX_ITEMS:
                    li.extract()
    # Remove any now-empty uls and ols.
    # Harder than it sounds, due to nested lists.
    temp = Tag(soup, 'p')
    for tag in tags:
        temp.append(tag)
    for tag in temp.findAll(('ul', 'ol')):
        if not tag.findAll(('ul', 'ol', 'li')):
            tag.extract()
    tags = temp.contents
    if found_lis > MAX_ITEMS:
        # Add " (some omitted)" to heading
        if heading_text:
            heading_text = heading_text.replace(' (incomplete)', '')
            if context['srcurl'].startswith('http:'):
                heading_text += (' (some <a href="%s">omitted</a>)'
                                 % context['srcurl'])
            else:
                heading_text += ' (some omitted)'  # no "relative" links
            set_heading_text(heading, heading_text)
    if not special:
        if heading is not None:
            # Remove non-special headings which don't start with a paragraph.
            if not tags or tags[0].name != 'p':
                return drop_heading(heading)
            # Remove non-special headings containing lists.
            for tag in tags:
                if tag.name in ('ul', 'ol'):
                    return drop_heading(heading)
        else:
            # Remove lists from None (before first heading, if any).
            tags = [tag for tag in tags if tag.name not in ('ul', 'ol')]
    return (heading, tags)
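# fix_heading relies on a tagtext helper that is not defined in this
# file. A plausible sketch, assuming it flattens a tag's text nodes and
# treats a missing heading as empty text:
def tagtext(tag):
    if tag is None:
        return ''
    return ''.join(tag.findAll(text=True)).strip()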
def CreateList(self, probNested):
    '''Create a ul list, nested with the given probability.'''
    ul = Tag(self.soup, 'ul')
    self.AppendIds(ul, 50, 30)
    liClass = self.GenerateClass(0)
    for _ in range(random.randint(3, 7)):
        ul.append(self.CreateListItem(liClass))
    if self._Probability(probNested):
        liNestedList = ul.findAll('li')
        random.shuffle(liNestedList)
        liNestedList = liNestedList[:random.randint(1, 4)]
        for liNested in liNestedList:
            liNested.append(self.CreateList(0))
    # Any li still empty gets link text so no item renders blank.
    for li in ul.findAll('li'):
        if len(li.findAll(True)) == 0:
            li.append(self.CreateLinkText())
    return ul
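# CreateList calls self._Probability, which is not shown here. A minimal
# sketch, assuming the argument is a percentage chance out of 100 (the
# scale the AppendIds call sites also appear to use):
def _Probability(self, percent):
    return random.randint(1, 100) <= percent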
def linearize_cols_1_4(soup, table):
    '''Flatten a four-column table into a ul of pipe-separated links.'''
    if table.get('id') == "linearize-cols-1-4":
        holder = Tag(soup, "ul")
        # Peel the first cell off every row, four passes in total, so
        # the cells end up grouped column by column.
        for _ in range(4):
            for tr in table.findAll("tr"):
                td = tr.find("td")
                td.replaceWith("")
                holder.append(td)
        list_a = holder.findAll("a")
        composite_list = [list_a[x:x + 4] for x in range(0, len(list_a), 4)]
        ul = Tag(soup, "ul")
        for lista in composite_list:
            li = Tag(soup, "li")
            for a in lista:
                # `is` checks identity, so only the final link of each
                # group goes without a separator.
                if a is lista[-1]:
                    a = BeautifulSoup(a.prettify())
                else:
                    a = BeautifulSoup(a.prettify() + '<span> | </span>')
                li.append(a)
            ul.append(li)
        table.replaceWith(ul)
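# linearize_cols_1_4 only fires on tables carrying the marker id, so it
# is safe to run over a whole document. A hedged usage sketch
# (`page_html` is an assumption):
soup = BeautifulSoup(page_html)
for table in soup.findAll('table', id='linearize-cols-1-4'):
    linearize_cols_1_4(soup, table)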
def get_first_two(soup, table):
    '''Linearize the first two columns of a table into list items.'''
    loop = 0
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        for td in tr.findAll("td"):
            if loop != 2:
                try:
                    text = ''.join(td.findAll(text=True)).strip()
                    if text:
                        td.name = "span"
                        if loop != 1:
                            td.append(' - ')
                        li.append(td)
                except Exception:
                    pass
            else:
                break
            loop += 1
        loop = 0
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    return enclose
anchorLink = column.find('a')['href']
fileName = anchorLink.split('/')[-1].replace('.shtml', '.html')
# Only process the target band; if none is given, process everything.
if not TARGET_BAND or fileName[:-5] == TARGET_BAND:
    bandBreadCrumbsSoup = BeautifulSoup()
    parentUlTag = createParentUlTag(bandBreadCrumbsSoup)
    bandListTag = column.find('li')
    albumUlTag = Tag(bandBreadCrumbsSoup, 'ul')
    bandListTag.append(albumUlTag)
    for childColumn in column.findAll('ul', attrs={'class': 'child-column'}):
        albumUlTag.append(childColumn.find('li'))
    parentUlTag.append(bandListTag)
    # Write the band-level breadcrumb file.
    resultFile = open('/'.join([PARENT_DIR, 'common/bread_crumbs', fileName]), 'w')
    resultFile.write(parentUlTag.prettify())
    resultFile.close()
    print "write %s" % fileName
    # Breadcrumb trails for the album files.
    for childColumn in albumUlTag.findAll('li'):
        childAnchorLink = childColumn.find('a')['href']
        albumParentUlTag = copy.deepcopy(parentUlTag)
        albumParentUlTag.append(copy.deepcopy(childColumn))
        # Write the album-level breadcrumb file.
        splitList = childAnchorLink.split('/')
        childFileName = '/'.join([splitList[-3], splitList[-1]]).replace('.shtml', '.html')
        resultFile = open('/'.join([PARENT_DIR, 'common/bread_crumbs', childFileName]), 'w')
        resultFile.write(albumParentUlTag.prettify())
        resultFile.close()
        print "write %s" % childFileName
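# createParentUlTag is referenced above but not defined in this
# fragment. A plausible sketch, assuming it just roots an empty ul for
# the breadcrumb trail in the fresh soup (the class attribute is
# hypothetical):
def createParentUlTag(soup):
    parentUlTag = Tag(soup, 'ul')
    parentUlTag['class'] = 'bread-crumbs'  # hypothetical attribute
    return parentUlTag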
def parse_html(self):
    title = None
    body = None
    bodysoup = None
    nav_title = None
    meta_title = None
    content_title = None
    self.attachments = []
    if not self.debug_path or self.debug_path == self.path:
        # Remove nbsps
        self.html = self.html.replace("&nbsp;", "")
        # Remove attributes from closing tags (!)
        self.html = re.sub(r"</([a-zA-Z]+) [^>]+>", r"</\1>", self.html)
        m = RE_BODY.search(self.html)
        if m and m.lastindex == 1:
            bodysoup = BeautifulSoup(m.group(1),
                                     convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        else:
            try:
                bodysoup = BeautifulSoup(self.html,
                                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES).html.body
                if not bodysoup:
                    fixed_html = self.html.replace("</head>", "</head><body>")
                    bodysoup = BeautifulSoup(fixed_html,
                                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES).html.body
            except AttributeError:
                pass
        if not bodysoup:
            raise ImportError("No body")
        if self.debug_path == self.path:
            print "\n\n========= DEBUG =========\n"
        # Remove comments
        for comment in bodysoup.findAll(text=lambda text: isinstance(text, Comment)):
            self.debug("Removed comment: <!-- %s -->" % comment)
            comment.extract()
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Convert header divs into h1, h2
        h1_found = False
        for tag in bodysoup.findAll("div"):
            if tag.get("class") == "BL-otsikko1" and not h1_found:
                h1_found = True
                tag.name = "h1"
                self.debug("Converted into H1: %s" % tag)
            elif tag.get("class") == "BL-otsikko2" or \
                    (tag.get("class") == "BL-otsikko1" and h1_found):
                tag.name = "h2"
                self.debug("Converted into H2: %s" % tag)
            elif tag.get("class") == "BL-leivanmurut":
                tag.extract()
                self.debug("Removed breadcrumbs")
            else:
                tag.hidden = True
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Remove unwanted elements
        for tag in bodysoup.findAll(["style", "link"]):
            tag.extract()
            self.debug("Removed %s" % tag)
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Hide unnecessary elements
        for tag in bodysoup.findAll(["span", "div", "body", "font"]):
            self.debug("Set hidden: %s" % tag.name)
            tag.hidden = True
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Reformat forms (placeholder, never implemented)
        for form in bodysoup.findAll("form"):
            pass
        # Hide non-semantic tables
        for table in bodysoup.findAll("table"):
            if (table.get("border") != "1"
                    and len(table.findAll("tr", recursive=False)) < 100
                    and not has_ancestor(table, "form")):
                table.hidden = True
                for tr in table.findAll("tr", recursive=False):
                    tr.hidden = True
                    for td in tr.findAll(["td", "th"], recursive=False):
                        text = td.find(text=re.compile("[^\s]+", re.U))
                        if text and text.strip() != "":
                            td.name = "p"
                            self.debug("Converted th/td into p: %s" % td)
                        else:
                            td.hidden = True
                self.debug("Hid non-semantic table")
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Remove orphan th/td/tr: neither parent nor grandparent is a table.
        for el in bodysoup.findAll(["tr", "td", "th"]):
            if el.parent.name != "table" and \
                    (not el.parent.parent or el.parent.parent.name != "table"):
                if el.name in ("td", "th"):
                    text = el.find(text=re.compile("[^\s]+", re.U))
                    if text and text.strip() != "":
                        el.name = "p"
                        self.debug("Converted td/th into p: %s" % el)
                    else:
                        el.hidden = True
                        self.debug("Hid orphan %s: %s" % (el.name, el))
                else:
                    el.hidden = True
                    self.debug("Hid orphan %s: %s" % (el.name, el))
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Wrap NavigableStrings in td into p
        for tag in bodysoup.findAll(text=lambda text: isinstance(text, NavigableString)):
            if tag.parent.name == "td" and tag.strip() != "":
                p = Tag(bodysoup, "p")
                p.insert(0, "%s" % tag)
                tag.replaceWith(p)
                self.debug("Moved from td into p: %s" % tag)
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Convert "loose" NavigableStrings into paragraphs
        for tag in bodysoup.findAll(text=lambda text: isinstance(text, NavigableString)):
            if len(tag.strip()) > 10 and tag.parent.name == "[document]":
                p = Tag(bodysoup, "p")
                p.insert(0, "%s" % tag)
                tag.replaceWith(p)
                self.debug("Moved loose string into p: %s" % tag)
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Move NavigableStrings after list into p before moving lists
        for ul in bodysoup.findAll("ul"):
            if ul.parent.name == "p":
                next_sib = ul.nextSibling
                if isinstance(next_sib, NavigableString):
                    p = Tag(bodysoup, "p")
                    p.insert(0, "%s" % next_sib)
                    next_sib.replaceWith(p)
                    self.debug("Moved NavigableString after list into p: %s" % p)
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Move blocks outside paragraphs
        for block in bodysoup.findAll(["p", "ul", "h1", "h2"]):
            parent = block.parent
            if parent.name == "p":
                if block.name in ("h1", "h2"):
                    parent.parent.insert(parent.parent.index(parent), block)
                    self.debug("Moved %s before p" % block.name)
                else:
                    parent.parent.insert(parent.parent.index(parent) + 1, block)
                    self.debug("Moved %s after p" % block.name)
        # Delete deprecated attributes
        for tag in bodysoup.findAll():
            for attr in ("align", "valign", "class", "style", "border",
                         "vspace", "hspace", "cellpadding", "cellspacing"):
                del tag[attr]
            for attr in ("width", "height"):
                if tag.name != "img":
                    del tag[attr]
            for attr in ("colspan", "rowspan"):
                if tag.name not in ("td", "tr"):
                    del tag[attr]
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Import images
        for tag in bodysoup.findAll("img"):
            src = tag.get("src")
            if src and src.endswith(".gif") and src.find("/tyhja-") != -1:
                tag.extract()
            elif src and not src.startswith("http://"):
                img_path = os.path.dirname(os.path.join(self.source_dir, self.source_path)) + "/" + src
                if os.path.exists(img_path) and os.path.isfile(img_path):
                    img_title = tag.get("title") or tag.get("alt") or ""
                    img = Image.create_from_file(img_path, img_title[0:100])
                    img.tmp_orig_path = src
                    img.save()
                    tag["src"] = img.file.url
                    self.debug("Imported image: %s" % tag["src"])
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Import external files into Attachment models
        for a in bodysoup.findAll("a", href=re.compile(".+")):
            href = a.get("href")
            path, ext = os.path.splitext(href)
            if (not href.startswith("http://") and ext != ""
                    and ext not in (".html", ".shtml", ".php", ".jpg", ".gif", ".png")):
                if href.startswith("/"):
                    abspath = self.source_dir + href
                else:
                    abspath = self.source_dir + os.path.dirname(self.source_path) + "/" + href
                if os.path.exists(abspath):
                    self.debug("Found attachment: %s" % abspath)
                    # Store for later import, as we don't have a page id yet.
                    self.attachments.append(abspath)
        # Remove bad linebreaks
        for br in bodysoup.findAll("br"):
            if br.parent.name == "p":
                for sib in (br.previousSibling, br.nextSibling):
                    if not sib or (isinstance(sib, NavigableString) and sib.strip() == ""):
                        self.debug("Removed linebreak at (%s, %s)" % (br.parent.index(br), br.parent))
                        br.extract()
                        break
            elif br.parent.name == "[document]":
                br.extract()
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Clean up paragraphs
        non_sentence = lambda s: (s is not None
                                  and not s.strip().endswith(".")
                                  and 3 < len(s) < 30)
        for p in bodysoup.findAll("p"):
            # Remove empty
            if p.string and p.string.strip() == "":
                self.debug("Removed empty p at (%s, %s)" % (p.parent.name, p.parent.index(p)))
                p.extract()
            # Hide if contains only tag(s)
            elif not p.findAll(text=re.compile(r"[^\s]+", re.U)):
                self.debug("Hid p with no text: %s" % p)
                p.hidden = True
            # Convert short one-liners into h3
            elif non_sentence(p.string) or \
                    (len(p.findAll(text=re.compile("[^\s]+", re.U))) == 1
                     and non_sentence(p.contents[0].string)):
                p.name = "h3"
                self.debug("Converted p into h3: %s" % p)
            # Remove bad styling
            else:
                tags = p.findAll(recursive=False)
                if len(tags) == 1 and tags[0].name in ("b", "u", "i"):
                    if not p.findAll(text=re.compile("[^\s]+", re.U), recursive=False):
                        tags[0].hidden = True
                        self.debug("Hid %s from p, bad styling: %s" % (tags[0].name, p))
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Remove redundant information
        for text in bodysoup.findAll(text=re.compile(r"^\s*pdf-tiedosto [0-9]+ KB\s+$")):
            self.debug("Removed text: %s" % text)
            text.extract()
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Clean up headings
        for h in bodysoup.findAll(["h1", "h2", "h3", "h4", "h5", "h6"]):
            for el in h.findAll():
                # Remove styling elements (u, b, i, etc)
                if isinstance(el, Tag) and el.name != "a":
                    el.hidden = True
                    self.debug("Heading clean-up, hid %s in %s" % (el.name, h))
            try:
                # Move h1 at first
                if h.name == "h1" and h.parent.index(h) != 1:
                    h.parent.insert(1, h)
                    self.debug("Moved %s at first" % h.name)
                # Convert any heading at the beginning of document into h1
                elif h.name != "h1" and h.parent.name == "[document]" and not h.previousSibling:
                    self.debug("Converted into h1: %s" % h)
                    h.name = "h1"
            except IndexError:
                pass
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Convert internal links
        for a in bodysoup.findAll("a"):
            href = a.get("href")
            if href and not href.startswith("http://") and href.endswith(".shtml"):
                a["href"] = href.replace("/index.shtml", "").replace(".shtml", "").replace("_", "-")
                self.debug("Fixed link: %s -> %s" % (href, a["href"]))
        bodysoup = BeautifulSoup(bodysoup.prettify())
        # Parse content_title text
        h1 = bodysoup.find("h1")
        if h1:
            content_title = " ".join(h1.findAll(text=True))
            content_title = re.sub("[\s]+", " ", content_title).strip()
        # Reformat
        body = u"" + bodysoup.prettify().decode("UTF8")
        body = re.sub(r"\s+>\s+", " > ", body)
        body = re.sub(r"\s+<\s+", " < ", body)
        body = re.sub(r"[\n\r]+", " ", body)
        body = re.sub(r"[ \t]+", " ", body)
        body = re.sub(r">\s+", ">", body)
        body = re.sub(r"\s+<", "<", body)
        body = re.sub(r"</(p|h1|h2|h3|h4|h5|h6|ul|ol|table|tr)>", r"</\1>\n\n", body)
        body = re.sub(r"</(li|td)>", r"</\1>\n", body)
        body = re.sub(r"<(u|b|i|em|strong)>\s*", r" <\1>", body)
        body = re.sub(r"\s*</(u|b|i|em|strong)>", r"</\1> ", body)
        body = re.sub(r"</a>([^\-])", r"</a> \1", body)
        body = re.sub(r"<a ", " <a ", body)
        body = re.sub(r"\s+(\.|,|:|;|!|\?)", r"\1", body)
        # Is body valid UTF8?
        try:
            body.encode("UTF8")
        except UnicodeError:
            print "DAA"
            raise SystemExit()
    else:
        body = "(debug mode, no content parsed)"
    docsoup = BeautifulSoup(self.html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    # nav title
    for text in docsoup.findAll(text=re.compile("^#include")):
        m = re.compile('"(.+)"').search(text)
        if m and m.lastindex == 1:
            include_path = m.group(1)
            if include_path.find("valikko") != -1:
                if not include_path.startswith("/"):
                    if self.source_path.endswith("/index.shtml"):
                        include_path = os.path.join(self.path, include_path)
                    else:
                        include_path = os.path.join("/".join(self.path.split("/")[0:-1]), include_path)
                f = open(self.source_dir + include_path)
                navsoup = BeautifulSoup(f.read(), convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
                f.close()
                for a in navsoup.findAll("a", href="/" + self.source_path):
                    if a.get("class") == "valikon_tekstit" or a.parent.get("class") == "avattu_alavalikko":
                        nav_title = a.find(text=re.compile("[^\s]+")).strip()
                        break
    # meta title
    try:
        meta_title = docsoup.head.title.string
    except AttributeError:
        pass
    if meta_title:
        valid_meta_title_parts = []
        for part in [part.strip() for part in meta_title.split(" - ")]:
            if part not in ("BirdLife Suomi", u"Yhdessä lintujen puolesta"):
                valid_meta_title_parts.append(part)
        meta_title = u" – ".join(valid_meta_title_parts)
    # choose best title
    self.debug("Titles: nav: '%s', meta: '%s', content: '%s'"
               % (nav_title, meta_title, content_title))
    if nav_title:
        title = nav_title
        self.debug("Title choice: nav_title: %s" % title)
    elif content_title and meta_title and len(content_title) < len(meta_title):
        title = content_title
        self.debug("Title choice: content_title (shorter): %s" % title)
    elif meta_title:
        title = meta_title
        self.debug("Title choice: meta_title: %s" % title)
    elif content_title:
        title = content_title
        self.debug("Title choice: content_title: %s" % title)
    else:
        title = "%s (autogen)" % self.slug.capitalize()
        self.debug("Title choice: autogenerated from slug: %s" % title)
    if not title:
        raise ImportError("No title")
    if self.level == 0:
        if self.slug in ("liity", "suojelu", "lintuharrastus", "julkaisut", "yhdistys"):
            self.template = "osio.html"
    if self.level == 0 and self.slug == "":
        self.template = "etusivu.html"
    self.title = title
    self.body = body