def _parseCluster(self, pos, class_, fallbackTitle):
    """Record the teaser list starting at ``pos`` as a cluster.

    ``class_`` is the class string captured for the list; it selects the
    title pattern and the list type. A list marked ``x-notitle`` reuses the
    most recently added cluster when one exists. The cluster's ``listEnd``
    is set to just before the next ``listPattern`` match, or to the last
    index of ``self.content`` when there is none.

    Returns the next ``listPattern`` match, or ``None``.
    """
    if 'b-content-teaser-list' in class_:
        headingPattern, kind = sectionTitlePattern, 'content'
    else:
        headingPattern, kind = clusterTitlePattern, 'cluster'

    heading = headingPattern.search(self.content, pos)
    current = None
    label = fallbackTitle
    if 'x-notitle' in class_:
        # untitled list: extend the previous cluster if there is one
        if self.clusters:
            current = self.clusters[-1]
    elif heading is not None:
        label = stripHtml(heading.group(1))
        pos = heading.end(0)

    if current is None:
        current = Cluster(label, kind, pos)
        self.clusters.append(current)

    upcoming = listPattern.search(self.content, pos)
    if upcoming is None:
        current.listEnd = len(self.content) - 1
    else:
        current.listEnd = upcoming.start(0) - 1
    return upcoming
def _parseClusters(self):
    """Walk all lists in ``self.content`` and dispatch each to its parser.

    A fallback title is taken from ``fallbackTitlePattern`` (or, failing
    that, ``fallbackTitlePattern2``) and handed to ``_parseCluster``.
    Each parser returns the match of the next list, which drives the loop.
    """
    cursor = 0
    fallback = None
    heading = fallbackTitlePattern.search(self.content, cursor)
    if heading is None:
        heading = fallbackTitlePattern2.search(self.content, cursor)
    if heading is not None:
        fallback = stripHtml(heading.group(1))
        cursor = heading.end(0)

    nextList = listPattern.search(self.content, cursor)
    while nextList is not None:
        cursor = nextList.end(0)
        cssClass = nextList.group(1)

        # pick the (item, text) patterns for module-like lists
        if self._isModule(cssClass):
            itemPatterns = (moduleItemPattern, moduleItemTextPattern)
        elif self._isStageTeaser(cssClass):
            itemPatterns = (stageTeaserPattern, stageTeaserTextPattern)
        else:
            itemPatterns = None

        if itemPatterns is None:
            nextList = self._parseCluster(cursor, cssClass, fallback)
        else:
            nextList = self._parseModule(cursor, itemPatterns[0],
                                         itemPatterns[1],
                                         moduleItemDatePattern)
def parseCategory(self, article, pos):
    """Read a ``genre|category`` pair from ``article`` starting at ``pos``.

    The first capture group of ``catPattern`` is split on ``'|'``; the
    first part becomes the genre and the second, if present, the category.
    Both are passed through ``stripHtml`` and stored on ``self.genre`` and
    ``self.category`` (``stripHtml`` receives ``None`` when nothing was
    found).

    Returns the position after the match, or ``pos`` unchanged.
    """
    genreText = None
    categoryText = None
    found = catPattern.search(article, pos)
    if found is not None:
        fields = [field.strip()
                  for field in found.group(1).strip().split('|')]
        # str.split always yields at least one element
        genreText = fields[0]
        if len(fields) > 1:
            categoryText = fields[1]
        pos = found.end(0)

    self.genre = stripHtml(genreText)
    self.category = stripHtml(categoryText)
    return pos
def _parseCluster(self, pos, class_, fallbackTitle):
    """Create or extend a cluster for the list that begins at ``pos``.

    The title pattern and list type depend on whether ``class_`` contains
    ``b-content-teaser-list``. When ``class_`` contains ``x-notitle`` the
    last cluster in ``self.clusters`` is reused if available; otherwise a
    new ``Cluster`` is appended, titled from the title match or from
    ``fallbackTitle``.

    Returns the ``listPattern`` match for the following list, or ``None``.
    """
    isContentList = 'b-content-teaser-list' in class_
    headingPattern = clusterTitlePattern
    kind = 'cluster'
    if isContentList:
        headingPattern = sectionTitlePattern
        kind = 'content'

    heading = headingPattern.search(self.content, pos)
    label = fallbackTitle
    target = None
    if 'x-notitle' in class_:
        if self.clusters:
            target = self.clusters[-1]
    elif heading is not None:
        label = stripHtml(heading.group(1))
        pos = heading.end(0)

    if target is None:
        target = Cluster(label, kind, pos)
        self.clusters.append(target)

    following = listPattern.search(self.content, pos)
    # the list ends right before the next list, or at the end of content
    if following is None:
        lastIndex = len(self.content) - 1
    else:
        lastIndex = following.start(0) - 1
    target.listEnd = lastIndex
    return following
def parseTitle(self, article, pos, baseUrl):
    """Extract link URL and title text from ``article`` starting at ``pos``.

    Sets ``self.title``, ``self.url`` and ``self.contentName``. When the
    URL starts with ``baseUrl`` that prefix is removed from ``self.url``.
    ``self.contentName`` is ``'/zdf'`` plus the unmodified URL up to its
    last ``'.'``, or ``None`` when there is no URL or no dot.

    Returns the position after the parsed title, or ``pos`` unchanged when
    ``aPattern`` does not match.
    """
    closing = '</a>'
    href = None
    caption = None
    link = aPattern.search(article, pos)
    if link is not None:
        href = link.group(1).strip()
        start = link.end(0)
        end = article.find(closing, start)
        # NOTE(review): `end` is -1 when '</a>' is missing; the slice and
        # the returned position are then computed from -1 - confirm intent.
        # check for '<span class="arrowhover ...'
        hover = article.find('<span class="arrowhover', start)
        if -1 < hover < end:
            end = hover
        caption = cleanTags(article[start:end]).strip()
        pos = end + len(closing)

    self.title = stripHtml(caption)
    self.url = href
    self.contentName = None
    if href is None:
        return pos

    if baseUrl is not None and href[0:len(baseUrl)] == baseUrl:
        self.url = href[len(baseUrl):]
    dot = href.rfind('.')
    if dot != -1:
        self.contentName = '/zdf' + href[0:dot]
    return pos
def _parseCluster(self, pos, class_, fallbackTitle):
    """Create or extend a cluster for the list that begins at ``pos``.

    ``class_`` selects the title pattern and list type:
    ``b-content-teaser-list`` -> ``'content'``, ``b-newsstream`` ->
    ``'cluster'``, ``b-topics-module`` -> ``'topics'``, anything else ->
    ``'cluster'``.

    A ``b-content-teaser-list no-title`` list is merged into the previous
    cluster when one exists. Otherwise its teasers are parsed into a
    temporary cluster and appended to ``self.teasers`` without creating a
    cluster.

    For a created or reused cluster, ``listEnd`` is set to just before the
    next list (or the last index of ``self.content``). If the cluster has
    no image, the first teaser supplies it, and also the title when the
    cluster title is ``None``.

    Returns the ``listPattern`` match for the following list, or ``None``.
    """
    titlePattern = clusterTitlePattern
    listType = 'cluster'
    if 'b-content-teaser-list' in class_:
        titlePattern = sectionTitlePattern
        listType = 'content'
    elif 'b-newsstream' in class_:
        titlePattern = newsStreamTitlePattern
        listType = 'cluster'
    elif 'b-topics-module' in class_:
        titlePattern = topicsModuleTitlePattern
        listType = 'topics'

    titleMatch = titlePattern.search(self.content, pos)
    cluster = None
    title = fallbackTitle
    # if content-teaser-list has no title, use previous cluster to
    # calculate list end
    if 'b-content-teaser-list no-title' in class_:
        if self.clusters:
            cluster = self.clusters[-1]
        else:
            nextClusterMatch = listPattern.search(self.content, pos)
            # There may be no further list; previously this dereferenced
            # None. Fall back to the end of the content, as done for
            # regular clusters below.
            if nextClusterMatch is not None:
                listEnd = nextClusterMatch.end(0)
            else:
                listEnd = len(self.content) - 1
            tmpCluster = Cluster(None, listType, pos, listEnd)
            self._parseClusterTeasers(tmpCluster)
            self.teasers.extend(tmpCluster.teasers)
            return nextClusterMatch
    elif titleMatch is not None:
        # title can be None in case of 'x-notitle' in 'topics' list
        title = stripHtml(titleMatch.group(1))
        pos = titleMatch.end(0)

    if cluster is None:
        cluster = Cluster(title, listType, pos)
        self.clusters.append(cluster)

    match = listPattern.search(self.content, pos)
    if match is not None:
        cluster.listEnd = match.start(0) - 1
    else:
        cluster.listEnd = len(self.content) - 1

    # use first teaser image as cluster image
    if cluster.image is None:
        tmpCluster = Cluster(None, listType, cluster.listStart,
                             cluster.listEnd)
        self._parseClusterTeasers(tmpCluster, True)
        if tmpCluster.teasers:
            tmpTeaser = tmpCluster.teasers[0]
            cluster.image = tmpTeaser.image
            # use teaser.title as cluster fallback
            if cluster.title is None:
                cluster.title = tmpTeaser.title

    return match
def parseText(self, article, pos, pattern=textPattern):
    """Extract a text snippet from ``article`` starting at ``pos``.

    The first capture group of ``pattern`` is whitespace-stripped, passed
    through ``stripHtml`` and stored on ``self.text``. When the pattern
    does not match, ``stripHtml`` receives ``None``.

    Returns the position after the match, or ``pos`` unchanged.
    """
    found = pattern.search(article, pos)
    if found is None:
        self.text = stripHtml(None)
        return pos

    self.text = stripHtml(found.group(1).strip())
    return found.end(0)
def parseCategory(self, article, pos):
    """Read genre and category from ``article`` starting at ``pos``.

    After ``catPattern`` matches, ``catCategoryPattern`` supplies the genre
    and ``catBrandPattern`` the category; each search starts where the
    previous successful match ended. Both values are passed through
    ``stripHtml`` and stored on ``self.genre`` and ``self.category``
    (``stripHtml`` receives ``None`` for anything not found).

    Returns the position after the last successful match, or ``pos``
    unchanged when ``catPattern`` does not match.
    """
    def capture(pattern, start):
        # first group (stripped) and the new position, or (None, start)
        hit = pattern.search(article, start)
        if hit is None:
            return None, start
        return hit.group(1).strip(), hit.end(0)

    genreText = None
    brandText = None
    header = catPattern.search(article, pos)
    if header is not None:
        pos = header.end(0)
        genreText, pos = capture(catCategoryPattern, pos)
        brandText, pos = capture(catBrandPattern, pos)

    self.genre = stripHtml(genreText)
    self.category = stripHtml(brandText)
    return pos
def _parseClusters(self):
    """Walk all lists in ``self.content`` and dispatch each to its parser.

    The first capture group of ``fallbackTitlePattern``, if it matches, is
    used as the fallback title for clusters. Lists whose class contains
    ``b-content-module`` go to ``_parseModule``; all others go to
    ``_parseCluster``. Each parser returns the match of the next list.
    """
    cursor = 0
    fallback = None
    heading = fallbackTitlePattern.search(self.content, cursor)
    if heading is not None:
        fallback = stripHtml(heading.group(1))
        cursor = heading.end(0)

    nextList = listPattern.search(self.content, cursor)
    while nextList is not None:
        cursor = nextList.end(0)
        cssClass = nextList.group(1)
        if 'b-content-module' in cssClass:
            nextList = self._parseModule(cursor)
        else:
            nextList = self._parseCluster(cursor, cssClass, fallback)
def _parseClusters(self):
    """Walk all lists in ``self.content`` and dispatch each to its parser.

    The fallback title comes from ``fallbackTitlePattern`` or, when that
    does not match, from ``fallbackTitlePattern2``. Lists whose class
    contains ``b-content-module`` go to ``_parseModule``; all others go to
    ``_parseCluster``. Each parser returns the match of the next list.
    """
    cursor = 0
    heading = fallbackTitlePattern.search(self.content, cursor)
    if heading is None:
        heading = fallbackTitlePattern2.search(self.content, cursor)

    if heading is None:
        fallback = None
    else:
        fallback = stripHtml(heading.group(1))
        cursor = heading.end(0)

    nextList = listPattern.search(self.content, cursor)
    while nextList is not None:
        cursor = nextList.end(0)
        cssClass = nextList.group(1)
        isModule = 'b-content-module' in cssClass
        if isModule:
            nextList = self._parseModule(cursor)
        else:
            nextList = self._parseCluster(cursor, cssClass, fallback)
def parseLabel(self, article, pos):
    """Extract the label text and icon type from ``article``.

    When ``labelPattern`` matches, the enclosing ``div`` is fetched with
    ``getTag``. The first group of ``iconPattern`` inside it becomes
    ``self.type``. The text between the first ``'>'`` and the last
    ``'</div>'`` is run through ``stripTag('abbr', ...)`` and
    ``cleanTags``, stripped, passed to ``stripHtml`` and stored on
    ``self.label``.

    Returns the updated position, or ``pos`` unchanged without a match.
    """
    closing = '</div>'
    labelText = None
    iconType = None
    found = labelPattern.search(article, pos)
    if found is not None:
        block = getTag('div', article, found)
        icon = iconPattern.search(block)
        if icon is not None:
            iconType = icon.group(1)

        start = block.find('>') + len('>')
        end = block.rfind(closing)
        # NOTE(review): `end` is an offset inside the getTag() result, not
        # inside `article` - confirm that callers expect this position.
        pos = end + len(closing)
        labelText = cleanTags(stripTag('abbr', block[start:end])).strip()

    self.label = stripHtml(labelText)
    self.type = iconType
    return pos
def parse(self):
    """Parse the page and collect its navigation rubrics.

    Runs the superclass ``parse`` first. If ``leftNavPattern`` is not found
    in ``self.content`` a warning is issued and the method returns without
    touching ``self.rubrics``. Otherwise ``self.rubrics`` is reset and
    filled with one ``Rubric(title, url)`` per ``dropdownLinksPattern``
    match found after the navigation match.
    """
    super(NavigationResource, self).parse()

    navStart = leftNavPattern.search(self.content)
    if navStart is None:
        self.warn(
            "can't find navigation in page '{}', no rubrics will be available ...",
            self.url)
        return

    # Result is not used here; the call is kept because getTag's behaviour
    # is not visible from this method.
    getTag('ul', self.content, navStart)

    cursor = navStart.end(0)
    link = dropdownLinksPattern.search(self.content, cursor)
    self.rubrics = []
    while True:
        if link is None:
            break
        target = link.group(1).strip()
        caption = stripHtml(link.group(2))
        self.rubrics.append(Rubric(caption, target))
        cursor = link.end(0)
        link = dropdownLinksPattern.search(self.content, cursor)
def parseLabel(self, article, pos):
    """Extract the label text and icon type from ``article``.

    When ``labelPattern`` matches, the enclosing ``div`` is fetched with
    ``getTag``. The first group of ``iconPattern`` inside it becomes
    ``self.type``. The text between the first ``'</span>'`` and the last
    ``'</div>'`` has its ``<strong>`` markers removed, is run through
    ``stripTag`` for ``abbr`` and ``span``, stripped, passed to
    ``stripHtml`` and stored on ``self.label``.

    Returns the updated position, or ``pos`` unchanged without a match.
    """
    spanClose = '</span>'
    divClose = '</div>'
    labelText = None
    iconType = None
    found = labelPattern.search(article, pos)
    if found is not None:
        block = getTag('div', article, found)
        icon = iconPattern.search(block)
        if icon is not None:
            iconType = icon.group(1)

        start = block.find(spanClose) + len(spanClose)
        end = block.rfind(divClose)
        # NOTE(review): `end` is an offset inside the getTag() result, not
        # inside `article` - confirm that callers expect this position.
        pos = end + len(divClose)

        labelText = block[start:end]
        for marker in ('<strong>', '</strong>'):
            labelText = labelText.replace(marker, '')
        for tagName in ('abbr', 'span'):
            labelText = stripTag(tagName, labelText)
        labelText = labelText.strip()

    self.label = stripHtml(labelText)
    self.type = iconType
    return pos
def parse(self):
    """Parse the page and collect its navigation rubrics, without duplicates.

    Runs the superclass ``parse`` first. If ``leftNavPattern`` is not found
    in ``self.content`` a warning is issued and the method returns without
    touching ``self.rubrics``. Otherwise ``self.rubrics`` is reset and
    filled with one ``Rubric(title, url)`` per ``dropdownLinksPattern``
    match found after the navigation match. URLs are normalised with
    ``self.parseUrl`` and only the first occurrence of each URL is kept.
    """
    super(NavigationResource, self).parse()
    leftNavMatch = leftNavPattern.search(self.content)
    if leftNavMatch is None:
        self.warn(
            "can't find navigation in page '{}', no rubrics will be available ...",
            self.url)
        return

    # Result is not used here; the call is kept because getTag's behaviour
    # is not visible from this method.
    leftNav = getTag('ul', self.content, leftNavMatch)

    pos = leftNavMatch.end(0)
    dropdownLinksMatch = dropdownLinksPattern.search(self.content, pos)
    self.rubrics = []
    # builtin set instead of sets.Set (the `sets` module is gone in Python 3)
    urls = set()
    while dropdownLinksMatch is not None:
        url = self.parseUrl(dropdownLinksMatch.group(1))
        if url not in urls:
            urls.add(url)
            title = stripHtml(dropdownLinksMatch.group(2))
            rubric = Rubric(title, url)
            self.rubrics.append(rubric)
        pos = dropdownLinksMatch.end(0)
        dropdownLinksMatch = dropdownLinksPattern.search(self.content, pos)
def parseTitle(self, article, pos, baseUrl):
    """Extract link URL, title text and playable flag from ``article``.

    Sets ``self.title``, ``self.url``, ``self.playable`` and
    ``self.contentName``. ``self.playable`` is true when the first group
    of ``titleIconPattern`` equals ``'play'``. When the URL starts with
    ``baseUrl`` that prefix is removed from ``self.url``.
    ``self.contentName`` is the part of the unmodified URL between its
    last ``'/'`` and its last ``'.'``, or ``None`` if either is missing.

    Returns the position after the parsed title, or ``pos`` unchanged when
    ``aPattern`` does not match.
    """
    anchorClose = '</a>'
    spanClose = '</span>'
    href = None
    caption = None
    canPlay = False

    link = aPattern.search(article, pos)
    if link is not None:
        href = link.group(1).strip()
        pos = link.end(0)
        start = pos
        icon = titleIconPattern.search(article, pos)
        if icon is not None:
            canPlay = icon.group(1) == 'play'
            # the title text begins after the icon's closing span
            start = article.find(spanClose, pos) + len(spanClose)
        end = article.find(anchorClose, start)
        # NOTE(review): `end` is -1 when '</a>' is missing; the slice and
        # the returned position are then computed from -1 - confirm intent.
        # cut the title at the next '<span' if it precedes '</a>'
        nextSpan = article.find('<span', start)
        if -1 < nextSpan < end:
            end = nextSpan
        caption = article[start:end]
        for marker in ('<strong>', '</strong>'):
            caption = caption.replace(marker, '')
        caption = caption.strip()
        pos = end + len(anchorClose)

    self.title = stripHtml(caption)
    self.url = href
    self.playable = canPlay
    self.contentName = None
    if href is None:
        return pos

    if baseUrl is not None and href[0:len(baseUrl)] == baseUrl:
        self.url = href[len(baseUrl):]
    dot = href.rfind('.')
    if dot != -1:
        slash = href.rfind('/')
        if slash != -1:
            self.contentName = href[slash + 1:dot]
    return pos
def _parseClusters(self):
    """Walk all lists in ``self.content`` and dispatch each to its parser.

    The fallback title comes from ``fallbackTitlePattern`` or, when that
    does not match, from ``fallbackTitlePattern2``; it is run through
    ``stripTag('span', ...)`` and ``stripHtml`` and stored on
    ``self.fallbackTitle``. Module, post-content and stage-teaser lists go
    to ``_parseModule`` with their own patterns and module type. Group
    persons lists are skipped, and everything else goes to
    ``_parseCluster``.
    """
    cursor = 0
    fallback = None
    heading = fallbackTitlePattern.search(self.content, cursor)
    if heading is None:
        heading = fallbackTitlePattern2.search(self.content, cursor)
    if heading is not None:
        fallback = stripHtml(stripTag('span', heading.group(1)))
        cursor = heading.end(0)
    self.fallbackTitle = fallback

    nextList = listPattern.search(self.content, cursor)
    while nextList is not None:
        cursor = nextList.end(0)
        cssClass = nextList.group(1)

        if self._isModule(cssClass):
            moduleArgs = (moduleItemPattern, moduleItemTextPattern,
                          MODULE_TYPE_DEFAULT)
        elif self._isPostContent(cssClass):
            moduleArgs = (postContentPattern, moduleItemTextPattern,
                          MODULE_TYPE_POST_CONTENT)
        elif self._isStageTeaser(cssClass):
            moduleArgs = (stageTeaserPattern, stageTeaserTextPattern,
                          MODULE_TYPE_STAGE_TEASER)
        elif self._isGroupPersons(cssClass):
            # just skip group persons, no teasers in this section
            nextList = listPattern.search(self.content, cursor)
            continue
        else:
            nextList = self._parseCluster(cursor, cssClass, fallback)
            continue

        itemPattern, itemTextPattern, moduleType = moduleArgs
        nextList = self._parseModule(cursor, itemPattern, itemTextPattern,
                                     moduleItemDatePattern, moduleType)
def parseTitle(self, article, pos, baseUrl):
    """Extract link URL, title text and playable flag from ``article``.

    Sets ``self.title``, ``self.url``, ``self.playable`` and
    ``self.contentName``. ``self.playable`` is true when the first group
    of ``titleIconPattern`` equals ``'play'``. The title text is passed
    through ``cleanTags`` and ``stripHtml``. When the URL starts with
    ``baseUrl`` that prefix is removed from ``self.url``.
    ``self.contentName`` is the part of the unmodified URL between its
    last ``'/'`` and its last ``'.'``, or ``None`` if either is missing.

    Returns the position after the parsed title, or ``pos`` unchanged when
    ``aPattern`` does not match.
    """
    anchorClose = '</a>'
    spanClose = '</span>'
    href = None
    caption = None
    canPlay = False

    link = aPattern.search(article, pos)
    if link is not None:
        href = link.group(1).strip()
        pos = link.end(0)
        start = pos
        icon = titleIconPattern.search(article, pos)
        if icon is not None:
            canPlay = icon.group(1) == 'play'
            # the title text begins after the icon's closing span
            start = article.find(spanClose, pos) + len(spanClose)
        end = article.find(anchorClose, start)
        # NOTE(review): `end` is -1 when '</a>' is missing; the slice and
        # the returned position are then computed from -1 - confirm intent.
        # check for '<span class="arrowhover ...'
        hover = article.find('<span class="arrowhover', start)
        if -1 < hover < end:
            end = hover
        caption = cleanTags(article[start:end]).strip()
        pos = end + len(anchorClose)

    self.title = stripHtml(caption)
    self.url = href
    self.playable = canPlay
    self.contentName = None
    if href is None:
        return pos

    if baseUrl is not None and href[0:len(baseUrl)] == baseUrl:
        self.url = href[len(baseUrl):]
    dot = href.rfind('.')
    if dot != -1:
        slash = href.rfind('/')
        if slash != -1:
            self.contentName = href[slash + 1:dot]
    return pos