示例#1
0
    def parseTeaserInfo(self, article, pos, pattern=teaserInfoPattern):
        """Extract duration, season/episode and genre from the teaser info.

        Searches ``article`` from ``pos`` with ``pattern``. On a hit, the
        surrounding ``dd`` tag is fetched via ``getTag``, cut off at the first
        ``teaserInfoIsTiviPattern`` match (if any), cleaned with ``cleanTags``
        and split on the UTF-8 encoded middle dot (U+00B7). Each piece is then
        classified as duration, season/episode or - failing both - genre.

        Side effects:
            ``self.duration``, ``self.season`` and ``self.episode`` are always
            overwritten (with ``None`` when nothing was found).
            ``self.genre`` is only written while it is still ``None``.
            ``self.playable`` is only raised to ``True``, never lowered.

        Returns:
            The end offset of the ``pattern`` match, or ``pos`` unchanged when
            ``pattern`` did not match.
        """
        infoMatch = pattern.search(article, pos)
        canPlay = False
        length = None
        seasonNo = None
        episodeNo = None
        category = None

        if infoMatch is not None:
            info = getTag('dd', article, infoMatch)
            tiviMatch = teaserInfoIsTiviPattern.search(info)
            if tiviMatch is not None:
                # Drop everything from the tivi marker onwards.
                info = info[:tiviMatch.start(0)]
            info = cleanTags(info)

            for chunk in info.split(u'\xb7'.encode('utf-8')):
                chunk = chunk.strip()

                durationMatch = teaserInfoDurationPattern.search(chunk)
                if durationMatch is not None:
                    length = durationMatch.group(1)
                    continue

                episodeMatch = teaserInfoEpisodePattern.search(chunk)
                if episodeMatch is not None:
                    seasonNo = episodeMatch.group(1)
                    episodeNo = episodeMatch.group(2)
                    continue

                # Neither duration nor episode info: treated as the genre.
                category = chunk

            if length is not None and length.isdigit():
                # Presumably minutes -> seconds; a numeric duration also
                # marks the teaser as playable.
                length = int(length) * 60
                canPlay = True

            pos = infoMatch.end(0)

        if not self.playable and canPlay:
            self.playable = True
        self.duration = length
        if self.genre is None:
            self.genre = category
        self.season = seasonNo
        self.episode = episodeNo
        return pos
示例#2
0
    def parse(self, string, pos=0, baseUrl=None, teaserMatch=None):
        """Parse one teaser ``article`` element out of ``string``.

        Args:
            string: Markup to search for the teaser.
            pos: Offset in ``string`` at which the search for the teaser
                starts; only used when ``teaserMatch`` is not supplied.
            baseUrl: Passed through to ``parseTitle``.
            teaserMatch: Optional, already computed ``teaserPattern`` match.

        Returns:
            ``-1`` when no teaser is found, otherwise
            ``teaserMatch.start(0) + len(article)`` as offset into ``string``.
            Teasers whose class contains ``m-hidden`` are skipped without
            parsing their details.
        """
        if teaserMatch is None:
            teaserMatch = teaserPattern.search(string, pos)
        if teaserMatch is None:
            return -1
        class_ = teaserMatch.group(1)

        article = getTag('article', string, teaserMatch)
        endPos = teaserMatch.start(0) + len(article)
        if 'm-hidden' in class_:
            return endPos

        # The sub-parsers search inside `article`, whereas `pos` is an offset
        # into `string`. Re-using `pos` here made every search start too far
        # into (or beyond) the article whenever `pos` was non-zero, so the
        # chain starts at the beginning of the article instead.
        # NOTE(review): assumes getTag() returns markup beginning at
        # teaserMatch.start(0), consistent with how endPos is computed.
        articlePos = 0
        articlePos = self.parseImage(article, articlePos)
        articlePos = self.parseLabel(article, articlePos)
        articlePos = self.parseCategory(article, articlePos)
        articlePos = self.parseTitle(article, articlePos, baseUrl)
        articlePos = self.parseText(article, articlePos)
        self.parseDate(article, articlePos)

        return endPos
示例#3
0
    def parse(self, string, pos=0, baseUrl=None, teaserMatch=None):
        """Parse one teaser ``article`` element out of ``string``.

        Args:
            string: Markup to search for the teaser.
            pos: Offset in ``string`` at which the search for the teaser
                starts; only used when ``teaserMatch`` is not supplied.
            baseUrl: Passed through to ``parseTitle``.
            teaserMatch: Optional, already computed ``teaserPattern`` match.

        Returns:
            ``-1`` when no teaser is found, otherwise
            ``teaserMatch.start(0) + len(article)`` as offset into ``string``.
            Teasers whose class contains ``m-hidden`` are skipped without
            parsing their details.
        """
        if teaserMatch is None:
            teaserMatch = teaserPattern.search(string, pos)
        if teaserMatch is None:
            return -1
        class_ = teaserMatch.group(1)

        article = getTag('article', string, teaserMatch)
        endPos = teaserMatch.start(0) + len(article)
        if 'm-hidden' in class_:
            return endPos

        # The sub-parsers search inside `article`, whereas `pos` is an offset
        # into `string`. Re-using `pos` here made every search start too far
        # into (or beyond) the article whenever `pos` was non-zero, so the
        # chain starts at the beginning of the article instead.
        # NOTE(review): assumes getTag() returns markup beginning at
        # teaserMatch.start(0), consistent with how endPos is computed.
        articlePos = 0
        articlePos = self.parseImage(article, articlePos)
        articlePos = self.parseCategory(article, articlePos)
        articlePos = self.parseTitle(article, articlePos, baseUrl)
        articlePos = self.parseText(article, articlePos)
        articlePos = self.parseLabel(article, articlePos)
        self.parseFoot(article, articlePos)

        return endPos
示例#4
0
    def parseLabel(self, article, pos):
        """Read the teaser label and its icon type from ``article``.

        Looks for ``labelPattern`` starting at ``pos``. On a hit the enclosing
        ``div`` is fetched with ``getTag``; the icon type is group 1 of
        ``iconPattern`` inside it, and the label is the text between the first
        ``>`` and the last ``</div>``, with ``abbr`` tags stripped and the
        remaining tags cleaned.

        Side effects:
            Sets ``self.label`` (passed through ``stripHtml``) and
            ``self.type``; both are derived from ``None`` when no label is
            found.

        Returns:
            ``pos`` unchanged when nothing matched, otherwise the index just
            past the last ``</div>`` within the label markup.
        """
        text = None
        iconType = None

        found = labelPattern.search(article, pos)
        if found is not None:
            markup = getTag('div', article, found)

            icon = iconPattern.search(markup)
            iconType = icon.group(1) if icon is not None else None

            start = markup.find('>') + 1
            end = markup.rfind('</div>')
            # NOTE(review): this offset is relative to the label markup, not
            # to `article` - confirm that callers expect this.
            pos = end + len('</div>')

            text = cleanTags(stripTag('abbr', markup[start:end])).strip()

        self.label = stripHtml(text)
        self.type = iconType
        return pos
    def parse(self):
        """Collect the navigation rubrics of the page into ``self.rubrics``.

        Runs the parent ``parse()`` first. If ``leftNavPattern`` is not found
        in ``self.content`` a warning is issued and the method returns
        without touching ``self.rubrics``. Otherwise every
        ``dropdownLinksPattern`` match after the navigation match yields one
        ``Rubric`` built from the stripped URL (group 1) and the HTML-stripped
        title (group 2).
        """
        super(NavigationResource, self).parse()

        navMatch = leftNavPattern.search(self.content)
        if navMatch is None:
            self.warn(
                "can't find navigation in page '{}', no rubrics will be available ...",
                self.url)
            return

        # The extracted list markup is not used further in this method.
        leftNav = getTag('ul', self.content, navMatch)

        self.rubrics = []
        offset = navMatch.end(0)
        while True:
            linkMatch = dropdownLinksPattern.search(self.content, offset)
            if linkMatch is None:
                break
            link = linkMatch.group(1).strip()
            caption = stripHtml(linkMatch.group(2))
            self.rubrics.append(Rubric(caption, link))
            # Continue right behind the link that was just consumed.
            offset = linkMatch.end(0)
    def parse(self):
        """Collect the distinct navigation rubrics into ``self.rubrics``.

        Runs the parent ``parse()`` first. If ``leftNavPattern`` is not found
        in ``self.content`` a warning is issued and the method returns
        without touching ``self.rubrics``. Otherwise every
        ``dropdownLinksPattern`` match after the navigation match is turned
        into a ``Rubric`` (URL from ``self.parseUrl`` on group 1, title from
        the HTML-stripped group 2). A URL that was already seen is skipped, so
        each URL appears at most once, in order of first occurrence.
        """
        super(NavigationResource, self).parse()
        leftNavMatch = leftNavPattern.search(self.content)
        if leftNavMatch is None:
            self.warn("can't find navigation in page '{}', no rubrics will be available ...", self.url)
            return

        # NOTE(review): the extracted list markup is not used below; the call
        # is kept in case getTag() is relied on to validate the markup.
        leftNav = getTag('ul', self.content, leftNavMatch)

        pos = leftNavMatch.end(0)
        dropdownLinksMatch = dropdownLinksPattern.search(self.content, pos)
        self.rubrics = []
        # Builtin set instead of the deprecated sets.Set (gone in Python 3).
        seenUrls = set()
        while dropdownLinksMatch is not None:
            url = self.parseUrl(dropdownLinksMatch.group(1))
            if url not in seenUrls:
                seenUrls.add(url)
                title = stripHtml(dropdownLinksMatch.group(2))
                self.rubrics.append(Rubric(title, url))
            # Continue right behind the link that was just consumed.
            pos = dropdownLinksMatch.end(0)
            dropdownLinksMatch = dropdownLinksPattern.search(self.content, pos)
示例#7
0
    def parseLabel(self, article, pos):
        """Read the teaser label and its icon type from ``article``.

        Looks for ``labelPattern`` starting at ``pos``. On a hit the enclosing
        ``div`` is fetched with ``getTag``; the icon type is group 1 of
        ``iconPattern`` inside it, and the label is the text between the first
        ``</span>`` and the last ``</div>``, with ``<strong>`` markers removed
        and ``abbr`` / ``span`` tags stripped.

        Side effects:
            Sets ``self.label`` (passed through ``stripHtml``) and
            ``self.type``; both are derived from ``None`` when no label is
            found.

        Returns:
            ``pos`` unchanged when nothing matched, otherwise the index just
            past the last ``</div>`` within the label markup.
        """
        text = None
        iconType = None

        found = labelPattern.search(article, pos)
        if found is not None:
            markup = getTag('div', article, found)

            icon = iconPattern.search(markup)
            iconType = icon.group(1) if icon is not None else None

            start = markup.find('</span>') + len('</span>')
            end = markup.rfind('</div>')
            # NOTE(review): this offset is relative to the label markup, not
            # to `article` - confirm that callers expect this.
            pos = end + len('</div>')

            text = markup[start:end].replace('<strong>', '').replace('</strong>', '')
            text = stripTag('span', stripTag('abbr', text)).strip()

        self.label = stripHtml(text)
        self.type = iconType
        return pos