Пример #1
0
    def rss_file_true(self, file):
        """Append ``file`` to ``self.file_list`` when its name ends in ``.rss``.

        Names with any other suffix are ignored. Any exception raised while
        checking or recording the name is printed and swallowed, so the
        method always returns ``None``.
        """
        try:
            # Guard clause: nothing to record for non-RSS names.
            if not file.endswith('.rss'):
                return
            dprint(file, "RSS file:")
            self.file_list.append(file)
        except Exception as err:
            print("[RSS](rss_file_true) Error: %s" % err)
Пример #2
0
    def get_entries(self):
        """Rebuild the news dictionary and return it.

        Calls ``self.list_rss_file()`` and then hands back
        ``self.news_dict``.

        Returns:
            ``self.news_dict`` on success, or a new empty dict when any step
            raised (the error is printed, not propagated).
        """
        try:
            self.list_rss_file()
            dprint(
                "RESUME FROM ALL NEWS ITEMS IN RSS MERGED TO MAKE SLIDESHOW")
            dprint("%s" % self.news_dict)
            return self.news_dict
        except Exception as e:
            # Best-effort boundary: callers always get a dict back.
            # The tag names this method (it used to say "__init__"), so the
            # message points at the code that actually failed.
            print("[RSS](get_entries)Error: %s" % e)
            return {}

    #def get_entries
Пример #3
0
    def parse_rss_list(self, file_list_to_parse):
        """Parse every feed in ``file_list_to_parse`` and merge the results.

        Each item is passed to ``self.parse_rss_file`` and the value it
        returns is handed to ``self.merge_news_dict``. An empty list only
        produces a debug message. Any exception is printed and swallowed.
        """
        try:
            if len(file_list_to_parse) == 0:
                dprint("Can't find rss file, please check it.")
                return

            for rss_path in file_list_to_parse:
                banner = "\n\n[parse_rss_list] Analizing: {file_name}".format(
                    file_name=rss_path)
                dprint(banner)
                self.merge_news_dict(self.parse_rss_file(rss_path))

        except Exception as err:
            print("[RSS][parse_rss_list]Error: %s" % err)
Пример #4
0
    def list_rss_file(self, directory=None):
        """Download the RSS feed to a temporary file and parse it.

        The feed is fetched from a fixed URL into a temporary file. When the
        download succeeds, ``self.news_dict`` is cleared, and the file is
        added to ``self.file_list`` if its text contains ``'rss'``.
        ``self.file_list`` is then passed to ``self.parse_rss_list``.
        The temporary file is removed before returning, so the path left in
        ``self.file_list`` is only valid during this call.

        Args:
            directory: Unused; kept so existing callers keep working.

        Returns:
            None. All errors are printed and swallowed.
        """
        # Local import: only the cleanup step needs ``os`` and the file's
        # import block is not touched by this change.
        import os

        try:
            self.file_list = []
            url = "http://localhost/news/rss/"

            # delete=False keeps the file on disk after the handle closes, so
            # the name stays reserved for the download instead of being freed
            # (and reusable by another process) immediately.
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                rss_decoded = tmp.name

            try:
                try:
                    urllib.request.urlretrieve(url, rss_decoded)
                    # Old news is dropped only once a new download succeeded.
                    self.news_dict.clear()
                    with open(rss_decoded, 'rt', encoding='utf-8') as f:
                        if 'rss' in f.read():
                            self.file_list.append(rss_decoded)
                except Exception as e:
                    print("[RSS](list_rss_file)Can't get RSS from URL: %s" % e)

                dprint("List RSS files: %s" % self.file_list,
                       "[RSS](rss_file_true)")
                self.parse_rss_list(self.file_list)
            finally:
                # Parsing is finished (or failed); do not leak one temp file
                # per refresh.
                try:
                    os.remove(rss_decoded)
                except OSError:
                    pass

        except Exception as e:
            print("[RSS](list_rss_file)Error: %s" % e)
Пример #5
0
    def parse_rss_file(self, rssfile):
        try:
            rss = feedparser.parse(rssfile)
            feed_entries = rss.entries
            dprint_values = {
                'title': rss.feed.title,
                'link': rss.feed.link,
                'length': len(feed_entries)
            }
            dprint('''
########################################
Detail from RSS
Title: {title}
Link: {link}
News Item Number: {length}
########################################'''.format(**dprint_values))

            entry_element = 0
            element = {}
            for entry in feed_entries:
                dprint("\n\nNEWS ITEM {entry_number}".format(
                    entry_number=entry_element + 1))
                dprint(entry)
                dprint("------------------------------")
                article_subtitle = entry.subtitle if 'subtitle' in entry else "None"
                if 'content' in entry:
                    content = entry.summary
                    content_type = entry.content.type if "type" in entry.content else "Null"
                    content_base = entry.content.base if "base" in entry.content else "Null"
                    content_value = entry.content.value if "value" in entry.content else "Null"
                else:
                    content = "None"

                dprint('''
Title: {0.title} - Link:[{0.link}]
Subtitle: {article_subtitle}
Id: {0.id}
Author: {0.author}
Published at: {0.published}
Updated: {0.updated}
Content: {content}
Content Type: {content_type}
Content Base: {content_base}
Content Value: {content_value}
Content XML: {content}
Description: {0.description}
				'''.format(entry,
                article_subtitle=article_subtitle,
                content=content,
                content_type=content_type,
                content_base=content_base,
                content_value=content_value))

                # Generate dictionary with content of news
                # parser content variable, because is a string
                dprint(
                    "------------------------------\nRESUME FROM THIS NEWS ITEM:"
                )

                element[entry_element] = {}
                soup = BeautifulSoup(content, "html.parser")
                element[entry_element]['title'] = entry.title
                element[entry_element]['type'] = "None"

                html = False
                for item in soup:
                    if 'kg-card-begin: html' in item or 'kg-card-begin: markdown' in item:
                        html = True
                        break
                if html:
                    element[entry_element]['text'] = ''
                    for item in soup:
                        if 'kg-card-begin: html' not in item and 'kg-card-end: html' not in item and 'kg-card-begin: markdown' not in item and 'kg-card-end: markdown' not in item:
                            try:
                                element[entry_element]['text'] = element[
                                    entry_element]['text'] + '%s' % (item)
                            except Exception as e:
                                print(
                                    "[RSS](parse_rss_file)(Class Beautiful)Error: %s"
                                    % e)
                    element[entry_element]['type'] = 'html'
                    dprint("Notice HTML: {}".format(
                        element[entry_element]['text']))
                else:
                    #Twitter is a text in rss news
                    for item in soup.find_all("p"):
                        try:
                            if soup.find_all("p"):
                                element[entry_element]['text'] = '%s' % (
                                    item.text.strip())
                                element[entry_element]['type'] = 'text'
                                dprint("Notice Text: {}".format(
                                    item.text.strip()))
                            else:
                                element[entry_element]['text'] = 'Not detected'
                                dprint("Notice Text: Not detected")
                        except Exception as e:
                            print(
                                "[RSS](parse_rss_file)(Class Beautiful)Error: %s"
                                % e)

                # two checks are necessary to confirm or not type
                if soup.find_all("video"):
                    if soup.find_all("video"):
                        for item in soup.find_all("video"):
                            try:
                                element[entry_element]['video'] = item['src']
                                element[entry_element]['type'] = 'video'
                                dprint("Notice Video: {}".format(item['src']))
                            except Exception as e:
                                dprint(
                                    "[RSS](parse_rss_file)(Class Beautiful)Error: %s"
                                    % e)
                    else:
                        element[entry_element]['video'] = 'Not detected'
                        dprint("Notice Video: Not detected")

                # two checks are necessary to confirm or not type
                # support to YouTube and Vimeo
                if soup.find_all("iframe"):
                    if soup.find_all("iframe"):
                        for item in soup.find_all("iframe"):
                            try:
                                if 'youtube' in item['src']:
                                    element[entry_element]['type'] = 'youtube'
                                    element[entry_element]['src'] = item['src']
                                    if 'embed/' in item['src']:
                                        element[entry_element]['id'] = item[
                                            'src'].split("embed/")[1].split(
                                                "?")[0]
                                    else:
                                        element[entry_element]['id'] = None
                                    element[entry_element]['width'] = item[
                                        'width']
                                    element[entry_element]['height'] = item[
                                        'height']
                                else:
                                    element[entry_element]['iframe'] = item
                                    element[entry_element]['type'] = 'iframe'

                                dprint(
                                    "Element Entry YouTube/iframe: {}".format(
                                        element[entry_element]))
                                dprint("Notice Iframe: {}".format(item))

                            except Exception as e:
                                dprint(
                                    "[RSS](parse_rss_file)(Class Beautiful)Error: %s"
                                    % e)
                    else:
                        element[entry_element]['iframe'] = 'Not detected'
                        dprint("Notice Iframe: Not detected")

                if soup.find_all("figure"):
                    if soup.find_all("img"):
                        # List of images add support to images gallery
                        element[entry_element]['image'] = []
                        element[entry_element]['type'] = 'image'
                        for item in soup.find_all("img"):
                            try:
                                element[entry_element]['image'].append(
                                    item['src'])
                            except Exception as e:
                                dprint(
                                    "[RSS](parse_rss_file)(Class Beautiful)Error: %s"
                                    % e)
                        dprint("Notice Image: %s" % (element[entry_element]))
                    else:
                        element[entry_element]['image'] = 'Not detected'
                        if element[entry_element]['type'] == 'iframe':
                            pass
                        elif element[entry_element]['type'] == 'text':
                            pass
                        else:
                            dprint("Notice Image: Not detected")

                entry_element = entry_element + 1

            dprint('\n\n')

            return element

        except Exception as e:
            print("[RSS](parse_rss_file)Error: %s" % e)