Exemplo n.º 1
0
    def get_docs_from_xml(self, root):
        """Build ``Document`` objects for the not-yet-seen ``item`` elements under ``root``.

        Every child of ``root`` is searched for ``item`` elements. For each
        item a new ``Document`` is filled with title, download/publish dates,
        link, HTML-stripped summary, guid hash and provider name. Items whose
        guid hash is already in ``self.processed_guids`` are skipped; new ones
        are recorded there and ``self.document_count`` is incremented.

        Args:
            root: Parsed XML element whose children contain ``item`` elements.

        Returns:
            List of the newly created ``Document`` objects, in iteration order.
        """
        docs = []
        for channel in root:
            for item in channel.findall("item"):
                new_doc = Document()
                # findtext() gives None for a missing element and may give an
                # empty value for an empty one; both collapse to "" here
                # instead of raising AttributeError on ``None.text``.
                new_doc.title = item.findtext("title") or ""

                new_doc.download_date = datetime.now(tz.tzutc())
                # Only hand real text to the date parser; a missing or empty
                # pubDate falls back to the download time.
                pub_date_text = item.findtext("pubDate")
                parsed_date = dateparser.parse(pub_date_text, "") if pub_date_text else None
                new_doc.publish_date = parsed_date or new_doc.download_date
                if new_doc.publish_date.tzinfo is None or self.force_timezone:
                    new_doc.publish_date = new_doc.publish_date.replace(tzinfo=self.timezone)
                new_doc.publish_date = new_doc.publish_date.astimezone(tz.tzutc())

                new_doc.source_url = item.findtext("link") or ""

                new_doc.original_summary = strip_html(item.findtext("description") or "")

                # An Element without children is falsy, so testing the element
                # itself would never select the guid; test its text instead and
                # hash that text (not the Element, which has no .encode()).
                guid_text = (item.findtext("guid") or "").strip()
                guid_source = guid_text or new_doc.source_url
                new_doc.guid = hashlib.md5(guid_source.encode('utf-8')).hexdigest()
                new_doc.provider = self.name

                if new_doc.guid not in self.processed_guids:
                    self.processed_guids[new_doc.guid] = True
                    self.document_count += 1
                    docs.append(new_doc)

        return docs
Exemplo n.º 2
0
 def sentences(self):
     """Return the document's text as a list of ``Sentence`` objects.

     The first entry is built from the HTML-stripped ``self.title``. It is
     followed by one ``Sentence`` per non-blank, period-delimited piece of
     ``self.original_summary`` and then of ``self.content``, each passed
     through ``utils.strip_html``.

     Returns:
         List of ``Sentence`` objects: title first, then summary pieces,
         then content pieces.
     """
     tmp_sentences = [Sentence(utils.strip_html(self.title))]
     for text in (self.original_summary, self.content):
         # str.split() takes a literal separator, not a regex: '[.]' would
         # only match the three-character string "[.]", so split on '.'.
         tmp_sentences.extend(
             Sentence(utils.strip_html(s)) for s in text.split('.') if s.strip()
         )
     return tmp_sentences