示例#1
0
 def test_very_similar_docs(self):
     # Two documents with nearly the same wording must score above 0.3.
     first, second = Document(), Document()
     first.title = "russia invades ukraine"
     first.content = "russia invades ukraine"
     second.title = "russia invaded by ukraine"
     second.content = "russia invaded ukraine"

     comparator = self.state.repository.clustering.comparator
     score = comparator.similarity(first, second)
     logging.info("Similarity is %2.2f", score)
     self.assertGreater(score, 0.3)
示例#2
0
 def test_somewhat_different_docs(self):
     # Documents sharing only the word "portugal" (plus filler) must score
     # below 0.15.
     first, second = Document(), Document()
     first.title = "russia bought cars in portugal"
     first.content = "russia went to portugal and bought 5 new cars"
     second.title = "portugal is cool"
     second.content = "portugal is very very cool"

     repository = self.state.repository
     score = repository.clustering.comparator.similarity(first, second)
     logging.info("Similarity is %2.2f", score)

     # Log the weights of a few terms to help diagnose a failing threshold.
     logging.info('tf_idf of "is": %2.5f', repository.index.tf_idf("is"))
     logging.info('tf_idf of "portugal": %2.5f', repository.index.tf_idf("portugal"))
     logging.info('tf_idf of "russia": %2.5f', repository.index.tf_idf("russia"))

     self.assertLess(score, 0.15)
示例#3
0
 def test_very_different_docs(self):
     # Documents on unrelated subjects (only "is" in common) must score
     # below 0.05.
     first, second = Document(), Document()
     first.title = "russia invades ukraine"
     first.content = "russia is invading ukraine again"
     second.title = "portugal is cool"
     second.content = "portugal is very very cool"

     repository = self.state.repository
     score = repository.clustering.comparator.similarity(first, second)
     logging.info("Similarity is %2.2f", score)

     # Log the weights of a few terms to help diagnose a failing threshold.
     logging.info('tf_idf of "is": %2.5f', repository.index.tf_idf("is"))
     logging.info('tf_idf of "portugal": %2.5f', repository.index.tf_idf("portugal"))
     logging.info('tf_idf of "ukraine": %2.5f', repository.index.tf_idf("ukraine"))

     self.assertLess(score, 0.05)
示例#4
0
    def test_similar_docs_complex(self):
        # Two real-world headlines about the same event must score above 0.15.
        first, second = Document(), Document()
        first.title = "Fred Phelps, Head Of Westboro Baptist Church, Dies"
        first.content = "Fred Phelps, anti-gay activist and patriarch of the Westboro Baptist Church, has died at age 84. Frank Morris of KCUR reports on the interesting past of one of the most reviled men in America. "
        second.title = "Westboro Baptist Church Says Leader Fred Phelps 'Has Gone The Way of All Flesh'"
        second.content = "The Bible-thumping, anti-gay preacher was known for picketing funerals."

        repository = self.state.repository
        score = repository.clustering.comparator.similarity(first, second)

        # Words present in both documents.
        shared_words = first.words() & second.words()
        for term in shared_words:
            logging.info('tf_idf of "%s": %2.5f', term, repository.index.tf_idf(term))

        # Words present in exactly one of the two documents.
        unshared_words = first.words() ^ second.words()
        for term in unshared_words:
            logging.info('tf_idf of non intersection "%s": %2.5f', term, repository.index.tf_idf(term))

        logging.info("Similarity is %2.2f", score)
        self.assertGreater(score, 0.15)
示例#5
0
文件: feed.py 项目: ruiaf/sumnews
    def get_docs_from_xml(self, root):
        """Build Document objects for the not-yet-seen items under *root*.

        Every child of ``root`` is searched for ``item`` elements. Each item
        is turned into a Document with its title, link, HTML-stripped
        description, download/publish timestamps and an MD5 hex digest used
        as ``guid``. Items whose guid is already in ``self.processed_guids``
        are dropped; new ones are recorded there and counted in
        ``self.document_count``.

        Args:
            root: Parsed XML element whose children contain ``item`` elements.

        Returns:
            List of the newly seen Document objects, in feed order.
        """
        docs = []
        for channel in root:
            for item in channel.findall("item"):
                new_doc = Document()
                # findtext() gives None for a missing child and "" for an
                # empty one, so absent optional fields fall back to ""
                # instead of raising AttributeError on ``None.text``.
                new_doc.title = item.findtext("title") or ""

                new_doc.download_date = datetime.now(tz.tzutc())
                # A missing/empty pubDate, or one the parser maps to a falsy
                # result, falls back to the download time.
                pub_date_text = item.findtext("pubDate")
                parsed_date = dateparser.parse(pub_date_text, "") if pub_date_text else None
                new_doc.publish_date = parsed_date or new_doc.download_date
                if new_doc.publish_date.tzinfo is None or self.force_timezone:
                    new_doc.publish_date = new_doc.publish_date.replace(tzinfo=self.timezone)
                new_doc.publish_date = new_doc.publish_date.astimezone(tz.tzutc())

                new_doc.source_url = item.findtext("link") or ""

                new_doc.original_summary = strip_html(item.findtext("description") or "")

                # Hash the guid *text* when present, otherwise the link.
                # Testing the Element itself is wrong: an element without
                # children is falsy, and an Element has no ``encode``.
                guid_source = item.findtext("guid") or new_doc.source_url
                new_doc.guid = hashlib.md5(guid_source.encode('utf-8')).hexdigest()
                new_doc.provider = self.name

                if new_doc.guid not in self.processed_guids:
                    self.processed_guids[new_doc.guid] = True
                    self.document_count += 1
                    docs.append(new_doc)

        return docs