예제 #1
0
파일: news.py 프로젝트: soldierkam/pynews
    def __init__(
        self,
        dir,
        testDir=None,
        doTest=True,
        ignoreKlass=None,
        includeKlass=None,
        extractor="ArticleExtractor",
        useHtml=False,
    ):
        """Build and train a Naive Bayes text classifier from the RSS corpus.

        Reads every document of every class, tokenizes it (optionally
        extracting plain text from HTML first), builds per-class frequency
        distributions, trains the classifier and — when *doTest* is true —
        evaluates it on the held-out documents, logging misclassifications
        and a confusion matrix.

        :param dir: corpus root directory passed to ``RssDataReader``
            (parameter name kept for caller compatibility even though it
            shadows the builtin ``dir``).
        :param testDir: optional directory with held-out test documents.
        :param doTest: run the evaluation pass after training.
        :param ignoreKlass: iterable of class ids to skip (default: none).
        :param includeKlass: if given, restrict training to these class ids.
        :param extractor: boilerpipe extractor name, used when *useHtml*.
        :param useHtml: documents are raw HTML and need text extraction.
        """
        # BUG FIX: the previous default ``ignoreKlass=[]`` was a mutable
        # default argument shared across all calls; use a None sentinel.
        if ignoreKlass is None:
            ignoreKlass = []
        RssDataReader.__init__(self, dir, testDir)
        logger.info("Start building " + self.__class__.__name__)
        self.__mutex = threading.Semaphore()

        freqDists = {}
        ignore = stopwords.words("english")
        features = set()
        klassSize = {}
        documentsWithLabel = []
        for klassId in self.klasses(ignoreKlass, includeKlass):
            freqDist = FreqDist()
            size = 0
            for url, document in self.documents(klassId, useHtml):
                try:
                    txt = document if not useHtml else Extractor(extractor=extractor, html=document).getText()
                    documentsWithLabel.append((txt, klassId))
                    txt = tokenize(txt)
                    size += 1
                    # Count only alphanumeric, non-stopword tokens as features.
                    for part in txt:
                        if part.isalnum() and part not in ignore:
                            freqDist.inc(part)
                            features.add(part)
                # BUG FIX: was a bare ``except:`` which also swallowed
                # SystemExit/KeyboardInterrupt; keep the best-effort skip of
                # broken documents but catch only Exception, with traceback.
                except Exception:
                    logger.exception(u"Url: " + url)
            freqDists[klassId] = freqDist
            klassSize[klassId] = size

        # Shuffle so training does not see documents grouped by class.
        random.shuffle(documentsWithLabel)

        self.__featuresGenerator = FeatureGenerator(freqDists, features, klassSize)
        trainset = apply_features(self.__featuresGenerator, documentsWithLabel)
        self.__classifier = NaiveBayesClassifier.train(trainset)
        logger.info(u"Classifier learned (set size=" + unicode(len(trainset)) + u")")
        if doTest:
            ref = []
            test = []
            # (text, correct class, url) for every held-out document; url is
            # kept only for error reporting below.
            testDocumentsWithLabel = [
                (
                    document if not useHtml else Extractor(extractor=extractor, html=document).getText(),
                    correctKlass,
                    url,
                )
                for correctKlass in self.klasses(ignoreKlass, includeKlass)
                for url, document in self._testDocuments(correctKlass, useHtml)
            ]
            for doc, cat, url in testDocumentsWithLabel:
                ans = self.__classifier.classify(self.__featuresGenerator(doc))
                ref.append(cat)
                test.append(ans)
                if ans != cat:
                    logger.info(u"Wrong " + ans + u"(" + cat + u"):\t" + url + u" " + doc.replace("\n", " "))
            logger.info("\n" + ConfusionMatrix(ref, test).pp())
            self.__classifier.show_most_informative_features(n=300)
예제 #2
0
파일: lang.py 프로젝트: soldierkam/pynews
#        logger.info(u"Lang of \"" + u + u"\" is " + unicode(ld.detect(text)))


# Script entry point: either rebuild the language-id corpus (-s) or run the
# language detector over every stored document, collecting expected vs.
# detected labels. NOTE(review): SOURCE is truncated here — the code that
# consumes ref/response (presumably a confusion matrix) is not visible.
if __name__ == "__main__":
    #
    #
    # Corpus class id -> expected language code for that class.
    klass2Lang = {"us": "en", "nl_nl": "nl", "fr": "fr", "de": "de","es": "es", "pt-PT_pt": "pt", "pl_pl": "pl", "ru_ru": "ru", "it": "it", "tr_tr": "tr", "cn": "cn"}
    # Classes whose language the detector is not expected to identify;
    # their expected answer is remapped to "n/k" (not known) below.
    unknownLangs = ["cn"]

    if len(sys.argv) > 1 and sys.argv[1] == "-s":
        # "-s" flag: (re)build the corpus, one class per language.
        RssAnalyzer("/media/eea1ee1d-e5c4-4534-9e0b-24308315e271/pynews/langid/", langs=klass2Lang.keys(), langAsKlass=True)
    else:
        ld = LangDetect.instance()
        # Sanity probes: log what the detector says for markup-like text.
        logger.info(ld.detect(u"li ul li ul li ul"))
        logger.info(ld.detect(u"<li> <ul> <li> <ul><li> <ul><li> <ul><li> <ul>"))
        data = RssDataReader("/media/eea1ee1d-e5c4-4534-9e0b-24308315e271/pynews/langid/")
        ref = []  # expected language per document
        response = []  # detected language per document
        for klass in data.klasses():
            correctAns = klass2Lang[klass]
            if correctAns in unknownLangs:
                correctAns = "n/k"
            for url, doc in data.documents(klass):
                # Skip documents with no content.
                if not doc:
                    logger.info("Empty: " + url)
                    continue
                # NOTE(review): ``extractor`` is assigned but never used in
                # the visible lines — detection runs on the raw ``doc``;
                # confirm against the full file whether this is intentional.
                extractor = Extractor(extractor='ArticleExtractor', html=doc)
                ref.append(correctAns)
                ans = ld.detect(doc)
                # Normalize a falsy detection result to "n/k".
                ans = ans if ans else "n/k"
                response.append(ans)