Example #1
    def train(self, read_article_ids=None, unread_article_ids=None):
        """
        Trains the DecisionTree Classifier.
        read_article_ids should be an iterable over read article ids
        unread_article_ids should be an iterable over unread article ids
        
        If one is None it will be loaded from database.
        """
        
        # Load user feedback if needed.
        if read_article_ids is None:
            read_article_ids = set(r.article.id
                                   for r in ReadArticleFeedback.objects(user_id=self.user.id).only("article"))
        else:
            read_article_ids = set(read_article_ids)

        # Get all articles the user did not read.
        if unread_article_ids is None:
            ranked_article_ids = (a.article.id
                                  for a in RankedArticle.objects(user_id=self.user.id).only("article"))
            all_article_ids = set(a.id
                                  for a in Article.objects(id__in=ranked_article_ids).only("id"))
            unread_article_ids = all_article_ids - read_article_ids
        
        # Convert all article features.
        all_articles, marks = self._get_samples(read_article_ids, 
                                                unread_article_ids,
                                                p_synthetic_samples=self.p_synthetic_samples,
                                                p_majority_samples=self.p_majority_samples)

        logger.debug("Learn on %d samples." % len(marks))            

        self.clf = tree.DecisionTreeClassifier()
        self.clf.fit(all_articles, marks)
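A minimal usage sketch (not from the original source): UserModelTree and its constructor arguments are illustrative assumptions about the class that owns this train method.

# Hypothetical usage; user is a previously loaded User document.
model = UserModelTree(user)

# Let train() pull the read/unread article ids from the database:
model.train()

# Or pass precomputed id iterables to skip the database queries:
model.train(read_article_ids=[1, 2, 3], unread_article_ids=[4, 5, 6])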
Example #2
def get_article_samples(config_):
    # Connect to the mongo database.
    logger.info("Connect to database...")
    connect(config_['database']['db-name'],
            username=config_['database']['user'],
            password=config_['database']['passwd'],
            port=config_['database']['port'])
    
    # Get the user.
    user = User.objects(email=u"*****@*****.**").first()
    
    ranked_article_ids = (a.article.id
                          for a
                          in RankedArticle.objects(user_id=user.id).only("article"))
    all_article_ids = set(a.id
                          for a
                          in Article.objects(id__in=ranked_article_ids).only("id"))

    read_article_ids = set(a.article.id
                           for a
                           in ReadArticleFeedback.objects(user_id=user.id).only("article"))
    
    unread_article_ids = all_article_ids - read_article_ids
    
    # Sample test articles. Note that extractor is not defined in this
    # function; it is assumed to be available at module level.
    X, y = get_samples(extractor, read_article_ids, unread_article_ids)
    
    return X, y
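A minimal call sketch, assuming config_ is a nested dict loaded from a YAML file with the 'database' keys accessed above (the file name is illustrative):

import yaml

with open("config.yaml") as f:
    config_ = yaml.safe_load(f)

X, y = get_article_samples(config_)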
Example #3
    def train(self, read_article_ids=None, unread_article_ids=None):
        """
        Trains the Bayes Classifier.
        read_article_ids should be an iterable over read article ids
        unread_article_ids should be an iterable over unread article ids
        
        If one is None it will be loaded from database.
        """
        
        # Load user feedback if needed.
        if read_article_ids is None:
            read_article_ids = set(r.article.id for r
                                   in ReadArticleFeedback.objects(user_id=self.user.id).only("article"))
        else:
            read_article_ids = set(read_article_ids)
        
        logger.info("Use %d read articles for learning." % len(read_article_ids))
        read_articles = Article.objects(id__in=read_article_ids)

        # Get all articles the user did not read.
        if unread_article_ids is None:
            ranked_article_ids = (a.article.id for a in RankedArticle.objects(user_id=self.user.id).only("article"))
            all_article_ids = set(a.id for a in Article.objects(id__in=ranked_article_ids).only("id"))
            unread_article_ids = all_article_ids - read_article_ids
            
        # Undersample unreads.
        logger.info("Use %d unread articles for learning." % len(unread_article_ids))
        
        unread_articles = Article.objects(id__in=unread_article_ids)
        
        # Convert all article features.
        all_articles = UserModelBayes.AllArticles(read_articles, unread_articles, self.get_features)

        self.clf.fit(np.array(list(all_articles)), np.array(list(all_articles.get_marks())))
Example #4
    def get_top_articles(self, date, min_rating):
        '''
        Returns a list of articles from the given date with a rating of at
        least min_rating.
        '''

        # Get all articles from the specific date.
        articles_from_date = Article.objects(date__gte=date.date(),
                                             date__lt=date.date() + timedelta(days=1))

        # Get all ranked articles from the loaded articles.
        return [a.article for a in RankedArticle.objects(user_id=self.mongodb_user.id,
                                                         rating__gte=min_rating,
                                                         article__in=articles_from_date)]
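A minimal usage sketch (illustrative only; assumes an open mongoengine connection and user_model, an instance of the class that owns this method):

from datetime import datetime

# Hypothetical call: articles from 2013-05-01 rated at least 3.0.
top_articles = user_model.get_top_articles(datetime(2013, 5, 1), min_rating=3.0)
print "Found %d top articles." % len(top_articles)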
Example #5
    def train(self, read_article_ids=None, unread_article_ids=None):
        '''
        Trains several SVM and Naive Bayes classifiers.

        read_article_ids should be an iterable over read article ids;
        unread_article_ids should be an iterable over unread article ids.

        If either is None, it is loaded from the database.
        '''
        
        # Load user feedback if needed.
        if read_article_ids is None:
            read_article_ids = set(r.article.id
                                   for r
                                   in ReadArticleFeedback.objects(user_id=self.user.id).only("article"))
        else:
            read_article_ids = set(read_article_ids)

        # Get all articles the user did not read.
        if unread_article_ids is None:
            ranked_article_ids = (a.article.id
                                  for a
                                  in RankedArticle.objects(user_id=self.user.id).only("article"))
            all_article_ids = set(a.id
                                  for a
                                  in Article.objects(id__in=ranked_article_ids).only("id"))
            unread_article_ids = all_article_ids - read_article_ids
        
        # Each classifier gets its own over-/undersampling rates; the pairs
        # below reproduce the original hand-written parameter dicts.
        svm_settings = [(100, 200), (200, 300), (300, 400), (400, 500), (500, 600)]
        bayes_settings = [(100, 100), (100, 200), (300, 500), (600, 600)]

        classifiers = ([lambda: svm.SVC(kernel='rbf')] * len(svm_settings) +
                       [GaussianNB] * len(bayes_settings))

        parameters = [{'read_article_ids': read_article_ids,
                       'unread_article_ids': unread_article_ids,
                       'p_synthetic_samples': p_synthetic,
                       'p_majority_samples': p_majority,
                       'k': 10}
                      for p_synthetic, p_majority in svm_settings + bayes_settings]
        
        self._call_classifiers(classifiers, parameters)
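The source shown here does not include _call_classifiers; the following is a speculative sketch of what it might do, assuming it pairs each classifier factory with its parameter dict and reuses a _get_samples helper like the one in Example #1 (every name and signature below is an assumption, not the project's actual code):

    def _call_classifiers(self, classifiers, parameters):
        # Speculative sketch: fit one estimator per parameter set.
        # Assumes _get_samples accepts the keyword names used in the dicts.
        self.clfs = []
        for make_clf, params in zip(classifiers, parameters):
            X, y = self._get_samples(params['read_article_ids'],
                                     params['unread_article_ids'],
                                     p_synthetic_samples=params['p_synthetic_samples'],
                                     p_majority_samples=params['p_majority_samples'],
                                     k=params['k'])
            clf = make_clf()
            clf.fit(X, y)
            self.clfs.append(clf)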
Example #6
    connect(
        config_["database"]["db-name"],
        username=config_["database"]["user"],
        password=config_["database"]["passwd"],
        port=config_["database"]["port"],
    )

    # Load feature extractor
    # feature_extractor = EsaFeatureExtractor(prefix = config_['prefix'])
    # feature_extractor = TfidfFeatureExtractor(prefix = config_['prefix'])
    # feature_extractor = LdaFeatureExtractor(prefix = config_['prefix'])
    # feature_extractor = LdaBowFeatureExtractor(prefix = config_['prefix'])
    feature_extractor = cEsaFeatureExtractor(prefix=config_["prefix"])

    # get user
    user = User.objects(email=u"*****@*****.**").first()

    ranked_article_ids = (a.article.id for a in RankedArticle.objects(user_id=user.id).only("article"))
    all_article_ids = set(a.id for a in Article.objects(id__in=ranked_article_ids).only("id"))

    read_article_ids = set(a.article.id for a in ReadArticleFeedback.objects(user_id=user.id).only("article"))

    unread_article_ids = all_article_ids - read_article_ids

    for p_synthetic in xrange(100, 700, 100):
        for p_majority in xrange(100, 700, 100):

            logger.info("Synthetic over-sampling %d and majority undersampling %d" % (p_synthetic, p_majority))

            # run test N_ITERATIONS
            precisions_read = np.zeros(N_ITERATIONS)
            recalls_read = np.zeros(N_ITERATIONS)
            f1_scores_read = np.zeros(N_ITERATIONS)
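The snippet is truncated here; a speculative continuation, assuming each iteration yields precision/recall/F1 scores that are then averaged (run_single_evaluation is an invented placeholder name):

            for i in xrange(N_ITERATIONS):
                # Hypothetical per-iteration train/evaluate step.
                precisions_read[i], recalls_read[i], f1_scores_read[i] = \
                    run_single_evaluation(p_synthetic, p_majority)

            logger.info("Mean precision %.3f, recall %.3f, F1 %.3f" %
                        (precisions_read.mean(), recalls_read.mean(),
                         f1_scores_read.mean()))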
Example #7
    connect(config_['database']['db-name'],
            username=config_['database']['user'],
            password=config_['database']['passwd'],
            port=config_['database']['port'])
    
    # Load feature extractor.
    #feature_extractor = EsaFeatureExtractor(prefix = config_['prefix'])
    #feature_extractor = TfidfFeatureExtractor(prefix = config_['prefix'])
    #feature_extractor = LdaFeatureExtractor(prefix = config_['prefix'])
    #feature_extractor = LdaBowFeatureExtractor(prefix = config_['prefix'])
    feature_extractor = cEsaFeatureExtractor(prefix=config_['prefix'])
    
    # Get the user.
    user = User.objects(email=u"*****@*****.**").first()
    
    ranked_article_ids = (a.article.id
                          for a
                          in RankedArticle.objects(user_id=user.id).only("article"))
    all_article_ids = set(a.id
                          for a
                          in Article.objects(id__in=ranked_article_ids).only("id"))

    read_article_ids = set(a.article.id
                           for a
                           in ReadArticleFeedback.objects(user_id=user.id).only("article"))
    
    unread_article_ids = all_article_ids - read_article_ids

    for p_synthetic in xrange(100, 700, 100):
        for p_majority in xrange(100, 700, 100): 
            
            logger.info("Synthetic over-sampling %d and majority undersampling %d" %
                        (p_synthetic, p_majority))
Example #8
    def test_constructor_with_file_wikicorpus(self):
        
        # Load tf-idf model.
        tfidf_model = tfidfmodel.TfidfModel.load("/media/sdc1/test_dump/result/test_tfidf.model")
        extractor = TfidfFeatureExtractor("/media/sdc1/test_dump/result/test")

        # Load tf-idf corpus.
        tfidf_corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')

        # Load lda corpus.
        #lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')

        # Load dictionary.
        id2token = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")

        # Load article titles.
        document_titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")

        # Connect to the mongo database.
        connect(self.config_['database']['db-name'],
                username=self.config_['database']['user'],
                password=self.config_['database']['passwd'],
                port=self.config_['database']['port'])

        # Load articles as test corpus.
        user = User.objects(email=u"*****@*****.**").first()
        
        ranked_article_ids = (a.article.id
                              for a
                              in RankedArticle.objects(user_id=user.id).only("article"))
        all_article_ids = set(a.id
                              for a
                              in Article.objects(id__in=ranked_article_ids).only("id"))

        read_article_ids = set(a.article.id
                               for a
                               in ReadArticleFeedback.objects(user_id=user.id).only("article"))
        
        unread_article_ids = all_article_ids - read_article_ids

        # Sample test articles.
        X, y = get_samples(extractor, read_article_ids, unread_article_ids)
        
        s, f = X.shape
        logger.debug("Training with %d samples, %d features, %d marks" %
                     (s, f, len(y)))

        # Train ESA model.
        esa_model = CosineEsaModel(tfidf_corpus,
                                   document_titles=document_titles,
                                   test_corpus=X,
                                   test_corpus_targets=y,
                                   num_test_corpus=len(y),
                                   num_best_features=15,
                                   num_features=len(id2token))
        
        print esa_model
        
        esa_model.save('/media/sdc1/test_dump/result/test_cesa.model')
        
        tmp_esa = CosineEsaModel.load('/media/sdc1/test_dump/result/test_cesa.model') 
        print tmp_esa