Example #1
0
    def _get_samples(self,
                     read_article_ids,
                     unread_article_ids,
                     p_synthetic_samples=300,
                     p_majority_samples=500,
                     k=5):
        '''
        Assemble a normalized training set from read/unread articles,
        optionally under-sampling the unread (majority) class and
        over-sampling the read (minority) class via borderline-SMOTE.

        read_article_ids : Set
        unread_article_ids : Set
        p_synthetic_samples : Percentage of synthetic samples, 300 for 300%.
                              If None, no synthetic samples are created.
        p_majority_samples : Size of majority sample = p_majority_samples/n_minority_sample,
                             500 for 500%.
                             If None, under-sampling is not done.
        k : neighbourhood for k nearest neighbour, default 5.

        Returns
        -------
        array-like full vector samples, shape = [n_samples, n_features]
        array-like marks, shape = [n_samples]
        '''
        # Under-sample unread ids. random.sample requires an int size, so
        # keep the arithmetic integral (p/100 * len would be a float under
        # true division); multiplying before dividing also avoids the
        # premature truncation of p_majority_samples/100.
        if p_majority_samples is not None:
            n_majority = min(p_majority_samples * len(read_article_ids) // 100,
                             len(unread_article_ids))
            unread_article_ids = Set(sample(unread_article_ids, n_majority))

        # Create unread article vectors and their marks.
        unread_marks = numpy.empty(len(unread_article_ids))
        unread_marks.fill(UserModelSVM.UNREAD)
        unread_articles = numpy.empty(shape=(len(unread_article_ids),
                                             self.num_features_))

        for i, article in enumerate(Article.objects(id__in=unread_article_ids)):
            try:
                unread_articles[i, :] = self.get_features(article)[:]
            except AttributeError as e:
                # Best-effort: skip articles with missing attributes, keep row
                # as allocated (uninitialized) — matches original behaviour.
                logger.error("Article %s does not have attribute: %s."
                             % (article.id, e))

        # Create read article vectors and their marks.
        read_marks = numpy.empty(len(read_article_ids))
        read_marks.fill(UserModelSVM.READ)
        read_articles = numpy.empty(shape=(len(read_article_ids),
                                           self.num_features_))

        for i, article in enumerate(Article.objects(id__in=read_article_ids)):
            try:
                read_articles[i, :] = self.get_features(article)[:]
            except AttributeError as e:
                logger.error("Article %s does not have attribute: %s."
                             % (article.id, e))

        # Normalize the full sample matrix: read rows first, then unread.
        X = numpy.concatenate((read_articles, unread_articles))
        self._calculate_mean_and_std_deviation(X)
        X = self._normalize(X)

        y = numpy.concatenate((read_marks, unread_marks))
        if p_synthetic_samples is None:
            return X, y

        # borderline-SMOTE over-samples the minority (READ) class on the
        # normalized data.
        new_read_articles, synthetic_read_articles, danger_read_articles = \
            borderlineSMOTE(X=X,
                            y=y,
                            minority_target=UserModelSVM.READ,
                            N=p_synthetic_samples, k=k)

        # Everything returned by borderlineSMOTE belongs to the READ class.
        synthetic_marks = numpy.empty(len(synthetic_read_articles))
        synthetic_marks.fill(UserModelSVM.READ)

        read_marks = numpy.empty(len(new_read_articles))
        read_marks.fill(UserModelSVM.READ)

        danger_read_marks = numpy.empty(len(danger_read_articles))
        danger_read_marks.fill(UserModelSVM.READ)

        # FIX: use the normalized unread rows from X. The original returned
        # the raw unread_articles here, mixing unnormalized rows with the
        # normalized SMOTE output.
        normalized_unread_articles = X[len(read_articles):]

        logger.info("Use %d read, %d unread, %d danger reads and %d synthetic samples." %
                    (len(read_marks), len(unread_marks),
                     len(danger_read_marks), len(synthetic_marks)))

        return (numpy.concatenate((new_read_articles,
                                   synthetic_read_articles,
                                   danger_read_articles,
                                   normalized_unread_articles)),
                numpy.concatenate((read_marks,
                                   synthetic_marks,
                                   danger_read_marks,
                                   unread_marks)))
Example #2
0
def get_samples(extractor,
                read_article_ids,
                unread_article_ids,
                p_synthetic_samples=300,
                p_majority_samples=500,
                k=5):
    '''
    Assemble a training set from read/unread articles, under-sampling the
    unread (majority) class and over-sampling the read (minority) class
    via borderline-SMOTE.

    extractor : feature extractor; provides get_feature_number() and is
                passed through to get_features()
    read_article_ids : Set
    unread_article_ids : Set
    p_synthetic_samples : Percentage of synthetic samples, 300 for 300%
    p_majority_samples : Size of majority sample relative to the read
                         (minority) class, 500 for 500%
    k : neighbourhood for k nearest neighbour, default 5

    Returns
    -------
    array-like full vector samples, shape = [n_samples, n_features]
    array-like marks, shape = [n_samples]
    '''
    # Under-sample unread ids. random.sample requires an int size, so keep
    # the arithmetic integral (p/100 * len would be a float under true
    # division); multiplying before dividing also avoids the premature
    # truncation of p_majority_samples/100.
    n_majority = min(p_majority_samples * len(read_article_ids) // 100,
                     len(unread_article_ids))
    unread_article_ids = Set(sample(unread_article_ids, n_majority))

    # Create unread article vectors and their marks.
    unread_marks = np.empty(len(unread_article_ids))
    unread_marks.fill(UNREAD)
    unread_articles = np.empty(shape=(len(unread_article_ids),
                                      extractor.get_feature_number()))

    for i, article in enumerate(Article.objects(id__in=unread_article_ids)):
        try:
            unread_articles[i, :] = get_features(article, extractor)[:]
        except AttributeError as e:
            # Best-effort: skip articles with missing attributes, keep row
            # as allocated (uninitialized) — matches original behaviour.
            logger.error("Article %s does not have attribute: %s."
                         % (article.id, e))

    # Create read article vectors and their marks.
    read_marks = np.empty(len(read_article_ids))
    read_marks.fill(READ)
    read_articles = np.empty(shape=(len(read_article_ids),
                                    extractor.get_feature_number()))

    for i, article in enumerate(Article.objects(id__in=read_article_ids)):
        try:
            read_articles[i, :] = get_features(article, extractor)[:]
        except AttributeError as e:
            logger.error("Article %s does not have attribute: %s."
                         % (article.id, e))

    # borderline-SMOTE over-samples the minority (READ) class.
    X = np.concatenate((read_articles, unread_articles))
    y = np.concatenate((read_marks, unread_marks))
    new_read_articles, synthetic_read_articles, danger_read_articles = \
        borderlineSMOTE(X=X,
                        y=y,
                        minority_target=READ,
                        N=p_synthetic_samples, k=k)

    # Everything returned by borderlineSMOTE belongs to the READ class.
    synthetic_marks = np.empty(len(synthetic_read_articles))
    synthetic_marks.fill(READ)

    read_marks = np.empty(len(new_read_articles))
    read_marks.fill(READ)

    danger_read_marks = np.empty(len(danger_read_articles))
    danger_read_marks.fill(READ)

    logger.info("Use %d read, %d unread, %d danger reads and %d synthetic samples." %
                (len(read_marks), len(unread_marks),
                 len(danger_read_marks), len(synthetic_marks)))

    return (np.concatenate((new_read_articles,
                            synthetic_read_articles,
                            danger_read_articles,
                            unread_articles)),
            np.concatenate((read_marks,
                            synthetic_marks,
                            danger_read_marks,
                            unread_marks)))