def _get_samples(self, read_article_ids, unread_article_ids,
                 p_synthetic_samples=300, p_majority_samples=500, k=5):
    '''
    Build a (possibly resampled) training matrix from read/unread article ids.

    Parameters
    ----------
    read_article_ids : Set
        Ids of articles the user has read (the minority class).
    unread_article_ids : Set
        Ids of articles the user has not read (the majority class).
    p_synthetic_samples : int or None
        Percentage of synthetic minority samples to create with
        borderlineSMOTE (300 means 300%). If None, no synthetic samples
        are created and the normalized (X, y) is returned directly.
    p_majority_samples : int or None
        Size of the majority sample relative to the minority class
        (500 means 500% of the number of read articles). If None, the
        majority class is not under-sampled.
    k : int
        Neighbourhood size for the k-nearest-neighbour step of SMOTE.

    Returns
    -------
    array-like full vector samples, shape = [n_samples, n_features]
    array-like marks, shape = [n_samples]
    '''
    # Under-sample the majority (unread) class.
    if p_majority_samples is not None:
        # NOTE: use integer arithmetic for the sample count —
        # p_majority_samples / 100 * len(...) is a float under true
        # division, and random.sample() requires an int.
        n_majority = min(p_majority_samples * len(read_article_ids) // 100,
                         len(unread_article_ids))
        unread_article_ids = Set(sample(unread_article_ids, n_majority))

    # Create unread article vectors.
    unread_marks = numpy.empty(len(unread_article_ids))
    unread_marks.fill(UserModelSVM.UNREAD)
    unread_articles = numpy.empty(shape=(len(unread_article_ids),
                                         self.num_features_))

    # NOTE(review): rows whose article raises AttributeError are logged and
    # skipped, leaving uninitialized numpy.empty() garbage in that row —
    # confirm whether such articles can occur in practice.
    for i, article in enumerate(Article.objects(id__in=unread_article_ids)):
        try:
            article_features_as_full_vec = self.get_features(article)
            unread_articles[i, :] = article_features_as_full_vec[:]
        except AttributeError as e:
            logger.error("Article %s does not have attribute: %s."
                         % (article.id, e))

    # Create read article vectors.
    read_marks = numpy.empty(len(read_article_ids))
    read_marks.fill(UserModelSVM.READ)
    read_articles = numpy.empty(shape=(len(read_article_ids),
                                       self.num_features_))
    for i, article in enumerate(Article.objects(id__in=read_article_ids)):
        try:
            article_features_as_full_vec = self.get_features(article)
            read_articles[i, :] = article_features_as_full_vec[:]
        except AttributeError as e:
            logger.error("Article %s does not have attribute: %s."
                         % (article.id, e))

    # SMOTE sample minorities
    # synthetic_read_articles = SMOTE(read_articles, p_synthetic_samples, k)

    # Normalize the full matrix before (optionally) running borderlineSMOTE.
    X = numpy.concatenate((read_articles, unread_articles))
    self._calculate_mean_and_std_deviation(X)
    X = self._normalize(X)
    y = numpy.concatenate((read_marks, unread_marks))

    if p_synthetic_samples is None:
        return X, y
    else:
        # borderlineSMOTE oversamples the minority (read) class near the
        # decision border; it returns the original reads, the synthetic
        # reads, and the "danger" reads (those close to the majority class).
        new_read_articles, synthetic_read_articles, danger_read_articles = \
            borderlineSMOTE(X=X, y=y,
                            minority_target=UserModelSVM.READ,
                            N=p_synthetic_samples, k=k)

        # Mark every returned minority sample as READ.
        synthetic_marks = numpy.zeros(len(synthetic_read_articles))
        synthetic_marks.fill(UserModelSVM.READ)

        read_marks = numpy.empty(len(new_read_articles))
        read_marks.fill(UserModelSVM.READ)

        danger_read_marks = numpy.empty(len(danger_read_articles))
        danger_read_marks.fill(UserModelSVM.READ)

        logger.info("Use %d read, %d unread, %d danger reads and %d synthetic samples."
                    % (len(read_marks), len(unread_marks),
                       len(danger_read_marks), len(synthetic_marks)))

        return (numpy.concatenate((new_read_articles,
                                   synthetic_read_articles,
                                   danger_read_articles,
                                   unread_articles)),
                numpy.concatenate((read_marks,
                                   synthetic_marks,
                                   danger_read_marks,
                                   unread_marks))
                )
def get_samples(extractor, read_article_ids, unread_article_ids,
                p_synthetic_samples=300, p_majority_samples=500, k=5):
    '''
    Build a (possibly resampled) training matrix from read/unread article ids
    using the given feature extractor.

    Parameters
    ----------
    extractor
        Feature extractor providing get_feature_number(); articles are
        vectorized via get_features(article, extractor).
    read_article_ids : Set
        Ids of articles the user has read (the minority class).
    unread_article_ids : Set
        Ids of articles the user has not read (the majority class).
    p_synthetic_samples : int or None
        Percentage of synthetic minority samples to create with
        borderlineSMOTE (300 means 300%). If None, no synthetic samples
        are created and the plain (X, y) is returned.
    p_majority_samples : int or None
        Size of the majority sample relative to the minority class
        (500 means 500% of the number of read articles). If None, the
        majority class is not under-sampled.
    k : int
        Neighbourhood size for the k-nearest-neighbour step of SMOTE.

    Returns
    -------
    array-like full vector samples, shape = [n_samples, n_features]
    array-like marks, shape = [n_samples]
    '''
    # Under-sample the majority (unread) class. Guarded for None to stay
    # consistent with the percentage semantics documented above.
    if p_majority_samples is not None:
        # Integer arithmetic for the sample count: p_majority_samples / 100
        # is a float under true division and random.sample() needs an int.
        n_majority = min(p_majority_samples * len(read_article_ids) // 100,
                         len(unread_article_ids))
        unread_article_ids = Set(sample(unread_article_ids, n_majority))

    # Create unread article vectors.
    unread_marks = np.empty(len(unread_article_ids))
    unread_marks.fill(UNREAD)
    unread_articles = np.empty(shape=(len(unread_article_ids),
                                      extractor.get_feature_number()))

    # NOTE(review): rows whose article raises AttributeError are logged and
    # skipped, leaving uninitialized np.empty() garbage in that row —
    # confirm whether such articles can occur in practice.
    for i, article in enumerate(Article.objects(id__in=unread_article_ids)):
        try:
            article_features_as_full_vec = get_features(article, extractor)
            unread_articles[i, :] = article_features_as_full_vec[:]
        except AttributeError as e:
            logger.error("Article %s does not have attribute: %s."
                         % (article.id, e))

    # Create read article vectors.
    read_marks = np.empty(len(read_article_ids))
    read_marks.fill(READ)
    read_articles = np.empty(shape=(len(read_article_ids),
                                    extractor.get_feature_number()))
    for i, article in enumerate(Article.objects(id__in=read_article_ids)):
        try:
            article_features_as_full_vec = get_features(article, extractor)
            read_articles[i, :] = article_features_as_full_vec[:]
        except AttributeError as e:
            logger.error("Article %s does not have attribute: %s."
                         % (article.id, e))

    # SMOTE sample minorities
    # synthetic_read_articles = SMOTE(read_articles, p_synthetic_samples, k)

    X = np.concatenate((read_articles, unread_articles))
    y = np.concatenate((read_marks, unread_marks))

    # No oversampling requested: return the plain samples.
    if p_synthetic_samples is None:
        return X, y

    # borderlineSMOTE oversamples the minority (read) class near the
    # decision border; it returns the original reads, the synthetic reads,
    # and the "danger" reads (those close to the majority class).
    new_read_articles, synthetic_read_articles, danger_read_articles = \
        borderlineSMOTE(X=X, y=y, minority_target=READ,
                        N=p_synthetic_samples, k=k)

    # Mark every returned minority sample as READ.
    synthetic_marks = np.zeros(len(synthetic_read_articles))
    synthetic_marks.fill(READ)

    read_marks = np.empty(len(new_read_articles))
    read_marks.fill(READ)

    danger_read_marks = np.empty(len(danger_read_articles))
    danger_read_marks.fill(READ)

    logger.info("Use %d read, %d unread, %d danger reads and %d synthetic samples."
                % (len(read_marks), len(unread_marks),
                   len(danger_read_marks), len(synthetic_marks)))

    return (np.concatenate((new_read_articles,
                            synthetic_read_articles,
                            danger_read_articles,
                            unread_articles)),
            np.concatenate((read_marks,
                            synthetic_marks,
                            danger_read_marks,
                            unread_marks))
            )