def train(self, read_article_ids=None, unread_article_ids=None):
    """
    Trains the DecisionTree Classifier.

    read_article_ids should be an iterable over read article ids
    unread_article_ids should be an iterable over unread article ids

    If one is None it will be loaded from database.
    """
    # Fetch the user's read-article feedback from the database when not supplied.
    if read_article_ids is not None:
        read_article_ids = set(read_article_ids)
    else:
        feedback = ReadArticleFeedback.objects(user_id=self.user.id).only("article")
        read_article_ids = set(r.article.id for r in feedback)

    # Unread = every article the user ranked minus the ones marked read.
    if unread_article_ids is None:
        ranked_ids = (r.article.id for r in
                      RankedArticle.objects(user_id=self.user.id).only("article"))
        known_ids = set(a.id for a in
                        Article.objects(id__in=ranked_ids).only("id"))
        unread_article_ids = known_ids - read_article_ids

    # Convert articles to feature vectors / marks (with re-sampling rates
    # taken from the instance configuration).
    all_articles, marks = self._get_samples(
        read_article_ids,
        unread_article_ids,
        p_synthetic_samples=self.p_synthetic_samples,
        p_majority_samples=self.p_majority_samples)

    logger.debug("Learn on %d samples." % len(marks))

    self.clf = tree.DecisionTreeClassifier()
    self.clf.fit(all_articles, marks)
def get_article_samples(config_):
    """
    Load a user's read/unread article ids from mongo and turn them into
    training samples.

    config_ -- dict-like configuration with a 'database' section containing
               'db-name', 'user', 'passwd' and 'port'.

    Returns (X, y): the feature matrix and marks produced by get_samples().
    """
    #Connect to mongo database
    logger.info("Connect to database...")
    connect(config_['database']['db-name'],
            username=config_['database']['user'],
            password=config_['database']['passwd'],
            port=config_['database']['port'])

    #get user
    user = User.objects(email=u"*****@*****.**").first()

    ranked_article_ids = (a.article.id for a in
                          RankedArticle.objects(user_id=user.id).only("article"))
    # builtin set() replaces the deprecated sets.Set used previously
    all_article_ids = set(a.id for a in
                          Article.objects(id__in=ranked_article_ids).only("id"))
    read_article_ids = set(a.article.id for a in
                           ReadArticleFeedback.objects(user_id=user.id).only("article"))
    unread_article_ids = all_article_ids - read_article_ids

    #sample test articles
    # NOTE(review): `extractor` is a free variable here -- presumably a
    # module-level feature extractor; confirm it is defined before this runs.
    X, y = get_samples(extractor, read_article_ids, unread_article_ids)
    return X, y
def train(self, read_article_ids=None, unread_article_ids=None):
    """
    Trains the Bayes Classifier.

    read_article_ids should be an iterable over read article ids
    unread_article_ids should be an iterable over unread article ids

    If one is None it will be loaded from database.
    """
    # Pull read-article feedback from the database unless the caller
    # supplied the ids directly.
    if read_article_ids is not None:
        read_article_ids = set(read_article_ids)
    else:
        read_article_ids = set(
            r.article.id for r in
            ReadArticleFeedback.objects(user_id=self.user.id).only("article"))

    logger.info("Use %d read articles for learning." % len(read_article_ids))
    read_articles = Article.objects(id__in=read_article_ids)

    # Unread = ranked articles minus the read ones.
    if unread_article_ids is None:
        ranked_ids = (r.article.id for r in
                      RankedArticle.objects(user_id=self.user.id).only("article"))
        candidate_ids = set(a.id for a in
                            Article.objects(id__in=ranked_ids).only("id"))
        unread_article_ids = candidate_ids - read_article_ids

    #undersample unreads
    logger.info("Use %d unread articles for learning." % (len(unread_article_ids)))
    unread_articles = Article.objects(id__in=unread_article_ids)

    # AllArticles wraps both query sets: iterating it yields feature
    # vectors, get_marks() yields the matching labels.
    all_articles = UserModelBayes.AllArticles(read_articles,
                                              unread_articles,
                                              self.get_features)
    self.clf.fit(np.array(list(all_articles)),
                 np.array(list(all_articles.get_marks())))
def get_top_articles(self, date, min_rating):
    '''
    Returns the articles published on the calendar day of `date` that
    were ranked for this user with a rating of at least `min_rating`,
    as a list of Article documents.
    '''
    # Restrict to articles from that single day.
    day_start = date.date()
    day_end = day_start + timedelta(days=1)
    articles_from_date = Article.objects(date__gte=day_start,
                                         date__lt=day_end)

    # Keep only those the user ranked highly enough.
    top_ranked = RankedArticle.objects(user_id=self.mongodb_user.id,
                                       rating__gte=min_rating,
                                       article__in=articles_from_date)
    return [r.article for r in top_ranked]
def train(self, read_article_ids=None, unread_article_ids=None):
    '''
    Trains the several SVM and Naive Bayes Classifiers.

    read_article_ids should be an iterable over read article ids
    unread_article_ids should be an iterable over unread article ids

    If one is None it will be loaded from database.
    '''
    #Load user feedback if needed
    if read_article_ids is None:
        # builtin set() replaces the deprecated sets.Set used previously
        read_article_ids = set(r.article.id for r in
                               ReadArticleFeedback.objects(user_id=self.user.id).only("article"))
    else:
        read_article_ids = set(read_article_ids)

    #Get all articles the user did not read.
    if unread_article_ids is None:
        ranked_article_ids = (a.article.id for a in
                              RankedArticle.objects(user_id=self.user.id).only("article"))
        all_article_ids = set(a.id for a in
                              Article.objects(id__in=ranked_article_ids).only("id"))
        unread_article_ids = all_article_ids - read_article_ids

    def _make_params(p_synthetic, p_majority):
        # One parameter dict per classifier; k = 10 neighbours for the
        # synthetic over-sampling step.
        return {'read_article_ids': read_article_ids,
                'unread_article_ids': unread_article_ids,
                'p_synthetic_samples': p_synthetic,
                'p_majority_samples': p_majority,
                'k': 10}

    # (p_synthetic_samples, p_majority_samples) grids, one entry per classifier.
    svm_configs = [(100, 200), (200, 300), (300, 400), (400, 500), (500, 600)]
    nb_configs = [(100, 100), (100, 200), (300, 500), (600, 600)]

    # Factories: fresh RBF-kernel SVC per SVM config, GaussianNB class
    # itself (it is callable) per Naive Bayes config.
    classifiers = ([lambda: svm.SVC(kernel='rbf') for _ in svm_configs]
                   + [GaussianNB for _ in nb_configs])
    parameters = [_make_params(p_syn, p_maj)
                  for p_syn, p_maj in svm_configs + nb_configs]

    self._call_classifiers(classifiers, parameters)
# NOTE(review): incomplete fragment -- this is the continuation of a
# connect(...) call whose opening line is outside this view. It finishes the
# database connection, builds a cEsa feature extractor, loads the user's
# read/unread article ids, and begins a synthetic-over-sampling /
# majority-under-sampling parameter grid whose loop body is truncated here.
# Left byte-identical; do not rewrite without the surrounding definition.
username=config_["database"]["user"], password=config_["database"]["passwd"], port=config_["database"]["port"], ) # Load feature extractor # feature_extractor = EsaFeatureExtractor(prefix = config_['prefix']) # feature_extractor = TfidfFeatureExtractor(prefix = config_['prefix']) # feature_extractor = LdaFeatureExtractor(prefix = config_['prefix']) # feature_extractor = LdaBowFeatureExtractor(prefix = config_['prefix']) feature_extractor = cEsaFeatureExtractor(prefix=config_["prefix"]) # get user user = User.objects(email=u"*****@*****.**").first() ranked_article_ids = (a.article.id for a in RankedArticle.objects(user_id=user.id).only("article")) all_article_ids = set(a.id for a in Article.objects(id__in=ranked_article_ids).only("id")) read_article_ids = set(a.article.id for a in ReadArticleFeedback.objects(user_id=user.id).only("article")) unread_article_ids = all_article_ids - read_article_ids for p_synthetic in xrange(100, 700, 100): for p_majority in xrange(100, 700, 100): logger.info("Synthetic over-sampling %d and majority undersampling %d" % (p_synthetic, p_majority)) # run test N_ITERATIONS precisions_read = np.zeros((N_ITERATIONS)) recalls_read = np.zeros((N_ITERATIONS)) f1_scores_read = np.zeros((N_ITERATIONS))
# NOTE(review): incomplete fragment -- continuation of a connect(...) call
# opened outside this view; an older variant of the same sampling-grid setup
# seen elsewhere in this file (still uses deprecated sets.Set). It finishes
# the database connection, builds a cEsa feature extractor, loads read/unread
# article ids, and starts the over/under-sampling grid; the loop body is
# truncated here. Left byte-identical; do not rewrite without the rest.
password= config_['database']['passwd'], port = config_['database']['port']) #Load feature extractor #feature_extractor = EsaFeatureExtractor(prefix = config_['prefix']) #feature_extractor = TfidfFeatureExtractor(prefix = config_['prefix']) #feature_extractor = LdaFeatureExtractor(prefix = config_['prefix']) #feature_extractor = LdaBowFeatureExtractor(prefix = config_['prefix']) feature_extractor = cEsaFeatureExtractor(prefix = config_['prefix']) #get user user = User.objects(email=u"*****@*****.**").first() ranked_article_ids = (a.article.id for a in RankedArticle.objects(user_id = user.id).only("article")) all_article_ids = Set(a.id for a in Article.objects(id__in = ranked_article_ids).only("id")) read_article_ids = Set(a.article.id for a in ReadArticleFeedback.objects(user_id = user.id).only("article")) unread_article_ids = all_article_ids - read_article_ids for p_synthetic in xrange(100, 700, 100): for p_majority in xrange(100, 700, 100): logger.info("Synthetic over-sampling %d and majority undersampling %d" % (p_synthetic, p_majority))
def test_constructor_with_file_wikicorpus(self):
    """
    Builds a CosineEsaModel from a wikicorpus tf-idf dump plus user
    feedback sampled from mongo, then round-trips it through save/load.
    """
    #load tf-idf model
    # NOTE(review): tfidf_model is loaded but never used below; kept because
    # the load touches the dump on disk -- confirm whether it can be dropped.
    tfidf_model = tfidfmodel.TfidfModel.load("/media/sdc1/test_dump/result/test_tfidf.model")
    extractor = TfidfFeatureExtractor("/media/sdc1/test_dump/result/test")

    #load tf-idf corpus
    tfidf_corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')
    #load lda corpus
    #lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')

    #load dictionary
    id2token = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")

    #load article titles
    document_titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")

    #Connect to mongo database
    connect(self.config_['database']['db-name'],
            username=self.config_['database']['user'],
            password=self.config_['database']['passwd'],
            port=self.config_['database']['port'])

    #Load articles as test corpus
    user = User.objects(email=u"*****@*****.**").first()

    ranked_article_ids = (a.article.id for a in
                          RankedArticle.objects(user_id=user.id).only("article"))
    # builtin set() replaces the deprecated sets.Set used previously
    all_article_ids = set(a.id for a in
                          Article.objects(id__in=ranked_article_ids).only("id"))
    read_article_ids = set(a.article.id for a in
                           ReadArticleFeedback.objects(user_id=user.id).only("article"))
    unread_article_ids = all_article_ids - read_article_ids

    #sample test articles
    X, y = get_samples(extractor, read_article_ids, unread_article_ids)

    s, f = X.shape
    logger.debug("Traning with %d samples, %d features, %d marks" % (s, f, len(y)))

    #train esa model
    esa_model = CosineEsaModel(tfidf_corpus,
                               document_titles=document_titles,
                               test_corpus=X,
                               test_corpus_targets=y,
                               num_test_corpus=len(y),
                               num_best_features=15,
                               num_features=len(id2token))
    # print(x) with a single argument behaves identically under Python 2's
    # print statement and Python 3's print function.
    print(esa_model)

    esa_model.save('/media/sdc1/test_dump/result/test_cesa.model')

    tmp_esa = CosineEsaModel.load('/media/sdc1/test_dump/result/test_cesa.model')
    print(tmp_esa)