def main():
    # Read text, try removing comments, headers ... See tok.py for implementation.
    corpus = tok.fill_corpus(["alt.atheism", "comp.windows.x"])
    # corpus = tok.fill_corpus(["alt.atheism", "soc.religion.christian"])

    # Create training data.
    ctr = reduce(list.__add__, map(lambda x: x[:600], corpus))
    ytr = zeros(len(ctr))
    ytr[:600] = -1
    ytr[600:] = 1

    # Train a bag-of-words feature extractor. You're free to play with the
    # parameters of fe.text.TfidfVectorizer, but your answers should be given
    # for the parameters used here. You can find out more about these on the
    # scikit-learn documentation site.
    tfidf = fe.text.TfidfVectorizer(min_df=5, ngram_range=(1, 4),
                                    use_idf=True, encoding="ascii")

    # Train the tokenizer.
    ftr = tfidf.fit_transform(ctr)
    ftr = ftr.tocsc()

    # This maps features back to their text.
    feature_names = tfidf.get_feature_names()

    # The tokenizer is trained; load a previously trained ensemble and run
    # every newsgroup through it, reporting the majority prediction.
    A, H, I = load()
    newsgroups = sort(getNewsgroups())
    for group in newsgroups:
        bag = tok.fill_corpus([group])[0]
        f = tfidf.transform(bag).tocsc()
        y_hat = adaboost_predict(A, H, I, f, len(A))
        left = (y_hat > 0).sum() / float(len(y_hat))
        if left > 0.5:
            print group, "comp.windows.x (" + str(left) + ")"
        else:
            print group, "alt.atheism (" + str(1 - left) + ")"
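
# A minimal sketch of the adaboost_predict helper the scripts here rely on.
# The real implementation is provided elsewhere in the assignment; this
# sketch assumes A holds the per-round weights (alphas), I the chosen
# feature indices, and H a per-round threshold such that stump j votes +1
# when feature I[j] exceeds H[j]. All of that is read off the call sites,
# not the assignment's actual stump definition.
def adaboost_predict_sketch(A, H, I, f, t):
    # f is a scipy CSC matrix of tf-idf features; this relies on the
    # script's module-level numpy import (np).
    score = np.zeros(f.shape[0])
    for j in range(t):
        col = np.asarray(f[:, int(I[j])].todense()).ravel()
        vote = np.where(col > H[j], 1.0, -1.0)  # decision stump on one feature
        score += A[j] * vote                    # alpha-weighted vote
    return np.sign(score)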
def main():
    # Read text, try removing comments, headers ... See tok.py for implementation.
    # corpus = tok.fill_corpus(["alt.atheism", "comp.windows.x"])
    corpus = tok.fill_corpus(["alt.atheism", "soc.religion.christian"])

    # Create training data.
    ctr = reduce(list.__add__, map(lambda x: x[:600], corpus))
    ytr = zeros(len(ctr))
    ytr[:600] = -1
    ytr[600:] = 1

    # Train a bag-of-words feature extractor. You're free to play with the
    # parameters of fe.text.TfidfVectorizer, but your answers should be given
    # for the parameters used here. You can find out more about these on the
    # scikit-learn documentation site.
    tfidf = fe.text.TfidfVectorizer(min_df=5, ngram_range=(1, 4),
                                    use_idf=True, encoding="ascii")

    # Train the tokenizer.
    ftr = tfidf.fit_transform(ctr)
    ftr = ftr.tocsc()

    # This maps features back to their text.
    feature_names = tfidf.get_feature_names()

    # Run AdaBoost for m rounds; this shouldn't take more than 20 minutes.
    m = 30
    A, H, I, TE = adaboost_train(ftr, ytr, m)
    for i in range(m):
        print "T", i, "index:", I[i], "feature name:", feature_names[I[i]]

    # Plot the alpha magnitudes and training error per boosting round.
    pl.subplot(2, 1, 1)
    pl.xlabel('steps of adaboost')
    pl.ylabel('magnitude of alpha')
    pl.plot(np.abs(A), 'o')
    pl.subplot(2, 1, 2)
    # pl.axis([0, 50, 0, .5])
    pl.xlabel('steps of adaboost')
    pl.ylabel('training error')
    pl.plot(TE, 'o')
    pl.show()

    # Create validation data.
    cva = reduce(list.__add__, map(lambda x: x[600:800], corpus))
    yva = zeros(len(cva))
    yva[:200] = -1
    yva[200:] = 1

    # The tfidf tokenizer is not re-trained here.
    fva = tfidf.transform(cva).tocsc()

    # Validation: find the round t with the lowest held-out error, truncate
    # the ensemble there, and save it.
    HE, t = adaboost_find_t(A, H, I, fva, yva)
    print "t", t
    A = A[:t]
    H = H[:t]
    I = I[:t]
    S = np.vstack((A, H, I))
    np.savetxt("matrix2.out", S)
    pl.clf()
    pl.plot(HE, 'o')
    pl.show()

    # Create test data.
    # Some lists have fewer than a thousand mails; you may have to change this.
    cte = reduce(list.__add__, map(lambda x: x[800:], corpus))
    yte = zeros(len(cte))
    yte[:200] = -1
    yte[200:] = 1
    fte = tfidf.transform(cte).tocsc()

    # Testing: report the truncated ensemble's error on the test split.
    y_hat = adaboost_predict(A, H, I, fte, t)
    err = (y_hat * yte < 0).sum() / float(yte.shape[0])
    print "err", err
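
# A minimal sketch of the adaboost_find_t helper used above: evaluate the
# ensemble truncated at every round on the validation split and return the
# per-round errors together with the round that minimizes them. Only the
# call signature and the way t is used afterwards (A = A[:t]) are taken
# from the script; the body is an assumption.
def adaboost_find_t_sketch(A, H, I, fva, yva):
    errors = []
    for t in range(1, len(A) + 1):
        y_hat = adaboost_predict(A, H, I, fva, t)
        errors.append((y_hat * yva < 0).sum() / float(len(yva)))
    best_t = int(np.argmin(errors)) + 1  # argmin is 0-based, rounds are 1-based
    return np.array(errors), best_t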
def classifyposts(self, N, T):
    # Read text, try removing comments, headers ... See tok.py for implementation.
    corpus = tok.fill_corpus(["alt.atheism", "comp.windows.x"])

    # Create training data.
    ctr = reduce(list.__add__, map(lambda x: x[:600], corpus))
    ytr = zeros(len(ctr))
    ytr[:600] = -1
    ytr[600:] = 1

    # Train a bag-of-words feature extractor. You're free to play with the
    # parameters of fe.text.TfidfVectorizer, but your answers should be given
    # for the parameters used here. You can find out more about these on the
    # scikit-learn documentation site.
    tfidf = fe.text.TfidfVectorizer(min_df=5, ngram_range=(1, 4),
                                    use_idf=True, encoding="ascii")

    # Train the tokenizer.
    ftr = tfidf.fit_transform(ctr)
    ftr = ftr.tocsc()

    # Run AdaBoost; this shouldn't take more than 20 minutes.
    alphas, cfs, dt = self.RunAdaBoost(ftr, ytr, features=11808, N=N, T=T,
                                       d_i=None, isSparse=1)
    print "Round 1: ", cfs[0, :]

    # This maps features back to their text.
    feature_names = tfidf.get_feature_names()
    for i in cfs[0, :]:
        print i, ":", feature_names[int(i)]

    # Create validation data.
    cva = reduce(list.__add__, map(lambda x: x[600:800], corpus))
    yva = zeros(len(cva))
    yva[:200] = -1
    yva[200:] = 1

    # The tfidf tokenizer is not re-trained here.
    fva = tfidf.transform(cva).tocsc()

    # Validation: find the boosting round with the lowest held-out error.
    idx = self.predict_validation_errors(fva, yva, alphas, cfs, 400, T)
    print "idx returned was ", idx

    # Create test data.
    # Some lists have fewer than a thousand mails; you may have to change this.
    cte = reduce(list.__add__, map(lambda x: x[800:], corpus))
    yte = zeros(len(cte))
    yte[:200] = -1
    yte[200:] = 1
    fte = tfidf.transform(cte).tocsc()

    shape_t = shape(fte)[0]
    if shape_t != 400:
        print shape_t
    self.predict_validation_errors(fte, yte, alphas, cfs, shape_t, idx,
                                   ValidateOrTest=0)

    # Classify posts from the remaining newsgroups with the trained ensemble.
    paperlist = ["comp.graphics", "comp.os.ms-windows.misc",
                 "comp.sys.ibm.pc.hardware", "comp.sys.mac.hardware",
                 "misc.forsale", "rec.autos", "rec.motorcycles",
                 "rec.sport.baseball", "rec.sport.hockey", "sci.crypt",
                 "sci.electronics", "sci.med", "sci.space",
                 "talk.politics.guns", "talk.politics.mideast",
                 "talk.politics.misc", "talk.religion.misc"]
    for i in paperlist:
        corpus = tok.fill_corpus([i])
        t_pred = reduce(list.__add__, map(lambda x: x[:1000], corpus))
        y_fake = zeros(len(t_pred))
        y_fake[:500] = -1
        y_fake[500:] = 1
        f_corpus = tfidf.transform(t_pred).tocsc()
        print "------ For posts in ", i
        self.classify_post_output(f_corpus.todense(), alphas, cfs, y_fake,
                                  1000, T)
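
# A minimal sketch of what tok.fill_corpus is assumed to return (the real
# implementation lives in tok.py): one list of raw post strings per
# requested newsgroup, with headers, footers, and quoted replies stripped.
# fetch_20newsgroups is the real scikit-learn loader; the rest is an
# assumption about tok.py's behavior.
def fill_corpus_sketch(groups):
    from sklearn.datasets import fetch_20newsgroups
    corpus = []
    for g in groups:
        data = fetch_20newsgroups(subset='train', categories=[g],
                                  remove=('headers', 'footers', 'quotes'))
        corpus.append(list(data.data))
    return corpus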