Example #1
def main():
    # Read text, try removing comments, headers ... See tok.py for implementation.
    corpus = tok.fill_corpus(["alt.atheism", "comp.windows.x"])
    # corpus = tok.fill_corpus(["alt.atheism", "soc.religion.christian"])

    # Create training data
    ctr = reduce(list.__add__, map(lambda x: x[:600], corpus))
    ytr = zeros(len(ctr))
    ytr[:600] = -1
    ytr[600:] = 1

    # Train a bag-of-words feature extractor.
    # You're free to play with the parameters of fe.text.TfidfVectorizer, but your answers
    # should be based on the parameters given here. You can find out more about these
    # on the scikit-learn documentation site.
    tfidf = fe.text.TfidfVectorizer(min_df=5, ngram_range=(1, 4), use_idf=True, encoding="ascii")

    # Train the tokenizer.
    ftr = tfidf.fit_transform(ctr)
    ftr = ftr.tocsc()

    # This maps features back to their text.
    feature_names = tfidf.get_feature_names()

    # The tokenizer is trained; load the previously saved AdaBoost model (A, H, I).
    A, H, I = load()

    newsgroups = getNewsgroups()
    newsgroups = sort(newsgroups)

    for group in newsgroups:
        bag = tok.fill_corpus([group])
        bag = bag[0]
        f = tfidf.transform(bag).tocsc()
        y_hat = adaboost_predict(A, H, I, f, len(A))

        left = (y_hat > 0).sum() / float(len(y_hat))
        if left > 0.5:
            print group, "comp.windows.x(" + str(left) + ")"
        else:
            print group, "alt.atheism(" + str(1 - left) + ")"
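Example #1 relies on helpers defined elsewhere in the original script: load(), getNewsgroups() and adaboost_predict(). Below is a minimal sketch of load() and adaboost_predict(), assuming the model matrix saved by Example #2 (np.savetxt("matrix2.out", np.vstack((A, H, I)))) with row 0 holding the alphas, row 1 the decision-stump thresholds and row 2 the chosen feature indices; the real helpers may differ.

import numpy as np

# Hypothetical reimplementation of load(): read back the matrix written by
# Example #2 and split it into alphas A, stump thresholds H and feature indices I.
def load(path="matrix2.out"):
    S = np.loadtxt(path)
    A, H, I = S[0], S[1], S[2].astype(int)
    return A, H, I

# Hypothetical adaboost_predict(): combine the first t decision stumps.
# f is the sparse CSC document-term matrix produced by tfidf.transform().
def adaboost_predict(A, H, I, f, t):
    score = np.zeros(f.shape[0])
    for k in range(t):
        col = np.asarray(f[:, I[k]].todense()).ravel()  # values of the k-th chosen feature
        h = np.where(col > H[k], 1.0, -1.0)             # stump vote in {-1, +1}
        score += A[k] * h                               # weight the vote by alpha
    return np.sign(score)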
Example #2
def main():
  #Read text, try removing comments, headers ... See tok.py for implementation.
  #corpus = tok.fill_corpus(["alt.atheism", "comp.windows.x"])
  corpus = tok.fill_corpus(["alt.atheism", "soc.religion.christian"])

  #Create training data
  ctr = reduce(list.__add__, map(lambda x: x[:600], corpus))
  ytr = zeros(len(ctr)); ytr[:600] = -1; ytr[600:] = 1  

  #Train a bag-of-words feature extractor.
  #You're free to play with the parameters of fe.text.TfidfVectorizer, but your answers
  #should be based on the parameters given here. You can find out more about these
  #on the scikit-learn documentation site.
  tfidf = fe.text.TfidfVectorizer(min_df=5, ngram_range=(1, 4), use_idf=True, encoding="ascii")

  #Train the tokenizer.
  ftr = tfidf.fit_transform(ctr)
  ftr = ftr.tocsc()

  #This maps features back to their text.
  feature_names = tfidf.get_feature_names()

  m = 30
  #This shouldn't take more than 20m.
  A, H, I, TE = adaboost_train(ftr, ytr, m)

  for i in range(m):
    print "T", i, "index:", I[i], "feature name:", feature_names[I[i]]

  # Plot
  pl.subplot(2,1,1)
  pl.xlabel('steps of adaboost')
  pl.ylabel('magnitude of alpha')
  pl.plot(np.abs(A),'o')
  pl.subplot(2,1,2)
  #pl.axis([0,50,0,.5])
  pl.xlabel('steps of adaboost')
  pl.ylabel('training error')
  pl.plot(TE,'o')
  pl.show()


  #Create validation data
  cva = reduce(list.__add__, map(lambda x: x[600:800], corpus))
  yva = zeros(len(cva)); yva[:200] = -1; yva[200:] = 1

  #tfidf tokenizer is not trained here.
  fva = tfidf.transform(cva).tocsc()

  #<Validation code goes here>
  HE, t = adaboost_find_t(A, H, I, fva, yva)

  print "t", t
  A = A[:t]
  H = H[:t]
  I = I[:t]

  S = np.vstack((A, H, I))
  np.savetxt("matrix2.out", S)

  pl.clf()
  pl.plot(HE,'o')
  pl.show()

  #Create test data
  #Some newsgroups have fewer than a thousand mails, so you may have to change this slice.
  cte = reduce(list.__add__, map(lambda x: x[800:], corpus))
  yte = zeros(len(cte)); yte[:200] = -1; yte[200:] = 1

  fte = tfidf.transform(cte).tocsc()

  #<Testing code goes here>
  y_hat = adaboost_predict(A, H, I, fte, t)
  err = (y_hat * yte < 0).sum() / float(yte.shape[0])
  print "err", err
Example #3
    def classifyposts(self, N, T):

        # Read text, try removing comments, headers ... See tok.py for implementation.
        corpus = tok.fill_corpus(["alt.atheism", "comp.windows.x"])

        # Create training data
        ctr = reduce(list.__add__, map(lambda x: x[:600], corpus))
        ytr = zeros(len(ctr))
        ytr[:600] = -1
        ytr[600:] = 1

        # Train a bag-of-words feature extractor.
        # You're free to play with the parameters of fe.text.TfidfVectorizer, but your answers
        # should be based on the parameters given here. You can find out more about these
        # on the scikit-learn documentation site.
        tfidf = fe.text.TfidfVectorizer(min_df=5, ngram_range=(1, 4), use_idf=True, encoding="ascii")

        # Train the tokenizer.
        ftr = tfidf.fit_transform(ctr)
        ftr = ftr.tocsc()
        alphas, cfs, dt = self.RunAdaBoost(ftr, ytr, features=11808, N=N, T=T, d_i=None, isSparse=1)
        print "Round 1: ", cfs[0, :]

        # This maps features back to their text.
        feature_names = tfidf.get_feature_names()
        for i in cfs[0, :]:
            print i, ":", feature_names[int(i)]
        # This shouldn't take more than 20m.
        # <Adaboost goes here>

        # Create validation data
        cva = reduce(list.__add__, map(lambda x: x[600:800], corpus))
        yva = zeros(len(cva))
        yva[:200] = -1
        yva[200:] = 1

        # tfidf tokenizer is not trained here.
        fva = tfidf.transform(cva).tocsc()

        # <Validation code goes here>
        idx = self.predict_validation_errors(fva, yva, alphas, cfs, 400, T)
        print "idx returned was ", idx

        # Create test data
        # Some newsgroups have fewer than a thousand mails, so you may have to change this slice.
        cte = reduce(list.__add__, map(lambda x: x[800:], corpus))
        yte = zeros(len(cte))
        yte[:200] = -1
        yte[200:] = 1

        fte = tfidf.transform(cte).tocsc()
        shape_t = shape(fte)[0]
        if shape_t != 400:
            print shape_t
        self.predict_validation_errors(fte, yte, alphas, cfs, shape_t, idx, ValidateOrTest=0)

        paperlist = [
            "comp.graphics",
            "comp.os.ms-windows.misc",
            "comp.sys.ibm.pc.hardware",
            "comp.sys.mac.hardware",
            "misc.forsale",
            "rec.autos",
            "rec.motorcycles",
            "rec.sport.baseball",
            "rec.sport.hockey",
            "sci.crypt",
            "sci.electronics",
            "sci.med",
            "sci.space",
            "talk.politics.guns",
            "talk.politics.mideast",
            "talk.politics.misc",
            "talk.religion.misc",
        ]
        for i in paperlist:
            corpus = tok.fill_corpus([i])
            t_pred = reduce(list.__add__, map(lambda x: x[:1000], corpus))
            y_fake = zeros(len(t_pred))
            y_fake[:500] = -1
            y_fake[500:] = 1
            f_corpus = tfidf.transform(t_pred).tocsc()
            print "------ For posts in ", i
            self.classify_post_output(f_corpus.todense(), alphas, cfs, y_fake, 1000, T)
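All three examples read the 20 Newsgroups text through tok.fill_corpus(), whose implementation (tok.py) is not shown here. A rough stand-in built on sklearn.datasets.fetch_20newsgroups is sketched below; the original tok.py reads the raw newsgroup files itself and strips comments and headers differently, so document counts and ordering will not match exactly.

from sklearn.datasets import fetch_20newsgroups

# Hypothetical stand-in for tok.fill_corpus(): one list of document strings per
# requested newsgroup, with headers, footers and quoted text stripped.
def fill_corpus(groups):
    corpus = []
    for g in groups:
        data = fetch_20newsgroups(subset="train", categories=[g],
                                  remove=("headers", "footers", "quotes"))
        corpus.append(list(data.data))
    return corpus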