# Example 1
def do_training():
    """Vectorize the train/test corpora, select the top-k features by a
    chi-squared test, train an L2-penalized LinearSVC, and persist the
    fitted vectorizer, selector and classifier to disk with joblib.

    Reads module-level globals: ``opts``, ``data_train_data``,
    ``data_test_data``, ``y_train``, ``benchmark``.
    Writes globals: ``X_train``, ``X_test``, ``feature_names``, ``ch2``.
    """
    global X_train, X_test, feature_names, ch2

    # Number of features kept by the chi-squared selection step.  The
    # original code duplicated this magic number behind a dead
    # `if True:  # opts.select_chi2` conditional.
    k_best = 20000

    print("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        # Hashing is stateless: transform() only, and no feature-name
        # mapping exists afterwards.
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train_data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.25,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train_data)
    duration = time() - t0
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test_data)
    duration = time() - t0
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()

    # Chi-squared univariate feature selection (always performed; the
    # opts.select_chi2 switch was hard-wired on in the original).
    print("Extracting %d best features by a chi-squared test" % k_best)
    t0 = time()
    ch2 = SelectKBest(chi2, k=k_best)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep only the names of the selected features
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
    print("done in %fs" % (time() - t0))
    print()

    if feature_names:
        feature_names = np.asarray(feature_names)

    results = []

    # Train a Liblinear model with an L2 penalty.
    penalty = 'l2'
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    results.append(benchmark(clf))

    # Persist the fitted pipeline pieces for later prediction.
    joblib.dump(vectorizer, 'vectorizer.pkl', compress=9)
    joblib.dump(ch2, 'feature_selector.pkl', compress=9)
    joblib.dump(clf, 'linearsvc_classifier.pkl', compress=9)
# Example 2
def main_function():
  """Parse command-line options, load the question/category corpus, and
  repeatedly train and evaluate a LinearSVC, folding the test data back
  into the training set after each pass.

  NOTE(review): the ``while True`` loop below has no break and therefore
  never terminates on its own -- confirm that this is intentional.
  """
  # Display progress logs on stdout
  logging.basicConfig(level=logging.INFO,
                      format='%(asctime)s %(levelname)s %(message)s')

  # parse commandline arguments
  op = OptionParser()
  op.add_option("--report",
                action="store_true", dest="print_report",
                help="Print a detailed classification report.")
  op.add_option("--chi2_select",
                action="store", type="int", dest="select_chi2",
                help="Select some number of features using a chi-squared test")
  op.add_option("--confusion_matrix",
                action="store_true", dest="print_cm",
                help="Print the confusion matrix.")
  op.add_option("--top10",
                action="store_true", dest="print_top10",
                help="Print ten most discriminative terms per class"
                     " for every classifier.")
  op.add_option("--all_categories",
                action="store_true", dest="all_categories",
                help="Whether to use all categories or not.")
  op.add_option("--use_hashing",
                action="store_true",
                help="Use a hashing vectorizer.")
  op.add_option("--n_features",
                action="store", type=int, default=2 ** 16,
                help="n_features when using the hashing vectorizer.")
  op.add_option("--filtered",
                action="store_true",
                help="Remove newsgroup information that is easily overfit: "
                     "headers, signatures, and quoting.")
  (opts, args) = op.parse_args()
  if len(args) > 0:
      op.error("this script takes no arguments.")
      sys.exit(1)

  print(__doc__)
  op.print_help()
  print()

  ###########################################################################
  # Load some categories from the training set
  if opts.all_categories:
      categories = None
  else:
      obj = input_from_xml()
      domains, questions = obj.fetch_input_from_xml_questions()
      categories = domains

  print("Loading Data from different categories....")
  print(categories if categories else "all")

  obj = extract_data()
  train_data, train_target, train_target_names = obj.extract_training_data()
  test_data, test_target = obj.extract_testing_data()

  print('data loaded')

  def size_mb(docs):
    # Size of a document collection in megabytes (UTF-8 encoded).
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

  data_train_size_mb = size_mb(train_data)
  data_test_size_mb = size_mb(test_data)

  print("%d documents - %0.3fMB (training set)" % (
     len(train_data), data_train_size_mb))
  print("%d documents - %0.3fMB (test set)" % (
     len(test_data), data_test_size_mb))
  # BUGFIX: guard against categories being None (--all_categories), which
  # previously crashed len(None).
  print("%d categories" % len(categories) if categories else "all categories")
  print()

  categories = train_target_names
  y_train, y_test = train_target, test_target

  print("Extracting features from the training dataset using a sparse vectorizer")
  t0 = time()
  if opts.use_hashing:
      vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                     n_features=opts.n_features)
      X_train = vectorizer.transform(train_data)
  else:
      vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                   stop_words='english')
      X_train = vectorizer.fit_transform(train_data)
  duration = time() - t0
  print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
  print("n_samples: %d, n_features: %d" % X_train.shape)
  print()

  print("Extracting features from the test dataset using the same vectorizer")
  t0 = time()
  X_test = vectorizer.transform(test_data)
  duration = time() - t0
  print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
  print("n_samples: %d, n_features: %d" % X_test.shape)
  print()

  if opts.select_chi2:
      print("Extracting %d best features by a chi-squared test" %
            opts.select_chi2)
      t0 = time()
      ch2 = SelectKBest(chi2, k=opts.select_chi2)
      X_train = ch2.fit_transform(X_train, y_train)
      X_test = ch2.transform(X_test)
      print("done in %fs" % (time() - t0))
      print()

  def trim(s):
      """Trim string to fit on terminal (assuming 80-column display)"""
      return s if len(s) <= 80 else s[:77] + "..."

  # mapping from integer feature name to original token string
  if opts.use_hashing:
      feature_names = None
  else:
      feature_names = np.asarray(vectorizer.get_feature_names())

  loop = 1
  C = 1.0
  # Self-training-style loop: after each evaluation the test documents
  # (with their given targets) are appended to the training pool and the
  # model is refit.  WARNING: no termination condition.
  while True:
    print('_' * 80)
    print("Training the data: ")
    t0 = time()
    clf = LinearSVC(C=C)
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    scores = clf.decision_function(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    # Fold the test data back into the training set for the next pass.
    train_data = train_data + test_data
    train_target = train_target + test_target
    X_train = vectorizer.transform(train_data)
    y_train = train_target

    # Accuracy over labelled test items; a target of -1 marks an
    # unlabelled item, which counts as correct in the efficiency formula.
    cnt = 0
    cnt2 = 0
    for i in range(0, len(y_test)):
      if y_test[i] != -1:
        if pred[i] == y_test[i]:
          cnt = cnt + 1
        cnt2 = cnt2 + 1

    # BUGFIX: converted Python 2 print statements to print() calls so this
    # function is valid Python 3 like the rest of the file.
    print(cnt, cnt2)
    print("efficiency of classification in loop %d is: %f"
          % (loop, (cnt + len(y_test) - cnt2) / (1.0 * len(y_test)) * 100))

    loop = loop + 1
def plot():
    """Benchmark a suite of scikit-learn text classifiers on the
    20-newsgroups dataset and bar-plot score, training time and test time.

    Behaviour is driven by optparse flags (--report, --chi2_select,
    --confusion_matrix, --top10, --all_categories, --use_hashing,
    --n_features, --filtered).  Relies on module-level imports for
    logging, OptionParser, sys, fetch_20newsgroups, the sklearn
    vectorizers/classifiers, np and a pylab-style `pl` module.
    """

    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')


    # parse commandline arguments
    op = OptionParser()
    op.add_option("--report",
                  action="store_true", dest="print_report",
                  help="Print a detailed classification report.")
    op.add_option("--chi2_select",
                  action="store", type="int", dest="select_chi2",
                  help="Select some number of features using a chi-squared test")
    op.add_option("--confusion_matrix",
                  action="store_true", dest="print_cm",
                  help="Print the confusion matrix.")
    op.add_option("--top10",
                  action="store_true", dest="print_top10",
                  help="Print ten most discriminative terms per class"
                       " for every classifier.")
    op.add_option("--all_categories",
                  action="store_true", dest="all_categories",
                  help="Whether to use all categories or not.")
    op.add_option("--use_hashing",
                  action="store_true",
                  help="Use a hashing vectorizer.")
    op.add_option("--n_features",
                  action="store", type=int, default=2 ** 16,
                  help="n_features when using the hashing vectorizer.")
    op.add_option("--filtered",
                  action="store_true",
                  help="Remove newsgroup information that is easily overfit: "
                       "headers, signatures, and quoting.")

    (opts, args) = op.parse_args()
    if len(args) > 0:
        op.error("this script takes no arguments.")
        sys.exit(1)

    print(__doc__)
    op.print_help()
    print()


    ###############################################################################
    # Load some categories from the training set
    if opts.all_categories:
        categories = None
    else:
        categories = [
            'alt.atheism',
            'talk.religion.misc',
            'comp.graphics',
            'sci.space',
        ]

    # Optionally strip metadata that makes classification artificially easy.
    if opts.filtered:
        remove = ('headers', 'footers', 'quotes')
    else:
        remove = ()

    print("Loading 20 newsgroups dataset for categories:")
    print(categories if categories else "all")

    data_train = fetch_20newsgroups(subset='train', categories=categories,
                                    shuffle=True, random_state=42,
                                    remove=remove)

    data_test = fetch_20newsgroups(subset='test', categories=categories,
                                   shuffle=True, random_state=42,
                                   remove=remove)
    print('data loaded')

    categories = data_train.target_names    # for case categories == None


    def size_mb(docs):
        # Size of a document collection in megabytes (UTF-8 encoded).
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()

    # split a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        # Hashing is stateless: transform() only, no feature-name mapping.
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # Optional chi-squared univariate feature selection.
    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print()


    def trim(s):
        """Trim string to fit on terminal (assuming 80-column display)"""
        return s if len(s) <= 80 else s[:77] + "..."


    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())


    ###############################################################################
    # Benchmark classifiers
    def benchmark(clf):
        # Fit `clf` on the closure's X_train/y_train, evaluate on
        # X_test/y_test, and return (name, f1, train_time, test_time).
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)

        score = metrics.f1_score(y_test, pred)
        print("f1-score:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

            if opts.print_top10 and feature_names is not None:
                print("top 10 keywords per class:")
                for i, category in enumerate(categories):
                    top10 = np.argsort(clf.coef_[i])[-10:]
                    print(trim("%s: %s"
                          % (category, " ".join(feature_names[top10]))))
            print()

        if opts.print_report:
            print("classification report:")
            print(metrics.classification_report(y_test, pred,
                                                target_names=categories))

        if opts.print_cm:
            print("confusion matrix:")
            print(metrics.confusion_matrix(y_test, pred))

        print()
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time


    results = []
    for clf, name in (
            (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
            (Perceptron(n_iter=50), "Perceptron"),
            (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
            (KNeighborsClassifier(n_neighbors=10), "kNN")):
        print('=' * 80)
        print(name)
        results.append(benchmark(clf))

    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                                dual=False, tol=1e-3)))

        # Train SGD model
        results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                               penalty=penalty)))

    # Train SGD with Elastic Net penalty
    print('=' * 80)
    print("Elastic-Net penalty")
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty="elasticnet")))

    # Train NearestCentroid without threshold
    print('=' * 80)
    print("NearestCentroid (aka Rocchio classifier)")
    results.append(benchmark(NearestCentroid()))

    # Train sparse Naive Bayes classifiers
    print('=' * 80)
    print("Naive Bayes")
    results.append(benchmark(MultinomialNB(alpha=.01)))
    results.append(benchmark(BernoulliNB(alpha=.01)))


    class L1LinearSVC(LinearSVC):
        # LinearSVC preceded by L1-based feature selection: an inner L1
        # LinearSVC picks the features, the outer SVC classifies.
        # NOTE(review): relies on the old sklearn API where LinearSVC
        # itself exposes fit_transform/transform.

        def fit(self, X, y):
            # The smaller C, the stronger the regularization.
            # The more regularization, the more sparsity.
            self.transformer_ = LinearSVC(penalty="l1",
                                          dual=False, tol=1e-3)
            X = self.transformer_.fit_transform(X, y)
            return LinearSVC.fit(self, X, y)

        def predict(self, X):
            X = self.transformer_.transform(X)
            return LinearSVC.predict(self, X)

    print('=' * 80)
    print("LinearSVC with L1-based feature selection")
    results.append(benchmark(L1LinearSVC()))


    # make some plots

    indices = np.arange(len(results))

    # Transpose list-of-tuples into 4 parallel lists:
    # names, scores, training times, test times.
    results = [[x[i] for x in results] for i in range(4)]

    clf_names, score, training_time, test_time = results
    # Normalize times to [0, 1] so they share an axis with the scores.
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)

    pl.figure(figsize=(12,8))
    pl.title("Score")
    pl.barh(indices, score, .2, label="score", color='r')
    pl.barh(indices + .3, training_time, .2, label="training time", color='g')
    pl.barh(indices + .6, test_time, .2, label="test time", color='b')
    pl.yticks(())
    pl.legend(loc='best')
    pl.subplots_adjust(left=.25)
    pl.subplots_adjust(top=.95)
    pl.subplots_adjust(bottom=.05)

    # Label each horizontal bar group with its classifier name.
    for i, c in zip(indices, clf_names):
        pl.text(-.3, i, c)

    pl.show()
def bench(texts,y,opts):
    """Benchmark several classifiers on a binary alert/no-alert text corpus.

    Splits `texts`/`y` into train/test, vectorizes (hashing or tf-idf with a
    project-specific preprocessor), optionally applies chi-squared feature
    selection, runs each classifier through the module-level `benchmark`
    helper, and writes a bar chart of score / train time / test time to a
    timestamped PNG and PDF.

    Parameters:
        texts: sequence of raw document strings.
        y: sequence of labels (0 = "no alert", 1 = "alert").
        opts: options object with use_hashing, n_features, select_chi2.
    """
    data_train, data_test, y_train, y_test = train_test_split(texts, y, test_size=0.33, random_state=42)
    logger.info('data loaded')
    data_train_size_mb = size_mb(data_train)
    data_test_size_mb = size_mb(data_test)
    logger.info("%d documents - %0.3fMB (training set)" % (len(data_train), data_train_size_mb))
    logger.info("%d documents - %0.3fMB (test set)" % (len(data_test), data_test_size_mb))
    logger.info("\n")
    logger.info("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        # Hashing is stateless: transform() only, no feature-name mapping.
        logger.info("Use hashing")
        vectorizer = HashingVectorizer(preprocessor=stif_classifier_dataset.preprocessor, non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     preprocessor=stif_classifier_dataset.preprocessor)
        X_train = vectorizer.fit_transform(data_train)
    duration = time() - t0
    logger.info("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    logger.info("n_samples: %d, n_features: %d" % X_train.shape)
    logger.info("\n")
    logger.info("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test)
    duration = time() - t0
    logger.info("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    logger.info("n_samples: %d, n_features: %d" % X_test.shape)
    logger.info("\n")
    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()
    # Optional chi-squared univariate feature selection.
    if opts.select_chi2:
        logger.info("Extracting %d best features by a chi-squared test" %
                    opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i] for i
                             in ch2.get_support(indices=True)]
        logger.info("done in %fs" % (time() - t0))
        logger.info("\n")
    if feature_names:
        feature_names = np.asarray(feature_names)
    categories = ["no alert", "alert"]
    results = []
    # Each benchmark() call returns (name, score, train_time, test_time).
    for clf, name in (
            (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
            (Perceptron(n_iter=50), "Perceptron"),
            (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
            (KNeighborsClassifier(n_neighbors=10), "kNN"),
            (RandomForestClassifier(n_estimators=100), "Random forest")):
        logger.info('=' * 80)
        logger.info(name)
        clf_descr = str(clf).split('(')[0]
        results.append(benchmark(clf, clf_descr,X_train,y_train,X_test,y_test,categories,feature_names,opts))
    for penalty in ["l2", "l1"]:
        logger.info('=' * 80)
        logger.info("%s penalty" % penalty.upper())
        # Train Liblinear model
        clf_descr = 'LinearSVC ' + penalty
        results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                           dual=False, tol=1e-3), clf_descr,X_train,y_train,X_test,y_test,categories,feature_names,opts))

        # Train SGD model
        clf_descr = 'SGDClassifier ' + penalty
        results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                               penalty=penalty), clf_descr,X_train,y_train,X_test,y_test,categories,feature_names,opts))

    # Train SGD with Elastic Net penalty
    logger.info('=' * 80)
    logger.info("Elastic-Net penalty")
    clf_descr = 'SGDClassifier Elastic-Net'
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty="elasticnet"), clf_descr,X_train,y_train,X_test,y_test,categories,feature_names,opts))
    # Train NearestCentroid without threshold
    logger.info('=' * 80)
    logger.info("NearestCentroid (aka Rocchio classifier)")
    results.append(benchmark(NearestCentroid(), 'NearestCentroid',X_train,y_train,X_test,y_test,categories,feature_names,opts))
    # Train sparse Naive Bayes classifiers
    logger.info('=' * 80)
    logger.info("Naive Bayes")
    results.append(benchmark(MultinomialNB(alpha=.01), 'MultinomialNB',X_train,y_train,X_test,y_test,categories,feature_names,opts))
    results.append(benchmark(BernoulliNB(alpha=.01), 'BernoulliNB',X_train,y_train,X_test,y_test,categories,feature_names,opts))
    logger.info('=' * 80)
    logger.info("LinearSVC with L1-based feature selection")
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    results.append(benchmark(Pipeline([
        ('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
        ('classification', LinearSVC())
    ]), 'LinearSVC with feature_selection',X_train,y_train,X_test,y_test,categories,feature_names,opts))
    # make some plots
    indices = np.arange(len(results))
    # Transpose list-of-tuples into 4 parallel lists:
    # names, scores, training times, test times.
    results = [[x[i] for x in results] for i in range(4)]
    clf_names, score, training_time, test_time = results
    # Normalize times to [0, 1] so they share an axis with the scores.
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)
    ts = time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%dT%H-%M-%S')
    pdf = PdfPages('stif_classification_' + st + '.pdf')
    fig = plt.figure(figsize=(12, 8))
    ax = plt.subplot(111)
    plt.title("Benchmark de classifieurs")
    ax.barh(indices, score, .2, label="score", color='r')
    ax.barh(indices + .3, training_time, .2, label="training time", color='g')
    ax.barh(indices + .6, test_time, .2, label="test time", color='b')
    plt.yticks(())
    plt.xticks(np.linspace(0.0, 1.0, 11, endpoint=True))
    plt.subplots_adjust(left=.25)
    plt.subplots_adjust(top=.95)
    plt.subplots_adjust(bottom=.05)
    # Label each horizontal bar group with its classifier name.
    for i, c in zip(indices, clf_names):
        ax.text(-.4, i, c)

    # Shrink current axis by 20%
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    # plt.show()
    fig.savefig('stif_classification_' + st + '.png')
    pdf.savefig()
    pdf.close()
def predict_and_cluster(opts, mode):
    """Vectorize a jieba-tokenized news corpus, optionally reduce it with
    LSA, cluster with KMeans, report clustering metrics against hand
    labels, and build a cosine-similarity graph over the documents before
    handing them to gmm() and best_part().

    Fixes vs. the original: removed ``print(self)`` (NameError -- `self`
    is undefined in a plain function) and normalized the mixed tab/space
    indentation, which is a TabError under Python 3.

    `mode` is currently unused; kept for interface compatibility.
    """
    n_digits = 3
    # n_samples, n_features = (25, 1927)
    n_samples, n_features = (25, 491)

    # Hand-assigned ground-truth cluster labels for the 25 documents.
    labels = array([0, 1, 2, 1, 1, 2, 2, 1, 2, 0, 0, 0, 1,
                    1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1])
    true_k = np.unique(labels).shape[0]

    corpus, news = jieba_tokenizer()

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        if opts.use_idf:
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english', non_negative=True,
                                       norm=None, binary=False)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           non_negative=False, norm='l2',
                                           binary=False)
    else:
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=opts.use_idf)

    X = vectorizer.fit_transform(corpus)

    print("done in %fs" % (time() - t0))
    # n_samples: how many articles are there
    # n_features: how many different words in all articles are there
    print("n_samples: %d, n_features: %d" % X.shape)
    print()

    if opts.n_components:
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        # Vectorizer results are normalized, which makes KMeans behave as
        # spherical k-means for better results. Since LSA/SVD results are
        # not normalized, we have to redo the normalization.
        svd = TruncatedSVD(opts.n_components)
        lsa = make_pipeline(svd, Normalizer(copy=False))

        X = lsa.fit_transform(X)

        print("done in %fs" % (time() - t0))

        svd = TruncatedSVD().fit(X)
        X_proj = svd.transform(X)
        explained_variances = np.var(X_proj, axis=0) / np.var(X, axis=0).sum()

        print("Explained variance of the SVD step: {}%".format(
            int(explained_variances[0] * 100)))

        print()

    # =================================================
    # KMeans clustering

    print('*' * 80)

    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True)  # always better

    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))

    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, sample_size=None))

    print("labels    ", labels)
    print("my_labels ", km.labels_)

    if not (opts.n_components or opts.use_hashing):
        print("Top terms per cluster:")
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()

    # Overwrite each article's category with its ground-truth label.
    for i in range(len(news)):
        news[i].category = labels[i]

    from sklearn.metrics.pairwise import cosine_similarity

    # Fully-connected similarity graph: edge weight = cosine similarity
    # between document vectors.
    FG = nx.Graph()

    for i in range(len(news)):
        news[i].similarity = cosine_similarity(X[i:i + 1], X)[0]
        cs = news[i].similarity
        for j in range(len(news)):
            if i != j:
                FG.add_weighted_edges_from([(i, j, cs[j])])

    print()
    print('*' * 80)

    print(X.shape[0])
    print(X.shape)

    gmm(X)

    print()
    print('*' * 80)

    best_part(FG)

    print()
    print('*' * 80)
# Example 6
def main():
    """Load the configured corpus (SPAM or NEWS), vectorize it with the
    configured vectorizer (HASH / TFIDF / count), then run the
    classification benchmark and plot its results."""
    if DATASET == 'SPAM':
        train_set = get_spam_dataset('corpus/english_big.txt')
        test_set = get_spam_dataset('corpus/english.txt', cleaned=False)
        categories = ['spam', 'ham']
    elif DATASET == 'NEWS':
        train_set, test_set, categories = get_news_data()
    else:
        raise ValueError('Unknown DATASET')
    print('data loaded')

    # order of labels in `target_names` can be different from `categories`
    target_names = train_set.target_names

    train_mb = size_mb(train_set.data)
    test_mb = size_mb(test_set.data)

    print("%d documents - %0.3fMB (training set)" %
          (len(train_set.data), train_mb))
    print("%d documents - %0.3fMB (test set)" %
          (len(test_set.data), test_mb))
    print("{0} categories".format(len(categories) if categories else 'all'))
    print('=' * 80 + '\n')

    # split a training set and a test set
    y_train, y_test = train_set.target, test_set.target

    print(
        "Extracting features from the training data using a sparse vectorizer")
    started = time.time()
    if VECTORIZER == 'HASH':
        # Stateless hashing: transform only, no vocabulary is learned.
        vectorizer = HashingVectorizer(stop_words='english',
                                       alternate_sign=False,
                                       n_features=N_FEATURES)
        X_train = vectorizer.transform(train_set.data)
    elif VECTORIZER == 'TFIDF':
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(train_set.data)
    else:
        # Fallback: plain term counts.
        vectorizer = CountVectorizer(stop_words='english')
        X_train = vectorizer.fit_transform(train_set.data)

    print("done in %fs" % (time.time() - started))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    started = time.time()
    X_test = vectorizer.transform(test_set.data)
    print("done in %fs" % (time.time() - started))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    # (hashing provides no such mapping)
    if VECTORIZER == 'HASH':
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()

    plot_results(classify(X_train, X_test, y_train, y_test, feature_names,
                          target_names))
# Example 7
def svm(num_cats, train_data, test_data):
    """Benchmark LinearSVC (L2 and L1 penalties) on line-formatted text data.

    :param num_cats: number of categories; target names are "0".."num_cats-1"
    :param train_data: iterable of training lines.  The label is assumed to be
        the single digit at index 9 and the document text to start at
        index 12 (fixed-width prefix -- TODO confirm against the data writer).
    :param test_data: iterable of test lines in the same format.

    Uses the module-level ``opts`` namespace (use_hashing, n_features,
    select_chi2, print_top10, print_report, print_cm) and finishes with a bar
    chart of score / training time / test time per classifier.
    """
    # train_path = os.path.join(data_path, "train")
    # test_path = os.path.join(data_path, "test")

    data_train = {}
    data_train["data"] = []
    data_train["target"] = []
    data_train["target_names"] = [str(i) for i in range(num_cats)]
    data_test = {}
    data_test["data"] = []
    data_test["target"] = []
    # with open(train_path, 'r') as train_f, open(test_path) as test_f:
    #     train_data = train_f.readlines()
    #     test_data = test_f.readlines()

    for exam in train_data:
        data_train["data"].append(exam[12:])
        # print(exam)
        data_train["target"].append(int(exam[9]))
    for exam in test_data:
        data_test["data"].append(exam[12:])
        data_test["target"].append(int(exam[9]))

    # Namespaces give attribute access (data_train.data) like sklearn bunches.
    data_train = ap.Namespace(**data_train)
    data_test = ap.Namespace(**data_test)

    print('data loaded')

    categories = data_train.target_names    # for case categories == None

    def size_mb(docs):
        """Total size of the documents in megabytes (UTF-8 encoded)."""
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print(data_train.data[:10])

    # split a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    print("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        # alternate_sign=False keeps all hashed feature values non-negative;
        # it replaces the removed HashingVectorizer ``non_negative=True`` flag.
        vectorizer = HashingVectorizer(stop_words='english',
                                       alternate_sign=False,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        # hashed features are anonymous buckets; there is no token to map back
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()

    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
            opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i] for i
                            in ch2.get_support(indices=True)]
        print("done in %fs" % (time() - t0))
        print()

    if feature_names:
        feature_names = np.asarray(feature_names)

    def trim(s):
        """Trim string to fit on terminal (assuming 80-column display)"""
        return s if len(s) <= 80 else s[:77] + "..."

    ###############################################################################
    # Benchmark classifiers
    def benchmark(clf):
        """Fit ``clf``, predict the test set, report metrics and timings."""
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)

        score = metrics.accuracy_score(y_test, pred)
        print("accuracy:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

            if opts.print_top10 and feature_names is not None:
                print("top 10 keywords per class:")
                for i, category in enumerate(categories):
                    top10 = np.argsort(clf.coef_[i])[-10:]
                    print(trim("%s: %s"
                        % (category, " ".join(feature_names[top10]))))
            print()

        if opts.print_report:
            print("classification report:")
            print(metrics.classification_report(y_test, pred,
                                                target_names=categories))

        if opts.print_cm:
            print("confusion matrix:")
            print(metrics.confusion_matrix(y_test, pred))

        print()
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time

    results = []
    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model.  'squared_hinge' is the modern spelling of
        # the removed loss='l2' alias and the only loss LinearSVC accepts
        # together with dual=False.
        results.append(benchmark(LinearSVC(loss='squared_hinge',
                                           penalty=penalty,
                                           dual=False, tol=1e-3)))

    # make some plots

    indices = np.arange(len(results))

    # transpose: list of tuples -> (names, scores, train times, test times)
    results = [[x[i] for x in results] for i in range(4)]

    clf_names, score, training_time, test_time = results
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)

    plt.figure(figsize=(12, 8))
    plt.title("Score")
    plt.barh(indices, score, .2, label="score", color='r')
    plt.barh(indices + .3, training_time, .2, label="training time", color='g')
    plt.barh(indices + .6, test_time, .2, label="test time", color='b')
    plt.yticks(())
    plt.legend(loc='best')
    plt.subplots_adjust(left=.25)
    plt.subplots_adjust(top=.95)
    plt.subplots_adjust(bottom=.05)

    for i, c in zip(indices, clf_names):
        plt.text(-.3, i, c)

    plt.show()
예제 #8
0
def main_function():
  """Command-line driver: parse options, extract features, benchmark.

  Relies on module-level helpers (``input_from_xml``, ``extract_data``,
  ``benchmark``) and a plotting module bound to ``pl``.
  """
  # Display progress logs on stdout
  logging.basicConfig(level=logging.INFO,
                      format='%(asctime)s %(levelname)s %(message)s')


  # parse commandline arguments
  op = OptionParser()
  op.add_option("--report",
                action="store_true", dest="print_report",
                help="Print a detailed classification report.")
  op.add_option("--chi2_select",
                action="store", type="int", dest="select_chi2",
                help="Select some number of features using a chi-squared test")
  op.add_option("--confusion_matrix",
                action="store_true", dest="print_cm",
                help="Print the confusion matrix.")
  op.add_option("--top10",
                action="store_true", dest="print_top10",
                help="Print ten most discriminative terms per class"
                     " for every classifier.")
  op.add_option("--all_categories",
                action="store_true", dest="all_categories",
                help="Whether to use all categories or not.")
  op.add_option("--use_hashing",
                action="store_true",
                help="Use a hashing vectorizer.")
  op.add_option("--n_features",
                action="store", type=int, default=2 ** 16,
                help="n_features when using the hashing vectorizer.")
  op.add_option("--filtered",
                action="store_true",
                help="Remove newsgroup information that is easily overfit: "
                     "headers, signatures, and quoting.")
  (opts, args) = op.parse_args()
  if len(args) > 0:
      op.error("this script takes no arguments.")
      sys.exit(1)  # unreachable: op.error() already exits the process

  print(__doc__)
  op.print_help()
  print()


  ###############################################################################
  # Load some categories from the training set
  if opts.all_categories:
      categories = None
  else:
      obj = input_from_xml()
      domains,questions = obj.fetch_input_from_xml_questions()
      categories = domains

  print("Loading Data from different categories....")
  print(categories if categories else "all")


  obj = extract_data()
  train_data,train_target,train_target_names = obj.extract_training_data()
  test_data,test_target = obj.extract_testing_data()


  print('data loaded')
#  print train_target_names

  def size_mb(docs):
    """Total size of the documents in megabytes (UTF-8 encoded)."""
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

  data_train_size_mb = size_mb(train_data)
  data_test_size_mb = size_mb(test_data)

  print("%d documents - %0.3fMB (training set)" % (
     len(train_data), data_train_size_mb))
  print("%d documents - %0.3fMB (test set)" % (
     len(test_data), data_test_size_mb))
  print("%d categories" % len(categories))
  print()


  categories = train_target_names
  y_train,y_test = train_target,test_target

  print("Extracting features from the training dataset using a sparse vectorizer")
  t0 = time()
  if opts.use_hashing:
      # alternate_sign=False keeps all hashed feature values non-negative;
      # it replaces the removed HashingVectorizer ``non_negative=True`` flag.
      vectorizer = HashingVectorizer(stop_words='english',
                                     alternate_sign=False,
                                     n_features=opts.n_features)
      X_train = vectorizer.transform(train_data)
  else:
      vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                   stop_words='english')
      X_train = vectorizer.fit_transform(train_data)
  duration = time() - t0
  print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
  print("n_samples: %d, n_features: %d" % X_train.shape)
  print()

  print("Extracting features from the test dataset using the same vectorizer")
  t0 = time()
  X_test = vectorizer.transform(test_data)
  duration = time() - t0
  print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
  print("n_samples: %d, n_features: %d" % X_test.shape)
  print()

  if opts.select_chi2:
      print("Extracting %d best features by a chi-squared test" %
            opts.select_chi2)
      t0 = time()
      ch2 = SelectKBest(chi2, k=opts.select_chi2)
      X_train = ch2.fit_transform(X_train, y_train)
      X_test = ch2.transform(X_test)
      print("done in %fs" % (time() - t0))
      print()

  def trim(s):
      """Trim string to fit on terminal (assuming 80-column display)"""
      return s if len(s) <= 80 else s[:77] + "..."

  if opts.use_hashing:
      # hashed features are anonymous buckets; no token names available
      feature_names = None
  else:
      feature_names = np.asarray(vectorizer.get_feature_names())


  loop = 1
  while True:
    results = []
    # NOTE(review): this calls the module-level ``benchmark`` with the data
    # splits rather than a classifier -- confirm its signature matches.
    results.append(benchmark(X_train,y_train,X_test,y_test))

    # fold the test documents into the training set and re-vectorize
    train_data = train_data + test_data
    train_target = train_target + test_target
    X_train = vectorizer.transform(train_data)
    y_train = train_target

    if(loop == 1):
      print('=' * 80)
      indices = np.arange(len(results))
      results = [[x[i] for x in results] for i in range(4)]

      clf_names, score, training_time, test_time = results
      training_time = np.array(training_time) / np.max(training_time)
      test_time = np.array(test_time) / np.max(test_time)

      pl.figure(figsize=(8,4))
      pl.title("Score")
      pl.barh(indices, score, .2, label="score", color='r')
      pl.barh(indices + .3, training_time, .2, label="training time", color='g')
      pl.barh(indices + .6, test_time, .2, label="test time", color='b')
      pl.yticks(())
      pl.legend(loc='best')
      pl.subplots_adjust(left=.25)
      pl.subplots_adjust(top=.95)
      pl.subplots_adjust(bottom=.05)

      for i, c in zip(indices, clf_names):
          pl.text(-.3, i, c)

      pl.show()
      loop = loop + 1
    else:
      # Previously the loop never terminated: after the first (plotted)
      # iteration it kept re-appending the test data and retraining forever.
      break
예제 #9
0
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    print("done in %fs" % (time() - t0))
    print()


def trim(s):
    """Shorten *s* with an ellipsis so it fits an 80-column terminal."""
    if len(s) <= 80:
        return s
    return s[:77] + "..."


# mapping from integer feature name to original token string
# (uses the module-level ``opts`` and ``vectorizer`` set up above)
if opts.use_hashing:
    # hashed features are anonymous buckets; there is no token to map back to
    feature_names = None
else:
    feature_names = np.asarray(vectorizer.get_feature_names())


###############################################################################
# Benchmark classifiers
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
예제 #10
0
    def extract_features(self,
                         df,
                         max_df=0.5,
                         stop_words='english',
                         ngram_range=(1, 1),
                         store_vect=True,
                         random_state=None):
        """Extracts features

        :param df: dataframe with column keys, 'input' and 'label',
                    where 'input' are the contents of articles and 'label' are the desired results/targets
        :param max_df: See TFiDF definition: default 0.5
        :param stop_words: list of stop words for feature extraction. Default is english.
        :param ngram_range: range of gram to be used in type tuple. Default is (1, 1)
        :param store_vect: store vectorizer to pickle
        :param random_state: train_test_split param, RandomState instance or None, optional (default=None)
        :return x_train, x_test, y_train, y_test:
        vectorized sets of train and test with .8:.2 ratio, respectively

        Side effects: sets ``self.X_train``, ``self.X_test``, ``self.y_train``,
        ``self.y_test`` and ``self.feature_names``; optionally pickles the
        fitted vectorizer next to article_classifier.py."""

        # SPLIT TRAIN AND TEST
        print(
            "Extracting features from the training data using a sparse vectorizer"
        )
        t0 = time()

        x_train, x_test, y_train, y_test = train_test_split(
            df['input'], df['label'], random_state=random_state)

        print('Training set: ', len(y_train))
        print('Testing set: ', len(y_test))

        if self.use_hashing and self.n_features:
            vectorizer = HashingVectorizer(stop_words=stop_words,
                                           alternate_sign=False,
                                           n_features=self.n_features,
                                           ngram_range=ngram_range)
            x_train = vectorizer.transform(x_train)
        else:
            vectorizer = TfidfVectorizer(sublinear_tf=True,
                                         max_df=max_df,
                                         stop_words=stop_words,
                                         ngram_range=ngram_range)
            x_train = vectorizer.fit_transform(x_train)

        try:
            duration = time() - t0
            print("done in %fs at %0.3fMB/s" %
                  (duration,
                   df['input'][:int(len(df['input']) * .8)].size / duration))
            print("n_samples: %d, n_features: %d" % x_train.shape)
            print()
        except ZeroDivisionError as err:
            print(err)

        print(
            "Extracting features from the test data using the same vectorizer")
        t0 = time()
        x_test = vectorizer.transform(x_test)
        duration = time() - t0
        try:
            print("done in %fs at %0.3fMB/s" %
                  (duration,
                   df['input'][int(len(df['input']) * .8):].size / duration))
            print("n_samples: %d, n_features: %d" % x_test.shape)
            print()
        except ZeroDivisionError as err:
            print(err)

        # mapping from integer feature name to original token string
        if self.use_hashing:
            feature_names = None
        else:
            feature_names = vectorizer.get_feature_names()

        if self.select_chi2:
            print("Extracting %d best features by a chi-squared test" %
                  self.select_chi2)
            t0 = time()
            ch2 = SelectKBest(chi2, k=self.select_chi2)
            x_train = ch2.fit_transform(x_train, y_train)
            x_test = ch2.transform(x_test)
            if feature_names:
                # Keep only the names of the selected features.  Reassign the
                # *local* list: previously this wrote to self.feature_names,
                # which the block below then clobbered with the full,
                # unselected name list.
                feature_names = [
                    feature_names[i] for i in ch2.get_support(indices=True)
                ]
            print("done in %fs" % (time() - t0))
            print()

        if feature_names:
            self.feature_names = np.asarray(feature_names)

        self.X_train = x_train  # sparse matrix train (train_samples, n_features)
        self.X_test = x_test  # sparse matrix test  (test_samples, n_features)
        self.y_train = y_train  # (train_n_tagets,)
        self.y_test = y_test  # (test_n_targets, )

        if store_vect:
            base, f_name = os.path.split(
                os.path.abspath('article_classifier.py'))
            filename = base + '/' + 'vectorizer.pkl'
            joblib.dump(vectorizer, filename)
            print('Saved ', os.path.abspath(filename))

        return x_train, x_test, y_train, y_test
예제 #11
0
def wordVec():
    """Vectorize 20-newsgroups documents, reduce with LSA and cluster.

    Relies on module-level ``opts`` (use_hashing, use_idf, n_features,
    n_components, minibatch, verbose), ``categories`` and ``true_k``.
    Assumes ``from time import time`` at module level -- TODO confirm.
    """
    dataset = fetch_20newsgroups(subset='all',
                                 categories=categories,
                                 shuffle=True,
                                 random_state=42)

    print("%d documents" % len(dataset.data))
    print("%d categories" % len(dataset.target_names))
    print()

    labels = dataset.target

    print("Extracting features from the training dataset "
          "using a sparse vectorizer")
    # time.clock() was removed in Python 3.8; use the same time() helper
    # as the rest of this function.
    t0 = time()

    # NOTE(review): the original gated these branches on nonsensical calls
    # (sklearn.naive_bayes.check_X_y() etc.); restored to the standard
    # option-driven branching of the sklearn clustering example.
    if opts.use_hashing:
        if opts.use_idf:
            # Perform an IDF normalisation on the output of HashingVectorizer
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False,
                                       norm=None)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           alternate_sign=False,
                                           norm='l2')
    else:
        vectorizer = TfidfVectorizer(max_df=0.5,
                                     max_features=opts.n_features,
                                     min_df=2,
                                     stop_words='english',
                                     use_idf=opts.use_idf)
    X = vectorizer.fit_transform(dataset.data)

    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    print()

    if opts.n_components:
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        # Restored TruncatedSVD(opts.n_components); the original passed an
        # unbound sklearn method here, which cannot have been intended.
        svd = TruncatedSVD(opts.n_components)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)

        X = lsa.fit_transform(X)

        print("done in %fs" % (time() - t0))

        explained_variance = svd.explained_variance_ratio_.sum()
        print("Explained variance of the SVD step: {}%".format(
            int(explained_variance * 100)))

        print()

    if opts.minibatch:
        km = MiniBatchKMeans(n_clusters=true_k,
                             init='k-means++',
                             n_init=1,
                             init_size=1000,
                             batch_size=1000,
                             verbose=opts.verbose)
    else:
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    max_iter=100,
                    n_init=1,
                    verbose=opts.verbose)

    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" %
          metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f" %
          metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(X, km.labels_, sample_size=1000))

    print()

    if not opts.use_hashing:
        print("Top terms per cluster:")

        if opts.n_components:
            # project the centroids back to term space before ranking terms
            original_space_centroids = svd.inverse_transform(
                km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
        else:
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]

        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()
def bench(texts, y, opts):
    """Benchmark a battery of classifiers on (texts, y) and save a bar chart.

    :param texts: iterable of raw document strings
    :param y: matching labels (binary: "no alert" / "alert")
    :param opts: options namespace (use_hashing, n_features, select_chi2)
    Writes a PNG and a timestamped PDF with score/time bars per classifier.
    Relies on module-level ``logger``, ``benchmark`` and
    ``stif_classifier_dataset.preprocessor``.
    """
    data_train, data_test, y_train, y_test = train_test_split(texts,
                                                              y,
                                                              test_size=0.33,
                                                              random_state=42)
    logger.info('data loaded')
    data_train_size_mb = size_mb(data_train)
    data_test_size_mb = size_mb(data_test)
    logger.info("%d documents - %0.3fMB (training set)" %
                (len(data_train), data_train_size_mb))
    logger.info("%d documents - %0.3fMB (test set)" %
                (len(data_test), data_test_size_mb))
    logger.info("\n")
    logger.info(
        "Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        logger.info("Use hashing")
        # alternate_sign=False keeps hashed feature values non-negative;
        # replaces the removed ``non_negative=True`` flag.
        vectorizer = HashingVectorizer(
            preprocessor=stif_classifier_dataset.preprocessor,
            alternate_sign=False,
            n_features=opts.n_features)
        X_train = vectorizer.transform(data_train)
    else:
        vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            max_df=0.5,
            preprocessor=stif_classifier_dataset.preprocessor)
        X_train = vectorizer.fit_transform(data_train)
    duration = time() - t0
    logger.info("done in %fs at %0.3fMB/s" %
                (duration, data_train_size_mb / duration))
    logger.info("n_samples: %d, n_features: %d" % X_train.shape)
    logger.info("\n")
    logger.info(
        "Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test)
    duration = time() - t0
    logger.info("done in %fs at %0.3fMB/s" %
                (duration, data_test_size_mb / duration))
    logger.info("n_samples: %d, n_features: %d" % X_test.shape)
    logger.info("\n")
    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()
    if opts.select_chi2:
        logger.info("Extracting %d best features by a chi-squared test" %
                    opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [
                feature_names[i] for i in ch2.get_support(indices=True)
            ]
        logger.info("done in %fs" % (time() - t0))
        logger.info("\n")
    if feature_names:
        feature_names = np.asarray(feature_names)
    categories = ["no alert", "alert"]
    results = []
    # max_iter replaces the n_iter parameter removed from sklearn estimators.
    for clf, name in ((RidgeClassifier(tol=1e-2,
                                       solver="lsqr"), "Ridge Classifier"),
                      (Perceptron(max_iter=50),
                       "Perceptron"), (PassiveAggressiveClassifier(max_iter=50),
                                       "Passive-Aggressive"),
                      (KNeighborsClassifier(n_neighbors=10),
                       "kNN"), (RandomForestClassifier(n_estimators=100),
                                "Random forest")):
        logger.info('=' * 80)
        logger.info(name)
        clf_descr = str(clf).split('(')[0]
        results.append(
            benchmark(clf, clf_descr, X_train, y_train, X_test, y_test,
                      categories, feature_names, opts))
    for penalty in ["l2", "l1"]:
        logger.info('=' * 80)
        logger.info("%s penalty" % penalty.upper())
        # Train Liblinear model.  'squared_hinge' is the modern spelling of
        # the removed loss='l2' alias, required together with dual=False.
        clf_descr = 'LinearSVC ' + penalty
        results.append(
            benchmark(
                LinearSVC(loss='squared_hinge', penalty=penalty, dual=False,
                          tol=1e-3), clf_descr, X_train, y_train, X_test,
                y_test, categories, feature_names, opts))

        # Train SGD model
        clf_descr = 'SGDClassifier ' + penalty
        results.append(
            benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty),
                      clf_descr, X_train, y_train, X_test, y_test, categories,
                      feature_names, opts))

    # Train SGD with Elastic Net penalty
    logger.info('=' * 80)
    logger.info("Elastic-Net penalty")
    clf_descr = 'SGDClassifier Elastic-Net'
    results.append(
        benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                penalty="elasticnet"), clf_descr, X_train,
                  y_train, X_test, y_test, categories, feature_names, opts))
    # Train NearestCentroid without threshold
    logger.info('=' * 80)
    logger.info("NearestCentroid (aka Rocchio classifier)")
    results.append(
        benchmark(NearestCentroid(), 'NearestCentroid', X_train, y_train,
                  X_test, y_test, categories, feature_names, opts))
    # Train sparse Naive Bayes classifiers
    logger.info('=' * 80)
    logger.info("Naive Bayes")
    results.append(
        benchmark(MultinomialNB(alpha=.01), 'MultinomialNB', X_train, y_train,
                  X_test, y_test, categories, feature_names, opts))
    results.append(
        benchmark(BernoulliNB(alpha=.01), 'BernoulliNB', X_train, y_train,
                  X_test, y_test, categories, feature_names, opts))
    logger.info('=' * 80)
    logger.info("LinearSVC with L1-based feature selection")
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    results.append(
        benchmark(
            Pipeline([('feature_selection',
                       LinearSVC(penalty="l1", dual=False, tol=1e-3)),
                      ('classification', LinearSVC())]),
            'LinearSVC with feature_selection', X_train, y_train, X_test,
            y_test, categories, feature_names, opts))
    # make some plots
    indices = np.arange(len(results))
    # transpose: list of tuples -> (names, scores, train times, test times)
    results = [[x[i] for x in results] for i in range(4)]
    clf_names, score, training_time, test_time = results
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)
    ts = time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%dT%H-%M-%S')
    pdf = PdfPages('stif_classification_' + st + '.pdf')
    fig = plt.figure(figsize=(12, 8))
    ax = plt.subplot(111)
    plt.title("Benchmark de classifieurs")
    ax.barh(indices, score, .2, label="score", color='r')
    ax.barh(indices + .3, training_time, .2, label="training time", color='g')
    ax.barh(indices + .6, test_time, .2, label="test time", color='b')
    plt.yticks(())
    plt.xticks(np.linspace(0.0, 1.0, 11, endpoint=True))
    plt.subplots_adjust(left=.25)
    plt.subplots_adjust(top=.95)
    plt.subplots_adjust(bottom=.05)
    for i, c in zip(indices, clf_names):
        ax.text(-.4, i, c)

    # Shrink current axis by 20%
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    # plt.show()
    fig.savefig('stif_classification_' + st + '.png')
    pdf.savefig()
    pdf.close()
예제 #13
0
    def train(self):
        """Vectorize self.load()'s corpus, benchmark classifiers, plot results.

        Returns ``self._container`` unchanged (TODO: the benchmark results are
        currently only plotted, not stored -- not relevant for paper but
        important).
        """
        X = self.load()

        x_train, x_test, y_train, y_test, indices_train, indices_test = \
            train_test_split(
                X['data'], X['target'], range(0, len(X['data'])), test_size=0.2, random_state=42)
        print('data loaded')

        # order of labels in `target_names` can be different from `categories`
        target_names = X['target_names']

        def size_mb(docs):
            """Total size of the documents in megabytes (UTF-8 encoded)."""
            return sum(len(s.encode('utf-8')) for s in docs) / 1e6

        data_train_size_mb = size_mb(x_train)
        data_test_size_mb = size_mb(x_test)

        print("%d documents - %0.3fMB (training set)" %
              (len(x_train), data_train_size_mb))
        print("%d documents - %0.3fMB (test set)" %
              (len(x_test), data_test_size_mb))
        print("%d categories" % len(target_names))
        print()

        print(
            "Extracting features from the training data using a sparse vectorizer"
        )
        t0 = time()
        # The hashing-vectorizer path was permanently disabled (`if False:`);
        # TF-IDF is the only live branch, so use it directly.
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(x_train)
        duration = time() - t0
        print("done in %fs at %0.3fMB/s" %
              (duration, data_train_size_mb / duration))
        print("n_samples: %d, n_features: %d" % X_train.shape)
        print()

        print(
            "Extracting features from the test data using the same vectorizer")
        t0 = time()
        X_test = vectorizer.transform(x_test)
        duration = time() - t0
        print("done in %fs at %0.3fMB/s" %
              (duration, data_test_size_mb / duration))
        print("n_samples: %d, n_features: %d" % X_test.shape)
        print()

        # mapping from integer feature name to original token string
        feature_names = vectorizer.get_feature_names()

        if feature_names:
            feature_names = np.asarray(feature_names)

        def trim(s):
            """Trim string to fit on terminal (assuming 80-column display)"""
            return s if len(s) <= 80 else s[:77] + "..."

        # #############################################################################
        # Benchmark classifiers
        def benchmark(clf):
            """Fit ``clf``, predict the test set, report metrics and timings."""
            print('_' * 80)
            print("Training: ")
            print(clf)
            t0 = time()
            clf.fit(X_train, y_train)
            train_time = time() - t0
            print("train time: %0.3fs" % train_time)

            t0 = time()
            pred = clf.predict(X_test)
            test_time = time() - t0
            print("test time:  %0.3fs" % test_time)

            score = metrics.accuracy_score(y_test, pred)
            print("accuracy:   %0.3f" % score)

            if hasattr(clf, 'coef_'):
                print("dimensionality: %d" % clf.coef_.shape[1])
                print("density: %f" % density(clf.coef_))
                # (per-class top-10 keyword printing was disabled upstream)
                print()

            print("classification report:")
            print(
                metrics.classification_report(y_test,
                                              pred,
                                              target_names=target_names))

            print("confusion matrix:")
            print(metrics.confusion_matrix(y_test, pred))

            print()
            clf_descr = str(clf).split('(')[0]
            return clf_descr, score, train_time, test_time

        results = []
        # max_iter replaces the n_iter parameter removed from these
        # sklearn estimators.
        for clf, name in ((RidgeClassifier(tol=1e-2,
                                           solver="lsqr"), "Ridge Classifier"),
                          (Perceptron(max_iter=50), "Perceptron"),
                          (PassiveAggressiveClassifier(max_iter=50),
                           "Passive-Aggressive"),
                          (KNeighborsClassifier(n_neighbors=10),
                           "kNN"), (RandomForestClassifier(n_estimators=100),
                                    "Random forest")):
            print('=' * 80)
            print(name)
            results.append(benchmark(clf))

        for penalty in ["l2", "l1"]:
            print('=' * 80)
            print("%s penalty" % penalty.upper())
            # Train Liblinear model
            results.append(
                benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

            # Train SGD model
            results.append(
                benchmark(
                    SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))

        # Train SGD with Elastic Net penalty
        print('=' * 80)
        print("Elastic-Net penalty")
        results.append(
            benchmark(
                SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))

        # Train NearestCentroid without threshold
        print('=' * 80)
        print("NearestCentroid (aka Rocchio classifier)")
        results.append(benchmark(NearestCentroid()))

        # Train sparse Naive Bayes classifiers
        print('=' * 80)
        print("Naive Bayes")
        results.append(benchmark(MultinomialNB(alpha=.01)))
        results.append(benchmark(BernoulliNB(alpha=.01)))

        print('=' * 80)
        print("LinearSVC with L1-based feature selection")
        # The smaller C, the stronger the regularization.
        # The more regularization, the more sparsity.
        results.append(
            benchmark(
                Pipeline([('feature_selection',
                           SelectFromModel(
                               LinearSVC(penalty="l1", dual=False, tol=1e-3))),
                          ('classification', LinearSVC(penalty="l2"))])))

        # make some plots

        indices = np.arange(len(results))

        # transpose: list of tuples -> (names, scores, train times, test times)
        results = [[x[i] for x in results] for i in range(4)]

        clf_names, score, training_time, test_time = results
        training_time = np.array(training_time) / np.max(training_time)
        test_time = np.array(test_time) / np.max(test_time)

        plt.figure(figsize=(12, 8))
        plt.title("Score")
        plt.barh(indices, score, .2, label="score", color='navy')
        plt.barh(indices + .3,
                 training_time,
                 .2,
                 label="training time",
                 color='c')
        plt.barh(indices + .6,
                 test_time,
                 .2,
                 label="test time",
                 color='darkorange')
        plt.yticks(())
        plt.legend(loc='best')
        plt.subplots_adjust(left=.25)
        plt.subplots_adjust(top=.95)
        plt.subplots_adjust(bottom=.05)

        for i, c in zip(indices, clf_names):
            plt.text(-.3, i, c)

        plt.show()

        return self._container
예제 #14
0
def train(opts):
    """Train and persist a LinearSVC text classifier for ``opts.prop_name``.

    Loads train/test splits via ``utils.prepare_data``, vectorizes the raw
    documents (hashing trick or TF-IDF depending on ``opts.use_hashing``),
    optionally keeps the ``opts.select_chi2`` best features by a chi-squared
    test, benchmarks the classifier, and pickles the fitted model and
    vectorizer when ``opts.model_file`` / ``opts.vect_file`` are set.
    """
    print("Loading data...")
    neg2pos_ratio = 1  # negative:positive example ratio passed to prepare_data
    train2all_ratio = 0.5  # fraction of all examples used for training
    core_prop_names = ["recipeInstructions", "ingredients"]
    data_train, data_test = utils.prepare_data(opts.prop_name, core_prop_names,
                                               opts.index_file, neg2pos_ratio,
                                               train2all_ratio)

    categories = ['negative', 'positive']

    def size_mb(docs):
        # Corpus size in megabytes (UTF-8 encoded).
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)
    print("%d documents - %0.3fMB (training set)" %
          (len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" %
          (len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()

    # split a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    if opts.use_hashing:
        # NOTE(review): `non_negative` was removed in modern scikit-learn
        # (replaced by alternate_sign=False); this targets an older release.
        vectorizer = HashingVectorizer(stop_words='english',
                                       non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print("Extracting features from the test data using the same vectorizer")
    X_test = vectorizer.transform(data_test.data)
    print("n_samples: %d, n_features: %d \n" % X_test.shape)

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None  # hashing is one-way: no token strings available
    else:
        feature_names = vectorizer.get_feature_names()

    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [
                feature_names[i] for i in ch2.get_support(indices=True)
            ]
        print("done in %fs" % (time() - t0))
        print()

    if feature_names:
        feature_names = np.asarray(feature_names)

    clf = LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3)
    benchmark(clf, X_train, y_train, X_test, y_test, opts)

    # Save the fitted model and vectorizer for later inference.
    if opts.model_file:
        with open(opts.model_file, "wb") as f:
            pickle.dump(clf, f)
    if opts.vect_file:
        with open(opts.vect_file, "wb") as f:
            pickle.dump(vectorizer, f)
    # Fixed: the original re-ran ch2.fit_transform / ch2.transform here after
    # saving the model, which raised NameError (`ch2` and `t0` undefined)
    # whenever opts.select_chi2 was falsy, and was redundant otherwise.


def trim(s):
    """Truncate *s* to at most 80 characters, ending with "..." when cut."""
    if len(s) <= 80:
        return s
    return s[:77] + "..."


# mapping from integer feature name to original token string
# (hashing is one-way, so no token strings exist in that mode)
feature_names = (None if opts.use_hashing
                 else np.asarray(vectorizer.get_feature_names()))


###############################################################################
# Benchmark classifiers
def benchmark(clf):
    """Fit ``clf`` on module-level ``X_train``/``y_train`` and predict ``X_test``.

    Prints the classifier and its training time. NOTE(review): this fragment
    appears truncated — it starts timing the prediction but never reports the
    test time, scores the predictions, or returns anything; confirm against
    the complete original example.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    # X_train / y_train / X_test are globals produced by the vectorization
    # code elsewhere in this file.
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
"""
print()

cluster_colors = dict()
cluster_name_map = dict()

if not opts.use_hashing:
    print("Top terms per cluster:")

    if opts.n_components:
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
    else:
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    cluster_name = []
    terms = vectorizer.get_feature_names()
    for i in range(num_cluster):
        print("Cluster %d:" % i, end='')
        cluster_colors[i] = randomcolor()
        name = ' '
        for ind in order_centroids[i, :10]:
            name += ' ' + terms[ind]
            print(' %s' % terms[ind], end='')
        cluster_name.append(name)
        cluster_name_map[i] = name.split(",")
        print()

cluster_predicit = km.predict(X)

# ================ Update columns cluster in database ============================
for index, row in news_df.iterrows():
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
if opts.use_hashing:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
    print("done in %fs" % (time() - t0))
    print()
예제 #18
0
def svm(num_cats, train_data, test_data):
    """Vectorize labelled text lines and benchmark LinearSVC (L1/L2 penalties).

    Each element of ``train_data``/``test_data`` is a string whose character
    at index 9 is parsed as the integer class label and whose text starts at
    index 12 — presumably a fixed-width ``__label__N`` prefix format; TODO
    confirm against the caller. Also relies on a module-level ``opts``
    namespace (``use_hashing``, ``select_chi2``, ``print_top10``, etc.)
    rather than taking those options as parameters. Produces console reports
    and a matplotlib bar chart; the function may continue beyond this view.
    """
    # train_path = os.path.join(data_path, "train")
    # test_path = os.path.join(data_path, "test")

    data_train = {}
    data_train["data"] = []
    data_train["target"] = []
    data_train["target_names"] = [str(i) for i in range(num_cats)]
    data_test = {}
    data_test["data"] = []
    data_test["target"] = []
    # with open(train_path, 'r') as train_f, open(test_path) as test_f:
    #     train_data = train_f.readlines()
    #     test_data = test_f.readlines()

    for exam in train_data:
        # Fixed-offset parsing: label char at index 9, body from index 12.
        data_train["data"].append(exam[12:])
        # print(exam)
        data_train["target"].append(int(exam[9]))
    for exam in test_data:
        data_test["data"].append(exam[12:])
        data_test["target"].append(int(exam[9]))

    # Wrap the dicts as attribute-style records (argparse.Namespace) to mimic
    # the sklearn fetch_20newsgroups bunch interface used below.
    data_train = ap.Namespace(**data_train)
    data_test = ap.Namespace(**data_test)

    print('data loaded')

    categories = data_train.target_names  # for case categories == None

    def size_mb(docs):
        # Corpus size in megabytes (UTF-8 encoded).
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" %
          (len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" %
          (len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print(data_train.data[:10])

    # split a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    print(
        "Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        # NOTE(review): `non_negative` was removed in modern scikit-learn
        # (replaced by alternate_sign=False); this targets an older release.
        vectorizer = HashingVectorizer(stop_words='english',
                                       non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" %
          (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" %
          (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None  # hashing is one-way: no token strings available
    else:
        feature_names = vectorizer.get_feature_names()

    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [
                feature_names[i] for i in ch2.get_support(indices=True)
            ]
        print("done in %fs" % (time() - t0))
        print()

    if feature_names:
        feature_names = np.asarray(feature_names)

    def trim(s):
        """Trim string to fit on terminal (assuming 80-column display)"""
        return s if len(s) <= 80 else s[:77] + "..."

    ###############################################################################
    # Benchmark classifiers
    def benchmark(clf):
        """Fit, predict, and report accuracy/report/confusion for ``clf``.

        Closes over X_train/y_train/X_test/y_test/feature_names/categories
        defined above. Returns (name, accuracy, train_time, test_time).
        """
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)

        score = metrics.accuracy_score(y_test, pred)
        print("accuracy:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

            if opts.print_top10 and feature_names is not None:
                # Highest-weight features per class from the linear model.
                print("top 10 keywords per class:")
                for i, category in enumerate(categories):
                    top10 = np.argsort(clf.coef_[i])[-10:]
                    print(
                        trim("%s: %s" %
                             (category, " ".join(feature_names[top10]))))
            print()

        if opts.print_report:
            print("classification report:")
            print(
                metrics.classification_report(y_test,
                                              pred,
                                              target_names=categories))

        if opts.print_cm:
            print("confusion matrix:")
            print(metrics.confusion_matrix(y_test, pred))

        print()
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time

    results = []
    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        # NOTE(review): loss='l2' is a long-deprecated alias (now
        # 'squared_hinge'); it raises on current scikit-learn — confirm the
        # pinned sklearn version before running.
        results.append(
            benchmark(
                LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)))

    # make some plots

    indices = np.arange(len(results))

    # Transpose the list of (name, score, train_time, test_time) tuples
    # into four parallel lists.
    results = [[x[i] for x in results] for i in range(4)]

    clf_names, score, training_time, test_time = results
    # Normalize times to [0, 1] so they share an axis with the scores.
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)

    plt.figure(figsize=(12, 8))
    plt.title("Score")
    plt.barh(indices, score, .2, label="score", color='r')
    plt.barh(indices + .3, training_time, .2, label="training time", color='g')
    plt.barh(indices + .6, test_time, .2, label="test time", color='b')
    plt.yticks(())
    plt.legend(loc='best')
    plt.subplots_adjust(left=.25)
    plt.subplots_adjust(top=.95)
    plt.subplots_adjust(bottom=.05)

    for i, c in zip(indices, clf_names):
        plt.text(-.3, i, c)

    plt.show()