__author__ = 'pierregagliardi'

import multiprocessing
import os
import pickle
import time

import numpy as np
from sklearn import metrics, svm
from sklearn.feature_selection import VarianceThreshold

from projet_sentiment_analysis.code.utilities import extract_data


def unigram_selection(i, path_to_training_set, path_to_pickle):
    """Train a linear SVM on one indexed split of the unigram feature
    sets and pickle the true/predicted test labels for later analysis."""

    (X_train, y_train, X_test, y_test, number_training,
     number_testing) = extract_data.extract_training_and_testing_set(
         path_to_training_set + 'metrics_training_set_%d.data' % i,
         path_to_training_set + 'metrics_testing_set_%d.data' % i)

    # Sanity check: number of features per training sample.
    print(len(X_train[0]))

    clf = svm.SVC(C=1, cache_size=2000, class_weight=None, coef0=0.0,
                  degree=3, gamma=0.1, kernel='linear', max_iter=-1,
                  probability=False, shrinking=True, tol=0.001, verbose=False)

    clf.fit(X_train, y_train)


    print()
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set: %d" % number_training)
    print("The scores are computed on the full evaluation set: %d" % number_testing)
    print()
    y_true = y_test
    y_prediction = clf.predict(X_test)

    print(metrics.classification_report(y_true, y_prediction))

    # Stack true and predicted labels row-wise so both can be pickled together.
    clf_metrics = np.vstack((y_true, y_prediction))

    with open(path_to_pickle + '60000_all_features_%d.pkl' % i, 'wb') as fid:
        pickle.dump(clf_metrics, fid)
    print()
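

def run_all_folds(path_to_training_set, path_to_pickle, n_folds):
    # Hypothetical driver, not part of the original script: it simply calls
    # unigram_selection for each split index i, assuming the corresponding
    # metrics_training_set_i.data / metrics_testing_set_i.data files exist.
    for i in range(n_folds):
        unigram_selection(i, path_to_training_set, path_to_pickle)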

if __name__ == "__main__":

    general_path = '/Users/pierregagliardi/DossierTravail/Programmation/PythonPath/projet_sentiment_analysis/'
    path_to_training_set = general_path + 'training_set_60000/training_set_unigram_all_features/'
    path_to_pickle = general_path + 'pickle_hyper_parameters/'

    (X_train, y_train, X_test, y_test, number_training,
     number_testing) = extract_data.extract_training_and_testing_set(
         path_to_training_set + 'metrics_training_set_7000.data',
         path_to_training_set + 'metrics_testing_set_7000.data')

    # Drop near-constant Boolean features: VarianceThreshold with
    # p * (1 - p), p = .999, removes every feature whose value is
    # identical in more than 99.9% of the training samples.
    sel = VarianceThreshold(threshold=(.999 * (1 - .999)))
    X_train = sel.fit_transform(X_train)
    X_test = sel.transform(X_test)

    with open(path_to_pickle + 'metrics_60000_all_features_7000.pkl',
              'wb') as fid:
        pickle.dump((X_train, y_train, X_test, y_test), fid)
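
    # Illustrative round-trip (not in the original script): reload the
    # reduced matrices exactly as they were pickled above.
    with open(path_to_pickle + 'metrics_60000_all_features_7000.pkl',
              'rb') as fid:
        X_train, y_train, X_test, y_test = pickle.load(fid)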

def svm_accuracy(C, gamma, X_train, y_train, X_test, y_test):
    # Reconstructed head (the original was lost from this fragment); name,
    # signature, and RBF kernel are inferred from the body and grid below.
    start_time = time.time()
    clf = svm.SVC(C=C, gamma=gamma, kernel='rbf').fit(X_train, y_train)
    y_prediction = clf.predict(X_test)
    end_time = time.time()
    print('The code for file ' + os.path.split(__file__)[1] +
          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    score = metrics.accuracy_score(y_test, y_prediction)
    print("Parameters are %s, %s with a score of %0.2f" % (C, gamma, score))
    return score

if __name__ == "__main__":


    ####### home computer path ######
    general_path='/Users/pierregagliardi/DossierTravail/Programmation/PythonPath/projet_sentiment_analysis/'
    path_to_training_set=general_path+'training_set_60000/training_set_unigram_all_features/'
    path_to_pickle=general_path+'pickle_hyper_parameters/'
    (X_train, y_train, X_test, y_test,number_training, number_testing)= extract_data.extract_training_and_testing_set(
        path_to_training_set+'metrics_training_set_1000.data',
        path_to_training_set+'metrics_testing_set_1000.data')


    # Check how many CPUs are available to parallelise the grid search.
    print(multiprocessing.cpu_count())

    # For an initial search, a logarithmic grid with base 10 is often
    # helpful. With base 2, finer tuning can be achieved, but at a much
    # higher cost.

    C_range = np.logspace(0, 3, 4)
    gamma_range = np.logspace(-3, -1, 3)
    param_grid = dict(gamma=gamma_range, C=C_range)
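
    # The search itself is missing from this fragment. A minimal sketch of
    # how param_grid would typically be consumed, assuming scikit-learn's
    # GridSearchCV with the folds parallelised across all available cores:
    from sklearn.model_selection import GridSearchCV

    grid = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=param_grid,
                        cv=3, n_jobs=multiprocessing.cpu_count())
    grid.fit(X_train, y_train)
    print("Best parameters are %s with a score of %0.2f"
          % (grid.best_params_, grid.best_score_))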