def extract_username_socling_features(username):
    normalized_tokens = pp.tokenize_and_normalize(username)

    if normalized_tokens:
        feature_list = [
            find_male_name(normalized_tokens),
            find_female_name(normalized_tokens),
            find_male_nickname(normalized_tokens),
            find_female_nickname(normalized_tokens),
            find_male_key_word(normalized_tokens),
            find_female_key_word(normalized_tokens),
            starts_with_o(normalized_tokens),
            starts_with_a(normalized_tokens),
            repeated_alphabet(pp.normalize(username)),
            caps(pp.normalize(username))
        ]
        return feature_list
    else:
        return [0] * 10
def extract_description_socling_features(description):
    if description is not None:
        normalized_tokens = pp.tokenize_and_normalize(description)

        feature_list = [
            find_male_name(normalized_tokens),
            find_female_name(normalized_tokens),
            find_male_nickname(normalized_tokens),
            find_female_nickname(normalized_tokens),
            find_male_key_word(normalized_tokens),
            find_female_key_word(normalized_tokens),
            repeated_alphabet(pp.normalize(description)),
            caps(pp.normalize(description)),
            possessive_bigrams(normalized_tokens),
            find_snapchat_link(pp.normalize(description)),
            find_instagram_link(pp.normalize(description)),
            find_tumblr_link(pp.normalize(description))
        ]
    else:
        feature_list = [0] * 12

    return feature_list
def extract_tweet_socling_features(tweet):
    if tweet is not None:
        text = pp.normalize(tweet)

        feature_list = [
            find_ellipses(text),
            possessive_bigrams(pp.tokenize_and_normalize(tweet)),
            find_self_mentions(text),
            caps(text),
            find_affirmation(text),
            find_laughter(text),
            find_exclaim(text),
            find_question(text),
            repeated_alphabet(text)
        ]
    else:
        feature_list = [0] * 9

    return feature_list
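
Taken together, the three extractors above return 10, 12 and 9 socio-linguistic features respectively. A minimal combining sketch (the helper name is hypothetical, assuming the `pp` module and the `find_*` functions are in scope as above):

# Hypothetical helper, not part of the original module: concatenates the
# username (10), description (12) and tweet (9) feature lists into one
# 31-dimensional vector suitable for a downstream classifier.
def extract_all_socling_features(username, description, tweet):
    return (extract_username_socling_features(username) +
            extract_description_socling_features(description) +
            extract_tweet_socling_features(tweet))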
Example #4
                    print 'gaussianization on idx %d failed' % idx
                    print e
        else:
            try:
                rdata = gaussianizer(data)
                data = np.array(rdata)
            except Exception, e:
                print 'gaussianization failed'
                print e

        gaussianizer.mean = data.mean(axis=1)
        gaussianizer.std = data.std(axis=1)

    if pre_processing == 'normalize':
        print 'normalizing data'
        normalizer = normalize(data)

    lowest_bic = np.infty
    bic = []

    n_components_range = range(1, 10)

    cv_types = ['spherical', 'tied', 'diag', 'full']
    for cv_type in cv_types:
        for n_components in n_components_range:
            # Fit a mixture of gaussians with EM
            gmm = GMM(n_components=n_components, covariance_type=cv_type)
            gmm.fit(data)
            bic.append(gmm.bic(data))
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
'''
import csv
import os
from pre_processing import normalize

name_dict = {}
abrv_dict = {}
key_words_dict = {}

csvs_dir = os.path.dirname(__file__)

with open(csvs_dir + '/Lista_de_Nomes_Portugueses.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')

    for row in reader:
        name = normalize(row[0].decode('utf-8').lower().strip())

        if row[2] == 'Sim':
            if 'M' in row:
                name_dict[name] = 'M'
            elif 'F' in row:
                name_dict[name] = 'F'
            else:
                name_dict[name] = 'U'

with open(csvs_dir + '/Lista_de_Abreviaturas_Portuguesas.csv',
          'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')

    for row in reader:
        normalized_name = normalize(row[0].decode('utf-8').lower().strip())
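
The loop over the abbreviations CSV is truncated in this example. For context, the find_male_name/find_female_name features used by the extractors presumably consult the name_dict built here; one plausible implementation, sketched under that assumption (the binary return convention is assumed, not taken from the original):

# Sketch (assumed, not from the original source): binary feature that fires
# when any normalized token is listed as a male first name ('M') in name_dict.
def find_male_name(normalized_tokens):
    return int(any(name_dict.get(token) == 'M' for token in normalized_tokens))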
Example #6
def collect_train_and_test_data(location_id, door_count_placement_view_pair, trainPer, testPer, features, timezone = None, pre_processing=''):

  standardizer, normalizer, gaussianizer = None, None, None
  trainStart = trainPer[0]
  trainEnd = trainPer[1]
  predStart = testPer[0]
  predEnd = testPer[1]

  if all([isinstance(location_id, list), isinstance(door_count_placement_view_pair, list), isinstance(timezone,list)]):
    train_location_id = location_id[0]
    train_placement_view_pair = door_count_placement_view_pair[0]
    train_timezone = timezone[0]
    test_location_id = location_id[1]
    test_placement_view_pair = door_count_placement_view_pair[1]
    test_timezone = timezone[1]
  else:
    train_location_id = location_id
    train_placement_view_pair = door_count_placement_view_pair
    train_timezone = timezone
    test_location_id = location_id
    test_placement_view_pair = door_count_placement_view_pair
    test_timezone = timezone

  print train_placement_view_pair, test_placement_view_pair

  train_start_time = createDay(trainStart, train_timezone)
  train_end_time = createDay(trainEnd, train_timezone)

  print train_location_id, train_placement_view_pair, train_start_time, train_end_time
  print '\nGetting train data'
  print 'pre-processing step'
  if 'standardize' in pre_processing:
    standardizer = Standardizer(copy=True, with_mean=True, with_std=True)
  if 'gaussianize' in pre_processing:
    gaussianizer = robjects.r('Gaussianize')

  train_X, train_Y = collectData(train_location_id, train_placement_view_pair, train_start_time, train_end_time, features, adjusted=True)

  if standardizer is not None:
    train_X = standardizer.transform(train_X, copy=None)
  if gaussianizer is not None:
    from rpy2.robjects.numpy2ri import numpy2ri
    robjects.conversion.py2ri = numpy2ri
    rtrain_X = gaussianizer(train_X)
    train_X = np.array(rtrain_X)
    gaussianizer.mean = train_X.mean(axis=1)
    gaussianizer.std = train_X.std(axis=1)

  #add ones column
  print 'adding constant to train_X'
  if len(train_X.shape) > 1:
    ones_array = np.ones((train_X.shape[0],1))
    train_X = np.append(train_X, ones_array, 1)
  else:
    train_X = np.dstack((train_X, np.ones(len(train_X))))
    if len(train_X.shape) == 3:
      train_X = train_X[0]

  if pre_processing == 'normalize':
    print 'normalizing data'
    normalizer = normalize(train_X)

  print '\nGetting test data'
  if trainPer == testPer:
    test_X = train_X
    test_Y = train_Y
  else:
    test_start_time = createDay(predStart, test_timezone)
    test_end_time = createDay(predEnd, test_timezone)
    print test_location_id, test_placement_view_pair, test_start_time, test_end_time
    test_X, test_Y = collectData(test_location_id, test_placement_view_pair, test_start_time, test_end_time, features, adjusted=True)

    #pre process data
    if pre_processing == 'standardize':
      test_X = standardizer.transform(test_X, copy=None)
    if pre_processing == 'normalize':
      test_X = normalizer.transform(test_X, copy=None)
    if pre_processing == 'gaussianize':
      test_X = (test_X - gaussianizer.mean)/gaussianizer.std

    #add ones column
    print 'adding constant to test_X'
    if len(test_X.shape) > 1:
      ones_array = np.ones((test_X.shape[0],1))
      test_X = np.append(test_X, ones_array, 1)
    else:
      test_X = np.dstack((test_X, np.ones(len(test_X))))
      if len(test_X.shape) == 3:
        test_X = test_X[0]

  return ((train_X, train_Y), (test_X, test_Y))
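
A hedged usage sketch of the call above; the location id, placement/view pair, date bounds and feature label are placeholders, and the date format expected by createDay is assumed:

# Placeholder arguments for illustration only.
(train_X, train_Y), (test_X, test_Y) = collect_train_and_test_data(
    '55',                          # location_id
    ('3333230', '0'),              # door_count_placement_view_pair
    ('2014-01-01', '2014-01-31'),  # trainPer (start, end) - format assumed
    ('2014-02-01', '2014-02-07'),  # testPer (start, end) - format assumed
    ['occupancy'],                 # features - label assumed
    timezone='US/Pacific',
    pre_processing='standardize')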
Example #7
def run(location_id,
        door_count_placement_view_pair,
        start_time,
        end_time,
        features,
        n_components=16,
        pre_processing='',
        BALANCE_DATA=False):
    """ Fits data to one GMM and plots confusion matrix, prediction and error ellipses
  location_id: location_id of installation, eg '55' <int> or <str>
  door_count_placement_view_pair: placement and view id pair, e.g. ('3333230','0') (<str>, <str>)
  start_time: for prunning. time object with hour and minute time <time>
  end_time: for prunning. time object with hour and minute time <time>
  features: list with label (keys) of features used, [<str>,<str>...]
  n_components: number of mixture components 
  pre_processing: pre-processing to be applied; accepts 'standardize' and 'gaussianize' with default values <str>
  BALANCE_DATA: hack for trying to balance the data <bool>
  """

    global plotEE, plotPF, plotCM

    n_folds = 4

    print '\npre-processing step'
    standardizer = None
    gaussianizer = None
    if 'standardize' in pre_processing:
        standardizer = Standardizer(copy=True, with_mean=True, with_std=True)
    if 'gaussianize' in pre_processing:
        gaussianizer = robjects.r('Gaussianize')

    train_X, train_Y = collectData(location_id,
                                   door_count_placement_view_pair,
                                   start_time,
                                   end_time,
                                   features,
                                   adjusted=True,
                                   pre_processor=standardizer)

    if standardizer is not None:
        train_X = standardizer.transform(train_X, copy=None)
    if gaussianizer is not None:
        from rpy2.robjects.numpy2ri import numpy2ri
        robjects.conversion.py2ri = numpy2ri
        rtrain_X = gaussianizer(train_X)
        train_X = np.array(rtrain_X)
        gaussianizer.mean = train_X.mean(axis=1)
        gaussianizer.std = train_X.std(axis=1)

    #add ones column
    print 'adding constant to train_X'
    if len(train_X.shape) > 1:
        ones_array = np.ones((train_X.shape[0], 1))
        train_X = np.append(train_X, ones_array, 1)
    else:
        train_X = np.dstack((train_X, np.ones(len(train_X))))
        if len(train_X.shape) == 3:
            train_X = train_X[0]

    if pre_processing == 'normalize':
        print 'normalizing data'
        normalizer = normalize(train_X)

    #bind the features/targets collected above to the names used below
    data, truth = train_X, train_Y

    #use sqrt of target to reduce hypothesis space
    truth[truth < 0] = 0
    truth = np.sqrt(truth).astype(int)
    truth_str = map(str, truth)

    #hack to better balance the data
    if BALANCE_DATA:
        bins = np.bincount(truth)
        avg = bins.mean()
        dev = bins.std()
        tol = 10
        tosmall = np.where(bins < avg - tol)[0]
        tobig = np.where(bins > avg + tol)[0]

        for item in tosmall:
            data = np.delete(data, np.where(truth == item)[0], axis=0)
            truth = np.delete(truth, np.where(truth == item)[0])

        for item in tobig:
            data = np.delete(data, np.where(truth == item)[0], axis=0)
            truth = np.delete(truth, np.where(truth == item)[0])

    #unbalanced targets: can't use StratifiedKFold and, more importantly, GMM assumes equal prior probability for all classes
    print 'Sample size is', len(truth)
    folds = KFold(len(truth),
                  n_folds=n_folds)  #shuffle=True, random_state=4
    #to only take the first fold
    #train_index, test_index = next(iter(folds))

    #for plotting
    idx = 1

    for train_index, test_index in folds:
        X_train = data[train_index]
        y_train = truth[train_index]
        X_test = data[test_index]
        y_test = truth[test_index]

        # Try GMMs using different types of covariances.
        classifiers = dict(
            (covar_type,
             GMM(n_components=n_components,
                 covariance_type=covar_type,
                 params='wmc',
                 init_params='wc',
                 n_iter=10000))
            for covar_type in ['spherical', 'diag', 'tied', 'full'])

        n_classifiers = len(classifiers)

        if plotEE:
            plt.figure(idx, figsize=(3 * n_classifiers / 2, 6))
            plt.subplots_adjust(bottom=.01,
                                top=0.95,
                                hspace=.15,
                                wspace=.05,
                                left=.01,
                                right=.99)

        for index, (name,
                    classifier) in enumerate(classifiers.iteritems()):
            #start classifier with known means; init_params='wc' keeps them fixed during init
            classifier.means_ = np.array([
                X_train[y_train == i].mean(axis=0)
                for i in np.unique(y_train)
            ])
            classifier.fit(X_train)

            y_train_pred = classifier.predict(X_train)
            y_train_pred = y_train_pred.astype(int)

            yresid = y_train - y_train_pred
            SSresid = np.sum(yresid**2)
            SStotal = (len(y_train) - 1) * np.var(y_train)
            train_accuracy = 1 - SSresid / SStotal  #rsq instead of np.mean(y_train_pred.ravel() == y_train.ravel()) * 100

            y_test_pred = classifier.predict(X_test)
            y_test_pred = y_test_pred.astype(int)

            yresid = y_test - y_test_pred
            SSresid = np.sum(yresid**2)
            SStotal = (len(y_test) - 1) * np.var(y_test)
            test_accuracy = 1 - SSresid / SStotal  #rsq instead of np.mean(y_test_pred.ravel() == y_test.ravel()) * 100
            """
            if features not in results:
                results[features] = {}
                results[features][name] = (train_accuracy, test_accuracy)
            """

            if plotEE:
                plt.figure(idx, figsize=(3 * n_classifiers / 2, 6))
                plt.subplots_adjust(bottom=.01,
                                    top=0.95,
                                    hspace=.15,
                                    wspace=.05,
                                    left=.01,
                                    right=.99)
                fig1 = plt.subplot(2, n_classifiers / 2, index + 1)

                make_ellipses(classifier, fig1)

                for n, color in enumerate('rgb'):
                    sample = data[truth == n]
                    plt.scatter(sample[:, 0],
                                sample[:, 1],
                                0.8,
                                color=color,
                                label=truth_str[n])

                for n, color in enumerate('rgb'):
                    sample = X_test[y_test == n]
                    plt.plot(sample[:, 0], sample[:, 1], 'x', color=color)

                plt.text(0.05,
                         0.9,
                         'Train accuracy: %.2f' % train_accuracy,
                         transform=fig1.transAxes)

                plt.text(0.05,
                         0.8,
                         'Test accuracy: %.2f' % test_accuracy,
                         transform=fig1.transAxes)

                plt.xticks(())
                plt.yticks(())
                plt.title(name)

            if plotPF:
                #plot Ground Truth and Prediction
                x_train = np.array(range(len(y_train)))
                x_test = np.array(range(len(y_test)))

                fig2, ax2 = plt.subplots(2,
                                         1,
                                         figsize=(3 * n_classifiers / 2,
                                                  6))

                ax2[0].plot(x_train, y_train, label='Train Ground Truth')
                ax2[0].plot(x_train,
                            y_train_pred,
                            label='Train Prediction')
                ax2[1].plot(x_test, y_test, label='Test Ground Truth')
                ax2[1].plot(x_test, y_test_pred, label='Test Prediction')
                """
                ax2[0].plot(x_train, y_train, label='Train Ground Truth', linestyle='none', marker='o')
                ax2[0].plot(x_train, y_train_pred, label='Train Prediction', linestyle='none', marker='o')
                ax2[1].plot(x_test, y_test, label='Test Ground Truth', linestyle='none', marker='o')
                ax2[1].plot(x_test, y_test_pred, label='Test Prediction', linestyle='none', marker='o')
                """
                ax2[0].legend(loc='upper right',
                              prop=dict(size=12),
                              numpoints=1)
                ax2[0].set_title(str(features))
                ax2[0].set_xlabel('time')
                ax2[0].set_ylabel('occupancy')
                ax2[0].grid()

                ax2[1].legend(loc='upper right',
                              prop=dict(size=12),
                              numpoints=1)
                ax2[1].set_title(str(features))
                ax2[1].set_xlabel('time')
                ax2[1].set_ylabel('occupancy')
                ax2[1].grid()

                print 'y_train \n', y_train
                print 'y_train_pred \n', y_train_pred
                print 'y_test \n', y_test
                print 'y_test_pred \n', y_test_pred

            if plotCM:
                # Plot confusion matrices in a separate window
                cm = confusion_matrix(y_train, y_train_pred)

                plt.matshow(cm)
                plt.title('Confusion matrix')
                plt.colorbar()
                plt.ylabel('True label')
                plt.xlabel('Predicted label')
                plt.show()

        #plt.figure(idx)
        #plt.legend(loc='lower right', prop=dict(size=12))

        idx += 1

    if plotEE or plotPF or plotCM:
        plt.show()
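
The pseudo-R² "accuracy" above is computed twice with identical code; a refactoring sketch (not part of the original) that factors it into a helper:

import numpy as np

def r_squared(y_true, y_pred):
    """Coefficient of determination, matching the inline SSresid/SStotal above."""
    ss_resid = np.sum((y_true - y_pred) ** 2)
    ss_total = (len(y_true) - 1) * np.var(y_true)
    return 1 - ss_resid / ss_total

Example #8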
def plot_features(location_id, door_count_placement_view_pair, start_time, end_time, features, pre_processing=''):
  """Plots features in pairs by incremental index, e.g. (0,1), (2,3)...
  ARGS
    location_id: location_id of installation, eg '55' <int> or <str>
    door_count_placement_view_pair: placement and view id pair, e.g. ('3333230','0') (<str>, <str>)
    start_time: for prunning. time object with hour and minute time <time>
    end_time: for prunning. time object with hour and minute time <time>
    features: list with label (keys) of features used, [<str>,<str>...]
    pre_processing: pre-processing to be applied; accepts 'regularization' and 'pre_emphasis' with default values <str>
  """
  print 'pre-processing step'
  standardizer, normalizer, gaussianizer = None, None, None
  dict_of_features = {}

  for feature in features :
    for processing in ['standardize']:
      pre_processing = [processing]
      if 'standardize' in pre_processing:
        standardizer = Standardizer(copy=True, with_mean=True, with_std=True)
      if 'gaussianize' in pre_processing:
        gaussianizer = robjects.r('Gaussianize')

      print 'data mining step'
      data, _ = collectData(location_id, door_count_placement_view_pair, start_time, end_time, [feature], adjusted=False, pre_processor=standardizer)

      if standardizer is not None:
        print 'standardizer'
        data = standardizer.transform(data, copy=True)

      if gaussianizer is not None:
        print 'gaussianize'
        from rpy2.robjects.numpy2ri import numpy2ri
        robjects.conversion.py2ri = numpy2ri
        #this is slow and hacky!
        if data.ndim == 2 and data.shape[1] > 1 :
          data_transposed = data.T
          for idx in range(len(data_transposed)):
            try:
              rdata = gaussianizer(data_transposed[idx])
              gaussianized_data = np.array(rdata)
              gaussianized_data = gaussianized_data.reshape((len(gaussianized_data),))
              data[:,idx] = gaussianized_data
            except Exception, e:
              print 'gaussianization on idx %d failed' % idx
              print e
        else:
          try:
            rdata = gaussianizer(data)
            data = np.array(rdata)
          except Exception, e:
            print 'gaussianization failed'
            print e

          gaussianizer.mean = data.mean(axis=1)
          gaussianizer.std = data.std(axis=1)

      if 'normalize' in pre_processing:
        print 'normalizing data'
        normalizer = normalize(data)

      dict_of_features[feature+'_'+pre_processing[0]] = data
Example #9
def plot_features(location_id,
                  door_count_placement_view_pair,
                  start_time,
                  end_time,
                  features,
                  pre_processing=''):
    """Plots features in pairs by incremental index, e.g. (0,1), (2,3)...
  ARGS
    location_id: location_id of installation, eg '55' <int> or <str>
    door_count_placement_view_pair: placement and view id pair, e.g. ('3333230','0') (<str>, <str>)
    start_time: for prunning. time object with hour and minute time <time>
    end_time: for prunning. time object with hour and minute time <time>
    features: list with label (keys) of features used, [<str>,<str>...]
    pre_processing: pre-processing to be applied; accepts 'regularization' and 'pre_emphasis' with default values <str>
  """
    print 'pre-processing step'
    standardizer, normalizer, gaussianizer = None, None, None
    dict_of_features = {}

    for feature in features:
        for processing in ['standardize']:
            pre_processing = [processing]
            if 'standardize' in pre_processing:
                standardizer = Standardizer(copy=True,
                                            with_mean=True,
                                            with_std=True)
            if 'gaussianize' in pre_processing:
                gaussianizer = robjects.r('Gaussianize')

            print 'data mining step'
            data, _ = collectData(location_id,
                                  door_count_placement_view_pair,
                                  start_time,
                                  end_time, [feature],
                                  adjusted=False,
                                  pre_processor=standardizer)

            if standardizer is not None:
                print 'standardizer'
                data = standardizer.transform(data, copy=True)

            if gaussianizer is not None:
                print 'gaussianize'
                from rpy2.robjects.numpy2ri import numpy2ri
                robjects.conversion.py2ri = numpy2ri
                #this is slow and hacky!
                if data.ndim == 2 and data.shape[1] > 1:
                    data_transposed = data.T
                    for idx in range(len(data_transposed)):
                        try:
                            rdata = gaussianizer(data_transposed[idx])
                            gaussianized_data = np.array(rdata)
                            gaussianized_data = gaussianized_data.reshape(
                                (len(gaussianized_data), ))
                            data[:, idx] = gaussianized_data
                        except Exception, e:
                            print 'gaussianization on idx %d failed' % idx
                            print e
                else:
                    try:
                        rdata = gaussianizer(data)
                        data = np.array(rdata)
                    except Exception, e:
                        print 'gaussianization failed'
                        print e

                    gaussianizer.mean = data.mean(axis=1)
                    gaussianizer.std = data.std(axis=1)

            if 'normalize' in pre_processing:
                print 'normalizing data'
                normalizer = normalize(data)

            dict_of_features[feature + '_' + pre_processing[0]] = data
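
plot_features is truncated before any plotting happens; a sketch of the pairwise plotting loop its docstring describes (the column pairing and the matplotlib calls are assumptions, not recovered from the original):

import matplotlib.pyplot as plt

def plot_feature_pairs(dict_of_features):
    # Sketch (assumed): scatter consecutive column pairs (0,1), (2,3), ...
    # of each collected feature matrix in dict_of_features.
    for label, data in dict_of_features.items():
        if data.ndim < 2 or data.shape[1] < 2:
            continue  # nothing to pair for single-column features
        for i in range(0, data.shape[1] - 1, 2):
            plt.figure()
            plt.scatter(data[:, i], data[:, i + 1], s=0.8)
            plt.title('%s: features (%d, %d)' % (label, i, i + 1))
    plt.show()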
Example #10
def collect_train_and_test_data(location_id,
                                door_count_placement_view_pair,
                                trainPer,
                                testPer,
                                features,
                                timezone=None,
                                pre_processing=''):

    standardizer, normalizer, gaussianizer = None, None, None
    trainStart = trainPer[0]
    trainEnd = trainPer[1]
    predStart = testPer[0]
    predEnd = testPer[1]

    if all([
            isinstance(location_id, list),
            isinstance(door_count_placement_view_pair, list),
            isinstance(timezone, list)
    ]):
        train_location_id = location_id[0]
        train_placement_view_pair = door_count_placement_view_pair[0]
        train_timezone = timezone[0]
        test_location_id = location_id[1]
        test_placement_view_pair = door_count_placement_view_pair[1]
        test_timezone = timezone[1]
    else:
        train_location_id = location_id
        train_placement_view_pair = door_count_placement_view_pair
        train_timezone = timezone
        test_location_id = location_id
        test_placement_view_pair = door_count_placement_view_pair
        test_timezone = timezone

    print train_placement_view_pair, test_placement_view_pair

    train_start_time = createDay(trainStart, train_timezone)
    train_end_time = createDay(trainEnd, train_timezone)

    print train_location_id, train_placement_view_pair, train_start_time, train_end_time
    print '\nGetting train data'
    print 'pre-processing step'
    if 'standardize' in pre_processing:
        standardizer = Standardizer(copy=True, with_mean=True, with_std=True)
    if 'gaussianize' in pre_processing:
        gaussianizer = robjects.r('Gaussianize')

    train_X, train_Y = collectData(train_location_id,
                                   train_placement_view_pair,
                                   train_start_time,
                                   train_end_time,
                                   features,
                                   adjusted=True)

    if standardizer is not None:
        train_X = standardizer.transform(train_X, copy=None)
    if gaussianizer is not None:
        from rpy2.robjects.numpy2ri import numpy2ri
        robjects.conversion.py2ri = numpy2ri
        rtrain_X = gaussianizer(train_X)
        train_X = np.array(rtrain_X)
        gaussianizer.mean = train_X.mean(axis=1)
        gaussianizer.std = train_X.std(axis=1)

    #add ones column
    print 'adding constant to train_X'
    if len(train_X.shape) > 1:
        ones_array = np.ones((train_X.shape[0], 1))
        train_X = np.append(train_X, ones_array, 1)
    else:
        train_X = np.dstack((train_X, np.ones(len(train_X))))
        if len(train_X.shape) == 3:
            train_X = train_X[0]

    if pre_processing == 'normalize':
        print 'normalizing data'
        normalizer = normalize(train_X)

    print '\nGetting test data'
    if trainPer == testPer:
        test_X = train_X
        test_Y = train_Y
    else:
        test_start_time = createDay(predStart, test_timezone)
        test_end_time = createDay(predEnd, test_timezone)
        print test_location_id, test_placement_view_pair, test_start_time, test_end_time
        test_X, test_Y = collectData(test_location_id,
                                     test_placement_view_pair,
                                     test_start_time,
                                     test_end_time,
                                     features,
                                     adjusted=True)

        #pre process data
        if pre_processing == 'standardize':
            test_X = standardizer.transform(test_X, copy=None)
        if pre_processing == 'normalize':
            test_X = normalizer.transform(test_X, copy=None)
        if pre_processing == 'gaussianize':
            test_X = (test_X - gaussianizer.mean) / gaussianizer.std

        #add ones column
        print 'adding constant to test_X'
        if len(test_X.shape) > 1:
            ones_array = np.ones((test_X.shape[0], 1))
            test_X = np.append(test_X, ones_array, 1)
        else:
            test_X = np.dstack((test_X, np.ones(len(test_X))))
            if len(test_X.shape) == 3:
                test_X = test_X[0]

    return ((train_X, train_Y), (test_X, test_Y))
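
The ones-column logic above is duplicated for train_X and test_X in both versions of collect_train_and_test_data; a small helper could factor it out (a refactoring sketch, not part of the original):

import numpy as np

def add_constant(X):
    """Append a column of ones (the intercept term); mirrors the inline dstack/append logic above."""
    X = np.asarray(X)
    if X.ndim == 1:
        X = X.reshape(-1, 1)  # treat a 1-D input as one column of samples
    return np.hstack([X, np.ones((X.shape[0], 1))])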
Example #11
def run(location_id, door_count_placement_view_pair, start_time, end_time, features, n_components=16, pre_processing='', BALANCE_DATA=False):
  """ Fits data to one GMM and plots confusion matrix, prediction and error ellipses
  location_id: location_id of installation, eg '55' <int> or <str>
  door_count_placement_view_pair: placement and view id pair, e.g. ('3333230','0') (<str>, <str>)
  start_time: for prunning. time object with hour and minute time <time>
  end_time: for prunning. time object with hour and minute time <time>
  features: list with label (keys) of features used, [<str>,<str>...]
  n_components: number of mixture components 
  pre_processing: pre-processing to be applied; accepts 'standardize' and 'gaussianize' with default values <str>
  BALANCE_DATA: hack for trying to balance the data <bool>
  """

  global plotEE, plotPF, plotCM

  n_folds = 4

  print '\npre-processing step'
  standardizer = None
  gaussianizer = None
  if 'standardize' in pre_processing:
    standardizer = Standardizer(copy=True, with_mean=True, with_std=True)
  if 'gaussianize' in pre_processing:
    gaussianizer = robjects.r('Gaussianize')

  train_X, train_Y = collectData(location_id, door_count_placement_view_pair, start_time, end_time, features, adjusted=True, pre_processor=standardizer)

  if standardizer is not None:
    train_X = standardizer.transform(train_X, copy=None)
  if gaussianizer is not None:
    from rpy2.robjects.numpy2ri import numpy2ri
    robjects.conversion.py2ri = numpy2ri
    rtrain_X = gaussianizer(train_X)
    train_X = np.array(rtrain_X)
    gaussianizer.mean = train_X.mean(axis=1)
    gaussianizer.std = train_X.std(axis=1)


  #add ones column
  print 'adding constant to train_X'
  if len(train_X.shape) > 1:
    ones_array = np.ones((train_X.shape[0],1))
    train_X = np.append(train_X, ones_array, 1)
  else:
    train_X = np.dstack((train_X, np.ones(len(train_X))))
    if len(train_X.shape) == 3:
      train_X = train_X[0]

  if pre_processing == 'normalize':
    print 'normalizing data'
    normalizer = normalize(train_X)

  #bind the features/targets collected above to the names used below
  data, truth = train_X, train_Y

  #use sqrt of target to reduce hypothesis space
  truth[truth < 0] = 0
  truth = np.sqrt(truth).astype(int)
  truth_str = map(str, truth)

  #hack to better balance the data
  if BALANCE_DATA:
      bins = np.bincount(truth)
      avg = bins.mean()
      dev = bins.std()
      tol = 10
      tosmall = np.where(bins < avg - tol)[0]
      tobig = np.where(bins > avg + tol)[0]

      for item in tosmall:
          data = np.delete(data, np.where(truth == item)[0], axis=0)
          truth = np.delete(truth, np.where(truth == item)[0])

      for item in tobig:
          data = np.delete(data, np.where(truth == item)[0], axis=0)
          truth = np.delete(truth, np.where(truth == item)[0])

  #unbalanced targets: can't use StratifiedKFold and, more importantly, GMM assumes equal prior probability for all classes
  print 'Sample size is', len(truth)
  folds = KFold(len(truth), n_folds=n_folds) #shuffle=True, random_state=4
  #to only take the first fold
  #train_index, test_index = next(iter(folds))

  #for plotting
  idx = 1

  for train_index, test_index in folds:
      X_train = data[train_index]
      y_train = truth[train_index]
      X_test = data[test_index]
      y_test = truth[test_index]

      # Try GMMs using different types of covariances.
      classifiers = dict((covar_type, GMM(n_components=n_components,
                          covariance_type=covar_type, params='wmc', init_params='wc', n_iter=10000))
                           for covar_type in ['spherical', 'diag', 'tied', 'full'])

      n_classifiers = len(classifiers)

      if plotEE:
          plt.figure(idx, figsize=(3 * n_classifiers / 2, 6))
          plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05, left=.01, right=.99)

      for index, (name, classifier) in enumerate(classifiers.iteritems()):
          #start classifier with known means; init_params='wc' keeps them fixed during init
          classifier.means_ = np.array([X_train[y_train == i].mean(axis=0) for i in np.unique(y_train)])
          classifier.fit(X_train)

          y_train_pred = classifier.predict(X_train)
          y_train_pred = y_train_pred.astype(int)

          yresid = y_train - y_train_pred
          SSresid = np.sum(yresid**2)
          SStotal = (len(y_train) - 1) * np.var(y_train)
          train_accuracy = 1 - SSresid / SStotal #rsq instead of np.mean(y_train_pred.ravel() == y_train.ravel()) * 100

          y_test_pred = classifier.predict(X_test)
          y_test_pred = y_test_pred.astype(int)

          yresid = y_test - y_test_pred
          SSresid = np.sum(yresid**2)
          SStotal = (len(y_test) - 1) * np.var(y_test)
          test_accuracy = 1 - SSresid / SStotal #rsq instead of np.mean(y_test_pred.ravel() == y_test.ravel()) * 100

          """
          if features not in results:
              results[features] = {}
              results[features][name] = (train_accuracy, test_accuracy)
          """

          if plotEE:
              plt.figure(idx, figsize=(3 * n_classifiers / 2, 6))
              plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05, left=.01, right=.99)
              fig1 = plt.subplot(2, n_classifiers / 2, index + 1)

              make_ellipses(classifier, fig1)

              for n, color in enumerate('rgb'):
                  sample = data[truth == n]
                  plt.scatter(sample[:, 0], sample[:, 1], 0.8, color=color, label=truth_str[n])

              for n, color in enumerate('rgb'):
                  sample = X_test[y_test == n]
                  plt.plot(sample[:, 0], sample[:, 1], 'x', color=color)

              plt.text(0.05, 0.9, 'Train accuracy: %.2f' % train_accuracy, transform=fig1.transAxes)

              plt.text(0.05, 0.8, 'Test accuracy: %.2f' % test_accuracy, transform=fig1.transAxes)

              plt.xticks(())
              plt.yticks(())
              plt.title(name)

          if plotPF:
              #plot Ground Truth and Prediction
              x_train = np.array(range(len(y_train)))
              x_test = np.array(range(len(y_test)))

              fig2, ax2 = plt.subplots(2, 1, figsize=(3 * n_classifiers / 2, 6))

              ax2[0].plot(x_train, y_train, label='Train Ground Truth')
              ax2[0].plot(x_train, y_train_pred, label='Train Prediction')
              ax2[1].plot(x_test, y_test, label='Test Ground Truth')
              ax2[1].plot(x_test, y_test_pred, label='Test Prediction')

              """
              ax2[0].plot(x_train, y_train, label='Train Ground Truth', linestyle='none', marker='o')
              ax2[0].plot(x_train, y_train_pred, label='Train Prediction', linestyle='none', marker='o')
              ax2[1].plot(x_test, y_test, label='Test Ground Truth', linestyle='none', marker='o')
              ax2[1].plot(x_test, y_test_pred, label='Test Prediction', linestyle='none', marker='o')
              """
              ax2[0].legend(loc='upper right', prop=dict(size=12), numpoints=1)
              ax2[0].set_title(str(features))
              ax2[0].set_xlabel('time')
              ax2[0].set_ylabel('occupancy')
              ax2[0].grid()

              ax2[1].legend(loc='upper right', prop=dict(size=12), numpoints=1)
              ax2[1].set_title(str(features))
              ax2[1].set_xlabel('time')
              ax2[1].set_ylabel('occupancy')
              ax2[1].grid()

              print 'y_train \n', y_train
              print 'y_train_pred \n', y_train_pred
              print 'y_test \n', y_test
              print 'y_test_pred \n', y_test_pred

          if plotCM:
              # Plot confusion matrices in a separate window
              cm = confusion_matrix(y_train, y_train_pred)

              plt.matshow(cm)
              plt.title('Confusion matrix')
              plt.colorbar()
              plt.ylabel('True label')
              plt.xlabel('Predicted label')
              plt.show()

      #plt.figure(idx)
      #plt.legend(loc='lower right', prop=dict(size=12))

      idx += 1

  if plotEE or plotPF or plotCM:
      plt.show()
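
The truncated fragment at the top of Example #4 selects the GMM with the lowest BIC; for completeness, a self-contained sketch of that model-selection loop, written against the current scikit-learn API (sklearn.mixture.GaussianMixture; the GMM class used in these examples is long deprecated):

import numpy as np
from sklearn.mixture import GaussianMixture

def select_gmm_by_bic(data, max_components=9):
    """Fit GMMs over covariance types and component counts; keep the lowest BIC."""
    lowest_bic, best_gmm = np.inf, None
    for cv_type in ['spherical', 'tied', 'diag', 'full']:
        for n_components in range(1, max_components + 1):
            gmm = GaussianMixture(n_components=n_components,
                                  covariance_type=cv_type)
            gmm.fit(data)
            bic = gmm.bic(data)
            if bic < lowest_bic:
                lowest_bic, best_gmm = bic, gmm
    return best_gmm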