def implement_lr_sklearn(x_train,x_test,y_train,y_test):
  """
  Implements logistic regression from the scikit-learn package.

  The regularisation strength C is selected by a grid search over
  10**-2 .. 10**4 using GridSearchCV's default cross-validation.
  Features are normalized (LR_functions.normalize) before fitting.

  Parameters :
  - x_train, x_test : pandas DataFrames of features
  - y_train, y_test : pandas DataFrames with a 'NumType' column holding the
    numeric class labels (y_test is not used here; kept for signature
    consistency with the other implement_* functions)

  Returns an output dictionary with keys :
  - label_test : classification predicted by LR for the test set
  - label_train : classification predicted by LR for the training set
  - thetas : coefficients of the decision boundary
  """
  from LR_functions import normalize
  # Normalization parameters are derived from the training set and applied
  # to both sets.
  x_train, x_test = normalize(x_train,x_test)

  # NOTE(review): sklearn.grid_search is the pre-0.18 module path (later
  # moved to sklearn.model_selection); this code targets an old scikit-learn.
  from sklearn.grid_search import GridSearchCV
  from sklearn.linear_model import LogisticRegression

  print "doing grid search"
  C_range = 10.0 ** np.arange(-2, 5)
  param_grid = dict(C=C_range)
  grid = GridSearchCV(LogisticRegression(), param_grid=param_grid, n_jobs=-1)
  grid.fit(x_train.values, y_train.NumType.values.ravel())
  print "The best classifier is: ", grid.best_estimator_
  y_train_LR = grid.best_estimator_.predict(x_train)
  y_test_LR = grid.best_estimator_.predict(x_test)

  output = {}
  output['label_test'] = y_test_LR
  output['label_train'] = y_train_LR
  # NOTE(review): raw_coef_ is a private liblinear attribute (weights with
  # the intercept appended as the last column); it was removed from newer
  # scikit-learn releases -- confirm against the pinned sklearn version.
  # The caller (classifier(), 'lrsk' branch) relies on the intercept being
  # the last element of each row.
  output['thetas'] = grid.best_estimator_.raw_coef_
  return output
# 示例#2  (scrape artifact: example separator, commented out so the file parses)
# 0
def plot_2f_variability(dic,x_train,y_train,x_test,y_test):
    """
    Plots decision boundaries for a discrimination problem with
    2 features in function of the training set draws.
    Superimposed with scatter plots of both training and test sets.

    Parameters :
    - dic : dictionary keyed by draw number; each dic[draw]['out'] is an
      output dictionary holding 'thetas', 'threshold', 'rate_train',
      'rate_test', 'method' and 'types'
    - x_train, x_test : pandas DataFrames with (at least) 2 feature columns
    - y_train, y_test : pandas DataFrames with a 'NumType' column
    """

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes (main scatter plot plus marginal histograms
    # on the top and right edges)
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left+width+0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8,8))
    fig.set_facecolor('white')

    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No labels on the marginal histograms' shared axes
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot: test set in grey (faint), training set in colour,
    # both in normalized feature space.
    from LR_functions import normalize
    x_train, x_test = normalize(x_train,x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    axScatter.scatter(x_test[feat_1],x_test[feat_2],c=list(y_test.NumType.values),cmap=plt.cm.gray,alpha=.2)
    axScatter.scatter(x_train[feat_1],x_train[feat_2],c=list(y_train.NumType.values),cmap=plt.cm.winter,alpha=.5)

    # Plot decision boundaries, one per training-set draw.
    # For a 2-class logistic regression with coefficients theta[1] = [t0,t1,t2]
    # and decision threshold t[1], the boundary in normalized feature space is
    #   x2 = -1/t2 * (t0 + ln((1-t)/t) + t1*x1)
    x_vec = np.arange(-1,1,.01)
    rates = []
    for draw in sorted(dic):
      theta = dic[draw]['out']['thetas']
      t = dic[draw]['out']['threshold']
      rates.append(dic[draw]['out']['rate_test']['global'])
      db = -1./theta[1][2]*(theta[1][0]+np.log((1-t[1])/t[1])+theta[1][1]*x_vec)
      # NOTE(review): the colour component 0.1*draw assumes at most ~10 draws
      # numbered from 0; values > 1 are invalid RGB -- confirm with callers.
      axScatter.plot(x_vec,db,lw=1.,c=(0,0.1*draw,1))

    # Highlight the boundary of the best draw (highest global test rate).
    # NOTE(review): rates is indexed positionally over sorted(dic), so
    # dic[imax] is only the best draw if the keys are 0..N-1 -- TODO confirm.
    imax = np.argmax(rates)
    theta = dic[imax]['out']['thetas']
    t = dic[imax]['out']['threshold']
    method = dic[imax]['out']['method'].upper()
    types = dic[imax]['out']['types']
    lab = r'%s %.1f$\pm$%.1f%%'%(method,np.mean(rates),np.std(rates))
    db_max = -1./theta[1][2]*(theta[1][0]+np.log((1-t[1])/t[1])+theta[1][1]*x_vec)
    axScatter.plot(x_vec,db_max,lw=3.,c='midnightblue',label=lab)
    axScatter.legend(loc=4)

    # Shade the classification map of the best draw.
    x_vec, y_vec, proba, map = class_2c_2f(theta,t)
    axScatter.contourf(x_vec, y_vec, map, cmap=plt.cm.gray, alpha=0.2)
 
    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits(x_test,x_train)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_kde(axHistx,axHisty,bins_1,bins_2,x_test,y_test,x_train=x_train,y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates (mean +/- std over draws) as figure text.
    # Rate dictionaries hold a 'global' key plus (class_name, class_index)
    # tuple keys; index 1 goes to the first per-class list, others to the
    # second (2-class problem assumed here).
    pos_x = .76
    pos_y_ini = .95
    pas = .025
    plt.figtext(pos_x,pos_y_ini,'Training set %s'%method)
    rate_tr,rate_tr_1, rate_tr_2 = [],[],[]
    for draw in sorted(dic):
      p = dic[draw]['out']['rate_train']
      rate_tr.append(p['global'])
      for key in sorted(p):
        if key != 'global':
          cl, icl = key[0], key[1]
          if icl == 1:
            rate_tr_1.append(p[(cl,icl)])
          else:
            rate_tr_2.append(p[(cl,icl)])

    plt.figtext(pos_x,pos_y_ini-1*pas,'Global : %.1f$\pm$%.1f%%'%(np.mean(rate_tr),np.std(rate_tr)))
    plt.figtext(pos_x,pos_y_ini-2*pas,'%s (%d) : %.1f$\pm$%.1f%%'%(types[0],0,np.mean(rate_tr_1),np.std(rate_tr_1)))
    plt.figtext(pos_x,pos_y_ini-3*pas,'%s (%d) : %.1f$\pm$%.1f%%'%(types[1],1,np.mean(rate_tr_2),np.std(rate_tr_2)))

    # Same summary for the test set, below the training-set block.
    pos_y_ini = pos_y_ini-4.5*pas
    pas = .025
    plt.figtext(pos_x,pos_y_ini,'Test set %s'%method)
    rate_test,rate_test_1, rate_test_2 = [],[],[]
    for draw in sorted(dic):
      p = dic[draw]['out']['rate_test']
      rate_test.append(p['global'])
      for key in sorted(p):
        if key != 'global':
          cl, icl = key[0], key[1]
          if icl == 1:
            rate_test_1.append(p[(cl,icl)])
          else:
            rate_test_2.append(p[(cl,icl)])

    plt.figtext(pos_x,pos_y_ini-1*pas,'Global : %.1f$\pm$%.1f%%'%(np.mean(rate_test),np.std(rate_test)))
    plt.figtext(pos_x,pos_y_ini-2*pas,'%s (%d) : %.1f$\pm$%.1f%%'%(types[0],0,np.mean(rate_test_1),np.std(rate_test_1)))
    plt.figtext(pos_x,pos_y_ini-3*pas,'%s (%d) : %.1f$\pm$%.1f%%'%(types[1],1,np.mean(rate_test_2),np.std(rate_test_2)))
# 示例#3  (scrape artifact: example separator, commented out so the file parses)
# 0
def plot_2f_all(out,x_train,y_train,x_test,y_test,x_bad,out_comp=None,map_nl=None):
    """
    Plots decision boundaries for a discrimination problem with
    2 features.
    Superimposed with scatter plots of both training and test sets.
    If out_comp : comparison of the decision boundary from another method
    If map_nl : map of the non-linear decision boundary computed by SVM

    Parameters :
    - out : output dictionary holding 'thetas', 'threshold', 'rate_test',
      'rate_train', 'method' and 'types'
    - x_train, x_test : pandas DataFrames with 2 feature columns
    - y_train, y_test : pandas DataFrames with a 'NumType' column
    - x_bad : events highlighted in red on the scatter plot; the caller in
      this file passes the misclassified test events
    - out_comp : optional output dictionary of a second linear method
    - map_nl : optional output dictionary of a non-linear method, with keys
      'map', 'method' and 'rate_test'
    """

    theta = out['thetas']
    t = out['threshold']
    rate = out['rate_test']
    method = out['method']
    str_t = out['types']
    p_train = out['rate_train']

    if out_comp:
      th_comp = out_comp['thetas']
      t_comp = out_comp['threshold']
      p_comp = out_comp['rate_test']
      met_comp = out_comp['method']

    # theta is expected to have either one entry (2-class problem) or more
    # than two entries (one-vs-all multi-class).
    # NOTE(review): len(theta) == 2 falls through both branches, leaving
    # x_vec/y_vec/proba/map undefined and causing a NameError below --
    # confirm that callers never pass exactly two boundaries.
    # 'map' shadows the Python builtin of the same name.
    if len(theta) > 2:
      NB_class = len(theta)
      x_vec, y_vec, proba, map = class_multi_2f(theta)

    elif len(theta) == 1:
      NB_class = 2
      x_vec, y_vec, proba, map = class_2c_2f(theta,t)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes (main scatter plot plus marginal histograms
    # on the top and right edges)
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left+width+0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8,8))
    fig.set_facecolor('white')

    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No labels on the marginal histograms' shared axes
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot: test set in faint grey, training set in colour, x_bad
    # events in red, all in normalized feature space.
    from LR_functions import normalize
    x_train, x_test = normalize(x_train,x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    axScatter.scatter(x_test[feat_1],x_test[feat_2],c=list(y_test.NumType.values),cmap=plt.cm.gray,alpha=.2)
    axScatter.scatter(x_train[feat_1],x_train[feat_2],c=list(y_train.NumType.values),cmap=plt.cm.winter,alpha=.5)
    axScatter.scatter(x_bad[feat_1],x_bad[feat_2],c='r',alpha=.2)

    # Plot decision boundaries: for boundary i with coefficients
    # theta[i] = [t0,t1,t2] and threshold t[i],
    #   x2 = -1/t2 * (t0 + ln((1-t)/t) + t1*x1)
    # Main method in orange; comparison method (if any) in purple.
    for i in sorted(theta):
      db = -1./theta[i][2]*(theta[i][0]+np.log((1-t[i])/t[i])+theta[i][1]*x_vec[0])
      axScatter.plot(x_vec[0],db,lw=3.,c='orange')
      if out_comp:
        db = -1./th_comp[i][2]*(th_comp[i][0]+np.log((1-t_comp[i])/t_comp[i])+th_comp[i][1]*x_vec[0])
        axScatter.plot(x_vec[0],db,lw=3.,c='purple')

    # Shade the classification map (non-linear one if provided).
    if map_nl:
      axScatter.contourf(x_vec, y_vec, map_nl['map'], cmap=plt.cm.gray, alpha=0.3)
    else:
      axScatter.contourf(x_vec, y_vec, map, cmap=plt.cm.gray, alpha=0.2)
 
    # Legend: one entry per plotted method with its global test success rate.
    label = ['%s (%.2f%%)'%(method.upper(),rate['global'])]
    if out_comp:
      label.append('%s (%.2f%%)'%(met_comp.upper(),p_comp['global']))
    if map_nl:
      label.append('%s (%.2f%%)'%(map_nl['method'].upper(),map_nl['rate_test']['global']))
    axScatter.legend(label,loc=2,prop={'size':10})

    # Per-class training rates and global test rate/complement, written
    # inside the scatter axes.
    if p_train:
       x_pos = .7
       y_pos = .95
       pas = .05
       axScatter.text(x_pos,y_pos,"%s %% %s"%(p_train[(str_t[0],0)],str_t[0]),color='b',transform=axScatter.transAxes)
       axScatter.text(x_pos,y_pos-pas,"%s %% %s"%(p_train[(str_t[1],1)],str_t[1]),color='g',transform=axScatter.transAxes)
       axScatter.text(x_pos,y_pos-2*pas,"%.2f %% test set"%rate['global'],transform=axScatter.transAxes)
       axScatter.text(x_pos,y_pos-3*pas,"%.2f %% test set"%(100-rate['global']),color='r',transform=axScatter.transAxes)

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits(x_test,x_train)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_kde(axHistx,axHisty,bins_1,bins_2,x_test,y_test,x_train=x_train,y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt,out,out_comp=out_comp,map_nl=map_nl)
# 示例#4  (scrape artifact: example separator, commented out so the file parses)
# 0
def plot_2f_synth_var(out,x_train,x_test,y_test,y_train=None):
    """
    Plots decision boundaries for a discrimination problem with
    2 classes and 2 features (synthetic tests).

    If out contains several draws, shows the variability of the decision
    boundary with the training-set draw; otherwise shows the variability
    of the boundary with the decision threshold.

    Parameters :
    - out : dictionary keyed by draw number; each out[i] holds 'thetas',
      'threshold', 'rate_test' and 'method'
    - x_train, x_test : pandas DataFrames with 2 feature columns
    - y_test : pandas DataFrame with a 'NumType' column
    - y_train : optional pandas DataFrame with a 'NumType' column, forwarded
      to the histogram/PDF plotting.  (Added with a None default: the
      original body read a 'y_train' name that was never defined in this
      function, raising a NameError at the histogram step.)
    """
    theta_first = out[0]['thetas']
    rate_first = out[0]['rate_test']
    t_first = out[0]['threshold']
    method = out[0]['method']

    # theta_first is expected to have either one entry (2-class problem) or
    # more than two (one-vs-all multi-class).
    # NOTE(review): len(theta_first) == 2 falls through both branches and
    # leaves x_vec/y_vec/proba/map undefined -- same caveat as plot_2f_all.
    if len(theta_first) > 2:
      NB_class = len(theta_first)
      x_vec, y_vec, proba, map = class_multi_2f(theta_first)

    elif len(theta_first) == 1:
      NB_class = 2
      x_vec, y_vec, proba, map = class_2c_2f(theta_first,t_first)


    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes (main scatter plot plus marginal histograms)
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left+width+0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8,8))
    fig.set_facecolor('white')

    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No labels on the marginal histograms' shared axes
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot of the test set in normalized feature space.
    from LR_functions import normalize
    x_train, x_test = normalize(x_train,x_test)

    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    x = x_test[feat_1]
    y = x_test[feat_2]
    axScatter.scatter(x,y,c=list(y_test.NumType.values),cmap=plt.cm.gray)

    # Plot decision boundaries
    # VARIABILITY OF THE LR DECISION BOUNDARY with the training-set draw.
    # Boundary formula: x2 = -1/t2 * (t0 + ln((1-t)/t) + t1*x1).
    if len(out) > 1:
      rates = []
      for i in sorted(out):
        theta = out[i]['thetas']
        t = out[i]['threshold']
        db = -1./theta[1][2]*(theta[1][0]+np.log((1-t[1])/t[1])+theta[1][1]*x_vec[0])
        axScatter.plot(x_vec[0],db,lw=1.,c=(0,0.1*i,1))
        rates.append(out[i]['rate_test']['global'])

      # Highlight the draw with the best global test rate.
      imax = np.argmax(rates)
      theta = out[imax]['thetas']
      t = out[imax]['threshold']
      db = -1./theta[1][2]*(theta[1][0]+np.log((1-t[1])/t[1])+theta[1][1]*x_vec[0])
      axScatter.plot(x_vec[0],db,lw=3.,c='midnightblue')
      x_vec, y_vec, proba, map = class_2c_2f(theta,t)
      axScatter.contourf(x_vec,y_vec,map,cmap=plt.cm.gray,alpha=.2)

      # NOTE(review): lim_plot is not defined in this function; it is
      # assumed to be a module-level global -- confirm, otherwise NameError.
      axScatter.text(0.6*lim_plot,-0.9*lim_plot,r'%.1f$\pm$%.1f%%'%(np.mean(rates),np.std(rates)))

    # VARIABILITY WITH THE THRESHOLD: iso-probability contours in shades of
    # blue, plus the boundary at the chosen threshold.
    else:
      #for thres in np.arange(0,1,.1):
      #  db = -1./theta[0][1][2]*(theta[0][1][0]+np.log((1-thres)/thres)+theta[0][1][1]*x_vec[0])
      #  axScatter.plot(x_vec[0],db,lw=1.,c=(0,thres,1))
      from LR_functions import g
      blue_scale = []
      for i in range(10):
        blue_scale.append((0,i*0.1,1))
      CS = axScatter.contour(x_vec,y_vec,proba,10,colors=blue_scale)
      axScatter.clabel(CS, inline=1, fontsize=10)

      theta = out[0]['thetas']
      t = out[0]['threshold']
      rate = out[0]['rate_test']
      if NB_class == 2:
        db = -1./theta[1][2]*(theta[1][0]+np.log((1-t[1])/t[1])+theta[1][1]*x_vec[0])
        axScatter.plot(x_vec[0],db,lw=3.,c='midnightblue')
      axScatter.contourf(x_vec,y_vec,map,cmap=plt.cm.gray,alpha=.2)

      # NOTE(review): lim_plot assumed to be a module-level global (see above).
      axScatter.text(0.6*lim_plot,-0.9*lim_plot,'LR (%.1f%%)'%rate['global'])
      axScatter.text(0.6*lim_plot,-0.8*lim_plot,'t = %.1f'%t[1])

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits_synth(x_test)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_gauss(axHistx,axHisty,bins_1,bins_2,x_test,y_test,x_train=x_train,y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt,out)
# 示例#5  (scrape artifact: example separator, commented out so the file parses)
# 0
def plot_2f_nonlinear(out,x_train,x_test,y_test,y_train=None,synth=False):
    """
    Plots a non-linear decision boundary for 2 features.
    synth = True for synthetics.

    Parameters :
    - out : output dictionary holding 'map' (classification map over the
      normalized feature plane), 'rate_test' and 'method'
    - x_train, x_test : pandas DataFrames with 2 feature columns
    - y_test : pandas DataFrame with 'Type' and 'NumType' columns
    - y_train : optional pandas DataFrame with a 'NumType' column; when
      given, the training set is overplotted in colour
    - synth : selects synthetic plot limits and Gaussian (rather than KDE)
      marginal PDFs
    """
    NB_class = len(np.unique(y_test.Type.values))  # currently unused; kept for reference
    map = out['map']
    rate = out['rate_test']
    method = out['method']

    # Regular grid over the normalized feature plane on which 'map' was
    # computed.
    pas = .01
    x_vec = np.arange(-1,1,pas)
    y_vec = np.arange(-1,1,pas)
    x_vec, y_vec = np.meshgrid(x_vec,y_vec)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes (main scatter plot plus marginal histograms)
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left+width+0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8,8))
    fig.set_facecolor('white')

    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No labels on the marginal histograms' shared axes
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot in normalized feature space.
    from LR_functions import normalize
    x_train, x_test = normalize(x_train,x_test)

    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    x = x_test[feat_1]
    y = x_test[feat_2]
    axScatter.scatter(x,y,c=list(y_test.NumType.values),cmap=plt.cm.gray)
    # FIX: the original tested `if y_train:`, which raises ValueError for a
    # pandas DataFrame ("truth value of a DataFrame is ambiguous"); an
    # explicit None check is required.
    if y_train is not None:
      axScatter.scatter(x_train[feat_1],x_train[feat_2],c=list(y_train.NumType.values),cmap=plt.cm.YlOrRd)

    # Plot decision boundaries as a shaded classification map.
    axScatter.contourf(x_vec, y_vec, map, cmap=plt.cm.gray, alpha=0.3)
    label = ['%s (%.2f%%)'%(method.upper(),rate['global'])]
    axScatter.legend(label,loc=4,prop={'size':14})

    # Determine nice limits by hand
    if synth:
      bins_1, bins_2 = plot_limits_synth(x_test)
    else:
      bins_1, bins_2 = plot_limits(x_test,x_train)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    if synth:
      plot_histos_and_pdfs_gauss(axHistx,axHisty,bins_1,bins_2,x_test,y_test,x_train=x_train,y_train=y_train)
    else:
      plot_histos_and_pdfs_kde(axHistx,axHisty,bins_1,bins_2,x_test,y_test,x_train=x_train,y_train=y_train)

    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt,out)
# 示例#6  (scrape artifact: example separator, commented out so the file parses)
# 0
def plot_2f_synthetics(out,x_train,x_test,y_test,y_train=None,out_comp=None,map_nl=None):
    """
    For synthetic tests: plots decision boundaries for 2 features,
    superimposed with a scatter plot of the test set (and, optionally, the
    training set).

    Parameters :
    - out : output dictionary holding 'thetas', 'threshold', 'rate_test'
      and 'method'
    - x_train, x_test : pandas DataFrames with 2 feature columns
    - y_test : pandas DataFrame with a 'NumType' column
    - y_train : optional pandas DataFrame with a 'NumType' column; when
      given, the training set is overplotted in colour
    - out_comp : optional output dictionary of a second linear method
    - map_nl : optional output dictionary of a non-linear method, with keys
      'map', 'method' and 'rate_test'
    """
    theta = out['thetas']
    rate = out['rate_test']
    t = out['threshold']
    method = out['method']

    # theta is expected to have either one entry (2-class problem) or more
    # than two (one-vs-all multi-class).
    # NOTE(review): len(theta) == 2 falls through both branches and leaves
    # x_vec/y_vec/proba/map undefined -- same caveat as plot_2f_all.
    # 'map' shadows the Python builtin of the same name.
    if len(theta) > 2:
      NB_class = len(theta)
      x_vec, y_vec, proba, map = class_multi_2f(theta)

    elif len(theta) == 1:
      NB_class = 2
      x_vec, y_vec, proba, map = class_2c_2f(theta,t)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes (main scatter plot plus marginal histograms)
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left+width+0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8,8))
    fig.set_facecolor('white')

    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No labels on the marginal histograms' shared axes
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot in normalized feature space.
    from LR_functions import normalize
    x_train, x_test = normalize(x_train,x_test)

    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    x = x_test[feat_1]
    y = x_test[feat_2]
    axScatter.scatter(x,y,c=list(y_test.NumType.values),cmap=plt.cm.gray)
    # FIX: the original tested `if y_train:`, which raises ValueError for a
    # pandas DataFrame ("truth value of a DataFrame is ambiguous"); an
    # explicit None check is required.
    if y_train is not None:
      axScatter.scatter(x_train[feat_1],x_train[feat_2],c=list(y_train.NumType.values),cmap=plt.cm.YlOrRd)

    # Plot decision boundaries: for boundary i with coefficients
    # theta[i] = [t0,t1,t2] and threshold t[i],
    #   x2 = -1/t2 * (t0 + ln((1-t)/t) + t1*x1)
    if out_comp:
      colors = ['b','c']
    else:
      colors = ['pink']
    for i in sorted(theta):
      db = -1./theta[i][2]*(theta[i][0]+np.log((1-t[i])/t[i])+theta[i][1]*x_vec[0])
      axScatter.plot(x_vec[0],db,lw=2.,c=colors[0])
      if out_comp:
        th_comp = out_comp['thetas']
        t_comp = out_comp['threshold']
        db = -1./th_comp[i][2]*(th_comp[i][0]+np.log((1-t_comp[i])/t_comp[i])+th_comp[i][1]*x_vec[0])
        axScatter.plot(x_vec[0],db,lw=3.,c=colors[1])

    # Shade the classification map (non-linear one if provided).
    if map_nl:
      axScatter.contourf(x_vec, y_vec, map_nl['map'], cmap=plt.cm.gray, alpha=0.3)
    else:
      axScatter.contourf(x_vec, y_vec, map, cmap=plt.cm.gray, alpha=0.3)

    # Legend: one entry per plotted method with its global test success rate.
    label = ['%s (%.2f%%)'%(method.upper(),rate['global'])]
    if out_comp:
      label.append('%s (%.2f%%)'%(out_comp['method'].upper(),out_comp['rate_test']['global']))
    if map_nl:
      label.append('%s (%.2f%%)'%(map_nl['method'].upper(),map_nl['rate_test']['global']))
    if len(label) == 1:
      s = 14
    else:
      s = 11
    axScatter.legend(label,loc=4,prop={'size':s})

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits_synth(x_test)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_gauss(axHistx,axHisty,bins_1,bins_2,x_test,y_test,x_train=x_train,y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt,out,out_comp=out_comp,map_nl=map_nl)
def classifier(opt):
  """
  Classification of the different types of events.
  opt is an object of the class Options()

  Workflow, for each station/component scenario opt.xs[isc]:
    1. build (or reload from file) training / cross-validation / test sets,
    2. run the classifier selected by opt.opdict['method']
       ('1b1', 'ova', 'svm', 'svm_nl', 'lrsk' or 'lr'),
    3. compute confusion matrices and success rates on training and test sets,
    4. optionally plot PDFs, decision boundaries and draw-to-draw variability,
    5. accumulate everything in dic_results and write it to
       opt.opdict['result_path'].
  """

  list_attr = opt.__dict__.keys()
  if not 'x' in list_attr:
    opt.do_tri()

  X = opt.x
  Y = opt.y

  # A predefined training set may be attached to opt as train_x/train_y.
  list_attr = opt.__dict__.keys()
  if 'train_x' in list_attr:
    X_TRAIN = opt.train_x
    Y_TRAIN = opt.train_y

  dic_results = {}
  for isc in sorted(opt.xs):

    print "==========",opt.trad[isc],"=========="
    subdic = {}

    # marker_sta flags whether this scenario uses the same station as the
    # previous one (1) or a new one (0); used below to reuse dataset draws
    # across components of the same station.
    if isc > 0:
      if opt.trad[isc][0] == sta_prev:
        marker_sta = 1
      else:
        marker_sta = 0
        sta_prev = opt.trad[isc][0]
    else:
      marker_sta = 0
      sta_prev = opt.trad[isc][0]

    if len(opt.xs[isc]) == 0:
      continue


    # About the training set
    # Single station, several draws, no predefined training set: reload
    # (or initialise) the per-draw dataset index file.
    if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1 and 'train_x' not in list_attr:
      if os.path.exists(opt.opdict['train_file']):
        print opt.opdict['train_file']
        TRAIN_Y = read_binary_file(opt.opdict['train_file'])
      else:
        TRAIN_Y = {}
        for tir in range(opt.opdict['boot']):
          TRAIN_Y[tir] = {}
    elif 'train_x' in list_attr:
      # Predefined training set: renumber its classes and keep reference
      # copies (and optionally its PDFs) for the per-draw loop below.
      opt.x = opt.xs_train[isc]
      opt.y = opt.ys_train[isc]
      if opt.opdict['plot_pdf']:
        opt.compute_pdfs()
        g_train = opt.gaussians
        del opt.gaussians
      opt.classname2number()
      x_ref_train = opt.x
      y_ref_train = opt.y


    # About the test set
    opt.x = opt.xs[isc]
    opt.y = opt.ys[isc]
    if opt.opdict['plot_pdf']:
      opt.compute_pdfs()
 
    # Origin times of the events of this scenario.
    # NOTE(review): 'set' shadows the Python builtin of the same name.
    set = pd.DataFrame(index=opt.ys[isc].index,columns=['Otime'])
    set['Otime'] = opt.xs[isc].index

    opt.classname2number()
    x_test = opt.x
    y_ref = opt.y
    x_ref = opt.x

    if opt.opdict['plot_dataset']:
      opt.composition_dataset()

    #K = len(opt.types)

    ### ITERATE OVER TRAINING SET DRAWS ###
    for b in range(opt.opdict['boot']):
      print "\n-------------------- # iter: %d --------------------\n"%(b+1)

      subsubdic = {}
      print "WHOLE SET", x_ref.shape, y_ref.shape

      ### if there is no pre-defined training set ###
      if 'train_x' not in list_attr:
        x_train = x_test.copy()
        # Single-station case: reuse the stored draw indices if available,
        # otherwise generate new train/cv/test splits and record them.
        if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1:
          if len(TRAIN_Y[b]) > 0:
            y_train = y_ref.reindex(index=TRAIN_Y[b]['training_set'])
            y_train = y_train.dropna(how='any')
            y_cv = y_ref.reindex(index=TRAIN_Y[b]['cv_set'])
            y_cv = y_cv.dropna(how='any')
            y_test = y_ref.reindex(index=TRAIN_Y[b]['test_set'])
            y_test = y_test.dropna(how='any')
          else:
            y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref)
            TRAIN_Y[b]['training_set'] = map(int,list(y_train.index))
            TRAIN_Y[b]['cv_set'] = map(int,list(y_cv.index))
            TRAIN_Y[b]['test_set'] = map(int,list(y_test.index))

        ### multi-stations case ###
        # Draw new splits on the first component of a station, then reuse
        # the same event lists for the other components (marker_sta == 1).
        else:
          if marker_sta == 0:
            y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref)
            list_ev_train = y_train.index
            list_ev_cv = y_cv.index
            list_ev_test = y_test.index
          else:
            y_train = y_ref.reindex(index=list_ev_train)
            y_train = y_train.dropna(how='any')
            y_cv = y_ref.reindex(index=list_ev_cv)
            y_cv = y_cv.dropna(how='any')
            y_test = y_ref.reindex(index=list_ev_test)
            y_test = y_test.dropna(how='any')

        x_train = x_ref.reindex(index=y_train.index)

      ### if a training set was pre-defined ###
      else:
        x_train = x_ref_train.copy()
        y_train = y_ref_train.copy()
        y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref,y_train=y_train)

      x_cv = x_ref.reindex(index=y_cv.index)
      x_test = x_ref.reindex(index=y_test.index)

      # Renumber each set 0..N-1 (original event indices kept in i_*)
      # and sanity-check x/y dimensions.
      i_train = y_train.index
      x_train.index = range(x_train.shape[0])
      y_train.index = range(y_train.shape[0])
      print "TRAINING SET", x_train.shape, y_train.shape
      if x_train.shape[0] != y_train.shape[0]:
        print "Training set: Incoherence in x and y dimensions"
        sys.exit()

      i_cv = y_cv.index
      x_cv.index = range(x_cv.shape[0])
      y_cv.index = range(y_cv.shape[0])
      print "CROSS-VALIDATION SET", x_cv.shape, y_cv.shape
      if x_cv.shape[0] != y_cv.shape[0]:
        print "Cross-validation set: Incoherence in x and y dimensions"
        sys.exit()

      subsubdic['list_ev'] = np.array(y_test.index)

      i_test = y_test.index
      x_test.index = range(x_test.shape[0])
      y_test.index = range(y_test.shape[0])
      print "TEST SET", x_test.shape, y_test.shape
      if x_test.shape[0] != y_test.shape[0]:
        print "Test set: Incoherence in x and y dimensions"
        sys.exit()

      opt.train_x = x_train
      opt.x = x_test
      opt.train_y = y_train
      opt.y = y_test

      if opt.opdict['plot_pdf']:
        # NOTE(review): plot_all_pdfs is called unconditionally here and
        # again in the else branch below -- the second call looks redundant.
        opt.plot_all_pdfs(save=opt.opdict['save_pdf'])
        if 'train_x' in list_attr:
          opt.plot_superposed_pdfs(g_train,save=opt.opdict['save_pdf'])
        else:
          opt.plot_all_pdfs(save=opt.opdict['save_pdf'])

      if opt.opdict['method'] == '1b1':
        # EXTRACTEURS
        print "********** EXTRACTION 1-BY-1 **********"
        opt.opdict['boot'] = 1
        one_by_one(opt,x_ref,y_ref,set['Otime'],boot=10,method='svm')
        continue

      elif opt.opdict['method'] == 'ova':
        print "********** EXTRACTION 1-VS-ALL **********"
        opt.opdict['boot'] = 1
        one_vs_all(opt,x_ref,y_ref,set['Otime'],boot=10,method='svm')
        continue

      elif opt.opdict['method'] in ['svm','svm_nl']:
        # SVM
        print "********** SVM **********"
        if opt.opdict['method'] == 'svm':
          kern = 'Lin'
        else:
          kern = 'NonLin'

        out = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern=kern,proba=opt.opdict['probas'])

        if 'map' in sorted(out):
          opt.map = out['map']

        if 'thetas' in sorted(out):
          # Reorder each coefficient row from [w1..wn, b] to [b, w1..wn]
          # (intercept assumed to be the last element -- confirm against
          # implement_svm) and key boundaries from 1.
          theta_vec = out['thetas']
          theta,threshold = {},{}
          for it in range(len(theta_vec)):
            theta[it+1] = np.append(theta_vec[it][-1],theta_vec[it][:-1])
            threshold[it+1] = 0.5
          out['thetas'] = theta
          out['threshold'] = threshold

      elif opt.opdict['method'] == 'lrsk':
        # LOGISTIC REGRESSION (scikit learn)
        print "********* Logistic regression (sklearn) **********"
        out = implement_lr_sklearn(x_train,x_test,y_train,y_test)
        # Same reordering as the SVM branch: raw_coef_ rows end with the
        # intercept; move it to the front and fix the threshold at 0.5.
        threshold, theta = {},{}
        for it in range(len(out['thetas'])):
          threshold[it+1] = 0.5
          theta[it+1] = np.append(out['thetas'][it][-1],out['thetas'][it][:-1])
        out['threshold'] = threshold
        out['thetas'] = theta

      elif opt.opdict['method'] == 'lr':
        # LOGISTIC REGRESSION
        print "********* Logistic regression **********"
        from LR_functions import do_all_logistic_regression
        out = do_all_logistic_regression(x_train,x_test,x_cv,y_train,y_test,y_cv)
        theta = out['thetas']
        threshold = out['threshold']
        # Record the training-event indices once, for reproducibility.
        if 'learn_file' in sorted(opt.opdict):
          learn_filename = opt.opdict['learn_file']
          if not os.path.exists(learn_filename):
            wtr = write_binary_file(learn_filename,i_train)

      CLASS_test = out['label_test']
      CLASS_train = out['label_train']

      # TRAINING SET
      print "\t *TRAINING SET"
      y_train_np = y_train.NumType.values.ravel()  
      from sklearn.metrics import confusion_matrix
      cmat_train = confusion_matrix(y_train_np,CLASS_train)
      p_tr = dic_percent(cmat_train,opt.types,verbose=True)
      out['rate_train'] = p_tr
      print "   Global : %.2f%%"%p_tr['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_train,opt.types,'Training',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/training_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)

      # TEST SET
      print "\t *TEST SET"
      y_test_np = y_test.NumType.values.ravel()
      cmat_test = confusion_matrix(y_test_np,CLASS_test)
      p_test = dic_percent(cmat_test,opt.types,verbose=True)
      out['rate_test'] = p_test
      print "   Global : %.2f%%"%p_test['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_test,opt.types,'Test',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/test_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)
        if opt.opdict['plot_confusion']:
          plt.show()
        else:
          plt.close()

      # PLOT PRECISION AND RECALL
      if opt.opdict['plot_prec_rec']:
        from LR_functions import normalize,plot_precision_recall
        x_train, x_test = normalize(x_train,x_test)
        plot_precision_recall(x_train,y_train.NumType,x_test,y_test.NumType,theta)

      pourcentages = (p_tr['global'],p_test['global'])
      out['method'] = opt.opdict['method']
      out['types'] = opt.types
      opt.out = out

      # PLOT DECISION BOUNDARIES
      n_feat = x_train.shape[1] # number of features
      if n_feat < 4:
        if opt.opdict['plot_sep'] or opt.opdict['save_sep']:
          print "\nPLOTTING"
          print "Theta values:",theta
          print "Threshold:", threshold

          # COMPARE AND PLOT LR AND SVM RESULTS
          # NOTE(review): 'dir' shadows the Python builtin of the same name.
          out_svm, out_nl = {},{}
          dir = '%s_SEP'%opt.opdict['method'].upper()
          if opt.opdict['method']=='lr' and opt.opdict['compare']:
            # Rerun a linear SVM on the same sets as a comparison method and
            # bring its output to the same {thetas, threshold, rates} shape.
            dir = 'LR_SVM_SEP'
            out_svm = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='Lin')
            cmat_svm_tr = confusion_matrix(y_train_np,out_svm['label_train'])
            cmat_svm_test = confusion_matrix(y_test_np,out_svm['label_test'])
            svm_ptr = dic_percent(cmat_svm_tr,opt.types)
            svm_pt = dic_percent(cmat_svm_test,opt.types)
            theta_svm,t_svm = {},{}
            for it in range(len(out_svm['thetas'])):
              theta_svm[it+1] = np.append(out_svm['thetas'][it][-1],out_svm['thetas'][it][:-1])
              t_svm[it+1] = 0.5
            out_svm['thetas'] = theta_svm
            out_svm['threshold'] = t_svm
            out_svm['rate_test'] = svm_pt
            out_svm['rate_train'] = svm_ptr
            out_svm['method'] = 'SVM'

          if opt.opdict['method'] in ['lr','svm'] and opt.opdict['compare_nl']:
            # Rerun a non-linear SVM as a second comparison method.
            dir = '%s_NL_SEP'%opt.opdict['method'].upper()
            out_nl = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='NonLin')
            cmat_svm_tr = confusion_matrix(y_train_np,out_nl['label_train'])
            cmat_svm_test = confusion_matrix(y_test_np,out_nl['label_test'])
            svm_ptr = dic_percent(cmat_svm_tr,opt.types)
            svm_pt = dic_percent(cmat_svm_test,opt.types)
            out_nl['rate_test'] = svm_pt
            out_nl['rate_train'] = svm_ptr
            out_nl['method'] = 'SVM_NL'

          save_dir = os.path.join(opt.opdict['fig_path'],dir)
          opt.verify_and_create(save_dir)

          from LR_functions import normalize
          x_train, x_test = normalize(x_train,x_test)

          # Split each set into correctly and incorrectly classified events.
          x_train_good = x_train.reindex(index=y_train[y_train.NumType.values==CLASS_train].index)
          x_train_bad = x_train.reindex(index=y_train[y_train.NumType.values!=CLASS_train].index)
          good_train = y_train.reindex(index=x_train_good.index)

          x_test_good = x_test.reindex(index=y_test[y_test.NumType.values==CLASS_test].index)
          x_test_bad = x_test.reindex(index=y_test[y_test.NumType.values!=CLASS_test].index)

          # PLOT FOR 1 ATTRIBUTE AND 2 CLASSES
          if n_feat == 1 and len(opt.opdict['types']) == 2:
            name = opt.opdict['feat_list'][0]
            from plot_functions import plot_hyp_func_1f, histo_pdfs
            if opt.opdict['method']=='lr' and opt.opdict['compare']:
              plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,th_comp=theta_svm,cmat_test=cmat_test,cmat_svm=cmat_svm_test,cmat_train=cmat_train)
            else:
              #histo_pdfs(x_test,y_test,x_train=x_train,y_train=y_train)
              plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,cmat_test=cmat_test,cmat_train=cmat_train)

          # PLOT FOR 2 ATTRIBUTES AND 2 to 3 CLASSES
          elif n_feat == 2:
            name = '%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1])
            # NOTE(review): the second branch is unreachable -- 'lr' already
            # matches the first condition, so the 'compare' variant with
            # out_comp/map_nl is never plotted here.
            if opt.opdict['method'] in ['lr','svm']:
              from plot_2features import plot_2f_all
              plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad)
            elif opt.opdict['method']=='lr' and opt.opdict['compare']:
              from plot_2features import plot_2f_all
              plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad,out_comp=out_svm,map_nl=out_nl)
            elif opt.opdict['method'] == 'svm_nl':
              # NOTE(review): argument mismatch -- plot_2f_nonlinear expects
              # (out, x_train, x_test, y_test, y_train=..., synth=...); here
              # y_train is passed both positionally (3rd) and as a keyword,
              # which raises a TypeError.  Confirm against plot_2features.
              from plot_2features import plot_2f_nonlinear
              plot_2f_nonlinear(out,x_train,y_train,x_test,y_test,y_train=y_train)

          # PLOT FOR 3 ATTRIBUTES
          elif n_feat == 3:
            from plot_functions import plot_db_3d
            plot_db_3d(x_train,y_train.NumType,theta[1],title='Training set')
            plot_db_3d(x_test,y_test.NumType,theta[1],title='Test set')
            name = '%s_%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1],opt.opdict['feat_list'][2])

          if opt.opdict['save_sep']:
            savename = '%s/CL_sep_%s.png'%(save_dir,name)
            print "Figure saved in %s"%savename
            plt.savefig(savename)
          if opt.opdict['plot_sep']:
            plt.show()
          else:
            plt.close()

      # WRITE RESULTS INTO A DICTIONARY
      subsubdic['%'] = pourcentages
      # Translate numeric class labels back to class names.
      trad_CLASS_test = []
      for i in CLASS_test:
        i = int(i)
        trad_CLASS_test.append(opt.types[i])
      subsubdic['classification'] = trad_CLASS_test
      if opt.opdict['probas']:
        subsubdic['proba'] = out['probas']
      if opt.opdict['plot_var']:
        subsubdic['out'] = out
      subdic[b] = subsubdic

    # Variability of the decision boundary across draws (2 features,
    # 2 classes only).
    if opt.opdict['plot_var'] and opt.opdict['method'] in ['lr','svm','lrsk'] and n_feat==2 and len(opt.opdict['types'])==2:
      from plot_2features import plot_2f_variability
      plot_2f_variability(subdic,x_train,y_train,x_test,y_test)
      plt.savefig('%s/%s_variability_pas.png'%(opt.opdict['fig_path'],opt.opdict['method'].upper()))
      plt.show()


    dic_results[opt.trad[isc]] = subdic

  dic_results['header'] = {}
  dic_results['header']['features'] = opt.opdict['feat_list']
  dic_results['header']['types'] = opt.opdict['types']
  dic_results['header']['catalog'] = opt.opdict['label_test']

  if opt.opdict['method'] in ['lr','lrsk','svm','svm_nl']:
    print "Save results in file %s"%opt.opdict['result_path']
    write_binary_file(opt.opdict['result_path'],dic_results)

  # Persist the per-draw dataset indices so later runs reuse the same draws.
  if 'train_file' in sorted(opt.opdict):
    if not os.path.exists(opt.opdict['train_file']) and opt.opdict['boot'] > 1:
      write_binary_file(opt.opdict['train_file'],TRAIN_Y)
def implement_svm(x_train,x_test,y_train,y_test,types,opdict,kern='NonLin',proba=False):
  """
  Implements SVM from scikit learn package.
  Options : 
  - kernel : could be 'Lin' (for linear) or 'NonLin' (for non-linear). In the latter 
  case, the kernel is a gaussian kernel.
  - proba : tells if the probability estimates must be computed

  Returns an output dictionary with keys :  
  - label_test : classification predicted by SVM for the test set
  - label_train : classification predicted by SVM for the training set

  If proba is True, add the key 'probas' containing 
  the probability estimates for each element of the dataset

  If kernel is linear, add the key 'thetas' containing  
  the coefficients of the linear decision boundary

  If kernel is non linear, add the key "map" containing 
  the classification map.
  """
  from LR_functions import normalize
  x_train, x_test = normalize(x_train,x_test)

  # do grid search
  from sklearn.grid_search import GridSearchCV
  from sklearn import svm
  print "doing grid search"
  C_range = 10.0 ** np.arange(-2, 5)

  if kern == 'NonLin':
    gamma_range = 10.0 ** np.arange(-3,3)
    param_grid = dict(gamma=gamma_range, C=C_range)
    grid = GridSearchCV(svm.SVC(probability=proba), param_grid=param_grid, n_jobs=-1)

  elif kern == 'Lin':
    param_grid = dict(C=C_range)
    grid = GridSearchCV(svm.LinearSVC(), param_grid=param_grid, n_jobs=-1)

  grid.fit(x_train.values, y_train.NumType.values.ravel())
  print "The best classifier is: ", grid.best_estimator_

  if kern == 'NonLin':
    print "Number of support vectors for each class: ", grid.best_estimator_.n_support_
  y_train_SVM = grid.best_estimator_.predict(x_train)
  y_test_SVM = grid.best_estimator_.predict(x_test)

  output = {}
  output['label_test'] = y_test_SVM
  output['label_train'] = y_train_SVM
  if proba:
    probabilities = grid.best_estimator_.predict_proba(x_test)
    output['probas'] = {}
    NB_class = len(types)
    for k in range(NB_class):
      output['probas'][types[k]] = probabilities[:,k]
  if kern == 'Lin':
    output['thetas'] = grid.best_estimator_.raw_coef_
  elif len(x_train.columns) == 2:
    pas = .01
    x_vec, y_vec = np.arange(-1,1,pas), np.arange(-1,1,pas)
    x_vec, y_vec = np.meshgrid(x_vec,y_vec)
    vec = np.c_[x_vec.ravel(),y_vec.ravel()]
    print vec.shape
    map = grid.best_estimator_.predict(np.c_[x_vec.ravel(),y_vec.ravel()])
    output['map'] = map.reshape(x_vec.shape)
  return output
示例#9
0
def plot_2f_variability(dic, x_train, y_train, x_test, y_test):
    """
    Plots decision boundaries for a discrimination problem with 
    2 features in function of the training set draws.
    Superimposed with scatter plots of both training and test sets.

    Parameters
    ----------
    dic : dict mapping each training-set draw to a result dictionary
        (under key 'out' : 'thetas', 'threshold', 'rate_train',
        'rate_test', 'method', 'types').
    x_train, x_test : 2-column feature DataFrames.
    y_train, y_test : label DataFrames with a NumType column.
    """

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes : main scatter plot + two marginal histograms
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8, 8))
    fig.set_facecolor('white')

    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No tick labels on the marginal histograms
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot of the normalized test (gray) and training (winter) sets
    from LR_functions import normalize
    x_train, x_test = normalize(x_train, x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    axScatter.scatter(x_test[feat_1],
                      x_test[feat_2],
                      c=list(y_test.NumType.values),
                      cmap=plt.cm.gray,
                      alpha=.2)
    axScatter.scatter(x_train[feat_1],
                      x_train[feat_2],
                      c=list(y_train.NumType.values),
                      cmap=plt.cm.winter,
                      alpha=.5)

    # Plot one decision boundary per training-set draw
    x_vec = np.arange(-1, 1, .01)
    draws = sorted(dic)
    rates = []
    for draw in draws:
        theta = dic[draw]['out']['thetas']
        t = dic[draw]['out']['threshold']
        rates.append(dic[draw]['out']['rate_test']['global'])
        db = -1. / theta[1][2] * (theta[1][0] + np.log(
            (1 - t[1]) / t[1]) + theta[1][1] * x_vec)
        # NOTE(review): colour component assumes numeric draw keys < 10 -- confirm
        axScatter.plot(x_vec, db, lw=1., c=(0, 0.1 * draw, 1))

    # Highlight the boundary of the best draw.
    # BUGFIX: np.argmax returns a *position* within `rates`; translate it
    # back to the corresponding dictionary key instead of indexing `dic`
    # with the position directly (wrong whenever keys are not 0..n-1).
    best = draws[int(np.argmax(rates))]
    theta = dic[best]['out']['thetas']
    t = dic[best]['out']['threshold']
    method = dic[best]['out']['method'].upper()
    types = dic[best]['out']['types']
    lab = r'%s %.1f$\pm$%.1f%%' % (method, np.mean(rates), np.std(rates))
    db_max = -1. / theta[1][2] * (theta[1][0] + np.log(
        (1 - t[1]) / t[1]) + theta[1][1] * x_vec)
    axScatter.plot(x_vec, db_max, lw=3., c='midnightblue', label=lab)
    axScatter.legend(loc=4)

    # Gray-shade the classification map of the best draw
    # (renamed from `map` to avoid shadowing the builtin)
    x_vec, y_vec, proba, class_map = class_2c_2f(theta, t)
    axScatter.contourf(x_vec, y_vec, class_map, cmap=plt.cm.gray, alpha=0.2)

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits(x_test, x_train)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_kde(axHistx,
                             axHisty,
                             bins_1,
                             bins_2,
                             x_test,
                             y_test,
                             x_train=x_train,
                             y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display mean +/- std success rates for the training set
    # (raw strings: '$\pm$' contains no valid escape sequence)
    pos_x = .76
    pos_y_ini = .95
    pas = .025
    plt.figtext(pos_x, pos_y_ini, 'Training set %s' % method)
    rate_tr, rate_tr_1, rate_tr_2 = [], [], []
    for draw in draws:
        p = dic[draw]['out']['rate_train']
        rate_tr.append(p['global'])
        for key in sorted(p):
            if key != 'global':
                cl, icl = key[0], key[1]
                if icl == 1:
                    rate_tr_1.append(p[(cl, icl)])
                else:
                    rate_tr_2.append(p[(cl, icl)])

    plt.figtext(
        pos_x, pos_y_ini - 1 * pas,
        r'Global : %.1f$\pm$%.1f%%' % (np.mean(rate_tr), np.std(rate_tr)))
    plt.figtext(
        pos_x, pos_y_ini - 2 * pas, r'%s (%d) : %.1f$\pm$%.1f%%' %
        (types[0], 0, np.mean(rate_tr_1), np.std(rate_tr_1)))
    plt.figtext(
        pos_x, pos_y_ini - 3 * pas, r'%s (%d) : %.1f$\pm$%.1f%%' %
        (types[1], 1, np.mean(rate_tr_2), np.std(rate_tr_2)))

    # Same display for the test set
    pos_y_ini = pos_y_ini - 4.5 * pas
    pas = .025
    plt.figtext(pos_x, pos_y_ini, 'Test set %s' % method)
    rate_test, rate_test_1, rate_test_2 = [], [], []
    for draw in draws:
        p = dic[draw]['out']['rate_test']
        rate_test.append(p['global'])
        for key in sorted(p):
            if key != 'global':
                cl, icl = key[0], key[1]
                if icl == 1:
                    rate_test_1.append(p[(cl, icl)])
                else:
                    rate_test_2.append(p[(cl, icl)])

    plt.figtext(
        pos_x, pos_y_ini - 1 * pas,
        r'Global : %.1f$\pm$%.1f%%' % (np.mean(rate_test), np.std(rate_test)))
    plt.figtext(
        pos_x, pos_y_ini - 2 * pas, r'%s (%d) : %.1f$\pm$%.1f%%' %
        (types[0], 0, np.mean(rate_test_1), np.std(rate_test_1)))
    plt.figtext(
        pos_x, pos_y_ini - 3 * pas, r'%s (%d) : %.1f$\pm$%.1f%%' %
        (types[1], 1, np.mean(rate_test_2), np.std(rate_test_2)))
示例#10
0
def plot_2f_all(out,
                x_train,
                y_train,
                x_test,
                y_test,
                x_bad,
                out_comp=None,
                map_nl=None):
    """
    Plots decision boundaries for a discrimination problem with 
    2 features.
    Superimposed with scatter plots of both training and test sets.
    If out_comp : comparison of the decision boundary from another method
    If map_nl : map of the non-linear decision boundary computed by SVM

    Parameters
    ----------
    out : result dictionary ('thetas', 'threshold', 'rate_test',
        'rate_train', 'method', 'types').
    x_train, x_test : 2-column feature DataFrames.
    y_train, y_test : label DataFrames with a NumType column.
    x_bad : features of the misclassified events (plotted in red).
    out_comp : optional result dictionary from a second method.
    map_nl : optional dictionary with a non-linear classification map.
    """

    theta = out['thetas']
    t = out['threshold']
    rate = out['rate_test']
    method = out['method']
    str_t = out['types']
    p_train = out['rate_train']

    if out_comp:
        th_comp = out_comp['thetas']
        t_comp = out_comp['threshold']
        p_comp = out_comp['rate_test']
        met_comp = out_comp['method']

    # Classification map of the feature plane (renamed from `map` to avoid
    # shadowing the builtin).
    # NOTE(review): len(theta) == 2 falls through both branches and leaves
    # x_vec/y_vec/proba/class_map undefined -- confirm that case cannot occur.
    if len(theta) > 2:
        NB_class = len(theta)
        x_vec, y_vec, proba, class_map = class_multi_2f(theta)

    elif len(theta) == 1:
        NB_class = 2
        x_vec, y_vec, proba, class_map = class_2c_2f(theta, t)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes : main scatter plot + two marginal histograms
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8, 8))
    fig.set_facecolor('white')

    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No tick labels on the marginal histograms
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot : test set (gray), training set (winter), misclassified (red)
    from LR_functions import normalize
    x_train, x_test = normalize(x_train, x_test)
    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    axScatter.scatter(x_test[feat_1],
                      x_test[feat_2],
                      c=list(y_test.NumType.values),
                      cmap=plt.cm.gray,
                      alpha=.2)
    axScatter.scatter(x_train[feat_1],
                      x_train[feat_2],
                      c=list(y_train.NumType.values),
                      cmap=plt.cm.winter,
                      alpha=.5)
    axScatter.scatter(x_bad[feat_1], x_bad[feat_2], c='r', alpha=.2)

    # Plot decision boundaries (one per class threshold)
    for i in sorted(theta):
        db = -1. / theta[i][2] * (theta[i][0] + np.log(
            (1 - t[i]) / t[i]) + theta[i][1] * x_vec[0])
        axScatter.plot(x_vec[0], db, lw=3., c='orange')
        if out_comp:
            db = -1. / th_comp[i][2] * (th_comp[i][0] + np.log(
                (1 - t_comp[i]) / t_comp[i]) + th_comp[i][1] * x_vec[0])
            axScatter.plot(x_vec[0], db, lw=3., c='purple')

    # Gray-shade the non-linear map when provided, otherwise the linear one
    if map_nl:
        axScatter.contourf(x_vec,
                           y_vec,
                           map_nl['map'],
                           cmap=plt.cm.gray,
                           alpha=0.3)
    else:
        axScatter.contourf(x_vec, y_vec, class_map, cmap=plt.cm.gray, alpha=0.2)

    label = ['%s (%.2f%%)' % (method.upper(), rate['global'])]
    if out_comp:
        label.append('%s (%.2f%%)' % (met_comp.upper(), p_comp['global']))
    if map_nl:
        label.append('%s (%.2f%%)' %
                     (map_nl['method'].upper(), map_nl['rate_test']['global']))
    axScatter.legend(label, loc=2, prop={'size': 10})

    # Per-class training rates and global test rate in the plot corner
    if p_train:
        x_pos = .7
        y_pos = .95
        pas = .05
        axScatter.text(x_pos,
                       y_pos,
                       "%s %% %s" % (p_train[(str_t[0], 0)], str_t[0]),
                       color='b',
                       transform=axScatter.transAxes)
        axScatter.text(x_pos,
                       y_pos - pas,
                       "%s %% %s" % (p_train[(str_t[1], 1)], str_t[1]),
                       color='g',
                       transform=axScatter.transAxes)
        axScatter.text(x_pos,
                       y_pos - 2 * pas,
                       "%.2f %% test set" % rate['global'],
                       transform=axScatter.transAxes)
        axScatter.text(x_pos,
                       y_pos - 3 * pas,
                       "%.2f %% test set" % (100 - rate['global']),
                       color='r',
                       transform=axScatter.transAxes)

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits(x_test, x_train)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_kde(axHistx,
                             axHisty,
                             bins_1,
                             bins_2,
                             x_test,
                             y_test,
                             x_train=x_train,
                             y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt, out, out_comp=out_comp, map_nl=map_nl)
示例#11
0
def plot_2f_synth_var(out, x_train, x_test, y_test, y_train=None):
    """
    Plots decision boundaries for a discrimination problem with 
    2 classes and 2 features.

    Parameters
    ----------
    out : dict mapping each draw to a result dictionary ('thetas',
        'threshold', 'rate_test', 'method').
    x_train, x_test : 2-column feature DataFrames.
    y_test : label DataFrame with a NumType column.
    y_train : optional training labels forwarded to the histogram/PDF
        plots (new keyword, default None).  BUGFIX: the previous version
        referenced an undefined `y_train` name at the end of the
        function, raising a NameError.
    """
    theta_first = out[0]['thetas']
    rate_first = out[0]['rate_test']
    t_first = out[0]['threshold']
    method = out[0]['method']

    # Classification map of the first draw (renamed from `map` to avoid
    # shadowing the builtin).
    # NOTE(review): len(theta_first) == 2 leaves x_vec/y_vec/proba/class_map
    # undefined -- confirm that case cannot occur.
    if len(theta_first) > 2:
        NB_class = len(theta_first)
        x_vec, y_vec, proba, class_map = class_multi_2f(theta_first)

    elif len(theta_first) == 1:
        NB_class = 2
        x_vec, y_vec, proba, class_map = class_2c_2f(theta_first, t_first)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes : main scatter plot + two marginal histograms
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8, 8))
    fig.set_facecolor('white')

    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No tick labels on the marginal histograms
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot of the normalized test set
    from LR_functions import normalize
    x_train, x_test = normalize(x_train, x_test)

    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    x = x_test[feat_1]
    y = x_test[feat_2]
    axScatter.scatter(x, y, c=list(y_test.NumType.values), cmap=plt.cm.gray)

    # Plot decision boundaries
    # VARIABILITY OF THE LR DECISION BOUNDARY
    if len(out) > 1:
        draws = sorted(out)
        rates = []
        for i in draws:
            theta = out[i]['thetas']
            t = out[i]['threshold']
            db = -1. / theta[1][2] * (theta[1][0] + np.log(
                (1 - t[1]) / t[1]) + theta[1][1] * x_vec[0])
            axScatter.plot(x_vec[0], db, lw=1., c=(0, 0.1 * i, 1))
            rates.append(out[i]['rate_test']['global'])

        # BUGFIX: np.argmax returns a *position* within `rates`; translate
        # it back to the corresponding dictionary key before indexing `out`
        # (wrong whenever keys are not 0..n-1).
        best = draws[int(np.argmax(rates))]
        theta = out[best]['thetas']
        t = out[best]['threshold']
        db = -1. / theta[1][2] * (theta[1][0] + np.log(
            (1 - t[1]) / t[1]) + theta[1][1] * x_vec[0])
        axScatter.plot(x_vec[0], db, lw=3., c='midnightblue')
        x_vec, y_vec, proba, class_map = class_2c_2f(theta, t)
        axScatter.contourf(x_vec, y_vec, class_map, cmap=plt.cm.gray, alpha=.2)

        # NOTE(review): lim_plot is not defined in this function --
        # presumably a module-level global; confirm.
        axScatter.text(0.6 * lim_plot, -0.9 * lim_plot,
                       r'%.1f$\pm$%.1f%%' % (np.mean(rates), np.std(rates)))

    # VARIABILITY WITH THE THRESHOLD
    else:
        # Iso-probability contours in shades of blue
        blue_scale = []
        for i in range(10):
            blue_scale.append((0, i * 0.1, 1))
        CS = axScatter.contour(x_vec, y_vec, proba, 10, colors=blue_scale)
        axScatter.clabel(CS, inline=1, fontsize=10)

        theta = out[0]['thetas']
        t = out[0]['threshold']
        rate = out[0]['rate_test']
        if NB_class == 2:
            db = -1. / theta[1][2] * (theta[1][0] + np.log(
                (1 - t[1]) / t[1]) + theta[1][1] * x_vec[0])
            axScatter.plot(x_vec[0], db, lw=3., c='midnightblue')
        axScatter.contourf(x_vec, y_vec, class_map, cmap=plt.cm.gray, alpha=.2)

        # NOTE(review): lim_plot is not defined in this function -- confirm.
        axScatter.text(0.6 * lim_plot, -0.9 * lim_plot,
                       'LR (%.1f%%)' % rate['global'])
        axScatter.text(0.6 * lim_plot, -0.8 * lim_plot, 't = %.1f' % t[1])

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits_synth(x_test)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_gauss(axHistx,
                               axHisty,
                               bins_1,
                               bins_2,
                               x_test,
                               y_test,
                               x_train=x_train,
                               y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt, out)
示例#12
0
def plot_2f_nonlinear(out, x_train, x_test, y_test, y_train=None, synth=False):
    """
    Non linear decision boundary.
    synth = True for synthetics.

    Parameters
    ----------
    out : result dictionary with keys 'map' (classification map sampled on
        the normalized feature grid), 'rate_test' and 'method'.
    x_train, x_test : 2-column feature DataFrames.
    y_test : label DataFrame with Type and NumType columns.
    y_train : optional training labels; when given, the training set is
        overplotted on the scatter plot.
    """
    NB_class = len(np.unique(y_test.Type.values))
    # renamed from `map` to avoid shadowing the builtin
    class_map = out['map']
    rate = out['rate_test']
    method = out['method']

    # Regular grid matching the classification map resolution
    pas = .01
    x_vec = np.arange(-1, 1, pas)
    y_vec = np.arange(-1, 1, pas)
    x_vec, y_vec = np.meshgrid(x_vec, y_vec)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes : main scatter plot + two marginal histograms
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8, 8))
    fig.set_facecolor('white')

    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No tick labels on the marginal histograms
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot of the normalized test set
    from LR_functions import normalize
    x_train, x_test = normalize(x_train, x_test)

    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    x = x_test[feat_1]
    y = x_test[feat_2]
    axScatter.scatter(x, y, c=list(y_test.NumType.values), cmap=plt.cm.gray)
    # BUGFIX: `if y_train:` raises ValueError for a non-empty DataFrame
    # (ambiguous truth value); test for None explicitly.
    if y_train is not None:
        axScatter.scatter(x_train[feat_1],
                          x_train[feat_2],
                          c=list(y_train.NumType.values),
                          cmap=plt.cm.YlOrRd)

    # Plot decision boundaries
    axScatter.contourf(x_vec, y_vec, class_map, cmap=plt.cm.gray, alpha=0.3)
    label = ['%s (%.2f%%)' % (method.upper(), rate['global'])]
    axScatter.legend(label, loc=4, prop={'size': 14})

    # Determine nice limits by hand
    if synth:
        bins_1, bins_2 = plot_limits_synth(x_test)
    else:
        bins_1, bins_2 = plot_limits(x_test, x_train)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    if synth:
        plot_histos_and_pdfs_gauss(axHistx,
                                   axHisty,
                                   bins_1,
                                   bins_2,
                                   x_test,
                                   y_test,
                                   x_train=x_train,
                                   y_train=y_train)
    else:
        plot_histos_and_pdfs_kde(axHistx,
                                 axHisty,
                                 bins_1,
                                 bins_2,
                                 x_test,
                                 y_test,
                                 x_train=x_train,
                                 y_train=y_train)

    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt, out)
示例#13
0
def plot_2f_synthetics(out,
                       x_train,
                       x_test,
                       y_test,
                       y_train=None,
                       out_comp=None,
                       map_nl=None):
    """
    For synthetic tests.

    Parameters
    ----------
    out : result dictionary ('thetas', 'threshold', 'rate_test', 'method').
    x_train, x_test : 2-column feature DataFrames.
    y_test : label DataFrame with a NumType column.
    y_train : optional training labels; when given, the training set is
        overplotted on the scatter plot.
    out_comp : optional result dictionary from a second method.
    map_nl : optional dictionary with a non-linear classification map.
    """
    theta = out['thetas']
    rate = out['rate_test']
    t = out['threshold']
    method = out['method']

    # Classification map of the feature plane (renamed from `map` to avoid
    # shadowing the builtin).
    # NOTE(review): len(theta) == 2 falls through both branches and leaves
    # x_vec/y_vec/proba/class_map undefined -- confirm that case cannot occur.
    if len(theta) > 2:
        NB_class = len(theta)
        x_vec, y_vec, proba, class_map = class_multi_2f(theta)

    elif len(theta) == 1:
        NB_class = 2
        x_vec, y_vec, proba, class_map = class_2c_2f(theta, t)

    #### PLOT ####
    nullfmt = NullFormatter()

    # definitions for the axes : main scatter plot + two marginal histograms
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(1, figsize=(8, 8))
    fig.set_facecolor('white')

    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # No tick labels on the marginal histograms
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # Scatter plot of the normalized test set
    from LR_functions import normalize
    x_train, x_test = normalize(x_train, x_test)

    feat_1 = x_test.columns[0]
    feat_2 = x_test.columns[1]
    x = x_test[feat_1]
    y = x_test[feat_2]
    axScatter.scatter(x, y, c=list(y_test.NumType.values), cmap=plt.cm.gray)
    # BUGFIX: `if y_train:` raises ValueError for a non-empty DataFrame
    # (ambiguous truth value); test for None explicitly.
    if y_train is not None:
        axScatter.scatter(x_train[feat_1],
                          x_train[feat_2],
                          c=list(y_train.NumType.values),
                          cmap=plt.cm.YlOrRd)

    # Plot decision boundaries (one per class threshold)
    if out_comp:
        colors = ['b', 'c']
    else:
        colors = ['pink']
    for i in sorted(theta):
        db = -1. / theta[i][2] * (theta[i][0] + np.log(
            (1 - t[i]) / t[i]) + theta[i][1] * x_vec[0])
        axScatter.plot(x_vec[0], db, lw=2., c=colors[0])
        if out_comp:
            th_comp = out_comp['thetas']
            t_comp = out_comp['threshold']
            db = -1. / th_comp[i][2] * (th_comp[i][0] + np.log(
                (1 - t_comp[i]) / t_comp[i]) + th_comp[i][1] * x_vec[0])
            axScatter.plot(x_vec[0], db, lw=3., c=colors[1])

    # Gray-shade the non-linear map when provided, otherwise the linear one
    if map_nl:
        axScatter.contourf(x_vec,
                           y_vec,
                           map_nl['map'],
                           cmap=plt.cm.gray,
                           alpha=0.3)
    else:
        axScatter.contourf(x_vec, y_vec, class_map, cmap=plt.cm.gray, alpha=0.3)

    label = ['%s (%.2f%%)' % (method.upper(), rate['global'])]
    if out_comp:
        label.append(
            '%s (%.2f%%)' %
            (out_comp['method'].upper(), out_comp['rate_test']['global']))
    if map_nl:
        label.append('%s (%.2f%%)' %
                     (map_nl['method'].upper(), map_nl['rate_test']['global']))
    if len(label) == 1:
        s = 14
    else:
        s = 11
    axScatter.legend(label, loc=4, prop={'size': s})

    # Determine nice limits by hand
    bins_1, bins_2 = plot_limits_synth(x_test)
    axScatter.set_xlim((bins_1[0], bins_1[-1]))
    axScatter.set_ylim((bins_2[0], bins_2[-1]))
    axScatter.set_xlabel(feat_1)
    axScatter.set_ylabel(feat_2)

    # Plot histograms and PDFs
    plot_histos_and_pdfs_gauss(axHistx,
                               axHisty,
                               bins_1,
                               bins_2,
                               x_test,
                               y_test,
                               x_train=x_train,
                               y_train=y_train)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # Display success rates
    display_rates(plt, out, out_comp=out_comp, map_nl=map_nl)