Exemplo n.º 1
0
def build_model():
    """Train a random forest, persist it, and plot its feature importances.

    The frame returned by ``get_sampling_training()`` provides the
    ``success`` column as the target. ``success`` and ``name`` are removed
    from the frame and every remaining column is used as a feature. The
    fitted model is pickled to ``data/rf.model`` and a horizontal bar chart
    of the relative importances is saved to ``plots/feature_imp.jpg``.
    """
    df = get_sampling_training()

    targets = np.array(df['success'])
    del df['success']
    del df['name']

    columns = df.columns

    data = np.array(df)
    model = randomforest(data, targets, tree_num=200)
    # Pickle output is binary, so the file is opened in "wb"; the ``with``
    # block guarantees the handle is flushed and closed.
    with open("data/rf.model", "wb") as model_file:
        pickle.dump(model, model_file)

    # feature importance
    feature_importance = model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    # argsort is ascending, so the most important feature gets the highest bar
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, columns[sorted_idx])
    pl.xlabel('Relative Importance')
    pl.title('Variable Importance')
    pl.savefig('plots/feature_imp.jpg')
Exemplo n.º 2
0
def get_related_features(input_df, target_feature, related_feature_size):
    """Return the columns of ``input_df`` most important for predicting a target.

    A ``RandomForestClassifier`` is fitted on a 75% training split of the
    frame and its ``feature_importances_`` are used to rank every column
    other than ``target_feature``. A bar chart of all importances is drawn
    on the current figure (it is neither shown nor saved here).

    Args:
        input_df: frame holding both the candidate features and the target.
        target_feature: name of the column to predict.
        related_feature_size: how many top-ranked features to return.
            Values larger than the number of features return all of them;
            zero or negative values return an empty array.

    Returns:
        Array of feature names, ordered from least to most important
        among the selected ones.
    """
    features = np.array(input_df.columns)
    # remove target_feature from all features
    index = np.argwhere(features == target_feature)
    features = np.delete(features, index)

    # Only the training part of the split is used to fit the forest.
    train_x, _test_x, train_y, _test_y = train_test_split(
        input_df[features], input_df[target_feature], test_size=0.25)

    clf = RandomForestClassifier()
    clf.fit(train_x, train_y)

    # Rank the features by importance (ascending) so the bar chart shows
    # the most important feature at the top.
    importances = clf.feature_importances_
    sorted_idx = np.argsort(importances)

    # Compute the slice start explicitly: ``sorted_idx[-0:]`` would select
    # every feature instead of none when the requested size is zero.
    first_kept = max(len(sorted_idx) - related_feature_size, 0)
    related_features = features[sorted_idx[first_kept:]]

    padding = np.arange(len(features)) + 0.5
    pl.barh(padding, importances[sorted_idx], align='center')
    pl.yticks(padding, features[sorted_idx])
    pl.xlabel("Relative Importance")
    pl.title("Variable Importance")
    return related_features
Exemplo n.º 3
0
    def page_freq_graph(self, transactions):
        """Save a horizontal bar chart of item frequencies across transactions.

        Every item of every transaction is counted, the counts are sorted in
        ascending order, and one bar per item is written to
        ``config.OUTPUT + "page_distribution.pdf"``. The bars carry no item
        labels.

        Args:
            transactions: iterable of iterables of hashable items.
        """
        import config
        import numpy
        import pylab
        from operator import itemgetter

        # Tally how often each item appears.
        tally = {}
        for transaction in transactions:
            for page in transaction:
                tally[page] = tally.get(page, 0) + 1

        # Ascending by count; bars are placed at y = 1, 3, 5, ...
        ranked = sorted(tally.items(), key=itemgetter(1))
        counts = [hits for _, hits in ranked]
        pos = [1.0 + 2.0 * rank for rank in range(len(ranked))]

        # Start from a clean figure 1.
        pylab.cla()
        pylab.clf()
        pylab.figure(1)
        pylab.barh(numpy.array(pos), numpy.array(counts), align='center')

        pylab.xlabel("Page count")
        pylab.savefig(config.OUTPUT + "page_distribution.pdf")
Exemplo n.º 4
0
    def _plot_histogram(self,
                        gs,
                        y,
                        scale,
                        y_mean=None,
                        show_len=None,
                        label=None,
                        sharex=None):
        """Plot the recent price window with a per-price-level profile as bars.

        Args:
            gs: subplot specification forwarded to ``plt.subplot``.
            y: array indexed by price level; the slice between the lowest
                and highest price of the window is drawn
                (assumes prices are integers usable as indices - TODO confirm).
            scale: multiplier applied to ``y``; rescaled by
                ``show_len / self.indicators[0].m`` when ``show_len`` is given.
            y_mean: when given, x ticks are placed every
                ``int(y_mean * 2 * scale)`` units.
            show_len: number of most recent samples to show; defaults to
                ``self.indicators[0].m``.
            label: legend label for the bars.
            sharex: axes to share the x axis with.

        Returns:
            The axes the plot was drawn on.
        """
        full_len = self.indicators[0].m
        if show_len is None:
            show_len = full_len
        else:
            scale = scale * show_len * 1. / full_len

        ax = plt.subplot(gs, sharex=sharex)
        window = self.history['last_price'][self.now - show_len:self.now]
        plt.plot(window)
        lowest, highest = window.min(), window.max()

        # Scale the profile for the visible price range and cap overly long
        # bars at 120% of the window length.
        bar_widths = y[lowest:highest + 1] * scale
        cap = show_len * 1.2
        bar_widths[bar_widths > cap] = cap

        levels = np.arange(lowest, highest + 1)
        plt.barh(levels, bar_widths, 1.0,
                 label=label, alpha=0.2, color='r', edgecolor='none')

        if y_mean is not None:
            tick_step = int(y_mean * 2. * scale)
            ax.set_xticks(np.arange(0, show_len, tick_step))
        plt.grid()
        plt.legend(loc='upper right')
        return ax
def plot_variable_importance(feature_importance, names_cols, save_name, save):
    """Show or save a bar chart of the 20 most important variables.

    Args:
        feature_importance: array of importances, one per variable.
        names_cols: array of variable names, indexable by an index array.
        save_name: output path, used only when ``save`` is true.
        save: when true the figure is written to ``save_name`` and all
            figures are closed; otherwise the figure is shown.
    """
    # scale by max importance, keep the 20 largest in descending order
    feature_importance = feature_importance / feature_importance.max()
    sorted_idx = np.argsort(feature_importance)[::-1][:20]
    # positions are reversed so the most important variable is drawn on top
    barPos = np.arange(sorted_idx.shape[0]) + .8
    barPos = barPos[::-1]

    plot.figure(num=None, facecolor='w')
    plot.barh(barPos, feature_importance[sorted_idx]*100, align='center')
    plot.yticks(barPos, names_cols[sorted_idx])
    plot.xticks(np.arange(0, 120, 20),
                ['0 %', '20 %', '40 %', '60 %', '80 %', '100 %'])
    plot.margins(0.02)
    plot.subplots_adjust(bottom=0.15)

    plot.title('Variable Importance')

    # Indented with spaces only: the original mixed tabs and spaces here,
    # which Python 3 rejects with a TabError.
    if save:
        plot.savefig(save_name, bbox_inches='tight', dpi=300)
        plot.close("all")
    else:
        plot.show()
Exemplo n.º 6
0
def build_model():
    """Fit the random-forest model, save it to disk, and chart feature importances.

    Reads the training frame from ``get_sampling_training()``, uses its
    ``success`` column as the target and all columns except ``success`` and
    ``name`` as features. Side effects: writes the pickled model to
    ``data/rf.model`` and the importance chart to ``plots/feature_imp.jpg``.
    """
    df = get_sampling_training()

    targets = np.array(df['success'])
    del df['success']
    del df['name']

    columns = df.columns

    data = np.array(df)
    model = randomforest(data, targets, tree_num=200)
    # Binary mode is required for pickle data, and the context manager
    # closes the file instead of leaking the handle.
    with open("data/rf.model", "wb") as model_file:
        pickle.dump(model, model_file)

    # feature importance
    feature_importance = model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance /
                                  feature_importance.max())
    # ascending order: the largest importance ends up as the top bar
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, columns[sorted_idx])
    pl.xlabel('Relative Importance')
    pl.title('Variable Importance')
    pl.savefig('plots/feature_imp.jpg')
Exemplo n.º 7
0
def test_feature(train_path):
    """Fit a gradient-boosting classifier on a CSV file and plot feature importances.

    The first CSV column is the label and the remaining columns are the
    features. The eight most important features are shown in a bar chart.

    Args:
        train_path: path of a comma-separated training file.
    """
    data = np.genfromtxt(train_path, delimiter=',')
    y = data[:, 0]
    X = data[:, 1:]

    # ``learning_rate`` is the scikit-learn name of the shrinkage parameter
    # (the former ``learn_rate`` spelling is no longer accepted).
    params = {
        'n_estimators': 100,
        'max_depth': 2,
        'random_state': 1,
        'min_samples_split': 5,
        'learning_rate': 0.02,
        'subsample': 1.0,
    }
    clf = ensemble.GradientBoostingClassifier(**params)
    clf.fit(X, y)

    pl.figure()
    # NOTE(review): assumes the file has exactly these 13 feature columns
    # in this order - confirm against the data.
    feature_names = np.array(['type', 'type', 'type', 'main', 'log_main', 'evi', 'log_evi', 'df1', 'log_df1', 'dfu8', 'log_dfu8', 'dfband', 'log_dfband'])

    feature_importance = clf.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    # keep the eight largest, in ascending order
    sorted_idx = np.argsort(feature_importance)[-8:]
    pos = np.arange(sorted_idx.shape[0]) + .5
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, feature_names[sorted_idx])
    pl.xlabel('Relative Importance')
    pl.title('Variable Importance')
    pl.show()
Exemplo n.º 8
0
def barh(pl, x, h, title=''):
    """Draw a horizontal bar chart in a new figure, show it, then close it.

    Args:
        pl: pylab/pyplot-like object providing ``figure``, ``title``,
            ``barh``, ``show`` and ``close``.
        x: bar positions, passed as the first argument of ``pl.barh``.
        h: bar lengths, passed as the second argument of ``pl.barh``.
        title: optional chart title; nothing is set when it is empty.
    """
    # ``pl.figure`` was previously only referenced, never called, so no
    # new figure was created and the bars landed on whatever was current.
    pl.figure()
    if title != '':
        pl.title(title)
    pl.barh(x, h, height=0.1)
    pl.show()
    pl.close()
Exemplo n.º 9
0
def test_feature(train_path):
    """Train a gradient-boosting classifier from a CSV and chart its top features.

    Column 0 of the file is used as the label, all other columns as
    features. A bar chart of the eight largest relative importances is
    displayed.

    Args:
        train_path: path of a comma-separated training file.
    """
    data = np.genfromtxt(train_path, delimiter=',')
    y = data[:, 0]
    X = data[:, 1:]

    # The shrinkage parameter is spelled ``learning_rate`` in scikit-learn;
    # the older ``learn_rate`` keyword is rejected by current releases.
    params = {
        'n_estimators': 100,
        'max_depth': 2,
        'random_state': 1,
        'min_samples_split': 5,
        'learning_rate': 0.02,
        'subsample': 1.0,
    }
    clf = ensemble.GradientBoostingClassifier(**params)
    clf.fit(X, y)

    pl.figure()
    # NOTE(review): assumes 13 feature columns in exactly this order.
    feature_names = np.array([
        'type', 'type', 'type', 'main', 'log_main', 'evi', 'log_evi', 'df1',
        'log_df1', 'dfu8', 'log_dfu8', 'dfband', 'log_dfband'
    ])

    feature_importance = clf.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance /
                                  feature_importance.max())
    # indices of the eight largest importances, ascending
    sorted_idx = np.argsort(feature_importance)[-8:]
    pos = np.arange(sorted_idx.shape[0]) + .5
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, feature_names[sorted_idx])
    pl.xlabel('Relative Importance')
    pl.title('Variable Importance')
    pl.show()
Exemplo n.º 10
0
 def plot(self, gs):
     """Plot the recent price window with the smoothed pivot profile as bars.

     Args:
         gs: subplot specification forwarded to ``plt.subplot``.

     Returns:
         The axes drawn on, or ``None`` when fewer than ``self.show_len``
         samples precede ``self.s.now``.
     """
     unit_len = self.show_len * 1. / 5.
     if self.s.now - self.show_len < 0:
         return

     price = self.price[0][self.s.now - self.show_len : self.s.now]
     profile_range = [price.min(), price.max() + 1]
     # one extra price level of margin on each side of the profile range
     floor, ceil = profile_range[0] - 1, profile_range[1] + 1

     d = self.output(3, profile_range)

     ax = plt.subplot(gs)
     plt.plot(price)
     # dotted vertical line wherever 'time_in_ticks' is 0 inside the window
     # (presumably the start of a trading day - TODO confirm)
     day_begin = np.where(self.s.history['time_in_ticks'][self.s.now - self.show_len : self.s.now] == 0)[0]
     for x in day_begin:
         plt.axvline(x, color='r', linestyle=':')
     y = self.smoothed_pivot_profile[floor : ceil]
     plt.barh(np.arange(floor, ceil) - 0.5, y * unit_len, 1.0, label=self.name,
              alpha=0.2, color='r', edgecolor='none')

     # highlight the two levels derived from the 'S_offset' / 'R_offset'
     # outputs, scaled by the volatility and offset from the last price
     last_price = int(get(self.price))
     support = last_price + int(round((d['S_offset']) * self.volatility))
     resistance = last_price + int(round((d['R_offset']) * self.volatility))
     highlighted = [support, resistance]
     plt.barh(np.array(highlighted) - 0.5, self.smoothed_pivot_profile[highlighted] * unit_len, 1.0,
              alpha=1.0, color='r', edgecolor='none')
     ax.set_xticks(np.arange(0, self.show_len * 1.22, unit_len))
     # The on/off flag is passed positionally: its keyword name changed
     # from ``b`` to ``visible`` across matplotlib releases.
     ax.xaxis.grid(True, linestyle='--')
     ax.yaxis.grid(False)
     plt.legend(loc='upper right')
     return ax
Exemplo n.º 11
0
def do_scaplots(distance_dict, after_dict, before_dict, bins, xtext, option=0):
    """Draw a 2x2 grid of histograms of the after-minus-before differences.

    For each of the four quantities (index 0..3 of the per-key entries) the
    difference ``after_dict[key][i] - before_dict[key][i]`` is collected
    over all keys of ``before_dict``, histogrammed into 100 bins over a
    fixed range, normalised to fractions, and drawn as horizontal bars.
    Summary statistics from the ``bin_stats`` helper are overlaid.

    Args:
        distance_dict: not used by this function.
        after_dict: mapping key -> sequence of values after the change.
        before_dict: mapping key -> sequence of values before the change.
        bins: not used by this function (100 bins are always used).
        xtext: label for the x axis of every panel.
        option: not used by this function.
    """
    panels = (
        (0, 'm_diff', (-0.5, 0.5)),
        (1, 'n diff', (-1, 0.5)),
        (2, 'r diff', (-0.5, 0.5)),
        (3, 'ba diff', (-0.05, 0.05)),
    )
    for count, name, ylims in panels:
        pl.subplot(2, 2, count + 1)
        # The former ``if 0:`` ratio branch could never run and was removed.
        ns = np.array([after_dict[a][count] - before_dict[a][count]
                       for a in before_dict]).T
        bars, edges = np.histogram(ns, bins=100, range=ylims)
        bars = bars / float(ns.size)
        print(ns)
        # bars are centred on each bin and as tall as the bin is wide
        pl.barh((edges[0:-1] + edges[1:]) / 2, bars, align='center',
                height=(edges[1:] - edges[0:-1]), alpha=0.4)
        # NOTE(review): bin_stats is an external helper; presumably it puts
        # all points at x=0.25 into a single bin spanning (0.0, 0.5) - confirm.
        nstats = bin_stats.bin_stats(0.25 * np.ones_like(ns), ns, (0.0, 0.5), -1000.0, 1000.0)
        nstats.lay_bounds(color='r', sigma_choice=[68, 95])
        nstats.plot_ebar('median', 'med95ci', color='r', ecolor='r',
                         marker='s', markersize=3, lw=2, linestyle='none')
        pl.xlabel(xtext)
        pl.ylabel(name)
        pl.ylim(ylims)
        pl.xlim(0, 0.5)

    pl.subplots_adjust(wspace=0.4, hspace=0.4)
    return
Exemplo n.º 12
0
def arbolesRegresion(caract):
    """Cross-validate a decision-tree regressor on selected Boston columns.

    Runs a 10-fold split over the module-level ``boston_X`` / ``boston_Y``
    data, training on the columns listed in ``caract`` only. Prints the
    mean absolute error, mean squared error and R^2 averaged over the
    folds, shows a bar chart of the averaged relative feature importances,
    and writes the tree fitted in the last fold to ``bostonTree.pdf``.

    Args:
        caract: sequence of column indices of ``boston_X`` to use as features.
    """
    
    # NOTE(review): ``compute_importances`` (and ``n_folds`` / ``indices``
    # below) look like keywords of an old scikit-learn release - confirm
    # against the installed version.
    clf = DecisionTreeRegressor(min_samples_leaf=10, min_samples_split=15, max_depth=13, compute_importances=True)
    
    # running sum of the relative importances, one slot per feature (13 slots)
    importancias = [0,0,0,0,0,0,0,0,0,0,0,0,0]    
    mae=mse=r2=0
    
    kf = KFold(len(boston_Y), n_folds=10, indices=True)
    for train, test in kf:
        trainX, testX, trainY, testY=boston_X[train], boston_X[test], boston_Y[train], boston_Y[test]
            
        # ``train`` / ``test`` are rebound from index arrays to the feature
        # matrices restricted to the selected columns
        nCar=len(caract)
        train=np.zeros((len(trainX), nCar))
        test=np.zeros((len(testX), nCar))
        trainYNuevo=trainY
        
        # copy column caract[i] of the fold data into column i
        for i in range(nCar):
            for j in range(len(trainX)):
                train[j][i]=trainX[j][caract[i]]
                
            for k in range(len(testX)):
                test[k][i]=testX[k][caract[i]]
        
        # targets as a column vector
        trainYNuevo=np.reshape(trainYNuevo, (len(trainY), -1))
        
        clf.fit(train, trainYNuevo)
        prediccion=clf.predict(test)            
        
        # alternative kept from the original: fit on all columns
#        clf.fit(trainX, trainY)
#        prediccion=clf.predict(testX)
            
        # accumulate the per-fold error metrics
        mae+=metrics.mean_absolute_error(testY, prediccion)
        mse+=metrics.mean_squared_error(testY, prediccion)
        r2+=metrics.r2_score(testY, prediccion)
        
        # importances relative to the largest one in this fold (max = 100)
        feature_importance = clf.feature_importances_
        feature_importance = 100.0 * (feature_importance / feature_importance.max())
        # NOTE(review): the fixed range(13) presumably requires
        # len(caract) == 13; fewer selected columns would likely raise an
        # IndexError here - confirm.
        for i in range(13):
            importancias[i] = importancias[i] + feature_importance[i]
        
    print 'Error abs: ', mae/len(kf), 'Error cuadratico: ', mse/len(kf), 'R cuadrado: ', r2/len(kf)
    
    # average over the folds (hard-coded 10, matching n_folds above)
    for i in range(13):
        importancias[i] = importancias[i]/10
        
    sorted_idx = np.argsort(importancias)
    pos = np.arange(sorted_idx.shape[0]) + .5
    # reshaped into a single-column 2-D array before plotting
    importancias = np.reshape(importancias, (len(importancias), -1))

    # the dataset is loaded again only to get the feature names for the ticks
    boston = datasets.load_boston()
    pl.barh(pos, importancias[sorted_idx], align='center')
    pl.yticks(pos, boston.feature_names[sorted_idx])
    pl.xlabel('Importancia relativa')
    pl.show()    
    
    # export the tree as fitted in the last fold to a PDF via graphviz
    import StringIO, pydot 
    dot_data = StringIO.StringIO() 
    tree.export_graphviz(clf, out_file=dot_data) 
    graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
    graph.write_pdf("bostonTree.pdf") 
    def plot_predictions(self):
        """Plot test images together with the model's top predicted labels.

        Takes one test batch, runs the model on it, and draws a grid of up
        to 2 x 4 images. Below each image a small horizontal bar chart
        shows the highest-scoring labels, with the true label's bar in red.
        When ``self.only_errors`` is set, only misclassified cases are
        shown; otherwise the cases are chosen at random.
        """
        data = self.get_next_batch(train=False)[2] # get a test batch
        num_classes = self.test_data_provider.get_num_classes()
        NUM_ROWS = 2
        NUM_COLS = 4
        NUM_IMGS = NUM_ROWS * NUM_COLS
        NUM_TOP_CLASSES = min(num_classes, 4) # show this many top labels

        label_names = self.test_data_provider.batch_meta['label_names']
        if self.only_errors:
            preds = n.zeros((data[0].shape[1], num_classes), dtype=n.single)
        else:
            preds = n.zeros((NUM_IMGS, num_classes), dtype=n.single)
            rand_idx = nr.randint(0, data[0].shape[1], NUM_IMGS)
            data[0] = n.require(data[0][:,rand_idx], requirements='C')
            data[1] = n.require(data[1][:,rand_idx], requirements='C')
        # NOTE(review): ``preds`` is handed to the model as an output
        # buffer and is presumably filled in place - confirm.
        data += [preds]

        # Run the model
        # (debug prints that dumped the whole batch to stdout were removed)
        self.libmodel.startFeatureWriter(data, self.sotmax_idx)
        self.finish_batch()

        fig = pl.figure(3)
        fig.text(.4, .95, '%s test case predictions' % ('Mistaken' if self.only_errors else 'Random'))
        if self.only_errors:
            err_idx = nr.permutation(n.where(preds.argmax(axis=1) != data[1][0,:])[0])[:NUM_IMGS] # what the net got wrong
            data[0], data[1], preds = data[0][:,err_idx], data[1][:,err_idx], preds[err_idx,:]

        data[0] = self.test_data_provider.get_plottable_data(data[0])
        # ``range`` iterates the same way on Python 2 and 3 (``xrange`` is
        # Python 2 only).
        for r in range(NUM_ROWS):
            for c in range(NUM_COLS):
                img_idx = r * NUM_COLS + c
                if data[0].shape[0] <= img_idx:
                    break
                # image cell: even rows of the doubled grid
                pl.subplot(NUM_ROWS*2, NUM_COLS, r * 2 * NUM_COLS + c + 1)
                pl.xticks([])
                pl.yticks([])
                try:
                    img = data[0][img_idx,:,:,:]
                except IndexError:
                    # maybe greyscale?
                    img = data[0][img_idx,:,:]
                pl.imshow(img, interpolation='nearest')
                true_label = int(data[1][0,img_idx])

                # highest-scoring (score, label) pairs, ascending by score
                img_labels = sorted(zip(preds[img_idx,:], label_names), key=lambda x: x[0])[-NUM_TOP_CLASSES:]
                # prediction cell: the row directly below the image
                pl.subplot(NUM_ROWS*2, NUM_COLS, (r * 2 + 1) * NUM_COLS + c + 1, aspect='equal')

                ylocs = n.array(range(NUM_TOP_CLASSES)) + 0.5
                height = 0.5
                width = max(ylocs)
                pl.barh(ylocs, [l[0]*width for l in img_labels], height=height, \
                        color=['r' if l[1] == label_names[true_label] else 'b' for l in img_labels])
                pl.title(label_names[true_label])
                pl.yticks(ylocs + height/2, [l[1] for l in img_labels])
                pl.xticks([width/2.0, width], ['50%', ''])
                pl.ylim(0, ylocs[-1] + height*2)
def visualize_chi2(summaries, genres, n_gram=(1, 3), top_features=20):
    """
    Visualize the n-gram features that discriminate best between genres.

    Every summary is split into sentences on '.', each sentence inherits
    the genre of its summary, the sentences are TF-IDF vectorised, and the
    chi-squared score of every feature against the genre labels is
    computed. The highest-scoring features are shown in a bar chart.

    :param summaries: iterable of summary strings
    :param genres: iterable of genre labels, parallel to ``summaries``
    :param n_gram: ``ngram_range`` passed to the vectorizer
    :param top_features: number of highest-scoring features to plot
    :return: None
    """
    vectorizer = TfidfVectorizer(ngram_range=n_gram,
                                 lowercase=True,
                                 norm=None,
                                 smooth_idf=True,
                                 sublinear_tf=True)
    new_summaries = []
    new_genres = []
    # Pair each summary with its genre; iterating the bare tuple
    # ``(summaries, genres)`` would try to unpack the two collections
    # themselves instead.
    for summary, genre in zip(summaries, genres):
        for sentence in summary.split('.'):
            new_summaries.append(sentence)
            new_genres.append(genre)
    X_train = vectorizer.fit_transform(new_summaries)
    chi2score = chi2(X_train, new_genres)[0]
    figure(figsize=(6, 6))
    wscores = zip(vectorizer.get_feature_names(), chi2score)
    # ascending by score, so the last ``top_features`` entries are the best
    wchi2 = sorted(wscores, key=lambda x: x[1])
    # materialised so it can be indexed on Python 3 as well:
    # topchi2[0] holds the labels, topchi2[1] the scores
    topchi2 = list(zip(*wchi2[-top_features:]))
    x = range(len(topchi2[1]))
    labels = topchi2[0]
    barh(x, topchi2[1], align='center', alpha=.2, color='g')
    plot(topchi2[1], x, '-o', markersize=2, alpha=.8, color='g')
    yticks(x, labels)
    # raw string: "\c" is not a valid escape sequence
    xlabel(r'$\chi^2$')
    ylabel('Top discriminative features')
    show()
Exemplo n.º 15
0
def plot_occs_by_motif(by_motif):
    """Plot # occurrences for each motif.

    For every motif two overlapping horizontal bars are drawn: the number
    of occurrences ('Sites') and the sum of the occurrences' ``Z``
    attribute ('Total Z', hatched). Motifs are ordered by number of
    occurrences, then total Z, then name.

    Args:
        by_motif: mapping from motif name to a sequence of occurrence
            objects that expose a ``Z`` attribute.
    """
    # ``items()`` works on Python 2 and 3 (``iteritems()`` is Python 2 only).
    sizes = sorted(
        (len(occs), sum(occ.Z for occ in occs), name)
        for name, occs in by_motif.items())
    bar_positions = numpy.arange(len(sizes))
    num_occs = numpy.asarray([s for s, e, n in sizes])
    total_Z = numpy.asarray([e for s, e, n in sizes])
    pylab.barh(
        bar_positions,
        num_occs,
        height=.8,
        align='center',
        label='Sites',
        color='blue',
    )
    pylab.barh(
        bar_positions,
        total_Z,
        height=.8,
        align='center',
        label='Total Z',
        color='blue',
        edgecolor='white',
        hatch='/',
    )
    pylab.yticks(bar_positions, [n for x, e, n in sizes])
    # Limits are passed positionally: the ``ymin``/``ymax`` keyword names
    # are not accepted by newer matplotlib releases.
    pylab.ylim(-.5, len(sizes) - .5)
    pylab.xlabel('occurrences')
    pylab.legend(loc='lower right')
def plot_variable_importance(feature_importance, names_cols, save_name, save):
    """Chart the 20 largest variable importances as percentages of the maximum.

    Args:
        feature_importance: array of importances, one per variable.
        names_cols: array of variable names, indexable by an index array.
        save_name: output path, used only when ``save`` is true.
        save: true to write the figure to ``save_name`` and close all
            figures; false to display it.
    """
    relative = feature_importance / feature_importance.max()

    # Indices of up to 20 variables, most important first; the y positions
    # run downwards so the most important variable is the top bar.
    top_idx = np.argsort(relative)[::-1][:20]
    bar_pos = (np.arange(top_idx.shape[0]) + .8)[::-1]

    plot.figure(num=None, facecolor='w')
    plot.barh(bar_pos, relative[top_idx] * 100, align='center')
    plot.yticks(bar_pos, names_cols[top_idx])

    tick_values = np.arange(0, 120, 20)
    tick_labels = ['0 %', '20 %', '40 %', '60 %', '80 %', '100 %']
    plot.xticks(tick_values, tick_labels)

    plot.margins(0.02)
    plot.subplots_adjust(bottom=0.15)
    plot.title('Variable Importance')

    if not save:
        plot.show()
        return

    plot.savefig(save_name, bbox_inches='tight', dpi=300)
    plot.close("all")
Exemplo n.º 17
0
def length_stats_chart(path, prefixes, sortby=1):
  """Save a bar chart of document-length statistics for each prefix.

  For every prefix the median, mean and standard deviation returned by
  ``length_stats`` are drawn as horizontal bars (green, red and blue).

  Args:
    path: output file for the figure.
    prefixes: iterable of language codes to chart.
    sortby: index into the ``(prefix, median, mean, std)`` tuple used to
      order the rows; the default ``1`` sorts by median.
  """
  stats = []
  for prefix in prefixes:
    med, m, s = length_stats(prefix)
    stats.append((prefix, med, m, s))

  stats.sort(key=operator.itemgetter(sortby))
  prefixes, med_list, mean_list, std_list = zip(*stats)

  blockSize = 8
  ind = p.arange(0, blockSize*len(prefixes), blockSize) # y location for groups
  height = 3 # bar height

  # The std bar is twice as tall and drawn first, so it sits behind the
  # median (lower half) and mean (upper half) bars.
  p3 = p.barh(ind, std_list, 2 * height, color='b', linewidth=0)
  p2 = p.barh(ind, med_list, height, color='g', linewidth=0)
  p1 = p.barh(ind+height, mean_list, height, color='r', linewidth=0)

  p.ylim(-height, len(prefixes) * blockSize)
  yfontprop = FontProperties(size=4)
  xfontprop = FontProperties(size='smaller')
  p.xlabel('Unicode Codepoints')
  p.ylabel('Language Code')
  p.title('Descriptive Statistics for Document Lengths')
  p.gca().yaxis.tick_left()
  p.yticks(ind+height, prefixes, fontproperties=yfontprop)
  xmin, xmax = p.xlim()
  p.xticks(p.arange(xmin, xmax, 1000), fontproperties=xfontprop)
  p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
  p.legend((p1[0], p2[0], p3[0]), ('Mean', 'Median', 'Standard Deviation'), prop=xfontprop, loc='lower right')

  p.savefig(path, dpi=300)
  # Clear before closing: calling clf() after close() would implicitly
  # create a new, empty figure and leave it open.
  p.clf()
  p.close()
def plot_feature_importances_cancer(model):
    """Draw the feature importances of ``model`` as a horizontal bar chart.

    One bar is drawn per column of the module-level ``cancer`` dataset,
    labelled with the matching entry of ``cancer.feature_names``.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
    """
    n_features = cancer.data.shape[1]
    importances = model.feature_importances_

    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)

    plt.ylabel("Feature")
    plt.xlabel("Feature importance")
    # half a row of padding below the first and above the last bar
    plt.ylim(-1, n_features)
Exemplo n.º 19
0
def barh(x, y=None, title='', xlabel='', ylabel=''):
    """Draw a labelled horizontal bar chart in a new 10x5 figure.

    The data can be given either as two parallel sequences (labels ``x``
    and values ``y``) or as a single sequence of ``(label, value)`` pairs
    in ``x``.

    Args:
        x: sequence of labels, or sequence of ``(label, value)`` pairs.
        y: sequence of values when ``x`` holds only labels.
        title: figure title.
        xlabel: x-axis label.
        ylabel: y-axis label.

    Raises:
        TypeError: if the arguments match neither accepted form.
    """
    import numpy as np
    L = (tuple, list, np.ndarray)

    # separate arrays
    if isinstance(x, L) and isinstance(y, L):
        # materialised: the pairs are iterated, measured and unzipped
        # below, which a lazy zip object does not support
        xylist = list(zip(x, y))
    # list of two-tuples
    elif isinstance(x, L) and isinstance(x[0], L) and len(x[0]) == 2:
        xylist = x
    else:
        raise TypeError(
            "expected two sequences or a sequence of (label, value) pairs")

    # Imported only after validation so that bad input is rejected without
    # loading the plotting backend.
    import pylab as P

    P.figure(figsize=(10, 5)) # image dimensions
    P.title(title, size='medium')
    P.xlabel(xlabel)
    P.ylabel(ylabel)

    # add bars; ``align='edge'`` is spelled out because the tick positions
    # below assume bars that start at i + 0.25
    for i, item in enumerate(xylist):
        P.barh(i + 0.25, item[1], align='edge')

    # set xlim to 110% of the largest value
    width = np.max([item[1] for item in xylist])
    P.xlim(0, width*1.1)

    # axis setup: ticks at the centre of each default-height (0.8) bar
    P.yticks(np.arange(0.65, len(xylist)),
             ['%s' % label for label, _ in xylist], size='medium')
def plotNogazeDuration():
    """Plot, per subject, the duration distributions of missing-gaze and gaze events.

    For subjects 100..119 the eye-tracking trials are read with
    ``readTobii``. A sample counts as missing when columns 7 and 8 of
    ``trl.gaze`` are both NaN. Short events are filtered out with
    ``removeShortEvs`` and the durations of the remaining missing / present
    events are histogrammed into 0.5-wide bins between 0 and 10. Missing
    durations are drawn to the left (black), present ones to the right
    (green), one subplot per subject.
    """
    plt.figure(figsize=(12,12))
    for vp in range(100,120):
        print(vp)
        plt.subplot(5,4,vp-99)
        plt.ion()
        data = readTobii(vp, 0, ETDATAPATH)
        datT = []
        datF = []
        for trl in data:
            trl.extractBasicEvents()
            miss = np.int32(np.logical_and(np.isnan(trl.gaze[:,7]),
                                           np.isnan(trl.gaze[:,8])))
            # NOTE(review): thresholds of 2*60 and 1*60 samples and the
            # division by 60 presumably assume 60 Hz data - confirm.
            miss = removeShortEvs(miss, 2*60)
            miss = 1 - removeShortEvs(1 - miss, 1*60)
            datT.extend((ev[1] - ev[0]) / 60. for ev in tseries2eventlist(miss))
            datF.extend((ev[1] - ev[0]) / 60. for ev in tseries2eventlist(1 - miss))

        x = np.linspace(0, 10, 21)
        h = x[-1] / float(x.size - 1)  # bin width
        # ``density=True`` is the supported spelling of the former
        # ``normed=True``; for equal-width bins the result is the same.
        a = np.histogram(datT, bins=x, density=True)
        plt.barh(x[:-1], -a[0], ec='k', fc='k', height=h, lw=0)
        a = np.histogram(datF, bins=x, density=True)
        plt.barh(x[:-1], a[0], ec='g', fc='g', height=h, lw=0)
        plt.xlim([-0.7,0.7])
        plt.gca().set_yticks(range(0,10,2))
        plt.ylim([0,10])
        # NOTE(review): vp runs from 100 to 119, so this condition is
        # never true and the legend is never drawn - confirm the intended
        # subject number.
        if vp == 10:
            plt.legend(['blikn','gaze'])
Exemplo n.º 21
0
def main(args):
    """Plot the job timeline of every thread as horizontal bars and show it.

    Args:
        args: command-line arguments; ``args[0]`` is the script name and
            ``args[1]`` the number of threads. For thread ``i`` the file
            ``jobtimes_<i>.txt`` is read through ``get_times``.
    """
    # 'jobs' and the plotting module 'pl' live at module level
    global jobs
    global pl

    # without a thread count there is nothing to plot: print usage and quit
    if len(args) < 2:
        print("Usage: python plot_jobtimes.py NUMBER_OF_THREADS")
        exit()
    thread_count = int(args[1])

    # one row of bars per thread
    for thread_id in range(thread_count):
        records = get_times("jobtimes_{i}.txt".format(i=thread_id))
        for record in records:
            # record[0] selects the colour through 'jobs'; record[1] and
            # record[2] bound the bar (presumably start and end time)
            for job_name in jobs:
                if record[0] != job_name:
                    continue
                # bars are positioned relative to the thread's first record
                pl.barh(
                    thread_id,
                    record[2] - record[1],
                    left=record[1] - records[0][1],
                    color=jobs[job_name],
                )

    # blocks until the user closes the window
    pl.show()
Exemplo n.º 22
0
def wiki_sizes_chart(path, prefixes, upperlimit = None ):
  """Save a chart of how many documents fall under each size threshold.

  For every prefix a blue bar shows ``dumpSize(prefix)``; on top of it one
  bar per threshold shows ``docs_under_thresh(prefix, threshold)``. Rows
  are ordered by dump size.

  Args:
    path: output file for the figure.
    prefixes: iterable of language codes to chart.
    upperlimit: optional upper limit of the x axis; ignored when falsy.
  """
  prefixes, sizes = zip(*sorted([(pr, dumpSize(pr)) for pr in prefixes],
                                key=operator.itemgetter(1)))

  blockSize = 5
  ind = p.arange(0, blockSize*len(prefixes), blockSize) # y location for groups
  height = 4 # bar height

  colors = html_colors

  thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]

  # overall size first, so the threshold bars are drawn over it
  p.barh(ind, sizes, height, color='b', linewidth=0, align='center')

  # thresholds are in descending order, so each later (smaller) bar is
  # drawn on top of the previous one; colours cycle through ``colors``
  subbars = []
  for i, thresh in enumerate(thresholds):
    subbars.append(p.barh(ind,
                          [docs_under_thresh(pr, thresh) for pr in prefixes],
                          height,
                          color=colors[i % len(colors)],
                          linewidth=0,
                          align='center'))

  p.ylim(-height, len(prefixes) * blockSize)
  if upperlimit:
    p.xlim(0, upperlimit)
  yfontprop = FontProperties(size=4)
  xfontprop = FontProperties(size=4)
  p.xlabel('Documents')
  p.ylabel('Language Code')
  p.title('Number of Documents Under Threshold')
  p.yticks(ind, prefixes, fontproperties=yfontprop)
  xmin, xmax = p.xlim()
  xtick_interval = rounded_interval(xmin, xmax, 20, 2)
  p.xticks(p.arange(xmin, xmax, xtick_interval), fontproperties=xfontprop)
  p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
  p.gca().yaxis.tick_left()
  # a real list of labels, not a lazy ``map`` object
  p.legend([b[0] for b in subbars],
           [str(thresh) for thresh in thresholds],
           prop=xfontprop,
           loc='lower right')

  p.savefig(path, dpi=300)
  # Clear before closing: calling clf() after close() would implicitly
  # create a new, empty figure and leave it open.
  p.clf()
  p.close()
Exemplo n.º 23
0
    def plot_predictions(self):
        """Plot test images with bar charts of the model's top predictions.

        One test batch is run through the model and up to 2 x 4 cases are
        drawn: the image on top and, below it, the highest-scoring labels
        as horizontal bars (the true label in red, the others in blue).
        With ``self.only_errors`` only misclassified cases are shown;
        otherwise the cases are picked at random.
        """
        data = self.get_next_batch(train=False)[2] # get a test batch
        num_classes = self.test_data_provider.get_num_classes()
        NUM_ROWS = 2
        NUM_COLS = 4
        NUM_IMGS = NUM_ROWS * NUM_COLS
        NUM_TOP_CLASSES = min(num_classes, 4) # show this many top labels

        label_names = self.test_data_provider.batch_meta['label_names']
        if self.only_errors:
            preds = n.zeros((data[0].shape[1], num_classes), dtype=n.single)
        else:
            preds = n.zeros((NUM_IMGS, num_classes), dtype=n.single)
            rand_idx = nr.randint(0, data[0].shape[1], NUM_IMGS)
            data[0] = n.require(data[0][:,rand_idx], requirements='C')
            data[1] = n.require(data[1][:,rand_idx], requirements='C')
        # NOTE(review): ``preds`` is passed along as an output buffer and
        # is presumably filled in place by the model - confirm.
        data += [preds]

        # Run the model
        self.libmodel.startFeatureWriter(data, self.sotmax_idx)
        self.finish_batch()

        fig = pl.figure(3)
        fig.text(.4, .95, '%s test case predictions' % ('Mistaken' if self.only_errors else 'Random'))
        if self.only_errors:
            err_idx = nr.permutation(n.where(preds.argmax(axis=1) != data[1][0,:])[0])[:NUM_IMGS] # what the net got wrong
            data[0], data[1], preds = data[0][:,err_idx], data[1][:,err_idx], preds[err_idx,:]

        data[0] = self.test_data_provider.get_plottable_data(data[0])
        pl.subplots_adjust(hspace=.3)
        # Indentation below uses spaces only (the original mixed in tabs),
        # and ``range`` replaces the Python 2 only ``xrange``.
        for r in range(NUM_ROWS):
            for c in range(NUM_COLS):
                img_idx = r * NUM_COLS + c
                if data[0].shape[0] <= img_idx:
                    break
                # image cell: even rows of the doubled grid
                pl.subplot(NUM_ROWS*2, NUM_COLS, r * 2 * NUM_COLS + c + 1)
                pl.xticks([])
                pl.yticks([])
                try:
                    img = data[0][img_idx,:,:,:]
                except IndexError:
                    # maybe greyscale?
                    img = data[0][img_idx,:,:]
                pl.imshow(img, interpolation='nearest')
                true_label = int(data[1][0,img_idx])

                # highest-scoring (score, label) pairs, ascending by score
                img_labels = sorted(zip(preds[img_idx,:], label_names), key=lambda x: x[0])[-NUM_TOP_CLASSES:]
                # prediction cell: the row directly below the image
                pl.subplot(NUM_ROWS*2, NUM_COLS, (r * 2 + 1) * NUM_COLS + c + 1, aspect='equal')

                ylocs = n.array(range(NUM_TOP_CLASSES)) + 0.5
                height = 0.5
                width = max(ylocs)
                pl.barh(ylocs, [l[0]*width for l in img_labels], height=height, \
                        color=['r' if l[1] == label_names[true_label] else 'b' for l in img_labels])
                pl.title(label_names[true_label])
                pl.yticks(ylocs + height/2, [l[1] for l in img_labels])
                pl.xticks([width/2.0, width], ['50%', ''])
                pl.ylim(0, ylocs[-1] + height*2)
Exemplo n.º 24
0
    def plot_cascade(self, vertical=True):
        """Draw the optimized-investment cascade as stacked, stepped bars.

        Entries of a copy of ``self.data`` are sorted by ``icer``; those
        whose ``opt_spend`` exceeds 200e3 are kept and their spend is
        divided by 1e6. Each kept entry adds its spend on top of the
        running total, and its bar is repeated for every later entry,
        which produces the cascade shape. The running total is printed
        next to each step.

        Args:
            vertical: when true, draw horizontal bars with the entries on
                the y axis on a 12x12 figure; otherwise draw vertical bars
                with rotated x labels on a 16x8 figure.

        Returns:
            The created figure.
        """
        if vertical:
            fig_size, ax_size = (12, 12), [0.45, 0.05, 0.5, 0.9]
        else:
            fig_size, ax_size = (16, 8), [0.05, 0.45, 0.9, 0.5]

        df = sc.dcp(self.data)
        cutoff = 200e3
        fig = pl.figure(figsize=fig_size)
        df.sort(col='icer', reverse=False)

        # keep only entries above the cutoff; the spend is divided by 1e6
        spend = hp.arr(df['opt_spend'])
        inds = sc.findinds(spend > cutoff)
        spend = spend[inds]
        spend /= 1e6
        labels = df['shortname'][inds]

        npts = len(spend)
        colors = sc.gridcolors(npts, limits=(0.25, 0.75))
        x = np.arange(len(spend))
        pl.axes(ax_size)

        bar_thickness = 0.9
        for pt in range(npts):
            positions = x[pt:]               # this entry and all later ones
            before = sum(spend[:pt])         # running total before this entry
            after = sum(spend[:pt + 1])      # running total including it
            total_text = '%0.1f' % after
            if vertical:
                pl.barh(positions, width=spend[pt], left=before,
                        height=bar_thickness, color=colors[pt])
                pl.text(after, x[pt], total_text,
                        verticalalignment='center', color=colors[pt])
            else:
                pl.bar(positions, height=spend[pt], bottom=before,
                       width=bar_thickness, color=colors[pt])
                pl.text(x[pt], after + 1, total_text,
                        horizontalalignment='center', color=colors[pt])

        axes = pl.gca()
        if vertical:
            pl.xlabel('Spending for optimized investment cascade')
            axes.set_yticks(x)
            ticklabels = axes.set_yticklabels(labels)
        else:
            pl.ylabel('Optimized investment cascade')
            axes.set_xticks(x)
            ticklabels = axes.set_xticklabels(labels, rotation=90)

        # colour every tick label like its bar
        for t, tick_label in enumerate(ticklabels):
            tick_label.set_color(colors[t])

        pl.gca().set_facecolor('none')
        pl.title('Investment cascade')
        return fig
Exemplo n.º 25
0
def histogram(c, plot_name="test", plot_title="", plot_xlabel=""):
    """Save the values of ``c`` as a numbered horizontal bar chart.

    Args:
        c: sequence of bar lengths; bar ``k`` is labelled ``k + 1``.
        plot_name: output file name without the ``.png`` extension.
        plot_title: chart title.
        plot_xlabel: x-axis label.
    """
    import pylab

    pylab.figure(1)
    bar_count = len(c)
    # bars are centred at 0.5, 1.5, 2.5, ...
    centers = pylab.arange(bar_count) + .5
    pylab.barh(centers, c, align='center')
    pylab.yticks(centers, range(1, bar_count + 1))

    pylab.xlabel(plot_xlabel)
    pylab.title(plot_title)
    pylab.grid(True)

    output_file = plot_name + ".png"
    pylab.savefig(output_file)
Exemplo n.º 26
0
def histogram(c, plot_name="test", plot_title="", plot_xlabel=""):
    """Write a horizontal bar chart of ``c`` to ``<plot_name>.png``.

    The bars are drawn on figure 1, one per element of ``c``, and are
    numbered from 1 upwards on the y axis.

    Args:
        c: sequence of bar lengths.
        plot_name: path of the output image, without extension.
        plot_title: title placed above the chart.
        plot_xlabel: label of the x axis.
    """
    import pylab

    pylab.figure(1)

    # y position (bar centre) and 1-based tick label of every bar
    positions = pylab.arange(len(c)) + .5
    numbering = range(1, len(c) + 1)

    pylab.barh(positions, c, align='center')
    pylab.yticks(positions, numbering)
    pylab.xlabel(plot_xlabel)
    pylab.title(plot_title)
    pylab.grid(True)
    pylab.savefig(plot_name + ".png")
def plot_feature_importance(feature_importance, feature_names):
    """Show a horizontal bar chart of importances scaled to 0-100.

    Each importance is expressed as a percentage of the largest one and
    the bars are drawn from least (bottom) to most (top) important in the
    right-hand cell of a 1x2 subplot grid. Both arguments are indexed
    with an integer array, so they must support NumPy-style fancy
    indexing.
    """
    # Rescale so the most important feature sits at exactly 100.
    relative = 100.0 * (feature_importance / feature_importance.max())
    order = np.argsort(relative)
    bar_centres = np.arange(order.shape[0]) + .5

    pl.subplot(1, 2, 2)
    pl.barh(bar_centres, relative[order], align='center')
    pl.yticks(bar_centres, feature_names[order])
    pl.xlabel('Relative Importance')
    pl.title('Variable Importance')
    pl.show()
Exemplo n.º 28
0
def print_figure(result_db, label_txt):
    """
    Print inference results

    Draws every entry of ``input_data`` in the HDF5 file ``result_db`` on
    a grid six columns wide and, beneath each image, a horizontal bar
    chart of its three highest-scoring labels (the best one in red, the
    others in blue). Labels are read from ``label_txt`` with
    ``np.loadtxt``. Returns immediately when either argument is None.

    NOTE(review): bar lengths are ``score * width`` against 0%/50%/100%
    ticks, so scores are presumably probabilities in [0, 1] - confirm.
    """
    if result_db is None or label_txt is None:
        return

    NUM_COLS = 6
    NUM_TOPK_CLASSES = 3
    height = 10
    margin = 1
    # Bar positions and the full-scale width are the same for every image.
    ylocs = np.array(range(NUM_TOPK_CLASSES)) * (height + margin) + margin
    width = max(ylocs)

    # The context manager closes the HDF5 file even if plotting raises.
    with h5py.File(result_db, 'r') as db:
        labels = np.loadtxt(label_txt, dtype='object')
        NUM_IMGS = len(db['input_ids'])
        # Ceiling division: a partially filled last row still needs a row.
        NUM_ROWS = NUM_IMGS // NUM_COLS + (NUM_IMGS % NUM_COLS > 0)
        fig = pl.figure(figsize=(16, 4))
        fig.set_canvas(pl.gcf().canvas)

        # Images go on the even grid rows; the bar charts fill the odd ones.
        for row in range(NUM_ROWS):
            for col in range(NUM_COLS):
                idx = row * NUM_COLS + col
                if idx == NUM_IMGS:
                    break
                pl.subplot(NUM_ROWS * 2, NUM_COLS,
                           row * 2 * NUM_COLS + col + 1)
                pl.xticks([])
                pl.yticks([])
                pl.imshow(db['input_data'][idx], interpolation='nearest')

        # keys() is a non-indexable view on Python 3; list() works on
        # both Python 2 and 3.
        outputs = db['outputs']
        res = outputs[list(outputs.keys())[0]]
        for elem_id, elem_data in enumerate(res):
            row = elem_id // NUM_COLS
            col = elem_id % NUM_COLS
            # Ascending by score, so the best label is the last entry.
            img_labels = sorted(zip(elem_data, labels),
                                key=lambda pair: pair[0])[-NUM_TOPK_CLASSES:]
            ax = pl.subplot(NUM_ROWS * 2,
                            NUM_COLS, (row * 2 + 1) * NUM_COLS + col + 1,
                            aspect='equal')
            ax.yaxis.set_label_position("right")
            ax.yaxis.set_label_coords(1.25, 0.5)

            top_class = img_labels[-1][1]
            bar_colors = ['r' if name == top_class else 'b'
                          for _, name in img_labels]
            pl.barh(ylocs, [score * width for score, _ in img_labels],
                    height=height, color=bar_colors)
            pl.yticks(ylocs + (height + margin) / 2.0,
                      [name.replace('_', '\n') for _, name in img_labels],
                      fontsize=16)
            pl.xticks([0, width / 2.0, width], ['0%', '50%', '100%'])
            pl.ylim(0, ylocs[-1] + height + margin)

    pl.tight_layout()
    pl.show()
Exemplo n.º 29
0
def essay_char(essay):
    """Save a horizontal bar chart of ASCII-letter frequencies in ``essay``.

    Every character of ``essay`` that is an ASCII letter is counted
    (upper and lower case separately); one bar per letter is drawn in
    sorted order. The image is written to the directory stored in
    ``config['/img']['tools.staticdir.dir']`` under the name
    ``essay-char``. Note that the pylab ``rcParams`` changed here are
    global and stay changed after the call.
    """
    from pylab import xlabel, ylabel, savefig, title,\
         yticks, xlim, ylim, xticks, figure, barh, grid, rcParams
    from string import ascii_letters

    global config

    cnt = {x: 0 for x in ascii_letters}

    for c in essay:
        # dict.has_key() no longer exists on Python 3.
        if c in cnt:
            cnt[c] += 1

    max_count = max(cnt.values())

    titlestr = "Essay Char"
    # Floor division keeps the Python 2 result; at least 1 inch wide so a
    # text with fewer than four occurrences does not ask for a 0-width figure.
    figure(figsize=(max(1, max_count // 4), 15), dpi=60)

    rcParams['font.size'] = 17
    rcParams['text.color'] = 'c'
    rcParams['xtick.color'] = 'r'
    rcParams['ytick.color'] = 'y'
    rcParams['figure.facecolor'] = 'k'
    rcParams['figure.edgecolor'] = 'b'
    rcParams['savefig.facecolor'] = rcParams['figure.facecolor']
    rcParams['savefig.edgecolor'] = rcParams['figure.edgecolor']
    rcParams['savefig.dpi'] = rcParams['figure.dpi']

    # The original max(cnt.values()*2) repeated the list, so the limit it
    # produced was simply the largest count; that limit is kept.
    # NOTE(review): twice the largest count may have been intended - confirm.
    xlim(0, max_count)
    ylim(0, len(cnt)*2)

    kbuf = sorted(cnt)

    # Letters are spaced two units apart on the y axis.
    xticks(range(int(xlim()[0]), int(xlim()[1]), 2), rotation=45)
    yticks(range(int(ylim()[0]), int(ylim()[1]), 2), kbuf, rotation=-45)

    vbuf = [cnt[c] for c in kbuf]
    grid()

    for n, w in enumerate(vbuf):
        barh(n*2, w, height=1.5, left=0, align='center')

    title(titlestr)
    xlabel('Characters Count')
    ylabel('Essay Characters')
    savefig(config['/img']['tools.staticdir.dir'] + '/' + titlestr.replace(' ', '-').lower(), bbox_inches='tight', pad_inches=0)
Exemplo n.º 30
0
def wiki_sizes_chart(path, prefixes, upperlimit=None):
    """Save a horizontal bar chart of document counts per prefix to ``path``.

    For each prefix a blue bar of length ``dumpSize(prefix)`` is drawn and
    overlaid with one bar per threshold whose length is
    ``docs_under_thresh(prefix, threshold)``. Prefixes are ordered by
    their ``dumpSize``. ``upperlimit``, when truthy, caps the x axis.

    NOTE(review): judging by the labels, the prefixes are language codes
    and the values are document counts - confirm against the helpers.
    """
    prefixes, sizes = zip(*sorted([(pr, dumpSize(pr)) for pr in prefixes],
                                  key=operator.itemgetter(1)))

    blockSize = 5
    ind = p.arange(0, blockSize * len(prefixes),
                   blockSize)  # y location for groups
    height = 4  # bar height

    colors = html_colors

    thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]

    # Overall size first, so the per-threshold bars are painted on top of it.
    p.barh(ind,
           sizes,
           height,
           color='b',
           linewidth=0,
           align='center')
    subbars = []
    for i, thresh in enumerate(thresholds):
        subbars.append(
            p.barh(ind, [docs_under_thresh(pr, thresh) for pr in prefixes],
                   height,
                   # Cycle through the palette if it is shorter than the
                   # threshold list.
                   color=colors[i % len(colors)],
                   linewidth=0,
                   align='center'))

    p.ylim(-height, len(prefixes) * blockSize)
    if upperlimit:
        p.xlim(0, upperlimit)
    yfontprop = FontProperties(size=4)
    xfontprop = FontProperties(size=4)
    p.xlabel('Documents')
    p.ylabel('Language Code')
    p.title('Number of Documents Under Threshold')
    p.yticks(ind, prefixes, fontproperties=yfontprop)
    xmin, xmax = p.xlim()
    xtick_interval = rounded_interval(xmin, xmax, 20, 2)
    p.xticks(p.arange(xmin, xmax, xtick_interval), fontproperties=xfontprop)
    p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
    p.gca().yaxis.tick_left()
    # map() is lazy on Python 3; hand the legend a real list of labels.
    p.legend([b[0] for b in subbars],
             list(map(str, thresholds)),
             prop=xfontprop,
             loc='lower right')

    p.savefig(path, dpi=300)
    # Clear before closing: calling clf() after close() would implicitly
    # create a fresh empty figure and leave it open.
    p.clf()
    p.close()
Exemplo n.º 31
0
def plot_occupancy(occupancy,
                   offset=0.0,
                   cm=None,
                   n_cages=None,
                   n_animals=None,
                   label_left=None):
    """Draw one row of horizontal bars per animal, coloured by cage.

    ``occupancy`` is indexed as a 2-D array with columns
    ``[enter, exit, cage, animal]``. Each row becomes a bar from ``enter``
    to ``exit`` on the animal's line; the bar colour is taken from the
    colormap ``cm`` (``default_cm`` when None) according to the cage.

    offset: added to every y position.
    n_cages / n_animals: override the counts otherwise derived from the
        data; they set the colour scale and the bar height respectively.
    label_left: x position of the animal labels; by default each label
        is placed at the animal's first ``enter`` value.

    NOTE(review): colours are looked up by cage value, so cage ids are
    assumed to be 0..n_cages-1 - confirm against the data source.
    """
    if cm is None:
        cm = default_cm
    # [enter, exit, cage, animal]

    # get all animals
    aids = numpy.unique(occupancy[:, 3])
    aids.sort()
    n_aids = len(aids) if n_animals is None else n_animals

    # find # of cages
    if n_cages is None:
        n_cages = len(numpy.unique(occupancy[:, 2]))
    # give each cage a color; with a single cage the original divisor
    # n_cages - 1 is zero (0/0 -> NaN colour), so clamp it to 1.
    color_span = max(n_cages - 1.0, 1.0)
    colors = {cid: cm(cid / color_span) for cid in numpy.arange(n_cages)}

    bar_height = 1. / n_aids
    # plot each animal
    for (i, aid) in enumerate(aids):
        # get occupancy for this animal
        ao = occupancy[occupancy[:, 3] == aid]

        # add label
        ty = i * bar_height + offset
        tx = ao[0, 0] if label_left is None else label_left
        pylab.text(tx, ty, str(aid), ha='right', va='center', color='k')

        # barh(bottom, width, height, left, **kwargs)
        durations = ao[:, 1] - ao[:, 0]
        cs = [colors[b] for b in ao[:, 2]]
        # Same y position for every bar belonging to this animal.
        l = numpy.ones_like(durations) * i * bar_height + offset
        pylab.barh(l,
                   durations,
                   bar_height,
                   ao[:, 0],
                   color=cs,
                   linewidth=0)

    # Grow (never shrink) the y range so that [offset, 1 + offset] is visible.
    yl = pylab.ylim()
    ylmin = min(yl[0], offset)
    ylmax = max(yl[1], 1 + offset)
    if yl != (ylmin, ylmax):
        pylab.ylim(ylmin, ylmax)
Exemplo n.º 32
0
def plot_importance(clf, train_df, features):
    """Plot the feature importances of the fitted estimator ``clf``.

    Importances are rescaled so the largest equals 100 and drawn as
    horizontal bars in ascending order, labelled with the column names of
    ``train_df[features]``. The chart goes into the right-hand cell of a
    1x2 subplot grid and is shown immediately.
    """
    raw = clf.feature_importances_
    relative = 100.0 * (raw / raw.max())

    order = np.argsort(relative)
    bar_centres = np.arange(order.shape[0]) + .5
    column_names = train_df[features].columns

    pl.subplot(1, 2, 2)
    pl.barh(bar_centres, relative[order], align='center')
    pl.yticks(bar_centres, column_names[order])
    pl.xlabel('Relative Importance')
    pl.title('Variable Importance')
    pl.show()
Exemplo n.º 33
0
def question_a():
    """Plot documents per topic and log the per-class document totals.

    Counts how many entries of ``train_set`` and ``test_set`` fall under
    each target name, draws the training counts as a horizontal bar
    chart, and logs the summed counts of the topics listed in
    ``category_CT`` and ``category_RA`` for both sets.
    """
    logging.info("<Question A> Plotting histogram")

    # topic name -> number of documents with that topic
    train_count = {}
    test_count = {}

    # NOTE(review): the loop length comes from proc_train_set while the
    # labels are read from train_set; kept as in the original - confirm
    # that the two always have the same length.
    for i in range(len(proc_train_set.target)):
        name = train_set.target_names[train_set.target[i]]
        train_count[name] = train_count.get(name, 0) + 1

    for target in test_set.target:
        name = test_set.target_names[target]
        test_count[name] = test_count.get(name, 0) + 1

    # plot histogram for number of documents vs. topic name
    pl.figure(1)
    # Horizontal bars: topic names run along the y axis and the counts
    # along the x axis (the two labels used to be swapped).
    pl.xlabel('Number of Topics')
    pl.ylabel('Topic Name')
    yloc = pl.arange(len(train_count))
    pl.title('Histogram of Number of Documents Per Topic')
    pl.yticks(yloc, list(train_count.keys()))
    pl.barh(yloc, list(train_count.values()), align='center', color='green')
    pl.tight_layout()

    # get number of docs of each category
    CT_count_train = sum(train_count[name] for name in category_CT)
    CT_count_test = sum(test_count[name] for name in category_CT)
    RA_count_train = sum(train_count[name] for name in category_RA)
    RA_count_test = sum(test_count[name] for name in category_RA)

    logging.info(
        'Computer Technology - train data: {0}'.format(CT_count_train))
    logging.info('Computer Technology - test data: {0}'.format(CT_count_test))
    logging.info(
        'Recreational Activity - train data: {0}'.format(RA_count_train))
    logging.info(
        'Recreational Activity - test data: {0}'.format(RA_count_test))

    pl.show()
def question_a():
    """Plot documents per topic (training set) and log class totals.

    Counts the documents of ``train_dataset`` and ``test_dataset`` per
    target name, draws the training counts as a horizontal bar chart
    coloured with the ``jet`` colormap, and logs the summed counts of the
    topics in ``category_CT`` and ``category_RA`` for both datasets.
    """
    logger.info("EXECUTING: QUESTION A")
    logger.info("Plotting histogram of the number of documents per topic (Training Dataset)")

    count_train = {}
    count_test = {}

    # count the number of documents for each topic name in training dataset
    for target in train_dataset.target:
        name = train_dataset.target_names[target]
        count_train[name] = count_train.get(name, 0) + 1

    # count the number of documents for each topic name in testing dataset
    for target in test_dataset.target:
        name = test_dataset.target_names[target]
        count_test[name] = count_test.get(name, 0) + 1

    logger.info("Histogram plotted")

    # plot histogram for number of documents vs. topic name
    pl.figure(1)
    pl.ylabel('Topic Name')
    jet = pl.get_cmap('jet')
    pl.xlabel('Number of Topics')
    pos = pl.arange(len(count_train)) + 0.5
    pl.title('Histogram of Number of Documents Per Topic')
    # Dict views are not sequences on Python 3, so materialise them.
    pl.yticks(pos, list(count_train.keys()))
    pl.barh(pos, list(count_train.values()), align='center', color=jet(np.linspace(0, 1.0, len(count_train))))

    # count number of documents in CT and RA classes; each class is summed
    # on its own so that lists of different length are not truncated, as
    # they were by zip(category_CT, category_RA).
    train_CT = sum(count_train[name] for name in category_CT)
    test_CT = sum(count_test[name] for name in category_CT)
    train_RA = sum(count_train[name] for name in category_RA)
    test_RA = sum(count_test[name] for name in category_RA)

    logger.info("TRAINING DATASET")
    logger.info("Number of Documents in Computer Technology : {}".format(train_CT))
    logger.info("Number of Documents in Recreational Activity : {}".format(train_RA))

    logger.info("TESTING DATASET")
    logger.info("Number of Documents in Computer Technology : {}".format(test_CT))
    logger.info("Number of Documents in Recreational Activity : {}".format(test_RA))

    pl.show()
Exemplo n.º 35
0
def summary_xyplot(df,var):
    """Fit a random forest on ``df`` and show its feature importances.

    Every column other than ``var`` that ``DataFrame.describe()`` reports
    on is used as a feature to predict ``var``; the importances are shown
    as horizontal bars in ascending order.

    Returns the result of ``pl.show()``.
    """
    #random forest
    # DataFrame.ix has been removed from pandas; a boolean column mask is
    # a label-based selection, so .loc is the direct replacement.
    features = np.array(df.loc[:, df.columns != var].describe().keys())
    clf = RandomForestClassifier()
    clf.fit(df[features], df[var])
    importances = clf.feature_importances_
    sorted_idx = np.argsort(importances)
    padding = np.arange(len(features)) + 0.5
    pl.barh(padding, importances[sorted_idx], align='center')
    pl.yticks(padding, features[sorted_idx])
    pl.xlabel("Relative Importance")
    pl.title("Variable Importance")
    return pl.show()
Exemplo n.º 36
0
def plot_feature_importances(features, feature_importances):
    """Save a horizontal bar chart of ``feature_importances`` as a PNG.

    One bar is drawn per entry, labelled with the matching entry of
    ``features``. The axes are fixed to x in [0, 0.25] and y in [0, 12],
    and the chart is written to 'RF_featureimportances_2Species_3.png'.
    """
    df = pd.DataFrame(feature_importances, index=features)
    # DataFrame.sort() has been removed from pandas. Called without
    # `columns` it sorted the labels along `axis`, so sort_index() is the
    # equivalent call.
    # NOTE(review): with a single column this leaves the row order
    # untouched; sorting by importance (sort_values) may have been the
    # intent - confirm before changing the plot order.
    df.sort_index(axis=1, ascending=False, inplace=True)
    df.columns = ['feature_importances']
    pos = np.arange(0, len(features)) + 0.5
    plt.figure(figsize=(20, 12))
    plt.barh(pos, df.feature_importances, color='darkorange', align='center')
    plt.yticks(pos, df.index)
    plt.xlabel('Importance')
    plt.title('Feature Importances')
    plt.axis([0, 0.25, 0, 12])
    #plt.show()
    plt.savefig('RF_featureimportances_2Species_3.png')
Exemplo n.º 37
0
def drawChips(pl, df, df_close, title=""):
    """Draw a sideways histogram: the axis shows price levels and the
    bar lengths are the position ratios.

    pl: pyplot-like module/object used for all drawing calls.
    df: df_chips; its first column supplies the bar positions and its
        second column the bar lengths.
    df_close: its 'c' column is plotted in the left-hand panel.
    title: title of the left-hand panel.
    """
    # The original `pl.figure` lacked the call parentheses and so did
    # nothing; start a fresh figure so the plot does not land on whatever
    # figure happens to be current.
    pl.figure()
    pl.subplot(121)
    pl.title(title)
    df_close['c'].plot()
    pl.subplot(122)
    chips = df[df.columns[1]].values
    pl.barh(df[df.columns[0]].values, chips)
    pl.show()
    pl.close()
Exemplo n.º 38
0
    def stacking_evaluation(Train, Test, comparative, treshold, fileModel, label='FRAUDE', beta=2):
        """Fit ``fileModel`` on an oversampled ``Train`` and evaluate on ``Test``.

        Steps: oversample the training data with ADASYN, fit the model,
        write the predicted probabilities next to the true label to
        'final_files\\probabilidades_stacking.csv', threshold the
        probabilities at ``treshold``, print recall / precision / F-beta,
        plot two confusion matrices (against ``FRAUDE_Clusters`` from
        ``comparative`` and against ``label``) and finally plot the
        model's normalised feature importances.

        Train, Test: DataFrames that contain the ``label`` column; Test
            must also contain 'id_siniestro' for the merge.
        comparative: DataFrame merged onto Test by 'id_siniestro'; it
            must provide 'FRAUDE_Clusters'.
        treshold: probability above which a row is predicted positive.
        beta: beta of the F-beta score.

        The CSV column names assume a binary classifier (two probability
        columns). The input frames are not modified.
        """
        from utils.model_utils import over_sampling

        # Build the feature frame with drop() instead of `del`, which
        # removed the label column from the caller's Train as well.
        yTrain = Train[label]
        xTrain = Train.drop(label, axis=1)
        fileNames = np.array(xTrain.columns.values.tolist())

        xTrain, yTrain = over_sampling(xTrain, yTrain, model='ADASYN')

        fileModel.fit(xTrain.values, yTrain.values)
        y_hat_test = fileModel.predict_proba(Test.drop(label, axis=1).values)

        # Only the true label goes next to the probabilities: concatenating
        # the whole Test frame made the three-name column assignment below
        # fail with a length mismatch.
        df_proba = pd.DataFrame(y_hat_test, index=Test.index)
        df_proba = pd.concat([Test[label], df_proba], axis=1)
        df_proba.columns = ['VALOR REAL', 'VALOR_PREDICHO_NO_FRAUDE', 'VALOR_PREDICHO_FRAUDE']
        df_proba.to_csv('final_files\\probabilidades_stacking.csv', sep=';', index=False, encoding='latin1')

        # Drop the first probability column and flatten what is left into
        # a plain list of 0/1 predictions.
        y_hat_test = (y_hat_test[:, 1:] > treshold).astype(int).ravel().tolist()

        y_true = Test[label].values
        print('Final threshold: %.3f' % treshold)
        print('Test Recall Score: %.3f' % recall_score(y_pred=y_hat_test, y_true=y_true))
        print('Test Precision Score: %.3f' % precision_score(y_pred=y_hat_test, y_true=y_true))
        print('Test F2 Score: %.3f' % fbeta_score(y_pred=y_hat_test, y_true=y_true, beta=beta))

        # Remove the columns Test already has (except the join key) so the
        # merge does not create suffixed duplicates; done on a copy rather
        # than on the caller's frame.
        test_columns = Test.columns.values.tolist()
        duplicated = [col for col in comparative.columns.values.tolist()
                      if col != 'id_siniestro' and col in test_columns]
        comparative = comparative.drop(duplicated, axis=1)

        Test = pd.merge(Test, comparative, how='left', on='id_siniestro')
        cnf_matrix = confusion_matrix(Test['FRAUDE_Clusters'].values, y_hat_test)
        plot_confusion_matrix(cnf_matrix, classes=['No Fraude', 'Fraude'], title='Confusion matrix')

        # Use the `label` parameter rather than a hard-coded 'FRAUDE'.
        cnf_matrix = confusion_matrix(Test[label].values, y_hat_test)
        plot_confusion_matrix(cnf_matrix, classes=['Normal', 'Anormal'], title='Confusion matrix')

        featureImportance = fileModel.feature_importances_

        # Scale so that the most important feature has importance 1.
        featureImportance = featureImportance / featureImportance.max()

        sorted_idx = np.argsort(featureImportance)
        barPos = np.arange(sorted_idx.shape[0]) + 0.5
        plot.barh(barPos, featureImportance[sorted_idx], align='center')
        plot.yticks(barPos, fileNames[sorted_idx])
        plot.xlabel('Variable Importance')
        plot.show()
Exemplo n.º 39
0
def symhist(x1, x2, bins):
    ''' symmetric histogram of two data sets

        The density histogram of ``x1`` is drawn to the left of zero and
        that of ``x2`` to the right, sharing the bin edges ``bins``. The
        bar height is taken from the first bin, so the bins are assumed
        to be equally wide.

        >>> symhist(np.random.randn(100),np.random.randn(100)+1,np.linspace(-3,4,15))
        >>> plt.show()
    '''
    bw = bins[1] - bins[0]
    # `normed` has been removed from np.histogram; `density` is its
    # replacement and gives the same values for equal-width bins.
    dens1, _ = np.histogram(x1, bins=bins, density=True)
    dens2, _ = np.histogram(x2, bins=bins, density=True)
    # bins[:-1] are the lower bin edges, so anchor the bars at their edge;
    # the default alignment changed to 'center' in matplotlib 2.0.
    plt.barh(bins[:-1], -dens1, ec='w', fc='y', height=bw, lw=0.1,
             align='edge')
    plt.barh(bins[:-1], dens2, ec='w', fc='y', height=bw, lw=0.1,
             align='edge')
    # Make the x range symmetric around zero.
    xmax = max(plt.xlim())
    plt.xlim([-xmax, xmax])
    plt.ylim([bins[0], bins[-1]])
Exemplo n.º 40
0
def plot_rfid_events(events,
                     timerange=None,
                     ymin=-0.5,
                     ymax=0.5,
                     color='k',
                     label=False,
                     animals=None):
    """Draw a vertical line for every selected 'rfid' event.

    Events are selected with ``db.sel(events, event='rfid',
    timerange=timerange, data1=0)``. Each animal (the unique values of
    the DATA0 column unless ``animals`` is given) gets its own colour
    from the ``jet`` colormap and one line from ``ymin`` to ``ymax`` at
    the time of each of its events. Nothing is drawn when no event
    matches.

    ``color`` and ``label`` are accepted for backward compatibility only:
    the code that used them sat after an unconditional ``return`` and
    could never run, so it has been removed.
    """
    rfid = db.sel(events, event='rfid', timerange=timerange, data1=0)
    if len(rfid) == 0:
        return
    if animals is None:
        animals = numpy.unique(rfid[:, consts.DATA0_COLUMN])
    # Accept plain lists/tuples as well as arrays.
    animals = numpy.asarray(animals)
    na = animals.size
    # Spread the animals evenly over [0, 1]; the divisor is clamped so a
    # single animal does not produce 0/0 (a NaN colour).
    cs = numpy.arange(na) / max(na - 1., 1.)
    for (a, c) in zip(animals, cs):
        ae = db.sel(rfid, data0=a)
        if len(ae) == 0:
            continue
        pylab.vlines(ae[:, consts.TIME_COLUMN], ymin, ymax,
                     color=pylab.cm.jet(c))
    return
Exemplo n.º 41
0
def display_importance(df, label, features):
    '''
    Fit a default random forest that predicts ``df[label]`` from
    ``df[features]`` and draw its feature importances as horizontal
    bars, least important at the bottom.
    '''
    forest = RandomForestClassifier()
    forest.fit(df[features], df[label])

    scores = forest.feature_importances_
    order = np.argsort(scores)
    bar_pos = np.arange(len(features)) + 0.5
    # Convert to an array so the names can be reordered with `order`.
    feature_names = np.asarray(features)

    pl.barh(bar_pos, scores[order], align='center')
    pl.yticks(bar_pos, feature_names[order])
    pl.xlabel("Relative Importance")
    pl.title("Variable Importance")
Exemplo n.º 42
0
def plotCandidatoPor(atributo, candidatos):
    """Show a horizontal bar chart of the number of candidates per value
    of ``atributo``.

    atributo: name of the candidate attribute to group by; must be one of
        the keys of the extractor table below.
    candidatos: iterable of candidate objects.

    Raises ValueError for an unknown ``atributo``.
    """
    # attribute name -> function that reads that value from one candidate.
    # The names are compared by value; the original used `is`, which tests
    # object identity and is not reliable for strings.
    extratores = {
        'partido': lambda c: c.partido[1]['sigla'],
        'ocupacao': lambda c: c.ocupacao[0]['ocupacao'],
        'cargo': lambda c: c.cargo[0]['cargo'],
        'grauInstrucao': lambda c: c.grauInstrucao[0]['grauInstrucao'],
        'estado': lambda c: c.estado[1]['uf'],
        'coligacao': lambda c: c.coligacao[0]['coligacao'],
        'estadoCivil': lambda c: c.estadoCivil[0]['estadoCivil'],
        'nacionalidade': lambda c: c.nacionalidade[0]['nacionalidade'],
        'situacao': lambda c: c.situacao[0]['situacao'],
        'sexo': lambda c: c.sexo[0]['sexo'],
        'resultadoEleicao':
            lambda c: c.resultadoEleicao[0]['resultadoEleicao'],
        'estadoNascimento':
            lambda c: c.cidadeNascimento[1]['estado'][1]['uf'],
        'cidadeNascimento': lambda c: c.cidadeNascimento[0]['cidade'],
    }
    if atributo not in extratores:
        # Previously this fell through to an UnboundLocalError further down.
        raise ValueError('unknown atributo: %r' % (atributo,))
    extrair = extratores[atributo]
    conjunto = [extrair(c) for c in candidatos]

    # (value, number of candidates), ordered by ascending count.
    s = [(x, len(list(y))) for x, y in groupby(sorted(conjunto))]
    s = sorted(s, key=lambda x: x[1])
    siglas = [x[0] for x in s]
    qtd = [x[1] for x in s]
    posicoesY = pylab.arange(len(siglas)) + .5
    posicoesX = qtd
    pylab.title('quantidade de candidatos por ' + atributo)
    pylab.barh(posicoesY, posicoesX, align='center')
    pylab.grid(True)
    pylab.yticks(posicoesY, tuple(siglas))
    pylab.ylabel(atributo)
    pylab.xlabel('quantidade de candidatos')
    # Write each count just to the right of its bar.
    for posY, quantidade in zip(posicoesY, posicoesX):
        pylab.text(quantidade + 5, posY - .5, quantidade)
    pylab.show()
def plot_correlations(filename, names, x, as_text, colourscheme, mid0,
                      invertsign):
    """Draw one correlation value per entry of ``names`` at column ``x``.

    ``filename`` is a tab-separated file read as: field 0 = p-value,
    field 1 = q-value, field 2 = marker name (trailing '+' stripped),
    field 4 = correlation r. Only the first line seen for a marker is
    used.

    colourscheme: indexable with 0..5. Positive r uses entries 0-2 and
        non-positive r entries 3-5; within each group the entry is chosen
        by significance: q <= 0.05, then p <= 0.05 only, then neither.
    as_text: write r as text instead of drawing a bar.
    mid0: draw a signed half-length bar starting at ``x + 0.5``;
        ``invertsign`` flips its direction.
    Otherwise a bar of length |r| is drawn from ``x`` and, when
    p <= 0.05, annotated with the sign of r.

    The name 'CD3' falls back to 'CD3e' when the file has no 'CD3' marker.
    """
    # io.open gives universal-newline text mode on Python 2 and 3 alike;
    # the 'rU' mode used before was removed in Python 3.11.
    import io

    marker_to_colour = dict()
    marker_to_r = dict()
    marker_to_p = dict()
    with io.open(filename, 'r') as infh:
        for line in infh:
            p = line.rstrip('\r\n').split('\t')
            marker = p[2].rstrip('+')
            if marker in marker_to_r:
                continue  # might change to raisingException
            r = float(p[4])
            pval = float(p[0])
            qval = float(p[1])
            marker_to_r[marker] = r
            marker_to_p[marker] = pval
            # First colour of the group for this sign of r ...
            base = 0 if r > 0 else 3
            # ... shifted by how weak the evidence is.
            if pval > 0.05:
                marker_to_colour[marker] = colourscheme[base + 2]
            elif qval > 0.05:
                marker_to_colour[marker] = colourscheme[base + 1]
            else:
                marker_to_colour[marker] = colourscheme[base]

    for y, name in enumerate(names):
        if name == 'CD3' and name not in marker_to_colour:
            name = 'CD3e'
        r = marker_to_r[name]
        colour = marker_to_colour[name]
        if as_text:
            pylab.text(x, y, '%.2f' % r, color=colour)
        # Use the `mid0` argument; the original read a global `o.mid0`
        # and ignored the parameter.
        elif mid0:
            pylab.barh(y,
                       r / (-2.0 if invertsign else 2.0),
                       color=colour,
                       linewidth=0,
                       left=x + 0.5)
        else:
            pylab.barh(y, abs(r), color=colour, linewidth=0, left=x)
            if marker_to_p[name] <= 0.05:
                if r >= 0:
                    pylab.text(x + abs(r), y + 0.20, '+')
                else:
                    pylab.text(x + abs(r) + 0.01, y + 0.20, '-')
Exemplo n.º 44
0
def plot_occupancy2(occupancy,
                    offset=0.0,
                    cm=None,
                    n_cages=None,
                    n_animals=None):
    """Draw occupancy bars grouped by cage and coloured by animal.

    ``occupancy`` is indexed as a 2-D array with columns
    ``[enter, exit, cage, animal]``. Each cage takes a band of height 1
    on the y axis (starting at ``cage + offset``); inside a band every
    animal has its own thin lane. A black horizontal line separates the
    cages. ``cm`` defaults to viridis when pylab provides it, otherwise
    winter. ``n_cages`` and ``n_animals`` override the counts derived
    from the data.
    """
    if cm is None:
        cm = (pylab.cm.viridis
              if hasattr(pylab.cm, 'viridis') else pylab.cm.winter)
    # [enter, exit, cage, animal]

    # one colour per animal, evenly spaced over the colormap
    aids = numpy.unique(occupancy[:, 3])
    aids.sort()
    n_aids = len(aids) if n_animals is None else n_animals
    shades = numpy.linspace(0., 1., n_aids)
    colors = dict((aid, cm(shade)) for (aid, shade) in zip(aids, shades))

    # number of cages
    if n_cages is None:
        n_cages = len(numpy.unique(occupancy[:, 2]))

    lane_height = 1. / n_aids
    for (lane, aid) in enumerate(aids):
        # rows that belong to this animal
        rows = occupancy[occupancy[:, 3] == aid]
        enter = rows[:, 0]
        stay = rows[:, 1] - enter

        # barh(bottom, width, height, left, **kwargs)
        pylab.barh(rows[:, 2] + lane * lane_height + offset,
                   stay,
                   lane_height,
                   enter,
                   color=colors[aid])

    # cage dividers
    for boundary in range(n_cages + 1):
        pylab.axhline(boundary + offset, color='k')

    # Widen (never narrow) the y range to cover all cages.
    current = pylab.ylim()
    wanted = (min(current[0], offset), max(current[1], n_cages + offset))
    if current != wanted:
        pylab.ylim(wanted[0], wanted[1])
Exemplo n.º 45
0
def rfparameters(df,label,clf):
    """Fit ``clf`` on ``df`` and record its ten most important features.

    Every column other than ``label`` that ``DataFrame.describe()``
    reports on is used as a feature. The (up to) ten largest importances
    are drawn as horizontal bars and saved to 'importanceRF.png'; the
    matching feature names, most important first, are written to
    'importanceRF.txt'.

    Returns a tuple with the results of ``pl.savefig`` and
    ``DataFrame.to_csv``.
    """
    # DataFrame.ix has been removed from pandas; .loc accepts the same
    # boolean column mask.
    features = np.array(df.loc[:, df.columns != label].describe().keys())
    print('Running RF')
    clf.fit(df[features], df[label])
    print('Plotting and Recording')
    importances = clf.feature_importances_
    # argsort is ascending, so the most important features are the LAST
    # entries; the original [:10] picked the ten least important ones.
    sorted_idx = np.argsort(importances)[-10:]
    # Sized from the selection so fewer than ten features also works.
    padding = np.arange(len(sorted_idx)) + 0.5
    pl.barh(padding, importances[sorted_idx], align='center')
    pl.yticks(padding, features[sorted_idx])
    pl.xlabel("Relative Importance")
    pl.title("Variable Importance")
    best_features = features[sorted_idx][::-1]
    ddf = pd.DataFrame(data={'Top Features by RF': best_features})
    return pl.savefig('importanceRF.png'), ddf.to_csv('importanceRF.txt',sep='\t')
Exemplo n.º 46
0
    def __init__(self, tree):
        """Show the cost of every item yielded by ``tree.walk()``.

        The items are ordered by ascending ``cost`` and drawn as
        horizontal bars labelled with their ``name``. The left half of
        the figure is reserved for the labels.
        """
        import pylab
        import numpy as np

        by_cost = sorted(tree.walk(), key=lambda node: node.cost)

        costs = []
        names = []
        for node in by_cost:
            costs.append(node.cost)
            names.append(node.name)

        bar_pos = np.arange(0, len(costs)) + 0.5
        pylab.barh(bar_pos, costs, align="center")
        pylab.yticks(bar_pos, names)
        pylab.subplots_adjust(left=0.5)
        pylab.show()
Exemplo n.º 47
0
    def __init__(self, tree):
        """Plot a bar chart of item costs for the items of ``tree``.

        ``tree.walk()`` supplies the items; they are sorted by their
        ``cost`` attribute (cheapest at the bottom) and labelled with
        their ``name`` attribute. The plot is shown immediately.
        """
        import pylab
        import numpy as np

        ranked = sorted(tree.walk(), key=lambda entry: entry.cost)

        names = []
        costs = []
        for entry in ranked:
            costs.append(entry.cost)
            names.append(entry.name)

        centres = np.arange(0, len(costs)) + 0.5
        pylab.barh(centres, costs, align="center")
        pylab.yticks(centres, names)
        # Leave half of the figure width for the item names.
        pylab.subplots_adjust(left=0.5)
        pylab.show()
Exemplo n.º 48
0
def plot_most_significant(labels):
    """Plot the ten terms of ``nndocs`` with the highest chi-squared score.

    The documents in the module-level ``nndocs`` are vectorised with a
    default ``CountVectorizer`` and each term is scored against
    ``labels`` with ``chi2``. The ten best terms are shown as horizontal
    bars with a line overlay, highest score at the top.
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(nndocs)
    chi2score = chi2(X, labels)[0]
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    figure(figsize=(6, 6))
    wscores = list(zip(vocabulary, chi2score))
    wchi2 = sorted(wscores, key=lambda x: x[1])
    # Transpose the ten best (term, score) pairs into (terms, scores).
    topchi2 = list(zip(*wchi2[-10:]))
    x = list(range(len(topchi2[1])))
    label = topchi2[0]
    barh(x, topchi2[1], align='center', alpha=.2, color='g')
    plot(topchi2[1], x, '-o', markersize=2, alpha=.8, color='g')
    yticks(x, label)
    # Raw string: '\c' is not a valid escape sequence in a normal literal.
    xlabel(r'$\chi^2$')
    show()
def find_features(df, features):
    '''
    Fit a default random forest that predicts ``df[DEP_VAR]`` from
    ``df[features]`` and draw the resulting feature importances as a
    horizontal bar plot, ordered from least (bottom) to most (top)
    important. ``features`` is indexed with an integer array, so it must
    support NumPy-style fancy indexing.
    '''
    forest = RandomForestClassifier()
    forest.fit(df[features], df[DEP_VAR])

    scores = forest.feature_importances_
    order = np.argsort(scores)
    bar_pos = np.arange(len(features)) + 0.5

    pl.barh(bar_pos, scores[order], align='center')
    pl.yticks(bar_pos, features[order])
    pl.xlabel("Relative Importance")
    pl.title("Variable Importance")
Exemplo n.º 50
0
    def plot_bar_chart(self, y_labels, x):
        """Draw ``x`` as labelled horizontal bars, save the figure, show it.

        Both sequences are reversed before plotting, so the first entry
        ends up at the top of the chart. Each bar is annotated with its
        value rounded to three decimals. The x-axis label is
        ``self.label``, the title is ``self.file_name`` and the image is
        written to './figures/<file_name>_bar_chart<ext>'.
        """
        values = x[::-1]
        labels = y_labels[::-1]
        y_pos = np.arange(len(y_labels))
        target = './figures/' + self.file_name + '_bar_chart' + self.ext

        pylab.figure(figsize=(15, 5))
        pylab.barh(y_pos, values, align='center')
        pylab.yticks(y_pos, labels)
        # Print each value just past the end of its bar.
        for row, value in enumerate(values):
            pylab.text(value + 0.005, row, str(round(value, 3)), color='black', fontweight='bold')

        pylab.xlabel(self.label)
        pylab.title(self.file_name)
        pylab.savefig(target)
        pylab.show()
Exemplo n.º 51
0
def plot_histogram(freq, mean):
    """Show a horizontal bar chart of the most frequent words.

    freq: mapping of word -> count.
    mean: reference count; only words whose count is greater than
        ``round(8 * mean)`` are plotted.

    The words are plotted in alphabetical order.
    """
    # using dict comprehensions to remove not frequent words
    cutoff = round(8 * mean)
    topwords = {word: count
                for word, count in freq.items()
                if count > cutoff}
    sorted_alpha = collections.OrderedDict(sorted(topwords.items()))
    # plotting
    # Labels and values both come from the sorted mapping; the original
    # took the labels from the unsorted dict, so bars could be mislabelled.
    labels = list(sorted_alpha.keys())
    y = list(sorted_alpha.values())
    x = list(range(len(y)))

    barh(x, y, align='center')
    yticks(x, labels)
    show()
Exemplo n.º 52
0
 def add_bar(self,fname, cname):
     """Add one stacked horizontal bar of chi2 contributions.

     Reads ``<cdir><fname>.paramnames`` and the three files
     ``<cdir><fname>_1.maxlike`` .. ``_3.maxlike``. Of the three, the one
     whose first line has the smallest second whitespace-separated field
     is used (presumably the best -log(likelihood) - TODO confirm). For
     every parameter whose name contains "_like", chi2 is computed as
     ``-2 * value``, with an extra ``- 30`` for names containing
     'Betoule'. The chi2 values of the names in ``nlist`` are then
     stacked left to right at height ``self.cy``.

     Updates ``self.patches``, ``self.ys``, ``self.cy`` and ``self.names``.
     """
     fname=cdir+fname
     # Context managers close the files; the original never closed them.
     with open(fname+'.paramnames') as paramfile:
         pnames=paramfile.readlines()
     logls2=None
     for i in range(3):
         with open(fname+'_'+str(i+1)+'.maxlike') as likefile:
             loglsx=likefile.readlines()
         # Keep the chain with the smallest value seen so far.
         if logls2 is None or float(loglsx[0].split()[1])<float(logls2[0].split()[1]):
             logls2=loglsx
     logls=logls2[0].split(' ')[2:]
     chi2d={}
     for pname, logl in zip(pnames,logls):
         if "_like" in pname:
             ppname=pname.split(' ')
             if 'Betoule' in ppname[0]:
                 chi2=-2*float(logl)-30
                 # Single-argument print() gives the same output on
                 # Python 2 and 3; the print statement is a syntax error
                 # on Python 3.
                 print("bchi2= %s" % chi2)
             else:
                 chi2=-2*float(logl)
             xname=ppname[0].replace('_like','')
             chi2d[xname]=chi2
     left=0
     for xname in nlist:
         chi2=chi2d[xname] #/defdof[xname]
         color=colors[xname]
         PP=pylab.barh(self.cy-0.25,chi2,left=left,height=0.5,color=color, linewidth=0)
         # Keep one patch per name (e.g. for building a legend later).
         self.patches[xname]=PP[0]
         left+=chi2
     self.ys.append(self.cy)
     self.cy+=1
     self.names.append(cname)
Exemplo n.º 53
0
def _create_histogram(M_c, data, columns, mc_col_indices, filename):
    """Draw one histogram panel per predicted column and save the figure.

    Columns whose ``modeltype`` in ``M_c['column_metadata']`` is
    ``'normal_inverse_gamma'`` get a horizontal ``hist``; every other
    column is converted code -> value with ``du.convert_code_to_value`` and
    drawn as a horizontal bar chart of value counts.

    :param M_c: metadata dict; ``M_c['column_metadata'][i]['modeltype']``
        is read for each plotted column.
    :param data: 2-D array indexed as ``data[:, col_i]``.
    :param columns: panel titles, one per column of ``data``.
    :param mc_col_indices: for each column of ``data``, the index of the
        original column in ``M_c``.
    :param filename: output file name, joined onto
        ``S.path.web_resources_data_dir``.
    """
    out_dir = S.path.web_resources_data_dir
    full_filename = os.path.join(out_dir, filename)
    num_cols = data.shape[1]

    p.figure()
    # col_i goes from 0 to number of predicted columns
    # mc_col_idx is the original column's index in M_c
    for col_i in range(num_cols):
        mc_col_idx = mc_col_indices[col_i]
        data_i = data[:, col_i]
        # subplot() indices are 1-based, hence col_i + 1.
        ax = p.subplot(1, num_cols, col_i + 1, title=columns[col_i])
        if M_c['column_metadata'][mc_col_idx]['modeltype'] == 'normal_inverse_gamma':
            p.hist(data_i, orientation='horizontal')
        else:
            str_data = [du.convert_code_to_value(M_c, mc_col_idx, code)
                        for code in data_i]
            unique_labels = list(set(str_data))
            np_str_data = np.array(str_data)
            counts = [int(np.sum(np_str_data == label))
                      for label in unique_labels]
            # One bar per label actually present in the data, so positions,
            # counts and tick labels always have the same length (the full
            # code_to_value table may contain values that never occur).
            positions = np.arange(len(unique_labels))
            rects = p.barh(positions, counts)
            heights = np.array([rect.get_height() for rect in rects])
            ax.set_yticks(positions + heights / 2)
            ax.set_yticklabels(unique_labels)

    p.tight_layout()
    p.savefig(full_filename)
Exemplo n.º 54
0
	def updateChart(self, ranks):
		"""Render the ranking as a horizontal bar chart in ``pics/chart.png``.

		``ranks`` is walked in reverse order; each entry supplies a ``name``
		(y-axis label) and ``points`` (bar length).
		"""
		ordered = list(reversed(ranks))
		teams = [entry.name for entry in ordered]      # y axis
		points = [entry.points for entry in ordered]   # x axis

		# Bar centres on the y axis.
		centers = arange(len(teams)) + .5

		figure(1)
		barh(centers, points, align='center')
		yticks(centers, teams)
		xlabel('Points')
		ylabel('Team')
		title('Ranking')
		grid(True)

		# Written to the pics folder so it can be displayed later.
		savefig("pics/chart.png", dpi=60)
		# Clear the figure so the next chart starts from a blank canvas.
		clf()
Exemplo n.º 55
0
	def updateChart(self, ranks):
		"""Render the overview ranking chart into ``pics/overviewChart.png``.

		``ranks`` is walked in reverse order; each entry is indexed with
		``"team"`` (y-axis label) and ``"pt"`` (bar length).
		"""
		rows = list(reversed(ranks))
		teams = [row["team"] for row in rows]
		points = [row["pt"] for row in rows]

		# Bar centres on the y axis.
		bar_centers = arange(len(teams)) + .5

		figure(1)
		barh(bar_centers, points, align='center')
		yticks(bar_centers, teams)
		xlabel('Points')
		ylabel('Team')
		title('Ranking')
		grid(True)

		# Saved under a different file name than the regular chart.
		savefig("pics/overviewChart.png", dpi=60)
		clf()
Exemplo n.º 56
0
    def add_bar(self, fname, cname, model):
        """Add one stacked chi2 bar plus a ``chi2 / dof`` text label.

        Reads ``<cdir><fname>.paramnames`` and a ``.maxlike`` file. When
        ``'Neff'`` occurs in the path the single ``<fname>.maxlike`` file is
        used; otherwise the three files ``_1`` .. ``_3`` are read and the
        one whose first line has the smallest second whitespace-separated
        field is kept. For every parameter whose name contains ``"_like"``
        a chi2 contribution is derived, printed, and accumulated.

        One bar segment per entry of the module-level ``nlist`` is drawn at
        row ``self.cy`` and its first patch stored in ``self.patches``. A
        single text label is then drawn at the module-level ``position``.
        Finally ``self.cy`` is appended to ``self.ys`` and incremented, and
        ``cname`` is appended to ``self.names``.

        :param fname: file stem, relative to the module-level ``cdir``.
        :param cname: display name recorded for this bar.
        :param model: key into the module-level ``mdof`` mapping; its value
            is subtracted from the summed ``defdof`` entries in the label.
        :raises KeyError: if ``model`` is missing from ``mdof`` or an entry
            of ``nlist`` is missing from ``defdof``/the parsed parameters.
        """
        fname = cdir+fname
        with open(fname+'.paramnames') as handle:
            pnames = handle.readlines()

        if 'Neff' in fname:
            with open(fname+'.maxlike') as handle:
                best = handle.readlines()
        else:
            # Keep the run with the smallest value in column 1 of line 0.
            best = None
            for i in range(3):
                with open(fname+'_'+str(i+1)+'.maxlike') as handle:
                    candidate = handle.readlines()
                if best is None or (float(candidate[0].split()[1])
                                    < float(best[0].split()[1])):
                    best = candidate

        # NOTE(review): single-space split kept from the original; the
        # pairing with ``pnames`` below depends on it.
        logls = best[0].split(' ')[2:]

        chiT = 0
        chi2d = {}
        print(' ')
        print('++++' + model)
        for pname, logl in zip(pnames, logls):
            if "_like" not in pname:
                continue
            ppname = pname.split(' ')
            if 'SPlanck' in ppname[0]:
                chi2 = 0
            elif 'Neff' in fname:
                # 'Neff' files: the value is used as-is (no -2 factor).
                chi2 = float(logl)
                if 'Betoule' in ppname[0]:
                    chi2 = chi2 - 692
            else:
                chi2 = -2*float(logl)
                if 'Betoule' in ppname[0]:
                    chi2 = chi2 - 30
            chiT += chi2
            xname = ppname[0].replace('_like', '')
            print(xname, chi2)
            chi2d[xname] = chi2
        print('Min_chi2 = ', chiT+30)

        param = mdof[model]
        dof = sum(defdof[xname] for xname in nlist)

        # Stack the segments left to right in the order given by nlist.
        left = 0
        for xname in nlist:
            chi2 = chi2d[xname]
            color = colors[xname]
            PP = pylab.barh(self.cy-0.25, chi2, left=left,
                            height=0.5, color=color, linewidth=0)
            self.patches[xname] = PP[0]
            left += chi2

        # The label does not depend on the segment being drawn, so it is
        # rendered once per bar rather than once per segment.
        if "SN" in dataset:
            label = r' %.2f / %s' % (chiT + 30, dof-param + 30)
        else:
            label = r' %.2f / %s' % (chiT, dof-param)
        pylab.text(position, self.cy-0.25, label, fontsize=15)

        self.ys.append(self.cy)
        self.cy += 1
        self.names.append(cname)
Exemplo n.º 57
0
    def plot_multiedge_graph(self, cmap="jet"):
        """Creates a multiedge graph and plots it

        The graph returned by :meth:`get_multiedge_graph` is plotted with
        its edges coloured by the ``edgecolor`` attribute. A legend with one
        entry per item of ``self.labels`` is added, using colours taken at
        evenly spaced values between 0.1 and 0.9 of the colour map. Each
        label and its colour are also printed.

        :param cmap: a valid color map from matplotlib. jet, spring, hot, ...
        :return: CNOGraphMultiEdges object


        .. plot::
            :include-source:
            :width: 50%

            # Get list of names
            from msdas import *
            from easydev import gsf

            m = MassSpecReader()
            m.read_annotations(gsf("msdas", "data", "YEAST_annotations_small.pkl"))
            n = network.NetworkFromUniProt(a.annotations)

            names = list(set(m.df.Protein))


            n = network.CombineNetworks(
                {"Curated": gsf("msdas", "data", "PKN-yeastScaffold.sif"),
                 "UniProt": "PKN-uniprot.sif",
                 "PhosPho": "PKN-phospho.sif"},
                 signals=names[:], stimuli=["a", "NaCl"])

            c = n.plot_multiedge_graph()
            c.plot()


        """
        N = len(self.labels)
        values = pylab.linspace(.1,.9, N)

        # build network
        c = self.get_multiedge_graph()
        c.plot(edge_attribute="edgecolor", edge_attribute_labels=False, cmap=cmap)

        # build legend: zero-width bars are invisible but register a
        # coloured handle per label for pylab.legend().
        for value, label in zip(values, self.labels):
            color = c._get_hex_color_from_value(value, cmap)
            # Single-argument print() gives the same output on Python 2 and 3.
            print("%s %s" % (label, color))
            pylab.barh(0,0,1,color=color, label=label)
        pylab.legend(title="edge legend", fontsize="small", loc="lower right")

        return c
def plot_results(regr, params, X_test, y_test, feature_names):
    """
    Plot the results from boosting iterations
    and feature evaluations, using PyLab.

    Only the feature-importance panel (right half of the figure) is
    currently drawn; the deviance panel is disabled, so ``params``,
    ``X_test`` and ``y_test`` are accepted but not used. The figure is
    saved to ``./working/foo.png``.

    :param regr: fitted estimator exposing ``feature_importances_``.
    :param params: unused while the deviance plot is disabled.
    :param X_test: unused while the deviance plot is disabled.
    :param y_test: unused while the deviance plot is disabled.
    :param feature_names: one label per feature; any sequence (list, tuple
        or array) is accepted.
    """
    ###############################################################################
    # Plot training deviance
    # Compute test set deviance (disabled):
    #
    # test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
    #
    # for i, y_pred in enumerate(regr.staged_decision_function(X_test)):
    #     test_score[i] = regr.loss_(y_test, y_pred)
    #
    # best = np.argmin(test_score)
    # print "optimal", best, test_score[best]

    pl.figure(figsize=(12, 10))
    # The left panel is still created even though nothing is drawn in it.
    pl.subplot(1, 2, 1)
    # Deviance curves (disabled):
    #
    # pl.title('Deviance')
    # pl.plot(np.arange(params['n_estimators']) + 1, regr.train_score_, 'b-',
    #     label='Training Set Deviance')
    # pl.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
    #     label='Test Set Deviance')
    #
    # pl.legend(loc='upper right')
    # pl.xlabel('Boosting Iterations')
    # pl.ylabel('Deviance')

    ###############################################################################
    # Plot feature importance
    feature_importance = regr.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    # np.asarray allows index-array lookup even when a plain list is passed.
    pl.yticks(pos, np.asarray(feature_names)[sorted_idx])
    pl.xlabel('Relative Importance')
    pl.title('Feature Importance')

    pl.savefig('./working/foo.png', bbox_inches='tight')
def do_fit(train_path, model_path, test_path):
    """Fit a ``GradientBoostingRanker``, plot diagnostics and store the model.

    The training and test datasets are loaded with ``load_dataset``, the
    ranker is fitted on both, and a two-panel figure is shown: the
    ``train_score_`` / ``oob_score_`` curves per boosting iteration on the
    left, and the relative feature importances (sorted ascending, labelled
    with the feature index) on the right. The fitted ranker is written to
    ``model_path`` with ``joblib.dump`` before the figure is displayed.

    :param train_path: path passed to ``load_dataset`` for training data.
    :param model_path: destination for the serialized ranker.
    :param test_path: path passed to ``load_dataset`` for test data.
    """
    params = {'n_estimators': 1500, 'max_depth': 3, 'min_samples_split': 4,
              'min_samples_leaf': 1, 'random_state': None,
              'do_consider_correct': 1, 'learn_rate': 0.1, 'n1': 10000,
              'n2': 1, 'n3': 100000, 'tau': 0.01}

    ranker = GradientBoostingRanker(**params)

    print('loading data...')
    X, dr, sr, groups = load_dataset(train_path)
    test_X, test_dr, test_sr, test_groups = load_dataset(test_path)

    print('starting fit...')
    ranker.fit(X, dr, sr, groups, test_X, test_dr, test_sr, test_groups)

    iterations = np.arange(params['n_estimators']) + 1
    pl.figure(figsize=(12, 6))
    pl.subplot(1, 2, 1)
    pl.title('Deviance')
    pl.plot(iterations, ranker.train_score_, 'b-',
            label='Training Set Deviance')
    # NOTE(review): the curve labelled "Test Set" is ranker.oob_score_.
    pl.plot(iterations, ranker.oob_score_, 'r-',
            label='Test Set Deviance')
    pl.legend(loc='upper right')
    pl.xlabel('Boosting Iterations')
    pl.ylabel('Deviance')

    # Plot feature importance
    feature_importance = ranker.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    # Bars are drawn in sorted order, so each tick must carry the index of
    # the feature shown at that position, not the position itself.
    pl.yticks(pos, sorted_idx)
    pl.xlabel('Relative Importance')
    pl.title('Variable Importance')

    print(feature_importance)

    print('storing to %s' % model_path)
    joblib.dump(ranker, model_path, 3)
    pl.show()
Exemplo n.º 60
0
def plotCandidatoPor(atributo, candidatos):
    """Plot how many candidates share each value of the given attribute.

    The selected attribute is read from every candidate, the values are
    counted, and a horizontal bar chart is shown with one bar per distinct
    value, ordered by ascending count (ties in ascending value order). Each
    bar is annotated with its count.

    :param atributo: attribute name; one of ``'partido'``, ``'ocupacao'``,
        ``'cargo'``, ``'grauInstrucao'``, ``'estado'``, ``'coligacao'``,
        ``'estadoCivil'``, ``'nacionalidade'``, ``'situacao'``, ``'sexo'``,
        ``'resultadoEleicao'``, ``'estadoNascimento'`` or
        ``'cidadeNascimento'``.
    :param candidatos: iterable of candidate objects exposing the
        attributes read by the extractors below.
    :raises ValueError: if ``atributo`` is not one of the supported names.
    """
    # Attribute name -> function extracting the grouping value from one
    # candidate. Names are matched by equality, not identity.
    extractors = {
        'partido': lambda c: c.partido[1]['sigla'],
        'ocupacao': lambda c: c.ocupacao[0]['ocupacao'],
        'cargo': lambda c: c.cargo[0]['cargo'],
        'grauInstrucao': lambda c: c.grauInstrucao[0]['grauInstrucao'],
        'estado': lambda c: c.estado[1]['uf'],
        'coligacao': lambda c: c.coligacao[0]['coligacao'],
        'estadoCivil': lambda c: c.estadoCivil[0]['estadoCivil'],
        'nacionalidade': lambda c: c.nacionalidade[0]['nacionalidade'],
        'situacao': lambda c: c.situacao[0]['situacao'],
        'sexo': lambda c: c.sexo[0]['sexo'],
        'resultadoEleicao': lambda c: c.resultadoEleicao[0]['resultadoEleicao'],
        'estadoNascimento': lambda c: c.cidadeNascimento[1]['estado'][1]['uf'],
        'cidadeNascimento': lambda c: c.cidadeNascimento[0]['cidade'],
    }
    if atributo not in extractors:
        raise ValueError('unsupported atributo: %r' % (atributo,))
    extract = extractors[atributo]
    conjunto = [extract(c) for c in candidatos]

    # groupby needs sorted input; the stable sort by count afterwards keeps
    # equal counts in ascending value order.
    counted = [(value, len(list(group)))
               for value, group in groupby(sorted(conjunto))]
    counted.sort(key=lambda pair: pair[1])
    siglas = [value for value, _ in counted]
    qtd = [count for _, count in counted]

    posicoesY = pylab.arange(len(siglas)) + .5
    pylab.title('quantidade de candidatos por ' + atributo)
    pylab.barh(posicoesY, qtd, align='center')
    pylab.grid(True)
    pylab.yticks(posicoesY, tuple(siglas))
    pylab.ylabel(atributo)
    pylab.xlabel('quantidade de candidatos')
    # Annotate every bar with its count, slightly to the right of its end.
    for pos_y, count in zip(posicoesY, qtd):
        pylab.text(count + 5, pos_y - .5, str(count))
    pylab.show()