Exemplo n.º 1
0
def print_pred_distrib_figure(filename, bins, histo, dx, J_opt):
    """Save a two-series bar chart of predicted scores as a PNG file.

    Args:
        filename: Output path; whatever extension it carries is replaced
            by ``.png``.
        bins: Bin edges; every edge except the last is used as the left
            edge of a bar.
        histo: Indexable with two entries, ``histo[0]`` drawn in blue and
            labelled 'neutral', ``histo[1]`` drawn in red and labelled
            'deleterious'.
        dx: Width of each bar.
        J_opt: x position of the dashed black vertical line.

    Returns:
        None. Nothing is drawn when ``_try_import_matplotlib()`` returns
        ``None``.
    """
    assert isinstance(filename, str), 'filename must be a string'
    stem, _ = os.path.splitext(filename)
    filename = stem + '.png'

    if _try_import_matplotlib() is None:
        return
    from matplotlib import pyplot as plt

    figure = plt.figure(figsize=(7, 7))
    left_edges = bins[:-1]
    # (row of histo, bar colour, legend label) for each of the two series.
    for row, colour, name in ((0, 'blue', 'neutral'),
                              (1, 'red', 'deleterious')):
        plt.bar(left_edges,
                histo[row],
                width=dx,
                align='edge',
                color=colour,
                alpha=0.7,
                label=name)
    plt.axvline(x=J_opt, color='k', ls='--', lw=1)
    plt.ylabel('distribution')
    plt.xlabel('predicted score')
    plt.legend()
    figure.savefig(filename, format='png', bbox_inches='tight')
    plt.close()
    # Put the global rc parameters back to matplotlib's defaults.
    plt.rcParams.update(plt.rcParamsDefault)
    LOGGER.info(f'Predictions distribution saved to {filename}')
Exemplo n.º 2
0
def scatter_plot(P, L, pcIdx1, pcIdx2, letterList, rev):
    """Scatter two columns of ``P`` against each other, one colour per label.

    Column ``pcIdx2`` goes on the x axis and column ``pcIdx1`` on the y
    axis (the y axis is inverted). The figure is written to the
    module-level ``pDir`` directory and then shown.

    Args:
        P: 2-D array indexed as ``P[row_mask, column]``.
        L: Label array compared element-wise with each entry of
            ``letterList`` to build the row mask.
        pcIdx1: Column of ``P`` plotted on the y axis.
        pcIdx2: Column of ``P`` plotted on the x axis.
        letterList: Labels to plot; also joined into the file name.
        rev: Value appended to the output file name.
    """
    fig = plt.figure()
    # following the convention in lecture note ScatterPlot.html
    colors = ["r", "lime", "b", "y", "c", "m", "k", "tan", "pink", "darkred"]
    for i, letter in enumerate(letterList):
        rows = L == letter
        plt.scatter(P[rows, pcIdx2],
                    P[rows, pcIdx1],
                    s=0.1,
                    # Wrap around so more than len(colors) labels still plot.
                    c=colors[i % len(colors)],
                    label=letter)
    # Use the axes that already holds the scatter; a bare plt.axes() call
    # would add a fresh, empty Axes on top of it.
    ax = plt.gca()
    ax.set_aspect('equal')
    plt.xlabel("Principle Component {}".format(pcIdx2))
    plt.ylabel("Principle Component {}".format(pcIdx1))
    plt.axhline(0, color='grey')
    plt.axvline(0, color='grey')
    plt.ylim([-5000, 5000])
    plt.xlim([-5000, 5000])
    plt.legend()
    ax.invert_yaxis()
    fig.set_size_inches(8, 8)
    fName = os.path.join(
        pDir, 'scatter_PC{}_PC{}_{}_{}.png'.format(pcIdx1, pcIdx2,
                                                   "".join(letterList), rev))
    fig.savefig(fName, bbox_inches='tight')
    plt.show()
Exemplo n.º 3
0
def plotFeatImportance(pathOut,
                       imp,
                       oob,
                       oos,
                       method,
                       tag=0,
                       simNum=0,
                       **kargs):
    """Plot mean feature importance as horizontal bars with std error bars.

    The figure is saved as ``pathOut + 'featImportance_<simNum>.png'`` and
    then closed.

    Args:
        pathOut: Prefix the output file name is appended to (plain string
            concatenation, so include a trailing separator if needed).
        imp: DataFrame with 'mean' and 'std' columns, one row per feature;
            the index supplies the text written inside each bar.
        oob: Number shown in the title, rounded to 4 decimals.
        oos: Number shown in the title, rounded to 4 decimals.
        method: When equal to 'MDI', the x axis starts at 0 and a dotted
            line is drawn at ``1 / number_of_features``.
        tag: Value shown in the title; any type accepted by ``str()``.
        simNum: Value shown in the title and used in the file name.
        **kargs: Accepted and ignored.
    """
    # plot mean imp bars with std
    mpl.figure(figsize=(10, imp.shape[0] / 5.))
    imp = imp.sort_values('mean', ascending=True)
    ax = imp['mean'].plot(kind='barh',
                          color='b',
                          alpha=0.25,
                          xerr=imp['std'],
                          error_kw={'ecolor': 'r'})
    if method == 'MDI':
        mpl.xlim([0, imp.sum(axis=1).max()])
        mpl.axvline(1. / imp.shape[0], lw=1., color='r', ls='dotted')
    # Feature names are written inside the bars instead of on the y axis.
    ax.get_yaxis().set_visible(False)
    for patch, feature in zip(ax.patches, imp.index):
        ax.text(patch.get_width() / 2,
                patch.get_y() + patch.get_height() / 2,
                feature,
                ha='center',
                va='center',
                color='k')
    # str(tag): the default tag is the integer 0, which cannot be
    # concatenated to a string directly.
    mpl.title('tag=' + str(tag) + ' | simNUm=' + str(simNum) + ' | oob=' +
              str(round(oob, 4)) + ' | oos=' + str(round(oos, 4)))
    mpl.savefig(pathOut + 'featImportance_' + str(simNum) + '.png', dpi=100)
    mpl.clf()
    mpl.close()
    return
    def plotLearning(x, scores, epsilons, filename, lines=None):
        """Plot epsilon and a running average of scores on overlaid axes.

        Epsilon is drawn as a line against the left y axis; the trailing
        average of ``scores`` (current entry plus up to 20 previous ones)
        is drawn as a scatter against the right y axis. The figure is
        saved to ``filename``.

        Args:
            x: x values shared by both series.
            scores: Sequence of scores to average.
            epsilons: Epsilon values, one per entry of ``x``.
            filename: Path handed to ``plt.savefig``.
            lines: Optional iterable of x positions for vertical lines.
        """
        fig = plt.figure()
        eps_ax = fig.add_subplot(111, label="1")
        score_ax = fig.add_subplot(111, label="2", frame_on=False)

        eps_ax.plot(x, epsilons, color="C0")
        eps_ax.set_xlabel("Game", color="C0")
        eps_ax.set_ylabel("Epsilon", color="C0")
        for axis_name in ('x', 'y'):
            eps_ax.tick_params(axis=axis_name, colors="C0")

        # Trailing mean over at most 21 entries ending at position t.
        running_avg = np.array(
            [np.mean(scores[max(0, t - 20):(t + 1)])
             for t in range(len(scores))],
            dtype=float)

        score_ax.scatter(x, running_avg, color="C1")
        score_ax.get_xaxis().set_visible(False)
        score_ax.yaxis.tick_right()
        score_ax.set_ylabel('Score', color="C1")
        score_ax.yaxis.set_label_position('right')
        score_ax.tick_params(axis='y', colors="C1")

        if lines is not None:
            for position in lines:
                plt.axvline(x=position)

        plt.savefig(filename)
Exemplo n.º 5
0
def silhouette():
    """Compute and plot silhouette scores for the 3rd-pass clustering.

    Reads ``Stardust_results/visualization_output/3_pass/data.csv`` (columns
    'data', 'x', 'y' and 'cluster' are used), prints the average silhouette
    score and writes ``silhouette.pdf`` and ``silhouette.png`` into
    ``Stardust_results/analysis/``, creating that directory if needed.

    Exits the process via ``sys.exit()`` when ``Stardust_results`` does not
    exist.
    """
    if not os.path.exists("Stardust_results"):
        print(
            "The directory structure Stardust_results doest not exist. Please run run_stardust first"
        )
        sys.exit()
    if not os.path.exists("Stardust_results/analysis"):
        os.mkdir("Stardust_results/analysis")
    output_path = "Stardust_results/analysis/"
    from sklearn.metrics import silhouette_samples, silhouette_score
    import matplotlib.cm as cm

    data_df = pd.read_csv(
        'Stardust_results/visualization_output/3_pass/data.csv',
        delimiter=",",
        index_col=False)
    data_df.set_index('data', inplace=True)
    coords = data_df[['x', 'y']]
    silhouette_avg = silhouette_score(coords, data_df['cluster'])
    sample_silhouette_values = silhouette_samples(coords, data_df['cluster'])
    print("silhouette score ", silhouette_avg)

    y_lower = 10
    fig = plt.figure(figsize=(4, 7))
    # Iterate over the labels actually present rather than assuming they
    # are exactly 0..n-1; otherwise clusters with other labels are dropped.
    cluster_labels = sorted(data_df['cluster'].unique())
    n_clusters = len(cluster_labels)
    for i, label in enumerate(cluster_labels):
        # Aggregate the silhouette scores for samples belonging to
        # this cluster, and sort them
        in_cluster = (data_df['cluster'] == label).values
        ith_cluster_silhouette_values = sample_silhouette_values[in_cluster]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        plt.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)

        # Label the silhouette plots with their cluster label at the middle
        plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(label))

        # Compute the new y_lower for next plot (10 rows of spacing)
        y_lower = y_upper + 10

    plt.title("The silhouette plot for the various clusters.")
    plt.xlabel("silhouette coefficient", fontsize=20)
    plt.ylabel("Cluster label", fontsize=20)
    plt.axvline(x=silhouette_avg, color="red", linestyle="--")

    plt.yticks([])  # Clear the yaxis labels / ticks
    plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    sns.despine(bottom=False, left=False)
    fig.savefig(output_path + "/silhouette.pdf", bbox_inches='tight', dpi=600)
    fig.savefig(output_path + "/silhouette.png", bbox_inches='tight', dpi=600)
Exemplo n.º 6
0
def getCommentLengthsDistribution(comments):
    """Show a histogram of the lengths of the entries in ``comments``.

    Lengths are binned in steps of 10 over the range 0-500 and a dashed red
    vertical line is drawn at 200.

    Args:
        comments: Integer-indexable collection whose items support ``len``.
    """
    lengths = [len(comments[pos]) for pos in range(len(comments))]

    plt.hist(lengths, bins=np.arange(0, 500, 10))
    plt.xlabel('Number of Words in Comment')
    plt.ylabel('Comment Counts')
    plt.title('Histogram of Word Counts in Comments')
    plt.axvline(x=200, color='r', linestyle='dashed', linewidth=2)
    plt.show()
def Plot3Data(x_data,
              y_data,
              z_data,
              ylabel,
              zlabel,
              plottitle,
              savename,
              LOGFILE,
              participant,
              section,
              savepath,
              verticallineindices=[0],
              grid=1,
              xlabel='Time (in Seconds)'):
    """Plot two series against a shared x axis and save the figure.

    Any exception raised while plotting or saving is printed and appended
    as a CSV row to ``LOGFILE``; it is not re-raised. The figure is closed
    whether or not saving succeeded.

    Args:
        x_data: x values shared by both series.
        y_data: First series, drawn as a thin red solid line.
        z_data: Second series, drawn as a green dashed line.
        ylabel: Legend label of the first series.
        zlabel: Legend label of the second series.
        plottitle: Figure title.
        savename: Output file name, appended to ``savepath``.
        LOGFILE: Path of the CSV log that receives error rows.
        participant: Participant identifier written to the log on error.
        section: Section identifier written to the log on error.
        savepath: Prefix for ``savename`` (plain string concatenation).
        verticallineindices: Flags parallel to ``x_data``; a vertical line
            is drawn at ``x_data[i]`` wherever the flag equals 1. Ignored
            unless it has more than one entry. The default list is never
            mutated here.
        grid: Draw a grid when equal to 1.
        xlabel: Label of the x axis.
    """
    if DEBUG == 1:
        print("Plotting function called for : ", ylabel)
    fig = None
    try:
        #starting the plot
        fig = plt.figure()
        fig.tight_layout()
        plt.title(plottitle)
        plt.plot(x_data, y_data, 'r-', label=ylabel, linewidth=0.1)
        plt.plot(x_data, z_data, 'g--', label=zlabel)
        if DEBUG == 1:
            print("First few elements of the x,y and z data are : ",
                  x_data[0:3], '\n', y_data[0:3], '\n', z_data[0:3])
        if len(verticallineindices) > 1:
            for i, flag in enumerate(verticallineindices):
                if flag == 1:
                    plt.axvline(x=x_data[i], linewidth=1)
        plt.xlabel(xlabel)
        plt.ylabel(str(ylabel) + ' and ' + str(zlabel))
        plt.legend(loc='upper right')
        if grid == 1:
            plt.grid(color='b', linestyle='-.', linewidth=0.1)
        # The `quality` keyword of savefig was removed from Matplotlib, and
        # passing it makes every save fail. JPEG quality is requested
        # through pil_kwargs instead; other formats never used it.
        save_kwargs = {'bbox_inches': 'tight', 'dpi': 900}
        if str(savename).lower().endswith(('.jpg', '.jpeg')):
            save_kwargs['pil_kwargs'] = {'quality': 100}
        plt.savefig(savepath + savename, **save_kwargs)
    except Exception as e:
        print("Exception at the plotting function in PlottingFunctions.py : ",
              e)
        with open(LOGFILE, 'a') as file:
            writer = csv.writer(file)
            writer.writerow([
                ' Exception in the plotting function ', ' Participant: ',
                participant, ' Section : ', section, '  ', ' Exception: ', e
            ])
    finally:
        # Release the figure even when plotting or saving failed.
        if fig is not None:
            plt.close(fig)
Exemplo n.º 8
0
def vertical_mean_line(x, **kwargs):
    """Draw a dashed vertical line at the mean of ``x`` with a text label.

    The label shows the mean and standard deviation. It is placed slightly
    to the right of the line when the mean is below 6, otherwise 1.4 units
    to the left. The offsets and the threshold are hard-coded and need
    tuning per data set.

    Args:
        x: Object exposing ``mean()`` and ``std()``.
        **kwargs: Only ``color`` is read (default "r"); it is used for both
            the line and the text.
    """
    colour = kwargs.get("color", "r")
    centre = x.mean()
    plt.axvline(centre, linestyle="--", color=colour)
    text_style = dict(size=15, color=colour)

    label_y = 5  # this needs customization based on your data
    if centre < 6:  # this needs customization based on your data
        caption = "mean: {:.2f}\n(std: {:.2f})".format(centre, x.std())
        label_x = centre + 0.08  # this needs customization based on your data
    else:
        caption = "mean: {:.2f}\n  (std: {:.2f})".format(centre, x.std())
        label_x = centre - 1.4
    plt.text(label_x, label_y, caption, **text_style)
def get_graph(n, title):
    """Show a density histogram of squared norms of n-dimensional normal samples.

    Draws ``N`` samples (``N`` is a module-level name) from an
    n-dimensional standard normal distribution, plots the distribution of
    their squared Euclidean norms, and marks with red vertical lines the
    two bounds returned by ``get_2_std_estimates``.

    Args:
        n: Dimension of each sample.
        title: Plot title.
    """
    sample = np.random.normal(size=(N, n))
    dist = np.square(np.linalg.norm(sample, axis=1))
    lower_bound, upper_bound = get_2_std_estimates(dist)
    # density takes a boolean; the return value is not needed, so the
    # parameter `n` is no longer rebound.
    plt.hist(dist, bins='auto', density=True)
    plt.axvline(x=lower_bound, color='red')
    plt.axvline(x=upper_bound, color='red')
    plt.title(title, fontdict={'fontsize': 20})
    plt.show()
Exemplo n.º 10
0
def graph(x,y,xLabel,yLabel,title,figname):
    """Save a histogram of ``x`` with its mean and a reference value ``y``.

    The current figure is cleared first. A dashed black line marks the mean
    of ``x``; a second line is drawn at ``x = y`` from height 0 to 9. The
    p-value of ``ttest_ind(x, [y])`` is shown as an extra legend entry.

    Args:
        x: Sample values to histogram.
        y: Reference value drawn as the "model accuracy" line.
        xLabel: Label of the x axis.
        yLabel: Label of the y axis.
        title: Plot title.
        figname: Path handed to ``plt.savefig``.
    """
    plt.clf()
    plt.hist(x, color="c", edgecolor="k", alpha=0.5)
    sample_mean = np.array(x).mean()
    plt.axvline(sample_mean,
                color="k",
                linestyle="dashed",
                linewidth=3,
                label="average")
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    plt.title(title)

    # Ten stacked points with the same x value draw a vertical segment.
    heights = np.arange(0, 10, 1)
    reference_x = np.array([y] * 10)
    plt.plot(reference_x, heights, label="model accuracy")
    # NOTE(review): the second sample has a single element; confirm that a
    # two-sample test is intended here rather than a one-sample test.
    p_value = ttest_ind(x, [y])[1]
    # Empty white line: only there to put the p-value into the legend.
    plt.plot([], [], label=f"p-value: {np.round(p_value,4)}", color="w")
    plt.legend()
    plt.savefig(figname)
Exemplo n.º 11
0
def plot_std(qstack):
    """Plot the stacked flux with a shaded one-standard-deviation band.

    The standard deviation is the square root of the diagonal of
    ``qstack.flux_covar``. A dashed red line is drawn at x = 1215.67. The
    axes label is built from the module-level ``bin_size``.

    Args:
        qstack: Object providing ``flux_covar``, ``wave_stack`` and
            ``flux_stack``.
    """
    plt.figure()
    sigma = np.sqrt(np.diagonal(qstack.flux_covar))
    band_ax = plt.axes(None, label=str(bin_size))

    wavelengths = qstack.wave_stack
    flux = qstack.flux_stack
    plt.plot(wavelengths, flux, label='Stacked Flux')
    band_ax.fill_between(wavelengths,
                         flux - sigma,
                         flux + sigma,
                         alpha=0.25,
                         label="1-$\\sigma$ Uncertainty Range")
    plt.xlabel("Wavelength (Angstroms)")
    plt.ylabel("Stacked Continuum Normalized Flux")
    plt.axvline(x=1215.67, color='red', linestyle='--')
    plt.legend()
Exemplo n.º 12
0
def plotMultipleNumpylist(plotDict, yLabel, xLable):
    """Plot every series in ``plotDict`` as a line and mark x = 120.

    A dashed maroon vertical line spanning the full axes height is drawn at
    x = 120, with the text "GRAMs Launching" placed at the largest value
    found in any series (0 when ``plotDict`` is empty). The dictionary keys
    are used as legend entries.

    Args:
        plotDict: Mapping of legend label to a sequence of values.
        yLabel: Label of the y axis.
        xLable: Label of the x axis.
    """
    peaks = []
    for series in plotDict.values():
        plt.plot(series, linewidth=.7)
        peaks.append(max(series))

    plt.ylabel(yLabel)
    plt.xlabel(xLable)
    text_y = max(peaks) if peaks else 0
    # ymin/ymax of axvline are fractions of the axes height (0 = bottom,
    # 1 = top), not data coordinates.
    plt.axvline(120, ymin=0, ymax=1, linestyle='dashed', color='maroon')
    plt.text(120, text_y, "   GRAMs Launching",
             {'color': 'maroon', 'fontsize': 10})

    plt.legend(plotDict.keys(), loc='upper left')
    plt.show()

    return
    def Gershgorin(self):
        """Draw the Gershgorin circles of the matrix stored in ``self.x``.

        Each circle is centred on the real axis at a diagonal entry, with a
        radius equal to the sum of the absolute values of the other entries
        in that row. A dashed red vertical line marks every centre.
        ``self.x`` is left unmodified.

        Returns:
            An empty list when ``is_square(self.x)`` is not ``True``;
            otherwise ``None`` after showing the plot.
        """
        if is_square(self.x) != True:
            print('Please enter a square matrix')
            return []

        matrix = np.array(self.x)
        # Centres keep their sign; only the radii use absolute values. The
        # absolute values are kept in a local so self.x is not overwritten.
        centres = matrix.diagonal().tolist()
        magnitudes = np.absolute(matrix)
        radii = (magnitudes.sum(axis=1) - magnitudes.diagonal()).tolist()

        spread = np.std(centres)
        Xupper = max(centres) + spread
        Xlower = min(centres) - spread
        Ylimit = max(radii) + spread

        _, ax = plt.subplots()
        ax.set_xlim((Xlower, Xupper))
        ax.set_ylim((-Ylimit, Ylimit))
        plt.xlabel('Real Axis')
        plt.ylabel('Imaginary Axis')
        plt.title('Gershgorin circles')
        for centre, radius in zip(centres, radii):
            ax.add_artist(plt.Circle((centre, 0), radius=radius))

        # Dashed coordinate axes through the origin.
        ax.plot([Xlower, Xupper], [0, 0], 'k--')
        ax.plot([0, 0], [-Ylimit, Ylimit], 'k--')
        ax.yaxis.grid(True, linestyle="--")
        ax.xaxis.grid(True, linestyle="--")
        for centre in centres:
            plt.axvline(x=centre, linestyle='--', color='r')  # vertical lines

        plt.show()
Exemplo n.º 14
0
def plot_boot(qstack):
    """Plot up to 100 bootstrap spectra beneath the stacked flux.

    The first ``num`` rows of ``qstack.ws_boot`` / ``qstack.fs_boot`` are
    drawn as faint orange lines, the stacked flux is drawn on top, and a
    dashed red line is placed at x = 1215.67. The axes label is built from
    the module-level ``bin_size``.

    Args:
        qstack: Object providing ``ws_boot``, ``fs_boot``, ``wave_stack``
            and ``flux_stack``.
    """
    plt.figure()
    plt.axes(None, label=str(bin_size))

    num = 100  # number of bootstrap samples to draw
    ws_boot = qstack.ws_boot[:num]
    fs_boot = qstack.fs_boot[:num]

    plt.plot(ws_boot.T, fs_boot.T, alpha=0.1, color='orange')
    # Re-plot one sample with a label so the legend gets a single
    # 'Bootstrap Samples' entry instead of one per line.
    plt.plot(ws_boot[0],
             fs_boot[0],
             alpha=0.1,
             color='orange',
             label='Bootstrap Samples')
    plt.plot(qstack.wave_stack, qstack.flux_stack, label='Stacked Flux')
    plt.xlabel("Wavelength (Angstroms)")
    plt.ylabel("Stacked Continuum Normalized Flux")
    plt.axvline(x=1215.67, color='red', linestyle='--')
    plt.legend()
Exemplo n.º 15
0
def PlotParticipantData():
    """Plot the clipped iMotions and eye-tracker data of every participant.

    For each folder in the module-level ``listoffolders`` this changes into
    ``<folder>/ClippedData/``, reads ``StrippediMotionsData.csv`` and
    ``StrippedEyeTrackingFile.csv``, and writes ``iMotionsDrivingData.pdf``,
    ``iMotionsPhysioData.pdf``, ``EyeTrackerData.pdf`` and (conditionally)
    ``PERCLOS.csv`` there, then changes back with ``../../``.

    This is Python 2 code (print statements, csv files opened in 'wb').
    """
    #chosenfolder = raw_input("\n\nPlease enter the name of the participant whose data we need to plot (e.g. P006/P010/P027...)\n\n")
    for chosenfolder in listoffolders:
        #chosenfolder = raw_input("\n\nPlease enter an acceptable folder name!\n\n")
        os.chdir(chosenfolder +
                 '/ClippedData/')  #Navigating in to the participant subfolder.
        #print "\n ****** Plotting for participant:", chosenfolder, " Opening all stripped files *******\n"
        #Wondering if I should sim data. There is nothing there that we need now for now.
        #simfile = open('StrippedSimData.csv','r')
        #simreader = csv.reader(simfile)
        #skiplines(simreader,1)
        #simdata = list(simreader)
        try:
            #Plotting imotions Data
            # NOTE(review): imofile is never closed.
            imofile = open('StrippediMotionsData.csv', 'r')
            imoreader = csv.reader(imofile)
            skiplines(imoreader, 1)
            imodata = list(imoreader)
            # Column 0 is time, column 2 the event marker, columns 3-8 the signals.
            time = [float(imodata[i][0]) for i in range(len(imodata))]
            eventmarker = [float(imodata[i][2]) for i in range(len(imodata))]
            steer = [float(imodata[i][3]) for i in range(len(imodata))]
            throttle = [float(imodata[i][4]) for i in range(len(imodata))]
            brake = [float(imodata[i][5]) for i in range(len(imodata))]
            PPG = [float(imodata[i][6]) for i in range(len(imodata))]
            speed = [float(imodata[i][7]) for i in range(len(imodata))]
            GSR = [float(imodata[i][8]) for i in range(len(imodata))]
            #Locating indices and respective times for vertical marker placement
            # Markers for participants under 61
            # NOTE(review): this test and the one below are both true for
            # participant 61, so the second block overwrites xi/xc for it;
            # confirm which side of the boundary 61 belongs to.
            if int(chosenfolder[1:4]) <= 61:
                # list.index raises ValueError when a marker value is absent;
                # that is swallowed by the bare except below.
                xi = [eventmarker.index(1)]
                xi.append(eventmarker.index(21))
                xi.append(eventmarker.index(5))
                xi.append(eventmarker.index(10))
                # NOTE(review): xc is seeded with time[xi[0]] and the loop
                # appends it again, so the first marker time appears twice.
                xc = [time[xi[0]]]
                for i in xi:
                    xc.append(time[i])
                print "x coordinates: ", xc, '\n'
            # Markers for participants over 62
            if int(chosenfolder[1:4]) >= 61:
                xi = [eventmarker.index(1)]
                xi.append(eventmarker.index(5))
                xi.append(eventmarker.index(10))
                xc = [time[xi[0]]]
                for i in xi:
                    xc.append(time[i])
                print "x coordinates: ", xc, '\n'
            #Starting the iMotions Figure here.
            # Four stacked panels: steer, throttle, brake, speed; each gets
            # a thin vertical line at every marker time in xc.
            imofig1 = plt.figure(1)
            imofig1.tight_layout()
            plt.subplot(411)
            plt.title('Driving Data Plot (Steer/Throttle/Brake)')
            plt.plot(time, steer, 'r-', label='Steer')
            plt.xlabel('Time (sec)')
            plt.ylabel('Steer')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            plt.subplot(412)
            plt.plot(time, throttle, 'b-', label='Throttle')
            plt.xlabel('Time (sec)')
            plt.ylabel('Throttle')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            plt.subplot(413)
            plt.plot(time, brake, 'g-', label='Brake')
            plt.xlabel('Time (sec)')
            plt.ylabel('Brake')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            plt.subplot(414)
            plt.plot(time, speed, 'b-', label='Speed')
            plt.xlabel('Time (sec)')
            plt.ylabel('Speed')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            imofig1.savefig("iMotionsDrivingData.pdf", bbox_inches='tight')
            plt.close()
            #END OF FIGURE 1
            # Two stacked panels: PPG and GSR.
            imofig2 = plt.figure(1)
            imofig2.tight_layout()
            plt.subplot(211)
            plt.title('Physiological Data Plot (PPG/GSR)')
            plt.plot(time, PPG, 'r-', label='PPG')
            plt.legend(loc='upper right')
            plt.xlabel('Time (sec)')
            plt.ylabel('PPG/HR')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            plt.subplot(212)
            plt.plot(time, GSR, 'g-', label='GSR')
            plt.xlabel('Time (sec)')
            plt.ylabel('GSR')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            imofig2.savefig("iMotionsPhysioData.pdf", bbox_inches='tight')
            plt.close()
            #END OF FIGURE 2
        except:
            # NOTE(review): the bare except hides every failure in the block
            # above (missing file, bad float, missing marker, plotting
            # error) behind the same message, and a figure opened before
            # the failure is not closed.
            print "Participant : ", chosenfolder, " has bad data. Please exclude from analysis."
            '''if os.path.isfile('BadData.txt'):
                pass
            else:
                markerfile = open('BadData.csv','wb')
                markerwriter = csv.writer(markerfile)
                markerfile.close()'''
            pass
        #Plotting Eye Tracker Data
        try:
            # NOTE(review): etfile is never closed.
            etfile = open('StrippedEyeTrackingFile.csv', 'r')
            etreader = csv.reader(etfile)
            skiplines(etreader, 1)
            etdata = list(etreader)
            # Plotting the marker using counter in the indexbinocular column.
            time = [float(etdata[i][0]) for i in range(len(etdata))]
            catbin = [etdata[i][3] for i in range(len(etdata))]
            pupdia = []
            indexbin = []  #initializing to populate them later
            for i in range(len(etdata)):
                # Unparseable pupil diameters are recorded as 0.
                try:
                    pupdia.append(float(etdata[i][2]))
                except ValueError:
                    pupdia.append(0)
                # NOTE(review): a short row raises IndexError here, not
                # ValueError, so this handler never fires; indexbin is also
                # not used further down in this function.
                try:
                    indexbin.append(etdata[i][15])
                except ValueError:
                    indexbin.append('-')
            # Function to calculate PERCLOS stats from catbin variable and time variable
            perclos_array = PERCLOS(time, catbin)
            #print "PERCLOS: \n", len(perclos_array)," \n\n\n", perclos_array
            # PERCLOS.csv is only written when the first PERCLOS value is non-zero.
            if perclos_array[0][0] != 0:
                perclos_file = open('PERCLOS.csv', 'wb')
                percloswriter = csv.writer(perclos_file)
                percloswriter.writerow(['Time', 'PERCLOS'])
                # Each row of perclos_array is written as (element 1, element 0).
                percloswriter.writerows(
                    [perclos_array[i][1], perclos_array[i][0]]
                    for i in range(len(perclos_array)))
                perclos_file.close()
            #x = [ time[i] for i in range(len(etdata)) if catbin[i] == 'User Event']# This produces the same results as xc from above
            #Starting the eyetracker Figure here.
            # NOTE(review): xc comes from the iMotions section above. If
            # that section failed before assigning it, xc is either
            # undefined (NameError, not caught by `except IOError`) or
            # still holds the previous participant's markers.
            etfig = plt.figure(1)
            etfig.tight_layout()
            plt.subplot(211)
            plt.title('Eye Tracking Data Plot (Pupil Diameter/Blinks)')
            plt.plot(time, pupdia, 'r-', label='Pupil Diameter')
            plt.xlabel('Time (sec)')
            plt.ylabel('Pupil Diameter (mm)')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            plt.subplot(212)
            plt.plot([perclos_array[i][1] for i in range(len(perclos_array))],
                     [perclos_array[i][0] for i in range(len(perclos_array))],
                     'b--',
                     label='PERCLOS')
            plt.xlabel('Time (sec)')
            plt.ylabel('PERCLOS ( 0 - 1 )')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            etfig.savefig("EyeTrackerData.pdf", bbox_inches='tight')
            plt.close()
            #END OF FIGURE 3
        except IOError:
            # Only I/O failures are handled here; any other exception leaves
            # this function before the os.chdir back below runs.
            print "Eye tracker data for: ", chosenfolder, "is not available to plot. This participant has an error with markers or the eye tracker data wasn't recorded."
            '''if os.path.isfile('BadData.txt'):
                pass
            else:
                markerfile = open('BadData.csv','wb')
                markerwriter = csv.writer(markerfile)
                markerfile.close()'''
            pass
        os.chdir('../../')  #Navigating back to the main folder now.
Exemplo n.º 16
0
    cv = lambda z: (1 / 2) * (1 + scipy.special.erf(z / np.sqrt(2)))

    plt.figure()
    plt.plot(tq_fine, pdf)
    plt.xlim(quantile(cv(-4)), quantile(cv(4)))
    plt.xlabel("$\\log_{10}(t_Q)$ (years)")
    plt.ylabel("Probability Density")

    tqmed = quantile(0.5)
    one_sig_upper = quantile(cv(1))
    one_sig_lower = quantile(cv(-1))

    two_sig_upper = quantile(cv(2))
    two_sig_lower = quantile(cv(-2))

    plt.axvline(x=one_sig_upper, color='red', linestyle='--')
    plt.axvline(x=one_sig_lower, color='red', linestyle='--')
    plt.axvline(x=two_sig_upper, color='blue', linestyle='--')
    plt.axvline(x=two_sig_lower, color='blue', linestyle='--')
    plt.axvline(x=tqmed, color='orange', linestyle='--')
    plt.savefig("model/hist/{0}_tq_{1}_hist.pdf".format(stack, bin_size))

    u_var = one_sig_upper - tqmed
    l_var = tqmed - one_sig_lower

    print("{0} stack, bin size = {1}, tq =".format(stack, bin_size), tqmed,
          '+', u_var, '-', l_var)

    #tqmed = 5.9

    cov_interp = interpol.interp1d(tqs, mod_covars, axis=0)
Exemplo n.º 17
0
def test():
    """Fit several classifiers and plot their probabilities for one sample.

    For every dataset, three base classifiers and two voting ensembles are
    fitted on ``./data/<dataset>_train.data``. The predicted probabilities
    of the first two classes for the first row of
    ``./data/<dataset>_test.data`` are shown as grouped bars: green shades
    for the base classifiers, blue shades for the ensembles, with a dashed
    line separating the two groups.
    """
    # define datasets-----------------------------------
    datasets = ['Breast']
    names = ["DecisionTree", "KNeighbors", "GaussianNB"]
    # define classifiers-------------------------------------------
    classifiers = [
        DecisionTreeClassifier(max_depth=4),
        KNeighborsClassifier(n_neighbors=3),
        GaussianNB()
    ]
    n_base = len(classifiers)
    clfs = list(zip(names, classifiers))
    eclf_soft = VotingClassifier(estimators=clfs, voting='soft')
    # NOTE(review): this ensemble is named "hard" but is configured with
    # voting='soft', making it identical to eclf_soft. It is left as is
    # because predict_proba (used below) needs soft voting; confirm intent.
    eclf_hard = VotingClassifier(estimators=clfs, voting='soft')
    classifiers.extend([eclf_soft, eclf_hard])
    names.extend(["VotingSoft", "VotingHard"])

    # iterate over datasets
    for dataset in datasets:
        X_train, y_train = utils.read_data('./data/' + dataset + '_train.data')
        X_test, _ = utils.read_data('./data/' + dataset + '_test.data')
        # predict class probabilities with every classifier
        probas = []
        for clf in classifiers:
            clf.fit(X_train, y_train)
            probas.append(clf.predict_proba(X_test))
        # probabilities of the first two classes for the first test sample
        class1_1 = [pr[0, 0] for pr in probas]
        class2_1 = [pr[0, 1] for pr in probas]

        # One bar group per classifier, so positions and heights always
        # have the same length.
        ind = np.arange(len(names))  # group positions
        width = 0.35  # bar width

        def only_base(values):
            """Keep base-classifier entries, zero out the ensembles."""
            return [v if i < n_base else 0 for i, v in enumerate(values)]

        def only_ensemble(values):
            """Keep ensemble entries, zero out the base classifiers."""
            return [v if i >= n_base else 0 for i, v in enumerate(values)]

        fig, ax = plt.subplots()
        # bars for the base classifiers
        p1 = ax.bar(ind,
                    only_base(class1_1),
                    width,
                    color='green',
                    edgecolor='k')
        p2 = ax.bar(ind + width,
                    only_base(class2_1),
                    width,
                    color='lightgreen',
                    edgecolor='k')

        # bars for the voting ensembles
        ax.bar(ind,
               only_ensemble(class1_1),
               width,
               color='blue',
               edgecolor='k')
        ax.bar(ind + width,
               only_ensemble(class2_1),
               width,
               color='steelblue',
               edgecolor='k')

        # plot annotations
        # Separator between the last base classifier and the first ensemble.
        plt.axvline(n_base - 0.2, color='k', linestyle='dashed')
        ax.set_xticks(ind + width)
        # Label each group with the classifier that produced it.
        ax.set_xticklabels(names, rotation=40, ha='right')
        plt.ylim([0, 1])
        plt.title('Class probabilities for sample 1 by different classifiers')
        plt.legend([p1[0], p2[0]], ['class 1', 'class 2'], loc='upper left')
        plt.tight_layout()
        plt.show()
Exemplo n.º 18
0
# Lower panel of the first figure: cumulative P&L with a zero reference line.
ax2 = fig.add_subplot(212)
ax2.set_ylabel('Cumulative P&L (USD)')
tradingResultDf['P&L'].cumsum().plot(ax=ax2)
plt.hlines(y=0,xmin = 0,xmax = 2000,color='b', linestyle='--')
plt.title('AAPL SVM trading on spread crossing using 10,000 events \n(Since openning of 6/21/2012)')
plt.savefig('10k_combined_spread.png', bbox_inches='tight', dpi=400)

# plot 30,000 events backtesting p&l
fig2 = plt.figure()
fig2.set_size_inches(18.5, 10.5)
ax3 = fig2.add_subplot(211)
res['P&L'].plot(ax=ax3)
# Label the upper panel of this figure (ax3), not ax1 of the previous one.
ax3.set_ylabel('P&L (USD)')

ax4 = fig2.add_subplot(212)
ax4.set_ylabel('Cumulative P&L (USD)')
res['P&L'].cumsum().plot(ax=ax4)
# Dashed vertical separators at events 2000 and 4000.
xposition = [2000, 4000]
for xc in xposition:
    plt.axvline(x=xc, color='k', linestyle='--')
plt.hlines(y=0,xmin = 0,xmax = 6000,color='b', linestyle='--')
plt.title('AAPL SVM trading on mid-price using 30,000 events \n(Since beginning of 6/21/2012)')
plt.savefig('30k_combined.png', bbox_inches='tight', dpi=400)

# ax2.plot(buys.index, results.short_mavg.ix[buys.index],
#                  '^', markersize=10, color='m')
# ax2.plot(sells.index, results.short_mavg.ix[sells.index],
#                  'v', markersize=10, color='k')
# plt.legend(loc=0)

Exemplo n.º 19
0
def prompt_MDP(dataname):
    """Plot BAT/XRT count rates from a data file and return the window MDP.

    The file is scanned for the marker lines 'batSNR5flux', 'batSNR5gamma',
    'xrtwtflux' and 'xrtwtgamma'; the rows between each flux/gamma pair are
    parsed as whitespace-separated floats. Fluxes (column 3) and their
    errors (column 4) are converted to counts with ``N_count``, plotted on
    log-log axes, and the MDP is computed from the rows whose first column
    lies strictly between the module-level ``t_start`` and ``t_end``.

    The figure is saved under ``../GRB_prompt_MDP(3)_300s/`` and shown.

    Relies on module-level ``xrt_photon_index``, ``bat_photon_index``,
    ``eff``, ``area``, ``miu``, ``t_start`` and ``t_end``.

    Args:
        dataname: Path of the input file; also used in the plot title and
            the output file name.

    Returns:
        The MDP (in percent) computed for the ``t_start``..``t_end`` window.
    """
    path_png = '../GRB_prompt_MDP(3)_300s/'

    # --------------- read the data (single pass over the file)
    with open(dataname, 'r') as f:
        lines = f.readlines()

    bat_start = bat_end = xrt_start = xrt_end = 0
    for num, line in enumerate(lines, start=1):
        if 'batSNR5flux' in line:
            bat_start = num
        if 'batSNR5gamma' in line:
            bat_end = num - 2
        if 'xrtwtflux' in line:
            xrt_start = num
        if 'xrtwtgamma' in line:
            xrt_end = num - 2

    # Each row splits into its fields plus one trailing element, which is
    # dropped before the conversion to float. The builtin float replaces
    # the removed np.float alias.
    bat = np.array(
        [re.split(r'\s+', line) for line in lines[bat_start:bat_end]])
    if len(bat) != 0:
        bat = bat[:, :-1].astype(float)
    xrt = np.array(
        [re.split(r'\s+', line) for line in lines[xrt_start:xrt_end]])
    xrt = xrt[:, :-1].astype(float)

    # ---------------------- convert flux to photon counts
    def N_count(flux, index=xrt_photon_index):
        """Scale ``flux`` by the ratio of two power-law integrals."""
        N = flux * integrate.quad(lambda E: E ** (-index), 2, 10.0)[0] / \
            integrate.quad(lambda E: E * E ** (-index), 0.3, 10)[0] / 1.6e-9
        return N

    if len(bat) != 0:
        bat_count = [N_count(v, index=bat_photon_index) for v in bat[:, 3]]
        bat_count_err = [
            N_count(v, index=bat_photon_index) for v in bat[:, 4]
        ]
        # Counts and their errors become the last two columns.
        bat = np.column_stack((bat, bat_count, bat_count_err))

    xrt_count = [N_count(v) for v in xrt[:, 3]]
    xrt_count_err = [N_count(v) for v in xrt[:, 4]]
    xrt = np.column_stack((xrt, xrt_count, xrt_count_err))

    # --------------------------- plotting
    fig, ax = plt.subplots()
    if len(bat) != 0:
        plt.errorbar(bat[:, 0],
                     bat[:, -2],
                     yerr=bat[:, -1],
                     xerr=bat[:, 1],
                     fmt='o',
                     label='BAT(flux to count)')

    plt.errorbar(xrt[:, 0],
                 xrt[:, -2],
                 yerr=xrt[:, -1],
                 xerr=xrt[:, 1],
                 fmt='o',
                 color='red',
                 label='XRT(flux to count)')
    plt.xlabel('Time since BAT trigger (s)')
    plt.ylabel(r'2-10 keV (Count/cm$^2$/s)')
    plt.title('Swift BAT-XRT data of %s' % dataname)
    plt.loglog()

    # ----------------------------------- merge the BAT and XRT count data
    # BAT rows are kept only up to the first XRT time stamp.
    xrt_x0 = xrt[0, 0]
    print(xrt_x0)
    if len(bat) == 0:
        bat = xrt
    else:
        bat = np.vstack((bat[bat[:, 0] < xrt_x0, :], xrt))

    # ---------------------------- data points inside (t_start, t_end)
    bat2 = bat[bat[:, 0] > t_start, :]
    bat2 = bat2[bat2[:, 0] < t_end, :]
    xerr2 = bat2[:, 1]
    xerr2_ = bat2[:, 2]
    y2 = bat2[:, 6]
    plt.axvline(t_start, label='t=%s s' % t_start, color='green')
    plt.axvline(t_end, label='t=%s s' % t_end, color='green')

    # ---------------------------- MDP for the selected window
    # Duration of each bin is column 1 minus column 2.
    t2 = xerr2 - xerr2_
    N_cm2 = sum(y2 * t2)
    N_total2 = N_cm2 * eff * area
    MDP2 = 4.29 / (miu * np.sqrt(N_total2)) * 100
    fig.text(0.2,
             0.2,
             'MDP = %2.2f %%' % MDP2,
             color='red',
             fontsize=12,
             fontweight='bold')
    plt.legend(loc='upper left')
    plt.savefig(path_png + dataname + ' %2.2f%%' % MDP2 + '.png')
    plt.show()
    return MDP2
Exemplo n.º 20
0
# Upper panel: mark the first sample of X0 with its hard prediction (circle)
# and the second entry of its predict_proba row (cross), on top of the
# thresholded prediction curve over xx.
first_sample = X0[:1]
highlight = dict(s=300, c='r', lw=5, alpha=0.5)
plt.scatter(X0[0], model.predict(first_sample), marker='o', **highlight)
plt.plot(xx, model.predict(xx[:, np.newaxis]) > 0.5, lw=2)
first_proba = model.predict_proba(first_sample)[0]
plt.scatter(X0[0], first_proba[1], marker='x', **highlight)
plt.axvline(X0[0], c='r', lw=2, alpha=0.5)
plt.xlim(-3, 3)

# Lower panel: one bar per class with the predicted probability of that class.
plt.subplot(212)
plt.bar(model.classes_, first_proba, align="center")
plt.xlim(-1, 2)
plt.gca().xaxis.grid(False)
plt.xticks(model.classes_)
plt.title("conditional probability")
plt.tight_layout()
plt.show()

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (available since 0.18). Fall back to the
# old module only for installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Load the iris dataset and keep only feature columns 2 and 3.
iris = load_iris()
X = iris.data[:, [2, 3]]
Exemplo n.º 21
0
def compute(inp_dataset, input_path, output_path, de_analysis, n_pass):

    print("Current pass ", n_pass)
    import json
    import matplotlib as plt
    import csv
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from decimal import Decimal
    import seaborn as sns
    import pandas as pd
    import networkx as nx
    from sklearn.cluster import DBSCAN
    from sklearn.cluster import KMeans
    import operator
    import numpy as np
    import random
    import sys

    #csvData=[['data','x','y','type']]
    print("Processing the input data into datafames....")
    csvData = []
    count = 0
    #filename = "G:/Thesis/Dropclust/plots/output_normalized_own_cc.csv" filename = "G:/Thesis/Dropclust/plots/PCA_GENES/output_normalized_own_cc.csv" filename =
    #"G:/Thesis/Dropclust/output_normalized_zscore_cc1.csv" filename = "C:/Users/Swagatam/IdeaProjects/openOrd/output_normalized_own_cc.csv"
    filename = input_path + "/output_normalized_own_cc.csv"
    coord_data = pd.read_csv(filename, names=['data', 'x', 'y'])
    coord_data.set_index('data', inplace=True)
    data = []
    data_outlier = []
    with open(filename, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            #f=0
            #row=[float(i) for i in row]
            data.append(row)
            temp_outlier = []
            temp_outlier.append(row[1])
            temp_outlier.append(row[2])
            data_outlier.append(temp_outlier)
            temp = row
            #if row[0].isnumeric():
            #    temp.append('cell')
            if len(row[0]) >= 16:
                temp.append('cell')
            else:
                temp.append('gene')
                count = count + 1
            csvData.append(temp)

    # # DB SCAN

    # In[20]:

    if n_pass != 4:
        noise = []
        print("Performing clustering....")
        db = DBSCAN(eps=180, min_samples=55).fit_predict(data_outlier)
        final_data = []
        csvData = [['data', 'x', 'y', 'type']]
        for i in range(0, len(list(db))):
            if db[i] != -1:
                final_data.append(data[i])
                csvData.append(data[i])
            if db[i] == -1:
                noise.append(data[i][0])
        data = final_data

        n_clusters = len(set(db)) - (1 if -1 in list(db) else 0)
        print("Clustering done. the number of obtained clusters: ", n_clusters)
    else:
        remove_data = []

        prev_df = pd.read_csv(
            "Stardust_results/visualization_output/3_pass/data.csv",
            delimiter=",",
            index_col=False)
        prev_df.set_index('data', inplace=True)
        clusters_info = []
        for i in range(0, len(csvData)):
            if csvData[i][3] == 'cell':
                if csvData[i][0] in (prev_df.index):
                    clusters_info.append(prev_df.loc[csvData[i][0]]['cluster'])
                else:
                    remove_data.append(csvData[i])
            else:
                f = 0
                import pickle
                with open(
                        'Stardust_results/visualization_output/3_pass/de_genes_cluster.txt',
                        'rb') as fp:
                    de_gene_cluster = pickle.load(fp)
                for rank in range(0, len(de_gene_cluster)):
                    if csvData[i][0] in de_gene_cluster[rank]:
                        f = 1
                        clusters_info.append(de_gene_cluster[rank].index(
                            csvData[i][0]))
                        break
                if f == 0:
                    remove_data.append(csvData[i])
        for r in remove_data:
            csvData.remove(r)
        temp = [['data', 'x', 'y', 'type']]
        temp.extend(csvData)
        csvData = temp

    # In[13]:

    # # OUTLIER VISUALIZATION

    # In[21]:
    if n_pass != 4:
        print("Starting outlier detection....")
        data_type = []
        c = 0
        g = 0
        for i in range(0, len(coord_data)):
            if db[i] != -1:
                data_type.append("data")
            else:
                if len(coord_data.index[i]) >= 16:
                    data_type.append("cell_outliers")
                else:
                    g = g + 1
                    data_type.append("gene_outliers")
        coord_data["data_type"] = data_type
        data_colors = ["lightblue"]
        if g > 0:
            noise_colors = ['blue', 'red']
        else:
            noise_colors = ['blue']
        coord_data["alpha"] = np.where(coord_data['data_type'] == 'data', 0.5,
                                       1.0)
        plt.figure(figsize=(6, 4.5))
        #ax = sns.scatterplot(x="x", y="y", data=coord_data[coord_data['alpha']==0.5],hue="data_type",palette=sns.xkcd_palette(data_colors),sizes=(50,100),size="data_type",alpha=0.3)
        #sns.scatterplot(x="x", y="y", data=coord_data[coord_data['alpha']==1.0],hue="data_type",palette=sns.xkcd_palette(noise_colors),sizes=(50,100),size="data_type",marker="^",alpha=1.0,ax=ax)
        marker = {"gene_outliers": "^", "cell_outliers": "^"}
        ax = sns.scatterplot(x="x",
                             y="y",
                             data=coord_data[coord_data['alpha'] == 0.5],
                             hue="data_type",
                             palette=sns.xkcd_palette(data_colors),
                             sizes=(50, 100),
                             size="data_type",
                             linewidth=0.0,
                             s=10,
                             alpha=0.3)
        sns.scatterplot(x="x",
                        y="y",
                        data=coord_data[coord_data['alpha'] == 1.0],
                        hue="data_type",
                        palette=sns.xkcd_palette(noise_colors),
                        sizes=(100, 50),
                        size="data_type",
                        style="data_type",
                        markers=marker,
                        alpha=1.0,
                        linewidth=0.0,
                        s=10,
                        legend='brief',
                        ax=ax)
        #plt.legend(title=='')
        ax.legend(bbox_to_anchor=(1.1, 1.05), frameon=False)
        sns.despine(bottom=False, left=False)
        plt.xlabel("dim1")
        plt.ylabel("dim2")
        plt.savefig(output_path + 'outliers_visualization.png',
                    bbox_inches='tight')
        print("Outliers removed from the dataset....")

    # # POST-HOC CLUSTER ASSIGNMENT

    # In[23]:

    print("Starting post hoc clustering....")
    neighbor_df = pd.read_hdf(
        'Stardust_results/build_output/1_pass/neighbor.h5', 'df')
    if 'Unnamed: 0' in list(neighbor_df.columns):
        neighbor_df.set_index('Unnamed: 0', inplace=True)
    p = 0
    col = list(neighbor_df.columns)
    index = list(neighbor_df.index)
    cell_dict = dict()
    column_dict = dict()
    for i in range(len(col)):
        column_dict[i] = col[i]
    for i in range(len(list(neighbor_df.index))):
        row = neighbor_df.iloc[i]
        col_ind = list(row.to_numpy().nonzero())[0]
        for ind in col_ind:
            if index[i] in cell_dict.keys():
                cell_dict[index[i]].append(column_dict[ind])
            else:
                temp = []
                temp.append(column_dict[ind])
                cell_dict[index[i]] = temp
    cluster_assign = []
    for key_cell in cell_dict.keys():
        clust = dict()
        cells = cell_dict[key_cell]
        for cell in cells:
            if n_pass == 4:
                if cell in list(prev_df.index):
                    cluster = prev_df.loc[cell]['cluster']
                else:
                    cluster = -1
            else:
                cluster = db[list(coord_data.index).index(cell)]
            if cluster not in clust.keys():
                clust[cluster] = 1
            else:
                clust[cluster] = clust[cluster] + 1
        max_cluster = max(clust.items(), key=operator.itemgetter(1))[0]
        if max_cluster == -1:
            continue
        cluster_assign.append(max_cluster)
        x_total = 0
        y_total = 0
        count = 0
        for cell in cells:
            if (n_pass != 4
                    and db[list(coord_data.index).index(cell)] == max_cluster
                ) or (n_pass == 4 and cell in list(prev_df.index)
                      and prev_df.loc[cell]['cluster'] == max_cluster):
                count = count + 1
                x_total = x_total + coord_data.loc[cell]['x']
                y_total = y_total + coord_data.loc[cell]['y']
        temp = []
        temp.append(key_cell)
        temp.append(x_total / count)
        temp.append(y_total / count)
        temp.append('cell')
        p = p + 1
        csvData.append(temp)
    print("Post hoc clustering done....")

    # In[24]:

    with open(output_path + 'data.csv', 'w') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerows(csvData)
    csvFile.close()
    data_df = pd.read_csv(output_path + "data.csv",
                          delimiter=",",
                          index_col=False)
    if n_pass != 4:
        clusters_info = [x for x in db if x != -1]
        clusters_info = clusters_info + cluster_assign
    else:
        clusters_info = clusters_info + cluster_assign
        data_df['cluster'] = clusters_info
    data_df.to_csv(output_path + 'data.csv')
    n_clusters = len(list(set(clusters_info)))
    print("cluster saved ....")

    n_clusters = len(data_df['cluster'].unique())
    colors = random.sample(seaborn_colors, n_clusters)

    colors = random.sample(seaborn_colors, n_clusters)
    plt.figure(figsize=(5, 5))
    #cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
    ax = sns.scatterplot(x="x",
                         y="y",
                         data=data_df,
                         hue="cluster",
                         palette=sns.xkcd_palette(colors),
                         linewidth=0.0,
                         s=2)
    ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
    for cl in range(n_clusters):
        plt.annotate(cl,
                     data_df.loc[data_df['cluster'] == cl, ['x', 'y']].mean(),
                     horizontalalignment='center',
                     verticalalignment='center',
                     size=10,
                     weight='bold',
                     color="black")
    sns.despine(bottom=False, left=False)
    plt.xlabel("sd1", fontsize=20)
    plt.ylabel("sd2", fontsize=20)
    plt.setp(ax.spines.values(), linewidth=2)
    plt.yticks([], linewidth=20)
    plt.xticks([])
    plt.savefig(output_path + "cluster_visualization.png",
                bbox_inches='tight',
                dpi=600)
    plt.savefig(output_path + "cluster_visualization.pdf",
                bbox_inches='tight',
                dpi=600)

    if n_pass == 3:
        from sklearn.datasets import make_blobs
        from sklearn.metrics import silhouette_samples, silhouette_score
        silhouette_avg = silhouette_score(data_df[['x', 'y']],
                                          data_df['cluster'])
        sample_silhouette_values = silhouette_samples(data_df[['x', 'y']],
                                                      data_df['cluster'])
        print(silhouette_avg)

        y_lower = 10
        import matplotlib.cm as cm
        #fig, (ax1, ax2) = plt.subplots(1, 2)
        fig = plt.figure(figsize=(4, 7))
        #fig.set_size_inches(18, 7)
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[data_df['cluster'] == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            plt.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        plt.title("The silhouette plot for the various clusters.")
        plt.xlabel("silhouette coefficient", fontsize=20)
        plt.ylabel("Cluster label", fontsize=20)
        plt.axvline(x=silhouette_avg, color="red", linestyle="--")

        plt.yticks([])  # Clear the yaxis labels / ticks
        plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
        sns.despine(bottom=False, left=False)
        fig.savefig(output_path + "/silhouette.pdf",
                    bbox_inches='tight',
                    dpi=600)
        fig.savefig(output_path + "/silhouette.png",
                    bbox_inches='tight',
                    dpi=600)

    #  #  MARKER FINDING
    data_df = pd.read_csv(output_path + "data.csv",
                          delimiter=",",
                          index_col=False)
    data_df.set_index('data', inplace=True)
    import pickle
    if n_pass == 2:
        path = 'Stardust_results/visualization_output/1_pass'
    if n_pass == 3:
        path = 'Stardust_results/visualization_output/2_pass'
    if n_pass == 4:
        path = 'Stardust_results/visualization_output/3_pass'
    if n_pass != 1:
        with open(path + '/de_genes_cluster.txt', 'rb') as fp:
            de_gene_cluster = pickle.load(fp)

        marker = []
        disp_marker = []
        for cl in range(n_clusters):
            cls = data_df[data_df['cluster'] == cl]
            gene_df = cls[cls['type'] == 'gene']
            f = 0
            for rank in range(len(de_gene_cluster)):
                if f == 1:
                    break
                for gene in de_gene_cluster[rank]:
                    if gene in list(gene_df.index):
                        disp_marker.append(gene)
                        #print(cl)
                        f = 1
                        break
        marker = disp_marker

        #sys.exit(0)

    # # CELL GENE MARKER

    # In[28]:
    from sklearn.neighbors import KNeighborsRegressor
    prev_pass_data = pd.read_csv(
        'Stardust_results/visualization_output/3_pass/data_openOrd.csv')
    prev_pass_data.set_index('data', inplace=True)
    data_df = pd.read_csv(output_path + '/data.csv')
    data_df.set_index('data', inplace=True)
    gene_df = data_df[data_df['type'] == 'gene']
    x_gene_fit = list(gene_df['x'])
    y_gene_fit = list(gene_df['y'])
    cells = list(prev_pass_data.index)
    cell_list = []
    x_coord = []
    y_coord = []

    for i in range(len(cells)):
        if cells[i] in list(data_df.index):
            cell_list.append(cells[i])
            x_coord.append(prev_pass_data.iloc[i]['x'])
            y_coord.append(prev_pass_data.iloc[i]['y'])

    prev_df = pd.DataFrame(index=cell_list)
    prev_df['x'] = x_coord
    prev_df['y'] = y_coord

    import numpy as np
    from sklearn.linear_model import Lasso
    from sklearn.neighbors import KNeighborsRegressor
    import pickle
    cells = []
    genes = []
    gene_coord_x = []
    gene_coord_y = []

    for i in range(n_clusters):
        clust_data = data_df[data_df['cluster'] == i]
        clust_cells = clust_data[clust_data['type'] == 'cell']
        clust_genes = clust_data[clust_data['type'] == 'gene']
        cells.extend(list(clust_cells.index))
        genes.extend(list(clust_genes.index))
        if len(list(clust_genes.index)) == 0:
            continue
        model1 = KNeighborsRegressor(n_neighbors=4)

        model2 = KNeighborsRegressor(n_neighbors=4)
        temp = []
        for cell in list(clust_cells.index):
            if cell in list(prev_df.index):
                temp.append(cell)
        clust_cells = clust_cells.loc[temp]
        model1.fit(
            np.array(list(clust_cells['x'])).reshape((-1, 1)),
            np.array(list(prev_df.loc[list(clust_cells.index)]['x'])).reshape(
                (-1, 1)))

        filename = output_path + '/sd_x_KNN_model.sav'
        pickle.dump(model1, open(filename, 'wb'))
        #model1 = pickle.load(open(filename, 'rb'))
        x_gene_pred = model1.predict(
            np.array(list(clust_genes['x'])).reshape((-1, 1)))
        gene_coord_x.extend(x_gene_pred)
        model2.fit(
            np.array(list(clust_cells['y'])).reshape((-1, 1)),
            np.array(list(prev_df.loc[list(clust_cells.index)]['y'])).reshape(
                (-1, 1)))

        filename = output_path + '/sd_y_KNN_model.sav'
        pickle.dump(model2, open(filename, 'wb'))
        #model2 = pickle.load(open(filename, 'rb'))
        y_gene_pred = model2.predict(
            np.array(list(clust_genes['y'])).reshape((-1, 1)))
        gene_coord_y.extend(y_gene_pred)

    with open(output_path + "/sd_gene_coord_x.txt", 'wb') as fp:
        pickle.dump(gene_coord_x, fp)
    with open(output_path + "/sd_gene_coord_y.txt", 'wb') as fp:
        pickle.dump(gene_coord_y, fp)

    #with open (output_path+"/sd_gene_coord_x.txt", 'rb') as fp:
    #        gene_coord_x = pickle.load(fp)
    #with open (output_path+"/sd_gene_coord_y.txt", 'rb') as fp:
    #        gene_coord_y = pickle.load(fp)

    import matplotlib.pyplot as plt, mpld3
    from scipy.spatial import ConvexHull, convex_hull_plot_2d
    prev_pass_data = pd.read_csv(
        'Stardust_results/visualization_output/3_pass/data_openOrd.csv')
    prev_pass_data["alpha"] = np.where(prev_pass_data['type'] == 'gene', 1.0,
                                       0.5)
    color_gene = ["light blue"]
    color_cell = ["red"]
    #fig,ax1 = plt.subplots()
    plt.figure(figsize=(6, 6))
    ax = sns.scatterplot(x="x",
                         y="y",
                         data=prev_pass_data[prev_pass_data['alpha'] == 0.5],
                         hue="type",
                         palette=sns.xkcd_palette(color_gene),
                         sizes=(10, 5),
                         size="type",
                         alpha=0.3,
                         s=10)
    #sns.scatterplot(x="x", y="y", data=data_df[data_df['alpha']==1.0],hue="type",palette=sns.xkcd_palette(color_cell),sizes=(20,5),size="type",marker="^",alpha=1.0,ax=ax,s=10)
    sns.scatterplot(x=gene_coord_x,
                    y=gene_coord_y,
                    palette=sns.xkcd_palette(color_cell),
                    sizes=(20, 5),
                    marker="^",
                    alpha=1.0,
                    ax=ax,
                    s=10)
    for c in range(n_clusters):
        p = data_df[data_df["cluster"] == c]
        p = p[['x', 'y']]
        points = p.values
        hull = ConvexHull(points)
        #for simplex in hull.simplices:
    #    sns.lineplot(points[simplex, 0], points[simplex, 1])
    x_list = []
    y_list = []
    if n_pass != 1:
        for m in marker:
            #x_list.append(data_df.loc[m]['x'])
            x_list.append(gene_coord_x[genes.index(m)])
            #y_list.append(data_df.loc[m]['y'])
            y_list.append(gene_coord_y[genes.index(m)])
        for label, x, y in zip(marker, x_list, y_list):
            plt.annotate(
                label,
                xy=(x, y),
                xytext=(-20, 20),
                textcoords='offset points',
                ha='right',
                va='bottom',
                #bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='-', connectionstyle='arc3,rad=0'))
    ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
    sns.despine(bottom=False, left=False)
    plt.xlabel("sd1", fontsize=20)
    plt.ylabel("sd2", fontsize=20)
    plt.setp(ax.spines.values(), linewidth=2)
    plt.yticks([], linewidth=20)
    plt.xticks([])
    plt.savefig(output_path + "sd_embedding.png", bbox_inches='tight', dpi=600)
    plt.savefig(output_path + "sd_embedding.pdf", bbox_inches='tight', dpi=600)

    import matplotlib.pyplot as plt, mpld3
    from scipy.spatial import ConvexHull, convex_hull_plot_2d
    #data_df["alpha"] = np.where(data_df['type'] == 'gene', 1.0, 0.5)
    prev_pass_data.set_index('data', inplace=True)
    temp_data = prev_pass_data[prev_pass_data['type'] == 'cell']
    temp_genes = data_df[data_df['type'] == 'gene']
    for pos in range(0, len(genes)):
        temp_genes.at[genes[pos], 'x'] = gene_coord_x[pos]
        temp_genes.at[genes[pos], 'y'] = gene_coord_y[pos]
    temp_data.append(temp_genes)
    color_gene = ["light blue"]
    color_cell = ["red"]
    n_clusters = len(data_df['cluster'].unique())
    colors = random.sample(seaborn_colors, n_clusters)
    #fig,ax1 = plt.subplots()
    plt.figure(figsize=(6, 6))
    ax = sns.scatterplot(x="x",
                         y="y",
                         data=temp_data,
                         hue="cluster",
                         palette=sns.xkcd_palette(colors),
                         s=2,
                         linewidth=0.0)
    #sns.scatterplot(x="x", y="y", data=data_df[data_df['alpha']==1.0],hue="type",palette=sns.xkcd_palette(color_cell),sizes=(20,5),size="type",marker="^",alpha=1.0,ax=ax,s=10)
    #sns.scatterplot(x=gene_coord_x, y=gene_coord_y,palette=sns.xkcd_palette(color_cell),sizes=(20,5),marker="^",alpha=1.0,ax=ax,s=20)
    for c in range(n_clusters):
        p = data_df[data_df["cluster"] == c]
        p = p[['x', 'y']]
        points = p.values
        hull = ConvexHull(points)
        #for simplex in hull.simplices:
    #    sns.lineplot(points[simplex, 0], points[simplex, 1])
    x_list = []
    y_list = []
    d1 = prev_pass_data[prev_pass_data['alpha'] == 0.5]
    for cl in range(n_clusters):
        plt.annotate(cl,
                     d1.loc[d1['cluster'] == cl, ['x', 'y']].mean(),
                     horizontalalignment='center',
                     verticalalignment='center',
                     size=10,
                     weight='bold',
                     color="black")
    if n_pass != 1:
        for m in marker:
            #x_list.append(data_df.loc[m]['x'])
            x_list.append(gene_coord_x[genes.index(m)])
            #y_list.append(data_df.loc[m]['y'])
            y_list.append(gene_coord_y[genes.index(m)])
        for label, x, y in zip(marker, x_list, y_list):
            plt.annotate(
                label,
                xy=(x, y),
                xytext=(-20, 20),
                textcoords='offset points',
                ha='right',
                va='bottom',
                #bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='-', connectionstyle='arc3,rad=0'))
    ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
    sns.despine(bottom=False, left=False)
    plt.xlabel("sd1", fontsize=20)
    plt.ylabel("sd2", fontsize=20)
    plt.setp(ax.spines.values(), linewidth=2)
    plt.yticks([], linewidth=20)
    plt.xticks([])
    plt.savefig(output_path + "sd_color_embedding.png",
                bbox_inches='tight',
                dpi=600)
    plt.savefig(output_path + "sd_color_embedding.pdf",
                bbox_inches='tight',
                dpi=600)
    #sys.exit(0)
    # # UMAP CELL GENE MARKER # #

    if n_pass == 4:

        import pickle
        with open('Stardust_results/build_output/1_pass/umap_coord.txt',
                  'rb') as fp:
            umap_coord = pickle.load(fp)
        louvain_df = pd.read_csv(
            'Stardust_results/build_output/1_pass/louvain_cluster_df.csv')
        louvain_df.set_index('Unnamed: 0', inplace=True)
        #data_df = pd.read_csv('F:/output/output_visualize_melanoma_pca/3rd_pass/data.csv')
        data_df = pd.read_csv(output_path + '/data.csv')
        data_df.set_index('data', inplace=True)
        gene_df = data_df[data_df['type'] == 'gene']
        x_gene_fit = list(gene_df['x'])
        y_gene_fit = list(gene_df['y'])
        cells = list(louvain_df.index)
        cell_list = []
        x_coord = []
        y_coord = []
        for i in range(len(cells)):
            if cells[i] in list(data_df.index):
                cell_list.append(cells[i])
                x_coord.append(umap_coord[i][0])
                y_coord.append(umap_coord[i][1])
        umap_df = pd.DataFrame(index=cell_list)
        umap_df['x'] = x_coord
        umap_df['y'] = y_coord

        import numpy as np
        from sklearn.linear_model import Lasso
        from sklearn.neighbors import KNeighborsRegressor
        import pickle
        cells = []
        genes = []
        gene_coord_x = []
        gene_coord_y = []
        for i in range(n_clusters):
            clust_data = data_df[data_df['cluster'] == i]
            clust_cells = clust_data[clust_data['type'] == 'cell']
            clust_genes = clust_data[clust_data['type'] == 'gene']
            cells.extend(list(clust_cells.index))
            genes.extend(list(clust_genes.index))
            if len(list(clust_genes.index)) == 0:
                continue
            model1 = KNeighborsRegressor(n_neighbors=5)

            model2 = KNeighborsRegressor(n_neighbors=5)

            model1.fit(
                np.array(list(clust_cells['x'])).reshape((-1, 1)),
                np.array(list(umap_df.loc[list(
                    clust_cells.index)]['x'])).reshape((-1, 1)))

            filename = output_path + '/scanpy_x_KNN_model.sav'
            pickle.dump(model1, open(filename, 'wb'))
            #model1 = pickle.load(open(filename, 'rb'))
            x_gene_pred = model1.predict(
                np.array(list(clust_genes['x'])).reshape((-1, 1)))
            gene_coord_x.extend(x_gene_pred)
            model2.fit(
                np.array(list(clust_cells['y'])).reshape((-1, 1)),
                np.array(list(umap_df.loc[list(
                    clust_cells.index)]['y'])).reshape((-1, 1)))

            filename = output_path + '/scanpy_y_KNN_model.sav'
            pickle.dump(model2, open(filename, 'wb'))
            #model2 = pickle.load(open(filename, 'rb'))
            y_gene_pred = model2.predict(
                np.array(list(clust_genes['y'])).reshape((-1, 1)))
            gene_coord_y.extend(y_gene_pred)

        with open(output_path + "/scanpy_gene_coord_x.txt", 'wb') as fp:
            pickle.dump(gene_coord_x, fp)
        with open(output_path + "/scanpy_gene_coord_y.txt", 'wb') as fp:
            pickle.dump(gene_coord_y, fp)

        #with open (output_path+"/scanpy_gene_coord_x.txt", 'rb') as fp:
        #    gene_coord_x = pickle.load(fp)
        #with open (output_path+"/scanpy_gene_coord_y.txt", 'rb') as fp:
        #    gene_coord_y = pickle.load(fp)

        #n_clusters = len(list(data_df['cluster'].unique()))

        u_map_x = []
        u_map_y = []
        for ind in list(data_df.index):
            if ind in list(louvain_df.index):

                u_map_x.append(umap_coord[list(
                    louvain_df.index).index(ind)][0])
                u_map_y.append(umap_coord[list(
                    louvain_df.index).index(ind)][1])
            else:
                u_map_x.append(gene_coord_x[genes.index(ind)])
                u_map_y.append(gene_coord_y[genes.index(ind)])
        data_df['umap_x'] = u_map_x
        data_df['umap_y'] = u_map_y

        #        colors = random.sample(seaborn_colors,n_clusters)
        #colors = colors3
        plt.figure(figsize=(5, 5))
        #cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
        ax = sns.scatterplot(x="umap_x",
                             y="umap_y",
                             data=data_df,
                             hue="cluster",
                             palette=sns.xkcd_palette(colors),
                             linewidth=0.0,
                             s=2)
        ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
        for cl in range(n_clusters):
            plt.annotate(cl,
                         data_df.loc[data_df['cluster'] == cl,
                                     ['umap_x', 'umap_y']].mean(),
                         horizontalalignment='center',
                         verticalalignment='center',
                         size=10,
                         weight='bold',
                         color="black")
        sns.despine(bottom=False, left=False)
        plt.xlabel("umap1", fontsize=20)
        plt.ylabel("umap2", fontsize=20)
        plt.setp(ax.spines.values(), linewidth=2)
        plt.yticks([], linewidth=20)
        plt.xticks([])
        plt.savefig(output_path + 'umap_clustering.png',
                    bbox_inches='tight',
                    dpi=600)
        plt.savefig(output_path + 'umap_clustering.pdf',
                    bbox_inches='tight',
                    dpi=600)

        import matplotlib.pyplot as plt, mpld3
        from scipy.spatial import ConvexHull, convex_hull_plot_2d
        data_df["alpha"] = np.where(data_df['type'] == 'gene', 1.0, 0.5)
        color_gene = ["light grey"]
        color_cell = ["red"]
        #fig,ax1 = plt.subplots()
        plt.figure(figsize=(6, 6))

        ax = sns.scatterplot(x="umap_x",
                             y="umap_y",
                             data=data_df[data_df['alpha'] == 0.5],
                             hue="type",
                             palette=sns.xkcd_palette(color_gene),
                             sizes=(10, 5),
                             size="type",
                             alpha=0.3,
                             s=10)
        sns.scatterplot(x="umap_x",
                        y="umap_y",
                        data=data_df[data_df['alpha'] == 1.0],
                        hue="type",
                        palette=sns.xkcd_palette(color_cell),
                        sizes=(20, 5),
                        size="type",
                        marker="^",
                        alpha=1.0,
                        ax=ax,
                        s=10)
        for c in range(n_clusters):
            p = data_df[data_df["cluster"] == c]
            p = p[['umap_x', 'umap_y']]
            points = p.values
            hull = ConvexHull(points)
            #for simplex in hull.simplices:
            #    sns.lineplot(points[simplex, 0], points[simplex, 1])
        x_list = []
        y_list = []
        for m in marker:
            x_list.append(data_df.loc[m]['umap_x'])
            #x_list.append(gene_coord_x[genes.index(m)])
            y_list.append(data_df.loc[m]['umap_y'])
            #y_list.append(gene_coord_y[genes.index(m)])
        for cl in range(n_clusters):
            plt.annotate(cl,
                         data_df.loc[data_df['cluster'] == cl,
                                     ['umap_x', 'umap_y']].mean(),
                         horizontalalignment='center',
                         verticalalignment='center',
                         size=10,
                         weight='bold',
                         color="black")
        for label, x, y in zip(marker, x_list, y_list):
            plt.annotate(
                label,
                xy=(x, y),
                xytext=(-20, 20),
                textcoords='offset points',
                ha='right',
                va='bottom',
                #bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='-', connectionstyle='arc3,rad=0'))
        ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
        sns.despine(bottom=False, left=False)
        plt.xlabel("umap1", fontsize=20)
        plt.ylabel("umap2", fontsize=20)
        plt.setp(ax.spines.values(), linewidth=2)
        plt.yticks([], linewidth=20)
        plt.xticks([])
        plt.savefig(output_path + 'umap_embedding.png',
                    bbox_inches='tight',
                    dpi=600)
        plt.savefig(output_path + 'umap_embedding.pdf',
                    bbox_inches='tight',
                    dpi=600)

        import matplotlib.pyplot as plt, mpld3
        from scipy.spatial import ConvexHull, convex_hull_plot_2d
        data_df["alpha"] = np.where(data_df['type'] == 'gene', 1.0, 0.5)
        color_gene = ["light grey"]
        color_cell = ["red"]
        #fig,ax1 = plt.subplots()
        plt.figure(figsize=(6, 6))
        #       colors = color
        ax = sns.scatterplot(x="umap_x",
                             y="umap_y",
                             data=data_df[data_df['alpha'] == 0.5],
                             hue="cluster",
                             linewidth=0.0,
                             sizes=(2, 5),
                             size="type",
                             palette=sns.xkcd_palette(colors),
                             s=2)
        sns.scatterplot(x="umap_x",
                        y="umap_y",
                        data=data_df[data_df['alpha'] == 1.0],
                        hue="type",
                        palette=sns.xkcd_palette(color_cell),
                        linewidth=0.1,
                        marker="^",
                        ax=ax,
                        alpha=1.0,
                        s=10)
        for c in range(n_clusters):
            p = data_df[data_df["cluster"] == c]
            p = p[['umap_x', 'umap_y']]
            points = p.values
            hull = ConvexHull(points)
            #for simplex in hull.simplices:
            #    sns.lineplot(points[simplex, 0], points[simplex, 1])
        x_list = []
        y_list = []
        for m in marker:
            x_list.append(data_df.loc[m]['umap_x'])
            y_list.append(data_df.loc[m]['umap_y'])
        for cl in range(n_clusters):
            plt.annotate(cl,
                         data_df.loc[data_df['cluster'] == cl,
                                     ['umap_x', 'umap_y']].mean(),
                         horizontalalignment='center',
                         verticalalignment='center',
                         size=10,
                         weight='bold',
                         color="black")
        for label, x, y in zip(marker, x_list, y_list):
            plt.annotate(
                label,
                xy=(x, y),
                xytext=(-20, 20),
                textcoords='offset points',
                ha='right',
                va='bottom',
                #bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='-', connectionstyle='arc3,rad=0'))
        ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
        sns.despine(bottom=False, left=False)
        plt.xlabel("umap1", fontsize=20)
        plt.ylabel("umap2", fontsize=20)
        plt.setp(ax.spines.values(), linewidth=2)
        plt.yticks([], linewidth=20)
        plt.xticks([])
        plt.savefig(output_path + 'umap_color_embedding.png',
                    bbox_inches='tight',
                    dpi=600)
        plt.savefig(output_path + 'umap_color_embedding.pdf',
                    bbox_inches='tight',
                    dpi=600)
Exemplo n.º 22
0
# NOTE: the intended approach is to average the MRI1/MRI2 rows into
# standard_recon and the MRI1_long/MRI2_long rows into long_recon, but the
# subjects do not line up between those columns yet (to be fixed in the bash
# script). Until then the two columns of each pair are pooled end to end.

# Pool each pair of columns into a single array, then discard NaN entries.
standard_recon = np.concatenate(
    [np.array(t[column]) for column in ('MRI1', 'MRI2')], axis=0)
standard_recon = standard_recon[np.logical_not(np.isnan(standard_recon))]

long_recon = np.concatenate(
    [np.array(t[column]) for column in ('MRI1_long', 'MRI2_long')], axis=0)
long_recon = long_recon[np.logical_not(np.isnan(long_recon))]

# Normalised histograms of the processing times, drawn in the order
# standard -> base -> long.
sns.set_style(style="white")

hist_standard, hist_base, hist_long = (
    sns.distplot(sample, norm_hist=True)
    for sample in (standard_recon, base, long_recon)
)

# Vertical marker at x = 2.8 spanning the bottom 17% of the axes height.
# NOTE(review): 2.8 is hard-coded; presumably an average value -- confirm.
plt.axvline(x=2.8, ymin=0, ymax=0.17)



# Scatterplot of processing time with TIV