def plot_zipf_law_on_corpus(corpus):
    """Plot log word count against log rank for a corpus, with a fitted line.

    The corpus is tokenised with getWordsFromCorpus, filtered with
    remove_stopwords_from_corpus_words and counted with FreqDist.  The
    natural log of each word's rank goes on the x-axis and the natural log
    of its count on the y-axis, so the plotted data agree with the axis
    labels and the slope of the least-squares line can be compared directly
    with the ideal Zipf value of -1.

    Args:
        corpus: passed straight to getWordsFromCorpus; the expected type is
            not visible in this function.

    Returns:
        None.  The figure is displayed with pylab.show().
    """
    words = getWordsFromCorpus(corpus)
    words = remove_stopwords_from_corpus_words(words)

    # most_common() is expected to yield (word, count) pairs, most frequent
    # first, so a pair's 1-based position in the list is the word's rank.
    ranked_counts = FreqDist(words).most_common()

    y = [math.log(count) for _, count in ranked_counts]
    x = [math.log(rank) for rank in range(1, len(y) + 1)]

    # Degree-1 least-squares fit of log(count) on log(rank).
    m, b = pylab.polyfit(x, y, 1)
    yp = pylab.polyval([m, b], x)

    pylab.plot(x, y, 'r')
    pylab.plot(x, yp, 'b')

    pylab.ylim([min(y), max(y)])
    pylab.xlim([min(x), max(x)])
    pylab.text(x=1,
               y=1,
               s="Best Fit Line (Blue) \nslope = {slope}".format(
                   slope=np.round(m, 2)))
    pylab.grid(True)
    pylab.ylabel('Counts of words (log)')
    pylab.xlabel('Ranks of words (log)')
    pylab.title(
        'ZIPF LAW TEST ON CORPUS. IDEALLY SLOPE OF THE LINE MUST BE = -1 for IDEAL ZIPF CASE'
    )
    pylab.show()
예제 #2
0
def plotScatter(pearsonStats,data,args,color='b'):
    """Scatter-plot data[0] against data[1] with a best-fit line and save it.

    A degree-1 least-squares line is fitted with pl.polyfit and drawn in red
    from x = 1 to x = max(data[0]).  A text box in the top-left corner of the
    axes reports the two values in pearsonStats and the fitted equation.

    Args:
        pearsonStats: indexable; item 0 is formatted with %.4f and item 1
            with %s.  Presumably an (r, p-value) pair -- confirm with caller.
        data: indexable; data[0] holds the x values, data[1] the y values.
        args: options object.  Attributes read here: log, label1, label2,
            pdf, out, galaxy, show.
        color: colour passed to ax.scatter as ``c``.

    Returns:
        None.  The figure is written to
        '<out>_<label1>_vs_<label2>[.log].<pdf|png>' (spaces in the labels
        become underscores).  When args.galaxy is true the file is then
        renamed to args.out.  When args.show is true the figure is shown.
    """
    fig = pl.figure()
    ax = fig.add_subplot(111)
    if args.log:
        ax.set_xscale('log')
        ax.set_yscale('log')

    ax.scatter(data[0], data[1], s=15, c=color, marker='o', alpha=1)
    if not args.log:
        # Freeze the limits chosen for the scatter so the fit line added
        # below cannot rescale the axes.
        ax.set_autoscale_on(False)
    ax.set_xlabel(args.label1)
    ax.set_ylabel(args.label2)

    m, b = pl.polyfit(data[0], data[1], 1)
    lineXs = [1, max(data[0])]
    ax.plot(lineXs, pl.polyval([m, b], lineXs), 'r-')

    pl.text(0.02, 0.95,
            'Pearson: %.4f, %s\nBest Fit: y=%.3f*x+%.3f' % (pearsonStats[0], pearsonStats[1], m, b),
            bbox=dict(facecolor='#87AACD', alpha=1),
            horizontalalignment='left',
            verticalalignment='top',
            transform=ax.transAxes)

    # construct outfile name
    extension = 'pdf' if args.pdf else 'png'
    logTag = '.log' if args.log else ''
    outName = '%s_%s_vs_%s%s.%s' % (args.out,
                                    args.label1.replace(' ', '_'),
                                    args.label2.replace(' ', '_'),
                                    logTag,
                                    extension)

    pl.savefig(outName)

    if args.galaxy:
        os.rename(outName, args.out)

    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print('Show?  %s' % (args.show,))
    if args.show:
        pl.show()
def evaluate_models_on_training(x, y, models):
    """
    Plot each polynomial model against the training data, one figure per model.

    Every figure shows the data points (x, y) as blue dots and the model's
    predictions as a red solid line.  The title reports the model's degree
    (named "linear", "quadratic" or "cubic" for degrees 1-3, otherwise
    "<n> degree"), the R-squared from r_squared(y, predictions) rounded to 5
    places and, for degree-1 models only, the value of se_over_slope rounded
    to 5 places.

    Args:
        x: an 1-d pylab array with length N, representing the x-coordinates of
            the N sample points
        y: an 1-d pylab array with length N, representing the y-coordinates of
            the N sample points
        models: a list containing the regression models you want to apply to
            your data. Each model is a pylab array storing the coefficients of
            a polynomial.

    Returns:
        None
    """
    # Look up by degree rather than indexing a list with len(model) - 2:
    # the list index went negative for a degree-0 model and called it "cubic".
    degree_names = {1: "linear", 2: "quadratic", 3: "cubic"}
    for model in models:
        predict_y = pylab.polyval(model, x)
        degree = len(model) - 1
        model_type = degree_names.get(degree, f"{degree} degree")
        #make the title
        title = f"Years against degrees C with {model_type} model \n R2 = {round(r_squared(y,predict_y),5)}"
        #If model is linear get se_over_slope and add to title
        if degree == 1:
            title += f"\nStandard error to slope ratio = {round(se_over_slope(x,y,predict_y,model),5)}"
        #Draw two pairs of values
        pylab.figure()
        pylab.plot(x, y, "bo", x, predict_y, "-r")
        pylab.title(title)
        pylab.xlabel("Years")
        pylab.ylabel("Temperature in degrees C")
        pylab.show()
예제 #4
0
파일: ps4.py 프로젝트: hailelagi/simulacra
def evaluate_models_on_training(x, y, models):
    """
    Plot each polynomial model against the data, one figure per model.

    Every figure shows the data points (x, y) as blue dots and the model's
    estimates as a red line.  The legend entry and the title both report the
    degree of the model (len(model) - 1) and its R-squared, computed with
    r_squared(y, estimates) and rounded to 5 places.

    Args:
        x: a list of length N, representing the x-coords of N sample points
        y: a list of length N, representing the y-coords of N sample points
        models: a list containing the regression models you want to apply to
            your data. Each model is a numpy array storing the coefficients of
            a polynomial.
    Returns:
        None
    """
    for model in models:
        est_y = pylab.polyval(model, x)
        r2 = r_squared(y, est_y)
        degree = len(model) - 1
        # The old label was a placeholder string with no degree in it and no
        # spacing; build the text the docstring asks for instead.
        summary = "degree {} model, R-squared = {}".format(degree, round(r2, 5))

        pylab.figure()
        pylab.plot(x, y, 'b o')

        pylab.plot(x, est_y, 'r', label=summary)
        pylab.xlabel("x-coords of N sample points")
        pylab.ylabel("y-coords of N sample points")
        pylab.legend(loc="best")
        pylab.title("Best fit: " + summary)
        pylab.show()
예제 #5
0
def plotScatter(pearsonStats,normedTxCntsList,opts):
    """Scatter-plot two count lists with a best-fit line and save a PNG.

    normedTxCntsList[0] is plotted against normedTxCntsList[1] in blue.  A
    degree-1 least-squares line is fitted with pl.polyfit and drawn in red
    from x = 1 to x = max(normedTxCntsList[0]).  A text box in the top-left
    corner of the axes reports the two values in pearsonStats and the fitted
    equation.

    Args:
        pearsonStats: indexable; item 0 is formatted with %.4f and item 1
            with %s.  Presumably an (r, p-value) pair -- confirm with caller.
        normedTxCntsList: indexable; item 0 holds the x values and item 1
            the y values.
        opts: options object.  Attributes read here: log, name_a, name_b,
            dir, show.

    Returns:
        None.  mkdirp(opts.dir) is called, then the figure is saved inside
        opts.dir as '<name_a>_vs_<name_b>[.log].png'.  When opts.show is
        true the figure is also shown.
    """
    import os

    fig = pl.figure()
    ax = fig.add_subplot(111)
    if opts.log:
        ax.set_xscale('log')
        ax.set_yscale('log')

    ax.scatter(normedTxCntsList[0], normedTxCntsList[1], s=15, c='b', marker='o', alpha=1)
    if not opts.log:
        # Freeze the limits chosen for the scatter so the fit line added
        # below cannot rescale the axes.
        ax.set_autoscale_on(False)
    ax.set_xlabel(opts.name_a)
    ax.set_ylabel(opts.name_b)

    m, b = pl.polyfit(normedTxCntsList[0], normedTxCntsList[1], 1)
    lineXs = [1, max(normedTxCntsList[0])]
    ax.plot(lineXs, pl.polyval([m, b], lineXs), 'r-')

    pl.text(0.01, 0.99,
            'Pearson: %.4f, %s\nBest Fit: y=%.3f*x+%.3f' % (pearsonStats[0], pearsonStats[1], m, b),
            bbox=dict(facecolor='#87AACD', alpha=1),
            horizontalalignment='left',
            verticalalignment='top',
            transform=ax.transAxes)

    mkdirp(opts.dir)
    logTag = '.log' if opts.log else ''
    fileName = '%s_vs_%s%s.png' % (opts.name_a, opts.name_b, logTag)
    # Join rather than concatenate: without a trailing separator on opts.dir
    # plain concatenation saved the file next to the directory, not in it.
    pl.savefig(os.path.join(opts.dir, fileName))

    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print('Show?  %s' % (opts.show,))
    if opts.show:
        pl.show()
예제 #6
0
파일: ps4.py 프로젝트: serarm/edx_mitx
def evaluate_models_on_training(x, y, models):
    """
    Plot each polynomial model against the data and print the R-squared values.

    For every model a new figure shows the data points as blue dots
    ("Measured points") and the model's estimates as a red line.  The legend
    entry names the model's degree (len(model) - 1) and the title reports
    the degree together with the R-squared from r_squared, rounded to 5
    places.  After all models are plotted, the list of unrounded R-squared
    values is printed.

    Args:
        x: a list of length N, representing the x-coords of N sample points
        y: a list of length N, representing the y-coords of N sample points
        models: a list containing the regression models you want to apply to
            your data. Each model is a numpy array storing the coefficients of
            a polynomial.
    Returns:
        None
    """
    r2 = []
    xVals = pylab.array(x)
    yVals = pylab.array(y)
    for model in models:
        # Evaluate the polynomial once and reuse it for scoring and plotting.
        estYVals = pylab.polyval(model, xVals)
        score = r_squared(y, list(estYVals))
        r2.append(score)
        degree = len(model) - 1

        pylab.figure()
        pylab.plot(xVals, yVals, 'bo', label='Measured points')
        # The old label printed 1 / model[0] as a "k" value for every degree,
        # which is meaningless here and divides by zero when the leading
        # coefficient is 0.
        pylab.plot(xVals,
                   estYVals,
                   'r',
                   label='Degree {} fit'.format(degree))
        pylab.legend(loc='best')
        pylab.title('Degree {} model, R-squared = {}'.format(
            degree, round(score, 5)))
        pylab.show()
    print(r2)
예제 #7
0
파일: P3.py 프로젝트: scattm/MIT6002x
      END of each time step, and fox_populations is a record of the fox population
      at the END of each time step.

    Both lists should be `numSteps` items long.
    """

    ret_rabbit = [CURRENTRABBITPOP]
    ret_fox = [CURRENTFOXPOP]
    for i in range(numSteps):
        if CURRENTFOXPOP >= 10 and CURRENTRABBITPOP >= 10:
            rabbitGrowth()
            foxGrowth()
        ret_rabbit.append(CURRENTRABBITPOP)
        ret_fox.append(CURRENTFOXPOP)

    return ret_rabbit, ret_fox

# Run the simulation for 200 steps and keep both population histories.
rp, fp = runSimulation(200)

# First figure: the simulated populations themselves.
rl = pyplot.plot(rp, label="Rabbit population")[0]
fl = pyplot.plot(fp, label="Fox population")[0]
pyplot.legend(handles=[rl, fl])
pyplot.show()

# Second figure: a degree-2 polynomial fitted to each history and evaluated
# at the same step indices it was fitted on.
rcoeff = pylab.polyfit(range(len(rp)), rp, 2)
fcoeff = pylab.polyfit(range(len(fp)), fp, 2)
rcl = pyplot.plot(pylab.polyval(rcoeff, range(len(rp))),
                  label="Rabbit Coefficients")[0]
fcl = pyplot.plot(pylab.polyval(fcoeff, range(len(fp))),
                  label="Fox Coefficients")[0]
pyplot.legend(handles=[rcl, fcl])
pyplot.show()
예제 #8
0
from geobricks_raster_correlation.core.raster_correlation_core import get_correlation
from matplotlib import pyplot as plt
from matplotlib.pylab import polyfit, polyval

# Paths to the two raster files being compared
raster_path1 = "../../tests/data/geoserver_data_dir/data/workspace/wheat_actual_biomprod_201010_doukkala/wheat_actual_biomprod_201010_doukkala.geotiff"
raster_path2 = "../../tests/data/geoserver_data_dir/data/workspace/wheat_potential_biomprod_201010_doukkala/wheat_potential_biomprod_201010_doukkala.geotiff"

# Number of sampling bins
bins = 150

corr = get_correlation(raster_path1, raster_path2, bins)

# Flatten every series into parallel x / y / colour lists.
x = []
y = []
colors = []
for serie in corr['series']:
    for data in serie['data']:
        x.append(data[0])
        y.append(data[1])
        # scatter() needs one colour per point (or exactly one overall), so
        # repeat the series colour for each of its points instead of storing
        # it once per series.
        colors.append(serie['color'])

# Adding regression line (degree-1 least-squares fit)
(m, b) = polyfit(x, y, 1)
yp = polyval([m, b], x)
plt.plot(x, yp)

# plotting scatter
plt.scatter(x, y, c=colors)
plt.show()
예제 #9
0
        if opts.log:
            ax.set_xscale('log')
            ax.set_yscale('log')
        
        
        ax.scatter(vecFile[1],vecFile[2], s=15, c='b', marker='o', alpha=1)
        if not opts.log:
            ax.set_autoscale_on(False)
        ax.set_xlabel(vecFile[0][0])
        ax.set_ylabel(vecFile[0][1])
        upperLim = max(vecFile[1]+vecFile[2])
        

        
        m,b  = pl.polyfit(vecFile[1],vecFile[2],1)
        bfYs = pl.polyval([m,b], [1,max(vecFile[1])])
        
        ax.plot([1,max(vecFile[1])],bfYs,'r-')
        
        pl.text(0.01,0.99,'Pearson: %.4f, %s\nBest Fit: y=%.3f*x+%.3f' % (pearson[0],pearson[1],m,b),
                bbox=dict(facecolor='#87AACD', alpha=1),
                horizontalalignment='left',
                verticalalignment='top',
                transform = ax.transAxes)
        
        
        pl.savefig('%s_vs_%s.png' % (vecFile[0][0],vecFile[0][1]))
        print 'Show?  %s' % (opts.show)
        if opts.show:
            pl.show()
    
from geobricks_raster_correlation.core.raster_correlation_core import get_correlation
from matplotlib import pyplot as plt
from matplotlib.pylab import polyfit, polyval

# Paths to the two raster files being compared
raster_path1 = "../../tests/data/geoserver_data_dir/data/workspace/wheat_actual_biomprod_201010_doukkala/wheat_actual_biomprod_201010_doukkala.geotiff"
raster_path2 = "../../tests/data/geoserver_data_dir/data/workspace/wheat_potential_biomprod_201010_doukkala/wheat_potential_biomprod_201010_doukkala.geotiff"


# Number of sampling bins
bins = 150

corr = get_correlation(raster_path1, raster_path2, bins)

# Flatten every series into parallel x / y / colour lists.
x = []
y = []
colors = []
for serie in corr['series']:
    for data in serie['data']:
        x.append(data[0])
        y.append(data[1])
        # scatter() needs one colour per point (or exactly one overall), so
        # repeat the series colour for each of its points instead of storing
        # it once per series.
        colors.append(serie['color'])

# Adding regression line (degree-1 least-squares fit)
(m, b) = polyfit(x, y, 1)
yp = polyval([m, b], x)
plt.plot(x, yp)

# plotting scatter
plt.scatter(x, y, c=colors)
plt.show()
예제 #11
0
        #ax.set_xscale('log')
        #ax.set_yscale('log')
        
        ax.scatter(vector0,vector1, s=10, c='b', marker='o', alpha=0.6)
        
        # -- set axis labels --
        xLab = DEGseqParser._file.name.split('_')[0]
        yLab = DEGseqParser._file.name.split('_')[2]
        ax.set_xlabel(xLab)
        ax.set_ylabel(yLab)

        
        m,b  = pl.polyfit(vector0,vector1,1)
        min_xVec = min(vector0)
        max_xVec = max(vector0)
        bfYs = pl.polyval([m,b], [min_xVec,max_xVec])
        
        ax.plot([min_xVec,max_xVec],bfYs,'r-')
        
        pl.text(0.01,0.99,'Pearson: %.4f, %s\nBest Fit: y=%.3f*x+%.3f' % (pearson[0],pearson[1],m,b),
                bbox=dict(facecolor='#87AACD', alpha=1),
                horizontalalignment='left',
                verticalalignment='top',
                transform = ax.transAxes)
        
        

        pl.savefig('%s_vs_%s.indvDEG.png' % (xLab,yLab))
        print 'Show?  %s' % (opts.show)
        if opts.show:
            pl.show()       
예제 #12
0
    Both lists should be `numSteps` items long.
    """

    ret_rabbit = [CURRENTRABBITPOP]
    ret_fox = [CURRENTFOXPOP]
    for i in range(numSteps):
        if CURRENTFOXPOP >= 10 and CURRENTRABBITPOP >= 10:
            rabbitGrowth()
            foxGrowth()
        ret_rabbit.append(CURRENTRABBITPOP)
        ret_fox.append(CURRENTFOXPOP)

    return ret_rabbit, ret_fox


# Simulate 200 time steps; rp and fp are the rabbit and fox histories.
(rp, fp) = runSimulation(200)

# Plot the raw histories together and label them in the legend.
(rl,) = pyplot.plot(rp, label="Rabbit population")
(fl,) = pyplot.plot(fp, label="Fox population")
pyplot.legend(handles=[rl, fl])
pyplot.show()

# Fit a quadratic to each history against its step index, then plot the
# fitted values for both species on a second figure.
rcoeff = pylab.polyfit(range(len(rp)), rp, deg=2)
(rcl,) = pyplot.plot(
    pylab.polyval(rcoeff, range(len(rp))),
    label="Rabbit Coefficients",
)
fcoeff = pylab.polyfit(range(len(fp)), fp, deg=2)
(fcl,) = pyplot.plot(
    pylab.polyval(fcoeff, range(len(fp))),
    label="Fox Coefficients",
)
pyplot.legend(handles=[rcl, fcl])
pyplot.show()