예제 #1
0
파일: jentest.py 프로젝트: vintej/laspdev
def gen_list_gvf(classes):
    """Draw random per-peer rates until a Jenks split fits well enough.

    Three groups of random integers are drawn with ``np.random.randint``
    (upper bound exclusive): ``len(c1_peerId)`` values from 1..4,
    ``len(c3_peerId)`` values from 75..99, and one value from 30..49 for each
    remaining node reported by ``ND.get_allNodes()``. The concatenated array
    is scored with ``goodness_of_variance_fit``; while the score stays below
    0.5 a fresh array is drawn and scored again.

    Relies on the module-level names ``c1_peerId``, ``c3_peerId``, ``ND``,
    ``goodness_of_variance_fit`` and ``jenks``.

    :param classes: initial number of Jenks classes. When it is 2 the loop
        body never runs and ``([], [], 0.0)`` is returned.
    :return: tuple ``(segs, arr, gvf)`` -- the Jenks breaks, the array they
        were computed from, and that array's goodness of variance fit.
    """

    def _draw_rates():
        # Draw order (class 1, class 3, class 2) is kept as in the original
        # code so that seeded runs produce the same numbers.
        low = np.random.randint(1, 5, len(c1_peerId))
        high = np.random.randint(75, 100, len(c3_peerId))
        middle = np.random.randint(
            30, 50,
            len(ND.get_allNodes()) - len(c3_peerId) - len(c1_peerId))
        return low, middle, high

    gvf = 0.0
    nclasses = classes

    randar1, randar2, randar3 = _draw_rates()
    randar = np.concatenate((randar1, randar2, randar3))
    print(np.sort(randar))

    segs = []
    arr = []
    while gvf < .5 and nclasses != 2:
        gvf = goodness_of_variance_fit(randar, nclasses)
        print("RandIntegers:" + str(np.sort(randar)))
        print("GVF: " + str(gvf))
        print(nclasses)
        segs = jenks(randar, nclasses)
        print(segs)
        # Remember the array that produced ``segs``; ``randar`` may be
        # replaced below before the loop condition is re-evaluated.
        arr = randar
        nclasses += 1
        if nclasses > 3:
            # Never go beyond three classes: clamp and retry on new data.
            print("Randoming randar")
            nclasses = 3
            randar1, randar2, randar3 = _draw_rates()
            randar = np.concatenate((randar1, randar2, randar3))
        print("New Randar" + str(randar))
        print("New Randar1" + str(randar1))
        print("New Randar2" + str(randar2))
        print("New Randar3" + str(randar3))

    return segs, arr, gvf
예제 #2
0
def calc_stops(vals, ramp, nodata=-99):
    """Pair values (or their Jenks breaks) with colors from a named ramp.

    Looks up ``ramp`` in the module-level ``colors`` table and uses its
    8-class palette. If the first remaining value converts to ``float`` the
    values are classified with ``jenks`` and each break is zipped with a
    color; if that raises ``ValueError`` each distinct value is zipped with a
    color instead, repeating the palette as often as needed.

    :param vals: iterable of values; entries equal to ``nodata`` are dropped.
    :param ramp: key into ``colors``.
    :param nodata: sentinel value to ignore (default -99).
    :return: list of ``(break_or_value, color)`` tuples; empty when no values
        remain after removing ``nodata``.
    :raises KeyError: if ``ramp`` is not a key of ``colors``.
    """
    vals = [v for v in vals if v != nodata]
    try:
        _ramps = colors[ramp]
    except KeyError:
        raise KeyError("ramp must be one of {}".format(', '.join(
            colors.keys())))

    n_classes = 8
    # Work on a copy: the palette is widened below, and doing that in place
    # would permanently grow the list stored in the shared ``colors`` table.
    _colors = list(_ramps[n_classes])

    if not vals:
        # Nothing to classify; previously this crashed on ``vals[0]``.
        return []

    try:
        float(vals[0])
        _breaks = [float(x) for x in jenks(vals, n_classes)]
        _stops = list(zip(_breaks, _colors))
    except ValueError:
        uniq = set(vals)
        # Broadcast colors in cycle; an empty palette cannot be widened, so
        # skip the loop rather than spin forever.
        while _colors and len(_colors) < len(uniq):
            _colors = _colors + _colors

        _stops = list(zip(uniq, _colors))

    return _stops
예제 #3
0
 def fit_posteriors(self, document, desired_gvf=0.8):
     """Select the records whose posterior falls nearest the top Jenks value.

     :param document: ``'problems'`` or ``'questions'``; selects which
         collection of ``self.db`` is read. Any other value returns ``None``.
     :param desired_gvf: goodness-of-fit threshold at which the search for
         the class count stops
     :return: list of the records whose posterior is closest to the last
         value returned by ``jenks``
     """
     if document == 'problems':
         records = self.db.problems.find()
     elif document == 'questions':
         records = self.db.questions.find()
     else:
         return None

     # Keep the score list and the position -> record map in step.
     scores = []
     record_at = {}
     for position, record in enumerate(records):
         scores.append(float(record['posterior']))
         record_at[position] = record
     values = np.array(scores)

     # NOTE(review): the count starts at 0 and is incremented after the
     # scoring call, so ``jenks`` below runs with one class more than the
     # count that reached the threshold -- confirm this is intended.
     fit = 0.0
     n_classes = 0
     while fit < desired_gvf:
         fit = natural_break.gvf(values, n_classes)
         n_classes += 1

     centers = jenks(values, n_classes)
     top = len(centers) - 1

     chosen = []
     for position, score in enumerate(scores):
         # Smallest (distance, index) pair identifies the nearest center.
         ranked = sorted(
             (abs(score - center), k) for k, center in enumerate(centers))
         if ranked[0][1] == top:
             chosen.append(record_at[position])
     return chosen
예제 #4
0
파일: jentest.py 프로젝트: vintej/laspdev
def goodness_of_variance_fit(array, classes):
    """Return the goodness of variance fit of a Jenks split of ``array``.

    The result is ``(SDAM - SDCM) / SDAM``, where SDAM is the sum of squared
    deviations from the overall mean and SDCM is the sum of the per-class
    sums of squared deviations from each class mean.

    :param array: values to classify; must support ``.mean()`` and
        element-wise arithmetic (presumably a numpy array -- confirm)
    :param classes: number of classes passed on to ``jenks``
    :return: the fit ratio
    """
    breaks = jenks(array, classes)

    # Class label of every value, as assigned by the external ``classify``.
    labels = np.array([classify(value, breaks) for value in array])
    highest = max(labels)

    # Total squared deviation around the global mean.
    sdam = np.sum((array - array.mean()) ** 2)

    # Accumulate the squared deviation inside each class 1..highest.
    sdcm = 0
    for zone in range(highest):
        members = np.array([
            array[pos] for pos, label in enumerate(labels)
            if zone + 1 == label
        ])
        sdcm = sdcm + np.sum((members - members.mean()) ** 2)

    return (sdam - sdcm) / sdam
def goodness_of_variance_fit(array, classes):
    '''Score how well a Jenks split with ``classes`` classes fits ``array``.

    Returns ``(SDAM - SDCM) / SDAM``: SDAM is the squared deviation of all
    values from the overall mean, SDCM the summed squared deviation of every
    class from its own mean.

    Originally written by camdenl:
        https://stats.stackexchange.com/questions/143974/jenks-natural-breaks-in-python-how-to-find-the-optimum-number-of-breaks/144075
    '''
    break_points = jenks(array, classes)

    # One class label per input value, from the external ``classify``.
    zone_of = np.array([classify(item, break_points) for item in array])
    zone_count = max(zone_of)

    # Group the input values by class label 1..zone_count.
    grouped = []
    for zone in range(zone_count):
        positions = [pos for pos, label in enumerate(zone_of)
                     if zone + 1 == label]
        grouped.append(np.array([array[pos] for pos in positions]))

    # Spread around the global mean versus spread inside each class.
    sdam = np.sum((array - array.mean()) ** 2)
    sdcm = sum([np.sum((group - group.mean()) ** 2) for group in grouped])

    return (sdam - sdcm) / sdam
예제 #6
0
def calc_breaks_natural(values, n_classes):
    """Return the Jenks breaks of ``values`` as a list of floats.

    A falsy ``values`` (empty or ``None``) yields an empty list without
    calling ``jenks``.
    """
    if not values:
        return []
    return [float(bp) for bp in jenks(values, n_classes)]
예제 #7
0
def calc_stops(vals, ramp, nodata=-99):
    """Pair values (or their Jenks breaks) with colors from a named ramp.

    Looks up ``ramp`` in the module-level ``colors`` table and uses its
    8-class palette. If the first remaining value converts to ``float`` the
    values are classified with ``jenks`` and each break is zipped with a
    color; if that raises ``ValueError`` each distinct value is zipped with a
    color instead, repeating the palette as often as needed.

    :param vals: iterable of values; entries equal to ``nodata`` are dropped.
    :param ramp: key into ``colors``.
    :param nodata: sentinel value to ignore (default -99).
    :return: list of ``(break_or_value, color)`` tuples; empty when no values
        remain after removing ``nodata``.
    :raises KeyError: if ``ramp`` is not a key of ``colors``.
    """
    vals = [v for v in vals if v != nodata]
    try:
        _ramps = colors[ramp]
    except KeyError:
        raise KeyError("ramp must be one of {}".format(', '.join(colors.keys())))

    n_classes = 8
    # Work on a copy: the palette is widened below, and doing that in place
    # would permanently grow the list stored in the shared ``colors`` table.
    _colors = list(_ramps[n_classes])

    if not vals:
        # Nothing to classify; previously this crashed on ``vals[0]``.
        return []

    try:
        float(vals[0])
        _breaks = [float(x) for x in jenks(vals, n_classes)]
        _stops = list(zip(_breaks, _colors))
    except ValueError:
        uniq = set(vals)
        # Broadcast colors in cycle; an empty palette cannot be widened, so
        # skip the loop rather than spin forever.
        while _colors and len(_colors) < len(uniq):
            _colors = _colors + _colors

        _stops = list(zip(uniq, _colors))

    return _stops
예제 #8
0
    def get_no_transaction_segments(self, n_breaks=10, visualize=False):
        """
        Compute Jenks breaks over column 1 of the merchant data
        (the number of transactions / frequency column).

        :param n_breaks: class count handed to the jenks algorithm
        :type n_breaks: int
        :param visualize: when true, also plot the sorted values with the
            breaks drawn as vertical lines
        :type visualize: bool
        :return: the break values as an array
        :rtype: numpy.core.numeric.array
        """
        # Column 1 holds the frequency; jenks is fed the values in order.
        frequencies = sorted(self._merchant_data[:, 1].tolist())
        breaks = jenks(frequencies, n_breaks)

        if visualize:
            plot_title = (
                "Segmentations of Number of Transactions With Jenks Natural Breaks"
            )
            plotlyvisualize.segments_plot(frequencies,
                                          vertical_lines=breaks,
                                          title=plot_title,
                                          out_path=PLOT_OUT_DIR)
        return np.array(breaks)
예제 #9
0
파일: plot.py 프로젝트: ritviksahajpal/LUH2
def get_cb_range(arr=np.empty([2, 2]),
                 xaxis_min=0.0,
                 xaxis_max=1.1,
                 xaxis_step=0.1,
                 do_jenks=True):
    """
    Build the tick values for a colorbar, either from Jenks breaks of the
    data or as an evenly spaced range.

    https://github.com/perrygeo/jenks
    :param arr: 2-D array of values; any other rank terminates the process
        via ``sys.exit(0)``
    :param xaxis_min: start of the evenly spaced range (``do_jenks=False``)
    :param xaxis_max: exclusive end of the evenly spaced range
    :param xaxis_step: step of the evenly spaced range
    :param do_jenks: if True derive the values from ``jenks``; otherwise
        return ``np.arange(xaxis_min, xaxis_max, xaxis_step)``
    :return: 1-D array of tick values
    """
    # Array can only have shape == 2
    if len(np.shape(arr)) != 2:
        sys.exit(0)

    if not do_jenks:
        return np.arange(xaxis_min, xaxis_max, xaxis_step)

    # ``np.round_`` was removed in NumPy 2.0; ``np.round`` is the same
    # function and exists in every NumPy release.
    rounded = np.unique(np.round(arr, decimals=1))

    # Select 11 elements, discard the highest
    breaks = np.array(jenks(rounded.data, 11))[:-1]

    # return only the unique elements, sometimes jenks selects duplicate elements
    return np.unique(breaks)
예제 #10
0
 def segmentation(self, data_series, n_breaks, all_breaks=None, limit=1000):
     """
     Recursively segment ``data_series`` with Jenks breaks.

     Breaks are computed with ``n_breaks`` classes and merged into
     ``all_breaks``. The most populous interval is then located; if it holds
     more than ``limit`` values and halving ``n_breaks`` still leaves at
     least one class, that interval is segmented again with half as many
     classes.

     :param data_series: data series that need to be segmented
     :type data_series: pandas.core.series.Series
     :param n_breaks: number of classes for each jenks call (not the total
         number of breaks finally returned)
     :type n_breaks: int
     :param all_breaks: breaks accumulated so far; callers normally omit it.
         Defaults to a fresh empty list on every call.
     :type all_breaks:
     :param limit: population above which the most populous interval is
         segmented again
     :type limit: int
     :return: all the breaks
     :rtype:
     """
     # A literal ``[]`` default is created once and shared between calls;
     # build a new list per call instead.
     if all_breaks is None:
         all_breaks = []

     breaks = jenks(data_series.tolist(), n_breaks)
     start_interval, end_interval = self._find_most_populous_break(
         breaks, data_series)
     # Strictly inside the interval: values equal to either bound are dropped.
     most_populous_chunk_series = data_series[
         (data_series > start_interval) & (data_series < end_interval)]
     all_breaks = self._merge_breaks(all_breaks, breaks)

     halved = int(n_breaks / 2)
     if most_populous_chunk_series.size > limit and halved >= 1:
         return self.segmentation(most_populous_chunk_series, halved,
                                  all_breaks, limit)
     return all_breaks
예제 #11
0
def calc_breaks_natural(values, n_classes):
    """Compute Jenks natural breaks for ``values`` and return them as floats.

    When ``values`` is falsy (empty or ``None``) ``jenks`` is not called and
    an empty list is returned.
    """
    breaks = jenks(values, n_classes) if values else []
    return [float(bp) for bp in breaks]
예제 #12
0
파일: tests.py 프로젝트: joaogb/jenks
def test_json():
    """Check the 5-class Jenks breaks of the ``test.json`` fixture."""
    # Close the fixture deterministically instead of leaking the handle.
    with open('test.json') as fixture:
        data = json.load(fixture)
    breaks = jenks(data, 5)
    assert [round(v, 6) for v in breaks] == [0.002811,
                                             2.093548,
                                             4.205495,
                                             6.178148,
                                             8.091759,
                                             9.997983]
예제 #13
0
파일: tests.py 프로젝트: willardmr/jenks
def test_json():
    """Check the 5-class Jenks breaks of the ``test.json`` fixture.

    This variant of ``jenks`` returns a ``(breaks, groups)`` pair; only the
    breaks are asserted on.
    """
    # Close the fixture deterministically instead of leaking the handle.
    with open('test.json') as fixture:
        data = json.load(fixture)
    breaks, groups = jenks(data, 5)
    assert [round(v, 6) for v in breaks] == [0.002811,
                                             2.093548,
                                             4.205495,
                                             6.178148,
                                             8.091759,
                                             9.997983]
예제 #14
0
def get_jenks_breaks(col_index, num_breaks):
    """Return the Jenks breaks of one column of the ``QUAKE_PARSED`` file.

    The file is read as comma-separated text; its first line is discarded
    and field ``col_index`` of every remaining line is converted to float.

    :param col_index: zero-based index of the field to classify
    :param num_breaks: class count passed to ``jenks``
    :return: whatever ``jenks`` returns for the column values
    """
    with open(QUAKE_PARSED, 'r') as f:
        rows = f.readlines()
        rows.pop(0)  # drop the first line
        data = [float(row.strip().split(',')[col_index]) for row in rows]
    return jenks(data, num_breaks)
예제 #15
0
def build_jenks(target):
    """Classify ``target`` with the smallest class count that fits well.

    Class counts 2 through 9 are tried in order and the search stops at the
    first one whose goodness of variance fit is no longer below 0.95; if none
    qualifies, 9 classes are used. Progress is printed along the way.

    :param target: values to classify
    :return: column vector (shape ``(n, 1)``) of class labels produced by
        ``classify`` for every value in ``target``
    """
    for nclasses in range(2, 10):
        gvf = goodness_of_variance_fit(target, nclasses)
        print( "\tGVF for {0} classes: {1}".format(nclasses, gvf))
        if not gvf < 0.95:
            break

    # ``nclasses`` is now the last class count that was scored.
    breaks = jenks(target, nclasses)
    print("Breaks: ", breaks)
    labels = [classify(value, breaks) for value in target]
    classified = np.array(labels).reshape(-1, 1)
    print(classified.shape)
    return classified
예제 #16
0
def cluster(items, value=None, K=None):
    """Group ``items`` into clusters delimited by Jenks breakpoints.

    Every item is mapped to a number with ``value``; the Jenks breakpoints of
    those numbers define half-open intervals ``(previous, current]`` and the
    items falling in each interval form one cluster. Empty intervals are
    skipped. Diagnostics are written to ``sys.stderr``.

    :param items: sequence of items to cluster
    :param value: callable mapping an item to a comparable number
    :param K: class count passed to ``jenks``
    :return: list of clusters (lists of items). When there are no more items
        than ``K`` each item becomes its own cluster.
    :raises ValueError: if ``value`` or ``K`` is not supplied
    """
    if value is None:
        raise ValueError("Distance function not set")
    if K is None:
        raise ValueError("Parameter K not set")

    if len(items) <= K:
        sys.stderr.write(
            "WARNING: NOT ENOUGH ITEMS!\nInput List Size: {}\n".format(
                len(items)))
        return [[i] for i in items]

    breakpoints = sorted(map(value, items))
    sys.stderr.write("Sorted values list:\n{}\n".format(breakpoints))

    # Find natural jenks breakpoints of items
    breakpoints = jenks(breakpoints, K)
    sys.stderr.write("Breakpoints:\n{}\n".format(breakpoints))

    clustered_items = []
    last_bp = None
    for bp in breakpoints:
        # The first breakpoint only opens the first interval.
        # NOTE(review): intervals are (last_bp, bp], so items whose value
        # equals the first breakpoint land in no cluster -- confirm intended.
        if last_bp is not None:
            # Real lists are required here: on Python 3 ``filter``/``map``
            # return iterators, which have no ``len()`` and log as
            # ``<map object ...>``.
            between_items = [
                item for item in items if last_bp < value(item) <= bp
            ]
            sys.stderr.write("Values between {} and {}:\n{}\n".format(
                last_bp, bp, [value(item) for item in between_items]))
            if between_items:
                clustered_items.append(between_items)
        last_bp = bp
    sys.stderr.write("{} clusters found (K={})\nItems:\n{}\n".format(
        len(clustered_items), K, clustered_items))
    return clustered_items
예제 #17
0
def myjenks(array, label, sz=6):
    """Create classification breaks for the array.

    The distinct Jenks breaks are sorted, breaks after the first that are not
    above 0.01 are dropped when the second break is below 0.01, and a leading
    0 is nudged to 0.001 when the next break is larger than that.

    :param array: values to classify
    :param label: not used by the computation (only referenced by
        commented-out debugging output in the original)
    :param sz: class count passed to ``jenks``
    :return: list of float breaks, or ``[0]`` when fewer than two distinct
        breaks remain or the largest break is 0
    """
    breaks = sorted(set(jenks(array, sz)))
    # Some failures happen when number of values > 0 is less than 6: with a
    # single distinct break ``breaks[1]`` below would raise IndexError, so
    # bail out first with the same fallback the later guard uses.
    if len(breaks) < 2 or max(breaks) == 0:
        return [0]
    if breaks[1] < 0.01:
        # Keep the lowest break, drop the near-zero ones that follow it.
        breaks = [breaks[0]] + [b for b in breaks[1:] if b > 0.01]
    if max(breaks) == 0 or len(breaks) < 2:
        return [0]
    if breaks[0] == 0 and breaks[1] > 0.001:
        breaks[0] = 0.001
    return [float(b) for b in breaks]
예제 #18
0
def gjsonJenks(polyStats, polys, inP, classes, spacing = None):
    #get the break points
    classes = jenks(polyStats, classes)

    #do the actual classification
    classified = np.array([classify(i,classes) for i in polyStats])

    #max value of zones
    maxz = max(classified)

    #nested list of zone indices
    zoneIndices = [[idx  for idx,val in enumerate(classified) if zone + 1 == val] for zone in range(maxz)]

    #nested list of polygons corresponding to each zone number
    polySort = [[polys[index] for index in zone] for zone in zoneIndices]

    #merge geometries, generate list of zones, create geojson feature collection from list
    polyComb = [cascaded_union(polyz).simplify(.01) for polyz in polySort]
    
    #if simplifying is needed        
    if spacing is not None:
예제 #19
0
def cluster(items, value=None, K=None):
    """Group ``items`` into clusters delimited by Jenks breakpoints.

    Every item is mapped to a number with ``value``; the Jenks breakpoints of
    those numbers define half-open intervals ``(previous, current]`` and the
    items falling in each interval form one cluster. Empty intervals are
    skipped. Diagnostics are written to ``sys.stderr``.

    :param items: sequence of items to cluster
    :param value: callable mapping an item to a comparable number
    :param K: class count passed to ``jenks``
    :return: list of clusters (lists of items). When there are no more items
        than ``K`` each item becomes its own cluster.
    :raises ValueError: if ``value`` or ``K`` is not supplied
    """
    if value is None:
        raise ValueError("Distance function not set")
    if K is None:
        raise ValueError("Parameter K not set")

    if len(items) <= K:
        sys.stderr.write("WARNING: NOT ENOUGH ITEMS!\nInput List Size: {}\n".format(len(items)))
        return [[i] for i in items]

    breakpoints = sorted(map(value, items))
    sys.stderr.write("Sorted values list:\n{}\n".format(breakpoints))

    # Find natural jenks breakpoints of items
    breakpoints = jenks(breakpoints, K)
    sys.stderr.write("Breakpoints:\n{}\n".format(breakpoints))

    clustered_items = []
    last_bp = None
    for bp in breakpoints:
        # The first breakpoint only opens the first interval.
        # NOTE(review): intervals are (last_bp, bp], so items whose value
        # equals the first breakpoint land in no cluster -- confirm intended.
        if last_bp is not None:
            # Real lists are required here: on Python 3 ``filter``/``map``
            # return iterators, which have no ``len()`` and log as
            # ``<map object ...>``.
            between_items = [item for item in items if last_bp < value(item) <= bp]
            sys.stderr.write("Values between {} and {}:\n{}\n".format(
                last_bp, bp, [value(item) for item in between_items]))
            if between_items:
                clustered_items.append(between_items)
        last_bp = bp
    sys.stderr.write("{} clusters found (K={})\nItems:\n{}\n".format(len(clustered_items), K, clustered_items))
    return clustered_items
예제 #20
0
파일: plot.py 프로젝트: ritviksahajpal/LUH2
def get_cb_range(arr=np.empty([2, 2]), xaxis_min=0.0, xaxis_max=1.1, xaxis_step=0.1, do_jenks=True):
    """
    Build the tick values for a colorbar, either from Jenks breaks of the
    data or as an evenly spaced range.

    https://github.com/perrygeo/jenks
    :param arr: 2-D array of values; any other rank terminates the process
        via ``sys.exit(0)``
    :param xaxis_min: start of the evenly spaced range (``do_jenks=False``)
    :param xaxis_max: exclusive end of the evenly spaced range
    :param xaxis_step: step of the evenly spaced range
    :param do_jenks: if True derive the values from ``jenks``; otherwise
        return ``np.arange(xaxis_min, xaxis_max, xaxis_step)``
    :return: 1-D array of tick values
    """
    # Array can only have shape == 2
    if len(np.shape(arr)) != 2:
        sys.exit(0)

    if not do_jenks:
        return np.arange(xaxis_min, xaxis_max, xaxis_step)

    # ``np.round_`` was removed in NumPy 2.0; ``np.round`` is the same
    # function and exists in every NumPy release.
    rounded = np.unique(np.round(arr, decimals=1))

    # Select 11 elements, discard the highest
    breaks = np.array(jenks(rounded.data, 11))[:-1]

    # return only the unique elements, sometimes jenks selects duplicate elements
    return np.unique(breaks)
예제 #21
0
def StatesJson(request, word):
    """Return, as JSON, every state's score for ``word`` with a sentiment key.

    With at least three matching states the scores are split into three
    Jenks classes and the second and third break values become the
    negative/positive thresholds; otherwise fixed thresholds of 1 and 10 are
    used. The response maps each state name to its ``fillKey``, ``score``
    and ``recurrence``, ordered by descending score.

    NOTE(review): ``urllib.unquote`` is the Python 2 location of this
    function -- confirm the target interpreter.
    """
    word = urllib.unquote(word)
    states = State.objects.filter(word__word=word)

    # Thresholds: fixed ones when there are too few states to classify.
    if len(states) < 3:
        negative, positive = 1, 10
    else:
        scores = [state.score for state in states]
        scores_jenks = jenks(scores, 3)
        negative = float(scores_jenks[1])
        positive = float(scores_jenks[2])

    states_dict = dict()
    for state in states:
        if state.score <= negative:
            fillKey = 'negative'
        else:
            fillKey = 'positive' if state.score >= positive else 'neutral'
        entry = {"fillKey": fillKey, "score": state.score, "recurrence": state.recurrence}
        states_dict[state.state] = entry

    # Highest score first.
    ranked = sorted(states_dict.items(), key=lambda pair: pair[1]['score'], reverse=True)
    states_dict = OrderedDict(ranked)

    return HttpResponse(json.dumps(states_dict))
예제 #22
0
    def get_sum_amounts_segments(self, n_breaks=10, visualize=False):
        """
        Compute Jenks breaks over column 2 of the merchant data
        (the sum-of-amounts / money column).

        :param n_breaks: class count handed to the jenks algorithm
        :type n_breaks: int
        :param visualize: when true, also plot the sorted values with the
            breaks drawn as vertical lines
        :type visualize: bool
        :return: the break values as an array
        :rtype: numpy.core.numeric.array
        """
        # Column 2 holds the money amounts; jenks is fed the values in order.
        amounts = sorted(self._merchant_data[:, 2].tolist())
        breaks = jenks(amounts, n_breaks)

        if visualize:
            plot_title = "Segmentations of Sum Amount With Jenks Natural Breaks"
            plotlyvisualize.segments_plot(amounts,
                                          vertical_lines=breaks,
                                          title=plot_title,
                                          out_path=PLOT_OUT_DIR)
        return np.array(breaks)
예제 #23
0
    def get_harmonic_segments(self, n_breaks=10, visualize=False):
        """
        Compute Jenks breaks over column 0 of the merchant data
        (the harmonic number / recency column).

        :param n_breaks: class count handed to the jenks algorithm
        :type n_breaks: int
        :param visualize: when true, also plot the sorted values with the
            breaks drawn as vertical lines
        :type visualize: bool
        :return: the break values as an array
        :rtype: numpy.core.numeric.array
        """
        # Column 0 holds the recency value; jenks is fed the values in order.
        recency = sorted(self._merchant_data[:, 0].tolist())
        breaks = jenks(recency, n_breaks)

        if visualize:
            plot_title = "Segmentations of Harmonic sum With Jenks Natural Breaks"
            plotlyvisualize.segments_plot(recency,
                                          vertical_lines=breaks,
                                          title=plot_title,
                                          out_path=PLOT_OUT_DIR)
        return np.array(breaks)
예제 #24
0
파일: tests.py 프로젝트: perrygeo/jenks
def test_short():
    """A four-value input split into two classes yields three breaks."""
    expected = [1.0, 3.0, 100.0]
    breaks = jenks([1, 2, 3, 100], 2)
    rounded = [round(v, 5) for v in breaks]
    assert rounded == expected
예제 #25
0
파일: tests.py 프로젝트: perrygeo/jenks
def test_json():
    """Check the 5-class Jenks breaks of the ``test.json`` fixture."""
    # Close the fixture deterministically instead of leaking the handle.
    with open('test.json') as fixture:
        data = json.load(fixture)
    breaks = jenks(data, 5)
    assert [round(float(v), 5) for v in breaks] == \
        [0.00281, 2.09355, 4.2055, 6.17815, 8.09176, 9.99798]
예제 #26
0
def field_jenks(in_table, field_name, class_num):
    """Classify one field of a table with Jenks and return the upper bounds.

    :param in_table: table handed to ``get_rows``
    :param field_name: name of the field read from every row via
        ``row.getValue``
    :param class_num: class count passed to ``jenks``
    :return: list with the last element of every group returned by
        ``jenks``, converted to float
    """
    rows = get_rows(in_table)
    data_list = [row.getValue(field_name) for row in rows]
    result_data_list = jenks(data_list, class_num)
    # Function-call form: valid on Python 3 and prints the same on Python 2.
    print(result_data_list)
    return [float(group[-1]) for group in result_data_list]
예제 #27
0
def test_short():
    """Two classes over ``[1, 2, 3, 100]`` must give breaks 1, 3 and 100."""
    sample = [1, 2, 3, 100]
    wanted = [1.0, 3.0, 100.0]
    breaks = jenks(sample, 2)
    assert [round(v, 5) for v in breaks] == wanted
            X.append(float(line.strip()))
        
X = np.array(X)

###########
# Jenks: pick the first class count in [2, end) whose rounded goodness of
# variance fit reaches ``thres``, then report each class mean and size.

gvf = 0.0
for k in range(2, end):
    gvf = goodness_of_variance_fit(X, k)
    if round(gvf, 2) >= thres:
        break

print(k)
breaks = jenks(X, k)
print("breaks =", breaks)

print("\nmean    clust_n")
means = []
for i in range(1, k + 1):
    if i == 1:
        # The first class is closed on both ends so the minimum is counted.
        X_ = X[X <= breaks[i]]
    else:
        # Later classes cover (breaks[i-1], breaks[i]].
        x = X[X > breaks[i - 1]]
        X_ = x[x <= breaks[i]]
    means += [X_.mean()]
    print(round(X_.mean(), 2), " ", X_.size)
예제 #29
0
def test_json():
    """Check the 5-class Jenks breaks of the ``test.json`` fixture."""
    # Close the fixture deterministically instead of leaking the handle.
    with open('test.json') as fixture:
        data = json.load(fixture)
    breaks = jenks(data, 5)
    assert [round(float(v), 5) for v in breaks] == \
        [0.00281, 2.09355, 4.2055, 6.17815, 8.09176, 9.99798]
예제 #30
0
         randar =  np.random.randint(1,100,8)'''
 #a = np.array([2,2, 4, 20, 18, 22, 28, 35, 42])
 #a = np.array([1, 2, 10, 8, 0, 5, 6, 0, 10, 8, 49, 40, 49, 46, 30, 42, 39, 46, 44, 33, 32, 34, 37, 30, 39, 33, 39, 46, 32, 37, 46, 43, 49, 39, 50, 50, 50, 34, 38, 49, 34, 44, 39, 36, 32, 39, 40, 48, 49, 39, 35, 47, 43, 39, 50, 30, 37, 46, 50, 31, 45, 45, 38, 32, 33, 30, 32, 47, 47, 42, 42, 48, 44, 44, 31, 50, 38, 39, 50, 33, 37, 36, 50, 44, 31, 34, 50, 47, 30, 44, 78, 74, 73, 77, 62, 67, 70, 66, 64, 65, 72, 65, 83, 66, 69, 60, 62, 85, 80, 65])
 a = np.array([
     8, 10, 7, 11, 9, 6, 10, 8, 9, 10, 36, 39, 38, 42, 35, 37, 47, 41, 46,
     38, 32, 48, 42, 38, 43, 33, 40, 41, 45, 37, 36, 41, 30, 43, 48, 49, 36,
     50, 45, 32, 35, 34, 36, 48, 31, 33, 38, 49, 50, 48, 34, 33, 36, 36, 40,
     49, 32, 34, 45, 48, 49, 46, 47, 50, 33, 49, 40, 48, 49, 33, 88, 70, 79,
     63, 86, 60, 83, 75, 88, 60, 77, 62, 65, 73, 72, 64, 62, 66, 76, 71, 75,
     63, 66, 60, 85, 65, 61, 62, 69, 75
 ])
 #a =  np.random.randint(1,100,8)
 gvf = goodness_of_variance_fit(a, 3)
 print(np.sort(a))
 print("GVF: " + str(gvf))
 segs = jenks(a, 3)
 print("SEGS:" + str(segs))
 rates = np.sort(a).tolist()
 peers = ['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8']
 c1 = {}
 c2 = {}
 c3 = {}
 c1_rates = []
 c2_rates = []
 c3_rates = []
 random.shuffle(peers)
 print(peers)
 print("NP SORT" + str(np.sort(a)))
 ind = 0
 for i in rates:
     if i <= segs[1]: