Example #1
File: main.py Project: L4v/ori
def main():
    # NOTE(Jovan): Load data
    data = pd.read_csv("data/skincancer.csv", delimiter=',', index_col=0)
    mort = data.Mort.values
    lat = data.Lat.values
    lon = data.Long.values

    # NOTE(Jovan): Init LinearRegression and predict
    lin_reg = LinearRegression(lat, mort)
    hawaii = lin_reg.predict(20)
    print("Prediction for hawaii[lat=20]:", hawaii)

    # NOTE(Jovan): Init KMeans and add lat and long points
    k_means = KMeans()
    for i, j in zip(lat, lon):
        k_means.points.append(Point(i, j))
    k_means.split(2, 0.01)

    # NOTE(Jovan): Plot clusters
    fig = plt.figure()
    ax = fig.add_axes([0,0,1,1])
    # NOTE(Jovan): First cluster
    for p in k_means._clusters[0].points:
        ax.scatter(p.x, p.y, c="#ff0000")
    # NOTE(Jovan): Second cluster
    for p in k_means._clusters[1].points:
        ax.scatter(p.x, p.y, c="#00ff00")

    # NOTE(Jovan): Plot cluster centers
    center1 = k_means._clusters[0].center
    center2 = k_means._clusters[1].center
    ax.scatter(center1.x, center1.y, marker="P", c="#ff0000")
    ax.scatter(center2.x, center2.y, marker="P", c="#00ff00")
    plt.show()
Example #2
def spectral_inner(W, k):
    m = numpy.size(W, 1)
    # Degree matrix: diagonal of per-node degrees (row sums of the affinity matrix W)
    D = numpy.diag(numpy.asarray(W).sum(axis=1))
    # Unnormalized graph Laplacian
    L = D - W
    eigenvalues, eigenvectors = numpy.linalg.eig(L)

    indices = numpy.argsort(eigenvalues)

    eigen_sorted = eigenvalues[indices]

    # Calculate the optimal k via the eigengap heuristic (this overrides the k argument):
    # use the largest gap between consecutive sorted eigenvalues.
    eigen_diff = []
    for i in xrange(0, len(eigen_sorted) - 1):
        eigen_diff.append(eigen_sorted[i + 1] - eigen_sorted[i])
    k = eigen_diff.index(max(eigen_diff)) + 1

    Ut = eigenvectors[:, indices[0:k]]

    # Keep only the real parts (numpy.linalg.eig may return complex values)
    f = numpy.vectorize(lambda x: x.real)
    Ut = f(Ut)

    C = kmeans([Point(x) for x in Ut], k)

    A = []
    for c in C:
        cluster = []
        for x in c.points:
            cluster.append(Ut.tolist().index(x.coords.tolist()))
        A.append(cluster)

    return A
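
The "optimal k" step above is the eigengap heuristic. As a standalone illustration (plain NumPy, independent of the kmeans/Point helpers that spectral_inner assumes), the sketch below builds the unnormalized Laplacian of a tiny affinity matrix and picks k at the largest gap between consecutive sorted eigenvalues; the toy matrix and variable names are illustrative only.

import numpy

# Toy affinity matrix with two obvious groups of two nodes each.
W = numpy.array([[0., 1., 0., 0.],
                 [1., 0., 0., 0.],
                 [0., 0., 0., 1.],
                 [0., 0., 1., 0.]])
D = numpy.diag(W.sum(axis=1))                # degree matrix
L = D - W                                    # unnormalized graph Laplacian
vals = numpy.sort(numpy.linalg.eigvalsh(L))  # sorted eigenvalues
gaps = numpy.diff(vals)                      # gaps between consecutive eigenvalues
k = int(numpy.argmax(gaps)) + 1              # eigengap heuristic
print(k)                                     # 2 for this block structure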
Example #3
def get_data():
    file_name = 'Sales_Transactions_Dataset_Weekly.csv'
    with open(file_name) as f:
        header = f.readline()
        points = []
        for line in f:
            items = line.strip().split(',')
            r = [float(item) for item in items[1:]]
            points.append(Point(r))
    random.shuffle(points)
    return points
Example #4
def get_iris_data():
    file_name = 'datasets/iris.csv'
    with open(file_name) as f:
        header = f.readline()
        points = []
        for line in f:
            items = line.strip().split(',')
            r = (float(items[0]), float(items[1]), float(items[2]),
                 float(items[3]), items[4])
            points.append(Point(r))
    random.shuffle(points)
    return points
Example #5
def get_s1():
	file_name = 'datasets/s1.txt'
	with open(file_name) as f:
		#header = f.readline()
		points = []
		for line in f:
			items = line.strip().split('    ')
			r = [
				float(items[0]),
				float(items[1]),
			]
			points.append( Point(r) )
	#random.shuffle(points)
	return points
Example #6
def parse_input_file(input_file):
    """
    Parses the input file based on following format.

    First line contains two integers(m and n space separated),
    indicating number of commuters and cabs. Next m lines contain
    commuter locations. Next n lines contain cab locations.
    Finally, last line contains destination location. Locations
    are in format: x,y

    Returns a tuple of commuters, cab and destination location.
    """

    commuters = []
    cabs = []
    destination = None

    try:
        f = open(input_file, 'r')
    except (OSError, IOError) as e:
        if e.errno == 2:
            print ERR_INPUT_FILE_NOT_FOUND
        else:
            print e.strerror
        sys.exit(1)

    lines_list = f.read().splitlines()
    f.close()
    lines_len = len(lines_list)

    try:
        if lines_len == 0:
            raise Exception(ERR_INPUT_FILE_EMPTY)
        # Parse m and n values
        try:
            m = int(lines_list[0].split(' ')[0])
            n = int(lines_list[0].split(' ')[1])
        except (ValueError, IndexError):
            raise Exception(ERR_COMMUTERS_CABS_STR % lines_list[0])
        # Not enough commuter locations
        if lines_len < (m+1):
            raise Exception(ERR_LESS_COMMUTER_LOCATIONS % (m, lines_len-1))
        # Not enough cab locations
        if lines_len < (m+n+1):
            raise Exception(ERR_LESS_CAB_LOCATIONS % (n, lines_len-m-1))
        # Destination location missing
        if lines_len < (m+n+2):
            raise Exception(ERR_MISSING_DEST_LOCATION)
        # Parse commuter locations
        for i in range(m):
            coords_str = lines_list[i+1]
            coords = Point.get_coords_from_str(coords_str)
            commuter = Commuter(coords)
            commuters.append(commuter)
        # Parse cab locations
        for i in range(n):
            coords_str = lines_list[i+1+m]
            coords = Point.get_coords_from_str(coords_str)
            cab = Cab(coords)
            cabs.append(cab)
        # Parse destination location
        coords_str = lines_list[1+m+n]
        coords = Point.get_coords_from_str(coords_str)
        destination = Point(coords)
    except Exception as e:
        print e
        sys.exit(1)

    return (commuters, cabs, destination)
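
For illustration only, a hedged sketch of an input file matching the documented format and how parse_input_file would consume it; the file name and coordinates below are hypothetical and assume the function and its Commuter/Cab/Point dependencies are in scope.

# Hypothetical sample: "2 1" declares two commuters and one cab; the next two
# lines are commuter locations, then one cab location, then the destination.
sample = "2 1\n0,0\n3,4\n1,1\n5,5\n"
with open("sample_input.txt", "w") as f:
    f.write(sample)
commuters, cabs, destination = parse_input_file("sample_input.txt")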
Example #7
import json

from kmeans import Point, run

if __name__ == "__main__":
    sortedPoints = lambda ps: sorted(ps, key=lambda p: (p.x, p.y))
    with open("../points.json") as f:
        points = map(lambda x: Point(x[0], x[1]), json.loads(f.read()))
    result = run(points, 10)
    for k in sortedPoints(result.keys()):
        print "==\n# %s #" % k
        print '\n'.join("  " + str(p) for p in sortedPoints(result[k]))
Example #8
# Md Lutfar Rahman
# [email protected]
# DataMining Assignment 4



from kmeans import Point, Cluster, KMeans
import random
from UserMatrix import UserMatrix

userMat = UserMatrix()
points = userMat.userpoints
fet = list(range(len(userMat.movieIds)))
k=3
#print(fet)
Point.set_features(*fet)

model = KMeans(points, k, 0.001)
model.cluster()
#print("clustring>>ended")
print('')
model.getIntraCentriodDensity()
print('')
model.getInterCentroidDensity()
Example #9
from kmeans import Point

if __name__ == '__main__':
    p = Point([1, 2, 3, 4, 5])
Example #10
                x.append([a, b, c])
        X.extend(x)
    X = array(X)[:N]
    return X


if DISTRIB == 'RANDOM':
    set_x, set_y = [random.choice(chemical_symbols) for i in range(N)
                    ], [random.choice(chemical_symbols) for i in range(N)]
    set_z = [round(random.uniform(0.1, 15.0), 2) for i in range(N)]
    data, ref = [], []
    for i in range(N):
        formula = set_x[i] + set_y[i]
        set_x[i] = get_element_group(chemical_symbols.index(set_x[i]))
        set_y[i] = get_element_group(chemical_symbols.index(set_y[i]))
        data.append(Point([set_x[i], set_y[i], set_z[i]], formula))
        ref.append([set_x[i], set_y[i], set_z[i]])

else:
    nte = len(chemical_symbols)
    G = gaussian_distribution(N, k_from_n(N))

    set_x = (G[:, 0] + 1) / 2 * nte
    set_x = map(lambda x: int(math.floor(x)), set_x.tolist())

    set_y = (G[:, 1] + 1) / 2 * nte
    set_y = map(lambda x: int(math.floor(x)), set_y.tolist())

    set_z = (G[:, 2] + 1) / 2 * 15
    set_z = map(lambda x: round(x, 2), set_z.tolist())
Example #11
	return sum(s)/len(s)

#-----------------------------------------------------------
def get_s1():
	file_name = 'datasets/s1.txt'
	with open(file_name) as f:
		#header = f.readline()
		points = []
		for line in f:
			items = line.strip().split('    ')
			r = [
				float(items[0]),
				float(items[1]),
			]
			points.append( Point(r) )
	#random.shuffle(points)
	return points

#-----------------------------------------------------------

points = get_s1()
#print(points)
Point.set_features(0,1)
for k in range(3, 4):
	model = KMeans(points, k, 0.01)
	model.cluster()
	# model.show()
	print("Done")
	print('k = ', k, 'silhouette = ', silhouette(model.points, model.clusters))

Example #12
    .join(model.Structure, model.Atom.struct_id == model.Structure.struct_id) \
    .join(model.Electrons, model.Electrons.checksum == model.Structure.checksum) \
    .join(model.Struct_ratios, model.Electrons.checksum == model.Struct_ratios.checksum) \
    .join(model.Energy, model.Electrons.checksum == model.Energy.checksum) \
    .join(emin_query, and_(
        model.Energy.total == emin_query.c.emin,
        model.Struct_ratios.chemical_formula == emin_query.c.chemical_formula,
        model.Struct_ratios.formula_units == emin_query.c.formula_units
    )).filter(model.Struct_ratios.nelem == 2, model.Electrons.gap > 0) \
    .order_by(model.Electrons.gap) \
    .all():
    i += 1
    collected.append(get_element_group(elnum))
    if not i % 2:
        collected.sort()
        data.append(Point(collected + [gap], reference=formula))
        collected = []

with open(points_file, "w") as s:
    s.write("x,y,z,label\n")
    for n, pnt in enumerate(data):
        s.write(",".join(map(str, pnt.coords) + [pnt.reference]) + "\n")

clusters = kmeans(data, k_from_n(len(data)))

with open(cluster_file, "w") as s:
    s.write("x,y,z,label\n")
    for n, cluster in enumerate(clusters, 1):
        for pnt in cluster.points:
            s.write(",".join(map(str, pnt.coords) + [pnt.reference]) + "\n")
        s.write("-,-,-,-\n")
Example #13
from sklearn.datasets import load_digits
from sklearn.metrics import fowlkes_mallows_score
from sklearn.cluster import AgglomerativeClustering, AffinityPropagation
from kmeans import kmeans, Point, predict

data, target = load_digits(return_X_y=True)

# K-Means
kmeans_data = [Point(val) for val in data]
k_means = kmeans(kmeans_data, 10)

labels = []

for point in data:
    labels.append(predict(k_means, Point(point)))
target = [int(num) for num in target]

results = [[0 for _ in range(10)] for __ in range(10)]

for i, val in enumerate(labels):
    results[target[i]][val] += 1

conversion = {}
for t_i, targ in enumerate(results):
    # Pick the cluster this digit most often falls into
    max_cluster = None
    for c_i, cluster in enumerate(targ):
        if max_cluster is None or cluster > targ[max_cluster]:
            max_cluster = c_i
    conversion[t_i] = max_cluster
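
As a hedged follow-up (not part of the original snippet), the conversion map can be turned into a rough purity-style score by counting points whose assigned cluster matches the majority cluster of their true digit; it assumes labels, target, and conversion from the code above.

# Hypothetical continuation: purity-style agreement between clusters and digits.
correct = sum(1 for t, lab in zip(target, labels) if conversion[t] == lab)
print("purity-style agreement:", correct / len(labels))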
Example #14
dfrm = dfrm[dfrm['Units'] == 'eV']
dfrm = dfrm[(dfrm['Bandgap'] > 0) & (dfrm['Bandgap'] < 20)]

avgbgfrm = dfrm.groupby('Formula')['Bandgap'].mean().to_frame() \
    .reset_index().rename(columns={'Bandgap': 'AvgBandgap'})

dfrm = dfrm.merge(avgbgfrm, how='outer', on='Formula')
dfrm.drop_duplicates('Formula', inplace=True)
dfrm.sort_values('Formula', inplace=True)

fitdata, export_data = [], []

for n, row in dfrm.iterrows():
    groupA, groupB = \
        get_element_group(chemical_symbols.index(row['Elements'][0])), \
        get_element_group(chemical_symbols.index(row['Elements'][1]))
    fitdata.append(
        Point(sorted([groupA, groupB]) + [round(row['AvgBandgap'], 2)],
              reference=row['Formula']))

clusters = kmeans(fitdata, k_from_n(len(fitdata)))

for cluster_n, cluster in enumerate(clusters, start=1):
    for pnt in cluster.points:
        export_data.append(pnt.coords + [pnt.reference] + [cluster_n])

export = MPDSExport.save_plot(
    export_data, ['groupA', 'groupB', 'bandgap', 'compound', 'cluster'],
    'plot3d')
print(export)
Example #15

#-----------------------------------------------------------
def get_iris_data():
    file_name = 'datasets/iris.csv'
    with open(file_name) as f:
        header = f.readline()
        points = []
        for line in f:
            items = line.strip().split(',')
            r = [
                float(items[0]),
                float(items[1]),
                float(items[2]),
                float(items[3]), items[4]
            ]
            points.append(Point(r))
    random.shuffle(points)
    return points


#-----------------------------------------------------------

points = get_iris_data()
Point.set_features(0, 1, 2, 3)
for k in range(2, 10):
    model = KMeans(points, k, 0.01)
    model.cluster()
    # model.show()
    print('k = ', k, 'silhouette = ', silhouette(model.points, model.clusters))
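
A hedged follow-up sketch (not in the original) showing one way to keep the k with the highest silhouette score, reusing only the calls already made above; points, KMeans, and silhouette are assumed from the snippet.

best_k, best_score = None, float('-inf')
for k in range(2, 10):
    model = KMeans(points, k, 0.01)
    model.cluster()
    score = silhouette(model.points, model.clusters)
    if score > best_score:
        best_k, best_score = k, score
print('best k =', best_k, 'silhouette =', best_score)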
Example #16
def cluster(doc_ids, lexicon, r, num_clusters, verbose=True):
    words = {}
    t = r.terms()
    if verbose is True:
        print "extracting unique words in the document list"

    i = 0
    while t.next():
        if verbose is True:
            print "word " + str(i) + " is " + str(t.term().text())
        I = lexicon[str(t.term().text())]  # extract the lexicon of the term
        for doc_id, doc_feat in I:  # for every document that carries the term
            if int(doc_id) in doc_ids:
                words[i] = str(t.term().text())
                break
        i = i + 1

    if verbose is True:
        print "creating doc space"

    # create a zero featurespace
    doc_space = OrderedDict()
    for doc in doc_ids:
        doc_space[doc] = [0] * len(words.keys())

    # create a feature space of the data in dictionaries.
    w = 0
    for key in words.keys():
        I = lexicon[words[key]]
        for doc_id, doc_feat in I:
            if int(doc_id) in doc_ids:
                doc_space[int(doc_id)][w] = doc_feat
        w = w + 1

    num_points = len(doc_ids)
    dimensions = len(words.keys())

    if verbose is True:
        print "creating featurespace"

    opt_cutoff = 0.5
    points = [Point(doc_space[doc], doc) for doc in doc_ids]

    # Cluster those data!
    clusters = kmeans(points, num_clusters, opt_cutoff, verbose=verbose)

    for i, c in enumerate(clusters):
        for p in c.points:
            print " cluster: ", i, "\t document [", p.id, "]"

    # One document-id list per cluster
    clus_list = OrderedDict()
    for i in range(len(clusters)):
        clus_list[i] = []

    # Record which documents ended up in which cluster
    word_hists = OrderedDict()
    for clus, c in enumerate(clusters):
        word_hists[clus] = {}
        for p in c.points:
            clus_list[clus].append(p.id)

    for key in words.keys():
        I = lexicon[words[key]]  # extract the lexicon of the term
        if verbose is True:
            print "word is : " + words[key]
        for doc_id, doc_feat in I:  # for every document that carries the term
            hit = False
            if int(doc_id) in doc_space.keys():
                for clus in clus_list.keys():
                    if (int(doc_id) in clus_list[clus]) and hit is False:
                        if not words[key] in word_hists[clus].keys():
                            word_hists[clus][words[key]] = doc_feat
                        else:
                            word_hists[clus][words[key]] += doc_feat
                        hit = True
    clus_sort = {}
    for clus in clus_list.keys():
        clus_sort[clus] = sorted(word_hists[clus], key=word_hists[clus].get)

    return clusters