def train(training_data):
    """Train the learner selected by ``ARGV.model`` on ``training_data``.

    Features are extracted with ``fs.bernoulli`` when ``ARGV.features`` is
    ``"bernoulli"`` and with ``fs.mutualinfo`` otherwise. When
    ``ARGV.one_vs`` is set, the labels are rewritten in place into a binary
    problem: 1 for the label named by ``ARGV.one_vs`` and 0 for every other
    label.

    Returns:
        ``(model, featureMap, labelMap)`` for Bernoulli features, or
        ``((model, scores), featureMap, labelMap)`` for mutual-information
        features, where ``scores`` is the extra value returned by
        ``fs.mutualinfo``.
    """
    # NOTE(review): a second definition of `train` appears later in this file
    # and replaces this one when the module is loaded.
    use_bernoulli = ARGV.features == "bernoulli"
    if use_bernoulli:
        features, featureMap, labels, labelMap = fs.bernoulli(training_data)
    else:
        features, scores, featureMap, labels, labelMap = fs.mutualinfo(
            training_data)

    learner = models[ARGV.model]()

    if ARGV.one_vs:
        # Compute the mask once, before any write. Assigning 0 first and then
        # comparing again would relabel *every* sample as 1 whenever the
        # target label's id is itself 0.
        is_target = labels == labelMap[ARGV.one_vs]
        labels[~is_target] = 0
        labels[is_target] = 1

    model = learner.train(features, labels)

    if use_bernoulli:
        return (model, featureMap, labelMap)
    return ((model, scores), featureMap, labelMap)
def train(training_data):
    """Train the learner selected by ``ARGV.model`` on ``training_data``.

    Feature extraction depends on ``ARGV.features``: ``"bernoulli"`` uses
    ``fs.bernoulli``; any other value uses ``fs.mutualinfo``, which also
    yields a ``scores`` value that is returned alongside the model.

    When ``ARGV.one_vs`` is set, ``labels`` is modified in place so that the
    label named by ``ARGV.one_vs`` becomes 1 and all other labels become 0.

    Returns:
        ``(model, featureMap, labelMap)`` for Bernoulli features, otherwise
        ``((model, scores), featureMap, labelMap)``.
    """
    # NOTE(review): an earlier definition of `train` in this file is
    # overridden by this one.
    bernoulli = ARGV.features == "bernoulli"
    scores = None
    if bernoulli:
        features, featureMap, labels, labelMap = fs.bernoulli(training_data)
    else:
        features, scores, featureMap, labels, labelMap = fs.mutualinfo(
            training_data)

    learner = models[ARGV.model]()

    if ARGV.one_vs:
        target_id = labelMap[ARGV.one_vs]
        # The mask must be taken from the untouched labels: zeroing the
        # non-target entries first and re-testing equality afterwards marks
        # everything as positive when target_id == 0.
        positive = labels == target_id
        labels[~positive] = 0
        labels[positive] = 1

    model = learner.train(features, labels)

    if bernoulli:
        return (model, featureMap, labelMap)
    return ((model, scores), featureMap, labelMap)
def kmeans_summary():
    """Cluster the data set with k-means and dump each cluster to disk.

    Reads the data named by ``ARGV.data``, extracts Bernoulli features and
    runs ``milk.unsupervised.repeated_kmeans`` with one cluster per entry of
    the label map. Unless ``ARGV.no_print`` is set, the ``"Tweet"`` field of
    every record is written to ``clusters/cluster_<i>`` for the cluster it
    was assigned to. When ``ARGV.plot`` is set, a scatter plot of the
    clusters is saved to ``clusters/plot.png``.
    """
    # NOTE(review): a second definition of `kmeans_summary` appears later in
    # this file and replaces this one when the module is loaded.
    print("---* KMeans clustering *---")
    data = DataReader(ARGV.data)
    features, _feature_map, _labels, labelMap = fs.bernoulli(data)

    # Clustering runs on the raw features; k is the number of known labels.
    k = len(labelMap)
    cluster_ids, _centroids = milk.unsupervised.repeated_kmeans(
        features, k, 3)

    out_folder = "clusters"
    if not path.exists(out_folder):
        os.mkdir(out_folder)

    print("---* Results *---")

    if ARGV.plot:
        import matplotlib.pyplot as plt
        # The PCA projection is only needed for plotting. It used to be
        # referenced without ever being computed, which raised NameError
        # as soon as plotting was requested.
        pca_features, _components = milk.unsupervised.pca(features)
        colors = "bgrcbgrc"
        marks = "xxxxoooo"
        # NOTE(review): columns 1 and 2 (not 0 and 1) are plotted, as in the
        # original code - confirm this is the intended pair of components.
        xmin = np.min(pca_features[:, 1])
        xmax = np.max(pca_features[:, 1])
        ymin = np.min(pca_features[:, 2])
        ymax = np.max(pca_features[:, 2])
        print([xmin, xmax, ymin, ymax])
        plt.axis([xmin, xmax, ymin, ymax])

    for i in xrange(k):
        if not ARGV.no_print:
            out_file = path.join(out_folder, "cluster_{}".format(i))
            print("Writing to: {}".format(out_file))
            with open(out_file, 'w') as out:
                for j, tweetinfo in enumerate(data):
                    if cluster_ids[j] == i:
                        out.write(str(tweetinfo["Tweet"]) + "\n")
        if ARGV.plot:
            in_cluster = cluster_ids == i
            # Cycle through the styles so more than 8 clusters do not
            # raise IndexError.
            style = colors[i % len(colors)] + marks[i % len(marks)]
            plt.plot(pca_features[in_cluster, 1],
                     pca_features[in_cluster, 2],
                     style)

    if ARGV.plot:
        plot_file = path.join(out_folder, "plot.png")
        print("Writing to: {}".format(plot_file))
        plt.savefig(plot_file)
def kmeans_summary():
    """Run k-means over the data set and write out the resulting clusters.

    The data named by ``ARGV.data`` is converted to Bernoulli features and
    clustered with ``milk.unsupervised.repeated_kmeans`` into as many
    clusters as there are entries in the label map.

    Output, all under the ``clusters`` directory (created if missing):
      * ``cluster_<i>``: the ``"Tweet"`` field of each record assigned to
        cluster ``i``, one per line (skipped when ``ARGV.no_print`` is set).
      * ``plot.png``: scatter plot of the clusters (only when ``ARGV.plot``
        is set).
    """
    # NOTE(review): an earlier definition of `kmeans_summary` in this file is
    # overridden by this one.
    print("---* KMeans clustering *---")
    data = DataReader(ARGV.data)
    features, _feature_map, _labels, labelMap = fs.bernoulli(data)

    # One cluster per known label; clustering uses the unreduced features.
    k = len(labelMap)
    cluster_ids, _centroids = milk.unsupervised.repeated_kmeans(
        features, k, 3)

    out_folder = "clusters"
    if not path.exists(out_folder):
        os.mkdir(out_folder)

    print("---* Results *---")

    want_plot = ARGV.plot
    if want_plot:
        import matplotlib.pyplot as plt
        # Compute the projection here: the plotting code reads
        # `pca_features`, but the call that produced it had been commented
        # out, so every plot request failed with NameError.
        pca_features, _components = milk.unsupervised.pca(features)
        colors = "bgrcbgrc"
        marks = "xxxxoooo"
        # NOTE(review): the plot uses columns 1 and 2 of the projection, as
        # the original code did - confirm column 0 is skipped on purpose.
        x_values = pca_features[:, 1]
        y_values = pca_features[:, 2]
        bounds = [np.min(x_values), np.max(x_values),
                  np.min(y_values), np.max(y_values)]
        print(bounds)
        plt.axis(bounds)

    for i in xrange(k):
        if not ARGV.no_print:
            out_file = path.join(out_folder, "cluster_{}".format(i))
            print("Writing to: {}".format(out_file))
            with open(out_file, 'w') as out:
                for j, tweetinfo in enumerate(data):
                    if cluster_ids[j] == i:
                        out.write(str(tweetinfo["Tweet"]) + "\n")
        if want_plot:
            members = cluster_ids == i
            # Wrap around the style strings so that k > 8 reuses styles
            # rather than raising IndexError.
            style = colors[i % len(colors)] + marks[i % len(marks)]
            plt.plot(pca_features[members, 1],
                     pca_features[members, 2],
                     style)

    if want_plot:
        plot_file = path.join(out_folder, "plot.png")
        print("Writing to: {}".format(plot_file))
        plt.savefig(plot_file)