def get_labels_data(self):
    path = self.get_cluster_data_path()
    labels_path = os.path.join(path, settings.CLUSTERED_DOCS_LABELS_FILENAME)
    labels_resource = Resource(labels_path, Resource.FILE)
    data = json.loads(labels_resource.get_data())
    return data
def get_model_path(self, model):
    cluster_data_location = settings.ENVIRON_CLUSTERING_DATA_LOCATION
    resource = Resource(cluster_data_location, Resource.FILE_AND_ENVIRONMENT)
    path = os.path.join(resource.get_resource_location(),
                        'cluster_model_{}'.format(model.id))
    return path
def get_cluster_score_vs_size_data(self):
    path = self.get_cluster_data_path()
    data_path = os.path.join(path, settings.CLUSTER_SCORE_DOCS_SIZE_FILENAME)
    data_resource = Resource(data_path, Resource.FILE)
    datastr = data_resource.get_data()
    if datastr is None:
        return []
    return json.loads(datastr)
def write_cluster_score_vs_size(model, doc_size):
    """
    Write new score and doc_size to file.
    NOTE: This will overwrite the previous data.
    @model: ClusteringModel instance
    @doc_size: Number of leads on which clustering was done
    """
    data = [(doc_size, model.silhouette_score)]
    path = model.get_cluster_data_path()
    data_path = os.path.join(path, settings.CLUSTER_SCORE_DOCS_SIZE_FILENAME)
    data_resource = Resource(data_path, Resource.FILE)
    data_resource.write_data(json.dumps(data))
    # now plot
    plot_score_vs_size(model, data)
def get_relevant_terms_data(self):
    """Get relevant terms data from file, unlike compute_relevant_terms,
    which calculates the relevant terms."""
    path = self.get_cluster_data_path()
    data_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
    data_resource = Resource(data_path, Resource.FILE)
    raw = data_resource.get_data()
    if raw:
        return json.loads(raw)
    # Not in file yet: compute, write to file, and return
    data = self.compute_relevant_terms()
    write_relevant_terms_data(self, data)
    return data
def update_cluster_score_vs_size(model, increased_size=1):
    """
    Update scores. This won't overwrite existing data.
    @model: ClusteringModel instance, this contains the new score
    @increased_size: Number of newly clustered docs
    """
    current_data = model.get_cluster_score_vs_size_data()
    # Start from zero if no score data has been written yet
    last_size = current_data[-1][0] if current_data else 0
    current_data.append((last_size + increased_size, model.silhouette_score))
    path = model.get_cluster_data_path()
    data_path = os.path.join(path, settings.CLUSTER_SCORE_DOCS_SIZE_FILENAME)
    data_resource = Resource(data_path, Resource.FILE)
    data_resource.write_data(json.dumps(current_data))
    plot_score_vs_size(model, current_data)
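# A minimal usage sketch of the two score helpers above, assuming a saved
# ClusteringModel instance `model` with a fresh silhouette_score; the sizes
# are illustrative only, not taken from the real pipeline:
#
#     write_cluster_score_vs_size(model, doc_size=200)         # replaces file
#     update_cluster_score_vs_size(model, increased_size=50)   # appends (250, score)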
def new(cls, doc2vecmodel, name, group_id, extra_info=None):
    # Avoid a mutable default argument
    extra_info = extra_info or {}
    resource = Resource(settings.ENVIRON_DOC2VEC_MODELS_LOCATION,
                        Resource.DIRECTORY_AND_ENVIRONMENT)
    path = resource.get_resource_location()
    doc2vec = cls(name=name, group_id=group_id, modelpath=path,
                  extra_info=extra_info)
    doc2vec.save()
    # The id exists only after the first save; use it in the filename
    filename = 'doc2vec_id_{}'.format(doc2vec.id)
    doc2vec.modelpath = os.path.join(doc2vec.modelpath, filename)
    doc2vec.save()
    # finally save the doc2vec model in a file
    doc2vecmodel.save(doc2vec.modelpath)
    return doc2vec
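# Hedged example of the classmethod above. The enclosing model class name
# (Doc2VecModel here) and the trained gensim Doc2Vec object are assumptions,
# not confirmed by this module:
#
#     trained = train_doc2vec_somehow()   # hypothetical trained gensim model
#     record = Doc2VecModel.new(trained, name='leads', group_id=1)
#     # record.modelpath now ends with 'doc2vec_id_<record.id>' and the
#     # gensim model is persisted at that path.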
def write_or_update_centers_data(model, cluster_centers):
    """
    @model: ClusteringModel instance
    @cluster_centers: [<uncompressed_center_data>, ...]
    """
    path = model.get_cluster_data_path()
    center_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
    center_resource = Resource(center_path, Resource.FILE)
    # Convert to python float first or it won't be json serializable
    centers_data = {
        i: compress_sparse_vector([float(y) for y in x])
        for i, x in enumerate(cluster_centers)
    }
    # Center data can be written directly whether this is a write or an
    # update, as the centers get replaced wholesale either way
    center_resource.write_data(json.dumps(centers_data))
def visualize_clusters(model, plottype):
    print("Getting features and labels...")
    nfeatures = 3 if plottype == '3d' else 2
    reduced_features, labels, n_clusters = get_docs_features_labels(
        model, nfeatures)
    print("Plotting clusters...")
    # Plot
    fig = plot(reduced_features, labels, n_clusters, plottype)
    # get location of clusters data
    resource = Resource(settings.ENVIRON_CLUSTERING_DATA_LOCATION,
                        Resource.FILE_AND_ENVIRONMENT)
    path = os.path.join(resource.get_resource_location(),
                        'cluster_model_{}'.format(model.id))
    filepath = os.path.join(path, 'clusterplot.png')
    print("Saving plot to {}".format(filepath))
    fig.savefig(filepath)
def write_cluster_labels_data(model, docs_labels, docids_features,
                              update=False):
    """
    @model: ClusteringModel instance
    @docs_labels: [(<doc_id>, <label_id>), ...]
    @docids_features: { <doc_id>: <features>, ... }
    @update: False means replace the file contents, else just update
    """
    path = model.get_cluster_data_path()
    # now create labels file
    labels_path = os.path.join(path, settings.CLUSTERED_DOCS_LABELS_FILENAME)
    labels_resource = Resource(labels_path, Resource.FILE)
    # create dict
    dict_data = {x: {'label': y} for x, y in docs_labels}
    for doc_id, features in docids_features.items():
        # first make json serializable
        dict_data[doc_id]['features'] = [
            float(x) if isinstance(model.model, KMeansDoc2Vec) else x
            for x in features
        ]
    # Write docs_clusterlabels
    if update:
        data = json.loads(labels_resource.get_data())
        data.update(dict_data)
        labels_resource.write_data(json.dumps(data))
    else:
        labels_resource.write_data(json.dumps(dict_data))
def get_docs_features_labels(model, nfeatures):
    modelid = model.id
    # get location of clusters data
    resource = Resource(settings.ENVIRON_CLUSTERING_DATA_LOCATION,
                        Resource.FILE_AND_ENVIRONMENT)
    path = os.path.join(resource.get_resource_location(),
                        'cluster_model_{}'.format(modelid))
    labels_path = os.path.join(path, settings.CLUSTERED_DOCS_LABELS_FILENAME)
    labels_resource = Resource(labels_path, Resource.FILE)
    labels_data = json.loads(labels_resource.get_data())
    n_clusters = model.n_clusters
    # Now collect each document's features and label
    docs_features, docs_labels = [], []
    if isinstance(model.model, KMeansDoc2Vec):
        # Doc2Vec features are stored uncompressed
        def identity(x):
            return x
        features_uncompress_function = identity
    else:
        features_uncompress_function = uncompress_compressed_vector
    for did, data in labels_data.items():
        # first uncompress and then store
        docs_features.append(features_uncompress_function(data['features']))
        docs_labels.append(data['label'])
    reduced_features = reduce_dimensions(docs_features, nfeatures)
    return reduced_features, docs_labels, n_clusters
def get_relevant_terms(self, obj):
    cluster_data_location = settings.ENVIRON_CLUSTERING_DATA_LOCATION
    resource = Resource(cluster_data_location, Resource.FILE_AND_ENVIRONMENT)
    path = os.path.join(resource.get_resource_location(),
                        'cluster_model_{}'.format(obj.id))
    relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
    relevant_resource = Resource(relevant_path, Resource.FILE)
    return json.loads(relevant_resource.get_data())
def write_relevant_terms_data(model, relevant_terms, update=False):
    """
    @model: ClusteringModel instance
    @relevant_terms: { <cluster_label>: [<relevant_term>, ...], ... }
    @update: False means replace content in file
    """
    path = model.get_cluster_data_path()
    relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
    relevant_resource = Resource(relevant_path, Resource.FILE)
    if update:
        # get_data() returns a JSON string; parse it before merging
        curr = json.loads(relevant_resource.get_data())
        curr.update(relevant_terms)
        relevant_resource.write_data(json.dumps(curr))
    else:
        relevant_resource.write_data(json.dumps(relevant_terms))
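# Hypothetical call showing the update semantics above: update=True merges
# the new mapping into the existing file instead of replacing it. The model
# instance and terms dict are illustrative only:
#
#     write_relevant_terms_data(model, {'0': ['flood', 'relief']}, update=True)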
def get_labels_data(self):
    cluster_data_location = settings.ENVIRON_CLUSTERING_DATA_LOCATION
    resource = Resource(cluster_data_location, Resource.FILE_AND_ENVIRONMENT)
    # The per-model directory that keeps the data files
    path = os.path.join(resource.get_resource_location(),
                        'cluster_model_{}'.format(self.id))
    labels_path = os.path.join(path, settings.CLUSTERED_DOCS_LABELS_FILENAME)
    labels_resource = Resource(labels_path, Resource.FILE)
    data = json.loads(labels_resource.get_data())
    return data
def test_data_files_created(self):
    model = create_new_clusters('test', self.group_id, 2)
    path = self.get_model_path(model)
    center_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
    labels_path = os.path.join(path, settings.CLUSTERED_DOCS_LABELS_FILENAME)
    relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
    center_resource = Resource(center_path, Resource.FILE)
    labels_resource = Resource(labels_path, Resource.FILE)
    relevant_resource = Resource(relevant_path, Resource.FILE)
    try:
        center_resource.validate()
    except Exception as e:
        assert False, "No center data stored. " + str(e)
    else:
        data = json.loads(center_resource.get_data())
        assert isinstance(data, dict)
    try:
        labels_resource.validate()
    except Exception as e:
        assert False, "No labels data stored. " + str(e)
    else:
        data = json.loads(labels_resource.get_data())
        assert isinstance(data, dict)
    try:
        relevant_resource.validate()
    except Exception as e:
        assert False, "No relevant data stored. " + str(e)
    else:
        data = json.loads(relevant_resource.get_data())
        assert isinstance(data, list)
        assert not all(map(lambda x: len(x) < 2, data))
def test_data_files_created(self):
    # First remove existing cluster models
    ClusteringModel.objects.all().delete()
    # create one
    model = create_new_clusters('test', self.group_id, 2)
    assert model.all_clustered, \
        "All docs should be clustered when a cluster model is created anew"
    path = self.get_model_path(model)
    center_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
    labels_path = os.path.join(path, settings.CLUSTERED_DOCS_LABELS_FILENAME)
    relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
    size_score_path = os.path.join(
        path, settings.CLUSTER_SCORE_DOCS_SIZE_FILENAME)
    size_score_fig_path = os.path.join(
        path, settings.CLUSTERS_SCORE_PLOT_FILENAME)
    center_resource = Resource(center_path, Resource.FILE)
    labels_resource = Resource(labels_path, Resource.FILE)
    relevant_resource = Resource(relevant_path, Resource.FILE)
    size_score_resource = Resource(size_score_path, Resource.FILE)
    size_score_fig_resource = Resource(size_score_fig_path, Resource.FILE)
    # check centers
    try:
        center_resource.validate()
    except Exception as e:
        assert False, "No center data stored. " + str(e)
    else:
        data = json.loads(center_resource.get_data())
        assert isinstance(data, dict)
    # check labels
    try:
        labels_resource.validate()
    except Exception as e:
        assert False, "No labels data stored. " + str(e)
    else:
        data = json.loads(labels_resource.get_data())
        assert isinstance(data, dict)
    # check relevant terms
    try:
        relevant_resource.validate()
    except Exception as e:
        assert False, "No relevant data stored. " + str(e)
    else:
        data = json.loads(relevant_resource.get_data())
        assert isinstance(data, dict)
        for k, v in data.items():
            assert isinstance(v, list)
    # check size vs score
    try:
        size_score_resource.validate()
    except Exception as e:
        assert False, "No score data stored. " + str(e)
    else:
        data = json.loads(size_score_resource.get_data())
        assert isinstance(data, list)
        for x in data:
            assert isinstance(x, list)
            assert len(x) == 2
    # check plot saved
    try:
        size_score_fig_resource.validate()
    except Exception as e:
        assert False, "No score plot stored. " + str(e)
def write_clustured_data_to_files(model, docs_labels, cluster_centers,
                                  docids_features, relevant_terms=None):
    """Write the doc_clusterlabels and cluster_centers to files"""
    cluster_data_location = settings.ENVIRON_CLUSTERING_DATA_LOCATION
    resource = Resource(cluster_data_location, Resource.FILE_AND_ENVIRONMENT)
    # The per-model directory that keeps the data files
    path = os.path.join(resource.get_resource_location(),
                        'cluster_model_{}'.format(model.id))
    # create the directory
    try:
        os.makedirs(path, exist_ok=True)
    except OSError as e:
        print("Couldn't create cluster data files. {}".format(e))
        return
    # now create centers file
    center_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
    center_resource = Resource(center_path, Resource.FILE)
    # convert to python float first or it won't be json serializable
    centers_data = {
        i: compress_sparse_vector([float(y) for y in x])
        for i, x in enumerate(cluster_centers)
    }
    center_resource.write_data(json.dumps(centers_data))
    # now create labels file
    labels_path = os.path.join(path, settings.CLUSTERED_DOCS_LABELS_FILENAME)
    labels_resource = Resource(labels_path, Resource.FILE)
    # create dict
    dict_data = {x: {'label': y} for x, y in docs_labels}
    for doc_id, features in docids_features.items():
        # first make json serializable
        dict_data[doc_id]['features'] = [
            float(x) if isinstance(model.model, KMeansDoc2Vec) else x
            for x in features
        ]
    # Write docs_clusterlabels
    labels_resource.write_data(json.dumps(dict_data))
    # Write relevant terms if present
    if relevant_terms is not None:
        relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
        relevant_resource = Resource(relevant_path, Resource.FILE)
        relevant_resource.write_data(json.dumps(list(relevant_terms)))
    print("Done writing data")
def get_centers_data(self):
    path = self.get_cluster_data_path()
    centers_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
    centers_resource = Resource(centers_path, Resource.FILE)
    data = json.loads(centers_resource.get_data())
    return data
def get_cluster_data_resource():
    cluster_data_location = settings.ENVIRON_CLUSTERING_DATA_LOCATION
    return Resource(cluster_data_location, Resource.FILE_AND_ENVIRONMENT)
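# Several functions above rebuild the per-model directory path by hand; a
# sketch of how they could share that logic via get_cluster_data_resource().
# The helper name _cluster_model_path is hypothetical, not existing code:
def _cluster_model_path(model_id):
    resource = get_cluster_data_resource()
    return os.path.join(resource.get_resource_location(),
                        'cluster_model_{}'.format(model_id))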