Example #1
 def get_labels_data(self):
     path = self.get_cluster_data_path()
     labels_path = os.path.join(path,
                                settings.CLUSTERED_DOCS_LABELS_FILENAME)
     labels_resource = Resource(labels_path, Resource.FILE)
     data = json.loads(labels_resource.get_data())
     return data
Example #2
 def get_model_path(self, model):
     cluster_data_location = settings.ENVIRON_CLUSTERING_DATA_LOCATION
     resource = Resource(cluster_data_location,
                         Resource.FILE_AND_ENVIRONMENT)
     path = os.path.join(resource.get_resource_location(),
                         'cluster_model_{}'.format(model.id))
     return path
Example #3
 def get_cluster_score_vs_size_data(self):
     path = self.get_cluster_data_path()
     data_path = os.path.join(path,
                              settings.CLUSTER_SCORE_DOCS_SIZE_FILENAME)
     data_resource = Resource(data_path, Resource.FILE)
     datastr = data_resource.get_data()
     if datastr is None:
         return []
     return json.loads(datastr)
Example #4
def write_cluster_score_vs_size(model, doc_size):
    """
    Write new scores and doc_size to file.
    NOTE: This will overwrite the previous data
    @model: ClusteringModel instance
    @doc_size: Number of leads on which clustering was done
    """
    data = [(doc_size, model.silhouette_score)]
    path = model.get_cluster_data_path()
    data_path = os.path.join(path, settings.CLUSTER_SCORE_DOCS_SIZE_FILENAME)
    data_resource = Resource(data_path, Resource.FILE)
    data_resource.write_data(json.dumps(data))
    # now plot
    plot_score_vs_size(model, data)
Example #5
 def get_relevant_terms_data(self):
     """This gets data from file unlike compute_relevant terms,
     which calculates relevant terms"""
     path = self.get_cluster_data_path()
     data_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
     data_resource = Resource(data_path, Resource.FILE)
     raw = data_resource.get_data()
     if raw:
         return json.loads(raw)
     # compute and write to file
     data = self.compute_relevant_terms()
     # Write to file
     write_relevant_terms_data(self, data)
     return data
Example #6
def update_cluster_score_vs_size(model, increased_size=1):
    """
    Update scores. This won't overwrite the existing data.
    @model: ClusteringModel instance; this contains the new score
    @increased_size: amount by which the clustered document count has increased
    """
    current_data = model.get_cluster_score_vs_size_data()
    last_size = current_data[-1][0]
    current_data.append((last_size+increased_size, model.silhouette_score))
    path = model.get_cluster_data_path()
    data_path = os.path.join(path, settings.CLUSTER_SCORE_DOCS_SIZE_FILENAME)
    data_resource = Resource(data_path, Resource.FILE)
    data_resource.write_data(json.dumps(current_data))
    plot_score_vs_size(model, current_data)
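A small sketch of how the score-vs-size list grows across write_cluster_score_vs_size and this update, with made-up sizes and a made-up score standing in for model.silhouette_score:

import json

current_data = [[100, 0.42]]   # hypothetical contents after write_cluster_score_vs_size
increased_size = 25
new_score = 0.40               # stand-in for model.silhouette_score
last_size = current_data[-1][0]
current_data.append((last_size + increased_size, new_score))
print(json.dumps(current_data))   # -> [[100, 0.42], [125, 0.4]]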
Example #7
File: models.py Project: eoglethorpe/DEEPL
 def new(cls, doc2vecmodel, name, group_id, extra_info={}):
     resource = Resource(settings.ENVIRON_DOC2VEC_MODELS_LOCATION,
                         Resource.DIRECTORY_AND_ENVIRONMENT)
     path = resource.get_resource_location()
     doc2vec = cls(name=name,
                   group_id=group_id,
                   modelpath=path,
                   extra_info=extra_info)
     doc2vec.save()
     filename = 'doc2vec_id_{}'.format(doc2vec.id)
     doc2vec.modelpath = os.path.join(doc2vec.modelpath, filename)
     doc2vec.save()
     # finally save the doc2vec model in a file
     doc2vecmodel.save(doc2vec.modelpath)
     return doc2vec
Example #8
def write_or_update_centers_data(model, cluster_centers):
    """
    @model: ClusteringModel instance
    @cluster_centers: [<uncompressed_center_data>, ...]
    """
    path = model.get_cluster_data_path()
    center_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
    center_resource = Resource(center_path, Resource.FILE)
    # convert to python float first or it won't be json serializable
    centers_data = {
        i: compress_sparse_vector([float(y) for y in x])
        for i, x in enumerate(cluster_centers)
    }
    # Center data can be written directly whether update is True or False,
    # since the centers themselves get updated
    center_resource.write_data(json.dumps(centers_data))
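The float() conversion above is what keeps the centers JSON serializable; a minimal sketch of the problem, assuming the centers are numpy arrays (for example from a scikit-learn KMeans model) and leaving the project's compress_sparse_vector step aside:

import json
import numpy as np

center = np.array([0.0, 1.5, 0.0])   # hypothetical cluster center
# json.dumps(list(center)) raises TypeError: float64 is not JSON serializable
print(json.dumps([float(y) for y in center]))   # -> [0.0, 1.5, 0.0]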
Example #9
def visualize_clusters(model, plottype):
    print("Getting features and labels...")
    nfeatures = 3 if plottype == '3d' else 2
    reduced_features, labels, n_clusters = get_docs_features_labels(
        model, nfeatures)
    print("Plotting clusters...")
    # Plot
    fig = plot(reduced_features, labels, n_clusters, plottype)
    # get location of clusters data
    resource = Resource(settings.ENVIRON_CLUSTERING_DATA_LOCATION,
                        Resource.FILE_AND_ENVIRONMENT)
    path = os.path.join(resource.get_resource_location(),
                        'cluster_model_{}'.format(model.id))
    filepath = os.path.join(path, 'clusterplot.png')
    print("Saving plot to {}".format(filepath))
    fig.savefig(filepath)
Example #10
def write_cluster_labels_data(
        model, docs_labels, docids_features, update=False
        ):
    """
    @model: ClusteringModel instance
    @docs_labels:[(<doc_id>, <label_id>), ...]
    @docids_features: { <doc_id>: <features>, ... }
    @update: if False, replace the file contents; otherwise merge into the existing data
    """
    path = model.get_cluster_data_path()
    # now create labels file
    labels_path = os.path.join(path, settings.CLUSTERED_DOCS_LABELS_FILENAME)
    labels_resource = Resource(labels_path, Resource.FILE)
    # create dict
    dict_data = {x: {'label': y} for x, y in docs_labels}
    for doc_id, features in docids_features.items():
        # first make json serializable
        dict_data[doc_id]['features'] = [
            float(x) if isinstance(model.model, KMeansDoc2Vec) else x
            for x in features
        ]
    # Write docs_clusterlabels
    if update:
        data = json.loads(labels_resource.get_data())
        data.update(dict_data)
        labels_resource.write_data(json.dumps(data))
    else:
        labels_resource.write_data(json.dumps(dict_data))
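A minimal sketch of the structure this builds and writes, with hypothetical doc ids, labels, and features; note that after a json.dumps/json.loads round trip the integer doc ids come back as string keys:

import json

docs_labels = [(1, 0), (2, 1)]                     # hypothetical (doc_id, label_id) pairs
docids_features = {1: [0.1, 0.2], 2: [0.3, 0.4]}   # hypothetical doc_id -> features
dict_data = {x: {'label': y} for x, y in docs_labels}
for doc_id, features in docids_features.items():
    dict_data[doc_id]['features'] = [float(v) for v in features]
print(json.loads(json.dumps(dict_data)))
# -> {'1': {'label': 0, 'features': [0.1, 0.2]}, '2': {'label': 1, 'features': [0.3, 0.4]}}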
Example #11
def get_docs_features_labels(model, nfeatures):
    modelid = model.id
    # get location of clusters data
    resource = Resource(settings.ENVIRON_CLUSTERING_DATA_LOCATION,
                        Resource.FILE_AND_ENVIRONMENT)
    path = os.path.join(resource.get_resource_location(),
                        'cluster_model_{}'.format(modelid))
    labels_path = os.path.join(path, settings.CLUSTERED_DOCS_LABELS_FILENAME)
    labels_resource = Resource(labels_path, Resource.FILE)
    labels_data = json.loads(labels_resource.get_data())
    zipped = [(docid, data) for docid, data in labels_data.items()]

    n_clusters = model.n_clusters
    # Now get documents dimensions
    docs_features, docs_labels = [], []
    if isinstance(model.model, KMeansDoc2Vec):

        def identity(x):
            return x

        features_uncompress_function = identity
    else:
        features_uncompress_function = uncompress_compressed_vector
    for did, data in zipped:
        # first uncompress and then store
        docs_features.append(features_uncompress_function(data['features']))
        docs_labels.append(data['label'])
    reduced_features = reduce_dimensions(docs_features, nfeatures)
    print(reduced_features)
    return reduced_features, docs_labels, n_clusters
Example #12
 def get_relevant_terms(self, obj):
     cluster_data_location = settings.ENVIRON_CLUSTERING_DATA_LOCATION
     resource = Resource(cluster_data_location,
                         Resource.FILE_AND_ENVIRONMENT)
     path = os.path.join(resource.get_resource_location(),
                         'cluster_model_{}'.format(obj.id))
     relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
     relevant_resource = Resource(relevant_path, Resource.FILE)
     return json.loads(relevant_resource.get_data())
Example #13
def write_relevant_terms_data(model, relevant_terms, update=False):
    """
    @model: ClusteringModel instance
    @relevant_terms: { <cluster_label>: [<relevant_term>, ...], ...}
    @update: if False, replace the file contents; otherwise merge into the existing data
    """
    path = model.get_cluster_data_path()
    relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
    relevant_resource = Resource(relevant_path, Resource.FILE)
    if update:
        # get_data() returns the raw JSON string, so decode it before merging
        curr = json.loads(relevant_resource.get_data())
        curr.update(relevant_terms)
        relevant_resource.write_data(json.dumps(curr))
    else:
        relevant_resource.write_data(json.dumps(relevant_terms))
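For reference, a small sketch of the merge done by the update branch, with made-up cluster labels and terms (the keys are strings because the existing data comes back from json.loads):

import json

existing = {'0': ['flood', 'water']}    # hypothetical current file contents after json.loads
new_terms = {'1': ['camp', 'shelter']}  # hypothetical new relevant terms
existing.update(new_terms)
print(json.dumps(existing))
# -> {"0": ["flood", "water"], "1": ["camp", "shelter"]}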
Example #14
File: models.py Project: eoglethorpe/DEEPL
 def get_labels_data(self):
     cluster_data_location = settings.ENVIRON_CLUSTERING_DATA_LOCATION
     resource = Resource(cluster_data_location,
                         Resource.FILE_AND_ENVIRONMENT)
     # create another resource (a folder to keep the files)
     path = os.path.join(resource.get_resource_location(),
                         'cluster_model_{}'.format(self.id))
     labels_path = os.path.join(path,
                                settings.CLUSTERED_DOCS_LABELS_FILENAME)
     labels_resource = Resource(labels_path, Resource.FILE)
     data = json.loads(labels_resource.get_data())
     return data
Example #15
 def test_data_files_created(self):
     model = create_new_clusters('test', self.group_id, 2)
     path = self.get_model_path(model)
     center_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
     labels_path = os.path.join(path,
                                settings.CLUSTERED_DOCS_LABELS_FILENAME)
     relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
     center_resource = Resource(center_path, Resource.FILE)
     labels_resource = Resource(labels_path, Resource.FILE)
     relevant_resource = Resource(relevant_path, Resource.FILE)
     try:
         center_resource.validate()
     except Exception as e:
         assert False, "No center data stored. " + e.args
     else:
         data = json.loads(center_resource.get_data())
         assert isinstance(data, dict)
     try:
         labels_resource.validate()
     except Exception as e:
         assert False, "No levels data stored. " + e.args
     else:
         data = json.loads(labels_resource.get_data())
         assert isinstance(data, dict)
     try:
         relevant_resource.validate()
     except Exception as e:
         assert False, "No relevant data stored. " + e.args
     else:
         data = json.loads(relevant_resource.get_data())
         assert isinstance(data, list)
         assert not all(map(lambda x: len(x) < 2, data))
Example #16
 def test_data_files_created(self):
     # First remove existing cluster models
     ClusteringModel.objects.all().delete()
     # create one
     model = create_new_clusters('test', self.group_id, 2)
     assert model.all_clustered, "All docs should be clustered when the cluster model is newly created"  # noqa
     path = self.get_model_path(model)
     center_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
     labels_path = os.path.join(path,
                                settings.CLUSTERED_DOCS_LABELS_FILENAME)
     relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
     size_score_path = os.path.join(
         path, settings.CLUSTER_SCORE_DOCS_SIZE_FILENAME)
     size_score_fig_path = os.path.join(
         path, settings.CLUSTERS_SCORE_PLOT_FILENAME)
     center_resource = Resource(center_path, Resource.FILE)
     labels_resource = Resource(labels_path, Resource.FILE)
     relevant_resource = Resource(relevant_path, Resource.FILE)
     size_score_resource = Resource(size_score_path, Resource.FILE)
     size_score_fig_resource = Resource(size_score_fig_path, Resource.FILE)
     # check centers
     try:
         center_resource.validate()
     except Exception as e:
         assert False, "No center data stored. " + e.args
     else:
         data = json.loads(center_resource.get_data())
         assert isinstance(data, dict)
     # check labels
     try:
         labels_resource.validate()
     except Exception as e:
         assert False, "No levels data stored. " + e.args
     else:
         data = json.loads(labels_resource.get_data())
         assert isinstance(data, dict)
     # check relevant
     try:
         relevant_resource.validate()
     except Exception as e:
         assert False, "No relevant data stored. " + e.args
     else:
         data = json.loads(relevant_resource.get_data())
         assert isinstance(data, dict)
         for k, v in data.items():
             assert isinstance(v, list)
     # check size vs score
     try:
         size_score_resource.validate()
     except Exception as e:
         assert False, "No score data stored. " + e.args
     else:
         data = json.loads(size_score_resource.get_data())
         assert isinstance(data, list)
         for x in data:
             assert isinstance(x, list)
             assert len(x) == 2
     # check plot saved
     try:
         size_score_fig_resource.validate()
     except Exception as e:
         assert False, "No score plot stored. " + e.args
Example #17
File: tasks.py Project: eoglethorpe/DEEPL
def write_clustured_data_to_files(model,
                                  docs_labels,
                                  cluster_centers,
                                  docids_features,
                                  relevant_terms=None):
    """Write the doc_clusterlabels and cluster_centers to files"""
    cluster_data_location = settings.ENVIRON_CLUSTERING_DATA_LOCATION
    resource = Resource(cluster_data_location, Resource.FILE_AND_ENVIRONMENT)
    # create another resource (a folder to keep the files)
    path = os.path.join(resource.get_resource_location(),
                        'cluster_model_{}'.format(model.id))
    # create the directory
    # capture stderr so the error check below actually sees mkdir failures
    p = subprocess.Popen(['mkdir', '-p', path],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, err = p.communicate()
    if err:
        print("Couldn't create cluster data files. {}".format(err))
        return
    # now create centers file
    center_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
    center_resource = Resource(center_path, Resource.FILE)
    # convert to python float first or it won't be json serializable
    centers_data = {
        i: compress_sparse_vector([float(y) for y in x])
        for i, x in enumerate(cluster_centers)
    }
    center_resource.write_data(json.dumps(centers_data))
    # now create labels file
    labels_path = os.path.join(path, settings.CLUSTERED_DOCS_LABELS_FILENAME)
    labels_resource = Resource(labels_path, Resource.FILE)
    # create dict
    dict_data = {x: {'label': y} for x, y in docs_labels}
    for doc_id, features in docids_features.items():
        # first make json serializable
        dict_data[doc_id]['features'] = [
            float(x) if isinstance(model.model, KMeansDoc2Vec) else x
            for x in features
        ]
    # Write docs_clusterlabels
    labels_resource.write_data(json.dumps(dict_data))
    # Write relevant terms if present
    if relevant_terms is not None:
        relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
        relevant_resource = Resource(relevant_path, Resource.FILE)
        relevant_resource.write_data(json.dumps(list(relevant_terms)))
    print("Done writing data")
Example #18
 def get_centers_data(self):
     path = self.get_cluster_data_path()
     centers_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
     centers_resource = Resource(centers_path, Resource.FILE)
     data = json.loads(centers_resource.get_data())
     return data
Example #19
 def get_cluster_data_resource():
     cluster_data_location = settings.ENVIRON_CLUSTERING_DATA_LOCATION
     return Resource(cluster_data_location, Resource.FILE_AND_ENVIRONMENT)
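Taken together, the examples above follow one pattern around Resource: build a path under the clustering data location, wrap it in Resource(path, Resource.FILE), read with get_data() and write with write_data() using JSON strings. A condensed sketch of that pattern with hypothetical helper names (read_cluster_json and write_cluster_json are not part of the project), assuming, as in Example #3, that get_data() can return None when the file does not exist:

import json
import os

def read_cluster_json(model, filename):
    # hypothetical helper: load one JSON data file from the model's cluster data path
    path = os.path.join(model.get_cluster_data_path(), filename)
    raw = Resource(path, Resource.FILE).get_data()
    return json.loads(raw) if raw else None

def write_cluster_json(model, filename, data):
    # hypothetical helper: serialize data and overwrite the file
    path = os.path.join(model.get_cluster_data_path(), filename)
    Resource(path, Resource.FILE).write_data(json.dumps(data))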