def test_assign_cluster_to_doc(self):
    """Assigning one new doc to a cluster should grow the labels data by one."""
    # Start from a clean slate: exactly one freshly created cluster model.
    ClusteringModel.objects.all().delete()
    create_new_clusters(self.cluster_name, self.group_id, self.n_clusters)
    assert ClusteringModel.objects.all().count() == 1
    model = ClusteringModel.objects.last()
    # Add a classified document that the model has not clustered yet.
    doc = ClassifiedDocument.objects.create(
        classifier=self.doc_sample.classifier,
        group_id=self.doc_sample.group_id,
        text="This is another text",
        classification_label="dummy_label"
    )
    docs_before = len(model.get_labels_data().keys())
    assert get_unclustered_docs(model), "There should be 1 unclustered doc"
    assign_cluster_to_doc(doc.id)
    refreshed = ClusteringModel.objects.last()
    docs_after = len(refreshed.get_labels_data().keys())
    # also check number of docs has increased. Read from file
    assert docs_after == docs_before + 1, "Since one doc is added"
    assert refreshed.ready
def create_document_clusters(name, group_id, n_clusters, CLUSTER_CLASS=KMeansDocs, doc2vec_group_id=None, recreate=True):
    """
    Create document clusters(ClusteringModel object) based on input params

    @name: name of the model
    @group_id: group_id of the model
    @n_clusters: number of clusters to create
    @CLUSTER_CLASS: class on which the clustring(KMeans) is based
    @doc2vec_group_id: relevant if clusterclass is KMeansDoc2Vec,
        get doc2vec model and load vectors from it
    @recreate: if False, raise when a model with this group_id already exists

    Returns the model created by create_new_clusters.
    """
    # BUG FIX: the original used .get(), which raises an uncaught
    # MultipleObjectsReturned if several models share a group_id, and ran
    # the query even when recreate=True made the answer irrelevant.
    # .filter().exists() is safe for any row count.
    if not recreate and \
            ClusteringModel.objects.filter(group_id=group_id).exists():
        raise Exception(
            "Cluster model with group_id {} already exists".format(
                group_id))
    # create new clustering model
    return create_new_clusters(name, group_id, n_clusters, CLUSTER_CLASS,
                               doc2vec_group_id)
def test_with_valid_doc_id(self):
    """A valid doc_id should yield a list of integer similar-doc ids."""
    # clusters must exist before the endpoint can be queried
    create_new_clusters("test_cluster", self.group_id, 2)
    response = self.client.post(self.url, {'doc_id': self.doc_id})
    assert response.status_code == 200
    payload = response.json()
    assert 'similar_docs' in payload
    # every returned id must be an int
    for similar_id in payload['similar_docs']:
        assert isinstance(similar_id, int)
def test_valid_doc_and_doc_id(self):
    """Posting raw doc text plus group_id should yield integer similar-doc ids."""
    # clusters must exist before the endpoint can be queried
    create_new_clusters("test_cluster", self.group_id, 2)
    request_body = {
        'doc': 'aeroplane, pilot prime minister',
        'group_id': self.group_id
    }
    response = self.client.post(self.url, request_body)
    assert response.status_code == 200
    payload = response.json()
    assert 'similar_docs' in payload
    for similar_id in payload['similar_docs']:
        assert isinstance(similar_id, int)
def test_data_files_created(self):
    """Creating a cluster model must persist centers, labels and relevant-terms files."""
    model = create_new_clusters('test', self.group_id, 2)
    path = self.get_model_path(model)
    center_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
    labels_path = os.path.join(path,
                               settings.CLUSTERED_DOCS_LABELS_FILENAME)
    relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
    center_resource = Resource(center_path, Resource.FILE)
    labels_resource = Resource(labels_path, Resource.FILE)
    relevant_resource = Resource(relevant_path, Resource.FILE)
    # check centers file exists and holds a dict
    try:
        center_resource.validate()
    except Exception as e:
        # BUG FIX: the original concatenated str with e.args (a tuple),
        # which raised TypeError instead of the intended message.
        assert False, "No center data stored. " + str(e)
    else:
        data = json.loads(center_resource.get_data())
        assert isinstance(data, dict)
    # check labels file exists and holds a dict
    try:
        labels_resource.validate()
    except Exception as e:
        assert False, "No levels data stored. " + str(e)
    else:
        data = json.loads(labels_resource.get_data())
        assert isinstance(data, dict)
    # check relevant-terms file exists and holds a list of multi-term entries
    try:
        relevant_resource.validate()
    except Exception as e:
        assert False, "No relevant data stored. " + str(e)
    else:
        data = json.loads(relevant_resource.get_data())
        assert isinstance(data, list)
        assert not all(map(lambda x: len(x) < 2, data))
def test_cluster_data_not_fully_clustered(self):
    """Test by sending valid data"""
    # start clean, then build one cluster model
    ClusteringModel.objects.all().delete()
    cluster_model = create_new_clusters("test", self.group_id,
                                        self.num_clusters)
    # an extra classified doc means the model is no longer fully clustered
    ClassifiedDocument.objects.create(
        text="test test test",
        group_id=self.group_id,
        classifier=self.classifier
    )
    resp = self.client.get(self.url, {'cluster_model_id': cluster_model.id})
    assert resp.status_code == 200
    payload = resp.json()
    for key in ('keywords', 'docs', 'full_clustered'):
        assert key in payload
    assert not payload['full_clustered'], "If doc is added, model should not be fully clustered"  # noqa
    assert isinstance(payload['keywords'], list)
    # every keyword entry must carry cluster, score and value fields
    for entry in payload['keywords']:
        assert isinstance(entry, dict)
        assert 'cluster' in entry
        assert 'score' in entry
        assert 'value' in entry
def test_cluster_data_fully_clustered(self):
    """Test by sending valid data"""
    # start clean, then build one cluster model with no extra docs
    ClusteringModel.objects.all().delete()
    cluster_model = create_new_clusters("test", self.group_id,
                                        self.num_clusters)
    resp = self.client.get(self.url, {'cluster_model_id': cluster_model.id})
    assert resp.status_code == 200
    payload = resp.json()
    assert 'full_clustered' in payload
    assert payload['full_clustered'], "Recently created model should be fully clustered"  # noqa
    assert isinstance(payload, dict)
    assert 'keywords' in payload
    assert 'docs' in payload
    assert isinstance(payload['docs'], dict)
    # keyword entries carry cluster, score and value fields
    for entry in payload['keywords']:
        assert isinstance(entry, dict)
        assert 'cluster' in entry
        assert 'score' in entry
        assert 'value' in entry
    # each cluster label maps to a non-empty list of integer doc ids
    for label, doc_ids in payload['docs'].items():
        assert isinstance(doc_ids, list)
        assert doc_ids, "Docs should not be empty for a cluster"
        for doc_id in doc_ids:
            assert isinstance(doc_id, int)
def test_recluster(self):
    """Reclustering should refresh timestamps and keep score-vs-size data."""
    # ensure exactly one cluster model exists
    ClusteringModel.objects.all().delete()
    create_new_clusters(self.cluster_name, self.group_id, self.n_clusters)
    assert ClusteringModel.objects.all().count() == 1
    old_model = ClusteringModel.objects.last()
    recluster(old_model)
    fresh = ClusteringModel.objects.last()
    assert fresh.ready
    assert fresh.last_clustering_started \
        >= old_model.last_clustering_started
    assert fresh.last_clustered_on >= old_model.last_clustered_on
    # also check size vs cluster score file created
    score_size = old_model.get_cluster_score_vs_size_data()
    assert score_size is not None
    assert score_size != []
    assert isinstance(score_size, list)
def test_new_cluster_created(self):
    """A new cluster model is created, ready, and backed by on-disk data."""
    assert ClusteringModel.objects.all().count() == 0
    create_new_clusters(self.cluster_name, self.group_id, self.n_clusters)
    assert ClusteringModel.objects.all().count() == 1
    created = ClusteringModel.objects.last()
    assert created.group_id == self.group_id
    assert created.ready
    # the model's data directory should exist on disk
    model_dir = os.path.join(
        self.test_cluster_data_dir,
        "cluster_model_{}".format(created.id)
    )
    assert os.path.isdir(model_dir)
    # also check size vs cluster score file created
    score_data = created.get_cluster_score_vs_size_data()
    assert score_data is not None
    assert score_data != []
    assert isinstance(score_data, list)
def test_clustered_prepared_resposne(self):
    """API should answer 201 with the id of the already-created cluster model."""
    # create a clustered model up front
    existing = create_new_clusters("test_cluster", self.group_id, 2)
    response = self.client.post(self.api_url, self.valid_params)
    assert response.status_code == 201
    payload = response.json()
    assert 'cluster_model_id' in payload
    assert isinstance(payload['cluster_model_id'], int)
    assert payload['cluster_model_id'] == existing.id
def test_all_clustered_if_docs_added(self):
    """all_clustered must flip to False as soon as a new doc is added."""
    # remove any pre-existing cluster models, then create one
    ClusteringModel.objects.all().delete()
    created = create_new_clusters('test', self.group_id, 2)
    assert created.all_clustered, "All docs should be clustered while cluster model is created new"  # noqa
    # a new classified document in the same group invalidates full clustering
    ClassifiedDocument.objects.create(
        text="test text",
        classifier=self.classifier,
        group_id=self.group_id
    )
    # re-fetch to observe the updated flag
    refetched = ClusteringModel.objects.get(id=created.id)
    assert not refetched.all_clustered, "all_clustered should be false whenever a new doc is added"  # noqa
def test_cluster_data_not_ready(self):
    """Endpoint should answer 202 with a message while the model is not ready."""
    # remove all clusters and create one flagged as not ready
    ClusteringModel.objects.all().delete()
    not_ready = create_new_clusters("test", self.group_id,
                                    self.num_clusters)
    not_ready.ready = False
    not_ready.save()
    resp = self.client.get(self.url, {'cluster_model_id': not_ready.id})
    assert resp.status_code == 202
    assert 'message' in resp.json()
def test_update_cluster(self):
    """update_clusters() should fold newly added docs into the model."""
    # start with exactly one freshly built cluster model
    ClusteringModel.objects.all().delete()
    create_new_clusters(self.cluster_name, self.group_id, self.n_clusters)
    assert ClusteringModel.objects.all().count() == 1
    model = ClusteringModel.objects.last()
    # add a document the model has not clustered yet
    ClassifiedDocument.objects.create(
        classifier=self.doc_sample.classifier,
        group_id=self.doc_sample.group_id,
        text="This is another text",
        classification_label="dummy_label"
    )
    docs_before = len(model.get_labels_data().keys())
    assert get_unclustered_docs(model), "There should be 1 unclustered doc"
    # snapshot score-vs-size data around the update
    score_size_before = model.get_cluster_score_vs_size_data()
    update_clusters()
    score_size_after = model.get_cluster_score_vs_size_data()
    # one new entry is appended; the older entries are preserved in order
    assert len(score_size_before) + 1 == len(score_size_after)
    assert score_size_after[-2] == score_size_before[-1]
    updated = ClusteringModel.objects.last()
    docs_after = len(updated.get_labels_data().keys())
    # also check number of docs has increased. Read from file
    assert docs_after == docs_before + 1, "Since one doc is added"
    assert updated.ready
    assert updated.last_clustering_started > model.last_clustering_started
    assert updated.last_clustered_on > model.last_clustered_on
def test_get_cluster(self):
    """GET on the cluster api returns score, doc ids, terms and group id."""
    cluster_model = create_new_clusters("test_cluster", self.group_id, 2)
    params = {'model_id': cluster_model.id}
    response = self.client.get(self.api_url, params)
    assert response.status_code == 200
    data = response.json()
    assert 'score' in data
    # clustering score is expected in [-1, 1]
    assert data['score'] >= -1 and data['score'] <= 1
    assert 'doc_ids' in data
    assert isinstance(data['doc_ids'], list)
    for did in data['doc_ids']:
        # BUG FIX: the original dropped the `assert` keyword here, so the
        # isinstance result was discarded and the check was a no-op.
        assert isinstance(did, int)
    assert 'relevant_terms' in data
    for term in data['relevant_terms']:
        assert isinstance(term, str)
    assert 'group_id' in data
    assert isinstance(data['group_id'], str)
def setUp(self):
    """Prepare a clusters data directory, request params and a cluster model."""
    self.cluster_data_path = 'test_clusters/'
    # IMPROVED: os.makedirs replaces os.system('mkdir -p ...') — no shell
    # subprocess, portable, and failures raise instead of being ignored.
    os.makedirs(self.cluster_data_path, exist_ok=True)
    # point the clustering code at the test data directory
    os.environ[settings.ENVIRON_CLUSTERING_DATA_LOCATION] = \
        self.cluster_data_path
    # set values
    self.group_id = '1'
    self.num_clusters = 2
    self.url = '/api/re-cluster/'
    self.valid_params = {
        'group_id': self.group_id,
        'num_clusters': self.num_clusters
    }
    self.cluster_model = create_new_clusters("test_cluster", self.group_id,
                                             self.num_clusters)
    # this creates token and adds that to client header
    super().setUp()
def test_data_files_created(self):
    """A new cluster model must persist all of its data files on disk."""
    # First remove existing cluster models
    ClusteringModel.objects.all().delete()
    model = create_new_clusters('test', self.group_id, 2)
    assert model.all_clustered, "All docs should be clustered while cluster model is created new"  # noqa
    path = self.get_model_path(model)
    center_path = os.path.join(path, settings.CLUSTERS_CENTERS_FILENAME)
    labels_path = os.path.join(path,
                               settings.CLUSTERED_DOCS_LABELS_FILENAME)
    relevant_path = os.path.join(path, settings.RELEVANT_TERMS_FILENAME)
    size_score_path = os.path.join(
        path, settings.CLUSTER_SCORE_DOCS_SIZE_FILENAME)
    size_score_fig_path = os.path.join(
        path, settings.CLUSTERS_SCORE_PLOT_FILENAME)
    center_resource = Resource(center_path, Resource.FILE)
    labels_resource = Resource(labels_path, Resource.FILE)
    relevant_resource = Resource(relevant_path, Resource.FILE)
    size_score_resource = Resource(size_score_path, Resource.FILE)
    size_score_fig_resource = Resource(size_score_fig_path, Resource.FILE)
    # check centers
    try:
        center_resource.validate()
    except Exception as e:
        # BUG FIX: the original concatenated str with e.args (a tuple),
        # which raised TypeError instead of the intended message.
        assert False, "No center data stored. " + str(e)
    else:
        data = json.loads(center_resource.get_data())
        assert isinstance(data, dict)
    # check labels
    try:
        labels_resource.validate()
    except Exception as e:
        assert False, "No levels data stored. " + str(e)
    else:
        data = json.loads(labels_resource.get_data())
        assert isinstance(data, dict)
    # check relevant
    try:
        relevant_resource.validate()
    except Exception as e:
        assert False, "No relevant data stored. " + str(e)
    else:
        data = json.loads(relevant_resource.get_data())
        assert isinstance(data, dict)
        for k, v in data.items():
            assert isinstance(v, list)
    # check size vs score
    try:
        size_score_resource.validate()
    except Exception as e:
        # BUG FIX: this message also contained a stray literal newline
        assert False, "No score data stored. " + str(e)
    else:
        data = json.loads(size_score_resource.get_data())
        assert isinstance(data, list)
        for x in data:
            assert isinstance(x, list)
            assert len(x) == 2
    # check plot saved
    try:
        size_score_fig_resource.validate()
    except Exception as e:
        assert False, "No score plot stored. " + str(e)