def clusters_sizes():
    """Render the per-cluster size buttons for one experiment.

    Reads ``experiment_id`` from the query string (defaults to 0), counts
    how many result rows carry each cluster label, and renders the
    ``cluster_buttons.html`` fragment with those counts.
    """
    experiment_id = request.args.get('experiment_id', 0, type=int)
    Result = get_result_object(experiment_id)

    rows = db_session.query(Result).filter(
        Result.experiment_id == experiment_id)

    # Tally how many sentences landed in each cluster label.
    counts = defaultdict(int)
    for row in rows:
        counts[row.label] += 1

    return render_template('cluster_buttons.html',
                           label_counts=counts,
                           experiment_id=experiment_id,
                           all_clusters_label=ALL_CLUSTERS_LABEL)
def construct_stats(query, experiment_id):
    """Build display strings summarising evaluations for *query*.

    Args:
        query: a SQLAlchemy query over the experiment's Result rows.
        experiment_id: id used to resolve the per-experiment Result class.

    Returns:
        dict with 'positive', 'negative' and 'observed' display strings;
        the first two include a Bernoulli confidence interval.
    """
    total = query.count()
    Result = get_result_object(experiment_id)
    # `== True` / `== False` are intentional: SQLAlchemy overloads `==`
    # to build SQL expressions, so `is True` would not work here.
    positive = query.filter(Result.evaluation == True).count()
    negative = query.filter(Result.evaluation == False).count()
    observed = positive + negative
    positive_bernoulli = bernoulli_trial_probability(positive, observed)
    negative_bernoulli = bernoulli_trial_probability(negative, observed)
    result = {}
    result['positive'] = "{} / {} ({:.2f} - {:.2f})".format(
        positive, observed, *positive_bernoulli)
    result['negative'] = "{} / {} ({:.2f} - {:.2f})".format(
        negative, observed, *negative_bernoulli)
    # FIX: guard the percentage — an empty query (total == 0) previously
    # raised ZeroDivisionError. Report 0 % for an empty result set.
    percent = observed / total * 100 if total else 0.0
    result['observed'] = "{} / {} ({:.0f} %)".format(observed, total, percent)
    return result
def set_evaluation():
    """Record a manual evaluation for one result row.

    Stores the posted evaluation on the addressed row, then propagates it
    to the matching sentence in every ancestor experiment (which must
    contain the sentence) and every descendant experiment (which may not —
    propagation stops at the first descendant missing the sentence).
    """
    result_id = request.form['result_id']
    evaluation = request.form['evaluation']
    experiment_id = request.form['experiment_id']
    Result = get_result_object(experiment_id)

    # An empty form value means "clear the evaluation".
    evaluation = evaluation or None

    row = db_session.query(Result).filter(Result.id == result_id).one()
    row.evaluation = evaluation
    sentence_id = row.sentence_id

    def linked_experiment_id(exp_id, attr):
        # Follow one parent/child link of the experiment chain.
        experiment = db_session.query(Experiment).filter(
            Experiment.id == exp_id).one()
        return getattr(experiment, attr)

    # Bubble up the evaluation to all parent experiments.
    ancestor_id = linked_experiment_id(row.experiment_id, 'parent_id')
    while ancestor_id:
        ancestor_row = db_session.query(Result).filter(
            Result.experiment_id == ancestor_id,
            Result.sentence_id == sentence_id).one()
        ancestor_row.evaluation = evaluation
        ancestor_id = linked_experiment_id(
            ancestor_row.experiment_id, 'parent_id')

    # Float down the evaluation to all child experiments.
    descendant_id = linked_experiment_id(row.experiment_id, 'child_id')
    while descendant_id:
        descendant_row = db_session.query(Result).filter(
            Result.experiment_id == descendant_id,
            Result.sentence_id == sentence_id).one_or_none()
        if descendant_row is None:
            break
        descendant_row.evaluation = evaluation
        descendant_id = linked_experiment_id(
            descendant_row.experiment_id, 'child_id')

    db_session.commit()
    return jsonify(success=1)
def save_labels(self, sentence_indexes, labels):
    """Persist one Result row per (sentence, cluster label) pair.

    Evaluations already recorded on the parent experiment (if any) are
    carried over to the matching sentences; sentences without a parent
    evaluation are stored with ``None``. Also records the number of
    distinct clusters on the experiment.
    """
    print('saving labels')
    Result = get_result_object(self.experiment.id)

    # Inherit evaluations from the parent experiment, keyed by sentence id.
    inherited = {}
    if self.experiment.parent_id:
        parent_rows = db_session.query(Result).filter(
            Result.experiment_id == self.experiment.parent_id).all()
        inherited = {r.sentence_id: r.evaluation for r in parent_rows}

    for index, label in zip(sentence_indexes, labels):
        sid = int(index)
        db_session.add(
            Result(experiment_id=self.experiment.id,
                   label=int(label),
                   sentence_id=sid,
                   evaluation=inherited.get(sid)))

    self.experiment.clusters_count = len(set(labels))
    db_session.commit()
def get_statistics():
    """Render the statistics panel for an experiment.

    Produces up to three stat blocks: the current experiment, the selected
    cluster (only when a specific label is requested), and the root
    ancestor experiment (only when the experiment has a parent chain).
    """
    experiment_id = request.args.get('experiment_id', 0, type=int)
    label = request.args.get('label', ALL_CLUSTERS_LABEL, type=int)
    Result = get_result_object(experiment_id)

    # Stats over every result of this experiment.
    experiment_stats = construct_stats(
        db_session.query(Result).filter(
            Result.experiment_id == experiment_id),
        experiment_id)

    # Stats restricted to one cluster, when a specific label is selected.
    cluster_stats = None
    if label != ALL_CLUSTERS_LABEL:
        cluster_stats = construct_stats(
            db_session.query(Result).filter(
                Result.experiment_id == experiment_id,
                Result.label == label),
            experiment_id)

    # Walk the parent chain to find the root ancestor, if one exists.
    root_id = None
    ancestor_id = db_session.query(Experiment).filter(
        Experiment.id == experiment_id).one().parent_id
    while ancestor_id:
        root_id = ancestor_id
        ancestor_id = db_session.query(Experiment).filter(
            Experiment.id == ancestor_id).one().parent_id

    total_stats = None
    if root_id:
        total_stats = construct_stats(
            db_session.query(Result).filter(
                Result.experiment_id == root_id),
            experiment_id)

    return render_template('get_statistics.html',
                           total_stats=total_stats,
                           experiment_stats=experiment_stats,
                           cluster_stats=cluster_stats)
def run(self):
    """Execute one clustering experiment end to end.

    Creates the Experiment row, loads/derives the input arrays according
    to the input type ('Extractor', 'Cluster', 'Raw Data'/'Sports Data'),
    fits the clustering model, saves the resulting labels, and records
    timing/status on the Experiment row. On any failure the experiment is
    marked 'error', a Traceback row is stored, and the exception re-raised.
    """
    try:
        # Log the self.experiment
        start_time = time.time()
        self.experiment = Experiment(input_type=self.input_type_id,
                                     algorithm=self.algorithm_id,
                                     processing=self.processing_method_id,
                                     start_time=func.current_timestamp(),
                                     status='running')
        db_session.add(self.experiment)
        # Commit early so the experiment id exists for child rows below.
        db_session.commit()
        # START PREPROCESSING
        if self.input_type_name == 'Extractor':
            self.experiment.regex_name = self.regex_name_id
            self.experiment.regex_pattern = self.regex_pattern_id
            db_session.commit()
            # Reuse arrays from a previous experiment with the same regex,
            # or compute them fresh (behaviour of the helper; defined elsewhere).
            arrays = self.similar_experiment_arrays(
                filter_args=[(Experiment.regex_name, self.regex_name_id),
                             (Experiment.regex_pattern, self.regex_pattern_id)])
            cosine_similarities = arrays['cosine_similarities']
            sentence_indexes = arrays['sentence_indexes']
            features = arrays['features']
            self.experiment.lines = len(sentence_indexes)
        if self.input_type_name == 'Cluster':
            # Re-cluster one cluster of a parent experiment: link the two
            # experiments and slice the parent's cached arrays down to the
            # sentences carrying the selected parent label.
            parent = db_session.query(Experiment).filter(
                Experiment.id == self.parent_id).one()
            parent.child_id = self.experiment.id
            self.experiment.parent_id = self.parent_id
            self.experiment.parent_label = self.parent_label
            self.experiment.regex_name = parent.regex_name
            self.experiment.regex_pattern = parent.regex_pattern
            cached_arrays_id = parent.cached_arrays_id
            self.experiment.cached_arrays_id = cached_arrays_id
            # cached_arrays = db_session.query(CachedArrays).filter(CachedArrays.id==cached_arrays_id).one()
            # NOTE(review): CACHED_ARRAYS looks like an open HDF5-style
            # store keyed by id (it is also .close()d below) — confirm.
            cached_arrays = CACHED_ARRAYS[cached_arrays_id]
            arrays = self.h5_data_to_numpy(cached_arrays)
            cosine_similarities = arrays['cosine_similarities']
            sentence_indexes = arrays['sentence_indexes']
            features = arrays['features']
            Result = get_result_object(parent.id)
            rows = db_session.query(Result).filter(
                Result.experiment_id == self.parent_id,
                Result.label == self.parent_label)
            # Map each parent-result sentence id to its position in the
            # cached arrays so we can slice them consistently.
            cluster_indexes = []
            sentence_indexes_list = sentence_indexes.tolist()
            for row in rows:
                cluster_indexes.append(
                    sentence_indexes_list.index(row.sentence_id))
            # Keep only the rows AND columns of the selected cluster.
            cosine_similarities = cosine_similarities[
                cluster_indexes][:, cluster_indexes]
            sentence_indexes = sentence_indexes[cluster_indexes]
            if self.algorithm_name == 'DecisionTree':
                features = features[cluster_indexes, :]
            self.experiment.lines = len(sentence_indexes)
        if self.input_type_name == 'Raw Data' or self.input_type_name == 'Sports Data':
            print(self.input_type_name)
            # NOTE(review): hard-coded line count for these input types —
            # presumably the size of the bundled dataset; confirm.
            self.experiment.lines = 602
            db_session.commit()
            arrays = self.similar_experiment_arrays(
                filter_args=[(Experiment.lines, self.experiment.lines)])
            cosine_similarities = arrays['cosine_similarities']
            sentence_indexes = arrays['sentence_indexes']
            features = arrays['features']
        db_session.commit()
        # raise Exception('debugging')
        # END PREPROCESSING
        # Log preprocessing duration.
        self.experiment.preprocessing_seconds = time.time() - start_time
        db_session.commit()
        # START CLUSTERING
        print('starting clustering')
        clustering_start = time.time()
        # Cap the cluster count at the number of available sentences.
        self.CLUSTERS = min(20, len(sentence_indexes))
        model = self.get_model()
        if self.algorithm_name != 'DecisionTree':
            model.fit(cosine_similarities)
        else:
            # DecisionTree additionally consumes the feature matrix.
            model.fit(cosine_similarities, features)
        self.save_labels(sentence_indexes, model.labels_)
        # END CLUSTERING
        # Log clustering duration.
        self.experiment.clustering_seconds = time.time() - clustering_start
        self.experiment.status = 'finished'
        db_session.commit()
        logger.debug('finished')
    except Exception as e:
        # Mark the experiment failed and persist the error before re-raising.
        self.experiment.status = 'error'
        db_session.add(
            Traceback(experiment_id=self.experiment.id, message=str(e)))
        logger.exception(str(e))
        db_session.commit()
        CACHED_ARRAYS.close()
        raise e