def sentences_work_extracted_number(self):
    print('sentences_work_extracted_number')
    regex_name = get_model_attr_by_id(RegexName, 'name', self.regex_name_id)
    regex_pattern = get_model_attr_by_id(RegexPattern, 'pattern',
                                         self.regex_pattern_id)
    if regex_pattern == '*':
        rows = db_session.query(WorkExtractedNumber).filter(
            WorkExtractedNumber.regex_name == regex_name).all()
    else:
        rows = db_session.query(WorkExtractedNumber)\
            .filter(WorkExtractedNumber.regex_name == regex_name,
                    WorkExtractedNumber.regex_pattern == regex_pattern)\
            .all()
    # Remove duplicates: normalize whitespace and case before comparing.
    sentences_no_dups = []
    sentence_indexes = []
    for row in rows:
        sentence = '%s %s' % (row.left_context, row.right_context)
        sentence = sentence.replace('\t', '').replace('\n', '').strip().lower()
        if sentence not in sentences_no_dups:
            sentences_no_dups.append(sentence)
            sentence_indexes.append(row.id)
    return sentences_no_dups, sentence_indexes
def get_root_experiment_input_type(experiment_id):
    experiment = db_session.query(Experiment).filter(
        Experiment.id == experiment_id).one()
    # Walk up the parent chain until we reach the root experiment.
    parent_experiment_id = experiment.parent_id
    while parent_experiment_id:
        experiment = db_session.query(Experiment).filter(
            Experiment.id == parent_experiment_id).one()
        parent_experiment_id = experiment.parent_id
    input_type_id = experiment.input_type
    input_type_name = get_model_attr_by_id(InputType, 'name', input_type_id)
    return input_type_name
def similar_experiment_arrays(self, filter_args):
    similar_experiment_query = db_session.query(Experiment)\
        .filter(Experiment.id != self.experiment.id,
                Experiment.input_type == self.input_type_id,
                Experiment.processing == self.processing_method_id,
                Experiment.cached_arrays_id != None)
    for args in filter_args:
        similar_experiment_query = similar_experiment_query.filter(
            args[0] == args[1])
    similar_experiment = similar_experiment_query.first()
    if similar_experiment:
        print('found similar experiment, getting arrays from db')
        cache_id = similar_experiment.cached_arrays_id
        cached_arrays = CACHED_ARRAYS[str(cache_id)]
        self.experiment.cached_arrays_id = cache_id
    else:
        print('no such experiment, making new arrays')
        sentences, sentence_indexes = self.get_indexed_sentences()
        cached_arrays = self.construct_cached_arrays(
            sentences, sentence_indexes)
    arrays = self.h5_data_to_numpy(cached_arrays)
    return arrays
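# Note on `filter_args`: each entry is a (column, value) pair appended as an
# equality filter, e.g. run() below passes
#     [(Experiment.regex_name, self.regex_name_id),
#      (Experiment.regex_pattern, self.regex_pattern_id)]
# for Extractor experiments and [(Experiment.lines, self.experiment.lines)]
# for raw/sports data, so cached arrays are only reused between experiments
# built from the same input.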
def sentences_extracted_number(self, model_class):
    print('sentences_extracted_number')
    result = db_session.query(model_class).limit(self.experiment.lines)
    sentences = []
    sentence_indexes = []
    for row in result:
        sentences.append("{} {}".format(row.left_context, row.right_context))
        sentence_indexes.append(row.id)
    return sentences, sentence_indexes
def get_statistics():
    experiment_id = request.args.get('experiment_id', 0, type=int)
    label = request.args.get('label', ALL_CLUSTERS_LABEL, type=int)
    Result = get_result_object(experiment_id)

    # Experiment stats
    query = db_session.query(Result).filter(
        Result.experiment_id == experiment_id)
    experiment_stats = construct_stats(query, experiment_id)

    # Cluster stats
    if label != ALL_CLUSTERS_LABEL:
        query = db_session.query(Result).filter(
            Result.experiment_id == experiment_id, Result.label == label)
        cluster_stats = construct_stats(query, experiment_id)
    else:
        cluster_stats = None

    # Total stats: walk up to the root experiment, if there is one.
    parent_experiment_id = db_session.query(Experiment).filter(
        Experiment.id == experiment_id).one().parent_id
    root_experiment_id = None
    while parent_experiment_id:
        root_experiment_id = parent_experiment_id
        parent_experiment_id = db_session.query(Experiment).filter(
            Experiment.id == parent_experiment_id).one().parent_id
    if root_experiment_id:
        query = db_session.query(Result).filter(
            Result.experiment_id == root_experiment_id)
        total_stats = construct_stats(query, experiment_id)
    else:
        total_stats = None

    return render_template('get_statistics.html',
                           total_stats=total_stats,
                           experiment_stats=experiment_stats,
                           cluster_stats=cluster_stats)
def get_latest_experiments():
    records = db_session.query(Experiment, InputType, Algorithm, RegexName,
                               RegexPattern, Processing, Traceback)\
        .join(InputType)\
        .join(Algorithm)\
        .join(RegexPattern, isouter=True)\
        .join(RegexName, isouter=True)\
        .join(Processing)\
        .join(Traceback, isouter=True)\
        .order_by(Experiment.start_time.desc())\
        .limit(LATEST_EXPERIMENTS_COUNT)
    result = []
    for rec in records:
        if rec.Traceback:
            traceback_message = rec.Traceback.message
        else:
            traceback_message = ''
        if rec.RegexName:
            regex_name = rec.RegexName.name
            regex_pattern = rec.RegexPattern.pattern
        else:
            regex_name = ''
            regex_pattern = ''
        attributes = {
            'id': rec.Experiment.id,
            'input_type_id': rec.Experiment.input_type,
            'input_type': rec.InputType.name,
            'algorithm_id': rec.Experiment.algorithm,
            'algorithm': rec.Algorithm.name,
            'regex_name_id': rec.Experiment.regex_name,
            'regex_name': regex_name,
            'regex_pattern_id': rec.Experiment.regex_pattern,
            'regex_pattern': regex_pattern,
            'parent_experiment_id': rec.Experiment.parent_id,
            'parent_experiment_label': rec.Experiment.parent_label,
            'processing_id': rec.Experiment.processing,
            'processing': rec.Processing.name,
            'status': rec.Experiment.status,
            'start_time': rec.Experiment.start_time,
            'lines': rec.Experiment.lines,
            'clusters_count': rec.Experiment.clusters_count,
            'traceback': traceback_message
        }
        result.append(attributes)
    return result
def clusters_sizes():
    experiment_id = request.args.get('experiment_id', 0, type=int)
    Result = get_result_object(experiment_id)
    records = db_session.query(Result).filter(
        Result.experiment_id == experiment_id)
    label_counts = defaultdict(int)
    for rec in records:
        label_counts[rec.label] += 1
    return render_template('cluster_buttons.html',
                           label_counts=label_counts,
                           experiment_id=experiment_id,
                           all_clusters_label=ALL_CLUSTERS_LABEL)
def set_evaluation():
    result_id = request.form['result_id']
    evaluation = request.form['evaluation']
    experiment_id = request.form['experiment_id']
    Result = get_result_object(experiment_id)
    if not evaluation:
        evaluation = None
    result_row = db_session.query(Result).filter(Result.id == result_id).one()
    result_row.evaluation = evaluation
    sentence_id = result_row.sentence_id

    # Bubble up the evaluation to all parent experiments.
    parent_experiment_id = db_session.query(Experiment).filter(
        Experiment.id == result_row.experiment_id).one().parent_id
    while parent_experiment_id:
        parent_row = db_session.query(Result)\
            .filter(Result.experiment_id == parent_experiment_id,
                    Result.sentence_id == sentence_id)\
            .one()
        parent_row.evaluation = evaluation
        parent_experiment_id = db_session.query(Experiment).filter(
            Experiment.id == parent_row.experiment_id).one().parent_id

    # Float down the evaluation to all child experiments.
    child_experiment_id = db_session.query(Experiment).filter(
        Experiment.id == result_row.experiment_id).one().child_id
    while child_experiment_id:
        child_row = db_session.query(Result)\
            .filter(Result.experiment_id == child_experiment_id,
                    Result.sentence_id == sentence_id)\
            .one_or_none()
        if not child_row:
            break
        child_row.evaluation = evaluation
        child_experiment_id = db_session.query(Experiment).filter(
            Experiment.id == child_row.experiment_id).one().child_id

    db_session.commit()
    return jsonify(success=1)
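# The two loops above are mirror images of each other. A possible refactor (a
# sketch only; not used elsewhere in this module) walks the experiment chain
# in either direction with a single generator:
def walk_experiment_chain(experiment_id, direction):
    """Yield experiment ids following `direction`, one of 'parent_id' or
    'child_id', starting from (but not including) `experiment_id`."""
    experiment = db_session.query(Experiment).filter(
        Experiment.id == experiment_id).one()
    next_id = getattr(experiment, direction)
    while next_id:
        yield next_id
        experiment = db_session.query(Experiment).filter(
            Experiment.id == next_id).one()
        next_id = getattr(experiment, direction)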
def save_labels(self, sentence_indexes, labels):
    print('saving labels')
    Result = get_result_object(self.experiment.id)
    # Get the parent experiment evaluations.
    evaluations = defaultdict(lambda: None)
    if self.experiment.parent_id:
        parent_results = db_session.query(Result).filter(
            Result.experiment_id == self.experiment.parent_id).all()
        for parent_result in parent_results:
            evaluations[parent_result.sentence_id] = parent_result.evaluation
    for sentence_index, label in zip(sentence_indexes, labels):
        sentence_id = int(sentence_index)
        db_session.add(
            Result(experiment_id=self.experiment.id,
                   label=int(label),
                   sentence_id=sentence_id,
                   evaluation=evaluations[sentence_id]))
    self.experiment.clusters_count = len(set(labels))
    db_session.commit()
def get_model_attr_by_id(model, attribute, index):
    rec = db_session.query(model).filter(model.id == index).one()
    return getattr(rec, attribute)
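# Example usage (the id value here is hypothetical):
#     input_type_name = get_model_attr_by_id(InputType, 'name', 1)
# Note that .one() raises if the id does not match exactly one row, so
# callers are expected to pass ids that exist.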
def get_sample():
    sample_type = request.args.get('type', type=str)
    experiment_id = request.args.get('experiment_id', 0, type=int)
    label = request.args.get('label', 0, type=int)
    sample_size = request.args.get('sample_size', 0, type=int)
    sample_all = request.args.get('sample_all', 0, type=int)
    sample_observed = request.args.get('sample-observed', 0, type=int)
    filtering_type = request.args.get('filtering-type', type=str)
    left_filter = request.args.get('left_filter', type=str)
    content_filter = request.args.get('content_filter', type=str)
    right_filter = request.args.get('right_filter', type=str)

    InputDocument, Result = get_input_models(experiment_id)
    query = db_session.query(Result, InputDocument).join(InputDocument)
    if label == ALL_CLUSTERS_LABEL:
        query = query.filter(Result.experiment_id == experiment_id)
    else:
        query = query.filter(Result.experiment_id == experiment_id,
                             Result.label == label)
    if sample_observed:
        query = query.filter((Result.evaluation == True)
                             | (Result.evaluation == False))
    else:
        query = query.filter(Result.evaluation == None)
    records = query.all()

    if left_filter or content_filter or right_filter:
        filtered_records = []
        # For 'or' filtering every match flag starts False; for 'and'
        # filtering, fields without a filter start True so they cannot veto
        # the match. Default to 'or' semantics if no filtering type is given.
        if filtering_type == 'and':
            filter_base = [
                not left_filter, not content_filter, not right_filter
            ]
        else:
            filter_base = [False, False, False]
        for rec in records:
            filter_matches = list(filter_base)
            if left_filter and re.search(
                    left_filter,
                    getattr(rec, InputDocument.__name__).left_context.lower()):
                filter_matches[0] = True
            if content_filter and re.search(
                    content_filter,
                    getattr(rec, InputDocument.__name__).content.lower()):
                filter_matches[1] = True
            if right_filter and re.search(
                    right_filter,
                    getattr(rec,
                            InputDocument.__name__).right_context.lower()):
                filter_matches[2] = True
            if filtering_type == 'and':
                match = all(filter_matches)
            else:
                match = any(filter_matches)
            if match:
                filtered_records.append(rec)
        records = filtered_records
        filtered_size = len(records)
    else:
        filtered_size = 0

    if sample_type == 'random':
        random.shuffle(records)
        if not sample_all:
            records = records[:sample_size]
    if sample_type == 'heterogenous':
        cached_arrays_id = db_session.query(Experiment).filter(
            Experiment.id == experiment_id).one().cached_arrays_id
        # h5py group keys are strings, so cast the id as is done elsewhere.
        with h5py.File("cached_arrays.hdf5", "r") as f:
            cached_arrays = f[str(cached_arrays_id)]
            cosine_similarities = np.array(
                cached_arrays['cosine_similarities'])
            sentence_indexes = np.array(cached_arrays['sentence_indexes'])
        cluster_indexes = []
        for rec in records:
            cluster_indexes.append(
                np.where(sentence_indexes == getattr(
                    rec, Result.__name__).sentence_id)[0][0])
        cosine_similarities = cosine_similarities[
            cluster_indexes][:, cluster_indexes]
        sentence_indexes = sentence_indexes[cluster_indexes]
        # Farthest-point sampling: start from a random record, then keep
        # adding the record with the smallest summed similarity to the
        # records chosen so far, i.e. the most dissimilar remaining one.
        indexes = np.array([np.random.randint(len(cosine_similarities))])
        for i in range(min(len(cluster_indexes) - 1, sample_size - 1)):
            distance_array = np.sum(cosine_similarities[indexes], axis=0)
            distance_array[indexes] = np.inf
            argmin = np.argwhere(
                distance_array == np.min(distance_array)).flatten()
            next_index = np.random.choice(argmin, 1)
            indexes = np.append(indexes, next_index)
        heterogenous_records = []
        for index in indexes:
            heterogenous_records.append(records[index])
        records = heterogenous_records

    sentences = []
    for rec in records:
        left_context = clean_text(
            getattr(rec, InputDocument.__name__).left_context)
        text = clean_text(getattr(rec, InputDocument.__name__).content)
        right_context = clean_text(
            getattr(rec, InputDocument.__name__).right_context)
        sentences.append({
            'result_id': getattr(rec, Result.__name__).id,
            'event_id': getattr(rec, Result.__name__).sentence_id,
            'evaluation': getattr(rec, Result.__name__).evaluation,
            'left_context': left_context,
            'text': text,
            'right_context': right_context
        })
    sentences = sorted(sentences, key=lambda x: x['text'])
    return jsonify(html=render_template('get_sample.html',
                                        sentences=sentences),
                   filteredSize=filtered_size)
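# A minimal, self-contained sketch of the farthest-point selection performed
# by the 'heterogenous' branch above (the function name and `rng` parameter
# are illustrative, not part of the app): starting from one random row,
# repeatedly pick the row whose summed similarity to the already-chosen set
# is smallest, i.e. the most dissimilar remaining record.
def farthest_point_sample(similarities, sample_size, rng=None):
    """Return indexes of up to `sample_size` mutually dissimilar rows of a
    square pairwise-similarity matrix (floats)."""
    rng = rng or np.random.default_rng()
    n = len(similarities)
    chosen = [int(rng.integers(n))]
    while len(chosen) < min(n, sample_size):
        total = similarities[chosen].sum(axis=0)
        total[chosen] = np.inf  # never re-pick an already chosen row
        candidates = np.flatnonzero(total == total.min())
        chosen.append(int(rng.choice(candidates)))
    return chosen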
def run(self):
    try:
        # Log the experiment.
        start_time = time.time()
        self.experiment = Experiment(input_type=self.input_type_id,
                                     algorithm=self.algorithm_id,
                                     processing=self.processing_method_id,
                                     start_time=func.current_timestamp(),
                                     status='running')
        db_session.add(self.experiment)
        db_session.commit()

        # START PREPROCESSING
        if self.input_type_name == 'Extractor':
            self.experiment.regex_name = self.regex_name_id
            self.experiment.regex_pattern = self.regex_pattern_id
            db_session.commit()
            arrays = self.similar_experiment_arrays(
                filter_args=[(Experiment.regex_name, self.regex_name_id),
                             (Experiment.regex_pattern,
                              self.regex_pattern_id)])
            cosine_similarities = arrays['cosine_similarities']
            sentence_indexes = arrays['sentence_indexes']
            features = arrays['features']
            self.experiment.lines = len(sentence_indexes)

        if self.input_type_name == 'Cluster':
            parent = db_session.query(Experiment).filter(
                Experiment.id == self.parent_id).one()
            parent.child_id = self.experiment.id
            self.experiment.parent_id = self.parent_id
            self.experiment.parent_label = self.parent_label
            self.experiment.regex_name = parent.regex_name
            self.experiment.regex_pattern = parent.regex_pattern
            cached_arrays_id = parent.cached_arrays_id
            self.experiment.cached_arrays_id = cached_arrays_id
            # h5py group keys are strings, so cast the id as is done elsewhere.
            cached_arrays = CACHED_ARRAYS[str(cached_arrays_id)]
            arrays = self.h5_data_to_numpy(cached_arrays)
            cosine_similarities = arrays['cosine_similarities']
            sentence_indexes = arrays['sentence_indexes']
            features = arrays['features']
            # Restrict the parent's arrays to the rows of the chosen cluster.
            Result = get_result_object(parent.id)
            rows = db_session.query(Result).filter(
                Result.experiment_id == self.parent_id,
                Result.label == self.parent_label)
            cluster_indexes = []
            sentence_indexes_list = sentence_indexes.tolist()
            for row in rows:
                cluster_indexes.append(
                    sentence_indexes_list.index(row.sentence_id))
            cosine_similarities = cosine_similarities[
                cluster_indexes][:, cluster_indexes]
            sentence_indexes = sentence_indexes[cluster_indexes]
            if self.algorithm_name == 'DecisionTree':
                features = features[cluster_indexes, :]
            self.experiment.lines = len(sentence_indexes)

        if self.input_type_name in ('Raw Data', 'Sports Data'):
            print(self.input_type_name)
            self.experiment.lines = 602
            db_session.commit()
            arrays = self.similar_experiment_arrays(
                filter_args=[(Experiment.lines, self.experiment.lines)])
            cosine_similarities = arrays['cosine_similarities']
            sentence_indexes = arrays['sentence_indexes']
            features = arrays['features']

        db_session.commit()
        # END PREPROCESSING

        # Log preprocessing duration.
        self.experiment.preprocessing_seconds = time.time() - start_time
        db_session.commit()

        # START CLUSTERING
        print('starting clustering')
        clustering_start = time.time()
        self.CLUSTERS = min(20, len(sentence_indexes))
        model = self.get_model()
        if self.algorithm_name != 'DecisionTree':
            model.fit(cosine_similarities)
        else:
            model.fit(cosine_similarities, features)
        self.save_labels(sentence_indexes, model.labels_)
        # END CLUSTERING

        # Log clustering duration.
        self.experiment.clustering_seconds = time.time() - clustering_start
        self.experiment.status = 'finished'
        db_session.commit()
        logger.debug('finished')
    except Exception as e:
        self.experiment.status = 'error'
        db_session.add(
            Traceback(experiment_id=self.experiment.id, message=str(e)))
        logger.exception(str(e))
        db_session.commit()
        CACHED_ARRAYS.close()
        raise e
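# Usage sketch (an assumption: judging by the run() signature and the
# status/traceback bookkeeping, this class appears to be a background worker,
# e.g. a threading.Thread subclass; `ExperimentWorker` and its arguments are
# hypothetical names):
#     worker = ExperimentWorker(input_type_id=..., algorithm_id=...,
#                               processing_method_id=...)
#     worker.start()  # executes run() in the background, logging to the DB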