Exemplo n.º 1
0
    def sentences_work_extracted_number(self):
        """Collect de-duplicated context sentences for this object's regex.

        Resolves the regex name and pattern from ``self.regex_name_id`` and
        ``self.regex_pattern_id``, loads the matching ``WorkExtractedNumber``
        rows (a pattern of ``'*'`` selects every pattern stored under that
        regex name), and joins each row's left and right context into one
        normalised sentence (tabs and newlines removed, stripped, lowercased).

        Returns:
            A ``(sentences, sentence_indexes)`` pair of parallel lists: the
            unique normalised sentences in first-seen order, and the ``id``
            of the first row that produced each sentence.
        """
        print('sentences_work_extracted_number')
        regex_name = get_model_attr_by_id(RegexName, 'name',
                                          self.regex_name_id)
        regex_pattern = get_model_attr_by_id(RegexPattern, 'pattern',
                                             self.regex_pattern_id)

        query = db_session.query(WorkExtractedNumber).filter(
            WorkExtractedNumber.regex_name == regex_name)
        if regex_pattern != '*':
            # Narrow to one concrete pattern; '*' means "any pattern".
            query = query.filter(
                WorkExtractedNumber.regex_pattern == regex_pattern)
        rows = query.all()

        # Remove duplicates. The set gives constant-time membership checks
        # while the list keeps the first-seen order for the caller.
        seen = set()
        sentences_no_dups = []
        sentence_indexes = []
        for row in rows:
            sentence = '%s %s' % (row.left_context, row.right_context)
            sentence = sentence.replace('\t', '').replace('\n', '')
            sentence = sentence.strip().lower()
            if sentence in seen:
                continue
            seen.add(sentence)
            sentences_no_dups.append(sentence)
            sentence_indexes.append(row.id)

        return sentences_no_dups, sentence_indexes
Exemplo n.º 2
0
def get_root_experiment_input_type(experiment_id):
    """Return the input type name of an experiment's root ancestor.

    Follows ``parent_id`` links starting at ``experiment_id`` until an
    experiment without a parent is reached, then resolves that experiment's
    ``input_type`` id to the corresponding ``InputType`` name.

    Args:
        experiment_id: id of the experiment to start from.

    Returns:
        The ``name`` of the root experiment's input type.

    Raises:
        Whatever ``Query.one()`` raises when an id in the chain does not
        match exactly one ``Experiment`` row.
    """
    experiment = db_session.query(Experiment).filter(
        Experiment.id == experiment_id).one()
    # Climb the ancestry; each hop loads the parent row exactly once.
    while experiment.parent_id:
        experiment = db_session.query(Experiment).filter(
            Experiment.id == experiment.parent_id).one()
    return get_model_attr_by_id(InputType, 'name', experiment.input_type)
Exemplo n.º 3
0
    def similar_experiment_arrays(self, filter_args):
        """Return numpy arrays for this experiment, reusing a cache if possible.

        Searches for another experiment with the same input type and
        processing method that already has cached arrays and that also
        satisfies every extra equality filter in ``filter_args``. If one is
        found, its cached arrays are reused and its cache id is recorded on
        ``self.experiment``; otherwise new arrays are built from the indexed
        sentences.

        Args:
            filter_args: iterable of sequences whose first two items are a
                column and the value it must equal.

        Returns:
            The result of ``self.h5_data_to_numpy`` for the chosen arrays.
        """
        criteria = [
            Experiment.id != self.experiment.id,
            Experiment.input_type == self.input_type_id,
            Experiment.processing == self.processing_method_id,
            # Column expression, not a Python identity test: keep ``!= None``.
            Experiment.cached_arrays_id != None,  # noqa: E711
        ]
        criteria.extend(pair[0] == pair[1] for pair in filter_args)

        match = db_session.query(Experiment).filter(*criteria).first()

        if not match:
            print('no such experiment, making new arrays')
            sentences, sentence_indexes = self.get_indexed_sentences()
            fresh_arrays = self.construct_cached_arrays(
                sentences, sentence_indexes)
            return self.h5_data_to_numpy(fresh_arrays)

        print('found similar experiment, getting arrays from db')
        cache_id = match.cached_arrays_id
        reused_arrays = CACHED_ARRAYS[str(cache_id)]
        self.experiment.cached_arrays_id = cache_id
        return self.h5_data_to_numpy(reused_arrays)
Exemplo n.º 4
0
 def sentences_extracted_number(self, model_class):
     """Load up to ``self.experiment.lines`` rows and build their sentences.

     Each sentence is the row's left and right context joined by a single
     space.

     Returns:
         A ``(sentences, sentence_indexes)`` pair of parallel lists holding
         the sentence text and the ``id`` of the row it came from.
     """
     print('sentences_raw_extracted_number')
     limited_rows = db_session.query(model_class).limit(
         self.experiment.lines)
     texts, row_ids = [], []
     for record in limited_rows:
         joined = "{} {}".format(record.left_context, record.right_context)
         texts.append(joined)
         row_ids.append(record.id)
     return texts, row_ids
Exemplo n.º 5
0
def get_statistics():
    """Render ``get_statistics.html`` with experiment, cluster and total stats.

    Reads ``experiment_id`` and ``label`` from the request query string.
    Cluster stats are only computed when a specific label is requested, and
    total stats only when the experiment has at least one ancestor (they are
    computed over the results of the top-most ancestor). Every call to
    ``construct_stats`` receives the requested ``experiment_id`` as its
    second argument, including the one for the root experiment's results.
    """
    experiment_id = request.args.get('experiment_id', 0, type=int)
    label = request.args.get('label', ALL_CLUSTERS_LABEL, type=int)

    Result = get_result_object(experiment_id)

    def stats_for(*criteria):
        # Stats over the Result rows matching ``criteria``.
        matching = db_session.query(Result).filter(*criteria)
        return construct_stats(matching, experiment_id)

    def parent_of(exp_id):
        # parent_id of the single experiment with this id.
        return db_session.query(Experiment).filter(
            Experiment.id == exp_id).one().parent_id

    # Experiment stats
    experiment_stats = stats_for(Result.experiment_id == experiment_id)

    # Cluster stats
    cluster_stats = None
    if label != ALL_CLUSTERS_LABEL:
        cluster_stats = stats_for(Result.experiment_id == experiment_id,
                                  Result.label == label)

    # Total stats: climb to the top-most ancestor, if there is one.
    root_experiment_id = None
    ancestor_id = parent_of(experiment_id)
    while ancestor_id:
        root_experiment_id = ancestor_id
        ancestor_id = parent_of(ancestor_id)

    total_stats = None
    if root_experiment_id:
        total_stats = stats_for(Result.experiment_id == root_experiment_id)

    return render_template('get_statistics.html',
                           total_stats=total_stats,
                           experiment_stats=experiment_stats,
                           cluster_stats=cluster_stats)
Exemplo n.º 6
0
def get_latest_experiments():
    """Return the most recent experiments as a list of plain dictionaries.

    Joins each experiment with its input type, algorithm and processing
    rows (inner joins) and with its regex pattern, regex name and traceback
    rows (outer joins, so any of these may be missing), ordered by start
    time descending and limited to ``LATEST_EXPERIMENTS_COUNT`` rows.

    Returns:
        A list of dicts, one per joined row, holding the experiment's ids,
        the resolved names, status/timing fields and the traceback message
        (``''`` when there is no traceback or regex row).
    """
    records = db_session.query(Experiment, InputType, Algorithm, RegexName, RegexPattern, Processing, Traceback)\
            .join(InputType)\
            .join(Algorithm)\
            .join(RegexPattern, isouter=True)\
            .join(RegexName, isouter=True)\
            .join(Processing)\
            .join(Traceback, isouter=True)\
            .order_by(Experiment.start_time.desc())\
            .limit(LATEST_EXPERIMENTS_COUNT)

    result = []
    for rec in records:

        traceback_message = rec.Traceback.message if rec.Traceback else ''

        if rec.RegexName:
            regex_name = rec.RegexName.name
            # RegexPattern comes from its own outer join, so it can be
            # missing even when RegexName is present; avoid dereferencing
            # None in that case.
            regex_pattern = (rec.RegexPattern.pattern
                             if rec.RegexPattern else '')
        else:
            regex_name = ''
            regex_pattern = ''

        attributes = {
            'id': rec.Experiment.id,
            'input_type_id': rec.Experiment.input_type,
            'input_type': rec.InputType.name,
            'algorithm_id': rec.Experiment.algorithm,
            'algorithm': rec.Algorithm.name,
            'regex_name_id': rec.Experiment.regex_name,
            'regex_name': regex_name,
            'regex_pattern_id': rec.Experiment.regex_pattern,
            'regex_pattern': regex_pattern,
            'parent_experiment_id': rec.Experiment.parent_id,
            'parent_experiment_label': rec.Experiment.parent_label,
            'processing_id': rec.Experiment.processing,
            'processing': rec.Processing.name,
            'status': rec.Experiment.status,
            'start_time': rec.Experiment.start_time,
            'lines': rec.Experiment.lines,
            'clusters_count': rec.Experiment.clusters_count,
            'traceback': traceback_message,
        }
        result.append(attributes)

    return result
Exemplo n.º 7
0
def clusters_sizes():
    """Render ``cluster_buttons.html`` with the result count of each label.

    Reads ``experiment_id`` from the request query string and tallies how
    many result rows of that experiment carry each label.
    """
    experiment_id = request.args.get('experiment_id', 0, type=int)
    Result = get_result_object(experiment_id)

    experiment_results = db_session.query(Result).filter(
        Result.experiment_id == experiment_id)

    # label -> number of result rows with that label
    label_counts = defaultdict(int)
    for result_row in experiment_results:
        label_counts[result_row.label] += 1

    template_context = {
        'label_counts': label_counts,
        'experiment_id': experiment_id,
        'all_clusters_label': ALL_CLUSTERS_LABEL,
    }
    return render_template('cluster_buttons.html', **template_context)
Exemplo n.º 8
0
def set_evaluation():
    """Store an evaluation on a result and mirror it along the experiment chain.

    Reads ``result_id``, ``evaluation`` and ``experiment_id`` from the
    submitted form. An empty evaluation is stored as ``None``. The same
    value is written to the result with the same sentence id in every
    ancestor experiment (following ``parent_id``) and in every descendant
    experiment (following ``child_id``); the downward walk stops at the
    first descendant that has no result for the sentence. All changes are
    committed together.

    Returns:
        A JSON response ``{"success": 1}``.
    """
    result_id = request.form['result_id']
    evaluation = request.form['evaluation'] or None
    experiment_id = request.form['experiment_id']

    Result = get_result_object(experiment_id)

    def load_experiment(exp_id):
        # The single Experiment row with this id.
        return db_session.query(Experiment).filter(
            Experiment.id == exp_id).one()

    def sentence_result_query(exp_id, sent_id):
        # Results of one experiment for one sentence.
        return db_session.query(Result).filter(
            Result.experiment_id == exp_id,
            Result.sentence_id == sent_id)

    result_row = db_session.query(Result).filter(Result.id == result_id).one()
    result_row.evaluation = evaluation
    sentence_id = result_row.sentence_id

    # Bubble up the evaluation to all parent experiments.
    ancestor_id = load_experiment(result_row.experiment_id).parent_id
    while ancestor_id:
        ancestor_row = sentence_result_query(ancestor_id, sentence_id).one()
        ancestor_row.evaluation = evaluation
        ancestor_id = load_experiment(ancestor_row.experiment_id).parent_id

    # Float down the evaluation to all child experiments.
    descendant_id = load_experiment(result_row.experiment_id).child_id
    while descendant_id:
        descendant_row = sentence_result_query(
            descendant_id, sentence_id).one_or_none()
        if not descendant_row:
            break
        descendant_row.evaluation = evaluation
        descendant_id = load_experiment(
            descendant_row.experiment_id).child_id

    db_session.commit()

    return jsonify(success=1)
Exemplo n.º 9
0
    def save_labels(self, sentence_indexes, labels):
        """Persist one result row per sentence with its cluster label.

        When the experiment has a parent, each new row inherits the
        evaluation stored on the parent's result for the same sentence id
        (``None`` when the parent has no such result). Also records the
        number of distinct labels on the experiment and commits.

        Args:
            sentence_indexes: sentence ids, parallel to ``labels``.
            labels: cluster label of each sentence.
        """
        print('saving labels')
        Result = get_result_object(self.experiment.id)

        # sentence_id -> evaluation taken from the parent experiment
        inherited = {}
        if self.experiment.parent_id:
            parent_rows = db_session.query(Result).filter(
                Result.experiment_id == self.experiment.parent_id).all()
            inherited = {
                parent_row.sentence_id: parent_row.evaluation
                for parent_row in parent_rows
            }

        for raw_index, raw_label in zip(sentence_indexes, labels):
            sentence_id = int(raw_index)
            new_row = Result(experiment_id=self.experiment.id,
                             label=int(raw_label),
                             sentence_id=sentence_id,
                             evaluation=inherited.get(sentence_id))
            db_session.add(new_row)

        self.experiment.clusters_count = len(set(labels))
        db_session.commit()
Exemplo n.º 10
0
def get_model_attr_by_id(model, attribute, index):
    """Return ``attribute`` of the single ``model`` row whose id is ``index``.

    Uses ``Query.one()``, so it raises unless exactly one row matches.
    """
    matching_rows = db_session.query(model).filter(model.id == index)
    return getattr(matching_rows.one(), attribute)
Exemplo n.º 11
0
def get_sample():
    """Return a JSON sample of an experiment's sentences.

    Query-string parameters (read from ``request.args``):
        type: ``'random'`` or ``'heterogenous'`` sampling; any other value
            leaves the (filtered) records unsampled.
        experiment_id, label: which results to sample; a label equal to
            ``ALL_CLUSTERS_LABEL`` selects every cluster.
        sample_size: number of records requested.
        sample_all: when truthy, random sampling returns every record
            (not consulted by heterogenous sampling).
        sample-observed: when truthy, only results that already have an
            evaluation are used; otherwise only unevaluated ones.
        filtering-type: ``'and'`` requires every supplied filter to match;
            any other value is treated as ``'or'``.
        left_filter, content_filter, right_filter: regular expressions
            searched in the lowercased left context, content and right
            context of each document.

    Returns:
        A JSON response with ``html`` (the rendered ``get_sample.html``,
        sentences sorted by text) and ``filteredSize`` (number of records
        left after regex filtering, or 0 when no filter was supplied).
    """
    sample_type = request.args.get('type', type=str)
    experiment_id = request.args.get('experiment_id', 0, type=int)
    label = request.args.get('label', 0, type=int)
    sample_size = request.args.get('sample_size', 0, type=int)
    sample_all = request.args.get('sample_all', 0, type=int)
    sample_observed = request.args.get('sample-observed', 0, type=int)

    filtering_type = request.args.get('filtering-type', type=str)
    left_filter = request.args.get('left_filter', type=str)
    content_filter = request.args.get('content_filter', type=str)
    right_filter = request.args.get('right_filter', type=str)

    InputDocument, Result = get_input_models(experiment_id)
    # Each joined row exposes its two entities under their class names.
    document_attr = InputDocument.__name__
    result_attr = Result.__name__

    query = db_session.query(Result, InputDocument).join(InputDocument)

    if label == ALL_CLUSTERS_LABEL:
        query = query.filter(Result.experiment_id == experiment_id)
    else:
        query = query.filter(Result.experiment_id == experiment_id,
                             Result.label == label)

    # ``== True`` / ``== None`` build SQL expressions; they are not Python
    # comparisons and must not be rewritten with ``is``.
    if sample_observed:
        query = query.filter((Result.evaluation == True)
                             | (Result.evaluation == False))
    else:
        query = query.filter(Result.evaluation == None)

    records = query.all()
    if left_filter or content_filter or right_filter:
        # (pattern, document field) for every filter the caller supplied.
        active_filters = [
            (pattern, field)
            for pattern, field in ((left_filter, 'left_context'),
                                   (content_filter, 'content'),
                                   (right_filter, 'right_context'))
            if pattern
        ]
        # 'and' needs every active filter to hit. Any other value falls
        # back to 'or' instead of failing on an unset variable.
        combine = all if filtering_type == 'and' else any

        filtered_records = []
        for rec in records:
            document = getattr(rec, document_attr)
            # Built as a list so every pattern is evaluated for every record.
            hits = [
                re.search(pattern, getattr(document, field).lower())
                for pattern, field in active_filters
            ]
            if combine(hits):
                filtered_records.append(rec)
        records = filtered_records
        filtered_size = len(records)
    else:
        filtered_size = 0

    if sample_type == 'random':
        random.shuffle(records)
        if not sample_all:
            records = records[:sample_size]

    # Skipped for an empty record list: there is nothing to seed the
    # selection with and np.random.randint(0) would raise.
    if sample_type == 'heterogenous' and records:
        cached_arrays_id = db_session.query(Experiment).filter(
            Experiment.id == experiment_id).one().cached_arrays_id
        with h5py.File("cached_arrays.hdf5") as f:
            cached_arrays = f[cached_arrays_id]
            # Copy into memory before the file is closed.
            cosine_similarities = np.array(
                cached_arrays['cosine_similarities'])
            sentence_indexes = np.array(cached_arrays['sentence_indexes'])

        # Position of each record's sentence inside the cached arrays.
        cluster_indexes = [
            np.where(sentence_indexes == getattr(
                rec, result_attr).sentence_id)[0][0]
            for rec in records
        ]

        # Restrict the similarity matrix to the sampled records, so that
        # row/column i corresponds to records[i].
        cosine_similarities = cosine_similarities[
            cluster_indexes][:, cluster_indexes]

        # Greedy selection: start from a random record, then repeatedly add
        # the record whose summed similarity to the already chosen ones is
        # smallest (ties broken at random). Note that the seed record is
        # always included, even when sample_size is 0.
        indexes = np.array([np.random.randint(len(cosine_similarities))])
        for _ in range(min(len(cluster_indexes) - 1, sample_size - 1)):
            distance_array = np.sum(cosine_similarities[indexes], axis=0)
            # Exclude records that were already picked.
            distance_array[indexes] = np.inf
            candidates = np.argwhere(
                distance_array == np.min(distance_array)).flatten()
            chosen = np.random.choice(candidates, 1)
            indexes = np.append(indexes, chosen)

        records = [records[index] for index in indexes]

    sentences = []
    for rec in records:
        document = getattr(rec, document_attr)
        result_row = getattr(rec, result_attr)

        left_context = clean_text(document.left_context)
        text = clean_text(document.content)
        right_context = clean_text(document.right_context)

        sentences.append({
            'result_id': result_row.id,
            'event_id': result_row.sentence_id,
            'evaluation': result_row.evaluation,
            'left_context': left_context,
            'text': text,
            'right_context': right_context
        })

    sentences = sorted(sentences, key=lambda x: x['text'])

    return jsonify(html=render_template('get_sample.html',
                                        sentences=sentences),
                   filteredSize=filtered_size)
Exemplo n.º 12
0
    def run(self):
        """Run one clustering experiment end to end and record it in the DB.

        Steps, in order:
          1. Create and commit an ``Experiment`` row with status 'running'.
          2. Preprocess according to ``self.input_type_name`` to obtain
             ``cosine_similarities``, ``sentence_indexes`` and ``features``.
          3. Fit the model returned by ``self.get_model()`` and store the
             labels through ``self.save_labels``.
          4. Record the preprocessing and clustering durations and set the
             status to 'finished'.

        On any exception the experiment is marked 'error', a ``Traceback``
        row with the exception message is stored, ``CACHED_ARRAYS`` is
        closed and the exception is re-raised.
        """
        try:
            # Create the experiment record up front so that progress and
            # failures can be attached to it.
            start_time = time.time()
            self.experiment = Experiment(input_type=self.input_type_id,
                                         algorithm=self.algorithm_id,
                                         processing=self.processing_method_id,
                                         start_time=func.current_timestamp(),
                                         status='running')

            db_session.add(self.experiment)
            db_session.commit()

            # START PREPROCESSING

            if self.input_type_name == 'Extractor':

                # Persist the regex ids before looking for a similar
                # experiment whose cached arrays can be reused.
                self.experiment.regex_name = self.regex_name_id
                self.experiment.regex_pattern = self.regex_pattern_id
                db_session.commit()

                arrays = self.similar_experiment_arrays(
                    filter_args=[(Experiment.regex_name, self.regex_name_id),
                                 (Experiment.regex_pattern,
                                  self.regex_pattern_id)])
                cosine_similarities = arrays['cosine_similarities']
                sentence_indexes = arrays['sentence_indexes']
                features = arrays['features']

                self.experiment.lines = len(sentence_indexes)

            if self.input_type_name == 'Cluster':

                # Re-cluster one cluster of a parent experiment: link both
                # experiments and inherit the parent's regex and cache ids.
                parent = db_session.query(Experiment).filter(
                    Experiment.id == self.parent_id).one()
                parent.child_id = self.experiment.id

                self.experiment.parent_id = self.parent_id
                self.experiment.parent_label = self.parent_label
                self.experiment.regex_name = parent.regex_name
                self.experiment.regex_pattern = parent.regex_pattern

                cached_arrays_id = parent.cached_arrays_id
                self.experiment.cached_arrays_id = cached_arrays_id

                # cached_arrays = db_session.query(CachedArrays).filter(CachedArrays.id==cached_arrays_id).one()
                # NOTE(review): the id is used as the lookup key as-is here;
                # confirm the key type CACHED_ARRAYS expects.
                cached_arrays = CACHED_ARRAYS[cached_arrays_id]

                arrays = self.h5_data_to_numpy(cached_arrays)
                cosine_similarities = arrays['cosine_similarities']
                sentence_indexes = arrays['sentence_indexes']
                features = arrays['features']

                # Parent results that belong to the cluster being re-clustered.
                Result = get_result_object(parent.id)
                rows = db_session.query(Result).filter(
                    Result.experiment_id == self.parent_id,
                    Result.label == self.parent_label)
                cluster_indexes = []

                # Map each selected sentence id to its position in the
                # cached arrays.
                sentence_indexes_list = sentence_indexes.tolist()
                for row in rows:
                    cluster_indexes.append(
                        sentence_indexes_list.index(row.sentence_id))

                # Keep only the rows and columns of the selected sentences.
                cosine_similarities = cosine_similarities[
                    cluster_indexes][:, cluster_indexes]
                sentence_indexes = sentence_indexes[cluster_indexes]
                # Features are only passed to fit() for DecisionTree (see
                # the clustering step below), so they are only sliced then.
                if self.algorithm_name == 'DecisionTree':
                    features = features[cluster_indexes, :]

                self.experiment.lines = len(sentence_indexes)

            if self.input_type_name == 'Raw Data' or self.input_type_name == 'Sports Data':
                print(self.input_type_name)
                # NOTE(review): the line count is hard-coded to 602 —
                # confirm where this number comes from.
                self.experiment.lines = 602
                db_session.commit()

                arrays = self.similar_experiment_arrays(
                    filter_args=[(Experiment.lines, self.experiment.lines)])

                cosine_similarities = arrays['cosine_similarities']
                sentence_indexes = arrays['sentence_indexes']
                features = arrays['features']

            db_session.commit()

            # raise Exception('debugging')

            # END PREPROCESSING

            # Log preprocessing duration.
            self.experiment.preprocessing_seconds = time.time() - start_time
            db_session.commit()

            # START CLUSTERING
            print('starting clustering')
            clustering_start = time.time()

            # Cap the cluster count at 20, or at the number of sentences
            # when there are fewer than 20 of them.
            self.CLUSTERS = min(20, len(sentence_indexes))
            model = self.get_model()

            if self.algorithm_name != 'DecisionTree':
                model.fit(cosine_similarities)
            else:
                model.fit(cosine_similarities, features)

            self.save_labels(sentence_indexes, model.labels_)
            # END CLUSTERING

            # Log clustering duration.
            self.experiment.clustering_seconds = time.time() - clustering_start
            self.experiment.status = 'finished'
            db_session.commit()
            logger.debug('finished')

        except Exception as e:
            # Record the failure on the experiment, then propagate it.
            # NOTE(review): this relies on self.experiment having been
            # assigned; a failure in the Experiment(...) call above would
            # make this handler raise as well.
            self.experiment.status = 'error'
            db_session.add(
                Traceback(experiment_id=self.experiment.id, message=str(e)))
            logger.exception(str(e))
            db_session.commit()
            CACHED_ARRAYS.close()
            raise e