Exemplo n.º 1
0
def get_syntactically_similar_pairs(request):
    """Return class adjacency, class frequencies and class labels.

    The ``extras`` POST field is decoded as JSON and its ``granularity``
    entry is forwarded to ``get_syllable_labels``. Syllables are grouped by
    label, and the adjacency between the resulting classes is computed with
    ``calc_class_ajacency`` (``count_style='forward'``,
    ``self_count='append'``).

    Args:
        request: object exposing ``POST`` (a mapping) and ``user``.

    Returns:
        A 3-tuple of plain lists:
        ``(adjacency_mat.tolist(), frequencies.tolist(), cls_labels.tolist())``
        where ``frequencies[i]`` is the number of syllables carrying
        ``cls_labels[i]``.

    Raises:
        KeyError: if the decoded ``extras`` has no ``granularity`` entry
            (including when ``extras`` is absent from the request).
        CustomAssertionError: if the user's permission on the database is
            below ``DatabasePermission.ANNOTATE``.
    """
    # The fallback has to be a JSON *string*; json.loads() raises TypeError
    # when handed a dict.
    extra_args = json.loads(request.POST.get('extras', '{}'))
    granularity = extra_args['granularity']
    user = request.user
    database = get_user_databases(user)
    permission = database.get_assigned_permission(user)
    if permission < DatabasePermission.ANNOTATE:
        raise CustomAssertionError(
            'You don\'t have permission to annotate this database')

    sids, _ = get_sids_tids(database)

    label_arr = get_syllable_labels(user,
                                    granularity,
                                    sids,
                                    on_no_label='set_blank')
    # One pass yields the distinct labels, each syllable's class index and
    # the per-class syllable counts.
    cls_labels, syl_label_enum_arr, frequencies = np.unique(
        label_arr, return_inverse=True, return_counts=True)

    enum2label = dict(enumerate(cls_labels))
    sid2enumlabel = dict(zip(sids, syl_label_enum_arr))

    adjacency_mat, _ = calc_class_ajacency(database,
                                           syl_label_enum_arr,
                                           enum2label,
                                           sid2enumlabel,
                                           count_style='forward',
                                           self_count='append')

    return adjacency_mat.tolist(), frequencies.tolist(), cls_labels.tolist()
Exemplo n.º 2
0
    def prepare_data_for_analysis(self, pkl_filename, options):
        """Build an average-linkage tree from syllable features and pickle it.

        The feature matrix is read through the object returned by
        ``get_dm(dmid)``, passed through ``drop_useless_columns`` and
        ``zscore``, and every non-finite entry is set to 0.

        When ``annotator_name`` is given, syllables are grouped by that
        annotator's labels and the tree is built on class-to-class distances
        from ``calc_class_dist_by_syl_features``. Otherwise every syllable is
        its own class and pairwise Euclidean distances are used.

        Args:
            pkl_filename: path of the pickle file to write.
            options: mapping with the keys ``label_level``, ``cdm``
                (``'mean'`` or ``'median'``), ``dmid`` and
                ``annotator_name`` (may be ``None``).

        Returns:
            The dict that was pickled, with keys ``tree``, ``dbid``,
            ``sids``, ``unique_labels`` and ``classes_info``.
        """
        label_level = options['label_level']
        cdm = options['cdm']
        dmid = options['dmid']
        annotator_name = options['annotator_name']

        methods = dict(mean=np.mean, median=np.median)
        method = get_or_error(
            methods, cdm,
            'Unknown value {} for --class-distance-method.'.format(cdm))
        dm = get_dm(dmid)
        sids_path = dm.get_sids_path()
        source_bytes_path = dm.get_bytes_path()

        sids = bytes_to_ndarray(sids_path, np.int32)
        coordinates = get_rawdata_from_binary(source_bytes_path, len(sids))
        coordinates = drop_useless_columns(coordinates)
        coordinates = zscore(coordinates)
        # Zero out both NaN and +/-inf in a single mask.
        coordinates[~np.isfinite(coordinates)] = 0

        if annotator_name is not None:
            annotator = get_or_error(User,
                                     dict(username__iexact=annotator_name))
            # NOTE(review): get_syllable_labels is treated as returning one
            # label per syllable, as its other callers in this file do; the
            # distinct classes and per-syllable class indices are derived
            # from it here.
            syl_labels = get_syllable_labels(annotator, label_level, sids)
            label_arr, syl_label_enum_arr = np.unique(syl_labels,
                                                      return_inverse=True)
            nlabels = len(label_arr)
            distmat, classes_info = calc_class_dist_by_syl_features(
                syl_label_enum_arr, nlabels, coordinates, method)
            dist_triu = mat2triu(distmat)
        else:
            dist_triu = distance.pdist(coordinates, 'euclidean')
            # Every syllable is its own class, labelled by its position.
            nsyls = len(sids)
            label_arr = [str(sind) for sind in range(nsyls)]
            classes_info = [[sind] for sind in range(nsyls)]

        tree = linkage(dist_triu, method='average')

        saved_dict = dict(tree=tree,
                          dbid=dm.database.id,
                          sids=sids,
                          unique_labels=label_arr,
                          classes_info=classes_info)

        with open(pkl_filename, 'wb') as f:
            pickle.dump(saved_dict, f)

        return saved_dict
    def get_class_measures_info(self, options):
        """Aggregate ``self.coordinates`` per label class using the mean.

        The annotator is looked up by case-insensitive username, the labels
        for ``self.sids`` are fetched at the requested level, and the
        coordinates are aggregated per distinct label with
        ``aggregate_class_features(..., method=np.mean)``.

        Args:
            options: mapping with the keys ``annotator_name`` and
                ``label_level``.

        Returns:
            A 5-tuple ``(class_measures, classes_info, nlabels, cls_labels,
            syl_labels)``: the two values returned by
            ``aggregate_class_features``, the number of distinct labels, the
            distinct labels themselves, and the labels as returned by
            ``get_syllable_labels``.
        """
        username = options['annotator_name']
        level = options['label_level']

        user_filter = dict(username__iexact=username)
        labeller = get_or_error(User, user_filter)
        per_syllable_labels = get_syllable_labels(labeller, level, self.sids)

        # Distinct labels plus, for every syllable, the index of its label.
        distinct_labels, label_indices = np.unique(per_syllable_labels,
                                                   return_inverse=True)
        class_count = len(distinct_labels)

        measures, members = aggregate_class_features(label_indices,
                                                     class_count,
                                                     self.coordinates,
                                                     method=np.mean)
        return (measures, members, class_count, distinct_labels,
                per_syllable_labels)
    def prepare_data_for_analysis(self, pkl_filename, options):
        """Build an average-linkage tree from class adjacency and pickle it.

        Syllables of the database are grouped by the annotator's labels, the
        adjacency between the resulting classes is computed with
        ``calc_class_ajacency`` (``count_style='symmetric'``,
        ``count_circular=False``), converted to distances by
        ``calc_class_dist_by_adjacency`` and fed to ``linkage``.

        Args:
            pkl_filename: path of the pickle file to write.
            options: mapping with the keys ``label_level``, ``dbid`` and
                ``annotator_name``.

        Returns:
            The dict that was pickled, with keys ``tree``, ``dbid``,
            ``sids``, ``unique_labels`` and ``classes_info``.
        """
        label_level = options['label_level']
        dbid = options['dbid']
        annotator_name = options['annotator_name']

        database = get_database(dbid)
        sids, _ = get_sids_tids(database)
        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        label_arr = get_syllable_labels(annotator, label_level, sids)
        # Distinct labels plus, for every syllable, the index of its label.
        cls_labels, syl_label_enum_arr = np.unique(label_arr,
                                                   return_inverse=True)

        enum2label = dict(enumerate(cls_labels))
        sid2enumlabel = dict(zip(sids, syl_label_enum_arr))

        adjacency_mat, classes_info = calc_class_ajacency(
            database,
            syl_label_enum_arr,
            enum2label,
            sid2enumlabel,
            count_style='symmetric',
            count_circular=False)

        dist_triu = calc_class_dist_by_adjacency(adjacency_mat,
                                                 syl_label_enum_arr,
                                                 return_triu=True)
        tree = linkage(dist_triu, method='average')

        # Store the distinct class labels (not the per-syllable array) so
        # the entry matches its key name.
        saved_dict = dict(tree=tree,
                          dbid=database.id,
                          sids=sids,
                          unique_labels=cls_labels,
                          classes_info=classes_info)

        with open(pkl_filename, 'wb') as f:
            pickle.dump(saved_dict, f)

        return saved_dict