Example #1
    def _identify_partial_matches(self, cohort_id, last_name):
        """Search for partial matches to identify students.

        Pulls data from a special set of memcache keys, which are updated by
        cron, and provide the names of all students in the school. All the
        names are examined to see if the typed name is contained in or
        contained by the existing name ("containment matching"), which are
        considered partial matches. Then the matches are ordered by their
        similarity (Levenshtein distance) to the typed name.
        """
        stripped_last_name = util.clean_string(last_name)

        match_data, from_memcache = self.internal_api.get_roster(cohort_id)

        # White list necessary properties (no sense in releasing status codes
        # like 'Parent Refusal' to the public).
        def clean_properties(d):
            white_list = [
                'first_name', 'last_name', 'classroom_name', 'id',
                'stripped_last_name'
            ]
            return {k: v for k, v in d.items() if k in white_list}

        # Containment matching.
        matches = [
            clean_properties(u) for u in match_data
            if u['stripped_last_name'] in stripped_last_name
            or stripped_last_name in u['stripped_last_name']
        ]

        # Order by edit (Levenshtein) distance from the submitted name.
        sort_func = lambda n: util.levenshtein_distance(
            n['stripped_last_name'], stripped_last_name)
        return sorted(matches, key=sort_func)
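
All of these examples lean on a util.levenshtein_distance helper whose
implementation is not shown on this page. For reference, here is a minimal
sketch of the standard dynamic-programming formulation; the body below is
an assumption, not the project's actual code, though any correct
implementation must return the same values:

def levenshtein_distance(a, b):
    """Minimum number of single-element insertions, deletions, and
    substitutions needed to turn sequence a into sequence b."""
    # Two-row dynamic-programming table over prefixes of a and b.
    previous = list(range(len(b) + 1))
    for i, item_a in enumerate(a, start=1):
        current = [i]
        for j, item_b in enumerate(b, start=1):
            cost = 0 if item_a == item_b else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution
        previous = current
    return previous[-1]

Because it only compares elements, the same sketch works on strings and on
token lists alike, which is how Example #6 below uses it.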
Example #2
    def get_contributors_list(self):
        """
        Creates a list of dicts containing the collaborators for the repository.

        This list is aggregated based following the algorithm on Valente's 2016
        paper "A Novel Approach for Estimating Truck Factor", namely step #2 "Detect
        Developer Aliases.

        The collaborators are indexed by email and the names is a list of developer
        aliases that the user might have. The user is considered the same user
        if the leveshtein distance for the two users names is equal or less than
        one.

        Returns:
            list: A list of dictionaries containing the fields 'email' and 'dev_aliases'.
        """
        contributors_list = []

        if self.api_repository is not None:
            for collaborator in self.api_repository.get_collaborators():
                email = collaborator.email
                name = collaborator.name

                list_entry = {
                    self.EMAIL_FIELD: email,
                    self.DEV_ALIASES: [name]
                }

                fresh_user = True

                for contributor in contributors_list:
                    # Entries are plain dicts, so index by key (not attribute).
                    for alias in contributor[self.DEV_ALIASES]:
                        if levenshtein_distance(alias, name) <= 1:
                            contributor[self.DEV_ALIASES].append(name)
                            fresh_user = False
                            break
                    if not fresh_user:
                        break

                if fresh_user:
                    contributors_list.append(list_entry)

        return contributors_list
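
A quick standalone illustration of the threshold-1 merge rule this loop
implements (the names and email addresses are made up, EMAIL_FIELD and
DEV_ALIASES stand in for the class constants, and levenshtein_distance is
assumed in scope, e.g. the sketch shown after Example #1):

EMAIL_FIELD = 'email'
DEV_ALIASES = 'dev_aliases'

contributors = [{EMAIL_FIELD: 'jdoe@example.com', DEV_ALIASES: ['John Doe']}]

# 'Jon Doe' is one deletion away from 'John Doe', so it merges into the
# existing entry instead of creating a new contributor.
name = 'Jon Doe'
for contributor in contributors:
    if any(levenshtein_distance(alias, name) <= 1
           for alias in contributor[DEV_ALIASES]):
        contributor[DEV_ALIASES].append(name)
        break
else:
    contributors.append({EMAIL_FIELD: 'unknown@example.com',
                         DEV_ALIASES: [name]})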
Example #3
    def test_saturday_sunday(self):
        """compare Saturday with Sunday"""
        self.assertEqual(3, util.levenshtein_distance('Sunday', 'Saturday'))
Example #4
    def test_kitten_sitting(self):
        """compare kitten with sitting"""
        self.assertEqual(3, util.levenshtein_distance('sitting', 'kitten'))
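
Both expected values follow from counting minimal edits: turning 'kitten'
into 'sitting' takes two substitutions (k→s, e→i) plus one insertion (g),
and turning 'Sunday' into 'Saturday' takes two insertions (a, t) plus one
substitution (n→r), so each distance is 3.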
Example #5
def distance(a, b):
    return levenshtein_distance(a, b) / float(max(len(a), len(b), 1))
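
This normalizes the raw edit distance by the length of the longer input,
yielding a score in [0, 1]; the trailing 1 in max() guards against division
by zero when both inputs are empty. For example:

distance('kitten', 'sitting')   # 3 / 7 ≈ 0.4286
distance('Sunday', 'Saturday')  # 3 / 8 = 0.375
distance('', '')                # 0 / 1 = 0.0 (no ZeroDivisionError)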
Example #6
def test():
    # Getting settings from config.py
    max_len = cfg.MAX_TOKEN_LEN
    num_token = cfg.NUM_OF_TOKEN
    imw = cfg.IMW
    imh = cfg.IMH

    # Training params
    is_train = False
    batch_size = 1

    # Tracking/Saving
    num_ite_to_log = cfg.NUM_ITE_TO_LOG
    num_ite_to_vis = cfg.NUM_ITE_TO_VIS
    save_name = cfg.SAVE_NAME
    test_name = cfg.TEST_NAME
    vis_path = cfg.VIS_PATH

    use_cuda = cfg.CUDA and torch.cuda.is_available()
    save_path = cfg.MODEL_FOLDER
    dataset_path = cfg.DATASET_PATH + 'CROHME2013_data/TestINKML/'
    scale_factor = cfg.TEST_SCALE_FACTOR

    # Load the vocab dictionary for display purpose
    word_to_id, id_to_word = get_gt.build_vocab('mathsymbolclass.txt')
    start_id = word_to_id['<s>']
    stop_id = word_to_id['</s>']

    # Initialize the network and load its weights
    net = AGRU()
    save_files = glob.glob(save_path + save_name + '*.dat')
    if (len(save_files) > 0):
        save_file = sorted(save_files)[-1]
        print('Loading network weights saved at %s...' % save_file)
        loadobj = torch.load(save_file)
        net.load_state_dict(loadobj['state_dict'])
        print('Loading done.')

    if (use_cuda):
        net.cuda()

    # Put the network in evaluation mode for testing
    if (not is_train):
        net.train(False)

    # Get full paths to test inkml files, create a list of scale factors to be used for rendering test images
    inkml_list = glob.glob(dataset_path + '*.inkml')
    scale_list = [scale_factor] * len(inkml_list)
    inkml_list = np.asarray(inkml_list)
    scale_list = np.asarray(scale_list)

    #inkml_list = inkml_list[0:120]
    #scale_list = scale_list[0:120]
    num_test = len(inkml_list)
    num_ite = int(np.ceil(1.0 * num_test / batch_size))

    # Exact match and word error rate
    em = []
    wer = []
    all_pred = []
    all_gt = []
    # Main test loop
    for i in range(num_ite):
        batch_idx = range(i * batch_size, (i + 1) * batch_size)
        if (batch_idx[-1] >= num_test):
            batch_idx = range(i * batch_size, num_test)
        batch_size = len(batch_idx)
        batch_x = util.batch_data(inkml_list[batch_idx], scale_list[batch_idx],
                                  is_train)
        batch_y_np = util.batch_target(inkml_list[batch_idx])
        batch_y = util.np_to_var(batch_y_np, use_cuda)

        #pred_y, attention = net(batch_x, batch_y)
        pred_y, attention = net.beam_search(batch_x, start_id, stop_id)
        pred_y = util.var_to_np(pred_y, use_cuda)
        pred_y = np.argmax(pred_y, 2)
        batch_y = np.reshape(batch_y_np, (batch_size, max_len))

        print('Finished iteration %d/%d.' % (i, num_ite))
        j = 0  # batch_size is 1, so look at the first (only) sample

        pred_string = pred_y[j, :]
        pred_string = [id_to_word[idx] for idx in list(pred_string)]
        gt_string = batch_y[0, :]
        gt_string = [id_to_word[idx] for idx in list(gt_string)]
        all_pred.append(pred_string)
        all_gt.append(gt_string)
        em.append(util.exact_match(pred_string, gt_string))
        if ('</s>' in pred_string):
            pred_string = pred_string[0:pred_string.index('</s>') + 1]
        gt_string = gt_string[0:gt_string.index('</s>') + 1]
        wer.append(util.levenshtein_distance(pred_string, gt_string))

        # Skip console printing and attention visualization on every
        # fourth iteration
        if (i % 4 == 0):
            continue

        # Print the prediction and target strings to the console
        print('Prediction: %s' % ' '.join(pred_string))
        print('Target: %s\n' % ' '.join(gt_string))

        # Save attention to files for visualization
        # Strip the '.inkml' extension (6 characters) from the file name
        file_name = ntpath.basename(inkml_list[batch_idx[j]])[:-6]
        vis_path_j = vis_path + file_name + '/'
        if (not os.path.exists(vis_path_j)):
            os.makedirs(vis_path_j)

        tmp_x = np.sum(batch_x.data.cpu().numpy()[j, :, :, :], axis=0)
        attention_np = attention.data.cpu().numpy()[j, 1:, :, :]
        pred_string = pred_string[1:]
        for k, word in enumerate(pred_string):
            word = word.replace('/', 'slash_')
            attention_k = attention_np[k, :, :] / np.max(
                attention_np[k, :, :]) * 0.8
            attention_k = (scipy.misc.imresize(attention_k, 16.0)) / 255.0
            tmp_x = scipy.misc.imresize(tmp_x, attention_k.shape)
            attention_k += tmp_x
            attention_k[attention_k > 1] = 1
            try:
                scipy.misc.imsave(vis_path_j + ('%02d_%s.jpg' % (k, word)),
                                  attention_k)
            except FileNotFoundError:
                pdb.set_trace()
            if (word == '<slash_s>'):
                break


    print("Exact match count: %d/%d" % (sum(em), len(em)))
    print("Word error rate: %.5f" % (np.mean(wer)))
    pdb.set_trace()
    util.save_list([em, wer, all_pred, all_gt], save_path + test_name + '.dat')

    pdb.set_trace()
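
Note that wer here collects raw token-level edit distances, so the printed
"word error rate" is really a mean edit distance rather than a rate. The
conventional WER divides each distance by the reference length; a minimal
sketch of that variant (word_error_rate is a hypothetical helper, reusing
the same util.levenshtein_distance on token lists):

def word_error_rate(pred_tokens, gt_tokens):
    # Token-level edit distance normalized by the reference length,
    # the usual definition of WER in sequence recognition.
    errors = util.levenshtein_distance(pred_tokens, gt_tokens)
    return errors / float(max(len(gt_tokens), 1))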