Example #1
def update_profile(request):
    """
    Renders the profile update view.
    """
    errors = None
    languages = set()
    language_choices = sorted(LANGUAGE_CODES_AND_NAMES.items(), key=lambda x: x[1])
    focus_input = 'id_projects'

    if request.method == "POST":
        languages = set(request.POST.getlist('languages', None))
        if languages:
            try:
                # Compute set of evaluation languages for this user.
                for code, _ in language_choices:
                    language_group = Group.objects.filter(name=code)
                    if language_group.exists():
                        language_group = language_group[0]
                        if code in languages:
                            language_group.user_set.add(request.user)
                        else:
                            language_group.user_set.remove(request.user)
                        language_group.save()

                # Redirect to dashboard.
                return redirect('dashboard')

            # For any other exception, clean up and ask user to retry.
            except Exception:
                from traceback import format_exc

                print(format_exc())

                languages = set()

        # Detect which input should get focus for next page rendering.
        if not languages:
            focus_input = 'id_languages'
            errors = ['invalid_languages']

    # Determine user target languages
    for group in request.user.groups.all():
        if group.name.lower() in [x.lower() for x in LANGUAGE_CODES_AND_NAMES]:
            languages.add(group.name.lower())

    context = {
        'active_page': "OVERVIEW",
        'errors': errors,
        'focus_input': focus_input,
        'languages': languages,
        'language_choices': language_choices,
        'title': 'Update profile',
    }
    context.update(BASE_CONTEXT)

    return render(request, 'Dashboard/update-profile.html', context)
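
All of these examples revolve around LANGUAGE_CODES_AND_NAMES, which is assumed to be a plain dict mapping language codes to display names, with one Django Group per code used for membership tracking. A minimal sketch of that assumed shape; the entries below are illustrative, not the actual contents of Dashboard.models:

# Illustrative shape only; the real mapping is much larger.
LANGUAGE_CODES_AND_NAMES = {
    'eng': 'English',
    'deu': 'German',
    'zho': 'Chinese',
}

# Choices sorted by display name, as the view builds them before rendering.
language_choices = sorted(LANGUAGE_CODES_AND_NAMES.items(), key=lambda x: x[1])
# -> [('zho', 'Chinese'), ('eng', 'English'), ('deu', 'German')]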
Example #2
    def compute_accurate_group_status(cls):
        from Dashboard.models import LANGUAGE_CODES_AND_NAMES
        user_status = defaultdict(list)
        qs = cls.objects.filter(completed=True)

        value_names = ('createdBy', 'item__itemType', 'task__id')
        for result in qs.values_list(*value_names):
            if result[1].lower() != 'tgt':
                continue

            annotatorID = result[0]
            taskID = result[2]
            user_status[annotatorID].append(taskID)

        group_status = defaultdict(list)
        for annotatorID in user_status:
            user = User.objects.get(pk=annotatorID)
            usergroups = ';'.join([
                x.name for x in user.groups.all()
                if x.name not in LANGUAGE_CODES_AND_NAMES
            ])
            if not usergroups:
                usergroups = 'NoGroupInfo'

            group_status[usergroups].extend(user_status[annotatorID])

        group_hits = {}
        for group_name in group_status:
            task_ids = set(group_status[group_name])
            completed_tasks = 0
            for task_id in task_ids:
                if group_status[group_name].count(task_id) >= 70:
                    completed_tasks += 1

            group_hits[group_name] = (completed_tasks, len(task_ids))

        return group_hits
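
compute_accurate_group_status() counts, per annotator group, how many tasks have accumulated at least 70 completed 'TGT' results (the hard-coded threshold above). A rough standalone sketch of the same counting logic with the ORM access factored out; the function and parameter names here are mine, not part of the model:

from collections import defaultdict

def group_completion(results, group_of, threshold=70):
    # `results` is an iterable of (annotator_id, item_type, task_id)
    # tuples; `group_of` maps an annotator id to a group label.
    # A task counts as completed for a group once it has collected
    # `threshold` results of type 'tgt'.
    per_group = defaultdict(list)
    for annotator_id, item_type, task_id in results:
        if item_type.lower() != 'tgt':
            continue
        per_group[group_of(annotator_id)].append(task_id)

    hits = {}
    for group, task_ids in per_group.items():
        unique_tasks = set(task_ids)
        completed = sum(
            1 for task in unique_tasks if task_ids.count(task) >= threshold)
        hits[group] = (completed, len(unique_tasks))
    return hits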
Example #3
    def dump_all_results_to_csv_file(cls, csv_file):
        from Dashboard.models import LANGUAGE_CODES_AND_NAMES
        system_scores = defaultdict(list)
        user_data = {}
        qs = cls.objects.filter(completed=True)

        value_names = ('item__target1ID', 'score1', 'item__target2ID',
                       'score2', 'start_time', 'end_time', 'createdBy',
                       'item__itemID',
                       'item__metadata__market__sourceLanguageCode',
                       'item__metadata__market__targetLanguageCode',
                       'item__metadata__market__domainName', 'item__itemType',
                       'task__id', 'task__campaign__campaignName')
        for result in qs.values_list(*value_names):

            system1ID = result[0]
            score1 = result[1]
            system2ID = result[2]
            score2 = result[3]
            start_time = result[4]
            end_time = result[5]
            duration = round(float(end_time) - float(start_time), 1)
            annotatorID = result[6]
            segmentID = result[7]
            marketID = '{0}-{1}'.format(result[8], result[9])
            domainName = result[10]
            itemType = result[11]
            taskID = result[12]
            campaignName = result[13]

            if annotatorID in user_data:
                username = user_data[annotatorID][0]
                useremail = user_data[annotatorID][1]
                usergroups = user_data[annotatorID][2]

            else:
                user = User.objects.get(pk=annotatorID)
                username = user.username
                useremail = user.email
                usergroups = ';'.join([
                    x.name for x in user.groups.all()
                    if x.name not in LANGUAGE_CODES_AND_NAMES
                ])
                if not usergroups:
                    usergroups = 'NoGroupInfo'

                user_data[annotatorID] = (username, useremail, usergroups)

            system_scores[marketID + '-' + domainName].append(
                (taskID, segmentID, username, useremail, usergroups, system1ID,
                 score1, system2ID, score2, start_time, end_time, duration,
                 itemType, campaignName))

        # TODO: this is still rather opaque and needs to be fixed!
        csv_lines = [
            'taskID,segmentID,username,email,groups,system1ID,score1,system2ID,score2,startTime,endTime,durationInSeconds,itemType,campaignName'
        ]
        for market_key in system_scores:
            for row in system_scores[market_key]:
                csv_lines.append(','.join([str(value) for value in row]))

        from os.path import join
        from Appraise.settings import BASE_DIR
        media_file_path = join(BASE_DIR, 'media', csv_file)
        with open(media_file_path, 'w') as outfile:
            for line in csv_lines:
                outfile.write(line)
                outfile.write('\n')
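
Note that the manual ','.join() above produces malformed rows as soon as any field (for example a campaign name) contains a comma. A rough alternative sketch using the standard csv module for proper escaping; the function name and signature are mine:

import csv

def write_rows_to_csv(csv_path, header, rows):
    # `rows` is an iterable of tuples as collected in system_scores;
    # `header` is the list of column names from the first CSV line.
    with open(csv_path, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(header)
        writer.writerows(rows)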
Example #4
    def marketTargetLanguageCode(self):
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[1] in LANGUAGE_CODES_AND_NAMES:
            return tokens[1]
        return None
Example #5
    def marketSourceLanguage(self):
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[0] in LANGUAGE_CODES_AND_NAMES:
            return LANGUAGE_CODES_AND_NAMES[tokens[0]]
        return None
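
Both properties assume the market name follows a 'sourceCode_targetCode_domainName' pattern, for example 'eng_deu_news'. A small standalone sketch of that parsing; the helper name is hypothetical:

def parse_market(market_name, known_codes):
    # Split the market identifier and validate both language codes.
    tokens = str(market_name).split('_')
    if len(tokens) != 3:
        return None
    source, target, domain = tokens
    if source in known_codes and target in known_codes:
        return {'source': source, 'target': target, 'domain': domain}
    return None

# parse_market('eng_deu_news', {'eng', 'deu'})
# -> {'source': 'eng', 'target': 'deu', 'domain': 'news'}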
Example #6
def create_profile(request):
    """
    Renders the create profile view.
    """
    errors = None
    username = None
    email = None
    token = None
    languages = []
    language_choices = sorted(LANGUAGE_CODES_AND_NAMES.items(), key=lambda x: x[1])

    focus_input = 'id_username'

    if request.method == "POST":
        username = request.POST.get('username', None)
        email = request.POST.get('email', None)
        token = request.POST.get('token', None)
        languages = request.POST.getlist('languages', None)

        if username and email and token and languages:
            try:
                # Check if given invite token is still active.
                invite = UserInviteToken.objects.filter(token=token)
                if not invite.exists() or not invite[0].active:
                    raise ValueError('invalid_token')

                # We now have a valid invite token...
                invite = invite[0]

                # Check if desired username is already in use.
                current_user = User.objects.filter(username=username)
                if current_user.exists():
                    raise ValueError('invalid_username')

                # Compute set of evaluation languages for this user.
                eval_groups = []
                for code in languages:
                    language_group = Group.objects.filter(name=code)
                    if language_group.exists():
                        eval_groups.extend(language_group)

                # Create new user account and add to group.
                password = '{0}{1}'.format(
                    invite.group.name[:2].upper(),
                    md5(invite.group.name.encode('utf-8')).hexdigest()[:8],
                )
                user = User.objects.create_user(username, email, password)

                # Update group settings for the new user account.
                user.groups.add(invite.group)
                for eval_group in eval_groups:
                    user.groups.add(eval_group)

                user.save()

                # Disable invite token and attach to current user.
                invite.active = False
                invite.user = user
                invite.save()

                # Login user and redirect to dashboard page.
                user = authenticate(username=username, password=password)
                login(request, user)
                return redirect('dashboard')

            # For validation errors, invalidate the respective value.
            except ValueError as issue:
                if issue.args[0] == 'invalid_username':
                    username = None

                elif issue.args[0] == 'invalid_token':
                    token = None

                else:
                    username = None
                    email = None
                    token = None
                    languages = None

            # For any other exception, clean up and ask user to retry.
            except Exception:
                from traceback import format_exc

                print(format_exc())  # TODO: need logger here!
                username = None
                email = None
                token = None
                languages = None

        # Detect which input should get focus for next page rendering.
        if not username:
            focus_input = 'id_username'
            errors = ['invalid_username']

        elif not email:
            focus_input = 'id_email'
            errors = ['invalid_email']

        elif not token:
            focus_input = 'id_token'
            errors = ['invalid_token']

        elif not languages:
            focus_input = 'id_languages'
            errors = ['invalid_languages']

    context = {
        'active_page': "OVERVIEW",  # TODO: check
        'errors': errors,
        'focus_input': focus_input,
        'username': username,
        'email': email,
        'token': token,
        'languages': languages,
        'language_choices': language_choices,
        'title': 'Create profile',
    }
    context.update(BASE_CONTEXT)

    return render(request, 'Dashboard/create-profile.html', context)
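
The trailing if/elif cascade decides which input receives focus and which error code is reported after a failed submission. A compact sketch of the same rule as a helper; the name and signature are mine, not part of the view:

def first_missing_field(fields):
    # `fields` is an ordered list of (input_id, submitted_value) pairs;
    # the first empty value determines focus and error, mirroring the
    # cascade in create_profile().
    for input_id, value in fields:
        if not value:
            return input_id, ['invalid_' + input_id.replace('id_', '', 1)]
    return 'id_username', None

# first_missing_field([('id_username', 'alice'), ('id_email', ''),
#                      ('id_token', 'tok'), ('id_languages', ['eng'])])
# -> ('id_email', ['invalid_email'])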
Example #7
    def handle(self, *args, **options):
        # Validate source and target language codes
        _all = sorted({x.lower() for x in LANGUAGE_CODES_AND_NAMES})
        _src = options['source_language'].lower()
        if _src not in _all:
            self.stdout.write('Unknown source language: {0}!'.format(_src))
            self.stdout.write('Known languages: {0}'.format(', '.join(_all)))
            return

        _tgt = options['target_language'].lower()
        if _tgt not in _all:
            self.stdout.write('Unknown target language: {0}!'.format(_tgt))
            self.stdout.write('Known languages: {0}'.format(', '.join(_all)))
            return

        # Initialize random number generator
        # Extract batch size number of pairs, randomizing order if requested
        # Serialize pairs into JSON format
        # Write out JSON output file

        batch_size = options['batch_size']

        block_size = 10
        block_annotations = 7
        block_redundants = 1
        block_references = 1
        block_badrefs = 1

        # TODO: if a block definition is given, process it here.
        if options['block_definition'] is not None:
            print("WOOHOO")

        if (batch_size % block_size) > 0:
            self.stdout.write(
                'Batch size needs to be divisible by block size!')
            return

        # CHECK THAT WE END UP WITH EVEN NUMBER OF BLOCKS

        print('We will create {0} blocks'.format(int(batch_size / block_size)))

        # TODO: add parameter to set encoding
        # TODO: need to use OrderedDict to preserve segment IDs' order!
        source_file = Command._load_text_from_file(options['source_file'],
                                                   'utf8')
        print('Loaded {0} source segments'.format(len(source_file.keys())))

        reference_file = Command._load_text_from_file(
            options['reference_file'], 'utf8')
        print('Loaded {0} reference segments'.format(len(
            reference_file.keys())))

        systems_files = []
        systems_path = options['systems_path']
        from glob import iglob
        import os.path
        for system_file in iglob('{0}{1}{2}'.format(systems_path, os.path.sep,
                                                    "*.txt")):
            systems_files.append(system_file)

        random_seed_value = 123456

        systems_files.sort()
        seed(random_seed_value)
        shuffle(systems_files)
        # ADD RANDOMIZED SHUFFLING HERE?

        import hashlib
        hashed_text = {}

        for system_path in systems_files:
            system_txt = Command._load_text_from_file(system_path, 'utf8')
            system_bad = Command._load_text_from_file(
                system_path.replace('.txt', '.bad'), 'utf8')
            system_ids = Command._load_text_from_file(
                system_path.replace('.txt', '.ids'), 'utf8')
            system_url = Command._load_text_from_file(
                system_path.replace('.txt', '.url'), 'utf8')

            for segment_id, segment_text in system_txt.items():
                md5hash = hashlib.new('md5',
                                      segment_text.encode('utf8')).hexdigest()
                if md5hash not in hashed_text:
                    hashed_text[md5hash] = {
                        'segment_id': segment_id,
                        'segment_text': segment_text,
                        'segment_bad': system_bad[segment_id],
                        'segment_ref': reference_file[segment_id],
                        'segment_src': source_file[segment_id],
                        'segment_url': system_url[segment_id],
                        'systems': [os.path.basename(system_path)]
                    }
                else:
                    hashed_text[md5hash]['systems'].append(
                        os.path.basename(system_path))

            print('Loaded {0} system {1} segments'.format(
                len(system_txt.keys()), os.path.basename(system_path)))

        all_keys = list(hashed_text.keys())
        all_keys.sort()
        shuffle(all_keys)

        items_per_batch = 10 * 7

        missing_items = items_per_batch - len(all_keys) % items_per_batch
        print('Missing items is {0}/{1}'.format(missing_items,
                                                items_per_batch))

        all_keys.extend(all_keys[0:missing_items])
        print('Added {0} missing items rotating keys'.format(missing_items))

        total_batches = int(floor(len(all_keys) / items_per_batch))
        print('Total number of batches is {0}'.format(total_batches))

        batch_no = options['batch_no']
        all_batches = options['all_batches']
        source_based = options['source_based']

        # If we don't produce all batches, our batch_id will be batch_no-1.
        # This is because batch numbers are one-based, ids zero-indexed.
        #
        # If we produce all batches, we just use range(total_batches).
        # This implicitly gives us zero-indexed ids already.
        batch_nos = [batch_no-1] if not all_batches \
          else list(range(total_batches))

        json_data = []
        for batch_id in batch_nos:  # range(batch_no):
            block_data = []
            block_offset = batch_id * 10 * 7

            num_blocks = int(batch_size / block_size)
            for block_id in range(num_blocks):
                # Human readable ids are one-based, hence +1
                print('Creating batch {0:05}/{1:05}, block {2:02}'.format(
                    batch_id + 1, total_batches, block_id + 1))

                # Get 7 random system outputs
                block_start = block_offset + 7 * (block_id)
                block_end = block_start + 7
                block_hashes = all_keys[block_start:block_end]

                current_block = {'systems': block_hashes}

                block_data.append(current_block)

            # Compute redundant, reference, bad reference bits
            for block_id in range(num_blocks):
                check_id = int((block_id + (num_blocks / 2)) % num_blocks)
                # Human readable ids are one-based, hence +1
                print('Add checks for batch {0:05}/{1:05}, ' \
                  'block {2:02} to block {3:02}'.format(
                    batch_id+1, total_batches, check_id+1, block_id+1
                  )
                )

                check_systems = block_data[check_id]['systems']
                check_systems.sort()
                shuffle(check_systems)

                block_data[block_id]['redundant'] = check_systems[0]
                block_data[block_id]['reference'] = check_systems[1]
                block_data[block_id]['badref'] = check_systems[2]

            # Direct assessment is reference-based for WMT17
            sourceID = basename(options['reference_file'])

            # Remember, batch numbers are one-based
            taskData = OrderedDict({
                'batchNo': batch_id + 1,
                'batchSize': options['batch_size'],
                'sourceLanguage': options['source_language'],
                'targetLanguage': options['target_language'],
                'requiredAnnotations': 1,
                'randomSeed': random_seed_value
            })
            itemsData = []
            _item = 0

            for block_id in range(num_blocks):
                all_items = [(x, 'TGT')
                             for x in block_data[block_id]['systems']]
                all_items.append((block_data[block_id]['redundant'], 'CHK'))
                all_items.append((block_data[block_id]['reference'], 'REF'))
                all_items.append((block_data[block_id]['badref'], 'BAD'))
                shuffle(all_items)

                for current_item, current_type in all_items:
                    item_data = hashed_text[current_item]

                    item_id = item_data['segment_id']
                    item_text = item_data['segment_text']
                    item_bad = item_data['segment_bad']
                    item_ref = item_data['segment_ref']
                    item_src = item_data['segment_src']
                    item_url = item_data['segment_url']
                    item_systems = item_data['systems']

                    targetID = '+'.join(set(item_systems))
                    targetText = item_text
                    if current_type == 'REF':
                        targetID = basename(options['reference_file'])
                        targetText = item_ref
                    elif current_type == 'BAD':
                        targetText = item_bad

                    obj = OrderedDict()
                    obj['_item'] = _item
                    obj['_block'] = block_id + (10 * batch_id)
                    obj['sourceID'] = sourceID
                    obj['sourceText'] = item_ref if not source_based else item_src
                    obj['targetID'] = targetID
                    obj['targetText'] = targetText
                    obj['itemID'] = item_id
                    obj['itemType'] = current_type
                    obj['imageURL'] = item_url

                    itemsData.append(obj)
                    _item += 1

            outputData = OrderedDict({'task': taskData, 'items': itemsData})

            json_data.append(outputData)

        print(json.dumps(json_data, indent=2))
        json_data = json.dumps(json_data, indent=2)

        with open(options['output_json_file'], mode='w',
                  encoding='utf8') as output_file:
            self.stdout.write('Creating {0} ... '.format(
                options['output_json_file']),
                              ending='')
            output_file.write(str(json_data))
            self.stdout.write('OK')
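
The constants above encode the WMT17 batch layout: each block holds 10 items (7 'TGT' system outputs plus one redundant 'CHK', one 'REF' and one 'BAD' item), and the hard-coded 10 * 7 offsets assume ten blocks per batch, i.e. a batch_size of 100 consuming 70 unique segment hashes. Quality-control items for a block are copied from the block halfway around the batch; a minimal sketch of that pairing, assuming ten blocks:

def check_block_for(block_id, num_blocks):
    # The redundant/reference/bad-reference items shown in block
    # `block_id` are drawn from this block.
    return int((block_id + num_blocks / 2) % num_blocks)

# [check_block_for(b, 10) for b in range(10)]
# -> [5, 6, 7, 8, 9, 0, 1, 2, 3, 4]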
Example #8
    def handle(self, *args, **options):
        # Validate source and target language codes
        _all = sorted({x.lower() for x in LANGUAGE_CODES_AND_NAMES})
        _src = options['source_language'].lower()
        if _src not in _all:
            self.stdout.write('Unknown source language: {0}!'.format(_src))
            self.stdout.write('Known languages: {0}'.format(', '.join(_all)))
            return

        _tgt = options['target_language'].lower()
        if _tgt not in _all:
            self.stdout.write('Unknown target language: {0}!'.format(_tgt))
            self.stdout.write('Known languages: {0}'.format(', '.join(_all)))
            return

        # Initialize random number generator
        # Extract batch size number of pairs, randomizing order if requested
        # Serialize pairs into JSON format
        # Write out JSON output file

        batch_size = options['batch_size']
        unicode_enc = options['unicode']
        use_local_src = options['local_src']
        use_local_ref = options['local_ref']
        create_ids = options['create_ids']
        source_based = options['source_based']

        block_size = 10
        block_annotations = 7
        block_redundants = 1
        block_references = 1
        block_badrefs = 1

        # TODO: if a block definition is given, process it here.
        if options['block_definition'] is not None:
            print("WOOHOO")

        if (batch_size % block_size) > 0:
            self.stdout.write(
                'Batch size needs to be divisible by block size!')
            return

        # CHECK THAT WE END UP WITH EVEN NUMBER OF BLOCKS

        print('We will create {0} blocks'.format(int(batch_size / block_size)))

        # TODO: add parameter to set encoding
        # TODO: need to use OrderedDict to preserve segment IDs' order!
        encoding = 'utf16' if unicode_enc else 'utf8'

        source_file = []
        if not use_local_src:
            source_file = Command._load_text_from_file(options['source_file'],
                                                       encoding)
            print('Loaded {0} source segments'.format(len(source_file.keys())))

        reference_file = []
        if not use_local_ref:
            reference_file = Command._load_text_from_file(
                options['reference_file'], encoding)
            print('Loaded {0} reference segments'.format(
                len(reference_file.keys())))

        systems_files = []
        systems_path = options['systems_path']
        from glob import iglob
        import os.path
        for system_file in iglob('{0}{1}{2}'.format(systems_path, os.path.sep,
                                                    "*.txt")):
            if '+' in basename(system_file):
                print('Cannot use system files with + in names ' \
                  'as this breaks multi-system meta systems:\n' \
                  '{0}'.format(system_file))
                sys_exit(-1)
            systems_files.append(system_file)

        random_seed_value = 123456

        systems_files.sort()
        seed(random_seed_value)
        shuffle(systems_files)
        # ADD RANDOMIZED SHUFFLING HERE?

        import hashlib
        hashed_text = {}
        hashes_by_ids = defaultdict(list)

        character_based = _tgt == 'zho' or _tgt == 'jpn' \
          or options['character_based']

        for system_path in systems_files:
            system_txt = Command._load_text_from_file(system_path, encoding)
            # Generate bad references on the fly
            #
            # To do so, we will load a random source segment to fill in a
            # randomly positioned phrase in the given candidate translation.
            #
            # system_bad = Command._load_text_from_file(system_path.replace('.txt', '.bad'), encoding)

            if not create_ids:
                system_ids = Command._load_text_from_file(
                    system_path.replace('.txt', '.ids'), encoding)
            else:
                system_ids = [x + 1 for x in range(len(system_txt))]
            # BASICALLY: add support for local system_src and system_ref files here.
            #   If such files are present, this will overwrite the global src/ref values.
            #   However, this does not fully resolve the issue as we still have to give
            #   a source text file, which is assumed to be shared...
            #
            # IN A SENSE, using these local files makes better sense. It is wasteful, though.
            #   MAYBE, it is better to simply generate a simple JSON config file?!
            local_src = []
            local_ref = []

            if use_local_src:
                local_src_path = system_path.replace('.txt', '.src')
                if os.path.exists(local_src_path):
                    local_src = Command._load_text_from_file(
                        local_src_path, encoding)

            if use_local_ref:
                local_ref_path = system_path.replace('.txt', '.ref')
                if os.path.exists(local_ref_path):
                    local_ref = Command._load_text_from_file(
                        local_ref_path, encoding)

            for segment_id, segment_text in system_txt.items():
                _src = local_src[segment_id] if use_local_src else source_file[
                    segment_id]
                _ref = local_ref[
                    segment_id] if use_local_ref else reference_file[segment_id]
                md5hash = hashlib.new(
                    'md5',
                    segment_text.encode(encoding) + _src.encode(encoding) +
                    _ref.encode(encoding)).hexdigest()

                # Determine length of bad phrase, relative to segment length
                #
                # This follows WMT17:
                # - http://statmt.org/wmt17/pdf/WMT17.pdf

                _bad_len = 1
                _tokens = segment_text \
                  if character_based \
                  else segment_text.split(' ')

                if len(_tokens) == 1:
                    _bad_len = 1
                elif len(_tokens) > 1 and len(_tokens) <= 5:
                    _bad_len = 2
                elif len(_tokens) > 5 and len(_tokens) <= 8:
                    _bad_len = 3
                elif len(_tokens) > 8 and len(_tokens) <= 15:
                    _bad_len = 4
                elif len(_tokens) > 15 and len(_tokens) <= 20:
                    _bad_len = 5
                else:
                    _bad_len = len(_tokens) // 4

                if character_based:
                    _bad_len = 2 * _bad_len

                # Choose random src/ref segment
                _bad_tokens = []
                while len(_bad_tokens) <= _bad_len:
                    _bad_id = randrange(0, len(local_ref)) + 1 \
                      if use_local_ref else randrange(0, len(reference_file)) + 1

                    if source_based:
                        _bad_id = randrange(0, len(local_src)) + 1 \
                          if use_local_src else randrange(0, len(source_file)) + 1

                    _bad_text = None
                    #                    if source_based:
                    #                        _bad_text = local_src[_bad_id] if use_local_src else source_file[_bad_id]
                    #                    else:
                    #
                    # We are currently forcing reference-based bad reference
                    # generation. If no reference is available, then a copy
                    # of the source file will work just fine.
                    #
                    if True:
                        _bad_text = local_ref[
                            _bad_id] if use_local_ref else reference_file[
                                _bad_id]

                    _bad_tokens = _bad_text \
                      if character_based \
                      else _bad_text.split(' ')

                # If dealing with Chinese or Japanese, use double the amount
                # of characters for the bad replacement phrase.
                _bad_phrase = None

                _index = randrange(0, len(_bad_tokens) - _bad_len) \
                  if len(_bad_tokens) - _bad_len > 0 else 0
                _bad_phrase = _bad_tokens[_index:_index + _bad_len]

                _index = randrange(0, len(_tokens) - _bad_len) \
                  if len(_tokens) - _bad_len > 0 else 0
                _bad = _tokens[:_index] + _bad_phrase \
                  + _tokens[_index + _bad_len:]

                segment_bad = ''.join(_bad) \
                  if character_based \
                  else ' '.join(_bad)

                if md5hash not in hashed_text:
                    hashed_text[md5hash] = {
                        'segment_id': segment_id,
                        'segment_text': segment_text,
                        'segment_bad': segment_bad,
                        'segment_ref': _ref,
                        'segment_src': _src,
                        'systems': [os.path.basename(system_path)]
                    }

                    hashes_by_ids[segment_id].append(md5hash)
                else:
                    hashed_text[md5hash]['systems'].append(
                        os.path.basename(system_path))

            print('Loaded {0} system {1} segments'.format(
                len(system_txt.keys()), os.path.basename(system_path)))

        # Dump deduplicated segment data to JSON file.
        json_data = json.dumps(hashed_text, indent=2, sort_keys=True)
        with open(options['output_json_file'] + '.segments',
                  mode='w',
                  encoding='utf8') as output_file:
            self.stdout.write(
                'Creating {0} ... '.format(options['output_json_file'] +
                                           '.segments'),
                ending='')
            output_file.write(str(json_data))
            self.stdout.write('OK')

        all_keys = list(hashed_text.keys())
        all_keys.sort()
        shuffle(all_keys)

        # If --full-coverage is specified, we want to collect annotations for
        # all unique translations for any given segment ID. To do so, we loop
        # over the all_keys list and for each MD5 hash we have not consumed,
        # we add not only the MD5 hash itself but also all other MD5 hashes
        # matching the respective segment ID.
        full_coverage = options['full_coverage']
        if full_coverage:
            _sorted_keys = []
            for key in all_keys:
                if key not in _sorted_keys:
                    segment_id = hashed_text[key]['segment_id']
                    matching_keys = hashes_by_ids[segment_id]
                    matching_keys.sort()
                    _sorted_keys.extend(matching_keys)
            all_keys = _sorted_keys

        items_per_batch = 10 * 7

        missing_items = items_per_batch - len(all_keys) % items_per_batch
        print('Missing items is {0}/{1}'.format(missing_items,
                                                items_per_batch))

        all_keys.extend(all_keys[0:missing_items])
        print('Added {0} missing items rotating keys'.format(missing_items))

        total_batches = int(floor(len(all_keys) / items_per_batch))
        print('Total number of batches is {0}'.format(total_batches))

        batch_no = options['batch_no']
        max_batches = options['max_batches']
        all_batches = options['all_batches']

        # If we don't produce all batches, our batch_id will be batch_no-1.
        # This is because batch numbers are one-based, ids zero-indexed.
        #
        # If we produce all batches, we just use range(total_batches).
        # This implicitly gives us zero-indexed ids already.
        batch_nos = [batch_no-1] if not all_batches \
          else list(range(total_batches))
        if max_batches:
            batch_nos = batch_nos[:max_batches]

        json_data = []
        for batch_id in batch_nos:  # range(batch_no):
            block_data = []
            block_offset = batch_id * 10 * 7

            num_blocks = int(batch_size / block_size)
            for block_id in range(num_blocks):
                # Human readable ids are one-based, hence +1
                print('Creating batch {0:05}/{1:05}, block {2:02}'.format(
                    batch_id + 1, total_batches, block_id + 1))

                # Get 7 random system outputs
                block_start = block_offset + 7 * (block_id)
                block_end = block_start + 7
                block_hashes = all_keys[block_start:block_end]

                current_block = {'systems': block_hashes}

                block_data.append(current_block)

            # Compute redundant, reference, bad reference bits
            for block_id in range(num_blocks):
                check_id = int((block_id + (num_blocks / 2)) % num_blocks)
                # Human readable ids are one-based, hence +1
                print('Add checks for batch {0:05}/{1:05}, ' \
                  'block {2:02} to block {3:02}'.format(
                    batch_id+1, total_batches, check_id+1, block_id+1
                  )
                )

                check_systems = block_data[check_id]['systems']
                check_systems.sort()
                shuffle(check_systems)

                block_data[block_id]['redundant'] = check_systems[0]
                block_data[block_id]['reference'] = check_systems[1]
                block_data[block_id]['badref'] = check_systems[2]

            # Direct assessment is reference-based for WMT17
            if source_based:
                sourceID = 'LOCAL_SRC' if use_local_src else basename(
                    options['source_file'])
            else:
                sourceID = 'LOCAL_REF' if use_local_ref else basename(
                    options['reference_file'])

            # Remember, batch numbers are one-based
            taskData = OrderedDict({
                'batchNo': batch_id + 1,
                'batchSize': options['batch_size'],
                'sourceLanguage': options['source_language'],
                'targetLanguage': options['target_language'],
                'requiredAnnotations': options['required_annotations'],
                'randomSeed': random_seed_value
            })
            itemsData = []
            _item = 0

            for block_id in range(num_blocks):
                all_items = [(x, 'TGT')
                             for x in block_data[block_id]['systems']]
                all_items.append((block_data[block_id]['redundant'], 'CHK'))
                all_items.append((block_data[block_id]['reference'], 'REF'))
                all_items.append((block_data[block_id]['badref'], 'BAD'))
                shuffle(all_items)

                for current_item, current_type in all_items:
                    item_data = hashed_text[current_item]

                    item_id = item_data['segment_id']
                    item_text = item_data['segment_text']
                    item_bad = item_data['segment_bad']
                    item_ref = item_data['segment_ref']
                    item_src = item_data['segment_src']
                    item_systems = item_data['systems']

                    targetID = '+'.join(sorted(set(item_systems)))
                    targetText = item_text
                    if current_type == 'REF':
                        targetID = basename(options['reference_file'])
                        targetText = item_ref

                    elif current_type == 'BAD':
                        targetText = item_bad

                    obj = OrderedDict()
                    obj['_item'] = _item
                    obj['_block'] = block_id + (10 * batch_id)
                    obj['sourceID'] = sourceID
                    obj['sourceText'] = item_ref if not source_based else item_src
                    obj['targetID'] = targetID
                    obj['targetText'] = targetText
                    obj['itemID'] = item_id
                    obj['itemType'] = current_type

                    itemsData.append(obj)
                    _item += 1

            outputData = OrderedDict({'task': taskData, 'items': itemsData})

            json_data.append(outputData)

        json_data = json.dumps(json_data, indent=2, sort_keys=True)
        print(json_data)

        with open(options['output_json_file'], mode='w',
                  encoding='utf8') as output_file:
            self.stdout.write('Creating {0} ... '.format(
                options['output_json_file']),
                              ending='')
            output_file.write(str(json_data))
            self.stdout.write('OK')
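
Bad-reference items are built by splicing a random phrase from another reference segment into the candidate translation, with the phrase length derived from the candidate's token count following the WMT17 scheme hard-coded above. A standalone sketch of that length rule:

def bad_phrase_length(num_tokens, character_based=False):
    # Mirrors the if/elif ladder above; for character-based languages
    # (Chinese, Japanese) the span is doubled because the unit is a
    # single character rather than a whitespace-separated token.
    if num_tokens <= 1:
        length = 1
    elif num_tokens <= 5:
        length = 2
    elif num_tokens <= 8:
        length = 3
    elif num_tokens <= 15:
        length = 4
    elif num_tokens <= 20:
        length = 5
    else:
        length = num_tokens // 4
    return 2 * length if character_based else length

# bad_phrase_length(12) -> 4
# bad_phrase_length(12, character_based=True) -> 8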