Пример #1
0
class TextSegment(EvalItem):
    """
    Models a single text segment.
    """
    segmentID = models.CharField(
        max_length=MAX_SEGMENTID_LENGTH,
        verbose_name=_('Segment ID'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTID_LENGTH)),
    )

    segmentText = models.TextField(
        max_length=MAX_SEGMENTTEXT_LENGTH,
        verbose_name=_('Segment text'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTTEXT_LENGTH)),
    )

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current TextSegment instance, checking text.

        The segment text must be a non-empty string no longer than
        MAX_SEGMENTTEXT_LENGTH characters; all other checks are
        delegated to the super class.
        """
        text = self.segmentText
        if not isinstance(text, str):
            return False

        if not 1 <= len(text) <= MAX_SEGMENTTEXT_LENGTH:
            return False

        return super(TextSegment, self).is_valid()
Пример #2
0
class ObjectID(models.Model):
    """
    Encodes an object type and ID for retrieval.
    """
    typeName = models.CharField(
      db_index=True,
      max_length=MAX_TYPENAME_LENGTH,
      verbose_name=_('Type name'),
      help_text=_(f('(max. {value} characters)',
        value=MAX_TYPENAME_LENGTH))
    )

    primaryID = models.CharField(
      db_index=True,
      max_length=MAX_PRIMARYID_LENGTH,
      verbose_name=_('Primary ID'),
      help_text=_(f('(max. {value} characters)',
        value=MAX_PRIMARYID_LENGTH))
    )

    def get_object_instance(self):
        """
        Returns actual object instance for current ObjectID instance.

        Returns None (and logs a warning with the traceback) if typeName
        or primaryID do not resolve to an existing object.
        """
        instance = None
        try:
            # TODO: add registry of type names to models.py and ensure only
            #   those are used for typeName. Furthermore, verify that the
            #   given primaryID does not contain ')'.

            _code = '{0}.objects.get(id={1})'.format(
              self.typeName, self.primaryID
            )

            # Hack for Python 3.5.2
            from EvalData.models import (
                DataAssessmentTask,
                DirectAssessmentTask,
                DirectAssessmentContextTask,
                DirectAssessmentDocumentTask,
                MultiModalAssessmentTask,
                PairwiseAssessmentTask,
            )

            # SECURITY: eval() on database-provided typeName/primaryID is
            # dangerous; see the TODO above about restricting valid inputs.
            instance = eval(_code)

        # Narrowed from a bare `except:` so that SystemExit and
        # KeyboardInterrupt are no longer swallowed; also replaced the
        # deprecated LOGGER.warn() alias with LOGGER.warning().
        except Exception:
            _msg = 'ObjectID {0}.{1} invalid'.format(
              self.typeName, self.primaryID
            )
            LOGGER.warning(_msg)
            LOGGER.warning(format_exc())

        # Returning here (instead of inside a `finally:` block) avoids
        # suppressing exceptions that are not handled above.
        return instance

    def __str__(self):
        return str(self.id)+'.'+self.typeName+'.'+self.primaryID
Пример #3
0
class TextPairWithImage(EvalItem):
    """
    Models a pair of two text segments and an image.
    """
    sourceID = models.CharField(max_length=MAX_SEGMENTID_LENGTH,
                                verbose_name=_('Source ID'),
                                help_text=_(
                                    f('(max. {value} characters)',
                                      value=MAX_SEGMENTID_LENGTH)))

    sourceText = models.CharField(max_length=MAX_SEGMENTTEXT_LENGTH,
                                  verbose_name=_('Source text'),
                                  help_text=_(
                                      f('(max. {value} characters)',
                                        value=MAX_SEGMENTTEXT_LENGTH)))

    targetID = models.CharField(max_length=MAX_SEGMENTID_LENGTH,
                                verbose_name=_('Target ID'),
                                help_text=_(
                                    f('(max. {value} characters)',
                                      value=MAX_SEGMENTID_LENGTH)))

    targetText = models.CharField(max_length=MAX_SEGMENTTEXT_LENGTH,
                                  verbose_name=_('Target text'),
                                  help_text=_(
                                      f('(max. {value} characters)',
                                        value=MAX_SEGMENTTEXT_LENGTH)))

    imageURL = models.URLField(verbose_name=_('image URL'))

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current TextPair instance, checking text.

        Both source and target texts must be non-empty strings no longer
        than MAX_SEGMENTTEXT_LENGTH characters.
        """
        # BUG FIX: the isinstance() checks below were inverted, which made
        # every valid string value fail validation (compare with
        # TextSegment.is_valid, which correctly uses `not isinstance`).
        if not isinstance(self.sourceText, str):
            return False

        _len = len(self.sourceText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        if not isinstance(self.targetText, str):
            return False

        _len = len(self.targetText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        # This does not implement validation for image URLs yet.

        return super(TextPairWithImage, self).is_valid()
Пример #4
0
class TextPair(EvalItem):
    """
    Models a pair of two text segments.
    """
    sourceID = models.CharField(
      max_length=MAX_SEGMENTID_LENGTH,
      verbose_name=_('Source ID'),
      help_text=_(f('(max. {value} characters)',
        value=MAX_SEGMENTID_LENGTH))
    )

    sourceText = models.TextField(
      blank=True,
      verbose_name=_('Source text'),
    )

    targetID = models.CharField(
      max_length=MAX_SEGMENTID_LENGTH,
      verbose_name=_('Target ID'),
      help_text=_(f('(max. {value} characters)',
        value=MAX_SEGMENTID_LENGTH))
    )

    targetText = models.TextField(
      blank=True,
      verbose_name=_('Target text'),
    )

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current TextPair instance, checking text.

        Both source and target texts must be non-empty strings no longer
        than MAX_SEGMENTTEXT_LENGTH characters.
        """
        # BUG FIX: the isinstance() checks below were inverted, which made
        # every valid string value fail validation (compare with
        # TextSegment.is_valid, which correctly uses `not isinstance`).
        if not isinstance(self.sourceText, str):
            return False

        _len = len(self.sourceText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        if not isinstance(self.targetText, str):
            return False

        _len = len(self.targetText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        return super(TextPair, self).is_valid()
class TextPairWithContext(TextPair):
    """
    Models a pair of two text segments and corresponding context.
    """
    documentID = models.CharField(
        max_length=MAX_DOCUMENTID_LENGTH,
        verbose_name=_('Document ID'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_DOCUMENTID_LENGTH)),
    )

    isCompleteDocument = models.BooleanField(
        blank=True,
        db_index=True,
        default=False,
        verbose_name=_('Complete document?'),
    )

    # Optional left/right context surrounding the source segment.
    sourceContextLeft = models.TextField(
        blank=True, null=True, verbose_name=_('Source context (left)'))

    sourceContextRight = models.TextField(
        blank=True, null=True, verbose_name=_('Source context (right)'))

    # Optional left/right context surrounding the target segment.
    targetContextLeft = models.TextField(
        blank=True, null=True, verbose_name=_('Target context (left)'))

    targetContextRight = models.TextField(
        blank=True, null=True, verbose_name=_('Target context (right)'))

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current TextPairWithContext instance, checking text.

        All context fields are optional, so validation is fully delegated
        to the super class.
        """
        return super(TextPairWithContext, self).is_valid()
Пример #6
0
class Metadata(BaseMetadata):
    """
    Models metadata associated to tasks.
    """
    market = models.ForeignKey(
        Market,
        db_index=True,
        on_delete=models.PROTECT,
    )

    corpusName = models.CharField(
        max_length=MAX_CORPUSNAME_LENGTH,
        verbose_name=_('Corpus name'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_CORPUSNAME_LENGTH)),
    )

    versionInfo = models.CharField(
        max_length=MAX_VERSIONINFO_LENGTH,
        verbose_name=_('Version info'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_VERSIONINFO_LENGTH)),
    )

    source = models.CharField(
        max_length=MAX_SOURCE_LENGTH,
        verbose_name=_('Source'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SOURCE_LENGTH)),
    )

    class Meta:
        ordering = ['_str_name']
        verbose_name = 'Metadata record'

    def _generate_str_name(self):
        """
        Builds the display name, e.g. src->tgt/corpus["version"].
        """
        _market = self.market
        return '{0}->{1}/{2}["{3}"]'.format(
            _market.sourceLanguageCode,
            _market.targetLanguageCode,
            self.corpusName,
            self.versionInfo,
        )
Пример #7
0
    def clean_fields(self, exclude=None):
        """
        Verifies that desired marketID is still available.

        Raises ValidationError if a Market with the same marketID
        (built from source/target language codes and domain name)
        already exists.
        """
        _new_marketID = '{0}_{1}_{2}'.format(
            self.sourceLanguageCode,
            self.targetLanguageCode,
            self.domainName
        )

        if Market.objects.filter(marketID=_new_marketID).exists():
            raise ValidationError(
              _(f('Market with identical marketID ("{mID}") already exists.',
                mID=_new_marketID))
            )

        super(Market, self).clean_fields(exclude)
Пример #8
0
class TextSegmentWithTwoTargets(TextSegment):
    """
    Models a text segment with one or two sub-segments.
    """
    target1ID = models.CharField(max_length=MAX_SEGMENTID_LENGTH,
                                 verbose_name=_('Item ID (1)'),
                                 help_text=_(
                                     f('(max. {value} characters)',
                                       value=MAX_SEGMENTID_LENGTH)))

    target1Text = models.TextField(
        blank=True,
        verbose_name=_('Text (1)'),
    )

    # Second target is optional; target2ID/target2Text may be None.
    target2ID = models.CharField(null=True,
                                 max_length=MAX_SEGMENTID_LENGTH,
                                 verbose_name=_('Item ID (2)'),
                                 help_text=_(
                                     f('(max. {value} characters)',
                                       value=MAX_SEGMENTID_LENGTH)))

    target2Text = models.TextField(
        blank=True,
        null=True,
        verbose_name=_('Text (2)'),
    )

    contextLeft = models.TextField(blank=True,
                                   null=True,
                                   verbose_name=_('Context (left)'))

    contextRight = models.TextField(blank=True,
                                    null=True,
                                    verbose_name=_('Context (right)'))

    def has_context(self):
        """Checks if the current segment has context provided."""
        return self.contextLeft or self.contextRight

    def context_left(self, last=5, separator=' '):
        """
        Returns formatted last `last` sentences from the left context.
        Use separator='<br>' to show one sentence per line.
        """
        return (separator.join(self.contextLeft.split('\n')[-last:])
                if self.contextLeft else '')

    def context_right(self, first=5, separator=' '):
        """
        Returns formatted first `first` sentences from the right context.
        Use separator='<br>' to show one sentence per line.
        """
        return (separator.join(self.contextRight.split('\n')[:first])
                if self.contextRight else '')

    def target_texts_with_diffs(self):
        """
        Returns the pair of texts with HTML tags highlighting token differences.
        Both texts must be non empty.

        For example,
            'a b c d e' and 'a B c e f'
        will become:
            'a <span class="diff diff-sub">b</span> c <span class="diff diff-del">d</span> e',
            'a <span class="diff diff-sub">B</span> c e <span class="diff diff-ins">f</span>'
        """
        if not self.target1Text or not self.target2Text:
            return (self.target1Text, self.target2Text)

        toks1 = self.target1Text.split()
        toks2 = self.target2Text.split()
        matcher = SequenceMatcher(None, toks1, toks2)

        text1 = ''
        text2 = ''
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == 'equal':
                text1 += ' ' + ' '.join(toks1[i1:i2])
                text2 += ' ' + ' '.join(toks2[j1:j2])
            elif tag == 'replace':
                text1 += ' <span class="diff diff-sub">' + ' '.join(
                    toks1[i1:i2]) + '</span>'
                text2 += ' <span class="diff diff-sub">' + ' '.join(
                    toks2[j1:j2]) + '</span>'
            elif tag == 'insert':
                text2 += ' <span class="diff diff-ins">' + ' '.join(
                    toks2[j1:j2]) + '</span>'
            elif tag == 'delete':
                text1 += ' <span class="diff diff-del">' + ' '.join(
                    toks1[i1:i2]) + '</span>'
        return (text1.strip(), text2.strip())

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current TextSegmentWithTwoTargets instance, checking
        text.

        target1Text must be a non-empty string within the length limit;
        target2Text, when present, must also be valid and different from
        target1Text.
        """
        # BUG FIX: the isinstance() checks were inverted, rejecting every
        # valid string (cf. TextSegment.is_valid).
        if not isinstance(self.target1Text, str):
            return False

        _len = len(self.target1Text)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        # BUG FIX: the original referenced a bare `target2Text` name here,
        # which raised NameError; it must be the instance attribute.
        if self.target2Text and len(self.target2Text) > 0:
            if not isinstance(self.target2Text, str):
                return False

            _len = len(self.target2Text)
            if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
                return False

            # Texts must be different
            if self.target1Text == self.target2Text:
                return False

        return super(TextSegmentWithTwoTargets, self).is_valid()
Пример #9
0
class PairwiseAssessmentTask(BaseMetadata):
    """
    Models a direct assessment evaluation task.
    """
    campaign = models.ForeignKey(
        'Campaign.Campaign',
        db_index=True,
        on_delete=models.PROTECT,
        related_name='%(app_label)s_%(class)s_campaign',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Campaign'))

    items = models.ManyToManyField(
        TextSegmentWithTwoTargets,
        related_name='%(app_label)s_%(class)s_items',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Items'))

    requiredAnnotations = models.PositiveSmallIntegerField(
        verbose_name=_('Required annotations'),
        help_text=_(
            f('(value in range=[1,{value}])',
              value=MAX_REQUIREDANNOTATIONS_VALUE)))

    assignedTo = models.ManyToManyField(
        User,
        blank=True,
        db_index=True,
        related_name='%(app_label)s_%(class)s_assignedTo',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Assigned to'),
        help_text=_('(users working on this task)'))

    batchNo = models.PositiveIntegerField(verbose_name=_('Batch number'),
                                          help_text=_('(1-based)'))

    batchData = models.ForeignKey(
        'Campaign.CampaignData',
        on_delete=models.PROTECT,
        blank=True,
        db_index=True,
        null=True,
        related_name='%(app_label)s_%(class)s_batchData',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Batch data'))

    def dataName(self):
        """Returns string representation of the attached batch data."""
        return str(self.batchData)

    def marketName(self):
        """Returns string representation of the first item's market."""
        return str(self.items.first().metadata.market)

    def marketSourceLanguage(self):
        """
        Returns the source language name for this task's market, or None
        if the market string is not `src_tgt_domain` or the code is unknown.
        """
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[0] in LANGUAGE_CODES_AND_NAMES.keys():
            return LANGUAGE_CODES_AND_NAMES[tokens[0]]
        return None

    def marketSourceLanguageCode(self):
        """Returns the source language code for this task's market, or None."""
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[0] in LANGUAGE_CODES_AND_NAMES.keys():
            return tokens[0]
        return None

    def marketTargetLanguage(self):
        """Returns the target language name for this task's market, or None."""
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[1] in LANGUAGE_CODES_AND_NAMES.keys():
            return LANGUAGE_CODES_AND_NAMES[tokens[1]]
        return None

    def marketTargetLanguageCode(self):
        """Returns the target language code for this task's market, or None."""
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[1] in LANGUAGE_CODES_AND_NAMES.keys():
            return tokens[1]
        return None

    def completed_items_for_user(self, user):
        """Returns the number of distinct items completed by the given user."""
        results = PairwiseAssessmentResult.objects.filter(
            task=self, activated=False, completed=True,
            createdBy=user).values_list('item_id', flat=True)

        return len(set(results))

    def is_trusted_user(self, user):
        """Checks whether user is a trusted user for this task's campaign."""
        from Campaign.models import TrustedUser
        trusted_user = TrustedUser.objects.filter(
          user=user, campaign=self.campaign
        )
        return trusted_user.exists()

    def next_item_for_user(self, user, return_completed_items=False):
        """
        Returns the next item the given user should annotate, or None.

        Also completes the task when enough unique annotations have been
        collected. With return_completed_items=True, returns the tuple
        (next_item, completed_items) instead.
        """
        trusted_user = self.is_trusted_user(user)

        next_item = None
        completed_items = 0
        for item in self.items.all().order_by('id'):
            result = PairwiseAssessmentResult.objects.filter(item=item,
                                                             activated=False,
                                                             completed=True,
                                                             createdBy=user)

            if not result.exists():
                print('identified next item: {0}/{1} for trusted={2}'.format(
                    item.id, item.itemType, trusted_user))
                # Trusted users only work on items whose type starts 'TGT'.
                if not trusted_user or item.itemType.startswith('TGT'):
                    next_item = item
                    print('  - got it')
                    break

            completed_items += 1

        if not next_item:
            LOGGER.info('No next item found for task {0}'.format(self.id))
            annotations = PairwiseAssessmentResult.objects.filter(
                task=self, activated=False,
                completed=True).values_list('item_id', flat=True)
            uniqueAnnotations = len(set(annotations))

            # Regular users annotate 100 items per task, trusted users 70.
            required_user_results = 100
            if trusted_user:
                required_user_results = 70

            _total_required = self.requiredAnnotations * required_user_results
            LOGGER.info('Unique annotations={0}/{1}'.format(
                uniqueAnnotations, _total_required))
            if uniqueAnnotations >= _total_required:
                LOGGER.info('Completing task {0}'.format(self.id))
                self.complete()
                self.save()

                # NOTE: deliberately NOT completing self.batchData here.

        if return_completed_items:
            return (next_item, completed_items)

        return next_item

    @classmethod
    def get_task_for_user(cls, user):
        """Returns the newest active task assigned to user with work left."""
        for active_task in cls.objects.filter(assignedTo=user,
                                              activated=True,
                                              completed=False).order_by('-id'):
            next_item = active_task.next_item_for_user(user)
            if next_item is not None:
                return active_task

        return None

    @classmethod
    def get_next_free_task_for_language(cls, code, campaign=None, user=None):
        """
        Returns the next active, incomplete task for the given target
        language code (optionally restricted to a campaign) which still
        needs annotators and is not yet assigned to the given user.
        """
        print('  Looking for next free task for language: {0}'.format(code))
        print('  Campaign: {0}'.format(campaign))
        print('  User: {0}'.format(user))

        active_tasks = cls.objects.filter(
            activated=True,
            completed=False,
            items__metadata__market__targetLanguageCode=code)

        print('    Number of active tasks: ({0})'.format(len(active_tasks)))

        if campaign:
            active_tasks = active_tasks.filter(campaign=campaign)

        for active_task in active_tasks.order_by('id'):
            active_users = active_task.assignedTo.count()
            if active_users < active_task.requiredAnnotations:
                if user and not user in active_task.assignedTo.all():
                    return active_task

        print('    No next free task available')
        return None
        # NOTE: two unreachable legacy fallback loops which followed this
        # final return were removed; they could never execute.

    @classmethod
    def get_next_free_task_for_language_and_campaign(cls, code, campaign):
        """Convenience wrapper around get_next_free_task_for_language()."""
        return cls.get_next_free_task_for_language(code, campaign)

    @classmethod
    def import_from_json(cls, campaign, batch_user, batch_data, max_count):
        """
        Creates new PairwiseAssessmentTask instances based on JSON input.

        Parameters:
        - campaign: Campaign instance the new tasks belong to;
        - batch_user: User recorded as creator of items and tasks;
        - batch_data: CampaignData with a JSON (or zipped JSON) data file;
        - max_count: maximum number of tasks to create; <= 0 means no limit.
        """
        batch_meta = batch_data.metadata
        batch_name = batch_data.dataFile.name
        batch_file = batch_data.dataFile
        batch_json = None

        if batch_name.endswith('.zip'):
            if not is_zipfile(batch_file):
                _msg = 'Batch {0} not a valid ZIP archive'.format(batch_name)
                LOGGER.warning(_msg)
                return

            batch_zip = ZipFile(batch_file)
            batch_json_files = [
                x for x in batch_zip.namelist() if x.endswith('.json')
            ]
            # TODO: implement proper support for multiple json files in archive.
            for batch_json_file in batch_json_files:
                batch_content = batch_zip.read(batch_json_file).decode('utf-8')
                # BUG FIX: json.loads() lost its `encoding` keyword in
                # Python 3.9; the content is already decoded above.
                batch_json = loads(batch_content)

        else:
            batch_json = loads(str(batch_file.read(), encoding="utf-8"))

        from datetime import datetime
        t1 = datetime.now()

        current_count = 0
        max_length_id = 0
        max_length_text = 0
        for batch_task in batch_json:
            if max_count > 0 and current_count >= max_count:
                _msg = 'Stopping after max_count={0} iterations'.format(
                    max_count)
                LOGGER.info(_msg)

                t2 = datetime.now()
                print(t2 - t1)
                return

            print('Loading batch:', batch_name, batch_task['task']['batchNo'])

            new_items = []
            count_items = 0
            for item in batch_task['items']:
                count_items += 1

                # TODO: check if target1 + target2 should be used here
                current_length_id = len(item['sourceID'])
                current_length_text = len(item['sourceText'])

                if current_length_id > max_length_id:
                    print(current_length_id, item['sourceID'])
                    max_length_id = current_length_id

                if current_length_text > max_length_text:
                    print(current_length_text,
                          item['sourceText'].encode('utf-8'))
                    max_length_text = current_length_text

                item_targets = item['targets']

                # TODO: check if 'targets' is empty or has more elements
                # than 2
                item_tgt1_idx = item_targets[0]['targetID']
                item_tgt1_txt = item_targets[0]['targetText']

                # Second target is optional.
                item_tgt2_idx = None
                item_tgt2_txt = None
                if len(item_targets) > 1:
                    item_tgt2_idx = item_targets[1]['targetID']
                    item_tgt2_txt = item_targets[1]['targetText']

                context_left = item.get('contextLeft', None)
                context_right = item.get('contextRight', None)

                new_item = TextSegmentWithTwoTargets(
                    segmentID=item['sourceID'],
                    segmentText=item['sourceText'],
                    target1ID=item_tgt1_idx,
                    target1Text=item_tgt1_txt,
                    target2ID=item_tgt2_idx,
                    target2Text=item_tgt2_txt,
                    createdBy=batch_user,
                    itemID=item['itemID'],
                    itemType=item['itemType'],
                    contextLeft=context_left,
                    contextRight=context_right,
                )
                new_items.append(new_item)

            # Each task is expected to contain exactly 100 items.
            if len(new_items) != 100:
                _msg = 'Expected 100 items for task but found {0}'.format(
                    count_items)
                LOGGER.warning(_msg)
                continue

            current_count += 1

            # Adding with bulk=False persists the unsaved items.
            batch_meta.textsegment_set.add(*new_items, bulk=False)
            batch_meta.save()

            new_task = PairwiseAssessmentTask(
                campaign=campaign,
                requiredAnnotations=batch_task['task']['requiredAnnotations'],
                batchNo=batch_task['task']['batchNo'],
                batchData=batch_data,
                createdBy=batch_user,
            )
            new_task.save()

            new_task.items.add(*new_items)
            new_task.save()

            _msg = 'Success processing batch {0}, task {1}'.format(
                str(batch_data), batch_task['task']['batchNo'])
            LOGGER.info(_msg)

        _msg = 'Max length ID={0}, text={1}'.format(max_length_id,
                                                    max_length_text)
        LOGGER.info(_msg)

        t2 = datetime.now()
        print(t2 - t1)

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current DA task, checking campaign and items exist.
        """
        if not hasattr(self, 'campaign') or not self.campaign.is_valid():
            return False

        if not hasattr(self, 'items'):
            return False

        # BUG FIX: iterate the queryset via .all(); a ManyRelatedManager
        # itself is not iterable and raised TypeError here.
        for item in self.items.all():
            if not item.is_valid():
                return False

        return True

    def _generate_str_name(self):
        """Generates canonical string name, e.g. ClassName.Campaign[id]."""
        return '{0}.{1}[{2}]'.format(self.__class__.__name__, self.campaign,
                                     self.id)
Пример #10
0
class TextPairWithDomain(TextPair):
    """
    Models a pair of two multi-line text segments with domain and URL.
    """
    # Delimiter separating sentences within multi-line segments.
    SENTENCE_DELIMITER = '\n'

    documentDomain = models.CharField(max_length=MAX_SEGMENTID_LENGTH,
                                      verbose_name=_('Domain'),
                                      help_text=_(
                                          f('(max. {value} characters)',
                                            value=MAX_SEGMENTID_LENGTH)))

    sourceURL = models.TextField(
        blank=True,
        verbose_name=_('Source URL'),
    )

    targetURL = models.TextField(
        blank=True,
        verbose_name=_('Target URL'),
    )

    def get_sentence_pairs(self):
        """
        Returns pairs of source and target sentences created from source
        and target segments.
        """
        return zip(self.sourceText.split(self.SENTENCE_DELIMITER),
                   self.targetText.split(self.SENTENCE_DELIMITER))

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current TextPairWithDomain instance, checking text.

        Source and target texts must be non-empty strings within the
        length limit, contain the same number of sentences, and both
        URLs must be non-empty and within the length limit.
        """
        # BUG FIX: the isinstance() checks were inverted, rejecting every
        # valid string value (cf. TextSegment.is_valid).
        if not isinstance(self.sourceText, str):
            return False

        _len = len(self.sourceText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        if not isinstance(self.targetText, str):
            return False

        _len = len(self.targetText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        # Check if multi-line segments are of the same length
        _src_segs = self.sourceText.strip().split(self.SENTENCE_DELIMITER)
        _tgt_segs = self.targetText.strip().split(self.SENTENCE_DELIMITER)
        if len(_src_segs) != len(_tgt_segs):
            return False

        _len = len(self.sourceURL)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        _len = len(self.targetURL)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        return super(TextPairWithDomain, self).is_valid()
class DirectAssessmentDocumentTask(BaseMetadata):
    """
    Models a direct assessment document evaluation task.

    Note: this task is, similarly to other models, a shameless copy of
    DirectAssessmentContextTask, with one additional method for retrieving all
    items belonging to the same document in the task called
    `next_document_for_user`, and a helper method `get_results_for_each_item`.
    The underlying model is the same as for
    DirectAssessmentContextTask.
    """
    # Campaign this task belongs to.
    campaign = models.ForeignKey(
        'Campaign.Campaign',
        db_index=True,
        on_delete=models.PROTECT,
        related_name='%(app_label)s_%(class)s_campaign',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Campaign'))

    # Evaluation items (segment pairs with document context) for this task.
    items = models.ManyToManyField(
        TextPairWithContext,
        related_name='%(app_label)s_%(class)s_items',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Items'))

    requiredAnnotations = models.PositiveSmallIntegerField(
        verbose_name=_('Required annotations'),
        help_text=_(
            f('(value in range=[1,{value}])',
              value=MAX_REQUIREDANNOTATIONS_VALUE)))

    assignedTo = models.ManyToManyField(
        User,
        blank=True,
        db_index=True,
        related_name='%(app_label)s_%(class)s_assignedTo',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Assigned to'),
        help_text=_('(users working on this task)'))

    batchNo = models.PositiveIntegerField(verbose_name=_('Batch number'),
                                          help_text=_('(1-based)'))

    batchData = models.ForeignKey(
        'Campaign.CampaignData',
        on_delete=models.PROTECT,
        blank=True,
        db_index=True,
        null=True,
        related_name='%(app_label)s_%(class)s_batchData',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Batch data'))

    def dataName(self):
        """Returns the display name of the attached batch data."""
        return str(self.batchData)

    def _market_tokens(self):
        """
        Splits the market ID of the task's first item into its components.

        A valid market ID has the shape '<src>_<tgt>_<domain>', i.e. the
        split yields exactly three tokens.
        """
        return str(self.items.first().metadata.market).split('_')

    def marketName(self):
        """Returns the market ID string of the task's first item."""
        return str(self.items.first().metadata.market)

    def marketSourceLanguage(self):
        """Returns the source language name, or None if unresolvable."""
        tokens = self._market_tokens()
        if len(tokens) == 3 and tokens[0] in LANGUAGE_CODES_AND_NAMES:
            return LANGUAGE_CODES_AND_NAMES[tokens[0]]
        return None

    def marketSourceLanguageCode(self):
        """Returns the source language code, or None if unresolvable."""
        tokens = self._market_tokens()
        if len(tokens) == 3 and tokens[0] in LANGUAGE_CODES_AND_NAMES:
            return tokens[0]
        return None

    def marketTargetLanguage(self):
        """Returns the target language name, or None if unresolvable."""
        tokens = self._market_tokens()
        if len(tokens) == 3 and tokens[1] in LANGUAGE_CODES_AND_NAMES:
            return LANGUAGE_CODES_AND_NAMES[tokens[1]]
        return None

    def marketTargetLanguageCode(self):
        """Returns the target language code, or None if unresolvable."""
        tokens = self._market_tokens()
        if len(tokens) == 3 and tokens[1] in LANGUAGE_CODES_AND_NAMES:
            return tokens[1]
        return None

    def completed_items_for_user(self, user):
        """Returns the number of distinct items the user has completed."""
        results = DirectAssessmentDocumentResult.objects.filter(
            task=self, activated=False, completed=True,
            createdBy=user).values_list('item_id', flat=True)

        return len(set(results))

    def is_trusted_user(self, user):
        """Checks whether the user is trusted for this task's campaign."""
        # Local import avoids a circular dependency with Campaign.models.
        from Campaign.models import TrustedUser
        trusted_user = TrustedUser.objects.filter(
            user=user, campaign=self.campaign
        )
        return trusted_user.exists()

    def next_item_for_user(self, user, return_completed_items=False):
        """
        Returns the next item the given user still has to annotate.

        If return_completed_items is True, returns a (next_item,
        completed_items) tuple instead. When no next item exists and enough
        unique annotations have been collected, the task is marked complete.
        """
        trusted_user = self.is_trusted_user(user)

        next_item = None
        completed_items = 0
        for item in self.items.all().order_by('id'):
            result = DirectAssessmentDocumentResult.objects.filter(
                item=item, activated=False, completed=True, createdBy=user)

            if not result.exists():
                print(
                    'Identified next item: {}/{} (itemID={}) for trusted={}' \
                    .format(item.id, item.itemType, item.itemID, trusted_user)
                )
                # Trusted users only annotate 'TGT' items.
                if not trusted_user or item.itemType == 'TGT':
                    next_item = item
                    break

            completed_items += 1

        if not next_item:
            LOGGER.info('No next item found for task {0}'.format(self.id))
            annotations = DirectAssessmentDocumentResult.objects.filter(
                task=self, activated=False,
                completed=True).values_list('item_id', flat=True)
            uniqueAnnotations = len(set(annotations))

            # NOTE(review): trusted users have a lower quota (70 vs 100,
            # the per-task item count) — presumably the number of TGT
            # items; confirm.
            required_user_results = 100
            if trusted_user:
                required_user_results = 70

            _total_required = self.requiredAnnotations * required_user_results
            LOGGER.info('Unique annotations={0}/{1}'.format(
                uniqueAnnotations, _total_required))
            if uniqueAnnotations >= _total_required:
                LOGGER.info('Completing task {0}'.format(self.id))
                self.complete()
                self.save()

        if return_completed_items:
            return (next_item, completed_items)

        return next_item

    def next_document_for_user(self, user, return_statistics=True):
        """
        Returns the next item and all items from its document.

        With return_statistics=True (the default) the return value is a
        7-tuple documented at the bottom of this method; otherwise it is
        (next_item, block_items, block_results).
        """
        # Find the next not annotated item
        (
            next_item,
            completed_items,
        ) = self.next_item_for_user(user, return_completed_items=True)

        if not next_item:
            if not return_statistics:
                return (next_item, [], [])
            return (next_item, completed_items, 0, 0, [], [], 0)

        # Retrieve all items from the document which next_item belongs to
        _items = self.items.filter(
            documentID=next_item.documentID, ).order_by('id')

        # Collect the block (document) of items that contains next_item;
        # an item flagged isCompleteDocument terminates each block.
        block_items = []
        current_block = False
        for item in _items:
            block_items.append(item)
            if item.id == next_item.id:
                current_block = True
            if item.isCompleteDocument:
                if current_block:
                    break
                block_items.clear()

        # Get results for completed items in this block
        block_results = self.get_results_for_each_item(block_items, user)

        if not return_statistics:
            return (next_item, block_items, block_results)

        # Collect statistics
        completed_items_in_block = len(
            [res for res in block_results if res is not None])
        completed_blocks = DirectAssessmentDocumentResult.objects.filter(
            task=self,
            item__isCompleteDocument=True,
            completed=True,
            createdBy=user).count()
        total_blocks = self.items.filter(isCompleteDocument=True).count()

        print(
            'Completed {}/{} documents, {}/{} items in the current document, completed {} items in total' \
            .format(completed_blocks, total_blocks, completed_items_in_block, len(block_items), completed_items)
        )

        return (
            next_item,  # the first unannotated item for the user
            completed_items,  # the number of completed items in the task
            completed_blocks,  # the number of completed documents in the task
            completed_items_in_block,  # the number of completed items in the current document
            block_items,  # all items from the current document
            block_results,  # all score results from the current document
            total_blocks,  # the total number of documents in the task
        )

    def get_results_for_each_item(self, block_items, user):
        """Returns the latest result object for each item or None."""
        # TODO: optimize, this possibly makes too many individual queries
        block_results = []

        for item in block_items:
            result = DirectAssessmentDocumentResult.objects.filter(
                item__id=item.id,
                completed=True,
                createdBy=user,  # TODO: is passing user as an argument needed?
                task=self).order_by('item__id', 'dateModified').first()
            block_results.append(result)

        # Sanity checks for items and results
        if len(block_items) != len(block_results):
            print('Warning: incorrect number of retrieved results!')
        for item, result in zip(block_items, block_results):
            if result and item.id != result.item.id:
                print('Warning: incorrect order of items and results!')

        return block_results

    @classmethod
    def get_task_for_user(cls, user):
        """Returns the newest active task assigned to user with work left."""
        for active_task in cls.objects.filter(assignedTo=user,
                                              activated=True,
                                              completed=False).order_by('-id'):
            next_item = active_task.next_item_for_user(user)
            if next_item is not None:
                return active_task

        return None

    @classmethod
    def get_next_free_task_for_language(cls, code, campaign=None, user=None):
        """
        Returns the next free task for the given target language code.

        A task is considered free when fewer users are assigned to it than
        requiredAnnotations and the given user is not already among them.
        Returns None when no such task exists (or no user was given).
        """
        active_tasks = cls.objects.filter(
            activated=True,
            completed=False,
            items__metadata__market__targetLanguageCode=code)

        if campaign:
            active_tasks = active_tasks.filter(campaign=campaign)

        for active_task in active_tasks.order_by('id'):
            active_users = active_task.assignedTo.count()
            if active_users < active_task.requiredAnnotations:
                if user and user not in active_task.assignedTo.all():
                    return active_task

        return None
        # BUGFIX: two earlier, unreachable alternative implementations that
        # followed this return statement have been removed as dead code.

    @classmethod
    def get_next_free_task_for_language_and_campaign(cls, code, campaign):
        """Convenience wrapper around get_next_free_task_for_language."""
        return cls.get_next_free_task_for_language(code, campaign)

    @classmethod
    def import_from_json(cls, campaign, batch_user, batch_data, max_count):
        """
        Creates new DirectAssessmentDocumentTask instances based on JSON input.

        Accepts either a plain JSON file or a ZIP archive containing JSON
        files. Stops early after max_count tasks when max_count > 0. Tasks
        whose item count (excluding complete-document markers) is not
        exactly 100 are skipped with a warning.
        """
        batch_meta = batch_data.metadata
        batch_name = batch_data.dataFile.name
        batch_file = batch_data.dataFile
        batch_json = None

        if batch_name.endswith('.zip'):
            if not is_zipfile(batch_file):
                _msg = 'Batch {0} not a valid ZIP archive'.format(batch_name)
                LOGGER.warning(_msg)
                return

            batch_zip = ZipFile(batch_file)
            batch_json_files = [
                x for x in batch_zip.namelist() if x.endswith('.json')
            ]
            # TODO: implement proper support for multiple json files in archive.
            for batch_json_file in batch_json_files:
                batch_content = batch_zip.read(batch_json_file).decode('utf-8')
                # BUGFIX: the 'encoding' keyword of json.loads() was removed
                # in Python 3.9; the content is already a decoded str.
                batch_json = loads(batch_content)

        else:
            batch_json = loads(str(batch_file.read(), encoding="utf-8"))

        if batch_json is None:
            # E.g. a ZIP archive without any .json member.
            _msg = 'Batch {0} contains no JSON data'.format(batch_name)
            LOGGER.warning(_msg)
            return

        from datetime import datetime
        t1 = datetime.now()

        current_count = 0
        max_length_id = 0
        max_length_text = 0
        for batch_task in batch_json:
            if max_count > 0 and current_count >= max_count:
                _msg = 'Stopping after max_count={0} iterations'.format(
                    max_count)
                LOGGER.info(_msg)

                t2 = datetime.now()
                print(t2 - t1)
                return

            print(batch_name, batch_task['task']['batchNo'])

            doc_items = 0
            new_items = []
            for item in batch_task['items']:
                # Track maximum field lengths for diagnostics (logged below).
                current_length_id = len(item['targetID'])
                current_length_text = len(item['targetText'])

                if current_length_id > max_length_id:
                    print(current_length_id, item['targetID'])
                    max_length_id = current_length_id

                if current_length_text > max_length_text:
                    print(current_length_text,
                          item['targetText'].encode('utf-8'))
                    max_length_text = current_length_text

                new_item = TextPairWithContext(
                    sourceID=item['sourceID'],
                    sourceText=item['sourceText'],
                    sourceContextLeft=item.get('sourceContextLeft', None),
                    sourceContextRight=item.get('sourceContextRight', None),
                    targetID=item['targetID'],
                    targetText=item['targetText'],
                    targetContextLeft=item.get('targetContextLeft', None),
                    targetContextRight=item.get('targetContextRight', None),
                    createdBy=batch_user,
                    itemID=item['itemID'],
                    itemType=item['itemType'],
                    documentID=item['documentID'],
                    isCompleteDocument=item['isCompleteDocument'],
                )
                new_items.append(new_item)
                if item['isCompleteDocument']:
                    doc_items += 1

            # Complete-document markers do not count towards the quota.
            if (len(new_items) - doc_items) != 100:
                _msg = 'Expected 100 items for task but found {0}'.format(
                    len(new_items) - doc_items)
                LOGGER.warning(_msg)
                continue

            current_count += 1

            for new_item in new_items:
                new_item.metadata = batch_meta
                new_item.save()

            new_task = DirectAssessmentDocumentTask(
                campaign=campaign,
                requiredAnnotations=batch_task['task']['requiredAnnotations'],
                batchNo=batch_task['task']['batchNo'],
                batchData=batch_data,
                createdBy=batch_user,
            )
            new_task.save()

            new_task.items.add(*new_items)
            new_task.save()

            _msg = 'Success processing batch {0}, task {1}'.format(
                str(batch_data), batch_task['task']['batchNo'])
            LOGGER.info(_msg)

        _msg = 'Max length ID={0}, text={1}'.format(max_length_id,
                                                    max_length_text)
        LOGGER.info(_msg)

        t2 = datetime.now()
        print(t2 - t1)

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current DA task, checking campaign and items exist.
        """
        if not hasattr(self, 'campaign') or not self.campaign.is_valid():
            return False

        if not hasattr(self, 'items'):
            return False

        # BUGFIX: a related manager is not iterable; .all() is required.
        for item in self.items.all():
            if not item.is_valid():
                return False

        return True

    def _generate_str_name(self):
        """Builds the display name '<class>.<campaign>[<id>]'."""
        return '{0}.{1}[{2}]'.format(self.__class__.__name__, self.campaign,
                                     self.id)
Пример #12
0
class Campaign(BaseMetadata):
    """
    Models an evaluation campaign.
    """

    # Human-readable campaign name; also used as the display name.
    campaignName = models.CharField(
        max_length=MAX_CAMPAIGNNAME_LENGTH,
        verbose_name=_('Campaign name'),
        help_text=_(
            f('(max. {value} characters)', value=MAX_CAMPAIGNNAME_LENGTH)),
    )

    teams = models.ManyToManyField(
        CampaignTeam,
        blank=True,
        related_name='%(app_label)s_%(class)s_teams',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Teams'),
    )

    batches = models.ManyToManyField(
        CampaignData,
        blank=True,
        related_name='%(app_label)s_%(class)s_batches',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Batches'),
    )

    packageFile = models.FileField(
        blank=True,
        null=True,
        verbose_name=_('Package file'),
        upload_to='Packages',
        validators=[_validate_package_file],
    )

    def _generate_str_name(self):
        """Returns the campaign name as the display name."""
        return self.campaignName

    @classmethod
    def get_campaign_or_raise(cls, campaign_name):
        """
        Get campaign with name campaign_name from database.

        Returns a Campaign instance if it exists, otherwise raises
        LookupError.
        """
        # Use cls instead of hard-coding Campaign so the classmethod also
        # behaves correctly for potential subclasses.
        _obj = cls.objects.filter(campaignName=campaign_name)
        if not _obj.exists():
            _msg = 'Failure to identify campaign {0}'.format(campaign_name)
            raise LookupError(_msg)

        return _obj.first()  # if multiple campaigns, return first

    def get_campaign_type(self) -> str:
        """
        Get campaign type based on evaldata_{cls_name}_campaign QuerySet.

        For now, we assume that campaigns can only have a single type.

        We use the following check to identify the campaign's type:
        c.evaldata_directassessmentcontexttask_campaign.exists()

        Returns the name (str) of the matching annotation task class,
        which is a sub class of BaseAnnotationTask.
        """
        for cls_name in AnnotationTaskRegistry.get_types():
            qs_name = cls_name.lower()
            qs_attr = 'evaldata_{0}_campaign'.format(qs_name)
            qs_obj = getattr(self, qs_attr, None)
            if qs_obj and qs_obj.exists():
                return cls_name

        _msg = 'Unknown type for campaign {0}'.format(self.campaignName)
        raise LookupError(_msg)  # This should never happen, thus raise!
Пример #13
0
class CampaignTeam(BaseMetadata):
    """
    Models a campaign team.
    """

    # Human-readable team name.
    teamName = models.CharField(
        max_length=MAX_TEAMNAME_LENGTH,
        verbose_name=_('Team name'),
        help_text=_(f('(max. {value} characters)', value=MAX_TEAMNAME_LENGTH)),
    )

    # Team owner; restricted to staff accounts.
    owner = models.ForeignKey(
        User,
        limit_choices_to={'is_staff': True},
        on_delete=models.PROTECT,
        related_name='%(app_label)s_%(class)s_owner',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Team owner'),
        help_text=_('(must be staff member)'),
    )

    members = models.ManyToManyField(
        User,
        related_name='%(app_label)s_%(class)s_members',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Team members'),
    )

    requiredAnnotations = models.PositiveSmallIntegerField(
        verbose_name=_('Required annotations'),
        help_text=_(
            f('(value in range=[1,{value}])', value=MAX_SMALLINTEGER_VALUE)),
    )

    requiredHours = models.PositiveSmallIntegerField(
        verbose_name=_('Required hours'),
        help_text=_(
            f('(value in range=[1,{value}])', value=MAX_SMALLINTEGER_VALUE)),
    )

    # pylint: disable=C0111,R0903
    class Meta:
        ordering = ['_str_name']
        verbose_name = 'Team'
        verbose_name_plural = 'Teams'

    def _generate_str_name(self):
        """Builds the display name from team name and owner."""
        return f'{self.teamName} ({self.owner})'

    def is_valid(self):
        """
        Validates the current CampaignTeam instance.

        Runs full model validation and maps any ValidationError to False.
        """
        try:
            self.full_clean()
        except ValidationError:
            return False
        return True

    # pylint: disable=C0103,E1101
    def teamMembers(self):
        """
        Proxy method returning members count.
        """
        return self.members.count()

    teamMembers.short_description = '# of team members'

    # TODO: Connect to actual data, producing correct completion status.
    # pylint: disable=no-self-use
    def completionStatus(self):
        """
        Proxy method return completion status in percent.

        This is defined to be the minimum of:
        - # of completed annotations / # required annotations; and
        - # of completed hours / # required hours.
        """
        return '0%'

    completionStatus.short_description = 'Completion status'
Пример #14
0
class Market(BaseMetadata):
    """
    Models a language/locale market.
    """
    ###
    # Each market has a unique ID composed of source, target language codes
    # and application domain name. This also acts as primary lookup key.
    #
    # By assumption, source language content has been produced natively.
    # For monolingual content, source and target codes are identical.
    ###
    # Computed in save() as '<source>_<target>_<domain>'; max_length covers
    # two language codes, one domain name and the two '_' separators.
    marketID = models.CharField(
        max_length=2 * MAX_LANGUAGECODE_LENGTH + MAX_DOMAINNAME_LENGTH + 2,
        editable=False,
        unique=True
    )

    # ISO-style code of the source language (first marketID component).
    sourceLanguageCode = models.CharField(
      max_length=MAX_LANGUAGECODE_LENGTH,
      verbose_name=_('Source language'),
      help_text=_(f('(max. {value} characters)',
        value=MAX_LANGUAGECODE_LENGTH))
    )

    # ISO-style code of the target language (second marketID component).
    targetLanguageCode = models.CharField(
      max_length=MAX_LANGUAGECODE_LENGTH,
      verbose_name=_('Target language'),
      help_text=_(f('(max. {value} characters)',
        value=MAX_LANGUAGECODE_LENGTH))
    )

    # Application domain name (third marketID component).
    domainName = models.CharField(
      max_length=MAX_DOMAINNAME_LENGTH,
      verbose_name=_('Domain name'),
      help_text=_(f('(max. {value} characters)',
        value=MAX_DOMAINNAME_LENGTH))
    )

    def clean_fields(self, exclude=None):
        """
        Verifies that desired marketID is still available.

        Raises ValidationError when a *different* Market instance already
        uses the marketID derived from the current field values.
        """
        _new_marketID = '{0}_{1}_{2}'.format(
            self.sourceLanguageCode,
            self.targetLanguageCode,
            self.domainName
        )

        # BUGFIX: exclude the current instance so that re-validating an
        # already saved Market does not collide with its own row.
        # (pk=None on unsaved instances excludes nothing.)
        _market_instance = Market.objects.filter(
            marketID=_new_marketID).exclude(pk=self.pk)
        if _market_instance.exists():
            raise ValidationError(
              _(f('Market with identical marketID ("{mID}") already exists.',
                mID=_new_marketID))
            )

        super(Market, self).clean_fields(exclude)

    def save(self, *args, **kwargs):
        """Recomputes marketID from its components, then saves the model."""
        self.marketID = '{0}_{1}_{2}'.format(
            self.sourceLanguageCode,
            self.targetLanguageCode,
            self.domainName
        )
        super(Market, self).save(*args, **kwargs)

    # TODO: what is this used for? Candidate for deprecation/removal.
    #
    # pylint: disable=E1101
    def my_is_valid(self):
        """
        Validates the current Market instance, checking marketID uniqueness.
        """
        expected_id = '{0}_{1}_{2}'.format(
            self.sourceLanguageCode,
            self.targetLanguageCode,
            self.domainName
        )
        matches = Market.objects.filter(marketID=expected_id)

        if getattr(self, 'marketID', '') == '':
            # No marketID assigned yet: any existing match is a collision.
            if matches.exists():
                return False
        else:
            # NOTE(review): .get() raises if no or multiple rows match —
            # presumably acceptable given the unique constraint; confirm.
            existing = matches.get()
            if existing is not None and self.id != existing.id:
                return False

        return super(Market, self).is_valid()

    def _generate_str_name(self):
        """Returns the computed marketID as the display name."""
        return self.marketID