Example #1
class RegexTaggerGroupSerializer(serializers.ModelSerializer, ProjectResourceUrlSerializer):
    description = serializers.CharField()
    url = serializers.SerializerMethodField()
    task = TaskSerializer(read_only=True)
    author = UserSerializer(read_only=True)
    tagger_info = serializers.SerializerMethodField(read_only=True)  # Helper field for displaying tagger info in a friendly manner.


    # Ensure that only Regex Taggers inside the same Project are returned.
    def get_fields(self, *args, **kwargs):
        fields = super().get_fields(*args, **kwargs)
        project_pk = self.context["view"].kwargs["project_pk"]
        fields['regex_taggers'].queryset = RegexTagger.objects.filter(project__pk=project_pk)
        return fields


    def get_tagger_info(self, value: RegexTaggerGroup):
        # Filter by the group's project, not by the group's own pk.
        queryset = value.regex_taggers.filter(project__pk=value.project.pk)
        serializer = RegexTaggerSerializer(queryset, many=True, context={"request": self.context["request"]})
        return serializer.data


    class Meta:
        model = RegexTaggerGroup
        # regex_taggers is the field which to use to manipulate the related RegexTagger model objects.
        fields = ('id', 'url', 'regex_taggers', 'author', 'task', 'description', 'tagger_info')
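
The get_fields() override above only works when the serializer receives the resolving view in its context, since project_pk is read from the view's URL kwargs. A minimal usage sketch, assuming a DRF view whose URL carries a project_pk kwarg (the tagger IDs are hypothetical):

serializer = RegexTaggerGroupSerializer(
    data={"description": "toponyms", "regex_taggers": [1, 2]},
    context={"view": view, "request": request},  # view.kwargs must contain "project_pk"
)
serializer.is_valid(raise_exception=True)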
Example #2
class LexiconSerializer(FieldParseSerializer, serializers.ModelSerializer):
    author = UserSerializer(read_only=True)
    positives_used = StringListField(
        help_text='Positive phrases for the model as a list of strings. Default: EMPTY',
        required=False)
    negatives_used = StringListField(
        help_text='Negative phrases for the model as a list of strings. Default: EMPTY',
        required=False)
    positives_unused = StringListField(
        help_text='Positive phrases in the lexicon that are not used in mining, as a list of strings. Default: EMPTY',
        required=False)
    negatives_unused = StringListField(
        help_text='Negative phrases left out from the lexicon and not used in mining, as a list of strings. Default: EMPTY',
        required=False)

    class Meta:
        model = Lexicon
        fields = ('id', 'author', 'description', 'positives_used',
                  'negatives_used', 'positives_unused', 'negatives_unused')
        read_only_fields = ('project', 'author')
        fields_to_parse = ('positives_used', 'negatives_used',
                           'positives_unused', 'negatives_unused')
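
Several examples below mix in FieldParseSerializer and declare Meta.fields_to_parse. The mixin's implementation is not shown here; a plausible minimal sketch of the contract it implies (JSON-decode the listed fields on output) could look like this. The names and behavior are assumptions, not the actual implementation:

import json
from rest_framework import serializers


class FieldParseSerializer(serializers.Serializer):
    """Hypothetical sketch: json.loads() every Meta.fields_to_parse entry on output."""

    def to_representation(self, instance):
        data = super().to_representation(instance)
        meta = getattr(self, "Meta", None)
        for field_name in getattr(meta, "fields_to_parse", ()):
            value = data.get(field_name)
            if isinstance(value, str):
                try:
                    data[field_name] = json.loads(value)
                except ValueError:
                    pass  # leave non-JSON strings untouched
        return data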
Example #3
class RegexTaggerSerializer(FieldParseSerializer, serializers.ModelSerializer, ProjectResourceUrlSerializer):
    description = serializers.CharField()
    author = UserSerializer(read_only=True)
    lexicon = serializers.ListField(child=serializers.CharField(required=True), validators=[validate_patterns], help_text="Words/phrases/regex patterns to match.")
    counter_lexicon = serializers.ListField(child=serializers.CharField(required=False), default=[], validators=[validate_patterns], help_text="Words/phrases/regex patterns to nullify lexicon matches. Default = [].")

    operator = serializers.ChoiceField(default=choices.DEFAULT_OPERATOR, choices=choices.OPERATOR_CHOICES, required=False, help_text=f"Logical operation between lexicon entries. Choices = {choices.OPERATOR_CHOICES}. Default = {choices.DEFAULT_OPERATOR}.")
    match_type = serializers.ChoiceField(default=choices.DEFAULT_MATCH_TYPE, choices=choices.MATCH_TYPE_CHOICES, required=False, help_text=f"How to match lexicon entries to text. Choices = {choices.SUPPORTED_MATCH_TYPES}. Default = {choices.DEFAULT_MATCH_TYPE}.")
    required_words = serializers.FloatField(default=choices.DEFAULT_REQUIRED_WORDS, required=False, help_text=f"Required ratio of lexicon entries matched in text for returning a positive result. NB! Only takes effect if operator=='and'. Default = {choices.DEFAULT_REQUIRED_WORDS}.")
    phrase_slop = serializers.IntegerField(default=choices.DEFAULT_PHRASE_SLOP, required=False, help_text=f"Number of non-lexicon words allowed between the words of one lexicon entry. Default = {choices.DEFAULT_PHRASE_SLOP}.")
    counter_slop = serializers.IntegerField(default=choices.DEFAULT_COUNTER_SLOP, required=False, help_text=f"Number of words allowed between lexicon entries and counter lexicon entries for the latter to take effect. Default = {choices.DEFAULT_COUNTER_SLOP}.")
    n_allowed_edits = serializers.IntegerField(default=choices.DEFAULT_N_ALLOWED_EDITS, required=False, help_text=f"Number of allowed character changes between lexicon entries and candidate matches in text. Default = {choices.DEFAULT_N_ALLOWED_EDITS}.")
    return_fuzzy_match = serializers.BooleanField(default=choices.DEFAULT_RETURN_FUZZY_MATCH, required=False, help_text=f"Return the fuzzy match (as opposed to the exact lexicon entry)? Default = {choices.DEFAULT_RETURN_FUZZY_MATCH}.")
    ignore_case = serializers.BooleanField(default=choices.DEFAULT_IGNORE_CASE, required=False, help_text=f"Ignore case while matching? Default = {choices.DEFAULT_IGNORE_CASE}.")
    ignore_punctuation = serializers.BooleanField(default=choices.DEFAULT_IGNORE_PUNCTUATION, required=False, help_text=f"If disabled, end-of-sentence characters between lexicon entry words and/or counter lexicon entries nullify the match. Default = {choices.DEFAULT_IGNORE_PUNCTUATION}.")
    url = serializers.SerializerMethodField()
    tagger_groups = serializers.SerializerMethodField(read_only=True)
    task = TaskSerializer(read_only=True)


    def get_tagger_groups(self, value: RegexTagger):
        tgs = RegexTaggerGroup.objects.filter(regex_taggers__project_id=value.project.pk, regex_taggers__id=value.pk)
        descriptions = [{"tagger_group_id": tagger.pk, "description": tagger.description} for tagger in tgs]
        return descriptions


    class Meta:
        model = RegexTagger
        fields = ('id', 'url', 'author',
                  'description', 'lexicon', 'counter_lexicon', 'operator', 'match_type', 'required_words',
                  'phrase_slop', 'counter_slop', 'n_allowed_edits', 'return_fuzzy_match', 'ignore_case',
                  'ignore_punctuation', 'tagger_groups', 'task')
        fields_to_parse = ('lexicon', 'counter_lexicon')
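
A hypothetical create payload accepted by the serializer above; every omitted optional field falls back to its choices.* default:

payload = {
    "description": "weapon mentions",
    "lexicon": ["rifle", "guns?"],
    "counter_lexicon": ["water gun"],
    "operator": "or",
}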
Example #4
class MLPWorkerSerializer(serializers.ModelSerializer, IndicesSerializerMixin,
                          FieldsValidationSerializerMixin):
    author = UserSerializer(read_only=True)
    description = serializers.CharField()
    task = TaskSerializer(read_only=True, required=False)
    url = serializers.SerializerMethodField()
    query = serializers.JSONField(help_text='Query in JSON format',
                                  required=False,
                                  default=json.dumps(EMPTY_QUERY))
    fields = serializers.ListField(
        child=serializers.CharField(),
        required=True,
        allow_empty=False,
        help_text="Which fields to apply the MLP on.")
    analyzers = serializers.MultipleChoiceField(
        choices=list(SUPPORTED_ANALYZERS), default=["all"])
    es_scroll_size = serializers.IntegerField(
        help_text="Scroll size for Elasticsearch (Default: 100)",
        default=100,
        required=False)
    es_timeout = serializers.IntegerField(
        help_text="Scroll timeout in minutes for Elasticsearch (Default: 60)",
        default=60,
        required=False)

    class Meta:
        model = MLPWorker
        fields = ("id", "url", "author", "indices", "description", "task",
                  "query", "fields", "analyzers", "es_scroll_size",
                  "es_timeout")

    def get_url(self, obj):
        default_version = REST_FRAMEWORK.get("DEFAULT_VERSION")
        index = reverse(f"{default_version}:mlp_index-detail",
                        kwargs={
                            "project_pk": obj.project.pk,
                            "pk": obj.pk
                        })
        if "request" in self.context:
            request = self.context["request"]
            url = request.build_absolute_uri(index)
            return url
        else:
            return None

    def to_representation(self, instance: MLPWorker):
        data = super().to_representation(instance)
        data["fields"] = json.loads(instance.fields)
        data["query"] = json.loads(instance.query)
        data["analyzers"] = json.loads(instance.analyzers)
        return data
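
The to_representation() override assumes the model persists fields, query and analyzers as JSON-encoded strings, so output is just a round-trip decode (pure-Python illustration; the field value is hypothetical):

import json

stored = json.dumps(["text_mlp.lemmas"])          # as saved on the model
assert json.loads(stored) == ["text_mlp.lemmas"]  # as returned to the client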
Example #5
class EmbeddingSerializer(FieldParseSerializer,
                          serializers.HyperlinkedModelSerializer,
                          ProjectResourceUrlSerializer,
                          IndicesSerializerMixin):
    author = UserSerializer(read_only=True)
    task = TaskSerializer(read_only=True)
    fields = serializers.ListField(
        child=serializers.CharField(),
        help_text='Fields used to build the model.')
    snowball_language = serializers.ChoiceField(
        choices=get_snowball_choices(),
        default=DEFAULT_SNOWBALL_LANGUAGE,
        help_text=
        f'Uses Snowball stemmer with specified language to normalize the texts. Default: {DEFAULT_SNOWBALL_LANGUAGE}'
    )
    max_documents = serializers.IntegerField(
        default=choices.DEFAULT_MAX_DOCUMENTS)
    num_dimensions = serializers.IntegerField(
        default=choices.DEFAULT_NUM_DIMENSIONS,
        help_text=f'Default: {choices.DEFAULT_NUM_DIMENSIONS}')
    min_freq = serializers.IntegerField(
        default=choices.DEFAULT_MIN_FREQ,
        help_text=f'Default: {choices.DEFAULT_MIN_FREQ}')
    window_size = serializers.IntegerField(
        min_value=1,
        default=5,
        help_text=
        "Maximum distance between the current and predicted word within a sentence."
    )
    num_epochs = serializers.IntegerField(
        min_value=1,
        default=5,
        help_text="Number of iterations (epochs) over the corpus.")
    use_phraser = serializers.BooleanField(default=True,
                                           help_text='Phrase input texts.')
    query = serializers.JSONField(help_text='Query in JSON format',
                                  required=False)
    url = serializers.SerializerMethodField()
    embedding_type = serializers.ChoiceField(
        choices=choices.EMBEDDING_CHOICES,
        default=choices.EMBEDDING_CHOICES[0][0])

    class Meta:
        model = Embedding
        fields = ('id', 'url', 'author', 'description', 'indices', 'fields',
                  'use_phraser', 'embedding_type', 'snowball_language',
                  'query', 'num_dimensions', 'max_documents', 'min_freq',
                  'window_size', 'num_epochs', 'vocab_size', 'task')
        read_only_fields = ('vocab_size', )
        fields_to_parse = ('fields', )
Example #6
class ToolkitTaskSerializer(IndicesSerializerMixin,
                            FieldsValidationSerializerMixin):
    description = serializers.CharField(max_length=100,
                                        help_text=DESCRIPTION_HELPTEXT)
    author = UserSerializer(read_only=True)
    fields = serializers.ListField(child=serializers.CharField(),
                                   required=True,
                                   allow_empty=False,
                                   help_text=FIELDS_HELPTEXT)
    query = serializers.JSONField(required=False,
                                  help_text=QUERY_HELPTEXT,
                                  default=json.dumps(EMPTY_QUERY))

    bulk_size = serializers.IntegerField(default=100,
                                         min_value=1,
                                         max_value=ES_BULK_SIZE_MAX,
                                         help_text=BULK_SIZE_HELPTEXT)
    es_timeout = serializers.IntegerField(default=10,
                                          min_value=1,
                                          max_value=ES_TIMEOUT_MAX,
                                          help_text=ES_TIMEOUT_HELPTEXT)
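
ToolkitTaskSerializer is a reusable base rather than a ModelSerializer: concrete task serializers inherit the shared description/fields/query/bulk_size/es_timeout fields. A hypothetical subclass sketch:

class SummarizeTaskSerializer(ToolkitTaskSerializer):
    # Hypothetical extra field; only illustrates extending the shared base.
    ratio = serializers.FloatField(default=0.2, help_text="Summary length ratio.")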
Example #7
class ApplyTaggersSerializer(FieldParseSerializer, IndicesSerializerMixin,
                             ElasticScrollMixIn):
    author = UserSerializer(read_only=True)
    description = serializers.CharField(
        required=True,
        help_text="Text for distinguishing this task from others.")
    new_fact_name = serializers.CharField(
        required=True, help_text="Used as fact name when applying the tagger.")
    fields = serializers.ListField(
        required=True,
        child=serializers.CharField(),
        help_text="Which fields to extract the text from.")
    query = serializers.JSONField(
        help_text='Filter the documents which to scroll and apply to.',
        default=EMPTY_QUERY)
    lemmatize = serializers.BooleanField(
        default=False,
        help_text=
        'Use MLP lemmatizer if available. Use only if training data was lemmatized. Default: False'
    )
    # num_tags = serializers.IntegerField(read_only=True)
    taggers = serializers.ListField(help_text='List of Tagger IDs to be used.',
                                    child=serializers.IntegerField(),
                                    default=[])

    def validate_taggers(self, taggers):
        invalid_ids = []
        for tagger_id in taggers:
            try:
                Tagger.objects.get(pk=tagger_id)
            except ObjectDoesNotExist:
                invalid_ids.append(tagger_id)
        if invalid_ids:
            raise serializers.ValidationError(
                f"Taggers with the following IDs do not exist: {invalid_ids}")
        return taggers
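
The loop above issues one database query per ID. An equivalent single-query variant (a sketch with the same semantics, not the code from the repository):

def validate_taggers(self, taggers):
    existing = set(Tagger.objects.filter(pk__in=taggers).values_list("pk", flat=True))
    invalid_ids = [pk for pk in taggers if pk not in existing]
    if invalid_ids:
        raise serializers.ValidationError(
            f"Taggers with the following IDs do not exist: {invalid_ids}")
    return taggers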
Example #8
class CRFExtractorSerializer(FieldParseSerializer, serializers.ModelSerializer,
                             IndicesSerializerMixin,
                             ProjectResourceUrlSerializer):
    description = serializers.CharField(help_text=DESCRIPTION_HELPTEXT)
    author = UserSerializer(read_only=True)
    query = serializers.JSONField(help_text=QUERY_HELPTEXT,
                                  default=json.dumps(EMPTY_QUERY),
                                  required=False)

    mlp_field = serializers.CharField(
        help_text='MLP field used to build the model.')

    labels = serializers.JSONField(
        default=["GPE", "ORG", "PER", "LOC"],
        help_text="List of labels used to train the extraction model.")

    c_values = serializers.JSONField(
        default=[0.001, 0.1, 0.5],
        help_text="List of C-values to test during training. Best will be used."
    )

    num_iter = serializers.IntegerField(
        default=100, help_text="Number of iterations used in training.")
    test_size = serializers.FloatField(
        default=0.3,
        help_text="Proportion of documents reserved for testing the model.")

    bias = serializers.BooleanField(
        default=True,
        help_text="Capture the proportion of a given label in the training set."
    )
    window_size = serializers.IntegerField(
        default=2,
        help_text=
        "Number of words before and after the observed word to include in the analysis.",
    )
    suffix_len = serializers.JSONField(
        default=json.dumps((2, 2)),
        help_text=
        "Number of characters (min, max) used for word suffixes as features.")
    feature_fields = serializers.MultipleChoiceField(
        choices=FEATURE_FIELDS_CHOICES,
        default=DEFAULT_LAYERS,
        help_text=
        "Layers (MLP subfields) used as features for the observed word.")
    context_feature_fields = serializers.MultipleChoiceField(
        choices=FEATURE_FIELDS_CHOICES,
        default=DEFAULT_LAYERS,
        help_text=
        "Layers (MLP subfields) used as features for the context of the observed word."
    )
    feature_extractors = serializers.MultipleChoiceField(
        choices=FEATURE_EXTRACTOR_CHOICES,
        default=DEFAULT_EXTRACTORS,
        help_text=
        "Feature extractors used for the observed word and its context.")
    context_feature_extractors = serializers.MultipleChoiceField(
        choices=FEATURE_EXTRACTOR_CHOICES,
        default=DEFAULT_EXTRACTORS,
        help_text=
        "Feature extractors used for the context of the observed word.")
    embedding = ProjectFilteredPrimaryKeyRelatedField(
        queryset=Embedding.objects,
        many=False,
        read_only=False,
        allow_null=True,
        default=None,
        help_text=
        "Embedding to use for finding similar words for the observed word and its context."
    )
    task = TaskSerializer(read_only=True)
    url = serializers.SerializerMethodField()

    class Meta:
        model = CRFExtractor
        fields = ('id', 'url', 'author', 'description', 'query', 'indices',
                  'mlp_field', 'window_size', 'test_size', 'num_iter',
                  'best_c1', 'best_c2', 'bias', 'suffix_len', 'labels',
                  'feature_fields', 'context_feature_fields',
                  'feature_extractors', 'context_feature_extractors',
                  'embedding', 'task', 'precision', 'recall', 'f1_score',
                  'c_values')
        read_only_fields = ('precision', 'task', 'recall', 'f1_score',
                            'best_c1', 'best_c2')
        fields_to_parse = ('labels', 'suffix_len', 'c_values')
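
Note that the suffix_len default above is serialized with json.dumps((2, 2)); JSON has no tuple type, so the value always parses back as a list:

import json

assert json.dumps((2, 2)) == "[2, 2]"
assert json.loads(json.dumps((2, 2))) == [2, 2]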
Example #9
class ClusteringSerializer(FieldParseSerializer, serializers.ModelSerializer,
                           IndicesSerializerMixin):
    author = UserSerializer(read_only=True)
    description = serializers.CharField()
    query = serializers.CharField(help_text='Query in JSON format',
                                  default=json.dumps(EMPTY_QUERY))
    num_cluster = serializers.IntegerField(
        min_value=1,
        max_value=1000,
        default=10,
        help_text='Number of document clusters to be formed.')
    clustering_algorithm = serializers.ChoiceField(
        choices=CLUSTERING_ALGORITHMS,
        default=CLUSTERING_ALGORITHMS[0][0],
        required=False)
    fields = serializers.ListField(
        required=True, help_text='Fields that are used for clustering.')
    display_fields = serializers.ListField(
        default=[],
        allow_empty=True,
        help_text=
        'Fields that are used for displaying cluster content. If not specified, it is the same as "fields".'
    )
    vectorizer = serializers.ChoiceField(choices=VECTORIZERS,
                                         default=VECTORIZERS[0][0])
    num_dims = serializers.IntegerField(min_value=1,
                                        max_value=10000,
                                        default=1000,
                                        help_text='Size of the dictionary.')
    use_lsi = serializers.BooleanField(
        default=False,
        help_text=
        'If enabled, transforms the document-term matrix into a lower-dimensional space using LSI. May or may not improve clustering results.'
    )
    num_topics = serializers.IntegerField(
        min_value=1,
        max_value=1000,
        default=50,
        help_text=
        'Only used if use_lsi is enabled. The number of dimensions in the lower-dimensional space.'
    )

    stop_words = serializers.ListField(
        default=[],
        allow_empty=True,
        help_text=
        'List of custom stop words to be removed from documents before clustering.'
    )
    document_limit = serializers.IntegerField(
        default=100,
        min_value=1,
        max_value=10000,
        help_text='Number of documents retrieved from indices.')
    ignored_ids = serializers.ListField(
        default=[],
        help_text=
        "List of Elasticsearch document ids to ignore from the clustering process."
    )
    significant_words_filter = serializers.CharField(
        help_text='Regex used to filter out unwanted words (e.g. numbers).', default="[0-9]+")

    url = serializers.SerializerMethodField()
    task = TaskSerializer(read_only=True)

    def get_url(self, obj):
        default_version = REST_FRAMEWORK.get("DEFAULT_VERSION")
        if default_version == "v1":
            view_name = f"{default_version}:clustering-detail"
        elif default_version == "v2":
            view_name = f"{default_version}:topic_analyzer-detail"
        else:
            # Unknown API version: no detail route to resolve.
            return None
        index = reverse(view_name,
                        kwargs={
                            "project_pk": obj.project.pk,
                            "pk": obj.pk
                        })
        if "request" in self.context:
            request = self.context["request"]
            url = request.build_absolute_uri(index)
            return url
        else:
            return None

    def validate_significant_words_filter(self, regex):
        try:
            re.compile(regex)
        except re.error:
            raise serializers.ValidationError(
                "Given string is not a valid regex.")
        return regex

    class Meta:
        model = ClusteringResult
        fields = [
            "id", "url", "description", "author", "query", "indices",
            "num_cluster", "clustering_algorithm", "vectorizer", "num_dims",
            "use_lsi", "num_topics", "significant_words_filter",
            "display_fields", "stop_words", "ignored_ids", "fields",
            "embedding", "document_limit", "task"
        ]
        fields_to_parse = ("fields", "query", "display_fields", "ignored_ids",
                           "stop_words")
Example #10
class ApplyLangOnIndicesSerializer(serializers.ModelSerializer,
                                   IndicesSerializerMixin,
                                   FieldsValidationSerializerMixin):
    description = serializers.CharField()
    author = UserSerializer(read_only=True)
    task = TaskSerializer(read_only=True, required=False)
    url = serializers.SerializerMethodField()
    query = serializers.JSONField(help_text='Query in JSON format',
                                  required=False,
                                  default=json.dumps(EMPTY_QUERY))
    field = serializers.CharField(required=True, allow_blank=False)

    def validate_field(self, value: str):
        """
        Check that the selected field is present in the project and raise a
        ValidationError if it is not. If no "fields" field is declared on the
        serializer, no validation runs; to write custom validation for
        serializers using FieldParseSerializer, simply override validate_fields
        in the project serializer.
        """
        project_id = self.context['view'].kwargs['project_pk']
        project_obj = Project.objects.get(id=project_id)
        project_fields = set(project_obj.get_elastic_fields(path_list=True))
        if not value or value not in project_fields:
            raise serializers.ValidationError(
                f'Entered field not in current project fields: {project_fields}'
            )
        return value

    def validate_query(self, query: Union[str, dict]):
        """
        Check if the query is formatted correctly and store it as JSON string,
        if it is passed as a JSON dict.
        """
        if not isinstance(query, dict):
            try:
                query = json.loads(query)
            except ValueError:
                raise serializers.ValidationError(
                    f"Incorrect query: '{query}'. Query should be formatted as a JSON dict or a JSON string."
                )
            # If the loaded query is not a JSON dict, raise ValidationError
            if not isinstance(query, dict):
                raise serializers.ValidationError(
                    f"Incorrect query: '{query}'. Query should contain a JSON dict."
                )

        # Ensure that the query is stored as a JSON string
        query = json.dumps(query)
        return query

    class Meta:
        model = ApplyLangWorker
        fields = ("id", "url", "author", "indices", "description", "task",
                  "query", "field")

    def get_url(self, obj):
        default_version = "v2"
        index = reverse(f"{default_version}:lang_index-detail",
                        kwargs={
                            "project_pk": obj.project.pk,
                            "pk": obj.pk
                        })
        if "request" in self.context:
            request = self.context["request"]
            url = request.build_absolute_uri(index)
            return url
        else:
            return None

    def to_representation(self, instance: ApplyLangWorker):
        data = super().to_representation(instance)
        data["query"] = json.loads(instance.query)
        return data
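
validate_query() accepts the query either as a dict or as a JSON string and always normalizes it to a JSON string for storage. A behavior sketch (assumes a configured Django environment):

import json

serializer = ApplyLangOnIndicesSerializer()
assert serializer.validate_query({"query": {}}) == json.dumps({"query": {}})
assert serializer.validate_query('{"query": {}}') == json.dumps({"query": {}})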
Example #11
class TorchTaggerSerializer(FieldParseSerializer, serializers.ModelSerializer,
                            IndicesSerializerMixin,
                            ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    fields = serializers.ListField(
        child=serializers.CharField(),
        help_text='Fields used to build the model.')
    query = serializers.JSONField(help_text='Query in JSON format',
                                  required=False,
                                  default=json.dumps(EMPTY_QUERY))
    fact_name = serializers.CharField(
        default=None,
        required=False,
        help_text='Fact name used to filter tags (fact values). Default: None'
    )
    pos_label = serializers.CharField(
        default="",
        required=False,
        allow_blank=True,
        help_text=
        'Fact value used as the positive label while evaluating the results. Only needed if the selected fact has exactly two possible values. Default = ""'
    )
    model_architecture = serializers.ChoiceField(choices=choices.MODEL_CHOICES)
    maximum_sample_size = serializers.IntegerField(
        default=choices.DEFAULT_MAX_SAMPLE_SIZE, required=False)
    minimum_sample_size = serializers.IntegerField(
        default=choices.DEFAULT_MIN_SAMPLE_SIZE, required=False)
    num_epochs = serializers.IntegerField(default=choices.DEFAULT_NUM_EPOCHS,
                                          required=False)
    embedding = ProjectFilteredPrimaryKeyRelatedField(
        queryset=Embedding.objects,
        many=False,
        read_only=False,
        required=True,
        help_text='Embedding to use; mandatory.')

    balance = serializers.BooleanField(
        default=choices.DEFAULT_BALANCE,
        required=False,
        help_text=
        f'Balance sample sizes of different classes. Only applicable for multiclass taggers. Default = {choices.DEFAULT_BALANCE}'
    )
    use_sentence_shuffle = serializers.BooleanField(
        default=choices.DEFAULT_USE_SENTENCE_SHUFFLE,
        required=False,
        help_text=
        f'Shuffle sentences in added examples. NB! Only applicable for multiclass taggers with balance=True. Default = {choices.DEFAULT_USE_SENTENCE_SHUFFLE}'
    )
    balance_to_max_limit = serializers.BooleanField(
        default=choices.DEFAULT_BALANCE_TO_MAX_LIMIT,
        required=False,
        help_text=
        f'If enabled, the number of samples for each class is set to `maximum_sample_size`. Otherwise, it is set to the max class size. NB! Only applicable for multiclass taggers with balance=True. Default = {choices.DEFAULT_BALANCE_TO_MAX_LIMIT}'
    )

    task = TaskSerializer(read_only=True)
    plot = serializers.SerializerMethodField()
    url = serializers.SerializerMethodField()

    def validate(self, data):
        # use custom validation for pos label as some other serializer fields are also required
        data = validate_pos_label(data)
        return data

    class Meta:
        model = TorchTagger
        fields = ('url', 'author', 'id', 'description', 'query', 'fields',
                  'embedding', 'f1_score', 'precision', 'recall', 'accuracy',
                  'model_architecture', 'maximum_sample_size',
                  'minimum_sample_size', 'num_epochs', 'plot', 'task',
                  'fact_name', 'indices', 'confusion_matrix', 'num_examples',
                  'balance', 'use_sentence_shuffle', 'balance_to_max_limit',
                  'pos_label', 'classes')
        read_only_fields = ('project', 'fields', 'f1_score', 'precision',
                            'recall', 'accuracy', 'plot', 'task',
                            'confusion_matrix', 'num_examples', 'classes')
        fields_to_parse = ['fields', 'classes']
Example #12
class TaggerSerializer(FieldParseSerializer, serializers.ModelSerializer,
                       IndicesSerializerMixin, ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    description = serializers.CharField(
        help_text='Description for the Tagger. Will be used as the tag.')
    fields = serializers.ListField(
        child=serializers.CharField(),
        help_text='Fields used to build the model.')
    vectorizer = serializers.ChoiceField(
        choices=choices.get_vectorizer_choices(),
        default=choices.DEFAULT_VECTORIZER,
        help_text=
        'Vectorizer algorithm to create document vectors. NB! HashingVectorizer does not support feature name extraction!'
    )
    analyzer = serializers.ChoiceField(
        choices=choices.get_analyzer_choices(),
        default=choices.DEFAULT_ANALYZER,
        help_text="Analyze text as words or characters.")
    classifier = serializers.ChoiceField(
        choices=choices.get_classifier_choices(),
        default=choices.DEFAULT_CLASSIFIER,
        help_text='Classification algorithm used in the model.')
    embedding = ProjectFilteredPrimaryKeyRelatedField(
        queryset=Embedding.objects,
        many=False,
        read_only=False,
        allow_null=True,
        default=None,
        help_text='Embedding to use')
    negative_multiplier = serializers.FloatField(
        default=choices.DEFAULT_NEGATIVE_MULTIPLIER,
        help_text=
        f'Multiplies the size of positive samples to determine negative example set size. Default: {choices.DEFAULT_NEGATIVE_MULTIPLIER}'
    )
    maximum_sample_size = serializers.IntegerField(
        default=choices.DEFAULT_MAX_SAMPLE_SIZE,
        help_text=
        f'Maximum number of documents used to build a model. Default: {choices.DEFAULT_MAX_SAMPLE_SIZE}'
    )
    minimum_sample_size = serializers.IntegerField(
        default=choices.DEFAULT_MIN_SAMPLE_SIZE,
        help_text=
        f'Minimum number of documents required to train a model. Default: {choices.DEFAULT_MIN_SAMPLE_SIZE}'
    )
    score_threshold = serializers.FloatField(
        default=choices.DEFAULT_SCORE_THRESHOLD,
        help_text=
        f'Elasticsearch score threshold for filtering out irrelevant examples. All examples below first document\'s score * score threshold are ignored. Float between 0 and 1. Default: {choices.DEFAULT_SCORE_THRESHOLD}'
    )
    snowball_language = serializers.ChoiceField(
        choices=get_snowball_choices(),
        default=DEFAULT_SNOWBALL_LANGUAGE,
        help_text=
        f'Uses Snowball stemmer with specified language to normalize the texts. Default: {DEFAULT_SNOWBALL_LANGUAGE}'
    )
    scoring_function = serializers.ChoiceField(
        choices=choices.DEFAULT_SCORING_OPTIONS,
        default=choices.DEFAULT_SCORING_FUNCTION,
        required=False,
        help_text=
        f'Scoring function used while evaluating the results on dev set. Default: {choices.DEFAULT_SCORING_FUNCTION}'
    )
    stop_words = serializers.ListField(
        child=serializers.CharField(),
        default=[],
        required=False,
        help_text='Stop words to add. Default = [].',
        write_only=True)
    ignore_numbers = serializers.BooleanField(
        default=choices.DEFAULT_IGNORE_NUMBERS,
        required=False,
        help_text='If enabled, ignore all numbers as possible features.')
    detect_lang = serializers.BooleanField(
        default=False,
        help_text=
        "Whether to detect the language for the stemmer from the document itself."
    )
    task = TaskSerializer(read_only=True)
    plot = serializers.SerializerMethodField()
    query = serializers.JSONField(help_text='Query in JSON format',
                                  required=False,
                                  default=json.dumps(EMPTY_QUERY))
    fact_name = serializers.CharField(
        default=None,
        required=False,
        help_text='Fact name used to filter tags (fact values). Default: None'
    )
    pos_label = serializers.CharField(
        default="",
        required=False,
        allow_blank=True,
        help_text=
        'Fact value used as the positive label while evaluating the results. Only needed if the selected fact has exactly two possible values. Default = ""'
    )
    url = serializers.SerializerMethodField()
    tagger_groups = serializers.SerializerMethodField(read_only=True)

    balance = serializers.BooleanField(
        default=choices.DEFAULT_BALANCE,
        required=False,
        help_text=
        f'Balance sample sizes of different classes. Only applicable for multiclass taggers. Default = {choices.DEFAULT_BALANCE}'
    )
    balance_to_max_limit = serializers.BooleanField(
        default=choices.DEFAULT_BALANCE_TO_MAX_LIMIT,
        required=False,
        help_text=
        f'If enabled, the number of samples for each class is set to `maximum_sample_size`. Otherwise, it is set to the max class size. NB! Only applicable for multiclass taggers with balance=True. Default = {choices.DEFAULT_BALANCE_TO_MAX_LIMIT}'
    )

    class Meta:
        model = Tagger
        fields = ('id', 'url', 'author', 'description', 'query', 'fact_name',
                  'indices', 'fields', 'detect_lang', 'embedding',
                  'vectorizer', 'analyzer', 'classifier', 'stop_words',
                  'maximum_sample_size', 'minimum_sample_size',
                  'score_threshold', 'negative_multiplier', 'precision',
                  'recall', 'f1_score', 'snowball_language',
                  'scoring_function', 'num_features', 'num_examples',
                  'confusion_matrix', 'plot', 'task', 'tagger_groups',
                  'ignore_numbers', 'balance', 'balance_to_max_limit',
                  'pos_label', 'classes')
        read_only_fields = ('precision', 'recall', 'f1_score', 'num_features',
                            'num_examples', 'tagger_groups',
                            'confusion_matrix', 'classes')
        fields_to_parse = (
            'fields',
            'classes',
        )

    def validate(self, data):
        if data.get("detect_lang", None) is True and data.get(
                "snowball_language", None):
            raise ValidationError(
                "Values 'detect_lang' and 'snowball_language' are mutually exclusive, please opt for one!"
            )

        # use custom validation for pos label as some other serializer fields are also required
        data = validate_pos_label(data)

        return data

    def __init__(self, *args, **kwargs):
        """
        Add the ability to pass extra arguments such as "remove_fields".
        Useful for reusing the Serializer, e.g. inside another Serializer,
        without creating a new one.
        """
        remove_fields = kwargs.pop('remove_fields', None)
        super().__init__(*args, **kwargs)

        if remove_fields:
            # Remove every listed field from the serializer output.
            for field_name in remove_fields:
                self.fields.pop(field_name)

    def get_tagger_groups(self, value: Tagger):
        return json.loads(value.tagger_groups)
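
A usage sketch for the remove_fields kwarg, e.g. when nesting TaggerSerializer inside another serializer without its heavyweight output fields (the field names here are chosen for illustration):

nested = TaggerSerializer(remove_fields=["plot", "tagger_groups"], read_only=True)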
Example #13
class ProjectSerializer(FieldParseSerializer, serializers.ModelSerializer):
    title = serializers.CharField(required=True)

    indices = IndexSerializer(many=True, required=False, read_only=True)
    indices_write = serializers.ListField(
        child=serializers.CharField(validators=[check_for_existence]),
        write_only=True,
        default=[])

    users = UserSerializer(many=True,
                           default=serializers.CurrentUserDefault(),
                           read_only=True)
    users_write = serializers.ListField(
        child=serializers.CharField(validators=[check_if_username_exist]),
        write_only=True,
        default=[],
        help_text=
        "Usernames of users that should have access to the Project's resources."
    )

    administrators = UserSerializer(many=True,
                                    default=serializers.CurrentUserDefault(),
                                    read_only=True)
    administrators_write = serializers.ListField(
        child=serializers.CharField(validators=[check_if_username_exist]),
        write_only=True,
        default=[],
        help_text=
        "Usernames of users that should be given Project Administrator permissions."
    )

    author = UserSerializer(read_only=True)

    resources = serializers.SerializerMethodField()
    resource_count = serializers.SerializerMethodField()

    scopes = serializers.ListField(
        default=[],
        required=False,
        help_text=
        "Users that belong to the given scope will have access to the Project's resources."
    )

    # DRF silently ignores read-only fields posted in the payload, so we reject them manually.
    def validate(self, data):
        if hasattr(self, 'initial_data'):
            read_only_keys = ["indices", "users", "administrators"]
            for key in read_only_keys:
                if key in self.initial_data:
                    raise ValidationError(
                        f"Field: '{key}' is a read-only field, please use {key}_write instead!"
                    )
        return data

    def validate_scopes(self, values):
        user = self.context["request"].user
        user_scopes = json.loads(user.profile.scopes)
        # Only superusers and staff may define arbitrary scopes.
        if not (user.is_superuser or user.is_staff):
            for project_scope in values:
                if project_scope not in user_scopes:
                    raise ValidationError(
                        "Normal users can only define scopes they have access to!"
                    )
        return values

    def __enrich_payload_with_orm(self, base, data):
        author = self.context["request"].user
        fields = ["users_write", "administrators_write"]
        for field in fields:
            usernames = data.get(field, None)
            if not usernames:
                base[field] = [author]
            else:
                base[field] = list(User.objects.filter(username__in=usernames))
        return base

    def to_internal_value(self, data):
        base = super().to_internal_value(data)
        base = self.__enrich_payload_with_orm(base, data)
        return base

    def update(self, instance: Project, validated_data: dict):
        if "title" in validated_data:
            instance.title = validated_data["title"]
        if "scopes" in validated_data:
            instance.scopes = json.dumps(validated_data["scopes"])

        instance.save()
        return instance

    def create(self, validated_data):
        from toolkit.elastic.index.models import Index
        indices: List[str] = validated_data.get("indices_write", None)
        title = validated_data["title"]
        users = wrap_in_list(validated_data["users_write"])
        administrators = wrap_in_list(validated_data["administrators_write"])
        author = self.context["request"].user
        scopes = json.dumps(validated_data["scopes"], ensure_ascii=False)

        if indices and not author.is_superuser:
            raise PermissionDenied(
                "Non-superusers can not create projects with indices defined!")

        # create object
        with transaction.atomic():
            project = Project.objects.create(title=title,
                                             author=author,
                                             scopes=scopes)
            project.users.add(*users, *administrators, author)
            project.administrators.add(*administrators,
                                       author)  # All admins are also users.

            # only run if indices given as we might not have elastic running
            if indices:
                for index_name in indices:
                    index, is_created = Index.objects.get_or_create(
                        name=index_name)
                    project.indices.add(index)

        return project

    class Meta:
        model = Project
        fields = (
            'url',
            'id',
            'title',
            'author',
            'administrators_write',
            'administrators',
            'users',
            'users_write',
            'indices',
            'indices_write',
            'scopes',
            'resources',
            'resource_count',
        )
        read_only_fields = (
            'author',
            'resources',
        )
        fields_to_parse = ("scopes", )

    def get_resources(self, obj):
        request = self.context.get('request')
        api_version = request.version
        version_prefix = f'/api/{api_version}'
        base_url = request.build_absolute_uri(
            f'{version_prefix}/projects/{obj.id}/')
        resource_dict = {}

        if api_version == 'v2':
            resources = (
                'lexicons',
                'elastic/reindexer',
                'elastic/index_splitter',
                'elastic/dataset_imports',
                'elastic/search_query_tagger',
                'elastic/search_fields_tagger',
                'elastic/delete_facts_by_query',
                'elastic/edit_facts_by_query',
                'elastic/scroll',
                'elastic/apply_analyzers',
                'searches',
                'embeddings',
                'topic_analyzer',
                'taggers',
                'tagger_groups',
                'torchtaggers',
                'bert_taggers',
                'regex_taggers',
                'anonymizers',
                'regex_tagger_groups',
                'mlp_index',
                'lang_index',
                'evaluators',
                'summarizer_index',
                'rakun_extractors',
                'crf_extractors',
                'annotator',
                'labelset',
                'annotator_groups',
            )
        elif api_version == 'v1':
            resources = ('lexicons', 'reindexer', 'search_query_tagger',
                         'search_fields_tagger', 'index_splitter',
                         'dataset_imports', 'searches', 'scroll', 'clustering',
                         'embeddings', 'taggers', 'tagger_groups',
                         'torchtaggers', 'bert_taggers', 'regex_taggers',
                         'anonymizers', 'regex_tagger_groups', 'mlp_index',
                         'lang_index', 'evaluators', 'summarizer_index',
                         'apply_analyzers', 'rakun_extractors',
                         'crf_extractors')
        else:
            # Unknown API version: expose no project resources.
            resources = ()

        for resource_name in resources:
            resource_dict[resource_name] = f'{base_url}{resource_name}/'

        additional_urls = ['mlp_texts', 'mlp_docs', 'summarizer_summarize']
        for item in additional_urls:
            view_url = reverse(f"{api_version}:{item}")
            resource_dict[item] = request.build_absolute_uri(view_url)

        importer_uri = reverse(f"{api_version}:document_import",
                               kwargs={"pk": obj.id})
        resource_dict["document_import_api"] = request.build_absolute_uri(
            importer_uri)
        return resource_dict

    def get_resource_count(self, obj):
        return sum(obj.get_resource_counts().values())
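
The manual read-only guard in validate() means posting "users" directly is rejected. A behavior sketch (the request object in the context is assumed to come from a DRF view):

serializer = ProjectSerializer(data={"title": "demo", "users": [1]},
                               context={"request": request})
serializer.is_valid()  # False; the errors say to use users_write instead of users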
Example #14
class IndexSplitterSerializer(FieldParseSerializer,
                              serializers.HyperlinkedModelSerializer,
                              IndicesSerializerMixin,
                              ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    url = serializers.SerializerMethodField()
    scroll_size = serializers.IntegerField(min_value=0,
                                           max_value=10000,
                                           required=False)
    description = serializers.CharField(help_text='Description of the task.',
                                        required=True,
                                        allow_blank=False)
    query = serializers.JSONField(
        help_text=
        'Query used to filter the indices. Defaults to an empty query.',
        required=False)
    train_index = serializers.CharField(help_text='Name of the train index.',
                                        allow_blank=False,
                                        required=True,
                                        validators=[
                                            check_for_wildcards,
                                            check_for_colons,
                                            check_for_special_symbols,
                                            check_for_banned_beginning_chars,
                                            check_for_upper_case
                                        ])
    test_index = serializers.CharField(help_text='Name of the test index.',
                                       allow_blank=False,
                                       required=True,
                                       validators=[
                                           check_for_wildcards,
                                           check_for_colons,
                                           check_for_special_symbols,
                                           check_for_banned_beginning_chars,
                                           check_for_upper_case
                                       ])
    fields = serializers.ListField(
        child=serializers.CharField(),
        help_text='Leaving this empty selects all fields of the posted indices.',
        required=False)
    task = TaskSerializer(read_only=True)
    test_size = serializers.IntegerField(
        help_text=
        'Size of the test set. Represents a percentage with "random" or "original" distribution and a quantity with "equal" or "custom" distribution.',
        required=False,
        min_value=1,
        max_value=10000)

    fact = serializers.CharField(
        required=False,
        help_text=
        "Name of the fact on which the test index distribution will be based.")
    str_val = serializers.CharField(
        required=False,
        help_text=
        "Name of the fact value on which the test index distribution will be based."
    )
    distribution = serializers.ChoiceField(
        choices=LABEL_DISTRIBUTION,
        default=LABEL_DISTRIBUTION[0][0],
        required=False,
        help_text=
        'Distribution of the test set. Either "random", "original", "equal" or "custom".'
    )
    custom_distribution = serializers.JSONField(
        default={},
        help_text=
        "A dictionary containing custom label distribution with keys as labels and values as quantities."
    )

    class Meta:
        model = IndexSplitter
        fields = ('id', 'url', 'author', 'description', 'indices',
                  'scroll_size', 'fields', 'query', 'train_index',
                  'test_index', "test_size", 'fact', 'str_val', 'distribution',
                  'custom_distribution', 'task')
        fields_to_parse = ('fields', 'custom_distribution')

    def validate_train_index(self, value):
        """ Check that new_index does not exist """
        open_indices, closed_indices = ElasticCore().get_indices()
        if value in open_indices or value in closed_indices:
            raise serializers.ValidationError(
                f"{value} already exists, choose a different name for your train index"
            )
        return value

    def validate_test_index(self, value):
        """ Check that new_index does not exist """
        open_indices, closed_indices = ElasticCore().get_indices()
        if value in open_indices or value in closed_indices:
            raise serializers.ValidationError(
                f"{value} already exists, choose a different name for your test index"
            )
        return value

    def validate_indices(self, value):
        """ check if index is in the relevant project indices field """
        project_obj = Project.objects.get(
            id=self.context['view'].kwargs['project_pk'])
        for index in value:
            if index.get("name") not in project_obj.get_indices():
                raise serializers.ValidationError(
                    f'Index "{index.get("name")}" is not contained in your project indices "{project_obj.get_indices()}"'
                )
        return value

    def validate_fields(self, value):
        ''' check if fields included in the request are in the relevant project fields '''
        project_fields = self._get_project_fields()
        field_data = [field["path"] for field in project_fields]
        for field in value:
            if field not in field_data:
                raise serializers.ValidationError(
                    f'The fields you are attempting to add to new indices are not in current project fields: {project_fields}'
                )
        return value

    def validate_query(self, value):
        val = json.loads(value)
        if "query" not in val:
            raise serializers.ValidationError(
                "Incorrect elastic query. Must contain field 'query'.")
        return value

    def validate(self, data):
        fact = data.get("fact")
        if data["distribution"] == "custom" and len(
                data["custom_distribution"]) == 0:
            raise serializers.ValidationError(
                "Field custom_distribution can not be empty with a custom label distribution."
            )
        if fact == "" and data["distribution"] in [
                "custom", "equal", "original"
        ]:
            raise serializers.ValidationError(
                'Fact must be specified with "custom", "equal" or "original" distribution.'
            )
        if data["distribution"] in ["custom", "equal", "original"]:
            if "fields" in data and "texta_facts" not in data["fields"]:
                project_fields = self._get_project_fields()
                field_data = [field["path"] for field in project_fields]
                if "texta_facts" not in field_data:
                    raise serializers.ValidationError(
                        f'Field texta_facts is required but it is not in project fields: {project_fields}'
                    )
                else:
                    data["fields"].append("texta_facts")
        return data

    def _get_project_fields(self):
        project_obj = Project.objects.get(
            id=self.context['view'].kwargs['project_pk'])
        project_fields = ElasticCore().get_fields(
            indices=project_obj.get_indices())
        return project_fields
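
A hypothetical payload satisfying the distribution rules enforced in validate() above; non-random distributions require a fact, and texta_facts is appended to fields automatically when needed:

payload = {
    "description": "split reviews",
    "train_index": "reviews_train",
    "test_index": "reviews_test",
    "test_size": 200,        # a quantity, since the distribution is "equal"
    "distribution": "equal",
    "fact": "SENTIMENT",     # required for "custom", "equal" and "original"
}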
Example #15
class ReindexerCreateSerializer(FieldParseSerializer,
                                serializers.HyperlinkedModelSerializer,
                                ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    url = serializers.SerializerMethodField()
    scroll_size = serializers.IntegerField(
        min_value=0, max_value=10000, required=False
    )  # Max value stems from Elasticsearch max doc count limitation.
    description = serializers.CharField(
        help_text='Describe your re-indexing task',
        required=True,
        allow_blank=False)
    indices = serializers.ListField(
        child=serializers.CharField(),
        help_text='Add the indices you wish to reindex into a new index.',
        required=True)
    query = serializers.JSONField(
        help_text='Add a query, if you wish to filter the new reindexed index.',
        required=False)
    new_index = serializers.CharField(
        help_text='Your new re-indexed index name',
        allow_blank=False,
        required=True,
        validators=[
            check_for_wildcards, check_for_colons, check_for_special_symbols,
            check_for_banned_beginning_chars, check_for_upper_case
        ])
    field_type = serializers.ListField(
        help_text=
        'Used to update the field name and the field type of chosen paths.',
        required=False)
    add_facts_mapping = serializers.BooleanField(
        help_text=
        'Add texta facts mapping. NB! If texta_facts is present in reindexed fields, the mapping is always created.',
        required=False,
        default=False)
    task = TaskSerializer(read_only=True)
    fields = serializers.ListField(
        child=serializers.CharField(),
        help_text=
        'Leaving this empty selects all fields of the posted indices; otherwise only the listed field content is added to the new index.',
        required=False)
    random_size = serializers.IntegerField(
        help_text=
        'Picks a subset of documents of chosen size at random. Disabled by default.',
        required=False,
        min_value=1,
        max_value=10000)

    class Meta:
        model = Reindexer
        fields = ('id', 'url', 'author', 'description', 'indices',
                  'scroll_size', 'fields', 'query', 'new_index', 'random_size',
                  'field_type', 'add_facts_mapping', 'task')
        fields_to_parse = ('fields', 'field_type', 'indices')

    def validate_new_index(self, value):
        """ Check that new_index does not exist """
        if value in ElasticCore().get_indices():
            raise serializers.ValidationError(
                "new_index already exists, choose a different name for your reindexed index"
            )
        return value

    def validate_indices(self, value):
        """ check if re-indexed index is in the relevant project indices field """
        project_obj = Project.objects.get(
            id=self.context['view'].kwargs['project_pk'])
        for index in value:
            indices = project_obj.get_indices()
            if index not in indices:
                raise serializers.ValidationError(
                    f'Index "{index}" is not contained in your project indices "{indices}"'
                )
        return value

    def validate_fields(self, value: List[str]):
        """ check if changed fields included in the request are in the relevant project fields """
        project_obj: Project = Project.objects.get(
            id=self.context['view'].kwargs['project_pk'])
        indices = self.context["request"].data.get("indices", [])
        indices = project_obj.get_available_or_all_project_indices(indices)
        project_fields = ElasticCore().get_fields(indices=indices)
        field_data = [field["path"] for field in project_fields]
        for field in value:
            if field not in field_data:
                raise serializers.ValidationError(
                    f'The fields you are attempting to re-index are not in current project fields: {project_fields}'
                )
        return value
Example #16
class EvaluatorSerializer(serializers.ModelSerializer,
                          ProjectResourceUrlSerializer,
                          IndicesSerializerMixin):
    author = UserSerializer(read_only=True)
    query = serializers.JSONField(required=False,
                                  help_text="Query in JSON format",
                                  default=json.dumps(EMPTY_QUERY))

    true_fact = serializers.CharField(
        required=True,
        help_text="Fact name used as true label for multilabel evaluation.")
    predicted_fact = serializers.CharField(
        required=True,
        help_text=
        "Fact name used as predicted label for multilabel evaluation.")

    true_fact_value = serializers.CharField(
        required=False,
        default="",
        help_text="Fact value used as true label for binary evaluation.")
    predicted_fact_value = serializers.CharField(
        required=False,
        default="",
        help_text="Fact value used as predicted label for binary evaluation.")

    average_function = serializers.ChoiceField(
        choices=choices.AVG_CHOICES,
        default=choices.DEFAULT_AVG_FUNCTION,
        required=False,
        help_text=
        "Sklearn average function. NB! Doesn't have any effect on entity evaluation."
    )

    es_timeout = serializers.IntegerField(
        default=choices.DEFAULT_ES_TIMEOUT,
        help_text="Elasticsearch scroll timeout in minutes.")
    scroll_size = serializers.IntegerField(
        min_value=1,
        max_value=10000,
        default=choices.DEFAULT_SCROLL_SIZE,
        help_text=
        "How many documents should be returned by one Elasticsearch scroll.")

    add_individual_results = serializers.BooleanField(
        default=choices.DEFAULT_ADD_INDIVIDUAL_RESULTS,
        required=False,
        help_text=
        "Only used for multilabel/multiclass evaluation. If enabled, individual label scores are calculated and stored as well."
    )

    add_misclassified_examples = serializers.BooleanField(
        default=choices.DEFAULT_ADD_MISCLASSIFIED_EXAMPLES,
        required=False,
        help_text=
        "Only used for entity evaluation. If enabled, misclassified and partially overlapping values are stored and can be analyzed later."
    )

    evaluation_type = serializers.ChoiceField(
        choices=choices.EVALUATION_TYPE_CHOICES,
        default="multilabel",
        required=False,
        help_text="Specify the type of labelsets to evaluate.")
    token_based = serializers.BooleanField(
        default=choices.DEFAULT_TOKEN_BASED,
        required=False,
        help_text=
        "If enabled, uses token-based entity evaluation; otherwise calculates the scores based on the spans of two value-sets."
    )

    field = serializers.CharField(
        default="",
        required=False,
        help_text=
        "Field related to true and predicted facts. NB! This has effect only for evaluation_type='entity' and is only required if the selected facts have multiple different doc paths."
    )

    plot = serializers.SerializerMethodField()
    task = TaskSerializer(read_only=True)

    url = serializers.SerializerMethodField()

    def validate_indices(self, value):
        """ Check that the indices exist in the relevant project. """
        project_obj = Project.objects.get(
            id=self.context["view"].kwargs["project_pk"])
        project_indices = project_obj.get_indices()  # fetch once for both the check and the error message
        for index in value:
            if index.get("name") not in project_indices:
                raise serializers.ValidationError(
                    f'Index "{index.get("name")}" is not contained in your project indices "{project_indices}"'
                )
        return value

    def validate(self, data):
        """ Check if all inserted facts and fact values are present in the indices."""

        # For PATCH
        if len(data) == 1 and "description" in data:
            return data

        indices = [index.get("name") for index in data.get("indices", [])]
        query = data.get("query")
        if isinstance(query, str):
            query = json.loads(query)

        true_fact = data.get("true_fact")
        predicted_fact = data.get("predicted_fact")

        true_fact_value = data.get("true_fact_value")
        predicted_fact_value = data.get("predicted_fact_value")

        avg_function = data.get("average_function")
        evaluation_type = data.get("evaluation_type")

        doc_path = data.get("field")

        validate_fact(indices, query, true_fact)
        validate_fact(indices, query, predicted_fact)

        validate_fact_value(indices, query, true_fact, true_fact_value)
        validate_fact_value(indices, query, predicted_fact,
                            predicted_fact_value)

        if evaluation_type == "entity":
            validate_entity_facts(indices, query, true_fact, predicted_fact,
                                  doc_path)

        validate_fact_values_in_sync(true_fact_value, predicted_fact_value)

        validate_average_function(avg_function, true_fact_value,
                                  predicted_fact_value)
        validate_evaluation_type(
            indices,
            query,
            evaluation_type,
            true_fact,
            predicted_fact,
            true_fact_value,
            predicted_fact_value,
        )

        return data

    class Meta:
        model = Evaluator
        fields = ("url", "author", "id", "description", "indices", "query",
                  "true_fact", "predicted_fact", "true_fact_value",
                  "predicted_fact_value", "average_function", "f1_score",
                  "precision", "recall", "accuracy", "confusion_matrix",
                  "n_true_classes", "n_predicted_classes", "n_total_classes",
                  "evaluation_type", "scroll_size", "es_timeout",
                  "scores_imprecise", "score_after_scroll", "document_count",
                  "add_individual_results", "plot", "task",
                  "add_misclassified_examples", "evaluation_type",
                  "token_based", "field")

        read_only_fields = ("project", "f1_score", "precision", "recall",
                            "accuracy", "confusion_matrix", "n_true_classes",
                            "n_predicted_classes", "n_total_classes",
                            "document_count", "evaluation_type",
                            "scores_imprecise", "score_after_scroll", "task")
Example #17
class BertTaggerSerializer(FieldParseSerializer, serializers.ModelSerializer, IndicesSerializerMixin, ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    fields = serializers.ListField(child=serializers.CharField(), help_text='Fields used to build the model.')
    query = serializers.JSONField(required=False, help_text='Query in JSON format', default=json.dumps(EMPTY_QUERY))
    fact_name = serializers.CharField(default=None, required=False, help_text='Fact name used to filter tags (fact values). Default = None')
    pos_label = serializers.CharField(default="", required=False, allow_blank=True, help_text='Fact value used as positive label while evaluating the results. This is needed only if the selected fact has exactly two possible values. Default = ""')

    use_gpu = serializers.BooleanField(default=True, help_text="Whether to force the usage of a GPU or not.")

    checkpoint_model = ProjectFilteredPrimaryKeyRelatedField(queryset=BertTagger.objects, many=False, read_only=False, allow_null=True, default=None,
                                                             help_text='Previously fine-tuned BERT model. Select this if you wish to further fine-tune it with additional data and/or new parameters. Default = None')

    maximum_sample_size = serializers.IntegerField(default=choices.DEFAULT_MAX_SAMPLE_SIZE, required=False, help_text=f'Maximum number of positive examples. Default = {choices.DEFAULT_MAX_SAMPLE_SIZE}')
    minimum_sample_size = serializers.IntegerField(default=choices.DEFAULT_MIN_SAMPLE_SIZE, required=False, help_text=f'Minimum number of negative examples. Default = {choices.DEFAULT_MIN_SAMPLE_SIZE}')
    negative_multiplier = serializers.FloatField(default=choices.DEFAULT_NEGATIVE_MULTIPLIER, required=False, help_text=f'Multiplier for the number of negative examples used in training. Default = {choices.DEFAULT_NEGATIVE_MULTIPLIER}')

    bert_model = serializers.CharField(default=choices.DEFAULT_BERT_MODEL, required=False, help_text=f'Pretrained BERT model to use. Default = {choices.DEFAULT_BERT_MODEL}')
    num_epochs = serializers.IntegerField(default=choices.DEFAULT_NUM_EPOCHS, required=False, help_text=f'Number of training epochs. Default = {choices.DEFAULT_NUM_EPOCHS}')
    max_length = serializers.IntegerField(default=choices.DEFAULT_MAX_LENGTH, required=False, min_value=1, max_value=512, help_text=f'Maximum sequence length of BERT tokenized input text used for training. Default = {choices.DEFAULT_MAX_LENGTH}')
    batch_size = serializers.IntegerField(default=choices.DEFAULT_BATCH_SIZE, required=False, help_text=f'Batch size used for training. NB! Autoscaled based on max length if too large. Default = {choices.DEFAULT_BATCH_SIZE}')
    split_ratio = serializers.FloatField(default=choices.DEFAULT_TRAINING_SPLIT, required=False, help_text=f'Proportion of documents used for training; others are used for validation. Default = {choices.DEFAULT_TRAINING_SPLIT}')
    learning_rate = serializers.FloatField(default=choices.DEFAULT_LEARNING_RATE, required=False, help_text=f'Learning rate used while training. Default = {choices.DEFAULT_LEARNING_RATE}')
    eps = serializers.FloatField(default=choices.DEFAULT_EPS, help_text=f'Epsilon value for the optimizer. Default = {choices.DEFAULT_EPS}')

    balance = serializers.BooleanField(default=choices.DEFAULT_BALANCE, required=False, help_text=f'Balance sample sizes of different classes. Only applicable for multiclass taggers. Default = {choices.DEFAULT_BALANCE}')
    use_sentence_shuffle = serializers.BooleanField(default=choices.DEFAULT_USE_SENTENCE_SHUFFLE, required=False, help_text=f'Shuffle sentences in added examples. NB! Only applicable for multiclass taggers with balance=True. Default = {choices.DEFAULT_USE_SENTENCE_SHUFFLE}')
    balance_to_max_limit = serializers.BooleanField(default=choices.DEFAULT_BALANCE_TO_MAX_LIMIT, required=False,
                                                    help_text=f'If enabled, the number of samples for each class is set to `maximum_sample_size`. Otherwise, it is set to max class size. NB! Only applicable for multiclass taggers with balance=True. Default = {choices.DEFAULT_BALANCE_TO_MAX_LIMIT}')

    task = TaskSerializer(read_only=True)
    plot = serializers.SerializerMethodField()
    url = serializers.SerializerMethodField()


    def validate_bert_model(self, bert_model):
        available_models = get_downloaded_bert_models(BERT_PRETRAINED_MODEL_DIRECTORY)
        if bert_model not in available_models:
            if ALLOW_BERT_MODEL_DOWNLOADS:
                raise serializers.ValidationError(f"Model '{bert_model}' is not downloaded. Please download it first via action 'Download pretrained model'. Currently available models: {available_models}.")
            else:
                raise serializers.ValidationError(f"Model '{bert_model}' is not downloaded. Downloading models via API is disabled. Please contact your system administrator to make it available. Currently available models: {available_models}.")
        return bert_model


    def validate(self, data):
        # Custom object-level validation is used for pos_label because it depends on other serializer fields.
        data = validate_pos_label(data)
        return data


    class Meta:
        model = BertTagger
        fields = ('url', 'author', 'id', 'description', 'query', 'fields', 'use_gpu', 'f1_score', 'precision', 'recall', 'accuracy',
                  'validation_loss', 'training_loss', 'maximum_sample_size', 'minimum_sample_size', 'num_epochs', 'plot', 'task', 'pos_label', 'fact_name',
                  'indices', 'bert_model', 'learning_rate', 'eps', 'max_length', 'batch_size', 'adjusted_batch_size',
                  'split_ratio', 'negative_multiplier', 'checkpoint_model', 'num_examples', 'confusion_matrix', 'balance', 'use_sentence_shuffle', 'balance_to_max_limit', 'classes')

        read_only_fields = ('project', 'fields', 'f1_score', 'precision', 'recall', 'accuracy', 'validation_loss', 'training_loss', 'plot',
                            'task', 'num_examples', 'adjusted_batch_size', 'confusion_matrix', 'classes')

        fields_to_parse = ['fields', 'classes']
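The helper get_downloaded_bert_models() is not shown in this example; below is a hedged sketch of what such a lookup might do, assuming each pretrained model lives in its own subdirectory of BERT_PRETRAINED_MODEL_DIRECTORY (the real implementation may differ). The function name is invented for illustration.

import pathlib
from typing import List


def downloaded_bert_models_sketch(model_directory: str) -> List[str]:
    """ Treat every subdirectory of the model directory as one available model name. """
    root = pathlib.Path(model_directory)
    if not root.is_dir():
        return []  # nothing downloaded yet
    return sorted(path.name for path in root.iterdir() if path.is_dir())


print(downloaded_bert_models_sketch("/tmp/bert_models"))  # [] on a fresh machine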
Example #18
class TaggerGroupSerializer(serializers.ModelSerializer,
                            ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    description = serializers.CharField(
        help_text='Description for the Tagger Group.')
    minimum_sample_size = serializers.IntegerField(
        default=choices.DEFAULT_MIN_SAMPLE_SIZE,
        help_text=
        f'Minimum number of documents required to train a model. Default: {choices.DEFAULT_MIN_SAMPLE_SIZE}'
    )
    fact_name = serializers.CharField(
        default=choices.DEFAULT_TAGGER_GROUP_FACT_NAME,
        help_text=
        f'Fact name used to filter tags (fact values). Default: {choices.DEFAULT_TAGGER_GROUP_FACT_NAME}'
    )
    tagger = TaggerSerializer(write_only=True,
                              remove_fields=[
                                  'description', 'query', 'fact_name',
                                  'minimum_sample_size'
                              ])
    num_tags = serializers.IntegerField(read_only=True)
    blacklisted_facts = serializers.ListField(
        child=serializers.CharField(),
        default=[],
        help_text="Which fact values to ignore when creating the taggers.")
    tagger_status = serializers.SerializerMethodField()
    tagger_statistics = serializers.SerializerMethodField()
    tagger_params = serializers.SerializerMethodField()
    url = serializers.SerializerMethodField()
    task = TaskSerializer(read_only=True)

    def to_representation(self, instance):
        data = super(TaggerGroupSerializer, self).to_representation(instance)
        try:
            # blacklisted_facts is persisted as a JSON string; deserialize it for the API response.
            data["blacklisted_facts"] = json.loads(instance.blacklisted_facts)
        except Exception as e:
            logging.getLogger(ERROR_LOGGER).exception(e)
        return data

    class Meta:
        model = TaggerGroup
        fields = ('id', 'url', 'author', 'description', 'fact_name',
                  'num_tags', 'blacklisted_facts', 'minimum_sample_size',
                  'tagger_status', 'tagger_params', 'tagger',
                  'tagger_statistics', 'task')

    def get_tagger_status(self, obj):
        tagger_objects = obj.taggers
        # Use COUNT queries instead of len() so the taggers are not fetched into memory.
        tagger_status = {
            'total': obj.num_tags,
            'completed': tagger_objects.filter(task__status='completed').count(),
            'training': tagger_objects.filter(task__status='running').count(),
            'created': tagger_objects.filter(task__status='created').count(),
            'failed': tagger_objects.filter(task__status='failed').count()
        }
        return tagger_status

    def get_tagger_statistics(self, obj):
        tagger_objects = obj.taggers
        if tagger_objects.exists():
            try:
                tagger_size_sum = round(
                    tagger_objects.filter(model_size__isnull=False).aggregate(
                        Sum('model_size'))['model_size__sum'], 1)
            except TypeError:
                # Aggregating over taggers without trained models yields None; report 0 until the models are ready.
                tagger_size_sum = 0
            tagger_stats = {
                'avg_precision': tagger_objects.aggregate(Avg('precision'))['precision__avg'],
                'avg_recall': tagger_objects.aggregate(Avg('recall'))['recall__avg'],
                'avg_f1_score': tagger_objects.aggregate(Avg('f1_score'))['f1_score__avg'],
                'sum_size': {
                    "size": tagger_size_sum,
                    "unit": "mb"
                }
            }
            return tagger_stats

    def _embedding_details(self, instance: Tagger):
        if instance.embedding:
            return {
                "id": instance.embedding.pk,
                "description": instance.embedding.description
            }
        else:
            return None

    def get_tagger_params(self, obj):
        if obj.taggers.exists():
            # Taggers in a group are created from a single shared configuration, so the first one is representative.
            first_tagger: Tagger = obj.taggers.first()
            params = {
                'fields': json.loads(first_tagger.fields),
                'detect_lang': first_tagger.detect_lang,
                'scoring_function': first_tagger.scoring_function,
                'maximum_sample_size': first_tagger.maximum_sample_size,
                'negative_multiplier': first_tagger.negative_multiplier,
                'snowball_language': first_tagger.snowball_language,
                'embedding': self._embedding_details(first_tagger),
                'indices': first_tagger.get_indices(),
                'vectorizer': first_tagger.vectorizer,
                'classifier': first_tagger.classifier,
                'analyzer': first_tagger.analyzer,
                'stop_words': load_stop_words(first_tagger.stop_words),
                'ignore_numbers': first_tagger.ignore_numbers,
                'balance': first_tagger.balance,
                'balance_to_max_limit': first_tagger.balance_to_max_limit
            }
            return params