Exemplo n.º 1
0
def extract_tags(text):
    """Extract tags from text using every group in ``pattern.tag_groups``.

    The text is passed through ``clean`` once and the cleaned result is
    handed to ``tagger.tag`` for each tag group in turn.

    :param text: Text to tag; it is passed to ``clean`` before tagging
    :return list: Flat list of the extracted tags, each cast to a plain
        ``dict``, ordered group by group
    """
    text_clean = clean(text)
    # A single flat comprehension avoids the repeated list copying of
    # ``sum(lists, [])``; ``values()`` (unlike ``itervalues()``) exists on
    # both Python 2 and Python 3.
    return [
        dict(tag)
        for tag_group in pattern.tag_groups.values()
        for tag in tagger.tag(tag_group, text_clean)
    ]
Exemplo n.º 2
0
def check_taggers(taggers, input, expected):
    """Assert that tagging ``input`` with ``taggers`` gives ``expected``.

    The taggers are wrapped in a ``TagGroup`` (built with an empty string as
    its second argument) and applied to ``input`` via ``tag``.

    :param taggers: Taggers passed to ``TagGroup``
    :param input: Value to run the tag group against
    :param expected: Mapping of keys to values that the single extracted tag
        must contain, or ``None`` when no tag should be extracted
    :raises AssertionError: If the tagging result does not match
    """
    tag_group = TagGroup(taggers, '')
    tags = tag(tag_group, input)

    # Nothing expected: the result must be None or an empty list
    if expected is None:
        assert (tags is None or tags == [])
        return

    assert tags is not None

    # A list result must hold exactly one tag, which is then unwrapped
    if isinstance(tags, list):
        assert len(tags) == 1
        tags = tags[0]

    # ``items()`` works on Python 2 and 3; ``iteritems()`` is Python 2 only
    for key, value in expected.items():
        assert tags[key] == value
Exemplo n.º 3
0
 def test():
     # Tag ``input`` with ``taggers`` and compare the result to ``output``;
     # all three names come from the enclosing scope. Documented with
     # comments (not a docstring) so the function's ``__doc__`` is unchanged.
     tags = tag(taggers, input)

     # Nothing expected: the result must be None or empty
     if output is None:
         assert_true(tags is None or len(tags) == 0)
         return

     assert_true(tags is not None)

     # A list result must hold exactly one tag, which is then unwrapped
     if isinstance(tags, list):
         assert_equal(len(tags), 1)
         tags = tags[0]

     # ``items()`` works on Python 2 and 3; ``iteritems()`` is Python 2 only
     for key, value in output.items():
         assert_equal(tags[key], value)
Exemplo n.º 4
0
    def tag(self, tag_groups=None, overwrite=False, save=True):
        """Extract tags from the verified documents and store them.

        Each document type in ``self.verified`` is read, cleaned and run
        through every tag group. A tag equal to one already known gains the
        context, group and span for the current document type (unless that
        type is already recorded); any other tag is added as new.
        ``self.tags`` and ``self.date_last_tagged`` are updated.

        :param list tag_groups: TagGroup objects to apply; when falsy, all
            values of ``pattern.tag_groups`` are used
        :param bool overwrite: Discard stored tags before tagging
        :param bool save: Call ``self.save()`` after updating
        :return list: Tags that were created or extended by this call

        """
        groups = tag_groups or pattern.tag_groups.values()

        # Either start from scratch or wrap the stored tag dictionaries
        if overwrite:
            self.tags = []
            known = []
        else:
            known = [tagger.Tag(stored) for stored in self.tags]

        changed = []

        # Per-document-type fields, in the order they are read and written
        fields = ('context', 'group', 'span')

        self.verify(save=False)

        for document_type in self.verified:

            document = getattr(self, DOCUMENT_TYPES_TO_FIELDS[document_type])

            # Skip document types whose document is not set
            if document is None:
                continue

            raw = document.read()

            # Skip empty documents
            if not raw:
                continue

            text = clean(raw)

            for group in groups:
                for found in tagger.tag(group, text):

                    # Key each field of the extracted tag by document type
                    wrapped = [
                        (name, {document_type: found[name]})
                        for name in fields
                    ]

                    # Unknown tag: store it with its keyed fields
                    if found not in known:
                        for name, data in wrapped:
                            found[name] = data
                        known.append(found)
                        changed.append(found)
                        continue

                    # Known tag: extend it only if this document type has
                    # not been recorded yet
                    match = known[known.index(found)]
                    if document_type in match['context']:
                        continue
                    for name, data in wrapped:
                        match[name].update(data)
                    changed.append(match)

        # Cast tags to dictionaries for ODM compatibility
        self.tags = [dict(item) for item in known]

        # Record when tagging last ran
        self.date_last_tagged = datetime.datetime.utcnow()

        if save:
            self.save()

        return changed
Exemplo n.º 5
0
    def tag(self, tag_groups=None, overwrite=False, save=True):
        """Tag the verified documents and merge the results into ``self.tags``.

        :param list tag_groups: TagGroup objects to apply; falls back to all
            values of ``pattern.tag_groups`` when falsy
        :param bool overwrite: Drop stored tags instead of merging into them
        :param bool save: Call ``self.save()`` once the record is updated
        :return list: Tags added or extended with a new document type
        """
        tag_groups = tag_groups or pattern.tag_groups.values()

        # Working list of tags: empty when overwriting, else the stored tags
        # wrapped as ``tagger.Tag`` objects
        merged = []
        if overwrite:
            self.tags = []
        else:
            merged = [tagger.Tag(item) for item in self.tags]

        touched = []

        self.verify(save=False)

        for document_type in self.verified:

            source = getattr(self, DOCUMENT_TYPES_TO_FIELDS[document_type])

            # Skip when the document is unset or reads back empty
            content = source.read() if source is not None else None
            if not content:
                continue

            content = clean(content)

            for tag_group in tag_groups:
                for candidate in tagger.tag(tag_group, content):

                    # Key the extracted values by document type
                    context = {document_type: candidate['context']}
                    group = {document_type: candidate['group']}
                    span = {document_type: candidate['span']}

                    if candidate in merged:
                        # Already known: extend it unless this document
                        # type is already recorded
                        current = merged[merged.index(candidate)]
                        if document_type not in current['context']:
                            current['context'].update(context)
                            current['group'].update(group)
                            current['span'].update(span)
                            touched.append(current)
                    else:
                        # First occurrence: store with the keyed values
                        candidate['context'] = context
                        candidate['group'] = group
                        candidate['span'] = span
                        merged.append(candidate)
                        touched.append(candidate)

        # Plain dictionaries for ODM compatibility
        self.tags = list(map(dict, merged))

        # Record when tagging last ran
        self.date_last_tagged = datetime.datetime.utcnow()

        if save:
            self.save()

        return touched
Exemplo n.º 6
0
def extract_tags(text):
    """Return every tag found in ``text`` as a flat list of dictionaries.

    ``text`` is cleaned once with ``clean``; the cleaned text is then passed
    to ``tagger.tag`` together with each group in ``pattern.tag_groups``.

    :param text: Text to tag; it is passed to ``clean`` before tagging
    :return list: Extracted tags cast to plain ``dict`` objects, in
        tag-group order
    """
    text_clean = clean(text)
    # Flattening in one comprehension replaces ``sum(lists, [])``, which
    # copies the growing result once per group. ``values()`` is used because
    # ``itervalues()`` is not available on Python 3.
    return [dict(tag)
            for tag_group in pattern.tag_groups.values()
            for tag in tagger.tag(tag_group, text_clean)]