Exemplo n.º 1
0
    def build_level_helper(tmp,
                           subtree,
                           max_depth,
                           stem,
                           list_iter=False,
                           recursive=True):
        """Build one level of the tree using the stems lookup.

        Every token of ``tmp`` that matches the regular expression ``stem``
        updates the level named ``ActionTreeGenerator.trans_lookup[stem]``
        inside ``subtree``; a later match overwrites an earlier one.

        Args:
            tmp: Sequence of string tokens to scan.
            subtree: Mapping updated in place; ``subtree[lookup]`` is indexed
                and assigned into.
            max_depth: Not used by this function; kept so the signature stays
                compatible with existing callers.
            stem: Regular expression matched against each token; also the key
                into ``ActionTreeGenerator.trans_lookup``.
            list_iter: When false, only the first value yielded by
                ``helpers.ssconj_doc_iterator`` is stored as ``_id``; when
                true, all yielded values are stored as a list.
            recursive: Forwarded to ``helpers.ssconj_doc_iterator`` in list
                mode only.

        Returns:
            The same ``subtree`` object that was passed in.

        Raises:
            KeyError: If ``stem`` is not a key of ``trans_lookup``.
            StopIteration: In single-value mode, if the iterator yields
                nothing for a matching token.
        """
        lookup = ActionTreeGenerator.trans_lookup[stem]

        for i, w in enumerate(tmp):
            if not re.search(stem, w):
                continue

            is_plural = helpers.is_plural(w)
            if not list_iter:
                subtree[lookup]['_id'] = next(
                    helpers.ssconj_doc_iterator(tmp, i))
            else:
                try:
                    subtree[lookup]['_id'] = list(
                        helpers.ssconj_doc_iterator(tmp,
                                                    i,
                                                    is_plural=is_plural,
                                                    recursive=recursive))
                except Exception:
                    # Best effort: a token whose identifiers cannot be
                    # extracted is skipped. Only ordinary errors are
                    # swallowed, so KeyboardInterrupt/SystemExit propagate.
                    continue
            subtree[lookup][
                'children'] = ActionTreeGenerator.children_loopkup[lookup]

        return subtree
Exemplo n.º 2
0
def test_iterator():
    """Check the values ssconj_doc_iterator yields for a mixed enumeration.

    The sentence combines comma-separated numbers, conjunctions and a range;
    the iterator is started at token 0 with both trailing flags set to True.
    """
    sentence = 'παράγραφοι 6, 7, 8 και 9, 10 και 11, 18 έως 25, 26 και 27'
    tokens = sentence.split(' ')

    # NOTE(review): the tail of this list ('25', '27', '25', '26', '27') looks
    # irregular for "18 έως 25, 26 και 27" -- confirm it is the intended output
    # and not a pinned quirk of the iterator.
    expected = [
        '6', '7', '8', '9', '10', '11', '18', '19', '20', '21', '22', '23',
        '24', '25', '27', '25', '26', '27'
    ]

    produced = list(helpers.ssconj_doc_iterator(tokens, 0, True, True))
    assert produced == expected
Exemplo n.º 3
0
    def get_renumbering(tree, doc):
        """Store the renumbering targets under ``tree['what']['to']``.

        Starting at the index held in ``tree['root']['_id']``, the first
        element of ``doc`` whose ``.text`` equals 'σε' is located and the
        values yielded by ``helpers.ssconj_doc_iterator`` from that position
        are saved as a list. The tree is left untouched when no such element
        exists.

        Returns:
            The same ``tree`` object.
        """
        begin = tree['root']['_id']
        plural = helpers.is_plural(tree['what']['context'])

        # Index of the first 'σε' at or after ``begin``; None if absent.
        marker = next(
            (idx for idx in range(begin, len(doc)) if doc[idx].text == 'σε'),
            None)

        if marker is not None:
            targets = helpers.ssconj_doc_iterator(doc, marker, plural)
            tree['what']['to'] = list(targets)

        return tree
Exemplo n.º 4
0
    def build_level(tmp, subtree, max_depth, stem, list_iter=False):
        """Fill in one level of ``subtree`` using the stems lookup.

        If ``stem`` matches ``subtree['what']['context']`` the level takes its
        ``_id`` from ``subtree['what']['number']`` and gets no children.
        Otherwise the first token of ``tmp`` matching ``stem`` supplies the
        ``_id`` (first yielded value, or the full list when ``list_iter`` is
        true) and the children come from
        ``ActionTreeGenerator.children_loopkup``. ``max_depth`` is not used.

        Returns:
            The same ``subtree`` object.
        """
        level = ActionTreeGenerator.trans_lookup[stem]

        # The amended entity itself is of this level: reuse its number.
        if re.search(stem, subtree['what']['context']):
            subtree[level]['_id'] = subtree['what']['number']
            subtree[level]['children'] = []
            return subtree

        for position, token in enumerate(tmp):
            if not re.search(stem, token):
                continue

            identifiers = helpers.ssconj_doc_iterator(tmp, position)
            subtree[level]['_id'] = (
                list(identifiers) if list_iter else next(identifiers))
            subtree[level]['children'] = (
                ActionTreeGenerator.children_loopkup[level])
            # Only the first matching token is considered.
            break

        return subtree
Exemplo n.º 5
0
    def get_nsubj_fallback(tmp, tree, i, max_what_window=20):
        """Find the amended entity by scanning the tokens around index ``i``.

        For each distance ``j`` from 1 to ``max_what_window`` and each entry
        of ``entities.whats``, the token ``j`` positions after ``i`` is tried
        first and then the token ``j`` positions before it. On the first
        match, 'law' is appended to ``tree['root']['children']`` and
        ``tree['what']`` is replaced by a dict holding the match ``index``,
        its ``context`` and the ``number`` list produced by
        ``helpers.ssconj_doc_iterator``.

        Args:
            tmp: Sequence of string tokens.
            tree: Action tree, modified in place on a match.
            i: Position to search around. Positions outside ``tmp`` are
                skipped, so ``i`` may exceed ``len(tmp)``.
            max_what_window: Largest distance from ``i`` that is examined.

        Returns:
            A ``(found_what, tree, is_plural)`` tuple. ``found_what`` is True
            when an entity was matched (it was previously always False) and
            ``is_plural`` is ``helpers.is_plural`` of the matched entity;
            both are False when nothing matched.
        """
        logging.info('Fallback mode')
        logging.info(tmp)

        size = len(tmp)
        for j in range(1, max_what_window + 1):
            for what in entities.whats:
                # Look ahead first, then behind, at the same distance.
                for index in (i + j, i - j):
                    # Bounds are checked on both sides so a position past the
                    # end cannot raise IndexError and a negative one cannot
                    # silently wrap around.
                    if not 0 <= index < size or what != tmp[index]:
                        continue

                    tree['root']['children'].append('law')
                    tree['what'] = {
                        'index': index,
                        'context': what,
                    }
                    tree['what']['number'] = list(
                        helpers.ssconj_doc_iterator(tmp, index))

                    return True, tree, helpers.is_plural(what)

        return False, tree, False
Exemplo n.º 6
0
    def generate_action_tree_from_string(s,
                                         nested=False,
                                         max_what_window=20,
                                         max_where_window=30,
                                         use_regex=False):
        """Main algorithm for amendment detection.

        The approach followed is hybrid.
        The procedure is outlined here:
        https://github.com/eellak/gsoc2018-3gm/wiki/Algorithms-for-analyzing-Government-Gazette-Documents

        Args:
            s: Text to analyse.
            nested: When true, ``ActionTreeGenerator.nest_tree('root', ...)``
                is called on every subtree before it is collected.
            max_what_window: Search window forwarded to
                ``get_nsubj_fallback`` when ``get_nsubj`` finds nothing.
            max_where_window: Not used by this function.
            use_regex: Not used by this function.

        Returns:
            A list with one tree per detected amendment (after splitting).
        """

        # results are stored here
        trees = []

        # fix par abbrev
        s = helpers.fix_par_abbrev(s)

        # get extracts and non-extracts using helper functions
        parts = tokenizer.tokenizer.split(s, False, '. ')
        extracts, non_extracts = helpers.get_extracts(s)

        non_extracts = ' '.join(non_extracts)
        non_extracts = tokenizer.tokenizer.split(non_extracts, True, '. ')

        # index of the next extract that has not been consumed by an action
        extract_cnt = 0

        for part_cnt, non_extract in enumerate(non_extracts):

            doc = nlp(non_extract)

            # whitespace tokens with surrounding punctuation removed
            tmp = [
                token.strip(string.punctuation)
                for token in non_extract.split(' ')
            ]

            # Detect amendment action
            for action in entities.actions:
                for i, w in enumerate(doc):
                    if action == w.text:
                        tree = collections.defaultdict(dict)
                        tree['root'] = {
                            '_id': i,
                            'action': str(action),
                            'children': []
                        }
                        max_depth = 0

                        logging.info('Found ' + str(action))

                        # these three actions do not consume an extract
                        extract = None
                        if str(action) not in [
                                'διαγράφεται', 'παύεται', 'καταργείται'
                        ]:
                            try:
                                extract = extracts[extract_cnt]
                                extract_cnt += 1
                            except IndexError:
                                extract = None

                        # Detect what is amended
                        found_what, tree, is_plural = ActionTreeGenerator.get_nsubj(
                            doc, i, tree)
                        if found_what:
                            k = tree['what']['index']
                            if tree['what']['context'] not in [
                                    'φράση', 'φράσεις', 'λέξη', 'λέξεις'
                            ]:
                                tree['what']['number'] = list(
                                    helpers.ssconj_doc_iterator(
                                        doc, k, is_plural))
                            else:
                                tree = phrase_fun.detect_phrase_components(
                                    parts[part_cnt], tree)
                                tree['what']['context'] = 'φράση'
                            logging.info(tree['what'])

                        else:
                            # NOTE(review): ``i`` indexes ``doc`` while the
                            # fallback searches ``tmp``; the two tokenisations
                            # may not line up -- confirm this is intended.
                            found_what, tree, is_plural = ActionTreeGenerator.get_nsubj_fallback(
                                tmp, tree, i, max_what_window=max_what_window)

                        # get content
                        if action not in [
                                'διαγράφεται', 'διαγράφονται', 'αναριθμείται',
                                'αναριθμούνται'
                        ]:
                            tree, max_depth = ActionTreeGenerator.get_content(
                                tree, extract, s)
                        if action in ['αναριθμείται', 'αναριθμούνται']:
                            # get renumbering
                            tree = ActionTreeGenerator.get_renumbering(
                                tree, doc)
                            subtrees = ActionTreeGenerator.split_renumbering_tree(
                                tree)

                        # split to subtrees
                        if action not in ['αναριθμείται', 'αναριθμούνται']:
                            subtrees = ActionTreeGenerator.split_tree(tree)

                        # iterate over subtrees
                        for subtree in subtrees:

                            subtree, max_depth = ActionTreeGenerator.get_content(
                                subtree, extract, s, secondary=True)

                            # get latest statute; detection is best effort, but
                            # only ordinary errors are swallowed so that
                            # KeyboardInterrupt/SystemExit still propagate
                            try:
                                law = ActionTreeGenerator.detect_latest_statute(
                                    non_extract)
                            except Exception:
                                law = ''

                            # first level are laws
                            subtree['law'] = {
                                '_id': law,
                                'children': ['article']
                            }

                            splitted = non_extract.split(' ')

                            # build levels bottom up
                            subtree = ActionTreeGenerator.build_levels(
                                splitted, subtree)

                            # nest into dictionary
                            if nested:
                                ActionTreeGenerator.nest_tree('root', subtree)

                            trees.append(subtree)

        return trees