def build_level_helper(tmp, subtree, max_depth, stem, list_iter=False, recursive=True):
    """Build one level of the tree using the stems lookup.

    Every token of ``tmp`` that matches ``stem`` updates the entry
    ``subtree[lookup]``, where ``lookup`` is
    ``ActionTreeGenerator.trans_lookup[stem]``. The loop does not stop at
    the first match, so a later matching token overwrites the values
    written for an earlier one.

    Args:
        tmp: Sequence of string tokens to scan.
        subtree: Mapping updated in place. ``subtree[lookup]`` must be
            subscriptable (e.g. the mapping creates missing keys).
        max_depth: Not used by this function; kept for interface
            compatibility.
        stem: Regular expression searched in each token; also the key
            into ``ActionTreeGenerator.trans_lookup``.
        list_iter: When false, only the first value produced by
            ``helpers.ssconj_doc_iterator`` is stored under ``'_id'``.
            When true, all produced values are stored as a list.
        recursive: Forwarded to ``helpers.ssconj_doc_iterator`` in list
            mode only.

    Returns:
        The same ``subtree`` object that was passed in.
    """
    lookup = ActionTreeGenerator.trans_lookup[stem]

    for i, w in enumerate(tmp):
        if not re.search(stem, w):
            continue

        is_plural = helpers.is_plural(w)
        if not list_iter:
            subtree[lookup]['_id'] = next(
                helpers.ssconj_doc_iterator(tmp, i))
        else:
            try:
                subtree[lookup]['_id'] = list(
                    helpers.ssconj_doc_iterator(
                        tmp, i, is_plural=is_plural, recursive=recursive))
            except Exception:
                # Best effort: a token whose numbering cannot be read is
                # skipped. ``Exception`` (rather than a bare ``except``)
                # lets KeyboardInterrupt / SystemExit propagate.
                continue

        subtree[lookup]['children'] = ActionTreeGenerator.children_loopkup[lookup]

    return subtree
def test_iterator():
    """Check the values ``helpers.ssconj_doc_iterator`` yields for a sentence
    mixing comma lists, conjunctions ('και') and a range ('έως')."""
    sentence = 'παράγραφοι 6, 7, 8 και 9, 10 και 11, 18 έως 25, 26 και 27'
    tokens = sentence.split(' ')

    leading = ['6', '7', '8', '9', '10', '11']
    ranged = ['18', '19', '20', '21', '22', '23', '24', '25']
    # NOTE(review): the input ends with "26 και 27", yet the expected tail
    # starts with an extra '27', '25' -- confirm this is the iterator's
    # intended output and not a typo in the expectation.
    trailing = ['27', '25', '26', '27']
    expected = leading + ranged + trailing

    # Third positional argument is the plural flag; the fourth is presumably
    # the ``recursive`` switch -- TODO confirm against the helper signature.
    produced = list(helpers.ssconj_doc_iterator(tokens, 0, True, True))
    assert produced == expected
def get_renumbering(tree, doc):
    """Fill in the renumbering target of ``tree``.

    Scans ``doc`` from the index stored in ``tree['root']['_id']`` for the
    first token whose text is 'σε'. If one is found, the values produced by
    ``helpers.ssconj_doc_iterator`` at that position are stored as a list
    under ``tree['what']['to']``. When no such token exists the tree is
    returned unchanged.
    """
    first = tree['root']['_id']
    plural = helpers.is_plural(tree['what']['context'])

    # Position of the first 'σε' token at or after the action, if any.
    marker = next(
        (pos for pos in range(first, len(doc)) if doc[pos].text == 'σε'),
        None,
    )
    if marker is not None:
        tree['what']['to'] = list(
            helpers.ssconj_doc_iterator(doc, marker, plural))

    return tree
def build_level(tmp, subtree, max_depth, stem, list_iter=False):
    """Build one level of the tree using the stems lookup.

    If ``stem`` matches ``subtree['what']['context']`` the level is filled
    from ``subtree['what']['number']`` and gets no children. Otherwise the
    first token of ``tmp`` matching ``stem`` supplies the identifier: the
    first value of ``helpers.ssconj_doc_iterator``, or all of its values as
    a list when ``list_iter`` is true. The ``subtree`` is modified in place
    and returned; ``max_depth`` is not used.
    """
    level_key = ActionTreeGenerator.trans_lookup[stem]

    # The amended element itself sits on this level: it is a leaf.
    if re.search(stem, subtree['what']['context']):
        subtree[level_key]['_id'] = subtree['what']['number']
        subtree[level_key]['children'] = []
        return subtree

    for position, token in enumerate(tmp):
        if not re.search(stem, token):
            continue
        numbers = helpers.ssconj_doc_iterator(tmp, position)
        subtree[level_key]['_id'] = list(numbers) if list_iter else next(numbers)
        subtree[level_key]['children'] = ActionTreeGenerator.children_loopkup[level_key]
        # Only the first matching token is considered.
        break

    return subtree
def get_nsubj_fallback(tmp, tree, i, max_what_window=20):
    """Locate the amended element by scanning the tokens around the action.

    For each distance ``j`` from 1 to ``max_what_window`` and each entry of
    ``entities.whats``, the token ``j`` positions after index ``i`` is
    checked first, then the token ``j`` positions before it. On the first
    match, 'law' is appended to ``tree['root']['children']`` and
    ``tree['what']`` is set to a dict with the keys ``'index'``,
    ``'context'`` and ``'number'`` (the list of values produced by
    ``helpers.ssconj_doc_iterator`` at the matched position).

    Args:
        tmp: Sequence of string tokens.
        tree: Tree mapping, updated in place on a match.
        i: Index around which to search. It may lie outside ``tmp``;
            out-of-range probe positions are simply skipped.
        max_what_window: Maximum distance from ``i`` that is probed.

    Returns:
        A tuple ``(found_what, tree, is_plural)``. ``found_what`` is True
        when a match was found; without a match the result is
        ``(False, tree, False)``.
    """
    logging.info('Fallback mode')
    logging.info(tmp)

    last_index = len(tmp) - 1
    for j in range(1, max_what_window + 1):
        for what in entities.whats:
            # Forward position first, then backward, as a single bounded
            # probe so neither side can index outside ``tmp``.
            for index in (i + j, i - j):
                if 0 <= index <= last_index and what == tmp[index]:
                    tree['root']['children'].append('law')
                    tree['what'] = {
                        'index': index,
                        'context': what,
                    }
                    tree['what']['number'] = list(
                        helpers.ssconj_doc_iterator(tmp, index))
                    is_plural = helpers.is_plural(what)
                    return True, tree, is_plural

    return False, tree, False
def generate_action_tree_from_string(s,
                                     nested=False,
                                     max_what_window=20,
                                     max_where_window=30,
                                     use_regex=False):
    """Main algorithm for amendment detection.

    The approach followed is hybrid. The procedure is outlined here:
    https://github.com/eellak/gsoc2018-3gm/wiki/Algorithms-for-analyzing-Government-Gazette-Documents

    The text is split into extracts and non-extract sentences. In every
    non-extract sentence each token equal to one of ``entities.actions``
    starts a tree; the amended element, its content, the latest statute
    and the remaining levels are then attached to it.

    Args:
        s: The amendment text to analyze.
        nested: When true, ``ActionTreeGenerator.nest_tree('root', ...)``
            is called on every resulting tree before it is collected.
        max_what_window: Not used in this function body.
        max_where_window: Not used in this function body.
        use_regex: Not used in this function body.

    Returns:
        A list with one tree per detected action and sub-tree.
    """
    # Actions that do not consume an extract.
    no_extract_actions = ('διαγράφεται', 'παύεται', 'καταργείται')
    # Actions for which no content is attached to the main tree.
    no_content_actions = (
        'διαγράφεται', 'διαγράφονται', 'αναριθμείται', 'αναριθμούνται')
    renumbering_actions = ('αναριθμείται', 'αναριθμούνται')
    phrase_contexts = ('φράση', 'φράσεις', 'λέξη', 'λέξεις')

    # results are stored here
    trees = []

    # fix par abbrev
    s = helpers.fix_par_abbrev(s)

    # get extracts and non-extracts using helper functions
    parts = tokenizer.tokenizer.split(s, False, '. ')
    extracts, non_extracts = helpers.get_extracts(s)

    non_extracts = ' '.join(non_extracts)
    non_extracts = tokenizer.tokenizer.split(non_extracts, True, '. ')

    extract_cnt = 0

    for part_cnt, non_extract in enumerate(non_extracts):
        doc = nlp(non_extract)
        tmp = [
            token.strip(string.punctuation)
            for token in non_extract.split(' ')
        ]

        # Detect amendment action
        for action in entities.actions:
            for i, w in enumerate(doc):
                if action != w.text:
                    continue

                tree = collections.defaultdict(dict)
                tree['root'] = {
                    '_id': i,
                    'action': str(action),
                    'children': []
                }
                max_depth = 0

                logging.info('Found ' + str(action))

                # Extracts are consumed in order, one per action that
                # needs one; running out of extracts is tolerated.
                extract = None
                if str(action) not in no_extract_actions:
                    try:
                        extract = extracts[extract_cnt]
                        extract_cnt += 1
                    except IndexError:
                        extract = None

                # Detect what is amended
                found_what, tree, is_plural = ActionTreeGenerator.get_nsubj(
                    doc, i, tree)
                if found_what:
                    k = tree['what']['index']
                    if tree['what']['context'] not in phrase_contexts:
                        tree['what']['number'] = list(
                            helpers.ssconj_doc_iterator(doc, k, is_plural))
                    else:
                        # NOTE(review): ``parts`` and ``non_extracts`` come
                        # from different splits; confirm that ``part_cnt``
                        # is always a valid index into ``parts``.
                        tree = phrase_fun.detect_phrase_components(
                            parts[part_cnt], tree)
                        tree['what']['context'] = 'φράση'
                    logging.info(tree['what'])
                else:
                    found_what, tree, is_plural = \
                        ActionTreeGenerator.get_nsubj_fallback(tmp, tree, i)

                # get content
                if action not in no_content_actions:
                    tree, max_depth = ActionTreeGenerator.get_content(
                        tree, extract, s)

                if action in renumbering_actions:
                    # get renumbering
                    tree = ActionTreeGenerator.get_renumbering(tree, doc)
                    subtrees = ActionTreeGenerator.split_renumbering_tree(
                        tree)
                else:
                    # split to subtrees
                    subtrees = ActionTreeGenerator.split_tree(tree)

                # iterate over subtrees
                for subtree in subtrees:
                    subtree, max_depth = ActionTreeGenerator.get_content(
                        subtree, extract, s, secondary=True)

                    # get latest statute; failure to detect one is not
                    # fatal, but KeyboardInterrupt / SystemExit must not be
                    # swallowed, hence ``Exception`` and not
                    # ``BaseException``.
                    try:
                        law = ActionTreeGenerator.detect_latest_statute(
                            non_extract)
                    except Exception:
                        law = ''

                    # first level are laws
                    subtree['law'] = {
                        '_id': law,
                        'children': ['article']
                    }

                    splitted = non_extract.split(' ')

                    # build levels bottom up
                    subtree = ActionTreeGenerator.build_levels(
                        splitted, subtree)

                    # nest into dictionary
                    if nested:
                        ActionTreeGenerator.nest_tree('root', subtree)

                    trees.append(subtree)

    return trees