예제 #1
0
def collect_chain(sent, head_token, link):
    """Gather ``head_token`` and every token hanging below it via ``link``.

    The search starts from the tokens ``get_children`` returns for the head
    (queried with ``links=link``) and keeps descending into each discovered
    token's own children until there is nothing left to visit.

    Returns a new list holding all discovered tokens plus ``head_token``,
    ordered by the integer value of their ``ID`` attribute.
    """
    pending = get_children(sent, head_token.attrib['ID'], links=link)
    members = []
    while pending:
        node = pending.pop()
        members.append(node)
        # Queue the node's own dependents so the whole chain gets visited.
        pending += get_children(sent, node.attrib['ID'], links=link)
    # The head goes in last, right before the (stable) sort by numeric ID.
    members.append(head_token)
    members.sort(key=lambda tok: int(tok.attrib['ID']))
    return members
예제 #2
0
def get_fixed_info(sent, head_token):
    """Select the tokens of a fixed expression headed by ``head_token``.

    The head and all of its children (as returned by ``get_children``) are
    ordered by numeric ``ID`` and their lemmas are scanned for three special
    patterns, tried in this order:

    * ``друг ... друг``   -- span from the first 'друг' through the second;
    * ``один ... другой`` -- span from the first 'один' through the last
      'другой' that follows it;
    * ``и так далее``     -- the three consecutive lemmas.

    The first pattern found trims the candidates to its span.  If none is
    found, the candidates are rebuilt from the head plus only those children
    that ``get_children`` returns for ``links='fixed'``.

    Returns:
        A ``(candidate_list, link)`` pair where ``link`` is ``'compound'``
        for the lemma sequence 'точка зрение', ``'flat:name'`` when any lemma
        is two characters long and ends with a dot (initials), and
        ``'fixed'`` otherwise.
    """
    children = get_children(sent, head_token.attrib['ID'])
    candidate_list = sorted(children + [head_token],
                            key=lambda x: int(x.attrib['ID']))
    lemma_list = tuple(item.attrib['LEMMA'] for item in candidate_list)

    friend_start, friend_end = None, None
    onetwo_start, onetwo_end = None, None
    etc_start, etc_end = None, None

    for i, item in enumerate(lemma_list):
        # друг PR друг
        if item == 'друг':
            if friend_start is None:
                friend_start = i
            elif friend_end is None:
                friend_end = i + 1
        # один PR другой
        elif item == 'один':
            # Compare against None explicitly: index 0 is a valid start and
            # must not be overwritten by a later 'один'.
            if onetwo_start is None:
                onetwo_start = i
        elif item == 'другой' and onetwo_start is not None:
            onetwo_end = i + 1

    # и так далее
    # Look for the three lemmas as consecutive tokens, so that an unrelated
    # earlier 'и' (or a lemma merely ending in 'и') is never taken for the
    # start of the phrase.
    etc_phrase = ('и', 'так', 'далее')
    for i in range(len(lemma_list) - len(etc_phrase) + 1):
        if lemma_list[i:i + len(etc_phrase)] == etc_phrase:
            etc_start, etc_end = i, i + len(etc_phrase)
            break

    for trim_start, trim_end in [
        (friend_start, friend_end),
        (onetwo_start, onetwo_end),
        (etc_start, etc_end),
    ]:
        if trim_end is not None:
            candidate_list = candidate_list[trim_start:trim_end]
            lemma_list = tuple(lemma_list[trim_start:trim_end])
            break
    else:
        # No special pattern matched: fall back to the 'fixed' children only.
        children = get_children(sent, head_token.attrib['ID'], links='fixed')
        candidate_list = sorted(children + [head_token],
                                key=lambda x: int(x.attrib['ID']))
        lemma_list = tuple(item.attrib['LEMMA'] for item in candidate_list)

    if ' '.join(lemma_list) == 'точка зрение':
        link = 'compound'  # the only compound
    elif any(len(lemma) == 2 and lemma.endswith('.') for lemma in lemma_list):
        link = 'flat:name'  # initials
    else:
        link = 'fixed'

    return candidate_list, link
예제 #3
0
def flatten(sent, head_token, candidate_list, link_to_use):
    """Re-attach a multi-token expression so its first token becomes the head.

    The first element of ``candidate_list`` takes over ``head_token``'s
    ``DOM`` and ``LINK`` (its ``LINK`` is removed when the old head has
    none).  Every child of any candidate that is not itself a candidate is
    re-attached to that first token, and the remaining candidates are hung on
    it with the relation ``link_to_use + '_already'``.

    Tokens are modified in place; nothing is returned.
    """
    flat_link = link_to_use + '_already'

    # Promote the first candidate into the old head's position.
    leader = candidate_list[0]
    leader.attrib['DOM'] = head_token.attrib['DOM']
    if 'LINK' in head_token.attrib:
        leader.attrib['LINK'] = head_token.attrib['LINK']
    else:
        leader.attrib.pop('LINK', None)

    # Collect zero-based positions (ID - 1) of every child of the candidates.
    outside_slots = set()
    for member in candidate_list:
        for dependent in get_children(sent, member.attrib['ID']):
            outside_slots.add(int(dependent.attrib['ID']) - 1)
    # Drop the candidates themselves: they are re-attached separately below.
    outside_slots.difference_update(
        int(member.attrib['ID']) - 1 for member in candidate_list)

    # Outside children now depend on the new head.
    for slot in outside_slots:
        sent.findall('W')[slot].attrib['DOM'] = leader.attrib['ID']

    # All other words of the expression hang on the new head.
    for follower in candidate_list[1:]:
        follower.attrib['DOM'] = leader.attrib['ID']
        follower.attrib['LINK'] = flat_link
예제 #4
0
def munch(ifiles, ofiles):
    """
    Process all files in ifiles list.
    Output into ofiles list.
    """
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sentence in root[-1].findall(
                'S'):  # step 1: collect token numbers old:new
            numbering = {}
            fantom_number = 0
            token_number = 0
            for token in sentence.findall('W'):
                if token.text != 'FANTOM':
                    token_number += 1
                    fantom_number = 0
                    numbering[token.attrib['ID']] = str(token_number)
                else:
                    fantom_number += 1
                    numbering[token.attrib['ID']] = str(
                        token_number) + '.' + str(fantom_number)

            for word in sentence.findall('W'):  # step 2: assign new numbers
                word.attrib['ID'] = numbering[word.attrib['ID']]
                if word.attrib['DOM'] != '_root':
                    word.attrib['DOM'] = numbering[word.attrib['DOM']]

            for elem in sentence.findall(
                    'W'
            ):  # step 3: add new atribute for enhanced representation
                if elem.attrib['DOM'] == '_root':
                    elem.attrib['ENH'] = '0:root'
                else:
                    elem.attrib[
                        'ENH'] = elem.attrib['DOM'] + ':' + elem.attrib['LINK']

            for token in sentence.findall('W'):  # step 7: fix head ellipsys
                if token.text == 'FANTOM' and token.attrib['DOM'] == '_root':
                    candidate_children = get_children(sentence,
                                                      token.attrib['ID'])
                    children = []
                    fantom_children = []
                    for child in candidate_children:
                        # real children to the left,
                        # fantom children to the right
                        if child.text == 'FANTOM':
                            fantom_children.append(child)
                        else:
                            children.append(child)
                    guy_to_promote = None  # haven't found him yet

                    token.attrib['LINK'] = 'none'

                    if len(children) == 1:
                        guy_to_promote = children[0]
                        new_children = get_children(sentence,
                                                    children[0].attrib['ID'])
                        children[0].attrib['DOM'] = '_root'
                        children[0].attrib['ENH'] = '0:root'
                        del children[0].attrib['LINK']
                        token.attrib['DOM'] = children[0].attrib['ID']

                        if len(new_children) == 1:
                            if new_children[
                                    0].text != 'FANTOM' and new_children[
                                        0].attrib['LINK'] != 'parataxis':
                                new_children[0].attrib['LINK'] = 'orphan'
                        elif len(new_children) == 2 and any(
                                n_ch.attrib['LINK'] == 'fixed'
                                for n_ch in new_children):
                            for ch in new_children:
                                if ch.attrib['LINK'] != 'fixed':
                                    if ch.text != 'FANTOM' and ch.attrib[
                                            'LINK'] != 'parataxis':
                                        ch.attrib['LINK'] = 'orphan'
                                    break
                        else:
                            for ch in new_children:
                                if ch.attrib['LINK'] != 'iobj':
                                    if ch.text != 'FANTOM' and ch.attrib[
                                            'LINK'] != 'parataxis':
                                        ch.attrib['LINK'] = 'orphan'
                                    break

                    elif len(children) >= 2:
                        if token.attrib['FEAT'].split()[0] in {
                                'PROPN', 'NOUN', 'PRON', 'SYM'
                        }:
                            if any(child.attrib['LINK'] == 'nsubj'
                                   for child in children):
                                for item in children:
                                    if item.attrib['LINK'] == 'nsubj':  #**
                                        guy_to_promote = item
                                        item.attrib['DOM'] = '_root'
                                        item.attrib['ENH'] = '0:root'
                                        del item.attrib['LINK']
                                        token.attrib['DOM'] = item.attrib['ID']
                                        for elem in children:
                                            if elem.attrib[
                                                    'ID'] != item.attrib['ID']:
                                                elem.attrib[
                                                    'DOM'] = item.attrib['ID']
                                                if elem.text != 'FANTOM' and elem.attrib[
                                                        'LINK'] != 'parataxis':
                                                    elem.attrib[
                                                        'LINK'] = 'orphan'
                                        break

                            elif any(child.attrib['LINK'] in promotion_nominal
                                     for child in
                                     children):  # UD relations priority
                                children.sort(key=lambda x: promotion_nominal.
                                              get(x.attrib['LINK'], 100))
                                if children[0].attrib['LINK'] != children[
                                        1].attrib['LINK']:  #**
                                    guy_to_promote = children[0]
                                    children[0].attrib['DOM'] = '_root'
                                    children[0].attrib['ENH'] = '0:root'
                                    del children[0].attrib['LINK']
                                    token.attrib['DOM'] = children[0].attrib[
                                        'ID']
                                    for elem in children:
                                        if elem.attrib['ID'] != children[
                                                0].attrib['ID']:
                                            elem.attrib['DOM'] = children[
                                                0].attrib['ID']
                                else:
                                    if any(child.attrib['OLD'] in priority
                                           for child in children
                                           ):  # original relations priority
                                        children.sort(
                                            key=lambda x: priority.get(
                                                x.attrib['OLD'], 100))
                                        if children[0].attrib[
                                                'OLD'] != children[1].attrib[
                                                    'OLD']:  #**
                                            guy_to_promote = children[0]
                                            children[0].attrib['DOM'] = '_root'
                                            children[0].attrib[
                                                'ENH'] = '0:root'
                                            del children[0].attrib['LINK']
                                            token.attrib['DOM'] = children[
                                                0].attrib['ID']
                                            for elem in children:
                                                if elem.attrib[
                                                        'ID'] != children[
                                                            0].attrib['ID']:
                                                    elem.attrib[
                                                        'DOM'] = children[
                                                            0].attrib['ID']

                        elif any(child.attrib['LINK'] in promotion for child in
                                 children):  # UD relations priority
                            children.sort(key=lambda x: promotion.get(
                                x.attrib['LINK'], 100))
                            if children[0].attrib['LINK'] != children[
                                    1].attrib['LINK']:
                                guy_to_promote = children[0]
                                children[0].attrib['DOM'] = '_root'
                                children[0].attrib['ENH'] = '0:root'
                                del children[0].attrib['LINK']
                                token.attrib['DOM'] = children[0].attrib['ID']
                                for elem in children:
                                    if elem.attrib['ID'] != children[0].attrib[
                                            'ID']:
                                        elem.attrib['DOM'] = children[
                                            0].attrib['ID']
                                        if elem.text != 'FANTOM' and elem.attrib[
                                                'LINK'] != 'parataxis':
                                            elem.attrib['LINK'] = 'orphan'
                            else:
                                if any(child.attrib['OLD'] in priority
                                       for child in children
                                       ):  # original relations priority
                                    children.sort(key=lambda x: priority.get(
                                        x.attrib['OLD'], 100))
                                    #if children[0].attrib['OLD'] != children[1].attrib['OLD']:
                                    # we can't distinguish them in any further way,
                                    # so we just pick the first one regardless
                                    guy_to_promote = children[0]
                                    children[0].attrib['DOM'] = '_root'
                                    children[0].attrib['ENH'] = '0:root'
                                    del children[0].attrib['LINK']
                                    token.attrib['DOM'] = children[0].attrib[
                                        'ID']
                                    for elem in children:
                                        if elem.attrib['ID'] != children[
                                                0].attrib['ID']:
                                            elem.attrib['DOM'] = children[
                                                0].attrib['ID']
                                            if elem.text != 'FANTOM' and elem.attrib[
                                                    'LINK'] != 'parataxis':
                                                elem.attrib['LINK'] = 'orphan'

                        else:  # parataxis: 2 examples
                            for elem in children:
                                if elem.attrib['LINK'] == 'parataxis':
                                    guy_to_promote = 'parataxis'
                                    elem.attrib['DOM'] = '_root'
                                    elem.attrib['ENH'] = '0:root'
                                    del elem.attrib['LINK']
                                    token.attrib['DOM'] = elem.attrib['ID']
                                    for it in children:
                                        if it.attrib['ID'] != elem.attrib['ID']:
                                            it.attrib['DOM'] = elem.attrib[
                                                'ID']
                                            if it.text != 'FANTOM' and it.attrib[
                                                    'LINK'] != 'parataxis':
                                                it.attrib['LINK'] = 'orphan'

                    # rehang fantom children onto guy_to_promote
                    for fantom_child in fantom_children:
                        fantom_child.attrib['DOM'] = guy_to_promote.attrib[
                            'ID']
                    break

            for token in sentence.findall('W'):
                # step 4: detect orphan deprel
                if token.text != 'FANTOM':
                    children = get_children(sentence, token.attrib['ID'])
                    if all(child.text != 'FANTOM' for child in children):
                        continue

                    # populate with initial fantoms
                    fantom_list = [
                        child for child in children if child.text == 'FANTOM'
                    ]
                    fantom_queue = [fantom for fantom in fantom_list]

                    while fantom_queue != []:
                        current_fantom = fantom_queue.pop(0)
                        grand_children = get_children(
                            sentence, current_fantom.attrib['ID'])
                        for ch in grand_children:
                            if ch.text == 'FANTOM':
                                fantom_queue.append(ch)
                                fantom_list.append(ch)

                    # fix unexpected orphans in fantoms
                    for fantom in fantom_list:
                        if fantom.attrib['LINK'] == 'orphan':
                            fantom.attrib['LINK'] = fantom.attrib['ENH'].split(
                                ':', maxsplit=1)[1]

                    for initial_fantom in fantom_list[::-1]:

                        children_list = [
                            child for child in get_children(
                                sentence, initial_fantom.attrib['ID'])
                            if child.text != 'FANTOM'
                        ]
                        nominal_successful = False
                        fantom_feat = initial_fantom.attrib['FEAT'].split()[0]

                        if fantom_feat in {
                                'PROPN', 'NOUN', 'PRON', 'SYM', 'ADJ'
                        }:
                            if any(child.attrib['LINK'] == 'nsubj'
                                   for child in children_list):
                                for item in children_list:
                                    if item.attrib['LINK'] == 'nsubj':
                                        item.attrib[
                                            'LINK'] = initial_fantom.attrib[
                                                'LINK']
                                        item.attrib[
                                            'DOM'] = initial_fantom.attrib[
                                                'DOM']
                                        for elem in children_list:
                                            if elem.attrib[
                                                    'ID'] != item.attrib['ID']:
                                                elem.attrib[
                                                    'DOM'] = item.attrib['ID']
                                                elem.attrib['LINK'] = 'orphan'
                                        break
                                nominal_successful = True
                            else:

                                if len(children_list) == 1:
                                    if children_list[0].attrib['LINK'] != 'acl':
                                        children_list[0].attrib[
                                            'LINK'] = initial_fantom.attrib[
                                                'LINK']
                                    children_list[0].attrib[
                                        'DOM'] = initial_fantom.attrib['DOM']
                                    nominal_successful = True
                                else:
                                    promotion_sorted, priority_sorted, evolution_list = None, None, None
                                    if any(child.attrib['LINK'] in
                                           promotion_nominal
                                           for child in children_list
                                           ):  # UD relations priority
                                        promotion_sorted = sorted(
                                            children_list,
                                            key=lambda x: promotion_nominal.
                                            get(x.attrib['LINK'], 100))

                                    if any(child.attrib['OLD'] in
                                           priority_nominal
                                           for child in children_list
                                           ):  # original relations priority
                                        priority_sorted = sorted(
                                            children_list,
                                            key=lambda x: priority_nominal.get(
                                                x.attrib['OLD'], 100))
                                    if promotion_sorted is None:
                                        evolution_list = priority_sorted
                                    elif promotion_sorted[0].attrib[
                                            'LINK'] == promotion_sorted[
                                                1].attrib['LINK']:
                                        if promotion_sorted[0].attrib[
                                                'LINK'] == 'amod':
                                            evolution_list = promotion_sorted
                                        elif priority_sorted is None:
                                            evolution_list = promotion_sorted
                                        elif priority_sorted[0].attrib[
                                                'OLD'] != priority_sorted[
                                                    1].attrib['OLD']:
                                            evolution_list = priority_sorted
                                        else:
                                            evolution_list = promotion_sorted
                                    else:
                                        evolution_list = promotion_sorted

                                    if evolution_list is not None:
                                        children_list = evolution_list

                                        children_list[0].attrib[
                                            'LINK'] = initial_fantom.attrib[
                                                'LINK']
                                        children_list[0].attrib[
                                            'DOM'] = initial_fantom.attrib[
                                                'DOM']
                                        for elem in children_list:
                                            if elem.attrib[
                                                    'ID'] != children_list[
                                                        0].attrib['ID']:
                                                elem.attrib[
                                                    'DOM'] = children_list[
                                                        0].attrib['ID']
                                        nominal_successful = True
                        if not nominal_successful:
                            if len(children_list) == 1:
                                children_list[0].attrib[
                                    'LINK'] = initial_fantom.attrib['LINK']
                                children_list[0].attrib[
                                    'DOM'] = initial_fantom.attrib['DOM']
                            else:
                                promotion_sorted, priority_sorted, evolution_list = None, None, None

                                if any(child.attrib['LINK'] in promotion
                                       for child in
                                       children_list):  # UD relations priority
                                    promotion_sorted = sorted(
                                        children_list,
                                        key=lambda x: promotion.get(
                                            x.attrib['LINK'], 100))

                                if any(child.attrib['OLD'] in priority
                                       for child in children_list
                                       ):  # original relations priority
                                    priority_sorted = sorted(
                                        children_list,
                                        key=lambda x: priority.get(
                                            x.attrib['OLD'], 100))

                                if promotion_sorted is None:
                                    evolution_list = priority_sorted
                                elif promotion_sorted[0].attrib[
                                        'LINK'] == promotion_sorted[1].attrib[
                                            'LINK']:
                                    if priority_sorted is None:
                                        evolution_list = promotion_sorted
                                    elif priority_sorted[0].attrib[
                                            'OLD'] != priority_sorted[
                                                1].attrib['OLD']:
                                        evolution_list = priority_sorted
                                    else:
                                        evolution_list = promotion_sorted
                                else:
                                    evolution_list = promotion_sorted

                                if evolution_list is not None:
                                    children_list = evolution_list
                                    children_list[0].attrib[
                                        'LINK'] = initial_fantom.attrib['LINK']
                                    children_list[0].attrib[
                                        'DOM'] = initial_fantom.attrib['DOM']
                                    for elem in children_list:
                                        if elem.attrib['ID'] != children_list[
                                                0].attrib['ID']:
                                            elem.attrib['DOM'] = children_list[
                                                0].attrib['ID']
                                            if elem.attrib['LINK'] not in {
                                                    'cc', 'mark', 'parataxis',
                                                    'conj'
                                            }:
                                                elem.attrib['LINK'] = 'orphan'
                                else:
                                    if any(child.attrib['LINK'] == 'discourse'
                                           and child.attrib['LEMMA'] == 'нет'
                                           for child in children_list):
                                        for elem in children_list:
                                            if elem.attrib[
                                                    'LINK'] == 'discourse' and elem.attrib[
                                                        'LEMMA'] == 'нет':
                                                elem.attrib[
                                                    'LINK'] = initial_fantom.attrib[
                                                        'LINK']
                                                elem.attrib[
                                                    'DOM'] = initial_fantom.attrib[
                                                        'DOM']
                                                for item in children_list:
                                                    if item.attrib[
                                                            'ID'] != elem.attrib[
                                                                'ID']:
                                                        item.attrib[
                                                            'DOM'] = elem.attrib[
                                                                'ID']
                                                        if item.attrib[
                                                                'LINK'] not in {
                                                                    'cc',
                                                                    'mark',
                                                                    'parataxis',
                                                                    'conj'
                                                                }:
                                                            item.attrib[
                                                                'LINK'] = 'orphan'
                                    elif any(child.attrib['LINK'] == 'advcl'
                                             for child in children_list):
                                        for elem in children_list:
                                            if elem.attrib['LINK'] == 'advcl':
                                                elem.attrib[
                                                    'LINK'] = initial_fantom.attrib[
                                                        'LINK']
                                                elem.attrib[
                                                    'DOM'] = initial_fantom.attrib[
                                                        'DOM']
                                                for item in children_list:
                                                    if item.attrib[
                                                            'ID'] != elem.attrib[
                                                                'ID']:
                                                        item.attrib[
                                                            'DOM'] = elem.attrib[
                                                                'ID']
                                                        if item.attrib[
                                                                'LINK'] not in {
                                                                    'cc',
                                                                    'mark',
                                                                    'parataxis',
                                                                    'conj'
                                                                }:
                                                            item.attrib[
                                                                'LINK'] = 'orphan'
                                    elif any(
                                            child.attrib['LINK'] == 'discourse'
                                            for child in children_list):
                                        for elem in children_list:
                                            if elem.attrib[
                                                    'LINK'] == 'discourse':
                                                elem.attrib[
                                                    'LINK'] = initial_fantom.attrib[
                                                        'LINK']
                                                elem.attrib[
                                                    'DOM'] = initial_fantom.attrib[
                                                        'DOM']
                                                for item in children_list:
                                                    if item.attrib[
                                                            'ID'] != elem.attrib[
                                                                'ID']:
                                                        item.attrib[
                                                            'DOM'] = elem.attrib[
                                                                'ID']
                                                        if item.attrib[
                                                                'LINK'] not in {
                                                                    'cc',
                                                                    'mark',
                                                                    'parataxis',
                                                                    'conj'
                                                                }:
                                                            item.attrib[
                                                                'LINK'] = 'orphan'

            for token in sentence.findall('W'):
                # step 5: delete 'cop' fantom tokens (preparations)
                # well, looks like not only 'cop', but all fantoms
                # that are leaves and have another fantom as a head
                change_number = {}
                if token.text == 'FANTOM':
                    children = get_enh_children(sentence, token.attrib['ID'])
                    if children == []:
                        token.attrib['DEL'] = 'YES'
                        current_fantom = round(float(token.attrib['ID']), 1)
                        start_token = round(
                            float(token.attrib['ID'].split('.')[0]), 1)
                        end_token = start_token + 1
                        for elem in sentence.findall('W'):
                            if start_token < round(float(elem.attrib['ID']),
                                                   1) < end_token:
                                if round(float(elem.attrib['ID']),
                                         1) > current_fantom:
                                    change_number[elem.attrib['ID']] = str(
                                        round(
                                            float(elem.attrib['ID']) - 0.1, 1))
                        if change_number != {}:
                            for fantom in sentence.findall('W'):
                                if fantom.attrib['ID'] in change_number:
                                    fantom.attrib['ID'] = change_number[
                                        fantom.attrib['ID']]
                                if fantom.attrib[
                                        'DOM'] != '_root' and fantom.attrib[
                                            'DOM'] in change_number:
                                    fantom.attrib['DOM'] = change_number[
                                        fantom.attrib['DOM']]
                                enh_no = fantom.attrib['ENH'].split(':')[0]
                                if enh_no in change_number:
                                    fantom.attrib['ENH'] = fantom.attrib[
                                        'ENH'].replace(enh_no,
                                                       change_number[enh_no])

            for token in sentence.findall(
                    'W'):  # step 6: delete 'cop' fantom tokens (deletion)
                if token.text == 'FANTOM' and token.attrib.get(
                        'DEL', 'EMPTY'
                ) == 'YES':  # and token.attrib.get('LINK', 'EMPTY') == 'cop':
                    sentence.remove(token)

            for token in sentence.findall('W'):  # fix orphan + CCONJ 29.11.17
                if token.attrib.get('FEAT', 'EMPTY').split()[0] in {
                        'CCONJ', 'SCONJ'
                } and token.attrib.get('LINK', 'EMPTY') == 'orphan':
                    if token.attrib['ENH'].split(':')[1] == 'orphan':
                        if token.attrib['LEMMA'] == 'чтобы':
                            token.attrib['LINK'] = 'mark'
                        else:
                            token.attrib['LINK'] = 'cc'
                    else:
                        token.attrib['LINK'] = token.attrib['ENH'].split(
                            ':')[1]

            # Something went wrong
            for token in sentence.findall('W'):
                if token.text != 'FANTOM' and '.' in token.get('DOM', ''):
                    print('-' * 20)
                    print(ifname)
                    print('-' * 20)
                    print(token.attrib['DOM'])
                    print(*[
                        ch.attrib['LINK']
                        for ch in get_children(sentence, token.attrib['DOM'])
                    ])
                    print(*[
                        ch.attrib['OLD']
                        for ch in get_children(sentence, token.attrib['DOM'])
                    ])
                    print('-' * 20)
                    for item in sentence.findall('W'):
                        print(item.text, item.attrib['LEMMA'],
                              item.attrib['FEAT'].split()[0],
                              item.attrib['ID'], item.attrib['DOM'],
                              item.attrib.get('LINK', ''), item.attrib['ENH'])
                    print('=' * 20)

        tree.write(ofname, encoding="UTF-8")
    return
예제 #5
0
def _compact_tail(token):
    """Return the text that follows ``token`` with spaces and newlines removed.

    ``Element.tail`` is ``None`` when nothing follows the element, so a
    missing tail is treated as an empty string instead of raising.
    """
    return (token.tail or '').strip().replace(' ', '').replace('\n', '')


def check_citation(sentence, symbol, i, token_id, file_name, start):
    """Re-attach tokens around a citation boundary inside one sentence.

    The ``DOM`` (head) and ``LINK`` (relation) attributes of the ``W``
    children of ``sentence`` are edited in place; nothing is returned.

    Args:
        sentence: element whose ``W`` children are the sentence tokens.
        symbol: not used by this function; kept so existing callers work.
        i: index into the ``W`` list splitting the sentence into
            ``tokens[:i + 1]`` and ``tokens[i + 1:]``
            (presumably the position of the citation mark -- confirm
            against the caller).
        token_id: token ID; in the ``start`` fallback, the first later token
            whose head ID is not greater than it becomes ``parataxis``.
        file_name: path of the file being processed; only its last
            ``/``-separated component is printed in diagnostics.
        start: selects which of the two re-attachment strategies is applied.
    """
    sentence_element = sentence
    sentence = sentence.findall('W')

    def report_exception():
        # Announce that a hard-coded, sentence-specific correction is applied.
        print('\nException:')
        print(' '.join((file_name.split('/')[-1],
                        sentence_element.attrib['ID'])))
        print_sentence(sentence_element)

    def report_correction():
        print('\nCorrected to:')
        print_sentence(sentence_element)

    if start:
        # Root token(s) whose ID contains no '.'; nothing below modifies the
        # tree before this list is used, so it is computed once.
        root_token = [
            token for token in sentence
            if token.attrib['DOM'] == '_root' and '.' not in token.attrib['ID']
        ]
        for tok in sentence[:i + 1]:
            if tok.attrib['DOM'] != '_root':
                continue
            # The root lies in the first part of the sentence.
            if all(ch.attrib.get('LINK', 'EMPTY') != 'parataxis'
                   for ch in get_children(sentence_element,
                                          tok.attrib['ID'])):
                for j, new_tok in enumerate(sentence[i + 1:]):
                    tail = _compact_tail(new_tok)
                    if ',-' in tail or ',"' in tail:
                        for t in sentence[i + 1:i + j + 2]:
                            head = float(t.attrib['DOM'])
                            # NOTE(review): the original condition was
                            # written `A or B and C`, which Python evaluates
                            # as `A or (B and C)`; the FANTOM guard may have
                            # been meant to cover both comparisons. The
                            # original evaluation order is kept here.
                            if head < float(sentence[i + 1].attrib['ID']) or (
                                    head > float(
                                        sentence[i + j + 2].attrib['ID'])
                                    and t.text != 'FANTOM'):
                                t.attrib['LINK'] = 'parataxis'
                                t.attrib['DOM'] = root_token[0].attrib['ID']

                        for t in sentence[i + j + 2:]:
                            if (float(t.attrib['DOM']) < float(
                                    sentence[i + j + 2].attrib['ID'])
                                    and t.text != 'FANTOM'
                                    and t.attrib['LINK'] in ['orphan',
                                                             'conj']):
                                head_token = [
                                    token.attrib for token in sentence
                                    if token.attrib['ID'] == t.attrib['DOM']
                                ]
                                if head_token[0].get('LINK',
                                                     'EMPTY') != 'conj':
                                    t.attrib['LINK'] = 'conj'
                                    t.attrib['DOM'] = root_token[0].attrib[
                                        'ID']
                        break
                else:
                    # No ',-' / ',"' boundary after position i: attach the
                    # first later token headed at or before token_id.
                    local_list = [
                        t for t in sentence[i + 1:]
                        if float(t.attrib['DOM']) <= float(token_id)
                    ]
                    if local_list:
                        local_list[0].attrib['LINK'] = 'parataxis'
                        local_list[0].attrib['DOM'] = root_token[0].attrib[
                            'ID']
            break
        else:
            # No root among tokens[:i + 1].
            for j, t in enumerate(sentence[i + 1:]):
                raw_tail = t.tail or ''
                if (citation_punct_re.search(raw_tail) is not None
                        and end_of_citation_re.search(raw_tail) is not None
                        and whitespace_re.sub('', raw_tail) != '",'):
                    if any(token.attrib['DOM'] == '_root'
                           for token in sentence[i + j + 2:]):
                        break
            else:
                has_boundary = any(
                    ',-' in tail or ',"' in tail
                    for tail in map(_compact_tail, sentence[i + 1:]))
                if not has_boundary:
                    candidates = [
                        t for t in sentence[:i + 1]
                        if t.attrib['DOM'] == root_token[0].attrib['ID']
                    ]

                    if len(candidates) == 1:
                        # Swap roles: the single dependent becomes the root
                        # and the former root hangs off it as parataxis.
                        candidates[0].attrib['DOM'] = '_root'
                        del candidates[0].attrib['LINK']
                        root_token[0].attrib['DOM'] = candidates[0].attrib[
                            'ID']
                        root_token[0].attrib['LINK'] = 'parataxis'
                    # Hard-coded corrections for specific sentences.
                    elif sentence[0].text == 'Николай':
                        report_exception()
                        sentence[0].attrib['LINK'] = 'vocative'
                        sentence[1].attrib['LINK'] = 'flat:name'
                        report_correction()
                    elif sentence[0].text == 'Быстро':
                        report_exception()
                        sentence[3].attrib['DOM'] = '_root'
                        del sentence[3].attrib['LINK']
                        sentence[4].attrib['DOM'] = '3'
                        sentence[4].attrib['LINK'] = 'parataxis'
                        report_correction()
    elif any(token.attrib['DOM'] == '_root' for token in sentence[:i + 1]):
        # Root is in the first part: look for later tokens headed at or
        # before position i + 1.
        candidates = [
            t for t in sentence[i + 1:]
            if t.text != 'FANTOM' and '.' not in t.attrib['DOM']
            and int(t.attrib['DOM']) <= i + 1
        ]
        if len(candidates) == 1:
            if candidates[0].attrib['LINK'] != 'parataxis':
                candidates[0].attrib['LINK'] = 'parataxis'
        elif len(candidates) > 1:
            # Every matching boundary is processed; there is no early exit.
            for j, t in enumerate(sentence[i + 1:]):
                tail = _compact_tail(t)
                if not (',"' in tail or '!"' in tail or '?"' in tail):
                    continue
                candidate = [
                    c for c in sentence[i + 1:i + j + 2]
                    if c.text != 'FANTOM' and '.' not in c.attrib['DOM']
                    and int(c.attrib['DOM']) <= i + 1
                ]
                if len(candidate) == 1:
                    candidate[0].attrib['LINK'] = 'parataxis'
                elif len(candidate) == 2:
                    # Hard-coded corrections for specific sentences.
                    if any(can.text == 'то' and can.attrib['LINK'] == 'mark'
                           for can in candidate):
                        try:
                            sentence[23].attrib['DOM'] = '28'
                        except IndexError:
                            # Sentence too short to be the targeted one.
                            pass
                    elif any(can.text == 'Париже'
                             and can.attrib['LINK'] == 'conj'
                             for can in candidate):
                        report_exception()
                        sentence[9].attrib['LINK'] = 'parataxis'
                        sentence[7].attrib['DOM'] = '10'
                        sentence[7].attrib['LINK'] = 'discourse'
                        report_correction()
                    elif any(can.text == 'смотришь'
                             and can.attrib['LINK'] == 'conj'
                             for can in candidate):
                        report_exception()
                        sentence[12].attrib['LINK'] = 'parataxis'
                        sentence[25].attrib['DOM'] = '13'
                        report_correction()
                elif any(can.text == 'Aravot' for can in candidate):
                    for el in candidate:
                        if el.attrib['LINK'] == 'flat:foreign':
                            el.attrib['DOM'] = '5'
    else:
        # Root is not in the first part.
        for j, t in enumerate(sentence[i + 1:]):
            tail = _compact_tail(t)
            if not (',"' in tail or '!"' in tail or '?"' in tail
                    or '".' in tail):
                continue
            if not any(token.attrib['DOM'] == '_root'
                       for token in sentence[i + 1:i + j + 2]):
                continue
            # Recomputed for each boundary because an earlier swap below
            # changes which token is the root.
            root_token = [
                token for token in sentence
                if token.attrib['DOM'] == '_root'
                and '.' not in token.attrib['ID']
            ]
            children = get_children(sentence_element,
                                    root_token[0].attrib['ID'])
            cand = [
                c for c in children
                if c.attrib['LINK'] in {'parataxis', 'cc'}
            ]
            if len(cand) == 1:
                # Swap roles between the root and its single
                # parataxis/cc child.
                root_token[0].attrib['DOM'] = cand[0].attrib['ID']
                root_token[0].attrib['LINK'] = 'parataxis'
                cand[0].attrib['DOM'] = '_root'
                cand[0].attrib.pop('LINK')
def process_puzzle_meta(pid, overwrite=False, snapshot_threads=15):
    """Compute per-puzzle metrics for puzzle ``pid`` and store them in HDF5.

    Steps, in order:
      1. Fetch (via ``scp``) and parse the solution CSV, keeping the
         lowest-energy record per ``(uuid, count)`` and only solutions whose
         atom count equals the most common one.
      2. Fetch and parse the history CSV and derive parent/child lookups
         restricted to known solutions.
      3. Pull back timestamps of solutions that are later than their
         earliest descendant.
      4. Run ``process_snapshots`` over each solver's solutions in a pool.
      5. Derive the puzzle-level energy frontier, upload/energy baselines
         and secondary-structure counts, and write everything to
         ``<pid>_meta.h5`` under keys ``df``, ``bts`` and ``puz``.

    The local solution CSV is deleted once the store has been written.

    Args:
        pid: puzzle identifier; used in file paths and must be convertible
            with ``int()`` for the structure file name.
        overwrite: when true, recompute and replace an existing meta file.
        snapshot_threads: number of worker processes for the pool.

    Returns:
        None.
    """
    soln_dir = "data/puzzle_solutions/solution_{}".format(pid)
    metafile = "data/puzzle_solutions/solution_{}/{}_meta.h5".format(pid, pid)
    soln_csv_file = "data/puzzle_solutions/solution_{}/{}_soln.csv".format(
        pid, pid)
    hist_csv_file = "data/puzzle_solutions/solution_{}/{}_hist.csv".format(
        pid, pid)

    if os.path.exists(metafile) and not overwrite:
        print(metafile, "exists, will not overwrite")
        return

    # NOTE: TM-score lookup/computation is currently disabled; PuzzleMeta
    # receives None in its place below.

    soln_lookup = {}
    nid_to_sid = {}
    history = {}

    # ---- solutions -------------------------------------------------------
    if not os.path.exists(soln_csv_file):
        print(pid, "fetching soln csv")
        sys.stdout.flush()
        os.makedirs(soln_dir, exist_ok=True)
        subprocess.run([
            "scp", "wannacut:~/foldit/{}".format(soln_csv_file),
            soln_csv_file
        ],
                       stdout=subprocess.DEVNULL)
    with open(soln_csv_file) as fp:
        print(pid, "processing", soln_csv_file)
        sys.stdout.flush()
        soln_in = csv.DictReader(fp, lineterminator='\n')
        for r in soln_in:
            r['pdl'] = json.loads(r['pdl'])
            r['guide_used'] = False
            for p in r['pdl']:
                try:
                    # Some pdl entries have a different header structure and
                    # were parsed incorrectly; skip those.
                    p['header']['score'] = float(p['header']['score'])
                except ValueError:
                    continue
                if p['header']['score'] == 9999.99:
                    r['guide_used'] = True
            r['energy'] = float(r['energy'])
            r['timestamp'] = int(r['timestamp'])
            r['atoms'] = get_atoms(r)
            r.pop('ca')
            r['energies'] = [
                EnergyComponent(*e) for e in json.loads(r['energies'])
            ] if r['energies'] else None
            nid = (r['uuid'], int(r['count']))
            if len(r['pdl']) > 0 and nid != ROOT_NID:
                soln_lookup.setdefault(nid, []).append(r)

    # Keep the lowest-energy record for each node id.
    solns_pre = []
    for nid, ss in soln_lookup.items():
        s = min(ss, key=lambda x: x['energy'])
        soln_lookup[nid] = s
        nid_to_sid[nid] = s['sid']
        if len(s['pdl']) > 0:
            solns_pre.append(s)
    # Drop solutions whose atom count differs from the most common one.
    protein_size, _ = Counter([len(s['atoms'])
                               for s in solns_pre]).most_common(1)[0]
    solns_clean = [s for s in solns_pre if len(s['atoms']) == protein_size]
    # Per-user solution lists, each ordered by timestamp.
    solvers = {
        k: sorted(g, key=lambda x: int(x['timestamp']))
        for k, g in groupby(
            sorted(solns_clean, key=lambda s: s['uid']), lambda s: s['uid'])
    }

    # ---- history ---------------------------------------------------------
    if not os.path.exists(hist_csv_file):
        print(pid, "fetching hist csv")
        sys.stdout.flush()
        subprocess.run([
            "scp", "wannacut:~/foldit/{}".format(hist_csv_file),
            hist_csv_file
        ],
                       stdout=subprocess.DEVNULL)
    with open(hist_csv_file) as fp:
        print(pid, "processing", hist_csv_file)
        hist_in = csv.DictReader(fp,
                                 fieldnames=[
                                     "pid", "uuid", "count", "parent_uuid",
                                     "parent_count"
                                 ])
        for r in hist_in:
            key = (r['parent_uuid'], int(r['parent_count']))
            r['count'] = int(r['count'])
            r['parent_count'] = int(r['parent_count'])
            history.setdefault(key, []).append(r)

    # Breadth-first walk from the root, recording each node's direct parent.
    parents = {}
    children = get_children(ROOT_NID, history)
    children = [(ROOT_NID, c) for c in children]
    while len(children) > 0:
        for p, c in children:
            assert c not in parents
            parents[c] = p
        children = [(c, nc) for p, c in children
                    for nc in get_children(c, history)]

    logging.debug("{} generating lookups".format(pid))
    # Nearest ancestor that is itself a known solution (or the root).
    parent_lookup = {}
    for k in soln_lookup:
        parent = parents[k]
        while parent not in soln_lookup and parent != ROOT_NID:
            parent = parents[parent]
        assert parent in soln_lookup or parent == ROOT_NID
        parent_lookup[k] = parent
    child_lookup = {
        parent: [c for p, c in g]
        for parent, g in groupby(
            sorted([(p, c)
                    for c, p in parent_lookup.items()]), lambda x: x[0])
    }

    descendants_memo = {}

    def get_descendants(nid):
        if nid in descendants_memo:
            return descendants_memo[nid]
        # soln_lookup is generated from the list of solutions passed in which are all from a single user
        # the history may include evolver children, which we have to avoid trying to look up
        kept = [
            c for c in child_lookup[nid]
            if c in soln_lookup or any(x in soln_lookup
                                       for x in get_descendants(c))
        ] if nid in child_lookup else []
        descendants_memo[nid] = kept + [
            d for c in kept for d in get_descendants(c)
        ]
        return descendants_memo[nid]

    logging.debug("{} correcting timestamps".format(pid))
    bases = [
        get_nid(s) for s in soln_lookup.values()
        if parent_lookup[get_nid(s)] == ROOT_NID
    ]
    while len(bases) > 0:
        nid = bases.pop(0)
        if nid in child_lookup:
            if nid in soln_lookup:
                cur = soln_lookup[nid]
                descendants = [
                    soln_lookup[x] for x in get_descendants(nid)
                    if x in soln_lookup
                ]
                earliest = min(c['timestamp'] for c in descendants)
                if cur['timestamp'] > earliest:
                    # A solution cannot be newer than its descendants: move
                    # it 300 before the earliest one, but keep it after its
                    # own parent.
                    grandparent = {'timestamp': 0}
                    if parent_lookup[nid] in soln_lookup:
                        grandparent = soln_lookup[parent_lookup[nid]]
                    assert grandparent['timestamp'] <= earliest
                    cur['timestamp'] = max(earliest - 300,
                                           grandparent['timestamp'] + 1)
            bases.extend([c for c in child_lookup[nid]])

    delta = 3600

    print(pid, "computing soln metrics")
    sys.stdout.flush()

    param_ranges = {
        "energy_threshold_frac": [0.25, 0.5, 0.75],
        "rate_threshold": [-0.001, -0.01],
        "diff_threshold": [-1, -10, -25],
        "tm_threshold": [0.5, 0.9, 1]
    }

    # One dict per combination of the parameter values above.
    breakthrough_params = [
        dict(d) for d in product(*[[(k, v) for v in vs]
                                   for k, vs in param_ranges.items()])
    ]

    logging.debug(
        "{} passing parent_lookup, size {} and child_lookup, size {} to threads"
        .format(pid, sys.getsizeof(parent_lookup),
                sys.getsizeof(child_lookup)))
    with Pool(snapshot_threads) as snapshot_pool:
        # Largest solvers first so long jobs start early.
        acc = snapshot_pool.map_async(
            partial(process_snapshots,
                    delta=delta,
                    breakthrough_params=breakthrough_params,
                    parent_lookup=parent_lookup,
                    child_lookup=child_lookup,
                    nid_to_sid=nid_to_sid),
            sorted(solvers.values(), key=len, reverse=True),
            chunksize=1).get()
    df = pd.DataFrame(data=[d for d, _ in acc if d is not None])
    # Breakthrough frames returned by the workers are not collected.
    breakthroughs = pd.DataFrame()

    print(pid, 'metrics computed')
    sys.stdout.flush()

    # Puzzle-wide frontier: running minimum of the best energy seen at each
    # timestamp. Series.items() is used because iteritems() was removed in
    # pandas 2.0.
    en_lookup = {}
    for _, z in df.apply(lambda r: zip(r['timestamps'], r['energies']),
                         axis=1).items():
        for t, e in z:
            en_lookup.setdefault(t, []).append(e)
    pfront = np.minimum.accumulate(
        [min(es) for t, es in sorted(en_lookup.items())])

    # np.max accepts both the array-valued ``.mode`` of older SciPy and the
    # scalar returned by newer releases for 1-D input.
    upload_baseline = np.max(
        stats.mode(
            np.concatenate(
                df.upload_rate[df.upload_rate.notnull()].values)).mode)
    df = df.assign(upload_ratio=df.upload_rate / upload_baseline)

    # it appears there's a clustering of energies for solutions that have only one or two actions (usually repack), so we'll use that as the energy baseline
    energy_baseline = scipy.stats.mode(
        df[df.first_pdb.notnull() & df.first_pdb.apply(lambda p: p and sum(
            p['pdl'][0]['actions'].values()) < 3)].first_pdb.apply(
                lambda p: round(p['energy']))).mode.min()

    print(pid, "getting structure")
    struct_file = "data/puzzle_solutions/solution_{}/{:010}.ir_puzzle.pdb".format(
        pid, int(pid))
    if not os.path.exists(struct_file):
        subprocess.run([
            "scp", "wannacut:~/foldit/{}".format(struct_file), struct_file
        ],
                       stdout=subprocess.DEVNULL)
    with open(struct_file) as init_pdb:
        content = init_pdb.read()
    # Non-ATOM lines that start with whitespace and digits: their first two
    # fields are read as (residue index, structure label).
    sec_struct = {
        idx: label
        for idx, label in [
            x.split()[:2] for x in re.findall(r'^(?!ATOM)\s+?\d+.*',
                                              content, re.MULTILINE)
        ]
    }
    assert all(v in ['H', 'E', 'L', 'C'] for v in sec_struct.values())
    # Number of ATOM records per value of the sixth whitespace field.
    atoms = Counter([
        x.split()[5] for x in re.findall('^ATOM.*', content, re.MULTILINE)
    ])
    structure = {
        'loop': [
            atoms[idx] for idx, label in sec_struct.items()
            if label == 'C' or label == 'L'
        ],
        'helix': [
            atoms[idx] for idx, label in sec_struct.items() if label == 'H'
        ],
        'sheet': [
            atoms[idx] for idx, label in sec_struct.items() if label == 'E'
        ]
    }
    meta = PuzzleMeta(pid, None, pfront, upload_baseline, energy_baseline,
                      structure)
    print(pid, 'puzzle metrics computed')
    sys.stdout.flush()

    print(pid, "writing soln output")
    sys.stdout.flush()
    if os.path.exists(metafile) and overwrite:
        logging.debug("{} deleting existing meta file".format(pid))
        subprocess.run(['rm', metafile
                        ])  # remove to avoid ever accumulating data files
    # Context manager guarantees the store is closed even if a write fails.
    with pd.HDFStore(metafile) as store:
        store["df"] = df
        store["bts"] = breakthroughs
        store["puz"] = pd.Series(
            [meta])  # must be wrapped in a pandas data structure
    subprocess.run(["rm", soln_csv_file])
    print(pid, "done")
예제 #7
0
# Hand-written corrections for individual sentences in which a 'не' token whose
# text is 'FANTOM' has no dependents at all.
# Layout: sentence ID -> token ID -> attributes to overwrite on that token.
_FANTOM_NE_PATCHES = {
    '217': {
        '11': {'DEL': 'YES'},
        '12': {'LEMMA': 'нечего'},
    },
    '94': {
        '11': {'DEL': 'YES'},
    },
    '169': {
        '6': {'DOM': '14'},
        '9': {'DEL': 'YES'},
        '10': {'LEMMA': 'некого'},
        '11': {'DOM': '13'},
        '12': {'DEL': 'YES'},
        '13': {'LEMMA': 'негде', 'DOM': '10'},
    },
}


def _infinitive_dependents(sentence, children):
    """Return the dependents of the first ``VerbForm=Inf`` item in *children*.

    Returns an empty list when no child is an infinitive, so callers never
    iterate over an unbound or stale list.
    """
    for child in children:
        if 'VerbForm=Inf' in child.attrib['FEAT']:
            return get_children(sentence, child.attrib['ID'])
    return []


def _lacks_adposition(sentence, word):
    """Return True if no dependent of *word* has 'ADP' as its first FEAT field."""
    return all(dep.attrib['FEAT'].split()[0] != 'ADP'
               for dep in get_children(sentence, word.attrib['ID']))


def _redirect_dependents(sentence, old_head, new_head):
    """Point every token whose DOM is *old_head* at *new_head* instead."""
    for word in sentence.findall('W'):
        if word.attrib['DOM'] == old_head:
            word.attrib['DOM'] = new_head


def _promote_to_root(sentence, token, word):
    """Make *word* the sentence root in place of *token*.

    *word* receives its replacement lemma from ``suspicious`` and loses its
    LINK attribute; *token* is marked for deletion and its dependents are
    re-attached to *word*.
    """
    word.attrib['LEMMA'] = suspicious[word.attrib['LEMMA']]
    # DOM is reset before the redirect so *word* never ends up heading itself.
    word.attrib['DOM'] = '_root'
    del word.attrib['LINK']
    token.attrib['DEL'] = 'YES'
    _redirect_dependents(sentence, token.attrib['ID'], word.attrib['ID'])


def _absorb_into_negation(sentence, token, children, keep_surface):
    """Merge the first ``suspicious`` dependent of an infinitive into *token*.

    Only the first item whose lemma is in ``suspicious`` is considered; it is
    merged only when it has no adposition dependent.  With *keep_surface* the
    item's text is appended to the token's text; otherwise the text is left
    untouched.  The absorbed item is marked for deletion.
    """
    for item in _infinitive_dependents(sentence, children):
        if item.attrib['LEMMA'] not in suspicious:
            continue
        if _lacks_adposition(sentence, item):
            token.attrib['LEMMA'] = suspicious[item.attrib['LEMMA']]
            if keep_surface:
                token.text = token.text + item.text
            token.attrib['FEAT'] = item.attrib['FEAT']
            item.attrib['DEL'] = 'YES'
        break


def _patch_fantom_with_fantom_child(sentence):
    """Apply the fixed-position repair for a FANTOM 'не' with a FANTOM child.

    Token 11 is dropped, token 2 becomes the root with its replacement lemma,
    and everything that depended on token 1 is re-attached to token 2.
    """
    for elem in sentence.findall('W'):
        if elem.attrib['ID'] == '11':
            elem.attrib['DEL'] = 'YES'
        if elem.attrib['ID'] == '2':
            elem.attrib['LEMMA'] = suspicious[elem.attrib['LEMMA']]
            elem.attrib['DOM'] = '_root'
            del elem.attrib['LINK']
        if elem.attrib['DOM'] == '1':
            # Was a no-op comparison (`==`); the re-attachment needs `=`.
            elem.attrib['DOM'] = '2'


def _resolve_fantom_negation(sentence, token, children, head_token):
    """Handle a FANTOM 'не' whose (non-empty) children are all real tokens."""
    if all('VerbForm=Inf' not in ch.attrib['FEAT'] for ch in children):
        # No infinitive to work with: only one sentence has a manual repair.
        if sentence.attrib['ID'] == '440':
            for elem in sentence.findall('W'):
                if elem.attrib['ID'] == '16':
                    elem.attrib['DOM'] = '18'
                if elem.attrib['ID'] == '17':
                    elem.attrib['DEL'] = 'YES'
                if elem.attrib['ID'] == '18':
                    elem.attrib['LEMMA'] = suspicious[elem.attrib['LEMMA']]

    # Unlike the non-FANTOM case, every infinitive child is processed here.
    for child in children:
        if 'VerbForm=Inf' not in child.attrib['FEAT']:
            continue
        gr_children = get_children(sentence, child.attrib['ID'])
        if head_token is None:
            # The FANTOM token has no head: a real word must take its place.
            for item in gr_children:
                if item.attrib['LEMMA'] in suspicious:
                    if _lacks_adposition(sentence, item):
                        _promote_to_root(sentence, token, item)
                    break
            else:
                # Nothing suitable under the infinitive; fall back to the
                # direct children of the FANTOM token.
                for broken in children:
                    if broken.attrib['LEMMA'] in suspicious:
                        _promote_to_root(sentence, token, broken)
        else:
            # The FANTOM token stays and takes over lemma, features and text
            # of each qualifying item; the item itself is dropped.
            for item in gr_children:
                if (item.attrib['LEMMA'] in suspicious
                        and _lacks_adposition(sentence, item)):
                    token.attrib['LEMMA'] = suspicious[item.attrib['LEMMA']]
                    token.attrib['FEAT'] = item.attrib['FEAT']
                    token.text = item.text
                    item.attrib['DEL'] = 'YES'
                    _redirect_dependents(sentence, item.attrib['ID'],
                                         token.attrib['ID'])


def _reannotate_negation(sentence):
    """Step 0/1: detect verbal 'не' tokens and re-annotate them."""
    for token in sentence.findall('W'):
        if not (token.attrib['LEMMA'] == 'не'
                and 'VERB' in token.attrib['FEAT']):
            continue
        _, _, _, head_token, _, _, _ = get_info(token, sentence)
        children = get_children(sentence, token.attrib['ID'])
        fantom_child = any(ch.text == 'FANTOM' for ch in children)

        if token.text != 'FANTOM':
            # The surface text is only concatenated when no child is FANTOM.
            _absorb_into_negation(sentence, token, children,
                                  keep_surface=not fantom_child)
        elif children == []:
            patches = _FANTOM_NE_PATCHES.get(sentence.attrib['ID'])
            if patches:
                for elem in sentence.findall('W'):
                    elem.attrib.update(patches.get(elem.attrib['ID'], {}))
        elif fantom_child:
            _patch_fantom_with_fantom_child(sentence)
        else:
            _resolve_fantom_negation(sentence, token, children, head_token)


def _renumber_sentence(sentence):
    """Steps 2-4: renumber the kept tokens, remap DOM, drop DEL tokens.

    A token marked with DEL maps to the number of the last kept token before
    it ('0' if there is none), so its former dependents end up attached there.
    """
    words = sentence.findall('W')

    # step 2: collect token numbers old:new
    numbering = {}
    kept = 0
    for word in words:
        if 'DEL' not in word.attrib:
            kept += 1
        numbering[word.attrib['ID']] = str(kept)

    # step 3: assign new numbers
    for word in words:
        word.attrib['ID'] = numbering[word.attrib['ID']]
        if word.attrib['DOM'] != '_root':
            word.attrib['DOM'] = numbering[word.attrib['DOM']]

    # step 4: remove tokens (``words`` is a list, so removal is safe here)
    for word in words:
        if 'DEL' in word.attrib:
            sentence.remove(word)


def _fix_conditional_mood(sentence):
    """Mood=Cnd fix for the particles/conjunctions 'бы', 'б', 'чтобы', 'чтоб'."""
    for token in sentence.findall('W'):
        if token.attrib['LEMMA'] not in {'бы', 'б', 'чтобы', 'чтоб'}:
            continue
        _, pos, _, head_token, _, _, _ = get_info(token, sentence)
        try:
            if head_token.attrib['LEMMA'] not in forbidden_head:
                if pos in {'SCONJ', 'PART'}:
                    token.attrib['FEAT'] = token.attrib['FEAT'] + ' Mood=Cnd'
                else:
                    token.attrib['FEAT'] = token.attrib['FEAT'].replace(
                        ' Foreign=Yes', '')
        except Exception:
            # Best effort: e.g. a token without a head.  Dump the sentence and
            # carry on, but let KeyboardInterrupt/SystemExit propagate.
            print('Something went wrong')
            print(*[(elem.text, elem.tail.rstrip('\n'),
                     elem.attrib) for elem in sentence],
                  sep='\n')
            print()


def munch(ifiles, ofiles):
    """
    Process all files in ifiles list.
    Output into ofiles list.

    Each input file is parsed as XML; the ``S`` elements under the last child
    of the root are treated as sentences and their ``W`` children as tokens.
    Three passes are run over every file, in this order:

    1. verbal 'не' tokens are re-annotated and tokens to drop get ``DEL``;
    2. tokens are renumbered, ``DOM`` references remapped and ``DEL`` tokens
       removed;
    3. ' Mood=Cnd' is appended to (or ' Foreign=Yes' stripped from) the FEAT
       of 'бы'-type tokens whose head lemma is not in ``forbidden_head``.

    Inputs and outputs are paired with ``zip``, so surplus entries in the
    longer list are ignored.  Returns None.
    """
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        sentences = tree.getroot()[-1].findall('S')

        for sentence in sentences:
            _reannotate_negation(sentence)
        for sentence in sentences:
            _renumber_sentence(sentence)
        for sentence in sentences:
            _fix_conditional_mood(sentence)

        tree.write(ofname, encoding="UTF-8")
    return