def main(ifiles, ofiles):
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sentence in root[-1].findall('S'):
            list_of_compounds = []
            for i, token in enumerate(sentence.findall('W')):
                if token.attrib.get('FEAT', 'EMPTY').split()[0] == 'COM':
                    link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(token, sentence)
                    children = get_children_attrib(sentence, token.attrib['ID'])
                    if head_token.text != 'FANTOM':
                        list_of_compounds.append((token, head_token, children))
            # re-attach the children of each COM token to its head
            for elem in list_of_compounds:
                wordf, head_word, children = elem
                if children != []:
                    for child in children:
                        child['DOM'] = head_word.attrib['ID']
            # remove the COM token itself and renumber IDs/DOMs after it
            for elem in list_of_compounds:
                wordf, head_word, children = elem
                shift_position = wordf.attrib['ID']
                for elem in sentence.findall('W'):
                    if int(elem.attrib['ID']) == int(shift_position):
                        sentence.remove(elem)
                        break
                for item in sentence.findall('W'):
                    if int(item.attrib['ID']) > int(shift_position):
                        item.attrib['ID'] = str(int(item.attrib['ID']) - 1)
                    if item.attrib.get('DOM', 'EMPTY') != '_root' and int(item.attrib['DOM']) > int(shift_position) - 1:
                        item.attrib['DOM'] = str(int(item.attrib['DOM']) - 1)
        tree.write(ofname, encoding='utf-8')
    return
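The treebank-conversion snippets in this collection (the function above and the munch/main functions further down) all unpack get_info(token, sentence) into seven values and fetch dependents through get_children_attrib(sentence, token_id). Neither helper is included here; the following is only a minimal sketch of what the call sites imply, not the project's actual implementation.

def get_info(token, sentence):
    """Sketch: (link, pos, feats, head_token, head_pos, head_feats, head_root) for a <W> element.

    Inferred from how the callers unpack and use the result; the real helper may differ.
    """
    link = token.attrib.get('LINK', 'EMPTY')
    featstring = token.attrib.get('FEAT', 'EMPTY').split()
    pos, feats = featstring[0], featstring[1:]
    head_token = head_pos = head_feats = head_root = None
    for cand in sentence.findall('W'):
        if cand.attrib.get('ID') == token.attrib.get('DOM'):
            head_token = cand
            head_string = cand.attrib.get('FEAT', 'EMPTY').split()
            head_pos, head_feats = head_string[0], head_string[1:]
            head_root = (cand.attrib.get('DOM') == '_root')
            break
    return link, pos, feats, head_token, head_pos, head_feats, head_root


def get_children_attrib(sentence, token_id):
    """Sketch: attrib dicts of all <W> elements whose DOM points at token_id.

    Element.attrib is the live dict, so assigning to the returned dicts edits the tree,
    which is how the callers above use it.
    """
    return [w.attrib for w in sentence.findall('W') if w.attrib.get('DOM') == token_id]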
def get_tile(self, cid):
    """fetch info about a tiled tiff, or retrieve a specific tile."""
    _id = bson.ObjectId(cid)
    container, _ = self._get(_id, 'ro')  # need at least read access to view tiles
    montage_info = None
    for f in container.get('files'):
        if f['filetype'] == 'montage':
            montage_info = f
            break
    if not montage_info:
        self.abort(404, 'montage zip not found')
    fn = montage_info['filename']
    fp = os.path.join(self.app.config['data_path'], cid[-3:], cid, fn)
    z = self.request.GET.get('z')
    x = self.request.GET.get('x')
    y = self.request.GET.get('y')
    if not (z and x and y):
        return util.get_info(fp)
    else:
        self.response.content_type = 'image/jpeg'
        tile = util.get_tile(fp, int(z), int(x), int(y))
        if tile:
            self.response.write(tile)
def get_all_info(filepath):
    ext = os.path.splitext(filepath)[1]
    if ext not in FORMATS:
        raise UnsupportedFileTypeError()
    try:
        audio = FORMATS[ext](filepath)
    except ID3NoHeaderError:
        audio = MP3(filepath)
        audio.add_tags()
        audio.save()
        audio = EasyID3(filepath)
    artist = get_info(audio, 'artist')
    album = get_info(audio, 'album')
    date = get_info(audio, 'date')
    tracknumber = get_info(audio, 'tracknumber', '')
    title = get_info(audio, 'title')
    is_compil = get_info(audio, 'compilation') or False
    return (sanitize(artist), sanitize(album), sanitize(date),
            sanitize(trackFormat(tracknumber)), sanitize(title), ext, is_compil)
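get_all_info above reads tags through a small get_info helper that is not shown. A plausible sketch, inferred only from the call sites (the optional default and the `or False` fallback) and from how mutagen's EasyID3 mapping returns lists of strings, might be:

def get_info(audio, key, default=None):
    """Hypothetical helper: first value of an EasyID3-style tag, or a default."""
    try:
        values = audio[key]
    except KeyError:
        return default
    return values[0] if values else default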
def munch(ifiles, ofiles):
    # part 1
    count = 0
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if word.attrib.get('FEAT', 'EMPTY').split()[0] == 'PR' and word.attrib.get('LEMMA', 'EMPTY') in lemmas_adv:
                    count += 1
                    word.attrib['FEAT'] = 'ADV'
                    if 'LINK' in word.attrib and word.attrib['LINK'] not in conjrels:
                        word.attrib['LINK'] = 'advmod'

        # part 2
        # (the code does not cover all cases on the first iteration,
        # therefore this part needs to be repeated twice)
        for r in range(2):
            for sent in root[-1].findall('S'):
                for candidate in sent.findall('W'):
                    if candidate.attrib.get('LEMMA', 'EMPTY') in fix and 'МН' not in candidate.attrib['FEAT']:
                        link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(candidate, sent)
                        is_root = (candidate.attrib.get('DOM', 'EMPTY') == '_root')
                        ch = candidate.attrib['ID']
                        parent = candidate.attrib['DOM']
                        away = False
                        do_not_change = False
                        deprel = ''
                        if 'LINK' in candidate.attrib:
                            deprel = candidate.attrib['LINK']
                        if 'LINK' in candidate.attrib and candidate.attrib['LINK'] in conjrels:
                            do_not_change = True
                        for token in sent.findall('W'):
                            if token.attrib.get('ID', 'EMPTY') == parent and token.attrib.get('LEMMA', 'EMPTY') in go_away:
                                away = True
                        if away:
                            continue
                        children = get_children_attrib(sent, ch)
                        if len(children) == 0:
                            ch_parent = get_children_attrib(sent, parent)
                            check = 0
                            for elem in ch_parent:
                                if elem['FEAT'].split()[0] == 'NUM':
                                    check += 1
                            if check == 1:
                                for elem in ch_parent:
                                    if elem['FEAT'].split()[0] == 'NUM':
                                        elem['DOM'] = ch
                                        elem['LINK'] = 'compound'
                                        if not do_not_change:
                                            candidate.attrib['LINK'] = 'nummod:gov'
                        if len(children) == 1 and children[0]['FEAT'].split()[0] == 'NUM':  # only one and it is NUM
                            children[0]['LINK'] = 'compound'
                            if not do_not_change:
                                candidate.attrib['LINK'] = 'nummod:gov'
                        if any(child['FEAT'].split()[0] == 'NUM' for child in children) and len(children) > 1:  # NUM among others
                            if not do_not_change:
                                candidate.attrib['LINK'] = 'nummod:gov'
                            for elem in children:
                                if elem['FEAT'].split()[0] == 'NUM':
                                    elem['LINK'] = 'compound'
                        if len(children) == 1 and children[0]['FEAT'].split()[0] == 'A':
                            continue
                        if any(child['FEAT'].split()[0] == 'S' for child in children):
                            list_of_nouns = []
                            numgov = False
                            for elem in children:
                                genitive = False
                                if 'РОД' in elem['FEAT']:
                                    genitive = True
                                if elem['FEAT'].split()[0] == 'S' and elem['LEMMA'] in lemmas and genitive:
                                    list_of_nouns.append(elem)
                                if elem['FEAT'].split()[0] == 'NUM':
                                    elem['LINK'] = 'compound'
                                    numgov = True
                            if len(list_of_nouns) > 0:
                                trace = list_of_nouns[0]['ID']
                                for elem in children:
                                    if elem['ID'] == trace:
                                        if is_root:
                                            elem['DOM'] = '_root'
                                            candidate.attrib['LINK'] = elem['LINK']
                                            del elem['LINK']
                                        else:
                                            elem['DOM'] = parent
                                            if deprel != '':
                                                elem['LINK'] = deprel
                                            else:
                                                print(candidate.attrib, elem)
                                        candidate.attrib['DOM'] = elem['ID']
                                        if numgov:
                                            if not do_not_change:
                                                candidate.attrib['LINK'] = 'nummod:gov'
                                for elem in children:
                                    if elem['ID'] != trace and elem['FEAT'].split()[0] != 'NUM':
                                        elem['DOM'] = trace

        # part 3
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(word, sent)
                if link not in ['предик'] + conjrels and word.attrib['DOM'] != '_root':
                    if 'NUM' in pos and word.attrib['LEMMA'] == 'один':  # один
                        word.attrib['LINK'] = 'nummod'
                    if link != 'EMPTY' and 'NUM' in pos:
                        if 'ВИН' in feats or 'ИМ' in feats:
                            word.attrib['LINK'] = 'nummod:gov'
                        else:
                            word.attrib['LINK'] = 'nummod'
                    if word.attrib.get('LEMMA', 'EMPTY') in lemmas_compare:
                        children = get_children_attrib(sent, word.attrib['ID'])
                        for elem in children:
                            if 'FEAT' in elem and elem['FEAT'].startswith('S ') and 'РОД' in elem['FEAT']:
                                grandchildren = get_children_attrib(sent, elem['ID'])
                                if not any('PR' in grchild['FEAT'] for grchild in grandchildren):
                                    elem['DOM'] = word.attrib['DOM']
                                    word.attrib['DOM'] = elem['ID']
                                    word.attrib['LINK'] = 'nummod:gov'
        tree.write(ofname, encoding="UTF-8")
    return
def munch(ifiles, ofiles):
    """ Process all files in ifiles list. Output into ofiles list. """
    temp_info = []
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if word.attrib.get('FEAT', 'EMPTY').split()[0] == 'PR':
                    link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(word, sent)
                    children = get_children_attrib(sent, word.attrib['ID'])
                    if children == [] and head_pos != 'CONJ':  # CONJ is converted during the syntax phase
                        # only 'за' is converted here
                        if word.attrib['LEMMA'] == 'за':
                            if head_token.attrib['DOM'] == '_root':
                                # small fix for one sentence
                                word.attrib['DOM'] = '3'
                            else:
                                word.attrib['DOM'] = head_token.attrib['DOM']
                            new_dep = relation(head_pos)
                            word.attrib['LINK'] = new_dep
                    elif len(children) >= 1:
                        if any(ch['LINK'] == 'предл' for ch in children):
                            new_head_found = True
                        elif any(ch['LINK'] == 'сочин' for ch in children):
                            list_of_coord_candidates = []
                            candidate_coord = [ch for ch in children if ch['LINK'] == 'сочин']
                            for cand in candidate_coord:
                                if cand['FEAT'] == 'PR':
                                    list_of_coord_candidates.append(cand)
                                elif cand['FEAT'] == 'CONJ':
                                    sub_children = get_children_attrib(sent, cand['ID'])
                                    check_pr = [ch for ch in sub_children if ch['FEAT'] == 'PR']
                                    list_of_coord_candidates += check_pr
                            new_head_found = all(
                                any(elem['LINK'] == 'предл'
                                    for elem in get_children_attrib(sent, item['ID']))
                                for item in list_of_coord_candidates)
                        else:
                            new_head_found = True
                        if new_head_found:
                            for child in children:
                                if child['FEAT'].split()[0] in safe + ['V'] and child.get('LINK', 'EMPTY') == 'предл':
                                    child['DOM'] = word.attrib['DOM']
                                    word.attrib['DOM'] = child['ID']
                                    if child['DOM'] == '_root':
                                        child.pop('LINK')
                                    else:
                                        child['LINK'] = word.attrib['LINK']
                                    word.attrib['LINK'] = relation(child['FEAT'].split()[0])
                                    for elem in children:
                                        if elem['ID'] != child['ID']:
                                            elem['DOM'] = child['ID']
                                    break
                                else:
                                    if any(elem['LINK'] == 'сочин' for elem in children):
                                        continue
                                    elif len(children) == 1 and children[0]['LINK'] in ['разъяснит', 'огранич']:
                                        continue
                                    elif len(children) == 1 and children[0]['FEAT'] == 'CONJ МЕТА':
                                        child['DOM'] = word.attrib['DOM']
                                        word.attrib['DOM'] = child['ID']
                                        child['LINK'] = word.attrib['LINK']
                                        word.attrib['LINK'] = 'case'
                                    elif len(children) >= 1 and any(elem['LINK'] == 'предик' for elem in children):
                                        continue
                                    elif word.text == 'кроме' and children[0]['LEMMA'] == 'как':
                                        sub_ch = get_children_attrib(sent, children[0]['ID'])
                                        children[0]['LINK'] = 'fixed'
                                        sub_ch[0]['DOM'] = children[0]['DOM']
                                    elif children[0]['LEMMA'] == 'минус':
                                        sub_ch = get_children_attrib(sent, children[0]['ID'])
                                        sub_ch[0]['DOM'] = word.attrib['DOM']
                                        sub_ch[0]['LINK'] = word.attrib['LINK']
                                        word.attrib['DOM'] = sub_ch[0]['ID']
                                        word.attrib['LINK'] = 'case'
                                        children[0]['DOM'] = sub_ch[0]['ID']
                                        children[0]['FEAT'] = 'S ЕД МУЖ ИМ НЕОД'
                                        children[0]['LINK'] = 'nmod'
                                    elif word.text == 'вроде' and children[0]['LEMMA'] == 'при':
                                        children[0]['LINK'] = 'об-аппоз'
                                    elif word.text == 'Около' and children[0]['LEMMA'] == 'назад':
                                        sub_ch = get_children_attrib(sent, children[0]['ID'])
                                        sub_ch[0]['DOM'] = word.attrib['DOM']
                                        sub_ch[0]['LINK'] = word.attrib['LINK']
                                        word.attrib['DOM'] = sub_ch[0]['ID']
                                        word.attrib['LINK'] = 'case'
                                        children[0]['DOM'] = sub_ch[0]['ID']
                                        children[0]['FEAT'] = 'ADV'
                                        children[0]['LINK'] = 'advmod'
                                    else:
                                        # these are for debug purposes and normally silent;
                                        # if they scream, something went wrong
                                        print(word.attrib.get('ID', 'EMPTY'), word.text,
                                              word.attrib.get('FEAT', 'EMPTY'))
                                        print(*[(ch.get('ID', 'EMPTY'), ch.get('LEMMA', 'EMPTY'),
                                                 ch.get('FEAT', 'EMPTY'), ch.get('LINK', 'EMPTY'))
                                                for ch in children], sep=' ')
                                        print('+' * 20)
                                        print(*[(token.attrib.get('ID', 'EMPTY'), token.text,
                                                 token.attrib.get('DOM', 'EMPTY'),
                                                 token.attrib.get('FEAT', 'EMPTY'),
                                                 token.attrib.get('LINK', 'EMPTY'), token.tail)
                                                for token in sent], sep='\n')
                                        print('*' * 20)
                    else:
                        continue
        tree.write(ofname, encoding="UTF-8")
import re
import sys
from subprocess import run

import bs4
import pyminizip

from util import (clean_tags, get_arg, get_function, get_info, get_soup,
                  set_info, to_md)

arg = get_arg('Genera un epub de un programa electoral')

re_rtrim = re.compile(r" +$", re.MULTILINE)
re_ltrim = re.compile(r"^\s*\n+")

yml = get_info(autocomplete=True)

isLastLineBlank = False


def fprint(txt, *args, re_clean=None, **kargs):
    global isLastLineBlank
    if isinstance(txt, bs4.Tag):
        txt = to_md(txt)
    if re_clean is not None:
        txt = re_clean.sub("", txt)
    txt = re_rtrim.sub("", txt)
    if isLastLineBlank:
        txt = re_ltrim.sub("", txt)
    if len(txt) == 0:
        return
wubi_dictionary = wf.read()

input_files = os.listdir(input_dir)
for input_file in input_files:
    hsk_level = re.search(r"[1-7]", input_file).group()  # use 7 for 7 through 9
    with open("{}{}".format(input_dir, input_file), "r", encoding="utf-8-sig") as f:
        lines = f.readlines()
        for line in lines:
            match = re.match(r"(\d+) ([^(|\s]+)", line)
            index = match.group(1)
            word = match.group(2)
            logging.info("Getting info for '{}'.".format(word))
            word_info = util.get_info(word)
            word_info["hsk_level"] = hsk_level
            # find the wubi strokes for each character
            wubi = []
            for char in word:
                # this pops up a couple times, ignore it (for now)
                try:
                    keys = re.search(r"^{}\t([a-z]+)$".format(char),
                                     wubi_dictionary, re.MULTILINE).group(1)
                except:
                    logging.warning("Unable to find wubi strokes for '{}', skipping.".format(char))
                    continue
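The loop above treats util.get_info(word) as returning a mutable dict to which an "hsk_level" key is attached. The helper itself is not shown; a hypothetical stand-in with that shape might look like:

def get_info(word):
    """Hypothetical stand-in: return a dict of data for `word` so callers can add keys."""
    return {"word": word, "pinyin": None, "definitions": []}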
v = True
info.pdf[k] = v

fecha = None
for k, v in info.pdf.items():
    if v and k.endswith("Date"):
        d = datetime.strptime(v, "%a %b %d %H:%M:%S %Y %Z")
        if fecha is None or d < fecha:
            fecha = d
if fecha:
    info.fecha = fecha.date()  # .strftime('%Y-%m-%d')

indices = []
for c in glob("*/info.yml"):
    indices.append((c, os.path.dirname(c), get_info(c, autocomplete=False)))

for path_info, codigo, info in sorted(indices):
    print("Descargando %s: %s" % (codigo, info.url))
    pth = codigo + "/wks"
    os.makedirs(pth, exist_ok=True)
    book = "book"
    out = pth + "/" + book
    pdf = out + ".pdf"
    xml = out + ".xml"
    htm = out + ".html"
    flag = False
    if info.url.endswith(".pdf"):
        if not os.path.isfile(pdf):
def munch(ifiles, ofiles):
    """ Process all files in ifiles list. Output into ofiles list. """
    dict_of_fixed = get_fixed_rel()
    dict_of_lemmas = get_verb_lemmas()
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for k, sentence in enumerate(root[-1].findall('S')):
            for j, token in enumerate(sentence.findall('W')):
                if (token.attrib['LEMMA'], token.attrib['FEAT']) in dict_of_fixed and token.text != 'FANTOM':
                    # replace the token itself
                    current_position = int(token.attrib['ID'])
                    shift_num = len(dict_of_fixed[token.attrib['LEMMA'], token.attrib['FEAT']][0]) - 1
                    shift_position = current_position - 1
                    for tok in sentence.findall('W'):
                        if '.' not in str(tok.attrib['ID']):
                            if int(tok.attrib['ID']) > current_position:
                                tok.attrib['ID'] = str(int(tok.attrib['ID']) + shift_num)
                        else:
                            if float(tok.attrib['ID']) > float(current_position):
                                tok.attrib['ID'] = str(round(float(tok.attrib['ID']) + float(shift_num), 1))
                        if '.' not in str(tok.attrib['DOM']):
                            if tok.attrib['DOM'] != '_root':
                                if int(tok.attrib['DOM']) > current_position:
                                    tok.attrib['DOM'] = str(int(tok.attrib['DOM']) + shift_num)
                        else:
                            if float(tok.attrib['DOM']) > float(current_position):
                                tok.attrib['DOM'] = str(round(float(tok.attrib['DOM']) + float(shift_num), 1))
                        first_position = ''
                        if tok.attrib['ENH'].startswith('E:E'):
                            first_position = 'E:E'
                            change_enh = tok.attrib['ENH'][3:].split(':')[0]
                        elif tok.attrib['ENH'].startswith('E'):
                            first_position = 'E'
                            change_enh = tok.attrib['ENH'][1:].split(':')[0]
                        else:
                            change_enh = tok.attrib['ENH'].split(':')[0]
                        if '.' not in str(change_enh):
                            #print(change_enh)
                            if int(change_enh) > current_position:
                                change_enh = int(change_enh) + shift_num
                                tok.attrib['ENH'] = first_position + str(change_enh) + ':' + tok.attrib['ENH'].split(':')[1]
                        else:
                            if float(change_enh) > float(current_position):
                                change_enh = round(float(change_enh) + float(shift_num), 1)
                                tok.attrib['ENH'] = first_position + str(change_enh) + ':' + tok.attrib['ENH'].split(':')[1]
                    if 'LINK' in token.attrib:
                        temp_rel = token.attrib['LINK']
                    else:  # and if there is no LINK
                        temp_rel = '_root'
                    temp_dom = token.attrib['DOM']
                    temp_text = token.text.replace('.', '. ').split()[0]
                    temp_tail = token.tail
                    no_dot = (j == len(sentence.findall('W')) - 1)
                    if not no_dot and temp_tail.startswith('.'):
                        temp_tail = temp_tail.lstrip('.')
                    sentence.remove(token)
                    starting_position = current_position
                    for i, elem in enumerate(dict_of_fixed[token.attrib['LEMMA'], token.attrib['FEAT']][0]):
                        tag = ET.fromstring('<W></W>')
                        tag.attrib['ID'] = str(current_position)
                        tag.attrib['LEMMA'] = elem[1]
                        tag.attrib['OLD'] = 'EMPTY'
                        tag.attrib['FEAT'] = elem[2]
                        head_position = dict_of_fixed[token.attrib['LEMMA'], token.attrib['FEAT']][1]
                        if i == head_position:
                            # this is the head token of the group
                            tag.attrib['DOM'] = str(temp_dom)
                            if elem[3] == '%':
                                tag.attrib['LINK'] = temp_rel
                            else:
                                tag.attrib['LINK'] = elem[3]
                        else:
                            tag.attrib['LINK'] = elem[3]
                            tag.attrib['DOM'] = str(starting_position + head_position)
                        # if DOM happened to become _root, remove LINK
                        if tag.attrib['DOM'] == '_root':
                            del tag.attrib['LINK']
                        if i == 0:
                            tag.text = temp_text
                        else:
                            tag.text = elem[0]
                        if i == len(dict_of_fixed[token.attrib['LEMMA'], token.attrib['FEAT']][0]) - 1:
                            tag.tail = temp_tail
                            if no_dot:
                                tag.text = tag.text.rstrip('.')
                                tag.attrib['LEMMA'] = tag.attrib['LEMMA'].strip('.')
                        else:
                            tag.tail = ' \n'
                        if 'LINK' not in tag.attrib and tag.attrib['DOM'] == '_root':
                            tag.attrib['ENH'] = '0:root'
                        else:
                            tag.attrib['ENH'] = str(tag.attrib['DOM']) + ':' + tag.attrib['LINK']
                        sentence.insert(shift_position, tag)
                        current_position += 1
                        shift_position += 1
            sorted_tokens = sorted(sentence.findall('W'),
                                   key=lambda x: float(x.attrib.get('ID', '100500')))
            while len(sentence.findall('W')) != 0:
                sentence.remove(sentence.findall('W')[-1])
            while len(sentence.findall('LF')) != 0:
                sentence.remove(sentence.findall('LF')[-1])
            for token in sorted_tokens:
                sentence.append(token)
        for sentence in root[-1].findall('S'):
            for i, token in enumerate(sentence.findall('W')):
                if i == 0:
                    token.text = token.text[0].upper() + token.text[1:]
                token.attrib['LEMMA'] = token.attrib['LEMMA'].replace('|', ',')
                if '|' in token.text:
                    token.text = token.text.replace('|', ',')
                if token.attrib['LEMMA'].endswith('-знак'):
                    token.attrib['LEMMA'] = token.text
                    token.attrib['FEAT'] = 'SYM'
        # small fixes from Olga 04.04.2018
        for sentence in root[-1].findall('S'):
            for i, token in enumerate(sentence.findall('W')):
                if token.attrib['LEMMA'] in glue:
                    if ifname.split('/')[-1] == '2011Petrushka.xml' and sentence.attrib['ID'] == '141':
                        sentence.findall('W')[39].attrib['DOM'] = '46'
                        sentence.findall('W')[40].attrib['DOM'] = '40'
                        sentence.findall('W')[40].attrib['LINK'] = 'fixed'
                        sentence.findall('W')[41].attrib['DOM'] = '46'
                        for h in range(39, 42):
                            sentence.findall('W')[h].attrib['ENH'] = (
                                sentence.findall('W')[h].attrib['DOM'] + ':' +
                                sentence.findall('W')[h].attrib['LINK'])
                    elif len(sentence.findall('W')) > i+1 and \
                            sentence.findall('W')[i+1].attrib['LEMMA'] == 'бы' and \
                            sentence.findall('W')[i+1].attrib['LINK'] != 'fixed':
                        sentence.findall('W')[i + 1].attrib['LINK'] = 'fixed'
                        if token.attrib['DOM'] == sentence.findall('W')[i + 1].attrib['ID']:
                            token.attrib['DOM'] = sentence.findall('W')[i + 1].attrib['DOM']
                            token.attrib['ENH'] = ':'.join((token.attrib['DOM'], token.attrib['LINK']))
                        sentence.findall('W')[i + 1].attrib['DOM'] = sentence.findall('W')[i].attrib['ID']
                        sentence.findall('W')[i + 1].attrib['ENH'] = ':'.join(
                            (sentence.findall('W')[i + 1].attrib['DOM'],
                             sentence.findall('W')[i + 1].attrib['LINK']))
                    elif len(sentence.findall('W')) > i+1 and \
                            sentence.findall('W')[i+1].attrib['LEMMA'] == 'бы' and \
                            sentence.findall('W')[i+1].attrib['DOM'] != sentence.findall('W')[i].attrib['ID']:
                        print(token.attrib['ID'])
                        print(*[(token.attrib['ID'], token.text, token.attrib['DOM'],
                                 token.attrib.get('LINK', 'EMPTY'), token.tail)
                                for token in sentence], sep='\n')
                        print('*' * 20)
                elif token.attrib['LEMMA'] == 'второе' and ('ADJ' in token.attrib['FEAT']
                                                            or 'nmod' in token.attrib.get('LINK', 'EMPTY')):
                    token.attrib['LEMMA'] = 'второй'
                    token.attrib['FEAT'] = token.attrib['FEAT'].replace('Animacy=Inan|', '').replace('NOUN', 'ADJ')
                    if token.attrib.get('LINK', 'EMPTY') == 'nmod':
                        token.attrib['LINK'] = 'amod'
                        token.attrib['ENH'] = token.attrib['ENH'].replace('nmod', 'amod')
                elif token.attrib['LEMMA'] == 'вооружать' and 'ADJ' in token.attrib['FEAT']:
                    token.attrib['LEMMA'] = 'вооруженный'
                elif token.attrib['LEMMA'] == 'весь' and 'PRON' in token.attrib['FEAT']:
                    token.attrib['FEAT'] = ' '.join(['DET'] + token.attrib['FEAT'].split()[1:])
                elif token.attrib['LEMMA'] == 'главное' and 'ADV' in token.attrib['FEAT']:
                    token.attrib['FEAT'] = 'NOUN Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing'
                elif token.attrib['LEMMA'] == 'дома' and 'NOUN' in token.attrib['FEAT']:
                    token.attrib['LEMMA'] = 'дом'
                elif token.attrib['LEMMA'] == 'звонок' and 'ADJ' in token.attrib['FEAT']:
                    token.attrib['FEAT'] = token.attrib['FEAT'].replace(
                        'Degree=Pos', 'Animacy=Inan|Case=Gen').replace('ADJ', 'NOUN').replace('|Variant=Short', '')
                elif token.attrib['LEMMA'] == 'многие' and 'ADJ' in token.attrib['FEAT']:
                    token.attrib['FEAT'] = 'NUM'
                    token.attrib['LEMMA'] = 'много'
                elif token.attrib['LEMMA'] == 'легкий' and 'NOUN' in token.attrib['FEAT']:
                    token.attrib['LEMMA'] = 'легкие'
                elif token.attrib['LEMMA'] == 'плюс' and 'SYM' in token.attrib['FEAT']:
                    token.attrib['LEMMA'] = '+'
                elif token.attrib['LEMMA'] == 'ли':
                    if token.attrib['LINK'] == 'conj':
                        token.attrib['LINK'] = 'advmod'
                    if token.attrib['LINK'] == 'discourse':
                        token.attrib['LINK'] = 'fixed'
                    token.attrib['ENH'] = token.attrib['ENH'].split(':')[0] + ':' + token.attrib['LINK']
                elif token.attrib['LEMMA'] == 'значит':
                    token.attrib['LINK'] = 'discourse'
                    token.attrib['ENH'] = token.attrib['ENH'].split(':')[0] + ':' + token.attrib['LINK']
                elif token.attrib['LEMMA'] == 'один' and 'ADJ' in token.attrib['FEAT']:
                    token.attrib['FEAT'] = ' '.join(['DET'] + token.attrib['FEAT'].split()[1:])
        for sentence in root[-1].findall('S'):
            for token in sentence.findall('W'):
                if token.attrib['LEMMA'] in dict_of_lemmas and 'Aspect=Perf' in token.attrib['FEAT']:
                    token.attrib['LEMMA'] = dict_of_lemmas[token.attrib['LEMMA']]
        # this is a test for debug purposes:
        #for sent in root[-1].findall('S'):
        #    for wt in sent.findall('W'):
        #        link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(wt, sent)
        #        if wt.attrib.get('LINK') == 'acl:relcl' and head_token.attrib.get('LINK') == 'obl':
        #            print(wt.attrib['ID'], ifname, sent.attrib['ID'])

        # Change lemma capitalisation
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(word, sent)
                if pos == 'PROPN' and word.attrib['ID'] == '1':
                    if word.attrib['LEMMA'] not in {'формула', 'чижик', 's', 'ps', 'f**k', 'да', 'ох'}:
                        if word.text.isupper():
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].upper()
                        else:
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].title()
                    elif word.attrib['LEMMA'] == 'чижик':
                        word.attrib['FEAT'] = word.attrib['FEAT'].replace('PROPN', 'NOUN')
                    elif word.attrib['LEMMA'] == 'ох':
                        word.attrib['FEAT'] = 'PART'
                    elif word.attrib['LEMMA'] in {'s', 'ps', 'f**k', 'да'}:
                        word.attrib['FEAT'] = word.attrib['FEAT'].replace('PROPN', 'X')
                    elif word.attrib['LEMMA'] == 'формула':
                        if sent[1].text == '1':
                            #sent[1].attrib['LINK'] = 'fixed'  maybe later for all occurrences
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].title()
                        else:
                            word.attrib['FEAT'] = word.attrib['FEAT'].replace('PROPN', 'NOUN')
                elif pos == 'PROPN' and word.attrib['ID'] != '1':
                    if word.text.isupper():
                        word.attrib['LEMMA'] = word.attrib['LEMMA'].upper()
                    else:
                        word.attrib['LEMMA'] = word.attrib['LEMMA'].title()
                if word.text.istitle() and pos != 'PROPN' and word.attrib['ID'] != '1':
                    if word.text not in uncertain:
                        word.attrib['FEAT'] = 'PROPN' + (word.attrib['FEAT'] + '\t').split('\t', maxsplit=1)[1]
                        if word.text.isupper():
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].upper()
                        else:
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].title()
                    elif word.text in uncertain and word.attrib['LEMMA'].lower() in certain:
                        word.attrib['FEAT'] = 'PROPN' + (word.attrib['FEAT'] + '\t').split('\t', maxsplit=1)[1]
                        if word.text.isupper():
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].upper()
                        else:
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].title()
                    else:
                        pass  # TODO: add a branch for unhandled cases
        tree.write(ofname, encoding="utf-8")
    return
def main(ifname_list, ofname_list):
    # collect all PROPN
    proper_detected = defaultdict(int)
    for ifname, ofname in zip(ifname_list, ofname_list):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                feats = word.attrib.get('FEAT', 'EMPTY').split()
                if 'PROPN' in feats:
                    proper_detected[word.attrib['LEMMA']] += 1
    not_proper_detected = defaultdict(int)
    for ifname, ofname in zip(ifname_list, ofname_list):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if word.attrib['LEMMA'] in proper_detected:
                    feats = word.attrib.get('FEAT', 'EMPTY').split()
                    if 'PROPN' not in feats:
                        not_proper_detected[word.attrib['LEMMA']] += 1
    for ifname, ofname in zip(ifname_list, ofname_list):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                feats = word.attrib.get('FEAT', 'EMPTY').split()
                if (word.text is not None and word.text.istitle()
                        and ('NOUN' in feats or 'ADJ' in feats)
                        and word.attrib['LEMMA'] in proper_detected
                        and proper_detected[word.attrib['LEMMA']] > not_proper_detected[word.attrib['LEMMA']]):
                    feats[0] = 'PROPN'
                    word.attrib['FEAT'] = ' '.join(feats)
        for sent in root[-1].findall('S'):
            nidChain = []
            listOfChains = []
            for word in sent.findall('W'):
                feats = word.attrib.get('FEAT', 'EMPTY').split()
                if 'NID' in feats:
                    nidChain.append(word)
                elif nidChain != []:
                    listOfChains.append(nidChain)
                    nidChain = []
            if nidChain != []:
                listOfChains.append(nidChain)
                nidChain = []
            for nidChain in listOfChains:
                if len(nidChain) == 1:
                    assign(nidChain)
                else:
                    ids = [elem.attrib['ID'] for elem in nidChain]
                    candidates = [elem for elem in nidChain if elem.attrib['DOM'] not in ids]
                    if len(candidates) == 1:
                        domNumber = candidates[0].attrib['DOM']
                        nidChain = revertLink(domNumber, nidChain)
                        assign(nidChain)
                    else:
                        for item in candidates:
                            revisedNidChain = [item]
                            going = False
                            currentHeadID = item.attrib['ID']
                            while not going:
                                for elem in nidChain:
                                    if elem.attrib['DOM'] == currentHeadID:
                                        revisedNidChain.append(elem)
                                        currentHeadID = elem.attrib['ID']
                                        break
                                else:
                                    going = True
                            if len(revisedNidChain) > 1:
                                nidChain = revertLink(item.attrib['DOM'], revisedNidChain)
                                assign(nidChain)
                            else:
                                assign(revisedNidChain)
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(word, sent)
                if word.attrib.get('LEMMA', 'EMPTY') in ['все', 'это', 'то'] and pos in ['PROPN', 'NOUN']:
                    feats_temp = word.attrib['FEAT'].split(' ')
                    word.attrib['FEAT'] = 'PRON ' + feats_temp[1]
        tree.write(ofname, encoding="UTF-8")
def munch(ifiles, ofiles):
    """ Process all files in ifiles list. Output into ofiles list. """
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sentence in root[-1].findall('S'):
            for token in sentence.findall('W'):  # step 0: detect and re-annotate 'не'
                if token.attrib['LEMMA'] == 'не' and 'VERB' in token.attrib['FEAT']:
                    link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(token, sentence)
                    children = get_children(sentence, token.attrib['ID'])
                    if token.text != 'FANTOM' and all(ch.text != 'FANTOM' for ch in children):
                        for elem in children:
                            if 'VerbForm=Inf' in elem.attrib['FEAT']:
                                gr_children = get_children(sentence, elem.attrib['ID'])
                                break
                        for item in gr_children:
                            if item.attrib['LEMMA'] in suspicious:
                                gr_gr_children = get_children(sentence, item.attrib['ID'])
                                if all(gr_gr.attrib['FEAT'].split()[0] != 'ADP' for gr_gr in gr_gr_children):
                                    token.attrib['LEMMA'] = suspicious[item.attrib['LEMMA']]
                                    token.text = token.text + item.text
                                    token.attrib['FEAT'] = item.attrib['FEAT']
                                    item.attrib['DEL'] = 'YES'
                                    break
                    elif token.text != 'FANTOM' and any(ch.text == 'FANTOM' for ch in children):
                        for elem in children:
                            if 'VerbForm=Inf' in elem.attrib['FEAT']:
                                gr_children = get_children(sentence, elem.attrib['ID'])
                                break
                        for item in gr_children:
                            if item.attrib['LEMMA'] in suspicious:
                                gr_gr_children = get_children(sentence, item.attrib['ID'])
                                if all(gr_gr.attrib['FEAT'].split()[0] != 'ADP' for gr_gr in gr_gr_children):
                                    token.attrib['LEMMA'] = suspicious[item.attrib['LEMMA']]
                                    token.attrib['FEAT'] = item.attrib['FEAT']
                                    item.attrib['DEL'] = 'YES'
                                    break
                    elif token.text == 'FANTOM' and children == []:
                        if sentence.attrib['ID'] == '217':
                            for elem in sentence.findall('W'):
                                if elem.attrib['ID'] == '11':
                                    elem.attrib['DEL'] = 'YES'
                                if elem.attrib['ID'] == '12':
                                    elem.attrib['LEMMA'] = 'нечего'
                        if sentence.attrib['ID'] == '94':
                            for elem in sentence.findall('W'):
                                if elem.attrib['ID'] == '11':
                                    elem.attrib['DEL'] = 'YES'
                        if sentence.attrib['ID'] == '169':
                            for elem in sentence.findall('W'):
                                if elem.attrib['ID'] == '6':
                                    elem.attrib['DOM'] = '14'
                                if elem.attrib['ID'] == '9':
                                    elem.attrib['DEL'] = 'YES'
                                if elem.attrib['ID'] == '10':
                                    elem.attrib['LEMMA'] = 'некого'
                                if elem.attrib['ID'] == '11':
                                    elem.attrib['DOM'] = '13'
                                if elem.attrib['ID'] == '12':
                                    elem.attrib['DEL'] = 'YES'
                                if elem.attrib['ID'] == '13':
                                    elem.attrib['LEMMA'] = 'негде'
                                    elem.attrib['DOM'] = '10'
                    elif token.text == 'FANTOM' and any(ch.text == 'FANTOM' for ch in children):
                        for elem in sentence.findall('W'):
                            if elem.attrib['ID'] == '11':
                                elem.attrib['DEL'] = 'YES'
                            if elem.attrib['ID'] == '2':
                                elem.attrib['LEMMA'] = suspicious[elem.attrib['LEMMA']]
                                elem.attrib['DOM'] = '_root'
                                del elem.attrib['LINK']
                            if elem.attrib['DOM'] == '1':
                                elem.attrib['DOM'] = '2'
                    elif token.text == 'FANTOM' and all(ch.text != 'FANTOM' for ch in children):
                        if all('VerbForm=Inf' not in ch.attrib['FEAT'] for ch in children):
                            if sentence.attrib['ID'] == '440':
                                for elem in sentence.findall('W'):
                                    if elem.attrib['ID'] == '16':
                                        elem.attrib['DOM'] = '18'
                                    if elem.attrib['ID'] == '17':
                                        elem.attrib['DEL'] = 'YES'
                                    if elem.attrib['ID'] == '18':
                                        elem.attrib['LEMMA'] = suspicious[elem.attrib['LEMMA']]
                        for elem in children:
                            if 'VerbForm=Inf' in elem.attrib['FEAT']:
                                gr_children = get_children(sentence, elem.attrib['ID'])
                                if head_token is None:
                                    for item in gr_children:
                                        if item.attrib['LEMMA'] in suspicious:
                                            gr_gr_children = get_children(sentence, item.attrib['ID'])
                                            if all(gr_gr.attrib['FEAT'].split()[0] != 'ADP'
                                                   for gr_gr in gr_gr_children):
                                                item.attrib['LEMMA'] = suspicious[item.attrib['LEMMA']]
                                                item.attrib['DOM'] = '_root'
                                                del item.attrib['LINK']
                                                token.attrib['DEL'] = 'YES'
                                                for renum in sentence.findall('W'):
                                                    if renum.attrib['DOM'] == token.attrib['ID']:
                                                        renum.attrib['DOM'] = item.attrib['ID']
                                                break
                                    else:
                                        for broken in children:
                                            if broken.attrib['LEMMA'] in suspicious:
                                                broken.attrib['LEMMA'] = suspicious[broken.attrib['LEMMA']]
                                                broken.attrib['DOM'] = '_root'
                                                del broken.attrib['LINK']
                                                token.attrib['DEL'] = 'YES'
                                                for renum in sentence.findall('W'):
                                                    if renum.attrib['DOM'] == token.attrib['ID']:
                                                        renum.attrib['DOM'] = broken.attrib['ID']
                                else:
                                    for item in gr_children:
                                        if item.attrib['LEMMA'] in suspicious:
                                            gr_gr_children = get_children(sentence, item.attrib['ID'])
                                            if all(gr_gr.attrib['FEAT'].split()[0] != 'ADP'
                                                   for gr_gr in gr_gr_children):
                                                token.attrib['LEMMA'] = suspicious[item.attrib['LEMMA']]
                                                token.attrib['FEAT'] = item.attrib['FEAT']
                                                token.text = item.text
                                                item.attrib['DEL'] = "YES"
                                                for renum in sentence.findall('W'):
                                                    if renum.attrib['DOM'] == item.attrib['ID']:
                                                        renum.attrib['DOM'] = token.attrib['ID']
                    else:
                        pass
        for sentence in root[-1].findall('S'):  # step 2: collect token numbers old:new
            numbering = {}
            token_number = 0
            for token in sentence.findall('W'):
                if 'DEL' not in token.attrib:
                    token_number += 1
                numbering[token.attrib['ID']] = str(token_number)
            for word in sentence.findall('W'):  # step 3: assign new numbers
                word.attrib['ID'] = numbering[word.attrib['ID']]
                if word.attrib['DOM'] != '_root':
                    word.attrib['DOM'] = numbering[word.attrib['DOM']]
            for elem in sentence.findall('W'):  # step 4: remove tokens
                if 'DEL' in elem.attrib:
                    sentence.remove(elem)
        for sentence in root[-1].findall('S'):
            for token in sentence.findall('W'):
                # Mood=Cnd fix
                if token.attrib['LEMMA'] in {'бы', 'б', 'чтобы', 'чтоб'}:
                    link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(token, sentence)
                    try:
                        if head_token.attrib['LEMMA'] not in forbidden_head:
                            if pos in {'SCONJ', 'PART'}:
                                token.attrib['FEAT'] = token.attrib['FEAT'] + ' Mood=Cnd'
                            else:
                                token.attrib['FEAT'] = token.attrib['FEAT'].replace(' Foreign=Yes', '')
                    except:
                        print('Something went wrong')
                        print(*[(elem.text, elem.tail.rstrip('\n'), elem.attrib) for elem in sentence], sep='\n')
                        print()
        tree.write(ofname, encoding="UTF-8")
    return
def munch(ifiles, ofiles):
    """ Process all files in ifiles list. Output into ofiles list. """
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sentence in root[-1].findall('S'):
            remove_sentence = False
            for token in sentence.findall('W'):
                if token.attrib.get('FEAT', 'EMPTY').split()[0] == 'COM':
                    link, pos, feats, head_token, head_pos, head_feats, head_root, nodetype = get_info(
                        token, sentence, get_nodetype=True)
                    if not nodetype and head_pos == 'COM':
                        print(ifname, sentence.attrib['ID'])
                    elif nodetype:
                        token.attrib['FEAT'] = head_token.attrib['FEAT']
                    elif not nodetype and head_pos in ['PR', 'NUM', 'CONJ']:
                        remove_sentence = True
                        continue
                    elif not nodetype and head_pos == 'V':
                        if token.text in ['не', 'полу']:
                            head_token.attrib['LEMMA'] = token.attrib['LEMMA'] + head_token.attrib['LEMMA']
                            head_token.text = token.text + head_token.text
                        else:
                            remove_sentence = True
                            continue
                    elif not nodetype and head_pos == 'A':
                        head_token.attrib['LEMMA'] = token.attrib['LEMMA'] + head_token.attrib['LEMMA']
                        head_token.text = token.text + head_token.text
                    elif not nodetype and head_pos == 'S':
                        if head_token.attrib['LEMMA'] not in ['слово', 'фактор', 'циклон', 'янус', 'буква',
                                                              'орбита', 'мониторинг', 'спектроскопия']:
                            head_token.attrib['LEMMA'] = (token.attrib['LEMMA'] + token.tail.strip()
                                                          + head_token.attrib['LEMMA'])
                            head_token.text = token.text + token.tail.strip() + head_token.text
                        else:
                            remove_sentence = True
                            continue
                    else:
                        print(ifname, sentence.attrib['ID'])
            if remove_sentence:
                sentence.clear()
                continue
        for sentence in root[-1].findall('S'):
            for token in sentence.findall('W'):
                if 'NODETYPE' in token.attrib:
                    token.text = 'FANTOM'
                    del token.attrib['NODETYPE']
                if 'LEMMA' not in token.attrib:
                    token.attrib['LEMMA'] = 'FANTOM'
                if 'LINK' in token.attrib and token.attrib['LINK'] == 'предик':
                    dom = int(token.attrib['DOM'])
                    number = int(token.attrib['ID'])
                    for item in sentence.findall('W'):
                        if 'LINK' in item.attrib and item.attrib['LINK'] == 'предик' and \
                                int(item.attrib['ID']) != number and int(item.attrib['DOM']) == dom:
                            remove_sentence = True
                # This handles prepositions/conjunctions left dangling on their own
                if token.attrib.get('FEAT', '') and token.attrib.get('FEAT', '').split()[0] in garbage \
                        and token.attrib['DOM'] == '_root':
                    child_id = token.attrib['ID']
                    children = get_children_attrib(sentence, child_id)
                    if children == []:
                        remove_sentence = True
            if remove_sentence:
                sentence.clear()
                continue
        tree.write(ofname, encoding='utf-8')
    return
#c = get_cosine_sim("AI is our friend and it has been friendly", "AI and humans have always been friendly")
#print(c)
#sys.exit()

if reload:
    datas = []
    for y in sorted(glob("*/info.yml")):
        d = os.path.dirname(y)
        if d in ("psoe110", ):
            continue
        print("Analizando %s" % d)
        os.chdir(cwd)
        os.chdir(d)
        data = get_info(autocomplete=True)
        soup = get_soup(data.output + ".html")
        body = soup.find("body")
        body_txt = re.sub(r" +", " ", body.get_text()).strip()
        body_slp = body_txt.split()
        data.pages = get_pages(data.output + ".html")
        data.caracteres = len(body_txt)
        data.palabras = len(body_slp)
        data.parrafos = len(body.findAll(["p", "li"]))
        data.capitulos = len(body.findAll(["h1"]))  # , "h2"]))
        data.root = d
        filesize = data.get("filesize", {})
        for k in ("md", "html", "epub"):
            filesize[k] = os.path.getsize(data.output + '.' + k)
        for k in ("pdf", "html", "xml"):
def munch(ifiles, ofiles):
    # part 1
    count = 0
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if word.attrib.get('FEAT', 'EMPTY').split()[0] == 'PR' and word.attrib.get('LEMMA', 'EMPTY') in lemmas_adv:
                    word.attrib['FEAT'] = 'ADV'
                    if 'LINK' in word.attrib and word.attrib['LINK'] not in conjrels:
                        word.attrib['LINK'] = 'advmod'
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if word.attrib.get('LINK', 'EMPTY') in ['количест', 'аппрокс-колич']:
                    link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(word, sent)
                    feats_str = ''.join(feats)
                    if 'NUM' in pos and word.attrib['LEMMA'] == 'один':
                        word.attrib['LINK'] = 'nummod'
                    elif word.attrib['LEMMA'].endswith('1'):
                        word.attrib['LINK'] = 'nummod'
                    elif feats_str == 'NUM':
                        word.attrib['LINK'] = 'nummod'
                    elif 'ИМ' in feats_str:
                        word.attrib['LINK'] = 'nummod:gov'
                    elif 'ВИНОД' in feats_str:
                        word.attrib['LINK'] = 'nummod:gov'
                    elif 'ОД' not in feats_str and 'НЕОД' not in feats_str and 'ВИН' in feats_str:
                        head_feats_str = ''.join(head_feats)
                        if 'ОД' in head_feats_str:
                            word.attrib['LINK'] = 'nummod:gov'
                        else:
                            word.attrib['LINK'] = 'nummod'
                    else:
                        word.attrib['LINK'] = 'nummod'
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if word.attrib.get('LEMMA', 'EMPTY') in big_num + big_num_fem:
                    link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(word, sent)
                    if link == 'колич-вспом':
                        # we need to deal with it later
                        if head_token.attrib['FEAT'].split(' ')[0] == 'A':
                            # will be converted in syntax.py
                            pass
                        else:
                            # fix individual sentences
                            if ifname.endswith('newsYa_16.xml') and sent.attrib['ID'] == '31':
                                sent[44].attrib['DOM'] = '47'
                                sent[44].attrib['LINK'] = 'колич-вспом'
                            elif ifname.endswith('2014Na_dvukh_voinakh_2.xml') and sent.attrib['ID'] == '232':
                                sent[8].attrib['DOM'] = '12'
                                sent[9].attrib['DOM'] = '12'
                            elif ifname.endswith('2014Na_dvukh_voinakh_1.xml') and sent.attrib['ID'] == '397':
                                sent[2].attrib['DOM'] = '5'
                            elif ifname.endswith('2003Opasnaya_blizost.xml') and sent.attrib['ID'] == '5':
                                sent[18].attrib['DOM'] = '14'
                                sent[18].attrib['LINK'] = 'компл-аппоз'
                                sent[17].attrib['DOM'] = '19'
                                sent[17].attrib['LINK'] = 'nummod'
                                sent[16].attrib['LINK'] = 'колич-вспом'
                                sent[14].attrib['DOM'] = '18'
                                sent[14].attrib['LINK'] = 'колич-вспом'
                            elif ifname.endswith('2003Bolshie_peremeny.xml') and sent.attrib['ID'] == '6':
                                sent[11].attrib['DOM'] = '5'
                                sent[11].attrib['LINK'] = 'обст'
                                sent[9].attrib['DOM'] = '12'
                                sent[9].attrib['LINK'] = 'nummod'
                                sent[8].attrib['LINK'] = 'колич-вспом'
                                sent[6].attrib['DOM'] = '10'
                                sent[6].attrib['LINK'] = 'колич-вспом'
                                sent[5].attrib['DOM'] = '12'
                            elif ifname.endswith('2003Bolshie_peremeny.xml') and sent.attrib['ID'] == '45':
                                sent[14].attrib['DOM'] = '20'
                                sent[15].attrib['DOM'] = '19'
                                sent[15].attrib['LINK'] = 'колич-вспом'
                                sent[17].attrib['LINK'] = 'колич-вспом'
                                sent[18].attrib['DOM'] = '20'
                                sent[18].attrib['LINK'] = 'nummod:gov'
                                sent[19].attrib['LINK'] = 'обст'
                                sent[19].attrib['DOM'] = '26'
                                sent[21].attrib['DOM'] = '24'
                                sent[21].attrib['LINK'] = 'колич-вспом'
                                sent[22].attrib['DOM'] = '24'
                            elif ifname.endswith('2003Tyurma_dlya_svekrovei.xml') and sent.attrib['ID'] == '18':
                                sent[12].attrib['DOM'] = '18'
                                sent[13].attrib['DOM'] = '17'
                                sent[13].attrib['LINK'] = 'колич-вспом'
                                sent[15].attrib['LINK'] = 'колич-вспом'
                                sent[16].attrib['DOM'] = '18'
                                sent[16].attrib['LINK'] = 'nummod'
                                sent[17].attrib['DOM'] = '9'
                                sent[17].attrib['LINK'] = '3-компл'
                            elif ifname.endswith('2014Vladimir_Vladimirovich.xml') and sent.attrib['ID'] == '96':
                                sent[15].attrib['DOM'] = '19'
                                sent[15].attrib['LINK'] = 'колич-вспом'
                                sent[17].attrib['LINK'] = 'колич-вспом'
                                sent[18].attrib['DOM'] = '20'
                                sent[18].attrib['LINK'] = 'nummod'
                                sent[19].attrib['DOM'] = '15'
                                sent[19].attrib['LINK'] = '1-компл'
                            else:
                                print('Unaccounted entry:', ifname, sent.attrib['ID'], file=sys.stderr)
                    else:
                        feats_str = ''.join(feats)
                        children = get_children_attrib(sent, word.attrib['ID'])
                        if link == 'предик' and any(child['LINK'] == 'квазиагент' for child in children):
                            continue
                        if 'РОД' in feats_str:
                            new_link = 'nummod'
                        elif word.attrib.get('LEMMA', 'EMPTY') in big_num and ('ИМ' in feats_str or 'ВИН' in feats_str):
                            new_link = 'nummod:gov'
                        elif word.attrib.get('LEMMA', 'EMPTY') in big_num_fem and ('ИМ' in feats_str or 'ВИНОД' in feats_str):
                            new_link = 'nummod:gov'
                        else:
                            if all(child['LINK'] != 'квазиагент' for child in children):
                                # Not interested in this condition
                                pass
                            else:
                                for child in children:
                                    if child['LINK'] == 'квазиагент':
                                        if 'РОД' in child['FEAT'] or '$' in child['LEMMA']:
                                            new_link = 'nummod:gov'
                                        else:
                                            new_link = 'nummod'
                        for child_token in children:
                            if child_token['LINK'] == 'квазиагент':
                                child_token['LINK'] = link
                                child_token['DOM'] = word.attrib['DOM']
                                word.attrib['LINK'] = new_link
                                word.attrib['DOM'] = child_token['ID']
                                for ch in children:
                                    if ch['ID'] != child_token['ID'] and ch['LINK'] not in ['nummod', 'nummod:gov']:
                                        ch['DOM'] = child_token['ID']
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if word.attrib.get('LEMMA', '') in ['сколько', 'несколько'] and \
                        word.attrib.get('LINK', 'EMPTY') in ['присвяз', 'соч-союзн']:
                    word.attrib['FEAT'] = 'NUM'
                if word.attrib.get('LEMMA', 'EMPTY') in lemmas_to_check and word.attrib.get('LINK', 'EMPTY') not in [
                        'огранич', 'присвяз', 'соч-союзн', 'nummod', 'nummod:gov']:
                    children = get_children_attrib(sent, word.attrib['ID'])
                    if len(children) != 0:
                        if word.attrib.get('LINK', '') == 'предик' and word.attrib.get('LEMMA', '') != 'сколько':
                            pass  # do nothing
                        elif word.attrib['DOM'] == '_root' or word.attrib['LINK'] in ['вспом']:
                            if word.attrib['DOM'] == '_root' and any(ch['LINK'] == '1-компл' for ch in children) \
                                    and 'СРАВ' not in word.attrib.get('FEAT', ''):
                                if (ifname + '_' + sent.attrib['ID']).split('/')[1] in [
                                        'uppsalaBitov_3.xml_454', '2007Pylesos.xml_77',
                                        '2009Nebesnye_formatsii.xml_8', '2012Chto_delat_posle_24_dekabrya.xml_9',
                                        '2003Nelzya_sebya_delit.xml_49', '2006Dobretsov.xml_57',
                                        '2011Mariam_Petrosyan.xml_296', '2003Lyubit_drakona.xml_122',
                                        'uppsalaKorp_220.xml_112', '2005Sluzhit_by_rad.xml_82',
                                        '2009Final_Ligi_Chempionov.xml_30', '2003Zhores.xml_336',
                                        '2003Opasnaya_blizost.xml_74']:
                                    for ch in children:
                                        if ch['LINK'] == '1-компл':
                                            ch['DOM'] = word.attrib['DOM']
                                            for child in children:
                                                if child['ID'] != ch['ID'] and child['LINK'] not in {'огранич', 'колич-огран'}:
                                                    child['DOM'] = ch['ID']
                                            word.attrib['DOM'] = ch['ID']
                                            del ch['LINK']
                                            word.attrib['LINK'] = 'nummod:gov'
                                            word.attrib['FEAT'] = 'NUM'
                                else:
                                    pass
                        elif word.attrib.get('LINK', '') == 'предик' and word.attrib.get('LEMMA', '') in ['сколько', 'несколько']:
                            if len(children) == 1 and ('S' in children[0]['FEAT'] or 'A ' in children[0]['FEAT']):
                                children[0]['LINK'] = 'предик'
                                children[0]['DOM'] = word.attrib['DOM']
                                word.attrib['DOM'] = children[0]['ID']
                                word.attrib['LINK'] = 'nummod:gov'
                                word.attrib['FEAT'] = 'NUM'
                            else:
                                for child in children:
                                    if 'S ' in child['FEAT']:
                                        child['LINK'] = 'предик'
                                        child['DOM'] = word.attrib['DOM']
                                        word.attrib['DOM'] = child['ID']
                                        word.attrib['LINK'] = 'nummod:gov'
                                        word.attrib['FEAT'] = 'NUM'
                                        if len(children) == 3:
                                            for ch in children:
                                                if ch['LINK'] == 'соч-союзн':
                                                    ch['DOM'] = child['ID']
                        elif all(ch['LINK'] in ['огранич', 'колич-огран', 'вспом', 'case'] for ch in children):
                            pass  # don't need to do anything
                        elif word.attrib.get('LINK', '') == 'обст':
                            if word.attrib.get('LEMMA', '') == 'немного':
                                pass  # do nothing
                            elif word.attrib['LEMMA'] in ['больше', 'столько', 'мало', 'много']:
                                if len(children) == 1:
                                    if word.attrib['LEMMA'] in ['мало', 'много']:
                                        malo = True
                                    else:
                                        malo = False
                                    if children[0]['FEAT'].strip().split(' ')[0] in ['S']:
                                        # TODO: check whether the relation also needs to be changed here
                                        children[0]['DOM'] = word.attrib['DOM']
                                        word.attrib['DOM'] = children[0]['ID']
                                        if malo and children[0]['LINK'] != 'предик':
                                            word.attrib['LINK'] = 'nummod:gov'
                                            word.attrib['FEAT'] = 'NUM'
                                    elif children[0]['FEAT'].strip().split(' ')[0] not in ['CONJ', 'A', 'ADV', 'V']:
                                        print('Unaccounted entry (FEAT):', file=sys.stderr)
                                else:
                                    for ch in children:
                                        if ch['LINK'] == 'сравнит':
                                            if ch['FEAT'].strip().split(' ')[0] in ['S']:
                                                ch['DOM'] = word.attrib['DOM']
                                                word.attrib['DOM'] = ch['ID']
                                            break
                            elif word.attrib['LEMMA'] == 'меньше':
                                pass
                            elif word.attrib['LEMMA'] in ['сколько', 'несколько']:
                                for ch in children:
                                    if ch['LINK'] == '1-компл':
                                        ch['LINK'] = word.attrib['LINK']
                                        ch['DOM'] = word.attrib['DOM']
                                        word.attrib['DOM'] = ch['ID']
                                        word.attrib['LINK'] = 'nummod:gov'
                                        word.attrib['FEAT'] = 'NUM'
                                        break
                            else:
                                print('Unaccounted entry (FEAT):', file=sys.stderr)
                        elif word.attrib['LEMMA'] in ['более', 'менее']:
                            for ch in children:
                                if ch['FEAT'].strip().split(' ')[0] in ['S'] and ch['LINK'] != 'атриб':
                                    ch['DOM'] = word.attrib['DOM']
                                    word.attrib['DOM'] = ch['ID']
                                    for chld in children:
                                        if chld['ID'] != ch['ID'] and chld['LINK'] not in ['огранич']:
                                            chld['DOM'] = ch['ID']
                        elif word.attrib.get('LINK', '').endswith('компл'):
                            # re-attach to the noun. Сколько -> NUM?
                            if (len(children) == 1 and children[0]['FEAT'].strip().split(' ')[0] in ['S', 'A']
                                    and children[0]['LINK'] not in ['атриб', 'предик']):
                                children[0]['DOM'] = word.attrib['DOM']
                                word.attrib['DOM'] = children[0]['ID']
                                if word.attrib['LEMMA'] in ['сколько', 'несколько']:
                                    word.attrib['LINK'] = 'nummod:gov'
                                    word.attrib['FEAT'] = 'NUM'
                            elif (len(children) == 1 and children[0]['FEAT'].strip().split(' ')[0] in ['CONJ', 'V']):
                                pass
                            elif len(children) == 1:
                                if ifname.endswith('2003Vyzhivshii_kamikadze.xml') and sent.attrib['ID'] == '257':
                                    children[0]['DOM'] = word.attrib['DOM']
                                    word.attrib['DOM'] = children[0]['ID']
                                if ifname.endswith('2003Artist_mimansa.xml') and sent.attrib['ID'] == '330':
                                    word.attrib['FEAT'] = 'NUM'
                            elif (len(children) > 1):
                                candidate_1 = None
                                candidate_2 = None
                                found_predic = False
                                for chld in children:
                                    if chld['LINK'] == 'предик':
                                        found_predic = True
                                        break
                                    if chld['LINK'] == '1-компл':
                                        candidate_1 = chld
                                        break
                                    if chld['LINK'] == 'сравнит':
                                        candidate_2 = chld
                                if not found_predic:
                                    if candidate_1 is not None:
                                        candidate_1['DOM'] = word.attrib['DOM']
                                        word.attrib['DOM'] = candidate_1['ID']
                                    elif candidate_2 is not None:
                                        candidate_2['DOM'] = word.attrib['DOM']
                                        word.attrib['DOM'] = candidate_2['ID']
                                    if word.attrib['LEMMA'] in ['сколько', 'несколько']:
                                        word.attrib['LINK'] = 'nummod:gov'
                                        word.attrib['FEAT'] = 'NUM'
                            else:
                                print('Unaccounted entry (FEAT):', file=sys.stderr)
                        elif word.attrib.get('LINK', '') == 'вводн':
                            # there is no 'сколько', 'несколько' here
                            for chld in children:
                                if chld['FEAT'].strip().split(' ')[0] == 'S':
                                    chld['DOM'] = word.attrib['DOM']
                                    word.attrib['DOM'] = chld['ID']
                        elif word.attrib.get('LINK', '') == 'подч-союзн':
                            word.attrib['FEAT'] = 'NUM'
                            if any(ch['LINK'] == 'предик' for ch in children):
                                pass
                            else:
                                for ch in children:
                                    if ch['LINK'] == '1-компл':
                                        ch['DOM'] = word.attrib['DOM']
                                        for child in children:
                                            if child['ID'] != ch['ID'] and child['LINK'] not in {'огранич', 'колич-огран'}:
                                                child['DOM'] = ch['ID']
                                        word.attrib['DOM'] = ch['ID']
                                        ch['LINK'] = word.attrib['LINK']
                                        word.attrib['LINK'] = 'nummod:gov'
                        elif word.attrib.get('LINK', '') in [
                                'соотнос', 'кратн', 'электив', 'аппоз', 'эксплет', 'атриб', 'релят',
                                'квазиагент', 'колич-копред', 'разъяснит', 'сент-соч', 'сравнит',
                                'изъясн', 'компл-аппоз']:
                            pass
                        elif word.attrib.get('LINK', '') == 'уточн':
                            for ch in children:
                                if ch['LINK'] == 'сравнит':
                                    ch['DOM'] = word.attrib['DOM']
                                    for child in children:
                                        if child['ID'] != ch['ID'] and child['LINK'] not in {'огранич', 'колич-огран'}:
                                            child['DOM'] = ch['ID']
                                    word.attrib['DOM'] = ch['ID']
                        elif word.attrib.get('LINK', '') in ['длительн']:
                            if any(ch['LINK'] == '1-компл' for ch in children):
                                for ch in children:
                                    if ch['LINK'] == '1-компл':
                                        ch['DOM'] = word.attrib['DOM']
                                        for child in children:
                                            if child['ID'] != ch['ID'] and child['LINK'] not in {'огранич', 'колич-огран'}:
                                                child['DOM'] = ch['ID']
                                        word.attrib['DOM'] = ch['ID']
                                        ch['LINK'] = word.attrib['LINK']
                                        word.attrib['LINK'] = 'nummod:gov'
                                        word.attrib['FEAT'] = 'NUM'
                            else:
                                for ch in children:
                                    if ch['LINK'] == 'сравнит':
                                        ch['DOM'] = word.attrib['DOM']
                                        for child in children:
                                            if child['ID'] != ch['ID'] and child['LINK'] not in {'огранич', 'колич-огран'}:
                                                child['DOM'] = ch['ID']
                                        word.attrib['DOM'] = ch['ID']
                        elif word.attrib.get('LINK', '') in ['примыкат', 'колич-огран']:
                            if any(ch['LINK'] == 'сравнит' for ch in children) and not any(
                                    ch['LINK'] == '1-компл' for ch in children):
                                pass
                            else:
                                if word.attrib.get('LINK', '') == 'примыкат' and word.attrib.get('LEMMA', '') == 'более':
                                    pass  # one exception
                                for ch in children:
                                    if ch['LINK'] == '1-компл':
                                        ch['DOM'] = word.attrib['DOM']
                                        for child in children:
                                            if child['ID'] != ch['ID'] and child['LINK'] not in {'огранич', 'колич-огран'}:
                                                child['DOM'] = ch['ID']
                                        word.attrib['DOM'] = ch['ID']
                                        ch['LINK'] = word.attrib['LINK']
                                        word.attrib['LINK'] = 'nummod:gov'
                                        word.attrib['FEAT'] = 'NUM'
                        elif word.attrib.get('LINK', '') == 'сравн-союзн':
                            if any(ch['LINK'] == 'предик' for ch in children) or any(
                                    ch['LINK'] == 'разъяснит' for ch in children):
                                pass
                            else:
                                for ch in children:
                                    if ch['LINK'] == '1-компл':
                                        ch['DOM'] = word.attrib['DOM']
                                        for child in children:
                                            if child['ID'] != ch['ID'] and child['LINK'] not in {'огранич', 'колич-огран'}:
                                                child['DOM'] = ch['ID']
                                        word.attrib['DOM'] = ch['ID']
                                        ch['LINK'] = word.attrib['LINK']
                                        word.attrib['LINK'] = 'nummod:gov'
                                        word.attrib['FEAT'] = 'NUM'
                        elif word.attrib.get('LINK', '') == 'сочин':
                            if any(ch['LINK'] == '1-компл' for ch in children) and 'СРАВ' not in word.attrib.get('FEAT', ''):
                                for ch in children:
                                    if ch['LINK'] == '1-компл':
                                        ch['DOM'] = word.attrib['DOM']
                                        for child in children:
                                            if child['ID'] != ch['ID'] and child['LINK'] not in {'огранич', 'колич-огран'}:
                                                child['DOM'] = ch['ID']
                                        word.attrib['DOM'] = ch['ID']
                                        ch['LINK'] = word.attrib['LINK']
                                        word.attrib['LINK'] = 'nummod:gov'
                                        word.attrib['FEAT'] = 'NUM'
                            else:
                                pass
                        else:
                            #for_debug_rels[word.attrib['LINK']] += 1
                            print('Error in numerals.py: missing condition')
                            print(word.attrib.get('ID', ''), word.attrib.get('LINK', ''),
                                  word.attrib.get('LEMMA', ''), word.attrib.get('FEAT', ''),
                                  file=sys.stderr)
                            print(children, file=sys.stderr)
                            print(ifname + '_' + sent.attrib['ID'], file=sys.stderr)
                            print(*[(token.attrib.get('ID', 'EMPTY'), token.text,
                                     token.attrib.get('DOM', 'EMPTY'),
                                     token.attrib.get('FEAT', 'EMPTY'),
                                     token.attrib.get('LINK', 'EMPTY'), token.tail)
                                    for token in sent], file=sys.stderr, sep='\n')
                            print('***', file=sys.stderr)
                            #count += 1
        #for sent in root[-1].findall('S'):
        #    for word in sent.findall('W'):
        #        if word.attrib.get('LINK', 'EMPTY') in ['nummod', 'nummod:gov']:
        #            print(word.attrib.get('ID', ''), word.attrib.get('LINK', ''), word.attrib.get('FEAT', ''), file=sys.stderr)
        #            print(*[(token.attrib.get('ID', 'EMPTY'), token.text, token.attrib.get('DOM', 'EMPTY'), token.attrib.get('FEAT', 'EMPTY'), token.attrib.get('LINK', 'EMPTY'), token.tail) for token in sent], file=sys.stderr, sep='\n')
        #            print('***', file=sys.stderr)
        #            continue
        tree.write(ofname, encoding="UTF-8")
    #print(count)
    #for elem in for_debug_rels:
    #    print(elem, for_debug_rels[elem])
    return