class PrjMgr(object):
    """Run the actions of a project described by a JSON document.

    Every top-level key of the JSON selects one action (see
    ``parse_json``).  According to the original notes, running the script
    without arguments lists all available options.

    Example, ``prjmgr.py prol_txt.json`` with ``prol_txt.json``::

        {
          "log": "0",
          "exe": [
            [
              "teimxml.py ",
              "-i tou1/prol.txt",
              "-t teimcfg/teimtags.csv",
              "-o tou1/log/prol_teim.txt"
            ]
          ]
        }
    """

    def __init__(self):
        self.logerr = Log("a")
        self.log = Log("a")
        self.logerr.open("log/prjmgr.ERR.log", 1)
        self.log.open("log/prjmgr.log", 0)

    def kv_split(self, s, sep):
        """Split ``s`` on ``sep`` and return the first two fields, stripped.

        The second field is ``''`` when ``sep`` does not occur in ``s``.
        Fields after the second one are ignored.
        """
        parts = s.split(sep)
        key = parts[0].strip()
        val = parts[1].strip() if len(parts) > 1 else ''
        return key, val

    def list2str(self, data):
        """Return ``data`` as one stripped string.

        A string is returned stripped; any other iterable of strings is
        joined with single spaces first.
        """
        if isinstance(data, str):
            return data.strip()
        return " ".join(data).strip()

    def get(self, js, k):
        """Return ``js[k]``.

        Raises:
            Exception: if the key is missing or its value is ``None``.
        """
        s = js.get(k, None)
        if s is None:
            raise Exception(f"{k} not found.{os.linesep}")
        return s

    def files_of_dir(self, d, e):
        """Return the sorted paths in directory ``d`` matching glob ``e``.

        Raises:
            Exception: if ``d`` does not exist.
        """
        p = pl.Path(d)
        if not p.exists():
            raise Exception(f'{d} not found.')
        return sorted(p.glob(e))

    def chmod(self, path):
        """Grant read/write/execute on ``path`` to user, group and others."""
        os.chmod(path, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)

    def include_files(self, include):
        """Expand placeholders of a host file and write the result.

        Args:
            include (dict): the "include" branch of the project.  Keys read:
                "host" (template path), "dest" (output path),
                "files" (list of ``"placeholder|path"``; the placeholder is
                replaced by the content of the file) and
                "params" (list of ``"key|value"`` textual replacements).

        Any error is logged and terminates the process with exit code 1.
        """
        self.log.log(os.linesep, ">> include")
        try:
            file_host = include.get("host", None)
            file_dest = include.get("dest", None)
            file_lst = include.get("files", [])
            param_lst = include.get("params", [])

            with open(file_host, "rt") as f:
                host = f.read()

            for param_path in file_lst:
                param, path = self.kv_split(param_path, '|')
                self.log.log(f"{param}: {path}")
                with open(path, "rt") as f:
                    txt = f.read()
                host = host.replace(param, txt)

            for key_val in param_lst:
                key, val = self.kv_split(key_val, '|')
                self.log.log(f"{key}: {val}")
                host = host.replace(key, val)

            with open(file_dest, "w+") as f:
                f.write(host)
            self.chmod(file_dest)
        except Exception as e:
            self.logerr.log("include")
            self.logerr.log(e)
            sys.exit(1)

    def execute_files_of_dir(self, exe_dir):
        """Run a list of commands once for every file of a directory.

        Args:
            exe_dir (dict): keys "dir", "pattern", "exe_file" (commands),
                "par_name" (placeholder inside each command) and
                "par_subst" (``"old|new"`` applied to each file name before
                it replaces the placeholder).

        Errors are logged; the process is not terminated.
        """
        self.log.log(">> exe_dir").prn()
        try:
            dr = self.get(exe_dir, 'dir')
            ptrn = self.get(exe_dir, 'pattern')
            exe_lst = self.get(exe_dir, 'exe_file')
            par_name = self.get(exe_dir, 'par_name')
            par_subst = self.get(exe_dir, 'par_subst')
            # replacement applied to the file name before it is substituted
            # for par_name in the command line
            k, v = self.kv_split(par_subst, '|')
            for path in self.files_of_dir(dr, ptrn):
                file_par = os.path.basename(path).replace(k, v)
                for exe in exe_lst:
                    cmd = self.list2str(exe).replace(par_name, file_par)
                    self.log.log(cmd)
                    # NOTE(review): the command string goes through the
                    # shell; the project JSON must come from a trusted source.
                    r = os.system(cmd)
                    if r != 0:
                        raise Exception(f"execute:{cmd}")
        except Exception as e:
            self.logerr.log("ERROR", "exe_dir")
            self.logerr.log(e)

    def remove_files_of_dir(self, remove_dir):
        """Delete the files selected by each ``{"dir", "pattern"}`` entry.

        Errors are logged; the process is not terminated.
        """
        self.log.log(">> remove_dir").prn()
        try:
            for de in remove_dir:
                self.log.log(de)
                dr = de.get('dir')
                ptrn = de.get('pattern')
                for path in self.files_of_dir(dr, ptrn):
                    self.log.log(path)
                    os.remove(path)
        except Exception as e:
            self.logerr.log("remove_dir")
            self.logerr.log(e)
            self.logerr.log(pp(remove_dir))

    def merge_files_of_list(self, merge_files):
        """Concatenate the listed files into ``out_path``.

        Args:
            merge_files (dict): keys "out_path" and "files" (list of paths).

        Each input is followed by ``os.linesep``.  Exceptions propagate to
        the caller; the output file is closed in every case.
        """
        self.log.log(">> merge_files").prn()
        out = self.get(merge_files, "out_path")
        files = self.get(merge_files, "files")
        with open(out, "w+") as fout:
            for path in files:
                self.log.log(path)
                with open(path, "rt") as fin:
                    txt = fin.read()
                fout.write(txt)
                fout.write(os.linesep)
        self.log.log(out)
        self.chmod(out)

    def merge_files_of_dir(self, merge_dir):
        """Concatenate the files of a directory matching a pattern.

        Args:
            merge_dir (dict): keys "dir", "pattern" and "out_path".

        Each input is followed by ``os.linesep``.  Errors are logged; the
        process is not terminated.
        """
        self.log.log(">> merge_dir").prn()
        try:
            dr = self.get(merge_dir, 'dir')
            ptrn = self.get(merge_dir, 'pattern')
            out_path = self.get(merge_dir, 'out_path')
            # the input list is built before the output file is created
            files = self.files_of_dir(dr, ptrn)
            with open(out_path, "w") as file_out:
                for fpath in files:
                    self.log.log(fpath)
                    with open(fpath, "rt") as f:
                        txt = f.read()
                    file_out.write(txt)
                    file_out.write(os.linesep)
            self.chmod(out_path)
            self.log.log(out_path)
        except Exception as e:
            self.logerr.log("merge_dir")
            self.logerr.log(e)
            self.logerr.log(pp(merge_dir))

    def execute_list_progs(self, exe):
        """Run each command of ``exe`` in order, stopping at the first failure.

        Each command is a string or a list of strings joined with spaces.
        Errors are logged; the process is not terminated.
        """
        self.log.log(">> exe").prn()
        try:
            for x in exe:
                cmd = self.list2str(x)
                self.log.log(cmd)
                # NOTE(review): shell execution of text taken from the
                # project JSON; only run trusted project files.
                r = os.system(cmd)
                if r != 0:
                    raise Exception(str(r))
        except Exception as e:
            self.logerr.log("exe")
            self.logerr.log(e)
            self.logerr.log(pp(exe))

    def copy_file(self, copy_file):
        """Copy text files according to a list of descriptors.

        Args:
            copy_file (list[dict]): each entry has "in_path", "out_path"
                and "aw" (the mode used to open the output; with ``'a'`` a
                line separator is written after the copied text).

        Any error is logged and terminates the process with exit code 1.
        """
        self.log.log(">> copy_file").prn()
        try:
            for x in copy_file:
                in_path = self.get(x, 'in_path')
                out_path = self.get(x, 'out_path')
                aw = self.get(x, "aw")
                self.log.log(in_path)
                with open(in_path, "rt") as f:
                    text = f.read()
                with open(out_path, aw) as f:
                    f.write(text)
                    if aw == 'a':
                        f.write(os.linesep)
                self.chmod(out_path)
                self.log.log(out_path)
        except Exception as e:
            self.logerr.log("copy_file")
            self.logerr.log(e)
            self.logerr.log(pp(copy_file))
            sys.exit(1)

    def write_text(self, write_text):
        """Write a literal text to a file.

        Args:
            write_text (dict): keys "text", "out_path" and "aw" (open mode;
                with ``'a'`` a line separator is written after the text).

        Any error is logged and terminates the process with exit code 1.
        """
        self.log.log(">> write_text").prn()
        try:
            text = self.get(write_text, 'text')
            out_path = self.get(write_text, 'out_path')
            aw = self.get(write_text, "aw")
            with open(out_path, aw) as f:
                f.write(text)
                if aw == 'a':
                    f.write(os.linesep)
            self.chmod(out_path)
            self.log.log(out_path)
        except Exception as e:
            self.logerr.log("write_text")
            self.logerr.log(e)
            self.logerr.log(pp(write_text))
            sys.exit(1)

    def parse_json(self, js):
        """Execute every action of a project dict, in insertion order.

        Keys may carry a suffix after a dot (``exe.1``, ``exe.2`` ...) so the
        same action can appear several times; only the part before the first
        dot selects the action.  Unknown keys are logged as errors.
        """
        actions = {
            "exe": self.execute_list_progs,
            "merge_files": self.merge_files_of_list,
            "merge_dir": self.merge_files_of_dir,
            "include": self.include_files,
            "exe_dir": self.execute_files_of_dir,
            "remove_dir": self.remove_files_of_dir,
            "write_text": self.write_text,
            "copy_file": self.copy_file,
        }
        for k, v in js.items():
            k = k.split('.')[0]
            if k == "log":
                self.log.set_liv(int(v))
                continue
            action = actions.get(k)
            if action is None:
                self.logerr.log(f"ERROR option:{k} not implemented")
            else:
                action(v)

    def parse_file(self, in_path):
        """Load the JSON project at ``in_path`` and execute it.

        A read or decode error is logged and terminates the process with
        exit code 1.
        """
        try:
            with open(in_path, "r") as f:
                js = json.load(f)
        except Exception as e:
            self.logerr.log("prjmgr.py json ERROR")
            self.logerr.log(e)
            sys.exit(1)
        self.parse_json(js)

    def parse_jsons(self, *js):
        """Execute several project dicts one after the other."""
        for j in js:
            self.parse_json(j)
class Xml2Txt:
    """Extract a plain-text file from a TEI XML file."""

    def __init__(self, path_xml='', path_txt='', write_append='w'):
        """Store the paths and open the error log.

        Args:
            path_xml: path of the XML file to read.
            path_txt: path of the text file to write; the error log path is
                derived from it by replacing ".txt" with ".ERR.log".
            write_append: mode used to open ``path_txt``.
        """
        self.path_xml = path_xml
        self.path_txt = path_txt
        self.write_append = write_append
        path_err = path_txt.replace(".txt", ".ERR.log")
        # Keep the Log object itself: every call site in this class uses
        # ``self.logerr.log(...)``, like the other classes that use Log.
        self.logerr = Log("w")
        self.logerr.open(path_err, 1)
        self.txt_builder = None
        self.trace = False

    def node_liv(self, node):
        """Return the depth of ``node`` (0 for the root element)."""
        depth = 0
        while node is not None:
            depth += 1
            node = node.getparent()
        return depth - 1

    def clean_key(self, k):
        """Strip a leading ``{http...}`` namespace from an attribute name."""
        if k.find("{http") > -1:
            end = k.rfind('}')
            if end > -1:
                return k[end + 1:]
        return k

    def node_items(self, nd):
        """Return the attributes of ``nd`` as a dict with namespace-free keys."""
        return {self.clean_key(key): val for key, val in nd.items()}

    def node_tag(self, nd):
        """Return the tag of ``nd`` without namespace.

        ``"XXX"`` is returned when the tag is not a string or when reading
        it fails (the failure is logged).
        """
        try:
            tag = nd.tag if isinstance(nd.tag, str) else "XXX"
            pid = tag.find('}')
            if pid > 0:
                tag = tag[pid + 1:]
            return tag.strip()
        except Exception as e:
            self.logerr.log("ERROR in xml")
            self.logerr.log(str(e))
            return "XXX"

    def node_id(self, nd):
        """Return the value of the first attribute whose name contains 'id'.

        Returns ``''`` when there is no such attribute.
        """
        for key, val in nd.items():
            if 'id' in key:
                return val
        return ''

    def node_id_num(self, id):
        """Return the part of ``id`` starting at its first digit.

        Returns ``''`` for an empty id and the integer ``-1`` when the id
        contains no digit.
        """
        if id == '':
            return ''
        m = re.search(r'\d', id)
        if m is None:
            return -1
        return id[m.start():]

    def node_text(self, nd):
        """Return the stripped text of ``nd``; line separators become ',,'."""
        text = '' if nd.text is None else nd.text
        return text.strip().replace(os.linesep, ',,')

    def node_tail(self, nd):
        """Return the stripped tail of ``nd`` with line separators removed."""
        tail = '' if nd.tail is None else nd.tail
        return tail.strip().replace(os.linesep, '')

    def node_val(self, nd):
        """Return all the text below ``nd`` joined with single spaces."""
        parts = [x.strip().replace(os.linesep, '') for x in nd.itertext()]
        return re.sub(r"\s{2,}", ' ', ' '.join(parts))

    def node_is_parent(self, nd):
        """Return True when ``nd`` has at least one child element."""
        # len(element) counts the direct children; getchildren() is deprecated
        return len(nd) > 0

    def get_node_data(self, nd):
        """Collect the data of one XML node into a dict.

        When the node has an id, ``items['id_num']`` is added with the value
        computed by ``node_id_num``.  'val' is always the empty string.
        """
        items = self.node_items(nd)
        xml_id = self.node_id(nd)
        if xml_id != '':
            items['id_num'] = self.node_id_num(xml_id)
        return {
            'id': xml_id,
            'liv': self.node_liv(nd),
            'tag': self.node_tag(nd),
            'text': self.node_text(nd),
            'tail': self.node_tail(nd),
            'items': items,
            'val': "",
            'is_parent': self.node_is_parent(nd)
        }

    def build_txt_data(self, nd):
        """Build the record handed to the text builder for one node.

        The record holds the data extracted from the XML node plus the
        ``t_*`` fields, initialised empty for later processing.

        Args:
            nd: XML node.

        Returns:
            dict: node data followed by the empty ``t_*`` fields.
        """
        x_data = self.get_node_data(nd)
        return {
            'id': x_data.get('id', 0),
            'is_parent': x_data.get('is_parent', False),
            'items': x_data.get('items', {}),
            'liv': x_data.get('liv', 0),
            'tag': x_data.get('tag', ''),
            'text': x_data.get('text', ''),
            'tail': x_data.get('tail', ''),
            'val': x_data.get('val', ''),
            't_i': 0,
            't_type': '',
            't_up': False,
            't_start': '',
            't_end': '',
            't_sp': '',
            't_ln': False,
            't_flag': False
        }

    def write_txt(self):
        """Parse the XML file, build the text and write it to ``path_txt``.

        Returns:
            the path of the written text file.

        A parse error is logged and exits with the error message; any later
        error is logged with its traceback and exits with code 1.
        """
        try:
            parser = etree.XMLParser(ns_clean=True)
            xml_root = etree.parse(self.path_xml, parser)
        except Exception as e:
            self.logerr.log("ERROR teixml2txt.py write_txt() parse_xml")
            self.logerr.log(e)
            sys.exit(str(e))
        try:
            self.txt_builder = TxtBuilder()
            for nd in xml_root.iter():
                self.txt_builder.add(self.build_txt_data(nd))
            self.txt_builder.elab()
            txt = self.txt_builder.txt
            make_dir_of_file(self.path_txt)
            with open(self.path_txt, self.write_append) as f:
                f.write(txt)
            chmod(self.path_txt)
        except Exception as e:
            self.logerr.log("ERROR teixml2txt.py write_txt()")
            self.logerr.log(e)
            self.logerr.log(traceback.format_exc())
            sys.exit(1)
        return self.path_txt
class TxtBuilder:
    """Build rows of plain text from the node records collected with ``add``.

    RAMIS, NAMES_UP, MONOLOG, DIRECT, START, END, ELIS and ENCL are constants
    defined elsewhere in the module.
    """

    def __init__(self):
        self.log = Log("w")
        self.log.open("log/txtbuilder.log", 0)
        self.logerr = Log("a")
        self.logerr.open("log/txtbuilder.ERR.log", 1)
        self.data_lst = []
        self.data_txt_lst = []
        self.data_span_lst = []
        self.from_to_lst = []
        self.txt_rows = []
        self.up = True
        self.w_liv = 100
        self.trace = False
        self.ramis = self.set_ramis_dict()

    def set_ramis_dict(self):
        """Parse RAMIS into ``{key: {char: replacement}}``.

        Each RAMIS entry has the form ``"key|x:y,x:y,..."``.
        """
        js = {}
        for r in RAMIS:
            k, v = r.split('|')
            js[k] = {}
            for xy in v.split(','):
                x, y = xy.split(':')
                js[k][x] = y
        return js

    def get_ramis(self, key, ch):
        """Return the replacement of ``ch`` under ``key``.

        Returns ``"ERR<key>"`` or ``"ERR<ch>"`` when the lookup fails.
        """
        js = self.ramis.get(key, None)
        if js is None:
            return f"ERR{key}"
        r = js.get(ch, None)
        if r is None:
            return f"ERR{ch}"
        return r

    def fill_from_to_list(self):
        """Fill ``from_to_lst`` from the 'from', 'to', 'type' of each span.

        A span missing one of the three attributes is logged and terminates
        the process with exit code 1.
        """
        for data_span in self.data_span_lst:
            x_items = data_span.get('items', {})
            x_from = x_items.get('from', None)
            x_to = x_items.get('to', None)
            x_type = x_items.get('type', None)
            if x_from is None or x_to is None or x_type is None:
                self.logerr.log("fill_from_to_list ERROR.").prn()
                self.logerr.log(pp(data_span)).prn()
                sys.exit(1)
            self.from_to_lst.append({
                "id0": x_from,
                "id1": x_to,
                "type": x_type
            })

    def from_to_set_data_txt(self):
        """Store the opening/closing markers of every span in the records.

        The record whose id equals the span 'from' receives the START marker
        and the one whose id equals 'to' receives the END marker: '[' / ']'
        for MONOLOG spans, '{' / '}' for DIRECT spans.  When one end of the
        span is empty the problem is logged and the marker written on the
        other end carries an ``ERR`` tag.
        """
        for from_to in self.from_to_lst:
            id_from = from_to['id0'].strip()
            id_to = from_to['id1'].strip()
            span_type = from_to['type'].strip()
            err = 0
            if id_from == '':
                err = 1
            if id_to == '':
                err = 2
            if err == 1:
                self.logerr.log(f"ERROR from is null. to:{id_to}.")
            elif err == 2:
                self.logerr.log(f"ERROR from={id_from} to is null.")
            for data_txt in self.data_txt_lst:
                node_id = data_txt['id']
                if node_id == '':
                    continue
                if id_from == node_id:
                    if span_type == MONOLOG:
                        data_txt[START] = '[' if err == 0 else '[ERR '
                    elif span_type == DIRECT:
                        data_txt[START] = '{' if err == 0 else '{ERR '
                # Tested independently of the check above so that a span
                # starting and ending on the same id is also closed.
                if id_to == node_id:
                    if span_type == MONOLOG:
                        data_txt[END] = ']' if err == 0 else ' ERR]'
                    elif span_type == DIRECT:
                        data_txt[END] = '}' if err == 0 else ' ERR}'

    def set_data_txt_list(self):
        """Fill the ``t_*`` fields of every record in ``data_txt_lst``.

        * 'tag' is normalised to lower case and 't_i' set to the index;
        * 't_sp' is ' ' for 'w' records, '' otherwise;
        * 't_ln' is True for 'l' records;
        * the first record with text found after a 'pc' whose text is '.',
          '!' or '?', after a tag of NAMES_UP or after 'lg' gets
          't_up' = True;
        * 'del' records lose text and tail; the first record with text found
          after a 'sic' record loses its text.
        """
        t_up = False
        sic = False
        for i, d in enumerate(self.data_txt_lst):
            liv = d["liv"]
            tag = d['tag'].lower().strip()
            d['tag'] = tag
            text = d['text'].strip()
            d['t_i'] = i
            sp = ''
            ln = False
            if text != '':
                if t_up:
                    d['t_up'] = True
                    t_up = False
                if sic:
                    d['text'] = ''
                    sic = False
            if tag == 'w':
                sp = ' '
                self.w_liv = liv
            elif tag == 'pc':
                if text in ('.', '!', '?'):
                    t_up = True
            elif tag in NAMES_UP:
                t_up = True
            elif tag == 'lg':
                t_up = True
            elif tag == 'del':
                d['text'] = ''
                d['tail'] = ''
            elif tag == 'sic':
                sic = True
            elif tag == 'l':
                ln = True
            d['t_sp'] = sp
            d['t_ln'] = ln

    def is_in_xml_items(self, items, key, val):
        """Return True when ``items[key]``, without '#' and stripped, is ``val``."""
        v = items.get(key, '')
        v = v.replace('#', '').strip()
        return v == val

    def build_txt_rows(self):
        """Build ``self.txt_rows`` from the records of ``data_txt_lst``.

        Each record contributes, in order, its separator, start marker,
        text + tail and end marker; a record with 't_ln' closes the current
        row.  The remaining words form the last row.
        """
        self.txt_rows = []
        words = []
        last = len(self.data_txt_lst) - 1
        for i, d in enumerate(self.data_txt_lst):
            tag = d['tag'].strip()
            text = d['text'].strip()
            tail = d['tail'].strip()
            items = d['items']
            t_start = d['t_start']
            t_sp = d['t_sp']
            t_up = d['t_up']
            t_end = d['t_end']
            t_ln = d['t_ln']
            if tag == 'c':
                if len(text) == 1:
                    k = items.get('ana', None)
                    if k is not None:
                        text = self.get_ramis(k, text)
            elif tag == 'w':
                # elision: the next record is attached without a space
                if self.is_in_xml_items(items, 'ana', 'elis'):
                    text = f'{text}{ELIS}'
                    # guard: an elided word may be the last record
                    if i < last:
                        self.data_txt_lst[i + 1]['t_sp'] = ''
                # enclitic: attached to the previous word
                if self.is_in_xml_items(items, 'ana', 'encl'):
                    text = f'{ENCL}{text} '
                    t_sp = ''
            if t_sp != '':
                words.append(t_sp)
            if t_start != '':
                words.append(t_start)
            if t_up:
                text = text.capitalize()
            else:
                text = text.lower()
            tail = tail.lower()
            w = f"{text}{tail}"
            if w != '':
                words.append(w)
            if t_end != '':
                words.append(t_end)
            if t_ln:
                self.txt_rows.append(''.join(words))
                words = []
        self.txt_rows.append(''.join(words).strip())

    def text_adjust(self):
        """Normalise punctuation spacing and turn span markers into quotes."""
        for i, rw in enumerate(self.txt_rows):
            rw = re.sub(r" ,", ", ", rw)
            rw = re.sub(r" ;", "; ", rw)
            rw = re.sub(r" \.", ". ", rw)
            rw = re.sub(r'\[\s*', ' "', rw)
            rw = re.sub(r'\]', '" ', rw)
            rw = re.sub(r'{\s*', ' "', rw)
            rw = re.sub(r'}', '" ', rw)
            rw = rw.replace(f"{ELIS} ", ELIS)
            rw = re.sub(r"\s{2,}", " ", rw)
            self.txt_rows[i] = rw.strip()

    def elab(self):
        """Run the whole pipeline on the records collected with ``add``."""
        for data in self.data_lst:
            if data['tag'] == 'span':
                self.data_span_lst.append(data)
            else:
                self.data_txt_lst.append(data)
        # fill the list of from/to ids
        self.fill_from_to_list()
        # complete the records of data_txt_lst
        self.set_data_txt_list()
        # set start and end markers in the records
        self.from_to_set_data_txt()
        # build the rows of text
        self.build_txt_rows()
        # tidy up the rows of text
        self.text_adjust()

    def add(self, data):
        """Append one node record to the input list."""
        self.data_lst.append(data)

    @property
    def txt(self):
        """The rows of text joined with ``os.linesep``."""
        return os.linesep.join(self.txt_rows)
class TxtBuilder:
    """Build rows of plain text from the node records collected with ``add``.

    RAMIS, NAMES_UP, MONOLOG, DIRECT, START, END, ELIS and ENCL are constants
    defined elsewhere in the module.
    """

    def __init__(self):
        self.log = Log("w")
        self.log.open("log/txtbuilder.log", 0)
        self.logerr = Log("a")
        self.logerr.open("log/txtbuilder.ERR.log", 1)
        self.data_lst = []
        self.data_txt_lst = []
        self.data_span_lst = []
        self.from_to_lst = []
        self.txt_rows = []
        self.up = True
        self.w_liv = 100
        self.trace = False
        self.ramis = self.set_ramis_dict()

    def set_ramis_dict(self):
        """Parse RAMIS into ``{key: {char: replacement}}``.

        Each RAMIS entry has the form ``"key|x:y,x:y,..."``.
        """
        js = {}
        for r in RAMIS:
            k, v = r.split('|')
            js[k] = {}
            for xy in v.split(','):
                x, y = xy.split(':')
                js[k][x] = y
        return js

    def get_ramis(self, key, ch):
        """Return the replacement of ``ch`` under ``key``.

        Returns ``"ERR<key>"`` or ``"ERR<ch>"`` when the lookup fails.
        """
        js = self.ramis.get(key, None)
        if js is None:
            return f"ERR{key}"
        r = js.get(ch, None)
        if r is None:
            return f"ERR{ch}"
        return r

    def fill_from_to_list(self):
        """Fill ``from_to_lst`` from the 'from', 'to', 'type' of each span.

        A span missing one of the three attributes is logged and terminates
        the process with exit code 1.
        """
        for data_span in self.data_span_lst:
            x_items = data_span.get('items', {})
            x_from = x_items.get('from', None)
            x_to = x_items.get('to', None)
            x_type = x_items.get('type', None)
            if x_from is None or x_to is None or x_type is None:
                self.logerr.log("fill_from_to_list ERROR.").prn()
                self.logerr.log(pp(data_span)).prn()
                sys.exit(1)
            self.from_to_lst.append({
                "id0": x_from,
                "id1": x_to,
                "type": x_type
            })

    def from_to_set_data_txt(self):
        """Store the opening/closing markers of every span in the records.

        The record whose id equals the span 'from' receives the START marker
        and the one whose id equals 'to' receives the END marker: '[' / ']'
        for MONOLOG spans, '{' / '}' for DIRECT spans.  When one end of the
        span is empty the problem is logged and the marker written on the
        other end carries an ``ERR`` tag.
        """
        for from_to in self.from_to_lst:
            id_from = from_to['id0'].strip()
            id_to = from_to['id1'].strip()
            span_type = from_to['type'].strip()
            err = 0
            if id_from == '':
                err = 1
            if id_to == '':
                err = 2
            if err == 1:
                self.logerr.log(f"ERROR from is null. to:{id_to}.")
            elif err == 2:
                self.logerr.log(f"ERROR from={id_from} to is null.")
            for data_txt in self.data_txt_lst:
                node_id = data_txt['id']
                if node_id == '':
                    continue
                if id_from == node_id:
                    if span_type == MONOLOG:
                        data_txt[START] = '[' if err == 0 else '[ERR '
                    elif span_type == DIRECT:
                        data_txt[START] = '{' if err == 0 else '{ERR '
                # Tested independently of the check above so that a span
                # starting and ending on the same id is also closed.
                if id_to == node_id:
                    if span_type == MONOLOG:
                        data_txt[END] = ']' if err == 0 else ' ERR]'
                    elif span_type == DIRECT:
                        data_txt[END] = '}' if err == 0 else ' ERR}'

    def set_data_txt_list(self):
        """Fill the ``t_*`` fields of every record in ``data_txt_lst``.

        * 'tag' is normalised to lower case and 't_i' set to the index;
        * 't_sp' is ' ' for 'w' records, '' otherwise;
        * 't_ln' is True for 'l' records;
        * the first record with text found after a 'pc' whose text is '.',
          '!' or '?', after a tag of NAMES_UP or after 'lg' gets
          't_up' = True;
        * 'del' records lose text and tail; the first record with text found
          after a 'sic' record loses its text.
        """
        t_up = False
        sic = False
        for i, d in enumerate(self.data_txt_lst):
            liv = d["liv"]
            tag = d['tag'].lower().strip()
            d['tag'] = tag
            text = d['text'].strip()
            d['t_i'] = i
            sp = ''
            ln = False
            if text != '':
                if t_up:
                    d['t_up'] = True
                    t_up = False
                if sic:
                    d['text'] = ''
                    sic = False
            if tag == 'w':
                sp = ' '
                self.w_liv = liv
            elif tag == 'pc':
                if text in ('.', '!', '?'):
                    t_up = True
            elif tag in NAMES_UP:
                t_up = True
            elif tag == 'lg':
                t_up = True
            elif tag == 'del':
                d['text'] = ''
                d['tail'] = ''
            elif tag == 'sic':
                sic = True
            elif tag == 'l':
                ln = True
            d['t_sp'] = sp
            d['t_ln'] = ln

    def is_in_xml_items(self, items, key, val):
        """Return True when ``items[key]``, without '#' and stripped, is ``val``."""
        v = items.get(key, '')
        v = v.replace('#', '').strip()
        return v == val

    def adjust_tail_inversion(self):
        """Move the text of a nested element in front of its parent's tail.

        Example::

            <w xml:id="Kch2h1w14">des
              <expan corresp="#ab-sus-tu">t <ex>r</ex>u </expan>c
              <c ana="#hiat">i</c>on
            </w>

            <w xml:id="Kch1p1w104">
              <expan corresp="#ab-tir-9"> <ex>con</ex> </expan>
              <expan corresp="#ab-tild-q">q<ex>ue</ex> </expan>re
            </w>

            wrong:   con q re ue
            correct: con q ue re

        "re" is the tail of <expan> and "ue" the text of <ex>; "ue" would be
        emitted later because the <ex> record follows the <expan> record.
        Fix: when a record has a tail and the next record is deeper, the
        next record's text and tail are moved in front of the current tail
        and cleared on the next record.  The first and the last record are
        never used as the current record.
        """
        records = self.data_txt_lst
        for i in range(1, len(records) - 1):
            t_curr = records[i]
            t_succ = records[i + 1]
            if t_curr['tail'] == '':
                continue
            if t_succ['liv'] > t_curr['liv']:
                # text and tail of the nested element (<ex>)
                text_succ = t_succ['text']
                tail_succ = t_succ['tail']
                t_succ['text'] = ''
                t_succ['tail'] = ''
                # followed by the tail of the outer element (<expan>)
                t_curr['tail'] = f"{text_succ}{tail_succ}{t_curr['tail']}"

    def build_txt_rows(self):
        """Build ``self.txt_rows`` from the records of ``data_txt_lst``.

        ``adjust_tail_inversion`` is applied first.  Each record then
        contributes, in order, its separator, start marker, text + tail and
        end marker; a record with 't_ln' closes the current row.  The
        remaining words form the last row.
        """
        self.adjust_tail_inversion()
        self.txt_rows = []
        words = []
        last = len(self.data_txt_lst) - 1
        for i, d in enumerate(self.data_txt_lst):
            tag = d['tag'].strip()
            text = d['text'].strip()
            tail = d['tail'].strip()
            items = d['items']
            t_start = d['t_start']
            t_sp = d['t_sp']
            t_up = d['t_up']
            t_end = d['t_end']
            t_ln = d['t_ln']
            if tag == 'c':
                if len(text) == 1:
                    k = items.get('ana', None)
                    if k is not None:
                        text = self.get_ramis(k, text)
            elif tag == 'w':
                # elision: the next record is attached without a space
                if self.is_in_xml_items(items, 'ana', 'elis'):
                    text = f'{text}{ELIS}'
                    # guard: an elided word may be the last record
                    if i < last:
                        self.data_txt_lst[i + 1]['t_sp'] = ''
                # enclitic: attached to the previous word
                if self.is_in_xml_items(items, 'ana', 'encl'):
                    text = f'{ENCL}{text} '
                    t_sp = ''
            if t_sp != '':
                words.append(t_sp)
            if t_start != '':
                words.append(t_start)
            if t_up:
                text = text.capitalize()
            else:
                text = text.lower()
            tail = tail.lower()
            w = f"{text}{tail}"
            if w != '':
                words.append(w)
            if t_end != '':
                words.append(t_end)
            if t_ln:
                self.txt_rows.append(''.join(words))
                words = []
        self.txt_rows.append(''.join(words).strip())

    def text_adjust(self):
        """Normalise punctuation spacing and turn span markers into quotes."""
        for i, rw in enumerate(self.txt_rows):
            rw = re.sub(r" ,", ", ", rw)
            rw = re.sub(r" ;", "; ", rw)
            rw = re.sub(r" \.", ". ", rw)
            rw = re.sub(r'\[\s*', ' "', rw)
            rw = re.sub(r'\]', '" ', rw)
            rw = re.sub(r'{\s*', ' "', rw)
            rw = re.sub(r'}', '" ', rw)
            rw = rw.replace(f"{ELIS} ", ELIS)
            rw = re.sub(r"\s{2,}", " ", rw)
            self.txt_rows[i] = rw.strip()

    def elab(self):
        """Run the whole pipeline on the records collected with ``add``."""
        for data in self.data_lst:
            if data['tag'] == 'span':
                self.data_span_lst.append(data)
            else:
                self.data_txt_lst.append(data)
        # fill the list of from/to ids
        self.fill_from_to_list()
        # complete the records of data_txt_lst
        self.set_data_txt_list()
        # set start and end markers in the records
        self.from_to_set_data_txt()
        # build the rows of text
        self.build_txt_rows()
        # tidy up the rows of text
        self.text_adjust()

    def add(self, data):
        """Append one node record to the input list."""
        self.data_lst.append(data)

    @property
    def txt(self):
        """The rows of text joined with ``os.linesep``."""
        return os.linesep.join(self.txt_rows)