def zones(self):
    # Lazily derive the zone list from the cached device description XML:
    # every element marked Func="Subunit" describes a zone, and its YNC_Tag
    # attribute is the name used to address that zone.
    if self._zones_cache is None:
        xml = self._desc_xml
        self._zones_cache = [
            e.get("YNC_Tag") for e in xml.findall('.//*[@Func="Subunit"]')
        ]
    return self._zones_cache
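# Hedged usage sketch (not from the original project): zones() above is a
# method, so here it is exercised through a minimal stand-in object. The
# sample description document and its YNC_Tag values are assumptions chosen
# for illustration only.
from xml.etree import ElementTree


class _FakeReceiver:
    pass


dev = _FakeReceiver()
dev._zones_cache = None
dev._desc_xml = ElementTree.fromstring(
    '<Unit Func="Source">'
    '<Menu Func="Subunit" YNC_Tag="Main_Zone"/>'
    '<Menu Func="Subunit" YNC_Tag="Zone_2"/>'
    '</Unit>'
)
print(zones(dev))  # -> ['Main_Zone', 'Zone_2']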
import re


def _FindChildren(xml, tag):
    # The document uses a default namespace, so recover it from the root tag
    # (ElementTree stores tags as '{namespace}name') and prefix every step of
    # the slash-separated path with it before calling findall().
    namespace = next(re.finditer('{(.*)}.*', xml.tag)).groups()[0]
    resolved_tags = ['{%s}%s' % (namespace, t) for t in tag.split('/')]
    result = xml.findall('.//' + '/'.join(resolved_tags))
    return result
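# Hedged usage sketch (the namespace URI and element names are assumptions):
# _FindChildren lets callers write namespace-free paths against a document
# that declares a default namespace on its root element.
from xml.etree import ElementTree

doc = ElementTree.fromstring(
    '<root xmlns="http://example.com/ns">'
    '<item><name>first</name></item>'
    '<item><name>second</name></item>'
    '</root>'
)
names = _FindChildren(doc, 'item/name')
print([n.text for n in names])  # -> ['first', 'second']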
import os
import pickle
import html
import numpy as np
from xml.etree import ElementTree


def iamondb_extract(partial_path):
    """
    Lightly modified from
    https://github.com/Grzego/handwriting-generation/blob/master/preprocess.py

    Walks the IAM-OnDB "original" XML tree, pairs each transcription line with
    its strokes, normalizes the coordinates, and caches the result as numpy
    arrays plus a character translation table. clear_middle() and separate()
    are stroke-cleaning helpers defined alongside this function in the
    referenced preprocess.py.
    """
    data = []
    charset = set()

    file_no = 0
    pth = os.path.join(partial_path, "original")
    for root, dirs, files in os.walk(pth):
        # sort the dirs to iterate the same way every time
        # https://stackoverflow.com/questions/18282370/os-walk-iterates-in-what-order
        dirs.sort()
        for file in files:
            file_name, extension = os.path.splitext(file)
            if extension == '.xml':
                file_no += 1
                print('[{:5d}] File {} -- '.format(file_no, os.path.join(root, file)), end='')
                xml = ElementTree.parse(os.path.join(root, file)).getroot()
                transcription = xml.findall('Transcription')
                if not transcription:
                    print('skipped')
                    continue
                texts = [html.unescape(s.get('text'))
                         for s in transcription[0].findall('TextLine')]
                points = [s.findall('Point')
                          for s in xml.findall('StrokeSet')[0].findall('Stroke')]
                strokes = []
                mid_points = []
                for ps in points:
                    pts = np.array([[int(p.get('x')), int(p.get('y')), 0] for p in ps])
                    pts[-1, 2] = 1

                    pts = clear_middle(pts)
                    if len(pts) == 0:
                        continue

                    seps = separate(pts)
                    for pss in seps:
                        if len(seps) > 1 and len(pss) == 1:
                            continue
                        pss[-1, 2] = 1

                        xmax, ymax = max(pss, key=lambda x: x[0])[0], max(pss, key=lambda x: x[1])[1]
                        xmin, ymin = min(pss, key=lambda x: x[0])[0], min(pss, key=lambda x: x[1])[1]

                        strokes += [pss]
                        mid_points += [[(xmax + xmin) / 2., (ymax + ymin) / 2.]]
                # Split the strokes into as many groups as there are text lines,
                # cutting at the largest gaps between consecutive stroke centers.
                distances = [-(abs(p1[0] - p2[0]) + abs(p1[1] - p2[1]))
                             for p1, p2 in zip(mid_points, mid_points[1:])]
                splits = sorted(np.argsort(distances)[:len(texts) - 1] + 1)
                lines = []
                for b, e in zip([0] + splits, splits + [len(strokes)]):
                    lines += [[p for pts in strokes[b:e] for p in pts]]
                print('lines = {:4d}; texts = {:4d}'.format(len(lines), len(texts)))
                charset |= set(''.join(texts))
                data += [(texts, lines)]
    print('data = {}; charset = ({}) {}'.format(len(data), len(charset), ''.join(sorted(charset))))

    translation = {'<NULL>': 0}
    for c in ''.join(sorted(charset)):
        translation[c] = len(translation)

    def translate(txt):
        return list(map(lambda x: translation[x], txt))

    dataset = []
    labels = []
    for texts, lines in data:
        for text, line in zip(texts, lines):
            line = np.array(line, dtype=np.float32)
            line[:, 0] = line[:, 0] - np.min(line[:, 0])
            line[:, 1] = line[:, 1] - np.mean(line[:, 1])

            dataset += [line]
            labels += [translate(text)]

    whole_data = np.concatenate(dataset, axis=0)
    std_y = np.std(whole_data[:, 1])
    norm_data = []
    for line in dataset:
        line[:, :2] /= std_y
        norm_data += [line]
    dataset = norm_data

    print('dataset = {}; labels = {}'.format(len(dataset), len(labels)))

    save_path = os.path.join(partial_path, 'preprocessed_data')
    try:
        os.makedirs(save_path)
    except FileExistsError:
        pass

    # The per-line arrays are ragged, so newer NumPy needs dtype=object here.
    np.save(os.path.join(save_path, 'dataset'), np.array(dataset, dtype=object))
    np.save(os.path.join(save_path, 'labels'), np.array(labels, dtype=object))
    with open(os.path.join(save_path, 'translation.pkl'), 'wb') as file:
        pickle.dump(translation, file)
    print("Preprocessing finished and cached at {}".format(save_path))
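# Hedged usage sketch: the corpus location below is an assumption; the cache
# file names come from the save calls at the end of iamondb_extract(). Running
# the extraction requires the IAM-OnDB "original" XML tree to be present.
import os
import pickle
import numpy as np

data_root = 'data/iamondb'          # assumed location of the downloaded corpus
iamondb_extract(data_root)          # writes <data_root>/preprocessed_data once

cache = os.path.join(data_root, 'preprocessed_data')
dataset = np.load(os.path.join(cache, 'dataset.npy'), allow_pickle=True)
labels = np.load(os.path.join(cache, 'labels.npy'), allow_pickle=True)
with open(os.path.join(cache, 'translation.pkl'), 'rb') as f:
    translation = pickle.load(f)

print(len(dataset), len(labels), len(translation))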
def getXmlTagByXpath(xml, xpath):
    # Thin wrapper: return every element matching the (limited) XPath
    # expression supported by ElementTree's findall().
    road_level_infos = xml.findall(xpath)
    return road_level_infos
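# Hedged usage sketch; the element names and XPath are assumptions chosen to
# match the wrapper's variable name, not taken from the original project.
from xml.etree import ElementTree

doc = ElementTree.fromstring(
    '<map>'
    '<road level="1"><name>A1</name></road>'
    '<road level="2"><name>B7</name></road>'
    '</map>'
)
for road in getXmlTagByXpath(doc, './/road'):
    print(road.get('level'), road.find('name').text)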