Exemplo n.º 1
0
Arquivo: rxv.py Projeto: syn2083/rxv
 def zones(self):
     """Return the YNC_Tag names of all Subunit elements in the device
     description XML, computing the list lazily and caching it on self."""
     if self._zones_cache is None:
         subunits = self._desc_xml.findall('.//*[@Func="Subunit"]')
         self._zones_cache = [node.get("YNC_Tag") for node in subunits]
     return self._zones_cache
Exemplo n.º 2
0
Arquivo: rxv.py Projeto: wuub/rxv
 def zones(self):
     """List every Subunit element's YNC_Tag, memoized in self._zones_cache."""
     if self._zones_cache is not None:
         return self._zones_cache
     self._zones_cache = [
         element.get("YNC_Tag")
         for element in self._desc_xml.findall('.//*[@Func="Subunit"]')
     ]
     return self._zones_cache
Exemplo n.º 3
0
 def _FindChildren(xml, tag):
     namespace = re.finditer('{(.*)}.*', xml.tag).next().groups()[0]
     resolved_tags = [
         '{%s}%s' % (namespace, tag) for tag in tag.split('/')
     ]
     result = xml.findall('.//' + '/'.join(resolved_tags))
     return result
Exemplo n.º 4
0
 def _FindChildren(xml, tag):
   namespace = re.finditer('{(.*)}.*', xml.tag).next().groups()[0]
   resolved_tags = ['{%s}%s' % (namespace, tag) for tag in tag.split('/')]
   result = xml.findall('.//' + '/'.join(resolved_tags))
   return result
Exemplo n.º 5
0
def iamondb_extract(partial_path):
    """Parse the IAM-OnDB XML files under ``partial_path/original`` and cache
    the preprocessed strokes, integer labels and character translation table.

    Lightly modified from https://github.com/Grzego/handwriting-generation/blob/master/preprocess.py

    Side effects: walks partial_path/original, prints progress, and writes
    dataset.npy, labels.npy and translation.pkl into
    partial_path/preprocessed_data.
    """
    # BUG FIX: HTMLParser.HTMLParser().unescape was deprecated in Python 3.4
    # and removed in 3.9; html.unescape is the stdlib replacement (this is
    # exactly what the original commented-out line intended).
    import html

    data = []
    charset = set()

    file_no = 0
    pth = os.path.join(partial_path, "original")
    for root, dirs, files in os.walk(pth):
        # sort the dirs to iterate the same way every time
        # https://stackoverflow.com/questions/18282370/os-walk-iterates-in-what-order
        dirs.sort()
        for file in files:
            file_name, extension = os.path.splitext(file)
            if extension == '.xml':
                file_no += 1
                print('[{:5d}] File {} -- '.format(file_no,
                                                   os.path.join(root, file)),
                      end='')
                xml = ElementTree.parse(os.path.join(root, file)).getroot()
                transcription = xml.findall('Transcription')
                if not transcription:
                    print('skipped')
                    continue
                texts = [
                    html.unescape(s.get('text'))
                    for s in transcription[0].findall('TextLine')
                ]
                points = [
                    s.findall('Point')
                    for s in xml.findall('StrokeSet')[0].findall('Stroke')
                ]
                strokes = []
                mid_points = []
                for ps in points:
                    # Column 2 is an end-of-stroke flag; mark the last point.
                    pts = np.array([[int(p.get('x')),
                                     int(p.get('y')), 0] for p in ps])
                    pts[-1, 2] = 1

                    pts = clear_middle(pts)
                    if len(pts) == 0:
                        continue

                    seps = separate(pts)
                    for pss in seps:
                        # Drop single-point fragments produced by splitting.
                        if len(seps) > 1 and len(pss) == 1:
                            continue
                        pss[-1, 2] = 1

                        xmax, ymax = max(pss, key=lambda x: x[0])[0], max(
                            pss, key=lambda x: x[1])[1]
                        xmin, ymin = min(pss, key=lambda x: x[0])[0], min(
                            pss, key=lambda x: x[1])[1]

                        strokes += [pss]
                        mid_points += [[(xmax + xmin) / 2.,
                                        (ymax + ymin) / 2.]]
                # Split the stroke sequence into len(texts) text lines at the
                # largest gaps between consecutive stroke midpoints.
                distances = [
                    -(abs(p1[0] - p2[0]) + abs(p1[1] - p2[1]))
                    for p1, p2 in zip(mid_points, mid_points[1:])
                ]
                splits = sorted(np.argsort(distances)[:len(texts) - 1] + 1)
                lines = []
                for b, e in zip([0] + splits, splits + [len(strokes)]):
                    lines += [[p for pts in strokes[b:e] for p in pts]]
                print('lines = {:4d}; texts = {:4d}'.format(
                    len(lines), len(texts)))
                charset |= set(''.join(texts))
                data += [(texts, lines)]
    print('data = {}; charset = ({}) {}'.format(len(data), len(charset),
                                                ''.join(sorted(charset))))

    # Map each character to a small integer id; 0 is reserved for <NULL>.
    translation = {'<NULL>': 0}
    for c in ''.join(sorted(charset)):
        translation[c] = len(translation)

    def translate(txt):
        return list(map(lambda x: translation[x], txt))

    dataset = []
    labels = []
    for texts, lines in data:
        for text, line in zip(texts, lines):
            line = np.array(line, dtype=np.float32)
            # Anchor x at 0 and center y around its mean, per line.
            line[:, 0] = line[:, 0] - np.min(line[:, 0])
            line[:, 1] = line[:, 1] - np.mean(line[:, 1])

            dataset += [line]
            labels += [translate(text)]

    whole_data = np.concatenate(dataset, axis=0)

    # Scale coordinates by the global standard deviation of y.
    std_y = np.std(whole_data[:, 1])
    norm_data = []
    for line in dataset:
        line[:, :2] /= std_y
        norm_data += [line]
    dataset = norm_data

    print('datset = {}; labels = {}'.format(len(dataset), len(labels)))

    save_path = os.path.join(partial_path, 'preprocessed_data')
    # Simpler than try/except FileExistsError.
    os.makedirs(save_path, exist_ok=True)
    # BUG FIX: lines have different lengths, so force a ragged object array;
    # np.array on ragged sequences raises ValueError in NumPy >= 1.24.
    np.save(os.path.join(save_path, 'dataset'), np.array(dataset, dtype=object))
    np.save(os.path.join(save_path, 'labels'), np.array(labels, dtype=object))
    with open(os.path.join(save_path, 'translation.pkl'), 'wb') as file:
        pickle.dump(translation, file)
    print("Preprocessing finished and cached at {}".format(save_path))
Exemplo n.º 6
0
def getXmlTagByXpath(xml, xpath):
    """Return the list of elements under *xml* matching *xpath* (findall)."""
    return xml.findall(xpath)