Example #1
def find_pubid(rowstr):
    '''rowstr is a concatenation of all metadata fields.
    Returns None if the file does not exist or is empty.
    '''
    try:
        regexes = utils.parse_tsv_data(open(PUBREGEX_TSV, 'r').read())
    except FileNotFoundError:
        utils.error("File not exists: %s" % PUBREGEX_TSV, severity='WARNING')
        return None

    matching = set()
    for r in regexes:
        m = re.search(r['regex'], rowstr, flags=re.IGNORECASE)
        if m:
            matching.add(r['pubid'])

    if not matching:
        utils.warn("%s: no regex matches" % rowstr)
    else:
        if len(matching) > 1:
            utils.warn("%s: too many regex matches (%s)" % (rowstr, " ".join(matching)))
            return None
        else:
            return matching.pop()

    return None
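
A minimal usage sketch, assuming PUBREGEX_TSV names a tab-separated file whose rows carry the 'pubid' and 'regex' columns read in the loop above; the rows and the call below are illustrative, not part of the original module.

# hypothetical pub_regex.tsv rows (tab-separated), for illustration:
#   pubid    regex
#   nyt      New York Times
#   lat      Los Angeles Times|L\. ?A\. Times
rowstr = "2016-05-01 New York Times Will Shortz"  # concatenation of metadata fields
print(find_pubid(rowstr))  # "nyt" here; None if zero or more than one pubid matches
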
Example #2
File: remix.py  Project: rbairwell/xd
def main():
    args = get_args("reclue puzzle with clues from other publications")
    outf = open_output()

    all_clues = load_clues()

    missing_tsv = COLUMN_SEPARATOR.join(
        ["grid_xdid", "clues_pubid", "num_missing"]) + EOL

    for fn, contents in find_files(*args.inputs, ext=".xd"):
        xd = xdfile(contents, fn)
        if not xd.grid:
            continue
        xd.set_header("Title", None)
        xd.set_header("Editor", "Timothy Parker Bot")
        xd.set_header(
            "Author",
            "%s %s" % (random.choice(fake_first), random.choice(fake_last)))
        xd.set_header("Copyright", None)
        xd.set_header("Date", iso8601())

        remixed = set()
        for pubid, pub_clues in list(all_clues.items()):
            try:
                if pubid == xd.publication_id():
                    continue  # don't use same publisher's clues

                nmissing = reclue(xd, pub_clues)

                outfn = "%s-%s.xd" % (xd.xdid(), pubid)

                if nmissing == 0:
                    nmutated = 0
                    while nmutated < 100:
                        nmutated += mutate(xd, pub_clues)
                    nmissing = reclue(xd, pub_clues)
                    info("%s missing %d clues after %d mutations" %
                         (outfn, nmissing, nmutated))

                    remixed.add(pubid)
                    outf.write_file(outfn, xd.to_unicode())
                else:
                    debug("%s missing %d clues" % (outfn, nmissing))

                    missing_tsv += COLUMN_SEPARATOR.join(
                        [xd.xdid(), pubid, str(nmissing)]) + EOL

            except Exception as e:
                error("remix error %s" % str(e))

        if remixed:
            info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
            try:
                outf.write_file(
                    parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
            except Exception as e:
                error("couldn't write: " + str(e))

    outf.write_file("remix.log", get_log().encode("utf-8"))
    outf.write_file("remix.tsv", missing_tsv)
Example #3
def clean_headers(xd):
    # remove known unwanted header fields, log unknown headers
    for hdr in list(xd.headers.keys()):
        if hdr in ["Source", "Identifier", "Acquired", "Issued", "Category"]:
            xd.set_header(hdr, None)
        else:
            if hdr.lower() not in xdfile.HEADER_ORDER:
                utils.warn("%s: '%s' header not known: '%s'" %
                           (xd.filename, hdr, xd.headers[hdr]))

    # clean Author and Editor headers
    author = xd.get_header("Author") or ""
    if not author:
        if xd.get_header("Creator"):
            assert not author
            author = xd.get_header("Creator")
            xd.set_header("Creator", None)

    editor = xd.get_header("Editor") or ""

    newauthor, neweditor = clean_author(author, editor)

    if newauthor != author:
        xd.set_header("Author" + CLEAN_SUFFIX, newauthor)

    if neweditor != editor:
        xd.set_header("Editor" + CLEAN_SUFFIX, neweditor)

    # clean Title header
    title = xd.get_header("Title") or ""
    newtitle = clean_title(title)

    if newtitle != title:
        xd.set_header("Title" + CLEAN_SUFFIX, newtitle)
    # create Date header
    dt = xd.get_header("Date")

    ## try getting Date from filename
    if not dt:
        try:
            d = utils.parse_date_from_filename(xd.filename)
            if d:
                dt = d.strftime("%Y-%m-%d")
        except Exception as e:
            utils.error(str(e))
            if args.debug:
                raise

    ## try getting Date from copyright
    if not dt:
        rights = xd.get_header("Copyright") or ""
        dt = find_date(rights)

    if dt:
        xd.set_header("Date", dt)
Example #4
File: cloud.py  Project: rbairwell/xd
def xd_send_email(destaddr, fromaddr='*****@*****.**', subject='', body=''):
    client = boto3.client('ses', region_name=os.environ['REGION'])
    info("sending email to %s (subject '%s')" % (destaddr, subject))
    try:
        response = client.send_email(
                Source=fromaddr,
                Destination= {'ToAddresses': [ destaddr ] },
                Message={ 'Subject': { 'Data': subject },
                'Body': { 'Text': { 'Data': body } } })
        return response
    except Exception as e:
        error("xd_send_email(): %s" % str(e))
        return None
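
A hedged usage sketch, assuming working AWS SES credentials and that REGION is exported in the environment (the function reads it directly); both addresses below are placeholders.

import os

os.environ.setdefault('REGION', 'us-east-1')  # xd_send_email() looks REGION up itself
resp = xd_send_email('someone@example.com',
                     fromaddr='bot@example.com',
                     subject='xd: nightly report',
                     body='see remix.log for details')
if resp:
    info("queued as %s" % resp.get('MessageId'))
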
Example #5
def main():
    global args
    parsers = {
        '.xml': [parse_ccxml, parse_uxml],
        '.json': [parse_ujson],
        '.puz': [parse_puz],
        '.html': [parse_xwordinfo],
        '.pdf': [],
        '.jpg': [],
        '.gif': [],
        '.xd': [],  # special case, just copy the input, in case re-emitting screws it up
    }

    p = args_parser('convert crosswords to .xd format')
    p.add_argument('--copyright', default=None, help='Default value for unspecified Copyright headers')
    p.add_argument('--extsrc', default=None, help='Value for receipts.ExternalSource')
    p.add_argument('--intsrc', default=None, help='Value for receipts.InternalSource')
    p.add_argument('--pubid', default=None, help='PublicationAbbr (pubid) to use')
    args = get_args(parser=p)

    outf = open_output()

    for input_source in args.inputs:
        try:
            # collect 'sources' metadata
            source_files = {}
            # collect receipts
            receipts = []

            for fn, contents, dt in find_files_with_time(input_source, ext='.tsv'):
                progress(fn)
                for row in parse_tsv_data(contents.decode('utf-8'), "Source"):
                    innerfn = strip_toplevel(row.SourceFilename)
                    if innerfn in source_files:
                        warn("%s: already in source_files!" % innerfn)
                        continue
                    source_files[innerfn] = row

            # enumerate all files in this source, reverse-sorted by time
            #  (so most recent edition gets main slot in case of shelving
            #  conflict)
            for fn, contents, dt in sorted(find_files_with_time(input_source, strip_toplevel=False), reverse=True, key=lambda x: x[2]):
                if fn.endswith(".tsv") or fn.endswith(".log"):
                    continue

                if not contents:  # 0-length files
                    continue

                innerfn = strip_toplevel(fn)
                if innerfn in source_files:
                    srcrow = source_files[innerfn]
                    CaptureTime = srcrow.DownloadTime
                    ExternalSource = args.extsrc or srcrow.ExternalSource
                    SourceFilename = innerfn
                else:
                    debug("%s not in sources.tsv" % innerfn)
                    CaptureTime = iso8601(dt)
                    ExternalSource = args.extsrc or parse_pathname(input_source).filename
                    SourceFilename = innerfn

                ReceivedTime = iso8601(time.time())
                InternalSource = args.intsrc or parse_pathname(input_source).filename

                already_received = metadb.check_already_received(ExternalSource, SourceFilename)
                xdid = ""
                prev_xdid = ""  # unshelved by default

                existing_xdids = set(r.xdid for r in already_received)
                if existing_xdids:
                    if len(existing_xdids) > 1:
                        warn('previously received this same file under multiple xdids:' + ' '.join(existing_xdids))
                    else:
                        prev_xdid = existing_xdids.pop()
                        debug('already received as %s' % prev_xdid)

                # try each parser by extension
                ext = parse_pathname(fn).ext.lower()
                possible_parsers = parsers.get(ext, parsers[".puz"])

                progress(fn)

                if ext == ".xd":
                    outf.write_file(fn, contents.decode('utf-8'), dt)
                elif not possible_parsers:
                    rejected = "no parser"
                else:
                    rejected = ""
                    for parsefunc in possible_parsers:
                        try:
                            try:
                                xd = parsefunc(contents, fn)
                            except IncompletePuzzleParse as e:
                                error("%s  %s" % (fn, e))
                                xd = e.xd
                            if not xd:
                                continue

                            xd.filename = replace_ext(strip_toplevel(fn), ".xd")
                            if not xd.get_header("Copyright"):
                                if args.copyright:
                                    xd.set_header("Copyright", args.copyright)

                            catalog.deduce_set_seqnum(xd)

                            xdstr = xd.to_unicode()

                            mdtext = "|".join((ExternalSource,InternalSource,SourceFilename))
                            xdid = prev_xdid or catalog.deduce_xdid(xd, mdtext)
                            path = catalog.get_shelf_path(xd, args.pubid, mdtext)
                            outf.write_file(path + ".xd", xdstr, dt)

                            rejected = ""
                            break  # stop after first successful parsing
                        except xdfile.NoShelfError as e:
                            error("could not shelve: %s" % str(e))
                            rejected += "[shelver] %s  " % str(e)
                        except Exception as e:
                            error("%s could not convert [%s]: %s" % (parsefunc.__name__, fn, str(e)))
                            rejected += "[%s] %s  " % (parsefunc.__name__, str(e))
                            # raise

                    if rejected:
                        error("could not convert: %s" % rejected)

                    # only add receipt if first time converting this source
                    if already_received:
                        debug("already received %s:%s" % (ExternalSource, SourceFilename))
                    else:
                        receipts.append([
                            CaptureTime,
                            ReceivedTime,
                            ExternalSource,
                            InternalSource,
                            SourceFilename,
                            xdid
                        ])

            for r in receipts:
                metadb.append_row('gxd/receipts', r)

        except Exception as e:
            error(str(e))
            if args.debug:
                raise
Example #6
File: ccxml2xd.py  Project: rbairwell/xd
def parse_ccxml(data, filename):
    content = data.decode('utf-8', errors='replace')
    content = escape(content, xml_escape_table)
    content = consecutive(content)
    content = re.sub(r'(=["]{2}([^"]+?)["]{2})+',r'="&quot;\2&quot;"', content) # Replace double quotes
    content_xml = content.encode('utf-8')

    ns = {
        'puzzle': 'http://crossword.info/xml/rectangular-puzzle'
    }
    try:
        root = etree.fromstring(content_xml)
    except Exception as e:
        error('Exception %s' % e)
        error(content)
        return None  # bail out: the XML could not be parsed

    # init crossword
    grid = root.xpath('//puzzle:crossword/puzzle:grid', namespaces=ns)
    if not grid:
        return None

    grid = grid[0]
    rows = int(grid.attrib['height'])
    cols = int(grid.attrib['width'])

    xd = xdfile.xdfile('', filename)

    # add metadata
    for metadata in root.xpath('//puzzle:metadata', namespaces=ns)[0]:
        text = metadata.text and metadata.text.strip()
        title = re.sub(r'\{[^\}]*\}', '', metadata.tag.title())
        title = escape(title, rev_xml_escape_table)
        if text:
            text = escape(text, rev_xml_escape_table)
            xd.set_header(HEADER_RENAMES.get(title, title), text)

    # add puzzle
    puzzle = []
    for i in range(rows):
        puzzle.append([" "] * cols)

    for cell in grid.xpath('./puzzle:cell', namespaces=ns):
        x = int(cell.attrib['x']) - 1
        y = int(cell.attrib['y']) - 1
        if 'solution' in cell.attrib:
            value = cell.attrib['solution']
        if 'type' in cell.attrib and cell.attrib['type'] == 'block':
            value = xdfile.BLOCK_CHAR
        puzzle[y][x] = value

    xd.grid = ["".join(row) for row in puzzle]

    # add clues
    word_map = {}
    for word in root.xpath('//puzzle:crossword/puzzle:word', namespaces=ns):
        word_map[word.attrib['id']] = (word.attrib['x'], word.attrib['y'])

    for clues in root.xpath('//puzzle:crossword/puzzle:clues', namespaces=ns):
        type = clues.xpath('./puzzle:title', namespaces=ns)[0]
        type = "".join(chr(x) for x in etree.tostring(type, method='text').upper() if chr(x) in string.ascii_uppercase)
        type = type[0]

        for clue in clues.xpath('./puzzle:clue', namespaces=ns):
            word_id = clue.attrib['word']
            number = int(clue.attrib['number'])
            text = "|".join(clue.itertext()).strip()
            text = escape(text, rev_xml_escape_table)
            solution = get_solution(word_id, word_map, puzzle)
            xd.clues.append(((type, number), text, solution))

    return xd
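
A brief usage sketch, assuming a Crossword Compiler XML export is on disk; the path is a placeholder, and the bytes are passed in unmodified since the parser decodes them itself.

with open('puzzles/example-ccxml.xml', 'rb') as f:  # hypothetical path
    xd = parse_ccxml(f.read(), 'example-ccxml.xml')

if xd:
    print(xd.get_header('Title'))
    print('\n'.join(xd.grid))   # one string per row, BLOCK_CHAR for blocks
    print(len(xd.clues), 'clues')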