Example #1
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'frse'
    data['_template'] = 'frse.html'

    # easily translatable info
    data['name'] = symbolreplace.tags_to_unicode(datasheet['NAME'])
    data['birth'] = datasheet['BIRTH']
    data['death'] = datasheet['DEATH']
    data['birthplace'] = symbolreplace.tags_to_unicode(datasheet['BIRTHPLACE'])
    data['profession'] = datasheet['PROFESSION']
    data['fellowship'] = datasheet['FELLOWSHIP']
    #data['biography'] = datasheet['BIOGRAPHY']

    if datasheet['BIOGRAPHY'] != '':
        assert datasheet['BIOGRAPHY'] == datasheet['FILENAME']

    elected = datasheet['ELECTED']
    if '/  /' in elected:
        # some records leave day and month blank ('/  /'); default to 1 January
        elected = elected.replace('/  /', '01/01/')
    electedDate = datetime.datetime.strptime(elected, '%d/%m/%Y')
    data['elected'] = electedDate.strftime('%Y-%m-%d')

    return data
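
The '/  /' handling above covers datasheets that record only an election year. A standalone sketch of the same normalisation (the sample dates are invented):

import datetime

def normalise_elected(elected):
    # records with an unknown day and month store '/  /'; default to 1 January
    if '/  /' in elected:
        elected = elected.replace('/  /', '01/01/')
    return datetime.datetime.strptime(elected, '%d/%m/%Y').strftime('%Y-%m-%d')

print(normalise_elected('17/05/1832'))  # 1832-05-17
print(normalise_elected('/  /1832'))    # 1832-01-01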
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'honour'
    data['_template'] = 'honour.html'

    # title and headline
    data['title'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))
    data['headline'] = htmlparser.parse(datasheet['HEADLINE'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)

    # parse biography
    data['content'] = htmlparser.parse(datasheet['CONTENT'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    data['tags'] = ''

    # alphabetical display entries
    parsed_entries = []
    if data['title'].strip() != '':
        s = data['title'].strip()
        parsed_entries.append(s)
    elif data['headline'].strip() != '':
        s = data['headline'].strip()
        parsed_entries.append(s)
    data['alphabetical'] = '\n'.join(parsed_entries)

    return data
def load():
    for letter in letters:
        # read the data
        filepath = '../datasheets/AlphaIndex/%s' % letter
        with open(filepath, 'r', encoding='mac_roman') as f:
            lines = f.readlines()

        # parse each line
        for line in lines:
            line = line.strip()
            if line == '':
                continue

            # extract the name out of this line
            match = pattern.search(line)
            assert match

            # get text
            pretext = match.group('pretext') or ''
            text = match.group('text')
            posttext = match.group('posttext')
            text = pretext + text + posttext
            text = symbolreplace.tags_to_unicode(text)
            text = symbolreplace.strip_tags(text)
            text = text.strip()
            text = text.replace(' , ', ', ')

            # get name
            name = match.group('name') or match.group('text')
            name = urls.biography_rename(name)
            data.append((name, text))
def get_displays_2(find_name):
    new_pattern = re.compile(
        r'\.\./Biographies/%s\.html">(?P<display>.+?)<td' %
        re.escape(find_name))

    displays = []
    for letter in letters:
        # read the data
        filepath = '/Users/david/Documents/MacTutor/actual-work/from-server/2/history/Indexes/%s.html' % letter
        with open(filepath, 'r', encoding='mac_roman') as f:
            data = f.read()

        for match in re.finditer(new_pattern, data):
            display = match.group('display')
            display = display.replace('</a>', '')
            display = symbolreplace.tags_to_unicode(display)
            display = symbolreplace.strip_tags(display)
            display = display.strip()
            print('%s -> %s' % (match.group('display'), display))
            displays.append(display)

    # check for (and remove) duplicates
    if len(displays) != len(set(displays)):
        with open('duplicate-displays.txt', 'a') as f:
            f.write('%s :: %s\n' % (find_name, displays))
        displays = list(set(displays))

    missing_list = ['Moriarty']
    if len(displays) == 0 and find_name not in missing_list:
        print('No displays found for %s' % find_name)
        return False

    displays.sort()
    return displays
Example #5
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'curve'
    data['_template'] = 'curve.html'

    # easily translatable info
    data['name'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['FULLNAME']))

    # need to parse the individual equations out, and convert to flow
    equations = parse_equations(datasheet['EQUATIONS'], datasheet['FILENAME'])
    data['equations'] = flow.to_flow_block('curveequation', equations)

    # parse java applet options
    options = '{\n'
    pattern = re.compile(
        r'\<PARAM NAME="(?P<name>.+?)" VALUE="(?P<value>.+?)">')
    for match in re.finditer(pattern, datasheet['JAVA']):
        name = match.group('name')
        value = match.group('value')
        line = '%s: "%s",\n' % (name, value)
        options += line
    options += '}'
    data['appletoptions'] = options

    # parse content
    data['content'] = htmlparser.parse(datasheet['CONTENTS'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    return data
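
One caveat with the hand-assembled options string above: it leaves a trailing comma before the closing brace, which JavaScript object literals tolerate but strict JSON does not. If strict JSON were ever needed, a dict plus json.dumps would avoid it; a sketch under that assumption (the sample PARAM markup mirrors the regex above):

import json
import re

PARAM = re.compile(r'<PARAM NAME="(?P<name>.+?)" VALUE="(?P<value>.+?)">')

def applet_options(java_html):
    # let json.dumps handle quoting and comma placement
    options = {m.group('name'): m.group('value') for m in PARAM.finditer(java_html)}
    return json.dumps(options, indent=2)

print(applet_options('<PARAM NAME="xmin" VALUE="-5"><PARAM NAME="xmax" VALUE="5">'))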
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'gazperson'
    data['_template'] = 'gazperson.html'

    data['name'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))

    # parse the list of people
    parsed_places = []
    places = datasheet['LINKS'].strip().split('\n')
    for place in places:
        place = place.strip()
        if place == '':
            continue

        addFragment = 'no'
        if place.endswith('*'):
            place = place[:-1]
            addFragment = 'yes'

        parsed_places.append({'place': place, 'fragment': addFragment})

    data['places'] = flow.to_flow_block('gazplaceflow', parsed_places)

    return data
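
A condensed restatement of the '*' convention above, in isolation: a trailing asterisk on a place name flags that the link should carry a fragment (the sample input is invented):

def parse_place_links(links):
    parsed = []
    for place in (p.strip() for p in links.strip().split('\n')):
        if place == '':
            continue
        # a trailing '*' marks the place as needing a fragment link
        fragment = 'no'
        if place.endswith('*'):
            place, fragment = place[:-1], 'yes'
        parsed.append({'place': place, 'fragment': fragment})
    return parsed

print(parse_place_links('Cambridge\nOxford*'))
# [{'place': 'Cambridge', 'fragment': 'no'}, {'place': 'Oxford', 'fragment': 'yes'}]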
Example #7
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'place'
    data['_hidden'] = 'yes'

    # easily translatable info
    data['name'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['PLACENAME']))
    data['country'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['COUNTRY']))
    data['webref'] = datasheet['WEBREF']
    data['gaz'] = datasheet['GAZ']

    # some places are missing a country
    if datasheet['FILENAME'] == 'Higham_Ferrers':
        data['country'] = 'England'
    elif datasheet['FILENAME'] == 'Kansas_City':
        data['country'] = 'USA'
    elif datasheet['FILENAME'] == 'Lit':
        data['country'] = 'Sweden'
    elif datasheet['FILENAME'] == 'Martos':
        data['country'] = 'Spain'

    # and some places have a malformed country
    if data['country'] == 'Czech_Republic':
        data['country'] = 'Czech Republic'
    elif data['country'] == 'Sicily':
        data['country'] = 'Italy'
    elif data['country'].endswith(')'):
        data['country'] = data['country'][:-1]
    elif data['country'] == '':
        data['country'] = '--Unknown--'

    # lat and long
    pattern = re.compile(r'(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+)')
    match = pattern.search(datasheet['LATLONG0'])
    data['latitude'] = ''
    data['longitude'] = ''
    if match:
        data['latitude'] = match.group('lat')
        data['longitude'] = match.group('long')

    return data
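
The LATLONG0 parsing rests on a single regex; a self-contained check of it (the coordinates are invented for the example):

import re

pattern = re.compile(r'(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+)')

match = pattern.search('56.3398,-2.7967')
if match:
    print(match.group('lat'), match.group('long'))  # 56.3398 -2.7967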
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'quotation'
    data['_template'] = 'quotation.html'

    # filename, name
    data['name'] = symbolreplace.tags_to_unicode(datasheet['NAME'])

    content = datasheet['CONTENT']
    numquotes = datasheet['NUMQUOTES']

    # special case cleaning rules
    if datasheet['FILENAME'] == 'Carmichael':
        content = content.replace('<p>', '')
    if datasheet['FILENAME'] in NUMBER_CORRECTIONS:
        numquotes = NUMBER_CORRECTIONS[datasheet['FILENAME']]

    # now parse the individual quotes
    content = content.split('<p>')
    quotes = []
    for quote in content:
        if quote.strip() != '':
            quotes.append(quote.strip())

    # holding 'more quotes' links, or 'translations by'
    data['more'] = ''

    if len(quotes) != 0 and 'More ' in quotes[-1] and '<a href' in quotes[-1]:
        #print('I *think* this is a *more quotes* paragraph:', quotes[-1])
        data['more'] = quotes.pop()

    if len(quotes) != 0 and data['more'] == '' and 'Translations ' in quotes[-1]:
        #print('I *think* this is a *translations by* paragraph:', quotes[-1])
        data['more'] = quotes.pop()

    if len(quotes) != int(numquotes):
        print('ERROR', len(quotes), 'expecting', int(numquotes))
        print(quotes)
        assert False

    # now parse the quotes and convert to html
    for idx, quote in enumerate(quotes):
        q = parse_quote(quote)
        q['quote'] = htmlparser.parse(q['quote'], 'Quotations/%s' % datasheet['FILENAME'], paragraphs=True, url_context=url_context)
        q['source'] = htmlparser.parse(q['source'], 'Quotations/%s' % datasheet['FILENAME'], paragraphs=False, url_context=url_context)
        quotes[idx] = q

    quotations = flow.to_flow_block('quotation', quotes)
    data['quotations'] = quotations

    return data
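
The splitting and trailing-paragraph logic above, restated in isolation (the sample content is invented):

def split_quotes(content):
    # quotes are separated by <p>; drop empty fragments
    quotes = [q.strip() for q in content.split('<p>') if q.strip()]
    more = ''
    # a trailing 'More quotes' link or 'Translations by' credit is not a quote
    if quotes and 'More ' in quotes[-1] and '<a href' in quotes[-1]:
        more = quotes.pop()
    elif quotes and 'Translations ' in quotes[-1]:
        more = quotes.pop()
    return quotes, more

quotes, more = split_quotes('First quote<p>Second quote<p>More <a href="x">quotes</a>')
print(quotes)  # ['First quote', 'Second quote']
print(more)    # More <a href="x">quotes</a>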
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'extra'
    data['_template'] = 'extra.html'

    # title, headline and update date
    data['title'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['TITLE']))
    data['headline'] = htmlparser.parse(datasheet['HEADLINE'], datasheet['FILENAME'], paragraphs=False, url_context=url_context)
    data['update'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['UPDATE']))

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'], datasheet['FILENAME'], url_context)
    data['references'] = flow.to_flow_block('reference', json.loads(references)['data'])

    # parse biography
    data['content'] = htmlparser.parse(datasheet['EXTRA'],
                                datasheet['FILENAME'],
                                paragraphs=True,
                                url_context=url_context)

    return data
Example #10
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'obituary'
    data['_template'] = 'obituary.html'

    # easily translatable info
    data['name'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['HEADING1']))
    data['summary'] = htmlparser.parse(datasheet['HEADING2'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
    data['wherefrom'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(datasheet['CONTENT'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    return data
Example #11
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'page'
    data['_template'] = 'page.html'

    # sidebar
    data['sidebar'] = ''

    # easily translatable info
    data['authors'] = htmlparser.parse(datasheet['WHODIDIT'], datasheet['FILENAME'], paragraphs=False, url_context=url_context)
    data['title'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['TITLE']))

    # check that this is a standard page
    #assert datasheet['USEHTMLFORMAT'] == 'Y'

    # need to convert it to a standard page
    content = datasheet['CONTENT']

    regex = re.compile(r'<html>(?P<content>.*?)</html>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, strip, content)

    regex = re.compile(r'<head>(?P<content>.*?)</head>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, strip_all, content)

    regex = re.compile(r'<title>(.*?)</title>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)
    regex = re.compile(r'<meta (.*?)/>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)
    regex = re.compile(r'<style>(.*?)</style>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)
    regex = re.compile(r'<body(.*?)>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)
    content = content.replace('</body>', '')
    content = content.strip()

    # also get rid of the 'show larger image' button
    regex = re.compile(r'<form>(.*?)</form>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(content,
                                datasheet['FILENAME'],
                                paragraphs=True,
                                url_context=url_context)

    return data
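
The strip and strip_all callbacks are defined elsewhere in the module; judging from how they are used, strip unwraps a block keeping its content and strip_all deletes the block entirely. A self-contained sketch under that assumption:

import re

def strip(match):
    # keep the inner content of the matched block (assumed behaviour)
    return match.group('content')

def strip_all(match):
    # drop the matched block entirely (assumed behaviour)
    return ''

content = '<html><head><title>x</title></head><body class="a">Hello</body></html>'
content = re.sub(re.compile(r'<html>(?P<content>.*?)</html>', re.MULTILINE | re.DOTALL), strip, content)
content = re.sub(re.compile(r'<head>(?P<content>.*?)</head>', re.MULTILINE | re.DOTALL), strip_all, content)
content = re.sub(re.compile(r'<body(.*?)>', re.MULTILINE | re.DOTALL), '', content)
content = content.replace('</body>', '').strip()
print(content)  # Hello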
Example #12
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'glossary'
    data['_template'] = 'glossary.html'

    # easily translatable info
    data['term'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['WORD']))

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(datasheet['CONTENTS'],
                                datasheet['FILENAME'],
                                paragraphs=True,
                                url_context=url_context)

    return data
Example #13
def parse_equations(text, filename):
    equations = []
    eqtype = None

    typeregex = re.compile(
        r'^<b><font color=green>(?P<type>.+?)</font></b>.*$')
    equationregex = re.compile(r'^(?P<equation>\\.+?\\\\)$')

    text = text.split('\n')
    for line in text:
        line = line.strip()
        typematch = typeregex.search(line)
        equationmatch = equationregex.search(line)
        if typematch:
            # it's a type!
            assert eqtype is None
            eqtype = typematch.group('type')
        elif equationmatch:
            # it's an equation!
            assert eqtype
            eqtype = symbolreplace.strip_tags(
                symbolreplace.tags_to_unicode(eqtype))
            equation = {
                'type': eqtype,
                'equation': htmlparser.parse(equationmatch.group('equation'),
                                             filename,
                                             paragraphs=False)
            }
            eqtype = None
            equations.append(equation)
        else:
            assert False

    return equations
Example #14
def mathreplace(match):
    entire = match.group(0)
    math = match.group('math')
    math = symbolreplace.symbols_to_unicode(math, katex=True)
    math = symbolreplace.tags_to_unicode(math, katex=True)

    # remove <b>...</b>
    regex = re.compile(r'<b>(.*?)</b>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <i>...</i>
    regex = re.compile(r'<i>(.*?)</i>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <u>...</u>
    regex = re.compile(r'<u>(.*?)</u>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <r>...</r>, <bl>...</bl>, <gr>...</gr> and <bro>...</bro>
    regex = re.compile(r'<(?:r|bl|gr|bro)>(.*?)</(?:r|bl|gr|bro)>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <f+>...</f> and <fp>...</fp>
    regex = re.compile(r'<f\+>(.*?)</f>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    regex = re.compile(r'<fp>(.*?)</fp>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <f++>...</f>
    regex = re.compile(r'<f\+\+>(.*?)</f>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <f->...</f> and <fm>...</fm>
    regex = re.compile(r'<f->(.*?)</f>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    regex = re.compile(r'<fm>(.*?)</fm>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <ovl>...</ovl>
    regex = re.compile(r'<ovl>(.*?)</ovl>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)

    # convert fractions
    regex = re.compile(r'\^(\S+) ?\/¬(\S+) ?', re.MULTILINE | re.DOTALL)
    # '\\over' keeps re.sub from rejecting the unknown escape '\o'
    math = re.sub(regex, r'{{\1}\\over{\2}}', math)

    # convert vector bold
    regex = re.compile(r'`(.)`', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\\mathbb{\1}', math)

    # convert ^superscript
    regex = re.compile(r'\^(\S+)(?: ?)', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'^{\1}', math)
    regex = re.compile(r'<sup>(.*?)</sup>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'^{\1}', math)

    # convert ¬subscript
    regex = re.compile(r'¬(\S+)(?: ?)', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'_{\1}', math)
    regex = re.compile(r'<sub>(.*?)</sub>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'_{\1}', math)

    # fix functions
    mappings = ['isin', 'arcsin', 'arccos', 'arctan', 'arctg', 'arg', 'ch',
                'cosec', 'cosh', 'cos', 'cotg', 'coth', 'cot', 'argmin',
                'csc', 'ctg', 'cth', 'deg', 'dim', 'exp', 'hom', 'ker',
                'lg', 'ln', 'log', 'sec', 'sinh', 'sin', 'tanh', 'tan',
                # 'sh', 'tg' and 'th' deliberately excluded
                'det', 'gcd', 'inf', 'lim', 'liminf', 'limsup', 'Pr',
                'sup', 'argmax', 'max', 'min']
    for mapping in mappings:
        # lookbehinds must be fixed-width in Python's re module, so chain
        # three of them: skip names already escaped as \..., \i... or \arc...
        regex = re.compile(r'(?<!\\)(?<!\\i)(?<!\\arc)(%s)' % mapping)
        math = re.sub(regex, r'\\\1', math)

    # split multi-line formulas: wrap each non-blank line in its own <latex> element
    lines = math.split('\n')
    output = ''
    for line in lines:
        line = line.strip()
        if line == '':
            output += '\n\n'
            continue
        output += '\n<latex>%s</latex>' % line

    return output.strip()
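
A note on the function-name escaping loop: Python's re module only accepts fixed-width lookbehinds, which is why the guard is a chain of three lookbehinds rather than one alternation. A quick check of the behaviour:

import re

# not preceded by '\', '\i' or '\arc', so already-escaped names are left alone
regex = re.compile(r'(?<!\\)(?<!\\i)(?<!\\arc)(sin)')
print(re.sub(regex, r'\\\1', r'sin x + \arcsin y'))  # \sin x + \arcsin y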
Example #15
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'historytopic'
    data['_template'] = 'historytopic.html'

    # filename, short and full name, authors, update
    data['shortname'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['SHORTNAME']))
    data['fullname'] = htmlparser.parse(datasheet['FULLNAME'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)
    data['authors'] = htmlparser.parse(datasheet['AUTHORS'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))

    # something about indexes, not sure how this is used yet
    data['indexref'] = datasheet['INDEXREF']
    data['indexreffile'] = datasheet['INDEXREFFILE']

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])

    # parse additional links (they use the same format as cross references)
    additional = referenceparser.parse_cross_references(
        datasheet['ADDITIONAL'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])

    # parse translations (use the same format as references)
    # don't add them to data, as we're combining them with bio
    translations = referenceparser.parse_references(datasheet['TRANSLATION'],
                                                    datasheet['FILENAME'],
                                                    url_context)
    translation_data = json.loads(translations)['data']
    translation_data = [{
        'number': d['number'],
        'translation': d['reference']
    } for d in translation_data]
    data['translations'] = flow.to_flow_block('translation', translation_data)

    # parse otherweb links (they use the same format as cross references)
    otherweb = referenceparser.parse_cross_references(datasheet['OTHERWEB'],
                                                      datasheet['FILENAME'],
                                                      url_context)
    data['otherweb'] = flow.to_flow_block('otherweb',
                                          json.loads(otherweb)['data'])

    # parse history topic
    data['content'] = htmlparser.parse(
        datasheet['HISTTOPIC'],
        datasheet['FILENAME'],
        translations=json.loads(translations)['data'],
        extras=json.loads(additional)['data'],
        paragraphs=True,
        url_context=url_context)

    # discover categories for this mathematician
    path = '/HistTopics/%s' % datasheet['FILENAME']
    tags = []
    #with open('../datasheets/Indexes/data.json') as f:
    #    category_data = json.load(f)
    category_data = categories.categories()
    for category in category_data:
        if path in category['entries']:
            tags.append(category['name'])
    data['tags'] = ', '.join(tags)

    # discover alphabetical index names for this history topic
    parsed_entries = []
    if 'INDEXNAMES' not in datasheet:
        if data['fullname'].strip() != '':
            parsed_entries.append(data['fullname'].strip())
        elif data['shortname'].strip() != '':
            parsed_entries.append(data['shortname'].strip())
        else:
            print('no names for this topic')
            assert False
    else:
        entries = datasheet['INDEXNAMES'].strip().split('\n')

        for entry in entries:
            entry = entry.strip()
            entry = symbolreplace.strip_tags(
                symbolreplace.tags_to_unicode(entry))
            parsed_entries.append(entry)
    data['alphabetical'] = '\n'.join(parsed_entries)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'gazplace'
    data['_template'] = 'gazplace.html'

    data['place'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))

    pattern = re.compile(r'(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+)')
    match = pattern.search(datasheet['COORDS'])
    data['latitude'] = ''
    data['longitude'] = ''
    if match:
        data['latitude'] = match.group('lat')
        data['longitude'] = match.group('long')

    # the GazData datasheets were generated with a mistake;
    # the correct CONTENTS is in GazData3, so we have to read that instead
    path = os.path.join('../datasheets/GazData3/', datasheet['FILENAME'])
    datasheet2 = datasheetparser.parse_file(path)

    # convert the references to the new style of references
    refcount = 1
    parsed_references = []
    references = datasheet['REFERENCES'].strip().split('\n')
    for reference in references:
        reference = reference.strip()
        if reference == '':
            continue
        parts = reference.split('@')
        if len(parts) != 3:
            print(reference)
            assert len(parts) == 3
        replacement = parts[0].strip()
        text = parts[2].strip()

        if replacement not in datasheet2['CONTENTS']:
            print(reference)
        assert replacement in datasheet2['CONTENTS']
        datasheet2['CONTENTS'] = datasheet2['CONTENTS'].replace(
            replacement, '[%s]' % refcount)

        parsed_references.append({
            'number': str(refcount),
            'reference': htmlparser.parse(text, datasheet['FILENAME'])
        })

        refcount = refcount + 1

    data['references'] = flow.to_flow_block('reference', parsed_references)

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(datasheet2['CONTENTS'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    if data['place'] == 'Whitburn, Tyne & Wear':
        # add in the missing lat and long
        data['latitude'] = '54.9550395'
        data['longitude'] = '-1.3867149'

    if data['latitude'] == '' and data['longitude'] == '':
        # this is not a place, it should just be a page
        newdata = {}
        newdata['_model'] = 'page'
        newdata['_template'] = 'gazplace.html'
        newdata['title'] = data['place']
        newdata['authors'] = ''
        newdata['content'] = data['content']
        return newdata

    return data
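
The '@'-delimited reference rewriting above, minus the HTML parsing step (the sample reference line is made up):

def number_references(contents, lines):
    # each line is 'marker@key@text'; swap the in-text marker for [n]
    parsed = []
    refcount = 1
    for line in (l.strip() for l in lines if l.strip()):
        marker, _, text = (part.strip() for part in line.split('@'))
        assert marker in contents
        contents = contents.replace(marker, '[%s]' % refcount)
        parsed.append({'number': str(refcount), 'reference': text})
        refcount += 1
    return contents, parsed

contents, refs = number_references('See REF1.', ['REF1@key@Smith, A History of St Andrews'])
print(contents)  # See [1].
print(refs)      # [{'number': '1', 'reference': 'Smith, A History of St Andrews'}]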
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'society'
    data['_template'] = 'society.html'

    # easily translatable info
    data['name'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLENAME']))
    data['headline'] = htmlparser.parse(datasheet['HEADLINE'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))
    data['foundation'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['FOUNDATION']))

    # external site parsing
    link = re.compile(
        r'<a\s+href ?= ?[\'"]?(?P<href>.+?)[\'"]?\s*>(?P<text>.*?)<\/a>')
    if datasheet['OTHERWEB'].strip() == '':
        data['website'] = ''
    else:
        match = link.search(datasheet['OTHERWEB'].strip())
        if not match:
            print('not a link: "%s"' % datasheet['OTHERWEB'].strip())
            assert match
        data['website'] = match.group('href')

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])

    # parse additional links (they use the same format as cross references)
    # don't add them to data, as we're combining them with bio
    additional = referenceparser.parse_cross_references(
        datasheet['EXTRAS'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(datasheet['CONTENT'],
                                       datasheet['FILENAME'],
                                       extras=json.loads(additional)['data'],
                                       paragraphs=True,
                                       url_context=url_context)

    data['tags'] = ''

    # alphabetical display entries
    parsed_entries = []
    if data['name'].strip() != '':
        s = data['name'].strip()
        parsed_entries.append(s)
    elif data['headline'].strip() != '':
        s = data['headline'].strip()
        parsed_entries.append(s)
    data['alphabetical'] = '\n'.join(parsed_entries)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'biography'
    data['_template'] = 'biography.html'

    # name and shortname
    data['shortname'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['SHORTNAME']))
    data['fullname'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['FULLNAME']))

    # authors
    data['authors'] = htmlparser.parse(datasheet['AUTHORS'], datasheet['FILENAME'], paragraphs=False, url_context=url_context)

    # last update
    data['update'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['UPDATE']))

    data['summary'] = htmlparser.parse(datasheet['SUMMARY'], datasheet['FILENAME'], paragraphs=False, url_context=url_context)

    # dates are tricky. for now leave them as they are
    data['birthdate'] = datasheet['BIRTHDATE']
    data['deathdate'] = datasheet['DEATHDATE']

    # birth and death year - remove the ,? if necessary
    date_pattern = re.compile(r'(\d+)(?:,\??)?')
    data['birthyear'] = re.sub(date_pattern, r'\1', datasheet['BIRTHYEAR'])
    data['deathyear'] = re.sub(date_pattern, r'\1', datasheet['DEATHYEAR'])

    # birthplace, deathplace
    data['birthplace'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['BIRTHPLACE']))
    data['deathplace'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['DEATHPLACE']))

    # mapinfo - just take the name, ignore mapnum and lat/long
    mapinfo = re.compile(r'\d,(?P<name>.+?),(?:(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+))?')
    match = mapinfo.search(datasheet['MAPINFO'])
    data['maplocation'] = ''
    if match:
        data['maplocation'] = match.group('name')

    # country
    data['country'] = '--Unknown--'
    if datasheet['COUNTRY'].strip() != '':
        data['country'] = datasheet['COUNTRY']

        if data['country'] == 'Czech_Republic':
            data['country'] = 'Czech Republic'
        elif data['country'] == 'Sicily':
            data['country'] = 'Italy'
        elif data['country'].endswith(')'):
            data['country'] = data['country'][:-1]
        elif data['country'] == '':
            data['country'] = '--Unknown--'

        # also add countries to global array
        if data['country'] not in countries:
            countries.append(data['country'])

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'], datasheet['FILENAME'], url_context)
    data['references'] = flow.to_flow_block('reference', json.loads(references)['data'])

    # parse translations (use the same format as references)
    # don't add them to data, as we're combining them with bio
    translations = referenceparser.parse_references(datasheet['TRANSLATION'], datasheet['FILENAME'], url_context)
    translation_data = json.loads(translations)['data']
    translation_data = [{'number':d['number'],'translation':d['reference']} for d in translation_data]
    data['translations'] = flow.to_flow_block('translation', translation_data)

    # parse cross references
    #xrefs = referenceparser.parse_cross_references(datasheet['XREFS'], datasheet['FILENAME'])
    #data['xrefs'] = xrefs

    # parse additional links (they use the same format as cross references)
    # don't add them to data, as we're combining them with bio
    additional = referenceparser.parse_cross_references(datasheet['ADDITIONAL'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb', json.loads(additional)['data'])

    # parse otherweb links (they use the same format as cross references)
    otherweb = referenceparser.parse_cross_references(datasheet['OTHERWEB'], datasheet['FILENAME'], url_context)
    data['otherweb'] = flow.to_flow_block('otherweb', json.loads(otherweb)['data'])

    # parse honours links (they use the same format as cross references)
    honours = referenceparser.parse_cross_references(datasheet['HONOURS'], datasheet['FILENAME'], url_context)
    data['honours'] = flow.to_flow_block('otherweb', json.loads(honours)['data'])

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(datasheet['BIOGRAPHY'],
                                datasheet['FILENAME'],
                                translations=json.loads(translations)['data'],
                                extras=json.loads(additional)['data'],
                                paragraphs=True,
                                url_context=url_context)

    # discover categories for this mathematician
    path = '/Biographies/%s' % datasheet['FILENAME']
    tags = []
    #with open('../datasheets/Indexes/data.json') as f:
    #    category_data = json.load(f)
    category_data = categories.categories()
    for category in category_data:
        if path in category['entries']:
            tags.append(category['name'])
    data['tags'] = ', '.join(tags)

    # discover alphabetical tags for this mathematician
    displays = alphaindexparser.get_displays_2(datasheet['FILENAME'])
    assert displays
    displays = '\n'.join(displays)
    data['alphabetical'] = displays

    return data
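
Finally, a minimal driver sketch showing how one of these converters might be invoked, assuming the project-internal datasheetparser module (used in the gazplace converter above); the datasheet path and url_context are illustrative, not taken from the real build script:

import datasheetparser

# hypothetical path and url_context, for illustration only
datasheet = datasheetparser.parse_file('../datasheets/Biographies/Newton')
page = convert(datasheet, url_context='../')
print(page['shortname'], page['update'])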