def convert(datasheet, url_context):
    """Convert an FRSE (Fellow of the Royal Society of Edinburgh) datasheet
    into template data for the 'frse' model.

    Raises ValueError (from strptime) if ELECTED is not d/m/Y after the
    '/ /' placeholder fix-up, and AssertionError if BIOGRAPHY names a
    different file than FILENAME.
    """
    data = {}
    # metadata, the template and model
    data['_model'] = 'frse'
    data['_template'] = 'frse.html'
    # easily translatable info
    data['name'] = symbolreplace.tags_to_unicode(datasheet['NAME'])
    data['birth'] = datasheet['BIRTH']
    # BUG FIX: this second assignment previously targeted data['birth'],
    # clobbering the birth date and leaving no 'death' key at all
    data['death'] = datasheet['DEATH']
    data['birthplace'] = symbolreplace.tags_to_unicode(datasheet['BIRTHPLACE'])
    data['profession'] = datasheet['PROFESSION']
    data['fellowship'] = datasheet['FELLOWSHIP']
    #data['biography'] = datasheet['BIOGRAPHY']
    # a non-empty BIOGRAPHY field must point at this sheet's own file
    if datasheet['BIOGRAPHY'] != '':
        assert datasheet['BIOGRAPHY'] == datasheet['FILENAME']
    elected = datasheet['ELECTED']
    if '/ /' in elected:
        # hack to make it work: missing day/month placeholders become 1 Jan
        elected = elected.replace('/ /', '01/01/')
    electedDate = datetime.datetime.strptime(elected, '%d/%m/%Y')
    # normalise the election date to ISO format
    data['elected'] = electedDate.strftime('%Y-%m-%d')
    return data
def convert(datasheet, url_context):
    """Translate an honour datasheet into template data for the
    'honour' model."""
    filename = datasheet['FILENAME']
    data = {
        '_model': 'honour',
        '_template': 'honour.html',
        # title, headline and body content
        'title': symbolreplace.strip_tags(
            symbolreplace.tags_to_unicode(datasheet['TITLE'])),
        'headline': htmlparser.parse(datasheet['HEADLINE'], filename,
                                     paragraphs=False,
                                     url_context=url_context),
        'content': htmlparser.parse(datasheet['CONTENT'], filename,
                                    paragraphs=True,
                                    url_context=url_context),
        'tags': '',
    }
    # alphabetical index entry: prefer the title, fall back to the headline
    entries = []
    title = data['title'].strip()
    headline = data['headline'].strip()
    if title != '':
        entries.append(title)
    elif headline != '':
        entries.append(headline)
    data['alphabetical'] = '\n'.join(entries)
    return data
def load():
    """Read every AlphaIndex datasheet and append one (name, text) pair
    per index line to the module-level `data` list.

    Relies on the module-level `letters` iterable and compiled `pattern`;
    raises AssertionError on any non-blank line the pattern cannot match.
    """
    for letter in letters:
        # read the data for this letter of the index
        index_path = '../datasheets/AlphaIndex/%s' % letter
        with open(index_path, 'r', encoding='mac_roman') as handle:
            raw_lines = handle.readlines()
        for raw in raw_lines:
            raw = raw.strip()
            if raw == '':
                continue
            # every non-blank line must match the index-entry pattern
            match = pattern.search(raw)
            assert match
            # assemble the display text from its three captured pieces
            combined = (match.group('pretext') or '') \
                + match.group('text') + match.group('posttext')
            combined = symbolreplace.tags_to_unicode(combined)
            combined = symbolreplace.strip_tags(combined)
            combined = combined.strip().replace(' , ', ', ')
            # the link name falls back to the display text when absent
            display_name = urls.biography_rename(
                match.group('name') or match.group('text'))
            data.append((display_name, combined))
def get_displays_2(find_name):
    """Collect the alphabetical-index display strings for *find_name*.

    Scans the per-letter index HTML files for anchors pointing at the
    biography, cleans the anchor text, de-duplicates (logging duplicates
    to duplicate-displays.txt), and returns the sorted list — or False
    when nothing was found for a name not in the known-missing list.
    """
    new_pattern = re.compile(
        r'\.\./Biographies/%s\.html">(?P<display>.+?)<td'
        % re.escape(find_name))
    displays = []
    for letter in letters:
        # read the index page for this letter
        filepath = ('/Users/david/Documents/MacTutor/actual-work/'
                    'from-server/2/history/Indexes/%s.html') % letter
        with open(filepath, 'r', encoding='mac_roman') as f:
            page = f.read()
        for match in re.finditer(new_pattern, page):
            raw_display = match.group('display')
            # strip the closing anchor and normalise the text
            cleaned = raw_display.replace('</a>', '')
            cleaned = symbolreplace.tags_to_unicode(cleaned)
            cleaned = symbolreplace.strip_tags(cleaned)
            cleaned = cleaned.strip()
            print('%s -> %s' % (raw_display, cleaned))
            displays.append(cleaned)
    # check for (and remove) duplicates, logging them for later review
    if len(displays) != len(set(displays)):
        with open('duplicate-displays.txt', 'a') as f:
            f.write('%s :: %s\n' % (find_name, displays))
        displays = list(set(displays))
    # names known to legitimately have no index entry
    missing_list = ['Moriarty']
    if len(displays) == 0 and find_name not in missing_list:
        print('No displays found for %s' % find_name)
        return False
    displays.sort()
    return displays
def convert(datasheet, url_context):
    """Build template data for the 'curve' model from a datasheet."""
    filename = datasheet['FILENAME']
    data = {
        '_model': 'curve',
        '_template': 'curve.html',
        # easily translatable info
        'name': symbolreplace.strip_tags(
            symbolreplace.tags_to_unicode(datasheet['FULLNAME'])),
    }
    # need to parse the individual equations out, and convert to flow
    data['equations'] = flow.to_flow_block(
        'curveequation',
        parse_equations(datasheet['EQUATIONS'], filename))
    # collect the applet's <PARAM> name/value pairs into a
    # javascript-style options object literal
    param_pattern = re.compile(
        r'\<PARAM NAME="(?P<name>.+?)" VALUE="(?P<value>.+?)">')
    pieces = ['{\n']
    for param in re.finditer(param_pattern, datasheet['JAVA']):
        pieces.append('%s: "%s",\n'
                      % (param.group('name'), param.group('value')))
    pieces.append('}')
    data['appletoptions'] = ''.join(pieces)
    # parse content
    data['content'] = htmlparser.parse(datasheet['CONTENTS'], filename,
                                       paragraphs=True,
                                       url_context=url_context)
    return data
def convert(datasheet, url_context):
    """Build template data for the 'gazperson' model."""
    data = {
        '_model': 'gazperson',
        '_template': 'gazperson.html',
        'name': symbolreplace.strip_tags(
            symbolreplace.tags_to_unicode(datasheet['TITLE'])),
    }
    # each non-blank LINKS line names a place; a trailing '*' marks that
    # the generated link should carry a fragment
    parsed_places = []
    for raw in datasheet['LINKS'].strip().split('\n'):
        raw = raw.strip()
        if raw == '':
            continue
        if raw.endswith('*'):
            parsed_places.append({'place': raw[:-1], 'fragment': 'yes'})
        else:
            parsed_places.append({'place': raw, 'fragment': 'no'})
    data['places'] = flow.to_flow_block('gazplaceflow', parsed_places)
    return data
def convert(datasheet, url_context):
    """Build data for the hidden 'place' model (no template of its own).

    Patches known-bad datasheets (missing or malformed countries) and
    extracts latitude/longitude from the LATLONG0 field when present.
    """
    data = {}
    # metadata, the template and model
    data['_model'] = 'place'
    data['_hidden'] = 'yes'
    # easily translatable info
    data['name'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['PLACENAME']))
    data['country'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['COUNTRY']))
    data['webref'] = datasheet['WEBREF']
    data['gaz'] = datasheet['GAZ']
    # some places are missing a country
    if datasheet['FILENAME'] == 'Higham_Ferrers':
        data['country'] = 'England'
    elif datasheet['FILENAME'] == 'Kansas_City':
        data['country'] = 'USA'
    elif datasheet['FILENAME'] == 'Lit':
        data['country'] = 'Sweden'
    elif datasheet['FILENAME'] == 'Martos':
        data['country'] = 'Spain'
    # and some places have a malformed country
    if data['country'] == 'Czech_Republic':
        data['country'] = 'Czech Republic'
    elif data['country'] == 'Sicily':
        data['country'] = 'Italy'
    elif data['country'].endswith(')'):
        # strip a stray trailing parenthesis
        data['country'] = data['country'][:-1]
    elif data['country'] == '':
        # BUG FIX: this was `==` (a no-op comparison), so the
        # unknown-country fallback was never actually applied
        data['country'] = '--Unknown--'
    # lat and long, when LATLONG0 holds a "lat,long" pair
    pattern = re.compile(r'(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+)')
    match = pattern.search(datasheet['LATLONG0'])
    data['latitude'] = ''
    data['longitude'] = ''
    if match:
        data['latitude'] = match.group('lat')
        data['longitude'] = match.group('long')
    return data
def convert(datasheet, url_context):
    """Build template data for the 'quotation' model.

    Splits CONTENT on '<p>' into individual quotes, peels off a trailing
    'More quotes' / 'Translations by' paragraph into data['more'], checks
    the count against NUMQUOTES (with per-file corrections), and converts
    each quote to HTML.  Raises AssertionError on a count mismatch.
    """
    data = {}
    # metadata, the template and model
    data['_model'] = 'quotation'
    data['_template'] = 'quotation.html'
    # filename, name
    data['name'] = symbolreplace.tags_to_unicode(datasheet['NAME'])
    content = datasheet['CONTENT']
    numquotes = datasheet['NUMQUOTES']
    # special case cleaning rules
    if datasheet['FILENAME'] == 'Carmichael':
        content = content.replace('<p>', '')
    if datasheet['FILENAME'] in NUMBER_CORRECTIONS:
        numquotes = NUMBER_CORRECTIONS[datasheet['FILENAME']]
    # now parse the individual quotes
    content = content.split('<p>')
    quotes = []
    for quote in content:
        if quote.strip() != '':
            quotes.append(quote.strip())
    # holding 'more quotes' links, or 'translations by'
    data['more'] = ''
    if len(quotes) != 0 and 'More ' in quotes[-1] and '<a href' in quotes[-1]:
        #print('I *think* this is a *more quotes* paragraph:', quotes[-1])
        data['more'] = quotes.pop()
    if len(quotes) != 0 and data['more'] == '' and 'Translations ' in quotes[-1]:
        #print('I *think* this is a *translations by* paragraph:', quotes[-1])
        data['more'] = quotes.pop()
    if len(quotes) != int(numquotes):
        # typo fix: message previously read 'expcting'
        print('ERROR', len(quotes), 'expecting', int(numquotes))
        print(quotes)
        assert False
    # now parse the quotes and convert to html
    for idx, quote in enumerate(quotes):
        q = parse_quote(quote)
        q['quote'] = htmlparser.parse(q['quote'],
                                      'Quotations/%s' % datasheet['FILENAME'],
                                      paragraphs=True,
                                      url_context=url_context)
        q['source'] = htmlparser.parse(q['source'],
                                       'Quotations/%s' % datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
        quotes[idx] = q
    quotations = flow.to_flow_block('quotation', quotes)
    data['quotations'] = quotations
    # BUG FIX: previously returned only the quotations flow block,
    # discarding the _model/_template metadata that every other converter
    # in this file returns via `data`
    return data
def convert(datasheet, url_context):
    """Build template data for the 'extra' model."""
    filename = datasheet['FILENAME']
    data = {'_model': 'extra', '_template': 'extra.html'}
    # title, headline and update date for this page
    data['title'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))
    data['headline'] = htmlparser.parse(datasheet['HEADLINE'], filename,
                                        paragraphs=False,
                                        url_context=url_context)
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))
    # references become a 'reference' flow block
    refs = referenceparser.parse_references(datasheet['REFERENCES'],
                                            filename, url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(refs)['data'])
    # main body
    data['content'] = htmlparser.parse(datasheet['EXTRA'], filename,
                                       paragraphs=True,
                                       url_context=url_context)
    return data
def convert(datasheet, url_context):
    """Build template data for the 'obituary' model."""
    filename = datasheet['FILENAME']
    data = {'_model': 'obituary', '_template': 'obituary.html'}
    # easily translatable info
    data['name'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['HEADING1']))
    data['summary'] = htmlparser.parse(datasheet['HEADING2'], filename,
                                       paragraphs=False,
                                       url_context=url_context)
    data['wherefrom'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))
    # the obituary text itself
    data['content'] = htmlparser.parse(datasheet['CONTENT'], filename,
                                       paragraphs=True,
                                       url_context=url_context)
    return data
def convert(datasheet, url_context):
    """Build template data for the 'page' model from a raw-HTML datasheet,
    stripping the full-document wrapper down to body content."""
    filename = datasheet['FILENAME']
    data = {
        '_model': 'page',
        '_template': 'page.html',
        # sidebar
        'sidebar': '',
    }
    # easily translatable info
    data['authors'] = htmlparser.parse(datasheet['WHODIDIT'], filename,
                                       paragraphs=False,
                                       url_context=url_context)
    data['title'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))
    # check that this is a standard page
    #assert datasheet['USEHTMLFORMAT'] == 'Y'
    # need to convert it to a standard page
    flags = re.MULTILINE | re.DOTALL
    content = datasheet['CONTENT']
    # unwrap <html> (keep inner text) and drop <head> entirely
    content = re.sub(re.compile(r'<html>(?P<content>.*?)</html>', flags),
                     strip, content)
    content = re.sub(re.compile(r'<head>(?P<content>.*?)</head>', flags),
                     strip_all, content)
    # delete title/meta/style and the opening body tag
    for doomed in (r'<title>(.*?)</title>',
                   r'<meta (.*?)/>',
                   r'<style>(.*?)</style>',
                   r'<body(.*?)>'):
        content = re.sub(re.compile(doomed, flags), r'', content)
    content = content.replace('</body>', '').strip()
    # also get rid of the 'show larger image' button
    content = re.sub(re.compile(r'<form>(.*?)</form>', flags), r'', content)
    # parse what remains as the page body
    data['content'] = htmlparser.parse(content, filename,
                                       paragraphs=True,
                                       url_context=url_context)
    return data
def convert(datasheet, url_context):
    """Build template data for the 'glossary' model."""
    return {
        '_model': 'glossary',
        '_template': 'glossary.html',
        # the glossary term itself
        'term': symbolreplace.strip_tags(
            symbolreplace.tags_to_unicode(datasheet['WORD'])),
        # the definition body
        'content': htmlparser.parse(datasheet['CONTENTS'],
                                    datasheet['FILENAME'],
                                    paragraphs=True,
                                    url_context=url_context),
    }
def parse_equations(text, filename):
    """Parse the EQUATIONS datasheet field into a list of
    {'type': ..., 'equation': ...} dicts.

    The field alternates "type" header lines (green bold markup) with
    equation lines (``\\...\\\\``); each equation is tagged with the most
    recent type.  Raises AssertionError on any line that is neither, on
    two consecutive type lines, or on an equation with no preceding type.
    """
    equations = []
    eqtype = None  # the pending type header, consumed by the next equation
    typeregex = re.compile(
        r'^<b><font color=green>(?P<type>.+?)</font></b>.*$')
    equationregex = re.compile(r'^(?P<equation>\\.+?\\\\)$')
    for line in text.split('\n'):
        line = line.strip()
        typematch = typeregex.search(line)
        equationmatch = equationregex.search(line)
        if typematch:
            # it's a type! a previous type must not still be pending
            # (idiom fix: `is None` rather than `== None`)
            assert eqtype is None
            eqtype = typematch.group('type')
        elif equationmatch:
            # it's an equation! it must follow a type line
            assert eqtype
            eqtype = symbolreplace.strip_tags(
                symbolreplace.tags_to_unicode(eqtype))
            equations.append({
                'type': eqtype,
                'equation': htmlparser.parse(
                    equationmatch.group('equation'),
                    filename,
                    paragraphs=False)
            })
            eqtype = None
        else:
            # any other line (including blanks) is unexpected input
            assert False
    return equations
def mathreplace(match):
    """Convert a matched block of legacy math markup into <latex>-wrapped
    KaTeX source.

    Strips the archive's presentational tags, rewrites the ad-hoc
    ^/¬ superscript-subscript-fraction notation into TeX, escapes known
    function names (sin, log, ...), and wraps each non-blank line in
    <latex>...</latex>, preserving blank lines as paragraph breaks.
    """
    entire = match.group(0)  # NOTE(review): unused — kept for reference
    math = match.group('math')
    math = symbolreplace.symbols_to_unicode(math, katex=True)
    math = symbolreplace.tags_to_unicode(math, katex=True)
    # remove <b>...</b>
    regex = re.compile(r'<b>(.*?)</b>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <i>...</i>
    regex = re.compile(r'<i>(.*?)</i>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <u>...</u>
    regex = re.compile(r'<u>(.*?)</u>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <r>...</r>, <bl>...</bl>, <gr>...</gr> and <bro>...</bro>
    regex = re.compile(r'<(?:r|bl|gr|bro)>(.*?)</(?:r|bl|gr|bro)>',
                       re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <f+>...</f+>  (note: the closing tag matched is </f>)
    regex = re.compile(r'<f\+>(.*?)</f>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    regex = re.compile(r'<fp>(.*?)</fp>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <f++>...</f>
    regex = re.compile(r'<f\+\+>(.*?)</f>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <f->...</->
    regex = re.compile(r'<f->(.*?)</f>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    regex = re.compile(r'<fm>(.*?)</fm>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # remove <ovl>...</ovl>
    regex = re.compile(r'<ovl>(.*?)</ovl>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\1', math)
    # convert fractions: ^num /¬den  ->  {{num}\over{den}}
    # (must run before the standalone superscript/subscript rules below)
    regex = re.compile(r'\^(\S+) ?\/¬(\S+) ?', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'{{\1}\over{\2}}', math)
    # convert vector bold: `x` -> \mathbb{x}
    regex = re.compile(r'`(.)`', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'\mathbb{\1}', math)
    # convert ^superscript (both the legacy caret form and <sup> tags)
    regex = re.compile(r'\^(\S+)(?: ?)', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'^{\1}', math)
    regex = re.compile(r'<sup>(.*?)</sup>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'^{\1}', math)
    # convert ¬subscript (both the legacy form and <sub> tags)
    regex = re.compile(r'¬(\S+)(?: ?)', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'_{\1}', math)
    regex = re.compile(r'<sub>(.*?)</sub>', re.MULTILINE | re.DOTALL)
    math = re.sub(regex, r'_{\1}', math)
    # fix functions: prefix bare function names with a backslash.
    # longer names precede their prefixes (e.g. 'cosh' before 'cos') so the
    # longer form wins
    mappings = ['isin','arcsin','arccos','arctan','arctg','arg','ch','cosec','cosh',
        'cos','cotg','coth','cot','argmin','csc','ctg','cth','deg','dim','exp',
        'hom','ker','lg','ln','log','sec','sinh','sin','tanh','tan',#'sh','tg','th'
        'det','gcd','inf','lim','liminf','limsup','Pr','sup','argmax',
        'max','min']
    for mapping in mappings:
        old_math = math  # NOTE(review): unused — likely a leftover debug aid
        # NOTE(review): this look-behind mixes branch widths (1/2/4 chars),
        # which the stdlib `re` module rejects at compile time — presumably
        # this file imports a compatible engine (e.g. `regex` as `re`);
        # confirm against the file's imports
        regex = re.compile(r'(?<!(?:(?:\\)|(?:\\i)|(?:\\arc)))(%s)' % mapping)
        math = re.sub(regex, r'\\\1', math)
    # remove multiline formulas: wrap each non-blank line separately,
    # keeping blank lines as paragraph separators
    lines = math.split('\n')
    output = ''
    for line in lines:
        line = line.strip()
        if line == '':
            output += '\n\n'
            continue
        output += '\n<latex>%s</latex>' % line
    return output.strip()
def convert(datasheet, url_context):
    """Build template data for the 'historytopic' model.

    Parses names/authors/update metadata, converts references, additional
    links, translations and otherweb links into flow blocks, renders the
    topic body (feeding it the translations and additional links), derives
    category tags, and assembles the alphabetical index entries.  Raises
    AssertionError when a topic without INDEXNAMES also has no usable name.
    """
    data = {}
    # metadata, the template and model
    data['_model'] = 'historytopic'
    data['_template'] = 'historytopic.html'
    # filename, short and full name, authors, update
    data['shortname'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['SHORTNAME']))
    data['fullname'] = htmlparser.parse(datasheet['FULLNAME'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)
    data['authors'] = htmlparser.parse(datasheet['AUTHORS'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))
    # something about indexes, not sure how this is used yet
    data['indexref'] = datasheet['INDEXREF']
    data['indexreffile'] = datasheet['INDEXREFFILE']
    # parse references into a 'reference' flow block
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])
    # parse additional links (they use the same format as cross references);
    # also fed into the content parse below as `extras`
    additional = referenceparser.parse_cross_references(
        datasheet['ADDITIONAL'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])
    # parse translations (use the same format as references)
    # don't add them to data, as we're combining them with bio
    translations = referenceparser.parse_references(datasheet['TRANSLATION'],
                                                    datasheet['FILENAME'],
                                                    url_context)
    translation_data = json.loads(translations)['data']
    # rename the 'reference' key to 'translation' for the flow block
    translation_data = [{
        'number': d['number'],
        'translation': d['reference']
    } for d in translation_data]
    data['translations'] = flow.to_flow_block('translation', translation_data)
    # parse otherweb links (they use the same format as cross references)
    otherweb = referenceparser.parse_cross_references(datasheet['OTHERWEB'],
                                                      datasheet['FILENAME'],
                                                      url_context)
    data['otherweb'] = flow.to_flow_block('otherweb',
                                          json.loads(otherweb)['data'])
    # parse history topic body, weaving in translations and extras
    data['content'] = htmlparser.parse(
        datasheet['HISTTOPIC'],
        datasheet['FILENAME'],
        translations=json.loads(translations)['data'],
        extras=json.loads(additional)['data'],
        paragraphs=True,
        url_context=url_context)
    # discover categories for this mathematician
    path = '/HistTopics/%s' % datasheet['FILENAME']
    tags = []
    #with open('../datasheets/Indexes/data.json') as f:
    #    category_data = json.load(f)
    category_data = categories.categories()
    for category in category_data:
        if path in category['entries']:
            tags.append(category['name'])
    data['tags'] = ', '.join(tags)
    # discover alphabetical index names for this history topic:
    # use INDEXNAMES when present, otherwise fall back to the full or
    # short name
    parsed_entries = []
    if 'INDEXNAMES' not in datasheet:
        if data['fullname'].strip() != '':
            parsed_entries.append(data['fullname'].strip())
        elif data['shortname'].strip() != '':
            parsed_entries.append(data['shortname'].strip())
        else:
            print('no names for this topic')
            assert False
    else:
        entries = datasheet['INDEXNAMES'].strip().split('\n')
        for entry in entries:
            entry = entry.strip()
            entry = symbolreplace.strip_tags(
                symbolreplace.tags_to_unicode(entry))
            parsed_entries.append(entry)
    data['alphabetical'] = '\n'.join(parsed_entries)
    return data
def convert(datasheet, url_context):
    """Build template data for the 'gazplace' model.

    Reads the corrected CONTENTS from the GazData3 datasheet, rewrites
    inline reference markers to numbered [n] citations, and extracts
    coordinates.  Entries without coordinates are returned as plain
    'page' data instead of a place.  Raises AssertionError on malformed
    reference lines or markers missing from the contents.
    """
    data = {}
    # metadata, the template and model
    data['_model'] = 'gazplace'
    data['_template'] = 'gazplace.html'
    data['place'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))
    # extract "lat,long" from the COORDS field when present
    pattern = re.compile(r'(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+)')
    match = pattern.search(datasheet['COORDS'])
    data['latitude'] = ''
    data['longitude'] = ''
    if match:
        data['latitude'] = match.group('lat')
        data['longitude'] = match.group('long')
    # i was an idiot, and made a mistake in generating the datasheets for
    # GazData
    # the correct CONTENTS is in GazData3
    # so we have to read that instead
    path = os.path.join('../datasheets/GazData3/', datasheet['FILENAME'])
    datasheet2 = datasheetparser.parse_file(path)
    # convert the references to the new style of references:
    # each REFERENCES line is "marker@?@text"; every marker found in the
    # contents is replaced with a numbered [n] citation
    refcount = 1
    parsed_references = []
    references = datasheet['REFERENCES'].strip().split('\n')
    for reference in references:
        reference = reference.strip()
        if reference == '':
            continue
        parts = reference.split('@')
        if len(parts) != 3:
            # log the offending line before failing
            print(reference)
            assert len(parts) == 3
        replacement = parts[0].strip()
        text = parts[2].strip()
        if replacement not in datasheet2['CONTENTS']:
            # log the offending line before failing
            print(reference)
            assert replacement in datasheet2['CONTENTS']
        datasheet2['CONTENTS'] = datasheet2['CONTENTS'].replace(
            replacement, '[%s]' % refcount)
        parsed_references.append({
            'number': str(refcount),
            'reference': htmlparser.parse(text, datasheet['FILENAME'])
        })
        refcount = refcount + 1
    data['references'] = flow.to_flow_block('reference', parsed_references)
    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(datasheet2['CONTENTS'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)
    if data['place'] == 'Whitburn, Tyne & Wear':
        # add in the missing lat and long
        data['latitude'] = '54.9550395'
        data['longitude'] = '-1.3867149'
    if data['latitude'] == '' and data['longitude'] == '':
        # this is not a place, it should just be a page
        newdata = {}
        newdata['_model'] = 'page'
        newdata['_template'] = 'gazplace.html'
        newdata['title'] = data['place']
        newdata['authors'] = ''
        newdata['content'] = data['content']
        return newdata
    return data
def convert(datasheet, url_context):
    """Build template data for the 'society' model."""
    filename = datasheet['FILENAME']
    data = {'_model': 'society', '_template': 'society.html'}
    # easily translatable info
    data['name'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLENAME']))
    data['headline'] = htmlparser.parse(datasheet['HEADLINE'], filename,
                                        paragraphs=False,
                                        url_context=url_context)
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))
    data['foundation'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['FOUNDATION']))
    # pull the society's website out of the OTHERWEB anchor tag
    link = re.compile(
        r'<a\s+href ?= ?[\'"]?(?P<href>.+?)[\'"]?\s*>(?P<text>.*?)<\/a>')
    otherweb_field = datasheet['OTHERWEB'].strip()
    if otherweb_field == '':
        data['website'] = ''
    else:
        match = link.search(otherweb_field)
        if not match:
            print('not link "%s"' % otherweb_field)
        assert match
        data['website'] = match.group('href')
    # references become a 'reference' flow block
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  filename, url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])
    # additional links share the cross-reference format; they are also
    # fed into the content parse below as `extras`
    additional = referenceparser.parse_cross_references(datasheet['EXTRAS'],
                                                        filename,
                                                        url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])
    # parse the society's history, weaving in the extras
    data['content'] = htmlparser.parse(datasheet['CONTENT'], filename,
                                       extras=json.loads(additional)['data'],
                                       paragraphs=True,
                                       url_context=url_context)
    data['tags'] = ''
    # alphabetical index entry: prefer the name, fall back to the headline
    entries = []
    name = data['name'].strip()
    headline = data['headline'].strip()
    if name != '':
        entries.append(name)
    elif headline != '':
        entries.append(headline)
    data['alphabetical'] = '\n'.join(entries)
    return data
def convert(datasheet, url_context):
    """Build template data for the 'biography' model.

    Extracts names, dates and places, normalises the country (also
    recording it in the module-level `countries` list), converts
    references/translations/additional/otherweb/honours fields into flow
    blocks, renders the biography body, derives category tags, and looks
    up the alphabetical index displays.  Raises AssertionError when no
    alphabetical displays can be found.
    """
    data = {}
    # metadata, the template and model
    data['_model'] = 'biography'
    data['_template'] = 'biography.html'
    # name and shortname
    data['shortname'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['SHORTNAME']))
    data['fullname'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['FULLNAME']))
    # authors
    data['authors'] = htmlparser.parse(datasheet['AUTHORS'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
    # last update
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))
    data['summary'] = htmlparser.parse(datasheet['SUMMARY'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
    # dates are tricky. for now leave them as they are
    data['birthdate'] = datasheet['BIRTHDATE']
    data['deathdate'] = datasheet['DEATHDATE']
    # birth and death year - remove the ,? if necessary
    date_pattern = re.compile(r'(\d+)(?:,\??)?')
    data['birthyear'] = re.sub(date_pattern, r'\1', datasheet['BIRTHYEAR'])
    data['deathyear'] = re.sub(date_pattern, r'\1', datasheet['DEATHYEAR'])
    # birthplace, deathplace
    data['birthplace'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['BIRTHPLACE']))
    data['deathplace'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['DEATHPLACE']))
    # mapinfo - just take the name, ignore mapnum and lat/long
    mapinfo = re.compile(
        r'\d,(?P<name>.+?),(?:(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+))?')
    match = mapinfo.search(datasheet['MAPINFO'])
    # CLEANUP: a '--Unknown--' default here was immediately overwritten
    # with '' on the next line; the empty string is the effective default
    data['maplocation'] = ''
    if match:
        data['maplocation'] = match.group('name')
    # country
    data['country'] = '--Unknown--'
    if datasheet['COUNTRY'].strip() != '':
        data['country'] = datasheet['COUNTRY']
    # normalise malformed countries (same rules as the place converter)
    if data['country'] == 'Czech_Republic':
        data['country'] = 'Czech Republic'
    elif data['country'] == 'Sicily':
        data['country'] = 'Italy'
    elif data['country'].endswith(')'):
        data['country'] = data['country'][:-1]
    elif data['country'] == '':
        # BUG FIX: was `==` (a no-op comparison) rather than assignment.
        # Unreachable in practice (empty COUNTRY already defaulted to
        # '--Unknown--' above) but kept correct for symmetry
        data['country'] = '--Unknown--'
    # also add countries to global array
    if not data['country'] in countries:
        countries.append(data['country'])
    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])
    # parse translations (use the same format as references)
    # don't add them to data, as we're combining them with bio
    translations = referenceparser.parse_references(datasheet['TRANSLATION'],
                                                    datasheet['FILENAME'],
                                                    url_context)
    translation_data = json.loads(translations)['data']
    translation_data = [{'number': d['number'],
                         'translation': d['reference']}
                        for d in translation_data]
    data['translations'] = flow.to_flow_block('translation', translation_data)
    # parse cross references
    #xrefs = referenceparser.parse_cross_references(datasheet['XREFS'], datasheet['FILENAME'])
    #data['xrefs'] = xrefs
    # parse additional links (they use the same format as cross references)
    # don't add them to data, as we're combining them with bio
    additional = referenceparser.parse_cross_references(
        datasheet['ADDITIONAL'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])
    # parse otherweb links (they use the same format as cross references)
    otherweb = referenceparser.parse_cross_references(datasheet['OTHERWEB'],
                                                      datasheet['FILENAME'],
                                                      url_context)
    data['otherweb'] = flow.to_flow_block('otherweb',
                                          json.loads(otherweb)['data'])
    # parse honours links (they use the same format as cross references)
    honours = referenceparser.parse_cross_references(datasheet['HONOURS'],
                                                     datasheet['FILENAME'],
                                                     url_context)
    data['honours'] = flow.to_flow_block('otherweb',
                                         json.loads(honours)['data'])
    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(
        datasheet['BIOGRAPHY'],
        datasheet['FILENAME'],
        translations=json.loads(translations)['data'],
        extras=json.loads(additional)['data'],
        paragraphs=True,
        url_context=url_context)
    # discover categories for this mathematician
    path = '/Biographies/%s' % datasheet['FILENAME']
    tags = []
    #with open('../datasheets/Indexes/data.json') as f:
    #    category_data = json.load(f)
    category_data = categories.categories()
    for category in category_data:
        if path in category['entries']:
            tags.append(category['name'])
    data['tags'] = ', '.join(tags)
    # discover alphabetical tags for this mathematician
    displays = alphaindexparser.get_displays_2(datasheet['FILENAME'])
    if not displays:
        assert False
    displays = '\n'.join(displays)
    data['alphabetical'] = displays
    return data