def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'honour'
    data['_template'] = 'honour.html'

    # filename, title, headline and update date for this
    data['title'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))
    data['headline'] = htmlparser.parse(datasheet['HEADLINE'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)

    # parse biography
    data['content'] = htmlparser.parse(datasheet['CONTENT'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    data['tags'] = ''

    # alphabetical display entries
    parsed_entries = []
    if data['title'].strip() != '':
        s = data['title'].strip()
        parsed_entries.append(s)
    elif data['headline'].strip() != '':
        s = data['headline'].strip()
        parsed_entries.append(s)
    data['alphabetical'] = '\n'.join(parsed_entries)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'quotation'
    data['_template'] = 'quotation.html'

    # filename, name
    data['name'] = symbolreplace.tags_to_unicode(datasheet['NAME'])

    content = datasheet['CONTENT']
    numquotes = datasheet['NUMQUOTES']

    # special case cleaning rules
    if datasheet['FILENAME'] == 'Carmichael':
        content = content.replace('<p>', '')
    if datasheet['FILENAME'] in NUMBER_CORRECTIONS:
        numquotes = NUMBER_CORRECTIONS[datasheet['FILENAME']]

    # now parse the individual quotes
    content = content.split('<p>')
    quotes = []
    for quote in content:
        if quote.strip() != '':
            quotes.append(quote.strip())

    # holding 'more quotes' links, or 'translations by'
    data['more'] = ''
    if len(quotes) != 0 and 'More ' in quotes[-1] and '<a href' in quotes[-1]:
        #print('I *think* this is a *more quotes* paragraph:', quotes[-1])
        data['more'] = quotes.pop()
    if len(quotes) != 0 and data['more'] == '' and 'Translations ' in quotes[-1]:
        #print('I *think* this is a *translations by* paragraph:', quotes[-1])
        data['more'] = quotes.pop()

    if len(quotes) != int(numquotes):
        print('ERROR', len(quotes), 'expecting', int(numquotes))
        print(quotes)
        assert False

    # now parse the quotes and convert to html
    for idx, quote in enumerate(quotes):
        q = parse_quote(quote)
        q['quote'] = htmlparser.parse(q['quote'],
                                      'Quotations/%s' % datasheet['FILENAME'],
                                      paragraphs=True,
                                      url_context=url_context)
        q['source'] = htmlparser.parse(q['source'],
                                       'Quotations/%s' % datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
        quotes[idx] = q

    quotations = flow.to_flow_block('quotation', quotes)
    data['quotations'] = quotations

    return data
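# A minimal illustrative sketch of the quotation datasheet format assumed above
# (the sample CONTENT below is hypothetical, not taken from a real datasheet):
#
#   <p>First quote ...<p>Second quote ...<p>More <a href="...">quotes</a>
#
# splits on '<p>' into two quotes plus a trailing 'More quotes' paragraph; the
# trailing paragraph is popped into data['more'], and NUMQUOTES is then
# expected to equal 2, otherwise the assert fires.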
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'page'
    data['_template'] = 'page.html'

    # sidebar
    data['sidebar'] = ''

    # easily translatable info
    data['authors'] = htmlparser.parse(datasheet['WHODIDIT'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
    data['title'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))

    # check that this is a standard page
    #assert datasheet['USEHTMLFORMAT'] == 'Y'

    # need to convert it to a standard page
    content = datasheet['CONTENT']
    regex = re.compile(r'<html>(?P<content>.*?)</html>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, strip, content)
    regex = re.compile(r'<head>(?P<content>.*?)</head>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, strip_all, content)
    regex = re.compile(r'<title>(.*?)</title>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)
    regex = re.compile(r'<meta (.*?)/>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)
    regex = re.compile(r'<style>(.*?)</style>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)
    regex = re.compile(r'<body(.*?)>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)
    content = content.replace('</body>', '')
    content = content.strip()

    # also get rid of the 'show larger image' button
    regex = re.compile(r'<form>(.*?)</form>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(content,
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'curve'
    data['_template'] = 'curve.html'

    # easily translatable info
    data['name'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['FULLNAME']))

    # need to parse the individual equations out, and convert to flow
    equations = parse_equations(datasheet['EQUATIONS'], datasheet['FILENAME'])
    data['equations'] = flow.to_flow_block('curveequation', equations)

    # parse java applet options
    options = '{\n'
    pattern = re.compile(
        r'\<PARAM NAME="(?P<name>.+?)" VALUE="(?P<value>.+?)">')
    for match in re.finditer(pattern, datasheet['JAVA']):
        name = match.group('name')
        value = match.group('value')
        line = '%s: "%s",\n' % (name, value)
        options += line
    options += '}'
    data['appletoptions'] = options

    # parse content
    data['content'] = htmlparser.parse(datasheet['CONTENTS'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    return data
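# A minimal illustrative sketch of the applet-option parsing above (the PARAM
# name and value are hypothetical): an input such as
#
#   <PARAM NAME="xmin" VALUE="-5">
#
# is matched by the pattern and appended as
#
#   xmin: "-5",
#
# so 'appletoptions' ends up as a JavaScript-style object literal with a
# trailing comma, not strict JSON.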
def parse_cross_references(references, name, url_context):
    parsed = []

    for line in references.splitlines():
        line = line.strip()

        # match against reference line
        #bio_regex = re.compile(r'^(?P<number>\d+)\s*,\s*(?P<link>.+?)\s*,\s*(?P<text>.+?)(?:,\s*(?P<extratext>.+?))?$')
        bio_regex = re.compile(
            r'^(?P<number>\d+)\s*,\s*(?P<link>.+?)\s*(?:,\s*(?P<text>.+?))?(?:,\s*(?P<extratext>.+?))?$'
        )
        match = re.match(bio_regex, line)
        if match:
            # this is a reference line
            number = match.group('number')
            link = match.group('link')
            text = match.group('text')
            if not text:
                text = 'THIS LINK'
            link = urls.convert(link, url_context)
            if match.group('extratext'):
                text += ' ' + match.group('extratext')
            text = text.strip()
            if not text:
                text = link
            else:
                text = htmlparser.parse(text, name)
            reference = {'link': link, 'text': text, 'number': number}
            parsed.append(reference)
            continue

    return_str = {'data': parsed}
    return_str = json.dumps(return_str)
    return return_str
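# A minimal illustrative sketch of what parse_cross_references produces (the
# input line below is hypothetical): a line such as
#
#   1,SomePage,Some link text
#
# yields number='1', link='SomePage' (then passed through urls.convert) and
# text='Some link text' (then passed through htmlparser.parse), and the
# function returns a JSON string of the form
#
#   {"data": [{"link": "...", "text": "...", "number": "1"}]}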
def parse_references(references, name, url_context):
    parsed_references = []
    in_reference = False
    reference = None

    for line in references.splitlines():
        line = line.strip()

        # match against reference line
        bio_regex = re.compile(r'^(?P<number>\d+)\s*,\s*(?P<reference>.+)$')
        match = re.match(bio_regex, line)
        if match:
            # this is a reference line
            reference = match.group('reference')
            number = match.group('number')
            ref = {'number': number, 'reference': reference.strip()}
            parsed_references.append(ref)
            in_reference = True
            continue

        # match against url
        if (line.startswith('http://') or line.startswith('https://')) and in_reference:
            # check there's not an issue with the line
            assert '<' not in line and '>' not in line

            # make the entire reference a link
            href = line
            href = urls.convert(href, url_context)
            text = parsed_references[-1]['reference']
            text = text.replace('<br>', '')
            #link = '<a href="%s">%s</a>' % (href, text)
            # only do this if there isn't already a link in the reference
            #if '<a' not in text:
            #    parsed_references[-1]['reference'] = link
            parsed_references[-1]['reference'] = '%s <a href="%s">%s</a>' % (
                text, href, href)
            in_reference = False

        # match against empty line
        if line == '' or '<p>' in line:
            in_reference = False
            continue

        # any other line
        if in_reference:
            parsed_references[-1]['reference'] += (' ' + line.strip())

    for reference in parsed_references:
        reference['reference'] = htmlparser.parse(reference['reference'], name)

    return_str = {'data': parsed_references}
    return_str = json.dumps(return_str)
    return return_str
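# A minimal illustrative sketch of the REFERENCES format assumed above (the
# sample lines are hypothetical):
#
#   1, J Smith, A history of something (1990).
#   https://example.org/smith
#
# The numbered line starts reference 1; the bare URL line that follows is
# appended to it as '<a href="...">...</a>' (after urls.convert), any other
# continuation lines are joined on, and the result is returned as a JSON
# string of the form
#
#   {"data": [{"number": "1", "reference": "..."}]}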
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'extra'
    data['_template'] = 'extra.html'

    # filename, title, headline and update date for this
    data['title'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))
    data['headline'] = htmlparser.parse(datasheet['HEADLINE'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])

    # parse biography
    data['content'] = htmlparser.parse(datasheet['EXTRA'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'obituary'
    data['_template'] = 'obituary.html'

    # easily translatable info
    data['name'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['HEADING1']))
    data['summary'] = htmlparser.parse(datasheet['HEADING2'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
    data['wherefrom'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(datasheet['CONTENT'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'glossary'
    data['_template'] = 'glossary.html'

    # easily translatable info
    data['term'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['WORD']))

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(datasheet['CONTENTS'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    return data
def chronology_convert(input_dir, output_dir, url_context):
    # get all the files that need to be processed
    path = os.path.join(input_dir, '*')
    files = glob.glob(path)

    dates = {}

    # process all the files
    for file in files:
        # parse sections from datasheet
        datasheet = datasheetparser.parse_file(file)

        date = datasheet['DATE']
        content = htmlparser.parse(datasheet['BIG'],
                                   os.path.basename(file),
                                   paragraphs=False,
                                   url_context=url_context)

        data = {
            'about': 'yes' if datasheet['ABOUT'] != '' else 'no',
            'content': content
        }

        if date not in dates:
            dates[date] = []
        dates[date].append(data)

    # convert to nested flow
    chronology = []
    for date, events in dates.items():
        data = {
            '_model': 'chronologyyear',
            '_hidden': 'yes',
            'year': date,
            'events': flow.to_flow_block('chronology-event', events)
        }
        filename = os.path.join(LEKTOR_CONTENT_PATH, output_dir, date)
        save(data, filename)
def parse_equations(text, filename):
    equations = []
    eqtype = None

    typeregex = re.compile(
        r'^<b><font color=green>(?P<type>.+?)</font></b>.*$')
    equationregex = re.compile(r'^(?P<equation>\\.+?\\\\)$')

    text = text.split('\n')
    for line in text:
        line = line.strip()

        typematch = typeregex.search(line)
        equationmatch = equationregex.search(line)

        if typematch:
            # it's a type!
            assert eqtype is None
            eqtype = typematch.group('type')
        elif equationmatch:
            # it's an equation!
            assert eqtype
            eqtype = symbolreplace.strip_tags(
                symbolreplace.tags_to_unicode(eqtype))
            equation = {
                'type': eqtype,
                'equation': htmlparser.parse(equationmatch.group('equation'),
                                             filename,
                                             paragraphs=False)
            }
            eqtype = None
            equations.append(equation)
        else:
            assert False

    return equations
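# A minimal illustrative sketch of the EQUATIONS format assumed above (the
# sample lines are hypothetical): the section is expected to alternate a type
# header and an equation line, e.g.
#
#   <b><font color=green>Cartesian equation</font></b>
#   \y^2 = x^3\\
#
# Each pair becomes {'type': 'Cartesian equation', 'equation': <parsed equation>};
# a line matching neither regex trips the final assert.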
def read_logs():
    msg = servicehandler.getlogs()
    return htmlparser.parse(msg)
def project_convert(input_dir, output_dir, url_context, name):
    # get all the files that need to be processed
    path = os.path.join(input_dir, '*')
    files = glob.glob(path)

    titles = {
        'Ayel': 'The French Grandes Ecoles',
        'Brunk': 'The development of Galois theory',
        'Burslem': 'Sofia Kovalevskaya',
        'Daxenberger': 'Johan de Witt - The first calculation on the valuation of life annuities',
        'Ellison': 'Sofia Kovalevskaya',
        'Johnson': 'James Clerk Maxwell - The Great Unknown',
        'MacQuarrie': 'Mathematics and Chess',
        'Pearce': 'Indian Mathematics - Redressing the balance',
        'Watson': 'Some topics in the history of mathematical education',
        'Ledermann': 'Walter Ledermann - Encounters of a Mathematician',
        'DickinsonCernokova': "An investigation of some of D'Arcy Thompson's correspondence",
        'GowenlockTuminauskaite': "D'Arcy Thompson and Mathematics"
    }
    authors = {
        'Ayel': 'Mathieu Ayel',
        'Brunk': 'Fiona Brunk',
        'Burslem': 'Tom Burslem',
        'Daxenberger': 'Livia Daxenberger',
        'Ellison': 'Leigh Ellison',
        'Johnson': 'Kevin Johnson',
        'MacQuarrie': 'John MacQuarrie',
        'Pearce': 'Ian G Pearce',
        'Watson': 'Helen Watson',
        'Ledermann': "J J O'Connor and E F Robertson",
        'DickinsonCernokova': 'Heather Dickinson and Barbora Cernokova',
        'GowenlockTuminauskaite': 'Alice Gowenlock and Indre Tuminauskaite'
    }

    pages = []
    references = ''

    # process all the files
    for file in files:
        # parse sections from datasheet
        datasheet = datasheetparser.parse_file(file)

        if datasheet['NUMBER'] == 'refs' and 'REFERENCES' in datasheet:
            # this is the references, not a page
            references = referenceparser.parse_references(
                datasheet['REFERENCES'], file, url_context)
            references = flow.to_flow_block('reference',
                                            json.loads(references)['data'])
            continue

        pagenum = int(datasheet['NUMBER'])
        assert pagenum == len(pages)

        content = cleaning.project_cleaning(datasheet['CONTENT'])
        data = {
            '_model': 'projectpage',
            '_template': 'projectpage.html',
            'title': datasheet['TITLE'],
            'content': htmlparser.parse(content,
                                        file,
                                        paragraphs=True,
                                        url_context=url_context),
            'chapter': str(len(pages) + 1)
        }
        pages.append(data)

    # main project page
    data = {
        '_model': 'project',
        '_template': 'project.html',
        'title': titles[name],
        'author': authors[name],
        'references': '' if references is None else references
    }
    filename = os.path.join(LEKTOR_CONTENT_PATH, output_dir)
    save(data, filename)

    # the chapters
    for page in pages:
        filename = os.path.join(LEKTOR_CONTENT_PATH, output_dir,
                                'chapter-%s' % page['chapter'])
        save(page, filename)

    print('processed', name)
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'historytopic'
    data['_template'] = 'historytopic.html'

    # filename, short and full name, authors, update
    data['shortname'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['SHORTNAME']))
    data['fullname'] = htmlparser.parse(datasheet['FULLNAME'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)
    data['authors'] = htmlparser.parse(datasheet['AUTHORS'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))

    # something about indexes, not sure how this is used yet
    data['indexref'] = datasheet['INDEXREF']
    data['indexreffile'] = datasheet['INDEXREFFILE']

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])

    # parse additional links (they use the same format as cross references)
    additional = referenceparser.parse_cross_references(
        datasheet['ADDITIONAL'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])

    # parse translations (use the same format as references)
    # don't add them to data, as we're combining them with bio
    translations = referenceparser.parse_references(datasheet['TRANSLATION'],
                                                    datasheet['FILENAME'],
                                                    url_context)
    translation_data = json.loads(translations)['data']
    translation_data = [{
        'number': d['number'],
        'translation': d['reference']
    } for d in translation_data]
    data['translations'] = flow.to_flow_block('translation', translation_data)

    # parse otherweb links (they use the same format as cross references)
    otherweb = referenceparser.parse_cross_references(datasheet['OTHERWEB'],
                                                      datasheet['FILENAME'],
                                                      url_context)
    data['otherweb'] = flow.to_flow_block('otherweb',
                                          json.loads(otherweb)['data'])

    # parse history topic
    data['content'] = htmlparser.parse(
        datasheet['HISTTOPIC'],
        datasheet['FILENAME'],
        translations=json.loads(translations)['data'],
        extras=json.loads(additional)['data'],
        paragraphs=True,
        url_context=url_context)

    # discover categories for this history topic
    path = '/HistTopics/%s' % datasheet['FILENAME']
    tags = []
    #with open('../datasheets/Indexes/data.json') as f:
    #    category_data = json.load(f)
    category_data = categories.categories()
    for category in category_data:
        if path in category['entries']:
            tags.append(category['name'])
    data['tags'] = ', '.join(tags)

    # discover alphabetical index names for this history topic
    parsed_entries = []
    if 'INDEXNAMES' not in datasheet:
        if data['fullname'].strip() != '':
            parsed_entries.append(data['fullname'].strip())
        elif data['shortname'].strip() != '':
            parsed_entries.append(data['shortname'].strip())
        else:
            print('no names for this topic')
            assert False
    else:
        entries = datasheet['INDEXNAMES'].strip().split('\n')
        for entry in entries:
            entry = entry.strip()
            entry = symbolreplace.strip_tags(
                symbolreplace.tags_to_unicode(entry))
            parsed_entries.append(entry)
    data['alphabetical'] = '\n'.join(parsed_entries)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'biography'
    data['_template'] = 'biography.html'

    # name and shortname
    data['shortname'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['SHORTNAME']))
    data['fullname'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['FULLNAME']))

    # authors
    data['authors'] = htmlparser.parse(datasheet['AUTHORS'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)

    # last update
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))

    data['summary'] = htmlparser.parse(datasheet['SUMMARY'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)

    # dates are tricky. for now leave them as they are
    data['birthdate'] = datasheet['BIRTHDATE']
    data['deathdate'] = datasheet['DEATHDATE']

    # birth and death year - remove the ,? if necessary
    date_pattern = re.compile(r'(\d+)(?:,\??)?')
    data['birthyear'] = re.sub(date_pattern, r'\1', datasheet['BIRTHYEAR'])
    data['deathyear'] = re.sub(date_pattern, r'\1', datasheet['DEATHYEAR'])

    # birthplace, deathplace
    data['birthplace'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['BIRTHPLACE']))
    data['deathplace'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['DEATHPLACE']))

    # mapinfo - just take the name, ignore mapnum and lat/long
    mapinfo = re.compile(
        r'\d,(?P<name>.+?),(?:(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+))?')
    match = mapinfo.search(datasheet['MAPINFO'])
    data['maplocation'] = ''
    if match:
        data['maplocation'] = match.group('name')

    # country
    data['country'] = '--Unknown--'
    if datasheet['COUNTRY'].strip() != '':
        data['country'] = datasheet['COUNTRY']
    if data['country'] == 'Czech_Republic':
        data['country'] = 'Czech Republic'
    elif data['country'] == 'Sicily':
        data['country'] = 'Italy'
    elif data['country'].endswith(')'):
        data['country'] = data['country'][:-1]
    elif data['country'] == '':
        data['country'] = '--Unknown--'

    # also add countries to global array
    if not data['country'] in countries:
        countries.append(data['country'])

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])

    # parse translations (use the same format as references)
    # don't add them to data, as we're combining them with bio
    translations = referenceparser.parse_references(datasheet['TRANSLATION'],
                                                    datasheet['FILENAME'],
                                                    url_context)
    translation_data = json.loads(translations)['data']
    translation_data = [{'number': d['number'], 'translation': d['reference']}
                        for d in translation_data]
    data['translations'] = flow.to_flow_block('translation', translation_data)

    # parse cross references
    #xrefs = referenceparser.parse_cross_references(datasheet['XREFS'], datasheet['FILENAME'])
    #data['xrefs'] = xrefs

    # parse additional links (they use the same format as cross references)
    # don't add them to data, as we're combining them with bio
    additional = referenceparser.parse_cross_references(
        datasheet['ADDITIONAL'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])

    # parse otherweb links (they use the same format as cross references)
    otherweb = referenceparser.parse_cross_references(datasheet['OTHERWEB'],
                                                      datasheet['FILENAME'],
                                                      url_context)
    data['otherweb'] = flow.to_flow_block('otherweb',
                                          json.loads(otherweb)['data'])

    # parse honours links (they use the same format as cross references)
    honours = referenceparser.parse_cross_references(datasheet['HONOURS'],
                                                     datasheet['FILENAME'],
                                                     url_context)
    data['honours'] = flow.to_flow_block('otherweb',
                                         json.loads(honours)['data'])

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(
        datasheet['BIOGRAPHY'],
        datasheet['FILENAME'],
        translations=json.loads(translations)['data'],
        extras=json.loads(additional)['data'],
        paragraphs=True,
        url_context=url_context)

    # discover categories for this mathematician
    path = '/Biographies/%s' % datasheet['FILENAME']
    tags = []
    #with open('../datasheets/Indexes/data.json') as f:
    #    category_data = json.load(f)
    category_data = categories.categories()
    for category in category_data:
        if path in category['entries']:
            tags.append(category['name'])
    data['tags'] = ', '.join(tags)

    # discover alphabetical tags for this mathematician
    displays = alphaindexparser.get_displays_2(datasheet['FILENAME'])
    if not displays:
        assert False
    displays = '\n'.join(displays)
    data['alphabetical'] = displays

    return data
def get_dom(self):
    return htmlparser.parse(self.file.read())
            continue

        pattern = re.compile(
            r'^(?P<position>[SCLR]),(?P<path>.+?),(?P<height>.+?)(?:,(?P<description>.*))?$')
        match = pattern.search(line)
        if not match:
            print('not a match! (%s), (%s)' % (name, line))
            assert False

        position = match.group('position')
        path = match.group('path')
        height = match.group('height')
        description = match.group('description') or ''
        description = strip_br(description)

        # parse the description
        description = htmlparser.parse(description,
                                       'PictDisplay/%s' % name,
                                       paragraphs=False,
                                       url_context='PictDisplay/%s' % name)
        description = strip_br(description)

        # check this person exists
        biography_dir = os.path.join(CONTENT_DIR, 'Biographies/', name)
        if not os.path.isdir(biography_dir):
            with open('not-exists.txt', 'a') as f:
                f.write('%s\n' % name)
            continue

        # copy that image in
        img_dst = os.path.join(biography_dir, os.path.basename(path))
        img_src = os.path.join(SERVER_FILES, path)
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'society'
    data['_template'] = 'society.html'

    # easily translatable info
    data['name'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLENAME']))
    data['headline'] = htmlparser.parse(datasheet['HEADLINE'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))
    data['foundation'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['FOUNDATION']))

    # external site parsing
    link = re.compile(
        r'<a\s+href ?= ?[\'"]?(?P<href>.+?)[\'"]?\s*>(?P<text>.*?)<\/a>')
    if datasheet['OTHERWEB'].strip() == '':
        data['website'] = ''
    else:
        match = link.search(datasheet['OTHERWEB'].strip())
        if not match:
            print('not link "%s"' % datasheet['OTHERWEB'].strip())
        assert match
        data['website'] = match.group('href')

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])

    # parse additional links (they use the same format as cross references)
    # don't add them to data, as we're combining them with bio
    additional = referenceparser.parse_cross_references(
        datasheet['EXTRAS'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(datasheet['CONTENT'],
                                       datasheet['FILENAME'],
                                       extras=json.loads(additional)['data'],
                                       paragraphs=True,
                                       url_context=url_context)

    data['tags'] = ''

    # alphabetical display entries
    parsed_entries = []
    if data['name'].strip() != '':
        s = data['name'].strip()
        parsed_entries.append(s)
    elif data['headline'].strip() != '':
        s = data['headline'].strip()
        parsed_entries.append(s)
    data['alphabetical'] = '\n'.join(parsed_entries)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'gazplace'
    data['_template'] = 'gazplace.html'

    data['place'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))

    pattern = re.compile(r'(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+)')
    match = pattern.search(datasheet['COORDS'])
    data['latitude'] = ''
    data['longitude'] = ''
    if match:
        data['latitude'] = match.group('lat')
        data['longitude'] = match.group('long')

    # i was an idiot, and made a mistake in generating the datasheets for GazData
    # the correct CONTENTS is in GazData3
    # so we have to read that instead
    path = os.path.join('../datasheets/GazData3/', datasheet['FILENAME'])
    datasheet2 = datasheetparser.parse_file(path)

    # convert the references to the new style of references
    refcount = 1
    parsed_references = []
    references = datasheet['REFERENCES'].strip().split('\n')
    for reference in references:
        reference = reference.strip()
        if reference == '':
            continue
        parts = reference.split('@')
        if len(parts) != 3:
            print(reference)
            assert len(parts) == 3
        replacement = parts[0].strip()
        text = parts[2].strip()
        if replacement not in datasheet2['CONTENTS']:
            print(reference)
            assert replacement in datasheet2['CONTENTS']
        datasheet2['CONTENTS'] = datasheet2['CONTENTS'].replace(
            replacement, '[%s]' % refcount)
        parsed_references.append({
            'number': str(refcount),
            'reference': htmlparser.parse(text, datasheet['FILENAME'])
        })
        refcount = refcount + 1
    data['references'] = flow.to_flow_block('reference', parsed_references)

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(datasheet2['CONTENTS'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    if data['place'] == 'Whitburn, Tyne & Wear':
        # add in the missing lat and long
        data['latitude'] = '54.9550395'
        data['longitude'] = '-1.3867149'

    if data['latitude'] == '' and data['longitude'] == '':
        # this is not a place, it should just be a page
        newdata = {}
        newdata['_model'] = 'page'
        newdata['_template'] = 'gazplace.html'
        newdata['title'] = data['place']
        newdata['authors'] = ''
        newdata['content'] = data['content']
        return newdata

    return data
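# A minimal illustrative sketch of the gazetteer REFERENCES format assumed
# above (the sample line and placeholder token are hypothetical): each line is
# expected to have three '@'-separated parts, e.g.
#
#   REF1@ignored@J Smith, A history of the town (1990)
#
# parts[0] is the placeholder that gets replaced by '[1]' inside the GazData3
# CONTENTS, parts[2] is the reference text, and the middle part is not used.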