def parse_data(path):
    """Parse the data table of the document at *path* into a flat dict.

    Every ``#docContent tr`` row is expected to hold exactly three cells:
    a code, a title and a value.  The code text is looked up in
    ``DATA_CODES`` to obtain ``(field, field_type)``, and the value cell is
    stored according to ``field_type``:

    * ``'list'`` -- a list made of the cell's leading text followed by the
      tail text of each ``br`` child, stored under ``field``;
    * ``'code'`` -- the cell text is split on the first ``-`` and the two
      stripped halves are stored under ``field + '_id'`` and
      ``field + '_name'``;
    * anything else -- the cell text is stored unchanged under ``field``.

    Rows whose code is not in ``DATA_CODES`` are reported on stdout and
    skipped.

    Returns:
        The dict of parsed fields.  It is empty when ``as_document`` returns
        ``None``.  A ``'uri'`` entry of the form
        ``TED:NOTICE:<num>-<year>:DATA:EN:HTML`` is added only when a
        ``'document_number'`` field was parsed.

    Raises:
        ValueError: if a row does not have exactly three children, if a
            ``'code'`` value contains no ``-``, or if the document number
            does not split into exactly two parts on ``-``.
    """
    data = {}
    doc = as_document(path)
    if doc is None:
        return data

    for row in doc.cssselect('#docContent tr'):
        # list(row) yields the child elements; it replaces the deprecated
        # Element.getchildren().
        code_cell, title_cell, value = list(row)
        code, title = code_cell.text, title_cell.text
        if code not in DATA_CODES:
            # A single parenthesised argument prints the same text on
            # Python 2 (print statement) and Python 3 (print function).
            print("Missing code spec: %s %s %s" % (path, code, [title]))
            continue

        field, field_type = DATA_CODES[code]
        text = value.text
        if field_type == 'list':
            data[field] = [text] + [br.tail for br in value.findall('br')]
        elif field_type == 'code':
            # Named code_id rather than id so the builtin is not shadowed.
            code_id, code_name = text.split('-', 1)
            data[field + '_id'] = code_id.strip()
            data[field + '_name'] = code_name.strip()
        else:
            data[field] = text

    # The document number may be absent (no matching row); calling .split()
    # on the None returned by dict.get() used to raise AttributeError.
    document_number = data.get('document_number')
    if document_number is not None:
        num, year = document_number.split('-')
        data['uri'] = 'TED:NOTICE:%s-%s:DATA:EN:HTML' % (num, year)
    return data
def parse_current_language(path):
    """Load the document at *path* and collect its header fields.

    Returns:
        A ``(document, data)`` tuple.  ``document`` is whatever
        ``as_document(path)`` returned; ``data`` is a dict with the keys
        ``source_tender`` (``path`` up to its last ``/``), ``title_uc``,
        ``date``, ``oj_uc``, ``heading`` (stripped), ``signature`` and
        ``identifier``.

    Raises:
        IndexError: if one of the single-element selectors matches nothing
            (``pop()`` on an empty result).
        ValueError: if ``.tab > div.stdoc p`` does not match exactly two
            elements.
    """
    # NOTE(review): parse_data() guards against as_document() returning
    # None; this function does not -- confirm whether that can happen here.
    document = as_document(path)

    def last_text(node, selector):
        # Text of the last element matching ``selector`` below ``node``.
        return node.cssselect(selector).pop().text

    parent_url = path.rsplit('/', 1)[0]
    title_uc = last_text(document, '#mainContent h2')
    body = document.cssselect('#docContent').pop()

    data = {
        'source_tender': parent_url,
        'title_uc': title_uc,
        'date': last_text(body, '#docHeader span.date'),
        'oj_uc': last_text(body, '#docHeader span.oj'),
        'heading': last_text(body, '#docHeader span.heading').strip(),
    }

    # Exactly two paragraphs are expected here: signature, then identifier.
    paragraphs = body.cssselect('.tab > div.stdoc p')
    signature, identifier = paragraphs
    data.update([
        ('signature', signature.text),
        ('identifier', identifier.text),
    ])
    return document, data