Exemplo n.º 1
0
def parse_data(path):
    """Parse the data table of the document at ``path`` into a dict.

    Every ``#docContent tr`` row is unpacked into three cells (code, title,
    value). The text of the code cell is looked up in ``DATA_CODES``, which
    yields a ``(field, field_type)`` pair deciding how the value is stored:

    * ``'list'``: ``data[field]`` is the cell text followed by the tail text
      of each ``<br>`` child of the value cell.
    * ``'code'``: the text is split on its first ``-`` into
      ``data[field + '_id']`` and ``data[field + '_name']`` (both stripped).
      When the text is missing or has no ``-``, the whole text becomes the
      id and the name is the empty string.
    * anything else: ``data[field]`` is the raw cell text.

    Rows whose code is not in ``DATA_CODES`` are reported on stdout and
    skipped.

    Returns:
        dict of parsed fields; empty when ``as_document`` returns ``None``.
        The ``'uri'`` key is only added when a ``document_number`` value
        was parsed.

    Raises:
        ValueError: if a row does not have exactly three children, or the
            document number does not split into exactly two parts on ``-``.
    """
    data = {}
    doc = as_document(path)
    if doc is None:
        return data
    for row in doc.cssselect('#docContent tr'):
        # list(row) is the documented replacement for the deprecated
        # Element.getchildren().
        code_cell, title_cell, value_cell = list(row)
        code, title = code_cell.text, title_cell.text
        if code not in DATA_CODES:
            # Single pre-formatted argument: prints the same text under both
            # the Python 2 print statement and the Python 3 print function.
            print("Missing code spec: %s %s %s" % (path, code, [title]))
            continue
        field, field_type = DATA_CODES[code]
        text = value_cell.text
        if field_type == 'list':
            data[field] = [text] + [br.tail for br in value_cell.findall('br')]
        elif field_type == 'code':
            # partition() never fails to unpack, unlike split('-', 1) on a
            # value without a dash; `or ''` covers an empty cell (text None).
            ident, _, name = (text or '').partition('-')
            data[field + '_id'] = ident.strip()
            data[field + '_name'] = name.strip()
        else:
            data[field] = text

    # The document number may be absent (no matching row, or an empty cell);
    # only build the URI when there is something to build it from.
    document_number = data.get('document_number')
    if document_number:
        num, year = document_number.split('-')
        data['uri'] = 'TED:NOTICE:%s-%s:DATA:EN:HTML' % (num, year)
    return data
Exemplo n.º 2
0
def parse_data(path):
    """Parse the data table of the document at ``path`` into a dict.

    Every ``#docContent tr`` row is unpacked into three cells (code, title,
    value). The text of the code cell is looked up in ``DATA_CODES``, which
    yields a ``(field, field_type)`` pair deciding how the value is stored:

    * ``'list'``: ``data[field]`` is the cell text followed by the tail text
      of each ``<br>`` child of the value cell.
    * ``'code'``: the text is split on its first ``-`` into
      ``data[field + '_id']`` and ``data[field + '_name']`` (both stripped).
      When the text is missing or has no ``-``, the whole text becomes the
      id and the name is the empty string.
    * anything else: ``data[field]`` is the raw cell text.

    Rows whose code is not in ``DATA_CODES`` are reported on stdout and
    skipped.

    Returns:
        dict of parsed fields; empty when ``as_document`` returns ``None``.
        The ``'uri'`` key is only added when a ``document_number`` value
        was parsed.

    Raises:
        ValueError: if a row does not have exactly three children, or the
            document number does not split into exactly two parts on ``-``.
    """
    data = {}
    doc = as_document(path)
    if doc is None:
        return data
    for row in doc.cssselect('#docContent tr'):
        # list(row) is the documented replacement for the deprecated
        # Element.getchildren().
        code_cell, title_cell, value_cell = list(row)
        code, title = code_cell.text, title_cell.text
        if code not in DATA_CODES:
            # Single pre-formatted argument: prints the same text under both
            # the Python 2 print statement and the Python 3 print function.
            print("Missing code spec: %s %s %s" % (path, code, [title]))
            continue
        field, field_type = DATA_CODES[code]
        text = value_cell.text
        if field_type == 'list':
            data[field] = [text] + [br.tail for br in value_cell.findall('br')]
        elif field_type == 'code':
            # partition() never fails to unpack, unlike split('-', 1) on a
            # value without a dash; `or ''` covers an empty cell (text None).
            ident, _, name = (text or '').partition('-')
            data[field + '_id'] = ident.strip()
            data[field + '_name'] = name.strip()
        else:
            data[field] = text

    # The document number may be absent (no matching row, or an empty cell);
    # only build the URI when there is something to build it from.
    document_number = data.get('document_number')
    if document_number:
        num, year = document_number.split('-')
        data['uri'] = 'TED:NOTICE:%s-%s:DATA:EN:HTML' % (num, year)
    return data
Exemplo n.º 3
0
def parse_current_language(path):
    """Extract the header fields of the document at ``path``.

    Returns:
        A ``(cl, data)`` tuple: the document returned by ``as_document``
        and a dict with the keys ``source_tender`` (``path`` up to its last
        ``/``), ``title_uc``, ``date``, ``oj_uc``, ``heading``,
        ``signature`` and ``identifier``. When ``as_document`` returns
        ``None`` the result is ``(None, data)`` with only ``source_tender``
        set.

    Raises:
        IndexError: if one of the single-element selectors matches nothing.
        ValueError: if ``.tab > div.stdoc p`` does not match exactly two
            elements.
    """
    cl = as_document(path)
    data = {'source_tender': path.rsplit('/', 1)[0]}
    if cl is None:
        # as_document can yield None (parse_data guards against it too);
        # hand back what is known instead of failing on None.cssselect.
        return cl, data

    # When a selector matches several nodes the last one is used.
    data['title_uc'] = cl.cssselect('#mainContent h2')[-1].text
    content = cl.cssselect('#docContent')[-1]
    data['date'] = content.cssselect('#docHeader span.date')[-1].text
    data['oj_uc'] = content.cssselect('#docHeader span.oj')[-1].text
    heading = content.cssselect('#docHeader span.heading')[-1]
    data['heading'] = heading.text.strip()

    signature, identifier = content.cssselect('.tab > div.stdoc p')
    data['signature'] = signature.text
    data['identifier'] = identifier.text

    return cl, data