Exemplo n.º 1
0
#     digests = f.readlines()
# responses = ['testdata/docs/response_%s.json' % d.strip() for d in digests]

# Reset the output CSV: opening in 'w' mode discards any earlier contents, so
# the file holds just the pipe-delimited header row once this block finishes.
header_row = 'digest|url|protocol|subtype|service|has dataset|has metadata|version|is error\n'
with open('testdata/second_harvest/priority_identification_all.csv', 'w') as csv_out:
    csv_out.write(header_row)

# Run protocol/service identification over each stored response document.
# Every entry in `responses` is used as a path to a JSON file from which the
# 'digest', 'raw_content' and 'url' keys are read.
for response in responses:
    with open(response, 'r') as f:
        data = json.loads(f.read())

    digest = data['digest']
    raw_content = data['raw_content']
    url = data['url']

    # `**{}` passes no extra keyword arguments.
    # NOTE(review): RawResponse receives the upper-cased URL while Identify
    # below receives the original one -- confirm the asymmetry is intentional.
    rr = RawResponse(url.upper(), raw_content, digest, **{})
    cleaned_text = rr.clean_raw_content()
    # Trim leading/trailing whitespace from the cleaned content before parsing.
    cleaned_text = cleaned_text.strip()

    try:
        parser = Parser(cleaned_text)
    except Exception as ex:
        # Best-effort: any failure while building the parser is logged at
        # debug level with the traceback (exc_info=1) and this response is
        # skipped. `ex` itself is not referenced.
        logger.debug('xml parsing error: %s' % digest, exc_info=1)
        continue

    # Python 2 print statement; reached only when the parser was built.
    print digest

    # The parser built above is handed to Identify through kwargs, together
    # with ignore_case=True; the URL is passed in its original casing here.
    identifier = Identify(YAML_FILE, cleaned_text, url, **{'parser': parser, 'ignore_case': True})
    identifier.identify()
    # Read the results off the identifier once identify() has been called.
    protocol = identifier.protocol
    subtype = identifier.subtype
    service = identifier.service