예제 #1
0
파일: tasks.py 프로젝트: Eleonore9/scrapi
def lint(name):
    """Run the linter against the harvester registered under ``name``.

    The harvester is looked up in ``registry`` and its ``harvest`` and
    ``normalize`` attributes are handed to ``linter.lint``. Any ``Exception``
    raised while doing so is printed instead of propagated; a failed
    registry lookup is not caught here.
    """
    target = registry[name]
    try:
        linter.lint(target.harvest, target.normalize)
    except Exception as err:
        # Best-effort reporting: name the failing harvester, then show the error.
        banner = 'Harvester {} raise the following exception'
        print(banner.format(target.short_name))
        print(err)
예제 #2
0
def lint(name):
    """Lint a single harvester, identified by its key in ``registry``.

    Passes the harvester's ``harvest`` and ``normalize`` attributes to
    ``linter.lint``. Exceptions raised inside the ``try`` are printed
    together with the harvester's ``short_name`` rather than re-raised;
    the registry lookup itself happens outside the ``try``.
    """
    selected = registry[name]
    try:
        linter.lint(selected.harvest, selected.normalize)
    except Exception as error:
        # Report and continue rather than aborting the caller.
        message = 'Harvester {} raise the following exception'.format(selected.short_name)
        print(message)
        print(error)
예제 #3
0
    date = parse(date_updated).isoformat()
    return copy_to_unicode(date)


def normalize(raw_doc):
    """Build a ``NormalizedDocument`` from a raw document carrying XML.

    The XML text is read from ``raw_doc.get('doc')`` and parsed with
    ``etree.XML``. Title and abstract are taken from the first matching
    node, defaulting to the empty string when nothing matches; the
    remaining fields are delegated to the ``get_*`` helpers.

    :param raw_doc: object exposing ``get('doc')`` that returns the XML text.
    :returns: ``NormalizedDocument`` wrapping the assembled field dict.
    """
    raw_doc_text = raw_doc.get('doc')
    xml_doc = etree.XML(raw_doc_text)

    # Title: prefer the official title, fall back to the brief title.
    title = (xml_doc.xpath('//official_title/node()') or xml_doc.xpath('//brief_title/node()') or [''])[0]

    # Abstract: first node of the brief summary text block, or '' if absent.
    # NOTE(review): this expression previously repeated the very same xpath as
    # its own fallback, which could never yield a different result. The dead
    # duplicate is dropped here; if a different fallback element was intended,
    # confirm against the source schema and add it.
    abstract = (xml_doc.xpath('//brief_summary/textblock/node()') or [''])[0]

    normalized_dict = {
        'title': copy_to_unicode(title),
        'contributors': get_contributors(xml_doc),
        'properties': get_properties(xml_doc),
        'description': copy_to_unicode(abstract),
        'id': get_ids(raw_doc, xml_doc),
        'source': NAME,
        'tags': get_tags(xml_doc),
        'dateUpdated': get_date_updated(xml_doc)
    }

    return NormalizedDocument(normalized_dict)


if __name__ == '__main__':
    # Script entry point: lint this module's consume/normalize pair and
    # print whatever the linter returns.
    lint_result = lint(consume, normalize)
    print(lint_result)
예제 #4
0

def normalize(raw_doc):
    """Turn a raw document carrying XML into a ``NormalizedDocument``.

    The XML text comes from ``raw_doc.get('doc')``. Title and description
    are the first node matched by the ``str[@name='title']`` and
    ``str[@name='abstract']`` queries, or the empty string when there is no
    match. All other fields are produced by the ``get_*`` helpers.

    :param raw_doc: object exposing ``get('doc')`` that returns the XML text.
    :returns: ``NormalizedDocument`` for the assembled fields, or ``None``
        when the ``url`` entry of the computed ids is the empty string.
    """
    tree = etree.XML(raw_doc.get('doc'))

    title_nodes = tree.xpath("str[@name='title']/node()") or ['']
    heading = title_nodes[0]
    abstract_nodes = tree.xpath("str[@name='abstract']/node()") or ['']
    abstract = abstract_nodes[0]

    fields = {
        'title': copy_to_unicode(heading),
        'contributors': get_contributors(tree),
        'properties': get_properties(tree),
        'description': copy_to_unicode(abstract),
        'id': get_ids(tree, raw_doc),
        'tags': get_tags(tree),
        'source': NAME,
        'dateCreated': get_date_created(tree),
        'dateUpdated': get_date_updated(tree)
    }

    # A document without a URL is dropped instead of being normalized.
    return None if fields['id']['url'] == u'' else NormalizedDocument(fields)


if __name__ == '__main__':
    # When executed directly, run the linter over consume/normalize and
    # print its return value.
    outcome = lint(consume, normalize)
    print(outcome)