def parse(xml_file):
    """Stream accredited-person records out of an XML file.

    The file is read incrementally with ``etree.iterparse``; one dict is
    yielded for every completed element whose tag is
    ``NS + 'accreditedPerson'``.  Each element is cleared once the consumer
    asks for the next record.

    :param xml_file: path of the XML file to read.
    :returns: a generator of dicts with the keys ``org_identification_code``,
        ``number_of_ir``, ``org_name``, ``title``, ``first_name``,
        ``last_name``, ``start_date`` and ``end_date``.  The text fields are
        whatever ``findtext`` returns for the matching child element; the two
        dates are additionally passed through ``dateconv``.
    """
    # A single pre-formatted argument prints the same line whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("XML %s" % (xml_file,))
    person_tag = NS + 'accreditedPerson'
    # Binary mode leaves decoding to the XML parser, and the context manager
    # closes the handle once the generator is exhausted, closed or collected
    # (the original never closed it).
    with open(xml_file, 'rb') as fh:
        # res = requests.get(URL, stream=True)
        # res.raw.decode_content = True
        for evt, ap_el in etree.iterparse(fh):
            if evt != 'end' or ap_el.tag != person_tag:
                continue
            ap = {
                'org_identification_code': ap_el.findtext(NS + 'orgIdentificationCode'),
                'number_of_ir': ap_el.findtext(NS + 'numberOfIR'),
                # 'xml': etree.tostring(ap_el),
                'org_name': ap_el.findtext(NS + 'orgName'),
                'title': ap_el.findtext(NS + 'title'),
                'first_name': ap_el.findtext(NS + 'firstName'),
                'last_name': ap_el.findtext(NS + 'lastName'),
                'start_date': dateconv(ap_el.findtext(NS + 'accreditationStartDate')),
                'end_date': dateconv(ap_el.findtext(NS + 'accreditationEndDate')),
            }
            yield ap
            # Everything needed has been copied into ``ap``; drop the
            # element's content before parsing continues.
            ap_el.clear()
# Example #2
# 0
def parse(xml_file):
    """Yield one dict per ``accreditedPerson`` element found in *xml_file*.

    Parsing is incremental (``etree.iterparse``): a record is produced as
    soon as the end of an element tagged ``NS + 'accreditedPerson'`` has been
    read, and that element is cleared when the consumer resumes the
    generator.

    :param xml_file: path of the XML file to read.
    :returns: a generator of dicts with the keys ``org_identification_code``,
        ``number_of_ir``, ``org_name``, ``title``, ``first_name``,
        ``last_name``, ``start_date`` and ``end_date``.  The text fields are
        whatever ``findtext`` returns for the matching child element; the two
        dates are additionally passed through ``dateconv``.
    """
    # Formatting into one string keeps the output identical under both the
    # Python 2 print statement and the Python 3 print function.
    print("XML %s" % (xml_file,))
    person_tag = NS + 'accreditedPerson'
    # ``with`` guarantees the handle is released (the original leaked it);
    # binary mode hands the raw bytes to the XML parser for decoding.
    with open(xml_file, 'rb') as fh:
        # res = requests.get(URL, stream=True)
        # res.raw.decode_content = True
        for evt, ap_el in etree.iterparse(fh):
            if evt != 'end' or ap_el.tag != person_tag:
                continue
            ap = {
                'org_identification_code':
                ap_el.findtext(NS + 'orgIdentificationCode'),
                'number_of_ir': ap_el.findtext(NS + 'numberOfIR'),
                # 'xml': etree.tostring(ap_el),
                'org_name': ap_el.findtext(NS + 'orgName'),
                'title': ap_el.findtext(NS + 'title'),
                'first_name': ap_el.findtext(NS + 'firstName'),
                'last_name': ap_el.findtext(NS + 'lastName'),
                'start_date':
                dateconv(ap_el.findtext(NS + 'accreditationStartDate')),
                'end_date':
                dateconv(ap_el.findtext(NS + 'accreditationEndDate')),
            }
            yield ap
            # The record is self-contained, so the element's content can be
            # discarded before the parser moves on.
            ap_el.clear()
# Example #3
# 0
def parse_rep(rep_el):
    """Convert one register entry element into a plain dict.

    All lookups are made relative to *rep_el* with tags qualified by ``NS``.
    Plain text values are stored as returned by ``findtext``; dates go
    through ``dateconv`` and amounts/counts through ``intconv``.

    The result contains:

    * identification and classification fields, plus ``web_site_url`` (the
      ``NS2 + 'href'`` attribute of ``webSiteURL``, or ``None`` when that
      element is absent);
    * ``legal_person`` / ``eu_person``: dicts with ``title``, ``first_name``,
      ``last_name`` and ``position``;
    * ``contact_*`` fields, ``goals``, ``networking``, ``code_of_conduct``;
    * ``activity_*`` keys, only when an ``activities`` element exists;
    * ``members_*`` keys, only when a ``members`` element exists;
    * the lists ``action_fields``, ``interests``, ``country_of_members`` and
      ``organisations`` (empty when nothing matches);
    * ``fd``: the financial data dict, empty when there is no
      ``financialData`` element.

    :param rep_el: element holding one register entry.
    :returns: the dict described above.
    """
    deep = './/' + NS

    rep = {}
    rep['identification_code'] = rep_el.findtext(NS + 'identificationCode')
    rep['status'] = rep_el.findtext(NS + 'status')
    rep['registration_date'] = dateconv(rep_el.findtext(NS + 'registrationDate'))
    rep['last_update_date'] = dateconv(rep_el.findtext(NS + 'lastUpdateDate'))
    rep['legal_status'] = rep_el.findtext(NS + 'legalStatus')
    rep['acronym'] = rep_el.findtext(NS + 'acronym')
    rep['original_name'] = rep_el.findtext(deep + 'originalName')
    site_el = rep_el.find(NS + 'webSiteURL')
    rep['web_site_url'] = site_el.get(NS2 + 'href') if site_el is not None else None
    rep['main_category'] = rep_el.findtext(deep + 'mainCategory')
    rep['sub_category'] = rep_el.findtext(deep + 'subCategory')

    rep['legal_person'] = _parse_person(rep_el, 'legalResp')
    rep['eu_person'] = _parse_person(rep_el, 'euRelationsResp')

    # Address parts are direct children of contactDetails; the phone and fax
    # parts are searched at any depth below it.
    contact = NS + 'contactDetails/' + NS
    contact_deep = NS + 'contactDetails//' + NS
    rep['contact_street'] = rep_el.findtext(contact + 'addressline1')
    rep['contact_number'] = rep_el.findtext(contact + 'number')
    rep['contact_post_code'] = rep_el.findtext(contact + 'postCode')
    rep['contact_town'] = rep_el.findtext(contact + 'town')
    rep['contact_country'] = rep_el.findtext(contact + 'country')
    rep['contact_indic_phone'] = rep_el.findtext(contact_deep + 'indicPhone')
    rep['contact_indic_fax'] = rep_el.findtext(contact_deep + 'indicFax')
    rep['contact_fax'] = rep_el.findtext(contact_deep + 'fax')
    rep['contact_phone'] = rep_el.findtext(contact_deep + 'phoneNumber')
    rep['contact_more'] = rep_el.findtext(contact + 'moreContactDetails')

    rep['goals'] = rep_el.findtext(NS + 'goals')
    rep['networking'] = rep_el.findtext(NS + 'networking')

    # rep['activities'] = rep_el.findtext(NS + 'activities')
    act_el = rep_el.find(NS + 'activities')
    if act_el is not None:
        activity_fields = (
            ('activity_eu_legislative', 'activityEuLegislative'),
            ('activity_communication', 'activityRelevantComm'),
            ('activity_high_level_groups', 'activityHighLevelGroups'),
            ('activity_consult_committee', 'activityConsultCommittees'),
            ('activity_expert_groups', 'activityExpertGroups'),
            ('activity_inter_groups', 'activityInterGroups'),
            ('activity_industry_forums', 'activityIndustryForums'),
            ('activity_other', 'activityOther'),
        )
        for key, tag in activity_fields:
            rep[key] = act_el.findtext(NS + tag)

    rep['code_of_conduct'] = rep_el.findtext(NS + 'codeOfConduct')
    rep['action_fields'] = [
        field.text
        for field in rep_el.findall(deep + 'actionField/' + NS + 'actionField')
    ]
    rep['interests'] = [
        interest.text
        for interest in rep_el.findall(deep + 'interest/' + NS + 'name')
    ]

    structure = deep + 'structure/' + NS
    rep['number_of_natural_persons'] = intconv(
        rep_el.findtext(structure + 'numberOfNaturalPersons'))
    rep['number_of_organisations'] = intconv(
        rep_el.findtext(structure + 'numberOfOrganisations'))

    mem_el = rep_el.find(NS + 'members')
    if mem_el is not None:
        rep['members_100_percent'] = mem_el.findtext(NS + 'members100Percent')
        rep['members_25_percent'] = mem_el.findtext(NS + 'members25Percent')
        rep['members_total'] = mem_el.findtext(NS + 'members')
        rep['members_fte'] = mem_el.findtext(NS + 'membersFTE')
        rep['members_info'] = mem_el.findtext(NS + 'infoMembers')

    rep['country_of_members'] = []
    countries_el = rep_el.find(NS + 'structure/' + NS + 'countries')
    if countries_el is not None:
        rep['country_of_members'] = [
            country.text for country in countries_el.findall(deep + 'country')
        ]

    rep['organisations'] = []
    orgs_el = rep_el.find(NS + 'structure/' + NS + 'organisations')
    if orgs_el is not None:
        for org_el in orgs_el.findall(NS + 'organisation'):
            rep['organisations'].append({
                'name': org_el.findtext(NS + 'name'),
                # Kept as text: the original does not run this through intconv.
                'number_of_members': org_el.findtext(NS + 'numberOfMembers'),
            })

    fd_el = rep_el.find(NS + 'financialData')
    rep['fd'] = _parse_financial_data(fd_el) if fd_el is not None else {}
    return rep


def _parse_person(rep_el, container):
    """Return title, names and position found under ``NS + container``."""
    prefix = NS + container + '/' + NS
    return {
        'title': rep_el.findtext(prefix + 'title'),
        'first_name': rep_el.findtext(prefix + 'firstName'),
        'last_name': rep_el.findtext(prefix + 'lastName'),
        'position': rep_el.findtext(prefix + 'position'),
    }


def _parse_customized_sources(fi, container):
    """Return the name/amount dicts listed under *container* inside *fi*.

    An empty list is returned when the container element does not exist.
    """
    container_el = fi.find('.//' + NS + container)
    if container_el is None:
        return []
    return [
        {
            'name': src_el.findtext(NS + 'name'),
            'amount': intconv(src_el.findtext(NS + 'amount')),
        }
        for src_el in container_el.findall(NS + 'customizedSource')
    ]


def _percent_of(percent_text, base):
    """Turn a percentage string into an absolute share of *base*.

    An empty or missing percentage is passed through unchanged.  When there
    is no base amount at all the share cannot be computed and ``None`` is
    returned (the original raised ``TypeError`` on ``float(None)`` here).
    """
    if not percent_text:
        return percent_text
    if base is None:
        return None
    return float(percent_text) / 100.0 * float(base)


def _parse_turnover_breakdown(fi, fd):
    """Return per-customer turnover ranges found under ``turnoverBreakdown``.

    Absolute ranges are taken as given.  Percentage ranges are converted to
    absolute amounts using the turnover figures already stored in *fd*.
    """
    breakdown = []
    tb = fi.find(NS + 'turnoverBreakdown')
    if tb is None:
        return breakdown
    deep = './/' + NS

    for range_ in tb.findall(NS + 'customersGroupsInAbsoluteRange'):
        max_ = range_.findtext(deep + 'max')
        min_ = range_.findtext(deep + 'min')
        for customer in range_.findall(deep + 'customer'):
            breakdown.append({
                'name': customer.findtext(NS + 'name'),
                'min': intconv(min_),
                'max': intconv(max_),
            })

    for range_ in tb.findall(NS + 'customersGroupsInPercentageRange'):
        # FIXME: I hate political compromises going into DB design
        # so directly.
        # The upper bound prefers turnover_max as its base, the lower bound
        # turnover_min; an absolute turnover wins over both.
        max_ = _percent_of(
            range_.findtext(deep + 'max'),
            fd['turnover_absolute'] or fd['turnover_max'] or fd['turnover_min'])
        min_ = _percent_of(
            range_.findtext(deep + 'min'),
            fd['turnover_absolute'] or fd['turnover_min'] or fd['turnover_max'])
        for customer in range_.findall(deep + 'customer'):
            breakdown.append({
                'name': customer.findtext(NS + 'name'),
                'min': intconv(min_),
                'max': intconv(max_),
            })
    return breakdown


def _parse_financial_data(fd_el):
    """Build the ``fd`` dict from a ``financialData`` element.

    When the nested ``financialInformation`` element is missing, only the
    period and EU-source keys are present (the original raised
    ``AttributeError`` in that case).
    """
    fd = {}
    fd['start_date'] = dateconv(fd_el.findtext(NS + 'startDate'))
    fd['end_date'] = dateconv(fd_el.findtext(NS + 'endDate'))
    fd['eur_sources_procurement'] = intconv(fd_el.findtext(NS + 'eurSourcesProcurement'))
    fd['eur_sources_grants'] = intconv(fd_el.findtext(NS + 'eurSourcesGrants'))

    fi = fd_el.find(NS + 'financialInformation')
    if fi is None:
        return fd

    deep = './/' + NS
    fd['type'] = fi.get(SI + 'type')
    # NOTE(review): every other tag looked up here is camelCase; confirm
    # against the XML schema whether this should be 'totalBudget'.
    fd['total_budget'] = intconv(fi.findtext(deep + 'total_budget'))
    fd['public_financing_total'] = intconv(fi.findtext(deep + 'totalPublicFinancing'))
    fd['public_financing_national'] = intconv(fi.findtext(deep + 'nationalSources'))
    fd['public_financing_infranational'] = intconv(
        fi.findtext(deep + 'infranationalSources'))
    fd['public_customized'] = _parse_customized_sources(fi, 'customisedPublicSources')
    fd['other_sources_total'] = intconv(fi.findtext(deep + 'totalOtherSources'))
    fd['other_sources_donation'] = intconv(fi.findtext(deep + 'donation'))
    fd['other_sources_contributions'] = intconv(fi.findtext(deep + 'contributions'))
    fd['other_customized'] = _parse_customized_sources(fi, 'customisedOther')

    direct = deep + 'directRepresentationCosts//' + NS
    fd['direct_rep_costs_min'] = intconv(fi.findtext(direct + 'min'))
    fd['direct_rep_costs_max'] = intconv(fi.findtext(direct + 'max'))
    cost = deep + 'cost//' + NS
    fd['cost_min'] = intconv(fi.findtext(cost + 'min'))
    fd['cost_max'] = intconv(fi.findtext(cost + 'max'))
    fd['cost_absolute'] = intconv(fi.findtext(cost + 'absoluteAmount'))
    turnover = deep + 'turnover//' + NS
    fd['turnover_min'] = intconv(fi.findtext(turnover + 'min'))
    fd['turnover_max'] = intconv(fi.findtext(turnover + 'max'))
    fd['turnover_absolute'] = intconv(fi.findtext(turnover + 'absoluteAmount'))

    # Needs the turnover_* keys set just above.
    fd['turnover_breakdown'] = _parse_turnover_breakdown(fi, fd)
    return fd