def parse(xml_file):
    """Stream the ``accreditedPerson`` records contained in an XML file.

    The document is read incrementally with ``etree.iterparse``.  Each
    element whose tag is ``NS + 'accreditedPerson'`` is flattened into a
    dict once its end event is seen, and is cleared after the consumer has
    resumed the generator so that already-processed subtrees can be freed.

    Args:
        xml_file: Path of the XML file to read.

    Yields:
        dict: One record per ``accreditedPerson`` element with the keys
        ``org_identification_code``, ``number_of_ir``, ``org_name``,
        ``title``, ``first_name``, ``last_name``, ``start_date`` and
        ``end_date``.  The two dates are passed through ``dateconv``; all
        other values are the raw element text, or ``None`` when the child
        element is absent.
    """
    # A single pre-formatted argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("XML %s" % (xml_file,))

    person_tag = NS + 'accreditedPerson'

    def text(el, name):
        return el.findtext(NS + name)

    # Binary mode hands the raw bytes to the XML parser, which decodes them
    # itself.  The ``with`` block closes the handle when the generator is
    # exhausted or closed; previously the handle was never closed.
    with open(xml_file, 'rb') as fh:
        for evt, ap_el in etree.iterparse(fh):
            if evt != 'end' or ap_el.tag != person_tag:
                continue
            yield {
                'org_identification_code': text(ap_el, 'orgIdentificationCode'),
                'number_of_ir': text(ap_el, 'numberOfIR'),
                'org_name': text(ap_el, 'orgName'),
                'title': text(ap_el, 'title'),
                'first_name': text(ap_el, 'firstName'),
                'last_name': text(ap_el, 'lastName'),
                'start_date': dateconv(text(ap_el, 'accreditationStartDate')),
                'end_date': dateconv(text(ap_el, 'accreditationEndDate')),
            }
            # Only drop the subtree after the consumer is done with the record.
            ap_el.clear()
def _parse_responsible(rep_el, tag):
    """Return title, names and position of the person stored under child *tag*."""
    prefix = NS + tag + '/' + NS
    return {
        'title': rep_el.findtext(prefix + 'title'),
        'first_name': rep_el.findtext(prefix + 'firstName'),
        'last_name': rep_el.findtext(prefix + 'lastName'),
        'position': rep_el.findtext(prefix + 'position'),
    }


def _parse_customized_sources(fi, container_tag):
    """Return the name/amount entries listed below the *container_tag* descendant of *fi*."""
    container = fi.find('.//' + NS + container_tag)
    if container is None:
        return []
    return [
        {
            'name': src_el.findtext(NS + 'name'),
            'amount': intconv(src_el.findtext(NS + 'amount')),
        }
        for src_el in container.findall(NS + 'customizedSource')
    ]


def _percentage_to_amount(percent_text, base):
    """Turn a percentage string into an absolute share of *base*.

    Empty or missing percentages are passed through untouched.  When no base
    figure is available the share cannot be computed and ``None`` is returned
    (this used to raise ``TypeError`` from ``float(None)``).
    """
    if not percent_text:
        return percent_text
    if base is None:
        return None
    return float(percent_text) / 100.0 * float(base)


def _parse_turnover_breakdown(fi, absolute, minimum, maximum):
    """Return one entry per customer listed in the turnover breakdown of *fi*.

    *absolute*, *minimum* and *maximum* are the already converted overall
    turnover figures; they serve as the base for percentage ranges.
    """
    tb = fi.find(NS + 'turnoverBreakdown')
    if tb is None:
        return []

    breakdown = []

    def add_customers(range_el, lower, upper):
        for customer in range_el.findall('.//' + NS + 'customer'):
            breakdown.append({
                'name': customer.findtext(NS + 'name'),
                'min': intconv(lower),
                'max': intconv(upper),
            })

    for range_el in tb.findall(NS + 'customersGroupsInAbsoluteRange'):
        upper = range_el.findtext('.//' + NS + 'max')
        lower = range_el.findtext('.//' + NS + 'min')
        add_customers(range_el, lower, upper)

    # Percentage ranges are converted into absolute amounts.  The upper bound
    # prefers the overall maximum as its base, the lower bound the overall
    # minimum; both fall back to whichever figure is available.
    for range_el in tb.findall(NS + 'customersGroupsInPercentageRange'):
        upper = _percentage_to_amount(
            range_el.findtext('.//' + NS + 'max'),
            absolute or maximum or minimum)
        lower = _percentage_to_amount(
            range_el.findtext('.//' + NS + 'min'),
            absolute or minimum or maximum)
        add_customers(range_el, lower, upper)

    return breakdown


def _parse_financial_data(fd_el):
    """Flatten a ``financialData`` element into a dict."""
    fd = {
        'start_date': dateconv(fd_el.findtext(NS + 'startDate')),
        'end_date': dateconv(fd_el.findtext(NS + 'endDate')),
        'eur_sources_procurement': intconv(
            fd_el.findtext(NS + 'eurSourcesProcurement')),
        'eur_sources_grants': intconv(fd_el.findtext(NS + 'eurSourcesGrants')),
    }

    fi = fd_el.find(NS + 'financialInformation')
    if fi is None:
        # Every other optional element is guarded; without this guard a
        # missing financialInformation child raised AttributeError.
        return fd

    def amount(path):
        return intconv(fi.findtext('.//' + NS + path))

    fd['type'] = fi.get(SI + 'type')
    # NOTE(review): every other tag looked up here is camelCase, so
    # 'total_budget' may never match - confirm against the XML schema.
    fd['total_budget'] = amount('total_budget')

    fd['public_financing_total'] = amount('totalPublicFinancing')
    fd['public_financing_national'] = amount('nationalSources')
    fd['public_financing_infranational'] = amount('infranationalSources')
    fd['public_customized'] = _parse_customized_sources(
        fi, 'customisedPublicSources')

    fd['other_sources_total'] = amount('totalOtherSources')
    fd['other_sources_donation'] = amount('donation')
    fd['other_sources_contributions'] = amount('contributions')
    fd['other_customized'] = _parse_customized_sources(fi, 'customisedOther')

    fd['direct_rep_costs_min'] = amount('directRepresentationCosts//' + NS + 'min')
    fd['direct_rep_costs_max'] = amount('directRepresentationCosts//' + NS + 'max')
    fd['cost_min'] = amount('cost//' + NS + 'min')
    fd['cost_max'] = amount('cost//' + NS + 'max')
    fd['cost_absolute'] = amount('cost//' + NS + 'absoluteAmount')
    fd['turnover_min'] = amount('turnover//' + NS + 'min')
    fd['turnover_max'] = amount('turnover//' + NS + 'max')
    fd['turnover_absolute'] = amount('turnover//' + NS + 'absoluteAmount')

    fd['turnover_breakdown'] = _parse_turnover_breakdown(
        fi, fd['turnover_absolute'], fd['turnover_min'], fd['turnover_max'])
    return fd


def parse_rep(rep_el):
    """Flatten one ``rep_el`` XML element into a nested dict.

    Args:
        rep_el: Element whose children live in namespace ``NS``
            (presumably one registered representative - confirm against
            the caller).

    Returns:
        dict: The extracted record.  Besides the scalar fields it holds
        ``legal_person`` and ``eu_person`` (dicts), ``action_fields``,
        ``interests``, ``country_of_members`` and ``organisations`` (lists)
        and ``fd`` (financial data).  The ``activity_*`` and ``members_*``
        keys are only present when the ``activities`` / ``members`` child
        exists.  ``fd`` is empty when there is no ``financialData`` child
        and contains only the period and EUR source fields when its
        ``financialInformation`` child is missing.  Dates go through
        ``dateconv`` and numeric fields through ``intconv``; everything
        else is raw element text or ``None``.
    """
    rep = {
        'identification_code': rep_el.findtext(NS + 'identificationCode'),
        'status': rep_el.findtext(NS + 'status'),
        'registration_date': dateconv(rep_el.findtext(NS + 'registrationDate')),
        'last_update_date': dateconv(rep_el.findtext(NS + 'lastUpdateDate')),
        'legal_status': rep_el.findtext(NS + 'legalStatus'),
        'acronym': rep_el.findtext(NS + 'acronym'),
        'original_name': rep_el.findtext('.//' + NS + 'originalName'),
        'main_category': rep_el.findtext('.//' + NS + 'mainCategory'),
        'sub_category': rep_el.findtext('.//' + NS + 'subCategory'),
        'goals': rep_el.findtext(NS + 'goals'),
        'networking': rep_el.findtext(NS + 'networking'),
        'code_of_conduct': rep_el.findtext(NS + 'codeOfConduct'),
    }

    # The URL is stored in an attribute from the NS2 namespace.
    url_el = rep_el.find(NS + 'webSiteURL')
    rep['web_site_url'] = url_el.get(NS2 + 'href') if url_el is not None else None

    rep['legal_person'] = _parse_responsible(rep_el, 'legalResp')
    rep['eu_person'] = _parse_responsible(rep_el, 'euRelationsResp')

    # '/' looks at direct children of contactDetails, '//' at any descendant.
    contact = NS + 'contactDetails'
    for key, sep, tag in (
            ('contact_street', '/', 'addressline1'),
            ('contact_number', '/', 'number'),
            ('contact_post_code', '/', 'postCode'),
            ('contact_town', '/', 'town'),
            ('contact_country', '/', 'country'),
            ('contact_indic_phone', '//', 'indicPhone'),
            ('contact_indic_fax', '//', 'indicFax'),
            ('contact_fax', '//', 'fax'),
            ('contact_phone', '//', 'phoneNumber'),
            ('contact_more', '/', 'moreContactDetails'),
    ):
        rep[key] = rep_el.findtext(contact + sep + NS + tag)

    act_el = rep_el.find(NS + 'activities')
    if act_el is not None:
        for key, tag in (
                ('activity_eu_legislative', 'activityEuLegislative'),
                ('activity_communication', 'activityRelevantComm'),
                ('activity_high_level_groups', 'activityHighLevelGroups'),
                ('activity_consult_committee', 'activityConsultCommittees'),
                ('activity_expert_groups', 'activityExpertGroups'),
                ('activity_inter_groups', 'activityInterGroups'),
                ('activity_industry_forums', 'activityIndustryForums'),
                ('activity_other', 'activityOther'),
        ):
            rep[key] = act_el.findtext(NS + tag)

    rep['action_fields'] = [
        field.text for field in
        rep_el.findall('.//' + NS + 'actionField/' + NS + 'actionField')
    ]
    rep['interests'] = [
        interest.text for interest in
        rep_el.findall('.//' + NS + 'interest/' + NS + 'name')
    ]

    structure = './/' + NS + 'structure/' + NS
    rep['number_of_natural_persons'] = intconv(
        rep_el.findtext(structure + 'numberOfNaturalPersons'))
    rep['number_of_organisations'] = intconv(
        rep_el.findtext(structure + 'numberOfOrganisations'))

    mem_el = rep_el.find(NS + 'members')
    if mem_el is not None:
        for key, tag in (
                ('members_100_percent', 'members100Percent'),
                ('members_25_percent', 'members25Percent'),
                ('members_total', 'members'),
                ('members_fte', 'membersFTE'),
                ('members_info', 'infoMembers'),
        ):
            rep[key] = mem_el.findtext(NS + tag)

    countries_el = rep_el.find(NS + 'structure/' + NS + 'countries')
    if countries_el is None:
        rep['country_of_members'] = []
    else:
        rep['country_of_members'] = [
            country.text
            for country in countries_el.findall('.//' + NS + 'country')
        ]

    orgs_el = rep_el.find(NS + 'structure/' + NS + 'organisations')
    if orgs_el is None:
        rep['organisations'] = []
    else:
        rep['organisations'] = [
            {
                'name': org_el.findtext(NS + 'name'),
                'number_of_members': org_el.findtext(NS + 'numberOfMembers'),
            }
            for org_el in orgs_el.findall(NS + 'organisation')
        ]

    fd_el = rep_el.find(NS + 'financialData')
    rep['fd'] = _parse_financial_data(fd_el) if fd_el is not None else {}
    return rep