def init_footprint_xml():
  """Return the opening FPXML boilerplate for a craigslist feed.

  Emits the XML prolog, FeedInfo for provider 105 (craigslist), a single
  empty placeholder <Organization> (craigslist postings carry no org data),
  and the opening <VolunteerOpportunities> tag.  The caller is expected to
  append opportunity records and close the remaining tags.
  """
  pieces = [
      '<?xml version="1.0" ?>',
      '<FootprintFeed schemaVersion="0.1">',
      "<FeedInfo>",
      xmlh.output_val("providerID", "105"),
      xmlh.output_val("providerName", "craigslist"),
      xmlh.output_val("feedID", "craigslist"),
      xmlh.output_val("createdDateTime", xmlh.current_ts()),
      xmlh.output_val("providerURL", "http://www.craigslist.org/"),
      "</FeedInfo>",
      # no "organization" in craigslist postings -- emit one empty org 0
      "<Organizations>",
      "<Organization>",
      "<organizationID>0</organizationID>",
      "<nationalEIN></nationalEIN>",
      "<name></name>",
      "<missionStatement></missionStatement>",
      "<description></description>",
      "<location>",
      xmlh.output_val("city", ""),
      xmlh.output_val("region", ""),
      xmlh.output_val("postalCode", ""),
      "</location>",
      "<organizationURL></organizationURL>",
      "<donateURL></donateURL>",
      "<logoURL></logoURL>",
      "<detailURL></detailURL>",
      "</Organization>",
      "</Organizations>",
      "<VolunteerOpportunities>",
  ]
  return "".join(pieces)
def parser(providerID, providerName, feedID, providerURL, feedDescription):
  """create an FPXML-compatible parser

  Builds a canned <FeedInfo> block from the given provider metadata and
  returns a parse function that runs parse_fast() and splices this
  FeedInfo over whatever FeedInfo the fast parser produced.

  Args:
    providerID/providerName/feedID/providerURL/feedDescription: strings
      substituted verbatim into the FeedInfo block.
  Returns:
    parse_func(instr, maxrecs, progress) -> (outstr, numorgs, numopps)
  """
  feedinfo = "<FeedInfo>"
  feedinfo += xmlh.output_val('providerID', providerID)
  feedinfo += xmlh.output_val('providerName', providerName)
  feedinfo += xmlh.output_val('feedID', feedID)
  feedinfo += xmlh.output_val('createdDateTime', xmlh.current_ts())
  feedinfo += xmlh.output_val('providerURL', providerURL)
  feedinfo += xmlh.output_val('description', feedDescription)
  feedinfo += "</FeedInfo>"
  # compile once here instead of on every parse_func call
  feedinfo_re = re.compile(r'<FeedInfo>.+?</FeedInfo>', re.DOTALL)
  def parse_func(instr, maxrecs, progress):
    """closure-- generated parse func"""
    outstr, numorgs, numopps = parse_fast(instr, maxrecs, progress)
    # replace the generated FeedInfo with the one built above
    return feedinfo_re.sub(feedinfo, outstr), numorgs, numopps
  return parse_func
def parse(instr, maxrecs, progress):
  # Convert a ServeNet-style feed (raw <VolunteerOpportunity> XML) to FPXML.
  # NOTE(review): providerID, providerName, feedID, providerURL,
  # feedDescription, ORGS, register_org and known_elnames come from
  # enclosing/module scope -- not visible in this chunk.
  numorgs = numopps = 0
  # rewrite namespace-style "<db:" / "</db:" tags to "<db_" so the simple
  # parser below doesn't choke on the colon
  instr = re.sub(r'<(/?db):', r'<\1_', instr)
  opps = re.findall(r'<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                    instr, re.DOTALL)
  volopps = ""
  for i, oppstr in enumerate(opps):
    #if progress and i > 0 and i % 250 == 0:
    #  print str(datetime.now())+": ", i, " opportunities processed."
    if (maxrecs > 0 and i > maxrecs):
      break
    xmlh.print_rps_progress("opps", progress, i, maxrecs)
    item = xmlh.simple_parser(oppstr, known_elnames, progress=False)
    # register_org presumably dedups/records the sponsoring org and
    # returns its ID -- defined elsewhere in the project
    orgid = register_org(item)
    # logoURL -- sigh, this is for the opportunity not the org
    volopps += '<VolunteerOpportunity>'
    volopps += xmlh.output_val('volunteerOpportunityID', str(i))
    volopps += xmlh.output_val('sponsoringOrganizationID', str(orgid))
    volopps += xmlh.output_node('volunteerHubOrganizationID', item, "LocalID")
    volopps += xmlh.output_node('title', item, "Title")
    volopps += xmlh.output_node('abstract', item, "Description")
    volopps += xmlh.output_node('description', item, "Description")
    volopps += xmlh.output_node('detailURL', item, "DetailURL")
    # -8888 is a sentinel for "unknown number of volunteers needed"
    volopps += xmlh.output_val('volunteersNeeded', "-8888")
    try:
      oppdates = item.getElementsByTagName("OpportunityDate")
    except:
      oppdates = []
    if len(oppdates) > 1:
      # multiple dates present: warn and fall through using the first one
      print datetime.now(), \
          "parse_servenet.py: only 1 OpportunityDate supported."
      #return None
      oppdate = oppdates[0]
    elif len(oppdates) == 0:
      oppdate = None
    else:
      oppdate = oppdates[0]
    volopps += '<dateTimeDurations><dateTimeDuration>'
    if oppdate:
      volopps += xmlh.output_val('openEnded', 'No')
      # ISO-8601-ish duration, e.g. P3W from quantity=3 unit=W
      volopps += xmlh.output_val(
        'duration',
        'P%s%s' % (xmlh.get_tag_val(oppdate, "DurationQuantity"),
                   xmlh.get_tag_val(oppdate, "DurationUnit")))
      volopps += xmlh.output_val('commitmentHoursPerWeek', '0')
      volopps += xmlh.output_node('startDate', oppdate, "StartDate")
      volopps += xmlh.output_node('endDate', oppdate, "EndDate")
    else:
      # no date info: treat as an open-ended opportunity
      volopps += xmlh.output_val('openEnded', 'Yes')
      volopps += xmlh.output_val('commitmentHoursPerWeek', '0')
    volopps += '</dateTimeDuration></dateTimeDurations>'
    volopps += '<locations>'
    try:
      opplocs = item.getElementsByTagName("Location")
    except:
      opplocs = []
    for opploc in opplocs:
      volopps += '<location>'
      virtual_tag = opploc.getElementsByTagName("Virtual")
      if virtual_tag and xmlh.get_tag_val(
          opploc, "Virtual").lower() == "yes":
        volopps += xmlh.output_val('virtual', 'Yes')
      else:
        volopps += xmlh.output_node('region', opploc, "StateOrProvince")
        volopps += xmlh.output_node('country', opploc, "Country")
        volopps += xmlh.output_node('postalCode', opploc, "ZipOrPostalCode")
      volopps += '</location>'
    volopps += '</locations>'
    volopps += '<categoryTags/>'
    volopps += '</VolunteerOpportunity>'
    numopps += 1
  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', providerID)
  outstr += xmlh.output_val('providerName', providerName)
  outstr += xmlh.output_val('feedID', feedID)
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', providerURL)
  outstr += xmlh.output_val('description', feedDescription)
  # TODO: capture ts -- use now?!
  outstr += '</FeedInfo>'
  # hardcoded: Organization
  outstr += '<Organizations>'
  for key in ORGS:
    outstr += ORGS[key]
    numorgs += 1
  outstr += '</Organizations>'
  outstr += '<VolunteerOpportunities>'
  outstr += volopps
  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'
  #outstr = re.sub(r'><([^/])', r'>\n<\1', outstr)
  return outstr, numorgs, numopps
def parse(instr, maxrecs, progress):
  """return FPXML given craigslist data"""
  # CL_LATLONGS maps a craigslist URL prefix to a "lat,long" string;
  # lazily loaded once per process
  if CL_LATLONGS == None:
    load_craigslist_latlongs()
  xmlh.print_progress("loading craigslist crawler output...")
  crawl_craigslist.parse_cache_file(instr, listings_only=True)
  xmlh.print_progress("loaded " + str(len(crawl_craigslist.pages)) +
                      " craigslist pages.")
  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += "<FeedInfo>"
  outstr += xmlh.output_val("providerID", "105")
  outstr += xmlh.output_val("providerName", "craigslist")
  outstr += xmlh.output_val("feedID", "craigslist")
  outstr += xmlh.output_val("createdDateTime", xmlh.current_ts())
  outstr += xmlh.output_val("providerURL", "http://www.craigslist.org/")
  outstr += "</FeedInfo>"
  numorgs = numopps = 0
  # no "organization" in craigslist postings -- emit one empty org 0
  outstr += "<Organizations>"
  outstr += "<Organization>"
  outstr += "<organizationID>0</organizationID>"
  outstr += "<nationalEIN></nationalEIN>"
  outstr += "<name></name>"
  outstr += "<missionStatement></missionStatement>"
  outstr += "<description></description>"
  outstr += "<location>"
  outstr += xmlh.output_val("city", "")
  outstr += xmlh.output_val("region", "")
  outstr += xmlh.output_val("postalCode", "")
  outstr += "</location>"
  outstr += "<organizationURL></organizationURL>"
  outstr += "<donateURL></donateURL>"
  outstr += "<logoURL></logoURL>"
  outstr += "<detailURL></detailURL>"
  outstr += "</Organization>"
  numorgs += 1
  outstr += "</Organizations>"
  # tally of listings dropped, by reason, for the progress report below
  skipped_listings = {}
  skipped_listings["body"] = skipped_listings["title"] = \
      skipped_listings["not-ok"] = 0
  outstr += "<VolunteerOpportunities>"
  for i, url in enumerate(crawl_craigslist.pages):
    page = crawl_craigslist.pages[url]
    # only keep listings whose author opted in to redistribution
    ok = extract(page, "it's OK to distribute this " +
                 "charitable volunteerism opportunity")
    if ok == "":
      skipped_listings["not-ok"] += 1
      continue
    title = extract(page, "<title>(.+?)</title>")
    if title == "":
      skipped_listings["title"] += 1
      continue
    body = extract(page, '<div id="userbody">(.+?)<')
    # drop listings with essentially no body text
    if len(body) < 25:
      skipped_listings["body"] += 1
      continue
    item_id = extract(url, "/vol/(.+?)[.]html$")
    locstr = extract(page, "Location: (.+?)<")
    datestr = extract(page, "Date: (.+?)<")
    ts = dateutil.parser.parse(datestr)
    datetimestr = ts.strftime("%Y-%m-%dT%H:%M:%S")
    datestr = ts.strftime("%Y-%m-%d")
    if maxrecs > 0 and i > maxrecs:
      break
    xmlh.print_rps_progress("opps", progress, i, maxrecs)
    if progress and i > 0 and i % 250 == 0:
      msg = "skipped " + str(skipped_listings["title"] +
                             skipped_listings["body"])
      msg += " listings (" + str(skipped_listings["title"]) + \
             " for no-title and "
      msg += str(skipped_listings["body"]) + " for short body and "
      msg += str(skipped_listings["not-ok"]) + " for no-redistrib)"
      xmlh.print_progress(msg)
    # print "---"
    # print "title:",title
    # print "loc:",locstr
    # print "date:",datestr
    # print "body:",body[0:100]
    # craigslist is full of weird escapes-- strip them
    body = re.sub(r"&[a-z]+;", "", body)
    title = re.sub(r"&[a-z]+;", "", title)
    locstr = re.sub(r"&[a-z]+;", "", locstr)
    outstr += "<VolunteerOpportunity>"
    outstr += "<volunteerOpportunityID>%s</volunteerOpportunityID>" % (item_id)
    outstr += "<sponsoringOrganizationIDs><sponsoringOrganizationID>0</sponsoringOrganizationID></sponsoringOrganizationIDs>"
    outstr += "<volunteerHubOrganizationIDs><volunteerHubOrganizationID>0</volunteerHubOrganizationID></volunteerHubOrganizationIDs>"
    outstr += "<title>%s</title>" % (title)
    outstr += "<detailURL>%s</detailURL>" % (url)
    # avoid CDATA in body...
    esc_body = xml.sax.saxutils.escape(body)
    esc_body100 = xml.sax.saxutils.escape(body[0:100])
    outstr += "<description>%s</description>" % (esc_body)
    outstr += "<abstract>%s</abstract>" % (esc_body100 + "...")
    outstr += "<lastUpdated>%s</lastUpdated>" % (datetimestr)
    # TODO: expires
    # TODO: synthesize location from metro...
    outstr += "<locations><location>"
    outstr += "<name>%s</name>" % (xml.sax.saxutils.escape(locstr))
    # what about the few that do geocode?
    lat, lng = "", ""
    try:
      # CL_LATLONGS is keyed by the URL prefix ahead of "vol/"
      domain, unused = url.split("vol/")
      lat, lng = CL_LATLONGS[domain].split(",")
    except:
      # ignore for now
      # print url
      # continue
      pass
    outstr += "<latitude>%s</latitude>" % (lat)
    outstr += "<longitude>%s</longitude>" % (lng)
    outstr += "</location></locations>"
    # outstr += '<locations><location>'
    # outstr += '<city>%s</city>' % (
    # outstr += '<region>%s</region>' % (
    # outstr += '</location></locations>'
    outstr += "<dateTimeDurations><dateTimeDuration>"
    outstr += "<openEnded>No</openEnded>"
    outstr += "<startDate>%s</startDate>" % (datestr)
    # TODO: endDate = startDate + N=14 days?
    # TODO: timezone???
    # outstr += '<endDate>%s</endDate>' % (
    outstr += "</dateTimeDuration></dateTimeDurations>"
    # TODO: categories???
    # outstr += '<categoryTags>'
    outstr += "</VolunteerOpportunity>"
    numopps += 1
  outstr += "</VolunteerOpportunities>"
  outstr += "</FootprintFeed>"
  # outstr = re.sub(r'><([^/])', r'>\n<\1', outstr)
  return outstr, numorgs, numopps
def parse(instr, maxrec, progress):
  """return FPXML given 350.org data

  Parses the 350.org custom XML (a list of <node> elements) and converts
  it to an FPXML feed with one hardcoded organization (ID 139).

  Args:
    instr: unicode string of 350.org XML.
    maxrec: stop after this many records (0 or negative = unlimited).
    progress: unused here; kept for parser-interface compatibility.
  Returns:
    (outstr, numorgs, numopps) tuple.
  """
  feed = xmlh.parse_or_die(instr.encode('utf-8'))
  org_id = str(139)
  mission_statement = "350.org is an international campaign that's building a movement to unite the world around solutions to the climate crisis--the solutions that science and justice demand."
  org_desc = "On October 10 we'll be helping host a Global Work Party, with thousands of communities setting up solar panels or digging community gardens or laying out bike paths."
  start_date = '2010-10-01'
  today = datetime.now()
  last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading 350.org custom XML...")
  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', org_id)
  outstr += xmlh.output_val('providerName', "350org")
  outstr += xmlh.output_val('feedID', "350org")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.350.org/")
  outstr += '</FeedInfo>'
  # 1 "organization" in 350.org postings
  outstr += '<Organizations><Organization>'
  outstr += xmlh.output_val('organizationID', org_id)
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name>350.org</name>'
  outstr += xmlh.output_val('missionStatement', mission_statement)
  outstr += xmlh.output_val('description', org_desc)
  outstr += '<location>'
  outstr += xmlh.output_val("city", "")
  outstr += xmlh.output_val("region", "")
  outstr += xmlh.output_val("postalCode", "")
  outstr += '</location>'
  # TODO: make these variables
  outstr += '<organizationURL>http://www.350.org/</organizationURL>'
  outstr += '<donateURL>http://www.350.org/donate</donateURL>'
  outstr += '<logoURL>http://www.350.org/sites/all/themes/threefifty/logo.gif</logoURL>'
  outstr += '<detailURL>http://www.350.org/about</detailURL>'
  outstr += '</Organization></Organizations>'
  outstr += '\n<VolunteerOpportunities>\n'
  nodes = feed.getElementsByTagName('node')
  for i, node in enumerate(nodes):
    if maxrec > 0 and i > maxrec:
      break
    title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
    desc = '<![CDATA[' + xmlh.get_tag_val(node, "Body") + ']]>'
    url = xmlh.get_tag_val(node, "Link")
    lat = xmlh.get_tag_val(node, "Latitude")
    lng = xmlh.get_tag_val(node, "Longitude")
    start_datetime = xmlh.get_tag_val(node, "Start_Date")
    start_time = None
    if not start_datetime:
      # default to the Global Work Party date
      start_date = "2010-10-10"
    else:
      # "(All day)" events get a midnight start time
      start_datetime = start_datetime.replace(" (All day)", "T00:00:00")
      dt = start_datetime.split("T")
      start_date = dt[0][0:10]
      if len(dt) > 1:
        start_time = dt[1]
    end_datetime = xmlh.get_tag_val(node, "End_Date")
    end_time = None
    if not end_datetime:
      open_ended = True
    else:
      open_ended = False
      # "(All day)" events get an 11pm end time
      end_datetime = end_datetime.replace(" (All day)", "T23:00:00")
      dt = end_datetime.split("T")
      end_date = dt[0][0:10]
      if len(dt) > 1:
        end_time = dt[1]
    # (removed a redundant second read of End_Date here: it clobbered the
    # normalized end_datetime and the value was never used afterward)
    locstr = "%s, %s %s" % (xmlh.get_tag_val(node, "City"),
                            xmlh.get_tag_val(node, "Province"),
                            xmlh.get_tag_val(node, "Country"))
    outstr += '<VolunteerOpportunity>'
    outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (
      str(i))
    outstr += '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (
      org_id)
    outstr += '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>' % (
      org_id)
    outstr += '<title>%s</title>' % (title)
    outstr += '<detailURL>%s</detailURL>' % (url)
    outstr += '<description>%s</description>' % (desc)
    outstr += '<abstract>%s</abstract>' % (desc)
    outstr += '<lastUpdated>%s</lastUpdated>' % (last_updated)
    outstr += '<locations><location>'
    outstr += '<location_string>%s</location_string>' % (locstr)
    outstr += '<latitude>%s</latitude>' % (lat)
    outstr += '<longitude>%s</longitude>' % (lng)
    outstr += '</location></locations>'
    outstr += '<dateTimeDurations><dateTimeDuration>'
    outstr += '<startDate>%s</startDate>' % (start_date)
    if start_time:
      outstr += '<startTime>%s</startTime>' % (start_time)
    if open_ended:
      outstr += '<openEnded>Yes</openEnded>'
    else:
      outstr += '<openEnded>No</openEnded>'
      # end_date/end_time are only bound on this branch
      outstr += '<endDate>%s</endDate>' % (end_date)
      if end_time:
        outstr += '<endTime>%s</endTime>' % (end_time)
    outstr += '</dateTimeDuration></dateTimeDurations>'
    outstr += '</VolunteerOpportunity>\n'
    numopps += 1
  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'
  return outstr, numorgs, numopps
def record_to_fpxml(record):
  """convert a spreadsheet record to an FPXML fragment."""
  # NOTE(review): recordval/raw_recordval/get_dtval/get_tmval/parser_error
  # and KNOWN_ORGS are project helpers defined elsewhere in this file.
  fpxml = ""
  fpxml += '<VolunteerOpportunity>'
  fpxml += xmlh.output_val("volunteerOpportunityID",
                           recordval(record, 'oppid'))
  # assign org IDs in first-seen order, deduped by name
  orgname = recordval(record, 'SponsoringOrganization')
  if orgname not in KNOWN_ORGS:
    KNOWN_ORGS[orgname] = len(KNOWN_ORGS)
  fpxml += xmlh.output_val("sponsoringOrganizationID", KNOWN_ORGS[orgname])
  title = recordval(record, 'OpportunityTitle')
  if title == "":
    parser_error("missing OpportunityTitle-- this field is required.")
  fpxml += xmlh.output_val("title", title, cdata=True)
  fpxml += '<dateTimeDurations>'
  fpxml += '<dateTimeDuration>'
  # "ongoing" anywhere in StartDate means open-ended, no explicit dates
  if ('StartDate' in record and
      recordval(record, 'StartDate').lower().find("ongoing") >= 0):
    fpxml += xmlh.output_val('openEnded', 'Yes')
  else:
    fpxml += xmlh.output_val('openEnded', 'No')
    # NOTE(review): indentation reconstructed -- date/time fields assumed
    # to be emitted only for the non-ongoing case; confirm against callers
    startdtval = get_dtval(record, 'StartDate')
    if startdtval != "":
      fpxml += xmlh.output_val('startDate', startdtval)
    starttmval = get_tmval(record, 'StartTime')
    if starttmval != "":
      fpxml += xmlh.output_val('startTime', starttmval)
    enddtval = get_dtval(record, 'EndDate')
    if enddtval != "":
      fpxml += xmlh.output_val('endDate', enddtval)
    endtmval = get_tmval(record, 'EndTime')
    if endtmval != "":
      fpxml += xmlh.output_val('endTime', endtmval)
  # map free-text frequency to an iCal RRULE-style recurrence
  freq = recordval(record, 'Frequency').lower()
  if freq == "" or freq.find("once") >= 0:
    fpxml += '<iCalRecurrence/>'
  elif freq.find("daily") >= 0:
    fpxml += '<iCalRecurrence>FREQ=DAILY</iCalRecurrence>'
  elif freq.find("weekly") >= 0:
    fpxml += '<iCalRecurrence>FREQ=WEEKLY</iCalRecurrence>'
  elif freq.find("other") >= 0 and freq.find("week") >= 0:
    # "every other week"
    fpxml += '<iCalRecurrence>FREQ=WEEKLY;INTERVAL=2</iCalRecurrence>'
  elif freq.find("monthly") >= 0:
    fpxml += '<iCalRecurrence>FREQ=MONTHLY</iCalRecurrence>'
  else:
    fpxml += '<iCalRecurrence/>'
    # just disregard the bad value instead of discarding the opp
    #parser_error("unsupported frequency: '"+
    #             recordval(record, 'Frequency')+"'-- skipping")
  fpxml += xmlh.output_val('commitmentHoursPerWeek',
                           recordval(record, 'CommitmentHours'))
  fpxml += '</dateTimeDuration>'
  fpxml += '</dateTimeDurations>'
  fpxml += '<locations>'
  fpxml += '<location>'
  if recordval(record, 'LocationName').find("virtual") >= 0:
    fpxml += xmlh.output_val('virtual', 'Yes')
  else:
    fpxml += xmlh.output_val('virtual', 'No')
  fpxml += xmlh.output_val('name', recordval(record, 'LocationName'),
                           cdata=True)
  fpxml += xmlh.output_val('streetAddress1',
                           recordval(record, 'LocationStreet'))
  fpxml += xmlh.output_val('city', recordval(record, 'LocationCity'),
                           cdata=True)
  fpxml += xmlh.output_val('region', recordval(record, 'LocationProvince'),
                           cdata=True)
  fpxml += xmlh.output_val('postalCode',
                           recordval(record, 'LocationPostalCode'),
                           cdata=True)
  fpxml += xmlh.output_val('country', recordval(record, 'LocationCountry'),
                           cdata=True)
  fpxml += '</location>'
  fpxml += '</locations>'
  fpxml += xmlh.output_val('paid', recordval(record, 'Paid'))
  fpxml += xmlh.output_val('self_directed',
                           recordval(record, 'self_directed'))
  v = recordval(record, 'MinimumAge')
  if v:
    # non-numeric ages are silently dropped (emitted as empty string)
    try:
      v = int(v)
    except:
      v = ''
    fpxml += xmlh.output_val('minimumAge', str(v))
  # TODO: seniors only, kidfriendly
  fpxml += xmlh.output_val('sexRestrictedTo',
                           recordval(record, 'SexRestrictedTo'))
  fpxml += xmlh.output_val('skills', recordval(record, 'Skills'))
  fpxml += xmlh.output_val('contactName', recordval(record, 'ContactName'),
                           cdata=True)
  fpxml += xmlh.output_val('contactPhone', recordval(record, 'ContactPhone'),
                           cdata=True)
  fpxml += xmlh.output_val('contactEmail', recordval(record, 'ContactEmail'),
                           cdata=True)
  # normalize URLs missing a scheme
  url = recordval(record, 'URL')
  if not url.lower().startswith('http'):
    url = 'http://' + url
  fpxml += xmlh.output_val('detailURL', url, cdata=True)
  # note: preserve whitespace in description
  fpxml += xmlh.output_val('description',
                           raw_recordval(record, 'Description'), cdata=True)
  fpxml += '<lastUpdated olsonTZ="Etc/UTC">'
  fpxml += recordval(record, 'LastUpdated') + '</lastUpdated>'
  fpxml += '</VolunteerOpportunity>'
  return fpxml
def parse(instr, maxrecs, progress):
  """return FPXML given craigslist data

  Reads the crawler cache from instr via crawl_craigslist, filters
  listings (must opt in to redistribution, have a title, and a body of
  at least 25 chars), and emits FPXML with a single placeholder org.

  Args:
    instr: crawler cache file contents.
    maxrecs: stop after this many pages (0 or negative = unlimited).
    progress: if truthy, log skip statistics every 250 pages.
  Returns:
    (outstr, numorgs, numopps) tuple.
  """
  # lazily load the subdomain -> "lat,long" table once per process
  if CL_LATLONGS is None:
    load_craigslist_latlongs()
  xmlh.print_progress("loading craigslist crawler output...")
  crawl_craigslist.parse_cache_file(instr, listings_only=True)
  xmlh.print_progress("loaded "+str(len(crawl_craigslist.pages))+
                      " craigslist pages.")
  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', "105")
  outstr += xmlh.output_val('providerName', "craigslist")
  outstr += xmlh.output_val('feedID', "craigslist")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.craigslist.org/")
  outstr += '</FeedInfo>'
  numorgs = numopps = 0
  # no "organization" in craigslist postings -- emit one empty org 0
  outstr += '<Organizations>'
  outstr += '<Organization>'
  outstr += '<organizationID>0</organizationID>'
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name></name>'
  outstr += '<missionStatement></missionStatement>'
  outstr += '<description></description>'
  outstr += '<location>'
  outstr += xmlh.output_val("city", "")
  outstr += xmlh.output_val("region", "")
  outstr += xmlh.output_val("postalCode", "")
  outstr += '</location>'
  outstr += '<organizationURL></organizationURL>'
  outstr += '<donateURL></donateURL>'
  outstr += '<logoURL></logoURL>'
  outstr += '<detailURL></detailURL>'
  outstr += '</Organization>'
  numorgs += 1
  outstr += '</Organizations>'
  # tally of listings dropped, by reason, for the progress report below
  skipped_listings = {}
  skipped_listings["body"] = skipped_listings["title"] = \
      skipped_listings["not-ok"] = 0
  outstr += '<VolunteerOpportunities>'
  for i, url in enumerate(crawl_craigslist.pages):
    page = crawl_craigslist.pages[url]
    # only keep listings whose author opted in to redistribution
    ok = extract(page, "it's OK to distribute this "+
                 "charitable volunteerism opportunity")
    if ok == "":
      skipped_listings["not-ok"] += 1
      continue
    title = extract(page, "<title>(.+?)</title>")
    if title == "":
      skipped_listings["title"] += 1
      continue
    body = extract(page, '<div id="userbody">(.+?)<')
    # drop listings with essentially no body text
    if len(body) < 25:
      skipped_listings["body"] += 1
      continue
    item_id = extract(url, "/vol/(.+?)[.]html$")
    locstr = extract(page, "Location: (.+?)<")
    datestr = extract(page, "Date: (.+?)<")
    ts = dateutil.parser.parse(datestr)
    datetimestr = ts.strftime("%Y-%m-%dT%H:%M:%S")
    datestr = ts.strftime("%Y-%m-%d")
    if (maxrecs>0 and i>maxrecs):
      break
    xmlh.print_rps_progress("opps", progress, i, maxrecs)
    if progress and i > 0 and i % 250 == 0:
      msg = "skipped " + str(skipped_listings["title"]+
                             skipped_listings["body"])
      msg += " listings ("+str(skipped_listings["title"]) + \
             " for no-title and "
      msg += str(skipped_listings["body"]) + " for short body and "
      msg += str(skipped_listings["not-ok"]) + " for no-redistrib)"
      xmlh.print_progress(msg)
    # craigslist is full of weird escapes-- strip them
    body = re.sub(r'&[a-z]+;', '', body)
    title = re.sub(r'&[a-z]+;', '', title)
    locstr = re.sub(r'&[a-z]+;', '', locstr)
    outstr += '<VolunteerOpportunity>'
    outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (item_id)
    outstr += '<sponsoringOrganizationIDs><sponsoringOrganizationID>0</sponsoringOrganizationID></sponsoringOrganizationIDs>'
    outstr += '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>0</volunteerHubOrganizationID></volunteerHubOrganizationIDs>'
    outstr += '<title>%s</title>' % (title)
    outstr += '<detailURL>%s</detailURL>' % (url)
    # avoid CDATA in body...
    esc_body = xml.sax.saxutils.escape(body)
    esc_body100 = xml.sax.saxutils.escape(body[0:100])
    outstr += '<description>%s</description>' % (esc_body)
    outstr += '<abstract>%s</abstract>' % (esc_body100 + "...")
    outstr += '<lastUpdated>%s</lastUpdated>' % (datetimestr)
    # TODO: expires
    # TODO: synthesize location from metro...
    outstr += '<locations><location>'
    outstr += '<name>%s</name>' % (xml.sax.saxutils.escape(locstr))
    # what about the few that do geocode?
    lat, lng = "", ""
    try:
      # CL_LATLONGS is keyed by the URL prefix ahead of "vol/"
      domain, unused = url.split("vol/")
      lat, lng = CL_LATLONGS[domain].split(",")
    except (KeyError, ValueError):
      # narrowed from a bare except: unknown domain (KeyError) or a URL /
      # table entry that doesn't split into two parts (ValueError) --
      # leave lat/lng empty for now
      pass
    outstr += '<latitude>%s</latitude>' % (lat)
    outstr += '<longitude>%s</longitude>' % (lng)
    outstr += '</location></locations>'
    outstr += '<dateTimeDurations><dateTimeDuration>'
    outstr += '<openEnded>No</openEnded>'
    outstr += '<startDate>%s</startDate>' % (datestr)
    # TODO: endDate = startDate + N=14 days?
    # TODO: timezone???
    outstr += '</dateTimeDuration></dateTimeDurations>'
    # TODO: categories???
    outstr += '</VolunteerOpportunity>'
    numopps += 1
  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'
  #outstr = re.sub(r'><([^/])', r'>\n<\1', outstr)
  return outstr, numorgs, numopps
# NOTE(review): orphan fragment -- this looks like the middle of an
# idealist.org parse() function whose header is not visible in this chunk;
# org_id and mission_statement are presumably defined in the surrounding
# (missing) code.  TODO: confirm against the original parse_idealist file.
org_desc = "Volunteer Opportunities that were posted to idealist.org in English"
today = datetime.now()
last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
start_date = last_updated
numorgs = 1
numopps = 0
xmlh.print_progress("loading idealist.xml custom XML...")
# convert to footprint format
outstr = '<?xml version="1.0" ?>'
outstr += '<FootprintFeed schemaVersion="0.1">'
outstr += '<FeedInfo>'
outstr += xmlh.output_val('providerID', org_id)
outstr += xmlh.output_val('providerName', "idealist")
outstr += xmlh.output_val('feedID', "idealist")
outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
outstr += xmlh.output_val('providerURL', "http://www.idealist.org/")
outstr += '</FeedInfo>'
# 1 "organization" in idealist.org postings
outstr += '<Organizations><Organization>'
outstr += xmlh.output_val('organizationID', org_id)
outstr += '<nationalEIN></nationalEIN>'
outstr += '<name>idealist.org</name>'
outstr += xmlh.output_val('missionStatement', mission_statement)
outstr += xmlh.output_val('description', org_desc)
outstr += '<location>'
outstr += xmlh.output_val("city", "New York")
outstr += xmlh.output_val("region", "NY")
def parse(instr, maxrec, progress):
  """return FPXML given DIY (do-it-yourself) TSV feed data.

  Converts a tab-separated feed (first line = column headers) into FPXML
  with one hardcoded allforgood.org organization (ID 140).  Every
  opportunity is emitted as virtual, self-directed and open-ended.

  Args:
    instr: TSV file contents.
    maxrec: stop after this many rows (0 or negative = unlimited).
    progress: unused here; kept for parser-interface compatibility.
  Returns:
    (outstr, numorgs, numopps) tuple.
  """
  # (removed an unused function-scope "from xml.dom import minidom")
  org_id = "140"
  mission_statement = "Do it yourself volunteer opportunities."
  org_desc = "Do it yourself volunteer opportunities"
  today = datetime.now()
  last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading diy custom TSV...")
  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', org_id)
  outstr += xmlh.output_val('providerName', "diy")
  outstr += xmlh.output_val('feedID', "diy")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.allforgood.org/")
  outstr += '</FeedInfo>'
  outstr += '<Organizations><Organization>'
  outstr += xmlh.output_val('organizationID', org_id)
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name>allforgood.org</name>'
  outstr += xmlh.output_val('missionStatement', mission_statement)
  outstr += xmlh.output_val('description', org_desc)
  outstr += '<location>'
  outstr += xmlh.output_val("city", "San Francisco")
  outstr += xmlh.output_val("region", "CA")
  outstr += xmlh.output_val("postalCode", "94105")
  outstr += '</location>'
  outstr += '<organizationURL>http://www.allforgood.org/</organizationURL>'
  outstr += '<donateURL>http://www.allforgood.org/</donateURL>'
  outstr += '<logoURL>http://www.allforgood.org/</logoURL>'
  outstr += '<detailURL>http://www.allforgood.org/</detailURL>'
  outstr += '</Organization></Organizations>'
  outstr += '<VolunteerOpportunities>'
  lines = instr.split("\n")
  # first line is the tab-separated column-name header
  header = lines.pop(0).strip().split("\t")
  for i, line in enumerate(lines):
    row = line.strip().split("\t")
    if maxrec > 0 and i > maxrec:
      break
    title = '<![CDATA[' + get_field("title", row, header) + ']]>'
    url = get_field("url", row, header)
    # skip rows missing either required field
    if not title or not url:
      continue
    sponsor = get_field("sponsoringOrganization", row, header)
    desc = ('<![CDATA[' + sponsor + ': ' +
            get_field("description", row, header) +
            ' Areas of interest: ' +
            get_field("subjectArea", row, header) +
            ' Tags: ' + get_field("keywords", row, header) + ']]>')
    start_date = last_updated
    outstr += '<VolunteerOpportunity>'
    outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (
      str(i))
    outstr += '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (
      org_id)
    outstr += '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>' % (
      org_id)
    outstr += '<self_directed>Yes</self_directed>'
    outstr += '<title>%s</title>' % (title)
    outstr += '<detailURL><![CDATA[%s]]></detailURL>' % (url)
    outstr += '<description>%s</description>' % (desc)
    outstr += '<abstract>%s</abstract>' % (desc)
    outstr += '<lastUpdated>%s</lastUpdated>' % (last_updated)
    outstr += '<dateTimeDurations><dateTimeDuration>'
    outstr += '<startDate>%s</startDate>' % (start_date)
    outstr += '<openEnded>Yes</openEnded>'
    outstr += '</dateTimeDuration></dateTimeDurations>'
    outstr += '<locations><location><virtual>Yes</virtual></location></locations>'
    outstr += '</VolunteerOpportunity>'
    numopps += 1
  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'
  return outstr, numorgs, numopps
def parse(instr, maxrec, progress):
  """return FPXML given sparked feed data"""
  # NOTE(review): docstring says "sparked" but this parses the DIY TSV
  # feed (providerName "diy") -- likely copy/paste; confirm
  from xml.dom import minidom  # NOTE(review): unused in this function
  org_id = "140"
  mission_statement = "Do it yourself volunteer opportunities."
  org_desc = "Do it yourself volunteer opportunities"
  today = datetime.now()
  last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading diy custom TSV...")
  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += "<FeedInfo>"
  outstr += xmlh.output_val("providerID", org_id)
  outstr += xmlh.output_val("providerName", "diy")
  outstr += xmlh.output_val("feedID", "diy")
  outstr += xmlh.output_val("createdDateTime", xmlh.current_ts())
  outstr += xmlh.output_val("providerURL", "http://www.allforgood.org/")
  outstr += "</FeedInfo>"
  # single hardcoded allforgood.org organization
  outstr += "<Organizations><Organization>"
  outstr += xmlh.output_val("organizationID", org_id)
  outstr += "<nationalEIN></nationalEIN>"
  outstr += "<name>allforgood.org</name>"
  outstr += xmlh.output_val("missionStatement", mission_statement)
  outstr += xmlh.output_val("description", org_desc)
  outstr += "<location>"
  outstr += xmlh.output_val("city", "San Francisco")
  outstr += xmlh.output_val("region", "CA")
  outstr += xmlh.output_val("postalCode", "94105")
  outstr += "</location>"
  outstr += "<organizationURL>http://www.allforgood.org/</organizationURL>"
  outstr += "<donateURL>http://www.allforgood.org/</donateURL>"
  outstr += "<logoURL>http://www.allforgood.org/</logoURL>"
  outstr += "<detailURL>http://www.allforgood.org/</detailURL>"
  outstr += "</Organization></Organizations>"
  outstr += "<VolunteerOpportunities>"
  lines = instr.split("\n")
  # first line is the tab-separated column-name header
  header = lines.pop(0).strip().split("\t")
  for i, line in enumerate(lines):
    row = line.strip().split("\t")
    if maxrec > 0 and i > maxrec:
      break
    title = "<![CDATA[" + get_field("title", row, header) + "]]>"
    url = get_field("url", row, header)
    # skip rows missing either required field
    if not title or not url:
      continue
    sponsor = get_field("sponsoringOrganization", row, header)
    desc = (
        "<![CDATA[" + sponsor + ": " +
        get_field("description", row, header) +
        " Areas of interest: " +
        get_field("subjectArea", row, header) +
        " Tags: " + get_field("keywords", row, header) + "]]>"
    )
    start_date = last_updated
    outstr += "<VolunteerOpportunity>"
    outstr += "<volunteerOpportunityID>%s</volunteerOpportunityID>" % (str(i))
    outstr += (
        "<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>"
        % (org_id)
    )
    outstr += (
        "<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>"
        % (org_id)
    )
    outstr += "<self_directed>Yes</self_directed>"
    outstr += "<title>%s</title>" % (title)
    outstr += "<detailURL><![CDATA[%s]]></detailURL>" % (url)
    outstr += "<description>%s</description>" % (desc)
    outstr += "<abstract>%s</abstract>" % (desc)
    outstr += "<lastUpdated>%s</lastUpdated>" % (last_updated)
    outstr += "<dateTimeDurations><dateTimeDuration>"
    outstr += "<startDate>%s</startDate>" % (start_date)
    outstr += "<openEnded>Yes</openEnded>"
    outstr += "</dateTimeDuration></dateTimeDurations>"
    outstr += "<locations><location><virtual>Yes</virtual></location></locations>"
    outstr += "</VolunteerOpportunity>"
    numopps += 1
  outstr += "</VolunteerOpportunities>"
  outstr += "</FootprintFeed>"
  return outstr, numorgs, numopps
def parse(instr, maxrec, progress):
  """Return FPXML given sparked feed data.

  Args:
    instr: raw sparked.com XML feed (a list of <challenge> elements).
    maxrec: stop after this many records (0 or less = unlimited).
    progress: accepted for parser-interface compatibility; unused here.
  Returns:
    (fpxml string, number of orgs, number of opportunities)
  """
  feed = xmlh.parse_or_die(instr.encode('utf-8'))
  org_id = str(139)
  mission_statement = "Sparked makes it easy for people with busy lives to help nonprofits get valuable work done when it's convenient. We call it microvolunteering. Through the convenience of the Internet, and with the collaboration of others, micro-volunteers use their professional skills to help causes they care about."
  org_desc = "Sparked is the world's first Microvolunteering network"
  today = datetime.now()
  last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
  start_date = last_updated
  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading sparked.com custom XML...")

  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', org_id)
  outstr += xmlh.output_val('providerName', "sparked")
  outstr += xmlh.output_val('feedID', "sparked")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.sparked.com/")
  outstr += '</FeedInfo>'
  # 1 "organization" in sparked.com postings
  outstr += '<Organizations><Organization>'
  outstr += xmlh.output_val('organizationID', org_id)
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name>sparked.com</name>'
  outstr += xmlh.output_val('missionStatement', mission_statement)
  outstr += xmlh.output_val('description', org_desc)
  outstr += '<location>'
  outstr += xmlh.output_val("city", "San Francisco")
  outstr += xmlh.output_val("region", "CA")
  outstr += xmlh.output_val("postalCode", "94105")
  outstr += '</location>'
  outstr += '<organizationURL>http://www.sparked.com/</organizationURL>'
  outstr += '<donateURL>http://www.sparked.com/</donateURL>'
  outstr += '<logoURL>http://www.sparked.com/imgver4/logo_sparked.gif</logoURL>'
  outstr += '<detailURL>http://www.sparked.com/</detailURL>'
  outstr += '</Organization></Organizations>'
  outstr += '\n<VolunteerOpportunities>\n'

  nodes = feed.getElementsByTagName('challenge')
  for i, node in enumerate(nodes):
    if maxrec > 0 and i > maxrec:
      break
    title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
    desc = '<![CDATA[' + xmlh.get_tag_val(node, "description") + ']]>'
    url = xmlh.get_tag_val(node, "url")
    start_date = last_updated
    open_ended = True
    # deadline is MM/DD/YY, e.g. "02/15/11"
    #01234567
    #02/15/11
    mdy = xmlh.get_tag_val(node, "deadline")
    if mdy:
      try:
        end_date = str(2000 + int(mdy[6:])) + "-" + mdy[0:2] + "-" + mdy[3:5]
        open_ended = False
      except (ValueError, IndexError):
        # BUGFIX: was a bare 'except:' (also caught KeyboardInterrupt /
        # SystemExit).  An unparseable deadline still means open-ended.
        pass

    outstr += '<VolunteerOpportunity>'
    outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (str(i))
    outstr += '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (org_id)
    outstr += '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>' % (org_id)
    outstr += '<micro>Yes</micro>'
    outstr += '<title>%s</title>' % (title)
    outstr += '<detailURL>%s</detailURL>' % (url)
    outstr += '<description>%s</description>' % (desc)
    outstr += '<abstract>%s</abstract>' % (desc)
    outstr += '<lastUpdated>%s</lastUpdated>' % (last_updated)
    outstr += '<dateTimeDurations><dateTimeDuration>'
    outstr += '<startDate>%s</startDate>' % (start_date)
    if open_ended:
      outstr += '<openEnded>Yes</openEnded>'
    else:
      outstr += '<openEnded>No</openEnded>'
      outstr += '<endDate>%s</endDate>' % (end_date)
    outstr += '</dateTimeDuration></dateTimeDurations>'
    outstr += '<locations><location><virtual>Yes</virtual></location></locations>'
    outstr += '</VolunteerOpportunity>\n'
    numopps += 1

  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'
  return outstr, numorgs, numopps
def parse(instr, maxrecs, progress):
  """Parse <VolunteerOpportunity> records out of a servenet-style feed.

  Returns (fpxml string, number of orgs, number of opportunities).
  """
  numorgs = numopps = 0
  # rewrite namespaced tags like <db:foo> / </db:foo> to <db_foo> so the
  # simple parser below doesn't choke on the colon
  instr = re.sub(r'<(/?db):', r'<\1_', instr)
  opps = re.findall(r'<VolunteerOpportunity>.+?</VolunteerOpportunity>', instr, re.DOTALL)
  volopps = ""
  for i, oppstr in enumerate(opps):
    #if progress and i > 0 and i % 250 == 0:
    #  print str(datetime.now())+": ", i, " opportunities processed."
    # note: 'i > maxrecs' means maxrecs+1 records are actually processed
    if (maxrecs > 0 and i > maxrecs):
      break
    xmlh.print_rps_progress("opps", progress, i, maxrecs)
    item = xmlh.simple_parser(oppstr, known_elnames, progress=False)
    orgid = register_org(item)
    # logoURL -- sigh, this is for the opportunity not the org
    volopps += '<VolunteerOpportunity>'
    volopps += xmlh.output_val('volunteerOpportunityID', str(i))
    volopps += xmlh.output_val('sponsoringOrganizationID', str(orgid))
    volopps += xmlh.output_node('volunteerHubOrganizationID', item, "LocalID")
    volopps += xmlh.output_node('title', item, "Title")
    volopps += xmlh.output_node('abstract', item, "Description")
    volopps += xmlh.output_node('description', item, "Description")
    volopps += xmlh.output_node('detailURL', item, "DetailURL")
    # -8888 is presumably a "volunteer count unknown" sentinel -- verify
    # against the downstream consumer before relying on it
    volopps += xmlh.output_val('volunteersNeeded', "-8888")
    try:
      oppdates = item.getElementsByTagName("OpportunityDate")
    except:
      oppdates = []
    if len(oppdates) > 1:
      # more than one date: warn and keep only the first
      print datetime.now(), \
          "parse_servenet.py: only 1 OpportunityDate supported."
      #return None
      oppdate = oppdates[0]
    elif len(oppdates) == 0:
      oppdate = None
    else:
      oppdate = oppdates[0]
    volopps += '<dateTimeDurations><dateTimeDuration>'
    if oppdate:
      volopps += xmlh.output_val('openEnded', 'No')
      # duration in ISO-8601-like form, e.g. "P3W"
      volopps += xmlh.output_val('duration', 'P%s%s' %
                                 (xmlh.get_tag_val(oppdate, "DurationQuantity"),
                                  xmlh.get_tag_val(oppdate, "DurationUnit")))
      volopps += xmlh.output_val('commitmentHoursPerWeek', '0')
      volopps += xmlh.output_node('startDate', oppdate, "StartDate")
      volopps += xmlh.output_node('endDate', oppdate, "EndDate")
    else:
      # no date at all: treat as open-ended
      volopps += xmlh.output_val('openEnded', 'Yes')
      volopps += xmlh.output_val('commitmentHoursPerWeek', '0')
    volopps += '</dateTimeDuration></dateTimeDurations>'
    volopps += '<locations>'
    try:
      opplocs = item.getElementsByTagName("Location")
    except:
      opplocs = []
    for opploc in opplocs:
      volopps += '<location>'
      virtual_tag = opploc.getElementsByTagName("Virtual")
      if virtual_tag and xmlh.get_tag_val(opploc, "Virtual").lower() == "yes":
        volopps += xmlh.output_val('virtual', 'Yes')
      else:
        volopps += xmlh.output_node('region', opploc, "StateOrProvince")
        volopps += xmlh.output_node('country', opploc, "Country")
        volopps += xmlh.output_node('postalCode', opploc, "ZipOrPostalCode")
      volopps += '</location>'
    volopps += '</locations>'
    volopps += '<categoryTags/>'
    volopps += '</VolunteerOpportunity>'
    numopps += 1

  # convert to footprint format
  # NOTE(review): providerID/providerName/feedID/providerURL/feedDescription
  # are not defined inside this function -- presumably module-level globals,
  # or this <FeedInfo> block is overwritten by the parser() closure defined
  # earlier in this file (which regex-replaces <FeedInfo>...</FeedInfo>);
  # verify before refactoring.
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', providerID)
  outstr += xmlh.output_val('providerName', providerName)
  outstr += xmlh.output_val('feedID', feedID)
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', providerURL)
  outstr += xmlh.output_val('description', feedDescription)
  # TODO: capture ts -- use now?!
  outstr += '</FeedInfo>'
  # hardcoded: Organization
  outstr += '<Organizations>'
  for key in ORGS:
    outstr += ORGS[key]
    numorgs += 1
  outstr += '</Organizations>'
  outstr += '<VolunteerOpportunities>'
  outstr += volopps
  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'
  #outstr = re.sub(r'><([^/])', r'>\n<\1', outstr)
  return outstr, numorgs, numopps
def parse(instr, maxrec, progress):
  """Convert sparked.com challenge XML into FPXML.

  Same contract as the other feed parsers in this file: returns the
  FPXML string plus the organization and opportunity counts.
  """
  doc = xmlh.parse_or_die(instr.encode('utf-8'))
  org_id = str(139)
  mission_statement = "Sparked makes it easy for people with busy lives to help nonprofits get valuable work done when it's convenient. We call it microvolunteering. Through the convenience of the Internet, and with the collaboration of others, micro-volunteers use their professional skills to help causes they care about."
  org_desc = "Sparked is the world's first Microvolunteering network"
  last_updated = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
  start_date = last_updated
  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading sparked.com custom XML...")

  # accumulate output pieces and join once at the end
  parts = []
  parts.append('<?xml version="1.0" ?>')
  parts.append('<FootprintFeed schemaVersion="0.1">')
  parts.append('<FeedInfo>')
  parts.append(xmlh.output_val('providerID', org_id))
  parts.append(xmlh.output_val('providerName', "sparked"))
  parts.append(xmlh.output_val('feedID', "sparked"))
  parts.append(xmlh.output_val('createdDateTime', xmlh.current_ts()))
  parts.append(xmlh.output_val('providerURL', "http://www.sparked.com/"))
  parts.append('</FeedInfo>')
  # sparked.com postings all hang off a single hardcoded organization
  parts.append('<Organizations><Organization>')
  parts.append(xmlh.output_val('organizationID', org_id))
  parts.append('<nationalEIN></nationalEIN>')
  parts.append('<name>sparked.com</name>')
  parts.append(xmlh.output_val('missionStatement', mission_statement))
  parts.append(xmlh.output_val('description', org_desc))
  parts.append('<location>')
  parts.append(xmlh.output_val("city", "San Francisco"))
  parts.append(xmlh.output_val("region", "CA"))
  parts.append(xmlh.output_val("postalCode", "94105"))
  parts.append('</location>')
  parts.append('<organizationURL>http://www.sparked.com/</organizationURL>')
  parts.append('<donateURL>http://www.sparked.com/</donateURL>')
  parts.append('<logoURL>http://www.sparked.com/imgver4/logo_sparked.gif</logoURL>')
  parts.append('<detailURL>http://www.sparked.com/</detailURL>')
  parts.append('</Organization></Organizations>')
  parts.append('\n<VolunteerOpportunities>\n')

  challenges = doc.getElementsByTagName('challenge')
  for idx, challenge in enumerate(challenges):
    if maxrec > 0 and idx > maxrec:
      break
    title = '<![CDATA[' + xmlh.get_tag_val(challenge, "title") + ']]>'
    desc = '<![CDATA[' + xmlh.get_tag_val(challenge, "description") + ']]>'
    url = xmlh.get_tag_val(challenge, "url")
    start_date = last_updated
    # deadlines look like "02/15/11" (MM/DD/YY); anything unparseable
    # leaves the opportunity open-ended
    open_ended = True
    deadline = xmlh.get_tag_val(challenge, "deadline")
    if deadline:
      try:
        end_date = "%d-%s-%s" % (2000 + int(deadline[6:]),
                                 deadline[0:2], deadline[3:5])
        open_ended = False
      except:
        pass
    parts.append('<VolunteerOpportunity>')
    parts.append('<volunteerOpportunityID>%s</volunteerOpportunityID>' % (str(idx)))
    parts.append('<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (org_id))
    parts.append('<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>' % (org_id))
    parts.append('<micro>Yes</micro>')
    parts.append('<title>%s</title>' % (title))
    parts.append('<detailURL>%s</detailURL>' % (url))
    parts.append('<description>%s</description>' % (desc))
    parts.append('<abstract>%s</abstract>' % (desc))
    parts.append('<lastUpdated>%s</lastUpdated>' % (last_updated))
    parts.append('<dateTimeDurations><dateTimeDuration>')
    parts.append('<startDate>%s</startDate>' % (start_date))
    if open_ended:
      parts.append('<openEnded>Yes</openEnded>')
    else:
      parts.append('<openEnded>No</openEnded>')
      parts.append('<endDate>%s</endDate>' % (end_date))
    parts.append('</dateTimeDuration></dateTimeDurations>')
    parts.append('<locations><location><virtual>Yes</virtual></location></locations>')
    parts.append('</VolunteerOpportunity>\n')
    numopps += 1

  parts.append('</VolunteerOpportunities>')
  parts.append('</FootprintFeed>')
  return ''.join(parts), numorgs, numopps
def parse(instr, maxrec, progress):
  """Return FPXML given 350.org data.

  Args:
    instr: raw 350.org XML feed (a list of <node> elements).
    maxrec: stop after this many records (0 or less = unlimited).
    progress: accepted for parser-interface compatibility; unused here.
  Returns:
    (fpxml string, number of orgs, number of opportunities)
  """
  feed = xmlh.parse_or_die(instr.encode('utf-8'))
  org_id = str(139)
  mission_statement = "350.org is an international campaign that's building a movement to unite the world around solutions to the climate crisis--the solutions that science and justice demand."
  org_desc = "On October 10 we'll be helping host a Global Work Party, with thousands of communities setting up solar panels or digging community gardens or laying out bike paths."
  start_date = '2010-10-01'
  today = datetime.now()
  last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading 350.org custom XML...")

  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', org_id)
  outstr += xmlh.output_val('providerName', "350org")
  outstr += xmlh.output_val('feedID', "350org")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.350.org/")
  outstr += '</FeedInfo>'
  # 1 "organization" in 350.org postings
  outstr += '<Organizations><Organization>'
  outstr += xmlh.output_val('organizationID', org_id)
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name>350.org</name>'
  outstr += xmlh.output_val('missionStatement', mission_statement)
  outstr += xmlh.output_val('description', org_desc)
  outstr += '<location>'
  outstr += xmlh.output_val("city", "")
  outstr += xmlh.output_val("region", "")
  outstr += xmlh.output_val("postalCode", "")
  outstr += '</location>'
  # TODO: make these variables
  outstr += '<organizationURL>http://www.350.org/</organizationURL>'
  outstr += '<donateURL>http://www.350.org/donate</donateURL>'
  outstr += '<logoURL>http://www.350.org/sites/all/themes/threefifty/logo.gif</logoURL>'
  outstr += '<detailURL>http://www.350.org/about</detailURL>'
  outstr += '</Organization></Organizations>'
  outstr += '\n<VolunteerOpportunities>\n'

  nodes = feed.getElementsByTagName('node')
  for i, node in enumerate(nodes):
    if maxrec > 0 and i > maxrec:
      break
    title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
    desc = '<![CDATA[' + xmlh.get_tag_val(node, "Body") + ']]>'
    url = xmlh.get_tag_val(node, "Link")
    lat = xmlh.get_tag_val(node, "Latitude")
    lng = xmlh.get_tag_val(node, "Longitude")

    # start date/time -- " (All day)" becomes midnight; a missing
    # Start_Date falls back to the Global Work Party date
    start_datetime = xmlh.get_tag_val(node, "Start_Date")
    start_time = None
    if not start_datetime:
      start_date = "2010-10-10"
    else:
      start_datetime = start_datetime.replace(" (All day)", "T00:00:00")
      dt = start_datetime.split("T")
      start_date = dt[0][0:10]
      if len(dt) > 1:
        start_time = dt[1]

    # end date/time -- a missing End_Date means open-ended
    end_datetime = xmlh.get_tag_val(node, "End_Date")
    end_time = None
    if not end_datetime:
      open_ended = True
    else:
      open_ended = False
      end_datetime = end_datetime.replace(" (All day)", "T23:00:00")
      dt = end_datetime.split("T")
      end_date = dt[0][0:10]
      if len(dt) > 1:
        end_time = dt[1]
    # BUGFIX: removed a dead second fetch of End_Date here -- its result
    # was assigned to end_datetime but never used afterwards.

    locstr = "%s, %s %s" % (xmlh.get_tag_val(node, "City"),
                            xmlh.get_tag_val(node, "Province"),
                            xmlh.get_tag_val(node, "Country"))

    outstr += '<VolunteerOpportunity>'
    outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (str(i))
    outstr += '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (org_id)
    outstr += '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>' % (org_id)
    outstr += '<title>%s</title>' % (title)
    outstr += '<detailURL>%s</detailURL>' % (url)
    outstr += '<description>%s</description>' % (desc)
    outstr += '<abstract>%s</abstract>' % (desc)
    outstr += '<lastUpdated>%s</lastUpdated>' % (last_updated)
    outstr += '<locations><location>'
    outstr += '<location_string>%s</location_string>' % (locstr)
    outstr += '<latitude>%s</latitude>' % (lat)
    outstr += '<longitude>%s</longitude>' % (lng)
    outstr += '</location></locations>'
    outstr += '<dateTimeDurations><dateTimeDuration>'
    outstr += '<startDate>%s</startDate>' % (start_date)
    if start_time:
      outstr += '<startTime>%s</startTime>' % (start_time)
    if open_ended:
      outstr += '<openEnded>Yes</openEnded>'
    else:
      outstr += '<openEnded>No</openEnded>'
      outstr += '<endDate>%s</endDate>' % (end_date)
    if end_time:
      outstr += '<endTime>%s</endTime>' % (end_time)
    outstr += '</dateTimeDuration></dateTimeDurations>'
    outstr += '</VolunteerOpportunity>\n'
    numopps += 1

  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'
  return outstr, numorgs, numopps
def parse(instr, maxrecs = 0, progress = False): """parser main.""" data = {} updated = {} maxrow, maxcol = parse_gspreadsheet(instr, data, updated, progress) if DEBUG and progress: print str(datetime.now())+": maxrow="+str(maxrow)+" maxcol="+str(maxcol) # find header row: look for "opportunity title" (case insensitive) header_row, header_startcol = find_header_row(data, 'opportunity\s*title') header_colidx = {} header_names = {} header_col = header_startcol while True: header_str = cellval(data, header_row, header_col) if not header_str: break field_name = None header_str = header_str.lower() if header_str.find("title") >= 0: field_name = "OpportunityTitle" elif (header_str.find("organization") >= 0 and header_str.find("sponsor") >= 0): field_name = "SponsoringOrganization" elif header_str.find("description") >= 0: field_name = "Description" elif header_str.find("skills") >= 0: field_name = "Skills" elif header_str.find("location") >= 0 and header_str.find("name") >= 0: field_name = "LocationName" elif header_str.find("street") >= 0: field_name = "LocationStreet" elif header_str.find("city") >= 0: field_name = "LocationCity" elif header_str.find("state") >= 0 or header_str.find("province") >= 0: field_name = "LocationProvince" elif header_str.find("zip") >= 0 or header_str.find("postal") >= 0: field_name = "LocationPostalCode" elif header_str.find("country") >= 0: field_name = "LocationCountry" elif header_str.find("start") >= 0 and header_str.find("date") >= 0: field_name = "StartDate" elif header_str.find("start") >= 0 and header_str.find("time") >= 0: field_name = "StartTime" elif header_str.find("end") >= 0 and header_str.find("date") >= 0: field_name = "EndDate" elif header_str.find("end") >= 0 and header_str.find("time") >= 0: field_name = "EndTime" elif header_str.find("contact") >= 0 and header_str.find("name") >= 0: field_name = "ContactName" elif header_str.find("email") >= 0 or header_str.find("e-mail") >= 0: field_name = "ContactEmail" elif 
header_str.find("phone") >= 0: field_name = "ContactPhone" elif header_str.find("website") >= 0 or header_str.find("url") >= 0: field_name = "URL" elif header_str.find("often") >= 0: field_name = "Frequency" elif header_str.find("days") >= 0 and header_str.find("week") >= 0: field_name = "DaysOfWeek" elif header_str.find("paid") >= 0: field_name = "Paid" elif header_str.find("self_directed") >= 0: field_name = "SelfDirected" elif header_str.find("commitment") >= 0 or header_str.find("hours") >= 0: field_name = "CommitmentHours" elif header_str.find("age") >= 0 and header_str.find("min") >= 0: field_name = "MinimumAge" elif header_str.find("kid") >= 0: field_name = "KidFriendly" elif header_str.find("senior") >= 0 and header_str.find("only") >= 0: field_name = "SeniorsOnly" elif header_str.find("sex") >= 0 or header_str.find("gender") >= 0: field_name = "SexRestrictedTo" elif header_str.find("volunteer appeal") >= 0: field_name = None elif header_str.find("volunteerOptIn") >= 0: field_name = None elif header_str.find("booksOptIn") >= 0: field_name = None else: parser_error("couldn't map header '"+header_str+"' to a field name.") if field_name != None: header_colidx[field_name] = header_col header_names[header_col] = field_name #print header_str, "=>", field_name header_col += 1 if len(header_names) < 10: parser_error("too few fields found: "+str(len(header_names))) # check to see if there's a header-description row header_desc = cellval(data, header_row+1, header_startcol) if not header_desc: parser_error("empty spreadsheet? 
blank row not allowed below header row") return '', 0, 0 #data_startrow = 3 else: header_desc = header_desc.lower() data_startrow = header_row + 1 if header_desc.find("up to") >= 0: data_startrow += 1 # find the data global CURRENT_ROW CURRENT_ROW = row = data_startrow blankrows = 0 volopps = '<VolunteerOpportunities>' numorgs = numopps = 0 while True: blankrow = True #rowstr = "row="+str(row)+"\n" record = {} record['LastUpdated'] = '2000-01-01' for field_name in header_colidx: col = header_colidx[field_name] val = cellval(data, row, col) if val: blankrow = False else: val = "" #rowstr += " "+field_name+"="+val+"\n" record[field_name] = val key = 'R'+str(row)+'C'+str(col) if (key in updated and updated[key] > record['LastUpdated']): record['LastUpdated'] = updated[key] if blankrow: blankrows += 1 if blankrows > MAX_BLANKROWS: break else: numopps += 1 blankrows = 0 record['oppid'] = str(numopps) volopps += record_to_fpxml(record) row += 1 CURRENT_ROW = row CURRENT_ROW = None if DEBUG and progress: print str(datetime.now())+": ", numopps, "opportunities found." volopps += '</VolunteerOpportunities>' outstr = '<?xml version="1.0" ?>' outstr += '<FootprintFeed schemaVersion="0.1">' outstr += '<FeedInfo>' # providerID replaced by caller outstr += '<providerID></providerID>' # providerName replaced by caller outstr += '<providerName></providerName>' outstr += '<feedID>1</feedID>' outstr += '<createdDateTime>%s</createdDateTime>' % xmlh.current_ts() # providerURL replaced by caller outstr += '<providerURL></providerURL>' outstr += '<description></description>' outstr += '</FeedInfo>' outstr += "<Organizations>" for orgname in KNOWN_ORGS: outstr += "<Organization>" outstr += xmlh.output_val("organizationID", KNOWN_ORGS[orgname]) outstr += xmlh.output_val("name", orgname, cdata=True) outstr += "</Organization>" outstr += "</Organizations>" outstr += volopps outstr += '</FootprintFeed>' #outstr = re.sub(r'><', '>\n<', outstr) #print outstr return outstr, numorgs, numopps