def parse(instr, maxrec, progress):
  """return FPXML given sparked feed data"""
  feed = xmlh.parse_or_die(instr.encode('utf-8'))
  org_id = str(139)
  mission_statement = (
    "Sparked makes it easy for people with busy lives to help nonprofits "
    "get valuable work done when it's convenient. We call it "
    "microvolunteering. Through the convenience of the Internet, and with "
    "the collaboration of others, micro-volunteers use their professional "
    "skills to help causes they care about.")
  org_desc = "Sparked is the world's first Microvolunteering network"

  today = datetime.now()
  last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
  start_date = last_updated
  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading sparked.com custom XML...")

  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', org_id)
  outstr += xmlh.output_val('providerName', "sparked")
  outstr += xmlh.output_val('feedID', "sparked")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.sparked.com/")
  outstr += '</FeedInfo>'

  # 1 "organization" in sparked.com postings
  outstr += '<Organizations><Organization>'
  outstr += xmlh.output_val('organizationID', org_id)
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name>sparked.com</name>'
  outstr += xmlh.output_val('missionStatement', mission_statement)
  outstr += xmlh.output_val('description', org_desc)
  outstr += '<location>'
  outstr += xmlh.output_val("city", "San Francisco")
  outstr += xmlh.output_val("region", "CA")
  outstr += xmlh.output_val("postalCode", "94105")
  outstr += '</location>'
  outstr += '<organizationURL>http://www.sparked.com/</organizationURL>'
  outstr += '<donateURL>http://www.sparked.com/</donateURL>'
  outstr += '<logoURL>http://www.sparked.com/imgver4/logo_sparked.gif</logoURL>'
  outstr += '<detailURL>http://www.sparked.com/</detailURL>'
  outstr += '</Organization></Organizations>'

  outstr += '\n<VolunteerOpportunities>\n'
  nodes = feed.getElementsByTagName('challenge')
  for i, node in enumerate(nodes):
    if maxrec > 0 and i > maxrec:
      break
    title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
    desc = '<![CDATA[' + xmlh.get_tag_val(node, "description") + ']]>'
    url = xmlh.get_tag_val(node, "url")

    start_date = last_updated
    open_ended = True
    # deadline format is MM/DD/YY, e.g. 02/15/11
    mdy = xmlh.get_tag_val(node, "deadline")
    if mdy:
      try:
        end_date = "%d-%s-%s" % (2000 + int(mdy[6:]), mdy[0:2], mdy[3:5])
        open_ended = False
      except (ValueError, IndexError):
        # unparseable deadline -- leave the opportunity open-ended
        pass

    outstr += '<VolunteerOpportunity>'
    outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (str(i))
    outstr += ('<sponsoringOrganizationIDs><sponsoringOrganizationID>%s'
               '</sponsoringOrganizationID></sponsoringOrganizationIDs>'
               % (org_id))
    outstr += ('<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s'
               '</volunteerHubOrganizationID></volunteerHubOrganizationIDs>'
               % (org_id))
    outstr += '<micro>Yes</micro>'
    outstr += '<title>%s</title>' % (title)
    outstr += '<detailURL>%s</detailURL>' % (url)
    outstr += '<description>%s</description>' % (desc)
    outstr += '<abstract>%s</abstract>' % (desc)
    outstr += '<lastUpdated>%s</lastUpdated>' % (last_updated)
    outstr += '<dateTimeDurations><dateTimeDuration>'
    outstr += '<startDate>%s</startDate>' % (start_date)
    if open_ended:
      outstr += '<openEnded>Yes</openEnded>'
    else:
      outstr += '<openEnded>No</openEnded>'
      outstr += '<endDate>%s</endDate>' % (end_date)
    outstr += '</dateTimeDuration></dateTimeDurations>'
    outstr += '<locations><location><virtual>Yes</virtual></location></locations>'
    outstr += '</VolunteerOpportunity>\n'
    numopps += 1

  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'
  return outstr, numorgs, numopps
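# A minimal, illustrative driver for the sparked parser above. This is not
# part of the original pipeline (the real runner wires parse() up itself),
# and the "sparked-feed.xml" filename is hypothetical. maxrec=0 means no
# record limit, per the maxrec > 0 check in the loop.
if __name__ == "__main__":
  feed_fh = open("sparked-feed.xml", "r")
  fpxml, numorgs, numopps = parse(feed_fh.read(), maxrec=0, progress=False)
  feed_fh.close()
  print "parsed %d orgs, %d opps" % (numorgs, numopps)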
def parse(instr, maxrec, progress):
  """return FPXML given 350.org data"""
  feed = xmlh.parse_or_die(instr.encode('utf-8'))
  org_id = str(139)
  mission_statement = (
    "350.org is an international campaign that's building a movement to "
    "unite the world around solutions to the climate crisis--the solutions "
    "that science and justice demand.")
  org_desc = (
    "On October 10 we'll be helping host a Global Work Party, with thousands "
    "of communities setting up solar panels or digging community gardens or "
    "laying out bike paths.")
  start_date = '2010-10-01'

  today = datetime.now()
  last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading 350.org custom XML...")

  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', org_id)
  outstr += xmlh.output_val('providerName', "350org")
  outstr += xmlh.output_val('feedID', "350org")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.350.org/")
  outstr += '</FeedInfo>'

  # 1 "organization" in 350.org postings
  outstr += '<Organizations><Organization>'
  outstr += xmlh.output_val('organizationID', org_id)
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name>350.org</name>'
  outstr += xmlh.output_val('missionStatement', mission_statement)
  outstr += xmlh.output_val('description', org_desc)
  outstr += '<location>'
  outstr += xmlh.output_val("city", "")
  outstr += xmlh.output_val("region", "")
  outstr += xmlh.output_val("postalCode", "")
  outstr += '</location>'
  # TODO: make these variables
  outstr += '<organizationURL>http://www.350.org/</organizationURL>'
  outstr += '<donateURL>http://www.350.org/donate</donateURL>'
  outstr += '<logoURL>http://www.350.org/sites/all/themes/threefifty/logo.gif</logoURL>'
  outstr += '<detailURL>http://www.350.org/about</detailURL>'
  outstr += '</Organization></Organizations>'

  outstr += '\n<VolunteerOpportunities>\n'
  nodes = feed.getElementsByTagName('node')
  for i, node in enumerate(nodes):
    if maxrec > 0 and i > maxrec:
      break
    title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
    desc = '<![CDATA[' + xmlh.get_tag_val(node, "Body") + ']]>'
    url = xmlh.get_tag_val(node, "Link")
    lat = xmlh.get_tag_val(node, "Latitude")
    lng = xmlh.get_tag_val(node, "Longitude")

    start_datetime = xmlh.get_tag_val(node, "Start_Date")
    start_time = None
    if not start_datetime:
      start_date = "2010-10-10"
    else:
      # "(All day)" events start at midnight
      start_datetime = start_datetime.replace(" (All day)", "T00:00:00")
      dt = start_datetime.split("T")
      start_date = dt[0][0:10]
      if len(dt) > 1:
        start_time = dt[1]

    end_datetime = xmlh.get_tag_val(node, "End_Date")
    end_time = None
    if not end_datetime:
      open_ended = True
    else:
      open_ended = False
      # "(All day)" events end at 11pm
      end_datetime = end_datetime.replace(" (All day)", "T23:00:00")
      dt = end_datetime.split("T")
      end_date = dt[0][0:10]
      if len(dt) > 1:
        end_time = dt[1]

    locstr = "%s, %s %s" % (xmlh.get_tag_val(node, "City"),
                            xmlh.get_tag_val(node, "Province"),
                            xmlh.get_tag_val(node, "Country"))

    outstr += '<VolunteerOpportunity>'
    outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (str(i))
    outstr += ('<sponsoringOrganizationIDs><sponsoringOrganizationID>%s'
               '</sponsoringOrganizationID></sponsoringOrganizationIDs>'
               % (org_id))
    outstr += ('<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s'
               '</volunteerHubOrganizationID></volunteerHubOrganizationIDs>'
               % (org_id))
    outstr += '<title>%s</title>' % (title)
    outstr += '<detailURL>%s</detailURL>' % (url)
    outstr += '<description>%s</description>' % (desc)
    outstr += '<abstract>%s</abstract>' % (desc)
    outstr += '<lastUpdated>%s</lastUpdated>' % (last_updated)
    outstr += '<locations><location>'
    outstr += '<location_string>%s</location_string>' % (locstr)
    outstr += '<latitude>%s</latitude>' % (lat)
    outstr += '<longitude>%s</longitude>' % (lng)
    outstr += '</location></locations>'
    outstr += '<dateTimeDurations><dateTimeDuration>'
    outstr += '<startDate>%s</startDate>' % (start_date)
    if start_time:
      outstr += '<startTime>%s</startTime>' % (start_time)
    if open_ended:
      outstr += '<openEnded>Yes</openEnded>'
    else:
      outstr += '<openEnded>No</openEnded>'
      outstr += '<endDate>%s</endDate>' % (end_date)
      if end_time:
        outstr += '<endTime>%s</endTime>' % (end_time)
    outstr += '</dateTimeDuration></dateTimeDurations>'
    outstr += '</VolunteerOpportunity>\n'
    numopps += 1

  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'
  return outstr, numorgs, numopps
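# The Start_Date/End_Date handling above can be read as one small helper.
# split_350_datetime() is an illustrative sketch, not part of the original
# module: it rewrites the " (All day)" suffix to a fixed time, then splits
# on "T" into a (date, time-or-None) pair.
def split_350_datetime(raw, all_day_time):
  """Split a 350.org datetime string into (YYYY-MM-DD, HH:MM:SS or None)."""
  raw = raw.replace(" (All day)", "T" + all_day_time)
  parts = raw.split("T")
  date = parts[0][0:10]
  time = None
  if len(parts) > 1:
    time = parts[1]
  return date, time

# e.g. split_350_datetime("2010-10-10 (All day)", "00:00:00")
# returns ("2010-10-10", "00:00:00")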
def parse(instr, maxrecs, progress):
  """return FPXML given craigslist data"""
  if CL_LATLONGS is None:
    load_craigslist_latlongs()
  xmlh.print_progress("loading craigslist crawler output...")
  crawl_craigslist.parse_cache_file(instr, listings_only=True)
  xmlh.print_progress("loaded " + str(len(crawl_craigslist.pages)) +
                      " craigslist pages.")

  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', "105")
  outstr += xmlh.output_val('providerName', "craigslist")
  outstr += xmlh.output_val('feedID', "craigslist")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.craigslist.org/")
  outstr += '</FeedInfo>'
  numorgs = numopps = 0

  # no "organization" in craigslist postings
  outstr += '<Organizations>'
  outstr += '<Organization>'
  outstr += '<organizationID>0</organizationID>'
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name></name>'
  outstr += '<missionStatement></missionStatement>'
  outstr += '<description></description>'
  outstr += '<location>'
  outstr += xmlh.output_val("city", "")
  outstr += xmlh.output_val("region", "")
  outstr += xmlh.output_val("postalCode", "")
  outstr += '</location>'
  outstr += '<organizationURL></organizationURL>'
  outstr += '<donateURL></donateURL>'
  outstr += '<logoURL></logoURL>'
  outstr += '<detailURL></detailURL>'
  outstr += '</Organization>'
  numorgs += 1
  outstr += '</Organizations>'

  skipped_listings = {}
  skipped_listings["body"] = skipped_listings["title"] = \
      skipped_listings["not-ok"] = 0
  outstr += '<VolunteerOpportunities>'
  for i, url in enumerate(crawl_craigslist.pages):
    page = crawl_craigslist.pages[url]

    ok = extract(page, "it's OK to distribute this "
                 "charitable volunteerism opportunity")
    if ok == "":
      skipped_listings["not-ok"] += 1
      continue

    title = extract(page, "<title>(.+?)</title>")
    if title == "":
      skipped_listings["title"] += 1
      continue

    body = extract(page, '<div id="userbody">(.+?)<')
    if len(body) < 25:
      skipped_listings["body"] += 1
      continue

    item_id = extract(url, "/vol/(.+?)[.]html$")
    locstr = extract(page, "Location: (.+?)<")
    datestr = extract(page, "Date: (.+?)<")
    ts = dateutil.parser.parse(datestr)
    datetimestr = ts.strftime("%Y-%m-%dT%H:%M:%S")
    datestr = ts.strftime("%Y-%m-%d")

    if maxrecs > 0 and i > maxrecs:
      break
    xmlh.print_rps_progress("opps", progress, i, maxrecs)
    if progress and i > 0 and i % 250 == 0:
      msg = "skipped " + str(skipped_listings["title"] +
                             skipped_listings["body"])
      msg += " listings (" + str(skipped_listings["title"]) + " for no-title and "
      msg += str(skipped_listings["body"]) + " for short body and "
      msg += str(skipped_listings["not-ok"]) + " for no-redistrib)"
      xmlh.print_progress(msg)
    #print "---"
    #print "title:", title
    #print "loc:", locstr
    #print "date:", datestr
    #print "body:", body[0:100]

    # craigslist is full of weird escapes -- strip them
    body = re.sub(r'&[a-z]+;', '', body)
    title = re.sub(r'&[a-z]+;', '', title)
    locstr = re.sub(r'&[a-z]+;', '', locstr)

    outstr += '<VolunteerOpportunity>'
    outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (item_id)
    outstr += ('<sponsoringOrganizationIDs><sponsoringOrganizationID>0'
               '</sponsoringOrganizationID></sponsoringOrganizationIDs>')
    outstr += ('<volunteerHubOrganizationIDs><volunteerHubOrganizationID>0'
               '</volunteerHubOrganizationID></volunteerHubOrganizationIDs>')
    outstr += '<title>%s</title>' % (title)
    outstr += '<detailURL>%s</detailURL>' % (url)

    # avoid CDATA in body...
    esc_body = xml.sax.saxutils.escape(body)
    esc_body100 = xml.sax.saxutils.escape(body[0:100])
    outstr += '<description>%s</description>' % (esc_body)
    outstr += '<abstract>%s</abstract>' % (esc_body100 + "...")
    outstr += '<lastUpdated>%s</lastUpdated>' % (datetimestr)
    # TODO: expires

    # TODO: synthesize location from metro...
    outstr += '<locations><location>'
    outstr += '<name>%s</name>' % (xml.sax.saxutils.escape(locstr))
    # what about the few that do geocode?
    lat, lng = "", ""
    try:
      domain, unused = url.split("vol/")
      lat, lng = CL_LATLONGS[domain].split(",")
    except (ValueError, KeyError):
      # no latlong for this metro -- ignore for now
      pass
    outstr += '<latitude>%s</latitude>' % (lat)
    outstr += '<longitude>%s</longitude>' % (lng)
    outstr += '</location></locations>'
    #outstr += '<locations><location>'
    #outstr += '<city>%s</city>' % (
    #outstr += '<region>%s</region>' % (
    #outstr += '</location></locations>'
    outstr += '<dateTimeDurations><dateTimeDuration>'
    outstr += '<openEnded>No</openEnded>'
    outstr += '<startDate>%s</startDate>' % (datestr)
    # TODO: endDate = startDate + N=14 days?
    # TODO: timezone???
    #outstr += '<endDate>%s</endDate>' % (
    outstr += '</dateTimeDuration></dateTimeDurations>'
    # TODO: categories???
    #outstr += '<categoryTags>'
    outstr += '</VolunteerOpportunity>'
    numopps += 1

  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'
  #outstr = re.sub(r'><([^/])', r'>\n<\1', outstr)
  return outstr, numorgs, numopps
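# The craigslist parser calls an extract() helper that is defined elsewhere
# in this module. A sketch consistent with how it is used above -- return the
# first capture group (or the whole match when the pattern has no group), and
# "" when nothing matches -- might look like this:
import re

def extract(text, pattern):
  """Return the first regex capture (or match) found in text, else ""."""
  res = re.search(pattern, text, re.DOTALL)
  if not res:
    return ""
  if res.groups():
    return res.group(1)
  return res.group(0)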
def parse(instr, maxrec, progress):
  """return FPXML given DIY (do-it-yourself) TSV data"""
  org_id = "140"
  mission_statement = "Do it yourself volunteer opportunities."
  org_desc = "Do it yourself volunteer opportunities"

  today = datetime.now()
  last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading diy custom TSV...")

  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', org_id)
  outstr += xmlh.output_val('providerName', "diy")
  outstr += xmlh.output_val('feedID', "diy")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.allforgood.org/")
  outstr += '</FeedInfo>'

  outstr += '<Organizations><Organization>'
  outstr += xmlh.output_val('organizationID', org_id)
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name>allforgood.org</name>'
  outstr += xmlh.output_val('missionStatement', mission_statement)
  outstr += xmlh.output_val('description', org_desc)
  outstr += '<location>'
  outstr += xmlh.output_val("city", "San Francisco")
  outstr += xmlh.output_val("region", "CA")
  outstr += xmlh.output_val("postalCode", "94105")
  outstr += '</location>'
  outstr += '<organizationURL>http://www.allforgood.org/</organizationURL>'
  outstr += '<donateURL>http://www.allforgood.org/</donateURL>'
  outstr += '<logoURL>http://www.allforgood.org/</logoURL>'
  outstr += '<detailURL>http://www.allforgood.org/</detailURL>'
  outstr += '</Organization></Organizations>'

  outstr += '<VolunteerOpportunities>'
  lines = instr.split("\n")
  header = lines.pop(0).strip().split("\t")
  for i, line in enumerate(lines):
    row = line.strip().split("\t")
    if maxrec > 0 and i > maxrec:
      break
    title = get_field("title", row, header)
    url = get_field("url", row, header)
    # skip rows that lack a title or URL -- check the raw fields before
    # CDATA wrapping, so empty titles are actually caught
    if not title or not url:
      continue
    title = '<![CDATA[' + title + ']]>'
    sponsor = get_field("sponsoringOrganization", row, header)
    desc = ('<![CDATA[' + sponsor + ': '
            + get_field("description", row, header)
            + ' Areas of interest: ' + get_field("subjectArea", row, header)
            + ' Tags: ' + get_field("keywords", row, header)
            + ']]>')
    start_date = last_updated

    outstr += '<VolunteerOpportunity>'
    outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (str(i))
    outstr += ('<sponsoringOrganizationIDs><sponsoringOrganizationID>%s'
               '</sponsoringOrganizationID></sponsoringOrganizationIDs>'
               % (org_id))
    outstr += ('<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s'
               '</volunteerHubOrganizationID></volunteerHubOrganizationIDs>'
               % (org_id))
    outstr += '<self_directed>Yes</self_directed>'
    outstr += '<title>%s</title>' % (title)
    outstr += '<detailURL><![CDATA[%s]]></detailURL>' % (url)
    outstr += '<description>%s</description>' % (desc)
    outstr += '<abstract>%s</abstract>' % (desc)
    outstr += '<lastUpdated>%s</lastUpdated>' % (last_updated)
    outstr += '<dateTimeDurations><dateTimeDuration>'
    outstr += '<startDate>%s</startDate>' % (start_date)
    outstr += '<openEnded>Yes</openEnded>'
    outstr += '</dateTimeDuration></dateTimeDurations>'
    outstr += '<locations><location><virtual>Yes</virtual></location></locations>'
    outstr += '</VolunteerOpportunity>'
    numopps += 1

  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'
  return outstr, numorgs, numopps
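# get_field() is defined elsewhere in this module. A sketch consistent with
# the TSV loop above -- look up a column by its header name and return ""
# when the column is absent or the row is short -- might be:
def get_field(name, row, header):
  """Return the value of the named TSV column in row, or ""."""
  try:
    return row[header.index(name)].strip()
  except (ValueError, IndexError):
    return ""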
  if not feed:
    return '', 0, 0

  org_id = str(103)
  mission_statement = (
    "Idealist connects people, organizations, and resources to help build a "
    "world where all people can live free and dignified lives. Idealist is "
    "independent of any government, political ideology, or religious creed. "
    "Our work is guided by the common desire of our members and supporters "
    "to find practical solutions to social and environmental problems, in a "
    "spirit of generosity and mutual respect.")
  org_desc = "Volunteer Opportunities that were posted to idealist.org in English"

  today = datetime.now()
  last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
  start_date = last_updated
  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading idealist.xml custom XML...")

  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', org_id)
  outstr += xmlh.output_val('providerName', "idealist")
  outstr += xmlh.output_val('feedID', "idealist")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.idealist.org/")
  outstr += '</FeedInfo>'

  # 1 "organization" in idealist.org postings
  outstr += '<Organizations><Organization>'
  outstr += xmlh.output_val('organizationID', org_id)
  outstr += '<nationalEIN></nationalEIN>'
def geocode(query):
  """Looks up a location query using the GMaps API with a local cache and
  returns: address, latitude, longitude, accuracy (as strings).
  On failure, returns False.
  Accuracy levels: 7-9 = street address, 6 = road, 5 = zip code,
  4 = city, 3 = county, 2 = state, 1 = country"""
  global GEOCODE_CACHE
  query = filter_cache_delimiters(query)

  # load the cache
  if GEOCODE_CACHE is None:
    GEOCODE_CACHE = {}
    geocode_fh = open(GEOCODE_CACHE_FN, "r")
    try:
      for line in geocode_fh:
        # Cache line format is:
        #   query|address;latitude;longitude;accuracy
        # For example:
        #   ca|California;36.7782610;-119.4179324;2
        # Or, if the location can't be found:
        #   Any city anywhere|
        if "|" in line:
          key, result = line.strip().split("|")
          key = normalize_cache_key(key)
          if ";" in result:
            result = tuple(result.split(";"))
          else:
            result = False
          GEOCODE_CACHE[key] = result
          #if GEOCODE_DEBUG:
          #  if len(GEOCODE_CACHE) % 250 == 0:
          #    print_debug("read " + str(len(GEOCODE_CACHE)) +
          #                " geocode cache entries.")
    finally:
      geocode_fh.close()

  # try the cache
  key = normalize_cache_key(query)
  if key in GEOCODE_CACHE:
    return GEOCODE_CACHE[key]

  # call Google Maps API
  result = geocode_call(query)
  #print_debug("geocode result: " + str(result))
  if result is False:
    return False  # do not cache

  # cache the result
  if result is None:
    result = False
    cacheline = query + "|"
  else:
    result = map(filter_cache_delimiters, result)
    cacheline = query + "|" + ";".join(result)
  geocode_fh = open(GEOCODE_CACHE_FN, "a")
  xmlh.print_progress("storing cacheline: " + cacheline, "", SHOW_PROGRESS)
  geocode_fh.write(cacheline + "\n")
  geocode_fh.close()
  GEOCODE_CACHE[key] = result
  return result
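# For reference, the cache-line round trip geocode() implements, as a
# standalone sketch. parse_cache_line() is illustrative, not part of the
# original module, and the coordinates shown are made-up examples:
#
#   hit:  "123 main st, sf|123 Main St, San Francisco, CA;37.77;-122.41;8"
#   miss: "any city anywhere|"
#
def parse_cache_line(line):
  """Split one geocode-cache line into (key, result-tuple-or-False)."""
  key, result = line.strip().split("|")
  if ";" in result:
    return key, tuple(result.split(";"))
  return key, False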
    respcode = 403
  else:
    node = xmlh.simple_parser(res, [], False)
    respcode = xmlh.get_tag_val(node, "code")
    if respcode == "":
      #print_debug("unparseable response: " + res)
      return False
    respcode = int(respcode)

  if respcode in (400, 601, 602, 603):
    # problem with the query
    return None

  if respcode in (403, 500, 620):
    # problem with the server -- back off and retry
    print_debug("geocode_call: Connection problem or quota exceeded. "
                "Sleeping...")
    if retries == 4:
      # log only on the first attempt (retries counts down from 4)
      xmlh.print_progress("geocoder: %d" % respcode, "", SHOW_PROGRESS)
    time.sleep(5)
    return geocode_call(query, retries - 1)

  if respcode != 200:
    return False

  # TODO(danyq): if the query is a lat/lng, find the city-level
  # address, not just the first one.
  addr = xmlh.get_tag_val(node, "address")
  # TODO(danyq): Return street/city/country fields separately so that
  # the frontend can decide what to display. For now, this hack just
  # removes "USA" from all addresses.
  addr = re.sub(r', USA$', r'', addr)
  coords = xmlh.get_tag_val(node, "coordinates")