def ftp_to_base(filename, ftpinfo, instr):
  """ftp the string to base, guessing the feed name from the orig filename."""
  print_progress("attempting to FTP " + filename + " to base")
  ftplib = __import__("ftplib")
  stringio = __import__("StringIO")

  dest_fn = footprint_lib.guess_shortname(filename)
  if dest_fn == "":
    dest_fn = "footprint1.txt"
  else:
    dest_fn = dest_fn + "1.gz"

  if re.search(r"[.]gz$", dest_fn):
    print_progress("compressing data from " + str(len(instr)) + " bytes")
    gzip_fh = gzip.open(dest_fn, "wb", 9)
    gzip_fh.write(instr)
    gzip_fh.close()
    data_fh = open(dest_fn, "rb")
  else:
    data_fh = stringio.StringIO(instr)

  host = "uploads.google.com"
  (user, passwd) = ftpinfo.split(":")
  print_progress("connecting to " + host + " as user " + user + "...")
  ftp = ftplib.FTP(host)
  welcomestr = re.sub(r"\n", "\\n", ftp.getwelcome())
  print_progress("FTP server says: " + welcomestr)
  ftp.login(user, passwd)
  print_progress("uploading filename " + dest_fn)
  success = False
  while not success:
    try:
      ftp.storbinary("STOR " + dest_fn, data_fh, 8192)
      success = True
    except:
      # probably ftplib.error_perm: 553: Permission denied on server. (Overwrite)
      print_progress("upload failed-- sleeping and retrying...")
      time.sleep(1)
      ftp = ftplib.FTP(host)
      welcomestr = re.sub(r"\n", "\\n", ftp.getwelcome())
      print_progress("FTP server says: " + welcomestr)
      ftp.login(user, passwd)
      print_progress("uploading filename " + dest_fn)
  if success:
    print_progress("done uploading.")
  else:
    print_progress("giving up.")
  ftp.quit()
  data_fh.close()
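# Illustrative only: the retry loop in ftp_to_base above exits its while only on
# success, so the "giving up" branch can never run.  A minimal sketch of a
# bounded-retry upload using the same STOR arguments as above (the helper name,
# the max_attempts default, and the seek(0) rewind are assumptions, not existing
# behavior of this module):
def ftp_store_with_retries(ftp, dest_fn, data_fh, max_attempts=5):
  """Try 'STOR dest_fn' up to max_attempts times; return True on success."""
  ftplib = __import__("ftplib")
  for attempt in range(max_attempts):
    try:
      data_fh.seek(0)  # rewind in case a previous attempt partially read the file
      ftp.storbinary("STOR " + dest_fn, data_fh, 8192)
      return True
    except ftplib.all_errors:
      print_progress("upload attempt %d failed-- sleeping and retrying..." %
                     (attempt + 1))
      time.sleep(1)
  return False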
def solr_retransform(fname, start_time, feed_file_size):
  """Create Solr-compatible versions of a datafile"""
  numopps = 0
  print_progress("Creating Solr transformed file for: " + fname)
  out_filename = fname + ".transformed"
  data_file = open(fname, "r")
  try:
    csv_reader = DictReader(data_file, dialect="our-dialect")
    csv_reader.next()
  except:
    print data_file.read()
    print_progress("error processing %s" % str(fname))
    return

  shortname = footprint_lib.guess_shortname(fname)
  if not shortname:
    shortname = fname

  fnames = csv_reader.fieldnames[:]
  fnames.append("c:eventrangestart:dateTime")
  fnames.append("c:eventrangeend:dateTime")
  fnames.append("c:eventduration:integer")
  fnames.append("c:aggregatefield:string")
  fnames.append("c:dateopportunityidgroup:string")
  fnames.append("c:randomsalt:float")
  fnamesdict = dict([(x, x) for x in fnames])

  data_file = open(fname, "r")
  # TODO: Switch to TSV - Faster and simpler
  csv_reader = DictReader(data_file, dialect="our-dialect")
  csv_writer = DictWriter(open(out_filename, "w"),
                          dialect="excel-tab",
                          fieldnames=fnames)
  for field_name in fnamesdict.keys():
    fnamesdict[field_name] = fnamesdict[field_name].lower()
    if fnamesdict[field_name].startswith("c:"):
      fnamesdict[field_name] = fnamesdict[field_name].split(":")[1]
  csv_writer.writerow(fnamesdict)

  now = parser.parse(commands.getoutput("date"))
  today = now.date()
  expired_by_end_date = num_bad_links = 0
  for rows in csv_reader:
    if rows["title"] and rows["title"].lower().find("anytown museum") >= 0:
      # bogus event
      continue
    if not "c:OpportunityID:string" in rows:
      continue

    # Split the date range into separate fields
    # event_date_range can be either start_date or start_date/end_date
    split_date_range = []
    if rows["event_date_range"]:
      split_date_range = rows["event_date_range"].split("/")
    if split_date_range:
      rows["c:eventrangestart:dateTime"] = split_date_range[0]
      if len(split_date_range) > 1:
        rows["c:eventrangeend:dateTime"] = split_date_range[1]
      else:
        if rows["c:openended:boolean"] == "Yes":
          rows["c:eventrangeend:dateTime"] = rows["c:expires:dateTime"]
        else:
          rows["c:eventrangeend:dateTime"] = rows["c:eventrangestart:dateTime"]

    # in case we somehow got here without already doing this
    rows["title"] = footprint_lib.cleanse_snippet(rows["title"])
    rows["description"] = footprint_lib.cleanse_snippet(rows["description"])

    # un-escape HTML-encoded ampersands in the detail URL
    rows["c:detailURL:URL"] = rows["c:detailURL:URL"].replace("&amp;", "&")
    if not rows["c:detailURL:URL"].lower().startswith("http"):
      rows["c:detailURL:URL"] = "http://" + rows["c:detailURL:URL"]
    link = str(rows["c:detailURL:URL"])
    if link in BAD_LINKS or check_links.is_bad_link(link, RECHECK_BAD_LINKS):
      num_bad_links += 1
      footprint_lib.feed_report(rows["c:OpportunityID:string"], "badlinks",
                                shortname, link)
      dlink = "'" + str(link) + "'"
      if dlink not in BAD_LINKS:
        BAD_LINKS[dlink] = 0
        print_progress("bad link: " + dlink)
      BAD_LINKS[dlink] += 1
      continue

    rows["c:org_missionStatement:string"] = footprint_lib.cleanse_snippet(
      rows["c:org_missionStatement:string"])
    rows["c:org_description:string"] = footprint_lib.cleanse_snippet(
      rows["c:org_description:string"])

    rows["c:aggregatefield:string"] = footprint_lib.cleanse_snippet(
      " ".join([rows["title"],
                rows["description"],
                rows["c:provider_proper_name:string"],
                rows.get("c:skills:string", rows.get("c:skill:string", "")),
                rows.get("c:categoryTags:string",
                         rows.get("c:categoryTag:string", "")),
                rows["c:org_name:string"],
                rows["c:eventName:string"],
               ]))

    ids = rows.get("c:OpportunityID:string",
                   rows.get("c:opportunityID:string", "OpportunityID"))
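    # Build a per-day grouping key from the date part of the event start and the
    # opportunity id; it is written to c:dateopportunityidgroup:string below.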
    ds = str(rows.get("c:eventrangestart:dateTime", "2001"))
    if ds.find("T") > 0:
      ds = ds.split("T")[0]
    rows["c:dateopportunityidgroup:string"] = "".join([ds, ids])

    for key in rows.keys():
      if key.find(":dateTime") != -1:
        if rows[key].find(":") > 0:
          rows[key] += "Z"
      elif key.find(":integer") != -1:
        if rows[key] == "":
          rows[key] = 0
        else:
          # find the first numbers from the string, e.g. abc123.4 => 123
          try:
            rows[key] = int(re.sub(r"^.*?([0-9]+).*$", r"\1", rows[key]))
          except:
            print_progress("error parsing rows[key]=%s -- rejecting record." %
                           str(rows[key]))
            continue

    try:
      start_date = parser.parse(rows["c:eventrangestart:dateTime"], ignoretz=True)
    except:
      start_date = "2001-01-01T00:00:00"
    try:
      end_date = parser.parse(rows["c:eventrangeend:dateTime"], ignoretz=True)
    except:
      end_date = "2020-12-31T23:59:59"

    try:
      # check for expired opportunities
      delta_days = get_delta_days(relativedelta.relativedelta(end_date, today))
      if delta_days < -2 and delta_days > -3000:
        # more than 3000? it's the 1971 thing
        # else it expired at least two days ago
        footprint_lib.feed_report(rows["c:OpportunityID:string"], "expired",
                                  shortname, link)
        expired_by_end_date += 1
        continue

      duration_rdelta = relativedelta.relativedelta(end_date, start_date)
      duration_delta_days = get_delta_days(duration_rdelta)

      # Check whether start/end dates are the wrong way around.
      if duration_delta_days < 0:
        # removing this code for now-- too scary wrt. typos
        # e.g. what happens if 9/11/2009 - 9/7/2009 and it turns out
        # that the 7 was supposed to be a 17 i.e. simple typo-- by
        # swapping you've made it worse.  Correct solution is to add
        # to spreadsheet checker, then reject start>end here.
        # even this is the wrong place to do this-- should apply to
        # both Base and SOLR.
        #print_progress('Date error: start > end. Swapping dates...')
        #duration_delta_days = -duration_delta_days
        #temp = rows["c:eventrangestart:dateTime"]
        #rows["c:eventrangestart:dateTime"] = rows["c:eventrangeend:dateTime"]
        #rows["c:eventrangeend:dateTime"] = temp
        print_progress("start date after end date: rejecting record.")
        continue

      # Fix for events that are ongoing or whose dates were unsuccessfully
      # parsed.  These events have start and end dates on 0000-01-01.
      #
      # These events get a large eventduration (used for ranking) so that
      # they are not erroneously boosted for having a short duration.
      current_rdelta = relativedelta.relativedelta(today, end_date)
      current_delta_days = get_delta_days(current_rdelta)
      rows["c:eventduration:integer"] = max(duration_delta_days,
                                            current_delta_days)
    except:
      pass

    # GBASE LEGACY: Fix to the +1000 to lat/long hack
    if not rows["c:latitude:float"] is None and float(rows["c:latitude:float"]) > 500:
      rows["c:latitude:float"] = float(rows["c:latitude:float"]) - 1000.0
    if not rows["c:longitude:float"] is None and float(rows["c:longitude:float"]) > 500:
      rows["c:longitude:float"] = float(rows["c:longitude:float"]) - 1000.0

    # The random salt is added to the result score during ranking to prevent
    # groups of near-identical results with identical scores from appearing
    # together in the same result pages without harming quality.
    rows["c:randomsalt:float"] = str(random.uniform(0.0, 1.0))

    csv_writer.writerow(rows)
    numopps += 1

  data_file.close()
  print_progress("bad links: %d" % num_bad_links)
  print_progress(" expired: %d" % expired_by_end_date)

  # NOTE: if you change this, you also need to update datahub/load_gbase.py
  # and frontend/views.py to avoid breaking the dashboard-- other status
  # messages don't matter.
  elapsed = datetime.now() - start_time
  xmlh.print_status("done parsing: output " + str(footprint_lib.NUMORGS) +
                    " organizations" + " and " + str(numopps) + " opportunities" +
                    " (" + str(feed_file_size) + " bytes): " +
                    str(int(elapsed.seconds / 60)) + " minutes.",
                    shortname)

  proper_name = shortname
  if shortname in providers.ProviderNames:
    proper_name = providers.ProviderNames[shortname].get("name", shortname)

  # do the per-provider summary
  if shortname:
    processed = str(datetime.now()).split(".")[0]
    try:
      fh = open(FEEDSDIR + "/" + shortname + "-last.txt", "r")
    except:
      fh = None

    footprint_stats = None
    if fh:
      footprint_stats = fh.read()
      fh.close()

    fh = open(FEEDSDIR + "/" + shortname + "-history.txt", "a")
    if fh:
      fh.write("processed\t" + processed + "\n")
      fh.write("elapsed\t" + str(int(elapsed.seconds / 60)) + "\n")
      fh.write("bytes\t" + str(feed_file_size) + "\n")
      fh.write("numopps\t" + str(numopps) + "\n")
      fh.write("expired\t" + str(expired_by_end_date) + "\n")
      fh.write("badlinks\t" + str(num_bad_links) + "\n")
      if footprint_stats:
        fh.write(footprint_stats)
      fh.write("proper_name\t" + proper_name + "\n")
      fh.close()

  return out_filename
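# For reference: solr_retransform reads its input with dialect="our-dialect",
# which must be registered (via csv.register_dialect) elsewhere in this file or
# project.  A minimal sketch of such a registration -- the delimiter and quoting
# values below are assumptions for illustration, not the project's actual
# settings:
#
#   import csv
#   csv.register_dialect("our-dialect", delimiter="\t", quoting=csv.QUOTE_NONE)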