Example #1
def ftp_to_base(filename, ftpinfo, instr):
    """ftp the string to base, guessing the feed name from the orig filename."""
    print_progress("attempting to FTP " + filename + " to base")
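    # ftplib and StringIO are imported lazily, only when this function actually runs.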
    ftplib = __import__("ftplib")
    stringio = __import__("StringIO")

    dest_fn = footprint_lib.guess_shortname(filename)
    if dest_fn == "":
        dest_fn = "footprint1.txt"
    else:
        dest_fn = dest_fn + "1.gz"

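    # If the destination name ends in .gz, compress the payload to a local file
    # and re-open it for upload; otherwise stream the string from an in-memory buffer.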
    if re.search(r"[.]gz$", dest_fn):
        print_progress("compressing data from " + str(len(instr)) + " bytes")
        gzip_fh = gzip.open(dest_fn, "wb", 9)
        gzip_fh.write(instr)
        gzip_fh.close()
        data_fh = open(dest_fn, "rb")
    else:
        data_fh = stringio.StringIO(instr)

    host = "uploads.google.com"
    (user, passwd) = ftpinfo.split(":")
    print_progress("connecting to " + host + " as user " + user + "...")
    ftp = ftplib.FTP(host)
    welcomestr = re.sub(r"\n", "\\n", ftp.getwelcome())
    print_progress("FTP server says: " + welcomestr)
    ftp.login(user, passwd)
    print_progress("uploading filename " + dest_fn)
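    # Retry the upload indefinitely, reconnecting and logging in again after each failure.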
    success = False
    while not success:
        try:
            ftp.storbinary("STOR " + dest_fn, data_fh, 8192)
            success = True
        except:
            # probably ftplib.error_perm: 553: Permission denied on server. (Overwrite)
            print_progress("upload failed-- sleeping and retrying...")
            time.sleep(1)
            ftp = ftplib.FTP(host)
            welcomestr = re.sub(r"\n", "\\n", ftp.getwelcome())
            print_progress("FTP server says: " + welcomestr)
            ftp.login(user, passwd)
            print_progress("uploading filename " + dest_fn)
    if success:
        print_progress("done uploading.")
    else:
        print_progress("giving up.")
    ftp.quit()
    data_fh.close()
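
A minimal invocation sketch (not part of the original module): the filename and "user:password" string below are placeholders, and print_progress and footprint_lib are assumed to be provided by the surrounding module.

feed_data = open("provider-feed.xml", "r").read()  # hypothetical feed file
ftp_to_base("provider-feed.xml", "feeduser:secret", feed_data)
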
Example #2
def solr_retransform(fname, start_time, feed_file_size):
    """Create a Solr-compatible version of a data file."""
    numopps = 0

    print_progress("Creating Solr transformed file for: " + fname)
    out_filename = fname + ".transformed"
    data_file = open(fname, "r")
    try:
        csv_reader = DictReader(data_file, dialect="our-dialect")
        csv_reader.next()
    except:
        print data_file.read()
        print_progress("error processing %s" % str(fname))
        return

    shortname = footprint_lib.guess_shortname(fname)
    if not shortname:
        shortname = fname

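    # Extend the source columns with the Solr-specific fields computed below.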
    fnames = csv_reader.fieldnames[:]
    fnames.append("c:eventrangestart:dateTime")
    fnames.append("c:eventrangeend:dateTime")
    fnames.append("c:eventduration:integer")
    fnames.append("c:aggregatefield:string")
    fnames.append("c:dateopportunityidgroup:string")
    fnames.append("c:randomsalt:float")
    fnamesdict = dict([(x, x) for x in fnames])

    data_file = open(fname, "r")
    # TODO: Switch to TSV - Faster and simpler
    csv_reader = DictReader(data_file, dialect="our-dialect")
    csv_writer = DictWriter(open(out_filename, "w"), dialect="excel-tab", fieldnames=fnames)
    for field_name in fnamesdict.keys():
        fnamesdict[field_name] = fnamesdict[field_name].lower()
        if fnamesdict[field_name].startswith("c:"):
            fnamesdict[field_name] = fnamesdict[field_name].split(":")[1]

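    # Header row: each column name lower-cased, with the "c:" prefix and type suffix stripped
    # (e.g. "c:eventduration:integer" becomes "eventduration").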
    csv_writer.writerow(fnamesdict)
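    # Current wall-clock time, obtained by parsing the output of the shell "date" command.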
    now = parser.parse(commands.getoutput("date"))
    today = now.date()
    expired_by_end_date = num_bad_links = 0
    for rows in csv_reader:
        if rows["title"] and rows["title"].lower().find("anytown museum") >= 0:
            # bogus event
            continue

        if "c:OpportunityID:string" not in rows:
            continue

        # Split the date range into separate fields
        # event_date_range can be either start_date or start_date/end_date
        split_date_range = []
        if rows["event_date_range"]:
            split_date_range = rows["event_date_range"].split("/")

        if split_date_range:
            rows["c:eventrangestart:dateTime"] = split_date_range[0]
            if len(split_date_range) > 1:
                rows["c:eventrangeend:dateTime"] = split_date_range[1]
            else:
                if rows["c:openended:boolean"] == "Yes":
                    rows["c:eventrangeend:dateTime"] = rows["c:expires:dateTime"]
                else:
                    rows["c:eventrangeend:dateTime"] = rows["c:eventrangestart:dateTime"]

        # in case we somehow got here without already doing this
        rows["title"] = footprint_lib.cleanse_snippet(rows["title"])
        rows["description"] = footprint_lib.cleanse_snippet(rows["description"])
        rows["c:detailURL:URL"] = rows["c:detailURL:URL"].replace("&amp;", "&")
        if not rows["c:detailURL:URL"].lower().startswith("http"):
            rows["c:detailURL:URL"] = "http://" + rows["c:detailURL:URL"]

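        # Drop records whose detail URL is already known bad, or that the link checker flags as bad.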
        link = str(rows["c:detailURL:URL"])
        if link in BAD_LINKS or check_links.is_bad_link(link, RECHECK_BAD_LINKS):
            num_bad_links += 1
            footprint_lib.feed_report(rows["c:OpportunityID:string"], "badlinks", shortname, link)
            dlink = "'" + str(link) + "'"
            if dlink not in BAD_LINKS:
                BAD_LINKS[dlink] = 0
                print_progress("bad link: " + dlink)
            BAD_LINKS[dlink] += 1
            continue

        rows["c:org_missionStatement:string"] = footprint_lib.cleanse_snippet(rows["c:org_missionStatement:string"])
        rows["c:org_description:string"] = footprint_lib.cleanse_snippet(rows["c:org_description:string"])

        rows["c:aggregatefield:string"] = footprint_lib.cleanse_snippet(
            " ".join(
                [
                    rows["title"],
                    rows["description"],
                    rows["c:provider_proper_name:string"],
                    rows.get("c:skills:string", rows.get("c:skill:string", "")),
                    rows.get("c:categoryTags:string", rows.get("c:categoryTag:string", "")),
                    rows["c:org_name:string"],
                    rows["c:eventName:string"],
                ]
            )
        )

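        # Grouping key: the event start date (date portion only) concatenated with the opportunity ID.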
        ids = rows.get("c:OpportunityID:string", rows.get("c:opportunityID:string", "OpportunityID"))
        ds = str(rows.get("c:eventrangestart:dateTime", "2001"))
        if ds.find("T") > 0:
            ds = ds.split("T")[0]
        rows["c:dateopportunityidgroup:string"] = "".join([ds, ids])

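        # Normalize values by declared type: dateTime values with a time component get a trailing "Z";
        # integer fields are set to 0 when empty, or parsed out of free-form text.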
        for key in rows.keys():
            if key.find(":dateTime") != -1:
                if rows[key].find(":") > 0:
                    rows[key] += "Z"
            elif key.find(":integer") != -1:
                if rows[key] == "":
                    rows[key] = 0
                else:
                    # find the first numbers from the string, e.g. abc123.4 => 123
                    try:
                        rows[key] = int(re.sub(r"^.*?([0-9]+).*$", r"\1", rows[key]))
                    except:
                        print_progress("error parsing rows[key]=%s -- skipping field." % str(rows[key]))
                        # NOTE: this "continue" only skips the current field; the record itself is still written.
                        continue

        try:
            start_date = parser.parse(rows["c:eventrangestart:dateTime"], ignoretz=True)
        except:
            start_date = "2001-01-01T00:00:00"

        try:
            end_date = parser.parse(rows["c:eventrangeend:dateTime"], ignoretz=True)
        except:
            end_date = "2020-12-31T23:59:59"

        try:
            # check for expired opportunities
            delta_days = get_delta_days(relativedelta.relativedelta(end_date, today))
            if delta_days < -2 and delta_days > -3000:
                # more than 3000? it's the 1971 thing
                # else it expired at least two days ago
                footprint_lib.feed_report(rows["c:OpportunityID:string"], "expired", shortname, link)
                expired_by_end_date += 1
                continue

            duration_rdelta = relativedelta.relativedelta(end_date, start_date)
            duration_delta_days = get_delta_days(duration_rdelta)

            # Check whether start/end dates are the wrong way around.
            if duration_delta_days < 0:
                # removing this code for now-- too scary wrt. typos
                # e.g. what happens if 9/11/2009 - 9/7/2009  and it turns out
                # that the 7 was supposed to be a 17 i.e. simple typo-- by
                # swapping you've made it worse.  Correct solution is to add
                # to spreadsheet checker, then reject start>end here.
                # even this is the wrong place to do this-- should apply to
                # both Base and SOLR.
                # print_progress('Date error: start > end. Swapping dates...')
                # duration_delta_days = -duration_delta_days
                # temp = rows["c:eventrangestart:dateTime"]
                # rows["c:eventrangestart:dateTime"] = rows["c:eventrangeend:dateTime"]
                # rows["c:eventrangeend:dateTime"] = temp
                print_progress("start date after end date: rejecting record.")
                continue

            # Fix for events that are ongoing or whose dates were unsuccessfully
            # parsed. These events have start and end dates on 0000-01-01.
            #
            # These events get a large eventduration (used for ranking) so that
            # they are not erroneously boosted for having a short duration.
            current_rdelta = relativedelta.relativedelta(today, end_date)
            current_delta_days = get_delta_days(current_rdelta)
            rows["c:eventduration:integer"] = max(duration_delta_days, current_delta_days)
        except:
            pass

        # GBASE LEGACY: Fix to the +1000 to lat/long hack
        if rows["c:latitude:float"] is not None and float(rows["c:latitude:float"]) > 500:
            rows["c:latitude:float"] = float(rows["c:latitude:float"]) - 1000.0
        if rows["c:longitude:float"] is not None and float(rows["c:longitude:float"]) > 500:
            rows["c:longitude:float"] = float(rows["c:longitude:float"]) - 1000.0

        # The random salt is added to the result score during ranking to prevent
        # groups of near-identical results with identical scores from appearing
        # together in the same result pages without harming quality.
        rows["c:randomsalt:float"] = str(random.uniform(0.0, 1.0))

        csv_writer.writerow(rows)
        numopps += 1

    data_file.close()
    print_progress("bad links: %d" % num_bad_links)
    print_progress("  expired: %d" % expired_by_end_date)

    # NOTE: if you change this, you also need to update datahub/load_gbase.py
    # and frontend/views.py to avoid breaking the dashboard-- other status
    # messages don't matter.
    elapsed = datetime.now() - start_time
    xmlh.print_status(
        "done parsing: output "
        + str(footprint_lib.NUMORGS)
        + " organizations"
        + " and "
        + str(numopps)
        + " opportunities"
        + " ("
        + str(feed_file_size)
        + " bytes): "
        + str(int(elapsed.seconds / 60))
        + " minutes.",
        shortname,
    )

    proper_name = shortname
    if shortname in providers.ProviderNames:
        proper_name = providers.ProviderNames[shortname].get("name", shortname)

    # do the per-provider summary
    if shortname:
        processed = str(datetime.now()).split(".")[0]

        try:
            fh = open(FEEDSDIR + "/" + shortname + "-last.txt", "r")
        except:
            fh = None
            footprint_stats = None

        if fh:
            footprint_stats = fh.read()
            fh.close()

        fh = open(FEEDSDIR + "/" + shortname + "-history.txt", "a")
        if fh:
            fh.write("processed\t" + processed + "\n")
            fh.write("elapsed\t" + str(int(elapsed.seconds / 60)) + "\n")
            fh.write("bytes\t" + str(feed_file_size) + "\n")
            fh.write("numopps\t" + str(numopps) + "\n")
            fh.write("expired\t" + str(expired_by_end_date) + "\n")
            fh.write("badlinks\t" + str(num_bad_links) + "\n")
            if footprint_stats:
                fh.write(footprint_stats)
            fh.write("proper_name\t" + proper_name + "\n")
            fh.close()

    return out_filename
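
A hedged invocation sketch (not part of the original module): the feed filename below is a placeholder, and the module-level dependencies used inside solr_retransform (footprint_lib, check_links, providers, xmlh, FEEDSDIR, BAD_LINKS, and so on) are assumed to already be configured.

import os
from datetime import datetime

feed_file = "exampleprovider-feed.txt"  # hypothetical feed file on disk
transformed = solr_retransform(feed_file, datetime.now(), os.path.getsize(feed_file))
print_progress("Solr-transformed file written to: " + str(transformed))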