def process_site(site):
    """
    Process the feeds of a site: fetch the feed, iterate over its posts,
    skip posts dated on/after SETTINGS["before_tuple"], resolve and
    canonicalize each link, and save each new post under a per-site folder.
    """

    logging.info("")
    logging.info("* site: %s", site)
    logging.info("")

    body = _getfeed(site)
    if not body:
        logging.warning("main_rss: empty feed")
        return

    each_post = _select_generator(body)
    for post in each_post(body):

        logging.info("")
        logging.info("- [%04d-%02d-%02d] <%s>", post["year"], post["month"],
                     post["day"], post["link"])

        if SETTINGS["before_tuple"]:
            # BUG FIX: the original code compared year, month and day
            # independently, so e.g. with before_tuple == (2013, 5, 15) a
            # 2012-12-01 post was skipped (12 > 5) and 2013-04-20 was
            # skipped (20 >= 15) even though both predate the cutoff.
            # Dates must be compared lexicographically as one tuple.
            cutoff = tuple(SETTINGS["before_tuple"])
            post_date = (post["year"], post["month"], post["day"])
            if post_date >= cutoff:
                logging.info("main_rss: post date %s not before %s; skip",
                             post_date, cutoff)
                continue

        # Pause a bit before we process each post
        time.sleep(random.random() + 0.5)

        link = _get_final_url(post["link"])
        if not link:
            continue
        link = _canonicalize(link)

        path = _to_bitpath(link)
        if not path:
            continue
        path = subr_misc.make_post_folder(site, path)

        logging.info("- <%s> => .../%s", link, path)
        logging.info("")

        # An existing directory means the post was already saved
        if os.path.isdir(path):
            logging.warning("main_rss: dup <.../%s>; skip", path)
            continue

        success = subr_misc.mkdir_recursive_idempotent(path)
        if not success:
            continue

        path += os.sep
        path += "%02d-%02d-%02d.html" % (post["year"], post["month"],
                                         post["day"])

        _savepost(link, path)
def process_site(site):
    """
    Process the feeds of a site: fetch the feed, iterate over its posts,
    skip posts dated on/after SETTINGS["before_tuple"], resolve and
    canonicalize each link, and save each new post under a per-site folder.

    NOTE(review): this definition is a byte-identical duplicate of an
    earlier `process_site` in this file; Python keeps the later binding.
    Consider removing one copy.
    """

    logging.info("")
    logging.info("* site: %s", site)
    logging.info("")

    body = _getfeed(site)
    if not body:
        logging.warning("main_rss: empty feed")
        return

    each_post = _select_generator(body)
    for post in each_post(body):

        logging.info("")
        logging.info("- [%04d-%02d-%02d] <%s>", post["year"], post["month"],
                     post["day"], post["link"])

        if SETTINGS["before_tuple"]:
            # BUG FIX: the original code compared year, month and day
            # independently, so e.g. with before_tuple == (2013, 5, 15) a
            # 2012-12-01 post was skipped (12 > 5) and 2013-04-20 was
            # skipped (20 >= 15) even though both predate the cutoff.
            # Dates must be compared lexicographically as one tuple.
            cutoff = tuple(SETTINGS["before_tuple"])
            post_date = (post["year"], post["month"], post["day"])
            if post_date >= cutoff:
                logging.info("main_rss: post date %s not before %s; skip",
                             post_date, cutoff)
                continue

        # Pause a bit before we process each post
        time.sleep(random.random() + 0.5)

        link = _get_final_url(post["link"])
        if not link:
            continue
        link = _canonicalize(link)

        path = _to_bitpath(link)
        if not path:
            continue
        path = subr_misc.make_post_folder(site, path)

        logging.info("- <%s> => .../%s", link, path)
        logging.info("")

        # An existing directory means the post was already saved
        if os.path.isdir(path):
            logging.warning("main_rss: dup <.../%s>; skip", path)
            continue

        success = subr_misc.mkdir_recursive_idempotent(path)
        if not success:
            continue

        path += os.sep
        path += "%02d-%02d-%02d.html" % (post["year"], post["month"],
                                         post["day"])

        _savepost(link, path)
def process_site(site, noisy):
    """
    Process the feeds of a site (legacy variant): fetch the feed via
    subr_rss, parse it with the Atom or RSS SAX handler, keep only posts
    dated on/after the hard-coded 2013-05-15 cutoff, shorten each link
    with bitly, and save the fetched page into a per-date folder.

    :param site: feed source passed through to subr_rss.fetch
    :param noisy: verbosity flag forwarded to the subr_* helpers
    """

    logging.info("")
    logging.info("* site: %s", site)
    logging.info("")

    result = subr_rss.fetch(site, noisy=noisy)
    if not result or not result[0]:
        return
    body = result[0]

    # Pick the parser from the payload: Atom unless an <rss> tag is present
    if "<rss" not in body:
        handler = sax_atom.AtomHandler()
    else:
        handler = sax_rss.RssHandler()
    sax.parseString(body, handler)

    content = zip(handler.links, handler.pub_dates)
    for link, date in content:

        # NOTE(review): the cutoff is hard-coded to year >= 2013,
        # month == 5, day >= 15 — presumably a one-off backfill window;
        # consider making it configurable.
        if date[0] < 2013:
            continue
        if date[1] != 5:
            continue
        if date[2] < 15:
            continue

        logging.info("")
        logging.info("- <%s>", link)
        logging.info("")

        folder = subr_misc.make_post_folder(date, site)
        subr_misc.mkdir_recursive_idempotent(folder)

        # Pause between remote operations to avoid hammering the services
        time.sleep(random.randrange(5, 8))
        link = subr_bitly.shorten(link, noisy=noisy)
        filename = subr_misc.bitlink_to_filename(link)

        pname = os.sep.join([folder, filename])
        if os.path.isfile(pname):
            logging.info("main: file already exists: %s", pname)
            continue

        time.sleep(random.randrange(5, 8))
        _, body = subr_http.fetch_url(link, noisy=noisy)

        # BUG FIX: the file was opened without a context manager, so the
        # handle leaked if write() raised; `with` guarantees it is closed.
        with open(pname, "w") as filep:
            filep.write(body)