Example #1
import logging
import os
import random
import time

import subr_misc  # project-local helpers, assumed importable


# SETTINGS and the module-private helpers (_getfeed, _select_generator,
# _get_final_url, _canonicalize, _to_bitpath, _savepost) are defined
# elsewhere in the same module.
def process_site(site):
    """ Process the feeds of a site """

    logging.info("")
    logging.info("* site: %s", site)
    logging.info("")

    body = _getfeed(site)
    if not body:
        logging.warning("main_rss: empty feed")
        return
    each_post = _select_generator(body)

    for post in each_post(body):
        logging.info("")
        logging.info("- [%04d-%02d-%02d] <%s>", post["year"], post["month"],
                     post["day"], post["link"])

        if SETTINGS["before_tuple"]:
            if post["year"] > SETTINGS["before_tuple"][0]:
                logging.info("main_rss: year greater than %d; skip",
                             SETTINGS["before_tuple"][0])
                continue
            if post["month"] > SETTINGS["before_tuple"][1]:
                logging.info("main_rss: month greater than %d; skip",
                             SETTINGS["before_tuple"][1])
                continue
            if post["day"] >= SETTINGS["before_tuple"][2]:
                logging.info(
                    "main_rss: day greater than (or equal to) %d; "
                    "skip", SETTINGS["before_tuple"][2])
                continue

        # Pause a bit before we process each post
        time.sleep(random.random() + 0.5)

        link = _get_final_url(post["link"])
        if not link:
            continue
        link = _canonicalize(link)
        path = _to_bitpath(link)
        if not path:
            continue
        path = subr_misc.make_post_folder(site, path)

        logging.info("- <%s> => .../%s", link, path)
        logging.info("")

        if os.path.isdir(path):
            logging.warning("main_rss: dup <.../%s>; skip", path)
            continue

        success = subr_misc.mkdir_recursive_idempotent(path)
        if not success:
            continue
        path += os.sep
        path += "%04d-%02d-%02d.html" % (post["year"], post["month"],
                                         post["day"])
        _savepost(link, path)
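
A minimal driver for this variant might look like the sketch below, assuming it lives in the same module as process_site; the feed URL, the logging setup, and the cutoff value are illustrative assumptions, not part of the original example.

import logging

SETTINGS = {"before_tuple": (2013, 5, 15)}  # assumed cutoff: keep only earlier posts

logging.basicConfig(level=logging.INFO, format="%(message)s")
for feed in ["http://example.com/feed.xml"]:  # hypothetical feed URL
    process_site(feed)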
Example #2
import logging
import os
import random
import time
import xml.sax as sax

import sax_atom    # project-local SAX handlers, assumed importable
import sax_rss
import subr_bitly  # project-local helpers, assumed importable
import subr_http
import subr_misc
import subr_rss


def process_site(site, noisy):
    """ Process the feeds of a site """

    logging.info("")
    logging.info("* site: %s", site)
    logging.info("")

    result = subr_rss.fetch(site, noisy=noisy)
    if not result or not result[0]:
        return
    body = result[0]

    if "<rss" not in body:
        handler = sax_atom.AtomHandler()
    else:
        handler = sax_rss.RssHandler()
    sax.parseString(body, handler)

    content = zip(handler.links, handler.pub_dates)
    for link, date in content:

        # Hardcoded filter: only posts dated on/after the 15th of May,
        # in 2013 or a later year, are processed.
        if date[0] < 2013:
            continue
        if date[1] != 5:
            continue
        if date[2] < 15:
            continue

        logging.info("")
        logging.info("- <%s>", link)
        logging.info("")

        folder = subr_misc.make_post_folder(date, site)
        subr_misc.mkdir_recursive_idempotent(folder)

        # Pause 5-7 seconds between requests to avoid hammering servers
        time.sleep(random.randrange(5, 8))
        link = subr_bitly.shorten(link, noisy=noisy)

        filename = subr_misc.bitlink_to_filename(link)
        pname = os.sep.join([folder, filename])
        if os.path.isfile(pname):
            logging.info("main: file already exists: %s", pname)
            continue

        time.sleep(random.randrange(5, 8))
        _, body = subr_http.fetch_url(link, noisy=noisy)

        filep = open(pname, "w")
        filep.write(body)
        filep.close()
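
The loop above only assumes that the handler exposes links and pub_dates after parsing. The real sax_rss.RssHandler is project-specific, but a minimal sketch of that interface, assuming xml.sax and RFC 822 pubDate values, could look like this:

import email.utils
import xml.sax.handler


class MinimalRssHandler(xml.sax.handler.ContentHandler):
    """ Sketch of the expected interface: after parsing, `links` holds
        <link> URLs and `pub_dates` holds (year, month, day) tuples
        taken from <pubDate> elements """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self.links = []
        self.pub_dates = []
        self._chunks = []

    def startElement(self, name, attrs):
        self._chunks = []

    def characters(self, content):
        self._chunks.append(content)

    def endElement(self, name):
        text = "".join(self._chunks).strip()
        if name == "link" and text:
            self.links.append(text)
        elif name == "pubDate" and text:
            # e.g. "Wed, 15 May 2013 10:00:00 GMT" -> (2013, 5, 15)
            parsed = email.utils.parsedate(text)
            if parsed:
                self.pub_dates.append(parsed[:3])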