Exemplo n.º 1
def memoize(links, domain):
    """Drop links already recorded on a previous run and save the rest.

    Every link whose ``href`` is listed in the per-domain memo file (the path
    returned by ``domain_to_key(domain)``) is discarded. The hrefs of the
    links that remain are then written back to that file, replacing its
    previous contents, so the next run compares against them.

    Args:
        links: Iterable of link objects exposing an ``href`` attribute.
            Links sharing the same ``href`` collapse to a single entry
            (the last one seen wins).
        domain: Value handed to ``domain_to_key`` to locate the memo file.

    Returns:
        A list of the link objects whose ``href`` was not in the memo file.
    """
    memo_dir = os.path.join(os.getcwd(), MEMO_DIRECTORY)
    if not os.path.exists(memo_dir):
        # NOTE(review): the original `if` had no body at all (a syntax
        # error). Creating the missing directory is the presumed intent --
        # confirm against the project history.
        os.makedirs(memo_dir)

    # Index the candidate links by URL; this also removes duplicates.
    memo = {link.href: link for link in links}
    memo_path = domain_to_key(domain)

    # From the second run onwards a memo file exists: a link that is still
    # on the page since the last run is not news, so it is filtered out.
    if os.path.exists(memo_path):
        # The context manager guarantees the handle is closed.
        with open(memo_path, "r") as memo_file:
            saved_links = memo_file.read()
        for href in saved_links.split("\n"):
            # Skip the empty trailing entry left by the final newline, and
            # test key membership rather than the stored object's truthiness.
            if href:
                memo.pop(href, None)

    # Overwrite the memo file with the surviving URLs, one per line.
    text = "".join(href + "\n" for href in memo)
    write_unicode_to_file(memo_path, safe_unicode(text), "w")

    # Hand back the surviving link objects as a real list.
    return list(memo.values())
Exemplo n.º 2
def extract_data(link, get_objects):
    """Download the page behind ``link``, extract its text and archive it.

    The page is fetched from ``link.href`` and passed to ``MaxSubSequence``
    to obtain the body text and a title. A record made of the URL, title and
    text is appended to the file at ``SAVED_DIRECTORY``.

    Args:
        link: Object with ``href`` and ``title`` attributes. ``title`` is
            filled in with the extracted title when it is None, and ``text``
            is set when ``get_objects`` is true.
        get_objects: When true, return ``link`` with the extracted text
            attached; otherwise nothing is returned.

    Returns:
        ``link`` when ``get_objects`` is true and extraction succeeded;
        None when the download fails, when no text could be extracted, or
        when ``get_objects`` is false.
    """
    try:
        response = urllib2.urlopen(link.href)
        try:
            html = response.read()
        finally:
            # Always release the connection, even if the read fails.
            response.close()
    except Exception:
        # Deliberate best-effort: a bust link must not stop the crawl.
        return None

    obj = MaxSubSequence(html)
    txt = obj.MaxSubSequence()
    if txt is None:
        return None  # Beautiful Soup unicode error we need to account for

    title = obj.getTitle()
    if link.title is None:  # Title was not captured in the seeker; try once more
        link.title = title

    # One record: link, title and body, all in unicode, ending with the
    # article delimiter.
    total = PROPERTY_DELIMITER.join(
        [link.href, safe_unicode(title), safe_unicode(txt)]) + ARTICLE_DELIMITER

    # NOTE(review): the original opened this file but never wrote to it or
    # closed it, leaving `total` unused. Appending the record is the
    # presumed intent -- confirm.
    with codecs.open(SAVED_DIRECTORY, "a+", "utf-8") as article_file:
        article_file.write(total)

    if get_objects:
        link.text = txt
        return link
    return None