Example #1
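# Imports assumed by this snippet. Only standard-library / PyPI modules are
# listed; db, WebMirror, WebRequest, urlFuncs, SiteArchiver and chunks are
# project-internal names assumed to be provided by the surrounding codebase.
import datetime
import traceback
import urllib.parse

import tqdm
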
def exposed_retrigger_feed_urls():
    '''
    Retrigger the content URLs from each feed item.
    '''

    # RssFeedPost attributes:
    #     id
    #     type
    #     feed_id
    #     contenturl
    #     contentid
    #     title
    #     contents
    #     updated
    #     published
    #     tag_rel
    #     author_rel
    #     tags
    #     author

    urls = set()
    with db.session_context() as sess:
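        # Feed processor instance; used below to pull links and images out
        # of the stored post HTML.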
        processor = WebMirror.processor.RssProcessor.RssProcessor(
            loggerPath="Main.RssDb",
            pageUrl='http://www.example.org',
            pgContent='',
            type='application/atom+xml',
            transfer=False,
            debug_print=True,
            db_sess=sess,
            write_debug=False)

        print("Loading posts....")
        items = sess.query(db.RssFeedPost).all()
        print("Loaded %s rows" % len(items))
        have_content = [tmp for tmp in items if tmp.contents]
        print("%s rows have content" % len(have_content))

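        # Walk every post, collecting its own URL plus any links and images
        # found in its stored content.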
        pbar = tqdm.tqdm(items, desc="Retriggering RSS URLs")
        for post in pbar:
            # Posts without a content URL can't be retriggered.
            if not post.contenturl:
                continue

            # Blogger "tag:" identifiers aren't fetchable URLs.
            if post.contenturl.startswith("tag:blogger.com"):
                continue

            if '#comment_' not in post.contenturl:
                urls.add(post.contenturl)

            if post.contents and post.contents not in ('Disabled?', 'wat'):
                soup = WebRequest.as_soup(post.contents)
                # print(post.contents)
                # Make all the page URLs fully qualified, so they're unambiguous
                soup = urlFuncs.canonizeUrls(soup, post.contenturl)

                # pull out the page content and enqueue it. Filtering is
                # done in the parent.
                plainLinks = processor.extractLinks(soup, post.contenturl)
                imageLinks = processor.extractImages(soup, post.contenturl)

                # if plainLinks or imageLinks:
                # 	print((len(plainLinks), len(imageLinks)))

                urls.update(plainLinks)
                urls.update(imageLinks)
            # pbar.set_description("Links: %s" % len(urls))

    urls = list(urls)

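    # Group the collected URLs by netloc (hostname) so they can be enqueued
    # per-site below.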
    urld = {}
    for url in [tmp for tmp in urls if tmp]:
        nl = urllib.parse.urlsplit(url).netloc
        if nl:
            urld.setdefault(nl, []).append(url)

    print("Extracted %s unique links for %s netlocs" % (len(urls), len(urld)))

    # rules = WebMirror.rules.load_rules()
    # feeds = [item['feedurls'] for item in rules]
    # feeds = [item for sublist in feeds for item in sublist]
    # url = feeds[0]
    # parsed = urllib.parse.urlparse(url)
    # root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))
    # print("Using feed url %s for job base" % url)

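    # Second pass: hand the collected URLs to the archiver, one synthetic
    # parent job per source netloc.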
    try:
        with db.session_context() as sess:
            archiver = SiteArchiver(None, sess, None)
            for netloc, netloc_urls in tqdm.tqdm(urld.items(), desc='Source Netlocs'):
                sel_url = netloc_urls[0]
                parsed = urllib.parse.urlparse(sel_url)
                root = urllib.parse.urlunparse(
                    (parsed[0], parsed[1], "", "", "", ""))

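                # Synthetic parent page row for this netloc; the extracted
                # links are attached to it in batches below.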
                job = db.WebPages(
                    url=sel_url,
                    starturl=root,
                    netloc=netloc,
                    distance=0,
                    is_text=True,
                    priority=db.DB_LOW_PRIORITY,
                    type='unknown',
                    fetchtime=datetime.datetime.now(),
                )
                for chunk in chunks(netloc_urls, 500):
                    archiver.upsertResponseLinks(job,
                                                 plain=chunk,
                                                 resource=[],
                                                 debug=True,
                                                 interactive=True)

    except Exception:
        traceback.print_exc()
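
# The chunks() helper used above is not defined in this snippet; a minimal
# sketch, assuming it yields fixed-size slices of a sequence:
def chunks(seq, size):
    for i in range(0, len(seq), size):
        yield seq[i:i + size]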