Example #1
def get_articles(grab_object, title_path, link_path, source, site_url="",
        summary_path=''):
    posts = []
        
    post_links = grab_object.doc.tree.cssselect(link_path)
    post_titles = grab_object.doc.tree.cssselect(title_path)
    if summary_path:
        summary = grab_object.doc.tree.cssselect(summary_path)
        for i in summary:
            for j in i.cssselect('script') + i.cssselect('style'):
                j.drop_tree()
    else:
        summary = []
    
    while len(summary) < len(post_links):
        summary.append(u'')
    
    zip_object = zip(post_links, post_titles, summary)
    
    # unpack in the same order the zip above produces: (link, title, summary)
    for (link, title, summary_text) in zip_object:
        title = unicode_(title.text_content()).strip()
        link = grab_object.make_url_absolute(link.get("href"))
        
        posts.append(
            {"title": escape_title(title),
             "link": unicode_(link),
             "source": source,
             "summary": summary_text})
    
    return posts
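The same CSS selector can serve as both title_path and link_path when each matched <a> element carries both the text and the href. A minimal call sketch, mirroring Example #9 (the URL, selector, and source name are taken from there; the parser.setup_grab configuration step is omitted):

g = grab.Grab()
g.go('http://planet.clojure.in')
posts = get_articles(g, '.entry .article > h2 a', '.entry .article > h2 a',
                     'planetclojure', 'planet.clojure.in')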
Example #2
def get_articles(grab_object,
                 title_path,
                 link_path,
                 source,
                 site_url="",
                 summary_path=''):
    posts = []

    post_links = grab_object.css_list(link_path)
    post_titles = grab_object.css_list(title_path)
    if summary_path:
        summary = grab_object.css_list(summary_path)
        for i in summary:
            for j in i.cssselect('script') + i.cssselect('style'):
                j.drop_tree()
    else:
        summary = []

    while len(summary) < len(post_links):
        summary.append('')

    zip_object = zip(post_links, post_titles, summary)

    # unpack in the same order the zip above produces: (link, title, summary)
    for (link, title, summary_text) in zip_object:
        title = unicode_(title.text_content()).strip()
        link = grab_object.make_url_absolute(link.get("href"))

        posts.append({
            "title": escape_title(title),
            "link": unicode_(link),
            "source": source,
            "summary": summary_text
        })

    return posts
Example #3
def has_words(qs, article):
    """Check if article contains words"""
    
    text = remove_tags(unicode_(article['title'])).lower() + \
        remove_tags(unicode_(article['summary'])).lower()
    
    for i in qs:
        if i not in text:
            return False
    return True
Example #4
def has_words(qs, article):
	"""Check if article contains words"""
	
	title = unicode_(article['title']).lower()
	summary = unicode_(article['summary']).lower()
	
	for i in qs:
		if i not in title and i not in summary:
			return False
	return True
Example #5
def has_words(qs, article):
    """Check if article contains words"""

    title = remove_tags(unicode_(article['title']).lower())
    summary = remove_tags(unicode_(article['summary']).lower())

    for i in qs:
        if i not in title and i not in summary:
            return False
    return True
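Each has_words variant expects an article dict with 'title' and 'summary' keys and a pre-split, lowercased query (see Examples #6 and #7). A self-contained sketch, with str and a trivial regex standing in for the project's unicode_ and remove_tags helpers:

import re

def remove_tags(s):
    # simplified stand-in for the project's remove_tags helper
    return re.sub(r'<[^>]+>', '', s)

unicode_ = str  # stand-in: assumes text is already unicode on Python 3

article = {'title': 'Async IO in <b>Python</b>',
           'summary': 'An overview of the asyncio event loop.'}
qs = 'python asyncio'.lower().split()
print(has_words(qs, article))  # -> True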
Example #6
def show_blacklist(page_number=1):
	history_page = mylookup.get_template('blacklist.html')
	q = unicode_(request.GET.get('q', ''))
	
	articles = recommend.get_blacklist(db=get_conf.config.db)
	
	try:
		page_number = int(page_number)
	except ValueError:
		page_number = 1
	
	if q:
		qs = q.lower().split()
		articles = filter(lambda x: has_words(qs, x), articles)
	
	articles = map(lambda x: replace_newlines(escape_link(x)), articles)
	
	all_articles = articles
	articles = split_into_pages(articles, 30)
	try:
		requested_page = articles[page_number-1]
	except IndexError:
		requested_page = []
	
	return history_page.render(articles=requested_page,
		num_pages=len(articles),
		page_num=page_number,
		q=q,
		all_articles=all_articles)
Example #7
def show_blacklist(page_number=1):
    blacklist_page = mylookup.get_template('blacklist.html')
    q = unicode_(request.GET.get('q', ''))

    articles = recommend.get_blacklist()

    try:
        page_number = int(page_number)
    except ValueError:
        page_number = 1

    if q:
        qs = q.lower().split()
        articles = filter(lambda x: has_words(qs, x), articles)

    articles = map(lambda x: escape_link(x), articles)

    all_articles = articles
    articles = split_into_pages(articles, 30)
    try:
        requested_page = articles[page_number - 1]
    except IndexError:
        requested_page = []

    return blacklist_page.render(articles=requested_page,
                                 num_pages=len(articles),
                                 page_num=page_number,
                                 q=q,
                                 page='blacklist',
                                 config=get_conf.config)
Example #8
def show_blacklist(page_number=1):
    try:
        page_number = int(page_number)
    except ValueError:
        page_number = 1

    q = unicode_(request.GET.getunicode('q', ''))
    
    html = get_cache('cached_blacklist_{0}_{1}'.format(page_number, hexlify(q.encode('utf8')).decode('utf8')))
    if html:
        return html
    
    blacklist_page = cache['templates'].get('blacklist.html')
    if not blacklist_page:
        blacklist_page = mylookup.get_template('blacklist.html')
        cache['templates']['blacklist.html'] = blacklist_page
    
    # TODO cache data
    articles = recommend.get_blacklist()
    
    if q:
        qs = q.lower().split()
        articles = iter(filter(lambda x: has_words(qs, x), articles))
    
    articles = iter(map(lambda x: escape_link(x), articles))
    
    requested_page = list(get_page(articles, 30, page_num=page_number))
    num_pages = page_number
    try:
        next(articles)
        num_pages += 1
    except StopIteration:
        pass
    
    html = blacklist_page.render(articles=requested_page,
                                 num_pages=num_pages,
                                 page_num=page_number,
                                 q=q, page='blacklist',
                                 config=get_conf.config).decode('utf8')
    if len(requested_page):
        cache_data('cached_blacklist_{0}_{1}'.format(page_number, hexlify(q.encode('utf8')).decode('utf8')), html)
    return html
Example #9
def get_articles():
	g = grab.Grab()
	parser.setup_grab(g)
	
	g.go('http://planet.clojure.in')
	
	css_path = '.entry .article > h2 a'
	summary_texts = []
	for elem in g.css_list(".entry .article"):
		text = ''
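		# slice off the first and last children, presumably the post heading and footer links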
		for children in elem.getchildren()[1:-1]:
			text += unicode_(children.text_content()).strip()
		summary_texts.append(parser.cut_text(text))
			
	posts = parser.get_articles(g, css_path, css_path,
		'planetclojure', 'planet.clojure.in')
	
	for (post, summary_text) in zip(posts, summary_texts):
		post['summary'] = summary_text
	
	return posts
Example #10
def article_list(page_number=1):
    """Show list of articles | Search for articles"""

    main_page = mylookup.get_template("articles.html")
    q = unicode_(request.GET.get('q', ''))

    try:
        page_number = int(page_number)
    except ValueError:
        page_number = 1

    try:
        articles = load_articles()
    except IOError:
        dump_articles()
        articles = load_articles()

    articles = filter_articles(articles)
    if q:
        qs = q.lower().split()
        articles = filter(lambda x: has_words(qs, x), articles.values())
        articles = map(lambda x: escape_link(x), articles)
    else:
        articles = map(lambda x: escape_link(x), articles.values())
    all_articles = articles
    articles = split_into_pages(articles, 30)
    try:
        requested_page = articles[page_number - 1]
        set_liked(requested_page)
    except IndexError:
        requested_page = []

    return main_page.render(articles=requested_page,
                            num_pages=len(articles),
                            page_num=page_number,
                            q=q,
                            page='main',
                            config=get_conf.config,
                            is_parsing=get_var('parsing', '0') == '1')
Example #11
def unescape(text):
    try:
        text = unicode_(text)
    except TypeError:
        pass

    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            try:
                if text[:3] == "&#x":
                    return chr(int(text[3:-1], 16))
                else:
                    return chr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            try:
                text = chr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text

    return TextClassifier.unicode_chr_regex.sub(fixup, text)
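TextClassifier.unicode_chr_regex is not shown in this listing; for the fixup approach above it has to match numeric and named entities, presumably something like:

import re

# assumed shape of the class attribute referenced above
unicode_chr_regex = re.compile(r'&#?\w+;')

# with that pattern: unescape('&amp; &#169; caf&#xe9;') -> '& © café'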
Example #12
def article_list(page_number=1):
	"""Show list of articles | Search for articles"""
	
	main_page = mylookup.get_template("articles.html")
	q = unicode_(request.GET.get('q', ''))
	
	try:
		page_number = int(page_number)
	except ValueError:
		page_number = 1
	
	try:
		articles = load_articles()
	except IOError:
		dump_articles()
		articles = load_articles()
	
	articles = filter_articles(articles)
	if q:
		qs = q.lower().split()
		articles = filter(lambda x: has_words(qs, x[0]), articles)
	
	articles = map(lambda x: replace_newlines(escape_link(x[0])), articles)
	all_articles = articles
	articles = split_into_pages(articles, 30)
	try:
		requested_page = articles[page_number-1]
		set_liked(requested_page)
	except IndexError:
		requested_page = []
	
	return main_page.render(articles=requested_page,
		num_pages=len(articles),
		page_num=page_number,
		q=q,
		all_articles=all_articles,)
Example #13
def article_list(page_number=1):
    """Show list of articles | Search for articles"""
    
    try:
        page_number = int(page_number)
    except ValueError:
        page_number = 1

    q = unicode_(request.GET.getunicode('q', ''))
    if not get_conf.config.enable_random or q:
        html = get_cache('cached_main_{0}_{1}'.format(page_number, hexlify(q.encode('utf8')).decode('utf8')))
        if html:
            return html

    main_page = cache['templates'].get('articles.html')
    if not main_page:
        main_page = mylookup.get_template("articles.html")
        cache['templates']['articles.html'] = main_page
    
    cache_page = False
    
    if get_conf.config.data_format == 'db':
        if q:
            articles = select_all_articles()
            qs = q.lower().split()
            requested_page = []
            j = 0
            k = (page_number - 1) * 30
            n = 0
            append = requested_page.append
            
            for article in articles:
                if has_words(qs, article):
                    j += 1
                    if j > k:
                        n += 1
                        if n == 31:
                            break
                        else:
                            append(escape_link(article))
            
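            # n stops at 31, so ceil(n / 30.0) adds one extra page exactly when
            # at least one more matching article exists beyond the 30 collected here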
            num_pages = page_number - 1 + math.ceil(n / 30.0)
            cache_page = True
        else:
            requested_page = select_articles_from_page(page_number)
            requested_page = list(map(lambda x: escape_link(x), requested_page))
            num_pages = int(get_var('num_pages', 0))
            if num_pages == 0:
                num_pages += 1
                cache_page = False
            else:
                cache_page = True
    else:
        # TODO reduce memory usage
        try:
            articles = load_articles()
        except IOError:
            dump_articles()
            articles = load_articles()
    
        if q:
            qs = q.lower().split()
            articles = iter(filter(lambda x: has_words(qs, x), articles.values()))
            articles = iter(map(lambda x: escape_link(x), articles))
            cache_page = True
        else:
            articles = iter(map(lambda x: escape_link(x), articles.values()))
        
        articles = split_into_pages(articles, 30)
        num_pages = len(articles)
        
        try:
            requested_page = articles[page_number-1]
        except IndexError:
            requested_page = []
    
    if get_conf.config.enable_random and not q:
        random_articles = [escape_link(x) for x in articles_from_list(getRandomArticles(page_number))]
        set_liked(random_articles)
    else:
        random_articles = []
    
    set_liked(requested_page)
    
    html = main_page.render(articles=requested_page,
                            random_articles=random_articles,
                            num_pages=num_pages,
                            page_num=page_number,
                            q=q, page='main',
                            config=get_conf.config,
                            is_parsing=get_var('parsing', '0') == '1').decode('utf8')
    if not cache_page:
        cache_page = bool(len(requested_page))
    
    if cache_page and not get_conf.config.enable_random and not q:
        cache_data('cached_main_{0}_{1}'.format(page_number, hexlify(q.encode('utf8')).decode('utf8')), html)
    return html