Example #1
def ichinese():
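	# Count how often each Chinese character occurs across the (cached) test
	# pages, keep the characters seen at least 40 times ordered by frequency,
	# and save them as a single '|'-joined string.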
	urls = get_test_urls()
	res = defaultdict(int)
	for url in urls:
		html = get_or_cache(url)
		chs = get_chinese(html)
		for ch in chs:
			res[ch] += 1

	res = '|'.join([a for a, b in sorted(filter(lambda x: x[1] >= 40, res.iteritems()), key=lambda x: -x[1])])
	save_json('chineses.json', res)
Example #2
def ichinese():
    urls = get_test_urls()
    res = defaultdict(int)
    for url in urls:
        html = get_or_cache(url)
        chs = get_chinese(html)
        for ch in chs:
            res[ch] += 1

    res = '|'.join([
        a for a, b in sorted(filter(lambda x: x[1] >= 40, res.iteritems()),
                             key=lambda x: -x[1])
    ])
    save_json('chineses.json', res)
Example #3
def article():
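    # Serve an extracted article. On a cache miss, extract the article from the
    # fetched page, gather up to ten sibling links that share the same URL
    # template (keeping the longest anchor text of each), and persist it to
    # MongoDB; on a hit, just re-read the stored links and bump the view count.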
    url = request.args.get('url')

    article = mongo.article.find_one({'_id': url})

    if not article:
        try:
            html = get_or_cache(url)
            article = html2article(html, url, selector=True, merge=True)
            if article and not article['src_name']:
                article['src_name'] = get_domain(url)

            tpl = url2tpl(url)
            urls = html2urls(html, url)
            texts = dict(
                map(lambda x: (x[0], max(x[1], key=lambda y: len(y))),
                    urls.iteritems()))
            tmp = dict(map(lambda x: (x, url2tpl(x)), texts.iterkeys()))

            urls = {}
            for u, t in tmp.iteritems():
                if u != url and t == tpl:
                    urls[u] = texts[u]
                    if len(urls) >= 10:
                        break

            if article:
                article['urls'] = urls
                article['_id'] = url
                article['view'] = 1
                article['last'] = time.time()

                copy = article.copy()
                copy['urls'] = json.dumps(copy['urls'])
                mongo.article.save(copy)
        except:
            pass
    else:
        article['urls'] = json.loads(article['urls'])
        mongo.article.update({'_id': url},
                             {'$set': {
                                 'view': article['view'] + 1
                             }})

    if article:
        article['pubtime'] = article['pubtime'][:10]

    return render_template('extract/article.html', article=article, url=url)
Example #4
def test_article():
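    # Debug endpoint: extract the article at ?url=, merge multi-page content
    # when pagination is detected, and return the article together with the
    # selector the extractor used.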
    debug = request.args.get('debug') == 'true'
    url = request.args.get('url', '')
    if not url.startswith('http://'):
        return 'url does not start with http://'
    add_test_url(url)
    html = get_or_cache(url, print_path=True)
    extractor = ArticleExtractor(html, url, debug=debug)
    article = extractor.article
    selector = extractor.selector
    if extractor.pages and article:
        article['content'] = ArticleMerger(url,
                                           extractor.title,
                                           fetch_urls(extractor.pages,
                                                      handle=get_or_cache),
                                           debug=debug,
                                           **selector).content
    return json.dumps({'url': url, 'article': article, 'selector': selector})
Example #5
def article():
	url = request.args.get('url')

	article = mongo.article.find_one({'_id':url})

	if not article:
		try:
			html = get_or_cache(url)
			article = html2article(html, url, selector=True, merge=True)
			if article and not article['src_name']:
				article['src_name'] = get_domain(url)

			tpl = url2tpl(url)
			urls = html2urls(html, url)
			texts = dict(map(lambda x: (x[0], max(x[1], key=lambda y:len(y))), urls.iteritems()))
			tmp = dict(map(lambda x: (x, url2tpl(x)), texts.iterkeys()))

			urls = {}
			for u, t in tmp.iteritems():
				if u != url and t == tpl:
					urls[u] = texts[u]
					if len(urls) >= 10:
						break

			if article:
				article['urls'] = urls
				article['_id'] = url
				article['view'] = 1
				article['last'] = time.time()

				copy = article.copy()
				copy['urls'] = json.dumps(copy['urls'])
				mongo.article.save(copy)
		except:
			pass
	else:
		article['urls'] = json.loads(article['urls'])
		mongo.article.update({'_id':url}, {'$set':{'view':article['view'] + 1}})

	if article:
		article['pubtime'] = article['pubtime'][:10]

	return render_template('extract/article.html', article=article, url=url)
Example #6
def test_article():
	debug = request.args.get('debug') == 'true'
	url = request.args.get('url', '')
	if not url.startswith('http://'):
		return 'url does not start with http://'
	add_test_url(url)
	html = get_or_cache(url, print_path=True)
	extractor = ArticleExtractor(html, url, debug=debug)
	article = extractor.article
	selector = extractor.selector
	if extractor.pages and article:
		article['content'] = ArticleMerger(
			url,
			extractor.title, 
			fetch_urls(extractor.pages, handle=get_or_cache),
			debug=debug,
			**selector
		).content
	return json.dumps({'url':url, 'article':article, 'selector':selector})
Example #7
	def run(self):
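		# Resolve the site name: prefer the og:site_name <meta> tag; otherwise
		# probe the first 10, 20, ... up to 100 links (sorted by self.score)
		# until get_name() succeeds, then, if a name was found, hand the links
		# to get_sub().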
		html = get_or_cache(self.url)
		doc = clean_html(html, self.url, return_doc=True)
		urls = html2urls(html, self.url, name=False)
		urls = sorted(urls, key=self.score)

		name = tag2text(doc, 'meta', property="og:site_name")
		if name:
			self.name = self.text = name
		else:
			cnt = 10
			while cnt <= 100:
				if self.get_name(urls[:cnt]):
					print self.domain, cnt
					break
				cnt += 10

		if self.name is not None:
			self.get_sub(urls)
Example #8
def test_article_all():
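    # Run the article extractor over every test URL, skipping pages that fail
    # to download and logging URLs for which no article is found.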
    urls = get_test_urls()
    res = []
    for url in urls:
        try:
            html = get_or_cache(url)
        except:
            continue

        try:
            extractor = ArticleExtractor(html, url)
            res.append({
                'url': url,
                'article': extractor.article,
                'selector': extractor.selector,
            })
        except ArticleNotFound, e:
            print url
            print str(e)
            continue
Example #9
def test_segment(url):
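	# Extract the article at `url` (merging paginated content when present)
	# and return its title plus the segmented words of title and body as JSON.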
	url = url.split('#')[0].split('?')[0]
	if not url.startswith('http://'):
		return 'url does not start with http://'
	add_test_url(url)
	html = get_or_cache(url)
	extractor = ArticleExtractor(html, url)
	content = extractor.content
	if extractor.pages:
		content = ArticleMerger(
			url, 
			extractor.title, 
			fetch_urls(extractor.pages, handle=get_or_cache),
			**extractor.selector
		).content
	return json.dumps({
		'url':url, 
		'title': extractor.title,
		'words': segmentor.seg(extractor.title, html2text(content))
	})
Example #10
def test_segment_all():
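	# Batch variant of test_segment(): segment the article text of every test
	# URL and return all results as a single JSON list.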
	urls = get_test_urls()
	res = []
	for url in urls:
		html = get_or_cache(url)
		extractor = ArticleExtractor(html, url)
		content = extractor.content
		if extractor.pages:
			content = ArticleMerger(
				url, 
				extractor.title, 
				fetch_urls(extractor.pages, handle=get_or_cache),
				**extractor.selector
			).content
		res.append({
			'url':url, 
			'title': extractor.title,
			'words': segmentor.seg(extractor.title, html2text(content, code=False))
		})
	return json.dumps(res)
Example #11
def test_article_all():
	urls = get_test_urls()
	res = []
	for url in urls:
		try:
			html = get_or_cache(url)
		except:
			continue

		try:
			extractor = ArticleExtractor(html, url)
			res.append({
				'url':url, 
				'article':extractor.article, 
				'selector':extractor.selector,
			})
		except ArticleNotFound, e:
			print url
			print str(e)
			continue
Example #12
def test_segment(url):
    url = url.split('#')[0].split('?')[0]
    if not url.startswith('http://'):
        return 'url does not start with http://'
    add_test_url(url)
    html = get_or_cache(url)
    extractor = ArticleExtractor(html, url)
    content = extractor.content
    if extractor.pages:
        content = ArticleMerger(
            url, extractor.title,
            fetch_urls(extractor.pages, handle=get_or_cache),
            **extractor.selector).content
    return json.dumps({
        'url': url,
        'title': extractor.title,
        'words': segmentor.seg(extractor.title, html2text(content))
    })
Example #13
def test_segment_all():
    urls = get_test_urls()
    res = []
    for url in urls:
        html = get_or_cache(url)
        extractor = ArticleExtractor(html, url)
        content = extractor.content
        if extractor.pages:
            content = ArticleMerger(
                url, extractor.title,
                fetch_urls(extractor.pages, handle=get_or_cache),
                **extractor.selector).content
        res.append({
            'url': url,
            'title': extractor.title,
            'words': segmentor.seg(extractor.title, html2text(content, code=False))
        })
    return json.dumps(res)
Example #14
def test_article_all():
	urls = get_test_urls()
	res = []
	for url in urls:
		try:
			html = get_or_cache(url)
			extractor = Article(html, url)
			res.append({
				'url':url, 
				'article':extractor.article, 
				'selector':extractor.selector,
			})

			from utils import print_dict
			print_dict(extractor.article)
			print_dict(extractor.selector)
		except:
			print 'error', url
	print '-' * 80
	print len(urls)
	return json.dumps(res)
Example #15
def test(url):
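    # Heuristic site-name probe: collect up to ten shallow same-site links
    # (relaxing the allowed path depth until enough are found), split each
    # page title on '_', '|' or '-', and print the five segments that recur
    # most often; the top one is likely the site name.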
    html = get_or_cache(url)
    urls = html2urls(html, url, name=False)

    words = defaultdict(int)
    u = set()
    for i in urls:
        if i.startswith(url) and len(get_path(i).split('/')) <= 2:
            u.add(i)

    if len(u) < 10:
        for i in urls:
            if i.startswith(url) and len(get_path(i).split('/')) <= 3:
                u.add(i)

        if len(u) < 20:
            for i in urls:
                if i.startswith(url) and len(get_path(i).split('/')) <= 4:
                    u.add(i)

    urls = list(u)[:10]

    for i in urls:
        res = url2meta(i, get=get_or_cache)
        if res is not None:
            if '_' in res['title']:
                for word in res['title'].split('_'):
                    if word.strip():
                        words[word.strip()] += 1
            elif '|' in res['title']:
                for word in res['title'].split('|'):
                    if word.strip():
                        words[word.strip()] += 1
            elif '-' in res['title']:
                for word in res['title'].split('-'):
                    if word.strip():
                        words[word.strip()] += 1
    print_dict(words, cmp_key=lambda x: -x[1], limit=5)
Example #16
def test_article(url):
	debug = request.args.get('debug') == 'true'
	url = url.split('#')[0].split('?')[0]
	if not url.startswith('http://'):
		return 'url does not start with http://'
	add_test_url(url)
	html = get_or_cache(url)
	extractor = Article(html, url, debug=debug)
	article = extractor.article
	selector = extractor.selector

	from utils import print_dict
	print_dict(article)
	print_dict(selector)
	# if extractor.pages:
	# 	article['content'] = ArticleMerger(
	# 		url,
	# 		extractor.title, 
	# 		fetch_urls(extractor.pages, handle=get_or_cache),
	# 		debug=debug,
	# 		**selector
	# 	).content
	return json.dumps({'url':url, 'article':article, 'selector':selector})