import re
from urllib import request

import t  # local helper module that maps a filename to its page URL


def getInfo(filename):
    url_name = t.getUrl(filename)
    html = request.urlopen(url_name).read()
    # Match the view count and rank embedded in dd/dl title attributes
    view_pattern = re.compile(r'dd\stitle="[0-9]+')
    rank_pattern = re.compile(r'dl\stitle="[0-9]+')
    view_result = view_pattern.findall(str(html))
    rank_result = rank_pattern.findall(str(html))
    view_num = view_result[0].split('="')[1]
    rank_num = rank_result[0].split('="')[1]
    print("view_num=", view_num)
    print("rank_num=", rank_num)
    return view_num, rank_num
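
# A minimal usage sketch for getInfo (not part of the original source);
# "video.html" is a hypothetical filename that t.getUrl is assumed to
# resolve to a page URL.
if __name__ == '__main__':
    views, rank = getInfo("video.html")
    print("views:", views, "rank:", rank)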
import HTMLParser
import urllib2

import getUrl


def getContent():
    article = {}
    h = HTMLParser.HTMLParser()
    target_url = getUrl.getUrl()
    print target_url
    response = urllib2.urlopen(target_url)
    html = response.read()
    # Extract the page title
    flag_title = html.find('<title>')
    flag_title_end = html.find('</title>', flag_title)
    article['title'] = html[flag_title + 7:flag_title_end]
    print article['title']
    # The article body runs from the opening div to the transcript notice
    flag_start = html.find('<div class="article-text')
    flag_end = html.find('The above text is a transcript of this podcast', flag_start)
    flag_end = flag_end - 8
    print flag_start
    print flag_end
    # Collect every <p>...</p> block inside the body
    paragraphs = []
    cursor = flag_start
    while cursor < flag_end:
        flag_p = html.find('<p>', cursor)
        if flag_p == -1 or flag_p >= flag_end:
            break  # no more paragraphs inside the body
        flag_p_end = html.find('</p>', flag_p)
        paragraphs.append(html[flag_p + 3:flag_p_end])
        cursor = flag_p_end
    text = article['title'] + '\n\n\n\n'
    for para in paragraphs:
        # Unescape entities, then strip <em> markup and anchor tags
        para = h.unescape(para)
        para = para.replace('<em>', '')
        para = para.replace('</em>', '')
        para = para.replace('</a>', '')
        flag_a = para.find('<a')
        while flag_a != -1:
            flag_a_end = para.find('>', flag_a)
            para = para[:flag_a] + para[flag_a_end + 1:]
            flag_a = para.find('<a')
        text += para + '\n\n'
    article['content'] = text
    return article
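
# A minimal usage sketch for getContent (not part of the original source);
# it assumes getUrl.getUrl() yields a reachable article URL, and the
# output filename article.txt is hypothetical.
if __name__ == '__main__':
    article = getContent()
    with open('article.txt', 'w') as f:
        f.write(article['content'])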
import glob

from create import createExcel, createCompanyInfo
from getUrl import getUrl

# Collect every HTML template on disk
files = []
for filename in glob.glob('templates/*.html'):
    files.append(filename)

# Extract the links found in each template
link_list = []
for filename in files:
    link_list += getUrl(filename)

# Build one row of company attributes per link and write the sheet
company_list = []
for link in link_list:
    company_list.append(list(createCompanyInfo(link).__dict__.values()))

createExcel(company_list)
from bottle import request, template  # request.forms and template() suggest Bottle

from getUrl import getUrl


def crawl():
    # Read the crawl target from the submitted form, run the crawler,
    # then render the results page
    target = request.forms.get('target')
    collection = getUrl(target)
    collection.run()
    return template('html/crawl.html', target=target)
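
# A minimal sketch of wiring the handler into an app (not part of the
# original source); the route path and port below are assumptions.
from bottle import route, run

route('/crawl', method='POST')(crawl)

if __name__ == '__main__':
    run(host='localhost', port=8080)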
from getUrl import getUrl

ret = getUrl("http://qt.gtimg.cn/q=sz000858")
ret2 = getUrl("http://hq.sinajs.cn/list=sh601006")
print ret
print ret2
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

from getUrl import getUrl

# Fetch the ifeng finance front page and list its title and links
ret = getUrl("http://finance.ifeng.com/stock/")
soup = BeautifulSoup(ret, "html.parser")
print soup.title
print soup.find_all("a")
#!/usr/bin/env python
# coding=utf-8
# jetson@2014-12-19 21:19:23
import os
import threading
import time

import getUrl
import reptile

cnt = 0
url = getUrl.getUrl('http://tieba.baidu.com/f/good?kw=%E6%B0%B8%E5%A4%9C%E5%90%9B%E7%8E%8B&ie=utf-8&cid=4&pn=')
lock = threading.Lock()


def downLoad(no, a):
    '''Each thread handles `no` pages.'''
    # Take this thread's slice of the URL list under the lock
    lock.acquire()
    if (a + 1) * no > len(url):
        url_part = url[a * no:]
    else:
        url_part = url[a * no:(a + 1) * no]
    lock.release()
    for i in url_part:
        reptile.grap1page(i)


def assignTask():
    thread_num = len(url) / 20 + 1
    print '%d threads will start' % thread_num
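
# A minimal sketch of launching the workers (not part of the original
# source; assignTask is truncated above). runAll is a hypothetical helper
# that assumes 20 pages per thread, matching thread_num.
def runAll():
    thread_num = len(url) / 20 + 1
    threads = []
    for a in range(thread_num):
        t = threading.Thread(target=downLoad, args=(20, a))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()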
# -*- coding: utf-8 -*-
import time

import simplejson as json
from bs4 import BeautifulSoup

from getUrl import getUrl

newscount = 30

# Pull the latest finance headlines from the Toutiao recent-articles API;
# time.time must be called, not passed, to fill in max_behot_time
ret = getUrl("http://toutiao.com/api/article/recent/"
             "?source=2&count=%s&category=news_finance"
             "&max_behot_time=%s&utm_source=toutiao&offset=0"
             % (newscount, time.time()))
retjson = json.JSONDecoder().decode(ret)
num = len(retjson["data"])
print num

i = 1
for element in retjson["data"]:
    print element["title"]
    print element["source_url"]
    print element["article_url"]
    # Fetch and parse each article page
    content = getUrl(element["article_url"])
    soup = BeautifulSoup(content, "html.parser")
    print soup.title
    print soup.find_all("div", class_="article-content pgc_top_banner")
    print "----------***********-----"
    print "----------***********-----:%d" % i
    i = i + 1
import os
from datetime import date

from getUrl import getUrl


def download():
    # Fetch today's wallpaper URL and save it under wallpapers/ with a
    # dated filename via wget
    url = getUrl()
    print url
    os.system('wget ' + str(url) + ' -O ' + os.getcwd()
              + '/wallpapers/wallpaper' + str(date.today()) + '.jpg')
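
# A minimal usage sketch (not part of the original source); it assumes a
# wallpapers/ directory already exists under the current working directory.
if __name__ == '__main__':
    download()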