Example #1
def getInfo(filename):
    # Assumes `import re`, `from urllib import request`, and a local helper `t`
    # whose getUrl() maps a template filename to the page URL.
    url_name = t.getUrl(filename)
    html = request.urlopen(url_name).read()

    # Match the attributes carrying the view count and the rank,
    # e.g. 'dd title="12345' and 'dl title="678'.
    view_pattern = re.compile(r'dd\stitle="[0-9]+')
    rank_pattern = re.compile(r'dl\stitle="[0-9]+')

    view_result = view_pattern.findall(str(html))
    rank_result = rank_pattern.findall(str(html))

    # Keep only the digits after the opening quote.
    view_num = view_result[0].split("=\"")[1]
    rank_num = rank_result[0].split("=\"")[1]
    print("view_num=", view_num)
    print("rank_num=", rank_num)
    return view_num, rank_num
Example #2
def getContent():
    # Python 2; assumes `import HTMLParser`, `import urllib2`, and a local getUrl module.
    article = {}
    h = HTMLParser.HTMLParser()
    target_url = getUrl.getUrl()
    print target_url
    response = urllib2.urlopen(target_url)
    html = response.read()
    # Pull the page title out of the <title> element.
    flag_title = html.find('<title>')
    flag_title_end = html.find('</title>', flag_title)
    article['title'] = html[flag_title + 7:flag_title_end]
    print article['title']
    # The article body runs from the article-text div to the transcript notice.
    flag_start = html.find('<div class="article-text')
    flag_end = html.find('The above text is a transcript of this podcast',
                         flag_start)
    flag_end = flag_end - 8
    print flag_start
    print flag_end
    # Collect every <p>...</p> block inside that range.
    paragraphs = []
    cursor = flag_start
    while cursor < flag_end:
        flag_p = html.find('<p>', cursor)
        flag_p_end = html.find('</p>', flag_p)
        paragraphs.append(html[flag_p + 3:flag_p_end])
        cursor = flag_p_end
    # Unescape entities, strip <em> and <a> markup, and join the paragraphs.
    text = article['title'] + '\n\n\n\n'
    for para in paragraphs:
        para = h.unescape(para)
        para = para.replace('<em>', '')
        para = para.replace('</em>', '')
        para = para.replace('</a>', '')
        flag_a = para.find('<a')
        while flag_a != -1:
            flag_a_end = para.find('>', flag_a)
            para = para[:flag_a] + para[flag_a_end + 1:]
            flag_a = para.find('<a')
        # print para
        text += para + '\n\n'
    article['content'] = text
    return article
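The same scrape can be written without the manual index arithmetic. Below is a rough BeautifulSoup sketch of the title-and-paragraph step; the helper name and the exact class name are assumptions for illustration, and the transcript-notice cutoff above is not reproduced:

from bs4 import BeautifulSoup

def getContentSoup(html):
    # Hypothetical alternative: let BeautifulSoup locate the title and paragraphs.
    soup = BeautifulSoup(html, "html.parser")
    title = soup.title.string
    body = soup.find("div", class_="article-text")
    paragraphs = [p.get_text() for p in body.find_all("p")]
    return title + "\n\n\n\n" + "\n\n".join(paragraphs)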
Example #3
File: crawling.py  Project: min16/crawling
import glob, os
from create import createExcel, createCompanyInfo
from getUrl import getUrl

# Collect every HTML template in the project.
files = []
for filename in glob.glob('templates/*.html'):
    files.append(filename)

# Extract the listing links from each template.
link_list = []
for filename in files:
    link_list += getUrl(filename)

# print(link_list)
# print(len(link_list))

# Build one row of company data per link and write everything to Excel.
company_list = []
for link in link_list:
    company_list.append(list(createCompanyInfo(link).__dict__.values()))

createExcel(company_list)
Example #4
File: web.py  Project: ch0ck/proxy-inject
def crawl():
    # Form handler (Bottle-style `request`/`template` assumed): build a crawler
    # for the submitted target URL, run it, and render the result page.
    target = request.forms.get('target')
    collection = getUrl(target)
    collection.run()
    return template('html/crawl.html', target=target)
Example #5
from getUrl import getUrl

# Fetch raw quote strings from the Tencent (qt.gtimg.cn) and Sina (hq.sinajs.cn)
# stock-quote endpoints and print them.
ret = getUrl("http://qt.gtimg.cn/q=sz000858")
ret2 = getUrl("http://hq.sinajs.cn/list=sh601006")
print ret
print ret2
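
Every example on this page imports getUrl from its own local module, so the helper behaves differently from project to project (returning a page body, a list of links, or a crawler object). As a rough stand-in only, and not any of these projects' actual code, a page-fetching version might look like:

import urllib.request

def getUrl(url):
    # Hypothetical stand-in: fetch the page and return its body as text.
    # The encoding is an assumption; the Python 2 examples would use urllib2 instead.
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode('utf-8', errors='replace')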

Example #6
# -*- coding: utf-8 -*-
import sys
from getUrl import getUrl
import simplejson as json
from bs4 import BeautifulSoup
import time

newscount = 30
# Fetch the ifeng.com stock channel front page.
ret = getUrl("http://finance.ifeng.com/stock/")
#retjson = json.JSONDecoder().decode(ret)
#num = len(retjson["data"])
#print num
#i = 1
#content = getUrl("http://toutiao.com/"+element["source_url"])
#print content
#soup = BeautifulSoup(content, "html.parser")

# Parse the page and print its title and every link on it.
soup = BeautifulSoup(ret, "html.parser")
print soup.title
print soup.find_all("a")
#print soup.get_text()
Example #7
File: main.py  Project: LIMITob/reptile
#!/usr/bin/env python
# coding=utf-8
# jetson@2014-12-19 21:19:23

import time,os
import threading
import getUrl
import reptile


cnt = 0
url = getUrl.getUrl('http://tieba.baidu.com/f/good?kw=%E6%B0%B8%E5%A4%9C%E5%90%9B%E7%8E%8B&ie=utf-8&cid=4&pn=')
lock = threading.Lock()
def downLoad(no, a):
    '''
    Each thread handles `no` pages: thread `a` takes the a-th slice of the URL list.
    '''
    lock.acquire()
    if (a+1)*no > len(url):
        url_part = url[a*no:]
    else:
        url_part = url[a*no:(a+1)*no]
    lock.release()

    for i in url_part:
        reptile.grap1page(i)


def assignTask():
    thread_num = len(url)/20 + 1
    print '%d threads will start' % thread_num
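
The snippet is cut off at this point in the source. A sketch of how assignTask might go on to launch the worker threads (an assumption for illustration, not the project's actual code):

def assignTask():
    thread_num = len(url)/20 + 1
    print '%d threads will start' % thread_num
    threads = []
    for a in range(thread_num):
        # Hypothetical continuation: give each thread a 20-URL slice,
        # matching how downLoad(no, a) indexes into `url`.
        t = threading.Thread(target=downLoad, args=(20, a))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()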
Example #8
# -*- coding: utf-8 -*-
import sys
from getUrl import getUrl
import simplejson as json
from bs4 import BeautifulSoup
import time

newscount = 30
# Ask the Toutiao API for the latest finance articles.
ret = getUrl("http://toutiao.com/api/article/recent/?source=2&count=%s&category=news_finance&max_behot_time=%s&utm_source=toutiao&offset=0" % (newscount, time.time()))
retjson = json.JSONDecoder().decode(ret)
num = len(retjson["data"])
print num
i = 1
for element in retjson["data"]:
    print element["title"]
    print element["source_url"]
    #content = getUrl("http://toutiao.com/"+element["source_url"])

    # Fetch and parse each article page, then print the article body container.
    print element["article_url"]
    content = getUrl(element["article_url"])
    #print content
    soup = BeautifulSoup(content, "html.parser")
    print soup.title
    print soup.find_all("div", class_="article-content  pgc_top_banner")
    #print soup.get_text()
    print "----------***********-----"
    print "----------***********-----:%d" % (i)
    #print content
    i = i + 1
Example #9
def download():
    # Assumes `import os`, `from datetime import date`, and a local getUrl helper.
    url = getUrl()
    print url
    # Shell out to wget and save the image as wallpapers/wallpaper<date>.jpg.
    os.system('wget ' + str(url) + " -O " + str(os.getcwd()) +
              "/wallpapers/wallpaper" + str(date.today()) + ".jpg")