Example #1
import argparse

import parse_url


def main():
    list1 = []
    Parser = argparse.ArgumentParser(description = "WebCrawler")
    Parser.add_argument('-url',help = 'Initial url', nargs = '?', default = "https://www.python.org", type = str)
    Parser.add_argument('-depth',help = 'Depth for crawl', nargs = '?', default = 2, type = int)
    args = Parser.parse_args()
    argsdict = vars(args)
    url = argsdict['url']
    depth = argsdict['depth']
    if url is None or depth is None:
        print "Arguments empty. Please enter valid url and depth"
        exit(0)
    print "Url considered - ",url
    print "Going for",depth,"number of depths"
    parse_url.parse_url(url,list1,depth)
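Assuming the script above is saved as crawler.py (the file name is an assumption), it can be run from the command line like this, with both options falling back to their defaults when omitted:

python crawler.py -url https://www.python.org -depth 3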
Example #2
import cStringIO
import pycurl

from parse_url import parse_url


def fetch(url, proxy=None, ua=None, cookie=None, verbose=False):
    buf = cStringIO.StringIO()
    c = pycurl.Curl()

    c.setopt(c.URL, url)

    if cookie:
        c.setopt(c.COOKIEFILE, cookie)
        c.setopt(c.COOKIEJAR, cookie)

    if ua:
        c.setopt(c.USERAGENT, ua)

    if proxy:
        proxy = parse_url(proxy)
        c.setopt(c.PROXY, proxy['host'])
        c.setopt(c.PROXYUSERPWD, "%s:%s" % (proxy['user'], proxy['pass']))
        if proxy['port']:
            c.setopt(c.PROXYPORT, int(proxy['port']))

    # c.setopt(c.CONNECTTIMEOUT, 10)
    # c.setopt(c.TIMEOUT, 20)
    c.setopt(c.FAILONERROR, True)
    c.setopt(c.VERBOSE, verbose)
    c.setopt(c.FOLLOWLOCATION, 1)
    c.setopt(c.WRITEFUNCTION, buf.write)
    c.setopt(c.HTTPHEADER, ['Accept: text/html', 'Accept-Charset: UTF-8'])

    c.perform()
    body = buf.getvalue()
    buf.close()

    return body
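A minimal usage sketch for this fetch helper (Python 2, since it relies on cStringIO; the URL, User-Agent, and cookie file name are placeholders):

html = fetch("https://www.python.org", ua="Mozilla/5.0", cookie="cookies.txt")
print html[:200]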
Example #3
    def run(self):
        url_list = self.get_list_url()

        for url in url_list:
            print('requesting url', url)
            pythonJson = parse_url(url)
            print('got pythonJson', pythonJson)
            pageNum = url_list.index(url) + 1
            self.save_json(pythonJson, pageNum)
Example #4
def home():
    ha = None
    if request.method == "POST":
        url = request.form.get("search")
        limit = int(request.form.get("limit"))
        urlParsed = parse_url(url)
        if urlParsed == -1:
            ha = {"errors": "Not a valid amazon product url."}
        else:
            ha = get_data(urlParsed, limit)

    return render_template("index.html", text=ha)
Example #5
 def get_content_list(self, url):
     html_str = parse_url(url)
     html = etree.HTML(html_str)
     content_list = []
     for div in html.xpath("//div[@id='content-left']"):
         item = {}
         item['content'] = div.xpath('.//div[@class="content"]/span/text()')
         item['author_gender'] = div.xpath(
             '..//div[contains(@class,"articleGender")]/@class')
         item['author_gender'] = item['author_gender'][0].split(
             ' ')[-1].replace(
                 'Icon', '') if len(item['author_gender']) > 0 else None
         content_list.append(item)
     return content_list
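Many of the Python 3 examples on this page import a parse_url helper from a local parse_url module whose definition is not shown. A minimal sketch of what such a helper might look like, assuming it simply downloads the page with requests and returns the decoded body (the User-Agent is copied from Example #21; everything else is an assumption):

import requests

# Hypothetical stand-in for the parse_url helper imported by these examples.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/71.0.3578.98 Mobile Safari/537.36",
}


def parse_url(url, headers=None):
    # Fetch the page and return the response body as text.
    response = requests.get(url, headers=headers or DEFAULT_HEADERS, timeout=10)
    response.raise_for_status()
    return response.content.decode()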
Example #6
    def get_img_list(self,detail_url):
        #     3.1 extract the url and title of each list page
        #     3.2 request the list url and get the first page of the detail page
        if detail_url is not None:
            detail_html_str = parse_url(detail_url)
            detail_html = etree.HTML(detail_html_str)
            #     3.3 extract the images from the first detail page and the url of the next page
            img_list = detail_html.xpath('//img[@class="BDE_Image"]/@src')
            print(detail_html_str)

        #     3.4 get the details
        else: 
            img_list = []
        return img_list
Example #7
 def run(self):
     num = 0
     total = 100
     while num < total + 10:  # pay close attention to this loop condition
         # 1. build the request URL
         url = self.temp_url.format(num)
         # 2. request the URL; the response body is a JSON string
         html_str = parse_url(url)
         print(url)
         # 3. convert the JSON string to a dict and extract the useful content
         content_list, total = self.get_content_list(html_str)
         # 4. convert the extracted content back to JSON and save it locally (or send it back to the browser)
         self.save_content_list(content_list)
         # 5. build the next URL to fetch the next batch of records
         num += 10
Example #8
 def run(self):
     # 1.start_url
     num = 0
     total = 100
     while num < total + 18:
         # 2. send the request
         url = self.start_temp(num)
         json_str = parse_url(url)
         # 3. extract the data
         content_list, total = self.get_content_list(json_str)
         # 4. save
         self.save_content_list(content_list)
         # if len(content_list < 18):
         #     break
         # 5. build the url of the next page
         num += 18
Example #9
    def run(self):
        # 1. iterate over the url list and send requests
        num = 0

        with open("AmericanTv.json", "w", encoding="utf-8") as f:
            while num < self.get_total_tvs():

                url = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?start=" + str(
                    num) + "&count=18&loc_id=108288"

                # 2. get the response data
                data = parse_url(url)

                # 3. save locally
                json.dump(data, f, indent=2, ensure_ascii=False)

                num = int(num) + 18
Example #10
 def run(self):
     # 1. start_url
     pn = 0
     take = 50
     while pn <= 0:
         url = self.start_url.format(pn * take)
         html_str = parse_url(url)
         content_list = self.get_content_list(html_str)
         # 2. send the request and get the response
         # 3. extract the data and the url of the next page
         #     3.1 extract the url and title of each list page
         #     3.2 request the list url and get the first page of the detail page
         #     3.3 extract the images from the first detail page and the url of the next page
         #     3.4 get the details
         # 4. save
         self.save_content_list(content_list)
         # 5. request the next page
         pn += 1
Example #11
import os
from itertools import count
from pathlib import Path
from shutil import copyfileobj

import requests

from parse_url import parse_url


def main(url1, url2, out_dir):
    image_url = parse_url(url1, url2)
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    file_ext = image_url[-4:]
    for i in count(1, 1):
        try:
            r = requests.get(image_url.format(i), stream=True)
            if r.status_code == 200:
                img_file = os.path.join(out_dir, str(i) + file_ext)
                with open(img_file, 'wb') as f:
                    r.raw.decode_content = True
                    copyfileobj(r.raw, f)
            else:
                print('image ' + str(i) + ' not found')
                break
        except requests.exceptions.RequestException as e:
            raise SystemExit(e)
        else:
            print('done image ' + str(i))
Example #12
def fetch(url, proxy = None, ua = None, cookie = None, verbose = True):
    from parse_url import parse_url
    import pycurl
    import cStringIO

    buf = cStringIO.StringIO()
    c = pycurl.Curl()

    c.setopt(c.URL, url)

    if cookie:
        c.setopt(c.COOKIEFILE, cookie)
        c.setopt(c.COOKIEJAR, cookie)

    if ua:
        c.setopt(c.USERAGENT, ua)

    if proxy:
        proxy = parse_url(proxy)
        c.setopt(c.PROXY, proxy['host'])
        c.setopt(c.PROXYPORT, int(proxy['port']))
        c.setopt(c.PROXYUSERPWD, "%s:%s" % (proxy['user'], proxy['pass']))

    # c.setopt(c.CONNECTTIMEOUT, 10)
    # c.setopt(c.TIMEOUT, 20)
    c.setopt(c.FAILONERROR, True)
    c.setopt(c.VERBOSE, verbose)
    c.setopt(c.FOLLOWLOCATION, 1)
    c.setopt(c.WRITEFUNCTION, buf.write)
    c.setopt(c.HTTPHEADER, ['Accept: text/html', 'Accept-Charset: UTF-8'])
    try:
        c.perform()
        status = c.getinfo(c.HTTP_CODE)
        body = buf.getvalue()
        buf.close()
        return {'status': status, 'response': body}
    except pycurl.error, error:
        errno, errstr = error
        return {'status': errno, 'response': errstr}
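A minimal usage sketch for this variant, which returns both the status and the body (Python 2 syntax to match the example; the URL and User-Agent are placeholders):

result = fetch("https://www.python.org", ua="Mozilla/5.0")
if result['status'] == 200:
    print result['response'][:200]
else:
    print "request failed:", result['status'], result['response']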
Example #13
def main():
    url = "http://127.0.0.1:8000/return_json"
    html_str = parse_url(url)

    ret1 = json.loads(html_str)
    # pprint(ret1)

    # with open("douban.json","w",encoding="utf-8") as f:
    #     f.write(json.dumps(ret1,ensure_ascii=False,indent=2))
    #
    # with open("douban.json","r",encoding="utf-8") as f:
    #     ret2 = f.read()
    #     ret3 = json.loads(ret2)
    #     print(ret3)

    # use json.load to read data from a file-like object
    with open("douban.json", "r", encoding="utf-8") as f:
        ret4 = json.load(f)
        print(ret4)
        print(type(ret4))

    # use json.dump to write a Python object into a file-like object
    with open("douban1.json", "w", encoding="utf-8") as f:
        json.dump(ret1, f, ensure_ascii=False, indent=2)
Example #14
df = get_reports.get_report(report, client, ids, downloader)
dictionary = parse_url.init_dictionary(dictfile)

elapsed = (time.time() - start)/60
print "Run time: " + str(elapsed)
start = time.time()

print "Creating headers"
df, urlparams = parse_url.create_headers(df)

elapsed = (time.time() - start)/60
print "Run time: " + str(elapsed)
start = time.time()

print "Parsing URL"
df = parse_url.parse_url(df)

elapsed = (time.time() - start)/60
print "Run time: " + str(elapsed)
start = time.time()

print "Checking values"
df = parse_url.typo_correct(urlparams, df, dictionary)
today = "".join(l for l in str(date.today()) if l not in string.punctuation and l not in " ")
filename = report['reportType'] + today + '.csv'

df.to_csv(filename)
elapsed = (time.time() - start)/60
print "Run time: " + str(elapsed)

total_elapsed = (time.time() - total_start)/60
Example #15
import re, json
from parse_url import parse_url

url = 'https://36kr.com/'
html_str = parse_url(url)
ret = re.findall('<script>var props=(.*?),locationnal=', html_str)[0]
with open('36kr.json', 'w', encoding='utf-8') as f:
    f.write(ret)
ret = json.loads(ret)

print(ret)
Example #16
import json

from parse_url import parse_url
from pprint import pprint

url = "https://m.douban.com/rexxar/api/v2/subject_collection/movie_showing/items?start=0&count=18&loc_id=108288"
# url = "https://www.baidu.com"
html_str = parse_url(url)

# use json.loads to convert the JSON string into a Python object
ret = json.loads(html_str)
# pprint(ret1)
# print(type(ret1))

# json.dumps converts a Python object into a JSON string; file.write needs a string
with open("douban.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(ret, ensure_ascii=False, indent=2))  # keep non-ASCII text readable, pretty-printed with a 2-space indent
with open("douban.json", "r") as f:
    ret2 = f.read()
    ret3 = json.loads(ret2)
    print(ret3)
    print(type(ret3))

# use json.load to read data from a file-like object
with open("douban.json", "r", encoding="utf-8") as f:
    ret4 = json.load(f)
    print(ret4)
    print(type(ret4))
# use json.dump to write a Python object into a file-like object
with open("douban.json", "w", encoding="utf-8") as f:
    json.dump(ret4, f, ensure_ascii=False, indent=2)
Example #17
 def get_total(self):
     html_str = parse_url(self.url)
     # json.loads converts the JSON string into a Python object
     ret1 = json.loads(html_str)
     total = ret1["total"]
     return total
Example #18
import urllib3

urllib3.disable_warnings()

import requests
import json
import re
from parse_url import parse_url

proxies = {
    'http': 'http://110.86.137.0:9999',
    'https': 'http://110.86.137.0:9999'
}
ex_url = 'https://movie.douban.com/j/search_subjects?type=movie&tag={}&page_limit=1000&page_start=0'

url = 'https://movie.douban.com/j/search_tags?type=movie&source=index'

response_dict = json.loads(parse_url(url))

move_name_list = response_dict["tags"]
print(move_name_list)

url_list = [ex_url.format(i) for i in move_name_list]

print(url_list)

for url in url_list:

    html_str = parse_url(url)
    print(html_str)
    # convert the JSON string into a Python object
    response = json.loads(html_str)
    print(re.findall(r'tag=(.*?)&', url))
Example #19
 def data(self, name):
     for url in self.L_url:
         self.html_str = parse_url(url)
         ret = json.loads(self.html_str)
         with open(name, "a", encoding="utf-8") as f:
             f.write(json.dumps(ret, ensure_ascii=False, indent=4))
Example #20
    def get_total_tvs(self):
        """ 获取总记录数据 """
        data = json.loads(parse_url(self.start_url))

        return int(data["total"])
Example #21
# coding=utf-8
import json
import re
from parse_url import parse_url

url = "http://36kr.com/"
headers = {
    "User-Agent":
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36",
}

htmlStr = parse_url(url, headers)

ret = re.findall("<script>var props=(.*?),locationnal=", htmlStr)[0]

with open("./res/36kr.json", "w", encoding='utf-8') as f:
    f.write(ret)

ret = json.loads(ret)
print(ret)