def parse_artical_list(url):
    # pdb.set_trace()
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Get the info from soup.
        x1 = Spider.getAttrListForXPath(soup, '.mh-main .loop-title', None,
                                        {'url': ['a', 'href']})
        # Process:
        res = []
        for link in x1:
            s = link.get('url')[0]
            if s.startswith('/'):
                res.append(ROOT + s)
            else:
                res.append(s)
        return res
    except Exception, e:
        print '[ERROR] parse_first_page', url, str(e)
        return None
def py_main():
    data = DBUtil.get_email_data()
    email_address = data[1]
    email_password = data[2]
    print(data)
    robot_send_email(email_address)  # Send the email
    print("Email sent")
    emailcode = robot_get_emailcode(email_address, email_password)  # Read the verification code from the mailbox
    print("Mailbox verification code: " + emailcode)
    WebTools.register(email_address, emailcode)  # Register on the site
    print("Registration succeeded")
    content = WebTools.login_to_user(email_address)  # Log in and open the user center
    print("Login succeeded")
    my_vmess = Spider.change_vmess(Spider.get_vmess(content))  # Fetch the subscription address and rewrite it to ours
    print("Subscription is ready, forwarding can start!!!")
    return my_vmess
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=Spider.ComposeUrl(self.url, x[0]), title=x[1])
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            article['time'] = datetime.datetime.strptime(y[0], '%Y-%m-%d')
            if not self.CheckNewArticle(article):
                logging.debug('Article source {0} is not a new article.'.format(article['url']))
                continue
            content = y[1]
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
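# The CatchArticles variants in this section all rely on a Spider.ComposeUrl helper to
# resolve possibly-relative links against the page URL. That helper is not shown here;
# the sketch below is only an assumption that it wraps urllib.parse.urljoin.
from urllib.parse import urljoin

def ComposeUrl(baseUrl, link):
    # urljoin handles absolute URLs, root-relative paths ('/a/b.html') and
    # document-relative paths ('img/1.jpg') alike.
    return urljoin(baseUrl, link)

# Example: ComposeUrl('http://example.com/news/list.html', 'img/1.jpg')
# -> 'http://example.com/news/img/1.jpg'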
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(
            time=datetime.datetime.strptime(x[0], '%Y-%m-%d'),
            # url = self.url[0:self.url.rfind('/')] + x[1][1:],
            url=Spider.ComposeUrl(self.url, x[1]),
            title=x[2])
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                # imageUrl = article['url'][0:article['url'].rfind('/')] + z[1:]
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=Spider.ComposeUrl(self.url, x[0]),
                       title=x[1],
                       time=datetime.datetime.strptime(x[2], '%Y-%m-%d'))
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(
                    article['url'],
                    urllib.parse.quote(z) if z[0] in ['/', '.'] else z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                image['imageUrl'] = Spider.ComposeUrl(article['url'], z)
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
def __index__(self):
    lists = Article.query.all()
    for item in lists:
        db.session.delete(item)
    db.session.commit()
    Spider.getHyArtical()
    lists = Article.query.all()
    size = len(lists)
    return self.render('admin/article.html', lists=lists, size=size)
def __init__(self):  # ,self.handle.application
    self.data = []
    httpd = make_server('', 1234, self.handle)
    print('Server HTTP on port 1234...')
    # Instantiate the Application class
    self.app = Application()
    # Instantiate the Spider class
    self.spider = Spider()
    httpd.serve_forever()
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recPage = re.compile(
        '<OPTION value=([^>\s]+?)(?:\s[^>]*?)*?>[^<]*?</OPTION>', re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=Spider.ComposeUrl(self.url, x[0]),
                       title=x[1],
                       time=datetime.datetime.strptime(x[2], '%Y-%m-%d'))
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        totalContent = ''
        images = []
        imageCount = 0
        pageUrls = recPage.findall(html)
        if len(pageUrls) == 0:
            pageUrls += [article['url']]
        for p in pageUrls:
            pageUrl = Spider.ComposeUrl(article['url'], p)
            if pageUrl != article['url']:
                html = self.DownLoadHtml(pageUrl, 'Failed to access article page {0}; exception: {1}')
                if html == None:
                    continue
            content = None
            for y in recArticle.findall(html):
                content = y
                for z in recImage.findall(content):
                    imageCount += 1
                    imageUrl = Spider.ComposeUrl(article['url'], z)
                    image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                    if image == None:
                        continue
                    images.append(image)
            if content != None:
                totalContent += content
        if totalContent == '' \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, totalContent, images, 'Successfully extracted article from {0}')
    return self.articles
def __init__(self, word):
    '''
    Constructor to crawl web for a word
    '''
    self.word = word
    sp = Spider(word, spread=2, limit=0.01)
    self.web = sp.crawl('Graph.shelve')  # Crawled web
    self.graph = Shelveopen('Graph.shelve')
    self.paths = []             # To store all paths
    self.scores = []            # To store corresponding path scores
    self.clientfeatures = []    # Feature vector for client
    self.standardfeatures = []  # To compare against
def CatchArticles(self):
    abstracts = None
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article abstract API {0}; exception: {1}')
    if html == None:
        return self.articles
    try:
        html = html.replace('null', 'None')
        abstracts = eval(html)
    except Exception as e:
        logging.warn('Article abstract data {0} has an unexpected format; exception: {1}'.format(html, str(e)))
        return self.articles
    for x in abstracts['contents']:
        try:
            article = dict(
                url=Spider.ComposeUrl(
                    self.url,
                    '/{0}/{1}.jhtml'.format(x['channel_path'], x['contentId'])),
                title=x['title'])
            html = super().DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
            if html == None:
                continue
            content = None
            images = []
            imageCount = 0
            for y in recArticle.findall(html):
                article['time'] = datetime.datetime.strptime(y[0], '%Y-%m-%d %H:%M:%S')
                if not self.CheckNewArticle(article):
                    logging.debug('Article source {0} is not a new article.'.format(article['url']))
                    continue
                content = y[1]
                for z in recImage.findall(content):
                    imageCount += 1
                    imageUrl = Spider.ComposeUrl(article['url'], z)
                    image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                    if image == None:
                        continue
                    images.append(image)
            if not content \
               or imageCount != len(images):
                continue
            self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
        except Exception as e:
            logging.warn('Article detail data {0} has an unexpected format; exception: {1}'.format(str(x), str(e)))
            continue
    return self.articles
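# The variant above parses the abstract endpoint by rewriting 'null' to 'None' and calling
# eval. If the endpoint returns JSON (an assumption), json.loads handles null natively and
# avoids executing the response as code; a minimal sketch with made-up data:
import json

sample = '{"contents": [{"contentId": 1, "channel_path": "news", "title": "t"}], "extra": null}'
abstracts = json.loads(sample)
print(abstracts['extra'])                 # None
print(abstracts['contents'][0]['title'])  # 't'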
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recDate = re.compile('Details([\d-]+?).html', re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(
            url=Spider.ComposeUrl(self.url, x[0]),
            title=x[1].strip()
        )
        for w in recDate.findall(article['url']):
            try:
                article['time'] = datetime.datetime.strptime('20{0}'.format(w[0:8]), '%Y-%m-%d')
            except Exception as e:
                logging.warn('Could not parse the publication date from article source {0}; exception: {1}'.format(article['url'], str(e)))
                continue
        # logging.debug(str(article))
        if not 'time' in article:  # Ignore external links that do not match the expected format
            continue
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
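# The variant above recovers the publication date from the article URL itself via the
# 'Details([\d-]+?).html' pattern. A standalone illustration of that parse, using a
# made-up URL purely for demonstration:
import datetime
import re

recDate = re.compile(r'Details([\d-]+?).html', re.DOTALL)
url = 'http://example.com/news/Details17-03-05-001.html'
for w in recDate.findall(url):
    # '17-03-05' becomes '2017-03-05'
    print(datetime.datetime.strptime('20{0}'.format(w[0:8]), '%Y-%m-%d'))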
def initializeSpiders():
    homePageList = ['http://finans.mynet.com/borsa/hisseler/',
                    'http://finans.mynet.com/borsa/hisseler/c-e/',
                    'http://finans.mynet.com/borsa/hisseler/f-j/',
                    'http://finans.mynet.com/borsa/hisseler/k-q/',
                    'http://finans.mynet.com/borsa/hisseler/r-z/']
    for i in range(0, 5):
        Spider(PROJECT_NAME, homePageList[i], DOMAIN_NAME)
def test_link_extraction(self):
    resp = requests.get("https://docs.python.org/2/library/os.path.html")
    soup = BeautifulSoup(resp.text, 'html.parser')
    links = Spider.extract_links(
        soup, [], urlparse("https://docs.python.org/2/library/os.path.html"))
    self.assertTrue(len(links) > 50)
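# Spider.extract_links is not defined in this section; a minimal sketch of such a helper,
# assuming it appends the absolute URL of every <a href> on the page to the passed-in list
# (the name and argument order mirror the test call above and are otherwise assumptions):
from urllib.parse import urljoin

def extract_links(soup, links, base):
    for a in soup.find_all('a', href=True):
        # base is a urlparse() result; geturl() rebuilds the original URL string.
        links.append(urljoin(base.geturl(), a['href']))
    return links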
def test_get_data():
    f = codecs.open(CODE_FILE_TEMP, 'wb+', encoding='utf-8')
    address = "上海市丽泽梅傲苑"
    house = S.house()
    house.address = address
    print(house.getInfo())
    f.write(house.getInfo())
    f.close()
def download_caselist(search_criteria, index, csv_file):
    cases = dict()
    wenshu = Spider.WenShu()
    wenshu.set_search_criteria(search_criteria)
    cases = wenshu.get_case_list(index)
    csv_file.write(cases)
    print('dump {} items into {}'.format(len(cases['doc_id']), cases))
    print('Sleep 2s.....')
    time.sleep(2)
def main(currentTime, startTime, web, weblevel, app, applevel, device,
         devicelevel, sys, syslevel, method):
    start_time = time.time()
    print("Program running....")
    # Create a new spider object
    spider = Spider.Spider()
    # Web application vulnerabilities
    page = web
    level = weblevel
    type = 1
    spider = SpiderMain(currentTime, startTime, spider, page, level=level,
                        type=type, method=method)
    # Application vulnerabilities
    page = app
    level = applevel
    type = 2
    spider = SpiderMain(currentTime, startTime, spider, page, level=level,
                        type=type, method=method)
    # Network device vulnerabilities
    page = device
    level = devicelevel
    type = 3
    spider = SpiderMain(currentTime, startTime, spider, page, level=level,
                        type=type, method=method)
    # Operating system vulnerabilities
    page = sys
    level = syslevel
    type = 4
    spider = SpiderMain(currentTime, startTime, spider, page, level=level,
                        type=type, method=method)
    spider.save_doc(currentTime, startTime)
    end_time = time.time()
    print("Took a total of %s" % str((end_time - start_time) / 60) + " minutes!")
class Main:
    def __init__(self):  # ,self.handle.application
        self.data = []
        httpd = make_server('', 1234, self.handle)
        print('Server HTTP on port 1234...')
        # Instantiate the Application class
        self.app = Application()
        # Instantiate the Spider class
        self.spider = Spider()
        httpd.serve_forever()

    def handle(self, environ, start_response):
        start_response('200 ok', [('Content-Type', 'text/html')])
        info = (environ['PATH_INFO'][1:])
        if info == 'a':
            responseInfo = 'aaaaa'
        elif info == 'b':
            responseInfo = 'bbbbb'
        elif info == 'c':
            self.data = self.connectDataBase()
            responseInfo = self.data
        elif info == 'e':
            value = self.app.printData()
            responseInfo = value
        elif info == 'spider':
            spiderData = self.spider.start()
            # Open the file; create it if it does not exist
            file = open('baidu.html', 'w+')
            # Write the data into the file
            file.write(spiderData)
            # Close the file
            file.close()
            responseInfo = 'File written successfully'
        else:
            responseInfo = 'What the heck'
        return responseInfo

    def connectDataBase(self):
        config = {
            'user': '',
            'password': '',
            'host': '127.0.0.1',
            'database': 'test',
            'raise_on_warnings': True,
        }
        cnx = mysql.connector.connect(**config)
        cursor = cnx.cursor()
        name = 'lily'
        cursor.execute("select * from node")
        values = cursor.fetchall()
        return str(values)
        '''
        for value in values:
            print 'id:' + str(value[0]) + ', username: '******',password: '******'
        '''
        cnx.close()
def main():
    # parse arguments
    args = parse_args()
    if args is None:
        exit()
    resultsFilePath = args.results_path
    # resultsFile = open(resultsFilePath, 'w')
    regionUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html"
    regionUrlStarter = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/"
    spider = Spider(regionUrl, regionUrlStarter)
    spider.processData()
    print(spider.provinceList)
    province_dict = spider.provinceList[0]
    with open(resultsFilePath, "w") as f:
        json.dump(province_dict, f, ensure_ascii=False)
    print("Finished writing to the file...")
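# ensure_ascii=False makes json.dump write the Chinese region names as UTF-8 text instead
# of \uXXXX escapes. A short illustration with made-up data:
import json

sample = {'code': '11', 'name': '北京市'}
print(json.dumps(sample))                      # {"code": "11", "name": "\u5317\u4eac\u5e02"}
print(json.dumps(sample, ensure_ascii=False))  # {"code": "11", "name": "北京市"}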
def __init__(self, dbUser, dbPassword, homeWebPageInfoFile):
    """constructor """
    self._init(homeWebPageInfoFile)
    self.spider = Spider.Spider()
    self.parser = Parser.Parser()
    self.conn = Connection(conf.dbHost, conf.dbName,
                           user=dbUser, password=dbPassword)  # db instance
def parse_artical(url):
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Define the properties to retrieve from soup.
        info2 = {
            'title': ['h1.entry-title', 'text', 'NO_LIST_IF_ONE'],
            'date': ['.updated', 'text'],
            'details': ['.entry p', 'text', 'JOIN'],
            'images': ['img', 'src'],
            'video': ['iframe', 'src']
        }
        # Get the info from soup.
        x = Spider.getAttrListForXPath(soup, '.mh-content', None, info2)
        if x == None:
            return None
        info = x[0]
        # Make some modifications on info.
        if info.get('images') != None:
            new = []
            for f in info['images']:
                if f.startswith('/'):
                    new.append(ROOT + f)
                else:
                    new.append(f)
            # Please check this. Bug prone..
            new = [n[:n.rfind('?') + 1] for n in new]
            info['images'] = new
            info['head_image'] = new[0]
        info['details'] = cleanText(info['details'])  # we need to modify images
        return info
    except Exception, e:
        print 'Error in (get_artical_info): ', url, str(e)
        return None
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recKeyword = re.compile(self.reKeyword, re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=xml.sax.saxutils.unescape(
                           Spider.ComposeUrl(self.url, x[0])),
                       title=x[1].strip(),
                       time=datetime.datetime.strptime(x[2], '%Y-%m-%d'))
        # Keyword check
        if recKeyword.match(x[1]) == None:
            continue
        else:
            logging.debug('Article URL is {0}, title is {1}.'.format(
                article['url'], article['title']))
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recDate = re.compile('http://www.gotohz.com/\w+?/\w+?/\d+?/t(\d+?)_\d+?.shtml', re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(
            title=x[0],
            url=Spider.ComposeUrl(self.url, x[1])
        )
        for w in recDate.findall(article['url']):
            article['time'] = datetime.datetime.strptime(w, '%Y%m%d')
        # logging.debug(str(article))
        if not 'time' in article:  # Ignore external links that do not match the expected format
            continue
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
def get_data():
    house_list = S.main()
    print(len(house_list))
    f = codecs.open(CODE_FILE_TEMP, 'wb+', encoding='utf-8')
    for house in house_list:
        address = ADDRESS + house.address.replace('\n', '')
        house.duration, house.duration_str, house.cost = G.getPathTime(address)
        print(house.getInfo())
        f.write(house.getInfo())
    f.close()
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    validUrl = 'http://news.cncn.com/'
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}', 'gbk')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(
            time=datetime.datetime.strptime(
                '{0}-01-01'.format(datetime.datetime.today().year), '%Y-%m-%d'),
            url=Spider.ComposeUrl(self.url, x[0]),
            title=x[1])
        if not validUrl in article['url']:  # Invalid URL
            continue
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}', 'gbk')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
def startCrawl(self):
    if self.bookName == None:
        print("Please provide a parameter")
    else:
        dirUrl = Directory.DirSearcher(self.bookName).search()
        if dirUrl == None:
            print("Book not found")
            return
        links = Spider.Spi(dirUrl).search()
        with open(self.bookName + ".txt", "w") as book:
            for link in links:
                cc = ccc.ConCraw(link).crawling(book)
def startSpider():
    print('WhiteList spider started!', file=sys.stderr)
    try:
        daemonize(PIDFILE,
                  stdout='/tmp/spider-log.log',
                  stderr='/tmp/spider-err.log')
    except RuntimeError as e:
        print(e, file=sys.stderr)
        raise SystemExit(1)
    io = IO.IO()
    spider = Spider.Spider(io)
    spider.start()
def download_all_caselist(search_criteria, max_page):
    cases = dict()
    wenshu = Spider.WenShu()
    wenshu.set_search_criteria(search_criteria)
    for index in range(1, max_page + 1):
        tmp_case_list = wenshu.get_case_list(index)
        if not cases:
            cases = tmp_case_list
        else:
            for key, value in tmp_case_list.items():
                cases[key] += value
    print(f"{cases}")
    return cases
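# download_all_caselist merges the per-page results by concatenating the value lists under
# each key. An illustration of that merge step on plain dictionaries; the column-oriented
# shape (e.g. a 'doc_id' list) is an assumption based on the print call in download_caselist:
page1 = {'doc_id': ['A1', 'A2'], 'title': ['case 1', 'case 2']}
page2 = {'doc_id': ['B1'], 'title': ['case 3']}

cases = dict(page1)
for key, value in page2.items():
    # Rebind to a new list so page1's lists are not mutated by the merge.
    cases[key] = cases[key] + value
print(cases['doc_id'])  # ['A1', 'A2', 'B1']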
def first_page_parser(url):
    # pdb.set_trace()
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Get the info from soup.
        x1 = Spider.getAttrListForXPath(soup, 'div.leadstoryheading', None,
                                        {'url': ['a', 'href']})
        # Process:
        res = []
        for link in x1:
            s = link.get('url')[0]
            if s.startswith('/'):
                res.append(ROOT + s)
        return res
    except Exception, e:
        print '[ERROR] get_all_artical_links_first_page', url, str(e)
        return None
def get_all_artical_links(url):
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Get the info from soup.
        x1 = Spider.getAttrListForXPath(soup, 'div.leadstory-section-heading', None,
                                        {'url': ['a', 'href']})
        x2 = Spider.getAttrListForXPath(soup, 'div.sectionstoryinside-sub', None,
                                        {'url': ['a', 'href']})
        # Process:
        res = []
        for link in x1 + x2:
            s = link.get('url')[0]
            if s.startswith('/'):
                res.append(ROOT + s)
        return res
    except Exception, e:
        print 'Error in Group', url, str(e)
        return None
def parse_artical(url):
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Define the properties to retrieve from soup.
        info2 = {
            'title': ['h1.heading', 'text', 'NO_LIST_IF_ONE'],
            'date': ['.articlePublishDate', 'text'],
            'details': ['article p', 'text', 'JOIN'],
            'images': ['img', 'src'],
            'video': ['iframe', 'src']
        }
        # Get the info from soup.
        x = Spider.getAttrListForXPath(soup, '#container', None, info2)
        if x == None:
            return None
        info = x[0]
        # Make some modifications on info.
        if info.get('images') != None:
            new = []
            for f in info['images']:
                if f.startswith('/'):
                    new.append(ROOT + f)
                else:
                    new.append(f)
            info['images'] = new
            info['head_image'] = new[0]
        info['details'] = cleanText(info['details'])
        return info
    except Exception, e:
        print 'Error in (get_artical_info): ', url, str(e)
        return None
def get_artical_info(url):
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Define the properties to retrieve from soup.
        info2 = {
            'title': ['.full-story-head', 'text', 'JOIN'],
            'date': ['.writer', 'text', 'JOIN'],
            'details': ['.full-con p', 'text', 'JOIN'],
            'images': ['.full-con img', 'src'],
            'video': ['iframe', 'src']
        }
        # Get the info from soup.
        x = Spider.getAttrListForXPath(soup, 'div.connrtund', None, info2)
        if x == None:
            return None
        info = x[0]
        # Make some modifications on info.
        if info.get('images') != None:
            new = []
            for f in info['images']:
                if f.startswith('/'):
                    new.append(ROOT + f)
                else:
                    new.append(f)
            info['images'] = new
            info['head_image'] = new[0]
        return info
    except Exception, e:
        print 'Error in (get_artical_info): ', url, str(e)
        return None
def parse_artical_list(url):
    # pdb.set_trace()
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Get the info from soup.
        x1 = Spider.getAttrListForXPath(soup, 'div#collection-wrapper .collection-container',
                                        None, {'url': ['a', 'href']})
        # Process:
        res = []
        for link in x1:
            s = link.get('url')[0]
            if s.startswith('/'):
                res.append(ROOT + s)
            else:
                res.append(s)
        return res
    except Exception, e:
        print '[ERROR] parse_first_page', url, str(e)
        return None
centipede_parts = []
shoot_x = 0
titlefont = pygame.font.SysFont("Baskerville", 100)
myfont = pygame.font.SysFont("Arial", 20)
shoot_y = 0
time = 0
score = 0
spawning_centipedes = 0
can_shoot = True
player_x = 312
player_y = 650
pygame.mouse.set_visible(False)
running = True
pygame.draw.rect(screen, pygame.color.THECOLORS['black'], (0, 0, 750, 840))
tick = 0
spider = Spider()


def setup_game_map():
    global game_map
    game_map = []
    for x in range(0, 28):
        arrayOfZeros = [0] * 25
        game_map.append(arrayOfZeros)
    for x in range(0, 30):
        mushroomx = random.randint(0, 24)
        mushroomy = random.randint(0, 24)
        mushrooms.append("mushroom")
        game_map[mushroomy][mushroomx] = 1
def testWalk():
    legs = configLegs(connexion=Connexion())
    spider = Spider(legs)
    spider.move()