def parse_artical_list(url):
    # pdb.set_trace()
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Get the info from soup.
        x1 = Spider.getAttrListForXPath(soup, '.mh-main .loop-title', None,
                                        {'url': ['a', 'href']})
        # Process:
        res = []
        for link in x1:
            s = link.get('url')[0]
            if s.startswith('/'):
                res.append(ROOT + s)
            else:
                res.append(s)
        return res
    except Exception, e:
        print '[ERROR] parse_first_page', url, str(e)
        return None
def py_main():
    data = DBUtil.get_email_data()
    email_address = data[1]
    email_password = data[2]
    print(data)
    robot_send_email(email_address)  # Send the email
    print("Email sent")
    emailcode = robot_get_emailcode(email_address, email_password)  # Read the verification code from the mailbox
    print("Mailbox verification code: " + emailcode)
    WebTools.register(email_address, emailcode)  # Register on the site
    print("Registration succeeded")
    content = WebTools.login_to_user(email_address)  # Log in and open the user center
    print("Login succeeded")
    my_vmess = Spider.change_vmess(Spider.get_vmess(content))  # Fetch the subscription address and rewrite it to ours
    print("Subscription is ready, forwarding can start!!!")
    return my_vmess
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=Spider.ComposeUrl(self.url, x[0]), title=x[1])
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            article['time'] = datetime.datetime.strptime(y[0], '%Y-%m-%d')
            if not self.CheckNewArticle(article):
                logging.debug('Article source {0} is not a new article.'.format(article['url']))
                continue
            content = y[1]
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
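# The CatchArticles variants in this section all rely on a Spider.ComposeUrl helper to
# resolve possibly-relative links against the page URL. That helper is not shown here;
# the sketch below is only an assumption that it wraps urllib.parse.urljoin.
from urllib.parse import urljoin

def ComposeUrl(baseUrl, link):
    # urljoin handles absolute URLs, root-relative paths ('/a/b.html') and
    # document-relative paths ('img/1.jpg') alike.
    return urljoin(baseUrl, link)

# Example: ComposeUrl('http://example.com/news/list.html', 'img/1.jpg')
# -> 'http://example.com/news/img/1.jpg'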
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(
            time=datetime.datetime.strptime(x[0], '%Y-%m-%d'),
            # url = self.url[0:self.url.rfind('/')] + x[1][1:],
            url=Spider.ComposeUrl(self.url, x[1]),
            title=x[2])
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                # imageUrl = article['url'][0:article['url'].rfind('/')] + z[1:]
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=Spider.ComposeUrl(self.url, x[0]),
                       title=x[1],
                       time=datetime.datetime.strptime(x[2], '%Y-%m-%d'))
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(
                    article['url'],
                    urllib.parse.quote(z) if z[0] in ['/', '.'] else z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                image['imageUrl'] = Spider.ComposeUrl(article['url'], z)
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
def __index__(self):
    lists = Article.query.all()
    for item in lists:
        db.session.delete(item)
    db.session.commit()
    Spider.getHyArtical()
    lists = Article.query.all()
    size = len(lists)
    return self.render('admin/article.html', lists=lists, size=size)
def __init__(self):  # ,self.handle.application
    self.data = []
    httpd = make_server('', 1234, self.handle)
    print('Server HTTP on port 1234...')
    # Instantiate the Application class
    self.app = Application()
    # Instantiate the Spider class
    self.spider = Spider()
    httpd.serve_forever()
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recPage = re.compile(
        '<OPTION value=([^>\s]+?)(?:\s[^>]*?)*?>[^<]*?</OPTION>', re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=Spider.ComposeUrl(self.url, x[0]),
                       title=x[1],
                       time=datetime.datetime.strptime(x[2], '%Y-%m-%d'))
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        totalContent = ''
        images = []
        imageCount = 0
        pageUrls = recPage.findall(html)
        if len(pageUrls) == 0:
            pageUrls += [article['url']]
        for p in pageUrls:
            pageUrl = Spider.ComposeUrl(article['url'], p)
            if pageUrl != article['url']:
                html = self.DownLoadHtml(pageUrl, 'Failed to access article page {0}; exception: {1}')
                if html == None:
                    continue
            content = None
            for y in recArticle.findall(html):
                content = y
                for z in recImage.findall(content):
                    imageCount += 1
                    imageUrl = Spider.ComposeUrl(article['url'], z)
                    image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                    if image == None:
                        continue
                    images.append(image)
            if content != None:
                totalContent += content
        if totalContent == '' \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, totalContent, images, 'Successfully extracted article from {0}')
    return self.articles
def __init__(self, word):
    '''
    Constructor to crawl web for a word
    '''
    self.word = word
    sp = Spider(word, spread=2, limit=0.01)
    self.web = sp.crawl('Graph.shelve')  # Crawled web
    self.graph = Shelveopen('Graph.shelve')
    self.paths = []             # To store all paths
    self.scores = []            # To store corresponding path scores
    self.clientfeatures = []    # Feature vector for client
    self.standardfeatures = []  # To compare against
def CatchArticles(self):
    abstracts = None
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article abstract API {0}; exception: {1}')
    if html == None:
        return self.articles
    try:
        html = html.replace('null', 'None')
        abstracts = eval(html)
    except Exception as e:
        logging.warn('Article abstract data {0} has an unexpected format; exception: {1}'.format(html, str(e)))
        return self.articles
    for x in abstracts['contents']:
        try:
            article = dict(
                url=Spider.ComposeUrl(
                    self.url,
                    '/{0}/{1}.jhtml'.format(x['channel_path'], x['contentId'])),
                title=x['title'])
            html = super().DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
            if html == None:
                continue
            content = None
            images = []
            imageCount = 0
            for y in recArticle.findall(html):
                article['time'] = datetime.datetime.strptime(y[0], '%Y-%m-%d %H:%M:%S')
                if not self.CheckNewArticle(article):
                    logging.debug('Article source {0} is not a new article.'.format(article['url']))
                    continue
                content = y[1]
                for z in recImage.findall(content):
                    imageCount += 1
                    imageUrl = Spider.ComposeUrl(article['url'], z)
                    image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                    if image == None:
                        continue
                    images.append(image)
            if not content \
               or imageCount != len(images):
                continue
            self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
        except Exception as e:
            logging.warn('Article detail data {0} has an unexpected format; exception: {1}'.format(str(x), str(e)))
            continue
    return self.articles
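# The variant above parses the abstract endpoint by rewriting 'null' to 'None' and calling
# eval. If the endpoint returns JSON (an assumption), json.loads handles null natively and
# avoids executing the response as code; a minimal sketch with made-up data:
import json

sample = '{"contents": [{"contentId": 1, "channel_path": "news", "title": "t"}], "extra": null}'
abstracts = json.loads(sample)
print(abstracts['extra'])                 # None
print(abstracts['contents'][0]['title'])  # 't'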
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recDate = re.compile('Details([\d-]+?).html', re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(
            url=Spider.ComposeUrl(self.url, x[0]),
            title=x[1].strip()
        )
        for w in recDate.findall(article['url']):
            try:
                article['time'] = datetime.datetime.strptime('20{0}'.format(w[0:8]), '%Y-%m-%d')
            except Exception as e:
                logging.warn('Could not parse the publication date from article source {0}; exception: {1}'.format(article['url'], str(e)))
                continue
        # logging.debug(str(article))
        if not 'time' in article:  # Ignore external links that do not match the expected format
            continue
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
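# The variant above recovers the publication date from the article URL itself via the
# 'Details([\d-]+?).html' pattern. A standalone illustration of that parse, using a
# made-up URL purely for demonstration:
import datetime
import re

recDate = re.compile(r'Details([\d-]+?).html', re.DOTALL)
url = 'http://example.com/news/Details17-03-05-001.html'
for w in recDate.findall(url):
    # '17-03-05' becomes '2017-03-05'
    print(datetime.datetime.strptime('20{0}'.format(w[0:8]), '%Y-%m-%d'))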
def initializeSpiders():
    homePageList = ['http://finans.mynet.com/borsa/hisseler/',
                    'http://finans.mynet.com/borsa/hisseler/c-e/',
                    'http://finans.mynet.com/borsa/hisseler/f-j/',
                    'http://finans.mynet.com/borsa/hisseler/k-q/',
                    'http://finans.mynet.com/borsa/hisseler/r-z/']
    for i in range(0, 5):
        Spider(PROJECT_NAME, homePageList[i], DOMAIN_NAME)
def test_link_extraction(self):
    resp = requests.get("https://docs.python.org/2/library/os.path.html")
    soup = BeautifulSoup(resp.text, 'html.parser')
    links = Spider.extract_links(
        soup, [], urlparse("https://docs.python.org/2/library/os.path.html"))
    self.assertTrue(len(links) > 50)
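# Spider.extract_links is not defined in this section; a minimal sketch of such a helper,
# assuming it appends the absolute URL of every <a href> on the page to the passed-in list
# (the name and argument order mirror the test call above and are otherwise assumptions):
from urllib.parse import urljoin

def extract_links(soup, links, base):
    for a in soup.find_all('a', href=True):
        # base is a urlparse() result; geturl() rebuilds the original URL string.
        links.append(urljoin(base.geturl(), a['href']))
    return links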
def test_get_data():
    f = codecs.open(CODE_FILE_TEMP, 'wb+', encoding='utf-8')
    address = "上海市丽泽梅傲苑"
    house = S.house()
    house.address = address
    print(house.getInfo())
    f.write(house.getInfo())
    f.close()
def download_caselist(search_criteria, index, csv_file):
    cases = dict()
    wenshu = Spider.WenShu()
    wenshu.set_search_criteria(search_criteria)
    cases = wenshu.get_case_list(index)
    csv_file.write(cases)
    print('dump {} items into {}'.format(len(cases['doc_id']), cases))
    print('Sleep 2s.....')
    time.sleep(2)
def main(currentTime, startTime, web, weblevel, app, applevel, device,
         devicelevel, sys, syslevel, method):
    start_time = time.time()
    print("Program running....")
    # Create a new spider object
    spider = Spider.Spider()
    # Web application vulnerabilities
    page = web
    level = weblevel
    type = 1
    spider = SpiderMain(currentTime, startTime, spider, page, level=level,
                        type=type, method=method)
    # Application vulnerabilities
    page = app
    level = applevel
    type = 2
    spider = SpiderMain(currentTime, startTime, spider, page, level=level,
                        type=type, method=method)
    # Network device vulnerabilities
    page = device
    level = devicelevel
    type = 3
    spider = SpiderMain(currentTime, startTime, spider, page, level=level,
                        type=type, method=method)
    # Operating system vulnerabilities
    page = sys
    level = syslevel
    type = 4
    spider = SpiderMain(currentTime, startTime, spider, page, level=level,
                        type=type, method=method)
    spider.save_doc(currentTime, startTime)
    end_time = time.time()
    print("Took a total of %s" % str((end_time - start_time) / 60) + " minutes!")
class Main:
    def __init__(self):  # ,self.handle.application
        self.data = []
        httpd = make_server('', 1234, self.handle)
        print('Server HTTP on port 1234...')
        # Instantiate the Application class
        self.app = Application()
        # Instantiate the Spider class
        self.spider = Spider()
        httpd.serve_forever()

    def handle(self, environ, start_response):
        start_response('200 ok', [('Content-Type', 'text/html')])
        info = (environ['PATH_INFO'][1:])
        if info == 'a':
            responseInfo = 'aaaaa'
        elif info == 'b':
            responseInfo = 'bbbbb'
        elif info == 'c':
            self.data = self.connectDataBase()
            responseInfo = self.data
        elif info == 'e':
            value = self.app.printData()
            responseInfo = value
        elif info == 'spider':
            spiderData = self.spider.start()
            # Open the file; create it if it does not exist
            file = open('baidu.html', 'w+')
            # Write the data into the file
            file.write(spiderData)
            # Close the file
            file.close()
            responseInfo = 'File written successfully'
        else:
            responseInfo = 'What the heck'
        return responseInfo

    def connectDataBase(self):
        config = {
            'user': '',
            'password': '',
            'host': '127.0.0.1',
            'database': 'test',
            'raise_on_warnings': True,
        }
        cnx = mysql.connector.connect(**config)
        cursor = cnx.cursor()
        name = 'lily'
        cursor.execute("select * from node")
        values = cursor.fetchall()
        return str(values)
        '''
        for value in values:
            print 'id:' + str(value[0]) + ', username: '******',password: '******'
        '''
        cnx.close()
def main():
    # parse arguments
    args = parse_args()
    if args is None:
        exit()
    resultsFilePath = args.results_path
    # resultsFile = open(resultsFilePath, 'w')
    regionUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html"
    regionUrlStarter = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/"
    spider = Spider(regionUrl, regionUrlStarter)
    spider.processData()
    print(spider.provinceList)
    province_dict = spider.provinceList[0]
    with open(resultsFilePath, "w") as f:
        json.dump(province_dict, f, ensure_ascii=False)
    print("Finished writing to the file...")
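# ensure_ascii=False makes json.dump write the Chinese region names as UTF-8 text instead
# of \uXXXX escapes. A short illustration with made-up data:
import json

sample = {'code': '11', 'name': '北京市'}
print(json.dumps(sample))                      # {"code": "11", "name": "\u5317\u4eac\u5e02"}
print(json.dumps(sample, ensure_ascii=False))  # {"code": "11", "name": "北京市"}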
def __init__(self, dbUser, dbPassword, homeWebPageInfoFile):
    """constructor """
    self._init(homeWebPageInfoFile)
    self.spider = Spider.Spider()
    self.parser = Parser.Parser()
    self.conn = Connection(conf.dbHost, conf.dbName,
                           user=dbUser, password=dbPassword)  # db instance
def parse_artical(url):
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Define the properties to retrieve from soup.
        info2 = {
            'title': ['h1.entry-title', 'text', 'NO_LIST_IF_ONE'],
            'date': ['.updated', 'text'],
            'details': ['.entry p', 'text', 'JOIN'],
            'images': ['img', 'src'],
            'video': ['iframe', 'src']
        }
        # Get the info from soup.
        x = Spider.getAttrListForXPath(soup, '.mh-content', None, info2)
        if x == None:
            return None
        info = x[0]
        # Make some modifications on info.
        if info.get('images') != None:
            new = []
            for f in info['images']:
                if f.startswith('/'):
                    new.append(ROOT + f)
                else:
                    new.append(f)
            # Please check this. Bug prone..
            new = [n[:n.rfind('?') + 1] for n in new]
            info['images'] = new
            info['head_image'] = new[0]
        info['details'] = cleanText(info['details'])  # we need to modify images
        return info
    except Exception, e:
        print 'Error in (get_artical_info): ', url, str(e)
        return None
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recKeyword = re.compile(self.reKeyword, re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=xml.sax.saxutils.unescape(
                           Spider.ComposeUrl(self.url, x[0])),
                       title=x[1].strip(),
                       time=datetime.datetime.strptime(x[2], '%Y-%m-%d'))
        # Keyword check
        if recKeyword.match(x[1]) == None:
            continue
        else:
            logging.debug('Article URL is {0}, title is {1}.'.format(
                article['url'], article['title']))
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recDate = re.compile('http://www.gotohz.com/\w+?/\w+?/\d+?/t(\d+?)_\d+?.shtml', re.DOTALL)
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(
            title=x[0],
            url=Spider.ComposeUrl(self.url, x[1])
        )
        for w in recDate.findall(article['url']):
            article['time'] = datetime.datetime.strptime(w, '%Y%m%d')
        # logging.debug(str(article))
        if not 'time' in article:  # Ignore external links that do not match the expected format
            continue
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
def get_data():
    house_list = S.main()
    print(len(house_list))
    f = codecs.open(CODE_FILE_TEMP, 'wb+', encoding='utf-8')
    for house in house_list:
        address = ADDRESS + house.address.replace('\n', '')
        house.duration, house.duration_str, house.cost = G.getPathTime(address)
        print(house.getInfo())
        f.write(house.getInfo())
    f.close()
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    validUrl = 'http://news.cncn.com/'
    html = self.DownLoadHtml(self.url, 'Failed to access article list page {0}; exception: {1}', 'gbk')
    if html == None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(
            time=datetime.datetime.strptime(
                '{0}-01-01'.format(datetime.datetime.today().year), '%Y-%m-%d'),
            url=Spider.ComposeUrl(self.url, x[0]),
            title=x[1])
        if not validUrl in article['url']:  # Invalid URL
            continue
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'], 'Failed to access article page {0}; exception: {1}', 'gbk')
        if html == None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl, 'Failed to download image {0}; exception: {1}')
                if image == None:
                    continue
                images.append(image)
        if not content \
           or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images, 'Successfully extracted article from {0}')
    return self.articles
def startCrawl(self):
    if self.bookName == None:
        print("Please provide a parameter")
    else:
        dirUrl = Directory.DirSearcher(self.bookName).search()
        if dirUrl == None:
            print("Book not found")
            return
        links = Spider.Spi(dirUrl).search()
        with open(self.bookName + ".txt", "w") as book:
            for link in links:
                cc = ccc.ConCraw(link).crawling(book)
def startSpider():
    print('WhiteList spider started!', file=sys.stderr)
    try:
        daemonize(PIDFILE,
                  stdout='/tmp/spider-log.log',
                  stderr='/tmp/spider-err.log')
    except RuntimeError as e:
        print(e, file=sys.stderr)
        raise SystemExit(1)
    io = IO.IO()
    spider = Spider.Spider(io)
    spider.start()
def download_all_caselist(search_criteria, max_page):
    cases = dict()
    wenshu = Spider.WenShu()
    wenshu.set_search_criteria(search_criteria)
    for index in range(1, max_page + 1):
        tmp_case_list = wenshu.get_case_list(index)
        if not cases:
            cases = tmp_case_list
        else:
            for key, value in tmp_case_list.items():
                cases[key] += value
    print(f"{cases}")
    return cases
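# download_all_caselist merges the per-page results by concatenating the value lists under
# each key. An illustration of that merge step on plain dictionaries; the column-oriented
# shape (e.g. a 'doc_id' list) is an assumption based on the print call in download_caselist:
page1 = {'doc_id': ['A1', 'A2'], 'title': ['case 1', 'case 2']}
page2 = {'doc_id': ['B1'], 'title': ['case 3']}

cases = dict(page1)
for key, value in page2.items():
    # Rebind to a new list so page1's lists are not mutated by the merge.
    cases[key] = cases[key] + value
print(cases['doc_id'])  # ['A1', 'A2', 'B1']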
def first_page_parser(url):
    # pdb.set_trace()
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Get the info from soup.
        x1 = Spider.getAttrListForXPath(soup, 'div.leadstoryheading', None,
                                        {'url': ['a', 'href']})
        # Process:
        res = []
        for link in x1:
            s = link.get('url')[0]
            if s.startswith('/'):
                res.append(ROOT + s)
        return res
    except Exception, e:
        print '[ERROR] get_all_artical_links_first_page', url, str(e)
        return None
def get_all_artical_links(url):
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Get the info from soup.
        x1 = Spider.getAttrListForXPath(soup, 'div.leadstory-section-heading', None,
                                        {'url': ['a', 'href']})
        x2 = Spider.getAttrListForXPath(soup, 'div.sectionstoryinside-sub', None,
                                        {'url': ['a', 'href']})
        # Process:
        res = []
        for link in x1 + x2:
            s = link.get('url')[0]
            if s.startswith('/'):
                res.append(ROOT + s)
        return res
    except Exception, e:
        print 'Error in Group', url, str(e)
        return None
def parse_artical(url):
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Define the properties to retrieve from soup.
        info2 = {
            'title': ['h1.heading', 'text', 'NO_LIST_IF_ONE'],
            'date': ['.articlePublishDate', 'text'],
            'details': ['article p', 'text', 'JOIN'],
            'images': ['img', 'src'],
            'video': ['iframe', 'src']
        }
        # Get the info from soup.
        x = Spider.getAttrListForXPath(soup, '#container', None, info2)
        if x == None:
            return None
        info = x[0]
        # Make some modifications on info.
        if info.get('images') != None:
            new = []
            for f in info['images']:
                if f.startswith('/'):
                    new.append(ROOT + f)
                else:
                    new.append(f)
            info['images'] = new
            info['head_image'] = new[0]
        info['details'] = cleanText(info['details'])
        return info
    except Exception, e:
        print 'Error in (get_artical_info): ', url, str(e)
        return None
def get_artical_info(url):
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Define the properties to retrieve from soup.
        info2 = {
            'title': ['.full-story-head', 'text', 'JOIN'],
            'date': ['.writer', 'text', 'JOIN'],
            'details': ['.full-con p', 'text', 'JOIN'],
            'images': ['.full-con img', 'src'],
            'video': ['iframe', 'src']
        }
        # Get the info from soup.
        x = Spider.getAttrListForXPath(soup, 'div.connrtund', None, info2)
        if x == None:
            return None
        info = x[0]
        # Make some modifications on info.
        if info.get('images') != None:
            new = []
            for f in info['images']:
                if f.startswith('/'):
                    new.append(ROOT + f)
                else:
                    new.append(f)
            info['images'] = new
            info['head_image'] = new[0]
        return info
    except Exception, e:
        print 'Error in (get_artical_info): ', url, str(e)
        return None
def parse_artical_list(url):
    # pdb.set_trace()
    try:
        # Build the soup from URL
        soup = Spider.buildSoup(url)
        if soup == None:
            return None
        # Get the info from soup.
        x1 = Spider.getAttrListForXPath(soup, 'div#collection-wrapper .collection-container',
                                        None, {'url': ['a', 'href']})
        # Process:
        res = []
        for link in x1:
            s = link.get('url')[0]
            if s.startswith('/'):
                res.append(ROOT + s)
            else:
                res.append(s)
        return res
    except Exception, e:
        print '[ERROR] parse_first_page', url, str(e)
        return None
centipede_parts = []
shoot_x = 0
titlefont = pygame.font.SysFont("Baskerville", 100)
myfont = pygame.font.SysFont("Arial", 20)
shoot_y = 0
time = 0
score = 0
spawning_centipedes = 0
can_shoot = True
player_x = 312
player_y = 650
pygame.mouse.set_visible(False)
running = True
pygame.draw.rect(screen, pygame.color.THECOLORS['black'], (0, 0, 750, 840))
tick = 0
spider = Spider()


def setup_game_map():
    global game_map
    game_map = []
    for x in range(0, 28):
        arrayOfZeros = [0] * 25
        game_map.append(arrayOfZeros)
    for x in range(0, 30):
        mushroomx = random.randint(0, 24)
        mushroomy = random.randint(0, 24)
        mushrooms.append("mushroom")
        game_map[mushroomy][mushroomx] = 1
def testWalk():
    legs = configLegs(connexion=Connexion())
    spider = Spider(legs)
    spider.move()