def get_team_groups(cb=None):
    """Look up the team group owned by the currently logged-in user.

    Fetches the group list page from sp.mai2.cc with the ``mmzl_sp`` cookie,
    converts two inline ``<script>`` blocks to XML, reads the logged-in
    user's mobile number from the first one and returns the first team from
    the second one whose ``owner_mobile`` equals that number.

    Args:
        cb: Optional callable taking a single message string. It is invoked
            when the login cookie is missing or when fetching/parsing fails.

    Returns:
        A ``(team_id, team_name)`` tuple for the first matching team, or
        ``None`` when the cookie is missing, the request or the parsing
        fails, ``flag_mmzl_stop`` is set, or no team matches.
    """
    print('获取团队分组')
    cookie = get_cookie('sp.mai2.cc')
    if cookie is None:
        if cb:
            cb('获取登陆信息失败,请使用Chrome浏览器重新登陆海淘后重试')
        return None

    # NOTE: this writes into the shared module-level ``headers`` mapping.
    headers['Cookie'] = "mmzl_sp=%s" % cookie
    try:
        groups_url = 'http://sp.mai2.cc/prodistr/lists'
        groups_request = request.Request(url=groups_url, headers=headers)
        # Close the HTTP response as soon as the page has been parsed.
        with request.urlopen(groups_request) as doc:
            bs_obj = BeautifulSoup(doc, 'html.parser')
        scripts = bs_obj.select('body script')

        # Logged-in user information lives in the second inline script.
        user_config_tree = pretty_print(parse(scripts[1].string, debug=False))
        user = html.etree.HTML(user_config_tree)
        mobile = user.xpath("//property[@name='mobile']/string/text()")[0]

        # The team list lives in the seventh inline script.
        src_tree = pretty_print(parse(scripts[6].string, debug=False))
        selector = html.etree.HTML(src_tree)
        for obj in selector.xpath("//property[@name='teams']//object"):
            if flag_mmzl_stop:
                return None
            # owner_mobile is used to keep only the teams of this user.
            owner_mobile = obj.xpath(
                "./property[@name='owner_mobile']/string/text()")[0]
            if owner_mobile == mobile:
                g_id = obj.xpath("./property[@name='id']/string/text()")[0]
                g_name = obj.xpath(
                    "./property[@name='team_name']/string/text()")[0]
                return (g_id, g_name)
    except (URLError, KeyError, ValueError, IndexError):
        # IndexError covers a page whose scripts/properties are missing,
        # which is also a parsing failure from the caller's point of view.
        if cb:
            cb('数据解析错误')
        return None
    return None
def parse_live_info(self, response):
    """Extract the live chart series of a match page and yield one record.

    Example page: https://www.dszuqiu.com/race_xc/701326

    The fifth ``<script>`` from the end of the page is converted to XML with
    js2xml; every ``data`` property becomes a dict mapping its ``x`` values
    to its ``y`` values. The series are combined with ``response.meta``
    entries ``item`` and ``letBalls`` through ``DataHandle.all_data_handle``.

    Yields:
        dict: ``{"match_info": item.values(), "match_data": <handled data>}``.
    """
    print("开始抓取现场数据")
    script_node = response.xpath("//script")[-5]
    js_source = script_node.xpath("./text()").extract_first()
    parsed_js = js2xml.parse(js_source, encoding='utf-8', debug=False)
    xml_root = etree.HTML(js2xml.pretty_print(parsed_js))

    dics = []
    for series in xml_root.xpath("//property[@name='data']"):
        x_values = series.xpath(".//property[@name='x']/number/@value")
        y_values = series.xpath(".//property[@name='y']/number/@value")
        dics.append(dict(map(lambda x, y: [x, y], x_values, y_values)))

    logger.warning(response)
    logger.warning(dics)

    item = response.meta["item"]
    letballs = response.meta["letBalls"]
    match_data = DataHandle().all_data_handle(item, letballs, dics)

    yield {"match_info": item.values(), "match_data": match_data}
def get_clubs_rank_his(data_list):
    """Download and persist the ranking history of each club.

    For every entry of ``data_list`` the club page
    (``https://footballdatabase.com`` + ``entry[0][3]``) is fetched, its
    second ``<script>`` is converted to XML with js2xml, and each history
    array (the first two and the last two arrays are skipped) is saved with
    ``save_his_data(club_name, [[rank, point, date]])``.

    Args:
        data_list: Iterable of entries where ``entry[0][1]`` is the club name
            and ``entry[0][3]`` is the club page path.

    Returns:
        list: The single-row list ``[[rank, point, date]]`` of the last row
        that was saved, or ``[]`` when no row was processed. All rows are
        persisted through ``save_his_data``; they are not accumulated here.
    """
    base_url = 'https://footballdatabase.com'
    his_rank_list = []
    for data in data_list:
        print(data)
        url = data[0][3]
        his_res = requests.get(base_url + url)
        his_content = BeautifulSoup(his_res.text, "html.parser")
        js = his_content.find_all('script')[1].string
        src_tree = js2xml.pretty_print(js2xml.parse(js))
        data_tree = BeautifulSoup(src_tree, 'html.parser')
        array_list = data_tree.find_all('array')
        club_name = data[0][1]
        for array in array_list[2:-2]:
            array_date = array.find('string').text
            parts = array_date.split(' ')
            try:
                # "<month name> <year>" -> "<year>-<month number>"
                date = str(parts[1]) + '-' + str(month_map[parts[0]])
            except (KeyError, IndexError):
                # Unknown month name or no year part: keep the raw label.
                date = array_date
            array_data = array.find_all('number')
            point = array_data[0]['value']
            rank = array_data[1]['value']
            his_rank_list = [[rank, point, date]]
            save_his_data(club_name, his_rank_list)
    return his_rank_list
def get_db_data():
    """Scrape the db-engines ranking trend chart and save every series.

    The third ``<script>`` of the page is converted to XML with js2xml. The
    first two numbers of that XML are passed to ``gen_time`` (the second one
    incremented by one) to build the date axis. For every ``object`` node,
    leading ``null`` entries are paired with ``'0'``, the remaining dates are
    paired with the node's numbers, and the result is handed to ``save_data``
    as an iterable of ``(db_name, (date, value))``.
    """
    page = requests.get('https://db-engines.com/en/ranking_trend/')
    page_soup = BeautifulSoup(page.text, "html.parser")
    chart_js = page_soup.find_all("script")[2].string
    xml_text = js2xml.pretty_print(js2xml.parse(chart_js))
    data_tree = BeautifulSoup(xml_text, 'html.parser')

    year_list = [number['value'] for number in data_tree.find_all('number')[:2]]

    for series in data_tree.find_all('object'):
        # The date axis is rebuilt per series because it is sliced below.
        date_list = gen_time(
            '%s-%s' % (year_list[0], str(int(year_list[1]) + 1)))
        db_name = series.find('string')

        padding = []
        if series.find('null'):
            null_num = len(series.find_all('null'))
            padding = list(
                zip(date_list[:null_num], ['0'] * (null_num + 1)))
            date_list = date_list[null_num:]

        values = [number['value'] for number in series.find_all('number')]
        date_value = padding + list(zip(date_list, values))
        d_data = zip([db_name.string for _ in date_value], date_value)
        save_data(d_data)
def getNbaVideoIframeUrl(self, url, gameBean):
    """Print the game details and the iframe URL found on a game page.

    The first inline ``<script>`` of the page is converted to XML with
    js2xml; the first string under ``left/binaryoperation/left`` is treated
    as an unterminated ``<iframe`` tag, closed with ``">`` and parsed to read
    its ``src``. The value of ``DidiaokanService.getJsonx1(url)`` is appended
    to that ``src``.

    Args:
        url: Address of the game page.
        gameBean: Object exposing ``gameName``, ``homeTeam``, ``guestTeam``
            and ``time`` string attributes.
    """
    service = DidiaokanService()
    page = RequestGet().getBs4Encode(url, 'utf8')

    first_script = page.select('body script')[0].string
    js_tree = js2xml.parse(first_script, encoding='utf-8', debug=False)
    xml_root = etree.HTML(js2xml.pretty_print(js_tree))
    string_nodes = xml_root.xpath('//left/binaryoperation/left/string')

    # Close the attribute and the tag so the fragment parses as an iframe.
    iframe_markup = string_nodes[0].text + "\">"
    iframe_tag = BeautifulSoup(iframe_markup, "lxml").find('iframe')
    iframeUrl = iframe_tag.get('src') + service.getJsonx1(url)

    summary = "比赛名:" + gameBean.gameName
    summary = summary + "主队:" + gameBean.homeTeam
    summary = summary + "客队:" + gameBean.guestTeam
    summary = summary + "时间:" + gameBean.time
    print(summary)
    print("iframe地址:" + iframeUrl)
def main():
    """Command-line entry point: print the js2xml XML tree of each input.

    Positional arguments are JavaScript file names; ``-`` (the default when
    no name is given) reads standard input. ``--debug`` is forwarded to
    ``js2xml.parse``.
    """
    ap = ArgumentParser()
    ap.add_argument("--debug", action="store_true")
    ap.add_argument("filenames", nargs="*", default=["-"])
    args = ap.parse_args()

    for fn in args.filenames:
        if fn == "-":
            source = sys.stdin.read()
        else:
            # Close each input file as soon as it has been read.
            with open(fn, "r") as fo:
                source = fo.read()
        parsed = js2xml.parse(source, debug=args.debug)
        print(js2xml.pretty_print(parsed))
def main():
    """Command-line entry point: print the js2xml XML tree of each input.

    Positional arguments are JavaScript file names; ``-`` (the default when
    no name is given) reads standard input. ``--debug`` is forwarded to
    ``js2xml.parse``.
    """
    ap = ArgumentParser()
    ap.add_argument('--debug', action='store_true')
    ap.add_argument('filenames', nargs='*', default=['-'])
    args = ap.parse_args()

    for fn in args.filenames:
        if fn == '-':
            source = sys.stdin.read()
        else:
            # Plain 'r' is used because the 'U' flag was removed in
            # Python 3.11; the context manager closes the file after reading.
            with open(fn, 'r') as fo:
                source = fo.read()
        parsed = js2xml.parse(source, debug=args.debug)
        print(js2xml.pretty_print(parsed))
def main():
    """Command-line entry point: print the js2xml XML tree of each input.

    Positional arguments are JavaScript file names; ``-`` (the default when
    no name is given) reads standard input. ``--debug`` is forwarded to
    ``js2xml.parse``.
    """
    ap = ArgumentParser()
    ap.add_argument('--debug', action='store_true')
    ap.add_argument('filenames', nargs='*', default=['-'])
    args = ap.parse_args()

    for fn in args.filenames:
        if fn == '-':
            source = sys.stdin.read()
        else:
            # Close each input file as soon as it has been read.
            with open(fn, 'r') as fo:
                source = fo.read()
        parsed = js2xml.parse(source, debug=args.debug)
        print(js2xml.pretty_print(parsed))
def parse_detail(self, response):
    """Follow the URL embedded in each ``<script>`` of the response.

    Every script is converted to XML with js2xml; the first string found
    under a ``right`` node, with ``/`` appended, is requested with
    ``self.parse_detail2`` as callback and duplicate filtering disabled.

    Yields:
        scrapy.Request: One request per ``<script>`` element.
    """
    page = BeautifulSoup(response.body, 'lxml')
    for script in page.find_all('script'):
        js_tree = js2xml.parse(script.string, encoding='utf-8', debug=False)
        xml_root = etree.HTML(js2xml.pretty_print(js_tree))
        target_url = xml_root.xpath("//right/string/text()")[0] + '/'
        yield scrapy.Request(
            target_url,
            callback=self.parse_detail2,
            dont_filter=True,
        )
def js2xml_unescape(script_text, encoding='utf8', debug=False):
    """Convert JavaScript source into an lxml tree of its unescaped XML form.

    :param script_text: JavaScript source passed to ``js2xml.parse``.
    :param encoding: Encoding forwarded to ``js2xml.parse``.
    :param debug: Debug flag forwarded to ``js2xml.parse``.
    :return: The element returned by ``etree.HTML`` for the unescaped XML.
    """
    parsed = js2xml.parse(script_text, encoding=encoding, debug=debug)
    # Reverse the character escaping of the serialized XML before re-parsing.
    xml_text = unescape(js2xml.pretty_print(parsed))
    return etree.HTML(xml_text)
def get_db_data(db_type):
    """Collect the string values of a db-engines trend page and save them.

    The third ``<script>`` of
    ``https://db-engines.com/en/ranking_trend/<db_type>`` is converted to XML
    with js2xml; the text of every ``string`` node is gathered and passed to
    ``save_data(db_type, names)``.

    Args:
        db_type: Path suffix appended to the ranking trend URL.
    """
    page = requests.get(
        'https://db-engines.com/en/ranking_trend/%s' % db_type)
    page_soup = BeautifulSoup(page.text, "html.parser")
    chart_js = page_soup.find_all("script")[2].string
    xml_text = js2xml.pretty_print(js2xml.parse(chart_js))
    string_nodes = BeautifulSoup(xml_text, 'html.parser').find_all('string')
    print(string_nodes)
    name_list = [node.string for node in string_nodes]
    save_data(db_type, name_list)
def get_lng_lat(script):
    """
    Extract longitude and latitude from a JavaScript snippet.

    Parameters
    ----------
    script
        JavaScript source passed to ``js2xml.parse``.

    Returns
    -------
    tuple
        ``(lng, lat, selector)``: the two comma-separated parts of the first
        ``Xpath.lng_lat`` match, and the lxml tree built from the script.
    """
    js_tree = js2xml.parse(script, encoding='utf-8', debug=False)
    script_selector = etree.HTML(js2xml.pretty_print(js_tree))
    coordinates = script_selector.xpath(Xpath.lng_lat)[0]
    lng, lat = coordinates.split(",")
    return lng, lat, script_selector
def get_pl_data(name):
    """Scrape the TIOBE index history of each language and save it.

    Each name is lower-cased and mapped to its URL slug where one is listed
    below. The tenth ``<script>`` of the language page is converted to XML
    with js2xml; from the fourth array on, every array yields one
    ``{'date': ..., 'value': ...}`` row. The rows are stored with
    ``save_data`` and the function sleeps two seconds between languages.

    Args:
        name: Iterable of language names.
    """
    slug_by_name = {
        'c#': 'csharp',
        'pl/sql': 'pl-sql',
        'visual basic .net': 'visual-basic-dotnet',
        'delphi/object pascal': 'delphi-object-pascal',
        'assembly language': 'assembly-language',
        'visual basic': 'visual-basic',
        'c++': 'cplusplus',
    }
    # Only these three slugs are converted back before saving.
    saved_name_by_slug = {
        'csharp': 'c#',
        'pl-sql': 'pl/sql',
        'cplusplus': 'c++',
    }

    for language in (entry.lower() for entry in name):
        print("Request ", language)
        slug = slug_by_name.get(language, language)

        page_html = requests.get(
            'https://www.tiobe.com/tiobe-index/' + slug).text
        page_soup = BeautifulSoup(page_html, "html.parser")
        chart_js = page_soup.find_all('script')[9].string
        xml_text = js2xml.pretty_print(js2xml.parse(chart_js))
        arrays = BeautifulSoup(xml_text, 'html.parser').find_all('array')

        data_list = []
        for array in arrays[3:]:
            numbers = array.find_all('number')
            # The second number is incremented by one to form the date.
            date = '-'.join([
                numbers[0]['value'],
                str(int(numbers[1]['value']) + 1),
                numbers[2]['value'],
            ])
            data_list.append({'date': date, 'value': numbers[3]['value']})

        save_data(saved_name_by_slug.get(slug, slug), data_list)
        time.sleep(2)
def catchtheweb1(url):
    """Fetch a page and return the numbers of its embedded ``data`` arrays.

    The fourth inline ``<script>`` of the page body is converted to XML with
    js2xml and queried for ``property[@name='data']/array/array/number``.

    Args:
        url: Address of the page to crawl.

    Returns:
        list: The matching ``number`` elements.
    """
    req = urllib2.Request(url)
    # Give up if the site does not answer in time. NOTE: this sets the
    # process-wide default socket timeout, not just this request's.
    socket.setdefaulttimeout(10.0)
    res_data = urllib2.urlopen(req)
    try:
        res = res_data.read()
    finally:
        # Release the connection even when reading fails.
        res_data.close()

    soup = BeautifulSoup(res, 'lxml')
    src = soup.select('body script')[3].string
    src_text = js2xml.parse(src, debug=False)
    src_tree = js2xml.pretty_print(src_text)
    selector = etree.HTML(src_tree)
    # The script is now available as a queryable XML tree.
    return selector.xpath("//property[@name = 'data']/array/array/number")
def getpagelink(url):
    """Read the image data of a page and hand it to ``download``.

    The third ``<script>`` of the page is converted to XML with js2xml. The
    text of the first ``var`` node is decrypted with ``decrypt``; the text of
    the second ``var`` node is used as the path argument. The page title is
    taken from ``.head_title > h2``.

    Args:
        url: Page address; also sent as the ``Referer`` header.
    """
    # NOTE: ``header`` aliases the module-level ``headers`` mapping, so the
    # Referer set here stays in it after this call.
    header = headers
    header.update({"Referer": url})

    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.text, 'lxml')
    script_source = soup.select('script')[2].string
    js_tree = js2xml.parse(script_source, encoding='utf-8', debug=False)
    xml_soup = BeautifulSoup(js2xml.pretty_print(js_tree), 'lxml')

    var_nodes = xml_soup.select("var")
    encrypted = var_nodes[0].text
    print(str(encrypted))
    image = decrypt(encrypted)
    fpath = var_nodes[1].text

    count = soup.select(".head_title > h2")[0].getText()
    print(count)
    download(image, count, str(fpath))
def parse_category(content):
    """Build a ``{category name: url}`` mapping from a page's first script.

    The first inline ``<script>`` of the body is converted to XML with
    js2xml. The objects under ``navSecond1`` .. ``navSecond5`` are the
    second-level categories; the ``NAME``/``URL`` pairs of their
    ``children`` are collected.

    Args:
        content: HTML markup of the page.

    Returns:
        dict: Child category name mapped to its URL.
    """
    soup = BeautifulSoup(content, "lxml")
    first_script = soup.select("body script")[0].string
    js_tree = js2xml.parse(first_script, encoding='utf-8', debug=False)
    selector = etree.HTML(js2xml.pretty_print(js_tree))

    category_nodes = [
        node
        for index in range(1, 6)
        for node in selector.xpath(
            '//property[@name="navSecond{}"]/array/object'.format(str(index)))
    ]

    category_dict = {}
    for node in category_nodes:
        # The second-level name is not stored, but the lookup still raises
        # IndexError for a node without a NAME.
        _second_level_name = node.xpath(
            './property[@name="NAME"]/string/text()')[0]
        children = node.xpath('./property[@name="children"]')[0]
        names = children.xpath(
            './array/object/property[@name="NAME"]/string/text()')
        urls = children.xpath(
            './array/object/property[@name="URL"]/string/text()')
        category_dict.update(zip(names, urls))
    return category_dict
def autoJsToxml(url):
    """Convert the page script that mentions ``jpg`` to js2xml XML text.

    The page is fetched with the module-level ``headers`` and decoded as
    UTF-8. The first ``<script>`` whose text contains ``jpg`` is selected
    and converted with js2xml.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: The pretty-printed XML of the selected script.

    Raises:
        ValueError: If no ``<script>`` on the page contains ``jpg``.
    """
    wb_data = requests.get(url, headers=headers)
    wb_data.encoding = "utf-8"
    soup = BeautifulSoup(wb_data.text, "html.parser")

    # Select the script element directly instead of tracking its index by
    # hand: the manual counter only advanced on non-matching scripts and so
    # pointed at the wrong element once more than one script matched.
    target = next(
        (script for script in soup.select('script')
         if "jpg" in script.get_text()),
        None,
    )
    if target is None:
        raise ValueError("no <script> containing 'jpg' found at %s" % url)

    src_text = js2xml.parse(target.string, encoding='utf-8', debug=False)
    return js2xml.pretty_print(src_text)
async def get_db_data(db_name):
    """Fetch the ranking trend of one database system and save it.

    The third ``<script>`` of
    ``https://db-engines.com/en/ranking_trend/system/<db_name>`` is converted
    to XML with js2xml. The first two numbers are passed to ``gen_time`` (the
    second one incremented by one) to build the date axis; the numbers from
    the fourth on are the values. ``save_data`` is awaited with an iterable
    of ``(db_name, (date, value))``.

    Args:
        db_name: System name appended to the ranking trend URL.
    """
    url = 'https://db-engines.com/en/ranking_trend/system/%s' % db_name
    async with aiohttp.ClientSession() as session:
        res = await fetch(session, url)
        content = BeautifulSoup(res, "html.parser")
        db_data = content.find_all("script")[2].string
        src_tree = js2xml.pretty_print(js2xml.parse(db_data))
        data_tree = BeautifulSoup(src_tree, 'html.parser')

        data = [number['value'] for number in data_tree.find_all('number')]
        date_list = gen_time('%s-%s' % (data[0], str(int(data[1]) + 1)))
        date_value = list(zip(date_list, data[3:]))
        d_data = zip([db_name] * len(date_value), date_value)
        await save_data(d_data)
def get_weather(city_id,cursor,user_agent):
    """Scrape the hourly forecast of a city and insert it into the database.

    Page: ``http://www.weather.com.cn/weather1d/<city_id>.shtml#input``.
    The city name is read from the breadcrumb block; the forecast strings
    come from the ``1d`` property of the seventh inline body ``<script>``.
    Each string is split on ``,`` and inserted into ``forecast_24h``.

    Args:
        city_id: City identifier string used in the URL and stored per row.
        cursor: Database cursor used to execute the inserts. This function
            does not commit.
        user_agent: Value of the ``User-Agent`` request header.
    """
    url = 'http://www.weather.com.cn/weather1d/'+city_id+'.shtml#input'
    response = requests.get(url, headers={'User-Agent': user_agent})
    soup = BeautifulSoup(response.content, "lxml")

    crumbs = soup.find('div', class_='crumbs fl')
    span_list = crumbs.find_all('span')
    a_list = crumbs.find_all('a')
    area = span_list[3].text
    # For the '城区' breadcrumb the city name is taken from the third link.
    city_name = a_list[2].text if area == '城区' else area

    src = soup.select('body script')[6].string
    # js2xml turns the JavaScript into an element tree, which is then
    # serialized and re-parsed so it can be queried with XPath.
    js_tree = js2xml.parse(src, encoding='utf-8', debug=False)
    selector = etree.HTML(js2xml.pretty_print(js_tree))
    event_24hour = selector.xpath('//property[@name="1d"]/array/string/text()')

    # Each forecast string is a comma-separated record.
    list_hour = [tuple(entry.split(',')) for entry in event_24hour]
    print(list_hour)

    insert_sql = 'insert into forecast_24h(time,city_id,city_name,temperature,hour_desc,wind,wind_size) values(%s,%s,%s,%s,%s,%s,%s)'
    for each_hour in list_hour:
        row = (
            each_hour[0],
            city_id,
            city_name,
            each_hour[3],
            each_hour[2],
            each_hour[4],
            each_hour[5],
        )
        cursor.execute(insert_sql, row)
def parse(self, response):
    """Print the SKU id and the colour/size variants found in the page.

    The first ``<script>`` of the page head is converted to XML with js2xml.
    The ``skuId`` number values are printed, then for every object of the
    ``colorSize`` array its first number value and its comma-joined string
    values are printed.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    head_script = soup.select("html head script")[0].string

    # JavaScript -> XML text -> lxml tree that can be queried with XPath.
    js_tree = js2xml.parse(head_script, debug=False)
    selector = etree.HTML(js2xml.pretty_print(js_tree))

    sku_values = selector.xpath("//property[@name = 'skuId']/number/@value")
    print(sku_values)

    variants = selector.xpath("//property[@name = 'colorSize']/array/object")
    for variant in variants:
        # "@value" reads the attribute; "./" starts at the current node.
        variant_id = variant.xpath("./property/number/@value")[0]
        variant_label = ",".join(variant.xpath("./property/string/text()"))
        print(variant_id)
        print(variant_label)
def parse(self, response):
    """
    :param response: Response of the share page.
    :description: Reads ``itemId`` and ``dytk`` from the third inline body
        ``<script>`` and requests the item-info API with them.
    :return: Generator yielding one ``Request`` handled by
        ``self.parse_content``.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    script_source = soup.select('body script')[2].string

    # Convert the tags in the `script` to `xml`.
    js_tree = js2xml.parse(script_source, debug=False)
    selector = etree.HTML(js2xml.pretty_print(js_tree))

    # Get the elements in the converted `xml` tag.
    item_id = selector.xpath("//property[@name='itemId']/string/text()")[0]
    dytk = selector.xpath("//property[@name='dytk']/string/text()")[0]

    center_url = (
        'https://www.iesdouyin.com/web/api/v2/aweme/iteminfo/?item_ids='
        + item_id + '&dytk=' + dytk
    )
    yield Request(url=center_url, callback=self.parse_content)
def parse_reviews(self, response):
    """Parse one page of reviews delivered as JavaScript and follow paging.

    The response body is parsed with js2xml; the ``BVRRSourceID`` string of
    the ``materials`` variable holds the review HTML. Nothing is yielded
    when that string is absent or when it contains no review block.

    Values read from ``response.meta``:
        product: Required; must have a truthy ``source_internal_id``.
        filter_other_sources: When truthy, reviews carrying the syndicated
            content marker are not yielded.
        extra_review_parser: Forwarded to ``self._parse_review``.
        last_user_review: Cut-off date; looked up through
            ``incremental_utils`` when not supplied.

    Yields:
        Parsed reviews, then a ``Request`` for the next page when a next
        page reference exists.

    Raises:
        Exception: If the product has no ``source_internal_id``.
    """
    jstree = js2xml.parse(response.body)
    # NOTE(review): ``xml`` is not used anywhere below.
    xml = js2xml.pretty_print(jstree)
    html_xpath = "//var[@name='materials']/object/property[@name='BVRRSourceID']/string/text()"
    html = jstree.xpath(html_xpath)
    if html:
        selector = Selector(text=html[0])
        next_page_xpath = '(//*[contains(@class,"BVRRNextPage")])[1]/a/@data-bvjsref'
        review_list_xpath = '//*[contains(@class,"BVRRContentReview")]'
        from_product_url_xpath = ".//div[contains(@class, 'BVDI_SUAttribution')]//a[@class='BVDILink']/@href"
        from_another_source_xpath = ".//*[contains(@class,'BVRRSyndicatedContentAttribution')]"
        filter_other_sources = response.meta.get('filter_other_sources', None)
        extra_review_parser = response.meta.get('extra_review_parser', None)
        last_user_review = response.meta.get('last_user_review', None)
        product = response.meta['product']
        if not product["source_internal_id"]:
            raise Exception("BV Product without source_internal_id")
        if not last_user_review:
            # No cut-off supplied by the caller: look it up.
            last_user_review = incremental_utils.get_latest_user_review_date_by_sii(
                self.mysql_manager, self.source_id,
                product["source_internal_id"])
        review_list = selector.xpath(review_list_xpath)
        if not review_list:
            return
        for review_selector in review_list:
            skip_review = False
            if filter_other_sources:
                # A non-empty match marks the review as coming from
                # another source.
                skip_review = review_selector.xpath(from_another_source_xpath)
            from_product_url = self.extract_xpath(review_selector,
                                                  from_product_url_xpath)
            from_product = True
            if from_product_url:
                # The attribution link must mention this product's id.
                from_product = (product["source_internal_id"].lower() in
                                from_product_url.lower())
            review = self._parse_review(product, review_selector,
                                        extra_review_parser)
            if last_user_review:
                current_user_review = datetime.strptime(
                    review['TestDateText'], '%Y-%m-%d')
                # Stop completely (no paging either) at the first review
                # dated before the cut-off.
                if last_user_review > current_user_review:
                    return
            if from_product and not skip_review:
                yield review
        next_page_url = self.extract_xpath(selector, next_page_xpath)
        if next_page_url:
            # Reuse the headers of the current request for the next page.
            headers = response.request.headers
            request = Request(next_page_url, callback=self.parse_reviews,
                              headers=headers)
            request.meta['product'] = product
            request.meta['last_user_review'] = last_user_review
            request.meta['filter_other_sources'] = filter_other_sources
            request.meta['extra_review_parser'] = extra_review_parser
            yield request
def process_Many(self, response):
    """Build one ``HudongbaItem`` from an event page and yield it.

    Start and end dates come from the ``_oldStartDate`` / ``_oldEndDate``
    properties of the seventh ``<script>`` in the page head. A value that is
    16 characters long is used as-is; any other value is prefixed with the
    current year and ``-``. URL, title, image and address are read from the
    page HTML. Once the consumer resumes the generator after the yield, the
    current timestamp is stored in Redis under the key ``hen``.

    Yields:
        HudongbaItem: The populated event item.
    """
    result = redis.Redis(host='localhost', port=6379, decode_responses=True)
    eventItem = HudongbaItem()
    resp = response.text
    banner = 'treeeeeeeeeeeeeeeeeeeeeeeeeeeee'

    # Read the values out of the script, converted to XML text.
    soup = BeautifulSoup(resp, 'lxml')
    head_script = soup.select('head script')[6].string
    js_tree = js2xml.parse(head_script, debug=False)
    xml_text = js2xml.pretty_print(js_tree)
    print(banner)
    selector = etree.HTML(xml_text)

    start = selector.xpath(
        "//property[@name = '_oldStartDate']/string/text()")[0]
    print(start)
    print(len(start))
    end = selector.xpath(
        "//property[@name = '_oldEndDate']/string/text()")[0]
    print(end)
    print(len(end))

    year_prefix_start = datetime.datetime.now().strftime('%Y-')
    year_prefix_end = datetime.datetime.now().strftime('%Y-')
    eventItem['startDate'] = (
        start if len(start) == 16 else year_prefix_start + start)
    eventItem['endDate'] = end if len(end) == 16 else year_prefix_end + end
    print(banner)

    page = etree.HTML(resp)
    eventItem['url'] = response.url
    image_alt = page.xpath("//div[@class='content-body_head_l']/img/@alt")[0]
    eventItem['title'] = image_alt.replace("互动吧-", "")
    eventItem['imageUrl'] = page.xpath(
        "//div[@class='content-body_head_l']/img/@src")[0]
    eventItem['address'] = page.xpath(
        "//div[@class='detail_Attr']/a/text()")[0]
    eventItem['author'] = ""
    eventItem['description'] = ""

    print('000000000000')
    for field in ('title', 'url', 'imageUrl', 'address', 'startDate',
                  'endDate'):
        print(eventItem[field])
    print('000000000000')

    yield eventItem

    finished_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    result.set('hen', finished_at)
import js2xml
from lxml import etree
import random

# Download every image referenced by the ``img_url`` properties of the
# page's first inline body script into ``folder_path``.
url = 'http://www.baidu.com/link?url=0y0HQpNoALTsyOd0j-MsnEykrBtifZ5KoCsz3M-YWsc9q7ofOfCaZvVeZc-Hljv2'
headers = {
    'User-Agent': 'Mozilla/5.0',
    'referer': 'https://www.meipian.cn/'
}
response = requests.get(url, headers=headers)  # headers avoid being blocked
soup = BeautifulSoup(response.content, 'lxml')
# Data lives inside the first inline script of the body.
script = soup.select("body script")[0].string
# js2xml turns the script into an lxml element ...
script_text = js2xml.parse(script, encoding='utf-8', debug=False)
# ... which is serialized to a string, since etree.HTML parses strings.
script_tree = js2xml.pretty_print(script_text)
seletor = etree.HTML(script_tree)
# Collect the image URLs with XPath.
img_urls = seletor.xpath('//property[@name="img_url"]/string/text()')

folder_path = 'D:/photo/'
# Create the target folder when it does not exist yet.
os.makedirs(folder_path, exist_ok=True)

for img_url in img_urls:
    picture = requests.get(img_url)
    # Ten distinct random lowercase letters joined into a plain file name
    # (str() of the sampled list would embed brackets, quotes and commas).
    random_name = ''.join(random.sample('zyxwvutsrqponmlkjihgfedcba', 10))
    img_name = folder_path + random_name + '.jpg'
    with open(img_name, 'wb') as file:  # write the image as raw bytes
        file.write(picture.content)
def parse_case(self, response):
    """Fill a ``JiaminAssetsItem`` from the data scripts of the page.

    Every non-empty inline body ``<script>`` is converted to XML with
    js2xml. Objects under ``defaultDataFeatureProperty`` and then under
    ``defaultData`` are copied into the same item instance; the item is
    yielded once per ``defaultData`` object.

    Yields:
        JiaminAssetsItem: The shared item, after each ``defaultData`` object
        has been copied into it.
    """
    item = JiaminAssetsItem()
    soup = BeautifulSoup(response.text, 'html')

    def first_text(node, prop):
        # First string value of the named property below ``node``.
        return str(
            node.xpath(".//property[@name='%s']/string/text()" % prop)[0])

    def fill_item(node):
        # Copy the fields of one data object into the shared item.
        item['assetstitle'] = first_text(node, 'title')
        item['assetaddress'] = first_text(node, 'address')
        item['assettedian'] = first_text(node, 'description')
        item['area'] = first_text(node, 'sortMinSize')
        if first_text(node, 'typetext'):
            item['assetwuyeleibie'] = first_text(node, 'typetext')

    for script in soup.select('body script'):
        if not script.string:
            continue
        js_tree = js2xml.parse(script.string, debug=False)
        selector = etree.HTML(js2xml.pretty_print(js_tree))

        for obj in selector.xpath(
                "//var[@name='defaultDataFeatureProperty']//object"):
            fill_item(obj)

        for obj in selector.xpath("//var[@name='defaultData']//object"):
            fill_item(obj)
            yield item
import sys
import js2xml

# Read JavaScript from standard input; fall back to a small sample object
# when the input is empty.
text = sys.stdin.read()
if not text:
    text = """ var x = { "key1": "value1", "key2": "value2", "key3": 1, "key4": false }; """

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the print statement is a syntax error on Python 3.
print(text)
tree = js2xml.parse(text, debug=False)
print(js2xml.pretty_print(tree))
import sys
import js2xml

# Echo the JavaScript read from standard input (or a small sample object
# when the input is empty), followed by its js2xml XML representation.
text = sys.stdin.read() or """ var x = { "key1": "value1", "key2": "value2", "key3": 1, "key4": false }; """
print(text)

tree = js2xml.parse(text, debug=False)
pretty_xml = js2xml.pretty_print(tree)
print(pretty_xml)
# Crawl the listing pages of service.account.weibo.com and, for every entry
# linked from a listing, parse the detail page script that mentions
# 'pl_service_common'.
# NOTE: this initial value is overwritten on the first loop iteration.
url = 'https://service.account.weibo.com/index?type=5&status=0&page=2'
# Listing pages 1 to 249.
for i in range(1,250):
    url ='https://service.account.weibo.com/index?type=5&status=0&page={}'.format(i)
    r = requests.get(url, headers=headers, cookies=cookies)
    r.encoding = 'utf-8'
    response = etree.HTML(r.text)
    script_list = response.xpath('//script/text()')
    # The last script of the listing page is the one that gets parsed.
    script_text = js2xml.parse(script_list[-1], encoding='utf-8', debug=False)
    script_tree = js2xml.pretty_print(script_text)
    selector = etree.HTML(script_tree)
    # The 'html' property holds an HTML fragment, which is parsed again.
    div_selector = selector.xpath("//program//property[@name='html']/string/text()")[0]
    div_tree_se = etree.HTML(div_selector)
    url_list = div_tree_se.xpath("//div[@class='m_table_tit']/a/@href")
    # NOTE: the inner loop rebinds url, r, response, script_list and
    # script_text; the outer loop reassigns them at the top of each pass.
    for url in url_list:
        url_pre = "https://service.account.weibo.com"
        # The hrefs are joined onto the site root.
        url_com = url_pre + url
        r = requests.get(url_com, headers=headers, cookies=cookies)
        response = etree.HTML(r.text)
        script_list = response.xpath('//script/text()')
        # Keep only the scripts that mention 'pl_service_common'.
        filter_script = [script for script in script_list if script.find('pl_service_common') != -1]
        script_text = js2xml.parse(filter_script[0], encoding='utf-8', debug=False)
def test_schema(): jscode_snippets = [ # strings (r""" "test"; """, """ <program> <string>test</string> </program> """), (r""" "test\ multiline"; """, """ <program> <string>test multiline</string> </program> """), # numbers ("3.14;", """ <program> <number value="3.14"/> </program> """), ("-12;", """ <program> <number value="-12"/> </program> """), ("3.45e2;", """ <program> <number value="3.45e2"/> </program> """), ("0377;", """ <program> <number value="0377"/> </program> """), ("0xFF;", """ <program> <number value="0xFF"/> </program> """), # arrays ("[]", """ <program> <array/> </program> """), ("[1,2]", """ <program> <array> <number value="1"/> <number value="2"/> </array> </program> """), ("[1,,2]", """ <program> <array> <number value="1"/> <undefined/> <number value="2"/> </array> </program> """), ("[1,,2,,,3,]", """ <program> <array> <number value="1"/> <undefined/> <number value="2"/> <undefined/> <undefined/> <number value="3"/> </array> </program> """), ("['a', 'b','c']", """ <program> <array> <string>a</string> <string>b</string> <string>c</string> </array> </program> """), ("[a, 'b', c]", """ <program> <array> <identifier name="a"/> <string>b</string> <identifier name="c"/> </array> </program> """), # objects ("o = {};", """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object/> </right> </assign> </program> """), ("o = {a: 1};", """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="a"> <number value="1"/> </property> </object> </right> </assign> </program> """), ("o = {a: 1, b: 2};", """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="a"> <number value="1"/> </property> <property name="b"> <number value="2"/> </property> </object> </right> </assign> </program> """), ("o = {'c': 1, 'd': 2};", """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="c"> <number 
value="1"/> </property> <property name="d"> <number value="2"/> </property> </object> </right> </assign> </program> """), ('o = {"c": 1, "d": 2};', """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="c"> <number value="1"/> </property> <property name="d"> <number value="2"/> </property> </object> </right> </assign> </program> """), ('o = {"c": 1, d: "e"};', """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="c"> <number value="1"/> </property> <property name="d"> <string>e</string> </property> </object> </right> </assign> </program> """), ("e = {foo: 5, bar: 6, baz: ['Baz', 'Content']};", """ <program> <assign operator="="> <left> <identifier name="e"/> </left> <right> <object> <property name="foo"> <number value="5"/> </property> <property name="bar"> <number value="6"/> </property> <property name="baz"> <array> <string>Baz</string> <string>Content</string> </array> </property> </object> </right> </assign> </program> """), # other primitive data types ("null;", """ <program> <null/> </program> """), ("undefined;", """ <program> <undefined/> </program> """), ("true;", """ <program> <boolean>true</boolean> </program> """), ("false;", """ <program> <boolean>false</boolean> </program> """), # variables (r""" var i; """, """ <program> <var name="i"/> </program> """), (r""" var i,j,k; """, """ <program> <var name="i"/> <var name="j"/> <var name="k"/> </program> """), (r""" var i = 0; """, """ <program> <var name="i"> <number value="0"/> </var> </program> """), (r""" var i = "test"; """, """ <program> <var name="i"> <string>test</string> </var> </program> """), (r"""var z = 'foxes', r = 'birds';""", """ <program> <var name="z"> <string>foxes</string> </var> <var name="r"> <string>birds</string> </var> </program> """), (r""" var i, j, k = 0; """, """ <program> <var name="i"/> <var name="j"/> <var name="k"> <number value="0"/> </var> </program> """), (r""" var i=1, 
j, k = 2; """, """ <program> <var name="i"> <number value="1"/> </var> <var name="j"/> <var name="k"> <number value="2"/> </var> </program> """), (r""" var i = obj.prop; """, """ <program> <var name="i"> <dotaccessor> <object> <identifier name="obj"/> </object> <property> <identifier name="prop"/> </property> </dotaccessor> </var> </program> """), (r"""var testObj = {};""", """ <program> <var name="testObj"> <object/> </var> </program> """), (r"""var testObj = [];""", """ <program> <var name="testObj"> <array/> </var> </program> """), # operations (r""" 1 + 2; "foo" + false; 3 - 5 """, """ <program> <binaryoperation operation="+"> <left> <number value="1"/> </left> <right> <number value="2"/> </right> </binaryoperation> <binaryoperation operation="+"> <left> <string>foo</string> </left> <right> <boolean>false</boolean> </right> </binaryoperation> <binaryoperation operation="-"> <left> <number value="3"/> </left> <right> <number value="5"/> </right> </binaryoperation> </program> """), (r""" 1.0 / 2.0; -2 * 2; 12 % 5; """, """ <program> <binaryoperation operation="/"> <left> <number value="1.0"/> </left> <right> <number value="2.0"/> </right> </binaryoperation> <binaryoperation operation="*"> <left> <number value="-2"/> </left> <right> <number value="2"/> </right> </binaryoperation> <binaryoperation operation="%"> <left> <number value="12"/> </left> <right> <number value="5"/> </right> </binaryoperation> </program> """), (r""" // Postfix var x = 3; y = x++; // y = 3, x = 4 // Prefix var a = 2; b = ++a; // a = 3, b = 3 """, """ <program> <var name="x"> <number value="3"/> </var> <assign operator="="> <left> <identifier name="y"/> </left> <right> <postfix operation="++"> <identifier name="x"/> </postfix> </right> </assign> <var name="a"> <number value="2"/> </var> <assign operator="="> <left> <identifier name="b"/> </left> <right> <unaryoperation operation="++"> <identifier name="a"/> </unaryoperation> </right> </assign> </program> """), (r""" // Postfix var x = 3; y = 
x--; // y = 3, x = 2 // Prefix var a = 2; b = --a; // a = 1, b = 1 """, """ <program> <var name="x"> <number value="3"/> </var> <assign operator="="> <left> <identifier name="y"/> </left> <right> <postfix operation="--"> <identifier name="x"/> </postfix> </right> </assign> <var name="a"> <number value="2"/> </var> <assign operator="="> <left> <identifier name="b"/> </left> <right> <unaryoperation operation="--"> <identifier name="a"/> </unaryoperation> </right> </assign> </program> """), (r""" var x = 3; y = -x; // y = -3, x = 3 """, """ <program> <var name="x"> <number value="3"/> </var> <assign operator="="> <left> <identifier name="y"/> </left> <right> <unaryoperation operation="-"> <identifier name="x"/> </unaryoperation> </right> </assign> </program> """), (r""" +3; // 3 +"3"; // 3 +true; // 1 +false; // 0 +null; // 0 """, """ <program> <number value="+3"/> <unaryoperation operation="+"> <string>3</string> </unaryoperation> <unaryoperation operation="+"> <boolean>true</boolean> </unaryoperation> <unaryoperation operation="+"> <boolean>false</boolean> </unaryoperation> <unaryoperation operation="+"> <null/> </unaryoperation> </program> """), # assignements (r""" i = b; """, """ <program> <assign operator="="> <left> <identifier name="i"/> </left> <right> <identifier name="b"/> </right> </assign> </program> """), (r""" i.a = "b"; """, """ <program> <assign operator="="> <left> <dotaccessor> <object> <identifier name="i"/> </object> <property> <identifier name="a"/> </property> </dotaccessor> </left> <right> <string>b</string> </right> </assign> </program> """), (r""" i["a"] = "b"; """, """ <program> <assign operator="="> <left> <bracketaccessor> <object> <identifier name="i"/> </object> <property> <string>a</string> </property> </bracketaccessor> </left> <right> <string>b</string> </right> </assign> </program> """), (r""" i[a] = "b"; """, """ <program> <assign operator="="> <left> <bracketaccessor> <object> <identifier name="i"/> </object> <property> <identifier 
name="a"/> </property> </bracketaccessor> </left> <right> <string>b</string> </right> </assign> </program> """), # control structures (r""" if (condition) { result = expression; }""", """ <program> <if> <predicate> <identifier name="condition"/> </predicate> <then> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="expression"/> </right> </assign> </block> </then> </if> </program> """), (r""" if (condition) { result = expression; } else { result = alternative; }""", """ <program> <if> <predicate> <identifier name="condition"/> </predicate> <then> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="expression"/> </right> </assign> </block> </then> <else> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="alternative"/> </right> </assign> </block> </else> </if> </program> """), (r""" if (exprA == exprB) { result = expression; } else if (expr2) { result = alternative1; } else { result = alternative2; }""", """ <program> <if> <predicate> <binaryoperation operation="=="> <left> <identifier name="exprA"/> </left> <right> <identifier name="exprB"/> </right> </binaryoperation> </predicate> <then> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="expression"/> </right> </assign> </block> </then> <else> <if> <predicate> <identifier name="expr2"/> </predicate> <then> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="alternative1"/> </right> </assign> </block> </then> <else> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="alternative2"/> </right> </assign> </block> </else> </if> </else> </if> </program> """), ("result = condition ? 
expression : alternative;", """ <program> <assign operator="="> <left> <identifier name="result"/> </left> <right> <conditional> <condition> <identifier name="condition"/> </condition> <value1> <identifier name="expression"/> </value1> <value2> <identifier name="alternative"/> </value2> </conditional> </right> </assign> </program> """), # switch (r""" switch (expr) { case SOMEVALUE: //statements; break; case ANOTHERVALUE: //statements; break; default: //statements; break; } """, """ <program> <switch> <expression> <identifier name="expr"/> </expression> <case> <expression> <identifier name="SOMEVALUE"/> </expression> <break/> </case> <case> <expression> <identifier name="ANOTHERVALUE"/> </expression> <break/> </case> <default> <break/> </default> </switch> </program> """), # for loop (r""" for (var i = 0; i < 5; i++) { a = i; } """, """ <program> <for> <init> <var name="i"> <number value="0"/> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <number value="5"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="i"/> </postfix> </post> <statement> <block> <assign operator="="> <left> <identifier name="a"/> </left> <right> <identifier name="i"/> </right> </assign> </block> </statement> </for> </program> """), (r""" for (var i = 0; i < 5; i++) { a = i } """, """ <program> <for> <init> <var name="i"> <number value="0"/> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <number value="5"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="i"/> </postfix> </post> <statement> <block> <assign operator="="> <left> <identifier name="a"/> </left> <right> <identifier name="i"/> </right> </assign> </block> </statement> </for> </program> """), (r""" for (var key in array) { continue; } """, """ <program> <forin> <variable> <var name="key"/> </variable> <object> <identifier name="array"/> 
</object> <statement> <block> <continue/> </block> </statement> </forin> </program> """), (r""" for (;;) { break; } """, """ <program> <for> <init> <empty>;</empty> </init> <condition> <empty>;</empty> </condition> <statement> <block> <break/> </block> </statement> </for> </program> """), (r""" for (; i < len; i++) { j = i; } """, """ <program> <for> <init> <empty>;</empty> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <identifier name="len"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="i"/> </postfix> </post> <statement> <block> <assign operator="="> <left> <identifier name="j"/> </left> <right> <identifier name="i"/> </right> </assign> </block> </statement> </for> </program> """), (r""" for (var i = 0, len = cars.length, text = ""; i < len; i++) { text += cars[i] + "<br>"; } """, """ <program> <for> <init> <var name="i"> <number value="0"/> </var> <var name="len"> <dotaccessor> <object> <identifier name="cars"/> </object> <property> <identifier name="length"/> </property> </dotaccessor> </var> <var name="text"> <string></string> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <identifier name="len"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="i"/> </postfix> </post> <statement> <block> <assign operator="+="> <left> <identifier name="text"/> </left> <right> <binaryoperation operation="+"> <left> <bracketaccessor> <object> <identifier name="cars"/> </object> <property> <identifier name="i"/> </property> </bracketaccessor> </left> <right> <string><br></string> </right> </binaryoperation> </right> </assign> </block> </statement> </for> </program> """), (""" for (; i < len; ) { text += cars[i] + "<br>"; i++; } """, """ <program> <for> <init> <empty>;</empty> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <identifier 
name="len"/> </right> </binaryoperation> </condition> <statement> <block> <assign operator="+="> <left> <identifier name="text"/> </left> <right> <binaryoperation operation="+"> <left> <bracketaccessor> <object> <identifier name="cars"/> </object> <property> <identifier name="i"/> </property> </bracketaccessor> </left> <right> <string><br></string> </right> </binaryoperation> </right> </assign> <postfix operation="++"> <identifier name="i"/> </postfix> </block> </statement> </for> </program> """), # while loop (""" while (a<b) { a+=1; } """, """ <program> <while> <predicate> <binaryoperation operation="<"> <left> <identifier name="a"/> </left> <right> <identifier name="b"/> </right> </binaryoperation> </predicate> <statement> <block> <assign operator="+="> <left> <identifier name="a"/> </left> <right> <number value="1"/> </right> </assign> </block> </statement> </while> </program> """), (""" do { a+=1; } while (a<b); """, """ <program> <statement> <block> <assign operator="+="> <left> <identifier name="a"/> </left> <right> <number value="1"/> </right> </assign> </block> </statement> <while> <binaryoperation operation="<"> <left> <identifier name="a"/> </left> <right> <identifier name="b"/> </right> </binaryoperation> </while> </program> """), # with (""" with (document) { var a = getElementById('a'); var b = getElementById('b'); var c = getElementById('c'); var c = document.get('c'); }; """, """ <program> <with> <identifier name="document"/> <statement> <block> <var name="a"> <functioncall> <function> <identifier name="getElementById"/> </function> <arguments> <string>a</string> </arguments> </functioncall> </var> <var name="b"> <functioncall> <function> <identifier name="getElementById"/> </function> <arguments> <string>b</string> </arguments> </functioncall> </var> <var name="c"> <functioncall> <function> <identifier name="getElementById"/> </function> <arguments> <string>c</string> </arguments> </functioncall> </var> <var name="c"> <functioncall> <function> 
<dotaccessor> <object> <identifier name="document"/> </object> <property> <identifier name="get"/> </property> </dotaccessor> </function> <arguments> <string>c</string> </arguments> </functioncall> </var> </block> </statement> </with> <empty>;</empty> </program> """), # label (r""" loop1: for (var a = 0; a < 10; a++) { if (a == 4) { break loop1; // Stops after the 4th attempt } alert('a = ' + a); loop2: for (var b = 0; b < 10; ++b) { if (b == 3) { continue loop2; // Number 3 is skipped } if (b == 6) { continue loop1; // Continues the first loop, 'finished' is not shown } alert('b = ' + b); } alert('finished') } block1: { alert('hello'); // Displays 'hello' break block1; alert('world'); // Will never get here } """, """ <program> <label name="loop1"> <statement> <for> <init> <var name="a"> <number value="0"/> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="a"/> </left> <right> <number value="10"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="a"/> </postfix> </post> <statement> <block> <if> <predicate> <binaryoperation operation="=="> <left> <identifier name="a"/> </left> <right> <number value="4"/> </right> </binaryoperation> </predicate> <then> <block> <break> <identifier name="loop1"/> </break> </block> </then> </if> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <binaryoperation operation="+"> <left> <string>a = </string> </left> <right> <identifier name="a"/> </right> </binaryoperation> </arguments> </functioncall> <label name="loop2"> <statement> <for> <init> <var name="b"> <number value="0"/> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="b"/> </left> <right> <number value="10"/> </right> </binaryoperation> </condition> <post> <unaryoperation operation="++"> <identifier name="b"/> </unaryoperation> </post> <statement> <block> <if> <predicate> <binaryoperation operation="=="> <left> <identifier name="b"/> </left> 
<right> <number value="3"/> </right> </binaryoperation> </predicate> <then> <block> <continue> <identifier name="loop2"/> </continue> </block> </then> </if> <if> <predicate> <binaryoperation operation="=="> <left> <identifier name="b"/> </left> <right> <number value="6"/> </right> </binaryoperation> </predicate> <then> <block> <continue> <identifier name="loop1"/> </continue> </block> </then> </if> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <binaryoperation operation="+"> <left> <string>b = </string> </left> <right> <identifier name="b"/> </right> </binaryoperation> </arguments> </functioncall> </block> </statement> </for> </statement> </label> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>finished</string> </arguments> </functioncall> </block> </statement> </for> </statement> </label> <label name="block1"> <statement> <block> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>hello</string> </arguments> </functioncall> <break> <identifier name="block1"/> </break> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>world</string> </arguments> </functioncall> </block> </statement> </label> </program> """), # functions (""" function foo(p) { p = "bar"; } """, """ <program> <funcdecl name="foo"> <parameters> <identifier name="p"/> </parameters> <body> <assign operator="="> <left> <identifier name="p"/> </left> <right> <string>bar</string> </right> </assign> </body> </funcdecl> </program> """), (""" function hello() { alert('world'); } """, """ <program> <funcdecl name="hello"> <parameters/> <body> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>world</string> </arguments> </functioncall> </body> </funcdecl> </program> """), (""" var anon = function() { alert('I am anonymous'); }; """, """ <program> <var name="anon"> <funcexpr> <identifier/> <parameters/> <body> <functioncall> <function> 
<identifier name="alert"/> </function> <arguments> <string>I am anonymous</string> </arguments> </functioncall> </body> </funcexpr> </var> </program> """), (""" anon(); """, """ <program> <functioncall> <function> <identifier name="anon"/> </function> <arguments/> </functioncall> </program> """), (""" setTimeout(function() { alert('hello'); }, 1000) """, """ <program> <functioncall> <function> <identifier name="setTimeout"/> </function> <arguments> <funcexpr> <identifier/> <parameters/> <body> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>hello</string> </arguments> </functioncall> </body> </funcexpr> <number value="1000"/> </arguments> </functioncall> </program> """), (""" (function() { alert('foo'); }()); """, """ <program> <groupingoperator> <functioncall> <function> <funcexpr> <identifier/> <parameters/> <body> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>foo</string> </arguments> </functioncall> </body> </funcexpr> </function> <arguments/> </functioncall> </groupingoperator> </program> """), # get/set (""" var obj = { get latest () { return "latest"; } } """, """ <program> <var name="obj"> <object> <get> <property> <identifier name="latest"/> </property> <body> <return> <string>latest</string> </return> </body> </get> </object> </var> </program> """), (""" delete obj.latest; """, """ <program> <unaryoperation operation="delete"> <dotaccessor> <object> <identifier name="obj"/> </object> <property> <identifier name="latest"/> </property> </dotaccessor> </unaryoperation> </program> """), (""" var o = { set current (str) { return this.log[this.log.length] = str; }, log: [] } """, """ <program> <var name="o"> <object> <set> <body> <return> <assign operator="="> <left> <bracketaccessor> <object> <dotaccessor> <object> <identifier>this</identifier> </object> <property> <identifier name="log"/> </property> </dotaccessor> </object> <property> <dotaccessor> <object> <dotaccessor> <object> 
<identifier>this</identifier> </object> <property> <identifier name="log"/> </property> </dotaccessor> </object> <property> <identifier name="length"/> </property> </dotaccessor> </property> </bracketaccessor> </left> <right> <identifier name="str"/> </right> </assign> </return> </body> </set> <property name="log"> <array/> </property> </object> </var> </program> """), ] for snippet, expected in jscode_snippets: print("---------------------------------------------------------") print(snippet) js = js2xml.parse(snippet) output = js2xml.pretty_print(js).strip() assert_equal(output, expected.strip(), "got\n%s\nexpected:\n%s" % (output, expected))
def get_comment_info(self, id):
    """Scrape one Weibo post together with its comments and store them.

    The post's detail page is fetched and the JavaScript object embedded in
    the first ``<script>`` of ``<body>`` is converted to XML with js2xml so
    that its fields can be read with XPath.  The post's pictures are
    downloaded into ``webview/static`` under the current working directory,
    the post is saved unless a row with the same ``wb_id`` already exists,
    and the comment API is then paged through until it stops answering
    ``ok == 1``.  Comments that are not yet in the database are appended to
    ``self.comment_list_to_insert`` and written with a single
    ``bulk_create`` at the end.

    Args:
        id: Identifier of the post as a string; it is concatenated into the
            detail-page URL and the comment-API URL.

    Returns:
        The string ``"数据抓取完毕"`` when the comments were written, or the
        tuple ``("e:", exception)`` when a database operation raised.
    """
    c_urls = 'https://m.weibo.cn/api/comments/show?id=' + id + '&page={}'
    wb_url = 'https://m.weibo.cn/detail/' + id

    wb_r = requests.get(wb_url, headers=self.agent,
                        cookies=self.cookie).content
    soup = BeautifulSoup(wb_r, 'lxml')
    src = soup.select('body script')[0].string
    src_tree = js2xml.pretty_print(js2xml.parse(src, debug=False))
    xpath = etree.HTML(src_tree).xpath

    # Most properties are taken from the second XPath match ([1]); only
    # screen_name and the numeric counters use the first match ([0]).
    wb_id = xpath("//property[@name='id']//text()")[1]
    wb_userName = xpath(
        "//property[@name='screen_name']/string//text()")[0]
    wb_userId = xpath(
        "//property[@name='profile_url']//text()")[1].split('uid=')[1]
    wb_user_profile_image_url = xpath(
        "//property[@name='profile_image_url']//text()")[1]
    wb_created_at = xpath("//property[@name='created_at']//text()")[1]
    wb_source = xpath("//property[@name='source']//text()")[1]
    wb_text = xpath("//property[@name='text']//text()")[1]
    # Picture ids map to image URLs as follows:
    #   https://wx2.sinaimg.cn/large/<id>      (large image)
    #   http://wx2.sinaimg.cn/bmiddle/<id>     (medium image)
    #   https://wx2.sinaimg.cn/thumbnail/<id>  (small image)
    wb_pic_ids = xpath("//property[@name='pic_ids']/array/string//text()")
    wb_reposts = xpath("//property[@name='reposts_count']//@value")[0]
    wb_comments = xpath("//property[@name='comments_count']//@value")[0]
    wb_like = xpath("//property[@name='attitudes_count']//@value")[0]

    # Only non-empty values are copied onto the model instance.
    commentWeiboInfo = CommentWeiboInfo()
    if wb_id:
        commentWeiboInfo.wb_id = wb_id
    if wb_userName:
        commentWeiboInfo.wb_userName = wb_userName
    if wb_userId:
        commentWeiboInfo.wb_userId = wb_userId
    if wb_user_profile_image_url:
        commentWeiboInfo.wb_user_profile_image_url = wb_user_profile_image_url
    if wb_created_at:
        # NOTE(review): the post timestamp goes through self.fix_time while
        # comment timestamps below use self.time_fix -- confirm that both
        # helpers exist and that the difference is intentional.
        commentWeiboInfo.wb_created_at = self.fix_time(wb_created_at)
    if wb_source:
        commentWeiboInfo.wb_source = wb_source
    if wb_text:
        commentWeiboInfo.wb_text = wb_text
    if wb_pic_ids:
        commentWeiboInfo.wb_pic_ids = wb_pic_ids
        filepath = path.abspath(path.join(os.getcwd(), "webview/static"))
        print(filepath)
        for wb_pic_id in wb_pic_ids:
            image_url = "https://wx2.sinaimg.cn/large/" + wb_pic_id
            # Join with the platform separator; a literal backslash only
            # yields a file inside `filepath` on Windows.
            image_file = path.join(filepath, wb_pic_id + ".jpg")
            with urllib.request.urlopen(image_url, timeout=30) as response:
                # The context manager flushes and closes the file on exit.
                with open(image_file, 'wb') as f_save:
                    print("下载图片%s" % wb_pic_id)
                    f_save.write(response.read())
    if wb_reposts:
        commentWeiboInfo.wb_reposts = int(wb_reposts)
    if wb_comments:
        commentWeiboInfo.wb_comments = int(wb_comments)
    if wb_like:
        commentWeiboInfo.wb_like = int(wb_like)

    # Save the post only when no row with the same wb_id exists yet.
    try:
        CommentWeiboInfo.objects.get(wb_id=commentWeiboInfo.wb_id)
        print("微博内容已存在数据库")
    except CommentWeiboInfo.DoesNotExist:
        print("微博内容抓取完毕,开始写入数据库")
        commentWeiboInfo.save()
        print("微博内容写入数据库成功,开始抓取评论")
    except Exception as e:
        return "e:", e

    i = 1
    comment_num = 1
    while True:
        r = requests.get(url=c_urls.format(i), headers=self.agent,
                         cookies=self.cookie)
        payload = r.json()  # decode the response body once per page
        if int(payload['ok']) != 1:
            # Any other status ends the paging loop.
            print("跳出while=======================")
            break

        comment_data = payload['data']['data']
        print('正在读取第 %s 页评论:' % i)
        for user in comment_data:
            commentInfo = CommentInfo()
            print('第 %s 条评论' % comment_num)
            c_id = user['id']
            c_created_at = user['created_at']
            # Strip characters outside the Basic Multilingual Plane and
            # surrogate pairs from the source label.
            c_source = re.sub(
                '[\U00010000-\U0010ffff]|[\uD800-\uDBFF][\uDC00-\uDFFF]',
                '', user['source'])
            c_user_id = user['user']['id']
            c_user_name = user['user']['screen_name']
            c_user_img = user['user']['profile_image_url']
            c_user_url = user['user']['profile_url']
            # Same clean-up for the text, plus removal of HTML tags.
            c_text = re.sub(
                '<.*?>|回复<.*?>:|[\U00010000-\U0010ffff]|[\uD800-\uDBFF][\uDC00-\uDFFF]',
                '', user['text'])
            c_likenum = user['like_counts']

            if id:
                commentInfo.CommentWeiboInfo_id = id
            if c_id:
                commentInfo.c_id = c_id
            if c_created_at:
                commentInfo.c_created_at = self.time_fix(c_created_at)
            if c_source:
                commentInfo.c_source = c_source
            if c_user_id:
                commentInfo.c_userId = c_user_id
            if c_user_name:
                commentInfo.c_user_name = c_user_name
            if c_user_img:
                commentInfo.C_profile_image_url = c_user_img
            if c_user_url:
                commentInfo.C_profile_url = c_user_url
            if c_text:
                commentInfo.c_text = c_text
            if c_likenum:
                commentInfo.c_like_num = int(c_likenum)
            comment_num += 1

            # Queue the comment only if it is not stored already.
            try:
                CommentInfo.objects.get(c_id=commentInfo.c_id)
                print("评论已存在数据库")
            except CommentInfo.DoesNotExist:
                self.comment_list_to_insert.append(commentInfo)
            print(len(self.comment_list_to_insert))
        i += 1
        time.sleep(2)  # pause between pages

    # NOTE(review): self.comment_list_to_insert is not cleared here;
    # presumably the caller resets it between calls -- confirm.
    try:
        print("评论抓取完毕,开始写入数据库")
        CommentInfo.objects.bulk_create(self.comment_list_to_insert)
        print("评论写入数据库成功")
        return "数据抓取完毕"
    except Exception as e:
        return "e:", e
# Fetch a Tuniu hotel listing page and print the name, detail URL and level
# of every hotel found in the JavaScript data embedded in the page.
#
# NOTE(review): `url` is read by requests.get() below but is not assigned
# anywhere above it in this fragment, so it must already be defined by
# earlier code -- confirm.

# Fixed request headers. The Cookie value is a hard-coded blob (it appears to
# have been copied from a browser session -- confirm) and is sent unchanged.
headers = {
    'Cookie': 'tuniuuser_citycode=MjAw; p_phone_400=4007-999-999; p_phone_level=0; p_global_phone=%2B0086-25-8685-9999; tuniu_partner=MTQwMCwwLCwzMTExMWViZjMxNTgyMWUxOTcwZWE0YTAzNzZhMDRjMw%3D%3D; _tacau=MCxmZWI4N2Q3Zi00OGQyLTQwM2MtYzcxZi0yMDkxZTI5MjllYTQs; _tact=ODk2NmJlNTAtNDQ0My0yMmY0LThjYmQtOTNjODIyMzM4ZmJi; _tacz2=taccsr%3Dbaidu%7Ctacccn%3D%28organic%29%7Ctaccmd%3Dmkt_06002401%7Ctaccct%3D%2525E9%252580%252594%2525E7%252589%25259B%2525E6%252597%252585%2525E6%2525B8%2525B8%2525E7%2525BD%252591%7Ctaccrt%3D%28none%29; _taca=1556873527993.1556873527993.1556873527993.1; _tacb=M2E0NzJhZjUtYWFjOC05NWY4LTZkZTItYmJkMTNiNDc4MWY0; _tacc=1; PageSwitch=1%2C213612736; __utma=1.99986577.1556873529.1556873529.1556873529.1; __utmc=1; __utmz=1.1556873529.1.1.utmcsr=baidu|utmccn=brand|utmcmd=brand|utmctr=%E9%80%94%E7%89%9B%E6%97%85%E6%B8%B8%E7%BD%91; Hm_lvt_51d49a7cda10d5dd86537755f081cc02=1556873529; OLBSESSID=r2vvonrso0iterbqr3uvl4ed55; tuniu_searched=a%3A1%3A%7Bi%3A0%3Ba%3A2%3A%7Bs%3A7%3A%22keyword%22%3Bs%3A6%3A%22%E5%8C%97%E4%BA%AC%22%3Bs%3A4%3A%22link%22%3Bs%3A50%3A%22http%3A%2F%2Fwww.tuniu.com%2Fg200%2Fwhole-bj-0%2Flist-h0-j0_0%2F%22%3B%7D%7D; MOBILE_APP_SETTING_OPEN-126=1; isHaveShowPriceTips=1; tuniuuser_ip_citycode=MjAw; hotel_checkindate=2019-05-04; hotel_checkoutdate=2019-05-05; __utma=1.99986577.1556873529.1556873529.1556873529.1; __utmc=1; __utmz=1.1556873529.1.1.utmcsr=baidu|utmccn=brand|utmcmd=brand|utmctr=%E9%80%94%E7%89%9B%E6%97%85%E6%B8%B8%E7%BD%91; UM_distinctid=16a7ce50111600-02532d105778c5-3e385e0c-100200-16a7ce501128d4; CNZZDATA5726564=cnzz_eid%3D1174976800-1556870598-http%253A%252F%252Fwww.tuniu.com%252F%26ntime%3D1556870598; fp_ver=4.7.1; BSFIT_EXPIRATION=1556914639593; BSFIT_OkLJUJ=FHFFCnknL-o5bRSI5BLbZ0nRVV9vEfUl; BSFIT_DEVICEID=GvPdjwlHiUDJCyZ76z9hSWlE64277FYP2lDKtLRNYlCovVCyEOZqS7v1K8q6b-KmpNoIJd2wAZdD6ycR7SgbxBzBsm3GoWvIQ7i5rIVqGONXQMmPDwqVVug5MO8Rk1_pPUHGD3C4HC00bswmmTQOP4-gNig7RuwR; __utmb=1.2.10.1556873529; __xsptplus352=352.1.1556873534.1556873730.2%231%7Cbaidu%7Cbrand%7Cbrand%7C%25E9%2580%2594%25E7%2589%259B%25E6%2597%2585%25E6%25B8%25B8%25E7%25BD%2591%7C%23%23Ut_C1T0sii6u6C7-BkuuijGkH1Ey-YML%23; MOBILE_APP_SETTING_STATE-126=CLOSE; Hm_lpvt_51d49a7cda10d5dd86537755f081cc02=1556873962; __utmb=1.3.10.1556873529; _pzfxuvpc=1556873528660%7C7001987382100307498%7C5%7C1556873963689%7C1%7C%7C1475546543120999364; _pzfxsvpc=1475546543120999364%7C1556873528660%7C5%7Chttps%3A%2F%2Fsp0.baidu.com%2F9q9JcDHa2gU2pMbgoY3K%2Fadrc.php%3Ft%3D06KL00c00fDuykY0CgGB00VIAs0k0TkT0000024kcdC00000TlEXRt.THLPE_yWs_5H1_L30A3qrHn1rj0YnWKxpA7EgLKM0ZnquHFhuHuWnWfsnj0kPjTYrfKd5H0dPH6vrDDLwjn4P1NAnHmLrHc3PY77njF7fHfLPYnY0ADqI1YhUyPGujY1nWnvP1nvnHf3FMKzUvwGujYkP6K-5y9YIZ0lQzqLILT8IZN8pgR8mvqVQ1qs5HDYnj0hmvdspyfqUyVYg10vnj0zPj0kFMNYUNq1ULNzmvRqmh7GuZRhIgwVgvd-uA-dUHdBTh78uaudIAdxmv7VTA7Guv3qmMF9Uhf0mLFW5HTkrHc%26tpl%3Dtpl_11534_19713_15764%26l%3D1512272302%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E5%252587%252586%2525E5%2525A4%2525B4%2525E9%252583%2525A8-%2525E6%2525A0%252587%2525E9%2525A2%252598-%2525E4%2525B8%2525BB%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253D%2525E9%252580%252594%2525E7%252589%25259B%2525E6%252597%252585%2525E6%2525B8%2525B8%2525E7%2525BD%252591%2525E5%2525AE%252598%2525E7%2525BD%252591-%2525E8%2525AE%2525A9%2525E6%252597%252585%2525E6%2525B8%2525B8%2525E6%25259B%2525B4%2525E7%2525AE%252580%2525E5%25258D%252595%252520%2525E8%2525A6%252581%2525E6%252597%252585%2525E6%2525B8%2525B8%252520%2525E6%252589%2525BE%2525E9%252580%252594%2525E7%252589%25259B%2525EF%2525BC%252581%2526xp%253Did(%252522m3236736148_canvas%252522)%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D166%26wd%3D%25E9%2580%2594%25E7%2589%259B%25E6%2597%2585%25E6%25B8%25B8%25E7%25BD%2591%26issp%3D1%26f%3D3%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3D93380420_hao_pg%26oq%3D%2525E9%2525A9%2525AC%2525E8%25259C%252582%2525E7%2525AA%25259D%26inputT%3D73510%26prefixsug%3Dtuniu%26rsp%3D3; hotel_order_begin_date=2019-05-04; hotel_order_end_date=2019-05-05; rg_entrance=010000%2F003001%2F000013%2F000000',
    'Host': 'hotel.tuniu.com',
    'Referer': 'http://www.tuniu.com/g200/hotel-bj-0/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
}
response = requests.get(url=url, headers=headers)
resp = BeautifulSoup(response.content, 'lxml')
# The ninth <script> element (index 8) is the one whose JavaScript is parsed;
# the index is hard-coded, so it depends on the page keeping its layout.
pp = resp.select('script')[8].string
# Convert the JavaScript source to an XML tree so it can be queried via XPath.
content = js2xml.parse(pp, encoding='utf8', debug=False)
parse_tree = js2xml.pretty_print(content)
print(parse_tree)
selector = etree.HTML(parse_tree)
# Each <object> inside the array stored under the 'list' property is one entry.
html = selector.xpath("//property[@name='list']/array/object")
for i in html:
    name = i.xpath("property[@name ='name']/string/text()")[0]
    print(name)
    # The loop rebinds `url`, overwriting the value that was requested above;
    # the extracted value is appended to the site origin.
    url = i.xpath("property[@name = 'url']/string/text()")[0]
    url = 'https://hotel.tuniu.com' + url
    print(url)
    level = i.xpath(
        "property[@name='levelInfo']/object/property[@name='name']/string/text()"
    )[0]
    print(level)
    # The address is extracted but not used within the visible fragment.
    address = i.xpath("property[@name = 'address']/string/text()")[0]
def test_schema(): jscode_snippets = [ # strings ( r""" "test"; """, """ <program> <string>test</string> </program> """ ), ( r""" "test\ multiline"; """, """ <program> <string>test multiline</string> </program> """ ), # numbers ( "3.14;", """ <program> <number value="3.14"/> </program> """ ), ( "-12;", """ <program> <number value="-12"/> </program> """ ), ( "3.45e2;", """ <program> <number value="3.45e2"/> </program> """ ), ( "0377;", """ <program> <number value="0377"/> </program> """ ), ( "0xFF;", """ <program> <number value="0xFF"/> </program> """ ), # arrays ( "[]", """ <program> <array/> </program> """ ), ( "[1,2]", """ <program> <array> <number value="1"/> <number value="2"/> </array> </program> """ ), ( "[1,,2]", """ <program> <array> <number value="1"/> <undefined/> <number value="2"/> </array> </program> """ ), ( "[1,,2,,,3,]", """ <program> <array> <number value="1"/> <undefined/> <number value="2"/> <undefined/> <undefined/> <number value="3"/> </array> </program> """ ), ( "['a', 'b','c']", """ <program> <array> <string>a</string> <string>b</string> <string>c</string> </array> </program> """ ), ( "[a, 'b', c]", """ <program> <array> <identifier name="a"/> <string>b</string> <identifier name="c"/> </array> </program> """ ), # objects ( "o = {};", """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object/> </right> </assign> </program> """ ), ( "o = {a: 1};", """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="a"> <number value="1"/> </property> </object> </right> </assign> </program> """ ), ( "o = {a: 1, b: 2};", """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="a"> <number value="1"/> </property> <property name="b"> <number value="2"/> </property> </object> </right> </assign> </program> """ ), ( "o = {'c': 1, 'd': 2};", """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> 
<property name="c"> <number value="1"/> </property> <property name="d"> <number value="2"/> </property> </object> </right> </assign> </program> """ ), ( 'o = {"c": 1, "d": 2};', """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="c"> <number value="1"/> </property> <property name="d"> <number value="2"/> </property> </object> </right> </assign> </program> """ ), ( 'o = {"c": 1, d: "e"};', """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="c"> <number value="1"/> </property> <property name="d"> <string>e</string> </property> </object> </right> </assign> </program> """ ), ( "e = {foo: 5, bar: 6, baz: ['Baz', 'Content']};", """ <program> <assign operator="="> <left> <identifier name="e"/> </left> <right> <object> <property name="foo"> <number value="5"/> </property> <property name="bar"> <number value="6"/> </property> <property name="baz"> <array> <string>Baz</string> <string>Content</string> </array> </property> </object> </right> </assign> </program> """ ), # other primitive data types ( "null;", """ <program> <null/> </program> """ ), ( "undefined;", """ <program> <undefined/> </program> """ ), ( "true;", """ <program> <boolean>true</boolean> </program> """ ), ( "false;", """ <program> <boolean>false</boolean> </program> """ ), # variables ( r""" var i; """, """ <program> <var name="i"/> </program> """ ), ( r""" var i,j,k; """, """ <program> <var name="i"/> <var name="j"/> <var name="k"/> </program> """ ), ( r""" var i = 0; """, """ <program> <var name="i"> <number value="0"/> </var> </program> """ ), ( r""" var i = "test"; """, """ <program> <var name="i"> <string>test</string> </var> </program> """ ), ( r"""var z = 'foxes', r = 'birds';""", """ <program> <var name="z"> <string>foxes</string> </var> <var name="r"> <string>birds</string> </var> </program> """ ), ( r""" var i, j, k = 0; """, """ <program> <var name="i"/> <var name="j"/> <var name="k"> 
<number value="0"/> </var> </program> """ ), ( r""" var i=1, j, k = 2; """, """ <program> <var name="i"> <number value="1"/> </var> <var name="j"/> <var name="k"> <number value="2"/> </var> </program> """ ), ( r""" var i = obj.prop; """, """ <program> <var name="i"> <dotaccessor> <object> <identifier name="obj"/> </object> <property> <identifier name="prop"/> </property> </dotaccessor> </var> </program> """ ), ( r"""var testObj = {};""", """ <program> <var name="testObj"> <object/> </var> </program> """ ), ( r"""var testObj = [];""", """ <program> <var name="testObj"> <array/> </var> </program> """ ), # operations ( r""" 1 + 2; "foo" + false; 3 - 5 """, """ <program> <binaryoperation operation="+"> <left> <number value="1"/> </left> <right> <number value="2"/> </right> </binaryoperation> <binaryoperation operation="+"> <left> <string>foo</string> </left> <right> <boolean>false</boolean> </right> </binaryoperation> <binaryoperation operation="-"> <left> <number value="3"/> </left> <right> <number value="5"/> </right> </binaryoperation> </program> """ ), ( r""" 1.0 / 2.0; -2 * 2; 12 % 5; """, """ <program> <binaryoperation operation="/"> <left> <number value="1.0"/> </left> <right> <number value="2.0"/> </right> </binaryoperation> <binaryoperation operation="*"> <left> <number value="-2"/> </left> <right> <number value="2"/> </right> </binaryoperation> <binaryoperation operation="%"> <left> <number value="12"/> </left> <right> <number value="5"/> </right> </binaryoperation> </program> """ ), ( r""" // Postfix var x = 3; y = x++; // y = 3, x = 4 // Prefix var a = 2; b = ++a; // a = 3, b = 3 """, """ <program> <var name="x"> <number value="3"/> </var> <assign operator="="> <left> <identifier name="y"/> </left> <right> <postfix operation="++"> <identifier name="x"/> </postfix> </right> </assign> <var name="a"> <number value="2"/> </var> <assign operator="="> <left> <identifier name="b"/> </left> <right> <unaryoperation operation="++"> <identifier name="a"/> 
</unaryoperation> </right> </assign> </program> """ ), ( r""" // Postfix var x = 3; y = x--; // y = 3, x = 2 // Prefix var a = 2; b = --a; // a = 1, b = 1 """, """ <program> <var name="x"> <number value="3"/> </var> <assign operator="="> <left> <identifier name="y"/> </left> <right> <postfix operation="--"> <identifier name="x"/> </postfix> </right> </assign> <var name="a"> <number value="2"/> </var> <assign operator="="> <left> <identifier name="b"/> </left> <right> <unaryoperation operation="--"> <identifier name="a"/> </unaryoperation> </right> </assign> </program> """ ), ( r""" var x = 3; y = -x; // y = -3, x = 3 """, """ <program> <var name="x"> <number value="3"/> </var> <assign operator="="> <left> <identifier name="y"/> </left> <right> <unaryoperation operation="-"> <identifier name="x"/> </unaryoperation> </right> </assign> </program> """ ), ( r""" +3; // 3 +"3"; // 3 +true; // 1 +false; // 0 +null; // 0 """, """ <program> <number value="+3"/> <unaryoperation operation="+"> <string>3</string> </unaryoperation> <unaryoperation operation="+"> <boolean>true</boolean> </unaryoperation> <unaryoperation operation="+"> <boolean>false</boolean> </unaryoperation> <unaryoperation operation="+"> <null/> </unaryoperation> </program> """ ), # assignements ( r""" i = b; """, """ <program> <assign operator="="> <left> <identifier name="i"/> </left> <right> <identifier name="b"/> </right> </assign> </program> """ ), ( r""" i.a = "b"; """, """ <program> <assign operator="="> <left> <dotaccessor> <object> <identifier name="i"/> </object> <property> <identifier name="a"/> </property> </dotaccessor> </left> <right> <string>b</string> </right> </assign> </program> """ ), ( r""" i["a"] = "b"; """, """ <program> <assign operator="="> <left> <bracketaccessor> <object> <identifier name="i"/> </object> <property> <string>a</string> </property> </bracketaccessor> </left> <right> <string>b</string> </right> </assign> </program> """ ), ( r""" i[a] = "b"; """, """ <program> <assign 
operator="="> <left> <bracketaccessor> <object> <identifier name="i"/> </object> <property> <identifier name="a"/> </property> </bracketaccessor> </left> <right> <string>b</string> </right> </assign> </program> """ ), # control structures ( r""" if (condition) { result = expression; }""", """ <program> <if> <predicate> <identifier name="condition"/> </predicate> <then> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="expression"/> </right> </assign> </block> </then> </if> </program> """ ), ( r""" if (condition) { result = expression; } else { result = alternative; }""", """ <program> <if> <predicate> <identifier name="condition"/> </predicate> <then> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="expression"/> </right> </assign> </block> </then> <else> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="alternative"/> </right> </assign> </block> </else> </if> </program> """ ), ( r""" if (exprA == exprB) { result = expression; } else if (expr2) { result = alternative1; } else { result = alternative2; }""", """ <program> <if> <predicate> <binaryoperation operation="=="> <left> <identifier name="exprA"/> </left> <right> <identifier name="exprB"/> </right> </binaryoperation> </predicate> <then> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="expression"/> </right> </assign> </block> </then> <else> <if> <predicate> <identifier name="expr2"/> </predicate> <then> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="alternative1"/> </right> </assign> </block> </then> <else> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="alternative2"/> </right> </assign> </block> </else> </if> </else> </if> </program> """ ), ( "result = condition ? 
expression : alternative;", """ <program> <assign operator="="> <left> <identifier name="result"/> </left> <right> <conditional> <condition> <identifier name="condition"/> </condition> <value1> <identifier name="expression"/> </value1> <value2> <identifier name="alternative"/> </value2> </conditional> </right> </assign> </program> """ ), # switch ( r""" switch (expr) { case SOMEVALUE: //statements; break; case ANOTHERVALUE: //statements; break; default: //statements; break; } """, """ <program> <switch> <expression> <identifier name="expr"/> </expression> <case> <expression> <identifier name="SOMEVALUE"/> </expression> <break/> </case> <case> <expression> <identifier name="ANOTHERVALUE"/> </expression> <break/> </case> <default> <break/> </default> </switch> </program> """ ), # for loop ( r""" for (var i = 0; i < 5; i++) { a = i; } """, """ <program> <for> <init> <var name="i"> <number value="0"/> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <number value="5"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="i"/> </postfix> </post> <statement> <block> <assign operator="="> <left> <identifier name="a"/> </left> <right> <identifier name="i"/> </right> </assign> </block> </statement> </for> </program> """ ), ( r""" for (var i = 0; i < 5; i++) { a = i } """, """ <program> <for> <init> <var name="i"> <number value="0"/> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <number value="5"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="i"/> </postfix> </post> <statement> <block> <assign operator="="> <left> <identifier name="a"/> </left> <right> <identifier name="i"/> </right> </assign> </block> </statement> </for> </program> """ ), ( r""" for (var key in array) { continue; } """, """ <program> <forin> <variable> <var name="key"/> </variable> <object> <identifier 
name="array"/> </object> <statement> <block> <continue/> </block> </statement> </forin> </program> """ ), ( r""" for (;;) { break; } """, """ <program> <for> <statement> <block> <break/> </block> </statement> </for> </program> """ ), ( r""" for (; i < len; i++) { j = i; } """, """ <program> <for> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <identifier name="len"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="i"/> </postfix> </post> <statement> <block> <assign operator="="> <left> <identifier name="j"/> </left> <right> <identifier name="i"/> </right> </assign> </block> </statement> </for> </program> """ ), ( r""" for (var i = 0, len = cars.length, text = ""; i < len; i++) { text += cars[i] + "<br>"; } """, """ <program> <for> <init> <var name="i"> <number value="0"/> </var> <var name="len"> <dotaccessor> <object> <identifier name="cars"/> </object> <property> <identifier name="length"/> </property> </dotaccessor> </var> <var name="text"> <string></string> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <identifier name="len"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="i"/> </postfix> </post> <statement> <block> <assign operator="+="> <left> <identifier name="text"/> </left> <right> <binaryoperation operation="+"> <left> <bracketaccessor> <object> <identifier name="cars"/> </object> <property> <identifier name="i"/> </property> </bracketaccessor> </left> <right> <string><br></string> </right> </binaryoperation> </right> </assign> </block> </statement> </for> </program> """ ), ( """ for (; i < len; ) { text += cars[i] + "<br>"; i++; } """, """ <program> <for> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <identifier name="len"/> </right> </binaryoperation> </condition> <statement> <block> <assign operator="+="> <left> <identifier 
name="text"/> </left> <right> <binaryoperation operation="+"> <left> <bracketaccessor> <object> <identifier name="cars"/> </object> <property> <identifier name="i"/> </property> </bracketaccessor> </left> <right> <string><br></string> </right> </binaryoperation> </right> </assign> <postfix operation="++"> <identifier name="i"/> </postfix> </block> </statement> </for> </program> """ ), # while loop ( """ while (a<b) { a+=1; } """, """ <program> <while> <predicate> <binaryoperation operation="<"> <left> <identifier name="a"/> </left> <right> <identifier name="b"/> </right> </binaryoperation> </predicate> <statement> <block> <assign operator="+="> <left> <identifier name="a"/> </left> <right> <number value="1"/> </right> </assign> </block> </statement> </while> </program> """ ), ( """ do { a+=1; } while (a<b); """, """ <program> <statement> <block> <assign operator="+="> <left> <identifier name="a"/> </left> <right> <number value="1"/> </right> </assign> </block> </statement> <while> <binaryoperation operation="<"> <left> <identifier name="a"/> </left> <right> <identifier name="b"/> </right> </binaryoperation> </while> </program> """ ), # with ( """ with (document) { var a = getElementById('a'); var b = getElementById('b'); var c = getElementById('c'); var c = document.get('c'); }; """, """ <program> <with> <identifier name="document"/> <statement> <block> <var name="a"> <functioncall> <function> <identifier name="getElementById"/> </function> <arguments> <string>a</string> </arguments> </functioncall> </var> <var name="b"> <functioncall> <function> <identifier name="getElementById"/> </function> <arguments> <string>b</string> </arguments> </functioncall> </var> <var name="c"> <functioncall> <function> <identifier name="getElementById"/> </function> <arguments> <string>c</string> </arguments> </functioncall> </var> <var name="c"> <functioncall> <function> <dotaccessor> <object> <identifier name="document"/> </object> <property> <identifier name="get"/> </property> 
</dotaccessor> </function> <arguments> <string>c</string> </arguments> </functioncall> </var> </block> </statement> </with> <empty>;</empty> </program> """ ), # label ( r""" loop1: for (var a = 0; a < 10; a++) { if (a == 4) { break loop1; // Stops after the 4th attempt } alert('a = ' + a); loop2: for (var b = 0; b < 10; ++b) { if (b == 3) { continue loop2; // Number 3 is skipped } if (b == 6) { continue loop1; // Continues the first loop, 'finished' is not shown } alert('b = ' + b); } alert('finished') } block1: { alert('hello'); // Displays 'hello' break block1; alert('world'); // Will never get here } """, """ <program> <label name="loop1"> <statement> <for> <init> <var name="a"> <number value="0"/> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="a"/> </left> <right> <number value="10"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="a"/> </postfix> </post> <statement> <block> <if> <predicate> <binaryoperation operation="=="> <left> <identifier name="a"/> </left> <right> <number value="4"/> </right> </binaryoperation> </predicate> <then> <block> <break> <identifier name="loop1"/> </break> </block> </then> </if> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <binaryoperation operation="+"> <left> <string>a = </string> </left> <right> <identifier name="a"/> </right> </binaryoperation> </arguments> </functioncall> <label name="loop2"> <statement> <for> <init> <var name="b"> <number value="0"/> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="b"/> </left> <right> <number value="10"/> </right> </binaryoperation> </condition> <post> <unaryoperation operation="++"> <identifier name="b"/> </unaryoperation> </post> <statement> <block> <if> <predicate> <binaryoperation operation="=="> <left> <identifier name="b"/> </left> <right> <number value="3"/> </right> </binaryoperation> </predicate> <then> <block> <continue> <identifier 
name="loop2"/> </continue> </block> </then> </if> <if> <predicate> <binaryoperation operation="=="> <left> <identifier name="b"/> </left> <right> <number value="6"/> </right> </binaryoperation> </predicate> <then> <block> <continue> <identifier name="loop1"/> </continue> </block> </then> </if> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <binaryoperation operation="+"> <left> <string>b = </string> </left> <right> <identifier name="b"/> </right> </binaryoperation> </arguments> </functioncall> </block> </statement> </for> </statement> </label> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>finished</string> </arguments> </functioncall> </block> </statement> </for> </statement> </label> <label name="block1"> <statement> <block> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>hello</string> </arguments> </functioncall> <break> <identifier name="block1"/> </break> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>world</string> </arguments> </functioncall> </block> </statement> </label> </program> """ ), # functions ( """ function foo(p) { p = "bar"; } """, """ <program> <funcdecl name="foo"> <parameters> <identifier name="p"/> </parameters> <body> <assign operator="="> <left> <identifier name="p"/> </left> <right> <string>bar</string> </right> </assign> </body> </funcdecl> </program> """ ), ( """ function hello() { alert('world'); } """, """ <program> <funcdecl name="hello"> <parameters/> <body> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>world</string> </arguments> </functioncall> </body> </funcdecl> </program> """ ), ( """ var anon = function() { alert('I am anonymous'); }; """, """ <program> <var name="anon"> <funcexpr> <identifier/> <parameters/> <body> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>I am anonymous</string> </arguments> 
</functioncall> </body> </funcexpr> </var> </program> """ ), ( """ anon(); """, """ <program> <functioncall> <function> <identifier name="anon"/> </function> <arguments/> </functioncall> </program> """ ), ( """ setTimeout(function() { alert('hello'); }, 1000) """, """ <program> <functioncall> <function> <identifier name="setTimeout"/> </function> <arguments> <funcexpr> <identifier/> <parameters/> <body> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>hello</string> </arguments> </functioncall> </body> </funcexpr> <number value="1000"/> </arguments> </functioncall> </program> """ ), ( """ (function() { alert('foo'); }()); """, """ <program> <functioncall> <function> <funcexpr> <identifier/> <parameters/> <body> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>foo</string> </arguments> </functioncall> </body> </funcexpr> </function> <arguments/> </functioncall> </program> """ ), # get/set ( """ var obj = { get latest () { return "latest"; } } """, """ <program> <var name="obj"> <object> <get> <property> <identifier name="latest"/> </property> <body> <return> <string>latest</string> </return> </body> </get> </object> </var> </program> """ ), ( """ delete obj.latest; """, """ <program> <unaryoperation operation="delete"> <dotaccessor> <object> <identifier name="obj"/> </object> <property> <identifier name="latest"/> </property> </dotaccessor> </unaryoperation> </program> """ ), ( """ var o = { set current (str) { return this.log[this.log.length] = str; }, log: [] } """, """ <program> <var name="o"> <object> <set> <body> <return> <assign operator="="> <left> <bracketaccessor> <object> <dotaccessor> <object> <identifier>this</identifier> </object> <property> <identifier name="log"/> </property> </dotaccessor> </object> <property> <dotaccessor> <object> <dotaccessor> <object> <identifier>this</identifier> </object> <property> <identifier name="log"/> </property> </dotaccessor> </object> <property> 
<identifier name="length"/> </property> </dotaccessor> </property> </bracketaccessor> </left> <right> <identifier name="str"/> </right> </assign> </return> </body> </set> <property name="log"> <array/> </property> </object> </var> </program> """ ), ] for snippet, expected in jscode_snippets: print "---------------------------------------------------------" print snippet js = js2xml.parse(snippet) output = js2xml.pretty_print(js).strip() assert_equal(output, expected.strip(), "got\n%s\nexpected:\n%s" % (output, expected))