# Module-level imports required by the snippets below. Configure, Storage,
# PageCrawl, Iterator and PageKVCrawl come from the project's own modules.
import datetime
import os
import time
import urllib.request
import uuid

import numpy as np
from scrapy.selector import Selector


def collect(self, p_href, p_page, p_sceneno, p_pageno, p_delay=None):
    print("get page: " + p_href)
    # driver.get() returns None, so the rendered HTML must come from page_source
    self._driver.get(p_href)
    if p_delay:
        print("after get page, sleep for a while: " + str(p_delay))
        time.sleep(p_delay)
    selector = Selector(text=self._driver.page_source)
    actors = p_page["actors"]
    for actidx, actor in enumerate(actors):
        acttype = actor["type"]
        properties = actor["properties"]
        recorder = None
        if acttype == 2:  # Recording
            recorder = PageCrawl(p_scenario=self, p_parameters=properties,
                                 p_sceneno=p_sceneno, p_pageno=p_pageno,
                                 p_location=p_href)
        else:
            raise Exception("Unsupported actor type")
        data = recorder.do(p_selector=selector)
        if data and "data" in data:
            out_dir = Configure.get_ouput_dir() + "/" + self.getId()
            filename = self.getTypename() + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + ".json"
            pageComponentsData = data["data"]
            for component in pageComponentsData:  # do not shadow the outer "data"
                print(component)
                Storage.write_map_result(p_dir=out_dir, p_file_name=filename,
                                         p_contents=component)
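
# Hedged usage sketch for collect(): the shape of p_page is inferred from the
# reads above ("actors", "type", "properties"); the payload inside "properties"
# and the "scenario" instance are hypothetical placeholders, not confirmed here.
example_page = {
    "actors": [
        {
            "type": 2,  # 2 = Recording, handled by PageCrawl
            "properties": {"pageComponent": {}},  # hypothetical payload
        }
    ]
}
# scenario.collect(p_href="http://example.com", p_page=example_page,
#                  p_sceneno=0, p_pageno=0, p_delay=2)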
def capture_whole(self, p_file=None, p_url=None):
    # p_url is currently unused: the shared Chrome driver is expected to have
    # the target page loaded already.
    os.chdir(Configure.get_ouput_dir())
    file_path = Configure.get_ouput_dir() + "/snapshot"
    if not os.path.exists(file_path):
        os.mkdir(file_path)
    if p_file is None:
        p_file = "snapshot_" + str(uuid.uuid1()) + ".png"
    print(file_path + "/" + p_file)
    print(Configure.get_chrome_webdriver().get_window_size())
    # Get the web page's actual height
    clientHeight = Configure.get_chrome_webdriver().execute_script("return document.body.clientHeight;")
    print(clientHeight)
    # Adjust the window's height to fit the page's height so one screenshot covers it all
    cursize = Configure.get_chrome_webdriver().get_window_size()
    Configure.get_chrome_webdriver().set_window_size(cursize["width"], clientHeight)
    stored = Configure.get_chrome_webdriver().get_screenshot_as_file(file_path + "/" + p_file)
    return stored
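
# Hedged usage sketch: capture_whole() screenshots whatever page the shared
# Chrome driver currently shows, so navigate first. "scenario" is a
# hypothetical instance of the class owning this method.
Configure.get_chrome_webdriver().get("http://example.com")  # load the target page
stored_ok = scenario.capture_whole()  # auto-names the file snapshot_<uuid>.png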
def download_file(self):
    elements = self.getComponent()
    file_path = Configure.get_ouput_dir() + "/download"
    if not os.path.exists(file_path):
        os.mkdir(file_path)
    print(elements)
    for elmt in elements:
        if self._url_property:
            url = elmt.xpath("@" + self._url_property).extract_first()
        else:
            url = elmt.xpath("text()").extract_first()
        file_name = url.split("/")[-1]
        print("Download file to path: " + file_path + "/" + file_name)
        # urllib.urlretrieve is Python 2; Python 3 moved it to urllib.request
        urllib.request.urlretrieve(url, file_path + "/" + file_name)
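
# A minimal, self-contained demo of the selector logic download_file() relies
# on: with _url_property = "href" the URL comes from that attribute, otherwise
# from the element's text. The HTML snippet is an illustrative assumption.
demo_sel = Selector(text='<a href="http://example.com/f.zip">http://example.com/f.zip</a>')
print(demo_sel.xpath("//a/@href").extract_first())   # attribute form -> http://example.com/f.zip
print(demo_sel.xpath("//a/text()").extract_first())  # text form      -> http://example.com/f.zip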
def write_array_result(p_dir=None, p_file_name=None, p_contents=None,
                       p_prefix=None, p_suffix=None, p_seperator=None,
                       p_linenum=False):
    if p_dir:
        file_path = p_dir
    else:
        file_path = Configure.get_ouput_dir()
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    if p_file_name is None:
        p_file_name = "crawl_" + str(uuid.uuid1()) + ".data"
    print(file_path + "/" + p_file_name)
    with open(file_path + "/" + p_file_name, 'w', encoding="utf-8") as ofile:
        if p_contents:
            for idx, content in enumerate(p_contents):
                if p_linenum:
                    ofile.write(str(idx) + " ")
                if p_prefix:
                    ofile.write(p_prefix)
                if isinstance(content, list):
                    for i, item in enumerate(content):
                        # separate items with p_seperator (falling back to a
                        # space), but never before the first item
                        if i > 0:
                            ofile.write(p_seperator if p_seperator else " ")
                        ofile.write(item)
                else:
                    ofile.write(content)
                if p_suffix:
                    ofile.write(p_suffix)
                ofile.write("\n")
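
# Hedged usage sketch, mirroring how collect() calls this writer: with
# p_prefix/p_suffix/p_seperator set as below, each row of key-value fragments
# becomes one JSON-like line in the output file. File name is illustrative.
Storage.write_array_result(
    p_file_name="demo.data",
    p_contents=[['"open":1.0', '"close":2.0'], ['"open":3.0', '"close":4.0']],
    p_prefix="{", p_suffix="}", p_seperator=",")
# demo.data:
# {"open":1.0,"close":2.0}
# {"open":3.0,"close":4.0}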
def write_map_result(p_dir=None, p_file_name=None, p_contents=None):
    if p_dir:
        file_path = p_dir
    else:
        file_path = Configure.get_ouput_dir()
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    if p_file_name is None:
        p_file_name = "crawl_" + str(uuid.uuid1()) + ".data"
    print(file_path + "/" + p_file_name)
    with open(file_path + "/" + p_file_name, 'w', encoding="utf-8") as ofile:
        if p_contents:
            for lines in p_contents:
                ofile.write('{ "items": {')
                for lidx, item in enumerate(lines):
                    if lidx > 0:
                        ofile.write(', ')
                    item_id = item["item_id"]  # avoid shadowing the id() builtin
                    ofile.write('"' + item_id + '": {')
                    # NOTE: values are assumed to be plain strings; json.dumps
                    # would be needed to handle quoting and escaping safely.
                    val = None
                    for key in item.keys():
                        if key == "item_id":
                            continue
                        if val is None:
                            val = '"' + key + '": "' + item[key] + '"'
                        else:
                            val += ', "' + key + '": "' + item[key] + '"'
                    ofile.write(val or "")  # guard: item may carry only item_id
                    ofile.write('}')
                ofile.write(" } }\n")
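
# Hedged usage sketch: each element of p_contents is one output line, itself a
# list of dicts keyed by "item_id" (shape inferred from the writer above);
# the field names are illustrative assumptions.
Storage.write_map_result(
    p_file_name="items.data",
    p_contents=[[{"item_id": "s1", "title": "hello", "price": "9.9"}]])
# items.data:
# { "items": {"s1": {"title": "hello", "price": "9.9"} } }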
def collect(self, p_href, p_page, p_sceneno, p_pageno, p_delay=None):
    print("get page: " + p_href)
    self._driver.get(p_href)
    if p_delay:
        print("after get page, sleep for a while: " + str(p_delay))
        time.sleep(p_delay)
    selector = Selector(text=self._driver.page_source)
    # Debug artifacts: dump the rendered source and a screenshot
    ofile = open("/usr/local/var/source1.htm", 'w', encoding="utf-8")
    ofile.write(self._driver.page_source)
    ofile.close()
    self._driver.save_screenshot("/usr/local/var/capture1.png")
    actors = p_page["actors"]
    for actidx, actor in enumerate(actors):
        acttype = actor["type"]
        properties = actor["properties"]
        if "indexname" in properties:
            indexname = properties["indexname"]
        else:
            indexname = "indice"
        recorder = None
        if acttype == 5:  # element
            recorder = Iterator(p_scenario=self, p_parameters=properties,
                                p_sceneno=p_sceneno, p_pageno=p_pageno)
        elif acttype == 10:  # Recording key/value
            recorder = PageKVCrawl(p_scenario=self, p_parameters=properties,
                                   p_sceneno=p_sceneno, p_pageno=p_pageno)
        else:
            raise Exception("Unsupported actor type")
        data = recorder.do(p_selector=selector, p_pageid=self.getId() + "_page0")
        prevalary = None
        iniprevalary = False
        keydata = data["keydata"] if "keydata" in data else None
        keylabel = data["keylabel"]
        if not keydata:
            keyvalue = data["keyvalue"] if "keyvalue" in data else "unknown key value"
        else:
            # Build the key column first (the original appended to an
            # undefined "valary" here, raising NameError)
            keyary = []
            for ikey in keydata:
                ikeyval = ikey["value"]
                kvpair = '"' + keylabel + '":"' + ikeyval + '"'
                keyary.append(kvpair)
            npvalary = np.array(keyary).reshape((len(keyary), 1))
            if iniprevalary:
                prevalary = np.hstack((prevalary, npvalary))
            else:
                prevalary = npvalary
                iniprevalary = True
        values = data["values"]
        for valkey in values.keys():
            valcollect = values[valkey]
            valary = []
            for item in valcollect:
                itemval = item["value"]
                itemtype = properties["pageComponent"]["kvv-mapping"]["values"][valkey]["type"]
                if valkey == "timepoint":
                    # normalize timestamps to ISO-8601 / UTC suffix
                    timefmt = properties["timepoint"]["format"]
                    if timefmt == "yyyy-mm-dd HH:mi:ss":
                        itemval = itemval[0:10] + "T" + itemval[11:] + ".000Z"
                    elif timefmt == "HH:mi:ss":
                        itemval = datetime.datetime.now().strftime('%Y-%m-%dT') + itemval + ".000Z"
                if itemtype == "number" or itemtype == "boolean":
                    kvpair = '"' + valkey + '":' + itemval
                else:
                    kvpair = '"' + valkey + '":"' + itemval + '"'
                valary.append(kvpair)
            npvalary = np.array(valary).reshape((len(valary), 1))
            if iniprevalary:
                prevalary = np.hstack((prevalary, npvalary))
            else:
                prevalary = npvalary
                iniprevalary = True
        if not keydata:
            # No per-row keys: repeat the single key value down a new column
            rows, cols = prevalary.shape
            print("rows, cols: " + str(rows) + ", " + str(cols))
            keysary = []
            for ii in range(rows):
                keysary.append('"' + keylabel + '":"' + keyvalue + '"')
            npkeysary = np.array(keysary).reshape((len(keysary), 1))
            prevalary = np.hstack((prevalary, npkeysary))
        out_dir = Configure.get_ouput_dir() + "/" + self.getId()
        filename = self.getTypename() + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + ".json"
        Storage.write_array_result(p_dir=out_dir, p_file_name=filename,
                                   p_contents=prevalary.tolist(),
                                   p_prefix="{", p_suffix="}", p_seperator=",")
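
# Hedged configuration sketch for the key/value variant of collect(): the
# nesting below is inferred from the reads above
# (properties["pageComponent"]["kvv-mapping"]["values"][valkey]["type"] and
# properties["timepoint"]["format"]); the field names and the "scenario"
# instance are hypothetical.
example_properties = {
    "indexname": "indice",
    "timepoint": {"format": "HH:mi:ss"},
    "pageComponent": {
        "kvv-mapping": {
            "values": {
                "timepoint": {"type": "string"},
                "price": {"type": "number"},
            }
        }
    },
}
example_kv_page = {"actors": [{"type": 10, "properties": example_properties}]}
# scenario.collect(p_href="http://example.com/quotes", p_page=example_kv_page,
#                  p_sceneno=0, p_pageno=0, p_delay=3)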