示例#1
0
文件: subscriber.py 项目: xlybaby/VAR
 def collect(self, p_href, p_page, p_sceneno, p_pageno, p_delay=None):
   """Load one page and run every actor configured for it.

   The page at ``p_href`` is requested through ``self._driver``; the value
   returned by that call is used as the page text for the ``Selector``.
   Each entry of ``p_page["actors"]`` must have type 2 (recording); it is
   executed with ``PageCrawl`` and every component of its ``"data"``
   result is passed to ``Storage.write_map_result``.

   Args:
     p_href: address of the page to load.
     p_page: page description; only its ``"actors"`` list is read here.
     p_sceneno: scene number, forwarded to ``PageCrawl``.
     p_pageno: page number, forwarded to ``PageCrawl``.
     p_delay: optional value passed to ``time.sleep`` after the request.

   Raises:
     Exception: if an actor has a type other than 2.
   """
   print ("get page: "+p_href)
   page_source = self._driver.get(p_href)
   if p_delay :
     print ("after get page ,we need sleep for a while: " + str(p_delay))
     time.sleep(p_delay)

   page_selector = Selector(text=page_source)
   for actor in p_page["actors"]:
     actor_type = actor["type"]
     actor_props = actor["properties"]
     # Guard clause: recording (type 2) is the only actor handled here.
     if actor_type != 2:
       raise Exception("Unsupported actor type")

     crawler = PageCrawl(p_scenario=self, p_parameters=actor_props, p_sceneno=p_sceneno, p_pageno=p_pageno, p_location=p_href)
     result = crawler.do(p_selector=page_selector)
     if not result or "data" not in result:
       continue

     out_dir = Configure.get_ouput_dir() + "/" + self.getId()
     out_name = self.getTypename() + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + ".json"
     # NOTE(review): the file name is computed once per actor, so every
     # component below is written under the same name - confirm that the
     # storage layer is expected to overwrite rather than append.
     for component in result["data"]:
       print (component)
       Storage.write_map_result( p_dir = out_dir, p_file_name = out_name, p_contents = component )
示例#2
0
  def capture_whole(self, p_file=None, p_url=None):
    """Save a screenshot of the page currently loaded in the Chrome webdriver.

    The window height is first set to ``document.body.clientHeight`` so the
    whole page fits into a single screenshot. The image is written into
    ``<output dir>/snapshot``.

    Args:
      p_file: file name of the screenshot; when ``None`` a name of the form
        ``snapshot_<uuid1>.png`` is generated.
      p_url: not used by this method (the page is not navigated here).

    Returns:
      Whatever ``get_screenshot_as_file`` returns for the target path.
    """
    # The working-directory change is kept because it is a process-wide side
    # effect that other code may rely on.
    os.chdir(Configure.get_ouput_dir())
    file_path = Configure.get_ouput_dir()+"/snapshot"
    # Create exactly the directory the screenshot is written to, instead of a
    # relative "snapshot" that only matched it because of the chdir above;
    # exist_ok also removes the check-then-create race.
    os.makedirs(file_path, exist_ok=True)

    if p_file is None:
      p_file = "snapshot_"+str(uuid.uuid1())+".png"
    target = file_path+"/"+p_file
    print (target)
    print (Configure.get_chrome_webdriver().get_window_size())

    # Get the web page's actual height.
    clientHeight = Configure.get_chrome_webdriver().execute_script("return document.body.clientHeight;")
    print (clientHeight)
    # Adjust the window's height to fit the web page's height.
    cursize = Configure.get_chrome_webdriver().get_window_size()
    Configure.get_chrome_webdriver().set_window_size(cursize["width"], clientHeight)

    return Configure.get_chrome_webdriver().get_screenshot_as_file(target)
示例#3
0
文件: download.py 项目: xlybaby/VAR
    def download_file(self):
        """Download the file referenced by each element of the component.

        The URL of an element is taken from the attribute named by
        ``self._url_property`` when that is set, otherwise from the element's
        text. Files are stored in ``<output dir>/download`` under the last
        path segment of their URL. Elements that yield no URL are skipped.
        """
        # Function-scope import so the block does not depend on the file's
        # import section: ``urlretrieve`` lives in ``urllib.request`` on
        # Python 3 and directly in ``urllib`` on Python 2.
        try:
            from urllib.request import urlretrieve
        except ImportError:
            from urllib import urlretrieve

        elements = self.getComponent()
        file_path = Configure.get_ouput_dir() + "/download"
        # makedirs also creates a missing output directory and tolerates an
        # already existing download directory.
        os.makedirs(file_path, exist_ok=True)
        print(elements)

        if self._url_property:
            url_query = "@" + self._url_property
        else:
            url_query = "text()"

        for elmt in elements:
            url = elmt.xpath(url_query).extract_first()
            if not url:
                # extract_first() produced nothing usable; calling split()
                # on it would raise instead of moving on to the next element.
                print("Skip element without download url")
                continue

            file_name = url.split("/")[-1]
            target = file_path + "/" + file_name
            print("Download file to path: " + target)
            urlretrieve(url, target)
示例#4
0
    def write_array_result(p_dir=None,
                           p_file_name=None,
                           p_contents=None,
                           p_prefix=None,
                           p_suffix=None,
                           p_seperator=None,
                           p_linenum=False):
        """Write a sequence of rows to a UTF-8 text file, one row per line.

        Args:
            p_dir: target directory; falls back to ``Configure.get_ouput_dir()``
                when empty. Created if it does not exist.
            p_file_name: target file name; when ``None`` a name of the form
                ``crawl_<uuid1>.data`` is generated.
            p_contents: iterable of rows. A row is either a string or a list
                of strings.
            p_prefix: optional text written at the start of every row.
            p_suffix: optional text written at the end of every row.
            p_seperator: optional text written between the items of a list
                row; a single space is used when it is not given.
            p_linenum: when true, every row starts with its zero-based index
                followed by two spaces.

        Note:
            A list row always starts with one space before its first item;
            this matches the output format produced so far.
        """
        file_path = p_dir if p_dir else Configure.get_ouput_dir()
        os.makedirs(file_path, exist_ok=True)

        if p_file_name is None:
            p_file_name = "crawl_" + str(uuid.uuid1()) + ".data"
        target = file_path + "/" + p_file_name
        print(target)

        # The context manager closes the file even when a write fails.
        with open(target, 'w', encoding="utf-8") as ofile:
            if p_contents:
                for idx, content in enumerate(p_contents):
                    if p_linenum:
                        ofile.write(str(idx) + "  ")
                    if p_prefix:
                        ofile.write(p_prefix)

                    if isinstance(content, list):
                        for i, item in enumerate(content):
                            # Separator between items; a space before the
                            # first item or when no separator was given.
                            ofile.write(p_seperator if i > 0 and p_seperator else " ")
                            ofile.write(item)
                    else:
                        ofile.write(content)

                    if p_suffix:
                        ofile.write(p_suffix)
                    ofile.write("\n")
示例#5
0
    def write_map_result(p_dir=None, p_file_name=None, p_contents=None):
        """Write groups of items to a UTF-8 file, one JSON object per line.

        Every element of ``p_contents`` is a sequence of dicts. Each dict must
        contain ``"item_id"``; its remaining entries become the fields of that
        item. One line of the form
        ``{ "items": {"<item_id>": {"<key>": "<value>", ...}, ... } }``
        is written per element of ``p_contents``.

        Args:
            p_dir: target directory; falls back to ``Configure.get_ouput_dir()``
                when empty. Created (including parents) if it does not exist.
            p_file_name: target file name; when ``None`` a name of the form
                ``crawl_<uuid1>.data`` is generated.
            p_contents: iterable of item sequences as described above. When
                empty, an empty file is written.

        Raises:
            KeyError: if an item has no ``"item_id"`` entry.
        """
        # Function-scope import so the block does not depend on the file's
        # import section.
        import json

        def quote(value):
            # JSON-encode so embedded quotes, backslashes and newlines cannot
            # break the generated line; non-ASCII text is kept as is.
            return json.dumps(value, ensure_ascii=False)

        file_path = p_dir if p_dir else Configure.get_ouput_dir()
        os.makedirs(file_path, exist_ok=True)

        if p_file_name is None:
            p_file_name = "crawl_" + str(uuid.uuid1()) + ".data"
        target = file_path + "/" + p_file_name
        print(target)

        # The context manager closes the file on every path, including empty
        # contents and failed writes.
        with open(target, 'w', encoding="utf-8") as ofile:
            if p_contents:
                for lines in p_contents:
                    entries = []
                    for item in lines:
                        item_id = item["item_id"]
                        # An item carrying only "item_id" yields an empty
                        # object instead of failing.
                        fields = ", ".join(
                            quote(str(key)) + ": " + quote(item[key])
                            for key in item
                            if key != "item_id"
                        )
                        entries.append(quote(str(item_id)) + ": {" + fields + "}")
                    ofile.write('{ "items": {' + ", ".join(entries) + " } }\n")
示例#6
0
文件: timeseries.py 项目: xlybaby/VAR
 def collect(self, p_href, p_page, p_sceneno, p_pageno, p_delay=None):
   """Load one page, run its actors and store the collected rows.

   For every actor of ``p_page["actors"]`` the recorder result is turned
   into a matrix of ``"label":value`` text fragments: one column for the
   key and one column per entry of ``data["values"]``. The rows of that
   matrix are handed to ``Storage.write_array_result`` with ``{`` / ``}``
   as prefix / suffix and ``,`` as separator.

   Args:
     p_href: address of the page to load.
     p_page: page description; only its ``"actors"`` list is read here.
     p_sceneno: scene number, forwarded to the recorder.
     p_pageno: page number, forwarded to the recorder.
     p_delay: optional value passed to ``time.sleep`` after the request.

   Raises:
     Exception: if an actor has a type other than 5 or 10.
   """
   print ("get page: "+p_href)
   self._driver.get(p_href)
   if p_delay :
     print ("after get page ,we need sleep for a while: " + str(p_delay))
     time.sleep(p_delay)

   selector = Selector(text=self._driver.page_source)
   # Dump of page source and screenshot to fixed paths, kept as is; the
   # context manager closes the dump file even if the write fails.
   with open("/usr/local/var/source1.htm",'w', encoding="utf-8") as ofile:
     ofile.write(self._driver.page_source)
   self._driver.save_screenshot("/usr/local/var/capture1.png")

   for actor in p_page["actors"]:
     acttype = actor["type"]
     properties = actor["properties"]

     if acttype == 5: #element
       recorder = Iterator(p_scenario=self, p_parameters=properties, p_sceneno=p_sceneno, p_pageno=p_pageno)
     elif acttype == 10: #Recordingkv
       recorder = PageKVCrawl(p_scenario=self, p_parameters=properties, p_sceneno=p_sceneno, p_pageno=p_pageno)
     else:
       raise Exception("Unsupported actor type")
     data = recorder.do(p_selector=selector, p_pageid=self.getId()+"_page0")

     # Column matrix built up with hstack; None until the first column exists.
     prevalary = None

     keydata = data["keydata"] if "keydata" in data else None
     keylabel = data["keylabel"]
     if not keydata:
       keyvalue = data["keyvalue"] if "keyvalue" in data else "unknown key value"
     else:
       # Use a fresh list for the key column: the list used here was
       # previously never initialised at this point (unbound for the first
       # actor, left over from the previous actor afterwards).
       keyary = []
       for ikey in keydata:
         keyary.append('"'+keylabel+'":"'+ikey["value"]+'"')
       prevalary = np.array(keyary).reshape((len(keyary),1))

     values = data["values"]
     for valkey, valcollect in values.items():
       valary = []
       for item in valcollect:
         itemval = item["value"]
         itemtype = properties["pageComponent"]["kvv-mapping"]["values"][valkey]["type"]
         if valkey == "timepoint":
           # Rewrite the time point into the form <date>T<time>.000Z.
           timefmt = properties["timepoint"]["format"]
           if timefmt == "yyyy-mm-dd HH:mi:ss" :
             itemval = itemval[0:10] +"T"+  itemval[11:]+".000Z"
           elif timefmt == "HH:mi:ss" :
             itemval = datetime.datetime.now().strftime('%Y-%m-%dT') +  itemval +".000Z"

         # Numbers and booleans are emitted bare, everything else quoted.
         if itemtype == "number" or itemtype == "boolean" :
           kvpair = '"'+valkey+'":'+itemval
         else:
           kvpair = '"'+valkey+'":"'+itemval+'"'
         valary.append(kvpair)
       npvalary = np.array(valary).reshape((len(valary),1))
       if prevalary is None:
         prevalary = npvalary
       else:
         prevalary = np.hstack((prevalary,npvalary))

     if prevalary is None:
       # Neither key data nor values produced a column: nothing to store.
       print ("no data collected for actor")
       continue

     if not keydata:
       # No per-row keys: repeat the single key value for every row and
       # append it as the last column.
       rows, cols = prevalary.shape
       print ("rows, cols, "+str(rows)+str(cols))
       keysary = ['"'+keylabel+'":"'+keyvalue+'"'] * rows
       npkeysary = np.array(keysary).reshape((len(keysary),1))
       prevalary = np.hstack((prevalary,npkeysary))

     out_dir = Configure.get_ouput_dir() + "/" + self.getId()
     filename = self.getTypename()+datetime.datetime.now().strftime('%Y%m%d%H%M%S')+".json"
     Storage.write_array_result(p_dir=out_dir, p_file_name=filename, p_contents=prevalary.tolist(), p_prefix="{", p_suffix="}", p_seperator=",")