Пример #1
0
 def addLinks(self, links, sel=True, startFrom=0):
     """Fetch the pages behind ``links``, save them to disk and parse them.

     The fetched HTML is appended to ``self.htmls``. Every entry of
     ``self.htmls`` is then written to
     ``self.path + self.name + str(index + startFrom) + ".html"`` and
     ``self.trees`` is rebuilt by pairing ``self.htmls`` with ``self.links``.

     Args:
         links: Sequence of URLs to fetch; stored as ``self.links``.
         sel: When true, pages are loaded through a Selenium Firefox
             driver; otherwise they are fetched with ``requests.get``.
         startFrom: Offset added to each index when naming the saved
             ``.html`` files.

     NOTE(review): ``self.htmls`` is extended while ``self.links`` is
     replaced, so on a repeated call the earlier pages are written again
     under the new offset -- confirm this is the intended behavior.
     """
     self.links = links
     if sel:
         driver = webdriver.Firefox()
         try:
             for url in links:
                 driver.get(url)
                 self.htmls.append(driver.page_source)
         finally:
             # Always shut the browser down, even when a page load fails,
             # so no Firefox/driver process is left behind.
             driver.quit()
     else:
         for url in links:
             r = requests.get(url)
             self.htmls.append(r.text)

     # save htmls
     # exist_ok avoids the check-then-create race of a separate exists() test.
     os.makedirs(self.path + self.name, exist_ok=True)
     for num, html in enumerate(self.htmls):
         with open(self.path + self.name + str(num + startFrom) + ".html", "w") as f:
             f.write(html)

     self.trees = [makeTree(" ".join(html.split()), url)
                   for html, url in zip(self.htmls, self.links)]
Пример #2
0
    def load(self):
        """Rebuild a ``Training`` object from the files saved on disk.

        Reads, relative to the prefix ``obj.path + obj.name``:

        * ``sky.training.links`` -- one link per line;
        * ``sky.training.targets`` -- targets separated by ``"sky\\nsky"``,
          each with its whitespace collapsed to single spaces;
        * ``<index>.html`` -- one saved page per link, by link position.

        Each page is whitespace-normalized and passed to ``makeTree``
        together with its link to populate ``obj.trees``.

        Returns:
            A new ``Training`` instance built from ``self.name`` and
            ``self.path`` with ``links``, ``targets``, ``htmls`` and
            ``trees`` populated.
        """
        obj = Training(self.name, self.path)

        with open(obj.path + obj.name + "sky.training.links") as f:
            links = f.read().split('\n')
        # An empty or newline-terminated file makes split() end with blank
        # entries; drop them so no phantom link (and no lookup of a
        # non-existent "<index>.html") is produced.
        while links and not links[-1].strip():
            links.pop()
        obj.links = links

        # load targets
        with open(obj.path + obj.name + "sky.training.targets") as f:
            targets = f.read()
            obj.targets = [" ".join(x.split()) for x in targets.split("sky\nsky")]

        # load htmls
        obj.htmls = []
        for num in range(len(obj.links)):
            with open(obj.path + obj.name + str(num) + ".html") as f:
                obj.htmls.append(f.read())

        obj.trees = [makeTree(" ".join(html.split()), url)
                     for html, url in zip(obj.htmls, obj.links)]

        return obj
Пример #3
0
 def addLinks(self, links, sel=True, startFrom=0):
     """Fetch the pages behind ``links``, save them to disk and parse them.

     The fetched HTML is appended to ``self.htmls``. Every entry of
     ``self.htmls`` is then written to
     ``self.path + self.name + str(index + startFrom) + ".html"`` and
     ``self.trees`` is rebuilt by pairing ``self.htmls`` with ``self.links``.

     Args:
         links: Sequence of URLs to fetch; stored as ``self.links``.
         sel: When true, pages are loaded through a Selenium Firefox
             driver; otherwise they are fetched with ``requests.get``.
         startFrom: Offset added to each index when naming the saved
             ``.html`` files.

     NOTE(review): ``self.htmls`` is extended while ``self.links`` is
     replaced, so on a repeated call the earlier pages are written again
     under the new offset -- confirm this is the intended behavior.
     """
     self.links = links
     if sel:
         driver = webdriver.Firefox()
         try:
             for url in links:
                 driver.get(url)
                 self.htmls.append(driver.page_source)
         finally:
             # Always shut the browser down, even when a page load fails,
             # so no Firefox/driver process is left behind.
             driver.quit()
     else:
         for url in links:
             r = requests.get(url)
             self.htmls.append(r.text)

     # save htmls
     # exist_ok avoids the check-then-create race of a separate exists() test.
     os.makedirs(self.path + self.name, exist_ok=True)
     for num, html in enumerate(self.htmls):
         with open(self.path + self.name + str(num + startFrom) + ".html", "w") as f:
             f.write(html)

     self.trees = [makeTree(" ".join(html.split()), url)
                   for html, url in zip(self.htmls, self.links)]
Пример #4
0
    def load(self):
        """Rebuild a ``Training`` object from the files saved on disk.

        Reads, relative to the prefix ``obj.path + obj.name``:

        * ``sky.training.links`` -- one link per line;
        * ``sky.training.targets`` -- targets separated by ``"sky\\nsky"``,
          each with its whitespace collapsed to single spaces;
        * ``<index>.html`` -- one saved page per link, by link position.

        Each page is whitespace-normalized and passed to ``makeTree``
        together with its link to populate ``obj.trees``.

        Returns:
            A new ``Training`` instance built from ``self.name`` and
            ``self.path`` with ``links``, ``targets``, ``htmls`` and
            ``trees`` populated.
        """
        obj = Training(self.name, self.path)

        with open(obj.path + obj.name + "sky.training.links") as f:
            links = f.read().split('\n')
        # An empty or newline-terminated file makes split() end with blank
        # entries; drop them so no phantom link (and no lookup of a
        # non-existent "<index>.html") is produced.
        while links and not links[-1].strip():
            links.pop()
        obj.links = links

        # load targets
        with open(obj.path + obj.name + "sky.training.targets") as f:
            targets = f.read()
            obj.targets = [" ".join(x.split()) for x in targets.split("sky\nsky")]

        # load htmls
        obj.htmls = []
        for num in range(len(obj.links)):
            with open(obj.path + obj.name + str(num) + ".html") as f:
                obj.htmls.append(f.read())

        obj.trees = [makeTree(" ".join(html.split()), url)
                     for html, url in zip(obj.htmls, obj.links)]

        return obj