Exemplo n.º 1
0
 def get_initial_requests(self):
     """Yield one initial ``Request`` per distinct seed.

     Seeds are collected from ``self.seeds`` and from every non-blank line
     of each file named in ``self.seed_files`` (one seed per line,
     surrounding whitespace stripped).  They are gathered in a set, so
     identical seed strings are requested only once and the order in which
     requests are produced is unspecified.

     Each request carries the value of ``self.get_initial_state(url)``
     under the ``'lcrawl.labels'`` meta key, where ``url`` is the first
     element returned by ``norm_url`` for the seed.
     """
     seeds = set(self.seeds)
     for fname in self.seed_files:
         with open(fname, 'r') as f:
             stripped_lines = (raw_line.strip() for raw_line in f)
             # Drop blank lines: an empty line in a seed file must not be
             # turned into a request for an empty URL.
             seeds.update(line for line in stripped_lines if line)
     for seed in seeds:
         # norm_url returns a pair; only its first element (the URL) is
         # needed here.
         url, _ = norm_url(seed)
         yield Request(url,
                       meta={
                           'lcrawl.labels': self.get_initial_state(url),
                       })
Exemplo n.º 2
0
 def load_from_txt(cls, filename):
     """Build an instance from a plain-text file of labelled URLs.

     Every non-blank line must contain exactly two whitespace-separated
     fields: a comma-separated list of labels, then a URL.  Blank lines are
     skipped, and any run of whitespace is accepted as the field separator.

     Returns ``cls(entries, filename)``, where ``entries`` is a list of
     ``(labels, url)`` tuples in file order: ``labels`` is the list of
     label strings and ``url`` is the first element returned by
     ``norm_url`` for the URL field.

     Raises ``ValueError`` if a non-blank line does not have exactly two
     fields.
     """
     entries = []
     with open(filename, 'r') as f:
         for raw_line in f:
             # split() with no argument ignores leading/trailing whitespace
             # and collapses repeated separators, so a blank line yields [].
             fields = raw_line.split()
             if not fields:
                 continue
             # Unpacking enforces the two-field format (ValueError otherwise).
             labels, url = fields
             entries.append((labels.split(','), norm_url(url)[0]))
     return cls(entries, filename)