Example #1
 # runs as a Twisted coroutine: getPage returns a Deferred, and
 # returnD (defer.returnValue) hands the result back to the caller
 @inlineCallbacks
 def send_scrapy_query(self, action, arguments=None):
     url = "%s%s.json" % (self.scrapyd, action)
     method = "POST"
     headers = None
     if action.startswith('list'):
         # scrapyd's list* endpoints take their arguments as a GET querystring
         method = "GET"
         if arguments:
             args = [
                 str(k) + '=' + str(v) for (k, v) in arguments.iteritems()
             ]
             url += '?' + '&'.join(args)
             arguments = None
     elif arguments:
         # every other endpoint expects a form-encoded POST body
         arguments = urlencode(arguments)
         headers = {'Content-Type': 'application/x-www-form-urlencoded'}
     try:
         res = yield getPage(url, method=method, postdata=arguments,
                             headers=headers, timeout=30)
         result = loadjson(res)
         returnD(result)
     except ConnectionRefusedError:
         returnD(format_error("Could not contact scrapyd server, "
                              "maybe it's not started..."))
     except Exception as e:
         returnD(format_error(e))
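
For context, a quick sketch of how a method like this might be called from another inlineCallbacks coroutine. The caller name deploy_crawler and the argument values are hypothetical illustrations, not part of the original snippet; the "schedule" and "listjobs" actions and their project/spider parameters do match scrapyd's documented JSON API.

 @inlineCallbacks
 def deploy_crawler(self, corpus):
     # hypothetical caller: queue a spider run, then inspect pending jobs
     res = yield self.send_scrapy_query("schedule", {
         "project": "hyphe",          # assumed project name
         "spider": "pages",           # assumed spider name
         "corpus": corpus
     })
     pending = yield self.send_scrapy_query("listjobs",
                                            {"project": "hyphe"})
     returnD((res, pending))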
Example #2
 # same coroutine convention as above: decorated with @inlineCallbacks
 @inlineCallbacks
 def get_scrapyd_status(self):
     url = "%sjobs" % self.scrapyd
     try:
         jobs = yield getPage(url)
     except TimeoutError:
         logger.msg(
             "WARNING: ScrapyD's monitoring website does not seem to answer"
         )
         returnD(None)
     except Exception as e:
         logger.msg(
             "WARNING: ScrapyD's monitoring website seems down: %s %s" %
             (type(e), e))
         returnD(None)
     status = {"pending": 0, "running": 0}
     read = None
     # scrape the HTML jobs page: rows are split on "><tr", and the
     # Pending/Running/Finished section headers switch which counter
     # the rows that follow them feed
     for line in jobs.split("><tr"):
         if ">Pending<" in line:
             read = "pending"
         elif ">Running<" in line:
             read = "running"
         elif ">Finished<" in line:
             read = None
         elif read == "running":
             # running rows embed the job name as "<db_name>_<corpus>":
             # count each corpus individually on top of the global total
             pattern = ">" + self.db_name + "_"
             if pattern not in line:
                 continue
             corpus = line.split(pattern)[1].split("</td>")[0]
             if corpus not in status:
                 status[corpus] = 0
             status[corpus] += 1
             status[read] += 1
         elif read:
             status[read] += 1
     returnD(status)
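
The scraping loop can be sanity-checked offline. The fragment below is only an assumption about what scrapyd's jobs page looks like, trimmed to the markers the parser actually reads, with "hyphe" standing in for self.db_name:

def parse_jobs(jobs, db_name="hyphe"):
    # standalone replay of the loop above, for testing outside Twisted
    status = {"pending": 0, "running": 0}
    read = None
    for line in jobs.split("><tr"):
        if ">Pending<" in line:
            read = "pending"
        elif ">Running<" in line:
            read = "running"
        elif ">Finished<" in line:
            read = None
        elif read == "running":
            pattern = ">" + db_name + "_"
            if pattern not in line:
                continue
            corpus = line.split(pattern)[1].split("</td>")[0]
            status[corpus] = status.get(corpus, 0) + 1
            status[read] += 1
        elif read:
            status[read] += 1
    return status

sample = ("<table><tr><th>Pending</th></tr><tr><td>hyphe_A</td></tr>"
          "<tr><th>Running</th></tr><tr><td>hyphe_A</td></tr>"
          "<tr><td>hyphe_B</td></tr><tr><th>Finished</th></tr></table>")
print(parse_jobs(sample))   # {'pending': 1, 'running': 2, 'A': 1, 'B': 1}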
Example #3
 @inlineCallbacks
 def get_scrapyd_status(self):
     url = "%sjobs" % self.scrapyd
     try:
         jobs = yield getPage(url)
     except TimeoutError:
         logger.msg(
             "WARNING: ScrapyD's monitoring website does not seem to answer"
         )
         returnD(None)
     except Exception as e:
         logger.msg(
             "WARNING: ScrapyD's monitoring website seems down: %s %s" %
             (type(e), e))
         returnD(None)
     status = {"pending": 0}
     read = None
     for line in jobs.split("><tr"):
         if ">Pending<" in line:
             read = "pending"
         elif ">Running<" in line:
             read = "running"
         elif ">Finished<" in line:
             read = None
         elif read == "running":
             # this variant slices the corpus name straight out of the cell:
             # everything between the first "." and the first "<" after the
             # row's opening tag
             corpus = line[line.find(".") + 1:line.find("<", 2)]
             if corpus not in status:
                 status[corpus] = 0
             status[corpus] += 1
         elif read:
             status[read] += 1
     returnD(status)
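
The difference from Example #2 is how the corpus name is recovered: instead of splitting on a db_name prefix, this version slices between the first "." and the first "<" past the row's opening tag, which presumes job identifiers of the form <project>.<corpus>. A short illustration on an assumed row:

row = "><td>hyphe.mycorpus</td></tr"        # assumed cell layout
corpus = row[row.find(".") + 1:row.find("<", 2)]
# corpus == "mycorpus"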
Example #4
# coroutine again: fetches Mozilla's public suffix list and builds both a
# flat rules/exceptions listing and a nested lookup tree
@inlineCallbacks
def collect_tlds():
    tree = {}
    double_list = {"rules": [], "exceptions": []}
    tldlist = yield getPage(MOZ_TLD_LIST)
    for line in tldlist.split("\n"):
        line = line.strip()
        # skip blank lines and "//" comments from the public suffix list
        if not line or line.startswith("//"):
            continue
        chunks = line.decode('utf-8').split('.')
        add_tld_chunks_to_tree(chunks, tree)
        if line[0] == '!':
            # "!"-prefixed lines are exceptions to a broader wildcard rule
            double_list["exceptions"].append(line[1:])
        else:
            double_list["rules"].append(line)
    returnD((double_list, tree))
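
add_tld_chunks_to_tree is not shown in this example; below is a minimal hypothetical sketch of one way it could work, assuming tree is a nested dict keyed by suffix labels from the TLD inwards:

def add_tld_chunks_to_tree(chunks, tree):
    # hypothetical helper: insert labels right-to-left so a hostname can
    # later be matched by walking from its TLD towards the subdomains
    for chunk in reversed(chunks):
        tree = tree.setdefault(chunk, {})

# e.g. inserting [u"co", u"uk"] then [u"com"] leaves
# tree == {u"uk": {u"co": {}}, u"com": {}}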