def outReceived(self, data): logger.debug("spawnrunner received for client# %d %s" % (self.client.clientnumber, data[:180])) # although the client can parse the records itself, it is necessary to split them up here correctly so that this code can insert its own records into the stream. lines = [ ] spldata = data.split("\n") self.lbuffer.append(spldata.pop(0)) while spldata: lines.append("".join(self.lbuffer)) self.lbuffer = [ spldata.pop(0) ] # next one in for line in lines: # strip out the httpheaders that come back at the start of a node connection if not self.httpheadersdone: if re.match("HTTP/", line): assert not self.httpheaders continue if line == "\r": self.httpheadersdone = True continue mheader = re.match("(.*?):\s*(.*)\r", line) if not mheader: logger.error("Bad header: "+str([line])) else: self.httpheaders.append((mheader.group(1), mheader.group(2))) continue logger.info("Received and will write: "+str([line])) self.client.writeall(line) if self.runobjectmaker: self.runobjectmaker.receiveline(line)
def requestoverduescrapers(self):
    """Poll the Django API for scrapers whose scheduled run is overdue.

    Issues an asynchronous GET against the scraper search endpoint; the
    response/failure is handled by requestoverduescrapersResponse /
    requestoverduescrapersFailure.
    """
    logger.info("requestoverduescrapers")
    query_params = {
        "format": "jsondict",
        "searchquery": "*OVERDUE*",
        "maxrows": self.maxscheduledscrapers + 5,
    }
    url = urlparse.urljoin(config.get("twister", "apiurl"), '/api/1.0/scraper/search')
    encoded_params = urllib.urlencode(query_params)
    logger.info("API URL: " + url + " with params " + encoded_params)
    deferred = agent.request('GET', "%s?%s" % (url, encoded_params))
    deferred.addCallbacks(self.requestoverduescrapersResponse,
                          self.requestoverduescrapersFailure)
def MakeRunner(scrapername, guid, language, urlquery, username, code, client, beta_user, attachables, rev, bmakerunobject, agent, scheduled): srunner = spawnRunner(client, code) # reuse this class and its functions jdata = { } jdata["code"] = code.replace('\r', '') jdata["cpulimit"] = 80 jdata["draft"] = (not scrapername) jdata["username"] = username # comes through when done with stimulate_run, and we can use this for the dataproxy permissions (whether it can add to the attachables list) jdata["language"] = language jdata["scraperid"] = guid jdata["urlquery"] = urlquery jdata["scrapername"] = scrapername jdata["beta_user"] = beta_user jdata["attachables"] = attachables jdata["rev"] = rev # invent the runid (should actually jdata["runid"] = '%.6f_%s' % (time.time(), uuid.uuid4()) if jdata.get("draft"): jdata["runid"] = "draft|||%s" % jdata["runid"] #logger.info("jjjjjjj "+str(jdata)) srunner.jdata = jdata if bmakerunobject: srunner.runobjectmaker = ScheduledRunMessageLoopHandler(client, username, agent, jdata["runid"],rev) logger.info("Making run object on %s client# %d" % (scrapername, client.clientnumber)) target = 'scheduled' if scheduled else 'live' nodename, nodehost, nodeport = choose_controller( target ) logger.info('Running code on %s:%s (%s/%s)' % (nodehost,str(nodeport),nodename,target) ) deferred = clientcreator.connectTCP(nodehost, nodeport) deferred.addCallbacks(srunner.gotcontrollerconnectionprotocol, srunner.controllerconnectionrequestFailure) return srunner
def dataReceived(self, data):
    """Split the incoming byte stream into CRLF-delimited lines and
    dispatch each one: raw HTTP GETs are handed to the http handler,
    everything else is parsed as a json command record.
    """
    # probably should be using LineReceiver
    lines = (self.bufferclient+data).split("\r\n")
    # last element is a (possibly empty) partial line; keep it for next call
    self.bufferclient = lines.pop(-1)
    for lline in lines:
        line = lline.strip()
        # handle case where we have an http connection rather than plain socket connection
        if not self.clienttype and line[:4] == 'GET ':
            self.clienttype = "httpget"
            self.factory.clientConnectionRegistered(self)
            self.httpheaders = [ ]
            self.httpgetreplied = False
        if self.clienttype == "httpget":
            self.handlehttpgetcase(line)
        # otherwise this is a message coming in from django or the browser editor
        # (though the django ones could be done with an httppost or something)
        elif line:
            try:
                parsed_data = json.loads(line)
            except ValueError:
                logger.warning("Nonparsable command ")
                logger.info("Bad json parsing: client# %d %s" % (self.clientnumber, str([line[:1000]])))
                self.writejson({'content':"Command not json parsable: %s " % line, 'message_type':'console'})
                continue
            # every valid message must be a dict carrying a 'command' key
            if type(parsed_data) != dict or 'command' not in parsed_data:
                logger.info("Bad json parsing not dict: client# %d %s" % (self.clientnumber, str([line[:1000]])))
                self.writejson({'content':"Command not json dict with command: %s " % line, 'message_type':'console'})
                continue
            command = parsed_data.get('command')
            self.clientcommand(command, parsed_data)
def connectionMade(self):
    """Accept the new TCP connection only if the peer address is in the
    allowed_ips whitelist; otherwise drop it immediately.
    """
    logger.info("connection client# %d" % self.factory.clientcount)
    # This returns localhost and is unable to distinguish
    # between orbited or django source.
    sock = self.transport.getHandle()
    peer_ip = sock.getpeername()[0]
    if peer_ip not in allowed_ips:
        logger.info('Refused connection from %s' % (peer_ip,))
        self.transport.loseConnection()
        return
    self.factory.clientConnectionMade(self)
def controllerconnectionrequestFailure(self, failure):
    """Errback for the controller TCP connection attempt: log a reason,
    end the run, and absorb ConnectionRefusedError.
    """
    # TODO: Allow us to retry on a different host if it failed.
    # Unfortunately it is here that we need to retry the connection on a different
    # host. Maybe that we should just mark the current connection as failed but
    # need to make sure the connection is still there.
    sreason = ("Connection to node-controller refused."
               if failure.type == ConnectionRefusedError
               else "controllerconnectionrequest failure received: " + str(failure))
    logger.info("controllerconnectionrequest failure received: " + sreason)
    self.endingrun(sreason)
    failure.trap(ConnectionRefusedError)  # denotes that we have handled this correctly
def main():
    """Daemonize, drop privileges, then supervise a forked worker running
    the twister reactor, respawning it whenever it dies.
    """
    # daemon mode: classic double-fork so the worker is re-parented to init
    if os.fork() == 0 :
        os.setsid()
        sys.stdin = open('/dev/null')
        if stdoutlog:
            sys.stdout = stdoutlog
            sys.stderr = stdoutlog
        if os.fork() == 0:
            # grandchild: wait until the intermediate parent has exited
            # (i.e. until init has adopted us) before proceeding
            ppid = os.getppid()
            while ppid != 1:
                time.sleep(1)
                ppid = os.getppid()
        else:
            os._exit(0)
    else:
        os.wait()
        sys.exit(1)

    # record our pid for the init script
    pf = open(poptions.pidfile, 'w')
    pf.write('%d\n' % os.getpid())
    pf.close()

    # drop root privileges if requested
    if poptions.setuid:
        gid = grp.getgrnam("nogroup").gr_gid
        os.setregid(gid, gid)
        uid = pwd.getpwnam("nobody").pw_uid
        os.setreuid(uid, uid)

    logging.config.fileConfig(poptions.config)

    # subproc mode: the daemon loops forking workers; the child breaks out
    # to run the reactor, the parent waits and respawns on exit
    signal.signal(signal.SIGTERM, sigTerm)
    while True:
        child = os.fork()
        if child == 0 :
            time.sleep (1)
            break
        sys.stdout.write("Forked subprocess: %d\n" % child)
        sys.stdout.flush()
        os.wait()

    # http://localhost:9010/update?runid=1234&message={'sources':'somejson}
    runnerfactory = RunnerFactory()
    port = config.getint('twister', 'port')
    reactor.listenTCP(port, runnerfactory)
    logger.info("Twister listening on port %d" % port)
    reactor.run()  # this function never returns
def lconnectionopen(self, parsed_data):
    """Record session metadata from the editor's initial connection_open
    message and register this client with the factory.
    """
    fetch = parsed_data.get
    self.guid = fetch('guid', '')
    self.username = fetch('username', '')
    self.userrealname = fetch('userrealname', self.username)
    self.scrapername = fetch('scrapername', '')
    self.scraperlanguage = fetch('language', '')
    self.isstaff = (fetch('isstaff') == "yes")
    # a monitoring connection watches all activity; otherwise it is an editor
    self.clienttype = "umlmonitoring" if fetch('umlmonitoring') == "yes" else "editing"
    self.savecode_authorized = (fetch('savecode_authorized') == "yes")
    self.originalrev = fetch('originalrev', '')
    logger.info("connection open %s: %s %s client# %d" % (self.clienttype, self.cchatname, self.scrapername, self.clientnumber))
    # this will cause a notifyEditorClients to be called for everyone on this scraper
    self.factory.clientConnectionRegistered(self)
def updaterunobject(self, bfinished):
    """POST the accumulated run state back to Django's
    twistermakesrunevent endpoint.

    bfinished -- True when the run has completed; adds the final
                 exitstatus and the json-encoded domainscrapes stats
                 to the post.
    """
    url = urlparse.urljoin(djangourl, 'scraper-admin/twistermakesrunevent/')
    logger.info("attempting to post: "+url)
    self.upost["output"] = self.output
    if bfinished:
        # replaces the old `x and 'a' or 'b'` hack with a real conditional
        self.upost["exitstatus"] = 'exceptionmessage' if self.exceptionmessage else 'done'
        self.upost["domainscrapes"] = json.dumps(self.domainscrapes)
    if self.exceptionmessage:
        self.upost['exceptionmessage'] = self.exceptionmessage
    # urllib.urlencode applies str() to each value, which raises
    # UnicodeEncodeError on non-ascii text, so encode unicode values to
    # utf8 bytes first (work on a copy so self.upost keeps unicode).
    lupost = self.upost.copy()
    for key, value in lupost.items():
        if isinstance(value, unicode):
            lupost[key] = value.encode("utf8")
    ulupost = urllib.urlencode(lupost)
    logger.info(self.upost)
    d = self.agent.request('POST', url, Headers(), StringProducer(ulupost))
    d.addCallbacks(self.updaterunobjectResponse, self.updaterunobjectFailure)
def requestoverduescrapersAction(self, overduelist):
    """Start scheduled runs for overdue scrapers, keeping the number of
    concurrent scheduled runners below maxscheduledscrapers.

    For each overdue scraper not already running or being edited, a
    synthetic RunnerProtocol client is fabricated (no real socket) and a
    runner is created via MakeRunner.
    """
    logger.info("overdue "+str([od.get("short_name") for od in overduelist]))
    while len(self.scheduledrunners) < self.maxscheduledscrapers and overduelist:
        scraperoverdue = overduelist.pop(0)
        scrapername = scraperoverdue["short_name"]
        if scrapername in self.scheduledrunners:
            continue
        # Avoids scheduling cases where someone is editing.
        guid = scraperoverdue.get('guid', '')
        if guid in self.guidclientmap:
            continue
        # Fabricate a new client (not actually connected to a socket or made by the factory)
        sclient = RunnerProtocol()
        sclient.factory = self
        sclient.guid = guid
        sclient.username = '******'
        sclient.userrealname = sclient.username
        sclient.scrapername = scrapername
        sclient.clienttype = "scheduledrun"
        sclient.originalrev = scraperoverdue.get('rev', '')
        sclient.savecode_authorized = False
        sclient.scraperlanguage = scraperoverdue.get('language', '')
        code = scraperoverdue.get('code', '')
        urlquery = scraperoverdue.get('envvars', {}).get("QUERY_STRING", "")
        # Allocates the client number.
        self.clientConnectionMade(sclient)
        self.scheduledrunners[scrapername] = sclient
        self.clientConnectionRegistered(sclient)
        logger.info("starting off scheduled client: %s %s client# %d" % (sclient.cchatname, sclient.scrapername, sclient.clientnumber))
        beta_user = scraperoverdue.get("beta_user", False)
        attachables = scraperoverdue.get('attachables', [])
        # bmakerunobject=True attaches the run-event poster; scheduled=True
        # picks the scheduled controller pool
        sclient.processrunning = MakeRunner(sclient.scrapername, sclient.guid, sclient.scraperlanguage, urlquery, sclient.username, code, sclient, beta_user, attachables, sclient.originalrev, True, agent, True)
        self.runidclientmap[sclient.processrunning.jdata["runid"]] = sclient
        self.notifyMonitoringClients(sclient)
def updaterunobjectFailure(self, failure):
    """Errback for the twistermakesrunevent POST; logs the failure.

    The log text previously said "requestoverduescrapers failure received",
    copy-pasted from requestoverduescrapersFailure, which misattributed the
    failing operation in the logs.
    """
    # could check for the type and use the retry functionality that exists in twister
    logger.info("updaterunobject failure received "+str(failure))
class ScheduledRunMessageLoopHandler:
    """Consumes the json message stream of a scheduled (non-interactive)
    run and mirrors what editor.js would do in the browser: accumulate
    console output, completion status, per-domain scrape statistics and
    exception details, and POST them back to Django as run events.
    """
    # a partial implementation of editor.js

    def __init__(self, client, username, agent, runid, rev):
        self.exceptionmessage = [ ]
        self.completiondata = None
        self.outputmessage = [ ]            # pending console chunks not yet in self.output
        self.domainscrapes = { }            # domain: { "pages_scraped":0, "bytes_scraped":0 }
        self.discarded_lines = 0
        self.discarded_characters = 0
        self.output = ""
        self.run_ended = None
        # fields posted to the Django twistermakesrunevent endpoint
        self.upost = {"django_key":djangokey}
        self.upost["pages_scraped"] = 0
        self.upost["records_produced"] = 0
        self.upost["scrapername"] = client.scrapername
        self.upost["clientnumber"] = client.clientnumber
        self.upost['runID'] = runid
        self.upost['run_id'] = runid
        self.upost['revision'] = rev
        self.username = username
        self.agent = agent

    def updaterunobjectFailure(self, failure):
        """Errback for the run-event POST; just logs."""
        # could check for the type and use the retry functionality that exists in twister
        # NOTE(review): log text looks copy-pasted from
        # requestoverduescrapersFailure — probably should say "updaterunobject"
        logger.info("requestoverduescrapers failure received "+str(failure))

    def updaterunobjectResponse(self, response):
        """Callback for the run-event POST; drains the response body."""
        finished = Deferred()
        response.deliverBody(updaterunobjectReceiver(finished))
        return finished

    def updaterunobject(self, bfinished):
        """POST the accumulated run state to Django; bfinished adds the
        final exitstatus and domainscrapes fields."""
        url = urlparse.urljoin(djangourl, 'scraper-admin/twistermakesrunevent/')
        logger.info("attempting to post: "+url)
        self.upost["output"] = self.output
        if bfinished:
            self.upost["exitstatus"] = self.exceptionmessage and 'exceptionmessage' or 'done'
            self.upost["domainscrapes"] = json.dumps(self.domainscrapes)
        if self.exceptionmessage:
            self.upost['exceptionmessage'] = self.exceptionmessage
        # urllib.urlencode applies str() to each value in the list, which is dumb.
        # to get a proper error, print some chinese characters
        # need to get an explanation for this design of urlencode
        lupost = self.upost.copy()
        for key in lupost:
            if type(lupost[key]) == unicode:
                lupost[key] = lupost[key].encode("utf8")
        ulupost = urllib.urlencode(lupost)
        logger.info(self.upost)
        d = self.agent.request('POST', url, Headers(), StringProducer(ulupost))
        d.addCallbacks(self.updaterunobjectResponse, self.updaterunobjectFailure)

    def receiveline(self, line):
        """Handle one json record emitted by the running scraper process,
        updating output buffers, statistics and exception state."""
        try:
            data = json.loads(line)
            if not isinstance(data, dict):
                raise TypeError('Incorrect type of JSON')
        except Exception, e:
            logger.debug( "Failed to json.loads() %s %s" % (str([line]), str(e)))
            return
        message_type = data.get('message_type')
        content = data.get("content")
        if message_type == 'executionstatus':
            if content == "startingrun":
                self.output = "%sEXECUTIONSTATUS: uml=%s usename=%s runid=%s\n" % (self.output, data.get("uml"), self.username, data.get("runID"))
                self.updaterunobject(False)
            # generated by scriptmanager executor.js
            elif content == "runcompleted":
                logger.debug( "Got run completed : %s" % (line,) )
                self.completiondata = data
                # build a human-readable completion summary
                self.completionmessage = '';
                if data.get('elapsed_seconds'):
                    self.completionmessage += str(data.get("elapsed_seconds")) + " seconds elapsed, "
                if data.get("CPU_seconds", False):  # Until we can get CPU used
                    self.completionmessage += str(data.get("CPU_seconds")) + " CPU seconds used";
                if "exit_status" in data and data.get("exit_status") != 0:
                    self.completionmessage += ", exit status " + str(data.get("exit_status"));
                if "term_sig_text" in data:
                    self.completionmessage += ", terminated by " + data.get("term_sig_text");
                elif "term_sig" in data:
                    self.completionmessage += ", terminated by signal " + str(data.get("term_sig"));
                logger.debug( "Completion status : %s" % (line,) )
            # generated by twister after completion
            elif content == "runfinished":
                # run object is updated following the schedulecompleted() function call
                pass
            else:
                logger.warning("Unrecognized message: %s %s" % (message_type, content))
        elif message_type == "sources":
            self.upost["pages_scraped"] += 1  # soon to be deprecated
            url = data.get('url')
            netloc = "%s://%s" % urlparse.urlparse(url)[:2]
            # record the first external (non-scraperwiki, non-robots) url fetched
            if "first_url_scraped" not in self.upost and url and netloc[-16:] != '.scraperwiki.com' and url[-10:] != 'robots.txt':
                self.upost["first_url_scraped"] = data.get('url')
            if netloc:
                if netloc not in self.domainscrapes:
                    self.domainscrapes[netloc] = { "pages_scraped":0, "bytes_scraped":0 }
                self.domainscrapes[netloc]["pages_scraped"] += 1
                self.domainscrapes[netloc]["bytes_scraped"] += int(data.get('bytes'))
        elif message_type == "data":
            self.upost["records_produced"] += 1
        elif message_type == "sqlitecall":
            if data.get('insert'):
                self.upost["records_produced"] += 1  # only one of these ever
        elif message_type == "exception":
            self.upost["exception_message"] = data.get('exceptiondescription')
            for stackentry in data.get("stackdump"):
                sMessage = stackentry.get('file')
                if sMessage:
                    if sMessage == "<string>":
                        sMessage = "Line %d: %s" % (stackentry.get('linenumber', -1), stackentry.get('linetext'))
                    if stackentry.get('furtherlinetext'):
                        sMessage += " -- " + stackentry.get('furtherlinetext')
                    self.exceptionmessage.append(sMessage)
                if stackentry.get('duplicates') and stackentry.get('duplicates') > 1:
                    self.exceptionmessage.append(" + %d duplicates" % stackentry.get('duplicates'))
            if data.get("blockedurl"):
                self.exceptionmessage.append("Blocked URL: %s" % data.get("blockedurl"))
            self.exceptionmessage.append('')
            self.exceptionmessage.append(data.get('exceptiondescription'))
        elif message_type == "console":
            # chop console text into bounded chunks for the output buffer
            while content:
                self.outputmessage.append(content[:APPROXLENOUTPUTLIMIT])
                content = content[APPROXLENOUTPUTLIMIT:]
        elif message_type not in ["editorstatus", "saved", "chat"]:
            logger.info("Unknown record type: %s\n" % str([line[:2000]]))
        # live update of event output so we can watch it when debugging scraperwiki platform
        # reduce pressure on the server by only updating when we over-run the buffer
        if self.outputmessage and len(self.output) < APPROXLENOUTPUTLIMIT:
            while self.outputmessage:
                self.output = "%s%s" % (self.output, self.outputmessage.pop(0))
                if len(self.output) >= APPROXLENOUTPUTLIMIT:
                    self.output = "%s%s" % (self.output, temptailmessage)
                    self.updaterunobject(False)
                    break
        self.run_ended = datetime.datetime.now()
        #self.updaterunobject(False)
        # cap the pending buffer; count what we throw away
        while len(self.outputmessage) >= TAIL_LINES:
            discarded = self.outputmessage.pop(0)
            self.discarded_lines += 1
            self.discarded_characters += len(discarded)
def requestoverduescrapersFailure(self, failure):
    """Errback for the overdue-scrapers API poll: log the failure and
    absorb connection refusals.
    """
    refused = (failure.type == ConnectionRefusedError)
    if refused:
        logger.info("requestoverduescrapers ConnectionRefused")
    else:
        logger.warning("requestoverduescrapers failure received "+str(failure.type))
    failure.trap(ConnectionRefusedError)  # (doesn't do anything as there's no higher level error handling anyway)
def clientcommand(self, command, parsed_data):
    """Dispatch one parsed json command record from a connected client
    (editor, django, or monitoring connection).
    """
    if command != 'typing':
        logger.debug("command %s client# %d" % (command, self.clientnumber))
    # update the lasttouch values on associated aggregations
    if command != 'automode' and self.clienttype == "editing":
        self.clientlasttouch = datetime.datetime.now()
        if self.guid and self.username:
            assert self.username in self.guidclienteditors.usereditormap
            self.guidclienteditors.usereditormap[self.username].userlasttouch = self.clientlasttouch
            self.guidclienteditors.scraperlasttouch = self.clientlasttouch

    # data uploaded when a new connection is made from the editor
    if command == 'connection_open':
        self.lconnectionopen(parsed_data)

    # finds the corresponding client and presses the run button on it
    # receives a single record through the pipeline
    elif command == 'stimulate_run':
        self.clienttype = "stimulate_run"
        self.factory.clientConnectionRegistered(self)
        scrapername = parsed_data["scrapername"]
        guid = parsed_data["guid"]
        assert guid
        username = parsed_data["username"]
        clientnumber = parsed_data["clientnumber"]
        # locate the editing client this run request refers to
        client = None
        eoos = self.factory.guidclientmap.get(guid)
        if eoos:
            usereditor = eoos.usereditormap.get(username)
            if usereditor:
                for lclient in usereditor.userclients:
                    if lclient.clientnumber == clientnumber:
                        client = lclient
        # shared-secret check against django before acting
        if parsed_data.get('django_key') != djangokey:
            logger.error("djangokey_mismatch")
            self.writejson({'status':'twister djangokey mismatch'})
            if client:
                client.writejson({"message_type":"console", "content":"twister djangokey mismatch"})
                client.writejson({'message_type':'executionstatus', 'content':'runfinished'})
            client = None
        if client:
            logger.info("stimulate on : %s %s client# %d" % (client.cchatname, client.scrapername, client.clientnumber))
            assert client.clienttype == "editing" and client.guid
            if not client.processrunning:
                client.runcode(parsed_data)
                self.writejson({"status":"run started"})
            else:
                client.writejson({"message_type":"console", "content":"client already running"})
                self.writejson({"status":"client already running"})
        else:
            parsed_data.pop("code", None)  # shorten the log message
            logger.warning("client not found %s" % parsed_data)
            self.writejson({"status":"client not found"})
        self.transport.loseConnection()

    elif command == 'rpcrun':
        self.username = parsed_data.get('username', '')
        self.userrealname = parsed_data.get('userrealname', self.username)
        self.scrapername = parsed_data.get('scrapername', '')
        self.scraperlanguage = parsed_data.get('language', '')
        self.guid = parsed_data.get("guid", '')
        if parsed_data.get('django_key') == djangokey:
            self.clienttype = "rpcrunning"
            logger.info("connection open %s: %s %s client# %d" % (self.clienttype, self.username, self.scrapername, self.clientnumber))
            self.factory.clientConnectionRegistered(self)
            self.runcode(parsed_data)
            # termination is by the calling function when it receives an executionstatus runfinished message
        else:
            logger.error("djangokey_mismatch")
            self.writejson({'status':'twister djangokey mismatch'})
            self.transport.loseConnection()

    elif command == 'saved':
        line = json.dumps({'message_type' : "saved", 'chatname' : self.chatname})
        otherline = json.dumps({'message_type' : "othersaved", 'chatname' : self.chatname})
        self.guidclienteditors.rev = parsed_data["rev"]
        self.guidclienteditors.chainpatchnumber = 0
        self.writeall(line, otherline)
        self.factory.notifyMonitoringClientsSmallmessage(self, "savenote")

    # should record the rev and chainpatchnumber so when we join to this scraper we know
    elif command == 'typing':
        logger.debug("command %s client# %d insertlinenumber %s" % (command, self.clientnumber, parsed_data.get("insertlinenumber")))
        jline = {'message_type' : "typing", 'content' : "%s typing" % self.chatname}
        jotherline = parsed_data.copy()
        jotherline.pop("command")
        jotherline["message_type"] = "othertyping"
        jotherline["content"] = jline["content"]
        self.guidclienteditors.chainpatchnumber = parsed_data.get("chainpatchnumber")
        self.writeall(json.dumps(jline), json.dumps(jotherline))
        self.factory.notifyMonitoringClientsSmallmessage(self, "typingnote")

    # this one only applies to draft scrapers when you click run
    elif command == 'run':
        if self.processrunning:
            self.writejson({'content':"Already running! (shouldn't happen)", 'message_type':'console'});
            return
        if self.username:
            if self.automode == 'autoload':
                self.writejson({'content':"Not supposed to run! "+self.automode, 'message_type':'console'});
                return
        if parsed_data.get('guid'):
            self.writejson({'content':"scraper run can only be done through stimulate_run method", 'message_type':'console'});
            return
        logger.info("about to run code %s" % str(parsed_data)[:100])
        self.runcode(parsed_data)

    elif command == "umlcontrol":
        # allows monitoring client to remotely kill processes
        if self.clienttype != "umlmonitoring":
            logger.error("umlcontrol called by non-monitoring client")
            return
        logger.info("umlcontrol %s" % ([parsed_data]))
        subcommand = parsed_data.get("subcommand")
        if subcommand == "killscraper":
            scrapername = parsed_data["scrapername"]
            for eoos in self.factory.guidclientmap.values():  # would be better if it was by scrapername instead of guid
                if eoos.scrapername == scrapername:
                    for usereditor in eoos.usereditormap.values():
                        for uclient in usereditor.userclients:
                            if uclient.processrunning:
                                logger.info("umlcontrol killing run on client# %d %s" % (uclient.clientnumber, scrapername))
                                uclient.kill_run()
        if subcommand == "killallscheduled":
            for client in self.factory.scheduledrunners.values():
                if client.processrunning:
                    logger.info("umlcontrol killing run on client# %d %s" % (client.clientnumber, client.scrapername))
                    client.kill_run()
                else:
                    logger.info("umlcontrol client# %d %s wasn't running" % (client.clientnumber, client.scrapername))
        if "maxscheduledscrapers" in parsed_data:
            self.factory.maxscheduledscrapers = parsed_data["maxscheduledscrapers"]
        self.factory.notifyMonitoringClients(None)

    elif command == "kill":
        if self.processrunning:
            self.kill_run()
        # allows the killing of a process in another open window by same user
        elif self.username and self.guid:
            usereditor = self.guidclienteditors.usereditormap[self.username]
            for client in usereditor.userclients:
                if client.processrunning:
                    client.kill_run()

    elif command == 'chat':
        line = json.dumps({'message_type':'chat', 'chatname':self.chatname, 'message':parsed_data.get('text'), 'nowtime':jstime(datetime.datetime.now()) })
        self.writeall(line)

    elif command == 'requesteditcontrol':
        for usereditor in self.guidclienteditors.usereditormap.values():
            for client in usereditor.userclients:
                if client.automode == 'autosave':
                    client.writejson({'message_type':'requestededitcontrol', "username":self.username})

    elif command == 'giveselrange':
        self.writeall(None, json.dumps({'message_type':'giveselrange', 'selrange':parsed_data.get('selrange'), 'chatname':self.chatname }))

    elif command == 'automode':
        automode = parsed_data.get('automode')
        if automode == self.automode:
            return
        if not self.username:
            self.automode = automode
            self.factory.notifyMonitoringClients(self)
            return
        usereditor = self.guidclienteditors.usereditormap[self.username]
        # self-demote to autoload mode while choosing to promote a particular person to editing mode
        if automode == 'autoload':
            selectednexteditor = parsed_data.get('selectednexteditor')
            if selectednexteditor and selectednexteditor in self.guidclienteditors.usereditormap:
                assert self.guidclienteditors.usereditormap[selectednexteditor].usersessionpriority >= usereditor.usersessionpriority
                self.guidclienteditors.usereditormap[selectednexteditor].usersessionpriority = usereditor.usersessionpriority
            usereditor.usersessionpriority = self.guidclienteditors.usersessionprioritynext
            self.guidclienteditors.usersessionprioritynext += 1
        self.automode = automode
        self.guidclienteditors.notifyEditorClients("")
        self.factory.notifyMonitoringClients(self)

    # this message helps kill it better and killing it from the browser end
    elif command == 'loseconnection':
        # Suspect it is possible in some cases that the client sends this command, and before
        # we have had a chance to close the connection from here, the client has already gone.
        # To cover this case let's handle the exception here and log that loseConnection failed
        try:
            self.transport.loseConnection()
        except:
            logger.debug('Closing connection on already closed connection failed')
def connectionLost(self, reason):
    """Tidy up when the client socket drops: kill any process still
    running for an editing client, then deregister from the factory.
    """
    is_editing = (self.clienttype == "editing")
    if is_editing and self.processrunning:
        self.kill_run(reason='connection lost')
    if is_editing:
        logger.info("connection lost: %s %s client# %d" % (self.cchatname, self.scrapername, self.clientnumber))
    self.factory.clientConnectionLost(self)