def MakeRunner(scrapername, guid, language, urlquery, username, code, client, beta_user, attachables, rev, bmakerunobject, agent, scheduled):
    """Create a spawnRunner for this client's code and start connecting to a
    controller node.

    The returned runner carries the job description (jdata); the actual
    /Execute request is sent by gotcontrollerconnectionprotocol once the
    TCP connection callback fires.
    """
    runner = spawnRunner(client, code)  # reuse this class and its functions

    # invent the runid (should actually
    runid = '%.6f_%s' % (time.time(), uuid.uuid4())
    if not scrapername:
        runid = "draft|||%s" % runid

    jdata = {
        "code": code.replace('\r', ''),
        "cpulimit": 80,  # XXX I don't think this is used at all - FAI
        "draft": (not scrapername),
        # username comes through when done with stimulate_run, and we can use this
        # for the dataproxy permissions (whether it can add to the attachables list)
        "username": username,
        "language": language,
        "scraperid": guid,
        "urlquery": urlquery,
        "scrapername": scrapername,
        "beta_user": beta_user,
        "attachables": attachables,
        "rev": rev,
        "runid": runid,
    }
    runner.jdata = jdata

    if bmakerunobject:
        runner.runobjectmaker = ScheduledRunMessageLoopHandler(client, username, agent, runid, rev)

    logger.debug("Making run object on %s client# %d" % (scrapername, client.clientnumber))

    target = 'scheduled' if scheduled else 'live'
    nodename, nodehost, nodeport = choose_controller(target)
    logger.debug('Running code on %s:%s (%s/%s)' % (nodehost, str(nodeport), nodename, target))

    d = clientcreator.connectTCP(nodehost, nodeport)
    d.addCallbacks(runner.gotcontrollerconnectionprotocol, runner.controllerconnectionrequestFailure)
    return runner
def outReceived(self, data): logger.debug("spawnrunner received for client# %d %s" % (self.client.clientnumber, data[:180])) # although the client can parse the records itself, it is necessary to split them up here correctly so that this code can insert its own records into the stream. lines = [ ] spldata = data.split("\n") self.lbuffer.append(spldata.pop(0)) while spldata: lines.append("".join(self.lbuffer)) self.lbuffer = [ spldata.pop(0) ] # next one in for line in lines: # strip out the httpheaders that come back at the start of a node connection if not self.httpheadersdone: if re.match("HTTP/", line): assert not self.httpheaders continue if line == "\r": self.httpheadersdone = True continue mheader = re.match("(.*?):\s*(.*)\r", line) if not mheader: logger.error("Bad header: "+str([line])) else: self.httpheaders.append((mheader.group(1), mheader.group(2))) continue logger.info("Received and will write: "+str([line])) self.client.writeall(line) if self.runobjectmaker: self.runobjectmaker.receiveline(line)
def outReceived(self, data): logger.debug("spawnrunner received for client# %d %s" % (self.client.clientnumber, data[:180])) # although the client can parse the records itself, it is necessary to split them up here correctly so that this code can insert its own records into the stream. lines = [ ] spldata = data.split("\n") self.lbuffer.append(spldata.pop(0)) while spldata: lines.append("".join(self.lbuffer)) self.lbuffer = [ spldata.pop(0) ] # next one in for line in lines: # strip out the httpheaders that come back at the start of a node connection if not self.httpheadersdone: if re.match("HTTP/", line): assert not self.httpheaders continue if line == "\r": self.httpheadersdone = True continue mheader = re.match("(.*?):\s*(.*)\r", line) if not mheader: logger.error("Bad header: "+str([line])) else: self.httpheaders.append((mheader.group(1), mheader.group(2))) continue logger.debug("Received and will write: "+str([line])) self.client.writeall(line) if self.runobjectmaker: self.runobjectmaker.receiveline(line)
def connectionLost(self, reason):
    """Fire the completion deferred once the updaterunobject response ends.

    ResponseDone (and PotentialDataLoss, for HTTP/1.0-style closes) are
    treated as a normal finish; any other close reason is logged as a
    failure together with whatever body was buffered.
    """
    if reason.type not in [ResponseDone, PotentialDataLoss]:
        logger.warning("nope "+str([reason.getErrorMessage(), reason.type, self.rbuffer]))
    else:
        # sometimes need to print huge amount out to read the html formatted error
        #logger.debug("updaterunobject response: %s"% str("".join(self.rbuffer)[:50000]))
        body = "".join(self.rbuffer)
        logger.debug("updaterunobject response: %s"% str([body[:2000]]))
    self.finished.callback(None)
def receiveline(self, line):
    """Validate one line as a JSON object; silently drop anything else.

    NOTE(review): as visible here the method does nothing further with a
    successfully parsed record -- presumably a stub or partial copy of the
    fuller receiveline elsewhere in the file; confirm before extending.
    """
    try:
        parsed = json.loads(line)
        if not isinstance(parsed, dict):
            raise TypeError('Incorrect type of JSON')
    except Exception as e:
        logger.debug( "Failed to json.loads() %s %s" % (str([line]), str(e)))
        return
def kill_run(self, reason=''):
    """Signal the attached script process to stop.

    Broadcasts a killsignal execution-status record to the clients, then
    either drops the controller connection (which triggers the normal
    cleanup path) or, if no connection was ever established, ends the
    process record directly.
    """
    msg = 'Script cancelled'
    if reason:
        msg = "%s (%s)" % (msg, reason)
    notice = json.dumps({'message_type':'executionstatus', 'content':'killsignal', 'message':msg})
    self.writeall(notice)
    logger.debug(msg)

    conn = self.processrunning.controllerconnection
    if conn:
        # possible for process to have been made, but connection to never have been got
        conn.transport.loseConnection()  # this will cause it to clean up
    else:
        self.processrunning.processEnded("no connection had ever been made")
def clientConnectionLost(self, client):
    """Deregister a departed client from every factory-level list.

    Which list the client lives in depends on its clienttype (set when the
    connection identified itself); editing clients additionally live in the
    per-scraper guidclientmap aggregation, whose remaining members are told
    about the departure.
    """
    if client in self.clients:
        self.clients.remove(client)  # main list
    logger.debug("removing %s client# %d" % (client.clienttype, client.clientnumber))

    # connection open but nothing else happened
    if client.clienttype == None:
        if client in self.connectedclients:
            self.connectedclients.remove(client)

    elif client.clienttype == "stimulate_run":
        if client in self.stimulate_runclients:
            self.stimulate_runclients.remove(client)
        else:
            logger.error("No place to remove stimulate_run client# %d" % client.clientnumber)

    elif client.clienttype == "httpget":
        if client in self.httpgetclients:
            self.httpgetclients.remove(client)
        else:
            logger.error("No place to remove httpget client# %d" % client.clientnumber)

    elif client.clienttype == "umlmonitoring":
        if client in self.umlmonitoringclients:
            self.umlmonitoringclients.remove(client)
        else:
            logger.error("No place to remove umlmonitoring client# %d" % client.clientnumber)

    elif client.clienttype == "rpcrunning":
        if client in self.rpcrunningclients:
            self.rpcrunningclients.remove(client)
        else:
            logger.error("No place to remove rpcrunning client %d" % client.clientnumber)

    elif not client.guid:
        # no guid: a draft (never-saved) scraper session
        if client in self.draftscraperclients:
            self.draftscraperclients.remove(client)
        else:
            logger.error("No place to remove draftscraper client %d" % client.clientnumber)

    elif (client.guid in self.guidclientmap):
        # RemoveClient presumably returns a false value when this was the
        # aggregation's last client, in which case the aggregation itself is
        # dropped -- TODO confirm against the aggregation class's RemoveClient
        if not self.guidclientmap[client.guid].RemoveClient(client):
            del self.guidclientmap[client.guid]
        else:
            # same user may still have other windows open on this scraper
            if client.username in self.guidclientmap[client.guid].usereditormap:
                message = "%s closes a window" % client.chatname
            else:
                message = "%s leaves" % client.chatname
            self.guidclientmap[client.guid].notifyEditorClients(message)

    else:
        logger.error("No place to remove client %d" % client.clientnumber)

    self.notifyMonitoringClients(client)
def endingrun(self, sreason):
    """Clean up after the run process has ended.

    Clears the controller-connection reference, unhooks the client/process
    back-links set up by MakeRunner, tells the clients the run finished,
    and lets any run-object maker and the factory perform their own
    completion bookkeeping.

    Bug fix: the condition on controllerconnection was inverted -- it
    logged "no controllerconnection had been made" precisely when a
    connection DID exist, and only reset the reference when it was already
    empty.  Now the reference is cleared when present and the missing-
    connection case is the one that gets logged.
    """
    if self.controllerconnection:
        self.controllerconnection = None
    else:
        # consider case where connection hadn't been made yet
        logger.debug("endingrun called when no controllerconnection had been made")

    self.client.processrunning = None  # remove back connection made as return value of MakeRunner
    del self.client.factory.runidclientmap[self.jdata["runid"]]

    # other errors (eg connection lost) could put more useful errors into the client
    logger.debug("run process %s ended client# %d %s" % (self.client.clienttype, self.client.clientnumber, sreason))
    self.client.writeall(json.dumps({'message_type':'executionstatus', 'content':'runfinished', 'contentextra':sreason}))

    if self.runobjectmaker:
        self.runobjectmaker.schedulecompleted()

    if self.client.clienttype == "editing":
        self.client.factory.notifyMonitoringClients(self.client)
    elif self.client.clienttype == "scheduledrun":
        self.client.factory.scheduledruncomplete(self.client)
def gotcontrollerconnectionprotocol(self, controllerconnection):
    """Controller connection established: announce the run to the editor
    clients, then POST the job description (code plus metadata) to the
    controller over the raw transport."""
    controllerconnection.srunner = self
    self.controllerconnection = controllerconnection

    # generate the header element that was generated by dispatcher,
    # which should in future be made by the node-controller
    startmsg = {
        'message_type': 'executionstatus',
        'content': 'startingrun',
        'runID': self.jdata["runid"],
        'uml': "Direct to controller",
        'rev': self.jdata["rev"],
        'chatname': self.client.chatname,
        'nowtime': jstime(datetime.datetime.now()),
    }
    self.client.writeall(json.dumps(startmsg))

    # send the data into the controller including all the code it should run
    sdata = json.dumps(self.jdata)
    logger.debug("sending: %s" % str([sdata[:1000]]))

    request_head = [
        'POST /Execute HTTP/1.0\r\n',
        'Content-Length: %s\r\n' % len(sdata),
        'Content-Type: text/json\r\n',  # text/json???
        'Connection: close\r\n',
        "\r\n",
    ]
    transport = controllerconnection.transport
    for headerline in request_head:
        transport.write(headerline)
    transport.write(sdata)
def connectionMade(self):
    """Log that the connection came up; nothing else to do here."""
    logger.debug("Starting run ")
class ScheduledRunMessageLoopHandler:
    """Accumulates the JSON record stream from a scheduled (non-interactive)
    run and periodically POSTs a summary of it to the django application
    via the twistermakesrunevent endpoint.
    """

    def __init__(self, client, username, agent, runid, rev):
        # a partial implementation of editor.js
        self.exceptionmessage = [ ]
        self.completiondata = None
        self.outputmessage = [ ]
        self.domainscrapes = { }  # domain: { "pages_scraped":0, "bytes_scraped":0 }
        self.discarded_lines = 0
        self.discarded_characters = 0
        self.output = ""
        self.run_ended = None

        # upost is the form payload POSTed back to django on each update
        self.upost = {"django_key":djangokey}
        self.upost["pages_scraped"] = 0
        self.upost["records_produced"] = 0
        self.upost["scrapername"] = client.scrapername
        self.upost["clientnumber"] = client.clientnumber
        self.upost['runID'] = runid
        self.upost['run_id'] = runid
        self.upost['revision'] = rev
        self.username = username
        self.agent = agent

    def updaterunobjectFailure(self, failure):
        # could check for the type and use the retry functionality that exists in twister
        logger.info("requestoverduescrapers failure received "+str(failure))

    def updaterunobjectResponse(self, response):
        # stream the HTTP response body into a receiver; the returned
        # deferred fires when the body is complete
        finished = Deferred()
        response.deliverBody(updaterunobjectReceiver(finished))
        return finished

    def updaterunobject(self, bfinished):
        """POST the current run state to django; bfinished marks the final update."""
        url = urlparse.urljoin(djangourl, 'scraper-admin/twistermakesrunevent/')
        logger.info("attempting to post: "+url)
        self.upost["output"] = self.output
        if bfinished:
            self.upost["exitstatus"] = self.exceptionmessage and 'exceptionmessage' or 'done'
            self.upost["domainscrapes"] = json.dumps(self.domainscrapes)
            if self.exceptionmessage:
                self.upost['exceptionmessage'] = self.exceptionmessage
        # urllib.urlencode applies str() to each value in the list, which is dumb.
        # to get a proper error, print some chinese characters
        # need to get an explanation for this design of urlencode
        lupost = self.upost.copy()
        for key in lupost:
            if type(lupost[key]) == unicode:
                lupost[key] = lupost[key].encode("utf8")
        ulupost = urllib.urlencode(lupost)
        logger.info(self.upost)
        d = self.agent.request('POST', url, Headers(), StringProducer(ulupost))
        d.addCallbacks(self.updaterunobjectResponse, self.updaterunobjectFailure)

    def receiveline(self, line):
        """Fold one JSON record from the runner into the accumulated run state."""
        try:
            data = json.loads(line)
            if not isinstance(data, dict):
                raise TypeError('Incorrect type of JSON')
        except Exception, e:
            logger.debug( "Failed to json.loads() %s %s" % (str([line]), str(e)))
            return

        message_type = data.get('message_type')
        content = data.get("content")
        if message_type == 'executionstatus':
            if content == "startingrun":
                self.output = "%sEXECUTIONSTATUS: uml=%s usename=%s runid=%s\n" % (self.output, data.get("uml"), self.username, data.get("runID"))
                self.updaterunobject(False)

            # generated by scriptmanager executor.js
            elif content == "runcompleted":
                logger.debug( "Got run completed : %s" % (line,) )
                self.completiondata = data
                # build a human-readable completion summary from whichever
                # fields the executor supplied
                self.completionmessage = '';
                if data.get('elapsed_seconds'):
                    self.completionmessage += str(data.get("elapsed_seconds")) + " seconds elapsed, "
                if data.get("CPU_seconds", False):  # Until we can get CPU used
                    self.completionmessage += str(data.get("CPU_seconds")) + " CPU seconds used";
                if "exit_status" in data and data.get("exit_status") != 0:
                    self.completionmessage += ", exit status " + str(data.get("exit_status"));
                if "term_sig_text" in data:
                    self.completionmessage += ", terminated by " + data.get("term_sig_text");
                elif "term_sig" in data:
                    self.completionmessage += ", terminated by signal " + str(data.get("term_sig"));
                logger.debug( "Completion status : %s" % (line,) )

            # generated by twister after completion
            elif content == "runfinished":
                # run object is updated following the schedulecompleted() function call
                pass

            else:
                logger.warning("Unrecognized message: %s %s" % (message_type, content))

        elif message_type == "sources":
            self.upost["pages_scraped"] += 1  # soon to be deprecated
            url = data.get('url')
            netloc = "%s://%s" % urlparse.urlparse(url)[:2]
            # record the first externally-fetched URL (skip own-platform and robots.txt fetches)
            if "first_url_scraped" not in self.upost and url and netloc[-16:] != '.scraperwiki.com' and url[-10:] != 'robots.txt':
                self.upost["first_url_scraped"] = data.get('url')
            if netloc:
                if netloc not in self.domainscrapes:
                    self.domainscrapes[netloc] = { "pages_scraped":0, "bytes_scraped":0 }
                self.domainscrapes[netloc]["pages_scraped"] += 1
                self.domainscrapes[netloc]["bytes_scraped"] += int(data.get('bytes'))

        elif message_type == "data":
            self.upost["records_produced"] += 1
        elif message_type == "sqlitecall":
            if data.get('insert'):
                self.upost["records_produced"] += 1

        # only one of these ever
        elif message_type == "exception":
            self.upost["exception_message"] = data.get('exceptiondescription')
            for stackentry in data.get("stackdump"):
                sMessage = stackentry.get('file')
                if sMessage:
                    if sMessage == "<string>":
                        sMessage = "Line %d: %s" % (stackentry.get('linenumber', -1), stackentry.get('linetext'))
                    if stackentry.get('furtherlinetext'):
                        sMessage += " -- " + stackentry.get('furtherlinetext')
                    self.exceptionmessage.append(sMessage)
                if stackentry.get('duplicates') and stackentry.get('duplicates') > 1:
                    self.exceptionmessage.append(" + %d duplicates" % stackentry.get('duplicates'))
            if data.get("blockedurl"):
                self.exceptionmessage.append("Blocked URL: %s" % data.get("blockedurl"))
            self.exceptionmessage.append('')
            self.exceptionmessage.append(data.get('exceptiondescription'))

        elif message_type == "console":
            # chop console output into bounded chunks before buffering
            while content:
                self.outputmessage.append(content[:APPROXLENOUTPUTLIMIT])
                content = content[APPROXLENOUTPUTLIMIT:]

        elif message_type not in ["editorstatus", "saved", "chat"]:
            logger.info("Unknown record type: %s\n" % str([line[:2000]]))

        # live update of event output so we can watch it when debugging scraperwiki platform
        # reduce pressure on the server by only updating when we over-run the buffer
        if self.outputmessage and len(self.output) < APPROXLENOUTPUTLIMIT:
            while self.outputmessage:
                self.output = "%s%s" % (self.output, self.outputmessage.pop(0))
                if len(self.output) >= APPROXLENOUTPUTLIMIT:
                    self.output = "%s%s" % (self.output, temptailmessage)
                    self.updaterunobject(False)
                    break

        self.run_ended = datetime.datetime.now()
        #self.updaterunobject(False)
        # keep only the last TAIL_LINES buffered chunks, counting what we drop
        while len(self.outputmessage) >= TAIL_LINES:
            discarded = self.outputmessage.pop(0)
            self.discarded_lines += 1
            self.discarded_characters += len(discarded)
def scheduledruncomplete(self, sclient):
    """Drop a finished scheduled-run client from the factory's tables."""
    logger.debug("scheduledruncomplete %d" % sclient.clientnumber)
    # there is no real socket behind this client, so connectionLost never
    # fires on its own; invoke the deregistration bookkeeping directly
    self.clientConnectionLost(sclient)
    scrapername = sclient.scrapername
    del self.scheduledrunners[scrapername]
def clientcommand(self, command, parsed_data):
    """Dispatch one parsed JSON command arriving on this client connection.

    Commands cover the editor protocol (connection_open, saved, typing,
    chat, automode, ...), run control (run, stimulate_run, rpcrun, kill,
    umlcontrol) and teardown (loseconnection).
    """
    if command != 'typing':
        logger.debug("command %s client# %d" % (command, self.clientnumber))

    # update the lasttouch values on associated aggregations
    if command != 'automode' and self.clienttype == "editing":
        self.clientlasttouch = datetime.datetime.now()
        if self.guid and self.username:
            assert self.username in self.guidclienteditors.usereditormap
            self.guidclienteditors.usereditormap[self.username].userlasttouch = self.clientlasttouch
            self.guidclienteditors.scraperlasttouch = self.clientlasttouch

    # data uploaded when a new connection is made from the editor
    if command == 'connection_open':
        self.lconnectionopen(parsed_data)

    # finds the corresponding client and presses the run button on it
    # receives a single record through the pipeline
    elif command == 'stimulate_run':
        self.clienttype = "stimulate_run"
        self.factory.clientConnectionRegistered(self)

        scrapername = parsed_data["scrapername"]
        guid = parsed_data["guid"]
        assert guid
        username = parsed_data["username"]
        clientnumber = parsed_data["clientnumber"]

        # locate the matching editing client by guid/username/clientnumber
        client = None
        eoos = self.factory.guidclientmap.get(guid)
        if eoos:
            usereditor = eoos.usereditormap.get(username)
            if usereditor:
                for lclient in usereditor.userclients:
                    if lclient.clientnumber == clientnumber:
                        client = lclient

        # shared-secret check; on mismatch notify the editing client and bail
        if parsed_data.get('django_key') != djangokey:
            logger.error("djangokey_mismatch")
            self.writejson({'status':'twister djangokey mismatch'})
            if client:
                client.writejson({"message_type":"console", "content":"twister djangokey mismatch"})
                client.writejson({'message_type':'executionstatus', 'content':'runfinished'})
            client = None

        if client:
            logger.info("stimulate on : %s %s client# %d" % (client.cchatname, client.scrapername, client.clientnumber))
            assert client.clienttype == "editing" and client.guid
            if not client.processrunning:
                client.runcode(parsed_data)
                self.writejson({"status":"run started"})
            else:
                client.writejson({"message_type":"console", "content":"client already running"})
                self.writejson({"status":"client already running"})
        else:
            parsed_data.pop("code", None)  # shorten the log message
            logger.warning("client not found %s" % parsed_data)
            self.writejson({"status":"client not found"})
        # stimulate_run is a one-shot request; close once the status is written
        self.transport.loseConnection()

    elif command == 'rpcrun':
        self.username = parsed_data.get('username', '')
        self.userrealname = parsed_data.get('userrealname', self.username)
        self.scrapername = parsed_data.get('scrapername', '')
        self.scraperlanguage = parsed_data.get('language', '')
        self.guid = parsed_data.get("guid", '')
        if parsed_data.get('django_key') == djangokey:
            self.clienttype = "rpcrunning"
            logger.info("connection open %s: %s %s client# %d" % (self.clienttype, self.username, self.scrapername, self.clientnumber))
            self.factory.clientConnectionRegistered(self)
            self.runcode(parsed_data)
            # termination is by the calling function when it receives an executionstatus runfinished message
        else:
            logger.error("djangokey_mismatch")
            self.writejson({'status':'twister djangokey mismatch'})
            self.transport.loseConnection()

    elif command == 'saved':
        line = json.dumps({'message_type' : "saved", 'chatname' : self.chatname})
        otherline = json.dumps({'message_type' : "othersaved", 'chatname' : self.chatname})
        self.guidclienteditors.rev = parsed_data["rev"]
        self.guidclienteditors.chainpatchnumber = 0
        self.writeall(line, otherline)
        self.factory.notifyMonitoringClientsSmallmessage(self, "savenote")

    # should record the rev and chainpatchnumber so when we join to this scraper we know
    elif command == 'typing':
        logger.debug("command %s client# %d insertlinenumber %s" % (command, self.clientnumber, parsed_data.get("insertlinenumber")))
        jline = {'message_type' : "typing", 'content' : "%s typing" % self.chatname}
        # everyone else gets the full patch payload under "othertyping"
        jotherline = parsed_data.copy()
        jotherline.pop("command")
        jotherline["message_type"] = "othertyping"
        jotherline["content"] = jline["content"]
        self.guidclienteditors.chainpatchnumber = parsed_data.get("chainpatchnumber")
        self.writeall(json.dumps(jline), json.dumps(jotherline))
        self.factory.notifyMonitoringClientsSmallmessage(self, "typingnote")

    # this one only applies to draft scrapers when you click run
    elif command == 'run':
        if self.processrunning:
            self.writejson({'content':"Already running! (shouldn't happen)", 'message_type':'console'});
            return
        if self.username:
            if self.automode == 'autoload':
                self.writejson({'content':"Not supposed to run! "+self.automode, 'message_type':'console'});
                return
        if parsed_data.get('guid'):
            self.writejson({'content':"scraper run can only be done through stimulate_run method", 'message_type':'console'});
            return
        logger.info("about to run code %s" % str(parsed_data)[:100])
        self.runcode(parsed_data)

    elif command == "umlcontrol":
        # allows monitoring client to remotely kill processes
        if self.clienttype != "umlmonitoring":
            logger.error("umlcontrol called by non-monitoring client")
            return
        logger.info("umlcontrol %s" % ([parsed_data]))
        subcommand = parsed_data.get("subcommand")
        if subcommand == "killscraper":
            scrapername = parsed_data["scrapername"]
            for eoos in self.factory.guidclientmap.values():  # would be better if it was by scrapername instead of guid
                if eoos.scrapername == scrapername:
                    for usereditor in eoos.usereditormap.values():
                        for uclient in usereditor.userclients:
                            if uclient.processrunning:
                                logger.info("umlcontrol killing run on client# %d %s" % (uclient.clientnumber, scrapername))
                                uclient.kill_run()
        if subcommand == "killallscheduled":
            for client in self.factory.scheduledrunners.values():
                if client.processrunning:
                    logger.info("umlcontrol killing run on client# %d %s" % (client.clientnumber, client.scrapername))
                    client.kill_run()
                else:
                    logger.info("umlcontrol client# %d %s wasn't running" % (client.clientnumber, client.scrapername))
        if "maxscheduledscrapers" in parsed_data:
            self.factory.maxscheduledscrapers = parsed_data["maxscheduledscrapers"]
        self.factory.notifyMonitoringClients(None)

    elif command == "kill":
        if self.processrunning:
            self.kill_run()
        # allows the killing of a process in another open window by same user
        elif self.username and self.guid:
            usereditor = self.guidclienteditors.usereditormap[self.username]
            for client in usereditor.userclients:
                if client.processrunning:
                    client.kill_run()

    elif command == 'chat':
        line = json.dumps({'message_type':'chat', 'chatname':self.chatname, 'message':parsed_data.get('text'), 'nowtime':jstime(datetime.datetime.now()) })
        self.writeall(line)

    elif command == 'requesteditcontrol':
        # ask whoever currently holds edit control (autosave mode) to yield
        for usereditor in self.guidclienteditors.usereditormap.values():
            for client in usereditor.userclients:
                if client.automode == 'autosave':
                    client.writejson({'message_type':'requestededitcontrol', "username":self.username})

    elif command == 'giveselrange':
        self.writeall(None, json.dumps({'message_type':'giveselrange', 'selrange':parsed_data.get('selrange'), 'chatname':self.chatname }))

    elif command == 'automode':
        automode = parsed_data.get('automode')
        if automode == self.automode:
            return

        if not self.username:
            self.automode = automode
            self.factory.notifyMonitoringClients(self)
            return

        usereditor = self.guidclienteditors.usereditormap[self.username]

        # self-demote to autoload mode while choosing to promote a particular person to editing mode
        if automode == 'autoload':
            selectednexteditor = parsed_data.get('selectednexteditor')
            if selectednexteditor and selectednexteditor in self.guidclienteditors.usereditormap:
                assert self.guidclienteditors.usereditormap[selectednexteditor].usersessionpriority >= usereditor.usersessionpriority
                self.guidclienteditors.usereditormap[selectednexteditor].usersessionpriority = usereditor.usersessionpriority
            # push self to the back of the session-priority queue
            usereditor.usersessionpriority = self.guidclienteditors.usersessionprioritynext
            self.guidclienteditors.usersessionprioritynext += 1

        self.automode = automode
        self.guidclienteditors.notifyEditorClients("")
        self.factory.notifyMonitoringClients(self)

    # this message helps kill it better and killing it from the browser end
    elif command == 'loseconnection':
        # Suspect it is possible in some cases that the client sends this command, and before
        # we have had a chance to close the connection from here, the client has already gone.
        # To cover this case let's handle the exception here and log that loseConnection failed
        try:
            self.transport.loseConnection()
        except:
            logger.debug('Closing connection on already closed connection failed')