class TopicChange:
    def __init__(self, dataFile, userJoins, activeForums):
        sys.stderr.write("Started\n")
        self.dataHandler = DataHandler(dataFile, userJoins)
        self.dataHandler.loadActiveForums(activeForums)
        sys.stderr.write("Data loaded\n")
        self.post2Month = self.dataHandler.getPost2Month()
        self.doc2Post = self.dataHandler.getDoc2Post()
        self.post2User = self.dataHandler.getPost2User()
        sys.stderr.write("Got the dicts\n")

    def loadInferredTopics(self, topicsOutput):
        userMonth = dd(lambda: dd(int))
        numUsers = set()
        csvReader = csv.reader(open(topicsOutput))
        for doc in csvReader:
            # if len(doc) < 21:
            #     continue
            docId = doc[0]
            # topic5Num = doc[5]
            # topic19Num = doc[19]
            userId = self.post2User[self.doc2Post[docId]]
            # month = self.post2Month[self.doc2Post[docId]]
            # userMonth[userId][month] += topic5Num
            numUsers.add(userId)
        # for user in userMonth.iterkeys():
        #     for month in userMonth[user].iterkeys():
        #         print user, month, userMonth[user][month]
        print len(numUsers)
def __init__(self, dataFile, userJoins):
    sys.stderr.write('In Constructor\n')
    self.distComparer = DistComparer()
    self.dataHandler = DataHandler(dataFile, userJoins)
    self.dataHandler.loadActiveForums()
    self.__loadData()
    self.sampledUsers = set()
def __init__(self, dataFile, userJoins, activeForums):
    sys.stderr.write("Started\n")
    self.dataHandler = DataHandler(dataFile, userJoins)
    self.dataHandler.loadActiveForums(activeForums)
    sys.stderr.write("Data loaded\n")
    self.post2Month = self.dataHandler.getPost2Month()
    self.doc2Post = self.dataHandler.getDoc2Post()
    self.post2User = self.dataHandler.getPost2User()
    sys.stderr.write("Got the dicts\n")
def __init__(self, outDir):
    self.config = None
    self.__csHash = set()
    self.__outputDir = outDir
    self.__csInstance = CSHandler()
    self.__dataHandler = DataHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()
    self.__fileSuffix = ""
    self.prepareConfig()
def __init__(self, url, dbFile, outputFile, maxCount=None):
    self.url = url  # url to be crawled
    if maxCount is None:
        self.maxCount = -1
    else:
        '''
        maxCount is the maximum number of links to be fetched by the crawler.
        It is incremented by one to accommodate the initial user input: the link
        entered by the user is also persisted in the repository, so it counts
        towards the total. For example, if the user asks to crawl python.org and
        fetch 2 links, the program should terminate when there are 3 links in the
        repository, since python.org itself is one of them.
        '''
        self.maxCount = maxCount + 1
    self.extracter = LinkExtracter()
    self.dataHandler = DataHandler(self.maxCount, dbFile, outputFile)
    self.log = CrawlerLogger.getlogger()
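# Hypothetical usage sketch for the constructor above. The class name "Crawler" and
# the db/output file names are assumptions for illustration; only the url and the
# count behaviour come from the snippet itself:
# crawler = Crawler("http://python.org", "links.db", "links.txt", maxCount=2)
# The repository then stops growing at 3 links: python.org plus the 2 fetched links.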
def update_job_priorites(job_priorites):
    dataHandler = DataHandler()
    success = dataHandler.update_job_priority(job_priorites)
    dataHandler.Close()
    return success
def update_job(job_id, field, value):
    dataHandler = DataHandler()
    dataHandler.UpdateJobTextField(job_id, field, value)
    dataHandler.Close()
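# Illustrative calls for the two helpers above. The job id and the priority mapping
# are made up; the "jobStatus"/"finished" field and value mirror how
# UpdateJobTextField is used elsewhere in this listing:
# update_job_priorites({"job-123": 5})
# update_job("job-123", "jobStatus", "finished")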
def SubmitJob(jobParamsJsonStr): ret = {} jobParams = LoadJobParams(jobParamsJsonStr) if "jobName" not in jobParams or len(jobParams["jobName"].strip()) == 0: ret["error"] = "ERROR: Job name cannot be empty" return ret if "vcName" not in jobParams or len(jobParams["vcName"].strip()) == 0: ret["error"] = "ERROR: VC name cannot be empty" return ret if "preemptionAllowed" not in jobParams: jobParams["preemptionAllowed"] = False else: jobParams["preemptionAllowed"] = ToBool(jobParams["preemptionAllowed"]) if "jobId" not in jobParams or jobParams["jobId"] == "": #jobParams["jobId"] = jobParams["jobName"] + "-" + str(uuid.uuid4()) #jobParams["jobId"] = jobParams["jobName"] + "-" + str(time.time()) jobParams["jobId"] = str(uuid.uuid4()) #jobParams["jobId"] = jobParams["jobId"].replace("_","-").replace(".","-") if "resourcegpu" not in jobParams: jobParams["resourcegpu"] = 0 if isinstance(jobParams["resourcegpu"], basestring): if len(jobParams["resourcegpu"].strip()) == 0: jobParams["resourcegpu"] = 0 else: jobParams["resourcegpu"] = int(jobParams["resourcegpu"]) if "familyToken" not in jobParams or jobParams["familyToken"].isspace(): jobParams["familyToken"] = str(uuid.uuid4()) if "isParent" not in jobParams: jobParams["isParent"] = 1 userName = jobParams["userName"] if "@" in userName: userName = userName.split("@")[0].strip() if "/" in userName: userName = userName.split("/")[1].strip() if not AuthorizationManager.HasAccess( jobParams["userName"], ResourceType.VC, jobParams["vcName"].strip(), Permission.User): ret["error"] = "Access Denied!" return ret if "cmd" not in jobParams: jobParams["cmd"] = "" if "jobPath" in jobParams and len(jobParams["jobPath"].strip()) > 0: jobPath = jobParams["jobPath"] if ".." in jobParams["jobPath"]: ret["error"] = "ERROR: '..' cannot be used in job directory" return ret if "\\." in jobParams["jobPath"]: ret["error"] = "ERROR: invalided job directory" return ret if jobParams["jobPath"].startswith( "/") or jobParams["jobPath"].startswith("\\"): ret["error"] = "ERROR: job directory should not start with '/' or '\\' " return ret if not jobParams["jobPath"].startswith(userName): jobParams["jobPath"] = os.path.join(userName, jobParams["jobPath"]) else: jobPath = userName + "/" + "jobs/" + time.strftime( "%y%m%d") + "/" + jobParams["jobId"] jobParams["jobPath"] = jobPath if "workPath" not in jobParams or len(jobParams["workPath"].strip()) == 0: jobParams["workPath"] = "." if ".." in jobParams["workPath"]: ret["error"] = "ERROR: '..' cannot be used in work directory" return ret if "\\." in jobParams["workPath"]: ret["error"] = "ERROR: invalided work directory" return ret if jobParams["workPath"].startswith( "/") or jobParams["workPath"].startswith("\\"): ret["error"] = "ERROR: work directory should not start with '/' or '\\' " return ret if not jobParams["workPath"].startswith(userName): jobParams["workPath"] = os.path.join(userName, jobParams["workPath"]) if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0: jobParams["dataPath"] = "." if ".." in jobParams["dataPath"]: ret["error"] = "ERROR: '..' cannot be used in data directory" return ret if "\\." 
in jobParams["dataPath"]: ret["error"] = "ERROR: invalided data directory" return ret if jobParams["dataPath"][0] == "/" or jobParams["dataPath"][0] == "\\": ret["error"] = "ERROR: data directory should not start with '/' or '\\' " return ret jobParams["dataPath"] = jobParams["dataPath"].replace("\\", "/") jobParams["workPath"] = jobParams["workPath"].replace("\\", "/") jobParams["jobPath"] = jobParams["jobPath"].replace("\\", "/") jobParams["dataPath"] = os.path.realpath( os.path.join("/", jobParams["dataPath"]))[1:] jobParams["workPath"] = os.path.realpath( os.path.join("/", jobParams["workPath"]))[1:] jobParams["jobPath"] = os.path.realpath( os.path.join("/", jobParams["jobPath"]))[1:] dataHandler = DataHandler() if "logDir" in jobParams and len(jobParams["logDir"].strip()) > 0: tensorboardParams = jobParams.copy() # overwrite for distributed job if tensorboardParams["jobtrainingtype"] == "PSDistJob": tensorboardParams["jobtrainingtype"] = "RegularJob" match = re.match('(.*)(/.*)', tensorboardParams["logDir"]) if not match is None: newDir = match.group(1) + "/worker0" + match.group(2) prefix = match.group(1) match2 = re.match('.*/worker0', prefix) if match2 is None: tensorboardParams["logDir"] = newDir #match = re.match('(.*--logdir\s+.*)(/.*--.*)', tensorboardParams["cmd"]) #if not match is None: # tensorboardParams["cmd"] = match.group(1) + "/worker0" + match.group(2) tensorboardParams["jobId"] = str(uuid.uuid4()) tensorboardParams["jobName"] = "tensorboard-" + jobParams["jobName"] tensorboardParams["jobPath"] = jobPath tensorboardParams["jobType"] = "visualization" tensorboardParams["cmd"] = "tensorboard --logdir " + tensorboardParams[ "logDir"] + " --host 0.0.0.0" tensorboardParams["image"] = jobParams["image"] tensorboardParams["resourcegpu"] = 0 tensorboardParams["interactivePort"] = "6006" if "error" not in ret: if not dataHandler.AddJob(tensorboardParams): ret["error"] = "Cannot schedule tensorboard job." if "error" not in ret: if dataHandler.AddJob(jobParams): ret["jobId"] = jobParams["jobId"] else: ret["error"] = "Cannot schedule job. Cannot add job into database." dataHandler.Close() InvalidateJobListCache(jobParams["vcName"]) return ret
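# The realpath(join("/", path))[1:] idiom used above resolves any remaining ".."
# and "." components against a fake root and then strips the leading "/", leaving
# a relative path that cannot escape the storage mount. A small illustration
# (the example path is made up; output assumes no conflicting symlinks exist):
import os.path
print(os.path.realpath(os.path.join("/", "alice/../bob/./data"))[1:])  # typically prints "bob/data"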
#!/usr/bin/env python
# coding=utf-8
import thread, traceback, signal, socket, sys
from urllib import urlopen

from DataHandler import DataHandler
from Client import Client
from NATServer import NATServer
from Dispatcher import Dispatcher

import ip2country  # just to make sure it's downloaded
import ChanServ

_root = DataHandler()
_root.parseArgv(sys.argv)

try:
    signal.SIGHUP

    def sighup(sig, frame):
        _root.console_write('Received SIGHUP.')
        if _root.sighup:
            _root.reload()

    signal.signal(signal.SIGHUP, sighup)
except AttributeError:
    pass

_root.console_write('-'*40)
_root.console_write('Starting uberserver...\n')
from DataHandler import DataHandler
from Client import Client
from NATServer import NATServer
from XmlRpcServer import XmlRpcServer

import ip2country  # just to make sure it's downloaded
import ChanServ
import twistedserver

# uncomment for debugging deadlocks, creates a stacktrace at the given interval to stdout
#import stacktracer
#stacktracer.trace_start("trace.html", interval=5, auto=True)  # Set auto flag to always update file!

_root = DataHandler()
_root.parseArgv(sys.argv)

try:
    signal.SIGHUP

    def sighup(sig, frame):
        _root.console_write('Received SIGHUP.')
        if _root.sighup:
            _root.reload()

    signal.signal(signal.SIGHUP, sighup)
except AttributeError:
    pass

_root.console_write('-'*40)
def UpdateJobStatus(job): dataHandler = DataHandler() jobParams = json.loads(base64.b64decode(job["jobParams"])) if job["jobStatus"] == "scheduling" and jobParams[ "jobtrainingtype"] == "PSDistJob": launch_ps_dist_job(jobParams) jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"], jobParams["workPath"], jobParams["dataPath"]) localJobPath = os.path.join(config["storage-mount-path"], jobPath) logPath = os.path.join(localJobPath, "logs/joblog.txt") result, detail = k8sUtils.GetJobStatus(job["jobId"]) dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail", base64.b64encode(json.dumps(detail))) logging.info("job %s status: %s,%s" % (job["jobId"], result, json.dumps(detail))) jobDescriptionPath = os.path.join( config["storage-mount-path"], job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None if "userId" not in jobParams: jobParams["userId"] = "0" if result.strip() == "Succeeded": joblog_manager.extract_job_log(job["jobId"], logPath, jobParams["userId"]) dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished") if jobDescriptionPath is not None and os.path.isfile( jobDescriptionPath): k8sUtils.kubectl_delete(jobDescriptionPath) elif result.strip() == "Running": if job["jobStatus"] != "running": dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "running") if "interactivePort" in jobParams: serviceAddress = k8sUtils.GetServiceAddress(job["jobId"]) serviceAddress = base64.b64encode(json.dumps(serviceAddress)) dataHandler.UpdateJobTextField(job["jobId"], "endpoints", serviceAddress) elif result.strip() == "Failed": printlog("Job %s fails, cleaning..." % job["jobId"]) joblog_manager.extract_job_log(job["jobId"], logPath, jobParams["userId"]) dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed") dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail) if jobDescriptionPath is not None and os.path.isfile( jobDescriptionPath): k8sUtils.kubectl_delete(jobDescriptionPath) elif result.strip() == "Unknown": if job["jobId"] not in UnusualJobs: UnusualJobs[job["jobId"]] = datetime.datetime.now() elif (datetime.datetime.now() - UnusualJobs[job["jobId"]]).seconds > 300: del UnusualJobs[job["jobId"]] retries = dataHandler.AddandGetJobRetries(job["jobId"]) if retries >= 5: printlog("Job %s fails for more than 5 times, abort" % job["jobId"]) dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error") dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", "cannot launch the job.") if jobDescriptionPath is not None and os.path.isfile( jobDescriptionPath): k8sUtils.kubectl_delete(jobDescriptionPath) else: printlog( "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d" % (job["jobId"], retries)) SubmitJob(job) elif result.strip() == "PendingHostPort": printlog( "Cannot find host ports for job :%s, re-launch the job with different host ports " % (job["jobId"])) SubmitJob(job) if result.strip() != "Unknown" and job["jobId"] in UnusualJobs: del UnusualJobs[job["jobId"]]
def extract_job_log(jobId, logPath, userId): try: dataHandler = DataHandler() # TODO: Replace joblog manager with elastic search logs = k8sUtils.GetLog(jobId, tail=None) # Do not overwrite existing logs with empty log # DLTS bootstrap will generate logs for all containers. # If one container has empty log, skip writing. for log in logs: if "containerLog" in log and log["containerLog"] == "": return jobLogDir = os.path.dirname(logPath) if not os.path.exists(jobLogDir): mkdirsAsUser(jobLogDir, userId) logStr = "" trimlogstr = "" for log in logs: if "podName" in log and "containerID" in log and "containerLog" in log: logStr += "=========================================================\n" logStr += "=========================================================\n" logStr += "=========================================================\n" logStr += " logs from pod: %s\n" % log["podName"] logStr += "=========================================================\n" logStr += "=========================================================\n" logStr += "=========================================================\n" logStr += log["containerLog"] logStr += "\n\n\n" logStr += "=========================================================\n" logStr += " end of logs from pod: %s\n" % log["podName"] logStr += "=========================================================\n" logStr += "\n\n\n" trimlogstr += "=========================================================\n" trimlogstr += "=========================================================\n" trimlogstr += "=========================================================\n" trimlogstr += " logs from pod: %s\n" % log["podName"] trimlogstr += "=========================================================\n" trimlogstr += "=========================================================\n" trimlogstr += "=========================================================\n" logLines = log["containerLog"].split('\n') if (len(logLines) < 3000): trimlogstr += log["containerLog"] trimlogstr += "\n\n\n" trimlogstr += "=========================================================\n" trimlogstr += " end of logs from pod: %s\n" % log[ "podName"] trimlogstr += "=========================================================\n" trimlogstr += "\n\n\n" else: trimlogstr += "\n".join(logLines[-2000:]) trimlogstr += "\n\n\n" trimlogstr += "=========================================================\n" trimlogstr += " end of logs from pod: %s\n" % log[ "podName"] trimlogstr += " Note: the log is too long to display in the webpage.\n" trimlogstr += " Only the last 2000 lines are shown here.\n" trimlogstr += " Please check the log file (in Job Folder) for the full logs.\n" trimlogstr += "=========================================================\n" trimlogstr += "\n\n\n" try: containerLogPath = os.path.join( jobLogDir, "log-container-" + log["containerID"] + ".txt") with open(containerLogPath, 'w') as f: f.write(log["containerLog"]) f.close() os.system("chown -R %s %s" % (userId, containerLogPath)) except Exception as e: logger.exception("write container log failed") if len(trimlogstr.strip()) > 0: dataHandler.UpdateJobTextField(jobId, "jobLog", base64.b64encode(trimlogstr)) with open(logPath, 'w') as f: f.write(logStr) f.close() os.system("chown -R %s %s" % (userId, logPath)) except Exception as e: logging.error(e)
def GetClusterStatus():
    job = None
    dataHandler = DataHandler()
    cluster_status, last_update_time = dataHandler.GetClusterStatus()
    dataHandler.Close()
    return cluster_status, last_update_time
def GetCommands(jobId):
    dataHandler = DataHandler()
    commands = dataHandler.GetCommands(jobId=jobId)
    dataHandler.Close()
    return commands
def TakeJobActions(jobs):
    dataHandler = DataHandler()
    vcList = dataHandler.ListVCs()
    dataHandler.Close()

    localResInfo = ResourceInfo()
    globalResInfo = ResourceInfo()
    for vc in vcList:
        localResInfo.Add(ResourceInfo(vc["vcName"], json.loads(vc["quota"])))
        globalResInfo.Add(ResourceInfo("", json.loads(vc["quota"])))

    jobsInfo = []
    for job in jobs:
        if job["jobStatus"] == "queued" or job["jobStatus"] == "scheduling" or job["jobStatus"] == "running":
            singleJobInfo = {}
            singleJobInfo["job"] = job
            singleJobInfo["jobParams"] = json.loads(base64.b64decode(job["jobParams"]))
            jobGpuType = "any"
            if "gpuType" in singleJobInfo["jobParams"]:
                jobGpuType = singleJobInfo["jobParams"]["gpuType"]
            singleJobInfo["localResInfo"] = ResourceInfo.FromTypeAndCount(job["vcName"], jobGpuType, singleJobInfo["jobParams"]["resourcegpu"])
            singleJobInfo["globalResInfo"] = ResourceInfo.FromTypeAndCount("", jobGpuType, singleJobInfo["jobParams"]["resourcegpu"])
            singleJobInfo["sortKey"] = str(job["jobTime"])
            if singleJobInfo["jobParams"]["preemptionAllowed"]:
                singleJobInfo["sortKey"] = "1_" + singleJobInfo["sortKey"]
            else:
                singleJobInfo["sortKey"] = "0_" + singleJobInfo["sortKey"]
            singleJobInfo["allowed"] = False
            jobsInfo.append(singleJobInfo)
    jobsInfo.sort(key=JobInfoSorter)

    logging.info("TakeJobActions : local resources : %s" % (localResInfo.CategoryToCountMap))
    logging.info("TakeJobActions : global resources : %s" % (globalResInfo.CategoryToCountMap))

    for sji in jobsInfo:
        logging.info("TakeJobActions : job : %s : %s : %s" % (sji["jobParams"]["jobName"], sji["localResInfo"].CategoryToCountMap, sji["sortKey"]))
        if sji["jobParams"]["preemptionAllowed"]:
            localResInfo.UnblockResourceCategory(sji["localResInfo"])
        if (localResInfo.CanSatisfy(sji["localResInfo"])):
            localResInfo.Subtract(sji["localResInfo"])
            globalResInfo.Subtract(sji["globalResInfo"])
            sji["allowed"] = True
            logging.info("TakeJobActions : local assignment : %s : %s" % (sji["jobParams"]["jobName"], sji["localResInfo"].CategoryToCountMap))
        elif not sji["jobParams"]["preemptionAllowed"]:
            localResInfo.BlockResourceCategory(sji["localResInfo"])  # FIFO scheduling

    #logging.info("TakeJobActions : local resources : %s" % (localResInfo.CategoryToCountMap))
    #logging.info("TakeJobActions : global resources : %s" % (globalResInfo.CategoryToCountMap))

    for sji in jobsInfo:
        if (sji["jobParams"]["preemptionAllowed"] and sji["allowed"] == False):
            if globalResInfo.CanSatisfy(sji["globalResInfo"]):
                logging.info("TakeJobActions : job : %s : %s" % (sji["jobParams"]["jobName"], sji["globalResInfo"].CategoryToCountMap))
                # Strict FIFO policy not required for global (bonus) tokens since these jobs are anyway pre-emptible.
                globalResInfo.Subtract(sji["globalResInfo"])
                sji["allowed"] = True
                logging.info("TakeJobActions : global assignment : %s : %s" % (sji["jobParams"]["jobName"], sji["globalResInfo"].CategoryToCountMap))

    logging.info("TakeJobActions : global resources : %s" % (globalResInfo.CategoryToCountMap))

    for sji in jobsInfo:
        if sji["job"]["jobStatus"] == "queued" and sji["allowed"] == True:
            SubmitJob(sji["job"])
            logging.info("TakeJobActions : submitting job : %s : %s : %s" % (sji["jobParams"]["jobName"], sji["jobParams"]["jobId"], sji["sortKey"]))
        elif (sji["job"]["jobStatus"] == "scheduling" or sji["job"]["jobStatus"] == "running") and sji["allowed"] == False:
            KillJob(sji["job"], "queued")
            logging.info("TakeJobActions : pre-empting job : %s : %s : %s" % (sji["jobParams"]["jobName"], sji["jobParams"]["jobId"], sji["sortKey"]))

    logging.info("TakeJobActions : job desired actions taken")
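# JobInfoSorter is referenced above but not shown in this listing. Since "sortKey"
# is built as "0_<jobTime>" for non-preemptible jobs and "1_<jobTime>" for
# preemptible ones, it presumably just keys on that string so non-preemptible jobs
# come first and each group stays FIFO by submission time. A minimal sketch
# (an assumption, not the original function):
def JobInfoSorter(jobInfo):
    return jobInfo["sortKey"]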
def launch_ps_dist_job(jobParams): job_id = jobParams["jobId"] pods = k8sUtils.GetPod("run=" + job_id) # if any pod is not up, return if "items" not in pods or len(pods["items"]) != ( int(jobParams["numpsworker"]) + int(jobParams["numps"])): return # if any pod is not ready, return pod_status = [k8sUtils.check_pod_status(pod) for pod in pods["items"]] if any([status != "Running" for status in pod_status]): return user_name = getAlias(jobParams["userName"]) if "hostNetwork" in jobParams and jobParams["hostNetwork"]: host_network = True else: host_network = False # setup ssh server for [idx, pod] in enumerate(pods["items"]): pod_name = pod["metadata"]["name"] dist_port = pod["metadata"]["labels"]["distPort"] # quit if can't setup ssh server ssh_port = start_ssh_server(pod_name, user_name, host_network, dist_port) # generate ssh config ssh_config = """ Host %s HostName %s Port %s User %s StrictHostKeyChecking no UserKnownHostsFile /dev/null """ sshconfigstr = "" for [idx, pod] in enumerate(pods["items"]): pod_ip = pod["status"]["podIP"] dist_port = pod["metadata"]["labels"]["distPort"] role = pod["metadata"]["labels"]["distRole"] role_idx = pod["metadata"]["labels"]["distRoleIdx"] # TODO hostNetwork if host_network: sshconfigstr += ( ssh_config % (role + "-" + str(role_idx), pod_ip, str(dist_port), user_name) + "\n") else: sshconfigstr += ( ssh_config % (role + "-" + str(role_idx), pod_ip, 22, user_name) + "\n") # config ssh client for [idx, pod] in enumerate(pods["items"]): pod_name = pod["metadata"]["name"] bash_script = "cat > /home/" + user_name + "/.ssh/config <<EOF " + sshconfigstr + "\nEOF" print("override ssh client config: %s" % bash_script) k8sUtils.kubectl_exec( "exec %s -- bash -c \'%s\' ; chown -R %s /home/%s/.ssh/config" % (pod_name, bash_script, user_name, user_name)) # fix ~/.ssh/ folder permission k8sUtils.kubectl_exec( "exec %s -- chmod 600 -R /home/%s/.ssh; chmod 700 /home/%s/.ssh; chown -R %s /home/%s/.ssh/config" % (pod_name, user_name, user_name, user_name, user_name)) # generate hostfile hostfilecontent = "" for [_, pod] in enumerate(pods["items"]): role = pod["metadata"]["labels"]["distRole"] if role == "ps": continue role_idx = pod["metadata"]["labels"]["distRoleIdx"] worker_gpu_num = pod["spec"]["containers"][0]["resources"]["requests"][ "nvidia.com/gpu"] hostfilecontent += "%s slots=%s\n" % ("worker-" + str(role_idx), worker_gpu_num) tmp_hostfile = "/tmp/" + job_id + ".hostfile" with open(tmp_hostfile, 'w') as f: f.write(hostfilecontent + "\n") # write the hostfile for [idx, pod] in enumerate(pods["items"]): pod_name = pod["metadata"]["name"] remotecmd = "cp %s %s:/job/hostfile" % (tmp_hostfile, pod_name) k8sUtils.kubectl_exec(remotecmd) for [idx, pod] in enumerate(pods["items"]): pod_name = pod["metadata"]["name"] k8sUtils.kubectl_exec("exec %s touch /opt/run_dist_job" % pod_name) # execute user command #k8sUtils.kubectl_exec("exec %s -- bash -c 'runuser -l ${DLWS_USER_NAME} <<EOF_USER_SCRIPT %s \nEOF_USER_SCRIPT'" % (pod_name, jobParams["cmd"])) # update job status dataHandler = DataHandler() dataHandler.UpdateJobTextField(job_id, "jobStatus", "running")
for j in builders:
    if j in z["model"]:
        ur = z["model"]
        ur = ur.replace("\u00e9", "é")
        z["builder"] = j.replace("\u00e9", "é")
        z["builder"] = j.decode("UTF-8")
        z["model"] = z["model"].replace(j, "")

with open("test.json") as data_file:
    data = json.load(data_file)

data_handler = DataHandler(usr=data["usr"], table=data["table"],
                           url=data["url"], pwd=data["pwd"])

var_map = {
    "builder": "CONSTRUCTEUR",
    "Displacement": "DEPLACEMENT",
    "Beam": "MAITRE_BAU",
    "Length": "LONGUEUR",
    "model": "MODELE",
    "type": "TYPE",
    "No. of Beds": "CAPACITE_COUCHAGE",
    "Max speed": "VITESSE_MAX",
    "Range": "RANGE_NAV",
    "Engine": "MOTEUR",
    "Cruising speed": "VITESSE_CROISIERE",
    "Length Waterline": "LONGUEUR_LIGNE_FLOTTAISON",
from DataHandler import DataHandler
import sys

handler = DataHandler()
#handler.addRandomIp(443)

for line in sys.stdin:
    line = line.replace("\n", "")
    line = line.replace("\r", "")
    line = line.replace(" ", "")
    if line != "":
        print line
        handler.addIp(line, 443)
import traceback, signal, socket, sys, logging

from twisted.internet import reactor
from twisted.internet import task

sys.path.append("protocol")
sys.path.append(".")

from DataHandler import DataHandler
from Client import Client
from NATServer import NATServer

import ip2country  # just to make sure it's downloaded
import ChanServ
import twistedserver

_root = DataHandler()
_root.parseArgv(sys.argv)

try:
    signal.SIGHUP

    def sighup(sig, frame):
        logging.info('Received SIGHUP.')
        if _root.sighup:
            _root.reload()

    signal.signal(signal.SIGHUP, sighup)
except AttributeError:
    pass

logging.info('Starting uberserver...')
import numpy as np
from DataHandler import DataHandler
from ModelHandler import Model

"""
The script used to generate new music.
It creates a model and loads a weights file. Then it gets a seed from the
DataHandler object and starts the generation process.
"""

settings = DataHandler.get_config_params()
data_handler = DataHandler()

print 'Building model'
neurons = settings["neurons"]
dropout = settings["dropout"]
l_rate = settings["learning_rate"]
epochs = settings["epochs"]
optimizer = settings["optimizer"]
model = Model(neurons=neurons, dropout=dropout, learning_rate=l_rate,
              optimizer=optimizer, desired_loss=0.3)

print 'Loading weights'
data_handler.get_weights(model, settings)


def sample(prob_distribution, temperature=1.0):
    """
    A function to sample an index from the array of probabilities.
    :param prob_distribution: an array with probabilities for each class (note).
    :param temperature: denominator parameter used to divide the natural log of
        each element. Helps in transforming values in the probability array
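# The body of sample() is cut off above. A minimal sketch of what a temperature-based
# sampling function of this kind typically looks like (an assumption for illustration,
# not the original implementation):
def sample_sketch(prob_distribution, temperature=1.0):
    # rescale log-probabilities by the temperature, then renormalize
    preds = np.asarray(prob_distribution).astype('float64')
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    # draw one index from the resulting multinomial distribution
    return np.argmax(np.random.multinomial(1, preds, 1))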
def analyze_file(data: DataHandler, verbose=False):
    """
    Quick analysis of a file: runs all the algorithms and plots their solutions.
    :param data:
    :param verbose: Pass True for a more detailed analysis and for plotting.
    """
    algos = TSPAlgorithms(data)
    K = 10000  # only used in verbose and euc2d but need scope

    axs: List[List[plt.Axes]]
    fig: plt.Figure
    fig, axs = plt.subplots(7, 2, figsize=(20, 70))
    axs[0][1].remove()

    random_permutation = np.random.permutation(data.dimension)
    algorithms_and_parameters = [
        #[algos.k_random, tuple([K]), axs[0][0]],
        #[algos.closest_neighbour, tuple(), axs[1][0]],
        #[algos.repetitive_closest_neighbour, tuple(), axs[1][1]],
        [algos.two_opt, tuple([neighbourings.invert, random_permutation]), axs[2][0]],
        #[algos.two_opt, tuple([neighbourings.swap, random_permutation]), axs[2][1]],
        #[algos.taboo_search, tuple(["accelerate", neighbourings.invert, random_permutation]), axs[3][0]],
        #[algos.taboo_search, tuple(["accelerate", neighbourings.swap, random_permutation]), axs[3][1]],
        [algos.taboo_search, tuple(["accelerate_moves", neighbourings.invert, random_permutation]), axs[4][0]],
        #[algos.taboo_search, tuple(["cycled_accelerate", neighbourings.swap, random_permutation]), axs[4][1]],
        [algos.taboo_search, tuple(["stagnation_accelerate", neighbourings.invert, random_permutation]), axs[5][0]],
        #[algos.taboo_search, tuple(["stagnation_accelerate", neighbourings.swap, random_permutation]), axs[5][1]],
        [algos.taboo_search, tuple(["long_term_memory", neighbourings.invert, random_permutation]), axs[6][0]],
        #[algos.taboo_search, tuple(["long_term_memory", neighbourings.swap, random_permutation]), axs[6][1]]
    ]

    for algorithm in algorithms_and_parameters:
        algo, parameter_list, ax = algorithm
        ax: plt.Axes
        function_label = f"{algo.__name__}({', '.join(list(map(str, parameter_list)))})"
        time_before = time.time()
        cost = algo(*parameter_list)
        time_after = time.time()
        print(f"{function_label} = {cost}, took {round(time_after-time_before,2)} seconds")
        # func(*params) = func(params[0], params[1],...,params[k])
        solution = algos.last_solution
        if verbose:
            print(f"Solution:", algos.last_solution)
            if data.isEuc2D():
                edges = [(solution[i], solution[i + 1]) for i in range(len(solution) - 1)]
                edges.append((solution[len(solution) - 1], solution[0]))
                nx.draw(data.getGraph(), ax=ax, pos=data.getPos(), with_labels=True,
                        node_size=300, node_color="#ADD8E6")
                nx.draw_networkx_edges(data.getGraph(), pos=data.getPos(), ax=ax,
                                       edgelist=edges, width=2)
                ax.title.set_text(function_label + f"\n cost={algos.last_cost}")

    if verbose and data.isEuc2D():
        fig.suptitle(f"Wykres algorytmów dla instancji {data.name}", fontsize=16)
        plt.tight_layout()
        plt.show()
def AddUser(username, userId):
    ret = None
    dataHandler = DataHandler()
    ret = dataHandler.AddUser(username, userId)
    dataHandler.Close()
    return ret
def SubmitPSDistJob(job): ret = {} dataHandler = DataHandler() try: jobParams = json.loads(base64.b64decode(job["jobParams"])) jobParams["rest-api"] = config["rest-api"] distJobParams = {} distJobParams["ps"] = [] distJobParams["worker"] = [] assignedRack = None if len(config["racks"]) > 0: assignedRack = random.choice(config["racks"]) if jobParams["jobtrainingtype"] == "PSDistJob": jobDescriptionList = [] nums = { "ps": int(jobParams["numps"]), "worker": int(jobParams["numpsworker"]) } for role in ["ps", "worker"]: for i in range(nums[role]): distJobParam = copy.deepcopy(jobParams) distJobParam["distId"] = "%s%d" % (role, i) distJobParam["distRole"] = role if "jobPath" not in distJobParam or len( distJobParam["jobPath"].strip()) == 0: dataHandler.SetJobError( distJobParam["jobId"], "ERROR: job-path does not exist") return False distJobParam["distJobPath"] = os.path.join( distJobParam["jobPath"], distJobParam["distId"]) if "workPath" not in distJobParam or len( distJobParam["workPath"].strip()) == 0: dataHandler.SetJobError( distJobParam["jobId"], "ERROR: work-path does not exist") return False if "dataPath" not in distJobParam or len( distJobParam["dataPath"].strip()) == 0: dataHandler.SetJobError( distJobParam["jobId"], "ERROR: data-path does not exist") return False jobPath, workPath, dataPath = GetStoragePath( distJobParam["distJobPath"], distJobParam["workPath"], distJobParam["dataPath"]) localJobPath = os.path.join(config["storage-mount-path"], jobPath) if not os.path.exists(localJobPath): if "userId" in distJobParam: mkdirsAsUser(localJobPath, distJobParam["userId"]) else: mkdirsAsUser(localJobPath, 0) distJobParam["LaunchCMD"] = "" if "cmd" not in distJobParam: distJobParam["cmd"] = "" ################One choice is that we only wait for certain time. # launchCMD = """ ##!/bin/bash #mkdir -p /opt #echo "[DLWorkspace System]: Waiting for all containers are ready..." ## wait for at most 10 mins. #for i in {1..200}; do # if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then # sleep 3 # else # break # fi #done #if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then # echo "[DLWorkspace System]: Waiting for containers: timeout! Restarting..." # exit 1 #else # echo "[DLWorkspace System]: All containers are ready, launching training job..." # chmod +x /opt/run_dist_job.sh # /opt/run_dist_job.sh #fi #""" launchCMD = """ #!/bin/bash mkdir -p /opt echo "[DLWorkspace System]: Waiting for all containers are ready..." while [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; do sleep 3 done echo "[DLWorkspace System]: All containers are ready, launching training job..." 
chmod +x /opt/run_dist_job.sh /opt/run_dist_job.sh """ launchScriptPath = os.path.join( localJobPath, "launch-%s.sh" % distJobParam["jobId"]) with open(launchScriptPath, 'w') as f: f.write(launchCMD) f.close() distJobParam[ "LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % distJobParam[ "jobId"] distJobParam["jobNameLabel"] = ''.join( e for e in distJobParam["jobName"] if e.isalnum()) distJobParam["userNameLabel"] = getAlias( jobParams["userName"]) ENV = Environment(loader=FileSystemLoader("/")) jobTempDir = os.path.join(config["root-path"], "Jobs_Templete") jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template") distJobParam["hostjobPath"] = os.path.join( config["storage-mount-path"], jobPath) distJobParam["hostworkPath"] = os.path.join( config["storage-mount-path"], workPath) distJobParam["hostdataPath"] = os.path.join( config["storage-mount-path"], dataPath) distJobParam["nvidiaDriverPath"] = nvidiaDriverPath if "mountpoints" not in distJobParam: distJobParam["mountpoints"] = [] # distJobParam["mountpoints"].append({"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath}) distJobParam["mountpoints"].append({ "name": "job", "containerPath": "/job", "hostPath": distJobParam["hostjobPath"] }) distJobParam["mountpoints"].append({ "name": "work", "containerPath": "/work", "hostPath": distJobParam["hostworkPath"] }) distJobParam["mountpoints"].append({ "name": "data", "containerPath": "/data", "hostPath": distJobParam["hostdataPath"] }) distJobParam["pod_ip_range"] = config["pod_ip_range"] if "usefreeflow" in config and config[ "usefreeflow"] == "True": distJobParam["usefreeflow"] = config["usefreeflow"] else: distJobParam["usefreeflow"] = False random.seed(datetime.datetime.now()) distJobParam["containerPort"] = int(random.random() * 1000 + 3000) if assignedRack is not None: if "nodeSelector" not in distJobParam: distJobParam["nodeSelector"] = {} distJobParam["nodeSelector"]["rack"] = assignedRack template = ENV.get_template(os.path.abspath(jobTemp)) job_description = template.render(job=distJobParam) jobDescriptionList.append(job_description) distJobParams[role].append(distJobParam) jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime( "%y%m%d" ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml" jobDescription = "\n---\n".join(jobDescriptionList) jobDescriptionPath = os.path.join(config["storage-mount-path"], jobParams["jobDescriptionPath"]) if not os.path.exists( os.path.dirname(os.path.realpath(jobDescriptionPath))): os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath))) if os.path.isfile(jobDescriptionPath): output = k8sUtils.kubectl_delete(jobDescriptionPath) with open(jobDescriptionPath, 'w') as f: f.write(jobDescription) output = k8sUtils.kubectl_create(jobDescriptionPath) ret["output"] = output ret["jobId"] = jobParams["jobId"] if "userName" not in jobParams: jobParams["userName"] = "" dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus", "scheduling") dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescriptionPath", jobParams["jobDescriptionPath"]) dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription", base64.b64encode(jobDescription)) jobMeta = {} jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"] jobMeta["jobPath"] = jobParams["jobPath"] jobMeta["workPath"] = jobParams["workPath"] jobMeta["jobPath"] = jobParams["jobPath"] jobMeta["LaunchCMD"] = jobParams["LaunchCMD"] jobMeta["distJobParams"] = distJobParams jobMetaStr = base64.b64encode(json.dumps(jobMeta)) 
dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta", jobMetaStr) except Exception as e: print e ret["error"] = str(e) retries = dataHandler.AddandGetJobRetries(jobParams["jobId"]) if retries >= 5: dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus", "error") dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg", "Cannot submit job!" + str(e)) return ret
def data_numeric_stub():
    dh = DataHandler('data/total-test.csv', 'prediction_label')
    headers, features, prediction_labels = dh.get_numeric_data_set()
    knn = KNearestNeighbour(features, prediction_labels, 1)
    print knn.predict((0, 0))
class Generator: def __init__(self, outDir): self.config = None self.__csHash = set() self.__outputDir = outDir self.__csInstance = CSHandler() self.__dataHandler = DataHandler() self.__utils = Utils() self.__Tree = Dependencytree() self.__fileSuffix = "" self.prepareConfig() def prepareConfig(self): self.config = GeneratingConfig() self.config.setCSVariants([0, 1, 2, 3, 4]) self.config.setDataRanges({0:range(50, 1001, 50), 1:range(50, 1001, 50), 2:range(50, 1001, 50), 3:range(50, 1001, 50), 4:range(50, 1001, 50)}) self.config.setSplits([(50, 50), (60, 40), (70, 30), (80, 20), (90, 10)]) self.config.setTagsetVariants([".uniq", ".uni"]) def prepareGenerator(self): self.__csInstance.updateLIDTags(self.__dataHandler.LID[0], self.__dataHandler.LID[1]) def prepareRealTest(self, dataFile, outFile): dataFile = open(dataFile) outFile = open(outFile, 'w') for line in dataFile: line = map(lambda x:x.split('_#'), line.strip().split()) uniLine = self.__dataHandler.mapLD2Uni(line) outFile.write(' '.join(map(lambda x:'_#'.join(x), uniLine)) + '\n') outFile.close() def generateTestData(self): self.config.setDataRanges({0:range(30, 151, 50), 1:range(30, 151, 50), 2:range(30, 151, 50), 3:range(30, 151, 50), 4:range(30, 151, 50)}) for csType in self.config.csVariants: print "type" + str(csType) for data in self.config.dataRanges[csType]: print print " numSents:" + str(data * 2), initialSplitCSData = [] for splitIndex in range(len(self.config.splits)): csData = [] Split = self.config.splits[splitIndex] pureData = [] pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w') dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w') pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w') dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w') pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data) tr = data - pr print " Pure:" + str(2 * pr), print " CS:" + str(2 * tr), random.seed() pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr) pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr) for index in pIndicesL1: line = self.__dataHandler.pureL1[index] line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0]) line = self.__dataHandler.makeLD(line) pureData.append(tuple(line)) csData.append(tuple(line)) for index in pIndicesL2: line = self.__dataHandler.pureL2[index] line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1]) line = self.__dataHandler.makeLD(line) pureData.append(tuple(line)) csData.append(tuple(line)) if splitIndex != 0: random.seed() csSample = random.sample(initialSplitCSData, tr) for sample in csSample: csData.append(sample[0]) csData.append(sample[1]) pureData.append(sample[2]) pureData.append(sample[3]) else: self.__csHash = set() stopLength = tr index = -1 while 1: index += 1 if index == len(self.__dataHandler.parL1): ##break index = 0 print "Still:", stopLength, " Looping.." 
csLines = [] csSeqs = [] hashKeys = ["", ""] for order in range(2): #order = stopLength%2 self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order) csReturn = self.__csInstance.csSentence(csType) csLine = csReturn[0] if csLine != -1: hashKeys[order] = (index, order, tuple(csReturn[1])) csLines.append(csLine) csSeqs.append(csReturn[1]) if len(csLines) == 2: csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]]) self.__Tree.updateTree(self.__dataHandler.parL1[index]) pureLine1 = self.__Tree.wordTags() pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0]) pureLine1 = self.__dataHandler.makeLD(pureLine1) self.__Tree.updateTree(self.__dataHandler.parL2[index]) pureLine2 = self.__Tree.wordTags() pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1]) pureLine2 = self.__dataHandler.makeLD(pureLine2) pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2]) if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash: pureData.append(tuple(pureLine1)) pureData.append(tuple(pureLine2)) csData.append(tuple(csLines[0])) csData.append(tuple(csLines[1])) if splitIndex == 0: initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2))) stopLength -= 1 for hashKey in hashKeys: self.__csHash.add(hashKey) else: continue if stopLength <= 0: break if stopLength > 0: print tr, stopLength, "Testing Break!!" dummy = raw_input() for csLine in csData: dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine))) dataFile.write(self.makeString(csLine)) for pureLine in pureData: pureFile.write(self.makeString(pureLine)) pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine))) pureFile.close() dataFile.close() pureUniFile.close() dataUniFile.close() def generateDataForTest(self): for i in range(10): self.__fileSuffix = "."+str(i) self.generateTrainDataForTest() def generateTrainDataForTest(self): self.config.setDataRanges({0:[450], 1:[450], 2:[450], 3:[450], 4:[450]}) statusCount = 0 for csType in self.config.csVariants: print "type" + str(csType), for data in self.config.dataRanges[csType]: print " numSents:" + str(data * 2), initialSplitCSData = [] for splitIndex in range(len(self.config.splits)): csData = [] Split = self.config.splits[splitIndex] pureData = [] pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w') dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w') pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w') dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w') pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data) tr = data - pr print " Pure:" + str(2 * pr), print " CS:" + str(2 * tr), if splitIndex == len(self.config.splits) - 1: print random.seed() pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr) pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr) for index in pIndicesL1: line = 
self.__dataHandler.pureL1[index] line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0]) line = self.__dataHandler.makeLD(line) pureData.append(tuple(line)) csData.append(tuple(line)) for index in pIndicesL2: line = self.__dataHandler.pureL2[index] line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1]) line = self.__dataHandler.makeLD(line) pureData.append(tuple(line)) csData.append(tuple(line)) if splitIndex != 0: random.seed() csSample = random.sample(initialSplitCSData, tr) for sample in csSample: csData.append(sample[0]) csData.append(sample[1]) pureData.append(sample[2]) pureData.append(sample[3]) else: self.__csHash = set() stopLength = tr index = -1 while 1: index += 1 if index == len(self.__dataHandler.parL1): ##break index = 0 print "Still:", stopLength, " Looping.. ", csLines = [] csSeqs = [] hashKeys = ["", ""] for order in range(2): #order = stopLength%2 self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order) csReturn = self.__csInstance.csSentence(csType) csLine = csReturn[0] if csLine != -1: hashKeys[order] = (index, order, tuple(csReturn[1])) csLines.append(csLine) csSeqs.append(csReturn[1]) if len(csLines) == 2: csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]]) self.__Tree.updateTree(self.__dataHandler.parL1[index]) pureLine1 = self.__Tree.wordTags() pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0]) pureLine1 = self.__dataHandler.makeLD(pureLine1) self.__Tree.updateTree(self.__dataHandler.parL2[index]) pureLine2 = self.__Tree.wordTags() pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1]) pureLine2 = self.__dataHandler.makeLD(pureLine2) pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2]) if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash: pureData.append(tuple(pureLine1)) pureData.append(tuple(pureLine2)) csData.append(tuple(csLines[0])) csData.append(tuple(csLines[1])) if splitIndex == 0: initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2))) stopLength -= 1 for hashKey in hashKeys: self.__csHash.add(hashKey) else: continue if stopLength <= 0: break if stopLength > 0: print tr, stopLength, "Training Break!!" 
dummy = raw_input() for csLine in csData: dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine))) dataFile.write(self.makeString(csLine)) for pureLine in pureData: pureFile.write(self.makeString(pureLine)) pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine))) pureFile.close() dataFile.close() pureUniFile.close() dataUniFile.close() statusCount += 1 if statusCount % 50 == 0: print statusCount, sys.stdout.flush() print statusCount def generateTrainData(self): statusCount = 0 for csType in self.config.csVariants: print "type" + str(csType) for data in self.config.dataRanges[csType]: print print " numSents:" + str(data * 2), initialSplitCSData = [] for splitIndex in range(len(self.config.splits)): csData = [] Split = self.config.splits[splitIndex] pureData = [] pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w') dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w') pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w') dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w') pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data) tr = data - pr print " Pure:" + str(2 * pr), print " CS:" + str(2 * tr), random.seed() pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr) pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr) for index in pIndicesL1: line = self.__dataHandler.pureL1[index] line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0]) line = self.__dataHandler.makeLD(line) pureData.append(tuple(line)) csData.append(tuple(line)) for index in pIndicesL2: line = self.__dataHandler.pureL2[index] line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1]) line = self.__dataHandler.makeLD(line) pureData.append(tuple(line)) csData.append(tuple(line)) if splitIndex != 0: random.seed() csSample = random.sample(initialSplitCSData, tr) for sample in csSample: csData.append(sample[0]) csData.append(sample[1]) pureData.append(sample[2]) pureData.append(sample[3]) else: self.__csHash = set() stopLength = tr index = -1 while 1: index += 1 if index == len(self.__dataHandler.parL1): ##break index = 0 print "Still:", stopLength, " Looping.." 
csLines = [] csSeqs = [] hashKeys = ["", ""] for order in range(2): #order = stopLength%2 self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order) csReturn = self.__csInstance.csSentence(csType) csLine = csReturn[0] if csLine != -1: hashKeys[order] = (index, order, tuple(csReturn[1])) csLines.append(csLine) csSeqs.append(csReturn[1]) if len(csLines) == 2: csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]]) self.__Tree.updateTree(self.__dataHandler.parL1[index]) pureLine1 = self.__Tree.wordTags() pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0]) pureLine1 = self.__dataHandler.makeLD(pureLine1) self.__Tree.updateTree(self.__dataHandler.parL2[index]) pureLine2 = self.__Tree.wordTags() pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1]) pureLine2 = self.__dataHandler.makeLD(pureLine2) pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2]) if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash: pureData.append(tuple(pureLine1)) pureData.append(tuple(pureLine2)) csData.append(tuple(csLines[0])) csData.append(tuple(csLines[1])) if splitIndex == 0: initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2))) stopLength -= 1 for hashKey in hashKeys: self.__csHash.add(hashKey) else: continue if stopLength <= 0: break if stopLength > 0: print tr, stopLength, "Training Break!!" dummy = raw_input() for csLine in csData: dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine))) dataFile.write(self.makeString(csLine)) for pureLine in pureData: pureFile.write(self.makeString(pureLine)) pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine))) pureFile.close() dataFile.close() pureUniFile.close() dataUniFile.close() statusCount += 1 if statusCount % 50 == 0: print statusCount, sys.stdout.flush() print statusCount def generateUCTrainData(self): # Unknown words constrained training data statusCount = 0 for csType in self.config.csVariants: for data in self.config.dataRanges[csType]: initialSplitCSData = [] for splitIndex in range(len(self.config.splits)): csData = [] Split = self.config.splits[splitIndex] pureData = [] pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w') dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w') pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w') dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w') pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data) tr = data - pr print pr random.seed() pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr) pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr) for index in pIndicesL1: line = self.__dataHandler.pureL1[index] line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0]) line = self.__dataHandler.makeLD(line) pureData.append(tuple(line)) csData.append(tuple(line)) for index in pIndicesL2: line = 
self.__dataHandler.pureL2[index] line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1]) line = self.__dataHandler.makeLD(line) pureData.append(tuple(line)) csData.append(tuple(line)) if splitIndex != 0: random.seed() csSample = random.sample(initialSplitCSData, tr) for sample in csSample: csData.append(sample[0]) csData.append(sample[1]) pureData.append(sample[2]) pureData.append(sample[3]) else: self.__csHash = set() stopLength = tr index = -1 while 1: index += 1 if index == len(self.__dataHandler.parL1): ##break index = 0 print "Still:", stopLength, " Looping.." csLines = [] csSeqs = [] hashKeys = ["", ""] for order in range(2): #order = stopLength%2 self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order) csReturn = self.__csInstance.csSentence(csType) csLine = csReturn[0] if csLine != -1: hashKeys[order] = (index, order, tuple(csReturn[1])) csLines.append(csLine) csSeqs.append(csReturn[1]) if len(csLines) == 2: csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]]) self.__Tree.updateTree(self.__dataHandler.parL1[index]) pureLine1 = self.__Tree.wordTags() pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0]) pureLine1 = self.__dataHandler.makeLD(pureLine1) self.__Tree.updateTree(self.__dataHandler.parL2[index]) pureLine2 = self.__Tree.wordTags() pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1]) pureLine2 = self.__dataHandler.makeLD(pureLine2) pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2]) if pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash: pureData.append(tuple(pureLine1)) pureData.append(tuple(pureLine2)) csData.append(tuple(csLines[0])) csData.append(tuple(csLines[1])) if splitIndex == 0: initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2))) stopLength -= 1 for hashKey in hashKeys: self.__csHash.add(hashKey) else: continue if stopLength <= 0: break if stopLength > 0: print tr, stopLength, "Training Break!!" dummy = raw_input() for csLine in csData: dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine))) dataFile.write(self.makeString(csLine)) for pureLine in pureData: pureFile.write(self.makeString(pureLine)) pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine))) pureFile.close() dataFile.close() pureUniFile.close() dataUniFile.close() statusCount += 1 if statusCount % 50 == 0: print statusCount, sys.stdout.flush() print statusCount def makeString(self, wordsTagsLangs): return ' '.join(map(lambda x:"_#".join(x), wordsTagsLangs)) + '\n' def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data): self.__dataHandler.loadData(l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data)
def data_text_stub():
    dh = DataHandler('data/train-set.csv', 'sentiment')
    headers, features, prediction_labels = dh.get_textual_data_set()
    review_text_index = headers.index('review_text')
    review_text_list = [feature[review_text_index] for feature in features]
    bow_headers, train_features = dh.convert_docs_to_bow(review_text_list)
def data_write_test_stub():
    dh = DataHandler('data/train-set.csv', 'sentiment')
    headers, features, prediction_labels = dh.get_textual_data_set()
    review_text_index = headers.index('review_text')
    review_text_list = [feature[review_text_index] for feature in features]
    bow_feature_names = dh.get_feature_set_for_documents(review_text_list)

    dh = DataHandler('data/test-set.csv', 'sentiment')
    headers, features, prediction_labels = dh.get_textual_data_set()
    review_text_index = headers.index('review_text')
    review_text_list = [feature[review_text_index] for feature in features]
    bow_features = dh.convert_docs_to_bow_for_features(review_text_list, bow_feature_names)
    test_prediction_labels = dh.convert_sentiment_list_to_number(prediction_labels)

    print len(bow_feature_names)
    print len(bow_features[0])
    print test_prediction_labels[0]

    bow_feature_names.append("prediction_label")
    dh.write_to_file('data/test-set-feature-engineered.csv', bow_features,
                     bow_feature_names, test_prediction_labels)
class UserwiseDivergenceAnalysis:
    def __init__(self, dataFile, userJoins):
        sys.stderr.write('In Constructor\n')
        self.distComparer = DistComparer()
        self.dataHandler = DataHandler(dataFile, userJoins)
        self.dataHandler.loadActiveForums()
        self.__loadData()
        self.sampledUsers = set()

    def __loadData(self):
        stopWords = set([s.strip() for s in open("/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts/Regression/stopWords")])
        self.dataHandler.preprocessVocab(stopWords)

    def sampleUsers(self):
        #self.dataHandler.userStats(outFile)
        self.sampledUsers = self.dataHandler.sampleUsers()

    def doDivergenceAnalysisPerUser(self, outFile):
        outFile = open(outFile, 'w')
        for user in self.sampledUsers:
            # The middle of this loop is redacted ("******") in the source; the two
            # lines below are a minimal reconstruction based on the write() call.
            userDivergences = self.prepareUserDivergences(user)
            for month in userDivergences:
                outFile.write(str(user) + '\t' + str(month) + '\t' + str(userDivergences[month][0]) + '\t' + str(userDivergences[month][1]) + '\n')
        outFile.close()

    def prepareUserDivergencesActive(self, userNum):
        divergences = {}
        userMonths = self.dataHandler.getUserMonths(userNum)
        activeForum = self.dataHandler.getActiveForum(userNum)
        if activeForum.find("Talk") < 0:
            return -1
        userInitialData = self.dataHandler.makeDist(self.dataHandler.getForumInitialData(self.dataHandler.getActiveForum(userNum)))
        userMaturedData = self.dataHandler.makeDist(self.dataHandler.getForumMaturedData(self.dataHandler.getActiveForum(userNum)))
        for userMonth in userMonths:
            monthData = self.dataHandler.makeDist(self.dataHandler.getUserDataForDivergence(userNum, userMonth))
            divergences[userMonth] = (self.distComparer.jsDivergence(userInitialData, monthData),
                                      self.distComparer.jsDivergence(monthData, userMaturedData))
        return divergences

    def prepareUserDivergencesBackground(self, userNum):
        divergences = {}
        userMonths = self.dataHandler.getUserMonths(userNum)  # was self.__dataHandler, which is not defined on this class
        userInitialData = self.dataHandler.makeDist(self.dataHandler.getForumInitialData("AllTalk"))
        userMaturedData = self.dataHandler.makeDist(self.dataHandler.getForumMaturedData("AllTalk"))
        for userMonth in userMonths:
            monthData = self.dataHandler.makeDist(self.dataHandler.getUserDataForDivergence(userNum, userMonth))
            divergences[userMonth] = (self.distComparer.jsDivergence(userInitialData, monthData),
                                      self.distComparer.jsDivergence(monthData, userMaturedData))
        return divergences

    def prepareUserDivergences(self, userNum):
        divergences = {}
        userMonths = self.dataHandler.getUserMonths(userNum)
        userInitialData = self.dataHandler.makeDist(self.dataHandler.getUserInitialData(userNum))
        userMaturedData = self.dataHandler.makeDist(self.dataHandler.getUserMaturedData(userNum))
        for userMonth in userMonths:
            monthData = self.dataHandler.makeDist(self.dataHandler.getUserDataForDivergence(userNum, userMonth))
            divergences[userMonth] = (self.distComparer.jsDivergence(userInitialData, monthData),
                                      self.distComparer.jsDivergence(monthData, userMaturedData))
        return divergences
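# DistComparer is not shown in this listing. A minimal sketch of a Jensen-Shannon
# divergence, which jsDivergence presumably computes, over two dicts mapping tokens
# to probabilities (an assumption for illustration, not the original class):
import math

def js_divergence(p, q):
    def kl(a, b):
        # Kullback-Leibler divergence in bits over the keys with non-zero mass in a
        return sum(a[k] * math.log(a[k] / b[k], 2) for k in a if a[k] > 0)
    keys = set(p) | set(q)
    m = {k: 0.5 * (p.get(k, 0.0) + q.get(k, 0.0)) for k in keys}
    pf = {k: p.get(k, 0.0) for k in keys}
    qf = {k: q.get(k, 0.0) for k in keys}
    return 0.5 * kl(pf, m) + 0.5 * kl(qf, m)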
class SBGSurvival(object):
    """
    This class implements an extended version of the Shifted-Beta-Geometric model by
    P. Fader and B. Hardie.

    The original model works by assuming a constant in time, beta distributed individual
    probability of churn. Due to the heterogeneity of a cohort's churn rates (since each
    individual will have a different probability of churning), expected behaviours such as
    the decrease of cohort churn rate over time arise naturally.

    The extension done here generalizes the coefficients alpha and beta of the original
    model to functions of features on the individual level. A log-linear model is used to
    construct alpha(x) and beta(x) and the likelihood is then computed by combining the
    contributions of each and every sample in the training set.

    The model takes as inputs ...
    """

    def __init__(self, age, alive, features=None, gamma=1.0, gamma_beta=None,
                 bias=True, normalize=True, verbose=False):
        """
        Initializes objects with parameters necessary to create the supporting
        objects: DataHandler and ShiftedBeta

        :param age: str
            The column name to identify the age of each individual. Age has to be an
            integer value, and will determine the time intervals the model will work with.
            --- See DataHandler.py

        :param alive: str
            The column name with the status of each individual. In the context of
            survival analysis, an individual may be dead or alive, and its contribution
            to the model will depend on it.
            --- See DataHandler.py

        :param features: str, list or None
            A string with the name of the column to be used as features, or a list of
            names of columns to be used as features, or None, if no features are to be used.
            --- See DataHandler.py

        :param gamma: float
            A non-negative float specifying the strength of the regularization applied
            to w_alpha (alpha's weights) and, if gamma_beta is not given, it is also
            applied to beta.
            --- See ShiftedBeta.py

        :param gamma_beta: float
            A non-negative float specifying the strength of the regularization applied
            to w_beta (beta's weights). If specified, overwrites the value of gamma for beta.
            --- See ShiftedBeta.py

        :param bias: bool
            Whether or not a bias term should be added to the feature matrix.
            --- See DataHandler.py

        :param normalize: bool
            Whether or not numerical fields should be normalized (centered and scaled
            to have std=1)
            --- See DataHandler.py

        :param verbose: bool
            Whether or not status updates should be printed
            --- See ShiftedBeta.py
        """
        # Create objects!
        # DATA-HANDLER OBJECT
        # The DataHandler object may be created without the training data, so
        # we do it here.
        self.dh = DataHandler(age=age, alive=alive, features=features,
                              bias=bias, normalize=normalize)

        # Shifted beta model object
        # Was a different gammab parameter passed? If not, we use the same
        # value passed to gamma.
        if gamma_beta is None:
            gamma_beta = 1.0 * gamma

        # create shifted beta object
        self.sb = ShiftedBetaGeometric(gamma_alpha=gamma,
                                       gamma_beta=gamma_beta,
                                       verbose=verbose)

    def fit(self, df, restarts=1):
        """
        A method responsible for learning both the transformation of the data, including
        addition of a bias parameter, centering and re-scaling of numerical features, and
        one-hot-encoding of categorical features, in addition to learning the parameters
        alpha and beta of the shifted-beta-geometric model.

        This is just a wrapper; the real heavy lifting is done by the DataHandler and
        ShiftedBeta objects.

        :param df: pandas DataFrame
            A pandas DataFrame with similar schema as the one used to train the model.
            Similar in the sense that the columns used as cohort, age and categories
            must match. Extra columns will not affect anything.

        :param restarts: int
            Number of times to restart the optimization procedure with a different seed,
            to avoid getting stuck on local maxima.
        """
        # Transform dataframe extracting feature matrix, ages and alive status.
        x, y, z = self.dh.fit_transform(df)

        # fit to data using the ShiftedBeta object.
        self.sb.fit(X=x, age=y, alive=z, restarts=restarts)

    def summary(self):
        """
        Simple method to get the learned weights and their corresponding categories

        :return: pandas DataFrame
            A DataFrame object with alpha and beta weights for each category
        """
        # Construct a DataFrame consisting of feature name and corresponding
        # alpha and beta parameters. Names are obtained by invoking the
        # get_names() method, and the parameters displayed are the weights,
        # not the final values (since those cannot be interpreted separately).
        suma = pd.DataFrame(
            data={name: (a, b) for name, a, b in zip(self.dh.get_names(),
                                                     self.sb.alpha,
                                                     self.sb.beta)},
            index=["w_alpha", "w_beta"],
        ).T
        return suma

    def predict_params(self, df):
        """
        predict_params is a method capable of predicting the values of alpha and beta for
        a given combination of features. It invokes the compute_alpha_beta method from the
        ShiftedBeta object to compute the arrays of alpha and beta for every sample in df
        given the available features.

        Notice that it must first transform the dataframe df using DataHandler's transform
        method, so that it can then work with the lower level feature matrix, x.

        :param df: pandas DataFrame
            A pandas dataframe with at least the same feature columns as the one used to
            train the model.

        :return: pandas DataFrame
            A DataFrame with the predicted alpha and beta for each sample in df
        """
        # Start by transforming df to its lower level np.array representation
        x, y, z = self.dh.transform(df=df)

        # Use compute_alpha_beta to compute alpha and beta for every sample in
        # df based on the feature matrix extracted from df, x.
        alpha, beta = self.sb.compute_alpha_beta(x, self.sb.alpha, self.sb.beta)

        # Return a dataframe with predictions.
        return pd.DataFrame(data=np.vstack([alpha, beta]),
                            index=["alpha", "beta"]).T

    def predict_churn(self, df, age=None, **kwargs):
        """
        predict_churn is a method to compute the churn rate for a number of periods
        conditioned on the age of the sample.

        This method invokes the churn_p_of_t method from ShiftedBeta to compute the churn
        rate for a given number of periods conditional on age. See the description of
        churn_p_of_t in ShiftedBeta.py for more details.

        This method is a wrapper; it transforms the dataframe df to the appropriate
        representation and feeds it to the lower level method from ShiftedBeta.

        It is worth noticing that the user has the option to pass the value for age, which
        can either be a single number or an array with the same length as df, and this will
        overwrite whatever other value for age might come out when transforming df.

        :param df: pandas DataFrame
            A pandas dataframe with at least the same feature columns as the one used to
            train the model.

        :param age: None or float or ndarray of shape(df.shape[0], )
            If age is None, the method will use the age parameter extracted from df.
            ** Notice that if age=None and df does not contain an age field, a
            RuntimeError will be raised! **
            If age != None, pass this value along to churn_p_of_t.

        :param kwargs:
            Any other arguments that should be redirected to churn_p_of_t.

        :return: pandas DataFrame
            A DataFrame with the churn_p_of_t matrix.
        """
        x, y, z = self.dh.transform(df=df)

        # If an age field is present in the prediction dataframe, we may choose to
        # use it to calculate future churn. To do so, we first check if the
        # user passed a new age parameter; if the answer is yes, use the new age.
        # If, however, the user did not pass age, use the value extracted from
        # the dataframe, df.
        # ** If no value for age is passed and the dataframe does not contain
        # age, a RuntimeError is raised.
        if age is None:
            age = y
        if age is None:
            raise RuntimeError(
                'The "age" field must either be present in '
                "the dataframe or passed separately as an "
                "argument."
            )

        # Create a dataframe with the churn_p_of_t matrix with all relevant
        # parameters.
        out = pd.DataFrame(data=self.sb.churn_p_of_t(x, age=age, **kwargs))

        # Give columns a decent, generic name.
        out.columns = ["period_{}".format(col) for col in range(1, out.shape[1] + 1)]

        return out

    def predict_survival(self, df, age=None, **kwargs):
        """
        predict_survival is a method to compute the survival curve for a number of periods
        conditioned on the age of the sample.

        This method invokes the survival_function method from ShiftedBeta to compute the
        retention rate for a given number of periods conditional on age. See the
        description of survival_function in ShiftedBeta.py for more details.

        This method is a wrapper; it transforms the dataframe df to the appropriate
        representation and feeds it to the lower level method from ShiftedBeta.

        It is worth noticing that the user has the option to pass the value for age, which
        can either be a single number or an array with the same length as df, and this will
        overwrite whatever other value for age might come out when transforming df.

        :param df: pandas DataFrame
            A pandas dataframe with at least the same feature columns as the one used to
            train the model.

        :param age: None or float or ndarray of shape(df.shape[0], )
            If age is None, the method will use the age parameter extracted from df.
            ** Notice that if age=None and df does not contain an age field, a
            RuntimeError will be raised! **
            If age != None, pass this value along to survival_function.

        :param kwargs:
            Any other arguments that should be redirected to survival_function.

        :return: pandas DataFrame
            A DataFrame with the survival_function matrix.
        """
        x, y, z = self.dh.transform(df=df)

        # If an age field is present in the prediction dataframe, we may choose to
        # use it to calculate future churn. To do so, we first check if the
        # user passed a new age parameter; if the answer is yes, use the new age.
        # If, however, the user did not pass age, use the value extracted from
        # the dataframe, df.
        # ** If no value for age is passed and the dataframe does not contain
        # age, a RuntimeError is raised.
        if age is None:
            age = y
        if age is None:
            raise RuntimeError(
                'The "age" field must either be present in '
                "the dataframe or passed separately as an "
                "argument."
            )

        # Create a dataframe with the survival_function matrix with all relevant
        # parameters.
        out = pd.DataFrame(data=self.sb.survival_function(x, age=age, **kwargs))

        # Give columns a decent, generic name.
        out.columns = ["period_{}".format(col) for col in range(1, out.shape[1] + 1)]

        return out

    def predict_ltv(self, df, age=None, alive=None, **kwargs):
        """
        predict_ltv is a method to compute the ltv for each sample conditioned on age.

        This method invokes the derl method from ShiftedBeta to compute the residual ltv
        of each sample given its age. See the description of derl in ShiftedBeta.py for
        more details.

        This method is a wrapper; it transforms the dataframe df to the appropriate
        representation and feeds it to the lower level method from ShiftedBeta.

        It is worth noticing that the user has the option to pass values for both the age
        and alive fields, which can either be a single number or an array with the same
        length as df, and this will overwrite whatever other value for age and/or alive
        might come out when transforming df.

        :param df: pandas DataFrame
            A pandas dataframe with at least the same feature columns as the one used to
            train the model.

        :param age: None or float or ndarray of shape(df.shape[0], )
            If age is None, the method will use the age parameter extracted from df.
            ** Notice that if age=None and df does not contain an age field, a
            RuntimeError will be raised! **
            If age != None, pass this value along to derl.

        :param alive: None or float or ndarray of shape(df.shape[0], )
            If alive is None, the method will use the alive parameter extracted from df.
            ** Notice that if alive=None and df does not contain an alive field, a
            RuntimeError will be raised! **
            If alive != None, pass this value along to derl.

        :param kwargs:
            Any other arguments that should be redirected to derl.

        :return: pandas DataFrame
            A DataFrame with the ltv predictions.
        """
        x, y, z = self.dh.transform(df=df)

        # If an age field is present in the prediction dataframe, we may choose to
        # use it to calculate future churn. To do so, we first check if the
        # user passed a new age parameter; if the answer is yes, use the new age.
        # If, however, the user did not pass age, use the value extracted from
        # the dataframe, df.
        # ** If no value for age is passed and the dataframe does not contain
        # age, a RuntimeError is raised.
        if age is None:
            age = y
        if age is None:
            raise RuntimeError(
                'The "age" field must either be present in '
                "the dataframe or passed separately as an "
                "argument."
            )

        # See the discussion above for age; the exact same logic applies.
        if alive is None:
            alive = z
        if alive is None:
            raise RuntimeError(
                'The "alive" field must either be present in the '
                "dataframe or passed separately as an "
                "argument."
            )

        # Get LTVs and return a dataframe!
        ltvs = self.sb.derl(x, age=age, alive=alive, **kwargs)

        return pd.DataFrame(data=ltvs, columns=["ltv"])
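A short, hypothetical usage sketch of the class above may help tie these methods together. The DataFrame, column names and hyperparameters below are made up; only the call pattern is meant to be illustrative, and DataHandler/ShiftedBetaGeometric are assumed to be importable as in the class itself.

import pandas as pd

# Hypothetical cohort data: 'tenure' is the integer age in periods,
# 'active' is the alive/dead flag, 'plan' is a categorical feature.
df = pd.DataFrame({
    "tenure": [1, 3, 5, 2, 4],
    "active": [1, 0, 1, 1, 0],
    "plan":   ["basic", "basic", "pro", "pro", "basic"],
})

model = SBGSurvival(age="tenure", alive="active", features="plan",
                    gamma=0.5, verbose=False)
model.fit(df, restarts=3)

print(model.summary())           # fitted w_alpha / w_beta per category
print(model.predict_params(df))  # alpha, beta per row
print(model.predict_churn(df))   # churn_p_of_t matrix, one column per period
print(model.predict_ltv(df))     # residual LTV per row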
import SocketHandler from SocketHandler import SocketHandler from DataHandler import DataHandler import sys while True: dataHandler = DataHandler() ip = dataHandler.getRandom() if ip == "": print "Could not find any ips that need processing" sys.exit() print "Processing "+ip socksHandler = SocketHandler(ip) banner = socksHandler.grabBanner() #Yes I understand that this is an awful hack banner = banner.replace("'","\'") try: dataHandler.setBanner(ip, banner) except: print "Saving banner for "+ip+" failed!"
def get_job(job_id): dataHandler = DataHandler() ret = dataHandler.GetJob(jobId=job_id)[0] dataHandler.Close() return ret
#!/usr/bin/env python import os import sys import argparse import base64 import json import pprint sys.path.append( os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) from DataHandler import DataHandler if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--job_id", "-j", help="id of job", required=True) args = parser.parse_args() handler = DataHandler() jobs = handler.GetJob(jobId=args.job_id) if len(jobs) == 0: print("didn't find job of %s" % (args.job_id)) sys.exit(1) job = jobs[0] job_params = json.loads(base64.b64decode(job["jobParams"])) pprint.pprint(job_params)
def get_job_priorities(): dataHandler = DataHandler() job_priorites = dataHandler.get_job_priority() dataHandler.Close() return job_priorites
from DataHandler import DataHandler import itertools data_handler = DataHandler() try: print(data_handler) print("") df_names = data_handler.files df_combinations = itertools.combinations(df_names, 2) for combination in df_combinations: common_columns = data_handler.list_all_columns( selected_columns=combination) column1 = common_columns.values()[0] column2 = common_columns.values()[1] intersection = set(column1).intersection(column2) if not intersection: continue print( "Columns %s,%s Have %s in common" % (common_columns.keys()[0], common_columns.keys()[1], intersection)) print("") except Exception as e: print(e)
import numpy as np

from DataHandler import DataHandler
from ModelHandler import Model

"""
The script used to generate new music. It creates a model and loads a weights file.
Then it gets a seed from the DataHandler object and starts the generation process.
"""

settings = DataHandler.get_config_params()
data_handler = DataHandler()

print('Building model')
neurons = settings["neurons"]
dropout = settings["dropout"]
l_rate = settings["learning_rate"]
epochs = settings["epochs"]
optimizer = settings["optimizer"]
model = Model(neurons=neurons, dropout=dropout, learning_rate=l_rate,
              optimizer=optimizer, desired_loss=0.3)

print('Loading weights')
data_handler.get_weights(model, settings)


def sample(prob_distribution, temperature=1.0):
    """
    A function to sample an index from the array of probabilities.
    :param prob_distribution: an array with probabilities for each class (note).
    :param temperature: denominator parameter used to divide the natural log of each
        element. Helps in transforming values in the probability array
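The sample helper above is cut off mid-docstring in this excerpt. A standard implementation of temperature-based sampling, written independently here as an illustration rather than the project's actual code, looks roughly like this:

import numpy as np

def sample_with_temperature(prob_distribution, temperature=1.0):
    """Draw one class index from a probability vector after temperature scaling.
    Hypothetical helper, not the project's sample()."""
    # Re-scale log-probabilities by the temperature: values < 1 sharpen the
    # distribution, values > 1 flatten it.
    logits = np.log(np.asarray(prob_distribution, dtype=np.float64) + 1e-12) / temperature
    probs = np.exp(logits)
    probs /= probs.sum()
    # Draw one index according to the re-scaled distribution.
    return int(np.random.choice(len(probs), p=probs))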
def evaluateAllModelsData(baseDir, pathToTestData, pathToTrainingData, evalDirName='Evaluation', modelDirName='Models', params=['Param'], modelName='best_model.hdf5', cv=10, verbose=True, colRemove=None, skiprowsTrain=0, skiprowsTest=0, param_name_value=None): if baseDir[-1] != '/': baseDir += '/' evalDir = baseDir + evalDirName modelDir = baseDir + modelDirName modelList = TestingFramework.getListOfModels(modelDir, modelName=modelName) total_combinations = len(modelList) if verbose: print( '\nEvaluating the best network trained for all {} combinations(s)\n' .format(total_combinations)) modelEvaluation = dict() X, y = DataHandler.predictionData(pathToTestData, pathToTrainingData, skiprowsTrain=skiprowsTrain, skiprowsTest=skiprowsTest, colRemove=colRemove) assert (len(y) > 0), "Error. No truth values were found in the dataset." assert ( np.shape(X)[0] == np.shape(y)[0] ), "Error. The number of columns, {}, in the training data does not equal the number of target truth values, {}.".format( np.shape(X)[0], np.shape(y)[0]) for model_path in modelList: if param_name_value is None: # Get the last part of the path (filename) model_filename = re.search('/([^/]*)$', model_path) model_filename = model_filename.group(1) # Remove the filename from the path and extract the last part of the path: the configuration. model_configuration = re.search( '/([^/]*)/$', re.sub(r'([^/]*)$', '', model_path)) model_configuration = model_configuration.group(1) # Get a list of the values for each parameter #parameter_values = model_configuration.split(self.model_parameter_delimiter) parameter_values = model_configuration.split('-') # Create string, Param1 = Value1,\tParm2 = Value2 etc. param_name_value = ",\t".join([ params[i % len(params)] + ' = ' + parameter_values[i] for i in range(len(parameter_values)) ]) if verbose: print('Evaluating model {} for scenario: {}'.format( model_filename, param_name_value)) K.clear_session() # Bug, kernel dies in Keras 2.2.0 model = load_model(model_path) cvscores = TestingFramework.evaluate(model, X, y, cv=cv, verbose=verbose) model_name = model_configuration + '/' + model_filename #model_name = model_name.lower().replace(' ','_').replace(',','_') modelEvaluation[model_name] = [ np.mean(cvscores), np.std(cvscores), param_name_value ] del model # Rank all the combinations from lowest mean MSE to highest sorted_combinations = sorted(modelEvaluation.items(), key=lambda x: x[1]) #print(sorted_combinations) rank = 1 filename = evalDir + '/' + 'ranked_evaluation_static.txt' for model_score in sorted_combinations: rank_string = 'Rank {}\t: {}\t\t| {}\tMSE: {:.3f} +/- {:.3f}\n'.format( rank, model_score[0], model_score[1][2], model_score[1][0], model_score[1][1]) if verbose: print(rank_string) # Write to file with open(filename, 'a') as f: f.write(rank_string) rank += 1
from Score import Score from DataHandler import DataHandler from PartialScore import PartialScore __author__ = 'Thomas' logger = log.setup_custom_logger("Main") start_time = time.clock() logger.info("Main Started at time %i"%start_time) with open('test.json') as data_file: data = json.load(data_file) data_handler = DataHandler(usr=data["usr"], table=data["table"], url=data["url"], pwd=data["pwd"]) myopts, args = getopt.getopt(argv[1:], "c:i:", ["coeffs=", "ids="]) if len(argv) > 1: for i, j in myopts: if i == "--ids" or i == "-i": requested_ids = j.split(",") elif i == "--coeffs" or i == "-c": input_coeffs = j.split(",") coeffs = {} assert len(input_coeffs) == 12, "User must provide exactly 12 coefficients" assert len(requested_ids) > 0, "No ids requested"
data = os.listdir('/home/oem/train') labels = [[1, 0] if 'cat' in i else [0, 1] for i in data] dataset = ImageDataset(source='/home/oem/train', labels=labels, split_spec={'train': {'amount': 0.6, 'transform': True, 'batch_size':32}, 'validation': {'amount': 0.2, 'transform': False, 'batch_size':32}, 'test': {'amount': 0.2, 'transform': False, 'batch_size':32}}, shuffle=True) transformer = ImageTransformer() transformer.add_resize((256, 256), origin=True, keys=['train', 'validation', 'test']) transformer.add_grayscale(origin=True, keys=['train', 'validation', 'test']) transformer.add_unsharp_masking(origin=True, keys=['train', 'validation', 'test']) transformer.add_histogram_equalization(origin=True, keys=['train', 'validation', 'test']) transformer.add_median_filter(keys=['train']) transformer.add_rotation([45, 60, 90], keys=['train']) transformer.add_contrast([1.5], keys=['train']) transformer.add_brightening([0.5], keys=['train']) transformer.add_sharpening([2.0], keys=['train']) saver = DataSaver('/home/oem/PycharmProjects/DeepLearning', 'CatsvsDogs') handler = DataHandler(dataset=dataset, transformer=transformer, saver=saver) handler.run()
from keras.layers import Dense from DataHandler import DataHandler from predModel import predModel from linModel import linModel SCORING = [0.04, 4, -2, 0, 0, 0, 0.1, 6, 1, 0, 0.1, 6, -2] response = input("Would you like to provide values?") find_vals = False if response.lower() == "no": find_vals = True if find_vals: ### run linear model for various alpha dh = DataHandler(beg=1999, end=2018, split_by_pos=True, offset=16, ignore_na=False, fill_mean=True) alpha_vals = np.logspace(-2, 0.5, 10) errs = {} best_alphas = [] for pos in dh.X_train.keys(): errs[pos] = [] print("Running: " + str(pos) + " (" + str(len(dh.X_train[pos])) + " entries)") for alpha in alpha_vals: lin = linModel(alpha=alpha, max_iter=5000) err = np.mean( lin.cv_error(dh.X_train[pos], dh.y_train[pos], np.abs(SCORING))) errs[pos].append(err)
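The excerpt stops while the cross-validation errors are still being accumulated. Assuming the intent is to keep the alpha with the lowest mean CV error for each position (this continuation is hypothetical, not part of the original script), it could finish roughly like this:

import numpy as np

# Hypothetical continuation: pick the alpha with the lowest CV error for
# each position and keep it for the final models.
for pos in errs:
    best_idx = int(np.argmin(errs[pos]))
    best_alphas.append((pos, alpha_vals[best_idx]))
    print("%s: best alpha = %g" % (pos, alpha_vals[best_idx]))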
for l2Key in level2Keys: sys.stdout.write("\t"+str(D[l1Key][l2Key])) sys.stdout.write("\n") def printList(L, outFile): outFile = open(outFile,'w') for content in L: outFile.write('\t'.join(map(lambda x:str(x), content))+'\n') outFile.close() if __name__ == '__main__': drawLine() sys.exit() dataFile = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/allThreads.csv" userJoins = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/userJoins" activeForums = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/activeForums.csv" DH = DataHandler(dataFile, userJoins) DH.loadActiveForums(activeForums) drawPostingFrequency(DH.getPostingFreq()) #printDict(DH.getCumulativePostingFreq()) #printDict(DH.getCutoffPostingFreq()) #printDict(DH.getMonthwisePostingFrequency()) #printTwolevelDict(DH.getMonthwiseBinnedPostingFrequency()) '''outFile = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/Analysis/basicTable" table = DH.getBasicTable() print len(table) print "No. of users:",len(set(map(lambda x:x[0],table))) printList(table, outFile)'''
from Evaluator import Evaluator from surprise import NormalPredictor from matrix_factorization_algo import MatrixFactorizationAlgo from DataHandler import DataHandler from knn_algo import knn from hybrid_algo_weighted import HybridAlgoWeighted # from Evaluator import Evaluator #load dataset dataprocessor = DataHandler() evaluationData = dataprocessor.getEvaluation() rankings = dataprocessor.getRank() evaluator = Evaluator() #use random as our basline here Random = NormalPredictor() evaluator.Add_Algo(Random, "Random") # add knn algos knngenerator = knn() knn_algo_dict = knngenerator.generate_knn(evaluationData) for key in knn_algo_dict: evaluator.Add_Algo(knn_algo_dict[key], key) # adding MF algos mf_algo = MatrixFactorizationAlgo() mf_algo_dict = mf_algo.generate_algorithms(evaluationData) for key in mf_algo_dict: evaluator.Add_Algo(mf_algo_dict[key], key)
class preProcessor: def __init__(self, dataFile, userJoins): self.dataHandler = DataHandler(dataFile, userJoins) self.dataHandler.loadActiveForums() self.tokenizedData = [] def prepareTokenizedCSV(self): self.tokenizedData = self.dataHandler.getTokenizedCSV() def prepareTokenizedUserMonthCSV(self): self.tokenizedData = self.dataHandler.getTokenizedUserMonthCSV() def prepareTokenizedUserMonthForumCSV(self): self.tokenizedData = self.dataHandler.getTokenizedUserMonthForumCSV() def printDataForTMT(self, outFile): outFile = csv.writer(open(outFile, "w")) index = 1 for record in self.tokenizedData: record.insert(0, index) outFile.writerow(record) index += 1 def getRequiredDataFromRecord(self, record): indices = [0] def initializeUserMonthRecord(self, user, month): return self.dataHandler.getBasicUserMonthRecord(user, month) def isProperUnicode(self, text): try: dummy = unicode(text) return True except: return False def printInferDataForTMT(self, outFile): f = codecs.open(outFile, encoding="utf-8", mode="w+") outFile = csv.writer(f) index = 1 for user in self.tokenizedData.iterkeys(): for month in self.tokenizedData[user].iterkeys(): numPosts = 0 userMonthRecord = self.initializeUserMonthRecord(user, month) for recordText in self.tokenizedData[user][month]: if self.isProperUnicode(recordText): userMonthRecord[-1].append(recordText) numPosts += 1 userMonthRecord[-1] = " ".join(userMonthRecord[-1]) userMonthRecord.insert(0, index) userMonthRecord.append(numPosts) try: outFile.writerow([unicode(s).encode("utf-8") for s in userMonthRecord]) index += 1 except: pass def printInferDataForTMTWithForum(self, outFile): f = codecs.open(outFile, encoding="utf-8", mode="w+") outFile = csv.writer(f) index = 1 for user in self.tokenizedData.iterkeys(): for month in self.tokenizedData[user].iterkeys(): totalPosts = 0 allForumsRecord = self.initializeUserMonthRecord(user, month) for forum in self.tokenizedData[user][month]: numPosts = 0 userMonthRecord = self.initializeUserMonthRecord(user, month) for recordText in self.tokenizedData[user][month][forum]: if self.isProperUnicode(recordText): userMonthRecord[-1].append(recordText) numPosts += 1 forumPosts = copy.deepcopy(userMonthRecord[-1]) userMonthRecord[-1] = " ".join(userMonthRecord[-1]) userMonthRecord.insert(0, index) userMonthRecord.append(numPosts) userMonthRecord.append(forum) try: outFile.writerow([unicode(s).encode("utf-8") for s in userMonthRecord]) index += 1 totalPosts += numPosts allForumsRecord[-1].extend(forumPosts) except: pass allForumsRecord[-1] = " ".join(allForumsRecord[-1]) allForumsRecord.insert(0, index) index += 1 allForumsRecord.append(totalPosts) allForumsRecord.append("AllForums") outFile.writerow([unicode(s).encode("utf-8") for s in allForumsRecord])
def ApproveJob(job): dataHandler = DataHandler() dataHandler.ApproveJob(job["jobId"]) dataHandler.Close() return True
from DataHandler import DataHandler from Client import Client from NATServer import NATServer from Dispatcher import Dispatcher from XmlRpcServer import XmlRpcServer import ip2country # just to make sure it's downloaded import ChanServ # uncomment for debugging deadlocks, creates a stacktrace at the given interval to stdout #import stacktracer #stacktracer.trace_start("trace.html",interval=5,auto=True) # Set auto flag to always update file! _root = DataHandler() _root.parseArgv(sys.argv) try: signal.SIGHUP def sighup(sig, frame): _root.console_write('Received SIGHUP.') if _root.sighup: _root.reload() signal.signal(signal.SIGHUP, sighup) except AttributeError: pass _root.console_write('-'*40)
    Useless. Print a message with standard shape.
    """
    print "#######################################\n{}\n#######################################\n".format(str)


if __name__ == "__main__":
    myprint("Start computations")

    ####################################################################################################################
    # Get data
    ####################################################################################################################
    myprint("Load data")
    MyDataHandler = DataHandler()
    print "Loading train"
    train = MyDataHandler.get_train()

    # Pretty things up.
    column_names = {}
    for i in xrange(96*96):
        column_names[i] = 'pixel{}'.format(i)

    ####################################################################################################################
    # Split pixels:
    ####################################################################################################################
    myprint("Extract data")
    train_x = train.iloc[:800]['Image'].apply(lambda x: pd.Series([int(i) for i in x.split(' ')])).rename(columns=column_names)
    train_y = train.iloc[:800].ix[:, 0:30].fillna(0).astype(int)
import sys
from DataHandler import DataHandler


def analyzeUser(userNum, DH, baseDir):
    userNum = str(userNum)
    sys.stderr.write("User:" + userNum + "\n")
    outFile = baseDir + userNum
    DH.printMonthlyDataForUser(userNum, outFile)


if __name__ == '__main__':
    baseDir = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/DebugTime/"
    dataFile = baseDir + "allThreads.csv"
    userJoins = baseDir + "userJoins"
    activeForums = baseDir + "activeForums.csv"
    DH = DataHandler(dataFile, userJoins)
    DH.loadActiveForums(activeForums)
    #analyzeUser(9258, DH, baseDir)
    #analyzeUser(30702, DH, baseDir)
    analyzeUser(35541, DH, baseDir)
features, act_labels, verbose=True) train_data, act_train_labels = train_loader.time_series_to_section( train_ts.copy(), num_act_labels, sliding_window_size=200, step_size_of_sliding_window=10) test_data, act_test_labels = train_loader.time_series_to_section( test_ts.copy(), num_act_labels, sliding_window_size=200, step_size_of_sliding_window=10) print("---Data is successfully loaded") handler = DataHandler(train_data, test_data) norm_train = handler.normalise("train") norm_test = handler.normalise("test") print("--- Shape of Training Data:", train_data.shape) print("--- Shape of Test Data:", test_data.shape) expt_name = "thurs_Script_jog2" create_directories(expt_name) gan_ = GAN(norm_train.shape) trainer_ = Trainer(gan_, expt_name) trainer_.train_gan(epochs=200, batch_size=128, sample_interval=10, train_data=norm_train)
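time_series_to_section is the project's own windowing helper and its implementation is not shown in this excerpt. As a rough sketch of the general idea (fixed-length windows taken every `step` samples, with the label handling omitted), something like the following is typical:

import numpy as np

def sliding_windows(ts, window_size=200, step=10):
    """Cut a (time, channels) array into overlapping windows of shape
    (n_windows, window_size, channels). Generic illustration only, not the
    project's time_series_to_section."""
    windows = [ts[start:start + window_size]
               for start in range(0, len(ts) - window_size + 1, step)]
    return np.stack(windows)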
from CurlHandler import CurlHandler from DataHandler import DataHandler import sys while True: dataHandler = DataHandler() ip = dataHandler.getRandomHTTPS() if ip == "": print "Could not find any ips that need processing" sys.exit() print "Processing "+ip curlHandler = CurlHandler() response = curlHandler.getResponse(ip) dataHandler.setHTTPSPage(ip, response)
import tensorflow as tf from DataHandler import DataHandler from RNNGenerator import RNNGenerator from SessionRunner import SessionRunner log_path = 'output/tensorflow/' writer = tf.summary.FileWriter(log_path) # Load and prepare data data_handler = DataHandler() training_data = data_handler.read_data('Data/Zulu.txt') dictionary, reverse_dictionary = data_handler.build_datasets(training_data) # TensorFlow Graph input n_input = 3 n_units = 512 x = tf.placeholder("float", [None, n_input, 1]) y = tf.placeholder("float", [None, len(dictionary)]) # RNN output weights and biases weights = { 'out': tf.Variable(tf.random_normal([n_units, len(dictionary)])) } biases = { 'out': tf.Variable(tf.random_normal([len(dictionary)])) } rnn_generator = RNNGenerator()
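The internals of build_datasets and the training loop are not shown in this excerpt. As a rough illustration of how the [None, n_input, 1] inputs and one-hot targets declared above are usually assembled from a token list and its word-to-index dictionary, a hypothetical helper might look like this:

import numpy as np

def make_training_pairs(words, dictionary, n_input=3):
    """Turn a token list into (context, next-word) pairs matching the placeholder
    shapes above: x is [batch, n_input, 1] word indices and y is a one-hot vector
    over the vocabulary. Illustrative sketch only."""
    xs, ys = [], []
    for i in range(len(words) - n_input):
        context = [[dictionary[w]] for w in words[i:i + n_input]]
        target = np.zeros(len(dictionary), dtype=np.float32)
        target[dictionary[words[i + n_input]]] = 1.0
        xs.append(context)
        ys.append(target)
    return np.array(xs, dtype=np.float32), np.array(ys)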
from sklearn.tree import DecisionTreeClassifier from DataHandler import DataHandler import numpy as np from Utils import OneHotEncode from datetime import datetime allData = DataHandler.getAllData()[1:] bookingDate = [] origin = [] dest = [] deptDate = [] deptTime = [] pax = [] label = [] BOOKINGDATE = 1 ORIGIN = 2 DEST = 3 DEPTDATE = 4 DEPTTIME = 5 PAX = 6 LABEL = 7 for row in allData: bookingDate.append(row[BOOKINGDATE]) origin.append(row[ORIGIN]) dest.append(row[DEST]) deptDate.append(row[DEPTDATE]) deptTime.append(row[DEPTTIME]) pax.append(row[PAX])
def extract_job_log(jobId, logPath, userId): try: dataHandler = DataHandler() logs = k8sUtils.GetLog(jobId) jobLogDir = os.path.dirname(logPath) if not os.path.exists(jobLogDir): mkdirsAsUser(jobLogDir, userId) logStr = "" trimlogstr = "" for log in logs: if "podName" in log and "containerID" in log and "containerLog" in log: logStr += "=========================================================\n" logStr += "=========================================================\n" logStr += "=========================================================\n" logStr += " logs from pod: %s\n" % log["podName"] logStr += "=========================================================\n" logStr += "=========================================================\n" logStr += "=========================================================\n" logStr += log["containerLog"] logStr += "\n\n\n" logStr += "=========================================================\n" logStr += " end of logs from pod: %s\n" % log["podName"] logStr += "=========================================================\n" logStr += "\n\n\n" trimlogstr += "=========================================================\n" trimlogstr += "=========================================================\n" trimlogstr += "=========================================================\n" trimlogstr += " logs from pod: %s\n" % log["podName"] trimlogstr += "=========================================================\n" trimlogstr += "=========================================================\n" trimlogstr += "=========================================================\n" logLines = log["containerLog"].split('\n') if (len(logLines) < 3000): trimlogstr += log["containerLog"] trimlogstr += "\n\n\n" trimlogstr += "=========================================================\n" trimlogstr += " end of logs from pod: %s\n" % log[ "podName"] trimlogstr += "=========================================================\n" trimlogstr += "\n\n\n" else: trimlogstr += "\n".join(logLines[-2000:]) trimlogstr += "\n\n\n" trimlogstr += "=========================================================\n" trimlogstr += " end of logs from pod: %s\n" % log[ "podName"] trimlogstr += " Note: the log is too long to display in the webpage.\n" trimlogstr += " Only the last 2000 lines are shown here.\n" trimlogstr += " Please check the log file (in Job Folder) for the full logs.\n" trimlogstr += "=========================================================\n" trimlogstr += "\n\n\n" try: containerLogPath = os.path.join( jobLogDir, "log-container-" + log["containerID"] + ".txt") with open(containerLogPath, 'w') as f: f.write(log["containerLog"]) f.close() os.system("chown -R %s %s" % (userId, containerLogPath)) except Exception as e: print e if len(trimlogstr.strip()) > 0: dataHandler.UpdateJobTextField(jobId, "jobLog", trimlogstr) with open(logPath, 'w') as f: f.write(logStr) f.close() os.system("chown -R %s %s" % (userId, logPath)) except Exception as e: logging.error(e)
class Crawler:
    def __init__(self, url, dbFile, outputFile, maxCount=None):
        self.url = url  # url to be crawled
        if maxCount is None:
            self.maxCount = -1
        else:
            '''
            maxCount is the maximum number of links to be fetched by the crawler.
            It is incremented by one to accommodate the initial user input: the link
            entered by the user is also persisted in the repository, so it counts
            towards the total. I.e. if the user asks to crawl python.org and fetch
            2 links, the program should terminate when there are 3 links in the
            repository, python.org being one of them.
            '''
            self.maxCount = maxCount + 1
        self.extracter = LinkExtracter()
        self.dataHandler = DataHandler(self.maxCount, dbFile, outputFile)
        self.log = CrawlerLogger.getlogger()

    '''
    Crawls the link given by the user using BFS traversal until it fetches the
    specified number of links or until all the links have been fetched.
    '''
    def Crawl(self):
        try:
            link = self.url
            self.log.info("crawling " + link)
            links = self.extracter.fetchLinks(link)
            if links is None:
                print("Either the url you entered cannot be crawled or it does not contain any links")
                return False
            self.dataHandler.flushTable()
            maxLinkflag = self.dataHandler.saveUnprocessedLinks(links)
            if maxLinkflag:
                return self.writeDataToFile()
            else:
                self.crawlfetchedLinks()
                return self.writeDataToFile()
        except sqlite3.OperationalError as e:
            self.log.error(e, exc_info=sys.exc_info()[2])
            raise CrawlerError(('Either invalid database entered or database does not have necessary tables',))

    '''
    Helper for the Crawl function.
    '''
    def crawlfetchedLinks(self):
        maxLinkflag = True
        '''
        This loop terminates when the specified number of links has been fetched
        or all the links have been fetched.
        '''
        while True:
            link = self.dataHandler.getLinkforParsing()
            if link is None:
                break
            links = self.extracter.fetchLinks(link)
            self.dataHandler.setLinkAsProcessed(link)
            if links is not None:
                maxLinkflag = self.dataHandler.saveUnprocessedLinks(links)
            if maxLinkflag:
                break

    '''
    writeDataToFile writes the fetched links to file (links.txt) so that the user
    can use them.
    '''
    def writeDataToFile(self):
        try:
            res = self.dataHandler.exportData()
            self.dataHandler.flushTable()
            return res
        except IOError as e:
            self.log.error(e, exc_info=sys.exc_info()[2])
            raise CrawlerError(('The path of the export file entered is invalid',))
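A minimal usage sketch for the class above; the database and output file names are made up, and the sqlite file is assumed to already contain the tables the crawler expects:

# Hypothetical invocation: crawl python.org and stop once two additional links
# have been fetched (three rows total, counting the seed URL).
crawler = Crawler("http://python.org", "crawler.db", "links.txt", maxCount=2)
if crawler.Crawl():
    print("Finished; fetched links written to links.txt")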
    def saveHistory(self):
        if args.epoch == 0:
            return
        with open('History/' + args.save_path + '.his', 'wb') as fs:
            pickle.dump(self.metrics, fs)
        saver = tf.train.Saver()
        saver.save(self.sess, 'Models/' + args.save_path)
        log('Model Saved: %s' % args.save_path)

    def loadModel(self):
        saver = tf.train.Saver()
        saver.restore(self.sess, 'Models/' + args.load_model)
        with open('History/' + args.load_model + '.his', 'rb') as fs:
            self.metrics = pickle.load(fs)
        log('Model Loaded')


if __name__ == '__main__':
    logger.saveDefault = True
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    log('Start')
    handler = DataHandler()
    log('Load Data')
    with tf.Session(config=config) as sess:
        model = Model(sess, handler)
        model.run()
import sklearn.metrics as Metrices import numpy as np from DataHandler import DataHandler import csv import sys pair = dict() counter = 0 for pairInFile in open("data\Pair.txt", "r"): pairInFile = pairInFile.rstrip('\n') pair[pairInFile] = counter counter += 1 fareClass = {"Classic" : 1, "Deal": 2, "Flex": 3, "Saver": 4} trainingData = DataHandler.getTrainingData() x = np.empty([len(trainingData),44], dtype=int) y = np.empty([len(trainingData)], dtype=int) counter = 0 for trainingList in trainingData: oriDestPair = trainingList[2] + trainingList[3] paxCount = int(trainingList[-2]) tempXList = [0] * 44 tempXList[pair[oriDestPair]] = 1 if paxCount > 1: tempXList[43] = 1 else: tempXList[42] = 1 x[counter] = tempXList
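The 44-slot layout built in the loop above is easier to see as a small helper: one slot per origin-destination pair (indices 0-41) plus two flags for single vs. multiple passengers (indices 42 and 43). The function below is a hypothetical restatement of the inline loop, not code from the project:

def encode_row(ori_dest_pair, pax_count, pair_index, n_pairs=42):
    """Build the 44-dimensional feature vector used above: a one-hot slot per
    origin-destination pair plus two passenger-count flags. Hypothetical helper
    mirroring the inline loop."""
    vec = [0] * (n_pairs + 2)
    vec[pair_index[ori_dest_pair]] = 1
    # Last two slots: index n_pairs for a single passenger, n_pairs + 1 otherwise.
    vec[n_pairs + 1 if pax_count > 1 else n_pairs] = 1
    return vec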
from SiameseNet import SiameseNet from ModelHandler import ModelHandler from DataHandler import DataHandler from plotters import Plotter # %% MNIST dataset dh = DataHandler("MNIST", classes_to_select=[0, 1, 2, 3, 4, 5, 6]) #dh_newdata = DataHandler("MNIST", classes_to_select=[7,8,9]) # %% Define embedding model mh = ModelHandler(model_number=4, embedding_size=200, input_feature_dim=dh.shape) # %% Define siamese net #alphas = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1] alphas = [0.1] for alpha in alphas: net = SiameseNet(mh, dh, alpha) net.print_model() batch_size = 200 epochs = 1 steps_per_epoch = 1 #int(dh.n_train / batch_size) history = net.train("create_pair_batch_random", batch_size, epochs, steps_per_epoch) # % Plot loss # Losses plotter = Plotter() plotter.plot_losses(net, history)
from DataHandler import DataHandler if __name__ == "__main__": dataFile = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/allThreads.csv" userJoins = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/userJoins" DH = DataHandler(dataFile, userJoins) DH.loadActiveForums() outFile = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/activeForums.tsv" DH.printActiveForums(outFile)