class TopicChange:
    def __init__(self, dataFile, userJoins, activeForums):
        sys.stderr.write("Started\n")
        self.dataHandler = DataHandler(dataFile, userJoins)
        self.dataHandler.loadActiveForums(activeForums)
        sys.stderr.write("Data loaded\n")
        self.post2Month = self.dataHandler.getPost2Month()
        self.doc2Post = self.dataHandler.getDoc2Post()
        self.post2User = self.dataHandler.getPost2User()
        sys.stderr.write("Got the dicts\n")

    def loadInferredTopics(self, topicsOutput):
        userMonth = dd(lambda: dd(int))
        numUsers = set()
        csvReader = csv.reader(open(topicsOutput))
        for doc in csvReader:
            # if len(doc)<21:
            #  continue
            # print 'phani'
            docId = doc[0]
            # topic5Num = doc[5]
            # topic19Num = doc[19]
            userId = self.post2User[self.doc2Post[docId]]
            # month = self.post2Month[self.doc2Post[docId]]
            # userMonth[userId][month] += topic5Num
            numUsers.add(userId)
        # for user in userMonth.iterkeys():
        #  for month in userMonth[user].iterkeys():
        #    print user, month, userMonth[user][month]
        print len(numUsers)
 def __init__(self, dataFile, userJoins):
   sys.stderr.write('In Constructor\n')
   self.distComparer = DistComparer()
   self.dataHandler = DataHandler(dataFile, userJoins)
   self.dataHandler.loadActiveForums()
   self.__loadData()
   self.sampledUsers = set()
Example #4
 def __init__(self, outDir):
   self.config = None
   self.__csHash = set()
   self.__outputDir = outDir
   self.__csInstance = CSHandler()
   self.__dataHandler = DataHandler()
   self.__utils = Utils()
   self.__Tree = Dependencytree()
   self.__fileSuffix = ""
   self.prepareConfig()
Example #5
 def __init__(self,url,dbFile,outputFile,maxCount=None):
     self.url = url # url to be crawled
     if maxCount == None:
         self.maxCount = -1
     else:
         '''
         maxcount is the maximum number of links to be fetched by the crawler.
         It is incremented as we should accommodate the initial user input while 
         counting the total number of links in the repository as the link entered by the user
         will also be persisted in the repository
         (i.e)if user requests to crawl python.org and asks to fetch 2 links , the program should 
         terminate when there are 3 links in repository as python.org is also one of the links in repository   
         '''
         self.maxCount = maxCount + 1
         
     self.extracter = LinkExtracter()
     self.dataHandler = DataHandler(self.maxCount,dbFile,outputFile)
     self.log = CrawlerLogger.getlogger()
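# A small illustration of the off-by-one rule described in the docstring
# above: if the user asks for 2 links, the crawler stores maxCount = 3,
# because the seed URL entered by the user is also persisted in the
# repository. The values below are purely illustrative.
requested_links = 2              # what the user asked for
max_count = requested_links + 1  # seed URL + requested links
print(max_count)                 # 3 -> stop once 3 links are in the repository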
Example #6
def update_job_priorites(job_priorites):
    dataHandler = DataHandler()
    success = dataHandler.update_job_priority(job_priorites)
    dataHandler.Close()
    return success
Example #7
def update_job(job_id, field, value):
    dataHandler = DataHandler()
    dataHandler.UpdateJobTextField(job_id, field, value)
    dataHandler.Close()
Example #8
def SubmitJob(jobParamsJsonStr):
    ret = {}

    jobParams = LoadJobParams(jobParamsJsonStr)

    if "jobName" not in jobParams or len(jobParams["jobName"].strip()) == 0:
        ret["error"] = "ERROR: Job name cannot be empty"
        return ret
    if "vcName" not in jobParams or len(jobParams["vcName"].strip()) == 0:
        ret["error"] = "ERROR: VC name cannot be empty"
        return ret

    if "preemptionAllowed" not in jobParams:
        jobParams["preemptionAllowed"] = False
    else:
        jobParams["preemptionAllowed"] = ToBool(jobParams["preemptionAllowed"])

    if "jobId" not in jobParams or jobParams["jobId"] == "":
        #jobParams["jobId"] = jobParams["jobName"] + "-" + str(uuid.uuid4())
        #jobParams["jobId"] = jobParams["jobName"] + "-" + str(time.time())
        jobParams["jobId"] = str(uuid.uuid4())
    #jobParams["jobId"] = jobParams["jobId"].replace("_","-").replace(".","-")

    if "resourcegpu" not in jobParams:
        jobParams["resourcegpu"] = 0

    if isinstance(jobParams["resourcegpu"], basestring):
        if len(jobParams["resourcegpu"].strip()) == 0:
            jobParams["resourcegpu"] = 0
        else:
            jobParams["resourcegpu"] = int(jobParams["resourcegpu"])

    if "familyToken" not in jobParams or jobParams["familyToken"].isspace():
        jobParams["familyToken"] = str(uuid.uuid4())
    if "isParent" not in jobParams:
        jobParams["isParent"] = 1

    userName = jobParams["userName"]
    if "@" in userName:
        userName = userName.split("@")[0].strip()

    if "/" in userName:
        userName = userName.split("/")[1].strip()

    if not AuthorizationManager.HasAccess(
            jobParams["userName"], ResourceType.VC,
            jobParams["vcName"].strip(), Permission.User):
        ret["error"] = "Access Denied!"
        return ret

    if "cmd" not in jobParams:
        jobParams["cmd"] = ""

    if "jobPath" in jobParams and len(jobParams["jobPath"].strip()) > 0:
        jobPath = jobParams["jobPath"]
        if ".." in jobParams["jobPath"]:
            ret["error"] = "ERROR: '..' cannot be used in job directory"
            return ret

        if "\\." in jobParams["jobPath"]:
            ret["error"] = "ERROR: invalided job directory"
            return ret

        if jobParams["jobPath"].startswith(
                "/") or jobParams["jobPath"].startswith("\\"):
            ret["error"] = "ERROR: job directory should not start with '/' or '\\' "
            return ret

        if not jobParams["jobPath"].startswith(userName):
            jobParams["jobPath"] = os.path.join(userName, jobParams["jobPath"])

    else:
        jobPath = userName + "/" + "jobs/" + time.strftime(
            "%y%m%d") + "/" + jobParams["jobId"]
        jobParams["jobPath"] = jobPath

    if "workPath" not in jobParams or len(jobParams["workPath"].strip()) == 0:
        jobParams["workPath"] = "."

    if ".." in jobParams["workPath"]:
        ret["error"] = "ERROR: '..' cannot be used in work directory"
        return ret

    if "\\." in jobParams["workPath"]:
        ret["error"] = "ERROR: invalided work directory"
        return ret

    if jobParams["workPath"].startswith(
            "/") or jobParams["workPath"].startswith("\\"):
        ret["error"] = "ERROR: work directory should not start with '/' or '\\' "
        return ret

    if not jobParams["workPath"].startswith(userName):
        jobParams["workPath"] = os.path.join(userName, jobParams["workPath"])

    if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0:
        jobParams["dataPath"] = "."

    if ".." in jobParams["dataPath"]:
        ret["error"] = "ERROR: '..' cannot be used in data directory"
        return ret

    if "\\." in jobParams["dataPath"]:
        ret["error"] = "ERROR: invalided data directory"
        return ret

    if jobParams["dataPath"][0] == "/" or jobParams["dataPath"][0] == "\\":
        ret["error"] = "ERROR: data directory should not start with '/' or '\\' "
        return ret

    jobParams["dataPath"] = jobParams["dataPath"].replace("\\", "/")
    jobParams["workPath"] = jobParams["workPath"].replace("\\", "/")
    jobParams["jobPath"] = jobParams["jobPath"].replace("\\", "/")
    jobParams["dataPath"] = os.path.realpath(
        os.path.join("/", jobParams["dataPath"]))[1:]
    jobParams["workPath"] = os.path.realpath(
        os.path.join("/", jobParams["workPath"]))[1:]
    jobParams["jobPath"] = os.path.realpath(
        os.path.join("/", jobParams["jobPath"]))[1:]

    dataHandler = DataHandler()
    if "logDir" in jobParams and len(jobParams["logDir"].strip()) > 0:
        tensorboardParams = jobParams.copy()

        # overwrite for distributed job
        if tensorboardParams["jobtrainingtype"] == "PSDistJob":
            tensorboardParams["jobtrainingtype"] = "RegularJob"
            match = re.match('(.*)(/.*)', tensorboardParams["logDir"])
            if not match is None:
                newDir = match.group(1) + "/worker0" + match.group(2)
                prefix = match.group(1)
                match2 = re.match('.*/worker0', prefix)
                if match2 is None:
                    tensorboardParams["logDir"] = newDir
            #match = re.match('(.*--logdir\s+.*)(/.*--.*)', tensorboardParams["cmd"])
            #if not match is None:
            #    tensorboardParams["cmd"] = match.group(1) + "/worker0" + match.group(2)

        tensorboardParams["jobId"] = str(uuid.uuid4())
        tensorboardParams["jobName"] = "tensorboard-" + jobParams["jobName"]
        tensorboardParams["jobPath"] = jobPath
        tensorboardParams["jobType"] = "visualization"
        tensorboardParams["cmd"] = "tensorboard --logdir " + tensorboardParams[
            "logDir"] + " --host 0.0.0.0"
        tensorboardParams["image"] = jobParams["image"]
        tensorboardParams["resourcegpu"] = 0

        tensorboardParams["interactivePort"] = "6006"

        if "error" not in ret:
            if not dataHandler.AddJob(tensorboardParams):
                ret["error"] = "Cannot schedule tensorboard job."

    if "error" not in ret:
        if dataHandler.AddJob(jobParams):
            ret["jobId"] = jobParams["jobId"]
        else:
            ret["error"] = "Cannot schedule job. Cannot add job into database."

    dataHandler.Close()
    InvalidateJobListCache(jobParams["vcName"])
    return ret
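# A quick check of the path normalization applied above: every user-supplied
# path is joined onto "/", resolved with os.path.realpath, and the leading
# slash is dropped, so "." segments, duplicate slashes and symlinks cannot
# move the result outside the storage root. The example path is illustrative.
import os.path
print(os.path.realpath(os.path.join("/", "alice/./work/../data"))[1:])
# expected output on a typical system: alice/data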
Example #9
#!/usr/bin/env python
# coding=utf-8

import thread, traceback, signal, socket, sys
from urllib import urlopen

from DataHandler import DataHandler
from Client import Client
from NATServer import NATServer
from Dispatcher import Dispatcher

import ip2country # just to make sure it's downloaded
import ChanServ

_root = DataHandler()
_root.parseArgv(sys.argv)

try:
	signal.SIGHUP
	
	def sighup(sig, frame):
		_root.console_write('Received SIGHUP.')
		if _root.sighup:
			_root.reload()

	signal.signal(signal.SIGHUP, sighup)
except AttributeError:
	pass

_root.console_write('-'*40)
_root.console_write('Starting uberserver...\n')
Example #10
import signal, sys

from DataHandler import DataHandler
from Client import Client
from NATServer import NATServer
from XmlRpcServer import XmlRpcServer

import ip2country # just to make sure it's downloaded
import ChanServ
import twistedserver

# uncomment for debugging deadlocks, creates a stacktrace at the given interval to stdout
#import stacktracer
#stacktracer.trace_start("trace.html",interval=5,auto=True) # Set auto flag to always update file!


_root = DataHandler()
_root.parseArgv(sys.argv)

try:
	signal.SIGHUP
	
	def sighup(sig, frame):
		_root.console_write('Received SIGHUP.')
		if _root.sighup:
			_root.reload()

	signal.signal(signal.SIGHUP, sighup)
except AttributeError:
	pass

_root.console_write('-'*40)
Example #11
def UpdateJobStatus(job):
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))

    if job["jobStatus"] == "scheduling" and jobParams[
            "jobtrainingtype"] == "PSDistJob":
        launch_ps_dist_job(jobParams)

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))

    logging.info("job %s status: %s,%s" %
                 (job["jobId"], result, json.dumps(detail)))

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None
    if "userId" not in jobParams:
        jobParams["userId"] = "0"
    if result.strip() == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)

    elif result.strip() == "Running":
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")

        if "interactivePort" in jobParams:
            serviceAddress = k8sUtils.GetServiceAddress(job["jobId"])
            serviceAddress = base64.b64encode(json.dumps(serviceAddress))
            dataHandler.UpdateJobTextField(job["jobId"], "endpoints",
                                           serviceAddress)

    elif result.strip() == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)

    elif result.strip() == "Unknown":
        if job["jobId"] not in UnusualJobs:
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).seconds > 300:
            del UnusualJobs[job["jobId"]]
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" %
                         job["jobId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "cannot launch the job.")
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)
            else:
                printlog(
                    "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                    % (job["jobId"], retries))
                SubmitJob(job)
    elif result.strip() == "PendingHostPort":
        printlog(
            "Cannot find host ports for job :%s, re-launch the job with different host ports "
            % (job["jobId"]))

        SubmitJob(job)

    if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]
Example #12
def extract_job_log(jobId, logPath, userId):
    try:
        dataHandler = DataHandler()

        # TODO: Replace joblog manager with elastic search
        logs = k8sUtils.GetLog(jobId, tail=None)

        # Do not overwrite existing logs with empty log
        # DLTS bootstrap will generate logs for all containers.
        # If one container has empty log, skip writing.
        for log in logs:
            if "containerLog" in log and log["containerLog"] == "":
                return

        jobLogDir = os.path.dirname(logPath)
        if not os.path.exists(jobLogDir):
            mkdirsAsUser(jobLogDir, userId)
        logStr = ""
        trimlogstr = ""

        for log in logs:
            if "podName" in log and "containerID" in log and "containerLog" in log:
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "        logs from pod: %s\n" % log["podName"]
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += log["containerLog"]
                logStr += "\n\n\n"
                logStr += "=========================================================\n"
                logStr += "        end of logs from pod: %s\n" % log["podName"]
                logStr += "=========================================================\n"
                logStr += "\n\n\n"

                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                trimlogstr += "        logs from pod: %s\n" % log["podName"]
                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                logLines = log["containerLog"].split('\n')
                if (len(logLines) < 3000):
                    trimlogstr += log["containerLog"]
                    trimlogstr += "\n\n\n"
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "        end of logs from pod: %s\n" % log[
                        "podName"]
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "\n\n\n"
                else:
                    trimlogstr += "\n".join(logLines[-2000:])
                    trimlogstr += "\n\n\n"
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "        end of logs from pod: %s\n" % log[
                        "podName"]
                    trimlogstr += "        Note: the log is too long to display in the webpage.\n"
                    trimlogstr += "        Only the last 2000 lines are shown here.\n"
                    trimlogstr += "        Please check the log file (in Job Folder) for the full logs.\n"
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "\n\n\n"

                try:
                    containerLogPath = os.path.join(
                        jobLogDir,
                        "log-container-" + log["containerID"] + ".txt")
                    with open(containerLogPath, 'w') as f:
                        f.write(log["containerLog"])
                    os.system("chown -R %s %s" % (userId, containerLogPath))
                except Exception as e:
                    logger.exception("write container log failed")

        if len(trimlogstr.strip()) > 0:
            dataHandler.UpdateJobTextField(jobId, "jobLog",
                                           base64.b64encode(trimlogstr))
            with open(logPath, 'w') as f:
                f.write(logStr)
            os.system("chown -R %s %s" % (userId, logPath))

    except Exception as e:
        logging.error(e)
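# Quick illustration of the trimming rule above: container logs shorter than
# 3000 lines are kept verbatim in the web view, longer ones are cut down to
# their last 2000 lines (the full log is still written to the file on disk).
logLines = ["line %d" % i for i in range(5000)]
trimmed = "\n".join(logLines[-2000:])
print(len(trimmed.split("\n")))  # 2000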
Example #13
def GetClusterStatus():
    job = None
    dataHandler = DataHandler()
    cluster_status,last_update_time =  dataHandler.GetClusterStatus()
    dataHandler.Close()
    return cluster_status,last_update_time
Example #14
def GetCommands(jobId):
    dataHandler = DataHandler()
    commands = dataHandler.GetCommands(jobId=jobId)
    dataHandler.Close()
    return commands
Example #15
def TakeJobActions(jobs):
    dataHandler = DataHandler()
    vcList = dataHandler.ListVCs()
    dataHandler.Close()

    localResInfo = ResourceInfo()
    globalResInfo = ResourceInfo()

    for vc in vcList:
        localResInfo.Add(ResourceInfo(vc["vcName"], json.loads(vc["quota"])))
        globalResInfo.Add(ResourceInfo("", json.loads(vc["quota"])))

    jobsInfo = []
    for job in jobs:
        if job["jobStatus"] == "queued" or job[
                "jobStatus"] == "scheduling" or job["jobStatus"] == "running":
            singleJobInfo = {}
            singleJobInfo["job"] = job
            singleJobInfo["jobParams"] = json.loads(
                base64.b64decode(job["jobParams"]))
            jobGpuType = "any"
            if "gpuType" in singleJobInfo["jobParams"]:
                jobGpuType = singleJobInfo["jobParams"]["gpuType"]
            singleJobInfo["localResInfo"] = ResourceInfo.FromTypeAndCount(
                job["vcName"], jobGpuType,
                singleJobInfo["jobParams"]["resourcegpu"])
            singleJobInfo["globalResInfo"] = ResourceInfo.FromTypeAndCount(
                "", jobGpuType, singleJobInfo["jobParams"]["resourcegpu"])
            singleJobInfo["sortKey"] = str(job["jobTime"])
            if singleJobInfo["jobParams"]["preemptionAllowed"]:
                singleJobInfo["sortKey"] = "1_" + singleJobInfo["sortKey"]
            else:
                singleJobInfo["sortKey"] = "0_" + singleJobInfo["sortKey"]
            singleJobInfo["allowed"] = False
            jobsInfo.append(singleJobInfo)

    jobsInfo.sort(key=JobInfoSorter)

    logging.info("TakeJobActions : local resources : %s" %
                 (localResInfo.CategoryToCountMap))
    logging.info("TakeJobActions : global resources : %s" %
                 (globalResInfo.CategoryToCountMap))

    for sji in jobsInfo:
        logging.info("TakeJobActions : job : %s : %s : %s" %
                     (sji["jobParams"]["jobName"],
                      sji["localResInfo"].CategoryToCountMap, sji["sortKey"]))
        if sji["jobParams"]["preemptionAllowed"]:
            localResInfo.UnblockResourceCategory(sji["localResInfo"])

        if (localResInfo.CanSatisfy(sji["localResInfo"])):
            localResInfo.Subtract(sji["localResInfo"])
            globalResInfo.Subtract(sji["globalResInfo"])
            sji["allowed"] = True
            logging.info("TakeJobActions : local assignment : %s : %s" %
                         (sji["jobParams"]["jobName"],
                          sji["localResInfo"].CategoryToCountMap))
        elif not sji["jobParams"]["preemptionAllowed"]:
            localResInfo.BlockResourceCategory(
                sji["localResInfo"])  #FIFO scheduling

    #logging.info("TakeJobActions : local resources : %s" % (localResInfo.CategoryToCountMap))
    #logging.info("TakeJobActions : global resources : %s" % (globalResInfo.CategoryToCountMap))

    for sji in jobsInfo:
        if (sji["jobParams"]["preemptionAllowed"] and sji["allowed"] == False):
            if globalResInfo.CanSatisfy(sji["globalResInfo"]):
                logging.info("TakeJobActions : job : %s : %s" %
                             (sji["jobParams"]["jobName"],
                              sji["globalResInfo"].CategoryToCountMap))
                # Strict FIFO policy not required for global (bonus) tokens since these jobs are anyway pre-emptible.
                globalResInfo.Subtract(sji["globalResInfo"])
                sji["allowed"] = True
                logging.info("TakeJobActions : global assignment : %s : %s" %
                             (sji["jobParams"]["jobName"],
                              sji["globalResInfo"].CategoryToCountMap))

    logging.info("TakeJobActions : global resources : %s" %
                 (globalResInfo.CategoryToCountMap))

    for sji in jobsInfo:
        if sji["job"]["jobStatus"] == "queued" and sji["allowed"] == True:
            SubmitJob(sji["job"])
            logging.info("TakeJobActions : submitting job : %s : %s : %s" %
                         (sji["jobParams"]["jobName"],
                          sji["jobParams"]["jobId"], sji["sortKey"]))
        elif (sji["job"]["jobStatus"] == "scheduling"
              or sji["job"]["jobStatus"]
              == "running") and sji["allowed"] == False:
            KillJob(sji["job"], "queued")
            logging.info("TakeJobActions : pre-empting job : %s : %s : %s" %
                         (sji["jobParams"]["jobName"],
                          sji["jobParams"]["jobId"], sji["sortKey"]))

    logging.info("TakeJobActions : job desired actions taken")
Example #16
def launch_ps_dist_job(jobParams):
    job_id = jobParams["jobId"]
    pods = k8sUtils.GetPod("run=" + job_id)

    # if any pod is not up, return
    if "items" not in pods or len(pods["items"]) != (
            int(jobParams["numpsworker"]) + int(jobParams["numps"])):
        return
    # if any pod is not ready, return
    pod_status = [k8sUtils.check_pod_status(pod) for pod in pods["items"]]
    if any([status != "Running" for status in pod_status]):
        return

    user_name = getAlias(jobParams["userName"])
    if "hostNetwork" in jobParams and jobParams["hostNetwork"]:
        host_network = True
    else:
        host_network = False

    # setup ssh server
    for [idx, pod] in enumerate(pods["items"]):
        pod_name = pod["metadata"]["name"]
        dist_port = pod["metadata"]["labels"]["distPort"]
        # quit if can't setup ssh server
        ssh_port = start_ssh_server(pod_name, user_name, host_network,
                                    dist_port)

    # generate ssh config
    ssh_config = """
Host %s
  HostName %s
  Port %s
  User %s
  StrictHostKeyChecking no
  UserKnownHostsFile /dev/null
                """
    sshconfigstr = ""
    for [idx, pod] in enumerate(pods["items"]):
        pod_ip = pod["status"]["podIP"]
        dist_port = pod["metadata"]["labels"]["distPort"]
        role = pod["metadata"]["labels"]["distRole"]
        role_idx = pod["metadata"]["labels"]["distRoleIdx"]

        # TODO hostNetwork
        if host_network:
            sshconfigstr += (
                ssh_config %
                (role + "-" + str(role_idx), pod_ip, str(dist_port), user_name)
                + "\n")
        else:
            sshconfigstr += (
                ssh_config %
                (role + "-" + str(role_idx), pod_ip, 22, user_name) + "\n")

    # config ssh client
    for [idx, pod] in enumerate(pods["items"]):
        pod_name = pod["metadata"]["name"]
        bash_script = "cat > /home/" + user_name + "/.ssh/config <<EOF " + sshconfigstr + "\nEOF"
        print("override ssh client config: %s" % bash_script)
        k8sUtils.kubectl_exec(
            "exec %s -- bash -c \'%s\' ; chown -R %s /home/%s/.ssh/config" %
            (pod_name, bash_script, user_name, user_name))

        # fix ~/.ssh/ folder permission
        k8sUtils.kubectl_exec(
            "exec %s -- chmod 600 -R /home/%s/.ssh; chmod 700 /home/%s/.ssh; chown -R %s /home/%s/.ssh/config"
            % (pod_name, user_name, user_name, user_name, user_name))

    # generate hostfile
    hostfilecontent = ""
    for [_, pod] in enumerate(pods["items"]):
        role = pod["metadata"]["labels"]["distRole"]
        if role == "ps":
            continue
        role_idx = pod["metadata"]["labels"]["distRoleIdx"]
        worker_gpu_num = pod["spec"]["containers"][0]["resources"]["requests"][
            "nvidia.com/gpu"]
        hostfilecontent += "%s  slots=%s\n" % ("worker-" + str(role_idx),
                                               worker_gpu_num)
    tmp_hostfile = "/tmp/" + job_id + ".hostfile"
    with open(tmp_hostfile, 'w') as f:
        f.write(hostfilecontent + "\n")
    # write the hostfile
    for [idx, pod] in enumerate(pods["items"]):
        pod_name = pod["metadata"]["name"]
        remotecmd = "cp %s %s:/job/hostfile" % (tmp_hostfile, pod_name)
        k8sUtils.kubectl_exec(remotecmd)

    for [idx, pod] in enumerate(pods["items"]):
        pod_name = pod["metadata"]["name"]
        k8sUtils.kubectl_exec("exec %s touch /opt/run_dist_job" % pod_name)

    # execute user command
    #k8sUtils.kubectl_exec("exec %s -- bash -c 'runuser -l ${DLWS_USER_NAME} <<EOF_USER_SCRIPT %s \nEOF_USER_SCRIPT'" % (pod_name, jobParams["cmd"]))

    # update job status
    dataHandler = DataHandler()
    dataHandler.UpdateJobTextField(job_id, "jobStatus", "running")
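# Sketch of the MPI-style hostfile assembled above for a job with two worker
# pods that each request 4 GPUs (the counts are illustrative); ps pods are
# skipped on purpose.
hostfilecontent = ""
for role_idx, worker_gpu_num in [(0, 4), (1, 4)]:
    hostfilecontent += "%s  slots=%s\n" % ("worker-" + str(role_idx), worker_gpu_num)
print(hostfilecontent)
# worker-0  slots=4
# worker-1  slots=4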
Example #17
File: test.py Project: Jingoo88/CA2
            for j in builders:
                if j in z["model"]:

                    ur = z["model"]
                    ur = ur.replace("\u00e9", "é")
                    z["builder"] = j.replace("\u00e9", "é")

                    z["builder"] = j.decode("UTF-8")

                    z["model"] = z["model"].replace(j, "")


with open("test.json") as data_file:
    data = json.load(data_file)

data_handler = DataHandler(usr=data["usr"], table=data["table"], url=data["url"], pwd=data["pwd"])


var_map = {
    "builder": "CONSTRUCTEUR",
    "Displacement": "DEPLACEMENT",
    "Beam": "MAITRE_BAU",
    "Length": "LONGUEUR",
    "model": "MODELE",
    "type": "TYPE",
    "No. of Beds": "CAPACITE_COUCHAGE",
    "Max speed": "VITESSE_MAX",
    "Range": "RANGE_NAV",
    "Engine": "MOTEUR",
    "Cruising speed": "VITESSE_CROISIERE",
    "Length Waterline": "LONGUEUR_LIGNE_FLOTTAISON",
Example #18
from DataHandler import DataHandler
import sys

handler = DataHandler()

#handler.addRandomIp(443)

for line in sys.stdin:

	line = line.replace("\n","")
	line = line.replace("\r","")
	line = line.replace(" ","")
	
	if(line != ""):
		print line
		handler.addIp(line,443)
Example #19
import traceback, signal, socket, sys, logging
from twisted.internet import reactor
from twisted.internet import task

sys.path.append("protocol")
sys.path.append(".")

from DataHandler import DataHandler
from Client import Client
from NATServer import NATServer

import ip2country # just to make sure it's downloaded
import ChanServ
import twistedserver

_root = DataHandler()
_root.parseArgv(sys.argv)

try:
	signal.SIGHUP

	def sighup(sig, frame):
		logging.info('Received SIGHUP.')
		if _root.sighup:
			_root.reload()

	signal.signal(signal.SIGHUP, sighup)
except AttributeError:
	pass

logging.info('Starting uberserver...')
Example #20
import numpy as np

from DataHandler import DataHandler
from ModelHandler import Model
"""
The script used to generate new music. It creates a model and loads a weights file. Then it gets a seed
from the DataHandler object and starts the generation process.
"""
settings = DataHandler.get_config_params()
data_handler = DataHandler()

print 'Building model'
neurons = settings["neurons"]
dropout = settings["dropout"]
l_rate = settings["learning_rate"]
epochs = settings["epochs"]
optimizer = settings["optimizer"]
model = Model(neurons=neurons, dropout=dropout, learning_rate=l_rate, optimizer=optimizer, desired_loss=0.3)

print 'Loading weights'
data_handler.get_weights(model, settings)


def sample(prob_distribution, temperature=1.0):
    """
    A function to sample an index from the array of probabilities.

    :param prob_distribution: an array with probabilities for each class (note).
    :param temperature: denominator parameter used to divide the natural log of each element. Helps in transforming
    values in the probability array
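# The snippet above is cut off by the listing. A common way to implement such
# a temperature-scaled sampler (a sketch, not necessarily the author's exact
# code) looks like this:
import numpy as np

def sample_sketch(prob_distribution, temperature=1.0):
    # divide the natural log of each probability by the temperature,
    # re-exponentiate and renormalize, then draw a single index
    preds = np.asarray(prob_distribution).astype("float64")
    preds = np.log(preds + 1e-12) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return int(np.argmax(np.random.multinomial(1, preds, 1)))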
Example #21
def analyze_file(data: DataHandler, verbose=False):
    """
    Funkcja do szybkiej analizy pliku. Odpala wszystkie algorytmy i rysuje ich rozwiązania.
    :param data:
    :param verbose: Dla dokladniejszej analizy i rysowania trzeba podać argument True do funkcji
    """
    algos = TSPAlgorithms(data)
    K = 10000

    # only used in verbose and euc2d but need scope
    axs: List[List[plt.Axes]]
    fig: plt.Figure
    fig, axs = plt.subplots(7, 2, figsize=(20, 70))
    axs[0][1].remove()

    random_permutation = np.random.permutation(data.dimension)

    algorithms_and_parameters = [
        #[algos.k_random, tuple([K]), axs[0][0]],
        #[algos.closest_neighbour, tuple(), axs[1][0]],
        #[algos.repetitive_closest_neighbour, tuple(), axs[1][1]],
        [
            algos.two_opt,
            tuple([neighbourings.invert, random_permutation]), axs[2][0]
        ],
        #[algos.two_opt, tuple([neighbourings.swap, random_permutation]), axs[2][1]],
        #[algos.taboo_search, tuple(["accelerate", neighbourings.invert, random_permutation]), axs[3][0]],
        #[algos.taboo_search, tuple(["accelerate", neighbourings.swap, random_permutation]), axs[3][1]],
        [
            algos.taboo_search,
            tuple(
                ["accelerate_moves", neighbourings.invert,
                 random_permutation]), axs[4][0]
        ],
        #[algos.taboo_search, tuple(["cycled_accelerate", neighbourings.swap, random_permutation]), axs[4][1]],
        [
            algos.taboo_search,
            tuple([
                "stagnation_accelerate", neighbourings.invert,
                random_permutation
            ]), axs[5][0]
        ],
        #[algos.taboo_search, tuple(["stagnation_accelerate", neighbourings.swap, random_permutation]), axs[5][1]],
        [
            algos.taboo_search,
            tuple(
                ["long_term_memory", neighbourings.invert,
                 random_permutation]), axs[6][0]
        ],
        #[algos.taboo_search, tuple(["long_term_memory", neighbourings.swap, random_permutation]), axs[6][1]]
    ]

    for algorithm in algorithms_and_parameters:
        algo, parameter_list, ax = algorithm
        ax: plt.Axes
        function_label = f"{algo.__name__}({', '.join(list(map(str, parameter_list)))})"
        time_before = time.time()
        cost = algo(*parameter_list)
        time_after = time.time()
        print(
            f"{function_label} = {cost}, took {round(time_after-time_before,2)} seconds"
        )  # func(*params) = func(params[0], params[1],...,params[k])
        solution = algos.last_solution
        if verbose:
            print(f"Solution:", algos.last_solution)
            if data.isEuc2D():
                edges = [(solution[i], solution[i + 1])
                         for i in range(len(solution) - 1)]
                edges.append((solution[len(solution) - 1], solution[0]))
                nx.draw(data.getGraph(),
                        ax=ax,
                        pos=data.getPos(),
                        with_labels=True,
                        node_size=300,
                        node_color="#ADD8E6")
                nx.draw_networkx_edges(data.getGraph(),
                                       pos=data.getPos(),
                                       ax=ax,
                                       edgelist=edges,
                                       width=2)
                ax.title.set_text(function_label +
                                  f"\n cost={algos.last_cost}")
    if verbose and data.isEuc2D():
        fig.suptitle(f"Wykres algorytmów dla instancji {data.name}",
                     fontsize=16)
        plt.tight_layout()
        plt.show()
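# Small check of the tour-edge construction used in the verbose branch above:
# a solution [0, 2, 1] is drawn as the edges (0, 2) and (2, 1) plus the
# closing edge (1, 0).
solution = [0, 2, 1]
edges = [(solution[i], solution[i + 1]) for i in range(len(solution) - 1)]
edges.append((solution[len(solution) - 1], solution[0]))
print(edges)  # [(0, 2), (2, 1), (1, 0)]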
Example #22
def AddUser(username,userId):
    ret = None
    dataHandler = DataHandler()
    ret =  dataHandler.AddUser(username,userId)
    dataHandler.Close()
    return ret
Example #23
def SubmitPSDistJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]
        distJobParams = {}
        distJobParams["ps"] = []
        distJobParams["worker"] = []
        assignedRack = None
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])
        if jobParams["jobtrainingtype"] == "PSDistJob":
            jobDescriptionList = []
            nums = {
                "ps": int(jobParams["numps"]),
                "worker": int(jobParams["numpsworker"])
            }
            for role in ["ps", "worker"]:
                for i in range(nums[role]):
                    distJobParam = copy.deepcopy(jobParams)
                    distJobParam["distId"] = "%s%d" % (role, i)
                    distJobParam["distRole"] = role

                    if "jobPath" not in distJobParam or len(
                            distJobParam["jobPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: job-path does not exist")
                        return False

                    distJobParam["distJobPath"] = os.path.join(
                        distJobParam["jobPath"], distJobParam["distId"])

                    if "workPath" not in distJobParam or len(
                            distJobParam["workPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: work-path does not exist")
                        return False

                    if "dataPath" not in distJobParam or len(
                            distJobParam["dataPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: data-path does not exist")
                        return False

                    jobPath, workPath, dataPath = GetStoragePath(
                        distJobParam["distJobPath"], distJobParam["workPath"],
                        distJobParam["dataPath"])

                    localJobPath = os.path.join(config["storage-mount-path"],
                                                jobPath)
                    if not os.path.exists(localJobPath):
                        if "userId" in distJobParam:
                            mkdirsAsUser(localJobPath, distJobParam["userId"])
                        else:
                            mkdirsAsUser(localJobPath, 0)

                    distJobParam["LaunchCMD"] = ""
                    if "cmd" not in distJobParam:
                        distJobParam["cmd"] = ""


################One choice is that we only wait for certain time.
#                    launchCMD = """
##!/bin/bash
#mkdir -p /opt
#echo "[DLWorkspace System]: Waiting for all containers are ready..."
## wait for at most 10 mins.
#for i in {1..200}; do
#    if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then
#        sleep 3
#    else
#        break
#    fi
#done
#if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then
#    echo "[DLWorkspace System]: Waiting for containers: timeout! Restarting..."
#    exit 1
#else
#    echo "[DLWorkspace System]: All containers are ready, launching training job..."
#    chmod +x /opt/run_dist_job.sh
#    /opt/run_dist_job.sh
#fi
#"""

                    launchCMD = """
#!/bin/bash
mkdir -p /opt
echo "[DLWorkspace System]: Waiting for all containers are ready..."
while [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; do
    sleep 3
done
echo "[DLWorkspace System]: All containers are ready, launching training job..."
chmod +x /opt/run_dist_job.sh
/opt/run_dist_job.sh
"""

                    launchScriptPath = os.path.join(
                        localJobPath, "launch-%s.sh" % distJobParam["jobId"])
                    with open(launchScriptPath, 'w') as f:
                        f.write(launchCMD)
                    distJobParam[
                        "LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % distJobParam[
                            "jobId"]

                    distJobParam["jobNameLabel"] = ''.join(
                        e for e in distJobParam["jobName"] if e.isalnum())
                    distJobParam["userNameLabel"] = getAlias(
                        jobParams["userName"])
                    ENV = Environment(loader=FileSystemLoader("/"))

                    jobTempDir = os.path.join(config["root-path"],
                                              "Jobs_Templete")
                    jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")

                    distJobParam["hostjobPath"] = os.path.join(
                        config["storage-mount-path"], jobPath)
                    distJobParam["hostworkPath"] = os.path.join(
                        config["storage-mount-path"], workPath)
                    distJobParam["hostdataPath"] = os.path.join(
                        config["storage-mount-path"], dataPath)
                    distJobParam["nvidiaDriverPath"] = nvidiaDriverPath

                    if "mountpoints" not in distJobParam:
                        distJobParam["mountpoints"] = []

                    # distJobParam["mountpoints"].append({"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath})
                    distJobParam["mountpoints"].append({
                        "name":
                        "job",
                        "containerPath":
                        "/job",
                        "hostPath":
                        distJobParam["hostjobPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "work",
                        "containerPath":
                        "/work",
                        "hostPath":
                        distJobParam["hostworkPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "data",
                        "containerPath":
                        "/data",
                        "hostPath":
                        distJobParam["hostdataPath"]
                    })
                    distJobParam["pod_ip_range"] = config["pod_ip_range"]
                    if "usefreeflow" in config and config[
                            "usefreeflow"] == "True":
                        distJobParam["usefreeflow"] = config["usefreeflow"]
                    else:
                        distJobParam["usefreeflow"] = False

                    random.seed(datetime.datetime.now())
                    distJobParam["containerPort"] = int(random.random() *
                                                        1000 + 3000)

                    if assignedRack is not None:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["rack"] = assignedRack

                    template = ENV.get_template(os.path.abspath(jobTemp))
                    job_description = template.render(job=distJobParam)

                    jobDescriptionList.append(job_description)

                    distJobParams[role].append(distJobParam)

            jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
            jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)

        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]
        jobMeta["distJobParams"] = distJobParams

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

    return ret
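# Illustration of the per-role expansion performed above: with numps=1 and
# numpsworker=2 the loop produces the ids ps0, worker0 and worker1, each with
# its own sub-directory under the job path (the path below is illustrative).
import os
jobPath = "alice/jobs/190101/1234"
nums = {"ps": 1, "worker": 2}
for role in ["ps", "worker"]:
    for i in range(nums[role]):
        print(os.path.join(jobPath, "%s%d" % (role, i)))
# alice/jobs/190101/1234/ps0
# alice/jobs/190101/1234/worker0
# alice/jobs/190101/1234/worker1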
Example #24
def data_numeric_stub():
    dh = DataHandler('data/total-test.csv', 'prediction_label')
    headers, features, prediction_labels = dh.get_numeric_data_set()
    knn = KNearestNeighbour(features, prediction_labels, 1)
    print knn.predict((0, 0))
Example #25
class Generator:
  def __init__(self, outDir):
    self.config = None
    self.__csHash = set()
    self.__outputDir = outDir
    self.__csInstance = CSHandler()
    self.__dataHandler = DataHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()
    self.__fileSuffix = ""
    self.prepareConfig()
    
  def prepareConfig(self):
    self.config = GeneratingConfig()
    self.config.setCSVariants([0, 1, 2, 3, 4])
    self.config.setDataRanges({0:range(50, 1001, 50), 1:range(50, 1001, 50), 2:range(50, 1001, 50), 3:range(50, 1001, 50), 4:range(50, 1001, 50)})
    self.config.setSplits([(50, 50), (60, 40), (70, 30), (80, 20), (90, 10)])
    self.config.setTagsetVariants([".uniq", ".uni"])
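  # Worked example of the split arithmetic used by the generate* methods below:
  # with Split = (70, 30) and data = 500, pr = int(0.7 * 500) = 350 pure
  # sentence pairs and tr = 500 - 350 = 150 code-switched pairs, so the data
  # file holds 2*pr = 700 pure plus 2*tr = 300 code-switched sentences and the
  # "_Control" file holds the corresponding 1000 purely monolingual ones.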
  
  def prepareGenerator(self):
    self.__csInstance.updateLIDTags(self.__dataHandler.LID[0], self.__dataHandler.LID[1])
  
  def prepareRealTest(self, dataFile, outFile):
    dataFile = open(dataFile)
    outFile = open(outFile, 'w')
    for line in dataFile:
      line = map(lambda x:x.split('_#'), line.strip().split())
      uniLine = self.__dataHandler.mapLD2Uni(line)
      outFile.write(' '.join(map(lambda x:'_#'.join(x), uniLine)) + '\n')
    outFile.close()

  def generateTestData(self):
    self.config.setDataRanges({0:range(30, 151, 50), 1:range(30, 151, 50), 2:range(30, 151, 50), 3:range(30, 151, 50), 4:range(30, 151, 50)})
    for csType in self.config.csVariants:
      print "type" + str(csType)
      for data in self.config.dataRanges[csType]:
        print
        print " numSents:" + str(data * 2),
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              csSeqs = []
              
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Testing Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()
  
  
  def generateDataForTest(self):
    for i in range(10):
      self.__fileSuffix = "."+str(i)
      self.generateTrainDataForTest()
  
  def generateTrainDataForTest(self):
    self.config.setDataRanges({0:[450], 1:[450], 2:[450], 3:[450], 4:[450]})
    statusCount = 0
    for csType in self.config.csVariants:
      print "type" + str(csType),
      for data in self.config.dataRanges[csType]:
        print " numSents:" + str(data * 2),
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          if splitIndex == len(self.config.splits) - 1:
            print
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.. ",
              
              csLines = []
              csSeqs = []
              
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount

  
  def generateTrainData(self):
    statusCount = 0
    for csType in self.config.csVariants:
      print "type" + str(csType)
      for data in self.config.dataRanges[csType]:
        print
        print " numSents:" + str(data * 2),
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              csSeqs = []
              
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
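  # The nested loops in the generate*TrainData methods assume a config object
  # shaped roughly as follows (a hypothetical illustration, not taken from the
  # original project):
  #   config.csVariants = [1, 2]                      # code-switching variants to generate
  #   config.dataRanges = {1: [500, 1000], 2: [500]}  # per-variant values of "data" (sentence counts)
  #   config.splits     = [(1, 1), (3, 1)]            # (pure, CS) mixing ratio per split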
    
  def generateUCTrainData(self): # Unknown words constrained training data
    statusCount = 0
    for csType in self.config.csVariants:
      for data in self.config.dataRanges[csType]:
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print pr
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              csSeqs = []
              
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                if pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount

  def makeString(self, wordsTagsLangs):
    return ' '.join(map(lambda x:"_#".join(x), wordsTagsLangs)) + '\n'
    
  def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    self.__dataHandler.loadData(l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data)
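For reference, the makeString method above serializes each (word, tag, language-ID) tuple with a "_#" separator and joins the tokens with spaces, one sentence per line. A minimal sketch with made-up tokens (purely illustrative; the tokens are not from the original data):

demoLine = [("house", "NN", "L1"), ("casa", "NN", "L2")]
print ' '.join(map(lambda x: "_#".join(x), demoLine))
# -> house_#NN_#L1 casa_#NN_#L2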
Example #26
def data_text_stub():
    dh = DataHandler('data/train-set.csv', 'sentiment')
    headers, features, prediction_labels = dh.get_textual_data_set()
    review_text_index = headers.index('review_text')
    review_text_list = [feature[review_text_index] for feature in features]
    bow_headers, train_features = dh.convert_docs_to_bow(review_text_list)
Example #27
def data_write_test_stub():
    dh = DataHandler('data/train-set.csv', 'sentiment')

    headers, features, prediction_labels = dh.get_textual_data_set()
    review_text_index = headers.index('review_text')

    review_text_list = [feature[review_text_index] for feature in features]
    bow_feature_names = dh.get_feature_set_for_documents(review_text_list)

    dh = DataHandler('data/test-set.csv', 'sentiment')

    headers, features, prediction_labels = dh.get_textual_data_set()
    review_text_index = headers.index('review_text')

    review_text_list = [feature[review_text_index] for feature in features]

    bow_features = dh.convert_docs_to_bow_for_features(review_text_list, bow_feature_names)
    test_prediction_labels = dh.convert_sentiment_list_to_number(prediction_labels)

    print len(bow_feature_names)
    print len(bow_features[0])
    print test_prediction_labels[0]

    bow_feature_names.append("prediction_label")
    dh.write_to_file('data/test-set-feature-engineered.csv', bow_features, bow_feature_names, test_prediction_labels)
class UserwiseDivergenceAnalysis:
  def __init__(self, dataFile, userJoins):
    sys.stderr.write('In Constructor\n')
    self.distComparer = DistComparer()
    self.dataHandler = DataHandler(dataFile, userJoins)
    self.dataHandler.loadActiveForums()
    self.__loadData()
    self.sampledUsers = set()
  
  def __loadData(self):
    stopWords = set([s.strip() for s in open("/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts/Regression/stopWords")])
    self.dataHandler.preprocessVocab(stopWords)

  def sampleUsers(self):
    #self.dataHandler.userStats(outFile)
    self.sampledUsers = self.dataHandler.sampleUsers()

  def doDivergenceAnalysisPerUser(self, outFile):
    outFile = open(outFile,'w')
    for user in self.sampledUsers:
      # assumed reconstruction of a garbled span (generic prepareUserDivergences variant)
      userDivergences = self.prepareUserDivergences(user)
      for month in userDivergences.iterkeys():
        outFile.write(str(user)+'\t'+str(month)+'\t'+str(userDivergences[month][0])+'\t'+str(userDivergences[month][1])+'\n')
    outFile.close()
    
  def prepareUserDivergencesActive(self, userNum):
    divergences = {}
    userMonths = self.dataHandler.getUserMonths(userNum)
    activeForum  = self.dataHandler.getActiveForum(userNum)
    if activeForum.find("Talk")<0:
      return -1
    userInitialData = self.dataHandler.makeDist(self.dataHandler.getForumInitialData(self.dataHandler.getActiveForum(userNum)))
    userMaturedData = self.dataHandler.makeDist(self.dataHandler.getForumMaturedData(self.dataHandler.getActiveForum(userNum)))
    for userMonth in userMonths:
      monthData = self.dataHandler.makeDist(self.dataHandler.getUserDataForDivergence(userNum, userMonth))
      divergences[userMonth] = (self.distComparer.jsDivergence(userInitialData,monthData), self.distComparer.jsDivergence(monthData, userMaturedData))
    return divergences

  def prepareUserDivergencesBackground(self, userNum):
    divergences = {}
    userMonths = self.dataHandler.getUserMonths(userNum)
    userInitialData = self.dataHandler.makeDist(self.dataHandler.getForumInitialData("AllTalk"))
    userMaturedData = self.dataHandler.makeDist(self.dataHandler.getForumMaturedData("AllTalk"))
    for userMonth in userMonths:
      monthData = self.dataHandler.makeDist(self.dataHandler.getUserDataForDivergence(userNum, userMonth))
      divergences[userMonth] = (self.distComparer.jsDivergence(userInitialData,monthData), self.distComparer.jsDivergence(monthData, userMaturedData))
    return divergences

  def prepareUserDivergences(self, userNum):
    divergences = {}
    userMonths = self.dataHandler.getUserMonths(userNum)
    userInitialData = self.dataHandler.makeDist(self.dataHandler.getUserInitialData(userNum))
    userMaturedData = self.dataHandler.makeDist(self.dataHandler.getUserMaturedData(userNum))
    for userMonth in userMonths:
      monthData = self.dataHandler.makeDist(self.dataHandler.getUserDataForDivergence(userNum, userMonth))
      divergences[userMonth] = (self.distComparer.jsDivergence(userInitialData,monthData), self.distComparer.jsDivergence(monthData, userMaturedData))
    return divergences
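A minimal driver sketch for UserwiseDivergenceAnalysis, assuming hypothetical file paths (the real thread dump, user-join file and output location are not part of this snippet):

if __name__ == '__main__':
  # Hypothetical paths; substitute the actual data files.
  analysis = UserwiseDivergenceAnalysis("Data/allThreads.csv", "Data/userJoins")
  analysis.sampleUsers()
  analysis.doDivergenceAnalysisPerUser("Output/userDivergences.tsv")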
Example #29
class SBGSurvival(object):
    """
    This class implements an extended version of the Shifted-Beta-Geometric
    model by P. Fader and B. Hardie.

    The original model works by assuming a constant-in-time, beta-distributed
    individual probability of churn. Due to the heterogeneity of a cohort's
    churn rates (since each individual will have a different probability of
    churning), expected behaviours such as the decrease of cohort churn rate
    over time arise naturally.

    The extension done here generalizes the coefficients alpha and beta of the
    original model to functions of features on the individual level. A
    log-linear model is used to construct alpha(x) and beta(x) and the
    likelihood is then computed by combining the contributions of each and
    every sample in the training set.

    The model takes as inputs ...
    """

    def __init__(self, age, alive, features=None, gamma=1.0, gamma_beta=None, bias=True, normalize=True, verbose=False):
        """
        Initializes objects with parameters necessary to create the supporting
        objects: DataHandler and ShiftedBeta

        :param age: str
            The column name to identify the age of each individual. Age has to
            be an integer value, and will determine the time intervals the
            model will work with.
                --- See DataHandler.py

        :param alive: str
            The column name with the status of each individual. In the context
            of survival analysis, an individual may be dead or alive, and its
            contribution to the model will depend on it.
                --- See DataHandler.py

        :param features: str, list or None
            A string with the name of the column to be used as features, or a
            list of names of columns to be used as features or None, if no
            features are to be used.
                --- See DataHandler.py

        :param gamma: float
            A non-negative float specifying the strength of the regularization
            applied to w_alpha (alpha's weights) and, if gamma_beta is not
            given, it is also applied to beta.
                --- See ShiftedBeta.py

        :param gamma_beta: float
            A non-negative float specifying the strength of the regularization
            applied to w_beta (beta's weights). If specified, overwrites the
            value of gamma for beta.
                --- See ShiftedBeta.py

        :param bias: bool
            Whether or not a bias term should be added to the feature matrix.
                --- See DataHandler.py

        :param normalize: bool
            Whether or not numerical fields should be normalized (centered and
            scaled to have std=1)
                --- See DataHandler.py

        :param verbose: bool
            Whether or not status updates should be printed
                --- See ShiftedBeta.py
        """

        # Create objects!
        # DATA-HANDLER OBJECT
        # The DataHandler object may be created without the training data, so
        # we do it here.
        self.dh = DataHandler(age=age, alive=alive, features=features, bias=bias, normalize=normalize)

        # Shifted beta model object
        # Was a different gamma_beta parameter passed? If not, we use the same
        # value passed to gamma.
        if gamma_beta is None:
            gamma_beta = 1.0 * gamma
        # create shifted beta object
        self.sb = ShiftedBetaGeometric(gamma_alpha=gamma, gamma_beta=gamma_beta, verbose=verbose)

    def fit(self, df, restarts=1):
        """
        A method responsible for learning both the transformation of the data
        (addition of a bias term, centering and re-scaling of numerical
        features, and one-hot encoding of categorical features) and the
        parameters alpha and beta of the shifted-beta-geometric model.

        This is just a wrapper, the real heavy-lifting is done by the
        DataHandler and ShiftedBeta objects.

        :param df: pandas DataFrame
            A pandas DataFrame with a schema similar to the one used to train
            the model, in the sense that the columns used as cohort, age and
            categories must match. Extra columns will not affect anything.

        :param restarts: int
            Number of times to restart the optimization procedure with a
            different seed, to avoid getting stuck on local maxima.
        """
        # Transform dataframe extracting feature matrix, ages and alive status.
        x, y, z = self.dh.fit_transform(df)

        # fit to data using the ShiftedBeta object.
        self.sb.fit(X=x, age=y, alive=z, restarts=restarts)

    def summary(self):
        """
        Simple method to get the learned weights and their corresponding
        categories

        :return: pandas DataFrame
            A DataFrame object with alpha and beta weights for each category
        """
        # Construct a DataFrame consisting of feature name and corresponding
        # alpha and beta parameters. Names are obtained by invoking the
        # get_names() method, and the parameters displayed are the weights,
        # not the final values (the weights cannot be interpreted in isolation).
        suma = pd.DataFrame(
            data={name: (a, b) for name, a, b in zip(self.dh.get_names(), self.sb.alpha, self.sb.beta)},
            index=["w_alpha", "w_beta"],
        ).T
        return suma

    def predict_params(self, df):
        """
        predict_params is a method capable of predicting the values of alpha
        and beta for given combination of features. It invokes the
        compute_alpha_beta method from the ShiftedBeta object to compute the
        arrays of alpha and beta for every sample in df given the available
        features.

        Notice that it must first transform the dataframe df using
        DataHandler's transform method, so that it can then work with the lower
        level feature matrix, x.

        :param df: pandas DataFrame
            A pandas dataframe with at least the same feature columns as the
            one used to train the model.

        :return: pandas DataFrame
            A DataFrame with the predicted alpha and beta for each sample in df
        """
        # Start by transforming df to its lower level np.array representation
        x, y, z = self.dh.transform(df=df)

        # Use compute_alpha_beta to compute alpha and beta for every sample in
        # df based on the feature matrix extracted from df, x.
        alpha, beta = self.sb.compute_alpha_beta(x, self.sb.alpha, self.sb.beta)

        # Return a dataframe with predictions.
        return pd.DataFrame(data=np.vstack([alpha, beta]), index=["alpha", "beta"]).T

    def predict_churn(self, df, age=None, **kwargs):
        """
        predict_churn is a method to compute churn rate for a number of periods
        conditioned on the age of the sample.

        This method invokes the churn_p_of_t method from ShiftedBeta to compute
        the churn rate for a given number of periods conditional on age. See
        the description of churn_p_of_t in ShiftedBeta.py for more details.

        This method is a wrapper: it transforms the dataframe df to the
        appropriate representation and feeds it to the lower-level method from
        ShiftedBeta.

        It is worth noticing that the user has the option to pass the value for
        age, which can either be a single number or an array with the same
        length as df, and this will override whatever other value for age
        might come out when transforming df.

        :param df: pandas DataFrame
            A pandas dataframe with at least the same feature columns as the
            one used to train the model.

        :param age: None or float or ndarray of shape(df.shape[0], )
            If age is None, the method will use the age parameter extracted
            from df.
            ** Notice that if age=None and df does not contain an age field,
            a RuntimeError will be raised! **
            If age != None, pass this value along to churn_p_of_t.

        :param kwargs:
            Any other arguments that should be redirected to churn_p_of_t.

        :return: pandas DataFrame
            A DataFrame with the churn_p_of_t matrix.
        """
        x, y, z = self.dh.transform(df=df)

        # If age field is present in prediction dataframe, we may choose to
        # use it to calculate future churn. To do so, we first check if the
        # user passed a new age parameter, if answer is yes, use the new age.
        # If, however, the user did not pass age, use the value extracted from
        # the dataframe, df.
        # ** If no value for age is passed and the dataframe does not contain
        # age, a RuntimeError is raised.
        if age is None:
            age = y
        if age is None:
            raise RuntimeError(
                'The "age" field must either be present in ' "the dataframe or passed separately as an " "argument."
            )

        # Create a dataframe with the churn_p_of_t matrix with all relevant
        # parameters.
        out = pd.DataFrame(data=self.sb.churn_p_of_t(x, age=age, **kwargs))

        # Give columns a decent, generic name.
        out.columns = ["period_{}".format(col) for col in range(1, out.shape[1] + 1)]

        return out

    def predict_survival(self, df, age=None, **kwargs):
        """
        predict_survival is a method to compute the survival curve for a number
        of periods conditioned on the age of the sample.

        This method invokes the survival_function method from ShiftedBeta to
        compute the retention rate for a given number of periods conditional
        on age. See the description of survival_function in ShiftedBeta.py for
        more details.

        This method is a wrapper: it transforms the dataframe df to the
        appropriate representation and feeds it to the lower-level method from
        ShiftedBeta.

        It is worth noticing that the user has the option to pass the value for
        age, which can either be a single number or an array with the same
        length as df, and this will override whatever other value for age
        might come out when transforming df.

        :param df: pandas DataFrame
            A pandas dataframe with at least the same feature columns as the
            one used to train the model.

        :param age: None or float or ndarray of shape(df.shape[0], )
            If age is None, the method will use the age parameter extracted
            from df.
            ** Notice that if age=None and df does not contain an age field,
            a RuntimeError will be raised! **
            If age != None, pass this value along to survival_function.

        :param kwargs:
            Any other arguments that should be redirected to survival_function.

        :return: pandas DataFrame
            A DataFrame with the survival_function matrix.
        """
        x, y, z = self.dh.transform(df=df)

        # If age field is present in prediction dataframe, we may choose to
        # use it to calculate future churn. To do so, we first check if the
        # user passed a new age parameter, if answer is yes, use the new age.
        # If, however, the user did not pass age, use the value extracted from
        # the dataframe, df.
        # ** If no value for age is passed and the dataframe does not contain
        # age, a RuntimeError is raised.
        if age is None:
            age = y
        if age is None:
            raise RuntimeError(
                'The "age" field must either be present in ' "the dataframe or passed separately as an " "argument."
            )

        # Create a dataframe with the survival_function matrix with all relevant
        # parameters.
        out = pd.DataFrame(data=self.sb.survival_function(x, age=age, **kwargs))

        # Give columns a decent, generic name.
        out.columns = ["period_{}".format(col) for col in range(1, out.shape[1] + 1)]

        return out

    def predict_ltv(self, df, age=None, alive=None, **kwargs):
        """
        predict_ltv is a method to compute the ltv for each sample conditioned
        on age.

        This method invokes the derl method from ShiftedBeta to compute
        the residual ltv of each sample given its age. See the
        description of derl in ShiftedBeta.py for more details.

        This method is a wrapper: it transforms the dataframe df to the
        appropriate representation and feeds it to the lower-level method from
        ShiftedBeta.

        It is worth noticing that the user has the option to pass the value for
        both the age and alive fields, which can either be a single number or an
        array with the same length as df, and this will override whatever
        other value for age and/or alive might come out when transforming df.

        :param df: pandas DataFrame
            A pandas dataframe with at least the same feature columns as the
            one used to train the model.

        :param age: None or float or ndarray of shape(df.shape[0], )
            If age is None, the method will use the age parameter extracted
            from df.
            ** Notice that if age=None and df does not contain an age field,
            a RuntimeError will be raised! **
            If age != None, pass this value along to derl.

        :param alive: None or float or ndarray of shape(df.shape[0], )
            If alive is None, the method will use the alive parameter extracted
            from df.
            ** Notice that if alive=None and df does not contain an alive
            field, a RuntimeError will be raised! **
            If alive != None, pass this value along to derl.

        :param kwargs:
            Any other arguments that should be redirected to derl.

        :return: pandas DataFrame
            A DataFrame with the ltv predictions.
        """
        x, y, z = self.dh.transform(df=df)

        # If age field is present in prediction dataframe, we may choose to
        # use it to calculate future churn. To do so, we first check if the
        # user passed a new age parameter, if answer is yes, use the new age.
        # If, however, the user did not pass age, use the value extracted from
        # the dataframe, df.
        # ** If no value for age is passed and the dataframe does not contain
        # age, a RuntimeError is raised.
        if age is None:
            age = y
        if age is None:
            raise RuntimeError(
                'The "age" field must either be present in ' "the dataframe or passed separately as an " "argument."
            )

        # See the discussion above for age, exact same logic applies.
        if alive is None:
            alive = z
        if alive is None:
            raise RuntimeError(
                'The "alive" must either be present in the ' "dataframe or passed separately as an " "argument."
            )

        # Get LTVs and return a dataframe!
        ltvs = self.sb.derl(x, age=age, alive=alive, **kwargs)

        return pd.DataFrame(data=ltvs, columns=["ltv"])
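A minimal usage sketch of the SBGSurvival wrapper above, using a hypothetical toy DataFrame (the column names "age", "alive" and "plan" and their values are made up for illustration):

import pandas as pd

# Hypothetical toy cohort: observed age in periods, alive flag, one categorical feature.
toy = pd.DataFrame({
    "age":   [1, 3, 5, 2],
    "alive": [1, 0, 1, 1],
    "plan":  ["basic", "basic", "pro", "pro"],
})

model = SBGSurvival(age="age", alive="alive", features="plan", gamma=1.0)
model.fit(toy, restarts=3)
print(model.summary())            # learned w_alpha / w_beta per one-hot-encoded category
print(model.predict_churn(toy))   # per-period churn conditioned on each row's age
print(model.predict_ltv(toy))     # residual lifetime value per row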
Example #30
import SocketHandler
from SocketHandler import SocketHandler
from DataHandler import DataHandler
import sys


while True:
	dataHandler = DataHandler()
	ip = dataHandler.getRandom()

	if ip == "":
		print "Could not find any ips that need processing"
		sys.exit()

	print "Processing "+ip

	socksHandler = SocketHandler(ip)
	banner = socksHandler.grabBanner()

	#Yes I understand that this is an awful hack
	banner = banner.replace("'", "\\'")

	try:
		dataHandler.setBanner(ip, banner)
	except:
		print "Saving banner for "+ip+" failed!"
Example #31
def get_job(job_id):
    dataHandler = DataHandler()
    ret = dataHandler.GetJob(jobId=job_id)[0]
    dataHandler.Close()
    return ret
Example #32
#!/usr/bin/env python

import os
import sys
import argparse
import base64
import json
import pprint

sys.path.append(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils"))

from DataHandler import DataHandler

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--job_id", "-j", help="id of job", required=True)

    args = parser.parse_args()
    handler = DataHandler()
    jobs = handler.GetJob(jobId=args.job_id)
    if len(jobs) == 0:
        print("didn't find job of %s" % (args.job_id))
        sys.exit(1)

    job = jobs[0]
    job_params = json.loads(base64.b64decode(job["jobParams"]))

    pprint.pprint(job_params)
Example #33
def get_job_priorities():
    dataHandler = DataHandler()
    job_priorites = dataHandler.get_job_priority()
    dataHandler.Close()
    return job_priorites
Example #34
from DataHandler import DataHandler
import itertools

data_handler = DataHandler()

try:
    print(data_handler)
    print("")
    df_names = data_handler.files
    df_combinations = itertools.combinations(df_names, 2)
    for combination in df_combinations:
        common_columns = data_handler.list_all_columns(
            selected_columns=combination)
        column1 = common_columns.values()[0]
        column2 = common_columns.values()[1]
        intersection = set(column1).intersection(column2)
        if not intersection:
            continue
        print(
            "Columns %s,%s Have %s in common" %
            (common_columns.keys()[0], common_columns.keys()[1], intersection))
    print("")
except Exception as e:
    print(e)
Example #35
import numpy as np

from DataHandler import DataHandler
from ModelHandler import Model
"""
The script used to generate new music. It creates a model and loads a weights file. Then it gets a seed
from the DataHandler object and starts the generation process.
"""
settings = DataHandler.get_config_params()
data_handler = DataHandler()

print('Building model')
neurons = settings["neurons"]
dropout = settings["dropout"]
l_rate = settings["learning_rate"]
epochs = settings["epochs"]
optimizer = settings["optimizer"]
model = Model(neurons=neurons, dropout=dropout, learning_rate=l_rate, optimizer=optimizer, desired_loss=0.3)

print('Loading weights')
data_handler.get_weights(model, settings)


def sample(prob_distribution, temperature=1.0):
    """
    A function to sample an index from the array of probabilities.

    :param prob_distribution: an array with probabilities for each class (note).
    :param temperature: denominator parameter used to divide the natural log of each element. Helps in transforming
    values in the probability array
Example #36
    def evaluateAllModelsData(baseDir,
                              pathToTestData,
                              pathToTrainingData,
                              evalDirName='Evaluation',
                              modelDirName='Models',
                              params=['Param'],
                              modelName='best_model.hdf5',
                              cv=10,
                              verbose=True,
                              colRemove=None,
                              skiprowsTrain=0,
                              skiprowsTest=0,
                              param_name_value=None):

        if baseDir[-1] != '/':
            baseDir += '/'

        evalDir = baseDir + evalDirName
        modelDir = baseDir + modelDirName

        modelList = TestingFramework.getListOfModels(modelDir,
                                                     modelName=modelName)

        total_combinations = len(modelList)

        if verbose:
            print(
                '\nEvaluating the best network trained for all {} combinations(s)\n'
                .format(total_combinations))

        modelEvaluation = dict()

        X, y = DataHandler.predictionData(pathToTestData,
                                          pathToTrainingData,
                                          skiprowsTrain=skiprowsTrain,
                                          skiprowsTest=skiprowsTest,
                                          colRemove=colRemove)

        assert (len(y) >
                0), "Error. No truth values were found in the dataset."
        assert (
            np.shape(X)[0] == np.shape(y)[0]
        ), "Error. The number of columns, {}, in the training data does not equal the number of target truth values, {}.".format(
            np.shape(X)[0],
            np.shape(y)[0])

        for model_path in modelList:

            # Extract the filename and the configuration directory for every
            # model, since both are used below regardless of param_name_value.
            # Get the last part of the path (filename)
            model_filename = re.search('/([^/]*)$', model_path)
            model_filename = model_filename.group(1)
            # Remove the filename from the path and extract the last part of
            # the path: the configuration.
            model_configuration = re.search(
                '/([^/]*)/$', re.sub(r'([^/]*)$', '', model_path))
            model_configuration = model_configuration.group(1)

            if param_name_value is None:
                # Get a list of the values for each parameter
                #parameter_values = model_configuration.split(self.model_parameter_delimiter)
                parameter_values = model_configuration.split('-')

                # Create string, Param1 = Value1,\tParam2 = Value2 etc.
                param_name_value = ",\t".join([
                    params[i % len(params)] + ' = ' + parameter_values[i]
                    for i in range(len(parameter_values))
                ])

            if verbose:
                print('Evaluating model {} for scenario: {}'.format(
                    model_filename, param_name_value))

            K.clear_session()  # Bug, kernel dies in Keras 2.2.0

            model = load_model(model_path)

            cvscores = TestingFramework.evaluate(model,
                                                 X,
                                                 y,
                                                 cv=cv,
                                                 verbose=verbose)

            model_name = model_configuration + '/' + model_filename
            #model_name = model_name.lower().replace(' ','_').replace(',','_')

            modelEvaluation[model_name] = [
                np.mean(cvscores),
                np.std(cvscores), param_name_value
            ]

            del model

        # Rank all the combinations from lowest mean MSE to highest
        sorted_combinations = sorted(modelEvaluation.items(),
                                     key=lambda x: x[1])
        #print(sorted_combinations)

        rank = 1
        filename = evalDir + '/' + 'ranked_evaluation_static.txt'

        for model_score in sorted_combinations:
            rank_string = 'Rank {}\t: {}\t\t| {}\tMSE: {:.3f} +/- {:.3f}\n'.format(
                rank, model_score[0], model_score[1][2], model_score[1][0],
                model_score[1][1])

            if verbose:
                print(rank_string)

            # Write to file
            with open(filename, 'a') as f:
                f.write(rank_string)

            rank += 1
Example #37
File: main.py  Project: Jingoo88/CA2
import getopt, json, time
from sys import argv
import log  # assumed project-local logging helper used below

from Score import Score
from DataHandler import DataHandler
from PartialScore import PartialScore

__author__ = 'Thomas'

logger = log.setup_custom_logger("Main")

start_time = time.clock()

logger.info("Main Started at time %i"%start_time)

with open('test.json') as data_file:
    data = json.load(data_file)

data_handler = DataHandler(usr=data["usr"], table=data["table"], url=data["url"], pwd=data["pwd"])
myopts, args = getopt.getopt(argv[1:], "c:i:", ["coeffs=", "ids="])

if len(argv) > 1:

    for i, j in myopts:
        if i == "--ids" or i == "-i":
            requested_ids = j.split(",")
        elif i == "--coeffs" or i == "-c":
            input_coeffs = j.split(",")

    coeffs = {}

    assert len(input_coeffs) == 12, "User must provide exactly 12 coefficients"
    assert len(requested_ids) > 0, "No ids requested"
Example #38
data = os.listdir('/home/oem/train')

labels = [[1, 0] if 'cat' in i else [0, 1] for i in data]

dataset = ImageDataset(source='/home/oem/train',
                       labels=labels,
                       split_spec={'train': {'amount': 0.6, 'transform': True, 'batch_size':32},
                               'validation': {'amount': 0.2, 'transform': False, 'batch_size':32},
                               'test': {'amount': 0.2, 'transform': False, 'batch_size':32}},
                       shuffle=True)

transformer = ImageTransformer()

transformer.add_resize((256, 256), origin=True, keys=['train', 'validation', 'test'])
transformer.add_grayscale(origin=True, keys=['train', 'validation', 'test'])
transformer.add_unsharp_masking(origin=True, keys=['train', 'validation', 'test'])
transformer.add_histogram_equalization(origin=True, keys=['train', 'validation', 'test'])
transformer.add_median_filter(keys=['train'])
transformer.add_rotation([45, 60, 90], keys=['train'])
transformer.add_contrast([1.5], keys=['train'])
transformer.add_brightening([0.5], keys=['train'])
transformer.add_sharpening([2.0], keys=['train'])

saver = DataSaver('/home/oem/PycharmProjects/DeepLearning', 'CatsvsDogs')
handler = DataHandler(dataset=dataset,
                      transformer=transformer,
                      saver=saver)

handler.run()
Example #39
import numpy as np

from keras.layers import Dense

from DataHandler import DataHandler
from predModel import predModel
from linModel import linModel

SCORING = [0.04, 4, -2, 0, 0, 0, 0.1, 6, 1, 0, 0.1, 6, -2]
response = input("Would you like to provide values?")
find_vals = False
if response.lower() == "no":
    find_vals = True
if find_vals:
    ### run linear model for various alpha
    dh = DataHandler(beg=1999,
                     end=2018,
                     split_by_pos=True,
                     offset=16,
                     ignore_na=False,
                     fill_mean=True)
    alpha_vals = np.logspace(-2, 0.5, 10)
    errs = {}
    best_alphas = []
    for pos in dh.X_train.keys():
        errs[pos] = []
        print("Running: " + str(pos) + " (" + str(len(dh.X_train[pos])) +
              " entries)")
        for alpha in alpha_vals:
            lin = linModel(alpha=alpha, max_iter=5000)
            err = np.mean(
                lin.cv_error(dh.X_train[pos], dh.y_train[pos],
                             np.abs(SCORING)))
            errs[pos].append(err)
Example #40
    for l2Key in level2Keys:
      sys.stdout.write("\t"+str(D[l1Key][l2Key]))
    sys.stdout.write("\n")
    
def printList(L, outFile):
  outFile = open(outFile,'w')
  for content in L:
    outFile.write('\t'.join(map(lambda x:str(x), content))+'\n')
  outFile.close()

if __name__ == '__main__':
  
  drawLine()
  sys.exit()
  
  dataFile = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/allThreads.csv"
  userJoins = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/userJoins"
  activeForums = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/activeForums.csv"
  DH = DataHandler(dataFile, userJoins)
  DH.loadActiveForums(activeForums)  
  drawPostingFrequency(DH.getPostingFreq())
  #printDict(DH.getCumulativePostingFreq())
  #printDict(DH.getCutoffPostingFreq())
  #printDict(DH.getMonthwisePostingFrequency())
  #printTwolevelDict(DH.getMonthwiseBinnedPostingFrequency())
  '''outFile = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/Analysis/basicTable"
  table = DH.getBasicTable()
  print len(table)
  print "No. of users:",len(set(map(lambda x:x[0],table)))
  printList(table, outFile)'''
from Evaluator import Evaluator
from surprise import NormalPredictor
from matrix_factorization_algo import MatrixFactorizationAlgo
from DataHandler import DataHandler

from knn_algo import knn
from hybrid_algo_weighted import HybridAlgoWeighted

# from Evaluator import Evaluator
#load dataset
dataprocessor = DataHandler()
evaluationData = dataprocessor.getEvaluation()
rankings = dataprocessor.getRank()
evaluator = Evaluator()

#use random as our basline here
Random = NormalPredictor()
evaluator.Add_Algo(Random, "Random")

# add knn algos
knngenerator = knn()
knn_algo_dict = knngenerator.generate_knn(evaluationData)
for key in knn_algo_dict:
    evaluator.Add_Algo(knn_algo_dict[key], key)

# adding MF algos
mf_algo = MatrixFactorizationAlgo()
mf_algo_dict = mf_algo.generate_algorithms(evaluationData)
for key in mf_algo_dict:
    evaluator.Add_Algo(mf_algo_dict[key], key)
Example #42
class preProcessor:
    def __init__(self, dataFile, userJoins):
        self.dataHandler = DataHandler(dataFile, userJoins)
        self.dataHandler.loadActiveForums()
        self.tokenizedData = []

    def prepareTokenizedCSV(self):
        self.tokenizedData = self.dataHandler.getTokenizedCSV()

    def prepareTokenizedUserMonthCSV(self):
        self.tokenizedData = self.dataHandler.getTokenizedUserMonthCSV()

    def prepareTokenizedUserMonthForumCSV(self):
        self.tokenizedData = self.dataHandler.getTokenizedUserMonthForumCSV()

    def printDataForTMT(self, outFile):
        outFile = csv.writer(open(outFile, "w"))
        index = 1
        for record in self.tokenizedData:
            record.insert(0, index)
            outFile.writerow(record)
            index += 1

    def getRequiredDataFromRecord(self, record):
        indices = [0]

    def initializeUserMonthRecord(self, user, month):
        return self.dataHandler.getBasicUserMonthRecord(user, month)

    def isProperUnicode(self, text):
        try:
            dummy = unicode(text)
            return True
        except:
            return False

    def printInferDataForTMT(self, outFile):
        f = codecs.open(outFile, encoding="utf-8", mode="w+")
        outFile = csv.writer(f)
        index = 1
        for user in self.tokenizedData.iterkeys():
            for month in self.tokenizedData[user].iterkeys():
                numPosts = 0
                userMonthRecord = self.initializeUserMonthRecord(user, month)
                for recordText in self.tokenizedData[user][month]:
                    if self.isProperUnicode(recordText):
                        userMonthRecord[-1].append(recordText)
                        numPosts += 1
                userMonthRecord[-1] = " ".join(userMonthRecord[-1])
                userMonthRecord.insert(0, index)
                userMonthRecord.append(numPosts)
                try:
                    outFile.writerow([unicode(s).encode("utf-8") for s in userMonthRecord])
                    index += 1
                except:
                    pass

    def printInferDataForTMTWithForum(self, outFile):
        f = codecs.open(outFile, encoding="utf-8", mode="w+")
        outFile = csv.writer(f)
        index = 1
        for user in self.tokenizedData.iterkeys():
            for month in self.tokenizedData[user].iterkeys():
                totalPosts = 0
                allForumsRecord = self.initializeUserMonthRecord(user, month)
                for forum in self.tokenizedData[user][month]:
                    numPosts = 0
                    userMonthRecord = self.initializeUserMonthRecord(user, month)
                    for recordText in self.tokenizedData[user][month][forum]:
                        if self.isProperUnicode(recordText):
                            userMonthRecord[-1].append(recordText)
                            numPosts += 1
                    forumPosts = copy.deepcopy(userMonthRecord[-1])
                    userMonthRecord[-1] = " ".join(userMonthRecord[-1])
                    userMonthRecord.insert(0, index)
                    userMonthRecord.append(numPosts)
                    userMonthRecord.append(forum)
                    try:
                        outFile.writerow([unicode(s).encode("utf-8") for s in userMonthRecord])
                        index += 1
                        totalPosts += numPosts
                        allForumsRecord[-1].extend(forumPosts)
                    except:
                        pass
                allForumsRecord[-1] = " ".join(allForumsRecord[-1])
                allForumsRecord.insert(0, index)
                index += 1
                allForumsRecord.append(totalPosts)
                allForumsRecord.append("AllForums")
                outFile.writerow([unicode(s).encode("utf-8") for s in allForumsRecord])
Example #43
def ApproveJob(job):
    dataHandler = DataHandler()
    dataHandler.ApproveJob(job["jobId"])
    dataHandler.Close()
    return True
Example #44
from DataHandler import DataHandler
from Client import Client
from NATServer import NATServer
from Dispatcher import Dispatcher
from XmlRpcServer import XmlRpcServer

import ip2country # just to make sure it's downloaded
import ChanServ

# uncomment for debugging deadlocks, creates a stacktrace at the given interval to stdout
#import stacktracer
#stacktracer.trace_start("trace.html",interval=5,auto=True) # Set auto flag to always update file!


_root = DataHandler()
_root.parseArgv(sys.argv)

try:
	signal.SIGHUP
	
	def sighup(sig, frame):
		_root.console_write('Received SIGHUP.')
		if _root.sighup:
			_root.reload()

	signal.signal(signal.SIGHUP, sighup)
except AttributeError:
	pass

_root.console_write('-'*40)
Example #45
    Useless.

    Print a message with a standard shape.
    """
    print "#######################################\n{}\n#######################################\n".format(str)

if __name__ == "__main__":


    myprint("Start computations")

    ####################################################################################################################
    # Get data
    ####################################################################################################################
    myprint("Load data")
    MyDataHandler = DataHandler()
    print "Loading train"
    train = MyDataHandler.get_train()

    #Pretty things up.
    column_names = {}
    for i in xrange(96*96):
        column_names[i] = 'pixel{}'.format(i)

    ####################################################################################################################
    # Split pixels:
    ####################################################################################################################
    myprint("Extract data")
    train_x = train.iloc[:800]['Image'].apply(lambda x: pd.Series([int(i) for i in x.split(' ')])).rename(columns=column_names)
    train_y = train.iloc[:800].ix[:, 0:30].fillna(0).astype(int)
Example #46
import sys
from DataHandler import DataHandler

def analyzeUser(userNum, DH, baseDir):
  userNum = str(userNum)
  sys.stderr.write("User:"******"\n")
  outFile = baseDir+userNum
  DH.printMonthlyDataForUser(userNum, outFile)

if __name__ == '__main__':
  baseDir = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/DebugTime/"
  dataFile = baseDir + "allThreads.csv"
  userJoins = baseDir + "userJoins"
  activeForums = baseDir + "activeForums.csv"
  DH = DataHandler(dataFile, userJoins)
  DH.loadActiveForums(activeForums)
  
  #analyzeUser(9258, DH, baseDir)
  #analyzeUser(30702, DH, baseDir)
  analyzeUser(35541, DH, baseDir)
Example #47
    features, act_labels, verbose=True)

train_data, act_train_labels = train_loader.time_series_to_section(
    train_ts.copy(),
    num_act_labels,
    sliding_window_size=200,
    step_size_of_sliding_window=10)

test_data, act_test_labels = train_loader.time_series_to_section(
    test_ts.copy(),
    num_act_labels,
    sliding_window_size=200,
    step_size_of_sliding_window=10)

print("---Data is successfully loaded")
handler = DataHandler(train_data, test_data)
norm_train = handler.normalise("train")
norm_test = handler.normalise("test")

print("--- Shape of Training Data:", train_data.shape)
print("--- Shape of Test Data:", test_data.shape)

expt_name = "thurs_Script_jog2"

create_directories(expt_name)
gan_ = GAN(norm_train.shape)
trainer_ = Trainer(gan_, expt_name)
trainer_.train_gan(epochs=200,
                   batch_size=128,
                   sample_interval=10,
                   train_data=norm_train)
Example #48
from CurlHandler import CurlHandler
from DataHandler import DataHandler

import sys

while True:
	dataHandler = DataHandler()
	ip = dataHandler.getRandomHTTPS()

	if ip == "":
		print "Could not find any ips that need processing"
		sys.exit()

	print "Processing "+ip

	curlHandler = CurlHandler()
	
	response = curlHandler.getResponse(ip)

	dataHandler.setHTTPSPage(ip, response)
Example #49
 def __init__(self, dataFile, userJoins):
     self.dataHandler = DataHandler(dataFile, userJoins)
     self.dataHandler.loadActiveForums()
     self.tokenizedData = []
Example #50
import tensorflow as tf
from DataHandler import DataHandler
from RNNGenerator import RNNGenerator
from SessionRunner import SessionRunner

log_path = 'output/tensorflow/'
writer = tf.summary.FileWriter(log_path)

# Load and prepare data
data_handler = DataHandler()

training_data =  data_handler.read_data('Data/Zulu.txt')

dictionary, reverse_dictionary = data_handler.build_datasets(training_data)

# TensorFlow Graph input
n_input = 3
n_units = 512

x = tf.placeholder("float", [None, n_input, 1])
y = tf.placeholder("float", [None, len(dictionary)])

# RNN output weights and biases
weights = {
    'out': tf.Variable(tf.random_normal([n_units, len(dictionary)]))
}
biases = {
    'out': tf.Variable(tf.random_normal([len(dictionary)]))
}

rnn_generator = RNNGenerator()
Example #51
from sklearn.tree import DecisionTreeClassifier
from DataHandler import DataHandler
import numpy as np
from Utils import OneHotEncode
from datetime import datetime


allData = DataHandler.getAllData()[1:]
bookingDate = []
origin = []
dest = []
deptDate = []
deptTime = []
pax = []
label = []

BOOKINGDATE = 1
ORIGIN = 2
DEST = 3
DEPTDATE = 4
DEPTTIME = 5
PAX = 6
LABEL = 7

for row in allData:
    bookingDate.append(row[BOOKINGDATE])
    origin.append(row[ORIGIN])
    dest.append(row[DEST])
    deptDate.append(row[DEPTDATE])
    deptTime.append(row[DEPTTIME])
    pax.append(row[PAX])
Example #52
def extract_job_log(jobId, logPath, userId):
    try:
        dataHandler = DataHandler()

        logs = k8sUtils.GetLog(jobId)

        jobLogDir = os.path.dirname(logPath)
        if not os.path.exists(jobLogDir):
            mkdirsAsUser(jobLogDir, userId)
        logStr = ""
        trimlogstr = ""

        for log in logs:
            if "podName" in log and "containerID" in log and "containerLog" in log:
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "        logs from pod: %s\n" % log["podName"]
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += log["containerLog"]
                logStr += "\n\n\n"
                logStr += "=========================================================\n"
                logStr += "        end of logs from pod: %s\n" % log["podName"]
                logStr += "=========================================================\n"
                logStr += "\n\n\n"

                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                trimlogstr += "        logs from pod: %s\n" % log["podName"]
                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                logLines = log["containerLog"].split('\n')
                if (len(logLines) < 3000):
                    trimlogstr += log["containerLog"]
                    trimlogstr += "\n\n\n"
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "        end of logs from pod: %s\n" % log[
                        "podName"]
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "\n\n\n"
                else:
                    trimlogstr += "\n".join(logLines[-2000:])
                    trimlogstr += "\n\n\n"
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "        end of logs from pod: %s\n" % log[
                        "podName"]
                    trimlogstr += "        Note: the log is too long to display in the webpage.\n"
                    trimlogstr += "        Only the last 2000 lines are shown here.\n"
                    trimlogstr += "        Please check the log file (in Job Folder) for the full logs.\n"
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "\n\n\n"

                try:
                    containerLogPath = os.path.join(
                        jobLogDir,
                        "log-container-" + log["containerID"] + ".txt")
                    with open(containerLogPath, 'w') as f:
                        f.write(log["containerLog"])
                    os.system("chown -R %s %s" % (userId, containerLogPath))
                except Exception as e:
                    logging.error(e)

        if len(trimlogstr.strip()) > 0:
            dataHandler.UpdateJobTextField(jobId, "jobLog", trimlogstr)
            with open(logPath, 'w') as f:
                f.write(logStr)
            os.system("chown -R %s %s" % (userId, logPath))

    except Exception as e:
        logging.error(e)
class Crawler:
    
    def __init__(self,url,dbFile,outputFile,maxCount=None):
        self.url = url # url to be crawled
        if maxCount is None:
            self.maxCount = -1
        else:
            '''
            maxCount is the maximum number of links the crawler should fetch.
            It is incremented by one because the URL entered by the user is
            also persisted in the repository and therefore counts towards the
            total: if the user asks to crawl python.org and fetch 2 links, the
            program should terminate once the repository holds 3 links, since
            python.org itself is one of them.
            '''
            self.maxCount = maxCount + 1
            
        self.extracter = LinkExtracter()
        self.dataHandler = DataHandler(self.maxCount,dbFile,outputFile)
        self.log = CrawlerLogger.getlogger()
    '''
    Crawls the URL given by the user using BFS traversal until the specified
    number of links has been fetched or no unprocessed links remain. (A usage
    sketch follows the class definition.)
    '''
    def Crawl(self):
 
        try:
            link = self.url
            self.log.info("crawling "+link)
            links = self.extracter.fetchLinks(link)
            
            if links is None:
                print("Either the url you entered cannot be crawled  or its does not contain any links")
                return False
        
            self.dataHandler.flushTable()
            maxLinkflag = self.dataHandler.saveUnprocessedLinks(links)
         
            if maxLinkflag:
                return self.writeDataToFile()
            else:
                self.crawlfetchedLinks()
                return self.writeDataToFile()
        except sqlite3.OperationalError as e:
            self.log.error(e, exc_info=sys.exc_info()[2])
            raise CrawlerError(('Either an invalid database was entered or the database does not have the necessary tables',))
    
    '''
    Helper for the Crawl method.
    '''
    def crawlfetchedLinks(self):
        
        maxLinkflag = True
        
        '''
        This loop terminates when the specified number of links has been
        fetched or when all reachable links have been processed.
        '''
        while True:
            link = self.dataHandler.getLinkforParsing()
            if link is None:
                break
            links = self.extracter.fetchLinks(link)
            self.dataHandler.setLinkAsProcessed(link)
            if links is not None:
                maxLinkflag = self.dataHandler.saveUnprocessedLinks(links)
            if maxLinkflag:
                break
            
    '''
    Writes the fetched links to the output file (links.txt) so the user can
    consume them.
    '''
    def writeDataToFile(self):
        try:
            res = self.dataHandler.exportData()
            self.dataHandler.flushTable()
            return res
        except IOError as e:
            self.log.error(e, exc_info=sys.exc_info()[2])
            raise CrawlerError(('The path of the export file entered is invalid',))
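
# A minimal usage sketch of the Crawler class above (not part of the original
# source). The URL, database file, output file, and link limit below are
# illustrative placeholders; it assumes the constructor signature shown above
# and that the class and its dependencies (LinkExtracter, DataHandler,
# CrawlerLogger) are importable.
if __name__ == '__main__':
    crawler = Crawler('http://python.org', 'crawler.db', 'links.txt', maxCount=10)
    if crawler.Crawl():
        print("Fetched links were exported to links.txt")
    else:
        print("The URL could not be crawled or contained no links")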
Example #54
    def saveHistory(self):
        if args.epoch == 0:
            return
        with open('History/' + args.save_path + '.his', 'wb') as fs:
            pickle.dump(self.metrics, fs)

        saver = tf.train.Saver()
        saver.save(self.sess, 'Models/' + args.save_path)
        log('Model Saved: %s' % args.save_path)

    def loadModel(self):
        saver = tf.train.Saver()
        saver.restore(self.sess, 'Models/' + args.load_model)
        with open('History/' + args.load_model + '.his', 'rb') as fs:
            self.metrics = pickle.load(fs)
        log('Model Loaded')


if __name__ == '__main__':
    logger.saveDefault = True
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    log('Start')
    handler = DataHandler()
    log('Load Data')

    with tf.Session(config=config) as sess:
        model = Model(sess, handler)
        model.run()
Example #55
import sklearn.metrics as Metrices
import numpy as np
from DataHandler import DataHandler
import csv
import sys

pair = dict()
counter = 0
for pairInFile in open(r"data\Pair.txt", "r"):
    pairInFile = pairInFile.rstrip('\n')
    pair[pairInFile] = counter
    counter += 1

fareClass = {"Classic" : 1, "Deal": 2, "Flex": 3, "Saver": 4}

trainingData = DataHandler.getTrainingData()

x = np.empty([len(trainingData),44], dtype=int)
y = np.empty([len(trainingData)], dtype=int)

counter = 0
for trainingList in trainingData:
    oriDestPair = trainingList[2] + trainingList[3]
    paxCount = int(trainingList[-2])
    tempXList = [0] * 44
    tempXList[pair[oriDestPair]] = 1
    if paxCount > 1:
        tempXList[43] = 1
    else:
        tempXList[42] = 1
    x[counter] = tempXList
Example #56
from SiameseNet import SiameseNet
from ModelHandler import ModelHandler
from DataHandler import DataHandler
from plotters import Plotter

# %% MNIST dataset
dh = DataHandler("MNIST", classes_to_select=[0, 1, 2, 3, 4, 5, 6])
#dh_newdata = DataHandler("MNIST", classes_to_select=[7,8,9])

# %% Define embedding model
mh = ModelHandler(model_number=4,
                  embedding_size=200,
                  input_feature_dim=dh.shape)

# %% Define siamese net
#alphas = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
alphas = [0.1]
for alpha in alphas:
    net = SiameseNet(mh, dh, alpha)
    net.print_model()
    batch_size = 200
    epochs = 1
    steps_per_epoch = 1  #int(dh.n_train / batch_size)
    history = net.train("create_pair_batch_random", batch_size, epochs,
                        steps_per_epoch)

    # % Plot loss
    # Losses
    plotter = Plotter()
    plotter.plot_losses(net, history)
from DataHandler import DataHandler

if __name__ == "__main__":
    dataFile = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/allThreads.csv"
    userJoins = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/userJoins"
    DH = DataHandler(dataFile, userJoins)
    DH.loadActiveForums()
    outFile = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts2/TopicChange/Data/activeForums.tsv"
    DH.printActiveForums(outFile)
Example #58
    def __init__(self, age, alive, features=None, gamma=1.0, gamma_beta=None, bias=True, normalize=True, verbose=False):
        """
        Initializes objects with parameters necessary to create the supporting
        objects: DataHandler and ShiftedBeta

        :param age: str
            The column name to identify the age of each individual. Age has to
            be an integer value, and will determine the time intervals the
            model will work with.
                --- See DataHandler.py

        :param alive: str
            The column name with the status of each individual. In the context
            of survival analysis, an individual may be dead or alive, and its
            contribution to the model will depend on it.
                --- See DataHandler.py

        :param features: str, list or None
            A string with the name of the column to be used as features, or a
            list of names of columns to be used as features or None, if no
            features are to be used.
                --- See DataHandler.py

        :param gamma: float
            A non-negative float specifying the strength of the regularization
            applied to w_alpha (alpha's weights) and, if gamma_beta is not
            given, it is also applied to beta.
                --- See ShiftedBeta.py

        :param gamma_beta: float
            A non-negative float specifying the strength of the regularization
            applied to w_beta (beta's weights). If specified, overwrites the
            value of gamma for beta.
                --- See ShiftedBeta.py

        :param bias: bool
            Whether or not a bias term should be added to the feature matrix.
                --- See DataHandler.py

        :param normalize: bool
            Whether or not numerical fields should be normalized (centered and
            scaled to have std=1)
                --- See DataHandler.py

        :param verbose: bool
            Whether or not status updates should be printed
                --- See ShiftedBeta.py
        """

        # Create objects!
        # DATA-HANDLER OBJECT
        # The DataHandler object may be created without the training data, so
        # we do it here.
        self.dh = DataHandler(age=age, alive=alive, features=features, bias=bias, normalize=normalize)

        # Shifted beta model object
        # Was a different gamma_beta parameter passed? If not, we use the
        # same value passed to gamma.
        if gamma_beta is None:
            gamma_beta = 1.0 * gamma
        # create shifted beta object
        self.sb = ShiftedBetaGeometric(gamma_alpha=gamma, gamma_beta=gamma_beta, verbose=verbose)
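
# A minimal sketch (not from the original source) of how the two supporting
# objects wired together in the constructor above could be built directly,
# using the DataHandler and ShiftedBetaGeometric signatures shown there. The
# import paths are assumptions based on the "--- See DataHandler.py /
# ShiftedBeta.py" notes, and the column names "age", "alive" and "plan" are
# illustrative placeholders.
from DataHandler import DataHandler
from ShiftedBeta import ShiftedBetaGeometric

dh = DataHandler(age='age', alive='alive', features=['plan'],
                 bias=True, normalize=True)
sb = ShiftedBetaGeometric(gamma_alpha=1.0, gamma_beta=1.0, verbose=False)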