def __init__(self, params, result):
        # initialize
        self._cur = 0  # current image
        self.transformer = DataTransformer()

        print "BatchLoader initialized with {} images".format(
            len(self.list_samples))
Example #2
async def CreateTask(request):
    task_request = json.loads(await request.json())

    # 1 check the archive file size and reject oversized archives
    if task_request.get("archive", ""):
        archive_size = len(task_request["archive"]["data"])
        if archive_size > Configs.MAX_SIZE:
            return web.Response(status=413,
                                body=json.dumps({
                                    "error": 413,
                                    "reason": "Request Entity Too Large",
                                    "description": ""
                                }),
                                content_type="application/json")
    # 2 add task to DB
    task_doc = DataTransformer.TaskRequest2TaskDoc(task_request)
    DBoperation = MongoDB.MonoDBOperation()
    result = await DBoperation.AddDB(task_doc)
    task_id = result["data"]
    task_doc["id"] = task_id
    task_doc.pop("_id")
    # 3 send response to client
    response = web.Response(status=200,
                            body=json.dumps(task_doc),
                            content_type="application/json")
    await response.prepare(request)
    await response.write_eof()
    # 4 make work dir for this task
    await IO.MakeWorkDir(task_id, logger)
    # 5 extract file
    if task_request.get("archive", ""):
        await IO.FileExtract(task_id, task_request["archive"]["type"],
                             task_request["archive"]["data"], logger)
    return response
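
CreateTask responds to the client early (prepare/write_eof) and then performs the work-directory setup and archive extraction after the response has gone out. Below is a minimal sketch of how handlers in this style could be wired into an aiohttp application; the route paths are assumptions, not taken from the project, and LaunchTask/KillTask are the handlers shown in later examples.

from aiohttp import web

# Hypothetical wiring; the real project defines its own routes and app setup.
app = web.Application()
app.add_routes([
    web.post("/tasks", CreateTask),                   # assumed path
    web.post("/tasks/{task_id}/launch", LaunchTask),  # handler shown in a later example
    web.delete("/tasks/{task_id}", KillTask),         # handler shown in a later example
])

if __name__ == "__main__":
    web.run_app(app, port=8080)
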
Example #3
class BatchLoader(object):

    """
    This class abstracts away the loading of images.
    Images can either be loaded singly, or in a batch. The latter is used for
    the asynchronous data layer to preload batches while other processing is
    performed.
    """

    def __init__(self, params, result):
        # initialize
        self._cur = 0  # current image
        self.transformer = DataTransformer()

        print "BatchLoader initialized with {} images".format(
            len(self.list_samples))

    def load_next_sample(self):
        """
        Load the next image in a batch.
        """
        # Did we finish an epoch?
        if self._cur == len(self.list_samples):
            self._cur = 0
            shuffle(self.list_samples)

        # Load the rgb image and its different types of ground truth for one sample
        index = self.indexlist[self._cur]  # Get the sample index
        image_file_name = self.list_samples[index][0]
        seg_file_name = self.list_samples[index][1]
        depth_file_name = self.list_samples[index][2]
        surface_file_name = self.list_samples[index][3]
        contour_file_name = self.list_samples[index][4]

        img = cv2.imread(image_file_name, 1)         # color
        seg = cv2.imread(seg_file_name, 0)           # grayscale
        depth = cv2.imread(depth_file_name, -1)      # unchanged (e.g. 16-bit)
        surface = cv2.imread(surface_file_name, -1)  # unchanged
        contour = cv2.imread(contour_file_name, 0)   # grayscale

        # do a simple horizontal flip as data augmentation
        if self.mirror:
            flip = np.random.choice(2) * 2 - 1
            # flip along the width axis; [:, ::flip] works for 2D and 3D arrays
            img = img[:, ::flip, :]
            seg = seg[:, ::flip]
            depth = depth[:, ::flip]
            surface = surface[:, ::flip]
            contour = contour[:, ::flip]

        # randomly pick a scale from the scale factors
        scale_num = len(self.scale_factors)
        self.scale = self.scale_factors[np.random.choice(scale_num)]

        self._cur += 1
        return self.transformer.preprocess(img, seg, depth, surface, contour)
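
As the docstring above notes, BatchLoader is meant to feed an asynchronous data layer that preloads a batch while other processing runs. A rough usage sketch follows; the params keys and batch size are assumptions, since the full __init__ is not shown in this excerpt.

# Hypothetical usage sketch; the 'params' contents are placeholders.
loader = BatchLoader(params={'split': 'train', 'mirror': True}, result={})

def load_batch(batch_size=4):
    # preload a full batch; each entry is the preprocessed
    # (img, seg, depth, surface, contour) tuple from the transformer
    return [loader.load_next_sample() for _ in range(batch_size)]

batch = load_batch()
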
Example #4
def add_extra_features(data, original_column_name):
    # append a squared value for each column
    added_column_names = dt.append_squared_value_for_columns(data, original_column_name)
    # append a cubed value for each column
    dt.append_cubed_value_for_columns(data, original_column_name)
    # append a combination of all the features
    dt.append_features_combined_with_each_other(data, original_column_name)

    # also combine the squared-feature columns (except the last) with each other
    dt.append_features_combined_with_each_other(data, added_column_names[:-1])
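
The dt helpers used above are not shown in this excerpt. The following is a self-contained sketch of the same idea (squared, cubed, and pairwise-product features) in plain pandas; the function and column names are illustrative, not the dt API.

import pandas as pd
from itertools import combinations

def add_polynomial_features(df, columns):
    # append squared and cubed versions of each column
    for col in columns:
        df[col + "_sq"] = df[col] ** 2
        df[col + "_cube"] = df[col] ** 3
    # append pairwise products of the original columns
    for a, b in combinations(columns, 2):
        df[a + "_x_" + b] = df[a] * df[b]
    return df

# illustrative usage with made-up data
df = pd.DataFrame({"age": [22, 35, 58], "fare": [7.25, 53.1, 8.05]})
add_polynomial_features(df, ["age", "fare"])
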
Example #5
def runTransformer(FolderName, NumFilesToProcess):
    fileList = os.listdir(str(FolderName))
    print("------------------------------------------")
    print(fileList)
    print("------------------------------------------")

    desiredFiles = fileList[:NumFilesToProcess]
    print(len(desiredFiles))
    termSets = []
    for dfile in desiredFiles:
        # rawPageContent = DataTransformer.openRawHTMLsTxt()
        firstFiltered = DataTransformer.getCoreList(str(FolderName) + "/" + str(dfile))
        secondFiltered = DataTransformer.filterCoreList(firstFiltered)
        termSet = list(secondFiltered)
        termSets.append((str(dfile), termSet))
    for termSet in termSets:
        outputTerm(termSet, str(termSet[0]))

    return termSets
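
A usage sketch for runTransformer; the folder name and file count below are placeholders. Each processed page is written out via outputTerm, and the term sets are also returned.

# Placeholder folder name and count.
termSets = runTransformer("raw_pages", 10)
for fileName, terms in termSets:
    print(fileName, len(terms))
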
Example #6
class DataManager:
    def __init__(self, dmdf):

        self.dmdf = self.preprocess(dmdf, 1400, 5)
        self.transformer = DataTransformer()
        self.normalizer = DataNormalizer()

    def preprocess(self, dmdf, elo, turns):
        # sort by elo so drop_duplicates keeps the highest-rated record per battle
        dmdf = dmdf.sort_values('elo', ascending=False)
        dmdf = dmdf.drop_duplicates(subset='battle_url', keep='first')
        dmdf = dmdf.drop(['battle_url'], axis=1)
        dmdf = dmdf.dropna()
        dmdf = dmdf[dmdf.elo != 2019]  # Should maybe not do this
        dmdf = dmdf[dmdf.elo >= elo]
        dmdf = dmdf[dmdf.num_turns > turns]
        return dmdf


    def create_analytics_base_table(self):
        return self.normalizer.normalize(self.transformer.transform(self.dmdf))
Example #7
def runner(IndexFolderName, ContentFolderName, QueriesTxtName, K):
    f = open("./Output.txt", "w", encoding='utf-8')

    queries, content = RankedRetrieval.tokenTransform(str(QueriesTxtName))
    for query in queries:
        f.write(str(content[queries.index(query)]) + "\n")  #write raw query
        f.write(str(query[1]) +
                "\n")  #write tokenized and data transformed query
        scoreList = RankedRetrieval.getScore(query[1])

        if (len(scoreList) < int(K)):
            for score in scoreList:
                f.write(str(score[0]) + "  " + str(score[0]) +
                        "\n")  #write documentID <tab> documentName

                somewhatCoreContent = DataTransformer.getCoreList(
                    "./" + str(ContentFolderName) + "/" + str(score[0]) +
                    ".txt")
                filteredContent = DataTransformer.filterCoreList(
                    somewhatCoreContent)
                snippet = str(filteredContent)[:200]
                f.write(snippet + "\n")  #write first 200 bytes

                f.write(str(score[2]) + "\n")  #write score
                for token in query[1]:
                    f.write(
                        str(token) + ": " +
                        str(score[1][query[1].index(token)]) + " ")
                # f.write(str(query[1][0]) + ": "+ str(score[1][0]) + " "
                # + str(query[1][1]) + ": "+ str(score[1][1]) + " "
                # + str(query[1][2]) + ": "+ str(score[1][2]) + " "
                # + str(query[1][3]) + ": "+ str(score[1][3]) + "\n") #write contribution

                f.write("\n")

        if (len(scoreList) >= int(K)):
            for i in range(int(K)):
                f.write(
                    str(scoreList[i][0]) + "  " + str(scoreList[i][0]) +
                    "\n")  #write documentID <tab> documentName

                somewhatCoreContent = DataTransformer.getCoreList(
                    "./" + str(ContentFolderName) + "/" +
                    str(scoreList[i][0]) + ".txt")
                filteredContent = DataTransformer.filterCoreList(
                    somewhatCoreContent)
                snippet = str(filteredContent)[:200]
                f.write(snippet + "\n")  #write first 200 bytes

                f.write(str(scoreList[i][2]) + "\n")  #write score
                for token in query[1]:
                    f.write(
                        str(token) + ": " +
                        str(scoreList[i][1][query[1].index(token)]) + " ")

                # f.write(str(query[1][0]) + ": "+ str(scoreList[i][1][0]) + " "
                # + str(query[1][1]) + ": "+ str(scoreList[i][1][1]) + " "
                # + str(query[1][2]) + ": "+ str(scoreList[i][1][2]) + " "
                # + str(query[1][3]) + ": "+ str(scoreList[i][1][3]) + "\n") #write contribution

                f.write("\n")

        f.write("\n")
        f.write("\n")
    f.close()
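
A usage sketch for runner; all four arguments below are placeholders. The ranked-retrieval results are written to ./Output.txt as in the code above.

# Placeholder arguments: index folder, content folder, queries file, top-K.
runner("IndexFolder", "ContentFolder", "queries.txt", 5)
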
Example #8
async def KillTask(request):
    task_id = request.match_info.get("task_id", "Wrong Task ID")
    logger.debug("KillTask: task id check = {}".format(task_id))
    try:
        ObjectId(task_id)
    except Exception:
        logger.debug("Task ID invalid = {}".format(str(task_id)))
        return web.Response(status=404,
                            body=json.dumps({
                                "error": 404,
                                "reason": "Invalid Task ID",
                                "description": ""
                            }))
    DBoperation = MongoDB.MonoDBOperation()
    query_result = await DBoperation.QueryDB({"_id": ObjectId(task_id)})
    if not len(query_result["data"]):
        return web.Response(status=404,
                            body=json.dumps({
                                "error": 404,
                                "description": "",
                                "reason": "the task doesn't exist"
                            }))
    task_doc = query_result["data"][0]
    # check task status
    if task_doc["status"] == "DEPLOYING" or task_doc["status"] == "RUNNING":
        # modify task status to KILLING
        task_doc["status"] = "KILLING"
        logger.debug("check updated task doc = {}".format(task_doc))
        update_result = await DBoperation.UpdateDBbyReplace(
            {"_id": ObjectId(task_id)}, task_doc)
        # return response to client
        task_doc = update_result["data"]
        logger.debug("check update result = {}".format(task_doc))
        task_doc["id"] = str(task_doc["_id"])
        task_doc.pop("_id")
        response = web.Response(status=200, body=json.dumps(task_doc))
        await response.prepare(request)
        await response.write_eof()
        # submit kill request to marathon
        marathon_request = DataTransformer.TaskDoc2MarathonRequest(
            task_doc, task_id)
        marathon_response = requests.delete(
            Configs.Marathon_ADDRESS +
            "/v2/apps/{app_id}".format(app_id="mlge1." + task_id), )
        status = marathon_response.status_code
        marathon_response = marathon_response.json()
        logger.debug("kill request commit successfully")
        if status >= 400:
            # marathon cannot kill task
            logger.error(
                "marathon kill task failed , status = {} and response = {}".
                format(status, marathon_response))
            return web.Response(
                status=409,
                body=json.dumps({
                    "error": 409,
                    "reason": "marathon failure, task status: {}".format(status),
                    "description": ""
                }))
        # update task doc

        query_result = await DBoperation.QueryDB({"_id": ObjectId(task_id)})

        task_doc = query_result["data"][0]
        task_doc["status"] = "KILLED"
        # check task status
        logger.debug("check updated task doc = {}".format(task_doc))
        result = await DBoperation.UpdateDBbyReplace(
            {"_id": ObjectId(task_id)}, task_doc)
        if result["status"] != 200:
            logger.error(
                "update task doc error, update doc = {}".format(task_doc))
        logger.debug("task doc DB update successfully")
        return response
    else:
        return web.Response(status=409,
                            body=json.dumps({
                                "error": 409,
                                "reason": "status failure, task status: {}".format(
                                    task_doc["status"]),
                                "description": ""
                            }))
Example #9
async def LaunchTask(request):
    # 1. check task id validation
    task_id = request.match_info.get("task_id", "Wrong Task ID")
    task_request = json.loads(await request.json())
    logger.debug("launchtask  task id check = {}".format(task_id))
    try:
        ObjectId(task_id)
    except Exception:
        logger.debug("Task ID invalid = {}".format(str(task_id)))
        return web.Response(status=404,
                            body=json.dumps({
                                "error": 404,
                                "reason": "Invalid Task ID",
                                "description": ""
                            }))
    # 2. query task doc info
    DBoperation = MongoDB.MonoDBOperation()
    query_result = await DBoperation.QueryDB({"_id": ObjectId(task_id)})
    if len(query_result["data"]) == 0:
        return web.Response(status=404,
                            body=json.dumps({
                                "error": 404,
                                "reason": "Invalid Task ID",
                                "description": ""
                            }))

    # 3. update task doc info
    task_doc = query_result["data"][0]
    task_doc["status"] = "WAITING"
    logger.debug("check updated task doc = {}".format(task_doc))
    update_result = await DBoperation.UpdateDBbyReplace(
        {"_id": ObjectId(task_id)}, task_doc)

    # 4. return response to client
    task_doc = update_result["data"]
    logger.debug("check update result = {}".format(task_doc))
    task_doc["id"] = str(task_doc["_id"])
    task_doc.pop("_id")
    response = web.Response(status=200, body=json.dumps(task_doc))
    await response.prepare(request)
    await response.write_eof()

    # submit task to marathon
    marathon_request = DataTransformer.TaskDoc2MarathonRequest(
        task_doc, task_id)
    # marathon_response, status = await MarathonLayer.MarathonPost(
    #    "http://192.168.64.57:8080/v2/apps", json_data=marathon_request)
    marathon_response = requests.post(Configs.Marathon_ADDRESS + "/v2/apps",
                                      json=marathon_request)
    status = marathon_response.status_code
    marathon_response = marathon_response.json()
    logger.debug("marathon task commit successfully")
    if status >= 400:
        logger.error(
            "marathon create task failed , status = {} and response = {}".
            format(status, marathon_response))
    # update task doc
    update_task_doc = DataTransformer.MarathonResponse2TaskDoc(
        marathon_response)
    result = await DBoperation.UpdateDBbyReplace({"_id": ObjectId(task_id)},
                                                 update_task_doc)
    if result["status"] != 200:
        logger.error(
            "update task doc error, update doc = {}".format(update_task_doc))
    logger.debug("task doc DB update successfully")
    return response
Example #10
__author__ = "Haoxiang Ma"

import DataTransformer
import IndexCreator
import sys
import os

if __name__ == "__main__":

    # set parameters
    folderName = str(sys.argv[1])
    folderName = folderName + "/" if not folderName.endswith(
        "/") else folderName
    numFilesToProcess = int(sys.argv[2])
    fileNames = os.listdir(folderName)

    # read documents from local, record documentID, content, and documentName
    documentList = list()
    for i, fileName in zip(range(1, numFilesToProcess + 1), fileNames):
        with open(folderName + fileName) as document:
            documentList.append((i, document.read(), fileName))

    # firstly, transform the text data
    d = DataTransformer.DataTransformer(documentList, folderName)
    d.transform()

    # secondly, create the index using the transformed data in the transformer
    creator = IndexCreator.IndexCreator(d)
    creator.createTermIDFile()
    creator.createDocumentIDFile()
    creator.createInvertedIndex()
Example #11
    def __init__(self, dmdf):

        self.dmdf = self.preprocess(dmdf, 1400, 5)
        self.transformer = DataTransformer()
        self.normalizer = DataNormalizer()
Example #12
class DataManager:
    def __init__(self, dmdf):

        self.dmdf = self.preprocess(dmdf, 1400, 5)
        self.transformer = DataTransformer()
        self.normalizer = DataNormalizer()

    def preprocess(self, dmdf, elo, turns):
        # sort by elo so drop_duplicates keeps the highest-rated record per battle
        dmdf = dmdf.sort_values('elo', ascending=False)
        dmdf = dmdf.drop_duplicates(subset='battle_url', keep='first')
        dmdf = dmdf.drop(['battle_url'], axis=1)
        dmdf = dmdf.dropna()
        dmdf = dmdf[dmdf.elo != 2019]  # Should maybe not do this
        dmdf = dmdf[dmdf.elo >= elo]
        dmdf = dmdf[dmdf.num_turns > turns]
        return dmdf


    def create_analytics_base_table(self):
        return self.normalizer.normalize(self.transformer.transform(self.dmdf))

if __name__ = "__main__":
    data = pd.read_csv('battle_data.csv')
    dm = DataManager(data).create_analytics_base_table()

    dt = DataTransformer()
    print([func for func in dir(dt) if callable(getattr(dt, func)) and not func.startswith("__")])
    print([str(inspect.signature(getattr(dt, func))) for func in dir(dt) if callable(getattr(dt, func)) and not func.startswith("__")])
    print([type(getattr(dt, func)) for func in dir(dt) if callable(getattr(dt, func)) and not func.startswith("__")])

Example #13
    def test_get_highest_speed_flag(self):
        dt = DataTransformer()
        r1 = 'Pikachu, Voltorb'
        r2 = 'Bulbasaur, Voltorb'
        winner = 'TIE'
        self.assertEqual(dt.get_highest_speed_flag(r1, r2), winner)
Example #14
    if len(target_full) != len(train_full):
        raise Exception("Training set and target column size must be equal!")


    #ADD SOME EXTRA ENGINEERED FEATURES
    add_extra_features(train_full, train_cols)


    # Add gender column
    le_sex = preprocessing.LabelEncoder()
    train_full['gender'] = le_sex.fit_transform(df[df.columns[2]])
    #enc.fit(df[df.columns[2]])
    #train_full['gender'] = enc.transform()

    dt.print_columns(train_full)
    print(train_full.head())

    #scaler = preprocessing.StandardScaler()
    #train_full_scaled = scaler.fit_transform(train_full)
    #new_columns = train_full.columns
    #train_full = pd.DataFrame(train_full_scaled, columns=new_columns)

    size = int(len(train_full) * 0.81)
    rows = random.sample(list(train_full.index), size)
    train = train_full.loc[rows]
    target = target_full.loc[rows]
    validation = train_full.drop(rows)
    validation_target = target_full.drop(rows)

    print("\nTrain rows = {}\n".format(len(train)))