async def CreateTask(request):
    task_request = json.loads(await request.json())

    # 1. check the archive file size and validate it
    if task_request.get("archive", ""):
        archive_size = len(task_request["archive"]["data"])
        if archive_size > Configs.MAX_SIZE:
            return web.Response(status=413,
                                body=json.dumps({
                                    "error": 413,
                                    "reason": "Request Entity Too Large",
                                    "description": ""
                                }),
                                content_type="application/json")

    # 2. add task to DB
    task_doc = DataTransformer.TaskRequest2TaskDoc(task_request)
    DBoperation = MongoDB.MonoDBOperation()
    result = await DBoperation.AddDB(task_doc)
    task_id = result["data"]
    task_doc["id"] = task_id
    task_doc.pop("_id")

    # 3. send response to client
    response = web.Response(status=200,
                            body=json.dumps(task_doc),
                            content_type="application/json")
    await response.prepare(request)
    await response.write_eof()

    # 4. make work dir for this task
    await IO.MakeWorkDir(task_id, logger)

    # 5. extract file
    if task_request.get("archive", ""):
        await IO.FileExtract(task_id, task_request["archive"]["type"],
                             task_request["archive"]["data"], logger)

    return response
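# Hypothetical client-side call for the CreateTask handler above. The service
# URL, route and payload fields are assumptions, not taken from the snippet.
# Note that the handler json.loads() the already-parsed body, so the payload is
# sent here as a JSON-encoded string.
import asyncio
import json

import aiohttp


async def create_task():
    payload = {"name": "demo-task",
               "archive": {"type": "zip", "data": "<base64-encoded bytes>"}}
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8080/v1/tasks",
                                json=json.dumps(payload)) as resp:
            print(resp.status, await resp.json())


asyncio.run(create_task())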
class BatchLoader(object):
    """
    This class abstracts away the loading of images.
    Images can either be loaded singly, or in a batch. The latter is used for
    the asynchronous data layer to preload batches while other processing is
    performed.
    """

    def __init__(self, params, result):
        # initialize
        self._cur = 0  # current image
        self.transformer = DataTransformer()
        # self.list_samples, self.indexlist, self.mirror and self.scale_factors
        # are expected to be populated from params before loading starts
        print("BatchLoader initialized with {} images".format(
            len(self.list_samples)))

    def load_next_sample(self):
        """
        Load the next image in a batch.
        """
        # Did we finish an epoch?
        if self._cur == len(self.list_samples):
            self._cur = 0
            shuffle(self.list_samples)

        # Load rgb and its different types of gt from one sample
        index = self.indexlist[self._cur]  # Get the sample index
        image_file_name = self.list_samples[index][0]
        seg_file_name = self.list_samples[index][1]
        depth_file_name = self.list_samples[index][2]
        surface_file_name = self.list_samples[index][3]
        contour_file_name = self.list_samples[index][4]

        im = cv2.imread(image_file_name, 1)
        seg = cv2.imread(seg_file_name, 0)
        depth = cv2.imread(depth_file_name, -1)
        surface = cv2.imread(surface_file_name, -1)
        contour = cv2.imread(contour_file_name, 0)

        # do a simple horizontal flip as data augmentation
        if self.mirror:
            flip = np.random.choice(2) * 2 - 1
            # slicing on axis 1 works for both single- and multi-channel images
            im = im[:, ::flip]
            seg = seg[:, ::flip]
            depth = depth[:, ::flip]
            surface = surface[:, ::flip]
            contour = contour[:, ::flip]

        # randomly pick a scale from scale factors
        scale_num = len(self.scale_factors)
        self.scale = self.scale_factors[np.random.choice(scale_num)]

        self._cur += 1
        return self.transformer.preprocess(im, seg, depth, surface, contour)
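# A minimal, hypothetical sketch of the DataTransformer.preprocess used by
# BatchLoader above. The real class is not shown in this snippet; the mean
# values, dtype handling and the HWC -> CHW transpose here are assumptions,
# not the project's actual settings.
import numpy as np


class DataTransformer(object):

    def __init__(self, mean=(104.0, 117.0, 123.0)):
        # assumed per-channel BGR mean
        self.mean = np.array(mean, dtype=np.float32)

    def preprocess(self, im, seg, depth, surface, contour):
        # subtract the mean and move channels first for the network input
        im = im.astype(np.float32) - self.mean
        im = im.transpose((2, 0, 1))
        return im, seg, depth, surface, contour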
def add_extra_features(data, original_column_name):
    # append a squared value for each column
    added_column_names = dt.append_squared_value_for_columns(data, original_column_name)
    # append a cubed value for each column
    dt.append_cubed_value_for_columns(data, original_column_name)
    # append a combination of all the features
    dt.append_features_combined_with_each_other(data, original_column_name)
    dt.append_features_combined_with_each_other(data, added_column_names[:-1])
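# Hypothetical sketch of the squared-feature helper called above; the real
# dt.append_squared_value_for_columns is not shown in these snippets.
def append_squared_value_for_columns(data, column_names):
    added = []
    for col in column_names:
        new_col = col + "_squared"
        data[new_col] = data[col] ** 2  # add the squared feature in place
        added.append(new_col)
    return added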
def runTransformer(FolderName, NumFilesToProcess):
    fileList = os.listdir(str(FolderName))
    print("------------------------------------------")
    print(fileList)
    print("------------------------------------------")

    desiredFiles = []
    for i in range(NumFilesToProcess):
        desiredFiles.append(fileList[i])
    print(len(desiredFiles))

    termSets = []
    for dfile in desiredFiles:
        termSet = []
        # rawPageContent = DataTransformer.openRawHTMLsTxt()
        firstFiltered = DataTransformer.getCoreList(str(FolderName) + "/" + str(dfile))
        secondFiltered = DataTransformer.filterCoreList(firstFiltered)
        for term in secondFiltered:
            termSet.append(term)
        termSets.append((str(dfile), termSet))

    for termSet in termSets:
        outputTerm(termSet, str(termSet[0]))

    return termSets
def runner(IndexFolderName, ContentFolderName, QueriesTxtName, K):
    f = open("./Output.txt", "w", encoding='utf-8')
    queries, content = RankedRetrieval.tokenTransform(str(QueriesTxtName))
    for query in queries:
        f.write(str(content[queries.index(query)]) + "\n")  # write raw query
        f.write(str(query[1]) + "\n")  # write tokenized and data-transformed query
        scoreList = RankedRetrieval.getScore(query[1])
        if len(scoreList) < int(K):
            for score in scoreList:
                f.write(str(score[0]) + " " + str(score[0]) + "\n")  # write documentID <tab> documentName
                somewhatCoreContent = DataTransformer.getCoreList(
                    "./" + str(ContentFolderName) + "/" + str(score[0]) + ".txt")
                filteredContent = DataTransformer.filterCoreList(somewhatCoreContent)
                snippet = str(filteredContent)[:200]
                f.write(snippet + "\n")  # write first 200 bytes
                f.write(str(score[2]) + "\n")  # write score
                for token in query[1]:
                    f.write(str(token) + ": " + str(score[1][query[1].index(token)]) + " ")
                # f.write(str(query[1][0]) + ": " + str(score[1][0]) + " "
                #         + str(query[1][1]) + ": " + str(score[1][1]) + " "
                #         + str(query[1][2]) + ": " + str(score[1][2]) + " "
                #         + str(query[1][3]) + ": " + str(score[1][3]) + "\n")  # write contribution
                f.write("\n")
        if len(scoreList) >= int(K):
            for i in range(int(K)):
                f.write(str(scoreList[i][0]) + " " + str(scoreList[i][0]) + "\n")  # write documentID <tab> documentName
                somewhatCoreContent = DataTransformer.getCoreList(
                    "./" + str(ContentFolderName) + "/" + str(scoreList[i][0]) + ".txt")
                filteredContent = DataTransformer.filterCoreList(somewhatCoreContent)
                snippet = str(filteredContent)[:200]
                f.write(snippet + "\n")  # write first 200 bytes
                f.write(str(scoreList[i][2]) + "\n")  # write score
                for token in query[1]:
                    f.write(str(token) + ": " + str(scoreList[i][1][query[1].index(token)]) + " ")
                # f.write(str(query[1][0]) + ": " + str(scoreList[i][1][0]) + " "
                #         + str(query[1][1]) + ": " + str(scoreList[i][1][1]) + " "
                #         + str(query[1][2]) + ": " + str(scoreList[i][1][2]) + " "
                #         + str(query[1][3]) + ": " + str(scoreList[i][1][3]) + "\n")  # write contribution
                f.write("\n")
        f.write("\n")
        f.write("\n")
    f.close()
async def KillTask(request):
    task_id = request.match_info.get("task_id", "Wrong Task ID")
    logger.debug("KillTask: task id check = {}".format(task_id))
    try:
        ObjectId(task_id)
    except Exception:
        logger.debug("Task ID invalid = {}".format(str(task_id)))
        return web.Response(status=404,
                            body=json.dumps({
                                "error": 404,
                                "reason": "Invalid Task ID",
                                "description": ""
                            }))

    DBoperation = MongoDB.MonoDBOperation()
    query_result = await DBoperation.QueryDB({"_id": ObjectId(task_id)})
    if not len(query_result["data"]):
        return web.Response(status=404,
                            body=json.dumps({
                                "error": 404,
                                "description": "",
                                "reason": "the task doesn't exist"
                            }))

    task_doc = query_result["data"][0]
    # check task status
    if task_doc["status"] == "DEPLOYING" or task_doc["status"] == "RUNNING":
        # modify task status to KILLING
        task_doc["status"] = "KILLING"
        logger.debug("check updated task doc = {}".format(task_doc))
        update_result = await DBoperation.UpdateDBbyReplace(
            {"_id": ObjectId(task_id)}, task_doc)

        # return response to client
        task_doc = update_result["data"]
        logger.debug("check update result = {}".format(task_doc))
        task_doc["id"] = str(task_doc["_id"])
        task_doc.pop("_id")
        response = web.Response(status=200, body=json.dumps(task_doc))
        await response.prepare(request)
        await response.write_eof()

        # submit kill request to marathon
        marathon_request = DataTransformer.TaskDoc2MarathonRequest(
            task_doc, task_id)
        marathon_response = requests.delete(
            Configs.Marathon_ADDRESS +
            "/v2/apps/{app_id}".format(app_id="mlge1." + task_id))
        status = marathon_response.status_code
        marathon_response = marathon_response.json()
        logger.debug("kill request committed successfully")
        if status >= 400:
            # marathon cannot kill the task
            logger.error(
                "marathon kill task failed, status = {} and response = {}".format(
                    status, marathon_response))
            return web.Response(status=409,
                                body=json.dumps({
                                    "error": 409,
                                    "reason": "marathon failure, task status: {}".format(status),
                                    "description": ""
                                }))

        # update task doc
        query_result = await DBoperation.QueryDB({"_id": ObjectId(task_id)})
        task_doc = query_result["data"][0]
        task_doc["status"] = "KILLED"
        # check task status
        logger.debug("check updated task doc = {}".format(task_doc))
        result = await DBoperation.UpdateDBbyReplace(
            {"_id": ObjectId(task_id)}, task_doc)
        if result["status"] != 200:
            logger.error("update task doc error, update doc = {}".format(task_doc))
        logger.debug("task doc DB updated successfully")
        return response
    else:
        return web.Response(status=409,
                            body=json.dumps({
                                "error": 409,
                                "reason": "status failure, task status: {}".format(task_doc["status"]),
                                "description": ""
                            }))
async def LaunchTask(request):
    # 1. check task id validity
    task_id = request.match_info.get("task_id", "Wrong Task ID")
    task_request = json.loads(await request.json())
    logger.debug("launchtask task id check = {}".format(task_id))
    try:
        ObjectId(task_id)
    except Exception:
        logger.debug("Task ID invalid = {}".format(str(task_id)))
        return web.Response(status=404,
                            body=json.dumps({
                                "error": 404,
                                "reason": "Invalid Task ID",
                                "description": ""
                            }))

    # 2. query task doc info
    DBoperation = MongoDB.MonoDBOperation()
    query_result = await DBoperation.QueryDB({"_id": ObjectId(task_id)})
    if len(query_result["data"]) == 0:
        return web.Response(status=404,
                            body=json.dumps({
                                "error": 404,
                                "reason": "Invalid Task ID",
                                "description": ""
                            }))

    # 3. update task doc info
    task_doc = query_result["data"][0]
    task_doc["status"] = "WAITING"
    logger.debug("check updated task doc = {}".format(task_doc))
    update_result = await DBoperation.UpdateDBbyReplace(
        {"_id": ObjectId(task_id)}, task_doc)

    # 4. return response to client
    task_doc = update_result["data"]
    logger.debug("check update result = {}".format(task_doc))
    task_doc["id"] = str(task_doc["_id"])
    task_doc.pop("_id")
    response = web.Response(status=200, body=json.dumps(task_doc))
    await response.prepare(request)
    await response.write_eof()

    # 5. submit task to marathon
    marathon_request = DataTransformer.TaskDoc2MarathonRequest(task_doc, task_id)
    # marathon_response, status = await MarathonLayer.MarathonPost(
    #     "http://192.168.64.57:8080/v2/apps", json_data=marathon_request)
    marathon_response = requests.post(Configs.Marathon_ADDRESS + "/v2/apps",
                                      json=marathon_request)
    status = marathon_response.status_code
    marathon_response = marathon_response.json()
    logger.debug("marathon task committed successfully")
    if status >= 400:
        logger.error(
            "marathon create task failed, status = {} and response = {}".format(
                status, marathon_response))

    # 6. update task doc with the marathon response
    update_task_doc = DataTransformer.MarathonResponse2TaskDoc(marathon_response)
    result = await DBoperation.UpdateDBbyReplace({"_id": ObjectId(task_id)},
                                                 update_task_doc)
    if result["status"] != 200:
        logger.error("update task doc error, update doc = {}".format(update_task_doc))
    logger.debug("task doc DB updated successfully")
    return response
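# Hypothetical wiring of the handlers above into an aiohttp application. The
# route paths, HTTP methods and port are assumptions; the actual server entry
# point is not shown in these snippets.
from aiohttp import web

app = web.Application()
app.add_routes([
    web.post("/v1/tasks", CreateTask),
    web.post("/v1/tasks/{task_id}/launch", LaunchTask),
    web.delete("/v1/tasks/{task_id}", KillTask),
])

if __name__ == "__main__":
    web.run_app(app, port=8080)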
__author__ = "Haoxiang Ma" import DataTransformer import IndexCreator import sys import os if __name__ == "__main__": # set parameters folderName = str(sys.argv[1]) folderName = folderName + "/" if not folderName.endswith( "/") else folderName numFilesToProcess = int(sys.argv[2]) fileNames = os.listdir(folderName) # read documents from local, record documentID, content, and documentName documentList = list() for i, fileName in zip(range(1, numFilesToProcess + 1), fileNames): with open(folderName + fileName) as document: documentList.append((i, document.read(), fileName)) # firstly, transform the text data d = DataTransformer.DataTransformer(documentList, folderName) d.transform() # secondly, create index by using the transformed data in the transfomer creator = IndexCreator.IndexCreator(d) creator.createTermIDFile() creator.createDocumentIDFile() creator.createInvertedIndex()
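# Example invocation of the indexing script above (assuming it is saved as
# main.py and that ./documents holds the raw text files; both names are
# illustrative, not taken from the snippet):
#     python main.py ./documents 10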
import inspect

import pandas as pd

# DataTransformer and DataNormalizer are project-local classes assumed to be
# importable from the surrounding package


class DataManager:

    def __init__(self, dmdf):
        self.dmdf = self.preprocess(dmdf, 1400, 5)
        self.transformer = DataTransformer()
        self.normalizer = DataNormalizer()

    def preprocess(self, dmdf, elo, turns):
        dmdf = dmdf.sort_values('elo', ascending=False)
        dmdf = dmdf.drop_duplicates(subset='battle_url', keep='first')
        dmdf = dmdf.drop(['battle_url'], axis=1)
        dmdf = dmdf.dropna()
        dmdf = dmdf[dmdf.elo != 2019]  # Should maybe not do this
        dmdf = dmdf[dmdf.elo >= elo]
        dmdf = dmdf[dmdf.num_turns > turns]
        return dmdf

    def create_analytics_base_table(self):
        return self.normalizer.normalize(self.transformer.transform(self.dmdf))


if __name__ == "__main__":
    data = pd.read_csv('battle_data.csv')
    dm = DataManager(data).create_analytics_base_table()
    dt = DataTransformer()
    print([func for func in dir(dt)
           if callable(getattr(dt, func)) and not func.startswith("__")])
    print([str(inspect.signature(getattr(dt, func))) for func in dir(dt)
           if callable(getattr(dt, func)) and not func.startswith("__")])
    print([type(getattr(dt, func)) for func in dir(dt)
           if callable(getattr(dt, func)) and not func.startswith("__")])
def test_get_highest_speed_flag(self):
    dt = DataTransformer()
    r1 = 'Pikachu, Voltorb'
    r2 = 'Bulbasaur, Voltorb'
    winner = 'TIE'
    self.assertEqual(dt.get_highest_speed_flag(r1, r2), winner)
if len(target_full) != len(train_full):
    raise Exception("Training set and target column size must be equal!")

# ADD SOME EXTRA ENGINEERED FEATURES
add_extra_features(train_full, train_cols)

# Add gender column
le_sex = preprocessing.LabelEncoder()
train_full['gender'] = le_sex.fit_transform(df[df.columns[2]])
# enc.fit(df[df.columns[2]])
# train_full['gender'] = enc.transform()

dt.print_columns(train_full)
print(train_full.head())

# scaler = preprocessing.StandardScaler()
# train_full_scaled = scaler.fit_transform(train_full)
# new_columns = train_full.columns
# train_full = pd.DataFrame(train_full_scaled, columns=new_columns)

size = int(len(train_full) * 0.81)
rows = random.sample(list(train_full.index), size)
train = train_full.loc[rows]
target = target_full.loc[rows]
validation = train_full.drop(rows)
validation_target = target_full.drop(rows)

print("\nTrain rows = {}\n".format(len(train)))
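# An equivalent, more idiomatic way to produce the same 81/19 split with
# scikit-learn; shown only as an alternative sketch to the manual
# random.sample split above (the random_state value is arbitrary).
from sklearn.model_selection import train_test_split

train, validation, target, validation_target = train_test_split(
    train_full, target_full, train_size=0.81, random_state=42)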