class InterfaceWrapper:
    """Command-line front end that dispatches to the per-stage interface scripts.

    The ``-operation`` argument selects one stage (sampling, feature
    extraction, modeling, classification) or the default flow, which chains
    feature extraction -> modeling -> classification. Every stage is run as a
    separate ``python <_prefix>/<script>`` process through ``os.system``,
    forwarding the parsed arguments as rendered by ``arg_obj.get_string()``.

    NOTE(review): this file contains a second ``class InterfaceWrapper``
    definition further down; being the later one, it is what the name is
    bound to after the module is loaded. Confirm which copy is meant to stay.
    """

    # Parameters that must all be present before the default flow may start.
    # Mapper and reducer params might be optional, so they are not listed.
    _DEFAULT_FLOW_PARAMS = (
        "fe_mapper",
        "fe_reducer",
        "train_dataset",
        "train_size",
        "test_size",
        "cl_mapper",
        "cl_reducer",
    )

    def __init__(self):
        self.arg_obj = ArgParser()

    # method to show help message
    def show_help(self):
        """Print the usage text for every supported operation, then exit.

        Raises:
            SystemExit: always; the process ends with the default (zero)
                status, as it did with the previous ``exit()`` call.
        """
        print(":( not enough params")
        print("usage: python run.py -operation <operation_name> <parameters for operation>")
        print("******supported operations******")
        # The pieces are joined with one space so the emitted text stays
        # byte-identical to the former Python 2 ``print a, b`` statements.
        print(" ".join((
            "(1) operation name: sampling",
            " parameters: -sampler <sampler_implementation> -positive <positive_source_file> -negative <negative_source_file> -train_size <train_set_size> -test_size <test_set_size>",
        )))
        print(" ".join((
            "(2) operation name: extract_features",
            " parameters: -fe_mapper <feature_extraction_mapper> -fe_mapper_params <mapper_params> -fe_reducer <feature_extraction_reducer> -fe_reducer_params <reducer_params> -train_dataset <dataset_location> -train_size <train_set_size> -test_size <test_set_size>",
        )))
        print(" ".join((
            "(3) operation name: modeling",
            "\n-->parameters when using single feature set file: -feature_set_location <feature_set_file_location> -model_name <model_name_to_save_as>",
            "\n-->parameters when using directory containing feature set files: -feature_set_location <feature_set_location_directory>",
        )))
        print(" ".join((
            "(4) operation name: classification",
            " parameters: -cl_mapper <classification_mapper> -cl_mapper_params <mapper_params> -cl_reducer <classification_reducer> -cl_reducer_params <reducer_params> -test_dataset <dataset_location> -model <model_file>",
        )))
        # sys.exit instead of the site-provided exit(), which is meant for the
        # interactive shell and is absent when the site module is not loaded.
        sys.exit()

    def run(self):
        """Parse ``sys.argv`` and run the operation named by ``-operation``.

        Shows the help text (and exits) when no operation was supplied. An
        operation value matching none of the known constants does nothing.
        """
        self.arg_obj.parse(sys.argv)
        separator = '*' * 100
        print(separator)
        print("Run got arguments: %s" % (self.arg_obj.args,))
        print(separator)
        if "operation" not in self.arg_obj.args:
            self.show_help()

        operation = self.arg_obj.args["operation"]
        if operation == SAMPLING:
            self.run_sampling()
        elif operation == FEATURE_EXTRACTION:
            self.run_feature_extraction()
        elif operation == MODELING:
            self.run_modeling()
        elif operation == CLASSIFICATION:
            self.run_classification()
        elif operation == DEFAULT:
            self.run_default_flow()

    def _launch_interface(self, script_name, stage_label, with_banner=True):
        """Build, announce and invoke the command line for one interface script.

        Returns whatever ``invoke`` returns for the command.
        """
        _cmd = "python " + _prefix + "/" + script_name + " " + self.arg_obj.get_string()
        separator = '*' * 100
        if with_banner:
            print(separator)
        # No space after the newline: matches the former Python 2 output.
        print("run: Invoking " + stage_label + ":\n" + _cmd)
        if with_banner:
            print(separator)
        return self.invoke(_cmd)

    # method to run sampling
    def run_sampling(self):
        """Launch ``sampler_interface.py`` with the parsed arguments."""
        return self._launch_interface("sampler_interface.py", "Sampling")

    # method to run feature extraction
    def run_feature_extraction(self):
        """Launch ``feature_extraction_interface.py`` with the parsed arguments."""
        # This stage never printed the star banner; keep it that way.
        return self._launch_interface(
            "feature_extraction_interface.py", "Feature Extraction", with_banner=False)

    # method to run modeling
    def run_modeling(self):
        """Launch ``modeler_interface.py`` with the parsed arguments."""
        return self._launch_interface("modeler_interface.py", "Modeling")

    # method to run the classification
    def run_classification(self):
        """Launch ``classification_interface.py`` with the parsed arguments."""
        return self._launch_interface("classification_interface.py", "Classification")

    # method to check if all the necessary parameters are provided for default flow
    def check_params(self):
        """Show the help text and exit unless every default-flow parameter is present.

        Raises:
            SystemExit: via ``show_help`` when a required parameter is missing.
        """
        supplied = self.arg_obj.args
        if any(name not in supplied for name in self._DEFAULT_FLOW_PARAMS):
            self.show_help()

    # method to run default behavior- here framework handles everything
    # user just needs to provide required parameters
    def run_default_flow(self):
        """Run feature extraction, then modeling, then classification.

        Each later stage only starts when the directory the previous stage is
        expected to produce exists. The flow stops at the first missing
        directory, so classification is never started against a leftover
        ``./trained_models`` directory without the ``model`` argument set.
        """
        self.check_params()
        # launch feature extraction
        self.run_feature_extraction()

        # if feature extraction was successful then proceed for modeling
        if not os.path.exists("./feature_set_for_modeling"):
            print("unable to find the directory 'feature_set_for_modeling'")
            return
        print("Launching modeler with extracted feature set...")
        self.run_modeling()
        # set the model parameter for the classification stage
        self.arg_obj.args["model"] = (
            "trained_models/" + str(self.arg_obj.args["train_size"]) + "_output.model")

        # if the modeling was successful then proceed for classification
        if not os.path.exists("./trained_models"):
            print("unable to find the directory 'trained_models'")
            return
        print("Launching classification with trained model from ./trained_models")
        self.run_classification()

    # method to invoke the commands
    def invoke(self, cmd):
        """Run ``cmd`` through ``os.system`` and return its exit status."""
        return os.system(cmd)
class FeatureExtractionInterface:
    """Drive a Hadoop-streaming feature-extraction job from parsed arguments.

    ``launch`` runs the whole sequence: optional sampling, clearing the HDFS
    input and output directories, uploading the training set, running the
    streaming job and merging its output into ``feature_set_for_modeling/``.
    Every step shells out through ``os.system`` and then sleeps for a fixed
    number of seconds.

    NOTE(review): ``feature_extraction_mapper``, ``feature_extraction_reducer``,
    ``mapper_param``, ``reducer_param`` and ``training_set_size`` start as
    ``None`` and nothing in this class assigns them. The caller must set at
    least the mapper and reducer before ``start_feature_extraction_job``,
    otherwise the string concatenation there raises ``TypeError``.
    """

    # Parameters every step requires; mapper and reducer params might be
    # optional, so they are not listed.
    _REQUIRED_PARAMS = (
        "fe_mapper",
        "fe_reducer",
        "train_dataset",
        "train_size",
        "test_size",
    )

    def __init__(self, data_location=None):
        self.arg_obj = ArgParser()
        self.data_location = data_location
        self.sampler = None
        self.feature_extraction_mapper = None
        self.feature_extraction_reducer = None
        self.mapper_param = None
        self.reducer_param = None
        self.training_set_size = None
        self.test_set_size = None

    # method to show help message
    def show_help(self):
        """Print the usage text, then exit.

        Raises:
            SystemExit: always; default (zero) status, as with the previous
                ``exit()`` call.
        """
        # Local import: this class does not otherwise need ``sys``.
        import sys
        print(":( not enough params")
        print("usage: python feature_extraction_interface.py -fe_mapper <feature_extraction_mapper> -fe_mapper_params <mapper_params> -fe_reducer <feature_extraction_reducer> -fe_reducer_params <reducer_params> -train_dataset <dataset_location> -train_size <train_set_size> -test_size <test_set_size>")
        sys.exit()

    # method that checks if required parameters are there or not
    def check_params(self):
        """Verify that every required parameter was supplied.

        Returns:
            True when all required parameters are present.

        Raises:
            SystemExit: when any required parameter is missing. The help text
                is printed first; this method never returns False.
        """
        import sys
        supplied = self.arg_obj.args
        if any(name not in supplied for name in self._REQUIRED_PARAMS):
            self.show_help()
            # Safety net kept from the original: still terminate if a
            # subclass overrides show_help without exiting.
            sys.exit()
        return True

    def _run_shell(self, command, pause_seconds):
        """Run ``command`` through ``os.system``, then sleep ``pause_seconds``."""
        os.system(command)
        time.sleep(pause_seconds)

    # method that invokes sampling
    def invoke_sampling(self):
        """Run ``sampler_interface.py`` with the parsed arguments.

        The original comment says the positive and negative instance files are
        assumed to be named 'positive_instances' and 'negative_instances';
        nothing in this method enforces that.

        NOTE(review): the previous code picked a default sampler
        ("lib.sampler.random_sampler.RandomSampler") when ``-sampler`` was
        absent but never added it to the command, so it had no effect. The
        unused local was removed; confirm whether the default should be passed.
        """
        self.check_params()
        _cmd = "python " + _prefix + "/" + "sampler_interface.py " + self.arg_obj.get_string()
        self._run_shell(_cmd, 5)

    # method that removes the dataset directory on HDFS
    def remove_dataset_dir_on_hdfs(self):
        """Remove ``/user/hadoop/feature_extraction_input`` from HDFS."""
        self.check_params()
        self._run_shell("hadoop fs -rmr /user/hadoop/feature_extraction_input", 5)

    # method that removes the output directory on HDFS
    def remove_output_dir_on_hdfs(self):
        """Remove ``/user/hadoop/feature_extraction_output`` from HDFS."""
        self.check_params()
        self._run_shell("hadoop fs -rmr /user/hadoop/feature_extraction_output", 5)

    # method that loads the dataset into HDFS
    def load_data_set_on_hdfs(self):
        """Upload ``Train/train_set_w_tags`` into the HDFS input directory."""
        self.check_params()
        self._run_shell(
            "hadoop fs -put Train/train_set_w_tags /user/hadoop/feature_extraction_input/", 10)

    def _build_streaming_command(self):
        """Return the ``hadoop jar`` command line for the streaming job."""
        # Mapper/reducer plus their optional params are wrapped in single
        # quotes so each is passed to -mapper / -reducer as one argument.
        mapper_call = self.feature_extraction_mapper
        if self.mapper_param is not None:
            mapper_call = mapper_call + " " + self.mapper_param
        reducer_call = self.feature_extraction_reducer
        if self.reducer_param is not None:
            reducer_call = reducer_call + " " + self.reducer_param
        return (
            "hadoop jar /home/hadoop/contrib/streaming/hadoop-streaming-1.0.3.jar"
            " -input /user/hadoop/feature_extraction_input"
            " -mapper '" + mapper_call
            + "' -file " + self.feature_extraction_mapper
            + " -reducer '" + reducer_call
            + "' -file " + self.feature_extraction_reducer
            + " -file glossextractionengine.mod"
            " -output /user/hadoop/feature_extraction_output"
            " -jobconf mapred.job.name='GlossExtractionEngine:FeatureExtraction'"
        )

    # method that starts the feature extraction job
    def start_feature_extraction_job(self):
        """Run the Hadoop-streaming feature-extraction job.

        Raises:
            TypeError: if the mapper or reducer attribute is still ``None``.
        """
        self.check_params()
        print("Launching map-reduce feature extraction task...")
        self._run_shell(self._build_streaming_command(), 5)
        # Printed unconditionally; the exit status of the job is not checked.
        print("feature extraction task completed.")

    # method that exports the result of feature extraction from HDFS to local file system
    def export_output_from_hdfs(self):
        """Merge the HDFS job output into ``feature_set_for_modeling/<size>_output.txt``.

        ``<size>`` is ``str(self.training_set_size)``. A previous file of the
        same name is removed first.

        Raises:
            OSError: if the output directory cannot be created or the stale
                file cannot be removed.
        """
        self.check_params()
        file_name = str(self.training_set_size) + "_output.txt"
        local_path = "./feature_set_for_modeling/" + file_name

        # create the output directory for feature extraction job
        if not os.path.exists("feature_set_for_modeling"):
            os.mkdir("feature_set_for_modeling")

        # remove previous version of the feature set file in the output directory
        if os.path.exists(local_path):
            print("FeatureExtractionInterface: File already exists.. removing it : " + local_path)
            os.remove(local_path)

        # get the merged output from HDFS
        os.system("hadoop fs -getmerge /user/hadoop/feature_extraction_output " + local_path)
        print("Saved output[Feature set for modeling] at : feature_set_for_modeling/" + file_name)

    # method to perform sequence of operations before launching a map-reduce job
    def launch(self):
        """Run the full feature-extraction sequence.

        Sampling is only invoked when a ``sampler`` argument was supplied.
        """
        self.check_params()
        if "sampler" in self.arg_obj.args:
            # interact with sampling interface for sampling
            self.invoke_sampling()
        self.remove_dataset_dir_on_hdfs()
        self.remove_output_dir_on_hdfs()
        self.load_data_set_on_hdfs()
        # start the feature extraction job
        self.start_feature_extraction_job()
        self.export_output_from_hdfs()
class InterfaceWrapper:
    """Command-line front end that dispatches to the per-stage interface scripts.

    The ``-operation`` argument selects one stage (sampling, feature
    extraction, modeling, classification) or the default flow, which chains
    feature extraction -> modeling -> classification. Every stage is run as a
    separate ``python <_prefix>/<script>`` process through ``os.system``,
    forwarding the parsed arguments as rendered by ``arg_obj.get_string()``.

    NOTE(review): an earlier ``class InterfaceWrapper`` definition exists
    higher up in this file; this later definition rebinds the name and is the
    one in effect after the module is loaded. Confirm which copy should stay.
    """

    # Parameters that must all be present before the default flow may start.
    # Mapper and reducer params might be optional, so they are not listed.
    _DEFAULT_FLOW_PARAMS = (
        "fe_mapper",
        "fe_reducer",
        "train_dataset",
        "train_size",
        "test_size",
        "cl_mapper",
        "cl_reducer",
    )

    def __init__(self):
        self.arg_obj = ArgParser()

    # method to show help message
    def show_help(self):
        """Print the usage text for every supported operation, then exit.

        Raises:
            SystemExit: always; the process ends with the default (zero)
                status, as it did with the previous ``exit()`` call.
        """
        print(":( not enough params")
        print("usage: python run.py -operation <operation_name> <parameters for operation>")
        print("******supported operations******")
        # The pieces are joined with one space so the emitted text stays
        # byte-identical to the former Python 2 ``print a, b`` statements.
        print(" ".join((
            "(1) operation name: sampling",
            " parameters: -sampler <sampler_implementation> -positive <positive_source_file> -negative <negative_source_file> -train_size <train_set_size> -test_size <test_set_size>",
        )))
        print(" ".join((
            "(2) operation name: extract_features",
            " parameters: -fe_mapper <feature_extraction_mapper> -fe_mapper_params <mapper_params> -fe_reducer <feature_extraction_reducer> -fe_reducer_params <reducer_params> -train_dataset <dataset_location> -train_size <train_set_size> -test_size <test_set_size>",
        )))
        print(" ".join((
            "(3) operation name: modeling",
            "\n-->parameters when using single feature set file: -feature_set_location <feature_set_file_location> -model_name <model_name_to_save_as>",
            "\n-->parameters when using directory containing feature set files: -feature_set_location <feature_set_location_directory>",
        )))
        print(" ".join((
            "(4) operation name: classification",
            " parameters: -cl_mapper <classification_mapper> -cl_mapper_params <mapper_params> -cl_reducer <classification_reducer> -cl_reducer_params <reducer_params> -test_dataset <dataset_location> -model <model_file>",
        )))
        # sys.exit instead of the site-provided exit(), which is meant for the
        # interactive shell and is absent when the site module is not loaded.
        sys.exit()

    def run(self):
        """Parse ``sys.argv`` and run the operation named by ``-operation``.

        Shows the help text (and exits) when no operation was supplied. An
        operation value matching none of the known constants does nothing.
        """
        self.arg_obj.parse(sys.argv)
        separator = '*' * 100
        print(separator)
        print("Run got arguments: %s" % (self.arg_obj.args,))
        print(separator)
        if "operation" not in self.arg_obj.args:
            self.show_help()

        operation = self.arg_obj.args["operation"]
        if operation == SAMPLING:
            self.run_sampling()
        elif operation == FEATURE_EXTRACTION:
            self.run_feature_extraction()
        elif operation == MODELING:
            self.run_modeling()
        elif operation == CLASSIFICATION:
            self.run_classification()
        elif operation == DEFAULT:
            self.run_default_flow()

    def _launch_interface(self, script_name, stage_label, with_banner=True):
        """Build, announce and invoke the command line for one interface script.

        Returns whatever ``invoke`` returns for the command.
        """
        _cmd = "python " + _prefix + "/" + script_name + " " + self.arg_obj.get_string()
        separator = '*' * 100
        if with_banner:
            print(separator)
        # No space after the newline: matches the former Python 2 output.
        print("run: Invoking " + stage_label + ":\n" + _cmd)
        if with_banner:
            print(separator)
        return self.invoke(_cmd)

    # method to run sampling
    def run_sampling(self):
        """Launch ``sampler_interface.py`` with the parsed arguments."""
        return self._launch_interface("sampler_interface.py", "Sampling")

    # method to run feature extraction
    def run_feature_extraction(self):
        """Launch ``feature_extraction_interface.py`` with the parsed arguments."""
        # This stage never printed the star banner; keep it that way.
        return self._launch_interface(
            "feature_extraction_interface.py", "Feature Extraction", with_banner=False)

    # method to run modeling
    def run_modeling(self):
        """Launch ``modeler_interface.py`` with the parsed arguments."""
        return self._launch_interface("modeler_interface.py", "Modeling")

    # method to run the classification
    def run_classification(self):
        """Launch ``classification_interface.py`` with the parsed arguments."""
        return self._launch_interface("classification_interface.py", "Classification")

    # method to check if all the necessary parameters are provided for default flow
    def check_params(self):
        """Show the help text and exit unless every default-flow parameter is present.

        Raises:
            SystemExit: via ``show_help`` when a required parameter is missing.
        """
        supplied = self.arg_obj.args
        if any(name not in supplied for name in self._DEFAULT_FLOW_PARAMS):
            self.show_help()

    # method to run default behavior- here framework handles everything
    # user just needs to provide required parameters
    def run_default_flow(self):
        """Run feature extraction, then modeling, then classification.

        Each later stage only starts when the directory the previous stage is
        expected to produce exists. The flow stops at the first missing
        directory, so classification is never started against a leftover
        ``./trained_models`` directory without the ``model`` argument set.
        """
        self.check_params()
        # launch feature extraction
        self.run_feature_extraction()

        # if feature extraction was successful then proceed for modeling
        if not os.path.exists("./feature_set_for_modeling"):
            print("unable to find the directory 'feature_set_for_modeling'")
            return
        print("Launching modeler with extracted feature set...")
        self.run_modeling()
        # set the model parameter for the classification stage
        self.arg_obj.args["model"] = (
            "trained_models/" + str(self.arg_obj.args["train_size"]) + "_output.model")

        # if the modeling was successful then proceed for classification
        if not os.path.exists("./trained_models"):
            print("unable to find the directory 'trained_models'")
            return
        print("Launching classification with trained model from ./trained_models")
        self.run_classification()

    # method to invoke the commands
    def invoke(self, cmd):
        """Run ``cmd`` through ``os.system`` and return its exit status."""
        return os.system(cmd)