class GeneticAlgorithm:
    """Evolve injection-code candidates with a genetic algorithm.

    An individual is a list of row indices into the gene table loaded from
    ``self.genes_path``.  Each individual is rendered into the HTML
    template, checked by an external HTML checker (tidy) and then loaded in
    the browser through ``Utilty.check_individual_selenium``.
    """

    def __init__(self, template, browser):
        """Store collaborators and load settings from ``config.ini``.

        Args:
            template: Template object; only ``render(dict)`` is called on it.
            browser: Browser handle handed to the Selenium helper; its
                ``name`` and ``capabilities`` attributes are read.
        """
        self.util = Utilty()
        self.template = template
        self.obj_browser = browser

        # Read config.ini (expected next to this module).
        full_path = os.path.dirname(os.path.abspath(__file__))
        config = configparser.ConfigParser()
        try:
            # NOTE(review): ConfigParser.read() skips files it cannot open
            # instead of raising, so a missing config.ini shows up as a
            # KeyError on the lookups below rather than in this handler.
            config.read(self.util.join_path(full_path, 'config.ini'))
        except FileExistsError as e:
            self.util.print_message(FAIL, 'File exists error: {}'.format(e))
            sys.exit(1)

        common = config['Common']
        genetic = config['Genetic']
        join_path = self.util.join_path

        # Common setting value.
        self.wait_time = float(common['wait_time'])
        self.html_dir = join_path(full_path, common['html_dir'])
        self.html_template = common['html_template']
        self.html_template_path = join_path(self.html_dir, self.html_template)
        self.html_file = common['ga_html_file']
        self.result_dir = join_path(full_path, common['result_dir'])

        # Genetic Algorithm setting value.
        self.genom_length = int(genetic['genom_length'])
        self.max_genom_list = int(genetic['max_genom_list'])
        self.select_genom = int(genetic['select_genom'])
        self.individual_mutation_rate = float(
            genetic['individual_mutation_rate'])
        self.genom_mutation_rate = float(genetic['genom_mutation_rate'])
        self.max_generation = int(genetic['max_generation'])
        self.max_fitness = int(genetic['max_fitness'])
        self.gene_dir = join_path(full_path, genetic['gene_dir'])
        self.genes_path = join_path(self.gene_dir, genetic['gene_file'])
        html_checker_dir = join_path(full_path, genetic['html_checker_dir'])
        self.html_checker = join_path(html_checker_dir,
                                      genetic['html_checker_file'])
        self.html_checker_option = genetic['html_checker_option']
        self.html_checked_path = join_path(self.html_dir,
                                           genetic['html_checked_file'])
        # Template placeholders to evaluate, separated by '@' in the config.
        self.html_eval_place_list = genetic['html_eval_place'].split('@')
        self.bingo_score = float(genetic['bingo_score'])
        # NOTE(review): warning_score and error_score are loaded but
        # evaluation() uses hard-coded -0.1 / -1.0 weights - confirm which
        # one is intended.
        self.warning_score = float(genetic['warning_score'])
        self.error_score = float(genetic['error_score'])
        self.result_file = genetic['result_file']
        # Rows of [eval_place, genom_list, rendered string] for every
        # individual whose script was detected running.
        self.result_list = []

    # Create population.
    def create_genom(self, df_gene):
        """Create one random individual of ``self.genom_length`` genes.

        Args:
            df_gene: Gene table; only ``len(df_gene.index)`` is used, as the
                exclusive upper bound of the random gene indices.

        Returns:
            A ``Gene`` built from the random index list with evaluation 0.
        """
        last_gene = len(df_gene.index) - 1
        lst_gene = [
            random.randint(0, last_gene) for _ in range(self.genom_length)
        ]
        self.util.print_message(OK,
                                'Created individual : {}.'.format(lst_gene))
        return Gene(lst_gene, 0)

    # Evaluation.
    def evaluation(self, obj_ga, df_gene, eval_place, individual_idx):
        """Score one individual by HTML validity and script execution.

        Args:
            obj_ga: Individual; its ``genom_list`` attribute is read.
            df_gene: Gene table used to turn gene indices into a string.
            eval_place: Template placeholder the individual is rendered into.
            individual_idx: Number substituted for ``*`` in the HTML file name.

        Returns:
            ``(score, 0)`` on success, or ``(None, 1)`` when the checker
            report cannot be parsed or the Selenium helper reports an error.
        """
        # Build html syntax.
        indivisual = self.util.transform_gene_num2str(df_gene,
                                                      obj_ga.genom_list)
        html = self.template.render({eval_place: indivisual})
        eval_html_path = self.util.join_path(
            self.html_dir, self.html_file.replace('*', str(individual_idx)))
        with codecs.open(eval_html_path, 'w', encoding='utf-8') as fout:
            fout.write(html)

        # Evaluate html syntax using tidy.
        # NOTE(review): the command is passed as a single string without a
        # shell; on POSIX, Popen treats such a string as the program name.
        command = ' '.join([
            self.html_checker, self.html_checker_option,
            self.html_checked_path, eval_html_path
        ])
        env_tmp = os.environ.copy()
        env_tmp['PYTHONIOENCODING'] = locale.getpreferredencoding()
        checker = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   env=env_tmp)
        # Wait for the checker to exit (draining both pipes) so the report
        # file is complete before it is read.  The exit status is not
        # inspected; only the report text is used.
        checker.communicate()

        # Check html checked result.
        with codecs.open(self.html_checked_path, 'r',
                         encoding='utf-8') as fin:
            str_eval_result = fin.read()

        # Check warning and error number.
        str_pattern = r'.*Tidy found ([0-9]+) warnings and ([0-9]+) errors.*$'
        flattened = str_eval_result.replace('\t', '').replace(
            '\r', '').replace('\n', '')
        obj_match = re.match(str_pattern, flattened)
        if not obj_match:
            return None, 1

        # Compute score: every warning and every error lowers it.
        warning_penalty = int(obj_match.group(1)) * -0.1
        error_penalty = int(obj_match.group(2)) * -1.0
        int_score = warning_penalty + error_penalty

        # Evaluate running script using selenium.
        selenium_score, error_flag = self.util.check_individual_selenium(
            self.obj_browser, eval_html_path)
        if error_flag:
            return None, 1

        # Check result of selenium.
        if selenium_score > 0:
            self.util.print_message(
                OK, 'Detect running script: "{}" in {}.'.format(
                    indivisual, eval_place))

            # compute score for running script.
            int_score += self.bingo_score
            self.result_list.append(
                [eval_place, obj_ga.genom_list, indivisual])

        # Output evaluation results.
        self.util.print_message(
            OK, 'Evaluation result : Browser={} {}, '
            'Individual="{} ({})", '
            'Score={}'.format(self.obj_browser.name,
                              self.obj_browser.capabilities['version'],
                              indivisual, obj_ga.genom_list, str(int_score)))
        return int_score, 0

    # Select elite individual.
    def select(self, obj_ga, elite):
        """Return the ``elite`` individuals with the highest evaluation.

        Args:
            obj_ga: Iterable of individuals exposing ``evaluation``.
            elite: Number of individuals to keep.

        Returns:
            New list, best first; shorter than ``elite`` when the population
            is smaller than that.
        """
        ranked = sorted(obj_ga, reverse=True, key=lambda u: u.evaluation)
        return ranked[:elite]

    # Crossover (create offspring).
    def crossover(self, ga_first, ga_second):
        """Create two offspring from two parents by two-point crossover.

        Args:
            ga_first: First parent (``getGenom()`` is called).
            ga_second: Second parent (``getGenom()`` is called).

        Returns:
            List of two new ``Gene`` objects with evaluation 0.
        """
        # Setting of two-point crossover (second point is never before the
        # first one).
        cross_first = random.randint(0, self.genom_length)
        cross_second = random.randint(cross_first, self.genom_length)

        one = ga_first.getGenom()
        second = ga_second.getGenom()

        # Crossover: swap the middle segment between the parents.
        progeny_one = (one[:cross_first] + second[cross_first:cross_second] +
                       one[cross_second:])
        progeny_second = (second[:cross_first] +
                          one[cross_first:cross_second] +
                          second[cross_second:])
        return [Gene(progeny_one, 0), Gene(progeny_second, 0)]

    # Create population of next generation.
    def next_generation_gene_create(self, ga, ga_elite, ga_progeny):
        """Replace the weakest individuals with the elite and the offspring.

        Args:
            ga: Current population (individuals expose ``evaluation``).
            ga_elite: Elite individuals to carry over.
            ga_progeny: Offspring to add.

        Returns:
            New list: the survivors in ascending order of evaluation,
            followed by the elite and then the offspring.  When the
            replacements outnumber the population, no survivor remains.
        """
        ranked = sorted(ga, reverse=False, key=lambda u: u.evaluation)
        # Drop as many of the lowest-rated individuals as are being added.
        next_generation_geno = ranked[len(ga_elite) + len(ga_progeny):]
        next_generation_geno.extend(ga_elite)
        next_generation_geno.extend(ga_progeny)
        return next_generation_geno

    # Mutation.
    def mutation(self, obj_ga, induvidual_mutation, genom_mutation,
                 df_genes):
        """Randomly mutate individuals in place.

        Args:
            obj_ga: Iterable of individuals (``getGenom``/``setGenom``).
            induvidual_mutation: Threshold compared against a random value
                in [0, 1] to decide whether an individual mutates.
            genom_mutation: Same kind of threshold, applied per gene of a
                mutating individual.
            df_genes: Gene table; ``len(df_genes.index)`` bounds new genes.

        Returns:
            List holding the same individual objects in the same order.
        """
        last_gene = len(df_genes.index) - 1
        lst_ga = []
        for individual in obj_ga:
            # Mutation to individuals.
            if induvidual_mutation > (random.randint(0, 100) / Decimal(100)):
                lst_gene = []
                for gene in individual.getGenom():
                    # Mutation to genes.
                    if genom_mutation > (random.randint(0, 100) /
                                         Decimal(100)):
                        lst_gene.append(random.randint(0, last_gene))
                    else:
                        lst_gene.append(gene)
                individual.setGenom(lst_gene)
            lst_ga.append(individual)
        return lst_ga

    # Main control.
    def main(self):
        """Run the evolution for every evaluation place and save the hits.

        Returns:
            ``self.result_list``: rows of
            ``[eval_place, genom_list, rendered string]``.
        """
        # Load gene list.
        df_genes = pd.read_csv(self.genes_path, encoding='utf-8').fillna('')

        # Create saving file (only header).
        save_path = self.util.join_path(
            self.result_dir,
            self.result_file.replace('*', self.obj_browser.name))
        if not os.path.exists(save_path):
            pd.DataFrame([],
                         columns=['eval_place', 'sig_vector',
                                  'sig_string']).to_csv(save_path,
                                                        mode='w',
                                                        header=True,
                                                        index=False)

        # Rows before this position are not written again.
        saved_count = len(self.result_list)

        # Evaluate indivisual each evaluating place in html.
        for eval_place in self.html_eval_place_list:
            self.util.print_message(
                NOTE, 'Evaluating html place : {}'.format(eval_place))

            # Generate 1st generation.
            self.util.print_message(NOTE, 'Create population.')
            current_generation = [
                self.create_genom(df_genes)
                for _ in range(self.max_genom_list)
            ]

            # Stays empty when no generation is evaluated.
            elite_genes = []

            # Evaluate each generation.
            for int_count in range(1, self.max_generation + 1):
                self.util.print_message(
                    NOTE, 'Evaluate individual : {}/{} generation.'.format(
                        str(int_count), self.max_generation))
                for idx in range(self.max_genom_list):
                    self.util.print_message(
                        OK, 'Evaluation individual in {}: '
                        '{}/{} in {} generation'.format(
                            eval_place, idx + 1, self.max_genom_list,
                            str(int_count)))
                    evaluation_result, eval_status = self.evaluation(
                        current_generation[idx], df_genes, eval_place, idx)
                    if eval_status == 1:
                        # Evaluation failed: the individual keeps whatever
                        # evaluation it already had.
                        continue
                    current_generation[idx].setEvaluation(evaluation_result)
                    time.sleep(self.wait_time)

                # Select elite's individual.
                elite_genes = self.select(current_generation,
                                          self.select_genom)

                # Crossover of elite gene (index -1 pairs the first elite
                # with the last one).
                progeny_gene = []
                for i in range(0, self.select_genom):
                    progeny_gene.extend(
                        self.crossover(elite_genes[i - 1], elite_genes[i]))

                # Select elite group.
                next_generation_individual_group = (
                    self.next_generation_gene_create(current_generation,
                                                     elite_genes,
                                                     progeny_gene))

                # Mutation
                next_generation_individual_group = self.mutation(
                    next_generation_individual_group,
                    self.individual_mutation_rate, self.genom_mutation_rate,
                    df_genes)

                # Finish evolution computing for current generation.
                # Arrange fitness each individual.
                fits = [gene.getEvaluation() for gene in current_generation]

                # evaluate evolution result.
                flt_avg = sum(fits) / float(len(fits))
                self.util.print_message(
                    NOTE, '{} generation result: '
                    'Min={}, Max={}, Avg={}.'.format(int_count, min(fits),
                                                     max(fits), flt_avg))

                # Judge fitness.
                if flt_avg > self.max_fitness:
                    self.util.print_message(
                        NOTE,
                        'Finish evolution: average={}'.format(str(flt_avg)))
                    break

                # Replace current generation and next generation.
                current_generation = next_generation_individual_group

            # Save individuals.  The file already has its header, so only
            # the rows collected since the last save are appended, without
            # a second header row.
            new_results = self.result_list[saved_count:]
            if new_results:
                pd.DataFrame(new_results).to_csv(save_path,
                                                 mode='a',
                                                 header=False,
                                                 index=False)
                saved_count = len(self.result_list)

            # Output final result.
            if elite_genes:
                str_best_individual = ''.join(
                    str(df_genes.loc[gene_num].values[0])
                    for gene_num in elite_genes[0].getGenom())
                # NOTE(review): the middle replace() swaps '"' for '"' and
                # is a no-op as written - confirm the intended token.
                str_best_individual = str_best_individual.replace(
                    '%s', ' ').replace('"', '"').replace('%comma', ',')
                self.util.print_message(
                    NOTE, 'Best individual : "{}"'.format(str_best_individual))

        self.util.print_message(
            NOTE, 'Done creation of injection codes using Genetic Algorithm.')

        return self.result_list
class GAN:
    """Generate injection codes with a Generative Adversarial Network.

    ``main`` either trains the networks on the vectors saved by the genetic
    algorithm (when no trained generator weight file exists) or loads the
    trained generator and uses it to explore and synthesize new codes.
    """

    def __init__(self, template, browser):
        """Store collaborators, load ``config.ini`` and the gene table.

        Args:
            template: Template object; only ``render(dict)`` is called on it.
            browser: Browser handle handed to the Selenium helper; its
                ``name`` attribute is used in result file names.
        """
        self.util = Utilty()
        self.template = template
        self.obj_browser = browser

        # Read config.ini (expected next to this module).
        full_path = os.path.dirname(os.path.abspath(__file__))
        config = configparser.ConfigParser()
        try:
            # NOTE(review): ConfigParser.read() skips files it cannot open
            # instead of raising, so a missing config.ini shows up as a
            # KeyError on the lookups below rather than in this handler.
            config.read(self.util.join_path(full_path, 'config.ini'))
        except FileExistsError as e:
            self.util.print_message(FAIL, 'File exists error: {}'.format(e))
            sys.exit(1)

        common = config['Common']
        genetic = config['Genetic']
        gan = config['GAN']
        join_path = self.util.join_path

        # Common setting value.
        self.wait_time = float(common['wait_time'])
        self.html_dir = join_path(full_path, common['html_dir'])
        self.html_file = common['gan_html_file']
        self.result_dir = join_path(full_path, common['result_dir'])
        self.eval_html_path = join_path(self.html_dir, self.html_file)

        # Genetic Algorithm setting value.
        self.genom_length = int(genetic['genom_length'])
        self.gene_dir = join_path(full_path, genetic['gene_dir'])
        self.genes_path = join_path(self.gene_dir, genetic['gene_file'])
        self.ga_result_file = genetic['result_file']
        # Template placeholders to evaluate, separated by '@' in the config.
        self.eval_place_list = genetic['html_eval_place'].split('@')

        # Generative Adversarial Network setting value.
        self.input_size = int(gan['input_size'])
        self.batch_size = int(gan['batch_size'])
        self.num_epoch = int(gan['num_epoch'])
        self.max_sig_num = int(gan['max_sig_num'])
        self.max_explore_codes_num = int(gan['max_explore_codes_num'])
        self.max_synthetic_num = int(gan['max_synthetic_num'])
        self.weight_dir = join_path(full_path, gan['weight_dir'])
        self.gen_weight_file = gan['generator_weight_file']
        self.dis_weight_file = gan['discriminator_weight_file']
        self.gan_result_file = gan['result_file']
        self.gan_vec_result_file = gan['vec_result_file']
        self.generator = None

        # Load gene list.
        self.df_genes = pd.read_csv(self.genes_path,
                                    encoding='utf-8').fillna('')
        # Half the gene count: offset and scale for mapping gene indices
        # to and from the [-1, 1] range.
        self.flt_size = len(self.df_genes) / 2.0

        # Path of trained weight (generator weights of the last epoch).
        self.weight_path = join_path(
            self.weight_dir,
            self.gen_weight_file.replace('*', str(self.num_epoch - 1)))

    # Build generator model.
    def generator_model(self):
        """Build the generator: noise vector in, ``genom_length`` tanh out."""
        model = Sequential()
        model.add(
            Dense(input_dim=self.input_size,
                  output_dim=self.input_size * 10,
                  init='glorot_uniform'))
        model.add(LeakyReLU(0.2))
        model.add(Dropout(0.5))
        model.add(Dense(self.input_size * 10, init='glorot_uniform'))
        model.add(LeakyReLU(0.2))
        model.add(Dropout(0.5))
        model.add(Dense(self.input_size * 5, init='glorot_uniform'))
        model.add(LeakyReLU(0.2))
        model.add(Dropout(0.5))
        model.add(Dense(output_dim=self.genom_length, init='glorot_uniform'))
        model.add(Activation('tanh'))
        return model

    # Build discriminator model.
    def discriminator_model(self):
        """Build the discriminator: gene vector in, one sigmoid unit out."""
        model = Sequential()
        model.add(
            Dense(input_dim=self.genom_length,
                  output_dim=self.genom_length * 10,
                  init='glorot_uniform'))
        model.add(LeakyReLU(0.2))
        model.add(Dense(self.genom_length * 10, init='glorot_uniform'))
        model.add(LeakyReLU(0.2))
        model.add(Dense(1, init='glorot_uniform'))
        model.add(Activation('sigmoid'))
        return model

    # Create noise for inputting to generator.
    def _make_noise(self, count):
        """Return ``count`` rows of uniform [-1, 1) noise of ``input_size``."""
        return np.array([
            np.random.uniform(-1, 1, self.input_size) for _ in range(count)
        ])

    # Render, save and run one injection code.
    def _check_in_browser(self, eval_place, str_html):
        """Render ``str_html`` at ``eval_place`` and run it via selenium.

        Returns:
            The ``(selenium_score, error_flag)`` pair reported by
            ``Utilty.check_individual_selenium``.
        """
        html = self.template.render({eval_place: str_html})
        with codecs.open(self.eval_html_path, 'w', encoding='utf-8') as fout:
            fout.write(html)
        return self.util.check_individual_selenium(self.obj_browser,
                                                   self.eval_html_path)

    # Save rows to a CSV file.
    def _save_rows(self, rows, columns, save_path):
        """Create ``save_path`` with a header row, or append to it."""
        if os.path.exists(save_path):
            pd.DataFrame(rows).to_csv(save_path,
                                      mode='a',
                                      header=False,
                                      index=False)
        else:
            pd.DataFrame(rows, columns=columns).to_csv(save_path,
                                                       mode='w',
                                                       header=True,
                                                       index=False)

    # Train GAN model (generate injection codes).
    def train(self, list_sigs):
        """Train generator and discriminator on gene-index vectors.

        Every code generated during training is also evaluated in the
        browser.  Both networks' weights are saved after each epoch.

        Args:
            list_sigs: Training vectors, one list of gene indices per row.

        Returns:
            List of ``[eval_place, str_html]`` for the generated codes whose
            selenium score was above zero.
        """
        # Load train data (=ga result) and scale the indices to [-1, 1].
        X_train = np.array(list_sigs)
        X_train = (X_train.astype(np.float32) - self.flt_size) / self.flt_size

        # Build discriminator.
        discriminator = self.discriminator_model()
        d_opt = SGD(lr=0.1, momentum=0.1, decay=1e-5)
        discriminator.compile(loss='binary_crossentropy', optimizer=d_opt)

        # Build generator and discriminator (fixed weight of discriminator).
        discriminator.trainable = False
        self.generator = self.generator_model()
        dcgan = Sequential([self.generator, discriminator])
        g_opt = SGD(lr=0.1, momentum=0.3)
        dcgan.compile(loss='binary_crossentropy', optimizer=g_opt)

        # Execute train.  A trailing partial batch is not used.
        num_batches = len(X_train) // self.batch_size
        lst_scripts = []
        for epoch in range(self.num_epoch):
            for batch in range(num_batches):
                # Generate new injection code using noise.
                noise = self._make_noise(self.batch_size)
                generated_codes = self.generator.predict(noise, verbose=0)

                # Update weight of discriminator: real batch first, then
                # the generated batch, each with randomised labels.
                image_batch = X_train[batch * self.batch_size:(batch + 1) *
                                      self.batch_size]
                X = image_batch
                y = [random.uniform(0.7, 1.2) for _ in range(self.batch_size)]
                d_loss = discriminator.train_on_batch(X, y)
                X = generated_codes
                y = [random.uniform(0.0, 0.3) for _ in range(self.batch_size)]
                d_loss = discriminator.train_on_batch(X, y)

                # Update weight of generator.
                noise = self._make_noise(self.batch_size)
                g_loss = dcgan.train_on_batch(noise, [1] * self.batch_size)

                # Build HTML syntax from generated codes.
                for generated_code in generated_codes:
                    lst_genom = self.transform_code2gene(generated_code)
                    str_html = self.util.transform_gene_num2str(
                        self.df_genes, lst_genom)
                    self.util.print_message(
                        OK,
                        'Train GAN : epoch={}, batch={}, g_loss={}, d_loss={}, {} ({})'
                        .format(
                            epoch, batch, g_loss, d_loss,
                            np.round((generated_code * self.flt_size) +
                                     self.flt_size), str_html))

                    # Evaluate generated injection code.
                    for eval_place in self.eval_place_list:
                        selenium_score, error_flag = self._check_in_browser(
                            eval_place, str_html)
                        if error_flag:
                            continue

                        # Check generated individual using selenium.
                        if selenium_score > 0:
                            self.util.print_message(
                                WARNING,
                                'Detect running script: "{}" in {}.'.format(
                                    str_html, eval_place))

                            # Save running script.
                            lst_scripts.append([eval_place, str_html])

            # Save weights of network each epoch.
            self.generator.save_weights(
                self.util.join_path(
                    self.weight_dir,
                    self.gen_weight_file.replace('*', str(epoch))))
            discriminator.save_weights(
                self.util.join_path(
                    self.weight_dir,
                    self.dis_weight_file.replace('*', str(epoch))))

        return lst_scripts

    # Transform from generated codes to gene list.
    def transform_code2gene(self, generated_code):
        """Map generator output values back to gene-table row indices.

        Args:
            generated_code: Iterable of numbers; each is scaled by
                ``self.flt_size`` and shifted by ``self.flt_size``.

        Returns:
            List of ``int`` gene indices.
        """
        gene_count = len(self.df_genes)
        lst_genom = []
        for value in generated_code:
            gene_num = int(np.round((value * self.flt_size) + self.flt_size))
            # An input of exactly 1.0 lands one past the last row.
            if gene_num == gene_count:
                gene_num -= 1
            lst_genom.append(gene_num)
        return lst_genom

    # Mean of two vectors.
    def vector_mean(self, vector1, vector2):
        """Return the element-wise mean of two vectors."""
        return (vector1 + vector2) / 2

    # Main control.
    def main(self):
        """Explore/synthesize with trained weights, or train from GA results.

        When ``self.weight_path`` exists the trained generator is loaded,
        random noise is explored for valid codes and pairs of valid noise
        vectors are averaged into new codes.  Otherwise the GA result file
        is read and the networks are trained on its vectors.
        """
        # Define saving path.
        gan_save_path = self.util.join_path(
            self.result_dir,
            self.gan_result_file.replace('*', self.obj_browser.name))
        vec_save_path = self.util.join_path(
            self.result_dir,
            self.gan_vec_result_file.replace('*', self.obj_browser.name))

        # Start generating injection code.
        if os.path.exists(self.weight_path):
            # Load trained model.
            self.generator = self.generator_model()
            self.generator.load_weights('{}'.format(self.weight_path))

            # Explore the valid injection codes.
            valid_code_list = []
            result_list = []
            for idx in range(self.max_explore_codes_num):
                self.util.print_message(
                    NOTE, '{}/{} Explore valid injection code.'.format(
                        idx + 1, self.max_explore_codes_num))

                # Generate injection codes.
                noise = self._make_noise(1)
                generated_codes = self.generator.predict(noise, verbose=0)
                str_html = self.util.transform_gene_num2str(
                    self.df_genes,
                    self.transform_code2gene(generated_codes[0]))

                # Evaluate injection code using selenium.
                for eval_place in self.eval_place_list:
                    selenium_score, error_flag = self._check_in_browser(
                        eval_place, str_html)
                    if error_flag:
                        continue

                    # Check generated injection code.
                    if selenium_score > 0:
                        self.util.print_message(
                            WARNING,
                            'Find valid injection code: "{}" in {}.'.format(
                                str_html, eval_place))
                        valid_code_list.append([str_html, noise])
                        result_list.append([eval_place, str_html])

            # Save generated injection codes.
            self._save_rows(result_list, ['eval_place', 'injection_code'],
                            gan_save_path)

            # Synthesize injection codes.  Picking a random pair from an
            # empty list would raise, so synthesis is skipped in that case.
            synthetic_num = self.max_synthetic_num
            if synthetic_num > 0 and not valid_code_list:
                self.util.print_message(
                    WARNING,
                    'No valid injection code found; skip synthesizing.')
                synthetic_num = 0

            vector_result_list = []
            for idx in range(synthetic_num):
                noise_idx1 = np.random.randint(0, len(valid_code_list))
                noise_idx2 = np.random.randint(0, len(valid_code_list))
                origin_code1, origin_noise1 = valid_code_list[noise_idx1]
                origin_code2, origin_noise2 = valid_code_list[noise_idx2]
                self.util.print_message(
                    NOTE, '{}/{} Synthesize injection codes.'.format(
                        idx + 1, self.max_synthetic_num))
                self.util.print_message(
                    OK, 'Use two injection codes : ({}) + ({}).'.format(
                        origin_code1, origin_code2))

                # Generate injection codes.
                synthesized_noise = self.vector_mean(origin_noise1,
                                                     origin_noise2)
                generated_codes = self.generator.predict(synthesized_noise,
                                                         verbose=0)
                str_html = self.util.transform_gene_num2str(
                    self.df_genes,
                    self.transform_code2gene(generated_codes[0]))

                # Evaluate synthesized injection code using selenium.
                for eval_place in self.eval_place_list:
                    hit_flag = 'Failure'
                    selenium_score, error_flag = self._check_in_browser(
                        eval_place, str_html)
                    if error_flag:
                        continue

                    # Check synthesized injection code using selenium.
                    if selenium_score > 0:
                        self.util.print_message(
                            WARNING,
                            'Find running script: "{}".'.format(str_html))
                        hit_flag = 'Bingo'

                    # Save running script (hits and failures alike).
                    vector_result_list.append([
                        eval_place, str_html, origin_code1, origin_code2,
                        hit_flag
                    ])

            # Save synthesized injection codes.
            self._save_rows(vector_result_list, [
                'eval_place', 'synthesized_code', 'origin_code1',
                'origin_code2', 'bingo'
            ], vec_save_path)
        else:
            # Load created individuals by Genetic Algorithm.
            sig_path = self.util.join_path(
                self.result_dir,
                self.ga_result_file.replace('*', self.obj_browser.name))
            df_temp = pd.read_csv(sig_path, encoding='utf-8').fillna('')
            df_sigs = df_temp[~df_temp.duplicated()]

            # Extract genom list from ga result.  Rows that are not a list
            # of exactly genom_length integers (for example a stray header
            # row inside an appended result file) cannot be fed to the
            # discriminator and are skipped.
            list_sigs = []
            for raw_vector in df_sigs['sig_vector'].values:
                list_temp = str(raw_vector).replace('[', '').replace(
                    ']', '').split(',')
                try:
                    sig = [int(s) for s in list_temp]
                except ValueError:
                    sig = None
                if sig is None or len(sig) != self.genom_length:
                    self.util.print_message(
                        WARNING,
                        'Skip invalid sig_vector: {}'.format(raw_vector))
                    continue
                list_sigs.append(sig)

            # Generate individuals (=injection codes).
            lst_scripts = []
            target_sig_list = []
            for target_sig in list_sigs:
                self.util.print_message(
                    NOTE, 'Start generating injection codes using {}'.format(
                        target_sig))
                # Each signature is repeated max_sig_num times.
                target_sig_list.extend(
                    [target_sig for _ in range(self.max_sig_num)])
            lst_scripts.extend(self.train(target_sig_list))

            # Save generated injection codes.
            self._save_rows(lst_scripts, ['eval_place', 'injection_code'],
                            gan_save_path)

        self.util.print_message(
            NOTE,
            'Done generation of injection codes using Generative Adversarial Networks.'
        )