def anomaly_selection(files_map_file, anomalies_output_file, use_dbscan, differences_file=None, differences=None):
    """Select anomalous files and write them to a JSON file.

    Reads reconstruction differences (from *differences_file* if given,
    otherwise the pre-loaded *differences*), picks anomalies with either
    DBSCAN or the sigma-bound method, maps anomaly indexes back to file
    paths via *files_map_file*, and dumps the result to
    *anomalies_output_file*.

    Returns the number of anomalous files written.
    """
    if differences_file:
        # DBSCAN consumes the full per-feature matrix (binary format);
        # the sigma method consumes (index, value) pairs (ASCII/JSON).
        differences = binary_read(differences_file) if use_dbscan else ascii_read(differences_file)
    anomalies = dbscan_anomaly_selection(differences) if use_dbscan else three_sigma_anomaly_selection(differences)
    anomalies_write_time_logger = TimeLogger()
    with open(files_map_file) as map_descriptor:
        files_map = json.loads(map_descriptor.read())
    if use_dbscan:
        # DBSCAN yields bare indexes; map each to its file path.
        anomaly_files = [files_map[index] for index in anomalies]
    else:
        # Sigma selection yields (index, value) pairs; keep the value.
        anomaly_files = [(files_map[index], value) for index, value in anomalies]
    with open(anomalies_output_file, 'w') as output_descriptor:
        output_descriptor.write(json.dumps(anomaly_files))
    print('Anomaly list written. Time: ' + str(anomalies_write_time_logger.finish()))
    return len(anomaly_files)
def write_file(self):
    """Fetch this object's content from GitHub and save it as a .kt file.

    On rate limiting: logs the snippet number, sleeps 60 s, then retries
    recursively. On a missing object: logs the number and returns None.
    Any other fetch error is re-raised.
    """
    time_logger = TimeLogger()
    try:
        content = self.obj.decoded_content.decode('utf-8')
    except Exception as e:
        pprint(e)
        if isinstance(e, RateLimitExceededException):
            print('File is skipped. Waiting for 1 minute.')
            with open('rate_limit_exceeded_exceptions.log', 'a') as exceptions_descriptor:
                exceptions_descriptor.write(self.number + os.linesep)
            time.sleep(60)
            return self.write_file()
        elif isinstance(e, UnknownObjectException):
            print('File is skipped because not found.')
            with open('unknown_object_exceptions.log', 'a') as exceptions_descriptor:
                exceptions_descriptor.write(self.number + os.linesep)
            return None
        # BUG FIX: previously any other exception fell through to the
        # save below with `content` unbound, raising NameError and
        # masking the real error. Re-raise unknown failures instead.
        raise
    path = ContentSaver.save(self.directory, self.number, content, ext='kt')
    time_logger.finish(task_name='Write ' + path + ' (#' + str(self.number) + ') file')
def dbscan_anomaly_selection(differences):
    """Return indexes of samples that DBSCAN classifies as noise.

    Runs DBSCAN (eps=3, min_samples=5, Euclidean metric) over the
    difference matrix; samples labeled -1 (noise) are the anomalies.
    """
    dbscan_timer = TimeLogger()
    cluster_labels = DBSCAN(eps=3, min_samples=5, metric='euclidean').fit_predict(differences)
    noise_indexes = [index for index, label in enumerate(cluster_labels) if label == -1]
    print('DBSCAN finished its work. Time: ' + str(dbscan_timer.finish()))
    return noise_indexes
def ascii_read(differences_file):
    """Load (index, value) difference pairs from a JSON file.

    Returns two parallel lists: sample indexes and difference values.
    """
    read_timer = TimeLogger()
    with open(differences_file) as descriptor:
        difference_pairs = json.loads(descriptor.read())
    difference_indexes = [pair[0] for pair in difference_pairs]
    difference_values = [pair[1] for pair in difference_pairs]
    print('Differences read finished. Time: ' + str(read_timer.finish()))
    return difference_indexes, difference_values
def three_sigma_anomaly_selection(differences, sigma_multiplier=5):
    """Select anomalies whose difference lies outside mean ± k·std.

    NOTE(review): despite the "three sigma" name, the original code
    hard-coded a multiplier of 5 (while naming the bounds `*_3_sigma`).
    The multiplier is now an explicit parameter; its default of 5
    preserves the original behavior exactly.

    Args:
        differences: tuple of (index list, value list), as produced by
            ascii_read.
        sigma_multiplier: how many standard deviations define the
            normal band (default 5, matching the historical behavior).

    Returns:
        List of (index, value) pairs falling outside the band.
    """
    selection_timer = TimeLogger()
    difference_indexes, difference_values = differences
    mean = np.mean(difference_values)
    std_deviation = np.std(difference_values)
    lower_bound = mean - sigma_multiplier * std_deviation
    upper_bound = mean + sigma_multiplier * std_deviation
    anomalies = [
        (difference_indexes[i], value)
        for i, value in enumerate(difference_values)
        if value < lower_bound or value > upper_bound
    ]
    print('3-sigma anomaly selection finished its work. Time: ' + str(selection_timer.finish()))
    return anomalies
def binary_write(differences, features_number, output_file):
    """Append the difference matrix to *output_file* as packed floats.

    The matrix is flattened column-major ('F'), the feature count is
    appended as the final float (so binary_read can recover the shape),
    and the packed bytes are appended to the file in large chunks.
    """
    chunking_timer = TimeLogger()
    flattened = differences.flatten('F')
    flattened = np.append(flattened, features_number)
    packed_bytes = struct.pack('=%df' % flattened.size, *flattened)
    chunk_size = 10000000
    difference_chunks = funcy.chunks(chunk_size, packed_bytes)
    print('Chunking finished. Time: ' + str(chunking_timer.finish()))
    for chunk_number, chunk in enumerate(difference_chunks, start=1):
        # Reopen in append mode per chunk so partial progress survives
        # an interruption mid-write.
        with open(output_file, 'ab') as descriptor:
            chunk_timer = TimeLogger()
            descriptor.write(chunk)
            print('Write difference chunk ' + str(chunk_number) + ' is done. Time: ' + str(chunk_timer.finish()))
# CLI wiring for the pipeline stages. NOTE(review): this chunk ends
# mid-assignment (`anomalies_number =` with a line continuation) — the
# rest of the statement is outside this view, so the code is left
# untouched.
parser.add_argument('--files_map_file', nargs=1, type=str, help='path to file with map dataset indexes and ast file paths')
parser.add_argument('--anomalies_output_file', '-o', nargs=1, type=str, help='path to file, which will contain anomaly list (as paths to AST code snippets)')
args = parser.parse_args()
stage = args.stage
if stage == 'autoencoding':
    # nargs=1 makes each parsed value a one-element list — hence the [0]s.
    dataset_file = args.dataset[0]
    split_percent = args.split_percent[0]
    encoding_dim_percent = args.encoding_dim_percent[0]
    output_file = args.differences_output_file[0]
    use_dbscan = args.use_dbscan
    total_time_logger = TimeLogger()
    # full_differences=use_dbscan: DBSCAN needs the whole matrix later.
    autoencoding(dataset_file, split_percent, encoding_dim_percent, output_file, full_differences=use_dbscan)
    print('==============================')
    print('Autoencoder finished its work. Time: ' + str(total_time_logger.finish()))
elif stage == 'anomaly_selection':
    differences_file = args.differences_file[0]
    files_map_file = args.files_map_file[0]
    anomalies_output_file = args.anomalies_output_file[0]
    use_dbscan = args.use_dbscan
    total_time_logger = TimeLogger()
    anomalies_number =\
def autoencoding(dataset_file, split_percent, encoding_dim_percent, output_file=None, full_differences=None):
    """Train an autoencoder on the dataset and emit reconstruction differences.

    Loads the dataset, fits and runs the autoencoder, computes per-sample
    reconstruction differences, and either returns them (when
    *output_file* is falsy) or writes them to disk — binary when
    *full_differences* is truthy, sorted (index, value) JSON otherwise.
    """
    stage_timer = TimeLogger()
    data = DatasetLoader(dataset_file).load(split_percent=split_percent)
    _, _, features_number = data
    encoding_dim = math.ceil(features_number * encoding_dim_percent)
    print('Dataset loaded. Time: ' + str(stage_timer.finish()))

    stage_timer = TimeLogger()
    autoencoder = Autoencoder(features_number, encoding_dim, data)
    autoencoder.print_model_summary()
    autoencoder.fit()
    print('Autoencoder fit finished. Time: ' + str(stage_timer.finish()))

    stage_timer = TimeLogger()
    autoencoder.predict()
    print('Autoencoder predict finished. Time: ' + str(stage_timer.finish()))

    stage_timer = TimeLogger()
    differences = autoencoder.calc_differences(full_differences)
    print('Calculate differences finished. Time: ' + str(stage_timer.finish()))

    if not full_differences:
        # Keep original indexes and rank by difference, largest first.
        differences = sorted(enumerate(differences), key=lambda pair: pair[1], reverse=True)
    if not output_file:
        return differences

    stage_timer = TimeLogger()
    if full_differences:
        binary_write(differences, features_number, output_file)
    else:
        ascii_write(differences, output_file)
    print('Write differences finished. Time: ' + str(stage_timer.finish()))
def binary_read(differences_file):
    """Read a binary_write-produced file back into a 2-D numpy matrix.

    The file is a flat sequence of little-endian floats ('=f'); the last
    float is the feature count, used to restore the matrix shape.
    """
    read_timer = TimeLogger()
    values = []
    log_write_per_chunk_number = 10000000
    chunk_counter = 0
    with open(differences_file, 'rb') as descriptor:
        interval_timer = TimeLogger()
        # Each "chunk" here is one 4-byte float. struct.unpack returns a
        # 1-tuple; appending the tuple (not its element) is deliberate —
        # np.array over 1-tuples yields the (n, 1) shape the reshape
        # arithmetic below depends on.
        while buffer := descriptor.read(4):
            values.append(struct.unpack('=f', buffer))
            if (chunk_counter + 1) % log_write_per_chunk_number == 0:
                print(
                    str(chunk_counter + 1) + ' chunks is read and unpacked. Time: ' + str(interval_timer.finish()))
                interval_timer = TimeLogger()
            chunk_counter += 1
    print(
        str(chunk_counter) + ' chunks is read and unpacked. Time: ' + str(interval_timer.finish()))
    print('Differences read finished. Time: ' + str(read_timer.finish()))

    transformation_timer = TimeLogger()
    flat = np.array(values)
    features_number = int(flat[-1])
    # len(flat) is intentionally the PRE-slice length: with the trailing
    # feature-count entry included, int(len/features) still equals the
    # row count after dropping that entry.
    differences = flat[:-1].reshape(
        int(len(flat) / features_number), features_number)
    print('Differences transformation finished. Time: ' + str(transformation_timer.finish()))
    return differences
# Entry point for the GitHub Kotlin-code collector: parse CLI options,
# then crawl repositories for the keyword and save matching files.
parser = argparse.ArgumentParser()
parser.add_argument('--keyword', '-k', nargs=1, type=str, help='keyword for search on Github')
parser.add_argument('--token', '-t', nargs=1, type=str, help='Github token')
parser.add_argument('--directory', '-d', nargs=1, type=str, help='directory for saving Kotlin source code files')
args = parser.parse_args()

LOG_FILE = 'log.txt'
# nargs=1 wraps each value in a one-element list — unwrap with [0].
keyword = args.keyword[0]
token = args.token[0]
directory = args.directory[0]

github = GithubCodeCollector(token)
config = {
    'log_file': LOG_FILE,
    'keyword': keyword,
    'directory': directory,
}
time_logger = TimeLogger()
# Alternative crawl mode, kept for reference:
# code_search(github, config)
code_by_repo_search(github, config)
time_logger.finish(task_name='Code collection')