from ui_exerciser import UIExerciser
from utils import Utilities
import re
import os
import time

if __name__ == '__main__':
    ISOTIMEFORMAT = '%m%d-%H-%M-%S'
    logger = Utilities.set_logger('COSMOS_TRIGGER_PY-Console')
    device = 'nexus4'
    pc = 'iai'
    if device == 'nexus4':
        series = '01b7006e13dd12a1'
    elif device == 'galaxy':
        series = '014E233C1300800B'
    elif device == 'nexuss':
        series = '39302E8CEA9B00EC'
    else:
        series = 'emulator-5554'
    user = '******'
    aapt_loc = 'C:\\Users\\' + user + '\\AppData\\Local\\Android\\sdk\\build-tools\\19.1.0\\aapt.exe'
    apk_dir = 'C:\\Users\\' + user + '\\Documents\\FlowIntent\\apks\\VirusShare_Android_20130506_3\\'
    UIExerciser.emu_loc = 'C:\\Users\\hfu\\AppData\\Local\\Android\\sdk\\tools\\emulator.exe'
    UIExerciser.emu_name = 'Qvga'
    out_base_dir = os.path.abspath(os.pardir + '/output/') + '/'
    # UIExerciser.emu_proc = UIExerciser.open_emu(UIExerciser.emu_loc, UIExerciser.emu_name)
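# ---------------------------------------------------------------------------
# The Learner class below relies on StemmedCountVectorizer and
# StemmedTfidfVectorizer, which are not defined in this file. The following is
# a minimal sketch of the usual implementation (an assumption, not the
# project's original code): subclass the scikit-learn vectorizers and stem
# each token with NLTK's PorterStemmer inside build_analyzer().
# ---------------------------------------------------------------------------
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

english_stemmer = PorterStemmer()


class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: [english_stemmer.stem(w) for w in analyzer(doc)]


class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: [english_stemmer.stem(w) for w in analyzer(doc)]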
import codecs
import json
import os
import random
import re
import string
from threading import Thread
from time import time

import cPickle
import numpy as np
import simplejson
from sklearn import metrics, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

from utils import Utilities


class Learner:
    logger = Utilities.set_logger('Learner')

    class LabelledDocs:
        @staticmethod
        def stem_tokens(tokens, stemmer):
            stemmed = []
            for item in tokens:
                stemmed.append(stemmer.stem(item))
            return stemmed

        def tokenize(self, text):
            vectorizer = CountVectorizer(analyzer='word')
            vectorizer.fit_transform([text])
            tokens = vectorizer.get_feature_names()
            # stems = self.stem_tokens(tokens, stemmer)
            return tokens

        def __init__(self, doc, label, char_wb=False):
            self.doc = doc
            self.label = label
            tokens = self.tokenize(doc)
            if char_wb:
                self.doc = ''.join(tokens)
            else:
                self.doc = ' '.join(tokens)

    @staticmethod
    def dir2jsons(json_dir):
        jsons = []
        if json_dir is None:
            return jsons
        for root, dirs, files in os.walk(json_dir, topdown=False):
            for filename in files:
                if '201' in filename and re.search('json$', filename):
                    with open(os.path.join(root, filename), "rb") as fin:
                        try:
                            jsons.append(simplejson.load(fin))
                        except Exception as e:
                            pass  # Utilities.logger.error(e)
        return jsons

    @staticmethod
    def same_prefix(str_a, str_b):
        for i, c in enumerate(str_a):
            if i > 6:
                return True
            if c == str_b[i]:
                continue
            else:
                return False

    @staticmethod
    def feature_filter_by_prefix(vocab, docs):
        examined = []
        for i in range(len(vocab)):
            Learner.logger.info('i: ' + vocab[i] + ' ' + str(i))
            if len(vocab[i]) < 6 or vocab[i] in examined:
                continue
            for j in range(i + 1, len(vocab)):
                # Learner.logger.info('j: ' + vocab[j] + ' ' + str(j))
                if len(vocab[j]) < 6:
                    examined.append(vocab[j])
                    continue
                if vocab[i] in vocab[j] or vocab[j] in vocab[i]:  # Learner.same_prefix(vocab[i], vocab[j])
                    # Learner.logger.info('Found ' + vocab[i] + ' ' + vocab[j] + ' ' + str(i))
                    examined.append(vocab[j])
                    for doc in docs:
                        if vocab[j] in doc.doc:
                            doc.doc = str(doc.doc).replace(vocab[j], vocab[i])
        instances = []
        labels = []
        for doc in docs:
            instances.append(doc.doc)
            labels.append(doc.label)
        vectorizer = StemmedCountVectorizer(analyzer="word", tokenizer=None,
                                            preprocessor=None, stop_words=None)
        train_data = vectorizer.fit_transform(instances)
        # Numpy arrays are easy to work with, so convert the result to an array.
        # train_data = train_data.toarray()
        Learner.logger.info(train_data.shape)
        return train_data, labels

    @staticmethod
    def gen_instances(pos_json_dir, neg_json_dir, simulate=False, char_wb=False):
        pos_jsons = Learner.dir2jsons(pos_json_dir)
        neg_jsons = Learner.dir2jsons(neg_json_dir)
        Learner.logger.info('lenPos: ' + str(len(pos_jsons)))
        Learner.logger.info('lenNeg: ' + str(len(neg_jsons)))
        docs = Learner.gen_docs(pos_jsons, 1, char_wb)
        docs = docs + Learner.gen_docs(neg_jsons, -1, char_wb)
        if simulate:
            if len(neg_jsons) == 0:
                docs = docs + Learner.simulate_flows(len(pos_jsons), 0)
        instances = []
        labels = []
        for doc in docs:
            instances.append(doc.doc)
            labels.append(doc.label)
        return instances, np.array(labels)

    @staticmethod
    def gen_X_matrix(instances, vec=None, tf=False, ngrams_range=None):
        # Initialize the "CountVectorizer" object, which is scikit-learn's
        # bag-of-words tool.
        if vec is not None:
            train_data = vec.transform(instances)
            vocab = vec.get_feature_names()
            return train_data, vocab, vec
        if not tf:
            if ngrams_range is None:
                vectorizer = StemmedCountVectorizer(analyzer="word", tokenizer=None,
                                                    preprocessor=None,
                                                    stop_words=['http'])
            else:
                vectorizer = StemmedCountVectorizer(analyzer='char_wb', tokenizer=None,
                                                    preprocessor=None,
                                                    stop_words=['http'],
                                                    ngram_range=ngrams_range)
        else:
            if ngrams_range is None:
                vectorizer = StemmedTfidfVectorizer(analyzer="word", tokenizer=None,
                                                    preprocessor=None,
                                                    stop_words=['http'])
            else:
                vectorizer = StemmedTfidfVectorizer(analyzer='char_wb', tokenizer=None,
                                                    preprocessor=None,
                                                    stop_words=None,
                                                    ngram_range=ngrams_range)
        # fit_transform() does two things: first, it fits the model and learns
        # the vocabulary; second, it transforms the training data into feature
        # vectors. The input to fit_transform should be a list of strings.
        train_data = vectorizer.fit_transform(instances)
        # Numpy arrays are easy to work with, so convert the result to an array.
        # train_data = train_data.toarray()
        Learner.logger.info(train_data.shape)
        # Take a look at the words in the vocabulary.
        vocab = vectorizer.get_feature_names()
        # Learner.logger.info(vocab)
        # train_data, labels = Learner.feature_filter_by_prefix(vocab, docs)
        return train_data, vocab, vectorizer

    @staticmethod
    def ocsvm(train_data, labels, cross_vali=True):
        nu = float(np.count_nonzero(labels == -1)) / len(labels)
        clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma=0.1)
        results = None
        if cross_vali:
            results = Learner.cross_validation(clf, train_data, labels)
            # simplejson.dump(results.tolist(), codecs.open(output_dir + '/cv.json', 'w', encoding='utf-8'),
            #                 separators=(',', ':'), sort_keys=True, indent=4)
            Learner.logger.info('OCSVM: ' + str(results['duration']))
            Learner.logger.info('mean scores:' + str(results['mean_scores']))
            Learner.logger.info('mean_conf:' + str(results['mean_conf_mat']))
        clf.fit(train_data)
        return clf, results

    @staticmethod
    def train_bayes(train_data, labels, cross_vali=True):
        clf = BernoulliNB()
        results = None
        if cross_vali:
            results = Learner.cross_validation(clf, train_data, labels)
            Learner.logger.info('Bayes: ' + str(results['duration']))
            Learner.logger.info('mean scores:' + str(results['mean_scores']))
            Learner.logger.info('mean_conf:' + str(results['mean_conf_mat']))
        # Fit the classifier to the training set, using the bag of words as
        # features and the labels as the response variable.
        # This may take a few minutes to run.
        clf = clf.fit(train_data, labels)
        return clf, results

    @staticmethod
    def class_report(conf_mat):
        # scikit-learn lays out the binary confusion matrix for labels (-1, 1)
        # as [[tn, fp], [fn, tp]], so flatten() yields (tn, fp, fn, tp); the
        # original unpacking order (tp, fp, fn, tn) swapped tp and tn.
        tn, fp, fn, tp = conf_mat.flatten()
        measures = {
            'accuracy': (tp + tn) / (tp + fp + fn + tn),
            'fp_rate': fp / (tn + fp),
            'recall': tp / (tp + fn),
            'precision': tp / (tp + fp),
            'f1score': 2 * tp / (2 * tp + fp + fn)
        }
        # measures['tn_rate'] = tn / (tn + fp)  # (true negative rate)
        return measures

    @staticmethod
    def cross_validation(clf, data, labels, scoring='f1', n_fold=5):
        """
        Run n-fold cross-validation and return per-fold ROC scores, the
        averaged confusion matrix, and the indices of false positives and
        false negatives.
        """
        X = data
        y = np.array(labels)
        t0 = time()
        results = dict()
        # cv = KFold(n_splits=5, shuffle=True)
        # Generate a stratified k-fold in order to run cross-validation.
        shuffle = True
        kf = StratifiedKFold(n_splits=n_fold, shuffle=shuffle, random_state=42)
        scores = []
        conf_mat = np.zeros((2, 2))  # Binary classification
        # Start the cross-validation.
        for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
            result = dict()
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # Train the classifier.
            clf.fit(X_train, y_train)
            # Make the predictions.
            predicted = clf.predict(X_test)
            y_plabs = np.squeeze(predicted)
            if hasattr(clf, 'predict_proba'):
                y_pprobs = clf.predict_proba(X_test)  # Predicted probabilities
                result['roc'] = metrics.roc_auc_score(y_test, y_pprobs[:, 1])
            else:  # for SVM
                y_decision = clf.decision_function(X_test)
                try:
                    result['roc'] = metrics.roc_auc_score(y_test, y_decision[:, 1])
                except IndexError:  # OCSVM returns a 1-D decision function
                    result['roc'] = metrics.roc_auc_score(y_test, y_decision)
            # metrics.roc_curve(y_test, y_pprobs[:, 1])
            scores.append(result['roc'])
            # Learner.perf_measure(predicted, y_test)
            # Accuracy of this fold:
            # ac = accuracy_score(predicted, y_test)
            # Compute the confusion matrix.
            confusion = metrics.confusion_matrix(y_test, predicted)
            conf_mat += confusion
            result['conf_mat'] = confusion.tolist()
            # Collect indices of false positives and negatives. These are
            # meaningful only with shuffle=False (or if the original data is
            # backed up).
            if not shuffle:
                fp_i = np.where((y_plabs == 1) & (y_test == -1))[0]
                fn_i = np.where((y_plabs == -1) & (y_test == 1))[0]
                result['fp_item'] = test_index[fp_i]
                result['fn_item'] = test_index[fn_i]
            results['fold_' + str(fold)] = result
        # cv_res = cross_val_score(clf, data, labels, cv=cv, scoring='f1').tolist()
        duration = time() - t0
        results['duration'] = duration
        # results['cv_res'] = cv_res
        # results['cv_res_mean'] = sum(cv_res) / n_splits
        # print "\nMean score: %0.2f (+/- %0.2f)" % (np.mean(scores), np.std(scores) * 2)
        results['mean_scores'] = np.mean(scores)
        results['std_scores'] = np.std(scores)
        conf_mat /= n_fold
        # print "Mean CM: \n", conf_mat
        # print "\nMean classification measures: \n"
        results['mean_conf_mat'] = Learner.class_report(conf_mat)
        # return scores, conf_mat, {'fp': sorted(false_pos), 'fn': sorted(false_neg)}
        return results

    @staticmethod
    def train_SVM(train_data, labels, cross_vali=True):
        clf = svm.SVC(class_weight='balanced', probability=True)
        results = None
        if cross_vali:
            results = Learner.cross_validation(clf, train_data, labels)
            Learner.logger.info('SVM: ' + str(results['duration']))
            Learner.logger.info('mean scores:' + str(results['mean_scores']))
            Learner.logger.info('mean_conf:' + str(results['mean_conf_mat']))
        # Fit the classifier to the training set. This may take a few minutes.
        clf = clf.fit(train_data, labels)
        return clf, results

    @staticmethod
    def train_logistic(train_data, labels, cross_vali=True):
        clf = LogisticRegression(class_weight='balanced')
        results = None
        if cross_vali:
            results = Learner.cross_validation(clf, train_data, labels)
            Learner.logger.info('Logistic: ' + str(results['duration']))
            Learner.logger.info('mean scores:' + str(results['mean_scores']))
            Learner.logger.info('mean_conf:' + str(results['mean_conf_mat']))
        clf = clf.fit(train_data, labels)
        return clf, results

    @staticmethod
    def train_tree(train_data, labels, cross_vali=True, res=None,
                   output_dir=os.curdir, tree_name='tree'):
        clf = DecisionTreeClassifier(class_weight='balanced')
        results = None
        if cross_vali:
            results = Learner.cross_validation(clf, train_data, labels)
            Learner.logger.info('Tree: ' + str(results['duration']))
            Learner.logger.info('mean scores:' + str(results['mean_scores']))
            Learner.logger.info('mean_conf:' + str(results['mean_conf_mat']))
        clf = clf.fit(train_data, labels)
        """
        tree.export_graphviz(clf, out_file=output_dir + '/' + tree_name + '.dot',
                             feature_names=feature_names, label='root',
                             impurity=False, special_characters=True)  # , max_depth=5)
        dotfile = open(output_dir + '/' + tree_name + '.dot', 'r')
        graph = pydotplus.graph_from_dot_data(dotfile.read())
        graph.write_pdf(output_dir + '/' + tree_name + '.pdf')
        dotfile.close()
        """
        if res is not None:
            res['tree'] = results
        return clf, results

    @staticmethod
    def train_classifier(func, X, y, cv, result_dict, tag):
        result_dict[tag] = func(X, y, cv)

    @staticmethod
    def rand_str(size=6, chars=string.ascii_uppercase + string.digits):
        url = ''.join(random.choice(chars) for _ in range(size))
        if url[0] < 'k':
            url = url + 'net'
        else:
            url = url + 'com'
        url = 'www.' + url
        return url

    @staticmethod
    def simulate_flows(size, label):
        docs = []
        for _ in range(size):
            # rand_str() already prepends 'www.', so use its result directly
            # (the original prepended a second 'www.').
            docs.append(Learner.LabelledDocs(Learner.rand_str(), label))
        return docs

    @staticmethod
    def tree_info(clf):
        info = dict()
        n_nodes = clf.tree_.node_count
        # children_left = clf.tree_.children_left
        # children_right = clf.tree_.children_right
        # feature = clf.tree_.max_features
        # n_feature = clf.tree_.n_features_
        # The tree structure can be traversed to compute various properties,
        # such as the depth of each node and whether or not it is a leaf.
        depth = clf.tree_.max_depth
        info['n_nodes'] = n_nodes
        info['depth'] = depth
        Learner.logger.info(info)
        return info

    @staticmethod
    def gen_docs(jsons, label, char_wb=False):
        docs = []
        for flow in jsons:
            # label = flow['label']
            line = ''
            line += flow['domain']
            line += flow['uri']
            try:
                docs.append(Learner.LabelledDocs(line, label, char_wb=char_wb))
            except Exception:
                print line
        return docs

    @staticmethod
    def predict(model, vec, instances, labels=None, src_name='', model_name=''):
        # loaded_vec = CountVectorizer(decode_error="replace", vocabulary=voc)
        data = vec.transform(instances)
        y_1 = model.predict(data)
        # Learner.logger.info(y_1)
        if labels is not None:
            return accuracy_score(labels, y_1)

    @staticmethod
    def feature_selection(X, y, k, count_vectorizer, instances, tf=False,
                          ngram_range=None):
        ch2 = SelectKBest(chi2, k=k)
        X_new = ch2.fit_transform(X, y)
        feature_names = count_vectorizer.get_feature_names()
        if feature_names is not None:
            feature_names = [feature_names[i]
                             for i in ch2.get_support(indices=True)]
        '''
        dict = np.asarray(count_vectorizer.get_feature_names())[ch2.get_support()]
        if tf:
            if ngram_range is not None:
                count_vectorizer = StemmedTfidfVectorizer(analyzer='char_wb',
                                                          ngram_range=ngram_range,
                                                          vocabulary=dict)
            else:
                count_vectorizer = StemmedTfidfVectorizer(analyzer='char_wb',
                                                          vocabulary=dict)
        else:
            if ngram_range is not None:
                count_vectorizer = StemmedCountVectorizer(analyzer='word',
                                                          vocabulary=dict,
                                                          ngram_range=ngram_range)
            else:
                count_vectorizer = StemmedCountVectorizer(analyzer="word",
                                                          vocabulary=dict)
        X_new = count_vectorizer.fit_transform(instances)
        # cPickle.dump(count_vectorizer.vocabulary, open(output_dir + '/' + "vocabulary.pkl", "wb"))
        '''
        return X_new, feature_names, ch2

    @staticmethod
    def pipe_feature_selection(X, y):
        # Pipeline steps must be (name, estimator) pairs; pass the unfitted
        # SelectKBest itself rather than the result of fit_transform().
        clf = Pipeline([('feature_selection', SelectKBest(chi2, k=2)),
                        ('classification', RandomForestClassifier())])
        clf.fit(X, y)

    @staticmethod
    def save2file(obj, path):
        # Save the object to disk.
        with open(path, 'wb') as fid:
            cPickle.dump(obj, fid)

    @staticmethod
    def obj_from_file(path):
        return cPickle.load(open(path, 'rb'))
if __name__ == '__main__':
    logger = Utilities.set_logger('Learner')
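    # ----------------------------------------------------------------------
    # Hedged end-to-end sketch (commented out; './pos_json' and './neg_json'
    # are illustrative paths, not from the original source):
    #
    #   instances, labels = Learner.gen_instances('./pos_json', './neg_json')
    #   X, vocab, vec = Learner.gen_X_matrix(instances)
    #   clf, cv_res = Learner.train_tree(X, labels, cross_vali=True)
    #
    # For reference, the arithmetic in class_report() on a confusion matrix
    # with tn=45, fp=5, fn=10, tp=40 (n=100) gives:
    #   accuracy  = (40 + 45) / 100          = 0.85
    #   fp_rate   = 5 / (45 + 5)             = 0.10
    #   recall    = 40 / (40 + 10)           = 0.80
    #   precision = 40 / (40 + 5)            ~ 0.889
    #   f1score   = 2 * 40 / (2*40 + 5 + 10) ~ 0.842
    # ----------------------------------------------------------------------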
class CtuAdAnalyzer:
    logger = Utilities.set_logger('CTU-Ad')

    @staticmethod
    def cv_result_table(base_dir):
        for model_name in ['bag', 'bag-ngram', 'tf', 'tf-ngram']:
            print '\\\\'
            model = dict()
            model_name = model_name + '_'
            for dataset in ['']:
                output_dir = base_dir + dataset
                with open(os.path.join(output_dir, model_name + 'cv_res_sel.json'), "rb") as fin:
                    cv_res = simplejson.load(fin)
                    # print cv_res
                    for algorithm in cv_res:
                        if algorithm not in model:
                            model[algorithm] = dict()
                        results = cv_res[algorithm]
                        model[algorithm][dataset] = results
                        # print(algorithm + ': ' + str(results['duration']))
                        # print('mean scores:' + str(results['mean_scores']))
                        # print('mean_conf:' + str(results['mean_conf_mat']))
            for algorithm in ['tree', 'bayes', 'logistic', 'svm', 'ocsvm']:
                if algorithm == 'tree':
                    algorithm_name = 'Decision Tree'
                elif algorithm == 'bayes':
                    algorithm_name = 'Naive Bayes'
                elif algorithm == 'logistic':
                    algorithm_name = 'Logistic Regression'
                elif algorithm == 'svm':
                    algorithm_name = 'SVM'
                else:
                    algorithm_name = 'OCSVM'
                for dataset in ['']:
                    results = model[algorithm][dataset]
                    mean_conf = results['mean_conf_mat']
                    recall = '{:.3%}'.format(mean_conf['recall']).replace('%', '\\%')
                    fp = '{:.3%}'.format(mean_conf['fp_rate']).replace('%', '\\%')
                    precision = '{:.3%}'.format(mean_conf['precision']).replace('%', '\\%')
                    f1 = '{:.3%}'.format(mean_conf['f1score']).replace('%', '\\%')
                    mean_score = '{:.3%}'.format(results['mean_scores']).replace('%', '\\%')
                    duration = '{:.3}'.format(results['duration'])
                    print ' & ' + algorithm_name + ' & ' + duration + ' & ' + recall \
                          + ' & ' + fp + ' & ' + precision \
                          + ' & ' + f1 + ' & ' + mean_score + ' \\\\ '

    @staticmethod
    def cmp_model_cv(base_dir, normal_dir):
        """
        Compare bag-of-words, tf-idf, bag-of-ngrams, and tf-idf-ngrams models
        by running the algorithm comparison for each in its own thread.
        """
        classifier_dir = base_dir
        """
        Pool().map(CtuCCAnalyzer.cmp_algorithm_cv,
                   [base_dir, base_dir, base_dir, base_dir],
                   [normal_dir, normal_dir, normal_dir, normal_dir],
                   [classifier_dir, classifier_dir, classifier_dir, classifier_dir],
                   ['bag_', 'bag-ngram_', 'tf_', 'tf-ngram_'])
        """
        threads = dict()
        for model_name in ['bag', 'bag-ngram', 'tf', 'tf-ngram']:
            threads[model_name] = Thread(target=CtuCCAnalyzer.cmp_algorithm_cv,
                                         args=(base_dir, normal_dir,
                                               classifier_dir, classifier_dir,
                                               model_name + '_'))
            threads[model_name].start()
        for model_name in threads:
            threads[model_name].join()
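# Hedged usage sketch (the directory layout is an assumption, not from the
# original source): cv_result_table() expects <base_dir>/<model>_cv_res_sel.json
# files produced by a previous cmp_model_cv() run, and prints LaTeX table rows.
#
#   base_dir = '/data/ctu-ad/'
#   normal_dir = '/data/normal/'
#   CtuAdAnalyzer.cmp_model_cv(base_dir, normal_dir)   # writes *_cv_res_sel.json
#   CtuAdAnalyzer.cv_result_table(base_dir)            # prints LaTeX rows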
class CtuCCAnalyzer:
    logger = Utilities.set_logger('CTU-13-CC')

    @staticmethod
    def cmp_feature_selection(base_dir, normal_dir, data_path, output_dir,
                              dataset=None):
        classifier_dir = base_dir + dataset
        instances, labels = Learner.gen_instances(os.path.join(normal_dir, 'March'),
                                                  data_path, simulate=False)
        data, feature_names, vec = Learner.gen_X_matrix(instances)
        back = [data, labels, feature_names, vec]
        Learner.save2file(vec.vocabulary_, output_dir + '/' + "vocabulary.pkl")
        CtuCCAnalyzer.logger.info(data.shape)
        clf, cv = Learner.train_tree(data, labels, cross_vali=True,
                                     tree_name='Fig_tree_' + dataset,
                                     output_dir=output_dir)
        Learner.save2file(clf, os.path.join(classifier_dir, 'classifier.pkl'))
        clf_info = Learner.tree_info(clf)
        clf_info['cv'] = cv
        simplejson.dump(clf_info, codecs.open(output_dir + '/tree_info.json', 'w',
                                              encoding='utf-8'))
        data, labels, feature_names, vec = back
        data, feature_names, vec = Learner.feature_selection(data, labels, 200,
                                                             vec, instances)
        # NOTE: feature_selection() returns the fitted SelectKBest as its third
        # value, which has no .vocabulary attribute; this save looks like a
        # leftover from the vectorizer-based variant above.
        Learner.save2file(vec.vocabulary, output_dir + '/' + "vocabulary_sel.pkl")
        CtuCCAnalyzer.logger.info(data.shape)
        clf, cv = Learner.train_tree(data, labels, cross_vali=True,
                                     tree_name='Fig_tree_sel_' + dataset,
                                     output_dir=output_dir)
        Learner.save2file(clf, os.path.join(classifier_dir, 'classifier_sel.pkl'))
        clf_info = Learner.tree_info(clf)
        clf_info['cv'] = cv
        json.dump(clf_info, codecs.open(output_dir + '/tree_info_sel.json', 'w',
                                        encoding='utf-8'))
        # simplejson.dump(results.tolist(), codecs.open(output_dir + '/cv.json', 'w', encoding='utf-8'),
        #                 separators=(',', ':'), sort_keys=True, indent=4)

    @staticmethod
    def cmp_model_cv(base_dir, normal_dir):
        """
        Compare bag-of-words, tf-idf, bag-of-ngrams, and tf-idf-ngrams models.
        """
        for model_name in ['bag']:  # 'bag-ngram', 'tf', 'tf-ngram']:
            CtuCCAnalyzer.logger.info(model_name + "----------------------------------")
            for dataset in ['Neris', 'Murlo', 'Virut', 'Sogou']:
                classifier_dir = base_dir + dataset
                CtuCCAnalyzer.cmp_algorithm_cv(base_dir, normal_dir,
                                               classifier_dir, classifier_dir,
                                               dataset=dataset,
                                               model_name=model_name + '_')

    @staticmethod
    def train_and_save(X, y, model_name, classifier_dir):
        outfile = os.path.join(classifier_dir, model_name + 'cv_res_sel.json')
        cv_res = dict()
        results = dict()
        thread1 = Thread(target=Learner.train_classifier,
                         args=(Learner.train_tree, X, y, True, results, 'tree'))
        thread2 = Thread(target=Learner.train_classifier,
                         args=(Learner.train_bayes, X, y, True, results, 'bayes'))
        thread3 = Thread(target=Learner.train_classifier,
                         args=(Learner.train_logistic, X, y, True, results, 'logistic'))
        thread4 = Thread(target=Learner.train_classifier,
                         args=(Learner.train_SVM, X, y, True, results, 'svm'))
        thread5 = Thread(target=Learner.train_classifier,
                         args=(Learner.ocsvm, X, y, True, results, 'ocsvm'))
        thread1.start()
        thread2.start()
        thread3.start()
        thread4.start()
        thread5.start()
        thread1.join()
        thread2.join()
        thread3.join()
        thread4.join()
        thread5.join()
        clf_tree, cv_res['tree'] = results['tree']
        clf_bayes, cv_res['bayes'] = results['bayes']
        clf_logistic, cv_res['logistic'] = results['logistic']
        clf_svm, cv_res['svm'] = results['svm']
        clf_ocsvm, cv_res['ocsvm'] = results['ocsvm']
        Learner.save2file(clf_tree, os.path.join(classifier_dir, model_name + 'tree_sel.pkl'))
        Learner.save2file(clf_bayes, os.path.join(classifier_dir, model_name + 'bayes_sel.pkl'))
        Learner.save2file(clf_logistic, os.path.join(classifier_dir, model_name + 'logistic_sel.pkl'))
        Learner.save2file(clf_svm, os.path.join(classifier_dir, model_name + 'svm_sel.pkl'))
        Learner.save2file(clf_ocsvm, os.path.join(classifier_dir, model_name + 'ocsvm_sel.pkl'))
        CtuCCAnalyzer.logger.info('Threads Done! Saving cv_res...')
        json.dump(cv_res, codecs.open(outfile, 'w', encoding='utf-8'))
        """
        result1, result2, result3, result4, result5 = Pool().map(
            Learner.train_classifier,
            [Learner.train_tree, Learner.train_bayes, Learner.train_logistic,
             Learner.train_SVM, Learner.ocsvm],
            [X, X, X, X, X],
            [y, y, y, y, y],
            [True, True, True, True, True])
        clf_tree, cv_res['tree'] = result1
        clf_bayes, cv_res['bayes'] = result2
        clf_logistic, cv_res['logistic'] = result3
        clf_svm, cv_res['svm'] = result4
        clf_ocsvm, cv_res['ocsvm'] = result5
        Learner.save2file(clf_tree, os.path.join(classifier_dir, model_name + 'tree_sel.pkl'))
        Learner.save2file(clf_bayes, os.path.join(classifier_dir, model_name + 'bayes_sel.pkl'))
        Learner.save2file(clf_logistic, os.path.join(classifier_dir, model_name + 'logistic_sel.pkl'))
        Learner.save2file(clf_svm, os.path.join(classifier_dir, model_name + 'svm_sel.pkl'))
        Learner.save2file(clf_ocsvm, os.path.join(classifier_dir, model_name + 'ocsvm_sel.pkl'))
        json.dump(cv_res, codecs.open(os.path.join(classifier_dir, model_name + 'cv_res_sel.json'),
                                      'w', encoding='utf-8'))
        '''
        result1 = Pool().map(Learner.train_tree, [X], [y], [True])
        result2 = Pool().map(Learner.train_bayes, [X], [y], [True])
        result3 = Pool().map(Learner.train_logistic, [X], [y], [True])
        result4 = Pool().map(Learner.train_SVM, [X], [y], [True])
        result5 = Pool().map(Learner.ocsvm, [X], [y], [True])
        '''
        """

    @staticmethod
    def cmp_algorithm_cv(base_dir, normal_dir, data_path, output_dir,
                         model_name='', dataset=''):
        char_wb = False
        if 'tf' in model_name:
            tf = True
        else:
            tf = False
        if 'ngram' in model_name:
            ngram = (2, 15)
            # char_wb = True
        else:
            ngram = None
        classifier_dir = base_dir + dataset
        outfile = os.path.join(classifier_dir, model_name + 'cv_res_sel.json')
        if os.path.exists(outfile):
            return
        if os.path.exists(os.path.join(output_dir, model_name + "vec_sel.pkl")):
            X = Learner.obj_from_file(os.path.join(output_dir, model_name + "X_sel.pkl"))
            y = Learner.obj_from_file(os.path.join(output_dir, model_name + "y_sel.pkl"))
        else:
            instances, y = Learner.gen_instances(os.path.join(normal_dir, 'March'),
                                                 data_path, char_wb=char_wb,
                                                 simulate=False)
            X, feature_names, vec = Learner.gen_X_matrix(instances, tf=tf,
                                                         ngrams_range=ngram)
            Learner.save2file(X, os.path.join(output_dir, model_name + "X.pkl"))
            Learner.save2file(y, os.path.join(output_dir, model_name + "y.pkl"))
            Learner.save2file(vec, os.path.join(output_dir, model_name + "vec.pkl"))
            Learner.save2file(feature_names,
                              os.path.join(output_dir, model_name + "feature_names.pkl"))
            X, feature_names, vec = Learner.feature_selection(X, y, 500, vec,
                                                              instances, tf=tf,
                                                              ngram_range=ngram)
            Learner.save2file(X, os.path.join(output_dir, model_name + "X_sel.pkl"))
            Learner.save2file(y, os.path.join(output_dir, model_name + "y_sel.pkl"))
            Learner.save2file(vec, os.path.join(output_dir, model_name + "vec_sel.pkl"))
            Learner.save2file(feature_names,
                              os.path.join(output_dir, model_name + "feature_names_sel.pkl"))
        CtuCCAnalyzer.train_and_save(X, y, model_name, classifier_dir)

    @staticmethod
    def zero_day_helper(base_dir, src_name, model_name, algorithm, target_name,
                        normal_dir=None):
        vec_dir = os.path.join(base_dir, src_name)
        model_path = os.path.join(vec_dir, model_name + algorithm + '_sel.pkl')
        target_path = os.path.join(base_dir, target_name)
        if normal_dir is None:
            data, labels = Learner.gen_instances('', target_path)
        else:
            data, labels = Learner.gen_instances(os.path.join(normal_dir, target_name), '')
        vec = Learner.obj_from_file(os.path.join(vec_dir, model_name + 'vec.pkl'))
        vec_sel = Learner.obj_from_file(os.path.join(vec_dir, model_name + 'vec_sel.pkl'))
        data, vocab, vec = Learner.gen_X_matrix(data, vec=vec)
        return Learner.predict(Learner.obj_from_file(model_path), vec_sel, data,
                               labels=labels, src_name=src_name,
                               model_name=model_name)

    @staticmethod
    def zero_day_sub(base_dir, normal_dir, model_name, output_dir):
        if os.path.exists(os.path.join(output_dir, model_name + 'pred_res.json')):
            return
        results = dict()
        for algorithm in ['tree', 'bayes', 'logistic', 'svm', 'ocsvm']:
            for src_name in ['Neris', 'Murlo', 'Virut', 'Sogou']:
                for target_name in ['Neris', 'Murlo', 'Virut', 'Sogou']:
                    res = CtuCCAnalyzer.zero_day_helper(base_dir, src_name,
                                                        model_name, algorithm,
                                                        target_name)
                    if algorithm not in results:
                        results[algorithm] = dict()
                    if src_name not in results[algorithm]:
                        results[algorithm][src_name] = dict()
                    results[algorithm][src_name][target_name] = res
                    # name = src_name + '_' + model_name + '_' + target_name
                    # CtuCCAnalyzer.logger.info(name + ':' + str(res))
                target_name = 'April'
                res = CtuCCAnalyzer.zero_day_helper(base_dir, src_name,
                                                    model_name, algorithm,
                                                    target_name,
                                                    normal_dir=normal_dir)
                # name = src_name + '_' + model_name + '_' + target_name
                # CtuCCAnalyzer.logger.info(name + ':' + str(res))
                results[algorithm][src_name][target_name] = res
        json.dump(results, codecs.open(os.path.join(output_dir, model_name + 'pred_res.json'),
                                       'w', encoding='utf-8'))
        for algorithm in ['tree', 'bayes', 'logistic', 'svm', 'ocsvm']:
            for src_name in ['Neris', 'Murlo', 'Virut', 'Sogou']:
                output = ''
                for target_name in ['Neris', 'Murlo', 'Virut', 'Sogou']:
                    output = output + str(results[algorithm][src_name][target_name] * 100) + '\\%' + ' & '
                CtuCCAnalyzer.logger.info(algorithm + ' & ' + src_name + ' & ' + output)

    @staticmethod
    def zero_day(base_dir, normal_dir):
        for model_name in ['bag']:  # ['bag', 'bag-ngram', 'tf', 'tf-ngram']:
            CtuCCAnalyzer.zero_day_sub(base_dir, normal_dir, model_name + '_',
                                       base_dir)
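# Hedged usage sketch (paths and the 'March'/'April' normal-traffic subfolders
# follow the conventions used above; the concrete locations are assumptions):
#
#   base_dir = '/data/ctu-13/'      # contains Neris/, Murlo/, Virut/, Sogou/
#   normal_dir = '/data/normal/'    # contains March/ and April/
#   CtuCCAnalyzer.cmp_model_cv(base_dir, normal_dir)  # train + cross-validate
#   CtuCCAnalyzer.zero_day(base_dir, normal_dir)      # cross-dataset accuracy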
import sys
import types
import xml.etree.ElementTree as ET
from xml.dom.minidom import parseString

from com.dtmilano.android.viewclient import View, ViewClient, ViewClientOptions
# NOTE (assumption): VERBOSE, WINDOW, SAVE_SCREENSHOT, SAVE_VIEW_SCREENSHOTS,
# DO_NOT_DUMP_VIEWS, DEVICE_ART, DROP_SHADOW, SCREEN_GLARE, and MAP are option
# constants from androidviewclient's dump tool; their import is not shown in
# the original source.

from utils import Utilities


class ViewClientHandler:
    logger = Utilities.set_logger('ViewClientHandler')

    @staticmethod
    def traverse(vc, root="ROOT", indent="", transform=None, stream=sys.stdout,
                 bounds2id=None):
        '''
        Traverses the C{View} tree and prints its nodes. The nodes are printed
        by converting them to strings, but other transformations can be
        specified by providing a method name as the C{transform} parameter.

        @type root: L{View}
        @param root: the root node from which the traverse starts
        @type indent: str
        @param indent: the indentation string used to print the nodes
        @type transform: method
        @param transform: a method used to transform the node before it is printed
        '''
        if bounds2id is None:
            # Avoid a mutable default argument, which would persist across calls.
            bounds2id = {}
        if transform is None:
            # This cannot be a default value, otherwise
            # TypeError: 'staticmethod' object is not callable
            # is raised.
            transform = ViewClient.TRAVERSE_CIT
        if type(root) == types.StringType and root == "ROOT":
            root = vc.root
        print vc.list()
        xml_root = ET.Element('hierarchy')
        ViewClientHandler.__traverse(root, indent, transform, stream,
                                     bounds2id=bounds2id)
        return bounds2id
        # if not root:
        #     return
        #
        # s = transform(root)
        # if s:
        #     print >>stream, "%s%s" % (indent, s)
        #
        # for ch in root.children:
        #     self.traverse(ch, indent=indent+" ", transform=transform, stream=stream)

    @staticmethod
    def __traverse(root, indent="", transform=View.__str__, stream=sys.stdout,
                   bounds2id=None):
        if bounds2id is None:
            bounds2id = {}
        if not root:
            return
        s = transform(root)
        sub_node = None
        if stream and s:
            ius = "%s%s" % (indent, s if isinstance(s, unicode)
                            else unicode(s, 'utf-8', 'replace'))
            print >> stream, ius.encode('utf-8', 'replace')
        # Normalize the bounds representation, e.g. '((0, 0), (480, 800))'
        # becomes '[0,0][480,800]' to match the uiautomator format.
        bounds = str(root.getBounds()).replace('((', '[')
        bounds = bounds.replace('))', ']')
        bounds = bounds.replace('), (', '][')
        bounds = bounds.replace(', ', ',')
        # print root.getPositionAndSize(), bounds
        bounds2id[bounds] = root.getId()
        for ch in root.children:
            ViewClientHandler.__traverse(ch, indent=indent + " ",
                                         transform=transform, stream=stream,
                                         bounds2id=bounds2id)
        return sub_node

    @staticmethod
    def dump_view_server(package):
        kwargs1 = {VERBOSE: False, 'ignoresecuredevice': False,
                   'ignoreversioncheck': False}
        kwargs2 = {ViewClientOptions.FORCE_VIEW_SERVER_USE: False,
                   ViewClientOptions.START_VIEW_SERVER: True,
                   ViewClientOptions.AUTO_DUMP: False,
                   ViewClientOptions.IGNORE_UIAUTOMATOR_KILLED: True,
                   ViewClientOptions.COMPRESSED_DUMP: True,
                   ViewClientOptions.USE_UIAUTOMATOR_HELPER: False,
                   ViewClientOptions.DEBUG: {}}
        kwargs2[ViewClientOptions.FORCE_VIEW_SERVER_USE] = True
        vc = ViewClient(*ViewClient.connectToDeviceOrExit(**kwargs1), **kwargs2)
        options = {WINDOW: -1,
                   SAVE_SCREENSHOT: None,
                   SAVE_VIEW_SCREENSHOTS: None,
                   DO_NOT_DUMP_VIEWS: False,
                   DEVICE_ART: None,
                   DROP_SHADOW: False,
                   SCREEN_GLARE: False}
        windows = vc.list()
        print windows
        transform = MAP['b']
        for window in windows:
            if package not in windows[window]:
                continue
            print windows[window]
            vc.dump(window=int(window))
            # ViewClient.imageDirectory = options[SAVE_VIEW_SCREENSHOTS]
            return ViewClientHandler.traverse(vc, transform=transform)

    @staticmethod
    def fill_ids(xml_data, package):
        '''
        Fill in the resource-ids that are missing from uiautomator dumps on
        low API levels (< 18).
        :param xml_data: the uiautomator XML dump
        :param package: the package name of the app being dumped
        :return: the XML with resource-id attributes filled in
        '''
        dom = parseString(xml_data.encode("utf-8"))
        nodes = dom.getElementsByTagName('node')
        for node in nodes:
            # If the first node already carries a resource-id, assume the dump
            # is complete and return it unchanged.
            if node.hasAttribute('resource-id'):
                return xml_data
            else:
                break
        bounds2ids = ViewClientHandler.dump_view_server(package)
        if not bounds2ids:
            ViewClientHandler.logger.error('Cannot identify the package!')
            return xml_data
        ViewClientHandler.logger.info(str(bounds2ids))
        for node in nodes:
            if node.getAttribute('bounds') in bounds2ids:
                node.setAttribute('resource-id',
                                  bounds2ids[node.getAttribute('bounds')])
            else:
                ViewClientHandler.logger.warn('Cannot find ' + node.getAttribute('bounds'))
        return dom.toxml()
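# Hedged usage sketch (the dump file name and package are illustrative, not
# from the original source): patch a uiautomator dump whose nodes lack
# resource-id attributes.
#
#   with open('window_dump.xml', 'rb') as f:
#       xml_data = f.read().decode('utf-8')
#   patched = ViewClientHandler.fill_ids(xml_data, 'com.example.app')
#   print patched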