def extract_link_train_test_usenix_features(config_learning: ConfigurationLearning) -> StyloFeaturesProxy:
    """
    Build a feature object that is ready for direct use.

    It bundles the features from Caliskan et al., USENIX *that we used*.
    The ARFF feature set is *not* part of it, so there are NO layout features.
    The unigram object and the clang objects are loaded and chained to each other,
    and the head of the chain is wrapped in a proxy.

    :param config_learning: configuration
    :return: StyloFeaturesProxy that references the first object of the chain.
    """
    # Load the individual feature sets (unigram first, then the clang objects).
    unigram_part = extract_train_test_unigram(
        config_learning=config_learning, tf=True, idf=False, ngram_range=(1, 1))
    clang_parts = extract_train_test_clang(config_learning=config_learning)

    # Flatten everything into a single list; the clang extractor yields several objects.
    chain: typing.List[StyloFeatures] = [unigram_part, *clang_parts]

    # Link every object to its direct successor so that the list forms one chain.
    for predecessor, successor in zip(chain, chain[1:]):
        predecessor.setnextstylo(successor)

    return StyloFeaturesProxy(codestyloreference=chain[0])
def load_new_features_merged(datasetpath: str, attackdirauth: str, verbose: bool, cppfile: str,
                             train_object: StyloFeatures, already_extracted: bool) -> StyloFeatures:
    """
    New interface for loading features.

    Loads a single feature vector for a given source file-author. The actual loading is
    delegated to __load_new_features; the returned objects are chained, wrapped in a
    StyloFeaturesProxy, and then aligned with the training object via
    createtfidffeatures and selectcolumns.

    :param datasetpath: forwarded unchanged to __load_new_features
    :param attackdirauth: forwarded unchanged to __load_new_features
    :param verbose: forwarded unchanged to __load_new_features
    :param cppfile: forwarded unchanged to __load_new_features
    :param train_object: StyloFeatures object
    :param already_extracted: bool to indicate if clang-lexems-arff features were already
        extracted from cpp file. If you are unsure, set it to False. It is only used if all
        transformations are performed via gnu parallel, so that the features are also
        extracted in parallel.
    :return: StyloFeatures object
    """
    parts: typing.List[StyloFeatures] = __load_new_features(
        datasetpath=datasetpath,
        attackdirauth=attackdirauth,
        verbose=verbose,
        cppfile=cppfile,
        train_object=train_object,
        already_extracted=already_extracted,
    )
    assert len(parts) > 0

    # Chain the loaded objects: each one points to the object that follows it.
    # With a single object there are no pairs, so nothing gets linked.
    for current, following in zip(parts, parts[1:]):
        current.setnextstylo(codestyloreference=following)

    # Wrap the head of the chain in a proxy.
    # There is no need to check whether an object was present in the train object without a
    # respective new element being created for it: createtfidffeatures and selectcolumns
    # perform a key check themselves.
    proxy: StyloFeaturesProxy = StyloFeaturesProxy(codestyloreference=parts[0])
    proxy.createtfidffeatures(trainobject=train_object)
    proxy.selectcolumns(index=None, trainobject=train_object)
    return proxy
def extract_link_train_test_usenix_all(config_learning: ConfigurationLearning) -> StyloFeaturesProxy:
    """
    Creates a feature object that can be directly used.

    It contains all features from Caliskan et al., USENIX. It *also* contains the ARFF
    feature set, and thus layout features. The individual feature objects are loaded and
    linked to each other in the order arff -> unigram -> clang objects, and the head of
    that chain is wrapped in a proxy. Before returning, a short consistency check between
    the unigram, the first clang and the arff object is run.

    :param config_learning: configuration
    :return: StyloFeaturesProxy that references the first object of the chain.
    :raises AssertionError: if the ids or authors of the checked objects do not match.
    """
    # load all feature sets
    arffmatrix_train = extract_train_test_arff(config_learning=config_learning)
    unigrammmatrix_train = extract_train_test_unigram(
        config_learning=config_learning, tf=True, idf=False, ngram_range=(1, 1))
    clangmatrices_train = extract_train_test_clang(config_learning=config_learning)

    # aggregate into one list
    styloelements: typing.List[StyloFeatures] = [arffmatrix_train, unigrammmatrix_train, *clangmatrices_train]

    # now link all elements to form one feature object
    for predecessor, successor in zip(styloelements, styloelements[1:]):
        predecessor.setnextstylo(successor)

    features_merged = StyloFeaturesProxy(codestyloreference=styloelements[0])

    # perform a short check
    def check(unigrammmatrix, joernmatrix, arffmatrix):
        """Assert that the three feature objects agree on ids and authors."""
        # Check that rows correspond to same file-author pair.
        # 'astype' is only used so that type hints do not show an error. The builtin `bool`
        # is used because the `np.bool` alias was deprecated in NumPy 1.20 and removed in
        # NumPy 1.24, where accessing it raises AttributeError.
        assert np.sum((unigrammmatrix.getiids() != arffmatrix.getiids()).astype(bool)) == 0
        assert np.sum((unigrammmatrix.getiids() != joernmatrix.getiids()).astype(bool)) == 0
        assert np.sum((unigrammmatrix.getauthors() != arffmatrix.getauthors()).astype(bool)) == 0
        assert np.sum((unigrammmatrix.getauthors() != joernmatrix.getauthors()).astype(bool)) == 0

        # had some trouble with this column/feature for 2 authors, just check it...
        colnames_simple = np.array([x.colname for x in joernmatrix.getcolnamesraw()])
        # TODO remove me.
        print(np.max(joernmatrix.getfeaturematrix().toarray()[:, np.where(colnames_simple == "max_depth_ast_node")[0]]))

    check(unigrammmatrix_train, clangmatrices_train[0], arffmatrix_train)

    return features_merged
os.makedirs(modelsavedir) if not os.path.exists(modelsavedir) else print("Use existing dir for models", file=sys.stderr) else: modelsavedir = None ############## Get lexical, layout and syntactic features ############## if feature_method == "Usenix": features_merged: StyloFeaturesProxy = utils_extraction.extract_link_train_test_usenix_features( config_learning=configuration_learning) elif feature_method == "CCS18": assert configuration_learning.use_lexems is not True unigrammmatrix_train: StyloFeatures = utils_extraction.extract_train_test_unigram( config_learning=configuration_learning, tf=True, idf=True, ngram_range=(1, 3)) features_merged: StyloFeaturesProxy = StyloFeaturesProxy(codestyloreference=unigrammmatrix_train) else: raise Exception("feature_method") ############## Split dataset into train - test set with our our grouped stratified k-fold ############## skf2 = StratifiedKFoldProblemId.StratifiedKFoldProblemId(iids=features_merged.getiids(), n_splits=8, shuffle=True, random_state=411, nocodesperprogrammer=configuration_learning.probsperprogrammer) print("No splits:", skf2.get_n_splits()) ############## Do training + testing on each split ############## accuracy = {}
# Last, let's link all clang objects for ix in range(1, len(clangmatrices_train)): clangmatrices_train[ix - 1].setnextstylo(clangmatrices_train[ix]) # In this way, we create a chain of feature objects, where: # arffmatrix_train -> unigrammmatrix_train -> clangmatrix1 -> clangmatrix2 -> ... -> clangmatrixn # If we call now arffmatrix_train.getfeaturematrix(), it will build a large feature matrix by combining all feature matrices # from the chained feature objects. print(arffmatrix_train.getfeaturematrix().shape) # shows (1632, 29141) # we get a feature matrix with 1632 rows (8 challenges * 204 authors) and 29141 features # The feature matrix contains the arff features, the unigram features and the clang features now. # Usually, but not necessary, we pass the first object in the chain to StyloFeaturesProxy. # The idea is that this class could implement some caching later, to speed up the retrieval of the feature matrix... features_merged: StyloFeatures = StyloFeaturesProxy( codestyloreference=arffmatrix_train) ### Learning ############## Split dataset into train - test set with our our grouped stratified k-fold ############## skf2 = StratifiedKFoldProblemId.StratifiedKFoldProblemId( iids=features_merged.getiids(), n_splits=8, shuffle=True, random_state=411, nocodesperprogrammer=configuration_learning.probsperprogrammer) ############## Do training + testing on each split ############## accuracy = {} for train_index, test_index in skf2.split(None, None):