def doc2vec_train(train, path_to_walks, app_id_fp):
    """Train a Doc2Vec model on the metapath2vec walks of the training apps.

    USE ON SEPARATE APKs.

    train --> list of training app filepaths (.gml.bz2); reduced to bare app names
    path_to_walks --> directory containing the metapath2vec walks
    app_id_fp --> csv with at least `app_fp` and `app_label` columns

    Returns [trained Doc2Vec model, labels aligned with the training documents].
    """
    documents = []
    labels = []
    df = pd.read_csv(app_id_fp)
    train = [utils.dir_and_app(app)[1] for app in train]
    paths = [
        os.path.join(path_to_walks, appname + "m2v_walks.txt")
        for appname in train
    ]
    for path in tqdm(paths):
        try:
            doc = np.loadtxt(path, dtype=object)
            _, appname = utils.dir_and_app(path)
            # first csv row whose app_fp contains this app name supplies the label
            label = df[df.app_fp.str.contains(appname)].app_label.iloc[0]
        except Exception:
            # walk file missing/unreadable, or app absent from the csv
            print("app ", path, " is broken")
        else:
            # append both only after BOTH succeeded, so documents and labels
            # can never fall out of alignment (the original appended the
            # document before the label lookup could still fail)
            documents.append(doc)
            labels.append(label)
    docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
    documents = []  # release the raw walks before training
    model = Doc2Vec(docs, vector_size=100, window=300, min_count=1, workers=4)
    return [model, labels]
def building_embeddings(app_ids_fp, metapathsFP, walksFP):
    """Collect the unstacked metapath2vec walks for every app in the csv.

    app_ids_fp --> filepath to the csv listing each app (`app_fp` column)
    metapathsFP --> directory where per-app metapath txts are stored
    walksFP --> directory where per-app walk txts are stored

    Returns a single flat list of walks over all apps whose metapath AND walk
    files both exist on disk.
    """
    df = pd.read_csv(app_ids_fp)
    walks = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        app_fp = row["app_fp"]
        _, app_name = utils.dir_and_app(app_fp)
        metapath_fp = os.path.join(metapathsFP, app_name + "m2v_metapaths.txt")
        walk_fp = os.path.join(walksFP, app_name + "m2v_walks.txt")
        # `and` (not bitwise `&`) for boolean short-circuit; skip apps that
        # have not been fully preprocessed yet
        if os.path.exists(walk_fp) and os.path.exists(metapath_fp):
            # extend in place instead of `walks = walks + walk`, which copied
            # the accumulated list on every iteration (quadratic)
            walks.extend(unstack_walks(walk_fp, app_name))
    return walks
def doc2vec_test(test, path_to_walks, app_id_fp, model, target, metapathsFP, walksFP):
    """Infer document vectors for unseen (test) apps with a trained model.

    test --> list of test app names, or .gml.bz2 filepaths
    path_to_walks --> directory containing the per-app walk txts
    app_id_fp --> csv with `app_fp` and `app_label` columns
    model --> the trained Doc2Vec model
    target, metapathsFP, walksFP --> unused here; kept for interface
        compatibility (they fed a `wrapper(...)` call that is now disabled)

    Returns [list of inferred vectors, labels aligned with them].
    """
    documents = []
    labels = []
    df = pd.read_csv(app_id_fp)
    # accept either full .gml.bz2 paths or bare app names
    if ".gml.bz2" in test[0]:
        app_names = [utils.dir_and_app(app)[1] for app in test]
    else:
        app_names = test
    paths = [
        os.path.join(path_to_walks, appname + "m2v_walks.txt")
        for appname in app_names
    ]
    for walk in paths:
        # silently skip apps whose walks were never generated
        if os.path.exists(walk):
            documents.append(np.loadtxt(walk, dtype=object))
            _, appname = utils.dir_and_app(walk)
            labels.append(df[df.app_fp.str.contains(appname)].app_label.iloc[0])
    X = [model.infer_vector(doc) for doc in documents]
    return [X, labels]
def API_abstraction_vectorized(inFP, outFP, kind, to_return, truename=False):
    """Abstract the nodes and edges of ONE app's call graph to a given level.

    inFP --> input file path (should be .gml.bz2)
    outFP --> output directory (currently unused by this function)
    kind --> (str) FAMILY or PACKAGE or CLASS
    to_return --> "NX" for a networkx graph, "SG" for a StellarGraph
    truename --> attach the real app name to the APK node instead of ""

    Returns [graph, metapaths]; on a read failure returns an error string
    (callers check for that), and returns None for an unknown `to_return`.
    """
    direc, app_name = utils.dir_and_app(inFP)
    try:
        networkx = nx.read_gml(inFP)
    except Exception:
        return inFP + " might be broken!"
    nx_nodes = np.array(networkx.nodes(data=True))
    nx_edges = np.array(networkx.edges, dtype=object)
    # (dead `np.vectorize` wrappers removed — the list comprehensions below
    # were already doing the per-element work)
    newnodes = [API_abstraction(kind, node) for node in nx_nodes]
    newedges = [edge_processing(kind, edge) for edge in nx_edges]
    G = nx.MultiDiGraph()
    G.add_nodes_from(newnodes)
    G.add_edges_from(newedges)
    # anonymous APK node unless the caller explicitly asks for the real name
    G = add_apk_node(G, app_name if truename else "")
    metapaths = dfs(G, app_name)
    if to_return == "NX":
        return [G, metapaths]
    elif to_return == "SG":
        stellar = StellarGraph.from_networkx(G, node_type_attr="type")
        return [stellar, metapaths]
def get_markov(inFP, outFP, kind):
    """Build and save the Markov-chain transition vector for one app.

    The chain's states S are the possible packages/families plus
    "self_defined"; the transition probability is
    Pjk = #edges(j -> k) / #total edges.

    inFP --> input file path (should be .gml.bz2)
    outFP --> output directory
    kind --> (str) FAMILY or PACKAGE
    """
    direc, app_name = utils.dir_and_app(inFP)
    outputfp = os.path.join(outFP, app_name + "_" + kind + ".txt")
    if os.path.exists(outputfp):
        print("app ", inFP, " is already done!")
        return
    try:
        networkx = nx.read_gml(inFP)
    except Exception:
        return inFP + " might be broken!"
    nx_nodes = np.array(networkx.nodes())
    nx_edges = np.array(networkx.edges, dtype=object)
    # convert every node/edge to package/family mode
    vfunc = np.vectorize(get_package_family)
    newnodes = vfunc(kind, nx_nodes)
    new_edges = [edge_processing(edge, kind) for edge in nx_edges]
    G = nx.MultiDiGraph()
    G.add_nodes_from(newnodes)
    G.add_edges_from(new_edges)
    stellar = StellarGraph.from_networkx(G)
    if kind == "PACKAGE":
        possible_packages = get_possible_packages()
        S = ["/".join(item).strip() for item in possible_packages] + ["self_defined"]
        # NOTE(review): possible_edges is computed but never used — kept for
        # parity with the original; confirm before removing
        possible_edges = get_possible_edges()
    elif kind == "FAMILY":
        possible_packages = POSSIBLE_FAMILIES
        possible_edges = get_possible_family_edges()
        S = possible_packages + ["self_defined"]
    total_edges = stellar.number_of_edges()
    edge_counts = pd.Series(stellar.edges()).value_counts()
    markov = []
    for j in S:
        for k in S:
            # BUG FIX: a missing (j, k) key on a pandas Series raises
            # KeyError, but the original caught ValueError, so absent
            # transitions crashed instead of contributing 0. Series.get
            # returns the default for missing keys.
            markov.append(edge_counts.get((j, k), 0) / total_edges)
    # save only when the probabilities sum to ~1 (sanity check on the chain)
    if round(sum(markov)) == 1 and not os.path.exists(outputfp):
        try:
            np.savetxt(outputfp, markov, fmt="%s")
            print("the app: ", inFP, " is done!", "mode: ", kind)
            return inFP + " IS FINISHED!"
        except Exception:
            print("the app: ", inFP, " encountered errors!")
def wrapper(apk, target, metapathsFP, walksFP):
    """Build all per-app feature files for doc2vec and metapath2vec.

    apk --> filepath to the apk (.gml.bz2) or a bare app name
    target --> directory for common-graph txts (for metapath2vec)
    metapathsFP --> directory for per-app metapath txts (for metapath2vec)
    walksFP --> directory for per-app walk txts (for doc2vec)
    """
    if ".gml.bz2" in apk:
        direc, appname = utils.dir_and_app(apk)
    else:
        appname = apk
    document_out = os.path.join(walksFP, appname + "m2v_walks.txt")
    metapath_out = os.path.join(metapathsFP, appname + "m2v_metapaths.txt")
    graph_out = os.path.join(target, appname + "graph.txt")
    # `and` (not bitwise `&`) for boolean short-circuit
    if (os.path.exists(document_out) and os.path.exists(metapath_out)
            and os.path.exists(graph_out)):
        print("the app: ", apk, " is already done!")
        return
    try:
        networkx, metapaths = API_abstraction_vectorized(
            apk, "", "CLASS", "NX", True)
        stellar = StellarGraph.from_networkx(networkx, node_type_attr="type")
        ################## COMMON GRAPH INFORMATION ##################
        if not os.path.exists(graph_out):
            # `with` closes the file; the original's explicit file.close()
            # inside the with-block was redundant and has been dropped
            with open(graph_out, 'a') as file:
                for edge in np.array(networkx.edges):
                    # NOTE(review): third element of a MultiDiGraph edge
                    # triple is the edge key — confirm it really is the weight
                    node1, node2, weight = edge
                    type1 = networkx.nodes[node1]["type"]
                    type2 = networkx.nodes[node2]["type"]
                    # columns are: ["node1", "node2", "weight", "type1", "type2"]
                    row = " ".join([node1, node2, weight, type1, type2]) + "\n"
                    file.write(row)
        ##############################################################
        ################## DOC2VEC AND METAPATH2VEC INFORMATION ##################
        try:
            # OUTPUT WALKS OF ONE APP
            if not os.path.exists(document_out):
                document = metapath2vec(stellar, 500, metapaths)
                np.savetxt(document_out, np.hstack(document), fmt="%s")
            # OUTPUT METAPATHS OF ONE APP
            if not os.path.exists(metapath_out):
                joined = ["->".join(lst) for lst in metapaths]
                np.savetxt(metapath_out, joined, fmt="%s")
            print("the app: ", apk, " has finished!")
        except Exception:
            print("The app: ", apk, " seems to be broken!")
    except Exception:
        # also reached when API_abstraction_vectorized returns its error
        # string (the tuple unpacking above then fails)
        print("The app: ", apk, " seems to be broken!")
def metapath2vec(commonFP, train_apps, metapathFP, app_ids_fp, walksFP, mdl_fp,
                 walk_length=100, reduced=False, subset=False, testing=False):
    """Run metapath2vec over the common graph and return features and labels.

    commonFP --> directory containing txts for the common graph
        (columns: ["node1", "node2", "weight", "type1", "type2"])
    train_apps --> training app names (or .gml.bz2 paths when testing=True)
    metapathFP --> directory of per-app metapaths (from doc2vec)
    app_ids_fp --> csv with `app_fp` and `app_label` columns
    walksFP --> directory of per-app walks (from doc2vec)
    mdl_fp --> where to save/load the Word2Vec model
    walk_length --> unused here; kept for interface compatibility
    reduced --> use the reduced metapath set instead of the full one
    subset --> forwarded to get_commongraph
    testing --> strip directories from train_apps first

    Returns [X, y]: one embedding and one label per root node.
    """
    df = pd.read_csv(app_ids_fp)
    if testing:
        train_apps = [utils.dir_and_app(item)[1] for item in train_apps]
    # get the common graph
    commongraph = get_commongraph(commonFP, train_apps, subset)
    # get the metapaths (reduced or full set)
    metapaths = reduced_metapaths(metapathFP) if reduced else full_metapaths(metapathFP)
    # get root nodes as indices
    nodes, indices = root_nodes(commongraph, metapaths)
    # run metapath2vec
    walks = run(commongraph, indices, metapaths)
    if os.path.exists(mdl_fp):
        print("model already exists, will load it in .....")
        model = Word2Vec.load(mdl_fp)
    else:
        # NOTE(review): `size`/`iter` are gensim<4 parameter names (renamed
        # vector_size/epochs in gensim 4) — confirm the pinned gensim version
        model = Word2Vec(walks, size=128, window=5, min_count=0, sg=1,
                         workers=2, iter=1)
        model.save(mdl_fp)
    X = []
    y = []
    for node in nodes:
        X.append(build_embedding(node, walksFP, model))
        try:
            y.append(df[df.app_fp.str.contains(node)].app_label.iloc[0])
        except Exception:
            # node absent from the csv — presumably defaults to the benign/0
            # label; TODO confirm this intent
            y.append(0)
    return [X, y]