def fit(self, metapaths=None, d=128, r=10, l=80, k=10):
    """
    Pipeline for representation learning for all nodes in a graph.

    :param metapaths: list of metapaths (each a list of node types) to guide the walks
    :param d: dimensionality of the learned embeddings
    :param r: number of random walks per root node
    :param l: maximum length of each random walk
    :param k: context window size (passed to learn_embeddings)
    :return: None
    """
    self._assert_positive_int(d, msg="d should be positive integer")
    self._assert_positive_int(r, msg="r should be positive integer")
    self._assert_positive_int(l, msg="l should be positive integer")
    self._assert_positive_int(k, msg="k should be positive integer")

    start_time_fit = time.time()

    # self.G = node2vec.Graph(self.nxG, False, p, q)
    # self.G.preprocess_transition_probs()
    metapath_walker = UniformRandomMetaPathWalk(self.nxG)

    # walks = self.G.simulate_walks(r, l)
    time_b = time.time()
    walks = metapath_walker.run(
        nodes=list(self.nxG.nodes()),
        metapaths=metapaths,
        length=l,
        n=r,
        node_type_attribute="label",
        seed=None,
    )
    print("({}) Time for random walks {:.0f} seconds.".format(
        type(self).__name__, time.time() - time_b))

    self.learn_embeddings(walks, d, k)

    print("Total time for fit() was {:.0f} seconds.".format(time.time() - start_time_fit))
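# A minimal sketch of what the `metapaths` argument to fit() could look like.
# StellarGraph's UniformRandomMetaPathWalk expects each metapath to be a list
# of node types beginning and ending with the same type; the "user"/"movie"
# types below are purely illustrative assumptions, not taken from this code.
example_metapaths = [
    ["user", "movie", "user"],
    ["user", "user"],
]
# model.fit(metapaths=example_metapaths, d=128, r=10, l=80, k=10)  # `model` is hypothetical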
def metapath2vec(G, walk_length, metapaths):
    """
    Performs the random-walk stage of metapath2vec and returns the generated walks.

    G --> StellarGraph object of the graph
    walk_length --> int, defines how long the sentences (walks) should be
    metapaths --> list of metapaths (each a list of node types) to guide the walks
    """
    rw = UniformRandomMetaPathWalk(G)
    walks = rw.run(nodes=list(G.nodes()), length=walk_length, n=10, metapaths=metapaths)
    return walks
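# The function above only produces walks; a hedged sketch of the remaining
# metapath2vec step (skip-gram over the walks), following the gensim 3.x API
# used elsewhere in these snippets (in gensim 4.x, `size` becomes `vector_size`).
# `G` and `metapaths` are assumed to be defined.
from gensim.models import Word2Vec

m2v_walks = metapath2vec(G, walk_length=100, metapaths=metapaths)
m2v_model = Word2Vec([[str(n) for n in walk] for walk in m2v_walks],
                     size=128, window=5, min_count=0, sg=1)
node_embeddings = m2v_model.wv.vectors  # one 128-dim vector per node seen in a walk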
def _metapath_randomwalk(graph):
    # Create the random walker
    rw = UniformRandomMetaPathWalk(StellarGraph(graph))

    # specify the metapath schemas as a list of lists of node types.
    walks = rw.run(nodes=list(graph.nodes()),  # root nodes
                   length=WALK_DISTANCE,       # maximum length of a random walk
                   n=1,                        # number of random walks per root node
                   metapaths=METAPATHS         # the metapaths
                   )
    print("Number of random walks: {}".format(len(walks)))
    return walks
def metapath2vec_walk(G, params):
    """Performs uniform random metapath walks using StellarGraph to generate
    the corpus used in metapath2vec, and writes the corpus to a txt file.

    :param G : StellarGraph graph
        Nodes consist of apps, api calls, packages, and invoke methods
    :param params : dict
        dict["key"] where dict is the global parameter dictionary and key
        returns the metapath2vec parameter sub-dictionary
    """
    fp = os.path.join(params["save_dir"], params["filename"])
    os.makedirs(params["save_dir"], exist_ok=True)

    # Create the random walker
    rw = UniformRandomMetaPathWalk(G)

    print("Starting MetaPath Walks")
    start_walks = time.time()
    walks = rw.run(
        nodes=list(G.nodes_of_type("app_nodes")),  # root nodes (app_nodes)
        length=params["walk_length"],              # maximum length of a random walk
        n=params["n"],                             # number of random walks per root node
        metapaths=params["metapaths"]              # the metapaths
    )
    print("--- Done Walking in " + str(int(time.time() - start_walks)) + " Seconds ---")
    print()
    print("Number of metapath walks: {}".format(len(walks)))

    # save walks to file (the with-block closes the file automatically)
    with open(fp, 'w') as f:
        for walk in walks:
            for node in walk:
                f.write(str(node) + ' ')
            f.write('\n')

    if params["verbose"]:
        print("Saved %s to %s" % (params["filename"], params["save_dir"]))
    return
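# A hedged example of the `params` sub-dictionary this function expects. The
# metapath below reuses the "app_nodes" type named in the code, but the other
# node type and all values are illustrative assumptions, not from the source.
example_params = {
    "save_dir": "data/walks",
    "filename": "metapath_walks.txt",
    "walk_length": 80,
    "n": 10,
    "metapaths": [["app_nodes", "api_nodes", "app_nodes"]],  # hypothetical schema
    "verbose": True,
}
# metapath2vec_walk(G, example_params)  # G assumed to be a StellarGraph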
def load4graph(self):
    """
    Load all edge and vertex information into the graph, and create the
    random walker (rw) used to generate walk paths.
    :return: the NetworkX graph and the metapath walker
    """
    g_nx = self.load_dataset_SMDB(self.location, self.graph_infos)
    print("Number of nodes {} and number of edges {} in graph.".format(
        g_nx.number_of_nodes(), g_nx.number_of_edges()))

    from stellargraph.data import UniformRandomMetaPathWalk
    rw = UniformRandomMetaPathWalk(StellarGraph(g_nx))
    return g_nx, rw
def common_metapath2vec(metapaths, commongraph, root_nodes, walk_length):
    """
    Runs uniform random metapath walks over the common graph and returns the walks.

    metapaths --> list of metapaths (each a list of node types) to guide the walks
    commongraph --> StellarGraph object for the combined (common) graph
    root_nodes --> list of nodes to start the walks from
    walk_length --> int, maximum length of each random walk
    """
    # start traversal
    rw = UniformRandomMetaPathWalk(commongraph)
    walks = rw.run(
        nodes=root_nodes,     # root nodes
        length=walk_length,   # maximum length of a random walk
        n=1,                  # number of random walks per root node
        metapaths=metapaths,  # the metapaths
    )
    print("Number of random walks: {}".format(len(walks)))
    return walks
def generate_random_walks(graph_obj, num_walks_per_node, walk_length, metapaths):
    random_walk_object = UniformRandomMetaPathWalk(graph_obj)
    cpu_count = multiprocessing.cpu_count()
    list_nodes = list(graph_obj.nodes())
    num_chunks = cpu_count
    chunk_len = len(list_nodes) // num_chunks
    # num_chunks + 1 slices so the remainder after floor division is covered
    chunks = [
        list_nodes[i * chunk_len: (i + 1) * chunk_len]
        for i in range(0, num_chunks + 1)
    ]
    res = Parallel(n_jobs=cpu_count)(
        delayed(aux_gen_walks)(
            node_chunk, walk_length, random_walk_object, metapaths, num_walks_per_node
        )
        for node_chunk in chunks
    )
    all_walks = []
    for r in res:
        all_walks.extend(r)
    return all_walks
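# `aux_gen_walks` is referenced above but not defined in this snippet. A
# minimal sketch of what it presumably does, assuming it simply runs the
# shared walker on one chunk of root nodes:
def aux_gen_walks(node_chunk, walk_length, random_walk_object, metapaths, num_walks_per_node):
    # run metapath-guided walks rooted at just this chunk of nodes
    return random_walk_object.run(
        nodes=node_chunk,
        length=walk_length,
        n=num_walks_per_node,
        metapaths=metapaths,
    )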
    # add the typed edges: user-movie ratings, user-user friendships,
    # movie-movie similarity, and movie-aspect inclusion
    g_nx.add_edges_from(u_m_edges, label="rating")
    g_nx.add_edges_from(u_u_edges, label="friend")
    g_nx.add_edges_from(m_m_edges, label="same")
    g_nx.add_edges_from(m_a_edges, weight=1, label="include")
    print(g_nx.number_of_nodes())
    print(g_nx.number_of_edges())
    return g_nx

g_nx = construct_hin_pandas()

# Create the random walker
rw = UniformRandomMetaPathWalk(StellarGraph(g_nx))

# specify the metapath schemas as a list of lists of node types.
metapaths = [
    ["user", "user", "movie"],
    # ["user", "movie", "aspect", "movie"],
]
walks = rw.run(
    nodes=list(g_nx.nodes()),  # root nodes
    length=3,                  # maximum length of a random walk
    n=50,                      # number of random walks per root node
    metapaths=metapaths        # the metapaths
)
for i in range(1000):
def model_train():
    print("Loading files..")
    cluster = MongoClient('mongodb+srv://nirmal:[email protected]/<dbname>?retryWrites=true&w=majority')
    db = cluster.Dataset
    pcol = db.posts
    vcol = db.views
    fcol = db.follows
    favcol = db.favourites
    # posts = pd.read_csv('./posts.csv', engine='python')
    # users = pd.read_csv('./users.csv')
    # views = pd.read_csv('./views.csv')
    # favorites = pd.read_csv('./favourites.csv')
    # userPosts = pd.read_csv('./usersPosts.csv')
    # print("Files loaded..")

    '''Create DataFrames for preprocessing'''
    posts = pd.DataFrame(list(pcol.find()))
    views = pd.DataFrame(list(vcol.find()))
    favorites = pd.DataFrame(list(favcol.find()))
    userPosts = posts[['_id', 'postedBy']]
    follows = pd.DataFrame(list(fcol.find()))
    print("Collections loaded..")

    print("Started preprocessing..")
    views = views[views['user_id'] != 'anonymous']
    posts = posts.dropna(subset=['title', ' post_type', 'tags'])
    posts['category'] = posts['category'].fillna(posts['tags'])
    posts['tags'] = posts['tags'].apply(clean_text)

    """Splitting on '|' and '#' to get the categories"""
    uniq_category = dict()
    uniq_post_type = dict()
    i = 0
    j = 0
    for cats, pt in zip(posts['category'].values, posts[' post_type'].values):
        for cat in re.split('[#|]', cats):
            if cat not in uniq_category.keys():
                uniq_category[cat] = i
                i += 1
        if pt not in uniq_post_type.keys():
            uniq_post_type[pt] = j
            j += 1

    category_ohe = np.zeros((len(posts), 513))
    for i, cats in enumerate(posts['category'].values):
        for cat in re.split('[#|]', cats):
            category_ohe[i][uniq_category[cat]] = 1

    token_tag = [word_tokenize(tag) for tag in posts['tags'].values.tolist()]
    tag_model = Word2Vec(token_tag, sg=1, size=100, window=5, min_count=5, workers=4, iter=100)
    tag_model.save('./tag.model')
    tag_model = Sentence2Vec('./tag.model')

    processed_title = posts['title'].apply(clean_text)
    token_title = [word_tokenize(tag) for tag in processed_title]
    title_model = Word2Vec(token_title, sg=1, size=100, window=5, min_count=5, workers=4, iter=100)
    title_model.save('./title.model')
    title_model = Sentence2Vec('./title.model')

    posts_info = dict()
    for pid, title, cat, tag in zip(posts['_id'], posts['title'].values, category_ohe, posts['tags'].values):
        posts_info[pid] = dict()
        posts_info[pid]['title'] = title_model.get_vector(title)
        posts_info[pid]['tag'] = tag_model.get_vector(tag)
        posts_info[pid]['cat'] = cat

    """Removing rows in views, favorites and userPosts whose pid is not present in posts"""
    pidr = set()
    for pid in views['post_id']:
        if posts_info.get(pid, 0) == 0:
            pidr.add(pid)
    for pid in favorites['post_id']:
        if posts_info.get(pid, 0) == 0:
            pidr.add(pid)
    for pid in userPosts['post_id']:
        if posts_info.get(pid, 0) == 0:
            pidr.add(pid)
    for pid in list(pidr):
        views = views[views['post_id'] != pid]
        userPosts = userPosts[userPosts['post_id'] != pid]
        favorites = favorites[favorites['post_id'] != pid]

    """Representing the user based on the categories seen by the user"""
    users_info = defaultdict(lambda: np.zeros((513)))
    for uid, pid in zip(views['user_id'], views['post_id']):
        a = posts_info[pid]['cat']
        users_info[uid] = np.add(users_info[uid], a)
        assert (np.sum(users_info[uid]) != 0)

    """Increasing the weightage for categories by 100% for posts posted by the user"""
    for uid, pid in zip(userPosts['user_id'], userPosts['post_id']):
        a = posts_info[pid]['cat']
        users_info[uid] = np.add(users_info[uid], a)
        assert (np.sum(users_info[uid]) != 0)

    """Increasing weightage for categories by 50% for favourite posts"""
    for uid, pid in zip(favorites['user_id'], favorites['post_id']):
        a = 1 / 2 * posts_info[pid]['cat']
        users_info[uid] = np.add(users_info[uid], a)
        assert (np.sum(users_info[uid]) != 0)

    """## MODEL

    Generating negative datapoints for each user, where the chosen posts have
    categories the user has never seen
    """
    def gen_pseudoDP(user_id):
        cat_user = users_info[user_id]
        arr = []
        k = 0
        for pid in posts_info.keys():
            cat = posts_info[pid]['cat']
            flag = 0
            for i in range(len(cat)):
                if (cat[i] != 0 and cat_user[i] != 0):
                    flag = 1
                    break
            if flag == 0:
                arr.append([user_id, pid, 0])
                k += 1
            if k == 4:
                break
        return arr

    pseudo = pd.DataFrame(np.zeros((len(users_info) * 4, 3)), columns=['user_id', 'post_id', 'view'])
    i = 0
    for uid in list(users_info.keys()):
        arr = gen_pseudoDP(uid)
        if len(arr):
            pseudo[i:i + len(arr)] = arr
        i += 4

    views['view'] = np.ones((len(views)))
    views = views.drop(columns=['timestamp'])
    data = views.append(pseudo)
    print("Preprocessing done!")

    class Datagenerator(tf.keras.utils.Sequence):
        def __init__(self, X, y=None, batch_size=1, shuffle=True):
            super().__init__()
            self.X = X
            self.y = y
            self.batch_size = batch_size
            self.on_epoch_end()

        def __getitem__(self, index):
            indices = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
            batch = self.X.iloc[indices]
            y = self.y.iloc[indices]
            user = np.zeros((self.batch_size, len(uniq_category)))
            title = np.zeros((self.batch_size, 100))
            tag = np.zeros((self.batch_size, 100))
            category = np.zeros((self.batch_size, len(uniq_category)))
            for i in range(self.batch_size):
                title[i] = posts_info[batch.post_id.values[i]]['title']
                tag[i] = posts_info[batch.post_id.values[i]]['tag']
                category[i] = posts_info[batch.post_id.values[i]]['cat']
                user[i] = users_info[batch['user_id'].values[i]]
            return [user, title, tag, category], y.values.reshape(-1, 1)

        def __len__(self):
            return int(np.floor(len(self.X) / self.batch_size))

        def on_epoch_end(self):
            self.indexes = np.arange(len(self.X))
            np.random.shuffle(self.indexes)

    y = data['view']
    X = data.drop(['view'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    train_dg = Datagenerator(X_train, y_train, 128)
    test_dg = Datagenerator(X_test, y_test, 128)

    """The model predicts whether a user will view a post. User embeddings are
    learnt from this task and then used for recommendation."""
    def create_model():
        user_inp = Input((len(uniq_category)))
        embed = Embedding(input_dim=len(uniq_category), output_dim=50)(user_inp)
        dense = Dense(2056)(Flatten()(embed))
        user = Dense(500, activation='relu')(dense)
        user = Dense(400, activation='relu')(user)
        cat = Input((len(uniq_category)))
        cat_ = Dense(300, activation='relu')(cat)
        title = Input((100))
        title_ = Dense(50, activation='relu')(title)
        tag = Input((100))
        tag_ = Dense(50, activation='relu')(tag)
        post_concat = Concatenate()([cat_, title_, tag_])
        output = Dot(axes=[-1, -1], normalize=True)([user, post_concat])
        model = tf.keras.Model([user_inp, title, tag, cat], output)
        return model

    model = create_model()
    model.compile(optimizer=Adagrad(lr=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
    print("model started training...")
    model.fit_generator(train_dg, validation_data=test_dg, epochs=1)
    print("Model trained")

    """Retrieving trained user embeddings"""
    user_embeddings = model.get_layer('embedding').get_weights()[0]

    follows = pd.read_csv('./follows.csv')
    follows = follows.drop(['timestamp'], axis=1)

    """Users present in follows.csv"""
    uids = np.concatenate((follows['user_id'].values, follows['follower_id'].values))
    uids = set(uids)

    """Creating edges"""
    edges = [(y, x) for x, y in zip(follows['user_id'], follows['follower_id'])]

    """Creating a directed graph and adding the edges"""
    G = nx.DiGraph()
    G.add_edges_from(edges)
    edges_dict = dict()
    for edge in edges:
        edges_dict[edge] = 1

    rw = UniformRandomMetaPathWalk(StellarDiGraph(G))

    """Creating random walks. Each walk is a chain uid -> uid -> uid ... of length 100."""
    walks = rw.run(nodes=list(uids), length=100, n=2, metapaths=[['default', 'default']])

    """Word2Vec on those chains"""
    user_model = Word2Vec(walks, size=128, window=5)
    user_model.wv.vectors.shape

    """Each user is represented by a 128-dim vector"""
    node_ids = user_model.wv.index2word
    node_embed = user_model.wv.vectors

    print("Pushing to database...")
    userCollection = cluster.Users.User_Embeddings
    userCollection.delete_many({})
    followCollection = cluster.Users.Follows
    followCollection.delete_many({})
    posted = cluster.Users.Posted
    posted.delete_many({})
    catCol = cluster.Users.Categories
    catCol.delete_many({})
    embedCol = cluster.Users.Embedding_Matrix
    embedCol.delete_many({})

    folDict = dict()
    for i, id in enumerate(node_ids):
        folDict[id] = i

    user_ins = []
    for user in tqdm(users_info.keys()):
        embed = list(np.matmul(users_info[user], user_embeddings))
        if folDict.get(user, -1) == -1:
            user_ins.append({'user_id': user, 'user_embed': embed})
        else:
            yo = node_embed[folDict[user]].tolist()
            user_ins.append({'user_id': user, 'user_embed': embed, 'node_embed': yo})
    userCollection.insert_many(user_ins)

    fol = []
    for uid, fid in tqdm(zip(follows['user_id'], follows['follower_id'])):
        d = dict()
        d['user_id'] = uid
        d['follower_id'] = fid
        fol.append(d)
    followCollection.insert_many(fol)

    categories = pickle.dumps(uniq_category)
    user_embed = pickle.dumps(user_embeddings)
    catCol.insert_one({"Categories": categories})
    embedCol.insert_one({"Matrix": user_embed})

    uids = set()
    for uid in userPosts['user_id']:
        uids.add(uid)
    to_ins = []
    for uid in uids:
        noob = dict()
        noob['user_id'] = uid
        to_ins.append(noob)
    posted.insert_many(to_ins)

    requests.get('http://3.7.185.166/train')
    print("Done!")
uids = set(uids)

"""Creating edges"""
edges = [(y, x) for x, y in zip(follows['followed'], follows['follower'])]

"""Creating a directed graph and adding the edges"""
G = nx.DiGraph()
G.add_edges_from(edges)
edges_dict = dict()
for edge in edges:
    edges_dict[edge] = 1

rw = UniformRandomMetaPathWalk(StellarDiGraph(G))

"""Creating random walks. Each walk is a chain uid -> uid -> uid ... of length 100."""
walks = rw.run(nodes=list(uids), length=100, n=2, metapaths=[['default', 'default']])

"""Word2Vec on those chains"""
user_model = Word2Vec(walks, size=128, window=5)
user_model.wv.vectors.shape
def get_features(outfolder, walk_args=None, w2v_args=None, redo=False):
    '''
    Implements metapath2vec by:
        1. Building a graph,
        2. Performing random metapath walks, then
        3. Applying word2vec on the walks generated.
    ---------
    Parameters:
    outfolder: Path to directory where output will be saved, should contain app_list.csv
    walk_args: Arguments for stellargraph.data.UniformRandomMetaPathWalk
    w2v_args: Arguments for gensim.models.Word2Vec
    '''
    # save parameters to outfolder
    params = {
        "outfolder": outfolder,
        "walk_args": walk_args,
        "w2v_args": w2v_args
    }
    with open(os.path.join(outfolder, 'params.json'), 'w') as param_file:
        json.dump(params, param_file)

    # define paths
    app_list_path = os.path.join(outfolder, 'app_list.csv')
    nodes_path = os.path.join(outfolder, 'nodes.json')
    edge_path = os.path.join(outfolder, 'edges.csv')
    graph_path = os.path.join(outfolder, 'graph.pkl')
    feature_path = os.path.join(outfolder, 'features.csv')
    app_heap_path = os.path.join('data', 'out', 'all-apps', 'app-data/')
    metapath_walk_outpath = os.path.join(outfolder, 'metapath_walk.json')

    # generate app list
    apps_df = pd.read_csv(app_list_path)
    app_data_list = app_heap_path + apps_df.app + '.csv'

    if os.path.exists(graph_path) and not redo:
        # load graph from file if present
        with open(graph_path, 'rb') as file:
            g = pickle.load(file)
    else:
        # otherwise compute from data
        g = build_graph(outfolder, app_data_list, nodes_path, edge_path)
        # save graph to file
        with open(graph_path, 'wb') as file:
            pickle.dump(g, file)

    if os.path.exists(metapath_walk_outpath) and not redo:
        # load walks from file if present
        with open(metapath_walk_outpath, 'r') as file:
            metapath_walks = json.load(file)
    else:
        # otherwise compute from data:
        # random walk on all apps, save to metapath_walk.json
        print('Performing random walks')
        rw = UniformRandomMetaPathWalk(g)
        app_nodes = list(
            apps_df.app.map(
                pd.read_csv(os.path.join(outfolder, 'app_map.csv'), index_col='app').uid
            )
        )
        metapath_walks = rw.run(app_nodes, n=walk_args['n'],
                                length=walk_args['length'],
                                metapaths=walk_args['metapaths'])
        with open(metapath_walk_outpath, 'w') as file:
            json.dump(metapath_walks, file)

    print('Running Word2vec')
    w2v = Word2Vec(metapath_walks, **w2v_args)

    features = pd.DataFrame(w2v.wv.vectors)
    features['uid'] = w2v.wv.index2word
    features['app'] = features['uid'].map(
        pd.read_csv(os.path.join(outfolder, 'app_map.csv'), index_col='uid').app
    )
    features = features[features.uid.str.contains('app')].set_index('uid')
    features.to_csv(feature_path)
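# A hedged example of the two argument dicts get_features() expects. The
# node-type names in the metapath are illustrative assumptions (the real types
# come from build_graph), and the Word2Vec keys follow the gensim 3.x API.
example_walk_args = {
    "n": 10,                               # walks per root node
    "length": 80,                          # maximum walk length
    "metapaths": [["app", "api", "app"]],  # hypothetical node-type schema
}
example_w2v_args = {"size": 128, "window": 5, "min_count": 0, "sg": 1, "workers": 4}
# get_features("data/out/experiment1", example_walk_args, example_w2v_args)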
def fit_predict(self, path):
    outpath = os.path.join(path, f'm2v-{self.name}')
    os.makedirs(outpath, exist_ok=True)

    # get app data, compute unique apis
    apps = pd.read_csv(os.path.join(path, 'app_list.csv'),
                       usecols=['app'], squeeze=True, dtype=str)
    # apps = set(apps)
    app_data_list = os.path.join('data', 'out', 'all-apps', 'app-data/') + apps + '.csv'

    print('Computing new edges')
    data = dd.read_csv(list(app_data_list), dtype=str,
                       usecols=['app', 'api']).drop_duplicates().compute()
    data.api = data.api.map(self.api_map)
    data.columns = ['source', 'target']
    data = data.dropna()

    nodes = self.nodes.copy()
    nodes['app'] = IndexedArray(
        index=np.array(list(nodes['app'].index) + list(apps)))
    edges = pd.concat([pd.read_csv(self.edges_path, dtype=str), data],
                      ignore_index=True).reset_index(drop=True)
    g = StellarGraph(nodes=nodes, edges=edges)
    print(g)

    print('Running random walk')
    rw = UniformRandomMetaPathWalk(g)
    walk_args = self.params['walk_args']
    new_walks = rw.run(list(apps), n=walk_args['n'],
                       length=walk_args['length'],
                       metapaths=walk_args['metapaths'])
    metapath_walks = (self.metapath_walks + new_walks)

    print('Running Word2Vec')
    # make features with word2vec
    w2v = Word2Vec(metapath_walks, **self.params['w2v_args'])

    print('Fitting model')
    features = pd.DataFrame(w2v.wv.vectors)
    features['app'] = w2v.wv.index2word
    map_func = lambda uid: uid if uid not in self.inverse_app_map else self.inverse_app_map[uid]
    features['app'] = features['app'].map(map_func)
    features = features.set_index('app')
    X_train = features.loc[self.app_map.keys()]
    # X_train = X_train.uid.map(self.inverse_app_map)
    X_test = features.loc[apps]

    # train model and predict new apps
    labels = pd.read_csv('data/out/all-apps/app_list.csv',
                         usecols=['app', 'malware'], index_col='app', squeeze=True)
    y_test = labels[X_test.index]
    y_train = labels[X_train.index]
    mdl = self.classifier(**self.classifier_args)
    mdl.fit(X_train, y_train)
    pred = mdl.predict(X_test)
    print(classification_report(y_test, pred))
    results = X_test.assign(m2vDroid=pred, true=y_test)

    # save results and training data
    results.to_csv(os.path.join(outpath, 'predictions.csv'))
    X_train.assign(m2vDroid=mdl.predict(X_train), true=y_train).to_csv(
        os.path.join(outpath, 'training_data.csv'))
    return results
phrase_id_map = pickle.load(open(data_path + "phrase_id_map.pkl", "rb"))
id_phrase_map = pickle.load(open(data_path + "id_phrase_map.pkl", "rb"))
labels, label_to_index, index_to_label = get_distinct_labels(df)
label_term_dict = get_label_term_json(data_path + "seedwords.json")
label_term_dict = modify_phrases(label_term_dict, phrase_id_map)

graph, metapaths = get_book_graph_metapaths(df, tokenizer, id_phrase_map)
print(
    "Number of nodes {} and number of edges {} in graph.".format(
        graph.number_of_nodes(), graph.number_of_edges()
    )
)

rw = UniformRandomMetaPathWalk(graph)
walks = rw.run(
    nodes=list(graph.nodes()),  # root nodes
    length=5,                   # maximum length of a random walk
    n=5,                        # number of random walks per root node
    metapaths=metapaths,        # the metapaths
)
print("Number of random walks: {}".format(len(walks)))

model = Word2Vec(walks, size=128, window=5, min_count=0, sg=1, workers=2, iter=10)
print("Embeddings shape: ", model.wv.vectors.shape)

node_ids = model.wv.index2word  # list of node IDs
node_embeddings = model.wv.vectors  # numpy.ndarray of shape (number of nodes, embedding dimensionality)
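# A short hedged follow-up: pairing the node IDs with their vectors into a
# lookup table is a common final step (the variable name is illustrative).
embedding_lookup = dict(zip(node_ids, node_embeddings))
# e.g. embedding_lookup[node_ids[0]] is the 128-dim vector for that node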