def _convert_DataFrame_to_DiGraph(self, df):
    """Build the directed graph ``self.G`` from a class/parents table.

    Each row of ``df`` contributes one edge per entry of its '|'-separated
    ``Parents`` value. Edges point from the parent to the row's ``Class ID``;
    both endpoints are passed through ``get_short_concept_name`` first.
    The graph attributes ``name``, ``year`` and ``submission_id`` are copied
    from the instance and the node ``'nan'`` is removed afterwards.

    Failures are logged through ``printf`` and not re-raised, so after an
    error ``self.G`` may be unset or incomplete.

    Args:
        df: table exposing the ``Class ID`` and ``Parents`` columns; only
            column selection and ``iterrows()`` are used on it.
    """
    # NOTE(review): the reviewed source had lost one token at several call
    # sites (e.g. ``get_short_concept_name(, parent)``). ``self.name`` is
    # assumed to be the missing value everywhere below -- confirm.
    columns = ['Class ID', 'Parents']
    try:
        tmp = df[columns]
        edges = [
            (get_short_concept_name(self.name, parent),
             get_short_concept_name(self.name, row['Class ID']))
            for _, row in tmp.iterrows()
            for parent in str(row['Parents']).split('|')
        ]

        self.G = nx.DiGraph()
        self.G.graph['name'] = self.name
        self.G.graph['year'] = self.year
        self.G.graph['submission_id'] = self.submission_id
        self.G.add_edges_from(edges)

        # 'nan' is presumably what str() yields for a missing Parents value.
        # remove_nodes_from() silently skips nodes that are not in the graph,
        # so no try/except is needed around it.
        self.G.remove_nodes_from(['nan'])

        printf('Convertion {}-{} DataFrame to DiGraph done!'.format(
            self.name, self.year))
    except Exception as ex:
        printf(ex)
        printf('ERROR: {}-{} NOT converted from DataFrame to DiGraph!'.format(
            self.name, self.year))
def get_submissions(fn):
    """Load the submissions JSON file and upper-case its top-level keys.

    The decoded document is treated as a nested mapping whose top-level keys
    are ontologies and whose second-level keys are years; both counts are
    logged through ``printf``.

    Args:
        fn: path of the JSON file to read.

    Returns:
        A new dict with the same values keyed by the upper-cased top-level
        keys, or ``None`` if the file could not be read or parsed (the error
        is logged).

    Raises:
        ValueError: if ``fn`` does not exist.
    """
    if not os.path.exists(fn):
        raise ValueError("submission fn does not exist!")

    # Pre-bind so a failed load cannot leave ``obj`` undefined below.
    obj = None
    try:
        with open(fn, 'r') as f:
            obj = json.load(f)
        printf('{} loaded!'.format(fn))
        printf('- {} ontologies'.format(len(obj)))
        distinct_years = {
            year
            for years in obj.values()
            for year in years.keys()
        }
        printf('- {} years'.format(len(distinct_years)))
    except Exception as ex:
        printf(ex)
        printf('ERROR: {} NOT loaded!'.format(fn))

    if obj is None:
        # Nothing was decoded; previously this path crashed with an
        # UnboundLocalError on ``obj``.
        return None
    return {k.upper(): v for k, v in obj.items()}
def load_clickstream(path, year):
    """Read the clickstream file of one year.

    The file name is ``CS_FN_SOURCE`` with the ``<YEAR>`` placeholder replaced
    by ``year``, looked up inside ``path`` and read with ``read_csv`` using
    the module-level ``COMPRESSION`` setting.

    Args:
        path: directory containing the clickstream files.
        year: year to load; converted with ``str()`` so both ``2015`` and
            ``'2015'`` are accepted.

    Returns:
        Whatever ``read_csv`` returns for the file, or ``None`` if anything
        fails (the error is logged through ``printf``).
    """
    try:
        # str() keeps an integer year from raising TypeError in str.replace.
        fn = os.path.join(path, CS_FN_SOURCE.replace('<YEAR>', str(year)))
        df = read_csv(fn, index_col=None, compression=COMPRESSION)
        printf('{} loaded!'.format(fn))
        return df
    except Exception as ex:
        printf(ex)
        printf('ERROR: CS{} NOT loaded!'.format(year))
        return None
def _convert_DataFrame_to_DiGraph(self, df, nodes, min_session_length=MIN_SESSION_LENGTH):
    """Build the weighted transition graph ``self.H`` from clickstream rows.

    Rows are grouped by ``('ip', '_sessionid')``; groups with fewer than
    ``min_session_length`` rows are skipped. Within a group, each pair of
    consecutive rows counts as one transition when

    * the second row's ``_sequence`` is exactly the first row's plus one,
    * the two ``_concept`` values differ, and
    * ``self.navitype`` is ``None`` or equals the second row's ``_navitype``.

    Transition counts become edge weights. ``self.H`` is a copy of the
    resulting graph restricted to ``nodes``.

    On any exception while scanning the rows the error is logged through
    ``printf`` and the method returns without touching ``self.H``.

    Args:
        df: table with the columns ``ip``, ``_sessionid``, ``_concept``,
            ``_sequence`` and ``_navitype``.
        nodes: nodes to keep in ``self.H``.
        min_session_length: minimum number of rows a session needs in order
            to be considered.
    """
    edges = defaultdict(int)

    try:
        for _, session in df.groupby(['ip', '_sessionid']):
            if len(session) < min_session_length:
                continue

            # Sliding window of size two over the rows of the session.
            prev_concept = None
            prev_seq = None
            for _, row in session.iterrows():
                concept = row._concept
                seq = row._sequence
                if (prev_concept is not None
                        and seq == (prev_seq + 1)
                        and prev_concept != concept
                        and (self.navitype is None
                             or self.navitype == row._navitype)):
                    edges[(prev_concept, concept)] += 1
                prev_concept = concept
                prev_seq = seq
    except Exception as ex:
        printf(ex)
        printf('ERROR converting dataframe to digraph')
        return

    tmp = nx.DiGraph()
    tmp.add_weighted_edges_from(
        [(src, dst, w) for (src, dst), w in edges.items()])
    self.H = tmp.subgraph(nodes).copy()

    # NOTE(review): the first format argument was missing in the reviewed
    # source (five placeholders, four values); ``self.name`` is assumed.
    printf('{}-{}-{}: {} concepts found, but {} kept (cros-val)'.format(
        self.name, self.year, self.navitype,
        tmp.number_of_nodes(), self.H.number_of_nodes()))
def load_ontology(self):
    """Load the ontology file from ``self._path`` and derive the node lists.

    The first directory entry whose name starts with the ontology name and
    ends with ``ONTO_EXT`` is read with ``read_csv`` and handed to
    ``self._convert_DataFrame_to_DiGraph``. Afterwards two attributes are set:

    * ``self.sorted_nodes``: all nodes of ``self.G``, sorted.
    * ``self.lcc_sorted_nodes``: the nodes of the largest connected
      component of the undirected version of ``self.G``, sorted.

    If reading the file fails, the error is logged through ``printf`` and the
    method returns without building the graph.

    Raises:
        ValueError: if no matching file exists in ``self._path``.
    """
    # NOTE(review): the ``startswith`` argument (and the first ``format``
    # argument below) was missing in the reviewed source; ``self.name`` is
    # assumed -- confirm.
    candidates = [
        entry for entry in os.listdir(self._path)
        if entry.startswith(self.name) and entry.endswith(ONTO_EXT)
    ]
    if len(candidates) == 0:
        raise ValueError("Ontology file not found in {}".format(self._path))

    try:
        # os.listdir() order is arbitrary, so with several matches the
        # chosen file is whichever happens to come first.
        fn = os.path.join(self._path, candidates[0])
        df = read_csv(fn, index_col=False, compression=COMPRESSION)
        printf('{} loaded!'.format(fn))
    except Exception as ex:
        printf(ex)
        printf('ERROR: {}-{} NOT loaded!'.format(self.name, self.year))
        return

    self._convert_DataFrame_to_DiGraph(df)
    self.sorted_nodes = sorted(self.G.nodes())

    # connected_component_subgraphs() was removed in networkx 2.4;
    # connected_components() yields the node collections directly, which is
    # all that is needed here.
    largest_component = max(
        nx.connected_components(self.G.to_undirected()), key=len)
    self.lcc_sorted_nodes = sorted(largest_component)
def main():
    """Emit the fixed banner 'class ontology' through ``printf``."""
    banner = 'class ontology'
    printf(banner)
def create_hops_matrices(self, path, maxk=5, lcc=False):
    """Iterate over the k-hop matrices of the undirected adjacency matrix.

    The undirected adjacency (from ``self.get_undirected_adjacency``) is
    converted to CSR/int32 and passed to
    ``get_khop_with_partial_results_load_previous`` together with ``maxk``
    and ``self.get_khop``. The resulting ``(k, hop)`` pairs are consumed
    until a hop sums to zero or the iterator is exhausted.

    Args:
        path: forwarded to ``self.set_path_khop``.
        maxk: forwarded to ``get_khop_with_partial_results_load_previous``.
        lcc: forwarded to ``self.set_lcc``; also selects ``self.lcc_A``
            instead of ``self.A`` for the loaded-check and is passed on to
            the adjacency / file-name helpers.

    Returns:
        The last ``k`` whose hop matrix was non-zero (starting value 1), or
        ``None`` if the required adjacency matrix is not loaded.
    """
    self.set_lcc(lcc)
    self.set_path_khop(path)

    # NOTE(review): the first format argument of every log message was
    # missing in the reviewed source; ``self.name`` is assumed -- confirm.
    tag = '{}-{}-{}'.format(self.name, self.year, self.submission_id)

    adjacency = self.lcc_A if lcc else self.A
    if adjacency is None:
        printf('{}: Adjacency matrix is not loaded.'.format(tag))
        return None

    uA = self.get_undirected_adjacency(lcc).tocsr().astype(
        np.int32, copy=False)  # undirected

    kdone = 1
    khops = get_khop_with_partial_results_load_previous(
        uA, maxk, self.get_khop)

    for k, hop in khops:
        total = hop.sum()
        if total == 0:
            printf('{}: {}-hop has reached zero!'.format(tag, k))
            # Only step back when k jumped past the last completed hop.
            kdone = k - 1 if (k - kdone) > 1 else kdone
            break

        kdone = k

        printf('{}: {}-hop --> shape:{}, sum:{}!'.format(
            tag, k, hop.shape, total))
        printf('{}: {}-hop saving...'.format(tag, k))
        # NOTE(review): the actual save call is disabled, so nothing is
        # written even though the log lines say otherwise.
        fn = self.get_khop_matrix_fn(k, lcc=lcc)
        # save_sparse_matrix(hop, path, fn)
        printf('{}: {}-hop done!'.format(tag, k))

    printf('')
    return kdone
def __get_damping_factor__(self, alpha):
    """Return ``alpha``, deriving it from ``self.M`` and ``self.T`` if unset.

    Args:
        alpha: damping factor supplied by the caller, or ``None``.

    Returns:
        ``alpha`` unchanged when it is not ``None``; otherwise
        ``sum(M.multiply(T)) / sum(T)`` rounded to two decimals, which is
        also logged through ``printf``.
    """
    # Caller supplied a value: nothing to compute.
    if alpha is not None:
        return alpha

    weighted_total = self.M.multiply(self.T).sum()
    empirical = round(weighted_total / self.T.sum(), 2)
    printf('Empirical alpha (damping factor): {}'.format(empirical))
    return empirical
def main():
    """Emit the fixed banner 'class clickstreams' through ``printf``."""
    banner = 'class clickstreams'
    printf(banner)