def test_real_graph(nparts):
    """Cluster the real author graph with hard EM and save the labelled graph.

    Loads the author collaboration graph and the full author/product graph,
    keeps only the authors that carry all three review features, maps each
    kept author to the neighbours in the full graph whose connecting edge has
    a ``starRating`` of at least 4, runs ``HardEM.run_EM`` and stores the
    resulting label of every partitioned node in its ``cLabel`` attribute
    before exporting the author graph as GEXF.

    Args:
        nparts: passed through unchanged to ``HardEM.run_EM`` as ``nparts``.
    """
    needed_features = ('revLen', 'hlpful_fav_unfav', 'vrf_prchs_fav_unfav')

    logging.info('Reading author collab graph')
    # NOTE(review): the path ends in .gexf but is parsed with the GraphML
    # reader -- confirm the file really contains GraphML.
    author_graph = nx.read_graphml('/home/amir/az/io/spam/mgraph2.gexf')
    author_graph.name = 'author graph'

    logging.info('Reading the full author product graph')
    full_graph = nx.read_graphml('/home/amir/az/io/spam/spam_graph.graphml')
    full_graph.name = 'full graph'

    # Authors missing any of the features are left out of the EM input.
    featured_authors = [
        author for author in author_graph
        if all(key in author_graph.node[author] for key in needed_features)
    ]
    proper_author_graph = author_graph.subgraph(featured_authors)

    # Mapping from each author to the products they reviewed favourably.
    logging.debug('forming the product mapping')
    author_product_mapping = {}
    for author in proper_author_graph:
        reviewed = full_graph[author]
        author_product_mapping[author] = [
            product for product in reviewed
            if 'starRating' in reviewed[product]
            and reviewed[product]['starRating'] >= 4
        ]

    logging.debug('Running EM')
    ll, partition = HardEM.run_EM(
        proper_author_graph,
        author_product_mapping,
        nparts=nparts,
        parallel=True,
    )
    # Single-argument parenthesised form prints the same text as the bare
    # print statement.
    print('best loglikelihood: %s' % ll)

    # Labels go onto the complete author graph, not the filtered subgraph.
    for node in partition:
        author_graph.node[node]['cLabel'] = int(partition[node])
    nx.write_gexf(
        author_graph,
        '/home/amir/az/io/spam/spam_graph_mgraph_sage_labeled.gexf')
def test_hard_EM(N, nparts, write_labeled_graph=True, parallel=True):
    """Run hard EM on a graph from ``gen_test_graph`` and label its nodes.

    Prints the best log-likelihood and the partition values, stores the label
    of every partitioned node in its ``cLabel`` attribute and, when requested,
    writes the labelled graph to a GraphML file.

    Args:
        N: passed to ``gen_test_graph``.
        nparts: passed to ``HardEM.run_EM`` as ``nparts``.
        write_labeled_graph: when true, the labelled graph is saved to disk.
        parallel: passed to ``HardEM.run_EM`` as ``parallel``.
    """
    graph, author_prod_map = gen_test_graph(N)

    best_ll, clusters = HardEM.run_EM(
        author_graph=graph,
        author_product_map=author_prod_map,
        nparts=nparts,
        parallel=parallel,
    )
    print('best loglikelihood: %s' % best_ll)
    print(clusters.values())

    for node in clusters:
        graph.node[node]['cLabel'] = int(clusters[node])

    if not write_labeled_graph:
        return
    nx.write_graphml(
        graph,
        '/home/amir/az/io/spam/synthetic_graph_sage_labeled.graphml')
def test_hard_EM(N, nparts, write_labeled_graph=True, parallel=True):
    """Run hard EM on a graph from ``gen_synthetic_graph`` and label its nodes.

    Prints the best log-likelihood and the partition values, stores the label
    of every partitioned node in its ``cLabel`` attribute and optionally saves
    the labelled graph as GraphML.

    Args:
        N: first argument to ``gen_synthetic_graph``.
        nparts: second argument to ``gen_synthetic_graph``; also passed to
            ``HardEM.run_EM`` as ``nparts``.
        write_labeled_graph: when true, the labelled graph is saved to disk.
        parallel: passed to ``HardEM.run_EM`` as ``parallel``.

    Returns:
        The generated graph with ``cLabel`` set on every partitioned node.
    """
    # The third value returned by the generator is not needed here.
    graph, author_prod_map, _ = gen_synthetic_graph(N, nparts)

    best_ll, clusters = HardEM.run_EM(
        author_graph=graph,
        author_product_map=author_prod_map,
        nparts=nparts,
        parallel=parallel,
    )
    print('best loglikelihood: %s' % best_ll)
    print(clusters.values())

    for node in clusters:
        graph.node[node]['cLabel'] = int(clusters[node])

    if write_labeled_graph:
        target = '/home/amir/amazon-spam-review/io/synthetic_graph_labeled.graphml'
        nx.write_graphml(graph, target)
    return graph
def test_real_graph(nparts):
    """Cluster the large components of the real author graph with hard EM.

    Steps, in order: load the author collaboration graph and the full
    author/product graph; drop authors lacking any of the three review
    features; drop authors in connected components smaller than
    ``MIN_CC_SIZE``; map each remaining author to the neighbours in the full
    graph whose edge has a ``starRating`` of at least 4; run
    ``HardEM.run_EM``; store each partitioned node's label in ``cLabel`` on
    the complete author graph and export that graph as GEXF.

    Args:
        nparts: passed through unchanged to ``HardEM.run_EM`` as ``nparts``.
    """
    # Nodes belonging to connected components smaller than this are discarded
    MIN_CC_SIZE = 10
    needed_features = ('revLen', 'hlpful_fav_unfav', 'vrf_prchs_fav_unfav')

    logging.info('Reading author collab graph')
    author_graph = nx.read_graphml(
        '/home/amir/az/io/spam/spam_mgraph_augmented.graphml')
    author_graph.name = 'author graph'

    logging.info('Reading the full author product graph')
    full_graph = nx.read_graphml('/home/amir/az/io/spam/spam_graph.graphml')
    full_graph.name = 'full graph'

    logging.info('Removing nodes which do not have all the features')
    featured_authors = [
        author for author in author_graph
        if all(key in author_graph.node[author] for key in needed_features)
    ]
    proper_author_graph = author_graph.subgraph(featured_authors)

    logging.info(
        'Keeping only nodes which belong to large connected components')
    large_components = [
        component
        for component in nx.connected_components(proper_author_graph)
        if len(component) >= MIN_CC_SIZE
    ]
    proper_author_graph = proper_author_graph.subgraph(
        itertools.chain.from_iterable(large_components))

    # Mapping from each author to the products they reviewed favourably.
    logging.debug('forming the product mapping')
    author_product_mapping = {}
    for author in proper_author_graph:
        reviewed = full_graph[author]
        author_product_mapping[author] = [
            product for product in reviewed
            if 'starRating' in reviewed[product]
            and reviewed[product]['starRating'] >= 4
        ]

    logging.info('Running EM')
    ll, partition = HardEM.run_EM(
        proper_author_graph,
        author_product_mapping,
        nparts=nparts,
        parallel=True,
    )
    print('best loglikelihood: %s' % ll)

    # Labels go onto the complete author graph, not the filtered subgraph.
    for node in partition:
        author_graph.node[node]['cLabel'] = int(partition[node])

    output_filename = 'spam_graph_mgraph_labeled.gexf'
    logging.info(
        'Writing the clusters into the graph and saving the file into %s'
        % output_filename)
    nx.write_gexf(author_graph, '/home/amir/az/io/spam/%s' % output_filename)
def test_real_graph(nparts, min_cc_size=10, io_dir='/home/amir/az/io/spam',
                    output_filename='spam_graph_mgraph_labeled.gexf'):
    """Cluster the large components of the real author graph with hard EM.

    Steps, in order: load the author collaboration graph and the full
    author/product graph from ``io_dir``; drop authors lacking any of the
    three review features; drop authors in connected components smaller than
    ``min_cc_size``; map each remaining author to the neighbours in the full
    graph whose edge has a ``starRating`` of at least 4; run
    ``HardEM.run_EM``; store each partitioned node's label in ``cLabel`` on
    the complete author graph and export that graph as GEXF.

    With the default arguments the files read and written, and the messages
    logged, are exactly those of the earlier hard-coded version.

    Args:
        nparts: passed through unchanged to ``HardEM.run_EM`` as ``nparts``.
        min_cc_size: nodes belonging to connected components smaller than
            this are discarded.
        io_dir: directory (no trailing slash) holding
            ``spam_mgraph_augmented.graphml`` and ``spam_graph.graphml``;
            the labelled graph is written here too.
        output_filename: file name of the GEXF export inside ``io_dir``.
    """
    needed_features = ('revLen', 'hlpful_fav_unfav', 'vrf_prchs_fav_unfav')

    logging.info('Reading author collab graph')
    author_graph = nx.read_graphml(
        '%s/%s' % (io_dir, 'spam_mgraph_augmented.graphml'))
    author_graph.name = 'author graph'

    logging.info('Reading the full author product graph')
    full_graph = nx.read_graphml('%s/%s' % (io_dir, 'spam_graph.graphml'))
    full_graph.name = 'full graph'

    logging.info('Removing nodes which do not have all the features')
    featured_authors = [
        author for author in author_graph
        if all(key in author_graph.node[author] for key in needed_features)
    ]
    proper_author_graph = author_graph.subgraph(featured_authors)

    logging.info('Keeping only nodes which belong to large connected components')
    # Materialise the surviving components so the result does not depend on
    # whether connected_components() returns a list or a generator.
    large_components = [
        component
        for component in nx.connected_components(proper_author_graph)
        if len(component) >= min_cc_size
    ]
    proper_author_graph = proper_author_graph.subgraph(
        itertools.chain.from_iterable(large_components))

    # Mapping from each author to the products they reviewed favourably.
    logging.debug('forming the product mapping')
    author_product_mapping = {}
    for author in proper_author_graph:
        reviewed = full_graph[author]
        author_product_mapping[author] = [
            product for product in reviewed
            if 'starRating' in reviewed[product]
            and reviewed[product]['starRating'] >= 4
        ]

    logging.info('Running EM')
    ll, partition = HardEM.run_EM(
        proper_author_graph,
        author_product_mapping,
        nparts=nparts,
        parallel=True,
    )
    # Single-argument parenthesised form prints the same text under both
    # the print statement and the print function.
    print('best loglikelihood: %s' % ll)

    # Labels go onto the complete author graph, not the filtered subgraph,
    # so nodes outside the partition simply carry no cLabel.
    for node in partition:
        author_graph.node[node]['cLabel'] = int(partition[node])

    # Let the logging module do the interpolation (skipped when the record
    # is filtered out).
    logging.info(
        'Writing the clusters into the graph and saving the file into %s',
        output_filename)
    nx.write_gexf(author_graph, '%s/%s' % (io_dir, output_filename))
# Script: build the review graph from a crawl, run hard EM on the unlabeled
# author graph stored next to it, and save the author graph with a cLabel
# attribute on every node.
import logging

# Logging is configured first, ahead of the project imports below; the
# statement order of the original script is kept.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(process)d\t%(asctime)s:%(levelname)s: %(message)s',
    datefmt='%H:%M:%S',
)

from pre_process import crawl_to_graph

DS_DIR = '/home/amir/pyproj/amazon-review-spam/io/same_cat_v2'

graph, membs, prods = crawl_to_graph(ds_dir=DS_DIR)
graph_orig = graph.copy()  # copy of the crawled graph taken before EM runs

import networkx as nx
from os import path

mgraph = nx.read_gexf(path.join(DS_DIR, '%s.gexf' % 'em_unlabeled_mgraph'))

# Every author in the author graph maps to all of its neighbours in the
# crawled graph.
author_product_mapping = {}
for a in mgraph:
    author_product_mapping[a] = list(graph[a])

from hardEM_gurobi import HardEM

nparts = 4
# EM is asked for five times ``nparts`` parts.
ll, partition = HardEM.run_EM(
    author_graph=mgraph,
    author_product_map=author_product_mapping,
    nparts=nparts * 5,
    parallel=True,
    nprocs=4,
)

for a in mgraph:
    mgraph.node[a]['cLabel'] = int(partition[a])

nx.write_gexf(
    mgraph,
    path.join(DS_DIR, '%s.gexf' % 'em_labeled_mgraph'),
    version='1.2draft',
    encoding='us-ascii',
)