Python HardEM.run_EM示例，hardEM_gurobi.HardEM.run_EM Python示例

示例#1

0

显示文件

文件： driver.py 项目： lzh6710/amazon-review-spam

def test_real_graph(nparts):
    """Cluster the real author graph with hard EM and save the labelled graph.

    Steps:
      1. read the author collaboration graph and the full author-product graph;
      2. keep only the authors that carry every required feature attribute;
      3. map each kept author to the neighbours in the full graph whose
         connecting edge has a ``starRating`` of at least 4;
      4. run ``HardEM.run_EM`` (always with ``parallel=True``);
      5. store each partitioned author's label in the ``cLabel`` node attribute
         of the *unfiltered* author graph and write that graph out as GEXF.

    Authors that were filtered out in step 2 receive no ``cLabel``.

    :param nparts: forwarded unchanged to ``HardEM.run_EM`` as ``nparts``.
    """
    required_features = ('revLen', 'hlpful_fav_unfav', 'vrf_prchs_fav_unfav')

    logging.info('Reading author collab graph')
    # NOTE(review): the file name ends in .gexf but it is parsed as GraphML;
    # confirm the on-disk format (nx.read_gexf may be what is intended).
    author_graph = nx.read_graphml('/home/amir/az/io/spam/mgraph2.gexf')
    author_graph.name = 'author graph'
    logging.info('Reading the full author product graph')
    full_graph = nx.read_graphml('/home/amir/az/io/spam/spam_graph.graphml')
    full_graph.name = 'full graph'

    # Only authors that have all the features take part in the clustering.
    proper_author_graph = author_graph.subgraph(
        [a for a in author_graph
         if all(feat in author_graph.node[a] for feat in required_features)])

    # author to the product reviewed by him mapping
    # (assumes every kept author is also a node of full_graph -- the lookup
    # below fails otherwise)
    logging.debug('forming the product mapping')
    author_product_mapping = {}
    for a in proper_author_graph:
        reviewed = full_graph[a]
        author_product_mapping[a] = [
            p for p in reviewed
            if 'starRating' in reviewed[p] and reviewed[p]['starRating'] >= 4]

    logging.debug('Running EM')
    ll, partition = HardEM.run_EM(proper_author_graph, author_product_mapping, nparts=nparts, parallel=True)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('best loglikelihood: %s' % ll)
    for n in partition:
        author_graph.node[n]['cLabel'] = int(partition[n])
    nx.write_gexf(author_graph, '/home/amir/az/io/spam/spam_graph_mgraph_sage_labeled.gexf')

示例#2

0

显示文件

文件： hardEM_driver.py 项目： YukiShan/amazon-review-spam

def test_hard_EM(N, nparts, write_labeled_graph=True, parallel=True):
    """Run hard EM on a generated test graph and label its nodes.

    The graph and its author-to-product map come from ``gen_test_graph(N)``.
    The best log-likelihood and the partition labels are printed, and every
    partitioned node gets its label stored in the ``cLabel`` node attribute.

    :param N: passed to ``gen_test_graph`` (presumably the graph size -- confirm).
    :param nparts: forwarded to ``HardEM.run_EM`` as ``nparts``.
    :param write_labeled_graph: when true, the labelled graph is written to a
        fixed GraphML path.
    :param parallel: forwarded to ``HardEM.run_EM`` as ``parallel``.
    """
    graph, author_prod_map = gen_test_graph(N)
    ll, partition = HardEM.run_EM(author_graph=graph,
                                  author_product_map=author_prod_map,
                                  nparts=nparts,
                                  parallel=parallel)

    # Parenthesised single-argument print behaves the same on Python 2 and 3;
    # list() keeps the printed form a plain list on Python 3 as well.
    print('best loglikelihood: %s' % ll)
    print(list(partition.values()))
    for n in partition:
        graph.node[n]['cLabel'] = int(partition[n])
    if write_labeled_graph:
        nx.write_graphml(graph, '/home/amir/az/io/spam/synthetic_graph_sage_labeled.graphml')

示例#3

0

显示文件

文件： driver.py 项目： lzh6710/amazon-review-spam

def test_hard_EM(N, nparts, write_labeled_graph=True, parallel=True):
    """Run hard EM on a synthetic graph, label its nodes and return the graph.

    The graph and its author-to-product map are the first two values returned
    by ``gen_synthetic_graph(N, nparts)``; the third value is ignored. The best
    log-likelihood and the partition labels are printed, and every partitioned
    node gets its label stored in the ``cLabel`` node attribute.

    :param N: passed to ``gen_synthetic_graph`` (presumably the graph size --
        confirm).
    :param nparts: passed to ``gen_synthetic_graph`` and to ``HardEM.run_EM``.
    :param write_labeled_graph: when true, the labelled graph is written to a
        fixed GraphML path.
    :param parallel: forwarded to ``HardEM.run_EM`` as ``parallel``.
    :return: the generated graph with ``cLabel`` attributes set.
    """
    graph, author_prod_map, _ = gen_synthetic_graph(N, nparts)
    ll, partition = HardEM.run_EM(author_graph=graph,
                                  author_product_map=author_prod_map,
                                  nparts=nparts,
                                  parallel=parallel)

    # Parenthesised single-argument print behaves the same on Python 2 and 3;
    # list() keeps the printed form a plain list on Python 3 as well.
    print('best loglikelihood: %s' % ll)
    print(list(partition.values()))
    for n in partition:
        graph.node[n]['cLabel'] = int(partition[n])
    if write_labeled_graph:
        nx.write_graphml(graph, '/home/amir/amazon-spam-review/io/synthetic_graph_labeled.graphml')
    return graph

示例#4

0

显示文件

def test_real_graph(nparts):
    """Cluster the real author graph with hard EM and save the labelled graph.

    Steps:
      1. read the author collaboration graph and the full author-product graph;
      2. keep only the authors that carry every required feature attribute;
      3. of those, keep only authors in connected components with at least
         ``MIN_CC_SIZE`` nodes;
      4. map each kept author to the neighbours in the full graph whose
         connecting edge has a ``starRating`` of at least 4;
      5. run ``HardEM.run_EM`` (always with ``parallel=True``);
      6. store each partitioned author's label in the ``cLabel`` node attribute
         of the *unfiltered* author graph and write that graph out as GEXF.

    Authors dropped in steps 2-3 receive no ``cLabel``.

    :param nparts: forwarded unchanged to ``HardEM.run_EM`` as ``nparts``.
    """
    MIN_CC_SIZE = 10  # Nodes belonging to connected components smaller than this are discarded
    required_features = ('revLen', 'hlpful_fav_unfav', 'vrf_prchs_fav_unfav')

    logging.info('Reading author collab graph')
    author_graph = nx.read_graphml(
        '/home/amir/az/io/spam/spam_mgraph_augmented.graphml')
    author_graph.name = 'author graph'
    logging.info('Reading the full author product graph')
    full_graph = nx.read_graphml('/home/amir/az/io/spam/spam_graph.graphml')
    full_graph.name = 'full graph'

    logging.info('Removing nodes which do not have all the features')
    proper_author_graph = author_graph.subgraph([
        a for a in author_graph
        if all(feat in author_graph.node[a] for feat in required_features)
    ])

    logging.info(
        'Keeping only nodes which belong to large connected components')
    # Materialise the surviving components before rebinding the graph name.
    large_ccs = [
        cc for cc in nx.connected_components(proper_author_graph)
        if len(cc) >= MIN_CC_SIZE
    ]
    proper_author_graph = proper_author_graph.subgraph(
        itertools.chain.from_iterable(large_ccs))

    # author to the product reviewed by him mapping
    # (assumes every kept author is also a node of full_graph -- the lookup
    # below fails otherwise)
    logging.debug('forming the product mapping')
    author_product_mapping = {}
    for a in proper_author_graph:
        reviewed = full_graph[a]
        author_product_mapping[a] = [
            p for p in reviewed
            if 'starRating' in reviewed[p] and reviewed[p]['starRating'] >= 4
        ]

    logging.info('Running EM')
    ll, partition = HardEM.run_EM(proper_author_graph,
                                  author_product_mapping,
                                  nparts=nparts,
                                  parallel=True)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('best loglikelihood: %s' % ll)
    for n in partition:
        author_graph.node[n]['cLabel'] = int(partition[n])
    output_filename = 'spam_graph_mgraph_labeled.gexf'
    # Let logging do the interpolation; the emitted message is unchanged.
    logging.info(
        'Writing the clusters into the graph and saving the file into %s',
        output_filename)
    nx.write_gexf(author_graph, '/home/amir/az/io/spam/%s' % output_filename)

示例#5

0

显示文件

文件： hardEM_driver.py 项目： YukiShan/amazon-review-spam

def test_real_graph(nparts):
    """Cluster the real author graph with hard EM and save the labelled graph.

    The author collaboration graph is reduced to authors that have every
    required feature attribute and that sit in a connected component of at
    least ``MIN_CC_SIZE`` nodes. Each remaining author is mapped to the
    neighbours in the full author-product graph whose connecting edge has a
    ``starRating`` of at least 4, and ``HardEM.run_EM`` is run on the result
    (always with ``parallel=True``).

    The label of every partitioned author is stored in the ``cLabel`` node
    attribute of the *unfiltered* author graph, which is then written out as
    GEXF; authors that were filtered out receive no ``cLabel``.

    :param nparts: forwarded unchanged to ``HardEM.run_EM`` as ``nparts``.
    """
    MIN_CC_SIZE = 10        # Nodes belonging to connected components smaller than this are discarded
    required_features = ('revLen', 'hlpful_fav_unfav', 'vrf_prchs_fav_unfav')

    logging.info('Reading author collab graph')
    author_graph = nx.read_graphml('/home/amir/az/io/spam/spam_mgraph_augmented.graphml')
    author_graph.name = 'author graph'
    logging.info('Reading the full author product graph')
    full_graph = nx.read_graphml('/home/amir/az/io/spam/spam_graph.graphml')
    full_graph.name = 'full graph'

    logging.info('Removing nodes which do not have all the features')
    proper_author_graph = author_graph.subgraph(
        [a for a in author_graph
         if all(feat in author_graph.node[a] for feat in required_features)])

    logging.info('Keeping only nodes which belong to large connected components')
    # Materialise the surviving components before rebinding the graph name.
    large_ccs = [cc for cc in nx.connected_components(proper_author_graph)
                 if len(cc) >= MIN_CC_SIZE]
    proper_author_graph = proper_author_graph.subgraph(itertools.chain.from_iterable(large_ccs))

    # author to the product reviewed by him mapping
    # (assumes every kept author is also a node of full_graph -- the lookup
    # below fails otherwise)
    logging.debug('forming the product mapping')
    author_product_mapping = {}
    for a in proper_author_graph:
        reviewed = full_graph[a]
        author_product_mapping[a] = [p for p in reviewed
                                     if 'starRating' in reviewed[p] and reviewed[p]['starRating'] >= 4]

    logging.info('Running EM')
    ll, partition = HardEM.run_EM(proper_author_graph, author_product_mapping, nparts=nparts, parallel=True)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('best loglikelihood: %s' % ll)
    for n in partition:
        author_graph.node[n]['cLabel'] = int(partition[n])
    output_filename = 'spam_graph_mgraph_labeled.gexf'
    # Let logging do the interpolation; the emitted message is unchanged.
    logging.info('Writing the clusters into the graph and saving the file into %s', output_filename)
    nx.write_gexf(author_graph, '/home/amir/az/io/spam/%s' % output_filename)

示例#6

0

显示文件

# Driver script: builds the review graph from a crawled data set, runs hard EM
# on a previously saved (unlabeled) author graph and writes the labeled author
# graph back next to the input.
import logging

# Logging is configured before the project imports below.
# NOTE(review): presumably so that anything those modules log at import time
# already uses this format -- confirm before regrouping the imports.
logging.basicConfig(level=logging.DEBUG, format='%(process)d\t%(asctime)s:%(levelname)s: %(message)s', datefmt='%H:%M:%S')

from pre_process import crawl_to_graph

# Directory holding the crawled data set; also the input/output location of
# the author graph files used below.
DS_DIR = '/home/amir/pyproj/amazon-review-spam/io/same_cat_v2'

# crawl_to_graph returns three values; only `graph` is used further down in
# this script. `graph_orig` keeps a copy of the graph as returned.
graph, membs, prods = crawl_to_graph(ds_dir=DS_DIR)
graph_orig = graph.copy()

import networkx as nx
from os import path
# Unlabeled author graph: <DS_DIR>/em_unlabeled_mgraph.gexf
mgraph = nx.read_gexf(path.join(DS_DIR, '%s.gexf' % 'em_unlabeled_mgraph'))


# Map every node of the author graph to the list of its neighbours in `graph`.
# Assumes every node of mgraph is also a node of graph -- TODO confirm.
author_product_mapping = {}
for a in mgraph:
    author_product_mapping[a] = [p for p in graph[a]]


from hardEM_gurobi import HardEM

# run_EM is called with nparts*5, i.e. 20 parts, not 4.
nparts = 4
ll, partition = HardEM.run_EM(author_graph=mgraph, author_product_map=author_product_mapping, nparts=nparts*5, parallel=True, nprocs=4)

# Store each node's label as an int in the 'cLabel' node attribute.
# Iterates over mgraph (not over partition), so partition is expected to have
# an entry for every node of mgraph.
for a in mgraph:
    mgraph.node[a]['cLabel'] = int(partition[a])


# Labeled author graph: <DS_DIR>/em_labeled_mgraph.gexf
nx.write_gexf(mgraph, path.join(DS_DIR, '%s.gexf' % 'em_labeled_mgraph'), version='1.2draft', encoding='us-ascii')