def trait_variance(ancestor_pno, db, trait='w2v', n_gens = 2, enforce_func = lambda x: True):
    """
    Computes a scalar trait "variance" over the lineage of a patent.

    The lineage is fetched with crawl_lineage (flattened, n_gens generations,
    filtered through enforce_func). Every document that carries the trait
    field is passed through the trait's densify function, the component-wise
    variance across those documents is taken, and the norm of that variance
    vector is reported so that the result is a single real number.

    Args:
        ancestor_pno: _id of the patent whose lineage is crawled.
        db: database handle, passed through to crawl_lineage.
        trait: key into _trait_info selecting the trait. 'lda' is rejected.
        n_gens: number of generations handed to crawl_lineage.
        enforce_func: callable forwarded to crawl_lineage as enforce_func.

    Returns:
        A dict with the single key '<n_gens>_gen_trait_variance_<trait>'.
        The value is -1 when crawl_lineage returns None or when no document
        in the lineage has the trait; otherwise it is the norm of the
        component-wise variance (0.0 when only one document has the trait).

    Raises:
        RuntimeError: if trait == 'lda'.
    """
    # TODO: Need to unfold tfidfs into sparse vector to compute variance.
    if trait == 'lda':
        raise RuntimeError('Trait variance not currently supported for {}'.format(trait))

    var_field_name = str(n_gens)+'_gen_trait_variance_' + trait
    trait_field, _, densify_func, _ = _trait_info[trait]

    lineage = crawl_lineage(
        db, ancestor_pno, n_gens,
        fields = ['_id', 'citedby', trait_field],
        flatten=True,
        enforce_func = enforce_func
    )
    if lineage is None:
        return {var_field_name: -1}

    # Keep only the documents that actually carry the trait.
    raw_traits = [doc.get(trait_field, None) for doc in lineage]
    traits = np.array(
        [densify_func(t) for t in raw_traits if t is not None],
        dtype=np.float64
    )
    if len(traits) == 0:
        return {var_field_name: -1}

    # For now, get a real number by computing the norm of the variance vector.
    return {var_field_name: np.linalg.norm(np.var(traits, axis=0))}
def main():
    """
    Produce one network plot per hard-coded patent.

    For every (name, patent number, threshold) entry below, the lineage is
    crawled five generations deep keeping only documents whose 'citedby'
    list has at least `threshold` entries, an adjacency dict is built from
    it, the color dictionary for that name is loaded from disk, and the
    plot is written to '<pno>_<name>_force_pca_test.pdf'.
    """
    db = MongoClient().patents

    # Each entry: (name, patent number, minimum number of incoming citations).
    families = [
        ("stents", 4655771, 350),
        ("zeolites", 4061724, 60),
        ("bubblejet", 4723129, 75),
        ("cellphone", 5103459, 225),
        ("pcr", 4683202, 150),
        ("microarrays", 5143854, 175),
        ("semiconductors", 4064521, 125),
        ("nonwovenwebs", 4340563, 100),
        ("rsa", 4405829, 400),
        ("browser", 5572643, 250),
    ]
    lilfriends = [
        ("skate", 6000704, 0),
        ("murphybed", 4766623, 0),
        ("hummingbirdfeeder", 5454348, 0),
        ("telescopicumbrella", 4880023, 0),
        ("hybridengine", 5191766, 0),
        ("minesweeper", 3938459, 0),
        ("humanoidrobot", 6377014, 0),
        ("recumbentbike", 5284351, 0),
        ("hangglider", 4417707, 0),
        ("ziplock", 6004032, 0),
    ]
    bigfriends = [
        ("dentallaser", 5616141, 25),
        ("ballisticvest", 4287607, 25),
        ("hungryhippos", 4119312, 10),
        ("sharkprod", 4667431, 12),
        ("gatlinggun", 4154143, 8),
        ("nuclearwastetreatment", 4274976, 9),
        ("gfp", 5491084, 25),
        ("roughterrainchasis", 4061199, 30),
        ("bowflex", 4725057, 15),
        ("radaraltimeter", 4945360, 10),
    ]

    for name, pno, threshold in families + lilfriends + bigfriends:
        print("getting lineage for patent {} ({}), with threhold {}.".format(pno, name, threshold))

        def cited_enough(doc):
            # Keep a document only if it has at least `threshold` citers.
            return len(doc.get("citedby", [])) >= threshold

        lineage = crawl_lineage(
            db, pno, n_generations=5, enforce_func=cited_enough, flatten=True
        )
        adj = subnet_adj_dict(lineage)

        colordict = load_obj(
            "/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/{}_pca_dict.p".format(name)
        )
        savefn = "{}_{}_force_pca_test.pdf".format(pno, name)
        network_plot(pno, adj, colordict, False, savefn)
        print("done with {}".format(name))
def test2():
    """
    Single-patent run of the batch plotting loop.

    The full name / patent number / threshold tables are assembled, but only
    the first entry is processed: its lineage is crawled five generations
    deep (keeping documents with at least `threshold` 'citedby' entries),
    turned into an adjacency dict, and plotted with the color dictionary
    loaded for that name.
    """
    db = MongoClient().patents

    names = (
        # family
        ['stents', 'zeolites', 'bubblejet', 'cellphone', 'pcr',
         'microarrays', 'semiconductors', 'nonwovenwebs', 'rsa', 'browser']
        # lilfriend
        + ['skate', 'murphybed', 'hummingbirdfeeder', 'telescopicumbrella',
           'hybridengine', 'minesweeper', 'humanoidrobot', 'recumbentbike',
           'hangglider', 'ziplock']
        # bigfriend
        + ['dentallaser', 'ballisticvest', 'hungryhippos', 'sharkprod',
           'gatlinggun', 'nuclearwastetreatment', 'gfp', 'roughterrainchasis',
           'bowflex', 'radaraltimeter']
    )
    pnos = (
        [4655771, 4061724, 4723129, 5103459, 4683202,
         5143854, 4064521, 4340563, 4405829, 5572643]
        + [6000704, 4766623, 5454348, 4880023, 5191766,
           3938459, 6377014, 5284351, 4417707, 6004032]
        + [5616141, 4287607, 4119312, 4667431, 4154143,
           4274976, 5491084, 4061199, 4725057, 4945360]
    )
    thresholds = (
        [350, 60, 75, 225, 150, 175, 125, 100, 400, 250]
        + [0] * 10
        + [25, 25, 10, 12, 8, 9, 25, 30, 15, 10]
    )

    # Only the first (pno, threshold, name) triple is exercised here.
    first_only = list(zip(pnos, thresholds, names))[:1]

    for pno, threshold, name in first_only:
        print("getting lineage for patent {} ({}), with threhold {}.".format(
            pno, name, threshold))

        def cited_enough(doc):
            # Keep a document only if it has at least `threshold` citers.
            return len(doc.get('citedby', [])) >= threshold

        lineage = crawl_lineage(
            db, pno, n_generations=5, enforce_func=cited_enough, flatten=True)
        adj = subnet_adj_dict(lineage)

        colordict = load_obj(
            '/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/{}_pca_dict.p'.format(name))
        savefn = '{}_{}_force_pca_test.pdf'.format(pno, name)

        print("getting plot...")
        network_plot(pno, adj, colordict, False, savefn)
        print("done with {}".format(name))
def test():
    """
    Plot the lineage network of patent 4723129.

    Crawls five generations from the patent, keeping documents with at least
    75 'citedby' entries, builds the adjacency dict, loads the bubblejet
    color dictionary from disk and hands everything to network_plot, which
    is asked to save to '4723129_force_pca_test.pdf'.

    Returns:
        (adj, bubblejet_colors): the adjacency dict and the loaded colors.
    """
    pno = 4723129
    threshold = 75
    db = MongoClient().patents

    print("getting lineage...")

    def cited_enough(doc):
        # Keep a document only if it has at least `threshold` citers.
        return len(doc.get("citedby", [])) >= threshold

    lineage = crawl_lineage(db, pno, n_generations=5, enforce_func=cited_enough, flatten=True)
    adj = subnet_adj_dict(lineage)

    bubblejet_colors = load_obj(
        "/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/bubblejet_pca_dict.p"
    )
    savefn = "{}_force_pca_test.pdf".format(pno)

    print("making plot...")
    # Fourth positional argument is True here; its meaning is defined by network_plot.
    network_plot(pno, adj, bubblejet_colors, True, savefn)
    return adj, bubblejet_colors
def community_colors(db, pno, threshold, show_vis = False, savefn=None):
    """
    Detect communities in a patent's lineage and assign each node a color.

    The lineage of `pno` is crawled five generations deep, keeping only
    documents whose 'citedby' list has at least `threshold` entries. An
    adjacency dict is built from it, communities are detected on that
    adjacency, and every node is mapped to the color of its community.
    One extra color is reserved for the ancestor node in the drawing.

    Args:
        db: database handle, passed through to crawl_lineage.
        pno: patent number of the ancestor.
        threshold: minimum number of 'citedby' entries for a document to be
            kept in the lineage.
        show_vis: if True, draw the network and call plt.show().
        savefn: if not None, draw the network and save it to this path.

    Returns:
        Dict mapping each node in the adjacency dict to its community color.
        (In this lookup the ancestor keeps its community color; the reserved
        highlight color is only used in the drawing.)
    """
    # Get the patents in the lineage and the adjacency dictionary.
    lineage = crawl_lineage(
        db, pno,
        n_generations = 5,
        enforce_func = lambda x: len(x.get('citedby', [])) >= threshold,
        flatten=True
    )
    adj = subnet_adj_dict(lineage)

    # Detect communities.
    detector = detect.CommunityDetector(adj)
    communities = detector.run()
    n_communities = len(communities)
    community_lookup = util.get_community_lookup(communities)

    # Assign each patent a color, and provide a lookup dictionary.
    # The last color (index n_communities) is reserved for the ancestor.
    colors = visualize.discrete_color_scheme(n_communities+1)
    node_color_lookup = {node: colors[community_lookup[node]] for node in adj.keys()}

    # Build the graph and the per-node drawing attributes.
    G = visualize.get_graph(adj)
    G.graph['ancestor'] = pno
    # Materialize the node sequence once: .index() then works whether
    # G.nodes() returns a list or a view object.
    nodes = list(G.nodes())
    ancestor_idx = nodes.index(pno)

    node_colors = [colors[community_lookup[node]] for node in nodes]
    node_colors[ancestor_idx] = colors[n_communities]

    default_node_size = 60
    node_sizes = [default_node_size] * len(nodes)
    node_sizes[ancestor_idx] = 6*default_node_size

    if savefn is not None or show_vis:
        # Only allocate a figure when something will be drawn; creating one
        # on every call leaves an unused open figure behind per invocation.
        f = plt.figure()
        f.set_size_inches(18.5, 10.5)
        nx.draw_networkx(
            G,
            nx.spring_layout(G, iterations=20000),
            node_color=node_colors,
            node_size = node_sizes,
            with_labels = False,
            font_size=1,  # documented keyword name; labels are off anyway
            linewidths=.5,
            width=.5
        )
    if savefn is not None:
        plt.title('Communities in Patent {}'.format(pno))
        plt.savefig(savefn, dpi=100)
    if show_vis:
        plt.show()
    return node_color_lookup
def test():
    """
    Draw the force-layout plot for the bubblejet patent (4723129).

    Steps: crawl five generations of the lineage, keeping only documents
    with at least 75 'citedby' entries; derive the adjacency dict; load the
    bubblejet color dictionary; call network_plot with the output file name
    '4723129_force_pca_test.pdf'.

    Returns:
        Tuple of (adjacency dict, loaded color dictionary).
    """
    color_dict_path = '/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/bubblejet_pca_dict.p'
    pno, threshold = 4723129, 75
    db = MongoClient().patents

    print("getting lineage...")
    adj = subnet_adj_dict(
        crawl_lineage(
            db, pno,
            n_generations=5,
            # Keep a document only if it has at least `threshold` citers.
            enforce_func=lambda doc: len(doc.get('citedby', [])) >= threshold,
            flatten=True,
        )
    )

    bubblejet_colors = load_obj(color_dict_path)
    savefn = '{}_force_pca_test.pdf'.format(pno)

    print("making plot...")
    network_plot(pno, adj, bubblejet_colors, True, savefn)
    return adj, bubblejet_colors
def parent_child_trait_distance(ancestor_pno, db, trait='w2v', n_gens = 2, enforce_func = lambda x: True):
    """
    Computes the sum and average of the parent-descendant trait distances
    for every descendant begot by the patent with pno given by the
    'ancestor_pno' argument.

    The appropriate distance function for the given trait is looked up in
    the _trait_info dictionary. The distance function is then evaluated
    between the ancestor's trait and the trait of each descendant in the
    lineage, up to n_gens generations in the future.

    The argument 'enforce_func' allows the user to set a condition which
    must be met in order for a patent to be included in the lineage. For
    instance, the enforce_func,

        lambda x: len(x.get('citedby', [])) > 100

    would enforce that patents only be included in the lineage if they have
    more than 100 incoming citations.

    Args:
        ancestor_pno: _id of the ancestor patent.
        db: database handle; db.traits is queried for the ancestor and db is
            passed through to crawl_lineage.
        trait: key into _trait_info selecting the trait. 'lda' is rejected.
        n_gens: number of generations handed to crawl_lineage.
        enforce_func: callable forwarded to crawl_lineage as enforce_func.

    Returns:
        None if the ancestor is not found in db.traits. Otherwise a dict
        with the keys '<n_gens>_gen_sum_dist_<trait>' and
        '<n_gens>_gen_avg_dist_<trait>'. Both values are
          * -1 if crawl_lineage returns None, or if none of the descendants
            have the trait (a kind of error value);
          * 0 if the ancestor has no 'citedby' entries or no trait;
          * otherwise the sum / average of the distances.

    Raises:
        RuntimeError: if trait == 'lda'.
    """
    if trait == 'lda':
        raise RuntimeError('Parent-child trait distance not currently supported for {}'.format(trait))

    sum_fieldname = '_'.join([str(n_gens), 'gen_sum_dist', trait])
    avg_fieldname = '_'.join([str(n_gens), 'gen_avg_dist', trait])
    trait_field, dist_func, _, _ = _trait_info[trait]

    parent = db.traits.find_one(
        {'_id': ancestor_pno},
        {'_id': 1, 'citedby': 1, trait_field: 1}
    )
    if parent is None:
        return None  # We don't have that ancestor.

    lineage = crawl_lineage(
        db, ancestor_pno, n_gens,
        fields = ['_id', 'citedby', trait_field],
        flatten=True,
        enforce_func = enforce_func
    )
    # Check for a missing lineage BEFORE slicing: slicing None raises
    # TypeError, which previously made this error branch unreachable.
    if lineage is None:
        return {sum_fieldname: -1, avg_fieldname: -1}
    # Skip the first entry; presumably it is the ancestor itself --
    # verify against crawl_lineage.
    lineage = lineage[1:]

    if not (parent.get('citedby', None) and parent.get(trait_field, None)):
        return {sum_fieldname: 0, avg_fieldname: 0}

    child_traits = [doc.get(trait_field, None) for doc in lineage]
    child_traits = np.array(
        [t for t in child_traits if t is not None],
        dtype=np.float64
    )
    n_children_with_traits = len(child_traits)
    if n_children_with_traits == 0:
        return {sum_fieldname: -1, avg_fieldname: -1}

    parent_trait = parent.get(trait_field)
    dist_sum = np.sum([dist_func(parent_trait, child_trait) for child_trait in child_traits])
    return {
        sum_fieldname: dist_sum,
        avg_fieldname: dist_sum/n_children_with_traits,
    }
def testCrawl2Gens(self):
    """Crawl two generations for every pno in self.pnos_test; no result is asserted, so the check is only that no call raises."""
    generations = 2
    for patent_number in self.pnos_test:
        dbutil.crawl_lineage(self.db, patent_number, n_generations=generations)