def walk_down_graph(pno, depth, threshold, trait='top_tf-idf'):
    """Walk the 'citedby' graph breadth-first, starting from record ``pno``.

    Records are fetched from the module-level ``collection`` by ``_id``.
    Generation 0 is the root record; each following generation holds the
    not-yet-seen records listed in the 'citedby' field of the previous
    generation whose own 'citedby' list has at least ``threshold`` entries.

    Args:
        pno: ``_id`` of the root record.
        depth: number of generations to build, the root included. A value
            of 1 (or less) yields only the root.
        threshold: minimum length of a child's 'citedby' list for the child
            to be linked and expanded.
        trait: name of the record field to fetch and expose as 'traits'.

    Returns:
        A tuple ``(just_nodes, node_gens, links, trait_dict)``:
        ``just_nodes`` is the list of ids in discovery order; ``node_gens``
        holds, per generation, the ids first seen in that generation;
        ``links`` is a list of ``(parent_id, child_id)`` tuples (recorded
        even when the child had already been seen); ``trait_dict`` maps
        each id to row ``i`` of the matrix returned by
        ``tm.sparse_matrix``, where ``i`` is the id's index in
        ``just_nodes``.

    Raises:
        ValueError: if no record with ``_id == pno`` exists.

    Side effects:
        Prints how many fetched children had no 'citedby' field and the
        percentage of collected records that lacked the ``trait`` field.
        The fetched record dicts are modified in place ('_id' -> 'pno',
        ``trait`` -> 'traits').
    """
    projection = {'_id': 1, 'citedby': 1, trait: 1}
    root = collection.find_one({'_id': pno}, projection)
    if root is None:
        # find_one returns None when nothing matches; fail with a clear
        # message instead of an opaque "NoneType is not subscriptable".
        raise ValueError('no record with _id %r in collection' % (pno,))

    root_id = root['_id']
    gens = [[root]]
    just_nodes = [root_id]
    # Mirror of just_nodes used only for O(1) membership tests; the list
    # keeps the discovery order that the matrix rows rely on.
    seen = {root_id}
    node_gens = [[root_id]]
    links = []
    missing_citedby = 0

    for _ in range(1, depth):
        next_gen = []
        new_nodes = []
        for par in gens[-1]:
            # Only the root can lack 'citedby' here (children without it
            # are never expanded); treat that as "no citing records".
            children = collection.find(
                {'_id': {"$in": par.get('citedby', [])}}, projection)
            for child in children:
                if 'citedby' not in child:
                    missing_citedby += 1
                    continue
                if len(child['citedby']) < threshold:
                    continue
                child_id = child['_id']
                links.append((par['_id'], child_id))
                # add only previously unseen nodes
                if child_id not in seen:
                    seen.add(child_id)
                    next_gen.append(child)
                    new_nodes.append(child_id)
                    just_nodes.append(child_id)
        gens.append(next_gen)
        node_gens.append(new_nodes)
    print(missing_citedby, 'without citedby')

    # get rid of the gens, just a list of records
    recs = [rec for gen in gens for rec in gen]
    missing_traits = 0
    for rec in recs:
        # rename '_id' to 'pno'
        rec['pno'] = rec.pop('_id')
        # rename trait to 'traits'
        if trait in rec:
            rec['traits'] = rec.pop(trait)
        else:
            missing_traits += 1
            rec['traits'] = []
    percent_missing = round(
        float(missing_traits) / float(len(just_nodes)) * 100)
    print('%d%% without traits' % percent_missing)

    recs = et.recs_by_pno(recs)
    sparse = tm.sparse_matrix(just_nodes, recs, 'traits')
    trait_dict = {node: sparse[i] for i, node in enumerate(just_nodes)}
    return (just_nodes, node_gens, links, trait_dict)
def walk_down_graph(pno, depth, threshold, trait=None):
    """Walk the 'citedby' graph breadth-first, starting from record ``pno``.

    Records are fetched from the module-level ``collection`` by their
    'pno' field. Generation 0 is the root record; each following
    generation holds the not-yet-seen records listed in the 'citedby'
    field of the previous generation whose own 'citedby' list has at least
    ``threshold`` entries.

    Args:
        pno: 'pno' value of the root record.
        depth: number of generations to build, the root included. A value
            of 1 (or less) yields only the root.
        threshold: minimum length of a child's 'citedby' list for the child
            to be linked and expanded.
        trait: accepted for interface compatibility; not used by this
            implementation.

    Returns:
        A tuple ``(just_nodes, node_gens, links, trait_dict)``:
        ``just_nodes`` is the list of pnos in discovery order;
        ``node_gens`` holds, per generation, the pnos first seen in that
        generation; ``links`` is a list of ``(parent_pno, child_pno)``
        tuples (recorded even when the child had already been seen);
        ``trait_dict`` maps each pno to row ``i`` of the matrix returned by
        ``tm.sparse_matrix``, where ``i`` is the pno's index in
        ``just_nodes``.

    Raises:
        ValueError: if no record with the given 'pno' exists.
    """
    root = collection.find_one(
        {'pno': pno}, {'pno': 1, 'citedby': 1, 'sorted_text': 1})
    if root is None:
        # find_one returns None when nothing matches; fail with a clear
        # message instead of an opaque "NoneType is not subscriptable".
        raise ValueError('no record with pno %r in collection' % (pno,))

    # Children are fetched with 'text' in addition to the root's fields.
    child_projection = {'pno': 1, 'citedby': 1, 'sorted_text': 1, 'text': 1}

    root_pno = root['pno']
    gens = [[root]]
    just_nodes = [root_pno]
    # Mirror of just_nodes used only for O(1) membership tests; the list
    # keeps the discovery order that the matrix rows rely on.
    seen = {root_pno}
    node_gens = [[root_pno]]
    links = []

    for _ in range(1, depth):
        next_gen = []
        new_nodes = []
        for par in gens[-1]:
            # Only the root can lack 'citedby' here (children without it
            # are never expanded); treat that as "no citing records".
            children = collection.find(
                {'pno': {"$in": par.get('citedby', [])}}, child_projection)
            for child in children:
                # Records without a 'citedby' field cannot meet the
                # threshold; skip them instead of raising KeyError.
                if len(child.get('citedby', ())) < threshold:
                    continue
                child_pno = child['pno']
                links.append((par['pno'], child_pno))
                # add only previously unseen nodes
                if child_pno not in seen:
                    seen.add(child_pno)
                    next_gen.append(child)
                    new_nodes.append(child_pno)
                    just_nodes.append(child_pno)
        gens.append(next_gen)
        node_gens.append(new_nodes)

    # flatten the generations into a single list of records
    recs = [rec for gen in gens for rec in gen]

    # get the trait dict
    recs = et.recs_by_pno(recs)
    recs = et.trim_sorted_text(recs, 10)
    # NOTE(review): the matrix is built from the 'traits' field; confirm
    # that et.trim_sorted_text leaves such a field on each record, since
    # the output of the same helper is read via 'words' elsewhere in this
    # file.
    sparse = tm.sparse_matrix(just_nodes, recs, 'traits')
    trait_dict = {node: sparse[i] for i, node in enumerate(just_nodes)}
    return (just_nodes, node_gens, links, trait_dict)
import trait_matrix as tm
# NOTE(review): the original line read "import ..dots", which is not valid
# Python syntax. It is rewritten here as the nearest valid relative import;
# confirm which module was actually intended.
from .. import dots

# NOTE(review): `pickle` and `et` are not imported in the visible part of
# this file; confirm they are in scope before this script runs.

# Load the pickled network. pickle.load can execute arbitrary code, so only
# use it on files from a trusted source. The `with` block guarantees the
# file handle is closed even if unpickling fails.
with open('zeolites_network_5_60.p', 'rb') as f:
    network = pickle.load(f)

# number passed to et.trim_sorted_text below
n = 10

# the pickled network is indexed positionally
just_nodes = network[0]
node_gens = network[1]
links = network[2]
recs = network[3]

# the first node in just_nodes is treated as the primogenitor
primo = just_nodes[0]

recs = et.recs_by_pno(recs)
recs = et.trim_sorted_text(recs, n)

p_by_c = et.parents_by_child(links)
# first patent doesn't have any parents! we need to add it manually
p_by_c[primo] = []

# get the primogenitor's traits
primo_traits = recs[primo]['words']

# the matrix is ordered by the order of pnos in just_nodes
s_matrix = tm.sparse_matrix(just_nodes, recs, 'words')
traits_by_pno = tm.traits_by_pno(just_nodes, s_matrix)

real_network = p_by_c