Пример #1
0
def trait_variance(ancestor_pno, db, trait='w2v', n_gens = 2, enforce_func = lambda x: True):
    """
    Computes a scalar trait "variance" over the lineage crawled from a patent.

    The lineage is obtained with crawl_lineage (flattened, up to n_gens
    generations, filtered by enforce_func). Every document that carries the
    trait is densified with the trait's densify function, the component-wise
    variance is taken across documents, and the norm of that variance vector is
    reported so that the result is a single real number. The trait mean is not
    computed.

    Args:
        ancestor_pno: _id of the patent whose lineage is crawled.
        db: database handle, passed through to crawl_lineage.
        trait: key into _trait_info selecting the trait field and densifier.
        n_gens: number of generations to crawl.
        enforce_func: predicate forwarded to crawl_lineage.

    Returns:
        A dict with the single key '<n_gens>_gen_trait_variance_<trait>'. The
        value is -1 when crawl_lineage returns None or when no document in the
        lineage has the trait; otherwise it is the norm of the component-wise
        variances (0 when only one document has the trait).

    Raises:
        RuntimeError: if trait is 'lda', which is not supported.
    """
    # TODO: Need to unfold tfidfs into sparse vector to compute variance.
    if trait == 'lda':
        raise RuntimeError('Trait variance not currently supported for {}'.format(trait))

    var_field_name = str(n_gens)+'_gen_trait_variance_' + trait
    trait_field, _, densify_func, _ = _trait_info[trait]

    lineage = crawl_lineage(db, ancestor_pno, n_gens, fields = ['_id', 'citedby', trait_field], flatten=True, enforce_func = enforce_func)
    if lineage is None:
        return {var_field_name: -1}

    # Documents lacking the trait are skipped rather than treated as zeros.
    raw_traits = [doc.get(trait_field, None) for doc in lineage]
    traits = np.array([densify_func(t) for t in raw_traits if t is not None], dtype=np.float64)
    if len(traits) == 0:
        return {var_field_name: -1}

    # For now, get a real number by computing the norm of the variance vector.
    return {var_field_name: np.linalg.norm(np.var(traits, axis=0))}
Пример #2
0
def main():
    """
    Crawl and plot the citation lineage of a fixed set of patents.

    For every (name, patent number, threshold) record below, the 5-generation
    lineage is crawled, keeping only documents whose 'citedby' list has at
    least `threshold` entries. The lineage is turned into an adjacency
    dictionary, a per-patent color dictionary is loaded from disk, and
    network_plot is asked to write '<pno>_<name>_force_pca_test.pdf'.

    NOTE(review): the color dictionaries are read from a hard-coded,
    user-specific directory; this only runs on a machine that has it.
    """
    db = MongoClient().patents

    # Each record keeps a patent's name, number and citation threshold together
    # so the three values cannot drift out of alignment.
    families = [
        ("stents", 4655771, 350),
        ("zeolites", 4061724, 60),
        ("bubblejet", 4723129, 75),
        ("cellphone", 5103459, 225),
        ("pcr", 4683202, 150),
        ("microarrays", 5143854, 175),
        ("semiconductors", 4064521, 125),
        ("nonwovenwebs", 4340563, 100),
        ("rsa", 4405829, 400),
        ("browser", 5572643, 250),
    ]
    lilfriends = [
        ("skate", 6000704, 0),
        ("murphybed", 4766623, 0),
        ("hummingbirdfeeder", 5454348, 0),
        ("telescopicumbrella", 4880023, 0),
        ("hybridengine", 5191766, 0),
        ("minesweeper", 3938459, 0),
        ("humanoidrobot", 6377014, 0),
        ("recumbentbike", 5284351, 0),
        ("hangglider", 4417707, 0),
        ("ziplock", 6004032, 0),
    ]
    bigfriends = [
        ("dentallaser", 5616141, 25),
        ("ballisticvest", 4287607, 25),
        ("hungryhippos", 4119312, 10),
        ("sharkprod", 4667431, 12),
        ("gatlinggun", 4154143, 8),
        ("nuclearwastetreatment", 4274976, 9),
        ("gfp", 5491084, 25),
        ("roughterrainchasis", 4061199, 30),
        ("bowflex", 4725057, 15),
        ("radaraltimeter", 4945360, 10),
    ]

    for name, pno, threshold in families + lilfriends + bigfriends:
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print("getting lineage for patent {} ({}), with threhold {}.".format(pno, name, threshold))
        lineage = crawl_lineage(
            db,
            pno,
            n_generations=5,
            # Bind the current threshold as a default so the predicate cannot
            # observe a later iteration's value if it is ever called lazily.
            enforce_func=lambda x, threshold=threshold: len(x.get("citedby", [])) >= threshold,
            flatten=True,
        )
        adj = subnet_adj_dict(lineage)
        dict_fn = "/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/{}_pca_dict.p".format(name)
        colordict = load_obj(dict_fn)
        savefn = "{}_{}_force_pca_test.pdf".format(pno, name)
        # NOTE(review): the 4th positional argument looks like a "show" flag - confirm against network_plot.
        network_plot(pno, adj, colordict, False, savefn)
        print("done with {}".format(name))
Пример #3
0
def test2():
    """
    Run the lineage-plot pipeline on the first patent of the fixed patent list.

    The full name / patent-number / threshold tables are built, but only the
    first entry of each is used. For that patent the 5-generation lineage is
    crawled (keeping documents whose 'citedby' list has at least `threshold`
    entries), converted to an adjacency dictionary, and handed to network_plot
    together with a color dictionary loaded from disk.

    NOTE(review): the color dictionary is read from a hard-coded,
    user-specific directory; this only runs on a machine that has it.
    """
    db = MongoClient().patents
    family_names = [
        'stents', 'zeolites', 'bubblejet', 'cellphone', 'pcr', 'microarrays',
        'semiconductors', 'nonwovenwebs', 'rsa', 'browser'
    ]
    family_pnos = [
        4655771, 4061724, 4723129, 5103459, 4683202, 5143854, 4064521, 4340563,
        4405829, 5572643
    ]
    family_thresholds = [350, 60, 75, 225, 150, 175, 125, 100, 400, 250]
    lilfriend_names = [
        'skate', 'murphybed', 'hummingbirdfeeder', 'telescopicumbrella',
        'hybridengine', 'minesweeper', 'humanoidrobot', 'recumbentbike',
        'hangglider', 'ziplock'
    ]
    lilfriend_pnos = [
        6000704, 4766623, 5454348, 4880023, 5191766, 3938459, 6377014, 5284351,
        4417707, 6004032
    ]
    lilfriend_thresholds = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    bigfriend_names = [
        'dentallaser', 'ballisticvest', 'hungryhippos', 'sharkprod',
        'gatlinggun', 'nuclearwastetreatment', 'gfp', 'roughterrainchasis',
        'bowflex', 'radaraltimeter'
    ]
    bigfriend_pnos = [
        5616141, 4287607, 4119312, 4667431, 4154143, 4274976, 5491084, 4061199,
        4725057, 4945360
    ]
    bigfriend_thresholds = [25, 25, 10, 12, 8, 9, 25, 30, 15, 10]
    names = family_names + lilfriend_names + bigfriend_names
    pnos = family_pnos + lilfriend_pnos + bigfriend_pnos
    thresholds = family_thresholds + lilfriend_thresholds + bigfriend_thresholds

    # Only the first patent is exercised here to keep the run short.
    names = names[:1]
    pnos = pnos[:1]
    thresholds = thresholds[:1]

    for pno, threshold, name in zip(pnos, thresholds, names):
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print("getting lineage for patent {} ({}), with threhold {}.".format(
            pno, name, threshold))
        lineage = crawl_lineage(
            db,
            pno,
            n_generations=5,
            # Bind the current threshold as a default so the predicate cannot
            # observe a later iteration's value if it is ever called lazily.
            enforce_func=lambda x, threshold=threshold: len(x.get('citedby', [])) >= threshold,
            flatten=True)
        adj = subnet_adj_dict(lineage)
        dict_fn = '/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/{}_pca_dict.p'.format(
            name)
        colordict = load_obj(dict_fn)
        savefn = '{}_{}_force_pca_test.pdf'.format(pno, name)
        print("getting plot...")
        # NOTE(review): the 4th positional argument looks like a "show" flag - confirm against network_plot.
        network_plot(pno, adj, colordict, False, savefn)
        print("done with {}".format(name))
Пример #4
0
def test():
    """
    Crawl and plot the lineage of patent 4723129 with a citation threshold of 75.

    The 5-generation lineage is crawled, keeping only documents whose 'citedby'
    list has at least 75 entries, and converted to an adjacency dictionary. A
    color dictionary is loaded from a hard-coded path and both are passed to
    network_plot, which is given '4723129_force_pca_test.pdf' as the save name.

    Returns:
        A tuple (adj, bubblejet_colors): the adjacency dictionary and the
        loaded color dictionary.
    """
    db = MongoClient().patents
    pno = 4723129
    threshold = 75

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("getting lineage...")
    lineage = crawl_lineage(
        db,
        pno,
        n_generations=5,
        enforce_func=lambda x: len(x.get("citedby", [])) >= threshold,
        flatten=True,
    )
    adj = subnet_adj_dict(lineage)

    # NOTE(review): user-specific absolute path; only resolves on the original author's machine.
    bubblejet_color_dict_fn = "/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/bubblejet_pca_dict.p"
    bubblejet_colors = load_obj(bubblejet_color_dict_fn)
    savefn = "{}_force_pca_test.pdf".format(pno)

    print("making plot...")
    # NOTE(review): the 4th positional argument looks like a "show" flag - confirm against network_plot.
    network_plot(pno, adj, bubblejet_colors, True, savefn)
    return adj, bubblejet_colors
Пример #5
0
def community_colors(db, pno, threshold, show_vis = False, savefn=None):
    """
    Detect communities in a patent's lineage and assign each patent a color.

    The 5-generation lineage of `pno` is crawled, keeping only documents whose
    'citedby' list has at least `threshold` entries, and converted to an
    adjacency dictionary. Communities are detected on that dictionary and every
    node is given the color of its community. When a visualization is requested
    the graph is drawn with a spring layout; the ancestor node is drawn six
    times larger and in an extra color that no community uses.

    Args:
        db: database handle, passed through to crawl_lineage.
        pno: patent number of the ancestor.
        threshold: minimum length of a document's 'citedby' list for inclusion.
        show_vis: if true, display the figure with plt.show().
        savefn: if not None, title the figure and save it to this path.

    Returns:
        A dict mapping each key of the adjacency dictionary to its community color.
    """
    # Get the patents in the lineage and the adjacency dictionary.
    lineage = crawl_lineage(
        db, pno, n_generations = 5,
        enforce_func = lambda x: len(x.get('citedby', [])) >= threshold,
        flatten=True
    )
    adj = subnet_adj_dict(lineage)

    # Detect communities.
    communities = detect.CommunityDetector(adj).run()
    n_communities = len(communities)
    community_lookup = util.get_community_lookup(communities)

    # One color per community, plus one extra reserved for the ancestor node.
    colors = visualize.discrete_color_scheme(n_communities+1)
    node_color_lookup = {node: colors[community_lookup[node]] for node in adj}

    # Nothing to draw: skip building the graph and allocating a figure.
    if savefn is None and not show_vis:
        return node_color_lookup

    # Make the visualization.
    G = visualize.get_graph(adj)
    G.graph['ancestor'] = pno
    # list(...) works whether nodes() returns a list or a view object.
    nodes = list(G.nodes())
    ancestor_idx = nodes.index(pno)

    node_colors = [colors[community_lookup[node]] for node in nodes]
    node_colors[ancestor_idx] = colors[n_communities]

    default_node_size = 60
    node_sizes = [default_node_size] * len(nodes)
    node_sizes[ancestor_idx] = 6*default_node_size

    f = plt.figure()
    f.set_size_inches(18.5, 10.5)
    nx.draw_networkx(
        G,
        nx.spring_layout(G, iterations=20000),
        node_color=node_colors,
        node_size=node_sizes,
        with_labels=False,
        # draw_networkx's keyword is font_size; 'fontsize' is not one of its parameters.
        font_size=1,
        linewidths=.5,
        width=.5
    )
    if savefn is not None:
        plt.title('Communities in Patent {}'.format(pno))
        plt.savefig(savefn, dpi=100)
    if show_vis:
        plt.show()
    return node_color_lookup
Пример #6
0
def test():
    """
    Plot the thresholded lineage of the bubblejet patent (4723129).

    Crawls five generations from the patent, keeping documents whose 'citedby'
    list has at least 75 entries, builds the adjacency dictionary, loads a
    color dictionary from a hard-coded path, and calls network_plot with
    '4723129_force_pca_test.pdf' as the save name.

    Returns:
        A tuple (adj, bubblejet_colors): the adjacency dictionary and the
        loaded color dictionary.
    """
    db = MongoClient().patents
    pno = 4723129
    threshold = 75

    def cited_enough(doc):
        # Keep a document only if it has at least `threshold` incoming citations.
        return len(doc.get('citedby', [])) >= threshold

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("getting lineage...")
    lineage = crawl_lineage(
        db,
        pno,
        n_generations=5,
        enforce_func=cited_enough,
        flatten=True)
    adj = subnet_adj_dict(lineage)

    # NOTE(review): user-specific absolute path; only resolves on the original author's machine.
    bubblejet_color_dict_fn = '/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/bubblejet_pca_dict.p'
    bubblejet_colors = load_obj(bubblejet_color_dict_fn)
    savefn = '{}_force_pca_test.pdf'.format(pno)

    print("making plot...")
    # NOTE(review): the 4th positional argument looks like a "show" flag - confirm against network_plot.
    network_plot(pno, adj, bubblejet_colors, True, savefn)
    return adj, bubblejet_colors
Пример #7
0
def parent_child_trait_distance(ancestor_pno, db, trait='w2v', n_gens = 2, enforce_func = lambda x: True):
    """
    Computes the sum and average of the parent-descendant trait distances for every descendant begot by the patent
    with pno given by the 'ancestor_pno' argument. The appropriate distance function
    for the given trait is looked up in the _trait_info dictionary. The distance function is then evaluated
    between the ancestor and each descendant in the lineage, up to n_gens in the future.

    The argument 'enforce_func' allows the user to set a condition which must be met in order for a patent
    to be included in the lineage. For instance, the enforce_func,

        lambda x: len(x.get('citedby', [])) > 100

    would enforce that patents only be included in the lineage if they have more than 100 incoming citations.

    Returns:
        None if the ancestor is not present in db.traits. Otherwise a dict with the keys
        '<n_gens>_gen_sum_dist_<trait>' and '<n_gens>_gen_avg_dist_<trait>', whose values are:
          * -1 and -1 if crawl_lineage returns None, or if no descendant carries the trait;
          * 0 and 0 if the ancestor has an empty/missing 'citedby' list or lacks the trait itself;
          * otherwise the sum of the distances and that sum divided by the number of
            descendants that carry the trait.

    Raises:
        RuntimeError: if trait is 'lda', which is not supported.
    """
    if trait == 'lda':
        raise RuntimeError('Trait distance not currently supported for {}'.format(trait))

    sum_fieldname = '_'.join([str(n_gens), 'gen_sum_dist', trait])
    avg_fieldname = '_'.join([str(n_gens), 'gen_avg_dist', trait])
    trait_field, dist_func, _, _ = _trait_info[trait]

    parent = db.traits.find_one({'_id': ancestor_pno}, {'_id': 1, 'citedby': 1, trait_field:1})
    if parent is None:
        return None # We don't have that ancestor.

    lineage = crawl_lineage(db, ancestor_pno, n_gens, fields = ['_id', 'citedby', trait_field], flatten=True, enforce_func = enforce_func)
    # The None check must come before slicing; slicing None raises TypeError.
    if lineage is None:
        return {sum_fieldname: -1, avg_fieldname: -1}
    # Drop the first entry of the flattened lineage (presumably the ancestor itself - TODO confirm).
    descendants = lineage[1:]

    if not (parent.get('citedby', None) and parent.get(trait_field, None)):
        return {sum_fieldname: 0, avg_fieldname: 0}

    # Descendants lacking the trait are skipped and do not count toward the average.
    child_traits = [doc.get(trait_field, None) for doc in descendants]
    child_traits = np.array([t for t in child_traits if t is not None], dtype=np.float64)
    n_children_with_traits = len(child_traits)
    if n_children_with_traits == 0:
        return {sum_fieldname: -1, avg_fieldname: -1}

    parent_trait = parent.get(trait_field)
    # Use a distinct loop name so the 'trait' parameter is not shadowed.
    dist_sum = np.sum([dist_func(parent_trait, child_trait) for child_trait in child_traits])
    return {
        sum_fieldname: dist_sum,
        avg_fieldname: dist_sum/n_children_with_traits,
    }
Пример #8
0
 def testCrawl2Gens(self):
     """Smoke test: crawl two generations for every patent number in self.pnos_test."""
     # The results are discarded; this only checks that each crawl completes.
     for patent_number in self.pnos_test:
         dbutil.crawl_lineage(self.db, patent_number, n_generations=2)