def test(limit=100): # Get filenames. indir = '/Users/jmenick/Desktop/alife_refactor/output/lda_model_200' name = 'lda_200' pnofn = '/'.join([indir, 'pnos.p']) ldafn = '/'.join([indir, name+'.lda']) corpusfn = '/'.join([indir, 'corpus_'+name+'.svmlight']) vocabfn = '/'.join([indir, 'vocab_'+name+'.dict']) # Load persisted data from disk. print "loading data..." vocab = load_vocab(vocabfn) corpus = load_corpus(corpusfn) lda = load_lda(ldafn) pnos = load_obj(pnofn) pno2id = {p:i for i,p in enumerate(pnos)} #produce visualization... commented out for now. keeps crashing the machine. # print "producing visualization..." # visfn = '/'.join([indir, 'vis.html']) # vis_data = prepare(lda, corpus, vocab) # print "saving visualization..." # pyLDAvis.save_html(vis_data, visfn) # put doc topics in db. print "Getting doc topics..." assert(len(corpus) == len(pnos)) db = MongoClient().patents def partfunc(doc): topics = lda[corpus[pno2id[doc['_id']]]] return {'$set': {'lda_topics': topics}} pats_test = db.traits.find().limit(limit) for p in pats_test: pprint(partfunc(p)) print "\nDone."
def load_pops(start_time, limit=None):
    """Load the pickled populations for the month starting at `start_time`.

    Params:
        start_time: datetime keying the pickle file under _pop_dir.
        limit: optional int; when given, truncate both returned lists.

    Returns:
        (new_ancestors, descendants) pair of lists.
    """
    date_str = dt_as_str(start_time)
    popfn = '/'.join([_pop_dir, date_str + '.p'])
    doc = load_obj(popfn)
    # BUG FIX: the branches were inverted — callers passing a limit got the
    # full lists back. (The old code only appeared to work because slicing
    # with [:None] returns the entire list.)
    if limit is None:
        return doc['new_ancestors'], doc['descendants']
    return doc['new_ancestors'][:limit], doc['descendants'][:limit]
def load_pops(start_time, limit=None):
    """Load the pickled populations for the month starting at `start_time`.

    Params:
        start_time: datetime keying the pickle file under _pop_dir.
        limit: optional int; when given, truncate both returned lists.

    Returns:
        (new_ancestors, descendants) pair of lists.
    """
    date_str = dt_as_str(start_time)
    popfn = '/'.join([_pop_dir, date_str + '.p'])
    doc = load_obj(popfn)
    # BUG FIX: condition was inverted (`if limit is not None` returned the
    # untruncated lists). The truncating branch only behaved for limit=None
    # because [:None] slices the whole list.
    if limit is None:
        return doc['new_ancestors'], doc['descendants']
    return doc['new_ancestors'][:limit], doc['descendants'][:limit]
def main(): db = MongoClient().patents family_names = [ "stents", "zeolites", "bubblejet", "cellphone", "pcr", "microarrays", "semiconductors", "nonwovenwebs", "rsa", "browser", ] family_pnos = [4655771, 4061724, 4723129, 5103459, 4683202, 5143854, 4064521, 4340563, 4405829, 5572643] family_thresholds = [350, 60, 75, 225, 150, 175, 125, 100, 400, 250] lilfriend_names = [ "skate", "murphybed", "hummingbirdfeeder", "telescopicumbrella", "hybridengine", "minesweeper", "humanoidrobot", "recumbentbike", "hangglider", "ziplock", ] lilfriend_pnos = [6000704, 4766623, 5454348, 4880023, 5191766, 3938459, 6377014, 5284351, 4417707, 6004032] lilfriend_thresholds = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] bigfriend_names = [ "dentallaser", "ballisticvest", "hungryhippos", "sharkprod", "gatlinggun", "nuclearwastetreatment", "gfp", "roughterrainchasis", "bowflex", "radaraltimeter", ] bigfriend_pnos = [5616141, 4287607, 4119312, 4667431, 4154143, 4274976, 5491084, 4061199, 4725057, 4945360] bigfriend_thresholds = [25, 25, 10, 12, 8, 9, 25, 30, 15, 10] names = family_names + lilfriend_names + bigfriend_names pnos = family_pnos + lilfriend_pnos + bigfriend_pnos thresholds = family_thresholds + lilfriend_thresholds + bigfriend_thresholds for pno, threshold, name in zip(pnos, thresholds, names): print "getting lineage for patent {} ({}), with threhold {}.".format(pno, name, threshold) lineage = crawl_lineage( db, pno, n_generations=5, enforce_func=lambda x: len(x.get("citedby", [])) >= threshold, flatten=True ) adj = subnet_adj_dict(lineage) dict_fn = "/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/{}_pca_dict.p".format(name) colordict = load_obj(dict_fn) savefn = "{}_{}_force_pca_test.pdf".format(pno, name) network_plot(pno, adj, colordict, False, savefn) print "done with {}".format(name)
def test2(): db = MongoClient().patents family_names = [ 'stents', 'zeolites', 'bubblejet', 'cellphone', 'pcr', 'microarrays', 'semiconductors', 'nonwovenwebs', 'rsa', 'browser' ] family_pnos = [ 4655771, 4061724, 4723129, 5103459, 4683202, 5143854, 4064521, 4340563, 4405829, 5572643 ] family_thresholds = [350, 60, 75, 225, 150, 175, 125, 100, 400, 250] lilfriend_names = [ 'skate', 'murphybed', 'hummingbirdfeeder', 'telescopicumbrella', 'hybridengine', 'minesweeper', 'humanoidrobot', 'recumbentbike', 'hangglider', 'ziplock' ] lilfriend_pnos = [ 6000704, 4766623, 5454348, 4880023, 5191766, 3938459, 6377014, 5284351, 4417707, 6004032 ] lilfriend_thresholds = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] bigfriend_names = [ 'dentallaser', 'ballisticvest', 'hungryhippos', 'sharkprod', 'gatlinggun', 'nuclearwastetreatment', 'gfp', 'roughterrainchasis', 'bowflex', 'radaraltimeter' ] bigfriend_pnos = [ 5616141, 4287607, 4119312, 4667431, 4154143, 4274976, 5491084, 4061199, 4725057, 4945360 ] bigfriend_thresholds = [25, 25, 10, 12, 8, 9, 25, 30, 15, 10] names = family_names + lilfriend_names + bigfriend_names pnos = family_pnos + lilfriend_pnos + bigfriend_pnos thresholds = family_thresholds + lilfriend_thresholds + bigfriend_thresholds names = names[:1] pnos = pnos[:1] thresholds = thresholds[:1] for pno, threshold, name in zip(pnos, thresholds, names): print "getting lineage for patent {} ({}), with threhold {}.".format( pno, name, threshold) lineage = crawl_lineage( db, pno, n_generations=5, enforce_func=lambda x: len(x.get('citedby', [])) >= threshold, flatten=True) adj = subnet_adj_dict(lineage) dict_fn = '/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/{}_pca_dict.p'.format( name) colordict = load_obj(dict_fn) savefn = '{}_{}_force_pca_test.pdf'.format(pno, name) print "getting plot..." network_plot(pno, adj, colordict, False, savefn) print "done with {}".format(name)
def _load_df():
    """Load the tf-idf document-frequency dict and sort it by frequency.

    Assumes the pickle lives at a fixed location on disk.
    TODO: move this function (and all such functions) into alife.data.

    Returns:
        List of (term, doc_freq) pairs sorted ascending by doc_freq.

    Raises:
        RuntimeError: if the pickle cannot be loaded.
    """
    _data_dir = '/Users/jmenick/Desktop/alife_refactor/output/aggregate_stats'
    _df_fn = '/'.join([_data_dir, 'tfidf_doc_freq.p'])
    try:
        df_dict = load_obj(_df_fn)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being masked as a RuntimeError.
        raise RuntimeError("A document frequency dictionary is not stored in {}.".format(_df_fn))
    return sorted(df_dict.items(), key=lambda x: x[1])
def main(): # Get filenames. indir = '/Users/jmenick/Desktop/alife_refactor/output/lda_model_200' name = 'lda_200' pnofn = '/'.join([indir, 'pnos.p']) ldafn = '/'.join([indir, name + '.lda']) corpusfn = '/'.join([indir, 'corpus_' + name + '.svmlight']) vocabfn = '/'.join([indir, 'vocab_' + name + '.dict']) # Load persisted data from disk. print "loading data..." vocab = load_vocab(vocabfn) corpus = load_corpus(corpusfn) lda = load_lda(ldafn) pnos = load_obj(pnofn) pno2id = {p: i for i, p in enumerate(pnos)} #produce visualization... commented out for now due to crashing. Ugh PCA again... # visfn = '/'.join([indir, 'vis.html']) # vis_data = prepare(lda, corpus, vocab) # pyLDAvis.save_html(vis_data, visfn) # put doc topics in db. print "inserting doc topics..." db = MongoClient().patents print "len(corpus): {}, len(pnos): {}".format(len(pnos), len(corpus)) def partfunc(doc): pno = doc['_id'] try: corpus_idx = pno2id[pno] bow = corpus[corpus_idx] topics = lda[bow] return {'$set': {'lda_topics': topics}} except: logging.warning("no topics for {}".format(pno)) return {'$set': {'no_topics': True}} parallelMap(partfunc, in_collection=db.traits, out_collection=db.traits, findArgs={ 'spec': {}, 'fields': { '_id': 1 } }, bSize=1000, updateFreq=500)
def test(): db = MongoClient().patents pno = 4723129 threshold = 75 print "getting lineage..." lineage = crawl_lineage( db, pno, n_generations=5, enforce_func=lambda x: len(x.get("citedby", [])) >= threshold, flatten=True ) adj = subnet_adj_dict(lineage) bubblejet_color_dict_fn = "/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/bubblejet_pca_dict.p" bubblejet_colors = load_obj(bubblejet_color_dict_fn) savefn = "{}_force_pca_test.pdf".format(pno) print "making plot..." network_plot(pno, adj, bubblejet_colors, True, savefn) return adj, bubblejet_colors
def main(): # Get filenames. indir = '/Users/jmenick/Desktop/alife_refactor/output/lda_model_200' name = 'lda_200' pnofn = '/'.join([indir, 'pnos.p']) ldafn = '/'.join([indir, name+'.lda']) corpusfn = '/'.join([indir, 'corpus_'+name+'.svmlight']) vocabfn = '/'.join([indir, 'vocab_'+name+'.dict']) # Load persisted data from disk. print "loading data..." vocab = load_vocab(vocabfn) corpus = load_corpus(corpusfn) lda = load_lda(ldafn) pnos = load_obj(pnofn) pno2id = {p:i for i,p in enumerate(pnos)} #produce visualization... commented out for now due to crashing. Ugh PCA again... # visfn = '/'.join([indir, 'vis.html']) # vis_data = prepare(lda, corpus, vocab) # pyLDAvis.save_html(vis_data, visfn) # put doc topics in db. print "inserting doc topics..." db = MongoClient().patents print "len(corpus): {}, len(pnos): {}".format(len(pnos), len(corpus)) def partfunc(doc): pno = doc['_id'] try: corpus_idx = pno2id[pno] bow = corpus[corpus_idx] topics = lda[bow] return {'$set': {'lda_topics': topics}} except: logging.warning("no topics for {}".format(pno)) return {'$set': {'no_topics': True}} parallelMap( partfunc, in_collection = db.traits, out_collection = db.traits, findArgs = { 'spec': {}, 'fields': {'_id':1} }, bSize = 1000, updateFreq = 500 )
def test(): db = MongoClient().patents pno = 4723129 threshold = 75 print "getting lineage..." lineage = crawl_lineage( db, pno, n_generations=5, enforce_func=lambda x: len(x.get('citedby', [])) >= threshold, flatten=True) adj = subnet_adj_dict(lineage) bubblejet_color_dict_fn = '/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/bubblejet_pca_dict.p' bubblejet_colors = load_obj(bubblejet_color_dict_fn) savefn = '{}_force_pca_test.pdf'.format(pno) print "making plot..." network_plot(pno, adj, bubblejet_colors, True, savefn) return adj, bubblejet_colors
def test(limit=100): # Get filenames. indir = '/Users/jmenick/Desktop/alife_refactor/output/lda_model_200' name = 'lda_200' pnofn = '/'.join([indir, 'pnos.p']) ldafn = '/'.join([indir, name + '.lda']) corpusfn = '/'.join([indir, 'corpus_' + name + '.svmlight']) vocabfn = '/'.join([indir, 'vocab_' + name + '.dict']) # Load persisted data from disk. print "loading data..." vocab = load_vocab(vocabfn) corpus = load_corpus(corpusfn) lda = load_lda(ldafn) pnos = load_obj(pnofn) pno2id = {p: i for i, p in enumerate(pnos)} #produce visualization... commented out for now. keeps crashing the machine. # print "producing visualization..." # visfn = '/'.join([indir, 'vis.html']) # vis_data = prepare(lda, corpus, vocab) # print "saving visualization..." # pyLDAvis.save_html(vis_data, visfn) # put doc topics in db. print "Getting doc topics..." assert (len(corpus) == len(pnos)) db = MongoClient().patents def partfunc(doc): topics = lda[corpus[pno2id[doc['_id']]]] return {'$set': {'lda_topics': topics}} pats_test = db.traits.find().limit(limit) for p in pats_test: pprint(partfunc(p)) print "\nDone."
def test():
    """Load the pickled tf-idf GPE series and render them to a PDF."""
    series = load_obj('gpes_tfidf.p')
    plot_gpe(series, savefn='gpes_tfidf_test.pdf')
def load_anc_dec(start_date, indir):
    """Load the ancestral and descendant patent populations for a month.

    Params:
        start_date: datetime keying the pickle file under `indir`.
        indir: directory containing the pickled population dicts.

    Returns:
        (ancestors, descendants) pair of patent lists.

    Raises:
        ValueError: if the stored start date does not match `start_date`.
    """
    filename = '/'.join([indir, dt_as_str(start_date) + '.p'])
    pop_dict = load_obj(filename)
    # Explicit check instead of assert: asserts are stripped under -O, and a
    # date mismatch here means we silently loaded the wrong month's data.
    if start_date != pop_dict['start']:
        raise ValueError(
            "Loaded population starts at {}, expected {}.".format(
                pop_dict['start'], start_date))
    return pop_dict['ancestors'], pop_dict['descendants']
def load_pop(start_date):
    """Load the patents (dicts) for the month following `start_date`.

    Params:
        start_date: datetime keying the pickle file under _pop_dir.

    Returns:
        The 'descendants' list from the stored population dict.

    Raises:
        ValueError: if the stored start date does not match `start_date`.
    """
    filename = '/'.join([_pop_dir, dt_as_str(start_date) + '.p'])
    pop_dict = load_obj(filename)
    # Explicit check instead of assert: asserts are stripped under -O, and a
    # date mismatch here means we silently loaded the wrong month's data.
    if start_date != pop_dict['start']:
        raise ValueError(
            "Loaded population starts at {}, expected {}.".format(
                pop_dict['start'], start_date))
    return pop_dict['descendants']
from alife.util.general import load_obj
import sys
import os
import csv

if __name__ == '__main__':
    # Convert a pickled GPE-series dict into a flat CSV:
    # one row per (trait, time step) with the step's term tuple appended.
    if len(sys.argv) != 2:
        sys.exit("Usage: python {} <path to .p file>".format(sys.argv[0]))
    infn = sys.argv[1]
    inbase = os.path.basename(infn)
    # splitext instead of split('.')[0]: strips only the final extension,
    # so "foo.bar.p" maps to "foo.bar.csv" rather than "foo.csv".
    outfn = os.path.splitext(inbase)[0] + '.csv'
    gpes = load_obj(infn)
    # 'wb' is correct for csv.writer under Python 2 (avoids blank rows on
    # Windows); this file uses Python 2 print statements throughout.
    with open(outfn, 'wb') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['trait', 'time_step', 't1', 't2', 't3', 'total'])
        for trait, series in gpes.items():
            for step, term_list in enumerate(series):
                writer.writerow([trait, step] + list(term_list))