def dump_populations(db, start, end, outdir, delta=timedelta(days=7), lim=100, debug=True):
    """Step through time, maintaining the known ancestral population at each
    time step. Save each set of populations as a pickled dictionary."""
    for (tm1, t, tp1) in step_through_time(start, end, delta):
        new_ancestors, descendants = get_populations(db, tm1, t, tp1, lim)
        precompute_doc = {'_id': tm1,
                          'new_ancestors': list(new_ancestors),
                          'descendants': list(descendants)}
        if debug:
            # In debug mode, replace the populations with their sizes and
            # print instead of pickling.
            precompute_doc['new_ancestors'] = len(precompute_doc['new_ancestors'])
            precompute_doc['descendants'] = len(precompute_doc['descendants'])
            pprint(precompute_doc)
        else:
            popfn = '/'.join([outdir, dt_as_str(tm1) + '.p'])
            print "pickling population for time {} as {}".format(tm1, popfn)
            print "#new ancestors: {}, #descendants: {}".format(
                len(precompute_doc['new_ancestors']),
                len(precompute_doc['descendants']))
            pickle_obj(popfn, precompute_doc)
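# pickle_obj, dt_as_str, and step_through_time are used throughout but not
# defined in this section. Minimal sketches consistent with how they are
# called here; the bodies (and the date format) are assumptions inferred from
# usage, not the original implementations.
import cPickle as pickle
from datetime import timedelta

def pickle_obj(fn, obj):
    # Signature inferred from calls like pickle_obj(popfn, precompute_doc).
    with open(fn, 'wb') as f:
        pickle.dump(obj, f)

def dt_as_str(dt):
    # Filename-safe rendering of a datetime; the exact format is a guess.
    return dt.strftime('%Y-%m-%d')

def step_through_time(start, end, delta=timedelta(days=7)):
    # Yield (t - delta, t, t + delta) triples covering [start, end]; the
    # exact window semantics are a guess from how the triples are consumed.
    t = start + delta
    while t + delta <= end:
        yield (t - delta, t, t + delta)
        t += delta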
def main(infn, outfn, verbose=True):
    # Group parsed GPE entries by trait and pickle the resulting dict.
    trait_gpes = defaultdict(list)
    with open(infn, 'r') as infile:
        parsed = [x for x in (parseline(line) for line in infile)
                  if x is not None]
    for trait, gpes in parsed:
        if verbose:
            print "{}: {}".format(trait, gpes)
        trait_gpes[trait].append(gpes)
    pickle_obj(outfn, trait_gpes)
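# parseline is not defined in this section; from its use above it must return
# a (trait, gpes) pair, or None for lines that fail to parse. A hypothetical
# sketch assuming tab-separated input (the field separator is a guess):
def parseline(line):
    parts = line.strip().split('\t')
    if len(parts) != 2:
        return None  # skip malformed lines
    trait, gpes = parts
    return trait, gpes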
def get_and_save_community_colors(db, pnos, thresholds=None):
    community_colors_list = []
    if thresholds is None:
        thresholds = [0 for _ in pnos]
    for pno, threshold in zip(pnos, thresholds):
        viz_fn = 'viz_' + str(pno) + '.pdf'
        lookup_fn = 'lookup_' + str(pno) + '.p'
        color_lookup = community_colors(db, pno, threshold,
                                        show_vis=False, savefn=viz_fn)
        community_colors_list.append(color_lookup)
        pickle_obj(lookup_fn, color_lookup)
    return community_colors_list
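# A possible invocation; the patent numbers and thresholds below are made up
# for illustration:
from pymongo import MongoClient

db = MongoClient().patents
colors = get_and_save_community_colors(db, pnos=[4723129, 5103459],
                                       thresholds=[0.1, 0.2])
# Writes viz_<pno>.pdf and lookup_<pno>.p for each patent and returns the
# per-patent color lookups.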
def save(self, outdir, just_lda=False):
    """Save all model files (patent numbers, vocabulary, corpus, and the
    trained LDA model); with just_lda=True, save only the LDA model."""
    if not just_lda:
        pnofn = '/'.join([outdir, 'pnos.p'])
        vocabfn = '/'.join([outdir, 'vocab_' + self.name + '.dict'])
        corpusfn = '/'.join([outdir, 'corpus_' + self.name + '.svmlight'])
        if self.pnos is not None:
            pickle_obj(pnofn, self.pnos)
        self.vocab.save(vocabfn)
        corpora.SvmLightCorpus.serialize(corpusfn, self.corpus)
    ldafn = '/'.join([outdir, self.name + '.lda'])
    self._lda_model.save(ldafn)
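# Hypothetical usage, assuming an already-trained model object named `model`:
# model.save('models')                 # writes pnos, vocab, corpus, and LDA
# model.save('models', just_lda=True)  # writes only <name>.lda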
def test():
    db = MongoClient().patents
    all_pairs = db.just_cites.find()
    N = all_pairs.count()
    # Count shared traits across real citation pairs and across an equal
    # number of random pairs (a null model), storing each distribution of
    # counts as a pickled dict.
    real_shares_pairs = (nshared_tfidf(*p) for p in get_cite_pairs(N))
    real_shares_pairs_counts = Counter(real_shares_pairs)
    real_N = np.sum(real_shares_pairs_counts.values())
    pickle_obj('real_shares_counter_pairs.p', dict(real_shares_pairs_counts))
    rand_shares_pairs = (nshared_tfidf(*p) for p in get_rand_pairs(N))
    rand_shares_pairs_counts = Counter(rand_shares_pairs)
    rand_N = np.sum(rand_shares_pairs_counts.values())
    pickle_obj('rand_shares_counter_pairs.p', dict(rand_shares_pairs_counts))
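# The two pickled counters invite a side-by-side comparison. A minimal sketch
# (not part of the original code) that normalizes each counter to a
# probability mass function over the number of shared traits:
import cPickle as pickle

def load_pmf(fn):
    with open(fn, 'rb') as f:
        counts = pickle.load(f)
    total = float(sum(counts.values()))
    return {k: v / total for k, v in counts.items()}

real_pmf = load_pmf('real_shares_counter_pairs.p')
rand_pmf = load_pmf('rand_shares_counter_pairs.p')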
def main_noncum(name, mark=False):
    db = MongoClient().patents
    mindate = list(db.traits.find({'isd': {'$exists': True}})
                   .sort('isd', 1).limit(1))[0]['isd']
    maxdate = list(db.traits.find({'isd': {'$exists': True}})
                   .sort('isd', -1).limit(1))[0]['isd']
    # tfidf_traits = _tfidf_traits
    tfidf_traits = list(set(freq_prop_sample(3500) + _tfidf_traits))
    docvec_traits = range(200)  # each cluster is a docvec trait

    # Run the GPE calculation for tf-idf traits.
    logging.info("starting with tfidf...")
    gpes_tfidf = run_gpe_parmap_noncum(db, 'tf-idf', tfidf_traits,
                                       mindate.year, maxdate.year, mark=mark)
    # Serialize the GPE results as a pickled python dictionary.
    pickle_fn = name + 'gpes_tfidf_3k.p'
    logging.info("done. pickling in {}...".format(pickle_fn))
    pickle_obj(pickle_fn, gpes_tfidf)
    # Save the computed GPE terms as csv.
    csv_fn = name + 'gpes_tfidf_3k.csv'
    logging.info("saving as csv in {}...".format(csv_fn))
    with open(csv_fn, 'wb') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['trait', 'time_step', 't1', 't2', 't3', 'total'])
        for trait, series in gpes_tfidf.items():
            for step, term_list in enumerate(series):
                writer.writerow([trait, step] + list(term_list))

    # Run the GPE calculation for docvec traits.
    logging.info("now for docvec...")
    gpes_docvec = run_gpe_parmap_noncum(db, 'w2v', docvec_traits,
                                        mindate.year, maxdate.year, mark=mark)
    # Serialize the GPE results as a pickled python dictionary.
    logging.info("saving as pickle...")
    pickle_obj(name + 'gpes_docvec.p', gpes_docvec)
    # Save the computed GPE terms as csv.
    logging.info("done. saving as csv.")
    with open(name + 'gpes_docvec.csv', 'wb') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['trait', 'time_step', 't1', 't2', 't3', 'total'])
        for trait, series in gpes_docvec.items():
            for step, term_list in enumerate(series):
                writer.writerow([trait, step] + list(term_list))
    return gpes_tfidf, gpes_docvec
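# Reading the pickled results back. The 'run1' prefix below is illustrative;
# it stands in for whatever `name` was passed to main_noncum:
import cPickle as pickle

with open('run1gpes_tfidf_3k.p', 'rb') as f:
    gpes = pickle.load(f)
# gpes maps each trait to its time series; each step holds (t1, t2, t3, total).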
def main():
    db = MongoClient().patents
    in_deg_counts, out_deg_counts = in_and_out_counts(db, None)
    pickle_obj('in_deg_counts.p', dict(in_deg_counts))
    pickle_obj('out_deg_counts.p', dict(out_deg_counts))
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
    f.set_size_inches(18.5, 10.5)
    ax1.hist(in_deg_counts.keys(), weights=in_deg_counts.values(), bins=100)
    ax1.set_xlabel('In-Degree')
    ax1.set_ylabel('Count')
    ax2.hist(out_deg_counts.keys(), weights=out_deg_counts.values(), bins=100)
    ax2.set_xlabel('Out-Degree')
    ax2.set_ylabel('Count')
    plt.suptitle('Degree Distributions')
    plt.savefig('degree_distributions.png')
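# Toy illustration of the weighted-histogram idiom used above: the dict keys
# (degrees) are the sample values and the dict values (node counts) are the
# weights, so no per-node degree list has to be materialized.
import matplotlib.pyplot as plt

toy_counts = {1: 500, 2: 120, 3: 40, 10: 3}  # degree -> number of nodes
plt.figure()
plt.hist(toy_counts.keys(), weights=toy_counts.values(), bins=10)
plt.savefig('toy_degree_hist.png')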
def dump_pops_over_time(db, time_pairs, outdir, limit=None, mark=False):
    times = []
    popsizes = []
    for (time_0, time_1) in time_pairs:
        if mark:
            ancestors, new_descendants = map(
                list, get_anc_dec_mark(db, time_0, time_1, limit))
        else:
            ancestors, new_descendants = map(
                list, get_anc_dec_noncum(db, time_0, time_1, limit))
        precompute_doc = {'start': time_0,
                          'ancestors': ancestors,
                          'descendants': new_descendants}
        times.append(time_0)
        popsizes.append((len(ancestors), len(new_descendants)))
        popfn = '/'.join([outdir, dt_as_str(time_0) + '.p'])
        print "pickling population for time {} as {}".format(time_0, popfn)
        pickle_obj(popfn, precompute_doc)
    return times, popsizes
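# One way to build time_pairs: consecutive (start, end) windows cut at year
# boundaries. The year range and limit below are illustrative.
from datetime import datetime
from pymongo import MongoClient

db = MongoClient().patents
years = [datetime(y, 1, 1) for y in range(1980, 1991)]
time_pairs = zip(years[:-1], years[1:])
times, popsizes = dump_pops_over_time(db, time_pairs, 'populations', limit=1000)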
def dump_descendants_over_time(db, time_pairs, outdir, limit=None, debug=True):
    # Also returns the times and the population size at each time, suitable
    # for plotting a history of population sizes.
    times = []
    popsizes = []
    for (time_0, time_1) in time_pairs:
        new_descendants = list(get_new_descendants(db, time_0, time_1, limit))
        precompute_doc = {'start': time_0, 'descendants': new_descendants}
        print "number of descendants at time {}: {}".format(
            time_0, len(new_descendants))
        times.append(time_0)
        popsizes.append(len(new_descendants))
        if debug:
            precompute_doc['descendants'] = len(new_descendants)
            pprint(precompute_doc)
        else:
            popfn = '/'.join([outdir, dt_as_str(time_0) + '.p'])
            print "pickling population for time {} as {}".format(time_0, popfn)
            pickle_obj(popfn, precompute_doc)
    return times, popsizes
def main_both():
    """Run the GPE calculation for the whole population, for each tf-idf and
    docvec trait, stepping through time one week at a time."""
    db = MongoClient().patents
    oneweek = timedelta(days=7)
    mindate = list(db.traits.find({'isd': {'$exists': True}})
                   .sort('isd', 1).limit(1))[0]['isd']
    maxdate = list(db.traits.find({'isd': {'$exists': True}})
                   .sort('isd', -1).limit(1))[0]['isd']
    old_ancestors = []
    gpes_tfidf = {}
    gpes_docvec = {}
    for (t1, _, _) in step_through_time(mindate, maxdate, oneweek):
        logging.info("computing gpe for time {}, both tf-idf and w2v".format(t1))
        gpe_dict_w2v, new_ancestors = gpe_multi_threaded(
            t1, 'w2v', _docvec_traits, None, old_ancestors)
        # Multithreading is not worth the overhead for tf-idf.
        gpe_dict_tfidf, _ = gpe_multi(
            t1, 'tf-idf', _tfidf_traits, None, old_ancestors)
        gpes_docvec[t1] = gpe_dict_w2v[t1]
        gpes_tfidf[t1] = gpe_dict_tfidf[t1]
        old_ancestors = old_ancestors + new_ancestors
    return gpes_tfidf, gpes_docvec

if __name__ == '__main__':
    gpes_tfidf, gpes_docvec = main_both()
    try:
        pickle_obj('gpes_docvec.p', gpes_docvec)
        pickle_obj('gpes_tfidf.p', gpes_tfidf)
    except Exception:
        # If pickling fails, at least dump the results to the log.
        logging.exception("pickling failed; dumping results instead.")
        logging.info("docvec gpes:")
        pprint(gpes_docvec)
        logging.info("tfidf gpes:")
        pprint(gpes_tfidf)