def main():
    """Store word2vec cluster distances on every patent that has a
    non-trivial ``doc_vec`` and a non-empty ``top_tf-idf`` field.
    """
    db = MongoClient().patents
    # 300-dimensional word vectors, 200 k-means clusters.
    w2v, kmeans = model_loader(300, 200)

    def make_update(doc):
        # Build the Mongo update for a single patent document.
        dists = cluster_distances(db, doc['_id'], w2v, kmeans)
        return {'$set': {'wordvec_clusters': dists}}

    # Skip patents whose doc_vec is the all-zeros vector or whose
    # top_tf-idf list is empty; fetch only the _id field.
    zero_vec = [0 for _ in range(300)]
    find_args = {
        'spec': {
            'doc_vec': {'$exists': True, '$nin': [zero_vec]},
            'top_tf-idf': {'$nin': [[]]},
        },
        'fields': {'_id': 1},
    }
    parallelMap(make_update,
                in_collection=db.traits,
                out_collection=db.traits,
                findArgs=find_args,
                updateFreq=500,
                bSize=1000)
def main():
    """Load a persisted 200-topic LDA model and write each document's
    topic distribution into the ``lda_topics`` field of patents.traits;
    documents with no topics are flagged with ``no_topics``.
    """
    # Get filenames of the persisted model artifacts.
    indir = '/Users/jmenick/Desktop/alife_refactor/output/lda_model_200'
    name = 'lda_200'
    pnofn = '/'.join([indir, 'pnos.p'])
    ldafn = '/'.join([indir, name + '.lda'])
    corpusfn = '/'.join([indir, 'corpus_' + name + '.svmlight'])
    vocabfn = '/'.join([indir, 'vocab_' + name + '.dict'])
    # Load persisted data from disk.
    print("loading data...")
    # vocab is only needed by the (currently disabled) visualization below.
    vocab = load_vocab(vocabfn)
    corpus = load_corpus(corpusfn)
    lda = load_lda(ldafn)
    pnos = load_obj(pnofn)
    # Map patent number -> row index into the corpus.
    pno2id = {p: i for i, p in enumerate(pnos)}
    # produce visualization... commented out for now due to crashing. Ugh PCA again...
    # visfn = '/'.join([indir, 'vis.html'])
    # vis_data = prepare(lda, corpus, vocab)
    # pyLDAvis.save_html(vis_data, visfn)
    # Put doc topics in db.
    print("inserting doc topics...")
    db = MongoClient().patents
    print("len(corpus): {}, len(pnos): {}".format(len(pnos), len(corpus)))

    def partfunc(doc):
        # Worker: compute the update for one patent _id.
        pno = doc['_id']
        try:
            bow = corpus[pno2id[pno]]
            topics = lda[bow]
            return {'$set': {'lda_topics': topics}}
        except (KeyError, IndexError):
            # Narrowed from a bare `except:` which also swallowed
            # SystemExit/KeyboardInterrupt and real programming errors.
            # KeyError: pno not in pno2id; IndexError: bad corpus index.
            logging.warning("no topics for {}".format(pno))
            return {'$set': {'no_topics': True}}

    parallelMap(partfunc,
                in_collection=db.traits,
                out_collection=db.traits,
                findArgs={'spec': {}, 'fields': {'_id': 1}},
                bSize=1000,
                updateFreq=500)
def main():
    """Compute and persist word2vec cluster distances for qualifying
    patents (non-zero ``doc_vec``, non-empty ``top_tf-idf``).
    """
    db = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    def part_func(doc):
        # One update per patent: distances to each word-vector cluster.
        return {
            '$set': {
                'wordvec_clusters': cluster_distances(
                    db, doc['_id'], w2v, kmeans),
            },
        }

    spec = {
        'doc_vec': {'$exists': True, '$nin': [[0 for _ in range(300)]]},
        'top_tf-idf': {'$nin': [[]]},
    }
    parallelMap(
        part_func,
        in_collection=db.traits,
        out_collection=db.traits,
        findArgs={'spec': spec, 'fields': {'_id': 1}},
        updateFreq=500,
        bSize=1000,
    )
def main():
    """Load the persisted lda_200 model and write per-document topic
    vectors into ``lda_topics`` on patents.traits, marking documents
    that have no topics with ``no_topics: True``.
    """
    # Get filenames of the persisted model artifacts.
    indir = '/Users/jmenick/Desktop/alife_refactor/output/lda_model_200'
    name = 'lda_200'
    pnofn = '/'.join([indir, 'pnos.p'])
    ldafn = '/'.join([indir, name + '.lda'])
    corpusfn = '/'.join([indir, 'corpus_' + name + '.svmlight'])
    vocabfn = '/'.join([indir, 'vocab_' + name + '.dict'])
    # Load persisted data from disk.
    print("loading data...")
    # vocab is only used by the (currently disabled) visualization below.
    vocab = load_vocab(vocabfn)
    corpus = load_corpus(corpusfn)
    lda = load_lda(ldafn)
    pnos = load_obj(pnofn)
    # Map patent number -> row index into the corpus.
    pno2id = {p: i for i, p in enumerate(pnos)}
    # produce visualization... commented out for now due to crashing. Ugh PCA again...
    # visfn = '/'.join([indir, 'vis.html'])
    # vis_data = prepare(lda, corpus, vocab)
    # pyLDAvis.save_html(vis_data, visfn)
    # Put doc topics in db.
    print("inserting doc topics...")
    db = MongoClient().patents
    print("len(corpus): {}, len(pnos): {}".format(len(pnos), len(corpus)))

    def partfunc(doc):
        # Worker: look up the bow for this patent and infer its topics.
        pno = doc['_id']
        try:
            corpus_idx = pno2id[pno]
            bow = corpus[corpus_idx]
            topics = lda[bow]
            return {'$set': {'lda_topics': topics}}
        except (KeyError, IndexError):
            # Narrowed from a bare `except:`, which also hid
            # SystemExit/KeyboardInterrupt and genuine bugs.
            # KeyError: pno missing from pno2id; IndexError: bad index.
            logging.warning("no topics for {}".format(pno))
            return {'$set': {'no_topics': True}}

    parallelMap(
        partfunc,
        in_collection=db.traits,
        out_collection=db.traits,
        findArgs={'spec': {}, 'fields': {'_id': 1}},
        bSize=1000,
        updateFreq=500,
    )
def main():
    """Compute word2vec cluster distances for every patent in the
    traits collection; patents that cannot be processed get an empty
    ``wordvec_clusters`` list instead of aborting the run.
    """
    db = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    def part_func(doc):
        # Best-effort per-document worker: keep the batch going even if
        # one patent fails (e.g. missing vector data).
        try:
            clusters = cluster_distances(db, doc["_id"], w2v, kmeans)
        except Exception:
            # Narrowed from a bare `except:`, which would also have
            # swallowed SystemExit and KeyboardInterrupt.
            clusters = []
        return {"$set": {"wordvec_clusters": clusters}}

    parallelMap(
        part_func,
        in_collection=db.traits,
        out_collection=db.traits,
        findArgs={"spec": {}, "fields": {}},
        updateFreq=500,
        bSize=1000,
    )
def compute_reach(db, trait='w2v', n_gens=2, family=None, enforce_func=lambda x: True):
    """Compute the n-generation reach of patents for the given trait.

    If ``family`` is given, update each of its documents one at a time;
    otherwise run in parallel over every document in ``db.traits`` that
    has the trait's field.
    """
    trait_field, _, _, _ = _trait_info[trait]

    def reach_update(doc):
        # Mongo update document carrying the parent/child trait distances.
        reach = parent_child_trait_distance(
            doc['_id'], n_gens=n_gens, trait=trait, db=db,
            enforce_func=enforce_func)
        return {'$set': reach}

    if family is None:
        # Whole-collection mode: parallel map over documents that carry
        # the trait field, fetching only the fields the worker needs.
        parallelMap(
            reach_update,
            in_collection=db.traits,
            out_collection=db.traits,
            findArgs={
                'spec': {trait_field: {'$exists': True}},
                'fields': {trait_field: 1, 'citedby': 1, '_id': 1},
            },
            updateFreq=500,
            bSize=1000,
        )
    else:
        # Explicit-family mode: update each document serially.
        for doc in family:
            logging.info("Computing {} gen {} reach for patent {}".format(
                n_gens, trait, doc['_id']))
            db.traits.update({'_id': doc['_id']}, reach_update(doc))