# Example #1
# 0
def main():
    """Attach word2vec cluster distances to every eligible trait document."""
    db = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    def make_update(doc):
        # Build the $set payload for a single patent document.
        distances = cluster_distances(db, doc['_id'], w2v, kmeans)
        return {'$set': {'wordvec_clusters': distances}}

    # Only process docs with a non-degenerate doc_vec and a non-empty
    # tf-idf term list.
    spec = {
        'doc_vec': {'$exists': True, '$nin': [[0] * 300]},
        'top_tf-idf': {'$nin': [[]]},
    }
    parallelMap(make_update,
                in_collection=db.traits,
                out_collection=db.traits,
                findArgs={'spec': spec, 'fields': {'_id': 1}},
                updateFreq=500,
                bSize=1000)
def main():
    """Load a trained LDA model and write per-patent topic vectors into Mongo.

    Reads the persisted corpus/model/patent-number list from disk, maps each
    patent number to its corpus row, and bulk-updates db.traits with an
    'lda_topics' field (or 'no_topics': True when topics cannot be computed).
    """
    # Get filenames.
    indir = '/Users/jmenick/Desktop/alife_refactor/output/lda_model_200'
    name = 'lda_200'
    pnofn = '/'.join([indir, 'pnos.p'])
    ldafn = '/'.join([indir, name + '.lda'])
    corpusfn = '/'.join([indir, 'corpus_' + name + '.svmlight'])
    vocabfn = '/'.join([indir, 'vocab_' + name + '.dict'])

    # Load persisted data from disk.
    # (single-arg print() behaves identically under Python 2 and 3)
    print("loading data...")
    # vocab is only consumed by the disabled pyLDAvis block below.
    vocab = load_vocab(vocabfn)
    corpus = load_corpus(corpusfn)
    lda = load_lda(ldafn)
    pnos = load_obj(pnofn)
    # pnos and corpus are parallel: patent number -> row index in corpus.
    pno2id = {p: i for i, p in enumerate(pnos)}

    # produce visualization... commented out for now due to crashing. Ugh PCA again...
    #    visfn = '/'.join([indir, 'vis.html'])
    #    vis_data = prepare(lda, corpus, vocab)
    #    pyLDAvis.save_html(vis_data, visfn)

    # put doc topics in db.
    print("inserting doc topics...")
    db = MongoClient().patents
    # BUGFIX: the format arguments were swapped relative to the labels
    # (len(pnos) was printed under the "len(corpus)" label and vice versa).
    print("len(corpus): {}, len(pnos): {}".format(len(corpus), len(pnos)))

    def partfunc(doc):
        # Build the update payload for one patent document.
        pno = doc['_id']
        try:
            corpus_idx = pno2id[pno]
            bow = corpus[corpus_idx]
            topics = lda[bow]
            return {'$set': {'lda_topics': topics}}
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Mark the doc instead of crashing.
            logging.warning("no topics for {}".format(pno))
            return {'$set': {'no_topics': True}}

    parallelMap(partfunc,
                in_collection=db.traits,
                out_collection=db.traits,
                findArgs={
                    'spec': {},
                    'fields': {'_id': 1}
                },
                bSize=1000,
                updateFreq=500)
def main():
    """Store word2vec cluster distances on each qualifying trait document."""
    db = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    def build_update(doc):
        # $set payload: this patent's distances to the k-means centroids.
        return {
            '$set': {
                'wordvec_clusters': cluster_distances(db, doc['_id'], w2v, kmeans)
            }
        }

    # Skip docs whose doc_vec is missing or all-zero, and docs with an
    # empty top tf-idf list.
    zero_vec = [0 for _ in range(300)]
    query = {
        'doc_vec': {'$exists': True, '$nin': [zero_vec]},
        'top_tf-idf': {'$nin': [[]]},
    }
    parallelMap(
        build_update,
        in_collection=db.traits,
        out_collection=db.traits,
        findArgs={'spec': query, 'fields': {'_id': 1}},
        updateFreq=500,
        bSize=1000,
    )
def main():
    """Insert per-patent LDA topic vectors into db.traits.

    Loads the persisted corpus/model/patent numbers from disk, then updates
    every trait doc with 'lda_topics' (or flags it with 'no_topics': True).
    """
    # Get filenames.
    indir = '/Users/jmenick/Desktop/alife_refactor/output/lda_model_200'
    name = 'lda_200'
    pnofn = '/'.join([indir, 'pnos.p'])
    ldafn = '/'.join([indir, name + '.lda'])
    corpusfn = '/'.join([indir, 'corpus_' + name + '.svmlight'])
    vocabfn = '/'.join([indir, 'vocab_' + name + '.dict'])

    # Load persisted data from disk.
    # (single-arg print() behaves identically under Python 2 and 3)
    print("loading data...")
    # vocab is only consumed by the disabled pyLDAvis block below.
    vocab = load_vocab(vocabfn)
    corpus = load_corpus(corpusfn)
    lda = load_lda(ldafn)
    pnos = load_obj(pnofn)
    # pnos and corpus are parallel: patent number -> row index in corpus.
    pno2id = {p: i for i, p in enumerate(pnos)}

    #produce visualization... commented out for now due to crashing. Ugh PCA again...
#    visfn = '/'.join([indir, 'vis.html'])
#    vis_data = prepare(lda, corpus, vocab)
#    pyLDAvis.save_html(vis_data, visfn)

    # put doc topics in db.
    print("inserting doc topics...")
    db = MongoClient().patents
    # BUGFIX: the format arguments were swapped relative to the labels
    # (len(pnos) was printed under the "len(corpus)" label and vice versa).
    print("len(corpus): {}, len(pnos): {}".format(len(corpus), len(pnos)))

    def partfunc(doc):
        # Build the update payload for one patent document.
        pno = doc['_id']
        try:
            corpus_idx = pno2id[pno]
            bow = corpus[corpus_idx]
            topics = lda[bow]
            return {'$set': {'lda_topics': topics}}
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Flag the doc instead of crashing.
            logging.warning("no topics for {}".format(pno))
            return {'$set': {'no_topics': True}}

    parallelMap(
        partfunc,
        in_collection=db.traits,
        out_collection=db.traits,
        findArgs={
            'spec': {},
            'fields': {'_id': 1}
        },
        bSize=1000,
        updateFreq=500
    )
def main():
    """Attach word2vec cluster distances to every trait doc, best-effort.

    Docs whose distances cannot be computed get an empty list rather than
    aborting the whole pass.
    """
    db = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    def part_func(doc):
        try:
            clusters = cluster_distances(db, doc["_id"], w2v, kmeans)
            return {"$set": {"wordvec_clusters": clusters}}
        except Exception:
            # BUGFIX: was a bare `except:` that silently swallowed every
            # error (including KeyboardInterrupt/SystemExit). Narrowed to
            # Exception and logged so failures are visible, while keeping
            # the best-effort [] fallback callers expect.
            logging.warning("no wordvec_clusters for {}".format(doc["_id"]))
            return {"$set": {"wordvec_clusters": []}}

    parallelMap(
        part_func,
        in_collection=db.traits,
        out_collection=db.traits,
        findArgs={"spec": {}, "fields": {}},
        updateFreq=500,
        bSize=1000,
    )
# Example #6
# 0
def compute_reach(db, trait='w2v', n_gens=2, family=None, enforce_func = lambda x: True):
    """Compute n-generation reach for the given trait and persist it.

    When `family` is given, the listed docs are updated serially; otherwise
    every doc in db.traits carrying the trait field is processed in parallel.
    """
    trait_field, _, _, _ = _trait_info[trait]

    def one_reach(doc):
        # $set payload: parent/child trait distances for this patent.
        distances = parent_child_trait_distance(
            doc['_id'], n_gens=n_gens, trait=trait, db=db,
            enforce_func=enforce_func)
        return {'$set': distances}

    if family is None:
        # Whole-collection pass, parallelized.
        parallelMap(
            one_reach,
            in_collection=db.traits,
            out_collection=db.traits,
            findArgs={
                'spec': {trait_field: {'$exists': True}},
                'fields': {trait_field: 1, 'citedby': 1, '_id': 1}
            },
            updateFreq=500,
            bSize=1000
        )
    else:
        for doc in family:
            logging.info("Computing {} gen {} reach for patent {}".format(n_gens, trait, doc['_id']))
            db.traits.update({'_id': doc['_id']}, one_reach(doc))