import xmltodict

from dbm import DBM


def load_users(path):
    '''Parse one XML <row> element per line and insert each user document.'''
    dbm = DBM()
    with open(path, mode='r', encoding='utf-8') as fin:
        cnt = 0
        for line in fin:
            cnt += 1
            try:
                user = xmltodict.parse(line)['row']
                dbm.add_user(user)
                print(cnt)
            except Exception:
                print("ERROR")
                print(line)
def load_posts(path):
    '''Parse one XML <row> element per line and insert each post document.'''
    dbm = DBM()
    with open(path, mode='r', encoding='utf-8') as fin:
        dcnt = 0
        for line in fin:
            dcnt += 1
            try:
                post = xmltodict.parse(line)['row']
                dbm.add_post(post)
            except Exception:
                print("ERROR")
                print(line)
            # Recreate the DBM handle every million rows to bound memory use.
            if dcnt > 1000000:
                dcnt = 0
                del dbm
                dbm = DBM()
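if __name__ == '__main__':
    # Hedged usage sketch, assuming both loaders live in this module: the file
    # names are hypothetical stand-ins for Stack Exchange-style XML dumps that
    # store one <row> element per line, as the parsers above expect.
    load_users('Users.xml')
    load_posts('Posts.xml')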
import argparse
import pickle

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gauss', type=int, default=2)
    parser.add_argument('--epoch', type=int, default=50)
    parser.add_argument('--hidden', type=int, default=8)
    parser.add_argument('--steps', type=int, default=1)
    parser.add_argument('--recon', type=int, default=50)
    args = parser.parse_args()

    data_file = 'exp1_gauss' + str(args.gauss)
    with open(data_file, 'rb') as f:
        samples = pickle.load(f, encoding='bytes')

    n_visible, n_hidden = 2, args.hidden
    n_steps = args.steps
    n_epochs = args.epoch
    n_gibbs = args.recon

    dbm = DBM(num_visible=n_visible, num_hidden=n_hidden, CD_steps=n_steps,
              gibb_steps=n_gibbs, num_epochs=n_epochs)
    train = samples[:8000]
    validation = samples[8000:]
    dbm.fit(train, validation)
    plot_samples(dbm.reconstruction)
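# Hedged usage example (the script name is a hypothetical stand-in):
#   python train_dbm.py --gauss 2 --hidden 8 --epoch 50 --steps 1 --recon 50
# trains on the first 8000 pickled samples and validates on the remainder.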
from dbm import DBM
from tqdm import tqdm

'''
This script assigns a set of tags to every user based on their interactions.
'''

db = DBM()
upd_handler = DBM()

# The following line adds a '@Tags' attribute to each user doc:
# db.db.users.update({}, {'$set': {'@Tags': []}}, upsert=False, multi=True)


def append_tags(user_id, tags):
    '''Append a set of tags to a specific user's list of tags.'''
    upd_handler.db.users.update({'@Id': user_id},
                                {'$push': {'@Tags': {'$each': tags}}})


def tags_list_to_dict(user):
    '''Collapse a user's tag list into a {tag: count} dict.'''
    if '@Id' not in user or '@Tags' not in user:
        return
    tag_dict = {}
    for tag in user['@Tags']:
        if tag not in tag_dict:
            tag_dict[tag] = 1
        else:
            tag_dict[tag] += 1
    return tag_dict
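# Hedged usage sketch (the id and tags are hypothetical; append_tags writes to
# the live users collection, so this is illustrative only):
#   append_tags('42', ['python', 'mongodb'])
#   tags_list_to_dict({'@Id': '42', '@Tags': ['python', 'python', 'mongodb']})
#   # -> {'python': 2, 'mongodb': 1}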
assert (cfg.pretrain or not cfg.continue_learning)

### append node types
while len(cfg.utype) < len(cfg.l_size):
    cfg.utype.append(pyrbm.UnitType.binary)

print "Shuffling data..."
if not cfg.gethidrep:
    dataset.shuffle()

print "Initializing RBM..."
pyrbm.initialize(cfg)
print "ready."

if cfg.dbm:
    rbmstack = DBM(cfg)
else:
    rbmstack = pyrbm.RBMStack(cfg)
rbmstack.saveOptions(cfg.get_serialization_obj())

mbp = minibatch_provider.MNISTMiniBatchProvider(dataset.data, dataset.teacher)
print "Calculating statistics for minibatch..."
mbs = minibatch_provider.MiniBatchStatistics(mbp, rbmstack.layers[0].act)

if cfg.utype[0] == pyrbm.UnitType.gaussian:
    mbp.norm = lambda x: mbs.normalize_zmuv(x)
else:
    mbp.norm = lambda x: mbs.normalize_255(x)

if "test_data" in dataset.__dict__:
    mbp_test = minibatch_provider.MNISTMiniBatchProvider(dataset.test_data, dataset.test_teacher)
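# For reference, a hedged reading of the two normalizers (names from the source,
# formulas assumed): normalize_zmuv presumably maps x -> (x - mean) / std using
# the minibatch statistics, while normalize_255 rescales raw pixels as x / 255.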
from dbm import DBM
import networkx as nx

db = DBM()
db2 = DBM()

post_filter = {'@CreationDate': {'$gt': '2019'}}
post_filter = {}  # overrides the date filter above: process all posts
# post_filter = {'@Tags': {'$type': 'string', '$not': {'$type': 'array'}}}
posts = db.get_post(post_filter)

graph = nx.Graph()

# One-off migration: split the raw '<a><b>' tag string into a list of tags.
# cnt = 0
# for post in posts:
#     if cnt % 100 == 0:
#         print(cnt, post['@Tags'])
#     cnt += 1
#     tags = post['@Tags'][1:-1].replace("><", " ").split(" ")
#     # print(tags)
#     post['@Tags'] = tags
#     update = {"$set": {"@Tags": tags}}
#     db2.update_post({'_id': post['_id']}, update)

cnt = 0
for post in posts:
    # Skip posts whose tags are missing or still stored as a raw string.
    if '@Tags' not in post or isinstance(post['@Tags'], str):
        continue
    tags = post['@Tags']
    cnt += 1
    if cnt % 10000 == 0:
        print(cnt, tags)
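    # Hedged sketch, not in the original file: a plausible continuation that
    # turns co-occurring tags on a post into weighted edges of `graph`; the
    # weighting scheme is an assumption.
    for i in range(len(tags)):
        for j in range(i + 1, len(tags)):
            a, b = tags[i], tags[j]
            if graph.has_edge(a, b):
                graph[a][b]['weight'] += 1
            else:
                graph.add_edge(a, b, weight=1)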
print i, ' cycle entropy: ', entropy[-1], ' cycle accuracy: ', accuracy[-1]
joblib.dump(entropy, 'output/dbm_entropy')
joblib.dump(accuracy, 'output/dbm_accuracy')

dataset = np.round(np.random.rand(10000, 1))
labels = 1 - dataset
dataset = np.append(dataset, 1 - dataset, axis=1)
dataset = np.append(dataset, np.ones((dataset.shape[0], 1)), axis=1)
print 'dataset shape: ', dataset.shape

energy = []
entropy = []
accuracy = []

print 'initializing model'
dbm_test = DBM(dataset, layers=[30, 20])
#render_output(1,1)

for k in range(1, 3):
    for i in range(10):
        print 'beginning boltzmann training of model'
        dbm_test.train_unsupervised(k)
        render_output(i, k)

dbm_test.learning_rate = 1.0
dbm_test.add_layer(1)
dbm_test.labels = labels  # Adapt the output layer to the network
render_output(-1, 4)
render_supervised(-1)
for i in range(20):
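    # Hedged guess at the truncated loop body (not recoverable from the
    # source): presumably supervised fine-tuning with periodic rendering,
    # mirroring the unsupervised phase above.
    pass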
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from dbm import DBM

dbm = DBM()

# Get all tags from all users
all_users = dbm.get_user(flt={'tags': {'$exists': 1}})
all_tags = ""
norm_tags = {}
for user in all_users:
    for tag in user['tags']:
        all_tags += tag['name'] + " "
        if tag['name'] in norm_tags:
            norm_tags[tag['name']] += tag['count']
        else:
            norm_tags[tag['name']] = tag['count']

print("Number of distinct tags:", len(norm_tags))

# Generate a non-normalized tag cloud image
wordcloud = WordCloud(width=700, height=500, stopwords=['n'],
                      normalize_plurals=False, max_words=1000).generate(all_tags)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Non Normalized Tag-Cloud")

# Generate a Normalized tag cloud image
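# Hedged sketch, assuming the normalized cloud weights each tag by its
# accumulated count via WordCloud.generate_from_frequencies; the figure
# layout below mirrors the non-normalized plot and is otherwise an assumption.
norm_cloud = WordCloud(width=700, height=500, normalize_plurals=False,
                       max_words=1000).generate_from_frequencies(norm_tags)
plt.figure()
plt.imshow(norm_cloud, interpolation='bilinear')
plt.axis("off")
plt.title("Normalized Tag-Cloud")
plt.show()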