def combine(val, res, mapping):
    # Merge predicted labels back into the validation/test documents and dump
    # the result to '<val>_res.json'.
    data = read_content(val)
    mesh_map, mesh_rev_map = mesh_mapping(mapping)
    print len(mesh_map), len(mesh_rev_map)
    resdict = collections.defaultdict(list)
    res = read_content(res)
    # pmid -> predicted MeSH terms (label ids mapped back to term names)
    for i in res['documents']:
        resdict[i['pmid']] = [mesh_rev_map[k] for k in i['labels']]
    for i in data['documents']:
        i['meshMajor'] = resdict[i['pmid']]
    with open(val.split('.json')[0] + '_res.json', 'w') as outfile:
        json.dump(data, outfile)
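# Usage sketch for combine() (file names are hypothetical): `val` is the
# original validation/test JSON, `res` is the prediction JSON produced by the
# model, and `mapping` is the MeSH mapping file. The merged output is written
# next to `val` as '<val>_res.json'.
#
#     combine('bioasq_val.json', 'bioasq_val_predictions.json', 'mesh_mapping.txt')
#     # -> writes 'bioasq_val_res.json' with a 'meshMajor' list per document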
def __read_impl(self, excluded, fpath):
    """Single read operation: if excluded is True, return an empty list;
    otherwise return the data content read from fpath."""
    if excluded:
        return list()
    return utils.read_content(fpath)
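# Sketch of the intended call pattern (the surrounding class and the names
# `sources` / `excluded_set` below are assumptions, not shown in this snippet):
# sources flagged as excluded contribute an empty list instead of file content.
#
#     chunks = [self.__read_impl(i in excluded_set, fpath)
#               for i, fpath in enumerate(sources)]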
def test_utils():
    # lemmatized = "f:/Corpus/lemmatized_trec_all.dat"
    lemmatized = "f:/Corpus/new4.dat"
    from utils import read_content
    count = 0
    for text, meta in read_content(lemmatized):
        count += 1
        # if count == 10:
        #     break
        print(text, meta)
def test_from_data_file(r6):
    import driver

    def _corrupt(fname, index, size):
        get_logger().warning("corrupting disk {}".format(index))
        error_fpath = r6.get_real_name(index, fname)
        error_content = os.urandom(size)
        utils.write_content(error_fpath, error_content)

    def _corrupt2(fname, indexes, size):
        for index in indexes:
            _corrupt(fname, index, size)

    data_fname = 'data3'
    SIZE = 32768
    driver.gen_rnd_file(data_fname, SIZE, 'text')
    fpath = os.path.join(config.root, 'data3')
    original_content = utils.read_content(fpath)
    r6.write(original_content, data_fname)
    r6.detect_corruption(data_fname)

    # Corrupt one disk at a time (data disks, then the P and Q disks) and recover it.
    for error_index in [0, 3, r6.N - 2, r6.N - 1]:
        error_size = SIZE // 13
        _corrupt(data_fname, error_index, error_size)
        found_error_index = r6.detect_corruption(data_fname)
        if found_error_index is not None:
            get_logger().warning("recover disk {}".format(error_index))
            assert found_error_index == error_index
            if found_error_index < r6.N - 1:
                r6.recover_d_or_p(data_fname, found_error_index)
            else:
                r6.recover_q(data_fname)
        r6.detect_corruption(data_fname)

    #####################################################
    get_logger().warning("testing recover_d_q")
    error_indexes = [4, r6.N - 1]
    size = SIZE // (r6.N - 4)
    _corrupt2(data_fname, error_indexes, size)
    r6.recover_d_q(data_fname, error_indexes[0])
    r6.detect_corruption(data_fname)

    #####################################################
    get_logger().warning("testing recover_2d")
    error_indexes = [0, 1]
    size = SIZE // (r6.N + 2)
    _corrupt2(data_fname, error_indexes, size)
    r6.recover_2d(data_fname, error_indexes[0], error_indexes[1])
    r6.detect_corruption(data_fname)

    #####################################################
    get_logger().warning("testing recover_d_p")
    error_indexes = [0, r6.N - 2]
    size = SIZE // (r6.N - 2)
    _corrupt2(data_fname, error_indexes, size)
    r6.recover_d_p(data_fname, error_indexes[0])
    r6.detect_corruption(data_fname)
def predict(te, vocab, label_dict, label_rev_dict, mesh_map, mesh_rev_map,
            prefix, buckets, model, nhidden, nlayer, dropout, nepoch,
            batch_size):
    # Prediction for the testing data set
    batch_size = 1
    tins, tlabels, tpmids, t, tld, tlrd = load_data(
        read_content(te), vocab, label_dict, label_rev_dict, tr=False)
    print 'tins', len(tins)
    res = {}
    res["documents"] = []
    param_file = "./models/%s-%s" % (prefix, 30)
    # arg_param, aux_param = load_param(param_file)
    make_predict(res, tins, len(label_dict), tpmids, model, param_file,
                 buckets, nhidden, nlayer, vocab, dropout, label_rev_dict,
                 mesh_map, mesh_rev_map, nepoch, batch_size)
    return res
def train(args, path, df, val, te, meshmap, nhidden, nembed, batch_size,
          nepoch, model, nlayer, eta, dropout, split, is_train):
    assert model in ['ffn', 'lstm', 'bilstm', 'gru']
    data = read_content_stream(os.path.join(path, df))
    nins, vocab, label_dict, label_rev_dict = load_data_statics(data)
    mesh_map, mesh_rev_map = mesh_mapping(meshmap)
    contexts = [mx.context.gpu(i) for i in xrange(1)]
    nwords = len(vocab)
    nlabels = len(label_dict)
    print '#ins', nins
    print '#labels', nlabels
    print '#words', nwords
    # Stream the corpus in npart partitions so it never has to fit in memory
    # all at once.
    npart = 30
    pins = chunkl(nins, npart)
    buckets = [50, 100, 200, 300, 150, 1000]
    prefix = model + '_' + str(nlayer) + '_' + str(nhidden) + "_" + str(nembed)
    gen_data = read_content_stream(os.path.join(path, df))
    logging.basicConfig(level=logging.DEBUG)
    logging.info('start with arguments %s', args)

    def run_partitions(sym_gen):
        # Shared training loop for every model type: read one partition,
        # build the bucketed iterators, then fit, resuming from the previous
        # partition's checkpoint when pidx > 0.
        for pidx in xrange(len(pins)):
            print 'partition ', pidx
            data = {'articles': []}
            for _ in xrange(pins[pidx]):
                data['articles'].append(gen_data.next())
            ins, labels, pmids, v, ld, lrd = load_data(
                data, vocab, label_dict, label_rev_dict)
            if val is None:
                tr_data, val_data = get_data_iter(ins, labels, nlabels,
                                                  batch_size, [], buckets,
                                                  split)
            else:
                tr_data = BucketFlexIter(ins, labels, nlabels, batch_size, [],
                                         buckets)
                vins, vlabels, vpmids, v, ld, lrd = load_data(
                    read_content(os.path.join(path, val)),
                    vocab, label_dict, label_rev_dict, tr=False)
                val_data = BucketFlexIter(vins, vlabels, nlabels, batch_size,
                                          [], buckets)
            if len(buckets) == 1:
                mod = mx.mod.Module(*sym_gen(buckets[0]), context=contexts)
            else:
                mod = mx.mod.BucketingModule(
                    sym_gen,
                    default_bucket_key=tr_data.default_bucket_key,
                    context=contexts)
            if not is_train:
                continue
            if pidx:
                sym, arg_params, aux_params = mx.model.load_checkpoint(
                    './models/%s-%s' % (prefix, pidx - 1), nepoch)
                mod.bind(data_shapes=tr_data.provide_data,
                         label_shapes=tr_data.provide_label,
                         for_training=True)
                mod.set_params(arg_params=arg_params, aux_params=aux_params)
                mod.fit(tr_data,
                        eval_data=val_data,
                        num_epoch=nepoch,
                        epoch_end_callback=mx.callback.do_checkpoint(
                            './models/%s-%s' % (prefix, pidx), period=nepoch),
                        eval_metric=['rmse', accuracy, ins_recall])
            else:
                mod.fit(tr_data,
                        eval_data=val_data,
                        num_epoch=nepoch,
                        epoch_end_callback=mx.callback.do_checkpoint(
                            './models/%s-%s' % (prefix, pidx), period=nepoch),
                        eval_metric=['rmse', accuracy, ins_recall],
                        batch_end_callback=mx.callback.Speedometer(
                            batch_size, 500),
                        initializer=mx.init.Xavier(factor_type="in",
                                                   magnitude=2.34),
                        optimizer='sgd',
                        optimizer_params={
                            'learning_rate': eta,
                            'momentum': 0.9,
                            'wd': 0.00001
                        })

    if model == 'ffn':

        def ffn_gen(seq_len):
            sym = ffn.ffn(nlayer, seq_len, nwords, nhidden, nembed, nlabels,
                          dropout)
            return sym, ['data'], ['label']

        run_partitions(ffn_gen)

    elif model == 'lstm':
        init_c = [('l%d_init_c' % l, (batch_size, nhidden))
                  for l in range(nlayer)]
        init_h = [('l%d_init_h' % l, (batch_size, nhidden))
                  for l in range(nlayer)]
        init_states = init_c + init_h
        state_names = [x[0] for x in init_states]

        def lstm_gen(seq_len):
            sym = lstm.lstm_unroll(nlayer, seq_len, nwords, nhidden, nembed,
                                   nlabels, dropout)
            return sym, ['data'] + state_names, ['label']

        run_partitions(lstm_gen)

    elif model == 'gru':
        init_h = [('l%d_init_h' % l, (batch_size, nhidden))
                  for l in range(nlayer)]
        init_states = init_h
        state_names = [x[0] for x in init_states]

        def gru_gen(seq_len):
            sym = gru.my_GRU_unroll(nlayer, seq_len, nwords, nhidden, nembed,
                                    nlabels, dropout)
            return sym, ['data'] + state_names, ['label']

        run_partitions(gru_gen)

    elif model == 'bilstm':
        init_cf = [('lf%d_init_c' % l, (batch_size, nhidden))
                   for l in range(nlayer)]
        init_cb = [('lb%d_init_c' % l, (batch_size, nhidden))
                   for l in range(nlayer)]
        init_hf = [('lf%d_init_h' % l, (batch_size, nhidden))
                   for l in range(nlayer)]
        init_hb = [('lb%d_init_h' % l, (batch_size, nhidden))
                   for l in range(nlayer)]
        init_states = init_cf + init_hf + init_cb + init_hb
        state_names = [x[0] for x in init_states]

        def bilstm_gen(seq_len):
            data = mx.sym.Variable('data')
            embed_weight = mx.sym.Variable('embed_weight')
            concat_weight = mx.sym.Variable('concat_weight')
            hds = mx.sym.Embedding(data=data, weight=embed_weight,
                                   input_dim=nwords, output_dim=nembed,
                                   name='embed')
            w2v = mx.sym.SliceChannel(data=hds, num_outputs=seq_len,
                                      squeeze_axis=1)
            for layidx in xrange(nlayer):
                w2v = bi_lstm_unroll(w2v, concat_weight, seq_len, nwords,
                                     nhidden, nembed, nlabels, dropout,
                                     layidx)
            # Average the per-step hidden states, then classify.
            w2v = [mx.sym.expand_dims(x, axis=1) for x in w2v]
            hidden = mx.sym.Concat(*w2v, dim=1)
            hidden = mx.sym.sum_axis(hidden, axis=1) / seq_len
            cls_weight = mx.sym.Variable('cls_weight')
            cls_bias = mx.sym.Variable('cls_bias')
            hidden = mx.sym.FullyConnected(data=hidden, weight=cls_weight,
                                           bias=cls_bias, num_hidden=nlabels,
                                           name='fc_cls')
            loss = mx.sym.LinearRegressionOutput(
                data=hidden, label=mx.sym.Variable('label'))
            return loss, ['data'] + state_names, ['label']

        run_partitions(bilstm_gen)

    return vocab, label_dict, label_rev_dict, prefix, buckets, mesh_map, mesh_rev_map
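# Minimal driver sketch tying train() and predict() together. Every path and
# hyper-parameter value below is hypothetical; only the call signatures come
# from the functions defined above.
#
#     vocab, label_dict, label_rev_dict, prefix, buckets, mesh_map, mesh_rev_map = train(
#         args, 'data/', 'train.json', 'val.json', 'test.json', 'mesh_mapping.txt',
#         nhidden=512, nembed=256, batch_size=32, nepoch=5, model='lstm',
#         nlayer=1, eta=0.01, dropout=0.5, split=0.9, is_train=True)
#     res = predict('data/test.json', vocab, label_dict, label_rev_dict,
#                   mesh_map, mesh_rev_map, prefix, buckets, 'lstm',
#                   nhidden=512, nlayer=1, dropout=0.5, nepoch=5, batch_size=32)
#     with open('test_predictions.json', 'w') as f:
#         json.dump(res, f)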
regex4 = re.compile("SYRINGE[^$\W]")
# regex4 = re.compile("[\s]+DEPRESSION")
regex5 = re.compile("PTSD[^$\W]")
regex_last = re.compile('[-(]?[A-Z][a-z0-9]+[\W]?[a-z0-9:]*[\s]*')
# data = utils.readData("../" + cfg.PATH_INPUT, "../" + cfg.PATH_PREPROCESSED_TRAIN, 1)

outDir = cfg.PATH_TRAIN + "refactor/"
try:
    os.makedirs(outDir)
except OSError:
    pass

data = utils.read_content(cfg.PATH_TRAIN)
for idx, content in data.items():
    content_str = ""
    for word in content.split(' '):
        found = False
        new_str, found = getUpdatedStr([regex1], regex_exclude, found, 1, word)
        # if not found:
        new_str, found = getUpdatedStr([regex_last], regex_exclude, found, 3, word)
        if not found:
            new_str, found = getUpdatedStr([regex2, regex3, regex4, regex5],
                                           regex_exclude, found, 2, word)
        if not found:
            new_str = word
def get(self, url, param, retry=3):
    logger.info('Crawl content url: %s, %s', url, str(param))
    if not url.startswith('http'):
        # Non-HTTP "urls" are treated as local file paths and read directly.
        return utils.read_content(url)
    return utils.get_data(url, param, retry)
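# Behaviour sketch for get(): anything that does not start with 'http' is
# read via utils.read_content as a local file, which makes it easy to replay
# cached pages in tests. `crawler` and the paths/URLs below are hypothetical.
#
#     crawler.get('cache/page_12345.html', {})              # read from disk
#     crawler.get('http://example.com/list', {'page': 2})   # fetch over HTTP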
app = Flask(__name__)
template_filename = 'lomake_2.html'
storyfiles = {'Ilmastonmuutos': 'ilmasto.txt', 'Töfö-pasta': 'pasta.txt'}
dirs = {'imgdir': 'static/img/', 'styledir': 'static/css/'}
possible_adjectives = [
    'cooli', 'epäcooli', 'tyhjä', 'pörröinen', 'höpö', 'suurenmoinen',
    'vituttava', 'vihreä', 'neliömäinen', 'pyöreä', 'juustomainen',
    'kukkainen', 'koiramainen', 'iso', 'kaunis', 'upee', 'harmaa',
    'pimeä', 'rasistinen'
]
stories = read_content(storyfiles)


def roll_article(s):
    o = random.choice(list(s.keys()))
    return o


def roll_adjective(possible_adjectives=possible_adjectives):
    a = random.choice(possible_adjectives)
    return a


def handle_post(data):
    a = []
    print(data)