def generate_scan_image(subset):
    list_dirs = os.walk(TRUNK_DIR + subset)
    jsobjs = []
    output_dir = SAMPLE_DIR + subset
    mkdir(output_dir)
    for root, dirs, files in list_dirs:
        for f in files:
            if f.lower().endswith('mhd'):
                key = os.path.splitext(f)[0]
                numpyImage, numpyOrigin, numpySpacing = (
                    util.load_itk_image(os.path.join(root, f)))
                for z in range(numpyImage.shape[0]):
                    patch = numpyImage[z, 0:512, 0:512]
                    patch = util.normalizePlanes(patch)
                    im = Image.fromarray(patch * 255).convert('L')
                    output_filename = (
                        subset + "-" + key + "-" + str(z) + "-scan.bmp")
                    print(subset + '/' + output_filename)
                    im.save(os.path.join(output_dir, output_filename))
                    jsobjs.append({
                        "image_path": subset + '/' + output_filename,
                        "rects": []
                    })
    with open(META_DIR + subset + '-scan.json', 'w') as f:
        json.dump(jsobjs, f)
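
# The slices above are normalized with util.normalizePlanes before being
# written out as 8-bit images. A minimal sketch of what that helper usually
# does for LUNA16-style CT data; the HU window bounds of -1000 and 400 are an
# assumption, not taken from this repository's util module.
import numpy as np

def normalizePlanes(npzarray, maxHU=400.0, minHU=-1000.0):
    """Clip Hounsfield units to a fixed window and rescale to [0, 1]."""
    npzarray = (npzarray - minHU) / (maxHU - minHU)
    npzarray[npzarray > 1] = 1.0
    npzarray[npzarray < 0] = 0.0
    return npzarray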
def make_log_file(self, base_directory, filename):
    util.mkdir(base_directory)
    # items() instead of iteritems() keeps this compatible with Python 3.
    with open(base_directory + filename, 'w') as f:
        for k, v in self.__dict__.items():
            line = str(k) + " : " + str(v) + "\n"
            f.write(line)
def prepare():
    args = read_params(sys.argv)
    conf = ConfigParser()
    conf.read(args['config'])
    args['wdir'] = conf.get('param', 'work_dir')
    args['rdir'] = conf.get('param', 'raw_dir_name')
    args['rdir'] = '%s/%s' % (args['wdir'], args['rdir'])
    mkdir('%s/temp' % args['rdir'])
    return args
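
# prepare() expects an INI-style file with a [param] section containing
# work_dir and raw_dir_name. A minimal example of such a config, written and
# read with the standard library ConfigParser; the concrete paths and file
# name below are placeholders, not values taken from the project.
from configparser import ConfigParser

def write_example_config(path='example.cfg'):
    conf = ConfigParser()
    conf['param'] = {'work_dir': '/data/work', 'raw_dir_name': 'raw'}
    with open(path, 'w') as f:
        conf.write(f)
    # With this config, prepare() would resolve rdir to '/data/work/raw'
    # and create '/data/work/raw/temp'.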
def generate_data(data_root, data_map, fp_dir):
    list_dirs = os.walk(data_root)
    index = 0
    for i in range(10):
        util.mkdir(FP_DIR + 'subset' + str(i))
    meta = dict([('subset' + str(i), []) for i in range(10)])
    for root, dirs, files in list_dirs:
        for f in files:
            if f.lower().endswith("mhd"):
                print(f)
                key = os.path.splitext(f)[0]
                subset = root.split("/")[-1]
                if key in data_map:
                    numpyImage, numpyOrigin, numpySpacing = util.load_itk_image(
                        os.path.join(root, f))
                    for it in data_map[key]:
                        worldCoord, label = it
                        voxelCoord = util.worldToVoxelCoord(
                            worldCoord, numpyOrigin, numpySpacing)
                        voxelWidth = 65
                        x = int(voxelCoord[1])
                        y = int(voxelCoord[2])
                        z = int(voxelCoord[0])
                        patch = numpyImage[z,
                                           x - voxelWidth // 2:x + voxelWidth // 2,
                                           y - voxelWidth // 2:y + voxelWidth // 2]
                        patch = util.normalizePlanes(patch)
                        if patch.size == 0:
                            continue
                        fpath = os.path.join(
                            fp_dir, subset + '/patch_' + str(index) + '.bmp')
                        Image.fromarray(patch * 255).convert('L').save(fpath)
                        meta[subset].append((fpath, label))
                        index += 1
                        if label == 1:
                            # Augment positive samples with randomly shifted crops.
                            for i in range(50):
                                dx, dy = MOV_LIST[i % 8]
                                xx = x + int(dx * np.random.rand())
                                yy = y + int(dy * np.random.rand())
                                aug_patch = numpyImage[z,
                                                       xx - voxelWidth // 2:xx + voxelWidth // 2,
                                                       yy - voxelWidth // 2:yy + voxelWidth // 2]
                                aug_patch = util.normalizePlanes(aug_patch)
                                if aug_patch.size == 0:
                                    continue
                                fpath = os.path.join(
                                    fp_dir,
                                    subset + '/patch_' + str(index) + '.bmp')
                                Image.fromarray(aug_patch * 255).convert('L').save(fpath)
                                meta[subset].append((fpath, label))
                                index += 1
    with open(META_DIR + 'fp.json', 'w') as f:
        json.dump(meta, f)
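
# The patch extraction above relies on util.worldToVoxelCoord to map
# annotation coordinates (in mm) to voxel indices. A minimal sketch of the
# usual LUNA16-style conversion, assuming the same (z, y, x) ordering returned
# by load_itk_image; the project's own helper may differ.
import numpy as np

def worldToVoxelCoord(worldCoord, origin, spacing):
    """Convert world coordinates (mm) to (fractional) voxel indices."""
    stretchedVoxelCoord = np.absolute(worldCoord - origin)
    voxelCoord = stretchedVoxelCoord / spacing
    return voxelCoord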
def main():
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # DEFINE INPUTS AND LOAD DATA
    today_dt = datetime.date.today()
    yesterday_dt = today_dt - datetime.timedelta(days=1)
    dates_ds = pd.date_range(inputs.start_date, inputs.end_date)
    dates = [str(x.date()).replace('-', '/') for x in list(dates_ds)]
    overwrite = inputs.overwrite
    site = 'guardian'
    src = os.path.join(utilities.blm_dir, 'Google_CSE_Results',
                       site + '_articles.pkl')
    with open(src, 'rb') as f:
        dates_articles_ = pickle.load(f)
    interim_dir = os.path.join(utilities.blm_dir, 'z_Interim')
    utilities.mkdir(interim_dir)
    # Defined up front so the overwrite=False branch can read the cached file.
    dates_articles_dst = os.path.join(interim_dir,
                                      'articles_w_comments_urls.pkl')

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Pull comment URLs')
    if overwrite:
        base_url = 'https://www.theguardian.com/discussion'
        dates_articles = copy.deepcopy(dates_articles_)
        counter = 0
        for date, days_articles in dates_articles.items():
            for ix, article in enumerate(days_articles):
                try:
                    article_url = article['url'].strip().lower()
                    r = requests.get(article_url)
                    article_soup = bs(r.text)
                    comments_div = article_soup.find('div', {'id': 'comments'})
                    soup_id = comments_div.attrs['data-discussion-key']
                    comments_url = base_url + soup_id
                    dates_articles[date][ix]['comments_url'] = comments_url
                except:
                    dates_articles[date][ix]['comments_url'] = 'no comments'
        with open(dates_articles_dst, 'wb') as f:
            pickle.dump(dates_articles, f)
    else:
        with open(dates_articles_dst, 'rb') as f:
            dates_articles = pickle.load(f)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Remove articles with no comments')
    dates_articles2 = {}
    for date, days_articles in dates_articles.items():
        articles = []
        for article in days_articles:
            if article['comments_url'] != 'no comments':
                article_copy = copy.deepcopy(article)
                articles.append(article_copy)
        if len(articles) > 0:
            dates_articles2[date] = articles

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Scrape comments pages')
    if overwrite:
        dates_articles3 = {}
        counter = 0
        file_counter = 0
        for date, days_articles in dates_articles2.items():
            articles3 = []
            for article in days_articles:
                comments_url = article['comments_url']
                comments_li = []
                try:
                    comments_soup, comments = get_page_comments(comments_url)
                    comments_li.append(comments)
                    next_page_comments_url = get_next_page_url(comments_soup)
                    while next_page_comments_url is not None:
                        try:
                            next_page_comments_soup, next_page_comments = get_page_comments(
                                next_page_comments_url)
                            comments_li.append(next_page_comments)
                            next_page_comments_url = get_next_page_url(
                                next_page_comments_soup)
                        except:
                            next_page_comments_url = None
                except:
                    pass
                article3 = copy.deepcopy(article)
                article3['raw_comments'] = comments_li
                articles3.append(article3)
            if len(articles3) > 0:
                dates_articles3[date] = articles3
                counter += 1
            if counter >= 10:
                dst = os.path.join(utilities.blm_html_1pass_dir,
                                   site + str(file_counter) + '.pkl')
                with open(dst, 'wb') as f:
                    pickle.dump(dates_articles3, f)
                dates_articles3 = {}
                counter = 0
                file_counter += 1
        if counter > 0:
            dst = os.path.join(utilities.blm_html_1pass_dir,
                               site + str(file_counter + 1) + '.pkl')
            with open(dst, 'wb') as f:
                pickle.dump(dates_articles3, f)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Print outputs')
    recovered_articles_srcs = [
        x for x in utilities.get_files(utilities.blm_html_1pass_dir)
        if site in x
    ]
    recovered_articles = utilities.combine_dicts(recovered_articles_srcs)
    n_articles = utilities.count_articles(recovered_articles)
    print('Recovered %s on-topic articles with comments' % n_articles)
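
# The scraping loop above depends on two helpers, get_page_comments and
# get_next_page_url, that are not shown here. A minimal sketch of what they
# might look like with requests and BeautifulSoup; the CSS selectors for the
# Guardian discussion pages are assumptions and would need to be checked
# against the live markup.
import requests
from bs4 import BeautifulSoup as bs

def get_page_comments(comments_url):
    """Fetch one discussion page and return (soup, list of comment texts)."""
    r = requests.get(comments_url)
    soup = bs(r.text, 'html.parser')
    # 'd-comment__body' is an assumed selector, not verified against the
    # Guardian's current markup.
    comments = [div.get_text(strip=True)
                for div in soup.find_all('div', {'class': 'd-comment__body'})]
    return soup, comments

def get_next_page_url(soup):
    """Return the href of the pagination 'next' link, or None if absent."""
    link = soup.find('a', rel='next')
    return link.get('href') if link is not None else None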
def train(args):
    with open(args.hype) as f:
        H = json.load(f)
    H['subset'] = args.subset
    H['save_dir'] = FPR_DIR + 'subset' + str(H['subset'])
    mkdir(H['save_dir'])
    if args.gpu is not None:
        H['gpu'] = args.gpu
    with open(META_DIR + 'fp.json') as fpj:
        meta = json.load(fpj)
    # Leave-one-subset-out split: the chosen subset is held out for validation.
    dat = {'train': [], 'valid': []}
    for i in range(10):
        if i == args.subset:
            dat['valid'] = meta['subset' + str(i)]
        else:
            dat['train'] += meta['subset' + str(i)]
    tf.set_random_seed(2012310818)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(H['gpu'])
    gpu_options = tf.GPUOptions()
    gpu_options.allow_growth = True
    config = tf.ConfigProto(gpu_options=gpu_options)
    with tf.Session(config=config) as sess:
        (x, y, training, Xt, Yt, Xv, Yv, logits, loss, preds, opt, varst,
         gstep, train_opt, saver, fptrunk) = build(H, dat, sess)
        sess.run(tf.global_variables_initializer())
        fptrunk.start()
        if args.weight is not None:
            logging.info('Restoring from %s...' % args.weight)
            saver.restore(sess, args.weight)
        bsize = fptrunk.bsize
        train_batches = fptrunk.nbatches['train']
        valid_batches = fptrunk.nbatches['valid']
        for epoch in range(H['epochs']):
            tst = time.time()
            (tol_loss, tol_tfn, tol_tfp, tol_vfn, N, P, vN, vP, tol_vfp,
             tol_acc, tol_vacc) = [0.0] * 11
            for step in range(1, train_batches):
                curX, curY = sess.run([Xt, Yt])
                _, tloss, tpreds = sess.run(
                    [train_opt, loss, preds],
                    feed_dict={Xt: curX, Yt: curY, training: True})
                fn, fp = FPFN(curY, tpreds)
                N += np.sum(curY == 0)
                P += np.sum(curY == 1)
                tol_loss += tloss
                tol_tfn += fn
                tol_tfp += fp
                tol_acc += fp + fn
                if step % 100 == 0:
                    cnt = step * bsize
                    logstr = (
                        'Training batches %d, avg loss %f, acc %f, FN %d/%d, FP %d/%d.'
                        % (step, tol_loss / step, (cnt - tol_acc) / cnt,
                           tol_tfn, P - tol_tfn, tol_tfp, N - tol_tfp))
                    print(logstr)
                    logging.info(logstr)
            for step in range(valid_batches):
                curX, curY = sess.run([Xv, Yv])
                curY = curY.reshape(bsize, 1)
                tpreds = sess.run(
                    preds, feed_dict={Xt: curX, Yt: curY, training: False})
                fn, fp = FPFN(curY, tpreds)
                vN += np.sum(curY == 0)
                vP += np.sum(curY == 1)
                tol_vfn += fn
                tol_vfp += fp
                tol_vacc += fn + fp
            t = time.time() - tst
            logstr = ('epoch %d, time elapse %f, training loss %f,' +
                      ' valid avg FN %f, FP %f, acc %f.') % (
                          epoch + 1, t, tol_loss / train_batches,
                          float(tol_vfn) / vP, float(tol_vfp) / vN,
                          tol_vacc / valid_batches)
            print(logstr)
            logging.info(logstr)
            saver.save(sess, H['save_dir'] + '/save.ckpt', global_step=gstep)
        logging.info('training finished, try ending...')
        fptrunk.stop()
        logging.info('ended...')
        sess.close()
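
# train() tallies errors with an FPFN helper that is not defined in this file.
# A minimal sketch, assuming labels and predictions are 0/1 arrays of equal
# shape; the project's own FPFN may be implemented differently.
import numpy as np

def FPFN(labels, preds):
    """Count (false negatives, false positives) for binary labels."""
    labels = np.asarray(labels).ravel()
    preds = np.asarray(preds).ravel()
    fn = int(np.sum((labels == 1) & (preds == 0)))
    fp = int(np.sum((labels == 0) & (preds == 1)))
    return fn, fp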
import utilities
from utilities import preview
import os
import pandas as pd
import copy
import pickle
import keyring
import getpass
from googleapiclient.discovery import build

# Local modules referenced below: config.py holds the Google CSE credentials,
# and inputs defines the date range and overwrite flag.
import config
import inputs

dates_ds = pd.date_range(inputs.start_date, inputs.end_date)
dates = [str(x.date()).replace('-', '/') for x in list(dates_ds)]
overwrite = inputs.overwrite
site = 'guardian'
dst_dir = os.path.join(utilities.blm_dir, 'Google_CSE_Results')
utilities.mkdir(dst_dir)
res_dst = os.path.join(dst_dir, site + '_res_li.pkl')

def main():
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Use google custom search api to retrieve on-topic articles for selected site')
    # indicate the site on the google custom search api control panel
    # enter credentials from config.py file
    # if this file doesn't exist, create one and define google_custom_search_cx
    # and developerKey variables
    cx = config.google_custom_search_cx
    developerKey = config.google_custom_search_developer_key
    service = build("customsearch", "v1", developerKey=developerKey)
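
    # The service object is then used to page through search results. A short
    # hedged example of how a query might be issued with the Custom Search
    # API; the query string, helper name, and fixed 10-results-per-page paging
    # are illustrative assumptions, not taken from this script.
    def search_query(service, cx, query, pages=1):
        """Run a Custom Search query and return the raw result items."""
        items = []
        for page in range(pages):
            res = service.cse().list(q=query,
                                     cx=cx,
                                     start=1 + page * 10).execute()
            items.extend(res.get('items', []))
        return items

    # Example: articles mentioning the topic for one date string.
    # res_items = search_query(service, cx, 'black lives matter 2020/06/01')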
    if (file_format == 'json') & (write_mode == 'w'):
        json.dump(d, f)
    elif (file_format == 'pkl') & (write_mode == 'wb'):
        pickle.dump(d, f)
    else:
        raise ValueError(
            "File format or write mode incorrect.\n"
            "Options are 1) 'json', 'w' and 2) 'pkl', 'wb'")

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# DEFINE INPUTS
site = 'breitbart'
dst_dir = os.path.join(utilities.blm_processed_parsed_dir, '2nd_iteration')
utilities.mkdir(dst_dir)  # make dst dir if it doesn't exist

def main():
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Parse comments')
    # combine scraped blm data
    blm_srcs = utilities.get_files(
        utilities.blm_html_1pass_dir) + utilities.get_files(
            utilities.blm_html_2pass_dir)
    blm_srcs = [x for x in blm_srcs if site in x]
    blm = utilities.combine_dicts(blm_srcs)
    # sort by date
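
# utilities.combine_dicts merges the per-chunk pickle files back into one
# date-keyed dictionary. A minimal sketch, assuming each source pickle maps
# dates to lists of article dicts and that lists for repeated dates should be
# concatenated; the project's own helper may resolve duplicates differently.
import pickle

def combine_dicts(srcs):
    """Merge date -> list-of-articles dictionaries from several pickle files."""
    combined = {}
    for src in srcs:
        with open(src, 'rb') as f:
            chunk = pickle.load(f)
        for date, articles in chunk.items():
            combined.setdefault(date, []).extend(articles)
    return combined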
def main():
    savepath = 'train2_img/'
    utilities.mkdir(savepath)
    lmdb_file = 'train2'
    read_lmdb(lmdb_file, savepath)
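
# read_lmdb is not shown here. A minimal sketch under the assumption that the
# LMDB values are serialized Caffe Datum protobufs holding uint8 image data;
# if the dataset was written differently, the decoding step must change.
import os
import lmdb
import numpy as np
from PIL import Image
from caffe.proto import caffe_pb2  # assumes a Caffe-style LMDB

def read_lmdb(lmdb_file, savepath):
    """Dump every record of a Caffe-style LMDB to individual image files."""
    env = lmdb.open(lmdb_file, readonly=True)
    with env.begin() as txn:
        for key, value in txn.cursor():
            datum = caffe_pb2.Datum()
            datum.ParseFromString(value)
            arr = np.frombuffer(datum.data, dtype=np.uint8)
            arr = arr.reshape(datum.channels, datum.height, datum.width)
            # Save the first channel as a grayscale image named after the key.
            Image.fromarray(arr[0]).save(
                os.path.join(savepath, key.decode() + '.bmp'))
    env.close()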
    parser.add_argument('--test', dest='test', metavar='method', type=str,
                        default="wilcox",
                        help="set method wilcox,t,default[wilcox]")
    parser.set_defaults(paired=False)
    args = parser.parse_args()
    params = vars(args)
    params['paired'] = judge(params['paired'])
    return params

if __name__ == '__main__':
    params = read_params(sys.argv)
    bin_defdir = '%s/02.taxon' % const.bin_defdir
    out_dir = params["out_dir"]
    profile_table = params["profile_table"]
    group_file = params["group_file"]
    use_mothed = params["test"]
    mkdir(out_dir)
    env = Environment(loader=FileSystemLoader(bin_defdir), autoescape=False)
    template = env.get_template("g11_diff.R")
    Rtext = template.render(tool_default_dir=const.tool_defdir,
                            profile_table=profile_table,
                            group_file=group_file,
                            out_dir=out_dir,
                            mothed=use_mothed,
                            p_cutoff=params["cutoff"],
                            fdr=params["fdr"],
                            paired=params['paired'])
    with open("%s/diff.R" % out_dir, "w") as fqw:
        fqw.write(Rtext)
    Rrun("%s/diff.R" % out_dir)
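
# Rrun executes the rendered R script. A minimal sketch, assuming it simply
# shells out to Rscript and fails loudly on a non-zero exit code; the
# project's own Rrun may instead use an R binary path from the const module.
import subprocess

def Rrun(script_path):
    """Execute an R script with Rscript, raising if the script errors."""
    subprocess.run(['Rscript', script_path], check=True)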
            del draw
            im.save(os.path.join(pimg, img_name[:-4] + '_gt.bmp'))
            cur = {
                "image_path": subset + '/' + img_name,
                "rects": [{
                    "x1": x1,
                    "x2": x2,
                    "y1": y1,
                    "y2": y2,
                }]
            }
            samples[subset].append(cur)
    for key in samples:
        with open(META_DIR + key + '.json', 'w') as f:
            json.dump(samples[key], f)

if __name__ == '__main__':
    dirs = [
        SAMPLE_DIR,
        META_DIR,
        FPR_DIR,
    ] + [SAMPLE_DIR + 'subset' + str(i) for i in range(10)]
    for d in dirs:
        util.mkdir(d)
    data_map = util.readImageMap(ANNOTATION_CSV)
    generate_data(TRUNK_DIR, data_map, SAMPLE_DIR)
    # generate_scan_image(DATA_ROOT, OUTPUT_DIR)
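
# util.readImageMap builds the data_map consumed by generate_data: series UID
# keys with a list of (world coordinate, label) pairs per scan. A minimal
# sketch, assuming a LUNA16 candidates-style CSV with columns
# seriesuid, coordX, coordY, coordZ, class; the real annotation file, column
# names, and coordinate ordering may differ.
import csv
import numpy as np

def readImageMap(csv_path):
    """Map seriesuid -> list of ((z, y, x) world coordinate, label) pairs."""
    data_map = {}
    with open(csv_path) as f:
        for row in csv.DictReader(f):
            world = np.array([float(row['coordZ']),
                              float(row['coordY']),
                              float(row['coordX'])])
            label = int(row['class'])
            data_map.setdefault(row['seriesuid'], []).append((world, label))
    return data_map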
def eliminate(args):
    with open(args.hype) as f:
        H = json.load(f)
    H['subset'] = args.subset
    if args.gpu is not None:
        H['gpu'] = args.gpu
    H['epoch'] = args.epoch
    H['weights'] = args.weights
    H['fpepoch'] = args.fpepoch
    H['save_dir'] = 'data/output.eliminate/' + 'subset' + str(H['subset'])
    mkdir(H['save_dir'])
    os.environ['CUDA_VISIBLE_DEVICES'] = str(H['gpu'])
    gpu_options = tf.GPUOptions()
    gpu_options.allow_growth = True
    config = tf.ConfigProto(gpu_options=gpu_options)
    tf.set_random_seed(2012310818)
    with tf.Session(config=config) as sess:
        xv = tf.placeholder(tf.float32, shape=[1, 64, 64, 1])
        logits, pred = model(H, xv, training=True)
        saver = tf.train.Saver(max_to_keep=None)
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, H['weights'])
        voxelW = 65
        SUBSET_DIR = DETECT_DIR + 'subset' + str(H['subset']) + '/'
        with open(SUBSET_DIR + 'result_' + str(H['epoch']) + '.json') as f:
            detects = json.load(f)
        i = 0
        for it in detects:
            boxes = it['box']
            if len(boxes) > 0:
                img = cv2.imread(SAMPLE_DIR + it['file'], 0).astype(np.float32)
                rboxes = []
                for box in boxes:
                    x, y = int((box[0] + box[2]) / 2), int((box[1] + box[3]) / 2)
                    if x - voxelW // 2 < 0:
                        x = 0
                    if x + voxelW // 2 >= img.shape[1]:
                        x = img.shape[1] - voxelW
                    if y - voxelW // 2 < 0:
                        y = 0
                    if y + voxelW // 2 >= img.shape[0]:
                        y = img.shape[0] - voxelW
                    patch = img[y:y + voxelW - 1, x:x + voxelW - 1]
                    y_, logits_ = sess.run(
                        [pred, logits],
                        feed_dict={xv: patch.reshape(1, 64, 64, 1)})
                    if y_ == 1:
                        rboxes.append(box)
                    cv2.imwrite(str(i) + '.bmp', patch)
                    i = i + 1
                    print(logits_, y_)
                it['box'] = rboxes
        generate_result(TRUNK_DIR, detects,
                        SUBSET_DIR + str(H['epoch']) + '.csv', 0.1)
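
# For reference, a hedged sketch of the argparse wiring that could drive
# eliminate(); the flag names are inferred from the attributes the function
# reads off args (hype, subset, gpu, epoch, weights, fpepoch) and the defaults
# are placeholders, so the project's actual CLI may differ.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--hype', default='hypes/default.json')
    parser.add_argument('--subset', type=int, default=0)
    parser.add_argument('--gpu', type=int, default=None)
    parser.add_argument('--epoch', type=int, default=0)
    parser.add_argument('--weights', default=None)
    parser.add_argument('--fpepoch', type=int, default=0)
    eliminate(parser.parse_args())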