def main(args):
    if FLAGS.cfg_file:
        print('loading config setting')
        cfg_from_file(FLAGS.cfg_file, cfg)
    cfg.MAX_STEP = 50
    cfg.BATCH_SIZE = 1
    cfg.TRAIN_QUEUE_CAPACITY = 10

    if not os.path.isdir(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)

    logger = log_helper.get_logger()
    data_pipeline = TFLoadingPipeline(cfg, logger, shuffle=True)
    data_pipeline.setup(FLAGS.sample_path, FLAGS.label_path, cfg.BATCH_SIZE,
                        cfg.TRAIN_QUEUE_CAPACITY)

    with tf.Session() as sess:
        data_pipeline.start(sess)
        for step in xrange(cfg.MAX_STEP):
            image_batch, label_batch = data_pipeline.load_batch()
            logger.info('output {}th image for validation'.format(step))
            out_fname = '{}/{}.png'.format(FLAGS.output_dir, step)

            # OpenCV expects BGR channel order, so swap R and B before writing
            image = image_batch[0].astype(np.uint8)
            r, g, b = cv2.split(image)
            image = cv2.merge((b, g, r))

            # paint labelled pixels into the green channel and blend for a visual check
            label = label_batch[0].astype(np.uint8)
            mask = image.copy()
            mask[:, :, 1][label[:, :, 0] > 0] = 255
            overlay = cv2.addWeighted(image, 0.5, mask, 0.5, 0)
            cv2.imwrite(out_fname, overlay)
        data_pipeline.shutdown()
def crawl(self):
    """
    The crawl task executed by one worker process.

    Workflow:
    1. Take a URL from the shared URL queue; if the queue is empty, use a
       search engine to obtain more seed URLs.
    2. Fetch the page response with request_url.
    3. Parse the page with the parsing functions in ParseHelper.
    4. Store the data in MongoDB.
    5. Push the URLs extracted from the page into the shared queue.
    6. Write log entries.

    [Possible improvement: separate the URL-request part from the follow-up
    processing, i.e. split (1, 2) from (3, 4, 5), and use asynchronous HTTP
    requests to crawl more efficiently.]

    :parameter logger: logging object; the default filter level is logging.INFO
    :return: None
    """
    queue = get_queue_object(self.queue_type)
    pipline = get_pipline_object(self.pipline_type)
    logger = get_logger('blockchain_spider', to_file=True, filename='spider')

    while True:
        url = queue.get_url_from_queue()
        response = request_url(url, timeout=self.timeout)
        first_parsed_data = ParseHelper.first_parse_response(response,
                                                             keyword=self.keyword)
        new_urls = first_parsed_data['urls'] if first_parsed_data else None
        pipline.save_html_data(first_parsed_data)
        url_amount = queue.put_urls_in_queue(new_urls)
        logger.info(f"{url} has been crawled.")
        if url_amount:
            logger.info(f"There are {url_amount} urls in queue now.")
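# The docstring above suggests splitting the fetch steps (1, 2) from the
# parse/store steps (3, 4, 5). Below is a minimal, hypothetical sketch of that
# split using asyncio + aiohttp; the queue handoff and the handle_page callback
# (standing in for ParseHelper and the storage pipeline) are assumptions, not
# part of this project, and re-enqueueing newly discovered URLs is omitted.
import asyncio
import aiohttp


async def fetch_worker(session, url_queue, page_queue):
    # stage (1, 2): pull URLs from the queue and fetch pages concurrently
    while True:
        url = await url_queue.get()
        try:
            async with session.get(url) as resp:
                page_queue.put_nowait((url, await resp.text()))
        finally:
            url_queue.task_done()


async def process_worker(page_queue, handle_page):
    # stage (3, 4, 5): parse and store each fetched page via a user callback
    while True:
        url, html = await page_queue.get()
        handle_page(url, html)
        page_queue.task_done()


async def crawl_async(seed_urls, handle_page, n_fetchers=8):
    url_queue, page_queue = asyncio.Queue(), asyncio.Queue()
    for url in seed_urls:
        url_queue.put_nowait(url)
    async with aiohttp.ClientSession() as session:
        fetchers = [asyncio.create_task(fetch_worker(session, url_queue, page_queue))
                    for _ in range(n_fetchers)]
        processor = asyncio.create_task(process_worker(page_queue, handle_page))
        await url_queue.join()   # all fetches finished
        await page_queue.join()  # all pages processed
        for task in fetchers + [processor]:
            task.cancel()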
def main(args):
    if FLAGS.cfg_file:
        print('loading config setting')
        cfg_from_file(FLAGS.cfg_file, cfg)
    print_config(cfg)

    logger = log_helper.get_logger()
    logger.info("show information about {}:".format(FLAGS.model))

    if FLAGS.model == 'res50':
        model = Res50DispNet(cfg, logger)
    else:
        logger.error('wrong model type: {}'.format(FLAGS.model))
        sys.exit(-1)
def main():
    args = parser.parse_args()

    image_list = []
    with open(os.path.join(args.in_samples, args.in_list)) as f:
        for line in f:
            image_list.append(
                os.path.join(args.in_samples, line.split('.png')[-2] + '.png'))

    logger = log_helper.get_logger()

    begin_ts = time.time()
    total_time_elapsed, eval_rets = eval_with_model(
        args.model_file, image_list, args.in_samples, args.out_infer,
        args.out_eval, args.max_pixel_dis, logger)
    end_ts = time.time()

    logger.info("total pipeline time elapsed: {} s".format(end_ts - begin_ts))
    logger.info("total infer time elapsed: {} s".format(total_time_elapsed))
    ave_time_elapsed = total_time_elapsed / len(image_list)
    logger.info("average infer time elapsed: {} s".format(ave_time_elapsed))

    # eval_rets is flat with 4 entries per image: indices 0-2 are separate
    # sub-results, index 3 is the overall result
    for i in range(3):
        eval_rets_i = [
            eval_rets[x * 4 + i] for x in range(len(eval_rets) // 4)
        ]
        ag_ret = dict(count=len(eval_rets_i),
                      metrics=calculate(aggregate_results(eval_rets_i)),
                      commpare_list=args.in_list,
                      out_folder=args.out_eval)
        json.dump(ag_ret,
                  open(os.path.join(args.out_eval, str(i), 'L4E_result.json'), 'w'),
                  indent=2)

    eval_rets_overall = [
        eval_rets[x * 4 + 3] for x in range(len(eval_rets) // 4)
    ]
    ag_ret = dict(count=len(eval_rets_overall),
                  metrics=calculate(aggregate_results(eval_rets_overall)),
                  commpare_list=args.in_list,
                  out_folder=args.out_eval)
    json.dump(ag_ret,
              open(os.path.join(args.out_eval, 'overall', 'L4E_result.json'), 'w'),
              indent=2)
import json

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

from utils import file_helper
from utils import scikit_ml_helper
from processors.processor import Processor
from utils import log_helper

log = log_helper.get_logger("AmazonLineProcessorTFIDF")


class AmazonLineProcessorTfIdf(Processor):

    def __init__(self, labeled_articles_source_file_path, doc2vec_model_file_path,
                 ml_model_file_path, articles_source_file_path, shuffle_count,
                 classification_sources_file_path):
        self.labeled_articles_file_path = labeled_articles_source_file_path
        self.articles_source_file_path = articles_source_file_path
        self.doc2vec_model_file_path = doc2vec_model_file_path
        self.ml_model_file_path = ml_model_file_path
        self.shuffle_count = shuffle_count
        self.classification_sources_file_path = classification_sources_file_path

    def process(self):
        log.info("Commencing execution")

        with open(self.classification_sources_file_path) as source_cfg:
            sources_dict = json.load(source_cfg)
"/home/v2john/Dropbox/Personal/Academic/Masters/UWaterloo/Academics/" + \ "ResearchProject/Veriday/2class/models/doc2vec.model" ml_model_path = \ "/home/v2john/Dropbox/Personal/Academic/Masters/UWaterloo/Academics/" + \ "ResearchProject/Veriday/2class/models/ml.model.d2v.logreg" veriday_articles_path = \ "/home/v2john/Dropbox/Personal/Academic/Masters/UWaterloo/Academics/" + \ "ResearchProject/Veriday/annotated/all_articles.json" veriday_predicted_articles_path = \ "/home/v2john/Dropbox/Personal/Academic/Masters/UWaterloo/Academics/" + \ "ResearchProject/Veriday/annotated/all_articles_predicted.json" log = log_helper.get_logger("VeridayPredict2Class") def load_models(): doc2vec_model = Doc2Vec.load(doc2vec_model_path) ml_model = scikit_ml_helper.get_model_from_disk(ml_model_path) return doc2vec_model, ml_model log.info("Begun execution") doc2vec_model, ml_model = load_models() log.info("Models loaded") with open(veriday_articles_path) as veriday_articles_file: veriday_articles = json.load(veriday_articles_file)
def main(args):
    if FLAGS.cfg_file:
        print('loading config setting')
        cfg_from_file(FLAGS.cfg_file, cfg)
    print_config(cfg)

    output_path = FLAGS.output_path
    mask_path = FLAGS.mask_path
    if not os.path.isdir(output_path):
        os.makedirs(output_path)
    if not os.path.isdir(mask_path):
        os.makedirs(mask_path)

    image_h = cfg.IMAGE_HEIGHT
    image_w = cfg.IMAGE_WIDTH

    logger = log_helper.get_logger()

    # We use our "load_graph" function
    logger.info("accessing tf graph")
    graph = load_graph(FLAGS.graph_name)

    if FLAGS.verbose:
        # We can verify that we can access the list of operations in the graph
        for op in graph.get_operations():
            logger.info(op.name)
            # prefix/Placeholder/inputs_placeholder
            # ...
            # prefix/Accuracy/predictions

    # We access the input and output nodes
    input_img = graph.get_tensor_by_name('import/input/image:0')
    pred = graph.get_tensor_by_name('import/output/prob:0')

    # launch a Session
    with tf.Session(graph=graph) as sess:
        total_time_elapsed = 0.0

        for image, fname in instance_generator(FLAGS.sample_path):
            logger.info("predicting for {}".format(fname))

            begin_ts = time.time()
            feed_dict = {
                input_img: image[np.newaxis],
            }
            # Note: we didn't initialize/restore anything, everything is stored in the graph_def
            prediction = sess.run(pred, feed_dict=feed_dict)
            end_ts = time.time()
            logger.info("cost time: {} s".format(end_ts - begin_ts))
            total_time_elapsed += end_ts - begin_ts

            # output_image to verify
            output_fname = output_path + "/" + os.path.basename(fname)
            pred_img = np.reshape(prediction, (image_h, image_w, cfg.NUM_CLASSES))
            pred_prob = genPredProb(pred_img, cfg.NUM_CLASSES)
            ret = cv2.imwrite(output_fname, pred_prob)
            if not ret:
                logger.error('writing image to {} failed!'.format(output_fname))
                sys.exit(-1)

            # masking image
            mask_fname = mask_path + "/" + os.path.basename(fname)
            r, g, b = cv2.split(image.astype(np.uint8))
            cv_img = cv2.merge([b, g, r])
            masked = image_process.prob_mask(cv_img, pred_prob)
            ret = cv2.imwrite(mask_fname, masked)
            if not ret:
                logger.error('writing image to {} failed!'.format(mask_fname))
                sys.exit(-1)

    print("total time elapsed: {} s".format(total_time_elapsed))
from sklearn import model_selection, linear_model, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import xgboost as xgb
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

from processors.processor import Processor
from utils import file_helper
from utils import log_helper
from utils.ml_helper import train_xgboost_regressor
from utils.evaluation_helper import evaluate_task_score

log = log_helper.get_logger("TFIDFProcessor")


class TFIDFProcessor(Processor):

    def process(self):
        log.info("Began Processing")

        if self.options.validate:
            x_train_articles, y_train = file_helper.get_article_details(
                self.options.train_headlines_data_path)
            x_test_articles, y_test = file_helper.get_article_details(
                self.options.test_headlines_data_path)

            log.info("Extracting articles and scores")
            x_train_articles.extend(x_test_articles)
            y_train.extend(y_test)

            vectorizer = TfidfVectorizer(sublinear_tf=True,
import json

from gensim.models.doc2vec import TaggedDocument
from nltk import sent_tokenize, word_tokenize

from utils.options import Options
from utils import log_helper

log = log_helper.get_logger("ReviewFile_Helper")


def parse_review_file():
    """
    Parses the input review file
    :return: a list of TaggedDocs for Doc2Vec and a dict of scores
    """
    tagged_reviews = list()
    rating_dict = dict()

    for review in open(Options.options.input_file_path):
        identifier, tagged_review, rating = parse_review(json.loads(review))
        tagged_reviews.append(tagged_review)
        rating_dict[identifier] = rating

    return tagged_reviews, rating_dict


def parse_review(review):
    """
    :param review: JSON object containing an Amazon review
from gensim.models.doc2vec import TaggedLineDocument

from processors.processor import Processor
from utils import doc2vec_helper
from utils import log_helper
from utils import scikit_ml_helper
from sklearn import metrics

log = log_helper.get_logger("FactCheckProcessor")


class FactCheckProcessorDocvec(Processor):

    def __init__(self, labeled_articles_source_file_path, doc2vec_model_file_path,
                 ml_model_file_path, articles_source_file_path, shuffle_count,
                 classification_sources_file_path):
        self.labeled_articles_file_path = labeled_articles_source_file_path
        self.articles_source_file_path = articles_source_file_path
        self.doc2vec_model_file_path = doc2vec_model_file_path
        self.ml_model_file_path = ml_model_file_path
        self.shuffle_count = shuffle_count
        self.classification_sources_file_path = classification_sources_file_path
        self.samples_per_class_train = 680
        self.samples_per_class_test = 50

    def process(self):
        log.info("Commencing execution")

        tagged_docs = TaggedLineDocument(self.labeled_articles_file_path)

        log.info("Training Doc2Vec model")
from sklearn import metrics

from entities.fpb_tagged_line_document import FPBTaggedLineDocument
from processors.processor import Processor
from utils import doc2vec_helper
from utils import evaluation_helper
from utils import file_helper
from utils import log_helper
from utils import ml_helper

log = log_helper.get_logger("FPBDocvecProcessor")


class FPBDocvecProcessor(Processor):

    def process(self):
        log.info("Began Processing")

        fpb_training_docs = FPBTaggedLineDocument(
            self.options.fpb_sentences_file_path)

        doc2vec_model = \
            doc2vec_helper.init_model(
                fpb_training_docs, self.options.docvec_dimension_size,
                self.options.docvec_iteration_count
            )
        log.info("Doc2vec model initialized with " +
                 str(self.options.docvec_dimension_size) + " dimensions and " +
                 str(self.options.docvec_iteration_count) + " iterations")

        label_list = fpb_training_docs.get_label_list()

        log.info("Re-training document vectors")
        x_train = list()
from processors.processor import Processor
from utils import log_helper, file_helper, doc2vec_helper, scikit_ml_helper

log = log_helper.get_logger("AmazonProcessor")


class AmazonProcessor(Processor):

    def __init__(self, labeled_articles_source_file_path, doc2vec_model_file_path,
                 ml_model_file_path, articles_source_file_path):
        self.labeled_articles_file_path = labeled_articles_source_file_path
        self.articles_source_file_path = articles_source_file_path
        self.doc2vec_model_file_path = doc2vec_model_file_path
        self.ml_model_file_path = ml_model_file_path
        self.shuffle_count = 5

    def process(self):
        log.info("Commencing execution")

        # Get tagged articles from Veriday
        log.info("Getting tagged Veriday articles ... ")
        veriday_articles_raw = file_helper.get_articles_list(
            self.articles_source_file_path)
        veriday_tagged_articles = doc2vec_helper.get_tagged_articles_veriday(
            veriday_articles_raw)

        log.info("Getting tagged Amazon reviews ... ")
        tagged_articles, sentiment_scores_dict = \
            doc2vec_helper.get_tagged_amazon_reviews(self.labeled_articles_file_path)
import json

import sklearn
from sklearn.feature_extraction.text import CountVectorizer

from utils import file_helper
from utils import scikit_ml_helper
from processors.processor import Processor
from utils import log_helper

log = log_helper.get_logger("AmazonLineProcessorBigram")


class AmazonLineProcessorBigram(Processor):

    def __init__(self, labeled_articles_source_file_path, doc2vec_model_file_path,
                 ml_model_file_path, articles_source_file_path, shuffle_count,
                 classification_sources_file_path):
        self.labeled_articles_file_path = labeled_articles_source_file_path
        self.articles_source_file_path = articles_source_file_path
        self.doc2vec_model_file_path = doc2vec_model_file_path
        self.ml_model_file_path = ml_model_file_path
        self.shuffle_count = shuffle_count
        self.classification_sources_file_path = classification_sources_file_path

    def process(self):
        log.info("Commencing execution")

        with open(self.classification_sources_file_path) as source_cfg:
            sources_dict = json.load(source_cfg)
def main(args):
    checkArgs()
    if FLAGS.cfg_file:
        print('loading config setting')
        cfg_from_file(FLAGS.cfg_file, cfg)

    if FLAGS.stereo_path != '':
        cfg.DO_STEREO = True
    else:
        cfg.DO_STEREO = False

    base_path = None
    title_str = "{:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}".format(
        'ratio', 'abs_rel_i', 'sq_rel_i', 'rmse_i', 'rmse_log_i', 'd1_all_i',
        'a1_i', 'a2_i', 'a3_i', 'abs_rel', 'sq_rel', 'rmse', 'rmse_log',
        'd1_all', 'a1', 'a2', 'a3')
    if FLAGS.base_path != '':
        base_path = FLAGS.base_path
        title_str = "{:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}".format(
            'ratio', 'abs_rel', 'sq_rel', 'rmse', 'rmse_log', 'd1_all', 'a1',
            'a2', 'a3', 'abs_rel_b', 'sq_rel_b', 'rmse_b', 'rmse_log_b',
            'd1_all_b', 'a1_b', 'a2_b', 'a3_b')

    stereo_path = FLAGS.stereo_path if cfg.DO_STEREO else None

    cfg.BATCH_SIZE = 1
    if FLAGS.do_pp and not cfg.DO_STEREO:
        cfg.BATCH_SIZE = 2
    print_config(cfg)

    if FLAGS.output_path != '':
        output_path = FLAGS.output_path
        if not os.path.isdir(output_path):
            os.mkdir(output_path)

    logger = log_helper.get_logger()
    if FLAGS.model == 'res50':
        model = Res50DispNet(cfg, logger)
    else:
        logger.error('wrong model type: {}'.format(FLAGS.model))
        sys.exit(-1)

    # get moving avg
    if FLAGS.use_avg:
        variable_averages = tf.train.ExponentialMovingAverage(
            cfg.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
    else:
        saver = tf.train.Saver(model.all_variables)

    total_time_elapsed = 0
    with tf.Session() as sess:
        # restore model
        logger.info("restoring model ......")
        saver.restore(sess, FLAGS.ckpt_path)

        rate_list = []
        rmse_inter_list = []
        rmse_log_inter_list = []
        abs_rel_inter_list = []
        sq_rel_inter_list = []
        d1_all_inter_list = []
        a1_inter_list = []
        a2_inter_list = []
        a3_inter_list = []
        rmse_list = []
        rmse_log_list = []
        abs_rel_list = []
        sq_rel_list = []
        d1_all_list = []
        a1_list = []
        a2_list = []
        a3_list = []

        for image, label, fname in instance_label_generator(
                FLAGS.sample_path, FLAGS.label_path, cfg.IMAGE_WIDTH,
                cfg.IMAGE_HEIGHT, FLAGS.do_pp, stereo_path, base_path=base_path):
            if cfg.DO_STEREO:
                sample_name = fname[0]
                stereo_name = fname[1]
                logger.info("testing for {} & {}".format(fname[0], fname[1]))
                feed_dict = {
                    model.left_image: image[0],
                    model.right_image: image[1]
                }
                fname = sample_name
            else:
                logger.info("testing for {}".format(fname))
                if base_path is None:
                    feed_dict = {model.left_image: image}
                else:
                    feed_dict = {model.left_image: image[0]}

            begin_ts = time.time()
            pre_disp = sess.run(model.left_disparity[0], feed_dict=feed_dict)
            end_ts = time.time()
            logger.info("cost time: {} s".format(end_ts - begin_ts))
            total_time_elapsed += end_ts - begin_ts

            if FLAGS.do_pp and not cfg.DO_STEREO:
                disp = post_process_disparity(pre_disp.squeeze())
            else:
                disp = pre_disp[0].squeeze()

            base_disp = None if base_path is None else image[-1]
            width = label.shape[1]
            focal = KITTI_FOCAL[width]
            base = KITTI_BASE
            rate, d1_all_inter, abs_rel_inter, sq_rel_inter, rmse_inter, \
                rmse_log_inter, a1_inter, a2_inter, a3_inter, d1_all, abs_rel, \
                sq_rel, rmse, rmse_log, a1, a2, a3 = depth_metrics(
                    label, disp, focal, base, base_disp)

            print(title_str)
            print(
                "{:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}"
                .format(rate, abs_rel_inter, sq_rel_inter, rmse_inter,
                        rmse_log_inter, d1_all_inter, a1_inter, a2_inter,
                        a3_inter, abs_rel, sq_rel, rmse, rmse_log, d1_all,
                        a1, a2, a3))

            rate_list.append(rate)
            rmse_inter_list.append(rmse_inter)
            rmse_log_inter_list.append(rmse_log_inter)
            abs_rel_inter_list.append(abs_rel_inter)
            sq_rel_inter_list.append(sq_rel_inter)
            d1_all_inter_list.append(d1_all_inter)
            a1_inter_list.append(a1_inter)
            a2_inter_list.append(a2_inter)
            a3_inter_list.append(a3_inter)
            rmse_list.append(rmse)
            rmse_log_list.append(rmse_log)
            abs_rel_list.append(abs_rel)
            sq_rel_list.append(sq_rel)
            d1_all_list.append(d1_all)
            a1_list.append(a1)
            a2_list.append(a2)
            a3_list.append(a3)

            # output_image to verify
            if FLAGS.output_path != '':
                if FLAGS.do_pp and not cfg.DO_STEREO:
                    output_fname = output_path + "/pp_" + os.path.basename(fname)
                else:
                    output_fname = output_path + "/" + os.path.basename(fname)
                plt.imsave(output_fname, disp, cmap=plt.cm.gray)

        rate_mean = np.array(rate_list).mean()
        rmse_inter_mean = np.array(rmse_inter_list).mean()
        rmse_log_inter_mean = np.array(rmse_log_inter_list).mean()
        abs_rel_inter_mean = np.array(abs_rel_inter_list).mean()
        sq_rel_inter_mean = np.array(sq_rel_inter_list).mean()
        d1_all_inter_mean = np.array(d1_all_inter_list).mean()
        a1_inter_mean = np.array(a1_inter_list).mean()
        a2_inter_mean = np.array(a2_inter_list).mean()
        a3_inter_mean = np.array(a3_inter_list).mean()
        rmse_mean = np.array(rmse_list).mean()
        rmse_log_mean = np.array(rmse_log_list).mean()
        abs_rel_mean = np.array(abs_rel_list).mean()
        sq_rel_mean = np.array(sq_rel_list).mean()
        d1_all_mean = np.array(d1_all_list).mean()
        a1_mean = np.array(a1_list).mean()
        a2_mean = np.array(a2_list).mean()
        a3_mean = np.array(a3_list).mean()

        print("============total metric============")
        print(title_str)
        print(
            "{:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}"
            .format(rate_mean, abs_rel_inter_mean, sq_rel_inter_mean,
                    rmse_inter_mean, rmse_log_inter_mean, d1_all_inter_mean,
                    a1_inter_mean, a2_inter_mean, a3_inter_mean, abs_rel_mean,
                    sq_rel_mean, rmse_mean, rmse_log_mean, d1_all_mean,
                    a1_mean, a2_mean, a3_mean))

    print("total time elapsed: {} s".format(total_time_elapsed))
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from utils import log_helper

log = log_helper.get_logger("EvaluationHelper")


def evaluate_task_score(y_true, y_pred):
    cosine_smty = \
        cosine_similarity(np.array(y_pred).reshape(1, -1),
                          np.array(y_true).reshape(1, -1))[0][0]
    log.info("Cosine Similarity: " + str(cosine_smty))

    return cosine_smty
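# Illustrative (hypothetical) call: the whole prediction list is treated as a
# single vector, so the value returned is the cosine similarity between the
# prediction vector and the gold-score vector, in [-1, 1]. The numbers below
# are made up for illustration only.
if __name__ == '__main__':
    gold = [0.5, -0.2, 0.8, 0.1]
    pred = [0.4, -0.1, 0.7, 0.0]
    print(evaluate_task_score(gold, pred))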
from sklearn import linear_model, svm
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

from utils import log_helper

log = log_helper.get_logger("ML_Helper")


def train_linear_model(x, y):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                        random_state=0)

    linear_reg_model = linear_model.LinearRegression()
    linear_reg_model.fit(x_train, y_train)
    # note: .score() of a regressor returns R^2 on the held-out split
    log.info("Linear Regression accuracy: " +
             str(linear_reg_model.score(x_test, y_test)))

    return linear_reg_model


def train_svm(x, y):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                        random_state=0)

    svm_regressor = svm.LinearSVR()
    svm_regressor.fit(x_train, y_train)
    log.info("SVR accuracy: " + str(svm_regressor.score(x_test, y_test)))

    return svm_regressor
from sklearn import model_selection, linear_model, svm
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor

from processors.processor import Processor
from utils import file_helper
from utils import log_helper
from utils.evaluation_helper import evaluate_task_score
from utils.ml_helper import train_xgboost_regressor

log = log_helper.get_logger("BigramProcessor")

min_ngram_range = range(1, 3)
max_ngram_range = range(1, 3)


class BigramProcessor(Processor):

    def process(self):
        log.info("Began Processing")

        if self.options.validate:
            x_train_articles, y_train = file_helper.get_article_details(
                self.options.train_headlines_data_path)
            x_test_articles, y_test = file_helper.get_article_details(
                self.options.test_headlines_data_path)

            log.info("Extracting articles and scores")
            x_train_articles.extend(x_test_articles)
            y_train.extend(y_test)
from gensim.models.doc2vec import Doc2Vec

from utils.options import Options
from utils import log_helper

log = log_helper.get_logger("Doc2Vec_Helper")


def init_doc2vec_model(tagged_reviews):
    model = Doc2Vec(min_count=25, iter=50, workers=6, size=1000)
    model.build_vocab(tagged_reviews)

    return model


def train_doc2vec_model(doc2vec_model, tagged_reviews):
    shuffle_count = Options.doc2vec_training_count
    for i in range(shuffle_count):
        log.info("Shuffles left: " + str(shuffle_count - i))
        doc2vec_model.train(tagged_reviews)
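# The size/iter keyword arguments and the bare train(tagged_reviews) call
# match the older gensim (< 1.0) API this helper was written against. A rough
# gensim 4.x equivalent is sketched below as an assumption, not project code:
# vector_size/epochs replace size/iter, and train() needs explicit
# total_examples/epochs.


def init_doc2vec_model_v4(tagged_reviews):
    model = Doc2Vec(min_count=25, epochs=50, workers=6, vector_size=1000)
    model.build_vocab(tagged_reviews)
    return model


def train_doc2vec_model_v4(doc2vec_model, tagged_reviews):
    doc2vec_model.train(tagged_reviews,
                        total_examples=doc2vec_model.corpus_count,
                        epochs=doc2vec_model.epochs)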
import sys

from args.options import Options
from argparse import ArgumentParser
from processors.term_scrape_processor import TermScrapeProcessor
from processors.content_scrape_processor import ContentScrapeProcessor
from exceptions.cmdline_exception import CmdLineException
from utils import log_helper

log = log_helper.get_logger("run")


def parse_args(argv):
    parser = ArgumentParser(prog="Investopedia Term Scraper")
    parser.add_argument('--mode', metavar='Term Scrape / Content Scrape', type=str)
    parser.add_argument('--term_indices_file_path',
                        metavar='Term Indices for Investopedia', type=str)
    parser.add_argument('--term_list_file_path',
                        metavar='Term List filepath', type=str)
    parser.add_argument('--output_file_path',
                        metavar='Output File Path', type=str)
    Options.args = parser.parse_args(argv, namespace=Options)


def validate_args(args):
    if args.mode == 'term-scrape':
        if not args.term_indices_file_path:
            msg = "'term-scrape' mode requires 'term_indices_file_path'"
            log.error(msg)
            raise CmdLineException(msg)
import json
from time import time

from utils import doc2vec_helper, ml_helper
from utils.options import Options
from entities.rated_review_document import RatedReviewDocument
from processors.processor import Processor
from utils import log_helper

log = log_helper.get_logger("AmazonReviewProcessor")


class AmazonReviewProcessor(Processor):

    def process(self):
        log.info("Processing begun")

        log.info("Reading input file " + Options.options.input_file_path)
        review_iterator = RatedReviewDocument(Options.options.input_file_path)

        log.info("Building Doc2Vec model")
        start_time = time()
        doc2vec_model = doc2vec_helper.init_doc2vec_model(review_iterator)
        doc2vec_helper.train_doc2vec_model(doc2vec_model, review_iterator)
        time_to_create_docvecs = time() - start_time
        log.info("Doc2Vec model successfully trained")

        ratings_list = list()
        with open(Options.options.input_file_path) as reviews_file:
            for line in reviews_file:
def main(args):
    if FLAGS.cfg_file:
        print('loading config setting')
        cfg_from_file(FLAGS.cfg_file, cfg)

    do_pp = FLAGS.do_pp
    if FLAGS.do_stereo:
        do_pp = False
        cfg.DO_STEREO = True
    else:
        cfg.DO_STEREO = False

    cfg.BATCH_SIZE = 1
    if do_pp:
        cfg.BATCH_SIZE = 2
    print_config(cfg)

    output_path = FLAGS.output_path
    if output_path != '':
        if not os.path.isdir(output_path):
            os.makedirs(output_path)

    logger = log_helper.get_logger()

    # recon_path stays None unless reconstruction output is requested
    recon_path = None
    do_recon = FLAGS.recon_path != ''
    if do_recon:
        if FLAGS.stereo_path == '':
            logger.error("to do reconstruction, stereo_path has to be set!")
            sys.exit(-1)
        recon_path = FLAGS.recon_path
        if not os.path.isdir(recon_path):
            os.makedirs(recon_path)

    stereo_path = FLAGS.stereo_path

    if FLAGS.model == 'res50':
        model = Res50DispNet(cfg, logger)
    else:
        logger.error('wrong model type: {}'.format(FLAGS.model))
        sys.exit(-1)

    if FLAGS.use_avg:
        # get moving avg
        variable_averages = tf.train.ExponentialMovingAverage(cfg.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
    else:
        saver = tf.train.Saver(model.all_variables)

    with tf.Session() as sess:
        # restore model
        logger.info("restoring model ......")
        saver.restore(sess, FLAGS.ckpt_path)

        total_time_elapsed = 0.0
        aspect_ratio = float(cfg.IMAGE_WIDTH) / cfg.IMAGE_HEIGHT

        for image, fname in instance_generator(FLAGS.sample_path, cfg.IMAGE_WIDTH,
                                               cfg.IMAGE_HEIGHT, do_pp, stereo_path,
                                               cfg.DO_STEREO, do_recon):
            if cfg.DO_STEREO or do_recon:
                sample_name = fname[0]
                stereo_name = fname[1]
                logger.info("inference for {} & {}".format(fname[0], fname[1]))
                feed_dict = {
                    model.left_image: image[0],
                    model.right_image: image[1]
                }
                fname = sample_name
            else:
                logger.info("inference for {}".format(fname))
                feed_dict = {
                    model.left_image: image
                }

            begin_ts = time.time()
            if not do_recon:
                pre_disp = sess.run(model.left_disparity[0], feed_dict=feed_dict)
            else:
                pre_disp, recon, recon_diff = sess.run(
                    [model.left_disparity[0],
                     model.left_reconstruction[0],
                     model.left_recon_diff[0]],
                    feed_dict=feed_dict)
                recon = recon[0, :, :, :]
                recon_diff = recon_diff[0, :, :, :]
                # print pre_disp.shape
                # print recon.shape
                # print recon_diff.shape
            end_ts = time.time()
            logger.info("cost time: {} s".format(end_ts - begin_ts))
            total_time_elapsed += end_ts - begin_ts

            if do_pp:
                disp = post_process_disparity(pre_disp.squeeze())
            else:
                disp = pre_disp[0].squeeze()

            if FLAGS.resize_ratio != 0 and FLAGS.resize_ratio != 1:
                disp = cv2.resize(disp,
                                  (FLAGS.resize_ratio * cfg.IMAGE_WIDTH,
                                   FLAGS.resize_ratio * cfg.IMAGE_HEIGHT),
                                  interpolation=cv2.INTER_LINEAR)

            # output disparity
            if output_path != '':
                if do_pp:
                    output_fname = output_path + "/pp_" + os.path.basename(fname)
                else:
                    output_fname = output_path + "/" + os.path.basename(fname)
                plt.imsave(output_fname, disp, cmap=plt.cm.gray)

            if recon_path is not None:
                o_image = cv2.resize(image[0][0],
                                     (FLAGS.resize_ratio * cfg.IMAGE_WIDTH,
                                      FLAGS.resize_ratio * cfg.IMAGE_HEIGHT),
                                     interpolation=cv2.INTER_LINEAR)
                o_recon = cv2.resize(recon,
                                     (FLAGS.resize_ratio * cfg.IMAGE_WIDTH,
                                      FLAGS.resize_ratio * cfg.IMAGE_HEIGHT),
                                     interpolation=cv2.INTER_LINEAR)
                o_diff = cv2.resize(recon_diff,
                                    (FLAGS.resize_ratio * cfg.IMAGE_WIDTH,
                                     FLAGS.resize_ratio * cfg.IMAGE_HEIGHT),
                                    interpolation=cv2.INTER_LINEAR)

                whole_fig = plt.figure(figsize=(int(aspect_ratio * 8), 8))
                gs = gridspec.GridSpec(2, 2)
                a = plt.subplot(gs[0, 0])
                b = plt.subplot(gs[1, 0])
                c = plt.subplot(gs[0, 1])
                d = plt.subplot(gs[1, 1])

                a.imshow(o_image)
                a.set_title('raw_image')
                plt.gca().get_xaxis().set_visible(False)
                plt.gca().get_yaxis().set_visible(False)

                b.imshow(disp, cmap=plt.cm.gray)
                b.set_title('disparity')
                plt.gca().get_xaxis().set_visible(False)
                plt.gca().get_yaxis().set_visible(False)

                c.imshow(o_recon)
                c.set_title('reconstruct')
                plt.gca().get_xaxis().set_visible(False)
                plt.gca().get_yaxis().set_visible(False)

                d.imshow(o_diff)
                d.set_title('recon_diff')
                # plt.tight_layout()
                plt.gca().get_xaxis().set_visible(False)
                plt.gca().get_yaxis().set_visible(False)

                output_fname = recon_path + "/" + os.path.basename(fname)
                plt.savefig(output_fname)

                # release figure memory
                plt.clf()
                plt.close()

    print("total time elapsed: {} s".format(total_time_elapsed))
def _get_spider_logger(self):
    """Return the spider's logger object."""
    return get_logger("spider",
                      to_file=True,
                      to_console=True,
                      filename=self.spider_log_name)
from newspaper import Article

from utils import log_helper

log = log_helper.get_logger(__name__)


def get_article_content(url_list):
    article_tuples = list()

    for url in url_list:
        try:
            article = Article(url)
            article.download()
            article.parse()
            article_tuple = (article.title, article.text)
            article_tuples.append(article_tuple)
        except Exception as e:
            log.error(e)

    return article_tuples


def get_tweet_content(status_list):
    tweets = list()
    for status in status_list:
        tweets.append(status.text)

    return tweets
from sklearn import model_selection, linear_model, svm
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor

from entities.semeval_tagged_line_document import SemevalTaggedLineDocument
from processors.processor import Processor
from utils import doc2vec_helper
from utils import file_helper
from utils import log_helper
from utils.evaluation_helper import evaluate_task_score

log = log_helper.get_logger("DocvecProcessorCrossval")


class DocvecProcessorCrossval(Processor):

    def process(self):
        log.info("Began Processing")

        semeval_train_docs = SemevalTaggedLineDocument(
            self.options.train_headlines_data_path)

        doc2vec_model = \
            doc2vec_helper.init_model(
                semeval_train_docs, self.options.docvec_dimension_size,
                self.options.docvec_iteration_count
            )
        log.info("Doc2vec model initialized with " +
                 str(self.options.docvec_dimension_size) + " dimensions and " +
                 str(self.options.docvec_iteration_count) + " iterations")

        x_articles, y_train = file_helper.get_article_details(
            self.options.train_headlines_data_path)
# -*- coding: utf-8 -*-
"""
Send an HTTP GET request and return the result.

@file: get_helper.py
@time: 2018/10/25 19:20
Created by Junyi.
"""
from requests_html import HTMLSession

from utils.log_helper import get_logger
from utils.decorator import deal_exceptions

requests_logger = get_logger(logger_name='requests_logger',
                             to_console=False,
                             to_file=True,
                             filename='requests')


def is_useful_response(func):
    """
    Decorator that checks whether a response is a text/html page.
    :param func: the function to decorate
    :return: response | None
    """
    def swapper(*args, **kwargs):
        response = func(*args, **kwargs)
        if response.status_code == 200:
            content_type = response.headers['Content-Type']
            if 'text/html' in content_type:
                requests_logger.info(f"Request {response.url} successful!")
            else:
                requests_logger.warning(f"{response.url} is not a text html page!")
                response = None
        return response

    return swapper
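# Minimal (hypothetical) usage of the decorator above: fetch_page is not part
# of this module and is shown only to illustrate that the wrapped function
# returns the response for 200 text/html pages and None otherwise.
@is_useful_response
def fetch_page(url, timeout=10):
    session = HTMLSession()
    return session.get(url, timeout=timeout)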
from processors.processor import Processor
from utils import log_helper, file_helper, doc2vec_helper, scikit_ml_helper

log = log_helper.get_logger("ModelTrainer")


class ModelTrainer(Processor):

    def __init__(self, labeled_articles_source_file_path, doc2vec_model_file_path,
                 ml_model_file_path, articles_source_file_path, output_file_path):
        self.labeled_articles_file_path = labeled_articles_source_file_path
        self.articles_source_file_path = articles_source_file_path
        self.doc2vec_model_file_path = doc2vec_model_file_path
        self.ml_model_file_path = ml_model_file_path
        self.output_file_path = output_file_path
        self.shuffle_count = 100

    def process(self):
        log.info("Commencing execution")

        # Get tagged articles from Veriday
        log.info("Getting tagged Veriday articles ... ")
        veriday_articles_raw = file_helper.get_articles_list(
            self.articles_source_file_path)
        veriday_tagged_articles = doc2vec_helper.get_tagged_articles_veriday(
            veriday_articles_raw)

        # Convert the articles file into tagged documents for doc2vec
        log.info("Getting tagged Semeval articles ... ")
        articles = file_helper.get_articles_list(self.labeled_articles_file_path)
        tagged_articles, sentiment_scores_dict = \
            doc2vec_helper.get_tagged_articles_scores(articles)
from processors.processor import Processor
from utils import log_helper, file_helper, doc2vec_helper, scikit_ml_helper

log = log_helper.get_logger("ArticleClassifier")


class ArticleClassifier(Processor):

    def __init__(self, labeled_articles_source_file_path, doc2vec_model_file_path,
                 ml_model_file_path, articles_source_file_path):
        self.labeled_articles_file_path = labeled_articles_source_file_path
        self.articles_source_file_path = articles_source_file_path
        self.doc2vec_model_file_path = doc2vec_model_file_path
        self.ml_model_file_path = ml_model_file_path
        self.shuffle_count = 1

    def process(self):
        log.info("Commencing execution")

        # Get tagged articles from Semeval
        log.info("Getting Semeval articles ... ")
        semeval_articles_raw = file_helper.get_articles_list(
            self.labeled_articles_file_path)
        semeval_tagged_articles, document_sentiment_classes = \
            doc2vec_helper.get_tagged_semeval_articles(semeval_articles_raw)

        # model initialization and vocab building
        log.info("Initializing the doc2vec model ...")
        doc2vec_model = doc2vec_helper.init_model(semeval_tagged_articles)
def main(args):
    if FLAGS.cfg_file:
        print('loading config setting')
        cfg_from_file(FLAGS.cfg_file, cfg)
    cfg.BATCH_SIZE = 1
    print_config(cfg)

    output_path = FLAGS.output_path
    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    batch_size = 1
    image_h = cfg.IMAGE_HEIGHT
    image_w = cfg.IMAGE_WIDTH
    image_c = cfg.IMAGE_DEPTH

    output_name = FLAGS.output_name
    whole_graph_ext = 'pb' if FLAGS.whole_graph_bin else 'pbtxt'
    infer_graph_ext = 'pb' if FLAGS.infer_graph_bin else 'pbtxt'
    whole_graph_name = "{}_whole.{}".format(output_name, whole_graph_ext)
    infer_graph_name = "{}_infer.{}".format(output_name, whole_graph_ext)
    uff_graph_name = "{}_uff.{}".format(output_name, whole_graph_ext)
    output_graph_path = "{}/{}.{}".format(output_path, output_name, infer_graph_ext)
    output_uff_graph_path = "{}/{}_uff.{}".format(output_path, output_name, infer_graph_ext)

    print(whole_graph_name)
    print(infer_graph_name)
    print(uff_graph_name)
    print(output_graph_path)
    print(output_uff_graph_path)

    # We clear devices to allow TensorFlow to control on which device it will load operations
    clear_devices = True

    # Build graph
    logger = log_helper.get_logger()
    if FLAGS.model == 'sq':
        model = SQSegNet(cfg, logger)
    elif FLAGS.model == 'erf':
        model = ERFSegNet(cfg, logger)

    output_node_names = "output/prob"

    if FLAGS.restore_avg:
        # get moving avg
        variable_averages = tf.train.ExponentialMovingAverage(cfg.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
    else:
        saver = tf.train.Saver(model.all_variables)

    with tf.Session() as sess:
        # Load checkpoint
        whole_graph_def = sess.graph.as_graph_def()

        # fix whole_graph_def for bn
        for node in whole_graph_def.node:
            if node.op == 'RefSwitch':
                node.op = 'Switch'
                for index in xrange(len(node.input)):
                    if 'moving_' in node.input[index]:
                        node.input[index] = node.input[index] + '/read'
            elif node.op == 'AssignSub':
                node.op = 'Sub'
                if 'use_locking' in node.attr:
                    del node.attr['use_locking']
            elif node.op == 'AssignAdd':
                node.op = 'Add'
                if 'use_locking' in node.attr:
                    del node.attr['use_locking']

        print("%d ops in the whole graph." % len(whole_graph_def.node))
        tf.train.write_graph(whole_graph_def, output_path, whole_graph_name,
                             as_text=not FLAGS.whole_graph_bin)

        infer_graph_def = graph_util.extract_sub_graph(whole_graph_def,
                                                       output_node_names.split(","))
        print("%d ops in the infer graph." % len(infer_graph_def.node))
        tf.train.write_graph(infer_graph_def, output_path, infer_graph_name,
                             as_text=not FLAGS.whole_graph_bin)

        # fix infer_graph_def for bn for conversion to tensorRT uff
        for node in infer_graph_def.node:
            name_fields = node.name.split('/')
            if name_fields[-2] == 'batchnorm':
                if name_fields[-1] == 'add':
                    for index in xrange(len(node.input)):
                        if 'cond/Merge' in node.input[index]:
                            node.input[index] = '/'.join(
                                name_fields[:-2] + ['moving_variance', 'read'])
                if name_fields[-1] == 'mul_2':
                    for index in xrange(len(node.input)):
                        if 'cond/Merge' in node.input[index]:
                            node.input[index] = '/'.join(
                                name_fields[:-2] + ['moving_mean', 'read'])

        uff_graph_def = graph_util.extract_sub_graph(infer_graph_def,
                                                     output_node_names.split(","))
        print("%d ops in the uff graph." % len(uff_graph_def.node))
        tf.train.write_graph(uff_graph_def, output_path, uff_graph_name,
                             as_text=not FLAGS.whole_graph_bin)

        saver.restore(sess, FLAGS.ckpt_path)

        output_graph_def = graph_util.convert_variables_to_constants(
            sess,             # The session is used to retrieve the weights
            whole_graph_def,  # The graph_def is used to retrieve the nodes
            output_node_names.split(",")  # The output node names are used to select the useful nodes
        )
        output_uff_graph_def = graph_util.convert_variables_to_constants(
            sess,             # The session is used to retrieve the weights
            infer_graph_def,  # The graph_def is used to retrieve the nodes
            output_node_names.split(",")  # The output node names are used to select the useful nodes
        )

        # Finally we serialize and dump the output graph to the filesystem
        mode = "wb" if FLAGS.infer_graph_bin else "w"
        with tf.gfile.GFile(output_graph_path, mode) as f:
            if FLAGS.infer_graph_bin:
                f.write(output_graph_def.SerializeToString())
            else:
                f.write(str(output_graph_def))
        print("%d ops in the output graph." % len(output_graph_def.node))

        with tf.gfile.GFile(output_uff_graph_path, mode) as f:
            if FLAGS.infer_graph_bin:
                f.write(output_uff_graph_def.SerializeToString())
            else:
                f.write(str(output_uff_graph_def))
        print("%d ops in the output uff graph." % len(output_uff_graph_def.node))
import json

from args.options import Options
from processors.processor import Processor
from utils import log_helper, scrape_helper

log = log_helper.get_logger("TermScrapeProcessor")


class TermScrapeProcessor(Processor):

    def __init__(self):
        super().__init__()
        self.domain = "http://www.investopedia.com"
        self.root_url = self.domain + "/terms/"
        self.min_term_count = 100

    def process(self):
        log.info("Processing begun")

        with open(Options.args.term_indices_file_path) as indices_file:
            list_of_indices = json.load(indices_file)
        log.info("There are " + str(len(list_of_indices)) + " indices")

        output_file_object = open(Options.args.output_file_path, 'w')

        for index_term in list_of_indices:
            term_set = set()
            log.info("Working on index term " + index_term)
import json

from utils import log_helper

log = log_helper.get_logger("FileHelper")


def get_articles_list(articles_file_path):
    with open(articles_file_path, 'r') as articles_file:
        articles_data = articles_file.read()

    return json.loads(articles_data)


def get_article_details(articles_file_path):
    articles = list()
    sentiment_scores = list()

    semeval_articles = get_articles_list(articles_file_path)
    for semeval_article in semeval_articles:
        if "sentiment" in semeval_article.keys():
            sentiment_scores.append(semeval_article['sentiment'])
            articles.append(semeval_article['title'].replace(
                semeval_article['company'], "Umbrella Corp"))

    return articles, sentiment_scores


def annotate_test_set(test_headlines_data_path, y_test):