def cmd_create_source_files_for_slides(args):
    """Create dummy source files for slides. If a file or folder exists, don't touch it."""
    create_directory(args.target)
    content = get_config(args.config)[CONTENT]

    def make_group(group):
        # create group dir
        group_root = os.path.join(args.target, group.slug)
        create_directory(group_root)
        # create group index file
        make_file(group_root, "index", group.title, '#')
        # create individual sections (add section name as headline)
        for section in group.sections:
            make_file(group_root, section, section, '##')

    def make_file(root, filename_root, title_root, markup='#'):
        """Create file if it does not exist."""
        filename = os.path.join(root, md_filename(filename_root))
        if not os.path.exists(filename):
            with codecs.open(filename, 'w+', 'utf-8') as fp:
                fp.write('%s %s\n\n' % (markup, make_title(title_root)))
        else:
            if args.verbose:
                print "skipped %s" % title_root

    make_file(args.target, content.title, content.title)
    if content.introduction:
        make_group(content.introduction)
    for chapter in content.chapters:
        make_group(chapter)
    if content.appendix:
        make_group(content.appendix)
    if content.end:
        make_file(args.target, content.end, content.end)
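# cmd_create_source_files_for_slides relies on helpers defined elsewhere in its
# project (create_directory, get_config, md_filename, make_title). A minimal
# sketch of the two filename helpers, assuming slugs map straight to Markdown
# files and titles are prettified slugs (assumed semantics, not the project's
# actual code):
def md_filename(name):
    # hypothetical: append the Markdown extension
    return '%s.md' % name

def make_title(name):
    # hypothetical: turn a slug like "getting-started" into "Getting Started"
    return name.replace('-', ' ').title()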
def save_model(model, file_name):
    """Save model to the model folder."""
    create_directory('model')
    with open('model/%s.pickle' % file_name, 'wb') as f:
        cPickle.dump(model, f)
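# Loading counterpart: the timing code in find_and_save_timings() below reads
# this pickle back with cPickle, e.g.
#   with open('model/tfidf_ada.pickle', 'rb') as f:
#       ensemble_model = cPickle.load(f)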
def send_mail(title, msg):
    if not common.file_exist(common.CONST_DIR_LOG):
        common.create_directory(common.CONST_DIR_LOG)
    if not common.file_exist(common.CONST_DIR_CONF):
        common.create_directory(common.CONST_DIR_CONF)
    log = Logger(mail_log_filename, level='debug')
    config, err = _load_config()
    if config is None:
        log.logger.error(u"Failed to load the mail client config file: %s", err)
        return
    host = config.get("host", "localhost")
    port = config.get("port", 25)
    user = config.get("user", "root")
    pwd = config.get("pwd", "")
    sender = config.get("sender", "localhost")
    receivers = config.get("receivers", [])
    message = MIMEText(msg, 'plain', 'utf-8')
    message['Subject'] = Header(title, 'utf-8')
    try:
        smtp_instance = smtplib.SMTP()
        smtp_instance.connect(host, port)  # 25 is the default SMTP port
        smtp_instance.login(user, pwd)
        smtp_instance.sendmail(sender, receivers, message.as_string())
        log.logger.info(u"Mail with subject [%s] has been sent." % title)
    except smtplib.SMTPException as err:
        log.logger.error(u"Failed to send mail with subject [%s], error: %s" % (title, err.message))
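# send_mail() assumes _load_config() returns a (dict, error) pair; the config
# file itself is not shown in this collection. A hypothetical layout covering
# the fields read above, assuming a JSON config:
# {
#     "host": "smtp.example.com",
#     "port": 25,
#     "user": "mailer",
#     "pwd": "secret",
#     "sender": "mailer@example.com",
#     "receivers": ["ops@example.com"]
# }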
def _get_meta_paths(imdb_id, path):
    data_file = os.path.join(path, "%s.dat" % (imdb_id))
    poster_path = create_directory(path, META_QUALITY)
    fanart_path = create_directory(path, META_QUALITY)
    poster_file = os.path.join(poster_path, "%s_poster.jpg" % (imdb_id))
    fanart_file = os.path.join(fanart_path, "%s_fanart.jpg" % (imdb_id))
    poster_missing = os.path.join(poster_path, "%s_poster.missing" % (imdb_id))
    fanart_missing = os.path.join(fanart_path, "%s_fanart.missing" % (imdb_id))
    return (data_file, poster_file, fanart_file, poster_missing, fanart_missing)
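# The two-argument create_directory used by the Kodi addon snippets comes from
# the addon's common module and is not shown here. Judging by its call sites it
# creates base/name if needed and returns the resulting path; a minimal sketch
# under that assumption:
import os

def create_directory(base, name):
    """Ensure base/name exists and return its path (sketch, not the addon's code)."""
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path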
def load_args(self, args):
    # Load command line options
    for arg, attr in self.command_line_args_to_attrs:
        setattr(self, attr, getattr(args, arg))
    create_directory(self.outdir)
    file_names = FileNames(self.outdir)
    for attr in self.__dict__.keys():
        if getattr(self, attr) is not None:
            continue
        setattr(self, attr, getattr(file_names, attr))
def find_and_save_timings():
    tweet_list = get_labelled_tweets()
    num_tweets = len(tweet_list)
    setup = """
from data_source import get_labelled_tweets, get_labels
from sklearn.externals import joblib

tweet_list = get_labelled_tweets()
# do transformation into vector
vectoriser = joblib.load('model/tfidf_vectoriser.pkl')
vectorised_tweet_list = vectoriser.transform(tweet_list)
svm_model = joblib.load('model/tfidf_linsvc.pkl')
svm_model.predict(vectorised_tweet_list)
"""
    test_statement = 'svm_model.predict(vectorised_tweet_list)'
    REPETITIONS = 100

    # check timing of svm
    # time in micro seconds
    svm_time = timeit.timeit(stmt=test_statement, setup=setup, number=REPETITIONS)
    svm_time_dataset = get_dataset_time(svm_time, REPETITIONS)
    svm_time_record = get_record_time(svm_time_dataset, num_tweets)

    setup_ensemble = """
import cPickle
from data_source import get_labelled_tweets
from sklearn.externals import joblib

tweet_list = get_labelled_tweets()
vectoriser = joblib.load('model/tfidf_vectoriser.pkl')
vectorised_tweet_list = vectoriser.transform(tweet_list)
with open('model/tfidf_ada.pickle', 'rb') as f:
    ensemble_model = cPickle.load(f)
ensemble_model.predict(vectorised_tweet_list)
"""
    test_statement_ensemble = 'ensemble_model.predict(vectorised_tweet_list)'
    ensemble_time = timeit.timeit(stmt=test_statement_ensemble,
                                  setup=setup_ensemble,
                                  number=REPETITIONS)
    ens_time_dataset = get_dataset_time(ensemble_time, REPETITIONS)
    ens_time_record = get_record_time(ens_time_dataset, num_tweets)

    # save results in a txt file
    create_directory('metric_result')
    with open("metric_result/timings.txt", "w") as text_file:
        text_file.write("Number of records in dataset: {0}\n".format(num_tweets))
        text_file.write("Svm dataset time: {0}\n".format(svm_time_dataset))
        text_file.write("Svm record time: {0}\n".format(svm_time_record))
        text_file.write("Ensemble dataset time: {0}\n".format(ens_time_dataset))
        text_file.write("Ensemble record time: {0}\n".format(ens_time_record))
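# get_dataset_time and get_record_time are project helpers not shown in this
# collection. From their call sites they plausibly average the total timeit
# result over the repetitions, then over the records; the "micro seconds"
# comment above suggests they may also rescale units, which this sketch does
# not reproduce (assumed semantics):
def get_dataset_time(total_time, repetitions):
    """Average time for one pass over the whole dataset (assumed semantics)."""
    return total_time / repetitions

def get_record_time(dataset_time, num_records):
    """Average time per record (assumed semantics)."""
    return dataset_time / num_records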
def generate_eval_metrics(binarise_result, file_name, y_test):
    accuracy = accuracy_score(np.array(y_test), np.array(binarise_result))
    precision = precision_score(y_test, binarise_result, average="macro")
    recall = recall_score(y_test, binarise_result, average="macro")
    f1_measure = f1_score(y_test, binarise_result, average="macro")

    # save results in a txt file
    create_directory('metric_result')
    with open("metric_result/" + file_name + ".txt", "w") as text_file:
        text_file.write("Accuracy: {0}\n".format(accuracy))
        text_file.write("Precision: {0}\n".format(precision))
        text_file.write("Recall: {0}\n".format(recall))
        text_file.write("F1 measure: {0}\n".format(f1_measure))
def gensim_classifier():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # split all sentences into lists of words
    sentences = [tweet.split() for tweet in tweet_list]

    # parameters for model
    num_features = 100
    min_word_count = 1
    num_workers = 4
    context = 2
    downsampling = 1e-3

    # initialize and train the model
    w2v_model = Word2Vec(sentences,
                         workers=num_workers,
                         size=num_features,
                         min_count=min_word_count,
                         window=context,
                         sample=downsampling,
                         seed=1)

    index_value, train_set, test_set = train_test_split(0.80, sentences)
    train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
    test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
    train_vector = Imputer().fit_transform(train_vector)
    test_vector = Imputer().fit_transform(test_vector)

    # train model and predict
    model = LinearSVC()
    classifier_fitted = OneVsRestClassifier(model).fit(train_vector,
                                                       label_list[:index_value])
    result = classifier_fitted.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/w2v_linsvc.csv", sep=',')

    # store the fitted classifier to mmap-able files
    create_directory('model')
    joblib.dump(classifier_fitted, 'model/w2v_linsvc.pkl')

    # evaluation
    label_score = classifier_fitted.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(label_list, classes=class_list)
    evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')
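# getAvgFeatureVecs is a project helper not defined in this collection. In the
# common word2vec recipe it represents each document by the mean of its word
# vectors; a minimal sketch under that assumption, using the old gensim API
# (model[word], word in model) that the snippet above also relies on:
import numpy as np

def getAvgFeatureVecs(docs, w2v_model, num_features):
    """Average the word vectors of each tokenised doc (sketch, assumed semantics)."""
    doc_vectors = np.zeros((len(docs), num_features), dtype="float32")
    for i, words in enumerate(docs):
        in_vocab = [w for w in words if w in w2v_model]
        if in_vocab:
            doc_vectors[i] = np.mean([w2v_model[w] for w in in_vocab], axis=0)
        else:
            doc_vectors[i] = np.nan  # rows with no known words are imputed later
    return doc_vectors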
def evaluate(binarise_result, y_test, y_score, file_name):
    """
    Computes the accuracy, precision and recall, plots the precision-recall
    curves, and saves the plots to the figure folder.
    :param binarise_result: list of binarised results after prediction from the classifier
    :type binarise_result: list[list[int]]
    :param y_test: list of binarised labels from the test set
    :type y_test: list[list[int]]
    :param y_score: distance of each sample from the decision boundary for each class
    :type y_score: list
    :param file_name: directory name for saving all figures from the plots
    :type file_name: str
    """
    num_class = y_test.shape[1]

    # compute precision-recall per class and plot curve
    precision = dict()
    recall = dict()
    average_precision = dict()
    for i in range(num_class):
        precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_score[:, i])
        average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])

    # compute micro-averaged precision-recall curve and average precision
    precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(), y_score.ravel())
    average_precision["micro"] = average_precision_score(y_test, y_score, average="micro")

    # create directory
    create_directory('figure')
    create_directory('figure/' + file_name)

    # plots
    plot_precision_recall_curve(average_precision, precision, recall, file_name)
    # plot precision-recall curve for each class
    plot_precision_recall_curve_all_classes(average_precision, precision, recall, file_name, num_class)
    generate_eval_metrics(binarise_result, file_name, y_test)
def create_tv_show_strm_files(name, imdb_id, mode, dir_path):
    info = TheTVDBInfo(imdb_id)
    episodes = info.episodes()
    tv_show_path = create_directory(dir_path, name)
    for episode in episodes:
        first_aired = episode.FirstAired()
        if len(first_aired) > 0:
            d = first_aired.split('-')
            episode_date = date(int(d[0]), int(d[1]), int(d[2]))
            if date.today() > episode_date:
                season_number = int(episode.SeasonNumber())
                if season_number > 0:
                    episode_number = int(episode.EpisodeNumber())
                    episode_name = episode.EpisodeName()
                    display = "S%.2dE%.2d %s" % (season_number, episode_number, episode_name)
                    data = '%s<|>%s<|>%d<|>%d' % (name, episode_name, season_number, episode_number)
                    season_path = create_directory(tv_show_path, str(season_number))
                    create_strm_file(display, data, imdb_id, mode, season_path)
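# create_strm_file belongs to the same addon and is not shown here. Kodi .strm
# files are one-line text files holding a playable (plugin) URL; a hedged
# sketch of what such a helper could look like, with the URL scheme assumed
# (clean_file_name is imported from the addon's common module):
import os
import urllib

def create_strm_file(display, data, imdb_id, mode, path):
    """Write display.strm containing a plugin URL (sketch, assumed URL format)."""
    url = 'plugin://plugin.video.whatthefurk/?%s' % urllib.urlencode(
        {'mode': mode, 'imdb_id': imdb_id, 'data': data})
    strm_file = os.path.join(path, '%s.strm' % clean_file_name(display))
    if not os.path.exists(strm_file):
        with open(strm_file, 'w') as f:
            f.write(url)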
def lin_svc():
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # vectorise using tf-idf
    vectoriser = TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        use_idf=1,
        smooth_idf=1,
        sublinear_tf=1,
    )

    # do transformation into vector
    fitted_vectoriser = vectoriser.fit(tweet_list)
    vectorised_tweet_list = fitted_vectoriser.transform(tweet_list)
    train_vector, test_vector, train_labels, test_labels = train_test_split(
        vectorised_tweet_list, label_list, test_size=0.8, random_state=42)

    # train model and predict
    model = LinearSVC()
    ovr_classifier = OneVsRestClassifier(model).fit(train_vector, train_labels)
    result = ovr_classifier.predict(test_vector)

    # output result to csv
    create_directory('data')
    save_to_csv("data/testset_labels.csv", test_labels)
    result.tofile("data/tfidf_linsvc.csv", sep=',')

    save_model(ovr_classifier, 'tfidf_linsvc')
    save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

    # evaluation
    label_score = ovr_classifier.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
def set_resolved_url(handle, name, url, imdb_id, list_item=None):
    if xbmc.Player().isPlayingVideo():
        xbmc.Player().stop()
    if not list_item:
        list_item = xbmcgui.ListItem(name, path=url)
    poster_path = create_directory(META_PATH, META_QUALITY)
    poster_file = os.path.join(poster_path, "%s_poster.jpg" % (imdb_id))
    list_item.setThumbnailImage(poster_file)
    list_item.setProperty("IsPlayable", "true")
    xbmcplugin.setResolvedUrl(handle, True, list_item)
def ensemble_classify():
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # vectorise using tf-idf
    vectoriser = TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        use_idf=1,
        smooth_idf=1,
        sublinear_tf=1,
    )

    # do transformation into vector
    vectoriser.fit(tweet_list)
    vectorised_tweet_list = vectoriser.transform(tweet_list)
    train_vector, test_vector, train_labels, test_labels = train_test_split(
        vectorised_tweet_list, label_list, test_size=0.8, random_state=42)

    n_estimators = 10  # number of weak learners
    model = AdaBoostClassifier(n_estimators=n_estimators)
    ada_classifier = model.fit(train_vector, train_labels)
    result = ada_classifier.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/tfidf_ada.csv", sep=',')
    save_model(ada_classifier, 'tfidf_ada')

    # evaluation
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    generate_eval_metrics(binarise_result, 'tfidf_ada', binarise_labels)
def __init__(self):
    if not common.file_exist(common.CONST_DIR_LOG):
        common.create_directory(common.CONST_DIR_LOG)
    if not common.file_exist(common.CONST_DIR_CONF):
        common.create_directory(common.CONST_DIR_CONF)
    if not common.file_exist(common.CONST_DIR_DATABASE):
        common.create_directory(common.CONST_DIR_DATABASE)
    self.log = Logger(trader_log_filename, level='debug')
def tv_directory():
    if ADDON.getSetting('tv_directory').startswith('special'):
        return create_directory(DATA_PATH, "TV_SUBSCRIPTIONS")
    else:
        return ADDON.getSetting('tv_directory')

def show_unaired():
    return ADDON.getSetting('show_unaired') == "true"

def subscription_update():
    return ADDON.getSetting('subscription_update') == "true"

def dummy_path():
    return os.path.join(ADDON.getAddonInfo('path'), 'dummy.wma')

def service_sleep_time():
    return 10

def subscription_timer():
    return int(ADDON.getSetting('subscription_timer'))

create_directory(DATA_PATH, "")
def _storage_box_data(data):
    if not common.file_exist(common.CONST_DIR_DATABASE):
        common.create_directory(common.CONST_DIR_DATABASE)
    common.dict_to_file(data, box_db_filename)
def __init__(self):
    if not common.file_exist(common.CONST_DIR_LOG):
        common.create_directory(common.CONST_DIR_LOG)
    self.log = Logger(box_log_filename, level='debug')
    self.connect_instance = None
def download_path():
    return create_directory(DATA_PATH, "download")
def save_vectoriser(model, file_name):
    """Save the fitted vectoriser to the model folder."""
    create_directory('model')
    joblib.dump(model, 'model/%s.pkl' % file_name)
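# Loading counterpart: find_and_save_timings() above reloads these artefacts
# with joblib, e.g.
#   vectoriser = joblib.load('model/tfidf_vectoriser.pkl')
#   svm_model = joblib.load('model/tfidf_linsvc.pkl')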
@author: Batch
'''
import xbmc, xbmcaddon, xbmcgui, xbmcplugin
from common import notification, get_url, regex_get_all, regex_from_to, create_directory, write_to_file, read_from_file, clean_file_name
from datetime import date, timedelta
import urllib, os, sys, re
import shutil
from furk import FurkAPI
from mediahandler import play, download, download_and_play, set_resolved_url
from meta import TheTVDBInfo, set_movie_meta, download_movie_meta, set_tv_show_meta, download_tv_show_meta, meta_exist
from threading import Thread

ADDON = xbmcaddon.Addon(id='plugin.video.whatthefurk')
DATA_PATH = os.path.join(xbmc.translatePath('special://profile/addon_data/plugin.video.whatthefurk'), '')
CACHE_PATH = create_directory(DATA_PATH, "cache")
COOKIE_JAR = os.path.join(DATA_PATH, "cookiejar.lwp")
SUBSCRIPTION_FILE = os.path.join(DATA_PATH, "subsciption.list")
SEARCH_FILE = os.path.join(DATA_PATH, "search.list")
DOWNLOAD_PATH = create_directory(DATA_PATH, "download")
META_PATH = create_directory(DATA_PATH, "meta")

FURK_FILTER = 'cached'
IMDB_TITLE_SEARCH = "http://m.imdb.com/search/title?"
COUNT = "100"  # max 100
HAS = "asin-dvd-us"  # only show movies released on DVD
PRODUCTION_STATUS = "released"
SORT = "user_rating"  # alpha/user_rating/num_votes/year/release_date_us/boxoffice_gross_us/moviemeter,desc
VIEW = "simple"

if ADDON.getSetting('release_date') == "true":
    try:
def meta_path():
    if ADDON.getSetting('meta_custom_directory') == "true":
        return ADDON.getSetting('meta_path')
    else:
        return create_directory(DATA_PATH, "meta")
def movies_directory():
    if ADDON.getSetting('movies_custom_directory') == "true":
        return ADDON.getSetting('movies_directory')
    else:
        return create_directory(DATA_PATH, "movies")
def tv_show_directory():
    if ADDON.getSetting('tv_shows_custom_directory') == "true":
        return ADDON.getSetting('tv_shows_directory')
    else:
        return create_directory(DATA_PATH, "tv shows")
def save_model(model, file_name):
    """Save model to the model folder."""
    create_directory('model')
    joblib.dump(model, 'model/%s.pkl' % file_name)
def cache_path():
    return create_directory(DATA_PATH, "cache")