def users():
    """Render the accounts/users page with all users sorted by username."""
    all_users = utils.sorted_dict(db.get_users(), 'username')
    return render_template('accounts/users.html', users=all_users)
def get_categories_1_names(self):
    """Return the level-1 category names, ordered by category id."""
    # Invert the name -> id mapping into id -> name.
    by_id = {cid: cname for cname, cid in self.categories_1.items()}
    # sorted_dict yields (id, name) pairs in id order; keep only the names.
    return [name for _, name in sorted_dict(by_id)]
def compute_cos(X, users_ratings, N):
    """Return the top-N entries of users_ratings ranked by cosine similarity to X.

    The ordering is whatever utils.sorted_dict produces over the
    key -> similarity mapping.
    """
    similarities = {key: cos(X, Y) for key, Y in users_ratings.items()}
    ranked = utils.sorted_dict(similarities)
    return ranked[:N]
def export_categories_to_xls(self, categories_useinfo, xls_file):
    """Export per-category usage counts to an Excel (.xls) workbook.

    categories_useinfo -- mapping of category_id -> usage/sample count.
    xls_file -- output file path; if None, the export is silently skipped.
    """
    categories = self
    if xls_file is None:
        return
    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet("categories")
    # Header row.
    ws.write(0, 0, 'ID')
    ws.write(0, 1, 'CAT1')
    ws.write(0, 2, 'CAT2')
    ws.write(0, 3, 'CAT3')
    ws.write(0, 4, 'SAMPLES')
    rowidx = 1
    categories_list = sorted_dict(categories_useinfo)
    for (category_id, category_used) in categories_list:
        # Decompose the numeric id into level-1/2/3 components; the modulo
        # tests below imply level-1 ids are multiples of 1,000,000 and
        # level-2 ids are multiples of 1,000. -1 marks "no category at
        # this level".
        if category_id % 1000 == 0:
            if category_id % 1000000 == 0:
                # Level-1 category id.
                category_3 = -1
                category_2 = -1
                category_1 = category_id
            else:
                # Level-2 category id; derive its level-1 ancestor.
                category_3 = -1
                category_2 = category_id
                category_1 = int(category_id / 1000000) * 1000000
        else:
            # Level-3 category id; derive both ancestors.
            category_3 = category_id
            category_2 = int(category_id / 1000) * 1000
            category_1 = int(category_id / 1000000) * 1000000
        category_1_name = categories.get_category_name(category_1)
        category_2_name = categories.get_category_name(category_2)
        category_3_name = categories.get_category_name(category_3)
        logging.debug(Logger.debug("id:%d 1:%d 2:%d 3:%d" % (
            category_id, category_1, category_2, category_3)))
        ws.write(rowidx, 0, category_id)
        ws.write(rowidx, 1, category_1_name)
        ws.write(rowidx, 2, category_2_name)
        ws.write(rowidx, 3, category_3_name)
        ws.write(rowidx, 4, category_used)
        rowidx += 1
    wb.save(xls_file)
    logging.debug(Logger.debug("Export categories to xls file %s" % (xls_file)))
def print_categories(self):
    """Log every known category as "name - id", ordered by category id."""
    logging.info(Logger.notice("--------------- categories -----------------"))
    # Merge all three levels' name -> id maps into one id -> name map.
    id_to_name = {}
    for level_map in (self.categories_1, self.categories_2, self.categories_3):
        for name, cid in level_map.items():
            id_to_name[cid] = name
    for cid, name in sorted_dict(id_to_name):
        logging.info(Logger.notice("%s - %d" % (name, cid)))
def index():
    """Admin landing page: list available workflows and, on POST, launch one.

    GET renders the workflow list. POST with a 'workflow_id' substitutes the
    submitted form values into the workflow's command template, logs the
    action, and queues the task; the queued job and its result key are
    passed to the template.
    """
    workflows = config.get_workflows()
    job = None
    result_key = None
    workflow_id = None
    workflows = utils.sorted_dict(workflows, 'name')
    if request.method == 'POST':
        form = request.form
        workflow_id = form.get('workflow_id')
        if workflow_id:
            # NOTE(review): raises IndexError if no workflow matches the
            # submitted id — confirm that ids always come from the list.
            workflow = [x for x in workflows if x.get('name') == workflow_id][0]
            username = session.get('user', {}).get('username')
            db.log({
                'ip': request.remote_addr,
                'user': username,
                'command': 'Workflow: {0}'.format(workflow_id),
            })
            args = workflow.get('arguments')
            task = workflow.get('command')
            # Substitute declared <arg> placeholders with form values.
            if args:
                for a in args:
                    arg_name = a.get('name')
                    # Fix: the original fetched this value into an unused
                    # local and then re-fetched it inline; use it once.
                    arg_val = form.get(arg_name)
                    task = task.replace('<{0}>'.format(arg_name), arg_val)
            # add proxy_user to get launchpad user
            task = task.replace('<proxy_user>', username)
            # generate result_key
            result_key = str(int(time.time()))
            # run command
            job = queue_task(ops.run_fabric_task, task, result_key,
                             workflow.get('notify'))
    ctx = {
        'workflows': workflows,
        'job': job,
        'result_key': result_key,
    }
    return render_template('admin/index.html', **ctx)
def main() -> None:
    """
    Main method of Seddit. Processes arguments, performs search, and returns the result
    """
    # ========================================================= Read input

    # Load config file
    config = load_config(DEFAULT_CONFIG_FILE)

    # Read command line arguments
    args = load_params()

    # Load config file from CLI argument (overrides the default config)
    if args.config:
        config.read(args.config)

    """ Convert arguments to local variables """
    # CLI flags win; config supplies the fallback for each setting.

    # The name of the subreddit being scraped
    sub_name = args.subreddit

    # Whether to force each feed to update regardless of cache validity
    force = True if args.force else config["DEFAULT"].getboolean(
        "force", fallback=False)

    # Whether to display a graph of the most popular terms
    show_graph = True if args.graph else config["DEFAULT"].getboolean(
        "show_graph", fallback=False)

    # The post limit for refreshing feeds
    try:
        feed_limit = args.feed_limit if args.feed_limit else config[
            "DEFAULT"].getint("feed_limit")
        if feed_limit < 0:
            # Change non-positive feed limit to "None"
            feed_limit = None
    except TypeError:
        feed_limit = None  # Convert any non-number limit to None

    # Method for scoring: occurrence count vs. post score
    scoring = args.scoring.lower()
    if scoring == "count":
        method = PostCache.COUNT
    elif scoring == "score":
        method = PostCache.SCORE
    else:
        raise ValueError(f"Unrecognized scoring method '{scoring}'")

    # Read frequency threshold
    threshold = args.threshold if args.threshold else config.getint(
        "Filters", "threshold")

    """ Ingest CSV files """

    # Read search terms from CSV (None when no path is configured)
    search_term_path = args.search_terms if args.search_terms else config[
        "Files"]["search_terms"]
    search_terms = utils.ingest_csv(
        search_term_path) if search_term_path else None

    # Read term groups from CSV
    term_groups_path = args.term_groups if args.term_groups else config[
        "Files"]["term_groups"]
    terms_list = utils.ingest_csv(
        term_groups_path) if term_groups_path else None
    term_groups = data.TermGroups(terms_list)

    # Read in word filter CSV files and flatten to 1D list
    word_set = set()
    if args.word_filter:
        word_filters = args.word_filter
    elif config["Files"].get("word_filters"):
        # Config value is a single space-separated string of paths.
        word_filters = config["Files"].get("word_filters").split(" ")
    else:
        word_filters = []
    for path in word_filters:
        word_array = utils.ingest_csv(path)
        # Add every word in file to set (deduplicates across files)
        for row in word_array:
            for word in row:
                word_set.add(word.strip())
    filtered_words = list(word_set) if word_set else None

    # ========================================================= Load Data

    # Create PRAW reddit object
    reddit = praw.Reddit(client_id=config["PRAW"]["client_id"],
                         client_secret=config["PRAW"]["client_secret"],
                         user_agent=config["PRAW"]["user_agent"])

    # Load cache from file (one JSON cache per subreddit, lower-cased name)
    cache_path = f"{config['Cache']['dir_path']}{os.path.sep}{sub_name.lower()}.json"
    cache = PostCache(sub_name, cache_path, reddit,
                      config["Cache"].getint("ttl_hot"),
                      config["Cache"].getint("ttl_new"),
                      config["Cache"].getint("ttl_top"))

    # Refresh cache; save only if refresh reports that something changed
    if cache.refresh(force=force, limit=feed_limit):
        cache.save()

    # ========================================================= Perform search

    # Perform search term result if provided, otherwise perform word count
    if search_terms:
        result_dict = cache.search_terms(
            search_terms,
            ignore_title_regex=config["Regex"]["ignore_title"],
            require_title_regex=config["Regex"]["require_title"],
            method=method)
    else:
        result_dict = cache.count_words(
            term_group=term_groups,
            ignore_title_regex=config["Regex"]["ignore_title"],
            require_title_regex=config["Regex"]["require_title"],
            method=method)

    """ Filter results """

    # Remove filtered words (in place)
    if filtered_words:
        utils.list_filter_dict(result_dict, filtered_words)

    # Filter low-frequency words if not using search terms
    if threshold is not None and not search_terms:
        result_dict = utils.value_filter_dict(result_dict, threshold)

    # Filter words by regex
    if config["Regex"]["require_word"] or config["Regex"]["ignore_word"]:
        utils.regex_filter_dict(result_dict,
                                require=config["Regex"]["require_word"],
                                remove=config["Regex"]["ignore_word"])

    """ Sort words by frequency """
    sorted_tuples = utils.sorted_dict(result_dict)

    # ========================================================== Display Findings

    """ Print rankings to stdout """
    print("===============================================")
    print("================ RESULTS ====================")
    print("===============================================\n")
    print("Popularity score:\n")

    num = 1
    for name, count in sorted_tuples:
        print(f"{num}) {name} - {count}")
        num += 1

    """ Present graph if requested """
    if show_graph:
        sorted_tuples = sorted_tuples[:config["Filters"].getint(
            "rank_cutoff")]  # Trim results list
        utils.show_bar_chart(
            sorted_tuples,
            "Top {} Results for /r/{}".format(len(sorted_tuples), sub_name))
def multicategories_predict(samples_test, model_name, result_dir):
    """Train a classifier from a saved model's matrices and predict on test samples.

    samples_test -- test sample set; must expose .tsm and .get_categories().
    model_name -- base name of the saved .cfm/.sfm model files (required).
    result_dir -- directory holding the model files; created if missing,
                  model files are read from the CWD when None.

    Returns None. Logs and returns early on a missing model_name or an
    unwritable result_dir.
    """
    if model_name is None or len(model_name) == 0:
        # Fix: logging.warn is a deprecated alias of logging.warning.
        logging.warning(Logger.warn("model_name must not be NULL."))
        return

    # Resolve model file paths, creating result_dir on demand.
    if result_dir is None:
        cfm_file = "%s.cfm" % (model_name)
        sfm_file = "%s.sfm" % (model_name)
    else:
        if not os.path.isdir(result_dir):
            try:
                os.mkdir(result_dir)
            except OSError:
                logging.error(Logger.error("mkdir %s failed." % (result_dir)))
                return
        cfm_file = "%s/%s.cfm" % (result_dir, model_name)
        sfm_file = "%s/%s.sfm" % (result_dir, model_name)

    # Fix: this debug-level log wrapped Logger.error; every sibling
    # debug call in this function uses Logger.debug.
    logging.debug(Logger.debug("Loading train sample feature matrix ..."))
    sfm_train = SampleFeatureMatrix()
    sfm_train.load(sfm_file)

    logging.debug(Logger.debug("Loading train category feature matrix ..."))
    cfm_train = CategoryFeatureMatrix()
    cfm_train.load(cfm_file)

    logging.debug(Logger.debug("Making sample feature matrix for test data ..."))
    # NOTE(review): hard-coded root category id — presumably the model was
    # trained for this category; confirm against the training pipeline.
    category_id = 2000000
    # Reuse the training id maps so test features/categories share indices.
    sfm_test = SampleFeatureMatrix(sfm_train.get_category_id_map(),
                                   sfm_train.get_feature_id_map())
    features = cfm_train.get_features(category_id)
    for sample_id in samples_test.tsm.sample_matrix():
        (sample_category, sample_terms, term_map) = \
            samples_test.tsm.get_sample_row(sample_id)
        category_1_id = Categories.get_category_1_id(sample_category)
        sfm_test.set_sample_category(sample_id, category_1_id)
        # Keep only model features that actually occur in this sample.
        for feature_id in features:
            if feature_id in term_map:
                feature_weight = features[feature_id]
                sfm_test.add_sample_feature(sample_id, feature_id,
                                            feature_weight)

    logging.debug(Logger.debug(
        "train sample feature matrix - features:%d categories:%d" % (
            sfm_train.get_num_features(), sfm_train.get_num_categories())))
    X_train, y_train = sfm_train.to_sklearn_data()

    logging.debug(Logger.debug(
        "test sample feature matrix - features:%d categories:%d" % (
            sfm_test.get_num_features(), sfm_test.get_num_categories())))
    X_test, y_test = sfm_test.to_sklearn_data()

    clf = Classifier()
    logging.debug(Logger.debug("Classifier training ..."))
    clf.train(X_train, y_train)
    logging.debug(Logger.debug("Classifier predicting ..."))

    # Build human-readable "name(id)" labels ordered by category index.
    categories = samples_test.get_categories()
    categories_1_names = []
    categories_1_idx_map = {}
    categories_1_idlist = categories.get_categories_1_idlist()
    for category_id in categories_1_idlist:
        category_idx = sfm_test.get_category_idx(category_id)
        category_name = categories.get_category_name(category_id)
        categories_1_idx_map[category_idx] = (category_id, category_name)
    categories_1_idx_list = sorted_dict(categories_1_idx_map)
    for (category_idx, (category_id, category_name)) in categories_1_idx_list:
        categories_1_names.append("%s(%d)" % (category_name, category_id))

    clf.predict(X_test, y_test, categories_1_names)