def count_data(path, subreddit):
    pages_dir = os.path.join(path, 'pages')
    subreddits_dir = os.path.join(path, 'subreddits')

    if subreddit == 'all':
        return len(list(search_files(pages_dir)))
    else:
        return len(get_subreddit_files(pages_dir, subreddits_dir, subreddit))
def load_data_source(data_path, subreddit, page_samples, seed=None, relative=True):
    """
    Generates a dictionary of labelled and unlabelled pages from a Reddit
    Data Source, as specified by the specification on the GitHub wiki.

    :param data_path: path to a Reddit Data Source.
    :param subreddit: labelled subreddit which is to be targeted.
    :param page_samples: number of random unlabelled page samples to use.
    :param seed: seed for the pseudo-random generator.
    :param relative: return relative or absolute paths.
    :return: dictionary mapping page path to label.
    """
    pages_dir = os.path.join(data_path, 'pages')
    subreddits_dir = os.path.join(data_path, 'subreddits')
    sr_path = os.path.join(subreddits_dir, subreddit)

    random.seed(seed)

    # Dictionary of all available instances
    data = {}

    # Add pages from subreddit JSON files
    for json_file in os.listdir(sr_path):
        with open(os.path.join(sr_path, json_file)) as fp:
            post_data = json.load(fp)

        for post in post_data['data']['children']:
            # Only interested in link posts (but they should all be links)
            if post['kind'] == 't3':
                url_path = get_path_from_url(pages_dir, post['data']['url'])
                if relative:
                    url_path = os.path.relpath(url_path, pages_dir)
                data[url_path] = subreddit

    # Add a random sample from the pages directory. Sorting the candidates
    # keeps the sample reproducible for a given seed and avoids sampling
    # directly from a set, which newer Python versions do not allow.
    remaining = set(search_files(pages_dir, relative=relative)) - set(data.keys())
    for url_path in random.sample(sorted(remaining), page_samples):
        data[url_path] = None  # Unlabelled data

    return data
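# A minimal usage sketch for load_data_source, assuming a local Reddit Data
# Source checkout; the data path, subreddit and sample size below are
# hypothetical placeholders that only illustrate the expected call pattern
# and the returned path -> label mapping.
def _example_load_data_source():
    data = load_data_source('/path/to/Reddit-Testing-Data', 'python',
                            page_samples=500, seed=42)
    labelled = sum(1 for label in data.values() if label is not None)
    print('%d labelled pages, %d unlabelled pages'
          % (labelled, len(data) - labelled))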
def clean_data(path):
    pages_dir = os.path.join(path, 'pages')
    subreddits_dir = os.path.join(path, 'subreddits')

    all_pages = set(search_files(pages_dir))

    # Collect every page referenced by at least one subreddit JSON file.
    referenced = set()
    for subreddit in os.listdir(subreddits_dir):
        for file_path in get_subreddit_files(pages_dir, subreddits_dir, subreddit):
            referenced.add(file_path)

    # Remove unreferenced pages and any directories left empty afterwards.
    count = 0
    unreferenced = all_pages - referenced
    for page_path in unreferenced:
        if os.path.exists(page_path):
            print('Removing: %s' % page_path)
            os.remove(page_path)
            parent = os.path.dirname(page_path)
            count += 1
            if len(os.listdir(parent)) == 0:
                os.removedirs(parent)

    return count
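# A maintenance sketch combining count_data and clean_data, again assuming a
# hypothetical data source path: report the number of pages referenced by one
# subreddit, then prune pages no longer referenced by any subreddit.
def _example_clean_data_source():
    n_pages = count_data('/path/to/Reddit-Testing-Data', 'python')
    removed = clean_data('/path/to/Reddit-Testing-Data')
    print('%d python pages, %d unreferenced pages removed' % (n_pages, removed))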
def run_wrfda_3dvar(work_root, wrfda_root, config, args, wrf_work_dir=None, force=False, tag=None, fg=None):
    start_time = config['custom']['start_time']
    datetime_fmt = 'YYYY-MM-DD_HH:mm:ss'
    start_time_str = start_time.format(datetime_fmt)
    max_dom = config['domains']['max_dom']

    if not wrf_work_dir:
        if tag != None:
            wrf_work_dir = f'{work_root}/wrf_{tag}'
        else:
            wrf_work_dir = f'{work_root}/wrf'

    if tag != None:
        obsproc_work_dir = f'{work_root}/wrfda_{tag}/obsproc'
    else:
        obsproc_work_dir = f'{work_root}/wrfda/obsproc'

    if max_dom > 1:
        dom_str = 'd' + str(config['custom']['wrfda']['dom'] + 1).zfill(2)
        if tag != None:
            wrfda_work_dir = f'{work_root}/wrfda_{tag}/{dom_str}'
        else:
            wrfda_work_dir = f'{work_root}/wrfda/{dom_str}'
    else:
        dom_str = 'd01'
        if tag != None:
            wrfda_work_dir = f'{work_root}/wrfda_{tag}'
        else:
            wrfda_work_dir = f'{work_root}/wrfda'

    if not os.path.isdir(wrfda_work_dir):
        os.mkdir(wrfda_work_dir)
    os.chdir(wrfda_work_dir)

    cli.stage(f'Run da_wrfvar.exe at {wrfda_work_dir} ...')

    if os.path.isfile(f'wrfvar_output_{start_time_str}') and not args.force and not force:
        run(f'ls -l wrfvar_output_{start_time_str}')
        cli.notice(f'wrfvar_output_{start_time_str} already exists.')
        return

    run(f'ln -sf {wrfda_root}/run/LANDUSE.TBL {wrfda_work_dir}')

    if not os.path.isfile('namelist.input'):
        cli.error('namelist.input has not been generated! Run config_wrfda.py.')

    # BE matrix
    if 'cv_options' in config['wrfvar7']:
        be_work_dir = os.path.dirname(os.path.abspath(work_root)) + '/be/' + dom_str
        if not os.path.isdir(be_work_dir):
            be_work_dir = os.path.dirname(os.path.abspath(work_root)) + '/../be/' + dom_str

        if config['wrfvar7']['cv_options'] == 5:
            if not os.path.isfile(f'{be_work_dir}/be.dat.cv5'):
                cli.error(f'BE matrix {be_work_dir}/be.dat.cv5 does not exist!')
            run(f'ln -sf {be_work_dir}/be.dat.cv5 be.dat')
        elif config['wrfvar7']['cv_options'] == 6:
            if not os.path.isfile(f'{be_work_dir}/be.dat.cv6'):
                cli.error(f'BE matrix {be_work_dir}/be.dat.cv6 does not exist!')
            run(f'ln -sf {be_work_dir}/be.dat.cv6 be.dat')
        elif config['wrfvar7']['cv_options'] == 7:
            if not os.path.isfile(f'{be_work_dir}/be.dat.cv7'):
                cli.error(f'BE matrix {be_work_dir}/be.dat.cv7 does not exist!')
            run(f'ln -sf {be_work_dir}/be.dat.cv7 be.dat')
    if not os.path.exists('./be.dat'):
        run(f'ln -sf {wrfda_root}/var/run/be.dat.cv3 be.dat')

    # First guess
    # TODO: Assume there is only one domain to be assimilated.
    if fg != None:
        run(f'ln -sf {fg} {wrfda_work_dir}/fg')
    else:
        expected_files = ['{}/wrfout_d{:02d}_{}'.format(wrf_work_dir, i + 1, start_time_str) for i in range(max_dom)]
        if check_files(expected_files):
            run(f'ln -sf {wrf_work_dir}/wrfout_{dom_str}_{start_time_str} {wrfda_work_dir}/fg')
        else:
            expected_files = ['{}/wrfinput_d{:02d}_{}'.format(wrf_work_dir, i + 1, start_time_str) for i in range(max_dom)]
            if not check_files(expected_files):
                cli.error('real.exe or da_update_bc.exe wasn\'t executed successfully!')
            run(f'ln -sf {wrf_work_dir}/wrfinput_{dom_str}_{start_time_str} {wrfda_work_dir}/fg')

    # Observation data
    if config['custom']['wrfda']['type'] == '3dvar':
        if 'use_radarobs' in config['wrfvar4'] and config['wrfvar4']['use_radarobs']:
            # Radar data
            run('rm -f ob.*')
            for obs_radar_file in glob(f'{args.littler_root}/{start_time.format("YYYYMMDD")}/obs.radar.*'):
                radar_time = pendulum.from_format(os.path.basename(obs_radar_file).split('.')[2], 'YYYYMMDDHHmm')
                if radar_time == start_time:
                    run(f'ln -sf {obs_radar_file} ob.radar')
            if os.path.isfile(f'wrfvar_output_{start_time_str}'):
                cli.notice('Use previous analysis data as the background.')
                run(f'mv wrfvar_output_{start_time_str} wrfvar_output_conv_{start_time_str}')
                run(f'ln -sf wrfvar_output_conv_{start_time_str} fg')
        elif 'conv_obs' in config['custom']:
            if 'dir_pattern' in config['custom']['conv_obs']:
                obs_dir = Template(config['custom']['conv_obs']['dir_pattern']).render(obs_time=start_time)
            if 'file_pattern' in config['custom']['conv_obs']:
                obs_file = Template(config['custom']['conv_obs']['file_pattern']).render(obs_time=start_time)
            if config['wrfvar3']['ob_format'] == 1:
                run(f'ln -sf {args.prepbufr_root}/{obs_dir}/{obs_file} ob.bufr')
            elif config['wrfvar3']['ob_format'] == 2:
                run(f'ln -sf {args.prepbufr_root}/{obs_dir}/{obs_file} ob.ascii')
        elif config['wrfvar3']['ob_format'] == 2 and os.path.isfile(f'{obsproc_work_dir}/obs_gts_{start_time.format(datetime_fmt)}.3DVAR'):
            # LITTLE_R conventional data
            run(f'ln -sf {obsproc_work_dir}/obs_gts_{start_time.format(datetime_fmt)}.3DVAR ob.ascii')
        elif config['wrfvar3']['ob_format'] == 1 and config['custom']['wrfda']['prepbufr_source'] == 'gdas':
            # PREPBUFR conventional data
            gdas_file_path = f'{args.prepbufr_root}/gdas.{start_time.format("YYYYMMDD")}/gdas.t{start_time.hour:02}z.prepbufr.nr'
            if not os.path.isfile(gdas_file_path):
                cli.error(f'{gdas_file_path} does not exist!')
            run(f'ln -sf {gdas_file_path} ob.bufr')

    if os.path.isfile(f'{wrfda_work_dir}/wrfvar_output_{start_time_str}') and not args.force:
        cli.notice(f'{wrfda_work_dir}/wrfvar_output_{start_time_str} already exists.')
        return

    submit_job(f'{wrfda_root}/var/build/da_wrfvar.exe', min(20, args.np), config, args, wait=True)

    expected_files = ['wrfvar_output', 'statistics']
    if not check_files(expected_files):
        # Check whether the failure was caused by parallel execution, e.g. cv_options being zero in some process.
        if search_files('rsl.error.*', 'Invalid CV option chosen: cv_options = 0'):
            cli.warning('Failed to run da_wrfvar.exe in parallel. Try to run in serial.')
            submit_job(f'{wrfda_root}/var/build/da_wrfvar.exe', 1, config, args, wait=True)
            if not check_files(expected_files):
                cli.error(f'Still failed! See {wrfda_work_dir}/rsl.error.0000.')
        else:
            cli.error(f'Failed! See {wrfda_work_dir}/rsl.error.0000.')
    else:
        print(open('statistics').read())

    run(f'ncl -Q {scripts_root}/../plots/plot_cost_grad_fn.ncl')
    run(f'cp wrfvar_output wrfvar_output_{start_time_str}')
    cli.notice('Succeeded.')
def execute_operation(self):
    # Call different functions depending on the radio-button selection and
    # build the dictionaries used when generating the report.
    folder = self.katalog_entry.get()
    ext = self.ext_entry.get()
    keyword = self.keyword_entry.get()
    date = self.date_entry.get()

    if self.radiovar.get() == 1:
        if folder:
            if os.path.isdir(folder):
                list_tmp = utils.find_all_files(folder)
                self.match_hashset += utils.verify_files(list_tmp)
                utils.create_dict(folder, self.allfiles, list_tmp)
                self.display_results(list_tmp)
            else:
                tkinter.messagebox.showerror('Error', 'This is not a valid directory!')
        else:
            tkinter.messagebox.showerror('Error', 'You must specify a directory!')
    elif self.radiovar.get() == 2:
        if folder and ext:
            if os.path.isdir(folder):
                list_tmp = utils.find_specific_files(folder, ext)
                self.match_hashset += utils.verify_files(list_tmp)
                utils.create_dict(ext, self.specificfiles, list_tmp)
                self.display_results(list_tmp)
            else:
                tkinter.messagebox.showerror('Error', 'This is not a valid directory!')
        else:
            tkinter.messagebox.showerror('Error', 'You must specify both a directory and a file extension!')
    elif self.radiovar.get() == 3:
        if folder and ext and keyword:
            if os.path.isdir(folder):
                list_tmp = utils.search_files(folder, ext, keyword)
                self.match_hashset += utils.verify_files(list_tmp)
                utils.create_dict(keyword, self.infofiles, list_tmp)
                self.display_results(list_tmp)
            else:
                tkinter.messagebox.showerror('Error', 'This is not a valid directory!')
        else:
            tkinter.messagebox.showerror('Error!', 'You must specify a directory, a file extension and a keyword!')
    elif self.radiovar.get() == 4:
        if folder and date:
            if os.path.isdir(folder):
                list_tmp = utils.find_modified_files(folder, date)
                self.match_hashset += utils.verify_files(list_tmp)
                utils.create_dict(date, self.datefiles, list_tmp)
                self.display_results(list_tmp)
            else:
                tkinter.messagebox.showerror('Error', 'This is not a valid directory!')
        else:
            tkinter.messagebox.showerror('Error!', 'You must specify a directory and a date!')
def main():
    allfiles = dict()
    specificfiles = dict()
    infofiles = dict()
    datefiles = dict()
    match_hashset = list()

    while True:
        print("\n")
        print("################################################")
        print("# [1]Search [2]Encryption [3]File Difference #")
        print("# [4]System Info [5]Generate report #")
        print('# q or "exit" to exit #')
        print("################################################")
        ch = input("$ ")

        # Search in files
        if ch == "1":
            while True:
                print("\n")
                print("##########################################")
                print("# [1] Find all files [2] File Extension #")
                print("# [3] By date [4] Search in files #")
                print('# q or "back" to go back #')
                print("##########################################")
                ch2 = input("$ ")

                if ch2 == "1":
                    path = input("$ Path to folder: ")
                    if path == "q" or path == "back":
                        break
                    list_tmp = utils.find_all_files(path)
                    utils.create_dict(path, allfiles, list_tmp)
                    match_hashset += utils.verify_files(list_tmp)
                    print_results(list_tmp)

                if ch2 == "2":
                    ext = input("$ Extension: ")
                    if ext == "q" or ext == "back":
                        break
                    folder = input("$ Path to folder: ")
                    if folder == "q" or folder == "back":
                        break
                    list_tmp = utils.find_specific_files(folder, ext)
                    utils.create_dict(ext, specificfiles, list_tmp)
                    match_hashset += utils.verify_files(list_tmp)
                    print_results(list_tmp)

                if ch2 == "3":
                    folder = input("$ Path to folder: ")
                    if folder == "q" or folder == "back":
                        break
                    date = input("$ Date (Ex format: 2020-03-03): ")
                    if date == "q" or date == "back":
                        break
                    list_tmp = utils.find_modified_files(folder, date)
                    utils.create_dict(date, datefiles, list_tmp)
                    match_hashset += utils.verify_files(list_tmp)
                    print_results(list_tmp)

                if ch2 == "4":
                    folder = input("$ Path to folder: ")
                    if folder == "q" or folder == "back":
                        break
                    ext = input("$ Extension: ")
                    if ext == "q" or ext == "back":
                        break
                    keyword = input("$ Keyword: ")
                    if keyword == "q" or keyword == "back":
                        break
                    list_tmp = utils.search_files(folder, ext, keyword)
                    utils.create_dict(keyword, infofiles, list_tmp)
                    match_hashset += utils.verify_files(list_tmp)
                    print_results(list_tmp)

                if ch2 == "q" or ch2 == "back":
                    break

        # Encryption
        if ch == "2":
            while True:
                print("\n")
                print("###########################")
                print("# [1] Encrypt [2] Decrypt #")
                print('# q or "back" to go back #')
                print("###########################")
                ch2 = input("$ ")

                if ch2 == "1":
                    filename = input("$ Path to file: ")
                    if filename == "q" or filename == "back":
                        break
                    utils.encrypt_file(filename)
                    print(filename + " has been encrypted.")

                if ch2 == "2":
                    filename = input("$ Path to file: ")
                    if filename == "q" or filename == "back":
                        break
                    utils.decrypt_file(filename)
                    print(filename + " has been decrypted.")

                if ch2 == "q" or ch2 == "back":
                    break

        # File Difference
        if ch == "3":
            while True:
                print("\n")
                print(' q or "back" to go back')
                file1 = input("$ File 1: ")
                if file1 == "q" or file1 == "back":
                    break
                file2 = input("$ File 2: ")
                if file2 == "q" or file2 == "back":
                    break
                file1_diff, file2_diff = utils.word_difference(file1, file2)
                print()
                print("Words in file 1, but not in file 2:")
                print_results(file1_diff)
                print("Words in file 2, but not in file 1:")
                print_results(file2_diff)

        # System info
        if ch == "4":
            print_results(utils.system_information())

        # Generate report
        if ch == "5":
            dictionary = dict()
            dictionary['sys'] = utils.system_information()
            dictionary['hashset'] = match_hashset
            dictionary['allfiles'] = allfiles
            dictionary['extfiles'] = specificfiles
            dictionary['infofiles'] = infofiles
            dictionary['datefiles'] = datefiles
            utils.gen_report(dictionary)
            print("The report has been generated!")

        if ch == "q" or ch == "exit":
            print("\n")
            print(" Cya! ")
            print("\n")
            break
# Set the program parameters here
force_reindex = False
parameters = {
    'samples': 800,
    'subreddit': 'python',
    'min_frequency': 2,
    'stemmer': str(_stemmer),
    'data_path': data_path,
    'mode': 'tfidf',
}

save_path = '/home/michaela/Development/%s_sr.json.bz2' % parameters['subreddit']

print(parameters)
print('Available pages: ', len(list(search_files(os.path.join(data_path, 'pages')))))

sr_index = HashedIndex()

if os.path.exists(save_path):
    meta = load_meta(save_path, compressed=True)
else:
    meta = None
    force_reindex = True

if force_reindex or meta['parameters'] != parameters:
    print('State File Parameters out of date. Re-Indexing...')

    t0 = time.time()
    sr_index.clear()
import os
import nltk
import json
import time
import bz2

from matplotlib import pyplot as plt

from textparser import word_tokenize
from utils import search_files, load_db_params
from index.hashedindex import HashedIndex
from index.wikiindex import WikiIndex

data_path = '/home/michaela/Development/Reddit-Testing-Data'
pages_path = os.path.join(data_path, 'pages')
available_pages = set(search_files(pages_path))
sample_sizes = (400, 600, 800, 1000, 1200)


def test_wiki_index(cache_name, label, color, sample_set, sample_sizes,
                    word_concepts_ags, db_params, n_concepts=10):
    dimensions = []
    runtimes = []

    if not os.path.exists('.cache'):
        os.mkdir('.cache')

    for n_samples in sample_sizes:
        # Check if a cached version is available
        file_name = '.cache/%s_%d.json.bz2' % (cache_name, n_samples)
        if os.path.exists(file_name):
            print('Found cached version of %s (%d)' % (cache_name, n_samples))