def count_data(path, subreddit):
    """Count the available page files: all pages on disk when subreddit is
    'all', otherwise only those referenced by the given subreddit."""
    pages_dir = os.path.join(path, 'pages')
    subreddits_dir = os.path.join(path, 'subreddits')

    if subreddit == 'all':
        return len(list(search_files(pages_dir)))
    else:
        return len(get_subreddit_files(pages_dir, subreddits_dir, subreddit))
def load_data_source(data_path,
                     subreddit,
                     page_samples,
                     seed=None,
                     relative=True):
    """
    Generates a dictionary of labeled and unlabelled pages from a Reddit
    Data Source as specified by the specification on the github Wiki.
    :param data_path: path to a Reddit Data Source.
    :param subreddit: labeled subreddit which is to be targeted.
    :param page_samples: number of random unlabelled page samples to use.
    :param seed: seed for the pseudo random generator.
    :param relative: relative or absolute paths.
    :return: dictionary of (label, path)
    """
    pages_dir = os.path.join(data_path, 'pages')
    subreddits_dir = os.path.join(data_path, 'subreddits')
    sr_path = os.path.join(subreddits_dir, subreddit)

    random.seed(seed)

    # Dictionary of all available instances
    data = {}

    # Add pages from subreddit JSON file
    for json_file in os.listdir(sr_path):
        with open(os.path.join(sr_path, json_file)) as fp:
            post_data = json.load(fp)

        for post in post_data['data']['children']:
            # Only interested in link posts (kind 't3'), though they should all be links
            if post['kind'] == 't3':
                url_path = get_path_from_url(pages_dir, post['data']['url'])
                if relative:
                    url_path = os.path.relpath(url_path, pages_dir)
                data[url_path] = subreddit

    # Add a random sample from the pages directory, excluding pages that are
    # already labelled. random.sample() requires a sequence rather than a set
    # as of Python 3.11, so sort first; this also makes the sample
    # reproducible for a given seed.
    remaining = set(search_files(pages_dir, relative=relative)) - set(
        data.keys())
    for url_path in random.sample(sorted(remaining), page_samples):
        data[url_path] = None  # Unlabelled data

    return data
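
A minimal usage sketch (the path and subreddit below are hypothetical placeholders): the returned dictionary maps each page path to its label, with None marking the randomly sampled unlabelled pages.

data = load_data_source('/data/reddit-source', 'python', page_samples=500, seed=42)
labelled = [path for path, label in data.items() if label is not None]
unlabelled = [path for path, label in data.items() if label is None]
print('%d labelled, %d unlabelled' % (len(labelled), len(unlabelled)))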
def clean_data(path):
    """Remove page files not referenced by any subreddit and return the
    number of files removed."""
    pages_dir = os.path.join(path, 'pages')
    subreddits_dir = os.path.join(path, 'subreddits')

    all_pages = set(search_files(pages_dir))
    referenced = set()

    for subreddit in os.listdir(subreddits_dir):
        for file_path in get_subreddit_files(pages_dir, subreddits_dir, subreddit):
            referenced.add(file_path)

    count = 0
    unreferenced = all_pages - referenced
    for page_path in unreferenced:
        if os.path.exists(page_path):
            print('Removing: %s' % page_path)
            os.remove(page_path)
            count += 1

            # Prune the parent directory (and any empty ancestors) once it
            # no longer contains any files.
            parent = os.path.dirname(page_path)
            if len(os.listdir(parent)) == 0:
                os.removedirs(parent)

    return count
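
A hedged sketch of how count_data and clean_data might compose (the path is a placeholder): clean_data deletes every page file that no subreddit JSON references and returns how many it removed.

before = count_data('/data/reddit-source', 'all')
removed = clean_data('/data/reddit-source')
print('%d of %d pages were unreferenced and removed' % (removed, before))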
Example #5
def run_wrfda_3dvar(work_root,
                    wrfda_root,
                    config,
                    args,
                    wrf_work_dir=None,
                    force=False,
                    tag=None,
                    fg=None):
    start_time = config['custom']['start_time']
    datetime_fmt = 'YYYY-MM-DD_HH:mm:ss'
    start_time_str = start_time.format(datetime_fmt)
    max_dom = config['domains']['max_dom']

    if not wrf_work_dir:
        if tag is not None:
            wrf_work_dir = f'{work_root}/wrf_{tag}'
        else:
            wrf_work_dir = f'{work_root}/wrf'

    if tag is not None:
        obsproc_work_dir = f'{work_root}/wrfda_{tag}/obsproc'
    else:
        obsproc_work_dir = f'{work_root}/wrfda/obsproc'

    if max_dom > 1:
        dom_str = 'd' + str(config['custom']['wrfda']['dom'] + 1).zfill(2)
        if tag is not None:
            wrfda_work_dir = f'{work_root}/wrfda_{tag}/{dom_str}'
        else:
            wrfda_work_dir = f'{work_root}/wrfda/{dom_str}'
    else:
        dom_str = 'd01'
        if tag is not None:
            wrfda_work_dir = f'{work_root}/wrfda_{tag}'
        else:
            wrfda_work_dir = f'{work_root}/wrfda'
    if not os.path.isdir(wrfda_work_dir):
        os.mkdir(wrfda_work_dir)
    os.chdir(wrfda_work_dir)

    cli.stage(f'Run da_wrfvar.exe at {wrfda_work_dir} ...')

    if os.path.isfile(f'wrfvar_output_{start_time_str}') and not args.force and not force:
        run(f'ls -l wrfvar_output_{start_time_str}')
        cli.notice(f'wrfvar_output_{start_time_str} already exists.')
        return

    run(f'ln -sf {wrfda_root}/run/LANDUSE.TBL {wrfda_work_dir}')

    if not os.path.isfile('namelist.input'):
        cli.error('namelist.input has not been generated! Run config_wrfda.py.')

    # BE (background error) matrix: link the matching be.dat.cv{5,6,7} when
    # cv_options is set, otherwise fall back to the default CV3 matrix.
    if 'cv_options' in config['wrfvar7']:
        be_work_dir = os.path.dirname(os.path.abspath(work_root)) + '/be/' + dom_str
        if not os.path.isdir(be_work_dir):
            be_work_dir = os.path.dirname(os.path.abspath(work_root)) + '/../be/' + dom_str

        cv_options = config['wrfvar7']['cv_options']
        if cv_options in (5, 6, 7):
            be_file = f'{be_work_dir}/be.dat.cv{cv_options}'
            if not os.path.isfile(be_file):
                cli.error(f'BE matrix {be_file} does not exist!')
            run(f'ln -sf {be_file} be.dat')
    if not os.path.exists('./be.dat'):
        run(f'ln -sf {wrfda_root}/var/run/be.dat.cv3 be.dat')

    # First guess
    # TODO: Assume there is only one domain to be assimilated.
    if fg is not None:
        run(f'ln -sf {fg} {wrfda_work_dir}/fg')
    else:
        expected_files = [
            '{}/wrfout_d{:02d}_{}'.format(wrf_work_dir, i + 1, start_time_str)
            for i in range(max_dom)
        ]
        if check_files(expected_files):
            run(f'ln -sf {wrf_work_dir}/wrfout_{dom_str}_{start_time_str} {wrfda_work_dir}/fg')
        else:
            expected_files = [
                '{}/wrfinput_d{:02d}_{}'.format(wrf_work_dir, i + 1, start_time_str)
                for i in range(max_dom)
            ]
            if not check_files(expected_files):
                cli.error("real.exe or da_update_bc.exe wasn't executed successfully!")
            run(f'ln -sf {wrf_work_dir}/wrfinput_{dom_str}_{start_time_str} {wrfda_work_dir}/fg')

    # Observation data
    if config['custom']['wrfda']['type'] == '3dvar':
        if 'use_radarobs' in config['wrfvar4'] and config['wrfvar4']['use_radarobs']:
            # Radar data
            run('rm -f ob.*')
            for obs_radar_file in glob(f'{args.littler_root}/{start_time.format("YYYYMMDD")}/obs.radar.*'):
                radar_time = pendulum.from_format(
                    os.path.basename(obs_radar_file).split('.')[2], 'YYYYMMDDHHmm')
                if radar_time == start_time:
                    run(f'ln -sf {obs_radar_file} ob.radar')
            if os.path.isfile(f'wrfvar_output_{start_time_str}'):
                cli.notice('Use previous analysis data as the background.')
                run(f'mv wrfvar_output_{start_time_str} wrfvar_output_conv_{start_time_str}')
                run(f'ln -sf wrfvar_output_conv_{start_time_str} fg')
        elif 'conv_obs' in config['custom']:
            if 'dir_pattern' in config['custom']['conv_obs']:
                obs_dir = Template(config['custom']['conv_obs']['dir_pattern']).render(obs_time=start_time)
            if 'file_pattern' in config['custom']['conv_obs']:
                obs_file = Template(config['custom']['conv_obs']['file_pattern']).render(obs_time=start_time)
            if config['wrfvar3']['ob_format'] == 1:
                run(f'ln -sf {args.prepbufr_root}/{obs_dir}/{obs_file} ob.bufr')
            elif config['wrfvar3']['ob_format'] == 2:
                run(f'ln -sf {args.prepbufr_root}/{obs_dir}/{obs_file} ob.ascii')
        elif config['wrfvar3']['ob_format'] == 2 and os.path.isfile(
                f'{obsproc_work_dir}/obs_gts_{start_time.format(datetime_fmt)}.3DVAR'):
            # LITTLE_R conventional data
            run(f'ln -sf {obsproc_work_dir}/obs_gts_{start_time.format(datetime_fmt)}.3DVAR ob.ascii')
        elif config['wrfvar3']['ob_format'] == 1 and config['custom']['wrfda']['prepbufr_source'] == 'gdas':
            # PREPBUFR conventional data
            gdas_file_path = f'{args.prepbufr_root}/gdas.{start_time.format("YYYYMMDD")}/gdas.t{start_time.hour:02}z.prepbufr.nr'
            if not os.path.isfile(gdas_file_path):
                cli.error(f'{gdas_file_path} does not exist!')
            run(f'ln -sf {gdas_file_path} ob.bufr')

    if os.path.isfile(f'{wrfda_work_dir}/wrfvar_output_{start_time_str}') and not args.force:
        cli.notice(f'{wrfda_work_dir}/wrfvar_output_{start_time_str} already exists.')
        return

    submit_job(f'{wrfda_root}/var/build/da_wrfvar.exe',
               min(20, args.np),
               config,
               args,
               wait=True)

    expected_files = ['wrfvar_output', 'statistics']
    if not check_files(expected_files):
        # The failure may be caused by the parallel run (e.g. cv_options
        # ending up as zero in some processes), so retry in serial.
        if search_files('rsl.error.*', 'Invalid CV option chosen:  cv_options =    0'):
            cli.warning('Failed to run da_wrfvar.exe in parallel. Trying to run in serial.')
            submit_job(f'{wrfda_root}/var/build/da_wrfvar.exe', 1, config, args, wait=True)
            if not check_files(expected_files):
                cli.error(f'Still failed! See {wrfda_work_dir}/rsl.error.0000.')
        else:
            cli.error(f'Failed! See {wrfda_work_dir}/rsl.error.0000.')
    else:
        print(open('statistics').read())
        run(f'ncl -Q {scripts_root}/../plots/plot_cost_grad_fn.ncl')
        run(f'cp wrfvar_output wrfvar_output_{start_time_str}')
        cli.notice('Succeeded.')
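
For reference, a minimal sketch of the configuration keys run_wrfda_3dvar reads; the values below are illustrative assumptions, not recommended settings.

config = {
    'custom': {
        'start_time': pendulum.datetime(2020, 1, 1),  # hypothetical date
        'wrfda': {'type': '3dvar', 'dom': 0, 'prepbufr_source': 'gdas'},
    },
    'domains': {'max_dom': 1},
    'wrfvar3': {'ob_format': 1},
    'wrfvar4': {'use_radarobs': False},
    'wrfvar7': {'cv_options': 5},
}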
Example #6
    def execute_operation(self):
        # Call different functions depending on the radio button selection
        # and build dictionaries for the report generation.
        folder = self.katalog_entry.get()
        ext = self.ext_entry.get()
        keyword = self.keyword_entry.get()
        date = self.date_entry.get()

        if self.radiovar.get() == 1:
            if folder:
                if os.path.isdir(folder):
                    list_tmp = utils.find_all_files(folder)
                    self.match_hashset += utils.verify_files(list_tmp)
                    utils.create_dict(folder, self.allfiles, list_tmp)
                    self.display_results(list_tmp)
                else:
                    tkinter.messagebox.showerror(
                        'Error', 'This is not a valid directory!')
            else:
                tkinter.messagebox.showerror(
                    'Error', 'You must specify a directory!')

        elif self.radiovar.get() == 2:
            if folder and ext:
                if os.path.isdir(folder):
                    list_tmp = utils.find_specific_files(folder, ext)
                    self.match_hashset += utils.verify_files(list_tmp)
                    utils.create_dict(ext, self.specificfiles, list_tmp)
                    self.display_results(list_tmp)
                else:
                    tkinter.messagebox.showerror(
                        'Error', 'This is not a valid directory!')
            else:
                tkinter.messagebox.showerror(
                    'Error', 'You must specify both a directory and a file extension!')

        elif self.radiovar.get() == 3:
            if folder and ext and keyword:
                if os.path.isdir(folder):
                    list_tmp = utils.search_files(folder, ext, keyword)
                    self.match_hashset += utils.verify_files(list_tmp)
                    utils.create_dict(keyword, self.infofiles, list_tmp)
                    self.display_results(list_tmp)
                else:
                    tkinter.messagebox.showerror(
                        'Error', 'This is not a valid directory!')
            else:
                tkinter.messagebox.showerror(
                    'Error', 'You must specify a directory, file extension and keyword!')

        elif self.radiovar.get() == 4:
            if folder and date:
                if os.path.isdir(folder):
                    list_tmp = utils.find_modified_files(folder, date)
                    self.match_hashset += utils.verify_files(list_tmp)
                    utils.create_dict(date, self.datefiles, list_tmp)
                    self.display_results(list_tmp)
                else:
                    tkinter.messagebox.showerror(
                        'Error', 'This is not a valid directory!')
            else:
                tkinter.messagebox.showerror(
                    'Error', 'You must specify a directory and a date!')
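
The four radio-button branches above share the same shape and differ only in validation and the utils call; a hedged refactor sketch (error dialogs elided, same utils signatures assumed) could drive them from a table:

        # Sketch only: radio value -> (required inputs, finder, target dict, key).
        operations = {
            1: ((folder,), lambda: utils.find_all_files(folder), self.allfiles, folder),
            2: ((folder, ext), lambda: utils.find_specific_files(folder, ext), self.specificfiles, ext),
            3: ((folder, ext, keyword), lambda: utils.search_files(folder, ext, keyword), self.infofiles, keyword),
            4: ((folder, date), lambda: utils.find_modified_files(folder, date), self.datefiles, date),
        }
        required, finder, target, key = operations[self.radiovar.get()]
        if all(required) and os.path.isdir(folder):
            list_tmp = finder()
            self.match_hashset += utils.verify_files(list_tmp)
            utils.create_dict(key, target, list_tmp)
            self.display_results(list_tmp)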
Example #8
def main():
    allfiles = dict()
    specificfiles = dict()
    infofiles = dict()
    datefiles = dict()
    match_hashset = list()

    while True:
        print("\n")
        print("################################################")
        print("# [1]Search  [2]Encryption  [3]File Difference #")
        print("# [4]System Info [5]Generate report            #")
        print('#  q or "exit" to exit                         #')
        print("################################################")
        ch = input("$ ")

        # Search in files
        if ch == "1":
            while True:
                print("\n")
                print("##########################################")
                print("# [1] Find all files [2] File Extension  #")
                print("# [3] By date        [4] Search in files #")
                print('#  q or "back" to go back                #')
                print("##########################################")
                ch2 = input("$ ")

                if ch2 == "1":
                    path = input("$ Path to folder: ")
                    if path == "q" or path == "back":
                        break
                    list_tmp = utils.find_all_files(path)
                    utils.create_dict(path, allfiles, list_tmp)
                    match_hashset += utils.verify_files(list_tmp)
                    print_results(list_tmp)

                if ch2 == "2":
                    ext = input("$ Extension: ")
                    if ext == "q" or ext == "back":
                        break

                    folder = input("$ Path to folder: ")
                    if folder == "q" or folder == "back":
                        break
                    list_tmp = utils.find_specific_files(folder, ext)
                    utils.create_dict(ext, specificfiles, list_tmp)
                    match_hashset += utils.verify_files(list_tmp)
                    print_results(list_tmp)

                if ch2 == "3":
                    folder = input("$ Path to folder: ")
                    if folder == "q" or folder == "back":
                        break

                    date = input("$ Date (Ex format: 2020-03-03): ")
                    if date == "q" or date == "back":
                        break
                    list_tmp = utils.find_modified_files(folder, date)
                    utils.create_dict(date, datefiles, list_tmp)
                    match_hashset += utils.verify_files(list_tmp)
                    print_results(list_tmp)

                if ch2 == "4":
                    folder = input("$ Path to folder: ")
                    if folder == "q" or folder == "back":
                        break

                    ext = input("$ Extension: ")
                    if ext == "q" or ext == "back":
                        break

                    keyword = input("$ Keyword: ")
                    if keyword == "q" or keyword == "back":
                        break
                    list_tmp = utils.search_files(folder, ext, keyword)
                    utils.create_dict(keyword, infofiles, list_tmp)
                    match_hashset += utils.verify_files(list_tmp)
                    print_results(list_tmp)

                if ch2 == "q" or ch2 == "back":
                    break

        # Encryption
        if ch == "2":
            while True:
                print("\n")
                print("###########################")
                print("# [1] Encrypt [2] Decrypt #")
                print('#  q or "back" to go back #')
                print("###########################")
                ch2 = input("$ ")

                if ch2 == "1":
                    filename = input("$ Path to file: ")
                    if filename == "q" or filename == "back":
                        break

                    utils.encrypt_file(filename)
                    print(filename + " has been encrypted.")

                if ch2 == "2":
                    filename = input("$ Path to file: ")
                    if filename == "q" or filename == "back":
                        break

                    utils.decrypt_file(filename)
                    print(filename + "has been decrypted.")

                if ch2 == "q" or ch2 == "back":
                    break

        # File Difference
        if ch == "3":
            while True:
                print("\n")
                print(' q or "back" to go back')
                file1 = input("$ File 1: ")
                if file1 == "q" or file1 == "back":
                    break

                file2 = input("$ File 2: ")
                if file2 == "q" or file2 == "back":
                    break

                file1_diff, file2_diff = utils.word_difference(file1, file2)
                print()
                print("Words in file 1, but not in file 2:")
                print_results(file1_diff)
                print("Words in file 2, but not in file 1:")
                print_results(file2_diff)

        # System info
        if ch == "4":
            print_results(utils.system_information())

        if ch == "5":
            dictionary = dict()
            dictionary['sys'] = utils.system_information()
            dictionary['hashset'] = match_hashset
            dictionary['allfiles'] = allfiles
            dictionary['extfiles'] = specificfiles
            dictionary['infofiles'] = infofiles
            dictionary['datefiles'] = datefiles
            utils.gen_report(dictionary)
            print("The report has been generated!")

        if ch == "q" or ch == "exit":
            print("\n")
            print(" Cya! ")
            print("\n")
            break
    # Set the program parameters here.
    force_reindex = False
    parameters = {
        'samples': 800,
        'subreddit': 'python',
        'min_frequency': 2,
        'stemmer': str(_stemmer),
        'data_path': data_path,
        'mode': 'tfidf',
    }

    save_path = '/home/michaela/Development/%s_sr.json.bz2' % parameters['subreddit']

    print(parameters)
    print('Available pages: ', len(list(search_files(os.path.join(data_path, 'pages')))))

    sr_index = HashedIndex()

    if os.path.exists(save_path):
        meta = load_meta(save_path, compressed=True)
    else:
        meta = None
        force_reindex = True

    if force_reindex or meta['parameters'] != parameters:
        print('State File Parameters out of date. Re-Indexing...')

        t0 = time.time()
        sr_index.clear()
import os
import nltk
import json
import time
import bz2
from matplotlib import pyplot as plt

from textparser import word_tokenize
from utils import search_files, load_db_params
from index.hashedindex import HashedIndex
from index.wikiindex import WikiIndex

data_path = '/home/michaela/Development/Reddit-Testing-Data'
pages_path = os.path.join(data_path, 'pages')

available_pages = set(search_files(pages_path))
sample_sizes = (400, 600, 800, 1000, 1200)


def test_wiki_index(cache_name, label, color, sample_set, sample_sizes, word_concepts_ags, db_params, n_concepts=10):
    dimensions = []
    runtimes = []

    if not os.path.exists('.cache'):
        os.mkdir('.cache')

    for n_samples in sample_sizes:
        # Check if a cached version is available
        file_name = '.cache/%s_%d.json.bz2' % (cache_name, n_samples)
        if os.path.exists(file_name):
            print('Found cached version of %s (%d)' % (cache_name, n_samples))