Example #1
import pandas as pd

# pipeline modules from the same project
import add_features
import clean
import parse

def main():
    # Parse the original dataset
    print("Parsing of original data...")
    parse.main(original_file, parsed_file, prefix)
    # Create features from the dataset
    print("Creating additional features...")
    add_features.main(parsed_file, rain_dataset, added_features_file)

    # Merge the three files: parsed_file, added_features_file and
    # distances_features_file into merged_file
    print("Merging all features...")
    parsed_df = pd.read_csv(parsed_file)
    added_features_df = pd.read_csv(added_features_file)
    distances_features_df = pd.read_csv(distances_features_file)
    # how="left" replace missing ORSM values by NaNs (dunno how to replace by zeros)
    merged_df = pd.merge(parsed_df,
                         pd.merge(added_features_df,
                                  distances_features_df,
                                  on='id',
                                  how='left'),
                         on='id')
    merged_df.to_csv(merged_file, index=False)

    # Do not clean the test set
    if prefix == "train":
        # Clean the final dataset
        print("Cleaning data...")
        clean.main(merged_file, merged_file)
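The comment above asks how to turn the NaNs left by the join into zeros. A minimal sketch, reusing the names from this example and assuming every non-id column of distances_features_df is an OSRM feature:

# fill only the columns introduced by the left join
osrm_cols = [c for c in distances_features_df.columns if c != 'id']
merged_df[osrm_cols] = merged_df[osrm_cols].fillna(0)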
Example #2
from subprocess import run

# preprocessing modules from the same project
import clean
import combine_sources
import extract_info_from_tag
import pos_prep
import reclean

def main(sets, path='data/'):
    print('Combining raw data sets...')
    combine_sources.main(sets, path)

    print('Cleaning strings for Lemmatization and Tagging...')
    clean.main(path)

    print('Splitting the data for Lemmatization and Tagging...')
    pos_prep.main(path)

    print('Creating Lemmas and POS Tags using RFTagger...')
    run([
        'java', '-jar',
        'C:\\Users\\Josef\\PycharmProjects\\QC-Yes-No\\Classification\\preprocessing\\RFTagger\\rft-java.jar',
        '-c', 'stts', '-l', 'german', '-x',
        'C:\\Users\\Josef\\PycharmProjects\\QC-Yes-No\\Classification\\preprocessing\\RFTagger\\lib\\german-rft-tagger-lemma-lexicon.txt',
        'C:\\Users\\Josef\\PycharmProjects\\QC-Yes-No\\Classification\\preprocessing\\RFTagger\\lib\\german.par',
        path + 'all_pre_pos.txt', path + 'pos_tagged_raw.txt'
    ], shell=True)

    print('Further streamlining data...')
    reclean.main(path)

    print('Extracting final data set...')
    extract_info_from_tag.main(path)

    print('Process finished.')
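The absolute C:\Users\... paths tie this script to a single machine. The same call can be made portable; a sketch, assuming the RFTagger directory sits next to the script (the layout is an assumption, the tagger flags are copied from above):

from pathlib import Path
from subprocess import run

RFT = Path(__file__).resolve().parent / 'RFTagger'  # assumed project layout

def tag(path='data/'):
    # an argument list needs no shell=True; check=True raises if Java fails
    run(['java', '-jar', str(RFT / 'rft-java.jar'),
         '-c', 'stts', '-l', 'german',
         '-x', str(RFT / 'lib' / 'german-rft-tagger-lemma-lexicon.txt'),
         str(RFT / 'lib' / 'german.par'),
         path + 'all_pre_pos.txt', path + 'pos_tagged_raw.txt'],
        check=True)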
Example #3
import os

import util

# pipeline stage modules from the same project
import clean
import download
import plot
import predict
import qaqc
import train

def main(args):
    """
    Main running script
    """

    # Get the config file
    config = util.get_config(args.config)
    root_dir = config['ROOT_DIR']
    # fill out initial folders
    if not os.path.isdir('{}/metadata'.format(root_dir)):
        os.mkdir('{}/metadata'.format(root_dir))
        print('created metadata dir')
    for key, name in (('OBS_ROOT', 'OBS'),
                      ('ESTIMATORS_ROOT', 'ESTIMATORS'),
                      ('PREDICTIONS_ROOT', 'PREDICTIONS'),
                      ('QAQC_ROOT', 'QAQC'),
                      ('PLOT_ROOT', 'PLOT')):
        if not os.path.isdir(config[key]):
            os.mkdir(config[key])
            print('created {} dir'.format(name))

    # --- clean the database ---
    if args.clean:
        clean.main(config)
    else:
        print('skipping database cleaning')
    # --- download data ---
    if args.download:
        download.main(config)
    else:
        print('skipping download of new data')
    # --- train models ---
    if args.train:
        train.main(config)
    else:
        print('skipping training')
    # --- make predictions ---
    if args.predict:
        predict.main(config)
    else:
        print('skipping predictions')
    # --- run qaqc checks ---
    if args.qaqc:
        qaqc.main(config)
    else:
        print('skipping qaqc')
    # --- plot ---
    if args.plot:
        plot.main(config)
    else:
        print('skipping plots')
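The boolean attributes read above (args.clean, args.download, args.train, args.predict, args.qaqc, args.plot) together with args.config suggest an argparse entry point along these lines; a sketch, since the original parser is not shown:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='Run selected pipeline stages.')
    parser.add_argument('--config', required=True, help='path to the config file')
    for stage in ('clean', 'download', 'train', 'predict', 'qaqc', 'plot'):
        parser.add_argument('--' + stage, action='store_true',
                            help='run the {} stage'.format(stage))
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_args())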
Example #4
import build
import clean

def main():
    # Clean, then build. Couldn't be simpler!
    error = clean.main()
    if error != 0:
        return error
    return build.main()
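Because main() returns clean's error code instead of raising, failures surface through the process exit status; a small sketch, assuming the usual __main__ guard (not shown in the original):

import sys

if __name__ == '__main__':
    sys.exit(main())  # non-zero exit when clean or build fails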
Example #5
import clean

def main():
    delete_data_dir_contents()  # start from an empty data directory
    scrap()                     # scrape fresh source data
    clean.main()                # clean the scraped data
Example #6
import Predictive_Analysis_Classification
import clean
import Clustering_Associationrules
import t_test
SEP = "##############################################"


def announce(title):
    """Print a banner before each analysis stage."""
    print(SEP)
    print("# " + title)
    print(SEP)


if __name__ == '__main__':
    announce("Basic Statistical Analysis and data cleaning insight Part")
    clean.main()
    announce("Cluster and Association Rule Part")
    Clustering_Associationrules.execute()
    announce("Predictive_Analysis_Hypothesis_Test Part")
    t_test.execute()
    announce("Predictive_Analysis_Classification Part")
    Predictive_Analysis_Classification.execute()
Example #7
import catalogue.utils
import clean

def main(args):
    # materialize the filter so args.files can be iterated more than once
    args.files = list(filter(is_arxiv, catalogue.utils.list_files(".pdf")))
    clean.main(args)
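is_arxiv is defined elsewhere in that project. Purely as an illustration, a predicate like it might look for an arXiv-style identifier in the filename; the pattern below is an assumption, not the project's actual rule:

import re

ARXIV_ID = re.compile(r'\d{4}\.\d{4,5}(v\d+)?')  # hypothetical: new-style arXiv IDs

def is_arxiv(path):
    # assumption: downloaded arXiv PDFs keep the identifier in their name
    return bool(ARXIV_ID.search(path))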