def new2016Dataset(include_robot_and_aerial=True, include_harvest=True):
    """Build a Dataset from the merged 2016 samples.

    Inputs always include GPS and accession features; robot/aerial and
    harvest-time features are optional. Outputs are the COMPOSITION_ labels.

    Args:
        include_robot_and_aerial: add ROBOT_ and AERIAL_ features as inputs.
        include_harvest: add HARVEST_ and SYNTHETIC_HARVEST_ features as inputs.

    Returns:
        A dataset_lib.Dataset over the parsed samples.
    """
    samples = csv_utils.read_csv_as_dicts('2016.merged.csv')
    # Coerce every numeric-ish column to float (missing values flagged).
    dataset_lib.convert_to_float_or_missing(samples, filter_2016_labels((
        'HARVEST_', 'COMPOSITION_', 'ROBOT_', 'AERIAL_', 'SYNTHETIC_', 'GPS_',
        'ROW', 'COLUMN')))

    # Start from the always-on prefixes, then grow per the flags.
    input_prefixes = ['GPS_', 'ACCESSION_']
    if include_robot_and_aerial:
        input_prefixes += ['ROBOT_', 'AERIAL_']
    if include_harvest:
        input_prefixes += ['HARVEST_', 'SYNTHETIC_HARVEST_']

    input_labels = filter_2016_labels(tuple(input_prefixes))
    output_labels = filter_2016_labels('COMPOSITION_')
    # Deterministic (sorted) ordering of one generator per output label.
    output_generators = collections.OrderedDict(
        (label, create_2016_output_generator(label))
        for label in sorted(output_labels))

    print('2016 Inputs: ' + ','.join(input_labels))
    return dataset_lib.Dataset(samples, input_labels, output_generators)
def test_read_csv_as_dicts_good_case(self):
    """Each CSV data row becomes a dict keyed by the header labels."""
    path = self._create_test_csv(
        ['label1,label2,label3', '1,2,3', '4,5,6'])
    expected = [
        {'label1': '1', 'label2': '2', 'label3': '3'},
        {'label1': '4', 'label2': '5', 'label3': '6'},
    ]
    self.assertListEqual(expected, csv_utils.read_csv_as_dicts(path))
def main():
    """Compute pairwise correlations between numeric features, write a CSV.

    Reads samples from INPUT_PATH, correlates every selected feature with
    every other, and writes the full correlation matrix to OUTPUT_PATH with
    feature names as both the header row and the first column.
    """
    samples = csv_utils.read_csv_as_dicts(INPUT_PATH)
    features_to_use = filter_labels((
        'ROW', 'COLUMN', 'GPS_', 'ROBOT_', 'HARVEST_', 'SYNTHETIC_',
        'COMPOSITION_'))
    dataset.convert_to_float_or_missing(samples, features_to_use)

    # One row per sample, one column per feature.
    X = np.array([[s[name] for name in features_to_use] for s in samples])
    # rowvar=False: each column (feature) is a variable to correlate.
    corr_matrix = np.corrcoef(X, rowvar=False)

    with open(OUTPUT_PATH, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['feature_name'] + features_to_use)
        for name, corr_row in zip(features_to_use, corr_matrix):
            writer.writerow([name] + list(corr_row))
def main():
    """Run spatial-correlation jobs for several spatial keyings of the plots.

    For each (spatial_key1, spatial_key2, description) configuration, fans
    out one job per usable feature to a process pool and writes the results,
    sorted by ascending p-value, to a per-configuration CSV.
    """
    pool = Pool()
    samples = read_csv_as_dicts(INPUT_PATH)

    # Tuples of spatial_key1, spatial_key2, spatial_keys_description).
    mantel_runs = [
        (Features.GPS_EASTINGS, Features.GPS_NORTHINGS,
         'eastings_and_northings'),
        (Features.GPS_EASTINGS, None, 'eastings_only'),
        (Features.GPS_NORTHINGS, None, 'northings_only'),
        (Features.ROW, Features.COLUMN, 'plot_row_and_column'),
        (Features.ROW, None, 'plot_row_only'),
        (Features.COLUMN, None, 'plot_column_only'),
    ]

    for key1, key2, description in mantel_runs:
        job_args = []
        for feature in Features:
            # This is a bit hacky way to skip values that are all text values.
            # But it works nicely right now.
            if feature in (Features.ROW, Features.COLUMN, Features.PLANT_ID):
                continue
            if feature == Features.GPS_EASTINGS:
                break  # Ignore this Feature and all Features after this one.
            job_args.append((samples, key1, key2, feature))

        output_path = get_output_path(description)
        print('Spawning jobs for:', output_path)
        results = pool.map(get_spatial_correlation, job_args)
        results.sort(key=lambda row: row[-1])  # Sort by p-value.

        with open(output_path, 'w') as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow([
                'label', 'num_data_points', 'avg_data_value',
                'avg_diff_between_adjacent_plots',
                'avg_diff_between_nonadjacent_plots', 'corr_coeff', 'p_value'
            ])
            csv_writer.writerows(results)
def new2014Dataset():
    """Build a Dataset from the 2014 phenotypic data file.

    Inputs are field/phenotype measurements; outputs are per-sample
    composition weights (ADF, NDF, NFC, lignin, and derived C5/C6)
    computed from dry weight via get_2014_weight.

    Returns:
        A dataset_lib.Dataset over the parsed samples.
    """
    samples = csv_utils.read_csv_as_dicts('2014/2014_Pheotypic_Data_FileS2.csv')

    # Column names in the source CSV.
    ADF = 'ADF (% DM)'
    NDF = 'NDF (% DM)'
    NFC = 'NFC (% DM)'
    LIGNIN = 'Lignin (% DM)'
    DRY_WEIGHT = 'Dry weight (kg)'

    input_labels = (
        'Anthesis date (days)',
        'Harvest date (days)',
        'Total fresh weight (kg)',
        'Brix (maturity)',
        'Brix (milk)',
        'Stalk height (cm)',
        # Including dry weight greatly increases predictive ability.
        #'Dry weight (kg)',
        #'Dry tons per acre',
    )

    # Both the inputs and the composition columns must be numeric.
    numeric_labels = list(input_labels)
    numeric_labels.extend([ADF, NDF, NFC, LIGNIN, DRY_WEIGHT])
    dataset_lib.convert_to_float_or_missing(samples, numeric_labels)

    output_generators = collections.OrderedDict([
        ('adf', lambda s: get_2014_weight(s, DRY_WEIGHT, ADF)),
        ('ndf', lambda s: get_2014_weight(s, DRY_WEIGHT, NDF)),
        ('nfc', lambda s: get_2014_weight(s, DRY_WEIGHT, NFC)),
        ('lignin', lambda s: get_2014_weight(s, DRY_WEIGHT, LIGNIN)),
        # C6 = cellulose-ish fraction (ADF minus lignin).
        ('c6', lambda s: get_2014_weight(s, DRY_WEIGHT, ADF, minus=LIGNIN)),
        # C5 = hemicellulose-ish fraction (NDF minus ADF).
        ('c5', lambda s: get_2014_weight(s, DRY_WEIGHT, NDF, minus=ADF)),
    ])

    print('2014 Inputs: ' + ','.join(input_labels))
    return dataset_lib.Dataset(samples, input_labels, output_generators)
def main():
    """Scatter-plot one label's values at each sample's GPS position.

    Command-line flags select the input CSV (-f/--file) and the label to
    visualize (-l/--label); samples with an empty value for that label are
    skipped. Color encodes the label value.
    """
    parser = argparse.ArgumentParser(description='Visualize features')
    parser.add_argument('-f', '--file', required=True,
                        help='Path to input file.')
    parser.add_argument('-l', '--label', required=True,
                        help='Label name in input file to visualize.')
    args = parser.parse_args()

    # Keep only samples that actually have a value for the chosen label.
    samples = [s for s in read_csv_as_dicts(args.file)
               if s[args.label] != '']

    eastings = [average_mismatch(s[Features.GPS_EASTINGS.value])
                for s in samples]
    northings = [average_mismatch(s[Features.GPS_NORTHINGS.value])
                 for s in samples]
    values = [average_mismatch(s[args.label]) for s in samples]

    plt.title('Visualization of ' + args.label + ' in ' +
              os.path.basename(args.file))
    # Custom rectangular marker (closed vertex path), 3 wide by 2 tall.
    half_w, half_h = 3, 2
    rect = [(-half_w, -half_h), (half_w, -half_h), (half_w, half_h),
            (-half_w, half_h), (-half_w, -half_h)]
    plt.scatter(eastings, northings, c=values, s=100, marker=rect)
    colorbar = plt.colorbar()
    colorbar.ax.set_title('Value')
    plt.xlabel("Row")
    plt.ylabel("Range")
    plt.show()