def write_dataset_example():
    """
    An example for writing dataset.

    Builds a golden dataset and a test (hypothesis) dataset sharing the
    same two item labels, then serializes each to JSON under ./data/.

    :return: None
    """
    golden_masses = [2, 3, 6]
    test_masses = [5, 6]
    items = ('test1', 'test2')
    # Both items carry the same coding in each dataset.
    golden_dataset = Dataset(
        {item: {'golden': golden_masses} for item in items})
    hypothesis_dataset = Dataset(
        {item: {'test': test_masses} for item in items})
    write_dataset(golden_dataset, "./data/golden.json")
    write_dataset(hypothesis_dataset, "./data/test.json")
def test_dataset(self):
    '''
    Test dataset property creation and independence.
    '''
    key = 'test'
    # Mutate the properties of one dataset ...
    first = Dataset()
    first.properties[key] = False
    # ... and verify a freshly-created dataset is unaffected.
    second = Dataset()
    self.assertFalse(key in second.properties)
def test_add_duplicate_codings(self):
    '''
    Test that adding two datasets containing duplicate codings (same item
    coded by the same coder) raises :class:`DataIOError`, since the merge
    would be ambiguous.
    '''
    dataset_a = Dataset({'item1': {'a': [2]}})
    dataset_b = Dataset({'item1': {'a': [2]}})
    # assertRaises replaces the manual try/except/flag pattern and also
    # fails (rather than passes silently) if a different exception is raised.
    with self.assertRaises(DataIOError):
        dataset_a + dataset_b
def test_add_coders(self):
    '''
    Test that adding two datasets merges codings from different coders on
    the same item, without mutating either operand.
    '''
    dataset_a = Dataset({'item1': {'a': [2]}})
    dataset_b = Dataset({'item1': {'b': [2]}})
    # Expected result of the merge: both coders on 'item1'.
    dataset_c = Dataset({'item1': {'a': [2], 'b': [2]}})
    self.assertNotEqual(dataset_a, dataset_b)
    self.assertEqual(dataset_a + dataset_b, dataset_c)
    # The operands must be left unchanged by the addition.
    self.assertNotEqual(dataset_a, dataset_b)
    self.assertNotEqual(dataset_a, dataset_c)
    self.assertNotEqual(dataset_b, dataset_c)
def input_linear_mass_json(filepath):
    '''
    Reads a file path.  Returns segmentation mass codings as a
    :class:`Dataset`.

    :param filepath: Path to the mass file containing segment position
        codings.
    :type filepath: :func:`str`

    :raises DataIOError: if the file cannot be parsed as JSON, declares a
        non-linear segmentation type, or lacks the required entries.
    '''
    from segeval.data import Dataset, DataIOError
    dataset = Dataset()
    data = dict()
    # Read in file; the 'with' block guarantees the handle is closed even
    # on error (the old 'rU' mode was removed in Python 3.11 -- universal
    # newlines are the default in text mode anyway).
    with open(filepath) as json_file:
        try:
            data = json.load(json_file)
        except Exception as exception:
            raise DataIOError('Error occurred processing file: ' + filepath,
                              exception)
    # Check type
    if Field.segmentation_type in data:
        if data[Field.segmentation_type] != SegmentationType.linear:
            raise DataIOError(
                'Segmentation type \'{0}\' expected, but encountered \'{1}\' for file: {2}'
                .format(SegmentationType.linear,
                        data[Field.segmentation_type], filepath))
    else:
        raise DataIOError(
            'The entry \'segmentation_type\' was expected in JSON for file:' +
            filepath)
    # Duplicate to store other properties
    dataset.properties = data
    # If separated into multiple items for one coder per file
    if Field.items in data:
        data = data[Field.items]
        # Convert item labels into strings
        for item, coder_masses in data.items():
            dataset[item] = dict()
            for coder, masses in coder_masses.items():
                dataset[item][coder] = tuple(masses)
        # Remove from properties
        del dataset.properties[Field.items]
    else:
        raise DataIOError(
            'Expected an entry \'{0}\' that contained segmentation codings for specific individual texts (i.e., items) in file: {1}'
            .format(Field.items, filepath))
    return dataset
def test_dataset_property(self):
    '''
    Test dataset property creation and independence.
    '''
    name = 'test'
    # Construct a dataset with a property supplied up front.
    dataset = Dataset(properties={name: True})
    props = dataset.properties
    # The property must be present and hold its original value.
    self.assertTrue(name in props)
    self.assertTrue(props[name])
def input_linear_mass_json(filepath):
    '''
    Reads a file path.  Returns segmentation mass codings as a
    :class:`Dataset`.

    :param filepath: Path to the mass file containing segment position
        codings.
    :type filepath: :func:`str`

    :raises DataIOError: if the file cannot be parsed as JSON, declares a
        non-linear segmentation type, or lacks the required entries.
    '''
    from segeval.data import Dataset, DataIOError
    dataset = Dataset()
    data = dict()
    # Read in file; the 'with' block guarantees the handle is closed even
    # on error (the old 'rU' mode was removed in Python 3.11 -- universal
    # newlines are the default in text mode anyway).
    with open(filepath) as json_file:
        try:
            data = json.load(json_file)
        except Exception as exception:
            raise DataIOError(
                'Error occurred processing file: ' + filepath, exception)
    # Check type
    if Field.segmentation_type in data:
        if data[Field.segmentation_type] != SegmentationType.linear:
            raise DataIOError('Segmentation type \'{0}\' expected, but encountered \'{1}\' for file: {2}'
                              .format(SegmentationType.linear,
                                      data[Field.segmentation_type],
                                      filepath))
    else:
        raise DataIOError(
            'The entry \'segmentation_type\' was expected in JSON for file:' +
            filepath)
    # Duplicate to store other properties
    dataset.properties = data
    # If separated into multiple items for one coder per file
    if Field.items in data:
        data = data[Field.items]
        # Convert item labels into strings
        for item, coder_masses in data.items():
            dataset[item] = dict()
            for coder, masses in coder_masses.items():
                dataset[item][coder] = tuple(masses)
        # Remove from properties
        del dataset.properties[Field.items]
    else:
        raise DataIOError('Expected an entry \'{0}\' that contained segmentation codings for specific individual texts (i.e., items) in file: {1}'
                          .format(Field.items, filepath))
    return dataset
def input_linear_mass_tsv(filepath, delimiter=DEFAULT_DELIMITER):
    '''
    Takes a file path.  Returns segmentation mass codings as a
    :class:`Dataset`.

    :param filepath: path to the mass file containing segment mass codings.
    :param delimiter: the delimiter used when reading a TSV file (by default,
                      a tab, but it can also be a comma, whitespace, etc.
    :type filepath: str
    :type delimiter: str
    '''
    from segeval.data import Dataset, name_from_filepath
    # List version of file
    header = []
    dataset = Dataset()
    item = name_from_filepath(filepath)
    dataset[item] = dict()
    # Open file; the old 'rU' mode was removed in Python 3.11, and
    # newline='' is what the csv module documents for correct newline
    # handling inside quoted fields.
    with open(filepath, newline='') as csv_file:
        # Read in file
        reader = csv.reader(csv_file, delimiter=delimiter)
        for i, row in enumerate(reader):
            # Read annotators from header
            # ('==', not 'is': identity comparison of int literals is an
            # implementation detail and raises a SyntaxWarning on 3.8+)
            if i == 0:
                for col_name in row[1:]:
                    header.append(col_name)
            # Read data
            else:
                coder = None
                for j, col in enumerate(row):
                    # Skip the first col
                    if j == 0:
                        coder = str(col)
                        dataset[item][coder] = list()
                    else:
                        dataset[item][coder].append(int(col))
                dataset[item][coder] = tuple(dataset[item][coder])
    return dataset
''' from __future__ import absolute_import from segeval.data import Dataset from segeval.format import BoundaryFormat KAZANTSEVA2012_G5 = Dataset( {'ch1': {'an4': (2, 8, 2, 1), 'an1': (11, 2), 'an2': (2, 1, 7, 2, 1), 'an3': (9, 4)}, 'ch11':{'an4': (10, 4, 3, 2, 1, 8, 3, 2, 6, 1, 2, 8, 10, 9, 4, 10, 4, 8, 4, 3, 5, 4), 'an1': (20, 22, 8, 11, 11, 11, 13, 11, 4), 'an2': (1, 7, 2, 4, 3, 3, 10, 1, 1, 5, 3, 2, 8, 3, 3, 3, 14, 4, 1, 1, 4, 4, 2, 7, 3, 2, 3, 3, 2, 1, 1), 'an3': (10, 10, 15, 11, 4, 10, 13, 5, 4, 23, 6)}, 'ch4': {'an4': (2, 9, 2, 5, 2, 19, 1, 6), 'an1': (17, 25, 4), 'an2': (1, 10, 2, 4, 1, 2, 3, 10, 6, 6, 1), 'an3': (12, 5, 29)}, 'ch3': {'an4': (2, 3, 4, 2, 5, 17, 4, 1), 'an1': (6, 5, 27), 'an2': (3, 8, 2, 3, 17, 2, 2, 1), 'an3': (3, 15, 15, 5)}}) ''' Segmentations provided by 4 coders (labeled group 5) of 4 chapters of "The Moonstone" (Collins1868)_ collected by (KazantsevaSzpakowicz2012)_:: KAZANTSEVA2012_G5 = Dataset( {'ch1': {'an4': (2, 8, 2, 1),