def write_dataset_example():
    """
    An example of writing datasets to JSON files.
    :return: None
    """
    gold = [2, 3, 6]
    h = [5, 6]
    gold_dataset = Dataset({
        'test1': {
            'golden': gold,
        },
        'test2': {
            'golden': gold,
        },
    })
    test_dataset = Dataset({
        'test1': {
            'test': h,
        },
        'test2': {
            'test': h,
        },
    })
    write_dataset(gold_dataset, "./data/golden.json")
    write_dataset(test_dataset, "./data/test.json")
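
A round-trip sketch, assuming that write_dataset emits the linear-mass JSON
layout read by input_linear_mass_json (Example #5 below):

    dataset = input_linear_mass_json('./data/golden.json')
    assert dataset['test1']['golden'] == (2, 3, 6)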
Example #2
    def test_dataset(self):
        '''
        Test dataset property creation and independence.
        '''
        prop = 'test'
        dataset_a = Dataset()
        dataset_a.properties[prop] = False
        dataset_b = Dataset()
        # Properties set on one dataset must not leak into another
        self.assertFalse(prop in dataset_b.properties)
Example #3
    def test_add_duplicate_codings(self):
        '''
        Test that adding datasets containing duplicate codings raises a
        DataIOError.
        '''
        dataset_a = Dataset({'item1': {'a': [2]}})
        dataset_b = Dataset({'item1': {'a': [2]}})
        exception = False
        try:
            dataset_a + dataset_b
        except DataIOError:
            exception = True
        self.assertTrue(exception, 'Did not throw DataIOError')
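
The same check reads more compactly with unittest's context-manager form; a
sketch assuming the surrounding TestCase class and imports:

    def test_add_duplicate_codings(self):
        dataset_a = Dataset({'item1': {'a': [2]}})
        dataset_b = Dataset({'item1': {'a': [2]}})
        with self.assertRaises(DataIOError):
            dataset_a + dataset_b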
Example #4
    def test_add_coders(self):
        '''
        Test that adding two datasets merges their coders without mutating
        either operand.
        '''
        dataset_a = Dataset({'item1': {'a': [2]}})
        dataset_b = Dataset({'item1': {'b': [2]}})
        dataset_c = Dataset({'item1': {'a': [2], 'b': [2]}})

        self.assertNotEqual(dataset_a, dataset_b)
        # Addition merges both coders' codings for each item
        self.assertEqual(dataset_a + dataset_b, dataset_c)
        # Neither operand is modified by the addition
        self.assertNotEqual(dataset_a, dataset_b)
        self.assertNotEqual(dataset_a, dataset_c)
        self.assertNotEqual(dataset_b, dataset_c)
Example #5
def input_linear_mass_json(filepath):
    '''
    Reads a file path and returns segmentation mass codings as a :class:`Dataset`.

    :param filepath: Path to the mass file containing segment position codings.
    :type filepath: :func:`str`
    '''
    from segeval.data import Dataset, DataIOError
    dataset = Dataset()
    data = dict()
    # Read in the file; a context manager ensures it is closed
    try:
        with open(filepath) as json_file:
            data = json.load(json_file)
    except Exception as exception:
        raise DataIOError('Error occurred processing file: ' + filepath,
                          exception)
    # Check type
    if Field.segmentation_type in data:
        if data[Field.segmentation_type] != SegmentationType.linear:
            raise DataIOError(
                'Segmentation type \'{0}\' expected, but encountered \'{1}\' for file: {2}'
                .format(SegmentationType.linear, data[Field.segmentation_type],
                        filepath))
    else:
        raise DataIOError(
            'The entry \'segmentation_type\' was expected in JSON for file: ' +
            filepath)
    # Store the remaining top-level fields as dataset properties
    dataset.properties = data
    # If separated into multiple items for one coder per file
    if Field.items in data:
        data = data[Field.items]
        # Copy each item's codings, storing masses as immutable tuples
        for item, coder_masses in data.items():
            dataset[item] = dict()
            for coder, masses in coder_masses.items():
                dataset[item][coder] = tuple(masses)
        # Remove from properties
        del dataset.properties[Field.items]
    else:
        raise DataIOError(
            'Expected an entry \'{0}\' that contained segmentation codings for specific individual texts (i.e., items) in file: {1}'
            .format(Field.items, filepath))
    return dataset
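
Assuming segeval's default field names ('segmentation_type', 'items') and the
'linear' type value, a minimal file this reader accepts looks like:

    {
        "segmentation_type": "linear",
        "items": {
            "test1": {"golden": [2, 3, 6]},
            "test2": {"golden": [2, 3, 6]}
        }
    }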
Example #6
    def test_dataset_property(self):
        '''
        Test dataset property creation via the constructor.
        '''
        prop = 'test'
        dataset = Dataset(properties={prop: True})
        self.assertTrue(prop in dataset.properties)
        self.assertTrue(dataset.properties[prop])
Example #7
def input_linear_mass_tsv(filepath, delimiter=DEFAULT_DELIMITER):
    '''
    Reads a file path and returns segmentation mass codings as a :class:`Dataset`.

    :param filepath: path to the mass file containing segment mass codings.
    :param delimiter: the delimiter used when reading a TSV file (by default
                      a tab, but it can also be a comma, whitespace, etc.).
    :type filepath: str
    :type delimiter: str
    '''

    from segeval.data import Dataset, name_from_filepath
    # Annotator labels read from the header row
    header = []
    dataset = Dataset()
    item = name_from_filepath(filepath)
    dataset[item] = dict()
    # Open file
    with open(filepath, newline='') as csv_file:
        # Read in file
        reader = csv.reader(csv_file, delimiter=delimiter)
        for i, row in enumerate(reader):
            # Read annotators from header
            if i == 0:
                for col_name in row[1:]:
                    header.append(col_name)
            # Read data
            else:
                coder = None
                for j, col in enumerate(row):
                    # The first column names the coder; the rest are masses
                    if j == 0:
                        coder = str(col)
                        dataset[item][coder] = list()
                    else:
                        dataset[item][coder].append(int(col))
                dataset[item][coder] = tuple(dataset[item][coder])

    return dataset
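
For illustration, a hypothetical file this reader would accept (columns are
tab-separated, with tabs shown as spaces here; the header row is read but
unused, and each following row names a coder followed by that coder's
segment masses):

    coder   mass1   mass2   mass3   mass4   mass5
    an1     11      2
    an2     2       1       7       2       1

Assuming name_from_filepath strips the extension, reading 'moonstone.tsv'
would yield dataset['moonstone'] with dataset['moonstone']['an1'] == (11, 2).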
Example #8
'''
Segmentations provided by 4 coders (labeled group 5) of 4 chapters of
"The Moonstone" (Collins1868)_ collected by (KazantsevaSzpakowicz2012)_.
'''
from __future__ import absolute_import
from segeval.data import Dataset
from segeval.format import BoundaryFormat


KAZANTSEVA2012_G5 = Dataset(
    {'ch1': {'an4': (2, 8, 2, 1),
             'an1': (11, 2),
             'an2': (2, 1, 7, 2, 1),
             'an3': (9, 4)},
     'ch11':{'an4': (10, 4, 3, 2, 1, 8, 3, 2, 6, 1, 2, 8, 10, 9, 4, 10, 4,
                     8, 4, 3, 5, 4),
             'an1': (20, 22, 8, 11, 11, 11, 13, 11, 4),
             'an2': (1, 7, 2, 4, 3, 3, 10, 1, 1, 5, 3, 2, 8, 3, 3, 3, 14,
                     4, 1, 1, 4, 4, 2, 7, 3, 2, 3, 3, 2, 1, 1),
             'an3': (10, 10, 15, 11, 4, 10, 13, 5, 4, 23, 6)},
     'ch4': {'an4': (2, 9, 2, 5, 2, 19, 1, 6),
             'an1': (17, 25, 4),
             'an2': (1, 10, 2, 4, 1, 2, 3, 10, 6, 6, 1),
             'an3': (12, 5, 29)},
     'ch3': {'an4': (2, 3, 4, 2, 5, 17, 4, 1),
             'an1': (6, 5, 27),
             'an2': (3, 8, 2, 3, 17, 2, 2, 1),
             'an3': (3, 15, 15, 5)}})
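
A minimal usage sketch, assuming segeval exposes boundary_similarity at the
top level (as in segeval 2.x):

    import segeval

    seg_a = KAZANTSEVA2012_G5['ch1']['an1']  # (11, 2)
    seg_b = KAZANTSEVA2012_G5['ch1']['an2']  # (2, 1, 7, 2, 1)
    print(segeval.boundary_similarity(seg_a, seg_b))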