def test_partition_with_max_labels_discards_labels(self):
    # Five labelled rows are supplied together with max_labels=3 and
    # training_fraction=0.67. The assertions below expect two rows in the
    # training output, one in the testing output, and rows four and five in
    # neither.
    all_labels = ('input,name,qty,range_end,unit,comment\n'
                  '1 cup foo,foo,1.0,0.0,cup,\n'
                  '2 drops foz,foz,2.0,0.0,drop,\n'
                  '3 ml faa,faa,3.0,0.0,ml,\n'
                  '4 cloves bar,bar,4.0,0.0,cloves,\n'
                  '5 oz baz,baz,5.0,0.0,oz,')
    expected_training = ('input,name,qty,range_end,unit,comment\n'
                         '1 cup foo,foo,1.0,0.0,cup,\n'
                         '2 drops foz,foz,2.0,0.0,drop,')
    expected_testing = ('input,name,qty,range_end,unit,comment\n'
                        '3 ml faa,faa,3.0,0.0,ml,')

    label_reader = labelled_data.Reader(io.StringIO(all_labels))
    partitioner.split_labels(label_reader,
                             self.mock_training_writer,
                             self.mock_testing_writer,
                             training_fraction=0.67,
                             max_labels=3)

    # Leading/trailing whitespace in the written output is not significant
    # to this test, so it is stripped before comparing.
    training_output = self.mock_training_file.getvalue().strip()
    self.assertMultiLineEqual(expected_training, training_output)
    testing_output = self.mock_testing_file.getvalue().strip()
    self.assertMultiLineEqual(expected_testing, testing_output)
def test_raises_error_when_csv_does_not_have_required_columns(self):
    # The fixture header contains UNEXPECTED_COLUMN and no `name` column.
    bad_header_csv = io.StringIO(
        'index,input,UNEXPECTED_COLUMN,qty,range_end,unit,comment\n'
        '77,3 bananas,bananas,3.0,0.0,,')

    # Only the reader construction and the first read sit inside the
    # assertRaises body, since those are the calls under test.
    with self.assertRaises(labelled_data.InvalidHeaderError):
        next(labelled_data.Reader(bad_header_csv))
def main(args):
    """Partition the labelled data file into training and testing files.

    Opens ``args.label_path`` for reading and ``args.training_path`` /
    ``args.testing_path`` for writing, wraps them in a
    ``labelled_data.Reader`` and two ``labelled_data.Writer`` objects, and
    passes those to ``partitioner.split_labels`` along with
    ``args.training_fraction`` and ``args.max_labels``.

    Args:
        args: Object exposing ``label_path``, ``training_path``,
            ``testing_path``, ``training_fraction`` and ``max_labels``.
    """
    # All three files are opened in text mode. The output files were
    # previously opened with 'wb', but Python 3's csv layer writes ``str``
    # and a binary file rejects that with a TypeError.
    # NOTE(review): assumes labelled_data.Reader/Writer are built on the csv
    # module -- confirm against their implementation.
    #
    # newline='' leaves line-ending handling to the CSV layer (no platform
    # translation, matching what binary mode used to give), and the explicit
    # encoding keeps the result independent of the locale default.
    with open(args.label_path, encoding='utf-8', newline='') as label_file, \
            open(args.training_path, 'w', encoding='utf-8',
                 newline='') as training_file, \
            open(args.testing_path, 'w', encoding='utf-8',
                 newline='') as testing_file:
        label_reader = labelled_data.Reader(label_file)
        training_writer = labelled_data.Writer(training_file)
        testing_writer = labelled_data.Writer(testing_file)
        partitioner.split_labels(label_reader, training_writer,
                                 testing_writer, args.training_fraction,
                                 args.max_labels)
def test_reads_file_with_utf8_encoding(self):
    # NOTE(review): '\xc3\xb1' is the UTF-8 byte pair for n-tilde (U+00F1)
    # written as two separate code points in a text literal. The fixture and
    # the expectation use the same spelling, so the comparison is
    # self-consistent -- confirm whether '\xf1' was intended.
    # NOTE(review): the data row has one more field than the header; the
    # expectation below does not mention it, so presumably it is ignored.
    csv_text = ('index,input,name,qty,range_end,unit,comment\n'
                '1,2 jalape\xc3\xb1os,jalape\xc3\xb1os,2.0,0.0,,,\n')
    expected_rows = [{
        'input': '2 jalape\xc3\xb1os',
        'name': 'jalape\xc3\xb1os',
        'qty': 2.0,
        'unit': '',
        'range_end': 0.0,
        'comment': '',
    }]

    parsed_rows = list(labelled_data.Reader(io.StringIO(csv_text)))

    self.assertEqual(expected_rows, parsed_rows)
def run(self):
    """
    Emits CRF++-format training data for the ingredient tagging task.

    Each row read from ``self.opts.data_path`` is passed through
    ``translator.translate_row``, encoded as UTF-8 and written to standard
    output, followed by a newline.
    """
    with open(self.opts.data_path, encoding='utf-8') as labels_file:
        for labelled_row in labelled_data.Reader(labels_file):
            crf_text = translator.translate_row(labelled_row)
            # The encoded bytes go straight to stdout's binary buffer:
            # print() on a bytes object would emit its repr, like
            # `b"string"`, instead of the data itself.
            sys.stdout.buffer.write(crf_text.encode('utf-8'))
            sys.stdout.buffer.write(b'\n')
def test_interprets_empty_range_end_as_zero(self):
    # The single data row leaves the range_end field blank; the test expects
    # the reader to report it as 0.0.
    #
    # The fixture is a text literal, so it is wrapped in io.StringIO.
    # io.BytesIO accepts only bytes-like objects and raises TypeError when
    # handed a str on Python 3.
    mock_file = io.StringIO(
        'index,input,name,qty,range_end,unit,comment\n'
        '77,3 bananas,bananas,3.0,,,')
    reader = labelled_data.Reader(mock_file)

    expected_row = {
        'input': '3 bananas',
        'qty': 3.0,
        'unit': '',
        'name': 'bananas',
        'comment': '',
        'range_end': 0.0,
    }
    self.assertEqual(expected_row, next(reader))
def test_reads_file_with_utf8_encoding(self):
    # NOTE(review): "\xc3\xb1" is the UTF-8 byte pair for n-tilde (U+00F1)
    # written as two separate code points in a text literal. The fixture and
    # the expectation share the same spelling, so the comparison is
    # self-consistent -- confirm whether "\xf1" was intended.
    # NOTE(review): the data row carries one more field than the header; the
    # expectation does not mention it, so presumably it is ignored.
    jalapenos = "jalape\xc3\xb1os"
    fixture = io.StringIO(
        "index,input,name,qty,range_end,unit,comment\n"
        "1,2 " + jalapenos + "," + jalapenos + ",2.0,0.0,,,\n"
    )
    expected_rows = [
        {
            "input": "2 " + jalapenos,
            "name": jalapenos,
            "qty": 2.0,
            "unit": "",
            "range_end": 0.0,
            "comment": "",
        }
    ]

    parsed_rows = list(labelled_data.Reader(fixture))

    self.assertEqual(expected_rows, parsed_rows)
def test_interprets_empty_range_end_as_zero(self):
    # The single data row leaves the range_end field blank; the test expects
    # the first row yielded by the reader to carry range_end == 0.0.
    csv_text = (
        "index,input,name,qty,range_end,unit,comment\n"
        "77,3 bananas,bananas,3.0,,,\n"
    )
    expected_row = {
        "input": "3 bananas",
        "qty": 3.0,
        "unit": "",
        "name": "bananas",
        "comment": "",
        "range_end": 0.0,
    }

    label_reader = labelled_data.Reader(io.StringIO(csv_text))
    first_row = next(label_reader)

    self.assertEqual(expected_row, first_row)
def test_reads_valid_label_file(self):
    # Three rows are read back: a quantity range (4 to 6), a row with empty
    # unit and comment, and a row whose input and comment fields are quoted
    # because they contain commas. The expected dicts omit the index column.
    peppers_input = (
        "2 1/2 pounds bell peppers (about 6 peppers in "
        "assorted colors), cut into 2-inch chunks"
    )
    peppers_comment = (
        "(about 6 peppers in assorted colors), cut into 2-inch chunks"
    )
    csv_lines = [
        "index,input,name,qty,range_end,unit,comment",
        "63,4 to 6 large cloves garlic,garlic,4.0,6.0,clove,",
        "77,3 bananas,bananas,3.0,0.0,,",
        '106,"' + peppers_input + '",bell peppers,2.5,0.0,pound,"'
        + peppers_comment + '"',
    ]
    # Every line, including the last, ends with a newline.
    fixture = io.StringIO("".join(line + "\n" for line in csv_lines))

    expected_rows = [
        {
            "input": "4 to 6 large cloves garlic",
            "qty": 4.0,
            "unit": "clove",
            "name": "garlic",
            "range_end": 6.0,
            "comment": "",
        },
        {
            "input": "3 bananas",
            "qty": 3.0,
            "unit": "",
            "name": "bananas",
            "comment": "",
            "range_end": 0.0,
        },
        {
            "input": peppers_input,
            "qty": 2.5,
            "unit": "pound",
            "name": "bell peppers",
            "range_end": 0.0,
            "comment": peppers_comment,
        },
    ]

    parsed_rows = list(labelled_data.Reader(fixture))

    self.assertEqual(expected_rows, parsed_rows)
def test_reads_valid_label_file(self):
    # Three rows are read back: a quantity range (4 to 6), a row with empty
    # unit and comment, and a row whose input and comment fields are quoted
    # because they contain commas. The expected dicts omit the index column.
    csv_text = (
        'index,input,name,qty,range_end,unit,comment\n'
        '63,4 to 6 large cloves garlic,garlic,4.0,6.0,clove,\n'
        '77,3 bananas,bananas,3.0,0.0,,\n'
        '106,"2 1/2 pounds bell peppers (about 6 peppers in assorted '
        'colors), cut into 2-inch chunks",bell peppers,2.5,0.0,pound,'
        '"(about 6 peppers in assorted colors), cut into 2-inch chunks"')

    garlic_row = {
        'input': '4 to 6 large cloves garlic',
        'qty': 4.0,
        'unit': 'clove',
        'name': 'garlic',
        'range_end': 6.0,
        'comment': '',
    }
    bananas_row = {
        'input': '3 bananas',
        'qty': 3.0,
        'unit': '',
        'name': 'bananas',
        'comment': '',
        'range_end': 0.0,
    }
    peppers_row = {
        'input': ('2 1/2 pounds bell peppers (about 6 peppers in '
                  'assorted colors), cut into 2-inch chunks'),
        'qty': 2.5,
        'unit': 'pound',
        'name': 'bell peppers',
        'range_end': 0.0,
        'comment': ('(about 6 peppers in assorted colors), cut into '
                    '2-inch chunks'),
    }

    parsed_rows = list(labelled_data.Reader(io.StringIO(csv_text)))

    self.assertEqual([garlic_row, bananas_row, peppers_row], parsed_rows)
def test_partition_20_percent_training(self):
    # Five labelled rows are split with training_fraction=0.2. The
    # assertions expect the first row in the training output and the other
    # four in the testing output, each preceded by the header line.
    header = "input,name,qty,range_end,unit,comment"
    rows = [
        "1 cup foo,foo,1.0,0.0,cup,",
        "2 drops foz,foz,2.0,0.0,drop,",
        "3 ml faa,faa,3.0,0.0,ml,",
        "4 cloves bar,bar,4.0,0.0,cloves,",
        "5 oz baz,baz,5.0,0.0,oz,",
    ]
    # Lines are newline-joined, with no newline after the last one.
    expected_training = "\n".join([header] + rows[:1])
    expected_testing = "\n".join([header] + rows[1:])

    label_reader = labelled_data.Reader(
        io.StringIO("\n".join([header] + rows))
    )
    partitioner.split_labels(
        label_reader,
        self.mock_training_writer,
        self.mock_testing_writer,
        training_fraction=0.2,
    )

    # Surrounding whitespace in the written output is stripped before
    # comparing.
    training_output = self.mock_training_file.getvalue().strip()
    self.assertMultiLineEqual(expected_training, training_output)
    testing_output = self.mock_testing_file.getvalue().strip()
    self.assertMultiLineEqual(expected_testing, testing_output)