def test_convert_data_csv(self):
    """Check the CSV written by `util.convert_comments_data`.

    The example CSV is written to disk, converted, read back with pandas and
    compared cell by cell with the single expected row.
    """
    input_file = self._write_csv(self._create_example_csv())
    output_file = util.convert_comments_data(input_file)

    # Remove the quotes around identity terms list that read_csv injects.
    df = pd.read_csv(output_file).replace("'", '', regex=True)

    expected_identities = {
        'gender': [],
        'sexual_orientation': ['bisexual'],
        'race': ['other_race_or_ethnicity'],
        'religion': ['atheist', 'other_religion'],
        'disability': [
            'physical_disability',
            'intellectual_or_learning_disability',
            'psychiatric_or_mental_illness',
            'other_disability',
        ],
    }
    expected_row = {'comment_text': 'comment 1', 'toxicity': 0.0}
    # read_csv returns list-valued cells as text, so the expectation is
    # spelled in that same quote-stripped form, e.g. "[atheist, other_religion]".
    # NOTE(review): assumes the converter serialises each list with str();
    # confirm against util.convert_comments_data.
    for category, terms in expected_identities.items():
        expected_row[category] = '[' + ', '.join(terms) + ']'

    # Built with the constructor: DataFrame.append no longer exists in
    # pandas >= 2.0.
    expected_df = pd.DataFrame([expected_row])

    # The previous check compared the return values of
    # reset_index(..., inplace=True), which are both None, so it could never
    # fail. Compare the frames themselves instead. Column order and exact
    # dtypes are not part of the contract being tested here.
    pd.testing.assert_frame_equal(
        df.reset_index(drop=True),
        expected_df,
        check_dtype=False,
        check_like=True)
def test_convert_data_tfrecord(self):
    """Check the TFRecord file written by `util.convert_comments_data`.

    The example record is written to disk, converted, and every serialized
    record in the output is parsed back into a `tf.train.Example`. Exactly
    one example is expected, equal to the text-format proto below.
    """
    source_path = self._write_tf_records(self._create_example_tfrecord())
    converted_path = util.convert_comments_data(source_path)

    # Each dataset element is a scalar string tensor holding one serialized
    # Example; .numpy() yields the raw bytes to parse.
    records = tf.data.TFRecordDataset(filenames=[converted_path])
    parsed_examples = [
        tf.train.Example.FromString(record.numpy()) for record in records
    ]

    self.assertEqual(len(parsed_examples), 1)

    expected_example = text_format.Parse(
        """
        features {
          feature { key: "comment_text"
                    value { bytes_list {value: [ "comment 1" ] }} }
          feature { key: "toxicity" value { float_list { value: [ 0.0 ] }}}
          feature { key: "sexual_orientation"
                    value { bytes_list { value: ["bisexual"] }} }
          feature { key: "gender" value { bytes_list { }}}
          feature { key: "race"
                    value { bytes_list {
                      value: [ "other_race_or_ethnicity" ] }} }
          feature { key: "religion"
                    value { bytes_list {
                      value: [ "atheist", "other_religion" ] } } }
          feature { key: "disability" value { bytes_list {
                      value: [
                        "physical_disability",
                        "intellectual_or_learning_disability",
                        "psychiatric_or_mental_illness",
                        "other_disability"] }} }
        }
        """, tf.train.Example())
    self.assertEqual(parsed_examples[0], expected_example)