示例#1
0
def i_check_topic_distributions(step, check_file):
    """Checks the generated topic distributions against the expected ones.

    Numeric columns are compared only up to the number of decimal places
    that both files share; other columns must match exactly.
    """
    check_file = res_filename(check_file)
    predictions_file = world.output
    import traceback
    try:
        with UnicodeReader(predictions_file) as predictions_file:
            with UnicodeReader(check_file) as check_file:
                for row in predictions_file:
                    # `next()` builtin works for both Python 2 and 3 readers.
                    check_row = next(check_file)
                    assert len(check_row) == len(row)
                    for index in range(len(row)):
                        dot = row[index].find(".")
                        decimal_places = 1
                        if dot > 0 or (check_row[index].find(".") > 0
                                       and check_row[index].endswith(".0")):
                            try:
                                # Compare only as many decimals as the
                                # shortest of the two values provides.
                                decimal_places = min(
                                    len(row[index]),
                                    len(check_row[index])) - dot - 1
                                row[index] = round(float(row[index]),
                                                   decimal_places)
                                check_row[index] = round(
                                    float(check_row[index]), decimal_places)
                            except ValueError:
                                decimal_places = 1
                            assert_almost_equal(check_row[index],
                                                row[index],
                                                places=(decimal_places - 1))
                        else:
                            assert_equal(check_row[index], row[index])
    except Exception:
        # Surface the full traceback as the assertion message.
        assert False, traceback.format_exc()
def i_check_forecasts(step, check_file):
    """Checks the generated forecasts against the expected ones.

    The forecasts file name is derived from the output path and the time
    series objective field. Numeric columns are compared only up to the
    number of decimal places that both files share.
    """
    check_file = res_filename(check_file)
    forecasts_file = "%s_%s.csv" % \
        (world.output, world.time_series["object"]["objective_field"])
    import traceback
    try:
        with UnicodeReader(forecasts_file) as forecasts_file:
            with UnicodeReader(check_file) as check_file:
                for row in forecasts_file:
                    # `next()` builtin works for both Python 2 and 3 readers.
                    check_row = next(check_file)
                    assert_equal(len(check_row), len(row))
                    for index in range(len(row)):
                        dot = row[index].find(".")
                        decimal_places = 1
                        if dot > 0 or (check_row[index].find(".") > 0
                                       and check_row[index].endswith(".0")):
                            try:
                                # Compare only as many decimals as the
                                # shortest of the two values provides.
                                decimal_places = min(
                                    len(row[index]),
                                    len(check_row[index])) - dot - 1
                                row[index] = round(float(row[index]),
                                                   decimal_places)
                                check_row[index] = round(
                                    float(check_row[index]), decimal_places)
                            except ValueError:
                                decimal_places = 1
                            assert_almost_equal(check_row[index],
                                                row[index],
                                                places=(decimal_places - 1))
                        else:
                            assert_equal(check_row[index], row[index])
    except Exception:
        # Surface the full traceback as the assertion message.
        assert False, traceback.format_exc()
示例#3
0
def check_summary_like_expected(step, summary_file, expected_file):
    """Checks that the summary file contents equal the expected file's,
    row by row.
    """
    # Materialize each reader directly instead of appending in a loop.
    with UnicodeReader(res_filename(summary_file)) as summary_handler:
        summary_contents = list(summary_handler)
    with UnicodeReader(res_filename(expected_file)) as expected_handler:
        expected_contents = list(expected_handler)
    eq_(summary_contents, expected_contents)
示例#4
0
    def reset(self):
        """Reopen the training source with a freshly built csv reader."""
        # Close the previous handle if there is one; a missing `close`
        # attribute or an I/O failure simply means nothing to clean up.
        try:
            self.training_set.close()
        except (IOError, AttributeError):
            pass
        try:
            reader = UnicodeReader(self.training_set,
                                   delimiter=self.training_separator,
                                   lineterminator="\n")
            self.training_reader = reader.open_reader()
        except IOError:
            sys.exit("Error: cannot read training %s" % self.training_set)
示例#5
0
def read_field_attributes(path):
    """Reads field attributes from a csv file to update source fields.

    A column number and a list of attributes separated by a comma per line.
    The expected structure is:
    column number, name, label, description

    For example:

    0,'first name','label for the first field','first field full description'
    1,'last name','label for the last field','last field full description'

    """
    field_attributes = {}
    try:
        with UnicodeReader(path, quotechar="'") as attributes_reader:
            for row in attributes_reader:
                if len(row) > 1:
                    # Pair each attribute value (columns 1..n) with its
                    # attribute name; zip truncates to the shorter of the
                    # two, ignoring any extra columns.
                    field_attributes[int(row[0])] = dict(
                        zip(ATTRIBUTE_NAMES, row[1:]))
            return field_attributes
    except IOError:
        sys.exit("Error: cannot read field attributes %s" % path)
示例#6
0
def read_objective_weights(path):
    """Reads objective weights from a CSV file in a class, weight format.

    The expected structure is:
    class name, weight

    For example:

    Iris-setosa,5
    Iris-versicolor,10

    """
    objective_weights = []
    try:
        with UnicodeReader(path, quotechar="'") as weights_reader:
            for row in weights_reader:
                if len(row) != 2:
                    sys.exit("Error: wrong objective field file syntax\n%s" %
                             ",".join(row))
                weights = row[:]
                try:
                    # The weight column must be an integer.
                    weights[1] = int(weights[1])
                except ValueError:
                    sys.exit("Error: wrong objective field file syntax\n%s" %
                             ",".join(row))
                objective_weights.append(weights)
            return objective_weights
    except IOError:
        sys.exit("Error: cannot read objective weights %s" % path)
示例#7
0
    def reset(self):
        """Discard any open training handle and start a new csv reader."""
        try:
            self.training_set.close()
        except (IOError, AttributeError):
            # Nothing (or nothing closeable) was open — safe to continue.
            pass
        try:
            self.training_reader = UnicodeReader(
                self.training_set,
                delimiter=self.training_separator,
                lineterminator="\n").open_reader()
        except IOError:
            sys.exit("Error: cannot read training %s" % self.training_set)
示例#8
0
def read_votes(votes_files, to_prediction, data_locale=None):
    """Reads the votes found in the votes' files.

       Returns a list of MultiVote objects containing the list of predictions.
       votes_files parameter should contain the path to the files where votes
       are stored
       In to_prediction parameter we expect the method of a local model object
       that casts the string prediction values read from the file to their
       real type. For instance
           >>> local_model = Model(model)
           >>> prediction = local_model.to_prediction("1")
           >>> isinstance(prediction, int)
           True
           >>> read_votes(["my_predictions_file"], local_model.to_prediction)
       data_locale should contain the string identification for the locale
       used in numeric formatting.
    """
    votes = []
    for order in range(0, len(votes_files)):
        votes_file = votes_files[order]
        index = 0
        with UnicodeReader(votes_file) as rdr:
            for row in rdr:
                prediction = to_prediction(row[0], data_locale=data_locale)
                if index > (len(votes) - 1):
                    votes.append(MultiVote([]))
                distribution = None
                instances = None
                # Default confidence so that short rows (no confidence
                # column) do not raise a NameError when building the
                # prediction row below.
                confidence = 0.0
                if len(row) > 2:
                    distribution = ast.literal_eval(row[2])
                    instances = int(row[3])
                    try:
                        confidence = float(row[1])
                    except ValueError:
                        confidence = 0.0
                prediction_row = [
                    prediction, confidence, order, distribution, instances
                ]
                votes[index].append_row(prediction_row)
                index += 1
    return votes
def i_check_predictions(step, check_file):
    """Compares the generated predictions to the expected ones, row by row."""
    expected_path = res_filename(check_file)
    with UnicodeReader(world.output) as prediction_rows, \
            UnicodeReader(expected_path) as test_rows:
        check_rows(prediction_rows, test_rows)
示例#10
0
文件: fields.py 项目: mhakanda/python
    def new_fields_structure(self,
                             csv_attributes_file=None,
                             attributes=None,
                             out_file=None):
        """Builds the field structure needed to update a fields dictionary
        in a BigML resource.

        :param csv_attributes_file: (string) Path to a CSV file like the one
                                             generated by summary_csv.
        :param attributes: (list) list of rows containing the
                                  attributes information ordered
                                  as in the summary_csv output.
        :param out_file: (string) Path to a JSON file that will be used
                                  to store the new fields structure. If None,
                                  the output is returned as a dict.
        :raises ValueError: when a field ID/column cannot be matched to
                            this resource's fields.
        :raises IOError: when ``out_file`` cannot be written.
        """
        if csv_attributes_file is not None:
            reader = UnicodeReader(csv_attributes_file).open_reader()
            attributes = list(reader)
        new_fields_structure = {}
        if "field ID" in attributes[0] or "field column" in attributes[0]:
            # headers are used
            for index in range(1, len(attributes)):
                new_attributes = dict(zip(attributes[0], attributes[index]))
                if new_attributes.get("field ID"):
                    field_id = new_attributes.get("field ID")
                    if field_id not in self.fields:
                        raise ValueError("Field ID %s not found"
                                         " in this resource" % field_id)
                    del new_attributes["field ID"]
                else:
                    field_column = int(new_attributes.get("field column"))
                    if field_column not in self.field_columns:
                        raise ValueError("Field column %s not found"
                                         " in this resource" % field_column)
                    field_id = self.field_id(field_column)
                    del new_attributes["field column"]
                # Iterate over a snapshot of the items: deleting keys while
                # iterating a dict view raises RuntimeError on Python 3.
                for attribute, value in list(new_attributes.items()):
                    if attribute not in UPDATABLE_HEADERS:
                        del new_attributes[attribute]
                    else:
                        # Rename the header to its updatable counterpart.
                        new_attributes[UPDATABLE_HEADERS[attribute]] = value
                        if attribute != UPDATABLE_HEADERS[attribute]:
                            del new_attributes[attribute]
                if "preferred" in new_attributes:
                    # "preferred" is stored as a JSON boolean string.
                    new_attributes['preferred'] = json.loads(
                        new_attributes['preferred'])
                new_fields_structure[field_id] = new_attributes
        else:
            # assume the order given in the summary_csv method
            first_attribute = attributes[0][0]
            first_column_is_id = False
            try:
                field_id = self.field_id(int(first_attribute))
            except ValueError:
                field_id = first_attribute
                first_column_is_id = True
            if field_id not in self.fields:
                raise ValueError("The first column should contain either the"
                                 " column or ID of the fields. Failed to find"
                                 " %s as either of them." % field_id)
            headers = SUMMARY_HEADERS[2:7]
            headers = [UPDATABLE_HEADERS[header] for header in headers]
            try:
                for field_attributes in attributes:
                    # NOTE(review): column 6 is JSON-decoded here but the zip
                    # below only maps columns 1-5 — confirm whether [1: 7]
                    # was intended upstream.
                    if field_attributes[6] is not None:
                        field_attributes[6] = json.loads(field_attributes[6])
                    field_id = field_attributes[0] if first_column_is_id else \
                        self.field_id(int(field_attributes[0]))
                    new_fields_structure[field_id] = \
                        dict(zip(headers, field_attributes[1: 6]))

            except ValueError:
                raise ValueError("The first column should contain either the"
                                 " column or ID of the fields. Failed to find"
                                 " %s as either of them." % field_id)
        if out_file is None:
            return {"fields": new_fields_structure}
        else:
            try:
                with open(out_file, "w") as out:
                    json.dump({"fields": new_fields_structure}, out)
            except IOError:
                raise IOError("Failed writing the fields structure file in"
                              " %s- Please, check your arguments." % out_file)
示例#11
0
class TrainReader(object):
    """Retrieves csv info and manages objective fields and multi-labels

    """
    def __init__(self, training_set, training_set_header, objective_field,
                 multi_label=False, labels=None, label_separator=None,
                 training_separator=None, multi_label_fields=None,
                 label_aggregates=None, objective=True):
        """Builds a generator from a csv file

           `training_set`: path to the training data file
           `training_set_header`: boolean, True means that headers are first
                                 row in the file
           `objective_field`: objective field column or field name
           `labels`: Fields object with the expected fields structure.
        """
        self.training_set = training_set
        if training_set.__class__.__name__ == "StringIO":
            self.encode = None
            # Fix: the original referenced the undefined name `test_set`
            # here, raising a NameError for StringIO inputs.
            self.training_set = UTF8Recoder(training_set, SYSTEM_ENCODING)
        else:
            self.encode = None if PYTHON3 else FILE_ENCODING
        self.training_set_header = training_set_header
        self.training_reader = None
        self.multi_label = multi_label
        self.objective = objective
        if label_aggregates is None:
            label_aggregates = []
        self.label_aggregates = label_aggregates
        self.training_separator = (decode2(training_separator,
                                           encoding="string_escape")
                                   if training_separator is not None
                                   else get_csv_delimiter())
        if len(self.training_separator) > 1:
            sys.exit("Only one character can be used as test data separator.")
        # opening csv reader
        self.reset()
        self.label_separator = (decode2(label_separator,
                                        encoding="string_escape")
                                if label_separator is not None
                                else get_csv_delimiter())

        first_row = self.get_next(reset=not training_set_header)
        self.row_length = len(first_row)

        if training_set_header:
            self.headers = first_row
        else:
            # Generate synthetic headers when the file has none.
            self.headers = [("field_%s" % index) for index in
                            range(0, self.row_length)]

        self.multi_label_fields = sorted(self._get_columns(multi_label_fields))
        if objective:
            self.objective_column = self._get_columns([objective_field])[0]
            if self.objective_column not in self.multi_label_fields:
                self.multi_label_fields.append(self.objective_column)
            self.labels = labels
        self.fields_labels = self._get_labels()
        if objective:
            if labels is None:
                self.labels = self.fields_labels[self.objective_column]
            self.objective_name = self.headers[self.objective_column]

    def __iter__(self):
        """Iterator method

        """
        return self

    def get_label_headers(self):
        """Returns a list of headers with the new extended field names for
           each objective label
        """
        new_headers = self.get_headers()
        for field_column in self.multi_label_fields:
            labels = self.fields_labels[field_column]
            new_field_names = [get_label_field(self.headers[field_column],
                                               label)
                               for label in labels]
            new_headers.extend(new_field_names)
            for aggregate in self.label_aggregates:
                new_headers.append(get_label_field(
                    self.headers[field_column], aggregate))
        if not PYTHON3:
            new_headers = [encode2(header) for header in new_headers]
        return new_headers

    def _get_columns(self, fields_list):
        """Receives a comma-separated list of fields given by name or
           column number and returns column number list

        """
        column_list = []
        if fields_list is None:
            return column_list
        if not isinstance(fields_list, list):
            fields_list = [fields_list]
        for field in fields_list:
            column = None
            if isinstance(field, int):
                column = field
            elif field is None:
                # No field given: default to the last column.
                column = self.row_length - 1
            else:
                try:
                    column = self.headers.index(field)
                except ValueError:
                    if self.objective:
                        sys.exit("The %s has been set as multi-label field but"
                                 " it cannot be found in the headers row: \n"
                                 " %s" %
                                 (field,
                                  ", ".join([encode2(header)
                                             for header in self.headers])))
                    else:
                        column = None
            if column is not None:
                column_list.append(column)
        return column_list

    def reset(self):
        """Starts a new csv reader object

        """
        try:
            self.training_set.close()
        except (IOError, AttributeError):
            pass
        try:
            self.training_reader = UnicodeReader(
                self.training_set, delimiter=self.training_separator,
                lineterminator="\n").open_reader()
        except IOError:
            sys.exit("Error: cannot read training %s" % self.training_set)

    def next(self):
        """Iterator method for next item

        """
        return self.get_next()

    # Python 3 iterator protocol alias (the class already branches on
    # PYTHON3 elsewhere, so both protocols should be supported).
    __next__ = next

    def get_next(self, extended=False, reset=False):
        """Returns the next row. If extended is True, the row is extended with
           a list of booleans depending on whether the label is in the
           objective field value or not. If reset is True, the file is
           reopened and pointer starts at the beginning of the file.

        """
        # Builtin next() works with both Python 2 and 3 reader objects.
        row = next(self.training_reader)
        row = [value.strip() for value in row]
        if extended:
            if self.multi_label and self.fields_labels is None:
                self.fields_labels = self._get_labels()

            for field_column in self.multi_label_fields:
                aggregated_field_value = row[field_column]
                field_values = aggregated_field_value.split(
                    self.label_separator)

                field_values = [value.strip() for
                                value in field_values]

                # One boolean (as 0/1) per known label of the field.
                labels_row = [int(label in field_values) for label in
                              self.fields_labels[field_column]]
                row.extend(labels_row)
                for aggregate in self.label_aggregates:
                    row.append(AGGREGATES[aggregate](field_values))
        if reset:
            self.reset()
        if not PYTHON3:
            row = [encode2(item) for item in row]
        return row

    def number_of_rows(self):
        """Returns the number of rows in the test file

        """
        rows = file_number_of_lines(self.training_set)
        if self.training_set_header:
            rows -= 1
        return rows

    def has_headers(self):
        """Returns whether the training set file has a headers row

        """
        return self.training_set_header

    def _get_labels(self):
        """Returns the list of labels in the multi-label fields

        """
        labels = {}
        for field_column in self.multi_label_fields:
            labels[field_column] = []
        for row in self:
            for field_column in self.multi_label_fields:
                labels = self._get_field_labels(row, labels,
                                                field_column,
                                                self.label_separator)
        return labels

    def _get_field_labels(self, row, labels, field_column, separator):
        """Returns the list of labels in a multi-label field

        """
        field_value = row[field_column]
        if self.multi_label:
            new_labels = field_value.split(separator)
            new_labels = [decode2(label).strip()
                          for label in new_labels]
            # TODO: clean user given missing tokens
            # Fix: filter empty labels with a comprehension. The original
            # deleted from the list while iterating a precomputed range,
            # which could skip adjacent empties or raise IndexError.
            new_labels = [label for label in new_labels if label != '']
            if new_labels != []:
                if (self.objective and field_column == self.objective_column
                        and self.labels is not None):
                    # If user gave the subset of labels, use only those
                    new_labels = [label for label in self.labels if
                                  label in new_labels]
                labels[field_column].extend(new_labels)
        else:
            labels[field_column].append(field_value)
        labels[field_column] = sorted(list(set(labels[field_column])))
        return labels

    def get_headers(self, objective_field=True):
        """Returns headers. If objective_field is False, the objective field
           header is removed.

        """
        if objective_field:
            return self.headers[:]
        new_headers = self.headers[:]
        if self.objective:
            del new_headers[self.objective_column]
        return new_headers

    def new_fields_info(self):
        """Dict of 2-item lists 'field_column': [label, label_column]
           describing the per label extension

        """
        info = {}
        column = len(self.headers)
        for field_column in self.multi_label_fields:
            alpha_field_column = str(field_column)
            info[alpha_field_column] = []
            labels = self.fields_labels[field_column]
            for label in labels:
                info[alpha_field_column].append([label, column])
                column += 1
            # skip the aggregate values columns
            column += len(self.label_aggregates)

        return info

    def get_multi_label_data(self):
        """Returns a dict to store the multi-label info that defines this
           source

        """
        if self.objective:
            return {
                "multi_label_fields": [[column, self.headers[column]]
                                       for column in self.multi_label_fields],
                "generated_fields": self.new_fields_info(),
                "objective_name": self.objective_name,
                "objective_column": self.objective_column}

    def close(self):
        """Closing file handler

        """
        self.training_reader.close_reader()
示例#12
0
class TstReader(object):
    """Retrieves csv info and builds a input data dict

    """
    def __init__(self,
                 test_set,
                 test_set_header,
                 fields,
                 objective_field,
                 test_separator=None):
        """Builds a generator from a csv file and the fields' model structure

           `test_set`: path to the test data file
           `test_set_header`: boolean, True means that headers are first row
                              in the file
           `fields`: Fields object with the expected fields structure.
           `objective_field`: field_id of the objective field
        """
        self.test_set = test_set
        if test_set.__class__.__name__ == "StringIO":
            self.encode = None
            self.test_set = UTF8Recoder(test_set, SYSTEM_ENCODING)
        else:
            self.encode = None if PYTHON3 else FILE_ENCODING
        self.test_set_header = test_set_header
        self.fields = fields
        if (objective_field is not None
                and objective_field not in fields.fields):
            # Accept the objective given by name/column and map it to an id.
            try:
                objective_field = fields.field_id(objective_field)
            except ValueError as exc:
                sys.exit(exc)
        self.objective_field = objective_field
        if test_separator and not PYTHON3:
            test_separator = decode2(test_separator, encoding="string_escape")
        self.test_separator = (test_separator if test_separator is not None
                               else get_csv_delimiter())
        if len(self.test_separator) > 1:
            sys.exit("Only one character can be used as test data separator.")
        try:
            self.test_reader = UnicodeReader(
                self.test_set,
                delimiter=self.test_separator,
                lineterminator="\n").open_reader()
        except IOError:
            sys.exit("Error: cannot read test %s" % test_set)

        self.headers = None
        self.raw_headers = None
        self.exclude = []
        if test_set_header:
            # Builtin next() works with both Python 2 and 3 reader objects.
            self.headers = next(self.test_reader)
            # validate headers against model fields excluding objective_field,
            # that may be present or not
            if objective_field is not None:
                objective_field = fields.field_column_number(objective_field)
            try:
                fields_names = [
                    fields.fields[fields.field_id(i)]['name']
                    for i in sorted(fields.fields_by_column_number.keys())
                    if objective_field is None or i != objective_field
                ]
            except ValueError as exc:
                sys.exit(exc)
            self.raw_headers = self.headers[:]

            self.exclude = [
                i for i in range(len(self.headers))
                if self.headers[i] not in fields_names
            ]

            # Delete from the end so the remaining indexes stay valid.
            self.exclude.reverse()
            if self.exclude:
                if len(self.headers) > len(self.exclude):
                    for index in self.exclude:
                        del self.headers[index]
                else:
                    raise Exception(
                        (u"No test field matches the model fields."
                         u"\nThe expected fields are:\n\n%s\n\n"
                         u"while "
                         u"the headers found in the test file are:"
                         u"\n\n%s\n\n"
                         u"Use --no-test-header flag if first li"
                         u"ne should not be interpreted as"
                         u" headers." % (",".join(fields_names), ",".join(
                             self.headers))).encode("utf-8"))
示例#13
0
class TrainReader(object):
    """Retrieves csv info and manages objective fields and multi-labels

    """
    def __init__(self,
                 training_set,
                 training_set_header,
                 objective_field,
                 multi_label=False,
                 labels=None,
                 label_separator=None,
                 training_separator=None,
                 multi_label_fields=None,
                 label_aggregates=None,
                 objective=True):
        """Builds a generator from a csv file

           `training_set`: path to the training data file, or a StringIO
                           object holding its contents
           `training_set_header`: boolean, True means that headers are first
                                 row in the file
           `objective_field`: objective field column or field name
           `multi_label`: boolean, True when multi-label fields may hold
                          several labels joined by `label_separator`
           `labels`: user-given subset of objective labels (None means all
                     labels found in the file are used)
           `label_separator`: separator between labels inside a multi-label
                              field value (system csv delimiter when None)
           `training_separator`: csv column delimiter (system csv delimiter
                                 when None)
           `multi_label_fields`: multi-label fields given by name or column
                                 number
           `label_aggregates`: names of aggregate functions to be appended
                               per multi-label field
           `objective`: boolean, False when no objective field is handled
        """
        self.training_set = training_set

        # NOTE(review): StringIO sources are wrapped in UTF8Recoder —
        # presumably to re-encode their contents; confirm against the
        # UTF8Recoder implementation.
        if training_set.__class__.__name__ == "StringIO":
            self.encode = None
            self.training_set = UTF8Recoder(training_set, SYSTEM_ENCODING)
        else:
            self.encode = None if PYTHON3 else FILE_ENCODING
        self.training_set_header = training_set_header
        self.training_reader = None
        self.multi_label = multi_label
        self.objective = objective
        if label_aggregates is None:
            label_aggregates = []
        self.label_aggregates = label_aggregates
        self.training_separator = (decode2(
            training_separator, encoding="string_escape") if training_separator
                                   is not None else get_csv_delimiter())
        if len(self.training_separator) > 1:
            sys.exit("Only one character can be used as test data separator.")
        # opening csv reader
        self.reset()
        self.label_separator = (decode2(
            label_separator, encoding="string_escape") if label_separator
                                is not None else get_csv_delimiter())

        # Read the first row to learn the row length; when the file has no
        # headers row, reset the reader so this first data row is not lost
        # (get_next(reset=True) reopens the file after reading).
        first_row = self.get_next(reset=not training_set_header)
        self.row_length = len(first_row)

        if training_set_header:
            self.headers = first_row
        else:
            # synthesize default "field_N" names when no headers are present
            self.headers = [("field_%s" % index)
                            for index in range(0, self.row_length)]

        # column numbers (not names) of the multi-label fields; the
        # objective column is always included when an objective is used
        self.multi_label_fields = sorted(self._get_columns(multi_label_fields))
        if objective:
            self.objective_column = self._get_columns([objective_field])[0]
            if not self.objective_column in self.multi_label_fields:
                self.multi_label_fields.append(self.objective_column)
            self.labels = labels
        # labels found per multi-label field column (scans the whole file)
        self.fields_labels = self._get_labels()
        if objective:
            if labels is None:
                self.labels = self.fields_labels[self.objective_column]
            self.objective_name = self.headers[self.objective_column]

    def __iter__(self):
        """Iterator method

        """
        return self

    def get_label_headers(self):
        """Returns the headers list extended with one generated field name
           per label (and per aggregate) of every multi-label field
        """
        headers = self.get_headers()
        for column in self.multi_label_fields:
            base_name = self.headers[column]
            headers.extend(get_label_field(base_name, label)
                           for label in self.fields_labels[column])
            headers.extend(get_label_field(base_name, aggregate)
                           for aggregate in self.label_aggregates)
        if not PYTHON3:
            headers = [encode2(header) for header in headers]
        return headers

    def _get_columns(self, fields_list):
        """Receives a comma-separated list of fields given by name or
           column number and returns column number list

        """
        column_list = []
        if fields_list is None:
            return column_list
        if not isinstance(fields_list, list):
            fields_list = [fields_list]
        for field in fields_list:
            column = None
            if isinstance(field, int):
                column = field
            elif field is None:
                column = self.row_length - 1
            else:
                try:
                    column = self.headers.index(field)
                except ValueError:
                    if self.objective:
                        sys.exit(
                            "The %s has been set as multi-label field but"
                            " it cannot be found in the headers row: \n"
                            " %s" % (field, ", ".join(
                                [encode2(header) for header in self.headers])))
                    else:
                        column = None
            if column is not None:
                column_list.append(column)
        return column_list

    def reset(self):
        """Reopens the csv reader, closing any previous file handle first

        """
        try:
            self.training_set.close()
        except (IOError, AttributeError):
            pass
        try:
            reader = UnicodeReader(self.training_set,
                                   delimiter=self.training_separator,
                                   lineterminator="\n")
            self.training_reader = reader.open_reader()
        except IOError:
            sys.exit("Error: cannot read training %s" % self.training_set)

    def next(self):
        """Python 2 iterator protocol: yields the next plain (unextended)
           row
        """
        return self.get_next()

    def get_next(self, extended=False, reset=False):
        """Returns the next row. If extended is True, the row is extended
           with one 0/1 flag per known label of each multi-label field
           (plus the configured aggregates). If reset is True, the file is
           reopened afterwards so the pointer returns to the beginning.
        """
        row = [value.strip() for value in self.training_reader.next()]
        if extended:
            # lazily build the labels map when it hasn't been computed yet
            if self.multi_label and self.fields_labels is None:
                self.fields_labels = self._get_labels()
            for field_column in self.multi_label_fields:
                field_values = [value.strip() for value in
                                row[field_column].split(self.label_separator)]
                # one boolean-as-int column per known label
                row.extend(int(label in field_values)
                           for label in self.fields_labels[field_column])
                # followed by the aggregate columns for this field
                for aggregate in self.label_aggregates:
                    row.append(AGGREGATES[aggregate](field_values))
        if reset:
            self.reset()
        if not PYTHON3:
            row = [encode2(item) for item in row]
        return row

    def number_of_rows(self):
        """Returns the number of data rows in the training file, not
           counting the headers row when one is present
        """
        rows = file_number_of_lines(self.training_set)
        return rows - 1 if self.training_set_header else rows

    def has_headers(self):
        """Returns whether the training set file has a headers row

        """
        return self.training_set_header

    def _get_labels(self):
        """Scans every row of the file and collects, per multi-label field
           column, the sorted list of distinct labels found
        """
        labels = dict((field_column, [])
                      for field_column in self.multi_label_fields)
        for row in self:
            for field_column in self.multi_label_fields:
                labels = self._get_field_labels(row, labels, field_column,
                                                self.label_separator)
        return labels

    def _get_field_labels(self, row, labels, field_column, separator):
        """Returns the list of labels in a multi-label field

        """
        field_value = row[field_column]
        if self.multi_label:
            new_labels = field_value.split(separator)
            new_labels = [decode2(label).strip() for label in new_labels]
            # TODO: clean user given missing tokens
            for label_index in range(0, len(new_labels)):
                if new_labels[label_index] == '':
                    del new_labels[label_index]
            if new_labels != []:
                if (self.objective and field_column == self.objective_column
                        and self.labels is not None):
                    # If user gave the subset of labels, use only those
                    new_labels = [
                        label for label in self.labels if label in new_labels
                    ]
                labels[field_column].extend(new_labels)
        else:
            labels[field_column].append(field_value)
        labels[field_column] = sorted(list(set(labels[field_column])))
        return labels

    def get_headers(self, objective_field=True):
        """Returns headers. If objective_field is False, the objective field
           header is removed.

        """
        if objective_field:
            return self.headers[:]
        new_headers = self.headers[:]
        if self.objective:
            del new_headers[self.objective_column]
        return new_headers

    def new_fields_info(self):
        """Dict of 2-item lists 'field_column': [label, label_column]
           describing the per label extension

        """
        info = {}
        column = len(self.headers)
        for field_column in self.multi_label_fields:
            alpha_field_column = str(field_column)
            info[alpha_field_column] = []
            labels = self.fields_labels[field_column]
            for label in labels:
                info[alpha_field_column].append([label, column])
                column += 1
            # skip the aggregate values columns
            column += len(self.label_aggregates)

        return info

    def get_multi_label_data(self):
        """Returns a dict to store the multi-label info that defines this
           source

        """
        if self.objective:
            return {
                "multi_label_fields": [[column, self.headers[column]]
                                       for column in self.multi_label_fields],
                "generated_fields":
                self.new_fields_info(),
                "objective_name":
                self.objective_name,
                "objective_column":
                self.objective_column
            }

    def close(self):
        """Closing file handler

        """
        self.training_reader.close_reader()