Example #1
def i_check_topic_distributions(step, check_file):
    check_file = res_filename(check_file)
    predictions_file = world.output
    import traceback
    try:
        with UnicodeReader(predictions_file) as predictions_file:
            with UnicodeReader(check_file) as check_file:
                for row in predictions_file:
                    check_row = next(check_file)
                    assert len(check_row) == len(row)
                    for index in range(len(row)):
                        dot = row[index].find(".")
                        decimal_places = 1
                        if dot > 0 or (check_row[index].find(".") > 0
                                       and check_row[index].endswith(".0")):
                            try:
                                decimal_places = min(
                                    len(row[index]),
                                    len(check_row[index])) - dot - 1
                                row[index] = round(float(row[index]),
                                                   decimal_places)
                                check_row[index] = round(
                                    float(check_row[index]), decimal_places)
                            except ValueError:
                                decimal_places = 1
                            assert_almost_equal(check_row[index],
                                                row[index],
                                                places=(decimal_places - 1))
                        else:
                            assert_equal(check_row[index], row[index])
    except Exception:
        assert False, traceback.format_exc()
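
The precision matching above rounds both values to the number of decimal
places available in the shorter string before comparing them. A minimal,
self-contained sketch of that idea, using only the standard library (the
helper name is illustrative, not part of the BigML test helpers):

def almost_equal_strings(value, expected):
    """Compare two numeric strings at the precision of the shorter one."""
    dot = value.find(".")
    if dot < 0:
        # no fractional part: fall back to exact comparison
        return value == expected
    # decimal places available in the shorter representation
    places = min(len(value), len(expected)) - dot - 1
    return round(float(value), places) == round(float(expected), places)

assert almost_equal_strings("0.12345", "0.1234499")
assert not almost_equal_strings("0.12", "0.21")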
Example #2
def i_check_forecasts(step, check_file):
    check_file = res_filename(check_file)
    forecasts_file = "%s_%s.csv" % \
        (world.output, world.time_series["object"]["objective_field"])
    import traceback
    try:
        with UnicodeReader(forecasts_file) as forecasts_file:
            with UnicodeReader(check_file) as check_file:
                for row in forecasts_file:
                    check_row = next(check_file)
                    assert_equal(len(check_row), len(row))
                    for index in range(len(row)):
                        dot = row[index].find(".")
                        decimal_places = 1
                        if dot > 0 or (check_row[index].find(".") > 0
                                       and check_row[index].endswith(".0")):
                            try:
                                decimal_places = min(
                                    len(row[index]),
                                    len(check_row[index])) - dot - 1
                                row[index] = round(float(row[index]),
                                                   decimal_places)
                                check_row[index] = round(
                                    float(check_row[index]), decimal_places)
                            except ValueError:
                                decimal_places = 1
                            assert_almost_equal(check_row[index],
                                                row[index],
                                                places=(decimal_places - 1))
                        else:
                            assert_equal(check_row[index], row[index])
    except Exception:
        assert False, traceback.format_exc()
Example #3
def check_summary_like_expected(step, summary_file, expected_file):
    summary_contents = []
    expected_contents = []
    with UnicodeReader(res_filename(summary_file)) as summary_handler:
        for line in summary_handler:
            summary_contents.append(line)
    with UnicodeReader(res_filename(expected_file)) as expected_handler:
        for line in expected_handler:
            expected_contents.append(line)
    eq_(summary_contents, expected_contents)
Example #4
def read_field_attributes(path):
    """Reads field attributes from a csv file to update source fields.

    Each line contains a column number followed by a comma-separated list
    of attributes.
    The expected structure is:
    column number, name, label, description

    For example:

    0,'first name','label for the first field','first field full description'
    1,'last name','label for the last field','last field full description'

    """
    field_attributes = {}
    try:
        with UnicodeReader(path, quotechar="'") as attributes_reader:
            for row in attributes_reader:
                attributes = {}
                if len(row) > 1:
                    for index in range(min(len(ATTRIBUTE_NAMES),
                                           len(row) - 1)):
                        attributes[ATTRIBUTE_NAMES[index]] = row[index + 1]
                    field_attributes[int(row[0])] = attributes
            return field_attributes
    except IOError:
        sys.exit("Error: cannot read field attributes %s" % path)
Example #5
def read_objective_weights(path):
    """Reads objective weights from a CSV file in a class, weight format.

    The expected structure is:
    class name, weight

    For example:

    Iris-setosa,5
    Iris-versicolor,10

    """
    objective_weights = []
    try:
        with UnicodeReader(path, quotechar="'") as weights_reader:
            for row in weights_reader:
                if len(row) != 2:
                    sys.exit("Error: wrong objective field file syntax\n%s" %
                             ",".join(row))
                weights = row[:]
                try:
                    weights[1] = int(weights[1])
                except ValueError:
                    sys.exit("Error: wrong objective field file syntax\n%s" %
                             ",".join(row))
                objective_weights.append(weights)
            return objective_weights
    except IOError:
        sys.exit("Error: cannot read objective weights %s" % path)
Example #6
    def reset(self):
        """Starts a new csv reader object

        """
        try:
            self.training_set.close()
        except (IOError, AttributeError):
            pass
        try:
            self.training_reader = UnicodeReader(
                self.training_set, delimiter=self.training_separator,
                lineterminator="\n").open_reader()
        except IOError:
            sys.exit("Error: cannot read training %s" % self.training_set)
Example #7
def read_votes(votes_files, to_prediction, data_locale=None):
    """Reads the votes found in the votes' files.

       Returns a list of MultiVote objects containing the lists of
       predictions. The votes_files parameter should contain the paths to
       the files where votes are stored.
       The to_prediction parameter expects the method of a local model
       object that casts the string prediction values read from the file to
       their real type. For instance:
           >>> local_model = Model(model)
           >>> prediction = local_model.to_prediction("1")
           >>> isinstance(prediction, int)
           True
           >>> read_votes(["my_predictions_file"], local_model.to_prediction)
       data_locale should contain the string identification for the locale
       used in numeric formatting.
    """
    votes = []
    for order, votes_file in enumerate(votes_files):
        index = 0
        with UnicodeReader(votes_file) as rdr:
            for row in rdr:
                prediction = to_prediction(row[0], data_locale=data_locale)
                if index > (len(votes) - 1):
                    votes.append(MultiVote([]))
                distribution = None
                instances = None
                confidence = None  # avoid an unbound name for short rows
                if len(row) > 2:
                    distribution = ast.literal_eval(row[2])
                    instances = int(row[3])
                    try:
                        confidence = float(row[1])
                    except ValueError:
                        confidence = 0.0
                prediction_row = [
                    prediction, confidence, order, distribution, instances
                ]
                votes[index].append_row(prediction_row)
                index += 1
    return votes
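
A row in a votes file is either a bare prediction or the four columns
prediction, confidence, distribution, instances; the confidence is only
parsed when the distribution columns are present. A hedged usage sketch,
assuming a local_model built as in the docstring and that MultiVote keeps
its rows in a predictions attribute:

# my_predictions_file, one row per input instance, e.g.:
#   Iris-setosa,0.92,"[['Iris-setosa', 5]]",5
votes = read_votes(["my_predictions_file"], local_model.to_prediction)
for multivote in votes:
    print(multivote.predictions)  # the accumulated vote rows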
Example #8
def i_check_predictions(step, check_file):
    with UnicodeReader(world.output) as prediction_rows:
        with UnicodeReader(res_filename(check_file)) as test_rows:
            check_rows(prediction_rows, test_rows)
Example #9
    def new_fields_structure(self,
                             csv_attributes_file=None,
                             attributes=None,
                             out_file=None):
        """Builds the field structure needed to update a fields dictionary
        in a BigML resource.

        :param csv_attributes_file: (string) Path to a CSV file like the one
                                             generated by summary_csv.
        :param attributes: (list) list of rows containing the
                                  attributes information ordered
                                  as in the summary_csv output.
        :param out_file: (string) Path to a JSON file that will be used
                                  to store the new fields structure. If None,
                                  the output is returned as a dict.
        """
        if csv_attributes_file is not None:
            reader = UnicodeReader(csv_attributes_file).open_reader()
            attributes = [row for row in reader]
        new_fields_structure = {}
        if "field ID" in attributes[0] or "field column" in attributes[0]:
            # headers are used
            for index in range(1, len(attributes)):
                new_attributes = dict(zip(attributes[0], attributes[index]))
                if new_attributes.get("field ID"):
                    field_id = new_attributes.get("field ID")
                    if field_id not in self.fields:
                        raise ValueError("Field ID %s not found"
                                         " in this resource" % field_id)
                    del new_attributes["field ID"]
                else:
                    field_column = int(new_attributes.get("field column"))
                    if field_column not in self.field_columns:
                        raise ValueError("Field column %s not found"
                                         " in this resource" % field_column)
                    field_id = self.field_id(field_column)
                    del new_attributes["field column"]
                for attribute, value in list(new_attributes.items()):
                    if attribute not in UPDATABLE_HEADERS:
                        del new_attributes[attribute]
                    else:
                        new_attributes[UPDATABLE_HEADERS[attribute]] = value
                        if attribute != UPDATABLE_HEADERS[attribute]:
                            del new_attributes[attribute]
                if "preferred" in new_attributes:
                    new_attributes['preferred'] = json.loads(
                        new_attributes['preferred'])
                new_fields_structure[field_id] = new_attributes
        else:
            # assume the order given in the summary_csv method
            first_attribute = attributes[0][0]
            first_column_is_id = False
            try:
                field_id = self.field_id(int(first_attribute))
            except ValueError:
                field_id = first_attribute
                first_column_is_id = True
            if field_id not in self.fields:
                raise ValueError("The first column should contain either the"
                                 " column or ID of the fields. Failed to find"
                                 " %s as either of them." % field_id)
            headers = SUMMARY_HEADERS[2:7]
            headers = [UPDATABLE_HEADERS[header] for header in headers]
            try:
                for field_attributes in attributes:
                    if field_attributes[6] is not None:
                        field_attributes[6] = json.loads(field_attributes[6])
                    field_id = field_attributes[0] if first_column_is_id else \
                        self.field_id(int(field_attributes[0]))
                    new_fields_structure[field_id] = \
                        dict(zip(headers, field_attributes[1: 6]))

            except ValueError:
                raise ValueError("The first column should contain either the"
                                 " column or ID of the fields. Failed to find"
                                 " %s as either of them." % field_id)
        if out_file is None:
            return {"fields": new_fields_structure}
        else:
            try:
                with open(out_file, "w") as out:
                    json.dump({"fields": new_fields_structure}, out)
            except IOError:
                raise IOError("Failed writing the fields structure file in"
                              " %s. Please check your arguments." % out_file)
Example #10
class TstReader(object):
    """Retrieves csv info and builds a input data dict

    """
    def __init__(self,
                 test_set,
                 test_set_header,
                 fields,
                 objective_field,
                 test_separator=None):
        """Builds a generator from a csv file and the fields' model structure

           `test_set`: path to the test data file
           `test_set_header`: boolean, True means that headers are first row
                              in the file
           `fields`: Fields object with the expected fields structure.
           `objective_field`: field_id of the objective field
        """
        self.test_set = test_set
        if test_set.__class__.__name__ == "StringIO":
            self.encode = None
            self.test_set = UTF8Recoder(test_set, SYSTEM_ENCODING)
        else:
            self.encode = None if PYTHON3 else FILE_ENCODING
        self.test_set_header = test_set_header
        self.fields = fields
        if (objective_field is not None
                and objective_field not in fields.fields):
            try:
                objective_field = fields.field_id(objective_field)
            except ValueError as exc:
                sys.exit(exc)
        self.objective_field = objective_field
        if test_separator and not PYTHON3:
            test_separator = decode2(test_separator, encoding="string_escape")
        self.test_separator = (test_separator if test_separator is not None
                               else get_csv_delimiter())
        if len(self.test_separator) > 1:
            sys.exit("Only one character can be used as test data separator.")
        try:
            self.test_reader = UnicodeReader(
                self.test_set,
                delimiter=self.test_separator,
                lineterminator="\n").open_reader()
        except IOError:
            sys.exit("Error: cannot read test %s" % test_set)

        self.headers = None
        self.raw_headers = None
        self.exclude = []
        if test_set_header:
            self.headers = next(self.test_reader)
            # validate headers against model fields, excluding the
            # objective_field, which may or may not be present
            if objective_field is not None:
                objective_field = fields.field_column_number(objective_field)
            try:
                fields_names = [
                    fields.fields[fields.field_id(i)]['name']
                    for i in sorted(fields.fields_by_column_number.keys())
                    if objective_field is None or i != objective_field
                ]
            except ValueError as exc:
                sys.exit(exc)
            self.raw_headers = self.headers[:]

            self.exclude = [
                i for i in range(len(self.headers))
                if self.headers[i] not in fields_names
            ]

            self.exclude.reverse()
            if self.exclude:
                if len(self.headers) > len(self.exclude):
                    for index in self.exclude:
                        del self.headers[index]
                else:
                    raise Exception(
                        (u"No test field matches the model fields."
                         u"\nThe expected fields are:\n\n%s\n\n"
                         u"while the headers found in the test file are:"
                         u"\n\n%s\n\n"
                         u"Use the --no-test-header flag if the first line"
                         u" should not be interpreted as headers." %
                         (",".join(fields_names),
                          ",".join(self.headers))).encode("utf-8"))
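
The header-exclusion step above collects the indices of test headers that
match no model field name and deletes them in reverse order, so earlier
deletions do not shift the remaining indices. A self-contained sketch with
illustrative data:

headers = ["sepal length", "petal width", "extra column"]
fields_names = ["sepal length", "sepal width", "petal length", "petal width"]
exclude = [i for i in range(len(headers)) if headers[i] not in fields_names]
exclude.reverse()  # delete from the end so indices stay valid
for index in exclude:
    del headers[index]
print(headers)  # ['sepal length', 'petal width']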