def i_check_topic_distributions(step, check_file):
    """Checks the topic distributions output against the expectations file.

    Numeric cells are compared up to the number of decimal places that both
    the produced and the expected value can provide; other cells must match
    exactly. Any failure is reported with the full traceback.

    :param step: behave/lettuce step object (unused, kept for the hook API)
    :param check_file: name of the expectations file (resolved via
                       res_filename)
    """
    import traceback
    check_path = res_filename(check_file)
    predictions_path = world.output
    try:
        with UnicodeReader(predictions_path) as predictions_reader:
            with UnicodeReader(check_path) as check_reader:
                for row in predictions_reader:
                    check_row = next(check_reader)
                    assert len(check_row) == len(row)
                    for index in range(len(row)):
                        dot = row[index].find(".")
                        decimal_places = 1
                        if dot > 0 or (check_row[index].find(".") > 0
                                       and check_row[index].endswith(".0")):
                            try:
                                # compare only up to the decimal places both
                                # values can provide
                                decimal_places = min(
                                    len(row[index]),
                                    len(check_row[index])) - dot - 1
                                row[index] = round(float(row[index]),
                                                   decimal_places)
                                check_row[index] = round(
                                    float(check_row[index]), decimal_places)
                            except ValueError:
                                # non-numeric content: fall back to a loose
                                # one-place comparison
                                decimal_places = 1
                            assert_almost_equal(check_row[index], row[index],
                                                places=(decimal_places - 1))
                        else:
                            assert_equal(check_row[index], row[index])
    except Exception:
        # surface the full traceback in the assertion message
        assert False, traceback.format_exc()
def i_check_forecasts(step, check_file):
    """Checks the forecasts output against the expectations file.

    Numeric cells are compared up to the number of decimal places that both
    the produced and the expected value can provide; other cells must match
    exactly. Any failure is reported with the full traceback.

    :param step: behave/lettuce step object (unused, kept for the hook API)
    :param check_file: name of the expectations file (resolved via
                       res_filename)
    """
    import traceback
    check_path = res_filename(check_file)
    forecasts_path = "%s_%s.csv" % \
        (world.output, world.time_series["object"]["objective_field"])
    try:
        with UnicodeReader(forecasts_path) as forecasts_reader:
            with UnicodeReader(check_path) as check_reader:
                for row in forecasts_reader:
                    check_row = next(check_reader)
                    assert_equal(len(check_row), len(row))
                    for index in range(len(row)):
                        dot = row[index].find(".")
                        decimal_places = 1
                        if dot > 0 or (check_row[index].find(".") > 0
                                       and check_row[index].endswith(".0")):
                            try:
                                # compare only up to the decimal places both
                                # values can provide
                                decimal_places = min(
                                    len(row[index]),
                                    len(check_row[index])) - dot - 1
                                row[index] = round(float(row[index]),
                                                   decimal_places)
                                check_row[index] = round(
                                    float(check_row[index]), decimal_places)
                            except ValueError:
                                # non-numeric content: fall back to a loose
                                # one-place comparison
                                decimal_places = 1
                            assert_almost_equal(check_row[index], row[index],
                                                places=(decimal_places - 1))
                        else:
                            assert_equal(check_row[index], row[index])
    except Exception:
        # surface the full traceback in the assertion message
        assert False, traceback.format_exc()
def check_summary_like_expected(step, summary_file, expected_file):
    """Asserts that the summary file content equals the expected file content.

    Both files are read row by row with UnicodeReader and compared as whole
    lists of rows.
    """
    with UnicodeReader(res_filename(summary_file)) as summary_handler:
        summary_contents = [line for line in summary_handler]
    with UnicodeReader(res_filename(expected_file)) as expected_handler:
        expected_contents = [line for line in expected_handler]
    eq_(summary_contents, expected_contents)
def reset(self):
    """Reopens the training file and rebuilds the csv reader object.

    A previously open handle is closed first; close failures are ignored
    because the handle may never have been opened.
    """
    try:
        self.training_set.close()
    except (IOError, AttributeError):
        pass
    try:
        reader = UnicodeReader(self.training_set,
                               delimiter=self.training_separator,
                               lineterminator="\n")
        self.training_reader = reader.open_reader()
    except IOError:
        sys.exit("Error: cannot read training %s" % self.training_set)
def read_field_attributes(path):
    """Reads field attributes from a csv file to update source fields.

    A column number and a list of attributes separated by a comma per line.
    The expected structure is:
    column number, name, label, description

    For example:

    0,'first name','label for the first field','fist field full description'
    1,'last name','label for the last field','last field full description'

    :param path: (string) path to the attributes CSV file
    :return: dict mapping column number (int) to its attributes dict
    """
    field_attributes = {}
    try:
        with UnicodeReader(path, quotechar="'") as attributes_reader:
            for row in attributes_reader:
                attributes = {}
                if len(row) > 1:
                    # pair each extra column with its attribute name,
                    # ignoring columns beyond the known attribute list
                    for index in range(0, min(len(ATTRIBUTE_NAMES),
                                              len(row) - 1)):
                        attributes[ATTRIBUTE_NAMES[index]] = row[index + 1]
                field_attributes[int(row[0])] = attributes
        return field_attributes
    except IOError:
        sys.exit("Error: cannot read field attributes %s" % path)
def read_objective_weights(path):
    """Reads objective weights from a CSV file in a class, weight format.

    The expected structure is:
    class name, weight

    For example:

    Iris-setosa,5
    Iris-versicolor,10

    :param path: (string) path to the weights CSV file
    :return: list of [class name, weight] pairs with integer weights
    """
    objective_weights = []
    try:
        with UnicodeReader(path, quotechar="'") as weights_reader:
            for row in weights_reader:
                # each row must be exactly a (class, weight) pair
                if len(row) != 2:
                    sys.exit("Error: wrong objective field file syntax\n%s" %
                             ",".join(row))
                weights = row[:]
                try:
                    # the weight must be an integer
                    weights[1] = int(weights[1])
                except ValueError:
                    sys.exit("Error: wrong objective field file syntax\n%s" %
                             ",".join(row))
                objective_weights.append(weights)
        return objective_weights
    except IOError:
        sys.exit("Error: cannot read objective weights %s" % path)
def read_votes(votes_files, to_prediction, data_locale=None):
    """Reads the votes found in the votes' files.

    Returns a list of MultiVote objects containing the list of predictions.
    votes_files parameter should contain the path to the files where votes
    are stored.
    In to_prediction parameter we expect the method of a local model object
    that casts the string prediction values read from the file to their
    real type. For instance

        >>> local_model = Model(model)
        >>> prediction = local_model.to_prediction("1")
        >>> isinstance(prediction, int)
        True
        >>> read_votes(["my_predictions_file"], local_model.to_prediction)

    data_locale should contain the string identification for the locale
    used in numeric formatting.
    """
    votes = []
    # `order` records which model each vote row came from
    for order, votes_file in enumerate(votes_files):
        index = 0
        with UnicodeReader(votes_file) as rdr:
            for row in rdr:
                prediction = to_prediction(row[0], data_locale=data_locale)
                if index > (len(votes) - 1):
                    # first file seen: create one MultiVote per row
                    votes.append(MultiVote([]))
                distribution = None
                instances = None
                if len(row) > 2:
                    # optional distribution / instances columns
                    distribution = ast.literal_eval(row[2])
                    instances = int(row[3])
                try:
                    confidence = float(row[1])
                except ValueError:
                    # missing or malformed confidence defaults to zero
                    confidence = 0.0
                prediction_row = [prediction, confidence, order,
                                  distribution, instances]
                votes[index].append_row(prediction_row)
                index += 1
    return votes
def i_check_predictions(step, check_file):
    """Compares the generated predictions file with the expectations file."""
    check_path = res_filename(check_file)
    with UnicodeReader(world.output) as prediction_rows:
        with UnicodeReader(check_path) as test_rows:
            check_rows(prediction_rows, test_rows)
def new_fields_structure(self, csv_attributes_file=None,
                         attributes=None, out_file=None):
    """Builds the field structure needed to update a fields dictionary
       in a BigML resource.

       :param csv_attributes_file: (string) Path to a CSV file like the one
                                   generated by summary_csv.
       :param attributes: (list) list of rows containing the attributes
                          information ordered as in the summary_csv output.
       :param out_file: (string) Path to a JSON file that will be used
                        to store the new fields structure. If None, the
                        output is returned as a dict.
    """
    if csv_attributes_file is not None:
        # the CSV file contents take precedence over the attributes arg
        reader = UnicodeReader(csv_attributes_file).open_reader()
        attributes = [row for row in reader]
    new_fields_structure = {}
    if "field ID" in attributes[0] or "field column" in attributes[0]:
        # headers are used
        for index in range(1, len(attributes)):
            new_attributes = dict(zip(attributes[0], attributes[index]))
            if new_attributes.get("field ID"):
                field_id = new_attributes.get("field ID")
                if not field_id in self.fields.keys():
                    raise ValueError("Field ID %s not found"
                                     " in this resource" % field_id)
                del new_attributes["field ID"]
            else:
                # no field ID: locate the field by its column number
                field_column = int(new_attributes.get("field column"))
                if not field_column in self.field_columns:
                    raise ValueError("Field column %s not found"
                                     " in this resource" % field_column)
                field_id = self.field_id(field_column)
                del new_attributes["field column"]
            # keep only updatable attributes, renamed to their API keys.
            # NOTE(review): mutating the dict while iterating .items() is
            # safe in Python 2 only (items() returns a list) — confirm
            # before porting to Python 3
            for attribute, value in new_attributes.items():
                if not attribute in UPDATABLE_HEADERS.keys():
                    del new_attributes[attribute]
                else:
                    new_attributes[UPDATABLE_HEADERS[attribute]] = \
                        new_attributes[attribute]
                    if attribute != UPDATABLE_HEADERS[attribute]:
                        del new_attributes[attribute]
            if "preferred" in new_attributes:
                # "preferred" is stored in the CSV as a JSON boolean
                new_attributes['preferred'] = json.loads( \
                    new_attributes['preferred'])
            new_fields_structure[field_id] = new_attributes
    else:
        # assume the order given in the summary_csv method
        first_attribute = attributes[0][0]
        first_column_is_id = False
        try:
            field_id = self.field_id(int(first_attribute))
        except ValueError:
            # not a column number: treat the first column as field IDs
            field_id = first_attribute
            first_column_is_id = True
        if not field_id in self.fields:
            raise ValueError("The first column should contain either the"
                             " column or ID of the fields. Failed to find"
                             " %s as either of them." % field_id)
        # summary_csv columns 2-6 hold the updatable attributes
        headers = SUMMARY_HEADERS[2:7]
        headers = [UPDATABLE_HEADERS[header] for header in headers]
        try:
            for field_attributes in attributes:
                if field_attributes[6] is not None:
                    # column 6 is a JSON-encoded value in the CSV
                    field_attributes[6] = json.loads(field_attributes[6])
                field_id = field_attributes[0] if first_column_is_id else \
                    self.field_id(int(field_attributes[0]))
                new_fields_structure[field_id] = \
                    dict(zip(headers, field_attributes[1: 6]))
        except ValueError:
            raise ValueError("The first column should contain either the"
                             " column or ID of the fields. Failed to find"
                             " %s as either of them." % field_id)
    if out_file is None:
        return {"fields": new_fields_structure}
    else:
        try:
            with open(out_file, "w") as out:
                json.dump({"fields": new_fields_structure}, out)
        except IOError, exc:
            raise IOError("Failed writing the fields structure file in"
                          " %s- Please, check your arguments." % out_file)
class TrainReader(object):
    """Retrieves csv info and manages objective fields and multi-labels
    """
    def __init__(self, training_set, training_set_header, objective_field,
                 multi_label=False, labels=None, label_separator=None,
                 training_separator=None, multi_label_fields=None,
                 label_aggregates=None, objective=True):
        """Builds a generator from a csv file

           `training_set`: path to the training data file
           `training_set_header`: boolean, True means that headers are first
                                  row in the file
           `objective_field`: objective field column or field name
           `labels`: Fields object with the expected fields structure.
        """
        self.training_set = training_set
        if training_set.__class__.__name__ == "StringIO":
            self.encode = None
            # bug fix: the original recoded the undefined name `test_set`,
            # which raised a NameError here; the stream being read is
            # `training_set`
            self.training_set = UTF8Recoder(training_set, SYSTEM_ENCODING)
        else:
            self.encode = None if PYTHON3 else FILE_ENCODING
        self.training_set_header = training_set_header
        self.training_reader = None
        self.multi_label = multi_label
        self.objective = objective
        if label_aggregates is None:
            label_aggregates = []
        self.label_aggregates = label_aggregates
        self.training_separator = (decode2(training_separator,
                                           encoding="string_escape")
                                   if training_separator is not None
                                   else get_csv_delimiter())
        if len(self.training_separator) > 1:
            sys.exit("Only one character can be used as test data separator.")
        # opening csv reader
        self.reset()
        self.label_separator = (decode2(label_separator,
                                        encoding="string_escape")
                                if label_separator is not None
                                else get_csv_delimiter())
        first_row = self.get_next(reset=not training_set_header)
        self.row_length = len(first_row)
        if training_set_header:
            self.headers = first_row
        else:
            # synthesize positional header names when the file has none
            self.headers = [("field_%s" % index)
                            for index in range(0, self.row_length)]
        self.multi_label_fields = sorted(self._get_columns(multi_label_fields))
        if objective:
            self.objective_column = self._get_columns([objective_field])[0]
            # the objective is always handled as a multi-label field
            if not self.objective_column in self.multi_label_fields:
                self.multi_label_fields.append(self.objective_column)
        self.labels = labels
        self.fields_labels = self._get_labels()
        if objective:
            if labels is None:
                self.labels = self.fields_labels[self.objective_column]
            self.objective_name = self.headers[self.objective_column]

    def __iter__(self):
        """Iterator method
        """
        return self

    def get_label_headers(self):
        """Returns a list of headers with the new extended field names for
           each objective label
        """
        new_headers = self.get_headers()
        for field_column in self.multi_label_fields:
            labels = self.fields_labels[field_column]
            new_field_names = [get_label_field(self.headers[field_column],
                                               label) for label in labels]
            new_headers.extend(new_field_names)
            for aggregate in self.label_aggregates:
                new_headers.append(get_label_field(
                    self.headers[field_column], aggregate))
        if not PYTHON3:
            new_headers = [encode2(header) for header in new_headers]
        return new_headers

    def _get_columns(self, fields_list):
        """Receives a comma-separated list of fields given by name or
           column number and returns column number list
        """
        column_list = []
        if fields_list is None:
            return column_list
        if not isinstance(fields_list, list):
            fields_list = [fields_list]
        for field in fields_list:
            column = None
            if isinstance(field, int):
                column = field
            elif field is None:
                # default: the last column
                column = self.row_length - 1
            else:
                try:
                    column = self.headers.index(field)
                except ValueError:
                    if self.objective:
                        sys.exit("The %s has been set as multi-label field but"
                                 " it cannot be found in the headers row: \n"
                                 " %s" % (field,
                                          ", ".join(
                                              [encode2(header) for header
                                               in self.headers])))
                    else:
                        column = None
            if column is not None:
                column_list.append(column)
        return column_list

    def reset(self):
        """Starts a new csv reader object
        """
        try:
            self.training_set.close()
        except (IOError, AttributeError):
            pass
        try:
            self.training_reader = UnicodeReader(
                self.training_set, delimiter=self.training_separator,
                lineterminator="\n").open_reader()
        except IOError:
            sys.exit("Error: cannot read training %s" % self.training_set)

    def next(self):
        """Iterator method for next item
        """
        return self.get_next()

    def get_next(self, extended=False, reset=False):
        """Returns the next row. If extended is True, the row is extended
           with a list of booleans depending on whether the label is in the
           objective field value or not. If reset is True, the file is
           reopened and pointer starts at the beginning of the file.
        """
        row = self.training_reader.next()
        row = [value.strip() for value in row]
        if extended:
            if self.multi_label and self.fields_labels is None:
                self.fields_labels = self._get_labels()
            for field_column in self.multi_label_fields:
                aggregated_field_value = row[field_column]
                field_values = aggregated_field_value.split(
                    self.label_separator)
                field_values = [value.strip() for value in field_values]
                # one 0/1 flag per known label of this field
                labels_row = [int(label in field_values) for label in
                              self.fields_labels[field_column]]
                row.extend(labels_row)
                for aggregate in self.label_aggregates:
                    row.append(AGGREGATES[aggregate](field_values))
        if reset:
            self.reset()
        if not PYTHON3:
            row = [encode2(item) for item in row]
        return row

    def number_of_rows(self):
        """Returns the number of rows in the test file
        """
        rows = file_number_of_lines(self.training_set)
        if self.training_set_header:
            rows -= 1
        return rows

    def has_headers(self):
        """Returns whether the training set file has a headers row
        """
        return self.training_set_header

    def _get_labels(self):
        """Returns the list of labels in the multi-label fields
        """
        labels = {}
        for field_column in self.multi_label_fields:
            labels[field_column] = []
        for row in self:
            for field_column in self.multi_label_fields:
                labels = self._get_field_labels(row, labels, field_column,
                                                self.label_separator)
        return labels

    def _get_field_labels(self, row, labels, field_column, separator):
        """Returns the list of labels in a multi-label field
        """
        field_value = row[field_column]
        if self.multi_label:
            new_labels = field_value.split(separator)
            new_labels = [decode2(label).strip() for label in new_labels]
            # bug fix: the original deleted elements while iterating over
            # range(len(new_labels)), which could raise IndexError or skip
            # consecutive empty labels; filter them out instead
            new_labels = [label for label in new_labels if label != '']
            if new_labels != []:
                if (self.objective and
                        field_column == self.objective_column and
                        self.labels is not None):
                    # If user gave the subset of labels, use only those
                    new_labels = [label for label in self.labels
                                  if label in new_labels]
                labels[field_column].extend(new_labels)
        else:
            labels[field_column].append(field_value)
        labels[field_column] = sorted(list(set(labels[field_column])))
        return labels

    def get_headers(self, objective_field=True):
        """Returns headers. If objective_field is False, the objective field
           header is removed.
        """
        if objective_field:
            return self.headers[:]
        new_headers = self.headers[:]
        if self.objective:
            del new_headers[self.objective_column]
        return new_headers

    def new_fields_info(self):
        """Dict of 2-item lists 'field_column': [label, label_column]
           describing the per label extension
        """
        info = {}
        column = len(self.headers)
        for field_column in self.multi_label_fields:
            alpha_field_column = str(field_column)
            info[alpha_field_column] = []
            labels = self.fields_labels[field_column]
            for label in labels:
                info[alpha_field_column].append([label, column])
                column += 1
            # skip the aggregate values columns
            column += len(self.label_aggregates)
        return info

    def get_multi_label_data(self):
        """Returns a dict to store the multi-label info that defines this
           source
        """
        if self.objective:
            return {
                "multi_label_fields": [
                    [column, self.headers[column]]
                    for column in self.multi_label_fields],
                "generated_fields": self.new_fields_info(),
                "objective_name": self.objective_name,
                "objective_column": self.objective_column}

    def close(self):
        """Closing file handler
        """
        self.training_reader.close_reader()
class TstReader(object):
    """Retrieves csv info and builds a input data dict
    """
    def __init__(self, test_set, test_set_header, fields, objective_field,
                 test_separator=None):
        """Builds a generator from a csv file and the fields' model structure

           `test_set`: path to the test data file
           `test_set_header`: boolean, True means that headers are first row
                              in the file
           `fields`: Fields object with the expected fields structure.
           `objective_field`: field_id of the objective field
        """
        self.test_set = test_set
        if test_set.__class__.__name__ == "StringIO":
            # in-memory sources are recoded to the system encoding
            self.encode = None
            self.test_set = UTF8Recoder(test_set, SYSTEM_ENCODING)
        else:
            self.encode = None if PYTHON3 else FILE_ENCODING
        self.test_set_header = test_set_header
        self.fields = fields
        if (objective_field is not None and
                not objective_field in fields.fields):
            # the objective may be given by name/column: normalize to its id
            try:
                objective_field = fields.field_id(objective_field)
            except ValueError, exc:
                sys.exit(exc)
        self.objective_field = objective_field
        if test_separator and not PYTHON3:
            test_separator = decode2(test_separator,
                                     encoding="string_escape")
        self.test_separator = (test_separator
                               if test_separator is not None
                               else get_csv_delimiter())
        if len(self.test_separator) > 1:
            sys.exit("Only one character can be used as test data separator.")
        try:
            self.test_reader = UnicodeReader(
                self.test_set, delimiter=self.test_separator,
                lineterminator="\n").open_reader()
        except IOError:
            sys.exit("Error: cannot read test %s" % test_set)
        self.headers = None
        self.raw_headers = None
        # column indexes of the test file that have no model counterpart
        self.exclude = []
        if test_set_header:
            self.headers = self.test_reader.next()
            # validate headers against model fields excluding objective_field,
            # that may be present or not
            if objective_field is not None:
                objective_field = fields.field_column_number(objective_field)
            try:
                fields_names = [
                    fields.fields[fields.field_id(i)]['name']
                    for i in sorted(fields.fields_by_column_number.keys())
                    if objective_field is None or i != objective_field]
            except ValueError, exc:
                sys.exit(exc)
            self.raw_headers = self.headers[:]
            self.exclude = [i for i in range(len(self.headers))
                            if not self.headers[i] in fields_names]
            # reversed so deleting by index does not shift later indexes
            self.exclude.reverse()
            if self.exclude:
                if len(self.headers) > len(self.exclude):
                    for index in self.exclude:
                        del self.headers[index]
                else:
                    # no header matched: likely a headerless file
                    raise Exception((u"No test field matches the model fields."
                                     u"\nThe expected fields are:\n\n%s\n\n"
                                     u"while "
                                     u"the headers found in the test file are:"
                                     u"\n\n%s\n\n"
                                     u"Use --no-test-header flag if first li"
                                     u"ne should not be interpreted as"
                                     u" headers." %
                                     (",".join(fields_names),
                                      ",".join(
                                          self.headers))).encode("utf-8"))
class TrainReader(object):
    """Retrieves csv info and manages objective fields and multi-labels
    """
    def __init__(self, training_set, training_set_header, objective_field,
                 multi_label=False, labels=None, label_separator=None,
                 training_separator=None, multi_label_fields=None,
                 label_aggregates=None, objective=True):
        """Builds a generator from a csv file

           `training_set`: path to the training data file
           `training_set_header`: boolean, True means that headers are first
                                  row in the file
           `objective_field`: objective field column or field name
           `labels`: Fields object with the expected fields structure.
        """
        self.training_set = training_set
        if training_set.__class__.__name__ == "StringIO":
            # in-memory sources are recoded to the system encoding
            self.encode = None
            self.training_set = UTF8Recoder(training_set, SYSTEM_ENCODING)
        else:
            self.encode = None if PYTHON3 else FILE_ENCODING
        self.training_set_header = training_set_header
        self.training_reader = None
        self.multi_label = multi_label
        self.objective = objective
        if label_aggregates is None:
            label_aggregates = []
        self.label_aggregates = label_aggregates
        self.training_separator = (decode2(
            training_separator, encoding="string_escape")
                                   if training_separator is not None
                                   else get_csv_delimiter())
        if len(self.training_separator) > 1:
            sys.exit("Only one character can be used as test data separator.")
        # opening csv reader
        self.reset()
        self.label_separator = (decode2(
            label_separator, encoding="string_escape")
                                if label_separator is not None
                                else get_csv_delimiter())
        first_row = self.get_next(reset=not training_set_header)
        self.row_length = len(first_row)
        if training_set_header:
            self.headers = first_row
        else:
            # synthesize positional header names when the file has none
            self.headers = [("field_%s" % index)
                            for index in range(0, self.row_length)]
        self.multi_label_fields = sorted(self._get_columns(multi_label_fields))
        if objective:
            self.objective_column = self._get_columns([objective_field])[0]
            # the objective is always handled as a multi-label field
            if not self.objective_column in self.multi_label_fields:
                self.multi_label_fields.append(self.objective_column)
        self.labels = labels
        self.fields_labels = self._get_labels()
        if objective:
            if labels is None:
                self.labels = self.fields_labels[self.objective_column]
            self.objective_name = self.headers[self.objective_column]

    def __iter__(self):
        """Iterator method
        """
        return self

    def get_label_headers(self):
        """Returns a list of headers with the new extended field names for
           each objective label
        """
        new_headers = self.get_headers()
        for field_column in self.multi_label_fields:
            labels = self.fields_labels[field_column]
            new_field_names = [
                get_label_field(self.headers[field_column], label)
                for label in labels]
            new_headers.extend(new_field_names)
            for aggregate in self.label_aggregates:
                new_headers.append(
                    get_label_field(self.headers[field_column], aggregate))
        if not PYTHON3:
            new_headers = [encode2(header) for header in new_headers]
        return new_headers

    def _get_columns(self, fields_list):
        """Receives a comma-separated list of fields given by name or
           column number and returns column number list
        """
        column_list = []
        if fields_list is None:
            return column_list
        if not isinstance(fields_list, list):
            fields_list = [fields_list]
        for field in fields_list:
            column = None
            if isinstance(field, int):
                column = field
            elif field is None:
                # default: the last column
                column = self.row_length - 1
            else:
                try:
                    column = self.headers.index(field)
                except ValueError:
                    if self.objective:
                        sys.exit(
                            "The %s has been set as multi-label field but"
                            " it cannot be found in the headers row: \n"
                            " %s" % (field,
                                     ", ".join(
                                         [encode2(header)
                                          for header in self.headers])))
                    else:
                        column = None
            if column is not None:
                column_list.append(column)
        return column_list

    def reset(self):
        """Starts a new csv reader object
        """
        try:
            self.training_set.close()
        except (IOError, AttributeError):
            pass
        try:
            self.training_reader = UnicodeReader(
                self.training_set, delimiter=self.training_separator,
                lineterminator="\n").open_reader()
        except IOError:
            sys.exit("Error: cannot read training %s" % self.training_set)

    def next(self):
        """Iterator method for next item
        """
        return self.get_next()

    def get_next(self, extended=False, reset=False):
        """Returns the next row. If extended is True, the row is extended
           with a list of booleans depending on whether the label is in the
           objective field value or not. If reset is True, the file is
           reopened and pointer starts at the beginning of the file.
        """
        row = self.training_reader.next()
        row = [value.strip() for value in row]
        if extended:
            if self.multi_label and self.fields_labels is None:
                self.fields_labels = self._get_labels()
            for field_column in self.multi_label_fields:
                aggregated_field_value = row[field_column]
                field_values = aggregated_field_value.split(
                    self.label_separator)
                field_values = [value.strip() for value in field_values]
                # one 0/1 flag per known label of this field
                labels_row = [
                    int(label in field_values)
                    for label in self.fields_labels[field_column]]
                row.extend(labels_row)
                for aggregate in self.label_aggregates:
                    row.append(AGGREGATES[aggregate](field_values))
        if reset:
            self.reset()
        if not PYTHON3:
            row = [encode2(item) for item in row]
        return row

    def number_of_rows(self):
        """Returns the number of rows in the test file
        """
        rows = file_number_of_lines(self.training_set)
        if self.training_set_header:
            rows -= 1
        return rows

    def has_headers(self):
        """Returns whether the training set file has a headers row
        """
        return self.training_set_header

    def _get_labels(self):
        """Returns the list of labels in the multi-label fields
        """
        labels = {}
        for field_column in self.multi_label_fields:
            labels[field_column] = []
        for row in self:
            for field_column in self.multi_label_fields:
                labels = self._get_field_labels(row, labels, field_column,
                                                self.label_separator)
        return labels

    def _get_field_labels(self, row, labels, field_column, separator):
        """Returns the list of labels in a multi-label field
        """
        field_value = row[field_column]
        if self.multi_label:
            new_labels = field_value.split(separator)
            new_labels = [decode2(label).strip() for label in new_labels]
            # TODO: clean user given missing tokens
            # NOTE(review): deleting while iterating over range(len(...))
            # can raise IndexError or skip consecutive empty labels — a
            # filtering comprehension would be safer; confirm and fix
            for label_index in range(0, len(new_labels)):
                if new_labels[label_index] == '':
                    del new_labels[label_index]
            if new_labels != []:
                if (self.objective and
                        field_column == self.objective_column and
                        self.labels is not None):
                    # If user gave the subset of labels, use only those
                    new_labels = [
                        label for label in self.labels
                        if label in new_labels]
                labels[field_column].extend(new_labels)
        else:
            labels[field_column].append(field_value)
        labels[field_column] = sorted(list(set(labels[field_column])))
        return labels

    def get_headers(self, objective_field=True):
        """Returns headers. If objective_field is False, the objective field
           header is removed.
        """
        if objective_field:
            return self.headers[:]
        new_headers = self.headers[:]
        if self.objective:
            del new_headers[self.objective_column]
        return new_headers

    def new_fields_info(self):
        """Dict of 2-item lists 'field_column': [label, label_column]
           describing the per label extension
        """
        info = {}
        column = len(self.headers)
        for field_column in self.multi_label_fields:
            alpha_field_column = str(field_column)
            info[alpha_field_column] = []
            labels = self.fields_labels[field_column]
            for label in labels:
                info[alpha_field_column].append([label, column])
                column += 1
            # skip the aggregate values columns
            column += len(self.label_aggregates)
        return info

    def get_multi_label_data(self):
        """Returns a dict to store the multi-label info that defines this
           source
        """
        if self.objective:
            return {
                "multi_label_fields": [
                    [column, self.headers[column]]
                    for column in self.multi_label_fields],
                "generated_fields": self.new_fields_info(),
                "objective_name": self.objective_name,
                "objective_column": self.objective_column}

    def close(self):
        """Closing file handler
        """
        self.training_reader.close_reader()