def get_label_headers(self):
    """Builds the header list extended with the per-label field names.

    Starts from the list returned by `self.get_headers()`. For every
    column in `self.multi_label_fields` it appends the names produced by
    `get_label_field` for each label in `self.fields_labels[column]`,
    followed by the names produced for each entry of
    `self.label_aggregates`. When `PYTHON3` is false, every header is
    passed through `encode2` before being returned.
    """
    extended_headers = self.get_headers()
    for column in self.multi_label_fields:
        # one new field per label of this multi-label column
        for label in self.fields_labels[column]:
            extended_headers.append(
                get_label_field(self.headers[column], label))
        # plus one new field per aggregate of the same column
        for aggregate in self.label_aggregates:
            extended_headers.append(
                get_label_field(self.headers[column], aggregate))
    if PYTHON3:
        return extended_headers
    return [encode2(header) for header in extended_headers]
def get_label_headers(self):
    """Builds the UTF-8 encoded header list extended with label fields.

    Starts from the list returned by `self.get_headers()`. For every
    column in `self.multi_label_fields` it adds the names produced by
    `get_label_field` for each label in `self.fields_labels[column]` and
    then for each entry of `self.label_aggregates`. Every resulting
    header is encoded with `encode("utf-8")` before being returned.
    """
    headers = self.get_headers()
    for column in self.multi_label_fields:
        # new field names for the labels found in this column
        headers.extend([
            get_label_field(self.headers[column], label)
            for label in self.fields_labels[column]])
        # new field names for the aggregates of the same column
        headers.extend([
            get_label_field(self.headers[column], aggregate)
            for aggregate in self.label_aggregates])
    return [header.encode("utf-8") for header in headers]
def multi_label_expansion(training_set, training_set_header, objective_field,
                          args, output_path, field_attributes=None,
                          labels=None, session_file=None):
    """Writes an extended copy of the training set, one column per label.

    A `TrainReader` is built in multi-label mode on `training_set` and its
    rows are rewritten, using the reader's `extended=True` row format, to
    `<output_path><os.sep>extended_<file name>`. The header row holds the
    reader's non-objective headers, one generated field name per label
    and finally the objective field name, all UTF-8 encoded.

    `args` must provide `label_separator`, `training_separator` and
    `verbosity`. When `field_attributes` is given it is updated in place
    with a "label" attribute for every label column; otherwise a new dict
    is created.

    Returns the tuple (output_file, labels, field_attributes,
    objective_name), where labels and objective_name are read from the
    reader.
    """
    # The reader locates the objective column and holds the labels to use
    # (the ones given in `labels` or, presumably, the ones found in the data
    # -- see TrainReader).
    training_reader = TrainReader(
        training_set, training_set_header, objective_field,
        multi_label=True, labels=labels,
        label_separator=args.label_separator,
        training_separator=args.training_separator)

    # Header row: original headers without the objective, then one new
    # field per label, then the objective field itself.
    header_row = training_reader.get_headers(objective_field=False)
    header_row.extend([
        l.get_label_field(training_reader.objective_name, label)
        for label in training_reader.labels])
    header_row.append(training_reader.objective_name)
    header_row = [name.encode("utf-8") for name in header_row]

    # Fall back to a fixed name when basename() raises AttributeError
    # (presumably when training_set is not a path string -- TODO confirm).
    try:
        file_name = os.path.basename(training_set)
    except AttributeError:
        file_name = "training_set.csv"
    output_file = "%(directory)s%(separator)sextended_%(name)s" % {
        "directory": output_path, "separator": os.sep, "name": file_name}

    u.log_message(u.dated("Transforming to extended source.\n"),
                  log_file=session_file, console=args.verbosity)

    # Third positional argument of open() is the buffering mode (0 means
    # unbuffered). NOTE(review): text mode with buffering=0 is only accepted
    # by Python 2's open().
    with open(output_file, "w", 0) as output_handler:
        writer = csv.writer(output_handler, lineterminator="\n")
        writer.writerow(header_row)
        # Start over from the beginning and skip the header row, if any,
        # before copying every row in its extended form.
        training_reader.reset()
        if training_set_header:
            training_reader.next()
        try:
            while True:
                writer.writerow(training_reader.next(extended=True))
        except StopIteration:
            # the reader is exhausted: every row has been written
            pass

    # NOTE(review): this rebinding is not read again in this function.
    objective_field = training_reader.headers[training_reader.objective_column]
    if field_attributes is None:
        field_attributes = {}
    # Mark every generated label column with a recognizable "label"
    # attribute built from the multi-label prefix and the label itself.
    for label_column, label in training_reader.labels_columns():
        field_attributes[label_column] = {
            "label": "%s%s" % (l.MULTI_LABEL_LABEL, label)}
    return (output_file, training_reader.labels, field_attributes,
            training_reader.objective_name)