# Standard and third-party imports needed by the functions below. The helpers
# list_of_lists, get_spans, intersect_spans, val_to_str, write_lofl and
# generate_crf_template are defined elsewhere in this package.
import errno
import logging
import os
import sys

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

logger = logging.getLogger(__name__)


def binarize(features, binarizers):
    '''
    Binarize categorical feature columns.

    :param features: list of feature lists (one inner list per instance)
    :param binarizers: dict mapping a column index to a fitted LabelBinarizer
        or MultiLabelBinarizer; columns without a binarizer are passed through
    :return: numpy array with all (binarized) columns stacked horizontally
    '''
    assert list_of_lists(features)
    num_features = len(features[0])
    assert binarizers == {} or max(binarizers.keys()) < num_features
    binarized_cols = []
    for i in range(num_features):
        # get this column
        cur_values = [f[i] for f in features]
        # if there's a binarizer for this column
        if i in binarizers:
            binarizer = binarizers[i]
            if type(binarizer) == LabelBinarizer:
                try:
                    binarized_cols.append(binarizer.transform(cur_values))
                except:
                    # skip the column if the binarizer cannot transform it
                    pass
            elif type(binarizer) == MultiLabelBinarizer:
                assert list_of_lists(cur_values)
                # MultiLabelBinarizer doesn't support unknown values --
                # they are replaced with the last known class as a default
                cur_values_default = []
                default_value = binarizer.classes_[-1]
                for a_list in cur_values:
                    new_list = list(a_list)
                    for j, val in enumerate(new_list):
                        if val not in binarizer.classes_:
                            new_list[j] = default_value
                    cur_values_default.append(tuple(new_list))
                transformed = binarizer.transform(cur_values_default)
                binarized_cols.append(transformed)
            else:
                raise NotImplementedError(
                    'this function is not implemented for type: {}'.format(
                        type(binarizer)))
        else:
            # no binarizer -- keep the column as a single numeric/string column
            try:
                binarized_cols.append(
                    np.array(cur_values).reshape(len(cur_values), 1))
            except:
                print(cur_values)
                sys.exit()
    assert len(binarized_cols) == num_features, \
        'the number of columns after binarization must match the number of features'
    new_features = np.hstack(binarized_cols)
    return new_features
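# Usage sketch for binarize() (not from the original module; the feature rows
# and the column index below are made up for illustration). Column 1 holds
# categorical values and gets a fitted LabelBinarizer; column 0 is numeric and
# is passed through unchanged, so the result has 1 + 3 columns per row.
def _example_binarize_usage():
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(['ADJ', 'NOUN', 'VERB'])
    rows = [[0.5, 'NOUN'], [1.0, 'VERB'], [0.2, 'ADJ']]
    return binarize(rows, {1: label_binarizer})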
def sequence_correlation(y_true, y_pred, good_label=1, bad_label=0, out='sequence_corr.out', verbose=False):
    '''
    Sentence-level correlation between reference and predicted tag sequences,
    based on the overlap of good-label and bad-label spans.
    '''
    assert len(y_true) == len(y_pred)
    if not list_of_lists(y_true) and not list_of_lists(y_pred):
        logger.warning("You provided the labels in a flat list of length {}. "
                       "Assuming them to be one sequence".format(len(y_true)))
        y_true = [y_true]
        y_pred = [y_pred]
    elif list_of_lists(y_true) and list_of_lists(y_pred):
        pass
    else:
        logger.error("Shapes of the hypothesis and the reference don't match")
        return 0
    sentence_pred = []
    if verbose:
        out_file = open(out, 'w')
    for true_sent, pred_sent in zip(y_true, y_pred):
        assert len(true_sent) == len(pred_sent)
        true_spans_1, true_spans_0 = get_spans(true_sent, good_label=good_label, bad_label=bad_label)
        pred_spans_1, pred_spans_0 = get_spans(pred_sent, good_label=good_label, bad_label=bad_label)
        res_1 = intersect_spans(true_spans_1, pred_spans_1)
        res_0 = intersect_spans(true_spans_0, pred_spans_0)
        corr_val = (res_1 + res_0) / float(len(true_sent))
        if verbose:
            out_file.write("Reference: %s\nPrediction: %s\nCorrelation: %s\n" %
                           (' '.join([str(t) for t in true_sent]),
                            ' '.join([str(t) for t in pred_sent]),
                            str(corr_val)))
        sentence_pred.append(corr_val)
    if verbose:
        out_file.close()
    return sentence_pred, np.average(sentence_pred)
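# Usage sketch for sequence_correlation() (hypothetical label sequences; relies
# on the get_spans/intersect_spans helpers from this package). Returns the
# per-sentence scores and their average.
def _example_sequence_correlation():
    reference = [[1, 1, 0, 0], [1, 0, 1]]
    prediction = [[1, 1, 0, 1], [1, 0, 1]]
    per_sentence, average = sequence_correlation(reference, prediction,
                                                 good_label=1, bad_label=0)
    return per_sentence, average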
def sequence_correlation_weighted(y_true, y_pred, good_label=1, bad_label=0, out='sequence_corr.out', verbose=False):
    '''
    Weighted variant of sequence_correlation: good and bad spans are weighted
    so that both classes contribute equally, and the score is scaled down when
    the numbers of predicted and reference spans differ.
    '''
    assert len(y_true) == len(y_pred)
    if not list_of_lists(y_true) and not list_of_lists(y_pred):
        logger.warning("You provided the labels in a flat list of length {}. "
                       "Assuming them to be one sequence".format(len(y_true)))
        y_true = [y_true]
        y_pred = [y_pred]
    elif list_of_lists(y_true) and list_of_lists(y_pred):
        pass
    else:
        logger.error("Shapes of the hypothesis and the reference don't match")
        return 0
    sentence_pred = []
    if verbose:
        out_file = open(out, 'w')
    for true_sent, pred_sent in zip(y_true, y_pred):
        ref_bad = sum([1 for l in true_sent if l == bad_label])
        ref_good = sum([1 for l in true_sent if l == good_label])
        assert ref_bad + ref_good == len(true_sent)
        # coefficients that ensure the equal influence of good and bad classes on the overall score
        try:
            coeff_bad = len(true_sent) / (2.0 * ref_bad)
        except ZeroDivisionError:
            coeff_bad = 0.0
        try:
            coeff_good = len(true_sent) / (2.0 * ref_good)
        except ZeroDivisionError:
            coeff_good = 0.0
        assert len(true_sent) == len(pred_sent)
        true_spans_1, true_spans_0 = get_spans(true_sent, good_label=good_label, bad_label=bad_label)
        pred_spans_1, pred_spans_0 = get_spans(pred_sent, good_label=good_label, bad_label=bad_label)
        res_1 = intersect_spans(true_spans_1, pred_spans_1)
        res_0 = intersect_spans(true_spans_0, pred_spans_0)
        len_t_1, len_t_0 = len(true_spans_1), len(true_spans_0)
        len_p_1, len_p_0 = len(pred_spans_1), len(pred_spans_0)
        # penalise a mismatch between the numbers of predicted and reference spans
        if len_t_1 + len_t_0 > len_p_1 + len_p_0:
            spans_ratio = (len_p_1 + len_p_0) / float(len_t_1 + len_t_0)
        else:
            spans_ratio = (len_t_1 + len_t_0) / float(len_p_1 + len_p_0)
        corr_val = (res_1 * coeff_good + res_0 * coeff_bad) * spans_ratio / float(len(true_sent))
        if verbose:
            out_file.write("Reference: %s\nPrediction: %s\nCorrelation: %s\n" %
                           (' '.join([str(t) for t in true_sent]),
                            ' '.join([str(t) for t in pred_sent]),
                            str(corr_val)))
        sentence_pred.append(corr_val)
    if verbose:
        out_file.close()
    return sentence_pred, np.average(sentence_pred)
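# Worked illustration of the class weighting used above (hypothetical counts):
# for a 10-token sentence with 8 GOOD and 2 BAD reference labels the two
# coefficients balance the classes, so each class can contribute at most half
# of the unpenalised score.
def _example_weighted_coefficients():
    sent_len, ref_good, ref_bad = 10, 8, 2
    coeff_good = sent_len / (2.0 * ref_good)  # 0.625
    coeff_bad = sent_len / (2.0 * ref_bad)    # 2.5
    assert coeff_good * ref_good == coeff_bad * ref_bad == sent_len / 2.0
    return coeff_good, coeff_bad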
# Earlier variant of sequence_correlation without the verbose output file.
def sequence_correlation(y_true, y_pred, good_label=1, bad_label=0):
    assert len(y_true) == len(y_pred)
    if not list_of_lists(y_true) and not list_of_lists(y_pred):
        logger.warning("You provided the labels in a flat list of length {}. "
                       "Assuming them to be one sequence".format(len(y_true)))
        y_true = [y_true]
        y_pred = [y_pred]
    elif list_of_lists(y_true) and list_of_lists(y_pred):
        pass
    else:
        logger.error("Shapes of the hypothesis and the reference don't match")
        return 0
    sentence_pred = []
    for true_sent, pred_sent in zip(y_true, y_pred):
        assert len(true_sent) == len(pred_sent)
        true_spans_1, true_spans_0 = get_spans(true_sent, good_label=good_label, bad_label=bad_label)
        pred_spans_1, pred_spans_0 = get_spans(pred_sent, good_label=good_label, bad_label=bad_label)
        res_1 = intersect_spans(true_spans_1, pred_spans_1)
        res_0 = intersect_spans(true_spans_0, pred_spans_0)
        sentence_pred.append((res_1 + res_0) / float(len(true_sent)))
    return sentence_pred, np.average(sentence_pred)
def flatten(lofl):
    '''Flatten a list of lists into a single flat list; for a dict, return its values.'''
    if list_of_lists(lofl):
        return [item for sublist in lofl for item in sublist]
    elif type(lofl) == dict:
        return lofl.values()
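# Usage sketch for flatten() (made-up input): a list of label sequences is
# collapsed into one flat list, e.g. for computing corpus-level statistics.
def _example_flatten():
    nested = [[1, 0, 1], [0, 0], [1]]
    return flatten(nested)  # [1, 0, 1, 0, 0, 1]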
def persist_features(dataset_name, features, persist_dir, tags=None, feature_names=None, phrase_lengths=None, file_format='crf_suite'):
    '''
    persist the features to persist_dir -- use dataset_name as the prefix for the persisted files

    :param dataset_name: prefix of the output file
    :param features: dataset
    :param persist_dir: directory of output file(s)
    :param tags: tags for the dataset
    :param feature_names: names of features in the dataset
    :param phrase_lengths: phrase lengths for the dataset (written to a separate file if given)
    :param file_format: format of the output file for sequences. Values -- 'crf++', 'crf_suite', 'svm_light'
    :return: path of the main output file
    '''
    try:
        os.makedirs(persist_dir)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(persist_dir):
            pass
        else:
            raise
    if file_format == 'crf_suite' and feature_names is None:
        print("Feature names are required to save features in CRFSuite and SVMLight formats")
        return
    # for the 'plain' datatype
    if type(features) == np.ndarray and features.shape[1] == len(feature_names):
        output_df = pd.DataFrame(data=features, columns=feature_names)
        output_path = os.path.join(persist_dir, dataset_name + '.csv')
        output_df.to_csv(output_path, index=False)
        logger.info('saved features in: {} to file: {}'.format(dataset_name, output_path))
    # for the 'sequential' datatype
    elif list_of_lists(features):
        if file_format == 'svm_light':
            feature_names = range(1, len(features[0]) + 1)
            output_path = os.path.join(persist_dir, dataset_name + '.svm')
            output = open(output_path, 'w')
            tags_map = {'OK': '+1', 'BAD': '-1'}
            for a_tag, feat_seq in zip(tags, features):
                feat_list = []
                for f_name, f_val in zip(feature_names, feat_seq):
                    try:
                        # skip zero-valued numeric features
                        if float(f_val) != 0.0:
                            feat_list.append(str(f_name) + ':' + val_to_str(f_val))
                    except ValueError:
                        feat_list.append(str(f_name) + ':' + val_to_str(f_val))
                output.write("%s %s\n" % (tags_map[a_tag], ' '.join(feat_list)))
            return
        output_path = os.path.join(persist_dir, dataset_name + '.crf')
        output = open(output_path, 'w')
        if tags is not None:
            assert len(features) == len(tags), "Different numbers of tag and feature sequences"
            for s_idx, (seq, tag_seq) in enumerate(zip(features, tags)):
                assert len(seq) == len(tag_seq), \
                    "Lengths of tag and feature sequences don't match in sequence {}: {} and {} ({} and {})".format(
                        s_idx, len(seq), len(tag_seq), seq, tag_seq)
                for w_idx, (feature_list, tag) in enumerate(zip(seq, tag_seq)):
                    if len(feature_list) != len(feature_names):
                        print(feature_list)
                        print(feature_names)
                        sys.exit()
                    tag = str(tag)
                    feature_str = []
                    for f in feature_list:
                        # Python 2 string handling: encode unicode, keep everything else as-is
                        if type(f) == unicode:
                            feature_str.append(f.encode('utf-8'))
                        else:
                            feature_str.append(f)
                    if file_format == 'crf++':
                        feature_str = '\t'.join([str(f) for f in feature_str])
                        output.write('%s\t%s\n' % (feature_str, tag))
                    elif file_format == 'crf_suite':
                        feature_str_all = []
                        for i in range(len(feature_str)):
                            feature_str_all.append(feature_names[i] + '=' + str(feature_str[i]))
                        feature_str = '\t'.join(feature_str_all)
                        output.write("%s\t%s\n" % (tag, feature_str))
                    else:
                        print("Unknown data format:", file_format)
                        return False
                output.write("\n")
        else:
            for s_idx, seq in enumerate(features):
                for w_idx, feature_list in enumerate(seq):
                    feature_str = []
                    for f in feature_list:
                        if type(f) == unicode:
                            feature_str.append(f.encode('utf-8'))
                        else:
                            feature_str.append(f)
                    if file_format == 'crf++':
                        feature_str = '\t'.join([str(f) for f in feature_str])
                    elif file_format == 'crf_suite':
                        feature_str_all = []
                        for i in range(len(feature_str)):
                            feature_str_all.append(feature_names[i] + '=' + str(feature_str[i]))
                        feature_str = '\t'.join(feature_str_all)
                    else:
                        print("Unknown data format:", file_format)
                        return False
                    output.write("%s\n" % feature_str)
                output.write("\n")
        if feature_names is not None:
            output_features = open(os.path.join(persist_dir, dataset_name + '.features'), 'w')
            for f_name in feature_names:
                output_features.write("%s\n" % f_name.encode('utf-8'))
            output_features.close()
        output.close()
    # write phrase lengths
    if phrase_lengths is not None:
        write_lofl(phrase_lengths, os.path.join(persist_dir, dataset_name + '.phrase-lengths'))
    # generate CRF++ template
    if file_format == 'crf++':
        feature_num = len(features[0][0])
        generate_crf_template(feature_num, tmp_dir=persist_dir)
    return output_path
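# Usage sketch for persist_features() (all names and values are hypothetical;
# follows the module's Python 2 string handling). Two sequences of two tokens
# each, with two features per token, are written in CRFSuite format.
def _example_persist_features():
    features = [[['dog', 3], ['runs', 4]],
                [['cat', 3], ['sleeps', 6]]]
    tags = [['OK', 'BAD'], ['OK', 'OK']]
    feature_names = ['token', 'token_length']
    return persist_features('toy_dataset', features, 'toy_features_dir',
                            tags=tags, feature_names=feature_names,
                            file_format='crf_suite')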
# Earlier variant of persist_features supporting only the 'crf++' and
# 'crf_suite' output formats.
def persist_features(dataset_name, features, persist_dir, tags=None, feature_names=None, file_format='crf++'):
    '''
    persist the features to persist_dir -- use dataset_name as the prefix for the persisted files

    :param dataset_name: prefix of the output file
    :param features: dataset
    :param persist_dir: directory of output file(s)
    :param tags: tags for the dataset
    :param feature_names: names of features in the dataset
    :param file_format: format of the output file for sequences. Values -- 'crf++' or 'crf_suite'
    :return:
    '''
    try:
        os.makedirs(persist_dir)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(persist_dir):
            pass
        else:
            raise
    # for the 'plain' datatype
    if type(features) == np.ndarray and features.shape[1] == len(feature_names):
        output_df = pd.DataFrame(data=features, columns=feature_names)
        output_path = os.path.join(persist_dir, dataset_name + '.csv')
        output_df.to_csv(output_path, index=False)
        logger.info('saved features in: {} to file: {}'.format(dataset_name, output_path))
    # for the 'sequential' datatype
    elif list_of_lists(features):
        output_path = os.path.join(persist_dir, dataset_name + '.crf')
        output = open(output_path, 'w')
        if tags is not None:
            assert len(features) == len(tags), "Different numbers of tag and feature sequences"
            for s_idx, (seq, tag_seq) in enumerate(zip(features, tags)):
                assert len(seq) == len(tag_seq), "Lengths of tag and feature sequences don't match in sequence %d" % s_idx
                for w_idx, (feature_list, tag) in enumerate(zip(seq, tag_seq)):
                    assert len(feature_list) == len(feature_names), "Wrong number of features in sequence %d, word %d" % (s_idx, w_idx)
                    tag = str(tag)
                    feature_str = []
                    for f in feature_list:
                        if type(f) == unicode:
                            feature_str.append(f.encode('utf-8'))
                        else:
                            feature_str.append(str(f))
                    if file_format == 'crf++':
                        feature_str = '\t'.join(feature_str)
                        output.write('%s\t%s\n' % (feature_str, tag))
                    elif file_format == 'crf_suite':
                        feature_str = [feature_names[i] + '=' + feature_str[i] for i in range(len(feature_str))]
                        feature_str = '\t'.join(feature_str)
                        output.write("%s\t%s\n" % (tag, feature_str))
                    else:
                        print("Unknown data format:", file_format)
                        return False
                output.write("\n")
        else:
            for s_idx, seq in enumerate(features):
                for w_idx, feature_list in enumerate(seq):
                    assert len(feature_list) == len(feature_names), "Wrong number of features in sequence %d, word %d" % (s_idx, w_idx)
                    feature_str = []
                    for f in feature_list:
                        if type(f) == unicode:
                            feature_str.append(f.encode('utf-8'))
                        else:
                            feature_str.append(str(f))
                    if file_format == 'crf++':
                        feature_str = '\t'.join(feature_str)
                    elif file_format == 'crf_suite':
                        feature_str = [feature_names[i] + '=' + feature_str[i] for i in range(len(feature_str))]
                        feature_str = '\t'.join(feature_str)
                    else:
                        print("Unknown data format:", file_format)
                        return False
                    output.write("%s\n" % feature_str)
                output.write("\n")
        output_features = open(os.path.join(persist_dir, dataset_name + '.features'), 'w')
        for f_name in feature_names:
            output_features.write("%s\n" % f_name.encode('utf-8'))
        output.close()
        output_features.close()