plt.ylabel('True label') plt.xlabel('Predicted label') plt.tight_layout() if len(sys.argv) > 2: file1 = sys.argv[1] file2 = sys.argv[2] df1 = pd.read_csv(file1) df1 = df1.sort_values('id') df2 = pd.read_csv(file2) df2 = df2.sort_values('id') scores1 = [gezi.str2scores(x) for x in df1['score'].values] scores2 = [gezi.str2scores(x) for x in df2['score'].values] scores1 = np.reshape(scores1, [-1, len(ATTRIBUTES), 4]) scores1 = gezi.softmax(scores1) scores2 = np.reshape(scores2, [-1, len(ATTRIBUTES), 4]) scores2 = gezi.softmax(scores2) ndf1 = pd.DataFrame() ndf2 = pd.DataFrame() for i, attr in enumerate(ATTRIBUTES): score1 = np.reshape(scores1[:, i, :], [-1]) score2 = np.reshape(scores2[:, i, :], [-1]) ndf1[attr] = score1 ndf2[attr] = score2
# NOTE: per-attribute softmax normalisation of `iscores`
# (reshape to [num_attrs, 4], softmax over the last axis, flatten again)
# is intentionally disabled; the raw values are used as they are.
iscores = np.array(iscores)

print(valid_file)
# Load the validation predictions ordered by id.
df = pd.read_csv(valid_file, sep=',').sort_values('id')

# `num_attrs` label columns start at column `idx`; the same number of
# prediction columns follows immediately after them.
labels = df.iloc[:, idx:idx + num_attrs].values
predicts = df.iloc[:, idx + num_attrs:idx + 2 * num_attrs].values

# The 'logit' column is the active score source ('score' and 'prob' are the
# alternatives that were tried). The same softmax normalisation as above is
# disabled for these scores too.
scores = np.array([gezi.str2scores(score) for score in df['logit']])

ids = df.iloc[:, 0].values

# One column name per (attribute, class) pair: '<attr>_0' .. '<attr>_3'.
cnames = []
for attr in ATTRIBUTES:
    for i in range(4):
        cnames.append(f'{attr}_{i}')
print('---------', cnames)
df = pd.read_csv('./models.csv') df = df[df['model'] != 'ensemble'] models_ = df['model'].values files_ = df['file'].values metrics = df['adjusted_f1/mean'].values models = [] files = [] for file, model in tqdm(zip(files_, models_), ascii=True): if not os.path.exists(file): continue df = pd.read_csv(file) df = df.sort_values('id') scores = [gezi.str2scores(x) for x in df['score'].values] scores = np.reshape(scores, [-1, len(ATTRIBUTES), 4]) scores = gezi.softmax(scores) ndf = pd.DataFrame() ndf['score'] = np.reshape(scores, [-1]) dfs.append(ndf) files.append(file) models.append(model) def calc_correlation(x, y, method): if method.startswith('ks'): ks_stat, p_value = ks_2samp(x, y) if method == 'ks_s': score = ks_stat else:
def evaluate_file(file):
    """Evaluate one prediction CSV and print overall and per-length metrics.

    The CSV's 'score' column is parsed with `gezi.str2scores` and reshaped to
    [-1, NUM_ATTRIBUTES, NUM_CLASSES]; the NUM_ATTRIBUTES columns starting at
    column 2 are used as labels (shifted by +2). Metrics come from
    `evaluate(labels, predicts)`. The documents are then split by the length
    of their 'content' value against `FLAGS.len_thre` and each bucket is
    evaluated and reported separately.

    Args:
        file: path of the CSV file to evaluate.

    Returns:
        The `(vals, names)` pair produced by `evaluate` on the full file.
    """

    def report_means(metric_names, metric_vals):
        # Only the aggregated ('mean') metrics are shown in summaries.
        for metric_name, metric_val in zip(metric_names, metric_vals):
            if 'mean' in metric_name:
                print(metric_name, metric_val)

    print('-------------------------', file)
    df = pd.read_csv(file)

    parsed_scores = np.array([gezi.str2scores(score) for score in df['score']])
    predicts = np.reshape(parsed_scores, [-1, NUM_ATTRIBUTES, NUM_CLASSES])
    # for auc might need to do this
    #predicts /= 26

    first_label_col = 2
    labels = df.iloc[:, first_label_col:first_label_col + NUM_ATTRIBUTES].values
    # presumably maps raw labels in -2..1 onto class indices 0..3 -- TODO confirm
    labels += 2

    assert labels.shape[0] == 15000, labels.shape[0]

    vals, names = evaluate(labels, predicts)

    if FLAGS.show_detail:
        for name, val in zip(names, vals):
            print(name, val)

    # NOTE(review): reconstructed from whitespace-collapsed source; the
    # separator is treated as unconditional.
    print('---------------------------------')
    report_means(names, vals)

    lens = [len(x) for x in df['content'].values]
    threshold = FLAGS.len_thre

    # Short documents first, then long ones.
    buckets = (
        ('num docs len < ', [n < threshold for n in lens]),
        ('num docs len >= ', [n >= threshold for n in lens]),
    )
    for caption, selected in buckets:
        bucket_predicts = np.array(
            [predict for keep, predict in zip(selected, predicts) if keep])
        bucket_labels = np.array(
            [label for keep, label in zip(selected, labels) if keep])
        print(caption, threshold, len(bucket_predicts))
        bucket_vals, bucket_names = evaluate(bucket_labels, bucket_predicts)
        report_means(bucket_names, bucket_vals)

    return vals, names
def _build_label(row, mode):
    """Return the label vector for one input row.

    With hard labels (`FLAGS.use_soft_label_` false) the result is the raw
    label columns `row[2:]`, or twenty `-2` placeholders when `mode` contains
    'test'. With soft labels the result is an 80-float vector: either a
    one-hot encoding of `row[2:]` (4 slots per attribute) or, when
    `FLAGS.is_soft_label` is set, the softmax of the row's 'score' logits.
    """
    if not FLAGS.use_soft_label_:
        if 'test' in mode:
            return [-2] * 20
        return list(row[2:])

    if not FLAGS.is_soft_label:
        label = [0.] * 80
        # NOTE(review): if `val` can be negative (raw labels in -2..1) this
        # indexes the wrong slot -- confirm labels are already 0-based here.
        for attr_idx, val in enumerate(row[2:]):
            label[attr_idx * 4 + val] = 1.
        return label

    logits = np.reshape(np.array(gezi.str2scores(row['score'])), [20, 4])
    probs = gezi.softmax(logits)
    return list(np.reshape(probs, [-1]))


def _build_char_ids(words, num_words):
    """Return flattened char ids, `FLAGS.char_limit` slots per word, 0-padded."""
    char_ids = np.zeros([num_words, FLAGS.char_limit], dtype=np.int32)
    char_vocab_ = char_vocab if char_vocab else vocab
    for word_idx, word in enumerate(words):
        for char_idx, ch in enumerate(list(word)[:FLAGS.char_limit]):
            char_ids[word_idx, char_idx] = char_vocab_.id(ch)
    return list(char_ids.reshape(-1))


def build_features(index):
    """Convert one fold of the global `df` into a TFRecord file.

    The output path is derived from the directory of `FLAGS.vocab_`, the mode
    returned by `get_mode(FLAGS.input)` and `index + FLAGS.start_index`. Rows
    `start..end` (from `gezi.get_fold`) are turned into `tf.train.Example`
    records holding ids, word/char/pos/ner ids, word lengths, the original
    content and the label. A row that raises an exception is reported on
    stderr and skipped, so one bad row does not abort the whole shard.

    Args:
        index: zero-based fold number handled by this call.

    Side effects:
        Creates the output directory, writes the record file, and increments
        the shared `counter` and `total_words` values.
    """
    global counter, total_words

    mode = get_mode(FLAGS.input)
    start_index = FLAGS.start_index

    out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/{1}.record'.format(mode, index + start_index)
    # Create the directory without a shell: safe for paths containing spaces
    # or shell metacharacters, and tolerant of it already existing.
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    print('---out_file', out_file)
    # TODO now only gen one tfrecord file

    total = len(df)
    num_records = FLAGS.num_records_
    # TODO FIXME: why is this still None here? FLAGS.num_records was already
    # set to 7 in main.
    if not num_records:
        if mode.split('.')[-1] in ['valid', 'test', 'dev', 'pm'] or 'valid' in FLAGS.input:
            num_records = 1
        else:
            num_records = 7
    start, end = gezi.get_fold(total, num_records, index)

    print('total', total, 'infile', FLAGS.input, 'out_file', out_file)

    max_len = 0
    max_num_ids = 0
    num = 0
    with melt.tfrecords.Writer(out_file) as writer:
        for row_idx in tqdm(range(start, end), ascii=True):
            try:
                row = df.iloc[row_idx]
                doc_id = str(row[0])

                if seg_result:
                    if doc_id not in seg_result:
                        print('id %s ot found in seg_result' % doc_id)
                        continue
                    words = seg_result[doc_id]
                    if FLAGS.content_limit_:
                        # NOW only for bert!
                        # Over-long texts keep the head and the last 50 tokens,
                        # joined by a single '[MASK]' token.
                        if len(words) + 2 > FLAGS.content_limit_:
                            words = words[:FLAGS.content_limit_ - 3 - 50] + ['[MASK]'] + words[-50:]
                    if FLAGS.add_start_end_:
                        words = gezi.add_start_end(words, FLAGS.start_mark, FLAGS.end_mark)
                if pos_result:
                    pos = pos_result[doc_id]
                    if FLAGS.add_start_end_:
                        pos = gezi.add_start_end(pos)
                if ner_result:
                    ner = ner_result[doc_id]
                    if FLAGS.add_start_end_:
                        ner = gezi.add_start_end(ner)

                # Prefix ids of shards written with a non-zero start index.
                # The original used `==` here, which compared and discarded
                # the result, so the prefix was never applied. The lookups
                # above deliberately use the un-prefixed id.
                if start_index > 0:
                    doc_id = 't' + doc_id

                content = row[1]
                content_ori = content
                content = filter.filter(content)

                label = _build_label(row, mode)

                if not seg_result:
                    content_ids, words = text2ids_(content, preprocess=False, return_words=True)
                    assert len(content_ids) == len(words)
                else:
                    content_ids = [vocab.id(x) for x in words]

                if len(content_ids) > max_len:
                    max_len = len(content_ids)
                    print('max_len', max_len)

                # NOTE(review): with `and` this only fires when word_limit < 5;
                # it looks like `or` was intended -- confirm before changing,
                # since that would print far more lines.
                if len(content_ids) > FLAGS.word_limit and len(content_ids) < 5:
                    print('{} {} {}'.format(doc_id, len(content_ids), content_ori))

                content_ids = content_ids[:FLAGS.word_limit]
                words = words[:FLAGS.word_limit]

                # NOTICE different from tf, pytorch do not allow all 0 seq for
                # rnn.. if using padding mode
                if FLAGS.use_char:
                    char_ids = _build_char_ids(words, len(content_ids))
                    if np.sum(char_ids) == 0:
                        print('------------------------bad id', doc_id)
                        print(content_ids)
                        print(words)
                        # SystemExit is not caught by `except Exception` below,
                        # so this really stops the process.
                        exit(0)
                else:
                    char_ids = [0]

                if pos_vocab:
                    assert pos
                    pos = pos[:FLAGS.word_limit]
                    pos_ids = [pos_vocab.id(x) for x in pos]
                else:
                    pos_ids = [0]

                if ner_vocab:
                    assert ner
                    if pos_vocab:
                        # NOTE(review): `pos` is already truncated here while
                        # `ner` is not, so this fails for inputs longer than
                        # word_limit -- confirm whether that is intended.
                        assert len(pos) == len(ner)
                    ner = ner[:FLAGS.word_limit]
                    ner_ids = [ner_vocab.id(x) for x in ner]
                else:
                    ner_ids = [0]

                wlen = [len(word) for word in words]

                feature = {
                    'id': melt.bytes_feature(doc_id),
                    'content': melt.int64_feature(content_ids),
                    'content_str': melt.bytes_feature(content_ori),
                    'char': melt.int64_feature(char_ids),
                    'pos': melt.int64_feature(pos_ids),  # might also be position info for mix seg
                    'ner': melt.int64_feature(ner_ids),
                    'wlen': melt.int64_feature(wlen),
                    'source': melt.bytes_feature(mode),
                }
                # Hard labels are stored as int64, soft labels as floats.
                if not FLAGS.use_soft_label_:
                    feature['label'] = melt.int64_feature(label)
                else:
                    feature['label'] = melt.float_feature(label)

                # TODO currently no exact info on whether 1 image or 3 are shown ...
                record = tf.train.Example(features=tf.train.Features(feature=feature))

                writer.write(record)
                num += 1
                with counter.get_lock():
                    counter.value += 1
                with total_words.get_lock():
                    total_words.value += len(content_ids)
            except Exception:
                # Best effort: report the failing row and carry on.
                print(traceback.format_exc(), file=sys.stderr)