def seg(id, text, out, type):
    """Segment `text` with gezi.cut and write one record line to `out`.

    Output is either 'id<TAB>tok1<TAB>tok2...' or, for pretraining, the bare
    space-joined tokens (the '|' suffix stripped).  Side effects: feeds the
    module-level `counter`/`counter2` vocab counters.
    NOTE: `id` and `type` shadow builtins but are part of the call interface.
    """
    text = filter.filter(text)
    counter.add(START_WORD)
    counter.add(END_WORD)
    tokens = gezi.cut(text, type)
    if type != 'word':
        # cut returned (token, tag) pairs; count both streams separately
        for tok, tag in tokens:
            counter.add(tok)
            counter2.add(tag)
        words = ['%s|%s' % (tok, tag) for tok, tag in tokens]
    elif FLAGS.seg_method == 'char':
        # explode every word into (char, word_index) pairs
        pairs = []
        for idx, word in enumerate(tokens):
            for ch in word:
                counter.add(ch)
                counter2.add(str(idx))
                pairs.append((ch, idx))
        words = ['%s|%d' % (ch, idx) for ch, idx in pairs]
    else:
        words = tokens
    for w in words:
        counter.add(w)
    if not FLAGS.for_pretrain:
        print(id, '\x09'.join(words), sep='\t', file=out)
    else:
        print(' '.join([x.split('|')[0] for x in words]), file=out)
def text2ids(text):
    """Filter `text` and convert it to ids via the flag-configured segmenter (multi-grid, unpadded)."""
    wenzheng.utils.text2ids.init()
    cleaned = filter.filter(text)
    return to_ids(cleaned,
                  seg_method=FLAGS.seg_method,
                  feed_single_en=FLAGS.feed_single_en,
                  to_lower=FLAGS.to_lower,
                  norm_digit=False,
                  multi_grid=True,
                  pad=False)
def text2ids(text, preprocess=True, return_words=False):
    """Convert `text` to ids; optionally skip filtering and/or also return the words.

    Unlike the multi-grid variant elsewhere in the project, this one runs
    with multi_grid=False and honors FLAGS.feed_single.
    """
    wenzheng.utils.text2ids.init()
    if preprocess:
        text = filter.filter(text)
    return to_ids(text,
                  seg_method=FLAGS.seg_method,
                  feed_single=FLAGS.feed_single,
                  feed_single_en=FLAGS.feed_single_en,
                  to_lower=FLAGS.to_lower,
                  norm_digit=False,
                  multi_grid=False,
                  pad=False,
                  return_words=return_words)
def seg(id, text, out, counter):
    """Cut `text`; keep in-vocab non-digit words whole, fall back to chars.

    Each emitted token is 'token|word_index'; `counter` collects the word
    position strings.  Output format matches the other seg() variants:
    'id<TAB>tok...' normally, bare tokens when FLAGS.for_pretrain.
    NOTE: `id` shadows the builtin but is part of the call interface.
    """
    text = filter.filter(text)
    words = []
    for pos, word in enumerate(gezi.cut(text)):
        counter.add(str(pos))
        if vocab.has(word) and not word.isdigit():
            words.append('%s|%d' % (word, pos))
        elif six.PY2:
            # py2: iterate unicode code points, re-encode each for output
            for ch in word.decode('utf8'):
                words.append('%s|%d' % (ch.encode('utf8'), pos))
        else:
            for ch in word:
                words.append('%s|%d' % (ch, pos))
    if not FLAGS.for_pretrain:
        print(id, '\x09'.join(words), sep='\t', file=out)
    else:
        print(' '.join([x.split('|')[0] for x in words]), file=out)
def seg(text, out):
    """Segment filtered `text`; write space-joined non-empty tokens to `out` (nothing if all empty)."""
    cleaned = filter.filter(text)
    tokens = []
    for tok in segmentor.Segment(cleaned, FLAGS.seg_method):
        tok = tok.strip()
        if tok:
            tokens.append(tok)
    if tokens:
        print(' '.join(tokens), file=out)
# -*- coding: utf-8 -*-
# ==============================================================================
#          \file   find-chars.py
#        \author   chenghuige
#          \date   2018-10-01 20:35:40.158875
#   \Description   print every distinct character occurring in the filtered
#                  comments, in order of first appearance
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os

import pandas as pd

from projects.ai2018.sentiment.prepare import filter

df = pd.read_csv('/home/gezi/data/ai2018/sentiment/sentiment_classify_data/comment_raw_v2/raw_comment_v2.csv')

seen = set()
for comment in df['content']:
    for ch in filter.filter(comment):
        if ch not in seen:
            print(ch)
            seen.add(ch)
def seg(id, text, out):
    """Tokenize filtered `text` via text2ids_ and write 'id<TAB>tok1<TAB>tok2...' to `out`."""
    cleaned = filter.filter(text)
    _, tokens = text2ids_(cleaned, return_words=True)
    print(id, '\x09'.join(tokens), sep='\t', file=out)
def build_features(index):
    """Write fold `index` of the dataframe `df` as one tfrecord file.

    Reads rows df[start:end] (fold chosen by gezi.get_fold), converts the
    pre-segmented words column to vocab ids, optionally builds per-word char
    ids, and writes one tf.train.Example per row.  Bumps the shared
    multiprocessing counters `counter` / `total_words`.
    NOTE(review): row layout assumed here is (id, label, content, ..., words)
    from the indexing row[0]/row[1]/row[2]/row[-1] — confirm against the
    producer of `df`.
    """
    mode = get_mode(FLAGS.input)
    start_index = FLAGS.start_index
    out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/{1}.record'.format(mode, index + start_index)
    os.system('mkdir -p %s' % os.path.dirname(out_file))
    print('---out_file', out_file)
    # TODO now only gen one tfrecord file
    total = len(df)
    num_records = FLAGS.num_records_
    ## TODO FIXME whty here still None ? FLAGS.num_records has bee modified before in main as 7 ...
    #print('---------', num_records, FLAGS.num_records_)
    if not num_records:
        # NOTE(review): both branches assign 1, so the inner condition is dead
        # code — presumably the else was meant to use a different count.
        if mode.split('.')[-1] in ['valid', 'test', 'dev', 'pm'] or 'valid' in FLAGS.input:
            num_records = 1
        else:
            num_records = 1
    #print('------------------', num_records, FLAGS.num_records_)
    start, end = gezi.get_fold(total, num_records, index)
    print('total', total, 'infile', FLAGS.input, 'out_file', out_file, 'num_records', num_records, 'start', start, 'end', end)
    max_len = 0
    max_num_ids = 0
    num = 0
    with melt.tfrecords.Writer(out_file) as writer:
        for i in tqdm(range(start, end), ascii=True):
            try:
                #row = df.iloc[i]
                row = df[i]
                id = str(row[0])  # shadows builtin `id`; kept for file consistency
                words = row[-1].split('\t')
                content = row[2]
                content_ori = content  # keep unfiltered text for content_str
                content = filter.filter(content)
                label = int(row[1])
                content_ids = [vocab.id(x) for x in words]
                if len(content_ids) > max_len:
                    max_len = len(content_ids)
                    print('max_len', max_len)
                # NOTE(review): `and` makes this nearly always false (can't be
                # both > word_limit and < 5 for usual limits); `or` was likely
                # intended — confirm before changing.
                if len(content_ids) > FLAGS.word_limit and len(content_ids) < 5:
                    print('{} {} {}'.format(id, len(content_ids), content_ori))
                content_ids = content_ids[:FLAGS.word_limit]
                words = words[:FLAGS.word_limit]
                # NOTICE different from tf, pytorch do not allow all 0 seq for rnn.. if using padding mode
                if FLAGS.use_char:
                    chars = [list(word) for word in words]
                    char_ids = np.zeros([len(content_ids), FLAGS.char_limit], dtype=np.int32)
                    vocab_ = char_vocab if char_vocab else vocab
                    # NOTE: this `i` shadows the row index; harmless here since
                    # the row index is not used after this point in the body.
                    for i, token in enumerate(chars):
                        for j, ch in enumerate(token):
                            if j == FLAGS.char_limit:
                                break
                            char_ids[i, j] = vocab_.id(ch)
                    char_ids = list(char_ids.reshape(-1))
                    # all-zero char ids would break the pytorch rnn path; abort hard
                    if np.sum(char_ids) == 0:
                        print('------------------------bad id', id)
                        print(content_ids)
                        print(words)
                        exit(0)
                else:
                    char_ids = [0]
                feature = {
                    'id': melt.bytes_feature(id),
                    'content': melt.int64_feature(content_ids),
                    'content_str': melt.bytes_feature(content_ori),
                    'char': melt.int64_feature(char_ids),
                    'source': melt.bytes_feature(mode),
                }
                feature['label'] = melt.int64_feature(label)
                # TODO currenlty not get exact info wether show 1 image or 3 ...
                record = tf.train.Example(features=tf.train.Features(feature=feature))
                writer.write(record)
                num += 1
                global counter
                with counter.get_lock():
                    counter.value += 1
                global total_words
                with total_words.get_lock():
                    total_words.value += len(content_ids)
            except Exception:
                # best-effort: log the failing row and keep going
                print(traceback.format_exc(), file=sys.stderr)
                pass
def build_features(file_):
    """Build one tfrecord file from a pre-segmented plain-text file.

    Each input line is filtered, split on spaces, wrapped with start/end
    marks, broken into pieces of at most FLAGS.max_sentence_len words, and
    every piece is written as one tf.train.Example (content ids, content
    string, optional char ids, source tag).

    Side effects: writes <dir>/<tfrecord_dir>/<mode>/<file_name>.record and
    bumps the shared multiprocessing counters `counter` / `total_words`.
    """
    if not os.path.isfile(file_):
        return
    file_name = os.path.basename(file_)
    assert os.path.isdir(FLAGS.input)
    mode = 'train' if 'train' in FLAGS.input else 'valid'
    dir_ = os.path.dirname(os.path.dirname(FLAGS.input))
    out_file = os.path.join(dir_, '{}/{}/{}.record'.format(FLAGS.tfrecord_dir, mode, file_name))
    os.system('mkdir -p %s' % os.path.dirname(out_file))
    print('infile', file_, 'out_file', out_file)
    # if os.path.exists(out_file):
    #   return
    max_len = 0
    max_num_ids = 0
    num = 0
    with melt.tfrecords.Writer(out_file) as writer:
        # total=1e6 is only a progress-bar estimate, not a hard limit
        for line in tqdm(open(file_), total=1e6, ascii=True):
            try:
                line = line.rstrip('\n')
                line = filter.filter(line)
                words = line.split(' ')
                words = gezi.add_start_end(words)
                words_list = gezi.break_sentence(words, FLAGS.max_sentence_len)
                for words in words_list:
                    content = ' '.join(words)
                    content_ids = [vocab.id(x) for x in words]
                    if len(content_ids) > max_len:
                        max_len = len(content_ids)
                        print('max_len', max_len)
                    # NOTE(review): with `and` this is almost always false
                    # (kept as-is to match the sibling build_features
                    # variants); `or` was probably intended — confirm.
                    if len(content_ids) > FLAGS.word_limit and len(content_ids) < 5:
                        # FIX: originally printed the undefined name
                        # `content_ori` (NameError when this branch fires)
                        # and the builtin `id`; use names defined here.
                        print('{} {} {}'.format(file_name, len(content_ids), content))
                    content_ids = content_ids[:FLAGS.word_limit]
                    words = words[:FLAGS.word_limit]
                    # NOTICE different from tf, pytorch do not allow all 0 seq for rnn.. if using padding mode
                    if FLAGS.use_char:
                        chars = [list(word) for word in words]
                        char_ids = np.zeros([len(content_ids), FLAGS.char_limit], dtype=np.int32)
                        vocab_ = char_vocab if char_vocab else vocab
                        for i, token in enumerate(chars):
                            for j, ch in enumerate(token):
                                if j == FLAGS.char_limit:
                                    break
                                char_ids[i, j] = vocab_.id(ch)
                        char_ids = list(char_ids.reshape(-1))
                    else:
                        char_ids = [0]
                    feature = {
                        'content': melt.int64_feature(content_ids),
                        'content_str': melt.bytes_feature(content),
                        'char': melt.int64_feature(char_ids),
                        'source': melt.bytes_feature(FLAGS.source),
                    }
                    # TODO currenlty not get exact info wether show 1 image or 3 ...
                    record = tf.train.Example(features=tf.train.Features(feature=feature))
                    writer.write(record)
                    num += 1
                    global counter
                    with counter.get_lock():
                        counter.value += 1
                    global total_words
                    with total_words.get_lock():
                        total_words.value += len(content_ids)
            except Exception:
                # best-effort: log the failing line and keep going
                print(traceback.format_exc(), file=sys.stderr)
                pass
def build_features(index):
    """Write fold `index` of dataframe `df` as one tfrecord file.

    Per row: takes words either from the external `seg_result` (with
    optional start/end marks) or by re-segmenting the filtered content via
    text2ids_, optionally attaches pos/ner ids and per-word char ids, and
    writes one tf.train.Example.  Labels here are placeholders ([-2]*20),
    matching an unlabeled (test/inference) dataset.

    Side effects: writes the record file and bumps the shared
    multiprocessing counters `counter` / `total_words`.
    """
    mode = get_mode(FLAGS.input)
    start_index = FLAGS.start_index
    out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/{1}.record'.format(mode, index + start_index)
    os.system('mkdir -p %s' % os.path.dirname(out_file))
    print('---out_file', out_file)
    # TODO now only gen one tfrecord file
    total = len(df)
    num_records = FLAGS.num_records_
    if mode.split('.')[-1] in ['valid', 'test', 'dev', 'pm'] or 'valid' in FLAGS.input:
        num_records = 1
    start, end = gezi.get_fold(total, num_records, index)
    print('total', total, 'infile', FLAGS.input, 'out_file', out_file)
    max_len = 0
    max_num_ids = 0
    num = 0
    with melt.tfrecords.Writer(out_file) as writer:
        for i in tqdm(range(start, end), ascii=True):
            try:
                row = df.iloc[i]
                id = str(row[0])  # shadows builtin `id`; kept for file consistency
                if seg_result:
                    if id not in seg_result:
                        # FIX: message typo was 'ot found'
                        print('id %s not found in seg_result' % id)
                        continue
                    words = seg_result[id]
                    if FLAGS.add_start_end_:
                        words = gezi.add_start_end(words, FLAGS.start_mark, FLAGS.end_mark)
                if pos_result:
                    pos = pos_result[id]
                    if FLAGS.add_start_end_:
                        pos = gezi.add_start_end(pos)
                if ner_result:
                    ner = ner_result[id]
                    if FLAGS.add_start_end_:
                        ner = gezi.add_start_end(ner)
                if start_index > 0:
                    # FIX: was `id == 't' + id` — a no-op comparison; the
                    # intent (prefix ids of shifted shards) needs assignment.
                    id = 't' + id
                content = row[1]
                content_ori = content  # keep unfiltered text for content_str
                content = filter.filter(content)
                #label = list(row[2:])
                label = [-2] * 20  # placeholder labels (unlabeled data)
                #label = [x + 2 for x in label]
                #num_labels = len(label)
                if not seg_result:
                    content_ids, words = text2ids_(content, preprocess=False, return_words=True)
                    assert len(content_ids) == len(words)
                else:
                    content_ids = [vocab.id(x) for x in words]
                #print(words, content_ids)
                #exit(0)
                if len(content_ids) > max_len:
                    max_len = len(content_ids)
                    print('max_len', max_len)
                # NOTE(review): with `and` this is almost always false (kept
                # as-is to match the sibling variants); probably `or`.
                if len(content_ids) > FLAGS.word_limit and len(content_ids) < 5:
                    print('{} {} {}'.format(id, len(content_ids), content_ori))
                #if len(content_ids) > FLAGS.word_limit:
                #  print(id, content)
                #  if mode not in ['test', 'valid']:
                #    continue
                #if len(content_ids) < 5 and mode not in ['test', 'valid']:
                #  continue
                content_ids = content_ids[:FLAGS.word_limit]
                words = words[:FLAGS.word_limit]
                # NOTICE different from tf, pytorch do not allow all 0 seq for rnn.. if using padding mode
                if FLAGS.use_char:
                    chars = [list(word) for word in words]
                    char_ids = np.zeros([len(content_ids), FLAGS.char_limit], dtype=np.int32)
                    vocab_ = char_vocab if char_vocab else vocab
                    for i, token in enumerate(chars):
                        for j, ch in enumerate(token):
                            if j == FLAGS.char_limit:
                                break
                            char_ids[i, j] = vocab_.id(ch)
                    char_ids = list(char_ids.reshape(-1))
                    # all-zero char ids would break the pytorch rnn path; abort hard
                    if np.sum(char_ids) == 0:
                        print('------------------------bad id', id)
                        print(content_ids)
                        print(words)
                        exit(0)
                else:
                    char_ids = [0]
                if pos_vocab:
                    assert pos
                    pos = pos[:FLAGS.word_limit]
                    pos_ids = [pos_vocab.id(x) for x in pos]
                else:
                    pos_ids = [0]
                if ner_vocab:
                    assert ner
                    if pos_vocab:
                        assert len(pos) == len(ner)
                    ner = ner[:FLAGS.word_limit]
                    ner_ids = [ner_vocab.id(x) for x in ner]
                else:
                    ner_ids = [0]
                wlen = [len(word) for word in words]
                feature = {
                    'id': melt.bytes_feature(id),
                    'label': melt.int64_feature(label),
                    'content': melt.int64_feature(content_ids),
                    'content_str': melt.bytes_feature(content_ori),
                    'char': melt.int64_feature(char_ids),
                    'pos': melt.int64_feature(pos_ids),  # might also be postion info for mix seg
                    'ner': melt.int64_feature(ner_ids),
                    'wlen': melt.int64_feature(wlen),
                    'source': melt.bytes_feature(mode),
                }
                # TODO currenlty not get exact info wether show 1 image or 3 ...
                record = tf.train.Example(features=tf.train.Features(feature=feature))
                writer.write(record)
                num += 1
                global counter
                with counter.get_lock():
                    counter.value += 1
                global total_words
                with total_words.get_lock():
                    total_words.value += len(content_ids)
            except Exception:
                # best-effort: log the failing row and keep going
                print(traceback.format_exc(), file=sys.stderr)
                pass
def seg(id, text, out):
    """Tokenize the filtered text and write 'id<TAB>tok1<TAB>tok2...' to `out`."""
    tokens = tokenizer.tokenize(filter.filter(text))
    print(id, '\x09'.join(tokens), sep='\t', file=out)
# Resume mode: open output in append mode and skip ids already present there.
fm = 'a'
for line in open(ofile):
    ids_set.add(line.split('\t')[0])
print('%s already done %d' % (ofile, len(ids_set)))

num_errs = 0
with open(ofile, fm) as out:
    df = pd.read_csv(ifile, lineterminator='\n')
    contents = df['content'].values
    ids = df['id'].values
    for i in tqdm(range(len(df)), ascii=True):
        if str(ids[i]) in ids_set:
            continue
        try:
            # char-level "segmentation": one space between every character
            # of the filtered content
            print(' '.join(filter.filter(contents[i])), file=out)
        except Exception:
            # report only the first failure in detail, count the rest
            if num_errs == 0:
                print(traceback.format_exc())
            num_errs += 1
            continue

print('num_errs:', num_errs, 'ratio:', num_errs / len(df))
logging.init('/tmp')

from projects.ai2018.sentiment.prepare import filter

START_WORD = '<S>'
END_WORD = '</S>'

print('seg_method:', FLAGS.seg_method, file=sys.stderr)
print('min_count:', FLAGS.min_count, 'most_common:', FLAGS.most_common)

# Read raw text from stdin, segment each line, and feed the vocab counter.
# A failed segmentation is logged and the line skipped (num not advanced).
num = 0
for line in sys.stdin:
    if num % 10000 == 0:
        print(num, file=sys.stderr)
    text = filter.filter(line.rstrip())
    try:
        words = segmentor.Segment(text, FLAGS.seg_method)
    except Exception:
        print(num, '-----------fail', text)
        print(traceback.format_exc())
        continue
    if num % 10000 == 0:
        logging.info(text, '|'.join(words), len(words))
    counter.add(START_WORD)
    for word in words:
        counter.add(word)
        if word.isdigit():
            counter.add('<NUM>')
    counter.add(END_WORD)
    num += 1