def tokenize(index):
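    # Per-thread worker for one fold of df: tokenize each comment with
    # gezi.segment.tokenize_filter_empty and write the space-joined tokens,
    # one comment per line, to <FLAGS.out_dir>/<name>_<index>.txt.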
    comments = df['comment_text']
    start, end = gezi.get_fold(len(comments), FLAGS.threads, index)

    #for i in tqdm(range(start, end)):
    with open('%s/%s_%d.txt' % (FLAGS.out_dir, name, index), 'w') as out:
        for i in range(start, end):
            if i % 1000 == 0:
                print(i, file=sys.stderr)
            sent = gezi.segment.tokenize_filter_empty(comments[i].replace(
                '\n', ' '))
            print(' '.join(sent), file=out)
def tokenize(index):
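    # Per-thread worker: fill the shared global context_tokens_list for this
    # fold, either with the special tokenizer's tokens or with lower-cased
    # gezi tokens (after optional GloVe twitter preprocessing).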
    global context_tokens_list
    comments = df['comment_text']
    start, end = gezi.get_fold(len(comments), FLAGS.threads, index)
    for i in tqdm(range(start, end), ascii=True):
        comment = comments[i]
        if FLAGS.special_tokenizer:
            context_tokens_list[i] = tokenizer.tokenize(comment).tokens
        else:
            if FLAGS.is_twitter:
                comment = glove_twitter_preprocess(comment)
            context_tokens_list[i] = [
                x.lower() for x in gezi.segment.tokenize_filter_empty(comment)
            ]
def get_fold(ids, index):
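  # Fold boundaries over rows that may share ids: dedup the ids in order,
  # split the unique ids into FLAGS.num_records folds with gezi.get_fold,
  # then map the fold edges back to row indexes so rows with the same id
  # (assumed adjacent) stay in one fold.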
  ids_ = []
  ids = list(ids)
  ids_set = set()
  for id in ids:
    if id not in ids_set:
      ids_.append(id)
      ids_set.add(id)
  start_, end_ = gezi.get_fold(len(ids_), FLAGS.num_records, index)
  
  ids.append('END')
  ids_.append('END')
    
  start = None 
  end = None 
  for i in range(len(ids)):
    if ids[i] == ids_[start_]:
      start = i
    elif ids[i] == ids_[end_]:
      end = i
      return start, end
def run(index):
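    # Per-thread worker: split each comment of this fold into sentences with
    # gezi.segment.doc and write "id<TAB>sentence" lines (newlines replaced
    # by the literal NEWLINE) to <input>.sents.<index>.txt.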
    df = pd.read_csv(input)
    ids = df['id'].values
    comments = df['comment_text'].values

    start, end = gezi.get_fold(len(comments), num_threads, index)

    output = input.replace('.csv', '.sents.%d.txt' % index)
    print(output)
    num = 0
    with open(output, 'w') as out:
        for id, comment in zip(ids[start:end], comments[start:end]):
            if num % 1000 == 0:
                print(num)
            num += 1
            doc = gezi.segment.doc(comment)
            for sent in doc.sents:
                print(id,
                      sent.text.replace('\n', 'NEWLINE'),
                      sep='\t',
                      file=out)
def build_features(index):
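  # Worker for one output shard: turn each impression item of this fold into a
  # tf.train.Example and write it with melt.tfrecords.Writer; the temporary
  # .TMP file is renamed to carry the final record count, or removed if empty.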
  total = len(df)
  start, end = gezi.get_fold(total, FLAGS.num_records, index)
  df_ = df.iloc[start:end]

  num_records = 0

  buffer_size = None if (FLAGS.mark != 'train' or not FLAGS.shuffle_impressions) else FLAGS.shuffle_buffer_size
  ofile = f'{FLAGS.out_dir}/{FLAGS.mark}/record_{index}.TMP'
  folder_name = FLAGS.mark
  if FLAGS.neg_parts > 1:
    folder_name = f'{FLAGS.mark}-{FLAGS.neg_part}'
    os.system(f'mkdir -p {FLAGS.out_dir}/{folder_name}')
    ofile = f'{FLAGS.out_dir}/{FLAGS.mark}-{FLAGS.neg_part}/record_{index}.TMP'
  writer = melt.tfrecords.Writer(ofile, buffer_size=buffer_size) 

  if FLAGS.mark == 'train' and FLAGS.train_by_day:
    # 2019 11 9 -> 11 14
    num_days = 7
    num_records_list = [0] * num_days
    ofiles = []
    writers = []
    for i in range(num_days):
      os.system(f'mkdir -p {FLAGS.out_dir}/{folder_name}-days/{i}')
      ofiles += [f'{FLAGS.out_dir}/{folder_name}-days/{i}/record_{index}.TMP']
      writers += [melt.tfrecords.Writer(ofiles[-1], buffer_size=buffer_size)]

  for _, row in tqdm(df_.iterrows(), total=len(df_), ascii=True):
    time_ = row['time']
    day = int(time_.split()[0].split('/')[1]) - 9
    if FLAGS.day is not None and day != FLAGS.day:
      continue

    x = to_datetime(time_)
    weekday = x.weekday() 
    hour = x.hour
    timestamp = to_timestamp(x) 

    impressions = row['impressions'].split()
    impression_id = row['impression_id']
    uid = uid_vocab.id(row['uid'])

    try:
      history = [did_vocab.id(x) for x in reversed(row['history'].split())]
    except Exception:
      # print(row['history'], row['impression_id'])
      history = []
    
    feature = {}
    feature['uid_'] = row['uid']
    feature['uid'] = uid
    feature['day'] = day
    feature['weekday'] = weekday
    feature['hour'] = hour
    feature['history'] = history
    feature['impression_id'] = impression_id
    feature['uid_in_train'] = int(uid_vocab2.has(row['uid']))
    feature['impression_len'] = len(impressions) 
    feature['impressions'] = [did_vocab.id(x.split('-')[0]) for x in impressions]

    if FLAGS.record_padded:
      feature['history'] = gezi.pad(feature['history'], FLAGS.max_history)
      feature['impressions'] = gezi.pad(feature['impressions'], FLAGS.max_impressions)

    if FLAGS.neg_parts > 1:
      indexes = list(range(len(impressions)))
      np.random.shuffle(indexes)

    for i, impression in enumerate(impressions):
      if '-' in impression:
        did_, click = impression.split('-')  
      else:
        did_, click = impression, '0'
      click = int(click)

      if FLAGS.neg_parts > 1:        
        if not click and indexes[i] % FLAGS.neg_parts != FLAGS.neg_part:
          continue

      start_timestamp = start_timestamps[did_]
      fresh = timestamp - start_timestamp
      did = did_vocab.id(did_)

      feature['fresh'] = fresh
      feature['did_in_train'] = int(did_vocab2.has(did_))

      feature['click'] = click
      feature['did_'] = did_
      feature['did'] = did
      feature['id'] = impression_id * 100 + i
      feature['position'] = i

      feature_ = {}
      for key in feature:
        feature_[key] = feature[key]
        if isinstance(feature[key], (list, tuple)) and not feature[key]:
          feature_[key] = [X]
      for key in feature_:
        try:
          feature_[key] = melt.gen_feature(feature_[key])
        except Exception:
          print(key, feature[key])
          print(traceback.format_exc())
          exit(0)

      record = tf.train.Example(features=tf.train.Features(feature=feature_))

      if FLAGS.mark == 'train' and FLAGS.train_by_day:
        writer = writers[day]

      writer.write(record)

      if FLAGS.mark == 'train' and FLAGS.train_by_day:
        num_records_list[day] += 1
      else:
        num_records += 1

  if FLAGS.mark == 'train' and FLAGS.train_by_day:
    for i in range(num_days):
      writers[i].close()   
      if num_records_list[i] == 0:
        os.system('rm -rf %s' % ofiles[i])
      else:
        ofile2 = ofiles[i].replace('.TMP', f'.{num_records_list[i]}')
        os.system('mv %s %s' % (ofiles[i], ofile2))
  else:
    writer.close()
    if num_records == 0:
      os.system('rm -rf %s' % ofile)
    else:
      ofile2 = ofile.replace('.TMP', f'.{num_records}')
      os.system('mv %s %s' % (ofile, ofile2))
def build_features(index):
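    # Worker for one output shard: tokenize the comment rows of this fold and
    # write word ids, per-token attributes, char ids and class labels as
    # tf.train.Example records to <dirname(FLAGS.vocab)>/<mode>/<index>.record.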
    mode = 'train' if 'train' in FLAGS.input else 'test'
    out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(
        mode, index)
    os.system('mkdir -p %s' % os.path.dirname(out_file))
    print('---out_file', out_file)
    # TODO now only gen one tfrecord file

    total = len(examples)
    start, end = gezi.get_fold(total, FLAGS.num_records, index)

    ids = examples['id'].values[start:end]
    comments = examples['comment_text'].values[start:end]

    try:
        labels = examples[CLASSES].values[start:end]
    except Exception:
        labels = [[0.] * len(CLASSES)] * len(ids)

    with melt.tfrecords.Writer(out_file) as writer:
        for id, comment, label in tqdm(zip(ids, comments, labels)):
            comment_str = comment
            # TODO use info
            doc = tokenizer.tokenize(comment)
            comment_tokens, tokens_info = doc.tokens, doc.attributes

            for i in range(len(tokens_info)):
                tokens_info[i] = list(map(float, tokens_info[i]))

            if FLAGS.comment_limit:
                comment_tokens = comment_tokens[:FLAGS.comment_limit]
                tokens_info = tokens_info[:FLAGS.comment_limit]

            tokens_info = np.array(tokens_info)
            tokens_info = tokens_info.reshape(-1)
            tokens_info = list(tokens_info)

            assert len(
                tokens_info) == len(comment_tokens) * len(attribute_names)

            comment_ids = [get_id(token, vocab) for token in comment_tokens]
            comment_tokens_str = '|'.join(
                [vocab.key(id) for id in comment_ids])
            label = list(map(float, label))

            comment_chars = [list(token) for token in comment_tokens]

            char_ids = np.zeros([len(comment_ids), FLAGS.char_limit],
                                dtype=np.int32)

            for i, token in enumerate(comment_chars):
                for j, ch in enumerate(token):
                    if j == FLAGS.char_limit:
                        break
                    char_ids[i, j] = get_char_id(ch, char_vocab)

            char_ids = list(char_ids.reshape(-1))

            #print(char_ids)

            simple_char_ids = []
            num_chs = 0
            for ch in list(comment):
                id_ = get_char_id(ch, char_vocab)
                #if id_ == char_vocab.unk_id():
                #  continue
                simple_char_ids.append(id_)
                if len(simple_char_ids) == FLAGS.simple_char_limit:
                    break

            simple_chars_str = ''.join(
                [char_vocab.key(id) for id in simple_char_ids])

            #print(simple_char_ids, simple_chars_str)

            record = tf.train.Example(features=tf.train.Features(
                feature={
                    "comment": melt.int64_feature(comment_ids),
                    "tokens_info": melt.float_feature(tokens_info),
                    "comment_chars": melt.int64_feature(char_ids),
                    "simple_chars": melt.int64_feature(simple_char_ids),
                    "simple_chars_str": melt.bytes_feature(simple_chars_str),
                    "classes": melt.float_feature(label),
                    "id": melt.bytes_feature(id),
                    "comment_str": melt.bytes_feature(comment_str),
                    "comment_tokens_str": melt.bytes_feature(
                        comment_tokens_str)
                }))

            writer.write(record)
            global counter
            with counter.get_lock():
                counter.value += 1

        print("Build {} instances of features in total".format(writer.size()))
        writer.close()
def tokenize(index):
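    # Per-thread worker: either reuse precomputed tokens/attributes/poses/tags/
    # ners columns from df, or run the (full) tokenizer on the raw comments,
    # filling the shared global *_list arrays for this fold.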
    global tokens_list
    comments = df['comment_text']
    start, end = gezi.get_fold(len(comments), FLAGS.threads, index)

    if 'tokens' in df.columns:
        for i in range(start, end):
            # if df['id'][i] == '5bbabc3b14cc1f7f':
            #   sent = tokenizer.full_tokenize(comments[i])
            #   tokens_list[i] = sent.tokens
            #   attributes_list[i] = np.reshape(np.array([list(map(float, x)) for  x in sent.attributes]), -1)
            #   poses_list[i] = sent.poses
            #   tags_list[i] = sent.tags
            #   ners_list[i] = sent.ners
            # else:
            tokens_list[i] = df['tokens'][i].split(' ')
            attributes_list[i] = df['attributes'][i].split(' ')
            # if len(attributes_list[i]) != len(attribute_names) * len(tokens_list[i]) or FLAGS.modify_attribute:
            #   sent = tokenizer.tokenize(comments[i])
            #   attributes_list[i] = np.reshape(np.array([list(map(float, x)) for  x in sent.attributes]), -1)
            #   assert len(attributes_list[i]) == len(attribute_names) * len(tokens_list[i]), '{} {} {} {}'.format(len(attributes_list[i]) / len(attribute_names), len(tokens_list[i]), i, df['id'][i])
            poses_list[i] = df['poses'][i].split(' ')
            tags_list[i] = df['tags'][i].split(' ')
            ners_list[i] = df['ners'][i].split(' ')
            ori_tokens_list[i] = df['ori_tokens'][i].split(' ')
    else:
        for i in tqdm(range(start, end)):
            # for i in range(start, end):
            #   if i % 1000 == 0:
            #     print(i, file=sys.stderr)
            if FLAGS.full_tokenizer:
                #if FLAGS.simple_tokenizer:
                sent = tokenizer.full_tokenize(
                    comments[i], lemmatization=FLAGS.lemmatization)
                # else:
                #   sent = gezi.segment.tokenize_filter_empty(comments[i])
                # if FLAGS.lower:
                #   sent.tokens = [w.lower() for w in sent.tokens]
                tokens_list[i] = sent.tokens
                ori_tokens_list[i] = sent.ori_tokens
                attributes_list[i] = np.reshape(
                    np.array([list(map(float, x)) for x in sent.attributes]),
                    -1)
                poses_list[i] = sent.poses
                tags_list[i] = sent.tags
                ners_list[i] = sent.ners
            else:
                sent = tokenizer.tokenize(comments[i],
                                          lemmatization=FLAGS.lemmatization)
                # if FLAGS.lower:
                #   sent.tokens = [w.lower() for w in sent.tokens]
                tokens_list[i] = sent.tokens
                ori_tokens_list[i] = sent.ori_tokens

                #print('----------', sent.attributes)
                try:
                    attributes_list[i] = np.reshape(
                        np.array(
                            [list(map(float, x)) for x in sent.attributes]),
                        -1)
                except Exception:
                    print(sent.attributes)
                    raise ValueError()
                poses_list[i] = ['NONE'] * len(tokens_list[i])
                tags_list[i] = ['NONE'] * len(tokens_list[i])
                ners_list[i] = ['NONE'] * len(tokens_list[i])
def build_features(index):
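    # Worker for one output shard: rows carry pre-split words; map them (and
    # optionally their chars) to vocab ids and write id/content/char/label
    # records to <dirname(FLAGS.vocab_)>/<mode>/<index + start_index>.record.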
    mode = get_mode(FLAGS.input)

    start_index = FLAGS.start_index

    out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/{1}.record'.format(
        mode, index + start_index)
    os.system('mkdir -p %s' % os.path.dirname(out_file))
    print('---out_file', out_file)
    # TODO now only gen one tfrecord file

    total = len(df)
    num_records = FLAGS.num_records_
    ## TODO FIXME why is this still None? FLAGS.num_records has been modified before in main as 7 ...
    #print('---------', num_records, FLAGS.num_records_)
    if not num_records:
        if mode.split('.')[-1] in ['valid', 'test', 'dev', 'pm'
                                   ] or 'valid' in FLAGS.input:
            num_records = 1
        else:
            num_records = 1
    #print('------------------', num_records, FLAGS.num_records_)
    start, end = gezi.get_fold(total, num_records, index)

    print('total', total, 'infile', FLAGS.input, 'out_file', out_file,
          'num_records', num_records, 'start', start, 'end', end)

    max_len = 0
    max_num_ids = 0
    num = 0
    with melt.tfrecords.Writer(out_file) as writer:
        for i in tqdm(range(start, end), ascii=True):
            try:
                #row = df.iloc[i]
                row = df[i]
                id = str(row[0])

                words = row[-1].split('\t')

                content = row[2]
                content_ori = content
                content = filter.filter(content)

                label = int(row[1])

                content_ids = [vocab.id(x) for x in words]

                if len(content_ids) > max_len:
                    max_len = len(content_ids)
                    print('max_len', max_len)

                if len(content_ids) > FLAGS.word_limit or len(content_ids) < 5:
                    print('{} {} {}'.format(id, len(content_ids), content_ori))

                content_ids = content_ids[:FLAGS.word_limit]
                words = words[:FLAGS.word_limit]

                # NOTICE unlike tf, pytorch does not allow an all-zero sequence for rnn when using padding mode
                if FLAGS.use_char:
                    chars = [list(word) for word in words]
                    char_ids = np.zeros([len(content_ids), FLAGS.char_limit],
                                        dtype=np.int32)

                    vocab_ = char_vocab if char_vocab else vocab

                    for i, token in enumerate(chars):
                        for j, ch in enumerate(token):
                            if j == FLAGS.char_limit:
                                break
                            char_ids[i, j] = vocab_.id(ch)

                    char_ids = list(char_ids.reshape(-1))
                    if np.sum(char_ids) == 0:
                        print('------------------------bad id', id)
                        print(content_ids)
                        print(words)
                        exit(0)
                else:
                    char_ids = [0]

                feature = {
                    'id': melt.bytes_feature(id),
                    'content': melt.int64_feature(content_ids),
                    'content_str': melt.bytes_feature(content_ori),
                    'char': melt.int64_feature(char_ids),
                    'source': melt.bytes_feature(mode),
                }
                feature['label'] = melt.int64_feature(label)

                # TODO currently we do not get exact info on whether 1 image or 3 are shown ...
                record = tf.train.Example(features=tf.train.Features(
                    feature=feature))

                writer.write(record)
                num += 1
                global counter
                with counter.get_lock():
                    counter.value += 1
                global total_words
                with total_words.get_lock():
                    total_words.value += len(content_ids)
            except Exception:
                print(traceback.format_exc(), file=sys.stderr)
                pass
def build_features(index):
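    # Worker for one output shard: like the variant above, but can take
    # externally provided segmentation/pos/ner results (seg_result, pos_result,
    # ner_result) and writes a fixed placeholder label of [-2] * 20.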
    mode = get_mode(FLAGS.input)

    start_index = FLAGS.start_index

    out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/{1}.record'.format(
        mode, index + start_index)
    os.system('mkdir -p %s' % os.path.dirname(out_file))
    print('---out_file', out_file)
    # TODO now only gen one tfrecord file

    total = len(df)
    num_records = FLAGS.num_records_
    if mode.split('.')[-1] in ['valid', 'test', 'dev', 'pm'
                               ] or 'valid' in FLAGS.input:
        num_records = 1
    start, end = gezi.get_fold(total, num_records, index)

    print('total', total, 'infile', FLAGS.input, 'out_file', out_file)

    max_len = 0
    max_num_ids = 0
    num = 0
    with melt.tfrecords.Writer(out_file) as writer:
        for i in tqdm(range(start, end), ascii=True):
            try:
                row = df.iloc[i]
                id = str(row[0])

                if seg_result:
                    if id not in seg_result:
                        print('id %s not found in seg_result' % id)
                        continue
                    words = seg_result[id]
                    if FLAGS.add_start_end_:
                        words = gezi.add_start_end(words, FLAGS.start_mark,
                                                   FLAGS.end_mark)
                if pos_result:
                    pos = pos_result[id]
                    if FLAGS.add_start_end_:
                        pos = gezi.add_start_end(pos)
                if ner_result:
                    ner = ner_result[id]
                    if FLAGS.add_start_end_:
                        ner = gezi.add_start_end(ner)

                if start_index > 0:
                    id = 't' + id

                content = row[1]
                content_ori = content
                content = filter.filter(content)

                #label = list(row[2:])
                label = [-2] * 20

                #label = [x + 2 for x in label]
                #num_labels = len(label)

                if not seg_result:
                    content_ids, words = text2ids_(content,
                                                   preprocess=False,
                                                   return_words=True)
                    assert len(content_ids) == len(words)
                else:
                    content_ids = [vocab.id(x) for x in words]
                    #print(words, content_ids)
                    #exit(0)

                if len(content_ids) > max_len:
                    max_len = len(content_ids)
                    print('max_len', max_len)

                if len(content_ids) > FLAGS.word_limit or len(content_ids) < 5:
                    print('{} {} {}'.format(id, len(content_ids), content_ori))
                #if len(content_ids) > FLAGS.word_limit:
                #  print(id, content)
                #  if mode not in ['test', 'valid']:
                #    continue

                #if len(content_ids) < 5 and mode not in ['test', 'valid']:
                #  continue

                content_ids = content_ids[:FLAGS.word_limit]
                words = words[:FLAGS.word_limit]

                # NOTICE unlike tf, pytorch does not allow an all-zero sequence for rnn when using padding mode
                if FLAGS.use_char:
                    chars = [list(word) for word in words]
                    char_ids = np.zeros([len(content_ids), FLAGS.char_limit],
                                        dtype=np.int32)

                    vocab_ = char_vocab if char_vocab else vocab

                    for i, token in enumerate(chars):
                        for j, ch in enumerate(token):
                            if j == FLAGS.char_limit:
                                break
                            char_ids[i, j] = vocab_.id(ch)

                    char_ids = list(char_ids.reshape(-1))
                    if np.sum(char_ids) == 0:
                        print('------------------------bad id', id)
                        print(content_ids)
                        print(words)
                        exit(0)
                else:
                    char_ids = [0]

                if pos_vocab:
                    assert pos
                    pos = pos[:FLAGS.word_limit]
                    pos_ids = [pos_vocab.id(x) for x in pos]
                else:
                    pos_ids = [0]

                if ner_vocab:
                    assert ner
                    if pos_vocab:
                        assert len(pos) == len(ner)
                    ner = ner[:FLAGS.word_limit]

                    ner_ids = [ner_vocab.id(x) for x in ner]
                else:
                    ner_ids = [0]

                wlen = [len(word) for word in words]

                feature = {
                    'id': melt.bytes_feature(id),
                    'label': melt.int64_feature(label),
                    'content': melt.int64_feature(content_ids),
                    'content_str': melt.bytes_feature(content_ori),
                    'char': melt.int64_feature(char_ids),
                    'pos': melt.int64_feature(
                        pos_ids),  # might also be postion info for mix seg
                    'ner': melt.int64_feature(ner_ids),
                    'wlen': melt.int64_feature(wlen),
                    'source': melt.bytes_feature(mode),
                }

                # TODO currently we do not get exact info on whether 1 image or 3 are shown ...
                record = tf.train.Example(features=tf.train.Features(
                    feature=feature))

                writer.write(record)
                num += 1
                global counter
                with counter.get_lock():
                    counter.value += 1
                global total_words
                with total_words.get_lock():
                    total_words.value += len(content_ids)
            except Exception:
                print(traceback.format_exc(), file=sys.stderr)
                pass
def build_features(index):
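  # Worker for one output shard: rows carry precomputed tokens, attributes,
  # poses, tags and ners; besides word/char ids this variant also writes ngram
  # ids and a 7-value per-comment statistics vector (comment_info).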
  mode = get_mode()
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(mode, index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file 

  total = len(examples)
  if not FLAGS.has_dup:
    start, end = gezi.get_fold(total, FLAGS.num_records, index)
  else:
    start, end = get_fold(examples['id'].values, index)

  ids = examples['id'].values[start: end]
  ids = list(map(str, ids))
  comments = examples['comment_text'].values[start: end]
  tokens_list = examples['tokens'].values[start: end]
  tokens_infos = examples['attributes'].values[start: end]
  # TODO change to poses
  poses = examples['poses'].values[start: end]
  tags = examples['tags'].values[start: end]
  ners = examples['ners'].values[start: end]
  ori_tokens_list = examples['ori_tokens'].values[start: end]
  
  try:
    labels = examples[CLASSES].values[start: end]
  except Exception:
    labels = [[0.] * len(CLASSES)] * len(ids)

  with melt.tfrecords.Writer(out_file) as writer:
    for id, comment, label, comment_tokens, ori_tokens, tokens_info, pos, tag, ner in tqdm(zip(ids, comments, labels, tokens_list, ori_tokens_list, tokens_infos, poses, tags, ners)):
      if not isinstance(comment, str):
        comment = 'ok'
      comment_str = comment

      comment_tokens = comment_tokens.split(' ')
      tokens_info = tokens_info.split(' ')
      pos = pos.split(' ')
      tag = tag.split(' ')
      ner = ner.split(' ')
      ori_tokens = ori_tokens.split(' ')

      if FLAGS.comment_limit:
        comment_tokens = comment_tokens[:FLAGS.comment_limit]
        ori_tokens = ori_tokens[:FLAGS.comment_limit]
        tokens_info = tokens_info[:len(attribute_names) * FLAGS.comment_limit]

      pos_ids = [get_char_id(x, pos_vocab) for x in pos]
      tag_ids = [get_char_id(x, tag_vocab) for x in tag]
      ner_ids = [get_char_id(x, ner_vocab) for x in ner]

      # NOTICE comment_ids with vocab(all train + test word so no unk)
      if not FLAGS.lower:
        comment_ids = [get_id(token, vocab) for token in comment_tokens]
        #comment_ids_withunk = [get_id(token, unk_vocab) for token in comment_tokens]
      else:
        comment_ids = [get_id(token.lower(), vocab) for token in comment_tokens]
        #comment_ids_withunk = [get_id(token.lower(), unk_vocab) for token in comment_tokens]

      comment_tokens_str = '|'.join([vocab.key(id) for id in comment_ids])
      label = list(map(float, label))

      tokens_info = list(map(float, tokens_info))

      #print(len(comment_ids), len(tokens_info) / len(attribute_names), len(tokens_info) / len(comment_ids))
      assert len(tokens_info) == len(attribute_names) * len(comment_ids), '%d %f' %(len(comment_ids), len(tokens_info) / len(attribute_names))


      #comment_chars = [list(token) for token in comment_tokens]
      ## CHANGE to use ori tokens so e.g. fu**ck still encodes the ** chars, but a token like NiggerMan that was split into Nigger Man will have the chars of NiggerMan encoded twice
      chars_list = [list(token) for token in ori_tokens]
      char_ids = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      assert len(comment_ids) == len(chars_list), '{} {} {} {} {}'.format(len(comment_ids), len(chars_list), comment, comment_tokens, ori_tokens)
      
      for i, chars in enumerate(chars_list):
        for j, ch in enumerate(chars):
          if j == FLAGS.char_limit:
            break
          char_ids[i, j] = get_char_id(ch, char_vocab)

      char_ids = list(char_ids.reshape(-1))

      #print(char_ids)

      # --------------simple char
      simple_char_ids = []
      for ch in list(comment):
        id_ = get_char_id(ch, char_vocab)
        #if id_ == char_vocab.unk_id():
        #  continue
        simple_char_ids.append(id_)
        if len(simple_char_ids) == FLAGS.simple_char_limit:
          break

      simple_chars_str = ''.join([char_vocab.key(id) for id in simple_char_ids])
      #print(simple_char_ids, simple_chars_str)

      # # --------------simple ngram
      # simple_ngrams = gezi.get_ngrams(comment)
      # simple_ngrams = simple_ngrams[:FLAGS.simple_char_limit * 5]
      # simple_ngram_ids = [get_ngram_id(ngram, ngram_vocab) for ngram in simple_ngrams]

      # --------------ngram
      ngram_ids_list = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      if not FLAGS.ftngram:
        #ngrams_list = [gezi.get_ngrams(token) for token in ori_tokens]
        if not FLAGS.ngram_lower:
          ngrams_list = [gezi.get_ngrams(token, FLAGS.ngram_min, FLAGS.ngram_max) for token in comment_tokens]
        else:
          ngrams_list = [gezi.get_ngrams(token.lower(), FLAGS.ngram_min, FLAGS.ngram_max) for token in comment_tokens]

        for i, ngrams in enumerate(ngrams_list):
          for j, ngram in enumerate(ngrams):
            if j == FLAGS.char_limit:
              break
            #assert get_ngram_id(ngram, ngram_vocab) < 20003
            ngram_ids_list[i, j] = get_ngram_id(ngram, ngram_vocab)
      else:
        #for i, (token, ori_token) in enumerate(zip(comment_tokens, ori_tokens)):
        for i, (token, ori_token) in enumerate(zip(comment_tokens, comment_tokens)):
          ngram_ids = gezi.fasttext_ids(ori_token, vocab, FLAGS.ngram_buckets, FLAGS.ngram_min, FLAGS.ngram_max)
          if len(ngram_ids) >= FLAGS.char_limit:
            ngram_ids = gezi.fasttext_ids(token, vocab, FLAGS.ngram_buckets, FLAGS.ngram_min, FLAGS.ngram_max)
          ngram_ids = ngram_ids[:FLAGS.char_limit]
          for j, ngram_id in enumerate(ngram_ids):
            ngram_ids_list[i, j] = ngram_id

      ngram_ids = list(ngram_ids_list.reshape(-1))

      # # ---------------fngrams(full ngrams)
      # fngrams_list = [gezi.get_ngrams_hash(token, FLAGS.ngram_buckets, 3, 6, reserve=3) for token in ori_tokens]
      # fngram_ids =  np.zeros([len(comment_ids), FLAGS.ngram_limit], dtype=np.int32)
      # for i, fngrams in enumerate(fngrams_list):
      #   for j, fngram in enumerate(fngrams):
      #     if j == FLAGS.ngram_limit:
      #       break
      #     fngram_ids[i, j] = fngram
      # fngram_ids = list(fngram_ids.reshape(-1))

      # global info per comment  7 features
      comment_info = []
      comment_info.append(len(ori_tokens))
      comment_info.append(len(comment_tokens))
      #comment_len = sum[len(x) for x in ori_tokens]
      comment_len = len(comment_str)
      comment_info.append(comment_len)
      comment_info.append(comment_len / (len(ori_tokens) + 1))
      num_unks = len([x for x in comment_ids if x == vocab.unk_id()])
      comment_info.append(num_unks)
      comment_info.append(num_unks / len(comment_tokens))
      comment_info.append(enprob_dict[id])

      record = tf.train.Example(features=tf.train.Features(feature={
                                "comment": melt.int64_feature(comment_ids),
                                #"comment_withunk": melt.int64_feature(comment_ids_withunk),
                                "tokens_info": melt.float_feature(tokens_info),
                                "comment_info": melt.float_feature(comment_info),
                                "pos": melt.int64_feature(pos_ids),
                                "tag": melt.int64_feature(tag_ids),
                                "ner": melt.int64_feature(ner_ids),
                                "comment_chars": melt.int64_feature(char_ids),
                                "comment_ngrams": melt.int64_feature(ngram_ids),
                                "simple_chars": melt.int64_feature(simple_char_ids),
                                #"simple_ngrams": melt.int64_feature(simple_ngram_ids),
                                #"comment_fngrams": melt.int64_feature(fngram_ids),
                                #"simple_chars_str": melt.bytes_feature(simple_chars_str),
                                "classes": melt.float_feature(label),
                                "id": melt.bytes_feature(id),
                                "weight": melt.float_feature([FLAGS.weight]),
                                "comment_str": melt.bytes_feature(comment_str),
                                "comment_tokens_str": melt.bytes_feature(comment_tokens_str)
                                }))
      
      writer.write(record)
      global counter
      with counter.get_lock():
        counter.value += 1

    print("Build {} instances of features in total".format(writer.size()))
    writer.close()
def build_features(index):
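    # Worker for one output shard: encode the raw content with text2ids_ and
    # write id/label/content/content_str records, skipping very long or very
    # short contents for non test/valid modes.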
    mode = get_mode(FLAGS.input)

    start_index = 0 if not FLAGS.use_fold else 1
    out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(
        mode, index + start_index)
    os.system('mkdir -p %s' % os.path.dirname(out_file))
    print('---out_file', out_file)
    # TODO now only gen one tfrecord file

    total = len(df)
    num_records = FLAGS.num_records_
    if mode in ['valid', 'test', 'dev', 'pm']:
        num_records = 1
    start, end = gezi.get_fold(total, num_records, index)

    print('infile', FLAGS.input, 'out_file', out_file)

    max_len = 0
    max_num_ids = 0
    num = 0
    with melt.tfrecords.Writer(out_file) as writer:
        for i in range(start, end):
            try:
                row = df.iloc[i]
                id = row[0]
                content = row[1]

                #print(content, type(content))
                if len(content) > max_len:
                    max_len = len(content)
                    print('max_len', max_len)

                if len(content) > 3000:
                    print(id, content)
                    if mode not in ['test', 'valid']:
                        continue

                label = list(row[2:])

                #label = [x + 2 for x in label]
                #num_labels = len(label)

                content_ids = text2ids_(content)

                if len(content_ids) < 5 and mode not in ['test', 'valid']:
                    continue

                limit = FLAGS.limit
                if len(content_ids) > max_num_ids:
                    max_num_ids = len(content_ids)
                    print('max_num_ids', max_num_ids)
                content_ids = content_ids[:limit]

                feature = {
                    'id': melt.bytes_feature(str(id)),
                    'label': melt.int64_feature(label),
                    'content': melt.int64_feature(content_ids),
                    'content_str': melt.bytes_feature(content),
                    'source': melt.bytes_feature(mode),
                }

                # TODO currently we do not get exact info on whether 1 image or 3 are shown ...
                record = tf.train.Example(features=tf.train.Features(
                    feature=feature))

                if num % 1000 == 0:
                    print(num)

                writer.write(record)
                num += 1
                global counter
                with counter.get_lock():
                    counter.value += 1
                global total_words
                with total_words.get_lock():
                    total_words.value += len(content_ids)
            except Exception:
                #print(traceback.format_exc(), file=sys.stderr)
                pass
def build_features(index):
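    # Worker for one output shard: extended news-impression variant of the
    # first build_features, additionally writing cat/sub_cat, title/abstract
    # entity ids and entity types for both the click history and the candidate
    # doc.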
    total = len(df)
    start, end = gezi.get_fold(total, FLAGS.num_records, index)
    df_ = df.iloc[start:end]

    num_records = 0

    buffer_size = None if (
        FLAGS.mark != 'train'
        or not FLAGS.shuffle_impressions) else FLAGS.shuffle_buffer_size
    ofile = f'{FLAGS.out_dir}/{FLAGS.mark}/record_{index}.TMP'
    folder_name = FLAGS.mark
    if FLAGS.neg_parts > 1:
        folder_name = f'{FLAGS.mark}-{FLAGS.neg_part}'
        os.system(f'mkdir -p {FLAGS.out_dir}/{folder_name}')
        ofile = f'{FLAGS.out_dir}/{FLAGS.mark}-{FLAGS.neg_part}/record_{index}.TMP'
    writer = melt.tfrecords.Writer(ofile, buffer_size=buffer_size)

    if FLAGS.mark == 'train' and FLAGS.train_by_day:
        # 2019 11 9 -> 11 14
        num_days = 7
        num_records_list = [0] * num_days
        ofiles = []
        writers = []
        for i in range(num_days):
            os.system(f'mkdir -p {FLAGS.out_dir}/{folder_name}-days/{i}')
            ofiles += [
                f'{FLAGS.out_dir}/{folder_name}-days/{i}/record_{index}.TMP'
            ]
            writers += [
                melt.tfrecords.Writer(ofiles[-1], buffer_size=buffer_size)
            ]

    for _, row in tqdm(df_.iterrows(), total=len(df_), ascii=True):
        time_ = row['time']
        day = int(time_.split()[0].split('/')[1]) - 9
        if FLAGS.day is not None and day != FLAGS.day:
            continue

        x = to_datetime(time_)
        weekday = x.weekday()
        hour = x.hour
        # timestamp = to_timestamp(x)
        timestamp = row['timestamp']

        impressions = row['impressions'].split()
        impression_id = row['impression_id']
        uid = uid_vocab.id(row['uid'])

        try:
            history = [
                did_vocab.id(x) for x in reversed(row['history'].split())
            ]
        except Exception:
            # print(row['history'], row['impression_id'])
            history = []

        feature = {}
        feature['uid_'] = row['uid']
        feature['uid'] = uid
        feature['day'] = day
        feature['weekday'] = weekday
        feature['hour'] = hour
        feature['impression_id'] = impression_id
        feature['uid_in_train'] = int(uid_vocab2.has(row['uid']))
        feature['impression_len'] = len(impressions)
        feature['hist_len'] = len(history)
        feature['history'] = history
        if FLAGS.record_padded:
            feature['history'] = gezi.pad(feature['history'],
                                          FLAGS.max_history)
        else:
            feature['history'] = feature['history'][:FLAGS.max_history]

        if FLAGS.use_impressions:
            feature['impressions'] = [
                did_vocab.id(x.split('-')[0]) for x in impressions
            ]

        # cat / sub_cat of the current doc

        # entities / entity types of the current doc

        feature['history_title_entities'] = []
        feature['history_title_entity_types'] = []
        feature['history_abstract_entities'] = []
        feature['history_abstract_entity_types'] = []
        for did in history:
            if did == 0:
                break
            did = did_vocab.key(did)
            news = news_info[did]

            try:
                title_entities = json.loads(news['title_entities'])
                for i, m in enumerate(title_entities):
                    if i == 2:
                        break
                    entity = m['WikidataId']
                    feature['history_title_entities'] += [
                        entity_vocab.id(entity)
                    ]
                    feature['history_title_entity_types'] += [
                        entity_type_vocab.id(m['Type'])
                    ]
            except Exception:
                pass

            try:
                abstract_entities = json.loads(news['abstract_entities'])
                for i, m in enumerate(abstract_entities):
                    if i == 2:
                        break
                    entity = m['WikidataId']
                    feature['history_abstract_entities'] += [
                        entity_vocab.id(entity)
                    ]
                    feature['history_abstract_entity_types'] += [
                        entity_type_vocab.id(m['Type'])
                    ]
            except Exception:
                pass

        if FLAGS.record_padded:
            feature['history_title_entities'] = pad(
                feature['history_title_entities'],
                FLAGS.max_history * FLAGS.max_his_title_entities)
            feature['history_title_entity_types'] = gezi.pad(
                feature['history_title_entity_types'],
                FLAGS.max_history * FLAGS.max_his_title_entities)
            feature['history_abstract_entities'] = pad(
                feature['history_abstract_entities'],
                FLAGS.max_history * FLAGS.max_his_abstract_entities)
            feature['history_abstract_entity_types'] = pad(
                feature['history_abstract_entity_types'],
                FLAGS.max_history * FLAGS.max_his_abstract_entities)
        else:
            feature['history_title_entities'] = feature[
                'history_title_entities'][:FLAGS.max_history *
                                          FLAGS.max_his_title_entities]
            feature['history_title_entity_types'] = feature[
                'history_title_entity_types'][:FLAGS.max_history *
                                              FLAGS.max_his_title_entities]
            feature['history_abstract_entities'] = feature[
                'history_abstract_entities'][:FLAGS.max_history *
                                             FLAGS.max_his_abstract_entities]
            feature['history_abstract_entity_types'] = feature[
                'history_abstract_entity_types'][:FLAGS.max_history * FLAGS.
                                                 max_his_abstract_entities]

        if FLAGS.neg_parts > 1:
            indexes = list(range(len(impressions)))
            np.random.shuffle(indexes)

        prev_cat, prev_sub_cat = X, X
        recall_cats, recall_sub_cats = defaultdict(int), defaultdict(int)
        for i, impression in enumerate(impressions):
            did_ = impression.split('-')[0]
            news = news_info[did_]
            cat, sub_cat = news['cat'], news['sub_cat']
            recall_cats[cat] += 1
            recall_sub_cats[sub_cat] += 1

        for i, impression in enumerate(impressions):
            if '-' in impression:
                did_, click = impression.split('-')
            else:
                did_, click = impression, '0'
            click = int(click)

            if FLAGS.neg_parts > 1:
                if not click and indexes[i] % FLAGS.neg_parts != FLAGS.neg_part:
                    continue

            start_timestamp = start_timestamps[did_]
            fresh = timestamp - start_timestamp
            did = did_vocab.id(did_)

            feature['fresh'] = fresh
            feature['did_in_train'] = int(did_vocab2.has(did_))

            feature['click'] = click
            feature['did_'] = did_
            feature['did'] = did
            feature['id'] = impression_id * 100 + i
            feature['position'] = i

            news = news_info[did_]

            feature['cat'] = cat_vocab.id(news['cat'])
            feature['sub_cat'] = scat_vocab.id(news['sub_cat'])
            feature['title_len'] = len(news['title'].split())
            try:
                feature['abstract_len'] = len(news['abstract'].split())
            except Exception:
                # Nan
                feature['abstract_len'] = 0

            feature['title_entities'] = []
            feature['title_entity_types'] = []
            feature['abstract_entities'] = []
            feature['abstract_entity_types'] = []

            try:
                title_entities = json.loads(news['title_entities'])
                for m in title_entities:
                    entity = m['WikidataId']
                    feature['title_entities'].append(entity_vocab.id(entity))
                    feature['title_entity_types'].append(
                        entity_type_vocab.id(m['Type']))
            except Exception:
                pass

            try:
                abstract_entities = json.loads(news['abstract_entities'])
                for m in abstract_entities:
                    entity = m['WikidataId']
                    feature['abstract_entities'].append(
                        entity_vocab.id(entity))
                    feature['abstract_entity_types'].append(
                        entity_type_vocab.id(m['Type']))
            except Exception:
                pass

            if FLAGS.record_padded:
                for key in ['title_entities', 'title_entity_types']:
                    feature[key] = pad(feature[key], FLAGS.max_title_entities)

                for key in ['abstract_entities', 'abstract_entity_types']:
                    feature[key] = pad(feature[key],
                                       FLAGS.max_abstract_entities)

            # feature['impression_prev_cat'] = prev_cat
            # feature['impression_prev_sub_cat'] = prev_sub_cat

            # prev_cat = cat_vocab.id(news['cat'])
            # prev_sub_cat = scat_vocab.id(news['sub_cat'])

            # feature['impression_cat_ratio'] = recall_cats[news['cat']] / len(impressions)
            # feature['impression_sub_cat_ratio'] = recall_sub_cats[news['sub_cat']] / len(impressions)

            if FLAGS.use_impressions:
                feature['impressions'] = feature['impressions'][
                    max(0, i - 5):min(len(impressions), i + 4)]
                if FLAGS.record_padded:
                    feature['impressions'] = gezi.pad(feature['impressions'],
                                                      FLAGS.max_impressions)

            feature_ = {}
            for key in feature:
                feature_[key] = feature[key]
                if isinstance(feature[key], (list, tuple)) and not feature[key]:
                    feature_[key] = [X]
            for key in feature_:
                try:
                    feature_[key] = melt.gen_feature(feature_[key])
                except Exception:
                    print(key, feature[key])
                    print(traceback.format_exc())
                    exit(0)

            record = tf.train.Example(features=tf.train.Features(
                feature=feature_))

            if FLAGS.mark == 'train' and FLAGS.train_by_day:
                writer = writers[day]

            writer.write(record)

            if FLAGS.mark == 'train' and FLAGS.train_by_day:
                num_records_list[day] += 1
            else:
                num_records += 1

    if FLAGS.mark == 'train' and FLAGS.train_by_day:
        for i in range(num_days):
            writers[i].close()
            if num_records_list[i] == 0:
                os.system('rm -rf %s' % ofiles[i])
            else:
                ofile2 = ofiles[i].replace('.TMP', f'.{num_records_list[i]}')
                os.system('mv %s %s' % (ofiles[i], ofile2))
    else:
        writer.close()
        if num_records == 0:
            os.system('rm -rf %s' % ofile)
        else:
            ofile2 = ofile.replace('.TMP', f'.{num_records}')
            os.system('mv %s %s' % (ofile, ofile2))