Example #1
def create_examples(data,
                    bert_client,
                    training=True,
                    label2int=None,
                    class_weight=None):
    """
    data: pd.DataFrame
    label2int: dict
    class_weight: list

    yield examples
    """
    idx_start = data.index[0]

    A_encoded = bert_client.encode(data['title1_en'].tolist())
    B_encoded = bert_client.encode(data['title2_en'].tolist())

    for i in range(len(data)):
        feature = {
            'A_encoded': Feature(float_list=FloatList(value=A_encoded[i])),
            'B_encoded': Feature(float_list=FloatList(value=B_encoded[i]))
        }
        if training:
            label = label2int[data.loc[idx_start + i, 'label']]
            feature['label'] = Feature(int64_list=Int64List(value=[label]))
            feature['class_weight'] = Feature(float_list=FloatList(
                value=[class_weight[label]]))
        else:
            feature['id'] = Feature(int64_list=Int64List(
                value=[data.loc[idx_start + i, 'id']]))

        yield Example(features=Features(feature=feature))
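The snippets on this page use the proto classes unqualified, so they presumably rely on imports along the lines of the sketch below (plus something like bert-as-service's BertClient for `bert_client`). The write/read round trip is likewise an assumption about how the generator might be consumed, not part of the original code; the feature names mirror Example #1, but the 768-dim encoding size and the file name are guesses.

import tensorflow as tf
from tensorflow.train import (Example, Features, Feature,
                              BytesList, FloatList, Int64List)

def write_examples(examples, path):
    # Serialize each tf.train.Example yielded by create_examples into a TFRecord file.
    with tf.io.TFRecordWriter(path) as writer:
        for ex in examples:
            writer.write(ex.SerializeToString())

# Hypothetical parsing spec for reading the training records back; the 768-dim
# encoding size is an assumption (bert-as-service default), adjust as needed.
train_spec = {
    'A_encoded': tf.io.FixedLenFeature([768], tf.float32),
    'B_encoded': tf.io.FixedLenFeature([768], tf.float32),
    'label': tf.io.FixedLenFeature([], tf.int64),
    'class_weight': tf.io.FixedLenFeature([], tf.float32),
}

def parse_train(serialized):
    return tf.io.parse_single_example(serialized, train_spec)

# dataset = tf.data.TFRecordDataset('train.tfrecord').map(parse_train)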
Example #2
def serialise(data):
    
    ID, pos, dimensions, color, border, fill, text, img, seq_len, seq_mask = (
        data['ID'], data['pos'], data['dimensions'], data['color'],
        data['border'], data['fill'], data['text'], data['img'],
        int(data['seq_len']), data['seq_mask'])

    ID = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(ID).numpy(),]))
    pos = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(tf.cast(pos, tf.float32)).numpy(),]))
    dimensions = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(tf.cast(dimensions, tf.float32)).numpy(),]))
    color = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(tf.cast(color, tf.float32)).numpy(),]))
    border = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(tf.cast(border, tf.float32)).numpy(),]))
    fill = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(tf.cast(fill, tf.float32)).numpy(),]))
    text = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(tf.cast(text, tf.float32)).numpy(),]))
    img = Feature(bytes_list=BytesList(value=[img.numpy(),]))
    seq_len =  Feature(int64_list=Int64List(value=[seq_len,]))
    seq_mask = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(seq_mask).numpy(),]))
    # img is already serialised because we never decode it!
    
    features = Features(feature={
                'ID' : ID,
                'pos' : pos,
                'dimensions' : dimensions,
                'color' : color,
                'border' : border,
                'fill' : fill,
                'text' : text,
                'img': img,
                'seq_len':seq_len,
                'seq_mask':seq_mask,
                })
    
    example = Example(features=features)
    
    return example.SerializeToString()
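A hedged sketch of a matching parse function, assuming the same feature names, that the cast-to-float32 tensors above are read back as float32, and guessing the dtypes of `ID` and `seq_mask` (they are serialized without a cast, so their true dtypes are not visible here):

def parse_serialised(serialized_example):
    # Counterpart sketch for serialise() above; not part of the original code.
    spec = {k: tf.io.FixedLenFeature([], tf.string)
            for k in ['ID', 'pos', 'dimensions', 'color', 'border',
                      'fill', 'text', 'img', 'seq_mask']}
    spec['seq_len'] = tf.io.FixedLenFeature([], tf.int64)
    parsed = tf.io.parse_single_example(serialized_example, spec)
    out = {'seq_len': parsed['seq_len'],
           'img': parsed['img']}  # raw image bytes as stored; decode downstream
    for k in ['pos', 'dimensions', 'color', 'border', 'fill', 'text']:
        out[k] = tf.io.parse_tensor(parsed[k], out_type=tf.float32)
    out['ID'] = tf.io.parse_tensor(parsed['ID'], out_type=tf.string)            # assumed dtype
    out['seq_mask'] = tf.io.parse_tensor(parsed['seq_mask'], out_type=tf.bool)  # assumed dtype
    return out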
Example #3
def make_tf_examples(string_features, int_features, labels):
    int_features += [[label] for label in zero_norm_labels(labels)]
    string_features = [
        Feature(bytes_list=BytesList(value=val)) for val in string_features
    ]
    int_features = [
        Feature(int64_list=Int64List(value=val)) for val in int_features
    ]
    all_features = string_features + int_features
    return [
        Example(features=Features(
            feature={
                "left": left,
                "target": target,
                "right": right,
                "left_ids": left_ids,
                "target_ids": target_ids,
                "right_ids": right_ids,
                "labels": label,
            })) for (
                left,
                target,
                right,
                left_ids,
                target_ids,
                right_ids,
                label,
            ) in zip(*split_list(all_features, parts=7))
    ]
Example #4
def convert_to_example(
    adj,
    feature,
    label_data=None,
    label_mask=None,
):
    """
    Writes graph related data to disk.
    """
    adj_row, adj_col = np.nonzero(adj)
    adj_values = adj[adj_row, adj_col]
    adj_elem_len = len(adj_row)
    degrees = np.sum(adj, 0)
    adj_degrees = []
    for ar, ac in zip(adj_row, adj_col):
        if ar == ac:
            adj_degrees.append(0)
        else:
            adj_degrees.append(int(degrees[ar]))
    feature = np.array(feature)
    feature_row, feature_col = np.nonzero(feature)
    feature_values = feature[feature_row, feature_col]
    feature_elem_len = len(feature_row)
    feature = {
        'adj_row': Feature(int64_list=Int64List(value=list(adj_row))),
        'adj_column': Feature(int64_list=Int64List(value=list(adj_col))),
        'adj_values': Feature(float_list=FloatList(value=list(adj_values))),
        'adj_elem_len': Feature(int64_list=Int64List(value=[adj_elem_len])),
        'adj_degrees': Feature(int64_list=Int64List(value=adj_degrees)),
        'feature_row': Feature(int64_list=Int64List(value=list(feature_row))),
        'feature_column':
        Feature(int64_list=Int64List(value=list(feature_col))),
        'feature_values':
        Feature(float_list=FloatList(value=list(feature_values))),
        'feature_elem_len':
        Feature(int64_list=Int64List(value=[feature_elem_len])),
        'size': Feature(int64_list=Int64List(value=list(feature.shape)))
    }
    if label_data is not None:
        label_data = np.nan_to_num(label_data)
        feature['label'] = Feature(int64_list=Int64List(
            value=label_data.astype(int)))
        feature['mask_label'] = Feature(int64_list=Int64List(
            value=label_mask.astype(int)))
    features = Features(feature=feature)
    ex = Example(features=features)
    return ex.SerializeToString()
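The record stores the adjacency and node-feature matrices in a sparse row/column/values layout, with `size` holding the feature matrix shape. A hypothetical reader could rebuild dense tensors roughly as below; the parsing spec and the assumption that the feature matrix has one row per node are mine, not part of the original code.

def parse_graph_example(serialized):
    # Sketch only: rebuild dense adj/feature matrices from the sparse encoding above.
    spec = {
        'adj_row': tf.io.VarLenFeature(tf.int64),
        'adj_column': tf.io.VarLenFeature(tf.int64),
        'adj_values': tf.io.VarLenFeature(tf.float32),
        'feature_row': tf.io.VarLenFeature(tf.int64),
        'feature_column': tf.io.VarLenFeature(tf.int64),
        'feature_values': tf.io.VarLenFeature(tf.float32),
        'size': tf.io.FixedLenFeature([2], tf.int64),
    }
    p = tf.io.parse_single_example(serialized, spec)
    size = p['size']   # feature matrix shape: [num_nodes, num_features] (assumed)
    n = size[0]        # number of nodes
    adj_indices = tf.stack([tf.sparse.to_dense(p['adj_row']),
                            tf.sparse.to_dense(p['adj_column'])], axis=1)
    adj = tf.scatter_nd(adj_indices,
                        tf.sparse.to_dense(p['adj_values']),
                        shape=tf.stack([n, n]))
    feat_indices = tf.stack([tf.sparse.to_dense(p['feature_row']),
                             tf.sparse.to_dense(p['feature_column'])], axis=1)
    feat = tf.scatter_nd(feat_indices,
                         tf.sparse.to_dense(p['feature_values']),
                         shape=size)
    return adj, feat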
Example #5
 def create_example(features: np.ndarray, label: np.int32):
     return Example(features=Features(
         feature={
             "features":
             NumpyToRecordConverter._bytes_feature(
                 tf.io.serialize_tensor(features)),
             "label":
             Feature(int64_list=Int64List(value=[label]))
         })).SerializeToString()
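`NumpyToRecordConverter._bytes_feature` is not shown on this page; it presumably wraps the bytes of a serialized tensor in a bytes Feature, along the lines of this hypothetical sketch:

 @staticmethod
 def _bytes_feature(value):
     # Hypothetical helper: wrap an eager string tensor (or raw bytes) in a bytes Feature.
     if isinstance(value, type(tf.constant(0))):
         value = value.numpy()  # EagerTensor -> bytes
     return Feature(bytes_list=BytesList(value=[value]))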
Example #6
def serialise_traj(data):
    
    features = {k: Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(v).numpy(),])) for k,v in data.items() if k not in ['seq_lens']}
    features['seq_lens'] =  Feature(int64_list=Int64List(value=[data['seq_lens'],]))

    example = Example(features=Features(feature=features))
    
    
    return example.SerializeToString()
Example #7
def serialise_vid(data):
    
    # seq_lens, masks, imgs, goal_imgs,label, label_embedding, tag = data['seq_lens'], data['masks'], data['imgs'], data['goal_imgs'], data['label'], data['label_embedding'], data['tag']
    
    features = {k: Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(v).numpy(),])) for k,v in data.items() if k not in ['seq_lens']}
    features['seq_lens'] =  Feature(int64_list=Int64List(value=[data['seq_lens'],]))

    example = Example(features=Features(feature=features))
    
    
    return example.SerializeToString()
Example #8
    def __encode_input(self, mr, input_encoder):
        """Encodes the input, and creates a TF Example record out of it."""

        input_ids = input_encoder.encode(mr)
        input_ids.append(text_encoder.EOS_ID)

        features = {'inputs': Feature(int64_list=Int64List(value=input_ids))}

        example = Example(features=Features(feature=features))

        return example.SerializeToString()
Example #9
File: prep.py Project: wibrow/kGCN
def write_to_tfrecords(adj, feature, label_data, label_mask, tfrname):
    """
    Writes graph related data to disk.
    """
    adj_row, adj_col = np.nonzero(adj)
    adj_values = adj[adj_row, adj_col]
    adj_elem_len = len(adj_row)
    feature = np.array(feature)
    feature_row, feature_col = np.nonzero(feature)
    feature_values = feature[feature_row, feature_col]
    feature_elem_len = len(feature_row)
    features = Features(
        feature={
            'label':
            Feature(int64_list=Int64List(value=label_data)),
            'mask_label':
            Feature(int64_list=Int64List(value=label_mask)),
            'adj_row':
            Feature(int64_list=Int64List(value=list(adj_row))),
            'adj_column':
            Feature(int64_list=Int64List(value=list(adj_col))),
            'adj_values':
            Feature(float_list=FloatList(value=list(adj_values))),
            'adj_elem_len':
            Feature(int64_list=Int64List(value=[adj_elem_len])),
            'feature_row':
            Feature(int64_list=Int64List(value=list(feature_row))),
            'feature_column':
            Feature(int64_list=Int64List(value=list(feature_col))),
            'feature_values':
            Feature(float_list=FloatList(value=list(feature_values))),
            'feature_elem_len':
            Feature(int64_list=Int64List(value=[feature_elem_len])),
            'size':
            Feature(int64_list=Int64List(value=list(feature.shape)))
        })
    ex = Example(features=features)
    with TFRecordWriter(tfrname) as single_writer:
        single_writer.write(ex.SerializeToString())
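Most of these lists vary in length per graph, so a reader would typically declare them as variable-length features. Below is a hedged sketch of how the file written above might be loaded back; the spec mirrors the feature names but is an assumption, not code from the kGCN project.

def read_graph_tfrecords(tfrname):
    # Sketch only: parse records written by write_to_tfrecords.
    spec = {
        'label': tf.io.VarLenFeature(tf.int64),
        'mask_label': tf.io.VarLenFeature(tf.int64),
        'adj_row': tf.io.VarLenFeature(tf.int64),
        'adj_column': tf.io.VarLenFeature(tf.int64),
        'adj_values': tf.io.VarLenFeature(tf.float32),
        'adj_elem_len': tf.io.FixedLenFeature([1], tf.int64),
        'feature_row': tf.io.VarLenFeature(tf.int64),
        'feature_column': tf.io.VarLenFeature(tf.int64),
        'feature_values': tf.io.VarLenFeature(tf.float32),
        'feature_elem_len': tf.io.FixedLenFeature([1], tf.int64),
        'size': tf.io.FixedLenFeature([2], tf.int64),
    }
    dataset = tf.data.TFRecordDataset(tfrname)
    return dataset.map(lambda ex: tf.io.parse_single_example(ex, spec))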
Example #10
    def _process_page(page_title: str):
        if _entity_vocab.contains(page_title, _language):
            page_id = _entity_vocab.get_id(page_title, _language)
        else:
            page_id = -1

        sentences = []

        def tokenize(text: str, add_prefix_space: bool):
            # clean up multiple spaces
            text = re.sub(r"\s+", " ", text).rstrip()
            if not text:
                return []
            if isinstance(_tokenizer, RobertaTokenizer):
                return _tokenizer.tokenize(text,
                                           add_prefix_space=add_prefix_space)
            else:
                return _tokenizer.tokenize(text)

        for paragraph in _dump_db.get_paragraphs(page_title):

            paragraph_text = paragraph.text

            # First, get paragraph links.
            # Paragraph links are represented as (link_title) and the start/end positions of strings
            # (link_start, link_end).
            paragraph_links = []
            for link in paragraph.wiki_links:
                link_title = _dump_db.resolve_redirect(link.title)
                # remove category links
                if link_title.startswith("Category:") and link.text.lower(
                ).startswith("category:"):
                    paragraph_text = (paragraph_text[:link.start] + " " *
                                      (link.end - link.start) +
                                      paragraph_text[link.end:])
                else:
                    if _entity_vocab.contains(link_title, _language):
                        paragraph_links.append(
                            (link_title, link.start, link.end))
                    elif _include_unk_entities:
                        paragraph_links.append(
                            (UNK_TOKEN, link.start, link.end))

            sent_spans = _sentence_tokenizer.span_tokenize(
                paragraph_text.rstrip())
            for sent_start, sent_end in sent_spans:
                cur = sent_start
                sent_words = []
                sent_links = []
                # Look for links that are within the tokenized sentence.
                # If a link is found, we separate the sentences across the link and tokenize them.
                for link_title, link_start, link_end in paragraph_links:
                    if not (sent_start <= link_start < sent_end
                            and link_end <= sent_end):
                        continue
                    entity_id = _entity_vocab.get_id(link_title, _language)

                    # read from the beginning of the sentence (or current cursor) to beginning of linked text
                    text = paragraph_text[cur:link_start]
                    # the add_prefix_space thing is because of the way RoBERTa was trained
                    # from tf library: "This tokenizer has been trained to treat spaces like parts of the tokens
                    # (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning
                    #  of the sentence (without space) or not"
                    if cur == 0 or text.startswith(" ") or paragraph_text[
                            cur - 1] == " ":
                        sent_words += tokenize(text, True)
                    else:
                        sent_words += tokenize(text, False)

                    # read the linked text
                    link_text = paragraph_text[link_start:link_end]

                    # tokenize the linked words, add spaces as necessary
                    if link_start == 0 or link_text.startswith(
                            " ") or paragraph_text[link_start - 1] == " ":
                        link_words = tokenize(link_text, True)
                    else:
                        link_words = tokenize(link_text, False)

                    # add the entities + the start and end number of tokens for the entity
                    # IMPORTANT
                    sent_links.append((entity_id, len(sent_words),
                                       len(sent_words) + len(link_words)))
                    # add entity words to the end of the sentence words
                    # this gets us our fully tokenized text
                    sent_words += link_words
                    cur = link_end

                text = paragraph_text[cur:sent_end]
                if cur == 0 or text.startswith(" ") or paragraph_text[
                        cur - 1] == " ":
                    sent_words += tokenize(text, True)
                else:
                    sent_words += tokenize(text, False)

                if len(sent_words) < _min_sentence_length or len(
                        sent_words) > _max_num_tokens:
                    continue
                sentences.append((sent_words, sent_links))

        ret = []
        words = []
        links = []
        # loop through the sentences in the paragraph
        # each sentence is a tf Example
        for i, (sent_words, sent_links) in enumerate(sentences):
            links += [(id_, start + len(words), end + len(words))
                      for id_, start, end in sent_links]
            words += sent_words
            if i == len(sentences) - 1 or len(words) + len(
                    sentences[i + 1][0]) > _max_num_tokens:
                if links or _include_sentences_without_entities:
                    links = links[:_max_entity_length]
                    # get the IDs based on the word list
                    word_ids = _tokenizer.convert_tokens_to_ids(words)
                    assert _min_sentence_length <= len(
                        word_ids) <= _max_num_tokens
                    # get the entity IDs from our entity vocab
                    entity_ids = [id_ for id_, _, _, in links]
                    assert len(entity_ids) <= _max_entity_length
                    # this is the position of the entities in the text?
                    entity_position_ids = itertools.chain(
                        *[(list(range(start, end)) + [-1] *
                           (_max_mention_length - end + start)
                           )[:_max_mention_length] for _, start, end in links])

                    example = tf.train.Example(features=tf.train.Features(
                        feature=dict(
                            page_id=tf.train.Feature(
                                int64_list=tf.train.Int64List(
                                    value=[page_id])),
                            word_ids=tf.train.Feature(
                                int64_list=tf.train.Int64List(value=word_ids)),
                            entity_ids=tf.train.Feature(
                                int64_list=tf.train.Int64List(
                                    value=entity_ids)),
                            entity_position_ids=tf.train.Feature(
                                int64_list=Int64List(
                                    value=entity_position_ids)),
                        )))
                    ret.append((example.SerializeToString()))

                words = []
                links = []
        return ret
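`entity_position_ids` is flattened by `itertools.chain` into a single int list with exactly `_max_mention_length` entries (padded with -1) per entity, so a consumer has to reshape it after parsing. A hypothetical reader sketch follows; the feature names come from the code above, everything else is assumed.

def parse_pretraining_example(serialized, max_mention_length):
    # Sketch only: the three id lists are variable length, page_id is a scalar.
    spec = {
        'page_id': tf.io.FixedLenFeature([], tf.int64),
        'word_ids': tf.io.VarLenFeature(tf.int64),
        'entity_ids': tf.io.VarLenFeature(tf.int64),
        'entity_position_ids': tf.io.VarLenFeature(tf.int64),
    }
    p = tf.io.parse_single_example(serialized, spec)
    word_ids = tf.sparse.to_dense(p['word_ids'])
    entity_ids = tf.sparse.to_dense(p['entity_ids'])
    # Undo the chain() flattening: one row of max_mention_length positions per entity.
    entity_position_ids = tf.reshape(
        tf.sparse.to_dense(p['entity_position_ids']),
        [-1, max_mention_length])
    return p['page_id'], word_ids, entity_ids, entity_position_ids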
Example #11
    def _process_page(page_title: str):
        if _entity_vocab.contains(page_title, _language):
            page_id = _entity_vocab.get_id(page_title, _language)
        else:
            page_id = -1

        sentences = []

        def tokenize(text: str, add_prefix_space: bool):
            text = re.sub(r"\s+", " ", text).rstrip()
            if not text:
                return []
            if isinstance(_tokenizer, RobertaTokenizer):
                return _tokenizer.tokenize(text,
                                           add_prefix_space=add_prefix_space)
            else:
                return _tokenizer.tokenize(text)

        for paragraph in _dump_db.get_paragraphs(page_title):

            paragraph_text = paragraph.text

            # First, get paragraph links.
            # Paragraph links are represented as (link_title) and the start/end positions of strings
            # (link_start, link_end).
            paragraph_links = []
            for link in paragraph.wiki_links:
                link_title = _dump_db.resolve_redirect(link.title)
                # remove category links
                if link_title.startswith("Category:") and link.text.lower(
                ).startswith("category:"):
                    paragraph_text = (paragraph_text[:link.start] + " " *
                                      (link.end - link.start) +
                                      paragraph_text[link.end:])
                else:
                    if _entity_vocab.contains(link_title, _language):
                        paragraph_links.append(
                            (link_title, link.start, link.end))
                    elif _include_unk_entities:
                        paragraph_links.append(
                            (UNK_TOKEN, link.start, link.end))

            sent_spans = _sentence_tokenizer.span_tokenize(
                paragraph_text.rstrip())
            for sent_start, sent_end in sent_spans:
                cur = sent_start
                sent_words = []
                sent_links = []
                # Look for links that are within the tokenized sentence.
                # If a link is found, we separate the sentences across the link and tokenize them.
                for link_title, link_start, link_end in paragraph_links:
                    if not (sent_start <= link_start < sent_end
                            and link_end <= sent_end):
                        continue
                    entity_id = _entity_vocab.get_id(link_title, _language)

                    text = paragraph_text[cur:link_start]
                    if cur == 0 or text.startswith(" ") or paragraph_text[
                            cur - 1] == " ":
                        sent_words += tokenize(text, True)
                    else:
                        sent_words += tokenize(text, False)

                    link_text = paragraph_text[link_start:link_end]

                    if link_start == 0 or link_text.startswith(
                            " ") or paragraph_text[link_start - 1] == " ":
                        link_words = tokenize(link_text, True)
                    else:
                        link_words = tokenize(link_text, False)

                    sent_links.append((entity_id, len(sent_words),
                                       len(sent_words) + len(link_words)))
                    sent_words += link_words
                    cur = link_end

                text = paragraph_text[cur:sent_end]
                if cur == 0 or text.startswith(" ") or paragraph_text[
                        cur - 1] == " ":
                    sent_words += tokenize(text, True)
                else:
                    sent_words += tokenize(text, False)

                if len(sent_words) < _min_sentence_length or len(
                        sent_words) > _max_num_tokens:
                    continue
                sentences.append((sent_words, sent_links))

        ret = []
        words = []
        links = []
        for i, (sent_words, sent_links) in enumerate(sentences):
            links += [(id_, start + len(words), end + len(words))
                      for id_, start, end in sent_links]
            words += sent_words
            if i == len(sentences) - 1 or len(words) + len(
                    sentences[i + 1][0]) > _max_num_tokens:
                if links or _include_sentences_without_entities:
                    links = links[:_max_entity_length]
                    word_ids = _tokenizer.convert_tokens_to_ids(words)
                    assert _min_sentence_length <= len(
                        word_ids) <= _max_num_tokens
                    entity_ids = [id_ for id_, _, _, in links]
                    assert len(entity_ids) <= _max_entity_length
                    entity_position_ids = itertools.chain(
                        *[(list(range(start, end)) + [-1] *
                           (_max_mention_length - end + start)
                           )[:_max_mention_length] for _, start, end in links])

                    example = tf.train.Example(features=tf.train.Features(
                        feature=dict(
                            page_id=tf.train.Feature(
                                int64_list=tf.train.Int64List(
                                    value=[page_id])),
                            word_ids=tf.train.Feature(
                                int64_list=tf.train.Int64List(value=word_ids)),
                            entity_ids=tf.train.Feature(
                                int64_list=tf.train.Int64List(
                                    value=entity_ids)),
                            entity_position_ids=tf.train.Feature(
                                int64_list=Int64List(
                                    value=entity_position_ids)),
                        )))
                    ret.append((example.SerializeToString()))

                words = []
                links = []
        return ret
Example #12
 def _create_int_feature(self, values):
     return Feature(int64_list=Int64List(value=list(values)))
Example #13
def serialise(data):

    obs, acts, goals, seq_lens, masks, dataset_path, tstep_idxs, imgs, goal_imgs, proprioceptive_features = (
        data['obs'], data['acts'], data['goals'], data['seq_lens'], data['masks'],
        data['dataset_path'], data['tstep_idxs'], data['imgs'], data['goal_imgs'],
        data['proprioceptive_features'])

    # obs (1, 40, 18)
    # acts (1, 40, 7)
    # goals (1, 40, 11)
    # seq_lens (1,)
    # masks (1, 40)
    # dataset_path (1, 40)
    # tstep_idxs (1, 40)
    # imgs (1, 40, 200, 200, 3)
    # goal_imgs (1, 40, 200, 200, 3)
    # proprioceptive_features (1, 40, 7)

    goal_imgs = tf.expand_dims(
        goal_imgs[:, 0, :, :, :],
        1)  # create a (:, 1, :, :, :)-shaped goal image tensor for less file IO

    obs = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(obs)).numpy(),
    ]))
    acts = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(acts)).numpy(),
    ]))
    goals = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(goals)).numpy(),
    ]))
    seq_lens = Feature(int64_list=Int64List(value=[
        seq_lens,
    ]))
    masks = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(masks)).numpy(),
    ]))

    imgs = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(imgs)).numpy(),
    ]))
    goal_imgs = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(goal_imgs)).numpy(),
    ]))
    proprioceptive_features = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(proprioceptive_features)).numpy(),
    ]))

    features = Features(
        feature={
            'obs': obs,
            'acts': acts,
            'goals': goals,
            'seq_lens': seq_lens,
            'masks': masks,
            'imgs': imgs,
            'goal_imgs': goal_imgs,
            'proprioceptive_features': proprioceptive_features
        })

    example = Example(features=features)

    return example.SerializeToString()


# Sample Usage
# r = lfp.data.PlayDataloader(include_imgs = args.images, batch_size=1,  window_size=args.window_size_max, min_window_size=args.window_size_min)
# rd = r.extract(TRAIN_DATA_PATHS, from_tfrecords=args.from_tfrecords)
# rd = r.load(rd)
# r_it = iter(rd)

# @tf.function
# def sample():
#   return r_it.next()

# data_paths = [str(STORAGE_PATH/'precompute')+f"/{x}.tfrecords" for x in range(0,8)]
# #@title write to gcs
# from tqdm import tqdm
# for path in data_paths:
#   with tf.io.TFRecordWriter(path) as file_writer:
#     print(path)
#     for i in tqdm(range(0,200)):
#         byte_stream = serialise(sample())
#         file_writer.write(byte_stream)
Example #14
    def _process_page(pmid: str):
        # print("start _process_page", pmid)
        if _entity_vocab.page_contains(pmid):
            # page_id = _entity_vocab.get_id(pmid)
            # TODO: verify if this is okay
            # we just use the PMID as the page_id, it doesn't look like it is used anywhere really
            # so should be fine.
            page_id = int(pmid)
        else:
            page_id = -1

        sentences = []

        def tokenize(text: str, add_prefix_space: bool):
            # clean up multiple spaces
            text = re.sub(r"\s+", " ", text).rstrip()
            if not text:
                return []
            if isinstance(_tokenizer, RobertaTokenizer):
                return _tokenizer.tokenize(text,
                                           add_prefix_space=add_prefix_space)
            else:
                return _tokenizer.tokenize(text)

        # print("start get data")
        # we concatenate the title and abstract like they do in MedMentions to get the entity spans to match
        page_data = _medmentions_db.get_data()[pmid]
        paragraph_text = page_data['title'] + " " + page_data['abstract']
        # print("end get data")
        # First, get paragraph links.
        # Paragraph links are represented as (link_title) and the start/end positions of strings
        # (link_start, link_end).
        paragraph_links = []
        # print("start loop through entities")
        for entity in page_data['entities']:

            if _entity_vocab.contains(entity[4], _language):
                paragraph_links.append((entity[4], entity[0], entity[1]))
            elif _include_unk_entities:
                paragraph_links.append((UNK_TOKEN, entity[0], entity[1]))
        # print("stop loop through entities")
        sent_spans = _sentence_tokenizer.span_tokenize(paragraph_text.rstrip())
        for sent_start, sent_end in sent_spans:
            cur = sent_start
            sent_words = []
            sent_links = []
            # Look for links that are within the tokenized sentence.
            # If a link is found, we separate the sentences across the link and tokenize them.
            for cui_id, ent_start, ent_end in paragraph_links:
                if not (sent_start <= ent_start < sent_end
                        and ent_end <= sent_end):
                    continue
                entity_id = _entity_vocab.get_id(cui_id, _language)

                # read from the beginning of the sentence (or current cursor) to beginning of linked text
                text = paragraph_text[cur:ent_start]

                # the add_prefix_space thing is because of the way RoBERTa was trained
                # from tf library: "This tokenizer has been trained to treat spaces like parts of the tokens
                # (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning
                #  of the sentence (without space) or not"
                if cur == 0 or text.startswith(" ") or paragraph_text[
                        cur - 1] == " ":
                    sent_words += tokenize(text, True)
                else:
                    sent_words += tokenize(text, False)

                # read the linked text
                link_text = paragraph_text[ent_start:ent_end]

                # tokenize the linked words, add spaces as necessary
                if ent_start == 0 or link_text.startswith(
                        " ") or paragraph_text[ent_start - 1] == " ":
                    link_words = tokenize(link_text, True)
                else:
                    link_words = tokenize(link_text, False)

                # add the entities + the start and end number of tokens for the entity
                # IMPORTANT
                sent_links.append((entity_id, len(sent_words),
                                   len(sent_words) + len(link_words)))
                # add entity words to the end of the sentence words
                # this gets us our fully tokenized text
                sent_words += link_words
                cur = ent_end

            text = paragraph_text[cur:sent_end]
            if cur == 0 or text.startswith(" ") or paragraph_text[cur -
                                                                  1] == " ":
                sent_words += tokenize(text, True)
            else:
                sent_words += tokenize(text, False)

            if len(sent_words) < _min_sentence_length or len(
                    sent_words) > _max_num_tokens:
                continue
            sentences.append((sent_words, sent_links))
        # print("finish sent spans")
        ret = []
        words = []
        links = []
        # loop through the sentences in the paragraph
        for i, (sent_words, sent_links) in enumerate(sentences):
            links += [(id_, start + len(words), end + len(words))
                      for id_, start, end in sent_links]
            words += sent_words
            # we only create the tf example on the last sentence/if we hit the max number of tokens
            if i == len(sentences) - 1 or len(words) + len(
                    sentences[i + 1][0]) > _max_num_tokens:
                if links or _include_sentences_without_entities:
                    links = links[:_max_entity_length]
                    # get the IDs based on the word list
                    word_ids = _tokenizer.convert_tokens_to_ids(words)
                    assert _min_sentence_length <= len(
                        word_ids) <= _max_num_tokens
                    # get the entity IDs from our entity vocab
                    entity_ids = [id_ for id_, _, _, in links]
                    assert len(entity_ids) <= _max_entity_length
                    # this is the position of the entities in the text?
                    entity_position_ids = itertools.chain(
                        *[(list(range(start, end)) + [-1] *
                           (_max_mention_length - end + start)
                           )[:_max_mention_length] for _, start, end in links])
                    example = tf.train.Example(features=tf.train.Features(
                        feature=dict(
                            page_id=tf.train.Feature(
                                int64_list=tf.train.Int64List(
                                    value=[page_id])),
                            word_ids=tf.train.Feature(
                                int64_list=tf.train.Int64List(value=word_ids)),
                            entity_ids=tf.train.Feature(
                                int64_list=tf.train.Int64List(
                                    value=entity_ids)),
                            entity_position_ids=tf.train.Feature(
                                int64_list=Int64List(
                                    value=entity_position_ids)),
                        )))
                    ret.append((example.SerializeToString()))

                words = []
                links = []
        # print("about to return")
        return ret