예제 #1
0
def _bytes_feature(value):
    """Return a tf.train Feature wrapping *value* in a one-element BytesList."""
    # BytesList won't unpack a string from an EagerTensor, so pull the raw
    # bytes out first when a tensor is passed in.
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
    bytes_list = tf_train.BytesList(value=[value])
    return tf_train.Feature(bytes_list=bytes_list)
예제 #2
0
def _float_feature(value):
    """Return a tf.train Feature wrapping a float/double (or ndarray) in a FloatList."""
    # Scalars are boxed into a one-element list; ndarrays pass through as-is.
    values = value if isinstance(value, np.ndarray) else [value]
    return tf_train.Feature(float_list=tf_train.FloatList(value=values))
예제 #3
0
def _int64_feature(value):
    """Return a tf.train Feature wrapping a bool/enum/int/uint (or ndarray) in an Int64List."""
    # Scalars are boxed into a one-element list; ndarrays pass through as-is.
    values = value if isinstance(value, np.ndarray) else [value]
    return tf_train.Feature(int64_list=tf_train.Int64List(value=values))
예제 #4
0
def _int64_feature(values):
    """Return a tf.train Feature wrapping *values* (bools/enums/ints/uints) in an Int64List."""
    int64_list = train.Int64List(value=values)
    return train.Feature(int64_list=int64_list)
예제 #5
0
def _bytes_feature(values):
    """Return a tf.train Feature with each string in *values* UTF-8 encoded into a BytesList."""
    encoded = [item.encode("utf-8") for item in values]
    return train.Feature(bytes_list=train.BytesList(value=encoded))
예제 #6
0
def random_n_fold_crossval_tfrec_format(n_folds,
                                        output_location,
                                        file_list=None,
                                        base_name='data',
                                        downsample=True,
                                        downsampling_rate=0.5):
    """Create TensorFlow Records from BIO format for n-fold cross validation.

    For an n-fold cross validation n tfrecs are created. The creation is
    based on counting the number of samples in the file and randomly
    sorting them into n buckets.

    A training and a test version are created for each fold, respectively.
    Downsampling is applied as a very simple batch control on the
    training version only.

    Arguments:
        n_folds {int} -- number of folds for cross validation
        output_location {str} -- path to data

    Keyword Arguments:
        file_list {list} -- BIO files (default: {['BIO_format.txt']})
        base_name {str} -- base name for reading and writing (default: {'data'})
        downsample {bool} -- apply downsampling to corpus (default: {True})
        downsampling_rate {float} -- rate for dropping "empty" sentences
            from the training split (default: {0.5})

    Raises:
        RuntimeError -- if a word is missing from the prebuilt dictionary,
            or if the collected labels/sentences/words lengths disagree.
    """
    # Avoid a mutable default argument; None stands in for the usual file.
    if file_list is None:
        file_list = ['BIO_format.txt']

    def _int64_seq(seq):
        """One single-value Int64List Feature per element, as a FeatureList."""
        return train.FeatureList(feature=[
            train.Feature(int64_list=train.Int64List(value=[v])) for v in seq
        ])

    def _bytes_seq(seq):
        """One single-value BytesList Feature per encoded string, as a FeatureList."""
        return train.FeatureList(feature=[
            train.Feature(bytes_list=train.BytesList(value=[s.encode()]))
            for s in seq
        ])

    # Fixed seed so fold assignment and downsampling are reproducible.
    random.seed(1)
    sample_count, word_count = count_samples(file_list, output_location)
    print("Creating TFRecord for {}-fold random cross validation.".format(
        n_folds))
    print("Found {} input samples (sentences) and {} words.".format(
        sample_count, word_count))

    # All outputs for this configuration live under one run directory whose
    # name encodes the fold count and downsampling settings.
    run_dir = (str(n_folds) + '_cval_random_d_' + str(downsample) + "_" +
               str(downsampling_rate))
    train_path = join(output_location, run_dir, 'records_train')
    train_path_labels = join(output_location, run_dir, 'labels_train')
    test_path = join(output_location, run_dir, 'records_test')
    test_path_labels = join(output_location, run_dir, 'labels_test')
    make_or_clean_if_exists(train_path, train_path_labels, test_path,
                            test_path_labels)

    # Prebuilt vocabularies: one token per line, index = line number.
    with Path(join(output_location, (base_name + '.words.txt'))).open() as f:
        word_to_idx = {line.strip(): idx for idx, line in enumerate(f)}
    with Path(join(output_location, (base_name + '.tag.txt'))).open() as f:
        label_mapping = {line.strip(): idx for idx, line in enumerate(f)}
    with Path(join(output_location, (base_name + '.chars.txt'))).open() as f:
        char_mapping = {line.strip(): idx for idx, line in enumerate(f)}

    # Per-fold writers and bookkeeping (parallel lists, one entry per fold).
    tfwriter_set_train = []
    correct_labels_train = []
    sentences_to_labels_train = []
    words_to_labels_train = []
    tfwriter_set_test = []
    correct_labels_test = []
    sentences_to_labels_test = []
    words_to_labels_test = []
    for idx in range(n_folds):
        record_name = base_name + "_" + str(idx + 1) + ".tfrec"
        tfwriter_set_train.append(TFRecordWriter(join(train_path,
                                                      record_name)))
        correct_labels_train.append([])
        sentences_to_labels_train.append([])
        words_to_labels_train.append([])
        tfwriter_set_test.append(TFRecordWriter(join(test_path, record_name)))
        correct_labels_test.append([])
        sentences_to_labels_test.append([])
        words_to_labels_test.append([])

    plain_words = []
    for f in file_list:
        print("Working on file: " + join(output_location, f))
        with open(join(output_location, f), 'r') as datafile:
            indices = []
            labels = []
            words = []
            high_low = []
            length = []
            for line in datafile:
                if line in ['\n', '\r\n']:
                    # Blank line = sentence boundary: flush the accumulated
                    # sentence as one SequenceExample.
                    sentences = train.FeatureLists(
                        feature_list={
                            'indices': _int64_seq(indices),
                            'labels': _int64_seq(labels),
                            'plain': _bytes_seq(words),
                            'upper': _int64_seq(high_low),
                            'length': _int64_seq(length)
                        })
                    example = train.SequenceExample(feature_lists=sentences)
                    sentence_reconstructed = '_'.join(plain_words)
                    labels_in_sent = list(set(labels))
                    # Every sentence goes to one random test fold; the other
                    # folds' training sets are written from the same example
                    # below. Call order of random.* is kept stable for
                    # reproducibility with the fixed seed.
                    set_to_put = random.randint(0, n_folds - 1)
                    tfwriter_set_test[set_to_put].write(
                        example.SerializeToString())
                    correct_labels_test[set_to_put].extend(labels)
                    sentences_to_labels_test[set_to_put].extend(
                        [sentence_reconstructed] * len(words))
                    words_to_labels_test[set_to_put].extend(plain_words)
                    # Downsample: with probability `downsampling_rate`, drop
                    # from training any sentence whose only label id is 5
                    # (presumably the "empty"/O tag — confirm against
                    # base_name + '.tag.txt').
                    drop_from_train = (downsample and len(labels_in_sent) == 1
                                       and labels_in_sent[0] == 5
                                       and random.random() < downsampling_rate)
                    if not drop_from_train:
                        tfwriter_set_train[set_to_put].write(
                            example.SerializeToString())
                        correct_labels_train[set_to_put].extend(labels)
                        sentences_to_labels_train[set_to_put].extend(
                            [sentence_reconstructed] * len(words))
                        words_to_labels_train[set_to_put].extend(plain_words)
                    indices = []
                    labels = []
                    words = []
                    plain_words = []
                    high_low = []
                    length = []

                elif not line.startswith("-DOCSTART-") and '---' not in line:
                    # Token line: "<word> <tag> ...".
                    info = line.split()
                    word = info[0]
                    # Words are stored as space-separated character indices.
                    characters = [char_mapping[x] for x in word]
                    words.append(" ".join(str(x) for x in characters))
                    labels.append(label_mapping[info[1]])

                    if word not in word_to_idx:
                        print(word)
                        raise RuntimeError(
                            "Tried to process a word that is not in the prebuilt dictionary."
                        )
                    indices.append(word_to_idx[word])

                    high_low.append(word.isupper())
                    length.append(len(word))
                    plain_words.append(word)

    def _dump_labels(split, labels_dir, writers, all_labels, all_sents,
                     all_words, idx):
        """Write '<label> <sentence> <word>' lines for one fold, then close its writer."""
        label_name = base_name + "_" + str(idx + 1) + ".tfrec" + '_labels'
        with open(join(labels_dir, label_name), 'w') as cor_labels:
            print("In " + split + " " + str(idx + 1) + " are " +
                  str(len(all_labels[idx])) + " samples (words).")
            if (len(all_labels[idx]) != len(all_sents[idx])
                    or len(all_labels[idx]) != len(all_words[idx])):
                raise RuntimeError("Sentence length does not match labels.")
            for l, s, w in zip(all_labels[idx], all_sents[idx],
                               all_words[idx]):
                cor_labels.write(str(l) + " " + s + " " + w + '\n')
        writers[idx].close()

    for idx in range(n_folds):
        _dump_labels("train", train_path_labels, tfwriter_set_train,
                     correct_labels_train, sentences_to_labels_train,
                     words_to_labels_train, idx)
        _dump_labels("test", test_path_labels, tfwriter_set_test,
                     correct_labels_test, sentences_to_labels_test,
                     words_to_labels_test, idx)
    print("Done building the TFRecords.")
예제 #7
0
def _float_feature(values):
    """Return a tf.train Feature wrapping *values* (floats/doubles) in a FloatList."""
    float_list = train.FloatList(value=values)
    return train.Feature(float_list=float_list)
예제 #8
0
def _int64_feature(value):
  """Return a tf.train Feature wrapping *value* in an Int64List; scalars are boxed."""
  values = value if isinstance(value, list) else [value]
  return tft.Feature(int64_list=tft.Int64List(value=values))
예제 #9
0
def _bytes_feature(value):
  """Return a tf.train Feature wrapping *value* in a one-element BytesList."""
  bytes_list = tft.BytesList(value=[value])
  return tft.Feature(bytes_list=bytes_list)
예제 #10
0
def _float_feature(value):
    """Return a tf.train Feature wrapping a single float in a FloatList."""
    float_list = train.FloatList(value=[value])
    return train.Feature(float_list=float_list)
예제 #11
0
def _bytes_feature(value):
    """Return a tf.train Feature wrapping a single bytes value in a BytesList."""
    bytes_list = train.BytesList(value=[value])
    return train.Feature(bytes_list=bytes_list)
예제 #12
0
def _int64_feature(value):
    """Return a tf.train Feature wrapping a single int in an Int64List."""
    int64_list = train.Int64List(value=[value])
    return train.Feature(int64_list=int64_list)