def prep_data():
    input_texts, mesh_outputs = load_data.assemble_pairs()
    abstract_p = preprocess.Preprocessor()
    # preprocess and encode texts (inputs)
    abstract_p.preprocess(input_texts)
    X = abstract_p.encode_texts(input_texts)
    labels_p = preprocess.Preprocessor(vocab_size=None, split_char=".", normalize=False)
    labels_p.preprocess(mesh_outputs)
    Y = labels_p.encode_texts(mesh_outputs)
    return (input_texts, abstract_p, mesh_outputs, labels_p, list(zip(X, Y)))
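# A hedged usage sketch (not from the source) showing how the tuple returned by
# prep_data() above might be unpacked; the local variable names are assumptions.
if __name__ == '__main__':
    texts, text_prep, labels, label_prep, pairs = prep_data()
    X_first, Y_first = pairs[0]  # one encoded abstract with its encoded MeSH labels
    print('%d text/label pairs prepared' % len(pairs))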
def test():
    ''' Trains the model and returns its score '''
    matplotlib.rcParams['backend'] = 'Qt5Agg'
    matplotlib.get_backend()
    D = DataManager(data_name, data_dir)
    # Load the model
    mdl = model()
    Prepro = prepro.Preprocessor()
    # D.data['X_train'] = Prepro.removeOutliers(D.data['X_train'])
    # D.data['Y_train'] = Prepro.removeOutliers(D.data['Y_train'])
    X_train = D.data['X_train']
    Y_train = D.data['Y_train'].ravel()
    # Test training
    mdl.fit(X_train, Y_train)
    # Test prediction
    Y_hat_train = mdl.predict(D.data['X_train'])
    Y_hat_valid = mdl.predict(D.data['X_valid'])
    Y_hat_test = mdl.predict(D.data['X_test'])
    metric_name, scoring_function = get_metric()
    scores = cross_val_score(mdl, X_train, Y_train, cv=5,
                             scoring=make_scorer(scoring_function))
    print('\nCV score (95 perc. CI): %0.2f (+/- %0.2f)'
          % (scores.mean(), scores.std() * 2))
def jsd(fileName, query):
    pp = preprocess.Preprocessor()
    fileText = pp.prepDoc(fileName, combine=True)
    queryText = pp.preprocess(query)
    texts = [queryText, fileText]
    probDists = getProbDists(texts)
    return jensenshannon.jensen_shannon_divergence(numpy.array(probDists))
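# A hedged usage sketch (not from the source): ranking candidate files by their
# Jensen-Shannon divergence from a query, using jsd() defined above. The file
# names and the query string in the commented call are placeholders.
def rank_files(fileNames, query):
    # Lower divergence means the document's term distribution is closer to the query.
    return sorted((jsd(name, query), name) for name in fileNames)

# for score, name in rank_files(['doc1.txt', 'doc2.txt'], 'query text here'):
#     print(score, name)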
def testFile(fileName, query):
    pp = preprocess.Preprocessor()
    fileText = pp.prepDoc(fileName, combine=True)
    # print(fileText)
    queryText = pp.preprocess(query)
    texts = [queryText, fileText]
    # print(texts)
    probDists = getProbDists(texts)
    print(jensenshannon.jensen_shannon_divergence(numpy.array(probDists)))
def data_prep(seed):
    # Use distinct local names: rebinding the module names (e.g.
    # profile = profile.Profile()) inside the function would raise UnboundLocalError.
    profile_obj = profile.Profile()
    interest_obj = interest.Interest()
    preprocessor = preprocess.Preprocessor()
    profile_raw = profile_obj.get_profile()
    interest_raw, ids = interest_obj.data_merge()
    data = preprocessor.finalize_data(profile_raw, interest_raw)
    X, y, X_train, y_train, X_test, y_test = preprocessor.split_data(data, seed=seed, re=False)
    return X, y, X_train, y_train, X_test, y_test, ids
def preprocess_data(url, seed):
    preprocessor = preprocess.Preprocessor()
    raw_data = preprocessor.get_data(url)
    contain_null = preprocessor.get_null(raw_data)
    for f in contain_null:
        raw_data.loc[(raw_data[f].isnull()), f] = preprocessor.ImputeVoteClassifier(raw_data, f)
    X_train, y_train, X_test, y_test = preprocessor.split_data(raw_data, seed, re=False)
    return X_train, y_train, X_test, y_test
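# A hedged usage sketch (not from the source): the CSV path and the seed below
# are placeholders; only preprocess_data() defined above is assumed.
if __name__ == '__main__':
    X_train, y_train, X_test, y_test = preprocess_data('../input/votes.csv', seed=42)
    print(len(X_train), len(X_test))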
def main():
    preprocessor = preprocess.Preprocessor(
        '{}/words.txt'.format(cwd))  # preprocess is called
    ''' poor data sets: '''
    # preprocessor2 = preprocess.Preprocessor('{}/2-letter-words.json'.format(cwd))
    # preprocessor3 = preprocess.Preprocessor('{}/3-letter-words.json'.format(cwd))
    # preprocessor4 = preprocess.Preprocessor('{}/4-letter-words.json'.format(cwd))
    # preprocessor5 = preprocess.Preprocessor('{}/5-letter-words.json'.format(cwd))
    # preprocessor6 = preprocess.Preprocessor('{}/6-letter-words.json'.format(cwd))
    # preprocessor7 = preprocess.Preprocessor('{}/7-letter-words.json'.format(cwd))
    # preprocessor8 = preprocess.Preprocessor('{}/8-letter-words.json'.format(cwd))
    # preprocessor9 = preprocess.Preprocessor('{}/9-letter-words.json'.format(cwd))
    # preprocessor10 = preprocess.Preprocessor('{}/10-letter-words.json'.format(cwd))
    # preprocessor11 = preprocess.Preprocessor('{}/11-letter-words.json'.format(cwd))
    # preprocessor12 = preprocess.Preprocessor('{}/12-letter-words.json'.format(cwd))
    vocabpreprocessor = preprocess.Preprocessor('{}/vocab.txt'.format(cwd))
    moreWords = preprocess.Preprocessor(
        '{}/entriesWithCollocates.txt'.format(cwd))

    global wordDict
    '''preprocessor.processedWords +'''
    # wordDict = preprocessor.processedWords + preprocessor5.processedWords + preprocessor6.processedWords + preprocessor7.processedWords + preprocessor8.processedWords + preprocessor9.processedWords + preprocessor10.processedWords + preprocessor11.processedWords + preprocessor12.processedWords + vocabpreprocessor.processedWords + moreWords.processedWords
    wordDict = (preprocessor.processedWords
                + vocabpreprocessor.processedWords
                + moreWords.processedWords)
    # set the word dict so the game can find the best guesses
    # print 'PROCESSED WORDS={}'.format(preprocessor.processedWords)

    game = Game()  # starts the game
    while 1:
        GUESS = game.getNextBestGuess()
        game.guess(GUESS)
    # try:
    #     while 1:
    #         GUESS = game.getNextBestGuess()
    #         game.guess(GUESS)
    # except:
    #     print '\nGAME ENDED'
    return 0
def get_data(
    self,
    dsn_database,
    dsn_hostname,
    dsn_port,
    dsn_protocol,
    dsn_uid,
    dsn_pwd,
    level,
):
    # Use a distinct local name: rebinding the module name
    # (preprocess = preprocess.Preprocessor()) would raise UnboundLocalError here.
    preprocessor = preprocess.Preprocessor()
    raw_data = preprocessor.db2_connect(
        dsn_database, dsn_hostname, dsn_port, dsn_protocol, dsn_uid, dsn_pwd
    )
    data = preprocessor.data_preprocess(raw_data, level)
    return data
def compress(self, samFilename, compressedFilename, gtf, min_filename,
             frag_len_z_cutoff, split_diff_strands, split_discordant):
    '''
    Compresses the alignments to 2 files, one for unspliced and one for spliced
    file_prefix: Prefix for all output file names
    '''
    self.p = preprocess.Preprocessor(samFilename, frag_len_z_cutoff,
                                     split_diff_strands)
    if not self.frag_len_cutoff:
        self.frag_len_cutoff = self.p.frag_len_cutoff
    print('Using fragment length cutoff of ' + str(self.frag_len_cutoff))

    if split_diff_strands:
        print('Splitting mates on different strands')
    else:
        print('Not splitting mates on different strands')

    if split_discordant:
        print('Splitting discordant')
    else:
        print('Not splitting discordant')

    # Reads on different strands that should be unpaired
    self.diff_strand_unpaired = self.p.unpaired
    del self.p

    # Read header
    header = ''
    with open(samFilename, 'r') as f:
        for line in f:
            if line[0] == '@':
                header += line
            else:
                break
    self.chromosomes = self.parseSAMHeader(header)

    self.aligned = alignments.Alignments(self.chromosomes,
                                         self.frag_len_cutoff,
                                         split_discordant)

    if gtf:
        self.aligned.gtf_exons = self.parseGTF(gtf, self.aligned.chromOffsets)

    self.compressByBundle(samFilename, compressedFilename, min_filename)

    # print('%d unmatched' % self.aligned.numUnmatched)
    print('Approximately %d / %d = %f%% of compressed file is coverage' %
          (self.covSize, self.totalSize,
           100.0 * float(self.covSize) / float(self.totalSize)))
    print('Finished compressing')
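# A hedged usage sketch (not from the source): how compress() above would
# presumably be invoked. "Compressor" is a hypothetical name for the enclosing
# class, and every file path below is a placeholder.
#
# compressor = Compressor()
# compressor.compress('alignments.sam', 'compressed.bin', gtf=None,
#                     min_filename='compressed.min', frag_len_z_cutoff=None,
#                     split_diff_strands=False, split_discordant=False)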
def gate_value_report_write(fname, evids_ids, fact_ids, gate_v):
    '''
    Records the correspondence between gate values and the generated fact:
    each generated fact token maps to the index of the best evidence used at
    generation time.
    :param fname: output file name
    :param evids_ids: id sequences of the evidences
    :param fact_ids: id sequence of the fact
    :param gate_v: gate values
    :return:
    '''
    p = preprocess.Preprocessor(False)
    fact = p.get_char_list(fact_ids)
    evids = []
    e_w = []
    for e in evids_ids:
        if e[0] == 2:
            e_w.append(0)
            for i in range(len(e)):
                if e[i] == 1:
                    e = e[:i]
                    break
            evids.append(p.get_sentence(e))
        else:
            break
    f = open(fname, 'a', encoding='utf-8')
    fact_len = 0
    for g_i in range(len(gate_v)):
        if int(fact_ids[g_i]) == 1:
            break
        fact_len += 1
        e_w[gate_v[g_i]] += 1
    for i in range(len(evids)):
        f.write('%d\t%s' % (e_w[i], evids[i]))
    f.write('\n')
    for g in range(fact_len):
        f.write('%d\t' % gate_v[g])
    f.write('\n')
    for f_c in fact:
        f.write(f_c + '\t')
    f.write('\n')
    f.close()
def __init__(self): """Main class for antiderivative detection.""" app_id = 'LHLP7U-HHLKWGU3AT'.lower() self._wolfram_client = wolframalpha.Client(app_id) self.img_input = None # type: t.Optional[np.ndarray] self.img_solved = None # type: t.Optional[np.ndarray] self.img_segments = None # type: t.Optional[t.Sequence[np.ndarray]] self.models = self._load_models(path=os.path.join( os.path.realpath(__file__)[:-len(os.path.basename(__file__))], "models")) self._preprocessor = preprocess.Preprocessor() self._postprocessor = postprocess.Postprocessor() # Must have correspondence with the class codification # used to train the CNN model loaded just above. Don't # change the symbol order. self._CLASS_SYMBOL = ( "0", "1", "x", "+", "-", "/", "(", ")", "e", "integrate", "d", "2", "3", "4", "5", "6", "7", "8", "9", ) self._RE_FIX_DNOTATION = re.compile(r"(?<=d)\s+(?=.)")
def __init__(self):
    '''
    fancy_classifier = Pipeline([
        ('preprocessing', Preprocessor()),
        ('classification', RandomForestClassifier(n_estimators=136,
                                                  max_depth=None,
                                                  min_samples_split=2,
                                                  random_state=0))
    ])
    self.clf = VotingClassifier(estimators=[
        ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
        ('Gaussian Classifier', GaussianNB()),
        ('Support Vector Machine', SVC(probability=True)),
        ('Fancy Classifier', fancy_classifier)],
        voting='soft')
    '''
    self.mdl = RandomForestClassifier(n_estimators=136, max_depth=None,
                                      min_samples_split=2, random_state=0)
    self.num_train_samples = 0
    self.num_feat = 1
    self.num_labels = 1
    self.prep = prepro.Preprocessor()
opt_lines = 1

if not opt_system:
    if opt_testing:
        # If we are testing just use the lingogi file
        opt_system = os.path.join('platforms', 'lingogi', 'system.h')
    else:
        err("You need to specify -s, see -h")
        sys.exit(2)

if not os.path.exists(opt_system):
    err("'%s' does not exist" % opt_system)
    sys.exit(2)

# First do all preprocessing from pch.h
processor = pp.Preprocessor()
processor["PRODUCT_SYSTEM_FILE"] = '"' + opt_system + '"'
processor.addUserIncludePath(".")
processor.ignoreErrors()

# If defines are specified we parse them here
# currently treated as one space separated string
# and we replace \" with " in string defines.
#
# FIXME: The python macros store the name
# and value, the current code is pike inherited
# and thus redundantly stores the name twice
#
if opt_defines:
    for define in opt_defines:
        if len(define):
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]
    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)
    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    tmp_path = "D:/dhm/programer-lx/BiDAF_tf2"
    ds = preprocess.Preprocessor([
        tmp_path + '/data/squad/train-v1.1.json',
        tmp_path + '/data/squad/dev-v1.1.json',
        tmp_path + '/data/squad/dev-v1.1.json'
    ])
    ## train_c, train_q, train_y = ds.get_dataset(tmp_path+'/data/squad/train-v1.1.json')
    ## test_c, test_q, test_y = ds.get_dataset(tmp_path+'/data/squad/dev-v1.1.json')
    train_cc, train_cq, train_wc, train_wq, train_y = ds.get_dataset(tmp_path + '/data/squad/test.json')
    test_cc, test_cq, test_wc, test_wq, test_y = ds.get_dataset(tmp_path + '/data/squad/test.json')
    bidaf = BiDAF(
        clen=ds.max_clen,
        qlen=ds.max_qlen,
        emb_size=50,
        max_features=len(ds.charset),  # ds.charset
        vocab_size=len(ds.word_list),
        conv_layers=[[10, 1], [10, 2], [30, 3]],  # convolution kernel sizes and counts
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]
    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)
    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor(
        ['./data/squad/train-v1.1.json', './data/squad/dev-v1.1.json'],
        ['./data/glove.6B/glove.6B.50d.txt'])
    '''
    train_c, train_q, train_y = ds.get_dataset('./data/squad/train-v1.1.json')
    test_c, test_q, test_y = ds.get_dataset('./data/squad/dev-v1.1.json')
    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)
    '''
    bidaf = BiDAF(
        clen=ds.max_clen,
        qlen=ds.max_qlen,
        emb_size=50,
def main():
    preprocessor = preprocess.Preprocessor(
        '{}/2-letter-words.json'.format(cwd))  # preprocess is called
    print(preprocessor.processedWords)
test_path = "../input/test_data.csv" """ load raw data""" train = pd.read_csv(train_path) test = pd.read_csv(test_path) """ Preprocessing""" import preprocess as pr import impute as im import copy df = train["y"] predata = pd.concat([train.drop("y", axis=1), test], ignore_index=True) predata_copy = copy.deepcopy(predata) """predata_onehot = pr.Preprocessor(predata).all("onehot")""" predata_label = pr.Preprocessor(predata_copy).all("label", "date") """prep_train_onehot = pd.concat([df, predata_onehot.iloc[:len(train), :]], axis=1) prep_test_onehot = predata_onehot.iloc[len(train):, :]""" prep_train_label = pd.concat([df, predata_label.iloc[:len(train), :]], axis=1) prep_test_label = predata_label.iloc[len(train):, :] """prep_train_onehot.to_csv("../prep_train_onehot.csv", index=False) prep_test_onehot.to_csv("../prep_test_onehot.csv", index=False) prep_train_label.to_csv("../prep_train_label.csv", index=False) prep_test_label.to_csv("../prep_test_label.csv", index=False)""" """ define data""" train_X = prep_train_label.drop([ "y", "video_id", "thumbnail_link", "publishedAt", "collection_date", "id", "tags", "description", "title" ], axis=1)
def test(string1, string2):
    pp = preprocess.Preprocessor()
    texts = [pp.preprocess(string1), pp.preprocess(string2)]
    # print dictionary.token2id
    getProbDists(texts)
import os
import re
import sys
import typing as t

import numpy as np
import skimage.transform
import imageio

sys.path.insert(0, "../antideriv")

import preprocess as antideriv_preproc  # noqa: ignore

OUTPUT_PATH = "./data-augmented-preprocessed"
RE_CLASS_NAME = re.compile(r"(?<=class_)[^_]+")
OUTPUT_FILE_TYPE = "png"

PREPROCESSOR_MODEL = antideriv_preproc.Preprocessor()
"""Preprocess the training data the same way as a regular input."""


def resize(img: np.ndarray,
           output_shape: t.Tuple[int, int] = (45, 45)) -> np.ndarray:
    """Resize image to ``output_shape`` with interpolation of order 3."""
    img = skimage.transform.resize(
        image=img, output_shape=output_shape, anti_aliasing=False, order=3)
    return img
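# A hedged sketch (not from the source): one way resize() above could be applied
# to a single training image. The paths and the helper name augment_and_save are
# hypothetical; imageio.imread/imwrite are standard imageio calls.
def augment_and_save(in_path: str, out_path: str) -> None:
    img = imageio.imread(in_path)  # load the raw training image
    img = resize(img)              # scale to the 45x45 model input shape
    imageio.imwrite(out_path, (255 * img).astype(np.uint8))  # save as 8-bit image

# augment_and_save("./data-augmented/class_x_0001.png",
#                  "./data-augmented-preprocessed/class_x_0001.png")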
        end_probability = y_pred_end[end_idx]
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]
    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)
    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor(
        ['./data/train.json', './data/dev.json', './data/test.json'])
    train_c, train_q, train_y = ds.get_dataset('./data/train.json')
    test_c, test_q, test_y = ds.get_dataset('./data/dev.json')
    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  emb_size=128,
                  max_features=len(ds.charset))
    bidaf.build_model()
    bidaf.model.fit([train_c, train_q],
                    train_y,
                    batch_size=16,
)

with open(input_file, 'rb') as f:
    lang_data = pickle.load(f)

new_data = [[preprocess.preprocess_sentence(w) for w in l.split('\t')]
            for l in lang_data[:NUM_DATA]]

label_holder = []
input_sentences = []
for line in new_data:
    labels = postprocess.sentence_labeller(line[0], line[1])
    label_holder.append(labels)
    input_sentences.append(line[1])
# label_holder = np.array(label_holder)

# Pre-process the data
data_holder = preprocess.Preprocessor(lang_data, NUM_DATA, 'TRAIN')
(_, target_dataset, _, output_table, _, max_length_tar, _, _, _,
 output_index2word, target_lengths) = data_holder.finalise_dataset()

train_targets, val_targets, train_labels, val_labels, train_lengths, val_lengths = train_test_split(
    target_dataset, label_holder, target_lengths, test_size=TEST_SPLIT)

# Feeding the data in reverse order helps with training
# input_dataset = np.flip(input_dataset)

# Create a dataset
padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    train_targets, maxlen=max_length_tar, padding='post')
label_holder = tf.keras.preprocessing.sequence.pad_sequences(
    train_labels, maxlen=max_length_tar, padding='post')
padded_outputs = tf.keras.preprocessing.sequence.pad_sequences(
    train_labels, maxlen=max_length_tar, padding='post')
help="output a detailed log file describing each source file", action="store_true") parser.add_argument("files", metavar="source files", type=str, nargs="*", help="F90 source files to find dependencies amongst") args = parser.parse_args() if args.prefix != "": prefix_pass = "******".format(os.path.normpath(args.prefix)) else: prefix_pass = "******" if args.temp_dir != "": temp_dir = args.temp_dir else: temp_dir = "./" # create a preprocessor object if args.cpp != "": cpp_pass = preprocess.Preprocessor(temp_dir=temp_dir, cpp_cmd=args.cpp, defines=args.defines, f90_preprocess=args.f90_preprocess) else: cpp_pass = None try: doit(prefix_pass, args.search_path.split(), args.files, cpp_pass, debug=args.debug) except: # something went wrong print("$(error something went wrong in dep.py. Remake, adding the option 'DEP_CHECK_OPTS=--debug' to your make command and examine the 'dependencies.out' file)")
        if prediction_word == '<EOS>':
            return decoded_text, sentence, attention_matrix
        decoder_input = tf.expand_dims([prediction_id], 0)

    return decoded_text, sentence, attention_matrix


if __name__ == '__main__':
    input_file = os.path.join('/Users/emielzyde/Desktop/Project/grammar_correction/lang8_preprocess.pickle')
    with open(input_file, 'rb') as f:
        # lang_data = f.readlines()
        lang_data = pickle.load(f)
        # lang_data = lang_data.readlines()

    # Pre-process the data
    data_holder = preprocess.Preprocessor(lang_data, 2000, 'TRAIN')
    (input_dataset, target_dataset, input_table, output_table, max_length_inp,
     max_length_tar, input_word2index, output_word2index, input_index2word,
     output_index2word) = data_holder.finalise_dataset()
    train_input_dataset, val_input_dataset, train_target_dataset, val_target_dataset = train_test_split(
        input_dataset, target_dataset, test_size=TEST_SPLIT)

    # Feeding the data in reverse order helps with training
    # input_dataset = np.flip(input_dataset)

    print('The vocabulary size is {}'.format(len(input_word2index)))

    # Create a dataset
    number_batches = len(train_input_dataset) // BATCH_SIZE
    input_vocab_size = len(input_table.word2index)
    target_vocab_size = len(output_table.word2index)
    dataset = tf.data.Dataset.from_tensor_slices(
        (train_input_dataset, train_target_dataset)).shuffle(len(train_input_dataset))
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]
    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)
    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor([
        './data/squad/train-v1.1.json',
        './data/squad/dev-v1.1.json',
        './data/squad/dev-v1.1.json'
    ])
    train_c, train_q, train_y = ds.get_dataset('./data/squad/train-v1.1.json')
    test_c, test_q, test_y = ds.get_dataset('./data/squad/dev-v1.1.json')
    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    # Load the GloVe word vectors for the samples and initialize the characters
    train_c, train_q, train_y = ds.get_chardataset(
        './data/squad/train-v1.1.json')

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  emb_size=50,
                  max_features=len(ds.charset))
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
import time
import os
import preprocess
import LSTM

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# file = open('./figures/output1.txt', 'w')
# sys.stdout = file

if __name__ == '__main__':
    preprocessor = preprocess.Preprocessor()
    ratio = 0.7
    # preprocessor.visualize_data()
    preprocessor.tokenize_data()
    training_data_x = preprocessor.sequenced_summaries[:int(
        ratio * len(preprocessor.sequenced_summaries))]
    training_data_y = preprocessor.rewards[:int(
        ratio * len(preprocessor.sequenced_summaries))]
    data_x = preprocessor.sequenced_summaries[int(
        ratio * len(preprocessor.sequenced_summaries)):]
    data_y = preprocessor.rewards[int(
        ratio * len(preprocessor.sequenced_summaries)):]
    # print("hey there")
    # for i in range(7):
train_path = "../input/train_data.csv" test_path = "../input/test_data.csv" """ load raw data""" train = pd.read_csv(train_path) test = pd.read_csv(test_path) """ Preprocessing""" import preprocess as pr import impute as im import copy df = train["y"] predata = pd.concat([train.drop("y", axis=1), test], ignore_index=True) predata_copy = copy.deepcopy(predata) predata_onehot = pr.Preprocessor(predata_copy).all("onehot", "nonpub") #predata_label = pr.Preprocessor(predata_copy).all("label", "nonpub") #prep_train_label = pd.concat([df, predata_label.iloc[:len(train), :]], axis=1) #prep_test_label = predata_label.iloc[len(train):, :] num_list = [ "TimeToNearestStation", "TotalFloorArea", "Area", "Frontage", "BuildingYear", "BuildingAge", "Breadth", "CoverageRatio", "FloorAreaRatio", "Period" ] predata_onehot = im.Imputer(predata_onehot).num_imputer(num_list) print(predata_onehot[num_list].isnull().sum()) prep_train_onehot = pd.concat([df, predata_onehot.iloc[:len(train), :]], axis=1)
default="") args = parser.parse_args() defines = args.defines if args.exclude_defines != "": excludes = args.exclude_defines.split() for ex in excludes: defines = defines.replace(ex, "") print("defines: ", defines) if args.cpp != "": cpp_pass = preprocess.Preprocessor(temp_dir=args.output_dir, cpp_cmd=args.cpp, defines=defines) else: cpp_pass = None headers, _ = ffv.find_files(args.vpath, args.headers) cxx, _ = ffv.find_files(args.vpath, args.cxx) # part I: we need to find the names of the Fortran routines that # are called from C++ so we can modify the header in the # corresponding *_F.H file. # A list of specific macros that we want to look for in each target. macro_list = [ 'AMREX_INT_ANYD', 'AMREX_REAL_ANYD', 'BL_TO_FORTRAN_ANYD',
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]
    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)
    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor([
        './data/drcd/DRCD_training.json',
        './data/drcd/DRCD_dev.json',
        './data/drcd/DRCD_training.json'
    ])
    train_c, train_q, train_y = ds.get_dataset('./data/drcd/DRCD_training.json')
    test_c, test_q, test_y = ds.get_dataset('./data/drcd/DRCD_dev.json')
    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  emb_size=128,
                  max_features=len(ds.charset))
    bidaf.build_model()
    bidaf.model.fit([train_c, train_q],
test_path = "../input/test_data.csv" """ load raw data""" train = pd.read_csv(train_path) test = pd.read_csv(test_path) """ Preprocessing""" import preprocess as pr import impute as im import copy df = train["y"] predata = pd.concat([train.drop("y", axis=1), test], ignore_index=True) predata_copy = copy.deepcopy(predata) #predata_onehot = pr.Preprocessor(predata).all("onehot") predata_label = pr.Preprocessor(predata_copy).all("label", "nonpub") prep_train_label = pd.concat([df, predata_label.iloc[:len(train), :]], axis=1) prep_test_label = predata_label.iloc[len(train):, :] """ define data""" train_X = prep_train_label.drop(["y", "id", "Prefecture", "Municipality"], axis=1) train_y = np.log1p(prep_train_label["y"]) test_X = prep_test_label.drop(["id", "Prefecture", "Municipality"], axis=1) """ divine data""" train_X_tyuko = train_X[train_X["Type"] == 1] train_X_tatemono = train_X[train_X["Type"] == 2] train_X_toti = train_X[train_X["Type"] == 3] train_y_tyuko = train_y[train_X_tyuko.index] train_y_tatemono = train_y[train_X_tatemono.index] train_y_toti = train_y[train_X_toti.index]
# -*- coding: utf-8 -*-
# Project name: Evi-Fact
# Edited with PyCharm
# Created by simengzhao on 2018/8/17 at 2:08 PM
# Nanjing University Software Institute
#
import tensorflow as tf
import numpy as np
import json
import re

import preprocess as PP
import model

npk = PP.Preprocessor(False)
GEFG = model.gated_evidence_fact_generation()
dg = npk.data_provider(
    'train_data.json', {
        'NAME': 'GEFG',
        'MEL': GEFG.MAX_EVID_LEN,
        'MEC': GEFG.MAX_EVIDS,
        'MFL': GEFG.MAX_FACT_LEN,
        'BATCH_SIZE': 1
    })

# FIXME: dynamic_rnn() is called without arguments and r2 is never defined;
# these lines appear to be leftover scratch code.
tf.nn.dynamic_rnn()
m1 = tf.placeholder(dtype=tf.float32, shape=[5, 3, 4])
m2 = tf.placeholder(dtype=tf.float32, shape=[1, 3, 4])
r1 = m1
r2[3] = m2
# r1 = tf.reduce_sum(r1, 1)
with tf.Session() as sess: