def prep_data():
    input_texts, mesh_outputs = load_data.assemble_pairs()
    abstract_p = preprocess.Preprocessor()
    # preprocess and encode texts (inputs)
    abstract_p.preprocess(input_texts)
    X = abstract_p.encode_texts(input_texts)
    labels_p = preprocess.Preprocessor(vocab_size=None, split_char=".", normalize=False)
    labels_p.preprocess(mesh_outputs)
    Y = labels_p.encode_texts(mesh_outputs)
    return (input_texts, abstract_p, mesh_outputs, labels_p, list(zip(X, Y)))
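# A hedged usage sketch (not from the source) showing how the tuple returned by
# prep_data() above might be unpacked; the local variable names are assumptions.
if __name__ == '__main__':
    texts, text_prep, labels, label_prep, pairs = prep_data()
    X_first, Y_first = pairs[0]  # one encoded abstract with its encoded MeSH labels
    print('%d text/label pairs prepared' % len(pairs))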
def test():
    ''' Trains the model and returns its score '''
    matplotlib.rcParams['backend'] = 'Qt5Agg'
    matplotlib.get_backend()
    D = DataManager(data_name, data_dir)
    # Load the model
    mdl = model()
    Prepro = prepro.Preprocessor()
    # D.data['X_train'] = Prepro.removeOutliers(D.data['X_train'])
    # D.data['Y_train'] = Prepro.removeOutliers(D.data['Y_train'])
    X_train = D.data['X_train']
    Y_train = D.data['Y_train'].ravel()
    # Test training
    mdl.fit(X_train, Y_train)
    # Test prediction
    Y_hat_train = mdl.predict(D.data['X_train'])
    Y_hat_valid = mdl.predict(D.data['X_valid'])
    Y_hat_test = mdl.predict(D.data['X_test'])
    metric_name, scoring_function = get_metric()
    scores = cross_val_score(mdl, X_train, Y_train, cv=5,
                             scoring=make_scorer(scoring_function))
    print('\nCV score (95 perc. CI): %0.2f (+/- %0.2f)'
          % (scores.mean(), scores.std() * 2))
def jsd(fileName, query):
    pp = preprocess.Preprocessor()
    fileText = pp.prepDoc(fileName, combine=True)
    queryText = pp.preprocess(query)
    texts = [queryText, fileText]
    probDists = getProbDists(texts)
    return jensenshannon.jensen_shannon_divergence(numpy.array(probDists))
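# A hedged usage sketch (not from the source): ranking candidate files by their
# Jensen-Shannon divergence from a query, using jsd() defined above. The file
# names and the query string in the commented call are placeholders.
def rank_files(fileNames, query):
    # Lower divergence means the document's term distribution is closer to the query.
    return sorted((jsd(name, query), name) for name in fileNames)

# for score, name in rank_files(['doc1.txt', 'doc2.txt'], 'query text here'):
#     print(score, name)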
def testFile(fileName, query):
    pp = preprocess.Preprocessor()
    fileText = pp.prepDoc(fileName, combine=True)
    # print(fileText)
    queryText = pp.preprocess(query)
    texts = [queryText, fileText]
    # print(texts)
    probDists = getProbDists(texts)
    print(jensenshannon.jensen_shannon_divergence(numpy.array(probDists)))
def data_prep(seed):
    # Use distinct local names: rebinding the module names (e.g.
    # profile = profile.Profile()) inside the function would raise UnboundLocalError.
    profile_obj = profile.Profile()
    interest_obj = interest.Interest()
    preprocessor = preprocess.Preprocessor()
    profile_raw = profile_obj.get_profile()
    interest_raw, ids = interest_obj.data_merge()
    data = preprocessor.finalize_data(profile_raw, interest_raw)
    X, y, X_train, y_train, X_test, y_test = preprocessor.split_data(data, seed=seed, re=False)
    return X, y, X_train, y_train, X_test, y_test, ids
def preprocess_data(url, seed):
    preprocessor = preprocess.Preprocessor()
    raw_data = preprocessor.get_data(url)
    contain_null = preprocessor.get_null(raw_data)
    for f in contain_null:
        raw_data.loc[(raw_data[f].isnull()), f] = preprocessor.ImputeVoteClassifier(raw_data, f)
    X_train, y_train, X_test, y_test = preprocessor.split_data(raw_data, seed, re=False)
    return X_train, y_train, X_test, y_test
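# A hedged usage sketch (not from the source): the CSV path and the seed below
# are placeholders; only preprocess_data() defined above is assumed.
if __name__ == '__main__':
    X_train, y_train, X_test, y_test = preprocess_data('../input/votes.csv', seed=42)
    print(len(X_train), len(X_test))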
def main():
    preprocessor = preprocess.Preprocessor(
        '{}/words.txt'.format(cwd))  # preprocess is called
    ''' poor data sets: '''
    # preprocessor2 = preprocess.Preprocessor('{}/2-letter-words.json'.format(cwd))
    # preprocessor3 = preprocess.Preprocessor('{}/3-letter-words.json'.format(cwd))
    # preprocessor4 = preprocess.Preprocessor('{}/4-letter-words.json'.format(cwd))
    # preprocessor5 = preprocess.Preprocessor('{}/5-letter-words.json'.format(cwd))
    # preprocessor6 = preprocess.Preprocessor('{}/6-letter-words.json'.format(cwd))
    # preprocessor7 = preprocess.Preprocessor('{}/7-letter-words.json'.format(cwd))
    # preprocessor8 = preprocess.Preprocessor('{}/8-letter-words.json'.format(cwd))
    # preprocessor9 = preprocess.Preprocessor('{}/9-letter-words.json'.format(cwd))
    # preprocessor10 = preprocess.Preprocessor('{}/10-letter-words.json'.format(cwd))
    # preprocessor11 = preprocess.Preprocessor('{}/11-letter-words.json'.format(cwd))
    # preprocessor12 = preprocess.Preprocessor('{}/12-letter-words.json'.format(cwd))
    vocabpreprocessor = preprocess.Preprocessor('{}/vocab.txt'.format(cwd))
    moreWords = preprocess.Preprocessor(
        '{}/entriesWithCollocates.txt'.format(cwd))

    global wordDict
    '''preprocessor.processedWords +'''
    # wordDict = preprocessor.processedWords + preprocessor5.processedWords + preprocessor6.processedWords + preprocessor7.processedWords + preprocessor8.processedWords + preprocessor9.processedWords + preprocessor10.processedWords + preprocessor11.processedWords + preprocessor12.processedWords + vocabpreprocessor.processedWords + moreWords.processedWords
    wordDict = (preprocessor.processedWords
                + vocabpreprocessor.processedWords
                + moreWords.processedWords)
    # set the word dict so the game can find the best guesses
    # print 'PROCESSED WORDS={}'.format(preprocessor.processedWords)

    game = Game()  # starts the game
    while 1:
        GUESS = game.getNextBestGuess()
        game.guess(GUESS)
    # try:
    #     while 1:
    #         GUESS = game.getNextBestGuess()
    #         game.guess(GUESS)
    # except:
    #     print '\nGAME ENDED'
    return 0
def get_data(
    self,
    dsn_database,
    dsn_hostname,
    dsn_port,
    dsn_protocol,
    dsn_uid,
    dsn_pwd,
    level,
):
    # Use a distinct local name: rebinding the module name
    # (preprocess = preprocess.Preprocessor()) would raise UnboundLocalError here.
    preprocessor = preprocess.Preprocessor()
    raw_data = preprocessor.db2_connect(
        dsn_database, dsn_hostname, dsn_port, dsn_protocol, dsn_uid, dsn_pwd
    )
    data = preprocessor.data_preprocess(raw_data, level)
    return data
def compress(self, samFilename, compressedFilename, gtf, min_filename,
             frag_len_z_cutoff, split_diff_strands, split_discordant):
    '''
    Compresses the alignments to 2 files, one for unspliced and one for spliced
    file_prefix: Prefix for all output file names
    '''
    self.p = preprocess.Preprocessor(samFilename, frag_len_z_cutoff,
                                     split_diff_strands)
    if not self.frag_len_cutoff:
        self.frag_len_cutoff = self.p.frag_len_cutoff
    print('Using fragment length cutoff of ' + str(self.frag_len_cutoff))

    if split_diff_strands:
        print('Splitting mates on different strands')
    else:
        print('Not splitting mates on different strands')

    if split_discordant:
        print('Splitting discordant')
    else:
        print('Not splitting discordant')

    # Reads on different strands that should be unpaired
    self.diff_strand_unpaired = self.p.unpaired
    del self.p

    # Read header
    header = ''
    with open(samFilename, 'r') as f:
        for line in f:
            if line[0] == '@':
                header += line
            else:
                break
    self.chromosomes = self.parseSAMHeader(header)

    self.aligned = alignments.Alignments(self.chromosomes,
                                         self.frag_len_cutoff,
                                         split_discordant)

    if gtf:
        self.aligned.gtf_exons = self.parseGTF(gtf, self.aligned.chromOffsets)

    self.compressByBundle(samFilename, compressedFilename, min_filename)

    # print('%d unmatched' % self.aligned.numUnmatched)
    print('Approximately %d / %d = %f%% of compressed file is coverage' %
          (self.covSize, self.totalSize,
           100.0 * float(self.covSize) / float(self.totalSize)))
    print('Finished compressing')
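# A hedged usage sketch (not from the source): how compress() above would
# presumably be invoked. "Compressor" is a hypothetical name for the enclosing
# class, and every file path below is a placeholder.
#
# compressor = Compressor()
# compressor.compress('alignments.sam', 'compressed.bin', gtf=None,
#                     min_filename='compressed.min', frag_len_z_cutoff=None,
#                     split_diff_strands=False, split_discordant=False)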
def gate_value_report_write(fname, evids_ids, fact_ids, gate_v):
    '''
    Records the correspondence between gate values and the generated fact:
    each generated fact token maps to the index of the best evidence used at
    generation time.
    :param fname: output file name
    :param evids_ids: id sequences of the evidences
    :param fact_ids: id sequence of the fact
    :param gate_v: gate values
    :return:
    '''
    p = preprocess.Preprocessor(False)
    fact = p.get_char_list(fact_ids)
    evids = []
    e_w = []
    for e in evids_ids:
        if e[0] == 2:
            e_w.append(0)
            for i in range(len(e)):
                if e[i] == 1:
                    e = e[:i]
                    break
            evids.append(p.get_sentence(e))
        else:
            break
    f = open(fname, 'a', encoding='utf-8')
    fact_len = 0
    for g_i in range(len(gate_v)):
        if int(fact_ids[g_i]) == 1:
            break
        fact_len += 1
        e_w[gate_v[g_i]] += 1
    for i in range(len(evids)):
        f.write('%d\t%s' % (e_w[i], evids[i]))
    f.write('\n')
    for g in range(fact_len):
        f.write('%d\t' % gate_v[g])
    f.write('\n')
    for f_c in fact:
        f.write(f_c + '\t')
    f.write('\n')
    f.close()
def __init__(self): """Main class for antiderivative detection.""" app_id = 'LHLP7U-HHLKWGU3AT'.lower() self._wolfram_client = wolframalpha.Client(app_id) self.img_input = None # type: t.Optional[np.ndarray] self.img_solved = None # type: t.Optional[np.ndarray] self.img_segments = None # type: t.Optional[t.Sequence[np.ndarray]] self.models = self._load_models(path=os.path.join( os.path.realpath(__file__)[:-len(os.path.basename(__file__))], "models")) self._preprocessor = preprocess.Preprocessor() self._postprocessor = postprocess.Postprocessor() # Must have correspondence with the class codification # used to train the CNN model loaded just above. Don't # change the symbol order. self._CLASS_SYMBOL = ( "0", "1", "x", "+", "-", "/", "(", ")", "e", "integrate", "d", "2", "3", "4", "5", "6", "7", "8", "9", ) self._RE_FIX_DNOTATION = re.compile(r"(?<=d)\s+(?=.)")
def __init__(self):
    '''
    fancy_classifier = Pipeline([
        ('preprocessing', Preprocessor()),
        ('classification', RandomForestClassifier(n_estimators=136,
                                                  max_depth=None,
                                                  min_samples_split=2,
                                                  random_state=0))
    ])
    self.clf = VotingClassifier(estimators=[
        ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
        ('Gaussian Classifier', GaussianNB()),
        ('Support Vector Machine', SVC(probability=True)),
        ('Fancy Classifier', fancy_classifier)],
        voting='soft')
    '''
    self.mdl = RandomForestClassifier(n_estimators=136, max_depth=None,
                                      min_samples_split=2, random_state=0)
    self.num_train_samples = 0
    self.num_feat = 1
    self.num_labels = 1
    self.prep = prepro.Preprocessor()
opt_lines = 1

if not opt_system:
    if opt_testing:
        # If we are testing just use the lingogi file
        opt_system = os.path.join('platforms', 'lingogi', 'system.h')
    else:
        err("You need to specify -s, see -h")
        sys.exit(2)

if not os.path.exists(opt_system):
    err("'%s' does not exist" % opt_system)
    sys.exit(2)

# First do all preprocessing from pch.h
processor = pp.Preprocessor()
processor["PRODUCT_SYSTEM_FILE"] = '"' + opt_system + '"'
processor.addUserIncludePath(".")
processor.ignoreErrors()

# If defines are specified we parse them here
# currently treated as one space separated string
# and we replace \" with " in string defines.
#
# FIXME: The python macros store the name
# and value, the current code is pike inherited
# and thus redundantly stores the name twice
#
if opt_defines:
    for define in opt_defines:
        if len(define):
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]
    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)
    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    tmp_path = "D:/dhm/programer-lx/BiDAF_tf2"
    ds = preprocess.Preprocessor([
        tmp_path + '/data/squad/train-v1.1.json',
        tmp_path + '/data/squad/dev-v1.1.json',
        tmp_path + '/data/squad/dev-v1.1.json'
    ])
    ## train_c, train_q, train_y = ds.get_dataset(tmp_path+'/data/squad/train-v1.1.json')
    ## test_c, test_q, test_y = ds.get_dataset(tmp_path+'/data/squad/dev-v1.1.json')
    train_cc, train_cq, train_wc, train_wq, train_y = ds.get_dataset(tmp_path + '/data/squad/test.json')
    test_cc, test_cq, test_wc, test_wq, test_y = ds.get_dataset(tmp_path + '/data/squad/test.json')
    bidaf = BiDAF(
        clen=ds.max_clen,
        qlen=ds.max_qlen,
        emb_size=50,
        max_features=len(ds.charset),  # ds.charset
        vocab_size=len(ds.word_list),
        conv_layers=[[10, 1], [10, 2], [30, 3]],  # convolution kernel sizes and counts
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]
    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)
    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor(
        ['./data/squad/train-v1.1.json', './data/squad/dev-v1.1.json'],
        ['./data/glove.6B/glove.6B.50d.txt'])
    '''
    train_c, train_q, train_y = ds.get_dataset('./data/squad/train-v1.1.json')
    test_c, test_q, test_y = ds.get_dataset('./data/squad/dev-v1.1.json')
    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)
    '''
    bidaf = BiDAF(
        clen=ds.max_clen,
        qlen=ds.max_qlen,
        emb_size=50,
def main():
    preprocessor = preprocess.Preprocessor(
        '{}/2-letter-words.json'.format(cwd))  # preprocess is called
    print(preprocessor.processedWords)
test_path = "../input/test_data.csv" """ load raw data""" train = pd.read_csv(train_path) test = pd.read_csv(test_path) """ Preprocessing""" import preprocess as pr import impute as im import copy df = train["y"] predata = pd.concat([train.drop("y", axis=1), test], ignore_index=True) predata_copy = copy.deepcopy(predata) """predata_onehot = pr.Preprocessor(predata).all("onehot")""" predata_label = pr.Preprocessor(predata_copy).all("label", "date") """prep_train_onehot = pd.concat([df, predata_onehot.iloc[:len(train), :]], axis=1) prep_test_onehot = predata_onehot.iloc[len(train):, :]""" prep_train_label = pd.concat([df, predata_label.iloc[:len(train), :]], axis=1) prep_test_label = predata_label.iloc[len(train):, :] """prep_train_onehot.to_csv("../prep_train_onehot.csv", index=False) prep_test_onehot.to_csv("../prep_test_onehot.csv", index=False) prep_train_label.to_csv("../prep_train_label.csv", index=False) prep_test_label.to_csv("../prep_test_label.csv", index=False)""" """ define data""" train_X = prep_train_label.drop([ "y", "video_id", "thumbnail_link", "publishedAt", "collection_date", "id", "tags", "description", "title" ], axis=1)
def test(string1, string2):
    pp = preprocess.Preprocessor()
    texts = [pp.preprocess(string1), pp.preprocess(string2)]
    # print dictionary.token2id
    getProbDists(texts)
import os
import re
import sys
import typing as t

import numpy as np
import skimage.transform
import imageio

sys.path.insert(0, "../antideriv")

import preprocess as antideriv_preproc  # noqa: ignore

OUTPUT_PATH = "./data-augmented-preprocessed"
RE_CLASS_NAME = re.compile(r"(?<=class_)[^_]+")
OUTPUT_FILE_TYPE = "png"

PREPROCESSOR_MODEL = antideriv_preproc.Preprocessor()
"""Preprocess the training data the same way as a regular input."""


def resize(img: np.ndarray,
           output_shape: t.Tuple[int, int] = (45, 45)) -> np.ndarray:
    """Resize image to ``output_shape`` with interpolation of order 3."""
    img = skimage.transform.resize(
        image=img, output_shape=output_shape, anti_aliasing=False, order=3)
    return img
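# A hedged sketch (not from the source): one way resize() above could be applied
# to a single training image. The paths and the helper name augment_and_save are
# hypothetical; imageio.imread/imwrite are standard imageio calls.
def augment_and_save(in_path: str, out_path: str) -> None:
    img = imageio.imread(in_path)  # load the raw training image
    img = resize(img)              # scale to the 45x45 model input shape
    imageio.imwrite(out_path, (255 * img).astype(np.uint8))  # save as 8-bit image

# augment_and_save("./data-augmented/class_x_0001.png",
#                  "./data-augmented-preprocessed/class_x_0001.png")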
        end_probability = y_pred_end[end_idx]
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]
    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)
    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor(
        ['./data/train.json', './data/dev.json', './data/test.json'])
    train_c, train_q, train_y = ds.get_dataset('./data/train.json')
    test_c, test_q, test_y = ds.get_dataset('./data/dev.json')
    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  emb_size=128,
                  max_features=len(ds.charset))
    bidaf.build_model()
    bidaf.model.fit([train_c, train_q],
                    train_y,
                    batch_size=16,
)

with open(input_file, 'rb') as f:
    lang_data = pickle.load(f)

new_data = [[preprocess.preprocess_sentence(w) for w in l.split('\t')]
            for l in lang_data[:NUM_DATA]]

label_holder = []
input_sentences = []
for line in new_data:
    labels = postprocess.sentence_labeller(line[0], line[1])
    label_holder.append(labels)
    input_sentences.append(line[1])
# label_holder = np.array(label_holder)

# Pre-process the data
data_holder = preprocess.Preprocessor(lang_data, NUM_DATA, 'TRAIN')
(_, target_dataset, _, output_table, _, max_length_tar, _, _, _,
 output_index2word, target_lengths) = data_holder.finalise_dataset()

train_targets, val_targets, train_labels, val_labels, train_lengths, val_lengths = train_test_split(
    target_dataset, label_holder, target_lengths, test_size=TEST_SPLIT)

# Feeding the data in reverse order helps with training
# input_dataset = np.flip(input_dataset)

# Create a dataset
padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    train_targets, maxlen=max_length_tar, padding='post')
label_holder = tf.keras.preprocessing.sequence.pad_sequences(
    train_labels, maxlen=max_length_tar, padding='post')
padded_outputs = tf.keras.preprocessing.sequence.pad_sequences(
    train_labels, maxlen=max_length_tar, padding='post')
help="output a detailed log file describing each source file", action="store_true") parser.add_argument("files", metavar="source files", type=str, nargs="*", help="F90 source files to find dependencies amongst") args = parser.parse_args() if args.prefix != "": prefix_pass = "******".format(os.path.normpath(args.prefix)) else: prefix_pass = "******" if args.temp_dir != "": temp_dir = args.temp_dir else: temp_dir = "./" # create a preprocessor object if args.cpp != "": cpp_pass = preprocess.Preprocessor(temp_dir=temp_dir, cpp_cmd=args.cpp, defines=args.defines, f90_preprocess=args.f90_preprocess) else: cpp_pass = None try: doit(prefix_pass, args.search_path.split(), args.files, cpp_pass, debug=args.debug) except: # something went wrong print("$(error something went wrong in dep.py. Remake, adding the option 'DEP_CHECK_OPTS=--debug' to your make command and examine the 'dependencies.out' file)")
        if prediction_word == '<EOS>':
            return decoded_text, sentence, attention_matrix
        decoder_input = tf.expand_dims([prediction_id], 0)

    return decoded_text, sentence, attention_matrix


if __name__ == '__main__':
    input_file = os.path.join('/Users/emielzyde/Desktop/Project/grammar_correction/lang8_preprocess.pickle')
    with open(input_file, 'rb') as f:
        # lang_data = f.readlines()
        lang_data = pickle.load(f)
        # lang_data = lang_data.readlines()

    # Pre-process the data
    data_holder = preprocess.Preprocessor(lang_data, 2000, 'TRAIN')
    (input_dataset, target_dataset, input_table, output_table, max_length_inp,
     max_length_tar, input_word2index, output_word2index, input_index2word,
     output_index2word) = data_holder.finalise_dataset()
    train_input_dataset, val_input_dataset, train_target_dataset, val_target_dataset = train_test_split(
        input_dataset, target_dataset, test_size=TEST_SPLIT)

    # Feeding the data in reverse order helps with training
    # input_dataset = np.flip(input_dataset)

    print('The vocabulary size is {}'.format(len(input_word2index)))

    # Create a dataset
    number_batches = len(train_input_dataset) // BATCH_SIZE
    input_vocab_size = len(input_table.word2index)
    target_vocab_size = len(output_table.word2index)
    dataset = tf.data.Dataset.from_tensor_slices(
        (train_input_dataset, train_target_dataset)).shuffle(len(train_input_dataset))
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]
    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)
    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor([
        './data/squad/train-v1.1.json',
        './data/squad/dev-v1.1.json',
        './data/squad/dev-v1.1.json'
    ])
    train_c, train_q, train_y = ds.get_dataset('./data/squad/train-v1.1.json')
    test_c, test_q, test_y = ds.get_dataset('./data/squad/dev-v1.1.json')
    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    # Load the GloVe word vectors for the samples and initialize the characters
    train_c, train_q, train_y = ds.get_chardataset(
        './data/squad/train-v1.1.json')

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  emb_size=50,
                  max_features=len(ds.charset))
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
import time
import os
import preprocess
import LSTM

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# file = open('./figures/output1.txt', 'w')
# sys.stdout = file

if __name__ == '__main__':
    preprocessor = preprocess.Preprocessor()
    ratio = 0.7
    # preprocessor.visualize_data()
    preprocessor.tokenize_data()
    training_data_x = preprocessor.sequenced_summaries[:int(
        ratio * len(preprocessor.sequenced_summaries))]
    training_data_y = preprocessor.rewards[:int(
        ratio * len(preprocessor.sequenced_summaries))]
    data_x = preprocessor.sequenced_summaries[int(
        ratio * len(preprocessor.sequenced_summaries)):]
    data_y = preprocessor.rewards[int(
        ratio * len(preprocessor.sequenced_summaries)):]
    # print("hey there")
    # for i in range(7):
train_path = "../input/train_data.csv" test_path = "../input/test_data.csv" """ load raw data""" train = pd.read_csv(train_path) test = pd.read_csv(test_path) """ Preprocessing""" import preprocess as pr import impute as im import copy df = train["y"] predata = pd.concat([train.drop("y", axis=1), test], ignore_index=True) predata_copy = copy.deepcopy(predata) predata_onehot = pr.Preprocessor(predata_copy).all("onehot", "nonpub") #predata_label = pr.Preprocessor(predata_copy).all("label", "nonpub") #prep_train_label = pd.concat([df, predata_label.iloc[:len(train), :]], axis=1) #prep_test_label = predata_label.iloc[len(train):, :] num_list = [ "TimeToNearestStation", "TotalFloorArea", "Area", "Frontage", "BuildingYear", "BuildingAge", "Breadth", "CoverageRatio", "FloorAreaRatio", "Period" ] predata_onehot = im.Imputer(predata_onehot).num_imputer(num_list) print(predata_onehot[num_list].isnull().sum()) prep_train_onehot = pd.concat([df, predata_onehot.iloc[:len(train), :]], axis=1)
default="") args = parser.parse_args() defines = args.defines if args.exclude_defines != "": excludes = args.exclude_defines.split() for ex in excludes: defines = defines.replace(ex, "") print("defines: ", defines) if args.cpp != "": cpp_pass = preprocess.Preprocessor(temp_dir=args.output_dir, cpp_cmd=args.cpp, defines=defines) else: cpp_pass = None headers, _ = ffv.find_files(args.vpath, args.headers) cxx, _ = ffv.find_files(args.vpath, args.cxx) # part I: we need to find the names of the Fortran routines that # are called from C++ so we can modify the header in the # corresponding *_F.H file. # A list of specific macros that we want to look for in each target. macro_list = [ 'AMREX_INT_ANYD', 'AMREX_REAL_ANYD', 'BL_TO_FORTRAN_ANYD',
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]
    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)
    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor([
        './data/drcd/DRCD_training.json',
        './data/drcd/DRCD_dev.json',
        './data/drcd/DRCD_training.json'
    ])
    train_c, train_q, train_y = ds.get_dataset('./data/drcd/DRCD_training.json')
    test_c, test_q, test_y = ds.get_dataset('./data/drcd/DRCD_dev.json')
    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  emb_size=128,
                  max_features=len(ds.charset))
    bidaf.build_model()
    bidaf.model.fit([train_c, train_q],
test_path = "../input/test_data.csv" """ load raw data""" train = pd.read_csv(train_path) test = pd.read_csv(test_path) """ Preprocessing""" import preprocess as pr import impute as im import copy df = train["y"] predata = pd.concat([train.drop("y", axis=1), test], ignore_index=True) predata_copy = copy.deepcopy(predata) #predata_onehot = pr.Preprocessor(predata).all("onehot") predata_label = pr.Preprocessor(predata_copy).all("label", "nonpub") prep_train_label = pd.concat([df, predata_label.iloc[:len(train), :]], axis=1) prep_test_label = predata_label.iloc[len(train):, :] """ define data""" train_X = prep_train_label.drop(["y", "id", "Prefecture", "Municipality"], axis=1) train_y = np.log1p(prep_train_label["y"]) test_X = prep_test_label.drop(["id", "Prefecture", "Municipality"], axis=1) """ divine data""" train_X_tyuko = train_X[train_X["Type"] == 1] train_X_tatemono = train_X[train_X["Type"] == 2] train_X_toti = train_X[train_X["Type"] == 3] train_y_tyuko = train_y[train_X_tyuko.index] train_y_tatemono = train_y[train_X_tatemono.index] train_y_toti = train_y[train_X_toti.index]
# -*- coding: utf-8 -*-
# Project name: Evi-Fact
# Edited with PyCharm
# Created by simengzhao on 2018/8/17 at 2:08 PM
# Nanjing University Software Institute
#
import tensorflow as tf
import numpy as np
import json
import re

import preprocess as PP
import model

npk = PP.Preprocessor(False)
GEFG = model.gated_evidence_fact_generation()
dg = npk.data_provider(
    'train_data.json', {
        'NAME': 'GEFG',
        'MEL': GEFG.MAX_EVID_LEN,
        'MEC': GEFG.MAX_EVIDS,
        'MFL': GEFG.MAX_FACT_LEN,
        'BATCH_SIZE': 1
    })

# FIXME: dynamic_rnn() is called without arguments and r2 is never defined;
# these lines appear to be leftover scratch code.
tf.nn.dynamic_rnn()
m1 = tf.placeholder(dtype=tf.float32, shape=[5, 3, 4])
m2 = tf.placeholder(dtype=tf.float32, shape=[1, 3, 4])
r1 = m1
r2[3] = m2
# r1 = tf.reduce_sum(r1, 1)
with tf.Session() as sess: