def train_and_evaluate_RNN(batch_size, lstm_size):
    with open('./datasets/w2v/vocab-raw.txt') as f:
        vocab_size = len(f.read().splitlines())

    tf.set_random_seed(2021)
    tf.reset_default_graph()
    rnn = RNN(vocab_size=vocab_size,
              embedding_size=300,
              lstm_size=lstm_size,
              batch_size=batch_size)
    predicted_labels, loss = rnn.build_graph()
    train_op = rnn.trainer(loss=loss, learning_rate=0.01)

    with tf.Session() as sess:
        train_data_reader = DataReader(
            data_path='./datasets/w2v/20news-trainencoded.txt',
            batch_size=batch_size,
            vocab_size=vocab_size)
        test_data_reader = DataReader(
            data_path='./datasets/w2v/20news-testencoded.txt',
            batch_size=batch_size,
            vocab_size=vocab_size)
        step = 0
        MAX_STEP = 5000

        sess.run(tf.global_variables_initializer())
        while step < MAX_STEP:
            next_train_batch = train_data_reader.next_batch()
            train_data, train_labels, train_sentence_lengths = next_train_batch
            plabels_eval, loss_eval, _ = sess.run(
                [predicted_labels, loss, train_op],
                feed_dict={
                    rnn._data: train_data,
                    rnn._labels: train_labels,
                    rnn._sentence_lengths: train_sentence_lengths,
                },
            )
            step += 1
            if step % 50 == 0:
                print("Step: {}, Loss: {}".format(step, loss_eval))

            # batch_id wraps to 0 after a full pass over the training data,
            # so evaluate on the test set once per epoch
            if train_data_reader._batch_id == 0:
                num_true_preds = 0
                while True:
                    next_test_batch = test_data_reader.next_batch()
                    test_data, test_labels, test_sentence_lengths = next_test_batch
                    test_plabels_eval = sess.run(
                        predicted_labels,
                        feed_dict={
                            rnn._data: test_data,
                            rnn._labels: test_labels,
                            rnn._sentence_lengths: test_sentence_lengths,
                        })
                    matches = np.equal(test_plabels_eval, test_labels)
                    num_true_preds += np.sum(matches.astype(float))
                    if test_data_reader._batch_id == 0:
                        break
                print('Epoch:', train_data_reader._num_epoch)
                print('Accuracy on test data:',
                      num_true_preds * 100 / len(test_data_reader._data))

def trainModel(experiment, testRun, setTarg):
    print("Training model ...")
    datasetTrain = DataReader(experiment.data["path"])
    datasetTrain.setDatasetClassic("train", experiment.data["feature"], experiment.data["annotation"])
    if setTarg == "MeanStd":
        datasetTrain.setTargetMeanStd()
    if testRun:
        datasetTrain = keepOne(datasetTrain)

    datasetDev = DataReader(experiment.data["path"])
    datasetDev.setDatasetClassic("dev", experiment.data["feature"], experiment.data["annotation"])
    if setTarg == "MeanStd":
        datasetDev.setTargetMeanStd()
    if testRun:
        datasetDev = keepOne(datasetDev)

    if testRun:
        experiment.maxEpoch = 1

    inp, tar = datasetDev[0]
    experiment.inputDim = inp.shape[1]
    experiment.outputDim = tar.shape[1]
    # print("experiment.outputDim", tar.shape)

    wrapper = getWrapper(experiment)
    wrapper.trainModel(datasetTrain, datasetDev,
                       batchSize=experiment.batchSize,
                       maxEpoch=experiment.maxEpoch,
                       loadBefore=True,
                       tolerance=experiment.tolerance,
                       minForTolerance=experiment.minForTolerance)
    wrapper.saveLogToCSV()

def generate_templates(inputfilepath, series, filename_noise_psds, vec_r_lim, mat_theta_lim, filename_templates):
    print('generate_templates')
    E_min = 0.
    E_max = 1E12
    V = get_noise_psds(filename_noise_psds)
    gen = TemplateGeneratorNxM(V, calc_r, calc_theta, E_min, E_max, vec_r_lim, mat_theta_lim)

    dr = DataReader()
    dr.OpenFile(inputfilepath, series, 0)
    event_count = 0
    while dr.LoadEvent(trigger='Trigger'):
        gen.IncludeEvent(dr.GetTraces())
        event_count += 1
        if event_count % STEP_MONITOR == 1:
            print('Event', event_count)
    dr.CloseFile()

    templates = gen.GetTemplates()
    if type(templates) == list:
        map_bins_part = gen.GetMapBinsPart()
        save_templates_nxm(templates, E_min, E_max, map_bins_part, filename_templates)
        gen.Draw(PATH + '/png')

def main(argv):
    try:
        rdr = DataReader(argv[1])
        level = int(argv[2])
    except (IndexError, ValueError):
        print "usage: python Extractor.py <bson_file> <level>"
        return
    print level

    with open('catid_to_levelid.json', 'r') as ldict_file:
        cat_dict = json.load(ldict_file)
    with open('l%d_dict.json' % level, 'r') as ldict_file:
        l_dict = json.load(ldict_file)

    category_cnts = [0 for i in xrange(len(l_dict))]
    category_img_cnts = [0 for i in xrange(len(l_dict))]

    cnt = 1
    while 1:
        try:
            tup = rdr.getOne()
            if cnt % 10000 == 0:
                print cnt
            cnt += 1
            catid = tup[0]
            cat_id = cat_dict[str(catid)][level - 1]
            category_img_cnts[cat_id] += len(tup[1])
        except StopIteration:
            print "none"
            break

    # Write the per-category image counts to the histogram file
    with open("l%d_histo.json" % level, 'w') as histo_file:
        json.dump(category_img_cnts, histo_file)

def _readData(self, dataPath, dataType):
    def findURL(dataPath, file):
        from bs4 import BeautifulSoup
        content = open(dataPath + file, 'r', encoding='utf-8').read()
        soup = BeautifulSoup(content, 'html.parser')
        ref = soup.find('link').get('href')
        return ref.split('/')[-1].lower().replace('.html', '')

    dataReader = DataReader(dataPath, dataType, self.project)
    numOfFiles = dataReader.getNumberOfFiles()
    for i in range(numOfFiles):
        _file, context = dataReader.readData(i)
        if dataType == 'UserManual':
            manual = Manual()
            manual.id = i
            manual.name = _file.split('.')[0].lower()
            if self.project == 'komodo':
                manual.url = findURL(dataPath, _file)
            manual.sentences = context
            self.manuals.append(manual)
        elif dataType == 'IssueReport':
            issue = Issue()
            issue.number = _file
            issue.html = context
            self.issues.append(issue)

def main():
    # Delete portfolio.db if it exists
    if os.path.exists("portfolio.db"):
        os.remove("portfolio.db")
        print("portfolio.db removed successfully")

    # Get data paths
    stock_filename = "data_stocks.csv"
    bond_filename = "data_bonds.csv"

    # Initialize dataReader
    dataReader = DataReader(stock_filename, bond_filename)

    # Get stock and bond data
    stockData = dataReader.getStockData()
    bondData = dataReader.getBondData()

    # Initialize an investor
    investor = Investor("Bob", "Smith", "123 Fake St, Denver, CO 80221", "303.777.1234")

    # Initialize a portfolio
    portfolio = Portfolio(investor)

    # Add the stocks and bonds to the portfolio
    portfolio.addStocks(stockData)
    portfolio.addBonds(bondData)

    # Initialize a report
    report = Report(portfolio)

    # Print the report
    report.print()

def identifier(self):
    # for each data folder
    for i in range(0, len(self.data_list)):
        data_folder = self.data_dir + self.data_list[i] + '/estimated/txt/'

        # read the data from the txt files
        dff = DataReader(data_folder)
        data, time = dff.reader()

        # compute the number of frames needed to identify a stance phase
        frameTime = float(time[1]) - float(time[0])
        framesRange = round(0.15 / frameTime)

        # extract the ankle coordinates
        lankle = []
        for k in range(0, len(data[14])):
            lankle.append([data[14][k][0], data[14][k][1]])

        # eliminate deburrings
        lankle = self.deburringsElimination(lankle, framesRange)

        # identify the points at which a stance phase begins
        stanceId = StancesIdentifier(lankle, framesRange)
        print("\n\n\t" + self.data_list[i])
        stanceBeginnings = stanceId.identifier()

        # identify the beginning and the end of each stride
        strides = self.stridesIdentification(stanceBeginnings)

        # store the information about each stride
        ss = StridesStorage(self.data_list[i], data, time, strides)
        ss.storage()

def __init__(self, seg_type='jieba', data=DEV_DATA, fill='NULL', needfill=True):
    self.base_fname = data.split('.')[0]
    self.word_file = self.base_fname + '.word'
    self.bin_file = self.base_fname + '.bin'
    self.seg_file = self.base_fname + '.seg'
    self.vec_file = self.base_fname + '.vec'
    self.seg = Segmentation(seg_type)
    self.dr = DataReader(data)
    self.dr.filt()
    self.qu = []
    self.wr_ans = []
    self.cor_ans = []
    self.qu_vec = []
    self.wr_ans_vec = []
    self.cor_ans_vec = []
    self.max_len = 0
    self.fill_len = 0
    self.fill = fill
    # self.needfill = needfill
    self.freq = [0] * 1050
    self.stop_word = self._load_stop_word()

def __init__(self, ex_type, coin, min_tick_size, start_time, end_time,
             delay=timedelta(milliseconds=100), path='./'):
    '''
    Initiate an Exchange of a specific ex_type.

    Inputs:
        path, request, ex_type: used for getting data, see DataReader
        delay: trading-operation delay, defaults to 100 ms
        start_time, end_time: datetime objects specifying the start and end of the backtest
    '''
    start_date = start_time.date()
    end_date = end_time.date()

    # Read data for the specific coin type: first try the local pickle;
    # if it does not exist, request the data from the web.
    try:
        print('Reading data from {}'.format('{}data/{}.pkl'.format(path, coin)))
        self.data = pd.read_pickle('{}data/{}.pkl'.format(path, coin))[start_time:end_time]
    except Exception:
        print('Pickle file does not exist, trying to request data using DataReader:')
        dr = DataReader(ex_type)
        dr.read_data(start_date, end_date, path=path, request=True)
        try:
            print('Request completed, now reading data from {}'.format(
                '{}data/{}.pkl'.format(path, coin)))
            self.data = pd.read_pickle('{}data/{}.pkl'.format(path, coin))[start_time:end_time]
        except Exception:
            raise Exception('Data for {} not available from {}'.format(coin, ex_type))

    # orders: submitted orders that are not yet executed; currently one dict of lists for a single strategy
    self.orders = {'Buy': [], 'Sell': []}
    self.order_prices = []
    # subscriber: for convenience only one strategy is kept; may later become a list of strategies
    self.subscriber = None
    # current_time: current time point of the exchange
    self.current_time = start_time
    self.DELAY = delay
    self.end_time = end_time
    self.last_sell_price = np.inf
    self.last_buy_price = 0
    self.coin = coin
    self.timestamps = self.data.index.to_list()
    # heap of (time, order, coin)
    self.queue = []
    self.min_tick_size = min_tick_size

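# --- Illustrative usage sketch (not from the original sources) ---
# A minimal example of constructing the backtesting exchange defined by the
# __init__ above. The enclosing class name (Exchange), the ex_type string,
# the coin symbol, and the tick size are assumptions made purely for
# illustration; a pickled DataFrame is assumed to exist at ./data/BTC.pkl.
from datetime import datetime, timedelta

exchange = Exchange(ex_type='some_exchange',       # hypothetical exchange identifier
                    coin='BTC',                     # hypothetical coin symbol
                    min_tick_size=0.01,             # hypothetical tick size
                    start_time=datetime(2020, 1, 1),
                    end_time=datetime(2020, 1, 31),
                    delay=timedelta(milliseconds=100))
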
def getSimilarityScores(self, testingIcon, numOfNeighbours, min_libs, testingIcons):
    numOfNeighbours = 50  # fixed neighbour count, overriding the argument
    score_dict = {}
    a = self.datas[testingIcon]
    sha256_a = a[5]
    dataReader = DataReader(Database_name)
    filtering_res = dataReader.query_detail(sha256_a, a[3], a[10], a[6], testingIcons, self.if_apk)
    for item in filtering_res:
        b = self.datas[item]
        sha256_b = b[5]
        libs = b[-2].split(";")
        if len(libs) < min_libs:
            continue
        # calculate the simScore of testingIcon
        simCounter = SimCounter()
        score = simCounter.image_similarity_score(a[8], sha256_a, b[8], sha256_b)
        score_dict[item] = score

    score_lst = dict2sortedlist(score_dict)[:numOfNeighbours]
    topn = {}
    flag = 0
    if len(score_lst) >= numOfNeighbours:
        for item in score_lst:
            if item[1]:
                flag = 1
            topn[item[0]] = item[1]
        if flag:
            return topn
        else:
            return None
    else:
        return None

def make_readable(locale_path, recipes):
    end_pattern = r'=(?P<new_name>.*)'
    d = DataReader(locale_path)

    remove_recipes = []
    for entry in recipes:
        pattern = entry.name + end_pattern
        match = re.search(pattern, d.content)
        if match:
            entry.name = match.group("new_name")
        else:
            remove_recipes.append(entry)
            continue

        remove_resources = []
        for pair in entry.get_resources().items():
            pattern = pair[0].name + end_pattern
            match = re.search(pattern, d.content)
            if match:
                pair[0].name = match.group("new_name")
            else:
                remove_resources.append(pair[0])
                continue

        for resource in remove_resources:
            entry.remove_resource(resource)
        if entry.number_of_resources() == 0:
            remove_recipes.append(entry)

    for entry in remove_recipes:
        recipes.remove(entry)
    return recipes

def get_tweets_labels(tweet_file, labels_file, tests_file):
    # Simply read in the data
    data_reader = DataReader(tweet_file, labels_file, tests_file)
    tweets = data_reader.read_tweets()
    labels = data_reader.read_labels()
    tests = data_reader.read_tests()
    return tweets, labels, tests

def getSimilarityScores(self, testingIcon, numOfNeighbours, min_libs, testingIcons):
    numOfNeighbours = 50  # fixed neighbour count, overriding the argument
    score_dict = {}
    a = self.datas[testingIcon]
    sha256 = a[5]
    dataReader = DataReader(Database_name)
    filtering_res = dataReader.query_detail(sha256, a[3], a[10], a[6], testingIcons, self.if_apk)
    for item in filtering_res:
        b = self.datas[item]
        libs = b[-2].split(";")
        if len(libs) < min_libs:
            continue
        # calculate the simScore of testingIcon
        simCounter = SimCounter()
        score1 = simCounter.edit_distance(a[1], b[1])
        score2 = simCounter.edit_distance(a[2], b[2])
        score3 = simCounter.edit_distance(a[4], b[4])
        score_dict[item] = self.alpha * score1 + self.beta * score2 + self.gamma * score3

    score_lst = dict2sortedlist(score_dict)[:numOfNeighbours]
    print(score_lst)
    topn = {}
    flag = 0
    if len(score_lst) >= numOfNeighbours:
        for item in score_lst:
            if item[1]:
                flag = 1
            topn[item[0]] = item[1]
        if flag:
            return topn
        else:
            return None
    else:
        return None

def run(args):
    # Load data
    data = DataReader(args.input)

    # Determine data for validation
    data.loadFeatures = False
    isVal = np.array([x['Validation'] for x in data])
    isTrn = np.logical_not(isVal)
    hasEmo = np.array([not np.any(np.isnan(x['Emotion'])) for x in data])
    isTrnEmo = np.logical_and(isTrn, hasEmo)

    # Create samplers
    bsVal = BatchSampler(SubsetSampler(np.where(isVal)[0]), args.batchSize, False)
    dlVal = DataLoader(data, collate_fn=data.collate, batch_sampler=bsVal)
    bsTrn = BatchReplace(np.where(isTrn)[0], args.batchSize)
    dlTrn = DataLoader(data, collate_fn=data.collate, batch_sampler=bsTrn)
    itTrn = iter(dlTrn)
    bsTrnEmo = BatchReplace(np.where(isTrnEmo)[0], args.batchSize)
    dlTrnEmo = DataLoader(data, collate_fn=data.collate, batch_sampler=bsTrnEmo)
    itTrnEmo = iter(dlTrnEmo)

    # Get output weights using train and val data
    allEmo = np.stack([x['Emotion'] for x in data])
    allDs = np.stack([x['Dataset'] for x in data])
    wEmo = np.nansum(allEmo, axis=0)
    wDs = np.nansum(allDs, axis=0)
    wEmo = np.sum(wEmo) / (wEmo * len(wEmo))
    wDs = (np.sum(wDs) - wDs) / wDs
    data.loadFeatures = True

    # Setup model
    model = Maddog(data.featLen, wDs, wEmo, args)

    # Get validation ground truth
    valActual = np.concatenate([x['Emotion'] for x in dlVal])
    valActual = np.argmax(valActual, axis=-1)

    # Loop through all epochs
    bestUar = None
    for ep in range(args.maxEpochs):
        # Train for one epoch
        model.Fit(itTrn, itTrnEmo)

        # Predict VAL
        valPred = np.argmax(model.Predict(dlVal), axis=-1)
        valUar = calcUar(valActual, valPred)

        # Print UAR
        if args.verbose:
            print('Epoch', ep, ' UAR:', valUar)

        # Check for best val UAR
        if bestUar is None or valUar > bestUar:
            bestUar = valUar
            model.Save(args.model)

def __init__(self):
    self.dr = DataReader()
    self.score = self.dr.get_score_data()
    self.cb_matrix = None
    self.interacted_cb_matrix = None
    self.cl_matrix = None
    self.ensemble_matrix = None
    self.tech_keyword_matrix = self.get_tech_keyword_matrix()

def test_data_reader_assigned_duration():
    """Tests that DataReader overrides the default duration when one is
    assigned during construction.

    Returns
    -------
    None
    """
    dr = DataReader("test_file.csv", (0, 2))
    assert dr.duration == (0, 2)

def test_read_csv_voltage():
    """Tests the read_csv function of the data reader for reading in the
    voltage numpy array from the csv file.

    Returns
    -------
    None
    """
    dr = DataReader("test_file.csv")
    expected_voltage = np.array([10, 15, 20])
    assert np.array_equal(dr.output_dict["voltage"], expected_voltage)

def __init__(self, graph_file_path):
    self.graph_file_path = graph_file_path
    dr = DataReader(graph_file_path)
    gr = dr.readGraphFile()
    self.graph = gr  # adjacency list read from the graph file
    self.num_nodes = len(gr)
    self.degree = [len(l) for l in gr]
    self.community_record = None
    self.std_Q = None
    self.P_Mat = None

def test_validate_csv_file_bad_file_extension():
    """Tests the validate_csv_file function for raising a ValueError when
    there is a bad file extension (i.e. one that is not .csv).

    Returns
    -------
    None
    """
    with pytest.raises(ValueError):
        dr = DataReader("BadExtensionTest.txt")
        dr.validate_csv_file("BadExtensionTest.txt")

def test_read_csv_time():
    """Tests the read_csv function of the data reader for reading in the
    time numpy array from the csv file.

    Returns
    -------
    None
    """
    dr = DataReader("test_file.csv")
    expected_time = np.array([0, 1, 2])
    assert np.array_equal(dr.output_dict["time"], expected_time)

def test_data_reader_default_duration():
    """Tests that the data reader assigns a default value to the duration
    parameter if one is not assigned during construction. The default value
    should be the min and max values of the time array.

    Returns
    -------
    None
    """
    dr = DataReader("test_file.csv")
    assert dr.duration == (0, 2)

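# --- Illustrative fixture sketch (not from the original test suite) ---
# The tests above expect a test_file.csv whose time column is [0, 1, 2] and
# whose voltage column is [10, 15, 20]. This hypothetical helper writes such a
# file; the real fixture may include a header row or different formatting.
import csv

def write_test_fixture(path="test_file.csv"):
    rows = zip([0, 1, 2], [10, 15, 20])  # (time, voltage) pairs used by the tests
    with open(path, "w", newline="") as f:
        csv.writer(f).writerows(rows)
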
def main(): print("Running on BIO-NLP data\n\n") home_dir = "../dl4nlp" # The hyper-parameters of the word embedding trained model window_size = 5 embed_vector_size = 50 min_count = 1000 # Define the data files data_folder = os.path.join("..\\", "sample_data", "drugs_and_diseases") train_file_path = os.path.join(data_folder, "Drug_and_Disease_train.txt") test_file_path = os.path.join(data_folder, "Drug_and_Disease_test.txt") data_file_path = os.path.join(data_folder, "unlabeled_test_sample.txt") resources_pickle_file = os.path.join(home_dir, "models", "resources.pkl") embedding_pickle_file = os.path.join(home_dir, "models", "w2vmodel_pubmed_vs_{}_ws_{}_mc_{}.pkl" \ .format(embed_vector_size, window_size, min_count)) print("embedding_pickle_file= {}".format(embedding_pickle_file)) # The hyperparameters of the LSTM trained model #network_type= 'unidirectional' network_type = 'bidirectional' num_layers = 2 num_hidden_units = 150 num_epochs = 10 batch_size = 50 dropout = 0.2 reg_alpha = 0.0 model_file_path = os.path.join(home_dir,'models','lstm_{}_model_units_{}_lyrs_{}_epchs_{}_vs_{}_ws_{}_mc_{}.h5'.\ format(network_type, num_hidden_units, num_layers, num_epochs, embed_vector_size, window_size, min_count)) print("Training the model... num_epochs = {}, num_layers = {}, num_hidden_units = {}".\ format(num_epochs, num_layers,num_hidden_units)) reader = DataReader() entityExtractor = EntityExtractor(reader, embedding_pickle_file) entityExtractor.train (train_file_path, \ output_resources_pickle_file = resources_pickle_file, \ network_type = network_type, \ num_epochs = num_epochs, \ batch_size = batch_size, \ dropout = dropout, \ reg_alpha = reg_alpha, \ num_hidden_units = num_hidden_units, \ num_layers = num_layers) #Save the model entityExtractor.save(model_file_path) print("Done.")
def main():
    # Get data path
    filename = "allStocks.json"

    # Initialize dataReader and get data
    dataReader = DataReader(filename)
    data = dataReader.getData()

    # Initialize a report and call print
    report = Report(data)
    report.print()

def test_validate_csv_file_bad_file_name():
    """Tests that validate_csv_file raises a FileNotFoundError when a file
    that does not exist is passed as the csv_file_name argument to DataReader.

    Returns
    -------
    None
    """
    with pytest.raises(FileNotFoundError):
        dr = DataReader("random_file_name.csv")
        dr.validate_csv_file("random_file_name.csv")

def __init__(self):
    self.dr = DataReader()
    self.score = self.dr.get_score_data()
    self.tech_keyword_matrix = self.get_tech_keyword_matrix()
    self.tech_keyword_sim_matrix = None
    self.item_based_sim_matrix = None
    self.ensemble_sim_matrix = None
    # Auxiliary variable
    self.tech_id_set_for_item_based = None

def cluster_on_graph(self, rwfile):
    if not os.path.exists(rwfile):
        print("no random walk file provided, exit...")
        exit(1)
    dreader = DataReader(rwfile)
    rwList = dreader.readRWFile()

    # Initialize every node in its own singleton cluster
    Clusters = [[x] for x in range(self.graph_file.num_nodes)]
    self.community_cluster = Clusters
    return Clusters

def main():
    logging.basicConfig(filename="HRM_logs.txt",
                        format='%(asctime)s %(levelname)s:%(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)  # record the info-level messages logged below
    file_name = get_file_name()
    wants_duration = get_wants_duration()
    try:
        if wants_duration:
            duration = get_duration()
            dr = DataReader(file_name, duration)
        else:
            dr = DataReader(file_name)
        hrm = HRM_Processor(dr)
        dw = DataWriter(hrm)
    except (FileNotFoundError, ValueError, TypeError):
        logging.info("Driver script terminated unsuccessfully.")
        return
    logging.info("Successful termination of HRM_Driver")

def testModel(experiment, testRun, setTarg):
    # print("Testing model ...")
    dataset = DataReader(experiment.data["path"])
    dataset.setDatasetClassic("test", experiment.data["feature"], experiment.data["annotation"])
    if setTarg == "MeanStd":
        dataset.setTargetMeanStd()

    inp, tar = dataset[0]
    experiment.inputDim = inp.shape[1]
    experiment.outputDim = tar.shape[1]

    firstID1 = list(dataset.dataPart.keys())[0]
    firstID2 = list(dataset.dataPart[firstID1]["annotations"])[0]
    headers = dataset.dataPart[firstID1]["annotations"][firstID2]["headers"]
    if setTarg == "MeanStd":
        headers = ["mean", "std"]
    # print(headers)

    wrapper = getWrapper(experiment, getBest=True)
    modelOutPath = os.path.join(wrapper.savePath, "ouputs")

    if testRun:
        dataset = keepOne(dataset)
    IDs = dataset.dataPart.keys()

    for key in experiment.evaluation.keys():
        metrics = {}
        for idx, ID in enumerate(IDs):
            savePath = os.path.join(modelOutPath, ID + ".csv")
            outputs = pd.read_csv(savePath).to_numpy()
            targets = dataset.targetReader(ID)
            # print(targets.shape, outputs.shape)
            if idx == 0:
                results = [[] for _ in range(targets.shape[1])]
                # bestresult = 0; bestID = "0"
                for dim in range(targets.shape[1]):
                    metrics[headers[dim]] = {}
            for dim in range(targets.shape[1]):
                output = outputs[:, dim]
                target = targets[:, dim]
                # Pad or truncate the output so its length matches the target
                while target.shape[0] > output.shape[0]:
                    output = np.append(output, outputs[-1])
                while target.shape[0] < output.shape[0]:
                    output = outputs[:target.shape[0]].reshape(target.shape[0])
                result = getMetric(target, output, metric=key)
                # if result > bestresult: bestresult = result; bestID = ID
                # print(ID, result, len(output))
                results[dim].append(result)
            printProgressBar(idx + 1, len(IDs),
                             prefix='Testing model with ' + key + ':',
                             suffix='', length="fit")
        for dim in range(targets.shape[1]):
            metrics[headers[dim]]['mean'] = np.mean(np.array(results[dim]))
            metrics[headers[dim]]['std'] = np.std(np.array(results[dim]))
        experiment.evaluation[key] = metrics
    return experiment

def eval_one_data(random_seed):
    data_reader = DataReader()
    data = DataObject(data_reader, 1, random_seed=random_seed)
    result = []

    recommender = ItemKNNCFRecommender(data.urm_train)
    recommender.fit(topK=args["topK"], shrink=args["shrink"],
                    similarity=args["similarity"],
                    feature_weighting=args["feature_weighting"])

    for n, users, description in data.urm_train_users_by_type:
        eval, map = MyEvaluator.evaluate_algorithm(data.urm_test, users, recommender,
                                                   at=10, remove_top=0)
        result.append(map)

    users = data.ids_target_users
    eval, map = MyEvaluator.evaluate_algorithm(data.urm_test, users, recommender,
                                               at=10, remove_top=0)
    result.append(map)
    return result

def __init__(self, oscars):
    self.Data = DataReader("tmdb-movies.csv")
    self.Data.formatData()
    self.OscarFile = pd.read_csv(oscars)
    self.ActorsDictionary = {}
    self.MovieDF = self.Data.getMovieDF()
    self.Categories = ["ACTOR", "ACTRESS",
                       "ACTOR IN A SUPPORTING ROLE", "ACTRESS IN A SUPPORTING ROLE",
                       "ACTOR IN A LEADING ROLE", "ACTRESS IN A LEADING ROLE"]
    self.OutputData = self.Data.getOutput()
    self.cleanOscarData()