def weight_extract(model, optimizer, criterion, train_loader, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if torch.cuda.is_available():
            data, target = Variable(data.cuda()), Variable(target.cuda())
        else:
            data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)

        utils.c = target.view(-1, 1)  # batch array torch.tensor[128]
        utils.c = utils.c.type(torch.cuda.FloatTensor)
        utils.weight_extract_densenet(model.module)

        for i in utils.c:
            for j in i:
                utils.str_w = utils.str_w + str(j.tolist()) + ','
            utils.str_w += '\n'

        utils.save_to_csv()
        utils.str_w = ''

        if batch_idx % 100 == 0:
            print('Epoch: {}'.format(epoch))
def weight_extract_test(model, criterion, test_loader):
    utils.csv_file_name = 'weight_test.csv'
    model.eval()
    for batch_idx, (data, target) in enumerate(test_loader):
        if torch.cuda.is_available():
            data, target = Variable(data.cuda()), Variable(target.cuda())
        else:
            data, target = Variable(data), Variable(target)
        output = model(data)
        loss = criterion(output, target)

        utils.c = target.view(-1, 1)  # batch array torch.tensor[128]
        utils.c = utils.c.type(torch.cuda.FloatTensor)
        utils.weight_extract(model.module)

        for i in utils.c:
            for j in i:
                utils.str_w = utils.str_w + str(j.tolist()) + ','
            utils.str_w += '\n'

        utils.save_to_csv()
        utils.str_w = ''

        print('weight test extract')
def collecting_handler(message):
    """Collect orders"""
    # open sheet
    sh = gc.open_by_key(SPREADSHEET_ID)
    if message.chat.type == "group" or message.chat.type == "private":
        print(message)
        # read each sheet
        wks_quay_dau = sh.worksheet(ROTATION_NAME)
        # read data from group and write to sheets
        if message.chat.title == "KIỂM KHO QUẬN 7" or message.chat.title == "test":
            file_uri = f"./assets/csv/kiem_kho_{datetime.fromtimestamp(message.date).strftime('%Y-%m-%d')}.csv"
            # save text to csv file
            save_to_csv(file_uri, message)
            bot.send_message(message.chat.id, "Đã lưu vào file csv.")  # "Saved to the csv file."
        elif message.chat.title == "Đơn Quay Đầu - Bắn Kiểm Thiếu":
            # save to Quaydau_bot sheet
            save_to_sheet(wks_quay_dau, message)
            # Message below: "The order code has been saved to Google Sheets. Those named,
            # please complete the return order within the day. Thanks."
            bot.send_message(
                message.chat.id,
                "Mã đơn hàng đã được lưu vào google sheets. Các anh chị có tên vui lòng hoàn tất đơn quay đầu trong ngày. Thanks.",
            )
        else:
            bot.send_message(message.chat.id, "Nothing to do...")
def main():
    transactions = pd.read_csv(INPUT_PATH,
                               sep="\t",
                               names=['user_id', 'item_id', 'rating', 'time'],
                               engine='python')
    print(transactions.head())

    # convert to implicit scenario
    # transactions['rating'] = 1
    print(transactions.head())

    # make the dataset
    train_df, test_df = get_train_test_df(transactions)
    save_to_csv(train_df, OUTPUT_PATH_TRAIN, header=False, index=False, verbose=1)
    save_to_csv(test_df, OUTPUT_PATH_TEST, header=False, index=False, verbose=1)
    report_stats(transactions, train_df, test_df)
    return 0
def calculate_cdf(self, df, node):
    node_difference = node + '_difference'
    # count the occurrences of each latency-difference value
    stats_df = (df.groupby([node_difference])[node_difference]
                  .agg('count')
                  .pipe(pd.DataFrame)
                  .rename(columns={node_difference: 'frequency'}))
    # probability density and cumulative distribution
    stats_df['pdf'] = stats_df['frequency'] / sum(stats_df['frequency'])
    stats_df['cdf'] = stats_df['pdf'].cumsum()
    stats_df = stats_df.reset_index()
    if not os.path.exists(self.path):
        os.mkdir(self.path)
    utils.save_to_csv(stats_df, self.path, 'latency_' + node)
    return stats_df
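# NOTE (illustrative sketch, not part of the original project): utils.save_to_csv(stats_df,
# self.path, 'latency_' + node) is not defined in this snippet. Assuming it is a thin wrapper
# around pandas.DataFrame.to_csv that joins the directory and file name, a minimal version
# could look like the hypothetical helper below.
import os

import pandas as pd


def save_to_csv(df: pd.DataFrame, path: str, name: str) -> str:
    """Hypothetical helper: write `df` to `<path>/<name>.csv` and return the file path."""
    os.makedirs(path, exist_ok=True)
    file_path = os.path.join(path, name + '.csv')
    df.to_csv(file_path, index=False)
    return file_path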
def deploy_zone_prediction():
    """
    generate java model for zone prediction
    :return: training result and java model
    """
    dir = "log/peps normal"
    pattern = r'(left|right|front|back|start|trunk|lock)\\\d{1,2}.csv$'
    pattern_valid = r'(3|6|9|12).csv$'
    utils.construct_set(dir, pattern, pattern_valid, filter=1)
    utils.save_to_csv()

    id = 'EightNormal'
    dir_path = 'model/'
    rf = utils.train_rf(model_id=id, ntrees=25, weight_lock=1)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    rf.download_pojo(path=dir_path, get_genmodel_jar=False)
def _create_set(tweets, set_name, index_name, nb_authors, previous_authors,
                author_mode, nb_tweets_per_author, nb_pos_pairs_per_tweet,
                nb_neg_pairs_per_tweet, previous_tweets, tweet_mode, texts):
    print("pairing tweets...")
    authors, tweet_ids, pairs = _pair_tweets(tweets, nb_authors, previous_authors,
                                             author_mode, nb_tweets_per_author,
                                             nb_pos_pairs_per_tweet,
                                             nb_neg_pairs_per_tweet,
                                             previous_tweets, tweet_mode)
    print("converting format...")
    dataset = _convert_format(pairs, texts)
    print("saving...")
    save_to_csv(dataset, "data/datasets/" + set_name + ".csv", index_name)
    return authors, tweet_ids
def main(): """Main function that executes all of the data alterations and checks against the input data. First, the function iterates through all of the files in the data directory, and creates a dictionary object containing the file name in file_name and file contents in file. Afterwards, we check the encoding of the file with utils.process_encoding, and attempt to force convert the file to UTF-8 if it is not in that format already. Then, the delimiter of the file is taken from utils.get_file_delim. Next, utils.replace_newline is used to make the new line/line breaks to '\n' Finally, utils.row_by_row_check is run to parse each row and fix any common data issues. Args: There are no arguments as the program parses all the files in the data directory. Returns: None. For each file in the data director, a copy of its cleaned data is saved to the processed_data/cleaned_files directory, and a copy of its dirty data is saved to the processed_data/dirty_files directory. """ for filename in os.listdir('data'): with open('data/' + filename) as raw_data: data_file = utils.get_file_metaddata(raw_data, filename) utf_encoding, data_file['file'] = utils.process_encoding( data_file['file']) if not utf_encoding: utils.save_to_csv(data_file, data_file['file'], 'bad_files') continue has_delim, delimiter = utils.get_file_delim(data_file['file']) if not has_delim: utils.save_to_csv(data_file, data_file['file'], 'bad_files') continue data_file['file'] = utils.replace_newline(data_file['file']) header = data_file['file'].pop(0) header_length = len(header.split(',')) cleaned_data, dirty_data = utils.row_by_row_check( data_file['file'], delimiter, header_length) if cleaned_data: utils.save_to_csv(data_file, header + '\n' + cleaned_data, 'cleaned_files') if dirty_data: utils.save_to_csv(data_file, header + '\n' + dirty_data, 'dirty_files')
def saveTrace(self, checked):
    # diag = QFileDialog.getSaveFileName(self, "Select destination", "./", "Comma Separated Values (*.csv)")
    diag = QFileDialog(self)
    diag.setAcceptMode(QFileDialog.AcceptSave)  # Save file, not open one
    diag.setNameFilter("Comma Separated Values (*.csv);;Space separated Values (*.csv)")
    diag.setDefaultSuffix("csv")  # Make sure selected files end in .csv
    diag.exec()
    try:
        filename = diag.selectedFiles()[0]
    except IndexError:
        filename = ''
    user_filter = diag.selectedNameFilter()
    if user_filter == "Space separated Values (*.csv)":
        delimiter = " "
    else:
        delimiter = ","
    if filename != '' and not os.path.isdir(filename):
        npzfile = self.last_result
        t = npzfile["t"]
        cos2 = npzfile["cos2"]
        cos2d = npzfile["cos2d"]
        extra_header = []
        extra_columns = []
        if 'Javg' in npzfile.keys():
            Javg = npzfile["Javg"]
            std = npzfile["std"]
            psi_pulse = npzfile["psi"]
            psi_final = psi_pulse[-1]
            psi_out = numpy.abs(psi_final)**2
            percentile_999 = npzfile["percentile_999"]
            extra_header = ["<J>", "std(J)", "J_99.9%", "Probability coefficients"]
            extra_columns = [Javg, std, percentile_999, psi_out]
        utils.save_to_csv(filename, t, cos2, cos2d, extra_header, extra_columns, delimiter)
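# NOTE (illustrative sketch, not part of the original code): utils.save_to_csv above is
# expected to write the time axis, the cos^2 traces and any extra columns side by side.
# Assuming the columns may differ in length (the final |psi|^2 coefficients are indexed by J
# rather than by time), a minimal writer that pads the shorter columns could look like this.
import csv
from itertools import zip_longest


def save_to_csv(filename, t, cos2, cos2d, extra_header=None, extra_columns=None, delimiter=","):
    """Hypothetical writer: dump t, cos2, cos2d plus optional extra columns as delimited text."""
    extra_header = list(extra_header or [])
    extra_columns = [list(c) for c in (extra_columns or [])]
    header = ["t", "cos2", "cos2d"] + extra_header
    columns = [list(t), list(cos2), list(cos2d)] + extra_columns
    with open(filename, "w", newline="") as f:
        writer = csv.writer(f, delimiter=delimiter)
        writer.writerow(header)
        # zip_longest pads the shorter columns with empty cells
        writer.writerows(zip_longest(*columns, fillvalue=""))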
def scanimg(self, image_data):
    """Submit the form; expects a base64-encoded image."""
    try:
        data = {"accuracy": self.quality, "image": image_data}
        response = requests.post(self.api_url, data=data, headers=self.header)
        if response.status_code != 200:
            print(time.ctime()[:-5], "Failed to get info")
            return None
        result = response.json()["words_result"]
        print("- ", result)
        # CSV column names are kept in Chinese; English equivalents in the comments.
        invoice_data = {
            '检索日期': '-'.join(time.ctime().split()[1:3]),   # retrieval date
            "发票类型": result["InvoiceType"],                  # invoice type
            '发票代码': result['InvoiceCode'],                  # invoice code
            '发票号码': result['InvoiceNum'],                   # invoice number
            '开票日期': result['InvoiceDate'],                  # issue date
            '合计金额': result['TotalAmount'],                  # total amount
            '税率': result['CommodityTaxRate'][0]['word'],      # tax rate
            '合计税额': result['TotalTax'],                     # total tax
            '价税合计': result['AmountInFiguers'],              # amount including tax
            '销售方名称': result['SellerName'],                 # seller name
            '销售方税号': result['SellerRegisterNum'],          # seller tax ID
            '购方名称': result['PurchaserName'],                # purchaser name
            '购方税号': result['PurchaserRegisterNum'],         # purchaser tax ID
            '备注': result['Remarks'],                          # remarks
        }
        save_to_csv(invoice_data, "test.csv")
        return invoice_data
    except Exception:
        message = "发票识别API调用出现错误"  # "The invoice recognition API call failed"
        # Pushover.push_message(message)
        return None
    finally:
        print(time.ctime()[:-5], "产生一次了调用")  # "one API call made"
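# NOTE (illustrative sketch, not from the original script): save_to_csv(invoice_data, "test.csv")
# is assumed here to append one invoice per row and to write the header on first use; a minimal
# version under that assumption:
import csv
import os


def save_to_csv(row: dict, filename: str) -> None:
    """Hypothetical helper: append one dict as a CSV row, writing the header the first time."""
    write_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
    # utf-8-sig keeps the Chinese column names readable when the file is opened in Excel
    with open(filename, "a", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=list(row.keys()))
        if write_header:
            writer.writeheader()
        writer.writerow(row)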
def save_artist_data(self, artist_data):
    print('Save artist data:', artist_data.artist_information)
    utils.save_to_csv(artist_data.artist_information.name, artist_data.events, 'events')
    utils.save_to_csv(artist_data.artist_information.name, artist_data.setlists, 'setlists')
    utils.save_to_csv(artist_data.artist_information.name, artist_data.recordings, 'recordings')
def egalitarian_score(self, save_plot=True, label=None):
    TIME_LIMIT = 5
    msgs_df = self.all_messages.copy()

    # determine if node was in sync
    for node in self.all_nodes:
        msgs_df[node + '_sync'] = (msgs_df[node].subtract(msgs_df['time']) < TIME_LIMIT).astype(int)

    # calculate how many nodes were in sync
    nodes_sync = [x + '_sync' for x in self.all_nodes]
    msgs_df['totals'] = msgs_df[nodes_sync].sum(axis=1)

    # calculate normalized value of how many nodes were out of sync
    msgs_df['egalitarian_score'] = (len(self.all_nodes) - msgs_df['totals']) / len(self.all_nodes)

    if self.plot_all:
        utils.save_to_csv(msgs_df, self.path, 'egalitarian_score')

    df_plot = msgs_df[msgs_df['time'] < (self.simulation_time - TIME_LIMIT)]
    plt.plot(df_plot['time'], df_plot['egalitarian_score'],
             label=label, linewidth=0.9, markevery=3.5)

    # mean of the egalitarian score; the last interval is left out because results there
    # can be skewed by messages not yet delivered before the simulation ends
    mean = df_plot['egalitarian_score'].mean()
    print('Egalitarian score: %.2f' % mean)

    if save_plot:
        # set y-axis from 0 to max
        axes = plt.gca()
        # axes.set_ylim([0, 0.2])
        plt.title('Egalitarian Score=%.2f' % mean)
        plt.ylabel('nodes')
        plt.xlabel('time (s)')
        plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
        plt.subplots_adjust(right=0.81)
        self.save_to_file('egalitarian_score', 'all')

    return mean
def saveTrace(self, checked):
    # diag = QFileDialog.getSaveFileName(self, "Select destination", "./", "Comma Separated Values (*.csv)")
    diag = QFileDialog(self)
    diag.setAcceptMode(QFileDialog.AcceptSave)  # Save file, not open one
    diag.setNameFilter("Comma Separated Values (*.csv);;Space separated Values (*.csv)")
    diag.setDefaultSuffix("csv")  # Make sure selected files end in .csv
    diag.exec()
    try:
        filename = diag.selectedFiles()[0]
    except IndexError:
        filename = ''
    user_filter = diag.selectedNameFilter()
    if user_filter == "Space separated Values (*.csv)":
        delimiter = " "
    else:
        delimiter = ","
    if filename != '' and not os.path.isdir(filename):
        npzfile = self.last_result
        t = npzfile["t"]
        cos2 = npzfile["cos2"]
        cos2d = npzfile["cos2d"]
        extra_header = []
        extra_columns = []
        if 'Javg' in npzfile.keys():
            Javg = npzfile["Javg"]
            std = npzfile["std"]
            psi_pulse = npzfile["psi"]
            psi_final = psi_pulse[-1]
            psi_out = numpy.abs(psi_final)**2
            percentile_999 = npzfile["percentile_999"]
            extra_header = ["<J>", "std(J)", "J_99.9%", "Probability coefficients"]
            extra_columns = [Javg, std, percentile_999, psi_out]
        utils.save_to_csv(filename, t, cos2, cos2d, extra_header, extra_columns, delimiter)
def _save_records_to_csv(self, records):
    """
    Take the records and save them to a csv file.

    Parameters
    ----------
    records: List[dict]

    Returns
    -------
    path: str
        The path of the csv file where the records have been saved.
    """
    if records:
        try:
            # save_to_csv has a side effect: it writes the records to a file and returns
            # the path it wrote to. We log that path (together with the headers and the
            # date) so we can later assert that the same csv file was used to write the data.
            path = save_to_csv(self.headers, self.csvfile, records)
            assert self.csvfile == path
            self.log.debug("csv file {path}".format(path=path))
            return path
        except TypeError:
            information = "cannot save to csv as start date is not set"
            self.log.debug(information)
            raise ValueError(information)
        except AttributeError as e:
            p = "the return value of the functions has to be an iterable"
            self.log.debug(p)
            raise ValueError(p)
test_data = build_loader(mode='test', cfg=cfg).get_data()

# pre process
train = build_pre_process(data=train_data, mode='train', cfg=cfg).get_feature()
valid = build_pre_process(data=valid_data, mode='valid', cfg=cfg).get_feature()
test = build_pre_process(data=test_data, mode='test', cfg=cfg).get_feature()

features = [
    c for c in train.columns
    if c not in ['loadingOrder', 'label', 'mmin', 'mmax', 'count']
]

# training
meta = {
    'train': train,
    'valid': valid,
    'test': test,
    'pred': features,
    'label': 'label',
    'seed': 1080,
    'is_shuffle': True,
}
trainer = build_trainer(meta=meta, cfg=cfg)
result = trainer.do_train()

# save results
save_to_csv(result, test_data, cfg)
delimiter = "," savepath = 'C:\Jmax_data/' for i in range(len(p)): INTENSITY = 0.6 * p[i] * 1e-6 / (tau * (waist**2)) pulses = [config.laserpulse(INTENSITY, tau, 0, waist=31e-6)] t, cos2, cos2d, psi = dispatcher.dispatch(states, pulses, Jmax, Nshells, molecule, dt, t_end, probe_waist, calculate_cos2d, do_psi_pulse=True) psi = psi[0] pdf = numpy.abs(psi)**2 Js = numpy.arange(0, Jmax + 1) Jssq = Js**2 Javg = numpy.sum(Js * pdf, axis=1) Jsq_avg = numpy.sum(Jssq * pdf, axis=1) std = numpy.sqrt(Jsq_avg - Javg**2) cdf = numpy.cumsum(numpy.abs(psi)**2, axis=1) percentile_999 = numpy.argmax(cdf >= 0.999, axis=1) psi_out = numpy.abs(psi[-1])**2 extra_columns = [Javg, std, percentile_999, psi_out] filename = mol + '_p' + str(int(p[i])) + '_T' + str( tau * 1e12) + '.csv' utils.save_to_csv(savepath + filename, t, cos2, cos2d, extra_header, extra_columns, delimiter)
import warnings

warnings.filterwarnings('ignore')

import re
from datetime import datetime

import pandas as pd

from utils import save_to_csv

DATA_PATH = '../data/'
FILE_NAME = 'reviews.csv'


def clean_data(reviews_df):
    # strip the "Reviewed in India on" prefix and parse the remaining date
    for i in range(0, len(reviews_df)):
        reviews_df.date[i] = re.sub('Reviewed in India on', '', reviews_df.date[i])
        reviews_df.date[i] = reviews_df.date[i].strip()
        reviews_df.date[i] = datetime.strptime(reviews_df.date[i], '%d %B %Y').date()
    reviews_df['rating'] = reviews_df['rating'].astype('int')
    return reviews_df


if __name__ == '__main__':
    df = pd.read_csv(DATA_PATH + FILE_NAME)
    cleaned_df = clean_data(df)
    save_to_csv(cleaned_df, DATA_PATH, FILE_NAME)
def twitter_api_caller(keyword_user_search_param, keywords_list, ids, batch_size,
                       save_dir, csv_name):
    if keyword_user_search_param == 'search':
        csv_columns = ['id', 'username', 'text', 'keywords', 'date', 'location']
    else:
        csv_columns = ['id', 'username', 'text', 'date', 'location']

    try:
        os.chdir(os.path.join(ROOT_DIR, "scraped_tweet"))
        os.mkdir(save_dir)
        print("Directory 'final_tweet_csv' Created")
    except FileExistsError:
        print("Directory 'final_tweet_csv' already exists")

    n_chunks = int((len(ids) - 1) // batch_size + 1)
    tweets = []
    i = 0
    while i < n_chunks:
        if i > 0 and i % 300 == 0:
            # if the batch number exceeds 300 the request could fail
            time.sleep(60)
        if i != n_chunks - 1:
            batch = ids[i * batch_size:(i + 1) * batch_size]
        else:
            batch = ids[i * batch_size:]
        print(f"Processing batch n° {i + 1}/{n_chunks} ...")
        try:
            list_of_tw_status = api.statuses_lookup(batch, tweet_mode="extended")
        except RateLimitError as err:
            print('Tweepy: Rate Limit exceeded')
            # https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/faq
            save_to_csv(tweets, os.path.join("scraped_tweet", save_dir),
                        f"{csv_name}_last_batch_{i}", csv_columns)
            break
        except Exception as err:
            save_to_csv(tweets, os.path.join("scraped_tweet", save_dir),
                        f"{csv_name}_last_batch_{i}", csv_columns)
            print(f"General Error: {str(err)}")
            break

        tweets_batch = []
        for status in list_of_tw_status:
            try:
                tweet = {
                    "id": status.id,
                    "username": status.user.screen_name,
                    "text": status.full_text,
                    "date": str(status.created_at),
                    "location": status.user.location
                }
                if keyword_user_search_param == 'search':
                    kl1 = [e for e in keywords_list
                           if e.lower() in status.full_text.lower()]
                    kl2 = [e for e in keywords_list
                           if e.lower() in status.user.screen_name.lower()]
                    keywords = [x for x in set(kl1 + kl2) if len(x) > 0]
                    tweet["keywords"] = keywords
            except Exception as err:
                print(f"General Error: {str(err)}")
                continue
            tweets_batch.append(tweet)

        print(f"Processed - scraped {len(tweets_batch)} tweets.")
        if len(tweets_batch) == 0:
            save_to_csv(tweets, os.path.join("scraped_tweet", save_dir),
                        f"{csv_name}_last_batch_{i}", csv_columns)
            print("No tweets scraped")
            break
        i += 1
        tweets.append(tweets_batch)

    save_to_csv(tweets, os.path.join("scraped_tweet", save_dir), csv_name, csv_columns)
try: os.mkdir("data") except OSError: pass if (not store_csv): if (do_psi_pulse): numpy.savez(filename, t=t, cos2=cos2, cos2d=cos2d, Javg=Javg, std=std, percentile_999=percentile_999, psi=psi_pulse) else: numpy.savez(filename, t=t, cos2=cos2, cos2d=cos2d) else: if (out_filename == ""): filename = filename.replace("npz", "csv") if (not do_psi_pulse): utils.save_to_csv(filename, t, cos2, cos2d) else: utils.save_to_csv(filename, t, cos2, cos2d, ["<J>", "std(J)", "J_99.9%"], [Javg, std, percentile_999]) if (out_filename == ""): print("Saved trace in " + filename)
    os.path.basename(args.pulses), Nshells, T, probe_waist
]

filename = out_filename
if filename == "":
    filename = "data/" + ','.join([str(i) for i in attributes]) + ".npz"

meta = dict()
meta['molecule'] = molecule
meta['Jmax'] = Jmax
meta['dt'] = dt
meta['pulses'] = pulses
meta['Nshells'] = Nshells
meta['temperature'] = T
meta['probe_waist'] = probe_waist

try:
    os.mkdir("data")
except OSError:
    pass

if not store_csv:
    numpy.savez(filename, t=t, cos2=cos2, cos2d=cos2d, meta=meta)
else:
    if out_filename == "":
        filename = filename.replace("npz", "csv")
    utils.save_to_csv(filename, t, cos2, cos2d)

if out_filename == "":
    print("Saved trace in " + filename)
import utils

if __name__ == '__main__':
    dir = "log/peps normal"
    pattern = r'(left|right|front|back|start|trunk|lock)\\\d{1,2}.csv$'
    pattern_valid = r'(3|6|9|12).csv$'
    utils.construct_set(dir, pattern, pattern_valid, filter=1)
    utils.save_to_csv()
    X, y = utils.load_all()
    X_train, X_valid, y_train, y_valid = utils.load_train_valid()

    # compare train result
    methods = ["Logistic", "LDA", "QDA", "KNN", "SVM", "RF", "GBM", "MLP"]
    params = [
        None,
        None,
        None,
        {"n_neighbors": 10},
        {"C": 0.25, "gamma": 0.5},
        {"max_features": 2, "n_estimators": 100},
        {"n_estimators": 400, "max_depth": 3},
        {"hidden_layer_sizes": (16, 8)},
    ]
t_end = 30e-12
cache = dict()
states = [(1, 0, 0, 0, 1)]
extra_header = ["<J>", "std(J)", "J_99.9%", "Probability coefficients"]
delimiter = ","
savepath = r'C:\Jmax_data/'

for i in range(len(p)):
    INTENSITY = 0.6 * p[i] * 1e-6 / (tau * (waist**2))
    pulses = [config.laserpulse(INTENSITY, tau, 0, waist=31e-6)]
    t, cos2, cos2d, psi = dispatcher.dispatch(states, pulses, Jmax, Nshells, molecule,
                                              dt, t_end, probe_waist, calculate_cos2d,
                                              do_psi_pulse=True)
    psi = psi[0]
    pdf = numpy.abs(psi)**2
    Js = numpy.arange(0, Jmax + 1)
    Jssq = Js**2
    Javg = numpy.sum(Js * pdf, axis=1)
    Jsq_avg = numpy.sum(Jssq * pdf, axis=1)
    std = numpy.sqrt(Jsq_avg - Javg**2)
    cdf = numpy.cumsum(numpy.abs(psi)**2, axis=1)
    percentile_999 = numpy.argmax(cdf >= 0.999, axis=1)
    psi_out = numpy.abs(psi[-1])**2
    extra_columns = [Javg, std, percentile_999, psi_out]
    filename = mol + '_p' + str(int(p[i])) + '_T' + str(tau * 1e12) + '.csv'
    utils.save_to_csv(savepath + filename, t, cos2, cos2d,
                      extra_header, extra_columns, delimiter)
def twitter_api_caller(keyword_user_search_param, keywords_list, ids, batch_size,
                       save_dir, csv_name, collect_replies):
    if keyword_user_search_param == 'search':
        csv_columns = ['id', 'username', 'text', 'keywords', 'date', 'location']
    else:
        csv_columns = ['id', 'username', 'text', 'date', 'location']
    if collect_replies:
        csv_columns.append('replies')

    try:
        os.chdir(SCRAPED_TWEET_PATH)
        os.mkdir(save_dir)
        print("Directory 'final_tweet_csv' Created")
    except FileExistsError:
        print("Directory 'final_tweet_csv' already exists")

    n_chunks = int((len(ids) - 1) // batch_size + 1)
    tweets = []
    i = 0
    while i < n_chunks:
        if i > 0 and i % 300 == 0:
            # if the batch number exceeds 300 the request could fail
            time.sleep(60)
        if i != n_chunks - 1:
            batch = ids[i * batch_size:(i + 1) * batch_size]
        else:
            batch = ids[i * batch_size:]
        print(f"Processing batch n° {i + 1}/{n_chunks} ...")
        try:
            list_of_tw_status = api.statuses_lookup(batch, tweet_mode="extended")
        except RateLimitError as err:
            print('Tweepy: Rate Limit exceeded')
            # https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/faq
            save_to_csv(tweets, save_dir, f"{csv_name}_last_batch_{i}", csv_columns)
            break
        except Exception as err:
            save_to_csv(tweets, save_dir, f"{csv_name}_last_batch_{i}", csv_columns)
            print(f"General Error: {str(err)}")
            break

        tweets_batch = []
        for status in list_of_tw_status:
            try:
                tweet = {
                    "id": status.id,
                    "username": status.user.screen_name,
                    "text": status.full_text.replace('\n', ' '),
                    "date": str(status.created_at),
                    "location": status.user.location
                }
                if keyword_user_search_param == 'search':
                    keywords_in_tweet = get_tweet_keywords(keywords_list, status)
                    tweet["keywords"] = list(set(keywords_in_tweet))
                if collect_replies:
                    replies = collect_tweet_replies(status.id, max_num_replies=100)
                    tweet['replies'] = replies
            except Exception as err:
                print(f"General Error: {str(err)}")
                continue
            tweets_batch.append(tweet)

        print(f"Processed - scraped {len(tweets_batch)} tweets.")
        if len(tweets_batch) == 0:
            save_to_csv(tweets, save_dir, f"{csv_name}_last_batch_{i}", csv_columns)
            print("No tweets scraped")
            break
        i += 1
        tweets.append(tweets_batch)

    save_to_csv(tweets, save_dir, csv_name, csv_columns)
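# NOTE (illustrative sketch, not from the original scraper): here `tweets` is a list of
# batches, each batch a list of tweet dicts, and save_to_csv(tweets, save_dir, csv_name,
# csv_columns) is assumed to flatten the batches into one CSV with the given columns.
import csv
import os


def save_to_csv(batches, save_dir, csv_name, csv_columns):
    """Hypothetical helper: flatten a list of tweet batches into <save_dir>/<csv_name>.csv."""
    os.makedirs(save_dir, exist_ok=True)
    out_path = os.path.join(save_dir, csv_name + ".csv")
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=csv_columns, extrasaction="ignore")
        writer.writeheader()
        for batch in batches:
            writer.writerows(batch)
    return out_path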
def train(self, train_data, test_data, prediction_data, epochs,
          restore_checkpoint=False, csv_name="transformer_data.csv"):
    """
    Training method that uses distributed training.

    Parameters:
      train_data - input data for training, in the form: en_train, fr_train_in, fr_train_out
      test_data - input data for the test step, in the form: en_test, fr_test_in, fr_test_out
      prediction_data - input data for the prediction step, in the form: en_predict, fr_predict
      epochs - number of epochs to run
      restore_checkpoint - whether to restore the last checkpoint and resume training. Default: False.
      csv_name - name of the csv file where losses/accuracies will be saved. Default: transformer_data.csv.
                 If restore_checkpoint is set to False, the file is erased and only the current run is kept.

    Returns:
      tuple (losses, accuracy) where losses = (train_losses, test_losses) and
      accuracy = (train_accuracy, test_accuracy)
    """
    en_predict, fr_predict = prediction_data
    en_vocab_size = self.en_tokenizer.vocab_size
    fr_vocab_size = self.fr_tokenizer.vocab_size + 2
    print('Number of devices: {}'.format(self.strategy.num_replicas_in_sync))
    GLOBAL_BATCH_SIZE = self.batch_size * self.strategy.num_replicas_in_sync
    train_dataset_distr, test_dataset_distr = makeDatasets(
        train_data, test_data, GLOBAL_BATCH_SIZE, self.strategy)

    test_losses = []
    train_losses = []
    train_accuracyVec = []
    test_accuracyVec = []
    test_loss = tf.keras.metrics.Mean()
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

    prediction_idx = np.random.randint(low=0, high=len(en_predict), size=1)[0]
    prediction_en, prediction_fr = en_predict[prediction_idx], fr_predict[prediction_idx]
    print("prediction input : ", prediction_en)
    print("prediction output: ", prediction_fr)

    with self.strategy.scope():
        custom_learning_rate = customLearningRate(warmup_steps=4000, d_model=self.d_model)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=custom_learning_rate,
                                                  beta_1=0.9, beta_2=0.98, epsilon=1e-9)
        self.transformer_model = Transformer(embedding_size=self.d_model,
                                             dff=self.dff,
                                             input_max_seq_length=2000,
                                             output_max_seq_length=1855,
                                             input_vocab_size=en_vocab_size,
                                             output_vocab_size=fr_vocab_size,
                                             encoder_blocks=self.num_layers,
                                             decoder_blocks=self.num_layers,
                                             heads=self.num_heads)
        ckpt = tf.train.Checkpoint(transformer=self.transformer_model,
                                   optimizer=self.optimizer,
                                   epoch=tf.Variable(1))
        manager = tf.train.CheckpointManager(ckpt, self.checkpoint_path, max_to_keep=5)
        if manager.latest_checkpoint and restore_checkpoint:
            ckpt.restore(manager.latest_checkpoint)
            print('Latest checkpoint restored!!')
        else:
            print("training from scratch")

        loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                                    reduction="none")

        def loss_fn(real, targets):
            mask = tf.math.logical_not(tf.math.equal(targets, 0))
            mask = tf.cast(mask, tf.int64)
            per_example_loss = loss_object(targets, real, sample_weight=mask)
            return tf.nn.compute_average_loss(per_example_loss,
                                              global_batch_size=GLOBAL_BATCH_SIZE)

        def train_step(input_data, real_data_in, real_data_out):
            encoder_pad_mask = makePaddingMask(input_data)
            elements_mask = makeSequenceMask(real_data_in.shape[1])
            with tf.GradientTape() as tape:
                predicted_data = self.transformer_model(input_data, real_data_in,
                                                        encoder_pad_mask, elements_mask,
                                                        training_enabled=True, training=True)
                loss = loss_fn(predicted_data, real_data_out)
            trainable_vars = self.transformer_model.trainable_variables
            grads = tape.gradient(loss, trainable_vars)
            self.optimizer.apply_gradients(zip(grads, trainable_vars))
            train_accuracy.update_state(real_data_out, predicted_data)
            return loss

        @tf.function
        def distributed_train_step(input_data, real_data_in, real_data_out):
            per_replica_losses = self.strategy.experimental_run_v2(
                train_step, args=(input_data, real_data_in, real_data_out))
            return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                        per_replica_losses, axis=None)

        def test_step(input_data, real_data_in, real_data_out):
            encoder_pad_mask = makePaddingMask(input_data)
            elements_mask = makeSequenceMask(real_data_in.shape[1])
            predicted_data = self.transformer_model(input_data, real_data_in,
                                                    encoder_pad_mask, elements_mask,
                                                    training_enabled=False, training=False)
            loss = loss_fn(predicted_data, real_data_out)
            test_accuracy.update_state(real_data_out, predicted_data)
            return loss

        @tf.function
        def distributed_test_step(input_data, real_data_in, real_data_out):
            per_replica_losses = self.strategy.experimental_run_v2(
                test_step, args=(input_data, real_data_in, real_data_out,))
            return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                        per_replica_losses, axis=None)

        for epoch in range(epochs):
            total_loss = 0
            num_batches = 0
            test_loss.reset_states()
            test_accuracy.reset_states()
            train_accuracy.reset_states()

            for _, (en_data, fr_data_in, fr_train_out) in enumerate(train_dataset_distr):
                loss = distributed_train_step(en_data, fr_data_in, fr_train_out)
                total_loss += loss
                num_batches += 1
            train_losses.append(total_loss / num_batches)

            total_loss = 0
            num_batches = 0
            for _, (en_data, fr_data_in, fr_data_out) in enumerate(test_dataset_distr):
                loss = distributed_test_step(en_data, fr_data_in, fr_data_out)
                total_loss += loss
                num_batches += 1
            test_losses.append(total_loss / num_batches)

            print('Epoch {} training Loss {:.4f} Accuracy {:.4f} test Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, train_losses[-1], train_accuracy.result(),
                test_losses[-1], test_accuracy.result()))
            train_accuracyVec.append(train_accuracy.result())
            test_accuracyVec.append(test_accuracy.result())

            if epoch % self.predict_every == 0 and epoch != 0:
                output_seq = self.translate(prediction_en)
                print("----------------------------PREDICTION----------------------------")
                print("Predicted :", output_seq)
                print("Correct   :", prediction_fr)
                print("--------------------------END PREDICTION--------------------------")

            ckpt.epoch.assign_add(1)
            if int(epoch) % 5 == 0:
                save_path = manager.save()
                print("Saving checkpoint for epoch {}: {}".format(epoch, save_path))

        save_path = manager.save()
        print('Saving checkpoint for end at {}'.format(save_path))

    save_to_csv(losses=(train_losses, test_losses),
                accuracy=(train_accuracyVec, test_accuracyVec),
                append=restore_checkpoint,
                file_name=csv_name)
    return (train_losses, test_losses), (train_accuracyVec, test_accuracyVec)
def train(self, train_data, test_data, prediction_data, epochs,
          restore_checkpoint=False, csv_name="seq2seq_data.csv"):
    """
    Training method that uses distributed training.

    Parameters:
      train_data - input data for training, in the form: en_train, fr_train_in, fr_train_out
      test_data - input data for the test step, in the form: en_test, fr_test_in, fr_test_out
      prediction_data - input data for the prediction step, in the form: en_predict, fr_predict
      epochs - number of epochs to run
      restore_checkpoint - whether to restore the last checkpoint and resume training. Default: False.
      csv_name - name of the csv file where losses/accuracies will be saved. Default: seq2seq_data.csv.
                 If restore_checkpoint is set to False, the file is erased and only the current run is kept.

    Returns:
      tuple (losses, accuracy) where losses = (train_losses, test_losses) and
      accuracy = (train_accuracy, test_accuracy)
    """
    en_predict, fr_predict = prediction_data
    en_vocab_size = self.en_tokenizer.vocab_size
    fr_vocab_size = self.fr_tokenizer.vocab_size + 2
    print('Number of devices: {}'.format(self.strategy.num_replicas_in_sync))
    GLOBAL_BATCH_SIZE = self.batch_size * self.strategy.num_replicas_in_sync
    train_dataset_distr, test_dataset_distr = makeDatasets(
        train_data, test_data, GLOBAL_BATCH_SIZE, self.strategy)

    test_losses = []
    train_losses = []
    train_accuracyVec = []
    test_accuracyVec = []
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

    prediction_idx = np.random.randint(low=0, high=len(en_predict), size=1)[0]
    prediction_en, prediction_fr = en_predict[prediction_idx], fr_predict[prediction_idx]
    print("input : ", prediction_en)
    print("output: ", prediction_fr)

    with self.strategy.scope():
        self.optimizer = tf.keras.optimizers.Adam(clipnorm=5.0)
        self.encoder = Encoder(self.lstm_size, self.embedding_size, en_vocab_size)
        self.decoder = Decoder(self.lstm_size, self.embedding_size, fr_vocab_size)
        ckpt = tf.train.Checkpoint(encoder=self.encoder,
                                   decoder=self.decoder,
                                   optimizer=self.optimizer,
                                   epoch=tf.Variable(1))
        manager = tf.train.CheckpointManager(ckpt, "./checkpoints/Seq2Seq", max_to_keep=5)
        if manager.latest_checkpoint and restore_checkpoint:
            ckpt.restore(manager.latest_checkpoint)
            print('Latest checkpoint restored!!')
        else:
            print("training from scratch")

        loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                                 reduction="none")

        def compute_loss(predictions, labels):
            mask = tf.math.logical_not(tf.math.equal(labels, 0))
            mask = tf.cast(mask, tf.int64)
            per_example_loss = loss_obj(labels, predictions, sample_weight=mask)
            return tf.nn.compute_average_loss(per_example_loss,
                                              global_batch_size=GLOBAL_BATCH_SIZE)

        # one training step
        def train_step(encoder_input, decoder_in, decoder_out, initial_states):
            with tf.GradientTape() as tape:
                # use the states passed in rather than relying on the closure variable
                encoder_states = self.encoder(encoder_input, initial_states,
                                              training_mode=True)
                predicted_data, _, _ = self.decoder(decoder_in, encoder_states[1:],
                                                    training_mode=True)
                loss = compute_loss(predicted_data, decoder_out)
            trainable = self.encoder.trainable_variables + self.decoder.trainable_variables
            grads = tape.gradient(loss, trainable)
            self.optimizer.apply_gradients(zip(grads, trainable))
            train_accuracy.update_state(decoder_out, predicted_data)
            return loss

        @tf.function
        def distributed_train_step(encoder_input, decoder_in, decoder_out, initial_states):
            per_replica_losses = self.strategy.experimental_run_v2(
                train_step, args=(encoder_input, decoder_in, decoder_out, initial_states,))
            return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                        per_replica_losses, axis=None)

        def test_step(encoder_input, decoder_in, decoder_out):
            initial_state = self.encoder.init_states(self.batch_size)
            encoder_states = self.encoder(encoder_input, initial_state, training_mode=False)
            predicted_data, _, _ = self.decoder(decoder_in, encoder_states[1:],
                                                training_mode=False)
            loss = compute_loss(predicted_data, decoder_out)
            test_accuracy.update_state(decoder_out, predicted_data)
            return loss

        @tf.function
        def distributed_test_step(encoder_input, decoder_in, decoder_out):
            per_replica_losses = self.strategy.experimental_run_v2(
                test_step, args=(encoder_input, decoder_in, decoder_out,))
            return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                        per_replica_losses, axis=None)

        print("starting training with {} epochs with prediction each {} epoch"
              .format(epochs, self.predict_every))
        for epoch in range(epochs):
            test_accuracy.reset_states()
            train_accuracy.reset_states()
            initial_state = self.encoder.init_states(self.batch_size)

            total_loss = 0.0
            num_batches = 0
            for _, (en_data, fr_data_in, fr_data_out) in enumerate(train_dataset_distr):
                loss = distributed_train_step(en_data, fr_data_in, fr_data_out, initial_state)
                total_loss += loss
                num_batches += 1
            train_losses.append(total_loss / num_batches)

            total_loss = 0.0
            num_batches = 0
            for _, (en_data, fr_data_in, fr_data_out) in enumerate(test_dataset_distr):
                loss = distributed_test_step(en_data, fr_data_in, fr_data_out)
                total_loss += loss
                num_batches += 1
            test_losses.append(total_loss / num_batches)

            print('Epoch {} training Loss {:.4f} Accuracy {:.4f} test Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, train_losses[-1], train_accuracy.result(),
                test_losses[-1], test_accuracy.result()))
            train_accuracyVec.append(train_accuracy.result())
            test_accuracyVec.append(test_accuracy.result())

            ckpt.epoch.assign_add(1)
            if int(epoch) % 5 == 0:
                save_path = manager.save()
                print("Saving checkpoint for epoch {}: {}".format(epoch, save_path))

            if epoch % self.predict_every == 0 and epoch != 0:
                output_seq = self.translate(prediction_en)
                print("----------------------------PREDICTION----------------------------")
                print("Predicted :", output_seq)
                print("Correct   :", prediction_fr)
                print("--------------------------END PREDICTION--------------------------")

        save_path = manager.save()
        print('Saving checkpoint for end at {}'.format(save_path))

    save_to_csv(losses=(train_losses, test_losses),
                accuracy=(train_accuracyVec, test_accuracyVec),
                append=restore_checkpoint,
                file_name=csv_name)
    return (train_losses, test_losses), (train_accuracyVec, test_accuracyVec)
def main():
    tweets = _read_tweets_to_dataframe("data/tweet_data/", True, 2000)
    make_new_dir("data/datasets")
    save_to_csv(tweets, "data/datasets/individual_tweets.csv", "tweet_id")
def train(self, train_data, test_data, prediction_data, epochs, attention_type="general",
          restore_checkpoint=False, csv_name="seq2seqAttention_data.csv"):
    """
    Training method that uses distributed training.

    Parameters:
      train_data - input data for training, in the form: en_train, fr_train_in, fr_train_out
      test_data - input data for the test step, in the form: en_test, fr_test_in, fr_test_out
      prediction_data - input data for the prediction step, in the form: en_predict, fr_predict
      epochs - number of epochs to run
      attention_type - which attention method to use: dot/general/concat. Default: general
      restore_checkpoint - whether to restore the last checkpoint and resume training. Default: False.
      csv_name - name of the csv file where losses/accuracies will be saved. Default: seq2seqAttention_data.csv.
                 If restore_checkpoint is set to False, the file is erased and only the current run is kept.

    Returns:
      tuple (losses, accuracy) where losses = (train_losses, test_losses) and
      accuracy = (train_accuracy, test_accuracy)
    """
    print_heatmap = True
    en_predict, fr_predict = prediction_data
    en_vocab_size = self.en_tokenizer.vocab_size
    fr_vocab_size = self.fr_tokenizer.vocab_size + 2
    print('Number of devices: {}'.format(self.strategy.num_replicas_in_sync))
    GLOBAL_BATCH_SIZE = self.batch_size * self.strategy.num_replicas_in_sync
    train_dataset_distr, test_dataset_distr = makeDatasets(
        train_data, test_data, GLOBAL_BATCH_SIZE, self.strategy)

    test_losses = []
    train_losses = []
    train_accuracyVec = []
    test_accuracyVec = []
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

    prediction_idx = np.random.randint(low=0, high=len(en_predict), size=1)[0]
    prediction_en, prediction_fr = en_predict[prediction_idx], fr_predict[prediction_idx]
    print("prediction input : ", prediction_en)
    print("prediction output: ", prediction_fr)

    if not os.path.exists("heatmap"):
        os.mkdir("heatmap")
    alignments = []

    with self.strategy.scope():
        self.encoder = Encoder(lstm_size=self.lstm_size,
                               embedding_size=self.embedding_size,
                               vocab_size=en_vocab_size)
        self.decoder = Decoder(lstm_size=self.lstm_size,
                               embedding_size=self.embedding_size,
                               vocab_size=fr_vocab_size,
                               attention_type=attention_type)
        self.optimizer = tf.keras.optimizers.Adam(clipnorm=0.5)
        ckpt = tf.train.Checkpoint(encoder=self.encoder,
                                   decoder=self.decoder,
                                   optimizer=self.optimizer,
                                   epoch=tf.Variable(1))
        manager = tf.train.CheckpointManager(ckpt, "./checkpoints/Seq2SeqAttention",
                                             max_to_keep=5)
        if manager.latest_checkpoint and restore_checkpoint:
            ckpt.restore(manager.latest_checkpoint)
            print('Latest checkpoint restored!!')
        else:
            print("training from scratch")

        loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                                 reduction="none")

        def compute_loss(predictions, labels):
            mask = tf.math.logical_not(tf.math.equal(labels, 0))
            mask = tf.cast(mask, tf.int64)
            per_example_loss = loss_obj(labels, predictions, sample_weight=mask)
            return tf.nn.compute_average_loss(per_example_loss,
                                              global_batch_size=GLOBAL_BATCH_SIZE)

        # one training step
        def train_step(en_data, fr_data_in, fr_data_out, initial_states):
            loss = 0
            predicted_output = None
            train_accuracy.reset_states()
            with tf.GradientTape() as tape:
                encoder_output, state_h, state_c = self.encoder(en_data, initial_states,
                                                                training_mode=True)
                # shape[1] because we want each word for all batches
                for i in range(fr_data_out.shape[1]):
                    decoder_input = tf.expand_dims(fr_data_in[:, i], 1)
                    decoder_output, state_h, state_c, _ = self.decoder(
                        decoder_input, (state_h, state_c), encoder_output,
                        training_mode=True)
                    loss += compute_loss(decoder_output, fr_data_out[:, i])
                    decoder_output = tf.expand_dims(decoder_output, axis=1)
                    if i == 0:
                        predicted_output = decoder_output
                    else:
                        predicted_output = tf.concat([predicted_output, decoder_output],
                                                     axis=1)
            trainable_vars = self.encoder.trainable_variables + self.decoder.trainable_variables
            grads = tape.gradient(loss, trainable_vars)
            self.optimizer.apply_gradients(zip(grads, trainable_vars))
            train_accuracy.update_state(fr_data_out, predicted_output)
            return loss / fr_data_out.shape[1]

        @tf.function
        def distributed_train_step(en_data, fr_data_in, fr_data_out, initial_states):
            per_replica_losses = self.strategy.experimental_run_v2(
                train_step, args=(en_data, fr_data_in, fr_data_out, initial_states,))
            return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                        per_replica_losses, axis=None)

        def test_step(en_data, fr_data_in, fr_data_out):
            loss = 0
            predicted_output = []
            initial_states = self.encoder.init_states(self.batch_size)
            encoder_output, state_h, state_c = self.encoder(en_data, initial_states,
                                                            training_mode=False)
            for i in range(fr_data_out.shape[1]):
                decoder_input = tf.expand_dims(fr_data_in[:, i], 1)
                decoder_output, state_h, state_c, _ = self.decoder(
                    decoder_input, (state_h, state_c), encoder_output,
                    training_mode=False)
                loss += compute_loss(decoder_output, fr_data_out[:, i])
                decoder_output = tf.expand_dims(decoder_output, axis=1)
                if i == 0:
                    predicted_output = decoder_output
                else:
                    predicted_output = tf.concat([predicted_output, decoder_output], axis=1)
            test_accuracy.update_state(fr_data_out, predicted_output)
            return loss / fr_data_out.shape[1]

        @tf.function
        def distributed_test_step(en_data, fr_data_in, fr_data_out):
            per_replica_losses = self.strategy.experimental_run_v2(
                test_step, args=(en_data, fr_data_in, fr_data_out,))
            return self.strategy.reduce(tf.distribute.ReduceOp.SUM,
                                        per_replica_losses, axis=None)

        print("starting training with {} epochs with prediction each {} epoch"
              .format(epochs, self.predict_every))
        for epoch in range(epochs):
            test_accuracy.reset_states()
            train_accuracy.reset_states()
            initial_states = self.encoder.init_states(self.batch_size)

            total_loss = 0.0
            num_batches = 0
            for _, (en_data, fr_data_in, fr_data_out) in enumerate(train_dataset_distr):
                loss = distributed_train_step(en_data, fr_data_in, fr_data_out, initial_states)
                total_loss += loss
                num_batches += 1
            train_losses.append(total_loss / num_batches)

            total_loss = 0.0
            num_batches = 0
            for _, (en_data, fr_data_in, fr_data_out) in enumerate(test_dataset_distr):
                loss = distributed_test_step(en_data, fr_data_in, fr_data_out)
                total_loss += loss
                num_batches += 1
            test_losses.append(total_loss / num_batches)

            print('Epoch {} training Loss {:.4f} Accuracy {:.4f} test Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, train_losses[-1], train_accuracy.result(),
                test_losses[-1], test_accuracy.result()))
            train_accuracyVec.append(train_accuracy.result())
            test_accuracyVec.append(test_accuracy.result())

            ckpt.epoch.assign_add(1)
            if int(epoch) % 5 == 0:
                save_path = manager.save()
                print("Saving checkpoint for epoch {}: {}".format(epoch, save_path))

            predicted, alignment = self.translate(prediction_en)
            if epoch % self.predict_every == 0:
                print("----------------------------PREDICTION----------------------------")
                print("Predicted: {} ".format(predicted))
                print("Should be: {} ".format(prediction_fr))
                print("--------------------------END PREDICTION--------------------------")
                if print_heatmap:
                    attention_map = np.squeeze(alignment, (1, 2))
                    alignments.append(attention_map)
                    fig = plt.figure(figsize=(10, 10))
                    ax = fig.add_subplot(1, 1, 1)
                    ax.matshow(attention_map, cmap='jet')
                    ax.set_xticklabels([''] + prediction_en.split(' '), rotation=90)
                    ax.set_yticklabels([''] + predicted.split(' '))
                    plt.savefig('heatmap/prediction_{}.png'.format(epoch))
                    # plt.show()
                    plt.close()

        save_path = manager.save()
        print('Saving checkpoint for end at {}'.format(save_path))

    save_to_csv(losses=(train_losses, test_losses),
                accuracy=(train_accuracyVec, test_accuracyVec),
                append=restore_checkpoint,
                file_name=csv_name)
    return (train_losses, test_losses), (train_accuracyVec, test_accuracyVec)
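# NOTE (illustrative sketch, not from the original trainers): all three train() methods end
# with save_to_csv(losses=..., accuracy=..., append=..., file_name=...). Assuming it writes
# one row per epoch and either appends to or overwrites the file depending on `append`,
# a minimal version could be:
import csv
import os


def save_to_csv(losses, accuracy, append, file_name):
    """Hypothetical logger: write per-epoch train/test loss and accuracy to a CSV file."""
    train_losses, test_losses = losses
    train_acc, test_acc = accuracy
    mode = "a" if append else "w"
    write_header = not append or not os.path.exists(file_name)
    with open(file_name, mode, newline="") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(["train_loss", "test_loss", "train_accuracy", "test_accuracy"])
        for row in zip(train_losses, test_losses, train_acc, test_acc):
            writer.writerow([float(x) for x in row])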
import os
import sys

import utils

if __name__ == '__main__':
    print('main')
    if len(sys.argv) < 2:
        raise Exception('Enter the path of the directory')
    path = sys.argv[1]
    if not os.path.isdir(path):
        raise Exception('Invalid path : ', path)
    print('path', path)
    for idx, filename in enumerate(os.listdir(path)):
        # print(filename)
        tags, text = utils.extract_tags_risk(os.path.join(path, filename))
        words = utils.extract_words(text, tags, idx)
        df = utils.map_output(words)
        name = filename.split('.')[0]
        utils.save_to_csv('.', f'{name}.csv', df)
attributes = [molecule.name, Jmax, J, K, M, dt, os.path.basename(args.pulses),
              Nshells, probe_waist]
filename = out_filename
if filename == "":
    filename = ','.join([str(i) for i in attributes]) + ".npz"
    filename = "data/single_state_" + filename

try:
    os.mkdir("data")
except OSError:
    pass

if not store_csv:
    if do_psi_pulse:
        numpy.savez(filename, t=t, cos2=cos2, cos2d=cos2d, Javg=Javg, std=std,
                    percentile_999=percentile_999, psi=psi_pulse)
    else:
        numpy.savez(filename, t=t, cos2=cos2, cos2d=cos2d)
else:
    if out_filename == "":
        filename = filename.replace("npz", "csv")
    if not do_psi_pulse:
        utils.save_to_csv(filename, t, cos2, cos2d)
    else:
        utils.save_to_csv(filename, t, cos2, cos2d,
                          ["<J>", "std(J)", "J_99.9%"],
                          [Javg, std, percentile_999])

if out_filename == "":
    print("Saved trace in " + filename)
def create_dataframe(pasin, name, dates, stars, reviews):
    reviews_dict = {
        'asin': pasin,
        'name': name,
        'date': dates,
        'rating': stars,
        'review': reviews
    }
    reviews_df = pd.DataFrame(data=reviews_dict,
                              columns=['asin', 'name', 'date', 'rating', 'review'])
    return reviews_df


if __name__ == '__main__':
    company_list = get_company_list()
    asin = get_product_asin(headers, company_list)
    link = get_product_links(headers, company_list, asin)
    pasin, name, dates, stars, reviews = get_product_details(headers, company_list, asin, link)
    reviews_df = create_dataframe(pasin, name, dates, stars, reviews)
    save_to_csv(reviews_df, DATA_PATH, FILE_NAME)