def data_cleanup():
    """Cleans the dataset by removing 1/10 of the wines, sorted by wine rarity."""
    # File locations
    csv_path = "../data/Wine/winemag-data-130k-v2.csv"
    output_path = "../data/Wine/wine_clean.csv"
    wine_count = "../data/Wine/varietals.csv"

    # Remove the 10% rarest wines in the list
    count_data = util.load(wine_count, [])[0]
    counts, varietals = count_data
    counts = [int(count) for count in counts]
    remove_pct = 0.1
    remove_num = remove_pct * sum(counts)
    partial_sum = 0
    idx = 0
    while partial_sum < remove_num:
        partial_sum += counts[idx]
        idx += 1
    wines_to_keep = varietals[idx:]

    # Create flags for the data cleaning function
    flags = dict()
    flags["Top Wines"] = wines_to_keep
    # These column indices can be changed
    flags["Special Chars"] = [2, 3, 6, 7, 8, 11, 12]
    util.clean_data(csv_path, output_path, flags)
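# util.clean_data is not shown here; below is a minimal, hypothetical sketch of what a
# cleaner consuming these flags might do (keep only "Top Wines" varietals and strip
# special characters from the listed columns). The variety_col index is an assumption.
import csv
import re

def clean_data_sketch(csv_path, output_path, flags, variety_col=12):
    keep = set(flags["Top Wines"])
    with open(csv_path, newline='', encoding='utf-8') as fin, \
         open(output_path, 'w', newline='', encoding='utf-8') as fout:
        reader = csv.reader(fin)
        writer = csv.writer(fout)
        writer.writerow(next(reader))  # copy the header row unchanged
        for row in reader:
            if row[variety_col] not in keep:
                continue  # drop rows whose varietal was removed as too rare
            for col in flags["Special Chars"]:
                row[col] = re.sub(r'[^0-9A-Za-z ]', '', row[col])
            writer.writerow(row)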
def read_dataset(clip_len):
    # Read dataset
    test_dataset = get_data('test.csv')
    classes_list = get_classes(test_dataset)
    print('Number of classes:', len(classes_list))
    print('Test set:', len(test_dataset))
    test_dataset = clean_data(test_dataset, clip_len + 1,
                              classes=classes_list, MAX_FRAMES=3000)
    print('Test set after clean:', len(test_dataset))
    return test_dataset, classes_list
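# Hypothetical usage (assumes get_data, get_classes and clean_data come from the same
# training utilities used in main() further below):
# test_dataset, classes_list = read_dataset(clip_len=16)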
def index_message():
    filter_words = load_filter_words()
    message_loaded_nodes = pickle.load(open(SETTINGS.message_object_file, "rb"))
    sPerson = sunburnt.SolrInterface("http://localhost:8983/solr/message/")
    docs = []
    for key, value in message_loaded_nodes.iteritems():
        # Check whether the subject or body contains filter words (non-compliant words)
        compliantFlag = True
        # NoneType check
        if value._subject is None:
            text = value._body
        elif value._body is None:
            text = value._subject
        else:
            text = value._subject + value._body
        if is_filter_word_present(text, filter_words):
            compliantFlag = False
        doc = {
            "nodeId": key,
            "datetime": value._datetime,
            "epochSecs": value._epoch_secs,
            "subject": value._subject,
            "body": clean_data(value._body),
            "emailId": value._email_id,
            "compliantFlag": compliantFlag,
        }
        docs.append(doc)
    sPerson.add(docs)
    sPerson.commit()
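# is_filter_word_present is not defined in this snippet; a minimal sketch of what it
# might look like, assuming a simple case-insensitive substring match (hypothetical):
def is_filter_word_present(text, filter_words):
    # Return True if any filter (non-compliant) word occurs in the text.
    if not text:
        return False
    lowered = text.lower()
    return any(word.lower() in lowered for word in filter_words)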
import util
import config

start_mark = '"text":'


def preprocess_wiki(wiki_folder):
    allfiles = util.absoluteFilePaths(wiki_folder)
    wikis = []
    for path in allfiles:
        fin = open(path, encoding='utf-8', errors='ignore')
        lines = fin.readlines()
        fin.close()
        for line in lines:
            linestr = line.strip()
            l = linestr.index(start_mark) + len(start_mark) + 2
            txt = linestr[l:-4].strip()
            txt = txt.replace('\\n', '')
            txt = ''.join(txt.splitlines())
            if len(txt) > 1:
                wikis.append(txt)
    return wikis


if __name__ == '__main__':
    wikis = preprocess_wiki(config.wiki_folder)
    util.clean_data(wikis, config.wiki_file, ' ')
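# The slicing above assumes each line looks roughly like
#   {"id": "...", "url": "...", "title": "...", "text": "..."},
# e.g. WikiExtractor --json output. A sketch of a more defensive variant that parses
# the JSON instead of relying on index arithmetic (an alternative, not the author's code):
import json

def extract_text(line):
    # Return the "text" field of a JSON-encoded wiki line, or '' if absent or invalid.
    try:
        record = json.loads(line)
    except ValueError:
        return ''
    txt = record.get('text', '')
    return ''.join(txt.splitlines()).strip()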
#!/usr/bin/env python
import csv
import xlrd

from util import make_headers, clean_data

IMPORT_FILES = [
    'src/Alaska_Louisiana.xls',
    'src/Massachussetts_Wyoming_Territories.xls'
]

if __name__ == "__main__":
    for i, filename in enumerate(IMPORT_FILES):
        workbook = xlrd.open_workbook(filename)
        datemode = workbook.datemode
        worksheets = workbook.sheet_names()
        if i == 0:
            headers = make_headers(workbook.sheet_by_name(worksheets[0]))
            headers['federal_supply_class'] = 'federal_supply_class'
            headers['federal_supply_category'] = 'federal_supply_category'
            f = open("src/leso.csv", "w")
            writer = csv.DictWriter(f, fieldnames=headers.values())
            writer.writeheader()
        for worksheet in worksheets:
            sheet = workbook.sheet_by_name(worksheet)
            clean_data(sheet, writer, headers, datemode)
if disciplina > 0:
    df_s = df_s.loc[df_s['CodigoDisciplina'] == disciplina]
    df_t = df_t.loc[df_t['CodigoDisciplina'] == disciplina]

if len(periodo_letivo_source) > 0:
    df_s = df_s.loc[df_s['PeriodoLetivo'].isin(periodo_letivo_source)]

if len(periodo_letivo_test) > 0:
    df_t = df_t.loc[df_t['PeriodoLetivo'].isin(periodo_letivo_test)]

df_s = df_s.reset_index(drop=True)
df_t = df_t.reset_index(drop=True)

df_s_filter = util.clean_data(df_s, standardize=True, plot_cov=False,
                              title='Matriz de Covariância - ' + disciplinas[disciplina] + ' / ' + s_periodo)
df_t_filter = util.clean_data(df_t, standardize=True, plot_cov=False,
                              title='Matriz de Covariância - ' + disciplinas[disciplina] + ' / ' + t_periodo)

print('Registros source: ' + str(len(df_s_filter)))
print('Registros target: ' + str(len(df_t_filter)))

# df_s_std = util.correlation_alignment(df_s_filter, df_t_filter, 1)
df_s_std = df_s_filter

# Shuffle the normalized dataframe
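# The commented-out util.correlation_alignment call above presumably refers to
# CORrelation ALignment (CORAL) for domain adaptation; a generic sketch of that
# technique with numpy/scipy, not the project's implementation (the trailing
# argument 1 is assumed to be the regularization strength):
import numpy as np
from scipy import linalg

def coral(source, target, reg=1.0):
    # Whiten the source features, then re-color them with the target covariance.
    cs = np.cov(source, rowvar=False) + reg * np.eye(source.shape[1])
    ct = np.cov(target, rowvar=False) + reg * np.eye(target.shape[1])
    whiten = linalg.fractional_matrix_power(cs, -0.5)
    recolor = linalg.fractional_matrix_power(ct, 0.5)
    return source @ whiten @ recolor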
plot_var_cov = True
modulo = '3'
s_disciplina = 'logica'
# s_disciplina = 'mat_adm'

df_s = pd.read_csv('../Week 3/m' + modulo + '_' + s_disciplina + '_ext_2012_01.csv', sep=',')
df_t = pd.read_csv('../Week 3/m' + modulo + '_' + s_disciplina + '_ext_2012_02_2014_01.csv', sep=',')

# Clean up and organize some features, then normalize with z-score
df_s_std = util.clean_data(df_s, normalizar, plot_cov=False, title="Clean Data - Covariancia (Ds)")
df_t_std = util.clean_data(df_t, normalizar, plot_cov=False, title="Clean Data - Covariancia (Dt)")
# df_s_std = util.correlation_alignment(df_s_std, df_t_std, 1)

# Shuffle the normalized dataframes
df_normalized = shuffle(df_s_std)
df_t_normalized = shuffle(df_t_std)

cm_final = algoritmos.predict_decision_tree(df_normalized, df_t_normalized)  # , group_fold_column='CodigoTurma')
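# For reference, a minimal z-score standardization sketch with pandas; this is an
# assumption about what the 'normalizar' flag passed to util.clean_data triggers,
# not the project's actual code:
import pandas as pd

def zscore(df: pd.DataFrame) -> pd.DataFrame:
    # Standardize the numeric columns to zero mean and unit variance.
    num = df.select_dtypes(include='number')
    return (num - num.mean()) / num.std(ddof=0)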
@author: Zhenlin
"""
import util
import config


def preprocess_sohu_news(sohu_path):
    fin = open(sohu_path, encoding='utf-8', errors='ignore')
    lines = fin.readlines()
    fin.close()
    mark1 = '<content>'
    mark2 = '</content>'
    news = []
    for i in range(4, len(lines), 6):
        content = lines[i].strip()[len(mark1):-len(mark2)]
        content = content.strip()
        if len(content) > 1:
            news.append(content)
    return news


if __name__ == '__main__':
    sohu_path = r'/root/bytecamp2019/datasets/news_sohusite_xml-utf8.dat'
    sohu_clean = r'/root/bytecamp2019/datasets/news_sohusite_clean-utf8.dat'
    sohu_clean_space = r'/root/bytecamp2019/datasets/news_sohusite_clean_space-utf8.dat'
    news = preprocess_sohu_news(sohu_path)
    util.clean_data(news, sohu_clean)
    util.clean_data(news, sohu_clean_space, ' ')
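# The stride range(4, len(lines), 6) above assumes the SogouCS dump layout of six
# lines per record, roughly (an assumption about the input file, not verified here):
#   <doc>
#   <url>...</url>
#   <docno>...</docno>
#   <contenttitle>...</contenttitle>
#   <content>...</content>
#   </doc>
# so line index 4 within every block is the <content> line that gets sliced.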
""" import util import config import json start_mark = '"text":' def preprocess_webtext(text_path): fin = open(text_path, encoding='utf-8', errors='ignore') lines = fin.readlines() fin.close() answers = [] for line in lines: linestr = line.strip() contents = json.loads(linestr) if "content" in contents: ans = contents["content"].replace('\\n', '').replace('\\r', '') ans = ''.join(ans.splitlines()) if len(ans) > 1: answers.append(ans) return answers if __name__ == '__main__': answers = preprocess_webtext(config.webtext_path) util.clean_data(answers, config.webtext_clean_path, ' ')
def main():
    args = parse_args()
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)  # Choose GPU for training

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.compat.v1.InteractiveSession(config=config)

    input_shape = (args.clip_len, args.crop_size, args.crop_size, 3)
    model_name = args.model
    reg_factor = args.reg_factor
    batch_size = args.batch_size
    epochs = args.epochs
    lr_init = args.lr
    start_epoch = args.start_epoch
    save_path = args.save_path
    temperature = args.temperature
    alpha = args.lambd
    drop_rate = args.drop_rate
    every = 1

    # Create folders for callbacks
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if not os.path.exists(os.path.join(save_path, "output")):
        os.mkdir(os.path.join(save_path, "output"))
    if not os.path.exists(os.path.join(save_path, "checkpoints")):
        os.mkdir(os.path.join(save_path, "checkpoints"))

    # Write the full config to file
    f = open(os.path.join(save_path, 'config.txt'), "w")
    f.write('input shape: ' + str(input_shape) + '\n')
    f.write('model name: ' + model_name + '\n')
    f.write('reg factor: ' + str(reg_factor) + '\n')
    f.write('batch size: ' + str(batch_size) + '\n')
    f.write('numbers of epochs: ' + str(epochs) + '\n')
    f.write('lr init: ' + str(lr_init) + '\n')
    f.write('Temperature: ' + str(temperature) + '\n')
    f.write('Alpha: ' + str(alpha) + '\n')
    f.write('start epoch: ' + str(start_epoch) + '\n')
    f.write('Drop rate: ' + str(drop_rate) + '\n')
    f.close()

    # Read dataset
    train_dataset = get_data('train.csv')
    val_dataset = get_data('val.csv')
    classes_list = get_classes(train_dataset)
    print('Number of classes:', len(classes_list))
    print('Train set:', len(train_dataset))
    print('Val set:', len(val_dataset))

    weight_model_path = os.path.join(save_path, "best_" + model_name + "_.h5")

    train_dataset = clean_data(train_dataset, args.clip_len + 1,
                               classes=classes_list, MAX_FRAMES=3000)
    val_dataset = clean_data(val_dataset, args.clip_len + 1,
                             classes=classes_list, MAX_FRAMES=3000)
    print('Train set after clean:', len(train_dataset))
    print('Val set after clean:', len(val_dataset))

    # ---------------- Continuous training with Self Knowledge Distillation ----------------
    train_self_KD(train_dataset, val_dataset, model_name, input_shape, classes_list,
                  lr_init, weight_model_path, start_epoch=start_epoch,
                  reg_factor=reg_factor, save_path=save_path, alpha=alpha,
                  temperature=temperature, batch_size=batch_size, every=every,
                  epochs=epochs, drop_rate=drop_rate)
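# train_self_KD is not shown here; as background, a generic sketch of the soft-target
# distillation loss that temperature and alpha usually parameterize (an assumption
# about the training objective, not the function's actual implementation):
import tensorflow as tf

def kd_loss(y_true, student_logits, teacher_logits, temperature=4.0, alpha=0.5):
    # Hard-label cross-entropy plus temperature-scaled KL to the teacher's softened output.
    hard = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(
        y_true, student_logits, from_logits=True))
    soft = tf.keras.losses.KLDivergence()(
        tf.nn.softmax(teacher_logits / temperature),
        tf.nn.softmax(student_logits / temperature))
    return (1.0 - alpha) * hard + alpha * (temperature ** 2) * soft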