def add_label_to_dict(src_path, dl_pair_path, dictionary, r_dictionary):
    """Append the label vocabulary to the word dictionary and emit one
    (one-hot word, one-hot label) training pair per label.

    Args:
        src_path: root folder of the processed cases, used to (re)build the
            label dictionary when the cached pickle is absent.
        dl_pair_path: folder where the pair pickles are written; files are
            named sequentially ``1.pickle``, ``2.pickle``, ...
        dictionary: word -> index mapping, extended in place.
        r_dictionary: index -> word reverse mapping, extended in place.

    Returns:
        The (dictionary, r_dictionary) pair, mutated in place.
    """
    label_path = '/home/ubuntu/workspace/text_summary_data/task_w2v/data_label_pairs/toytmp_label_dict.pickle'
    if os.path.isfile(label_path):
        label_dict_ori = statics.loadfrompickle(label_path)
    else:
        label_dict_ori = create_label_dict(src_path)
        statics.savetopickle(label_path, label_dict_ori)
    # Element 3 of the tuple returned by create_label_dict is the
    # label -> index dict.
    label_dict = label_dict_ori[3]
    for l in label_dict:
        # BUGFIX: only assign a fresh index to unseen labels. The original
        # unconditionally did `dictionary[l] = len(dictionary)`, so a label
        # already present in `dictionary` did not grow it, and the next
        # label reused the same index, colliding in r_dictionary.
        if l not in dictionary:
            labelidx = len(dictionary)
            dictionary[l] = labelidx
            r_dictionary[labelidx] = l
    for l in label_dict:
        word_onehot = np.zeros(len(dictionary))
        word_onehot[dictionary[l]] = 1
        label = np.zeros(len(label_dict))
        label[label_dict[l]] = 1
        # Recomputed every iteration on purpose: each save adds a file, and
        # the next pair must get the next sequential name.
        index = len(os.listdir(dl_pair_path))
        dl_pair = [word_onehot, label]
        statics.savetopickle(
            os.path.join(dl_pair_path, str(index + 1) + '.pickle'), dl_pair)
    return dictionary, r_dictionary
def pad_words(wdict, rawdata_path):
    """One-hot encode every word stored in the pickle at ``rawdata_path``.

    Args:
        wdict: word -> index mapping defining the one-hot dimension.
        rawdata_path: pickle file containing an iterable of words.

    Returns:
        A list with one ``np.int64`` one-hot vector per word, in order.
    """
    vocab_size = len(wdict)

    def _onehot(token):
        # A single 1 at the word's dictionary index; int64 like the original.
        vec = np.zeros(vocab_size, np.int64)
        vec[wdict[token]] = 1
        return vec

    return [_onehot(token) for token in statics.loadfrompickle(rawdata_path)]
def create_label_dict(path):
    """Scan every case folder under ``path`` and collect its labels.

    Each case folder is expected to contain a ``label.pickle`` with
    ``'model'``, ``'OS'`` and ``'category'`` entries (spaces are stripped
    from the label strings).

    Returns:
        A 4-tuple: (unique models, unique OS values, unique categories,
        label -> index dict over the concatenation of the three lists).
    """
    case_folders = os.listdir(path)
    total = len(case_folders)
    models, OS, category = [], [], []
    for done, folder in enumerate(case_folders, start=1):
        subfold = os.path.join(path, folder)
        print("create_label_dict: {}/{}".format(done, total))
        for name in os.listdir(subfold):
            if name == 'label.pickle':
                l = statics.loadfrompickle(os.path.join(subfold, name))
                models.append(l['model'][0].replace(" ", ""))
                OS.append(l['OS'][0].replace(" ", ""))
                category.append(l['category'][0].replace(" ", ""))
    unique_models = list(set(models))
    unique_os = list(set(OS))
    unique_cat = list(set(category))
    # Duplicates may still exist ACROSS the three lists, hence the guard;
    # each new label gets the next sequential index.
    labels_dict = {}
    for label in unique_models + unique_os + unique_cat:
        if label not in labels_dict:
            labels_dict[label] = len(labels_dict)
    return unique_models, unique_os, unique_cat, labels_dict
def random_batch(dl_pair_path, index, shufflelist):
    """Load one mini-batch of (data, label) pairs from pickled files.

    Args:
        dl_pair_path: folder of per-pair pickle files.
        index: offset into the shuffled file list; ``batch_size`` (a
            file-level global) consecutive files are taken from there.
        shufflelist: previously shuffled file order; pass ``[]`` to have a
            fresh shuffled listing created.

    Returns:
        (stacked data array, stacked label array, the shuffle list in use).
    """
    if shufflelist == []:
        shufflelist = os.listdir(dl_pair_path)
        shuffle(shufflelist)
        print('Shuffle List')
    data, labels = [], []
    for fname in shufflelist[index:index + batch_size]:
        pair = statics.loadfrompickle(os.path.join(dl_pair_path, fname))
        data.append(pair[0])
        labels.append(pair[1])
    return np.stack(data, axis=0), np.stack(labels, axis=0), shufflelist
def collect_all_words(path):
    """Concatenate the word lists of every non-label pickle under ``path``.

    Each case subfolder may contain several content pickles plus a
    ``label.pickle``; the latter is skipped.

    Returns:
        A flat list of all words, in directory-listing order.
    """
    all_words = []
    files = os.listdir(path)
    total_file = len(files)
    for process_file, f in enumerate(files, start=1):
        subfold = os.path.join(path, f)
        print("create_vocab_dict: {}/{}".format(process_file, total_file))
        for sf in os.listdir(subfold):
            if sf != 'label.pickle':
                words = statics.loadfrompickle(os.path.join(subfold, sf))
                # PERF: the original used `all_words = all_words + words`,
                # which copies the whole accumulator every file (quadratic
                # overall). extend() appends in place in linear time.
                all_words.extend(words)
    return all_words
import numpy as np import os import statics import collections import random final_ldict_path = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/processed_v2/final_ldict.pickle' final_wdict_path = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/processed_v2/final_wdict20k.pickle' process_data_root = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/processed_v2' word_pool_path = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/w2vec/words_pool.pickle' wdict = statics.loadfrompickle(final_wdict_path) ldict = statics.loadfrompickle(final_ldict_path) #-----------------Create all words------------------------------- src_list = os.listdir(process_data_root) dl_pair = [] if os.path.isfile(word_pool_path): words = statics.loadfrompickle(word_pool_path) else: words = [] for i in range(6000, len(src_list)):
def create_train_pair_by_dict(src_path, all_words, dl_pair_path, dictionary):
    """For every dictionary word, scan all cases and save (one-hot word,
    multi-hot label) training pairs for cases whose documents contain it.

    NOTE(review): `all_words` is accepted but never read in this body —
    possibly a leftover parameter; confirm against callers before removing.
    Pairs are written to `dl_pair_path` as sequentially numbered pickles.
    """
    # Cached label dictionary; rebuilt from src_path on first run.
    label_path = '/home/ubuntu/workspace/text_summary_data/task_w2v/data_label_pairs/tmp_label_dict.pickle'
    if os.path.isfile(label_path):
        label_dict_ori = statics.loadfrompickle(label_path)
    else:
        label_dict_ori = create_label_dict(src_path)
        statics.savetopickle(label_path, label_dict_ori)
    # Element 3 of create_label_dict's tuple is the label -> index dict.
    label_dict = label_dict_ori[3]
    files = os.listdir(src_path)
    total_file = len(files)
    # One-hot vector for the "unknown" word; index 0 is reserved for UNK.
    unk_onehot = np.zeros(len(dictionary))
    unk_onehot[0] = 1
    count = 0
    for w in dictionary:
        count = count + 1
        print("create_train_pair_by_dict:{}/{}".format(count, len(dictionary)))
        word_onehot = np.zeros(len(dictionary))
        word_onehot[dictionary[w]] = 1
        for idx in range(total_file):
            f = files[idx]
            subfold = os.path.join(src_path, f)
            subfiles = os.listdir(subfold)
            l = statics.loadfrompickle(os.path.join(subfold, 'label.pickle'))
            # Multi-hot label: one bit each for the case's model, OS and
            # category (when present in the label dictionary).
            label = np.zeros(len(label_dict))
            if l['model'][0].replace(" ", "") in label_dict:
                label[label_dict[l['model'][0].replace(" ", "")]] = 1
            if l['OS'][0].replace(" ", "") in label_dict:
                label[label_dict[l['OS'][0].replace(" ", "")]] = 1
            if l['category'][0].replace(" ", "") in label_dict:
                label[label_dict[l['category'][0].replace(" ", "")]] = 1
            for sf in subfiles:
                if sf != 'label.pickle':
                    file = os.path.join(subfold, sf)
                    words = statics.loadfrompickle(file)
                    if w in words:
                        # `index` is recomputed before each save because every
                        # save adds a file; names stay sequential.
                        index = len(os.listdir(dl_pair_path))
                        dl_pair = [word_onehot, label]
                        statics.savetopickle(
                            os.path.join(dl_pair_path, str(index + 1) + '.pickle'),
                            dl_pair)
                        # NOTE(review): an UNK pair with the same label is
                        # saved alongside every word pair, then the file loop
                        # breaks (first matching document only). The original
                        # source was flattened to one line, so this nesting of
                        # the UNK save and the break is a best-effort
                        # reconstruction — confirm against version control.
                        index = len(os.listdir(dl_pair_path))
                        dl_pair = [unk_onehot, label]
                        statics.savetopickle(
                            os.path.join(dl_pair_path, str(index + 1) + '.pickle'),
                            dl_pair)
                        break
import math import tensorflow as tf import os from random import shuffle from scipy import spatial import word2vec_utility as w2v import statics words_dict_path = '/home/dashmoment/workspace/text_summary_data/data_label_pair/toy_words_dict_for_taskw2v.pickle' label_dict_path = '/home/dashmoment/workspace/text_summary_data/data_label_pair/toy_label_dict_for_taskw2v.pickle' word_label_pair_path = '/home/dashmoment/workspace/text_summary_data/data_label_pair/toy_dl_pair_for_taskw2v.pickle' dl_pair_path = '/home/dashmoment/workspace/text_summary_data/data_label_pair/task_w2v_dl' w2v_dict_path = 'w2v_dict_2000_2.pickle' words = statics.loadfrompickle(words_dict_path) word_label_pair = statics.loadfrompickle(word_label_pair_path) label_dict = statics.loadfrompickle(label_dict_path) w2v_dict = statics.loadfrompickle(w2v_dict_path) sim_list = {} for w in w2v_dict: if w in word_label_pair: #if w == 'ioLogik': avg_sim = 0 total_N = 0 embed = w2v_dict[w][1]
for w in words: blank_words = np.zeros(len(wdict), np.int64) blank_words[wdict[w]] = 1 encode_w.append(blank_words) return encode_w final_ldict_path = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/processed_v2/final_ldict.pickle' final_wdict_path = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/processed_v2/final_wdict20k.pickle' process_data_root = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/processed_v2' dlpair_root = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/dl_pair_for_training' wdict = statics.loadfrompickle(final_wdict_path) ldict = statics.loadfrompickle(final_ldict_path) src_list = os.listdir(process_data_root) dir_path = os.path.join(process_data_root, src_list[1]) file_path = os.listdir(dir_path) FILE_LENGTH = 500 contents = [] raw_w = [] coded_l = {} for i in file_path:
def create_data_label_path(dataset_path_list, dl_pair_path):
    """Build (and cache) a folder -> {context file(s), label file} path dict
    over every case folder in `dataset_path_list`.

    WARNING: this function mutates the dataset on disk — it deletes case
    folders that have neither context files nor label text
    (`shutil.rmtree`), and may write a new `context.txt` extracted from the
    label's description section.
    """
    # Cached result: return the previously built dict if it exists.
    if os.path.isfile(dl_pair_path):
        return statics.loadfrompickle(dl_pair_path)
    data_dict = {}
    count = 0
    for d in dataset_path_list:
        lsdir = os.listdir(d)
        for folder in lsdir:
            count = count + 1
            # NOTE(review): `count` accumulates across all dataset dirs but
            # the denominator is only the current dir's size, so the
            # progress ratio can exceed 1 with multiple dataset paths.
            sys.stdout.write("Create path dict:{}/{}\n".format(
                count, len(lsdir)))
            sys.stdout.flush()
            folderpath = os.path.join(d, folder)
            filelist = os.listdir(folderpath)
            data_dict[folder] = {}
            if len(filelist) > 1:
                # More than just labels.txt: record each remaining file as a
                # {name: full path} entry.
                filelist.remove('labels.txt')
                filepath = [{x: os.path.join(folderpath, x)} for x in filelist]
                if type(filepath) is not list:
                    filepath = [filepath]
                data_dict[folder]['context'] = filepath
            else:
                # Only labels.txt present: no context yet.
                data_dict[folder]['context'] = {}
            label_path = os.path.join(folderpath, 'labels.txt')
            text = open(label_path, "r").read()
            if data_dict[folder]['context'] == {}:
                if len(text) == 0:
                    # Neither context nor label text: drop the folder from disk.
                    shutil.rmtree(folderpath)
                else:
                    # Try to synthesize a context.txt from the label's
                    # description section.
                    sl = slipt_label(label_path)
                    if len(sl['description']) != 0:
                        context_path = os.path.join(folderpath, 'context.txt')
                        with open(context_path, 'w') as f:
                            for line in sl['description']:
                                f.write(line + " ")
                            # NOTE(review): `f.close` is missing the call
                            # parentheses; harmless inside `with`, but it is
                            # a no-op as written.
                            f.close
                        data_dict[folder]['label'] = label_path
                        data_dict[folder]['context'] = context_path
                    else:
                        # No usable description either: exclude this case.
                        del data_dict[folder]
            else:
                data_dict[folder]['label'] = label_path
    # NOTE(review): the dict is rebound to savetopickle's return value; if
    # statics.savetopickle returns None this function returns None on the
    # cache-miss path — verify statics' contract.
    data_dict = statics.savetopickle(dl_pair_path, data_dict)
    return data_dict
def create_wdict_ldict_general(Nword, wdict_path, ldict_path,
                               final_wdict_path, final_ldict_path):
    """Build the final word dictionary (top-`Nword` non-Chinese words plus
    labels plus question words) and the final label dictionary.

    Returns (pure_dict, pure_ldict, r_wdict): word->index, label->index and
    index->word mappings. Only `pure_ldict` is persisted here — the word
    dict saves are commented out, presumably done by the caller; confirm.
    """
    # Fixed question-template words appended at the end of the vocabulary.
    # NOTE(review): " model" carries a leading space — looks intentional to
    # match tokenization elsewhere, but verify.
    qwords = ["what", "is", " model", "OS", "category"]
    wdict = statics.loadfrompickle(wdict_path)
    ldict = statics.loadfrompickle(ldict_path)
    # Ascending by frequency/value; the tail (negative indices below) holds
    # the highest-valued words.
    sorted_wdict = sorted(wdict.items(), key=operator.itemgetter(1))
    print(len(sorted_wdict))
    r_wdict = {}
    #===============================================================
    pure_dict = {}
    label_type = ['OS', 'category', 'model']
    r_ldict = {}
    pure_ldict = {}
    count = 1
    for lt in label_type:
        # One UNK entry per label type, e.g. 'UNK_OS'.
        pure_ldict['UNK_' + lt] = len(pure_ldict)
        r_ldict[len(r_ldict)] = 'UNK_' + lt
        for l in ldict[lt]:
            if len(l) > 1 and l not in pure_ldict:
                r_ldict[count] = l
                pure_ldict[l] = len(pure_ldict)
                count = count + 1
            elif len(l) > 1:
                # Same label string seen under another type: disambiguate by
                # prefixing the type name.
                l = lt + l
                r_ldict[count] = l
                pure_ldict[l] = len(pure_ldict)
                count = count + 1
                print(l)
    #============================================================
    # Index 0 of the word dictionary is reserved for UNK.
    r_wdict[0] = 'UNK'
    pure_dict['UNK'] = 0
    idx = 1
    count = 1
    # Walk the sorted vocabulary from the top, skipping words that contain
    # Chinese characters, until Nword entries are assigned.
    # NOTE(review): if fewer than Nword non-Chinese words exist this indexes
    # past the front of sorted_wdict (negative wraparound) — confirm inputs.
    while idx < Nword + 1:
        # print("Create_dict:{}/{}".format(count, len(sorted_wdict)))
        if len(getChinese(sorted_wdict[-count][0])) == 0:
            r_wdict[idx] = sorted_wdict[-count][0]
            pure_dict[r_wdict[idx]] = idx
            idx = idx + 1
        count = count + 1
    # Append label strings (skipping index 0, the UNK_* handled above) that
    # are not already vocabulary words.
    for i in range(1, len(r_ldict)):
        if r_ldict[i] not in pure_dict:
            pure_dict[r_ldict[i]] = len(r_wdict)
            r_wdict[len(r_wdict)] = r_ldict[i]
    # Append the question-template words likewise.
    for i in range(len(qwords)):
        if qwords[i] not in pure_dict:
            pure_dict[qwords[i]] = len(r_wdict)
            r_wdict[len(r_wdict)] = qwords[i]
    # statics.savetopickle(final_wdict_path, pure_dict)
    statics.savetopickle(final_ldict_path, pure_ldict)
    # statics.savetopickle(final_rwdict_path, r_wdict)
    return pure_dict, pure_ldict, r_wdict
def process_data_to_pickle(process_root, path_dict, wdict_path, ldict_path):
    """Tokenize each case's context files and pickle them (plus the parsed
    labels) into per-case folders under `process_root`, while accumulating
    word and label dictionaries via `collect_dict`.

    NOTE(review): `wdicts`/`ldicts` are built up but never saved or returned
    in this body — presumably `collect_dict` persists them to
    `wdict_path`/`ldict_path`; confirm.
    """
    # Resume support: reload partially built dictionaries if present.
    if os.path.isfile(wdict_path):
        wdicts = statics.loadfrompickle(wdict_path)
    else:
        wdicts = {}
    if os.path.isfile(ldict_path):
        ldicts = statics.loadfrompickle(ldict_path)
    else:
        ldicts = {'model': {}, 'OS': {}, 'category': {}}
    count = 0
    for d in path_dict:
        count = count + 1
        sys.stdout.write("Data to pickle:{}/{}\n".format(
            count, len(path_dict)))
        sys.stdout.flush()
        casefolder = os.path.join(process_root, d)
        if not os.path.isdir(casefolder):
            os.mkdir(casefolder)
        file_idx = 0
        # Normalize 'context' to a list of {name: path} entries.
        if type(path_dict[d]['context']) is not list:
            path_dict[d]['context'] = [path_dict[d]['context']]
        for clist in path_dict[d]['context']:
            for c in clist:
                savepath = os.path.join(casefolder, str(file_idx) + '.pickle')
                # NOTE(review): when the pickle already exists this
                # `continue` skips the file WITHOUT advancing file_idx, so
                # the next document reuses the same index and is then
                # skipped too — looks like a latent resume bug; confirm.
                if os.path.isfile(savepath):
                    continue
                if not os.path.isfile(savepath):
                    # Split the raw document into tokens and fold them into
                    # the word dictionary.
                    stripe = slipt_doc_by_space(clist[c])
                    wdicts = collect_dict(stripe, wdict_path, wdicts)
                    with open(savepath, 'wb') as f:
                        pickle.dump(stripe, f, protocol=pickle.HIGHEST_PROTOCOL)
                file_idx = file_idx + 1
        lsavepath = os.path.join(casefolder, 'label.pickle')
        # Skip the whole label step for already-processed cases.
        if os.path.isfile(lsavepath):
            continue
        if not os.path.isfile(lsavepath):
            with open(lsavepath, 'wb') as lf:
                labels = slipt_label(path_dict[d]['label'])
                pickle.dump(labels, lf, protocol=pickle.HIGHEST_PROTOCOL)
            ldicts = collect_dict(labels, ldict_path, ldicts)
import statics

# Input embeddings, per-word tf-idf scores, and the output location.
w2vfile = '/home/ubuntu/workspace/text_summary_data/w2v.pickle'
tfidf_path = '/home/ubuntu/workspace/text_summary_data/tfidf_score/tfidfscore.pickle'
save_path = '/home/ubuntu/workspace/text_summary_data/tfidf_score/w2v_tfidf.pickle'

w2v = statics.loadfrompickle(w2vfile)
tiidf = statics.loadfrompickle(tfidf_path)

# Scale each word's embedding by its tf-idf score (stored at [1][word][2]
# in the loaded score structure) and persist the weighted vectors.
tfidf_w2v = {word: w2v[word] * tiidf[1][word][2] for word in w2v}

statics.savetopickle(save_path, tfidf_w2v)