def get_embedding_vectors(candidate_summaries, reference_summaries, n_gram_encoding=None,
                          model=None, layer=None, pooling_strategy=None):
    """
    Generates the embedding vectors for the given sentences/tokens.
    Uses the BERT-as-Service client to produce the vectors.

    Args:
        - :param: `candidate_summaries` (list of list of strings): candidate summaries to be encoded -
              each summary should be represented as a list of sentences
        - :param: `reference_summaries` (list of list of strings): reference summaries to be encoded -
              each summary should be represented as a list of sentences
        - :param: `n_gram_encoding` (int): n-gram encoding level - designates how many word vectors
              to combine into each final embedding vector;
              if `None`, embeddings default to the sentence level, i.e. one vector per sentence
        - :param: `model` (str): the specific BERT model to use
        - :param: `layer` (int): the layer of representation to use
        - :param: `pooling_strategy` (str): the vector combination strategy

    Return:
        - :param: candidate_embeddings, reference_embeddings (list of lists of float): list of embedding
              vectors for the summaries; each summary has a list of vectors (i.e. a matrix)
    """
    """
    notes:
        start the server from a distinct method --> parse the arguments as the guide shows
        different pooling strategies --> specify which one to use
        should not truncate the sentences --> set the encode parameter to "not truncate"
        include a list of valid values for the vector level --> n-gram, sentence, etc.
            (scale n up from 1 so that a single vector can represent an entire summary)
        return_tensors: set the value so that it returns torch tensors

    steps:
        3) Extract and combine vectors at the designated level
        4) Return the final vectors
        5) Ensure that the method that launches the server is called from a "main" function,
           because of Windows' multi-threading issues
    """
    launch_bert_as_service_server()
    bert_client = BertClient()

    candidate_embeddings = []
    reference_embeddings = []

    # Generates the embedding vectors for each summary (one list of sentences at a time)
    for i in range(len(candidate_summaries)):
        candidate_embeddings.append(bert_client.encode(candidate_summaries[i]))
        reference_embeddings.append(bert_client.encode(reference_summaries[i]))

    print("ENCODINGS COMPLETED, TERMINATING SERVER...")
    shutdown = get_shutdown_parser().parse_args(['-ip', 'localhost', '-port', '5555', '-timeout', '5000'])
    BertServer.shutdown(shutdown)

    if n_gram_encoding is None:
        return candidate_embeddings, reference_embeddings
    elif n_gram_encoding >= 1:
        return get_ngram_embedding_vectors(candidate_embeddings, reference_embeddings,
                                           n_gram_encoding, pooling_strategy)
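# get_ngram_embedding_vectors() is called above but not defined in this excerpt. The sketch
# below is only an illustration of what the n-gram combination step could look like; the
# helper name ngram_pool, the stride-1 window loop, and the mean/max pooling choices are
# assumptions, not the project's actual implementation.
import numpy as np

def ngram_pool(summary_matrix, n, pooling_strategy="mean"):
    """Combine every window of n consecutive vectors into one vector (hypothetical helper)."""
    vectors = np.asarray(summary_matrix)
    pooled = []
    for start in range(0, len(vectors) - n + 1):
        window = vectors[start:start + n]
        if pooling_strategy == "max":
            pooled.append(window.max(axis=0))   # element-wise max over the window
        else:
            pooled.append(window.mean(axis=0))  # default: element-wise mean over the window
    return pooled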
def stop_server():
    shut_args = get_shutdown_parser().parse_args(['-port', '5555'])
    BertServer.shutdown(shut_args)
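# launch_bert_as_service_server() is used above but defined elsewhere in the project. As a
# rough sketch, it could mirror the server start-up pattern used later in this section; the
# BERT_BASE_DIR environment variable and the port/sequence-length values are assumptions
# copied from the other call sites here, not the function's confirmed body.
import os
from bert_serving.server import BertServer
from bert_serving.server.helper import get_args_parser

def launch_bert_as_service_server():
    bert_dir = os.environ.get("BERT_BASE_DIR")  # path to the pre-trained BERT model (assumed)
    args = get_args_parser().parse_args(['-model_dir', bert_dir,
                                         '-port', '5555',
                                         '-port_out', '5556',
                                         '-max_seq_len', '512',
                                         '-mask_cls_sep'])
    server = BertServer(args)
    server.start()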
    whole_sentiment_embeddings, whole_sentiment_embeddings_with_emojis = processing.get_word_embeddings_with_without_emojis(data, emojisInData)
    avg_sentiment_embeddings, avg_sentiment_embeddings_with_emojis = processing.average_word_embeddings_with_without_emojis(data, emojisInData)

    with open('data/whole_tweet_embeddings.json', 'w', encoding="utf8") as fp:
        json.dump(whole_sentiment_embeddings, fp, default=default)
    with open('data/whole_tweet_embeddings_with_emojis.json', 'w', encoding="utf8") as fp:
        json.dump(whole_sentiment_embeddings_with_emojis, fp, default=default)
    with open('data/avg_tweet_embeddings.json', 'w', encoding="utf8") as fp:
        json.dump(avg_sentiment_embeddings, fp, default=default)
    with open('data/avg_tweet_embeddings_with_emojis.json', 'w', encoding="utf8") as fp:
        json.dump(avg_sentiment_embeddings_with_emojis, fp, default=default)

    # if csv:
    #     NOTE the ml version currently does not support csv
    #     write_csv_embeddings(embeddings)

    shut_args = get_shutdown_parser().parse_args(['-ip', 'localhost', '-port', '5555', '-timeout', '5000'])
    BertServer.shutdown(shut_args)
else:
    with open('data/whole_tweet_embeddings.json') as fp:
        embeddings = json.load(fp)

if args.ml:
    embeddingFileNames = ['data/whole_tweet_embeddings.json',
                          'data/whole_tweet_embeddings_with_emojis.json',
                          'data/avg_tweet_embeddings.json',
                          'data/avg_tweet_embeddings_with_emojis.json']
    for fileName in embeddingFileNames:
        fileNameUpdate = 'Using ' + fileName + ' as the embeddings'
        print(fileNameUpdate)
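# The json.dump calls above pass default=default, but that callable is not defined in this
# excerpt. A common pattern (and only an assumption here) is a fallback serializer that
# converts NumPy values, which json cannot handle natively, into plain Python types.
import numpy as np

def default(obj):
    """Hypothetical JSON fallback serializer for NumPy values."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, (np.floating, np.integer)):
        return obj.item()
    raise TypeError("Object of type " + type(obj).__name__ + " is not JSON serializable")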
def load_data(emb_type='w2v', collapse_classes=False, fold=None, num_folds=1,
              random_state=None, force_reload=False, drop_feat_idx=[]):
    print('Loading data from', dataset_dir)
    data = pd.read_csv(dataset_dir + "/dataset.csv", sep=',')

    if force_reload:
        reset_hash()

    print("size of initial \"dataset\":", len(data))
    data = data.drop_duplicates(subset='o_url', keep='first')
    print("after dropping duplicates:", len(data))

    data.o_body = data.o_body.astype('str')
    data.verdict = data.verdict.astype('str')
    data['verdict'] = data['verdict'].str.lower()

    #data = data[data['o_body'].map(len) > MIN_BODY_LEN]
    #print("after dropping origins with less than "+str(MIN_BODY_LEN)+" chars:", len(data))

    data = data.reset_index()

    if collapse_classes:
        print("labels before collapse classes:", data.verdict.unique())
        data.loc[data['verdict'] == "mfalse", 'verdict'] = 'false'
        data.loc[data['verdict'] == "mtrue", 'verdict'] = 'true'

    labels = ['true', 'false']
    print(data['verdict'].value_counts())
    data = data.loc[data.verdict.isin(labels)]
    print("considered labels:", data.verdict.unique())
    print("after dropping invalid labels:", len(data))

    #creating hash
    json_data = data.to_json().encode()
    data = data.sample(frac=1, random_state=random_state)
    df_hash = hashlib.sha256(json_data).hexdigest()

    labels_idx = [labels.index(label) for label in labels]
    labels_one_hot = np.eye(len(labels))[labels_idx]
    label_to_oh = {label: labels_one_hot[labels.index(label)] for label in labels}

    print("MEMORY: ", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    assert (num_folds > 2), "Needs at least three folds for Dev/Train/Test to be different from each other"

    #generate and save the folds:
    for fold in range(num_folds):
        bucket_size = int(len(data.index) / num_folds)

    fold_dev = fold + 1
    if fold == num_folds - 1:
        fold_dev = 0

    if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx):
        #TODO modify these two lines back!!!
        df = data[['o_body', 'verdict']].copy()
        #df = data[['claim','verdict']].copy()
        df = df.rename(columns={"o_body": "body"})
        #df = df.rename(columns={"claim": "body"})

        df.body.apply(clean_text)
        lens = np.asarray([len(e.split(" ")) for e in df['body'].values])
        #df = df[lens < MAX_SENT_LEN]
        df.reset_index(drop=True, inplace=True)
        df.to_csv(data_dir + '/data.csv', sep="\t", index=False)
        num_entries = len(df)

        #plots the data distribution by number of words
        print("Number of entries: ", num_entries)
        print("True/False: ", df.groupby('verdict').count())
        print("Mean and Std of number of words per document: ", np.mean(lens), np.std(lens), "\n")
        #sns.distplot(lens)
        #plt.show()

        ###################################
        ############# FEATURES ############
        ###################################

        #check if new linguistic features should be generated
        flag_concat = False
        if not check_hash(df_hash, num_folds, stage="complexity"):
            flag_concat = True
            #Generate the features ndarray and save it to a pickle
            try:
                feat.generate_complexity()
            except Exception as e:
                print(traceback.format_exc())
                input("Error occurred while GENERATING COMPLEXITY. Press any key to exit.")
                sys.exit(1)
            savehash("complexity", hashcode=df_hash)

        if not check_hash(df_hash, num_folds, stage="specificity"):
            flag_concat = True
            try:
                feat.generate_specificity()
            except Exception as e:
                print(traceback.format_exc())
                input("Error occurred while GENERATING SPECIFICITY. Press any key to exit.")
Press any key to exit.") sys.exit(1) savehash("specificity", hashcode=df_hash) if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx, stage="features"): flag_concat = True try: features = feat.generateFeats() except Exception as e: print(traceback.format_exc()) input("Error occured while GENERATING FEATURES. Press any key to exit.") sys.exit(1) save_p(data_dir+"/features", features) print("Generated Features. Saved to pickle.") print("Features Shape:", features.shape) savehash("features", hashcode=df_hash, drop_feat_idx=drop_feat_idx) #check if drop_features is NOT the same if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx, stage="drop_feat"): flag_concat = True savehash("drop_feat", hashcode=df_hash, drop_feat_idx=drop_feat_idx) print("MEMORY AFTER FEATURES: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) #################################### ############### BERT ############### #################################### #check if new bert should be generated if not check_hash(df_hash, num_folds, stage="bert"): try: #creates the shuffle order (not random) index_shuf = list(range(len(df))) #creates a list of N=folds lists, each inner list contains the index of the elements of each fold bert_folds = np.array_split(index_shuf, num_folds) bert_folds = [a.tolist() for a in bert_folds] #creates an ordered list of N=entries of integers(:folds) indicating the fold idx of each entry fold_idx = [bert_folds.index(list(sl)) for e in index_shuf for sl in bert_folds if e in list(sl)] #I think this should start as True flag = {idx:True for idx in range(len(bert_folds))} #get the starting time: start_time = time.time() #start the bert-as-a-service server bert_dir = os.environ.get("BERT_BASE_DIR") print(bert_dir) args = get_args_parser().parse_args(['-model_dir', bert_dir, '-port', '5555', '-port_out', '5556', '-max_seq_len', '512', '-mask_cls_sep']) server = BertServer(args) server.start() print(num_folds) #delete the bert.csv files inside the folds for i in range(num_folds): filename = data_dir+"/folds/"+str(i)+"/bert.csv" if os.path.exists(filename): subprocess.call("rm -rf "+filename, shell=True, cwd=data_dir) #TODO make this process read only one fold at a time for fold, idx in zip(fold_idx, index_shuf): #generates the encodings for the texts bc = BertClient(check_version=False) b = bc.encode([df.body[idx]])[0] bert_df = pd.DataFrame([b], columns=['f'+str(e) for e in range(len(b))]) bert_df.to_csv(data_dir+"/folds/"+str(fold)+"/bert.csv", mode='a+', index=False, header=flag[fold]) flag[fold] = False #stops the bert-as-a-service server shut_args = get_shutdown_parser().parse_args(['-ip','localhost','-port','5555','-timeout','5000']) server.shutdown(shut_args) #print total time delta_time = time.time() - start_time print('Time Taken: for BERT generation:', time.strftime("%H:%M:%S",time.gmtime(delta_time))) except Exception as e: print(traceback.format_exc()) input("Error occured while fine training BERT. 
Press any key to exit.") sys.exit(1) print("BERT Embeddings Saved") savehash("bert", df_hash) ######################################### ## CONCATENATION, SHUFFLING AND SAVING ## ######################################### #if not check_hash(df_hash, num_folds, stage="concat"): if flag_concat: features = read_p(data_dir+"/features") features = np.delete(features,drop_feat_idx,axis=1) #normalize features features = np.nan_to_num(features) features_t = features.T for c in range(features_t.shape[0]): row = features_t[c] features_t[c] = np.interp(row, (np.min(row), np.max(row)), (-2, +2)) features = features_t.T #delete labels and folds folders for i in range(num_folds): subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/labels", shell=True, cwd=data_dir) subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/features+bert.csv", shell=True, cwd=data_dir) subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/bert", shell=True, cwd=data_dir) subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/only_bert", shell=True, cwd=data_dir) #creates the shuffle order (not random) index_shuf = list(range(len(df))) #LABELS labels = [label_to_oh[label].tolist() for label in df['verdict'].values.tolist()] labels = [labels[i] for i in index_shuf] label_folds = np.array_split(labels, num_folds) for i in range(num_folds): fold_dir = data_dir+"/folds/"+str(i) if not os.path.exists(fold_dir): os.mkdir(fold_dir) save_p(fold_dir+"/labels", label_folds[i]) #creates a list of N=folds lists, each inner list contains the index of the elements of each fold bert_folds = np.array_split(index_shuf, num_folds) bert_folds = [a.tolist() for a in bert_folds] #creates an ordered list of N=entries of integers(:folds) indicating the fold idx of each entry fold_idx = [bert_folds.index(list(sl)) for e in index_shuf for sl in bert_folds if e in list(sl)] #TODO make this process read only one fold at a time for fold in range(num_folds): b_fold_csv = pd.read_csv(data_dir+"/folds/"+str(fold)+"/bert.csv") #gets only the indexes count = sum([1 for fidx,_ in zip(fold_idx, index_shuf) if fold == fidx]) for idx in range(count): #print("csv:",b_fold_csv) #print("len",len(b_fold_csv)) #print("count: ", count) #print("range(count): ",range(count)) b = b_fold_csv.iloc[idx] entry = np.concatenate((features[idx,:],b)) feat_df = pd.DataFrame([entry], columns=['f'+str(e) for e in range(len(entry))]) feat_df.to_csv(data_dir+"/folds/"+str(fold)+"/features+bert.csv", mode='a+', index=False, header=False) for i in range(num_folds): fold_dir = data_dir+"/folds/"+str(i) bert = np.genfromtxt(fold_dir+"/features+bert.csv", delimiter=',') only_bert = np.genfromtxt(fold_dir+"/bert.csv", delimiter=',') print("saving bert fold ",str(i), bert.shape) save_p(fold_dir+"/bert", bert) save_p(fold_dir+"/only_bert", only_bert) print("MEMORY AFTER FOLDS SAVING: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) savehash("concat", hashcode=df_hash) checks = ["bert", "features", "concat", "complexity", "specificity"] for e in checks: print(e) print(check_hash(df_hash,num_folds,stage=e)) if not (check_hash(df_hash,num_folds,stage=e, drop_feat_idx=drop_feat_idx)): print('Problem at Generation of data!') print("Stage: "+e) return print('Generation of data successfully done!') savehash("data", hashcode=df_hash) savehash("folds", hashcode=str(num_folds)) return load_data(emb_type=emb_type, collapse_classes=collapse_classes, fold=fold, num_folds=num_folds, random_state=random_state, drop_feat_idx=drop_feat_idx) else: print("Reading already processed data") #returns the 
        test_data = read_p(data_dir + "/folds/" + str(fold) + "/" + emb_type)
        test_target = read_p(data_dir + "/folds/" + str(fold) + "/labels")

        dev_data = read_p(data_dir + "/folds/" + str(fold_dev) + "/" + emb_type)
        #dev_data = np.ndarray(dev_data)
        dev_target = read_p(data_dir + "/folds/" + str(fold_dev) + "/labels")

        train_data_filenames = [data_dir + "/folds/" + str(i) + "/" + emb_type
                                for i in range(num_folds) if i not in [fold, fold_dev]]
        train_data = np.concatenate([read_p(fn) for fn in train_data_filenames], axis=0)

        train_target_filenames = [data_dir + "/folds/" + str(i) + "/labels"
                                  for i in range(num_folds) if i not in [fold, fold_dev]]
        train_target = np.concatenate([read_p(fn) for fn in train_target_filenames], axis=0)

        return train_data, train_target, dev_data, dev_target, test_data, test_target, label_to_oh
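# A hypothetical call site for load_data, illustrating how the returned splits might be
# consumed. The argument values (emb_type='bert', fold=0, num_folds=5, random_state=42)
# are illustrative assumptions, not taken from this repository's configuration.
if __name__ == "__main__":
    (train_data, train_target,
     dev_data, dev_target,
     test_data, test_target, label_to_oh) = load_data(emb_type='bert', collapse_classes=True,
                                                      fold=0, num_folds=5, random_state=42)
    print("train:", train_data.shape, "dev:", dev_data.shape, "test:", test_data.shape)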
def __init__(self):
    self.start_args = get_args_parser().parse_args(START_ARGS)
    self.shut_args = get_shutdown_parser().parse_args(SHUT_ARGS)
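# A minimal sketch of how START_ARGS and SHUT_ARGS could be defined and how the wrapper above
# might be used. The constant values simply mirror the argument lists appearing elsewhere in
# this section, and the ServerWrapper class name and start/stop methods are assumptions, not
# the project's confirmed API.
import os
from bert_serving.server import BertServer
from bert_serving.server.helper import get_args_parser, get_shutdown_parser

START_ARGS = ['-model_dir', os.environ.get("BERT_BASE_DIR", "."),
              '-port', '5555', '-port_out', '5556',
              '-max_seq_len', '512', '-mask_cls_sep']
SHUT_ARGS = ['-ip', 'localhost', '-port', '5555', '-timeout', '5000']

class ServerWrapper:
    def __init__(self):
        self.start_args = get_args_parser().parse_args(START_ARGS)
        self.shut_args = get_shutdown_parser().parse_args(SHUT_ARGS)

    def start(self):
        # launch the bert-as-service server with the pre-parsed start arguments
        self.server = BertServer(self.start_args)
        self.server.start()

    def stop(self):
        # shut the server down via the pre-parsed shutdown arguments
        BertServer.shutdown(self.shut_args)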