def get_embedding_vectors(candidate_summaries, reference_summaries, n_gram_encoding: "int | None", model, layer, pooling_strategy):
    """Generate the embedding vectors for the given summaries.

    Uses the BERT-as-Service client to produce the vectors.

    Args:
        candidate_summaries (list of list of str): candidate summaries to be
            encoded - each summary should be represented as a list of
            sentences.
        reference_summaries (list of list of str): reference summaries to be
            encoded - each summary should be represented as a list of
            sentences.
        n_gram_encoding (int or None): n-gram encoding level - designates how
            many word vectors to combine for each final embedding vector.
            If None, the embedding level defaults to the sentence level of
            each individual sentence.
        model (str): the specific BERT model to use. Not read by this
            function (presumably configured when the server is started -
            TODO confirm).
        layer (int): the layer of representation to use. Not read by this
            function (presumably a server-side setting - TODO confirm).
        pooling_strategy (str): the vector combination strategy; forwarded to
            ``get_ngram_embedding_vectors`` when ``n_gram_encoding`` is set.

    Returns:
        When ``n_gram_encoding`` is None: the tuple
        ``(candidate_embeddings, reference_embeddings)``, each a list holding
        one ``BertClient.encode`` result per summary (i.e. a matrix per
        summary). Otherwise: whatever ``get_ngram_embedding_vectors`` returns
        for those embeddings.

    Raises:
        ValueError: if ``n_gram_encoding`` is neither None nor >= 1.

    TODO (open items carried over from the original notes):
        - start the server from a distinct method for that --> parse the
          arguments as the guide shows
        - different pooling strategies --> specify which is to be used
        - should not truncate the sentences --> set the encode parameter for
          this to "not truncate"
        - include a list of valid values for vector_level --> n-gram,
          sentence etc. (scale from n = 1 upwards so that one vector can
          stand for an entire summary)
        - return_tensors: set the value so that it returns torch tensors
        3) extract and combine at the designated level
        4) return the final vectors
        5) ensure that the method that launches the server is placed in a
           "main" function call because of Windows' multi-threading issues
    """
    # Fail fast on an invalid level, before a client connection is opened.
    if n_gram_encoding is not None and n_gram_encoding < 1:
        raise ValueError(
            "n_gram_encoding must be None or an integer >= 1, got %r" % (n_gram_encoding,)
        )

    bert_client = BertClient()

    # One encode() call per summary: each summary is a list of sentences, so
    # each result is a matrix with one row per sentence.
    # NOTE(review): the original loop iterated over the (empty) result list
    # and had no body; encoding every input summary is the behaviour the
    # description above asks for - confirm against the intended design.
    candidate_embeddings = [bert_client.encode(summary) for summary in candidate_summaries]
    reference_embeddings = [bert_client.encode(summary) for summary in reference_summaries]

    # Server start-up/shutdown is left to the caller: the original parsed
    # shutdown arguments here but never used them.

    if n_gram_encoding is None:
        return candidate_embeddings, reference_embeddings

    # Hand the combined result back instead of discarding it.
    return get_ngram_embedding_vectors(
        candidate_embeddings, reference_embeddings, n_gram_encoding, pooling_strategy
    )
def stop_server():
    """Shut down the BERT-as-Service server listening on port 5555.

    Builds the shutdown arguments with ``get_shutdown_parser()`` (only the
    port is given explicitly; every other option keeps the parser's default)
    and passes them to ``BertServer.shutdown``.
    """
    shut_args = get_shutdown_parser().parse_args(['-port', '5555'])
    # Parsing the arguments alone does not stop anything; the shutdown
    # request has to be sent explicitly.
    BertServer.shutdown(shut_args)
        whole_sentiment_embeddings, whole_sentiment_embeddings_with_emojis = processing.get_word_embeddings_with_without_emojis(data, emojisInData)
        avg_sentiment_embeddings, avg_sentiment_embeddings_with_emojis = processing.average_word_embeddings_with_without_emojis(data, emojisInData)

        with open('data/whole_tweet_embeddings.json', 'w', encoding="utf8") as fp:
            json.dump(whole_sentiment_embeddings, fp, default=default)
        with open('data/whole_tweet_embeddings_with_emojis.json', 'w', encoding="utf8") as fp:
            json.dump(whole_sentiment_embeddings_with_emojis, fp, default=default)
        with open('data/avg_tweet_embeddings.json', 'w', encoding="utf8") as fp:
            json.dump(avg_sentiment_embeddings, fp, default=default)
        with open('data/avg_tweet_embeddings_with_emojis.json', 'w', encoding="utf8") as fp:
            json.dump(avg_sentiment_embeddings_with_emojis, fp, default=default)

        # if csv: # NOTE the ml version currently does not support cvs
        #     write_csv_embeddings(embeddings)
        shut_args = get_shutdown_parser().parse_args(['-ip','localhost','-port','5555','-timeout','5000'])
        with open('data/whole_tweet_embeddings.json') as fp:
            embeddings = json.load(fp)

    if args.ml:

        embeddingFileNames = ['data/whole_tweet_embeddings.json', 

        for fileName in embeddingFileNames:
            fileNameUpdate = 'Using ' + fileName + ' as the embeddings'
# File: data_loader.py  Project: lucas0/Lux
def load_data(emb_type='w2v', collapse_classes=False, fold=None, num_folds=1, random_state=None, force_reload=False, drop_feat_idx=[]):
    """Load the dataset, regenerate per-fold artefacts if needed, and return the train/dev/test split.

    Reads ``dataset_dir + "/dataset.csv"``, cleans and filters it to the
    'true'/'false' verdicts, hashes it, and - when ``check_hash`` reports a
    mismatch - regenerates linguistic features, BERT encodings and the
    per-fold pickles under ``data_dir + "/folds/<i>/"`` before calling itself
    again to read them back.

    NOTE(review): several statements appear to be missing from this copy of
    the function (``try:`` headers before the ``except`` clauses, the guard
    in front of the class-collapsing lines, the body of one ``if``, and an
    ``else:`` before the read-back section), so it does not parse as-is.
    The affected spots are marked below.

    Args:
        emb_type: file name of the per-fold pickle that is read back
            (``folds/<i>/<emb_type>``).
        collapse_classes: not read anywhere in the visible code; presumably
            guards the "mfalse"/"mtrue" relabelling - TODO confirm. Forwarded
            to the recursive call.
        fold: fold index used as the test fold when reading back.
            NOTE(review): rebound by the ``for fold in ...`` loops below.
        num_folds: number of folds; asserted to be greater than 2, so the
            default of 1 trips the assertion.
        random_state: passed to ``DataFrame.sample`` when shuffling.
        force_reload: when truthy, ``reset_hash()`` is called first.
        drop_feat_idx: column indices removed from the feature matrix with
            ``np.delete(..., axis=1)``; also passed to ``check_hash`` and
            ``savehash``. NOTE(review): mutable default argument.

    Returns:
        ``train_data, train_target, dev_data, dev_target, test_data,
        test_target, label_to_oh`` from the read-back section; the
        regeneration path returns the result of the recursive call.
    """
    print('Loading data from',dataset_dir)
    data = pd.read_csv(dataset_dir+"/dataset.csv", sep=',')

    if force_reload: reset_hash()

    print("size of initial \"dataset\":",len(data))
    # keep only the first row for every distinct 'o_url'
    data = data.drop_duplicates(subset='o_url', keep='first')
    print("after dropping duplicates:",len(data))
    # body and verdict are handled as strings; verdicts are compared in lower case
    data.o_body = data.o_body.astype('str')
    data.verdict = data.verdict.astype('str')
    data['verdict'] = data['verdict'].str.lower()
    #data = data[data['o_body'].map(len) > MIN_BODY_LEN]
    #print("after dropping origins with less than "+str(MIN_BODY_LEN)+" chars:",len(data))
    data = data.reset_index()

    # NOTE(review): the next three lines are indented like the body of a
    # conditional whose header is not present in this copy - presumably
    # `if collapse_classes:`; confirm against the upstream file.
        print("labels before collapse classes:", data.verdict.unique())
        # fold the "mfalse"/"mtrue" verdicts into 'false'/'true'
        data.loc[data['verdict'] == "mfalse", 'verdict'] = 'false'
        data.loc[data['verdict'] == "mtrue", 'verdict'] = 'true'

    # only rows whose verdict is one of these two labels are kept
    labels = ['true', 'false']
    data = data.loc[data.verdict.isin(labels)]
    print("considered labels:", data.verdict.unique())
    print("after dropping invalid labels:",len(data))

    #creating hash
    # the JSON dump (and therefore the hash) is taken before the rows are shuffled
    json_data = data.to_json().encode()
    data = data.sample(frac=1, random_state=random_state)
    df_hash = hashlib.sha256(json_data).hexdigest()

    # one-hot rows of the identity matrix: 'true' -> [1, 0], 'false' -> [0, 1]
    labels_idx = [labels.index(label) for label in labels]
    labels_one_hot = np.eye(len(labels))[labels_idx]
    label_to_oh = {label:labels_one_hot[labels.index(label)] for label in labels}

    print("MEMORY: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    # NOTE(review): with the default num_folds=1 this assertion fails.
    assert (num_folds > 2), "Needs at least three folds for Dev/Train/Test to be different from each other"
    #generate and save the folds:
    # NOTE(review): this loop rebinds the `fold` parameter; once it finishes,
    # fold == num_folds-1 and fold_dev == 0 whatever the caller passed.
    # `bucket_size` is not used in the visible code. The loop body may be
    # truncated in this copy - confirm.
    for fold in range(num_folds):
        bucket_size = int(len(data.index)/num_folds)
        # the dev fold is the one after `fold`, wrapping around to 0
        fold_dev = fold+1
        if fold == num_folds-1:
            fold_dev = 0

    # regeneration path: taken when check_hash does not accept the current hash/settings
    if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx):
        #TODO modify these two lines back!!!
        df = data[['o_body','verdict']].copy()
        #df = data[['claim','verdict']].copy()
        df = df.rename(columns={"o_body": "body"})
        #df = df.rename(columns={"claim": "body"})

        # number of space-separated tokens per document (only used for the statistics printed below)
        lens = np.asarray([len(e.split(" ")) for e in df['body'].values])
        #df = df[lens < MAX_SENT_LEN]
        df.reset_index(drop = True, inplace = True)
        df.to_csv(data_dir+'/data.csv', sep="\t", index=False)
        num_entries = len(df)

        #plots the data distribution by number of words
        print("Number of entries: ", num_entries)
        print("True/False: ",df.groupby('verdict').count())
        print("Mean and Std of number of words per document: ",np.mean(lens),np.std(lens), "\n")

        ############# FEATURES ############
        #check if new linguistic features should be generated
        # flag_concat is set whenever any stage below is redone; it triggers
        # the concatenation/fold-saving step further down
        flag_concat = False
        if not check_hash(df_hash, num_folds, stage="complexity"):
            flag_concat = True
            #Generate the features ndarray and save it to a pickle
            # NOTE(review): this `except` has no matching `try:` in this copy;
            # the guarded generation call appears to be missing.
            except Exception as e:
                input("Error occured while GENERATING COMPLEXITY. Press any key to exit.")
            savehash("complexity", hashcode=df_hash)
        if not check_hash(df_hash, num_folds, stage="specificity"):
            flag_concat = True
            # NOTE(review): same as above - `try:` and the guarded call are missing.
            except Exception as e:
                input("Error occured while GENERATING SPECIFICITY. Press any key to exit.")
            savehash("specificity", hashcode=df_hash)

        if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx, stage="features"):
            flag_concat = True
            # NOTE(review): the `try:` header for the call below is missing in this copy.
                features = feat.generateFeats()
            except Exception as e:
                input("Error occured while GENERATING FEATURES. Press any key to exit.")
            save_p(data_dir+"/features", features)
            print("Generated Features. Saved to pickle.")
            print("Features Shape:", features.shape)
            savehash("features", hashcode=df_hash, drop_feat_idx=drop_feat_idx)

        #check if drop_features is NOT the same
        if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx, stage="drop_feat"):
            flag_concat = True
            savehash("drop_feat", hashcode=df_hash, drop_feat_idx=drop_feat_idx)

        print("MEMORY AFTER FEATURES: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

        ############### BERT ###############
        #check if new bert should be generated
        # NOTE(review): flag_concat is not set in this stage.
        if not check_hash(df_hash, num_folds, stage="bert"):
            # NOTE(review): the block below is indented one level deeper than
            # this `if` and is closed by an `except` - the `try:` header
            # appears to be missing in this copy.
                #creates the shuffle order (not random)
                index_shuf = list(range(len(df)))

                #creates a list of N=folds lists, each inner list contains the index of the elements of each fold
                bert_folds = np.array_split(index_shuf, num_folds)
                bert_folds = [a.tolist() for a in bert_folds]

                #creates an ordered list of N=entries of integers(:folds) indicating the fold idx of each entry
                fold_idx = [bert_folds.index(list(sl)) for e in index_shuf for sl in bert_folds if e in list(sl)]

                #I think this should start as True
                # per-fold "write the CSV header" flag: True until the first row of that fold is written
                flag = {idx:True for idx in range(len(bert_folds))}

                #get the starting time:
                start_time = time.time()

                #start the bert-as-a-service server
                bert_dir = os.environ.get("BERT_BASE_DIR")
                args = get_args_parser().parse_args(['-model_dir', bert_dir, '-port', '5555', '-port_out', '5556', '-max_seq_len', '512', '-mask_cls_sep'])
                # NOTE(review): the server object is only constructed here; no
                # start call is visible in this copy - confirm.
                server = BertServer(args)

                #delete the bert.csv files inside the folds
                for i in range(num_folds):
                    filename = data_dir+"/folds/"+str(i)+"/bert.csv"
                    if os.path.exists(filename):
                        subprocess.call("rm -rf "+filename, shell=True, cwd=data_dir)

                #TODO make this process read only one fold at a time
                # NOTE(review): `fold` is rebound again by this loop.
                for fold, idx in zip(fold_idx, index_shuf):

                    #generates the encodings for the texts
                    # a new client is constructed for every entry
                    bc = BertClient(check_version=False)
                    b = bc.encode([df.body[idx]])[0]

                    # append the encoding as one row to the CSV of the entry's fold;
                    # the header is written only with the first row of each fold
                    bert_df = pd.DataFrame([b], columns=['f'+str(e) for e in range(len(b))])
                    bert_df.to_csv(data_dir+"/folds/"+str(fold)+"/bert.csv", mode='a+', index=False, header=flag[fold])
                    flag[fold] = False

                #stops the bert-as-a-service server
                # NOTE(review): only the shutdown arguments are parsed here; no
                # shutdown call using `shut_args` is visible in this copy.
                shut_args = get_shutdown_parser().parse_args(['-ip','localhost','-port','5555','-timeout','5000'])

                #print total time
                delta_time = time.time() - start_time
                print('Time Taken: for BERT generation:', time.strftime("%H:%M:%S",time.gmtime(delta_time)))

            except Exception as e:
                input("Error occured while fine training BERT. Press any key to exit.")

            print("BERT Embeddings Saved")
            savehash("bert", df_hash)


        #if not check_hash(df_hash, num_folds, stage="concat"):
        if flag_concat:
            # reload the pickled feature matrix and drop the requested columns
            features = read_p(data_dir+"/features")
            features = np.delete(features,drop_feat_idx,axis=1)

            #normalize features
            # NaNs are replaced first, then every feature column is mapped
            # linearly from its [min, max] onto [-2, +2]
            features = np.nan_to_num(features)
            features_t = features.T
            for c in range(features_t.shape[0]):
                row = features_t[c]
                features_t[c] = np.interp(row, (np.min(row), np.max(row)), (-2, +2))
            features = features_t.T
            #delete labels and folds folders
            for i in range(num_folds):
                subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/labels", shell=True, cwd=data_dir)
                subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/features+bert.csv", shell=True, cwd=data_dir)
                subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/bert", shell=True, cwd=data_dir)
                subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/only_bert", shell=True, cwd=data_dir)

            #creates the shuffle order (not random)
            index_shuf = list(range(len(df)))

            # one-hot labels in df order, split into num_folds chunks
            labels = [label_to_oh[label].tolist() for label in df['verdict'].values.tolist()]
            labels = [labels[i] for i in index_shuf]
            label_folds = np.array_split(labels, num_folds)

            for i in range(num_folds):
                fold_dir = data_dir+"/folds/"+str(i)
                # NOTE(review): this `if` has no body in this copy - presumably
                # the fold directory is created here; confirm.
                if not os.path.exists(fold_dir):
                save_p(fold_dir+"/labels", label_folds[i])

            #creates a list of N=folds lists, each inner list contains the index of the elements of each fold
            bert_folds = np.array_split(index_shuf, num_folds)
            bert_folds = [a.tolist() for a in bert_folds]

            #creates an ordered list of N=entries of integers(:folds) indicating the fold idx of each entry
            fold_idx = [bert_folds.index(list(sl)) for e in index_shuf for sl in bert_folds if e in list(sl)]

            #TODO make this process read only one fold at a time
            for fold in range(num_folds):
                b_fold_csv = pd.read_csv(data_dir+"/folds/"+str(fold)+"/bert.csv")
                #gets only the indexes
                # number of entries assigned to this fold
                count = sum([1 for fidx,_ in zip(fold_idx, index_shuf) if fold == fidx])
                for idx in range(count):
                    #print("count: ", count)
                    #print("range(count): ",range(count))
                    b = b_fold_csv.iloc[idx]
                    # NOTE(review): `idx` is the position inside the fold, but it
                    # is also used to index `features`, which is not split by
                    # fold - this looks like every fold is paired with the first
                    # `count` feature rows; confirm.
                    entry = np.concatenate((features[idx,:],b))

                    # appended without a header row
                    feat_df = pd.DataFrame([entry], columns=['f'+str(e) for e in range(len(entry))])
                    feat_df.to_csv(data_dir+"/folds/"+str(fold)+"/features+bert.csv", mode='a+', index=False, header=False)

            # convert the per-fold CSVs into pickles: "bert" holds features+BERT, "only_bert" holds BERT alone
            for i in range(num_folds):
                fold_dir = data_dir+"/folds/"+str(i)
                bert = np.genfromtxt(fold_dir+"/features+bert.csv", delimiter=',')
                # NOTE(review): bert.csv is written with a header row above, and
                # no skip_header is passed here - verify the first row of `only_bert`.
                only_bert = np.genfromtxt(fold_dir+"/bert.csv", delimiter=',')
                print("saving bert fold ",str(i), bert.shape)
                save_p(fold_dir+"/bert", bert)
                save_p(fold_dir+"/only_bert", only_bert)

            print("MEMORY AFTER FOLDS SAVING: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

            savehash("concat", hashcode=df_hash)

        checks = ["bert", "features", "concat", "complexity", "specificity"]

        # a failed stage is only reported; the success message and the
        # savehash calls below run regardless
        for e in checks:
            if not (check_hash(df_hash,num_folds,stage=e, drop_feat_idx=drop_feat_idx)):
                print('Problem at Generation of data!')
                print("Stage: "+e)

        print('Generation of data successfully done!')
        savehash("data", hashcode=df_hash)
        savehash("folds", hashcode=str(num_folds))

        # re-enter to read the freshly written folds; force_reload is not forwarded
        return load_data(emb_type=emb_type, collapse_classes=collapse_classes, fold=fold, num_folds=num_folds, random_state=random_state, drop_feat_idx=drop_feat_idx)

        # NOTE(review): as written, everything below follows a `return` at the
        # same indentation - an `else:` for the outer `if not check_hash(...)`
        # appears to be missing in this copy.
        print("Reading already processed data")
        #returns the selected emb type (bert/w2v)
        test_data = read_p(data_dir+"/folds/"+str(fold)+"/"+emb_type)
        test_target = read_p(data_dir+"/folds/"+str(fold)+"/labels")

        dev_data = read_p(data_dir+"/folds/"+str(fold_dev)+"/"+emb_type)
        #dev_data = np.ndarray(dev_data)
        dev_target = read_p(data_dir+"/folds/"+str(fold_dev)+"/labels")

        # training data: every fold except the test fold and the dev fold, concatenated along axis 0
        train_data_filenames = [data_dir+"/folds/"+str(i)+"/"+emb_type for i in range(num_folds) if i not in [fold,fold_dev]]
        train_data = np.concatenate([read_p(fn) for fn in train_data_filenames], axis=0)
        train_target_filenames = [data_dir+"/folds/"+str(i)+"/labels" for i in range(num_folds) if i not in [fold,fold_dev]]
        train_target = np.concatenate([read_p(fn) for fn in train_target_filenames], axis=0)

        return train_data, train_target, dev_data, dev_target, test_data, test_target, label_to_oh
 def __init__(self):
     """Parse and store the server start-up and shutdown arguments.

     ``START_ARGS`` is run through the parser returned by
     ``get_args_parser()`` and kept as ``self.start_args``; ``SHUT_ARGS`` is
     run through the parser returned by ``get_shutdown_parser()`` and kept as
     ``self.shut_args``.
     """
     # Start-up arguments first, then shutdown arguments.
     start_parser = get_args_parser()
     self.start_args = start_parser.parse_args(START_ARGS)

     shutdown_parser = get_shutdown_parser()
     self.shut_args = shutdown_parser.parse_args(SHUT_ARGS)