Exemplo n.º 1
0
    def getTestingSet(self):
        """Load the test tokens and rule labels and convert them for prediction.

        ``self.test_file`` may have one column (``token``) or two columns
        (``token``, ``label``); only the tokens are used. ``self.testr_file``
        is read as (``token``, ``label``) and only its ``label`` column is
        used, mapped through ``self.rule2id``.

        Returns:
            tuple: ``(X_test_id, y_test_psyduo_label, X_test_token, r_test)``
            where the first two come from ``self.prepare`` on the token ids
            with ``return_psyduo_label=True``, the third from ``self.prepare``
            on the raw tokens and the fourth from ``self.prepare`` on the
            rule-label ids.

        Raises:
            ValueError: if the test file has neither one nor two columns.
        """
        delimiter = self.configs.delimiter
        df_test = read_csv_(self.test_file, names=None, delimiter=delimiter)
        df_testr = read_csv_(self.testr_file, names=None, delimiter=delimiter)

        num_columns = len(df_test.columns)
        if num_columns == 2:
            df_test.columns = ["token", "label"]
        elif num_columns == 1:
            df_test.columns = ["token"]
        else:
            # Fail early with a clear message instead of an AttributeError
            # on ``df_test.token`` further down.
            raise ValueError(
                "test file must have 1 or 2 columns, got %d" % num_columns)
        df_testr.columns = ["token", "label"]

        # Explicit copies: new columns are assigned to these subsets below,
        # which would otherwise raise pandas' SettingWithCopyWarning.
        df_test = df_test[["token"]].copy()
        df_testr = df_testr[["label"]].copy()

        nan_text = str(np.nan)
        df_test["token_id"] = df_test.token.map(lambda x: self.mapFunc(x, self.token2id))
        # Missing values (sentence separators, presumably - TODO confirm) become -1.
        df_testr["label_id"] = df_testr.label.map(lambda x: -1 if str(x) == nan_text else self.rule2id[x])
        df_test["token"] = df_test.token.map(lambda x: -1 if str(x) == nan_text else x)

        X_test_id, y_test_psyduo_label = self.prepare(df_test["token_id"], df_test["token_id"],
                                                      return_psyduo_label=True)
        X_test_token, _ = self.prepare(df_test["token"], df_test["token"])
        _, r_test = self.prepare(df_test["token_id"], df_testr["label_id"])

        self.logger.info("\ntesting set size: %d\n" % (len(X_test_id)))
        return X_test_id, y_test_psyduo_label, X_test_token, r_test
Exemplo n.º 2
0
    def getValidingSet(self):
        """Read the validation files and turn them into model inputs.

        ``self.dev_file`` supplies tokens and labels, ``self.devr_file``
        supplies the rule labels (its ``label`` column).

        Returns:
            tuple: ``(X_val, y_val, r_val)`` as produced by ``self.prepare``
            for the token ids, the label ids and the rule-label ids.
        """
        column_names = ["token", "label"]
        df_val = read_csv_(self.dev_file, names=column_names,delimiter=self.configs.delimiter)
        df_valr = read_csv_(self.devr_file, names=["token", "label"],delimiter=self.configs.delimiter)

        missing = str(np.nan)

        def token_to_id(raw):
            return self.mapFunc(raw, self.token2id)

        def label_to_id(raw):
            # Missing values are encoded as -1.
            if str(raw) == missing:
                return -1
            return self.label2id[raw]

        def rule_to_id(raw):
            if str(raw) == missing:
                return -1
            return self.rule2id[raw]

        df_val["token_id"] = df_val["token"].map(token_to_id)
        df_val["label_id"] = df_val["label"].map(label_to_id)
        df_valr["label_id"] = df_valr["label"].map(rule_to_id)

        token_ids = df_val["token_id"]
        X_val, y_val = self.prepare(token_ids, df_val["label_id"])
        # Only the rule labels are kept from the second pass.
        _, r_val = self.prepare(token_ids, df_valr["label_id"])
        return X_val, y_val, r_val
Exemplo n.º 3
0
    def match_corr_diff(self,):
        '''
        Collect every sentence whose corrected labels differ from the prediction.

        Reads the predicted results from ``self.output_test_file`` and the
        corrected results from ``self.corr_file`` (columns ``id``, ``token``,
        ``label``). The sentence id of a row is its ``id`` with the part after
        the last ``_`` removed. Every corrected row that belongs to a sentence
        containing at least one label mismatch is kept, and an all-NaN row is
        inserted between consecutive sentences as a delimiter.

        : return: DataFrame with columns ``id``, ``token``, ``label``
        '''
        # read predicted and corrected results
        df_pred = read_csv_(self.output_test_file, names=['id','token','label'], delimiter=self.configs.delimiter)
        df_corr_ori = read_csv_(self.corr_file, names=['id','token','label'], delimiter=self.configs.delimiter)

        df_corr = df_corr_ori.copy()

        # extract sen id in order to match sentences to train
        nan_text = str(np.nan)
        df_corr['url_sen_id'] = df_corr['id'].map(lambda x: x.rsplit('_', 1)[0] if str(x) != nan_text else x)

        # sentence ids that contain at least one wrongly predicted label;
        # NaN ids (rows without an id) are dropped, as groupby did before
        wrong_label = df_pred.label != df_corr.label
        changed_sen_ids = df_corr.loc[wrong_label, 'url_sen_id'].dropna().unique()

        # all corrected rows of those sentences, selected by mask so the
        # result does not depend on the frame having a default integer index
        in_changed_sen = df_corr['url_sen_id'].isin(changed_sen_ids)
        df_corr_part = df_corr.loc[in_changed_sen, ['id', 'token', 'label', 'url_sen_id']].reset_index(drop=True)

        rows_num = len(df_corr_part)
        ids = df_corr_part['id'].tolist()
        tokens = df_corr_part['token'].tolist()
        labels = df_corr_part['label'].tolist()
        sen_ids = df_corr_part['url_sen_id'].tolist()

        new_df_dict = {'id': [], 'token': [], 'label': []}

        for i in tqdm(range(rows_num), desc="Inserting sen dilimiter:"):
            # always keep the current row - previously the last row was
            # skipped because the append sat inside the bounds check
            new_df_dict['id'].append(ids[i])
            new_df_dict['token'].append(tokens[i])
            new_df_dict['label'].append(labels[i])
            # append delimiter if the next row starts another sentence
            if i < rows_num - 1 and sen_ids[i] != sen_ids[i + 1]:
                new_df_dict['id'].append(np.nan)
                new_df_dict['token'].append(np.nan)
                new_df_dict['label'].append(np.nan)

        df_fin = pd.DataFrame.from_dict(new_df_dict)
        return df_fin
Exemplo n.º 4
0
    def build_new_token(self,):
        """Extend the token vocabulary with tokens found in ``self.corr_file``.

        If every token of the corrected file is already a key of
        ``self.token2id`` nothing is written and ``None`` is returned.
        Otherwise a fresh vocabulary is built from the union of the known and
        the new tokens, with id 0 reserved for ``self.PADDING`` and the last
        id for ``self.UNKNOWN``, and written to ``self.token2id_file`` as
        ``token<TAB>id`` lines.

        Returns:
            dict | None: the new token -> id mapping, or ``None`` when the
            corrected file introduces no new token.
        """
        df_train = read_csv_(self.corr_file, names=["ord", "token", "label"], delimiter=self.configs.delimiter)
        new_tokens = set(df_train["token"][df_train["token"].notnull()])
        known_tokens = set(self.token2id)

        if new_tokens <= known_tokens:
            # it is a subset and exit
            return

        # The special tokens get their reserved ids below. Without excluding
        # them here they would also receive a regular id (they are keys of
        # the existing vocabulary) and appear twice in id2token and the file.
        special_tokens = {self.PADDING, self.UNKNOWN}
        tokens = list((known_tokens | new_tokens) - special_tokens)

        token2id = dict(zip(tokens, range(1, len(tokens) + 1)))
        id2token = dict(zip(range(1, len(tokens) + 1), tokens))

        unknown_id = len(tokens) + 1
        id2token[0] = self.PADDING
        token2id[self.PADDING] = 0
        id2token[unknown_id] = self.UNKNOWN
        token2id[self.UNKNOWN] = unknown_id

        with open(self.token2id_file, "w", encoding='utf-8') as outfile:
            for idx, token in id2token.items():
                outfile.write(f"{token}\t{idx}\n")

        return token2id
Exemplo n.º 5
0
    def getTrainingSet(self, train_val_ratio=0.9):
        """Load, encode and shuffle the training data and provide a validation set.

        Args:
            train_val_ratio: fraction of the shuffled samples used for
                training when no dev file is configured; the remainder
                becomes the validation set. Ignored when ``self.dev_file``
                is set.

        Returns:
            tuple: ``(X_train, y_train, X_val, y_val)``. When
            ``self.dev_file`` is set, the validation pair comes from
            ``self.getValidingSet()`` and all training samples are used for
            training.
        """
        df_train = read_csv_(self.train_file, names=["token", "label"], delimiter=self.configs.delimiter)

        # map the token and label into id; missing values become -1
        nan_text = str(np.nan)
        df_train["token_id"] = df_train.token.map(lambda x: -1 if str(x) == nan_text else self.token2id[x])
        df_train["label_id"] = df_train.label.map(lambda x: -1 if str(x) == nan_text else self.label2id[x])
        # convert the data into matrices
        X, y = self.prepare_train(df_train["token_id"], df_train["label_id"])

        # shuffle the samples (X and y with the same permutation)
        num_samples = len(X)
        indexs = np.arange(num_samples)
        np.random.shuffle(indexs)
        X = X[indexs]
        y = y[indexs]

        if self.dev_file is not None:
            X_train = X
            y_train = y
            X_val, y_val = self.getValidingSet()
        else:
            # split the data into train and validation set
            split_at = int(num_samples * train_val_ratio)
            X_train, X_val = X[:split_at], X[split_at:]
            y_train, y_val = y[:split_at], y[split_at:]

        self.logger.info("\ntraining set size: %d, validating set size: %d\n" % (len(X_train), len(y_val)))

        return X_train, y_train, X_val, y_val
Exemplo n.º 6
0
    def getTrainingSet(self, train_val_ratio=0.9):
        """Load, encode and shuffle the three-label training data.

        Args:
            train_val_ratio: fraction of the shuffled samples used for
                training when no dev file is configured; the remainder
                becomes the validation set. Ignored when ``self.dev_file``
                is set.

        Returns:
            tuple: ``(X_train, y_train1, y_train2, y_train3,
            X_val, y_val1, y_val2, y_val3)``. When ``self.dev_file`` is set,
            the validation part comes from ``self.getValidingSet()``.
        """
        df_train = read_csv_(self.train_file,
                             names=["token", "label1", "label2", "label3"],
                             delimiter=self.configs.delimiter)

        nan_text = str(np.nan)

        def label_to_id(x):
            # missing values become -1; all three label columns share label2id
            return -1 if str(x) == nan_text else self.label2id[x]

        # map the token and label into id
        df_train["token_id"] = df_train.token.map(
            lambda x: -1 if str(x) == nan_text else self.token2id[x])
        df_train["label1_id"] = df_train.label1.map(label_to_id)
        df_train["label2_id"] = df_train.label2.map(label_to_id)
        df_train["label3_id"] = df_train.label3.map(label_to_id)

        # convert the data into matrices
        X, y1 = self.prepare(df_train["token_id"], df_train["label1_id"])
        _, y2 = self.prepare(df_train["token_id"], df_train["label2_id"])
        _, y3 = self.prepare(df_train["token_id"], df_train["label3_id"])

        # shuffle the samples (same permutation for X and every label set)
        num_samples = len(X)
        indexs = np.arange(num_samples)
        np.random.shuffle(indexs)
        X = X[indexs]
        y1 = y1[indexs]
        y2 = y2[indexs]
        y3 = y3[indexs]

        if self.dev_file is not None:
            X_train, y_train1, y_train2, y_train3 = X, y1, y2, y3
            X_val, y_val1, y_val2, y_val3 = self.getValidingSet()
        else:
            # split the data into train and validation set
            split_at = int(num_samples * train_val_ratio)
            X_train, X_val = X[:split_at], X[split_at:]
            y_train1, y_val1 = y1[:split_at], y1[split_at:]
            y_train2, y_val2 = y2[:split_at], y2[split_at:]
            y_train3, y_val3 = y3[:split_at], y3[split_at:]

        self.logger.info("\ntraining set size: %d, validating set size: %d\n" %
                         (len(X_train), len(X_val)))

        return X_train, y_train1, y_train2, y_train3, X_val, y_val1, y_val2, y_val3
Exemplo n.º 7
0
    def buildVocab(self, train_path, trainr_path):
        """Build token, label and rule vocabularies from the training files.

        Tokens and labels come from ``train_path``; rules come from the
        ``rule`` column of ``trainr_path``. In every vocabulary id 0 is
        ``self.PADDING``; the token vocabulary additionally gets
        ``self.UNKNOWN`` as its last id. The id -> name mappings are handed
        to ``self.saveVocab``.

        Returns:
            tuple: ``(token2id, id2token, label2id, id2label, rule2id, id2rule)``
        """
        def distinct(column):
            # Unique non-null values of one column.
            return list(set(column[column.notnull()]))

        train_frame = read_csv_(train_path, names=["token", "label"],delimiter=self.configs.delimiter)
        rule_frame = read_csv_(trainr_path, names=["token", "rule"],delimiter=self.configs.delimiter)

        tokens = distinct(train_frame["token"])
        labels = distinct(train_frame["label"])
        rules = distinct(rule_frame["rule"])

        # Regular entries are numbered from 1; 0 is reserved for padding.
        rule2id = {name: idx for idx, name in enumerate(rules, 1)}
        token2id = {name: idx for idx, name in enumerate(tokens, 1)}
        label2id = {name: idx for idx, name in enumerate(labels, 1)}
        id2rule = {idx: name for idx, name in enumerate(rules, 1)}
        id2token = {idx: name for idx, name in enumerate(tokens, 1)}
        id2label = {idx: name for idx, name in enumerate(labels, 1)}

        for id2name in (id2rule, id2token, id2label):
            id2name[0] = self.PADDING
        for name2id in (rule2id, token2id, label2id):
            name2id[self.PADDING] = 0

        # Only the token vocabulary has an entry for unknown tokens.
        unknown_id = len(tokens) + 1
        id2token[unknown_id] = self.UNKNOWN
        token2id[self.UNKNOWN] = unknown_id

        self.saveVocab(id2token, id2label, id2rule)

        return token2id, id2token, label2id, id2label, rule2id, id2rule
Exemplo n.º 8
0
    def getTestingrealY_str(self):
        """Load the labels of the test file, both as ids and as raw values.

        ``self.test_file`` may have two columns (``token``, ``label``) or a
        single column, which is then treated as the label column.

        Returns:
            tuple: ``(Y_test_id, y_test_psyduo_label, Y_test_label)`` where
            the first two come from ``self.prepare`` on the label ids with
            ``return_psyduo_label=True`` and the third from ``self.prepare``
            on the raw labels (missing values replaced by -1).

        Raises:
            ValueError: if the test file has neither one nor two columns.
        """
        df_test = read_csv_(self.test_file, names=None, delimiter=self.configs.delimiter)

        num_columns = len(df_test.columns)
        if num_columns == 2:
            df_test.columns = ["token", "label"]
        elif num_columns == 1:
            df_test.columns = ["label"]
        else:
            # Fail early with a clear message instead of an AttributeError
            # on ``df_test.label`` further down.
            raise ValueError(
                "test file must have 1 or 2 columns, got %d" % num_columns)

        # Explicit copy: columns are assigned to this subset below, which
        # would otherwise raise pandas' SettingWithCopyWarning.
        df_test = df_test[["label"]].copy()

        nan_text = str(np.nan)
        df_test["label_id"] = df_test.label.map(lambda x: self.mapFuncLable(x, self.label2id))
        df_test["label"] = df_test.label.map(lambda x: -1 if str(x) == nan_text else x)

        Y_test_id, y_test_psyduo_label = self.prepare(df_test["label_id"], df_test["label_id"],
                                                      return_psyduo_label=True)
        Y_test_label, _ = self.prepare(df_test["label"], df_test["label"])

        return Y_test_id, y_test_psyduo_label, Y_test_label
Exemplo n.º 9
0
    def getValidingSet(self):
        """Read the three-label validation file and turn it into model inputs.

        Returns:
            tuple: ``(X_val, y_val1, y_val2, y_val3)`` - the prepared token
            ids followed by the prepared ids of ``label1``, ``label2`` and
            ``label3``.
        """
        label_columns = ["label1", "label2", "label3"]
        df_val = read_csv_(self.dev_file,
                           names=["token"] + label_columns,
                           delimiter=self.configs.delimiter)

        missing = str(np.nan)

        def to_label_id(raw):
            # Missing values are encoded as -1.
            if str(raw) == missing:
                return -1
            return self.label2id[raw]

        df_val["token_id"] = df_val["token"].map(
            lambda raw: self.mapFunc(raw, self.token2id))
        for column in label_columns:
            df_val[column + "_id"] = df_val[column].map(to_label_id)

        # One prepare() pass per label column; the token matrix is taken
        # from the first pass only.
        prepared = [self.prepare(df_val["token_id"], df_val[column + "_id"])
                    for column in label_columns]
        X_val, y_val1 = prepared[0]
        _, y_val2 = prepared[1]
        _, y_val3 = prepared[2]

        return X_val, y_val1, y_val2, y_val3
Exemplo n.º 10
0
    def getTestingSet(self):
        """Load the id-tagged test file and convert it for prediction.

        ``self.test_file`` may have three columns (``id``, ``token``,
        ``label``) or two columns (``id``, ``token``); the label column is
        not used.

        Returns:
            tuple: ``(hash_ids, X_test_id, y_test_psyduo_label, X_test_token)``
            as produced by the two ``self.prepare`` calls below.

        Raises:
            ValueError: if the test file has neither two nor three columns.
        """
        df_test = read_csv_(self.test_file, names=None, delimiter=self.configs.delimiter)

        num_columns = len(df_test.columns)
        if num_columns == 3:
            df_test.columns = ["id", "token", "label"]
        elif num_columns == 2:
            df_test.columns = ["id", "token"]
        else:
            # Fail early with a clear message instead of a KeyError on 'id'.
            raise ValueError(
                "test file must have 2 or 3 columns, got %d" % num_columns)

        ids = df_test['id'].tolist()
        # Explicit copy: columns are assigned to this subset below, which
        # would otherwise raise pandas' SettingWithCopyWarning.
        df_test = df_test[["token"]].copy()

        nan_text = str(np.nan)
        df_test["token_id"] = df_test.token.map(lambda x: self.mapFunc(x, self.token2id))
        df_test["token"] = df_test.token.map(lambda x: -1 if str(x) == nan_text else x)

        # NOTE(review): this call unpacks two values while the next one
        # unpacks three from the same method; presumably prepare() returns a
        # different tuple when return_psyduo_label=True - confirm.
        X_test_id, y_test_psyduo_label = self.prepare(ids, df_test["token_id"], df_test["token_id"],
                                                      return_psyduo_label=True)
        hash_ids, X_test_token, _ = self.prepare(ids, df_test["token"], df_test["token"])

        self.logger.info("\ntesting set size: %d\n" % (len(X_test_id)))

        return hash_ids, X_test_id, y_test_psyduo_label, X_test_token
Exemplo n.º 11
0
    def buildVocab(self, train_path):
        """Build the token and label vocabularies from a three-label training file.

        The label vocabulary is the union of the values found in ``label1``,
        ``label2`` and ``label3``. In both vocabularies id 0 is
        ``self.PADDING``; the token vocabulary additionally gets
        ``self.UNKNOWN`` as its last id. The id -> name mappings are handed
        to ``self.saveVocab``.

        Returns:
            tuple: ``(token2id, id2token, label2id, id2label)``
        """
        def distinct(column):
            # Unique non-null values of one column.
            return list(set(column[column.notnull()]))

        train_frame = read_csv_(train_path,
                                names=["token", "label1", "label2", "label3"],
                                delimiter=self.configs.delimiter)

        tokens = distinct(train_frame["token"])
        per_column_labels = [distinct(train_frame[name])
                             for name in ("label1", "label2", "label3")]
        labels = list(set(per_column_labels[0] + per_column_labels[1] + per_column_labels[2]))

        # Regular entries are numbered from 1; 0 is reserved for padding.
        token2id = {name: idx for idx, name in enumerate(tokens, 1)}
        label2id = {name: idx for idx, name in enumerate(labels, 1)}
        id2token = {idx: name for idx, name in enumerate(tokens, 1)}
        id2label = {idx: name for idx, name in enumerate(labels, 1)}

        for id2name in (id2token, id2label):
            id2name[0] = self.PADDING
        for name2id in (token2id, label2id):
            name2id[self.PADDING] = 0

        # Only the token vocabulary has an entry for unknown tokens.
        unknown_id = len(tokens) + 1
        id2token[unknown_id] = self.UNKNOWN
        token2id[self.UNKNOWN] = unknown_id

        self.saveVocab(id2token, id2label)

        return token2id, id2token, label2id, id2label