def get_dataloader(self, df, features_df, bs, include_df=False, **kwargs):
    """Build a dataloader from *df*, optionally with standardized extra features.

    Args:
        df: DataFrame with ``preprocessed`` text and ``label`` columns.
        features_df: optional DataFrame of numeric features; its ``Readable``
            column is dropped and the remainder standardized. ``None`` means
            no extra features.
        bs: batch size.
        include_df: when True, also return *df* alongside the dataloader.

    Returns:
        The dataloader, or ``(dataloader, df)`` when *include_df* is set.
    """
    text_values = df.preprocessed.values
    label_ids = df.label.values
    if features_df is not None:
        extra_features = features_df.drop('Readable', axis=1).to_numpy()
        # Standardize on whatever split was passed in -- caller is
        # responsible for avoiding train/test leakage here.
        extra_features = StandardScaler().fit_transform(extra_features)
        assert extra_features.shape[1] == self.config.extra_features_size
    else:
        extra_features = None
    # NOTE(review): the original computed extra_features but dropped them on
    # the floor (only a commented-out call passed them through). Forward them
    # explicitly in both branches -- confirm tu.get_dataloader accepts
    # extra_features=None when unused.
    if self.config.sep_token:
        # Leave the first text segment empty; content goes in the pair slot.
        dataloader = tu.get_dataloader(
            self.config, self.tokenizer, [''] * len(text_values), label_ids,
            bs, text_pair_values=text_values, extra_features=extra_features,
            **kwargs)
    else:
        dataloader = tu.get_dataloader(
            self.config, self.tokenizer, text_values, label_ids, bs,
            extra_features=extra_features, **kwargs)
    if include_df:
        return (dataloader, df)
    return dataloader
def get_dataloader(self, df, bs, include_df=False, fake=False, **kwargs):
    """Dataloader over method texts labeled with cyclomatic complexity.

    When *fake* is set, only the first 128 rows are used and their labels
    are shuffled in place -- a sanity-check set with broken text/label
    pairing (note: shuffle order depends on numpy's global RNG state).
    """
    text_values = df.methodText.values
    label_ids = df.cyclomatic.values
    if fake:
        label_ids = label_ids.copy()[:128]
        text_values = text_values[:128]
        np.random.shuffle(label_ids)
    if self.config.sep_token:
        # Empty first segment; feed the method text as the pair segment.
        blanks = [''] * len(text_values)
        dataloader = tu.get_dataloader(self.config, self.tokenizer, blanks,
                                       label_ids, bs,
                                       text_pair_values=text_values, **kwargs)
    else:
        dataloader = tu.get_dataloader(self.config, self.tokenizer,
                                       text_values, label_ids, bs, **kwargs)
    return (dataloader, df) if include_df else dataloader
def get_dataloader(self, df, bs, include_df=False, **kwargs):
    """Build a dataloader from preprocessed text and the ``class`` column.

    Args:
        df: DataFrame with ``preprocessed`` text and a ``class`` label column.
        bs: batch size.
        include_df: when True, also return *df* alongside the dataloader.
    """
    # .values for consistency with the sibling datasets: hand
    # tu.get_dataloader a plain ndarray rather than a pandas Series.
    text_values = df.preprocessed.values
    label_ids = df['class'].values
    dataloader = tu.get_dataloader(self.config, self.tokenizer, text_values,
                                   label_ids, bs, **kwargs)
    if include_df:
        return (dataloader, df)
    return dataloader
def get_dataloader(self, df, bs, include_df=False, **kwargs):
    """Dataloader over ``text`` rows with the boolean ``informative`` label
    cast to 0/1 integers."""
    texts = df.text.values
    labels = df.informative.astype(int).values
    loader = tu.get_dataloader(self.config, self.tokenizer, texts, labels,
                               bs, **kwargs)
    return (loader, df) if include_df else loader
def get_dataloader(self, df, bs, text_col='text', **kwargs):
    """Dataloader whose labels are either soft scores or hard ids.

    With ``config.soft_label`` set, the label matrix is taken from the
    per-class ``LABEL_NAMES`` columns; otherwise the ``label`` column is
    used directly. *text_col* selects the text column.
    """
    texts = df[text_col].values
    if self.config.soft_label:
        labels = df[self.LABEL_NAMES].to_numpy()
    else:
        labels = df.label.values
    return tu.get_dataloader(self.config, self.tokenizer, texts, labels,
                             bs, **kwargs)
def get_dataloader(self, df, bs, include_df=False, **kwargs):
    """Dataloader over code snippets with the binary ``problematic`` label.

    Args:
        df: DataFrame with ``code`` text and a boolean ``problematic`` column.
        bs: batch size.
        include_df: when True, also return *df* alongside the dataloader.
    """
    text_values = df.code.values
    label_ids = df.problematic.astype(int).values
    # (removed stray debug print of label_ids)
    if self.config.sep_token:
        # Leave the first text segment empty; the code goes in the pair slot.
        dataloader = tu.get_dataloader(
            self.config, self.tokenizer, [''] * len(text_values), label_ids,
            bs, text_pair_values=text_values, **kwargs)
    else:
        dataloader = tu.get_dataloader(
            self.config, self.tokenizer, text_values, label_ids, bs, **kwargs)
    if include_df:
        return (dataloader, df)
    return dataloader
def get_dataloader(self, df, bs, include_df=False, **kwargs):
    """Dataloader mapping textual ``complexity`` labels to class indices
    via their position in ``self.LABEL_NAMES``."""
    texts = df.code.values
    name_to_idx = {name: i for i, name in enumerate(self.LABEL_NAMES)}
    labels = df.complexity.map(name_to_idx).values
    if self.config.sep_token:
        # Empty first segment; the code becomes the pair segment.
        loader = tu.get_dataloader(self.config, self.tokenizer,
                                   [''] * len(texts), labels, bs,
                                   text_pair_values=texts, **kwargs)
    else:
        loader = tu.get_dataloader(self.config, self.tokenizer, texts,
                                   labels, bs, **kwargs)
    return (loader, df) if include_df else loader
def get_dataloader(self, tokenizer, df, bs, include_df=False, **kwargs):
    """Dataloader for single- or multi-label classification.

    If ``config.single_class`` names a label column, that column supplies
    the labels; otherwise the full ``Dataset.LABELS`` column block is used.
    """
    text_values = df.preprocessed.values
    single_class = self.config.single_class
    label_ids = (df[single_class].to_numpy() if single_class
                 else df[Dataset.LABELS].to_numpy())
    loader = tu.get_dataloader(self.config, tokenizer, text_values,
                               label_ids, bs, **kwargs)
    return (loader, df) if include_df else loader
def get_dataloader(self, tokenizer, df, bs, include_df=False, **kwargs):
    """Dataloader over preprocessed text with integer ``label`` ids.

    Args:
        tokenizer: tokenizer to encode the text with.
        df: DataFrame with ``preprocessed`` text and ``label`` columns.
        bs: batch size.
        include_df: when True, also return *df* alongside the dataloader.
    """
    # (removed stale commented-out label map and debug prints of
    # label dtype / unique values)
    text_values = df.preprocessed.values
    label_ids = df.label.values
    dataloader = tu.get_dataloader(self.config, tokenizer, text_values,
                                   label_ids, bs, **kwargs)
    if include_df:
        return (dataloader, df)
    return dataloader
def get_dataloader(self, df, bs, backtrans_langs=None, include_df=False,
                   **kwargs):
    """Dataloader with optional back-translation augmentation.

    Args:
        df: DataFrame with ``preprocessed`` text, ``label_id`` labels and,
            for each language code ``l`` in *backtrans_langs*, a
            ``preprocessed_{l}`` column of back-translated text.
        bs: batch size.
        backtrans_langs: language codes to append as augmented copies; the
            label array is tiled to match. ``None``/empty means no
            augmentation.
        include_df: when True, also return *df* alongside the dataloader.
    """
    # Mutable default ([]) replaced by a None sentinel -- same behavior for
    # all callers, but no shared list across calls.
    backtrans_langs = backtrans_langs or []
    text_values = df.preprocessed.values
    label_ids = df.label_id.astype(int).values
    if backtrans_langs:
        logger.info("Text values array shape before augmentation: %s",
                    text_values.shape)
        logger.info("Label ids array shape before augmentation: %s",
                    label_ids.shape)
        for lang in backtrans_langs:
            text_values = np.append(text_values,
                                    df[f'preprocessed_{lang}'].values, axis=0)
        # One label copy per appended language plus the original.
        label_ids = np.tile(label_ids, len(backtrans_langs) + 1)
        logger.info("Text values array shape AFTER augmentation: %s",
                    text_values.shape)
        logger.info("Label ids array shape AFTER augmentation: %s",
                    label_ids.shape)
    dataloader = tu.get_dataloader(self.config, self.tokenizer, text_values,
                                   label_ids, bs, **kwargs)
    if include_df:
        return (dataloader, df)
    return dataloader
def get_dataloader(self, df, bs, include_df=False, **kwargs):
    """Dataloader that pairs comment and code as two segments when
    ``config.sep_token`` is set; otherwise a single preprocessed segment."""
    if self.config.sep_token:
        first_segment = df.preprocessed_comment.values
        second_segment = df.preprocessed_code.values
    else:
        first_segment = df.preprocessed.values
        second_segment = None
    labels = df.label.values
    loader = tu.get_dataloader(self.config, self.tokenizer, first_segment,
                               labels, bs, text_pair_values=second_segment,
                               **kwargs)
    return (loader, df) if include_df else loader
def main():
    """Classify unlabeled SATD comments with a fine-tuned binary model.

    Loads the unclassified comment CSV, preprocesses the comment text,
    runs the saved binary classifier over it, attaches the predicted
    class, per-class probabilities and a least-confidence uncertainty
    score, then writes the fully evaluated CSV plus the positive
    (TECHNICAL_DEBT) subset.
    """
    config = get_config()
    with config:
        config.logging_steps = 400
        config.train_epochs = 2
        config.lr = 4e-5
        config.model_type = 'roberta'
        config.model_path = util.models_path('satd_complete_binary')
    tokenizer = tu.load_tokenizer(config)
    # NOTE(review): model_cls is unused below -- confirm get_model_cls has
    # no required side effect before deleting this line.
    model_cls = tu.get_model_cls(config)
    df = pd.read_csv(util.data_path('satd', 'unclassified.csv'))
    df.dropna(inplace=True)
    print(df.dtypes)
    print(df.head())
    df['preprocessed'] = df.commenttext.map(TDDataset.preprocess)
    # Preprocessing can yield NaNs (e.g. empty comments) -- drop those rows.
    df.dropna(inplace=True)
    preprocessed = df.preprocessed.values
    # The dataloader API requires labels; they are unused at predict time.
    dummy_labels = np.zeros(preprocessed.shape[0])
    dataloader = tu.get_dataloader(config, tokenizer, preprocessed,
                                   dummy_labels, bs=128, shuffle=False)
    model = tu.load_model(config)
    model.to(config.device)
    util.set_seed(config)
    experiment = Experiment(config, model, tokenizer)
    preds = experiment.predict(dataloader)
    preds = torch.from_numpy(preds)
    probs = F.softmax(preds, dim=1)
    # Least-confidence active-learning score: higher = less certain.
    uncertainty = least_conf(probs).numpy()
    labels = np.argmax(preds, axis=1)
    df['uncertainty'] = uncertainty
    df['probs0'] = probs[:, 0].numpy()
    df['probs1'] = probs[:, 1].numpy()
    df['classification'] = labels
    df.drop('preprocessed', axis='columns', inplace=True)
    label_name_map = {i: l for i, l in enumerate(TDDataset.BINARY_LABEL_NAMES)}
    print(label_name_map)
    df.classification = df.classification.map(label_name_map)
    df.to_csv(util.data_path('satd', 'unclassified_evaled.csv'), index=False)
    tech_debt_df = df[df.classification == 'TECHNICAL_DEBT']
    print(tech_debt_df.shape)
    tech_debt_df.to_csv(util.data_path('satd', 'unclassified_pos.csv'),
                        index=False)