def save_preset(save_dir: str, preset_args: dict):
    """Persist the preset arguments as JSON inside *save_dir*.

    Entries whose value is None are dropped before writing; the target
    directory is created if it does not exist yet.
    """
    filtered_args = {}
    for key, value in preset_args.items():
        if value is not None:
            filtered_args[key] = value
    os.makedirs(save_dir, exist_ok=True)
    write_json(f'{save_dir}/{PRESET_FILE_NAME}', filtered_args)
def save_train_test_split(corpus, test_size: float, verbose=1, random_state=42):
    """Split *corpus* into stratified train/test parts and save both as JSON.

    Stratifies on each row's 'offensive' label so that both splits keep the
    same class balance as the full corpus.
    """
    labels = []
    for row in corpus:
        labels.append(row['offensive'])
    if verbose == 1:
        print('Performing train test split...')
    train, test = train_test_split(
        corpus,
        stratify=labels,
        test_size=test_size,
        random_state=random_state,
    )
    write_json(f'{DIR_NAME}/{len(corpus)}/train_{FILE_NAME}', train)
    write_json(f'{DIR_NAME}/{len(corpus)}/test_{FILE_NAME}', test)
    if verbose == 1:
        print(f'Train test files saved to: {DIR_NAME}/{len(corpus)}')
def save(self, save_dir):
    """Write this text cleaner's configuration to ``predictor_config.json``.

    Each optional component (NER tagger/converter, stemmer, lemmatizer) is
    recorded as a boolean flag indicating whether it is configured on this
    instance; plain settings are copied through as-is.

    Args:
        save_dir: Directory to write the config into (created if missing).
    """
    text_cleaner_config = {
        'text_cleaner': {
            'replace_numbers': self.replace_numbers,
            # `x is not None` already evaluates to a bool — the original
            # `True if ... else False` ternaries were redundant.
            'use_ner': self.ner_tagger is not None,
            'use_ner_converter': self.ner_converter is not None,
            'use_stemming': self.stemmer is not None,
            'use_lemmatization': self.lemmatizer is not None,
            'use_twitter_data_preprocessing': self.use_twitter_data_preprocessing,
            'lowercase': self.lowercase,
        }
    }
    os.makedirs(save_dir, exist_ok=True)
    write_json(f'{save_dir}/predictor_config.json', text_cleaner_config)
def save(self, save_dir):
    """Write an empty text-cleaner configuration to ``predictor_config.json``."""
    empty_config = {'text_cleaner': {}}
    write_json(f'{save_dir}/predictor_config.json', empty_config)
def save_downloaded_corpus(corpus, verbose=1):
    """Save *corpus* as JSON under a directory named after its row count."""
    target_dir = f'{DIR_NAME}/{len(corpus)}'
    os.makedirs(target_dir, exist_ok=True)
    file_path = f'{target_dir}/{FILE_NAME}'
    write_json(file_path, corpus)
    if verbose == 1:
        print(f'Processed {len(corpus)} rows and saved to {file_path}')
def read_twitter_input_data(csv_file_path: str) -> List[dict]:
    """Parse the tweet CSV at *csv_file_path* into a list of row dicts.

    Bug fix: the function previously ignored its ``csv_file_path`` argument
    and always opened the module-level ``INPUT_CSV_FILE_PATH``.

    Each line is split on commas; everything past the fifth comma is
    re-joined so commas inside the tweet text are preserved.
    NOTE(review): fields are read starting at index 1, which assumes one
    leading column before 'polarity' — confirm against the actual CSV layout.
    """
    twitter_input_data = []
    with open(csv_file_path, 'r') as f:
        lines = f.readlines()
    for line in lines:
        line_split = line.split(',')
        twitter_input_data.append({
            'polarity': line_split[1],
            'id': line_split[2],
            'date': line_split[3],
            'query': line_split[4],
            'user': line_split[5],
            'text': ','.join(line_split[6:])
        })
    return twitter_input_data


def extract_important_features(twitter_data: List[dict]) -> List[dict]:
    """Reduce raw tweet rows to binary-labelled training samples.

    There are no neutral tweets in the dataset, which is why this task
    becomes a binary classification problem.
    """
    return [{
        'polarity': 1 if sample['polarity'] == POSITIVE_TWEET else 0,
        'text': sample['text']
    } for sample in twitter_data]


if __name__ == '__main__':
    twitter_input_data = prepare_twitter_data_for_library()
    write_json(out_path=OUTPUT_JSON_DATA_PATH, data=twitter_input_data)