def save_preset(save_dir: str, preset_args: dict):
    """Store the preset arguments that actually carry a value.

    Entries of ``preset_args`` whose value is ``None`` are left out. The
    remaining key/value pairs are handed to ``write_json`` under
    ``{save_dir}/{PRESET_FILE_NAME}``; ``save_dir`` is created first when it
    does not exist yet.

    Args:
        save_dir: Directory that receives the preset file.
        preset_args: Mapping of preset argument names to their values.
    """
    populated_args = {}
    for arg_name, arg_value in preset_args.items():
        if arg_value is None:
            # Unset arguments are not persisted.
            continue
        populated_args[arg_name] = arg_value

    os.makedirs(save_dir, exist_ok=True)
    preset_path = f'{save_dir}/{PRESET_FILE_NAME}'
    write_json(preset_path, populated_args)
def save_train_test_split(corpus,
                          test_size: float,
                          verbose=1,
                          random_state=42):
    """Split ``corpus`` into stratified train/test parts and save both.

    The split is stratified on every row's ``'offensive'`` value. The two
    parts are passed to ``write_json`` as
    ``{DIR_NAME}/{len(corpus)}/train_{FILE_NAME}`` and
    ``{DIR_NAME}/{len(corpus)}/test_{FILE_NAME}``. The output directory is
    created when missing, so this function also works when the corpus has
    not been saved into that directory beforehand.

    Args:
        corpus: Sized iterable of rows, each indexable by ``'offensive'``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        verbose: Progress messages are printed only when this equals 1.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Raises:
        KeyError: If a row has no ``'offensive'`` entry (for dict rows).
    """
    # Local import so the directory creation below does not depend on a
    # module-level ``import os`` being present.
    import os

    output_dir = f'{DIR_NAME}/{len(corpus)}'
    stratify_column = [row['offensive'] for row in corpus]
    if verbose == 1:
        print('Performing train test split...')
    # NOTE(review): presumably sklearn.model_selection.train_test_split —
    # confirm against the file's imports.
    train, test = train_test_split(corpus,
                                   stratify=stratify_column,
                                   test_size=test_size,
                                   random_state=random_state)
    # Make sure the target directory exists before writing into it.
    os.makedirs(output_dir, exist_ok=True)
    write_json(f'{output_dir}/train_{FILE_NAME}', train)
    write_json(f'{output_dir}/test_{FILE_NAME}', test)
    if verbose == 1:
        print(f'Train test files saved to: {output_dir}')
Пример #3
0
 def save(self, save_dir):
     """Write this text cleaner's settings to ``predictor_config.json``.

     The settings are stored under the ``'text_cleaner'`` key. The optional
     components (NER tagger, NER converter, stemmer, lemmatizer) are
     recorded only as booleans telling whether each one is set (not
     ``None``). ``save_dir`` is created first when it does not exist.

     Args:
         save_dir: Directory that receives ``predictor_config.json``.
     """
     cleaner_settings = {
         'replace_numbers': self.replace_numbers,
         'use_ner': self.ner_tagger is not None,
         'use_ner_converter': self.ner_converter is not None,
         'use_stemming': self.stemmer is not None,
         'use_lemmatization': self.lemmatizer is not None,
         'use_twitter_data_preprocessing':
         self.use_twitter_data_preprocessing,
         'lowercase': self.lowercase,
     }
     os.makedirs(f'{save_dir}', exist_ok=True)
     config_path = f'{save_dir}/predictor_config.json'
     write_json(config_path, {'text_cleaner': cleaner_settings})
Пример #4
0
 def save(self, save_dir):
     """Write a config with an empty ``'text_cleaner'`` section.

     The config is passed to ``write_json`` as
     ``{save_dir}/predictor_config.json``. ``save_dir`` is created first
     when it does not exist, so saving into a fresh directory works.

     Args:
         save_dir: Directory that receives ``predictor_config.json``.
     """
     # Local import so this method does not depend on a module-level
     # ``import os`` being present.
     import os

     os.makedirs(save_dir, exist_ok=True)
     write_json(f'{save_dir}/predictor_config.json', {'text_cleaner': {}})
def save_downloaded_corpus(corpus, verbose=1):
    """Save ``corpus`` into a directory named after its row count.

    The corpus is passed to ``write_json`` as
    ``{DIR_NAME}/{len(corpus)}/{FILE_NAME}``; the directory is created when
    it does not exist.

    Args:
        corpus: Sized collection of rows to store.
        verbose: A summary line is printed only when this equals 1.
    """
    row_count = len(corpus)
    target_dir = f'{DIR_NAME}/{row_count}'
    os.makedirs(target_dir, exist_ok=True)

    file_path = f'{target_dir}/{FILE_NAME}'
    write_json(file_path, corpus)

    if verbose != 1:
        return
    print(f'Processed {row_count} rows and saved to {file_path}')
Пример #6
0
def read_twitter_input_data(csv_file_path: str) -> List[dict]:
    """Read the raw Twitter CSV at ``csv_file_path`` into a list of dicts.

    Every line is split on commas. Field 0 is ignored; fields 1-5 become
    ``'polarity'``, ``'id'``, ``'date'``, ``'query'`` and ``'user'``.
    Everything from field 6 onwards is re-joined with commas into
    ``'text'``, so commas inside the tweet are preserved. Values are kept
    verbatim: nothing is stripped or unquoted, and ``'text'`` keeps the
    line's trailing newline.

    Args:
        csv_file_path: Path of the CSV file to read.

    Returns:
        One dict per line of the file, in file order.

    Raises:
        IndexError: If a line has fewer than six comma-separated fields.
    """
    twitter_input_data = []
    # Open the path the caller passed in; previously the argument was
    # ignored in favour of the module-level INPUT_CSV_FILE_PATH.
    with open(csv_file_path, 'r') as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            line_split = line.split(',')
            twitter_input_data.append({
                'polarity': line_split[1],
                'id': line_split[2],
                'date': line_split[3],
                'query': line_split[4],
                'user': line_split[5],
                # The tweet itself may contain commas; glue it back together.
                'text': ','.join(line_split[6:])
            })
    return twitter_input_data


def extract_important_features(twitter_data: List[dict]) -> List[dict]:
    """Reduce every sample to a binary polarity label and its text.

    Args:
        twitter_data: Samples providing ``'polarity'`` and ``'text'``.

    Returns:
        One dict per sample with ``'polarity'`` set to 1 when the sample's
        polarity equals ``POSITIVE_TWEET`` (0 otherwise) and ``'text'``
        copied over unchanged.
    """
    # The dataset contains no neutral tweets, so the task is treated as a
    # binary classification problem.
    reduced_samples = []
    for sample in twitter_data:
        if sample['polarity'] == POSITIVE_TWEET:
            label = 1
        else:
            label = 0
        reduced_samples.append({'polarity': label, 'text': sample['text']})
    return reduced_samples


if __name__ == '__main__':
    # Script entry point: obtain the data from
    # prepare_twitter_data_for_library() and pass it to write_json with
    # OUTPUT_JSON_DATA_PATH as the output path.
    prepared_data = prepare_twitter_data_for_library()
    write_json(data=prepared_data, out_path=OUTPUT_JSON_DATA_PATH)