import logging
import os
import shutil


def prepare_processed_training_data():
    """ Generate all features into processed/... folder from interim/... """
    # DATA_PROCESSED_ROOT, DATA_INTERIM_ROOT, DATASET_NAME, DATASET_EVENTS,
    # VERACITY_LABELS_MAPPING, collect_feature and json_from_file are assumed
    # to be defined or imported elsewhere in this module.
    logger = logging.getLogger(__name__)
    logger.info('Making processed training data set from interim data')

    # Init absolute paths of folders
    processed_folder_path = os.path.join(DATA_PROCESSED_ROOT, DATASET_NAME)
    interim_folder_path = os.path.join(DATA_INTERIM_ROOT, DATASET_NAME)

    # If processed data already exists, delete it and start fresh
    if os.path.exists(processed_folder_path):
        shutil.rmtree(processed_folder_path)
    os.makedirs(processed_folder_path)

    for event_name in DATASET_EVENTS:
        event_folder_path = os.path.join(interim_folder_path, event_name)
        list_tweet_ids = [
            name for name in os.listdir(event_folder_path)
            if os.path.isfile(os.path.join(event_folder_path, name))
        ]

        processed_event_folder_path = os.path.join(processed_folder_path, event_name)
        os.makedirs(processed_event_folder_path)

        train_processed_file = open(
            os.path.join(processed_event_folder_path, 'train.txt'), "w")
        train_processed_label_file = open(
            os.path.join(processed_event_folder_path, 'train_label.txt'), "w")

        tweet_count = len(list_tweet_ids)
        for index, id in enumerate(list_tweet_ids):
            print(event_name, '+', index)
            source_tweet = json_from_file(os.path.join(event_folder_path, id))

            # One tab-separated feature row per tweet
            features = collect_feature(source_tweet)
            features_str = "\t".join([str(i) for i in features])
            train_processed_file.write(features_str)
            if index != tweet_count - 1:
                train_processed_file.write('\n')

            # One veracity label per line, aligned with the feature rows
            train_processed_label_file.write(
                str(VERACITY_LABELS_MAPPING[source_tweet['veracity']]))
            if index != tweet_count - 1:
                train_processed_label_file.write('\n')

        train_processed_file.close()
        train_processed_label_file.close()
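# Minimal invocation sketch (not part of the original module): running the
# preparation step directly as a script with basic console logging. The
# __main__ guard below is hypothetical and only illustrates how the function
# above could be called.
if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    prepare_processed_training_data()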
import json
import logging

from kafka import KafkaConsumer
import pymongo
from bson import ObjectId
from email.mime.text import MIMEText
from email.header import Header
from subprocess import Popen, PIPE

from utils import json_from_file

COMMASPACE = ', '

config_file_name = 'config.json'
config = {}
try:
    config = json_from_file(config_file_name, "Can't open ss-config file.")
except RuntimeError as e:
    print(e)
    exit()

formatter = logging.Formatter(config['logging.format'])

# Create handlers
c_handler = logging.StreamHandler()
f_handler = logging.FileHandler(config['logging.file'])

# Add the formatter to both handlers
c_handler.setFormatter(formatter)
f_handler.setFormatter(formatter)

# Default to level 20 (INFO) when no level is configured
logging_level = config['logging.level'] if 'logging.level' in config else 20
print("Selecting logging level", logging_level)
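# Hedged continuation sketch (this part is not shown in the excerpt above):
# the two handlers and the selected level would typically be attached to a
# logger before the consumer loop starts. The logger name is illustrative.
logger = logging.getLogger('consumer')
logger.setLevel(logging_level)
logger.addHandler(c_handler)
logger.addHandler(f_handler)
logger.info('Logging configured from %s', config_file_name)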
import json
import logging
import os
import shutil


def prepare_test_data():
    """ Runs data processing scripts to turn testing raw data from (../raw)
    into interim data to be analyzed (saved in ../interim). """
    # DATA_RAW_ROOT, DATA_INTERIM_ROOT, TESTSET_NAME, VERACITY_LABEL_TEST_FILE,
    # STANCE_LABEL_TEST_FILE and json_from_file are assumed to be defined or
    # imported elsewhere in this module.
    logger = logging.getLogger(__name__)
    logger.info('Making interim test data set from raw data')

    # Init absolute paths of folders
    raw_input_folder_path = os.path.join(DATA_RAW_ROOT, TESTSET_NAME)
    raw_output_folder_path = os.path.join(DATA_RAW_ROOT, TESTSET_NAME)
    interim_folder_path = os.path.join(DATA_INTERIM_ROOT, TESTSET_NAME)

    # Read the veracity labels for the test set
    veracity_labels = json_from_file(
        os.path.join(raw_output_folder_path, VERACITY_LABEL_TEST_FILE[0]))
    # Read the stance labels for the test set
    stance_labels = json_from_file(
        os.path.join(raw_output_folder_path, STANCE_LABEL_TEST_FILE[0]))

    # If interim data already exists, delete it and create a fresh folder
    if os.path.exists(interim_folder_path):
        shutil.rmtree(interim_folder_path)
    os.makedirs(interim_folder_path)

    list_tweet_ids = [name for name in os.listdir(raw_input_folder_path)
                      if os.path.isdir(os.path.join(raw_input_folder_path, name))]

    for index, id in enumerate(list_tweet_ids):
        # Thread conversation folder in raw
        source_tweet_folder_path = os.path.join(raw_input_folder_path, id)

        # Read the source tweet
        source_tweet_file = open(
            os.path.join(source_tweet_folder_path, 'source-tweet', id + '.json'), 'r')
        source_tweet_content = source_tweet_file.read()
        source_tweet_file.close()
        source_tweet = json.loads(source_tweet_content)

        source_tweet_replies = []

        # Read the replies, attaching each one's stance label
        replies_folder_path = os.path.join(source_tweet_folder_path, 'replies')
        list_reply_ids = [name for name in os.listdir(replies_folder_path)
                          if os.path.isfile(os.path.join(replies_folder_path, name))]
        for reply_id in list_reply_ids:
            reply_file = open(os.path.join(replies_folder_path, reply_id), "r")
            reply_content = reply_file.read()
            reply_file.close()
            reply = json.loads(reply_content)
            reply['stance'] = stance_labels[reply['id_str']]
            source_tweet_replies.append(reply)
        source_tweet['replies'] = source_tweet_replies

        # Read the conversation structure
        structure_file = open(
            os.path.join(source_tweet_folder_path, 'structure.json'), "r")
        structure_content = structure_file.read()
        structure_file.close()
        structure = json.loads(structure_content)
        source_tweet['structure'] = structure

        source_tweet['veracity'] = veracity_labels[source_tweet['id_str']]
        source_tweet['stance'] = stance_labels[source_tweet['id_str']]

        # Create tweet file in interim to write
        interim_tweet_file = open(
            os.path.join(interim_folder_path, str(index) + '.json'), "w")
        # Write tweet to interim
        interim_tweet_file.write(json.dumps(source_tweet, indent=4))
        interim_tweet_file.close()
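# Hedged usage sketch (hypothetical helper, not in the original module):
# after prepare_test_data() runs, each interim file is the raw source tweet
# augmented with 'replies' (each reply carrying its 'stance'), 'structure',
# 'veracity' and 'stance'. This helper just reads one back to confirm.
def peek_interim_tweet(path):
    with open(path, 'r') as interim_file:
        tweet = json.load(interim_file)
    print(tweet['veracity'], tweet['stance'], len(tweet['replies']))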
import os
from distutils.util import strtobool

import jq

from utils import (BASE, create_comment, delete_comments, json_from_file,
                   request, validate_file)

# Action inputs and GitHub-provided environment variables
json_schema = os.getenv('INPUT_JSON_SCHEMA')
json_path_pattern = os.getenv('INPUT_JSON_PATH_PATTERN')
send_comment = strtobool(os.getenv('INPUT_SEND_COMMENT'))
clear_comments = strtobool(os.getenv('INPUT_CLEAR_COMMENTS'))
event_path = os.getenv('GITHUB_EVENT_PATH')
repo = os.getenv('GITHUB_REPOSITORY')

PR_FILES = BASE + '/repos/{repo}/pulls/{pull_number}/files'

# The webhook event payload carries the pull request number
event = json_from_file(event_path)
pull_number = jq.compile('.pull_request.number').input(event).first()

errors = []

# Validate every file changed in the pull request against the schema
pr_files_url = PR_FILES.format(repo=repo, pull_number=pull_number)
pr_files = request('get', pr_files_url)
for pr_file in pr_files:
    filename = pr_file['filename']
    validation_errors = validate_file(json_schema, json_path_pattern, filename)
    if len(validation_errors):
        errors.append({'path': filename, 'errors': validation_errors})

if clear_comments:
    delete_comments(repo, pull_number)
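# Hedged follow-up sketch (the original excerpt ends after clearing comments):
# if commenting is enabled and validation errors were collected, they could be
# flattened into a single report string. Only plain string formatting is used
# here; no additional project API is assumed.
if send_comment and errors:
    report_lines = []
    for item in errors:
        report_lines.append('`{path}`:'.format(path=item['path']))
        report_lines.extend('- {err}'.format(err=err) for err in item['errors'])
    print('\n'.join(report_lines))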