def extract_thread_features_incl_response(conversation):
    # Features for the source post itself
    source_features = extract_thread_features(conversation)
    source_features['issource'] = 1
    source_features['Word2VecSimilarityWrtSource'] = 0
    source_features['Word2VecSimilarityWrtPrev'] = 0
    srctokens = nltk.word_tokenize(
        re.sub(r'([^\s\w]|_)+', '', conversation['source']['text'].lower()))
    fullthread_featdict = {}
    fullthread_featdict[conversation['source']['id_str']] = source_features

    for tw in conversation['replies']:
        feature_dict = {}
        feature_dict['issource'] = 0
        tokens = nltk.word_tokenize(
            re.sub(r'([^\s\w]|_)+', '', tw['text'].lower()))
        # Concatenate the rest of the thread (source plus all replies)
        otherthreadtweets = ''
        otherthreadtweets += conversation['source']['text']
        for response in conversation['replies']:
            otherthreadtweets += ' ' + response['text']
        otherthreadtokens = nltk.word_tokenize(
            re.sub(r'([^\s\w]|_)+', '', otherthreadtweets.lower()))
        # Locate the preceding post of this reply. Note the unconditional
        # break: only the first branch is ever examined.
        branches = tree2branches(conversation['structure'])
        for branch in branches:
            if tw['id_str'] in branch:
                if branch.index(tw['id_str']) - 1 == 0:
                    prevtokens = srctokens
                else:
                    prev_id = branch[branch.index(tw['id_str']) - 1]
                    # Find conversation text for the id
                    for ptw in conversation['replies']:
                        if ptw['id_str'] == prev_id:
                            prevtokens = nltk.word_tokenize(
                                re.sub(r'([^\s\w]|_)+', '',
                                       ptw['text'].lower()))
                            break
            else:
                prevtokens = []
            break
        raw_txt = tw['text']
        # Punctuation and surface indicators
        feature_dict['hasqmark'] = 0
        if tw['text'].find('?') >= 0:
            feature_dict['hasqmark'] = 1
        feature_dict['hasemark'] = 0
        if tw['text'].find('!') >= 0:
            feature_dict['hasemark'] = 1
        feature_dict['hasperiod'] = 0
        if tw['text'].find('.') >= 0:
            feature_dict['hasperiod'] = 1
        feature_dict['hashashtag'] = 0
        if tw['text'].find('#') >= 0:
            feature_dict['hashashtag'] = 1
        feature_dict['hasurl'] = 0
        if tw['text'].find('urlurlurl') >= 0 or tw['text'].find('http') >= 0:
            feature_dict['hasurl'] = 1
        feature_dict['haspic'] = 0
        if (tw['text'].find('picpicpic') >= 0) or (
                tw['text'].find('pic.twitter.com') >= 0) or (
                tw['text'].find('instagr.am') >= 0):
            feature_dict['haspic'] = 1
        # Despite the name, 'hasnegation' counts negation words
        feature_dict['hasnegation'] = 0
        negationwords = [
            'not', 'no', 'nobody', 'nothing', 'none', 'never', 'neither',
            'nor', 'nowhere', 'hardly', 'scarcely', 'barely', 'don', 'isn',
            'wasn', 'shouldn', 'wouldn', 'couldn', 'doesn'
        ]
        for negationword in negationwords:
            if negationword in tokens:
                feature_dict['hasnegation'] += 1
        feature_dict['charcount'] = len(tw['text'])
        feature_dict['wordcount'] = len(nltk.word_tokenize(
            re.sub(r'([^\s\w]|_)+', '', tw['text'].lower())))
        # NOTE: the swearword list is re-read from disk for every reply;
        # loading it once outside the loop would be cheaper.
        swearwords = []
        with open('data_preprocessing/data/badwords.txt', 'r') as f:
            for line in f:
                swearwords.append(line.strip().lower())
        feature_dict['hasswearwords'] = 0
        for token in tokens:
            if token in swearwords:
                feature_dict['hasswearwords'] += 1
        uppers = [ch for ch in raw_txt if ch.isupper()]
        if len(raw_txt) != 0:
            feature_dict['capitalratio'] = float(len(uppers)) / len(raw_txt)
        else:
            feature_dict['capitalratio'] = 0
        # Word2vec features are only computed when the GoogleNews model has
        # been loaded
        if model_GN is not None:
            feature_dict['Word2VecSimilarityWrtOther'] = \
                getW2vCosineSimilarity(tokens, otherthreadtokens)
            feature_dict['Word2VecSimilarityWrtSource'] = \
                getW2vCosineSimilarity(tokens, srctokens)
            feature_dict['Word2VecSimilarityWrtPrev'] = \
                getW2vCosineSimilarity(tokens, prevtokens)
            feature_dict['avgw2v'] = sumw2v(tw, avg=True)
        # Added textual features
        feature_dict['raw_text'] = raw_txt
        feature_dict['spacy_processed_text'], \
            feature_dict['spacy_processed_BLvec'], \
            feature_dict['spacy_processed_POSvec'], \
            feature_dict['spacy_processed_DEPvec'], \
            feature_dict['spacy_processed_NERvec'] = preprocess_text(
                raw_txt, initopts())
        feature_dict['src_usr_hasurl'] = 0
        # Binary indicator vector over Penn Treebank POS tags
        postag_tuples = nltk.pos_tag(tokens)
        postag_list = [x[1] for x in postag_tuples]
        possible_postags = [
            'WRB', 'WP$', 'WP', 'WDT', 'VBZ', 'VBP', 'VBN', 'VBG', 'VBD',
            'VB', 'UH', 'TO', 'SYM', 'RP', 'RBS', 'RBR', 'RB', 'PRP$',
            'PRP', 'POS', 'PDT', 'NNS', 'NNPS', 'NNP', 'NN', 'MD', 'LS',
            'JJS', 'JJR', 'JJ', 'IN', 'FW', 'EX', 'DT', 'CD', 'CC', '$'
        ]
        postag_binary = np.zeros(len(possible_postags))
        for tok in postag_list:
            if tok in possible_postags:
                postag_binary[possible_postags.index(tok)] = 1
        feature_dict['pos'] = postag_binary
        # Lexicon counts: synonyms and antonyms of 'false'. The multiword
        # entries ('off the mark', 'trumped up') can never match a single
        # token.
        false_synonyms = [
            'false', 'bogus', 'deceitful', 'dishonest', 'distorted',
            'erroneous', 'fake', 'fanciful', 'faulty', 'fictitious',
            'fraudulent', 'improper', 'inaccurate', 'incorrect', 'invalid',
            'misleading', 'mistaken', 'phony', 'specious', 'spurious',
            'unfounded', 'unreal', 'untrue', 'untruthful', 'apocryphal',
            'beguiling', 'casuistic', 'concocted', 'cooked-up',
            'counterfactual', 'deceiving', 'delusive', 'ersatz',
            'fallacious', 'fishy', 'illusive', 'imaginary', 'inexact',
            'lying', 'mendacious', 'misrepresentative', 'off the mark',
            'sham', 'sophistical', 'trumped up', 'unsound'
        ]
        false_antonyms = [
            'accurate', 'authentic', 'correct', 'fair', 'faithful', 'frank',
            'genuine', 'honest', 'moral', 'open', 'proven', 'real', 'right',
            'sincere', 'sound', 'true', 'trustworthy', 'truthful', 'valid',
            'actual', 'factual', 'just', 'known', 'precise', 'reliable',
            'straight', 'substantiated'
        ]
        feature_dict['src_num_false_synonyms'] = 0
        for token in tokens:
            if token in false_synonyms:
                feature_dict['src_num_false_synonyms'] += 1
        feature_dict['src_num_false_antonyms'] = 0
        for token in tokens:
            if token in false_antonyms:
                feature_dict['src_num_false_antonyms'] += 1
        feature_dict['thread_num_false_synonyms'] = 0
        for token in otherthreadtokens:
            if token in false_synonyms:
                feature_dict['thread_num_false_synonyms'] += 1
        feature_dict['thread_num_false_antonyms'] = 0
        for token in otherthreadtokens:
            if token in false_antonyms:
                feature_dict['thread_num_false_antonyms'] += 1
        # Rumour-vocabulary indicators
        feature_dict['src_unconfirmed'] = 0
        feature_dict['src_rumour'] = 0
        feature_dict['thread_unconfirmed'] = 0
        feature_dict['thread_rumour'] = 0
        if 'unconfirmed' in tokens:
            feature_dict['src_unconfirmed'] = 1
        if 'unconfirmed' in otherthreadtokens:
            feature_dict['thread_unconfirmed'] = 1
        if 'rumour' in tokens or 'gossip' in tokens or 'hoax' in tokens:
            feature_dict['src_rumour'] = 1
        if ('rumour' in otherthreadtokens) or (
                'gossip' in otherthreadtokens) or (
                'hoax' in otherthreadtokens):
            feature_dict['thread_rumour'] = 1
        # Question-word counts
        whwords = [
            'what', 'when', 'where', 'which', 'who', 'whom', 'whose', 'why',
            'how'
        ]
        feature_dict['src_num_wh'] = 0
        for token in tokens:
            if token in whwords:
                feature_dict['src_num_wh'] += 1
        feature_dict['thread_num_wh'] = 0
        for token in otherthreadtokens:
            if token in whwords:
                feature_dict['thread_num_wh'] += 1
        # Speech-act verb lexicons, one count feature per act
        SpeechAct = {}
        SpeechAct['SpeechAct_ORDER'] = [
            'command', 'demand', 'tell', 'direct', 'instruct', 'require',
            'prescribe', 'order'
        ]
        SpeechAct['SpeechAct_ASK1'] = [
            'ask', 'request', 'beg', 'beseech', 'implore', 'appeal',
            'plead', 'intercede', 'apply', 'urge', 'persuade', 'dissuade',
            'convince'
        ]
        SpeechAct['SpeechAct_ASK2'] = [
            'ask', 'inquire', 'enquire', 'interrogate', 'question', 'query'
        ]
        SpeechAct['SpeechAct_CALL'] = [
            'call', 'summon', 'invite', 'call on', 'call for', 'order',
            'book', 'reserve'
        ]
        SpeechAct['SpeechAct_FORBID'] = [
            'forbid', 'prohibit', 'veto', 'refuse', 'decline', 'reject',
            'rebuff', 'renounce', 'cancel', 'resign', 'dismiss'
        ]
        SpeechAct['SpeechAct_PERMIT'] = [
            'permit', 'allow', 'consent', 'accept', 'agree', 'approve',
            'disapprove', 'authorize', 'appoint'
        ]
        SpeechAct['SpeechAct_ARGUE'] = [
            'argue', 'disagree', 'refute', 'contradict', 'counter', 'deny',
            'recant', 'retort', 'quarrel'
        ]
        SpeechAct['SpeechAct_REPRIMAND'] = [
            'reprimand', 'rebuke', 'reprove', 'admonish', 'reproach', 'nag',
            'scold', 'abuse', 'insult'
        ]
        SpeechAct['SpeechAct_MOCK'] = ['ridicule', 'joke']
        SpeechAct['SpeechAct_BLAME'] = [
            'blame', 'criticize', 'condemn', 'denounce', 'deplore', 'curse'
        ]
        SpeechAct['SpeechAct_ACCUSE'] = [
            'accuse', 'charge', 'challenge', 'defy', 'dare'
        ]
        SpeechAct['SpeechAct_ATTACK'] = ['attack', 'defend']
        SpeechAct['SpeechAct_WARN'] = ['warn', 'threaten', 'blackmail']
        SpeechAct['SpeechAct_ADVISE'] = [
            'advise', 'counsel', 'consult', 'recommend', 'suggest',
            'propose', 'advocate'
        ]
        SpeechAct['SpeechAct_OFFER'] = ['offer', 'volunteer', 'grant', 'give']
        SpeechAct['SpeechAct_PRAISE'] = [
            'praise', 'commend', 'compliment', 'boast', 'credit'
        ]
        SpeechAct['SpeechAct_PROMISE'] = [
            'promise', 'pledge', 'vow', 'swear', 'vouch for', 'guarantee'
        ]
        SpeechAct['SpeechAct_THANK'] = [
            'thank', 'apologise', 'greet', 'welcome', 'farewell', 'goodbye',
            'introduce', 'bless', 'wish', 'congratulate'
        ]
        SpeechAct['SpeechAct_FORGIVE'] = [
            'forgive', 'excuse', 'justify', 'absolve', 'pardon', 'convict',
            'acquit', 'sentence'
        ]
        SpeechAct['SpeechAct_COMPLAIN'] = [
            'complain', 'protest', 'object', 'moan', 'bemoan', 'lament',
            'bewail'
        ]
        SpeechAct['SpeechAct_EXCLAIM'] = [
            'exclaim', 'enthuse', 'exult', 'swear', 'blaspheme'
        ]
        SpeechAct['SpeechAct_GUESS'] = [
            'guess', 'bet', 'presume', 'suspect', 'suppose', 'wonder',
            'speculate', 'conjecture', 'predict', 'forecast', 'prophesy'
        ]
        SpeechAct['SpeechAct_HINT'] = ['hint', 'imply', 'insinuate']
        SpeechAct['SpeechAct_CONCLUDE'] = [
            'conclude', 'deduce', 'infer', 'gather', 'reckon', 'estimate',
            'calculate', 'count', 'prove', 'compare'
        ]
        SpeechAct['SpeechAct_TELL'] = [
            'tell', 'report', 'narrate', 'relate', 'recount', 'describe',
            'explain', 'lecture'
        ]
        SpeechAct['SpeechAct_INFORM'] = [
            'inform', 'notify', 'announce', 'inform on', 'reveal'
        ]
        SpeechAct['SpeechAct_SUMUP'] = ['sum up', 'summarize', 'recapitulate']
        SpeechAct['SpeechAct_ADMIT'] = [
            'admit', 'acknowledge', 'concede', 'confess', 'confide'
        ]
        SpeechAct['SpeechAct_ASSERT'] = [
            'assert', 'affirm', 'claim', 'maintain', 'contend', 'state',
            'testify'
        ]
        SpeechAct['SpeechAct_CONFIRM'] = ['confirm', 'assure', 'reassure']
        SpeechAct['SpeechAct_STRESS'] = [
            'stress', 'emphasize', 'insist', 'repeat', 'point out', 'note',
            'remind', 'add'
        ]
        SpeechAct['SpeechAct_DECLARE'] = [
            'declare', 'pronounce', 'proclaim', 'decree', 'profess', 'vote',
            'resolve', 'decide'
        ]
        SpeechAct['SpeechAct_BAPTIZE'] = [
            'baptize', 'christen', 'name', 'excommunicate'
        ]
        SpeechAct['SpeechAct_REMARK'] = ['remark', 'comment', 'observe']
        SpeechAct['SpeechAct_ANSWER'] = ['answer', 'reply']
        SpeechAct['SpeechAct_DISCUSS'] = [
            'discuss', 'debate', 'negotiate', 'bargain'
        ]
        SpeechAct['SpeechAct_TALK'] = ['talk', 'converse', 'chat', 'gossip']
        # Substring matching, so e.g. 'ask' also fires on 'task'
        for k in SpeechAct.keys():
            feature_dict[k] = 0
            for verb in SpeechAct[k]:
                if verb in tw['text'].lower():
                    feature_dict[k] += 1
        fullthread_featdict[tw['id_str']] = feature_dict
    return fullthread_featdict
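
# Hedged usage sketch (not part of the original pipeline): illustrates the
# `conversation` shape the extractor above expects - a source post, a list of
# replies and a reply-tree 'structure'. The ids and texts are hypothetical,
# and running it assumes the module globals (nltk data, model_GN,
# tree2branches, preprocess_text, the badwords file) are available, hence it
# is left commented out.
#
# demo_conversation = {
#     'source': {'id_str': 'src1', 'text': 'Is this rumour true?'},
#     'replies': [{'id_str': 'r1', 'text': 'Unconfirmed, sounds like a hoax!'}],
#     'structure': {'src1': {'r1': []}},
# }
# feats = extract_thread_features_incl_response(demo_conversation)
# feats['r1']['hasqmark']    # -> 0 (no '?' in the reply)
# feats['r1']['src_rumour']  # -> 1 ('hoax' is in the rumour vocabulary)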
def load_test_data_reddit(
        path="/home/ifajcik/Work/NLP/semeval_2019/7_Rumour_Eval/rumoureval-2019-test-data/reddit-test-data"):
    conversation_ids = listdir_nohidden(path)
    conversations = {'dev': [], 'train': [], 'test': []}
    for id in conversation_ids:
        conversation = {}
        conversation['id'] = id
        # source post
        path_src = path + '/' + id + '/source-tweet'
        files_t = sorted(listdir_nohidden(path_src))
        with open(os.path.join(path_src, files_t[0])) as f:
            for line in f:
                src = json.loads(line)
                src['text'] = src['data']['children'][0]['data']['title']
                src['user'] = src['data']['children'][0]['data']['author']
                if files_t[0].endswith('.json'):
                    # the id is the file name without the .json suffix
                    src['id_str'] = files_t[0][:-5]
                else:
                    print("Unexpected source file (expected .json):",
                          files_t[0])
                src['used'] = 0
                src['setA'] = 'test'
                src['setB'] = 'test'
                conversation['source'] = src
        # replies
        tweets = []
        path_repl = path + '/' + id + '/replies'
        files_t = sorted(listdir_nohidden(path_repl))
        for repl_file in files_t:
            with open(os.path.join(path_repl, repl_file)) as f:
                for line in f:
                    tw = json.loads(line)
                    if 'body' in tw['data']:
                        tw['text'] = tw['data']['body']
                        tw['user'] = tw['data']['author']
                    else:
                        # deleted/removed posts carry no body
                        tw['text'] = ''
                        tw['user'] = ''
                    tw['used'] = 0
                    if repl_file.endswith('.json'):
                        tw['id_str'] = repl_file[:-5]
                    else:
                        print("Unexpected reply file (expected .json):",
                              repl_file)
                    tw['setA'] = 'test'
                    tweets.append(tw)
        conversation['replies'] = tweets
        # reply tree
        path_struct = path + '/' + id + '/structure.json'
        with open(path_struct, 'r') as f:
            struct = json.load(f)
        conversation['structure'] = struct
        conversation['branches'] = tree2branches(conversation['structure'])
        conversations['test'].append(conversation)
    return conversations
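
# The loader above assumes the standard RumourEval 2019 reddit layout, one
# directory per conversation:
#
#   <path>/<conversation_id>/source-tweet/<id>.json
#   <path>/<conversation_id>/replies/<reply_id>.json
#   <path>/<conversation_id>/structure.json
#
# Minimal consumption sketch (the default path is machine-specific):
#
# test_conversations = load_test_data_reddit()
# for conv in test_conversations['test']:
#     print(conv['id'], len(conv['replies']), len(conv['branches']))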
def load_test_data_twitter(set_path=PATH_TO_TEST_TWITTER):
    allconv = []
    train_dev_split = {'dev': [], 'train': [], 'test': []}
    tweet_data = sorted(os.listdir(set_path))
    tweet_data = [i for i in tweet_data if i[0] != '.']
    # conversation ids; source post id == conversation id
    conversation = {}
    # build conversations for every tweet topic
    for tweet_topic in tweet_data:
        path = os.path.join(set_path, tweet_topic)
        tweet_topic_data = sorted(os.listdir(path))
        tweet_topic_data = [i for i in tweet_topic_data if i[0] != '.']
        for foldr in tweet_topic_data:
            flag = 'test'
            conversation['id'] = foldr
            # replies (a thread may have none)
            tweets = []
            path_repl = path + '/' + foldr + '/replies'
            files_t = sorted(os.listdir(path_repl))
            files_t = [i for i in files_t if i[0] != '.']
            for repl_file in files_t:
                with open(os.path.join(path_repl, repl_file)) as f:
                    for line in f:
                        tw = json.loads(line)
                        tw['used'] = 0
                        tw['set'] = flag
                        tweets.append(tw)
                        if tw['text'] is None:
                            print("Tweet has no text", tw['id'])
            # always set replies, even when the thread is source-only
            conversation['replies'] = tweets
            # source tweet
            path_src = path + '/' + foldr + '/source-tweet'
            files_t = sorted(os.listdir(path_src))
            with open(os.path.join(path_src, files_t[0])) as f:
                for line in f:
                    src = json.loads(line)
                    src['used'] = 0
                    src['set'] = flag
                    conversation['source'] = src
                    if src['text'] is None:
                        print("Tweet has no text", src['id'])
            # reply tree; keep only the subtree rooted at the source tweet id
            path_struct = path + '/' + foldr + '/structure.json'
            with open(path_struct) as f:
                for line in f:
                    struct = json.loads(line)
            if len(struct) > 1:
                struct = {foldr: struct[foldr]}
            conversation['structure'] = struct
            conversation['branches'] = tree2branches(conversation['structure'])
            train_dev_split[flag].append(conversation.copy())
            allconv.append(conversation.copy())
            if not tweets:
                # log source-only conversations
                print(foldr)
    return train_dev_split
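
# Hedged sketch of the pruning and flattening done above: when structure.json
# contains more than one root, only the subtree keyed by the source tweet id
# is kept, and tree2branches() (defined elsewhere in this repo) is then
# expected to flatten that tree into root-to-leaf id lists, e.g.
#
# struct = {'10': {'11': {'13': []}, '12': []}}
# tree2branches(struct)  # expected: [['10', '11', '13'], ['10', '12']]
#
# (illustrative ids; the exact return shape is defined by tree2branches)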
def load_data():
    # reddit part of the mixed twitter + reddit training data
    path_dev = os.path.join(TRAIN_DATA_PREFIX, "dev-key.json")
    with open(path_dev, 'r') as f:
        dev_key = json.load(f)
    path_train = os.path.join(TRAIN_DATA_PREFIX, "train-key.json")
    with open(path_train, 'r') as f:
        train_key = json.load(f)

    def assign_taskA(post, kind):
        # attach the task A stance label and split to a source post or reply
        if post['id_str'] in dev_key['subtaskaenglish']:
            post['setA'] = 'dev'
            post['label'] = dev_key['subtaskaenglish'][post['id_str']]
        elif post['id_str'] in train_key['subtaskaenglish']:
            post['setA'] = 'train'
            post['label'] = train_key['subtaskaenglish'][post['id_str']]
        else:
            print("Post was not found! Task A,", kind, "ID:", post['id_str'])

    conversations = {'dev': [], 'train': [], 'test': []}
    # the training and dev reddit dumps share the same layout, so one loop
    # handles both
    for subdir, split in [("reddit-training-data", 'train'),
                          ("reddit-dev-data", 'dev')]:
        path = os.path.join(TRAIN_DATA_PREFIX, subdir)
        conversation_ids = listdir_nohidden(path)
        for id in conversation_ids:
            conversation = {}
            conversation['id'] = id
            # source post
            path_src = path + '/' + id + '/source-tweet'
            files_t = sorted(listdir_nohidden(path_src))
            with open(os.path.join(path_src, files_t[0])) as f:
                for line in f:
                    src = json.loads(line)
                    src['text'] = src['data']['children'][0]['data']['title']
                    src['user'] = src['data']['children'][0]['data']['author']
                    if files_t[0].endswith('.json'):
                        src['id_str'] = files_t[0][:-5]
                    else:
                        print("Unexpected source file (expected .json):",
                              files_t[0])
                    src['used'] = 0
                    assign_taskA(src, 'Post')
                    # the task B veracity label lives on the conversation
                    if src['id_str'] in dev_key['subtaskbenglish']:
                        src['setB'] = 'dev'
                        conversation['veracity'] = \
                            dev_key['subtaskbenglish'][src['id_str']]
                    elif src['id_str'] in train_key['subtaskbenglish']:
                        src['setB'] = 'train'
                        conversation['veracity'] = \
                            train_key['subtaskbenglish'][src['id_str']]
                    else:
                        print("Post was not found! Task B, Post ID:",
                              src['id_str'])
                    conversation['source'] = src
            # replies
            tweets = []
            path_repl = path + '/' + id + '/replies'
            files_t = sorted(listdir_nohidden(path_repl))
            for repl_file in files_t:
                with open(os.path.join(path_repl, repl_file)) as f:
                    for line in f:
                        tw = json.loads(line)
                        if 'body' in tw['data']:
                            tw['text'] = tw['data']['body']
                            tw['user'] = tw['data']['author']
                        else:
                            # deleted/removed posts carry no body
                            tw['text'] = ''
                            tw['user'] = ''
                        if repl_file.endswith('.json'):
                            tw['id_str'] = repl_file[:-5]
                        else:
                            print("Unexpected reply file (expected .json):",
                                  repl_file)
                        tw['used'] = 0
                        assign_taskA(tw, 'Reply')
                        tweets.append(tw)
            conversation['replies'] = tweets
            # reply tree
            path_struct = path + '/' + id + '/structure.json'
            with open(path_struct, 'r') as f:
                struct = json.load(f)
            conversation['structure'] = struct
            conversation['branches'] = tree2branches(conversation['structure'])
            conversations[split].append(conversation)
    return conversations
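
# Hedged end-to-end sketch: load_data() returns the reddit conversations
# keyed by split, and each conversation can be fed directly to the feature
# extractor defined above. Assumes dev-key.json, train-key.json and the
# reddit dumps exist under TRAIN_DATA_PREFIX.
#
# reddit = load_data()
# for conv in reddit['train'][:3]:
#     feats = extract_thread_features_incl_response(conv)
#     print(conv['id'], len(feats), 'posts featurised')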
def load_dataset():
    # Load labels and split for task A and task B
    tweet_label_dict, veracity_label_dict = load_true_labels()
    dev = tweet_label_dict['dev']
    train = tweet_label_dict['train']
    dev_tweets = dev.keys()
    train_tweets = train.keys()
    # Load folds and conversations
    path_to_folds = os.path.join(TRAIN_DATA_PREFIX, 'twitter-english')
    folds = sorted(os.listdir(path_to_folds))
    folds = [i for i in folds if i[0] != '.']
    # cvfolds collects per-fold conversations but is not returned
    cvfolds = {}
    allconv = []
    train_dev_split = {'dev': [], 'train': [], 'test': []}
    # iterate over all tweet groups (referred to as 'folds') - charliehebdo
    # etc.
    for nfold, fold in enumerate(folds):
        path_to_tweets = os.path.join(path_to_folds, fold)
        tweet_data = sorted(os.listdir(path_to_tweets))
        tweet_data = [i for i in tweet_data if i[0] != '.']
        # conversation ids; source post id == conversation id
        conversation = {}
        # build conversations for the tweet group
        for foldr in tweet_data:
            flag = 0
            conversation['id'] = foldr
            tweets = []
            path_repl = path_to_tweets + '/' + foldr + '/replies'
            files_t = sorted(os.listdir(path_repl))
            files_t = [i for i in files_t if i[0] != '.']
            if files_t != []:
                # iterate over json reply files; the replies determine
                # which split the whole tree belongs to
                for repl_file in files_t:
                    with open(os.path.join(path_repl, repl_file)) as f:
                        for line in f:
                            tw = json.loads(line)
                            tw['used'] = 0
                            replyid = tw['id_str']
                            if replyid in dev_tweets:
                                tw['set'] = 'dev'
                                tw['label'] = dev[replyid]
                                if flag == 'train':
                                    print("The tree is split between sets",
                                          foldr)
                                flag = 'dev'
                            elif replyid in train_tweets:
                                tw['set'] = 'train'
                                tw['label'] = train[replyid]
                                if flag == 'dev':
                                    print("The tree is split between sets",
                                          foldr)
                                flag = 'train'
                            else:
                                print("Tweet was not found! ID:", foldr)
                            tweets.append(tw)
                            if tw['text'] is None:
                                print("Tweet has no text", tw['id'])
            else:
                # source-only conversation: default it to the training split
                # and log it
                flag = 'train'
                print(foldr)
            # always set replies, even when the thread is source-only
            conversation['replies'] = tweets
            # source tweet
            path_src = path_to_tweets + '/' + foldr + '/source-tweet'
            files_t = sorted(os.listdir(path_src))
            with open(os.path.join(path_src, files_t[0])) as f:
                for line in f:
                    src = json.loads(line)
                    src['used'] = 0
                    srcid = src['id_str']
                    src['set'] = flag
                    src['label'] = tweet_label_dict[flag][srcid]
                    conversation['source'] = src
                    conversation['veracity'] = veracity_label_dict[flag][srcid]
                    if src['text'] is None:
                        print("Tweet has no text", src['id'])
            # reply tree
            path_struct = path_to_tweets + '/' + foldr + '/structure.json'
            with open(path_struct) as f:
                for line in f:
                    struct = json.loads(line)
            if len(struct) > 1:
                if foldr == '553480082996879360':
                    # this conversation's structure file has extra roots whose
                    # subtrees actually belong under the source tweet
                    new_struct = {}
                    new_struct[foldr] = struct[foldr]
                    new_struct[foldr]['553495625527209985'] = \
                        struct['553485679129534464']['553495625527209985']
                    new_struct[foldr]['553495937432432640'] = \
                        struct['553490097623269376']['553495937432432640']
                    struct = new_struct
                else:
                    # keep only the subtree rooted at the source tweet id
                    struct = {foldr: struct[foldr]}
            conversation['structure'] = struct
            conversation['branches'] = tree2branches(conversation['structure'])
            train_dev_split[flag].append(conversation.copy())
            allconv.append(conversation.copy())
        cvfolds[fold] = allconv
        allconv = []
    return train_dev_split