def extract_thread_features_incl_response(conversation):
    # Features for the source post itself
    source_features = extract_thread_features(conversation)
    source_features['issource'] = 1
    source_features['Word2VecSimilarityWrtSource'] = 0
    source_features['Word2VecSimilarityWrtPrev'] = 0
    srctokens = nltk.word_tokenize(
        re.sub(r'([^\s\w]|_)+', '', conversation['source']['text'].lower()))
    fullthread_featdict = {}
    fullthread_featdict[conversation['source']['id_str']] = source_features

    for tw in conversation['replies']:
        feature_dict = {}
        feature_dict['issource'] = 0
        tokens = nltk.word_tokenize(
            re.sub(r'([^\s\w]|_)+', '', tw['text'].lower()))
        # Concatenate the rest of the thread (source plus all replies)
        otherthreadtweets = ''
        otherthreadtweets += conversation['source']['text']
        for response in conversation['replies']:
            otherthreadtweets += ' ' + response['text']
        otherthreadtokens = nltk.word_tokenize(
            re.sub(r'([^\s\w]|_)+', '', otherthreadtweets.lower()))
        # Locate the preceding post of this reply. Note the unconditional
        # break: only the first branch is ever examined.
        branches = tree2branches(conversation['structure'])
        for branch in branches:
            if tw['id_str'] in branch:
                if branch.index(tw['id_str']) - 1 == 0:
                    prevtokens = srctokens
                else:
                    prev_id = branch[branch.index(tw['id_str']) - 1]
                    # Find conversation text for the id
                    for ptw in conversation['replies']:
                        if ptw['id_str'] == prev_id:
                            prevtokens = nltk.word_tokenize(
                                re.sub(r'([^\s\w]|_)+', '',
                                       ptw['text'].lower()))
                            break
            else:
                prevtokens = []
            break
        raw_txt = tw['text']
        # Punctuation and surface indicators
        feature_dict['hasqmark'] = 0
        if tw['text'].find('?') >= 0:
            feature_dict['hasqmark'] = 1
        feature_dict['hasemark'] = 0
        if tw['text'].find('!') >= 0:
            feature_dict['hasemark'] = 1
        feature_dict['hasperiod'] = 0
        if tw['text'].find('.') >= 0:
            feature_dict['hasperiod'] = 1
        feature_dict['hashashtag'] = 0
        if tw['text'].find('#') >= 0:
            feature_dict['hashashtag'] = 1
        feature_dict['hasurl'] = 0
        if tw['text'].find('urlurlurl') >= 0 or tw['text'].find('http') >= 0:
            feature_dict['hasurl'] = 1
        feature_dict['haspic'] = 0
        if (tw['text'].find('picpicpic') >= 0) or (
                tw['text'].find('pic.twitter.com') >= 0) or (
                tw['text'].find('instagr.am') >= 0):
            feature_dict['haspic'] = 1
        # Despite the name, 'hasnegation' counts negation words
        feature_dict['hasnegation'] = 0
        negationwords = [
            'not', 'no', 'nobody', 'nothing', 'none', 'never', 'neither',
            'nor', 'nowhere', 'hardly', 'scarcely', 'barely', 'don', 'isn',
            'wasn', 'shouldn', 'wouldn', 'couldn', 'doesn'
        ]
        for negationword in negationwords:
            if negationword in tokens:
                feature_dict['hasnegation'] += 1
        feature_dict['charcount'] = len(tw['text'])
        feature_dict['wordcount'] = len(nltk.word_tokenize(
            re.sub(r'([^\s\w]|_)+', '', tw['text'].lower())))
        # NOTE: the swearword list is re-read from disk for every reply;
        # loading it once outside the loop would be cheaper.
        swearwords = []
        with open('data_preprocessing/data/badwords.txt', 'r') as f:
            for line in f:
                swearwords.append(line.strip().lower())
        feature_dict['hasswearwords'] = 0
        for token in tokens:
            if token in swearwords:
                feature_dict['hasswearwords'] += 1
        uppers = [ch for ch in raw_txt if ch.isupper()]
        if len(raw_txt) != 0:
            feature_dict['capitalratio'] = float(len(uppers)) / len(raw_txt)
        else:
            feature_dict['capitalratio'] = 0
        # Word2vec features are only computed when the GoogleNews model has
        # been loaded
        if model_GN is not None:
            feature_dict['Word2VecSimilarityWrtOther'] = \
                getW2vCosineSimilarity(tokens, otherthreadtokens)
            feature_dict['Word2VecSimilarityWrtSource'] = \
                getW2vCosineSimilarity(tokens, srctokens)
            feature_dict['Word2VecSimilarityWrtPrev'] = \
                getW2vCosineSimilarity(tokens, prevtokens)
            feature_dict['avgw2v'] = sumw2v(tw, avg=True)
        # Added textual features
        feature_dict['raw_text'] = raw_txt
        feature_dict['spacy_processed_text'], \
            feature_dict['spacy_processed_BLvec'], \
            feature_dict['spacy_processed_POSvec'], \
            feature_dict['spacy_processed_DEPvec'], \
            feature_dict['spacy_processed_NERvec'] = preprocess_text(
                raw_txt, initopts())
        feature_dict['src_usr_hasurl'] = 0
        # Binary indicator vector over Penn Treebank POS tags
        postag_tuples = nltk.pos_tag(tokens)
        postag_list = [x[1] for x in postag_tuples]
        possible_postags = [
            'WRB', 'WP$', 'WP', 'WDT', 'VBZ', 'VBP', 'VBN', 'VBG', 'VBD',
            'VB', 'UH', 'TO', 'SYM', 'RP', 'RBS', 'RBR', 'RB', 'PRP$',
            'PRP', 'POS', 'PDT', 'NNS', 'NNPS', 'NNP', 'NN', 'MD', 'LS',
            'JJS', 'JJR', 'JJ', 'IN', 'FW', 'EX', 'DT', 'CD', 'CC', '$'
        ]
        postag_binary = np.zeros(len(possible_postags))
        for tok in postag_list:
            if tok in possible_postags:
                postag_binary[possible_postags.index(tok)] = 1
        feature_dict['pos'] = postag_binary
        # Lexicon counts: synonyms and antonyms of 'false'. The multiword
        # entries ('off the mark', 'trumped up') can never match a single
        # token.
        false_synonyms = [
            'false', 'bogus', 'deceitful', 'dishonest', 'distorted',
            'erroneous', 'fake', 'fanciful', 'faulty', 'fictitious',
            'fraudulent', 'improper', 'inaccurate', 'incorrect', 'invalid',
            'misleading', 'mistaken', 'phony', 'specious', 'spurious',
            'unfounded', 'unreal', 'untrue', 'untruthful', 'apocryphal',
            'beguiling', 'casuistic', 'concocted', 'cooked-up',
            'counterfactual', 'deceiving', 'delusive', 'ersatz',
            'fallacious', 'fishy', 'illusive', 'imaginary', 'inexact',
            'lying', 'mendacious', 'misrepresentative', 'off the mark',
            'sham', 'sophistical', 'trumped up', 'unsound'
        ]
        false_antonyms = [
            'accurate', 'authentic', 'correct', 'fair', 'faithful', 'frank',
            'genuine', 'honest', 'moral', 'open', 'proven', 'real', 'right',
            'sincere', 'sound', 'true', 'trustworthy', 'truthful', 'valid',
            'actual', 'factual', 'just', 'known', 'precise', 'reliable',
            'straight', 'substantiated'
        ]
        feature_dict['src_num_false_synonyms'] = 0
        for token in tokens:
            if token in false_synonyms:
                feature_dict['src_num_false_synonyms'] += 1
        feature_dict['src_num_false_antonyms'] = 0
        for token in tokens:
            if token in false_antonyms:
                feature_dict['src_num_false_antonyms'] += 1
        feature_dict['thread_num_false_synonyms'] = 0
        for token in otherthreadtokens:
            if token in false_synonyms:
                feature_dict['thread_num_false_synonyms'] += 1
        feature_dict['thread_num_false_antonyms'] = 0
        for token in otherthreadtokens:
            if token in false_antonyms:
                feature_dict['thread_num_false_antonyms'] += 1
        # Rumour-vocabulary indicators
        feature_dict['src_unconfirmed'] = 0
        feature_dict['src_rumour'] = 0
        feature_dict['thread_unconfirmed'] = 0
        feature_dict['thread_rumour'] = 0
        if 'unconfirmed' in tokens:
            feature_dict['src_unconfirmed'] = 1
        if 'unconfirmed' in otherthreadtokens:
            feature_dict['thread_unconfirmed'] = 1
        if 'rumour' in tokens or 'gossip' in tokens or 'hoax' in tokens:
            feature_dict['src_rumour'] = 1
        if ('rumour' in otherthreadtokens) or (
                'gossip' in otherthreadtokens) or (
                'hoax' in otherthreadtokens):
            feature_dict['thread_rumour'] = 1
        # Question-word counts
        whwords = [
            'what', 'when', 'where', 'which', 'who', 'whom', 'whose', 'why',
            'how'
        ]
        feature_dict['src_num_wh'] = 0
        for token in tokens:
            if token in whwords:
                feature_dict['src_num_wh'] += 1
        feature_dict['thread_num_wh'] = 0
        for token in otherthreadtokens:
            if token in whwords:
                feature_dict['thread_num_wh'] += 1
        # Speech-act verb lexicons, one count feature per act
        SpeechAct = {}
        SpeechAct['SpeechAct_ORDER'] = [
            'command', 'demand', 'tell', 'direct', 'instruct', 'require',
            'prescribe', 'order'
        ]
        SpeechAct['SpeechAct_ASK1'] = [
            'ask', 'request', 'beg', 'beseech', 'implore', 'appeal',
            'plead', 'intercede', 'apply', 'urge', 'persuade', 'dissuade',
            'convince'
        ]
        SpeechAct['SpeechAct_ASK2'] = [
            'ask', 'inquire', 'enquire', 'interrogate', 'question', 'query'
        ]
        SpeechAct['SpeechAct_CALL'] = [
            'call', 'summon', 'invite', 'call on', 'call for', 'order',
            'book', 'reserve'
        ]
        SpeechAct['SpeechAct_FORBID'] = [
            'forbid', 'prohibit', 'veto', 'refuse', 'decline', 'reject',
            'rebuff', 'renounce', 'cancel', 'resign', 'dismiss'
        ]
        SpeechAct['SpeechAct_PERMIT'] = [
            'permit', 'allow', 'consent', 'accept', 'agree', 'approve',
            'disapprove', 'authorize', 'appoint'
        ]
        SpeechAct['SpeechAct_ARGUE'] = [
            'argue', 'disagree', 'refute', 'contradict', 'counter', 'deny',
            'recant', 'retort', 'quarrel'
        ]
        SpeechAct['SpeechAct_REPRIMAND'] = [
            'reprimand', 'rebuke', 'reprove', 'admonish', 'reproach', 'nag',
            'scold', 'abuse', 'insult'
        ]
        SpeechAct['SpeechAct_MOCK'] = ['ridicule', 'joke']
        SpeechAct['SpeechAct_BLAME'] = [
            'blame', 'criticize', 'condemn', 'denounce', 'deplore', 'curse'
        ]
        SpeechAct['SpeechAct_ACCUSE'] = [
            'accuse', 'charge', 'challenge', 'defy', 'dare'
        ]
        SpeechAct['SpeechAct_ATTACK'] = ['attack', 'defend']
        SpeechAct['SpeechAct_WARN'] = ['warn', 'threaten', 'blackmail']
        SpeechAct['SpeechAct_ADVISE'] = [
            'advise', 'counsel', 'consult', 'recommend', 'suggest',
            'propose', 'advocate'
        ]
        SpeechAct['SpeechAct_OFFER'] = ['offer', 'volunteer', 'grant', 'give']
        SpeechAct['SpeechAct_PRAISE'] = [
            'praise', 'commend', 'compliment', 'boast', 'credit'
        ]
        SpeechAct['SpeechAct_PROMISE'] = [
            'promise', 'pledge', 'vow', 'swear', 'vouch for', 'guarantee'
        ]
        SpeechAct['SpeechAct_THANK'] = [
            'thank', 'apologise', 'greet', 'welcome', 'farewell', 'goodbye',
            'introduce', 'bless', 'wish', 'congratulate'
        ]
        SpeechAct['SpeechAct_FORGIVE'] = [
            'forgive', 'excuse', 'justify', 'absolve', 'pardon', 'convict',
            'acquit', 'sentence'
        ]
        SpeechAct['SpeechAct_COMPLAIN'] = [
            'complain', 'protest', 'object', 'moan', 'bemoan', 'lament',
            'bewail'
        ]
        SpeechAct['SpeechAct_EXCLAIM'] = [
            'exclaim', 'enthuse', 'exult', 'swear', 'blaspheme'
        ]
        SpeechAct['SpeechAct_GUESS'] = [
            'guess', 'bet', 'presume', 'suspect', 'suppose', 'wonder',
            'speculate', 'conjecture', 'predict', 'forecast', 'prophesy'
        ]
        SpeechAct['SpeechAct_HINT'] = ['hint', 'imply', 'insinuate']
        SpeechAct['SpeechAct_CONCLUDE'] = [
            'conclude', 'deduce', 'infer', 'gather', 'reckon', 'estimate',
            'calculate', 'count', 'prove', 'compare'
        ]
        SpeechAct['SpeechAct_TELL'] = [
            'tell', 'report', 'narrate', 'relate', 'recount', 'describe',
            'explain', 'lecture'
        ]
        SpeechAct['SpeechAct_INFORM'] = [
            'inform', 'notify', 'announce', 'inform on', 'reveal'
        ]
        SpeechAct['SpeechAct_SUMUP'] = ['sum up', 'summarize', 'recapitulate']
        SpeechAct['SpeechAct_ADMIT'] = [
            'admit', 'acknowledge', 'concede', 'confess', 'confide'
        ]
        SpeechAct['SpeechAct_ASSERT'] = [
            'assert', 'affirm', 'claim', 'maintain', 'contend', 'state',
            'testify'
        ]
        SpeechAct['SpeechAct_CONFIRM'] = ['confirm', 'assure', 'reassure']
        SpeechAct['SpeechAct_STRESS'] = [
            'stress', 'emphasize', 'insist', 'repeat', 'point out', 'note',
            'remind', 'add'
        ]
        SpeechAct['SpeechAct_DECLARE'] = [
            'declare', 'pronounce', 'proclaim', 'decree', 'profess', 'vote',
            'resolve', 'decide'
        ]
        SpeechAct['SpeechAct_BAPTIZE'] = [
            'baptize', 'christen', 'name', 'excommunicate'
        ]
        SpeechAct['SpeechAct_REMARK'] = ['remark', 'comment', 'observe']
        SpeechAct['SpeechAct_ANSWER'] = ['answer', 'reply']
        SpeechAct['SpeechAct_DISCUSS'] = [
            'discuss', 'debate', 'negotiate', 'bargain'
        ]
        SpeechAct['SpeechAct_TALK'] = ['talk', 'converse', 'chat', 'gossip']
        # Substring matching, so e.g. 'ask' also fires on 'task'
        for k in SpeechAct.keys():
            feature_dict[k] = 0
            for verb in SpeechAct[k]:
                if verb in tw['text'].lower():
                    feature_dict[k] += 1
        fullthread_featdict[tw['id_str']] = feature_dict
    return fullthread_featdict
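
# Hedged usage sketch (not part of the original pipeline): illustrates the
# `conversation` shape the extractor above expects - a source post, a list of
# replies and a reply-tree 'structure'. The ids and texts are hypothetical,
# and running it assumes the module globals (nltk data, model_GN,
# tree2branches, preprocess_text, the badwords file) are available, hence it
# is left commented out.
#
# demo_conversation = {
#     'source': {'id_str': 'src1', 'text': 'Is this rumour true?'},
#     'replies': [{'id_str': 'r1', 'text': 'Unconfirmed, sounds like a hoax!'}],
#     'structure': {'src1': {'r1': []}},
# }
# feats = extract_thread_features_incl_response(demo_conversation)
# feats['r1']['hasqmark']    # -> 0 (no '?' in the reply)
# feats['r1']['src_rumour']  # -> 1 ('hoax' is in the rumour vocabulary)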
def load_test_data_reddit(
        path="/home/ifajcik/Work/NLP/semeval_2019/7_Rumour_Eval/rumoureval-2019-test-data/reddit-test-data"):
    conversation_ids = listdir_nohidden(path)
    conversations = {'dev': [], 'train': [], 'test': []}
    for id in conversation_ids:
        conversation = {}
        conversation['id'] = id
        # source post
        path_src = path + '/' + id + '/source-tweet'
        files_t = sorted(listdir_nohidden(path_src))
        with open(os.path.join(path_src, files_t[0])) as f:
            for line in f:
                src = json.loads(line)
                src['text'] = src['data']['children'][0]['data']['title']
                src['user'] = src['data']['children'][0]['data']['author']
                if files_t[0].endswith('.json'):
                    # the id is the file name without the .json suffix
                    src['id_str'] = files_t[0][:-5]
                else:
                    print("Unexpected source file (expected .json):",
                          files_t[0])
                src['used'] = 0
                src['setA'] = 'test'
                src['setB'] = 'test'
                conversation['source'] = src
        # replies
        tweets = []
        path_repl = path + '/' + id + '/replies'
        files_t = sorted(listdir_nohidden(path_repl))
        for repl_file in files_t:
            with open(os.path.join(path_repl, repl_file)) as f:
                for line in f:
                    tw = json.loads(line)
                    if 'body' in tw['data']:
                        tw['text'] = tw['data']['body']
                        tw['user'] = tw['data']['author']
                    else:
                        # deleted/removed posts carry no body
                        tw['text'] = ''
                        tw['user'] = ''
                    tw['used'] = 0
                    if repl_file.endswith('.json'):
                        tw['id_str'] = repl_file[:-5]
                    else:
                        print("Unexpected reply file (expected .json):",
                              repl_file)
                    tw['setA'] = 'test'
                    tweets.append(tw)
        conversation['replies'] = tweets
        # reply tree
        path_struct = path + '/' + id + '/structure.json'
        with open(path_struct, 'r') as f:
            struct = json.load(f)
        conversation['structure'] = struct
        conversation['branches'] = tree2branches(conversation['structure'])
        conversations['test'].append(conversation)
    return conversations
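
# The loader above assumes the standard RumourEval 2019 reddit layout, one
# directory per conversation:
#
#   <path>/<conversation_id>/source-tweet/<id>.json
#   <path>/<conversation_id>/replies/<reply_id>.json
#   <path>/<conversation_id>/structure.json
#
# Minimal consumption sketch (the default path is machine-specific):
#
# test_conversations = load_test_data_reddit()
# for conv in test_conversations['test']:
#     print(conv['id'], len(conv['replies']), len(conv['branches']))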
def load_test_data_twitter(set_path=PATH_TO_TEST_TWITTER):
    allconv = []
    train_dev_split = {'dev': [], 'train': [], 'test': []}
    tweet_data = sorted(os.listdir(set_path))
    tweet_data = [i for i in tweet_data if i[0] != '.']
    # conversation ids; source post id == conversation id
    conversation = {}
    # build conversations for every tweet topic
    for tweet_topic in tweet_data:
        path = os.path.join(set_path, tweet_topic)
        tweet_topic_data = sorted(os.listdir(path))
        tweet_topic_data = [i for i in tweet_topic_data if i[0] != '.']
        for foldr in tweet_topic_data:
            flag = 'test'
            conversation['id'] = foldr
            # replies (a thread may have none)
            tweets = []
            path_repl = path + '/' + foldr + '/replies'
            files_t = sorted(os.listdir(path_repl))
            files_t = [i for i in files_t if i[0] != '.']
            for repl_file in files_t:
                with open(os.path.join(path_repl, repl_file)) as f:
                    for line in f:
                        tw = json.loads(line)
                        tw['used'] = 0
                        tw['set'] = flag
                        tweets.append(tw)
                        if tw['text'] is None:
                            print("Tweet has no text", tw['id'])
            # always set replies, even when the thread is source-only
            conversation['replies'] = tweets
            # source tweet
            path_src = path + '/' + foldr + '/source-tweet'
            files_t = sorted(os.listdir(path_src))
            with open(os.path.join(path_src, files_t[0])) as f:
                for line in f:
                    src = json.loads(line)
                    src['used'] = 0
                    src['set'] = flag
                    conversation['source'] = src
                    if src['text'] is None:
                        print("Tweet has no text", src['id'])
            # reply tree; keep only the subtree rooted at the source tweet id
            path_struct = path + '/' + foldr + '/structure.json'
            with open(path_struct) as f:
                for line in f:
                    struct = json.loads(line)
            if len(struct) > 1:
                struct = {foldr: struct[foldr]}
            conversation['structure'] = struct
            conversation['branches'] = tree2branches(conversation['structure'])
            train_dev_split[flag].append(conversation.copy())
            allconv.append(conversation.copy())
            if not tweets:
                # log source-only conversations
                print(foldr)
    return train_dev_split
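
# Hedged sketch of the pruning and flattening done above: when structure.json
# contains more than one root, only the subtree keyed by the source tweet id
# is kept, and tree2branches() (defined elsewhere in this repo) is then
# expected to flatten that tree into root-to-leaf id lists, e.g.
#
# struct = {'10': {'11': {'13': []}, '12': []}}
# tree2branches(struct)  # expected: [['10', '11', '13'], ['10', '12']]
#
# (illustrative ids; the exact return shape is defined by tree2branches)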
def load_data():
    # reddit part of the mixed twitter + reddit training data
    path_dev = os.path.join(TRAIN_DATA_PREFIX, "dev-key.json")
    with open(path_dev, 'r') as f:
        dev_key = json.load(f)
    path_train = os.path.join(TRAIN_DATA_PREFIX, "train-key.json")
    with open(path_train, 'r') as f:
        train_key = json.load(f)

    def assign_taskA(post, kind):
        # attach the task A stance label and split to a source post or reply
        if post['id_str'] in dev_key['subtaskaenglish']:
            post['setA'] = 'dev'
            post['label'] = dev_key['subtaskaenglish'][post['id_str']]
        elif post['id_str'] in train_key['subtaskaenglish']:
            post['setA'] = 'train'
            post['label'] = train_key['subtaskaenglish'][post['id_str']]
        else:
            print("Post was not found! Task A,", kind, "ID:", post['id_str'])

    conversations = {'dev': [], 'train': [], 'test': []}
    # the training and dev reddit dumps share the same layout, so one loop
    # handles both
    for subdir, split in [("reddit-training-data", 'train'),
                          ("reddit-dev-data", 'dev')]:
        path = os.path.join(TRAIN_DATA_PREFIX, subdir)
        conversation_ids = listdir_nohidden(path)
        for id in conversation_ids:
            conversation = {}
            conversation['id'] = id
            # source post
            path_src = path + '/' + id + '/source-tweet'
            files_t = sorted(listdir_nohidden(path_src))
            with open(os.path.join(path_src, files_t[0])) as f:
                for line in f:
                    src = json.loads(line)
                    src['text'] = src['data']['children'][0]['data']['title']
                    src['user'] = src['data']['children'][0]['data']['author']
                    if files_t[0].endswith('.json'):
                        src['id_str'] = files_t[0][:-5]
                    else:
                        print("Unexpected source file (expected .json):",
                              files_t[0])
                    src['used'] = 0
                    assign_taskA(src, 'Post')
                    # the task B veracity label lives on the conversation
                    if src['id_str'] in dev_key['subtaskbenglish']:
                        src['setB'] = 'dev'
                        conversation['veracity'] = \
                            dev_key['subtaskbenglish'][src['id_str']]
                    elif src['id_str'] in train_key['subtaskbenglish']:
                        src['setB'] = 'train'
                        conversation['veracity'] = \
                            train_key['subtaskbenglish'][src['id_str']]
                    else:
                        print("Post was not found! Task B, Post ID:",
                              src['id_str'])
                    conversation['source'] = src
            # replies
            tweets = []
            path_repl = path + '/' + id + '/replies'
            files_t = sorted(listdir_nohidden(path_repl))
            for repl_file in files_t:
                with open(os.path.join(path_repl, repl_file)) as f:
                    for line in f:
                        tw = json.loads(line)
                        if 'body' in tw['data']:
                            tw['text'] = tw['data']['body']
                            tw['user'] = tw['data']['author']
                        else:
                            # deleted/removed posts carry no body
                            tw['text'] = ''
                            tw['user'] = ''
                        if repl_file.endswith('.json'):
                            tw['id_str'] = repl_file[:-5]
                        else:
                            print("Unexpected reply file (expected .json):",
                                  repl_file)
                        tw['used'] = 0
                        assign_taskA(tw, 'Reply')
                        tweets.append(tw)
            conversation['replies'] = tweets
            # reply tree
            path_struct = path + '/' + id + '/structure.json'
            with open(path_struct, 'r') as f:
                struct = json.load(f)
            conversation['structure'] = struct
            conversation['branches'] = tree2branches(conversation['structure'])
            conversations[split].append(conversation)
    return conversations
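
# Hedged end-to-end sketch: load_data() returns the reddit conversations
# keyed by split, and each conversation can be fed directly to the feature
# extractor defined above. Assumes dev-key.json, train-key.json and the
# reddit dumps exist under TRAIN_DATA_PREFIX.
#
# reddit = load_data()
# for conv in reddit['train'][:3]:
#     feats = extract_thread_features_incl_response(conv)
#     print(conv['id'], len(feats), 'posts featurised')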
def load_dataset():
    # Load labels and split for task A and task B
    tweet_label_dict, veracity_label_dict = load_true_labels()
    dev = tweet_label_dict['dev']
    train = tweet_label_dict['train']
    dev_tweets = dev.keys()
    train_tweets = train.keys()
    # Load folds and conversations
    path_to_folds = os.path.join(TRAIN_DATA_PREFIX, 'twitter-english')
    folds = sorted(os.listdir(path_to_folds))
    folds = [i for i in folds if i[0] != '.']
    # cvfolds collects per-fold conversations but is not returned
    cvfolds = {}
    allconv = []
    train_dev_split = {'dev': [], 'train': [], 'test': []}
    # iterate over all tweet groups (referred to as 'folds') - charliehebdo
    # etc.
    for nfold, fold in enumerate(folds):
        path_to_tweets = os.path.join(path_to_folds, fold)
        tweet_data = sorted(os.listdir(path_to_tweets))
        tweet_data = [i for i in tweet_data if i[0] != '.']
        # conversation ids; source post id == conversation id
        conversation = {}
        # build conversations for the tweet group
        for foldr in tweet_data:
            flag = 0
            conversation['id'] = foldr
            tweets = []
            path_repl = path_to_tweets + '/' + foldr + '/replies'
            files_t = sorted(os.listdir(path_repl))
            files_t = [i for i in files_t if i[0] != '.']
            if files_t != []:
                # iterate over json reply files; the replies determine
                # which split the whole tree belongs to
                for repl_file in files_t:
                    with open(os.path.join(path_repl, repl_file)) as f:
                        for line in f:
                            tw = json.loads(line)
                            tw['used'] = 0
                            replyid = tw['id_str']
                            if replyid in dev_tweets:
                                tw['set'] = 'dev'
                                tw['label'] = dev[replyid]
                                if flag == 'train':
                                    print("The tree is split between sets",
                                          foldr)
                                flag = 'dev'
                            elif replyid in train_tweets:
                                tw['set'] = 'train'
                                tw['label'] = train[replyid]
                                if flag == 'dev':
                                    print("The tree is split between sets",
                                          foldr)
                                flag = 'train'
                            else:
                                print("Tweet was not found! ID:", foldr)
                            tweets.append(tw)
                            if tw['text'] is None:
                                print("Tweet has no text", tw['id'])
            else:
                # source-only conversation: default it to the training split
                # and log it
                flag = 'train'
                print(foldr)
            # always set replies, even when the thread is source-only
            conversation['replies'] = tweets
            # source tweet
            path_src = path_to_tweets + '/' + foldr + '/source-tweet'
            files_t = sorted(os.listdir(path_src))
            with open(os.path.join(path_src, files_t[0])) as f:
                for line in f:
                    src = json.loads(line)
                    src['used'] = 0
                    srcid = src['id_str']
                    src['set'] = flag
                    src['label'] = tweet_label_dict[flag][srcid]
                    conversation['source'] = src
                    conversation['veracity'] = veracity_label_dict[flag][srcid]
                    if src['text'] is None:
                        print("Tweet has no text", src['id'])
            # reply tree
            path_struct = path_to_tweets + '/' + foldr + '/structure.json'
            with open(path_struct) as f:
                for line in f:
                    struct = json.loads(line)
            if len(struct) > 1:
                if foldr == '553480082996879360':
                    # this conversation's structure file has extra roots whose
                    # subtrees actually belong under the source tweet
                    new_struct = {}
                    new_struct[foldr] = struct[foldr]
                    new_struct[foldr]['553495625527209985'] = \
                        struct['553485679129534464']['553495625527209985']
                    new_struct[foldr]['553495937432432640'] = \
                        struct['553490097623269376']['553495937432432640']
                    struct = new_struct
                else:
                    # keep only the subtree rooted at the source tweet id
                    struct = {foldr: struct[foldr]}
            conversation['structure'] = struct
            conversation['branches'] = tree2branches(conversation['structure'])
            train_dev_split[flag].append(conversation.copy())
            allconv.append(conversation.copy())
        cvfolds[fold] = allconv
        allconv = []
    return train_dev_split