Example #1
import argparse
import os
import re

import pandas as pd

# clean_text and get_stopwords are helper functions defined elsewhere in the
# project and are assumed to be importable here.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        default='../data/subtitles/subtitlesInTSV/')
    args = parser.parse_args()
    data_dir = args.data_dir
    # keep only the per-episode subtitle files, e.g. S1E12.tsv
    dialogue_files = [
        f for f in os.listdir(data_dir)
        if re.findall(r'S[1-9]E[0-9]+\.tsv', f)
    ]
    dialogue_files = [os.path.join(data_dir, f) for f in dialogue_files]
    # English stopwords plus an extra token and contraction fragments
    stops = get_stopwords('en') + [
        'will',
        'don',
        've',
    ]
    all_docs = {}
    # build one concatenated text document per episode, chunk by chunk
    for f in dialogue_files:
        ep_name = re.findall('S[1-9]E[0-9]+', f)[0]
        data = pd.read_csv(f, sep='\t')
        docs = []
        for chunk, data_group in data.groupby('chunk'):
            clean_dialogue = []
            for d in data_group['dialogue']:
                # print('raw dialogue %s'%(d))
                cleaned = clean_text(str(d))
                try:
                    # clean_text may return raw bytes; normalize to unicode text
                    if isinstance(cleaned, bytes):
                        cleaned = cleaned.decode('utf-8')
                    clean_dialogue.append(cleaned)
                except Exception as e:
                    print('could not clean text %s because error %s' %
                          (cleaned, e))
            all_dialogue = ' '.join(clean_dialogue)
            docs.append(all_dialogue)
        episode_text = ' '.join(docs)
        # print('got full text %s'%
        #       (episode_text))
        all_docs[ep_name] = episode_text
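
Example #1 leans on a clean_text helper that lives elsewhere in the repository and is not shown here. Purely as an illustration of the kind of normalization it might perform (the real helper may differ), a minimal stand-in could be:

import re

def clean_text(text):
    # hypothetical stand-in for the project's clean_text helper: lower-case,
    # drop bracketed stage directions, keep letters and apostrophes,
    # and collapse whitespace
    text = text.lower()
    text = re.sub(r'\[[^\]]*\]', ' ', text)
    text = re.sub(r"[^a-z' ]", ' ', text)
    return re.sub(r'\s+', ' ', text).strip()
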
Example #2
# Body of a per-episode loop in the original script: e is the episode file
# name, and episode_data, full_chunk_list, LIWC_categories,
# LIWC_category_wordlists, TKNZR and count_option are defined earlier in
# that script.
print('processing episode %s' % (e))
e_data = episode_data[e]
e_name = e.split('.tsv')[0]
# sort_values returns a new DataFrame, so the result must be assigned
e_data = e_data.sort_values('chunk', ascending=True)
# insert dummy rows for chunks with no dialogue so every chunk is represented
empty_chunks = full_chunk_list - set(e_data['chunk'].unique())
if (len(empty_chunks) > 0):
    print('filling %s with empty chunks %s' % (e_name, empty_chunks))
    empty_chunk_rows = pd.DataFrame([{
        'chunk': c,
        'dialogue': ''
    } for c in empty_chunks])
    e_data = pd.concat([e_data, empty_chunk_rows], axis=0)
# concatenate and clean the dialogue within each chunk
chunk_iter = e_data.groupby('chunk')
chunk_text = [
    clean_text(' '.join(map(str, c[1]['dialogue'].tolist())))
    for c in chunk_iter
]
# count LIWC category matches per chunk
chunk_LIWC_counts = {c: [] for c in LIWC_categories}
for t in chunk_text:
    tokens = TKNZR.tokenize(t)
    for c in LIWC_categories:
        counts = get_LIWC_counts(tokens,
                                 LIWC_words=LIWC_category_wordlists[c])
        if (count_option == 'total'):
            # total occurrences of category words in the chunk
            total_counts = sum(counts.values())
        elif (count_option == 'unique'):
            # number of distinct category words that appeared
            total_counts = len(counts)
        # TODO: store individual words as well as aggregate counts
        chunk_LIWC_counts[c].append(total_counts)
chunk_LIWC_counts = pd.DataFrame(chunk_LIWC_counts)
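
After this loop, chunk_LIWC_counts is a DataFrame with one row per chunk (in the order of chunk_text) and one column per LIWC category, so per-episode aggregates fall out directly, for example:

# one aggregate value per LIWC category for the whole episode
episode_totals = chunk_LIWC_counts.sum(axis=0)
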
Example #3
import os
from collections import Counter

from nltk.tokenize import WordPunctTokenizer

# clean_text and get_LIWC_counts are project helpers; LIWC_dir, categories
# and files are defined earlier in the original script.

tokenizer = WordPunctTokenizer()
# anchor each LIWC word so that matching is against whole tokens only
LIWC_words = {
    category: ["^" + l.strip() + "$"
               for l in open(os.path.join(LIWC_dir, category), 'r')]
    for category in categories
}
# print(LIWC_words["positive_affect"])

jsonList = []

for filename in files:
    print(filename)
    with open(filename) as fin:
        categoryCounts = {category: Counter() for category in categories}
        for line in fin:
            # skip the header row of each subtitle TSV
            if "frameNo" in line:
                continue
            dialogue = line.strip().split("\t")[-1]
            dialogue = dialogue.replace("$", " ")
            tokens = tokenizer.tokenize(clean_text(dialogue.strip()))
            # print(tokens)
            for category in categories:
                # print(category)
                # print(LIWC_words[category])
                counts = get_LIWC_counts(tokens, LIWC_words=LIWC_words[category])
                if len(counts) > 0:
                    # print(counts)
                    categoryCounts[category].update(counts)
    temp_dict = {"name": os.path.basename(filename)}
    temp_dict.update(categoryCounts)
    jsonList.append(temp_dict)
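
get_LIWC_counts itself is not shown in any of these examples. Judging only from how its return value is used (summing counts.values(), taking len(counts), and passing it to Counter.update), a minimal sketch could look like the following; the whole-token regex matching is an assumption:

import re
from collections import Counter

def get_LIWC_counts(tokens, LIWC_words):
    # hypothetical sketch: LIWC_words is a list of anchored patterns such as
    # "^happy$"; return a Counter mapping each pattern to the number of
    # tokens it matched, omitting patterns with zero matches
    counts = Counter()
    for pattern in LIWC_words:
        regex = re.compile(pattern)
        n_matches = sum(1 for t in tokens if regex.match(t))
        if n_matches > 0:
            counts[pattern] = n_matches
    return counts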


# In[3]:
import os
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# clean_text is a project helper assumed to be importable here.

subtitle_dir = '../data/subtitles/subtitlesInTSV/'
all_episodes = [f for f in os.listdir(subtitle_dir)
                if re.findall('S[0-9]E[0-9]+', f)]
sorted_episodes = sorted(all_episodes)
episode_data = {e: pd.read_csv(os.path.join(subtitle_dir, e), sep='\t')
                for e in sorted_episodes}
all_counts = {}
min_df = 1
stop_words = []
cv = CountVectorizer(min_df=min_df, encoding='utf-8', stop_words=stop_words)
for e in sorted_episodes:
    print('processing episode %s' % (e))
    e_data = episode_data[e]
    all_dialogue = []
    for d in e_data['dialogue']:
        clean_dialogue = clean_text(str(d)).replace('$', ' ').strip()

        if (clean_dialogue != ''):
            try:
                # make sure the cleaned line vectorizes before keeping it
                cv.fit_transform([clean_dialogue])
                all_dialogue.append(clean_dialogue)
            except Exception as err:
                # use a separate name so the episode variable e is not clobbered
                print('clean dialogue %s caused error %s' % (clean_dialogue, err))
    counts = cv.fit_transform(all_dialogue)
    # only care about per-episode counts
    count_sums = list(np.array(counts.sum(axis=0))[0])
    sorted_vocab = sorted(cv.vocabulary_.keys(), key=lambda x: cv.vocabulary_[x])
    # make count dict from sorted vocab and sum counts
    count_dict = dict(zip(sorted_vocab, count_sums))
    all_counts[e] = count_dict
# now combine all counts
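
The cell stops at the combining step. One way to finish it (an assumption about the intended output, not code from the original) is to pivot all_counts into a term-by-episode table:

# rows = vocabulary terms, columns = episodes, missing terms filled with 0
combined_counts = pd.DataFrame(all_counts).fillna(0).astype(int)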