Example #1
    def _build_network(self, path, char_dict, mode, min_lines=5):
        if 'agarwal' in path:
            source = 'agarwal'
            scenes = get_boundaries_agarwal(path)
        else:
            source = 'gorinski'
            scenes = get_boundaries_gorinski(path)

        var2info = ag.get_variant_as_key(char_dict)
        char2lines = get_char_to_lines(path, char_dict)

        G = nx.Graph()
        for scene in scenes:
            cdl = ag.get_char_diag_list(scene, var2info, source)

            if mode == 'overlap':
                # connect all characters in this scene
                char_tuples = set(cd[0] for cd in cdl)
                char_tuples = sorted(char_tuples, key=lambda x: x[0])  # sort by name
                for i, (cname1, _, _) in enumerate(char_tuples):
                    if len(char2lines[cname1]) >= min_lines:
                        for cname2, _, _ in char_tuples[i + 1:]:
                            if len(char2lines[cname2]) >= min_lines:
                                G.add_edge(cname1, cname2)
            else:
                # only connect characters who speak consecutively
                for i in range(len(cdl)-1):
                    cname1, _, _ = cdl[i][0]
                    if len(char2lines[cname1]) >= min_lines:
                        cname2, _, _ = cdl[i+1][0]
                        if len(char2lines[cname2]) >= min_lines:
                            G.add_edge(cname1, cname2)
        return G
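A self-contained toy sketch of the two edge-building modes (the speaker lists and names below are invented, not the real scene format):

import networkx as nx

# Toy scenes: each inner list holds speakers in order of appearance.
scene_speakers = [['ALICE', 'BOB', 'CAROL'], ['BOB', 'DANA']]

G_overlap = nx.Graph()
G_consec = nx.Graph()
for speakers in scene_speakers:
    # 'overlap' mode: connect every pair of characters sharing a scene
    for i, a in enumerate(speakers):
        for b in speakers[i + 1:]:
            G_overlap.add_edge(a, b)
    # consecutive mode: connect only characters who speak back-to-back
    for a, b in zip(speakers, speakers[1:]):
        G_consec.add_edge(a, b)

print(sorted(G_overlap.edges()))  # 4 edges
print(sorted(G_consec.edges()))   # 3 edges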
Example #2
    def _predict(self, path, char_dict):
        if 'agarwal' in path:
            source = 'agarwal'
            scenes = get_boundaries_agarwal(path)
        else:
            source = 'gorinski'
            scenes = get_boundaries_gorinski(path)

        male_chars = self.get_male_chars(char_dict)  # soft mode
        var2info = ag.get_variant_as_key(char_dict)

        no_man_ff = 0
        ff_count = 0
        for scene in scenes:
            cdl = ag.get_char_diag_list(scene, var2info, source)
            ffs = ag.get_ff_conversations(cdl)
            ff_count += len(ffs)
            # a non-empty ffs means the scene passes the "consecutive soft" criterion
            for ff in ffs:
                if self.no_man_conversation(ff, male_chars):
                    no_man_ff += 1
                    if self.binary:
                        return 1
        if self.binary:
            return 0
        return no_man_ff, ff_count
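A toy sketch of the counting logic; the stand-in for self.no_man_conversation below is invented (the real check is not shown in this snippet):

# Invented stand-in: a conversation passes if no line mentions a male character.
male_chars = {'BOB', 'TOM'}

def no_man_conversation(conv, males):
    return not any(m in line.upper() for _, line in conv for m in males)

ffs = [
    [('ALICE', 'I saw Bob yesterday.'), ('CAROL', 'Oh?')],
    [('ALICE', 'Lovely weather.'), ('CAROL', 'Indeed.')],
]
no_man_ff = sum(no_man_conversation(ff, male_chars) for ff in ffs)
print(no_man_ff, len(ffs))  # 1 2 -> the non-binary return value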
Example #3
    def predict_overlap(self, path, char_dict, mode):
        if 'agarwal' in path:
            source = 'agarwal'
            scenes = get_boundaries_agarwal(path)
        else:
            source = 'gorinski'
            scenes = get_boundaries_gorinski(path)

        var2info = ag.get_variant_as_key(char_dict)

        for scene in scenes:
            cdl = ag.get_char_diag_list(scene, var2info, source)
            if self.overlap_in_scene(cdl, mode):
                return 1
        return 0
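The 'agarwal'/'gorinski' dispatch at the top of every example is a natural candidate for a shared helper; a hypothetical refactor (not part of the original codebase):

def get_source_and_scenes(path):
    # Hypothetical helper: centralizes the repeated source dispatch.
    if 'agarwal' in path:
        return 'agarwal', get_boundaries_agarwal(path)
    return 'gorinski', get_boundaries_gorinski(path)

# usage: source, scenes = get_source_and_scenes(path)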
Example #4
    def extract_features(self, path, char_dict):
        if self.verbose: print(path)

        if 'agarwal' in path:
            source = 'agarwal'
            scenes = get_boundaries_agarwal(path)
        else:
            source = 'gorinski'
            scenes = get_boundaries_gorinski(path)

        var2info = ag.get_variant_as_key(char_dict)

        feats = np.zeros(3, dtype=int)  # counts per rating (1-3); np.int was removed in NumPy 1.24
        for scene in scenes:
            cdl = ag.get_char_diag_list(scene, var2info, source)
            rating = self.rate_scene(cdl)
            if rating >= 1:
                feats[rating - 1] += 1

        return feats
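A toy run of the rating histogram above (the ratings are invented; rate_scene is assumed to return an integer 0-3):

import numpy as np

ratings = [1, 3, 3, 2, 0, 3]   # invented scene ratings
feats = np.zeros(3, dtype=int)
for r in ratings:
    if r >= 1:                 # rating 0 contributes nothing
        feats[r - 1] += 1
print(feats)  # [1 1 3]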
Example #5
def get_dialogue_per_scene(files, source):
    # files: (scene_dir, by_gender_file) pairs; returns
    # {by_gender_file: {scene_file: char_diag_list}}
    nested_scenes = {}

    for scene_dir, gender_file in files:
        char_dict = parse_by_gender_file(gender_file)
        char_dict = char_dict[-1][-1]
        var = get_variant_as_key(char_dict)

        nested_scenes[gender_file] = {}

        for s in os.listdir(scene_dir):
            name = os.path.join(scene_dir, s)
            with open(name) as fp:
                c = fp.readlines()
                nested_scenes[gender_file][name] = get_char_diag_list(c, var, source)

    return nested_scenes
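The returned structure, sketched with invented values; the inner list format follows the ((name, gender, score), lines) pairs unpacked in Example #6:

# {by_gender_file: {scene_file: [((name, gender, score), [lines]), ...]}}
nested = {
    'by_gender/movie1.txt': {
        'scenes/movie1/0001.txt': [(('ALICE', 'f', '1.0'), ['Hi there.'])],
    },
}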
Example #6
    def transform(self, X):
        if self.verbose:
            print('Transforming {} samples into {}'.format(
                str(len(X)), ', '.join(self.feats)))
        feat_mats = []

        if 'UNI' in self.feats:
            if self.verbose: print('Building UNIGRAMS model...')
            # corpus to train unigrams model - either all fem dialogue or all fem-fem dialogue
            diag_per_movie = []
            for i, (_, path, char_dict) in enumerate(X):  # id unused here
                this_diag = ''
                if self.verbose and i % 50 == 0: print(i)
                if 'agarwal' in path:
                    source = 'agarwal'
                    scenes = get_boundaries_agarwal(path)
                else:
                    source = 'gorinski'
                    scenes = get_boundaries_gorinski(path)
                var2info = ag.get_variant_as_key(char_dict)
                for scene in scenes:
                    cdl = ag.get_char_diag_list(scene, var2info, source)
                    if self.uni_only_ff:
                        ffs = ag.get_ff_conversations(cdl)
                        for ff in ffs:
                            for char, line in ff:
                                this_diag += line
                    else:
                        for (char, gen, score), diag in cdl:  # for each character/line
                            if score != 'None' and float(score) > .5:
                                line = ' '.join(diag)
                                if len(line) > 0:
                                    this_diag += ' ' + line
                diag_per_movie.append(this_diag)

            # transform into bag-of-words unigram model
            if self.countvec is None:  # train
                self.countvec = CountVectorizer(max_features=self.uni_count)
                unigrams = self.countvec.fit_transform(diag_per_movie)
            else:  # test
                unigrams = self.countvec.transform(diag_per_movie)
            if self.verbose: print('Unigrams:', unigrams.shape)
            feat_mats.append(unigrams.toarray())

        if 'SNA' in self.feats:
            if self.verbose: print('Building SNA features...')
            sn_feats = []
            for i, (movie_id, path, char_dict) in enumerate(X):  # avoid shadowing builtin id
                if self.verbose and i % 50 == 0: print(i)
                sn_feats.append(self.sna.transform_into_feats(
                    movie_id, self.sna_mode, self.sna_min_lines,
                    self.sna_centralities))
            sn_feats = np.array(sn_feats)
            if self.verbose: print('SNA features:', sn_feats.shape)
            feat_mats.append(sn_feats)

        if 'FRA' in self.feats:
            if self.verbose: print('Building FRAME features...')
            fr_feats = []
            for i, (movie_id, path, char_dict) in enumerate(X):
                if self.verbose and i % 50 == 0: print(i)
                scores = self.id2frames[movie_id]
                if self.fr_mode == 'both':
                    feats = np.concatenate(
                        (scores['ff'], scores['fm'], scores['mm']), axis=0)
                elif self.fr_mode == 'agency':
                    feats = np.concatenate(
                        (scores['ff'][:3], scores['fm'][:3], scores['fm'][6:9],
                         scores['mm'][:3]),
                        axis=0)
                elif self.fr_mode == 'power':
                    feats = np.concatenate(
                        (scores['ff'][3:], scores['fm'][3:6], scores['fm'][9:],
                         scores['mm'][3:]),
                        axis=0)
                elif self.fr_mode == 'ff':
                    feats = scores['ff']
                elif self.fr_mode == 'fm':
                    feats = scores['fm']
                elif self.fr_mode == 'ffmm':
                    feats = np.concatenate((scores['ff'], scores['mm']),
                                           axis=0)
                elif self.fr_mode == 'mm':
                    feats = scores['mm']
                else:
                    raise ValueError('Invalid frame mode: {}'.format(self.fr_mode))
                fr_feats.append(feats)
            fr_feats = np.array(fr_feats)
            fr_feats = MinMaxScaler().fit_transform(fr_feats)  # note: refit on every call, unlike self.countvec above
            if self.verbose: print('FRAME features:', fr_feats.shape)
            feat_mats.append(fr_feats)

        if 'RB' in self.feats:
            if self.verbose: print('Building RULE-BASED features...')
            X_rb = [(x[1], x[2]) for x in X]
            rb_feats = self.rb.predict(X_rb)
            rb_feats = np.array(rb_feats)
            if self.verbose: print('RB features:', rb_feats.shape)
            feat_mats.append(rb_feats)

        X = np.concatenate(feat_mats, axis=1)
        if self.verbose: print('X-shape:', X.shape)

        return X
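A toy sketch of the final column-wise concatenation (shapes invented):

import numpy as np

unigrams = np.zeros((4, 100))  # 4 samples x 100 unigram counts
sn_feats = np.zeros((4, 6))    # 4 samples x 6 SNA features
X = np.concatenate([unigrams, sn_feats], axis=1)
print(X.shape)  # (4, 106)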