示例#1
0
    def outline(self, url=None, episode=None, **kwargs):
        """Load an 'outline' package resource into a `Transcription`.

        Every non-blank line of the resource is parsed as::

            <start> <end> <type> <text ...>

        and becomes one edge going from `float(start)` to `float(end)`,
        carrying a single attribute named `<type>` whose value is the rest
        of the line (tokens re-joined with single spaces).

        Parameters
        ----------
        url : str, optional
            Resource name; the file path is obtained from
            `resource_filename(<name of this class>, url)`.
        episode : optional
            Passed as-is to `Transcription(episode=...)`.

        Returns
        -------
        transcription : Transcription

        Raises
        ------
        IndexError
            If a non-blank line has fewer than three fields.
        ValueError
            If a start or end field cannot be converted to float.
        """

        # path to 'outline' package resource
        path = resource_filename(self.__class__.__name__, url)

        # create empty transcription
        transcription = Transcription(episode=episode)

        # stream the file line by line instead of loading it all first
        with open(path, 'r') as f:
            for line in f:

                # split() without argument also discards the trailing newline
                tokens = line.split()

                # blank (or whitespace-only) lines carry no annotation;
                # indexing them below would raise IndexError
                if not tokens:
                    continue

                # parse line
                start_time = float(tokens[0])
                end_time = float(tokens[1])
                data_type = str(tokens[2])
                data = " ".join(tokens[3:])

                # add corresponding edge
                transcription.add_edge(start_time, end_time,
                                       **{data_type: data})

        return transcription
示例#2
0
    def transcript(self, url=None, episode=None, **kwargs):
        """Build a chain of speech turns from a transcript resource.

        Each line yielded by `self.iterlines(path)` is split on whitespace:
        the first token is the speaker, the remaining tokens (joined with
        single spaces) are the speech, e.g. ``ARYA_STARK I'm not a boy!``
        gives speaker ``ARYA_STARK`` and speech ``I'm not a boy!``.

        Returns
        -------
        transcription : Transcription
            One (speaker, speech) edge per line; consecutive turns are
            linked by an edge without attributes.
        """
        path = resource_filename(self.__class__.__name__, url)
        graph = Transcription(episode=episode)

        # end of the most recent turn (None until the first line is read)
        last_end = None

        for line in self.iterlines(path):
            words = line.split()
            speaker = words[0]
            speech = ' '.join(words[1:])

            turn_start = T()
            turn_end = T()

            # link this turn to the one before it, if there is one
            if last_end is not None:
                graph.add_edge(last_end, turn_start)

            graph.add_edge(turn_start, turn_end,
                           speaker=speaker, speech=speech)

            last_end = turn_end

        return graph
示例#3
0
    def outline(self, url=None, episode=None, **kwargs):
        """Read an 'outline' package resource and return it as a graph.

        The file is read completely, then each line is split on whitespace.
        Fields 0 and 1 are converted to float and used as edge start and
        end; field 2 names the edge attribute; the remaining fields, joined
        with single spaces, are that attribute's value.

        Returns
        -------
        transcription : Transcription
        """
        resource_path = resource_filename(self.__class__.__name__, url)
        graph = Transcription(episode=episode)

        # read everything up-front, one list of fields per line
        with open(resource_path, 'r') as handle:
            rows = [raw.strip().split() for raw in handle.readlines()]

        for fields in rows:
            begin = float(fields[0])
            finish = float(fields[1])
            label = str(fields[2])
            attributes = {label: " ".join(fields[3:])}

            graph.add_edge(begin, finish, **attributes)

        return graph
示例#4
0
    def outline_www(self, url=None, episode=None, **kwargs):
        """Build a scene outline from the <h2> sections of a web page.

        The page is walked starting right after its first <h2>. Every
        following <h2> opens a new section, and the text of every <p> met
        inside a section is appended to it (texts joined with one space).
        Each section becomes one `scene` edge; a section without any <p>
        gets the placeholder text ``"----"``. <p> elements met before the
        first following <h2> belong to no section and are ignored (the
        previous implementation failed on them with a TypeError).

        Parameters
        ----------
        url : str, optional
            URL where resource is available
        episode : Episode, optional
            Episode for which resource should be downloaded
            Useful in case a same URL contains resources for multiple episodes.

        Returns
        -------
        G : Transcription
            TStart and TEnd joined by a chain alternating attribute-less
            edges and `scene` edges.

        Raises
        ------
        IndexError
            If the page contains no <h2> element at all.
        """

        soup = BeautifulSoup(self.download_as_utf8(url))
        headings = soup.find_all('h2')

        # one list of paragraph texts per <h2> met after the first one
        sections = []
        for element in headings[0].next_elements:
            if element.name == 'h2':
                sections.append([])
            elif element.name == 'p' and sections:
                sections[-1].append(element.text)

        G = Transcription(episode=episode)
        previous = TStart

        for paragraphs in sections:
            scene = " ".join(paragraphs) if paragraphs else "----"

            # NOTE(review): kept from the previous implementation -- the
            # outline ends at the first section whose text is empty.
            if not scene:
                break

            scene_start = T()
            scene_end = T()

            # add /empty/ edge between previous and next annotations
            G.add_edge(previous, scene_start)

            # add next annotation
            G.add_edge(scene_start, scene_end, scene=scene)

            previous = scene_end

        # add /empty/ edge between previous annotation and episode end
        G.add_edge(previous, TEnd)

        return G
示例#5
0
    def outline_www(self, url=None, episode=None, **kwargs):
        """Turn the <h2> sections of a downloaded page into `scene` edges.

        Parameters
        ----------
        url : str, optional
            URL where resource is available
        episode : Episode, optional
            Episode for which resource should be downloaded
            Useful in case a same URL contains resources for multiple episodes.

        Returns
        -------
        G : Transcription
        """

        soup = BeautifulSoup(self.download_as_utf8(url))
        headings = soup.find_all('h2')

        # text stored for a section that has not received any <p> yet
        placeholder = "----"

        # section number -> accumulated paragraph text
        texts = {}
        section = 0

        # walk everything that follows the first <h2>
        for node in headings[0].next_elements:
            tag = node.name

            if tag == 'p':
                current = texts.get(section)
                if current == placeholder:
                    # first paragraph of the section replaces the placeholder
                    texts[section] = node.text
                else:
                    texts[section] = current + " " + node.text

            if tag == 'h2':
                # a new heading opens the next section
                section += 1
                texts[section] = placeholder

        G = Transcription(episode=episode)
        tail = TStart

        # sections are numbered from 1; stop at the first missing/empty one
        index = 1
        while texts.get(index):
            # /empty/ edge between previous and next annotations
            gap_end = T()
            G.add_edge(tail, gap_end)

            # the annotation itself
            tail = T()
            G.add_edge(gap_end, tail, scene=texts.get(index))

            index += 1

        # /empty/ edge between last annotation and episode end
        G.add_edge(tail, TEnd)

        return G
示例#6
0
    def transcript(self, url=None, episode=None, **kwargs):
        """Convert a ``SPEAKER words ...`` resource into a chain of turns.

        For every line yielded by `self.iterlines(path)`, the first
        whitespace-separated field is stored as `speaker` and the remaining
        fields, joined with single spaces, as `speech` on a new edge.
        Each new edge is connected to the end of the previous one by an
        edge without attributes.

        Returns
        -------
        transcription : Transcription
        """
        path = resource_filename(self.__class__.__name__, url)
        transcription = Transcription(episode=episode)

        # end node of the previous turn; None before the first turn
        tail = None

        for raw in self.iterlines(path):

            # e.g. "ARYA_STARK I'm not a boy!"
            fields = raw.split()
            attributes = {
                'speaker': fields[0],
                'speech': ' '.join(fields[1:]),
            }

            head, new_tail = T(), T()

            if tail is not None:
                transcription.add_edge(tail, head)

            transcription.add_edge(head, new_tail, **attributes)

            tail = new_tail

        return transcription
示例#7
0
    def firstResource(self, url=None, episode=None, **kwargs):
        """Download `episode` `firstResource` from `url`

        As written, this method only creates an empty `Transcription` for
        `episode` and returns it; `url` and `kwargs` are not used.

        Parameters
        ----------
        url : str, optional
            URL provided in file tvd.yml.
        episode : `tvd.Episode`, optional
            Episode for which resource should be downloaded.
            Useful in case a same URL contains resources for multiple episodes.

        Returns
        -------
        graph : `tvd.Transcription`
        """

        # the graph starts out empty
        graph = Transcription(episode=episode)

        # resource-specific processing goes here...

        return graph
示例#8
0
    def transcript_www(self, url=None, episode=None, **kwargs):
        """Scrape speaker turns from the 'postbody' <div> of a web page.

        Each child of that <div> whose text looks like ``speaker : speech``
        or ``speaker: speech`` yields an attribute-less edge followed by a
        (speaker, speech) edge. Children starting with ``[...]``, as well as
        ``... by : ...``, ``... by: ...`` and ``Credit: ...`` lines, are
        skipped.

        Returns
        -------
        G : Transcription
        """

        # load name mapping
        mapping = self._get_mapping()

        soup = BeautifulSoup(self.download_as_utf8(url))

        G = Transcription(episode=episode)
        t2 = TStart

        # keep the last <div> whose attribute dict reads {'class': ['postbody']}
        transcript = ""
        for candidate in soup.find_all('div'):
            if re.match("{'class': \['postbody'\]}", unicode(candidate.attrs)):
                transcript = candidate

        for node in transcript.contents:
            text = unicode(node)

            # bracketed lines are not speech
            if re.match("\[(.*)\]", text):
                continue

            # find out which separator (if any) splits speaker from speech
            if (re.match("(.*) : (.*)", text)
                    and not re.match("(.*) by : (.*)", text)):
                separator = ' : '
            elif (re.match("(.*): (.*)", text)
                  and not re.match("Credit: (.*)", text)
                  and not re.match("(.*) by: (.*)", text)):
                separator = ': '
            else:
                continue

            ligne = re.split(separator, node)

            # add /empty/ edge between previous and next annotations
            t1 = t2
            t2 = T()
            G.add_edge(t1, t2)

            # nodes of the next annotation
            t1 = t2
            t2 = T()

            # normalise speaker name: lower case, underscores for spaces
            spk = ligne[0].lower().replace(' ', '_')

            # keep only what the first capturing group caught
            found = re.match("(.*)_\(|\[(.*)\)|\]", spk)
            if found:
                spk = found.group(1)

            spk = mapping.get(spk, spk)

            if not re.match("(.*)/(.*)", spk):
                G.add_edge(t1, t2, speaker=spk, speech=ligne[1])
                continue

            # "a/b": one edge for each of the first two names that is mapped
            for name in spk.split('/')[:2]:
                if name in mapping:
                    spk = mapping.get(name)
                    G.add_edge(t1, t2, speaker=spk, speech=ligne[1])

        # add /empty/ edge between previous annotation and episode end
        t1 = t2
        t2 = TEnd
        G.add_edge(t1, t2)

        return G
示例#9
0
    def transcript_www(self, url=None, episode=None, **kwargs):
        """Scrape speaker turns from the 'postbody' <div> of a web page.

        The page is downloaded with `self.download_as_utf8(url)` and parsed
        with BeautifulSoup. Each child of the last <div> whose attribute
        dict reads ``{'class': ['postbody']}`` is inspected:

        * children starting with ``[...]`` are skipped;
        * ``speaker : speech`` lines (but not ``... by : ...``) and
          ``speaker: speech`` lines (but not ``Credit: ...`` nor
          ``... by: ...``) produce an attribute-less edge followed by an
          edge carrying `speaker` and `speech`;
        * everything else is skipped.

        Speaker names are lower-cased, spaces become underscores, and the
        result is translated through `self._get_mapping()` when it has an
        entry. For a name of the form ``a/b``, one edge is added for each of
        the first two parts that has a mapping entry.

        Parameters
        ----------
        url : str, optional
            Address of the page to download.
        episode : optional
            Passed as-is to `Transcription(episode=...)`.

        Returns
        -------
        G : Transcription

        Raises
        ------
        AttributeError
            If the page has no matching 'postbody' <div>.
        """

        # load name mapping
        mapping = self._get_mapping()

        soup = BeautifulSoup(self.download_as_utf8(url))

        G = Transcription(episode=episode)
        t2 = TStart

        # keep the last matching <div>
        transcript = ""
        for candidate in soup.find_all('div'):
            if re.match(r"{'class': \['postbody'\]}",
                        unicode(candidate.attrs)):
                transcript = candidate

        for node in transcript.contents:
            string = unicode(node)

            # bracketed lines are not speech
            if re.match(r"\[(.*)\]", string):
                continue

            # find out which separator (if any) splits speaker from speech
            if (re.match(r"(.*) : (.*)", string)
                    and not re.match(r"(.*) by : (.*)", string)):
                separator = ' : '
            elif (re.match(r"(.*): (.*)", string)
                  and not re.match(r"Credit: (.*)", string)
                  and not re.match(r"(.*) by: (.*)", string)):
                separator = ': '
            else:
                continue

            ligne = re.split(separator, node)

            # add /empty/ edge between previous and next annotations
            t1 = t2
            t2 = T()
            G.add_edge(t1, t2)

            # nodes of the next annotation
            t1 = t2
            t2 = T()

            spk = ligne[0].lower().replace(' ', '_')

            # Keep what precedes "_(" in the name. The 2nd and 3rd
            # alternatives of the pattern leave group 1 unset; in that case
            # the name is left untouched instead of becoming None (which
            # made the regex match below fail with a TypeError).
            match = re.match(r"(.*)_\(|\[(.*)\)|\]", spk)
            if match and match.group(1) is not None:
                spk = match.group(1)

            spk = mapping.get(spk, spk)

            if re.match(r"(.*)/(.*)", spk):
                # NOTE(review): when neither name is mapped no edge is
                # added for this line -- confirm this is intended.
                for name in spk.split('/')[:2]:
                    if name in mapping:
                        G.add_edge(t1, t2,
                                   speaker=mapping.get(name),
                                   speech=ligne[1])
            else:
                G.add_edge(t1, t2, speaker=spk, speech=ligne[1])

        # add /empty/ edge between previous annotation and episode end
        t1 = t2
        t2 = TEnd
        G.add_edge(t1, t2)

        return G
示例#10
0
    def outline_www(self, url=None, episode=None, **kwargs):
        """Extract locations and events from the 'episode outline' of a page.

        Parameters
        ----------
        url : str
            URL where resource is available
        episode : Episode, optional
            Episode for which resource should be downloaded
            Useful in case a same URL contains resources for multiple episodes.

        Returns
        -------
        G : Transcription
        """

        unescaper = html_parser.HTMLParser()
        page = self.download_as_utf8(url)

        # Strip markup, in this order. <li> is turned into an '@EVENT' marker
        # before the generic tag removal: events are always items in a list,
        # which avoids relying on roman numerals ('IXV.' etc.) to find them.
        for pattern, replacement in (
                ('<script[^<]+</script>', ''),
                ('<style[^<]+</style>', ''),
                ('<div[^<]+</div>', ''),
                ('<li>', '@EVENT'),
                ('<[^>]+>', ''),
        ):
            page = re.sub(pattern, replacement, page)
        lines = page.split('\n')

        # any of these (case-insensitive) ends the outline section
        end_markers = (
            '\A[ \t]*resources[ \t]*\Z',
            '\A\[*[0-9]*\]*timeline[ \t]*\Z',
            '\A[ \t]*commentary and trivia[ \t]*\Z',
            '\A[ \t]*trivia[ \t]*\Z',
            '\A[ \t]*Still to come[ \t]*\Z',
        )

        # non-event lines containing one of these words are assumed to be
        # 'Titles and opening theme', 'Titles and credits', 'Opening themes
        # and credits', 'Title and Opening Themes', 'Theme song and titles'
        # or something similar, and are ignored
        ignored_words = ('title', 'credit', 'theme')

        G = Transcription(episode=episode)

        t_location_prev = TStart
        t_event_prev = None

        in_outline = False

        for raw in lines:

            # decode HTML entities, e.g. "don&#8217;t feed the .." to Unicode
            line = unescaper.unescape(raw)

            # empty line
            if re.search('\A[ \t\n\r]*\Z', line):
                continue

            # start of episode outline section
            if re.search('\A[ \t]*episode outline[ \t]*\Z', line,
                         re.IGNORECASE):
                in_outline = True
                continue

            if not in_outline:
                continue

            # end of episode outline (or empty content)
            if any(re.search(marker, line, re.IGNORECASE)
                   for marker in end_markers):
                break

            if '@EVENT' in line:
                # event inside the current location
                event_ = ' '.join(line.split()).replace('@EVENT', '')
                event_start = T()
                event_stop = T()
                G.add_edge(t_event_prev, event_start)
                G.add_edge(event_start, event_stop, event=event_)
                t_event_prev = event_stop
                continue

            # from here on: new location description
            if any(re.search(word, line, re.IGNORECASE)
                   for word in ignored_words):
                continue

            # finish the edge for previous location section
            if t_event_prev:
                G.add_edge(t_event_prev, t_location_prev)

            # remove leading roman numeral
            location_ = re.sub('\A[ \t]*[IVX]+[\.:]+[ \t]*', '', line)

            location_start = T()
            G.add_edge(t_location_prev, location_start)
            location_stop = T()
            G.add_edge(location_start, location_stop, location=location_)

            t_location_prev = location_stop
            t_event_prev = location_start

        G.add_edge(t_event_prev, t_location_prev)
        G.add_edge(t_location_prev, TEnd)

        return G
示例#11
0
    def transcript(self, url=None, episode=None, **kwargs):
        """Build a graph of scenes and speech turns from a transcript resource.

        Each line yielded by `self.iterlines(path)` is split on whitespace.
        A line whose first token is ``SCENE`` opens a new scene named after
        the rest of the line; any other line is a speech turn whose first
        token is the speaker and whose remaining tokens are the speech.
        Turns are chained from the start of their scene, and the last turn
        of a scene is linked to that scene's end.

        Returns
        -------
        transcription : Transcription
        """
        path = resource_filename(self.__class__.__name__, url)
        graph = Transcription(episode=episode)

        # end of the current scene / of the latest turn (None until seen)
        scene_end = None
        turn_end = None

        for line in self.iterlines(path):

            words = line.split()
            keyword = words[0]
            remainder = ' '.join(words[1:])

            if keyword != 'SCENE':
                # speech turn: keyword is the speaker, remainder the speech
                start, stop = T(), T()
                graph.add_edge(start, stop,
                               speaker=keyword, speech=remainder)

                # chain it after the previous turn (or the scene start)
                graph.add_edge(turn_end, start)

                turn_end = stop
                continue

            # new scene: first close the previous one
            if turn_end is not None:
                graph.add_edge(turn_end, scene_end)

            start, stop = T(), T()
            graph.add_edge(start, stop, scene=remainder)

            # connect scene with previous scene
            if scene_end is not None:
                graph.add_edge(scene_end, start)

            scene_end = stop

            # turns of this scene are chained from the scene start
            turn_end = start

        # connect last turn with the end of the last scene
        graph.add_edge(turn_end, scene_end)

        return graph
示例#12
0
    def transcript_www(self, url=None, episode=None, debug=True, **kwargs):
        """Scrape scenes and speech turns from an 'entrytext' <div>.

        Lines produced by `self._manual_transcript_line_iterator` are matched
        against ``left: right``. ``scene``/``secne`` lines open a new
        `location` edge, ``story``/``teleplay`` lines are dropped, and every
        other matching line becomes a speech turn with `speaker`, `speech`
        and, when present, `directions` attributes. Lines that do not match
        are skipped (and printed when `debug` is true).

        Returns
        -------
        G : Transcription
        """

        # canonical speaker name -> spellings found in transcripts
        # NOTE(review): 'mike' is listed under both 'mie_massimino' and
        # 'mike'; which one wins depends on dict iteration order.
        SPEAKER_MAPPING = {
            'abby': ['abby'],
            'alice': ['alice'],
            'alicia': ['alicia'],
            'amy': ['amy'],
            'angela': ['angela'],
            'barry_kripke': ['barry', 'barry kripke', 'kripke'],
            'bernadette': ['bermadette', 'bernadette'],
            'bethany': ['bethany'],
            'beverley': ['beverley'],
            'brent_spiner': ['brent', 'brent spiner'],
            'charlie_sheen': ['charlie sheen'],
            'christie': ['christie'],
            'dale': ['dale'],
            'david': ['david'],
            'dennis': ['dennis'],
            'dimitri': ['dimitri'],
            'doug': ['doug'],
            'dr_gablehouser': ['gablehauser', 'gablehouser', 'dr gablehouser'],
            'dr_greene': ['dr. brian greene', 'greene'],
            'dr_hofstadter': ["leonard's mother", 'dr hofstadter'],
            'dr_koothrappali': ['dr koothrappali'],
            'dr_massimino': ['dr massimino'],
            'dr_millstone': ['dr millstone'],
            'dr_seibert': ['seibert', 'siebert', 'dr. seibert'],
            'dr_tyson': ['dr tyson'],
            'elizabeth': ['elizabeth'],
            'eric': ['eric'],
            'george_smoot': ['george smoot'],
            'george_takei': ['george takei'],
            'glenn': ['glenn'],
            'hawking': ['hawking'],
            'houston': ['houston'],
            'howard': ['howard', 'past howard'],
            'ira': ['ira'],
            'jimmy': ['jimmy'],
            'joy': ['joy'],
            'joyce_kim': ['joyce kim'],
            'kathy_sackhoff': ['katee', 'katee sackhoff', 'kathy'],
            'kevin': ['kevin'],
            'kurt': ['kurt'],
            'lakshmi': ['lakshmi'],
            'lalita': ['lalita'],
            'laura': ['laura'],
            'leonard': ['leonard', 'past leonard'],
            'leslie_winkle': ['leslie winkle', 'leslie', 'lesley'],
            'martha': ['martha'],
            'mie_massimino': ['mike', 'mike_massimino', 'massimino'],
            'michaela': ['michaela'],
            'mike': ['mike'],
            'missy': ['missy'],
            'mr_rostenkowski': ['mr rostenkowski', 'mr. rostenkowski'],
            'mrs_cooper': ['mrs cooper'],
            'mrs_fowler': ['mrs fowler'],
            'mrs_gunderson': ['mrs gunderson'],
            'mrs_koothrappali': ['mrs koothrappali'],
            'mrs_latham': ['mrs latham'],
            'mrs_wolowitz': ['mrs wolowitz', "howard's mother"],
            'page': ['page'],
            'penny': ['penny', 'past penny'],
            'penny_dad': ["penny's dad"],
            'pr_crawley': ['prof crawley'],
            'pr_goldfarb': ['goldfarb'],
            'pr_laughlin': ['prof laughlin'],
            'priya': ['priya'],
            'raj': ['raj', 'past raj', 'rai'],
            'roeger': ['roeger'],
            'rothman': ['rothman'],
            'sarah': ['sarah'],
            'sheldon': [
                'sheldon',
                'sgeldon',
                'sheldon on laptop screen',
                "sheldon's voice",
                'past sheldon',
                'on-screen sheldon',
            ],
            'stan_lee': ['stan lee'],
            'steph': ['steph'],
            'steve_wozniak': ['steve wozniak'],
            'stuart': ['stuart'],
            'summer': ['summer'],
            'toby': ['toby'],
            'todd': ['todd'],
            'tom': ['tom'],
            'venkatesh': ['venkatesh'],
            'wil_wheaton': ['wil', 'wil wheaton'],
            'wyatt': ['wyatt'],
            'zack': ['zack'],
        }

        # invert the table: spelling -> canonical name
        speaker_mapping = {}
        for canonical, spellings in six.iteritems(SPEAKER_MAPPING):
            for spelling in spellings:
                speaker_mapping[spelling] = canonical

        # matches "xxxxx: yyyyyy"
        REGEXP_DIALOGUE = '\A\s*([^:]+?)\s*:\s*(.*)\Z'

        # download webpage and parse it with BeautifulSoup
        soup = BeautifulSoup(self.download_as_utf8(url))

        # the transcript lives in the first <div class='entrytext'> ... </div>
        div = soup.findAll('div', attrs={'class': 'entrytext'})[0]

        # initialize empty annotation graph
        G = Transcription(episode=episode)

        # end of previous scene (the episode start, initially)
        tscene = TStart
        # end of previous speech turn
        tspeech = None

        for text in self._manual_transcript_line_iterator(div):

            m = re.match(REGEXP_DIALOGUE, text)

            if not m:
                # not a "left: right" line, e.g.
                # (They sit ...).
                # (Leonard starts rattling.)
                # u'Credits sequence.'
                # u'Time shift, Leonard and Sheldon are now ...'
                # u'Cut to Leonard entering living room in panic, ...'
                # u'Written by...'
                # u'(Time shift)'
                if debug:
                    print("SKIPPING: %s" % repr(text))
                continue

            # left: right, as in
            # Scene: blah blah blah
            # Sheldon: blah blah blah
            # Teleplay: blah blah blah
            left, right = m.groups()
            left = left.strip().lower()
            right = right.strip()

            # Teleplay: Bill Prady & Steven Molaro
            # Story: Chuck Lorre
            if left in {'story', 'teleplay'}:
                continue

            # Scene: location (including the 'secne' misspelling)
            if left in {u'scene', u'secne'}:

                scene_start = T()
                scene_end = T()
                G.add_edge(scene_start, scene_end, location=right.strip())

                # make sure it is connected to the previous scene
                G.add_edge(tscene, scene_start)

                # make sure last speech turn of previous scene
                # is correctly connected to end of previous scene
                if tspeech:
                    G.add_edge(tspeech, tscene)

                # update end of previous scene/speech
                tscene = scene_end
                tspeech = scene_start
                continue

            # that's what we are really looking for:
            # speaker_name: speech

            # split stage directions from speaker name
            # e.g. "penny (to raj)" becomes ("penny", ["to raj", ])
            speaker, speaker_directions = self._get_directions(left)
            speaker = speaker.strip()

            # split stage directions from speech
            # "hey sheldon (laughing). where is your spot?"
            # becomes "hey sheldon . where is your spot?", ["laughing",]
            speech, speech_directions = self._get_directions(right)
            speech = speech.strip()

            # gather all stage directions into one long string
            directions = u' '.join(speaker_directions + speech_directions)

            if speaker in speaker_mapping:
                label = speaker_mapping[speaker]
            else:
                warnings.warn('no mapping for speaker "%s"' % speaker)
                label = 'unknown_%s' % '_'.join(speaker.split())

            # build annotation data
            # (with directions only if they exist)
            data = {'speaker': label, 'speech': speech}
            if directions:
                data['directions'] = directions

            # add the new speech turn to the graph
            turn_start = T()
            turn_end = T()
            G.add_edge(turn_start, turn_end, **data)

            # make sure it is connected to the previous speech turn
            G.add_edge(tspeech, turn_start)

            # update end of previous speech turn
            tspeech = turn_end

        # make sure last speech turn is correctly connected to end of last scene
        G.add_edge(tspeech, tscene)

        # make sure last scene is correctly connected to episode end
        G.add_edge(tscene, TEnd)

        return G
示例#13
0
    def outline_www(self, url=None, episode=None, **kwargs):
        """Build an episode outline from a web page.

        The page is downloaded, stripped of its markup and scanned line by
        line. Only lines found after the 'episode outline' heading are used,
        and scanning stops at the first heading that closes the outline
        ('resources', 'timeline', 'commentary and trivia', 'trivia' or
        'still to come'). List items (``<li>``) become `event` edges; any
        other line becomes a `location` edge (once its leading roman numeral
        has been removed), except lines mentioning title/credit/theme, which
        are ignored.

        Parameters
        ----------
        url : str
            URL where resource is available
        episode : Episode, optional
            Episode for which resource should be downloaded
            Useful in case a same URL contains resources for multiple episodes.

        Returns
        -------
        G : Transcription
        """

        # HTMLParser.unescape() was removed in Python 3.9: prefer
        # html.unescape() and only fall back to the parser method when the
        # `html` module does not provide it.
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        r = self.download_as_utf8(url)

        r = re.sub(r'<script[^<]+</script>', '', r)
        r = re.sub(r'<style[^<]+</style>', '', r)
        r = re.sub(r'<div[^<]+</div>', '', r)
        # Alternate way to detect events, without depending on 'IXV.' etc.
        # -> events are always items in a list.
        r = re.sub('<li>', '@EVENT', r)
        r = re.sub(r'<[^>]+>', '', r)
        lines = r.split('\n')

        # patterns are compiled once, outside of the line loop
        re_blank = re.compile(r'\A[ \t\n\r]*\Z')
        re_outline_start = re.compile(
            r'\A[ \t]*episode outline[ \t]*\Z', re.IGNORECASE)
        re_outline_end = [
            re.compile(pattern, re.IGNORECASE)
            for pattern in (
                r'\A[ \t]*resources[ \t]*\Z',
                r'\A\[*[0-9]*\]*timeline[ \t]*\Z',
                r'\A[ \t]*commentary and trivia[ \t]*\Z',
                r'\A[ \t]*trivia[ \t]*\Z',
                r'\A[ \t]*Still to come[ \t]*\Z',
            )
        ]
        # 'Titles and opening theme', 'Titles and credits',
        # 'Opening themes and credits', 'Title and Opening Themes',
        # 'Theme song and titles', ...
        re_ignored = re.compile('title|credit|theme', re.IGNORECASE)
        re_roman_numeral = re.compile(r'\A[ \t]*[IVX]+[\.:]+[ \t]*')

        G = Transcription(episode=episode)

        t_episode_start = TStart
        t_episode_stop = TEnd

        # end of the previous location section
        t_location_prev = t_episode_start
        # end of the previous event (or start of the current location when
        # it has no event yet); None until a first location or event is seen
        t_event_prev = None

        in_outline = False

        for line in lines:

            # decode HTML entities, e.g. "don&#8217;t feed the .."
            line = unescape(line)

            # empty line
            if re_blank.search(line):
                continue

            # start of episode outline section
            if re_outline_start.search(line):
                in_outline = True
                continue

            # everything before the outline section is ignored
            if not in_outline:
                continue

            # end of episode outline (or empty content)
            if any(pattern.search(line) for pattern in re_outline_end):
                break

            # new location description
            # (do NOT rely on the roman numeral prefix to detect it)
            if '@EVENT' not in line:

                # assume it is 'Titles and opening theme' or similar: ignore
                if re_ignored.search(line):
                    continue

                # finish the edge for previous location section
                if t_event_prev is not None:
                    G.add_edge(t_event_prev, t_location_prev)

                # remove roman numeral
                location_ = re_roman_numeral.sub('', line)

                t_location_start = T()
                G.add_edge(t_location_prev, t_location_start)
                t_location_stop = T()
                G.add_edge(t_location_start, t_location_stop,
                           location=location_)

                t_location_prev = t_location_stop
                t_event_prev = t_location_start

            # new event
            else:

                event_ = ' '.join(line.split()).replace('@EVENT', '')

                t_event_start = T()
                t_event_stop = T()

                # an event listed before any location has nothing to be
                # chained to: never use None as a node of the graph
                if t_event_prev is not None:
                    G.add_edge(t_event_prev, t_event_start)

                G.add_edge(t_event_start, t_event_stop, event=event_)
                t_event_prev = t_event_stop

        # connect last event to the end of last location (when there is one)
        if t_event_prev is not None:
            G.add_edge(t_event_prev, t_location_prev)

        # connect last location to episode end
        G.add_edge(t_location_prev, t_episode_stop)

        return G
示例#14
0
    def transcript(self, url=None, episode=None, **kwargs):
        """Load a manual transcript from a package resource.

        Every line of the resource is split on whitespace. When its first
        token is the literal 'SCENE', the rest of the line describes a new
        scene; otherwise the first token is the speaker and the rest of the
        line is the speech.

        Parameters
        ----------
        url : str
            Resource name, resolved with `resource_filename` against the
            name of this class.
        episode : Episode, optional
            Episode passed to the returned transcription.

        Returns
        -------
        transcription : Transcription
            One edge per scene (`scene` attribute) and one edge per dialogue
            line (`speaker` and `speech` attributes), chained together with
            attribute-less edges.
        """

        path = resource_filename(self.__class__.__name__, url)
        transcription = Transcription(episode=episode)

        # previous scene end time
        e_scene = None

        # previous dialogue end time
        e_dialogue = None

        for line in self.iterlines(path):

            tokens = line.split()

            # nothing to parse on a blank line
            if not tokens:
                continue

            left = tokens[0]
            right = ' '.join(tokens[1:])

            # new scene
            if left == 'SCENE':

                # connect previous dialogue line with scene end time
                # (only when both exist: None must never become a node)
                if e_dialogue is not None and e_scene is not None:
                    transcription.add_edge(e_dialogue, e_scene)

                # new scene
                _s_scene, _e_scene = T(), T()
                transcription.add_edge(_s_scene, _e_scene, scene=right)

                # connect scene with previous scene
                if e_scene is not None:
                    transcription.add_edge(e_scene, _s_scene)

                # update previous scene end time
                e_scene = _e_scene

                # artificially set previous dialogue end time to scene
                # start, so that the first dialogue line of the scene is
                # chained to it
                e_dialogue = _s_scene

            # new dialogue line
            else:

                # new dialogue
                _s_dialogue, _e_dialogue = T(), T()
                transcription.add_edge(_s_dialogue, _e_dialogue,
                                       speaker=left, speech=right)

                # connect dialogue with previous dialogue
                # (there is none for a dialogue line found before any scene)
                if e_dialogue is not None:
                    transcription.add_edge(e_dialogue, _s_dialogue)

                # update previous dialogue end time
                e_dialogue = _e_dialogue

        # connect last dialogue line with last scene end time
        if e_dialogue is not None and e_scene is not None:
            transcription.add_edge(e_dialogue, e_scene)

        return transcription
示例#15
0
    def transcript_www(self, url=None, episode=None, debug=True, **kwargs):
        """Build a transcription from a manual transcript web page.

        The page is downloaded and parsed with BeautifulSoup; the transcript
        is read from its first ``<div class='entrytext'>``. Every line shaped
        like "left: right" is either a new scene ("Scene: location"), a
        credit line ("Story: ..." or "Teleplay: ...", ignored) or a speech
        turn ("speaker: speech"). Any other line is skipped.

        Parameters
        ----------
        url : str
            URL where the transcript is available.
        episode : Episode, optional
            Episode passed to the returned transcription.
        debug : bool, optional
            When True (default), print the lines that are skipped.

        Returns
        -------
        G : Transcription
            One edge per scene (`location` attribute) and one edge per
            speech turn (`speaker`, `speech` and, when there are stage
            directions, `directions` attributes).
        """

        # normalized speaker identifier --> spellings found in transcripts
        # NOTE(review): 'mie_massimino' looks like a typo of
        # 'mike_massimino', and the alias 'mike' is listed under both
        # 'mie_massimino' and 'mike' (whichever is iterated last wins in the
        # reverse mapping below) -- confirm before changing these labels.
        SPEAKER_MAPPING = {
            'abby': ['abby', ],
            'alice': ['alice', ],
            'alicia': ['alicia', ],
            'amy': ['amy', ],
            'angela': ['angela', ],
            'barry_kripke': ['barry', 'barry kripke', 'kripke'],
            'bernadette': ['bermadette', 'bernadette', ],
            'bethany': ['bethany', ],
            'beverley': ['beverley', ],
            'brent_spiner': ['brent', 'brent spiner'],
            'charlie_sheen': ['charlie sheen', ],
            'christie': ['christie', ],
            'dale': ['dale', ],
            'david': ['david', ],
            'dennis': ['dennis', ],
            'dimitri': ['dimitri', ],
            'doug': ['doug', ],
            'dr_gablehouser': [
                'gablehauser', 'gablehouser', 'dr gablehouser', ],
            'dr_greene': ['dr. brian greene', 'greene', ],
            'dr_hofstadter': ["leonard's mother", 'dr hofstadter', ],
            'dr_koothrappali': ['dr koothrappali', ],
            'dr_massimino': ['dr massimino', ],
            'dr_millstone': ['dr millstone', ],
            'dr_seibert': ['seibert', 'siebert', 'dr. seibert', ],
            'dr_tyson': ['dr tyson', ],
            'elizabeth': ['elizabeth', ],
            'eric': ['eric', ],
            'george_smoot': ['george smoot', ],
            'george_takei': ['george takei', ],
            'glenn': ['glenn', ],
            'hawking': ['hawking', ],
            'houston': ['houston', ],
            'howard': ['howard', 'past howard', ],
            'ira': ['ira', ],
            'jimmy': ['jimmy', ],
            'joy': ['joy', ],
            'joyce_kim': ['joyce kim', ],
            'kathy_sackhoff': ['katee', 'katee sackhoff', 'kathy'],
            'kevin': ['kevin', ],
            'kurt': ['kurt', ],
            'lakshmi': ['lakshmi', ],
            'lalita': ['lalita', ],
            'laura': ['laura', ],
            'leonard': ['leonard', 'past leonard', ],
            'leslie_winkle': ['leslie winkle', 'leslie', 'lesley'],
            'martha': ['martha', ],
            'mie_massimino': ['mike', 'mike_massimino', 'massimino', ],
            'michaela': ['michaela', ],
            'mike': ['mike', ],
            'missy': ['missy', ],
            'mr_rostenkowski': ['mr rostenkowski', 'mr. rostenkowski', ],
            'mrs_cooper': ['mrs cooper', ],
            'mrs_fowler': ['mrs fowler', ],
            'mrs_gunderson': ['mrs gunderson', ],
            'mrs_koothrappali': ['mrs koothrappali', ],
            'mrs_latham': ['mrs latham', ],
            'mrs_wolowitz': ['mrs wolowitz', "howard's mother", ],
            'page': ['page', ],
            'penny': ['penny', 'past penny', ],
            'penny_dad': ["penny's dad", ],
            'pr_crawley': ['prof crawley', ],
            'pr_goldfarb': ['goldfarb', ],
            'pr_laughlin': ['prof laughlin', ],
            'priya': ['priya', ],
            'raj': ['raj', 'past raj', 'rai', ],
            'roeger': ['roeger', ],
            'rothman': ['rothman', ],
            'sarah': ['sarah', ],
            'sheldon': [
                'sheldon',
                'sgeldon',
                'sheldon on laptop screen',
                "sheldon's voice",
                'past sheldon',
                'on-screen sheldon',
            ],
            'stan_lee': ['stan lee', ],
            'steph': ['steph', ],
            'steve_wozniak': ['steve wozniak', ],
            'stuart': ['stuart', ],
            'summer': ['summer', ],
            'toby': ['toby', ],
            'todd': ['todd', ],
            'tom': ['tom', ],
            'venkatesh': ['venkatesh', ],
            'wil_wheaton': ['wil', 'wil wheaton', ],
            'wyatt': ['wyatt', ],
            'zack': ['zack', ],
        }

        # reverse mapping: spelling found in transcript --> normalized name
        speaker_mapping = {
            old: new
            for new, olds in six.iteritems(SPEAKER_MAPPING)
            for old in olds
        }

        # try to match xxxxx: yyyyyy
        # (compiled once, outside of the line loop)
        regexp_dialogue = re.compile(r'\A\s*([^:]+?)\s*:\s*(.*)\Z')

        # download webpage and parse it with BeautifulSoup
        soup = BeautifulSoup(self.download_as_utf8(url))

        # extract the following <div> containing the actual transcript
        # <div class='entrytext'> ... </div>
        div = soup.findAll('div', attrs={'class': 'entrytext'})[0]

        # initialize empty annotation graph
        G = Transcription(episode=episode)

        # episode start and end
        tstart = TStart
        tend = TEnd

        # tscene contains end of previous scene
        tscene = tstart
        # tspeech contains end of previous speech turn
        # (None as long as no scene nor speech turn has been found)
        tspeech = None

        for text in self._manual_transcript_line_iterator(div):

            m = regexp_dialogue.match(text)

            if not m:
                # (They sit ...).
                # (Leonard starts rattling.)
                # u'Credits sequence.'
                # u'Credit Sequence'
                # u'Time shift, Leonard and Sheldon are now ...'
                # u'Cut to Leonard entering living room in panic, ...'
                # u'Written by...'
                # u'(Time shift)'

                if debug:
                    print("SKIPPING: %s" % repr(text))

                continue

            # if there is a match, we are in one of the following situations:
            # Scene: blah blah blah
            # Sheldon: blah blah blah
            # Teleplay: blah blah blah
            # ...
            # left: right
            left, right = m.groups()
            left = left.strip().lower()
            right = right.strip()

            # if we are in one of the following situations,
            # then we found a new scene
            # Scene: location
            # scene: location
            if left in {u'scene', u'secne'}:

                # add the new scene to the graph
                t1 = T()  # start time
                t2 = T()  # end time
                G.add_edge(t1, t2, location=right)

                # make sure it is connected to the previous scene
                G.add_edge(tscene, t1)

                # make sure last speech turn of previous scene
                # is correctly connected to end of previous scene
                if tspeech is not None:
                    G.add_edge(tspeech, tscene)

                # update end of previous scene/speech
                tscene = t2
                tspeech = t1

            # if we are in one of the following situations
            # Teleplay: Bill Prady & Steven Molaro
            # Story: Chuck Lorre
            elif left in {'story', 'teleplay'}:
                continue

            # that's what we are really looking for:
            # speaker_name: speech
            else:

                # remove stage directions from speaker name
                # e.g. "penny (to raj)" becomes ("penny", ["to raj", ])
                speaker, speaker_directions = self._get_directions(left)
                speaker = speaker.strip()

                # remove stage directions from speech
                # "hey sheldon (laughing). where is your spot?"
                # becomes "hey sheldon . where is your spot?", ["laughing",]
                speech, speech_directions = self._get_directions(right)
                speech = speech.strip()

                # gather all stage directions into one long string
                directions = u' '.join(speaker_directions + speech_directions)

                # debug
                if speaker not in speaker_mapping:
                    warnings.warn('no mapping for speaker "%s"' % speaker)

                # build annotation data
                # (with directions only if they exist)
                data = {
                    'speaker': speaker_mapping.get(
                        speaker, 'unknown_%s' % '_'.join(speaker.split())),
                    'speech': speech,
                }
                if directions:
                    data['directions'] = directions

                # add the new speech turn to the graph
                t1 = T()
                t2 = T()
                G.add_edge(t1, t2, **data)

                # make sure it is connected to the previous speech turn
                # (a speech turn found before the first scene has no
                # predecessor: never use None as a node of the graph)
                if tspeech is not None:
                    G.add_edge(tspeech, t1)

                # update end of previous speech turn
                tspeech = t2

        # make sure last speech turn is correctly connected to end of last
        # scene (when at least one scene or speech turn was found)
        if tspeech is not None:
            G.add_edge(tspeech, tscene)

        # make sure last scene is correctly connected to episode end
        G.add_edge(tscene, tend)

        return G