Пример #1
0
    def test_init_collection_od(self):
        """Exercise init_collection_od with a plain story and with a collection entry."""
        root = GIT_THEMING_PATH_HIST

        # a plain story entry is expected to yield None rather than a collection record
        film = webobject.Story(
            name='movie: A Trip to the Moon (1902)',
            title='A Trip to the Moon',
            date='2020-09-22',
            description='A classic film.',
            meta=json.dumps({'source': './a/token/path'}),
        )
        result = webtask.cache_collections.init_collection_od(film, root)
        self.assertEqual(result, None)

        # a collection entry is expected to be copied field by field, with the
        # list-valued fields starting out empty
        entry = webobject.Story(
            name='Collection: Akira Kurosawa',
            title='Akira Kurosawa',
            date='1949-1985',
            description='Films written or directed by Akira Kurosawa.',
            meta=json.dumps({'source': './a/token/path'}),
        )
        record = webtask.cache_collections.init_collection_od(entry, root)
        expected_fields = [
            ('collection-id', entry.name),
            ('title', entry.title),
            ('date', entry.date),
            ('genres', []),
            ('description', entry.description),
            ('references', []),
            ('component-story-ids', []),
        ]
        for key, expected in expected_fields:
            self.assertEqual(record[key], expected)
Пример #2
0
    def test_init_story_od(self):
        """Exercise cache_collections.init_story_od with a collection entry and a multi-collection story."""
        root = GIT_THEMING_PATH_HIST

        # a collection entry is expected to yield None rather than a story record
        kurosawa = webobject.Story(
            name='Collection: Akira Kurosawa',
            title='Akira Kurosawa',
            date='1949-1985',
            description='Films written or directed by Akira Kurosawa.',
            meta=json.dumps({'source': './a/token/path'}),
        )
        rejected = webtask.cache_collections.init_story_od(kurosawa, root)
        self.assertEqual(rejected, None)

        # a story listed in several collections is expected to report all of them
        first_collection = 'Collection: Science fiction films of the 1980s'
        second_collection = 'Collection: The Terminator'
        memberships = [first_collection, second_collection]
        terminator = webobject.Story(
            name='movie: The Terminator (1984)',
            title='The Terminator',
            date='1984-10-26',
            description='A classic film.',
            collections=first_collection + '\n' + second_collection,
            meta=json.dumps({'source': './a/token/path'}),
        )
        record = webtask.cache_collections.init_story_od(terminator, root)
        self.assertEqual(record['story-id'], terminator.name)
        self.assertEqual(record['collection-ids'], memberships)
Пример #3
0
    def test_init_story_od(self):
        """Exercise cache_data.init_story_od: collection rejection, description, references and source."""
        root = GIT_THEMING_PATH_HIST

        # a collection entry is expected to yield None rather than a story record
        token_collection = webobject.Story(
            name='Collection: a token collection',
            title='a token collection',
            date='2020-09-22',
            description='a description',
            meta=json.dumps({'source': './a/token/path'}),
        )
        rejected = webtask.cache_data.init_story_od(token_collection, root)
        self.assertEqual(rejected, None)

        # reference information trailing the description text is expected to be stripped
        clean_description = "A classic film."
        with_trailing_refs = webobject.Story(
            name='movie: Robot Monster (1953)',
            title='Robot Monster',
            date='1953-06-24',
            description='A classic film.\n\n\nReferences: a reference',
            meta=json.dumps({'source': './a/token/path'}),
        )
        record = webtask.cache_data.init_story_od(with_trailing_refs, root)
        self.assertEqual(record['description'], clean_description)

        # newline-separated references are expected to come back as a list
        wanted_references = ['a reference', 'another reference']
        with_two_refs = webobject.Story(
            name='movie: Robot Monster (1953)',
            title='Robot Monster',
            date='1953-06-24',
            description='A classic film.',
            references='\n'.join(wanted_references),
            meta=json.dumps({'source': './a/token/path'}),
        )
        record = webtask.cache_data.init_story_od(with_two_refs, root)
        self.assertEqual(record['references'], wanted_references)

        # a source path under the base path is expected to be reported relative to it
        fragment = '/notes/stories/film/film-scifi-1950s.st.txt'
        wanted_source = '.' + fragment
        with_real_source = webobject.Story(
            name='movie: Robot Monster (1953)',
            title='Robot Monster',
            date='1953-06-24',
            description='A classic film.',
            meta=json.dumps({'source': root + fragment}),
        )
        record = webtask.cache_data.init_story_od(with_real_source, root)
        self.assertEqual(record['source'], wanted_source)
Пример #4
0
    def init_collections_list(self):
        """Check the size and ordering of the list built by cache_collections.init_collections_list."""
        # NOTE(review): this method name lacks the `test` prefix, so unittest's
        # default loader would presumably not collect it — confirm this is intended.
        root = GIT_THEMING_PATH_HIST

        # four Story objects: three of the collection variety and one plain story
        shakespeare = webobject.Story(
            name='Collection: William Shakespeare Plays',
            title='William Shakespeare Plays',
            date='1602-1606',
            description='Plays written by the English playwright, poet, and actor William Shakespeare.',
            meta=json.dumps({'source': './a/token/path'}),
        )
        plain_story = webobject.Story(
            name='movie: A Trip to the Moon (1902)',
            title='A Trip to the Moon',
            date='1902-10-04',
            description='A classic film',
            meta=json.dumps({'source': './a/token/path'}),
        )
        xmen = webobject.Story(
            name='Collection: X-men',
            title='X-men',
            date='2000-2019',
            description='The X-Men is an American superhero film series.',
            meta=json.dumps({'source': './a/token/path'}),
        )
        kurosawa = webobject.Story(
            name='Collection: Akira Kurosawa',
            title='Akira Kurosawa',
            date='1949-1985',
            description='Films written or directed by Akira Kurosawa.',
            meta=json.dumps({'source': './a/token/path'}),
        )
        all_entries = [shakespeare, plain_story, xmen, kurosawa]

        # every entry except the single plain story is expected to be returned
        wanted_count = len(all_entries) - 1
        built = webtask.cache_collections.init_collections_list(
            all_entries, root)
        self.assertEqual(len(built), wanted_count)

        # the collections are expected in alphabetical order of title
        wanted_order = [kurosawa.name, shakespeare.name, xmen.name]
        for position, wanted_id in enumerate(wanted_order):
            self.assertEqual(built[position]['collection-id'], wanted_id)
Пример #5
0
    def test_populate_collections_with_component_stories_1(self):
        """Check that a collection entry's own component listing ends up in the collection record."""
        # a single collection record with no component stories yet
        record = OrderedDict([
            ('collection-id', 'Collection: Akira Kurosawa'),
            ('title', 'Akira Kurosawa'),
            ('date', '1949-1985'),
            ('description', 'Films written or directed by Akira Kurosawa.'),
            ('component-story-ids', []),
        ])

        # the matching collection Story object, listing its components one per line
        wanted_ids = [
            'movie: Dersu Uzala (1975)',
            'movie: High and Low (1963)',
            'movie: Ikiru (1952)',
            'movie: Ran (1985)',
            'movie: Rashomon (1950)',
            'movie: Red Beard (1965)',
            'movie: Sanjuro (1962)',
            'movie: Seven Samurai (1954)',
            'movie: Stray Dog (1949)',
            'movie: The Hidden Fortress (1958)',
            'movie: Throne of Blood (1957)',
            'movie: Yojimbo (1961)',
        ]
        entry = webobject.Story(
            name='Collection: Akira Kurosawa',
            title='Akira Kurosawa',
            date='1949-1985',
            description='Films written or directed by Akira Kurosawa.',
            collections='Collection: Akira Kurosawa',
            components='\n'.join(wanted_ids),
            meta=json.dumps({'source': './a/token/path'}),
        )

        # the record is expected to list exactly the components named by the entry
        updated = webtask.cache_collections.populate_collections_with_component_stories_1(
            [record], [entry])
        self.assertEqual(updated[0]['component-story-ids'], wanted_ids)
Пример #6
0
    def test_populate_stories_with_collection_info(self):
        """Check that a collection's component listing is reflected onto the story record."""
        # a collection Story object naming its component stories, one per line
        member_names = [
            "movie: Alien (1979)",
            "movie: Aliens (1986)",
            "movie: Alien 3 (1992)",
            "movie: Alien Resurrection (1997)",
            "movie: Prometheus (2012)",
            "movie: Alien: Covenant (2017)",
        ]
        franchise = webobject.Story(
            name='Collection: Alien',
            title='Alien',
            date='1979-2017',
            description='A classic film franchise.',
            components='\n'.join(member_names),
            meta=json.dumps({'source': './a/token/path'}),
        )

        # a story record that does not belong to any collection yet
        story_record = OrderedDict([
            ('story-id', "movie: Alien (1979)"),
            ('title', "Alien"),
            ('date', "1979-05-25"),
            ('description', "A classic film."),
            ('source', json.dumps({'source': './a/token/path'})),
            ('collections', []),
        ])

        # the story record is expected to gain exactly the one collection id
        wanted = ["Collection: Alien"]
        updated = webtask.cache_data.populate_stories_with_collection_info(
            [franchise], [story_record])
        self.assertEqual(len(updated[0]['collections']), len(wanted))
        self.assertEqual(updated[0]['collections'], wanted)
Пример #7
0
    def test_populate_collections_with_component_stories_2(self):
        """Check that stories naming a collection in their own entry end up in that collection record."""
        # a single collection record with no component stories yet
        record = OrderedDict([
            ('collection-id', 'Collection: Akira Kurosawa'),
            ('title', 'Akira Kurosawa'),
            ('date', '1949-1985'),
            ('description', 'Films written or directed by Akira Kurosawa.'),
            ('component-story-ids', []),
        ])

        # three Story objects: the first and third name the collection above,
        # the second one does not
        rashomon = webobject.Story(
            name='movie: Rashomon (1950)',
            title='Rashomon',
            date='1950-08-25',
            description='A classic film.',
            collections='Collection: Akira Kurosawa',
            meta=json.dumps({'source': './a/token/path'}),
        )
        terminator = webobject.Story(
            name='movie: The Terminator (1984)',
            title='The Terminator',
            date='1984-10-26',
            description='A classic film.',
            collections='\n'.join([
                'Collection: Science fiction films of the 1980s',
                'Collection: The Terminator',
            ]),
            meta=json.dumps({'source': './a/token/path'}),
        )
        seven_samurai = webobject.Story(
            name='movie: Seven Samurai (1954)',
            title='Seven Samurai',
            date='1954-04-26',
            description='A classic film.',
            collections='\n'.join([
                'Collection: Akira Kurosawa',
                'Collection: dummy collection',
            ]),
            meta=json.dumps({'source': './a/token/path'}),
        )

        # only the two member stories are expected in the record
        wanted_ids = ['movie: Rashomon (1950)', 'movie: Seven Samurai (1954)']
        updated = webtask.cache_collections.populate_collections_with_component_stories_2(
            [record], [rashomon, terminator, seven_samurai])
        self.assertEqual(updated[0]['component-story-ids'], wanted_ids)
Пример #8
0
def main():
    """
    Collect linked pages as stories and emit their definitions text.

    The last command-line argument selects the output: when it ends with
    ".txt" the UTF-8 encoded text is written to that file, otherwise it is
    written to standard output.

    Every link yielded by fetch_links_info() receives a story id. Pages listed
    in `fails` get a sequential "aesopNNun" id. All other pages get an
    "aesopNNNpi" id built from the first group captured by one of
    `perrypatterns`, searched first in the page extract and, failing that, in
    the plain text of the mobile page. Pages for which no pattern matches are
    reported and skipped.
    """
    fn = sys.argv[-1]
    # NOTE(review): the URL names The Decameron while the link markers and the
    # story ids below are Aesop-specific — confirm this is the intended page.
    urls = [
        "https://en.wikipedia.org/wiki/The_Decameron",
    ]
    stories = {}
    # `failcount` is assigned inside this function, which makes it a local
    # name; without this initialisation the first increment raised
    # UnboundLocalError. Starting at 0 is an assumption — TODO confirm.
    failcount = 0

    for title, pagename, data in fetch_links_info(
            urls[0], startafter="/wiki/Aesop$",
            stopat="/wiki/Template:Aesop$"):
        if pagename in fails:
            sid = "aesop{:02d}un".format(failcount)
            failcount += 1
        else:
            # first try the short extract delivered with the link info
            pi = None
            for pattern in perrypatterns:
                pi = re.search(pattern, data["extract"])
                if pi:
                    break
            if not pi:
                # fall back to the whitespace-normalised text of the full page
                page = urllib2.urlopen(
                    data["content_urls"]["mobile"]["page"]).read()
                soup = BeautifulSoup(page, "html.parser")
                text = soup.get_text()
                lines = (line.strip() for line in text.splitlines())
                text = " ".join(x for x in lines if x)
                for pattern in perrypatterns:
                    pi = re.search(pattern, text)
                    if pi:
                        break
            if not pi:
                # no id can be derived; previously this fell through to
                # pi.group(1) and crashed with AttributeError on None
                print("FAIL: ", title, pagename)
                continue
            sid = "aesop{:03d}pi".format(int(pi.group(1)))
        stories[sid] = webobject.Story(
            name=sid,
            title=title,
            description=data["extract"],
            date="564 BC",
        )

    objs = [stories[sid] for sid in sorted(stories, key=orderpredicate)]
    txt = webdb.get_defenitions_text_for_objects(
        objs,
        empty_storythemes_headers=True,
        skip_fields=('collections', ),
        add_fields=("Ratings", ),
        presorted=True)
    txt = txt.encode("utf-8")
    if fn.endswith(".txt"):
        with open(fn, "wb+") as fh:
            fh.write(txt)
    else:
        # NOTE: per the original author, writing to stdout sometimes breaks on
        # Windows when the text contains unicode
        sys.stdout.write(txt)
Пример #9
0
def fetch_table_list(url, tableclass="wikitable", cols=(0, 1, 5, 2)):
    """
    Yield one webobject.Story per sufficiently wide row of the selected tables.

    Args:
        url: address of the page to download and parse.
        tableclass: CSS class used to select the tables to scan.
        cols: indexes into a row's <td> cells, in the order
            (title, studio, cinematographer, date). Rows with too few <td>
            cells to cover the largest index are ignored.

    Yields:
        webobject.Story named "movie: <title> (<first four date characters>)",
        with every character other than word characters, ':', '(', ')' and
        space removed from the name. The description is the linked page's
        extract (empty when the title cell has no link) followed by
        "Studio: ..." and "Cinematographer: ..." lines.
    """
    with urllib.request.urlopen(url) as response:
        data = response.read()
    soup = BeautifulSoup(data, "html.parser")

    for table in soup.find_all("table", class_=tableclass):
        for row in table.find_all("tr"):
            tdfields = row.find_all("td")
            if len(tdfields) <= max(cols):
                continue

            titlefield = tdfields[cols[0]]
            byfield_a = tdfields[cols[1]]
            byfield_b = tdfields[cols[2]]
            datefield = tdfields[cols[3]]

            title_link = titlefield.find("a")
            title = titlefield.get_text().strip(" \"").strip()
            datetext = datefield.get_text().strip()
            # placeholder keeps the name format intact when no date is found
            date = get_date(datetext) or "????"

            if title_link:
                # the last path segment of the link is what fetch_info() is given
                suffix = title_link.get("href").split("/")[-1].strip()
                info = fetch_info(suffix)
                description = info['extract']
            else:
                description = ""

            studio = byfield_a.get_text().strip(".").strip()
            cinematographer = byfield_b.get_text().strip(".").strip()
            description = (
                description.strip() + "\n\n"
                + "Studio: " + studio + ".\n\n"
                + "Cinematographer: " + cinematographer + ".\n"
            )
            description = REFSTRIPPER.sub(u"", description)

            sid = "movie: {} ({})".format(title, date[:4])
            # raw string: "\w" in a plain literal is an invalid escape sequence
            sid = re.sub(r"[^\w:() ]", "", sid)
            yield webobject.Story(
                name=sid,
                title=title,
                description=description,
                date=date,
            )
Пример #10
0
    def test_init_stories_list(self):
        """Check the size and ordering of the list built by cache_data.init_stories_list."""
        root = GIT_THEMING_PATH_HIST

        # three Story objects, deliberately not in chronological order
        robot_monster = webobject.Story(
            name='movie: Robot Monster (1953)',
            title='Robot Monster',
            date='1953-06-24',
            description='A classic film.',
            meta=json.dumps({'source': './a/token/path'}),
        )
        wizard_of_oz = webobject.Story(
            name='movie: The Wizard of Oz (1939)',
            title='The Wizard of Oz',
            date='1939-08-25',
            description='Another classic film.',
            meta=json.dumps({'source': './another/token/path'}),
        )
        trip_to_moon = webobject.Story(
            name='movie: A Trip to the Moon (1902)',
            title='A Trip to the Moon',
            date='1902-10-04',
            description='Yet another classic film.',
            meta=json.dumps({'source': './yet/another/token/path'}),
        )
        all_entries = [robot_monster, wizard_of_oz, trip_to_moon]

        # every story is expected to be returned
        wanted_count = len(all_entries)
        built = webtask.cache_data.init_stories_list(all_entries, root)
        self.assertEqual(len(built), wanted_count)

        # the stories are expected in increasing order of release date
        wanted_order = [trip_to_moon.name, wizard_of_oz.name, robot_monster.name]
        for position, wanted_id in enumerate(wanted_order):
            self.assertEqual(built[position]['story-id'], wanted_id)
Пример #11
0
def main():
    """
    Turn the rows of a Decameron table into stories and emit their definitions text.

    The last command-line argument selects the output: when it ends with
    ".txt" the UTF-8 encoded text is written to that file, otherwise it is
    written to standard output.
    """
    fn = sys.argv[-1]
    urls = [
        "https://en.wikipedia.org/wiki/The_Decameron",
    ]
    tableclass = "wikitable sortable"
    stories = {}

    html = urllib2.urlopen(urls[0]).read()
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table", class_=tableclass)
    for idx, row in enumerate(table.find_all("tr")):
        cells = row.find_all("td")
        if len(cells) < 3:
            # rows with fewer than three <td> cells carry no story
            print("SKIPPING ROW {}".format(idx))
            continue

        texts = [cell.get_text() for cell in cells]
        # the first cell supplies two numbers separated by a comma: day, story
        numbering = re.match(r"\D*(\d+),\D*(\d+)", texts[0])
        nday, nstory = map(int, numbering.groups())
        sid = "boccachio1353d{:02d}s{:02d}".format(nday, nstory)

        description = "Narrator: {}.\nLocation: {}.".format(texts[1], texts[2])
        has_subject = len(cells) >= 4 and texts[3].strip()
        if has_subject:
            # bracketed numeric markers such as "[12]" are dropped from the subject
            subject = re.sub(r"\s*\[\d+\]\s*", "", texts[3])
            description += "\nSubject: {}.".format(subject)

        stories[sid] = webobject.Story(
            name=sid,
            title="The Decameron: Day {}, Story {}".format(nday, nstory),
            description=description,
            date="1353",
        )

    objs = [stories[key] for key in natsorted(stories)]
    txt = webdb.get_defenitions_text_for_objects(
        objs,
        empty_storythemes_headers=True,
        skip_fields=('collections', ),
        add_fields=("Ratings", ),
        presorted=True)
    txt = txt.encode("utf-8")
    if not fn.endswith(".txt"):
        # NOTE: per the original author, writing to stdout sometimes breaks on
        # Windows when the text contains unicode
        sys.stdout.write(txt)
    else:
        with open(fn, "wb+") as fh:
            fh.write(txt)
Пример #12
0
    def test_init_stories_list(self):
        """Check cache_collections.init_stories_list on a single-story input."""
        root = GIT_THEMING_PATH_HIST

        # a single Story object together with the id expected back for it
        wanted_id = 'movie: A Trip to the Moon (1902)'
        only_story = webobject.Story(
            name='movie: A Trip to the Moon (1902)',
            title='A Trip to the Moon',
            date='1902-10-04',
            description='A classic film.',
            meta=json.dumps({'source': './yet/another/token/path'}),
        )
        all_entries = [only_story]

        # the one story is expected to come back under its own id
        wanted_count = len(all_entries)
        built = webtask.cache_collections.init_stories_list(all_entries, root)
        self.assertEqual(len(built), wanted_count)
        self.assertEqual(built[0]['story-id'], wanted_id)
Пример #13
0
def find_episodes_st1(url, season_offsset, prefix, tableclass="wikitable", cols=(1, 3, 4, 6), isterse=False,
                      table_offset=0, singleseason=False):
    """
    Yield a webobject.Story for each episode found in the selected tables of a page.

    :param url:
        Address of the page to download and parse.
    :param season_offsset:
        Added to the table's position (counted from ``table_offset``) to form the season number used in
        story ids.
    :param prefix:
        String prepended to every generated story id.
    :param tableclass:
        CSS class used to select the tables to scan.
    :param cols:
        index of columns (title, director, author, date)
        counts only <td> and not <th>
        A director or author index that is not positive leaves that credit out.
    :param isterse:
        By default we expect every other row in the list to contain a description that go along with the preceding
        row's information. If the description row is not present, set this flag; the description is then taken
        from fetch_info() for the page the title cell links to.
    :param table_offset:
        Number of leading selected tables to skip.
    :param singleseason:
        When set, the episode number is read from the row's first <th> instead of its first <td>, and story
        ids carry no season part.
    :return:
        Generator of webobject.Story objects.
    """
    with urllib.request.urlopen(url) as response:
        data = response.read()
    soup = BeautifulSoup(data, "html.parser")
    # counts how often each base story id has been emitted, for the letter suffixes below
    sidcounter = defaultdict(int)
    descriptionfield = None

    for idx, table in enumerate(soup.find_all("table", class_=tableclass)):
        if idx < table_offset:
            continue
        sids = []
        description = None
        # episodes seen but not yet paired with a description
        titlestack = deque()
        # per upcoming row: how many leading columns are occupied by a rowspan cell from an earlier row
        coloffsetstack = deque()

        for row in table.find_all("tr"):
            tdfields = row.find_all("td")

            # no rowspan cols may be in between the indexes given with the "cols" argument, or things will break
            coloffset = coloffsetstack.popleft() if coloffsetstack else 0
            for td in tdfields[:min(cols)]:
                rowspan = int(td.attrs.get('rowspan', 0))
                while len(coloffsetstack) < rowspan:
                    coloffsetstack.append(0)
                for ii in range(rowspan):
                    coloffsetstack[ii] += 1

            if len(tdfields) > max(cols) - coloffset:
                titlefield = tdfields[cols[0] - coloffset]
                directorfield = tdfields[cols[1] - coloffset] if cols[1] > 0 else ''
                authorfield = tdfields[cols[2] - coloffset] if cols[2] > 0 else ''
                datefield = tdfields[cols[3] - coloffset]

                title_link = titlefield.find("a")
                title = titlefield.get_text().strip(" \"")
                datetext = datefield.get_text().strip()
                author = "Story by: " + get_author(authorfield) if cols[2] > 0 else ''
                director = "Directed by: " + directorfield.get_text() if cols[1] > 0 else ''
                date = get_date(datetext)

                # find episode number-in-season; rows shifted by a rowspan reuse the previous row's value
                if not coloffset:
                    if singleseason:
                        epfield = row.find("th").get_text()
                    else:
                        epfield = row.find("td").get_text()

                # raw string: "\d" in a plain literal is an invalid escape sequence
                # NOTE(review): when nothing matches, `sid` keeps its value from an earlier row (or is
                # unbound on the very first row) — confirm that episode cells always contain a number.
                for match in re.findall(r"(\d+)([a-z]*)", epfield):
                    nepid, sepid = int(match[0]), match[1]
                    if singleseason:
                        sid = prefix + "%02d%s" % (nepid, sepid)
                    else:
                        sid = prefix + "%sx%02d%s" % (idx - table_offset + season_offsset, nepid, sepid)
                    sids.append(sid)

                if isterse and title_link:
                    # the last path segment of the link is what fetch_info() is given
                    suffix = title_link.get("href").split("/")[-1].strip()
                    info = fetch_info(suffix)
                    description = info['extract']

                titlestack.append((sid, title, director, author, date))
                print("ADD", titlestack[-1])

            else:
                descriptionfield = row.find("td", class_="description")
                if descriptionfield:
                    description = descriptionfield.get_text().strip()

            if description and sids:
                if descriptionfield:
                    desclist = get_descriptions(descriptionfield)
                else:
                    desclist = [description]
                numstories = min(len(desclist), len(titlestack))
                for description in desclist:
                    if not titlestack:
                        break
                    print("POP", titlestack[0])
                    sid, title, director, author, date = titlestack.popleft()
                    sidcounter[sid] += 1
                    if numstories > 1:
                        # several stories share one row: tell them apart with a, b, c, ...
                        sid += chr(ord("a") + sidcounter[sid] - 1)
                    description = description.strip()
                    if author or director:
                        description = description + "\n\n"
                        if director:
                            description += director.strip(".")
                            if author:
                                description += ". "
                        if author:
                            description += author.strip(".")
                        description += ".\n"
                    description = REFSTRIPPER.sub(u"", description)

                    yield webobject.Story(
                        name=sid,
                        title=title,
                        description=description,
                        date=date,
                    )

                sids = []
                description = ''
Пример #14
0
def find_episodes_st1(url,
                      season_offsset,
                      prefix,
                      tableclass="wikitable",
                      cols=(1, 3, 4, 6),
                      isterse=False,
                      table_offset=0):
    """
    Yield a webobject.Story for each episode found in the selected tables of a page.

    :param url:
        Address of the page to download and parse.
    :param season_offsset:
        Added to the table's position (counted from ``table_offset``) to form the season number used in
        story ids.
    :param prefix:
        String prepended to every generated story id.
    :param tableclass:
        CSS class used to select the tables to scan.
    :param cols:
        index of columns (title, director, author, date)
        counts only <td> and not <th>
    :param isterse:
        By default we expect every other row in the list to contain a description that go along with the preceding
        row's information. If the description row is not present, set this flag; the description is then taken
        from fetch_info() for the page the title cell links to.
    :param table_offset:
        Number of leading selected tables to skip.
    :return:
        Generator of webobject.Story objects.
    """
    data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(data, "html.parser")
    # counts how often each base story id has been emitted, for the letter suffixes below
    sidcounter = defaultdict(int)
    # must exist before the first description row is seen: with isterse set, the pairing code
    # below tests it right after the first title row and used to raise UnboundLocalError
    descriptionfield = None

    for idx, table in enumerate(soup.find_all("table", class_=tableclass)):
        if idx < table_offset:
            continue
        sids = []
        description = None
        # episodes seen but not yet paired with a description
        titlestack = deque()
        # per upcoming row: how many leading columns are occupied by a rowspan cell from an earlier row
        coloffsetstack = deque()

        for row in table.find_all("tr"):
            tdfields = row.find_all("td")

            # no rowspan cols may be in between the indexes given with the "cols" argument, or things will break
            coloffset = coloffsetstack.popleft() if coloffsetstack else 0
            for td in tdfields[:min(cols)]:
                rowspan = int(td.attrs.get('rowspan', 0))
                while len(coloffsetstack) < rowspan:
                    coloffsetstack.append(0)
                for ii in range(rowspan):
                    coloffsetstack[ii] += 1

            if len(tdfields) > max(cols) - coloffset:
                titlefield = tdfields[cols[0] - coloffset]
                directorfield = tdfields[cols[1] - coloffset]
                authorfield = tdfields[cols[2] - coloffset]
                datefield = tdfields[cols[3] - coloffset]

                title_link = titlefield.find("a")
                title = titlefield.get_text().strip(" \"")
                datetext = datefield.get_text().strip()
                date = None

                # the first date format found in the cell wins; date stays None when none is found
                for regex in [
                        r"(\d{4}-\d{2}-\d{2})",  #yyyy-mm-dd
                        r"(\w+ \d{1,2}, \d{4})",  #MM dd, yyyy
                        r"(\d{1,2} \w+ \d{4})",  #dd MM yyyy
                ]:
                    try:
                        date1 = re.search(regex, datetext).group(0)
                        date = parser.parse(date1).strftime("%Y-%m-%d")
                        break
                    except AttributeError:  # no regex match
                        pass

                # rows shifted by a rowspan reuse the previous row's episode cell
                if not coloffset:
                    epfield = row.find("td").get_text()
                author = "Story by: " + get_author(authorfield)
                director = "Directed by: " + directorfield.get_text()

                # raw string: "\d" in a plain literal is an invalid escape sequence
                # NOTE(review): when nothing matches, `sid` keeps its value from an earlier row (or is
                # unbound on the very first row) — confirm that episode cells always contain a number.
                for match in re.findall(r"(\d+)([a-z]*)", epfield):
                    nepid, sepid = int(match[0]), match[1]
                    sid = prefix + "%sx%02d%s" % (idx - table_offset +
                                                  season_offsset, nepid, sepid)
                    sids.append(sid)

                if isterse and title_link:
                    # the last path segment of the link is what fetch_info() is given
                    suffix = title_link.get("href").split("/")[-1].strip()
                    info = fetch_info(suffix)
                    description = info['extract']

                titlestack.append((sid, title, director, author, date))
                print("ADD", titlestack[-1])

            else:
                descriptionfield = row.find("td", class_="description")
                if descriptionfield:
                    description = descriptionfield.get_text().strip()

            if description and sids:
                if descriptionfield:
                    desclist = get_descriptions(descriptionfield)
                else:
                    desclist = [description]
                numstories = min(len(desclist), len(titlestack))
                for description in desclist:
                    if not titlestack:
                        break
                    print("POP", titlestack[0])
                    sid, title, director, author, date = titlestack.popleft()
                    sidcounter[sid] += 1
                    if numstories > 1:
                        # several stories share one row: tell them apart with a, b, c, ...
                        sid += chr(ord("a") + sidcounter[sid] - 1)
                    description = unicode(description).strip()
                    description = description + "\n\n" + director.strip(
                        ".") + ". " + author.strip(".") + ".\n"
                    description = REFSTRIPPER.sub(u"", description)

                    yield webobject.Story(
                        name=sid,
                        title=title,
                        description=description,
                        date=date,
                    )

                sids = []
                description = ''
Пример #15
0
def _summarize_ratings(data):
    """
    Condense free-form rating text into a "mean +/- stddev" summary string.

    Every run of digits found in the space-joined lines of ``data`` counts as
    one rating and is clamped to the range 0..5. Returns 'n/a' when no digits
    are present.
    """
    numbers = [int(s) for s in re.findall(r"\d+", ' '.join(data))]
    numbers = [min(5, max(0, n)) for n in numbers]
    count = float(len(numbers))
    if count > 0:
        mean = sum(numbers) / count
        # population standard deviation: divides by N, not N - 1
        stddev = (sum((x - mean)**2 for x in numbers) / count)**0.5
        return u'%.2f \u00B1 %.2f' % (mean, stddev)
    return 'n/a'


def read_stories_from_fieldcollection(fieldcollection, verbose=True, addextras=False, globcollection=False, strict=True):
    """
    Stories in our special text file format.

    Args:
        fieldcollection: iterable of ``(sid, field, data)`` triples; ``data``
            is a sequence of strings holding the lines of ``field`` for the
            story identified by ``sid``.
        verbose: when truthy, warnings are logged. The value itself is also
            interpolated as the leading item of every warning message.
        addextras: when truthy, any field that is not mapped onto a declared
            Story field is stored on the story as an extra attribute (field
            name lower-cased with spaces removed) and recorded in
            ``extra_fields``. This takes precedence over the "ratings"
            summary, which is then not computed.
        globcollection: when truthy, a story whose "collections" field lists
            the story's own sid contributes all entries of that field to
            every story yielded.
        strict: when truthy, ``test_fields()`` is called on each story and
            stories for which it raises ValueError are skipped (with a
            warning if ``verbose``).

    Yields:
        One ``webobject.Story`` per distinct sid, in sorted sid order, with
        ``description`` (including a trailing "References:" section when
        references were given) and newline-joined ``collections`` filled in.
    """
    out = {}
    out_composites = defaultdict(lambda: defaultdict(str))
    field_map = {
        "title": "title",
        "release date": "date",
        "description": "description",
        "date": "date",
        "collections": "collections",
        "component stories": "components",
    }
    composite_fields = {
        "description": "description",
        "references": "references",
        "collections": "collections",
    }
    recognized_fields = {
        "authors",
        "choice themes",
        "genre",
        "major themes",
        "minor themes",
        "not themes",
        "notes",
        "other keywords",
        "ratings",
        "related stories",
        "aliases",
    } | set(composite_fields) | set(field_map)
    global_collections = []

    for sid, field, data in fieldcollection:
        lfield = field.strip().lower()

        # is this a "collection" for all stories in this file?
        # (uses the same normalized field name as every other check below)
        if globcollection and lfield == "collections" and sid in data:
            for d in data:
                if d not in global_collections:
                    global_collections.append(d)

        if sid not in out:
            out[sid] = webobject.Story(name=sid, meta={})

        obj = out[sid]
        attr = field_map.get(lfield, "")

        # composite fields are remembered in full; they are merged into the
        # story after all input has been consumed
        if lfield in composite_fields:
            out_composites[sid][lfield] = '\n'.join(data)

        if attr and attr in obj.fields:
            # directly mapped fields only keep their first line
            setattr(obj, attr, data[0])
        elif addextras:
            exattr = lfield.replace(" ", "")
            setattr(obj, exattr, '\n'.join(data))
            obj.extra_fields += (exattr,)
        elif lfield == "ratings":
            obj.meta['rating'] = _summarize_ratings(data)
        elif lfield in recognized_fields:
            # recognized so don't warn even if we're not adding them
            pass
        elif verbose:
            lib.log.warn("%s: %s.%s - don't grok", verbose, sid, field)

    for sid in sorted(out):
        obj = out[sid]
        description = getattr(obj, "description", "")
        references = out_composites[sid]["references"].strip()
        collections = out_composites[sid]["collections"].strip()

        if references:
            reflines = [line.strip() for line in references.split("\n")]
            description += "\n\nReferences:\n"
            description += "".join(line + "\n" for line in reflines if line)

        # strip before de-duplicating so that names differing only in
        # surrounding whitespace are not listed twice
        clist = []
        for c in global_collections + collections.split("\n"):
            c = c.strip()
            if c and c not in clist:
                clist.append(c)

        obj.description = description
        obj.collections = "\n".join(clist)

        if strict:
            # keep the try body to the validation call only, so that a
            # ValueError thrown into the generator at the yield below is not
            # mistaken for a validation failure
            try:
                obj.test_fields()
            except ValueError as e:
                if verbose:
                    # the failure concerns the story as a whole; no single
                    # input field is meaningful here
                    lib.log.warn("%s: %s - %s", verbose, sid, str(e))
                continue
        yield obj