def test_init_collection_od(self):
    basepath = GIT_THEMING_PATH_HIST

    #' test that non-collection entries not processed as collections
    storyobj = webobject.Story(name='movie: A Trip to the Moon (1902)',
                               title='A Trip to the Moon',
                               date='2020-09-22',
                               description='A classic film.',
                               meta=json.dumps({'source': './a/token/path'}))
    collection_od = webtask.cache_collections.init_collection_od(storyobj, basepath)
    self.assertEqual(collection_od, None)

    #' test that collection entries correctly handled
    storyobj = webobject.Story(
        name='Collection: Akira Kurosawa',
        title='Akira Kurosawa',
        date='1949-1985',
        description='Films written or directed by Akira Kurosawa.',
        meta=json.dumps({'source': './a/token/path'}))
    collection_od = webtask.cache_collections.init_collection_od(storyobj, basepath)
    self.assertEqual(collection_od['collection-id'], storyobj.name)
    self.assertEqual(collection_od['title'], storyobj.title)
    self.assertEqual(collection_od['date'], storyobj.date)
    self.assertEqual(collection_od['genres'], [])
    self.assertEqual(collection_od['description'], storyobj.description)
    self.assertEqual(collection_od['references'], [])
    self.assertEqual(collection_od['component-story-ids'], [])

def test_init_story_od(self):
    basepath = GIT_THEMING_PATH_HIST

    #' test that collection entries not processed as stories
    storyobj = webobject.Story(
        name='Collection: Akira Kurosawa',
        title='Akira Kurosawa',
        date='1949-1985',
        description='Films written or directed by Akira Kurosawa.',
        meta=json.dumps({'source': './a/token/path'}))
    story_od = webtask.cache_collections.init_story_od(storyobj, basepath)
    self.assertEqual(story_od, None)

    #' test that multiple collection stories are correctly handled
    collection_ids = [
        'Collection: Science fiction films of the 1980s',
        'Collection: The Terminator'
    ]
    storyobj = webobject.Story(name='movie: The Terminator (1984)',
                               title='The Terminator',
                               date='1984-10-26',
                               description='A classic film.',
                               collections='\n'.join(collection_ids),
                               meta=json.dumps({'source': './a/token/path'}))
    story_od = webtask.cache_collections.init_story_od(storyobj, basepath)
    self.assertEqual(story_od['story-id'], storyobj.name)
    self.assertEqual(story_od['collection-ids'], collection_ids)

def test_init_story_od(self):
    basepath = GIT_THEMING_PATH_HIST

    #' test that collection entries not processed as stories
    storyobj = webobject.Story(name='Collection: a token collection',
                               title='a token collection',
                               date='2020-09-22',
                               description='a description',
                               meta=json.dumps({'source': './a/token/path'}))
    story_od = webtask.cache_data.init_story_od(storyobj, basepath)
    self.assertEqual(story_od, None)

    #' test that any trailing reference information is correctly stripped from description text
    expected_description = "A classic film."
    storyobj = webobject.Story(
        name='movie: Robot Monster (1953)',
        title='Robot Monster',
        date='1953-06-24',
        description='A classic film.\n\n\nReferences: a reference',
        meta=json.dumps({'source': './a/token/path'}))
    story_od = webtask.cache_data.init_story_od(storyobj, basepath)
    self.assertEqual(story_od['description'], expected_description)

    #' test that multiple references are correctly handled
    reference_1 = 'a reference'
    reference_2 = 'another reference'
    expected_references = [reference_1, reference_2]
    storyobj = webobject.Story(name='movie: Robot Monster (1953)',
                               title='Robot Monster',
                               date='1953-06-24',
                               description='A classic film.',
                               references=reference_1 + '\n' + reference_2,
                               meta=json.dumps({'source': './a/token/path'}))
    story_od = webtask.cache_data.init_story_od(storyobj, basepath)
    self.assertEqual(story_od['references'], expected_references)

    #' test the story source path is correctly handled
    source_path_fragment = '/notes/stories/film/film-scifi-1950s.st.txt'
    expected_source_path = '.' + source_path_fragment
    storyobj = webobject.Story(
        name='movie: Robot Monster (1953)',
        title='Robot Monster',
        date='1953-06-24',
        description='A classic film.',
        meta=json.dumps({'source': basepath + source_path_fragment}))
    story_od = webtask.cache_data.init_story_od(storyobj, basepath)
    self.assertEqual(story_od['source'], expected_source_path)

def test_init_collections_list(self):
    basepath = GIT_THEMING_PATH_HIST

    #' initialize four Story objects for testing purposes: three of the collection variety and
    #' one of the story variety
    storyobj_1 = webobject.Story(
        name='Collection: William Shakespeare Plays',
        title='William Shakespeare Plays',
        date='1602-1606',
        description='Plays written by the English playwright, poet, and actor William Shakespeare.',
        meta=json.dumps({'source': './a/token/path'}))
    storyobj_2 = webobject.Story(name='movie: A Trip to the Moon (1902)',
                                 title='A Trip to the Moon',
                                 date='1902-10-04',
                                 description='A classic film',
                                 meta=json.dumps({'source': './a/token/path'}))
    storyobj_3 = webobject.Story(
        name='Collection: X-men',
        title='X-men',
        date='2000-2019',
        description='The X-Men is an American superhero film series.',
        meta=json.dumps({'source': './a/token/path'}))
    storyobj_4 = webobject.Story(
        name='Collection: Akira Kurosawa',
        title='Akira Kurosawa',
        date='1949-1985',
        description='Films written or directed by Akira Kurosawa.',
        meta=json.dumps({'source': './a/token/path'}))
    storyobjs_list = [storyobj_1, storyobj_2, storyobj_3, storyobj_4]

    #' test that correct number of collections is returned
    expected_collection_count = len(storyobjs_list) - 1
    collections_list = webtask.cache_collections.init_collections_list(
        storyobjs_list, basepath)
    self.assertEqual(len(collections_list), expected_collection_count)

    #' test that collections are alphabetically sorted by title
    expected_first_collection_id = storyobj_4.name
    expected_second_collection_id = storyobj_1.name
    expected_third_collection_id = storyobj_3.name
    self.assertEqual(collections_list[0]['collection-id'], expected_first_collection_id)
    self.assertEqual(collections_list[1]['collection-id'], expected_second_collection_id)
    self.assertEqual(collections_list[2]['collection-id'], expected_third_collection_id)

def test_populate_collections_with_component_stories_1(self):
    #' initialize a list containing one collection OrderedDict object
    collection_od = OrderedDict()
    collection_od['collection-id'] = 'Collection: Akira Kurosawa'
    collection_od['title'] = 'Akira Kurosawa'
    collection_od['date'] = '1949-1985'
    collection_od['description'] = 'Films written or directed by Akira Kurosawa.'
    collection_od['component-story-ids'] = []
    collections_list = [collection_od]

    #' initialize a collection Story object
    expected_component_story_ids = [
        'movie: Dersu Uzala (1975)', 'movie: High and Low (1963)',
        'movie: Ikiru (1952)', 'movie: Ran (1985)', 'movie: Rashomon (1950)',
        'movie: Red Beard (1965)', 'movie: Sanjuro (1962)',
        'movie: Seven Samurai (1954)', 'movie: Stray Dog (1949)',
        'movie: The Hidden Fortress (1958)', 'movie: Throne of Blood (1957)',
        'movie: Yojimbo (1961)'
    ]
    storyobj = webobject.Story(
        name='Collection: Akira Kurosawa',
        title='Akira Kurosawa',
        date='1949-1985',
        description='Films written or directed by Akira Kurosawa.',
        collections='Collection: Akira Kurosawa',
        components='\n'.join(expected_component_story_ids),
        meta=json.dumps({'source': './a/token/path'}))
    storyobjs_list = [storyobj]

    #' test that the collection contains the expected component stories
    updated_collections_list = webtask.cache_collections.populate_collections_with_component_stories_1(
        collections_list, storyobjs_list)
    self.assertEqual(updated_collections_list[0]['component-story-ids'],
                     expected_component_story_ids)

def test_populate_stories_with_collection_info(self):
    #' initialize a collection object and put it in a list
    component_story_names = [
        "movie: Alien (1979)", "movie: Aliens (1986)", "movie: Alien 3 (1992)",
        "movie: Alien Resurrection (1997)", "movie: Prometheus (2012)",
        "movie: Alien: Covenant (2017)"
    ]
    storyobj = webobject.Story(name='Collection: Alien',
                               title='Alien',
                               date='1979-2017',
                               description='A classic film franchise.',
                               components='\n'.join(component_story_names),
                               meta=json.dumps({'source': './a/token/path'}))
    storyobjs_list = [storyobj]

    #' initialize a story OrderedDict object and put it in a list
    story_od = OrderedDict()
    story_od['story-id'] = "movie: Alien (1979)"
    story_od['title'] = "Alien"
    story_od['date'] = "1979-05-25"
    story_od['description'] = "A classic film."
    story_od['source'] = json.dumps({'source': './a/token/path'})
    story_od['collections'] = []
    stories_list = [story_od]

    #' test that collection info is correctly added to story entry
    expected_collections = ["Collection: Alien"]
    expected_collection_count = len(expected_collections)
    updated_stories_list = webtask.cache_data.populate_stories_with_collection_info(
        storyobjs_list, stories_list)
    self.assertEqual(len(updated_stories_list[0]['collections']),
                     expected_collection_count)
    self.assertEqual(updated_stories_list[0]['collections'], expected_collections)

def test_populate_collections_with_component_stories_2(self):
    #' initialize a list containing one collection OrderedDict object
    collection_od = OrderedDict()
    collection_od['collection-id'] = 'Collection: Akira Kurosawa'
    collection_od['title'] = 'Akira Kurosawa'
    collection_od['date'] = '1949-1985'
    collection_od['description'] = 'Films written or directed by Akira Kurosawa.'
    collection_od['component-story-ids'] = []
    collections_list = [collection_od]

    #' initialize a list of three Story objects: two are component stories of the
    #' above defined collection and one is not
    storyobj_1 = webobject.Story(name='movie: Rashomon (1950)',
                                 title='Rashomon',
                                 date='1950-08-25',
                                 description='A classic film.',
                                 collections='Collection: Akira Kurosawa',
                                 meta=json.dumps({'source': './a/token/path'}))
    storyobj_2 = webobject.Story(
        name='movie: The Terminator (1984)',
        title='The Terminator',
        date='1984-10-26',
        description='A classic film.',
        collections='Collection: Science fiction films of the 1980s\nCollection: The Terminator',
        meta=json.dumps({'source': './a/token/path'}))
    storyobj_3 = webobject.Story(
        name='movie: Seven Samurai (1954)',
        title='Seven Samurai',
        date='1954-04-26',
        description='A classic film.',
        collections='Collection: Akira Kurosawa\nCollection: dummy collection',
        meta=json.dumps({'source': './a/token/path'}))
    storyobjs_list = [storyobj_1, storyobj_2, storyobj_3]

    #' test that the collection contains the expected component stories
    expected_component_story_ids = [
        'movie: Rashomon (1950)', 'movie: Seven Samurai (1954)'
    ]
    updated_collections_list = webtask.cache_collections.populate_collections_with_component_stories_2(
        collections_list, storyobjs_list)
    self.assertEqual(updated_collections_list[0]['component-story-ids'],
                     expected_component_story_ids)

def main():
    fn = sys.argv[-1]
    urls = [
        "https://en.wikipedia.org/wiki/The_Decameron",
    ]
    stories = {}
    failcount = 0
    for title, pagename, data in fetch_links_info(urls[0],
                                                  startafter="/wiki/Aesop$",
                                                  stopat="/wiki/Template:Aesop$"):
        if pagename in fails:
            sid = "aesop{:02d}un".format(failcount)
            failcount += 1
        else:
            pi = None
            for pattern in perrypatterns:
                if not pi:
                    pi = re.search(pattern, data["extract"])
            if not pi:
                # fall back to searching the full page text for a Perry Index number
                page = urllib.request.urlopen(
                    data["content_urls"]["mobile"]["page"]).read()
                soup = BeautifulSoup(page, "html.parser")
                text = soup.get_text()
                lines = (line.strip() for line in text.splitlines())
                text = " ".join(x for x in lines if x)
                for pattern in perrypatterns:
                    if not pi:
                        pi = re.search(pattern, text)
            if not pi:
                print("FAIL: ", title, pagename)
                continue  # no Perry Index number found, so no sid can be built
            sid = "aesop{:03d}pi".format(int(pi.group(1)))
        stories[sid] = webobject.Story(
            name=sid,
            title=title,
            description=data["extract"],
            date="564 BC",
        )
    objs = [stories[sid] for sid in sorted(stories, key=orderpredicate)]
    txt = webdb.get_defenitions_text_for_objects(objs,
                                                 empty_storythemes_headers=True,
                                                 skip_fields=('collections', ),
                                                 add_fields=("Ratings", ),
                                                 presorted=True)
    txt = txt.encode("utf-8")
    if fn.endswith(".txt"):
        with open(fn, "wb+") as fh:
            fh.write(txt)
    else:
        # write bytes via stdout.buffer; plain sys.stdout chokes on unicode
        # on windows sometimes
        sys.stdout.buffer.write(txt)

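# main() above assumes a few module-level names defined elsewhere in this
# script: fetch_links_info() and orderpredicate() (helpers), `fails` (page
# names known to lack a Perry Index number), and `perrypatterns` (regexes
# whose first group captures that number). A minimal sketch with assumed
# values, for illustration only; the real definitions live in the script:
fails = set()  # hypothetical: page names that get sequential "un" sids
perrypatterns = [
    r"numbered (\d+) in the Perry Index",   # assumed pattern, not from the repo
    r"Perry Index[^0-9]{0,40}?(\d+)",       # assumed pattern, not from the repo
]
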
def fetch_table_list(url, tableclass="wikitable", cols=(0, 1, 5, 2)):
    """
    Scrape rows from the wikitables on a Wikipedia list-of-films page and
    yield a webobject.Story for each qualifying row.

    Args:
        url: address of a page containing one or more wikitable tables.
        tableclass: CSS class of the tables to scrape.
        cols: indexes of the (title, studio, cinematographer, date) columns.

    Yields:
        webobject.Story objects named like "movie: <title> (<year>)".
    """
    with urllib.request.urlopen(url) as response:
        data = response.read()
    soup = BeautifulSoup(data, "html.parser")
    for table in soup.find_all("table", class_=tableclass):
        for row in table.find_all("tr"):
            tdfields = row.find_all("td")
            if len(tdfields) > max(cols):
                titlefield = tdfields[cols[0]]
                byfield_a = tdfields[cols[1]]
                byfield_b = tdfields[cols[2]]
                datefield = tdfields[cols[3]]
                title_link = titlefield.find("a")
                title = titlefield.get_text().strip(" \"").strip()
                datetext = datefield.get_text().strip()
                date = get_date(datetext) or "????"
                if title_link:
                    suffix = title_link.get("href").split("/")[-1].strip()
                    info = fetch_info(suffix)
                    description = info['extract']
                else:
                    description = ""
                description = description.strip() + "\n\n" + \
                    "Studio: " + byfield_a.get_text().strip(".").strip() + ".\n\n" + \
                    "Cinematographer: " + byfield_b.get_text().strip(".").strip() + ".\n"
                description = REFSTRIPPER.sub(u"", description)
                sid = "movie: {} ({})".format(title, date[:4])
                sid = re.sub(r"[^\w:() ]", "", sid)
                yield webobject.Story(
                    name=sid,
                    title=title,
                    description=description,
                    date=date,
                )

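def demo_fetch_table_list():
    # Hedged usage sketch for fetch_table_list(): the URL below is a
    # placeholder, not a page this repo is known to scrape. Each yielded item
    # is a webobject.Story whose name looks like "movie: <title> (<year>)".
    url = "https://en.wikipedia.org/wiki/..."  # hypothetical list-of-films page
    for story in fetch_table_list(url, tableclass="wikitable", cols=(0, 1, 5, 2)):
        print(story.name, story.date)
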
def test_init_stories_list(self):
    basepath = GIT_THEMING_PATH_HIST

    #' initialize three Story objects for testing purposes
    storyobj_1 = webobject.Story(name='movie: Robot Monster (1953)',
                                 title='Robot Monster',
                                 date='1953-06-24',
                                 description='A classic film.',
                                 meta=json.dumps({'source': './a/token/path'}))
    storyobj_2 = webobject.Story(name='movie: The Wizard of Oz (1939)',
                                 title='The Wizard of Oz',
                                 date='1939-08-25',
                                 description='Another classic film.',
                                 meta=json.dumps({'source': './another/token/path'}))
    storyobj_3 = webobject.Story(
        name='movie: A Trip to the Moon (1902)',
        title='A Trip to the Moon',
        date='1902-10-04',
        description='Yet another classic film.',
        meta=json.dumps({'source': './yet/another/token/path'}))
    storyobjs_list = [storyobj_1, storyobj_2, storyobj_3]

    #' test that correct number of stories are returned
    expected_story_count = len(storyobjs_list)
    stories_list = webtask.cache_data.init_stories_list(storyobjs_list, basepath)
    self.assertEqual(len(stories_list), expected_story_count)

    #' test that stories are sorted in increasing order of release date
    expected_first_story_id = storyobj_3.name
    expected_second_story_id = storyobj_2.name
    expected_third_story_id = storyobj_1.name
    self.assertEqual(stories_list[0]['story-id'], expected_first_story_id)
    self.assertEqual(stories_list[1]['story-id'], expected_second_story_id)
    self.assertEqual(stories_list[2]['story-id'], expected_third_story_id)

def main():
    fn = sys.argv[-1]
    urls = [
        "https://en.wikipedia.org/wiki/The_Decameron",
    ]
    tableclass = "wikitable sortable"
    stories = {}
    data = urllib.request.urlopen(urls[0]).read()
    soup = BeautifulSoup(data, "html.parser")
    table = soup.find("table", class_=tableclass)
    for idx, row in enumerate(table.find_all("tr")):
        cells = row.find_all("td")
        if len(cells) >= 3:
            texts = [c.get_text() for c in cells]
            nday, nstory = [
                int(x)
                for x in re.match(r"\D*(\d+),\D*(\d+)", texts[0]).groups()
            ]
            sid = "boccachio1353d{:02d}s{:02d}".format(nday, nstory)
            description = "Narrator: {}.\nLocation: {}.".format(*texts[1:3])
            if len(cells) >= 4 and texts[3].strip():
                description += "\nSubject: {}.".format(
                    re.sub(r"\s*\[\d+\]\s*", "", texts[3]))
            stories[sid] = webobject.Story(
                name=sid,
                title="The Decameron: Day {}, Story {}".format(nday, nstory),
                description=description,
                date="1353",
            )
        else:
            print("SKIPPING ROW {}".format(idx))
    objs = [stories[sid] for sid in natsorted(stories)]
    txt = webdb.get_defenitions_text_for_objects(objs,
                                                 empty_storythemes_headers=True,
                                                 skip_fields=('collections', ),
                                                 add_fields=("Ratings", ),
                                                 presorted=True)
    txt = txt.encode("utf-8")
    if fn.endswith(".txt"):
        with open(fn, "wb+") as fh:
            fh.write(txt)
    else:
        # write bytes via stdout.buffer; plain sys.stdout chokes on unicode
        # on windows sometimes
        sys.stdout.buffer.write(txt)

def test_init_stories_list(self):
    basepath = GIT_THEMING_PATH_HIST

    #' initialize a Story object for testing purposes
    expected_story_id = 'movie: A Trip to the Moon (1902)'
    storyobj = webobject.Story(name='movie: A Trip to the Moon (1902)',
                               title='A Trip to the Moon',
                               date='1902-10-04',
                               description='A classic film.',
                               meta=json.dumps({'source': './yet/another/token/path'}))
    storyobjs_list = [storyobj]

    #' test the correct story info is returned
    expected_story_count = len(storyobjs_list)
    stories_list = webtask.cache_collections.init_stories_list(
        storyobjs_list, basepath)
    self.assertEqual(len(stories_list), expected_story_count)
    self.assertEqual(stories_list[0]['story-id'], expected_story_id)

def find_episodes_st1(url, season_offsset, prefix, tableclass="wikitable",
                      cols=(1, 3, 4, 6), isterse=False, table_offset=0,
                      singleseason=False):
    """
    :param url: address of a Wikipedia "list of episodes" page.
    :param season_offsset: season number of the first table that is scraped.
    :param prefix: string prepended to every generated story id.
    :param tableclass: CSS class of the episode tables to scrape.
    :param cols: index of columns (title, director, author, date); counts only
        <td> and not <th>.
    :param isterse: By default we expect every other row in the list to contain
        a description that goes along with the preceding row's information. If
        the description row is not present, set this flag; the description is
        then fetched from each episode's own page.
    :param table_offset: number of leading tables on the page to skip.
    :param singleseason: set this flag if the page covers a single season; the
        episode number is then read from a <th> cell and the sid carries no
        season number.
    :return: yields webobject.Story objects.
    """
    with urllib.request.urlopen(url) as response:
        data = response.read()
    soup = BeautifulSoup(data, "html.parser")
    sidcounter = defaultdict(int)
    descriptionfield = None
    for idx, table in enumerate(soup.find_all("table", class_=tableclass)):
        if idx < table_offset:
            continue
        sids = []
        description = None
        titlestack = deque()
        coloffsetstack = deque()
        for row in table.find_all("tr"):
            tdfields = row.find_all("td")
            # no rowspan columns may fall between the indexes given in "cols",
            # or things will break
            coloffset = coloffsetstack.popleft() if coloffsetstack else 0
            for td in tdfields[:min(cols)]:
                rowspan = int(td.attrs.get('rowspan', 0))
                while len(coloffsetstack) < rowspan:
                    coloffsetstack.append(0)
                for ii in range(rowspan):
                    coloffsetstack[ii] += 1
            if len(tdfields) > max(cols) - coloffset:
                titlefield = tdfields[cols[0] - coloffset]
                directorfield = tdfields[cols[1] - coloffset] if cols[1] > 0 else ''
                authorfield = tdfields[cols[2] - coloffset] if cols[2] > 0 else ''
                datefield = tdfields[cols[3] - coloffset]
                title_link = titlefield.find("a")
                title = titlefield.get_text().strip(" \"")
                datetext = datefield.get_text().strip()
                author = "Story by: " + get_author(authorfield) if cols[2] > 0 else ''
                director = "Directed by: " + directorfield.get_text() if cols[1] > 0 else ''
                date = get_date(datetext)
                # find episode number-in-season
                if not coloffset:
                    if singleseason:
                        epfield = row.find("th").get_text()
                    else:
                        epfield = row.find("td").get_text()
                    for match in re.findall(r"(\d+)([a-z]*)", epfield):
                        nepid, sepid = int(match[0]), match[1]
                        if singleseason:
                            sid = prefix + "%02d%s" % (nepid, sepid)
                        else:
                            sid = prefix + "%sx%02d%s" % (
                                idx - table_offset + season_offsset, nepid, sepid)
                        sids.append(sid)
                        if isterse and title_link:
                            suffix = title_link.get("href").split("/")[-1].strip()
                            info = fetch_info(suffix)
                            description = info['extract']
                        titlestack.append((sid, title, director, author, date))
                        print("ADD", titlestack[-1])
            else:
                descriptionfield = row.find("td", class_="description")
                if descriptionfield:
                    description = descriptionfield.get_text().strip()
            if description and sids:
                if descriptionfield:
                    desclist = get_descriptions(descriptionfield)
                else:
                    desclist = [description]
                numstories = min(len(desclist), len(titlestack))
                for description in desclist:
                    if not titlestack:
                        break
                    print("POP", titlestack[0])
                    sid, title, director, author, date = titlestack.popleft()
                    sidcounter[sid] += 1
                    if numstories > 1:
                        sid += chr(ord("a") + sidcounter[sid] - 1)
                    description = description.strip()
                    if author or director:
                        description = description + "\n\n"
                        if director:
                            description += director.strip(".")
                            if author:
                                description += ". "
                        if author:
                            description += author.strip(".")
                        description += ".\n"
                    description = REFSTRIPPER.sub(u"", description)
                    yield webobject.Story(
                        name=sid,
                        title=title,
                        description=description,
                        date=date,
                    )
                sids = []
                description = ''

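def demo_find_episodes_st1():
    # Hedged usage sketch (URL and prefix are hypothetical): scrape the episode
    # tables of a "list of episodes" page whose first table is season 1. With
    # isterse=True, descriptions come from each episode's page summary rather
    # than from trailing description rows.
    url = "https://en.wikipedia.org/wiki/..."  # hypothetical episode-list page
    for story in find_episodes_st1(url, season_offsset=1, prefix="demo",
                                   cols=(1, 3, 4, 6), isterse=True):
        print(story.name, story.title, story.date)
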
def find_episodes_st1(url, season_offsset, prefix, tableclass="wikitable",
                      cols=(1, 3, 4, 6), isterse=False, table_offset=0):
    """
    :param url:
    :param season_offsset:
    :param prefix:
    :param tableclass:
    :param cols: index of columns (title, director, author, date); counts only
        <td> and not <th>.
    :param isterse: By default we expect every other row in the list to contain
        a description that goes along with the preceding row's information. If
        the description row is not present, set this flag.
    :return:
    """
    data = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(data, "html.parser")
    sidcounter = defaultdict(int)
    descriptionfield = None
    for idx, table in enumerate(soup.find_all("table", class_=tableclass)):
        if idx < table_offset:
            continue
        sids = []
        description = None
        titlestack = deque()
        coloffsetstack = deque()
        for row in table.find_all("tr"):
            tdfields = row.find_all("td")
            # no rowspan columns may fall between the indexes given in "cols",
            # or things will break
            coloffset = coloffsetstack.popleft() if coloffsetstack else 0
            for td in tdfields[:min(cols)]:
                rowspan = int(td.attrs.get('rowspan', 0))
                while len(coloffsetstack) < rowspan:
                    coloffsetstack.append(0)
                for ii in range(rowspan):
                    coloffsetstack[ii] += 1
            if len(tdfields) > max(cols) - coloffset:
                titlefield = tdfields[cols[0] - coloffset]
                directorfield = tdfields[cols[1] - coloffset]
                authorfield = tdfields[cols[2] - coloffset]
                datefield = tdfields[cols[3] - coloffset]
                title_link = titlefield.find("a")
                title = titlefield.get_text().strip(" \"")
                datetext = datefield.get_text().strip()
                date = None
                for regex in [
                        r"(\d{4}-\d{2}-\d{2})",   # yyyy-mm-dd
                        r"(\w+ \d{1,2}, \d{4})",  # MM dd, yyyy
                        r"(\d{1,2} \w+ \d{4})",   # dd MM yyyy
                ]:
                    try:
                        date1 = re.search(regex, datetext).group(0)
                        date = parser.parse(date1).strftime("%Y-%m-%d")
                        break
                    except AttributeError:  # no regex match
                        pass
                if not coloffset:
                    epfield = row.find("td").get_text()
                    author = "Story by: " + get_author(authorfield)
                    director = "Directed by: " + directorfield.get_text()
                    for match in re.findall(r"(\d+)([a-z]*)", epfield):
                        nepid, sepid = int(match[0]), match[1]
                        sid = prefix + "%sx%02d%s" % (
                            idx - table_offset + season_offsset, nepid, sepid)
                        sids.append(sid)
                        if isterse and title_link:
                            suffix = title_link.get("href").split("/")[-1].strip()
                            info = fetch_info(suffix)
                            description = info['extract']
                        titlestack.append((sid, title, director, author, date))
                        print("ADD", titlestack[-1])
            else:
                descriptionfield = row.find("td", class_="description")
                if descriptionfield:
                    description = descriptionfield.get_text().strip()
            if description and sids:
                if descriptionfield:
                    desclist = get_descriptions(descriptionfield)
                else:
                    desclist = [description]
                numstories = min(len(desclist), len(titlestack))
                for description in desclist:
                    if not titlestack:
                        break
                    print("POP", titlestack[0])
                    sid, title, director, author, date = titlestack.popleft()
                    sidcounter[sid] += 1
                    if numstories > 1:
                        sid += chr(ord("a") + sidcounter[sid] - 1)
                    description = str(description).strip()
                    description = description + "\n\n" + director.strip(
                        ".") + ". " + author.strip(".") + ".\n"
                    description = REFSTRIPPER.sub(u"", description)
                    yield webobject.Story(
                        name=sid,
                        title=title,
                        description=description,
                        date=date,
                    )
                sids = []
                description = ''

def read_stories_from_fieldcollection(fieldcollection, verbose=True,
                                      addextras=False, globcollection=False,
                                      strict=True):
    """
    Stories in our special text file format.
    """
    out = {}
    out_composites = defaultdict(lambda: defaultdict(str))
    field_map = {
        "title": "title",
        "release date": "date",
        "description": "description",
        "date": "date",
        "collections": "collections",
        "component stories": "components",
    }
    composite_fields = {
        "description": "description",
        "references": "references",
        "collections": "collections",
    }
    recognized_fields = set([
        "authors",
        "choice themes",
        "genre",
        "major themes",
        "minor themes",
        "not themes",
        "notes",
        "other keywords",
        "ratings",
        "related stories",
        "aliases",
    ] + list(composite_fields) + list(field_map))
    global_collections = []

    for sid, field, data in fieldcollection:
        # is this a "collection" that applies to all stories in this file?
        if globcollection:
            if field.lower() == "collections" and sid in data:
                for d in data:
                    if d not in global_collections:
                        global_collections.append(d)
        if sid not in out:
            out[sid] = webobject.Story(name=sid, meta={})
        obj = out[sid]
        lfield = field.strip().lower()
        attr = field_map.get(lfield, "")
        if lfield in composite_fields:
            out_composites[sid][lfield] = '\n'.join(data)
        if attr and attr in obj.fields:
            setattr(obj, attr, data[0])
        elif addextras:
            exattr = lfield.replace(" ", "")
            setattr(obj, exattr, '\n'.join(data))
            obj.extra_fields += (exattr,)
        elif lfield == "ratings":
            numbers = [int(s) for s in re.findall(r"\d+", ' '.join(data), re.DOTALL)]
            numbers = [min(5, max(0, n)) for n in numbers]
            count = float(len(numbers))
            if count > 0:
                mean = sum(numbers) / count
                stddev = (sum((x - mean)**2 for x in numbers) / count)**0.5
                obj.meta['rating'] = u'%.2f \u00B1 %.2f' % (mean, stddev)
            else:
                obj.meta['rating'] = 'n/a'
        elif lfield in recognized_fields:
            # recognized, so don't warn even though we're not adding them
            pass
        else:
            if verbose:
                lib.log.warn("%s: %s.%s - don't grok", verbose, sid, field)

    for sid in sorted(out):
        obj = out[sid]
        description = getattr(obj, "description", "")
        references = out_composites[sid]["references"].strip()
        collections = out_composites[sid]["collections"].strip()
        if references:
            description += "\n\nReferences:\n"
            for line in references.split("\n"):
                line = line.strip()
                if line:
                    description += line + "\n"
        clist = list(global_collections)
        for c in collections.split("\n"):
            if c and c not in clist:
                clist.append(c)
        clist = [c.strip() for c in clist if c.strip()]
        obj.description = description
        obj.collections = "\n".join(clist)
        try:
            if strict:
                obj.test_fields()
            yield obj
        except ValueError as e:
            if verbose:
                lib.log.warn("%s: %s - %s", verbose, sid, str(e))

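def demo_read_stories_from_fieldcollection():
    # Hedged usage sketch: fieldcollection is an iterable of
    # (sid, field name, data lines) triples; the triples below are made up for
    # illustration. The "Ratings" field becomes meta['rating'] as mean plus or
    # minus stddev, e.g. "4 5 3" -> "4.00 \u00B1 0.82".
    fieldcollection = [
        ("movie: Rashomon (1950)", "Title", ["Rashomon"]),
        ("movie: Rashomon (1950)", "Release Date", ["1950-08-25"]),
        ("movie: Rashomon (1950)", "Description", ["A classic film."]),
        ("movie: Rashomon (1950)", "Ratings", ["4 5 3"]),
        ("movie: Rashomon (1950)", "Collections", ["Collection: Akira Kurosawa"]),
    ]
    for story in read_stories_from_fieldcollection(fieldcollection,
                                                   verbose=False, strict=False):
        print(story.name, story.meta.get('rating'), story.collections)
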