def main():
    """Fetch every configured category and write one XML file per category.

    For each ``(label, title)`` pair in ``CATEGORIES`` this requests the
    article list from ``ARTICLES_URL``, converts every listed item with
    ``get_item_elem``, writes the result to ``<CONTENT_PATH>/<label>.xml``
    and, once all categories are done, prints a JSON summary of how many
    ``item`` elements each category file contains.

    A failure while converting a single item is printed as a traceback and
    does not stop the run.
    """
    category_counts = {}
    for category_label, category_title in CATEGORIES:
        print('\n== %s ==' % category_label)
        xml = XML()
        xml.fn = os.path.join(CONTENT_PATH, '%s.xml' % category_label)
        items_elem = E.items()
        xml.root = E.category(
            E.title(category_title, Entity('#xA'),
                    {PSTYLEKEY: 'category-title'}),
            items_elem)

        list_url = ARTICLES_URL + "/List?category=%s&limit=%d" % (
            category_label, NUM_RECIPES)
        recipe_list = requests.get(list_url).json()
        # A response without an 'items' key is treated as an empty category.
        items = recipe_list.get('items') or []
        print(len(items), 'items')

        # enumerate() gives the true position even when two entries compare
        # equal, and avoids a linear list.index() scan per item.
        for position, item in enumerate(items):
            try:
                print(position, '\t', item.get('id'), '\t', item.get('title'))
                item_elem = get_item_elem(item)
                if item_elem is not None:
                    items_elem.append(item_elem)
            except Exception:
                # Best effort: report the problem and carry on with the next
                # item. Ctrl-C / SystemExit are deliberately not caught.
                print(traceback.format_exc())

        xml.write(pretty_print=True, canonicalized=False)
        category_counts[category_label] = len(xml.root.xpath("//item"))
    print(json.dumps(category_counts, indent=2))
def get_content_elem(item, parent_elem):
    """Build an element for one content entry, named after the entry's type.

    The element's tag and its ``PSTYLEKEY`` attribute are both the entry's
    ``'type'`` value. The entry's ``'text'`` (or an empty string) becomes the
    element text, followed by a ``#xA`` entity when the text is non-empty.
    Each entry in ``'elements'`` is converted with ``get_element`` and
    appended. ``parent_elem`` is accepted but not used here.
    """
    kind = item.get('type')
    node = E(kind, item.get('text') or '')

    has_text = node.text not in (None, '')
    if has_text:
        node.append(Entity("#xA"))

    node.set(PSTYLEKEY, kind)

    for child in item.get('elements') or ():
        node.append(get_element(child))
    return node
def get_content_elem(item, parent_elem):
    """Build a ``content`` element for one content entry of a section.

    The entry's ``'type'`` is stored as the ``type`` attribute and its
    ``'text'`` (or an empty string) as the element text, followed by a
    ``#xA`` entity when the text is non-empty. The ``pstylekey`` attribute is
    set to ``<parent tag>-<parent level>-<type>``. Each entry in
    ``'elements'`` is converted with ``get_element`` and appended.
    """
    node = E.content({'type': item.get('type')}, item.get('text') or '')

    has_text = node.text not in (None, '')
    if has_text:
        node.append(Entity("#xA"))

    # All three parts must be strings; a missing level or type raises
    # TypeError, as with plain string concatenation.
    node.set(
        pstylekey,
        '-'.join((parent_elem.tag, parent_elem.get('level'), node.get('type'))))

    for child in item.get('elements') or ():
        node.append(get_element(child))
    return node
def _meets_minimum(value, minimum):
    """Return True when no minimum is set or *value* is an int >= *minimum*."""
    return minimum is None or (isinstance(value, int) and value >= minimum)


def get_item_elem(item, only_with_images=True):
    """Build the ``item`` element for one article list entry.

    Fetches the article details, and - when the article has a usable image
    or ``only_with_images`` is false - its content sections plus a trailing
    "Source:" section linking back to the wiki page.

    Args:
        item: Mapping from the article list; must contain ``'id'``. Every
            key/value pair is copied (stringified) onto the element.
        only_with_images: When true (default), articles for which no image
            element could be produced are skipped.

    Returns:
        The populated ``item`` element, or ``None`` when the article is a
        category, a user blog entry, or was skipped for lack of an image.
    """
    attrib = {k: str(v) for k, v in item.items()}
    elem = E.item(**attrib)

    details_url = ARTICLES_URL + '/Details?ids=%(id)s' % attrib
    item_details = requests.get(details_url).json()['items'][attrib['id']]
    if item_details.get('type') == 'category' \
            or 'User_blog:' in item_details.get('url'):
        # don't include categories or user blog entries
        return None

    # Fixed iteration order keeps attribute order stable between runs.
    for key in ('id', 'title', 'type', 'ns'):
        if key in item_details:
            elem.set(key, str(item_details[key]))

    img_elem = None
    thumbnail = item_details.get('thumbnail')
    dimensions = item_details.get('original_dimensions')
    if thumbnail is not None and dimensions is not None:
        width = dimensions.get('width')
        height = dimensions.get('height')
        if _meets_minimum(height, MIN_IMAGE_HEIGHT) \
                and _meets_minimum(width, MIN_IMAGE_WIDTH):
            # Large enough to be used as a full image.
            img_elem = get_img_elem(thumbnail, MIN_IMAGE_WIDTH)
            if img_elem is not None:
                elem.append(img_elem)
        elif MIN_IMAGE_WIDTH is None \
                or _meets_minimum(width, MIN_IMAGE_WIDTH / 2):
            # Too small for a full image but at least half the minimum width:
            # keep it as a thumbnail. The None check avoids dividing None by
            # 2 when only a minimum height is configured.
            img_elem = get_img_elem(thumbnail, MIN_IMAGE_WIDTH)
            if img_elem is not None:
                img_elem.tag = 'thumb_img'
                elem.append(img_elem)

    # NOTE(review): the original layout was whitespace-collapsed; returning
    # None here (rather than a bare item element) is inferred from the
    # parameter name and the caller's "is not None" check - confirm.
    if img_elem is None and only_with_images:
        return None

    content_url = ARTICLES_URL + '/AsSimpleJson?id=%(id)s' % attrib
    item_content = requests.get(content_url).json()
    for section in item_content.get('sections') or []:
        elem.append(get_section_elem(section, details=item_details))

    url = WIKI_URL + item_details.get('url')
    elem.append(
        E.section(
            E.paragraph(
                {PSTYLEKEY: "source"},
                "Source: ",
                E.a(url.replace('http://', ''), Entity("#xA"), title=url))))
    return elem
def get_section_elem(item, details=None):
    """Build a ``section`` element from one section mapping.

    The section's ``'level'`` is stored (stringified) as the ``level``
    attribute. When a ``'title'`` is present, an ``h<level>`` heading
    element carrying the title, a ``#xA`` entity and a matching
    ``PSTYLEKEY`` style is appended first. Every entry of ``'content'`` is
    then converted with ``get_content_elem`` and appended.

    Args:
        item: Section mapping with ``'level'`` and optional ``'title'`` and
            ``'content'`` keys. A titled section needs an integer level.
        details: Accepted for the caller's convenience; not used here.

    Returns:
        The populated ``section`` element.
    """
    level = item.get('level')
    elem = E.section(level=str(level))

    title = item.get('title')
    if title is not None:
        heading = E("h%d" % level, title, Entity('#xA'))
        heading.set(PSTYLEKEY, 'h%s' % elem.get('level'))
        elem.append(heading)

    # A section without a 'content' list yields just the (optional) heading
    # instead of raising, matching how 'elements' is handled elsewhere.
    for content in item.get('content') or []:
        elem.append(get_content_elem(content, elem))
    return elem
def get_section_elem(item): elem = E.section( **{k: str(item[k]) for k in item.keys() if k in ['title', 'level']}) # if elem.get('level') is not None and int(elem.get('level')) > 2: # elem.set('level', '2') if item.get('title') is not None: e = E.title(item.get('title'), Entity('#xA')) e.set(pstylekey, 'section-%s-title' % elem.get('level')) elem.append(e) for image in item.get('images')[:2]: img = E.img({'href': image.get('src')}) md = re.search(r"(^.*\.(?:jpe?g|gif|png|tiff?))", img.get('href'), flags=re.I) if md is not None: url = md.group() result = requests.get(md.group()) basename = re.sub("%..", "+", os.path.basename(url)) i = Image(fn=os.path.join(image_path, basename), data=result.content) i.write() w, h, x, y = i.identify(format="%w,%h,%x,%y").split(',') print(w, h, x, y, os.path.basename(i.fn)) i.mogrify(density="150x150") w, h, x, y = i.identify(format="%w,%h,%x,%y").split(',') print(w, h, x, y, os.path.basename(i.fn)) img.set('href', "file://" + os.path.relpath(i.fn, content_path)) image_elem = E.image(img) image_elem.set(pstylekey, 'image') elem.append(image_elem) if image.get('caption') not in [None, '']: elem.append(E.caption(image.get('caption'), Entity('#xA'))) for content in item.get('content'): elem.append(get_content_elem(content, elem)) return elem
def get_element(item):
    """Recursively convert a nested element mapping into an ``element`` node.

    The node's text is the mapping's ``'text'`` (or an empty string). Its
    children are the converted entries of ``'elements'``, in order, followed
    by a trailing ``#xA`` entity.
    """
    text = item.get('text') or ''
    children = [get_element(child) for child in item.get('elements') or []]
    return E.element(text, *children, Entity('#xA'))
def entity(self, code):
    """Return the ``Entity`` built from *code*."""
    node = Entity(code)
    return node