Exemplo n.º 1
0
def main():
    """Download every configured category and write one XML file for each.

    For each ``(label, title)`` pair in ``CATEGORIES`` this requests the
    article list from ``ARTICLES_URL``, converts every listed item with
    ``get_item_elem`` and appends the non-``None`` results to an ``<items>``
    element inside a ``<category>`` root.  The document is written to
    ``CONTENT_PATH/<label>.xml``.  After all categories are processed, the
    number of ``item`` elements per category is printed as JSON.

    A failure while converting a single item is printed as a traceback and
    the item is skipped, so one bad article does not abort the whole run.
    """
    category_counts = {}
    for category_label, category_title in CATEGORIES:
        print('\n== %s ==' % category_label)
        xml = XML()
        xml.fn = os.path.join(CONTENT_PATH, '%s.xml' % category_label)
        items_elem = E.items()
        xml.root = E.category(
            E.title(category_title, Entity('#xA'),
                    {PSTYLEKEY: 'category-title'}), items_elem)
        list_url = ARTICLES_URL + "/List?category=%s&limit=%d" % (
            category_label, NUM_RECIPES)
        recipe_list = requests.get(list_url).json()
        # A response without an 'items' key is treated as an empty category
        # instead of crashing on len(None).
        items = recipe_list.get('items') or []
        print(len(items), 'items')
        # enumerate() reports the real position; list.index() was a linear
        # scan per item and returned the first match for duplicate entries.
        for position, item in enumerate(items):
            try:
                print(position, '\t', item.get('id'), '\t',
                      item.get('title'))
                item_elem = get_item_elem(item)
                if item_elem is not None:
                    items_elem.append(item_elem)
            except Exception:
                # Best effort per item: report and continue.  Catching
                # Exception (not a bare except) lets Ctrl-C and SystemExit
                # still stop the run.
                print(traceback.format_exc())
        xml.write(pretty_print=True, canonicalized=False)
        category_counts[category_label] = len(xml.root.xpath("//item"))
    print(json.dumps(category_counts, indent=2))
Exemplo n.º 2
0
def get_content_elem(item, parent_elem):
    """Convert one content entry into an element named after its type.

    The element's tag and its ``PSTYLEKEY`` attribute are both taken from
    ``item['type']``; its text is ``item['text']`` (empty string when
    missing or falsy).  When the text is non-empty an ``Entity("#xA")`` is
    appended after it.  Every entry of ``item['elements']`` is converted
    with ``get_element`` and appended as a child.

    ``parent_elem`` is accepted but not used by this variant.
    """
    kind = item.get('type')
    elem = E(kind, item.get('text') or '')

    has_text = elem.text is not None and elem.text != ''
    if has_text:
        elem.append(Entity("#xA"))

    elem.set(PSTYLEKEY, kind)

    children = item.get('elements') or []
    for child in children:
        elem.append(get_element(child))
    return elem
def get_content_elem(item, parent_elem):
    """Convert one content entry into a ``<content>`` element.

    The element carries ``item['type']`` as its ``type`` attribute and
    ``item['text']`` (empty string when missing or falsy) as its text; a
    non-empty text is followed by an ``Entity("#xA")``.  The ``pstylekey``
    attribute is set to ``<parent tag>-<parent level>-<type>``, built from
    ``parent_elem``'s tag and ``level`` attribute.  Every entry of
    ``item['elements']`` is converted with ``get_element`` and appended.
    """
    attrs = {'type': item.get('type')}
    body_text = item.get('text') or ''
    elem = E.content(attrs, body_text)

    has_text = elem.text is not None and elem.text != ''
    if has_text:
        elem.append(Entity("#xA"))

    # Style name is "<parent tag>-<parent level>-<content type>".
    style_name = parent_elem.tag + '-' + parent_elem.get('level')
    style_name = style_name + '-' + elem.get('type')
    elem.set(pstylekey, style_name)

    children = item.get('elements') or []
    for child in children:
        elem.append(get_element(child))
    return elem
Exemplo n.º 4
0
def get_item_elem(item, only_with_images=True):
    """Build the ``<item>`` element for one entry of an article listing.

    The listing entry's keys become attributes of the element.  The
    article's details are then fetched from ``ARTICLES_URL``; categories
    and user blog entries are rejected.  If the details carry a thumbnail
    whose original dimensions satisfy ``MIN_IMAGE_HEIGHT`` and
    ``MIN_IMAGE_WIDTH``, the element returned by ``get_img_elem`` is
    appended; if they only reach half of ``MIN_IMAGE_WIDTH`` it is
    appended with its tag changed to ``thumb_img``.  When an image was
    found (or ``only_with_images`` is false) the article's sections are
    fetched and appended, followed by a closing "Source:" section that
    links to the article.

    Args:
        item: Mapping describing the article in the listing; must contain
            ``'id'``.
        only_with_images: When true (the default), articles without a
            usable image are skipped.

    Returns:
        The populated element, or ``None`` when the article is a category,
        a user blog entry, or has no usable image while
        ``only_with_images`` is true.
    """
    attrib = {key: str(value) for key, value in item.items()}
    elem = E.item(**attrib)

    details_url = ARTICLES_URL + '/Details?ids=%(id)s' % attrib
    item_details = requests.get(details_url).json()['items'][attrib['id']]

    # A missing/None url is treated as empty so neither the membership test
    # here nor the source link further down raises TypeError.
    item_url = item_details.get('url') or ''
    # don't include categories or user blog entries
    if item_details.get('type') == 'category' or 'User_blog:' in item_url:
        return None

    # A fixed tuple (instead of a set intersection) keeps the attribute
    # order identical from run to run.
    for key in ('id', 'title', 'type', 'ns'):
        if key in item_details:
            elem.set(key, str(item_details[key]))

    img_elem = None
    thumbnail = item_details.get('thumbnail')
    dimensions = item_details.get('original_dimensions')

    if thumbnail is not None and dimensions is not None:
        height = dimensions.get('height')
        width = dimensions.get('width')

        # A limit of None means "no constraint" for that dimension.
        tall_enough = MIN_IMAGE_HEIGHT is None or (
            isinstance(height, int) and height >= MIN_IMAGE_HEIGHT)
        wide_enough = MIN_IMAGE_WIDTH is None or (
            isinstance(width, int) and width >= MIN_IMAGE_WIDTH)

        if tall_enough and wide_enough:
            img_elem = get_img_elem(thumbnail, MIN_IMAGE_WIDTH)
            if img_elem is not None:
                elem.append(img_elem)
        elif (MIN_IMAGE_WIDTH is not None
              and isinstance(width, int)
              and width >= MIN_IMAGE_WIDTH / 2):
            # Too small for a full image but at least half the minimum
            # width: keep it as a thumbnail.  The explicit None check
            # avoids dividing None by 2 when only a height limit is set.
            img_elem = get_img_elem(thumbnail, MIN_IMAGE_WIDTH)
            if img_elem is not None:
                img_elem.tag = 'thumb_img'
                elem.append(img_elem)

    if img_elem is None and only_with_images:
        return None

    content_url = ARTICLES_URL + '/AsSimpleJson?id=%(id)s' % attrib
    item_content = requests.get(content_url).json()

    # A response without 'sections' simply contributes no section elements.
    for section in item_content.get('sections') or []:
        elem.append(get_section_elem(section, details=item_details))

    url = WIKI_URL + item_url
    elem.append(
        E.section(
            E.paragraph({PSTYLEKEY: "source"}, "Source: ",
                        E.a(url.replace('http://', ''),
                            Entity("#xA"),
                            title=url))))

    return elem
Exemplo n.º 5
0
def get_section_elem(item, details=None):
    """Convert one article section into a ``<section>`` element.

    The section's ``level`` is stored as a string attribute.  When the
    section has a title, a heading element named ``h<level>`` is added
    first, holding the title followed by ``Entity('#xA')`` and tagged with
    the matching ``PSTYLEKEY`` value.  Each entry of ``item['content']`` is
    then converted with ``get_content_elem`` and appended.

    ``details`` is accepted but not used by this variant.
    """
    level = item.get('level')
    elem = E.section(level=str(level))

    title = item.get('title')
    if title is not None:
        heading = E("h%d" % level, title, Entity('#xA'))
        heading.set(PSTYLEKEY, 'h%s' % elem.get('level'))
        elem.append(heading)

    for block in item.get('content'):
        elem.append(get_content_elem(block, elem))

    return elem
def get_section_elem(item):
    """Convert one article section, including up to two images, to XML.

    The section's ``title`` and ``level`` (when present) become string
    attributes of the ``<section>`` element.  A title additionally yields
    a ``<title>`` child styled ``section-<level>-title``.  For each of the
    first two entries of ``item['images']`` whose ``src`` looks like an
    image URL (jpg/jpeg/gif/png/tif/tiff), the file is downloaded into
    ``image_path``, set to a density of 150x150, and referenced from an
    ``<image>`` child via a ``file://`` path relative to ``content_path``;
    a non-empty caption follows as a ``<caption>`` child.  Finally each
    entry of ``item['content']`` is converted with ``get_content_elem``.

    Sections without an ``images`` or ``content`` key are handled as if
    the respective list were empty.
    """
    elem = E.section(
        **{key: str(value)
           for key, value in item.items() if key in ('title', 'level')})

    if item.get('title') is not None:
        title_elem = E.title(item.get('title'), Entity('#xA'))
        title_elem.set(pstylekey, 'section-%s-title' % elem.get('level'))
        elem.append(title_elem)

    # `or []` guards against a missing/None list, which previously raised
    # TypeError on the slice.
    for image in (item.get('images') or [])[:2]:
        img = E.img({'href': image.get('src')})
        # Keep everything up to (and including) the image file extension,
        # dropping whatever follows it in the source URL.
        md = re.search(r"(^.*\.(?:jpe?g|gif|png|tiff?))",
                       img.get('href'),
                       flags=re.I)
        if md is None:
            continue

        url = md.group()
        result = requests.get(url)
        # Percent-escapes in the file name are replaced with '+'.
        basename = re.sub("%..", "+", os.path.basename(url))
        image_file = Image(fn=os.path.join(image_path, basename),
                           data=result.content)
        image_file.write()

        # Report the image geometry before and after changing the density.
        w, h, x, y = image_file.identify(format="%w,%h,%x,%y").split(',')
        print(w, h, x, y, os.path.basename(image_file.fn))
        image_file.mogrify(density="150x150")
        w, h, x, y = image_file.identify(format="%w,%h,%x,%y").split(',')
        print(w, h, x, y, os.path.basename(image_file.fn))

        img.set('href',
                "file://" + os.path.relpath(image_file.fn, content_path))
        image_elem = E.image(img)
        image_elem.set(pstylekey, 'image')
        elem.append(image_elem)
        if image.get('caption') not in [None, '']:
            elem.append(E.caption(image.get('caption'), Entity('#xA')))

    for content in item.get('content') or []:
        elem.append(get_content_elem(content, elem))
    return elem
Exemplo n.º 7
0
def get_element(item):
    """Recursively convert a nested entry into an ``<element>`` tree.

    The element's text is ``item['text']`` (empty string when missing or
    falsy).  Each entry of ``item['elements']`` is converted by a recursive
    call and added as a child, and an ``Entity('#xA')`` is placed after the
    children.
    """
    text = item.get('text') or ''
    children = [get_element(child) for child in item.get('elements') or []]
    return E.element(text, *children, Entity('#xA'))
Exemplo n.º 8
0
 def entity(self, code):
     """Return ``Entity(code)``.

     Thin wrapper that forwards ``code`` unchanged to ``Entity`` and does
     not read any instance state.
     NOTE(review): ``Entity`` is presumably lxml's entity-reference
     factory and ``code`` a reference such as ``'#xA'`` -- confirm against
     this module's imports.
     """
     return Entity(code)