示例#1
0
def extract_links_from_swf(file):
    """Collect the distinct URL-like texts found in a SWF file.

    Every tag of the parsed movie is handed to ``analyze_tag``; each text it
    yields is kept when ``looks_like_an_url`` accepts it.

    :param file: the SWF input, passed as-is to ``swfparser.SWFParser``.
    :return: a list with the unique accepted texts, in no particular order.
    """
    parsed = swfparser.SWFParser(file)
    # a set drops the duplicates that show up across different tags
    found = {
        candidate
        for swf_tag in parsed.tags
        for candidate in analyze_tag(swf_tag)
        if looks_like_an_url(candidate)
    }
    return list(found)
示例#2
0
def scrap(fh, custom_order=None):
    """Get useful info from a program.

    Parse the SWF read from ``fh``, collect its JPEG images and the texts
    referenced by the first control tag of the last ``DefineSprite``, and
    build one ``Episode`` per entry whose date could be fixed.

    :param fh: the SWF input, passed as-is to ``swfparser.SWFParser``.
    :param custom_order: optional sequence of (already fixed) names. When
        given, the position of an entry's name in it selects the image, and
        entries whose name is not listed are skipped. When ``None``, the
        n-th entry uses the n-th image.
    :return: a list of ``Episode`` objects; an empty list when the file has
        no constant pool and fewer than three images.
    :raises ValueError: if no ``DefineSprite`` tag exists, or if there is no
        ``ActionConstantPool`` although three or more images were found.
    """
    swf = swfparser.SWFParser(fh)

    # get the images
    base = None
    images = []
    for tag in swf.tags:
        if tag.name == 'JPEGTables':
            base = tag.JPEGData
        elif tag.name == 'DefineBits':
            images.append((tag.CharacterID, tag.JPEGData))
        elif tag.name == 'DefineBitsJPEG2':
            images.append((tag.CharacterID, tag.ImageData))

    # highest CharacterID first; the JPEGTables data is prepended only when
    # the file actually has one (otherwise ``None + data`` would blow up)
    ordered = sorted(images, reverse=True)
    if base is None:
        images = [data for _, data in ordered]
    else:
        images = [base + data for _, data in ordered]

    # get the last DefineSprite
    defsprite = None
    for tag in swf.tags:
        if tag.name == 'DefineSprite':
            defsprite = tag
    # check the search result itself, not the loop variable; an explicit
    # raise also keeps the check alive when running with -O
    if defsprite is None:
        raise ValueError("DefineSprite not found")

    # get the actions
    doaction = defsprite.ControlTags[0]
    for act in doaction.Actions:
        if act.name == 'ActionConstantPool':
            break
    else:
        if len(images) < 3:
            # not enough images and no constant pool: a non-programs swf!
            return []

        raise ValueError("No ActionConstantPool found!")

    # do some magic to retrieve the texts: ask for the keys of entry 1, 2,
    # 3... and stop at the first index for which the extractor gives None
    cpe = _ConstantPoolExtractor(act.ConstantPool, doaction.Actions)
    i = 0
    all_vals = []
    while True:
        i += 1
        name = 'titulo%d1' % i
        occup = 'titulo%d2' % i
        bio = 'htmlText'
        date = 'titulo%d3' % i
        vals = cpe.get(name, occup, bio, date)
        if vals is None:
            break
        all_vals.append((vals[name], vals[occup], vals[bio], vals[date]))

    items = []
    for i, (name, occup, bio, date) in enumerate(all_vals):
        date = _fix_date(date)
        if date is None:
            # entries without a usable date are dropped
            continue
        occup = _fix_occup(occup)
        bio = _fix_bio(bio)
        name = _fix_name(name)

        # use the corresponding image, or through the custom order
        if custom_order is None:
            idx = i
        else:
            try:
                idx = custom_order.index(name)
            except ValueError:
                # name not present in the custom order: skip this entry
                continue

        image = images[idx]

        ep = Episode(name=name, occup=occup, bio=bio, image=image, date=date)
        items.append(ep)
    return items