示例#1
0
def get_html(url):
    """Fetch ``url`` with a GET request and return the response data as text.

    The response data is decoded with the charset that
    ``get_charset_from_content_type`` derives from the response's
    ``content-type`` header.

    Raises:
        internet_except: if the response's ``reason`` is not ``'ok'``.
    """
    response = http_do(url, 'GET')
    if response.reason != 'ok':
        raise internet_except('get_html fail: url = {}'.format(url))

    content_type = response.head['content-type']
    charset = get_charset_from_content_type(content_type)
    return response.data.decode(charset)
示例#2
0
def format_html(html_txt):
    """Feed ``html_txt`` to a ``format_HTMLParser`` and return its text output.

    After parsing, the parser's ``txt`` attribute is treated as an iterable of
    iterables; every inner element is converted with ``str`` and all of them
    are concatenated into a single string.

    Raises:
        internet_except: if feeding the parser fails. The exception carries
            whatever the parser had accumulated in ``txt`` so far, and the
            original error is attached as its cause.
    """
    parser = format_HTMLParser()
    try:
        parser.feed(html_txt)
    except Exception as exc:
        # Only ordinary errors are wrapped; KeyboardInterrupt and SystemExit
        # must propagate unchanged instead of being reported as a parse
        # failure.
        raise internet_except(parser.txt) from exc
    return ''.join(str(piece) for pieces in parser.txt for piece in pieces)
示例#3
0
def download_image(fname, url):
    """Download ``url`` with a GET request and store the data at ``fname``.

    The request is issued with a timeout value of ``60 * 2`` and the response
    data is handed to ``write_bin`` together with ``fname``.

    Raises:
        Exception: if ``fname`` already exists (checked before any request).
        internet_except: if the response's ``reason`` is not ``'ok'``.
    """
    already_there = os.path.exists(fname)
    if already_there:
        raise Exception('{fname} already exists'.format(fname=fname))

    request_timeout = 60 * 2
    response = http_do(url, 'GET', timeout=request_timeout)
    if response.reason != 'ok':
        raise internet_except('http_do GET fail: url = {}'.format(url))

    write_bin(fname, response.data)
示例#4
0
def get_the_exact_only_one_data_from_tree_list_node(node):
    """Return the single string nested inside the last element of ``node``.

    ``node[-1]`` is expected to be either falsy (then ``''`` is returned) or
    a one-element container whose only element is a one-element container
    whose only element is a one-element container holding a ``str``.

    Raises:
        internet_except: built from ``node`` when ``node[-1]`` cannot be read,
            when any of the three nesting levels does not hold exactly one
            element, or when the innermost value is not a string.
    """
    try:
        payload = node[-1]
        if not payload:
            return ''
        # Three levels of "exactly one element" unpacking.
        (((data,),),) = payload
    except Exception as exc:
        raise internet_except(node) from exc

    # Explicit check rather than ``assert`` so the validation is still
    # performed when Python runs with -O.
    if not isinstance(data, str):
        raise internet_except(node)
    return data
示例#5
0
def _get_info_from_html_txt(html_txt, match_to_list_root, match_to_info,
                            get_data_from_tree_list_node):
    """Parse ``html_txt`` and lazily yield every matched item with its position.

    ``html_txt`` is fed to a ``list_HTMLParser`` built from
    ``match_to_list_root``; the resulting tree is passed to
    ``get_data_attrs_from_tree_list`` together with ``match_to_info`` and
    ``get_data_from_tree_list_node``. The result is iterated as a sequence of
    slots, each slot being paired element-wise with ``match_to_info``.

    Yields:
        ``(item, (slot_idx, pattern_idx, info_pattern))`` where ``item`` is an
        element of the matched list for that slot/pattern pair; per the
        original inline note, a ``(tag, attrs)`` or ``(tag, attrs, data)``
        tuple.

    Raises:
        internet_except: if iterating one of the matched lists fails (or an
            ordinary exception is thrown into the generator while it is
            suspended there). The offending slot and list are attached as
            keyword arguments and the original error as the cause.
    """
    parser = list_HTMLParser(match_to_list_root)
    parser.feed(html_txt)

    page_info = get_data_attrs_from_tree_list(parser.tree, match_to_info,
                                              get_data_from_tree_list_node)

    for slot_idx, one_matched_info_slot in enumerate(page_info):
        slot_with_patterns = zip(one_matched_info_slot, match_to_info)
        for pattern_idx, (matched_info_ls, info_pattern) in \
                enumerate(slot_with_patterns):
            try:
                for t2_or_t3 in matched_info_ls:
                    yield t2_or_t3, (slot_idx, pattern_idx, info_pattern)
            except StopIteration:
                # Deliberately not wrapped in internet_except.
                raise
            except Exception as exc:
                # GeneratorExit, KeyboardInterrupt and SystemExit are not
                # subclasses of Exception, so they propagate untouched.
                raise internet_except(
                    'error: "for t2_or_t3 in matched_info_ls:"',
                    one_matched_info_slot=one_matched_info_slot,
                    matched_info_ls=matched_info_ls) from exc
示例#6
0
def _max_tag_match_split_under_some_assumption(max_int, non_zero_int_list):
    n = max_int
    ils = non_zero_int_list
    m = len(ils)
    for i in non_zero_int_list:
        assert -n <= i <= n and i != 0

    ls = [0] * (n + 1)
    flags = [True] * (n + 1)

    # set flags
    for tag in ils:
        i = abs(tag)
        if not flags[i]: continue
        count = ls[i]
        if tag < 0:  #close
            if count == 0:
                flags[i] = False
            else:
                ls[i] -= 1
        else:  #open
            ls[i] += 1

    for i, count in enumerate(ls):
        if count: flags[i] = False

    # check consistence on the True tag #assume for simplify
    # build split_tags_ls
    # partial init match_map
    stack = []
    split_tags_ls = []
    split_idc_ls = []
    split_tags_stack = [([], [])]
    match_map = [None] * m

    def split_tags_pop_to():
        split_tags, split_idc = split_tags_stack.pop()
        if split_tags:
            split_tags_ls.append(split_tags)
            split_idc_ls.append(split_idc)

    for idx, tag in enumerate(ils):
        i = abs(tag)
        if not flags[i]:
            split_tags, split_idc = split_tags_stack[-1]
            split_tags.append(tag)
            split_idc.append(idx)
            continue

        if tag < 0:  #close
            if not stack or stack[-1][1] != i:
                raise internet_except('assumption fail', 'tag non-consistence',
                                      ils)
            else:
                last = idx
                first, _ = stack.pop()
                assert match_map[first] == match_map[last] == None
                match_map[first] = last
                match_map[last] = first
                split_tags_pop_to()
        else:  #open
            stack.append((idx, tag))
            split_tags_stack.append(([], []))

    split_tags_pop_to()
    assert not stack
    assert not split_tags_stack

    #utag2idx_buffer = flags
    for tags, idc in zip(split_tags_ls, split_idc_ls):
        #utags = (abs(tag) for tag in tags)
        #idx2utag = unify_with_integer_buffer(utags, utag2idx_buffer)
        #tags = tuple((1+utag2idx_buffer[abs(tag)])*sign(tag) for tag in tags)##### +1
        ex_match_map = _max_tag_match_dynamic(tags)
        for i, match in enumerate(ex_match_map):
            assert match_map[idc[i]] == None
            if match != None:
                match_map[idc[i]] = idc[match]

    return match_map

    for i in range(m):
        if match_map[i] == None:
            match_map[i] = -1

    return match_map