Example #1
def scrape(input_cfg=None):
    cfg = jtutils.process_cfg(input_cfg, parser(), internal_args())
    if cfg["infile"]:
        with open(cfg["infile"]) as f_in:
            soup = jtutils.html_to_soup(f_in.read())
    elif cfg["html"]:
        soup = jtutils.html_to_soup(cfg["html"])
    elif cfg["url"]:
        soup = jtutils.url_to_soup(cfg["url"], cfg["js"], None, cfg["cookies"],
                                   cfg["headers"], cfg["params"])
    else:
        raise Exception("No input given: expected one of 'infile', 'html' or 'url' in the config")

    return scrape_soup(soup, cfg)
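
A minimal usage sketch (not from the source): it assumes scrape() is importable from this module, that process_cfg() accepts a plain dict as input_cfg, and that scrape_soup() returns the extracted result. The URL and option values below are placeholders, and every key listed is one the function reads above.

result = scrape({
    "url": "https://example.com/page.html",  # placeholder URL
    "js": False,        # assumed to be a JavaScript-rendering flag for url_to_soup()
    "cookies": None,
    "headers": None,
    "params": None,
    "infile": None,     # unused input branches left empty
    "html": None,
})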
Example #2
def pcsv(input_cfg=None):
    cfg = process_cfg(input_cfg, parser(), internal_args())

    if input_cfg and not cfg["input"] and not cfg["infile"]:
        raise Exception("Couldn't find input for pawk")
    if sys.stdin.isatty() and (not cfg["input"]) and (not cfg["infile"]):
        sys.stderr.write(
            "WARNING: pcsv using /dev/stdin as default input file (-f) but nothing seems to be piped in..."
            + "\n")

    # for non-commandline use, capture sys.stdout so the output can be returned
    backup = None
    if input_cfg:  # not running from the pcsv command-line script
        backup = sys.stdout
        sys.stdout = six.StringIO()

    if cfg["input"]:
        f_in = six.StringIO(cfg["input"])
    elif not cfg["infile"]:
        f_in = sys.stdin
    else:
        if sys.version_info[0] >= 3:
            f_in = open(cfg["infile"],
                        errors='ignore')  #don't crash on invalid unicode
        else:
            f_in = open(cfg["infile"])
    if cfg["delimiter"] == "TAB":
        cfg["delimiter"] = '\t'
    elif cfg["delimiter"] == "\\t":
        cfg["delimiter"] = '\t'

    keep_list = process_cut_csv(cfg["keep_list"])
    drop_list = process_cut_csv(cfg["drop_list"])

    in_hdr = None
    out_hdr = None
    has_exceptions = False
    has_printed_incomplete_line = False
    # do_write = process_code and ("print" in process_code or "write_line" in process_code)

    begin_code = None
    process_code = None
    end_code = None
    grep_code = None

    if cfg["begin_code"]:
        _check_is_list(cfg, "begin_code")
        begin_code = [pindent(code) for code in cfg["begin_code"]]
        begin_code = [compile(code, '', 'exec') for code in cfg["begin_code"]]
    if cfg["grep_code"]:
        grep_code = pindent(cfg["grep_code"])
        #preprocess /.*/ syntax
        grep_code = gen_grep_code(grep_code)
        grep_code = compile(grep_code, '', 'eval')
    if cfg["process_code"]:
        _check_is_list(cfg, "process_code")
        process_code = [pindent(code) for code in cfg["process_code"]]
        process_code = [compile(code, '', 'exec') for code in process_code]
    if cfg["end_code"]:
        _check_is_list(cfg, "end_code")
        end_code = [pindent(code) for code in cfg["end_code"]]
        end_code = [compile(code, '', 'exec') for code in end_code]

    if begin_code:
        for code in begin_code:
            exec(code)

    if cfg["set"]:
        s = set(l.strip() for l in open(cfg["set"]))

    #main iteration loop
    for i, (l, _csvlist) in enumerate(
            csv_row_and_raw(f_in, delimiter=cfg["delimiter"])):
        is_header_line = (i == 0 and not cfg["no_header"])
        if not in_hdr and cfg["no_header"]:
            #create a dummy header from the length of the line
            in_hdr = ["X" + str(j) for j, _ in enumerate(_csvlist)]
            hdrhash = dict((jx, j) for j, jx in enumerate(in_hdr))
            # IndexDict can be accessed by string or index (all keys must be strings)
            r = IndexDict(hdrhash, _csvlist)
        elif not in_hdr:
            #read in the header
            in_hdr = _csvlist[:]
            if len(in_hdr) != len(set(in_hdr)):
                sys.stderr.write(
                    "WARNING: duplicated header columns. Using dummy header instead"
                    + '\n')
                #create a dummy header from the length of the line
                in_hdr = rename_duplicate_header(_csvlist)
            hdrhash = dict((jx, j) for j, jx in enumerate(in_hdr))
            if not _csvlist:
                _csvlist = [''] * len(in_hdr)
            # IndexDict can be accessed by string or index (all keys must be strings)
            r = IndexDict(hdrhash, _csvlist)
            if cfg["no_print"]:  #TODO: what's this block for?
                for code in process_code:
                    exec(code)
            continue  #_csvlist is the header, don't continue to process as row
        else:
            #setup for regular rows
            if len(_csvlist) != len(in_hdr):
                if cfg["fix"]:
                    sys.stdout.write(l + "\n")
                    continue
                elif cfg["autofix"]:
                    continue
                elif not has_printed_incomplete_line:
                    raise Exception(
                        "ERROR: line length not equal to header length. Try running pcsv.py --fix or pcsv.py --autofix"
                    )
            if not _csvlist:
                _csvlist = [''] * len(in_hdr)
            # IndexDict can be accessed by string or index (all keys must be strings)
            r = IndexDict(hdrhash, _csvlist)

        #run process and grep code
        try:
            if grep_code and not is_header_line and not eval(grep_code):
                continue

            if process_code and (not is_header_line):
                for code in process_code:
                    exec(code)
        except:
            if not cfg["exceptions_allowed"]:
                raise
            else:
                if not has_exceptions:
                    sys.stderr.write("WARNING: exception" + '\n')
                    has_exceptions = True
                continue

        #print header after processing the first row
        #(this allows auto adding of new columns)
        #like new in this case -p 'r["new"] = 2 * float(r["old"])'
        if not cfg["no_print"] and not out_hdr:
            out_hdr = print_header(in_hdr, r, keep_list, drop_list)

        #print line
        if cfg["fix"] or cfg["no_print"]:
            pass
        else:
            rout = [str(r[h]) for h in out_hdr]
            write_line(rout)

    #print header if not printed yet
    #eg file has only a header and no rows
    if not cfg["no_print"] and not out_hdr:
        out_hdr = print_header(in_hdr, r, keep_list, drop_list)

    if end_code:
        for code in end_code:
            exec(code)
    # restore sys.stdout and return the captured output
    if input_cfg:  # not running from the command-line script
        out = sys.stdout.getvalue()
        sys.stdout = backup
        return out
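
A comparable sketch for calling pcsv() programmatically, under the assumption that process_cfg() merges this dict with the argparse defaults for the options not listed (delimiter, keep_list, no_header, and so on). The column names and the process_code snippet are illustrative; adding a derived column through r[...] mirrors the pattern described in the loop's own comments above.

csv_text = "a,b\n1,2\n3,4\n"
out = pcsv({
    "input": csv_text,
    # r behaves like a dict keyed by header name (or column index), so the
    # snippet below adds a derived column that print_header() then picks up
    "process_code": ['r["sum"] = float(r["a"]) + float(r["b"])'],
})
print(out)  # expected: the header plus each row with the new "sum" column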
Example #3
def pawk(input_cfg=None):
    cfg = process_cfg(input_cfg, parser(), internal_args())
    if input_cfg and not cfg["input"] and not cfg["infile"]:
        raise Exception("Couldn't find input for pawk")
    if sys.stdin.isatty() and (not cfg["input"]) and (not cfg["infile"]):
        sys.stderr.write(
            "WARNING: pawk using /dev/stdin as default input file (-f) but nothing seems to be piped in..."
            + "\n")

    # for non-commandline use, capture sys.stdout so the output can be returned
    backup = None
    if input_cfg:  # not running from the pawk command-line script
        backup = sys.stdout
        sys.stdout = six.StringIO()

    if cfg["input"]:
        f_in = six.StringIO(cfg["input"])
    elif not cfg["infile"]:
        f_in = sys.stdin
    else:
        if sys.version_info[0] >= 3:
            f_in = open(cfg["infile"],
                        errors='ignore')  #don't crash on invalid unicode
        else:
            f_in = open(cfg["infile"])
    if cfg["delimiter"] == "TAB":
        cfg["delimiter"] = '\t'
    elif cfg["delimiter"] == "\\t":
        cfg["delimiter"] = '\t'

    hdr = None
    has_exceptions = False
    has_printed_incomplete_line = False
    #jtrigg@20160102 try out only writing when there's no -p option
    # do_write = cfg["process_code"] and ("print" in cfg["process_code"] or "write_line" in cfg["process_code"])
    do_write = cfg["process_code"]
    if cfg["set"]:
        s = set(l.strip() for l in open(cfg["set"]))

    begin_code = None
    process_code = None
    end_code = None
    grep_code = None

    if cfg["begin_code"]:
        _check_is_list(cfg, "begin_code")
        begin_code = [pyindent(c) for c in cfg["begin_code"]]
        begin_code = [compile(code, '', 'exec') for code in begin_code]
    if cfg["grep_code"]:
        if isinstance(cfg["grep_code"], list):
            raise Exception("grep_code can't be list")
        #preprocess /.*/ syntax
        grep_code = gen_grep_code(cfg["grep_code"])
        grep_code = pyindent(grep_code)
        grep_code = compile(grep_code, '', 'eval')
    if cfg["process_code"]:
        _check_is_list(cfg, "process_code")
        process_code = [pyindent(c) for c in cfg["process_code"]]
        process_code = [compile(code, '', 'exec') for code in process_code]
    if cfg["end_code"]:
        _check_is_list(cfg, "end_code")
        end_code = [pyindent(c) for c in cfg["end_code"]]
        end_code = [compile(code, '', 'exec') for code in end_code]
    if begin_code:
        for code in begin_code:
            #NOTE: this code appears in a couple places
            #but it breaks if it's wrapped in a function because exec()
            #uses the existing environment
            try:
                exec(code)
            except:
                if backup:
                    sys.stdout = backup
                raise

    for i, (l, _csvlist) in enumerate(
            csvlist_and_raw(f_in, cfg["delimiter"],
                            multiline=cfg["multiline"])):
        r = _csvlist  # r is the list of fields for the current line
        try:
            if grep_code:
                try:
                    if not eval(grep_code):
                        continue
                except:
                    if backup:
                        sys.stdout = backup
                    raise
            if process_code:
                for code in process_code:
                    try:
                        exec(code)
                    except:
                        if backup:
                            sys.stdout = backup
                        raise
        except:
            if not cfg["exceptions_allowed"]:
                raise
            else:
                if not has_exceptions:
                    sys.stderr.write("WARNING: exception" + '\n')
                    has_exceptions = True
                continue
        if not do_write:
            # no process code was given, so echo the (possibly grep-filtered) row
            write_line(r, cfg["output_delimiter"])

    if end_code:
        for code in end_code:
            try:
                exec(code)
            except:
                if backup:
                    sys.stdout = backup
                raise

    # restore sys.stdout and return the captured output
    if input_cfg:  # not running from the pawk command-line script
        out = sys.stdout.getvalue()
        sys.stdout = backup
        return out
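
A corresponding sketch for pawk(), again assuming process_cfg() fills the options not given here from the parser defaults (multiline, output_delimiter, and so on). The input text and delimiter are placeholders. Because a process_code snippet is supplied, the automatic write_line() echo is skipped, so the snippet prints its own output, which is captured and returned since input_cfg is passed.

out = pawk({
    "input": "x 1\ny 2\n",
    "delimiter": " ",  # assumed to be passed straight through to the csv reader
    # r is the field list for the current line; print our own output because
    # supplying process_code disables the default write_line() echo
    "process_code": ['print(r[0] + "," + str(int(r[1]) * 10))'],
})
print(out)  # expected captured output: "x,10" and "y,20" on separate lines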