예제 #1
0
def download_file_stream(url: str, file_stream: IO, block_size: int = 1024 * 8, with_progress_bar: bool = True, fatal: bool = True) -> bool:
    """Stream the contents of ``url`` into an already-open ``file_stream``.

    :param url: URL to download.
    :param file_stream: Writable binary stream that receives the payload.
    :param block_size: Chunk size in bytes for each read.
    :param with_progress_bar: Show a DownloadProgressBar when the server
        reports a positive Content-Length.
    :param fatal: Forwarded to log() for the exception report when opening
        the URL fails.
    :return: True when the full payload was received, False on open failure
        or when fewer bytes than Content-Length arrived.
    """
    try:
        download_stream = urllib.request.urlopen(url)
    except Exception as e:
        log(f'Failed on: {url}', fatal=False, log_type=LogType.ERROR)
        log(e, fatal=fatal, log_type=LogType.ERROR)
        return False

    with download_stream:
        headers: HTTPMessage = download_stream.info()
        # 0 when the server sent no Content-Length (size unknown).
        expected = int(headers.get('Content-Length', failobj=0))

        bar = DownloadProgressBar(expected) if with_progress_bar and expected > 0 else None

        received = 0
        while block := download_stream.read(block_size):
            received += len(block)
            file_stream.write(block)

            # The progress bar can signal cancellation by returning False.
            if bar is not None and not bar.run(len(block)):
                break

        if expected >= 0 and received < expected:
            return False

    return True
예제 #2
0
def move_file(old: str, new: str, make_dirs: bool = True, duplicate_handler: DuplicateHandler = None) -> str:
    """Move ``old`` to ``new`` and return the final destination path with
    forward slashes.

    :param old: Current file path.
    :param new: Desired destination path.
    :param make_dirs: Create the destination's parent directories first.
    :param duplicate_handler: Strategy applied when ``new`` already exists:
        SKIP returns ``new`` untouched; THROW_ERROR logs an error;
        OVERWRITE deletes the existing file; FIND_VALID_FILE picks a fresh
        name; HASH_COMPARE returns ``new`` when both files share a SHA-1
        (presumably "already moved"), otherwise picks a fresh name.
    :return: The path the file ended up at (or ``new`` on a skip).
    """
    if make_dirs:
        os.makedirs(Path(new).parent, exist_ok=True)

    if duplicate_handler and Path(new).exists():
        if duplicate_handler == DuplicateHandler.SKIP:
            return new
        if duplicate_handler == DuplicateHandler.THROW_ERROR:
            # NOTE(review): falls through to the rename below if log() returns.
            log(f'File "{new}" already exists', log_type=LogType.ERROR)
        elif duplicate_handler == DuplicateHandler.OVERWRITE:
            os.remove(new)
        elif duplicate_handler == DuplicateHandler.FIND_VALID_FILE:
            new = get_valid_filename(new)
        elif duplicate_handler == DuplicateHandler.HASH_COMPARE:
            if get_sha1_hash_file(old) == get_sha1_hash_file(new):
                return new
            new = get_valid_filename(new)

    return str(Path(old).rename(new)).replace('\\', '/')
예제 #3
0
def json_parse(json, key, default=None, fatal=False):
    """Look up ``key`` in ``json``, falling back to ``default`` when absent.

    :param json: Container supporting ``in`` and ``[]`` (typically a dict).
    :param key: Key to fetch.
    :param default: Value returned when the key is missing.
    :param fatal: When True, log an ERROR before returning the default.
    :return: ``json[key]`` if present (even if falsy), else ``default``.
    """
    if key not in json:
        if fatal:
            log(f'Cannot find {key} in {json}.', log_type=LogType.ERROR)
        return default

    return json[key]
예제 #4
0
def get_config(file=CONFIG_FILE):
    """Load the scrape configuration JSON from ``file``.

    :param file: Path to the config file (defaults to CONFIG_FILE).
    :return: The config object produced by json_to_config.

    NOTE(review): when the file is missing this logs an ERROR and then still
    opens it — presumably log() aborts on ERROR; confirm.
    """
    config_file = Path(file)
    if not config_file.is_file():
        log(f'{file} not found in path', log_type=LogType.ERROR)

    # json.load parses the stream directly — no need to slurp readlines()
    # and re-join them into one string first.
    with open(file, 'r') as file_obj:
        json_obj = json.load(file_obj)

    return json_to_config(json_obj)
예제 #5
0
def add_to_download_cache(download_cache, *urls, headers: HTTPMessage = None, filename: str = None, result=DownloadedFileResult.SUCCESS) \
        -> Optional[DownloadedFile]:
    """Record one DownloadedFile under every URL in ``urls``.

    :param download_cache: Mapping (e.g. a shelve) keyed by URL.
    :param urls: One or more URLs that should all resolve to the same entry;
        the first one becomes the entry's canonical URL.
    :param headers: Response headers to store on the entry.
    :param filename: Local filename to store on the entry.
    :param result: Download outcome stored on the entry.
    :return: The shared DownloadedFile entry.

    NOTE(review): with no urls this logs an ERROR and, if log() returns,
    urls[0] would raise IndexError — presumably log() aborts; confirm.
    """
    if not urls:
        log(f'Cache fail, no url sent.', log_type=LogType.ERROR)

    entry = DownloadedFile(filename=filename, url=urls[0], headers=headers, result=result)

    for cache_key in urls:
        download_cache[cache_key] = entry

    return entry
예제 #6
0
def split_filename(s: str, fatal=False, include_ext_period: bool = False) -> Tuple[str, Optional[str]]:
    """Split ``s`` into (stem, extension) at the last dot.

    :param s: Filename (or any dotted string) to split.
    :param fatal: When True, log an ERROR if no dot is present.
    :param include_ext_period: Forwarded to handle_extension_period to decide
        whether the returned extension keeps its leading period.
    :return: (stem, extension); extension is None when ``s`` has no dot.
    """
    stem, sep, last = s.rpartition('.')

    if not sep:
        # No dot at all — the whole string is the stem.
        if fatal:
            log(f'No file extension found: {s}', log_type=LogType.ERROR)
        return s, None

    return stem, handle_extension_period(last, include_ext_period=include_ext_period)
예제 #7
0
def json_parse_enum(obj, json_val, class_type, fatal=False):
    """Read ``obj[json_val]`` and resolve it (upper-cased) to an attribute of
    ``class_type`` — typically an enum member.

    :param obj: JSON-like dict to read from.
    :param json_val: Key holding the enum name.
    :param class_type: Class whose __dict__ supplies the members.
    :param fatal: Forwarded to json_parse for the missing-key case.
    :return: The resolved member, or None when the value is missing/falsy.
    """
    val = json_parse(obj, json_val, default=None, fatal=fatal)

    if not val:
        return None

    val = str(val).upper()
    if val in class_type.__dict__:
        return class_type.__dict__[val]

    # Unknown name: report it, then attempt the lookup anyway (KeyError if
    # log() returns) — mirrors the original fail-loudly behavior.
    log(f'Invalid Enum: {val}, Keys: {class_type.__dict__.keys()}',
        log_type=LogType.ERROR)

    return class_type.__dict__[val]
예제 #8
0
    def __init__(self, dictionary: dict = None, name: str = '', value: str = '', domain: str = '', path: str = ''):
        """Build a cookie from explicit fields and/or a raw dict.

        :param dictionary: Optional mapping whose entries are copied onto the
            instance verbatim, overriding the explicit keyword fields.
        :param name: Cookie name.
        :param value: Cookie value.
        :param domain: Cookie domain.
        :param path: Cookie path.
        """
        self.name = name
        self.value = value
        self.domain = domain
        self.path = path

        if dictionary:
            for key, val in dictionary.items():
                setattr(self, key, val)

        # All four core fields must end up non-empty to be a usable cookie.
        if not (self.name and self.value and self.domain and self.path):
            log(f'Invalid Cookie', log_type=LogType.ERROR)
예제 #9
0
def validate_path(directory: str, default_path: str = join_path(os.getcwd(), '/out'), fatal: bool = False) -> str:
    """Normalize ``directory`` to a forward-slash path, creating it on disk
    when it does not exist yet.

    :param directory: Candidate directory path; a blank value falls back to
        ``default_path`` (note: the default is computed once at import time).
    :param default_path: Fallback used when ``directory`` is blank.
    :param fatal: When True, log an ERROR before falling back.
    :return: The normalized, guaranteed-to-exist path.
    """
    if is_blank(directory):
        if fatal:
            log(f'Path {directory} does not exist.', log_type=LogType.ERROR)
        directory = default_path

    normalized = directory.replace('\\', '/')

    if not os.path.exists(normalized):
        os.makedirs(normalized, exist_ok=True)

    return normalized
예제 #10
0
def driver_go_and_wait(driver: WebDriver,
                       url: str,
                       scroll_pause_time: float,
                       fail: int = 0):
    """Navigate ``driver`` to ``url``, retrying until the browser lands on
    exactly that URL, then scroll to the bottom of the page.

    Retries recursively with fail + 1 whenever the post-load URL differs.
    Note that scroll_to_bottom runs at every recursion level as the stack
    unwinds, so a retried navigation scrolls more than once.

    :param driver: Selenium WebDriver used for navigation.
    :param url: Destination URL that driver.current_url must match exactly.
    :param scroll_pause_time: Delay forwarded to scroll_to_bottom.
    :param fail: Internal retry counter; callers leave it at 0.
    """
    # After 5 failed attempts, report the mismatch. NOTE(review): log() is
    # called without fatal=..., so whether this actually stops the recursion
    # depends on log's default behavior — presumably it raises/exits; confirm.
    if fail >= 5:
        log(f'URL does not ever match, {url} never becomes {driver.current_url}',
            log_type=LogType.ERROR)

    driver.get(url)
    wait_page_load(driver)

    # Exact-match check (not a prefix check): any redirect triggers a retry.
    if not is_url_exact(driver.current_url, url):
        driver_go_and_wait(driver, url, scroll_pause_time, fail=fail + 1)

    scroll_to_bottom(driver, scroll_pause_time=scroll_pause_time)
예제 #11
0
def get_content_type_head(url: str):
    """Issue a HEAD request to ``url`` and return its content type.

    On an HTTPError that still carries headers, the Content-Type is pulled
    from the error response. Any other failure is logged and yields None.

    :param url: URL to probe.
    :return: The content type (as produced by the project's header helpers),
        or None when it cannot be determined.
    """
    content_type = None
    try:
        head_request = urllib.request.Request(url, method='HEAD')
        with urllib.request.urlopen(head_request) as response:
            response_headers: HTTPMessage = response.info()
            content_type = get_content_type_from_headers(response_headers)
    except HTTPError as e:
        # Error responses often still announce a Content-Type.
        if e and e.headers:
            raw_content_type = e.headers.get('Content-Type', None)
            content_type = get_content_type_from_header(raw_content_type)
        else:
            log(e, extra=f'URL: {url}', fatal=False, log_type=LogType.ERROR)
            content_type = None
    except Exception as e:
        log(e, extra=f'General Exception URL: {url}', fatal=False, log_type=LogType.ERROR)
        content_type = None

    return content_type
예제 #12
0
def extract_json_from_text(s: str):
    """Extract the first JSON object or array embedded in ``s``.

    When ``s`` does not itself start with '[' or '{', scan for the first
    opening delimiter that directly follows '=' or '(' (i.e. an assignment or
    call argument) and treat it as the start of the JSON.

    :param s: Text possibly containing a JSON blob.
    :return: The JSON substring, or None when no start delimiter is found.

    BUG FIX: the original matcher counted only '{'/'}'. For a JSON *array*
    (``s`` starting with '[', or an array found mid-text) the stack was empty
    on the very first character, so it logged an error and returned just '['.
    We now match whichever delimiter actually opens at ``start``. The
    LifoQueue is replaced by a plain depth counter, which also removes the
    latent hazard of Queue.get() blocking forever on unbalanced input.
    Delimiters inside string literals are still not handled (as before).
    """
    start = 0
    if not s.startswith('[') and not s.startswith('{'):
        bracket_i = brace_i = 1

        open_bracket = open_brace = 0
        while (open_bracket != -1 and open_brace != -1) and start == 0:
            open_bracket = find_nth(s, '[', bracket_i)
            open_brace = find_nth(s, '{', brace_i)

            lower, higher, lower_val, higher_val = min_val(open_brace, open_bracket, '{', '[', min_possible_val=0)

            # Only a delimiter preceded by '=' or '(' counts as a JSON start.
            prev_lower_char = find_first_previous_char(s, lower, exclude=[' '])
            if s[prev_lower_char] != '=' and s[prev_lower_char] != '(':
                if lower_val == '{':
                    brace_i += 1
                elif lower_val == '[':
                    bracket_i += 1
            else:
                start = lower

        if start == 0:
            return None

    # Match the delimiter that actually opens at ``start``.
    opener = s[start]
    closer = ']' if opener == '[' else '}'

    end = 0
    depth = 0
    for i in range(start, len(s)):
        if s[i] == opener:
            depth += 1
        elif s[i] == closer:
            depth -= 1
            if depth == 0:
                end = i
                break

    if end == 0:
        # No balanced close found. NOTE(review): as in the original, if log()
        # returns we still slice below, yielding a one-character fragment.
        log('Error parsing JSON', log_type=LogType.ERROR)

    json = s[start:end + 1]

    return json
예제 #13
0
def download_file_impl(url: str, filename: str, download_cache: Optional[shelve.DbfilenameShelf], block_size: int = 1024 * 8, with_progress_bar: bool = True) \
        -> 'Union[Tuple[str, str, HTTPMessage], DownloadedFile, None]':
    """Download ``url`` to ``filename``, honoring a URL-keyed download cache.

    :param url: URL to fetch.
    :param filename: Destination path (opened with 'w+b').
    :param download_cache: Optional shelve cache; when the final
        (post-redirect) URL is already cached, its entry is returned instead
        of re-downloading.
    :param block_size: Read chunk size in bytes.
    :param with_progress_bar: Show a DownloadProgressBar when the server
        reports a positive Content-Length.
    :return: (final_url, old_url, response_headers) on a fresh download —
        old_url is '' unless a redirect changed the URL; the cached
        DownloadedFile on a cache hit; None when opening the URL fails.
    """
    try:
        download_stream = urllib.request.urlopen(url)
    except Exception:
        # Was a bare ``except:`` — that also swallowed SystemExit and
        # KeyboardInterrupt. Exception keeps the "return None on any
        # download error" contract without trapping interpreter exits.
        return None

    old_url: str = ''
    with download_stream:
        res_headers: HTTPMessage = download_stream.info()
        new_url: str = download_stream.geturl()

        if download_cache and new_url in download_cache:
            return download_cache[new_url]

        # Remember the original URL when the request was redirected.
        if url != new_url:
            old_url = url
            url = new_url

        # 0 when the server sent no Content-Length (size unknown).
        total_size = int(res_headers.get('Content-Length', failobj=0))

        progress_bar = None
        if with_progress_bar and total_size > 0:
            progress_bar = DownloadProgressBar(total_size, on_complete=lambda x: log(f'Downloaded {url} to {filename}'))

        read = 0
        with open(filename, 'w+b') as file_stream:
            while True:
                block: bytes = download_stream.read(block_size)
                if not block:
                    break

                read += len(block)
                file_stream.write(block)

                # The progress bar can signal cancellation by returning False.
                if progress_bar and not progress_bar.run(len(block)):
                    break

        if total_size >= 0 and read < total_size:
            log(f'File download incomplete, received {read} out of {total_size} bytes. URL: {url}, filename: {filename}', fatal=False, log_type=LogType.ERROR)

    return url, old_url, res_headers
예제 #14
0
def json_parse_class(json: dict, class_type: type):
    """Instantiate ``class_type`` from a JSON dict by matching its
    constructor parameters (via their annotations) to JSON keys.

    Primitives are copied through; Enum-annotated params go through
    json_parse_enum; ``list[X]`` params through json_parse_class_list;
    anything else recurses into json_parse_class.

    :param json: Source dict.
    :param class_type: Class to construct; its __init__ signature drives
        which keys are consumed.
    :return: A new ``class_type`` instance.
    """
    signature = inspect.signature(class_type.__init__)
    args = signature.parameters.keys()
    args = [arg for arg in args if arg != 'self']

    d = {}
    for arg in args:
        arg_type = signature.parameters[arg].annotation

        # Some constructor parameter names differ from their JSON key.
        if arg in SAFE_PARAMETER_MAPPING:
            temp_arg = SAFE_PARAMETER_MAPPING[arg]
        else:
            temp_arg = arg

        if temp_arg not in json:
            continue

        if arg_type in PRIMITIVE_TYPES:
            generated_arg_obj = json[temp_arg]
        elif inspect.isclass(arg_type) and issubclass(arg_type, Enum):
            # BUG FIX: issubclass() raises TypeError for parameterized
            # generics such as list[X]/List[X], so without the isclass guard
            # the list branch below could never be reached.
            generated_arg_obj = json_parse_enum(json,
                                                temp_arg,
                                                arg_type,
                                                fatal=True)
        elif get_origin(arg_type) and get_origin(arg_type) == list:
            list_type = first_or_none(get_args(arg_type))

            if not list_type:
                log(f'List Type {arg_type} was None, origin: {get_origin(arg_type)}',
                    log_type=LogType.ERROR)

            generated_arg_obj = json_parse_class_list(json[temp_arg],
                                                      list_type,
                                                      fatal=True)
        else:
            generated_arg_obj = json_parse_class(json[temp_arg], arg_type)

        d[arg] = generated_arg_obj

    obj = class_type(**d)

    return obj
예제 #15
0
def get_sub_directory_path(base_url: str, new_url: str, prepend_dir: str = None, prepend_slash: bool = True, append_slash: bool = True) -> str:
    """Return the sub-directory portion of ``new_url`` relative to
    ``base_url``.

    :param base_url: Base URL; reduced via get_base_url before matching.
    :param new_url: Full URL whose tail (after the base) is extracted.
    :param prepend_dir: When given, prefixed onto the tail (one trailing
        slash stripped) and returned immediately — ``prepend_slash`` is then
        ignored.
    :param prepend_slash: Keep (True) or strip (False) the leading slash.
    :param append_slash: Ensure (True) or strip (False) a trailing slash.
    :return: The normalized sub-directory string.
    """
    if not base_url:
        log(f'Invalid params: {base_url}, {new_url}, {prepend_dir}.', name_of(get_sub_directory_path), log_type=LogType.ERROR)

    # Drop a single trailing slash before locating the base.
    new_url = new_url[:-1] if new_url.endswith('/') else new_url

    base_url = get_base_url(base_url)

    if base_url not in new_url:
        log(f'Invalid params: {base_url}, {new_url}, {prepend_dir}.', name_of(get_sub_directory_path), log_type=LogType.ERROR)

    tail = new_url[new_url.index(base_url) + len(base_url):]

    if not tail.startswith('/'):
        tail = f'/{tail}'

    if append_slash:
        if not tail.endswith('/'):
            tail += '/'
    elif tail.endswith('/'):
        tail = tail[:-1]

    if prepend_dir:
        # Strip exactly one trailing slash so the join produces a single '/'.
        if prepend_dir.endswith('/'):
            prepend_dir = prepend_dir[:-1]
        return prepend_dir + tail

    if prepend_slash:
        # Re-add the slash in case append_slash=False reduced tail to ''.
        if not tail.startswith('/'):
            tail = f'/{tail}'
    elif tail.startswith('/'):
        tail = tail[1:]

    return tail
예제 #16
0
import sys

from src.config import CONFIG_FILE, get_config
from src.post_scrape import run_post_scrape
from src.scrape import scrape
from src.util.generic import log
# NOTE(review): dead debug snippet, neutralized as a bare string literal.
# Remove the quotes to hard-code a job file when running locally.
'''
sys.argv = [
    __file__,
    'job3.json'
]'''

# Script entry point: run the scrape pipeline for a given job/config file.
if __name__ == '__main__':
    # Optional first CLI arg: path to the job JSON; defaults to CONFIG_FILE.
    config_file = CONFIG_FILE if len(sys.argv) == 1 else sys.argv[1]

    config = get_config(file=config_file)

    # post_scrape_jobs_only skips the main scrape pass entirely.
    if not config.post_scrape_jobs_only:
        scrape(config)

    run_post_scrape(config)

    log('Completed everything. The program will now exit.')