def url_args_handler(url_args: str) -> List[ParamValPair]:
    ''' Tokenizes the parameter-value pairs in the parameter part of the url

    Args:
        url_args (str): Parameter part of url of webpage

    Returns:
        paths (ParamValPair): List of param-val pairs as 2tuple of lists.
            If no val is given, the second value in the tuple is the
            empty list []

    Examples:
        >>> url_args_handler('sid=4')
        [(['sid'], ['4'])]
        >>> url_args_handler('sid=4&ring=hent&list')
        [(['sid'], ['4']), (['ring'], ['hent']), (['list'], [])]
        >>> url_args_handler('')
        []
    '''
    if not url_args:
        return []
    pair_list = []
    # Pairs may be separated by '&', ';' or '\'. A single character class
    # replaces the original redundant alternation r'(?:&)|;|&|\\' — the two
    # patterns match exactly the same separators.
    for pair in re.split(r'[&;\\]', url_args):
        # Keep at most param and val; anything after a second '=' is dropped,
        # matching the original truncating behavior.
        splitted = pair.split('=')[:2]
        # A bare parameter (no '=') gets an empty value.
        param, val = (splitted[0], '') if len(splitted) == 1 else splitted
        pair_list.append((word_splitter(param), word_splitter(val)))
    return pair_list
def url_domains_handler(url_domains: str) -> DomainData:
    ''' Splits the domain part of the URL to individual tokens

    Args:
        url_domains (str): Domains part of url of webpage

    Returns:
        sub_domains (List[str]): List of subdomains, i.e. ['www', 'blog']
        main_domain (List[str]): Main domain, i.e. ['geo', 'cities']
        domain_ending (str): The domain ending, i.e. 'com' or 'net'

    Examples:
        >>> url_domains_handler('geocities.com')
        ([], ['geo', 'cities'], 'com')
        >>> url_domains_handler('www.members.tripod.net')
        (['www', 'members'], ['tripod'], 'net')
    '''
    parts = url_domains.split('.')
    # Last label is the TLD, second-to-last the main domain; everything
    # before those two labels counts as subdomains.
    domain_ending = parts[-1]
    main_domain = word_splitter(parts[-2])
    sub_domains = flatten([word_splitter(label) for label in parts[:-2]])
    return (sub_domains, main_domain, domain_ending)
def url_path_handler(url_path: str) -> List[str]:
    ''' Splits the path part of the url

    Args:
        url_path (str): Path part of url of webpage

    Returns:
        paths (List[str]): List of tokenized paths

    Examples:
        >>> url_path_handler('/path1/path2/page.html')
        ['path', '1', 'path', '2', 'page', 'html']
        >>> url_path_handler('/')
        []
    '''
    # Drop empty segments produced by leading/trailing/double slashes,
    # then tokenize each remaining segment.
    segments = [segment for segment in url_path.split('/') if segment]
    tokens = flatten([word_splitter(segment) for segment in segments])
    # Preserve the presence of an '@' (user-info marker) as its own token.
    if '@' in url_path:
        tokens.append('@')
    return tokens
def test_non_hyphenated_word(self):
    # A compound word with no separator is still split into its parts.
    expected = ['some', 'word']
    assert word_splitter('someword') == expected
def test_short_word(self):
    # A short word is returned whole as a single token.
    expected = ['abc']
    assert word_splitter('abc') == expected
def test_empty_word(self):
    # The empty string yields no tokens at all.
    expected = []
    assert word_splitter('') == expected