예제 #1
0
def url_args_handler(url_args: str) -> List[ParamValPair]:
    '''
    Tokenizes the parameter-value pairs in the parameter part of the url

    The string is cut into segments at every '&', ';' or backslash. Each
    non-empty segment is cut at '=' into a parameter and a value, and both
    parts are tokenized with word_splitter. Empty segments, which come from
    a leading, trailing or doubled separator, are skipped so that they do
    not produce empty pairs in the result.

    Args:
        url_args (str): Parameter part of url of webpage

    Returns:
        paths (ParamValPair): List of param-val pairs as 2tuple of lists. If no
                              val is given, the second value in the tuple is
                              the empty list []

    Examples:
        >>> url_args_handler('sid=4')
        [(['sid'], ['4'])]
        >>> url_args_handler('sid=4&ring=hent&list')
        [(['sid'], ['4']), (['ring'], ['hent']), (['list'], [])]
        >>> url_args_handler('sid=4&')
        [(['sid'], ['4'])]
        >>> url_args_handler('')
        []
    '''
    if not url_args:
        return []

    pair_list = []
    # NOTE(review): the previous pattern listed '&' twice; one of them may
    # have been an HTML-unescaped '&amp;'. Confirm whether '&amp;' should be
    # treated as a separator as well.
    for pair in re.split(r'[&;\\]', url_args):
        if not pair:
            # Nothing between two separators (or at either end): no parameter.
            continue
        param, _, remainder = pair.partition('=')
        # Only the text up to a second '=' is kept as the value; anything
        # after it is discarded.
        val = remainder.partition('=')[0]
        pair_list.append((word_splitter(param), word_splitter(val)))
    return pair_list
예제 #2
0
def url_domains_handler(url_domains: str) -> DomainData:
    '''
    Splits the domain part of the URL to individual tokens

    The string is split on '.'. The last label is the domain ending, the
    label before it is the main domain, and every label before that is a
    subdomain. Subdomain and main-domain labels are tokenized with
    word_splitter; the ending is returned unchanged.

    A host without any dot (for example 'localhost') has no separate
    ending: the whole string is tokenized as the main domain and the ending
    is the empty string.

    Args:
        url_domains (str): Domains part of url of webpage

    Returns:
        sub_domains (List[str]): List of subdomains, i.e. ['www', 'blog']
        main_domain (List[str]): Main domain, i.e. ['geo', 'cities']
        domain_ending (str): The domain ending, i.e. 'com' or 'net'

    Examples:
        >>> url_domains_handler('geocities.com')
        ([], ['geo', 'cities'], 'com')
        >>> url_domains_handler('www.members.tripod.net')
        (['www', 'members'], ['tripod'], 'net')
    '''
    labels = url_domains.split('.')
    if len(labels) < 2:
        # str.split always yields at least one element; with exactly one
        # there is no label to use as the ending, so avoid indexing [-2].
        return ([], word_splitter(labels[0]), '')

    *sub_labels, main_label, domain_ending = labels
    sub_domains = flatten([word_splitter(label) for label in sub_labels])
    main_domain = word_splitter(main_label)
    return (sub_domains, main_domain, domain_ending)
예제 #3
0
def url_path_handler(url_path: str) -> List[str]:
    '''
    Tokenizes the path part of the url

    The path is split on '/', empty segments are dropped, and every
    remaining segment is passed through word_splitter. The per-segment
    results are flattened into a single list. If the path contains an '@'
    anywhere, a literal '@' token is appended at the end of the list.

    Args:
        url_path (str): Path part of url of webpage

    Returns:
        paths (List[str]): List of tokenized paths

    Examples:
        >>> url_path_handler('/path1/path2/page.html')
        ['path', '1', 'path', '2', 'page', 'html']
        >>> url_path_handler('/')
        []
    '''
    segments = (part for part in url_path.split('/') if part)
    tokens = flatten([word_splitter(part) for part in segments])
    if '@' in url_path:
        tokens.append('@')
    return tokens
예제 #4
0
 def test_non_hyphenated_word(self):
     '''word_splitter breaks 'someword' into 'some' and 'word'.'''
     expected = ['some', 'word']
     assert word_splitter('someword') == expected
예제 #5
0
 def test_short_word(self):
     '''word_splitter returns 'abc' as a single, unsplit token.'''
     expected = ['abc']
     assert word_splitter('abc') == expected
예제 #6
0
 def test_empty_word(self):
     '''word_splitter returns an empty list for the empty string.'''
     expected = []
     assert word_splitter('') == expected