Пример #1
0
def extract_all_tags_classes(html_filename, file_index = 0):
    """
    Inputs: filename of an html file,
            file index of the file within a file list, if applicable (defaults to 0).
    Objective: the html file is read into a string, the string is parsed into a soup,
                and every tag found in the soup (whatever its name or class) is
                described by a snippet dictionary.
    Output: list of snippet dictionaries with the keys "given_path", "tag", "class",
            "contents", "href", "file_index" and "pos_index". Identical snippets are
            stored only once.
    """

    html_string = filegenerator.file_to_string(html_filename)
    soup = BeautifulSoup(html_string, "html.parser")

    # Search the document once and reuse the result for both the count and the positions.
    all_tags = soup.find_all()
    all_tags_list = list(all_tags)

    output_snippets = []

    print('      {0} tags of all types and classes were retrieved in the file {1}.'.format(
        len(all_tags), html_filename))

    for item in all_tags:
        tag_class = item.get("class")
        href = item.get("href")

        # No plans to fully use RMDB yet, various data types will be converted to text
        snippet = {
            "given_path": html_filename,
            "tag": item.name,
            "class": None if tag_class is None else str(tag_class),
            "contents": str(item),
            "href": None if href is None else str(href),
            "file_index": file_index,
            # list.index() compares by equality, so tags that compare equal all
            # report the position of the first of them.
            "pos_index": all_tags_list.index(item),
        }

        # Skip snippets that are already stored.
        if snippet not in output_snippets:
            output_snippets.append(snippet)

    return output_snippets
Пример #2
0
def test_find_all_tags(html_filename):
    """
    Inputs: filename of an html file.
    Objective: print a banner, read the file into a string with
                filegenerator.file_to_string, and print whatever
                htmlparser.get_tag_names returns for that string.
    Output: None.
    """
    separator = "\n#####################################################################################"
    print(separator)
    print("Test to find all tags in a .html file")

    # The file is read only after the banner has been printed.
    markup = filegenerator.file_to_string(html_filename)
    print(htmlparser.get_tag_names(markup))
Пример #3
0
def extract_tags_classes_exact(html_filename, wanted_tags, wanted_classes, file_index = 0):
    """
    Inputs: filename of an html file,
            tuple with tag(s) we want to retrieve; ("",) means "do not filter by tag",
            tuple with class(es) we want to retrieve; ("",) means "do not filter by class",
            file index of the file within a file list, if applicable.
    Objective: the html file is read into a string, the string is parsed into a soup,
                and the tags matching the wanted tag(s) and/or class(es) are described
                by snippet dictionaries. One progress line is printed per search pass,
                reporting how many new snippets that pass added.
    Output: list of snippet dictionaries with the keys "given_path", "tag", "class",
            "contents", "href", "file_index" and "pos_index". Identical snippets are
            stored only once. The list is empty when both filters are ("",).
    """

    html_string = filegenerator.file_to_string(html_filename)
    soup = BeautifulSoup(html_string, "html.parser")
    output_snippets = []
    all_tags_list = list(soup.find_all())

    def collect(items):
        """Store a snippet for every item not stored yet; return how many were added."""
        added = 0
        for item in items:
            tag_class = item.get("class")
            href = item.get("href")

            # No plans to fully use RMDB yet, various data types will be converted to text
            snippet = {
                "given_path": html_filename,
                "tag": item.name,
                "class": None if tag_class is None else str(tag_class),
                "contents": str(item),
                "href": None if href is None else str(href),
                "file_index": file_index,
                # list.index() compares by equality, so tags that compare equal all
                # report the position of the first of them.
                "pos_index": all_tags_list.index(item),
            }

            if snippet not in output_snippets:
                output_snippets.append(snippet)
                added += 1
        return added

    filter_by_tag = wanted_tags != ("",)
    filter_by_class = wanted_classes != ("",)

    if filter_by_tag and filter_by_class:
        for wanted_tag in wanted_tags:
            for wanted_class_ in wanted_classes:
                found = collect(soup.find_all(wanted_tag, wanted_class_))
                # Report this pass only, not the running total of all passes.
                print('    {0} <{1}> tags with the class "{2}" were retrieved in the file {3}.\n'.format(
                    found, wanted_tag, wanted_class_, html_filename))

    elif filter_by_tag:
        for wanted_tag in wanted_tags:
            found = collect(soup.find_all(wanted_tag))
            print('    {0} <{1}> tags were retrieved in the file {2}.\n'.format(
                found, wanted_tag, html_filename))

    elif filter_by_class:
        for wanted_class_ in wanted_classes:
            # Keep every tag, of any name, whose "class" attribute contains the wanted class.
            matching = []
            for item in all_tags_list:
                tag_class = item.get("class")
                if tag_class is not None and wanted_class_ in tag_class:
                    matching.append(item)

            found = collect(matching)
            print('    {0} tags with the class "{1}" were retrieved in the file {2}.\n'.format(
                found, wanted_class_, html_filename))

    return output_snippets
Пример #4
0
def extract_tags_classes_approximate(html_filename, wanted_tags, wanted_classes, file_index = 0):
    """
    Inputs: filename of an html file,
            tuple with tag(s) we want to retrieve; ("",) means "do not filter by tag",
            tuple with class(es) we want to retrieve; ("",) means "do not filter by class",
            file index of the file within a file list, if applicable.
    Objective: the html file is read into a string and the string is parsed into a soup.
                A tag name is "similar" when it contains one of the wanted tags as a
                substring; a class name (as listed by get_class_names) is "similar" when
                it contains one of the wanted classes as a substring. The tags matching
                the similar tag(s) and/or class(es) are described by snippet dictionaries.
                The similar names are printed, followed by one progress line per search
                pass reporting how many new snippets that pass added.
    Output: list of snippet dictionaries with the keys "given_path", "tag", "class",
            "contents", "href", "file_index" and "pos_index". Identical snippets are
            stored only once. The list is empty when both filters are ("",).
    """

    html_string = filegenerator.file_to_string(html_filename)
    soup = BeautifulSoup(html_string, "html.parser")
    output_snippets = []
    all_tags_list = list(soup.find_all())

    def collect(items):
        """Store a snippet for every item not stored yet; return how many were added."""
        added = 0
        for item in items:
            tag_class = item.get("class")
            href = item.get("href")

            # No plans to fully use RMDB yet, various data types will be converted to text
            snippet = {
                "given_path": html_filename,
                "tag": item.name,
                "class": None if tag_class is None else str(tag_class),
                "contents": str(item),
                "href": None if href is None else str(href),
                "file_index": file_index,
                # list.index() compares by equality, so tags that compare equal all
                # report the position of the first of them.
                "pos_index": all_tags_list.index(item),
            }

            if snippet not in output_snippets:
                output_snippets.append(snippet)
                added += 1
        return added

    filter_by_tag = wanted_tags != ("",)
    filter_by_class = wanted_classes != ("",)

    # Get all tags similar to tag
    if filter_by_tag:
        similar_tags = set()
        for tag in all_tags_list:
            for wanted_tag in wanted_tags:
                if wanted_tag in tag.name:
                    similar_tags.add(tag.name)
        print("      Similar tags:", similar_tags)

    # Get all classes similar to class_
    if filter_by_class:
        classes = get_class_names(html_string)
        similar_classes = set()
        for class_ in classes:
            for wanted_class_ in wanted_classes:
                if wanted_class_ in class_:
                    similar_classes.add(class_)
        print("      Similar classes:", similar_classes)

        print("")

    # Different cases (whether we search classes, tags or both).
    # Looping over an empty set does nothing, so empty search results need no extra guard.
    if filter_by_tag and filter_by_class:
        for similar_tag in similar_tags:
            for similar_class_ in similar_classes:
                found = collect(soup.find_all(similar_tag, similar_class_))
                # Report this pass only, not the running total of all passes.
                print('      {0} <{1}> tags with the class "{2}" were retrieved in the file {3}.\n'.format(
                    found, similar_tag, similar_class_, html_filename))

    elif filter_by_tag:
        for similar_tag in similar_tags:
            found = collect(soup.find_all(similar_tag))
            print('      {0} <{1}> tags were retrieved in the file {2}.\n'.format(
                found, similar_tag, html_filename))

    elif filter_by_class:
        for similar_class in similar_classes:
            found = collect(soup.find_all(class_=similar_class))
            print('      {0} tags with class "{1}" were retrieved in the file {2}.\n'.format(
                found, similar_class, html_filename))

    return output_snippets