Example 1
from urllib.request import urlopen
import pickle

# Crawler, write_file and LinkExtractor are defined elsewhere in this module.
def gather_links_save_page(page_url):
    html_string = ''
    try:
        response = urlopen(page_url, timeout=10)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
            if Crawler.save_html_pages:
                # Not thread-safe: this read-and-increment is not atomic, so
                # concurrent threads can assign the same code to different pages.
                code = Crawler.count_code
                Crawler.count_code += 1
                write_file(Crawler.pages_folder + str(code), html_string)
                Crawler.code_from_url[page_url] = code
                Crawler.url_from_code[code] = page_url
                if code % 100 == 0:
                    print('storing with pickle: code_from_url and url_from_code')
                    with open('code_from_url_dict.pickle', 'wb') as handle:
                        pickle.dump(Crawler.code_from_url, handle,
                                    protocol=pickle.HIGHEST_PROTOCOL)
                    with open('url_from_code_dict.pickle', 'wb') as handle:
                        pickle.dump(Crawler.url_from_code, handle,
                                    protocol=pickle.HIGHEST_PROTOCOL)
                    # To load a dictionary back:
                    # with open('code_from_url_dict.pickle', 'rb') as handle:
                    #     code_from_url = pickle.load(handle)
        link_extractor = LinkExtractor(Crawler.base_url, page_url, True,
                                       Crawler.domain_name)
        link_extractor.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return link_extractor.page_links()
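
The comment in the function above flags a real race: reading `Crawler.count_code` and then incrementing it is a read-modify-write that two threads can interleave, producing duplicate codes. A minimal sketch of one way to make the assignment atomic, assuming a module-level lock (the lock and the `next_page_code` helper are not part of the original code):

import threading

_code_lock = threading.Lock()  # hypothetical lock; not in the original Crawler

def next_page_code():
    # Holding the lock makes the read-and-increment atomic, so no two
    # threads can observe the same counter value.
    with _code_lock:
        code = Crawler.count_code
        Crawler.count_code += 1
    return code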
Example 2
def main():
    url = input(
        "Enter the url of the website including the protocol, e.g. https://example.com: "
    )
    if is_vuln(url):
        print("The website is vulnerable at", url)
    else:
        links = LinkExtractor().GetAllLinks(url)
        direcs = LinkExtractor().DirectoryExtractor(links)
        print('-' * 60)
        print('All links extracted.')
        print('-' * 60)
        print("Checking vulnerabilities.")
        safe = True
        for direc in direcs:
            print(f"Checking if the site is vulnerable at {direc}")
            if is_vuln(direc):
                safe = False
                print(f"The website is vulnerable at: {direc}")
        if safe:
            print("The website is secure (at least from directory "
                  "listing vulnerabilities).")
Example 3
 def test_collect_link_attrs(self):
     extractor = LinkExtractor('samples/source1.html')
     self.assertEqual(extractor.collect_link_attrs(), [
         {
             'desc': 'Solicitation',
             'filename': 'Solicitation.doc',
             'url': 'https://www.fbo.gov/utils/view?id=46b7d20b80ba577b5e4dd10b1561b247'
         },
         {
             'desc': 'Attch 1 Specifications',
             'filename': 'Attch_1_Specifications.zip',
             'url': 'https://www.fbo.gov/utils/view?id=f08375882eee4900f88a748fb8a941c7'
         },
         {
             'desc': 'Attch 2 Material Submittal',
             'filename': 'Attch_2_Submittal_Schedule.pdf',
             'url': 'https://www.fbo.gov/utils/view?id=6b5544a2b5f254ae1dcfaea41f155960'
         },
         {
             'desc': 'Attch 3 Schedule of Drawings',
             'filename': 'Attch_3_Schedule_of_Drawings.pdf',
             'url': 'https://www.fbo.gov/utils/view?id=9e6640c9840978099dbe08351d0802bf'
         },
         {
             'desc': 'Attch 4 Drawings',
             'filename': 'Attch_4_Drawings.zip',
             'url': 'https://www.fbo.gov/utils/view?id=58e041568e210a73884254db1c069855'
         },
         {
             'desc': 'Attch 5 Wage Determination',
             'filename': 'Attch_5_Wage_Determination.docx',
             'url': 'https://www.fbo.gov/utils/view?id=7301f9274d34ebbf3ec3ff8df04968e4'
         },
         {
             'desc': 'Attch 6 Base Entry Policy',
             'filename': 'Attch_6_Base_Entry_Policy_Letter.pdf',
             'url': 'https://www.fbo.gov/utils/view?id=b4e13ed9cdeb5eec3822465565810457'
         }
     ])
Example 4
    def preprocess_documents(self):
        global link_extractor
        web_graph = graph.OptimizedDirectedGraph()

        with open('code_from_url_dict.pickle', 'rb') as handle:
            code_from_url = pickle.load(handle)

        # Pages were saved under sequential integer codes, so iterate by
        # code rather than by directory listing:
        # for filename in os.listdir(self.FOLDER + '/pages/'):
        for filename in range(self.n_pages):
            with open(self.FOLDER + '/pages/' + str(filename)) as f:
                doc_text = f.read()

            # f.read() returns '' (not None) for an empty file.
            if not doc_text:
                print('empty doc:', filename)
                continue

            self.process_page(int(filename), doc_text)

            link_extractor = LinkExtractor(self.FOLDER, self.HOMEPAGE,
                                           self.DOMAIN_NAME)
            link_extractor.feed(doc_text)
            links = link_extractor.page_links()
            # debug: print(f'document {filename}: {len(links)} total links')

            web_graph.add_node(int(filename))
            for url in links:
                # Only add edges to pages that were actually crawled
                # and assigned a code.
                if url in code_from_url:
                    web_graph.add_edge(int(filename), code_from_url[url])
            # debug: print(f'node {filename}: {web_graph.get_pointing_to(int(filename))}')
        return web_graph
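
`graph.OptimizedDirectedGraph` is also defined elsewhere. A minimal sketch of the interface this method relies on (`add_node`, `add_edge`, and the `get_pointing_to` reverse lookup referenced in the debug comment), assuming an adjacency-set representation; the original class may differ:

from collections import defaultdict

class OptimizedDirectedGraph:
    # Hypothetical adjacency-set implementation matching the calls above.
    def __init__(self):
        self._out = defaultdict(set)  # node -> nodes it links to
        self._in = defaultdict(set)   # node -> nodes linking to it

    def add_node(self, node):
        self._out[node]  # touching the defaultdict registers the node

    def add_edge(self, src, dst):
        self._out[src].add(dst)
        self._in[dst].add(src)

    def get_pointing_to(self, node):
        return self._in[node]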
Example 5
 def test_get_opp_solnbr(self):
     extractor = LinkExtractor('samples/source1.html')
     self.assertEqual(extractor.get_opp_solnbr(), 'FA4626-14-R-0011')
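
The test methods in Examples 3 and 5 take `self`, so they presumably live in a `unittest.TestCase` subclass whose definition is not shown. A minimal scaffold, assuming that structure (the class name is hypothetical, and `LinkExtractor` must be importable from the module under test):

import unittest

class LinkExtractorTest(unittest.TestCase):  # hypothetical class name
    def test_get_opp_solnbr(self):
        extractor = LinkExtractor('samples/source1.html')
        self.assertEqual(extractor.get_opp_solnbr(), 'FA4626-14-R-0011')

if __name__ == '__main__':
    unittest.main()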