예제 #1
0
def main():
    """Fetch metrics for every URL in a source file and write them as CSV.

    Credentials are loaded from ``./keys.json`` and passed as keyword
    arguments to ``Pyscape``. URLs are read one per line from ``args.src``
    (trailing whitespace stripped), requested in batches of 10, and the
    accumulated results are handed to ``write_csv`` together with a UTF-8
    output file opened at ``args.dest``.
    """
    args = parser.parse_args()

    with open('./keys.json', 'r') as key_file:
        credentials = json.load(key_file)

    client = Pyscape(**credentials)

    # One URL per line; only trailing whitespace (incl. the newline) is removed.
    with open(args.src, 'r') as src_file:
        urls = [raw_line.rstrip() for raw_line in src_file]

    data = []
    for batch in batch_urls(urls, 10):
        response = client.batch_url_metrics(batch)
        data.extend(response.json())
        # Running total, printed after every batch.
        print('%s URLs returned' % (len(data)))

    with codecs.open(args.dest, 'w', encoding='utf-8') as outfile:
        write_csv(outfile, data)
예제 #2
0
def main():
    """Read URLs from a file, query their metrics in batches, save as CSV.

    Steps, in order:
      1. Parse command-line arguments with the module-level ``parser``.
      2. Load the ``Pyscape`` keyword arguments from ``./keys.json``.
      3. Read ``args.src`` line by line, stripping trailing whitespace.
      4. Call ``batch_url_metrics`` on batches of 10 URLs, collecting the
         decoded JSON results and printing a running count.
      5. Write everything to ``args.dest`` (UTF-8) through ``write_csv``.
    """
    options = parser.parse_args()

    with open('./keys.json', 'r') as handle:
        api_keys = json.load(handle)
    scraper = Pyscape(**api_keys)

    with open(options.src, 'r') as handle:
        url_list = [entry.rstrip() for entry in handle]

    results = []
    for chunk in batch_urls(url_list, 10):
        results.extend(scraper.batch_url_metrics(chunk).json())
        print('%s URLs returned' % (len(results)))

    with codecs.open(options.dest, 'w', encoding='utf-8') as outfile:
        write_csv(outfile, results)
예제 #3
0
class PyscapeAuthTestCase(unittest.TestCase):
    """Checks on Pyscape authentication and its basic query methods.

    NOTE(review): every test calls the client for real, so these presumably
    need network access and working credentials in VALID_KEYS -- confirm.
    """

    def setUp(self):
        """Create one client from VALID_KEYS and one from BAD_KEYS."""
        self.valid_instance = Pyscape(**VALID_KEYS)
        self.bad_instance = Pyscape(**BAD_KEYS)

    def test_auth_success(self):
        """get_index_stats() answers with status 200 for the valid client."""
        response = self.valid_instance.get_index_stats()
        self.assertEqual(200, response.status_code)

    def test_auth_fail(self):
        """get_index_stats() answers with status 401 for the bad client."""
        response = self.bad_instance.get_index_stats()
        self.assertEqual(401, response.status_code)

    def test_get_url_metrics(self):
        """get_url_metrics() returns 'distilled.net/' under the 'uu' key."""
        r = self.valid_instance.get_url_metrics('distilled.net')
        self.assertEqual('distilled.net/', r.json()['uu'])

    def test_get_anchor_text(self):
        """The first get_anchor_text() result contains the 'aput' key."""
        r = self.valid_instance.get_anchor_text('distilled.net')
        # assertIn reports the missing key and the container on failure,
        # unlike assertTrue(... in ...), which only says "False is not true".
        self.assertIn('aput', r.json()[0])

    def test_get_links(self):
        """The first get_links() result contains the 'luuu' key."""
        r = self.valid_instance.get_links('distilled.net')
        self.assertIn('luuu', r.json()[0])

    def test_get_top_pages(self):
        """The first get_top_pages() result contains the 'uu' key."""
        r = self.valid_instance.get_top_pages('distilled.net')
        self.assertIn('uu', r.json()[0])
class PyscapeAuthTestCase(unittest.TestCase):
    """Checks on Pyscape authentication and its basic query methods.

    NOTE(review): every test calls the client for real, so these presumably
    need network access and working credentials in VALID_KEYS -- confirm.
    """

    def setUp(self):
        """Create one client from VALID_KEYS and one from BAD_KEYS."""
        self.valid_instance = Pyscape(**VALID_KEYS)
        self.bad_instance = Pyscape(**BAD_KEYS)

    def test_auth_success(self):
        """get_index_stats() answers with status 200 for the valid client."""
        response = self.valid_instance.get_index_stats()
        self.assertEqual(200, response.status_code)

    def test_auth_fail(self):
        """get_index_stats() answers with status 401 for the bad client."""
        response = self.bad_instance.get_index_stats()
        self.assertEqual(401, response.status_code)

    def test_get_url_metrics(self):
        """get_url_metrics() returns 'distilled.net/' under the 'uu' key."""
        r = self.valid_instance.get_url_metrics('distilled.net')
        self.assertEqual('distilled.net/', r.json()['uu'])

    def test_get_anchor_text(self):
        """The first get_anchor_text() result contains the 'aput' key."""
        r = self.valid_instance.get_anchor_text('distilled.net')
        # assertIn reports the missing key and the container on failure,
        # unlike assertTrue(... in ...), which only says "False is not true".
        self.assertIn('aput', r.json()[0])

    def test_get_links(self):
        """The first get_links() result contains the 'luuu' key."""
        r = self.valid_instance.get_links('distilled.net')
        self.assertIn('luuu', r.json()[0])

    def test_get_top_pages(self):
        """The first get_top_pages() result contains the 'uu' key."""
        r = self.valid_instance.get_top_pages('distilled.net')
        self.assertIn('uu', r.json()[0])
예제 #5
0
 def setUp(self):
     """Build the two Pyscape clients stored on the test instance."""
     # Client built from VALID_KEYS (presumably working credentials -- confirm).
     self.valid_instance = Pyscape(**VALID_KEYS)
     # Client built from BAD_KEYS (presumably rejected credentials -- confirm).
     self.bad_instance = Pyscape(**BAD_KEYS)
예제 #6
0
    def downloadURL(self, url, check):
        """Download ``url``; on failure, optionally record it as a finding.

        Args:
            url: Address to fetch. Anything not starting with "http" is
                rejected immediately.
            check: When true the URL is being checked, so HTTP errors may be
                reported in the result list and output file. When false the
                URL is only crawled.

        Returns:
            The raw response body when the download succeeds; ``False`` when
            the URL is rejected, was already checked, or the download failed;
            ``None`` when ``self.stopThread`` is set.
        """
        # Only URLs starting with "http" are handled.
        if not url.startswith("http"):
            return False

        if check:
            self.logMessage("down url CHECK : " + url)

        if self.stopThread:
            self._is_running = False
            return

        # Set by the error handlers below: whether to run a whois lookup and
        # whether the failure qualifies for reporting.
        regardeWhois = False
        traitement = False
        # Defaults so the report line further down can always be built; the
        # original left ``codeerreur`` unbound on some error paths, which
        # raised UnboundLocalError when the finding was written out.
        codeerreur = "?"
        whois_info = "Whois : ?"

        try:
            if check:
                # Depending on the radio button, check only the domain root
                # instead of the full URL.
                if self.varRadioDomain.get() == 2:
                    parsed_uri = urlparse(url)
                    url = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

                # NOTE(review): if these containers are plain queue.Queue
                # objects, ``in`` raises TypeError, which the broad handler
                # at the bottom would swallow -- confirm the container type.
                if url in self.queueUrlsChecked:
                    return False

                self.queueUrlsChecked.put(url)
                self.maj_compteur()
                self.logMessage("⌕ Check url : " + str(url), True)
            else:
                self.logMessage("⌛ Crawl url : " + str(url), True)
                if url not in self.queueUrlsCrawled:
                    self.queueUrlsCrawled.put(url)

            headers = {'User-Agent': self.user_agent, 'referer': url}
            req = urllib.request.Request(url, data=None, headers=headers)
            # Context manager so the response is closed once it has been read.
            with urllib.request.urlopen(req) as response:
                return response.read()

        except urllib.error.URLError as e:
            print(e)

            if hasattr(e, 'code'):
                # The server answered with an HTTP error status.
                if check:
                    codeerreur = str(e.code)
                    self.logMessage("down url erreur : " + codeerreur)

                    if self.varErreurAll.get() == 1:
                        traitement = True
                    if codeerreur == "403" and self.varErreur403.get() == 1:
                        traitement = True
                    if codeerreur == "404" and self.varErreur404.get() == 1:
                        traitement = True
                        regardeWhois = True
                    if codeerreur == "500" and self.varErreur500.get() == 1:
                        traitement = True

                    # Never report the same URL twice.
                    if url in self.toutesUrlFinded:
                        traitement = False
            elif hasattr(e, 'reason'):
                # No status code, but the request still failed.
                print(str(e.reason))
                regardeWhois = True
                traitement = True
                # Set unconditionally: the label used to be assigned only
                # when the "expired only" option was on, leaving the
                # variable unbound otherwise.
                codeerreur = "Address not reachable"

        except socket.timeout:
            self.logMessage(" **** Erreur request **** : socket.timeout")

        except socket.gaierror:
            self.logMessage(" **** Erreur request **** : socket.gaierror")
            regardeWhois = True

        except SocketError as e:
            self.logMessage(" **** Erreur socket **** : " + str(e.errno))

        except Exception:
            self.logMessage(" **** Erreur - une autre exception bizarre...")

        resultat_whois = True
        if regardeWhois:
            # Double-check with whois; checkWhois returns a text plus a flag
            # that is False when the whois lookup failed.
            whois_info, resultat_whois = self.checkWhois(url)

        # Moz metrics, fetched only for reportable failures when both
        # credentials are configured.
        mozda = "?"
        mozpa = "?"
        mozlinks = "?"
        if self.moz_access_id != "" and self.moz_secret_id != "" and traitement:
            try:
                keys = {
                    "access_id": self.moz_access_id,
                    "secret_key": self.moz_secret_id
                }
                p = Pyscape(**keys)
                resultat = p.get_url_metrics(url).json()
                mozda = str(round(resultat["pda"], 0))
                mozpa = str(round(resultat["upa"], 0))
                mozlinks = str(resultat["uid"])
            except Exception:
                self.logMessage("Problem reading Moz Rank (have you correctly configured config.cfg ?)")

        # Report either every qualifying error, or -- when only expired
        # domains are wanted -- just those whose whois lookup returned False.
        affiche_dans_liste = False
        if traitement:
            if self.varErreurExpired.get() == 1:
                if resultat_whois == False:
                    affiche_dans_liste = True
            else:
                affiche_dans_liste = True

        # Add the URL to the result list, the tree widget and the output file.
        if affiche_dans_liste and url not in self.toutesUrlFinded:
            self.toutesUrlFinded.append(url)
            temp = url + " | Error : " + codeerreur + " | DA : " + mozda + " | PA : " + mozpa + " | Links : " + mozlinks + " | " + whois_info
            self.tree.insert("", 0, text=str(temp))
            with codecs.open(self.nomfichier, 'a', 'utf-8') as file_:
                file_.write(temp + "\n")

        # The download did not succeed.
        return False
 def setUp(self):
     """Build the two Pyscape clients stored on the test instance."""
     # Client built from VALID_KEYS (presumably working credentials -- confirm).
     self.valid_instance = Pyscape(**VALID_KEYS)
     # Client built from BAD_KEYS (presumably rejected credentials -- confirm).
     self.bad_instance = Pyscape(**BAD_KEYS)