Exemplo n.º 1
0
def findContentsFromURLList(urllist):
    """Make sure a distance is recorded for every pair of URLs in ``urllist``.

    Each pair ``(urllist[i], urllist[j])`` with ``i < j`` is visited once, in
    index order.  Pairs whose two entries compare equal are skipped.  For the
    remaining pairs ``fetchDistance`` is consulted first; only when it returns
    ``None`` or an empty result is the distance computed with
    ``calcTwoHTMLDistance`` from the ``findAverageContents`` value of each
    URL and then handed to ``storeDistance``.  Every outcome is reported
    through ``logger.debug``.

    Args:
        urllist: URLs to compare pairwise (duplicates are tolerated).

    Returns:
        None.
    """
    # Function-scope import so the block does not rely on a file-level import.
    from itertools import combinations

    # combinations(..., 2) yields exactly the (i, j) pairs with i < j, in the
    # same order as the nested index loops it replaces.
    for url1, url2 in combinations(urllist, 2):
        if url1 == url2:
            continue

        rs = fetchDistance(url1, url2)
        if rs is not None and len(rs) > 0:
            # A distance is already available; just report it.
            logger.debug("find distance [%s][%s]: %f " % (url1, url2, rs[0]))
            continue

        # Check url1 before touching url2 so that no work is spent on url2
        # when the pair is going to be skipped anyway.
        c1 = findAverageContents(fetchURLContents(url1))
        if c1 is None:
            logger.debug("[alert] [%s] has no contents" % url1)
            continue

        c2 = findAverageContents(fetchURLContents(url2))
        if c2 is None:
            logger.debug("[alert] [%s] has no contents" % url2)
            continue

        distance = calcTwoHTMLDistance(c1, c2)
        r = storeDistance(url1, url2, distance)
        logger.debug("calculate distance [%s][%s]: %f %s"
                     % (url1, url2, distance, r))
Exemplo n.º 2
0
def findContentsFromURLList(urllist):
    """Make sure a distance is recorded for every pair of URLs in ``urllist``.

    Each pair ``(urllist[i], urllist[j])`` with ``i < j`` is visited once, in
    index order.  Pairs whose two entries compare equal are skipped.  For the
    remaining pairs ``fetchDistance`` is consulted first; only when it returns
    ``None`` or an empty result is the distance computed with
    ``calcTwoHTMLDistance`` from the ``findAverageContents`` value of each
    URL and then handed to ``storeDistance``.  Every outcome is reported
    through ``logger.debug``.

    Args:
        urllist: URLs to compare pairwise (duplicates are tolerated).

    Returns:
        None.
    """
    # Function-scope import so the block does not rely on a file-level import.
    from itertools import combinations

    # combinations(..., 2) yields exactly the (i, j) pairs with i < j, in the
    # same order as the nested index loops it replaces.
    for url1, url2 in combinations(urllist, 2):
        if url1 == url2:
            continue

        rs = fetchDistance(url1, url2)
        if rs is not None and len(rs) > 0:
            # A distance is already available; just report it.
            logger.debug("find distance [%s][%s]: %f " % (url1, url2, rs[0]))
            continue

        # Check url1 before touching url2 so that no work is spent on url2
        # when the pair is going to be skipped anyway.
        c1 = findAverageContents(fetchURLContents(url1))
        if c1 is None:
            logger.debug("[alert] [%s] has no contents" % url1)
            continue

        c2 = findAverageContents(fetchURLContents(url2))
        if c2 is None:
            logger.debug("[alert] [%s] has no contents" % url2)
            continue

        distance = calcTwoHTMLDistance(c1, c2)
        r = storeDistance(url1, url2, distance)
        logger.debug("calculate distance [%s][%s]: %f %s"
                     % (url1, url2, distance, r))
Exemplo n.º 3
0
def extractAndStoreScriptsFromFileList(file_list_path):
    """Extract and store scripts for every URL listed in a text file.

    The file is read line by line; each line, stripped of surrounding
    whitespace, is taken as one URL.  Duplicate URLs are collapsed and blank
    lines are ignored.  For each URL ``fetchScripts`` is called first: when
    both values it returns are not ``None`` the URL is only reported.
    Otherwise its contents are loaded with ``fetchURLContents``, reduced with
    ``findAverageContents`` and passed to ``extractAndStoreScriptsFromDOM``.

    Progress messages go to stdout; per-URL failures go to stderr and do not
    stop the run.

    Args:
        file_list_path: Path of a text file holding one URL per line.

    Returns:
        None.
    """
    urls = set()
    # The context manager closes the file even if reading fails.
    with open(file_list_path) as f:
        for line in f:
            url = line.strip()
            if url:  # a blank line would otherwise become the URL ""
                urls.add(url)

    # Single-argument print(...) and sys.stderr.write(... + "\n") emit the
    # same bytes as the Python 2 print statements on both Python 2 and 3.
    for url in urls:
        print("prcossing scripts of %s " % url)
        hosts, inlines = fetchScripts(url)
        if hosts is not None and inlines is not None:
            print("%s already has %d hosts and %d inline scripts "
                  % (url, len(hosts), len(inlines)))
            continue

        contents = fetchURLContents(url)
        if contents is None or len(contents) == 0:
            sys.stderr.write("%s doesn't have contents " % url + "\n")
            continue

        content = findAverageContents(contents)
        if content is None:
            sys.stderr.write(
                "failed to extract average content for %s" % url + "\n")
            continue

        extractAndStoreScriptsFromDOM(url, content)
Exemplo n.º 4
0
def extractAndStoreScriptsFromFileList(file_list_path):
  """Extract and store scripts for every URL listed in a text file.

  The file is read line by line; each line, stripped of surrounding
  whitespace, is taken as one URL.  Duplicate URLs are collapsed and blank
  lines are ignored.  For each URL ``fetchScripts`` is called first: when
  both values it returns are not ``None`` the URL is only reported.
  Otherwise its contents are loaded with ``fetchURLContents``, reduced with
  ``findAverageContents`` and passed to ``extractAndStoreScriptsFromDOM``.

  Progress messages go to stdout; per-URL failures go to stderr and do not
  stop the run.

  Args:
    file_list_path: Path of a text file holding one URL per line.

  Returns:
    None.
  """
  urls = set()
  # The context manager closes the file even if reading fails.
  with open(file_list_path) as f:
    for line in f:
      url = line.strip()
      if url:  # a blank line would otherwise become the URL ""
        urls.add(url)

  # Single-argument print(...) and sys.stderr.write(... + "\n") emit the
  # same bytes as the Python 2 print statements on both Python 2 and 3.
  for url in urls:
    print("prcossing scripts of %s " % url)
    hosts, inlines = fetchScripts(url)
    if hosts is not None and inlines is not None:
      print("%s already has %d hosts and %d inline scripts "
            % (url, len(hosts), len(inlines)))
      continue

    contents = fetchURLContents(url)
    if contents is None or len(contents) == 0:
      sys.stderr.write("%s doesn't have contents " % url + "\n")
      continue

    content = findAverageContents(contents)
    if content is None:
      sys.stderr.write(
        "failed to extract average content for %s" % url + "\n")
      continue

    extractAndStoreScriptsFromDOM(url, content)