Пример #1
0
def getAlexarank(url):
    """Return the Alexa reach rank reported for ``url``.

    Args:
        url: A ``Urlattributes`` instance, or any value accepted by the
            ``Urlattributes`` constructor.

    Returns:
        The ``rank`` attribute of the ``reach`` element found in the
        response from data.alexa.com, or ``None`` when it cannot be read.
    """
    if not isinstance(url, Urlattributes):
        url = Urlattributes(url)

    uri = "http://data.alexa.com/data?cli=10&dat=s&url=" + url.geturl()
    soup = Urlattributes(uri).getsoup()
    try:
        rank = soup.find("reach")['rank']
    except Exception:
        # Best-effort lookup: a missing element or attribute means "no rank".
        # Catch Exception instead of using a bare except so that
        # KeyboardInterrupt and SystemExit are not silently swallowed.
        rank = None
    return rank
Пример #2
0
def getWot(url):
    """Fetch the mywot.com (WOT) reputation/confidence pair for ``url``.

    Args:
        url: A ``Urlattributes`` instance, or any value accepted by the
            ``Urlattributes`` constructor.

    Returns:
        A dict with integer ``reputation`` and ``confidence`` entries, or
        ``None`` when the response carries no usable two-element score pair.

    Raises:
        ValueError, SyntaxError: propagated from ``literal_eval`` when the
            trimmed response text is not a valid Python literal.
    """
    if not isinstance(url, Urlattributes):
        url = Urlattributes(url)

    # NOTE(review): the API key is hard-coded in the request URL; consider
    # moving it to configuration.
    endpoint = ("http://api.mywot.com/0.4/public_link_json2?hosts=" +
                url.geturl() +
                "/&callback=&key=d60fa334759ae377ceb9cd679dfa22aec57ed998")
    raw = Urlattributes(endpoint).gettext()

    # Drop the first character and the last four characters surrounding the
    # payload before decoding it.
    payload = literal_eval(raw[1:-4])
    if not isinstance(payload, dict):
        return None

    # Read the scores from the decoded structure instead of stringifying it
    # and cutting at the first "[...]", which depended on dict ordering.
    # NOTE(review): assumes each host entry maps component "0" to a
    # [reputation, confidence] pair, per the WOT API docs -- confirm.
    for entry in payload.values():
        if not isinstance(entry, dict):
            continue
        scores = entry.get('0')
        if not isinstance(scores, (list, tuple)) or len(scores) != 2:
            continue
        try:
            return {
                'reputation': int(scores[0]),
                'confidence': int(scores[1]),
            }
        except (TypeError, ValueError):
            # Non-numeric scores are treated as "no data", not as a crash.
            return None
    return None
    def assess(self):
        """Assess the requested site and return its stored record.

        Reads the feature flags (the keys of ``apiList``) and their
        ``<feature>Perc`` weights from ``self.request``, extracts the
        feature values for the ``site`` argument, evaluates any dynamically
        added ``dimensionN``/``apiN`` pairs, scores the result with
        ``webcredScore`` and writes everything to ``self.db``.

        Returns:
            dict: the record read back from ``self.db`` for the assessed
            URL, with the ``html`` and ``text`` entries removed. When the
            request fails before a URL is known (for example, no ``site``
            argument), the partial dict holding only ``error`` and
            ``assess_time`` is returned and nothing is written to the
            database.
        """
        started = datetime.now()

        if not isinstance(self.request, dict):
            self.request = dict(self.request.args)

        data = {}
        req = {'args': {}}
        percentage = {}
        site = None
        dump = True
        try:
            # Copy the requested features into req['args'] and collect the
            # weight ("<feature>Perc") given for each of them.
            # TODO come back and do this properly
            for keys in apiList.keys():
                value = self.request.get(keys, None)
                if not value:
                    continue
                # self.request.args is of ImmutableMultiDict form, so values
                # may arrive wrapped in a list.
                wrapped = isinstance(value, list)
                req['args'][keys] = str(value[0]) if wrapped else value
                weight = self.request.get(keys + "Perc")
                if weight:
                    percentage[keys] = weight[0] if wrapped else weight

            # to show wot ranking
            # req['args']['wot'] = "true"
            data['url'] = req['args']['site']

            site = Urlattributes(url=data['url'])

            # get genre
            # WARNING there can be some issue with it
            data['genre'] = self.request.get('genre', None)

            if data['url'] != site.geturl():
                data['redirected'] = site.geturl()

            data['lastmod'] = site.getlastmod()

            # site is not a WEBCred parameter
            del req['args']['site']

            # Check the database: is the url already present?
            if self.db.filter('url', data['url']).count():
                # If lastmod has not changed, only the columns that still
                # hold None are assessed again; otherwise every column is.
                if self.db.filter(
                        'lastmod',
                        data['lastmod']).count() or not data['lastmod']:
                    # get all existing data in dict format
                    data = self.db.getdata('url', data['url'])

                    # A None value indicates that the feature has not been
                    # successfully extracted yet, so skip only the columns
                    # that already hold a value.
                    for k, v in data.items():
                        if v or str(v) == '0':
                            # always assess loadtime
                            if k != 'pageloadtime':
                                req['args'][k] = 'false'
                    dump = False
                else:
                    data = self.db.getdata('url', data['url'])

            data = self.extractValue(req, apiList, data, site)

            # HACK 13 is a calculated number, refer to index.html, where new
            # dimensions are dynamically added
            # TODO come back and do this properly
            number = 13
            while True:
                dim = "dimension" + str(number)
                if dim not in self.request:
                    break
                api = "api" + str(number)
                try:
                    data[self.request.get(dim)[0]] = surface.dimapi(
                        site.geturl(),
                        self.request.get(api)[0])
                    percentage[dim] = self.request.get(api + "Perc")[0]
                except WebcredError as e:
                    data[self.request.get(dim)[0]] = getattr(
                        e, 'message', str(e))
                except Exception:
                    # Any other failure is recorded against the dimension
                    # without aborting the whole assessment.
                    data[self.request.get(dim)[0]] = "Fatal ERROR"
                number += 1

            data = webcredScore(data, percentage)

            data['error'] = None

        except WebcredError as e:
            # Fall back to str(e) when the exception has no ``message``.
            data['error'] = getattr(e, 'message', str(e))
            logger.debug(data['error'])
            dump = False
        except Exception:
            # Get current system exception
            _, ex_value, ex_traceback = sys.exc_info()

            # Format the unformatted stack trace tuples
            stack_trace = [
                "File : %s , Line : %d, Func.Name : %s, Message : %s" %
                (trace[0], trace[1], trace[2], trace[3])
                for trace in traceback.extract_tb(ex_traceback)
            ]

            logger.info(ex_value)
            logger.debug(stack_trace)
            # HACK if it's not webcred error,
            #  then probably it's python error
            data['error'] = 'Fatal Error'
            dump = False
            # The failure may have happened before data['url'] was set.
            logger.debug(data.get('url'))

        # The bookkeeping below used to live in a ``finally`` block that
        # ended with ``return``; that discarded in-flight exceptions such as
        # KeyboardInterrupt. Both handled paths above fall through to here.
        elapsed = str((datetime.now() - started).total_seconds())
        data['assess_time'] = elapsed

        url = data.get('url')
        if url is None:
            # Nothing can be stored or read back without a url key.
            logger.debug('Time = {}'.format(elapsed))
            return data

        # store it in data
        self.db.update('url', url, data)

        # dump text and html of html
        if dump:
            self.dumpRaw(site)

        data = self.db.getdata('url', url)

        # prevent users to know of dump location
        data.pop('html', None)
        data.pop('text', None)

        logger.debug(data.get('url'))
        logger.debug('Time = {}'.format(elapsed))
        logger.debug(data.get('error'))

        return data