示例#1
0
def proximateScoring(phrase):
    """Score a phrase by the domain of the text surrounding it in search results.

    Searches for the exact (double-quoted) phrase, concatenates the
    description snippets of the returned results (each followed by a single
    space) and passes that text to ``identifyDomain``.

    Args:
        phrase: The phrase to search for.

    Returns:
        A list holding the values returned by ``identifyDomain`` followed by
        the collected snippet text as the last element. If the search raises
        ``SearchError`` the list is ``[0, 0, <text collected so far>]``.
    """
    gs = GoogleSearch('"' + phrase + '"')
    gs.results_per_page = 50
    # Pause before issuing the query (presumably to throttle requests).
    sleep(searchSleepTime)
    proximateText = ""
    try:
        results = gs.get_results()
        logging.info(gs.last_search_url)
        proximateText = "".join(result.desc + " " for result in results)
        logging.info("proximate text (" + phrase + "):" + proximateText)

        # Domain identification of the text. Copy the result so that appending
        # the text below does not mutate a list owned by identifyDomain.
        score = list(identifyDomain(proximateText))
    except SearchError as se:
        message = "Search Error on proximate scoring: " + str(se)
        logging.error(message)
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print(message)
        score = [0, 0]  # dummy scores
    score.append(proximateText)
    return score
示例#2
0
def hitScore(phrase):
    """Return the hit count reported on the results page for the exact phrase.

    The phrase is searched in double quotes and the raw results page is
    scanned for the ``resultStats`` element ("About N results").

    Args:
        phrase: The phrase to search for.

    Returns:
        The reported count as an ``int``. ``0`` is returned when the search
        raises ``SearchError``, when the page has no ``resultStats`` element,
        when it contains "No results found for", or when the count cannot be
        located in the page.
    """
    gs = GoogleSearch('"' + phrase + '"')
    gs.results_per_page = 50
    try:
        page = gs._get_results_page()
        logging.info(gs.last_search_url)
    except SearchError as se:
        logging.warning("Search Error on: " + phrase + " no results? " + str(se))
        return 0

    pageStr = str(page)
    if 'resultStats">' not in pageStr or "No results found for" in pageStr:
        logging.warning("No google hits! (" + phrase + ")")
        return 0

    # Capture only digits and thousands separators. A greedy "(.*)" capture
    # can run across unrelated page text and make int() raise ValueError.
    m = re.search(r'resultStats">.*?bout\s+(\d[\d,]*)\s+results</div', pageStr)
    if m is None:
        logging.warning("No match! .. no google hits? (" + phrase + ")")
        return 0

    score = m.group(1)
    logging.info("score (" + phrase + "): " + score)
    return int(score.replace(",", ""))
示例#3
0
def _augmentedFailure(reason, pageText):
    """Build the fallback result used when the denominator query fails."""
    # One zero for the denominator plus one zero per domain.
    hitScores = [0] + [0 for _ in domains]
    hitScores.append(reason)
    # `unicode` is the Python 2 builtin; the page text is assumed to be
    # UTF-8 encoded bytes -- TODO confirm against GoogleSearch's page type.
    hitScores.append(unicode(pageText, "utf-8", "strict"))
    logging.info(hitScores)
    return hitScores


def _augmentedHitCount(pageStr, searchStr):
    """Return the hit count found in a results page as a digit string, or None."""
    if 'resultStats">' not in pageStr or "No results found for" in pageStr:
        logging.warning("No google hits! (" + searchStr + ")")
        return None
    # Capture only digits and thousands separators: a greedy "about .* results"
    # match can span large parts of the page and break the float() conversion.
    m = re.search(r'resultStats">.*?bout\s+(\d[\d,]*)\s+results</div', pageStr)
    if m is None:
        logging.warning("No match! .. no google hits? (" + searchStr + ")")
        return None
    logging.info("score (" + searchStr + "): " + m.group(1))
    return m.group(1).replace(",", "")


def augmentedScoring(phrase):
    """Collect hit counts for a phrase alone and combined with each domain.

    First the exact (double-quoted) phrase is searched on its own; its hit
    count is the "denominator". Then one query per entry of ``domains`` is
    issued for the quoted phrase together with the quoted domain term.
    ``sleep(searchSleepTime)`` is called before every query.

    Args:
        phrase: The phrase to search for.

    Returns:
        On success, a list of floats: the denominator count followed by one
        raw (not normalised) count per domain. A domain whose query fails or
        yields no count contributes ``0.0``.

        If the denominator query fails, the list is instead: ``0``, one ``0``
        per domain, a reason string ("first: no match" or
        "first: search error: ..."), and the fetched page text as unicode
        (empty when the page could not be fetched).
    """
    # Without domains -> denominator.
    searchStr = '"' + phrase + '"'
    gs = GoogleSearch(searchStr)
    gs.results_per_page = 50
    sleep(searchSleepTime)
    try:
        page = gs._get_results_page()
        logging.info(gs.last_search_url)
    except SearchError as se:
        logging.warning("Search Error on: " + searchStr + " no results? " + str(se))
        return _augmentedFailure("first: search error: " + str(se), "")

    pageStr = str(page)
    count = _augmentedHitCount(pageStr, searchStr)
    if count is None:
        return _augmentedFailure("first: no match", pageStr)

    denominationScore = float(count)
    logging.info("denominator (" + searchStr + "): " + str(denominationScore))
    hitScores = [denominationScore]

    # Augmented with domains.
    for domain in domains:
        searchStr = '"' + phrase + '" "' + domain + '"'
        gs = GoogleSearch(searchStr)
        gs.results_per_page = 50
        sleep(searchSleepTime)
        count = None
        try:
            page = gs._get_results_page()
            logging.info(gs.last_search_url)
        except SearchError as se:
            logging.warning("Search Error on: " + searchStr + " no results? " + str(se))
        else:
            # Same page checks as for the denominator, so a fallback page
            # ("No results found for ...") is not counted as hits.
            count = _augmentedHitCount(str(page), searchStr)

        # The raw count is stored; it is not divided by the denominator.
        hitScores.append(float(count) if count is not None else 0.0)

    logging.info(hitScores)
    return hitScores