def proximateScoring(phrase):
    """Score *phrase* by the text Google shows next to its exact-match hits.

    The phrase is searched in double quotes, the description snippet of
    every returned result is concatenated (each followed by one space),
    and the combined text is handed to ``identifyDomain``.

    Returns the list produced by ``identifyDomain`` with the combined text
    appended as its last element. Note that the text is appended to that
    list object in place. If the search raises ``SearchError`` the error is
    logged and printed, and ``[0, 0, text]`` is returned instead, where
    ``text`` is whatever had been collected up to that point.
    """
    search = GoogleSearch('"' + phrase + '"')
    search.results_per_page = 50
    # Pause before the query is actually sent.
    sleep(searchSleepTime)

    snippets = []
    try:
        hits = search.get_results()
        logging.info(search.last_search_url)
        for hit in hits:
            snippets.append(hit.desc + " ")
        proximateText = "".join(snippets)
        logging.info("proximate text (" + phrase + "):" + proximateText)

        # Domain identification of the collected text.
        score = identifyDomain(proximateText)
        score.append(proximateText)
    except SearchError as se:
        message = "Search Error on proximate scoring: " + str(se)
        logging.error(message)
        print(message)
        # Dummy scores for the failure case.
        # NOTE(review): two zeros are hard-coded here - confirm this matches
        # the number of scores identifyDomain returns.
        score = [0, 0]
        score.append("".join(snippets))
    return score
def hitScore(phrase):
    """Return the hit count Google reports for the exact (quoted) phrase.

    The raw results page is fetched and the number shown in its
    ``resultStats`` element ("... about 1,234 results") is parsed.

    Returns the count as an ``int``. ``0`` is returned, after logging a
    warning, when:

    * the search raises ``SearchError``,
    * the page has no ``resultStats`` element or contains the text
      "No results found for",
    * the result-count pattern cannot be found in the page.
    """
    search = GoogleSearch('"' + phrase + '"')
    search.results_per_page = 50

    try:
        page = search._get_results_page()
        logging.info(search.last_search_url)
    except SearchError as se:
        logging.warning("Search Error on: " + phrase + " no results? " + str(se))
        return 0

    pageStr = str(page)
    if 'resultStats">' not in pageStr or "No results found for" in pageStr:
        logging.warning("No google hits! (" + phrase + ")")
        return 0

    # The count is captured as digits and commas only. A greedy "(.*)" can
    # run on to a later "results</div" on the same (often very long) line,
    # and the captured markup then makes int() raise ValueError.
    match = re.search(r'resultStats">.*?bout\s+(\d[\d,]*)\s+results</div', pageStr)
    if match is None:
        logging.warning("No match! .. no google hits? (" + phrase + ")")
        return 0

    count = match.group(1)
    logging.info("score (" + phrase + "): " + count)
    return int(count.replace(",", ""))
def _fetchResultsPage(searchStr):
    """Run one Google query for *searchStr* and return the results page as str."""
    gs = GoogleSearch(searchStr)
    gs.results_per_page = 50
    # Pause before every query.
    sleep(searchSleepTime)
    page = gs._get_results_page()
    logging.info(gs.last_search_url)
    return str(page)


def _parseHitCount(pageStr, searchStr):
    """Return the hit count found in *pageStr* as a float, or None if absent."""
    if 'resultStats">' not in pageStr or "No results found for" in pageStr:
        logging.warning("No google hits! (" + searchStr + ")")
        return None

    # Capture digits and commas only: a greedy "about (.*) results" can span
    # far more of the page than the number itself, and the captured markup
    # then makes float() raise ValueError.
    m = re.search(r'resultStats">.*?bout\s+(\d[\d,]*)\s+results</div', pageStr)
    if m is None:
        logging.warning("No match! .. no google hits? (" + searchStr + ")")
        return None

    count = m.group(1)
    logging.info("score (" + searchStr + "): " + count)
    return float(count.replace(",", ""))


def _augmentedFailure(reason, pageStr):
    """Build, log and return the row used when the denominator query fails."""
    hitScores = [0]
    hitScores.extend(0 for _ in domains)
    hitScores.append(reason)
    hitScores.append(unicode(pageStr, "utf-8", "strict"))
    logging.info(hitScores)
    return hitScores


def augmentedScoring(phrase):
    """Return Google hit counts for *phrase* alone and combined with each domain.

    The quoted phrase is searched first; its hit count is the denominator.
    Then, for every entry of ``domains``, the quoted phrase is searched
    together with the quoted domain term.

    Returns on success a list ``[denominator, count_1, ..., count_n]`` of
    floats, one count per entry of ``domains`` in order. A domain query
    that fails or shows no count contributes ``0.0``.

    If the first query fails, the list is instead
    ``[0, 0, ..., 0, reason, page]``: one integer zero for the denominator
    and one per domain, then a reason string ("first: no match" or
    "first: search error: ..."), then the fetched page as unicode (empty
    on a search error).
    """
    # Without domains -> denominator.
    searchStr = '"' + phrase + '"'
    try:
        pageStr = _fetchResultsPage(searchStr)
    except SearchError as se:
        logging.warning("Search Error on: " + searchStr + " no results? " + str(se))
        return _augmentedFailure("first: search error: " + str(se), "")

    denominationScore = _parseHitCount(pageStr, searchStr)
    if denominationScore is None:
        return _augmentedFailure("first: no match", pageStr)

    logging.info("denominator (" + searchStr + "): " + str(denominationScore))
    hitScores = [denominationScore]

    # Augmented with domains.
    for domain in domains:
        searchStr = '"' + phrase + '" "' + domain + '"'
        score = None
        try:
            # The "No results found for" check is applied here exactly as
            # for the first query, so a page carrying that notice scores 0
            # instead of whatever count it displays.
            score = _parseHitCount(_fetchResultsPage(searchStr), searchStr)
        except SearchError as se:
            logging.warning("Search Error on: " + searchStr + " no results? " + str(se))
        # Raw counts are stored; they are not divided by the denominator.
        hitScores.append(0.0 if score is None else score)

    logging.info(hitScores)
    return hitScores