Exemplo n.º 1
0
    def GoogleSearch(self, site_url, srch_term, srch_kywrds="", page=1):
        """Search Google within ``site_url`` and return the plausible hits.

        The query sent to the searcher is
        ``"site:" + site_url + " " + srch_term + " " + srch_kywrds``.
        Every result yielded by the searcher is checked (not only the first
        one): a result is kept when more than 65% of the query words occur as
        substrings of its lower-cased ``title + " " + url``.

        Args:
            site_url: Value placed after the ``site:`` operator.
            srch_term: Main search text.
            srch_kywrds: Optional extra keywords appended to the query.
            page: Page number; the searcher receives ``page - 1``.

        Returns:
            A list of ``{"title": ..., "url": ...}`` dicts, one per accepted
            result, in the order the searcher yielded them.
        """
        from entertainment.xgoogle.search import GoogleSearch

        search_url = "site:" + site_url + " " + srch_term + " " + srch_kywrds
        gs = GoogleSearch(search_url)
        gs.results_per_page = 20
        gs.page = page - 1

        # split() with no argument drops empty tokens. split(" ") produced an
        # empty token whenever srch_kywrds was left at its default "" (or the
        # text contained repeated spaces); "" is a substring of every string,
        # so it always counted as a match and inflated the score.
        title_words = (srch_term + " " + srch_kywrds).lower().split()
        match_total = float(len(title_words))

        valid_results = []
        for result in gs.get_results():
            haystack = (result.title + " " + result.url).lower()
            match_count = sum(1 for word in title_words if word in haystack)

            # With no words to check there is nothing to reject on, so every
            # result is kept (this also avoids dividing by zero).
            if not title_words or (match_count / match_total) > 0.65:
                valid_results.append({"title": result.title, "url": result.url})

        return valid_results
Exemplo n.º 2
0
    def GoogleSearchByTitleReturnFirstResultOnlyIfValid(
        self,
        site_url,
        title,
        srch_kywrds="",
        item_count=1,
        title_extrctr="",
        exact_match=False,
        use_site_prefix=True,
        return_dict=False,
    ):
        """Return the first search result whose title plausibly matches ``title``.

        The query is ``site_url + " " + title + " " + srch_kywrds``, prefixed
        with ``site:`` when ``use_site_prefix`` is set. Results are examined in
        the order the searcher yields them and the first valid one wins.

        A result is valid when:

        * ``exact_match`` is set and its (extracted, lower-cased) title equals
          the lower-cased, stripped ``title``, with or without apostrophes; or
        * ``exact_match`` is not set, one of the two titles starts with the
          other, and more than 65% of the words of ``title`` occur in the
          result title (for a two-word title one matching word is enough).

        If nothing is valid and ``use_site_prefix`` is set, the search is
        repeated once without the ``site:`` prefix.

        Args:
            site_url: Site to search; placed right after ``site:``.
            title: Title being looked for.
            srch_kywrds: Optional extra keywords appended to the query.
            item_count: Assigned to the searcher's ``results_per_page``.
            title_extrctr: Regex (or list/tuple of regexes) whose group 1 is
                taken as the result title. With a single regex, results it
                does not match are skipped; with a list, the first matching
                regex is used and the raw title is kept when none match.
                ``""``, ``None`` or an empty list disables extraction.
            exact_match: Require title equality instead of word scoring.
            use_site_prefix: Prefix the query with ``site:``.
            return_dict: Return the result object itself instead of its URL.

        Returns:
            The matching result's ``url`` (or the result object when
            ``return_dict`` is set), or ``""`` when no result is valid.
        """
        import re

        from entertainment.xgoogle.search import GoogleSearch

        search_url = "site:" if use_site_prefix else ""
        search_url = search_url + site_url + " " + title + " " + srch_kywrds

        gs = GoogleSearch(search_url)
        gs.results_per_page = item_count

        title_lower = title.lower().strip()
        # split() with no argument ignores repeated whitespace. split(" ")
        # produced empty tokens for "a  b", and "" is a substring of every
        # result title, so each one counted as a matched word.
        title_words = title_lower.split()
        match_total = float(len(title_words))

        return_url = ""
        for result in gs.get_results():
            result_title = result.title.lower()

            # Falsy extractors (None, "", []) mean "use the title as is";
            # previously None was passed to re.search and raised TypeError.
            if title_extrctr:
                if isinstance(title_extrctr, (list, tuple)):
                    # First matching pattern wins; when none match the raw
                    # title is kept rather than the result being skipped.
                    for pattern in title_extrctr:
                        found = re.search(pattern, result_title)
                        if found:
                            result_title = found.group(1)
                            break
                else:
                    found = re.search(title_extrctr, result_title)
                    if not found:
                        continue
                    result_title = found.group(1)

            if exact_match:
                is_valid = (
                    result_title == title_lower
                    or result_title.replace("'", "") == title_lower.replace("'", "")
                )
            elif not result_title.startswith(title_lower) and not title_lower.startswith(result_title):
                is_valid = False
            elif not title_words:
                # Empty title: the prefix test above is the only criterion,
                # as before (and there is nothing to divide by).
                is_valid = True
            else:
                match_count = sum(1 for word in title_words if word in result_title)
                match_fraction = match_count / match_total
                is_valid = (match_total == 2 and match_fraction >= 0.5) or (match_fraction > 0.65)

            if is_valid:
                return_url = result if return_dict else result.url
                break

        if not return_url and use_site_prefix:
            # Nothing valid on the site-restricted search: retry unrestricted.
            return_url = self.GoogleSearchByTitleReturnFirstResultOnlyIfValid(
                site_url,
                title,
                srch_kywrds,
                item_count,
                title_extrctr,
                exact_match,
                use_site_prefix=False,
                return_dict=return_dict,
            )

        return return_url