def search():
    # `query`, `limit`, `video_candidates`, and `global_advanced_settings`
    # are provided by the enclosing scope.
    item_list = list()

    # TODO: validate that limit > 0

    while True:
        try:
            time.sleep(global_advanced_settings.getint('search_cooldown'))
            print('searching for: "' + query + '"')
            for url in google_search(query, stop=limit):
                if len(item_list) >= limit:
                    break
                # Skip links that are already queued as candidates.
                if any(url == candidate['link'] for candidate in video_candidates):
                    continue
                new_item = {'link': url}

                while True:
                    try:
                        new_item['pytube_result'] = YouTube(new_item['link'])
                        item_list.append(new_item)
                        break
                    except KeyError:
                        print('Pytube failed to initialize (KeyError). Trying again in 2 seconds.')
                        time.sleep(2)
                    except URLError:
                        print('Pytube failed to initialize (URLError). Trying again in 2 seconds.')
                        time.sleep(2)
                    except exceptions.RegexMatchError:
                        # Not a parsable YouTube URL; flag the item for removal.
                        new_item['delete_this_item'] = True
                        break

            break

        except HTTPError as e:
            if e.code == 503:
                print('------------------------------------------------------------------------------------')
                print('It seems that your IP address has been flagged by Google for unusual activity.')
                print('The flag is usually lifted after some time, so try again tomorrow.')
                print('If this is a recurring issue, increase the search cooldown under advanced settings.')
                print('------------------------------------------------------------------------------------')
                sys.exit()
            else:
                print('Failed to retrieve search results, trying again in 2 seconds: ' + e.msg)
                time.sleep(2)
                continue

        except URLError as e:
            print('Failed to retrieve search results, trying again in 2 seconds: ' + str(e.reason))
            time.sleep(2)
            continue

    return item_list
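A minimal sketch of how the function above could be driven, assuming module-level equivalents of the names it captures (`query`, `limit`, `video_candidates`, `global_advanced_settings`). The googlesearch/pytube imports are an assumption, since the listing omits them:

import configparser
import sys
import time
from urllib.error import HTTPError, URLError

from googlesearch import search as google_search  # assumed import (pip package `google`)
from pytube import YouTube, exceptions            # assumed import

config = configparser.ConfigParser()
config.read_dict({'advanced': {'search_cooldown': '5'}})
global_advanced_settings = config['advanced']  # section proxy supporting .getint()

query = 'site:youtube.com big buck bunny trailer'  # illustrative values
limit = 3
video_candidates = []  # candidates collected by earlier searches, if any

results = search()
print(len(results), 'candidate links collected')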
Example #2
def search(query):
    google_response = google_search(query + ' site:genius.com', stop=1)
    links = []
    for url in google_response:
        links.append(url)

    links_of_lyric_pages = []

    for link in links:
        # Keep only genius.com lyric pages; skip index, discussion, and song-list URLs.
        if link.count('/') != 3:
            continue
        if link.split('/')[3] in ['artists-index', 'discussions']:
            continue
        if (link.split('/')[3]).split('?')[0] in ['songs']:
            continue
        if link.count('-') == 0:
            continue
        if 'lyrics' not in link:
            continue
        links_of_lyric_pages.append(link)

    # Each entry has the shape:
    # {'description': "", 'url': "", 'query': ""}
    list_of_links_with_description = []
    for link in links_of_lyric_pages:
        url_lyric_description_part = link.split('/')[-1]
        # Drop the trailing 'lyrics' token and turn the '-' separators into spaces.
        description = ' '.join(url_lyric_description_part.split('-')[:-1])
        list_of_links_with_description.append({
            'description': description,
            'url': link,
            'query': query
        })
    return list_of_links_with_description
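A short usage sketch for the function above; the `google_search` import is an assumption (the listing does not show it), and the query string is illustrative:

from googlesearch import search as google_search  # assumed import

for entry in search('daft punk get lucky'):
    print(entry['description'], '->', entry['url'])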
Example #4

    # Lower-case all emails and deduplicate, then return them as a list.
    return list({email.casefold() for email in emails})


# Google Search
search_queries = ["webbyrå stockholm", "webbyrå göteborg"]  # Search queries
num_search_limit = 100  # Number of results to retrieve from Google search per search query

urls_to_scrape = []

for search_query in search_queries:
    url_index = 0
    for url_result in google_search(search_query,
                                    tld="se",
                                    num=num_search_limit,
                                    stop=num_search_limit,
                                    pause=2):

        # Extract the base url
        parts = urlsplit(url_result)
        base_url = "{0.scheme}://{0.netloc}".format(parts)

        # Check whether this base url has already been handled. If it has, but the
        # new result ranks better (a lower number), delete the old entry so it can be replaced.
        run_continue = False
        for i, entry in enumerate(urls_to_scrape):
            if entry['base_url'] == base_url:
                if entry['search_ranking'] > url_index:  # lower number = better ranking
                    del urls_to_scrape[i]
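The dedup-and-replace bookkeeping above is cut off in this listing. A dict keyed by base URL expresses the same "keep the best ranking per site" idea more directly; this is a substitute sketch, not the original continuation:

from urllib.parse import urlsplit
from googlesearch import search as google_search  # assumed import

search_query = "webbyrå stockholm"  # values from the snippet above
num_search_limit = 100

best_by_base_url = {}
for url_index, url_result in enumerate(google_search(search_query, tld='se',
                                                     num=num_search_limit,
                                                     stop=num_search_limit,
                                                     pause=2)):
    base_url = "{0.scheme}://{0.netloc}".format(urlsplit(url_result))
    current = best_by_base_url.get(base_url)
    # Keep the entry with the best (lowest) search ranking per base url.
    if current is None or current['search_ranking'] > url_index:
        best_by_base_url[base_url] = {'base_url': base_url,
                                      'url': url_result,
                                      'search_ranking': url_index}

urls_to_scrape = list(best_by_base_url.values())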
Example #5

def get_video_to_download(movie, search_suffix, filter_arguments):

    def scan_response(response):

        response['max_video_resolution'] = 0
        for result in response['items']:

            result['delete_this_item'] = False

            video = None
            for try_count in range(5):

                if try_count > 2:
                    # Final attempts: let any exception propagate.
                    time.sleep(1)
                    video = YouTube(result['link'])
                    break
                else:
                    try:
                        video = YouTube(result['link'])
                        break
                    except KeyError:
                        print('Pytube failed to initialize (KeyError). Trying again in 10 seconds.')
                        time.sleep(9)
                    except URLError:
                        print('Pytube failed to initialize (URLError). Trying again in 10 seconds.')
                        time.sleep(9)
                    except exceptions.RegexMatchError:
                        # Not a parsable YouTube URL; flag the item for removal.
                        result['delete_this_item'] = True
                        break

            if result['delete_this_item']:
                continue

            result['youtube_object'] = video
            result['title'] = video.title
            result['avg_rating'] = float(video.player_config_args['avg_rating'])
            result['view_count'] = int(video.player_config_args['view_count'])

            # Floor the view count so the weight (view_count / 60) used in scoring never drops below 1.
            if result['view_count'] < 60:
                result['view_count'] = 60

            result['video_resolution'] = 0
            for stream in video.streams.filter(type='video').all():
                try:
                    resolution = int(stream.resolution.replace('p', ''))
                except AttributeError:
                    resolution = 0

                if resolution > response['max_video_resolution']:
                    response['max_video_resolution'] = resolution
                if resolution > result['video_resolution']:
                    result['video_resolution'] = resolution

            try:
                if 'ad_preroll' in video.player_config_args:
                    result['ads_info'] = 'has ads'
                else:
                    result['ads_info'] = 'no ads'
            except ValueError:
                result['ads_info'] = 'no ads'

        return response

    def filter_response(response, arguments):

        items = list()

        for result in response['items']:

            append_video = True

            if result['delete_this_item']:
                continue

            for word in arguments['video_name_must_contain']:
                if word.lower() not in result['title'].lower():
                    append_video = False

            for word in arguments['video_name_must_not_contain']:
                if word.lower() in result['title'].lower():
                    append_video = False

            if append_video:
                items.append(result)

        response.pop('items')
        response['items'] = items

        return response

    def score_response(response, scoring_arguments):

        for result in response['items']:

            # Shrink the raw rating toward zero for low-view videos
            # (view_count was floored at 60, so the denominator is >= 1).
            result['true_rating'] = result['avg_rating'] * (1 - 1 / ((result['view_count'] / 60) ** 0.5))

            if result['video_resolution'] < 700:
                result['true_rating'] *= 0.90
                result['view_count'] *= 0.5

            # `video_name_tag_bonuses` maps a numeric multiplier to a list of
            # title keywords; the first matching keyword applies the multiplier.
            for bonus in scoring_arguments['video_name_tag_bonuses']:
                for word in scoring_arguments['video_name_tag_bonuses'][bonus]:
                    if word in result['title'].lower():
                        result['true_rating'] *= bonus
                        result['view_count'] *= bonus
                        break

        return response

    # search for movie
    search = movie.replace('(', '').replace(')', '').replace('[', '').replace(']', '') + ' ' + search_suffix
    search = search.replace('.', ' ').replace('_', ' ').replace('-', ' ').replace('  ', ' ').replace('  ', ' ')
    search = 'site:youtube.com ' + search

    item_list = list()
    for attempt in range(5):
        if attempt > 2:
            for url in google_search(search, stop=10):
                item = {'link': url}
                item_list.append(item)
            break
        else:
            try:
                for url in google_search(search, stop=10):
                    item = {'link': url}
                    item_list.append(item)
                break
            except URLError:
                print('Failed to retrieve search results, trying again in 10 seconds')
                time.sleep(10)
                continue

    # Discard the last three search results (safe even if fewer than three were returned).
    del item_list[-3:]
    search_response = {'items': item_list}

    search_response = scan_response(search_response)
    search_response = filter_response(search_response, filter_arguments)
    search_response = score_response(search_response, filter_arguments)

    # select video
    selected_movie = None

    top_score = 0
    top_view_count = 0

    for item in search_response['items']:

        print('-----------------------------------------------------------------')
        print(item['title'])
        print(item['ads_info'])
        print(item['video_resolution'])
        print(item['link'])
        print(item['true_rating'])
        print(item['view_count'])

        if item['true_rating'] > top_score:
            top_score = item['true_rating']

    # Among videos rated within 5% of the top score, pick the most-viewed one.
    for item in search_response['items']:
        if item['true_rating'] > top_score * 0.95:
            if item['view_count'] > top_view_count:
                top_view_count = item['view_count']
                selected_movie = item

    return selected_movie
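A hedged usage sketch for the function above. The three key names are taken from the function body, but the concrete keywords and the 1.1 bonus multiplier are illustrative values, not ones from the original project:

filter_arguments = {
    'video_name_must_contain': ['trailer'],
    'video_name_must_not_contain': ['reaction', 'review'],
    # keys are numeric multipliers, values are title keywords that earn them
    'video_name_tag_bonuses': {1.1: ['official', 'hd']},
}

best = get_video_to_download('Big Buck Bunny (2008)', 'trailer', filter_arguments)
if best is not None:
    print('selected:', best['title'], best['link'])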
Example #6
def discover_google_plagiat(query):
    # Cap the search at 20 links so it does not run too long.
    urls = google_search(query, stop=20)
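As written, the function returns nothing, and since `google_search` yields results lazily it performs no network traffic until iterated. A plausible completion (the `return list(...)` line is an assumption; the original body ends at the assignment):

def discover_google_plagiat(query):
    # Cap the search at 20 links so it does not run too long.
    urls = google_search(query, stop=20)
    return list(urls)  # assumed: materialize and return the links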
Example #7
    def get_google_urls(self):
        urls = google_search(self.query, stop=self.google_search_max_results)
        return urls
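`google_search` returns a generator, so this method hands back a lazy iterator. A minimal host-class sketch, assuming only the two attributes the method reads (`query`, `google_search_max_results`):

from googlesearch import search as google_search  # assumed import

class GoogleQuery:
    def __init__(self, query, google_search_max_results=10):
        self.query = query
        self.google_search_max_results = google_search_max_results

    def get_google_urls(self):
        urls = google_search(self.query, stop=self.google_search_max_results)
        return urls

urls = list(GoogleQuery('site:genius.com daft punk').get_google_urls())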