def delete_404_links(days=60):
    """Re-check stale download results and soft-delete links that 404.

    Every non-deleted ``DownloadSourceResult`` whose ``last_check`` is older
    than ``days`` days is re-verified against its source, fanning the checks
    out over a small thread pool.

    :param days: age threshold in days for results that need re-checking
        (defaults to 60, the historical hard-coded value).
    """
    logger = delete_404_links.log
    # Timezone-aware cutoff; results last checked before this instant are due.
    # datetime.now(timezone.utc) is the modern equivalent of the deprecated
    # utcnow().replace(tzinfo=timezone.utc).
    cutoff = datetime.now(timezone.utc) - timedelta(days=days)
    download_results = DownloadSourceResult.search({
        'last_check__lt': cutoff,
        'deleted': False
    })
    logger(f'{len(download_results)} download results need to be checked')
    # Only enabled sources participate in the existence checks.
    download_sources_map = {
        ds.source_name: ds
        for ds in get_all_download_sources()
        if ds.enabled
    }
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = []
        for download_result in download_results:
            future = executor.submit(
                _check_download_result_existence, download_result,
                download_sources_map, logger)
            # Stash a human-readable message on the future so it can be
            # logged when its task completes.
            future.log_msg = f'Checking download result {download_result.name}'
            futures.append(future)
        # wait until completed
        for future in concurrent.futures.as_completed(futures):
            logger(future.log_msg)
            future.result(timeout=600)
def handle(self, *args, **options):
    """Re-guess the language of every non-deleted download result.

    Pages through ``DownloadSourceResult`` and, for each result attached to
    an audiovisual record, re-runs language detection with the record's
    people names stripped first; saves only when the guess changed.
    """
    page = 1
    has_next_page = True
    page_size = 500
    # BUG FIX: the original tested `has_next_page is not None`, which loops
    # forever if the paginator reports next_page as a falsy non-None value
    # (e.g. False).  Plain truthiness matches the sibling quality-fixing
    # command and terminates on both None and False.
    while has_next_page:
        paginator = DownloadSourceResult.search(
            {'deleted': False}, paginate=True, page_size=page_size, page=page)
        total_pages = paginator.get('total_pages')
        print(
            f'Checking downloads: {(page - 1) * page_size}/{page * page_size} / Page: {page}/{total_pages}'
        )
        results = paginator.get('results')
        for ds in results:
            # Orphan results (no parent record) cannot provide people names.
            if ds.audiovisual_record is None:
                continue
            ar = ds.audiovisual_record
            people = ar.directors + ar.writers + ar.stars
            # People names are stripped before detection so e.g. a French
            # director's name does not bias the language guess.
            remove_first = [person['name'].lower() for person in people]
            new_lang = guess_language(ds.name, remove_first=remove_first)
            if new_lang != ds.lang:
                ds.lang = new_lang
                ds.save()
        has_next_page = paginator.get('next_page', None)
        page += 1
def get_source_results(self, logger=None, sleep_between_requests=30,
                       remove_first=None):
    """Scrape the source page and return candidate ``DownloadSourceResult``s.

    :param logger: callable used for progress logging (stored on self).
    :param sleep_between_requests: seconds to pause between HTTP requests.
    :param remove_first: optional list of lowercased names to strip before
        language detection; falls back to ``self._remove_first``.  Added so
        callers (e.g. the zero-results re-check) can override the default —
        previously passing this keyword raised TypeError.
    :raises DownloadSourceException: when the HTTP session yields no response.
    :returns: post-processed list of results that matched with ratio >= 0.8.
    """
    self._logger = logger
    response = self._get_http_response(sleep_between_requests)
    if response is None or response.content is None:
        raise DownloadSourceException('Response from session was None')
    if response.status_code == 404:
        return []
    html_dom = HTML(html=response.content)
    results = []
    # Loop-invariant: the remover depends only on self._name, so build once.
    name_remover = RemoveAudiovisualRecordNameFromString(self._name)
    if remove_first is None:
        remove_first = self._remove_first
    for a in html_dom.find('a'):
        name = a.text
        # Anchors with very short text are navigation noise, not titles.
        if len(name) < 4:
            continue
        link = list(a.links)[0] if len(a.links) > 0 else ''
        if link == '':
            continue
        # Detect quality on the text with the record name removed, so the
        # title itself cannot be mistaken for a quality marker.
        text_without_name = name_remover.replace_name_from_string(name)
        quality_detector = VideoQualityInStringDetector(text_without_name)
        source_name = self.source_name
        name = name.strip()
        quality = quality_detector.quality
        if not link.lower().startswith('http'):
            # Relative link: resolve against the source's base url.
            link = self.base_url + link
        language = guess_language(name, default=self.language,
                                  remove_first=remove_first)
        result = DownloadSourceResult(
            source_name=source_name, name=name, link=link,
            quality=quality, lang=language, audiovisual_record=None
        )
        valid_result, ratio = self._valid_result(result)
        # Keep only results similar enough to the record name.
        if ratio >= 0.8:
            self.log(f'??? Possible valid result {name} {link}. Ratio: {ratio}')
            results.append(result)
    return self.post_process_results(results)
def remove_download(request, object_id):
    """Soft-delete one download result and flag its record for re-check.

    Superuser-only (403 otherwise).  Always redirects afterwards: back to the
    referring page when one is present, else to the site root.
    """
    if not request.user.is_superuser:
        return HttpResponse(status=403)
    _id = ObjectId(object_id)
    try:
        download = DownloadSourceResult.search({'_id': _id})[0]
        download.delete()
        # Mark the parent record so downloads get re-fetched for it later.
        download.audiovisual_record.metadata['recheck_downloads'] = True
        download.audiovisual_record.save()
    except IndexError:
        # No download with that id: nothing to delete.
        pass
    finally:
        try:
            referer = request.META['HTTP_REFERER']
            return redirect(referer)
        except KeyError:
            # BUG FIX: request.META is a dict, so a missing referer raises
            # KeyError — the original caught IndexError and would crash when
            # no HTTP_REFERER header was sent.
            return redirect('/')
def handle(self, *args, **options):
    """Walk every non-deleted download result and refresh its quality.

    Pages through ``DownloadSourceResult``; whenever the detector disagrees
    with the stored quality the result is updated and saved.
    """
    page_size = 500
    page = 1
    while True:
        paginator = DownloadSourceResult.search(
            {'deleted': False}, paginate=True, page_size=page_size, page=page)
        total_pages = paginator.get('total_pages')
        print(
            f'Checking downloads: {(page - 1) * page_size}/{page * page_size} / Page: {page}/{total_pages}'
        )
        for result in paginator.get('results'):
            # Skip orphan results with no parent record.
            if result.audiovisual_record is None:
                continue
            detected = VideoQualityInStringDetector(result.name).quality
            if detected == result.quality:
                continue
            print(f'Processing {result}')
            result.quality = detected
            result.save()
        if not paginator.get('next_page', False):
            break
        page += 1
def _referer_get_params(request):
    """Extract the query-string parameters of the referring page.

    Lets the template rebuild the previous search/filter links.  Returns an
    empty dict when there is no referer or it carries no query string.
    """
    try:
        referer_uri = urllib.parse.unquote(request.META['HTTP_REFERER'])
        return {
            # split('=', 1): keep '=' characters inside parameter values
            # (the original split on every '=' and truncated such values).
            p.split('=', 1)[0]: p.split('=', 1)[1]
            for p in referer_uri.split('?')[1].split('&')
        }
    except (IndexError, KeyError):
        return {}


def _group_downloads_by_lang(downloads):
    """Group downloads per language, keeping at most 10 unique names each.

    Returns a list of ``(lang_code, downloads, human_name)`` tuples in a
    fixed language order, skipping languages with no downloads.  A name is
    consumed by the first language that matches it, even past the 10-item
    cap, so it never reappears under a later language.
    """
    lang_translations = {
        'eng': 'English', 'rus': 'Russian', 'spa': 'Spanish', 'hin': 'Hindi',
        'deu': 'German', 'ita': 'Italian', 'jpn': 'Japanese',
        'fra': 'French', 'kor': 'Korean', 'gre': 'Greek', 'pol': 'Polish',
    }
    names_used = set()  # set membership: O(1) vs the original list's O(n)
    lang_downloads = []
    for lang in [
        'eng', 'rus', 'spa', 'deu', 'fra', 'ita', 'gre', 'pol', 'hin',
        'jpn', 'kor'
    ]:
        ds = []
        for d in downloads:
            if d.lang == lang and d.name not in names_used:
                names_used.add(d.name)
                ds.append(d)
        ds = ds[:10]
        if len(ds) > 0:
            lang_downloads.append((lang, ds, lang_translations[lang]))
    return lang_downloads


def details(request, slug=None):
    """Render the detail page for one audiovisual record.

    Looks the record up by slug (404 page when absent), decorates its scores
    and people with external/search urls, and gathers related records plus
    downloads grouped by language for the template.
    """
    get_params = _referer_get_params(request)
    audiovisual_records = AudiovisualRecord.search({
        'deleted': False,
        'has_downloads': True,
        'general_information_fetched': True,
        'slug': slug
    })
    if len(audiovisual_records) == 0:
        context = {'genres_names': _get_genres()}
        return render(request, 'web/404.html', status=404, context=context)
    audiovisual_record = audiovisual_records[0]

    # Attach to each score the external page it was scraped from.
    for score in audiovisual_record.scores:
        source = get_general_information_source_by_name(
            score.get('source_name'))
        score['external_url'] = source.base_url + audiovisual_record.metadata[
            'detailed_page'][source.source_name]

    # Add to each person the search url to be used later in the template
    people = (audiovisual_record.directors + audiovisual_record.writers +
              audiovisual_record.stars)
    for person in people:
        person.search_url = f'/s/?ft=b&s="{person.name}"'.replace(' ', '+')

    # Related records: best-scored other records sharing at least one star.
    related_records = AudiovisualRecord.search(
        {
            'deleted': False, 'has_downloads': True,
            'general_information_fetched': True,
            'name__neq': audiovisual_record.name,
            'stars__name__in': [
                person.name for person in audiovisual_record.stars
            ],
        },
        page_size=10, page=1, paginate=True,
        sort_by=['-global_score']).get('results')

    downloads = DownloadSourceResult.search(
        {
            'audiovisual_record': audiovisual_record,
            'deleted': False
        },
        sort_by='quality')
    lang_downloads = _group_downloads_by_lang(downloads)

    context = {
        'context_class': 'details',
        'is_landing': True,
        'audiovisual_record': audiovisual_record,
        'downloads': downloads,
        'lang_downloads': lang_downloads,
        'filter_params': get_params,
        'genres_names': _get_genres(),
        'qualities': VideoQualityInStringDetector.our_qualities,
        'related_records': related_records,
        'year_range': [
            # datetime.now(timezone.utc).year replaces the deprecated
            # int(datetime.utcnow().strftime('%Y')).
            str(y) for y in range(1970, datetime.now(timezone.utc).year + 1)
        ]
    }
    return render(request, 'web/details.html', context=context)
def _worker_get_download_links(source_class, audiovisual_record, logger):
    """Fetch download links for one record from one source and save new ones.

    Network/source failures are logged (a DNS failure additionally disables
    the source).  On success, unseen results are persisted and the record is
    marked as reviewed for this source in its metadata.
    """
    source = source_class(audiovisual_record.name, year=audiovisual_record.year)
    try:
        logger(f'get downloads links for {audiovisual_record.name}')
        results = source.get_source_results(logger=logger,
                                            sleep_between_requests=60)
        logger(f'{len(results)} for {audiovisual_record.name}')
    except DownloadSourceException as e:
        log_exception(e)
        # TODO maybe increase an error counter?
    except PhantomBrowsingSession.DomainError as e:
        # domain cannot be resolved to IP address
        # disable the source
        configuration = get_download_source_configuration(source_class)
        configuration.data['enabled'] = False
        configuration.save()
        log_exception(e)
    except PhantomBrowsingSession.RemoteServerError as e:
        # cannot connect to ports 80 / 443
        log_exception(e)
        # TODO maybe increase an error counter?
    else:
        if len(results) == 0:
            # Dump the raw response for offline inspection of why the page
            # yielded nothing.
            last_response = source._last_response
            if last_response is not None:
                response_filename = _get_response_filename(
                    audiovisual_record.name, source_class.source_name)
                with open(response_filename, 'wb') as fh:
                    fh.write(last_response.content)
        # Track consecutive zero-result searches; once a threshold is
        # reached the source gets disabled.
        _check_zero_results(results, source_class, audiovisual_record, logger)

        for result in results:
            if result.quality == 'Audio':
                continue
            # if link exists do nothing
            relative_url = urlparse(result.link).path
            exists = DownloadSourceResult.search({
                'source_name': source_class.source_name,
                'link__icontains': relative_url
            })
            exists += DownloadSourceResult.search({
                'source_name': source_class.source_name,
                'name': result.name
            })
            if len(exists) > 0:
                continue
            result.audiovisual_record = audiovisual_record
            result.save()
            logger(f'+++ Valid result {result.name} {result.link}.')

        audiovisual_record.refresh()
        metadata = audiovisual_record.metadata
        if 'downloads_fetch' not in metadata:
            metadata['downloads_fetch'] = {}
        metadata['downloads_fetch'][source_class.source_name] = True
        audiovisual_record.save()
        logger(
            f'Marked {audiovisual_record.name} as reviewed for source {source_class.source_name}'
        )
def _check_zero_results(results, source_class, audiovisual_record, logger):
    """Track consecutive zero-result searches and disable a broken source.

    Each zero-result search for a record not seen before increments a
    counter on the source configuration; any non-empty search resets it.
    Past 300 consecutive misses, the last previously-good film is re-tried:
    if even that yields nothing the source is disabled (its HTML structure
    probably changed); otherwise the counter is reset.

    :raises DownloadSourceException: when the source ends up disabled.
    """
    configuration = get_download_source_configuration(source_class)
    if len(results) == 0:
        # Only count each record once towards the zero-results streak.
        if audiovisual_record.id not in configuration.data['audiovisual_ids']:
            configuration.data['zero_results_searches'] += 1
            configuration.data['audiovisual_names'].append(
                audiovisual_record.name)
            configuration.data['audiovisual_ids'].append(audiovisual_record.id)
    else:
        configuration.data['zero_results_searches'] = 0
        configuration.data['audiovisual_names'] = []
        configuration.data['audiovisual_ids'] = []
    configuration.save()

    # if there is a lot of results with 0 length we do this additional check:
    # get last good downloads for this source, get the film and try again.
    # if results now are zero, disable the source,
    # because the html structure of the web maybe changed
    if configuration.data['zero_results_searches'] > 300:
        # BUG FIX: with paginate=True, search() returns a paginator dict
        # (see the paging commands elsewhere in this module), so the results
        # list must be taken out of it — the original code called len() and
        # [0] on the dict itself, which could never work.
        previous_good_searches = DownloadSourceResult.search(
            {
                'deleted': False,
                'source_name': source_class.source_name
            },
            paginate=True, page_size=1, page=1).get('results', [])
        if len(previous_good_searches) > 0:
            previous_good_search = previous_good_searches[0]
            previous_audiovisual_record = previous_good_search.audiovisual_record
            ds = source_class(previous_audiovisual_record.name,
                              year=previous_audiovisual_record.year)
            ar = previous_audiovisual_record
            people = ar.directors + ar.writers + ar.stars
            remove_first = [person.name.lower() for person in people]
            results_check = ds.get_source_results(logger=logger,
                                                  remove_first=remove_first)
            if len(results_check) == 0:
                configuration.refresh()
                configuration.data['enabled'] = False
                configuration.save()
            else:
                # The source still works for a known-good film: reset.
                configuration.refresh()
                configuration.data['enabled'] = True
                configuration.data['zero_results_searches'] = 0
                configuration.data['audiovisual_names'] = []
                configuration.data['audiovisual_ids'] = []
                configuration.save()
        else:
            # No historical good result to re-try against: disable outright.
            configuration.refresh()
            configuration.data['enabled'] = False
            configuration.save()
        if not configuration.data['enabled']:
            raise DownloadSourceException(
                f'Disabled {source_class.source_name} download source.')