class XzToeflListeningPipeline(FilesPipeline):
    """Download the media of a listening item, relink it, and store the item.

    The item is expected to carry:

    * ``question_audio_url`` -- URL of the audio clip (may be ``None``/empty),
    * ``question_audio_refer`` -- value sent as ``Referer`` for "corpus" audio,
    * ``question_content_file_url_list`` -- image URLs, where the k-th URL
      belongs to the k-th ``$img`` placeholder of the markup,
    * ``question_content`` -- a list whose first element is that markup.
    """

    # NOTE(review): the spelling "Listenging" is reproduced exactly because it
    # is passed to mangoDbUtil at runtime; presumably it names existing
    # storage -- confirm before correcting it.
    client = mangoDbUtil("Listenging")

    _PLACEHOLDER = "$img"
    _IMG_SRC_PREFIX = "upload/upload/img/"

    def get_media_requests(self, item, info):
        """Yield one download request for the audio clip and one per image.

        Audio URLs containing ``"corpus"`` are requested with browser-like
        headers (including the item's referer); every other URL is requested
        with default headers.
        """
        audio_url = item['question_audio_url']
        if audio_url is not None and str(audio_url).strip() != "":
            if "corpus" in audio_url:
                headers = {
                    'Accept': 'audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5',
                    'Host': 'file-corpus.zhan.com',
                    # The header name must be exactly "User-Agent"; the
                    # previous key 'User - Agent' (with spaces) was a
                    # different, meaningless header, and its value was
                    # whitespace-mangled as well.
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
                    'Referer': item['question_audio_refer'],
                }
                yield scrapy.Request(audio_url, headers=headers)
            else:
                yield scrapy.Request(audio_url)

        for url in item['question_content_file_url_list']:
            yield scrapy.Request(url)

    def file_path(self, request, response=None, info=None, item=None):
        """Return ``<parent directory name>/<file name>`` of the request URL.

        ``item`` is accepted (and ignored) so the override also matches the
        signature used by newer Scrapy releases.
        """
        path = urlparse(request.url).path
        return join(basename(dirname(path)), basename(path))

    @staticmethod
    def _inline_images(html, tags_by_position):
        """Replace selected ``$img`` placeholders of *html* in a single pass.

        ``tags_by_position`` maps the zero-based ordinal of a placeholder to
        its replacement markup; placeholders without an entry are kept.
        """
        pieces = html.split("$img")
        rebuilt = [pieces[0]]
        for position, tail in enumerate(pieces[1:]):
            rebuilt.append(tags_by_position.get(position, "$img"))
            rebuilt.append(tail)
        return "".join(rebuilt)

    def item_completed(self, results, item, info):
        """Point the item at the downloaded files, persist it, and return it.

        ``results`` is a sequence of ``(success, data)`` pairs; for successful
        downloads ``data`` provides the requested ``'url'`` and the stored
        ``'path'``.
        """
        import logging

        logging.getLogger(__name__).debug("media results: %s", results)

        audio_url = item['question_audio_url']
        has_audio = audio_url is not None and str(audio_url) != ""
        audio_key = safe_url_string(audio_url) if has_audio else None

        # Sort successful downloads into "the audio clip" and "everything
        # else" using the same rule as before: with an audio URL present, any
        # result whose URL contains "mp3" is only ever considered as audio.
        downloaded = {}
        for ok, data in results:
            if not ok:
                continue
            url = data['url']
            path = data['path']
            if has_audio and "mp3" in url:
                if url == audio_key:
                    item['question_audio_url'] = path
            else:
                downloaded[url] = path

        # Build every <img> tag first and substitute them in one pass.
        # Replacing one placeholder per download shifted the ordinal of the
        # remaining placeholders, so only the first image was placed
        # correctly.
        tags = {}
        for index, file_url in enumerate(item['question_content_file_url_list']):
            path = downloaded.get(safe_url_string(file_url))
            if path is not None:
                src = self._IMG_SRC_PREFIX + str(path).replace("\\", "/")
                tags[index] = "<img src='" + src + "'/>"

        content = item['question_content']
        if tags and content:
            article_html = content[0]
            new_article_html = self._inline_images(article_html, tags)
            # Leave the field untouched when nothing was substituted.
            if new_article_html != article_html:
                item['question_content'] = [new_article_html]

        self.client.insert(dict(item))
        return item
class SATPipeline(FilesPipeline):
    """Download the images of a SAT item, inline them, and store the item.

    The item is expected to carry:

    * ``question_content_file`` -- image URLs, where the k-th URL belongs to
      the k-th ``$img`` placeholder of the title markup,
    * ``question_title`` -- the markup string containing the placeholders.
    """

    client = mangoDbUtil("SAT")

    _PLACEHOLDER = "$img"
    _IMG_SRC_PREFIX = "upload/upload/img/sat/"

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``question_content_file``."""
        for url in item['question_content_file']:
            yield scrapy.Request(url)

    def file_path(self, request, response=None, info=None, item=None):
        """Return ``SAT/<parent directory name>/<file name>`` for the request.

        The components are combined with ``join`` instead of a hard-coded
        backslash, so the result is unchanged on Windows and is a real
        sub-directory path elsewhere. ``item`` is accepted (and ignored) so
        the override also matches the signature used by newer Scrapy
        releases.
        """
        path = urlparse(request.url).path
        return join("SAT", basename(dirname(path)), basename(path))

    @staticmethod
    def _inline_images(html, tags_by_position):
        """Replace selected ``$img`` placeholders of *html* in a single pass.

        ``tags_by_position`` maps the zero-based ordinal of a placeholder to
        its replacement markup; placeholders without an entry are kept.
        """
        pieces = html.split("$img")
        rebuilt = [pieces[0]]
        for position, tail in enumerate(pieces[1:]):
            rebuilt.append(tags_by_position.get(position, "$img"))
            rebuilt.append(tail)
        return "".join(rebuilt)

    def item_completed(self, results, item, info):
        """Inline the downloaded images, persist the item, and return it.

        ``results`` is a sequence of ``(success, data)`` pairs; for successful
        downloads ``data`` provides the requested ``'url'`` and the stored
        ``'path'``.
        """
        import logging

        logging.getLogger(__name__).debug("media results: %s", results)

        downloaded = {}
        for ok, data in results:
            if ok:
                downloaded[data['url']] = data['path']

        # Build every <img> tag first and substitute them in one pass.
        # Replacing one placeholder per download shifted the ordinal of the
        # remaining placeholders, so only the first image was placed
        # correctly.
        tags = {}
        for index, file_url in enumerate(item['question_content_file']):
            path = downloaded.get(safe_url_string(file_url, encoding="utf8"))
            if path is not None:
                src = self._IMG_SRC_PREFIX + str(path).replace("\\", "/")
                tags[index] = "<img src='" + src + "'/>"

        if tags:
            item['question_title'] = self._inline_images(
                item['question_title'], tags)

        self.client.insert(dict(item))
        return item