def fetch_submission(self, slug):
    print(f"🤖 Fetching submission for problem: {slug}")
    query_params = {
        'operationName': "Submissions",
        'variables': {"offset": 0, "limit": 20, "lastKey": '', "questionSlug": slug},
        'query': '''query Submissions($offset: Int!, $limit: Int!, $lastKey: String, $questionSlug: String!) {
            submissionList(offset: $offset, limit: $limit, lastKey: $lastKey, questionSlug: $questionSlug) {
                lastKey
                hasNext
                submissions {
                    id
                    statusDisplay
                    lang
                    runtime
                    timestamp
                    url
                    isPending
                    __typename
                }
                __typename
            }
        }'''
    }

    resp = self.session.post(
        "https://leetcode.com/graphql",
        data=json.dumps(query_params).encode('utf8'),
        headers={
            "content-type": "application/json",
        })
    body = json.loads(resp.content)

    # parse data
    submissions = get(body, "data.submissionList.submissions")

    if submissions:
        for sub in submissions:
            # skip submissions that are already stored locally
            if Submission.get_or_none(Submission.id == sub['id']) is not None:
                continue

            if sub['statusDisplay'] == 'Accepted':
                # the submission source is embedded in the detail page's JavaScript
                url = sub['url']
                html = self.session.get(f'https://leetcode.com{url}').text
                pattern = re.compile(
                    r'submissionCode: \'(?P<code>.*)\',\n editCodeUrl', re.S
                )
                matched = pattern.search(html)
                code = matched.groupdict().get('code') if matched else None
                if code:
                    Submission.insert(
                        id=sub['id'],
                        slug=slug,
                        language=sub['lang'],
                        created=sub['timestamp'],
                        source=code.encode('utf-8')
                    ).execute()
                else:
                    raise Exception(f"Cannot get submission code for problem: {slug}")

    random_wait(10, 15)

def fetch_problem(self, slug, accepted=False):
    print(f"🤖 Fetching problem: https://leetcode.com/problems/{slug}/...")
    query_params = {
        'operationName': "getQuestionDetail",
        'variables': {'titleSlug': slug},
        'query': '''query getQuestionDetail($titleSlug: String!) {
            question(titleSlug: $titleSlug) {
                questionId
                questionFrontendId
                questionTitle
                questionTitleSlug
                content
                difficulty
                stats
                similarQuestions
                categoryTitle
                topicTags {
                    name
                    slug
                }
            }
        }'''
    }

    resp = self.session.post(
        "https://leetcode.com/graphql",
        data=json.dumps(query_params).encode('utf8'),
        headers={
            "content-type": "application/json",
        })
    body = json.loads(resp.content)

    # parse data
    question = get(body, 'data.question')

    # upsert the problem record
    Problem.replace(
        id=question['questionId'],
        display_id=question['questionFrontendId'],
        title=question["questionTitle"],
        level=question["difficulty"],
        slug=slug,
        description=question['content'],
        accepted=accepted
    ).execute()

    # upsert each tag and its problem-tag relation
    for item in question['topicTags']:
        if Tag.get_or_none(Tag.slug == item['slug']) is None:
            Tag.replace(
                name=item['name'],
                slug=item['slug']
            ).execute()

        ProblemTag.replace(
            problem=question['questionId'],
            tag=item['slug']
        ).execute()

    random_wait(10, 15)

def slowly_gather():
    max_range = 10000
    increment = 1
    current_start_page = 1 + len(
        [f for f in os.listdir('scraping') if '.xlsx' in f]) * increment
    while current_start_page < max_range:
        print('Start Page %s' % current_start_page)
        parser = AllRecipesParser(start_page=current_start_page,
                                  search_limit=increment)
        parser.main()
        current_start_page += increment
        random_wait(60)

def fetch_solution(self, slug):
    print(f"🤖 Fetching solution for problem: {slug}")
    query_params = {
        "operationName": "QuestionNote",
        "variables": {"titleSlug": slug},
        "query": '''
            query QuestionNote($titleSlug: String!) {
                question(titleSlug: $titleSlug) {
                    questionId
                    article
                    solution {
                        id
                        content
                        contentTypeId
                        canSeeDetail
                        paidOnly
                        rating {
                            id
                            count
                            average
                            userRating {
                                score
                                __typename
                            }
                            __typename
                        }
                        __typename
                    }
                    __typename
                }
            }
        '''
    }

    resp = self.session.post(
        "https://leetcode.com/graphql",
        data=json.dumps(query_params).encode('utf8'),
        headers={
            "content-type": "application/json",
        })
    body = json.loads(resp.content)

    # parse data
    solution = get(body, "data.question")

    # only store official solutions that exist and are freely readable
    if solution['solution'] and solution['solution']['paidOnly'] is False:
        Solution.replace(
            problem=solution['questionId'],
            url=f"https://leetcode.com/articles/{slug}/",
            content=solution['solution']['content']
        ).execute()

    random_wait(10, 15)

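# The LeetCode fetchers above read nested response fields with calls like
# get(body, "data.question"). The helper itself is not part of this listing; it behaves
# like a dotted-path lookup (pydash.get is one existing library with this behaviour).
# A minimal stand-in, assuming the path is a dot-separated chain of dict keys and that
# missing keys should yield None:
def get(obj, path, default=None):
    """Walk obj through a dot-separated key path, returning default if any key is missing."""
    current = obj
    for key in path.split('.'):
        if isinstance(current, dict) and key in current:
            current = current[key]
        else:
            return default
    return current
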
def get_urls_to_parse(self):
    search_url = self.base_url + '/' + self.base_search_page
    pageno = self.start_page
    search_page = self.call_function(requests.get,
                                     url=search_url + str(pageno),
                                     headers=HEADERS)
    if search_page == 'Error':
        return
    while search_page != 'Error' and search_page.status_code != 404 and self.search_limit > (
            pageno - self.start_page):
        if search_page.status_code == 503:
            # server temporarily unavailable: back off, then retry the same page
            random_wait(60)
            search_page = self.call_function(requests.get,
                                             url=search_url + str(pageno),
                                             headers=HEADERS)
            continue
        soup = BeautifulSoup(search_page.content, features='html.parser')
        self.parse_search_page(soup)
        pageno += 1
        random_wait(15)
        search_page = self.call_function(requests.get,
                                         url=search_url + str(pageno),
                                         headers=HEADERS)

def collect_articles(self):
    local_folder = 'html-downloads' + '\\' + self.base_url.split(
        'www')[-1].strip('.')
    if not os.path.exists(local_folder):
        os.makedirs(local_folder)
    for url in self.data:
        local_html = local_folder + '\\' + url.replace(
            self.base_url, '').strip('/').replace('/', '-') + '.html'
        if os.path.exists(local_html):
            print("Using Local Copy %s" % url)
            with open(local_html, 'rb') as html:
                content = html.read()
        else:
            random_wait(10)  # only need to throttle when hitting the live site
            print("Using live copy %s" % url)
            resp = self.call_function(requests.get, url=url, headers=HEADERS)
            if resp == 'Error':
                random_wait(20)
                continue
            if resp.status_code == 404:
                print("Cannot find %s" % url)
                continue
            if resp.status_code == 503:
                random_wait(20)
                continue
            content = resp.content
            # cache the page locally so reruns don't hit the site again
            if not os.path.exists(os.path.dirname(local_html)):
                os.makedirs(os.path.dirname(local_html))
            with open(local_html, 'wb') as html:
                html.write(content)
        soup = BeautifulSoup(content)
        try:
            if soup.contents[2] == self.alt_flag:  # TODO: implement alt parser
                print('Alt flagged', local_html)
                continue
            info = self.parse_article_page(soup)
        except Exception as wow:
            print('Serious error with url %s: %s' % (url, wow))
            continue
        self.data[url].update(info)

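# get_urls_to_parse and collect_articles both go through self.call_function(requests.get, ...)
# and compare the result with the string 'Error'; neither that wrapper nor HEADERS appears in
# this listing. The sketch below is only a guess at their shape, inferred from the call sites:
# a try/except shim that swallows request failures, plus a placeholder User-Agent header.
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # placeholder; the real headers are not shown here

def call_function(self, func, **kwargs):
    """Call func(**kwargs), returning the sentinel 'Error' on network failure (assumed behaviour)."""
    try:
        return func(**kwargs)
    except requests.RequestException as exc:
        print('Request failed: %s' % exc)
        return 'Error'
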
def update_to_goodreads(entries, cookies, disk_cache, limit, wait):
    """Update book entries to Goodreads.

    :param entries: list of books
    :param cookies: login cookie for Goodreads
    :param disk_cache: cache of updated books
    """
    session = requests.Session()
    success = []
    error = []
    for entry in entries:
        isbn13 = entry['isbn13']
        isbns = [isbn13]
        isbn10 = None  # conversion may fail; keep isbn10 defined either way
        try:
            isbn10 = pyisbn.convert(isbn13)
            isbns.append(isbn10)
        except Exception:
            pass
        resp = check_exists(session, (isbn10, isbn13), cookies)
        if not resp:
            logging.warning('{} couldn\'t be found'.format(repr_book(entry)))
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
            random_wait(2)
            continue
        url = get_edit_url(resp)
        if not url:
            logging.warning('{}\'s url is not found'.format(repr_book(entry)))
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
            random_wait(2)
            continue
        submit_url, form_data = get_form_data(session, cookies, url)
        if not form_data:
            logging.warning('{}\'s form data is not found'.format(repr_book(entry)))
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
            random_wait(2)
            continue

        # Do not cause any updates
        form_data['review[cog_explicit]'] = '0'
        for key in ('add_to_blog', 'add_update'):
            if key in form_data:
                form_data[key] = '0'

        # sanity check
        if len([key for key in form_data
                if 'readingSessionDatePicker' in key]) != 10:
            logging.warning('{}\'s date is problematic'.format(repr_book(entry)))
            logging.warning(form_data)
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
            continue

        if update_book(entry, form_data, submit_url, session, cookies):
            success.append(entry)
            disk_cache[entry['isbn13']] = ''
        else:
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'

        if limit is not None and len(success) >= limit:
            break
        random_wait()
    return success, error

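# Every snippet in this listing throttles itself with random_wait(...), called with zero, one,
# or two arguments, but the helper is never defined here. A minimal sketch consistent with
# those call sites, assuming it simply sleeps for a random number of seconds (the default
# upper bound below is an assumption, not taken from the original code):
import random
import time

def random_wait(min_seconds=1, max_seconds=None):
    """Sleep for a random duration between min_seconds and max_seconds (assumed behaviour)."""
    if max_seconds is None:
        max_seconds = min_seconds * 2  # assumption: single-argument form picks its own upper bound
    duration = random.uniform(min_seconds, max_seconds)
    print('Waiting %.1f seconds...' % duration)
    time.sleep(duration)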