# -*- coding: utf-8 -*-
import re
import json
import requests
import MySQLdb as mdb
import MySQLdb.cursors
from datetime import datetime
from urlparse import urlparse
from base64 import standard_b64decode
from bs4 import BeautifulSoup
from django.http import JsonResponse

# `opener` (sketched below) fetches pages; `con`, the MySQL connection, and
# helpers such as get_title/get_latest_ones/get_series_id_in_database are
# provided elsewhere in the project.
base = 'http://www.1channel.ch'
TEST = False  # short-circuits the crawlers after one page for quick test runs

# Shared state filled by the *_thread workers.
(all_series, all_movies, tot) = ([], [], [])


def get_all_series_links_thread(tot):
    # Thread worker: collect the absolute link of every /watch-<id>- entry on
    # each page in `tot` into the shared module-level `all_series` list.
    counter = 0
    for url in tot:
        counter += 1
        if TEST and counter == 2:
            break
        data = opener.fetch(url)["data"]
        soup = BeautifulSoup(data, "lxml")
        reg = re.compile(r".*/watch-\d+-(.*)")
        for i in soup.find_all("a"):
            if not i.has_attr("href"):
                continue
            m = reg.match(i.get("href"))
            if m:
                series_link = "http://www.1channel.ch" + m.group(0)
                all_series.append(series_link)
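# Minimal sketch of the `opener` dependency assumed by every function below:
# an object whose fetch(url) returns a dict carrying the raw page body under
# 'data'. The class name, retry count and timeout are assumptions, not the
# project's actual implementation.
import requests

class Opener(object):
    def __init__(self, retries=3, timeout=30):
        self.session = requests.Session()
        self.retries = retries
        self.timeout = timeout

    def fetch(self, url):
        # Retry transient network failures, then surface the last error.
        last_error = None
        for _ in range(self.retries):
            try:
                response = self.session.get(url, timeout=self.timeout)
                response.raise_for_status()
                return {'data': response.text}
            except requests.RequestException as err:
                last_error = err
        raise last_error


opener = Opener()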
def generate_all_the_main_page_name():
    # Build the per-letter index URLs, then expand each into its paginated
    # URLs by reading the page number off the last .pagination link.
    l = []
    l.append('http://www.1channel.ch/?letter=123&tv')
    if TEST:
        return l
    for i in range(ord('a'), ord('z') + 1):
        l.append('http://www.1channel.ch/?letter=' + chr(i) + '&tv')
    all_pages = []
    for url in l:
        page_count = 1
        data = opener.fetch(url)['data']
        soup = BeautifulSoup(data, 'lxml')
        links = soup.select('.pagination > a')
        if len(links) != 0:
            m = re.compile(r'.*?page=(\d+)').match(links[-1]['href'])
            if m:
                page_count = int(m.group(1))
        for i in range(1, page_count + 1):
            all_pages.append(url + "=&page=" + str(i))
    return all_pages
def greetings(request):
    # Try to scrape a quote from ivyjoy.com; fall back to the
    # quotesondesign.com API if that fails.
    answer = 'this is purely random text'
    try:
        url = 'http://ivyjoy.com/quote.shtml'
        data = opener.fetch(url)['data']
        soup = BeautifulSoup(data, 'lxml')
        # The quote starts at a fixed text offset; the last token is the
        # attribution, so drop it.
        words = soup.text[1878:].split()
        answer = " ".join(words[:-1])
    except Exception:
        response = requests.get('http://quotesondesign.com/api/3.0/api-3.0.json')
        json_data = json.loads(response.text)
        answer = json_data['quote']
    answer = answer.replace('\r', ' ')
    answer = answer.replace('\n', ' ')
    answer = " ".join(answer.split())
    answer = 'Hello, Kitty! ' + answer
    return JsonResponse({"answer": answer})
def get_all_series_links(pages):
    # Single-threaded variant: collect (series name, absolute link) pairs
    # from every index page.
    (all_series, counter) = ([], 0)
    for url in pages:
        counter += 1
        if TEST and counter == 2:
            break
        data = opener.fetch(url)["data"]
        soup = BeautifulSoup(data, "lxml")
        reg = re.compile(r".*/watch-\d+-(.*)")
        for i in soup.find_all("a"):
            if not i.has_attr("href") or not i.has_attr("title"):
                continue
            m = reg.match(i.get("href"))
            if m:
                series_name = i.get("title")
                # Titles look like "Watch <name> Online"; strip the wrapper.
                if "Watch" in series_name:
                    series_name = series_name[6:-7].strip()
                series_link = "http://www.1channel.ch" + m.group(0)
                all_series.append((series_name, series_link))
    return all_series
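# Usage sketch for the two helpers above (hypothetical wiring, single
# threaded): enumerate every paginated letter-index URL, then pull the
# (name, link) pairs off each page. With TEST set, both short-circuit after
# a single page.
def crawl_all_series():
    pages = generate_all_the_main_page_name()
    series = get_all_series_links(pages)
    for series_name, series_link in series:
        print '%s -> %s' % (series_name, series_link)
    return series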
def get_all_movies_links_thread(tot):
    # Thread worker: append any movie link not already seen to the shared
    # module-level `all_movies` list.
    counter = 0
    for url in tot:
        counter += 1
        if TEST and counter == 2:
            break
        data = opener.fetch(url)['data']
        soup = BeautifulSoup(data, 'lxml')
        reg = re.compile(r'.*/watch-\d+-(.*)')
        for i in soup.find_all('a'):
            if not i.has_attr('href') or not i.has_attr('title'):
                continue
            m = reg.match(i.get('href'))
            if m:
                movie_link = "http://www.1channel.ch" + m.group(0)
                if movie_link not in all_movies:
                    all_movies.append(movie_link)
def i_have_got_movies_url((url, con)):
    # Scrape one movie page: release date, IMDB id and title, then store
    # every decoded external link under the movie's database id.
    data = opener.fetch(url)['data']
    soup = BeautifulSoup(data, 'lxml')
    released_date = datetime.today()
    try:
        l = soup.select('.movie_info > table')
        l = l[0].find_all('tr')
        l = l[1].find_all('td')[1].text
        released_date = datetime.strptime(l, '%B %d, %Y')
        if released_date.year < 1900:
            released_date = datetime.today()
    except Exception:
        pass
    imdb_id = "-1"
    try:
        imdb_link = soup.select('.mlink_imdb')[0].find_all('a')[0].get('href')
        if re.search(r'\d+', imdb_link):
            imdb_id = re.search(r'\d+', imdb_link).group(0)
    except Exception:
        pass
    name = ''
    try:
        a = soup.find_all(attrs={"property": "og:title"})
        name = a[0]['content']
    except Exception:
        pass
    if len(name) == 0:
        print 'name length is zero : %s Url: %s' % (name, url)
        return
    movie_id = get_movies_id_in_database(name, imdb_id, released_date, con)
    if not movie_id:
        return
    reg = re.compile(r'.*?url=(.+?)&domain.*')
    reg2 = re.compile(r'.*external\.php.*')
    for i in soup.find_all('a'):
        if not i.has_attr('href'):
            continue
        parsed = urlparse(i['href'])
        try:
            # Only external.php redirects carry links; the real target is
            # base64-encoded between url= and &domain in the query string.
            if not reg2.match(parsed[2]):
                continue
            m = reg.match(parsed[4])
            final_url = standard_b64decode(m.group(1))
            insert_into_links_table(movie_id, final_url, con)
        except Exception:
            pass
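# Hypothetical helper isolating the redirect decoding used above, with a
# worked example. The href shape is inferred from the regexes: an
# external.php link whose query string carries the base64-encoded target
# between 'url=' and '&domain'. The sample href is fabricated for
# illustration.
def decode_external_href(href):
    parsed = urlparse(href)
    if not re.match(r'.*external\.php.*', parsed[2]):
        return None
    m = re.match(r'.*?url=(.+?)&domain.*', parsed[4])
    if not m:
        return None
    return standard_b64decode(m.group(1))

# decode_external_href('/external.php?title=x&url=aHR0cDovL2V4YW1wbGUuY29tL3ZpZGVv&domain=eA==')
# -> 'http://example.com/video'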
def get_put_unique_eps(url='show.html'):
    # Scrape a show page: resolve (or create) its series id, then insert any
    # episode that is not already in vs_series_links.
    data = opener.fetch(url)['data']
    soup = BeautifulSoup(data, 'lxml')
    title = get_title(soup)
    if len(title) == 0:
        return
    released_date = datetime.today()
    try:
        l = soup.select('.movie_info > table')
        l = l[0].find_all('tr')
        l = l[1].find_all('td')[1].text
        released_date = datetime.strptime(l, '%B %d, %Y')
    except Exception:
        pass
    imdb_id = "-1"
    try:
        imdb_link = soup.select('.mlink_imdb')[0].find_all('a')[0].get('href')
        if re.search(r'\d+', imdb_link):
            imdb_id = re.search(r'\d+', imdb_link).group(0)
    except Exception:
        pass
    series_id = get_series_id_in_database(title, imdb_id, released_date, con)
    # All episode links except the transparent placeholder rows.
    all_eps = soup(attrs={'class': 'tv_episode_item'})
    all_eps[:] = [base + x('a')[0].get('href') if 'transp2' not in x['class'] else None
                  for x in all_eps]
    all_eps = list(set(all_eps))
    if None in all_eps:
        all_eps.remove(None)
    cur = con.cursor(mdb.cursors.DictCursor)
    for link in all_eps:
        matches = re.search(r'season-(\d+)-episode-(\d+)', link)
        season = int(matches.group(1))
        episode = int(matches.group(2))
        cur.execute("SELECT * FROM `vs_series_links` WHERE `series_id`=%s AND `season`=%s AND `episode`=%s",
                    (series_id, season, episode))
        if cur.fetchone():
            continue
        i_have_got_series_episode_url(title, series_id, link, season, episode, con)
def get_page_count_and_go_deeper(url):
    # Read the page count off the last pagination link, then visit each page.
    data = opener.fetch(url)['data']
    soup = BeautifulSoup(data, 'lxml')
    l = soup.select('.pagination > a')
    page_count = 1
    m = re.compile(r'.*?page=(\d+).*?').match(l[-1]['href'])
    if m:
        page_count = int(m.group(1))
    for i in range(1, page_count + 1):
        new_url = url + "&page=" + str(i)
        i_have_got_page_number(new_url)
def initiator():
    tv_url = 'http://www.1channel.ch/?tv'
    featured = 'http://www.1channel.ch/index.php?sort=featured'
    data = opener.fetch(tv_url)['data']
    soup = BeautifulSoup(data, 'lxml')
    # Insert the latest shows.
    latest_shows = get_latest_ones(soup)
    for show in latest_shows:
        get_put_unique_eps(show)
    # Prime-time episodes.
    eps = fetch_prime_time_episodes(soup)
    for epi in eps:
        put_prime_time_eps(epi)
    # Insert the latest movies.
    del data, soup
    data = opener.fetch(base)['data']
    soup = BeautifulSoup(data, 'lxml')
    latest_movies = get_latest_ones(soup)
    # Check the featured movies as well.
    del data, soup
    data = opener.fetch(featured)['data']
    soup = BeautifulSoup(data, 'lxml')
    featured_movs = get_latest_ones(soup)
    to_parse = set(latest_movies + featured_movs)
    for url in to_parse:
        i_have_got_movies_url((url, con))
def generate_main_pages_thread(url):
    # Thread worker: expand one letter-index URL into its paginated URLs and
    # append them to the shared module-level `tot` list.
    page_count = 1
    data = opener.fetch(url)['data']
    soup = BeautifulSoup(data, 'lxml')
    l = soup.select('.pagination > a')
    if len(l) != 0:
        m = re.compile(r'.*?page=(\d+)').match(l[-1]['href'])
        if m:
            page_count = int(m.group(1))
    for i in range(1, page_count + 1):
        tot.append(url + "=&page=" + str(i))
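# One way the *_thread workers might be wired up (the thread-per-letter and
# four-way chunking below are assumptions): each letter index page is
# expanded into `tot` concurrently, then the collected page URLs are split
# into chunks that are scraped into `all_series`. list.append is atomic
# under CPython's GIL, which is what makes the bare shared lists workable;
# a Queue would be the more defensive choice.
import threading

def crawl_series_threaded(chunk_count=4):
    letter_urls = ['http://www.1channel.ch/?letter=123&tv']
    letter_urls += ['http://www.1channel.ch/?letter=%s&tv' % chr(c)
                    for c in range(ord('a'), ord('z') + 1)]
    threads = [threading.Thread(target=generate_main_pages_thread, args=(u,))
               for u in letter_urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    chunks = [tot[i::chunk_count] for i in range(chunk_count)]
    threads = [threading.Thread(target=get_all_series_links_thread, args=(c,))
               for c in chunks]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return all_series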
def put_prime_time_eps(link):
    # Insert a single prime-time episode; if the show itself is unknown yet,
    # scrape its full episode list instead.
    data = opener.fetch(link)['data']
    soup = BeautifulSoup(data, 'lxml')
    title = get_title(soup)
    if len(title) == 0:
        return
    series_id = get_series_id(title)
    if not series_id:
        series_link = base + soup(attrs={'class': 'titles'})[1]('a')[0]['href']
        get_put_unique_eps(series_link)
        return
    matches = re.search(r'season-(\d+)-episode-(\d+)', link)
    season = int(matches.group(1))
    episode = int(matches.group(2))
    i_have_got_series_episode_url(title, series_id, link, season, episode, con)
def i_have_got_series_link(url, con):
    # Scrape a series page: resolve the series id, then store every
    # non-placeholder episode it lists.
    data = opener.fetch(url)["data"]
    soup = BeautifulSoup(data, "lxml")
    name = get_title(soup)
    if len(name) == 0:
        return
    released_date = datetime.today()
    try:
        l = soup.select(".movie_info > table")
        l = l[0].find_all("tr")
        l = l[1].find_all("td")[1].text
        released_date = datetime.strptime(l, "%B %d, %Y")
    except Exception:
        pass
    imdb_id = "-1"
    try:
        imdb_link = soup.select(".mlink_imdb")[0].find_all("a")[0].get("href")
        if re.search(r"\d+", imdb_link):
            imdb_id = re.search(r"\d+", imdb_link).group(0)
    except Exception:
        pass
    series_id_in_database = get_series_id_in_database(name, imdb_id, released_date, con)
    # All episode links except the transparent placeholder rows.
    all_eps = soup(attrs={"class": "tv_episode_item"})
    all_eps[:] = [base + x("a")[0].get("href") if "transp2" not in x["class"] else None
                  for x in all_eps]
    all_eps = list(set(all_eps))
    if None in all_eps:
        all_eps.remove(None)
    for ep_link in all_eps:
        matches = re.search(r"season-(\d+)-episode-(\d+)", ep_link)
        season = int(matches.group(1))
        episode = int(matches.group(2))
        i_have_got_series_episode_url(name, series_id_in_database, ep_link, season, episode, con)
def i_have_got_series_name((url, name, con)):
    # Scrape a named series page and store every episode link found on it.
    data = opener.fetch(url)['data']
    soup = BeautifulSoup(data, 'lxml')
    released_date = datetime.today()
    try:
        l = soup.select('.movie_info > table')
        l = l[0].find_all('tr')
        l = l[1].find_all('td')[1].text
        released_date = datetime.strptime(l, '%B %d, %Y')
    except Exception:
        pass
    imdb_id = "-1"
    try:
        imdb_link = soup.select('.mlink_imdb')[0].find_all('a')[0].get('href')
        if re.search(r'\d+', imdb_link):
            imdb_id = re.search(r'\d+', imdb_link).group(0)
    except Exception:
        pass
    series_id_in_database = get_series_id_in_database(name, imdb_id, released_date, con)
    # Episode hrefs look like /tv-<id>-<name>/season-S-episode-E, so build
    # the pattern from this series' own watch URL.
    t1 = url.replace('http://www.1channel.ch/watch', 'tv')
    reg = re.compile('/' + t1 + r'/season-(\d+)-episode-(\d+).*')
    for i in soup.find_all('a'):
        if not i.has_attr('href'):
            continue
        m = reg.match(i.get('href'))
        if m:
            episode_link = "http://www.1channel.ch" + m.group(0)
            season = int(m.group(1))
            episode = int(m.group(2))
            i_have_got_series_episode_url(name, series_id_in_database, episode_link, season, episode, con)
def i_have_got_page_number(url):
    # Scrape one index page and hand every (link, name) pair on to
    # i_have_got_series_name.
    data = opener.fetch(url)['data']
    soup = BeautifulSoup(data, 'lxml')
    reg = re.compile(r'.*/watch-\d+-(.*)')
    for i in soup.find_all('a'):
        if not i.has_attr('href') or not i.has_attr('title'):
            continue
        m = reg.match(i.get('href'))
        if m:
            series_name = i.get('title')
            # Titles look like "Watch <name> Online"; strip the wrapper.
            if 'Watch' in series_name:
                series_name = series_name[6:-7].strip()
            series_link = "http://www.1channel.ch" + m.group(0)
            i_have_got_series_name((series_link, series_name, con))
def i_have_got_series_episode_url(name, series_id, url, season, episode, con):
    # Store every decoded external link found on one episode page.
    data = opener.fetch(url)["data"]
    soup = BeautifulSoup(data, "lxml")
    reg = re.compile(r".*?url=(.+?)&domain.*")
    reg2 = re.compile(r".*external\.php.*")
    for i in soup.find_all("a"):
        if not i.has_attr("href"):
            continue
        parsed = urlparse(i["href"])
        try:
            # Only external.php redirects carry links; the real target is
            # base64-encoded between url= and &domain in the query string.
            if not reg2.match(parsed[2]):
                continue
            m = reg.match(parsed[4])
            final_url = standard_b64decode(m.group(1))
            set_season_episode(series_id, final_url, season, episode, con)
        except Exception:
            pass
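# Hypothetical entry point: `opener` is sketched near the top of the module,
# and `con` must be a live MySQL connection before initiator() touches the
# database helpers. The credentials below are placeholders, not real values.
if __name__ == '__main__':
    con = mdb.connect(host='localhost', user='user', passwd='password', db='videos')
    initiator()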