def __init__(self):
    """Create one TaoppDt, NuomiDt and TimeDt object and keep them on self.

    The instances are stored as ``self.tpp``, ``self.nm`` and ``self.ti``
    respectively, constructed in that order.
    """
    self.tpp = TaoppDt()
    self.nm = NuomiDt()
    self.ti = TimeDt()
class Migration:
    """Merge cinema records from three extractors into the ``CinemaUrl`` model.

    The Taopiaopiao rows (``TaoppDt``) are loaded first and serve as the
    baseline. Rows from Nuomi (``NuomiDt``) and Mtime (``TimeDt``) are then
    matched against existing rows with ``_query_url``; a match gets its URL
    column filled in, a miss is inserted as a new row with ``code=0``.

    Every extractor is expected to yield indexable records laid out as
    ``(code, city, district, cinema_name, url, location)`` -- this layout is
    taken from how the records are indexed below.
    """

    # Administrative suffixes, checked in this priority order.
    _DISTRICT_SUFFIXES = ('市', '县', '区', '镇')
    # A name or location similarity above this is accepted immediately.
    _STRONG_MATCH = 0.7
    # Minimum combined (name * location) similarity for a fallback match.
    _MIN_COMBINED = 0.2

    def __init__(self):
        self.tpp = TaoppDt()
        self.nm = NuomiDt()
        self.ti = TimeDt()

    def ini_tpp(self):
        """Insert every Taopiaopiao record as-is.

        Taopiaopiao data is the baseline for later comparisons, so it is
        stored without any processing.
        """
        for item in self.tpp.extract():
            CinemaUrl.objects.create(
                city=item[1],
                district=item[2],
                location=item[5],
                cinema_name=item[3],
                taopp_url=item[4],
                code=item[0],
            )

    @classmethod
    def _normalize_district(cls, position, city):
        """Return ``position`` truncated after its first administrative suffix.

        The suffixes in ``_DISTRICT_SUFFIXES`` are tried in order and the
        first one contained in ``position`` wins; everything after its first
        occurrence is dropped. If the result is a single character (for
        example a bare suffix), ``city`` is prepended. Empty or ``None``
        input is returned unchanged.
        """
        if not position:
            return position
        district = position
        for suffix in cls._DISTRICT_SUFFIXES:
            if suffix in position:
                # Split on the suffix that actually matched (the town branch
                # used to split on the district character by mistake).
                district = position.partition(suffix)[0] + suffix
                break
        if len(district) == 1:
            district = city + district
        return district

    def _optimize_district(self):
        """Normalize the ``district`` field of every ``CinemaUrl`` row."""
        for row in CinemaUrl.objects.all():
            row.district = self._normalize_district(row.district, row.city)
            row.save()

    def _merge_source(self, records, url_field):
        """Merge ``records`` into ``CinemaUrl``, writing URLs to ``url_field``.

        For each record, an existing row found by ``_query_url`` has its
        ``url_field`` attribute set and is saved; otherwise a new row is
        created with ``code=0``. Running totals are printed after every
        record.

        Returns:
            A ``(updated, created)`` tuple of counts.
        """
        updated = 0
        created = 0
        for item in records:
            city, district, cinema_name, url, location = (
                item[1], item[2], item[3], item[4], item[5])
            match = self._query_url(city, district, location, cinema_name)
            if match is not None:
                setattr(match, url_field, url)
                match.save()
                updated += 1
            else:
                CinemaUrl.objects.create(
                    city=city,
                    district=district,
                    location=location,
                    cinema_name=cinema_name,
                    code=0,
                    **{url_field: url}
                )
                created += 1
            print(updated, created)
        return updated, created

    def _add_nuomi(self):
        """Merge the Nuomi table into the Django model (``nuomi_url``)."""
        self._merge_source(self.nm.extract(), 'nuomi_url')

    def _query_url(self, city, district, location, cinema_name):
        """Find the ``CinemaUrl`` row that best matches the given cinema.

        Matching is tiered:

        1. Restrict to rows whose ``city`` contains ``city``.
        2. If exactly one of those rows has a ``cinema_name`` containing
           ``cinema_name``, return it.
        3. Otherwise scan the city's rows in order and return the first one
           whose name similarity or location similarity exceeds
           ``_STRONG_MATCH``.
        4. Failing that, return the row with the highest
           ``name_similarity * location_similarity`` (first one on ties),
           provided that product is at least ``_MIN_COMBINED``.

        ``district`` is accepted but not used for matching.

        Returns:
            The matching model instance, or ``None`` when nothing qualifies.
        """
        candidates = CinemaUrl.objects.filter(city__contains=city)
        if not candidates:
            return None

        # Tier 1: direct comparison of the cinema name.
        by_name = candidates.filter(cinema_name__contains=cinema_name)
        if len(by_name) == 1:
            return by_name[0]

        # Tier 2: fuzzy comparison of the cinema name and location.
        best = None
        best_score = -1.0
        for candidate in candidates:
            name_rate = Levenshtein.ratio(candidate.cinema_name, cinema_name)
            if name_rate > self._STRONG_MATCH:
                return candidate
            location_rate = Levenshtein.ratio(candidate.location, location)
            if location_rate > self._STRONG_MATCH:
                return candidate
            score = name_rate * location_rate
            # Strict comparison keeps the earliest candidate on ties.
            if score > best_score:
                best, best_score = candidate, score

        if best_score < self._MIN_COMBINED:
            return None
        return best

    def _time(self):
        """Merge the Mtime data into the Django model (``time_url``)."""
        self._merge_source(self.ti.extract(), 'time_url')

    # If this ever gets executed in the production environment,
    # you had better resign and run.
    def _delete(self):
        """Delete every ``CinemaUrl`` row."""
        CinemaUrl.objects.all().delete()

    def _test(self):
        """Print how many rows have a ``city`` containing '随州'."""
        query = CinemaUrl.objects.filter(city__contains='随州')
        print(len(query))

    def _show_all(self):
        query = CinemaUrl.objects.all()
        # Optional narrowing filters kept for reference:
        #   .filter(time_url__startswith='http')
        #   .filter(nuomi_url__startswith='http')
        #   .filter(taopp_url__startswith='http')
        #   .filter(city__contains='上海')
        print(len(query))
        city_lst = []