def goubanjia(self):
    """
    :-1: html tags mixed with invalid data
    :100: and the most important thing is that the port is written in 'class' rather than in text.
    The website is difficult to spider, but the proxies are very good.

    goubanjia proxy http://www.goubanjia.com
    """
    version = begin_time()
    host = 'http://www.goubanjia.com'
    html = self.get_request_proxy(host, 0)
    if not html:
        return []
    trs = html.find_all('tr', class_=['warning', 'success'])
    for tr in trs:
        tds = tr.find_all('td')
        ip = tds[2].find_all('a')[0].text + '://'
        # ``not 'p'`` evaluates to False, so in effect only div/span names can match,
        # and ``class_=not 'port'`` (i.e. class_=False) keeps tags without a class
        # attribute, skipping the obfuscated 'port' decoys.
        iplist = tds[0].find_all(['div', 'span', not 'p'], class_=not 'port')
        for index in iplist:
            ip += index.text
        encode = tds[0].find_all(['div', 'span', 'p'], class_='port')[0]['class'][1]
        uncode = functools.reduce(
            lambda x, y: x * 10 + (ord(y) - ord('A')),
            map(lambda x: x, encode), 0)
        self.waitjudge.append(ip + ':' + str(int(uncode / 8)))
    self.threadjude()
    end_time(version)
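# The port decoding above can be verified standalone. A minimal sketch, assuming a
# hypothetical class attribute such as ['port', 'GEA']: each capital letter encodes
# a digit (its offset from 'A'), the digits are concatenated in base 10, and the
# result divided by 8 is the real port.
import functools

def decode_goubanjia_port(encode: str) -> int:
    """Decode goubanjia's obfuscated port, e.g. class="port GEA" -> 80."""
    uncode = functools.reduce(lambda x, y: x * 10 + (ord(y) - ord('A')), encode, 0)
    return uncode // 8

print(decode_goubanjia_port('GEA'))    # G,E,A -> 6,4,0 -> 640 -> 80
print(decode_goubanjia_port('GEGEA'))  # 64640 -> 8080 (illustrative values)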
def get_summarization(self):
    """ get summarization from https://www.google.com.hk/search?q=%E6%AF%92%E7%8B%97%E8%82%89&newwindow=1&safe=strict&tbm=nws&ei=FK1KXJ3EJbWx0PEPytmq2AI&start=0&sa=N&ved=0ahUKEwidnv-7p4jgAhW1GDQIHcqsCis4ChDy0wMIRw&biw=1627&bih=427&dpr=2 """
    version = begin_time()
    threadings = []
    for index in range(25):
        work = threading.Thread(target=self.summarization_once, args=(index, ))
        threadings.append(work)
    for work in threadings:
        time.sleep(1)
        work.start()
    for work in threadings:
        work.join()
    summarizations = [
        self.summarizations[k] for k in sorted(self.summarizations.keys())
    ]
    self.summarizations = sum(summarizations, [])
    hrefs = [self.hrefs[k] for k in sorted(self.hrefs.keys())]
    self.hrefs = sum(hrefs, [])
    with codecs.open('google_steal.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(self.summarizations))
    with codecs.open('google_steal_href.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(self.hrefs))
    end_time(version)
def get_playlist_id_thread(self):
    """ get playlist ids with threading """
    begin_time()
    if not len(self.classifylist):
        self.get_classify()
    for index in self.classifylist:
        threadings = []
        for offset in range(41):
            work = threading.Thread(target=self.get_playlist_id,
                                    args=(index, offset * 35, ))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.proxyclass.cleancannotuse()
        print(len(self.playlists))
        self.test_queue(index)
        self.playlists = []
        print(index + " Over")
    end_time()
def get_song_detail_thread(self):
    """ get song details with threading """
    begin_time()
    for classify in self.classifylist:
        ids = self.get_list_ids(classify)
        threadings = []
        for oneid in ids:
            work = threading.Thread(target=self.get_song_detail, args=(oneid[1], ))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.clean_data()
        self.test_song(classify, ids)
        self.songlist = []
        self.songmap = {}
        self.finishlist = []
        self.successtime = 0
        print(classify + ' Over!')
    end_time()
def load_data(self, floor=10, upper_ptr=1e-5):
    version = begin_time()
    with open(self.origin_file_name, 'r') as f:
        origin_text = f.read()
    text_list = preprocess(origin_text, floor)
    word_counts = Counter(text_list)
    vocab_list = sorted(word_counts, key=word_counts.get, reverse=True)
    word2id = {word: ii for ii, word in enumerate(vocab_list)}
    id2word = {ii: word for ii, word in enumerate(vocab_list)}
    id_list = [word2id[word] for word in text_list]
    word_counts = Counter(id_list)
    count_len = len(id_list)
    p_drop = {word: (1 - np.sqrt(upper_ptr * count_len / count))
              for word, count in word_counts.items()}
    train_list = [ww for ww in id_list if p_drop[ww] < np.random.random()]
    print("Total words: {}".format(len(train_list)))
    print("Unique words: {}".format(len(set(train_list))))
    self.word2id = word2id
    self.id2word = id2word
    self.train_list = train_list
    end_time(version)
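# The p_drop expression in load_data is the word2vec-style subsampling heuristic:
# with t = upper_ptr and N tokens, a word of count c is dropped with probability
# 1 - sqrt(t * N / c). A minimal sketch with made-up counts:
import numpy as np

counts, total, t = {'the': 50000, 'cat': 120, 'sat': 10}, 1_000_000, 1e-5
p_drop = {w: 1 - np.sqrt(t * total / c) for w, c in counts.items()}
# Frequent words are dropped aggressively, words at or below frequency t never are:
# {'the': ~0.986, 'cat': ~0.711, 'sat': 0.0}
kept = [w for w in counts if p_drop[w] < np.random.random()]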
def load_goods(self):
    """ load goods """
    version = begin_time()
    if not os.path.exists('%scookie' % data_dir):
        print('Youdao Note cookie not exist!!!')
        return
    with codecs.open('%scookie' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readline()
    changeCookie(cookie[:-1])
    threadings = []
    for index, tid in enumerate(self.request_list):
        work = threading.Thread(target=self.load_goods_once, args=(index, tid, ))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    goods = [self.goods[k] for k in sorted(self.goods.keys())]
    goods = sum(goods, [])
    with codecs.open('%sgoods' % data_dir, 'w', encoding='utf-8') as f:
        f.write("\n".join(goods))
    end_time(version)
def get_summarization(self):
    """ get summarization from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1 """
    version = begin_time()
    threadings = []
    for index in range(75):
        work = threading.Thread(target=self.summarization_once, args=(index, ))
        threadings.append(work)
    for work in threadings:
        # time.sleep(.5)
        work.start()
    for work in threadings:
        work.join()
    # self.text_map = self.total_map[0]
    # for index in list(range(1, len(self.total_map))):
    #     for ids in self.total_map[index]:
    #         if ids in self.text_map:
    #             self.text_map[ids] += self.total_map[index][ids]
    #         else:
    #             self.text_map[ids] = self.total_map[index][ids]
    # print(sum(self.text_map))
    word = [self.word[k] for k in sorted(self.word.keys())]
    with codecs.open('test', 'w', encoding='utf-8') as f:
        f.write("\n".join(word))
    end_time(version)
def get_detail(self):
    """ get details of the pages listed in bjh_href_poison.txt (hrefs gathered from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1) """
    version = begin_time()
    threadings = []
    with codecs.open('bjh_href_poison.txt', 'r', encoding='utf-8') as f:
        href_list = f.readlines()
    for index, url in enumerate(href_list):
        work = threading.Thread(target=self.detail_once, args=(index, url, ))
        threadings.append(work)
    for work in threadings:
        # time.sleep(.5)
        work.start()
    for work in threadings:
        work.join()
    word_list = [self.word_list[k] for k in sorted(self.word_list.keys())]
    with codecs.open('bjh_detail_poison', 'w', encoding='utf-8') as f:
        f.write("\n".join(word_list))
    self.failuredmap = {}
    with codecs.open('bjh.log', 'w', encoding='utf-8') as f:
        f.write('\n'.join(self.fail))
    self.fail = []
    end_time(version)
def load_collect(self, page):
    """ load collect """
    version = begin_time()
    if not os.path.exists('%scookie_collect' % data_dir):
        print('TB cookie not exist!!!')
        return
    with codecs.open('%scookie_collect' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readline()
    changeCookie(cookie[:-1])
    changeHtmlTimeout(30)
    for block in range(page // 10 + 1):
        begin = block * 10
        end = min(page, (block + 1) * 10)
        threadings = []
        for index in range(begin, end):
            work = threading.Thread(target=self.load_collect_once, args=(index, ))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
    collect = [self.collect[k] for k in sorted(self.collect.keys())]
    collect = sum(collect, [])
    with codecs.open('%scollect_wyy' % data_dir, 'w', encoding='utf-8') as f:
        f.write("\n".join(collect))
    end_time(version)
def get_summarization(self):
    """ get summarization from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1 """
    version = begin_time()
    threadings = []
    for index in range(30):
        work = threading.Thread(target=self.summarization_once, args=(index, ))
        threadings.append(work)
    for work in threadings:
        # time.sleep(.5)
        work.start()
    for work in threadings:
        work.join()
    summarizations = [
        self.summarizations[k] for k in sorted(self.summarizations.keys())
    ]
    self.summarizations = sum(summarizations, [])
    with codecs.open('news_posion.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(self.summarizations))
    end_time(version)
def search_goods(self):
    version = begin_time()
    if not os.path.exists('%swait' % data_dir):
        print('wait file not exist!!!')
        return
    with codecs.open('%swait' % data_dir, 'r', encoding='utf-8') as f:
        wait = f.readlines()
    threadings = []
    for index, goods_name in enumerate(wait):
        work = threading.Thread(target=self.search_goods_once,
                                args=(goods_name[:-1], index, ))
        threadings.append(work)
    for work in threadings:
        work.start()
        time.sleep(random.randint(5, 9))
    for work in threadings:
        work.join()
    goods_name = [
        self.goods_name[k] for k in sorted(self.goods_name.keys())
    ]
    with codecs.open('%swait_goods' % data_dir, 'w', encoding='utf-8') as f:
        f.write('\n'.join(goods_name))
    end_time(version)
def get_classify(self):
    """ get classify from /discover/playlist """
    begin_time()
    self.classifylist = {}
    host = 'https://music.163.com/discover/playlist'
    html = self.proxyclass.get_request_proxy(host, host[8:21], 0)
    if not html:
        print('Empty')
        self.proxyclass.cleancannotuse()
        if self.can_retry(host):
            self.get_classify()
        return []
    alist = html.find_all('a', class_='s-fc1')
    if not len(alist):
        if self.can_retry(host):
            self.get_classify()
        print(html)
    for index in alist:
        self.classifylist[index.text] = index['href']
    self.proxyclass.cleancannotuse()
    end_time()
def origin_sample_master(self, input_file, output1_file, output2_file,
                         block_size=100000, valnum=10000):
    """ the master of multi-threading for getting origin samples """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        self.origin_sample = f.readlines()
    threadings = []
    num = len(self.origin_sample)
    start = 0
    end = min(block_size, num - 1)
    for block in range(int(num / block_size) + 1):
        while self.origin_sample[end] != '\r\n' and end < num - 1:
            end += 1
        work = threading.Thread(target=self.origin_sample_agent,
                                args=(start, end, block, ))
        threadings.append(work)
        start = end + 1
        end = min(num - 1, block_size * (block + 1))
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    content = [self.content[k] for k in sorted(self.content.keys())]
    self.content = sum(content, [])
    response = [self.response[k] for k in sorted(self.response.keys())]
    self.response = sum(response, [])
    # pre = [self.pre[k] for k in sorted(self.pre.keys())]
    # self.pre = sum(pre, [])
    totalnum = len(self.response)
    for index in range(len(self.content)):
        context = self.content[index]
        if index <= valnum:
            self.dev.append("1#" + context + self.response[index])
        else:
            self.train.append("1#" + context + self.response[index])
        otherindexs = np.random.randint(0, totalnum, 2)
        for otherindex in otherindexs:
            while otherindex == index:
                otherindex = np.random.randint(0, totalnum, 1)[0]
            if index <= valnum:
                self.dev.append("0#" + context + self.response[otherindex])
            else:
                self.train.append("0#" + context + self.response[otherindex])
    pickle.dump(self.train, open(output1_file, "wb"))
    pickle.dump(self.dev, open(output2_file, "wb"))
    end_time(version)
def preWord2vec(input_file, output_file):
    """ word bag construction """
    version = begin_time()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    sentences = GetWords(input_file)
    model = Word2Vec(sentences, workers=100, min_count=5, size=200)
    model.save(output_file)
    end_time(version)
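# preWord2vec delegates training to gensim's Word2Vec (the size=200 keyword suggests
# a pre-4.0 gensim; newer versions call it vector_size). A minimal sketch of how the
# saved model might be queried later, with a hypothetical path and token:
from gensim.models import Word2Vec

model = Word2Vec.load('Model_retrieval/word2vec.model')  # hypothetical output_file
token = 'some_word'                                      # must appear >= min_count times
if token in model.wv:
    print(model.wv[token].shape)                 # (200,) embedding vector
    print(model.wv.most_similar(token, topn=5))  # nearest neighbours by cosine similarity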
def pre_data_list(self, do_pre):
    version = begin_time()
    if do_pre == True:
        self.load_all(0)
        self.load_all(1)
    elif do_pre == 2:
        self.load_all_pickle(0)
        self.load_all_pickle(1)
    else:
        self.load_basic(1)
    end_time(version)
def participles_word(self):
    """ word segmentation (participles) with pkuseg """
    version = begin_time()
    for file in self.filelists:
        pkuseg.test(file, file[:-4] + '_pkuseg.txt',
                    model_name='../Model_retrieval/pkuseg', nthread=20)
    end_time(version)
def calculate_result(self, input_file, output_file, block_size=10):
    """ calculate result """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        with codecs.open(output_file, 'w') as outf:
            results = f.readlines()
            for index in range(int(len(results) / block_size)):
                # compare scores numerically rather than as raw text lines
                pre = [float(ii) for ii in
                       results[index * block_size:(index + 1) * block_size]]
                temp_index = np.array(pre).argmax()
                outf.write(str(temp_index) + '\n')
    end_time(version)
def embedding_test_master(self, input_file, embedding_file, block_size=10000):
    """ the master of multi-threading for testing with the embedding model """
    version = begin_time()
    self.word2vec = load_bigger(embedding_file)
    self.origin_sample = load_bigger(input_file)
    threadings = queue.Queue()
    waitthreadings = queue.Queue()
    num = len(self.origin_sample)
    start = 0
    end = min(block_size, num - 1)
    for block in range(int(num / block_size) + 1):
        work = threading.Thread(target=self.embedding_test_agent,
                                args=(start, end, block, ))
        threadings.put(work)
        start = end + 1
        end = min(num - 1, block_size * (block + 2))
    while not threadings.empty():
        tempwork = threadings.get()
        tempwork.start()
        waitthreadings.put(tempwork)
    while not waitthreadings.empty():
        waitthreadings.get().join()
    result = [self.wordresult[k] for k in sorted(self.wordresult.keys())]
    results = sum(result, [])
    totalnum = int(len(results))
    correctnum = 0
    top3num = 0
    block_sizes = 10
    for index in range(int(totalnum / block_sizes)):
        pre = results[index * block_sizes:(index + 1) * block_sizes]
        temp_index = np.array(pre).argmax()
        top3 = np.array(pre).argsort()[-3:][::-1]
        if not temp_index:
            correctnum += 1
        if 0 in top3:
            top3num += 1
    print(correctnum, top3num, int(totalnum / block_sizes),
          spend_time(version),
          str(correctnum / int(totalnum / block_sizes))[:5],
          str(top3num / int(totalnum / block_sizes))[:5])
    end_time(version)
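# The metric loop at the end of embedding_test_master is a recall@k evaluation:
# scores come in blocks of 10 candidates per query with the ground-truth response
# at index 0, so R@1 counts blocks whose argmax is 0 and R@3 counts blocks whose
# top-3 contains 0. A self-contained sketch of the same computation (toy scores):
import numpy as np

def recall_at_k(scores, k, group=10):
    """Fraction of groups whose true candidate (index 0) ranks in the top k."""
    hits, total = 0, len(scores) // group
    for i in range(total):
        block = np.asarray(scores[i * group:(i + 1) * group])
        if 0 in block.argsort()[-k:]:
            hits += 1
    return hits / total

# two hypothetical queries, 10 candidate scores each, ground truth first
toy = [0.9, 0.1, 0.2, 0.3, 0.1, 0.2, 0.1, 0.0, 0.4, 0.2,
       0.2, 0.8, 0.7, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
print(recall_at_k(toy, 1), recall_at_k(toy, 3))  # 0.5 1.0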
def origin_sample_master(self, input_file, output_file='SMN/data/bert/train.pkl',
                         block_size=900000, small_size=200000):
    """ the master of multi-threading for getting origin samples """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        self.origin_sample = f.readlines()
    threadings = []
    num = len(self.origin_sample)
    start = 0
    end = min(block_size, num - 1)
    for block in range(int(num / block_size) + 1):
        while self.origin_sample[end] != '\r\n' and end < num - 1:
            end += 1
        work = threading.Thread(target=self.origin_sample_agent,
                                args=(start, end, block, ))
        threadings.append(work)
        start = end + 1
        end = min(num - 1, block_size * (block + 1))
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    response = sum(list(self.response.values()), [])
    content = sum(list(self.content.values()), [])
    totalnum = len(response)
    print(totalnum)
    randomIndexs = unique_randomint(0, totalnum, small_size)
    # otherIndexs = unique_randomint(
    #     0, totalnum, small_size * 2, randomIndexs)
    r = []
    for index in randomIndexs:
        r.append('1#' + content[index] + '#' + response[index])
        r.append('0#' + content[index] + '#' +
                 response[unique_randomint(0, totalnum, 1, [index])[0]])
        r.append('0#' + content[index] + '#' +
                 response[unique_randomint(0, totalnum, 1, [index])[0]])
    pickle.dump(r, open(output_file, "wb"))
    end_time(version)
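# Each element origin_sample_master pickles is a plain 'label#context#response' string:
# one positive and two randomly drawn negatives per sampled index (unique_randomint is
# assumed to return indices distinct from the excluded ones). A minimal sketch with
# made-up utterances:
content = ['你 好#在 吗', '明天 有 空 吗']     # '#'-joined context turns (hypothetical)
response = ['在 的', '有 的']

lines = [
    '1#' + content[0] + '#' + response[0],  # positive pair
    '0#' + content[0] + '#' + response[1],  # negative: randomly drawn response
    '0#' + content[0] + '#' + response[1],  # second negative (repeats in a toy corpus)
]
# -> ['1#你 好#在 吗#在 的', '0#你 好#在 吗#有 的', '0#你 好#在 吗#有 的']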
def origin_test_master(self, input_file, output_file,
                       block_size=100000, test_size=2000):
    """ the master of multi-threading for getting origin test samples """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        self.origin_sample = f.readlines()
    threadings = []
    num = len(self.origin_sample)
    start = 0
    end = min(block_size, num - 1)
    for block in range(int(num / block_size) + 1):
        while self.origin_sample[end] != '\r\n' and end < num - 1:
            end += 1
        work = threading.Thread(target=self.origin_sample_agent,
                                args=(start, end, block, ))
        threadings.append(work)
        start = end + 1
        end = min(num - 1, block_size * (block + 1))
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    content = [self.content[k] for k in sorted(self.content.keys())]
    self.content = sum(content, [])
    response = [self.response[k] for k in sorted(self.response.keys())]
    self.response = sum(response, [])
    totalnum = len(self.content)
    randomlists = np.random.randint(0, totalnum, test_size)
    for index in randomlists:
        temp_context = self.content[index][:-1].replace('\n', '[SEP]') + '#'
        self.test.append("1#" + temp_context + self.response[index][:-1])
        otherindexs = np.random.randint(0, totalnum, 9)
        for otherindex in otherindexs:
            while otherindex == index:
                otherindex = np.random.randint(0, totalnum, 1)[0]
            self.test.append("0#" + temp_context + self.response[otherindex][:-1])
    pickle.dump(self.test, open(output_file, 'wb'))
    end_time(version)
def press_threading(self, url, host, qps, types):
    """ press url at constant qps """
    begin_time()
    threadings = []
    for index in range(qps):
        work = threading.Thread(target=self.basic_press,
                                args=(url, host, 0, types))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    end_time()
def have_places(self):
    """ brush class: poll until the course has open seats """
    version = begin_time()
    have_places = False
    while not have_places:
        if self.have_places_once():
            send_email('大数据专题', '大数据专题 有名额啦 有名额啦')
            send_email('大数据专题', '大数据专题 有名额啦 有名额啦')
            send_email('大数据专题', '大数据专题 有名额啦 有名额啦')
            have_places = True
        time.sleep(random.randint(10, 20))
    end_time(version)
def testdb(self):
    ''' test whether the proxies in the db are usable '''
    begin_time()
    results = self.Db.select_db(self.select_all)
    if results != 0:
        for index in results:
            self.waitjudge.append(index[0])
        self.threadjude()
    else:
        pass
    self.initproxy()
    end_time()
def vsmCalculate(self):
    """ calculate vsm """
    #: todo write block
    version = begin_time()
    threadings = []
    for index1 in range(self.articleNum):
        work = threading.Thread(target=self.vsmThread, args=(index1, ))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    end_time(version)
def dump_test_make(pre_file='SMN/data/smn_test.pkl',
                   result_file='SMN/data/result_test.txt',
                   max_word_per_utterence=50,
                   output_file='SMN/data/datasets_test.pkl'):
    """ make and dump the test datasets """
    version = begin_time()
    pre = pickle.load(open(pre_file, "rb"))
    revs, wordvecs, max_l2 = pre[0], pre[1], pre[2]
    datasets = make_data(revs, wordvecs.word_idx_map,
                         max_l=max_word_per_utterence)
    dump_bigger(datasets, output_file)
    end_time(version)
def parse_detail(self):
    ''' parse hotel detail '''
    version = begin_time()
    text = self.get_hotel_detail()
    html = BeautifulSoup(text['html'], 'html.parser')
    trs = html.findAll('tr')[2:]
    hotel_detail = []
    for tr in trs:
        room_name = re.findall('baseroomname="(.*?)"', str(tr))
        if not len(room_name):
            room_name = re.findall('rel="nofollow">\n(.*?)\n', str(tr))
        room_name = room_name[0].strip() if len(room_name) else hotel_detail[-1][0]
        price = re.findall(r'</dfn>(\d{4,5}?)</span>', str(tr))
        if not len(price):
            continue
        else:
            price = price[0]
        price_type = re.findall('room_type_name">(.*?)</span>', str(tr))[0]
        if 'em' in price_type:
            price_type = ','.join([
                *re.findall('(.*?)<em', price_type),
                *re.findall(r'\((.*?)\)', price_type)
            ])
        hotel_detail.append([room_name, price_type, price])
    output_dir = '{}hotelDetail.txt'.format(data_dir)
    with open(output_dir, 'w') as f:
        f.write('\n'.join([','.join(ii) for ii in hotel_detail]))
    echo(1, 'Load {} price\nOutput path: {}\nSpend time: {:.2f}s'.format(
        len(hotel_detail), output_dir, end_time(version, 0)))
    return hotel_detail
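# The regexes in parse_detail can be sanity-checked against a toy row. A minimal
# sketch, with a made-up <tr> fragment that only mimics the markup the patterns
# expect (not the site's real HTML):
import re

tr = ('<tr><td baseroomname="Deluxe King Room"></td>'
      '<td><span class="room_type_name">Breakfast included</span></td>'
      '<td><span><dfn>¥</dfn>1088</span></td></tr>')

room_name = re.findall('baseroomname="(.*?)"', tr)[0]           # 'Deluxe King Room'
price = re.findall(r'</dfn>(\d{4,5}?)</span>', tr)[0]           # '1088'
price_type = re.findall('room_type_name">(.*?)</span>', tr)[0]  # 'Breakfast included'
print(room_name, price_type, price)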
def kuaidaili(self, page):
    """ kuaidaili https://www.kuaidaili.com/free/ """
    version = begin_time()
    threadings = []
    for index in range(1, page + 1):
        work = threading.Thread(target=self.kuaidailithread, args=(index, ))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    self.threadjude()
    end_time(version)
def origin_result_direct(self, input_file1, input_file2, output_file):
    """ get origin results directly, without threading """
    version = begin_time()
    pre = []
    dataset = []
    with codecs.open(input_file1, 'r', 'utf-8') as f:
        temp_context = ''
        last_index = ''
        for tempword in f:
            if tempword == '\r\n':
                pre.append("1#" + temp_context + last_index)
                temp_context = ''
                last_index = ''
            else:
                if len(last_index):
                    temp_context += (last_index + '#')
                last_index = tempword[:-1].strip()
    with codecs.open(input_file2, 'r', 'utf-8') as f:
        temp_context = []
        index = 0
        totalnum = len(pre)
        for tempword in f:
            if tempword == '\r\n':
                if len(temp_context) < 9:
                    continue
                elif len(temp_context) == 9:
                    if index < totalnum:
                        dataset.append(pre[index] + '#' + temp_context[0])
                    index += 1
                    temp_context = []
                else:
                    index += 1
                    temp_context = []
            else:
                temp_context.append(tempword[:-1].strip())
                if index < totalnum:
                    dataset.append(pre[index] + '#' +
                                   tempword[:-1].replace(u'\ufeff', '').strip())
    pickle.dump([pre, dataset], open(output_file, "wb"))
    end_time(version)
def sixsixip(self, area, page):
    """ 66ip proxy http://www.66ip.cn/areaindex_{area}/{page}.html """
    version = begin_time()
    threadings = []
    for index in range(1, area + 1):
        for pageindex in range(1, page + 1):
            print(str(index) + ' ' + str(pageindex))
            work = threading.Thread(target=self.sixsixthread,
                                    args=(index, pageindex))
            threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    self.threadjude()
    end_time(version)
def gatherproxy(self, types):
    """
    :100: very nice website
    first of all you should download the proxy ip txt from:
    http://www.gatherproxy.com/zh/proxylist/country/?c=China
    """
    begin_time()
    with open('proxy/gatherproxy', 'r') as file_d:
        for index in file_d.readlines():
            if types == 0:
                self.waitjudge.append('http://' + index[0:-1])
            elif types == 1:
                self.waitjudge.append('https://' + index[0:-1])
            else:
                self.waitjudge.append('http://' + index[0:-1])
                self.waitjudge.append('https://' + index[0:-1])
    self.threadjude()
    end_time()
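# For reference, a sketch of what gatherproxy queues for judging, assuming a
# hypothetical two-line proxy/gatherproxy file; types=2 takes the else branch and
# queues both schemes for every line:
#
#   proxy/gatherproxy:
#     1.2.3.4:8080
#     5.6.7.8:3128
#
#   spider.gatherproxy(2)  ->  self.waitjudge ==
#     ['http://1.2.3.4:8080', 'https://1.2.3.4:8080',
#      'http://5.6.7.8:3128', 'https://5.6.7.8:3128']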