Example #1
def dump_test_make(pre_file='SMN/data/smn_test.pkl',
                   result_file='SMN/data/result_test.txt',
                   max_word_per_utterence=50,
                   output_file='SMN/data/datasets_test.pkl'):
    """
    build the test dataset and dump it to output_file
    """
    version = begin_time()
    pre = pickle.load(open(pre_file, "rb"))
    revs, wordvecs, max_l2 = pre[0], pre[1], pre[2]
    datasets = make_data(revs,
                         wordvecs.word_idx_map,
                         max_l=max_word_per_utterence)
    dump_bigger(datasets, output_file)
    end_time(version)
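
A minimal usage sketch for the function above (paths are its defaults; note that result_file is accepted but not used inside this function):

# Build the pickled test dataset from the preprocessed smn_test.pkl
dump_test_make(pre_file='SMN/data/smn_test.pkl',
               output_file='SMN/data/datasets_test.pkl')
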
Example #2
 def origin_test_master(self,
                        input_file,
                        output_file,
                        block_size=100000,
                        test_size=2000):
     """
     the master of multi-threading for getting origin samples
     """
     version = begin_time()
     with codecs.open(input_file, 'r', 'utf-8') as f:
         self.origin_sample = f.readlines()
     threadings = []
     num = len(self.origin_sample)
     start = 0
     end = min(block_size, num - 1)
     for block in range(int(num / block_size) + 1):
         while self.origin_sample[end] != '\r\n' and end < num - 1:
             end += 1
         work = threading.Thread(target=self.origin_sample_agent,
                                 args=(
                                     start,
                                     end,
                                     block,
                                 ))
         threadings.append(work)
         start = end + 1
         end = min(num - 1, block_size * (block + 2))  # nominal end of the next block
     for work in threadings:
         work.start()
     for work in threadings:
         work.join()
     content = [self.content[k] for k in sorted(self.content.keys())]
     self.content = sum(content, [])
     response = [self.response[k] for k in sorted(self.response.keys())]
     self.response = sum(response, [])
     totalnum = len(self.content)
     randomlists = np.random.randint(0, totalnum, test_size)
     for index in randomlists:
         temp_context = self.content[index]
         self.test.append("1#" + temp_context + self.response[index])
         otherindexs = np.random.randint(0, totalnum, 9)
         for otherindex in otherindexs:
             while otherindex == index:
                 otherindex = np.random.randint(0, totalnum, 1)[0]
             self.test.append("0#" + temp_context +
                              self.response[otherindex])
     pickle.dump(self.test, open(output_file, 'wb'))
     end_time(version)
Example #3
    def kuaidaili(self, page):
        """
        kuaidaili https://www.kuaidaili.com/free/
        """

        version = begin_time()
        threadings = []
        for index in range(1, page + 1):
            work = threading.Thread(target=self.kuaidailithread,
                                    args=(index, ))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.threadjude()
        end_time(version)
Example #4
def make_data_train(revs,
                    word_idx_map,
                    max_l=50,
                    validation_num=50000,
                    block_size=200000):
    """
    Transforms sentences into a 2-d matrix.
    """
    version = begin_time()
    test = []
    threadings = queue.Queue()
    waitthreadings = queue.Queue()
    num = len(revs)
    start = 0
    end = min(block_size, num - 1)
    for block in range(int(num / block_size) + 1):
        work = threading.Thread(target=make_data_theading,
                                args=(
                                    revs,
                                    word_idx_map,
                                    max_l,
                                    validation_num,
                                    start,
                                    end,
                                ))
        threadings.put(work)
        start = end + 1
        end = min(num - 1, block_size * (block + 2))
    while not threadings.empty():
        tempwork = threadings.get()
        tempwork.start()
        waitthreadings.put(tempwork)
    while not waitthreadings.empty():
        waitthreadings.get().join()

    global trains, vals
    train = sum(trains, [])
    val = sum(vals, [])
    train = np.array(train, dtype="int")
    val = np.array(val, dtype="int")
    test = np.array(test, dtype="int")
    print('training data', len(train), 'val data', len(val), 'spend time:',
          spend_time(version))
    return [train, val, test]
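
A hedged usage sketch, assuming revs and wordvecs are unpacked from the preprocessing pickle the same way the other examples do (pre[0] and pre[1]); the pickle path is hypothetical:

import pickle

pre = pickle.load(open('SMN/data/smn_train.pkl', 'rb'))  # hypothetical path
revs, wordvecs = pre[0], pre[1]
train, val, test = make_data_train(revs, wordvecs.word_idx_map, max_l=50)
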
Example #5
    def origin_result_direct(self, input_file1, input_file2, output_file):
        """
        build origin samples directly, without threading
        """

        version = begin_time()
        pre = []
        dataset = []
        with codecs.open(input_file1, 'r', 'utf-8') as f:
            temp_context = ''
            last_index = ''
            for tempword in f:
                if tempword == '\r\n':
                    pre.append("1#" + temp_context + last_index)
                    temp_context = ''
                    last_index = ''
                else:
                    if len(last_index):
                        temp_context += (last_index + '#')
                    last_index = tempword[:-1].strip()
        with codecs.open(input_file2, 'r', 'utf-8') as f:
            temp_context = []
            index = 0
            totalnum = len(pre)
            for tempword in f:
                if tempword == '\r\n':
                    if len(temp_context) < 9:
                        continue
                    elif len(temp_context) == 9:
                        if index < totalnum:
                            dataset.append(pre[index] + '#' + temp_context[0])
                        index += 1
                        temp_context = []
                    else:
                        index += 1
                        temp_context = []
                else:
                    temp_context.append(tempword[:-1].strip())
                    if index < totalnum:
                        dataset.append(
                            pre[index] + '#' +
                            tempword[:-1].replace(u'\ufeff', '').strip())
            pickle.dump([pre, dataset], open(output_file, "wb"))
        end_time(version)
Example #6
    def sixsixip(self, area, page):
        """
        66ip proxy http://www.66ip.cn/areaindex_{area}/{page}.html
        """

        version = begin_time()
        threadings = []
        for index in range(1, area + 1):
            for pageindex in range(1, page + 1):
                print(str(index) + ' ' + str(pageindex))
                work = threading.Thread(target=self.sixsixthread,
                                        args=(index, pageindex))
                threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.threadjude()
        end_time(version)
Example #7
def make_data_theading(revs, word_idx_map, max_l, validation_num, start, end):
    """
    worker thread that builds train / validation rows for make_data_train
    """
    version = begin_time()
    temptrain, tempval, temptest = [], [], []

    for index in range(start, end):
        rev = revs[index]
        sent = get_idx_from_sent_msg(rev["m"], word_idx_map, max_l, True)
        sent += get_idx_from_sent(rev["r"], word_idx_map, max_l, True)
        sent += get_session_mask(rev["m"])
        sent.append(int(rev["y"]))
        if index >= validation_num:
            temptrain.append(sent)
        else:
            tempval.append(sent)
    global trains, vals
    trains.append(temptrain)
    vals.append(tempval)
Example #8
    def get_href(self):
        """
        get summarization from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1
        """

        version = begin_time()
        threadings = []
        for index in range(71):
            work = threading.Thread(target=self.href_once, args=(index, ))
            threadings.append(work)

        for work in threadings:
            # time.sleep(.5)
            work.start()
        for work in threadings:
            work.join()
        href_map = [self.href_map[k] for k in sorted(self.href_map.keys())]
        self.href_map = sum(href_map, [])
        with codecs.open('bjh_href_poison.txt', 'w', encoding='utf-8') as f:
            f.write("\n".join(self.href_map))
        end_time(version)
Example #9
    def data5u(self):
        """
        data5u proxy http://www.data5u.com/
        currently none of its proxies are usable
        """

        version = begin_time()
        url_list = ['', 'free/gngn/index.shtml', 'free/gwgn/index.shtml']
        host = 'http://www.data5u.com/'
        for uri in url_list:
            html = self.get_request_proxy(host + uri, 0)
            if not html:
                continue
            table = html.find_all('ul', class_='l2')
            for index in table:
                tds = index.find_all('li')
                ip = tds[3].text
                self.waitjudge.append(ip + '://' + tds[0].text + ':' +
                                      tds[1].text)
        self.threadjude()
        end_time(version)
Example #10
def test_model(dataset_file='SMN/data/datasets_test11.pkl',
               pre_file='SMN/data/smn_test11.pkl',
               model_name='SMN/data/model.bin',
               result_file='SMN/data/result_test11.txt'):
    """
    test the model and return its accuracy
    """
    version = begin_time()
    datasets = load_bigger(dataset_file)
    pre = pickle.load(open(pre_file, "rb"))
    wordvecs = pre[1]
    predict(datasets,
            wordvecs.W,
            batch_size=200,
            max_l=50,
            hidden_size=200,
            word_embedding_size=200,
            model_name=model_name,
            result_file=result_file)
    sampleConduct = SampleConduct()
    end_time(version)
    return sampleConduct.calculate_test(result_file)
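
A hedged end-to-end call, assuming the pickles were produced by dump_test_make (Example #1) and that a trained model file exists at model_name:

accuracy = test_model(dataset_file='SMN/data/datasets_test.pkl',
                      pre_file='SMN/data/smn_test.pkl',
                      model_name='SMN/data/model.bin',
                      result_file='SMN/data/result_test.txt')
print('top-1 accuracy:', accuracy)
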
Example #11
    def testdb(self, types):
        '''
        test whether the proxies stored in the DB are usable
        '''

        version = begin_time()
        typestr = ''
        if types == 2:
            typestr = '(0,1,2,3)'
        elif types == 1:
            typestr = '(1,3)'
        else:
            typestr = '(0,2)'
        results = self.Db.select_db(self.select_all % typestr)
        if results != 0:
            for index in results:
                self.waitjudge.append(index[0])
            self.threadjude()
        self.initproxy()
        end_time(version)
Example #12
    def xiciproxy(self, page):
        """
        xici proxy http://www.xicidaili.com/nn/{page}
        The first proxy site I used, but most of its proxies no longer work.
        """

        if not str(page).isdigit():
            print("Please input num!")
            return []

        version = begin_time()
        url = 'http://www.xicidaili.com/nn/%d'
        for index in range(1, page + 1):
            html = basic_req(url % (index), 0)
            tem = html.find_all('tr')
            for row in range(1, len(tem)):
                tds = tem[row].find_all('td')
                ip = tds[5].text.lower()
                self.waitjudge.append(ip + '://' + tds[1].text + ':' +
                                      tds[2].text)
        self.threadjude()
        end_time(version)
Example #13
    def origin_t_direct(self,
                        input_file='SMN/data/test_SMN.pkl',
                        output_file='SMN/data/weibo/val_Dataset.pkl',
                        small_size=10000,
                        word2id_file='SMN/data/weibo/word2id.pkl'):
        """
        build origin samples directly, without threading
        """

        version = begin_time()
        test = pickle.load(open(input_file, 'rb'))
        self.word2id = pickle.load(open(word2id_file, 'rb'))
        c = []
        r = []
        for tempword in test:
            words = tempword[2:].split('#')
            contexts = words[:-1]  # every utterance except the candidate reply
            replys = words[-1]     # the candidate reply utterance
            context = []
            reply = []
            for idx, index in enumerate(contexts):
                if idx:
                    context.append(1)  # _EOS_ id separates utterances
                for temp in index.split():
                    context.append(self.word2id[LCS(temp)]
                                   if LCS(temp) in self.word2id else 0)
            for temp in replys.split():
                reply.append(self.word2id[LCS(temp)]
                             if LCS(temp) in self.word2id else 0)
            r.append(reply)
            c.append(context)
        y = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] * (len(test) // 10)
        val_data = {'y': y, 'r': r, 'c': c}
        pickle.dump(val_data, open(output_file, "wb"))
        end_time(version)
Example #14
    def origin_sample_direct(self,
                             input1_file,
                             input2_file,
                             output_file,
                             small_size=2000):
        """
        build origin samples directly, without threading
        """

        version = begin_time()
        with codecs.open(input1_file, 'r', 'utf-8') as f:
            sample1 = f.readlines()
        with codecs.open(input2_file, 'r', 'utf-8') as f:
            sample2 = f.readlines()

        temp_context = ''
        last_index = ''
        content = []
        r = []
        for tempword in sample1:
            if tempword == '\n':
                content.append(temp_context + last_index[:-5])
                temp_context = ''
                last_index = ''
            else:
                if len(last_index):
                    temp_context += last_index
                last_index = tempword[:-1].strip() + '[SEP]'
        num = 0
        print(len(sample2))
        for index, tempword in enumerate(sample2):
            if tempword != '\n':
                last_index = tempword[:-1].replace('\"', '').replace('\\', '')
                r.append('0#' + content[num] + '#' + last_index)
            else:
                num += 1
        pickle.dump(r, open(output_file, "wb"))

        end_time(version)
Example #15
def run_model(pre_file,
              types,
              model_name='SMN/data/model_little0.pkl',
              max_word_per_utterence=50,
              validation_num=500000,
              result_file='SMN/data/20result1.txt',
              exicted_model=None):
    """
    run the model for training or prediction
    @params: types 0-train, 1-predict
    """
    version = begin_time()

    pre = pickle.load(open(pre_file, "rb"))
    revs, wordvecs, max_l2 = pre[0], pre[1], pre[2]

    datasets = make_data(revs,
                         wordvecs.word_idx_map,
                         max_l=max_word_per_utterence,
                         validation_num=validation_num)

    if not types:
        train(datasets,
              wordvecs.W,
              batch_size=200,
              max_l=max_word_per_utterence,
              hidden_size=200,
              word_embedding_size=200,
              exicted_model=exicted_model)
    else:
        predict(datasets,
                wordvecs.W,
                batch_size=200,
                max_l=max_word_per_utterence,
                hidden_size=200,
                word_embedding_size=200,
                model_name=model_name,
                result_file=result_file)
    end_time(version)
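
Hypothetical calls for the two modes described in the docstring (the pre_file path is an assumption):

run_model('SMN/data/smn.pkl', types=0)                                   # train a new model
run_model('SMN/data/smn.pkl', types=1, model_name='SMN/data/model.bin')  # predict with a saved model
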
Example #16
    def build_md(self, load_img=False):
        """
        build the markdown files; optionally download their images as well
        """
        version = begin_time()

        threadings = []
        for index, tid in enumerate(self.request_list):
            work = threading.Thread(target=self.build_md_once,
                                    args=(
                                        index,
                                        tid,
                                    ))
            threadings.append(work)

        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        if not load_img:
            return
        img_map = {k: self.img_map[k] for k in sorted(self.img_map.keys())}
        img_threadings = []
        for index in img_map.keys():
            for img_id, img_url in enumerate(img_map[index]):
                work = threading.Thread(target=self.load_img,
                                        args=(
                                            index,
                                            img_id,
                                            img_url,
                                        ))
                img_threadings.append(work)
        for work in img_threadings:
            work.start()
        for work in img_threadings:
            work.join()

        end_time(version)
Example #17
 def preData(self):
     """
     prepare the data
     """
     version = begin_time()
     with open('vsm/test3', 'r') as file_d:
         articles = file_d.readlines()
     threadings = []
     self.articleNum = len(articles)
     self.articleMaps = [None for i in range(self.articleNum)]
     self.resultArray = [None for i in range(self.articleNum)]
     for index in range(self.articleNum):
         work = threading.Thread(target=self.preDataBasic,
                                 args=(
                                     articles[index].strip('\n').rstrip(),
                                     index,
                                 ))
         threadings.append(work)
     for work in threadings:
         work.start()
     for work in threadings:
         work.join()
     end_time(version)
Example #18
 def calculate_test(self, input_file, block_size=10):
     """
     calculate top-1 / top-3 accuracy from the result file
     """
     version = begin_time()
     with codecs.open(input_file, 'r', 'utf-8') as f:
         results = f.readlines()
         totalnum = int(len(results))
         correctnum = 0
         top3num = 0
         for index in range(int(totalnum / block_size)):
             pre = results[index * block_size:(index + 1) * block_size]
             scores = np.array(pre, dtype=float)
             temp_index = scores.argmax()
             top3 = scores.argsort()[-3:][::-1]
             if not temp_index:
                 correctnum += 1
             if 0 in top3:
                 top3num += 1
         print(correctnum, top3num, int(totalnum / block_size),
               spend_time(version),
               str(correctnum / int(totalnum / block_size))[:5],
               str(top3num / int(totalnum / block_size))[:5])
         return str(correctnum / int(totalnum / block_size))[:5]
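
A toy illustration of the per-block scoring above (the scores are made up): each block of block_size lines ranks the true response, at index 0, against its negatives.

import numpy as np

pre = ['0.91\n', '0.12\n', '0.45\n', '0.30\n', '0.08\n',
       '0.77\n', '0.05\n', '0.66\n', '0.21\n', '0.10\n']
scores = np.array(pre, dtype=float)
print(scores.argmax() == 0)                 # True -> counts toward top-1 accuracy
print(0 in scores.argsort()[-3:][::-1])     # True -> counts toward top-3 accuracy
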
Example #19
    def origin_sample_direct(self, input_file, output_file):
        """
        build origin samples directly, without threading
        """

        version = begin_time()
        with codecs.open(input_file, 'r', 'utf-8') as f:
            temp_context = ''
            last_index = ''
            content = []
            response = []
            pre = []
            for tempword in f:
                if tempword == '\r\n':
                    content.append(temp_context)
                    response.append(last_index)
                    pre.append("1#" + temp_context + last_index)
                    temp_context = ''
                    last_index = ''
                else:
                    if len(last_index):
                        temp_context += (last_index + '#')
                    last_index = tempword[:-1].strip()
            pickle.dump(pre, open(output_file, "wb"))
        end_time(version)
Example #20
    def get_classify(self):
        """
        get classify from /discover/playlist
        """

        version = begin_time()
        self.classifylist = {}
        host = 'https://music.163.com/discover/playlist'
        html = get_request_proxy(host, 0)

        if not html:
            print('Empty')
            if can_retry(host):
                self.get_classify()
            return []

        alist = html.find_all('a', class_='s-fc1')
        if not len(alist):
            if can_retry(host):
                self.get_classify()
            print(html)
        for index in alist:
            self.classifylist[index.text] = index['href']
        end_time(version)
Example #21
    def load_spot(self, batch_size=50):
        ''' load spot '''
        version = begin_time()
        self.load_city_list()
        # self.city_list = [10186]
        city_threading = [
            threading.Thread(target=self.load_spot_once, args=(
                1,
                ii,
            )) for ii in self.city_list
        ]
        shuffle_batch_run_thread(city_threading, 150)

        spot_continue = []
        for ii, jj in self.spot_pn.items():
            spot_continue += [
                threading.Thread(target=self.load_spot_once, args=(
                    pn,
                    ii,
                )) for pn in range(2, jj + 1)
            ]

        shuffle_batch_run_thread(spot_continue, 150)
        output = [
            '{},{}'.format(self.id2map[ii], ','.join(jj))
            for ii, jj in self.spot_result.items()
        ]
        output_path = '{}spot.txt'.format(data_dir)
        with open(output_path, 'w') as f:
            f.write('\n'.join(output))
        city_num = len(self.city_list)
        spot_num = sum([len(ii) for ii in self.spot_result.values()])
        echo(
            1,
            'City num: {}\nSpot num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'
            .format(city_num, spot_num, output_path, end_time(version, 0)))
Example #22
 def gatherproxy(self, types):
     """
     :100: very nice website
     first of all, you should download the proxy IP txt from:
     http://www.gatherproxy.com/zh/proxylist/country/?c=China
     """
     version = begin_time()
     if not os.path.exists('%sgatherproxy' % data_path):
         print('Gather file not exist!!!')
         return
     with codecs.open('%sgatherproxy' % data_path, 'r',
                      encoding='utf-8') as f:
         file_d = [ii.strip() for ii in f.readlines()]
     if not types:
         waitjudge = ['http://' + ii for ii in file_d]
     elif types == 1:
         waitjudge = ['https://' + ii for ii in file_d]
     else:
         waitjudge1 = ['http://' + ii for ii in file_d]
         waitjudge2 = ['https://' + ii for ii in file_d]
         waitjudge = [*waitjudge1, *waitjudge2]
     self.waitjudge = waitjudge
     print('load gather over!')
     end_time(version)
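
A hypothetical usage sketch, assuming pool is an instance of the proxy class these methods belong to and the gatherproxy file has already been downloaded into data_path:

pool.gatherproxy(2)   # queue both http:// and https:// variants of every proxy
pool.threadjude()     # then validate them, as the other collectors do
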
Example #23
    def match_goods(self):

        self.headers = {
            'pragma':
            'no-cache',
            'X-Requested-With':
            'XMLHttpRequest',
            'cache-control':
            'no-cache',
            'Cookie':
            '',
            'Content-Type':
            'application/x-www-form-urlencoded;charset=UTF-8',
            'Accept':
            'application/json, text/javascript, */*; q=0.01',
            "Accept-Encoding":
            "",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
        }

        version = begin_time()
        changeHtmlTimeout(30)
        block_size = 10
        if not os.path.exists('%sgoods' % data_dir):
            print('goods file not exist!!!')
            return
        with codecs.open('%sgoods' % data_dir, 'r', encoding='utf-8') as f:
            wait_goods = f.readlines()
        goods_url = [
            re.findall('http.* ', index)[0].strip().replace('https', 'http')
            if 'http' in index and not '【' in index else False
            for index in wait_goods
        ]

        if not os.path.exists('%scollect_wyy' % data_dir):
            print('collect file not exist!!!')
            return
        with codecs.open('%scollect_wyy' % data_dir, 'r',
                         encoding='utf-8') as f:
            collect = f.readlines()
        self.title2map = {
            index.split("||")[1]: index.split("||")[0]
            for index in collect
        }

        threadings = []
        for index, url in enumerate(goods_url):
            if url is False:
                continue
            work = threading.Thread(target=self.get_goods_id_first,
                                    args=(
                                        url,
                                        index,
                                    ))
            threadings.append(work)
        url_len = len(threadings)
        for index in range((url_len - 1) // block_size + 1):
            begin_id = index * block_size
            end_id = min(url_len, (index + 1) * block_size)
            threadings_block = threadings[begin_id:end_id]

            for work in threadings_block:
                work.start()
            for work in threadings_block:
                work.join()

            time.sleep(random.randint(0, 9))

        write_body = [
            ' '.join([self.goods_map[index], body]) if index in self.goods_map
            else (' '.join([self.url2goods[goods_url[index]], body])
                  if goods_url[index] in self.url2goods else body)
            for index, body in enumerate(wait_goods)
        ]
        with codecs.open('%sgoods_one' % data_dir, 'w', encoding='utf-8') as f:
            f.write(''.join(write_body))
        end_time(version)
Example #24
    def onetime_master(self,
                       input_file,
                       output_file,
                       block_size=900000,
                       test_size=2000):
        """
        one-time master: build the train / test split with numpy
        """
        version = begin_time()
        with codecs.open(input_file, 'r', 'utf-8') as f:
            self.origin_sample = f.readlines()
        threadings = []
        num = len(self.origin_sample)
        start = 0
        end = min(block_size, num - 1)
        block_num = int(num / block_size) + 1
        print('Thread Begin. ', num)
        for block in range(block_num):
            while self.origin_sample[end] != '\r\n' and end < num - 1:
                end += 1
            work = threading.Thread(target=self.origin_sample_agent,
                                    args=(
                                        start,
                                        end,
                                        block,
                                    ))
            threadings.append(work)
            start = end + 1
            end = min(num - 1, block_size * (block + 2))  # nominal end of the next block
        print('point 1')
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        print('Thread Over.')
        return self.content, self.response  # early return: the numpy-based split below is never reached
        content = np.hstack(np.array(list(self.content.values())))
        totalnum = len(content)
        print(totalnum)
        randomIndexs = unique_randomint(0, totalnum, test_size)
        otherIndexs = np.setdiff1d(np.arange(totalnum), randomIndexs)
        pre_content = content[otherIndexs]
        test_content = content[randomIndexs]
        del content
        gc.collect()
        response = np.hstack(np.array(list(self.response.values())))
        test_response = [
            response[index] + '\n' +
            list2str(response[unique_randomint(0, totalnum, 9, [index])]) +
            '\n' for index in randomIndexs
        ]
        otherIndexs = np.setdiff1d(np.arange(totalnum), randomIndexs)

        pre_response = response[otherIndexs]
        max_dtype = max(pre_content.dtype, pre_response.dtype)
        pre_next = pre_content.astype(max_dtype) + pre_response.astype(
            max_dtype)
        with open(output_file + 'seq_replies.txt', 'w') as f:
            f.write(list2str(test_response))
        with open(output_file + 'seq_context.txt', 'w') as f:
            f.write(list2str(test_content))
        with open(output_file + 'train.txt', 'w') as f:
            f.write(list2str(pre_next))
        end_time(version)
Example #25
    def origin_sample_master(
            self,
            input_file,
            output_file='SMN/data/weibo/train_data_small.pkl',
            word2id_file='SMN/data/weibo/word2id.pkl',
            embedding_file='SMN/data/weibo/word_embedding.pkl',
            block_size=900000,
            small_size=100000):
        """
        the master of multi-threading for getting origin samples
        """
        version = begin_time()
        with codecs.open(input_file, 'r', 'utf-8') as f:
            self.origin_sample = f.readlines()
        self.word2id = pickle.load(open(word2id_file, 'rb'))
        # self.embedding = pickle.load(open(embedding_file, 'rb'))

        threadings = []
        num = len(self.origin_sample)
        start = 0
        end = min(block_size, num - 1)
        for block in range(int(num / block_size) + 1):
            while self.origin_sample[end] != '\r\n' and end < num - 1:
                end += 1
            work = threading.Thread(target=self.origin_sample_agent,
                                    args=(
                                        start,
                                        end,
                                        block,
                                    ))
            threadings.append(work)
            start = end + 1
            end = min(num - 1, block_size * (block + 2))  # nominal end of the next block
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        content = sum(
            [self.content[k] for k in sorted(self.content.keys())], [])
        response = sum(
            [self.response[k] for k in sorted(self.response.keys())], [])

        totalnum = len(content)
        print(totalnum)
        # return totalnum
        randomIndexs = unique_randomint(0, totalnum, small_size)
        y = [1, 0, 0] * small_size
        c = []
        r = []
        for index in randomIndexs:
            c.append(content[index])
            c.append(content[index])
            c.append(content[index])
            r.append(response[index])
            r.append(response[unique_randomint(0, totalnum, 1, [index])[0]])
            r.append(response[unique_randomint(0, totalnum, 1, [index])[0]])

        train_data = {'y': y, 'r': r, 'c': c}
        pickle.dump(train_data, open(output_file, "wb"))
        end_time(version)
Example #26
    def origin_sample_direct(self,
                             input1_file,
                             input2_file,
                             output_file,
                             small_size=10000,
                             word2id_file='SMN/data/weibo/word2id.pkl'):
        """
        build origin samples directly, without threading
        """

        version = begin_time()
        with codecs.open(input1_file, 'r', 'utf-8') as f:
            sample1 = f.readlines()
        with codecs.open(input2_file, 'r', 'utf-8') as f:
            sample2 = f.readlines()
        self.word2id = pickle.load(open(word2id_file, 'rb'))
        temp_context = []
        last_index = []
        c = []
        r = []
        num = 0
        for tempword in sample1:
            if tempword == '\r\n':
                num += 1
                for idx in range(10):
                    c.append(temp_context + last_index[:-1])
                temp_context = []
                last_index = []
            else:
                if len(last_index):
                    temp_context += last_index
                last_index = tempword[:-1].replace('\"', '').replace(
                    '\\', '').strip().split()
                if '\"' in last_index:
                    print(last_index)
                last_index = [(self.word2id[LCS(index)]
                               if LCS(index) in self.word2id else 0)
                              for index in last_index]
                last_index.append(1)
        # for idx in range(10):
        #     c.append(temp_context + last_index[:-1])
        num = 0
        print(len(sample2))
        for index, tempword in enumerate(sample2):
            if tempword != '\r\n':
                num += 1
                last_index = tempword[:-1].replace('\"', '').replace(
                    '\\', '').strip().split()
                last_index = [(self.word2id[LCS(index)]
                               if LCS(index) in self.word2id else 0)
                              for index in last_index]
                r.append(last_index)
            else:
                if num != 10:
                    r.append(last_index)
                    print(num, index)
                num = 0
        y = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] * small_size
        val_data = {'y': y, 'r': r, 'c': c}
        pickle.dump(val_data, open(output_file, "wb"))

        end_time(version)
Example #27
    def word2ids(self,
                 input_file,
                 embedding_file,
                 output1_file='SMN/data/weibo/word2id.pkl',
                 output2_file='SMN/data/weibo/word_embedding.pkl',
                 output3_file='SMN/data/weibo/word2id',
                 min_n=1,
                 max_n=3):
        """
        build the word-to-id map and the word embedding matrix
        """
        version = begin_time()
        with codecs.open(input_file, 'r', 'utf-8') as f:
            origin_sample = f.readlines()
        word_embedding = load_bigger(embedding_file)
        words = []
        word_map = {}
        embedding_lists = []

        word_map['_OOV_'] = 0
        word_map['_EOS_'] = 1
        embedding_lists.append([0] * 200)
        embedding_lists.append([0] * 200)
        for index in origin_sample:
            if index == '\r\n':
                continue
            words += [LCS(idx) for idx in index.replace('\r\n', '').split()]
            # words.update(set(index.replace('\r\n', '').split()))
        words = Counter(words)
        words = [index for index in words]
        word2id = ['_OOV_ 0', '_EOS_ 1']
        word_size = word_embedding.wv.syn0[0].shape[0]

        print('Step 2: Begin')
        index_num = 2
        for idx, index in enumerate(words):
            if index in word_map:
                continue
            if index in word_embedding.wv.vocab.keys():
                word_map[index] = index_num
                index_num += 1
                word2id.append(index + ' ' + str(word_map[index]))
                embedding_lists.append(word_embedding[index].astype('float32'))
            else:
                ngrams = compute_ngrams(index, min_n=min_n, max_n=max_n)
                word_vec = np.zeros(word_size, dtype=np.float32)
                ngrams_found = 0
                ngrams_single = [ng for ng in ngrams if len(ng) == 1]
                ngrams_more = [ng for ng in ngrams if len(ng) > 1]
                for ngram in ngrams_more:
                    if ngram in word_embedding.wv.vocab.keys():
                        word_vec += word_embedding[ngram]
                        ngrams_found += 1
                if ngrams_found == 0:
                    for ngram in ngrams_single:
                        if ngram in word_embedding.wv.vocab.keys():
                            word_vec += word_embedding[ngram]
                            ngrams_found += 1
                if word_vec.any():
                    word_vec /= max(1, ngrams_found)
                    word_map[index] = index_num
                    index_num += 1
                    word2id.append(index + ' ' + str(word_map[index]))
                    embedding_lists.append(word_vec)
        print(index_num)
        with open(output3_file, 'w') as f:
            f.write(list2str(word2id))
        print('Step 2: Over')

        # return embedding_lists, word_map
        pickle.dump(embedding_lists, open(output2_file, "wb"))
        pickle.dump(word_map, open(output1_file, "wb"))
        end_time(version)
Example #28
    def word2ids(self,
                 input_file,
                 embedding_file,
                 output1_file='SMN/data/weibo/word2id.pkl',
                 output2_file='SMN/data/weibo/word_embedding.pkl',
                 output3_file='SMN/data/weibo/word2id'):
        """
        build the word-to-id map and the word embedding matrix
        """
        version = begin_time()
        with codecs.open(input_file, 'r', 'utf-8') as f:
            origin_sample = f.readlines()
        word_embedding = load_bigger(embedding_file)
        words = []
        word_map = {}
        embedding_lists = []

        word_map['_OOV_'] = 0
        word_map['_EOS_'] = 1
        embedding_lists.append([0] * 200)
        embedding_lists.append([0] * 200)
        for index in origin_sample:
            if index == '\r\n':
                continue
            words += [LCS(idx) for idx in index.replace('\r\n', '').split()]
            # words.update(set(index.replace('\r\n', '').split()))
        words = Counter(words)
        words = [index for index in words if words[index] > 2]
        word2id = ['_OOV_ 0', '_EOS_ 1']

        print('Step 2: Begin')
        index_num = 2
        for idx, index in enumerate(words):
            if index in word_embedding:
                if index not in word_map:
                    word_map[index] = index_num
                    index_num += 1
                    word2id.append(index + ' ' + str(word_map[index]))
                    embedding_lists.append(
                        list(word_embedding[index].astype('float16')))
            # elif index[:3] in word_embedding:
            #     if index[:3] not in word_map:
            #         word_map[index[:3]] = index_num
            #         word_map[index] = index_num
            #         index_num += 1
            #         word2id.append(index[:3] + ' ' + str(word_map[index[:3]]))
            #         word2id.append(index + ' ' + str(word_map[index]))
            #         embedding_lists.append(list(word_embedding[index[:3]].astype('float16')))
            #     else:
            #         word_map[index] = word_map[index[:3]]
            #         word2id.append(index + ' ' + str(word_map[index]))
            # elif index[:2] in word_embedding:
            #     if index[:2] not in word_map:
            #         word_map[index[:2]] = index_num
            #         word_map[index] = index_num
            #         index_num += 1
            #         word2id.append(index[:2] + ' ' + str(word_map[index[:2]]))
            #         word2id.append(index + ' ' + str(word_map[index]))
            #         embedding_lists.append(list(word_embedding[index[:2]].astype('float16')))
            #     else:
            #         word_map[index] = word_map[index[:2]]
            #         word2id.append(index + ' ' + str(word_map[index]))
            # elif index[:1] in word_embedding:
            #     if index[:1] not in word_map:
            #         word_map[index[:1]] = index_num
            #         word_map[index] = index_num
            #         index_num += 1
            #         word2id.append(index[:1] + ' ' + str(word_map[index[:1]]))
            #         word2id.append(index + ' ' + str(word_map[index]))
            #         embedding_lists.append(list(word_embedding[index[:1]].astype('float16')))
            #     else:
            #         word_map[index] = word_map[index[:1]]
            #         word2id.append(index + ' ' + str(word_map[index]))
        print(index_num)
        with open(output3_file, 'w') as f:
            f.write(list2str(word2id))
        print('Step 2: Over')

        # return embedding_lists, word_map
        pickle.dump(embedding_lists, open(output2_file, "wb"))
        pickle.dump(word_map, open(output1_file, "wb"))
        end_time(version)
Example #29
        result.append('best cv score:' + str(eval_hist['auc-mean'][-1]) + '\n')
        with open(model_path + 'result', 'a') as f:
            f.write('\n'.join([str(index) for index in result]))
        print('best n_estimators:', len(eval_hist['auc-mean']))
        print('best cv score:', eval_hist['auc-mean'][-1])
        self.OPT_ROUNDS = len(eval_hist['auc-mean'])
        if eval_hist['auc-mean'][-1] > self.basic_auc:
            self.basic_auc = eval_hist['auc-mean'][-1]
            if index is not None and index != -1:
                self.good_columns.append(self.wait_columns[index])
        with open(model_path + 'columns.csv', 'w') as f:
            f.write(','.join([str(index) for index in self.good_columns]))


if __name__ == '__main__':
    version = begin_time()

    model = False
    single = True
    im = SA()
    # im.pre_data_v1(1)
    # im.pre_data_v1(0)
    # single = True
    if single:
        im.load_data(model)
        im.optimize_model(model)
        im.train_model()
        im.evaulate_model(model)

    else:
        for index in range(-1, len(im.wait_columns)):  # filter good feature
Example #30
    def bulk_import_alimama(self):
        """
        bulk import alimama
        """

        version = begin_time()
        if not os.path.exists('%scollect_wyy' % data_dir):
            print('Collect File not exist!!!')
            return
        with codecs.open('%scollect_wyy' % data_dir, 'r',
                         encoding='utf-8') as f:
            goods = f.readlines()
        self.goods_candidate = [index.split('||')[0] for index in goods]
        goods_len = len(self.goods_candidate)

        self.headers = {
            'pragma':
            'no-cache',
            'X-Requested-With':
            'XMLHttpRequest',
            'cache-control':
            'no-cache',
            'Cookie':
            '',
            'Content-Type':
            'application/x-www-form-urlencoded;charset=UTF-8',
            'Accept':
            'application/json, text/javascript, */*; q=0.01',
            "Accept-Encoding":
            "",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
            'Origin':
            'http://pub.alimama.com',
            'Referer':
            'http://pub.alimama.com/promo/search/index.htm?q=%E7%AC%AC%E5%9B%9B%E5%8D%81%E4%B9%9D%E5%A4%A9%2019%E6%98%A5%E5%AD%A3&_t=1550891362391'
        }
        if not os.path.exists('%scookie_alimama' % data_dir):
            print('alimama cookie not exist!!!')
            return
        with codecs.open('%scookie_alimama' % data_dir, 'r',
                         encoding='utf-8') as f:
            cookie = f.readlines()
        url_list = [
            'https://pub.alimama.com/favorites/group/newList.json?toPage=1&perPageSize=40&keyword=&t=',
            str(int(round(time.time() * 1000))), '&_tb_token_=',
            cookie[1][:-1], '&pvid=', cookie[2][:-1]
        ]
        url = ''.join(url_list)
        self.headers['Cookie'] = cookie[0][:-1]
        self.headers['Host'] = url.split('/')[2]

        group_list = basic_req(url, 2, header=self.headers)

        if group_list.status_code != 200 or group_list.json(
        )['info']['message'] == 'nologin':
            print('group_list error')
            return
        group_list = group_list.json()['data']['result']
        group_list = [index['id'] for index in group_list]

        print(group_list)

        assert len(group_list) > (goods_len - 1) // 200

        threadings = []
        for index in range((goods_len - 1) // 200 + 1):
            work = threading.Thread(target=self.bulk_import_alimama_once,
                                    args=(
                                        index,
                                        group_list[index],
                                    ))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        end_time(version)