Example #1
    def f_create_data(self, args):
        self.m_min_occ = args.min_occ
        self.m_max_line = 1e8

        self.m_data_dir = args.data_dir
        self.m_data_name = args.data_name
        self.m_raw_data_file = args.data_file
        self.m_raw_data_path = os.path.join(self.m_data_dir,
                                            self.m_raw_data_file)

        self.m_output_file = args.output_file
        # self.m_vocab_file = self.m_data_name+".vocab.json"
        self.m_vocab_file = "vocab.json"
        ### where to save the newly generated data
        self.m_data_file = "tokenized_" + self.m_output_file
        # self.m_data_file = "tokenized_"+self.m_data_name+"_"+self.m_output_file
        # self.m_data_file = "tokenized_"+self.m_data_name+"_pro_v2.pickle"

        data = pd.read_pickle(self.m_raw_data_path)
        train_df = data["train"]
        valid_df = data["valid"]

        tokenizer = TweetTokenizer(preserve_case=False)

        train_reviews = train_df.review
        train_item_ids = train_df.itemid
        train_user_ids = train_df.userid

        valid_reviews = valid_df.review
        valid_item_ids = valid_df.itemid
        valid_user_ids = valid_df.userid

        vocab_obj = _Vocab()

        self.f_create_vocab(vocab_obj, train_reviews)
        # i = 0

        review_corpus = defaultdict(dict)
        item_corpus = defaultdict(dict)
        user_corpus = defaultdict(dict)
        global_user2uid = defaultdict()
        global_item2iid = defaultdict()

        stop_word_ids = [
            vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>'])
            for w in stopwords.words('english')
        ]
        punc_ids = [
            vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>'])
            for w in string.punctuation
        ]

        print("loading train reviews")

        ss_time = datetime.datetime.now()

        non_informative_words = stop_word_ids + punc_ids
        # non_informative_words = stopwords.words()+string.punctuation
        print("non informative words num", len(non_informative_words))

        ### load user words
        self.m_user_word_file = os.path.join(self.m_data_dir,
                                             args.user_word_file)
        self.f_load_user_word_score(vocab_obj, self.m_user_word_file)

        ### load item words
        self.m_item_word_file = os.path.join(self.m_data_dir,
                                             args.item_word_file)
        self.f_load_item_word_score(vocab_obj, self.m_item_word_file)

        print("user word", len(self.m_user_word_score_map))
        print("item word", len(self.m_item_word_score_map))

        for index, review in enumerate(train_reviews):
            if index > self.m_max_line:
                break

            item_id = train_item_ids.iloc[index]
            user_id = train_user_ids.iloc[index]

            words = tokenizer.tokenize(review)

            word_ids = [
                vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in words
            ]

            new_review_user, new_review_item, new_review_local = self.f_get_perturb(
                vocab_obj, item_id, words, word_ids)

            review_id = len(review_corpus['train'])
            review_obj = _Review()

            review_obj.f_set_review(review_id, word_ids)
            # review_obj.f_set_review(review_id, word_ids, new_word_tf_map, informative_word_num)
            review_obj.f_set_pertub_review(new_review_user, new_review_item,
                                           new_review_local)

            review_corpus["train"][review_id] = review_obj

            if user_id not in user_corpus:
                user_obj = _User()
                user_obj.f_set_user_id(user_id)
                user_corpus[user_id] = user_obj

                global_user2uid[user_id] = len(global_user2uid)

            uid = global_user2uid[user_id]
            user_obj = user_corpus[user_id]
            user_obj.f_add_review_id(review_id)

            if item_id not in item_corpus:
                item_obj = _Item()
                item_corpus[item_id] = item_obj
                item_obj.f_set_item_id(item_id)

                global_item2iid[item_id] = len(global_item2iid)

            iid = global_item2iid[item_id]
            item_obj = item_corpus[item_id]
            item_obj.f_add_review_id(review_obj, review_id)

            review_obj.f_set_user_item(uid, iid)

        e_time = datetime.datetime.now()
        print("load training duration", e_time - ss_time)
        print("load train review num", len(review_corpus["train"]))

        s_time = datetime.datetime.now()

        user_num = len(user_corpus)
        vocab_obj.f_set_user(global_user2uid)

        print("item num", len(item_corpus))

        print("loading valid reviews")
        for index, review in enumerate(valid_reviews):

            if index > self.m_max_line:
                break

            item_id = valid_item_ids.iloc[index]
            user_id = valid_user_ids.iloc[index]

            if user_id not in global_user2uid:
                continue

            if item_id not in item_corpus:
                continue

            words = tokenizer.tokenize(review)

            word_ids = [
                vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in words
            ]

            new_review_user, new_review_item, new_review_local = self.f_get_perturb(
                vocab_obj, item_id, words, word_ids)

            review_id = len(review_corpus["valid"])
            review_obj = _Review()

            review_obj.f_set_review(review_id, word_ids)
            # review_obj.f_set_review(review_id, word_ids, new_word_tf_map, informative_word_num)
            review_obj.f_set_pertub_review(new_review_user, new_review_item,
                                           new_review_local)

            review_corpus["valid"][review_id] = review_obj

            uid = global_user2uid[user_id]
            iid = global_item2iid[item_id]
            review_obj.f_set_user_item(uid, iid)

            item_obj = item_corpus[item_id]
            # print(len(item_corpus))
            # item_obj.f_get_RRe(review_obj)
            # item_obj.f_get_ARe(review_obj)

        print("load validate review num", len(review_corpus["valid"]))

        save_data = {
            "item": global_item2iid,
            "review": review_corpus,
            "user": global_user2uid
        }

        print("save data to ", self.m_data_file)
        data_pickle_file = os.path.join(self.m_data_dir, self.m_data_file)
        with open(data_pickle_file, "wb") as f:
            pickle.dump(save_data, f)

        vocab = dict(w2i=vocab_obj.m_w2i,
                     i2w=vocab_obj.m_i2w,
                     user2uid=vocab_obj.m_user2uid)
        with io.open(os.path.join(self.m_data_dir, self.m_vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))
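The method above reads everything it needs from an argparse-style `args` namespace. Below is a minimal sketch of how that namespace might be built and passed in; the class name `DataBuilder` and every default value are placeholders assumed for illustration, not taken from the original source.

import argparse

# Hypothetical driver for Example #1: the class that defines f_create_data is not
# shown in the excerpt, so DataBuilder is an assumed stand-in name.
parser = argparse.ArgumentParser()
parser.add_argument("--min_occ", type=int, default=5)                     # min token frequency used by f_create_vocab
parser.add_argument("--data_dir", type=str, default="./data")
parser.add_argument("--data_name", type=str, default="movie")
parser.add_argument("--data_file", type=str, default="raw_data.pickle")   # pickle holding "train"/"valid" DataFrames
parser.add_argument("--output_file", type=str, default="pro.pickle")
parser.add_argument("--user_word_file", type=str, default="user_word_score.json")
parser.add_argument("--item_word_file", type=str, default="item_word_score.json")
args = parser.parse_args()

builder = DataBuilder()   # assumed class name; any object exposing f_create_data works
builder.f_create_data(args)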
Example #2
    def f_create_data(self, args):
        self.m_min_occ = args.min_occ
        self.m_max_line = 1e8

        self.m_data_dir = args.data_dir
        self.m_data_name = args.data_name
        self.m_raw_data_file = args.data_file
        self.m_raw_data_path = os.path.join(self.m_data_dir, self.m_raw_data_file)

        self.m_vocab_file = self.m_data_name+"_vocab.json"
        ### where to save the newly generated data
        self.m_data_file = "tokenized_"+self.m_data_name+"_pro.pickle"

        data = pd.read_pickle(self.m_raw_data_path)
        train_df = data["train"]
        valid_df = data["valid"]

        tokenizer = TweetTokenizer(preserve_case=False)
        
        train_reviews = train_df.review
        train_item_ids = train_df.itemid
        train_user_ids = train_df.userid

        valid_reviews = valid_df.review
        valid_item_ids = valid_df.itemid
        valid_user_ids = valid_df.userid

        vocab_obj = _Vocab()

        self._create_vocab(vocab_obj, train_reviews)
        # i = 0

        review_corpus = defaultdict(dict)
        item_corpus = defaultdict(dict)
        user_corpus = defaultdict(dict)
        user2uid = defaultdict()

        stop_word_ids = [vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in stopwords.words('english')]
        punc_ids = [vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in string.punctuation]

        print("loading train reviews")

        ss_time = datetime.datetime.now()

        non_informative_words = stop_word_ids + punc_ids
        # non_informative_words = stopwords.words()+string.punctuation
        print("non informative words num", len(non_informative_words))

        # print_index = 0
        for index, review in enumerate(train_reviews):
            if index > self.m_max_line:
                break

            item_id = train_item_ids.iloc[index]
            user_id = train_user_ids.iloc[index]

            words = tokenizer.tokenize(review)
            
            word_ids = [vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in words]

            word_tf_map = Counter(word_ids)
            new_word_tf_map = {}
            for word in word_tf_map:
                if word in non_informative_words:
                    continue

                new_word_tf_map[word] = word_tf_map[word]

            informative_word_num = sum(new_word_tf_map.values())

            if informative_word_num < 5:
                continue

            review_id = len(review_corpus['train'])
            review_obj = _Review()
            review_obj.f_set_review(review_id, word_ids, new_word_tf_map, informative_word_num)

            # print_index += 1

            review_corpus["train"][review_id] = review_obj

            if user_id not in user_corpus:
                user_obj = _User()
                user_obj.f_set_user_id(user_id)
                user_corpus[user_id] = user_obj

                user2uid[user_id] = len(user2uid)

            uid = user2uid[user_id]
            user_obj = user_corpus[user_id]
            user_obj.f_add_review_id(review_id)

            if item_id not in item_corpus:
                item_obj = _Item()
                item_corpus[item_id] = item_obj
                item_obj.f_set_item_id(item_id)

            review_obj.f_set_user_item(uid, item_id)

            item_obj = item_corpus[item_id]
            item_obj.f_add_review_id(review_obj, review_id)

        e_time = datetime.datetime.now()
        print("load training duration", e_time-ss_time)
        print("load train review num", len(review_corpus["train"]))

        s_time = datetime.datetime.now()

        user_num = len(user_corpus)
        vocab_obj.f_set_user(user2uid)
        # vocab_obj.f_set_user_size(user_num)

        save_item_corpus = {}
        
        print("item num", len(item_corpus))

        # print_index = 0
        # print_review_index = 0

        for item_id in item_corpus:
            item_obj = item_corpus[item_id]

            # s_time = datetime.datetime.now()
                
            item_obj.f_get_item_lm()

            for review_id in item_obj.m_review_id_list:

                review_obj = review_corpus["train"][review_id]

                item_obj.f_get_RRe(review_obj)

            if item_id not in save_item_corpus:
                save_item_corpus[item_id] = item_obj.m_avg_review_words

        print("loading valid reviews")
        for index, review in enumerate(valid_reviews):

            if index > self.m_max_line:
                break

            item_id = valid_item_ids.iloc[index]
            user_id = valid_user_ids.iloc[index]

            if user_id not in user2uid:
                continue
            
            if item_id not in save_item_corpus:
                continue
            
            words = tokenizer.tokenize(review)

            word_ids = [vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in words]

            word_tf_map = Counter(word_ids)
            new_word_tf_map = {}
            for word in word_tf_map:
                if word in non_informative_words:
                    continue

                new_word_tf_map[word] = word_tf_map[word]

            informative_word_num = sum(new_word_tf_map.values())

            if informative_word_num < 5:
                continue

            review_id = len(review_corpus["valid"])
            review_obj = _Review()
            review_obj.f_set_review(review_id, word_ids, new_word_tf_map, informative_word_num)

            review_corpus["valid"][review_id] = review_obj
            
            uid = user2uid[user_id]
            review_obj.f_set_user_item(uid, item_id)

            item_obj = item_corpus[item_id]
            # print(len(item_corpus))
            item_obj.f_get_RRe(review_obj)

        print("load validate review num", len(review_corpus["valid"]))

        save_data = {"item": save_item_corpus, "review": review_corpus, "user":user_num}

        print("save data to ", self.m_data_file)
        data_pickle_file = os.path.join(self.m_data_dir, self.m_data_file)
        with open(data_pickle_file, "wb") as f:
            pickle.dump(save_data, f)

        vocab = dict(w2i=vocab_obj.m_w2i, i2w=vocab_obj.m_i2w, user2uid=vocab_obj.m_user2uid)
        with io.open(os.path.join(self.m_data_dir, self.m_vocab_file), 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))
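Example #2 writes two artifacts: the tokenized pickle and the vocab JSON. A minimal sketch of reading them back is shown below, assuming the same data_dir and data_name values that were used to build the file names; this loader is not part of the original code.

import os
import json
import pickle

data_dir = "./data"      # assumed; must match args.data_dir used above
data_name = "movie"      # assumed; must match args.data_name used above

# Unpickling requires the _Review/_Item classes to be importable from the same module layout.
with open(os.path.join(data_dir, "tokenized_" + data_name + "_pro.pickle"), "rb") as f:
    save_data = pickle.load(f)   # {"item": ..., "review": ..., "user": ...}

with open(os.path.join(data_dir, data_name + "_vocab.json"), "rb") as f:
    vocab = json.loads(f.read().decode("utf8", "replace"))

print("vocab size", len(vocab["w2i"]))
print("train review num", len(save_data["review"]["train"]))
print("valid review num", len(save_data["review"]["valid"]))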
Example #3
                                    self.businessZones)

                if self.currpage <= self.counts:
                    self.currpage = self.currpage + 1
                    logging.info("当前关键字%s - 爬到第%d页" % (self.kd, self.currpage))

                elif self.index + 1 < len(self.kds):  # another keyword must remain, since self.kds[self.index] is read after the increment below
                    self.currpage = 0
                    self.counts = 0
                    self.index = self.index + 1
                    self.kd = self.kds[self.index]
                    time.sleep(random.random() * 10)

        except Exception as e:
            logging.error("网络异常 %s" % (e))
            time.sleep(random.random() * 100)
        except KeyboardInterrupt as e:
            logging.error("用户外部中断,Ctrl+C,记录数据")
            with open("recode/data.txt", 'w') as f:
                f.write('%s\n%s' % (self.index, self.currpage))
                f.close()

            logging.shutdown()
            sys.exit(0)


sql = _Item()
scrapy = Snake()
while True:
    scrapy.request()
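The KeyboardInterrupt handler in Example #3 dumps self.index and self.currpage to recode/data.txt before exiting. Below is a minimal sketch of reading that checkpoint back at startup; the helper function is an assumption for illustration and does not appear in the original scraper.

import os

def load_checkpoint(path="recode/data.txt"):
    """Return (index, currpage) as written by the Ctrl+C handler above, or (0, 0) if no checkpoint exists."""
    if not os.path.exists(path):
        return 0, 0
    with open(path, "r") as f:
        lines = f.read().splitlines()
    return int(lines[0]), int(lines[1])

# Example: restore crawl state before entering the request loop
# scrapy.index, scrapy.currpage = load_checkpoint()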