def encode_advertise(self, advertise):
    """Encode one NER-annotated advertisement and persist the result.

    Reads ``advertise["NER"]`` as a sequence of items whose element 0 is
    the word and element 1 is its tag, converts words, tags and characters
    to index representations, and hands them to ``save_encoded_data``.

    The maps returned by the ``build_*`` helpers are written back to
    ``self.maps`` only when ``self.update_maps`` is truthy.

    :param advertise: mapping that provides the ``"NER"`` entry.
    """
    char_map = self.maps["char2idx"]
    word_map = self.maps["word2idx"]
    tag_map = self.maps["tag2idx"]

    tokens = advertise["NER"]
    words = [tok[0] for tok in tokens]
    tags = [tok[1] for tok in tokens]

    padded_words = self.pad_term_sequence(words, max_len=self.seq_max_len)
    padded_tags = self.pad_term_sequence(tags, max_len=self.seq_max_len)

    # The same helper indexes both words and tags, each against its own map.
    word_ids, word_map = self.build_word_representations(padded_words, word_map)
    tag_ids, tag_map = self.build_word_representations(padded_tags, tag_map)

    word_matrix = pad_sequences(
        maxlen=self.seq_max_len,
        sequences=[word_ids],
        value=word_map["__PAD__"],
        padding='post',
        truncating='post',
    )
    x_word = [word_matrix.tolist()]

    tag_matrix = pad_sequences(
        maxlen=self.seq_max_len,
        sequences=[tag_ids],
        value=tag_map["__PAD__"],
        padding='post',
        truncating='post',
    )
    y_tag = [tag_matrix.tolist()]

    char_ids, char_map = self.build_char_representations(padded_words, char_map)
    x_char = [char_ids]

    self.save_encoded_data(x_word, x_char, y_tag)

    # Update maps
    if self.update_maps:
        self.maps["char2idx"] = char_map
        self.maps["word2idx"] = word_map
        self.maps["tag2idx"] = tag_map

    self.advertise_counter += 1
    # NOTE(review): 5000 is presumably the reporting interval -- confirm in sc.
    sc.get_notice(self.advertise_counter, 5000, msg_text="ads processed!")
def encode_advertise(self, advertise):
    """Encode one advertisement as two price-labelled training records.

    The same pipeline runs twice: once on ``advertise["clean_text"]`` and
    once on ``advertise["clean_text_invert"]`` (presumably the text with
    title and detail swapped -- confirm with the producer of that field).
    Both records get ``log(price)`` as their target, and the pair is handed
    to ``save_encoded_data``.

    The maps returned by the ``build_*`` helpers are written back to
    ``self.maps`` only when ``self.update_maps`` is truthy.

    :param advertise: mapping that provides ``"clean_text"``,
        ``"clean_text_invert"`` and ``"price"``.
    """
    char_map = self.maps["char2idx"]
    word_map = self.maps["word2idx"]
    x_word, x_char, y_price = [], [], []

    # Second key: translated position of title and detail.
    for text_key in ("clean_text", "clean_text_invert"):
        tokens = self.tokenize_sentence(advertise[text_key])
        padded = self.pad_term_sequence(tokens, max_len=self.seq_max_len)

        word_ids, word_map = self.build_word_representations(padded, word_map)
        word_matrix = pad_sequences(
            maxlen=self.seq_max_len,
            sequences=[word_ids],
            value=word_map["__PAD__"],
            padding='post',
            truncating='post',
        )
        x_word.append(word_matrix.tolist())

        char_ids, char_map = self.build_char_representations(padded, char_map)
        x_char.append(char_ids)

        y_price.append(np.log(float(advertise["price"])))

    self.save_encoded_data(x_word, x_char, y_price)

    # Update maps
    if self.update_maps:
        self.maps["char2idx"] = char_map
        self.maps["word2idx"] = word_map

    self.advertise_counter += 1
    # NOTE(review): 5000 is presumably the reporting interval -- confirm in sc.
    sc.get_notice(self.advertise_counter, 5000, msg_text="ads processed!")
def save_encoded_data(self, *data):
    """Append encoded observations to ``dataset.jsonl`` in the model folder.

    ``data`` must be exactly three parallel sequences: word representations,
    character representations and targets. One JSON object per observation
    is appended, with keys ``"x_word"``, ``"x_char"`` and ``"y_tag"``. The
    target is always stored under ``"y_tag"``, whatever it holds.

    ``self.processed_counter`` is incremented and reported through
    ``sc.get_notice`` once per written observation.

    :param data: ``(x_word, x_char, targets)`` as positional arguments.
    :raises NotImplementedError: if ``self.model_folder`` is falsy.
    :raises ValueError: if ``data`` does not contain exactly three items.
    """
    if not self.model_folder:
        # TODO: implement saving in DataStorage
        raise NotImplementedError("Saving outside a local folder path is not implemented yet!")

    # Unpack before touching the filesystem so that a malformed call does
    # not create (or open) the dataset file as a side effect.
    x_word, x_char, targets = data

    dataset_path = os.path.join(self.model_folder, "dataset.jsonl")
    with open(dataset_path, "a", encoding="utf-8") as js:
        for word, char, target in zip(x_word, x_char, targets):
            record = {"x_word": word, "x_char": char, "y_tag": target}
            js.write(json.dumps(record) + "\n")
            self.processed_counter += 1
            sc.get_notice(self.processed_counter, msg_text="training obs processed!")
def encode_advertise(self, advertise):
    """Extend the character and word maps with one advertisement's text.

    Tokenizes and pads ``advertise["clean_text"]``, passes the sequence to
    the word and character ``build_*`` helpers, and stores the maps they
    return back into ``self.maps``. Nothing is encoded or saved here.

    :param advertise: mapping that provides the ``"clean_text"`` entry.
    """
    char_map = self.maps["char2idx"]
    word_map = self.maps["word2idx"]

    tokens = self.tokenize_sentence(advertise["clean_text"])
    padded = self.pad_term_sequence(tokens, max_len=self.seq_max_len)

    # In this variant the helpers return only the (updated) map.
    word_map = self.build_word_representations(padded, word_map)
    char_map = self.build_char_representations(padded, char_map)

    # Update maps
    self.maps["char2idx"] = char_map
    self.maps["word2idx"] = word_map

    self.advertise_counter += 1
    # NOTE(review): 5000 is presumably the reporting interval -- confirm in sc.
    sc.get_notice(self.advertise_counter, 5000, msg_text="ads processed!")
def encode_advertise(self, advertise):
    """Extend the character, word and tag maps from one NER-annotated ad.

    Reads ``advertise["NER"]`` as a sequence of items whose element 0 is
    the word and element 1 is its tag, passes words and tags to the
    ``build_*`` helpers, and stores the maps they return back into
    ``self.maps``. With ``self.debug`` set, the word and tag lists are
    printed first.

    :param advertise: mapping that provides the ``"NER"`` entry.
    """
    char_map = self.maps["char2idx"]
    word_map = self.maps["word2idx"]
    tag_map = self.maps["tag2idx"]

    tokens = advertise["NER"]
    words = [tok[0] for tok in tokens]
    tags = [tok[1] for tok in tokens]

    if self.debug:
        print(words)
        print(tags)

    # In this variant the helpers return only the (updated) map.
    word_map = self.build_word_representations(words, word_map)
    char_map = self.build_char_representations(words, char_map)
    tag_map = self.build_tag_representations(tags, tag_map)

    # Update maps
    self.maps["char2idx"] = char_map
    self.maps["word2idx"] = word_map
    self.maps["tag2idx"] = tag_map

    self.advertise_counter += 1
    # NOTE(review): 5000 is presumably the reporting interval -- confirm in sc.
    sc.get_notice(self.advertise_counter, 5000, msg_text="ads processed!")
def encode_advertise(self, advertise):
    """Feed one advertisement into the schema distribution and count it.

    Delegates to ``self.update_schema_dist``, then increments
    ``self.advertise_counter`` and reports progress via ``sc.get_notice``.

    :param advertise: the advertisement record, forwarded unchanged.
    """
    self.update_schema_dist(advertise)

    self.advertise_counter += 1
    # NOTE(review): 5000 is presumably the reporting interval -- confirm in sc.
    sc.get_notice(
        self.advertise_counter,
        5000,
        msg_text="ads processed!",
    )