def load_data(self, labeled_data, ids):
    """Tokenize labeled documents, map sentence labels to ids, and
    estimate the empirical label distribution with a KDE."""
    self.message = {}
    labels_esit = []
    for i in ids:
        sentences = []
        labels = []
        doc_len = []
        sent_len = []
        sents, sent_labels = labeled_data[i]
        for j in range(len(sents)):
            sents[j] = str(sents[j])
            # Mask URL-like spans before tokenization. The dots are escaped
            # so '.' is matched literally instead of as a wildcard.
            dd = re.sub(r'www\.[a-zA-Z0-9.?/&=:#%_-]*', ' <website> ', sents[j])
            dd = re.sub(r'http[a-zA-Z0-9.?/&=:#%_-]*', ' <website> ', dd)
            dd = re.sub(r'[a-zA-Z0-9.?/&=:#%_-]*\.(?:com|net|org|io|gov|me|edu)',
                        ' <website> ', dd)
            tokens = regexp_tokenize(transform_format(dd), self.pattern)
            temp = []
            for tok in tokens:
                if tok not in self.english_punctuations and check_ack_word(tok) == 1:
                    # Normalize numeric, currency, and percentage tokens.
                    if tok.isdigit():
                        tok = '<number>'
                    elif tok.startswith('$'):
                        tok = '<money>'
                    elif tok.endswith('%'):
                        tok = '<percentage>'
                    temp.append(tok.lower())
            if temp:
                # Wrap the (truncated) sentence with <sos>/<eos> markers.
                temp_ = ['<sos>'] + temp[:self.max_seq_len - 2] + ['<eos>']
                sentences.append(temp_)
                label_id = self.lookup_label_id(sent_labels[j])
                labels.append(label_id)
                labels_esit.append(label_id)
                sent_len.append(len(temp_) - 1)
        doc_len.append(len(sents) - 1)
        self.message[i] = (sentences, labels, sent_len, doc_len)
    # Fit a KDE over the observed label ids, score every id in the label
    # support, and softmax-normalize the log-densities into a distribution.
    x_d = np.array(list({v for v in self.label_set.values()}))
    self.kde.fit(np.array(labels_esit)[:, None])
    self.dist = self.kde.score_samples(x_d[:, None])
    self.esit_dist = F.softmax(torch.tensor(self.dist), dim=-1)
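
# A minimal, self-contained sketch of the label-distribution step above,
# assuming self.kde is an sklearn KernelDensity (its construction is not
# shown in this file) and that label ids are consecutive integers. The
# names estimate_label_distribution and bandwidth are illustrative only.
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.neighbors import KernelDensity

def estimate_label_distribution(label_ids, num_labels, bandwidth=0.5):
    """Fit a KDE over observed label ids, score every id in the label
    support, and softmax-normalize the log-densities, mirroring the
    final block of load_data."""
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(np.asarray(label_ids, dtype=float)[:, None])
    support = np.arange(num_labels, dtype=float)[:, None]
    log_density = kde.score_samples(support)  # log p(label id) under the KDE
    return F.softmax(torch.tensor(log_density), dim=-1)

# Example: estimate_label_distribution([0, 0, 1, 2, 2, 2], num_labels=4)
# yields a distribution peaked at ids 0 and 2.
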
def load_data(self, unlabeled_data, ids):
    """Tokenize unlabeled documents; sentences receive the placeholder
    label 10 since no gold labels are available."""
    self.message = {}
    self.ids = []
    self.data_num = 0
    for i in ids:
        try:
            sentences = []
            labels = []
            doc = unlabeled_data[i]
            doc_len = []
            sent_len = []
            # Ensure the final sentence is terminated so the sentence
            # tokenizer does not drop it.
            doc += '.'
            # Mask URL-like spans before sentence splitting (dots escaped
            # so '.' matches literally).
            dd = re.sub(r'http[a-zA-Z0-9.?/&=:#%_-]*', ' <website> ', doc)
            dd = re.sub(r'www\.[a-zA-Z0-9.?/&=:#%_-]*', ' <website> ', dd)
            dd = re.sub(r'[a-zA-Z0-9.?/&=:#%_-]*\.(?:com|net|org|io|gov|me|edu)',
                        ' <website> ', dd)
            sents = sentence_tokenize(dd)
            for j in range(len(sents)):
                tokens = regexp_tokenize(transform_format(sents[j]), self.pattern)
                temp = []
                for tok in tokens:
                    if tok not in self.english_punctuations and check_ack_word(tok) == 1:
                        # Normalize numeric, currency, and percentage tokens.
                        if tok.isdigit():
                            tok = '<number>'
                        elif tok.startswith('$'):
                            tok = '<money>'
                        elif tok.endswith('%'):
                            tok = '<percentage>'
                        temp.append(tok.lower())
                if temp:
                    temp_ = ['<sos>'] + temp[:self.max_seq_len - 2] + ['<eos>']
                    sentences.append(temp_)
                    labels.append(10)  # placeholder label for unlabeled sentences
                    sent_len.append(len(temp_) - 1)
            doc_len.append(min(len(sents) - 1, self.max_seq_num - 1))
            # Truncate each document to at most max_seq_num sentences.
            self.message[i] = (sentences[:self.max_seq_num],
                               labels[:self.max_seq_num],
                               sent_len[:self.max_seq_num],
                               doc_len)
            self.ids.append(i)
        except Exception:
            # Skip documents that cannot be parsed.
            pass
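
# The three URL substitutions above repeat across all of the loaders; here
# is a standalone sketch of that masking step (with the dots escaped, as in
# the fixed patterns) for quick testing. mask_websites is an illustrative
# helper, not part of the original class.
import re

_URL_PATTERNS = [
    re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*'),
    re.compile(r'www\.[a-zA-Z0-9.?/&=:#%_-]*'),
    re.compile(r'[a-zA-Z0-9.?/&=:#%_-]*\.(?:com|net|org|io|gov|me|edu)'),
]

def mask_websites(text):
    """Replace URL-like spans with the ' <website> ' placeholder token."""
    for pattern in _URL_PATTERNS:
        text = pattern.sub(' <website> ', text)
    return text

# mask_websites('see http://example.com or www.test.org for details')
# -> 'see  <website>  or  <website>  for details'
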
def build_vocab(self, unlabeled_data, labeled_data, embedding_size,
                max_seq_num, max_seq_len):
    """Collect tokenized sentences from both corpora, train Word2Vec
    embeddings, and build the word/id lookup tables."""
    sentences = []
    words = []
    if unlabeled_data is not None:
        for (u, v) in unlabeled_data.items():
            try:
                # Mask URL-like spans (dots escaped to match literally).
                dd = re.sub(r'http[a-zA-Z0-9.?/&=:#%_-]*', ' <website> ', v)
                dd = re.sub(r'www\.[a-zA-Z0-9.?/&=:#%_-]*', ' <website> ', dd)
                dd = re.sub(r'[a-zA-Z0-9.?/&=:#%_-]*\.(?:com|net|org|io|gov|me|edu)',
                            ' <website> ', dd)
                sents = sentence_tokenize(dd)
                for j in range(len(sents)):
                    tokens = regexp_tokenize(transform_format(sents[j]), self.pattern)
                    temp = []
                    for tok in tokens:
                        if tok not in self.english_punctuations and check_ack_word(tok) == 1:
                            if tok.isdigit():
                                tok = '<number>'
                            elif tok.startswith('$'):
                                tok = '<money>'
                            elif tok.endswith('%'):
                                tok = '<percentage>'
                            temp.append(tok.lower())
                            words.append(tok.lower())
                    if temp:
                        sentences.append(temp)
            except Exception:
                # Skip documents that cannot be parsed.
                pass
    if labeled_data is not None:
        for (u, v) in labeled_data.items():
            for i in range(len(v[0])):
                v[0][i] = str(v[0][i])
                try:
                    dd = re.sub(r'http[a-zA-Z0-9.?/&=:#%_-]*', ' <website> ', v[0][i])
                    dd = re.sub(r'www\.[a-zA-Z0-9.?/&=:#%_-]*', ' <website> ', dd)
                    dd = re.sub(r'[a-zA-Z0-9.?/&=:#%_-]*\.(?:com|net|org|io|gov|me|edu)',
                                ' <website> ', dd)
                except Exception:
                    print(u, v)
                    print(v[0][i])
                    exit()
                tokens = regexp_tokenize(transform_format(dd), self.pattern)
                temp = []
                for tok in tokens:
                    if tok not in self.english_punctuations and check_ack_word(tok) == 1:
                        if tok.isdigit():
                            tok = '<number>'
                        elif tok.startswith('$'):
                            tok = '<money>'
                        elif tok.endswith('%'):
                            tok = '<percentage>'
                        temp.append(tok.lower())
                        words.append(tok.lower())
                if temp:
                    sentences.append(temp)
    # Token frequencies over both corpora.
    word_frequency = {}
    for w in words:
        word_frequency[w] = word_frequency.get(w, 0) + 1
    # Train Word2Vec embeddings (gensim 3.x keyword names).
    self.model = gensim.models.Word2Vec(sentences, size=embedding_size,
                                        window=5, min_count=1, iter=20,
                                        negative=50)
    # Ids 0-3 are reserved for special tokens; real words start at 4.
    # <unk> (id 1) is registered lazily the first time it is needed.
    x = 4
    self.word2id['<pad>'] = 0
    self.id2word[0] = '<pad>'
    self.word2id['<sos>'] = 2
    self.id2word[2] = '<sos>'
    self.word2id['<eos>'] = 3
    self.id2word[3] = '<eos>'
    self.unk_count = 0
    for sent in sentences:
        for w in sent:
            # Only words seen at least twice get their own id; words
            # missing from the Word2Vec vocabulary map to <unk>.
            if word_frequency[w] >= 2:
                if w in self.model:
                    if w not in self.word2id:
                        self.word2id[w] = x
                        self.id2word[x] = w
                        x += 1
                else:
                    self.word2id['<unk>'] = 1
                    self.id2word[1] = '<unk>'
                    self.unk_count += 1
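
# A compact sketch of the id-assignment policy build_vocab implements:
# ids 0-3 are reserved for <pad>/<unk>/<sos>/<eos>, and only words that
# occur at least twice and survive in the embedding vocabulary receive
# their own ids. build_word_ids and known_words are illustrative
# stand-ins, not the original API.
from collections import Counter

def build_word_ids(sentences, known_words):
    """sentences: list of token lists; known_words: a stand-in for the
    trained Word2Vec vocabulary."""
    freq = Counter(w for sent in sentences for w in sent)
    word2id = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    next_id = 4
    for sent in sentences:
        for w in sent:
            if freq[w] >= 2 and w in known_words and w not in word2id:
                word2id[w] = next_id
                next_id += 1
    return word2id

# build_word_ids([['the', 'cat'], ['the', 'dog']],
#                known_words={'the', 'cat', 'dog'})
# -> {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3, 'the': 4}
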