def preprocess_test():
    content = None
    with open('./raw/test.json') as f:
        content = f.read()
    data = json.loads(content)

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                              cache_dir='./models',
                                              do_lower_case=False)
    rows = []
    for q in data:
        for p in q['paragraphs']:
            words = tokenizer.tokenize(p['text'])
            if len(words) > MAX_SEQ_LENGTH:
                sentences = tokenize_sentence(p['text'])
                sentences = [s for s in sentences if len(s) > 3]
                if len(sentences) < 2:
                    # sentence splitting failed; fall back to a naive
                    # fixed-ratio split
                    ratio = math.ceil(len(words) * 1.0 / MAX_SEQ_LENGTH)
                    sentences = dummy_split(p['text'], ratio)
                for idx, sen in enumerate(sentences):
                    rows.append((q['__id__'], p['id'] + '$$' + str(idx),
                                 q['question'], sen))
            else:
                rows.append((q['__id__'], p['id'], q['question'], p['text']))

    with open('./data/dev.tsv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(
            ('id', 'q_id', 'ans_id', 'question', 'answer', 'is_correct'))
        for idx, row in enumerate(rows):
            q_id, ans_id, question, ans = row
            writer.writerow(
                (str(idx), q_id, ans_id, question, ans.replace('\t', ' '), 0))
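# Both preprocess_test() above and new_main() below fall back to dummy_split()
# when sentence splitting yields fewer than two usable sentences. That helper
# is not part of this listing; the sketch below is a minimal guess at its
# behaviour, based only on the commented-out logic inside new_main()
# (whitespace split regrouped into `ratio` roughly equal chunks), not the
# actual implementation.
import math

def dummy_split(text, ratio):
    words = text.split(' ')
    steps = math.ceil(len(words) / ratio)          # words per chunk
    chunks = []
    for i in range(int(ratio)):
        start = i * steps
        end = start + steps if i < ratio - 1 else len(words)
        chunks.append(' '.join(words[start:end]))  # rejoin each chunk
    return chunks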
def create_label_lst(self):
    # out: (pl_label_lst, vec_label_lst)
    pl_label_lst = list()
    # └> [[('히말라야 인텐시브 고수분크림 150ml 영양', pl_no), 0], ...]
    num = 0
    for modelno, pl_nms in self.toyData.items():
        for i in range(len(pl_nms)):
            pl_label_lst.append([(pl_nms[i][1], pl_nms[i][0]), num])
        # all listings under the same modelno share one label
        num += 1

    vec_label_lst = list()
    # └> [[([[vector], [vector], ...], pl_no), 0], ...]
    # each product name below is handled as one sentence
    for pl_label_set in pl_label_lst:
        # └> [('히말라야 인텐시브 고수분크림 150ml 영양', pl_no), 0]
        goodsnm = pl_label_set[0][0]
        # └> '히말라야 인텐시브 고수분크림 150ml 영양'
        tokenized = tokenize_sentence(goodsnm)
        # └> ['히말라야', '인텐시브', '고수분크림', '150ml', '영양']
        # replace each token with its embedding vector
        for i in range(len(tokenized)):
            word_vec = self.embedding[tokenized[i]]
            tokenized[i] = word_vec
        vec_label_lst.append([(tokenized, pl_label_set[0][1]), pl_label_set[1]])
        # └> [([vectors], pl_no), label]
    return (pl_label_lst, vec_label_lst)
def tokens_list_set(self):
    # Build the tokenized input for FastText.
    tokenized_goods_list = []
    for sent in self.nameList:
        # sent == '잎스네이처 마린콜라겐 50 워터젤 크림', 'D티엔 수분 퐁당 크림', ...
        tokenized = tokenize_sentence(sent)
        # tokenized == ['잎스네이처', '마린콜라겐', '50', '워터젤', '크림'], ...
        tokenized_goods_list.append(tokenized)
        # tokenized_goods_list == [[tokens], [tokens], ...]
    return tokenized_goods_list
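# tokens_list_set() is described as producing the input for FastText. A minimal
# sketch of how that output could be used to train an embedding, assuming
# gensim's FastText implementation; `instance` and the hyperparameter values
# are placeholders, not the project's actual settings.
from gensim.models import FastText

tokenized_goods_list = instance.tokens_list_set()   # `instance` is hypothetical
model = FastText(sentences=tokenized_goods_list,
                 vector_size=100,    # assumed embedding size
                 window=3,
                 min_count=1,
                 epochs=10)
embedding = model.wv                 # token -> vector lookup, as used by self.embedding[...] above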
def get_tokens_from_file(self, path):
    '''
    Tokenize and preprocess the given text. Preprocessing includes
    lowercasing and converting digits to 9.
    '''
    sentences = self.get_sentences_from_file(path)
    all_tokens = [token
                  for sent in sentences
                  for token in utils.tokenize_sentence(sent, True)]
    return all_tokens
def get_tokenized_sentence(self, index):
    '''
    Return the sentence in the position indicated by the index, properly
    tokenized. A cache is used to store sentences from the corpus that
    were previously tokenized.
    '''
    if index in self.tokenized_cache:
        return self.tokenized_cache[index]

    sentence = self[index]
    tokens = utils.tokenize_sentence(sentence)
    self.tokenized_cache[index] = tokens
    return tokens
def recursive_run(directory, only_lines, only_tokens):
    '''
    Recursively tokenizes files in a directory.
    It will call itself on subdirectories.
    '''
    logger = logging.getLogger(__name__)
    logger.info('Entering directory %s' % directory)
    dir_contents = os.listdir(unicode(directory))
    files = 0

    for item in dir_contents:
        full_path = os.path.join(directory, item)
        if os.path.isdir(full_path):
            recursive_run(full_path, only_lines, only_tokens)

        if not item.endswith('.txt'):
            # only consider .txt files
            continue

        with open(full_path, 'rb') as f:
            text = unicode(f.read(), 'utf-8')

        paragraphs = text.split('\n')
        sentences = []
        sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
        for paragraph in paragraphs:
            # don't change to lower case yet in order not to mess with the
            # sentence splitter
            par_sentences = sent_tokenizer.tokenize(paragraph, 'pt')
            sentences.extend(par_sentences)

        if not only_tokens:
            text = '\n'.join(sentences)
            with open(full_path, 'wb') as f:
                f.write(text.encode('utf-8'))

        if not only_lines:
            tokenized_path = full_path.replace('.txt', '.token')
            with open(tokenized_path, 'wb') as f:
                for sentence in sentences:
                    tokens = utils.tokenize_sentence(sentence, True)
                    line = '%s\n' % ' '.join(tokens)
                    f.write(line.encode('utf-8'))

        files += 1

    logger.info('Tokenized %d files' % files)
def _iterate_on_dir(self, path):
    '''
    Internal helper recursive function.
    '''
    # sorted file list like in the parent class
    file_list = sorted(os.listdir(path))
    for filename in file_list:
        full_path = os.path.join(path, filename)
        if os.path.isdir(full_path):
            for item in self._iterate_on_dir(full_path):
                yield item
        else:
            # this is a file
            if not filename.endswith('.txt'):
                continue

            sentences = self.get_sentences_from_file(full_path)
            for sentence in sentences:
                tokens = utils.tokenize_sentence(sentence, preprocess=True)
                if self.yield_tokens:
                    yield tokens
                else:
                    yield self.dictionary.doc2bow(tokens)
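# _iterate_on_dir() yields either raw token lists or bag-of-words vectors via
# self.dictionary.doc2bow, which suggests it feeds a gensim pipeline. A hedged
# usage sketch follows: DirectoryCorpus is a hypothetical wrapper assumed to
# expose this generator through __iter__ and to hold a gensim
# corpora.Dictionary in .dictionary; the real class in the source may differ.
from gensim import models

corpus = DirectoryCorpus('./corpus', yield_tokens=False)   # hypothetical
tfidf = models.TfidfModel(corpus)                          # streams bag-of-words docs lazily
lsi = models.LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=100)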
        # (fragment: tail of a ranking function; the q_ngrams dict is opened
        # earlier in the original source)
        'fourgram': set([])
    }
    for q in q_variants:
        q = q.lower()
        q_ngrams['bigram'] = q_ngrams['bigram'] | set(
            generateNgram(q, 2, '_', punct_set | stopwords))
        q_ngrams['trigram'] = q_ngrams['trigram'] | set(
            generateNgram(q, 3, '_', punct_set | stopwords))
        q_ngrams['fourgram'] = q_ngrams['fourgram'] | set(
            generateNgram(q, 4, '_', punct_set))

    p_scores = [(sentence_score(q_ngrams, p), p) for p in sentences]
    p_scores.sort(key=lambda x: -x[0])
    return p_scores


if __name__ == '__main__':
    from utils import tokenize_sentence
    sentences = tokenize_sentence(
        'Một trận thi đấu bóng đá thông thường diễn ra trong hai hiệp chính thức liên tiếp , mỗi hiệp gồm 45 phút ngăn cách bằng 15 phút nghỉ giữa giờ . Sau khi hiệp 1 , hai đội bóng sẽ phải đổi sân cho nhau để có sự công bằng trong vòng 1 phút .'
    )
    print(sentences)
    # rel_ranking('', sentences)
    sentences = tokenize_sentence(
        'Cũng trong thập niên 1850 , các đội bóng nghiệp dư bắt đầu được thành lập và thường mỗi đội xây dựng cho riêng họ những luật chơi mới của môn bóng đá , trong đó đáng chú ý có câu lạc bộ Sheffield F.C . Việc mỗi đội bóng có luật chơi khác nhau khiến việc điều hành mỗi trận đấu giữa họ diễn ra rất khó khăn .'
    )
    print(sentences)
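# The ranking tail above scores each candidate sentence with sentence_score()
# against the accumulated question n-grams. That helper is not part of this
# listing; the sketch below is an assumption about what such a scorer could
# look like (a weighted count of shared n-grams, with made-up weights), not
# the project's actual scoring function.
def sentence_score(q_ngrams, sentence, weights=(1, 2, 3)):
    s = sentence.lower()
    s_bi = set(generateNgram(s, 2, '_', punct_set | stopwords))
    s_tri = set(generateNgram(s, 3, '_', punct_set | stopwords))
    s_four = set(generateNgram(s, 4, '_', punct_set))
    # weight longer n-gram matches more heavily (weights are placeholders)
    return (weights[0] * len(q_ngrams['bigram'] & s_bi) +
            weights[1] * len(q_ngrams['trigram'] & s_tri) +
            weights[2] * len(q_ngrams['fourgram'] & s_four))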
def __init__(self, file):
    self.sens = tokenize_sentence(file)
    self.freq_words = get_freq_word(file)
def new_main():
    content = None
    with open('./raw/train.json') as f:
        content = f.read()
    data = json.loads(content)

    countd, countm = 0, 0
    ml = 0
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                              cache_dir='./models',
                                              do_lower_case=False)
    for q in data:
        words = tokenizer.tokenize(q['text'])
        ml = len(words) if len(words) > ml else ml   # track the longest passage
        if len(words) > MAX_SEQ_LENGTH:
            sentences = tokenize_sentence(q['text'])
            sentences = [s for s in sentences if len(s) > 3]
            if len(sentences) < 2:
                # sentence splitting failed; fall back to a naive
                # fixed-ratio split
                ratio = math.ceil(len(words) * 1.0 / MAX_SEQ_LENGTH)
                # uwords = word_tokenize(q['text'])
                # dummy = q['text'].split(' ')
                # steps = math.ceil(len(dummy)/ratio)
                # r = []
                # for i in range(0, ratio):
                #     r.append(' '.join(dummy[i*steps:((i*steps) + steps) if i < (ratio - 1) else len(dummy)]))
                sentences = dummy_split(q['text'], ratio)
            if q['label'] is True:
                # rank sentences by relevance to the question
                sentences = rel_ranking(q['question'], sentences)
                relcount = sum([1 if v[0] > 0 else 0 for v in sentences])
                delta = sentences[0][0] - sentences[1][0]
                if relcount == 1 or delta > 4:
                    # a single clear winner: mark only the top sentence correct
                    q['sentences'] = [(v[1], 1 if idx == 0 else 0)
                                      for idx, v in enumerate(sentences)]
                else:
                    if len(tokenizer.tokenize(sentences[0][1] + ' . ' +
                                              sentences[1][1])) < MAX_SEQ_LENGTH:
                        # merge the top two sentences into one positive example
                        q['sentences'] = [
                            (sentences[0][1] + ' . ' + sentences[1][1], 1)
                        ]
                        q['sentences'] += [(s[1], 0) for s in sentences[2:]]
                    else:
                        # accept the risk: keep only the top sentence as correct
                        q['sentences'] = [(v[1], 1 if idx == 0 else 0)
                                          for idx, v in enumerate(sentences)]
            else:
                q['sentences'] = [(sen, 0) for sen in sentences]

    with open('./data/train.tsv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(
            ('id', 'q_id', 'ans_id', 'question', 'answer', 'is_correct'))
        for idx, q in enumerate(data):
            if 'sentences' in q:
                q_id = q['id']
                for aidx, sen in enumerate(q['sentences']):
                    txt, is_correct = sen
                    ans_id = q_id + '_ans_' + str(aidx)
                    writer.writerow((str(idx), q_id, ans_id, q['question'],
                                     txt.replace('\t', ' '), str(is_correct)))
            else:
                is_correct = 1 if q['label'] else 0
                q_id = q['id']
                ans_id = q_id + '_ans_0'
                writer.writerow((str(idx), q_id, ans_id, q['question'],
                                 q['text'].replace('\t', ' '), str(is_correct)))