Example #1
def preprocess_test():
    content = None
    with open('./raw/test.json') as f:
        content = f.read()
    data = json.loads(content)
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                              cache_dir='./models',
                                              do_lower_case=False)
    rows = []
    for q in data:
        for p in q['paragraphs']:
            words = tokenizer.tokenize(p['text'])
            if len(words) > MAX_SEQ_LENGTH:
                sentences = tokenize_sentence(p['text'])
                sentences = [s for s in sentences if len(s) > 3]
                if len(sentences) < 2:
                    # sentence splitting failed; fall back to a naive
                    # fixed-ratio split of the raw text
                    ratio = math.ceil(len(words) * 1.0 / MAX_SEQ_LENGTH)
                    sentences = dummy_split(p['text'], ratio)

                for idx, sen in enumerate(sentences):
                    rows.append((q['__id__'], p['id'] + '$$' + str(idx),
                                 q['question'], sen))
            else:
                rows.append((q['__id__'], p['id'], q['question'], p['text']))
    with open('./data/dev.tsv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(
            ('id', 'q_id', 'ans_id', 'question', 'answer', 'is_correct'))
        for idx, row in enumerate(rows):
            q_id, ans_id, question, ans = row
            writer.writerow(
                (str(idx), q_id, ans_id, question, ans.replace('\t', ' '), 0))
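dummy_split is not defined in this snippet. Judging from the commented-out fallback in Example #16, it most likely splits the raw text into ratio roughly equal chunks of whitespace-separated words. A minimal sketch under that assumption (hypothetical helper, not the repository's actual implementation):

import math

def dummy_split(text, ratio):
    # Hypothetical fallback splitter: cut `text` into `ratio` roughly equal
    # chunks of whitespace-separated words (mirrors the commented-out logic
    # in Example #16).
    words = text.split(' ')
    steps = math.ceil(len(words) / ratio)
    return [' '.join(words[i * steps:(i + 1) * steps]) for i in range(ratio)]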
Example #2
    def create_label_lst(self):
        # out: (pl_label_lst, vec_label_lst)
        pl_label_lst = list()
        # └> [[('히말라야 인텐시브 고수분크림 150ml  영양', pl_no), 0],..]
        num = 0
        for modelno, pl_nms in self.toyData.items():
            for i in range(len(pl_nms)):
                pl_label_lst.append([(pl_nms[i][1], pl_nms[i][0]), num])
            num += 1

        vec_label_lst = list()
        # └> [[([[vector], [vector], ...], pl_no), 0], ...]  # Final
        #      -------------------------> one sentence
        for pl_label_set in pl_label_lst:
            # └> [('히말라야 인텐시브 고수분크림 150ml 영양', pl_no), 0]
            goodsnm = pl_label_set[0][0]
            # └> '히말라야 인텐시브 고수분크림 150ml 영양'
            tokenized = tokenize_sentence(goodsnm)
            # └> ['히말라야', '인텐시브', '고수분크림', '150ml', '영양']
            # └> [[vector], [vector], [vector], ...]
            for i in range(len(tokenized)):
                word_vec = self.embedding[tokenized[i]]
                tokenized[i] = word_vec
            vec_label_lst.append([(tokenized, pl_label_set[0][1]),
                                  pl_label_set[1]])
            #                └> [([tokenized], pl_no), label]
        return (pl_label_lst, vec_label_lst)
    def tokens_list_set(self):
        # INPUT for FastText
        tokenized_goods_list = []
        for sent in self.nameList:
            # └> sent == '잎스네이처 마린콜라겐 50 워터젤 크림', 'D티엔 수분 퐁당 크림', ..
            tokenized = tokenize_sentence(sent)
            # └> tokenized == ['잎스네이처', '마린콜라겐', '50', '워터젤', '크림'], ..
            tokenized_goods_list.append(tokenized)
            # └> tokenized_goods_list == [[tokenized], [], ..]
        return tokenized_goods_list
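tokenize_sentence itself is not shown on this page. The annotations in Example #2 imply that, for these product names, it returns a list of word tokens; a simple whitespace split reproduces the commented example, although the real implementation may use a Korean morphological analyzer:

def tokenize_sentence(sentence):
    # Hypothetical stand-in: plain whitespace tokenization.
    # '히말라야 인텐시브 고수분크림 150ml 영양'
    #   -> ['히말라야', '인텐시브', '고수분크림', '150ml', '영양']
    return sentence.split()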
Example #4
    def get_tokens_from_file(self, path):
        '''
        Tokenize and preprocess the given text.
        Preprocessing includes lowercasing and converting digits to 9.
        '''
        sentences = self.get_sentences_from_file(path)

        all_tokens = [token
                      for sent in sentences
                      for token in utils.tokenize_sentence(sent, True)]

        return all_tokens
Example #5
    def get_tokenized_sentence(self, index):
        '''
        Return the sentence at the position indicated by the index, properly tokenized.
        A cache stores sentences from the corpus that were previously tokenized.
        '''
        if index in self.tokenized_cache:
            return self.tokenized_cache[index]

        sentence = self[index]
        tokens = utils.tokenize_sentence(sentence)
        self.tokenized_cache[index] = tokens

        return tokens
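A short usage sketch for the cache above, assuming the object is an indexable corpus (SentenceCorpus is a hypothetical name):

corpus = SentenceCorpus('corpus.txt')            # hypothetical corpus class
tokens = corpus.get_tokenized_sentence(0)        # tokenized and cached
same_tokens = corpus.get_tokenized_sentence(0)   # served from the cache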
Example #6
    def get_tokens_from_file(self, path):
        '''
        Tokenize and preprocess the given text.
        Preprocessing includes lowercasing and converting digits to 9.
        '''
        sentences = self.get_sentences_from_file(path)

        all_tokens = [
            token for sent in sentences
            for token in utils.tokenize_sentence(sent, True)
        ]

        return all_tokens
Example #7
    def get_tokenized_sentence(self, index):
        '''
        Return the sentence at the position indicated by the index, properly tokenized.
        A cache stores sentences from the corpus that were previously tokenized.
        '''
        if index in self.tokenized_cache:
            return self.tokenized_cache[index]

        sentence = self[index]
        tokens = utils.tokenize_sentence(sentence)
        self.tokenized_cache[index] = tokens

        return tokens
Example #8
def recursive_run(directory, only_lines, only_tokens):
    '''
    Recursively tokenizes files in a directory. It will call itself on 
    subdirectories.
    '''
    logger = logging.getLogger(__name__)

    logger.info('Entering directory %s' % directory)
    dir_contents = os.listdir(unicode(directory))
    files = 0

    for item in dir_contents:
        full_path = os.path.join(directory, item)
        if os.path.isdir(full_path):
            recursive_run(full_path, only_lines, only_tokens)

        if not item.endswith('.txt'):
            # only consider .txt files
            continue

        with open(full_path, 'rb') as f:
            text = unicode(f.read(), 'utf-8')

        paragraphs = text.split('\n')
        sentences = []
        sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')

        for paragraph in paragraphs:
            # don't change to lower case yet in order not to mess with the
            # sentence splitter
            # note: punkt's tokenize() takes realign_boundaries as its second
            # argument, not a language code; the Portuguese model was loaded above
            par_sentences = sent_tokenizer.tokenize(paragraph)
            sentences.extend(par_sentences)

        if not only_tokens:
            text = '\n'.join(sentences)
            with open(full_path, 'wb') as f:
                f.write(text.encode('utf-8'))

        if not only_lines:
            tokenized_path = full_path.replace('.txt', '.token')
            with open(tokenized_path, 'wb') as f:
                for sentence in sentences:
                    tokens = utils.tokenize_sentence(sentence, True)
                    line = '%s\n' % ' '.join(tokens)
                    f.write(line.encode('utf-8'))

        files += 1

    logger.info('Tokenized %d files' % files)
Example #9
def recursive_run(directory, only_lines, only_tokens):
    '''
    Recursively tokenizes files in a directory. It will call itself on 
    subdirectories.
    '''
    logger = logging.getLogger(__name__)
    
    logger.info('Entering directory %s' % directory)
    dir_contents = os.listdir(unicode(directory))
    files = 0
    
    for item in dir_contents:
        full_path = os.path.join(directory, item)        
        if os.path.isdir(full_path):
            recursive_run(full_path, only_lines, only_tokens)
        
        if not item.endswith('.txt'):
            # only consider .txt files
            continue
        
        with open(full_path, 'rb') as f:
            text = unicode(f.read(), 'utf-8')
        
        paragraphs = text.split('\n')
        sentences = []
        sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
        
        for paragraph in paragraphs:
            # don't change to lower case yet in order not to mess with the
            # sentence splitter
            # note: punkt's tokenize() takes realign_boundaries as its second
            # argument, not a language code; the Portuguese model was loaded above
            par_sentences = sent_tokenizer.tokenize(paragraph)
            sentences.extend(par_sentences)
        
        if not only_tokens:
            text = '\n'.join(sentences)
            with open(full_path, 'wb') as f:
                f.write(text.encode('utf-8'))
        
        if not only_lines:
            tokenized_path = full_path.replace('.txt', '.token')
            with open(tokenized_path, 'wb') as f:
                for sentence in sentences:
                    tokens = utils.tokenize_sentence(sentence, True)
                    line = '%s\n' % ' '.join(tokens)
                    f.write(line.encode('utf-8'))
        
        files += 1
    
    logger.info('Tokenized %d files' % files)
Example #10
    def create_label_lst(self):
        # out: (pl_label_lst, vec_label_lst)
        pl_label_lst = list()
        num = 0
        for modelno, pl_nms in self.toyData.items():
            for i in range(len(pl_nms)):
                pl_label_lst.append([(pl_nms[i][1], pl_nms[i][0]), num])
            num += 1

        vec_label_lst = list()
        for pl_label_set in pl_label_lst:
            goodsnm = pl_label_set[0][0]
            tokenized = tokenize_sentence(goodsnm)
            for i in range(len(tokenized)):
                word_vec = self.embedding[tokenized[i]]
                tokenized[i] = word_vec
            vec_label_lst.append([(tokenized, pl_label_set[0][1]),
                                  pl_label_set[1]])
        return (pl_label_lst, vec_label_lst)
Example #11
    def _iterate_on_dir(self, path):
        '''
        Internal helper recursive function.
        '''
        # sorted file list like in the parent class
        file_list = sorted(os.listdir(path))
        for filename in file_list:
            full_path = os.path.join(path, filename)
            if os.path.isdir(full_path):
                for item in self._iterate_on_dir(full_path):
                    yield item
            else:
                # this is a file
                if not filename.endswith('.txt'):
                    continue

                sentences = self.get_sentences_from_file(full_path)
                for sentence in sentences:
                    tokens = utils.tokenize_sentence(sentence, preprocess=True)

                    if self.yield_tokens:
                        yield tokens
                    else:
                        yield self.dictionary.doc2bow(tokens)
Example #12
    def _iterate_on_dir(self, path):
        '''
        Internal helper recursive function.
        '''
        # sorted file list like in the parent class
        file_list = sorted(os.listdir(path))
        for filename in file_list:
            full_path = os.path.join(path, filename)
            if os.path.isdir(full_path):
                for item in self._iterate_on_dir(full_path):
                    yield item
            else:
                # this is a file
                if not filename.endswith('.txt'):
                    continue

                sentences = self.get_sentences_from_file(full_path)
                for sentence in sentences:
                    tokens = utils.tokenize_sentence(sentence, preprocess=True)

                    if self.yield_tokens:
                        yield tokens
                    else:
                        yield self.dictionary.doc2bow(tokens)
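The doc2bow call above suggests that dictionary is a gensim Dictionary, so when yield_tokens is False each yielded item is a bag-of-words vector, i.e. a list of (term_id, count) pairs. A small illustration of that output format (assuming gensim is the library in use):

from gensim.corpora import Dictionary

dictionary = Dictionary([['a', 'sample', 'sentence'], ['another', 'sentence']])
bow = dictionary.doc2bow(['another', 'sample', 'sample'])
# bow is a list of (term_id, count) pairs, e.g. [(1, 2), (3, 1)]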
Example #13
        'fourgram':
        set([])
    }

    for q in q_variants:
        q = q.lower()
        q_ngrams['bigram'] = q_ngrams['bigram'] | set(
            generateNgram(q, 2, '_', punct_set | stopwords))
        q_ngrams['trigram'] = q_ngrams['trigram'] | set(
            generateNgram(q, 3, '_', punct_set | stopwords))
        q_ngrams['fourgram'] = q_ngrams['fourgram'] | set(
            generateNgram(q, 4, '_', punct_set))

    p_scores = [(sentence_score(q_ngrams, p), p) for p in sentences]
    p_scores.sort(key=lambda x: -x[0])

    return p_scores


if __name__ == '__main__':
    from utils import tokenize_sentence
    sentences = tokenize_sentence(
        'Một trận thi đấu bóng đá thông thường diễn ra trong hai hiệp chính thức liên tiếp , mỗi hiệp gồm 45 phút ngăn cách bằng 15 phút nghỉ giữa giờ . Sau khi hiệp 1 , hai đội bóng sẽ phải đổi sân cho nhau để có sự công bằng trong vòng 1 phút .'
    )
    print(sentences)
    # rel_ranking('', sentences)
    sentences = tokenize_sentence(
        'Cũng trong thập niên 1850 , các đội bóng nghiệp dư bắt đầu được thành lập và thường mỗi đội xây dựng cho riêng họ những luật chơi mới của môn bóng đá , trong đó đáng chú ý có câu lạc bộ Sheffield F.C . Việc mỗi đội bóng có luật chơi khác nhau khiến việc điều hành mỗi trận đấu giữa họ diễn ra rất khó khăn .'
    )
    print(sentences)
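generateNgram and sentence_score are not included in this snippet. From the call sites, generateNgram(text, n, connector, exclude_set) appears to produce word n-grams joined by the connector while skipping excluded tokens; a minimal sketch under that assumption (hypothetical, not the repository's implementation):

def generateNgram(text, n, connector='_', exclude_set=None):
    # Hypothetical helper: build word n-grams of `text`, joining the words of
    # each n-gram with `connector` and dropping tokens found in `exclude_set`.
    exclude_set = exclude_set or set()
    words = [w for w in text.split() if w not in exclude_set]
    return [connector.join(words[i:i + n]) for i in range(len(words) - n + 1)]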
Example #14
    def __init__(self, file):
        self.sens = tokenize_sentence(file)
        self.freq_words = get_freq_word(file)
Example #15
    def tokens_list_set(self):
        tokenized_goods_list = []
        for sent in self.nameList:
            tokenized = tokenize_sentence(sent)
            tokenized_goods_list.append(tokenized)
        return tokenized_goods_list
Example #16
def new_main():
    content = None
    with open('./raw/train.json') as f:
        content = f.read()
    data = json.loads(content)
    countd, countm = 0, 0
    ml = 0

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                              cache_dir='./models',
                                              do_lower_case=False)
    for q in data:
        words = tokenizer.tokenize(q['text'])
        ml = len(words) if len(words) > ml else ml
        if len(words) > MAX_SEQ_LENGTH:
            sentences = tokenize_sentence(q['text'])
            sentences = [s for s in sentences if len(s) > 3]
            if len(sentences) < 2:
                # sentence splitting failed; fall back to a naive
                # fixed-ratio split of the raw text
                ratio = math.ceil(len(words) * 1.0 / MAX_SEQ_LENGTH)
                # uwords = word_tokenize(q['text'])
                # dummy = q['text'].split(' ')
                # steps = math.ceil(len(dummy)/ratio)
                # r = []
                # for i in range(0, ratio):
                #     r.append(' '.join(dummy[i*steps:((i*steps) + steps) if i < (ratio - 1) else len(dummy)]))
                sentences = dummy_split(q['text'], ratio)

            if q['label'] is True:
                # rank candidate sentences by relevance to the question
                sentences = rel_ranking(q['question'], sentences)
                relcount = sum([1 if v[0] > 0 else 0 for v in sentences])
                delta = sentences[0][0] - sentences[1][0]
                if relcount == 1 or delta > 4:
                    q['sentences'] = [(v[1], 1 if idx == 0 else 0)
                                      for idx, v in enumerate(sentences)]
                else:
                    if len(
                            tokenizer.tokenize(
                                sentences[0][1] + ' . ' +
                                sentences[1][1])) < MAX_SEQ_LENGTH:
                        q['sentences'] = [
                            (sentences[0][1] + ' . ' + sentences[1][1], 1)
                        ]
                        q['sentences'] += [(s[1], 0) for s in sentences[2:]]
                    else:
                        # accept the risk
                        q['sentences'] = [(v[1], 1 if idx == 0 else 0)
                                          for idx, v in enumerate(sentences)]
            else:
                q['sentences'] = [(sen, 0) for sen in sentences]

    with open('./data/train.tsv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(
            ('id', 'q_id', 'ans_id', 'question', 'answer', 'is_correct'))
        for idx, q in enumerate(data):
            if 'sentences' in q:
                q_id = q['id']
                for aidx, sen in enumerate(q['sentences']):
                    # writer.write(())
                    txt, is_correct = sen
                    ans_id = q_id + '_ans_' + str(aidx)
                    writer.writerow((str(idx), q_id, ans_id, q['question'],
                                     txt.replace('\t', ' '), str(is_correct)))
            else:
                is_correct = 1 if q['label'] else 0
                q_id = q['id']
                ans_id = q_id + '_ans_0'
                writer.writerow((str(idx), q_id, ans_id, q['question'],
                                 q['text'].replace('\t',
                                                   ' '), str(is_correct)))