Example #1
def remove_stop_postag(dataset, output_dir):
    utils.mkdir(output_dir)
    stack = os.listdir(dataset)
    # print 'loading data in ' + dataset
    total_doc = 0
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):  # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as fr:
                data = unicodedata.normalize('NFKC', fr.read().strip())
                original_content = tokenizer.predict(data)
                content = map(lambda x: ViPosTagger.postagging(x),
                              spliter.split(original_content))
                clean_content = []
                for info in content:
                    sen = []
                    for i in range(len(info[0])):
                        if is_exist(info[1][i]):
                            sen.append(info[0][i])
                    clean_content.append(u' '.join(sen))
                with open(os.path.join(output_dir, os.path.basename(file_name)),
                          'w', encoding='utf-8') as fw:
                    if len(clean_content) > 0:
                        fw.write(u'\n'.join(clean_content))
                    else:
                        fw.write(original_content)
                total_doc += 1
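Every example in this listing walks the dataset with the same stack-based traversal and relies on two project helpers, utils.mkdir and utils.push_data_to_stack, that are not shown. Judging from how they are used (popped names are re-joined onto the dataset root with os.path.join), they probably look roughly like the sketch below; the bodies are assumptions, not the project's actual code, and the snippets also assume the usual imports (os, sys, unicodedata) plus the project-specific modules (tokenizer, my_map, ViPosTagger, ...).

import os

def mkdir(path):
    # create the directory (and any parents) if it does not exist yet
    if not os.path.exists(path):
        os.makedirs(path)

def push_data_to_stack(stack, dir_path, dir_name):
    # push the children of dir_path onto the stack as paths relative to the
    # dataset root, so that os.path.join(dataset, popped_name) keeps resolving
    for child in os.listdir(dir_path):
        stack.append(os.path.join(dir_name, child))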
Example #2
def load_dataset_from_disk(dataset):
    list_samples = {k: [] for k in my_map.name2label.keys()}
    print(list_samples)
    print('load_data in ' + dataset)

    # return list file and folder in dir
    stack = os.listdir(dataset)

    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        # if file_path is a directory, push its contents onto the stack
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('%s' % file_path)
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-16') as fp:

                content = unicodedata.normalize('NFKC', fp.read())

                # tokenize the content
                content = r.run(tokenizer.predict(content))
                # the containing directory name is the class label
                dir_name = utils.get_dir_name(file_path)
                list_samples[dir_name].append(content)
    print('')
    return list_samples
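load_dataset_from_disk returns a dict mapping each label name (the sub-folder name) to the list of tokenized documents found under it. A typical next step is to flatten that dict into parallel text/label lists for a classifier; a minimal sketch, assuming my_map.name2label maps label names to integer ids as the snippet suggests:

def flatten_samples(list_samples):
    # turn {label_name: [doc, ...]} into parallel texts X and label ids y
    X, y = [], []
    for name, docs in list_samples.items():
        X.extend(docs)
        y.extend([my_map.name2label[name]] * len(docs))
    return X, y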
Example #3
def load_document_content(dataset, documents):
    stack = os.listdir(dataset)
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):  # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                try:
                    raw_content = unicodedata.normalize(
                        'NFKC',
                        f.read().strip()).split(u'\n')
                    new_content = []
                    for i, sen in enumerate(raw_content):
                        if i == 0:
                            # highlight title
                            sen = u'<h2>' + sen + u'</h2>'
                        elif i == 1:
                            sen = u'<h5>' + sen + u'</h5>'
                        else:
                            sen = sen + u'<br>'
                        new_content.append(sen)
                    documents.update(
                        {raw_content[0].lower(): u'\n'.join(new_content)})
                except Exception:  # skip documents that cannot be parsed
                    continue
Example #4
def build_vocab(dataset, output_vocab, root_dir, title_map):
    vectorizer = TfidfVectorizer(ngram_range=(1, 1),
                                 max_df=0.6,
                                 min_df=1,
                                 stop_words=utils.load_data_from_list(
                                     os.path.join(root_dir, 'stopwords.txt')))
    stack = os.listdir(dataset)
    contents = []
    titles = []
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):  # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                base = os.path.basename(file_name)
                titles.append(title_map[base])
                contents.append(content.lower())
    # change vectorizer to ensure length of document greater than 0
    if len(contents) < 50:
        vectorizer.max_df = 1.0
    vectorizer.fit(contents)
    with open(output_vocab, 'w', encoding='utf-8') as f:
        f.write(u'\n'.join(vectorizer.vocabulary_.keys()))
    return contents, titles
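The vocabulary file written by build_vocab can be fed back into a fresh TfidfVectorizer through its vocabulary parameter, which keeps later transforms consistent with the saved word list. A minimal sketch (the path argument and helper name are illustrative); note that fit or fit_transform still has to be called on a corpus to compute the idf weights:

from sklearn.feature_extraction.text import TfidfVectorizer

def vectorizer_from_vocab(vocab_path):
    # read one term per line, as written by build_vocab, and pin the vocabulary
    with open(vocab_path, 'r', encoding='utf-8') as f:
        terms = [line.strip() for line in f if line.strip()]
    return TfidfVectorizer(ngram_range=(1, 1), vocabulary=terms)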
Example #5
def update_title_map(dataset, title_map):
    stack = os.listdir(dataset)
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):  # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = unicodedata.normalize('NFKC', f.read().strip())
                title = data.lower().split(u'\n')[0]
                base = os.path.basename(file_name)
                title_map.update({base: title})
Example #6
def load_data_from_file(dataset):
    data = {}
    stack = os.listdir(dataset)
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            img = cv2.imread(file_path)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            loan_id = file_name.split('_')[2].split('.')[0]
            data[loan_id] = gray
    return data
Example #7
def load_data_from_file(dataset):
    data = {}
    stack = os.listdir(dataset)
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            img = cv.imread(file_path)
            loan_id = file_name.split('_')[2].split('.')[0]
            img = padding_img(img, max_width, max_height)
            img = padding_img(img, final_width, final_height)
            data[loan_id] = img
    return data
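Example #7 calls a padding_img helper (and the max_width/max_height and final_width/final_height globals) that are not part of the listing. From the call site it looks like a pad-to-size routine; a hedged sketch using cv2.copyMakeBorder, where the white border colour and the decision not to resize are assumptions:

import cv2 as cv

def padding_img(img, target_width, target_height):
    # pad the image on the right/bottom with a constant white border up to the
    # target size; images already at or above the target are left unchanged
    h, w = img.shape[:2]
    bottom = max(target_height - h, 0)
    right = max(target_width - w, 0)
    return cv.copyMakeBorder(img, 0, bottom, 0, right,
                             cv.BORDER_CONSTANT, value=(255, 255, 255))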
Example #8
def load_dataset(dataset):
    list_samples = {k: [] for k in my_map.name2label.keys()}
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print(file_path)
            with open(file_path, 'r', encoding='utf-16') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
                content = r.run(tokenizer.predict(content))
                dir_name = utils.get_dir_name(file_path)
                list_samples[dir_name].append(content)
    return list_samples
Example #9
def parse_training_data(dataset, output):
    docs = []
    utils.mkdir(output)
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):  # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:  # otherwise read the file
            with open(file_path, 'r', encoding='utf-8') as ff:
                content = ff.read()
                bs = BeautifulSoup(content, 'html.parser')
                docs.append([bs.text])
                # with open(output + '/' + file_name, 'w', encoding='utf-8') as f:
                #     f.write(bs.text)
    return docs
Example #10
def load_data_from_file(dataset):
    data = {}
    stack = os.listdir(dataset)
    max_h, max_w = 0, 0
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            img = cv.imread(file_path)
            loan_id = file_name.split('_')[2].split('.')[0]
            data[loan_id] = img
            if img.shape[0] > max_h:
                max_h = img.shape[0]
            if img.shape[1] > max_w:
                max_w = img.shape[1]
    # print(f'{max_h} {max_w}')
    return data
def count_tokens():
    print('count tokens...')
    statistic = {name: {} for name in my_map.name2label.keys()}
    stack = os.listdir(tokenized_dataset)
    print('loading data in ' + tokenized_dataset)
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(tokenized_dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % file_path, end='')
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-8') as fp:
                label = utils.get_dir_name(file_path)
                for sen in fp:
                    sen = sen.strip()
                    tag = ViPosTagger.postagging(sen)
                    tokens = [
                        tag[0][i] for i in range(len(tag[0]))
                        if tag[1][i] == u'N'
                    ]
                    update_count_tokens(statistic, label, tokens)
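count_tokens delegates to an update_count_tokens helper that is not shown in the listing. From the call site it appears to accumulate per-label noun frequencies; a hedged sketch, not the project's actual code:

def update_count_tokens(statistic, label, tokens):
    # bump the per-label frequency of every noun token found in the sentence
    counter = statistic[label]
    for token in tokens:
        counter[token] = counter.get(token, 0) + 1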
def tokenizer_dataset():
    utils.mkdir(tokenized_dataset)
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % file_path, end='')
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-16') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
                content = r.run(tokenizer.predict(content))
                dir_name = utils.get_dir_name(file_path)
                output_dir = os.path.join(tokenized_dataset, dir_name)
                utils.mkdir(output_dir)
                name = os.path.basename(file_path)
                with open(os.path.join(output_dir, name),
                          'w',
                          encoding='utf-8') as fw:
                    fw.write(content)
    print('')
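tokenizer_dataset writes the tokenized corpus into tokenized_dataset, which count_tokens then reads, so a plausible driver (the order is implied by the code, the __main__ guard is an assumption) would be:

if __name__ == '__main__':
    tokenizer_dataset()   # tokenize the raw dataset into tokenized_dataset
    count_tokens()        # then collect per-label noun statistics from it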