Example #1
    def __init__(self,
                 fname_parties,
                 fname_ngram,
                 ngram_min_freq=3,
                 train_ratio=0.8,
                 train=True):
        # TODO:
        # 1. shuffle
        # 2. split into train and test
        all_dict = csv2dictlist(fname_parties)

        self.data = []
        self.label = []
        for dic in all_dict:
            clean_text = dic['clean_text']
            press = dic['press']

            if clean_text == '':
                continue

            self.data.append(clean_text)
            # label 0 for 더불어민주당 (Democratic Party of Korea), 1 otherwise
            self.label.append(0 if press == '더불어민주당' else 1)

        self.bigram_cvec = BigramCounterVector(fname_ngram,
                                               min_freq=ngram_min_freq)
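The TODO above (shuffle, then split into train and test) is left unimplemented, and the train_ratio and train parameters are never used. A minimal sketch of how the constructor could be completed, assuming the intent is to keep only the train or only the test slice of self.data and self.label (this is an illustration, not code from the original project):

        # hypothetical completion of the TODO: shuffle data and labels together,
        # then keep the train or test slice (assumes `import random` at module level)
        paired = list(zip(self.data, self.label))
        random.shuffle(paired)
        num_train = int(len(paired) * train_ratio)
        kept = paired[:num_train] if train else paired[num_train:]
        self.data = [text for text, _ in kept]
        self.label = [label for _, label in kept]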
Example #2
    def __init__(self, fname, min_freq):
        """Read a ngram file and load ngram words that are over min_freq """
        # todo: Error check if fname has csv ext or not
        self.fname = fname
        self.min_freq = min_freq

        self.bigram_list = csv2dictlist(fname=fname)
        self.cvec_words = []
        for bigram_dict in self.bigram_list:
            if int(bigram_dict['frequency']) >= min_freq:
                bigram_tup = tuple(bigram_dict['2gram word'].split())
                self.cvec_words.append(bigram_tup)

        # one extra slot at the end (presumably for bigrams outside cvec_words)
        self.cvec = [0] * (len(self.cvec_words) + 1)
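Example #2 only loads the bigram vocabulary and allocates an empty count vector; the method that actually fills self.cvec is not shown here. A hypothetical vectorize method (an assumption, not part of the original class) could count bigram occurrences in a text like this:

    def vectorize(self, text):
        """Hypothetical helper: count how often each loaded bigram occurs in `text`.
        Bigrams outside cvec_words are counted in the last (+1) slot."""
        counts = [0] * (len(self.cvec_words) + 1)
        tokens = text.split()
        for bigram in zip(tokens, tokens[1:]):
            if bigram in self.cvec_words:
                counts[self.cvec_words.index(bigram)] += 1
            else:
                counts[-1] += 1
        return counts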
Example #3
import os
import csv

def save_clean_text(press: str, dirname: str, out_dirname: str):
    # get file name
    fname = get_fname_sewolho(press, dirname)

    # open file and get dict list
    total_news = csv2dictlist(fname)

    # execute clean text
    for news in total_news:
        news['clean_text'] = clean_text(news['text'])

    # determine output file name
    out_fname = os.path.basename(fname)
    out_fname = out_fname.replace('.csv', '_clean.csv')
    out_fname = os.path.join(out_dirname, out_fname)
    if not os.path.isdir(out_dirname):
        os.makedirs(out_dirname)

    # save to out_fname
    with open(out_fname, 'w', newline='', encoding='utf-8-sig') as csvoutput:
        writer = csv.DictWriter(csvoutput, fieldnames=total_news[0].keys())
        writer.writeheader()
        writer.writerows(total_news)
        print('saved to [' + out_fname + ']')
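A call to this function might look as follows; the press name and directory paths are placeholders, not values taken from the original project:

save_clean_text(press='hankyoreh', dirname='data/sewolho', out_dirname='data/sewolho_clean')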
Example #4
def get_texts(fname: str):
    news = csv2dictlist(fname)
    texts = [line['text'] for line in news]
    return texts
Example #5
def get_cleantexts(fname: str):
    news = csv2dictlist(fname)
    clean_texts = [line['clean_text'] for line in news]
    return clean_texts
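Both helpers simply pull one column out of a CSV produced earlier in the pipeline. An illustrative usage (the clean file name follows the _clean.csv convention from Example #3 and is an assumption):

texts = get_texts('data/train/parties_merged.csv')
clean_texts = get_cleantexts('data/train/parties_merged_clean.csv')
print(len(texts), len(clean_texts))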
Example #6
from tools import csv2dictlist, dictlist2csv
import random

# open the data file and convert it to a list of dicts
all_dict = csv2dictlist('data/train/parties_merged.csv')

# shuffle data randomly
random.shuffle(all_dict)

# calculate the number of train and test samples
total = len(all_dict)
num_train = int(total * 0.8)
num_test = total - num_train
print('total:', total)
print('num_train:', num_train)
print('num_test:', num_test)

# split the data into train and test sets
train = all_dict[:num_train]
test = all_dict[num_train:]

# save as csv file
dictlist2csv(dict_list=train, out_name='train.csv')
dictlist2csv(dict_list=test, out_name='test.csv')
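Because random.shuffle is seeded from system entropy, the train/test split above changes on every run. Seeding the generator before shuffling (an addition, not part of the original script) makes the split reproducible:

random.seed(42)  # fix the shuffle order so the same split is produced on every run
random.shuffle(all_dict)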