def load_dataset(sentences, tags, word_to_id, tag_to_id, lower=False):
    """
    Prepare the dataset. Return a list of dictionaries, one per sentence,
    containing:
        - word indexes
        - capitalization features
        - tag indexes
    """
    def f(x):
        return x.lower() if lower else x

    data = []
    for i in range(len(sentences)):
        str_words = sentences[i]
        # Map each word to its index, falling back to '<UNK>' for OOV words.
        words = [word_to_id[f(w) if f(w) in word_to_id else '<UNK>']
                 for w in str_words]
        # cap_feature is expected to be defined elsewhere in the module.
        caps = [cap_feature(w) for w in str_words]
        tag_ids = [tag_to_id[t] for t in tags[i]]
        data.append({
            'str_words': str_words,
            'words': words,
            'caps': caps,
            'tags': tag_ids,
            # 'pos' reuses the tag indexes.
            'pos': tag_ids,
        })
    return data

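# A minimal usage sketch for load_dataset with toy vocabularies and a trivial
# stand-in for cap_feature (the real cap_feature lives elsewhere in the project).
def cap_feature(word):
    return 1 if word[:1].isupper() else 0

word_to_id = {'<UNK>': 0, 'john': 1, 'lives': 2, 'here': 3}
tag_to_id = {'O': 0, 'B-PER': 1}
dataset = load_dataset([['John', 'lives', 'here']], [['B-PER', 'O', 'O']],
                       word_to_id, tag_to_id, lower=True)
# dataset[0]['words'] -> [1, 2, 3]; dataset[0]['tags'] -> [1, 0, 0]
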
def preprocess(self, path, draft):
    output = []
    stopwords = [' ', '\n', '\u3000', '\u202f', '\u2009']

    # Load the raw JSON-lines file (one JSON object per line).
    data = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))

    # In draft mode, keep only the first topic for a quick smoke test.
    if draft:
        data[0]['data'] = data[0]['data'][:1]

    for topic in data[0]['data']:
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            tokens = word_tokenize(context)
            for qa in paragraph['qas']:
                qid = qa['id']
                question = qa['question']
                for ans in qa['answers']:
                    answer = ans['text']
                    s_idx = ans['answer_start']
                    e_idx = s_idx + len(answer)

                    # Map the character-level answer span onto token indexes
                    # by walking through the context alongside the tokens.
                    l = 0
                    s_found = False
                    for i, t in enumerate(tokens):
                        # Skip whitespace characters between tokens.
                        while l < len(context):
                            if context[l] in stopwords:
                                l += 1
                            else:
                                break
                        # word_tokenize rewrites double quotes as '' / ``,
                        # so undo that when the context uses straight quotes.
                        if t[0] == '"' and context[l:l + 2] == '\'\'':
                            t = '\'\'' + t[1:]
                        elif t == '"' and context[l:l + 2] == '\'\'':
                            t = '\'\''
                        l += len(t)
                        if l > s_idx and not s_found:
                            s_idx = i
                            s_found = True
                        if l >= e_idx:
                            e_idx = i
                            break

                    output.append({'qid': qid,
                                   'context': context,
                                   'question': question,
                                   'answer': answer,
                                   'start_idx': s_idx,
                                   'end_idx': e_idx})

    # Write the flattened examples back out as JSON lines ("<path>l").
    with open('{}l'.format(path), 'w', encoding='utf-8') as f:
        for line in output:
            json.dump(line, f)
            print('', file=f)

def process_file(f, encoding='utf8'):
    """Convert raw People's Daily style corpus lines into BIO-tagged data.

    Args:
        f[str] : raw data file (iterated line by line)
        encoding[str] : encoding (default utf8)
    """
    data = []
    tag = []
    for line in f:
        x = ''
        y = ''
        nt_flag = False
        nr_word = ''
        nt_word = ''
        # Ignore the leading document ID.
        for pair in line.strip().split()[1:]:
            word = pair.split('/')[0]
            # Split sentences on the token '。'.
            if word == u'。' and len(x) > 0:
                data.append(x + '\n')
                tag.append(y.strip() + '\n')
                x = ''
                y = ''
                continue
            # Start of a bracketed nt compound: [w1/x w2/y]nt
            if pair.startswith('['):
                nt_flag = True
                nt_word = word[1:]
                continue
            if nt_flag:
                # Keep buffering words until the closing ']nt' pair.
                if not pair.endswith(']nt'):
                    nt_word += word
                    continue
            # Process nr (person) tags: buffer consecutive nr words, then
            # flush them as a single B-PER/I-PER span.
            if pair.endswith('nr'):
                nr_word += word
                continue
            elif len(nr_word) > 0:
                x += nr_word
                y += ' B-PER' + ' I-PER' * (len(nr_word) - 1)
                nr_word = ''
            # Process nt (organization) tags.
            if pair.endswith('nt'):
                if pair.endswith(']nt'):
                    word = nt_word + word
                    nt_flag = False
                x += word
                y += ' B-ORG' + ' I-ORG' * (len(word) - 1)
            # Process ns (location) tags.
            elif pair.endswith('ns'):
                x += word
                y += ' B-LOC' + ' I-LOC' * (len(word) - 1)
            else:
                x += word
                y += ' O' * len(word)
        if len(x) > 0:
            data.append(x + '\n')
            tag.append(y.strip() + '\n')
    return data, tag

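# A small sketch of process_file on a single made-up corpus line in the
# 1998 People's Daily format (document ID, then word/POS pairs).
sample = ['19980101-01-001-001/m 张三/nr 在/p 北京/ns 。/w']
data, tag = process_file(sample)
# data -> ['张三在北京\n']
# tag  -> ['B-PER I-PER O B-LOC I-LOC\n']
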
def get_test_data():
    val_data = pd.read_csv('./Dataset/test.csv')
    # Pair each content with its label so they stay aligned while shuffling.
    data = []
    for content, label in zip(val_data['content'], val_data['label']):
        data.append((content, label))
    random.shuffle(data)
    contents, true_labels = [], []
    for content, label in data:
        contents.append(content)
        true_labels.append(label)
    return contents, true_labels

def json_to_csv(file, output_file):
    # Read one JSON object per line.
    data = []
    with open(file) as f:
        for line in f:
            data.append(json.loads(line))
    df = pd.DataFrame.from_records(data)[[
        'sentence1', 'sentence2', 'gold_label'
    ]]
    # values_dict (defined elsewhere) maps label strings to integer ids;
    # rows with any other label are dropped.
    df['gold_label'] = df['gold_label'].map(values_dict)
    df = df[df['gold_label'].isin([0, 1, 2])]
    df.to_csv(output_file, index=False)

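# A minimal usage sketch for json_to_csv; the file names are placeholders for
# an SNLI-style .jsonl input, and the module-level values_dict is assumed to
# map gold labels to integers, e.g.
# values_dict = {'entailment': 0, 'neutral': 1, 'contradiction': 2}.
json_to_csv('snli_1.0_dev.jsonl', 'snli_dev.csv')
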
def preprocess_data(input_list, type="train", letter_width=0.02, hop_length=400):
    # Run the single-entry preprocessing over every item with the same settings.
    data = []
    for i in input_list:
        data.append(
            preprocess_data_single_entry(i,
                                         type=type,
                                         letter_width=letter_width,
                                         hop_length=hop_length))
    return data

def add_padding(data, max_length, index_to_pad, index_to_eos):
    '''
    Pad a sequence of token indexes to a fixed length.
    :param data: list of token indexes
    :param max_length: maximum number of tokens per line
    :param index_to_pad: index of the pad token
    :param index_to_eos: index of the end-of-sentence token
    :return: list of length max_length, ending with the eos index
    '''
    # Truncate to max_length - 1 to leave room for the eos token at the end.
    data = data[:max_length - 1]
    padding_len = (max_length - 1) - len(data)
    assert padding_len >= 0, 'sequence longer than max_length - 1 after truncation'
    data.extend([index_to_pad] * padding_len)
    data.append(index_to_eos)
    return data

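# A minimal usage sketch for add_padding, assuming pad index 0 and eos index 1.
padded = add_padding([5, 6, 7], max_length=6, index_to_pad=0, index_to_eos=1)
# padded -> [5, 6, 7, 0, 0, 1]
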
def user_builder(df):
    '''Given the flattened dataframe, output a user-based collection of data.'''
    data = []
    user_ids = df.user_id.unique()
    # tnrange is tqdm's notebook progress-bar range.
    for i in tnrange(len(user_ids)):
        user = user_ids[i]
        sub_df = df.loc[df['user_id'] == user].reset_index(drop=True)
        country = sub_df['country_id'][0]
        join_date = sub_df['joining_date'][0]
        # Order the user's sessions chronologically and pair each visited
        # city with its timestamp.
        sorted_df = sub_df.sort_values('unix_timestamp').reset_index(drop=True)
        history = sorted_df['cities'].tolist()
        timestamps = sorted_df['unix_timestamp'].tolist()
        history = [(timestamps[j], history[j]) for j in range(len(sorted_df))]
        data.append((user, country, join_date, history))
    return data

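# A small sketch of user_builder on a toy sessions DataFrame; the column names
# below (user_id, country_id, joining_date, cities, unix_timestamp) are assumed
# to match the real flattened data.
toy = pd.DataFrame({
    'user_id': [7, 7],
    'country_id': ['FR', 'FR'],
    'joining_date': ['2015-01-01', '2015-01-01'],
    'cities': ['Paris', 'Lyon'],
    'unix_timestamp': [200, 100],
})
users = user_builder(toy)
# users -> [(7, 'FR', '2015-01-01', [(100, 'Lyon'), (200, 'Paris')])]
# (roughly; the ids come back as numpy scalars)
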
def data_process(src_path, tgt_path, src_tokenizer, trg_tokenizer, src_vocab,
                 trg_vocab):
    src_iter = iter(io.open(src_path, encoding="utf8"))
    trg_iter = iter(io.open(tgt_path, encoding="utf8"))
    data = []
    # Walk the two files in parallel: one source line per target line.
    for raw_src, raw_trg in zip(src_iter, trg_iter):
        raw_src, raw_trg = preprocess(raw_src), preprocess(raw_trg)
        # Tokenize, look up vocabulary ids, and store each pair as long tensors.
        src_tensor = torch.tensor(
            [src_vocab[token] for token in src_tokenizer(raw_src)],
            dtype=torch.long)
        trg_tensor = torch.tensor(
            [trg_vocab[token] for token in trg_tokenizer(raw_trg)],
            dtype=torch.long)
        data.append((src_tensor, trg_tensor))
    return data

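# A rough usage sketch for data_process with whitespace tokenizers and plain
# dict vocabularies; the file paths are placeholders and the preprocess()
# helper used inside is assumed to be defined elsewhere in the module.
src_vocab = {'hello': 0, 'world': 1}
trg_vocab = {'hallo': 0, 'welt': 1}
pairs = data_process('train.en', 'train.de',
                     str.split, str.split, src_vocab, trg_vocab)
# For the line pair "hello world" / "hallo welt":
# pairs[0] -> (tensor([0, 1]), tensor([0, 1]))
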
def evaluate(model, data_iter, tipe, epoch, fold):
    model.eval()
    corrects, avg_loss = 0, 0
    data_iter.sort_key = lambda x: len(x.text)
    for batch in data_iter:
        text_numerical, target = batch.text, batch.label
        if args.cuda:
            text_numerical, target = text_numerical.cuda(), target.cuda()
        # Transpose to batch-first and shift labels from 1-based to 0-based.
        text_numerical.data.t_()
        target.data.sub_(1)
        forward = model(text_numerical)
        # Sum per-example losses here; they are averaged over the dataset below.
        loss = F.cross_entropy(forward, target, reduction='sum')
        avg_loss += loss.item()
        corrects += (torch.max(forward, 1)[1].view(
            target.size()).data == target.data).sum()
    size = len(data_iter.dataset)
    avg_loss = avg_loss / size
    accuracy = 100.0 * corrects / size
    # `args`, `output` and `data` are module-level objects: the parsed
    # command-line flags and the containers that collect results across
    # folds and epochs.
    output['fold_{}'.format(fold + 1)]['epoch_{}'.format(
        epoch)][tipe]['avg_loss'] = avg_loss
    if tipe == 'testing':
        if args.cuda:
            data.append(accuracy.item())
        else:
            data.append(accuracy)
    return target.data, torch.max(forward, 1)[1].view(target.size()).data

                    type=int,
                    default=1,
                    help='number of ngrams to be selected in each batch [default: 1]')
parser.add_argument('-threshold',
                    type=float,
                    default=0.9,
                    help='threshold for selecting ngram [default: 0.9]')
args = parser.parse_args()

# load up data
data = []
with open('/mnt/storage01/milliet/data/ngrams/clean-ngrams-score-9500.csv',
          'r') as csvfile:
    lines = csvfile.readlines()
    for line in lines:
        # Fields are separated by the literal token '\sep'.
        data.append(line.split(r'\sep'))

# Keep only the n-grams whose score (last column) exceeds the threshold.
threshold = 0.9999
bests = [elem for elem in data if float(elem[-1]) > threshold]
print("N-grams with score > " + str(threshold) + ": " + str(len(bests)))

if len(bests) > 0:
    args.summary_dir = os.path.join(
        args.save_dir,
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + "threshold_" +
        str(threshold) + "_model_dict_" + args.state_dict)
    writer = SummaryWriter(log_dir=args.summary_dir)
    # The second column holds the n-gram tokens as a bracketed string; strip
    # the surrounding characters and split it into a token list.
    x_data = [re.sub(r"\s+", ",", elem[1][2:-2]).split(',') for elem in bests]

def augmentation(data):
    '''Oversample reviews by shuffling their tokens, depending on the rating class.'''
    augmented = []
    aug_df = pd.DataFrame({'summary': [], 'rating': []})
    reps = []
    for index, game in data.iterrows():
        genre = game['rating']
        # The number of shuffled copies depends on how rare the class is:
        # rare classes get many word-level shuffles, the rest a few
        # sentence-level shuffles.
        if genre == 'Dislike':
            s = 38
            tok = tokenize_word(game['summary'])
        elif genre == 'Acclaim':
            s = 14
            tok = tokenize_word(game['summary'])
        else:
            s = 3
            tok = tokenize_sent(game['summary'])
        # Generate s new reviews by repeatedly shuffling the previous copy.
        shuffled = [tok]
        for i in range(s):
            shuffled.append(shuffle_tokenized(shuffled[-1]))
        for k in shuffled:
            # Create a new review by joining the shuffled tokens.
            new_game = ' '.join(k)
            if new_game not in augmented:
                augmented.append(new_game)
                aug_df = pd.concat([aug_df,
                                    pd.DataFrame({'summary': [new_game],
                                                  'rating': [genre]})])
            else:
                reps.append(new_game)
    # Return the original data with the augmented rows appended.
    return pd.concat([data, aug_df])

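# A tiny sketch of augmentation on a two-row frame, using stand-in tokenizers;
# the real tokenize_word / tokenize_sent / shuffle_tokenized are defined
# elsewhere in the project.
import random

def tokenize_word(text):
    return text.split()

def tokenize_sent(text):
    return text.split('. ')

def shuffle_tokenized(tokens):
    tokens = list(tokens)
    random.shuffle(tokens)
    return tokens

toy = pd.DataFrame({'summary': ['great game, loved it', 'boring and buggy'],
                    'rating': ['Acclaim', 'Dislike']})
augmented_df = augmentation(toy)  # original rows plus shuffled copies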