def impute_zip(df):
    geocoder = Geocoder('your-api-key')
    tqdm_notebook().pandas()
    df.loc[df["zipcode"].isnull(), "zipcode"] = df[df["zipcode"].isnull()].progress_apply(
        lambda row: geocoder.reverse_geocode(row['latitude'], row['longitude'])[0].postal_code,
        axis=1)
    return df
def initialize(cls, shm_size_mb=SHM_SIZE_MB, nb_workers=NB_WORKERS,
               progress_bar=False):
    """
    Initialize Pandarallel shared memory.

    Parameters
    ----------
    shm_size_mb : int, optional
        Size of Pandarallel shared memory

    nb_workers : int, optional
        Number of workers used for parallelisation

    progress_bar : bool, optional
        Display a progress bar
        WARNING: Progress bar is an experimental feature.
        This can lead to a considerable performance loss.
    """
    print("New pandarallel memory created - Size:", shm_size_mb, "MB")
    print("Pandarallel will run on", nb_workers, "workers")

    if progress_bar:
        print("WARNING: Progress bar is an experimental feature. This can lead "
              "to a considerable performance loss.")
        tqdm_notebook().pandas()

    cls.__store_ctx = _plasma.start_plasma_store(int(shm_size_mb * 1e6))
    plasma_store_name, _ = cls.__store_ctx.__enter__()

    plasma_client = _plasma.connect(plasma_store_name)

    args = plasma_store_name, nb_workers, plasma_client

    _pd.DataFrame.parallel_apply = _DataFrame.apply(*args, progress_bar)
    _pd.DataFrame.parallel_applymap = _DataFrame.applymap(*args, progress_bar)
    _pd.Series.parallel_map = _Series.map(*args, progress_bar)
    _pd.Series.parallel_apply = _Series.apply(*args, progress_bar)
    _pd.core.window.Rolling.parallel_apply = _SeriesRolling.apply(*args, progress_bar)
    _pd.core.groupby.DataFrameGroupBy.parallel_apply = _DataFrameGroupBy.apply(*args)
    _pd.core.window.RollingGroupby.parallel_apply = _RollingGroupby.apply(*args)
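For context, a minimal sketch of how this initializer is used from the caller's side. This is pandarallel's public interface; the exact keyword arguments vary between library versions, and the DataFrame here is purely illustrative:

import pandas as pd
from pandarallel import pandarallel

# Patch pandas with the parallel_* methods registered above;
# progress_bar=True would enable the experimental tqdm_notebook-based bars.
pandarallel.initialize(nb_workers=4, progress_bar=False)

df = pd.DataFrame({"a": range(10)})
squares = df.parallel_apply(lambda row: row["a"] ** 2, axis=1)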
def validate_step(dl, model, id2label, sup_labels, id2cls=None):
    model.eval()
    idx = 0
    preds_cpu, targets_cpu = [], []
    preds_cpu_cls, targets_cpu_cls = [], []
    for batch in tqdm_notebook(dl, total=len(dl), leave=False):
        idx += 1
        labels_mask, labels_ids = batch[-2:]
        preds = model.forward(batch)
        if id2cls is not None:
            preds, preds_cls = preds
            preds_cpu_, targets_cpu_ = transformed_result_cls([preds_cls], [batch[-3]], id2cls)
            preds_cpu_cls.extend(preds_cpu_)
            targets_cpu_cls.extend(targets_cpu_)
        preds_cpu_, targets_cpu_ = transformed_result([preds], [labels_mask], id2label, [labels_ids])
        preds_cpu.extend(preds_cpu_)
        targets_cpu.extend(targets_cpu_)
    clf_report = flat_classification_report(targets_cpu, preds_cpu, labels=sup_labels, digits=3)
    if id2cls is not None:
        clf_report_cls = flat_classification_report([targets_cpu_cls], [preds_cpu_cls], digits=3)
        return clf_report, clf_report_cls
    return clf_report
def train_step(dl, model, optimizer, lr_scheduler=None, clip=None, num_epoch=1):
    model.train()
    epoch_loss = 0
    idx = 0
    for batch in tqdm_notebook(dl, total=len(dl), leave=False):
        idx += 1
        model.zero_grad()
        loss = model.score(batch)
        loss.backward()
        if clip is not None:
            _ = torch.nn.utils.clip_grad_norm(model.parameters(), clip)
        optimizer.step()
        optimizer.zero_grad()
        epoch_loss += loss.data.cpu().tolist()
        if lr_scheduler is not None:
            lr_scheduler.step()
        # torch.cuda.empty_cache()
    if lr_scheduler is not None:
        logging.info("\nlr after epoch: {}".format(lr_scheduler.lr))
    logging.info("\nepoch {}, average train epoch loss={:.5}\n".format(
        num_epoch, epoch_loss / idx))
def predict(dl, model, id2label, id2cls=None):
    model.eval()
    idx = 0
    preds_cpu = []
    preds_cpu_cls = []
    for batch, sorted_idx in tqdm_notebook(dl, total=len(dl), leave=False):
        idx += 1
        labels_mask, labels_ids = batch[-2:]
        preds = model.forward(batch)
        if id2cls is not None:
            preds, preds_cls = preds
            preds_cpu_ = transformed_result_cls([preds_cls], [preds_cls], id2cls, False)
            preds_cpu_cls.extend(preds_cpu_)
        bs = batch[0].shape[0]
        unsorted_mask = [0] * bs
        unsorted_pred = [0] * bs
        for idx, sidx in enumerate(sorted_idx):
            unsorted_pred[sidx] = preds[idx]
            unsorted_mask[sidx] = labels_mask[idx]
        preds_cpu_ = transformed_result([unsorted_pred], [unsorted_mask], id2label)
        preds_cpu.extend(preds_cpu_)
    if id2cls is not None:
        return preds_cpu, preds_cpu_cls
    return preds_cpu
def replace_func(input_file):
    p1 = re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')
    p2 = re.compile(r'[(][: @ . , ?!\s][)]')
    p3 = re.compile(r'[「『]')
    p4 = re.compile(
        r'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()0-9 , : ; \-\ \[\ \]\ ]'
    )
    p5 = re.compile('<.*?>')
    p6 = re.compile('–')
    sentence = ""
    numstopwords = 0
    replaced_file = open('data/' + 'preprocessed.txt', "w", encoding="utf8")
    with open('data/' + input_file, 'r', encoding="utf8") as source_f:
        print('data/' + input_file + " reading")
        for line in tqdm_notebook(source_f):
            line = p1.sub(r' ', line)
            line = p2.sub(r' ', line)
            line = p3.sub(r' ', line)
            line = p4.sub(r' ', line)
            line = p5.sub(r' ', line)
            line = p6.sub(r' ', line)
            line = line.lower()
            sentence += line
    sentence_list = sentence.split(' ')
    print('data/' + input_file + " to list")
    for word in sentence_list:
        word = WordNetLemmatizer().lemmatize(word)
        if not word in stopwords.words('english'):
            replaced_file.write(word + " ")
        else:
            numstopwords += 1
            if numstopwords % 100 == 0:
                print("number of stopwords", numstopwords)
    replaced_file.close()
def perform_regularised_cv(train, y_colname, grid, high_card_cols,
                           folds=5, metric=mean_absolute_error, model='XGBoost'):
    '''Performs grid-search cross-validation with support for regularised mean encoding

    Inputs:
        train: input data set
        y_colname: target column name
        grid: set of hyperparameters over which the model is to be tuned
        high_card_cols: categorical columns to consider for mean encoding
        folds: number of folds to be used for cross validation
    Outputs:
        all_scores: list of (params, average score) tuples
    '''
    kf = KFold(folds, random_state=0, shuffle=True)
    param_grid = ParameterGrid(grid)
    all_scores = []  # Store all scores
    for params in tqdm_notebook(param_grid):
        errors = []
        for train_idx, test_idx in kf.split(train):
            # Split data into train and test
            kf_train, kf_test = train.iloc[train_idx, :], train.iloc[test_idx, :]
            kf_train.reset_index(inplace=True, drop=True)
            kf_test.reset_index(inplace=True, drop=True)
            _, error, _, _ = train_model(params, kf_train, kf_test, y_colname,
                                         high_card_cols, valid=True, metric=metric,
                                         model='XGBoost')
            errors.append(error)
        avg_score = np.mean(errors)  # Average score over all folds
        all_scores.append((params, avg_score))
        rmsle = np.sqrt(avg_score)
        tqdm.write(f'Parameters: {params} RMSLE: {rmsle}')
    return all_scores
def preprocess(data):
    rows = data.shape[0] - params["time_steps"]
    cols = data.shape[1]
    x = np.zeros((rows, params["time_steps"], cols))
    y = np.zeros((rows,))

    for i in tqdm_notebook(range(rows)):
        x[i] = data[i:params["time_steps"] + i]
        y[i] = data[params["time_steps"] + i, 4]

    print(x.shape, y.shape)
    return x, y
def build_timeseries(mat, y_col_index):
    dim_0 = mat.shape[0] - TIME_STEPS
    dim_1 = mat.shape[1]
    x = np.zeros((dim_0, TIME_STEPS, dim_1))
    y = np.zeros((dim_0,))
    print("dim_0", dim_0)

    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:TIME_STEPS + i]
        y[i] = mat[TIME_STEPS + i, y_col_index]
    print("Time-series input and output shapes:", x.shape, y.shape)
    return x, y
def build_timeseries(mat, y_col_index):
    # y_col_index is the index of the column that acts as the output column
    # total number of time-series samples would be len(mat) - TIME_STEPS
    dim_0 = mat.shape[0] - TIME_STEPS
    dim_1 = mat.shape[1]
    x = np.zeros((dim_0, TIME_STEPS, dim_1))
    y = np.zeros((dim_0,))

    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:TIME_STEPS + i]
        y[i] = mat[TIME_STEPS + i, y_col_index]
    print("length of time-series i/o", x.shape, y.shape)
    return x, y
def build_timeseries(mat, time_step=50, debug=False):
    """
    Converts a 2-dimensional array into a 3-dimensional windowed array.

    :param mat: The dataframe input
    :param time_step: The number of time steps used for the 2nd dimension
    :param debug: Boolean; if true, print the shape of the array returned
    :return: A three-dimensional array.
    """
    dim_0 = mat.shape[0] - time_step
    dim_1 = mat.shape[1]
    x = np.zeros((dim_0, time_step, dim_1))

    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:time_step + i]
    if debug:
        print("length of time-series i/o", x.shape)
    return x
def build_timeseries(mat, y_col_index):
    """
    Converts an ndarray into timeseries (supervised) format.
    Takes the first TIME_STEPS rows as input and sets the (TIME_STEPS+1)-th
    row as the corresponding output, and so on.

    :param mat: ndarray which holds the dataset
    :param y_col_index: index of the column which acts as output
    :return: two ndarrays -- input and output -- in a format suitable to feed to an LSTM.
    """
    # total number of time-series samples would be len(mat) - TIME_STEPS
    dim_0 = mat.shape[0] - TIME_STEPS
    dim_1 = mat.shape[1]
    x = np.zeros((dim_0, TIME_STEPS, dim_1))
    y = np.zeros((dim_0,))
    print("dim_0", dim_0)

    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:TIME_STEPS + i]
        y[i] = mat[TIME_STEPS + i, y_col_index]
    print("length of time-series i/o", x.shape, y.shape)
    return x, y
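A minimal usage sketch for the build_timeseries variants above, on synthetic data. It assumes one of the functions (and its tqdm_notebook import) is in scope; TIME_STEPS is the module-level constant the function reads, set to 60 here only for illustration:

import numpy as np

TIME_STEPS = 60
mat = np.random.rand(500, 5)          # 500 time steps, 5 features (synthetic)

x, y = build_timeseries(mat, y_col_index=4)
# x.shape == (440, 60, 5): each x[i] is a 60-step window of all features
# y.shape == (440,): y[i] is column 4 at the step immediately after that window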
def create_corpus(df):
    '''
    Create a corpus consisting of the words present in the text documents,
    keeping only the most important words according to tf-idf.
    '''
    corpus_row = df.TRANS_CONV_TEXT.values
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=70)
    vectorizer.fit_transform(corpus_row)
    imp_words = vectorizer.get_feature_names()

    text = df.TRANS_CONV_TEXT.values
    corpus = []
    for tweet in tqdm_notebook(text):
        words = [
            word.lower() for word in word_tokenize(tweet)
            if word.isalpha() and (word in imp_words)
        ]
        corpus.append(list(set(words)))
    return corpus
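A small usage sketch under stated assumptions: the column name TRANS_CONV_TEXT comes from the function above, the example rows are made up, and NLTK's punkt tokenizer data must be available for word_tokenize:

import pandas as pd

df = pd.DataFrame({"TRANS_CONV_TEXT": [
    "patient reports mild headache and nausea today",
    "follow up visit scheduled for next week",
]})
corpus = create_corpus(df)
# corpus is a list with one list of tf-idf-selected words per document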
def replace_func(input_file):
    p1 = re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')
    p2 = re.compile(r'[(][: @ . , ?!\s][)]')
    p3 = re.compile(r'[「『]')
    p4 = re.compile(
        r'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()0-9 , : ; \-\ \[\ \]\ ]'
    )
    p5 = re.compile('<.*?>')
    p6 = re.compile('–')
    replaced_file = open('data/' + 'preprocessed.txt', "w", encoding="utf8")
    with open('data/' + input_file, 'r', encoding="utf8") as source_f:
        for line in tqdm_notebook(source_f):
            line = p1.sub(r' ', line)
            line = p2.sub(r' ', line)
            line = p3.sub(r' ', line)
            line = p4.sub(r' ', line)
            line = p5.sub(r' ', line)
            line = p6.sub(r' ', line)
            replaced_file.write(line)
    replaced_file.close()
def get_pre_post_colonization_df(conflict_df):
    """Create the pre- and post-colonization dataframes."""
    colonized_df = pd.read_csv('datasets/colonies_wikipedia.csv')
    colonized_countries = list(colonized_df["colonized_country"])

    # Create two empty dataframes
    columns = [
        "location", "ID", "colonizer_country", "Indep Date", "sidea",
        "side b", "incomp", "year", "intensity", "cumint", "type",
        "startdate", "startdate2", "ependdate", "region"
    ]
    pre_colonization_conflict_df = pd.DataFrame(columns=columns)
    post_colonization_conflict_df = pd.DataFrame(columns=columns)

    for index in tqdm_notebook(range(len(conflict_df))):
        country = conflict_df.loc[index, 'location']
        # Correct issue when the country string has a trailing space
        if country[-1] == ' ':
            country = country[:-1]
        # Check if the country is an ex-colony
        if country in colonized_countries:
            # Get information from the colonies dataframe
            indep_year = colonized_df.loc[colonized_df['colonized_country'] == country, 'Year'].values[0]
            indep_month = colonized_df.loc[colonized_df['colonized_country'] == country, 'Month'].values[0]
            indep_day = colonized_df.loc[colonized_df['colonized_country'] == country, 'Day'].values[0]
            indep_date = str(indep_day) + "/" + str(indep_month) + "/" + str(indep_year)
            ID = colonized_df.loc[colonized_df['colonized_country'] == country, 'ID'].values[0]
            colonizer_country = colonized_df.loc[
                colonized_df['colonized_country'] == country, 'colonizer_country'].values[0]

            # Creation of the temporary row
            conflict_df_tmp = conflict_df.loc[index:index, ]
            conflict_df_tmp.set_value(index, 'Indep Date', indep_date)
            conflict_df_tmp.set_value(index, 'ID', ID)
            conflict_df_tmp.set_value(index, 'colonizer_country', colonizer_country)

            start_year = int(conflict_df.loc[index, 'year'])
            # Check whether the conflict happened before or after independence
            if start_year > indep_year:
                post_colonization_conflict_df = post_colonization_conflict_df.append(conflict_df_tmp)
            else:
                pre_colonization_conflict_df = pre_colonization_conflict_df.append(conflict_df_tmp)
        # else:
        #     print(country, "--> NOT EX COLONY")

    # Create the general colonization dataframe
    conflict_df.to_csv("datasets/colonization_conflict_general.csv")

    # Create the pre-colonization dataframe
    pre_colonization_conflict_df = pre_colonization_conflict_df.reset_index(drop=True)
    pre_colonization_conflict_df = pre_colonization_conflict_df[columns]
    pre_colonization_conflict_df.to_csv("datasets/colonization_conflict_pre.csv")

    # Create the post-colonization dataframe
    post_colonization_conflict_df = post_colonization_conflict_df.reset_index(drop=True)
    post_colonization_conflict_df = post_colonization_conflict_df[columns]
    post_colonization_conflict_df.to_csv("datasets/colonization_conflict_post.csv")

    return pre_colonization_conflict_df, post_colonization_conflict_df
from tqdm._tqdm_notebook import tqdm_notebook  # TQDM build


def tqdm_pandas(t):
    from pandas.core.frame import Series

    def inner(series, func, *args, **kwargs):
        t.total = series.size

        def wrapper(*args, **kwargs):
            t.update(1)
            return func(*args, **kwargs)

        result = series.apply(wrapper, *args, **kwargs)
        t.close()
        return result

    Series.progress_apply = inner


tqdm_pandas(tqdm_notebook())
tqdm.pandas(desc="my bar!")


# data prep and feature engineering
def data():
    os.chdir('D:/data_science/kaggle_sound_classification')
    train = pd.read_csv('train.csv')
    train = train.iloc[1:300, :]
    train_path = 'D:/data_science/kaggle_sound_classification/audio_train/'

    def get_feature(fname):
        b, _ = librosa.load(fname, res_type='kaiser_fast')
        try:
            mfcc = np.mean(librosa.feature.mfcc(y=b, n_mfcc=60).T, axis=0)
            mels = np.mean(librosa.feature.melspectrogram(b, sr=SAMPLE_RATE).T, axis=0)
            stft = np.abs(librosa.stft(b))
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=SAMPLE_RATE).T, axis=0)
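Worth noting: recent tqdm versions ship this pandas integration directly, so the hand-rolled progress_apply patch above is only needed on very old releases. A minimal, runnable equivalent:

import pandas as pd
from tqdm.auto import tqdm

tqdm.pandas(desc="my bar!")          # registers .progress_apply on Series/DataFrame
s = pd.Series(range(1000))
doubled = s.progress_apply(lambda v: v * 2)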
def get_data(df, config, label2idx=None, oov='<oov>', pad='<pad>', cls2idx=None,
             is_cls=False, word_lexicon=None, char_lexicon=None, max_seq_len=424):
    if label2idx is None:
        label2idx = {pad: 0, '<bos>': 1, '<eos>': 2}
    features = []
    if is_cls:
        # Use joint model
        if cls2idx is None:
            cls2idx = dict()
        zip_args = zip(df["1"].tolist(), df["0"].tolist(), df["2"].tolist())
    else:
        zip_args = zip(df["1"].tolist(), df["0"].tolist())
    cls = None
    total = len(df["0"].tolist())
    for args in tqdm_notebook(enumerate(zip_args), total=total, leave=False):
        if is_cls:
            idx, (text, labels, cls) = args
        else:
            idx, (text, labels) = args
        text = text.split()
        text = text[:max_seq_len - 2]
        labels = labels.split()[:max_seq_len - 2]
        labels = ['<bos>'] + labels + ['<eos>']
        if config['token_embedder']['name'].lower() == 'cnn':
            tokens, text = read_list(
                [text], config['token_embedder']['max_characters_per_token'])
        else:
            tokens, text = read_list([text])
        tokens, text = tokens[0], text[0]
        input_ids = None
        if word_lexicon is not None:
            oov_id, pad_id = word_lexicon.get(oov, None), word_lexicon.get(pad, None)
            assert oov_id is not None and pad_id is not None
            input_ids = [word_lexicon.get(x, oov_id) for x in tokens]
        char_ids = None
        # get a batch of character id whose size is (batch x max_len x max_chars)
        if char_lexicon is not None:
            char_ids = []
            bow_id, eow_id, oov_id, pad_id = [
                char_lexicon.get(key, None)
                for key in ('<eow>', '<bow>', oov, pad)
            ]
            assert bow_id is not None and eow_id is not None and oov_id is not None and pad_id is not None
            if config['token_embedder']['name'].lower() == 'cnn':
                max_chars = config['token_embedder']['max_characters_per_token']
                assert max([len(w) for w in tokens]) + 2 <= max_chars
            elif config['token_embedder']['name'].lower() == 'lstm':
                # counting the <bow> and <eow>
                pass
            else:
                raise ValueError('Unknown token_embedder: {0}'.format(
                    config['token_embedder']['name']))
            for token in tokens:
                chars = [bow_id]
                if token == '<bos>' or token == '<eos>':
                    chars.append(char_lexicon.get(token))
                    chars.append(eow_id)
                else:
                    for c in token:
                        chars.append(char_lexicon.get(c, oov_id))
                    chars.append(eow_id)
                char_ids.append(chars)
        for l in labels:
            if l not in label2idx:
                label2idx[l] = len(label2idx)
        labels_ids = [label2idx[l] for l in labels]
        # For joint model
        cls_idx = None
        if is_cls:
            if cls not in cls2idx:
                cls2idx[cls] = len(cls2idx)
            cls_idx = cls2idx[cls]
        features.append(
            InputFeatures(input_ids, char_ids, tokens, labels, labels_ids,
                          cls=cls, cls_idx=cls_idx))
    if is_cls:
        return features, (label2idx, cls2idx)
    return features, label2idx
def get_data(df, tokenizer, label2idx=None, max_seq_len=424, pad="<pad>",
             cls2idx=None, is_cls=False, is_meta=False):
    tqdm_notebook = tqdm
    if label2idx is None:
        label2idx = {pad: 0, '[CLS]': 1, '[SEP]': 2}
    features = []
    all_args = []
    if is_cls:
        # Use joint model
        if cls2idx is None:
            cls2idx = dict()
        all_args.extend([df["1"].tolist(), df["0"].tolist(), df["2"].tolist()])
    else:
        all_args.extend([df["1"].tolist(), df["0"].tolist()])
    if is_meta:
        all_args.append(df["3"].tolist())
    total = len(df["0"].tolist())
    cls = None
    meta = None
    for args in tqdm_notebook(enumerate(zip(*all_args)), total=total, leave=False):
        if is_cls:
            if is_meta:
                idx, (text, labels, cls, meta) = args
            else:
                idx, (text, labels, cls) = args
        else:
            if is_meta:
                idx, (text, labels, meta) = args
            else:
                idx, (text, labels) = args
        tok_map = []
        meta_tokens = []
        if is_meta:
            meta = json.loads(meta)
            meta_tokens.append([0] * len(meta[0]))
        bert_tokens = []
        bert_labels = []
        bert_tokens.append("[CLS]")
        bert_labels.append("[CLS]")
        orig_tokens = []
        orig_tokens.extend(str(text).split())
        labels = str(labels).split()
        pad_idx = label2idx[pad]
        assert len(orig_tokens) == len(labels)
        prev_label = ""
        for idx_, (orig_token, label) in enumerate(zip(orig_tokens, labels)):
            # Fix BIO to IO as BERT proposed https://arxiv.org/pdf/1810.04805.pdf
            prefix = "I_"
            if label != "O":
                label = label.split("_")[1]
                prev_label = label
            else:
                prev_label = label
            cur_tokens = tokenizer.tokenize(orig_token)
            if max_seq_len - 1 < len(bert_tokens) + len(cur_tokens):
                break
            tok_map.append(len(bert_tokens))
            if is_meta:
                meta_tokens.extend([meta[idx_]] * len(cur_tokens))
            bert_tokens.extend(cur_tokens)
            bert_label = [prefix + label] + ["X"] * (len(cur_tokens) - 1)  # ["I_" + label] * (len(cur_tokens) - 1)
            bert_labels.extend(bert_label)
        bert_tokens.append("[SEP]")
        bert_labels.append("[SEP]")
        if is_meta:
            meta_tokens.append([0] * len(meta[0]))
        orig_tokens = ["[CLS]"] + orig_tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
        labels = bert_labels
        for l in labels:
            if l not in label2idx:
                label2idx[l] = len(label2idx)
        labels_ids = [label2idx[l] for l in labels]
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)
        labels_mask = [1] * len(labels_ids)
        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_len:
            input_ids.append(0)
            input_mask.append(0)
            labels_ids.append(pad_idx)
            labels_mask.append(0)
            tok_map.append(-1)
            if is_meta:
                meta_tokens.append([0] * len(meta[0]))
        # assert len(input_ids) == len(bert_labels_ids)
        input_type_ids = [0] * len(input_ids)
        # For joint model
        cls_idx = None
        if is_cls:
            if cls not in cls2idx:
                cls2idx[cls] = len(cls2idx)
            cls_idx = cls2idx[cls]
        if is_meta:
            meta = meta_tokens
        features.append(
            InputFeatures(
                # Bert data
                bert_tokens=bert_tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids,
                # Origin data
                tokens=orig_tokens,
                labels=labels,
                labels_ids=labels_ids,
                labels_mask=labels_mask,
                tok_map=tok_map,
                # Joint data
                cls=cls,
                cls_idx=cls_idx,
                # Meta data
                meta=meta))
        assert len(input_ids) == len(input_mask)
        assert len(input_ids) == len(input_type_ids)
        assert len(input_ids) == len(labels_ids)
        assert len(input_ids) == len(labels_mask)
    if is_cls:
        return features, (label2idx, cls2idx)
    return features, label2idx
def get_data(df, tokenizer, label2idx=None, max_seq_len=424, pad="<pad>",
             cls2idx=None, is_cls=False):
    if label2idx is None:
        label2idx = {pad: 0, '[CLS]': 1, '[SEP]': 2}
    features = []
    if is_cls:
        # Use joint model
        if cls2idx is None:
            cls2idx = dict()
        zip_args = zip(df["1"].tolist(), df["0"].tolist(), df["2"].tolist())
    else:
        zip_args = zip(df["1"].tolist(), df["0"].tolist())
    total = len(df["0"].tolist())
    cls = None
    for args in tqdm_notebook(enumerate(zip_args), total=total, leave=False):
        if is_cls:
            idx, (text, labels, cls) = args
        else:
            idx, (text, labels) = args
        tok_map = []
        bert_tokens = []
        bert_labels = []
        bert_tokens.append("[CLS]")
        bert_labels.append("[CLS]")
        orig_tokens = []
        orig_tokens.extend(text.split())
        labels = labels.split()
        pad_idx = label2idx[pad]
        # assert len(orig_tokens) == len(labels)
        prev_label = ""
        for orig_token, label in zip(orig_tokens, labels):
            prefix = "B_"
            if label != "O":
                label = label.split("_")[1]
                if label == prev_label:
                    prefix = "I_"
                prev_label = label
            else:
                prev_label = label
            tok_map.append(len(bert_tokens))
            cur_tokens = tokenizer.tokenize(orig_token)
            if max_seq_len - 1 < len(bert_tokens) + len(cur_tokens):
                break
            bert_tokens.extend(cur_tokens)
            bert_label = [prefix + label] + ["I_" + label] * (len(cur_tokens) - 1)
            bert_labels.extend(bert_label)
        bert_tokens.append("[SEP]")
        bert_labels.append("[SEP]")
        orig_tokens = ["[CLS]"] + orig_tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
        labels = bert_labels
        for l in labels:
            if l not in label2idx:
                label2idx[l] = len(label2idx)
        labels_ids = [label2idx[l] for l in labels]
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)
        labels_mask = [1] * len(labels_ids)
        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_len:
            input_ids.append(0)
            input_mask.append(0)
            labels_ids.append(pad_idx)
            labels_mask.append(0)
            tok_map.append(-1)
        # assert len(input_ids) == len(bert_labels_ids)
        input_type_ids = [0] * len(input_ids)
        # For joint model
        cls_idx = None
        if is_cls:
            if cls not in cls2idx:
                cls2idx[cls] = len(cls2idx)
            cls_idx = cls2idx[cls]
        features.append(
            InputFeatures(
                # Bert data
                bert_tokens=bert_tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids,
                # Origin data
                tokens=orig_tokens,
                labels=labels,
                labels_ids=labels_ids,
                labels_mask=labels_mask,
                tok_map=tok_map,
                # Joint data
                cls=cls,
                cls_idx=cls_idx))
        assert len(input_ids) == len(input_mask)
        assert len(input_ids) == len(input_type_ids)
        assert len(input_ids) == len(labels_ids)
        assert len(input_ids) == len(labels_mask)
    if is_cls:
        return features, (label2idx, cls2idx)
    return features, label2idx
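A hedged call sketch for this BERT variant. The DataFrame layout follows how the function indexes df above (column "1" holds whitespace-tokenized text, column "0" the matching BIO labels); the example rows are made up, and loading the tokenizer via the transformers library is an assumption made here for illustration. Any WordPiece tokenizer exposing tokenize() and convert_tokens_to_ids() would do, and the function's own dependencies (tqdm_notebook, InputFeatures) are assumed to be in scope:

import pandas as pd
from transformers import BertTokenizer

df = pd.DataFrame({
    "0": ["O O B_PER I_PER O"],                     # BIO labels
    "1": ["meeting with John Smith tomorrow"],       # raw text
})
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
features, label2idx = get_data(df, tokenizer, max_seq_len=32)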