def impute_zip(df):
    # Fill missing ZIP codes by reverse-geocoding each row's coordinates.
    geocoder = Geocoder('your-api-key')
    tqdm_notebook.pandas()
    missing = df["zipcode"].isnull()
    df.loc[missing, "zipcode"] = df[missing].progress_apply(
        lambda row: geocoder.reverse_geocode(
            row['latitude'], row['longitude'])[0].postal_code,
        axis=1)
    return df
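For readers without a geocoding API key, here is a self-contained sketch of the same mask-and-progress_apply pattern; the stub geocoder and the sample coordinates are made up, and tqdm.auto is used so the snippet also runs outside a notebook.

import pandas as pd
from tqdm.auto import tqdm


class StubGeocoder:
    """Stand-in for the real geocoding client used above (hypothetical)."""

    def reverse_geocode(self, lat, lon):
        # A real client would look up the postal code for (lat, lon).
        return "00000"


def impute_zip_stub(df, geocoder):
    tqdm.pandas()
    missing = df["zipcode"].isnull()
    df.loc[missing, "zipcode"] = df[missing].progress_apply(
        lambda row: geocoder.reverse_geocode(row["latitude"], row["longitude"]),
        axis=1)
    return df


df = pd.DataFrame({"zipcode": ["10001", None],
                   "latitude": [40.75, 47.61],
                   "longitude": [-73.99, -122.33]})
print(impute_zip_stub(df, StubGeocoder()))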
Example #2
    def initialize(cls,
                   shm_size_mb=SHM_SIZE_MB,
                   nb_workers=NB_WORKERS,
                   progress_bar=False):
        """
        Initialize Pandarallel shared memory.

        Parameters
        ----------
        shm_size_mb : int, optional
            Size of the Pandarallel shared memory, in MB

        nb_workers : int, optional
            Number of workers used for parallelization

        progress_bar : bool, optional
            Display a progress bar
            WARNING: The progress bar is an experimental feature.
                     It can lead to a considerable performance loss.
        """

        print("New pandarallel memory created - Size:", shm_size_mb, "MB")
        print("Pandarallel will run on", nb_workers, "workers")

        if progress_bar:
            print("WARNING: Progress bar is an experimental feature. This \
can lead to a considerable performance loss.")
            tqdm_notebook().pandas()

        cls.__store_ctx = _plasma.start_plasma_store(int(shm_size_mb * 1e6))
        plasma_store_name, _ = cls.__store_ctx.__enter__()

        plasma_client = _plasma.connect(plasma_store_name)

        args = plasma_store_name, nb_workers, plasma_client

        _pd.DataFrame.parallel_apply = _DataFrame.apply(*args, progress_bar)
        _pd.DataFrame.parallel_applymap = _DataFrame.applymap(
            *args, progress_bar)

        _pd.Series.parallel_map = _Series.map(*args, progress_bar)
        _pd.Series.parallel_apply = _Series.apply(*args, progress_bar)

        _pd.core.window.Rolling.parallel_apply = _SeriesRolling.apply(
            *args, progress_bar)

        _pd.core.groupby.DataFrameGroupBy.parallel_apply = _DataFrameGroupBy.apply(
            *args)

        _pd.core.window.RollingGroupby.parallel_apply = _RollingGroupby.apply(
            *args)
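For comparison, the public entry point of the released pandarallel package looks roughly like the sketch below (the DataFrame and the function being applied are made up; argument names may differ slightly between pandarallel versions).

import math

import pandas as pd
from pandarallel import pandarallel

# Start the workers and patch pandas with parallel_apply / parallel_applymap /
# parallel_map, as the initialize() classmethod above does.
pandarallel.initialize(nb_workers=4, progress_bar=False)

df = pd.DataFrame({"x": range(10_000)})
df["y"] = df.parallel_apply(lambda row: math.sin(row["x"]), axis=1)
print(df.head())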
Example #3
def validate_step(dl, model, id2label, sup_labels, id2cls=None):
    model.eval()
    idx = 0
    preds_cpu, targets_cpu = [], []
    preds_cpu_cls, targets_cpu_cls = [], []
    for batch in tqdm_notebook(dl, total=len(dl), leave=False):
        idx += 1
        labels_mask, labels_ids = batch[-2:]
        preds = model.forward(batch)
        if id2cls is not None:
            preds, preds_cls = preds
            preds_cpu_, targets_cpu_ = transformed_result_cls([preds_cls],
                                                              [batch[-3]],
                                                              id2cls)
            preds_cpu_cls.extend(preds_cpu_)
            targets_cpu_cls.extend(targets_cpu_)
        preds_cpu_, targets_cpu_ = transformed_result([preds], [labels_mask],
                                                      id2label, [labels_ids])
        preds_cpu.extend(preds_cpu_)
        targets_cpu.extend(targets_cpu_)
    clf_report = flat_classification_report(targets_cpu,
                                            preds_cpu,
                                            labels=sup_labels,
                                            digits=3)
    if id2cls is not None:
        clf_report_cls = flat_classification_report([targets_cpu_cls],
                                                    [preds_cpu_cls],
                                                    digits=3)
        return clf_report, clf_report_cls
    return clf_report
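A toy call of the kind of report produced at the end of validate_step, assuming flat_classification_report here is the one provided by sklearn_crfsuite.metrics; the tag sequences below are made up.

from sklearn_crfsuite.metrics import flat_classification_report

targets = [["B_PER", "I_PER", "O"], ["B_LOC", "O", "O"]]
preds = [["B_PER", "O", "O"], ["B_LOC", "O", "O"]]
print(flat_classification_report(targets, preds,
                                 labels=["B_PER", "I_PER", "B_LOC"],
                                 digits=3))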
Example #4
def train_step(dl,
               model,
               optimizer,
               lr_scheduler=None,
               clip=None,
               num_epoch=1):
    model.train()
    epoch_loss = 0
    idx = 0
    for batch in tqdm_notebook(dl, total=len(dl), leave=False):
        idx += 1
        model.zero_grad()
        loss = model.score(batch)
        loss.backward()
        if clip is not None:
            # clip_grad_norm was deprecated in favour of clip_grad_norm_
            _ = torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        optimizer.zero_grad()
        epoch_loss += loss.item()
        if lr_scheduler is not None:
            lr_scheduler.step()
        # torch.cuda.empty_cache()
    if lr_scheduler is not None:
        logging.info("\nlr after epoch: {}".format(lr_scheduler.lr))
    logging.info("\nepoch {}, average train epoch loss={:.5}\n".format(
        num_epoch, epoch_loss / idx))
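The same loop structure, with the project-specific model.score() and tqdm pieces stripped out, on a toy regression problem so it runs end to end; the model, data, and hyperparameters below are made up for illustration.

import torch
from torch.utils.data import DataLoader, TensorDataset

# Toy data and model standing in for the batch/model objects used above
X = torch.randn(64, 3)
y = X @ torch.tensor([1.0, -2.0, 0.5]) + 0.1 * torch.randn(64)
dl = DataLoader(TensorDataset(X, y), batch_size=16)

model = torch.nn.Linear(3, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

model.train()
epoch_loss, idx = 0.0, 0
for xb, yb in dl:
    idx += 1
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(xb).squeeze(-1), yb)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # same role as `clip` above
    optimizer.step()
    lr_scheduler.step()
    epoch_loss += loss.item()
print("average train loss:", epoch_loss / idx)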
Example #5
def predict(dl, model, id2label, id2cls=None):
    model.eval()
    idx = 0
    preds_cpu = []
    preds_cpu_cls = []
    for batch, sorted_idx in tqdm_notebook(dl, total=len(dl), leave=False):
        idx += 1
        labels_mask, labels_ids = batch[-2:]
        preds = model.forward(batch)
        if id2cls is not None:
            preds, preds_cls = preds
            preds_cpu_ = transformed_result_cls([preds_cls], [preds_cls],
                                                id2cls, False)
            preds_cpu_cls.extend(preds_cpu_)
        bs = batch[0].shape[0]
        unsorted_mask = [0] * bs
        unsorted_pred = [0] * bs
        # Use a separate loop variable so the outer batch counter `idx` is not shadowed
        for i, sidx in enumerate(sorted_idx):
            unsorted_pred[sidx] = preds[i]
            unsorted_mask[sidx] = labels_mask[i]

        preds_cpu_ = transformed_result([unsorted_pred], [unsorted_mask],
                                        id2label)
        preds_cpu.extend(preds_cpu_)
    if id2cls is not None:
        return preds_cpu, preds_cpu_cls
    return preds_cpu
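The unsorted_pred/unsorted_mask bookkeeping above simply undoes the permutation that the dataloader applied when it sorted the batch; a tiny illustration with made-up values:

# sorted_idx[i] is the original position of the i-th item in the sorted batch
sorted_idx = [2, 0, 1]
preds_sorted = ["pred_c", "pred_a", "pred_b"]

unsorted = [None] * len(preds_sorted)
for i, sidx in enumerate(sorted_idx):
    unsorted[sidx] = preds_sorted[i]
print(unsorted)  # ['pred_a', 'pred_b', 'pred_c']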
Example #6
def replace_func(input_file):
    p1 = re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')
    p2 = re.compile(r'[(][: @ . , ?!\s][)]')
    p3 = re.compile(r'[「『]')
    p4 = re.compile(
        r'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()0-9 , : ; \-\ \[\ \]\ ]'
    )
    p5 = re.compile('<.*?>')
    p6 = re.compile('–')
    sentence = ""
    numstopwords = 0
    replaced_file = open('data/' + 'preprocessed.txt', "w", encoding="utf8")
    with open('data/' + input_file, 'r', encoding="utf8") as source_f:
        print('data/' + input_file + " reading")
        for line in tqdm_notebook(source_f):
            line = p1.sub(r' ', line)
            line = p2.sub(r' ', line)
            line = p3.sub(r' ', line)
            line = p4.sub(r' ', line)
            line = p5.sub(r' ', line)
            line = p6.sub(r' ', line)
            line = line.lower()
            sentence += line
        sentence_list = sentence.split(' ')
        print('data/' + input_file + " to list")
        # Build the lemmatizer and stopword set once instead of per word
        lemmatizer = WordNetLemmatizer()
        stopword_set = set(stopwords.words('english'))
        for word in sentence_list:
            word = lemmatizer.lemmatize(word)
            if word not in stopword_set:
                replaced_file.write(word + " ")
            else:
                numstopwords += 1
                if numstopwords % 100 == 0:
                    print("number of stopwords", numstopwords)
        replaced_file.close()
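To see what the regex pipeline above does, here is a scaled-down version on one made-up line, using only two of the patterns applied the same way:

import re

p_markup = re.compile('<.*?>')   # same as p5 above: strip markup tags
p_dash = re.compile('–')         # same as p6 above: drop en dashes

line = "<doc id='1'>Machine learning – a short introduction</doc>"
line = p_markup.sub(r' ', line)
line = p_dash.sub(r' ', line)
print(line.lower())
# " machine learning   a short introduction "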
Example #7
def perform_regularised_cv(train, y_colname, grid, high_card_cols, folds=5,
                           metric=mean_absolute_error, model='XGBoost'):
    '''Performs grid-search cross-validation with support for regularised mean encoding.
    Inputs:
        train: input data set
        y_colname: target column name
        grid: set of hyperparameters over which the model is to be tuned
        high_card_cols: categorical columns to consider for mean encoding
        folds: number of folds to use for cross-validation
        metric: scoring function used to evaluate each fold
        model: model type passed through to train_model
    Outputs:
        all_scores: list of (params, average score) tuples
    '''
    kf = KFold(folds, random_state=0, shuffle=True)
    param_grid = ParameterGrid(grid)
    all_scores = [] #Store all scores
    for params in tqdm_notebook(param_grid):
        errors = []
        for train_idx, test_idx in kf.split(train):
            # Split data into train and test
            kf_train, kf_test = train.iloc[train_idx,:], train.iloc[test_idx,:]
            kf_train.reset_index(inplace=True,drop=True)
            kf_test.reset_index(inplace=True,drop=True)
            _, error, _, _ = train_model(params, kf_train, kf_test, y_colname,
                                         high_card_cols, valid=True,
                                         metric=metric, model=model)
            errors.append(error)
        avg_score = np.mean(errors) #Average scores of all KFold
        all_scores.append((params, avg_score))
        rmsle = np.sqrt(avg_score)  # RMSLE only when `metric` is a squared-log-error
        tqdm.write(f'Parameters: {params} RMSLE: {rmsle}')
    return all_scores
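A sketch of the kind of grid argument perform_regularised_cv expects; ParameterGrid just expands the dictionary into every combination of values (the parameter names and values here are hypothetical):

from sklearn.model_selection import ParameterGrid

grid = {"max_depth": [4, 6], "learning_rate": [0.05, 0.1]}

for params in ParameterGrid(grid):
    print(params)
# {'learning_rate': 0.05, 'max_depth': 4}
# {'learning_rate': 0.05, 'max_depth': 6}
# {'learning_rate': 0.1, 'max_depth': 4}
# {'learning_rate': 0.1, 'max_depth': 6}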
Example #8
def preprocess(data):
    rows = data.shape[0] - params["time_steps"]
    cols = data.shape[1]
    x = np.zeros((rows, params["time_steps"], cols))
    y = np.zeros((rows, ))
    for i in tqdm_notebook(range(rows)):
        x[i] = data[i:params["time_steps"] + i]   # window of the previous time_steps rows
        y[i] = data[params["time_steps"] + i, 4]  # next value of column 4 is the target
    print(x.shape, y.shape)
    return x, y
Example #9
def build_timeseries(mat, y_col_index):
    dim_0 = mat.shape[0] - TIME_STEPS
    dim_1 = mat.shape[1]
    x = np.zeros((dim_0, TIME_STEPS, dim_1))
    y = np.zeros((dim_0,))
    print("dim_0",dim_0)
    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:TIME_STEPS+i]
        y[i] = mat[TIME_STEPS+i, y_col_index]
    print("Ukuran time-series untuk Input dan Output : ",x.shape,y.shape)
    return x, y
Example #10
def build_timeseries(mat, y_col_index):
    # y_col_index is the index of column that would act as output column
    # total number of time-series samples would be len(mat) - TIME_STEPS
    dim_0 = mat.shape[0] - TIME_STEPS
    dim_1 = mat.shape[1]
    x = np.zeros((dim_0, TIME_STEPS, dim_1))
    y = np.zeros((dim_0,))
    
    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:TIME_STEPS+i]
        y[i] = mat[TIME_STEPS+i, y_col_index]
    print("length of time-series i/o",x.shape,y.shape)
    return x, y
Example #11
def build_timeseries(mat, time_step=50, debug=False):
    """
    Covnerts a 2 dimensional dataframe into a 3 dimensional dataframe.
    :param mat: The dataframe input
    :param time_step: The number of of time steps used for the 2nd dimension
    :param debug: Boolean, if true, print shape of array returned
    :return: A three dimensional dataframe.
    """
    dim_0 = mat.shape[0] - time_step
    dim_1 = mat.shape[1]
    x = np.zeros((dim_0, time_step, dim_1))

    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:time_step + i]

    if debug:
        print("length of time-series i/o", x.shape)
    return x
Example #12
def build_timeseries(mat, y_col_index):
    """
    Converts ndarray into timeseries format and supervised data format. Takes first TIME_STEPS
    number of rows as input and sets the TIME_STEPS+1th data as corresponding output and so on.
    :param mat: ndarray which holds the dataset
    :param y_col_index: index of column which acts as output
    :return: returns two ndarrays-- input and output in format suitable to feed
    to LSTM.
    """
    # total number of time-series samples would be len(mat) - TIME_STEPS
    dim_0 = mat.shape[0] - TIME_STEPS
    dim_1 = mat.shape[1]
    x = np.zeros((dim_0, TIME_STEPS, dim_1))
    y = np.zeros((dim_0,))
    print("dim_0",dim_0)
    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:TIME_STEPS+i]
        y[i] = mat[TIME_STEPS+i, y_col_index]
    print("length of time-series i/o",x.shape,y.shape)
    return x, y
Example #13
def create_corpus(df):
    '''
    Create a corpus of the words present in each text document,
    keeping only the most important words according to tf-idf.
    '''
    corpus_row = df.TRANS_CONV_TEXT.values
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=70)
    vectorizer.fit_transform(corpus_row)
    # get_feature_names() was removed in newer scikit-learn versions
    imp_words = set(vectorizer.get_feature_names_out())

    text = df.TRANS_CONV_TEXT.values
    corpus = []
    for tweet in tqdm_notebook(text):
        words = [
            word.lower() for word in word_tokenize(tweet)
            if word.isalpha() and word.lower() in imp_words
        ]
        corpus.append(list(set(words)))
    return corpus
Example #14
def replace_func(input_file):
    p1 = re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')
    p2 = re.compile(r'[(][: @ . , ?!\s][)]')
    p3 = re.compile(r'[「『]')
    p4 = re.compile(
        r'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()0-9 , : ; \-\ \[\ \]\ ]'
    )
    p5 = re.compile('<.*?>')
    p6 = re.compile('–')
    replaced_file = open('data/' + 'preprocessed.txt', "w", encoding="utf8")
    with open('data/' + input_file, 'r', encoding="utf8") as source_f:
        for line in tqdm_notebook(source_f):
            line = p1.sub(r' ', line)
            line = p2.sub(r' ', line)
            line = p3.sub(r' ', line)
            line = p4.sub(r' ', line)
            line = p5.sub(r' ', line)
            line = p6.sub(r' ', line)

            replaced_file.write(line)

        replaced_file.close()
Example #15
def get_pre_post_colonization_df(conflict_df):
    """ Create the pre and post colonization dataframe"""

    colonized_df = pd.read_csv('datasets/colonies_wikipedia.csv')
    colonized_countries = list(colonized_df["colonized_country"])

    # Create two empty dataframe
    columns = [
        "location", "ID", "colonizer_country", "Indep Date", "sidea", "side b",
        "incomp", "year", "intensity", "cumint", "type", "startdate",
        "startdate2", "ependdate", "region"
    ]
    pre_colonization_conflict_df = pd.DataFrame(columns=columns)
    post_colonization_conflict_df = pd.DataFrame(columns=columns)

    for index in tqdm_notebook(range(len(conflict_df))):
        country = conflict_df.loc[index, 'location']

        # Correct issue when country string had a space at the end
        if country[-1] == ' ': country = country[:-1]

        # Check if the country is a ex colony
        if country in colonized_countries:
            # Get information from colonies dataframe
            indep_year = colonized_df.loc[colonized_df['colonized_country'] ==
                                          country, 'Year'].values[0]
            indep_month = colonized_df.loc[colonized_df['colonized_country'] ==
                                           country, 'Month'].values[0]
            indep_day = colonized_df.loc[colonized_df['colonized_country'] ==
                                         country, 'Day'].values[0]
            indep_date = str(indep_day) + "/" + str(indep_month) + "/" + str(
                indep_year)

            ID = colonized_df.loc[colonized_df['colonized_country'] == country,
                                  'ID'].values[0]
            colonizer_country = colonized_df.loc[
                colonized_df['colonized_country'] == country,
                'colonizer_country'].values[0]

            # Creation of the temporary row (copy it; DataFrame.set_value was
            # removed from pandas, so assign the new values with .loc)
            conflict_df_tmp = conflict_df.loc[index:index, ].copy()
            conflict_df_tmp.loc[index, 'Indep Date'] = indep_date
            conflict_df_tmp.loc[index, 'ID'] = ID
            conflict_df_tmp.loc[index, 'colonizer_country'] = colonizer_country

            start_year = int(conflict_df.loc[index, 'year'])

            # Check whether the conflict happened before or after independence
            # (DataFrame.append was removed from pandas; use pd.concat instead)
            if start_year > indep_year:
                post_colonization_conflict_df = pd.concat(
                    [post_colonization_conflict_df, conflict_df_tmp])
            else:
                pre_colonization_conflict_df = pd.concat(
                    [pre_colonization_conflict_df, conflict_df_tmp])
        #else:
        #    print(country, "--> NOT EX COLONY")

    # Create general colonization dataframe
    conflict_df.to_csv("datasets/colonization_conflict_general.csv")

    # Create the pre-colonization dataframe
    pre_colonization_conflict_df = pre_colonization_conflict_df.reset_index(
        drop=True)
    pre_colonization_conflict_df = pre_colonization_conflict_df[columns]
    pre_colonization_conflict_df.to_csv(
        "datasets/colonization_conflict_pre.csv")

    # Create the post-colonization dataframe
    post_colonization_conflict_df = post_colonization_conflict_df.reset_index(
        drop=True)
    post_colonization_conflict_df = post_colonization_conflict_df[columns]
    post_colonization_conflict_df.to_csv(
        "datasets/colonization_conflict_post.csv")

    return pre_colonization_conflict_df, post_colonization_conflict_df
Example #16
from tqdm.notebook import tqdm_notebook
from tqdm import tqdm

# Register a custom progress_apply on pandas Series via tqdm
def tqdm_pandas(t):
  from pandas.core.frame import Series
  def inner(series, func, *args, **kwargs):
      t.total = series.size
      def wrapper(*args, **kwargs):
          t.update(1)
          return func(*args, **kwargs)
      result = series.apply(wrapper, *args, **kwargs)
      t.close()
      return result
  Series.progress_apply = inner

tqdm_pandas(tqdm_notebook())
tqdm.pandas(desc="my bar!")

#data prep and feature engg 
def data():
    os.chdir('D:/data_science/kaggle_sound_classification')
    train = pd.read_csv('train.csv')
    train = train.iloc[1:300,:]
    train_path = 'D:/data_science/kaggle_sound_classification/audio_train/'
    def get_feature(fname):
        b,_ = librosa.load(fname, res_type = 'kaiser_fast')
        try:
            mfcc = np.mean(librosa.feature.mfcc(y = b,n_mfcc=60).T,axis=0)
            mels = np.mean(librosa.feature.melspectrogram(b, sr = SAMPLE_RATE).T,axis = 0)
            stft = np.abs(librosa.stft(b))
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr = SAMPLE_RATE).T,axis = 0)
Example #17
def tqdm_pandas(t):
    from pandas import Series  # needed for the Series.progress_apply patch below

    def inner(series, func, *args, **kwargs):
        t.total = series.size

        def wrapper(*args, **kwargs):
            t.update(1)
            return func(*args, **kwargs)

        result = series.apply(wrapper, *args, **kwargs)
        t.close()
        return result

    Series.progress_apply = inner


tqdm_pandas(tqdm_notebook())
tqdm.pandas(desc="my bar!")


#data prep and feature engg
def data():
    os.chdir('D:/data_science/kaggle_sound_classification')
    train = pd.read_csv('train.csv')
    train = train.iloc[1:300, :]
    train_path = 'D:/data_science/kaggle_sound_classification/audio_train/'

    def get_feature(fname):
        b, _ = librosa.load(fname, res_type='kaiser_fast')
        try:
            mfcc = np.mean(librosa.feature.mfcc(y=b, n_mfcc=60).T, axis=0)
            mels = np.mean(librosa.feature.melspectrogram(b, sr=SAMPLE_RATE).T,
Example #18
def get_data(df,
             config,
             label2idx=None,
             oov='<oov>',
             pad='<pad>',
             cls2idx=None,
             is_cls=False,
             word_lexicon=None,
             char_lexicon=None,
             max_seq_len=424):
    if label2idx is None:
        label2idx = {pad: 0, '<bos>': 1, '<eos>': 2}
    features = []
    if is_cls:
        # Use joint model
        if cls2idx is None:
            cls2idx = dict()
        zip_args = zip(df["1"].tolist(), df["0"].tolist(), df["2"].tolist())
    else:
        zip_args = zip(df["1"].tolist(), df["0"].tolist())
    cls = None
    total = len(df["0"].tolist())
    for args in tqdm_notebook(enumerate(zip_args), total=total, leave=False):
        if is_cls:
            idx, (text, labels, cls) = args
        else:
            idx, (text, labels) = args
        text = text.split()
        text = text[:max_seq_len - 2]
        labels = labels.split()[:max_seq_len - 2]
        labels = ['<bos>'] + labels + ['<eos>']
        if config['token_embedder']['name'].lower() == 'cnn':
            tokens, text = read_list(
                [text], config['token_embedder']['max_characters_per_token'])
        else:
            tokens, text = read_list([text])
        tokens, text = tokens[0], text[0]
        input_ids = None
        if word_lexicon is not None:
            oov_id, pad_id = word_lexicon.get(oov, None), word_lexicon.get(
                pad, None)
            assert oov_id is not None and pad_id is not None
            input_ids = [word_lexicon.get(x, oov_id) for x in tokens]
        char_ids = None
        # get a batch of character id whose size is (batch x max_len x max_chars)
        if char_lexicon is not None:
            char_ids = []
            bow_id, eow_id, oov_id, pad_id = [
                char_lexicon.get(key, None)
                for key in ('<eow>', '<bow>', oov, pad)
            ]

            assert bow_id is not None and eow_id is not None and oov_id is not None and pad_id is not None

            if config['token_embedder']['name'].lower() == 'cnn':
                max_chars = config['token_embedder'][
                    'max_characters_per_token']
                assert max([len(w) for w in tokens]) + 2 <= max_chars
            elif config['token_embedder']['name'].lower() == 'lstm':
                # counting the <bow> and <eow>
                pass
            else:
                raise ValueError('Unknown token_embedder: {0}'.format(
                    config['token_embedder']['name']))
            for token in tokens:
                chars = [bow_id]
                if token == '<bos>' or token == '<eos>':
                    chars.append(char_lexicon.get(token))
                    chars.append(eow_id)
                else:
                    for c in token:
                        chars.append(char_lexicon.get(c, oov_id))
                    chars.append(eow_id)
                char_ids.append(chars)

        for l in labels:
            if l not in label2idx:
                label2idx[l] = len(label2idx)
        labels_ids = [label2idx[l] for l in labels]
        # For joint model
        cls_idx = None
        if is_cls:
            if cls not in cls2idx:
                cls2idx[cls] = len(cls2idx)
            cls_idx = cls2idx[cls]
        features.append(
            InputFeatures(input_ids,
                          char_ids,
                          tokens,
                          labels,
                          labels_ids,
                          cls=cls,
                          cls_idx=cls_idx))
    if is_cls:
        return features, (label2idx, cls2idx)
    return features, label2idx
Example #19
def get_data(df,
             tokenizer,
             label2idx=None,
             max_seq_len=424,
             pad="<pad>",
             cls2idx=None,
             is_cls=False,
             is_meta=False):
    tqdm_notebook = tqdm  # use the plain console progress bar in this function
    if label2idx is None:
        label2idx = {pad: 0, '[CLS]': 1, '[SEP]': 2}
    features = []
    all_args = []
    if is_cls:
        # Use joint model
        if cls2idx is None:
            cls2idx = dict()
        all_args.extend([df["1"].tolist(), df["0"].tolist(), df["2"].tolist()])
    else:
        all_args.extend([df["1"].tolist(), df["0"].tolist()])
    if is_meta:
        all_args.append(df["3"].tolist())
    total = len(df["0"].tolist())
    cls = None
    meta = None
    for args in tqdm_notebook(enumerate(zip(*all_args)),
                              total=total,
                              leave=False):
        if is_cls:
            if is_meta:
                idx, (text, labels, cls, meta) = args
            else:
                idx, (text, labels, cls) = args
        else:
            if is_meta:
                idx, (text, labels, meta) = args
            else:
                idx, (text, labels) = args

        tok_map = []
        meta_tokens = []
        if is_meta:
            meta = json.loads(meta)
            meta_tokens.append([0] * len(meta[0]))
        bert_tokens = []
        bert_labels = []
        bert_tokens.append("[CLS]")
        bert_labels.append("[CLS]")
        orig_tokens = []
        orig_tokens.extend(str(text).split())
        labels = str(labels).split()
        pad_idx = label2idx[pad]
        assert len(orig_tokens) == len(labels)
        prev_label = ""
        for idx_, (orig_token, label) in enumerate(zip(orig_tokens, labels)):
            # Convert BIO to IO tagging, as proposed for BERT (https://arxiv.org/pdf/1810.04805.pdf)
            prefix = "I_"
            if label != "O":
                label = label.split("_")[1]
                prev_label = label
            else:
                prev_label = label

            cur_tokens = tokenizer.tokenize(orig_token)
            if max_seq_len - 1 < len(bert_tokens) + len(cur_tokens):
                break
            tok_map.append(len(bert_tokens))
            if is_meta:
                meta_tokens.extend([meta[idx_]] * len(cur_tokens))
            bert_tokens.extend(cur_tokens)
            bert_label = [prefix + label] + ["X"] * (
                len(cur_tokens) - 1)  # ["I_" + label] * (len(cur_tokens) - 1)
            bert_labels.extend(bert_label)
        bert_tokens.append("[SEP]")
        bert_labels.append("[SEP]")
        if is_meta:
            meta_tokens.append([0] * len(meta[0]))
        orig_tokens = ["[CLS]"] + orig_tokens + ["[SEP]"]

        input_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
        labels = bert_labels
        for l in labels:
            if l not in label2idx:
                label2idx[l] = len(label2idx)
        labels_ids = [label2idx[l] for l in labels]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)
        labels_mask = [1] * len(labels_ids)
        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_len:
            input_ids.append(0)
            input_mask.append(0)
            labels_ids.append(pad_idx)
            labels_mask.append(0)
            tok_map.append(-1)
            if is_meta:
                meta_tokens.append([0] * len(meta[0]))
        # assert len(input_ids) == len(bert_labels_ids)
        input_type_ids = [0] * len(input_ids)
        # For joint model
        cls_idx = None
        if is_cls:
            if cls not in cls2idx:
                cls2idx[cls] = len(cls2idx)
            cls_idx = cls2idx[cls]
        if is_meta:
            meta = meta_tokens
        features.append(
            InputFeatures(
                # Bert data
                bert_tokens=bert_tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids,
                # Origin data
                tokens=orig_tokens,
                labels=labels,
                labels_ids=labels_ids,
                labels_mask=labels_mask,
                tok_map=tok_map,
                # Joint data
                cls=cls,
                cls_idx=cls_idx,
                # Meta data
                meta=meta))
        assert len(input_ids) == len(input_mask)
        assert len(input_ids) == len(input_type_ids)
        assert len(input_ids) == len(labels_ids)
        assert len(input_ids) == len(labels_mask)
    if is_cls:
        return features, (label2idx, cls2idx)
    return features, label2idx
Example #20
def get_data(df,
             tokenizer,
             label2idx=None,
             max_seq_len=424,
             pad="<pad>",
             cls2idx=None,
             is_cls=False):
    if label2idx is None:
        label2idx = {pad: 0, '[CLS]': 1, '[SEP]': 2}
    features = []
    if is_cls:
        # Use joint model
        if cls2idx is None:
            cls2idx = dict()
        zip_args = zip(df["1"].tolist(), df["0"].tolist(), df["2"].tolist())
    else:
        zip_args = zip(df["1"].tolist(), df["0"].tolist())
    total = len(df["0"].tolist())
    cls = None

    for args in tqdm_notebook(enumerate(zip_args), total=total, leave=False):
        if is_cls:
            idx, (text, labels, cls) = args
        else:
            idx, (text, labels) = args
        tok_map = []
        bert_tokens = []
        bert_labels = []
        bert_tokens.append("[CLS]")
        bert_labels.append("[CLS]")
        orig_tokens = []
        orig_tokens.extend(text.split())
        labels = labels.split()
        pad_idx = label2idx[pad]
        # assert len(orig_tokens) == len(labels)
        prev_label = ""
        for orig_token, label in zip(orig_tokens, labels):
            prefix = "B_"
            if label != "O":
                label = label.split("_")[1]
                if label == prev_label:
                    prefix = "I_"
                prev_label = label
            else:
                prev_label = label
            tok_map.append(len(bert_tokens))
            cur_tokens = tokenizer.tokenize(orig_token)
            if max_seq_len - 1 < len(bert_tokens) + len(cur_tokens):
                break

            bert_tokens.extend(cur_tokens)
            bert_label = [prefix + label] + ["I_" + label] * (len(cur_tokens) - 1)
            bert_labels.extend(bert_label)
        bert_tokens.append("[SEP]")
        bert_labels.append("[SEP]")

        orig_tokens = ["[CLS]"] + orig_tokens + ["[SEP]"]

        input_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
        labels = bert_labels
        for l in labels:
            if l not in label2idx:
                label2idx[l] = len(label2idx)
        labels_ids = [label2idx[l] for l in labels]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)
        labels_mask = [1] * len(labels_ids)
        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_len:
            input_ids.append(0)
            input_mask.append(0)
            labels_ids.append(pad_idx)
            labels_mask.append(0)
            tok_map.append(-1)
        # assert len(input_ids) == len(bert_labels_ids)
        input_type_ids = [0] * len(input_ids)
        # For joint model
        cls_idx = None
        if is_cls:
            if cls not in cls2idx:
                cls2idx[cls] = len(cls2idx)
            cls_idx = cls2idx[cls]

        features.append(
            InputFeatures(
                # Bert data
                bert_tokens=bert_tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids,
                # Origin data
                tokens=orig_tokens,
                labels=labels,
                labels_ids=labels_ids,
                labels_mask=labels_mask,
                tok_map=tok_map,
                # Joint data
                cls=cls,
                cls_idx=cls_idx))
        assert len(input_ids) == len(input_mask)
        assert len(input_ids) == len(input_type_ids)
        assert len(input_ids) == len(labels_ids)
        assert len(input_ids) == len(labels_mask)
    if is_cls:
        return features, (label2idx, cls2idx)
    return features, label2idx
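Finally, a compact, self-contained illustration of the wordpiece/label alignment performed in Examples #19 and #20; the tokenizer here is a fake stand-in, and the sentence and labels are made up:

def toy_tokenize(word):
    # Stand-in for tokenizer.tokenize(); splits long words into fake wordpieces.
    return [word] if len(word) <= 4 else [word[:4], "##" + word[4:]]


orig_tokens = ["Angela", "Merkel", "visited", "Paris"]
labels = ["B_PER", "I_PER", "O", "B_LOC"]

bert_tokens, bert_labels = ["[CLS]"], ["[CLS]"]
for orig_token, label in zip(orig_tokens, labels):
    # BIO -> IO, as in Example #19; extra wordpieces get the filler label "X"
    label = "I_" + label.split("_")[1] if label != "O" else "I_O"
    pieces = toy_tokenize(orig_token)
    bert_tokens.extend(pieces)
    bert_labels.extend([label] + ["X"] * (len(pieces) - 1))
bert_tokens.append("[SEP]")
bert_labels.append("[SEP]")
print(list(zip(bert_tokens, bert_labels)))
# [('[CLS]', '[CLS]'), ('Ange', 'I_PER'), ('##la', 'X'), ('Merk', 'I_PER'), ('##el', 'X'),
#  ('visi', 'I_O'), ('##ted', 'X'), ('Pari', 'I_LOC'), ('##s', 'X'), ('[SEP]', '[SEP]')]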