Example #1
0
def make_ctx(rec):
    # Build a context string: "|<cmd> <synopsis>" followed by
    # "|<short flag> <first five words of the option text>" for each option.
    options = eval(rec['options'])
    cmd = rec['cmd']
    synopsis = clean_text(rec['synopsis'])
    r = f'|{cmd} {synopsis}'
    for opt in options:
        short_flag = opt['short'][0] if len(opt['short']) > 0 else ''
        text = clean_text(opt['text'])
        r += f'|{short_flag} ' + ' '.join(text.split()[:5])
    return r
Example #2
0
def data_reader(data_file):
    q1 = []
    q2 = []
    labels = []
    with open(data_file, "r", errors="ignore") as f:
        #        next(f)
        for line in f:
            line = line.strip().split("\t")
            q1.append(clean_text(line[1]))
            q2.append(clean_text(line[2]))
            labels.append(int(line[0]))
    return q1, q2, labels
Example #3
0
    def predict_many(self, texts, result_cnt):
        
        alpha = 0.6
        n_utils = 5
        beam_width = 5
        
        text_cleaned = [clean_text(x) for x in texts]
        
        pred_utils = self.util_model.predict_many(text_cleaned, n_utils)
        
        result = []
        with torch.no_grad():
            for i in range(len(text_cleaned)):
                candidates = []
                for j in range(n_utils):
                    util, util_proba = pred_utils[i][j]
                    pred = self.ctx_model.predict(text_cleaned[i], util, beam_width)

                    for pred_cmd, ctx_proba in pred:
                        joined_proba = (1 - alpha) * util_proba + alpha * ctx_proba
                        candidates.append((pred_cmd, joined_proba))

                candidates = sorted(candidates, key=lambda x: -x[1])[:result_cnt]
                candidates = [x[0] for x in candidates]
                result.append(candidates)
                
        return result
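
A minimal usage sketch for predict_many above; the CmdPredictor wrapper name and its construction are hypothetical, only the predict_many(texts, result_cnt) signature comes from the snippet itself.

# Hypothetical usage sketch: CmdPredictor is an assumed wrapper that holds
# util_model and ctx_model; only predict_many(texts, result_cnt) is from the snippet.
queries = [
    "show disk usage of the current directory",
    "find all python files modified in the last day",
]
predictor = CmdPredictor()
top_commands = predictor.predict_many(queries, result_cnt=3)
for query, commands in zip(queries, top_commands):
    print(query, "->", commands)  # each entry is a list of up to 3 candidate commands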
Example #4
0
def gensim_embedding_difference(data, field1, field2, clean=False):
    """
    Calculate the similarity between the sum of all embeddings.
    Setting clean to False will reproduce the results reported by Hartmann et al.
    However, setting it to True will universally improve the score of the evaluated embeddings.
    """
    distances = []
    for pair in data:
        if clean:
            e1 = [
                clean_text(i) for i in pair[field1]
                if clean_text(i) in embeddings
            ]
            e2 = [
                clean_text(i) for i in pair[field2]
                if clean_text(i) in embeddings
            ]
        else:
            e1 = [i if i in embeddings else 'unk' for i in pair[field1]]
            e2 = [i if i in embeddings else 'unk' for i in pair[field2]]
        distances.append([embeddings.n_similarity(e1, e2)])
    return distances
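
A hedged usage sketch for gensim_embedding_difference; the embedding file path, the KeyedVectors load call, and the layout of `data` are assumptions, only the function signature and the module-level `embeddings` name come from the snippet above.

# Sketch only: the path/format of the embeddings and the `data` layout are assumptions.
from gensim.models import KeyedVectors

embeddings = KeyedVectors.load_word2vec_format("embeddings.vec")  # assumed file
data = [
    {"tokens_a": ["the", "red", "car"], "tokens_b": ["a", "crimson", "automobile"]},
]
# clean=False maps out-of-vocabulary tokens to 'unk', so the embeddings are assumed
# to contain an 'unk' entry; clean=True instead filters tokens through clean_text.
raw_scores = gensim_embedding_difference(data, "tokens_a", "tokens_b", clean=False)
clean_scores = gensim_embedding_difference(data, "tokens_a", "tokens_b", clean=True)
print(raw_scores, clean_scores)  # each is a list with one single-element similarity list per pair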
def main():
    logger.info("Loading 20 Newsgroups Dataset and extracting features...")
    dataset = fetch_20newsgroups(subset='all',
                                 categories=None,
                                 shuffle=False,
                                 random_state=42,
                                 remove=('headers', 'footers', 'quotes'))

    logger.info(
        "Running data preparation process. This might take a while ...")
    corpus = [preprocessing.clean_text(x) for x in dataset.data]

    vectorizer = CountVectorizer(analyzer='word',
                                 ngram_range=(1, 1),
                                 min_df=20,
                                 stop_words='english',
                                 lowercase=True,
                                 max_df=0.90,
                                 max_features=n_features)

    matrix = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names()

    output_matrix = open(
        BASE_WORKDIR + '/data/processed/20newsgroups-bag-of-words.pkl', 'wb')

    logger.info("Dumping bag-of-words to tmp file")
    # Pickle the document-term matrix.
    pickle.dump(matrix, output_matrix)

    output_matrix.close()

    output_vocab = open(
        BASE_WORKDIR + '/data/processed/20newsgroups-vocabulary.pkl', 'wb')

    logger.info("Dumping vocabulary to tmp file")
    # Pickle the vocabulary (feature names).
    pickle.dump(feature_names, output_vocab)

    output_vocab.close()

    logger.info("Done!")
    logger.info("%d documents" % len(dataset.data))
    logger.info("%d categories" % len(dataset.target_names))
Example #6
0
def plot_most_common_words(cv_path, stopwords):
    # Read in the Word files
    cv_list = []
    for subdir, dirs, files in os.walk(cv_path):
        for file in files:
            cv_filepath = subdir + os.sep + file
            cleaned_cv = pp.clean_text(pp.read_word_file(cv_filepath),
                                       stopwords)
            #lemmalized = pp.lemmatization_sv(cleaned_cv)
            #cv_list.append(lemmalized)
            cv_list.append(cleaned_cv)

    wordcount = {}
    for cv in cv_list:
        for word in cv.split():
            if word not in wordcount:
                wordcount[word] = 1
            else:
                wordcount[word] += 1

    word_counter = collections.Counter(wordcount)
    for word, count in word_counter.most_common(100):
        print(word, ": ", count)
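
The word-count loop above can also be written directly with collections.Counter; a small equivalent sketch (same counts, just more idiomatic), shown here as an alternative rather than the original author's code.

import collections

def count_words(cv_list):
    # Equivalent to the manual wordcount dict above: one Counter update per CV.
    word_counter = collections.Counter()
    for cv in cv_list:
        word_counter.update(cv.split())
    return word_counter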
def compute_vectors_on_collection(collection, q, chunksize=1024):
    total_docs = collection.count_documents(q)
    num_processed = 0
    # Chunk boundaries; total_docs is appended so the final partial chunk is processed too.
    skips_variable = list(range(0, total_docs, chunksize)) + [total_docs]
    print(f"processing {total_docs} documents")
    with tqdm(total=total_docs) as pbar:
        for i in range(1, len(skips_variable)):
            # Expand the cursor and retrieve the data for this chunk
            cur_chunk = collection.find(q, projection={"text": 1})[skips_variable[i-1]:skips_variable[i]]

            items = list(cur_chunk)
            texts_list = [it['text'] for it in items]
            cleaned_texts = [clean_text(text) for text in texts_list]
            vectors = vectorize(cleaned_texts)
            for j in range(len(cleaned_texts)):  # use j so the chunk index i is not shadowed
                cleaned_text = cleaned_texts[j]
                id_ = items[j]['_id']
                if len(cleaned_text) > 40:
                    num_processed += 1
                    collection.update_one({"_id": id_}, {"$set": {"vector": vectors[j].tolist()}})
            pbar.update(len(items))
    print(f"processed {num_processed} documents")
    return num_processed
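
A usage sketch for compute_vectors_on_collection, assuming a local MongoDB instance reachable with pymongo; the connection string, database, collection, and query are placeholders.

# Sketch: host, database, collection, and query are placeholder assumptions.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
articles = client["news"]["articles"]

# Vectorize every English document that does not yet have a stored vector.
query = {"lang": "en", "vector": {"$exists": False}}
n = compute_vectors_on_collection(articles, query, chunksize=512)
print(f"{n} documents updated")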
Example #8
0
def clean_row(row):
    return preprocessing.clean_text(row[3])
Example #9
0
def read_original_data(data):
    cleaned_data = data.copy()
    cleaned_data["text"] = data["text"].apply(
        lambda x: clean_text(x, remove_whitespaces=False))
    return cleaned_data
def run_keras_experiment():
    print('Reading files')

    # Reading File Section - This should change
    full = pd.read_csv("data/english_dataset.tsv",
                       sep='\t',
                       names=['text_id', 'text', 'task_1', 'task_2', 'task_3'])

    is_hof = full['task_1'] == 'HOF'
    full = full[is_hof]

    train, test = train_test_split(full, test_size=0.2)

    print('Completed reading')

    #############
    print("Train shape : ", train.shape)
    print("Test shape : ", test.shape)

    # Variables

    TEXT_COLUMN = "text"
    LABEL_COLUMN = "task_2"

    configParser = configparser.RawConfigParser()
    configFilePath = "config.txt"
    configParser.read(configFilePath)

    EMBEDDING_FILE = configParser.get('english_task_2_model-config',
                                      'EMBEDDING_FILE')
    MODEL_PATH = configParser.get('english_task_2_model-config', 'MODEL_PATH')
    PREDICTION_FILE = configParser.get('english_task_2_model-config',
                                       'PREDICTION_FILE')

    print(train.head())

    print("Removing URLs")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: remove_url(x))
    test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: remove_url(x))
    print(train.head())

    print("Removing usernames")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: remove_names(x))
    test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: remove_names(x))
    print(train.head())
    #
    # print("Identifying names")
    #
    # train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: entity_recognizing(x))
    # test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: entity_recognizing(x))
    # print(train.head())

    print("Converting to lower-case")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].str.lower()
    test[TEXT_COLUMN] = test[TEXT_COLUMN].str.lower()
    print(train.head())

    print("Cleaning punctuation marks")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: clean_text(x))
    test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: clean_text(x))
    print(train.head())

    train['doc_len'] = train[TEXT_COLUMN].apply(
        lambda words: len(words.split(" ")))
    max_seq_len = np.round(train['doc_len'].mean() +
                           train['doc_len'].std()).astype(int)

    embed_size = 300  # how big is each word vector
    max_features = None  # how many unique words to use (i.e. number of rows in the embedding matrix)
    maxlen = max_seq_len  # max number of words in a text to use #99.99%

    # fill up the missing values
    X = train[TEXT_COLUMN].fillna("_na_").values
    X_test = test[TEXT_COLUMN].fillna("_na_").values

    # Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features, filters='')
    tokenizer.fit_on_texts(list(X))

    X = tokenizer.texts_to_sequences(X)
    X_test = tokenizer.texts_to_sequences(X_test)

    # Pad the sentences
    X = pad_sequences(X, maxlen=maxlen)
    X_test = pad_sequences(X_test, maxlen=maxlen)

    # Get the target values
    Y = train[LABEL_COLUMN].values

    le = LabelEncoder()

    le.fit(Y)
    encoded_Y = le.transform(Y)

    word_index = tokenizer.word_index
    max_features = len(word_index) + 1

    print('Loading Embeddings')

    embedding_matrix = get_emb_matrix(word_index, max_features, EMBEDDING_FILE)

    print('Finished loading Embeddings')

    print('Start Training')

    kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
    bestscore = []
    y_test = np.zeros((X_test.shape[0], 3))
    for i, (train_index, valid_index) in enumerate(kfold.split(X, encoded_Y)):
        X_train, X_val, Y_train, Y_val = X[train_index], X[
            valid_index], encoded_Y[train_index], encoded_Y[valid_index]

        Y_train = np_utils.to_categorical(Y_train, num_classes=3)
        Y_val = np_utils.to_categorical(Y_val, num_classes=3)

        filepath = MODEL_PATH
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_loss',
                                     verbose=2,
                                     save_best_only=True,
                                     mode='min')
        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.6,
                                      patience=1,
                                      min_lr=0.0001,
                                      verbose=2)
        earlystopping = EarlyStopping(monitor='val_loss',
                                      min_delta=0.0001,
                                      patience=2,
                                      verbose=2,
                                      mode='auto')
        callbacks = [checkpoint, reduce_lr]
        model = capsule(maxlen, max_features, embed_size, embedding_matrix, 3)
        if i == 0: print(model.summary())
        model.fit(
            X_train,
            Y_train,
            batch_size=64,
            epochs=20,
            validation_data=(X_val, Y_val),
            verbose=2,
            callbacks=callbacks,
        )
        model.load_weights(filepath)

        y_pred = model.predict([X_val], batch_size=64, verbose=2)
        y_test += np.squeeze(model.predict([X_test], batch_size=64,
                                           verbose=2)) / 5

    print('Finished Training')

    pred_test_y = y_test.argmax(1)
    test['predictions'] = le.inverse_transform(pred_test_y)

    # save predictions
    file_path = PREDICTION_FILE
    test.to_csv(file_path, sep='\t', encoding='utf-8')

    print('Saved Predictions')

    # post analysis
    weighted_f1 = f1_score(test[LABEL_COLUMN],
                           test['predictions'],
                           average='weighted')
    accuracy = accuracy_score(test[LABEL_COLUMN], test['predictions'])
    weighted_recall = recall_score(test[LABEL_COLUMN],
                                   test['predictions'],
                                   average='weighted')
    weighted_precision = precision_score(test[LABEL_COLUMN],
                                         test['predictions'],
                                         average='weighted')

    print("Accuracy ", accuracy)
    print("Weighted F1 ", weighted_f1)
    print("Weighted Recall ", weighted_recall)
    print("Weighted Precision ", weighted_precision)
Example #11
0
        accu = (tp / total)
        prec = (pre_tp / pre_total)
        rec = (rec_tp / rec_total)
        evaluation[cat] = {'accuracy': accu, 'precision': prec, 'recall': rec}

    return evaluation


if __name__ == "__main__":

    print('Preprocessing...')

    train = preprocessing.parseXML('ABSA16_Laptops_Train_SB1_v2.xml')
    train = preprocessing.filter_data(train)
    train = preprocessing.flatten_attributes(train)
    train = preprocessing.clean_text(train)
    train = preprocessing.tokenize(train)

    test = preprocessing.parseXML('test_2016.xml')
    test = preprocessing.filter_data(test)
    test = preprocessing.flatten_attributes(test)
    test = preprocessing.clean_text(test)
    test = preprocessing.tokenize(test)
    train = {**train, **test}  # merging train and test set

    # Aspect Extraction
    print('\nAspect Detection')
    categories = retreive_tags(train)
    noun_dict, adj_dict, pos_dict = collect_pos(train, preprocessing.stopWords)
    noun_vocab = get_vocab(noun_dict)
    adj_vocab = get_vocab(adj_dict)
    def predict(self, text):
        return self.model.predict([clean_text(text,
                                               remove_whitespaces=False)])[0]
Example #13
0
def main(*kargs, **kwargs):
    get_kwargs(kwargs)
    train_fname = kwargs['train']
    test_fname = kwargs['test']
    result_fname = kwargs['output']
    word_embeds_fname = kwargs['word_embeds']
    char_embeds_fname = kwargs['char_embeds']
    logger_fname = kwargs['logger']
    mode = kwargs['mode']
    max_words = kwargs['max_words']
    use_only_exists_words = kwargs['use_only_exists_words']
    swear_words_fname = kwargs['swear_words']
    wrong_words_fname = kwargs['wrong_words']
    embeds_format = kwargs['format_embeds']
    config = kwargs['config']
    output_dir = kwargs['output_dir']
    norm_prob = kwargs['norm_prob']
    norm_prob_koef = kwargs['norm_prob_koef']
    gpus = kwargs['gpus']

    seq_col_name_words = 'comment_seq_lw_use_exist{}_{}k'.format(
        int(use_only_exists_words), int(max_words / 1000))
    seq_col_name_ll3 = 'comment_seq_ll3_use_exist{}_{}k'.format(
        int(use_only_exists_words), int(max_words / 1000))

    model_file = {
        'dense': os.path.join(output_dir, 'dense.h5'),
        'cnn': os.path.join(output_dir, 'cnn.h5'),
        'lstm': os.path.join(output_dir, 'lstm.h5'),
        'lr': os.path.join(output_dir, '{}_logreg.bin'),
        'catboost': os.path.join(output_dir, '{}_catboost.bin')
    }

    # ====Create logger====
    logger = Logger(logging.getLogger(), logger_fname)

    # ====Detect GPUs====
    logger.debug(device_lib.list_local_devices())

    # ====Load data====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)

    target_labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    num_classes = len(target_labels)

    # ====Load additional data====
    logger.info('Loading additional data...')
    swear_words = load_data(swear_words_fname,
                            func=lambda x: set(x.T[0]),
                            header=None)
    wrong_words_dict = load_data(wrong_words_fname,
                                 func=lambda x: {val[0]: val[1]
                                                 for val in x})

    # ====Load word vectors====
    logger.info('Loading embeddings...')
    embeds_word = Embeds().load(word_embeds_fname, embeds_format)
    embeds_ll3 = Embeds().load(char_embeds_fname, embeds_format)

    # ====Clean texts====
    if mode in ('preprocess', 'all'):
        logger.info('Cleaning text...')
        train_df['comment_text_clear'] = clean_text(train_df['comment_text'],
                                                    wrong_words_dict,
                                                    autocorrect=True)
        test_df['comment_text_clear'] = clean_text(test_df['comment_text'],
                                                   wrong_words_dict,
                                                   autocorrect=True)
        train_df.to_csv(os.path.join(output_dir, 'train_clear.csv'),
                        index=False)
        test_df.to_csv(os.path.join(output_dir, 'test_clear.csv'), index=False)

    # ====Calculate maximum seq length====
    logger.info('Calc text length...')
    train_df.fillna('__NA__', inplace=True)
    test_df.fillna('__NA__', inplace=True)
    train_df['text_len'] = train_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    test_df['text_len'] = test_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    max_seq_len = np.round(train_df['text_len'].mean() +
                           3 * train_df['text_len'].std()).astype(int)
    max_char_seq_len = 2000  # empirical
    logger.debug('Max seq length = {}'.format(max_seq_len))

    # ====Prepare data to NN====
    logger.info('Converting texts to sequences...')

    if mode in ('preprocess', 'all'):
        train_df[seq_col_name_words], test_df[
            seq_col_name_words], word_index, train_df[
                seq_col_name_ll3], test_df[
                    seq_col_name_ll3], ll3_index = convert_text2seq(
                        train_df['comment_text_clear'].tolist(),
                        test_df['comment_text_clear'].tolist(),
                        max_words,
                        max_seq_len,
                        max_char_seq_len,
                        embeds_word,
                        lower=True,
                        oov_token='__NA__',
                        uniq=False,
                        use_only_exists_words=use_only_exists_words)
        logger.debug('Dictionary size use_exist{} = {}'.format(
            int(use_only_exists_words), len(word_index)))
        logger.debug('Char dict size use_exist{} = {}'.format(
            int(use_only_exists_words), len(ll3_index)))

        logger.info('Preparing embedding matrix...')
        words_not_found = embeds_word.set_matrix(max_words, word_index)
        embeds_ll3.matrix = np.random.normal(size=(len(ll3_index),
                                                   embeds_word.shape[1]))
        embeds_ll3.word_index = ll3_index
        embeds_ll3.word_index_reverse = {
            val: key
            for key, val in ll3_index.items()
        }
        embeds_ll3.shape = np.shape(embeds_ll3.matrix)
        embeds_word.save(
            os.path.join(output_dir,
                         'wiki.embeds_lw.{}k'.format(int(max_words / 1000))))
        embeds_ll3.save(
            os.path.join(output_dir,
                         'wiki.embeds_ll3.{}k'.format(int(max_words / 1000))))

        # ====Get text vector====
        pooling = {
            'max': {
                'func': np.max
            },
            'avg': {
                'func': np.sum,
                'normalize': True
            },
            'sum': {
                'func': np.sum,
                'normalize': False
            }
        }
        for p in ['max', 'avg', 'sum']:
            train_df['comment_vec_{}'.format(
                p)] = train_df[seq_col_name_words].apply(
                    lambda x: embed_aggregate(x, embeds_word, **pooling[p]))
            test_df['comment_vec_{}'.format(
                p)] = test_df[seq_col_name_words].apply(
                    lambda x: embed_aggregate(x, embeds_word, **pooling[p]))
        train_df.to_csv(os.path.join(output_dir, 'train_clear1.csv'),
                        index=False)
        test_df.to_csv(os.path.join(output_dir, 'test_clear1.csv'),
                       index=False)
    else:
        for col in train_df.columns:
            if col.startswith('comment_seq'):
                train_df[col] = train_df[col].apply(
                    lambda x: parse_seq(x, int))
                test_df[col] = test_df[col].apply(lambda x: parse_seq(x, int))
            elif col.startswith('comment_vec'):
                train_df[col] = train_df[col].apply(
                    lambda x: parse_seq(x, float))
                test_df[col] = test_df[col].apply(
                    lambda x: parse_seq(x, float))

    logger.debug('Embedding matrix shape = {}'.format(embeds_word.shape))
    logger.debug('Number of null word embeddings = {}'.format(
        np.sum(np.sum(embeds_word.matrix, axis=1) == 0)))

    # ====END OF `PREPROCESS`====
    if mode == 'preprocess':
        return True

    # ====Train/test split data====
    x = np.array(train_df[seq_col_name_words].values.tolist())
    y = np.array(train_df[target_labels].values.tolist())
    x_train_nn, x_val_nn, y_train, y_val, train_idxs, val_idxs = split_data(
        x, y, test_size=0.2, shuffle=True, random_state=42)
    x_test_nn = np.array(test_df[seq_col_name_words].values.tolist())

    x_char = np.array(train_df[seq_col_name_ll3].values.tolist())
    x_char_train_nn = x_char[train_idxs]
    x_char_val_nn = x_char[val_idxs]
    x_char_test_nn = np.array(test_df[seq_col_name_ll3].values.tolist())

    x_train_tfidf = train_df['comment_text_clear'].values[train_idxs]
    x_val_tfidf = train_df['comment_text_clear'].values[val_idxs]
    x_test_tfidf = test_df['comment_text_clear'].values

    catboost_cols = catboost_features(train_df, test_df)
    x_train_cb = train_df[catboost_cols].values[train_idxs].T
    x_val_cb = train_df[catboost_cols].values[val_idxs].T
    x_test_cb = test_df[catboost_cols].values.T

    # ====Train models====
    nn_models = {'cnn': cnn, 'dense': dense, 'rnn': rnn}

    params = Params(config)

    metrics = {}
    predictions = {}
    for param in params['models']:
        for model_label, model_params in param.items():
            if model_params.get('common', {}).get(
                    'warm_start', False) and os.path.exists(
                        model_params.get('common', {}).get('model_file', '')):
                logger.info('{} warm starting...'.format(model_label))
                model = load_model(
                    model_params.get('common', {}).get('model_file', None))
            elif model_label in nn_models:
                model = nn_models[model_label](embeds_word.matrix,
                                               embeds_ll3.matrix,
                                               num_classes,
                                               max_seq_len,
                                               max_char_seq_len,
                                               gpus=gpus,
                                               **model_params['init'])
                model_alias = model_params.get('common', {}).get('alias', None)
                if model_alias is None or not model_alias:
                    model_alias = '{}_{}'.format(model_label, i)
                logger.info("training {} ...".format(model_label))
                if model_label == 'dense':
                    x_tr = [x_train_nn, x_char_train_nn]
                    x_val = [x_val_nn, x_char_val_nn]
                    x_test = [x_test_nn, x_char_test_nn]
                else:
                    x_tr = x_train_nn
                    x_val = x_val_nn
                    x_test = x_test_nn
                hist = train(x_tr,
                             y_train,
                             model,
                             logger=logger,
                             **model_params['train'])
                predictions[model_alias] = model.predict(x_val)
                save_predictions(test_df, model.predict(x_test), target_labels,
                                 model_alias)
            elif model_label == 'tfidf':
                model = TFIDF(target_labels, **model_params['init'])
                model.fit(x_train_tfidf, y_train, **model_params['train'])
                predictions[model_alias] = model.predict(x_val_tfidf)
                save_predictions(test_df, model.predict(x_test_tfidf),
                                 target_labels, model_alias)
            elif model_label == 'catboost':
                model = CatBoost(target_labels, **model_params['init'])
                model.fit(x_train_cb,
                          y_train,
                          eval_set=(x_val_cb, y_val),
                          use_best_model=True)
                predictions[model_alias] = model.predict_proba(x_val_cb)
                save_predictions(test_df, model.predict_proba(x_test_cb),
                                 target_labels, model_alias)
            metrics[model_alias] = get_metrics(y_val, predictions[model_alias],
                                               target_labels)
            logger.debug('{} params:\n{}'.format(model_alias, model_params))
            logger.debug('{} metrics:\n{}'.format(
                model_alias, print_metrics(metrics[model_alias])))
            model.save(
                os.path.join(output_dir, model_params['common']['model_file']))

    logger.info('Saving metrics...')
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as f:
        f.write(json.dumps(metrics))

    # ====END OF `VALIDATE`====
    if mode == 'validate':
        return True

    # Meta catboost
    logger.info('training catboost as metamodel...')

    x_meta = [
        predictions[model_alias] for model_alias in sorted(predictions.keys())
    ]
    x_meta = np.array(x_meta).T  # stack the per-model validation predictions

    x_train_meta, x_val_meta, y_train_meta, y_val_meta = train_test_split(
        x_meta, y_val, test_size=0.20, random_state=42)
    meta_model = CatBoost(target_labels,
                          loss_function='Logloss',
                          iterations=1000,
                          depth=6,
                          learning_rate=0.03,
                          rsm=1)
    meta_model.fit(x_train_meta,
                   y_train_meta,
                   eval_set=(x_val_meta, y_val_meta),
                   use_best_model=True)
    y_hat_meta = meta_model.predict_proba(x_val_meta)
    metrics_meta = get_metrics(y_val_meta, y_hat_meta, target_labels)
    #model.save(os.path.join(output_dir, 'meta.catboost')
    logger.debug('{} metrics:\n{}'.format('META', print_metrics(metrics_meta)))

    # ====Predict====
    logger.info('Applying models...')
    test_cols = []
    for model_alias in sorted(predictions.keys()):
        for label in target_labels:
            test_cols.append('{}_{}'.format(model_alias, label))
    x_test = test_df[test_cols].values

    preds = meta_model.predict_proba(x_test)
    for i, label in enumerate(target_labels):
        test_df[label] = preds[:, i]

    # ====Normalize probabilities====
    if norm_prob:
        for label in target_labels:
            test_df[label] = norm_prob_koef * test_df[label]

    # ====Save results====
    logger.info('Saving results...')
    test_df[[
        'id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]].to_csv(result_fname, index=False, header=True)
    test_df.to_csv('{}_tmp'.format(result_fname), index=False, header=True)
Example #14
0
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import preprocessing as pp

STOPWORD_PATHS = ['./data/stopwords_en.txt', './data/stopwords_sv.txt']
DATA_PATH = './data/job_ad_mails.txt'
TEST_CV_TEXT = 'Joakim är en verksamhetsorienterad konsult inom Microsoft Dynamics 365 med CRM som huvudområde. Han har lång erfarenhet av att implementera CRM inom flera sektorer som en Web/Mobil/Kontaktcenter-lösning och har bl.a. lett ett flertal globala utrullningsprojekt (det största för +1100 användare i över 25 länder).  	Microsoft Dynamics 365 Resco Mobile CRM Verksamhet & processmodellering Kravhantering Lösningsdesign projektledning Testledning	Rådgivning 	Utrullning KURSER/CERTIFIKAT 	Microsoft Dynamics CRM 2015 Applications 	Microsoft Dynamics CRM 2011 Applications 	Microsoft Dynamics CRM 2011 Customization & Configuration 	Siebel 8 Consultant Certified Expert ANSTÄLLNINGAR Claremont Dynamics AB – Projektledare/Business Analyst Microsoft Dynamics 365 Avanade Sweden AB – Projektledare/Business Analyst Microsoft Dynamics 365 Tieto Sweden AB –Business Analyst Microsoft Dynamics 365SPRÅK svenska - modersmål 	Engelska – flytande Joakim var ansvarig för att rulla ut ett nytt Säljstöd baserat på Dynamics 365 V9 och Dynamics egna mobilapplikation till 4 regioner i Europa och Nordamerika. Ansvarsområden inkluderade bl.a. teknisk och funktionell verifiering av systemet, slutanvändarutbildning samt användarstöd under och i anslutning till utrullning. Joakim hade huvudansvaret som Lead Business Analyst och Projektledare under ett implementationsprojekt av ett Sälj- och Kundtjänststöd för en global tillverkare av produkter inom byggnadsindustrin. Lösningen baserades på Dynamics CRM Online och Dynamics egna mobilapplikation och rullades ut i 15 länder spridda över Europa, Asien och Amerika. I samband med utrullningsprojektet tog man också beslut om att uppgradera lösningen från Dynamics CRM 2011 On-Premise till Dynamics CRM Online. Joakim hade huvudansvaret för den funktionella sidan av uppgraderingen vilket inkluderade exempelvis workshop-planering och utförande, kravinsamling och lösningsdesign. Han ledde även testerna samt höll i slutanvändarutbildningar i den uppgraderade lösningen. Joakim hade huvudansvaret som Lead Business Analyst under ett implementationsprojekt av ett Sälj- och Kundtjänststöd för en global tillverkare av utomhusprodukter. Lösningen baserades på Dynamics CRM 2011 On-Premise, Resco Mobile CRM och Microsofts Unified Service Desk och rullades ut till +1100 användare i över 25 länder världen över. Joakims uppgifter innefattade bl.a. att leda utrullningsteamet och vara på plats under alla utrullningar för att förbereda både system och användare (exempelvis med datavalidering, utbildning och användarstöd) och ansvar för överlämning till supportorganisation efter avslutad utrullning.'
TEST_CV = pp.clean_text(TEST_CV_TEXT, STOPWORD_PATHS)

all_mail_data = pp.read_mail_data(DATA_PATH)
cleaned_messages = [TEST_CV]
for mail in all_mail_data:
    cleaned_messages.append(pp.clean_text(mail['message'], STOPWORD_PATHS))

tfidf = TfidfVectorizer()
tfidf_vector = tfidf.fit_transform(cleaned_messages)

model_tf_idf = NearestNeighbors(metric='cosine', algorithm='brute')
model_tf_idf.fit(tfidf_vector)

query_tf_idf = tfidf_vector[0]
distances, indices = model_tf_idf.kneighbors(query_tf_idf, n_neighbors=4)

for indx in indices.flatten():
    print(all_mail_data[indx]['message'])
    print()
Example #15
0
    TEXT_COLUMN = "tweet"
    LABEL_COLUMN = "label"

    configParser = configparser.RawConfigParser()
    configFilePath = "config.txt"
    configParser.read(configFilePath)

    EMBEDDING_FILE = configParser.get('model-config', 'EMBEDDING_FILE')
    MODEL_PATH = configParser.get('model-config', 'MODEL_PATH')
    PREDICTION_FILE = configParser.get('model-config', 'PREDICTION_FILE')

    print("Removing usernames")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: remove_names(x))
    test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: remove_names(x))

    train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: clean_text(x))
    test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: clean_text(x))

    train['doc_len'] = train[TEXT_COLUMN].apply(
        lambda words: len(words.split(" ")))
    max_seq_len = np.round(train['doc_len'].mean() +
                           train['doc_len'].std()).astype(int)

    embed_size = 300  # how big is each word vector
    max_features = None  # how many unique words to use (i.e. number of rows in the embedding matrix)
    maxlen = max_seq_len  # max number of words in a text to use #99.99%

    # fill up the missing values
    X = train[TEXT_COLUMN].fillna("_na_").values
    X_test = test[TEXT_COLUMN].fillna("_na_").values
Example #16
0
def plot_cvs(cv_path, stopwords):
    """
	Läser in alla word-filer som finns i mappen "cv_path" och visualiserar datan
	genom att göra en dimensionell reduktion till 2D och plotta både k-means kluster
	och alla bolag för sig. Argumentet "stopwords" ska vara en lista med sökvägar till
	text dokument innehållande de stoppord som önskas användas.
	"""

    # Läser in word filer och städar upp med angivna stoppord.
    cv_list = []
    lbl_indx = []
    lbl_cntr = -1
    dir_path = ''
    for subdir, dirs, files in os.walk(cv_path):
        for file in files:
            cv_filepath = subdir + os.sep + file
            if dir_path != subdir:
                lbl_cntr += 1
            cleaned_cv = pp.clean_text(pp.read_word_file(cv_filepath),
                                       stopwords)
            #lemmalized = pp.lemmatization_sv(cleaned_cv)
            #cv_list.append(lemmalized)
            cv_list.append(cleaned_cv)
            lbl_indx.append(lbl_cntr)
            dir_path = subdir

    # Create a TF-IDF vector
    tfidf = TfidfVectorizer()
    tfidf_vector = tfidf.fit_transform(cv_list)

    # Find clusters with k-means
    km = KMeans(n_clusters=17,
                init='k-means++',
                max_iter=100,
                n_init=5,
                verbose=1)
    km.fit(tfidf_vector)

    # Dimensionality reduction to 2D
    tfs_reduced = TruncatedSVD(n_components=17,
                               random_state=0).fit_transform(tfidf_vector)
    tfs_embedded = TSNE(n_components=2, perplexity=40,
                        verbose=2).fit_transform(tfs_reduced)

    # Plot the clusters
    fig = plt.figure(figsize=(10, 10))
    ax1 = plt.axes()
    plt.scatter(tfs_embedded[:, 0],
                tfs_embedded[:, 1],
                marker="x",
                c=km.labels_)
    plt.show()
    plt.close()

    # Plot the different companies
    labels = [
        'CA', 'CAB', 'CAD', 'CBD', 'CBS', 'CBT', 'CDB', 'CDS', 'CEC', 'CED',
        'CIM', 'CLD', 'CNP', 'CNS', 'CQM', 'CQS', 'CXD'
    ]
    markers = [
        'o', 'v', '^', '<', '>', 's', 'p', '*', '+', 'x', 'X', 'D', '1', '2',
        '3', '4', '_'
    ]
    colors = cm.rainbow(np.linspace(0, 1, 18))
    fig = plt.figure(figsize=(10, 10))
    ax1 = plt.axes()
    lbl = labels[0]
    ax1.scatter(tfs_embedded[0, 0],
                tfs_embedded[0, 1],
                marker=markers[0],
                color=colors[lbl_indx[0]],
                label=lbl)
    for indx in range(1, len(tfs_embedded[:, 0])):
        if lbl == labels[lbl_indx[indx]]:
            ax1.scatter(tfs_embedded[indx, 0],
                        tfs_embedded[indx, 1],
                        marker=markers[lbl_indx[indx]],
                        color=colors[lbl_indx[indx]])

        else:
            lbl = labels[lbl_indx[indx]]
            ax1.scatter(tfs_embedded[indx, 0],
                        tfs_embedded[indx, 1],
                        marker=markers[lbl_indx[indx]],
                        color=colors[lbl_indx[indx]],
                        label=lbl)
    ax1.legend()
    plt.show()
Example #17
0
def clean_and_tokenize(x):
    return prep.port_tokenizer(prep.clean_text(x), var, genes)
Example #18
0
    def _get_vectors(self, X):
        text_data = X[self.column]
        text_data = [prepro.clean_text(text)
                     for text in text_data]  # text cleaning
        feature_vector = self.vectorizer.transform(text_data).toarray()
        return feature_vector
from sklearn.metrics import accuracy_score

from gensim.models.word2vec import FAST_VERSION 
FAST_VERSION=1

import sys
sys.path.append('../lib/')

file=[]
path=r'rvm.txt' # path to file
for string in open(path,'r',encoding='cp1251'):
    file.append(string.lower())
    

file_split=split_file(file)
text=clean_text([file_split[i][0] for i in range(len(file_split))]) # remove symbols in text
clear_text=remove_stopwords(text) # remove stop-words in text
s=func_lemma(func_container(clear_text)) # lemmatization procedure
w=func_tokenize(s) # w train dataset after preprocessing procedure


path=r'lenta-ru-news.csv' # path to test dataset
df = pd.read_csv(path,engine='python', delimiter=',',encoding = "utf-8-sig")

# plot topic news distribution
y_pos=np.arange(len(df['topic'].value_counts()))
performance=df['topic'].value_counts()
plt.figure(figsize=(8,6))
plt.bar(y_pos,performance,align='center',alpha=0.5,color='g',width=0.8)
plt.xticks(y_pos,df['topic'].value_counts().index.tolist(),rotation=90,size=15)
plt.yticks(size=15)
Example #20
0
    def create_response(self, question):
        question = np.expand_dims(self.tokenizer.tokenize_sequence(
            clean_text(question)),
                                  axis=0)
        result = self.predict_sentence(question)
        return result
Example #21
0
def newswire_scrape(symbol):
    '''
    Web-scraping function that goes through articles related to the given ticker
    symbol and stores the textual results in a folder.
    :param symbol: String of ticker symbol
    :return: Saves textual data directly to the folder
    '''
    URL = "https://www.prnewswire.com/search/news/?keyword=" + symbol
    print("Checking on: " + URL)
    page_number = 0

    # Creating folder structure and setting output path
    module_path = os.path.dirname(os.path.realpath(__file__))
    output_dir = os.path.join(module_path, 'data/articles/' + symbol)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    article_number = 0

    # Loop over articles until no more found.
    while True:
        page_number += 1
        try:
            page = requests.get(URL + '&pagesize=25&page=' + str(page_number))
        except ValueError:
            print('URL not found')
            break

        soup = BeautifulSoup(page.content, 'html.parser')
        press_results = soup.find(class_='container search-results-text')
        press_results = press_results.find_all('a', 'news-release', 'href')
        print("result on page: " + str(len(press_results)))
        if len(press_results) == 0:
            break

        # Checking every single article in search results
        for result in press_results:
            print("Checking: " + result['href'])

            # Initializing article information values
            URL_ = "https://www.prnewswire.com" + result['href']
            page_ = requests.get(URL_)
            soup_ = BeautifulSoup(page_.content, 'html.parser')
            info = soup_.find(class_='container release-header')
            title = info.find('h1')
            subtitle = info.find('p', 'subtitle')
            date = info.find('p', 'mb-no')

            # Stripping additional content from webpage and only keeping text paragraphs
            body = soup_.find(class_='release-body container')
            paragraphs = body.find_all('p')

            # Cumulatively building article content string
            clean_body = ''
            for para in paragraphs:
                if para.string:
                    clean_body += para.string

            info = clean_missing(title, subtitle, date)
            art = Article(preprocessing.clean_text(info[0]), preprocessing.clean_text(info[1]), info[2], preprocessing.clean_text(clean_body))

            # Saving the article to the specified output path
            output_path = os.path.join(output_dir, 'article_' + str(article_number) + '.txt')
            article_number += 1
            with open(output_path, 'w+') as article_file:
                json.dump(art.to_dict(), article_file)
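
A minimal call sketch for newswire_scrape; the ticker symbols are placeholders, and it assumes the data/articles/ directory already exists (the function only creates the per-symbol subfolder).

# Placeholder tickers; each call writes article_<n>.txt files under data/articles/<symbol>.
if __name__ == "__main__":
    for ticker in ["MSFT", "TSLA"]:
        newswire_scrape(ticker)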
Example #22
0
def run_keras_experiment():
    print('Reading files')

    # Reading File Section - This should change
    train_2018 = pd.read_csv("data/german/germeval2018.training.txt",
                             sep='\t',
                             names=['tweet', 'sub_task_1', 'sub_task_2'])

    train_2019 = pd.read_csv("data/german/germeval2019_training_subtask12.txt",
                             sep='\t',
                             names=['tweet', 'sub_task_1', 'sub_task_2'])

    train = pd.concat([train_2018, train_2019])

    test = pd.read_csv("data/german/germeval2018.test_.txt",
                       sep='\t',
                       names=['tweet', 'sub_task_1', 'sub_task_2'])

    test_2019 = pd.read_csv("data/german/germeval2019_Testdata_Subtask12.txt",
                            sep='\t',
                            names=['tweet'])

    print('Completed reading')

    #############
    print("Train shape : ", train.shape)
    print("Test shape : ", test.shape)
    print("2019 Test shape :", test_2019.shape)

    # Variables

    TEXT_COLUMN = "tweet"
    LABEL_COLUMN = "sub_task_1"

    configParser = configparser.RawConfigParser()
    configFilePath = "config.txt"
    configParser.read(configFilePath)

    EMBEDDING_FILE = configParser.get('sub_task_1_model-config',
                                      'EMBEDDING_FILE')
    MODEL_PATH = configParser.get('sub_task_1_model-config', 'MODEL_PATH')
    PREDICTION_FILE = configParser.get('sub_task_1_model-config',
                                       'PREDICTION_FILE')

    print(train.head())

    print("Removing usernames")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: remove_names(x))
    test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: remove_names(x))
    print(train.head())
    #
    # print("Identifying names")
    #
    # train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: entity_recognizing(x))
    # test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: entity_recognizing(x))
    # print(train.head())

    print("Converting to lower-case")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].str.lower()
    test[TEXT_COLUMN] = test[TEXT_COLUMN].str.lower()
    test_2019[TEXT_COLUMN] = test_2019[TEXT_COLUMN].str.lower()
    print(train.head())

    print("Cleaning punctuation marks")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: clean_text(x))
    test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: clean_text(x))
    test_2019[TEXT_COLUMN] = test_2019[TEXT_COLUMN].apply(
        lambda x: clean_text(x))
    print(train.head())

    train['doc_len'] = train[TEXT_COLUMN].apply(
        lambda words: len(words.split(" ")))
    max_seq_len = np.round(train['doc_len'].mean() +
                           train['doc_len'].std()).astype(int)

    embed_size = 300  # how big is each word vector
    max_features = None  # how many unique words to use (i.e. number of rows in the embedding matrix)
    maxlen = max_seq_len  # max number of words in a text to use #99.99%

    # fill up the missing values
    X = train[TEXT_COLUMN].fillna("_na_").values
    X_test = test[TEXT_COLUMN].fillna("_na_").values
    X_test_2019 = test_2019[TEXT_COLUMN].fillna("_na_").values

    # Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features, filters='')
    tokenizer.fit_on_texts(list(X))

    X = tokenizer.texts_to_sequences(X)
    X_test = tokenizer.texts_to_sequences(X_test)
    X_test_2019 = tokenizer.texts_to_sequences(X_test_2019)

    # Pad the sentences
    X = pad_sequences(X, maxlen=maxlen)
    X_test = pad_sequences(X_test, maxlen=maxlen)
    X_test_2019 = pad_sequences(X_test_2019, maxlen=maxlen)

    # Get the target values
    Y = train[LABEL_COLUMN].values

    le = LabelEncoder()

    le.fit(Y)
    encoded_Y = le.transform(Y)

    word_index = tokenizer.word_index
    max_features = len(word_index) + 1

    print('Loading Embeddings')

    embedding_matrix = get_emb_matrix(word_index, max_features, EMBEDDING_FILE)

    print('Finished loading Embeddings')

    print('Start Training')

    kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
    bestscore = []
    y_test = np.zeros((X_test.shape[0], ))
    y_test_2019 = np.zeros((X_test_2019.shape[0], ))
    for i, (train_index, valid_index) in enumerate(kfold.split(X, encoded_Y)):
        X_train, X_val, Y_train, Y_val = X[train_index], X[
            valid_index], encoded_Y[train_index], encoded_Y[valid_index]
        filepath = MODEL_PATH
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_loss',
                                     verbose=2,
                                     save_best_only=True,
                                     mode='min')
        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.6,
                                      patience=1,
                                      min_lr=0.0001,
                                      verbose=2)
        earlystopping = EarlyStopping(monitor='val_loss',
                                      min_delta=0.0001,
                                      patience=2,
                                      verbose=2,
                                      mode='auto')
        callbacks = [checkpoint, reduce_lr]
        model = pooled_gru(maxlen, max_features, embed_size, embedding_matrix,
                           1)
        if i == 0: print(model.summary())
        model.fit(
            X_train,
            Y_train,
            batch_size=64,
            epochs=20,
            validation_data=(X_val, Y_val),
            verbose=2,
            callbacks=callbacks,
        )
        model.load_weights(filepath)
        y_pred = model.predict([X_val], batch_size=64, verbose=2)
        y_test += np.squeeze(model.predict([X_test], batch_size=64,
                                           verbose=2)) / 5
        y_test_2019 += np.squeeze(
            model.predict([X_test_2019], batch_size=64, verbose=2)) / 5
        f1, threshold = f1_smart(np.squeeze(Y_val), np.squeeze(y_pred))
        print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(f1, threshold))
        bestscore.append(threshold)

    print('Finished Training')

    y_test = y_test.reshape((-1, 1))
    pred_test_y = (y_test > np.mean(bestscore)).astype(int)
    test['predictions'] = le.inverse_transform(pred_test_y)

    y_test_2019 = y_test_2019.reshape((-1, 1))
    pred_test_y_2019 = (y_test_2019 > np.mean(bestscore)).astype(int)

    test_2019_temp = pd.read_csv(
        "data/german/germeval2019_Testdata_Subtask12.txt",
        sep='\t',
        names=['tweet'])

    test_2019['predictions'] = le.inverse_transform(pred_test_y_2019)
    test_2019['tweet'] = test_2019_temp['tweet']

    # save predictions
    file_path = PREDICTION_FILE
    test_2019.to_csv(file_path,
                     sep='\t',
                     encoding='utf-8',
                     header=False,
                     index=False)

    print('Saved Predictions')

    # post analysis
    tn, fp, fn, tp = confusion_matrix(test[LABEL_COLUMN],
                                      test['predictions']).ravel()
    weighted_f1 = f1_score(test[LABEL_COLUMN],
                           test['predictions'],
                           average='weighted')
    accuracy = accuracy_score(test[LABEL_COLUMN], test['predictions'])
    weighted_recall = recall_score(test[LABEL_COLUMN],
                                   test['predictions'],
                                   average='weighted')
    weighted_precision = precision_score(test[LABEL_COLUMN],
                                         test['predictions'],
                                         average='weighted')

    print("Confusion Matrix (tn, fp, fn, tp) {} {} {} {}".format(
        tn, fp, fn, tp))
    print("Accuracy ", accuracy)
    print("Weighted F1 ", weighted_f1)
    print("Weighted Recall ", weighted_recall)
    print("Weighted Precision ", weighted_precision)
                    # Text stemming. (The lines above this call were cut off in the
                    # source snippet; they are inferred from the parallel blocks below.)
                    start = time.time()
                    text_stemmed = preprocessing.stem_text(
                        row['newspaper_text'], False)
                    end = time.time()
                    time_stemmed = (end - start) * 1000

                    # Text stemming without stopwords.
                    start = time.time()
                    text_stemmed_without_stopwords = preprocessing.stem_text(
                        row['newspaper_text'], True)
                    end = time.time()
                    time_stemmed_without_stopwords = (end - start) * 1000

                    # Text stemming without stopwords and aggressive cleaning.
                    start = time.time()
                    text_stemmed_without_stopwords_aggr = preprocessing.clean_text(
                        row['newspaper_text'],
                        keep_stopwords=False,
                        use_stemming=True,
                        use_lemmatization=False)
                    end = time.time()
                    time_stemmed_without_stopwords_aggr = (end - start) * 1000

                    # Text lemmatization.
                    start = time.time()
                    text_lemmatized = preprocessing.lemmatize_text(
                        row['newspaper_text'], False)
                    end = time.time()
                    time_lemmatized = (end - start) * 1000

                    # Text lemmatization without stopwords.
                    start = time.time()
                    text_lemmatized_without_stopwords = preprocessing.lemmatize_text(
                        row['newspaper_text'], True)  # arguments inferred from the parallel blocks above
                    end = time.time()
                    time_lemmatized_without_stopwords = (end - start) * 1000
def main():
    """Tweet Classifier App with Streamlit """

    # Creates a main title and subheader on your page -
    # these are static across all pages
    display_image('undraw_welcome_cats_thqn', '')
    st.title("Sentiment Analysis on Climate Change")

    st.subheader("Should your business be Eco-friendly?")
    st.markdown("""
        This platform helps you make data-driven decisions. Find out how your customers feel about climate change.
    """)

    df = clean_text(raw)

    data = None
    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")

    # Creating sidebar with selection box -
    # you can create multiple pages this way
    options = ["Information", "Insights", "Predictions"]
    selection = st.sidebar.selectbox("Menu", options)

    # Building out the "Insights" page
    if selection == "Insights":
        title = 'Below are some data visualisations and Insights extracted from the tweets'
        display_image('undraw_google_analytics_a57d', title)
        st.write("## **Wordcloud Visualisations**")
        visualize_data(df)

        st.write(
            "### **The barplots below shows the most common words per category**"
        )
        options = st.multiselect(
            'Select tweet category to visualize with BarPlot:',
            ['Pro', 'Anti', 'Neutral', 'News'], ['Pro'])
        for sentiment in options:
            common_words(df, sentiment, f'{sentiment} Tweets')

        st.subheader("Observations")
        st.write("""
            * Climate Change and Global warming appear to be the most popular words amongst these tweets.
                """)

        extract_hash(df)
        #plot_pie(df_pol, 'Political View')

        st.subheader("Observations")
        st.write("""
            * Investigating individual words still shows that there is an overlap of most used words between the classes.
            * However, it does become very apparent that there are themes that help formulate or form tweeters opinions on twitter.
            * Seeing words such as Trump, Obama would lead one to believe that there is a political connection to what people tweet about climate change.
            * We can also see the word 'husband' appearing as most common under the pro tweets, this shows that the climate change topic is being discussed amongst families as well, or that people do think about climate change in relation to people close to them.
            * We can then also assume that there is perhaps a social aspect to how people form their opinion on climate change.
            * Hashtags provide more context, as people will most usually tweet under a certain hashtag as a means of making it easier to find information with a theme or specific context.
            """)

    # Building out the "Information" page
    if selection == "Information":
        display_image('undraw_my_code_snippets_lynx',
                      'Find out what users say about your business')

        information_view(pd.read_csv("resources/train.csv"), df)

        if uploaded_file is not None:
            st.markdown("""
                ## Your new dataset.
            """)
            data = pd.read_csv(uploaded_file)
            st.table(data.message.head())

    # Building out the predication page
    if selection == "Predictions":
        model_prediction_view()
Example #25
0
def main():

    print("Loading data...")
    train_data = pd.read_csv(conf.train_data_path)
    test_data = pd.read_csv(conf.x_test_data_path)

    print("Cleaning text...")
    train_data["comment_text"] = train_data["comment_text"].apply(
        TextCleaner.clean_text2)
    test_data["comment_text"] = test_data["comment_text"].apply(
        TextCleaner.clean_text2)
    print("remove_stop_words ", remove_stop_words)
    print("stem_words", stem_words)
    train_data["comment_text"] = train_data["comment_text"].apply(
        lambda x: TextCleaner.clean_text(
            x, remove_stop_words=remove_stop_words, stem_words=stem_words))
    test_data["comment_text"] = test_data["comment_text"].apply(
        lambda x: TextCleaner.clean_text(
            x, remove_stop_words=remove_stop_words, stem_words=stem_words))

    #     extra preprocessing
    print('extra cleaning...')
    swear_words = load_data(swear_words_fname,
                            func=lambda x: set(x.T[0]),
                            header=None)
    wrong_words_dict = load_data(wrong_words_fname,
                                 func=lambda x: {val[0]: val[1]
                                                 for val in x})
    tokinizer = RegexpTokenizer(r'\w+')
    regexps = [
        re.compile("([a-zA-Z]+)([0-9]+)"),
        re.compile("([0-9]+)([a-zA-Z]+)")
    ]
    train_data["comment_text"] = clean_text(train_data["comment_text"],
                                            tokinizer, wrong_words_dict,
                                            regexps)
    train_data["comment_text"] = clean_text(train_data["comment_text"],
                                            tokinizer, wrong_words_dict,
                                            regexps)

    list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
    list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
    y_train = train_data[CLASSES].values

    print("Tokenizing sentences in train set...")
    tokenized_sentences_train, words_dict = tokenize_sentences(
        list_sentences_train, {})

    print("Tokenizing sentences in test set...")
    tokenized_sentences_test, words_dict = tokenize_sentences(
        list_sentences_test, words_dict)

    words_dict[UNKNOWN_WORD] = len(words_dict)

    print("Loading embeddings...")
    embedding_list, embedding_word_dict = read_embedding_list(
        conf.embedding_path)
    embedding_size = len(embedding_list[0])

    print("Preparing data...")
    embedding_list, embedding_word_dict = clear_embedding_list(
        embedding_list, embedding_word_dict, words_dict)

    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append([0.] * embedding_size)
    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append([-1.] * embedding_size)

    embedding_matrix = np.array(embedding_list)
    print("saving embedding matrix")
    data_util.save_embedding_matrix(embedding_matrix)

    id_to_word = dict((id, word) for word, id in words_dict.items())
    train_list_of_token_ids = convert_tokens_to_ids(tokenized_sentences_train,
                                                    id_to_word,
                                                    embedding_word_dict,
                                                    sentences_length)
    test_list_of_token_ids = convert_tokens_to_ids(tokenized_sentences_test,
                                                   id_to_word,
                                                   embedding_word_dict,
                                                   sentences_length)
    x_train = np.array(train_list_of_token_ids)
    x_test = np.array(test_list_of_token_ids)

    print('x_train:', x_train.shape, ', y_train:', y_train.shape,
          ', x_test.shape', x_test.shape)
    print("Save train and test data...")
    data_util.save_processed_dataset(x_train, y_train, x_test)