Example #1
def predict_class(text):
    logging.info(f"Input text: {text}")
    logging.info("cleaning input text")
    text = clean_text(text)
    sentence = [text]
    tokenizer = load_tokenizer()
    logging.info("trained tokenizer loaded")
    word_index = tokenizer.word_index
    vocab_size = len(word_index)
    text_sequences = tokenizer.texts_to_sequences(sentence)
    text_padded = pad_sequences(text_sequences,
                                padding=PADDING_TYPE,
                                truncating=TRUNC_TYPE,
                                maxlen=MAX_LENGTH)

    logging.info("creating embedding matrix using Glove Embeddings")
    embedding_matrix = embedding_matrix_glove(word_index)
    logging.info(f"Embeddings Weights created {embedding_matrix.shape}")

    logging.info("getting pre-trained model")
    model = create_model(vocab_size, EMBEDDING_DIM, MAX_LENGTH,
                         embedding_matrix)
    model.summary()  # summary() prints the model itself; wrapping it in print() only adds "None"
    logging.info("loading model weights")
    model.load_weights(MODEL)

    predict = model.predict(text_padded)
    predict = np.argmax(predict)

    predicted_main_product = get_main_product(predict)
    logging.info(f'Predicted Main Product: {predicted_main_product}')
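A note on reuse: predict_class rebuilds the model and the GloVe embedding matrix on every call. Below is a minimal sketch, using only the helper names already shown above, of loading those artifacts once at module level and reusing them for repeated predictions; treat it as an illustration rather than the project's actual code.

_tokenizer = load_tokenizer()
_embedding_matrix = embedding_matrix_glove(_tokenizer.word_index)
_model = create_model(len(_tokenizer.word_index), EMBEDDING_DIM, MAX_LENGTH,
                      _embedding_matrix)
_model.load_weights(MODEL)

def predict_class_cached(text):
    # Same preprocessing as predict_class, but against the cached artifacts.
    padded = pad_sequences(_tokenizer.texts_to_sequences([clean_text(text)]),
                           padding=PADDING_TYPE, truncating=TRUNC_TYPE,
                           maxlen=MAX_LENGTH)
    return get_main_product(np.argmax(_model.predict(padded)))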
Example #2
File: data.py Project: NISH1001/tweetypie
def clean(emoticons, src="data/tweets/original", dest="data/tweets/clean/"):
    """
        Clean Tweet:
            - remove @username mentions
            - remove http
            - expand basic contractions like we'll -> we will
            - remove emoticons
    """
    print("Cleaing tweets...")
    src = os.path.join(src, '*.js')
    files = sorted(glob.glob(src))
    for filename in files:
        print("Loading from {}".format(filename))
        fn = os.path.splitext(os.path.basename(filename))[0]
        with open(filename) as f:
            f.readline()
            data = []
            for d in json.load(f):
                text = d['text']
                text = add_sentence_boundary(text)
                text = clean_text(text)
                text = remove_emoticons(text, emoticons)
                data.append({'text': text, 'created_at': d['created_at']})
        dumpname = os.path.join(dest, fn) + '.json'
        print("Dumping to {}".format(dumpname))
        with open(dumpname, 'w') as f:
            json.dump(data, f)
Example #3
    def parse_doc(self, input):
        '''
            Assume input is a sequence of sentences.
            split multiple sentences and apply nlp parse
        '''
        doc = self.nlp(clean_text(input))
        return [self.parse_sentence(sent.text) for sent in doc.sents]
Example #4
def main():
    global hidden_size, data, English, vectors, words, word2idx, glove

    hidden_size = 100

    vectors = bcolz.open(f'Embedding/6B.300d.dat')[:]
    words = pickle.load(open(f'Embedding/6B.300_words.pkl', 'rb'))
    word2idx = pickle.load(open(f'Embedding/6B.300_idx.pkl', 'rb'))
    glove = {w: vectors[word2idx[w]] for w in words}

    print("Imported Embedding Data")

    fullData = pd.read_csv("dataset.csv")

    print("Imported Full Data")

    data = []

    English = Lang('English')
    print("Counting words...")
    for article, toxicity in zip(fullData['article'], fullData['toxicity']):
        English.addSentence(article)
        data.append(list([article, toxicity]))
    print("Counted Words:")
    print(English.name, English.n_words)

    train, test = train_test_split(data,
                                   random_state=42,
                                   test_size=0.4,
                                   shuffle=True)

    print("Data Split")

    target_vocab = English.word2index.keys()

    weights_matrix = get_weights_matrix(target_vocab)
    encoder1 = EncoderRNN(weights_matrix, English.n_words,
                          hidden_size).to(device)
    fnn1 = FullyConnectedNN(hidden_size).to(device)
    encoder1.load_state_dict(torch.load('cpu_encoder.pt'))
    fnn1.load_state_dict(torch.load('cpu_fcn.pt'))
    sentence = sys.argv[1]
    sentence = clean_text(sentence)
    output = evaluate(encoder1, fnn1, sentence, English)
    output = F.sigmoid(output)
    if output >= 0.7:
        output = 1
    else:
        output = 0
    print(output)
Example #5
def pre_news_raw_text(news_id):
    title_text = df_analysis.loc[df_analysis['id'] == news_id,
                                 'title'].values[0]
    src_text = df_analysis.loc[df_analysis['id'] == news_id,
                               'source'].values[0]
    src_text = src_text + ".json"
    text_dir = st.d_news_text
    text_path = text_dir + st.d_delimiter + src_text

    special = r"(\W|\\u[\w][\w][\w][\w])*"

    title_text = title_text.replace('[', '\\[')
    title_text = title_text.replace(']', '\\]')
    title_text = title_text.replace('?', '\\?')
    title_text = title_text.replace('(', '\\(')
    title_text = title_text.replace(')', '\\)')
    title_text = title_text.replace('.', '\\.')
    title_text = title_text.replace(',', '\\,')
    title_text = title_text.replace('"', '\\"')

    title_text = title_text.replace(' ', special)
    re_title = r"title\":.*" + title_text + r".*(date|content)\""

    l_text = []
    with open(text_path) as f:
        l_text = [l for l in f if re.search(re_title, l)]

        for index, text in enumerate(l_text):
            tmp_text = text
            tmp_text = pre.clean_convertion(tmp_text)
            tmp_text = pre.clean_links_text(tmp_text)
            tmp_text = pre.clean_text(tmp_text)
            tmp_text = pre.clean_escape_char(tmp_text)
            ''' Clean up return line '''
            # re.sub's 4th positional argument is count, not flags, so the
            # original call quietly capped the number of replacements;
            # compiling and calling .sub replaces every literal "\n" at once.
            escape_char = re.compile(r'\\n')
            tmp_text = escape_char.sub(' ', tmp_text).strip()

            l_text[index] = tmp_text

    return list(OrderedDict.fromkeys(l_text))
Example #6
    def words_pos(self, input):
        """
            add pos or entity info to each word in the input
        """
        doc = self.nlp(clean_text(input))
        self.join_entity(doc)
        strings = []
        for sent in doc.sents:
            if sent.text.strip():
                strings.append(' '.join(
                    self.represent_word(w) for w in sent if not w.is_space))
        if strings:
            return '\n'.join(strings) + '\n'
        else:
            return ''
Example #7
def fetch(page):
    result = newsapi.get_everything(sources=sources, language='en', page=page)
    articles = result['articles']
    for article in articles:
        if (not article['title']) or (not article['description']):
            continue
        values = [
            article['source']['name'], article['author'],
            dumps(article['title']),
            dumps(article['description']), article['url'],
            article['urlToImage'], article['publishedAt']
        ]
        text = clean_text(crawl(values[4], article['source']['id']))
        category = model.predict(vect.transform([text]))[0]
        insert(values, category)
        print(category)
        print('=' * len(category))
        print(text)
        print('')
Example #8
    def parse_sentence(self, input):
        '''
            parse each sentence and return the spacy's nlp properties
            Pipeline: sentence |> nlp.tokenizer |> nlp.tagger |> nlp.parser |> nlp.entity
        '''
        line = self.nlp(clean_text(input))

        keyphrases = self.find_keyphrases(line)
        output = {
            'text': line.text,
            'entities': [(entity.text, entity.label_) for entity in line.ents],
            # Given we will be dealing with small text, I am assuming we do not
            # need to compute word frequency to find keywords
            'keyphrases': list(keyphrases),
            'words': [(w.text, w.tag_, w.pos_, w.ent_type_, w.dep_)
                      for w in line],
            'persons': [(entity.text, extract_entity(entity.text)["data"])
                        for entity in line.ents if entity.label_ == 'PERSON'],
            'products': [entity.text for entity in line.ents
                         if entity.label_ == 'PRODUCT'],
            'organizations': [(entity.text, extract_entity(entity.text)["data"])
                              for entity in line.ents if entity.label_ == 'ORG'],
            'medias': [entity.text for entity in line.ents
                       if entity.label_ == 'WORK_OF_ART'],
            'locations': [entity.text for entity in line.ents
                          if entity.label_ == 'GPE'],
            'info': extract_info(line.text),
            'nouns': [w.text for w in line if w.tag_ == 'NN'],
            'action_verbs': [w.text for w in line if w.tag_ == 'VB'],
            'subject': [w.text for w in line if w.dep_ == 'nsubj'],
            # note: spaCy has no 'nobj' dependency label (direct objects are
            # 'dobj'), so this entry will always be empty as written
            'object': [w.text for w in line if w.dep_ == 'nobj']
        }

        return output
Example #9
def evaluate(sentence, samp_type=1):
    sentence = clean_text(sentence)
    inputs = []
    # split the sentence and replace unknown words by <unk> token.
    for i in sentence.split(' '):
        try:
            inputs.append(inp_lang.word_index[i])
        except KeyError:
            inputs.append(inp_lang.word_index['<unk>'])
    
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_sentence_length, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''
    enc_output, enc_hidden = encoder(inputs)
    
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
    
    for t in range(max_sentence_length):
        predictions, dec_hidden = decoder([enc_output, dec_hidden, dec_input])
        if samp_type == 1:
            # greedy sampling: always take the highest-probability token
            predicted_id = tf.argmax(predictions[0]).numpy()
        elif samp_type == 2:
            # sample from the full output distribution
            predicted_id = np.random.choice(vocab_tar_size,
                                            p=predictions[0].numpy())
        elif samp_type == 3:
            # top-k sampling: pick uniformly among the 3 most likely tokens
            _, indices = tf.math.top_k(predictions[0], k=3)
            predicted_id = np.random.choice(indices)

        if predicted_id != 0:
            if targ_lang.index_word[predicted_id] == '<end>':
                return result, sentence
            else:
                result += targ_lang.index_word[predicted_id] + ' '
        
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence
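A hedged usage sketch of evaluate, assuming the encoder, decoder and both tokenizers (inp_lang, targ_lang) are already built as in the surrounding project: samp_type=1 decodes greedily and is deterministic, while samp_type=3 samples among the top 3 tokens, so repeated calls can produce different replies.

reply_greedy, cleaned = evaluate("how are you doing today", samp_type=1)
reply_topk, _ = evaluate("how are you doing today", samp_type=3)
print(cleaned)       # cleaned input sentence
print(reply_greedy)  # deterministic greedy reply
print(reply_topk)    # stochastic top-k reply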
Example #10
    def find_keyphrases(self, line):
        if not line.text:
            line = self.nlp(clean_text(line))

        keyphrases = set([word.text for word in line.ents])

        # add nouns that have word_vectors
        nouns = [w.text for w in line if w.tag_ in ['NN', 'NNP']]
        # add noun chunks that have word vectors
        candidates = [word.text for word in line.noun_chunks]

        # add noun_chunks if noun in noun chunks else add noun
        if nouns and candidates:
            for noun in nouns:
                add_noun = True
                for candidate in candidates:
                    if noun in candidate:
                        add_noun = False
                        keyphrases.add(candidate)
                if add_noun:
                    keyphrases.add(noun)

        return keyphrases
Example #11
index = 1
for category in categories:
    path = './dataset/%s' % (category, )
    files = listdir(path)
    if (len(files) < 81):
        continue
    count = 0
    for file in files:
        with open(path + '/' + file, 'r') as fp:
            data = json.loads(fp.read())
        for article in data['posts']:
            if (count >= limit):
                break
            text = ' '.join([
                Word(word).lemmatize()
                for word in clean_text(article['text']).split()
            ])
            if (not len(text)):
                continue
            x.append(text)
            y.append(category)
            count += 1
        else:
            continue
        break
    print(colored([index, category, count], 'green'))
    index += 1

dataset = [x, y]

with open('dataset.json', 'w') as fp:
Example #12
from preprocess import apply_lemmatization, apply_stemming, extract_n_grams
from vectorizer import create_TF_IDF_matrix
from cluster import k_mean

test_file_name = '../data/Test Corpus/Test Corpus.txt'
stopwords_file = '../data/Stopwords/Basic Stopwords List.txt'

# 1. a
lines = get_file_lines(test_file_name)
print("***********Test-1***********")
print(lines)
print("****************************")

# 1. b
for i in range(len(lines)):
    lines[i] = clean_text(lines[i])
print("***********Test-2***********")
print(lines)
print("****************************")

# 1.c
for i in range(len(lines)):
    lines[i] = remove_stopwords(stopwords_file, lines[i], do_clean=True)
print("***********Test-3***********")
print(lines)
print("****************************")

# 1.d
for i in range(len(lines)):
    lines[i] = apply_stemming(lines[i])
    lines[i] = apply_lemmatization(lines[i])
Example #13
def launch_model():
    full_text = request.form['full_text']
    id_ = request.form['id']
    model_type = request.form['model_type']

    global BERT, JOINT, GRANU, MGN, NUM_TASK, MASKING, HIER
    BERT = model_type == BERT_PATH
    JOINT = model_type == JOINT_BERT_PATH
    GRANU = model_type == GRANU_BERT_PATH
    MGN = model_type == MGN_SIGM_BERT_PATH

    # either of the four variants:
    # BERT = False
    # JOINT = False
    # GRANU = False
    # MGN = True

    assert BERT or JOINT or GRANU or MGN
    assert not (BERT and JOINT) and not (BERT and GRANU) and not (BERT and MGN) \
           and not (JOINT and GRANU) and not (JOINT and MGN) and not (GRANU and MGN)

    # either of the two variants
    SIGMOID_ACTIVATION = True
    RELU_ACTIVATION = False
    assert not (SIGMOID_ACTIVATION and RELU_ACTIVATION) and (
        SIGMOID_ACTIVATION or RELU_ACTIVATION)

    if BERT:
        NUM_TASK = 1
        MASKING = 0
        HIER = 0
    elif JOINT:
        NUM_TASK = 2
        MASKING = 0
        HIER = 0
    elif GRANU:
        NUM_TASK = 2
        MASKING = 0
        HIER = 1
    elif MGN:
        NUM_TASK = 2
        MASKING = 1
        HIER = 0
    else:
        raise ValueError(
            "You should choose one of bert, joint, granu and mgn in options")

    dct = {
        'NUM_TASK': NUM_TASK,
        'MASKING': MASKING,
        'SIGMOID_ACTIVATION': SIGMOID_ACTIVATION,
        'HIER': HIER
    }
    model = load_model(model_type, **dct)

    if not id_:
        ids = get_existent_ids()
        id_ = random_module.randint(0, N)
        while id_ in ids:
            id_ = random_module.randint(0, N)
        with open(DIRECTORY_PREDICT.joinpath(f'article{id_}.txt'),
                  'w',
                  encoding='utf-8') as f:
            f.write(full_text)

    text = overwrite_one_article(id_, directory=DIRECTORY_PREDICT)

    my_predict_dataset = PropDataset(DIRECTORY_PREDICT, is_test=True)
    my_predict_iter = data.DataLoader(dataset=my_predict_dataset,
                                      batch_size=BATCH_SIZE,
                                      shuffle=False,
                                      num_workers=1,
                                      collate_fn=pad)

    tmp_file = 'tmp.txt'
    eval(model,
         my_predict_iter,
         tmp_file,
         criterion,
         binary_criterion,
         NUM_TASK=NUM_TASK)
    ids, texts = read_data(DIRECTORY_PREDICT, is_test=True)
    t_texts = clean_text(texts, ids)
    flat_texts = [sentence for article in t_texts for sentence in article]
    fi, prop_sents = convert(NUM_TASK - 1, flat_texts, tmp_file)
    prop_sents = prop_sents[id_]
    prop_sents = ['1' if elem else '' for elem in prop_sents]

    results = remove_duplicates(fi)

    DIRECTORY_PREDICT.joinpath(f'article{id_}.txt').rename(
        DIRECTORY_MARKUP.joinpath(f'article{id_}.txt'))

    lst = [set() for _ in range(len(full_text))]
    source_lst = [set() for _ in range(len(full_text))]
    for inner_lst in results:
        for i in range(inner_lst[-2], inner_lst[-1]):
            lst[i].add(HUMAN_READABLE_TECHNIQUES[TECHNIQUES.index(
                inner_lst[-3])])
            source_lst[i].add(inner_lst[-3])

    extracts_s_e = []
    extracts = []
    categories = []
    for elem in fi:
        if elem[0] != str(id_):
            continue
        _, category, start, end = elem
        extracts_s_e.append((start, end))
        extracts.append(text[start:end])
        categories.append(category)

    extracts = [
        ' '.join(normalize(extract.strip())) for extract in extracts if extract
    ]
    print(f'extracts: {extracts}')

    # CHECK
    # extracts = [word for sent in extracts for word in sent.split()]

    test_x, test_maxlen = get_data(extracts,
                                   vocab_size=args.vocab_size,
                                   maxlen=args.maxlen)
    test_x = sequence.pad_sequences(test_x,
                                    maxlen=max(train_maxlen, test_maxlen))

    test_length = test_x.shape[0]
    splits = []
    for i in range(1, test_length // args.batch_size):
        splits.append(args.batch_size * i)
    if test_length % args.batch_size:
        splits += [(test_length // args.batch_size) * args.batch_size]
    test_x = np.split(test_x, splits)

    with graph.as_default():
        aspect_model = keras_load_model(os.path.join('flask_app', 'output',
                                                     'reviews', 'model_param'),
                                        custom_objects={
                                            "Attention": Attention,
                                            "Average": Average,
                                            "WeightedSum": WeightedSum,
                                            "MaxMargin": MaxMargin,
                                            "WeightedAspectEmb":
                                            WeightedAspectEmb,
                                            "max_margin_loss":
                                            U.max_margin_loss
                                        },
                                        compile=True)

        test_fn = K.function([
            aspect_model.get_layer('sentence_input').input,
            K.learning_phase()
        ], [
            aspect_model.get_layer('att_weights').output,
            aspect_model.get_layer('p_t').output
        ])
        aspect_probs = []

        for batch in tqdm(test_x):
            _, cur_aspect_probs = test_fn([batch, 0])
            aspect_probs.append(cur_aspect_probs)

        aspect_probs = np.concatenate(aspect_probs)

        label_ids = np.argsort(aspect_probs, axis=1)[:, -5:]
        for i, labels in enumerate(label_ids):
            print(
                f'{extracts[i]}: {[aspects[label] for label in labels][::-1]}')

    correct_lst = ['; '.join(list(elem)) for elem in lst]
    commands = {
        extract: ([aspects[label] for label in label_ids[i]][::-1], [])
        for i, extract in enumerate(extracts)
    }
    write_existent_dict(id_, source_lst, directory=DIRECTORY_MARKUP)

    for f in glob.glob(f'{DIRECTORY_PREDICT}/*'):
        os.remove(f)

    return jsonify(
        result={
            'id': id_,
            'list': correct_lst,
            'text': text,
            'prop_sents': prop_sents,
            'commands': commands
        })
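For context, a client-side sketch of calling this Flask view with the requests library; the route path and the model_type value are assumptions, since the snippet only shows the view body and the *_PATH constants are defined elsewhere.

import requests

resp = requests.post("http://localhost:5000/launch_model", data={
    'full_text': "Full article text to analyse ...",
    'id': '',  # empty id: the server picks an unused random one
    'model_type': '<one of the configured *_BERT_PATH values>',  # placeholder
})
print(resp.json()['result']['list'])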
Example #14
        if classifier is None:
            print("Load training first")
        else:
            path = command.split(" ")
            if (len(path) < 2):
                print("Please enter filepath")
            else:
                path = path[1]
                path = Path('.').joinpath(path)
                text = ""
                try:
                    text = path.open('r', encoding='utf-8').read()
                except OSError as e:
                    print("File doesn't exist/Invalid path")
                print(text)
                text = pp.clean_text(text)
                pos, neg = classifier.test(text)
                print("CLASS: ", end='')
                if pos == classifier.pos_prior or neg == classifier.neg_prior:
                    print("SOMETHING WENT WRONG")
                elif pos >= neg:
                    print("POSITIVE")
                else:
                    print("NEGATIVE")

    #Display stats clause
    elif command.startswith('d'):
        scores = pp.load_stats()
        pp.print_stats(scores)
    #Refresh menu clause
    elif command.startswith('m'):
Example #15
def launch_model():
    full_text = request.form['full_text']
    id_ = request.form['id']
    model_type = request.form['model_type']

    global BERT, JOINT, GRANU, MGN, NUM_TASK, MASKING, HIER
    BERT = model_type == BERT_PATH
    JOINT = model_type == JOINT_BERT_PATH
    GRANU = model_type == GRANU_BERT_PATH
    MGN = model_type == MGN_SIGM_BERT_PATH

    # either of the four variants:
    # BERT = False
    # JOINT = False
    # GRANU = False
    # MGN = True

    assert BERT or JOINT or GRANU or MGN
    assert not (BERT and JOINT) and not (BERT and GRANU) and not (BERT and MGN) \
           and not (JOINT and GRANU) and not (JOINT and MGN) and not (GRANU and MGN)

    # either of the two variants
    SIGMOID_ACTIVATION = True
    RELU_ACTIVATION = False
    assert not (SIGMOID_ACTIVATION and RELU_ACTIVATION) and (
        SIGMOID_ACTIVATION or RELU_ACTIVATION)

    if BERT:
        NUM_TASK = 1
        MASKING = 0
        HIER = 0
    elif JOINT:
        NUM_TASK = 2
        MASKING = 0
        HIER = 0
    elif GRANU:
        NUM_TASK = 2
        MASKING = 0
        HIER = 1
    elif MGN:
        NUM_TASK = 2
        MASKING = 1
        HIER = 0
    else:
        raise ValueError(
            "You should choose one of bert, joint, granu and mgn in options")

    dct = {
        'NUM_TASK': NUM_TASK,
        'MASKING': MASKING,
        'SIGMOID_ACTIVATION': SIGMOID_ACTIVATION,
        'HIER': HIER
    }
    model = load_model(model_type, **dct)

    print(1)
    if not id_:
        print(2)
        ids = get_existent_ids()
        print(3)
        id_ = random_module.randint(0, N)
        print(4)
        while id_ in ids:
            id_ = random_module.randint(0, N)
        print(5)
        with open(DIRECTORY_PREDICT.joinpath(f'article{id_}.txt'),
                  'w',
                  encoding='utf-8') as f:
            f.write(full_text)
        print(6)
    print(7)
    text = overwrite_one_article(id_, directory=DIRECTORY_PREDICT)
    print(8)

    my_predict_dataset = PropDataset(DIRECTORY_PREDICT, is_test=True)
    print(9)
    my_predict_iter = data.DataLoader(dataset=my_predict_dataset,
                                      batch_size=BATCH_SIZE,
                                      shuffle=False,
                                      num_workers=1,
                                      collate_fn=pad)
    print(10)

    tmp_file = 'tmp.txt'
    print(11)
    eval(model,
         my_predict_iter,
         tmp_file,
         criterion,
         binary_criterion,
         NUM_TASK=NUM_TASK)
    print(12)
    ids, texts = read_data(DIRECTORY_PREDICT, is_test=True)
    print(13)
    t_texts = clean_text(texts, ids)
    print(14)
    flat_texts = [sentence for article in t_texts for sentence in article]
    print(15)
    fi, prop_sents = convert(NUM_TASK - 1, flat_texts, tmp_file)
    print(16)
    prop_sents = prop_sents[id_]
    print(17)
    prop_sents = ['1' if elem else '' for elem in prop_sents]
    print(18)
    results = remove_duplicates(fi)
    print(19)

    DIRECTORY_PREDICT.joinpath(f'article{id_}.txt').rename(
        DIRECTORY_MARKUP.joinpath(f'article{id_}.txt'))
    print(20)

    lst = [set() for _ in range(len(full_text))]
    print(21)
    source_lst = [set() for _ in range(len(full_text))]
    print(22)
    for inner_lst in results:
        for i in range(inner_lst[-2], inner_lst[-1]):
            lst[i].add(HUMAN_READABLE_TECHNIQUES[TECHNIQUES.index(
                inner_lst[-3])])
            source_lst[i].add(inner_lst[-3])
    print(23)

    correct_lst = ['; '.join(list(elem)) for elem in lst]
    print(24)
    write_existent_dict(id_, source_lst, directory=DIRECTORY_MARKUP)
    print(25)

    return jsonify(result={
        'id': id_,
        'list': correct_lst,
        'text': text,
        'prop_sents': prop_sents
    })
Example #16
File: lstm.py Project: pennydew/RNN
import pickle
import time

import pandas as pd
from keras.layers import Dense, LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from preprocess import clean_text

pd.set_option('display.max_columns', 500)

data_train = pd.read_csv("resources/train.csv", low_memory=False)
data_test = pd.read_csv("resources/test.csv", low_memory=False)

df = pd.concat([data_train, data_test], sort=False)
df = df.reset_index(drop=True)
df.comment_text = df.comment_text.map(lambda x: clean_text(x))

start = time.time()
comments = df.comment_text
pickle.dump(comments, open("test.pkl", "wb"))
corpus_comments = pickle.load(open("test.pkl", "rb"))
end = time.time()

# number of seconds taken
step = end - start

print(step)

train_cl = df[:data_train.shape[0]]
test_cl = df[data_train.shape[0]:]
Example #17
    pred_test_y2 = model.predict_proba(test_X2.multiply(r))[:, 1]
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()
train['non_toxic'] = train[[
    'toxic', 'severe_toxic', 'obscene', 'insult', 'threat', 'identity_hate'
]].sum(axis=1).apply(lambda x: 0 if x > 1 else 1)
save_in_cache('extra_label', train, test)

if not is_in_cache('cleaned'):
    print('~~~~~~~~~~~~~')
    print_step('Cleaning')
    train_cleaned, test_cleaned = clean_text(train, test)
    save_in_cache('cleaned', train_cleaned, test_cleaned)
else:
    train_cleaned, test_cleaned = load_cache('cleaned')

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

if not is_in_cache('tfidf_word'):
    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Run TFIDF WORD')
    TFIDF_PARAMS_WORD.update({'train': train, 'test': test})
    post_train, post_test = run_tfidf(**TFIDF_PARAMS_WORD)
    save_in_cache('tfidf_word', post_train, post_test)
    del post_train
Example #18
def tokenizer(doc):
    doc = preprocess.clean_text(doc)

    return preprocess.lemmatize(doc)
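As a usage sketch, this tokenizer can be plugged into scikit-learn's TfidfVectorizer (assuming preprocess.lemmatize returns a list of tokens), so cleaning and lemmatization happen inside the vectorizer rather than in a separate pass:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=tokenizer, lowercase=False)
X = vectorizer.fit_transform(["First raw document ...", "Second raw document ..."])
print(X.shape)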