import json
import os
from math import sqrt

import nltk
import numpy as np
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn.utils import shuffle
from tqdm import tqdm

import libspacy
import libsenti

# Other local helpers (structural_and_punctuation, linguistic, word_sentence,
# clean_str, clean_title, rating_to_class, create_predictions, auto_features)
# are defined elsewhere in the repo.
stop_words = set(stopwords.words('english'))


def generate_features(title):
    """Concatenate the handcrafted feature groups with the spaCy vector."""
    f1 = structural_and_punctuation(title)
    f2 = linguistic(title)
    f3 = word_sentence(title)
    f4 = libspacy.get_vector(title)
    return f1 + f2 + f3 + f4.tolist()
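# structural_and_punctuation, linguistic, and word_sentence are not shown in
# this section. Purely as an illustration, a hypothetical punctuation-feature
# helper could look like the sketch below; the actual feature set used by the
# repo may differ.
def structural_and_punctuation_example(title):
    """Hypothetical sketch: title length plus counts of common punctuation."""
    return [len(title)] + [title.count(c) for c in '.!?#@:,']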
def generate_features(title, c_title):
    """Variant that returns only the spaCy document vector (c_title unused)."""
    vecs = libspacy.get_vector(title)
    return vecs.tolist()
def generate_features(text):
    """Variant that returns only the spaCy document vector.

    The handcrafted features after the early return are disabled but kept
    for reference.
    """
    X = libspacy.get_vector(text)
    return X.tolist()

    # --- disabled legacy features (unreachable) ------------------------------
    features = []
    features.append(len(text))             # length of the text
    features.append(text.count('.'))
    features.append(text.count('#'))
    f = text.count('I ')                   # computed but not appended
    features.append(text.count('@'))
    features.append(text.count('?'))
    features.append(text.count('irony'))
    features += [text.count(w) for w in auto_features]  # word-count features
    senti = sum(libsenti.get_sentiments(text))          # computed but unused
    return features + X.tolist()
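# --- libsenti.py (sketch) ----------------------------------------------------
# The disabled branch above sums libsenti.get_sentiments(text), which suggests
# the helper returns one score per token. The real module is not shown; the
# stand-in below assumes NLTK's VADER lexicon, which may not match the repo.
# Requires: nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

_sia = SentimentIntensityAnalyzer()


def get_sentiments(text):
    """Return one compound sentiment score per whitespace token of `text`."""
    return [_sia.polarity_scores(tok)['compound'] for tok in text.split()]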
def generate_features(title):
    """Variant that returns only the spaCy document vector."""
    return libspacy.get_vector(title).tolist()
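# --- libspacy.py (sketch) ----------------------------------------------------
# Every generate_features variant calls libspacy.get_vector. The real module
# is not shown in this section; the stand-in below assumes it simply exposes
# spaCy's averaged word vectors, and the model name is a guess -- any spaCy
# model that ships word vectors would work.
import spacy

_nlp = spacy.load('en_core_web_md')


def get_vector(text):
    """Return the averaged word vector (a numpy array) for `text`."""
    return _nlp(text).vector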
def main():
    train_dir1 = 'clickbait17-train-170331'
    train_dir2 = 'clickbait17-train-170630'
    instances_filename = 'instances.jsonl'
    truths_filename = 'truth.jsonl'

    raw_data = {}    # instances indexed by id (a string)
    raw_truths = {}  # truths indexed by id (a string)
    for train_dir in (train_dir1, train_dir2):
        with open(os.path.join(train_dir, instances_filename)) as fp:
            for line in fp:
                json_obj = json.loads(line)
                raw_data[json_obj['id']] = json_obj
        with open(os.path.join(train_dir, truths_filename)) as fp:
            for line in fp:
                json_obj = json.loads(line)
                raw_truths[json_obj['id']] = json_obj

    feature = 'postText'
    raw_cb = []      # all usable items
    y_cb = []        # ratings (later replaced by binary labels)
    y_labels = []
    item_ids = []
    cb = []          # clickbait items
    nocb = []        # non-clickbait items
    all_words = []

    for item_id in tqdm(raw_data):
        rating = raw_truths[item_id]['truthMean']
        postText = raw_data[item_id].get('postText', [''])
        postText = clean_str(postText[0].strip())
        postText = clean_title(postText)
        label = raw_truths[item_id]['truthClass']

        item = raw_data[item_id]
        item['rating'] = rating
        item['postText'] = postText
        item['id'] = item_id

        postWords = word_tokenize(postText)
        item['postWords'] = postWords
        postWords = [w for w in postWords if w not in stop_words]
        pos = nltk.pos_tag(postWords)
        item['pos'] = pos

        # Replace digits with the placeholder token '[n]'; the Penn Treebank
        # tag is mapped to a WordNet part of speech, but the words themselves
        # are kept unlemmatized.
        lemmatized = []
        for word, tag in pos:
            if word in ('http', ':', '//t'):
                continue
            if word.isdigit():
                lemmatized.append('[n]')
                continue
            if tag[0] == 'J':
                part = wordnet.ADJ
            elif tag[0] == 'N':
                part = wordnet.NOUN
            elif tag[0] == 'V':
                part = wordnet.VERB
            elif tag[0] == 'R':
                part = wordnet.ADV
            else:
                part = wordnet.NOUN
            lemmatized.append(word)
        item['lemmatized'] = lemmatized
        all_words.extend(lemmatized)
        item['lemmaSent'] = ' '.join(lemmatized)

        # Only items that contain a number token are kept.
        if '[n]' not in item['lemmaSent']:
            continue
        raw_cb.append(item)
        if label == 'clickbait':
            cb.append(item)
        else:
            nocb.append(item)
        y_cb.append(rating)
        y_labels.append(rating_to_class(rating))
        item_ids.append(item_id)

    def window_vector(words, idx, mode):
        """spaCy vector of the three-word window around words[idx].

        `mode` selects a trailing, leading, or centered window so that both
        sentences in a comparison use the same window shape.
        """
        if mode == 'end':
            window = words[idx - 2:idx + 1]
        elif mode == 'start':
            window = words[idx:idx + 3]
        else:
            window = words[idx - 1:idx + 2]
        return libspacy.get_vector(' '.join(window))

    def filter_items(items):
        """Keep items whose '[n]' context window lies at Euclidean distance
        greater than 22.5 from that of at least one other item."""
        kept, kept_ids = [], []
        for item in tqdm(items):
            sent_array = item['lemmaSent'].split()
            if len(sent_array) < 2:
                continue
            word_idx = sent_array.index('[n]')
            for comparison in items:
                if item['postText'] == comparison['postText']:
                    continue
                comparison_sent_array = comparison['lemmaSent'].split()
                if len(comparison_sent_array) < 2:
                    continue
                comparison_idx = comparison_sent_array.index('[n]')
                item_at_end = word_idx > len(sent_array) - 2
                item_at_start = word_idx < 1
                comp_at_end = comparison_idx > len(comparison_sent_array) - 2
                comp_at_start = comparison_idx < 1
                # Skip pairs whose windows cannot share the same shape.
                if (item_at_end and comp_at_start) or (item_at_start and comp_at_end):
                    continue
                if item_at_end or comp_at_end:
                    mode = 'end'
                elif item_at_start or comp_at_start:
                    mode = 'start'
                else:
                    mode = 'mid'
                word_vec = window_vector(sent_array, word_idx, mode)
                comparison_vec = window_vector(comparison_sent_array,
                                               comparison_idx, mode)
                difference = sqrt(sum((a - b) ** 2
                                      for a, b in zip(word_vec, comparison_vec)))
                if difference > 22.5:
                    item['ssp'] = word_vec
                    kept.append(item)
                    kept_ids.append(item['id'])
                    break
        return kept, kept_ids

    print('Filtering nocb')
    nocb_filtered, item_ids_nocb = filter_items(nocb)
    print('Filtering cb')
    cb_filtered, item_ids_cb = filter_items(cb)

    print('Number of cb:', len(cb_filtered), 'Number of nocb:', len(nocb_filtered))
    cb = shuffle(cb_filtered, random_state=0)
    nocb = shuffle(nocb_filtered, random_state=0)
    nocb = nocb[:len(cb)]
    print("CB=", len(cb), "NOCB=", len(nocb),
          "ITEM_IDS_CB", len(item_ids_cb), "ITEM_IDS_NOCB", len(item_ids_nocb))
    # Rebuild the id lists so they stay aligned with the shuffled,
    # truncated item lists.
    item_ids_cb = [it['id'] for it in cb]
    item_ids_nocb = [it['id'] for it in nocb]

    y_cb = [0] * len(nocb) + [1] * len(cb)
    raw_cb = nocb + cb
    item_ids = item_ids_nocb + item_ids_cb
    (raw_cb, y_cb, item_ids) = shuffle(raw_cb, y_cb, item_ids, random_state=0)

    # Create the dataset for subba: parallel title and rating files.
    with open('clickbait_titles.txt', 'w') as fa, \
            open('clickbait_ratings.txt', 'w') as fb:
        for cb_item, rating in zip(raw_cb, y_cb):
            fa.write(cb_item['postText'] + '\n')
            fb.write(str(rating) + '\n')

    train_percent = 0.8
    train_size = int(len(raw_cb) * train_percent)
    X_raw_train = raw_cb[:train_size]
    y_train = y_cb[:train_size]
    train_ids = item_ids[:train_size]
    X_raw_test = raw_cb[train_size:]
    y_test = y_cb[train_size:]
    test_ids = item_ids[train_size:]

    # Write ground-truth annotations for the held-out test split.
    with open('test_annotations.jsonl', 'w') as fp:
        for item_id in test_ids:
            fp.write(json.dumps(raw_truths[item_id]) + '\n')

    print("X_raw_train, y_train", len(X_raw_train), len(y_train))
    print("X_raw_test, y_test", len(X_raw_test), len(y_test))

    # Features: the stored context-window vector plus the post length.
    X_train = []
    X_test = []
    print("Extracting features from train")
    for item in X_raw_train:
        features = np.append(item['ssp'], [len(item['postWords'])])
        X_train.append(features)
    print("Extracting features from test")
    for item in X_raw_test:
        features = np.append(item['ssp'], [len(item['postWords'])])
        X_test.append(features)
    num_features = len(features)
    print("Size of train, test", len(X_train), len(X_test))
    print("Size of labels train, test", len(y_train), len(y_test))
    print("#features=", num_features)

    print("Try linear regression")
    model = linear_model.LinearRegression()
    # model = svm.SVR(C=1.0, epsilon=0.2)
    model.fit(X_train, y_train)
    print("Mean squared error test: %.4f"
          % np.mean((model.predict(X_test) - y_test) ** 2))
    print("Mean squared error train: %.4f"
          % np.mean((model.predict(X_train) - y_train) ** 2))

    print("Minor improvements")
    # Clip predictions into the valid rating range [0, 1].
    y_pred = np.clip(model.predict(X_test), 0, 1)
    print("Mean squared error test: %.4f" % np.mean((y_pred - y_test) ** 2))

    # y_pred = np.random.rand(len(X_test))  # uncomment to compare against random guesses
    create_predictions("test_predictions", y_pred, test_ids)
    create_predictions("test_truths", y_test, test_ids)

    # Log the instances where the prediction is off by more than 0.4.
    with open('max_errors.txt', 'w') as fp:
        for y_p, y_real, test_id in zip(y_pred, y_test, test_ids):
            if abs(y_p - y_real) > 0.4:
                fp.write('%s %f %f\n' % (raw_data[test_id][feature], y_p, y_real))

    os.system('python eval.py test_annotations.jsonl test_predictions outfile')
    print(model.coef_, model.intercept_)
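# --- helper sketches ----------------------------------------------------------
# rating_to_class and create_predictions are repo helpers that are not shown
# in this section. The sketches below are assumptions based on how main() uses
# them and on the Clickbait Challenge 2017 file format; the 0.5 threshold and
# the 'clickbaitScore' field name are guesses, not the repo's actual code.
def rating_to_class_sketch(rating, threshold=0.5):
    """Map a truthMean rating in [0, 1] to a binary clickbait label."""
    return 1 if rating >= threshold else 0


def create_predictions_sketch(filename, scores, ids):
    """Write one JSON object per line, pairing each id with its score."""
    with open(filename, 'w') as out:
        for item_id, score in zip(ids, scores):
            out.write(json.dumps({'id': item_id,
                                  'clickbaitScore': float(score)}) + '\n')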