def event_pred_model(event, X, Y, command): # note that X is a list and Y is a array texts = load_demo_text(command) total_X = X + texts X_convert, X_pred, total_X_convert = np.array(X), np.array(texts), np.array(total_X) MIN_DF = 2 vec = CountVectorizer(lowercase=True, min_df=MIN_DF) vec = vec.fit(total_X_convert) X_convert_trans, X_pred_trans = vec.transform(X_convert), vec.transform(X_pred) clf.fit(X_convert_trans, Y) # training model y_pred = clf.predict(X_pred_trans) y_prob = clf.decision_function(X_pred_trans) max_prob, min_prob = max(y_prob), min(y_prob) list_write = list() for i in range(0, len(y_pred)): prob = (y_prob[i] - min_prob) / (max_prob - min_prob) print y_pred[i], prob, texts[i] # list_write.append(str(y_pred[i]) + '\t' + texts[i]) list_write.append(str(y_pred[i])) if command == 'twitter': path_write = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/twitter/events_pred' write_file(path_write, event, list_write)
def matching_eventText(X, Y, event, call): pred = list() for value in X: if event in value: pred.append('1') else: pred.append('0') matrix = confusion_matrix(pred, Y) for value in matrix: line = '' for each in value: line = line + str(each) + '\t' print line.strip() print '----------------' list_print = list() if call == 'PrintPredicted': for index in range(0, len(pred)): tweet, pred_value, truth_value = X[index], pred[index], Y[index] list_ = list() list_.append(index), list_.append(pred_value), list_.append(truth_value), list_.append(tweet) list_print.append(list_) list_print = sorted(list_print, key=itemgetter(0)) # sorted list based on index list_write = list() for value in list_print: print str(value[0]) + '\t' + str(value[1]) + '\t' + str(value[2]) + '\t' + str(value[3]) list_write.append(str(value[0]) + '\t' + str(value[1]) + '\t' + str(value[2]) + '\t' + str(value[3])) write_file(path, event + '_match', list_write)
def writing_pred(path, event, X_id, X, X_pred):
    """Write one 'id<TAB>prediction<TAB>text' line per entry to
    'twitter_event_<event>'."""
    list_write = []
    for i, tweet_id in enumerate(X_id):
        list_write.append(tweet_id + '\t' + X_pred[i] + '\t' + X[i])
    write_file(path, 'twitter_event_' + event, list_write)
def distinct_list(list_, path, name): new_list = list() for value in list_: if value not in new_list: new_list.append(value) print len(new_list) write_file(path, name.replace('.txt', '') + '_new', new_list)
def events_all(path, events): list_events, list_nones = list(), list() for event in events: list__ = load_file(path, event + '.csv') list_event, list_none = events_none(list__) list_events.append(list_event), list_nones.append(list_none) # print len(list_event) for i in range(0, len(list_nones)): if i == 0: first = list(set(list_nones[i]).intersection(list_nones[i + 1])) elif i == len(list_nones) - 1: break else: first = list(set(first).intersection(list_nones[i + 1])) list_none = first list_events.append(list_none) events.append('none') for event in list_events: print len(event) for i in range(0, len(list_events)): j = i + 1 for k in range(j, len(list_events)): first, second = list_events[i], list_events[k] second = convert_list_(second, events[k]) new_list = first + second print events[i], events[k] write_file(path, events[i] + '_' + events[k], new_list)
def wordVec_facebook(sents, path_w, name_w, win_size): list_all = list() for i in range(0, len(sents)): split_sent = sents[i].split() tokens = list() for token in split_sent: token_filter = filter_eachTok_rmLinks(token, 'model') if len(token_filter) > 0: tokens.append(token_filter.lower()) print i list_all.append(tokens) model = gensim.models.Word2Vec(list_all, size=win_size, window=5, min_count=1, workers=5) print model.most_similar(['bus']) list_write = list() for i in range(0, len(model.index2word)): # print model.index2word[i], model.syn0norm[i] line = model.index2word[i] for value in model.syn0norm[i]: line += '\t' + str(value) line = line.strip() list_write.append(line) print line write_file(path_w, name_w + '_%i' % win_size, list_write)
def construct_ftr_wordVector(dict_w, vec_w, lines, path_write, name_write):
    """For every word-vector dimension write one feature file: each text line
    (every 3rd entry of `lines`) becomes the tab-joined per-token scores for
    that dimension; unknown tokens get '0' (an '@'-prefixed fallback lookup
    is tried first).

    `vec_w` entries hold scores as strings (they are concatenated directly).
    Performance fix: the original called `dict_w.index(word)` per token,
    which is O(V) each; a one-time dict keeps the same first-occurrence
    semantics with O(1) lookups.
    """
    word_lines = list()
    for i in xrange(0, len(lines), 3):  # every 3rd line holds the text
        word_lines.append(lines[i].lower().split('\t'))
    # word -> first index in dict_w (mirrors list.index behaviour)
    word_index = {}
    for pos, w in enumerate(dict_w):
        if w not in word_index:
            word_index[w] = pos
    nftr_wordVec = len(vec_w[0])  # number of features in word vector
    for nfr in xrange(nftr_wordVec):
        frt_wordVec = list()
        for w_line in word_lines:
            wordvec_score = ''
            for word in w_line:
                if word in word_index:
                    wordvec_score += vec_w[word_index[word]][nfr] + '\t'
                elif ('@' + word) in word_index:
                    # tokens sometimes appear mention-prefixed in the vocab
                    wordvec_score += vec_w[word_index['@' + word]][nfr] + '\t'
                else:
                    wordvec_score += '0' + '\t'
            frt_wordVec.append(wordvec_score)
        write_file(path_write, name_write + '_%i' % nfr, frt_wordVec)
def pattern_services(path, name, list_bus_services, list_posts_checkBusServices):
    """Extract bus-service mentions from each post and write them as
    'post_id<TAB>service' lines to 'posts_busService'.

    Column layout depends on the file: posts_filter_v2.csv is (id, text),
    other files are (?, id, text). Pattern matches are always collected;
    exact matches only for posts listed in `list_posts_checkBusServices`.

    Returns the list of written lines.
    """
    list_write = []
    with open(path + '/' + name) as f:
        for line in f:
            cols = line.split('\t')
            if name == 'posts_filter_v2.csv':
                post_id, text = cols[0], cols[1]
            else:
                post_id, text = cols[1], cols[2]
            patterns = pattern_bus_service(text, list_bus_services)
            matches = []
            if post_id in list_posts_checkBusServices:
                matches = match_bus_service(text, list_bus_services)
            # union of both detection strategies, de-duplicated
            for service in set(patterns) | set(matches):
                print (post_id + '\t' + service)
                list_write.append(post_id + '\t' + service)
    write_file(path, 'posts_busService', list_write)
    return list_write
def groupedEvents(path_, list_lbl, events, names, number, command): list_sents = get_sentence(list_lbl, number) for index in range(0, len(events)): event, name = events[index], names[index] list_lbl_event = give_label_sents_groupEvent(list_sents, event, name, command) write_file(path_, name, list_lbl_event) print name, len(list_lbl_event)
def originial_token(path, name, original_texts, filtering_texts, labels, command):
    """Re-align token labels (produced on the filtered token stream) with the
    original, unfiltered token stream.

    Tokens rejected by check_token always get label '0' and do not consume a
    label; a surviving token keeps its label, except that a '1' is demoted to
    '0' when the cleaned token is not an integer. Writes alternating
    text/label lines to '<name>_<command>'.
    """
    texts_correct, labels_correct = list(), list()
    for text_org, text_fil, label in zip(original_texts, filtering_texts, labels):
        org_tokens = text_org.split()
        fil_tokens = text_fil.split('\t')  # parsed but unused, kept for parity
        lbl_tokens = label.split('\t')
        out_tokens, out_labels = [], []
        k = 0  # cursor into lbl_tokens; advances only on non-filtered tokens
        for token in org_tokens:
            out_tokens.append(token)
            if check_token(token, command) is True:
                # token was filtered out upstream: force label 0
                out_labels.append('0')
            else:
                if lbl_tokens[k] == '1' and \
                        RepresentsInt(filter_eachToken(token, command)) is not True:
                    # a '1' label only survives if the cleaned token is numeric
                    out_labels.append('0')
                else:
                    out_labels.append(lbl_tokens[k])
                k += 1
        texts_correct.append(' '.join(out_tokens))
        labels_correct.append(' '.join(out_labels))
    list_write = list()
    for text_line, label_line in zip(texts_correct, labels_correct):
        list_write.append(text_line)
        list_write.append(label_line)
        # list_write.append('\n')
    write_file(path, name + '_' + command, list_write)
def add_hour_dof(): file = 'C:/Users/vdthoang/Google Drive/LARC - NEC Project/icwsm2016/data/tweet_short_event_tagged_for_icwsm2016.json' start = time.time() df = pd.read_json(file) end = time.time() print end - start df['hour'] = df['createAtMilis'].map(lambda x: (pd.to_datetime(x, unit='ms').hour + 8) % 24) df['dow'] = df['createAtMilis'].map(lambda x: pd.to_datetime(x, unit='ms').dayofweek) # df['woy'] = df['createAtMilis'].map(lambda x: pd.to_datetime(x, unit='ms').weekofyear) list_id = df['id'] list_hour = df['hour'] list_dw = df['dow'] list_write = list() for i in range(0, len(list_dw)): print str(list_id[i]) + '\t' + str(list_hour[i]) + '\t' + str(list_dw[i]) list_write.append(str(list_id[i]) + '\t' + str(list_hour[i]) + '\t' + str(list_dw[i])) path_write = 'C:/Users/vdthoang/Google Drive/LARC - NEC Project/icwsm2016/data' name_write = 'twitter_hour_dow' write_file(path_write, name_write, list_write)
def pattern_plate(path, name):
    """Scan a tab-separated file (first row is a header) for bus-plate
    patterns in column 1 and write 'id<TAB>plate' lines.

    Idiom fix: uses enumerate() for the header skip instead of the original
    hand-maintained row counter.
    """
    list_write = []
    with open(path + '/' + name) as f:
        for row_no, line in enumerate(f):
            if row_no == 0:
                continue  # skip header row
            cols = line.split('\t')
            for plate in pattern_busPlate(check_busPlate(cols[1])):
                list_write.append(cols[0] + '\t' + plate)
            # print (cols[0] + '\t' + str(len(pattern_busPlate(cols[1]))))
    for value in list_write:
        print (value)
    # earlier runs targeted other outputs (posts_busPlate, tweet_2015_..., etc.);
    # the current target is the BusNews dump:
    write_file(path, 'facebook_2015_BusNews_filtering_busPlate', list_write)
    print (len(list_write))
def combine_text_event(path_write, list_text, list_event, name_write): print len(list_text), len(list_event) list_all = list() for i in range(0, len(list_text)): combine = list_text[i] + '\t' + list_event[i] list_all.append(combine) write_file(path_write, name_write, list_all)
def event_sentiment(path, event, ftr_list):
    """Append the i-th sentiment feature to the first two columns of each row
    of '<event>.csv' and write the result to '<event>_sentiment'."""
    path_event = path + '/allTweets_ver3'
    rows = load_file(path_event, event + '.csv')
    merged = list()
    for i, row in enumerate(rows):
        cols = row.split('\t')
        merged.append(cols[0] + '\t' + cols[1] + '\t' + ftr_list[i])
    write_file(path_event, event + '_sentiment', merged)
def write_file_training(totalFold, numFold, training, path, events): for i in range(0, len(training)): j = i + 1 for k in range(j, len(training)): first, second = training[i], training[k] second = convert_list_(second, events[k]) new_list = first + second print events[i], events[k] write_file(path, str(totalFold) + 'Folds_' + events[i] + '_' + events[k] + '_training_' + str(numFold), new_list)
def write_pred_event(path, id_, text_, pred, event_1, event_2):
    """Write 'id<TAB>text<TAB>event' lines: prediction 0 maps to event_2,
    anything else to event_1."""
    list_write = list()
    for i in range(0, len(id_)):
        chosen = event_1 if int(pred[i]) != 0 else event_2
        list_write.append(id_[i] + '\t' + text_[i] + '\t' + chosen)
    write_file(path, 'pred_' + event_1 + '_' + event_2, list_write)
def combine_mult_file(path, name, enum): list_files = list() for index in range(1, (enum + 1)): file = load_file(path, name + '_' + str(index) + '.csv') # file = load_file(path, name + str(index) + '.csv') list_files = list_files + file print index, len(file) print len(list_files) write_file(path, name, list_files)
def extract_road_busstop_expression(list_line, list_dict): y_label = [] y_reg = [] list_svc = [] cnt = 1 list_write = [] for line in list_line: split_line = line.split('\t') index = split_line[0] label = split_line[1].strip() y_label.append(label) svc = split_line[2].strip() list_svc.append(svc) text = split_line[3].strip().lower() # this is a text for road or bus stop # print index, label, svc list_road_match = [] for index in range(0, len(list_dict)): road = list_road[index] split_road = road.split(';') for token in split_road: if pattern_match(token.lower(), text) is True: split_token = token.split() for value in split_token: if value not in list_road_match: list_road_match.append(value.lower()) break flag = 'FALSE' if svc in list_road_match: flag = 'TRUE' y_reg.append(flag) else: flag = 'FALSE' y_reg.append(flag) print '-- finished this line -- %i' % cnt + '\t' + flag list_write.append('-- finished this line -- %i' % cnt + '\t' + flag) cnt += 1 break # for value in y_reg: # print value # for i in range(0, len(y_reg)): # if y_label[i] != y_reg[i]: # print list_svc[i] write_file('d:/', 'busstop', list_write) print metrics.accuracy_score(y_label, y_reg) print metrics.classification_report(y_label, y_reg) print metrics.confusion_matrix(y_label, y_reg)
def stemming_stopWords_text(list_, path, name): new_list = list() for i in range(0, len(list_)): line = list_[i] split_line = line.split('\t') event, label, text = split_line[0], split_line[1], split_line[2] new_text = stemming_text(remove_stopWords(text)).strip() new_line = event + '\t' + label + '\t' + new_text new_list.append(new_line) print i write_file(path, name + '_stemming_removeStop', new_list)
def subset_tweetID(list_icwsm, list_time): list_ = extract_tweet(list_icwsm) list_time = extract_tweet(list_time) list_union = set(list_) & set(list_time) print len(list_union) for value in list_union: print value write_file('C:/Users/vdthoang/Google Drive/LARC - NEC Project/icwsm2016/data', 'twitter_correct', list_union)
def construct_oldfeatures(path, files, path_write): for f in files: list_ = load_file(path, f) list_convert = list() for line in list_: string = '' for c in line: string += c + '\t' list_convert.append(string.strip()) print f write_file(path_write, f.replace('.csv', ''), list_convert)
def filter_data(path, name):
    """Clean each post's text via filter_token (strips leading/trailing
    punctuation etc.) and write 'id<TAB>cleaned_text' lines to 'posts_filter'."""
    list_write = []
    with open(path + "/" + name) as f:
        for line in f:
            cols = line.split("\t")
            cleaned = cols[0] + "\t" + filter_token(cols[1])
            print(cleaned)
            list_write.append(cleaned)
    write_file(path, "posts_filter", list_write)
def load_sentiment(path, name, sentiment_label): list_ = load_file(path, name) list_write = list() for value in list_: split_value = value.split('\t') sentiment, sentence = split_value[0], split_value[1] if sentiment_label == 'veryNeg': if int(sentiment) == 0: new_sent = '1' + '\t' + sentence else: new_sent = '0' + '\t' + sentence elif sentiment_label == 'Neg': if int(sentiment) == 1: new_sent = '1' + '\t' + sentence else: new_sent = '0' + '\t' + sentence elif sentiment_label == 'Neutral': if int(sentiment) == 2: new_sent = '1' + '\t' + sentence else: new_sent = '0' + '\t' + sentence elif sentiment_label == 'Pos': if int(sentiment) == 3: new_sent = '1' + '\t' + sentence else: new_sent = '0' + '\t' + sentence elif sentiment_label == 'veryPos': if int(sentiment) == 4: new_sent = '1' + '\t' + sentence else: new_sent = '0' + '\t' + sentence elif sentiment_label == 'veryNeg_Neg': if int(sentiment) == 0 or int(sentiment) == 1: new_sent = '1' + '\t' + sentence else: new_sent = '0' + '\t' + sentence elif sentiment_label == 'Pos_veryPos': if int(sentiment) == 3 or int(sentiment) == 4: new_sent = '1' + '\t' + sentence else: new_sent = '0' + '\t' + sentence list_write.append(new_sent) print len(list_write) write_file(path, 'allTweets_ver3_sentLabel_' + sentiment_label, list_write)
def bus_stop(path, name):
    """Extract (number, name) pairs from a bus-stop JSON file and write them
    as 'no<TAB>name' lines to 'bus_stop'."""
    with open(path + '/' + name) as data_file:
        data = json.load(data_file)
    # print (data)
    list_stop = []
    for stop in data:
        entry = str(stop['no']) + '\t' + stop['name']
        list_stop.append(entry)
        print (entry)
    print (len(list_stop))
    write_file(path, 'bus_stop', list_stop)
def convert_CRF_pred(path_write, name_write, list_pred):
    """Smooth CRF label sequences: a '2' or '3' token whose existing
    neighbours are all '0'/'1' is demoted to '0' (isolated span labels are
    treated as noise); other tokens pass through. A single-token sequence is
    always collapsed to '0'. Writes the converted sequences to disk.
    """
    list_write = list()
    for label in list_pred:
        toks = label.split('\t')
        pieces = []
        if len(toks) <= 1:
            # degenerate sequence: original always emits '0' here
            pieces.append('0')
        else:
            last = len(toks) - 1
            for i, tok in enumerate(toks):
                if tok in ('2', '3'):
                    neighbors = []
                    if i > 0:
                        neighbors.append(toks[i - 1])
                    if i < last:
                        neighbors.append(toks[i + 1])
                    # demote only when every existing neighbour is background
                    if all(n in ('0', '1') for n in neighbors):
                        pieces.append('0')
                    else:
                        pieces.append(tok)
                else:
                    pieces.append(tok)
        list_write.append('\t'.join(pieces).strip())
    write_file(path_write, name_write, list_write)
def split_sentence_CRF(list_post, path_write, name): list_sent_split, list_sent_origin = list(), list() for i in range(0, len(list_post)): post = list_post[i].replace('"', "") split_post, sentence = post.split(), "" for value in split_post: sentence = sentence + value + "\t" # print sentence.strip() # print post # print '\n' list_sent_split.append(sentence.strip()), list_sent_split.append("\n") list_sent_origin.append(post), list_sent_origin.append("\n") print len(list_sent_split), len(list_sent_origin) write_file(path_write, name + "_CRF", list_sent_split) write_file(path_write, name + "_origin", list_sent_origin) return None
def detectEvent(path, name, name_write, list_event): loadText = load_file(path, name) port = PorterStemmer() list_write = [] for text in loadText: split_text = text.strip().split('\t') if (len(split_text) == 2): print text events = eventRecg(port, split_text[1].strip().lower(), list_event) if (len(events) > 0): print split_text[0], '\t', events for event in events: list_write.append(split_text[0] + '\t' + event) write_file(path, name_write, list_write)
def road_extract(path, name, list_road, list_road_original):
    """Match road names in each post's text (matched lowercase, reported with
    the original casing) and write 'id<TAB>road' lines."""
    list_extract = []
    cnt = 0
    with open(path + '/' + name) as f:
        for line in f:
            cnt += 1
            cols = line.split('\t')
            # match on the lowercased text, report the original-cased name
            for index in match_road(cols[1].lower(), list_road):
                hit = cols[0] + '\t' + list_road_original[index]
                print (hit)
                list_extract.append(hit)
    print (cnt)
    # write_file(path, 'posts_roads.csv', list_extract)
    # write_file(path, 'tweet_2015_filtering_roads.csv', list_extract)
    write_file(path, 'facebook_2015_BusNews_filtering_roads.csv', list_extract)
def bus_stop_services(path, name):
    """Flatten the JSON mapping of bus service -> list of stops into
    'service<TAB>stop' lines and write them to 'bus_stop_service'."""
    with open(path + '/' + name) as data_file:
        data = json.load(data_file)
    # print (data)
    list_write = []
    for service in data:
        for value in data[str(service)]:
            row = str(service) + '\t' + str(value)
            print (row)
            list_write.append(row)
    write_file(path, 'bus_stop_service', list_write)
def filtering_json_facebook_ver2(path, name, name_write): with open(path + '/' + name) as data_file: data = json.load(data_file) cnt, list_write = 0, list() for element in data: from_data = element['from'] print (element['id'] + '\t' + from_data['name'].replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').strip() + '\t' + element['created_time'] + '\t' + element['message'].replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').strip()) list_write.append(element['id'] + '\t' + from_data['name'].replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').strip() + '\t' + element['created_time'] + '\t' + element['message'].replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').strip()) cnt += 1 print cnt write_file(path, name_write, list_write)