import math
import util  # project helper module providing clean_sent()


def inverse_frequency(whole_doc, opt='smooth'):
    # Collect, for every word, the set of document ids whose title (or content)
    # contains it, then turn those document frequencies into IDF scores.
    all_doc_num = 0
    title_word = {}
    content_word = {}
    for index, title in enumerate(whole_doc['title']):
        all_doc_num += 1
        doc_id = whole_doc['id'][index]
        for word in util.clean_sent(title):
            if word not in title_word:
                title_word[word] = []
            if doc_id not in title_word[word]:
                title_word[word].append(doc_id)
        for word in util.clean_sent(whole_doc['content'][index]):
            if word not in content_word:
                content_word[word] = []
            if doc_id not in content_word[word]:
                content_word[word].append(doc_id)
    title_idf = {}
    content_idf = {}
    if opt == 'smooth':
        # smoothed IDF: log(N / (1 + df)); float() avoids integer division under Python 2
        for word in title_word:
            title_idf[word] = math.log(float(all_doc_num) / (1 + len(title_word[word])))
        for word in content_word:
            content_idf[word] = math.log(float(all_doc_num) / (1 + len(content_word[word])))
        return title_idf, content_idf
    elif opt == 'max':
        # max-normalised IDF: log(max_df / (1 + df))
        max_title_n = float(max([len(x) for x in title_word.values()]))
        max_content_n = float(max([len(x) for x in content_word.values()]))
        for word in title_word:
            title_idf[word] = math.log(max_title_n / (1 + len(title_word[word])))
        for word in content_word:
            content_idf[word] = math.log(max_content_n / (1 + len(content_word[word])))
        return title_idf, content_idf
def edge_list(sents, w2id, w_len=2, isWeight=False):
    # Build co-occurrence edges between word ids that appear within w_len words
    # of each other; with isWeight=True the edge value accumulates the number of
    # co-occurrences, otherwise it is simply set to 1.
    g = {}
    max_count = 0
    for sent in sents:
        words = clean_sent(sent)
        for i in range(0, len(words)):
            for j in range(i + 1, min(len(words), i + w_len + 1)):
                if isPair(words[i], words[j]):
                    w0, w1 = w2id[words[i]], w2id[words[j]]
                    # store each undirected edge with the smaller id first
                    pair = (w0, w1) if w0 < w1 else (w1, w0)
                    if pair not in g:
                        g[pair] = 0
                    g[pair] = g[pair] + 1 if isWeight else 1
                    # max_count is tracked but not returned
                    if max_count < g[pair]:
                        max_count = g[pair]
    return g
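# A minimal usage sketch for edge_list (not part of the original script): the toy
# sentences and vocabulary below are made up for illustration, and the module's
# clean_sent() and isPair() helpers are assumed to be importable here; depending
# on how isPair() is defined it may filter out some word pairs.
sample_sents = ["the robot arm moves", "move the robot gripper slowly"]
sample_w2id = {}
for s in sample_sents:
    for w in clean_sent(s):
        sample_w2id.setdefault(w, len(sample_w2id))
# with isWeight=True, repeated co-occurrences accumulate as edge weights
sample_edges = edge_list(sample_sents, sample_w2id, w_len=2, isWeight=True)
print sample_edges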
def term_frequency(term, doc, opt='log'):
    # Term frequency of `term` in `doc`: binary ('simple'), log-scaled ('log'),
    # or augmented / max-normalised ('aug').
    doc = util.clean_sent(doc)
    if opt == 'simple':
        return 1.0 if term in doc else 0.0
    elif opt == 'log':
        t = 0
        for w in doc:
            if w == term:
                t += 1
        if t == 0:
            return 0.0
        return 1 + math.log(t)
    elif opt == 'aug':
        word = {}
        for w in doc:
            if w not in word:
                word[w] = 0
            word[w] += 1
        if term not in word:
            return 0.5
        return 0.5 + 0.5 * (float(word[term]) / max(word.values()))
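# A small sketch (not in the original script) of how inverse_frequency() and
# term_frequency() combine into a TF-IDF score. The toy corpus below is
# hypothetical, and util.clean_sent() is assumed to behave like a simple tokenizer.
toy_docs = {'id': [0, 1, 2],
            'title': ['robot arm control', 'plant cell biology', 'robot motion control'],
            'content': ['controlling a robot arm', 'cells in a plant', 'planning robot motion']}
toy_title_idf, toy_content_idf = inverse_frequency(toy_docs, opt='smooth')
for w in util.clean_sent(toy_docs['title'][0]):
    # score = IDF over titles * log term frequency within this title
    print w, toy_title_idf.get(w, 0.0) * term_frequency(w, toy_docs['title'][0], opt='log')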
from collections import Counter, defaultdict


def coocurance(text, windows=2):
    # Count, for every non-stopword, how often each other word appears within a
    # window of `windows` positions on either side of it.
    word_lst = [e for e in clean_sent(text) if e not in stopwords_set]
    data = defaultdict(Counter)
    for i, word in enumerate(word_lst):
        indexStart = i - windows
        indexEnd = i + windows
        if indexStart < 0:
            # window clipped at the start of the text
            temp = Counter(word_lst[:windows + 1 + i])
            temp.pop(word)
            data[word] += temp
        elif indexStart >= 0 and indexEnd <= len(word_lst):
            temp = Counter(word_lst[i - windows:i + windows + 1])
            temp.pop(word)
            data[word] += temp
        else:
            # window clipped at the end of the text
            temp = Counter(word_lst[i - windows:])
            temp.pop(word)
            data[word] += temp
    return data
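# Quick illustration (not from the original code) of the structure coocurance()
# returns: data[word] is a Counter mapping neighbouring words to how often they
# fall inside the window. The sentence is made up, and the exact counts depend on
# what clean_sent() and stopwords_set do to it.
sample_cooc = coocurance("the robot arm moves the robot gripper", windows=2)
print sample_cooc['robot']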
"robotics": pd.read_csv(data_dir + "robotics" + data_type), "biology": pd.read_csv(data_dir + "biology" + data_type), "travel": pd.read_csv(data_dir + "travel" + data_type), "diy": pd.read_csv(data_dir + "diy" + data_type), } print "class, top_n, precision, recall, f1_score" #content_weight = 0.8 for data_class in dataframes: title_idf, content_idf = inverse_frequency(dataframes[data_class], opt='smooth') for top_n in range(1,20): ans, f1, precision, recall = [],[],[],[] for index, title in enumerate(dataframes[data_class]['title']): predict_tags = "" content = dataframes[data_class]['content'][index] candidate = {} for word in util.clean_sent(title): score = title_idf[word]*term_frequency(word, title) if word in candidate: if candidate[word] < score: candidate[word] = score else: candidate[word] = score #for word in util.clean_sent(content): # score = content_idf[word]*term_frequency(word, content)*content_weight # if word in candidate: # if candidate[word] < score: # candidate[word] = score # else: # candidate[word] = score predict_tags = heapq.nlargest(top_n, candidate)
                       batch_size=1024, nb_epoch=5, validation_data=(x_val, y_val))
result = my_model.predict(x_val)
print result[0]

ans = {}
choose_pos = ['NN', 'NNP', 'NNS', 'VB', 'VBD', 'VBG', 'VBP', 'VBZ']
all_th = [0.3, 0.2, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001]
#all_th = [0.002, 0.004, 0.006, 0.008, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1]
d = open('../result/' + val_class + '_lstm_weight_result.csv', 'w')
for th in all_th:
    for index, doc_id in enumerate(x_id):
        temp = ""
        words = clean_sent(x_text[index])
        word_pos = pos_tag(words)
        for position, word in enumerate(words):
            # keep a word as a predicted tag if the model's per-word probability
            # exceeds the current threshold (the POS filter is left disabled)
            if result[index][position][1] > th:  # and word_pos[position][1] in choose_pos:
                temp += word + ' '
        if doc_id not in ans:
            ans[doc_id] = ""
        ans[doc_id] += temp
    precision = []
    recall = []
    f1 = []
    for index, tags in enumerate(document[val_class]['tags']):
        doc_id = document[val_class]['id'][index]
        pre_tag = ""
        if doc_id in ans:
    content_sent = np.load(content_sent_file).item()
else:
    print "making vocab and sentences list"
    vocab = {}   # word -> integer id
    bacov = {}   # integer id -> word (reverse of vocab)
    title_sent = {}
    content_sent = {}
    index = 0
    for topic in data_all:
        if topic not in title_sent:
            title_sent[topic] = []
        if topic not in content_sent:
            content_sent[topic] = []
        for sent in data_all[topic]['title']:
            title_sent[topic].append(sent)
            for word in clean_sent(sent):
                if word not in stopwords_set:
                    if word not in vocab:
                        vocab[word] = index
                        bacov[index] = word
                        index += 1
        for content in data_all[topic]['content']:
            for sent in sent_tokenize(content):
                content_sent[topic].append(sent)
                for word in clean_sent(sent):
                    if word not in stopwords_set:
                        if word not in vocab:
                            vocab[word] = index
                            bacov[index] = word
                            index += 1