import math

import util  # project helper module that provides clean_sent()


def inverse_frequency(whole_doc, opt='smooth'):
    """Per-word IDF over the titles and contents of a document collection."""
    all_doc_num = 0
    title_word = {}
    content_word = {}
    # Record the set of documents each word appears in (its document frequency).
    for index, title in enumerate(whole_doc['title']):
        all_doc_num += 1
        doc_id = whole_doc['id'][index]
        for word in util.clean_sent(title):
            if word not in title_word:
                title_word[word] = []
            if doc_id not in title_word[word]:
                title_word[word].append(doc_id)
 
        for word in util.clean_sent(whole_doc['content'][index]):
            if word not in content_word:
                content_word[word] = []
            if doc_id not in content_word[word]:
                content_word[word].append(doc_id)

    title_idf = {}
    content_idf = {}
    if opt == 'smooth':
        # Smoothed IDF: log(N / (1 + df)); the +1 guards against division by
        # zero, and float() keeps the division real-valued under Python 2.
        for word in title_word:
            title_idf[word] = math.log(float(all_doc_num) / (1 + len(title_word[word])))
        for word in content_word:
            content_idf[word] = math.log(float(all_doc_num) / (1 + len(content_word[word])))
        return title_idf, content_idf

    elif opt == 'max':
        # Max IDF variant: normalize by the largest per-word document
        # frequency instead of the collection size.
        max_title_n = float(max([len(x) for x in title_word.values()]))
        max_content_n = float(max([len(x) for x in content_word.values()]))
        for word in title_word:
            title_idf[word] = math.log(max_title_n / (1 + len(title_word[word])))
        for word in content_word:
            content_idf[word] = math.log(max_content_n / (1 + len(content_word[word])))
        return title_idf, content_idf
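A quick sketch of the input shape inverse_frequency expects: parallel 'id', 'title', and 'content' columns, which both a plain dict of lists and a pandas DataFrame satisfy (this assumes util.clean_sent tokenizes a string into words):

toy_doc = {
    'id': [1, 2],
    'title': ['robot arm control', 'soil ph basics'],
    'content': ['servo control for a robot arm', 'measuring soil ph at home'],
}
title_idf, content_idf = inverse_frequency(toy_doc, opt='smooth')
# With N = 2, a word appearing in both titles gets log(2 / (1 + 2)) < 0, so
# the smoothing can push very common words below zero.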
Example #2
def edge_list(sents, w2id, w_len=2, isWeight=False):
    """Build co-occurrence edges between words within a sliding window of size w_len."""
    g = {}
    max_count = 0  # largest edge weight seen (tracked but not returned)
    for sent in sents:
        words = clean_sent(sent)
        for i in range(0, len(words)):
            for j in range(i + 1, min(len(words), i + w_len + 1)):
                if isPair(words[i], words[j]):
                    # Canonicalize the edge so (a, b) and (b, a) share one entry.
                    w0, w1 = w2id[words[i]], w2id[words[j]]
                    pair = (w0, w1) if w0 < w1 else (w1, w0)
                    if pair not in g:
                        g[pair] = 0
                    # Weighted graphs count co-occurrences; unweighted ones are binary.
                    g[pair] = (g[pair] + 1) if isWeight else 1
                    if max_count < g[pair]:
                        max_count = g[pair]
    return g
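A self-contained usage sketch; clean_sent and isPair here are hypothetical stand-ins for the project's real helpers, defined only so the example runs:

def clean_sent(sent):
    # Stand-in tokenizer; the real helper may also strip punctuation or stem.
    return sent.lower().split()

def isPair(a, b):
    # Stand-in filter; the real helper likely applies POS or stopword checks.
    return a != b

sents = ['the quick brown fox', 'the lazy brown dog']
w2id = {}
for s in sents:
    for w in clean_sent(s):
        w2id.setdefault(w, len(w2id))

weighted = edge_list(sents, w2id, w_len=2, isWeight=True)
# Keys are (id, id) tuples with the smaller id first; with isWeight=True the
# value counts co-occurrences, e.g. 'the' and 'brown' appear together in both
# sentences, so weighted[(w2id['the'], w2id['brown'])] == 2.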
def term_frequency(term, doc, opt='log'):
    """Term frequency of `term` in `doc` under several weighting schemes."""
    doc = util.clean_sent(doc)
    if opt == 'simple':
        # Binary: 1.0 if the term occurs at all.
        return 1.0 if term in doc else 0.0

    elif opt == 'log':
        # Log-scaled count: 1 + log(count), or 0 when the term is absent.
        t = doc.count(term)
        if t == 0:
            return 0.0
        return 1 + math.log(t)
    elif opt == 'aug':
        # Augmented frequency: 0.5 + 0.5 * count / max_count, which damps the
        # bias toward longer documents.
        word = {}
        for w in doc:
            word[w] = word.get(w, 0) + 1
        if term not in word:
            return 0.5
        return 0.5 + 0.5 * (float(word[term]) / max(word.values()))
    raise ValueError('unknown opt: %s' % opt)
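A quick check of the three schemes on a toy document, assuming util.clean_sent simply whitespace-tokenizes (the real helper may also lowercase or strip punctuation):

doc = 'go go go stop'
print(term_frequency('go', doc, opt='simple'))   # 1.0
print(term_frequency('go', doc, opt='log'))      # 1 + log(3) ~= 2.0986
print(term_frequency('stop', doc, opt='aug'))    # 0.5 + 0.5 * (1/3) ~= 0.6667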
Example #4
from collections import Counter, defaultdict

def coocurance(text, windows=2):
    """Count, for every word, how often each other word appears within +/- `windows` positions."""
    word_lst = [e for e in clean_sent(text) if e not in stopwords_set]
    data = defaultdict(Counter)
    for i, word in enumerate(word_lst):
        # Python slices clamp out-of-range bounds, so one slice covers the
        # start-of-list, middle, and end-of-list cases alike.
        temp = Counter(word_lst[max(0, i - windows):i + windows + 1])
        temp.pop(word)  # drop the word's co-occurrence with itself
        data[word] += temp
    return data
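Usage sketch, assuming clean_sent whitespace-tokenizes and that none of these toy words fall in stopwords_set:

pairs = coocurance('cats chase mice and mice flee', windows=2)
# pairs['chase'] is a Counter over the words within two positions of 'chase',
# and each word's co-occurrence with itself is excluded.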
        "robotics": pd.read_csv(data_dir + "robotics" + data_type),
        "biology": pd.read_csv(data_dir + "biology" + data_type),
        "travel": pd.read_csv(data_dir + "travel" + data_type),
        "diy": pd.read_csv(data_dir + "diy" + data_type),
    }
    print "class, top_n, precision, recall, f1_score"
    #content_weight = 0.8
    for data_class in dataframes:
        title_idf, content_idf = inverse_frequency(dataframes[data_class], opt='smooth')
        for top_n in range(1,20):
            ans, f1, precision, recall = [], [], [], []
            for index, title in enumerate(dataframes[data_class]['title']):
                predict_tags = ""
                content = dataframes[data_class]['content'][index]
                candidate = {}
                for word in util.clean_sent(title):
                    # Score each title word by TF-IDF, keeping the best score seen.
                    score = title_idf[word] * term_frequency(word, title)
                    if word in candidate:
                        if candidate[word] < score:
                            candidate[word] = score
                    else:
                        candidate[word] = score
                #for word in util.clean_sent(content):
                #    score = content_idf[word]*term_frequency(word, content)*content_weight
                #    if word in candidate:
                #        if candidate[word] < score:
                #            candidate[word] = score
                #    else:
                #        candidate[word] = score

                # Select the top_n words by score; without key=candidate.get,
                # nlargest would rank the dict's keys alphabetically.
                predict_tags = heapq.nlargest(top_n, candidate, key=candidate.get)
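The key=candidate.get argument above matters: heapq.nlargest over a dict iterates its keys, so without it the "top" candidates would be the alphabetically last words rather than the highest-scoring ones:

scores = {'zebra': 0.1, 'arm': 2.5, 'servo': 1.7}
print(heapq.nlargest(2, scores))                  # ['zebra', 'servo'] (alphabetical)
print(heapq.nlargest(2, scores, key=scores.get))  # ['arm', 'servo'] (by score)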
Example #6
# The call below is the tail of a Keras fit(); the fragment starts mid-call,
# so the training array names x_train / y_train are an assumption.
my_model.fit(x_train, y_train,
             batch_size=1024,
             nb_epoch=5,
             validation_data=(x_val, y_val))

result = my_model.predict(x_val)
print result[0]

choose_pos = ['NN', 'NNP', 'NNS', 'VB', 'VBD', 'VBG', 'VBP', 'VBZ']
all_th = [0.3, 0.2, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001]
#all_th = [0.002, 0.004, 0.006, 0.008, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1]
d = open('../result/' + val_class + '_lstm_weight_result.csv','w')
for th in all_th:
    # Reset predictions per threshold; a dict shared across iterations would
    # accumulate words from every earlier threshold and skew the evaluation.
    ans = {}
    for index, doc_id in enumerate(x_id):
        temp = ""
        words = clean_sent(x_text[index])
        word_pos = pos_tag(words)
        for position, word in enumerate(words):
            # Keep the word when the model's positive-class probability clears th.
            if result[index][position][1] > th:  # and word_pos[position][1] in choose_pos:
                temp += word + ' '
        if doc_id not in ans:
            ans[doc_id] = ""
        ans[doc_id] += temp

    precision = []
    recall = []
    f1 = []
    for index, tags in enumerate(document[val_class]['tags']):
        doc_id = document[val_class]['id'][index]
        pre_tag = ""
        if doc_id in ans:
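The fragment cuts off at the comparison step; below is a minimal sketch of the token-set precision/recall/F1 it appears to be building toward (the prf1 helper and whitespace-splitting of tags are assumptions, not the original code):

def prf1(pred_tags, true_tags):
    # Compare predicted and gold tag strings as word sets.
    pred, true = set(pred_tags.split()), set(true_tags.split())
    hit = len(pred & true)
    p = float(hit) / len(pred) if pred else 0.0
    r = float(hit) / len(true) if true else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f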
Example #7
    content_sent = np.load(content_sent_file).item()
else:
    print "making vocab and sentences list"
    vocab = {}         # word -> integer id
    bacov = {}         # integer id -> word ("vocab" reversed)
    title_sent = {}    # topic -> list of title sentences
    content_sent = {}  # topic -> list of content sentences
    index = 0          # next vocabulary id to assign
    for topic in data_all:
        if topic not in title_sent:
            title_sent[topic] = []
        if topic not in content_sent:
            content_sent[topic] = []
        for sent in data_all[topic]['title']:
            title_sent[topic].append(sent)
            for word in clean_sent(sent):
                if word not in stopwords_set:
                    if word not in vocab:
                        vocab[word] = index
                        bacov[index] = word
                        index += 1
        for content in data_all[topic]['content']:
            for sent in sent_tokenize(content):
                content_sent[topic].append(sent)
                for word in clean_sent(sent):
                    if word not in stopwords_set:
                        if word not in vocab:
                            vocab[word] = index
                            bacov[index] = word
                            index += 1
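Since bacov is filled in lockstep as the inverse of vocab, every assigned id maps back to its word; a quick consistency check:

assert len(vocab) == len(bacov)
for word, idx in list(vocab.items())[:5]:
    assert bacov[idx] == word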