Example #1
def main():
    parser = argparse.ArgumentParser(description="take text feature")
    parser.add_argument("-t", "--type", type=str, choices=("db", "file"), default="file", help="db/file")
    parser.add_argument("-s", "--source", type=str, help="file path/sql script")
    parser.add_argument("-n", "--name", type=str, help="output file name")
    parser.add_argument("-k", "--topk", type=int, default=500, help="top k words")
    parser.add_argument("-w", "--word_category", default="v,vd,vn,vf,a,ad,an,ag,al", type=str, help="word category")
    args = parser.parse_args()

    source_from = args.type
    source = args.source
    name = args.name
    k_num = args.topk
    word_category = args.word_category.split(",")
    print(word_category)
    if source_from == "db":
        comments_df = preprocess.get_data_from_db(source)
    elif source_from == "file":
        comments_df = preprocess.read_comment_from_file(source)
    else:
        return

    comments_list = list(comments_df["comment"].values)
    cutted, word_category_list = utils.word_cut(comments_list)
    word_weight_flag = utils.tfidf(cutted, word_category_list, "tfidf_" + name)
    key_word = utils.get_topK(word_weight_flag, "top_k_" + name, k=k_num, category_list=word_category)
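
utils.word_cut, utils.tfidf and utils.get_topK are project-local helpers whose sources are not shown here. As a rough, self-contained illustration of what the TF-IDF/top-k step amounts to, here is a minimal sketch using scikit-learn; the function name and the aggregation by summed weight are assumptions, not the project's actual implementation.

# Illustrative sketch only (not the project's utils.tfidf/get_topK):
# assumes `cutted` is a list of whitespace-joined token strings per comment.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def top_k_keywords(cutted, k=500):
    vect = TfidfVectorizer()
    weights = vect.fit_transform(cutted)               # (n_docs, n_terms), sparse
    scores = np.asarray(weights.sum(axis=0)).ravel()   # aggregate weight per term
    terms = np.asarray(vect.get_feature_names_out())
    order = np.argsort(scores)[::-1][:k]
    return list(zip(terms[order], scores[order]))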
Example #2
    def prune_on_keywords(self, thres):
        data_words, note_ids, id2word, corpus = utils.preprocess(
            self.df, 5, ['NOUN', 'VERB'], STOP_WORDS, 'tokens_phrases')
        tfidf_matrix, tf_dicts, post_appear_dict = utils.tfidf(data_words)

        keywords = {
            i: utils.get_tfidfs_thres(tfidf_matrix[i], thres)
            for i, m in enumerate(tfidf_matrix)
        }
        word2id = {v: k for k, v in id2word.items()}
        tfidf_corpus = [[(word2id[pair[0]], pair[1]) for pair in post.items()]
                        for post in keywords.values()]
Example #3
def load_data(data_name):
    timer = utils.timer(name='main')
    data_path = './data/' + data_name
    user_pref_file = data_path + '/U_BPR.npy'
    item_pref_file = data_path + '/V_BPR.npy'
    item_content_file = data_path + '/item_features.txt'
    train_file = data_path + '/train.csv'
    test_file = data_path + '/test.csv'
    vali_file = data_path + '/vali.csv'
    dat = {}

    # load preference data
    timer.tic()
    dat['u_pref'] = np.load(user_pref_file)
    dat['v_pref'] = np.load(item_pref_file)
    timer.toc('loaded U:%s,V:%s' %
              (str(dat['u_pref'].shape), str(dat['v_pref'].shape))).tic()

    # pre-process preference data
    _, dat['u_pref'] = utils.standardize(dat['u_pref'])
    _, dat['v_pref'] = utils.standardize_2(dat['v_pref'])
    timer.toc('standardized U,V').tic()

    # load item(article) content data
    # load_svmlight_file(file): reads a data file in svmlight format, stored as
    # <label> <feature-id>:<feature-value> <feature-id>:<feature-value> ...
    # If zero_based is False, all indices are decremented by 1.
    # Returns (X, y), where X is a scipy.sparse matrix and y is a numpy.ndarray.
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    # TF-IDF text feature weighting
    item_content = tfidf(item_content)
    # dimensionality reduction with randomized SVD
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    # feature standardization
    _, item_content = utils.standardize(item_content)
    dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # load split
    train = pd.read_csv(train_file, dtype=np.int32)
    dat['user_list'] = train['uid'].values
    dat['item_list'] = train['iid'].values
    timer.toc('read train triplets %s' % str(train.shape))

    dat['test_eval'] = data.load_eval_data(test_file)
    dat['vali_eval'] = data.load_eval_data(vali_file)
    return dat
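
The bare tfidf() call above takes the sparse term-count matrix returned by load_svmlight_file; its import is not shown in this snippet. A minimal stand-in under that assumption is scikit-learn's TfidfTransformer (the real helper may weight or normalize differently).

# Assumed stand-in for the tfidf() used above; illustrative only.
from sklearn.feature_extraction.text import TfidfTransformer

def tfidf(counts):
    # counts: scipy.sparse matrix of raw term counts (items x vocabulary terms)
    return TfidfTransformer().fit_transform(counts)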
Example #4
def get_nps(data):
    def parse_np(index):
        np = ''
        closing = 0
        for elem in tree[index:]:
            if elem[0] == '(':
                closing += 1
            else:
                match = re.findall(r"\)", elem)

                np += elem.replace(')', '').strip() + ' '

                closing -= len(match)
                if closing <= 0:
                    break
        return np.replace('-LRB- ', '(').replace(' -RRB-', ')').replace('-LRB-', '(').replace('-RRB-', ')').strip().lower()

    nps = []
    nps_condition = {}
    for guess in data:
        tree = guess['parse'].split()
        for i, elem in enumerate(tree):
            if elem == '(NP':
                np = parse_np(i)
                nps.append(np)

                if guess['condition'] not in nps_condition:
                    nps_condition[guess['condition']] = []
                nps_condition[guess['condition']].append(np)

    # print 'Most frequent descriptions'
    # print 10 * '-'
    nps = nltk.FreqDist(nps)
    # v = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)[:30]
    # for np in v:
    #     print np[0], np[1]

    print(10 * '-')
    print('Most distinctive descriptions per condition')
    nps_condition = utils.tfidf(nps_condition, 10)
    for condition in nps_condition:
        print('Condition: ', condition)
        print(10 * '-')
        for np in nps_condition[condition]:
            print(np[0], np[1], nps[np[0]])
        print(10 * '-')
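
Examples #4 and #6 through #9 all pass a dict of {category: word list} plus a k into utils.tfidf and then read back (word, score) pairs per category. The helper itself is not shown; the sketch below is one plausible implementation of that contract (treating each category as a single document), with illustrative names only.

# Plausible sketch of the category-level helper used as utils.tfidf(voc, k);
# not the project's actual code.
import math
from collections import Counter

def tfidf_per_category(voc, k):
    df = Counter()                       # number of categories containing each word
    for words in voc.values():
        df.update(set(words))
    n_cats = len(voc)
    top = {}
    for cat, words in voc.items():
        tf = Counter(words)
        scores = {w: tf[w] * math.log(n_cats / df[w]) for w in tf}
        top[cat] = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:k]
    return top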
Example #5
def main():
    parser = argparse.ArgumentParser(description='take text feature')
    parser.add_argument('-t',
                        '--type',
                        type=str,
                        choices=('db', 'file'),
                        default='file',
                        help='db/file')
    parser.add_argument('-s',
                        '--source',
                        type=str,
                        help='file path/sql script')
    parser.add_argument('-n', '--name', type=str, help='output file name')
    parser.add_argument('-k',
                        '--topk',
                        type=int,
                        default=500,
                        help='top k words')
    parser.add_argument('-w',
                        '--word_category',
                        default='v,vd,vn,vf,a,ad,an,ag,al',
                        type=str,
                        help='word category')
    args = parser.parse_args()

    source_from = args.type
    source = args.source
    name = args.name
    k_num = args.topk
    word_category = args.word_category.split(',')
    print(word_category)
    if source_from == 'db':
        comments_df = preprocess.get_data_from_db(source)
    elif source_from == 'file':
        comments_df = preprocess.read_comment_from_file(source)
    else:
        return

    comments_list = list(comments_df['comment'].values)
    cutted, word_category_list = utils.word_cut(comments_list)
    word_weight_flag = utils.tfidf(cutted, word_category_list, 'tfidf_' + name)
    key_word = utils.get_topK(word_weight_flag,
                              'top_k_' + name,
                              k=k_num,
                              category_list=word_category)
Example #6
def get_nps_attractiveness(faces):
    def parse_np(index):
        np = ''
        closing = 0
        for elem in tree[index:]:
            if elem[0] == '(':
                closing += 1
            else:
                match = re.findall(r"\)", elem)

                np += elem.replace(')', '').strip() + ' '

                closing -= len(match)
                if closing <= 0:
                    break
        return np.replace('-LRB- ', '(').replace(' -RRB-', ')').replace('-LRB-', '(').replace('-RRB-', ')').strip().lower()

    nps = []
    nps_attractiveness = {}
    for face in faces:
        tree = face['parse'].split()
        for i, elem in enumerate(tree):
            if elem == '(NP':
                np = parse_np(i)
                nps.append(np)

                attract = face['responses']['attractive'].lower()

                if attract not in nps_attractiveness:
                    nps_attractiveness[attract] = []
                nps_attractiveness[attract].append(np)

    print(10 * '-')
    print('Most frequent descriptions per attractiveness')
    nps_attractiveness = utils.tfidf(nps_attractiveness, 10)
    nps = nltk.FreqDist(nps)
    for attract in nps_attractiveness:
        print('Attractiveness: ', attract)
        print(10 * '-')
        for np in nps_attractiveness[attract]:
            print(np[0], np[1], nps[np[0]])
        print(10 * '-')
Example #7
def common_words_faces_typicality(data):
    typicality = set(map(lambda face: face['responses']['typical'], data))

    voc, gvoc = {}, []

    for typical in typicality:
        voc[typical] = []
        for face in filter(lambda face: face['responses']['typical'] == typical, data):
            for i, word in enumerate(face['tokens']):
                if 'NN' in face['pos_tag'][i] or 'JJ' in face['pos_tag'][i]:
                    voc[typical].append(word.lower())
                    gvoc.append(word.lower())

    tfidf = utils.tfidf(voc, 10)
    gvoc = nltk.FreqDist(gvoc)

    print('Most common words in Faces per typicality:')
    for typical in typicality:
        print(typical)
        for word in tfidf[typical]:
            print(word, gvoc[word[0]])
        print(20 * '-')
Example #8
def common_words_faces_attractiveness(data):
    attractiveness = set(map(lambda face: face['responses']['attractive'], data))

    voc, gvoc = {}, []

    for attract in attractiveness:
        voc[attract] = []
        for face in filter(lambda face: face['responses']['attractive'] == attract, data):
            for i, word in enumerate(face['tokens']):
                if 'NN' in face['pos_tag'][i] or 'JJ' in face['pos_tag'][i]:
                    voc[attract].append(word.lower())
                    gvoc.append(word.lower())

    tfidf = utils.tfidf(voc, 10)
    gvoc = nltk.FreqDist(gvoc)

    print('Most common words in Faces per attractiveness:')
    for attract in attractiveness:
        print(attract)
        for word in tfidf[attract]:
            print(word, gvoc[word[0]])
        print(20 * '-')
Example #9
def common_words_guesses(data):
    conditions = map(lambda x: x['condition'], data)
    voc, gvoc = {}, []
    for condition in set(conditions):
        voc[condition] = []
        f = filter(lambda guess: guess['condition'] == condition, data)

        for guess in f:
            for i, word in enumerate(guess['tokens']):
                if 'NN' in guess['pos_tag'][i]:
                    voc[condition].append(word.lower())
                    gvoc.append(word.lower())

        # voc[condition] = nltk.FreqDist(voc[condition])
    tfidf = utils.tfidf(voc, 10)
    gvoc = nltk.FreqDist(gvoc)

    print('Most common words in Guesses per condition:')
    for condition in tfidf:
        print('Condition:', condition)

        for word in tfidf[condition]:
            print(word, gvoc[word[0]])
        print(20 * '-')
Example #10
    # add ratio features
    data['ratio_title'] = data['word_in_title']/data['len_of_query']
    data['ratio_description'] = data['word_in_description']/data['len_of_query']
    data['attribute'] = data['search_term']+"\t"+data['brand']
    data['ratio_brand'] = data['word_in_brand']/data['len_of_query']
    data['ratio_attr'] = data['word_in_attr']/data['len_of_query']
    data['ratio_attr_title'] = data['word_in_attr_title']/data['len_of_query']
    data['brand_ratio'] = data['word_in_brand']/data['len_of_brand']
    data['attr_ratio'] = data['word_in_attr']/data['len_of_attr']
    data['attr_title_ratio'] = data['word_in_attr_title']/data['len_of_attr_title']
    data['title_ratio'] = data['word_in_title']/data['len_of_title']
    data['description_ratio'] = data['word_in_description']/data['len_of_description']

    # add bm25 features
    desc_tf, desc_idf, desc_length, desc_ave_length = utils.tfidf(data, 'product_description')
    data['desc_BM25'] = data.apply(lambda x: utils.BM25_score(
        x, desc_tf, desc_idf, desc_length, desc_ave_length), axis=1)
    attr_tf, attr_idf, attr_length, attr_ave_length = utils.tfidf(data, 'attr')
    data['attr_BM25'] = data.apply(lambda x: utils.BM25_score(
        x, attr_tf, attr_idf, attr_length, attr_ave_length), axis=1)
    title_tf, title_idf, title_length, title_ave_length = utils.tfidf(data, 'product_title')
    data['title_BM25'] = data.apply(lambda x: utils.BM25_score(
        x, title_tf, title_idf, title_length, title_ave_length), axis=1)
    attr_title_tf, attr_title_idf, attr_title_length, attr_title_ave_length = utils.tfidf(data, 'attr_title')
    data['attr_title_BM25'] = data.apply(lambda x: utils.BM25_score(
        x, attr_title_tf, attr_title_idf, attr_title_length, attr_title_ave_length), axis=1)
    brand_tf, brand_idf, brand_length, brand_ave_length = utils.tfidf(data, 'brand')
    data['brand_BM25'] = data.apply(lambda x: utils.BM25_score(
        x, brand_tf, brand_idf, brand_length, brand_ave_length), axis=1)
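
utils.tfidf here returns per-document term frequencies, inverse document frequencies, document lengths and the average length for one text field, and utils.BM25_score combines them row by row. Neither helper is shown; the sketch below is the standard Okapi BM25 formula those return values suggest, with assumed data shapes and default k1/b values.

# Standard Okapi BM25 sketch; the dict shapes and k1/b defaults are assumptions,
# not taken from the project's utils.BM25_score.
import math

def bm25_score(query_terms, doc_id, tf, idf, length, ave_length, k1=1.5, b=0.75):
    # tf: {doc_id: {term: count}}, idf: {term: idf}, length: {doc_id: n_tokens}
    norm = k1 * (1.0 - b + b * length[doc_id] / ave_length)
    score = 0.0
    for term in query_terms:
        f = tf.get(doc_id, {}).get(term, 0)
        if f:
            score += idf.get(term, 0.0) * f * (k1 + 1.0) / (f + norm)
    return score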
Example #11
def get_data():
    node_information = pd.read_csv(
        'node_information.csv',
        header=None,
        names=['ID', 'Year', 'Title', 'Authors', 'Journal', 'Abstract'])
    training_set = pd.read_csv('training_set.txt',
                               header=None,
                               names=['Target', 'Source', 'Edge'],
                               delim_whitespace=True)
    #testing_set = pd.read_csv('testing_set.txt', header=None, names=['Target', 'Source'], delim_whitespace=True)

    print("Get valid IDs")
    valid_ids = set()
    for element in training_set.values:
        valid_ids.add(element[0])
        valid_ids.add(element[1])

    print("Select valid indices from valid IDs")
    index_valid = [
        i for i, element in enumerate(node_information.values)
        if element[0] in valid_ids
    ]
    node_info = node_information.iloc[index_valid]

    print("Get index for nodes")
    IDs = []
    ID_pos = {}
    for element in node_info.values:
        ID_pos[element[0]] = len(IDs)
        IDs.append(element[0])

    print("Add ID column for merging")
    training_set['Target_ID'] = training_set.apply(lambda row: ID_pos[row[0]],
                                                   axis=1)
    training_set['Source_ID'] = training_set.apply(lambda row: ID_pos[row[1]],
                                                   axis=1)

    print("Merge")
    train = pd.merge(training_set,
                     node_information,
                     how='left',
                     left_on='Target_ID',
                     right_index=True)
    train = pd.merge(train,
                     node_information,
                     how='left',
                     left_on='Source_ID',
                     right_index=True,
                     suffixes=['_target', '_source'])
    #train.to_csv('train_blank.csv', index=False)

    #train = pd.read_csv('train_blank.csv')
    #train.to_csv('train.csv', index=False)

    t = time()
    print("Add overlapping titles")
    train['Overlap_title'] = train.apply(lambda row: overlap(row, 'Title'),
                                         axis=1)
    print("Add common_authors")
    train['Common_authors'] = train.apply(lambda row: common(row, 'Authors'),
                                          axis=1)
    print("Add overlapping abstract")
    train['Overlap_abstract'] = train.apply(
        lambda row: overlap(row, 'Abstract'), axis=1)
    print("Date difference")
    train['Date_diff'] = (train['Year_source'] - train['Year_target']).abs()
    print(time() - t)

    #train.to_csv('train_basic.csv', index=False)
    #print("Loading set")
    #train = pd.read_csv('train_basic.csv')

    #print("Loaded")
    t = time()
    print("Tfidf")
    tfidf_vect = TfidfVectorizer(stop_words="english")
    abstracts_source = train['Abstract_source'].values
    abstracts_target = train['Abstract_target'].values
    all_abstracts = np.concatenate((abstracts_source, abstracts_target))
    tfidf_vect.fit(all_abstracts)
    print("tf_idf fitted")
    vect_source = tfidf_vect.transform(abstracts_source)
    print("source transformed")
    vect_target = tfidf_vect.transform(abstracts_target)
    print("target transformed")
    train['Tfidf_cosine_abstracts_nolim'] = tfidf(vect_source, vect_target)
    print(time() - t)

    #train.to_csv('train_basic_tfidf.csv', index=False)
    #train = pd.read_csv('train_basic_tfidf.csv')

    t = time()
    print("Tfidf")
    tfidf_vect = TfidfVectorizer(stop_words="english")
    titles_source = train['Title_source'].values
    titles_target = train['Title_target'].values
    all_abstracts = np.concatenate((titles_source, titles_target))
    tfidf_vect.fit(all_abstracts)
    print("tf_idf fitted")
    vect_source = tfidf_vect.transform(titles_source)
    print("source transformed")
    vect_target = tfidf_vect.transform(titles_target)
    print("target transformed")
    train['Tfidf_cosine_titles'] = tfidf(vect_source, vect_target)
    print(time() - t)

    #train.to_csv('train_basic_tfidf_title.csv', index=False)
    #train = pd.read_csv('train_basic_tfidf_title.csv')

    return train
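
The tfidf(vect_source, vect_target) helper called twice above is not defined in this snippet; from the resulting column names it yields one cosine similarity per row between the two TF-IDF matrices. A minimal sketch of that behaviour (assumed, not the original implementation):

# Assumed row-wise cosine similarity between two aligned sparse TF-IDF matrices.
import numpy as np
from sklearn.preprocessing import normalize

def tfidf(vect_source, vect_target):
    a = normalize(vect_source)   # L2-normalize each row
    b = normalize(vect_target)
    return np.asarray(a.multiply(b).sum(axis=1)).ravel()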
Example #12
    def embed_posts(self, min_count, allowed_pos, stopwords,
                    preprocess_option):

        if self.option == 'tfidf+lsi':
            logging.info(
                'CBF - Using TFIDF vectors, LSI for dimension reduction')
            data_words, note_ids, id2word, corpus = utils.preprocess(
                self.all_note_contents, min_count, allowed_pos, stopwords,
                preprocess_option)
            #self.post_bows = pd.DataFrame(data={'NoteID':note_ids,'BoW':data_words}).set_index('NoteID')
            logging.debug('[CBF] - %d non-empty posts', len(corpus))
            logging.debug('[CBF] - %s extracted %d tokens/phrases',
                          preprocess_option, len(id2word))

            tfidf_matrix, tf_dicts, post_appear_dict = utils.tfidf(data_words)
            word2id = {v: k for k, v in id2word.items()}
            tfidf_corpus = [[(word2id[pair[0]], pair[1])
                             for pair in post.items()]
                            for post in tfidf_matrix]
            model = LsiModel(tfidf_corpus,
                             num_topics=self.feature_size,
                             id2word=id2word)

            for i, post_tfidf in enumerate(tfidf_corpus):
                note_id = note_ids[i]
                if not note_id in self.items:
                    post_repr = model[post_tfidf]
                    self.items[note_id] = [
                        p[1] for p in post_repr
                        if len(post_repr) == self.feature_size
                    ]
            self.model = model
            return True

        elif self.option == 'tfidf+keywords+lsi':
            logging.info(
                'CBF - Using TFIDF vectors on only 1/3 keywords of each post, LSI for dimension reduction'
            )
            data_words, note_ids, id2word, corpus = utils.preprocess(
                self.all_note_contents, min_count, allowed_pos, stopwords,
                preprocess_option)
            #self.post_bows = pd.DataFrame(data={'NoteID':note_ids,'BoW':data_words}).set_index('NoteID')
            print('CBF - %d non-empty posts' % len(corpus))
            print('CBF - %s BoW extracted %d tokens/phrases' %
                  (preprocess_option, len(id2word)))

            tfidf_matrix, tf_dicts, post_appear_dict = utils.tfidf(data_words)
            keywords = {
                i: utils.get_top_tfidfs(
                    tfidf_matrix[i],
                    len(tfidf_matrix[i]) //
                    3)  # TODO:  have over-threshold phrases as the keyword
                for i, m in enumerate(tfidf_matrix)
            }
            word2id = {v: k for k, v in id2word.items()}
            tfidf_corpus = [[(word2id[pair[0]], pair[1])
                             for pair in post.items()]
                            for post in keywords.values()]
            model = LsiModel(tfidf_corpus,
                             num_topics=self.feature_size,
                             id2word=id2word)

            for i, post_tfidf in enumerate(tfidf_corpus):
                note_id = note_ids[i]
                self.items[note_id] = model[post_tfidf]
            self.model = model
            return True

        elif self.option == 'KCB':
            from gensim import models
            data_words, note_ids, id2word, corpus = utils.preprocess(
                self.all_note_contents, min_count, allowed_pos, stopwords,
                preprocess_option)
            tfidf = models.TfidfModel(corpus)
            corpus_tfidf = tfidf[corpus]
            corpus_tfidf_lst = []
            for doc in corpus_tfidf:
                doc.sort(key=operator.itemgetter(1))
                doc = doc[-len(doc) // self.top_ratio:]
                corpus_tfidf_lst.append(doc)
            # print('kt',corpus_tfidf_lst)
            lsi_model = models.LsiModel(corpus_tfidf_lst,
                                        id2word=id2word,
                                        num_topics=self.feature_size
                                        )  # initialize an LSI transformation
            corpus_lsi = lsi_model[corpus_tfidf_lst]

            for i, post_repr in enumerate(corpus_lsi):
                note_id = note_ids[i]
                self.items[note_id] = [
                    p[1] for p in post_repr
                    if len(post_repr) == self.feature_size
                ]
            self.model = lsi_model
            return True

        elif self.option == 'word_emb+wmd':
            # Load pretrained FastText embeddings
            self.model = FastText.load('cleaned_data/all_notes_model')
            # print('using model:',self.model)
            # Cannot get post embeddings from word embeddings
            # It cannot stand alone as a CBF method
            return False

        elif self.option == 'keyword+word_emb+wmd':
            # Load pretrained FastText embeddings
            self.model = FastText.load('cleaned_data/all_notes_model')
            # print('using model:',self.model)
            # Cannot get post embeddings from word embeddings
            # It cannot stand alone as a CBF method
            return False

        elif self.option == 'keyword+ft_word_emb+sif':  # Using SIF on keyword --> sentence embedding

            data_words, note_ids, id2word, corpus = utils.preprocess(
                self.all_note_contents, min_count, allowed_pos, stopwords,
                preprocess_option)
            self.post_bows = pd.DataFrame(data={
                'NoteID': note_ids,
                'BoW': data_words
            }).set_index('NoteID')
            logging.debug('CBF - %d non-empty posts', len(corpus))
            logging.debug('CBF - %s BoW extracted %d tokens/phrases',
                          preprocess_option, len(id2word))

            sentence_list = []
            note_ids_lookup = []
            for note_id, post in self.post_bows.iterrows():
                word_list = []
                for word in post:
                    word_emd = self.model[word]
                    word_list.append(Word(word, word_emd))
                if len(word_list) > 0:  # did we find any words (not an empty set)
                    sentence_list.append(Sentence(word_list))
                    # in case there are some posts of 0 length, thus not included in this
                    note_ids_lookup.append(note_id)

            sentence_embs = {}
            sentence_vectors = sentence_to_vec(
                sentence_list,
                self.feature_size)  # all vectors converted together
            if len(sentence_vectors) == len(sentence_list):
                for i in range(len(sentence_vectors)):
                    # map: note_id -> vector
                    sentence_embs[note_ids_lookup[i]] = sentence_vectors[i]
            self.items = sentence_embs

            return True

        elif self.option == 'ft_word_emb+sif':  # Using SIF on whole text --> sentence embedding

            data_words, note_ids, id2word, corpus = utils.preprocess_raw(
                self.all_note_contents)
            self.post_bows = pd.DataFrame(data={
                'NoteID': note_ids,
                'BoW': data_words
            }).set_index('NoteID')

            sentence_list = []
            note_ids_lookup = []
            for note_id, post in self.post_bows.iterrows():
                word_list = []
                for word in post:
                    word_emd = self.model[word]
                    word_list.append(Word(word, word_emd))
                if len(word_list) > 0:  # did we find any words (not an empty set)
                    sentence_list.append(Sentence(word_list))
                    # in case there are some posts of 0 length, thus not included in this
                    note_ids_lookup.append(note_id)

            sentence_embs = {}
            sentence_vectors = sentence_to_vec(
                sentence_list,
                self.feature_size)  # all vectors converted together
            if len(sentence_vectors) == len(sentence_list):
                for i in range(len(sentence_vectors)):
                    # map: note_id -> vector
                    sentence_embs[note_ids_lookup[i]] = sentence_vectors[i]
            self.items = sentence_embs

            return True

        elif self.option == 'bert_word_emb+sif':

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                      max_len=512)
            #nids = [nid for nid in self.all_note_contents.NoteID.values if nid not in self.items.keys()]
            note_ids = self.all_note_contents.NoteID.to_list()
            MAX_LEN = 512
            tokenized_texts_list = []
            indexed_tokens_list = []
            attention_masks = []

            for text in self.all_note_contents.Contents.values:
                marked_text = "[CLS] " + text + " [SEP]"
                tokenized_text = tokenizer.tokenize(marked_text)
                tokenized_texts_list.append(tokenized_text)
                indexed_tokens_list.append(
                    tokenizer.convert_tokens_to_ids(tokenized_text))

            input_ids_list = pad_sequences(indexed_tokens_list,
                                           maxlen=MAX_LEN,
                                           dtype="long",
                                           truncating="post",
                                           padding="post")
            for seq in input_ids_list:
                seq_mask = [int(float(i > 0)) for i in seq]
                attention_masks.append(seq_mask)

            # Convert inputs to PyTorch tensors
            tokens_tensor = torch.tensor(input_ids_list)
            segments_tensors = torch.tensor(attention_masks)

            # Put the model in "evaluation" mode, meaning feed-forward operation.
            self.model.eval()
            with torch.no_grad():
                encoded_layers, _ = self.model(tokens_tensor, segments_tensors)

            emb_layers = encoded_layers[-4:]
            sum_layers = torch.stack(emb_layers, dim=0).sum(dim=0)
            sentence_word_embs = {}
            for i in range(len(tokenized_texts_list)):
                sentence_word_embs[
                    note_ids[i]] = sum_layers[i][:len(tokenized_texts_list[i])]
            tokenized_texts_ = {
                nid: tokenized_texts_list[i]
                for i, nid in enumerate(note_ids)
            }

            sentence_list = []
            note_ids_lookup = []
            for note_id in note_ids:
                #print(note_id)
                word_list = []
                for j in range(len(sentence_word_embs[note_id])):
                    word_emb = sentence_word_embs[note_id][j]
                    # Add here if to use only keywords
                    word_text = tokenized_texts_[note_id][j]
                    word_list.append(Word(word_text, word_emb.numpy()))
                if len(word_list) > 0:  # did we find any words (not an empty set)
                    sentence_list.append(Sentence(word_list))
                    # in case there are some posts of 0 length, thus not included in this
                    note_ids_lookup.append(note_id)
                    #print('wordlist',len(word_list))

            sentence_embs = {}
            sentence_vectors = sentence_to_vec(
                sentence_list,
                self.feature_size)  # all vectors converted together
            if len(sentence_vectors) == len(sentence_list):
                for i in range(len(sentence_vectors)):
                    # map: note_id -> vector
                    sentence_embs[note_ids_lookup[i]] = sentence_vectors[i]
            self.items = sentence_embs

            return True

        elif self.option == 'sentence_emb':
            note_ids = self.all_note_contents.NoteID.to_list()
            all_note_contents = self.all_note_contents['Contents'].to_list()

            sentence_embs = {}
            sentence_vectors = self.model[all_note_contents]
            if len(sentence_vectors) == len(all_note_contents):
                for i in range(len(sentence_vectors)):
                    # map: note_id -> vector
                    sentence_embs[note_ids[i]] = sentence_vectors[i].numpy()
            self.items = sentence_embs

            return True

        elif self.option == 'sentence_emb_precomputed':
            return True
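
Several branches above unpack utils.tfidf(data_words) into a per-post {token: weight} mapping, raw term-frequency dicts and a document-frequency dict. The helper is not shown; the sketch below is one implementation consistent with that usage, offered as an assumption rather than the project's code.

# Sketch consistent with `tfidf_matrix, tf_dicts, post_appear_dict = utils.tfidf(data_words)`;
# illustrative only.
import math
from collections import Counter

def tfidf(data_words):
    # data_words: list of token lists, one per post
    tf_dicts = [Counter(words) for words in data_words]
    post_appear_dict = Counter()            # in how many posts each token appears
    for words in data_words:
        post_appear_dict.update(set(words))
    n_posts = len(data_words)
    tfidf_matrix = [
        {w: (tf[w] / len(words)) * math.log(n_posts / post_appear_dict[w])
         for w in tf}
        for tf, words in zip(tf_dicts, data_words)
    ]
    return tfidf_matrix, tf_dicts, post_appear_dict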
Example #13
    def precompute_similarity(self, model_path, option, feature_size):
        all_note_contents = self.all_note_contents.Contents.to_list()
        all_note_nids = self.all_note_contents.NoteID.to_list()
        similarity_matrix = {}

        if option == 'ft_word_emd+sif':
            #'cleaned_data/ft_model_incr'
            # Load pretrained FastText embeddings
            model = FastText.load(model_path)
            logging.info('[Preprocessor] Using model: %s', str(model))

            similarity_matrix = {}
            nlp = spacy.load("en_core_web_sm")
            note_ids = all_note_nids
            contents = all_note_contents
            data_words = [[token.text for token in nlp(content)]
                          for note_id, content in zip(note_ids, contents)]
            post_tokens = pd.DataFrame(data={
                'NoteID': note_ids,
                'Tokens': data_words
            }).set_index('NoteID')

            sentence_list = []
            sentence_embs = {}
            for note_id, post in post_tokens.iterrows():
                word_list = []
                for word in post.values[0]:
                    word_emd = model[word]
                    word_list.append(Word(word, word_emd))
                if len(word_list) > 0:  # did we find any words (not an empty set)
                    sentence_list.append(Sentence(word_list))
                sentence_embs[note_id] = sentence_to_vec(
                    sentence_list, feature_size)

            # Compute post-wise cosine similarities
            for note_id1, emb1 in sentence_embs.items():
                for note_id2, emb2 in sentence_embs.items():
                    if note_id1 != note_id2 and (
                            note_id2, note_id1) not in similarity_matrix:
                        # apply l2-distance
                        #utils.l2_sim()
                        # apply cosine distance
                        sim = utils.cosine_sim(emb1[0], emb2[0])
                        similarity_matrix[(note_id1, note_id2)] = sim
                        similarity_matrix[(note_id2, note_id1)] = sim

            return similarity_matrix

        elif option == 'bert_word_emb+sif':
            # for BERT
            import torch
            from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
            from keras.preprocessing.sequence import pad_sequences

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                      max_len=128)
            MAX_LEN = 512
            tokenized_texts_list = []
            indexed_tokens_list = []
            attention_masks = []

            for text in all_note_contents.Contents.values:
                marked_text = "[CLS] " + text + " [SEP]"
                tokenized_text = tokenizer.tokenize(marked_text)
                tokenized_texts_list.append(tokenized_text)
                indexed_tokens_list.append(
                    tokenizer.convert_tokens_to_ids(tokenized_text))

            input_ids_list = pad_sequences(indexed_tokens_list,
                                           maxlen=MAX_LEN,
                                           dtype="long",
                                           truncating="post",
                                           padding="post")
            for seq in input_ids_list:
                seq_mask = [int(float(i > 0)) for i in seq]
                attention_masks.append(seq_mask)

            # Convert inputs to PyTorch tensors
            tokens_tensor = torch.tensor(input_ids_list)
            segments_tensors = torch.tensor(attention_masks)

            # Load pre-trained model (weights)
            model = BertModel.from_pretrained('bert-base-uncased')

            # Put the model in "evaluation" mode, meaning feed-forward operation.
            model.eval()

            with torch.no_grad():
                encoded_layers, _ = model(tokens_tensor, segments_tensors)

            emb_layers = encoded_layers[-4:]
            sum_layers = torch.stack(emb_layers,
                                     dim=0).sum(dim=0)  # 434*512*768
            sentence_word_embs = {}
            for i in range(len(tokenized_texts_list)):
                sentence_word_embs[
                    all_note_nids[i]] = sum_layers[i][:len(tokenized_texts_list[i])]

            # Keep a look up dictionary [note id] --> text content
            tokenized_texts_ = {
                nid: tokenized_texts_list[i]
                for i, nid in enumerate(all_note_nids)
            }

            embedding_size = feature_size  # Set the shape of the sentence/post embeddings
            sentence_list = []
            note_ids_lookup = []
            for note_id in all_note_nids:
                #print(note_id)
                word_list = []
                for j in range(len(sentence_word_embs[note_id])):
                    word_emb = sentence_word_embs[note_id][j]
                    # Add here if to use only keywords
                    word_text = tokenized_texts_[note_id][j]
                    word_list.append(Word(word_text, word_emb.numpy()))
                if len(word_list) > 0:
                    sentence_list.append(Sentence(word_list))
                    note_ids_lookup.append(
                        note_id
                    )  # in case there are some posts of 0 length, thus not included in this

            # Encode sentences/posts with embeddings
            sentence_embs = {}
            sentence_vectors = sentence_to_vec(
                sentence_list,
                embedding_size)  # all vectors converted together
            if len(sentence_vectors) == len(sentence_list):
                for i in range(len(sentence_vectors)):
                    # map: note_id -> vector
                    sentence_embs[note_ids_lookup[i]] = sentence_vectors[i]

            # Compute post-wise cosine similarities
            for note_id1, emb1 in sentence_embs.items():
                for note_id2, emb2 in sentence_embs.items():
                    if note_id1 != note_id2 and (
                            note_id2, note_id1) not in similarity_matrix:
                        # apply l2-distance
                        #utils.l2_sim()
                        # apply cosine distance
                        sim = utils.cosine_sim(emb1[0], emb2[0])
                        similarity_matrix[(note_id1, note_id2)] = sim
                        similarity_matrix[(note_id2, note_id1)] = sim

            return similarity_matrix, sentence_embs

        elif option == 'sentence_emb':
            import tensorflow as tf
            import tensorflow_hub as hub

            embed = hub.load(model_path)

            logging.info(
                '[Preprocessor] using model: universal-sentence-encoder-1')
            sentence_embs = {}
            sentence_vectors = embed(all_note_contents)
            if len(sentence_vectors) == len(all_note_contents):
                for i in range(len(sentence_vectors)):
                    # map: note_id -> vector
                    sentence_embs[
                        all_note_nids[i]] = sentence_vectors[i].numpy()

            #corr = np.inner(sentence_vectors, sentence_vectors)
            #cosine_similarities = tf.reduce_sum(tf.multiply(sentence_vectors, sentence_vectors), axis=1)
            #clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
            #sim_scores = 1.0 - tf.acos(clip_cosine_similarities)

            #print(sim_scores)
            #for i, sims in enumerate(sim_scores):
            #    for j, sim in enumerate(sims):
            ##        note_id1 = all_note_nids[i]
            #        note_id2 = all_note_nids[j]
            #        if not note_id1==note_id2:
            #            similarity_matrix[(note_id1, note_id2)] = sim

            # Compute post-wise cosine similarities
            for note_id1, emb1 in sentence_embs.items():
                for note_id2, emb2 in sentence_embs.items():
                    if note_id1 != note_id2 and (
                            note_id2, note_id1) not in similarity_matrix:
                        # apply l2-distance
                        #utils.l2_sim()
                        # apply cosine distance
                        sim = utils.cosine_sim(emb1, emb2)
                        similarity_matrix[(note_id1, note_id2)] = sim
                        similarity_matrix[(note_id2, note_id1)] = sim

            return similarity_matrix, sentence_embs

        elif option == 'tfidf+lsi':
            logging.info(
                '[Preprocessor] using TFIDF vectors, LSI for dimension reduction'
            )
            data_words, note_ids, id2word, corpus = utils.preprocess(
                self.all_note_contents, 10, ['NOUN', 'VERB'], STOP_WORDS,
                'tokens_phrases')
            #self.post_bows = pd.DataFrame(data={'NoteID':note_ids,'BoW':data_words}).set_index('NoteID')
            logging.debug('[Preprocessor] - %d non-empty posts', len(corpus))
            logging.debug('[Preprocessor] - %s extracted %d tokens/phrases',
                          'tokens_phrases', len(id2word))
            tfidf_matrix, tf_dicts, post_appear_dict = utils.tfidf(data_words)

            word2id = {v: k for k, v in id2word.items()}
            tfidf_corpus = [[(word2id[pair[0]], pair[1])
                             for pair in post.items()]
                            for post in tfidf_matrix]

            model = LsiModel(tfidf_corpus,
                             num_topics=feature_size,
                             id2word=id2word)

            sentence_embs = {}
            for i, post_tfidf in enumerate(tfidf_corpus):
                note_id = note_ids[i]
                if not note_id in sentence_embs:
                    post_repr = model[post_tfidf]
                    #print(post_repr)
                    #print(i)
                    sentence_embs[note_id] = np.array([
                        p[1] for p in post_repr
                        if len(post_repr) == feature_size
                    ])

            # Compute post-wise cosine similarities
            for note_id1, emb1 in sentence_embs.items():
                for note_id2, emb2 in sentence_embs.items():
                    if note_id1 != note_id2 and (
                            note_id2, note_id1) not in similarity_matrix:
                        if len(emb1) and len(emb2):
                            # apply l2-distance
                            #utils.l2_sim()
                            # apply cosine distance
                            sim = utils.cosine_sim(emb1, emb2)
                            similarity_matrix[(note_id1, note_id2)] = sim
                            similarity_matrix[(note_id2, note_id1)] = sim

            return similarity_matrix, sentence_embs
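
All of the pairwise similarities above go through utils.cosine_sim, which is not shown; a minimal sketch of the usual cosine similarity it presumably computes:

# Assumed implementation of utils.cosine_sim; the project's version may differ.
import numpy as np

def cosine_sim(v1, v2):
    v1 = np.asarray(v1, dtype=float).ravel()
    v2 = np.asarray(v2, dtype=float).ravel()
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0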