Example #1
 def __init__(self, target_train_path, test_x_path, test_y_path):
     self.svm_model = NuSVC()
     self.bayes_model = MultinomialNB()
     self.nn_model = MLPClassifier()
     self.target_data = datapreprocess.read_file(target_train_path)
     self.test_x = datapreprocess.read_file(test_x_path)
     self.test_y = datapreprocess.read_file(test_y_path)
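
A self-contained sketch of how the three classifiers initialised above are typically trained and compared (synthetic stand-in data; the real feature matrices come from the read_file calls, and MultinomialNB assumes non-negative count features):

import numpy as np
from sklearn.svm import NuSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

# Toy count features so MultinomialNB's non-negativity requirement holds.
rng = np.random.default_rng(0)
train_x, train_y = rng.poisson(3, (100, 20)), rng.integers(1, 7, 100)
test_x, test_y = rng.poisson(3, (30, 20)), rng.integers(1, 7, 30)

for model in (NuSVC(), MultinomialNB(), MLPClassifier(max_iter=500)):
    model.fit(train_x, train_y)
    print(type(model).__name__, model.score(test_x, test_y))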
Example #2
 def __init__(self, train_path, corpora_split_path, model_path):
     self.trigger_path = "../Data/trigger/"
     self.model_path = model_path
     self.target_train_data = datapreprocess.read_file(train_path)
     self.corpora_split_content = datapreprocess.read_file(
         corpora_split_path)
     self.seed_words, self.top_seed_trigger, self.seed_event_vec, self.event_words, self.event_vec = \
         None, None, None, None, None
Example #3
 def __init__(self, train_path, corpora_split_path, model_path, train_x_path,
              train_y_path, test_x_path, test_y_path, weight_1, weight_2,
              method, threshold, top, enrich, source_from, new):
     Trigger.__init__(self, train_path, corpora_split_path, model_path)
     self.event_vec = Trigger.run(self,
                                  weight_1=weight_1,
                                  weight_2=weight_2,
                                  method=method,
                                  threshold=threshold,
                                  top=top,
                                  enrich=enrich,
                                  source_from=source_from,
                                  new=new)
     self.model_path = "../Data/model/"
     self.train_x = datapreprocess.read_file(train_x_path)
     self.train_y = datapreprocess.read_file(train_y_path)
     self.test_x = datapreprocess.read_file(test_x_path)
     self.test_y = datapreprocess.read_file(test_y_path)
Example #4
 def __init__(self, train_path, corpora_split_path, test_vec_path,
              test_label_path, model_path, weight_1, weight_2, threshold,
              top_1, top_2, enrich, source_from, new):
     Trigger.__init__(self, train_path, corpora_split_path, model_path,
                      test_vec_path)
     self.event_vec = Trigger.run(self,
                                  weight_1=weight_1,
                                  weight_2=weight_2,
                                  threshold=threshold,
                                  top_1=top_1,
                                  top_2=top_2,
                                  enrich=enrich,
                                  source_from=source_from,
                                  new=new)
     self.model_path = "../Data/model/"
     self.train_x = self.target_train_data
     self.train_y = self.target_train_data
     self.test_x = self.test_split_data
     self.test_y = datapreprocess.read_file(test_label_path)
Example #5
 def convert_one_hot(self, item_data):
     # Build the vocabulary from every tokenised sentence in the target set.
     target_words = datapreprocess.read_file(
         path="../Data/pre_process/target_represent.csv")["Split_content"]
     all_words = set()
     for i in range(len(target_words.index)):
         all_words = all_words | set(literal_eval(target_words.iloc[i]))
     all_words = list(all_words)
     # One row per sample, one column per vocabulary word (term counts).
     onehot_result = pd.DataFrame(np.zeros(
         (len(item_data.index), len(all_words))),
                                  columns=all_words,
                                  index=item_data.index)
     for i in range(len(item_data.index)):
         words = literal_eval(item_data["Split_content"].iloc[i])
         for word in words:
             if word in all_words:
                 onehot_result.loc[item_data.index[i], word] += 1
     return onehot_result
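
The method above is a bag-of-words count encoding over a fixed vocabulary (the "Split_content" column stores each sentence's tokens as a stringified list, hence the literal_eval). A self-contained sketch of the same idea on toy data:

from ast import literal_eval
import numpy as np
import pandas as pd

vocab = ["发布", "合作", "销售"]  # stand-in for the vocabulary built above
frame = pd.DataFrame(
    {"Split_content": ["['发布', '销售', '销售']", "['合作']"]})
counts = pd.DataFrame(np.zeros((len(frame.index), len(vocab))),
                      columns=vocab, index=frame.index)
for i in range(len(frame.index)):
    for word in literal_eval(frame["Split_content"].iloc[i]):
        if word in vocab:
            counts.loc[frame.index[i], word] += 1
print(counts)  # row 0 counts 发布=1, 销售=2; row 1 counts 合作=1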
Example #6
    def add_new_word(self,
                     threshold=0.8,
                     top=2000,
                     enrich=True,
                     source_from=1,
                     new=False):
        """
        Extract the top high-frequency new words from the corpus or test set
        and add them to the event trigger words.
        :return: dict mapping each event label to its trigger words
        """
        if enrich:
            logger.info("Enriching seed verbs to form trigger words...")
            # Load high-frequency new words from the corpus
            if source_from == 1:
                # Keep only the tokenised column so both branches yield a Series.
                corpora_split_content = \
                    self.corpora_split_content["Split_content"].copy()
            else:
                # Load high-frequency new words from the test set
                corpora_split_content = \
                    datapreprocess.read_file(path="../Data/pre_process/target_test_x.csv")["Split_content"]
            model = Word2Vec.load(self.model_path)
            new_trigger_words, seed_event_vec, all_word = \
                self.top_seed_trigger.copy(), self.seed_event_vec, []
            final_trigger_words = {}
            # Collect every token, then keep the `top` most frequent ones.
            for i in range(len(corpora_split_content.index)):
                all_word.extend(literal_eval(corpora_split_content.iloc[i]))
            high_frequency_words = self.sort_words(all_word, top=top)
            new_trigger_words_list = []
            for value in new_trigger_words.values():
                for wd in value:
                    new_trigger_words_list.append(wd[0])
            # Add new words to the trigger dictionary.
            if not new:
                for item in high_frequency_words:
                    if item[0] not in new_trigger_words_list:
                        cosine_value = []
                        for i in sorted(seed_event_vec.keys()):
                            if item[0] in model.wv:
                                vec_pair = [
                                    seed_event_vec[i], model.wv[item[0]]
                                ]
                                cosine_value.append(
                                    cosine_similarity(vec_pair)[1, 0])
                        if cosine_value and max(cosine_value) >= threshold:
                            event = cosine_value.index(max(cosine_value)) + 1
                            logger.info(
                                "Added word: {word}, frequency: {frequency}, "
                                "similarity: {simi}, event: {event}".format(
                                    word=item[0],
                                    frequency=item[1],
                                    simi=max(cosine_value),
                                    event=event))
                            new_trigger_words[event].append(item)
            else:
                # Rebuild the trigger dictionary from scratch for all six events.
                new_trigger_words = {1: [], 2: [], 3: [], 4: [], 5: [], 6: []}
                for item in high_frequency_words:
                    cosine_value = []
                    for i in sorted(seed_event_vec.keys()):
                        if item[0] in model.wv:
                            vec_pair = [seed_event_vec[i], model.wv[item[0]]]
                            cosine_value.append(
                                cosine_similarity(vec_pair)[1, 0])
                    if cosine_value and max(cosine_value) >= threshold:
                        event = cosine_value.index(max(cosine_value)) + 1
                        logger.info(
                            "Added word: {word}, frequency: {frequency}, "
                            "similarity: {simi}, event: {event}".format(
                                word=item[0],
                                frequency=item[1],
                                simi=max(cosine_value),
                                event=event))
                        new_trigger_words[event].append(item)

            datapreprocess.write_json_file(path=self.trigger_path +
                                           "event_words_add_new.json",
                                           item_data=new_trigger_words)
            # Truncate every event's list to the shortest one so each label
            # keeps the same number of trigger words.
            high_top = min(len(x) for x in new_trigger_words.values())
            logger.info("Words kept per label: {num}".format(num=high_top))
            for key, value in new_trigger_words.items():
                final_trigger_words[key] = value[:high_top]
            datapreprocess.write_json_file(path=self.trigger_path +
                                           "final_event_words.json",
                                           item_data=final_trigger_words)
            datapreprocess.write_file(
                path=self.trigger_path + "final_event_word.csv",
                item_data=pd.DataFrame(final_trigger_words).T)
            return final_trigger_words
        else:
            return self.top_seed_trigger.copy()
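
The core of the enrichment above is assigning each candidate word to the event whose seed vector it is most similar to, keeping it only when the best similarity clears the threshold. A self-contained sketch with toy vectors (the real ones come from the loaded Word2Vec model):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

seed_event_vec = {1: np.array([1.0, 0.0]), 2: np.array([0.0, 1.0])}
word_vec = np.array([0.9, 0.1])  # stand-in for model.wv[word]
threshold = 0.8

sims = [cosine_similarity([seed_event_vec[k], word_vec])[1, 0]
        for k in sorted(seed_event_vec)]
if sims and max(sims) >= threshold:
    event = sims.index(max(sims)) + 1  # event labels are 1-based
    print("assign to event", event, "with similarity", max(sims))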
Example #7
        print(company[i],
              len(clean_data[clean_data["Company"] == company[i]].index))
    for i in range(1, 7):
        print(i, len(clean_data[clean_data["Label"] == i].index))
    print(len(clean_data.index))
    print(clean_data["Company"].unique())
    datapreprocess.write_file(path="../Data_2/raw/target.csv",
                              item_data=clean_data)
    return clean_data


def form_test_data(item):
    test_data = item.drop("Sentence_vec", axis=1)
    datapreprocess.write_file(path="../Data_2/pre_process/test_data_label.csv",
                              item_data=test_data)


if __name__ == "__main__":
    company_list = [
        '中兴通讯', '华为中国区', 'vivo智能手机', 'coolpad官方微博', 'OPPO', '联想', '小米公司',
        '金立智能手机', "小辣椒", "360手机", 'HTC官方微博', '魅族科技', '天语手机', '超级手机官微',
        '朵唯女性手机', '锤子科技', 'TCL通讯中国', 'nubia智能手机'
    ]
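    # Label names: 招募 = recruiting, 合作 = cooperation, 研发 = R&D,
    # 推广 = promotion, 销售 = sales, 无 = none of these.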
    label_num = {"招募": 1, "合作": 2, "研发": 3, "推广": 4, "销售": 5, "无": 6}
    # raw_data = datapreprocess.read_file(path="../Data/raw/Corpora.csv")
    # form_clean_corpora(raw_data, company_list)
    # item = datapreprocess.read_file(path="../Data_2/raw/Corpora_label.csv")
    # form_target_data(item=item, label_map=label_num, company=company_list)
    data = datapreprocess.read_file(path="../Data_2/pre_process/test_data.csv")
    form_test_data(data)
Example #8
    def write_file(self, item_data):
        with open("../Data/raw/Corpora.csv", "w", encoding="utf-8") as f:
            f.write("Weibo_id" + '\t' + "Company" + '\t' + "Content" + '\t' +
                    "Time"
                    '\t' + "Comment" + '\t' + 'Like' + '\t' + "Transfer" +
                    '\n')
            i = 1
            for item in item_data:
                f.write("weibo_" + str(i) + '\t' + item["nickname"] + '\t' +
                        item["Post"] + '\t' + item["Pubtime"] + '\t' +
                        str(item["Comment_num"]) + '\t' +
                        str(item["Like_num"]) + '\t' +
                        str(item["Transfer_num"]) + '\n')
                i += 1
        return i
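
A hedged alternative to the manual '\t' joins in write_file: csv.writer with a tab delimiter quotes any field that contains the delimiter or a newline, so stray tabs in post content cannot break a row (a sketch assuming the same item keys; write_corpora_tsv is a hypothetical name):

import csv

def write_corpora_tsv(path, item_data):
    # Same columns as write_file, with quoting delegated to csv.writer.
    with open(path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(["Weibo_id", "Company", "Content", "Time",
                         "Comment", "Like", "Transfer"])
        i = 0
        for i, item in enumerate(item_data, start=1):
            writer.writerow(["weibo_" + str(i), item["nickname"],
                             item["Post"], item["Pubtime"],
                             item["Comment_num"], item["Like_num"],
                             item["Transfer_num"]])
    return i + 1  # mirrors write_file's returned counter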


if __name__ == "__main__":
    conn = MongoDB()
    items = conn.find_item()
    conn.write_file(item_data=items)
    corpora = datapreprocess.read_file(
        "../Data/pre_process/corpora_clean_data.csv")
    print(corpora["Company"].unique())
    company_list = [
        '中兴通讯', '华为中国区', 'vivo智能手机', 'coolpad官方微博', 'OPPO', '联想', '小米公司',
        '金立智能手机', "小辣椒", "360手机", 'HTC官方微博', '魅族科技', '天语手机', '超级手机官微', '一加手机',
        '朵唯女性手机', '锤子科技', 'TCL通讯中国', 'nubia智能手机'
    ]