from IO.file_operation import FileOperation
from IO.database_operation import MongoOperation


class NewsSpiderPipeline(object):
    def __init__(self):
        self.file = FileOperation()
        self.db = MongoOperation()

    def process_item(self, item, spider):
        # Append the crawled news item to the news list file and insert it into MongoDB.
        self.file.get_news_list_file(item)
        self.db.news_db_add(item)
        return item

    def spider_closed(self, spider):
        pass
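# A minimal sketch of how NewsSpiderPipeline could be enabled in the crawler's
# Scrapy settings.py. The module path "news_spider.pipelines" and the priority
# value 300 are assumptions, not taken from the source project.
ITEM_PIPELINES = {
    'news_spider.pipelines.NewsSpiderPipeline': 300,
}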
import numpy
from gensim.models import KeyedVectors

from IO.database_operation import MongoOperation
from ltp_parser import LtpParser  # assumed import path for the pyltp wrapper


class EventVector:
    def __init__(self):
        self.mongo_operation = MongoOperation()
        self.ltp_parser = LtpParser()
        self.words_embedding_file = "../Data/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt"
        self.wv_from_text = KeyedVectors.load_word2vec_format(
            self.words_embedding_file, binary=False, limit=100000)
        self.wv_from_text.init_sims(replace=True)

    # Get the events from the comma-separated event triples stored in the database, without duplicates.
    def get_event_from_triple(self):
        event_triple_sets = self.mongo_operation.event_db_get()
        event_sets = list()
        for event_triple in event_triple_sets:
            event = event_triple.split(',')[0]
            if event not in event_sets:
                event_sets.append(event)
            event = event_triple.split(',')[2]
            if event not in event_sets:
                event_sets.append(event)
        return event_sets

    # Compute each event's vector as the average of its word vectors.
    def get_event_vectors(self, event_sets):
        events_list = list()
        for event in event_sets:
            event_dict = dict()
            words_list = self.ltp_parser.get_words_by_pyltp(event)
            # Start from a zero vector with the same dimensionality as the embeddings.
            vector_sum = [0 for index in range(self.wv_from_text.vector_size)]
            for word in words_list:
                if word not in self.wv_from_text:
                    continue  # skip out-of-vocabulary words instead of raising KeyError
                vector = self.wv_from_text[word]
                vector_sum = list(
                    numpy.array(vector_sum) + numpy.array(vector))
            event_vector = [i / len(words_list) for i in vector_sum]
            event_dict["event"] = event
            event_dict["vector"] = event_vector
            events_list.append(event_dict)
        return events_list
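# A minimal usage sketch (an assumption, not part of the source): build the
# distinct event set from the stored triples, then average word vectors to get
# one fixed-size vector per event.
if __name__ == '__main__':
    event_vector = EventVector()
    event_sets = event_vector.get_event_from_triple()
    events_list = event_vector.get_event_vectors(event_sets)
    for event_dict in events_list[:3]:
        # Each entry pairs the event string with its averaged embedding.
        print(event_dict["event"], len(event_dict["vector"]))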
from event_extractor import EventExtrator
from event_relation_extractor import EventRelationExtractor
from IO.file_operation import FileOperation
from IO.database_operation import MongoOperation

if __name__ == '__main__':
    content = ""
    # content = '''
    # 部分研究结果显示即使在病毒传播密集地区,已有抗体的人群比例仍很低,意味着绝大多数人易感。
    # 如果想回到没有封锁措施的社会,需要等待一个中长期过程。
    # 因为我已经成功完成了作业了,因此我可以出去玩。
    # 我出去玩,结果摔了一跤。
    # '''
    # print(content)
    file_operation = FileOperation()
    db_operation = MongoOperation()
    # Starting line index to read from.
    flag = 0
    # Read the sentences on lines flag to flag + 5 of the test file.
    text_list = file_operation.get_file_rows_list(
        file_operation.TEST_TEXT_FILE, flag, flag + 5)
    for index in range(len(text_list)):
        content += text_list[index]
    event_sets_list = list()
    event_extractor = EventExtrator()
    event_relation_extractor = EventRelationExtractor()
    event_relations_list = event_relation_extractor.event_relation_extrator_main(
        content)
    for event_relation in event_relations_list:
        if event_relation:
            pre_event = event_extractor.event_extrator_main(event_relation[0])
    def __init__(self):
        # fd = open(self.EVENT_RELATIONS_LIST_FILE_NAME, 'r')
        # self.event_triple_sets = fd.readlines()
        db = MongoOperation()
        self.event_triple_sets = db.event_db_get()
        print('\n'.join(self.event_triple_sets))
    def do_cluster(self, numiter):
        for i in range(numiter):
            flag = 0
            # M step: recompute the center of each cluster.
            for cluster in self.clusters:
                cluster.update_center()
            # E step: assign every point to the nearest (newest) center;
            # flag counts how many points changed cluster.
            for point in self.points:
                flag += self.assign_point_cluster(point)
            # Converged: no point moved during this iteration.
            if flag == 0:
                break


if __name__ == "__main__":
    mongo_operation = MongoOperation()
    vectors_list = mongo_operation.vector_db_get()
    dataset = list()
    for vector_dict in vectors_list:
        dataset.append(vector_dict["vector"])
    # dataset = [[1, 1], [2, 3], [-1, 4], [5, 2], [-3, -7], [4, -2], [4, 2], [3, 3],
    #            [-2, 3], [-5, -3], [2, 5], [1, -2], [3, 0], [0, 0], [1, 6], [1, 7]]
    kmean = Kmeanplusplus(dataset, 4)
    kmean.do_cluster(1000)
    for cluster in kmean.clusters:
        print("============")
        print("cluster:")
        print(cluster.center)
        for point in cluster.points:
            print(point.data)
    for vector_dict in vectors_list:
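# The Kmeanplusplus constructor is not shown above. Below is a minimal sketch
# of the standard k-means++ seeding the class is presumably named after
# (D^2-weighted sampling of initial centers). The function name kmeanspp_seed
# and the use of random.choices are assumptions, not code from the source.
import random


def kmeanspp_seed(dataset, k):
    # Pick the first center uniformly at random.
    centers = [random.choice(dataset)]
    while len(centers) < k:
        # Squared distance from each point to its nearest already-chosen center.
        d2 = [
            min(
                sum((x - c) ** 2 for x, c in zip(point, center))
                for center in centers
            )
            for point in dataset
        ]
        # Sample the next center with probability proportional to D^2.
        next_center = random.choices(dataset, weights=d2, k=1)[0]
        centers.append(next_center)
    return centers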