def make_embedding_matrix(word_index, fname):
    """Build a dense embedding matrix aligned with a tokenizer word index.

    Row ``idx`` holds the L2-normalized fastText vector of the word mapped
    to ``idx``; row 0 is left all-zero (padding slot).
    """
    model = FastText(os.path.join('embeddings', fname))
    matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM), dtype='float32')
    for word, idx in word_index.items():
        matrix[idx] = model.get_numpy_vector(word, normalized=True)
    return matrix
class Fasttext_clf(BaseEstimator, ClassifierMixin):
    """Scikit-learn style wrapper around a pre-trained pyfasttext address model.

    ``predict``/``predict_proba`` accept either a single string or a list
    of strings, mirroring the underlying pyfasttext single/batch API.
    """

    data_path = pkg_resources.resource_filename('addr_detector.model', 'ft_ad.ftz')

    def __init__(self, path=data_path):
        self.model = FastText(path)
        self.default = '0'  # label used when the model returns no prediction

    def fit(self, X, y):
        # Model is pre-trained; nothing to fit.
        return self

    def predict(self, X):
        """Return predicted labels for a string or a list of strings."""
        results = []
        if isinstance(X, str):
            # Bug fix: the predict_single call had been commented out,
            # leaving `res` undefined -> NameError on every str input.
            res = self.model.predict_single(X)
            results = results + [self.default if not res else res]
        elif isinstance(X, list):
            # Bug fix: predict(X) was invoked twice (first result unused);
            # call once and reuse it.
            res = self.model.predict(X)
            results = results + res
        return results

    def predict_proba(self, X):
        """Return per-label probabilities for a string or a list of strings."""
        results = []
        if isinstance(X, str):
            results = results + [self.model.predict_proba_single(X)]
        elif isinstance(X, list):
            results = results + self.model.predict_proba(X)
        return results
def collect_docs(p, lang_detection_model_name=None, lang='en'):
    """Pair URL lines with the ``<PAR>…</PAR>`` paragraphs on the next line.

    :param p: iterable of text lines; a line matching a URL pattern is
        assumed to be followed by a line containing the paragraphs.
    :param lang_detection_model_name: optional fastText language-id model
        (resolved via SparkFiles); when given, paragraphs are filtered
        to those detected as *lang*.
    :param lang: language code kept when filtering (default 'en').
    :return: list of ``(url, [paragraph, ...])``; URLs without paragraphs
        are skipped.
    """
    # Bug fix: `model` was bound only inside the branch below, so the later
    # `if model:` raised NameError whenever no model name was supplied.
    model = None
    if lang_detection_model_name is not None:
        from pyfasttext import FastText
        model_path = SparkFiles.get(lang_detection_model_name)
        model = FastText(model_path)
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    result = []
    lines = list(p)
    indices = [i for i, line in enumerate(lines) if regex.search(line.strip())]
    for idx in indices:
        # Robustness fix: a URL on the final line has no content line after it.
        if idx + 1 >= len(lines):
            continue
        content = lines[idx + 1]
        paras = re.findall('<PAR>(.*?)</PAR>', content, re.DOTALL)
        if model:
            # keep only paragraphs whose detected language matches *lang*
            langs = model.predict(paras)
            kept = list(filter(lambda pair: lang in pair[1], zip(paras, langs)))
            paras = list(map(lambda pair: pair[0], kept))
        if paras:
            url = lines[idx].strip()
            result.append((url, paras))
    return result
def fasttext(text):
    """Score *text* with the audit fastText model.

    Returns ``pred[0][1]`` — presumably the probability attached to the
    top-ranked label (TODO confirm against pyfasttext's return shape).
    """
    model = FastText('./model_audit.bin')
    # flatten newlines and append the trailing '\n' the predictor expects
    sample = text.replace('\n', ' ') + '\n'
    pred = model.predict_proba_single(sample, k=2)
    return pred[0][1]
def use_pyfasttext_model():  # OK
    # Models can be trained with the fasttext CLI tool (../doc/fastText_train.png)
    # or with the pyfasttext package used in this file.
    """
    # OK: 1. loading a model trained with the pyfasttext package
    model = FastText("../data/lxw_model_sg_pyfasttext.bin")
    print(model["先生"])  # type(model["先生"]): <class 'array.array'>
    print(model.get_numpy_vector("先生"))  # type: <class 'numpy.ndarray'>
    print(model["刘晓伟"])  # OOV
    print(model.get_numpy_vector("刘晓伟"))
    print(model["陈贺"])  # OOV
    print(model.get_numpy_vector("陈贺"))

    model = FastText("../data/lxw_model_cbow_pyfasttext.bin")
    print(model["先生"])
    print(model.get_numpy_vector("先生"))  # type: <class 'numpy.ndarray'>
    print(model["刘晓伟"])  # OOV
    print(model.get_numpy_vector("刘晓伟"))
    print(model["陈贺"])  # OOV
    print(model.get_numpy_vector("陈贺"))

    # NOTE: quick tests show the two different models produce the SAME vector
    # for the same OOV word (same as the fasttext package; see
    # NO_2_use_fasttext_model), while in-vocabulary vectors differ.
    """
    # OK: 2. loading a model trained with the fasttext CLI tool
    model = FastText("../data/880w_fasttext_skip_gram.bin")
    print(model["先生"])  # type(model["先生"]): <class 'array.array'>
    print(model.get_numpy_vector("先生"))
    # print(model["刘晓伟"])  # OK. OOV
    # print(model["陈贺"])  # OK. OOV

    # Sentence and text vectors.
    # Bug fix: the original block ended with a dangling unterminated `"""`,
    # which is a syntax error; it has been removed.
    sentence_vec = model.get_numpy_sentence_vector("刘晓伟 是 个 好人")
    print(sentence_vec)
def test_vector(self):
    """Smoke-test: one-epoch supervised training on the bundled sample corpus."""
    trainer = FastText()
    trainer.supervised(
        input='/input/tests/data/text.txt',
        output='model',
        epoch=1,
        lr=0.7,
    )
def predict(self, test_set, test_labels_vector=None, report_accuracy=True):
    """ uses the trained model to predict the test set
    :param test_set: the test set
    :param test_labels_vector: the labels vector of the test set for accuracy computation
    :param report_accuracy: defines whether to report the prediction or not
    """
    if self.model_name:
        from pyfasttext import FastText
        predictor = FastText()
        predictor.load_model('ft_extras/' + self.model_name + '.bin')
        predicted_labels = predictor.predict_proba(test_set)
        if report_accuracy and test_labels_vector:
            test_set_size = len(test_set)
            correct_predictions = 0
            invalid_labels = 0
            for index, labels in enumerate(predicted_labels):
                if labels:
                    # keep the (label, probability) pair with the highest probability
                    best_label = max(labels, key=lambda label: label[1])
                    if best_label[0] == test_labels_vector[index]:
                        correct_predictions += 1
                else:
                    invalid_labels += 1
            scored = test_set_size - invalid_labels
            # Bug fix: guard against ZeroDivisionError when every
            # prediction came back empty (scored == 0).
            if scored:
                print('Prediction accuracy:{}\n'.format(correct_predictions / scored))
            else:
                print('Prediction accuracy: n/a (no valid predictions)\n')
    else:
        print('Please use the train method to train a model first.')
    return
class FastTextEmbedding(Embedding):
    """Embedding backed by a binary fastText model loaded via pyfasttext."""

    def __init__(self, binfile, normalize=False):
        self.file = binfile
        self.vdim = -1  # set once load() has run
        self.normalize = normalize

    def load(self):
        """Load the model file and record the vector dimensionality."""
        print('Loading fasttext model.')
        self.ftmodel = FastText()
        self.ftmodel.load_model(self.file)
        # probe an arbitrary token to discover the embedding size
        self.vdim = len(self.ftmodel['is'])
        print('Finished loading fasttext model.')
        return self

    def getVector(self, word):
        return self.ftmodel.get_numpy_vector(word, normalized=self.normalize)

    def search(self, q, topk=4):
        raise NotImplementedError()

    def wordForVec(self, v):
        """Return the (word, similarity) pair closest to vector *v*."""
        best = self.ftmodel.words_for_vector(v)[0]
        return best[0], best[1]

    def containsWord(self, word):
        # fastText composes vectors from subwords, so any token is representable.
        return True

    def vocabulary(self):
        return self.ftmodel.words

    def dim(self):
        return self.vdim
def load(self):
    """Load the fastText binary at ``self.file`` and record its vector size."""
    print('Loading fasttext model.')
    self.ftmodel = FastText()
    self.ftmodel.load_model(self.file)
    # probe an arbitrary in-vocabulary token to learn the dimensionality
    self.vdim = len(self.ftmodel['is'])
    print('Finished loading fasttext model.')
    return self
def main():
    """For each target word, k-means-cluster its 15 nearest neighbours into 3 groups."""
    model = FastText('model_text8.bin')
    target_words = [
        'granada', 'python', 'harmony', 'mafia', 'yoga', 'goth', 'cyberpunk',
        'nasa', 'japan', 'boolean', 'foodball', 'algorithm', 'china', 'usa',
        'internet', 'harvard', 'earth', 'horse', 'angel', 'rock'
    ]
    for t_word in target_words:
        # embedding fetched as in the original (value itself is unused below)
        target_word_embedding = model.get_numpy_vector(t_word)
        print('Target word:', t_word)
        # gather the neighbourhood's embeddings into a (15, 128) matrix
        closest_words = model.nearest_neighbors(t_word, k=15)
        nn_word_embedding = np.zeros(shape=(15, 128))
        for row, (word, similarity) in enumerate(closest_words):
            nn_word_embedding[row] = model.get_numpy_vector(word)
        # cluster the neighbourhood
        cluster_model = KMeans(n_clusters=3, init='k-means++')
        prediction = cluster_model.fit_predict(nn_word_embedding)
        print(prediction)
        for row, entry in enumerate(closest_words):
            print('Word:', entry[0], '- Cluster #%d' % (prediction[row] + 1))
def load_model(self):
    """Load the embedding model named by ``self.model_name`` from ``self.model_path``.

    Raises:
        FileNotFoundError: when the model file does not exist on disk.
    """
    if not os.path.exists(self.model_path):
        raise FileNotFoundError('model file not found!')
    use_fasttext = self.model_name == 'fasttext'
    if use_fasttext:
        self.model = FastText(self.model_path)
    else:
        # memory-mapped gensim word2vec model
        self.model = gensim.models.Word2Vec.load(self.model_path, mmap='r')
def get_language(text):
    """Given a list of lines, return a list of (line, lang)"""
    # Lazily build the language-id model once and cache it on settings.
    if not hasattr(settings, '_lang_detector'):
        detector = FastText()
        detector.load_model(settings.LID_MODEL_PATH)
        settings._lang_detector = detector
    return settings._lang_detector.predict([text])[0]
def load(self):
    """Load the fastText model at ``self.filepath``.

    :return: ``"success"`` on success, an error string on failure.
    """
    try:
        self.ft = FastText(self.filepath)
    except Exception:
        # Bug fix: the original bare ``except:`` also swallowed
        # SystemExit/KeyboardInterrupt; narrow it to Exception.
        return "Failed to Load FT file"
    logger.info(f"loaded file {self.filepath}")
    self.loaded = True
    return "success"
def text():
    """Demo: load the Chinese wiki model and print 桃's 5 nearest neighbours."""
    model = FastText('wiki.zh.bin')
    print('load over..')
    # sample strings kept from the original (not used by the query below)
    s1, s2, s3 = '启航', '董启航', ' 董启文'
    print(model.nearest_neighbors('桃', k=5))
#text()
def pyfasttext_sample():
    """https://pypi.org/project/pyfasttext/ """
    model = FastText()
    # model.load_model('output/model_cooking_6.bin')
    model.load_model('output/model_cooking_5.ftz')
    # predict the top-2 labels for every line of the validation file
    predictions = model.predict_file('data/cooking/pre_cooking.valid', 2)
    for line_no, labels in enumerate(predictions):
        print(line_no, labels)
def build_w2v(relevant_tokens, model_file='wiki.cy.bin'):
    """Map each token in *relevant_tokens* to its fastText vector.

    :return: dict of token -> numpy vector
    """
    # using this library because it's more memory friendly for python :)
    from pyfasttext import FastText
    model = FastText(model_file)
    return {token: model.get_numpy_vector(token) for token in relevant_tokens}
def train():
    """Sweep supervised-training epochs (1-29) at a fixed learning rate.

    Trains one fastText classifier per epoch count, validates each one,
    keeps only the model file with the best micro-F1 on disk and deletes
    the rest.
    """
    train_file = const.train_processed_binary_file_name
    validate_file = const.validate_processed_binary_file_name
    current_best_score = 0   # best micro-F1 seen so far
    current_best_name = ''   # path of the .bin currently kept as best
    lr = 0.01                # fixed learning rate for every run
    for epoch_i in range(1, 30):
        start_time = datetime.datetime.now().replace(microsecond=0)
        model_file_name = 'data/model_' + str(lr) + '_' + str(epoch_i)
        model = FastText()
        # supervised training; fastText writes model_file_name.bin/.vec
        model.supervised(input=train_file, output=model_file_name, lr=lr,
                         epoch=epoch_i, loss='softmax', wordNgrams=3,
                         thread=12, ws=5, minn=2, maxn=4, dim=50)
        micro_precision, micro_recall, micro_f1, macro_precision, macro_recall, macro_f1 = validate_model(
            model_file_name, validate_file)
        end_time = datetime.datetime.now().replace(microsecond=0)
        result_log = ("epoch:" + str(epoch_i) + ': micro precision:' +
                      str(round(micro_precision, 4)) + ', micro_recall:' +
                      str(round(micro_recall, 4)) + ', micro_f1:' +
                      str(round(micro_f1, 4)) + ', macro_precision:' +
                      str(round(macro_precision, 4)) + ', macro_recall:' +
                      str(round(macro_recall, 4)) + ', macro_f1:' +
                      str(round(macro_f1, 4)) + ', lr:' + str(lr) +
                      ', duration:' + str(end_time - start_time))
        if current_best_score < micro_f1:
            current_best_score = micro_f1
            print(result_log + ' ====> Model improved!!!!')
            # drop the previous best's binary.
            # NOTE(review): the previous best's .vec file is never removed
            # here — confirm whether that is intentional.
            if current_best_name != '':
                os.remove(current_best_name)
            current_best_name = model_file_name + '.bin'
        else:
            print(result_log)
            # not an improvement: delete both artifacts of this run
            os.remove(model_file_name + '.bin')
            os.remove(model_file_name + '.vec')
        sys.stdout.flush()
def text():
    """Demo: sentence-level similarity with the Chinese wiki fastText model.

    Each passage is truncated to its first 100 characters before the
    pairwise similarity calls.
    """
    model = FastText('wiki.zh.bin')
    print('load over..')
    # three Chinese passages: fruit facts (s1) and two AIIB-related texts (s2, s3)
    s1 = '水果是指多汁且有甜味的植物果实,不但含有丰富的营养且能够帮助消化。水果是对部分可以食用的植物果实和种子的统称。水果有降血压、减缓衰老、减肥瘦身、皮肤保养、明目、抗癌、降低胆固醇等保健作用。一般的水果都是生食,不经过加工,洗干净就直接吃了,这样维生素很少损失,弥补了蔬菜的不足。'
    s2 = '在全球层面上,亚投行建立的主要背景是新兴大国的异军突起。'
    s3 = '亚洲基础设施投资银行Asian Infrastructure Investment Bank ,简称亚投行,AIIB是一个政府间性质的亚洲区域多边开发机构。重点支持基础设施建设,成立宗旨是为了促进亚洲区域的建设互联互通化和经济一体化的进程,并且加强中国及其他亚洲国家和地区的合作,是首个由中国倡议设立的多边金融机构,总部设在北京,法定资本1000亿美元。截至2017年10月,亚投行有70个正式成员国。2013年10月2日,习近平主席提出筹建倡议,2014年10月24日,包括中国、印度、新加坡等在内21个首批意向创始成员国的财长和授权代表在北京签约,共同决定成立投行。2015年12月25日,亚洲基础设施投资银行正式成立。2016年1月16日至18日,亚投行开业仪式暨理事会和董事会成立大会在北京举行。亚投行的治理结构分理事会、董事会、管理层三层。理事会是最高决策机构,每个成员在亚投行有正副理事各一名。董事会有12名董事,其中域内9名,域外3名。管理层由行长和5位副行长组成。'
    # cap lengths to keep the similarity computation cheap
    s1 = s1[:100]
    s2 = s2[:100]
    print(s2)
    s3 = s3[:100]
    # unrelated pair (fruit vs AIIB) vs related pair (AIIB vs AIIB)
    print(model.similarity(s1,s2))
    print(model.similarity(s3,s2))
def train(self, trainingfile):
    """Starts model building.

    Trains a supervised classifier when ``self.supervised`` is truthy,
    otherwise an unsupervised cbow/skipgram embedding model depending on
    ``self.config.method``.

    :param trainingfile: path to the fastText-formatted training file
    """
    logger.info(
        f'Training started with : learningRate:{self.config.learningRate!s}, epoch:{self.config.epoch!s}, ngrams :{self.config.ngrams!s}'
    )
    model = FastText()
    if self.supervised:
        # NOTE(review): this branch reads config.epochs while the branches
        # below (and the log line above) read config.epoch — confirm both
        # attributes exist and which one is intended.
        model.supervised(input=trainingfile,
                         output=self.filepath,
                         epoch=self.config.epochs,
                         lr=self.config.learningRate,
                         wordNgrams=self.config.ngrams,
                         verbose=2,
                         minCount=1)
    elif self.config.method == "cbow":
        # NOTE(review): unsupervised branches write to the literal path
        # 'model' instead of self.filepath — confirm this is intentional.
        model.cbow(input=trainingfile,
                   output='model',
                   epoch=self.config.epoch,
                   lr=self.config.learningRate)
    else:
        model.skipgram(input=trainingfile,
                       output='model',
                       epoch=self.config.epoch,
                       lr=self.config.learningRate)
class FeatureGenerator:
    """Builds concatenated fastText feature vectors from 3-element string tuples."""

    def __init__(self, fastext_path):
        self.fasttext = FastText(fastext_path)

    def generate_record(self, tuple):
        # NOTE(review): the parameter name shadows the builtin ``tuple``;
        # kept unchanged to preserve the external interface.
        tr = self.fasttext.get_numpy_vector(tuple[0])
        si = self.fasttext.get_numpy_vector(tuple[1])
        lm = self.fasttext.get_numpy_vector(tuple[2])
        # feature layout: [tr | si | lm | lm - tr]
        return numpy.concatenate((tr, si, lm, lm - tr))

    def generate(self, values):
        """Stack one feature row per input tuple into a 2-D array."""
        rows = [self.generate_record(value) for value in values]
        return numpy.array(rows)
def main():
    """Print the embedding and 15 nearest neighbours of the word 'dog'."""
    model = FastText('model_text8.bin')
    target_word = 'dog'
    # get embedding
    embedding = model.get_numpy_vector(target_word)
    print('Target word:', target_word)
    print('Embedding shape:', embedding.shape)
    print('Embedding:', embedding[0:10], '...')
    # find closest words
    for neighbour, score in model.nearest_neighbors(target_word, k=15):
        print('Word:', neighbour, 'similarity:', score)
def init():
    """Initialise the module-level singletons used by the service.

    Populates ``processtext``, ``labels_list``, ``contcmp`` and
    ``fasttext_model`` as globals; must run once before they are used.
    """
    global processtext
    processtext = ProcessText()  # text preprocessor
    global labels_list
    # pickled label list shared by the two classifiers
    with open("both_labels.pkl", "rb") as f:
        labels_list = pickle.load(f)
    global contcmp
    contcmp = ContCmp("root_feature_file.allid")  # content comparator over the feature index
    #loadModel()
    global fasttext_model
    # pre-trained fastText classifier (3-grams, minCount=3)
    fasttext_model = FastText()
    fasttext_model.load_model('3Ngram_3mincount_1wminlabel.bin')
def create_predict(HudongItem_csv):
    """Append a fastText title vector per unique item to ``vector.txt``.

    Reads titles from the CSV, looks each item up in Neo4j, skips
    duplicate titles, and writes "<title> <v1> <v2> ..." lines (each
    component truncated to 7 characters).
    """
    # read content from neo4j
    db = Neo4j()
    db.connectDB()
    predict_List = readCSVbyColumn(HudongItem_csv, 'title')
    model = FastText('wiki.zh.bin')
    count = 0
    vis = set()  # titles already written, to dedupe
    # Bug fix: the file handle was opened without `with` and leaked if any
    # lookup or write raised before the final close().
    with open('vector.txt', 'a') as file_object:
        for p in predict_List:
            cur = HudongItem(db.matchHudongItembyTitle(p))
            count += 1
            title = cur.title
            if title in vis:
                continue
            vis.add(title)
            wv_list = model[title]
            strr = str(title)
            # Bug fix: the inner loop variable shadowed the outer `p`;
            # renamed for clarity (behavior unchanged).
            for component in wv_list:
                strr += ' ' + str(component)[:7]
            file_object.write(strr + "\n")
            print(str(count) + ' / ' + str(len(predict_List)))
def get_vector(self, text, get_type=2):
    """Segment *text* with jieba and return its word vectors.

    :param text: raw text to segment
    :param get_type: 1 = full mode, 2 = accurate mode (default),
        anything else = search-engine mode
    :return: np.ndarray of shape (n, 300); a single all-zero row when no
        tokens were produced
    """
    # fall back to loading the model from disk when none is cached
    model = self.model if self.model is not None else FastText(self.fasttext_bin)
    if get_type == 1:
        seg_list = jieba.cut(text, cut_all=True)       # full mode
    elif get_type == 2:
        seg_list = jieba.cut(text, cut_all=False)      # accurate mode
    else:
        seg_list = jieba.cut_for_search(text)          # search-engine mode
    vectors = [np.array(model[token]) for token in list(seg_list)]
    if len(vectors) == 0:
        return np.zeros((1, 300))
    return np.array(vectors)
def get_fasttext_matrix(vocab, initial_embedding_np):
    """
    return an embeddings matrix
    :param self:
    :param embeddings_file:
    :param initial_embedding_np:
    :return: np array of [V,E]
    """
    from pyfasttext import FastText
    logging.info('Loading the FastText embeddings')
    model = FastText(cfg.embeddings_path)
    cnt = 0
    vec_array = initial_embedding_np
    # stats of the initial matrix, for the summary log below
    old_avg = np.average(vec_array)
    old_std = np.std(vec_array)
    vec_array = vec_array.astype(np.float32)
    new_avg, new_std = 0, 0
    # overwrite each known word's row with its fastText vector
    for word in vocab._item2idx:
        vec = model[word]
        vec = np.array(vec, np.float32)
        word_idx = vocab.encode(word)
        cnt += 1
        vec_array[word_idx] = vec
        new_avg += np.average(vec)
        new_std += np.std(vec)
    # Bug fix: an empty vocabulary previously raised ZeroDivisionError here.
    if cnt:
        new_avg /= cnt
        new_std /= cnt
    logging.info(
        '%d known embedding. old mean: %f new mean %f, old std %f new std %f'
        % (cnt, old_avg, new_avg, old_std, new_std))
    return vec_array
def print_subwords(fname):
    """Print every true character n-gram of the model, sorted, with its vector."""
    model = FastText(fname)
    maxn = model.args['maxn']
    collected = {}
    for word in model.words:
        pairs = zip(model.get_subwords(word), model.get_numpy_subword_vectors(word))
        for subword, vec in pairs:
            # real ngram, not the full word?
            if len(subword) > maxn:
                continue
            collected[subword] = vec
    for key in sorted(collected.keys()):
        print('{} {}'.format(key, ' '.join(str(val) for val in collected[key])))
def __init__(self, vocab_path, vocab):
    """Build the vocabulary and embedding tables.

    :param vocab_path: path to the fastText binary used as the vector source
    :param vocab: list of tokens (without the pad/unk specials)
    """
    self.pad_token = '<blank>'
    self.unk_token = '<unk>'
    self.model = FastText(vocab_path)
    # specials occupy ids 0 and 1
    self.vocab = ['<blank>', '<unk>'] + vocab
    self.token2id = {}
    self.id2token = {}
    self.embed_dim = 300
    #this is deployed temporarily
    # reuse cached embeddings when present; otherwise random-init a
    # (vocab_size, embed_dim) matrix
    if not os.path.exists('embeddings.npy'):
        self.embeddings = np.random.rand(self.size(), self.embed_dim)
    else:
        self.embeddings = np.load('embeddings.npy')
    self.logger = logging.getLogger("sentiment")
    i = 0
    # pad/unk rows are zeroed; their ids come first
    for token in [self.pad_token, self.unk_token]:
        self.embeddings[i] = np.zeros([self.embed_dim])
        self.token2id[token] = i
        self.id2token[i] = token
        i += 1
    # remaining tokens get consecutive ids
    for token in vocab:
        self.token2id[token] = i
        self.id2token[i] = token
        i += 1
    # NOTE(review): the dangling triple-quote below appears to open a
    # disabled block that continues past this chunk — confirm upstream.
    '''
def make_embeddings_simple_in_memory(self, name="fasttext-crawl", hasHeader=True):
    """Load word embeddings fully into memory.

    Binary fastText files (``self.extension == 'bin'``) are loaded via
    pyfasttext; any other file is parsed as a textual word2vec/glove
    table with one "word v1 v2 ..." entry per line.

    :param name: registered embedding name resolved via _get_description
    :param hasHeader: whether the text file starts with a "<nbWords> <dim>"
        header line (forced to False for glove-type files)
    """
    nbWords = 0
    print('loading embeddings...')
    begin = True
    description = self._get_description(name)
    if description is not None:
        embeddings_path = description["path"]
        embeddings_type = description["type"]
        self.lang = description["lang"]
        print("path:", embeddings_path)
        if self.extension == 'bin':
            # binary model: delegate loading entirely to pyfasttext
            self.model = FastText(embeddings_path)
            nbWords = self.model.nwords
            # assumes 300-dimensional binary models — TODO confirm
            self.embed_size = 300
        else:
            if embeddings_type == "glove":
                hasHeader = False
            with open(embeddings_path) as f:
                for line in f:
                    line = line.strip()
                    line = line.split(' ')
                    if begin:
                        if hasHeader:
                            # first line gives the nb of words and the embedding size
                            nbWords = int(line[0])
                            self.embed_size = int(line[1].replace(
                                "\n", ""))
                            begin = False
                            continue
                        else:
                            begin = False
                    word = line[0]
                    #if embeddings_type == 'glove':
                    vector = np.array(
                        [float(val) for val in line[1:len(line)]],
                        dtype='float32')
                    #else:
                    #    vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32')
                    if self.embed_size == 0:
                        # infer the dimension from the first vector
                        self.embed_size = len(vector)
                    self.model[word] = vector
            if nbWords == 0:
                nbWords = len(self.model)
    print('embeddings loaded for', nbWords, "words and", self.embed_size,
          "dimensions")
def __init__(self):
    """Load the skip-gram fastText model and record vocab/vector sizes."""
    start_time = time.time()
    # self.model = FastText("../data/input/models/sg_pyfasttext.bin")  # DEBUG
    self.model = FastText(
        "../data/input/models/880w_fasttext_skip_gram.bin")
    end_time = time.time()
    print(f"Loading word vector model cost: {end_time - start_time:.2f}s")
    # self.vocab_size, self.vector_size = self.model.numpy_normalized_vectors.shape  # OK
    self.vocab_size = self.model.nwords
    self.vector_size = self.model.args.get("dim")
    print(
        f"self.vector_size:{self.vector_size}, self.vocab_size: {self.vocab_size}"
    )  # self.vector_size:200, self.vocab_size: 925242
    # sentence representation: {"avg": mean of word vectors,
    # "fasttext": get_numpy_sentence_vector, "matrix": matrix}
    self.sentence_vec_type = "avg"
def __init__(self, doc_catgy, n_vocab, emb_dim, out_channels, filter_size, word2index, pre_trained_embedding, multi_label):
    """XML-CNN text classifier constructor (Chainer).

    :param doc_catgy: dict mapping category name -> index
    :param n_vocab: vocabulary size for the embedding layer
    :param emb_dim: embedding (row) dimension
    :param out_channels: number of convolution filters per window size
    :param filter_size: list of three window heights for conv1..conv3
    :param word2index: dict mapping word -> row index in the lookup table
    :param pre_trained_embedding: path to a fastText .bin, or None for random init
    :param multi_label: 1 = multi-label (MultiLabelBinarizer), 0 = single-label (LabelEncoder)
    """
    self.in_channels = 1
    self.out_channels = out_channels
    self.row_dim = emb_dim
    self.hidden_dim = 512  ## fixed
    self.doc_catgy = doc_catgy
    self.n_classes = len(doc_catgy)
    self.n_vocab = n_vocab
    self.filter_size = filter_size
    self.word2index = word2index
    self.mutli_label = multi_label
    self.le = None
    # label encoder: categories ordered by their index in doc_catgy
    if self.mutli_label == 1:
        self.le = MultiLabelBinarizer(classes=[i[0] for i in sorted(self.doc_catgy.items(), key=lambda x: x[1])], sparse_output=False)
    elif self.mutli_label == 0:
        self.le = LabelEncoder()
        self.le.fit([i[0] for i in sorted(self.doc_catgy.items(), key=lambda x: x[1])])
    self.look_up_table = None
    self.pre_trained_embedding = pre_trained_embedding
    super(XMLCnn, self).__init__()
    self.to_gpu()
    if not self.pre_trained_embedding is None:
        # initialise the lookup table from pre-trained fastText vectors;
        # words that fail get a deterministic random row (seeded by index)
        model = FastText(self.pre_trained_embedding)
        dim = len(model['a'])  # probe a token to learn the vector size
        n_vocab = len(self.word2index.keys())
        self.look_up_table = self.xp.zeros((n_vocab, dim), dtype=np.float32)
        for word, index in tqdm(self.word2index.items()):
            try:
                self.look_up_table[index] = chainer.cuda.to_gpu(model.get_numpy_vector(word))
            except:
                # NOTE(review): bare except — presumably catches lookup/transfer
                # failures; confirm narrowing is safe before changing.
                self.xp.random.seed(index)
                self.look_up_table[index][:] = self.xp.random.uniform(-0.25, 0.25, dim)
    self.set_seed_random(123)
    with self.init_scope():
        # embedding layer: pre-trained table when available, else random init
        if self.look_up_table is None:
            self.embedding = L.EmbedID(n_vocab, self.row_dim, ignore_label=-1, initialW=linear_init)
        else:
            self.embedding = L.EmbedID(n_vocab, self.row_dim, ignore_label=-1, initialW=self.look_up_table)
        # three parallel convolutions over the embedding matrix, one per window size
        self.conv1 = L.Convolution2D(self.in_channels, self.out_channels, (filter_size[0], self.row_dim), stride=2, initialW=linear_init)
        self.conv2 = L.Convolution2D(self.in_channels, self.out_channels, (filter_size[1], self.row_dim), stride=2, initialW=linear_init)
        self.conv3 = L.Convolution2D(self.in_channels, self.out_channels, (filter_size[2], self.row_dim), stride=2, initialW=linear_init)
        # bottleneck + output projection
        self.l1 = L.Linear(in_size=None, out_size=self.hidden_dim,
                           initialW=linear_init)
        self.l2 = L.Linear(in_size=self.hidden_dim, out_size=self.n_classes, initialW=linear_init)
        self.to_gpu()
class Classifier:
    """KNN classifier over Hudong wiki items using fastText similarity.

    Each item is compared to the labelled training set on four components
    (title, open-category list, attribute-name list, attribute-value list),
    the component similarities are normalized and linearly weighted, and the
    label is chosen by similarity-weighted vote over the k nearest items.
    """
    model = None
    labeled_hudongList = None
    mean = None                     # per-component mean
    var = None                      # per-component variance
    title_simi = None
    openTypeList_simi = None
    baseInfoKeyList_simi = None
    baseInfoValueList_simi = None
    openTypeList_IDF = None         # inverse document frequency, used to weight open categories and attribute names
    baseInfoKeyList_IDF = None
    openTypeList_num = None         # number of items that have open categories / basic attributes
    baseInfoKeyList_num = None
    # similarity weights, in order: title, openTypeList, baseInfoKeyList, baseInfoValueList, detail
    weight = [0.2,0.2,0.2,0.2,0.2]
    # k for KNN
    k = 10

    def __init__(self,model_path):
        # model_path: path of the fastText model to load
        self.model = FastText(model_path)
        print('classifier load over...')

    def load_trainSet(self,HudongList):
        """Store the labelled HudongItem list and precompute IDF tables."""
        # HudongList: list of already-labelled HudongItems
        self.labeled_hudongList = HudongList
        self.openTypeList_IDF = {}
        self.baseInfoKeyList_IDF = {}
        self.openTypeList_num = 0
        self.baseInfoKeyList_num = 0
        # count document frequency of each open category and attribute name
        for p in self.labeled_hudongList:
            if len(p.openTypeList) > 0:
                self.openTypeList_num += 1
                for t in p.openTypeList:
                    if t in self.openTypeList_IDF:
                        self.openTypeList_IDF[t] += 1
                    else:
                        self.openTypeList_IDF[t] = 1
            if len(p.baseInfoKeyList) > 0:
                self.baseInfoKeyList_num += 1
                for t in p.baseInfoKeyList:
                    if t in self.baseInfoKeyList_IDF:
                        self.baseInfoKeyList_IDF[t] += 1
                    else:
                        self.baseInfoKeyList_IDF[t] = 1
        # convert counts to IDF = log(N / df)
        for p in self.openTypeList_IDF:
            self.openTypeList_IDF[p] = log(1.0 * self.openTypeList_num / self.openTypeList_IDF[p])
            #print(str(p)+"---"+str(self.openTypeList_IDF[p]))
        for p in self.baseInfoKeyList_IDF:
            self.baseInfoKeyList_IDF[p] = log(1.0 * self.baseInfoKeyList_num / self.baseInfoKeyList_IDF[p])
            #print(str(p)+"---"+str(self.baseInfoKeyList_IDF[p]))

    def set_parameter(self,weight,k):
        # set the hyper-parameters (component weights and k)
        self.weight = weight
        self.k = k

    # similarity of the two items' titles
    def get_title_simi(self,item1,item2):
        title_simi = self.model.similarity(item1.title,item2.title)
        return title_simi

    # similarity of the two items' openTypeList
    def get_openTypeList_simi(self,item1,item2):
        openTypeList_simi = 0.0
        L1 = item1.openTypeList[:10]  # the first 10 open categories are enough
        L2 = item2.openTypeList[:10]
        # pairwise similarity between the two category sets, summed then averaged
        for p1 in L1:
            for p2 in L2:
                cur = self.model.similarity(p1,p2)
                openTypeList_simi += cur
        fm = len(L1)*len(L2)
        if fm > 0:
            openTypeList_simi /= fm
        return openTypeList_simi

    # similarity of the two items' baseInfoKeyList
    def get_baseInfoKeyList_simi(self,item1,item2):
        baseInfoKeyList_simi = 0.0
        # IDF-weighted overlap of the attribute-name sets
        s1 = set()
        s2 = set()
        for p in item1.baseInfoKeyList:
            s1.add(p)
        for p in item2.baseInfoKeyList:
            s2.add(p)
        and12 = s1&s2
        or12 = s1|s2
        fz = 0.0
        for p in and12:
            fz += self.baseInfoKeyList_IDF[p]
        # if len(or12)>0:
        #     baseInfoKeyList_simi = 1.0*len(and12)/len(or12)
        baseInfoKeyList_simi = fz
        return baseInfoKeyList_simi

    # similarity of the two items' baseInfoValueList
    def get_baseInfoValueList_simi(self,item1,item2):
        s1 = set()
        s2 = set()
        dict1 = {}
        dict2 = {}
        count = 0
        # map each attribute name to its value (positionally aligned lists)
        for p in item1.baseInfoKeyList:
            s1.add(p)
            if count < len(item1.baseInfoValueList):
                dict1[p] = item1.baseInfoValueList[count]
            count += 1
        count = 0
        for p in item2.baseInfoKeyList:
            s2.add(p)
            if count < len(item2.baseInfoValueList):
                dict2[p] = item2.baseInfoValueList[count]
            count += 1
        and12 = s1&s2
        baseInfoValueList_simi = 0.0
        # for attribute names present in both items with identical values,
        # accumulate the IDF weight of the attribute
        for s in and12:
            if s in dict1 and s in dict2 and dict1[s] == dict2[s] and s in self.baseInfoKeyList_IDF:
                baseInfoValueList_simi += 1.0*self.baseInfoKeyList_IDF[s]
        # if len(and12)>0:
        #     baseInfoValueList_simi /= len(and12)
        return baseInfoValueList_simi

    # currently unused
    def similarity(self,item1,item2):
        # compare two pages, returning a similarity in [-1,1]
        title_simi = self.model.similarity(item1.title,item2.title)
        openTypeList_simi = 0.0
        # pairwise similarity between the two category sets, summed then averaged
        for p1 in item1.openTypeList:
            for p2 in item2.openTypeList:
                openTypeList_simi += self.model.similarity(p1,p2)
        fm = len(item1.openTypeList)*len(item2.openTypeList)
        if fm > 0:
            openTypeList_simi /= fm
        baseInfoKeyList_simi = 0.0
        # Jaccard coefficient over the attribute-name sets
        s1 = set()
        s2 = set()
        dict1 = {}
        dict2 = {}
        count = 0
        for p in item1.baseInfoKeyList:
            s1.add(p)
            dict1[p] = item1.baseInfoValueList[count]
            count += 1
        count = 0
        for p in item2.baseInfoKeyList:
            s2.add(p)
            dict2[p] = item2.baseInfoValueList[count]
            count += 1
        and12 = s1&s2
        or12 = s1|s2
        if len(or12)>0:
            baseInfoKeyList_simi = 1.0*len(and12)/len(or12)
        baseInfoValueList_simi = 0.0
        # average value similarity over attribute names shared by both items
        for s in and12:
            baseInfoValueList_simi += self.model.similarity(dict1[s],dict2[s])
        if len(and12)>0:
            baseInfoValueList_simi /= len(and12)
        # d1 = item1.detail[:60]  # only the first 60 chars, to limit cost
        # d2 = item2.detail[:60]
        # detail_simi = self.model.similarity(d1,d2)
        # linear weighted combination of the component similarities
        simi = self.weight[0]*title_simi + self.weight[1]*openTypeList_simi + self.weight[2]*baseInfoKeyList_simi + self.weight[3]*baseInfoValueList_simi
        return simi

    def KNN_predict(self,item):
        """Predict the category of a Hudong page by weighted KNN vote."""
        curList = []                    # temporary list of similarities to *item*
        mean = [0.,0.,0.,0.,0.]         # per-component mean
        var = [0.,0.,0.,0.,0.]          # per-component variance
        stand = [0.,0.,0.,0.,0.]        # per-component standard deviation
        maxx = [-2333.3,-2333.3,-2333.3,-2333.3,-2333.3]
        minn = [2333.3,2333.3,2333.3,2333.3,2333.3]
        title_simi = []
        openTypeList_simi = []
        baseInfoKeyList_simi = []
        baseInfoValueList_simi = []
        i = 0
        # precompute and store the per-component similarities
        for p in self.labeled_hudongList:
            if p.title == item.title:
                # already in the training set: return its label directly
                return p.label
            title_simi.append(self.get_title_simi(p, item))
            openTypeList_simi.append(self.get_openTypeList_simi(p, item))
            baseInfoKeyList_simi.append(self.get_baseInfoKeyList_simi(p, item))
            baseInfoValueList_simi.append(self.get_baseInfoValueList_simi(p, item))
            mean[0] += title_simi[i]
            mean[1] += openTypeList_simi[i]
            mean[2] += baseInfoKeyList_simi[i]
            maxx[2] = max(maxx[2],baseInfoKeyList_simi[i])
            minn[2] = min(minn[2],baseInfoKeyList_simi[i])
            mean[3] += baseInfoValueList_simi[i]
            maxx[3] = max(maxx[3],baseInfoValueList_simi[i])
            minn[3] = min(minn[3],baseInfoValueList_simi[i])
            i += 1
        for i in range(4):
            mean[i] /= len(self.labeled_hudongList)
        # compute variances
        # NOTE(review): this loop reuses `i` left at 3 by the range loop above
        # and never increments it, so every iteration reads index 3 — looks
        # like a bug (should track the element index); confirm before fixing.
        for p in self.labeled_hudongList:
            var[0] += (title_simi[i]-mean[0])*(title_simi[i]-mean[0])
            var[1] += (openTypeList_simi[i]-mean[1])*(openTypeList_simi[i]-mean[1])
            var[2] += (baseInfoKeyList_simi[i]-mean[2])*(baseInfoKeyList_simi[i]-mean[2])
            var[3] += (baseInfoValueList_simi[i]-mean[3])*(baseInfoValueList_simi[i]-mean[3])
        for i in range(4):
            if var[i] ==0.0:
                var[i] = 0.000000001
        for i in range(4):
            stand[i] = sqrt(var[i])
        # items without an openTypeList get the mean value
        # gaussian-normalize title and openTypeList; the comment in the original
        # claimed max-min normalization for the last two, but the code below
        # gaussian-normalizes them as well
        i = 0
        for p in self.labeled_hudongList:
            title_simi[i] = (title_simi[i]-mean[0])/stand[0]
            if openTypeList_simi[i] == 0.0:     # absent: assign the mean
                openTypeList_simi[i] = mean[1]
            openTypeList_simi[i] = (openTypeList_simi[i]-mean[1])/stand[1]
            if baseInfoKeyList_simi[i] == 0.0:  # absent: assign the mean
                baseInfoKeyList_simi[i] = mean[2]
            baseInfoKeyList_simi[i] = (baseInfoKeyList_simi[i]-mean[2])/stand[2]
            baseInfoValueList_simi[i] = (baseInfoValueList_simi[i]-mean[3])/stand[3]
            i+=1
        i = 0
        count = 0
        # weighted sum of the normalized component similarities
        for p in self.labeled_hudongList:
            s = self.weight[0]*title_simi[i] + self.weight[1]*openTypeList_simi[i] + self.weight[2]*baseInfoKeyList_simi[i] + self.weight[3]*baseInfoValueList_simi[i]
            count += 1
            if count < 2:
                pass
                # print(str(title_simi[i])+" "+str(openTypeList_simi[i])+" "+str(baseInfoKeyList_simi[i])+" "+str(baseInfoValueList_simi[i]))
            i += 1
            l = p.label
            t = p.title
            curList.append(Node(s,l,t))
        # sort the training set by similarity to *item*, descending
        curList.sort(key=lambda obj:obj.simi,reverse=True)
        count = [0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.]
        # similarity-weighted vote among the k nearest neighbours (17 labels)
        for i in range(self.k):
            label = int(curList[i].label)
            count[label] += curList[i].simi
            #print(curList[i].title+"----"+str(curList[i].simi)+' '+str(label))  # print these k
        maxx = -233
        answer = 0
        for i in range(17):
            if count[i] > maxx:
                maxx = count[i]
                answer = i
        return answer
def __init__(self,model_path):
    """Load the fastText model used for similarity scoring.

    :param model_path: filesystem path of the fastText model
    """
    self.model = FastText(model_path)
    print('classifier load over...')