def make_embedding_matrix(word_index, fname):
    model = FastText(os.path.join('embeddings', fname))
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM),
                                dtype='float32')
    for word, i in word_index.items():
        embedding_matrix[i] = model.get_numpy_vector(word, normalized=True)
    return embedding_matrix
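A minimal usage sketch for the helper above. The EMBEDDING_DIM constant, the hand-built word_index and the embeddings/wiki.en.bin file are assumptions; in practice word_index would come from a tokenizer.

# Hypothetical usage; EMBEDDING_DIM and the model file name are assumptions.
import os
import numpy as np
from pyfasttext import FastText

EMBEDDING_DIM = 300  # must match the dimension of the trained fastText model

word_index = {'the': 1, 'quick': 2, 'brown': 3, 'fox': 4}  # e.g. from a Keras tokenizer
matrix = make_embedding_matrix(word_index, 'wiki.en.bin')  # reads embeddings/wiki.en.bin
print(matrix.shape)  # (len(word_index) + 1, EMBEDDING_DIM); row 0 stays all-zero for padding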
Example #2
class Fasttext_clf(BaseEstimator, ClassifierMixin):
    data_path = pkg_resources.resource_filename('addr_detector.model', 'ft_ad.ftz')
    def __init__(self, path=data_path):
        self.model = FastText(path)
        self.default = '0'

    def fit(self, X, y):
        # the bundled fastText model is already trained; fit is a no-op kept for sklearn compatibility
        return self

    def predict(self, X):
        results = []
        if isinstance(X, str):
            res = self.model.predict_single(X)
            results = results + [self.default if not res else res]
        elif isinstance(X, list):
            results = results + self.model.predict(X)
        return results

    def predict_proba(self, X):
        results = []
        if isinstance(X, str):
            results = results + [self.model.predict_proba_single(X)]
        elif isinstance(X, list):
            results = results + self.model.predict_proba(X)
        return results
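A quick sketch of how the classifier might be called. Since fit is a no-op and the packaged ft_ad.ftz model is already trained, the class can sit in an sklearn pipeline; the input strings below are made-up examples.

clf = Fasttext_clf()
clf.fit([], [])  # no-op: the bundled fastText model is already trained
print(clf.predict('1600 Pennsylvania Avenue NW, Washington'))
print(clf.predict_proba(['some free text', '221B Baker Street, London']))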
Example #3
def collect_docs(p, lang_detection_model_name=None, lang='en'):
    model = None  # stays None when no language-detection model is given
    if lang_detection_model_name is not None:
        from pyfasttext import FastText
        model_path = SparkFiles.get(lang_detection_model_name)
        model = FastText(model_path)

    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  #domain...
        r'localhost|'  #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$',
        re.IGNORECASE)

    result = []
    lines = list(p)
    indices = [i for i, line in enumerate(lines) if regex.search(line.strip())]
    for idx in indices:
        if idx + 1 >= len(lines):
            continue  # a matching URL on the final line has no content line after it
        content = lines[idx + 1]
        paras = re.findall('<PAR>(.*?)</PAR>', content, re.DOTALL)

        if model:
            #filter only english paras
            langs = model.predict(paras)
            en_paras = list(filter(lambda p: lang in p[1], zip(paras, langs)))
            paras = list(map(lambda pair: pair[0], en_paras))

        if paras:
            url = lines[idx].strip()
            result.append((url, paras))

    return result
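A hedged driver sketch for the function above, assuming it runs inside a Spark job where the input partitions alternate URL lines and <PAR>-wrapped content lines; the file names and the lid.176.bin language-id model are assumptions.

from pyspark import SparkContext

sc = SparkContext(appName='collect-docs')
sc.addFile('lid.176.bin')  # ship the language-id model to the workers
docs = (sc.textFile('crawl.txt')
          .mapPartitions(lambda part: collect_docs(part, 'lid.176.bin', lang='en')))
print(docs.take(5))  # [(url, [filtered paragraphs]), ...]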
Example #4
def fasttext(text):
    test_data = text.replace('\n', ' ')
    model = FastText('./model_audit.bin')  # reloaded on every call; cache it in real use
    test = test_data + '\n'
    pred = model.predict_proba_single(test, k=2)
    out = pred[0][1]  # probability of the top label
    return out
Example #5
def use_pyfasttext_model():
    # OK
    # The model can be trained with the fasttext command-line tool (see ../doc/fastText_train.png)
    # or with the pyfasttext package used in this file.
    """
    # OK: 1. Loading a model trained with the pyfasttext package
    model = FastText("../data/lxw_model_sg_pyfasttext.bin")
    print(model["先生"])     # type(model["先生"]): <class 'array.array'>
    print(model.get_numpy_vector("先生"))    # type: <class 'numpy.ndarray'>
    print(model["刘晓伟"])   # OOV
    print(model.get_numpy_vector("刘晓伟"))
    print(model["陈贺"])   # OOV
    print(model.get_numpy_vector("陈贺"))

    model = FastText("../data/lxw_model_cbow_pyfasttext.bin")
    print(model["先生"])
    print(model.get_numpy_vector("先生"))    # type: <class 'numpy.ndarray'>
    print(model["刘晓伟"])   # OOV
    print(model.get_numpy_vector("刘晓伟"))
    print(model["陈贺"])   # OOV
    print(model.get_numpy_vector("陈贺"))
    # NOTE: A quick test shows that the two different models produce identical vectors for the same OOV word (the fasttext package behaves the same; see NO_2_use_fasttext_model for details), while the vectors for in-vocabulary words differ.
    """

    # OK: 2. Loading a model trained with the fasttext command-line tool
    model = FastText("../data/880w_fasttext_skip_gram.bin")
    print(model["先生"])  # type(model["先生"]): <class 'array.array'>
    print(model.get_numpy_vector("先生"))
    # print(model["刘晓伟"])   # OK. OOV
    # print(model["陈贺"])   # OK. OOV

    # Sentence and text vectors.
    sentence_vec = model.get_numpy_sentence_vector("刘晓伟 是 个 好人")
    print(sentence_vec)
    """
Example #6
    def test_vector(self):
        model = FastText()

        model.supervised(input='/input/tests/data/text.txt',
                         output='model',
                         epoch=1,
                         lr=0.7)
Example #7
    def predict(self, test_set, test_labels_vector=None, report_accuracy=True):
        """
        uses the trained model to predict the test set
        :param test_set: the test set
        :param test_labels_vector: labels of the test set, used for accuracy computation
        :param report_accuracy: whether to compute and report the prediction accuracy
        """

        if self.model_name:
            from pyfasttext import FastText
            predictor = FastText()
            predictor.load_model('ft_extras/'+self.model_name+'.bin')
            predicted_labels = predictor.predict_proba(test_set)
            if report_accuracy and test_labels_vector:
                test_set_size = len(test_set)
                correct_predictions = 0
                invalid_labels = 0
                for index, labels in enumerate(predicted_labels):
                    if len(labels) != 0:
                        best_label = max(labels, key=lambda label: label[1])
                        if best_label[0] == test_labels_vector[index]:
                            correct_predictions += 1
                    else:
                        invalid_labels += 1
                print('Prediction accuracy: {}\n'.format(correct_predictions / (test_set_size - invalid_labels)))
        else:
            print('Please use the train method to train a model first.')
            return
Example #8
class FastTextEmbedding(Embedding):

  def __init__(self, binfile, normalize = False):
    self.file = binfile
    self.vdim = -1
    self.normalize = normalize
    
  def load(self):
    print('Loading fasttext model.')
    self.ftmodel = FastText()
    self.ftmodel.load_model(self.file)
    self.vdim = len(self.ftmodel['is'])
    print('Finished loading fasttext model.')
    return self
  
  def getVector(self, word):
    return self.ftmodel.get_numpy_vector(word, normalized = self.normalize)
    
  def search(self, q, topk = 4):
    raise NotImplementedError()
    
  def wordForVec(self, v):
    word, sim = self.ftmodel.words_for_vector(v)[0]
    return word, sim
  
  def containsWord(self, word):
    # fastText composes vectors from character n-grams, so every token gets some vector
    return True
  
  def vocabulary(self):
    return self.ftmodel.words
  
  def dim(self):
    return self.vdim
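A usage sketch for the wrapper class; wiki.en.bin is an assumed model file.

emb = FastTextEmbedding('wiki.en.bin', normalize=True).load()
vec = emb.getVector('hello')
print(emb.dim(), vec.shape)
word, sim = emb.wordForVec(vec)  # nearest word to the vector, with its similarity
print(word, sim)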
Example #10
def main():
    model = FastText('model_text8.bin')

    target_words = [
        'granada', 'python', 'harmony', 'mafia', 'yoga', 'goth', 'cyberpunk',
        'nasa', 'japan', 'boolean', 'football', 'algorithm', 'china', 'usa',
        'internet', 'harvard', 'earth', 'horse', 'angel', 'rock'
    ]
    for t_word in target_words:
        # get embedding
        target_word_embedding = model.get_numpy_vector(t_word)
        print('Target word:', t_word)

        # find the 15 closest words and collect their embeddings
        closest_words = model.nearest_neighbors(t_word, k=15)
        nn_word_embedding = np.zeros(shape=(15, 128))  # 128 must match the model's embedding dim
        for i, (word, similarity) in enumerate(closest_words):
            nn_word_embedding[i] = model.get_numpy_vector(word)

        # cluster the neighbours with k-means
        cluster_model = KMeans(n_clusters=3, init='k-means++')
        prediction = cluster_model.fit_predict(nn_word_embedding)
        print(prediction)
        for j, word in enumerate(closest_words):
            print('Word:', word[0], '- Cluster #%d' % (prediction[j] + 1))
Example #11
    def load_model(self):
        if not os.path.exists(self.model_path):
            raise FileNotFoundError('model file not found!')
        if self.model_name == 'fasttext':
            self.model = FastText(self.model_path)
        else:
            self.model = gensim.models.Word2Vec.load(self.model_path, mmap='r')
Example #12
def get_language(text):
    """Given a list of lines, return a list of (line, lang)"""
    if not hasattr(settings, '_lang_detector'):
        lid_model = FastText()
        lid_model.load_model(settings.LID_MODEL_PATH)
        settings._lang_detector = lid_model
    langs = settings._lang_detector.predict([text])
    return langs[0]
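A usage sketch, assuming a Django-style settings object whose LID_MODEL_PATH points at a fastText language-identification model; caching the detector on settings avoids reloading it on every call.

langs = get_language('Bonjour tout le monde')
print(langs)  # e.g. ['fr'], depending on the model's label format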
Example #13
    def load(self):
        try:
            self.ft = FastText(self.filepath)
        except Exception:
            return "Failed to Load FT file"
        logger.info(f"loaded file {self.filepath}")
        self.loaded = True
        return "success"
Example #14
def text():
	model = FastText('wiki.zh.bin')
	print('load over..')
	s1 = '启航'
	s2 = '董启航'
	s3 = ' 董启文'
	print(model.nearest_neighbors('桃', k=5))
		
#text()
Example #15
def pyfasttext_sample():
    """https://pypi.org/project/pyfasttext/
    """
    model = FastText()
    # model.load_model('output/model_cooking_6.bin')
    model.load_model('output/model_cooking_5.ftz')
    result = model.predict_file('data/cooking/pre_cooking.valid', 2)
    for i, r in enumerate(result):
        print(i, r)
Example #16
def build_w2v(relevant_tokens, model_file='wiki.cy.bin'):
    # using this library because it's more memory friendly for python :)
    from pyfasttext import FastText
    model = FastText(model_file)

    w2v = {}
    for token in relevant_tokens:
        vec = model.get_numpy_vector(token)
        w2v[token] = vec
    return w2v
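A usage sketch; the default wiki.cy.bin (Welsh) model file and the tokens are assumptions.

tokens = ['bore', 'da', 'croeso']
w2v = build_w2v(tokens)
print(w2v['bore'].shape)  # one dense vector per requested token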
Example #17
def train():
    train_file = const.train_processed_binary_file_name
    validate_file = const.validate_processed_binary_file_name

    current_best_score = 0
    current_best_name = ''
    lr = 0.01

    for epoch_i in range(1, 30):
        start_time = datetime.datetime.now().replace(microsecond=0)

        model_file_name = 'data/model_' + str(lr) + '_' + str(epoch_i)

        model = FastText()
        model.supervised(input=train_file,
                         output=model_file_name,
                         lr=lr,
                         epoch=epoch_i,
                         loss='softmax',
                         wordNgrams=3,
                         thread=12,
                         ws=5,
                         minn=2,
                         maxn=4,
                         dim=50)

        micro_precision, micro_recall, micro_f1, macro_precision, macro_recall, macro_f1 = validate_model(
            model_file_name, validate_file)

        end_time = datetime.datetime.now().replace(microsecond=0)

        result_log = ("epoch:" + str(epoch_i) + ': micro precision:' +
                      str(round(micro_precision, 4)) + ', micro_recall:' +
                      str(round(micro_recall, 4)) + ', micro_f1:' +
                      str(round(micro_f1, 4)) + ', macro_precision:' +
                      str(round(macro_precision, 4)) + ', macro_recall:' +
                      str(round(macro_recall, 4)) + ', macro_f1:' +
                      str(round(macro_f1, 4)) + ', lr:' + str(lr) +
                      ', duration:' + str(end_time - start_time))

        if current_best_score < micro_f1:
            current_best_score = micro_f1
            print(result_log + ' ====> Model improved!!!!')
            if current_best_name != '':
                # remove both files of the previous best model
                os.remove(current_best_name)
                os.remove(current_best_name.replace('.bin', '.vec'))
            current_best_name = model_file_name + '.bin'

        else:
            print(result_log)
            os.remove(model_file_name + '.bin')
            os.remove(model_file_name + '.vec')

        sys.stdout.flush()
Example #18
def text():
	model = FastText('wiki.zh.bin')
	print('load over..')
	s1 = '水果是指多汁且有甜味的植物果实,不但含有丰富的营养且能够帮助消化。水果是对部分可以食用的植物果实和种子的统称。水果有降血压、减缓衰老、减肥瘦身、皮肤保养、明目、抗癌、降低胆固醇等保健作用。一般的水果都是生食,不经过加工,洗干净就直接吃了,这样维生素很少损失,弥补了蔬菜的不足。'
	s2 = '在全球层面上,亚投行建立的主要背景是新兴大国的异军突起。'
	s3 = '亚洲基础设施投资银行Asian Infrastructure Investment Bank ,简称亚投行,AIIB是一个政府间性质的亚洲区域多边开发机构。重点支持基础设施建设,成立宗旨是为了促进亚洲区域的建设互联互通化和经济一体化的进程,并且加强中国及其他亚洲国家和地区的合作,是首个由中国倡议设立的多边金融机构,总部设在北京,法定资本1000亿美元。截至2017年10月,亚投行有70个正式成员国。2013年10月2日,习近平主席提出筹建倡议,2014年10月24日,包括中国、印度、新加坡等在内21个首批意向创始成员国的财长和授权代表在北京签约,共同决定成立投行。2015年12月25日,亚洲基础设施投资银行正式成立。2016年1月16日至18日,亚投行开业仪式暨理事会和董事会成立大会在北京举行。亚投行的治理结构分理事会、董事会、管理层三层。理事会是最高决策机构,每个成员在亚投行有正副理事各一名。董事会有12名董事,其中域内9名,域外3名。管理层由行长和5位副行长组成。'
	
	s1 = s1[:100]
	s2 = s2[:100]
	print(s2)
	s3 = s3[:100]
	print(model.similarity(s1,s2))
	print(model.similarity(s3,s2))
Example #19
    def train(self, trainingfile):
        """Starts model building"""

        logger.info(
            f'Training started with : learningRate:{self.config.learningRate!s}, epoch:{self.config.epoch!s}, ngrams :{self.config.ngrams!s}'
        )
        model = FastText()
        if self.supervised:
            model.supervised(input=trainingfile,
                             output=self.filepath,
                             epoch=self.config.epoch,
                             lr=self.config.learningRate,
                             wordNgrams=self.config.ngrams,
                             verbose=2,
                             minCount=1)
        elif self.config.method == "cbow":
            model.cbow(input=trainingfile,
                       output='model',
                       epoch=self.config.epoch,
                       lr=self.config.learningRate)
        else:
            model.skipgram(input=trainingfile,
                           output='model',
                           epoch=self.config.epoch,
                           lr=self.config.learningRate)
Example #20
class FeatureGenerator:
    def __init__(self, fasttext_path):
        self.fasttext = FastText(fasttext_path)

    def generate_record(self, triple):
        tr = self.fasttext.get_numpy_vector(triple[0])
        si = self.fasttext.get_numpy_vector(triple[1])
        lm = self.fasttext.get_numpy_vector(triple[2])
        #return numpy.concatenate((tr, lm))
        #return numpy.concatenate((tr, si, lm))
        return numpy.concatenate((tr, si, lm, lm - tr))
        #return numpy.concatenate((si, lm - tr, tr - lm))

    def generate(self, values):
        return numpy.array([self.generate_record(value) for value in values])
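A usage sketch; the model path and the (tr, si, lm) string triples are made-up examples.

gen = FeatureGenerator('cc.en.300.bin')
X = gen.generate([('transcription', 'sibling', 'lemma'),
                  ('walked', 'walks', 'walk')])
print(X.shape)  # (2, 4 * dim): tr, si, lm and the lm - tr difference, concatenated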
Example #21
def main():
    model = FastText('model_text8.bin')

    target_word = 'dog'

    # get embedding
    target_word_embedding = model.get_numpy_vector(target_word)
    print('Target word:', target_word)
    print('Embedding shape:', target_word_embedding.shape)
    print('Embedding:', target_word_embedding[0:10], '...')

    # find closest words
    closest_words = model.nearest_neighbors(target_word, k=15)
    for word, similarity in closest_words:
        print('Word:', word, 'similarity:', similarity)
Example #22
def init():
    global processtext
    processtext = ProcessText()
    
    global labels_list
    with open("both_labels.pkl", "rb") as f:
        labels_list = pickle.load(f)
    
    global contcmp
    contcmp = ContCmp("root_feature_file.allid")
    #loadModel()
    
    global fasttext_model
    fasttext_model = FastText()
    fasttext_model.load_model('3Ngram_3mincount_1wminlabel.bin')
Example #23
def create_predict(HudongItem_csv):
    # read the entries from neo4j
    db = Neo4j()
    db.connectDB()

    predict_List = readCSVbyColumn(HudongItem_csv, 'title')
    file_object = open('vector.txt', 'a')

    model = FastText('wiki.zh.bin')

    count = 0
    vis = set()
    for p in predict_List:
        cur = HudongItem(db.matchHudongItembyTitle(p))
        count += 1
        title = cur.title
        if title in vis:
            continue
        vis.add(title)
        wv_list = model[title]
        strr = str(title)
        for val in wv_list:  # 'val' avoids shadowing the outer loop variable
            strr += ' ' + str(val)[:7]
        file_object.write(strr + "\n")
        print(str(count) + ' / ' + str(len(predict_List)))

    file_object.close()
Example #24
    def get_vector(self, text, get_type=2):
        '''
        Get the word vectors for a segmented text.
        :param text: the text to segment
        :param get_type: segmentation mode
        :return: word vectors, np array of shape (n, 300)
        '''
        word_np = []
        if self.model is None:
            model = FastText(self.fasttext_bin)
        else:
            model = self.model
        if get_type == 1:
            seg_list = jieba.cut(text, cut_all=True)  # full mode
        elif get_type == 2:
            seg_list = jieba.cut(text, cut_all=False)  # accurate mode
        else:
            seg_list = jieba.cut_for_search(text)  # search-engine mode

        for li in list(seg_list):
            word_np.append(np.array(model[li]))
        if len(word_np) == 0:
            word_np = np.zeros((1, 300))
        else:
            word_np = np.array(word_np)
        return word_np
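A usage sketch, assuming encoder is an instance of the surrounding class whose fasttext_bin points at a 300-dimensional Chinese model.

vectors = encoder.get_vector('今天天气很好', get_type=2)  # accurate-mode segmentation
print(vectors.shape)  # (n_tokens, 300)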
Example #25
def get_fasttext_matrix(vocab, initial_embedding_np):
    """
    return an embeddings matrix
    :param self:
    :param embeddings_file:
    :param initial_embedding_np:
    :return: np array of [V,E]
    """
    from pyfasttext import FastText

    logging.info('Loading the FastText embeddings')
    model = FastText(cfg.embeddings_path)

    cnt = 0
    vec_array = initial_embedding_np
    old_avg = np.average(vec_array)
    old_std = np.std(vec_array)
    vec_array = vec_array.astype(np.float32)
    new_avg, new_std = 0, 0

    for word in vocab._item2idx:
        vec = model[word]
        vec = np.array(vec, np.float32)
        word_idx = vocab.encode(word)
        cnt += 1
        vec_array[word_idx] = vec
        new_avg += np.average(vec)
        new_std += np.std(vec)

    new_avg /= cnt
    new_std /= cnt
    logging.info(
        '%d known embedding. old mean: %f new mean %f, old std %f new std %f' %
        (cnt, old_avg, new_avg, old_std, new_std))
    return vec_array
Example #26
def print_subwords(fname):
    model = FastText(fname)
    maxn = model.args['maxn']
    res = {}

    for word in model.words:
        for subword, arr in zip(model.get_subwords(word),
                                model.get_numpy_subword_vectors(word)):
            # real ngram, not the full word?
            if len(subword) > maxn:
                continue

            res[subword] = arr

    for key in sorted(res.keys()):
        print('{} {}'.format(key, ' '.join(str(val) for val in res[key])))
Example #27
    def __init__(self, vocab_path, vocab):
        self.pad_token = '<blank>'
        self.unk_token = '<unk>'
        self.model = FastText(vocab_path)
        self.vocab = ['<blank>', '<unk>'] + vocab
        self.token2id = {}
        self.id2token = {}
        self.embed_dim = 300  #this is deployed temporarily
        if not os.path.exists('embeddings.npy'):
            self.embeddings = np.random.rand(self.size(), self.embed_dim)
        else:
            self.embeddings = np.load('embeddings.npy')
        self.logger = logging.getLogger("sentiment")

        i = 0
        for token in [self.pad_token, self.unk_token]:
            self.embeddings[i] = np.zeros([self.embed_dim])
            self.token2id[token] = i
            self.id2token[i] = token
            i += 1
        for token in vocab:
            self.token2id[token] = i
            self.id2token[i] = token
            i += 1
Example #28
    def make_embeddings_simple_in_memory(self,
                                         name="fasttext-crawl",
                                         hasHeader=True):
        nbWords = 0
        print('loading embeddings...')
        begin = True
        description = self._get_description(name)
        if description is not None:
            embeddings_path = description["path"]
            embeddings_type = description["type"]
            self.lang = description["lang"]
            print("path:", embeddings_path)
            if self.extension == 'bin':
                self.model = FastText(embeddings_path)
                nbWords = self.model.nwords
                self.embed_size = 300
            else:
                if embeddings_type == "glove":
                    hasHeader = False
                with open(embeddings_path) as f:
                    for line in f:
                        line = line.strip()
                        line = line.split(' ')
                        if begin:
                            if hasHeader:
                                # the first line gives the nb of words and the embedding size
                                nbWords = int(line[0])
                                self.embed_size = int(line[1].replace("\n", ""))
                                begin = False
                                continue
                            else:
                                begin = False
                        word = line[0]
                        #if embeddings_type == 'glove':
                        vector = np.array(
                            [float(val) for val in line[1:len(line)]],
                            dtype='float32')
                        #else:
                        #    vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32')
                        if self.embed_size == 0:
                            self.embed_size = len(vector)
                        self.model[word] = vector
                if nbWords == 0:
                    nbWords = len(self.model)
            print('embeddings loaded for', nbWords, "words and",
                  self.embed_size, "dimensions")
Example #29
    def __init__(self):
        start_time = time.time()
        # self.model = FastText("../data/input/models/sg_pyfasttext.bin")  # DEBUG
        self.model = FastText(
            "../data/input/models/880w_fasttext_skip_gram.bin")
        end_time = time.time()
        print(f"Loading word vector model cost: {end_time - start_time:.2f}s")

        # self.vocab_size, self.vector_size = self.model.numpy_normalized_vectors.shape  # OK
        self.vocab_size = self.model.nwords
        self.vector_size = self.model.args.get("dim")
        print(
            f"self.vector_size:{self.vector_size}, self.vocab_size: {self.vocab_size}"
        )  # self.vector_size:200, self.vocab_size: 925242

        # sentence representation: {"avg": average of the word vectors, "fasttext": get_numpy_sentence_vector, "matrix": matrix}
        self.sentence_vec_type = "avg"
Example #30
    def __init__(self, doc_catgy, n_vocab, emb_dim, out_channels, filter_size, word2index, pre_trained_embedding, multi_label):
        self.in_channels = 1
        self.out_channels = out_channels
        self.row_dim = emb_dim
        self.hidden_dim = 512 ## fixed
        self.doc_catgy = doc_catgy
        self.n_classes = len(doc_catgy)
        self.n_vocab = n_vocab
        self.filter_size = filter_size
        self.word2index = word2index
        self.multi_label = multi_label
        self.le = None
        if self.multi_label == 1:
            self.le = MultiLabelBinarizer(classes=[i[0] for i in sorted(self.doc_catgy.items(), key=lambda x: x[1])], sparse_output=False)
        elif self.multi_label == 0:
            self.le = LabelEncoder()
            self.le.fit([i[0] for i in sorted(self.doc_catgy.items(), key=lambda x: x[1])])
        self.look_up_table = None
        self.pre_trained_embedding = pre_trained_embedding
        super(XMLCnn, self).__init__()
        self.to_gpu()
        if self.pre_trained_embedding is not None:
            model = FastText(self.pre_trained_embedding)
            dim = len(model['a'])
            n_vocab = len(self.word2index.keys())
            self.look_up_table = self.xp.zeros((n_vocab, dim), dtype=np.float32)
            for word, index in tqdm(self.word2index.items()):
                try:
                    self.look_up_table[index] = chainer.cuda.to_gpu(model.get_numpy_vector(word))
                except Exception:
                    # fall back to a deterministic random vector when the word cannot be embedded
                    self.xp.random.seed(index)
                    self.look_up_table[index][:] = self.xp.random.uniform(-0.25, 0.25, dim)

        self.set_seed_random(123)
        with self.init_scope():
            if self.look_up_table is None:
                self.embedding=L.EmbedID(n_vocab, self.row_dim, ignore_label=-1,initialW=linear_init)
            else:
                self.embedding=L.EmbedID(n_vocab, self.row_dim, ignore_label=-1,initialW=self.look_up_table)
            self.conv1 = L.Convolution2D(self.in_channels,self.out_channels,(filter_size[0],self.row_dim), stride=2,initialW=linear_init)
            self.conv2 = L.Convolution2D(self.in_channels,self.out_channels,(filter_size[1],self.row_dim), stride=2,initialW=linear_init)
            self.conv3 = L.Convolution2D(self.in_channels,self.out_channels,(filter_size[2],self.row_dim), stride=2,initialW=linear_init)
            self.l1=L.Linear(in_size = None, out_size = self.hidden_dim, initialW=linear_init)
            self.l2=L.Linear(in_size = self.hidden_dim, out_size = self.n_classes,initialW=linear_init)
        self.to_gpu()
Example #31
class Classifier:
	model = None
	labeled_hudongList = None
	mean = None    # per-component mean
	var = None     # per-component variance
	title_simi = None
	openTypeList_simi = None
	baseInfoKeyList_simi = None
	baseInfoValueList_simi = None

	openTypeList_IDF = None   # inverse document frequency, used to weight open categories and attribute names
	baseInfoKeyList_IDF = None

	openTypeList_num = None   # number of items that have open categories / basic attributes
	baseInfoKeyList_num = None

	# similarity weights for: title, openTypeList, baseInfoKeyList, baseInfoValueList, detail
	weight = [0.2, 0.2, 0.2, 0.2, 0.2]
	# k for kNN
	k = 10

	def __init__(self, model_path):  # path to the fastText model
		self.model = FastText(model_path)
		print('classifier load over...')
		
	def load_trainSet(self, HudongList):  # takes the list of already-labeled HudongItems
		self.labeled_hudongList = HudongList
		self.openTypeList_IDF = {}
		self.baseInfoKeyList_IDF = {}
		self.openTypeList_num = 0
		self.baseInfoKeyList_num = 0
		# count document frequencies for every open category and attribute name
		for p in self.labeled_hudongList:
			if len(p.openTypeList) > 0:
				self.openTypeList_num += 1
			for t in p.openTypeList:
				if t in self.openTypeList_IDF:
					self.openTypeList_IDF[t] += 1
				else:
					self.openTypeList_IDF[t] = 1

			if len(p.baseInfoKeyList) > 0:
				self.baseInfoKeyList_num += 1
			for t in p.baseInfoKeyList:
				if t in self.baseInfoKeyList_IDF:
					self.baseInfoKeyList_IDF[t] += 1
				else:
					self.baseInfoKeyList_IDF[t] = 1

		for p in self.openTypeList_IDF:
			self.openTypeList_IDF[p] = log(1.0 * self.openTypeList_num / self.openTypeList_IDF[p])
			#print(str(p)+"---"+str(self.openTypeList_IDF[p]))

		for p in self.baseInfoKeyList_IDF:
			self.baseInfoKeyList_IDF[p] = log(1.0 * self.baseInfoKeyList_num / self.baseInfoKeyList_IDF[p])
			#print(str(p)+"---"+str(self.baseInfoKeyList_IDF[p]))

		
	def set_parameter(self, weight, k):  # set the hyperparameters
		self.weight = weight
		self.k = k

	# title similarity of two items
	def get_title_simi(self, item1, item2):
		title_simi = self.model.similarity(item1.title, item2.title)
		return title_simi

	# openTypeList similarity of two items
	def get_openTypeList_simi(self, item1, item2):
		openTypeList_simi = 0.0
		L1 = item1.openTypeList[:10]  # the first 10 open categories are enough
		L2 = item2.openTypeList[:10]
		for p1 in L1:  # pairwise similarity between the two category sets, summed then averaged
			for p2 in L2:
				cur = self.model.similarity(p1, p2)
				openTypeList_simi += cur

		fm = len(L1) * len(L2)
		if fm > 0:
			openTypeList_simi /= fm

		return openTypeList_simi
		
	# baseInfoKeyList similarity of two items
	def get_baseInfoKeyList_simi(self, item1, item2):
		baseInfoKeyList_simi = 0.0  # originally a Jaccard coefficient over attribute names; the IDF-weighted intersection is used instead
		s1 = set()
		s2 = set()
		for p in item1.baseInfoKeyList:
			s1.add(p)
		for p in item2.baseInfoKeyList:
			s2.add(p)
		and12 = s1 & s2
		or12 = s1 | s2
		fz = 0.0
		for p in and12:
			fz += self.baseInfoKeyList_IDF[p]
#		if len(or12)>0:
#			baseInfoKeyList_simi = 1.0*len(and12)/len(or12)
		baseInfoKeyList_simi = fz
		return baseInfoKeyList_simi
	
	# baseInfoValueList similarity of two items
	def get_baseInfoValueList_simi(self, item1, item2):
		s1 = set()
		s2 = set()
		dict1 = {}
		dict2 = {}
		count = 0
		for p in item1.baseInfoKeyList:
			s1.add(p)
			if count < len(item1.baseInfoValueList):
				dict1[p] = item1.baseInfoValueList[count]
			count += 1
		count = 0
		for p in item2.baseInfoKeyList:
			s2.add(p)
			if count < len(item2.baseInfoValueList):
				dict2[p] = item2.baseInfoValueList[count]
			count += 1
		and12 = s1 & s2

		baseInfoValueList_simi = 0.0  # for attributes whose names match, sum the IDF weight when the values are equal
		for s in and12:
			if s in dict1 and s in dict2 and dict1[s] == dict2[s] and s in self.baseInfoKeyList_IDF:
				baseInfoValueList_simi += 1.0 * self.baseInfoKeyList_IDF[s]
#		if len(and12)>0:
#			baseInfoValueList_simi /= len(and12)

		return baseInfoValueList_simi

	# currently unused
	def similarity(self, item1, item2):  # compare two pages; returns a similarity in [-1, 1]
		title_simi = self.model.similarity(item1.title, item2.title)

		openTypeList_simi = 0.0
		for p1 in item1.openTypeList:  # pairwise similarity between the two category sets, summed then averaged
			for p2 in item2.openTypeList:
				openTypeList_simi += self.model.similarity(p1, p2)
		fm = len(item1.openTypeList) * len(item2.openTypeList)
		if fm > 0:
			openTypeList_simi /= fm

		baseInfoKeyList_simi = 0.0  # Jaccard coefficient over the attribute names
		s1 = set()
		s2 = set()
		dict1 = {}
		dict2 = {}
		count = 0
		for p in item1.baseInfoKeyList:
			s1.add(p)
			dict1[p] = item1.baseInfoValueList[count]
			count += 1
		count = 0
		for p in item2.baseInfoKeyList:
			s2.add(p)
			dict2[p] = item2.baseInfoValueList[count]
			count += 1
		and12 = s1 & s2
		or12 = s1 | s2
		if len(or12) > 0:
			baseInfoKeyList_simi = 1.0 * len(and12) / len(or12)

		baseInfoValueList_simi = 0.0  # for attributes with matching names, average the similarity of their values
		for s in and12:
			baseInfoValueList_simi += self.model.similarity(dict1[s], dict2[s])
		if len(and12) > 0:
			baseInfoValueList_simi /= len(and12)

#		d1 = item1.detail[:60]   # only the first 60 characters, to keep the cost down
#		d2 = item2.detail[:60]
#		detail_simi = self.model.similarity(d1,d2)

		# linear weighted combination of the component similarities
		simi = self.weight[0]*title_simi + self.weight[1]*openTypeList_simi + self.weight[2]*baseInfoKeyList_simi + self.weight[3]*baseInfoValueList_simi

		return simi
		
		
	def KNN_predict(self, item):  # predict the category of a Hudong page
		curList = []  # temporary list of similarities between item and the training set

		mean = [0., 0., 0., 0., 0.]   # per-component mean
		var = [0., 0., 0., 0., 0.]    # per-component variance
		stand = [0., 0., 0., 0., 0.]  # per-component standard deviation
		maxx = [-2333.3, -2333.3, -2333.3, -2333.3, -2333.3]
		minn = [2333.3, 2333.3, 2333.3, 2333.3, 2333.3]
		title_simi = []
		openTypeList_simi = []
		baseInfoKeyList_simi = []
		baseInfoValueList_simi = []

		i = 0
		for p in self.labeled_hudongList:  # precompute and store each component similarity
			if p.title == item.title:  # if the item is already in the training set, return its label directly
				return p.label
			title_simi.append(self.get_title_simi(p, item))
			openTypeList_simi.append(self.get_openTypeList_simi(p, item))
			baseInfoKeyList_simi.append(self.get_baseInfoKeyList_simi(p, item))
			baseInfoValueList_simi.append(self.get_baseInfoValueList_simi(p, item))

			mean[0] += title_simi[i]
			mean[1] += openTypeList_simi[i]

			mean[2] += baseInfoKeyList_simi[i]
			maxx[2] = max(maxx[2], baseInfoKeyList_simi[i])
			minn[2] = min(minn[2], baseInfoKeyList_simi[i])

			mean[3] += baseInfoValueList_simi[i]
			maxx[3] = max(maxx[3], baseInfoValueList_simi[i])
			minn[3] = min(minn[3], baseInfoValueList_simi[i])

			i += 1

		for i in range(4):
			mean[i] /= len(self.labeled_hudongList)

		# compute the variance of each component
		for i in range(len(self.labeled_hudongList)):
			var[0] += (title_simi[i] - mean[0]) ** 2
			var[1] += (openTypeList_simi[i] - mean[1]) ** 2
			var[2] += (baseInfoKeyList_simi[i] - mean[2]) ** 2
			var[3] += (baseInfoValueList_simi[i] - mean[3]) ** 2

		for i in range(4):
			if var[i] == 0.0:
				var[i] = 0.000000001

		for i in range(4):
			stand[i] = sqrt(var[i])
		
		# Gaussian-normalize each component; items missing a component get the mean value first
		i = 0
		for p in self.labeled_hudongList:
			title_simi[i] = (title_simi[i] - mean[0]) / stand[0]

			if openTypeList_simi[i] == 0.0:  # assign the mean when the component is absent
				openTypeList_simi[i] = mean[1]
			openTypeList_simi[i] = (openTypeList_simi[i] - mean[1]) / stand[1]

			if baseInfoKeyList_simi[i] == 0.0:  # assign the mean when the component is absent
				baseInfoKeyList_simi[i] = mean[2]
			baseInfoKeyList_simi[i] = (baseInfoKeyList_simi[i] - mean[2]) / stand[2]

			baseInfoValueList_simi[i] = (baseInfoValueList_simi[i] - mean[3]) / stand[3]

			i += 1

		i = 0
		count = 0
		for p in self.labeled_hudongList:  # weighted sum of the component similarities
			s = self.weight[0]*title_simi[i] + self.weight[1]*openTypeList_simi[i] + self.weight[2]*baseInfoKeyList_simi[i] + self.weight[3]*baseInfoValueList_simi[i]
			count += 1
			if count < 2:
				pass
			#	print(str(title_simi[i])+" "+str(openTypeList_simi[i])+" "+str(baseInfoKeyList_simi[i])+" "+str(baseInfoValueList_simi[i]))
			i += 1
			l = p.label
			t = p.title
			curList.append(Node(s, l, t))

		curList.sort(key=lambda obj: obj.simi, reverse=True)  # sort the training set by similarity to item

		count = [0.] * 17  # one accumulator per category
		for i in range(self.k):
			label = int(curList[i].label)
			count[label] += curList[i].simi
			#print(curList[i].title+"----"+str(curList[i].simi)+'  '+str(label))  # print the k nearest neighbours

		maxx = -233
		answer = 0
		for i in range(17):
			if count[i] > maxx:
				maxx = count[i]
				answer = i
		return answer
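A hypothetical end-to-end sketch; wiki.zh.bin, the labeled_items list and unlabeled_item are assumptions (HudongItem objects with title, label, openTypeList, baseInfoKeyList and baseInfoValueList attributes).

clf = Classifier('wiki.zh.bin')
clf.load_trainSet(labeled_items)
clf.set_parameter([0.3, 0.3, 0.2, 0.2, 0.0], 10)  # component weights and k
print(clf.KNN_predict(unlabeled_item))  # integer category in [0, 16]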