Example No. 1
import random

import numpy as np

# load_word2vec_model, EntityModelCentroid, EntityModelLR,
# read_entity_word_seqs and evaluate_retrieval come from the
# surrounding project and are assumed to be in scope.
def train_eval(mode,
               model_file,
               descriptions_file,
               neg_words_mult=2.,
               lbda=50,
               min_words=50,
               eval_lines=5000,
               eval_words=10):

    model = load_word2vec_model(model_file, mmap='r')
    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        # Cumulative vocabulary counts; EntityModelLR uses these bins to
        # sample negative words in proportion to corpus frequency.
        bins = np.cumsum(
            [model.vocab[word].count for word in model.index2word])
        entity_model = EntityModelLR(bins, neg_words_mult, lbda)
    else:
        raise ValueError('unsupported mode %s' % mode)

    rng = random.Random(1729)

    eval_items = []

    # Shuffle each description; record the first eval_words tokens of the
    # first eval_lines entities as evaluation items, and train on each
    # description's remaining tokens.
    def sampled_word_seqs():
        for i, (entity, t, word_idxs) in \
            enumerate(read_entity_word_seqs(descriptions_file, model, min_words)):
            rng.shuffle(word_idxs)
            if i < eval_lines:
                eval_items.append(
                    (entity, word_idxs[:eval_words], len(word_idxs)))
            yield entity, t, word_idxs[eval_words:]

    entity_model.train(model, sampled_word_seqs())

    evaluate_retrieval(model, entity_model, eval_items)
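
The cumulative `bins` array built above is the standard trick for sampling words in proportion to corpus frequency: a uniform draw mapped through searchsorted lands in bin i with probability count[i]/total. A minimal, self-contained sketch of that pattern (toy vocabulary and counts are invented, and how EntityModelLR actually consumes bins is an assumption):

import numpy as np

vocab = ['the', 'cat', 'sat', 'mat']
counts = np.array([50, 10, 5, 5])     # hypothetical corpus counts

bins = np.cumsum(counts)              # [50, 60, 65, 70]
rng = np.random.RandomState(1729)

# A uniform integer in [0, total) falls in bin i with
# probability counts[i] / total.
draws = rng.randint(0, bins[-1], size=8)
neg_idxs = np.searchsorted(bins, draws, side='right')
print([vocab[i] for i in neg_idxs])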
Example No. 2
def eval(model_file, lr_entity_file, centroid_entity_file):
    # readline gives the interactive prompt emacs-style line editing.
    import readline
    readline.parse_and_bind('set editing-mode emacs')

    model = load_word2vec_model(model_file, mmap='r')
    lr_entity_model = EntityModel.load(lr_entity_file, mmap='r')
    centroid_entity_model = EntityModel.load(centroid_entity_file, mmap='r')

    # Pair each entity with its lowercased form so queries can match
    # entity names case-insensitively.
    norm_entities = [(entity.lower(), entity)
                     for entity in lr_entity_model.entities]

    while True:
        try:
            line = raw_input('> ').strip()
        except EOFError:
            break

        words, entities = parse_query(norm_entities, line)
        lr_top = top_entities(model, lr_entity_model, entities, words)
        centroid_top = top_entities(model, centroid_entity_model, entities,
                                    words)

        for (lr_score, lr_ent), (centroid_score,
                                 centroid_ent) in zip(lr_top, centroid_top):
            print '%-50s%10.3f | %-50s%10.3f' % (lr_ent, lr_score,
                                                 centroid_ent, centroid_score)
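
norm_entities pairs each entity with its lowercased form, presumably so parse_query can recognize entity mentions regardless of case. A minimal sketch of that kind of lookup, assuming a simple per-token exact match (the real parse_query may be more elaborate):

def find_entities(norm_entities, line):
    # norm_entities is a list of (lowercased, original) pairs.
    lookup = dict(norm_entities)
    tokens = line.lower().split()
    entities = [lookup[t] for t in tokens if t in lookup]
    words = [t for t in tokens if t not in lookup]
    return words, entities

print(find_entities([('paris', 'Paris')], 'hotels in Paris'))
# -> (['hotels', 'in'], ['Paris'])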
Example No. 3
def train_eval(mode, model_file, descriptions_file,
               neg_words_mult=2., lbda=50, min_words=50,
               eval_lines=5000, eval_words=10):

    model = load_word2vec_model(model_file, mmap='r')
    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        bins = np.cumsum([model.vocab[word].count
                          for word in model.index2word])
        entity_model = EntityModelLR(bins, neg_words_mult, lbda)
    else:
        raise ValueError('unsupported mode %s' % mode)

    rng = random.Random(1729)

    eval_items = []
    def sampled_word_seqs():
        for i, (entity, t, word_idxs) in \
            enumerate(read_entity_word_seqs(descriptions_file, model, min_words)):
            rng.shuffle(word_idxs)
            if i < eval_lines:
                eval_items.append((entity, word_idxs[:eval_words], len(word_idxs)))
            yield entity, t, word_idxs[eval_words:]

    entity_model.train(model, sampled_word_seqs())

    evaluate_retrieval(model, entity_model, eval_items)
Example No. 4
import json
from collections import OrderedDict

# load_word2vec_model, quantize, save_vectors and fast_accuracy are
# assumed to be in scope from the surrounding project.
def quant(input_file, output_template=None, target_err=0.1,
          transform=True, test_accuracy=None):
    model = load_word2vec_model(input_file, mmap='r')

    q, pred_bits, zeros, avg_err, quant_syn0, dequant_model = quantize(
        model, target_err, transform)
    # Predicted bits per weight and fraction of zero entries in the
    # quantized embedding matrix.
    pred_bps = float(pred_bits) / quant_syn0.size
    avg_zeros = float(zeros) / quant_syn0.size

    if output_template is not None:
        output_filename = '%s.e%.3f.%s' % (output_template, target_err, 'tr' if transform else 'nt')
        with open(output_filename + '.txt', 'w') as fout:
            save_vectors(fout, model.index2word, quant_syn0, q)

        dequant_model.save(output_filename + '.model')

    acc = None
    if test_accuracy is not None:
        acc = fast_accuracy(dequant_model.vocab, dequant_model.syn0,
                            test_accuracy, restrict=100000)

    print json.dumps(OrderedDict([
        ('q', q),
        ('transform', transform),
        ('pred_bps', float(pred_bps)),
        ('avg_zeros', float(avg_zeros)),
        ('avg_err', float(avg_err)),
        ('accuracy', acc),
    ]))
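
quant emits one JSON object per run, so sweeping target_err and collecting stdout gives a simple rate/distortion table. A sketch of parsing such output (the report lines here are invented, using the same keys quant prints):

import json

lines = [
    '{"q": 4, "pred_bps": 3.1, "avg_zeros": 0.42, "avg_err": 0.050}',
    '{"q": 8, "pred_bps": 4.7, "avg_zeros": 0.21, "avg_err": 0.010}',
]
reports = [json.loads(line) for line in lines]
for r in sorted(reports, key=lambda r: r['avg_err']):
    print('err=%.3f  bits/weight=%.2f  zeros=%.0f%%'
          % (r['avg_err'], r['pred_bps'], 100 * r['avg_zeros']))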
Example No. 5
def load_resources_for_infer_one(domain):
    """Load the resources needed to run inference for one domain."""
    # Load the third-party sentiment lexicon.
    general_opinion_doc = mongodb_client.db['opinion_resources'].find_one(
        {'doc_type': 'general_opinion'})
    res_for_one.general_opinion = general_opinion_doc['lexicon']
    # Load the user-defined aspect lexicon.
    res_for_one.user_defined_aspect = TAG_LIST
    # Load the word2vec model.
    res_for_one.word2vec_model = utils.load_word2vec_model()
    # Load the opinion pairs produced by the build step.
    build_pairs_query_res = mongodb_client.db['opinion_build_pairs'].find_one(
        {'domain': domain})
    pair_polarity = build_pairs_query_res['pair_polarity']
    res_for_one.pair_polarity = pair_polarity
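
The two MongoDB collections are assumed to hold documents of a particular shape. Based only on the keys accessed above, they presumably look something like this (all field contents invented):

# Assumed shape of the document in 'opinion_resources'.
general_opinion_doc = {
    'doc_type': 'general_opinion',
    'lexicon': {'great': 1.0, 'terrible': -1.0},  # word -> polarity
}

# Assumed shape of the document in 'opinion_build_pairs'; the real
# encoding of the aspect/opinion pair keys is unknown.
build_pairs_doc = {
    'domain': 'hotels',
    'pair_polarity': {'room,clean': 1.0},
}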
Example No. 6
def train(mode, model_file, descriptions_file, output_file=None,
          neg_words_mult=2., lbda=50, min_words=1):

    model = load_word2vec_model(model_file, mmap='r')
    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        bins = np.cumsum([model.vocab[word].count
                          for word in model.index2word])
        entity_model = EntityModelLR(bins, neg_words_mult, lbda)
    else:
        raise ValueError('unsupported mode %s' % mode)

    entity_model.train(model,
                       read_entity_word_seqs(descriptions_file, model, min_words))

    if output_file is not None:
        entity_model.save(output_file)
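
Putting this together with Example No. 2, a typical session trains and saves both entity models, then compares them interactively (all file names hypothetical):

# Hypothetical paths; train() and eval() as defined in these examples.
train('lr', 'vectors.w2v', 'descriptions.tsv', output_file='lr.entity')
train('centroid', 'vectors.w2v', 'descriptions.tsv',
      output_file='centroid.entity')
eval('vectors.w2v', 'lr.entity', 'centroid.entity')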
Example No. 7
def eval(model_file, lr_entity_file, centroid_entity_file):
    import readline
    readline.parse_and_bind('set editing-mode emacs')

    model = load_word2vec_model(model_file, mmap='r')
    lr_entity_model = EntityModel.load(lr_entity_file, mmap='r')
    centroid_entity_model = EntityModel.load(centroid_entity_file, mmap='r')

    norm_entities = [(entity.lower(), entity) for entity in lr_entity_model.entities]

    while True:
        try:
            line = raw_input('> ').strip()
        except EOFError:
            break

        words, entities = parse_query(norm_entities, line)
        lr_top = top_entities(model, lr_entity_model, entities, words)
        centroid_top = top_entities(model, centroid_entity_model, entities, words)

        for (lr_score, lr_ent), (centroid_score, centroid_ent) in zip(lr_top, centroid_top):
            print '%-50s%10.3f | %-50s%10.3f' % (lr_ent, lr_score, centroid_ent, centroid_score)
Example No. 8
def train(mode,
          model_file,
          descriptions_file,
          output_file=None,
          neg_words_mult=2.,
          lbda=50,
          min_words=1):

    model = load_word2vec_model(model_file, mmap='r')
    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        bins = np.cumsum(
            [model.vocab[word].count for word in model.index2word])
        entity_model = EntityModelLR(bins, neg_words_mult, lbda)
    else:
        raise ValueError('unsupported mode %s' % mode)

    entity_model.train(
        model, read_entity_word_seqs(descriptions_file, model, min_words))

    if output_file is not None:
        entity_model.save(output_file)
Example No. 9
def quant(input_file,
          output_template=None,
          target_err=0.1,
          transform=True,
          test_accuracy=None):
    model = load_word2vec_model(input_file, mmap='r')

    q, pred_bits, zeros, avg_err, quant_syn0, dequant_model = quantize(
        model, target_err, transform)
    pred_bps = float(pred_bits) / quant_syn0.size
    avg_zeros = float(zeros) / quant_syn0.size

    if output_template is not None:
        output_filename = '%s.e%.3f.%s' % (output_template, target_err,
                                           'tr' if transform else 'nt')
        with open(output_filename + '.txt', 'w') as fout:
            save_vectors(fout, model.index2word, quant_syn0, q)

        dequant_model.save(output_filename + '.model')

    acc = None
    if test_accuracy is not None:
        acc = fast_accuracy(dequant_model.vocab,
                            dequant_model.syn0,
                            test_accuracy,
                            restrict=100000)

    print json.dumps(
        OrderedDict([
            ('q', q),
            ('transform', transform),
            ('pred_bps', float(pred_bps)),
            ('avg_zeros', float(avg_zeros)),
            ('avg_err', float(avg_err)),
            ('accuracy', acc),
        ]))
Example No. 10
		vector = get_vector(analyzer, sentence)
		if vector is not None:
			X.append(vector)
			y.append(data[ix][1])
	return (X, y)

if __name__=="__main__":

	REPORT = "report.txt"

	# data[0] -> text
	# data[1] -> label (1 or 0)
	# data[2] -> id
	data, sentences = get_data("train.json")

	model = load_word2vec_model("GoogleNews-vectors-negative300.bin")
	ta = TextAnalyzer(model)

	X, y = get_X_y(ta, data, sentences)

	clf = svm.SVC()  # svm is presumably scikit-learn's sklearn.svm module
	clf.fit(X, y)

	test_data, test_sentences = get_data("test.json")
	X, y = get_X_y(ta, test_data, test_sentences)

	test_ids = [item[2] for item in test_data]
	true_pos = false_pos = true_neg = false_neg = 0

	y_pred = list()
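
The snippet ends just before the prediction loop. Given the counters it sets up, the continuation presumably looks something like the following (a reconstruction, not the original code):

	for ix, vector in enumerate(X):
		pred = clf.predict([vector])[0]
		y_pred.append(pred)
		if pred == 1 and y[ix] == 1:
			true_pos += 1
		elif pred == 1 and y[ix] == 0:
			false_pos += 1
		elif pred == 0 and y[ix] == 0:
			true_neg += 1
		else:
			false_neg += 1

	precision = float(true_pos) / max(true_pos + false_pos, 1)
	recall = float(true_pos) / max(true_pos + false_neg, 1)
	with open(REPORT, "w") as fout:
		fout.write("precision=%.3f recall=%.3f\n" % (precision, recall))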
Example No. 11
import json

def accuracy(input_file, questions_file, restrict=100000):
    model = load_word2vec_model(input_file, mmap='r')
    acc = fast_accuracy(model.vocab, model.syn0, questions_file, restrict)
    print json.dumps(acc)
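
restrict=100000 limits the analogy evaluation to the most frequent words, mirroring the usual word2vec convention. A hypothetical invocation (paths illustrative; questions-words.txt is the standard word2vec analogy set):

accuracy('vectors.w2v', 'questions-words.txt', restrict=30000)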