示例#1
0
import itertools
import logging
import os
import pickle
import sys

import numpy as np
from gensim.models import Word2Vec
from gensim.models.word2vec import KeyedVectors, LineSentence

# The project-local ``config`` module lives one directory up, so the search
# path must be extended before it is imported.
sys.path.append('..')
import config
# NOTE(review): not referenced in the visible part of this file; presumably a
# cap on the number of words per sentence/batch for the corpus reader — confirm.
MAX_WORDS_IN_BATCH = 1000

# Obtain the logger from the logging registry rather than instantiating
# ``logging.Logger`` directly.  A hand-built Logger has no parent, so its
# records never reach the root handler installed by ``basicConfig()`` and
# INFO messages are silently dropped.
logger = logging.getLogger("word2vec")
logger.setLevel(logging.INFO)

# Root handler format: "<timestamp>:<level>:<message>".
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')
# Set explicitly because basicConfig() is a no-op when the root logger
# already has handlers.
logging.root.setLevel(level=logging.INFO)

# Path read from the project config (section "question", key "all_seg_question").
# It is handed to LineSentence as the training corpus in train_word2vec().
question_path = config.getPathConfig("question", "all_seg_question")

# Path read from the project config (section "word2vec", key "word2vec"),
# with a ".teem" suffix appended.
# NOTE(review): presumably where the trained model is stored — confirm.
model_path = config.getPathConfig("word2vec", "word2vec") + ".teem"
# Integer read from the project config (section "word2vec", key "emb_dim").
# It is passed as ``size`` to Word2Vec in train_word2vec().
emb_dim = config.getIntConfig("word2vec", "emb_dim")

# Vocabulary and embedding paths from the config module, each with the same
# ".teem" suffix as the model path.
# NOTE(review): file formats are not visible here — confirm against the writer.
vocab_path = config.vocab_path + ".teem"
emb_path = config.emb_path + ".teem"


def train_word2vec():
    '''训练词项向量
    '''
    model = Word2Vec(sentences=LineSentence(question_path),
                     size=emb_dim,
                     window=5,
                     min_count=5,
示例#2
0
import logging
import numpy as np
from gensim.models import Word2Vec
from gensim.models.word2vec import KeyedVectors,LineSentence
import itertools
import sys
sys.path.append('..')
import config
# NOTE(review): not referenced in the visible part of this file; presumably a
# cap on the number of words per sentence/batch for the corpus reader — confirm.
MAX_WORDS_IN_BATCH = 1000


# Obtain the logger from the logging registry rather than instantiating
# ``logging.Logger`` directly.  A hand-built Logger has no parent, so its
# records never reach the root handler installed by ``basicConfig()`` and
# INFO messages are silently dropped.
logger = logging.getLogger("word2vec")
logger.setLevel(logging.INFO)

# Root handler format: "<timestamp>:<level>:<message>".
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')
# Set explicitly because basicConfig() is a no-op when the root logger
# already has handlers.
logging.root.setLevel(level=logging.INFO)

# Path read from the project config (section "kb", key "seg_kb").
# NOTE(review): presumably the word-segmented knowledge-base text — confirm.
kb_path=config.getPathConfig("kb","seg_kb")
# Path read from the project config (section "question", key "all_seg_question").
question_path=config.getPathConfig("question","all_seg_question")

# Path read from the project config (section "word2vec", key "word2vec").
# NOTE(review): presumably where the trained model is stored — confirm.
model_path=config.getPathConfig("word2vec","word2vec")
# Integer read from the project config (section "word2vec", key "emb_dim").
# NOTE(review): presumably the embedding dimensionality — confirm against usage.
emb_dim=config.getIntConfig("word2vec","emb_dim")

def any2unicode(text, encoding='utf8', errors='strict'):
    """Return `text` as a ``str``, decoding it first when it is not one already.

    Args:
        text: A ``str`` (returned unchanged) or a bytes-like object to decode.
        encoding: Codec used to decode non-``str`` input.
        errors: Decoding error policy forwarded to ``str()``.

    Returns:
        The original ``str`` object, or the decoded text.
    """
    needs_decoding = not isinstance(text, str)
    return str(text, encoding, errors=errors) if needs_decoding else text


# Shorter alias for any2unicode.
to_unicode = any2unicode
class MyLineSentence(object):
    """
    Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
    """
示例#3
0
from gensim.models.word2vec import KeyedVectors, LineSentence
import itertools
import sys
sys.path.append('..')
import config
import pickle
# NOTE(review): not referenced in the visible part of this file; presumably a
# cap on the number of words per sentence/batch for the corpus reader — confirm.
MAX_WORDS_IN_BATCH = 1000

# Obtain the logger from the logging registry rather than instantiating
# ``logging.Logger`` directly.  A hand-built Logger has no parent, so its
# records never reach the root handler installed by ``basicConfig()`` and
# INFO messages are silently dropped.
logger = logging.getLogger("word2vec")
logger.setLevel(logging.INFO)

# Root handler format: "<timestamp>:<level>:<message>".
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')
# Set explicitly because basicConfig() is a no-op when the root logger
# already has handlers.
logging.root.setLevel(level=logging.INFO)

# Input paths taken directly from attributes of the project config module.
# NOTE(review): presumably the knowledge-base text and the segmented question
# corpus — confirm against the config module.
kb_path = config.kb_path
question_path = config.all_seg_question_path

# Path read from the project config (section "word2vec", key "char2vec").
# NOTE(review): presumably where the character-level model is stored — confirm.
model_path = config.getPathConfig("word2vec", "char2vec")
# Integer read from the project config (section "word2vec", key "emb_dim").
# NOTE(review): presumably the embedding dimensionality — confirm against usage.
emb_dim = config.getIntConfig("word2vec", "emb_dim")

# Character vocabulary / embedding paths from the config module.
# NOTE(review): file formats are not visible here — confirm against the writer.
char_vocab_path = config.char_vocab_path
char_emb_path = config.char_embedding


def any2unicode(text, encoding='utf8', errors='strict'):
    """Coerce `text` to ``str``.

    A ``str`` argument is handed back untouched; anything else is decoded
    with ``str(text, encoding, errors=errors)``.

    Args:
        text: A ``str`` or a bytes-like object.
        encoding: Codec used when decoding is required.
        errors: Decoding error policy forwarded to ``str()``.

    Returns:
        The text as a ``str``.
    """
    if not isinstance(text, str):
        decoded = str(text, encoding, errors=errors)
        return decoded
    return text


# Shorter alias for any2unicode.
to_unicode = any2unicode
示例#4
0
#coding:utf-8
'''
Mean Reciprocal Rank: 平均排名倒数
'''
import sys
sys.path.append("..")
import config
from myutils.io import read_table, read_lines

# Train split: path from the project config (section "data", key "train_data")
# paired with the score path exposed by the config module.
# NOTE(review): presumably gold labels vs. model-predicted scores — confirm.
train_gold_path = config.getPathConfig("data", "train_data")
train_pre_path = config.train_score_path
# Test split: same pairing, using the "test_data" key and test_score_path.
test_gold_path = config.getPathConfig("data", "test_data")
test_pre_path = config.test_score_path

# Threshold for is_right(): a gold score must be strictly greater than this.
delta = 2


def is_right(gold):
    """Tell whether a gold label, parsed as a float, exceeds ``delta``.

    Args:
        gold: Label text; surrounding whitespace is stripped before parsing.

    Returns:
        bool: ``True`` when the parsed value is strictly greater than the
        module-level ``delta``, ``False`` otherwise.

    Raises:
        ValueError: If the stripped text is not a valid float literal.
    """
    score = float(gold.strip())
    return score > delta


class Metrics(object):
    def __init__(self, gold_path, pre_path):
        self.gold_path = gold_path
        self.pre_path = pre_path
        self.gold = None
        self.pre = None
        self.questions = dict()  #每个问题对应的标签和预测的得分