Example #1
 def __init__(self):
     self.stopwords = stopwords.words('english')
     # Lemmatizer
     self.lmtzr = WordNetLemmatizer()
     # Stemmer
     self.stemmer = PorterStemmer()
     self.word2vec_model = None
     self.words = re.compile(r"\w+", re.I)
     try:
         self.bigrams = Phrases.load('slm/app/cached_models/bigrams.gensim')
     except:
         self.bigrams = None
     try:
         self.trigrams = Phrases.load(
             'slm/app/cached_models/trigrams.gensim')
     except:
         self.trigrams = None
     try:
         self.dictionary = corpora.Dictionary.load(
             'slm/app/cached_models/dictionary.dict')
     except:
         self.dictionary = None
     try:
         self.tfidf = TfidfModel.load('slm/app/cached_models/tfidf.gensim')
     except:
         self.tfidf = None
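
A minimal usage sketch for the cached models loaded above (a sketch only; it assumes the same cached paths exist and loaded successfully, and the token list is hypothetical):

from gensim import corpora
from gensim.models import TfidfModel
from gensim.models.phrases import Phrases

# Sketch: chain the cached collocation models, then build a TF-IDF vector.
bigrams = Phrases.load('slm/app/cached_models/bigrams.gensim')
trigrams = Phrases.load('slm/app/cached_models/trigrams.gensim')
dictionary = corpora.Dictionary.load('slm/app/cached_models/dictionary.dict')
tfidf = TfidfModel.load('slm/app/cached_models/tfidf.gensim')

tokens = ["machine", "learning", "is", "fun"]   # hypothetical pre-tokenized sentence
phrased = trigrams[bigrams[tokens]]             # may merge learned collocations such as "machine_learning"
bow = dictionary.doc2bow(phrased)               # bag-of-words (token_id, count) pairs
weighted = tfidf[bow]                           # (token_id, tf-idf weight) pairs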
Example #2
    def __init__(self):

        '''
        Training parameters:
        '''

        self.w2v_dim=100
        self.num_feature=400
        self.batch_size=16
        self.num_epoch=30

        # self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model=Word2Vec.load('./data/word2vec/w2v.model')

        self.index2word_set = set(self.w2v_model.index2word)

        #self.bigram=None
        #self.trigram=None

        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')

        print('Build model...')

        self.model = Sequential()
        self.model.add(Dropout(0.2,input_shape=(self.num_feature,)))
        self.model.add(Dense(3, input_dim=self.num_feature, init='orthogonal'))
        self.model.add(Activation('softmax'))


        self.model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode="categorical")

        print('Model has been built!')
Example #3
    def __init__(self):
        '''
        Training parameters:
        '''

        self.w2v_dim = 100
        self.num_feature = 400
        self.batch_size = 16
        self.num_epoch = 30

        # self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')

        self.index2word_set = set(self.w2v_model.index2word)

        #self.bigram=None
        #self.trigram=None

        self.bigram = Phrases.load('./data/bigram.dat')
        self.trigram = Phrases.load('./data/trigram.dat')

        print('Build model...')

        self.model = Sequential()
        self.model.add(Dropout(0.2, input_shape=(self.num_feature, )))
        self.model.add(Dense(3, input_dim=self.num_feature, init='orthogonal'))
        self.model.add(Activation('softmax'))

        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           class_mode="categorical")

        print('Model has been built!')
Example #4
def getTrigramList(g_DataQueue, g_FinishRead, savePath, bigramPath,
                   trigramPath):
    """

    :param g_DataQueue:
    :param g_FinishRead:
    :param savePath: path where the vocabulary dictionary is saved
    :param bigramPath:
    :param trigramPath:
    :return:
    """
    count = 0
    vocabulary_dic = {}
    bigram = Phraser(Phrases.load(bigramPath))
    trigram = Phraser(Phrases.load(trigramPath))
    while (g_FinishRead.value == 0 or (not g_DataQueue.empty())):
        words = g_DataQueue.get()
        count += len(words)
        print("have processed sentences:", count)
        # extract phrases with the bigram + trigram models
        trigram_list = trigram[bigram[words]]
        del words
        gc.collect()
        # count each phrase/token into the vocabulary dictionary
        for phrase_list in trigram_list:
            for phrase in phrase_list:
                if phrase not in vocabulary_dic:
                    vocabulary_dic[phrase] = 0
                vocabulary_dic[phrase] += 1
    # write the vocabulary to disk
    fw = codecs.open(savePath, "w", encoding="utf-8")
    fw.write(json.dumps(vocabulary_dic))
    fw.close()
    del vocabulary_dic
    gc.collect()
Example #5
def train_w2v_model() -> (Phraser, Word2Vec):
    # Build Word2Vec model
    if not Path(model_file).exists():
        sent = [row.split() for row in df['clean_lyrics'] if row]
        # Build collocations
        if not Path(bigrams_file).exists():
            bigram_phrases = Phrases(sent,
                                     min_count=30,
                                     progress_per=10000,
                                     max_vocab_size=200000,
                                     common_terms=sentiment_terms)
            bigram = Phraser(bigram_phrases)
            bigram.save(bigrams_file)
            trigram_phrases = Phrases(bigram[sent],
                                      min_count=30,
                                      progress_per=10000,
                                      max_vocab_size=200000,
                                      common_terms=sentiment_terms)
            trigram = Phraser(trigram_phrases)
            trigram.save(trigrams_file)

        trigram = Phrases.load(trigrams_file)

        sentences = trigram[sent]

        cores = multiprocessing.cpu_count()
        w2v_model = Word2Vec(
            min_count=20,  # Remove rare words
            window=2,
            size=300,
            sample=6e-5,
            alpha=0.03,
            min_alpha=0.0007,
            negative=20,
            workers=cores - 1)

        t = time()

        w2v_model.build_vocab(sentences, progress_per=10000)

        print('Time to build vocab: {} mins'.format(round((time() - t) / 60,
                                                          2)))
        w2v_model.vocabulary.save(vocabulary_file)

        t = time()

        w2v_model.train(sentences,
                        total_examples=w2v_model.corpus_count,
                        epochs=30,
                        report_delay=1)

        print('Time to train the model: {} mins'.format(
            round((time() - t) / 60, 2)))

        w2v_model.save(model_file)
    trigram = Phrases.load(trigrams_file)
    w2v_model = Word2Vec.load(model_file)

    return trigram, w2v_model
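
A hypothetical call sketch for the function above (assumes the model files already exist; it applies the trigram model the same way the function itself does):

# Sketch: apply the returned phrase model, then look up a word vector.
trigram, w2v_model = train_w2v_model()
tokens = "i love rock and roll".split()       # assumed cleaned, tokenized lyric line
phrased = trigram[tokens]                     # may merge learned collocations into single tokens
word = phrased[-1]
if word in w2v_model.wv:                      # guard: the token may not be in the vocabulary
    vec = w2v_model.wv[word]                  # 300-dimensional vector (size=300 above)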
Example #6
    def test_build_phrase_models_real(self, doc_content_stream):

        from eea.corpus.processing.phrases.phrases import build_phrase_models
        from eea.corpus.utils import rand
        from gensim.models.phrases import Phrases
        from itertools import tee, chain
        import os.path
        import tempfile

        content_A, content_B, test_A = tee(doc_content_stream, 3)

        # proof that the simple_content_stream can be used for phrases
        # ph_model = Phrases(content_A)
        # phrases = list(ph_model.export_phrases(sents))
        # assert phrases[0][0].decode('utf-8') == 'freshwater resources'

        base_dir = tempfile.gettempdir()
        b_name = rand(10)
        base_path = os.path.join(base_dir, b_name)
        build_phrase_models(content_A, base_path, {'level': 2})

        assert b_name + '.2' in os.listdir(base_dir)
        assert not (b_name + '.3' in os.listdir(base_dir))
        os.remove(base_path + '.2')

        t_name = rand(10)
        base_path = os.path.join(base_dir, t_name)
        build_phrase_models(content_B, base_path, {'level': 3})

        assert t_name + '.2' in os.listdir(base_dir)
        assert t_name + '.3' in os.listdir(base_dir)

        pm2 = Phrases.load(base_path + '.2')
        pm3 = Phrases.load(base_path + '.3')

        os.remove(base_path + '.2')
        os.remove(base_path + '.3')

        # an iterator of sentences, each a list of words
        test_A = chain.from_iterable(doc.tokenized_text for doc in test_A)
        trigrams = pm3[pm2[test_A]]
        words = chain.from_iterable(trigrams)
        w2, w3 = tee(words, 2)

        bigrams = [w for w in w2 if w.count('_') == 1]
        assert len(bigrams) == 27622
        assert len(set(bigrams)) == 2060

        trigrams = [w for w in w3 if w.count('_') == 2]
        assert len(trigrams) == 11268
        assert len(set(trigrams)) == 706

        assert 'freshwater_resources' in bigrams
        assert 'water_stress_conditions' in trigrams
Example #7
 def __init__(self):
     reader = Reader()
     print('loading data')
     self.X_train=reader.getData(TRAIN)
     print('train data has been loaded!')
     self.X_valid=reader.getData(DEV)
     print('valid data has been loaded!')
     self.X_test=reader.getData(TEST)
     print('test data has been loaded!')
     self.c_title=[]
     self.c_body=[]
     self.bigram=Phrases.load('./data/bigram.dat')
     self.trigram=Phrases.load('./data/trigram.dat')
Example #8
 def __init__(self):
     reader = Reader()
     print('loading data')
     self.X_train = reader.getData(TRAIN)
     print('train data has been loaded!')
     self.X_valid = reader.getData(DEV)
     print('valid data has been loaded!')
     self.X_test = reader.getData(TEST)
     print('test data has been loaded!')
     self.c_title = []
     self.c_body = []
     self.bigram = Phrases.load('./data/bigram.dat')
     self.trigram = Phrases.load('./data/trigram.dat')
Example #9
    def __init__(self, sentences, filename=None):

        # model parameters
        self.sentences = sentences
        self.dataset = "CASEREPORT"
        self.tokenizer = "RAW"
        self.prune_stopwords = stopwords("pubmed")
        self.phrases = None
        self.threshold = 250
        self.decay = 2
        self.bigram_iter = 3

        # data file path
        models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
        if filename is None:
            filename = "PHRASE_%s_%s_%s_%s" % (self.threshold, self.decay, self.dataset, self.tokenizer, )
        self.filepath = os.path.join(models_folder, filename)

        # does an identical model already exist?
        model_exists = os.path.isfile(self.filepath)
        if model_exists:
            logging.info("LOADING - loading phrase data..")
            self.phrases = Phrases.load(self.filepath)
        else:
            logging.info("CREATE - creating phrase data..")
            self.build()
Example #10
    def __init__(self, train_data, dev_data, test_data):
        self.train_data = train_data
        self.dev_data = dev_data
        self.test_data = test_data

        # Hyper-parameters
        self.learningRate = 0.01
        self.trainSize = 2000
        self.testSize = 1000
        self.totalSize = self.trainSize + self.testSize
        self.maxEpochs = 10000
        self.num_processed = -1

        self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
        self.bigram = Phrases.load('./data/bigram.dat')
        self.trigram = Phrases.load('./data/trigram.dat')
Example #11
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        try:
            bigram = Phrases(self.sentences,
                             min_count=1,
                             threshold=.001,
                             scoring=dumb_scorer)
            bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
            bigram_loaded = Phrases.load(
                "test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
            seen_scores = []
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(
                seen_scores
            ) == 3  # 'graph minors' and 'survey human' and 'interface system'

        finally:
            if os.path.exists(
                    "test_phrases_testSaveLoadCustomScorer_temp_save.pkl"):
                os.remove(
                    "test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
Example #12
    def testCompatibilty(self):
        phrases = Phrases.load(datapath("phrases-3.6.0.model"))
        phraser = FrozenPhrases.load(datapath("phraser-3.6.0.model"))
        test_sentences = ['trees', 'graph', 'minors']

        self.assertEqual(phrases[test_sentences], ['trees', 'graph_minors'])
        self.assertEqual(phraser[test_sentences], ['trees', 'graph_minors'])
Example #13
    def testSaveLoadNoScoring(self):
        """ Saving and loading a Phrases object with no scoring parameter.
        This should ensure backwards compatibility with old versions of Phrases"""

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            del (bigram.scoring)
            bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
            bigram_loaded = Phrases.load(
                "test_phrases_testSaveLoadNoScoring_temp_save.pkl")
            seen_scores = set()
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])

        finally:
            if os.path.exists(
                    "test_phrases_testSaveLoadNoScoring_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
Example #14
 def testSaveLoadNoCommonTerms(self):
     """ Ensure backwards compatibility with old versions of Phrases, before common_terms"""
     bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
     self.assertEqual(bigram_loaded.common_terms, frozenset())
     # can make a phraser, cf #1751
     phraser = Phraser(bigram_loaded)  # does not raise
     phraser[["human", "interface", "survey"]]  # does not raise
Example #15
    def __init__(self,train_data,dev_data,test_data):
        self.train_data=train_data
        self.dev_data=dev_data
        self.test_data=test_data

        # Hyper-parameters
        self.learningRate=0.01
        self.trainSize=2000
        self.testSize=1000
        self.totalSize = self.trainSize + self.testSize
        self.maxEpochs=10000
        self.num_processed=-1

        self.w2v_model=Word2Vec.load('./data/word2vec/w2v.model')
        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')
Example #16
 def testSaveLoadNoCommonTerms(self):
     """Ensure backwards compatibility with old versions of Phrases, before connector_words."""
     bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
     self.assertEqual(bigram_loaded.connector_words, frozenset())
     # can make a phraser, cf #1751
     phraser = FrozenPhrases(bigram_loaded)  # does not raise
     phraser[["human", "interface", "survey"]]  # does not raise
Example #17
def trainSOPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    """

    :param g_DataQueue: global queue holding data read from the database
    :param g_FinishRead: flag indicating whether reading from the database has finished
    :param savePath: path where the trained phrase model is saved
    :param priorPhrasePath: path to the previously trained phrase model (None for the first pass)
    :return:
    """
    count = 0
    phrase = Phrases(None, min_count=10, threshold=15)
    if (priorPhrasePath is None):
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))
    while (g_FinishRead.value == 0 or (not g_DataQueue.empty())):
        data = g_DataQueue.get()
        count += len(data)
        print("have processed:", count)
        words = []
        reSub0 = re.compile(
            "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
        )  # URL
        reSub1 = re.compile(
            "[()\"{},:/-]|[^a-z]'|'[^a-z;?.!]|'$")  # replace with " "
        reSub2 = re.compile(
            "'[.?;!]")  # replace with "."; mainly handles possessives and the various cases around single quotes
        reSplit1 = re.compile(r"\.[^a-z0-9]|[?!;]")
        # extract words
        for t in data:
            if (t[0] is not None):
                st = re.sub(reSub0, " ", t[0].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if (len(sen_word) > 6):
                        words.append(sen_word)
            if (t[1] is not None):
                st = re.sub(reSub0, " ", t[1].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if (len(sen_word) > 6):
                        words.append(sen_word)
        del data
        gc.collect()
        # train the phrase model
        if (priorPhraser is None):  # first training pass
            phrase.add_vocab(words)
        else:  # already trained once; look for longer phrases on top of the prior phraser
            phrase.add_vocab(priorPhraser[words])
        del words
        # print(len(phrase.vocab))
        gc.collect()
    phrase.save(savePath)
Example #18
def load_phraser_models(models_dir, bigram_model_name, trigram_model_name):
    bigram_model = None
    trigram_model = None

    # check models dir
    if not os.path.isdir(models_dir):
        return bigram_model, trigram_model
    # check bigram model
    elif not os.path.exists(os.path.join(models_dir, bigram_model_name)):
        return bigram_model, trigram_model
    else:
        bigram_model = Phrases.load(os.path.join(models_dir,
                                                 bigram_model_name))
        # check trigram model
        if os.path.exists(os.path.join(models_dir, trigram_model_name)):
            trigram_model = Phrases.load(
                os.path.join(models_dir, trigram_model_name))

    return bigram_model, trigram_model
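
A hypothetical usage sketch (directory and file names are assumptions, not from the source):

# Sketch: load the models, then apply whichever ones are available.
bigram_model, trigram_model = load_phraser_models("models", "bigram.model", "trigram.model")
tokens = ["new", "york", "stock", "exchange"]
if bigram_model is not None:
    tokens = bigram_model[tokens]             # may merge pairs, e.g. "new_york"
if trigram_model is not None:
    tokens = trigram_model[tokens]            # may merge further on top of the bigrams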
Example #19
def use_phrase_models(content, files, settings):

    for doc in content:
        text = doc.tokenized_text
        for fpath in files:
            phrases = Phrases.load(fpath)
            text = phrases[text]

        text = ". ".join([" ".join(sent) for sent in text])
        yield set_text(doc, text)
Example #20
    def test_save_load_with_connector_words(self):
        """Test saving and loading a Phrases object."""
        connector_words = frozenset({'of'})
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         connector_words=connector_words)
        with temporary_file("test.pkl") as fpath:
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)

        assert bigram_loaded.connector_words == connector_words
Example #21
    def testCompatibilty(self):
        phr = Phraser.load(datapath("phraser-3.6.0.model"))
        model = Phrases.load(datapath("phrases-3.6.0.model"))

        test_sentences = ['trees', 'graph', 'minors']
        expected_res = ['trees', 'graph_minors']

        phr_out = phr[test_sentences]
        model_out = model[test_sentences]

        self.assertEqual(phr_out, expected_res)
        self.assertEqual(model_out, expected_res)
Example #23
def load_phrase_models(indir, n):
    """

    :param indir:
    :param n:
    :return:
    """
    models = []
    for i in range(2, n + 1):
        infile = "%s%sgram.phrase.model" % (indir, i)
        models += [Phrases.load(infile)]
    return models
Example #24
 def __init__(self,
              search_pattern,
              window=5,
              lemma=False,
              document_ids=None,
              path_to_phrase_model=None):
     self.ids = document_ids
     self.window = window
     self.search_pattern = search_pattern
     self.lemma = lemma
     self.path_to_phrase_model = path_to_phrase_model
     if path_to_phrase_model is not None:
         self.phraser_model = Phraser(Phrases.load(path_to_phrase_model))
Example #25
    def testSaveLoadStringScoring(self):
        """ Saving and loading a Phrases object with a string scoring parameter.
        This should ensure backwards compatibility with the previous version of Phrases"""
        bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl"))
        seen_scores = set()
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444  # score for human interface
        ])
Example #26
    def __init__(self):
        '''
        Training parameters:
        '''

        self.w2v_dim = 100
        self.num_feature = 400
        self.batch_size = 16
        self.num_epoch = 1

        #self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
        self.index2word_set = set(self.w2v_model.index2word)
        self.bigram = Phrases.load('./data/bigram.dat')
        self.trigram = Phrases.load('./data/trigram.dat')

        print('Build model...')

        param_dist = {
            "n_estimators": sp_randint(20, 250),
            "criterion": ["gini", "entropy"],
            "max_depth": sp_randint(10, 300),
            "min_samples_split": sp_randint(1, 30),
            "min_samples_leaf": sp_randint(1, 30),
            "max_features": sp_randint(1, 200),
            "bootstrap": [True, False],
            'random_state': sp_randint(1, 1000000),
        }
        # build a classifier
        clf = RandomForestClassifier(n_jobs=8)
        # run randomized search
        self.model = RandomizedSearchCV(clf,
                                        param_distributions=param_dist,
                                        n_iter=10,
                                        cv=9,
                                        n_jobs=8)

        print('Model has been built!')
Example #27
    def testSaveLoadNoScoring(self):
        """Test backwards compatibility with old versions of Phrases with no scoring parameter."""
        bigram_loaded = Phrases.load(datapath("phrases-no-scoring.pkl"))
        test_sentences = [[
            'graph', 'minors', 'survey', 'human', 'interface', 'system'
        ]]
        seen_scores = set(
            round(score, 3)
            for score in bigram_loaded.find_phrases(test_sentences).values())

        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444  # score for human interface
        ])
Example #28
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            seen_scores = []
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
Example #29
    def test_save_load_string_scoring(self):
        """Test backwards compatibility with a previous version of Phrases with custom scoring."""
        bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl"))
        test_sentences = [[
            'graph', 'minors', 'survey', 'human', 'interface', 'system'
        ]]
        seen_scores = set(
            round(score, 3)
            for score in bigram_loaded.find_phrases(test_sentences).values())

        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444  # score for human interface
        ])
Example #30
    def __init__(self):

        '''
        Training parameters:
        '''

        self.w2v_dim=100
        self.num_feature=400
        self.batch_size=16
        self.num_epoch=1

        #self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model=Word2Vec.load('./data/word2vec/w2v.model')
        self.index2word_set = set(self.w2v_model.index2word)
        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')

        print('Build model...')

        param_dist = {
            "n_estimators":sp_randint(20,250),
            "criterion": ["gini", "entropy"],
            "max_depth": sp_randint(10, 300),
            "min_samples_split": sp_randint(1, 30),
            "min_samples_leaf": sp_randint(1, 30),
            "max_features": sp_randint(1, 200),
            "bootstrap": [True, False],
            'random_state':sp_randint(1, 1000000),
        }
        # build a classifier
        clf = RandomForestClassifier(n_jobs=8)
        # run randomized search
        self.model=RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=10,cv=9,n_jobs=8)

        print('Model has been built!')
Example #31
    def testSaveLoadStringScoring(self):
        """ Saving and loading a Phrases object with a string scoring parameter.
        This should ensure backwards compatibility with the previous version of Phrases"""
        bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl"))
        seen_scores = set()
        test_sentences = [[
            'graph', 'minors', 'survey', 'human', 'interface', 'system'
        ]]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444  # score for human interface
        ])
Example #32
def trainPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    count = 0
    phrase = Phrases(None, min_count=15, threshold=10, max_vocab_size=40000000)
    if (priorPhrasePath is None):
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))
    while (g_FinishRead.value == 0 or (not g_DataQueue.empty())):
        words = g_DataQueue.get()
        if (priorPhraser is None):  # first training pass
            phrase.add_vocab(words)
        else:  # already trained once; look for longer phrases on top of the prior phraser
            phrase.add_vocab(priorPhraser[words])
        del words
        gc.collect()
    phrase.save(savePath)
Example #33
    def testSaveLoad(self):
        """ Saving and loading a Phrases object."""

        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            seen_scores = set()
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])
Example #34
    def testSaveLoad(self):
        """Test saving and loading a Phrases object."""
        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            seen_scores = set(
                round(score, 3) for score in bigram_loaded.find_phrases(
                    test_sentences).values())

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])
Example #35
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
            bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
            bigram_loaded = Phrases.load("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
            seen_scores = []
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'

        finally:
            if os.path.exists("test_phrases_testSaveLoadCustomScorer_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
Example #36
def create_dictionary(texts,
                      dest_file: str,
                      build_bigram,
                      working_directory=DIR):
    """
    Reads the file specified by source_file, creates a dictionary and saves it to the dest_file
    path.
    :param working_directory: The path to the directory where the bigram model files should be saved.
    :param build_bigram: 1 if building a new phrases object is needed else an already processed bigram model will
                         be loaded.
    :param source_file: path to source text file.
    :param dest_file: path to save dictionary to.
    :return:
    """
    # collect statistics about all tokens
    stoplist = stopwords.words('english')
    if build_bigram:
        bigram = Phrases([tweet.split() for tweet in texts])
        bigram.save(working_directory + '/bigram_model.phrase')
    else:
        bigram = Phrases.load(working_directory + '/bigram_model.phrase')
    phraser = Phraser(bigram)
    # Build dictionary
    dictionary = corpora.Dictionary(phraser[line.lower().split()]
                                    for line in texts)
    # remove stop words and words that appear only once
    stop_ids = [
        dictionary.token2id[stopword] for stopword in stoplist
        if stopword in dictionary.token2id
    ]
    once_ids = [
        tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
        if docfreq == 1
    ]
    dictionary.filter_tokens(
        stop_ids +
        once_ids)  # remove stop words and words that appear only once
    dictionary.filter_extremes(no_below=0.3, no_above=0.85)
    dictionary.compactify(
    )  # remove gaps in id sequence after words that were removed
    dictionary.save(dest_file)
    print(dictionary)
    print(dictionary.token2id)
    return dictionary
Example #37
def identify_phrases(sentence, path_to_gensim_phrase_model):
    """Identify multiword expression by a trained phrase model.

    Parameters
    ----------
    sentence : {list}
        list with tokens as elements
    path_to_gensim_phrase_model : {str}
        Absolute path to the model.

    Returns
    -------
    list
        List with tokens as elements.
    """
    phrase_model = Phrases.load(path_to_gensim_phrase_model)
    phraser_model = Phraser(phrase_model)
    new_sentence = phraser_model[sentence]
    return new_sentence
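
A hypothetical call sketch (the model path is an assumption, not from the source):

# Sketch: phrase-merge a single tokenized sentence.
sentence = ["new", "york", "is", "a", "big", "city"]
merged = identify_phrases(sentence, "/abs/path/to/phrase.model")
# e.g. ["new_york", "is", "a", "big", "city"] if the model learned that bigram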
Example #38
    def testSaveLoadCustomScorer(self):
        """Test saving and loading a Phrases object with a custom scorer."""
        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences,
                             min_count=1,
                             threshold=.001,
                             scoring=dumb_scorer)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            seen_scores = list(
                bigram_loaded.find_phrases(test_sentences).values())

            assert all(score == 1 for score in seen_scores)
            assert len(
                seen_scores
            ) == 3  # 'graph minors' and 'survey human' and 'interface system'
Example #39
    def testSaveLoad(self):
        """ Saving and loading a Phrases object."""

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            bigram.save("test_phrases_testSaveLoad_temp_save.pkl")
            bigram_loaded = Phrases.load("test_phrases_testSaveLoad_temp_save.pkl")
            seen_scores = set()
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])

        finally:
            if os.path.exists("test_phrases_testSaveLoad_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoad_temp_save.pkl")
Example #40
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences,
                             min_count=1,
                             threshold=.001,
                             scoring=dumb_scorer)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            seen_scores = []
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(
                seen_scores
            ) == 3  # 'graph minors' and 'survey human' and 'interface system'
Example #41
    def testSaveLoadNoScoring(self):
        """ Saving and loading a Phrases object with no scoring parameter.
        This should ensure backwards compatibility with old versions of Phrases"""

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            del(bigram.scoring)
            bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
            bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
            seen_scores = set()
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])

        finally:
            if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
Example #42
    def get_trigram_model(self, recalculate=False, from_scratch=True):

        if not os.path.isfile(
                self.paths.trigram_model_filepath) or recalculate:

            if not from_scratch:
                raise ValueError(
                    'No trigram model file exists but from_scratch is False')

            print('Building tri-gram model...')
            bigram_sentences = LineSentence(
                self.paths.bigram_sentences_filepath)
            trigram_model = Phrases(bigram_sentences)
            trigram_model = Phraser(trigram_model)
            print('Writing model...')
            trigram_model.save(self.paths.trigram_model_filepath)
        else:
            print('Loading tri-gram model...')
            trigram_model = Phrases.load(self.paths.trigram_model_filepath)

        print('Done!')
        return trigram_model
Example #43
from reader import Reader,TRAIN,TEST,DEV,EXTRA
from preprocess import preprocess
from gensim.models.phrases import Phrases
reader = Reader()
sentences=reader.getText(TRAIN+EXTRA)
# use phrase only when it has already trained


bigram=Phrases.load('./data/bigram.dat')
trigram=Phrases.load('./data/trigram.dat')
sen_set=set()
with open('./data/text_cleaned_phrase.txt','w') as f:
    for sentence in sentences:
        s=preprocess(sentence,bigram=bigram,trigram=trigram)
        if s not in sen_set:
            sen_set.add(s)
            f.write(s)
            f.write('\n')


'''
# for phrase training only

with open('./data/text_cleaned.txt','w') as f:
    for sentence in sentences:
        f.write(preprocess(sentence,no_stopwords=True))
        f.write('\n')
'''
Example #44
    def __init__(self):
        self.session = tf.Session()
        '''
        Training parameters:
        '''

        self.w2v_dim=100
        self.num_feature=400
        self.batch_size=32
        self.num_epoch=10000
        self.num_hidden_1=3
        self.num_hidden_2=3

        self.number_of_layers=3

        #self.max_len = 50
        self.max_len_title=6
        self.max_len_body=38

        self.d2v_model=Doc2Vec.load('data/word2vec/d2v.model')
        #self.bigram = None
        #self.trigram =None
        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')

        # Model
        self.input=tf.placeholder(tf.float32,[None,self.w2v_dim*4])


        self.dropout_input = tf.placeholder(tf.float32)
        self.dropout_hidden = tf.placeholder(tf.float32)

        self.target = tf.placeholder(tf.float32, [None, 3])


        # 2-layer NN
        with tf.variable_scope("NN", initializer=tf.random_uniform_initializer()):
            W_1 = tf.get_variable("W_1", [self.w2v_dim*4, self.num_hidden_1])
            b_1 = tf.get_variable("b_1", [self.num_hidden_1])
            # W_2 = tf.get_variable("W_2", [self.num_hidden_1, self.num_hidden_2])
            # b_2 = tf.get_variable("b_2", [self.num_hidden_2])

            # input = tf.nn.dropout(input, self.dropout_input)
            # y_1 = tf.sigmoid(tf.matmul(self.input, W_1)+b_1)
            # y_1 = tf.nn.dropout(y_1, self.dropout_hidden)
            # y_2 = tf.matmul(y_1, W_2)+b_2
        y_2 = tf.matmul(self.input, W_1)+b_1

        self.y_pred=tf.nn.softmax(y_2)
        self.y_pred=tf.clip_by_value(self.y_pred,1e-7, 1.0)
        self.cross_entropy = -tf.reduce_mean(self.target*tf.log(self.y_pred))


        # Optimizer.

        global_step = tf.Variable(0)
        # optimizer = tf.train.GradientDescentOptimizer(0.1)
        # optimizer = tf.train.AdamOptimizer(0.01)
        # gradients, v = zip(*optimizer.compute_gradients(self.cross_entropy))
        # gradients, _ = tf.clip_by_global_norm(gradients, 50)
        # self.optimizer= optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
        self.optimizer = tf.train.AdamOptimizer(0.01).minimize(self.cross_entropy)


        print('Model has been built!')
Example #45
    def __init__(self):
        self.session = tf.Session()
        '''
        Training parameters:
        '''

        self.w2v_dim=30
        self.num_feature=400
        self.batch_size=32
        self.num_epoch=10000
        self.num_hidden_1=50
        self.num_hidden_2=3

        self.number_of_layers=1

        #self.max_len = 50
        self.max_len_title=6
        self.max_len_body=38

        #self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model=Word2Vec.load('data/word2vec/w2v.model')
        self.index2word_set = set(self.w2v_model.index2word)
        #self.bigram = None
        #self.trigram =None
        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')

        # Model
        self.input_0=tf.placeholder(tf.float32,[self.max_len_title,self.batch_size,self.w2v_dim])
        self.input_1=tf.placeholder(tf.float32,[self.max_len_title,self.batch_size,self.w2v_dim])
        self.input_0_=tf.placeholder(tf.float32,[self.max_len_body,self.batch_size,self.w2v_dim])
        self.input_1_=tf.placeholder(tf.float32,[self.max_len_body,self.batch_size,self.w2v_dim])

        self.dropout_input = tf.placeholder(tf.float32)
        self.dropout_hidden = tf.placeholder(tf.float32)

        self.target = tf.placeholder(tf.float32, [self.batch_size, 3])

        input_0=array_ops.unpack(self.input_0)
        input_1=array_ops.unpack(self.input_1)
        input_0_=array_ops.unpack(self.input_0_)
        input_1_=array_ops.unpack(self.input_1_)


        def _rnn(inputs, reverse=False):
            with tf.variable_scope("GRU_RNN") as scope:
                cell=rnn_cell.GRUCell(self.w2v_dim)
                cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=self.dropout_input)
                stacked_cell = rnn_cell.MultiRNNCell([cell] * self.number_of_layers)
                state = stacked_cell.zero_state(self.batch_size, tf.float32)
                if reverse:
                    inputs=reversed(inputs)
                for time, input_ in enumerate(inputs):
                    if time > 0: scope.reuse_variables()
                    output, state = stacked_cell(input_, state)
                return state

        with tf.variable_scope('Feature_Generator') as scope:
            state_0 = _rnn(input_0)
            scope.reuse_variables()
            state_1 = _rnn(input_1)
            state_0_ = _rnn(input_0_)
            state_1_ = _rnn(input_1_)
        '''
        with tf.variable_scope('Feature_Generator_body') as scope:
            state_0_ = _rnn(input_0_)
            scope.reuse_variables()
            state_1_ = _rnn(input_1_)
        '''
        '''
        with tf.variable_scope('Feature_Generator_body_reverse') as scope:
            state_0_reverse = _rnn(input_0_, reverse=True)
            scope.reuse_variables()
            state_1_reverse = _rnn(input_1_, reverse=True)
        '''

        '''
        with tf.variable_scope('Feature_Generator_title') as scope:
            state_0 = _rnn(input_0)
            scope.reuse_variables()
            state_1 = _rnn(input_1)

        with tf.variable_scope('Feature_Generator_body') as scope:
            state_0_ = _rnn(input_0_)
            scope.reuse_variables()
            state_1_ = _rnn(input_1_)


        # state=tf.concat(1,[tf.abs(tf.sub(state_0,state_1)),tf.mul(state_0,state_1),
        #                   tf.abs(tf.sub(state_0_,state_1_)),tf.mul(state_0_,state_1_)])


        # state=tf.concat(1,[state_0,state_1, state_0_, state_1_])
        # state = tf.ones([32,10])

        # state=tf.concat(1,[tf.abs(tf.sub(state_0,state_1)),tf.mul(state_0,state_1)])
        '''

         # 2-layer NN
        with tf.variable_scope("NN", initializer=tf.random_uniform_initializer(-1.0,1.0)):
            self.W_mul = tf.get_variable("W_mul", [state_0_.get_shape()[1]*2,self.num_hidden_1])
            self.W_sub = tf.get_variable("W_sub", [state_0_.get_shape()[1]*2,self.num_hidden_1])
            self.b = tf.get_variable("b", [self.num_hidden_1])

            self.W_softmax=tf.get_variable("W_softmax", [self.num_hidden_1,self.num_hidden_2])
            self.b_softmax = tf.get_variable("b_softmax", [self.num_hidden_2])

        # h_mul = tf.mul(state_0,state_1)
        # h_sub = tf.abs(tf.sub(state_0,state_1))
        h_mul = tf.concat(1,[tf.mul(state_0,state_1),tf.mul(state_0_,state_1_)])
        h_sub = tf.concat(1,[tf.abs(tf.sub(state_0,state_1)),tf.abs(tf.sub(state_0_,state_1_))])

        y_1 = tf.nn.sigmoid(tf.matmul(h_mul, self.W_mul)+tf.matmul(h_sub, self.W_sub)+self.b)
        y_2 = tf.matmul(y_1, self.W_softmax)+self.b_softmax

        # regularizers = (tf.nn.l2_loss(self.W_1) + tf.nn.l2_loss(self.b_1)+tf.nn.l2_loss(self.W_2) + tf.nn.l2_loss(self.b_2))

        '''
        state_0_title_normalized = tf.nn.l2_normalize(state_0, 1)
        state_1_title_normalized = tf.nn.l2_normalize(state_1, 1)
        state_0_body_normalized = tf.nn.l2_normalize(state_0_, 1)
        state_1_body_normalized = tf.nn.l2_normalize(state_1_, 1)

        dist_title_ = tf.mul(state_0_title_normalized, state_1_title_normalized)
        dist_body_ = tf.mul(state_0_body_normalized, state_1_body_normalized)s

        dist_title=tf.reduce_sum(dist_title_, 1, keep_dims=True)
        dist_body=tf.reduce_sum(dist_body_, 1, keep_dims=True)

        feature = tf.concat(1, [dist_title,dist_body])

        with tf.variable_scope("log_reg", initializer=tf.random_uniform_initializer()):
             self.W = tf.get_variable("W", [feature.get_shape()[1],3])
             self.b = tf.get_variable("b", [3])

        y_2 = tf.matmul(feature, self.W)+self.b
        '''
        '''
        with tf.variable_scope("log_reg", initializer=tf.random_uniform_initializer()):
            self.W_1 = tf.get_variable("W_1", [state.get_shape()[1],self.num_hidden_1])
            self.b_1 = tf.get_variable("b_1", [self.num_hidden_1])
            self.W_2 = tf.get_variable("W_2", [self.num_hidden_1,self.num_hidden_2])
            self.b_2 = tf.get_variable("b_2", [self.num_hidden_2])
        '''
        '''
        # Create model
        def multilayer_perceptron(_X, _weights, _biases):
            layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1'])) #Hidden layer with RELU activation
            layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, _weights['h2']), _biases['b2'])) #Hidden layer with RELU activation
            return tf.matmul(layer_2, _weights['out']) + _biases['out']

        # Store layers weight & bias
        weights = {
            'h1': tf.Variable(tf.random_normal([10, 10])),
            'h2': tf.Variable(tf.random_normal([10, 5])),
            'out': tf.Variable(tf.random_normal([5, 3]))
        }
        biases = {
            'b1': tf.Variable(tf.random_normal([10])),
            'b2': tf.Variable(tf.random_normal([5])),
            'out': tf.Variable(tf.random_normal([3]))
        }
        # Construct model
        self.y_pred = multilayer_perceptron(state, weights, biases)

        # Define loss and optimizer
        self.cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(self.y_pred, self.target)) # Softmax loss
        self.optimizer = tf.train.AdamOptimizer(learning_rate=0.1).minimize(self.cross_entropy) # Adam Optimizer
        '''

        # self.W = tf.Variable(tf.zeros([10, 3]))
        # self.b = tf.Variable(tf.zeros([3]))
        # y_1 = tf.sigmoid(tf.matmul(state, self.W_1)+self.b_1)
        # y_2 = tf.sigmoid(tf.matmul(y_1, self.W_2)+self.b_2)
        # self.y_pred = tf.nn.softmax(tf.nn.sigmoid(tf.add(tf.matmul(state, self.W),self.b)))
        self.y_pred=tf.nn.softmax(y_2)
        # self.y_pred = tf.nn.softmax(tf.nn.sigmoid(tf.matmul(state, self.W_1)+self.b_1))
        self.cross_entropy = -tf.reduce_mean(self.target*tf.log(self.y_pred))
        # self.optimizer = tf.train.AdamOptimizer().minimize(self.cross_entropy)
        # self.optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(self.cross_entropy)
        # self.optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(self.cross_entropy)
        # self.gradstep = self.optimizer.compute_gradients(self.cross_entropy)


        # Optimizer.

        global_step = tf.Variable(0)
        # optimizer = tf.train.GradientDescentOptimizer(0.1)
        optimizer = tf.train.AdagradOptimizer(0.1)
        gradients, v = zip(*optimizer.compute_gradients(self.cross_entropy))
        gradients, _ = tf.clip_by_global_norm(gradients, 10)
        self.optimizer= optimizer.apply_gradients(zip(gradients, v), global_step=global_step)



        print('Model has been built!')
Example #46
    def __init__(self):
        self.session = tf.Session()
        '''
        Training parameters:
        '''

        self.w2v_dim=10
        self.num_feature=400
        self.batch_size=32
        self.num_epoch=10000
        self.num_hidden_1=100
        self.num_hidden_2=50
        self.num_hidden_3=3

        self.number_of_layers=1

        #self.max_len = 50
        self.max_len_title=13
        self.max_len_body=50

        # self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model=Word2Vec.load('data/word2vec/w2v.model')
        self.index2word_set = set(self.w2v_model.index2word)
        #self.bigram = None
        #self.trigram =None
        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')

        # Model
        self.input_0=tf.placeholder(tf.float32,[self.max_len_title,self.batch_size,self.w2v_dim])
        self.input_1=tf.placeholder(tf.float32,[self.max_len_title,self.batch_size,self.w2v_dim])
        self.input_0_=tf.placeholder(tf.float32,[self.max_len_body,self.batch_size,self.w2v_dim])
        self.input_1_=tf.placeholder(tf.float32,[self.max_len_body,self.batch_size,self.w2v_dim])

        self.dropout_input = tf.placeholder(tf.float32)
        self.dropout_hidden_1 = tf.placeholder(tf.float32)

        self.target = tf.placeholder(tf.float32, [self.batch_size, 3])

        input_0=array_ops.unpack(self.input_0)
        input_1=array_ops.unpack(self.input_1)
        input_0_=array_ops.unpack(self.input_0_)
        input_1_=array_ops.unpack(self.input_1_)


        def _encoder(inputs, reverse=False):
            with tf.variable_scope("GRU_RNN") as scope:
                cell=rnn_cell.BasicLSTMCell(self.w2v_dim)
                stacked_cell = rnn_cell.MultiRNNCell([cell] * self.number_of_layers)
                # state = tf.zeros([1, cell.state_size])
                state = stacked_cell.zero_state(self.batch_size, tf.float32)
                if reverse:
                    inputs=reversed(inputs)
                for time, input_ in enumerate(inputs):
                    if time > 0: scope.reuse_variables()
                    output, state = stacked_cell(input_, state)
                return state
        def _decoder(state, inputs):
            with tf.variable_scope("GRU_RNN") as scope:
                cell=rnn_cell.BasicLSTMCell(self.w2v_dim)
                stacked_cell = rnn_cell.MultiRNNCell([cell] * self.number_of_layers*2)

                for time, input_ in enumerate(inputs):
                    if time > 0: scope.reuse_variables()
                    output, state = stacked_cell(input_, state)
                return output

        with tf.variable_scope('Encoder') as scope:
            state = _encoder(input_0_)
            scope.reuse_variables()
            state_reversed = _encoder(input_0_, reverse=True)


        with tf.variable_scope('Decoder') as scope:
            state = _decoder(tf.concat(1,[state,state_reversed]), input_1_)

        with tf.variable_scope("to_score", initializer=tf.random_uniform_initializer()):
             self.W = tf.get_variable("W", [state.get_shape()[1],3])
             self.b = tf.get_variable("b", [3])

        score = tf.matmul(state, self.W)+self.b

        # score_1 = tf.sigmoid(tf.matmul(out_1, self.W)+self.b)
        # state=tf.concat(1,[score_0,score_1])
        '''
        with tf.variable_scope("to_final", initializer=tf.random_uniform_initializer()):
             self.W = tf.get_variable("W", [state.get_shape()[1],3])
             self.b = tf.get_variable("b", [3])

        final = tf.matmul(state, self.W)+self.b
        '''

        self.y_pred=tf.nn.softmax(score)
        self.cross_entropy = -tf.reduce_mean(self.target*tf.log(self.y_pred))


        # Optimizer.

        global_step = tf.Variable(0)
        optimizer = tf.train.GradientDescentOptimizer(0.1)
        # optimizer = tf.train.AdamOptimizer(0.1)
        gradients, v = zip(*optimizer.compute_gradients(self.cross_entropy))
        gradients, _ = tf.clip_by_global_norm(gradients, 20)
        self.optimizer= optimizer.apply_gradients(zip(gradients, v), global_step=global_step)



        print('Model has been built!')