Example No. 1
def make_model(src_vocab,
               tgt_vocab,
               N=6,
               d_model=512,
               d_ff=2048,
               h=8,
               dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
            """https://zhuanlan.zhihu.com/p/74274453
            #權值初始化 Xavier均勻分佈"""
    return model
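A minimal usage sketch for this helper, assuming the annotated-Transformer building blocks above (EncoderDecoder, MultiHeadedAttention, and friends) are importable and that EncoderDecoder exposes the usual encode(src, src_mask) method; the vocabulary size and shapes below are illustrative only.

import torch

# Hypothetical smoke test: build a tiny 2-layer model and encode a toy batch.
V = 11                                   # toy vocabulary size (illustrative)
model = make_model(V, V, N=2)            # small N keeps the example fast
model.eval()

src = torch.randint(1, V, (1, 10))       # (batch, src_len) of token ids
src_mask = torch.ones(1, 1, 10)          # attend to every source position
memory = model.encode(src, src_mask)     # expected shape: (1, 10, d_model)
print(memory.shape)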
def one_classifier(text, lang, embedding_name, model_path, model_file):

    #--------------------------------------------------------------------------------------------
    #--- LOAD MODEL AND EMBEDDING
    #--------------------------------------------------------------------------------------------
    print(model_file)
    cls = pickle.load(open(model_path + model_file, 'rb'))

    embedding = Embeddings(embedding_name)

    #--------------------------------------------------------------------------------------------
    #--- PROCESSING
    #--------------------------------------------------------------------------------------------

    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)

    # Vectorize once and reuse; the document vector is the mean of the token vectors.
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
        vectorized_text2 = np.reshape(vectorized_text, (1, -1))
        prob = cls.predict_proba(vectorized_text2)[:, 1]
    else:
        # No in-vocabulary tokens: fall back to a zero vector and zero probability.
        vectorized_text = np.zeros(300)
        prob = np.zeros(1)
    #print(cls.classes_)  # check that the class at the second position is L1

    return float(prob[0])
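A hedged call sketch for one_classifier; the path, file name, and embedding name below are placeholders modeled on the other examples in this listing, not values taken from the original code.

# Hypothetical invocation; all names are illustrative.
score = one_classifier(
    text="some document to score",
    lang="en",
    embedding_name="embedding-EN",
    model_path="./data/probability/insikt/",
    model_file="user1_case1_topic_classifier.model",
)
print("P(positive class) =", score)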
Example No. 3
	def _make_model(self, num_tgt_chars, N, d_model, d_ff, h, dropout):
		"""
		
		:param num_tgt_chars: output space
		:param N: number of decoder and encoder layers
		:param d_model: model dimensionality
		:param d_ff: hidden size of the feed-forward neural network
		:param h: number of attention heads
		:param dropout: dropout rate
		:return: model

		"""
		c = copy.deepcopy
		attn = MultiHeadedAttention(h, d_model)
		ff = PositionwiseFeedForward(d_model, d_ff, dropout)
		position = PositionalEncoding(d_model, dropout)

		if self.config.USE_RESNET:
			feature_extractor = ResNet(block=BasicBlock, layers=self.config.RESNET_LAYERS, d_model=self.config.D_MODEL)
		else:
			feature_extractor = FeatureExtractionNetwork(d_model=self.config.D_MODEL)

		direction_embed = Embeddings(d_model, 2)

		model = EncoderDecoder(
			encoder=Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
			decoder=Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
			tgt_embed=nn.Sequential(Embeddings(d_model, num_tgt_chars), c(position)),
			generator=PredictionLayer(d_model, num_tgt_chars),
			feature_extractor=feature_extractor,
			prediction_layer=PredictionLayer(d_model, len(Dataset.CHAR_ID_MAP)),
			bidirectional_decoding=self.config.BIDIRECTIONAL_DECODING,
			direction_embed=direction_embed,
			device=self.device
		)
		
		for p in model.parameters():
			if p.dim() > 1:
				nn.init.xavier_normal_(p)
		
		logging.info("Model created")
		
		return model
Example No. 4
def make_vectorize():
    try:
        #Load the data
        data = request.get_json()

    except Exception as e:
        raise e

    if data == {}:
        return (bad_request())
    else:
        #Get the text and the language
        try:
            lang = data['lang']
        except KeyError:
            # No explicit language: try to detect it from the text itself.
            try:
                lang = detect_language(data['text'])
                print(lang)
            except Exception:
                responses = jsonify(
                    "Error in vectorize: language field is missing")
                return responses
        try:
            text = data['text']
        except KeyError:
            responses = jsonify("Error in vectorize: text is missing")
            return responses

        if lang not in ['en', 'es', 'ar', 'ro', 'fr']:
            responses = jsonify(
                "Language not available. Language must be in ['en','es','ar','ro','fr']"
            )
            return responses
        #Preprocess the text
        print("Vectorize...")

        embeddings = Embeddings(emb_dict[lang])

        processed_text = preprocess(text)
        no_stpw_text = remove_stopwords(processed_text, lang)
        vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embeddings,
                                                      len(no_stpw_text))

        if len(vectorized_tokens) > 0:
            vectorized_text = np.mean(vectorized_tokens, axis=0)
        else:
            vectorized_text = np.zeros(300)
            print(vectorized_text)

        #Send the response codes
        responses = jsonify(vector=vectorized_text.tolist())
        responses.status_code = 200
        return responses
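Assuming this function is registered as a Flask route (the /vectorize path, host, and port below are assumptions, not shown in the snippet), a client call might look like the following sketch.

# Hypothetical client call; route name and port are assumptions.
import requests

resp = requests.post(
    "http://localhost:5000/vectorize",
    json={"text": "sample text to embed", "lang": "en"},
)
print(resp.status_code)              # 200 on success
print(len(resp.json()["vector"]))    # the mean word vector (300-d here)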
Example No. 5
def make_model(src_vocab,
               tgt_vocab,
               N=6,
               d_model=512,
               d_ff=2048,
               h=8,
               dropout=0.1):
    "Helper: Construct a model from hyperparameters"
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionWiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
Example No. 6
def get_topics(text, lang, topics_path):
    #initialization
    embeddings = Embeddings(emb_dict[lang])

    # get the topics dictionary from the path
    topics_dicts = load_data(topics_path)
    topics_dict = topics_dicts[lang]

    topics = list(topics_dict.keys())

    # Cosine-distance threshold below which a topic counts as "close"
    # (0.7 was an earlier value used for English).
    cl = 0.5
    # now vectorize the topics
    vect_dict_topics = [
        (w,
         np.mean(to_vector_single_nonzeros(topics_dict[w], embeddings,
                                           len(topics_dict[w])),
                 axis=0)) for w in topics
    ]
    #print(vect_dict_topics)

    # get topics
    assigned_topics = []
    dists = []

    vectorized_tokens = to_vector_single_nonzeros(text, embeddings, len(text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
    else:
        vectorized_text = np.zeros(300)

    for v in vect_dict_topics:
        dists.append(spatial.distance.cosine(
            vectorized_text, v[1]))  # measure distance to all topics

    good_topics = [
        topics[i].upper() for i in range(len(topics)) if dists[i] < cl
    ]  # choose close topics
    if not good_topics:
        good_topics.append('OTHER')

        # assigned_topics.append(topic)
    assigned_topics.append(good_topics)

    return assigned_topics
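The topic assignment above reduces to comparing the mean text vector with each topic centroid using scipy's cosine distance and keeping topics below the threshold cl; here is a self-contained toy sketch of just that step (the vectors and topic names are made up).

import numpy as np
from scipy import spatial

# Toy illustration of the distance-threshold step, independent of any embeddings.
cl = 0.5                                           # same "closeness" threshold as above
text_vec = np.array([1.0, 0.2, 0.0])
topic_centroids = {"sport": np.array([0.9, 0.1, 0.0]),
                   "finance": np.array([0.0, 0.0, 1.0])}

good_topics = [name.upper() for name, vec in topic_centroids.items()
               if spatial.distance.cosine(text_vec, vec) < cl]
print(good_topics or ['OTHER'])                    # -> ['SPORT']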
Example No. 7
def test():
    emb = Embeddings(128, 1000, 0)
    model = TransformerEncoder(num_layers=2,
                               d_model=128,
                               heads=16,
                               d_ff=8,
                               dropout=0.5,
                               embeddings=emb,
                               max_relative_positions=100)
    #print(model)
    ipt = torch.empty((16, 10, 1), dtype=torch.long).random_(1000)
    embs, outs, lengths = model(ipt)
    print("embs")
    print(embs.size())
    print("outs")
    print(outs.size())
    print("lengths")
    print(lengths)
def two_classifier(text, lang, embedding_name, model_path, model_file_JIH,
                   model_file_EXR):
    #--------------------------------------------------------------------------------------------
    #--- LOAD MODEL AND EMBEDDING
    #--------------------------------------------------------------------------------------------

    cls_JIH = pickle.load(open(model_path + model_file_JIH, 'rb'))
    cls_EXR = pickle.load(open(model_path + model_file_EXR, 'rb'))

    embedding = Embeddings(embedding_name)

    #--------------------------------------------------------------------------------------------
    #--- PROCESSING
    #--------------------------------------------------------------------------------------------

    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)

    # Vectorize once and reuse the result for both classifiers.
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
        vectorized_text2 = np.reshape(vectorized_text, (1, -1))
        prob_JIH = cls_JIH.predict_proba(vectorized_text2)[:, 1]
        prob_EXR = cls_EXR.predict_proba(vectorized_text2)[:, 1]
    else:
        # No in-vocabulary tokens: fall back to zero vectors and zero probabilities.
        vectorized_text = np.zeros(300)
        prob_JIH = np.zeros(1)
        prob_EXR = np.zeros(1)

    # Return the larger of the two class probabilities.
    if prob_JIH > prob_EXR:
        prob = prob_JIH
    else:
        prob = prob_EXR

    return float(prob[0])
Example No. 9
def classifier(annotated_data, lang, user_id, case_id, clas_name):

    #--------------------------------------------------------------------------------------------
    #--- DEFINE FILES AND LANGUAGE
    #--------------------------------------------------------------------------------------------

    model_path = './data/probability/insikt/'
    model_file = user_id + '_' + case_id + '_' + clas_name + '_classifier.model'

    # Map each supported language to its embedding name.
    embedding_names = {
        'en': 'embedding-EN',
        'ar': 'embedding-AR',
        'es': 'embedding-ES',
        'ro': 'embedding-RO',
        'fr': 'embedding-FR',
    }
    embedding_name = embedding_names[lang]

    embedding = Embeddings(embedding_name)
    #--------------------------------------------------------------------------------------------
    #--- GENERAL SCRIPT
    #--------------------------------------------------------------------------------------------

    ########## Tokenize + stopwords
    #print(annotated_data)
    #raw_data=np.array(annotated_data)
    x_train = [i[0] for i in annotated_data]
    #print(x_train)
    y_train = [i[1] for i in annotated_data]  #replace N0 for L0...!!!
    #print(y_train)
    x_train_DL = []

    print('Data training with ' + str(len(x_train)) + ' texts')

    for text in x_train:
        #print(text)
        processed_text = preprocess(text)
        no_stpw_text = remove_stopwords(processed_text, lang)
        if len(
                to_vector_single_nonzeros(no_stpw_text, embedding,
                                          len(no_stpw_text))) > 0:
            vectorized_text = np.mean(to_vector_single_nonzeros(
                no_stpw_text, embedding, len(no_stpw_text)),
                                      axis=0)
        else:
            vectorized_text = np.zeros((300, ) * 1)
        #print(vectorized_text)
        #x_train_DL.append(np.reshape(vectorized_text,(1,-1)))
        x_train_DL.append(vectorized_text)

    ########## Build and test classifiers with 10-fold cross-validation

    skf = StratifiedKFold(n_splits=10, shuffle=True)

    # Stochastic Gradient Descent

    cls = SGDClassifier(loss="log", penalty="l2",
                        max_iter=500).fit(x_train_DL, y_train)
    scores = cross_val_score(cls,
                             x_train_DL,
                             y_train,
                             cv=skf,
                             scoring='accuracy')
    print("Accuracy C-10V EN: %2.1f (+/- %2.1f)" %
          (100 * scores.mean(), scores.std() * 200))
    print(cls.classes_)  # check that class at the second position is 'Yes'
    accuracy = round((100 * scores.mean()), 2)
    ########## Save the model

    pickle.dump(cls, open(model_path + model_file, 'wb'))
    return (accuracy)
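A minimal sketch of the same train-and-score pattern on synthetic 300-dimensional vectors, assuming scikit-learn is installed; the random data stands in for the averaged word vectors and binary labels (note that newer scikit-learn spells the loss "log_loss" instead of "log").

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Toy reproduction of the 10-fold cross-validation step with random vectors.
rng = np.random.RandomState(0)
X = rng.randn(100, 300)                  # stands in for the mean word vectors
y = np.array([0, 1] * 50)                # stands in for the binary labels

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
cls = SGDClassifier(loss="log_loss", penalty="l2", max_iter=500).fit(X, y)
scores = cross_val_score(cls, X, y, cv=skf, scoring="accuracy")
print("Accuracy C-10V: %2.1f (+/- %2.1f)" % (100 * scores.mean(), 200 * scores.std()))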
Example No. 10
def main(morphFileFolder, outputFolder, embeddings_file, positive_words,
         negative_words):
    print "loading embedding vectors..."
    e = Embeddings(embeddings_file)
    print "done!"
    positive_df = pd.read_csv(positive_words)
    negative_df = pd.read_csv(negative_words)
    positive_df["vector"] = positive_df["word"].apply(lambda x: e.get(x))
    negative_df["vector"] = negative_df["word"].apply(lambda x: e.get(x))

    files = [
        f for f in listdir(morphFileFolder) if isfile(join(morphFileFolder, f))
    ]
    recordings = []
    for f in files:
        recordings.append(Recording.Recording(join(morphFileFolder, f)))

    questionSummaries = []
    for r in recordings:
        for i in xrange(len(r.questions)):
            if len(questionSummaries) < (i + 1):
                questionSummaries.append(Question.Question([]))

            questionSummaries[i].mergeWith(r.questions[i])

    #specific metrics comparison across all questions
    nouns = {}
    verbs = {}
    adjectives = {}
    adverbs = {}
    content = {}
    person_1 = {}
    person_2 = {}
    person_3 = {}

    for i, q in enumerate(questionSummaries):
        norm_pos = counter2normDictionary(q.pos, q.word_count)
        norm_per = counter2normDictionary(q.person, q.word_count)
        nouns[i + 1] = norm_pos["noun"]
        verbs[i + 1] = norm_pos["verb"]
        adjectives[i + 1] = norm_pos["adjective"]
        adverbs[i + 1] = norm_pos["adverb"]
        content[i + 1] = norm_pos["noun"] + norm_pos["verb"] + norm_pos[
            "adjective"] + norm_pos["adverb"]
        person_1[i + 1] = norm_per["1"]
        person_2[i + 1] = norm_per["2"]
        person_3[i + 1] = norm_per["3"]
        print "Question " + ` (i + 1) ` + ", avg word count: " + ` (
            q.word_count / len(questionSummaries)) `

    counter2hist(nouns, 'Nouns', outputFolder)
    counter2hist(verbs, 'Verbs', outputFolder)
    counter2hist(adjectives, 'Adjectives', outputFolder)
    counter2hist(adverbs, 'Adverbs', outputFolder)
    counter2hist(content, 'Content words', outputFolder)
    counter2hist(person_1, '1st person', outputFolder)
    counter2hist(person_2, '2nd person', outputFolder)
    counter2hist(person_3, '3rd person', outputFolder)

    #raw metrics for each question
    sentiment_scores = {}
    for i, q in enumerate(questionSummaries):

        positive_score = calculate_sentiment_score(q.words, positive_df, e)
        negative_score = calculate_sentiment_score(q.words, negative_df, e)

        print "Question " + ` (
            i + 1
        ) ` + ", Positive: " + ` positive_score ` + ", Negative: " + ` negative_score ` + ", Overall: " + ` (
            positive_score / negative_score) `
        sentiment_scores[i + 1] = (positive_score / negative_score)
        buildWordCloud(q.contentWords, True, 'Question ' + `
                       (i + 1) ` + ' Content Word Cloud', outputFolder)
        counter2hist(counter2normDictionary(q.pos, q.word_count),
                     'Question ' + ` (i + 1) ` + ' POS', outputFolder)
        counter2hist(counter2normDictionary(q.person, q.word_count),
                     'Question ' + ` (i + 1) ` + ' Person', outputFolder)

    counter2hist(sentiment_scores, 'Sentiment scores', outputFolder)