Example #1
from pymongo import MongoClient
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.classify import MaxentClassifier
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import numpy as np

# project-local helper modules; import names are assumed from the aliases used below
import regx
import func
import feature as f


class Maxent:
	# regexes that extract the word (w) and the label (lbl), mainly from the training data
	w = regx.w
	lbl = regx.lbl

	# helper-function class instance
	func = func.Func()

	# Sastrawi stemmer instance
	factory = StemmerFactory()
	stemmer = factory.create_stemmer()

	# MongoDB connection
	client = MongoClient()
	db = client.indo_db

	def binary_feature(self, sentence, type_feature):
		self.sentence = sentence

		# temporary holder for the raw (unfiltered) training rows
		train = []

		# for IIS training, build the binary features together with the label,
		# e.g. ({"f1": 0, "f2": 0, ..., "f7": 1, "f8": 1, ..., "f12": 0}, "NUM")
		if type_feature == "train_iis":
			for data in sentence:
				label = []
				token = word_tokenize(data)
				for index, tok in enumerate(token):
					if "/" in tok:
						# collect the label
						label.append(self.lbl.search(token[index]).group(1))
						# keep only the word itself in the token list
						token[index] = self.w.search(token[index]).group(1)
					else:
						label.append("O")
				for index, tok in enumerate(token):
					# feature extraction via the Feature class
					featuretrain = f.Feature()
					result = featuretrain.template_feature(token, label, index)
					train.append(result)
		else:
			# for NER (anything other than IIS training), build the binary
			# features only, without a label, e.g.
			# {"f1": 0, "f2": 0, ..., "f7": 1, "f8": 1, ..., "f12": 0}
			token = word_tokenize(sentence)
			label = []
			for index, tok in enumerate(token):
				# feature extraction via the Feature class
				featuretrain = f.Feature()
				result = featuretrain.template_feature(token, label, index)
				train.append(result)

		# drop empty/None rows: Other ("O") tokens are not processed
		train_set = list(filter(None, train))
		return train_set
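
	# Note (hedged): for type_feature == "train_iis" the returned list holds
	# (feature_dict, label) pairs, which is exactly the input shape that
	# nltk.classify.MaxentClassifier.train() consumes in training_weight_iis below.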

	def training_weight_iis(self, paragraph):
		train = []
		for data in paragraph:
			# 1. split the paragraph into sentences
			sentence = sent_tokenize(data)
			for sent in sentence:
				# 2. convert the sentence to lowercase
				sent_lower = sent.lower()
				# 3. convert spelled-out numbers (terbilang) to digits
				sent_conv = self.func.terbilang_to_number(sent_lower)
				print("training word [%s]" % sent_conv)
				# 4. stemming
				tokenize = word_tokenize(sent_conv)

				div_sentence = []
				for tok in tokenize:
					if "/" not in tok:
						# reduce the word to its base (stemmed) form
						tok = self.stemmer.stem(tok)
					elif "/con" in tok:
						# stem the word, then match it against the condition gazetteer
						tok = self.stemmer.stem(self.w.search(tok).group(1)) + "/CON"
					elif "/" in tok:
						word = self.w.search(tok).group(1)
						label = self.lbl.search(tok).group(1)
						tok = word + "/" + label.upper()
					div_sentence.append(tok)
				train.append(" ".join(div_sentence))

		# train on sentences that have been reduced to their base forms
		me_classifier = MaxentClassifier.train(self.binary_feature(train, "train_iis"), 'iis', trace=100, max_iter=2000, min_lldelta=0.5)
		return me_classifier

	def training_ner(self, paragraph, classification):
		sentence = sent_tokenize(paragraph)

		train = []
		sentence_ne = ""
		# 1. split the paragraph into sentences
		for data in sentence:
			tokenize = word_tokenize(data)
			div_sentence = []
			for word in tokenize:
				# check whether the token is a known location (text index on db.location)
				check_kota = self.db.location.count_documents({"$text": {"$search": word.lower()}}) >= 1
				if not check_kota:
					# if the word is not a city, reduce it to its base form
					word = self.stemmer.stem(word)
				div_sentence.append(word)
			train.append(" ".join(div_sentence))
			# parameters: self.div_sentence_ner(stemmed_sentence, original_sentence, classifier)
			sentence_ne = self.div_sentence_ner("".join(train), " ".join(tokenize), classification)
			# reset the train array so it is not carried into the next NER pass
			train = []

		# note: sentence_ne is overwritten on every iteration, so only the
		# result dict of the paragraph's last sentence is returned
		return sentence_ne

	def div_sentence_ner(self, sentence_stem, sentence_unstem, classification):
		# sentence already reduced to base (stemmed) forms
		sentence_stem = sentence_stem.lower()
		sent_stem_conv = self.func.terbilang_to_number(sentence_stem)

		# original sentence (not stemmed)
		sentence_unstem = sentence_unstem.lower()
		sent_unstem_conv = self.func.terbilang_to_number(sentence_unstem)

		featureset = self.binary_feature(sent_stem_conv, "train_ner")
		self.classification = classification
		token = word_tokenize(sent_unstem_conv)

		entity = ["ORG", "LOC", "NUM", "CON"]
		temp_sentence = []

		# result object that collects the return values
		result = {}

		result_entity = {}
		result_index_entity = {}

		for index, feature in enumerate(featureset):
			if sum(feature.values()) != 0:
				# not Other: at least one binary feature fired
				print(' ' * 20 + '%s' % token[index])
				print(' ' * 4 + 'p(ORG)      p(LOC)      p(NUM)      p(CON)')
				print('-' * (28 + 24))
				pdist = classification.prob_classify(feature)
				en = np.array([pdist.prob('ORG'), pdist.prob('LOC'), pdist.prob('NUM'), pdist.prob('CON')])
				en_index = np.argmax(en)
				print(en)
				print()

				if "/" not in token[index]:
					# append the word together with its predicted entity label
					temp_sentence.append(token[index] + "/" + entity[en_index])
					result_index_entity[token[index]] = index

					if entity[en_index] in result_entity:
						# entity type already seen: append to its existing array
						result_entity[entity[en_index]].append(token[index])
					else:
						# new entity type: start a fresh array under a new key
						result_entity[entity[en_index]] = [token[index]]
			else:
				# not an entity: append the token unchanged
				temp_sentence.append(token[index])

		sentence = " ".join(temp_sentence)

		result["text_tweet"] = sentence
		result["entity"] = result_entity
		result["entity_position"] = result_index_entity

		return result
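
A minimal usage sketch (hedged): it assumes the class above lives in a project-local module named maxent and that entity tokens in the training corpus carry a /label suffix; the sentence content is purely illustrative.

# hypothetical driver script; the module name "maxent" and the corpus are assumptions
import maxent

classify = maxent.Maxent()

# labeled paragraphs: entity tokens carry a /label suffix
corpus = ["Banjir melanda Yogyakarta/loc setinggi 50/num cm."]

# 1. train the IIS weights from the labeled corpus
classifier = classify.training_weight_iis(corpus)

# 2. tag a new, unlabeled paragraph with the trained classifier
tagged = classify.training_ner("Banjir melanda Sleman setinggi 30 cm.", classifier)
print(tagged["entity"])  # e.g. {"LOC": ["sleman"], "NUM": ["30"]}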
Example #2
import func


def test():
    fun = func.Func()
    # Func exposes a val attribute that defaults to 1
    assert fun.val == 1
Example #3
# Keep the data used for IIS training separate from the data used for the NER
# step, because the two processes differ: in IIS training "Yogyakarta/LOC"
# keeps its /LOC suffix, while for NER it becomes "Yogyakarta LOC" (the "/"
# symbol is removed).
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# project-local modules; the import names below are assumed from the aliases used
import maxent as m
import stopword_removal as s
import remove_tag as r
import func as f
import dbmodel as d

# create Maxent classifier object
classify = m.Maxent()

# create stopword-removal object
stopword = s.StopwordRemoval()

# create tag-removal object
tag = r.RemoveTag()

# create helper-function object
func = f.Func()

# create database-model object
dbmodel = d.DBModel()

#--------------------------------------------------------------------------
# open the IIS training result
#--------------------------------------------------------------------------
classifier = func.open_file('iis.pickle')

# define the month
month = "april"

# define the cleaned month name used for the tweet data
month_data_tweet = month
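
A hedged sketch of how 'iis.pickle' could have been produced beforehand: train the IIS classifier once and serialize it for reuse. The plain pickle round-trip and the one-sentence corpus are assumptions; func.open_file presumably performs the matching load.

# one-off training script (hypothetical); the corpus content is illustrative
import pickle

training_paragraphs = ["Banjir melanda Yogyakarta/loc setinggi 50/num cm."]
me_classifier = classify.training_weight_iis(training_paragraphs)
with open('iis.pickle', 'wb') as fh:
    pickle.dump(me_classifier, fh)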
Example #4
#coding:utf-8
import pandas as pd
import os
import re  #Regular expression
import copy
import json
import csv
import numpy as np
import func
import simulate

S = simulate.Simulate()
X = func.Func()


def getPipsPerMonth(currency, codeStr, codeArgValStr, year):
    spread = X.getSpread(S.getCurrency())
    try:
        result = X.getResult(currency, codeStr, codeArgValStr, year)
        if len(result.index) > 0:
            # elapsed months, counting the last day as a fraction of 31
            month = float(result['month'][len(result.index) - 1]) - 1 + float(
                result['day'][len(result.index) - 1]) / 31
            # 60 min * 24 h * 20 trading days ~ minutes per trading month
            deltaRate = result.sum()['delta'] / (60 * 24 * 20 * month)
            average = result.mean()['gain']
            if deltaRate > 1:
                countPerMonth = len(result.index) / month / deltaRate
            else:
                countPerMonth = len(result.index) / month
            return (average - spread) * countPerMonth
        else:
            return 0
    except Exception:
        # the original snippet is truncated here; returning 0 on failure is an
        # assumed fallback that mirrors the empty-result branch above
        return 0
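
A hypothetical call (hedged): the code-string arguments and the year depend on the project's data files, so the values below are placeholders only.

# illustrative invocation; all argument values are assumptions
pips = getPipsPerMonth('USDJPY', 'codeStr', 'codeArgValStr', 2016)
print(pips)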