def addMultiWords(self, listOfMultiWords):
    """ Updates the internal regex with a list of multiwords """

    buff = StringIO.StringIO()

    for multiWord in listOfMultiWords:
        if multiWord not in self.multiWordsRegex and multiWord not in buff.getvalue():
            buff.write(self.regexTemplate.format(multiWord))

            # add a normalized (no accents) version
            if multiWord != Preprocessor.normalize(multiWord):
                buff.write(self.regexTemplate.format(Preprocessor.normalize(multiWord)))

    if len(self.multiWordsRegex) == 0:
        self.multiWordsRegex = buff.getvalue().strip('|')
    else:
        self.multiWordsRegex += "|" + buff.getvalue().strip('|')

    buff.close()
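# A minimal, self-contained sketch of the pattern addMultiWords accumulates, assuming
# regexTemplate looks something like "{0}|" (the real template lives in the class and
# is not shown here). The helper name extend_alternation is hypothetical; it only
# illustrates how new alternatives are appended to an existing alternation without
# duplicating entries.
import re

def extend_alternation(existing, new_terms, template="{0}|"):
    parts = []
    for term in new_terms:
        if term not in existing and term not in "".join(parts):
            parts.append(template.format(term))
    chunk = "".join(parts).strip("|")
    return chunk if not existing else existing + "|" + chunk

pattern = extend_alternation("", ["banco central", "taxa de juro"])
print(re.findall(pattern, "o banco central subiu a taxa de juro"))
# ['banco central', 'taxa de juro']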
def Predict():
    """
    Predicts the data from DATA/test.csv
    :return:
    """
    # get user input from file test.csv
    csvhand = Handler.CSVHandler("DATA/test.csv")
    pred_data = csvhand.getData()
    pred_data_pro = Preprocessor.preprocess(pred_data, train=False)
    fieldsInData = list(Preprocessor.getFI()) + list(Preprocessor.getSubjects())  # fields are FI_X and S_X

    predictions = Model.Predict(pred_data_pro)
    C = Model.getClusterCenters()  # cluster centers

    for pred_i in range(len(predictions)):
        pred = predictions[pred_i]  # get the prediction value from the list
        Ctarget = C[pred]  # get the corresponding cluster
        student = pred_data_pro.values[pred_i]  # get the row of values (for a student) for which the prediction is done
        print("Needed ", end=" ")
        for att_i in range(len(fieldsInData)):
            Catt = Ctarget[att_i]  # cluster attribute value
            studentAtt = student[att_i]  # attribute value present in the data
            field = fieldsInData[att_i]  # column name
            if Catt > 0.5:  # if the value is high; the threshold might be changed later
                print(field, end=" ")  # print the field or append it to the result array
        print()
    return
def spectrograms_augmentation(self, class_to_augment, many, version, methods):
    spec_aug = []

    if version == 1:
        aug_dir = self.augmented_spec_v1_dir + class_to_augment + "/"
    elif version == 2:
        aug_dir = self.augmented_spec_v2_dir + class_to_augment + "/"

    if not os.path.exists(aug_dir):
        os.makedirs(aug_dir)

    files = os.listdir(aug_dir)
    if len(files) >= many:
        for file in files:
            # os.listdir returns bare file names, so join them with the directory
            spec = np.load(os.path.join(aug_dir, file))
            spec_aug.append(spec)
            if len(spec_aug) >= many:
                break
        return np.asarray(spec_aug, dtype=np.float32)

    preprocessor = Preprocessor(aug_dir, version=version, dump=True)
    shutil.rmtree(aug_dir)
    os.makedirs(aug_dir)
    audio_signals_aug = self.audio_signals_augmentation(class_to_augment, many, methods)

    print("creating new spectrograms...")
    name = 0
    for sig_aug in tqdm(audio_signals_aug):
        spec = preprocessor.compute_spectrogram(sig_aug, str(name) + ".npy")
        spec_aug.append(spec)
        name += 1
        if len(spec_aug) >= many:
            break

    return np.asarray(spec_aug, dtype=np.float32)
def Run(source: str, debug: bool):
    try:
        prog = Program(source)

        preproc = Preprocessor()
        preproc.preprocess(prog)
        # for l in prog.preprocessed:
        #     print(l)

        assembler = Assembler()
        assembler.assemble(prog)
        # for l in prog.labels:
        #     print(l, f" Position: {l.position}")
        # for i in prog.instructions:
        #     print(i, f" Position: {i.position} Label: {i.labelName}")
        #     for p in i.parameters:
        #         print("    ", p, end="")
        #         if p.labelName != None:
        #             print(f" {p.labelName}")
        #         else:
        #             print("")
        # for b in prog.binary:
        #     print("%04X " % b, end="")
        # print("")

        computer = Computer()
        computer.loadProgram(prog)

        if debug == False:
            computer.run()
            for l in prog.labels:
                if l.size > 0:
                    print("%13s (%6s[%3d]): " % (l.name, l.datatype, l.size), end="")
                    for i in range(l.position, l.position + l.size):
                        print("%d " % computer.memory[i], end="")
                    print("")
        else:
            debugger = Debugger(computer, prog)
            debugger.run()

    except PreprocessorError as e:
        print(e)
    except AssemblerError as e:
        print(e)
    except CompilerError as e:
        print(e)
    except Exception as e:
        raise e
def __init__(self, sample_rate=32000, dim_to_conform=3000):
    self.class_id_mapping = {
        'Hi-hat': 0, 'Saxophone': 1, 'Trumpet': 2, 'Glockenspiel': 3,
        'Cello': 4, 'Knock': 5, 'Gunshot_or_gunfire': 6, 'Clarinet': 7,
        'Computer_keyboard': 8, 'Keys_jangling': 9, 'Snare_drum': 10,
        'Writing': 11, 'Laughter': 12, 'Tearing': 13, 'Fart': 14,
        'Oboe': 15, 'Flute': 16, 'Cough': 17, 'Telephone': 18, 'Bark': 19,
        'Chime': 20, 'Bass_drum': 21, 'Bus': 22, 'Squeak': 23,
        'Scissors': 24, 'Harmonica': 25, 'Gong': 26, 'Microwave_oven': 27,
        'Burping_or_eructation': 28, 'Double_bass': 29, 'Shatter': 30,
        'Fireworks': 31, 'Tambourine': 32, 'Cowbell': 33,
        'Electric_piano': 34, 'Meow': 35, 'Drawer_open_or_close': 36,
        'Applause': 37, 'Acoustic_guitar': 38, 'Violin_or_fiddle': 39,
        'Finger_snapping': 40
    }
    self.classes_frequency = {}
    self.classes_percent = {}
    self.classes_verified = {}
    self.files = []
    self.labels = []
    self.verified = []
    self.files_loaded = set()
    self.train_csv = False
    self.sample_rate = sample_rate
    self.preprocessor = Preprocessor(dump=True)
    self.dim_to_conform = dim_to_conform
def preprocess(sentence):
    # Uses functions in Preprocessor.py to format sentence, including accounting for spelling errors
    normalized_sentence = Preprocessor.sentence_normalizer(sentence)
    formatted_sentence = Preprocessor.sentence_formatter(normalized_sentence)  # remove punctuation
    lemmatized_sentence = Preprocessor.sentence_lemmatizer(formatted_sentence)  # lemmatize words
    cleaned_sentence = Preprocessor.sentence_cleaner(lemmatized_sentence)  # remove stopwords
    preprocessed_sentence = cleaned_sentence
    #print(preprocessed_sentence)
    return preprocessed_sentence
def regularize_data():
    # Check if the unregularized data exists. If not, construct it
    if not os.path.exists('TrainingSet.pkl'):
        Preprocessor.process_mnist()

    # First pass: find the global maxima over the training, validation and test sets
    maxEdgeness = 0
    maxCornerness = 0
    for set_name in ['TrainingSet.pkl', 'ValidationSet.pkl', 'TestSet.pkl']:
        edgeness, cornerness, answers = pickle.load(open(set_name, 'r'))
        for i in xrange(len(edgeness)):
            for x in xrange(28):
                for y in xrange(28):
                    if maxEdgeness < edgeness[i][x][y]:
                        maxEdgeness = edgeness[i][x][y]
                    if maxCornerness < cornerness[i][x][y]:
                        maxCornerness = cornerness[i][x][y]

    # Second pass: scale every value by the maxima and dump the regularized sets
    for set_name, out_name in [('TrainingSet.pkl', 'TrainingSetRegularized.pkl'),
                               ('ValidationSet.pkl', 'ValidationSetRegularized.pkl'),
                               ('TestSet.pkl', 'TestSetRegularized.pkl')]:
        edgeness, cornerness, answers = pickle.load(open(set_name, 'r'))
        for i in xrange(len(edgeness)):
            for x in xrange(28):
                for y in xrange(28):
                    edgeness[i][x][y] /= maxEdgeness
                    cornerness[i][x][y] /= maxCornerness
        pickle.dump([edgeness, cornerness, answers], open(out_name, mode='w'))
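# A compact sketch of the same normalization done with NumPy: each feature is divided
# by its own maximum taken across all splits. The shapes and the three-way split are
# assumptions mirroring the loops above; this is an illustration, not the project's code.
import numpy as np

def normalize_by_global_max(splits):
    """splits: list of arrays for one feature (train/val/test); returns scaled copies."""
    global_max = max(float(s.max()) for s in splits)
    return [s / global_max for s in splits]

train_edge, val_edge, test_edge = (np.random.rand(5, 28, 28) for _ in range(3))
train_edge_n, val_edge_n, test_edge_n = normalize_by_global_max([train_edge, val_edge, test_edge])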
def load_spectrogram(self, version):
    if len(self.files) == 0:
        raise NameError("load the file name list from the csv file first")

    if version == 1:
        data_path = "./dataset/spec/ver1/"
    elif version == 2:
        data_path = "./dataset/spec/ver2/"

    preprocessor = Preprocessor(spectrogram_path=data_path, version=version, test=False, dump=True)

    if self.train_csv:
        audio_path = "./dataset/audio_train/"
    else:
        audio_path = "./dataset/audio_test/"

    print("loading spectrograms...")
    for file_name in tqdm(self.files):
        spec_file_name = file_name.replace(".wav", ".npy")
        spec_file_name = os.path.join(data_path, spec_file_name)
        try:
            spec = np.load(spec_file_name)
        except FileNotFoundError:
            print(file_name, " spectrogram does not exist, computing it from the original audio file")
            audio_file_name = os.path.join(audio_path, file_name)
            signal, sample_rate = librosa.load(audio_file_name, sr=self.sample_rate, mono=True)
            spec = preprocessor.compute_spectrogram(signal, os.path.basename(spec_file_name))

        self.spectrograms.append(spec)

        # compute running statistics over the spectrogram lengths (Welford-style update)
        Xk = spec.shape[1]
        if self.spec_statistic["max"] is None or Xk > self.spec_statistic["max"]:
            self.spec_statistic["max"] = Xk
        if self.spec_statistic["min"] is None or Xk < self.spec_statistic["min"]:
            self.spec_statistic["min"] = Xk

        k = len(self.spectrograms)
        delta = (Xk - self.spec_statistic["average"]) / k
        self.spec_statistic["average"] = self.spec_statistic["average"] + delta
        self.spec_statistic["variance"] = (((k - 1) * self.spec_statistic["variance"]) / k) \
            + (delta * (Xk - self.spec_statistic["average"]))
        self.spec_statistic["len_hist"][Xk] = self.spec_statistic["len_hist"].setdefault(Xk, 0) + 1

    return self.spectrograms, self.labels
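# A small, self-contained sketch of the running mean/variance update used above
# (Welford's online algorithm), applied to a plain list of spectrogram widths.
# The list of values is made up for the illustration.
def running_stats(values):
    mean, m2 = 0.0, 0.0
    for k, x in enumerate(values, start=1):
        delta = x - mean
        mean += delta / k
        m2 += delta * (x - mean)
    variance = m2 / len(values) if values else 0.0  # population variance, as in the loop above
    return mean, variance

print(running_stats([120, 98, 143, 101]))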
def CoarseClassify(trainfile, testfile):
    fulltrainX, fulltrainFY, fulltrainCY = Preprocessor.getdataset(trainfile)
    fulltestX, fulltestFY, fulltestCY = Preprocessor.getdataset(testfile)

    vec = TfidfVectorizer(binary=True, use_idf=True, decode_error='ignore')
    tfidf_train_data = vec.fit_transform(fulltrainX)
    tfidf_test_data = vec.transform(fulltestX)

    trainX = tfidf_train_data.toarray().tolist()
    testX = tfidf_test_data.toarray().tolist()

    return classify(trainX, fulltrainCY, testX, fulltestCY, fulltrainFY, fulltestFY)
def preprocess(sentence):
    # Uses functions in Preprocessor.py to format sentence
    formatted_sentence = Preprocessor.sentence_formatter(sentence)  # remove punctuation
    lemmatized_sentence = Preprocessor.sentence_lemmatizer(formatted_sentence)  # lemmatize words
    cleaned_sentence = Preprocessor.sentence_cleaner(lemmatized_sentence)  # remove stopwords
    preprocessed_sentence = cleaned_sentence
    return preprocessed_sentence
def CoarseClassifytext(trainfile, text):
    process = "dummy:dummy " + text
    textdf = pd.DataFrame(data=[process])

    fulltrainX, fulltrainFY, fulltrainCY = Preprocessor.getdataset(trainfile)
    fulltestX, Trash1, Trash2 = Preprocessor.preprocess(textdf)

    vec = TfidfVectorizer(binary=True, use_idf=True, decode_error='ignore')
    tfidf_train_data = vec.fit_transform(fulltrainX)
    tfidf_test_data = vec.transform(fulltestX)

    trainX = tfidf_train_data.toarray().tolist()
    testX = tfidf_test_data.toarray().tolist()

    return predict(testX), testX, trainX, fulltrainCY, fulltrainFY
def testPreProcessor():
    csvhand = han.CSVHandler("../DATA/input.csv")
    csvhand.open()
    data = csvhand.getData()
    data_train = pr.preprocess(data)

    csvhand = han.CSVHandler("../DATA/input.csv")
    csvhand.open()
    data_test = csvhand.getData()
    data_test = pr.preprocess(data_test, train=False)

    import pdb
    pdb.set_trace()
    return
def main():
    """Drives the entire translation process."""

    # Preprocess the file
    pp = Preprocessor()
    pp.remove_white_spaces_comments()
    pp.write_temp_file()

    # First pass - adds labels to the symbol table.
    parser = Parser(sys.argv[1] + '.tmp')
    symbol_table = SymbolTable()
    pc = -1
    while parser.has_more_commands():
        parser.advance()
        command_type = parser.command_type()
        if command_type == 'A_COMMAND' or command_type == 'C_COMMAND':
            pc += 1
        elif command_type == 'L_COMMAND':
            label = parser.symbol()
            symbol_table.addEntry(label, pc + 1)

    # Second pass - handles variable names and writes the *.hack file.
    ram_address = 16
    parser = Parser(sys.argv[1] + '.tmp')
    code = Code()
    file_name = parser.get_file_name()
    hack_file = open(file_name + '.hack', 'w')
    while parser.has_more_commands():
        parser.advance()
        command_type = parser.command_type()
        if command_type == 'A_COMMAND':
            a_symbol = parser.symbol()
            if a_symbol[0] in '0123456789':
                a_symbol_binary = code.convert_to_binary(a_symbol)
                hack_file.write('0' + a_symbol_binary + '\n')
            else:
                if symbol_table.contains(a_symbol) is False:
                    symbol_table.addEntry(a_symbol, ram_address)
                    ram_address += 1
                address = symbol_table.GetAddress(a_symbol)
                address_binary = code.convert_to_binary(address)
                hack_file.write('0' + address_binary + '\n')
        elif command_type == 'C_COMMAND':
            comp = code.comp(parser.comp())
            dest = code.dest(parser.dest())
            jump = code.jump(parser.jump())
            hack_file.write('111' + comp + dest + jump + '\n')
    hack_file.close()
def __call__(self, data, targetClass, num_of_rules):
    if self.dataOK(data):  # checks whether targetClass is discrete
        original_data = data
        data = Preprocessor.generateFeatures(data, targetClass)

        # initialization of beams
        beam = [SDRule(data=data, targetClass=targetClass, g=self.g)] * self.beamWidth
        newBeam = [SDRule(data=data, targetClass=targetClass, g=self.g)] * self.beamWidth
        worstRuleIndex = 0

        improvements = True
        while improvements:
            improvements = False
            for rule in beam:
                # for each feature, try extending the rule with one more condition
                for feature in data.domain.attributes:
                    newRule = rule.cloneAndAddCondition(feature, 'True')
                    if newRule.support > self.minSupport and \
                       self.betterThanWorstRule(newRule, newBeam, worstRuleIndex) and \
                       self.isRelevant(newRule, newBeam):
                        worstRuleIndex = self.replaceWorstRule(newRule, newBeam, worstRuleIndex)
                        improvements = True
            beam = newBeam

        # perform rule subset selection
        if num_of_rules != 0:
            beam = self.ruleSubsetSelection(beam, num_of_rules, data)

        targetClassRule = SDRule(original_data, targetClass, conditions=[], g=1)

        # change beam so the rules apply to original data
        fixedBeam = [rule.getFixed(original_data) for rule in beam]

        return SDRules(fixedBeam, targetClassRule)
def testPreProcessor():
    csvHandler = han.CSVHandler("../DATA/input.csv")
    csvHandler.open()  # todo remove this kind of useless code
    data = csvHandler.getData()
    data = pr.preprocess(data)
    print(data)
    return
def train(samples_proportion=0.7):
    global words_in_ham, ham_word_count, words_in_spam, spam_word_count, raw_ham_prob, raw_spam_prob

    ham, spam = read_spam_ham()
    print("Spam size: " + str(len(spam)) + " Ham size: " + str(len(ham)))

    all_emails = append_ham_and_spam(ham, spam)
    random.shuffle(all_emails)
    print('Corpus size = ' + str(len(all_emails)) + ' emails')

    features = [(Preprocessor.get_features(email, ' '), label) for (email, label) in all_emails]
    print('Collected ' + str(len(features)) + ' feature sets')

    '''
    # define Support value in %
    support = 10
    spam_support_count = (spam_size * 10) / 100;
    ham_support_count = (ham_size * 10) / 100;
    print('Spam support count:' + str(spam_support_count))
    print('Ham support count:' + str(ham_support_count))
    # get the spam frequent itemset and ham frequent itemset
    # spam_frequent, ham_frequent = get_frequent(all_features, spam_support_count, ham_support_count)
    # train our own naive bayes classifier and collect a dictionary of raw probabilities of words
    '''

    train_size = int(len(features) * samples_proportion)
    train_set, test_set = features[:train_size], features[train_size:]

    ham_mail_count, spam_mail_count = mails_in_ham_spam(train_set)
    spam_prior = 1.0 * spam_mail_count / len(train_set)
    ham_prior = 1.0 * ham_mail_count / len(train_set)

    words_in_ham, words_in_spam = frequency_in_ham_spam(train_set)
    spam_vocab = len(spam_word_count)
    ham_vocab = len(ham_word_count)

    t = get_probabilities_in_each_class(ham_prior, words_in_ham, ham_vocab, ham_word_count,
                                        raw_ham_prob, raw_spam_prob, spam_prior, words_in_spam,
                                        spam_vocab, spam_word_count, test_set, train_set)
    ham_prior, words_in_ham, ham_vocab, raw_ham_prob, raw_spam_prob, spam_prior, \
        words_in_spam, spam_vocab, test_set, train_set = get_parameters(t)

    #print("Train Size:" + str(len(train_set)) + str(' Test size:') + str(len(test_set)))
    #evaluate(train_set, test_set, raw_spam_prob, raw_ham_prob, words_in_spam, words_in_ham,
    #         spam_vocab, ham_vocab, spam_prior, ham_prior)

    classifier = NaiveBayesClassifier(list(spam_word_count), list(ham_word_count))
    t = classifier.prob_classify(classifier, train_set).max()
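# A compact, self-contained sketch of the quantities the training code above estimates:
# class priors and per-word likelihoods. The tiny labelled corpus and the add-one
# (Laplace) smoothing are assumptions made for illustration; the project's helper
# get_probabilities_in_each_class is not shown in this file.
from collections import Counter

train_set = [({"win": 1, "money": 1}, "spam"),
             ({"meeting": 1, "today": 1}, "ham"),
             ({"win": 1, "prize": 1}, "spam")]

spam_docs = [f for f, label in train_set if label == "spam"]
ham_docs = [f for f, label in train_set if label == "ham"]
spam_prior = 1.0 * len(spam_docs) / len(train_set)
ham_prior = 1.0 * len(ham_docs) / len(train_set)

spam_counts = Counter(w for f in spam_docs for w in f)
vocab = {w for f, _ in train_set for w in f}
# P(word | spam) with add-one smoothing
p_word_spam = {w: (spam_counts[w] + 1) / (sum(spam_counts.values()) + len(vocab)) for w in vocab}
print(spam_prior, ham_prior, p_word_spam["win"])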
def inferTarget(self, opinion):
    """
    Tries to identify mentions of the targets in a message
    Params: opinion -> Opinion object
    Returns: list of cloned Opinion objects with the inferred target, mention and metadata set, or None
    """
    info = u"Targets: "
    sentence = Preprocessor.separateSpecialSymbols(opinion.sentence.lower())
    #print sentence
    matches = []

    for target in self.targets:
        for name in target.names:
            if sentence.find(" " + name + " ") != -1:
                matches.append(name)
        for nickName in target.nicknames:
            if sentence.find(" " + nickName + " ") != -1:
                matches.append(nickName)
        for ergo in target.ergos:
            if sentence.find(" " + ergo + " ") != -1:
                matches.append(ergo)

    targets = {}
    for mention in matches:
        targetId = self.getTargetByMention(mention)
        if targetId != None and not self.isFalsePositive(mention, sentence):
            if mention not in info:
                info += mention + ","
            targets[targetId] = mention

    if len(targets) > 0:
        results = []
        info = info.strip(',')
        for target, mention in targets.items():
            results.append(opinion.clone(target=target, mention=mention, metadata=info))
        return results
    else:
        return None
def get_prediction(filename):
    preprocessor = Preprocessor.Preprocessor()
    # trainer = Trainer.Trainer(preprocessor)
    predictor = Predictor.Predictor(preprocessor)
    # image_name_list = os.listdir(os.path.join(os.getcwd(), "img"))
    # for image_name in image_name_list:
    #     image_path = os.path.join(os.path.join(os.getcwd(), "img"), image_name)
    return predictor.predict([filename])
def menu():
    PreProcessor = Preprocessor.PreProcessor()

    opcao = input("Prepare training and test sets yes/No? ")
    if opcao.lower() in ["yes", "y"]:
        prepararBasesTreinoTeste(PreProcessor)

    opcao = input("Prepare training and test sets without stemming yes/No? ")
    if opcao.lower() in ["yes", "y"]:
        prepararBasesTreinoTesteWithoutStemming(PreProcessor)

    PreProcessor = Preprocessor.PreProcessor()
    opcao = input("Prepare training and test sets from the windows folder Yes/no? ")
    if opcao.lower() in ["", "yes", "y"]:
        prepararBasesTreinoTestePorJanela(PreProcessor)

    opcao = input("Prepare base Yes/no (not used; the message is preprocessed before classifying): ")
    if opcao.lower() in ["", "yes", "y"]:
        prepararBase(PreProcessor)
def generate_inverted_list(self, docs_dict):
    inv_list = dict()
    for (doc_id, doc) in docs_dict.items():
        term_list = Preprocessor.preprocessor_tokenizer(doc)
        for term in term_list:
            if term not in inv_list:
                inv_list[term] = []
            inv_list[term].append(doc_id)
    return inv_list
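# A minimal, self-contained illustration of the inverted index built above. The
# whitespace split stands in for Preprocessor.preprocessor_tokenizer, which is not
# defined here (an assumption made only for the example).
from collections import defaultdict

def build_inverted_index(docs):
    index = defaultdict(list)
    for doc_id, text in docs.items():
        for term in text.lower().split():
            index[term].append(doc_id)
    return dict(index)

docs = {1: "information retrieval systems", 2: "retrieval of medical information"}
print(build_inverted_index(docs))
# e.g. {'information': [1, 2], 'retrieval': [1, 2], 'systems': [1], 'of': [2], 'medical': [2]}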
def test_Response(self):
    questions, responses = Preprocessor.load_corpus()
    question_list = Processor.vectorizer(questions)
    self.assertEqual(
        Processor.process("what is a star?", question_list, responses),
        "Stars are mostly made of hydrogen and helium\n")
    self.assertEqual(
        Processor.process("Hello", question_list, responses),
        "Hello! I am Nova.\n")
    self.assertEqual(
        Processor.process("What is your job?", question_list, responses),
        "I teach you about astronomy and geography!\n")
def preProcess(self):
    try:
        # preprocess the data and update df with the processed data
        preprocessor = Preprocessor(self.df)
        self.df = preprocessor.df
        self.hasProcessed = True
    except KeyError:
        tkMessageBox.showinfo("K Means Clustering", "The selected file is invalid")
        return

    # pop message to user
    tkMessageBox.showinfo("K Means Clustering", "Preprocessing completed successfully!")
def getTopNFormulae(text, n):
    global model, multiWordDict, thresholdFormulaConf
    words = pp.parse(text, multiWordDict)
    nameList, valueList = getFormulaConfidence(getConceptConfidence(words))
    result = {}
    index = 0
    for value in valueList:
        result[nameList[index]] = value
        index = index + 1
    return sorted(result, key=result.get, reverse=True)[0:n]
def Train():
    """
    Trains the model with data from DATA/input.csv
    :return:
    """
    # initialize source - csv source
    csvhand = Handler.CSVHandler("DATA/input.csv")
    train_data = csvhand.getData()
    train_data_pro = Preprocessor.preprocess(train_data)
    Model.Init()  # initialize the model
    labels_ = Model.Train(train_data_pro)
    return
def train_structured_perceptron():
    model = pos.StructuredPerceptron()
    fail = Preprocessor.load_sp_data('data/train_dev_data/PT.5000.train')
    train_data = []
    train_labels = []
    for _, (d, l) in enumerate(fail):
        train_data.append(d)
        train_labels.append(l)
    train_model(train_data, train_labels, model, 'models/sp-pt.model', (5, 0.2))
def process_image(self, img_path):
    self._img_path = img_path
    image_array = image.img_to_array(image.load_img(img_path, target_size=self._target_size))

    average_edgeness = np.zeros((image_array.shape[0], image_array.shape[1]))
    average_cornerness = np.zeros((image_array.shape[0], image_array.shape[1]))

    # Computes the feature values of all points on a channel-by-channel basis
    for color_channel in xrange(image_array.shape[2]):
        channel_array = np.zeros((image_array.shape[0], image_array.shape[1]))
        for x in xrange(len(channel_array)):
            for y in xrange(len(channel_array[0])):
                channel_array[x][y] = image_array[x][y][color_channel]

        # Could potentially crash if not given a 56 x 56 image as input for this method
        channel_edgeness, channel_cornerness = Preprocessor.process_image_sd2(channel_array)

        for x in xrange(len(channel_array)):
            for y in xrange(len(channel_array[0])):
                # Next 2 lines utilize "magic numbers" that are the maximum that was
                # found over the MNIST dataset, effectively regularizing the data
                average_edgeness[x][y] += channel_edgeness[x][y] / 0.634789974395 / image_array.shape[2]
                average_cornerness[x][y] += channel_cornerness[x][y] / 0.16428281162 / image_array.shape[2]

    # Scales image size down to 28 by 28 to fit the model (2x2 block averaging)
    ensmalled_edgeness = np.zeros((image_array.shape[0] / 2, image_array.shape[1] / 2))
    ensmalled_cornerness = np.zeros((image_array.shape[0] / 2, image_array.shape[1] / 2))
    for x in xrange(len(ensmalled_edgeness)):
        for y in xrange(len(ensmalled_edgeness[0])):
            ensmalled_edgeness[x][y] += average_edgeness[2 * x][2 * y]
            ensmalled_edgeness[x][y] += average_edgeness[2 * x + 1][2 * y]
            ensmalled_edgeness[x][y] += average_edgeness[2 * x][2 * y + 1]
            ensmalled_edgeness[x][y] += average_edgeness[2 * x + 1][2 * y + 1]
            ensmalled_edgeness[x][y] /= 4.0
            ensmalled_cornerness[x][y] += average_cornerness[2 * x][2 * y]
            ensmalled_cornerness[x][y] += average_cornerness[2 * x + 1][2 * y]
            ensmalled_cornerness[x][y] += average_cornerness[2 * x][2 * y + 1]
            ensmalled_cornerness[x][y] += average_cornerness[2 * x + 1][2 * y + 1]
            ensmalled_cornerness[x][y] /= 4.0

    self._image_array = np.expand_dims(np.asarray([ensmalled_edgeness, ensmalled_cornerness]), axis=0)
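# A short, self-contained sketch of the 2x2 block-averaging step above, done with
# NumPy reshaping instead of explicit loops. The 56x56 -> 28x28 sizes mirror the
# assumption stated in the comments of process_image.
import numpy as np

def average_pool_2x2(feature_map):
    h, w = feature_map.shape
    return feature_map.reshape(h // 2, 2, w // 2, 2).mean(axis=(1, 3))

fmap = np.random.rand(56, 56)
print(average_pool_2x2(fmap).shape)  # (28, 28)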
def test_structured_perceptron():
    model = pos.StructuredPerceptron()
    fail = Preprocessor.load_sp_data('data/train_dev_data/EN.dev')
    test_data = []
    test_labels = []
    for _, (d, l) in enumerate(fail):
        test_data.append(d)
        test_labels.append(l)
    acc_en = test_model(test_data, test_labels, model, 'models/postagger.model')
    print "Accuracy on english:", acc_en

    model = pos.StructuredPerceptron()
    fail = Preprocessor.load_sp_data('data/train_dev_data/NL.dev')
    test_data = []
    test_labels = []
    for _, (d, l) in enumerate(fail):
        test_data.append(d)
        test_labels.append(l)
    acc_nl = test_model(test_data, test_labels, model, 'models/sp-nl.model')
    print "Accuracy on dutch:", acc_nl

    model = pos.StructuredPerceptron()
    fail = Preprocessor.load_sp_data('data/train_dev_data/PT.dev')
    test_data = []
    test_labels = []
    for _, (d, l) in enumerate(fail):
        test_data.append(d)
        test_labels.append(l)
    acc_pt = test_model(test_data, test_labels, model, 'models/sp-pt.model')
    print "Accuracy on portuguese:", acc_pt

    print "Total accuracy:", (acc_en + acc_nl + acc_pt) / 3.0
def run():
    # create the preprocessor object
    dpre = Preprocessor.Preprocessor()
    # run the data preprocessing
    dpre.preprocessing()
    # create the LDA model
    lda = LDAModel.LDAModel(dpre)
    # run the parameter estimation
    lda.est()
def plot_NYISO_forecast_error(true_path, pred_path):
    """
    Plots and gives metrics regarding the NYISO forecasts
    Args:
        true_path : (file / folder string) : True NYISO load data file / folder
        pred_path : (file / folder string) : NYISO forecasts load data file / folder
    """
    import Loader as ld
    import Preprocessor as pre
    import matplotlib.pyplot as plt

    true_ld = ld.NY_Loader(true_path)
    nyiso_ld = ld.NY_Loader(pred_path)

    true_pre = pre.NY_Preprocessor(true_ld.data, 'Integrated Load',
                                   year_range=list(range(2008, 2018)))
    nyiso_pre = pre.NY_Preprocessor(nyiso_ld.data, 'Integrated Load',
                                    year_range=list(range(2008, 2018)),
                                    fix_duplicates='keep_last')

    results = mlu.get_results(true_pre.get_data().values, nyiso_pre.get_data(),
                              true_pre.get_data().index)

    import Plotter
    Plotter.plot_results(results, groupby='month')

    print("Global error (MAPE): {0}".format(mlu.get_measures(results, 'global', 'MAPE')))
    print("Global error (RMSE): {0}".format(mlu.get_measures(results, 'global', 'RMSE')))
    plt.show()
def getTopNConcepts(text, n):
    global model, multiWordDict
    words = pp.parse(text, multiWordDict)
    conceptConf = getConceptConfidence(words)
    count = 0
    result = []
    for w in sorted(conceptConf, key=conceptConf.get, reverse=True):
        result.append(w)
        count = count + 1
        if count == n:
            break
    return result
def preprocess(type):
    pp = Preprocessor.Preprocessor()
    df = pp.getDataFrame('./data/original/' + type)
    pp.toNumericData(df, save=type)

    df = pd.read_csv('./csv/' + type + '.csv', sep="\t", header=None)
    df, label = pp.toAutoEncoderData(df)
    print('df {} Label {}'.format(df.shape, label.shape))
    print(df.head())

    df.to_csv('./data/processed/' + type + '.csv', sep="\t", header=None, index=False)
    np.save('./data/processed/' + type + '_label.npy', label)
def __init__(self, filename='./corpus/train.csv'):
    # create the preprocessor up front so it is available in both branches
    self.preprocessor = Preprocessor.Preprocessor()
    if os.path.exists(filename):
        data = pd.read_csv(filename)
        self.data = shuffle(data)
        X_data = pd.DataFrame(data.drop('sentiment', axis=1))
        Y_data = column_or_1d(data[:]['sentiment'], warn=True)
        self.X_train, self.X_val, \
            self.y_train, self.y_val = train_test_split(X_data, Y_data,
                                                        test_size=0.3,
                                                        random_state=1)
        self.model = None
        self.load_model()
    else:
        print('No Source!')
        self.preprocessor.process_data()
def PredictRaw(studentRecord):
    """
    Predicts the data from DATA/test.csv
    :studentRecord: data of the format [ID,FI,E1...E2]; ID and FI are a must
    :return: list of subjects he should have studied
    """
    studentRecord = formatStudent(studentRecord)
    pred_data = pd.DataFrame().from_records([studentRecord])
    pred_data.columns = ["ID", "FI", "E", "E", "E"]
    pred_data_pro = Preprocessor.preprocess(pred_data, train=False)

    FI = list(Preprocessor.getFI())
    subjects = list(Preprocessor.getSubjects())
    fieldsInData = FI + subjects  # fields are FI_X and S_X

    predictions = Model.Predict(pred_data_pro)
    C = Model.getClusterCenters()  # cluster centers
    pred = predictions[0]  # get the prediction value, which is the only value present
    Ctarget = C[pred]  # get the corresponding cluster
    student = pred_data_pro.values[0]  # get the row of values (for a student) for which the prediction is done

    print("Needed ", end=" ")
    result = {"FI": None, "S": []}
    for att_i in range(len(fieldsInData)):
        Catt = Ctarget[att_i]  # cluster attribute value
        studentAtt = student[att_i]  # attribute value present in the data
        field = fieldsInData[att_i]  # column name
        if Catt > 0.5:  # if the value is high; the threshold might be changed later
            print(field, end=" ")  # print the field or append it to the result array
            if field in FI:
                result["FI"] = field
            else:
                result["S"].append(field)
    print()
    return result
def predict(self, predictedLength):
    """
    Predict probabilities throughout the day where the given block-duration is likely to be available
    :param predictedLength: The duration of the predicted block
    :return: A list containing the predicted probabilities
    """
    predicted_times = []
    current_prediction = 0
    for x in p.decimal_range(8, 17, 0.25):
        current_prediction = (self.wknn.predict_proba([[x, predictedLength]])) * 100
        if current_prediction[0][1] > 0:
            predicted_times.append([x, current_prediction[0][1]])
    return predicted_times
def _process_xml_query(self, xml_node):
    query_id = xml_node.getElementsByTagName("QueryNumber")[0].firstChild.nodeValue
    query = xml_node.getElementsByTagName("QueryText")[0].firstChild.nodeValue
    processed_query = " ".join(Preprocessor.preprocessor_tokenizer(query))
    self.query_dict[query_id] = processed_query

    records = xml_node.getElementsByTagName("Records")[0]
    relevant_documents_list = list()
    for item in records.getElementsByTagName("Item"):
        doc_id = item.firstChild.nodeValue
        scores = item.getAttribute("score")
        # each non-zero character in the score string counts as one relevance vote
        votes = 0
        for i in range(len(scores)):
            if scores[i] != '0':
                votes += 1
        relevant_documents_list.append((doc_id, votes))

    self.expected_docs_by_query[query_id] = relevant_documents_list
def query_vector(self, query):
    terms = Preprocessor.preprocessor_tokenizer(query)
    counter = collections.Counter(terms)

    query_vector = dict()
    query_vector_magnitude = 0
    for term in terms:
        if term not in self.document_frequency:
            continue
        max_tf = counter.most_common(1)[0][1]  # most_common returns a list of (item, frequency) pairs
        tf = counter[term]
        df = self.document_frequency[term]
        val = (0.5 + 0.5 * tf / max_tf) * math.log10(self.n_terms / df)
        query_vector[term] = val
        query_vector_magnitude += val * val

    # normalizing step
    query_vector_magnitude = math.sqrt(query_vector_magnitude)
    for term in query_vector:
        query_vector[term] /= query_vector_magnitude

    return query_vector
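# A self-contained sketch of the weighting scheme used in query_vector: augmented
# term frequency (0.5 + 0.5*tf/max_tf) times an idf factor, followed by L2
# normalization. The toy document-frequency table and collection size below are
# assumptions made just for the illustration.
import collections
import math

def weigh_query(terms, document_frequency, n_docs):
    counter = collections.Counter(terms)
    max_tf = counter.most_common(1)[0][1]
    weights = {}
    for term, tf in counter.items():
        if term not in document_frequency:
            continue
        weights[term] = (0.5 + 0.5 * tf / max_tf) * math.log10(n_docs / document_frequency[term])
    norm = math.sqrt(sum(w * w for w in weights.values())) or 1.0
    return {term: w / norm for term, w in weights.items()}

print(weigh_query(["heart", "attack", "heart"], {"heart": 30, "attack": 5}, n_docs=1000))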
def loadTargetsFromFile(filename): # load targets, their names, nicknames and ergonyms from given file into an array of three lists. # returns the array. # names with accents and cedilla are duplicated after normalization, except if escaped with ^. # NAMES = 0 NICKNAMES = 1 ERGOS = 2 f = codecs.open(filename,"r", "utf-8") targets = [] for fileLine in f: line = fileLine.lower() # lines starting with "#" are skipped -- mjs 2011.10.27 if line[0] == '#': if debug: print "skipped: ", line continue sepIndex = line.find(":") id = line[0:sepIndex] names = [] nicknames = [] ergos = [] if id != None and id != '': mentions = line[sepIndex+1:].split(';') try: namesTokens = mentions[NAMES] for name in namesTokens.split(','): cleanName = name.replace("\n","").strip(' ').rstrip(' ') if debug: print "cleanName = ", cleanName if cleanName != '': if cleanName[0] != '^': names.append(cleanName) if cleanName != Preprocessor.normalize(cleanName): names.append(Preprocessor.normalize(cleanName)) if debug: print "appended clean name", cleanName else: names.append(cleanName[1:]) if debug: print "appended unclean name", cleanName, cleanName[1:] except IndexError: None try: nicknamesTokens = mentions[NICKNAMES] for name in nicknamesTokens.split(','): cleanName = name.replace("\n","").strip(' ').rstrip(' ') if debug: print "cleanName (nickname) = ", cleanName if cleanName != '': if cleanName[0] != '^': nicknames.append(cleanName) if cleanName != Preprocessor.normalize(cleanName): nicknames.append(Preprocessor.normalize(cleanName)) if debug: print "appended clean nickname", cleanName, Preprocessor.normalize(cleanName) else: nicknames.append(cleanName[1:]) if debug: print "appended uncleaned nickname ", cleanName, cleanName[1:] except IndexError: None try: ergoTokens = mentions[ERGOS] for name in ergoTokens.split(','): cleanName = name.replace("\n","").strip(' ').rstrip(' ') if debug: print "cleanName (ergonym) = ", cleanName if cleanName != '': if cleanName[0] != '^': ergos.append(cleanName) if cleanName != Preprocessor.normalize(cleanName): ergos.append(Preprocessor.normalize(cleanName)) if debug: print "appended clean ergonym", cleanName, Preprocessor.normalize(cleanName) else: ergos.append(cleanName[1:]) if debug: print "appended uncleaned ergonym", cleanName, cleanName[1:], except IndexError: None targets.append(Person(id,names,nicknames,ergos)) f.close() return targets
def newGetTweets(beginDate,endDate,proxy): """ Gets tweets from a service for a certain period Params: begin date end date proxy Returns: list of Opinion instances """ print "Getting new tweets..." username = "******" password = "******" top_level_url = "http://pattie.fe.up.pt/solr/portugal/select" requestTweets = "http://pattie.fe.up.pt/solr/portugal/select?q=created_at:[{0}%20TO%20{1}]&indent=on&wt=json" #requestTweets = "http://pattie.fe.up.pt/solr/portugal/select?q=created_at:[2012-05-25T13:00:00Z%20TO%202012-05-25T15:00:00Z]&indent=on&wt=json" #Password manager because the service requires authentication password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm() password_mgr.add_password(None, top_level_url, username, password) auth_handler = urllib2.HTTPBasicAuthHandler(password_mgr) opener = None if proxy != None: proxy_handler = urllib2.ProxyHandler({'http': proxy}) opener = urllib2.build_opener(auth_handler,proxy_handler) else: #opener = urllib2.build_opener(auth_handler) opener = urllib2.build_opener() if beginDate.strftime('%Y') == "1900": print "Getting Tweets from STDIN ..." twitterData = sys.stdin; else: """ print "Requesting: " + requestTweets.format(beginDate.strftime('%Y-%m-%dT%H:%M:%Sz'), endDate.strftime('%Y-%m-%dT%H:%M:%Sz')) twitterData = opener.open(requestTweets.format(beginDate.strftime('%Y-%m-%dT%H:%M:%Sz'), endDate.strftime('%Y-%m-%dT%H:%M:%Sz'))); """ print "Requesting: " + requestTweets.format(urllib.quote(beginDate.strftime('%Y-%m-%dT%H:%M:%SZ')), urllib.quote(endDate.strftime('%Y-%m-%dT%H:%M:%SZ'))) twitterData = opener.open(requestTweets.format(urllib.quote(beginDate.strftime('%Y-%m-%dT%H:%M:%SZ')), urllib.quote(endDate.strftime('%Y-%m-%dT%H:%M:%SZ')))); #Read the JSON response jsonTwitter = simplejson.loads(unicode(twitterData.read().decode("utf-8"))) print jsonTwitter["response"] politicians = getPoliticians() sentiTokens = getSentiTokens() multiWordTokenizer = getMultiWordsTokenizer(politicians, sentiTokens) listOfTweets = [] #Build a dictionary for tweet in jsonTwitter["response"]["docs"]: id = str(tweet["user_id"]) + "_" + str(tweet["id"]) userId = unicode(tweet["user_id"]) #print userId date = datetime.strptime(tweet["created_at"], '%Y-%m-%dT%H:%M:%Sz') taggedSentence = multiWordTokenizer.tokenizeMultiWords(unicode(tweet["text"])) taggedSentence = Preprocessor.removeURLs(taggedSentence) taggedSentence = Preprocessor.removeUsernames(taggedSentence) listOfTweets.append(Opinionizers.Opinion(tweet["id"],unicode(tweet["text"]),user=userId,date=date,processedSentence = taggedSentence)) print len(listOfTweets), " tweets loaded\n" return listOfTweets
def testOldProcessWithDiagnostics(sourceFile): results = {"numOf-1":0,"correct-1":0,"numOf0":0,"correct0":0,"numOf1":0,"correct1":0} #corpus = csv.reader(codecs.open(sourceFile,"r","utf-8"),delimiter='|') corpus = codecs.open(sourceFile,"r","utf-8") listOfTweets = [] i=0 politicians = getFromCache(PERSONS_CACHE) #getPoliticians() sentiTokens = getFromCache(SENTI_CACHE) #getSentiTokens() rulesClassifier = EuroOpinionizers.Rules(politicians,sentiTokens) naiveClassifier = EuroOpinionizers.Naive(politicians,sentiTokens) multiWordTokenizer = getFromCache(MULTIWORD_CACHE) #getMultiWordsTokenizer(politicians, sentiTokens) print "loading tweets..." for line in corpus: tweet = line.split('|') #skip the first line if tweet[0] != 'ID': #print tweet """ fullSentence = '' #in some cases the message spawns across several fields so we are concatenating them... for block in tweet[TEXT:]: fullSentence = fullSentence + block """ fullSentence = tweet[TEXT] tokenizedSentence = multiWordTokenizer.tokenizeMultiWords(unicode(fullSentence)) tokenizedSentence = Preprocessor.removeURLs(tokenizedSentence) tokenizedSentence = Preprocessor.removeUsernames(tokenizedSentence) o = Opinion(id = tweet[ID], user = u"Teste", sentence = unicode(fullSentence), processedSentence = unicode(tokenizedSentence), target = unicode(tweet[TARGET]), mention = unicode(tweet[MENTION]), polarity = int(tweet[SENTIMENT_POLARITY])) listOfTweets.append(o) i = i+1 """ if i!=0 and i%10 == 0: break """ print "tweets loaded..." falseNeg = [] falsePos = [] falseNeut = [] totalList = [] for tweet in listOfTweets: rulesTweet = rulesClassifier.inferPolarity(tweet,True) naiveTweet = naiveClassifier.inferPolarity(tweet,True) regex = ur"score:(.*);" sentiScore = re.search(regex, naiveTweet.metadata).group(1) tweetScore = int(sentiScore) + int(rulesTweet.polarity) if tweetScore > 0: tweet.polarity = 1 elif tweetScore < 0: tweet.polarity = -1 else: tweet.polarity = 0 tweet.metadata = rulesTweet.metadata + ";" + naiveTweet.metadata totalList.append(tweet) """ if tweet.polarity == -1: results["numOf-1"] += 1 if tweet.polarity == classifiedTweet.polarity: results["correct-1"] +=1 else: matchedRules = rulesDiagnosticHelper(rulesClassifier.generateFeatureSet(tweet,True)) classifiedTweet.metadata += " " + matchedRules falseNeg.append(classifiedTweet) elif tweet.polarity == 0: results["numOf0"] += 1 if tweet.polarity == classifiedTweet.polarity: results["correct0"] +=1 else: matchedRules = rulesDiagnosticHelper(rulesClassifier.generateFeatureSet(tweet,True)) classifiedTweet.metadata += " " + matchedRules falseNeut.append(classifiedTweet) if tweet.polarity == 1: results["numOf1"] += 1 if tweet.polarity == classifiedTweet.polarity: results["correct1"] +=1 else: matchedRules = rulesDiagnosticHelper(rulesClassifier.generateFeatureSet(tweet,True)) classifiedTweet.metadata += " " + matchedRules falsePos.append(classifiedTweet) logTweets(falseNeg,"./falseNegs.csv") logTweets(falseNeut,"./falseNeut.csv") logTweets(falsePos,"./falsePos.csv") """ logTweets(totalList,"./newProcess.csv") """
def testSubjectivity(sourceFile): corpus = codecs.open(sourceFile,"r","utf-8") listOfTweets = [] rejectList = [] i=0 politicians = getPoliticians() sentiTokens = getSentiTokens() #rulesClassifier = EuroOpinionizers.Rules(politicians,sentiTokens) naiveClassifier = EuroOpinionizers.Naive(politicians,sentiTokens) #multiWordTokenizer = getMultiWordsTokenizer(politicians, sentiTokens) print "loading tweets..." for line in corpus: tweet = line.replace("\"","\'").split('|') #skip the first line if tweet[0] != 'PERIOD': #print tweet fullSentence = '' #in some cases the message spawns across several fields so we are concatenating them... for block in tweet[TEXT:]: fullSentence = fullSentence + block tokenizedSentence = Preprocessor.separateSpecialSymbols(unicode(fullSentence)) #multiWordTokenizer.tokenizeMultiWords(unicode(fullSentence)) #tokenizedSentence = Preprocessor.removeURLs(tokenizedSentence) #tokenizedSentence = Preprocessor.removeUsernames(tokenizedSentence) print tokenizedSentence o = Opinion(id = tweet[ID], user = u"Teste", sentence = unicode(fullSentence), processedSentence = tokenizedSentence, target = unicode(tweet[TARGET]), mention = unicode(tweet[MENTION]), polarity = int(tweet[SENTIMENT_POLARITY])) matches = re.findall(naiveClassifier.sentiTokensRegex ,tokenizedSentence) if matches != None and len(matches) > 0: listOfTweets.append(o) else: rejectList.append(o) i = i+1 """ if i!=0 and i%30 == 0: break """ logTweets(listOfTweets,"./listOfTweets.csv") logTweets(rejectList,"./rejectList.csv")
def genFeatsWithSubjectivity(isGoldStandard,sourceFile, destinyFile): corpus = codecs.open(sourceFile,"r","utf-8") politicians = getPoliticians() sentiTokens = getSentiTokens() rulesClassifier = EuroOpinionizers.Rules(politicians,sentiTokens) naiveClassifier = EuroOpinionizers.Naive(politicians,sentiTokens) multiWordTokenizer = getMultiWordsTokenizer(politicians, sentiTokens) listOfTweets = [] i=0 for line in corpus: tweet = line.replace("\"","\'").split('|') #skip the first line if tweet[0] != 'PERIOD': sentence = '' for block in tweet[TEXT:]: sentence = sentence + block tokenizedSentence = multiWordTokenizer.tokenizeMultiWords(sentence) tokenizedSentence = Preprocessor.removeURLs(tokenizedSentence) tokenizedSentence = Preprocessor.removeUsernames(tokenizedSentence) #tokenizedSentence = separateSpecialSymbols(tokenizedSentence) o = Opinion(id = tweet[ID], sentence = unicode(sentence), processedSentence = unicode(tokenizedSentence), target = unicode(tweet[TARGET]), mention = unicode(tweet[MENTION]), polarity = int(tweet[SENTIMENT_POLARITY])) listOfTweets.append(o) i = i+1 """ if i!=0 and i%50 == 0: break """ print "tweets loaded..." featuresFile = open(destinyFile,"w") featuresFile.write(ARFF_HEADERS) unknownSentiFile = codecs.open("./unknownSentiment.csv","w","utf-8") featurama = cStringIO.StringIO() unknownSentiment = cStringIO.StringIO() tmp = cStringIO.StringIO() for tweet in listOfTweets: #used to verify if an instance matches any rule and\or has any sentiToken #if after processing this var still has value 0 then we don't add it to #the training set... sentiCounter = 0 tmp = cStringIO.StringIO() tmp.write(str(tweet.id)+ ",") featureSet = rulesClassifier.genConditionalFeatureSet(tweet,True) for feature in featureSet: tmp.write(str(feature) + ",") sentiCounter += feature naiveClassification = naiveClassifier.inferPolarity(tweet,True) tmp.write(str(naiveClassification.polarity) + ",") sentiCounter+= naiveClassification.polarity pos = naiveClassification.metadata.find("score:") score = naiveClassification.metadata[pos+6:].replace(";","") tmp.write(str(score) + ",") sentiCounter+= int(score) isNews = isNewsSource(tweet.sentence) tmp.write(str(isNews) + ",") sentiCounter+= isNews if sentiCounter != 0: if isGoldStandard: featurama.write(unicode(tmp.getvalue())+unicode(tweet.polarity)) else: featurama.write(tmp.getvalue()+"?\n") else: unknownSentiment.write(str(tweet.id)+"|"+str(tweet.polarity)+"|"+tweet.sentence+"\n") featuresFile.write(featurama.getvalue()) featuresFile.close() unknownSentiFile.write(unknownSentiment.getvalue()) unknownSentiFile.close()
if i == 'm' or i == 'M':
    corpus = 'mr'
    binary_classification = True
else:
    corpus = 'reuters'

# Prepare the cat_num_docs dictionary, where the number of documents in the training
# set for each category is stored; also form the training set and the test set
start_time = time.time()

# Here, apart from the naive bayes classifier, everything is done by nltk
# 2) refer to the comments for the function get_testset_trainset()
li = Preprocessor.get_testset_trainset(corpus)
testset = li[1]
trainset = li[0]
li = Preprocessor.startup()
cat_num_docs = li[1]

###--------------------DEBUG STATEMENTS----------------------
#for f in trainset:
#    print f, FilenameCategoriesDict[f]
#print "Freedom\n"
#for f in testset:
#    print f
###--------------------DEBUG STATEMENTS----------------------
def __init__(self):
    self.stack = Stack()
    self.preprocessor = Preprocessor()
    self.memory = {}
    self.usedWrite = False
def init(): global model, stdFormulaList, drvFormulaList, concepts, multiWordDict # Get Data print "Loading Physics questions from default dataset ..." train_data = readFile('./Datasets/Training1.csv') validation_data = readFile('./Datasets/Validation1.csv') test_data = readFile('./Datasets/Testing1.csv') print "Done!" # Get Domain Knowledge print "Loading Domain Knowledge ..." DM.init() stdFormulaList = DM.getStdFormulaList() drvFormulaList = DM.getDrvFormulaList() concepts = DM.getConcepts() multiWordDict = DM.getMultiWordDict() print "Done!" # Generate Matrix print "Generating feature matrix for training ..." words = pp.parse(train_data[0,0], multiWordDict) X_Train = getFeatureVector(words) for i in range(1, train_data.shape[0]): words = pp.parse(train_data[i,0], multiWordDict) X_Train = np.vstack([X_Train, getFeatureVector(words)]) words = pp.parse(validation_data[0,0], multiWordDict) X_Validation = getFeatureVector(words) for i in range(1, validation_data.shape[0]): words = pp.parse(validation_data[i,0], multiWordDict) X_Validation = np.vstack([X_Validation, getFeatureVector(words)]) words = pp.parse(test_data[0,0], multiWordDict) X_Test = getFeatureVector(words) for i in range(1, test_data.shape[0]): words = pp.parse(test_data[i,0], multiWordDict) X_Test = np.vstack([X_Test, getFeatureVector(words)]) Y_Train = train_data[:,1] Y_Validation = validation_data[:,1] Y_Test = test_data[:,1] print "Done!" # Train the model print "Training ..." model.fit(X_Train, Y_Train) print "Done!" # Predict the result print "\n Training Data:" print "Prediction " + str(model.predict(X_Train)) print "Actual " + str(Y_Train) print "Accuracy " + str(model.score(X_Train,Y_Train)*100) + "%" print "\n Vaildation Data:" print "Prediction " + str(model.predict(X_Validation)) print "Actual " + str(Y_Validation) print "Accuracy " + str(model.score(X_Validation,Y_Validation)*100) + "%" print "\n Test Data:" print "Prediction " + str(model.predict(X_Test)) print "Actual " + str(Y_Test) print "Accuracy " + str(model.score(X_Test,Y_Test)*100) + "%"
def getCategory(text):
    global model, multiWordDict
    words = pp.parse(text, multiWordDict)
    return getClassName(model.predict([getFeatureVector(words)]))
class Interpreter: stack = None preprocessor = None memory = None Keywords = [';', ':', 'DO', 'LOOP', 'BEGIN', 'UNTIL', 'EXPECT', 'LEAVE','DUP', 'SWAP', 'DROP', 'EMIT', 'MOD', 'KEY', 'DEPTH', 'ROLL', 'PICK'] def __init__(self): self.stack = Stack() self.preprocessor = Preprocessor() self.memory = {} self.usedWrite = False def interact(self): while True: try: line = raw_input('>>> ') line = self.preprocessor.preprocess(line) self.process(line) if self.usedWrite: sys.stdout.write('\n') self.usedWrite = False except Exception as e: print e.message def push(self, statement): try: print '>>>', statement statement = self.preprocessor.preprocess(statement) self.process(statement) if self.usedWrite: sys.stdout.write('\n') self.usedWrite = False except Exception as e: print e.message def process(self, line, is_inside_until=False): statement = Tokenizer(line.strip(' ').strip('\n'), ' ') while statement.has_next(): token = statement.next() if token in ['.', '.s', 'CR']: if token == '.': sys.stdout.write(self.stack.pop()) self.usedWrite = True elif token == 'CR': print else: print self.stack elif token in self.memory: self.process(self.memory[token]) elif token in ['DUP', '+', '-', '*', '/', 'SWAP', 'DROP', '<', '>', '<=', '>=', '=', 'EMIT', 'MOD', 'KEY', 'DEPTH', 'ROLL', 'PICK']: import Operators Operators.Op[token](self.stack) elif token == ':': self.function_definition(statement) elif token == 'IF': self.handle_if(statement) elif token == 'DO': self.do_loop(statement) elif token == 'BEGIN': self.until_loop(statement) elif token == 'LEAVE': if is_inside_until and self.stack.pop() != '0': return False elif token == 'EXPECT': self.expect() elif token.startswith('."'): self.print_string(token, statement) elif token.isdigit(): self.stack.push(token) elif token.strip() == '': pass else: raise NameError('Invalid Input: ' + token) return True def print_string(self, current_token, statement): if current_token.endswith('"'): print current_token[2:-1] else: # If the string contains white space then we need to connect its parts. string = current_token while statement.has_next(): token = statement.next() string = string + ' ' + token if string.endswith('"'): break print string[2:-1] def expect(self): count = int(self.stack.pop()) string = '' while len(string) < count: string = string + raw_input('... 
') for index in xrange(count): self.stack.push(string[index]) def until_loop(self, statement): loop_body = '' pair = 1 while statement.has_next(): token = statement.next() if token == 'UNTIL' and pair == 1: pair -= 1 break elif token == 'BEGIN': pair += 1 elif token == 'UNTIL' and pair > 1: pair -= 1 loop_body = loop_body + ' ' + token if pair != 0: raise NameError('Unmatched UNTIL Block') while True: # If not means if LEAVE occurred if not self.process(loop_body, is_inside_until=True): break elif self.stack.top() == '0': break def do_loop(self, statement): loop_body = '' pair = 1 while statement.has_next(): token = statement.next() if token == 'DO': pair += 1 elif token == 'LOOP' and pair > 1: pair -= 1 elif token == 'LOOP' and pair == 1: pair -= 1 break loop_body = loop_body + ' ' + token if pair != 0: raise NameError('Unmatched DO Block') start = int(self.stack.pop()) end = int(self.stack.pop()) for i in xrange(start, end): self.process(loop_body) def handle_if(self, statement): condition = bool(int(self.stack.pop())) #if the top was !=0 its True otherwise its False if_body = '' pair = 1 if condition: while statement.has_next(): token = statement.next() if token == 'IF': pair += 1 elif token == 'ELSE' and pair == 1: # if pair was one ELSE is for the current if break elif token == 'THEN' and pair > 1: # means that the THEN is not for the currect if but in its body pair -= 1 elif token == 'THEN' and pair == 1: # THEN is for the current if pair -= 1 break if_body = if_body + ' ' + token if pair != 0: while statement.has_next(): token = statement.next() if token == 'IF': #if it was breaked with else it will ignore everything till THEN of the current if pair += 1 elif token == 'THEN': pair -= 1 elif token == 'THEN' and pair == 1: pair -= 1 break else: collect = False while statement.has_next(): token = statement.next() if token == 'IF': pair += 1 elif token == 'ELSE' and pair == 1: collect = True continue elif token == 'THEN' and pair > 1: pair -= 1 elif token == 'THEN' and pair == 1: pair -= 1 break if collect: if_body = if_body + ' ' + token if pair != 0: raise NameError('Unmatched IF Block') self.process(if_body) def function_definition(self, statement): name = statement.next() self.memory[name] = '' while statement.has_next(): token = statement.next() # Return when we reach end of the function definition. if token == ';': return self.memory[name] = self.memory[name] + ' ' + token raise NameError('Syntax Error')
def loadSentiTokens(path,pathExceptions): f = codecs.open(path,"r", "utf-8") adjectives = [] firstLine = f.next() exceptions = unicode(loadExceptionTokens(pathExceptions)) lemmaRegex = ",(.*?)\." flexRegex = "^(.*?)," polarityRegex = "POL:..=(-1|0|1)(?:;|$)" posRegex = "PoS=(.*?);" currentLemma = re.search(lemmaRegex,firstLine).group(1) pol = re.findall(polarityRegex,firstLine) pol1 = int(pol[0]) try: pol2 = int(pol[1]) except IndexError: pol2 = 0 currentPolarity = str(pol1 + pol2) currentPos = re.search(posRegex,firstLine).group(1).lower() currentFlex = re.search(flexRegex,firstLine).group(1) currentFlexions = [] currentFlexions.append(currentFlex) if currentFlex not in exceptions and currentFlex != Preprocessor.normalize(currentFlex): currentFlexions.append(Preprocessor.normalize(currentFlex)) for line in f: try: if "REV=Amb" not in line: lemma = re.search(lemmaRegex,line).group(1) #print re.search(lemmaRegex,line).groups() if lemma != currentLemma: if currentLemma not in exceptions and currentLemma != Preprocessor.normalize(currentLemma): currentFlexions.append(Preprocessor.normalize(currentLemma)) adjectives.append(SentiToken(currentLemma,currentPolarity,currentPos,currentFlexions)) currentLemma = lemma #currentPolarity = re.search(polarityRegex,line).group(1) pol = re.findall(polarityRegex,line) pol1 = int(pol[0]) try: pol2 = int(pol[1]) except IndexError: pol2 = 0 currentPolarity = str(pol1 + pol2) currentPos = re.search(posRegex,line).group(1).lower() currentFlexions = [] currentFlex = re.search(flexRegex,line).group(1) currentFlexions.append(currentFlex) #print "L:", currentLemma,"P:",currentPolarity,"POS:",currentPos,"F:",currentFlex if currentFlex not in exceptions and currentFlex != Preprocessor.normalize(currentFlex): currentFlexions.append(Preprocessor.normalize(currentFlex)) else: currentFlex = re.search(flexRegex,line).group(1) currentFlexions.append(currentFlex) #print "l:", lemma, "f:", currentFlex, "p:", currentPos if currentFlex not in exceptions and currentFlex != Preprocessor.normalize(currentFlex): currentFlexions.append(Preprocessor.normalize(currentFlex)) except: None f.close() return adjectives
def generateTextFeatures(sourceFile,destinyFile): corpus = codecs.open(sourceFile,"r","utf-8") politicians = getPoliticians() sentiTokens = getSentiTokens() #rulesClassifier = EuroOpinionizers.Rules(politicians,sentiTokens) #naiveClassifier = EuroOpinionizers.Naive(politicians,sentiTokens) multiWordTokenizer = getMultiWordsTokenizer(politicians, sentiTokens) listOfTweets = [] i=0 headers = """ @relation twitometro @ATTRIBUTE ID NUMERIC @ATTRIBUTE tweet STRING @ATTRIBUTE polarity? {-1,0,1} @data """ featuresFile = open(destinyFile,"w") featuresFile.write(headers) for line in corpus: tweet = line.replace("\"","\'").split('|') #skip the first line if tweet[0] != 'PERIOD': sentence = '' for block in tweet[TEXT:]: sentence = sentence + block tokenizedSentence = multiWordTokenizer.tokenizeMultiWords(sentence) tokenizedSentence = Preprocessor.removeURLs(tokenizedSentence) tokenizedSentence = Preprocessor.removeUsernames(tokenizedSentence) tokenizedSentence = tokenizedSentence.replace(tweet[MENTION]," <TARGET> ") tokenizedSentence = Preprocessor.removeStopWords(tokenizedSentence) tokenizedSentence = Preprocessor.separateSpecialSymbols(tokenizedSentence) tokenizedSentence = tokenizedSentence.replace("\n","") tokenizedSentence = tokenizedSentence.replace(","," ") featuresFile.write(tweet[ID]+",\""+tokenizedSentence+"\","+tweet[SENTIMENT_POLARITY]+"\n") print sentence + " --> " + tokenizedSentence """ o = Opinion(id = tweet[ID], sentence = unicode(sentence), processedSentence = unicode(tokenizedSentence), target = unicode(tweet[TARGET]), mention = unicode(tweet[MENTION]), polarity = int(tweet[SENTIMENT_POLARITY])) listOfTweets.append(o) """ i = i+1 """ if i!=0 and i%20 == 0: break """ featuresFile.close()