def addMultiWords(self, listOfMultiWords):
        """
            Updates the internal regex with a
            list of multiwords
        """

        buff = StringIO.StringIO()

        for multiWord in listOfMultiWords:

            if multiWord not in self.multiWordsRegex and multiWord not in buff.getvalue():
                buff.write(self.regexTemplate.format(multiWord))

                #add a normalized (no accents) version
                if multiWord != Preprocessor.normalize(multiWord):
                    buff.write(
                        self.regexTemplate.format(
                            Preprocessor.normalize(multiWord)))

        if len(self.multiWordsRegex) == 0:
            self.multiWordsRegex = buff.getvalue().strip('|')
        else:
            self.multiWordsRegex += "|" + buff.getvalue().strip('|')

        buff.close()
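A minimal standalone sketch of the same pattern, assuming a hypothetical regexTemplate of the form "{0}|" and a simple accent-stripping normalize (both are stand-ins for the real class attributes, shown here in Python 3):

# Hypothetical stand-ins: regexTemplate and normalize mirror what the class appears to use.
import io
import re
import unicodedata

regexTemplate = "{0}|"  # assumption: each multiword is appended as one alternation branch

def normalize(text):
    # strip accents, e.g. "sao paulo" from "são paulo"
    return "".join(c for c in unicodedata.normalize("NFD", text)
                   if unicodedata.category(c) != "Mn")

multiWordsRegex = ""
buff = io.StringIO()
for multiWord in ["primeiro ministro", "são paulo"]:
    if multiWord not in multiWordsRegex and multiWord not in buff.getvalue():
        buff.write(regexTemplate.format(multiWord))
        if multiWord != normalize(multiWord):
            buff.write(regexTemplate.format(normalize(multiWord)))

multiWordsRegex = buff.getvalue().strip("|")
print(multiWordsRegex)                                         # primeiro ministro|são paulo|sao paulo
print(re.findall(multiWordsRegex, "visitei sao paulo ontem"))  # ['sao paulo']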
Example #2
def Predict():
    """
        Predicts the data from DATA/test.csv
    :return:
    """

    #get user input from file test.csv
    csvhand = Handler.CSVHandler("DATA/test.csv")
    pred_data = csvhand.getData()
    pred_data_pro = Preprocessor.preprocess(pred_data, train=False)
    fieldsInData = list(Preprocessor.getFI()) + list(Preprocessor.getSubjects())  # fields are FI_X and S_X
    predictions = Model.Predict(pred_data_pro)
    C = Model.getClusterCenters()  #cluster centers
    for pred_i in range(len(predictions)):
        pred = predictions[pred_i]  # get the prediction value from the list
        Ctarget = C[pred]  # get the corresponding cluster center
        student = pred_data_pro.values[pred_i]  # row of values (for a student) for which the prediction is done
        print("Needed ", end=" ")
        for att_i in range(len(fieldsInData)):
            Catt = Ctarget[att_i]  # cluster attribute value
            studentAtt = student[att_i]  # attribute value present in the student record
            field = fieldsInData[att_i]  # column name
            if Catt > 0.5:  # if the value is high; the threshold might be changed later
                print(field, end=" ")  # print the field or append it to the result array
        print()
    return
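For reference, a small illustrative sketch of the thresholding logic above: the predicted cluster center is a vector of attribute activations, and every field whose center value exceeds 0.5 is reported as needed (the field names and values below are made up):

# Illustrative only: a cluster center vector and field names (hypothetical values).
fieldsInData = ["FI_CS", "S_Math", "S_Physics", "S_Chemistry"]
Ctarget = [0.9, 0.7, 0.2, 0.6]           # attribute activations of the predicted cluster

needed = [field for field, Catt in zip(fieldsInData, Ctarget) if Catt > 0.5]
print("Needed", *needed)                 # Needed FI_CS S_Math S_Chemistry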
    def spectrograms_augmentation(self, class_to_augment, many, version, methods):
        spec_aug = []
        if(version == 1):
            aug_dir = self.augmented_spec_v1_dir + class_to_augment + "/"
        elif(version == 2):
            aug_dir = self.augmented_spec_v2_dir + class_to_augment + "/"
        if not os.path.exists(aug_dir):
            os.makedirs(aug_dir)
        files = os.listdir(aug_dir)
        if(len(files) >= many):
            for file in files:
                spec = np.load(os.path.join(aug_dir, file))  # os.listdir returns bare file names, so join with the directory
                spec_aug.append(spec)
                if(len(spec_aug) >= many):
                    break
            return np.asarray(spec_aug, dtype=np.float32)
        preprocessor = Preprocessor(aug_dir, version=version, dump=True)

        shutil.rmtree(aug_dir)
        os.makedirs(aug_dir)
        audio_signals_aug = self.audio_signals_augmentation(
            class_to_augment, many, methods)
        print("creating new spectrograms...")
        name = 0
        for sig_aug in tqdm(audio_signals_aug):
            spec = preprocessor.compute_spectrogram(
                sig_aug, str(name) + ".npy")
            spec_aug.append(spec)
            name += 1
            if(len(spec_aug) >= many):
                break
        return np.asarray(spec_aug, dtype=np.float32)
Example #4
def Run(source: str, debug: bool):

    try:
        prog = Program(source)

        preproc = Preprocessor()
        preproc.preprocess(prog)

        # for l in prog.preprocessed:
        #    print(l)

        assembler = Assembler()
        assembler.assemble(prog)

        # for l in prog.labels:
        #     print(l, f" Position: {l.position}")

        # for i in prog.instructions:
        #     print(i, f" Position: {i.position}  Label: {i.labelName}")
        #     for p in i.parameters:
        #         print(" ", p, end = "")
        #         if p.labelName != None:
        #             print(f"   {p.labelName}")
        #         else:
        #             print("")

        # for b in prog.binary:
        #     print("%04X " % b, end = "")

        # print("")

        computer = Computer()
        computer.loadProgram(prog)

        if debug == False:
            computer.run()

            for l in prog.labels:
                if l.size > 0:
                    print("%13s (%6s[%3d]): " % (l.name, l.datatype, l.size), end ="")
                    for i in range(l.position, l.position + l.size):
                        print("%d " % computer.memory[i], end = "")
                    print("")

        else:
            debugger = Debugger(computer, prog)
            debugger.run()

    except PreprocessorError as e:
        print(e)

    except AssemblerError as e:
        print(e)

    except CompilerError as e:
        print(e)

    except Exception as e:
        raise e
Example #5
    def __init__(self, sample_rate=32000, dim_to_conform=3000):
        self.class_id_mapping = {
            'Hi-hat': 0,
            'Saxophone': 1,
            'Trumpet': 2,
            'Glockenspiel': 3,
            'Cello': 4,
            'Knock': 5,
            'Gunshot_or_gunfire': 6,
            'Clarinet': 7,
            'Computer_keyboard': 8,
            'Keys_jangling': 9,
            'Snare_drum': 10,
            'Writing': 11,
            'Laughter': 12,
            'Tearing': 13,
            'Fart': 14,
            'Oboe': 15,
            'Flute': 16,
            'Cough': 17,
            'Telephone': 18,
            'Bark': 19,
            'Chime': 20,
            'Bass_drum': 21,
            'Bus': 22,
            'Squeak': 23,
            'Scissors': 24,
            'Harmonica': 25,
            'Gong': 26,
            'Microwave_oven': 27,
            'Burping_or_eructation': 28,
            'Double_bass': 29,
            'Shatter': 30,
            'Fireworks': 31,
            'Tambourine': 32,
            'Cowbell': 33,
            'Electric_piano': 34,
            'Meow': 35,
            'Drawer_open_or_close': 36,
            'Applause': 37,
            'Acoustic_guitar': 38,
            'Violin_or_fiddle': 39,
            'Finger_snapping': 40
        }

        self.classes_frequency = {}
        self.classes_percent = {}
        self.classes_verified = {}
        self.files = []
        self.labels = []
        self.verified = []

        self.files_loaded = set()

        self.train_csv = False
        self.sample_rate = sample_rate

        self.preprocessor = Preprocessor(dump=True)
        self.dim_to_conform = dim_to_conform
Example #6
def preprocess(sentence):  # Uses functions in Preprocessor.py to format sentence, including accounting for spelling errors
    normalized_sentence = Preprocessor.sentence_normalizer(sentence)
    formatted_sentence = Preprocessor.sentence_formatter(normalized_sentence)  # remove punctuation
    lemmatized_sentence = Preprocessor.sentence_lemmatizer(formatted_sentence)  # lemmatize words
    cleaned_sentence = Preprocessor.sentence_cleaner(lemmatized_sentence)  # remove stopwords
    preprocessed_sentence = cleaned_sentence
    #print(preprocessed_sentence)
    return preprocessed_sentence
def regularize_data():
    #Check if the unregularized data exists.  If not, construct it
    if not os.path.exists('TrainingSet.pkl'):
        Preprocessor.process_mnist()

    maxEdgeness = 0
    maxCornerness = 0

    edgeness, cornerness, answers = pickle.load(open('TrainingSet.pkl', 'r'))
    for i in xrange(len(edgeness)):
        for x in xrange(28):
            for y in xrange(28):
                if maxEdgeness < edgeness[i][x][y]:
                    maxEdgeness = edgeness[i][x][y]
                if maxCornerness < cornerness[i][x][y]:
                    maxCornerness = cornerness[i][x][y]
    edgeness, cornerness, answers = pickle.load(open('ValidationSet.pkl', 'r'))
    for i in xrange(len(edgeness)):
        for x in xrange(28):
            for y in xrange(28):
                if maxEdgeness < edgeness[i][x][y]:
                    maxEdgeness = edgeness[i][x][y]
                if maxCornerness < cornerness[i][x][y]:
                    maxCornerness = cornerness[i][x][y]
    edgeness, cornerness, answers = pickle.load(open('TestSet.pkl', 'r'))
    for i in xrange(len(edgeness)):
        for x in xrange(28):
            for y in xrange(28):
                if maxEdgeness < edgeness[i][x][y]:
                    maxEdgeness = edgeness[i][x][y]
                if maxCornerness < cornerness[i][x][y]:
                    maxCornerness = cornerness[i][x][y]

    edgeness, cornerness, answers = pickle.load(open('TrainingSet.pkl', 'r'))
    for i in xrange(len(edgeness)):
        for x in xrange(28):
            for y in xrange(28):
                edgeness[i][x][y] /= maxEdgeness
                cornerness[i][x][y] /= maxCornerness
    pickle.dump([edgeness, cornerness, answers],
                open('TrainingSetRegularized.pkl', mode='w'))
    edgeness, cornerness, answers = pickle.load(open('ValidationSet.pkl', 'r'))
    for i in xrange(len(edgeness)):
        for x in xrange(28):
            for y in xrange(28):
                edgeness[i][x][y] /= maxEdgeness
                cornerness[i][x][y] /= maxCornerness
    pickle.dump([edgeness, cornerness, answers],
                open('ValidationSetRegularized.pkl', mode='w'))
    edgeness, cornerness, answers = pickle.load(open('TestSet.pkl', 'r'))
    for i in xrange(len(edgeness)):
        for x in xrange(28):
            for y in xrange(28):
                edgeness[i][x][y] /= maxEdgeness
                cornerness[i][x][y] /= maxCornerness

    pickle.dump([edgeness, cornerness, answers],
                open('TestSetRegularized.pkl', mode='w'))
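The three passes above locate the global maxima and then divide every value by them; a compact sketch of the same max-rescaling with NumPy, with small random arrays standing in for the pickled edgeness/cornerness data (shapes assumed to be (N, 28, 28)):

import numpy as np

# Hypothetical arrays standing in for the pickled feature tensors.
train_edge = np.random.rand(5, 28, 28) * 3.0
test_edge = np.random.rand(2, 28, 28) * 2.0

max_edge = max(train_edge.max(), test_edge.max())   # global maximum over all splits
train_edge /= max_edge                               # every value now lies in [0, 1]
test_edge /= max_edge
print(train_edge.max() <= 1.0, test_edge.max() <= 1.0)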
Example #8
    def load_spectrogram(self, version):
        if (len(self.files) == 0):
            raise NameError("load the file name list before from csv file")

        if (version == 1):
            data_path = "./dataset/spec/ver1/"
        elif (version == 2):
            data_path = "./dataset/spec/ver2/"

        preprocessor = Preprocessor(spectrogram_path=data_path,
                                    version=version,
                                    test=False,
                                    dump=True)

        if (self.train_csv):
            audio_path = "./dataset/audio_train/"
        else:
            audio_path = "./dataset/audio_test/"

        print("loading spectrograms...")
        for file_name in tqdm(self.files):
            spec_file_name = file_name.replace(".wav", ".npy")
            spec_file_name = os.path.join(data_path, spec_file_name)
            try:
                spec = np.load(spec_file_name)
            except FileNotFoundError:
                print(file_name, "spectrogram does not exist; computing it from the original audio file")
                audio_file_name = os.path.join(audio_path, file_name)
                signal, sample_rate = librosa.load(audio_file_name,
                                                   sr=self.sample_rate,
                                                   mono=True)
                spec = preprocessor.compute_spectrogram(
                    signal, os.path.basename(spec_file_name))
            self.spectrograms.append(spec)
            # compute statistics
            Xk = spec.shape[1]
            if (self.spec_statistic["max"] == None
                    or Xk > self.spec_statistic["max"]):
                self.spec_statistic["max"] = Xk
            if (self.spec_statistic["min"] == None
                    or Xk < self.spec_statistic["min"]):
                self.spec_statistic["min"] = Xk
            k = len(self.spectrograms)
            delta = (Xk - self.spec_statistic["average"]) / k
            self.spec_statistic[
                "average"] = self.spec_statistic["average"] + delta
            self.spec_statistic["variance"] = (
                ((k - 1) * self.spec_statistic["variance"]) /
                k) + (delta * (Xk - self.spec_statistic["average"]))
            self.spec_statistic["len_hist"][
                Xk] = self.spec_statistic["len_hist"].setdefault(Xk, 0) + 1

        return self.spectrograms, self.labels
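The "compute statistics" block above keeps a running mean and (population) variance of the spectrogram widths using a Welford-style online update; a self-contained sketch of that same update:

# Online mean/variance over a stream of values, mirroring the update used above.
def online_stats(values):
    mean, variance = 0.0, 0.0
    for k, xk in enumerate(values, start=1):
        delta = (xk - mean) / k
        mean = mean + delta
        variance = ((k - 1) * variance) / k + delta * (xk - mean)
    return mean, variance

print(online_stats([100, 120, 90, 150]))  # (115.0, 525.0): mean and population variance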
def CoarseClassify(trainfile, testfile):
    fulltrainX, fulltrainFY, fulltrainCY = Preprocessor.getdataset(trainfile)
    fulltestX, fulltestFY, fulltestCY = Preprocessor.getdataset(testfile)
    vec = TfidfVectorizer(binary=True, use_idf=True, decode_error='ignore')
    tfidf_train_data = vec.fit_transform(fulltrainX)
    tfidf_test_data = vec.transform(fulltestX)
    trainX = tfidf_train_data.toarray().tolist()
    testX = tfidf_test_data.toarray().tolist()
    return classify(trainX, fulltrainCY, testX, fulltestCY, fulltrainFY,
                    fulltestFY)
Example #10
def preprocess(sentence):  # Uses functions in Preprocessor.py to format sentence
    formatted_sentence = Preprocessor.sentence_formatter(sentence)  # remove punctuation
    lemmatized_sentence = Preprocessor.sentence_lemmatizer(formatted_sentence)  # lemmatize words
    cleaned_sentence = Preprocessor.sentence_cleaner(lemmatized_sentence)  # remove stopwords
    preprocessed_sentence = cleaned_sentence
    return preprocessed_sentence
def CoarseClassifytext(trainfile, text):
    process = "dummy:dummy " + text
    textdf = pd.DataFrame(data=[process])
    fulltrainX, fulltrainFY, fulltrainCY = Preprocessor.getdataset(trainfile)
    fulltestX, Trash1, Trash2 = Preprocessor.preprocess(textdf)
    vec = TfidfVectorizer(binary=True, use_idf=True, decode_error='ignore')
    tfidf_train_data = vec.fit_transform(fulltrainX)
    tfidf_test_data = vec.transform(fulltestX)
    trainX = tfidf_train_data.toarray().tolist()
    testX = tfidf_test_data.toarray().tolist()
    return predict(testX), testX, trainX, fulltrainCY, fulltrainFY
def testPreProcessor():
    csvhand = han.CSVHandler("../DATA/input.csv")
    csvhand.open()
    data = csvhand.getData()
    data_train = pr.preprocess(data)
    csvhand = han.CSVHandler("../DATA/input.csv")
    csvhand.open()
    data_test = csvhand.getData()
    data_test = pr.preprocess(data_test, train=False)
    import pdb
    pdb.set_trace()
    return
Example #13
File: Assembler.py Project: idosch/tecs
def main():
    """Drives the entire translation process."""

    """Preprocess the file"""
    pp = Preprocessor()
    pp.remove_white_spaces_comments()
    pp.write_temp_file()

    """First pass - adds labels to the symbol table."""
    parser = Parser(sys.argv[1] + '.tmp')
    symbol_table = SymbolTable()
    pc = -1
    while parser.has_more_commands():
        parser.advance()
        command_type = parser.command_type()
        if command_type == 'A_COMMAND' or command_type == 'C_COMMAND':
            pc += 1
        elif command_type == 'L_COMMAND':
            label = parser.symbol()
            symbol_table.addEntry(label, pc + 1)

    """Second pass - handles variables names and writes the *.hack file."""
    ram_address = 16
    parser = Parser(sys.argv[1] + '.tmp')
    code = Code()
    file_name = parser.get_file_name()
    hack_file = open(file_name + '.hack', 'w')

    while parser.has_more_commands():
        parser.advance()
        command_type = parser.command_type()
        if command_type == 'A_COMMAND':
            a_symbol = parser.symbol()
            if a_symbol[0] in '0123456789':
                a_symbol_binary = code.convert_to_binary(a_symbol)
                hack_file.write('0' + a_symbol_binary + '\n')
            else:
                if symbol_table.contains(a_symbol) is False:
                    symbol_table.addEntry(a_symbol, ram_address)
                    ram_address += 1
                address = symbol_table.GetAddress(a_symbol)
                address_binary = code.convert_to_binary(address)
                hack_file.write('0' + address_binary + '\n')

        elif command_type == 'C_COMMAND':
            comp = code.comp(parser.comp())
            dest = code.dest(parser.dest())
            jump = code.jump(parser.jump())
            hack_file.write('111' + comp + dest + jump + '\n')

    hack_file.close()
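As a quick illustration of the two encodings written above (not part of the original project): an A-instruction is '0' plus a 15-bit binary value and a C-instruction is '111' followed by the comp, dest and jump fields, so convert_to_binary is presumably a zero-padded 15-bit conversion along these lines:

# Hedged sketch of a 15-bit convert_to_binary, as the Hack instruction format implies.
def convert_to_binary(value):
    return format(int(value), "015b")  # zero-padded 15-bit binary string

print("0" + convert_to_binary("21"))   # @21 -> 0000000000010101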
Example #14
    def __call__(self, data, targetClass, num_of_rules ):
        if self.dataOK(data):  # Checks whether targetClass is discrete
            original_data = data
            data = Preprocessor.generateFeatures(data, targetClass)
            
            # initialization of beams
            beam = [SDRule(data=data, targetClass=targetClass,  g=self.g)] * self.beamWidth 
            newBeam = [SDRule(data=data, targetClass=targetClass,  g=self.g)] * self.beamWidth
            worstRuleIndex = 0
        
            improvements = True
            while improvements:
                improvements = False
                for rule in beam:
                # for f in features:
                    for feature in data.domain.attributes:
                        newRule = rule.cloneAndAddCondition(feature, 'True')
                        if newRule.support > self.minSupport and self.betterThanWorstRule(newRule, newBeam, worstRuleIndex) and self.isRelevant(newRule, newBeam):
                            worstRuleIndex = self.replaceWorstRule(newRule, newBeam, worstRuleIndex)
                            improvements = True
                beam = newBeam
            
            # perform rule subset selection
            if num_of_rules != 0:
                beam = self.ruleSubsetSelection(beam, num_of_rules, data)

            targetClassRule = SDRule(original_data, targetClass, conditions=[], g=1)
            
            # change beam so the rules apply to original data
            fixedBeam = [rule.getFixed(original_data) for rule in beam]
            
            return SDRules(fixedBeam, targetClassRule)
def testPreProcessor():
    csvHandler = han.CSVHandler("../DATA/input.csv")
    csvHandler.open()  # todo remove this kind of useless  code
    data = csvHandler.getData()
    data=pr.preprocess(data)
    print(data)
    return
def train(samples_proportion=0.7):
    global words_in_ham, ham_word_count, words_in_spam, spam_word_count, raw_ham_prob, raw_spam_prob

    ham, spam = read_spam_ham()

    print("Spam size: " + str(len(spam)) + " Ham size: " + str(len(ham)))

    all_emails = append_ham_and_spam(ham, spam)

    random.shuffle(all_emails)

    print('Corpus size = ' + str(len(all_emails)) + ' emails')

    features = [(Preprocessor.get_features(email, ' '), label)
                for (email, label) in all_emails]

    print('Collected ' + str(len(features)) + ' feature sets')
    '''
    # define Support value in %
    support = 10
    spam_support_count = (spam_size * 10) / 100;
    ham_support_count = (ham_size * 10) / 100;
    print('Spam support count:' + str(spam_support_count))
    print('Ham support count:' + str(ham_support_count))
    # get the spam frequent itemset and ham frequent itemset
    # spam_frequent, ham_frequent = get_frequent(all_features, spam_support_count, ham_support_count)
    # train the our own naivebayes classifier and collect dictionary of raw probabilities of words
    '''

    train_size = int(len(features) * samples_proportion)

    train_set, test_set = features[:train_size], features[train_size:]

    ham_mail_count, spam_mail_count = mails_in_ham_spam(train_set)

    spam_prior = 1.0 * spam_mail_count / len(train_set)
    ham_prior = 1.0 * ham_mail_count / len(train_set)

    words_in_ham, words_in_spam = frequency_in_ham_spam(train_set)

    spam_vocab = len(spam_word_count)
    ham_vocab = len(ham_word_count)

    t = get_probabilities_in_each_class(ham_prior, words_in_ham, ham_vocab,
                                        ham_word_count, raw_ham_prob,
                                        raw_spam_prob, spam_prior,
                                        words_in_spam, spam_vocab,
                                        spam_word_count, test_set, train_set)

    ham_prior, words_in_ham, ham_vocab, raw_ham_prob, raw_spam_prob, spam_prior, words_in_spam, spam_vocab, test_set, train_set = get_parameters(
        t)
    #print("Train Size:" + str(len(train_set)) + str(' Test size:') + str(len(test_set)))

    #evaluate(train_set, test_set, raw_spam_prob, raw_ham_prob, words_in_spam, words_in_ham, spam_vocab, ham_vocab,
    #         spam_prior,
    #         ham_prior)

    classifier = NaiveBayesClassifier(list(spam_word_count),
                                      list(ham_word_count))
    t = classifier.prob_classify(classifier, train_set).max()
 def inferTarget(self,opinion):
     
     """ 
         Tries to identify mentions of the targets in a message
         Params: opinion -> Opinion object
         Returns: tuple(inferred target, algorithm metadata)
     """
    
     info = u"Targets: "
     sentence = Preprocessor.separateSpecialSymbols(opinion.sentence.lower()) 
     #print sentence
     
     matches = []
     
     for target in self.targets:            
         
         for name in target.names:
                                
             if sentence.find(" "+name+" ") != -1:
                 
                 matches.append(name)
         
         for nickName in target.nicknames:
                 
             if sentence.find(" "+nickName+" ") != -1:
                 
                 matches.append(nickName)
                 
         for ergo in target.ergos:
             
             if sentence.find(" "+ergo+" ") != -1:
                 
                 matches.append(ergo)                
         
     targets = {}
     
     for mention in matches:
         
         targetId = self.getTargetByMention(mention)
         
         if targetId != None and not self.isFalsePositive(mention, sentence):                    
             
             if mention not in info:
                 info += mention + ","
                 
             targets[targetId] = mention
     
     if len(targets) > 0:
         
         results = []
         info = info.strip(',')
     
         for target,mention in targets.items():
             
             results.append(opinion.clone(target=target,mention=mention,metadata=info))
          
         return results
     else:
         
         return None                     
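A tiny sketch of the matching trick used above: names are searched as " " + name + " " in the lower-cased sentence so only whole-word mentions count; the sketch pads the sentence so boundary mentions also match (all names here are made up):

# Whole-word mention matching via space padding (illustrative names only).
sentence = " " + "o primeiro ministro falou com o presidente".lower() + " "
names = ["primeiro ministro", "presidente", "ministro da economia"]

matches = [name for name in names if sentence.find(" " + name + " ") != -1]
print(matches)   # ['primeiro ministro', 'presidente']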
Example #18
def get_prediction(filename):
    preprocessor = Preprocessor.Preprocessor()
    # trainer = Trainer.Trainer(preprocessor)
    predictor = Predictor.Predictor(preprocessor)
    # image_name_list = os.listdir(os.path.join(os.getcwd(), "img"))
    # for image_name in image_name_list:
    #     image_path = os.path.join(os.path.join(os.getcwd(), "img"), image_name)
    return predictor.predict([filename])
def menu():
    PreProcessor = Preprocessor.PreProcessor()
    opcao = input("Preparar base de treino e teste yes/No?")
    if opcao.lower() in ["yes","y"]:
        prepararBasesTreinoTeste(PreProcessor)

    opcao = input("Preparar base de treino e teste sem stemming yes/No?")
    if opcao.lower() in ["yes","y"]:
        prepararBasesTreinoTesteWithoutStemming(PreProcessor)

    PreProcessor = Preprocessor.PreProcessor()
    opcao = input("Preparar base de treino e teste da pasta de janelas Yes/no?")
    if opcao.lower() in ["","yes","y"]:
        prepararBasesTreinoTestePorJanela(PreProcessor)

    opcao = input("Preparar base Yes/no (não uso, pre-processo a mensagem antes de classificar):")    
    if opcao.lower() in ["","yes","y"]:
        prepararBase(PreProcessor)
 def generate_inverted_list(self, docs_dict):
     inv_list = dict()
     for (doc_id, doc) in docs_dict.items():
         term_list = Preprocessor.preprocessor_tokenizer(doc)
         for term in term_list:
             if term not in inv_list:
                 inv_list[term] = []
             inv_list[term].append(doc_id)
     return inv_list
Example #21
 def test_Response(self):
     questions, responses = Preprocessor.load_corpus()
     question_list = Processor.vectorizer(questions)
     self.assertEqual(
         Processor.process("what is a star?", question_list, responses),
         "Stars are mostly made of hydrogen and helium\n")
     self.assertEqual(Processor.process("Hello", question_list, responses),
                      "Hello! I am Nova.\n")
     self.assertEqual(
         Processor.process("What is your job?", question_list, responses),
         "I teach you about astronomy and geography!\n")
Example #22
 def preProcess(self):
     try:
         # preprocess the data and update df with the processed data
         preprocessor = Preprocessor(self.df)
         self.df = preprocessor.df
         self.hasProcessed=True
     except KeyError:
         tkMessageBox.showinfo("K Means Clustering", "The selected file is invalid")
         return
     #pop message to user
     tkMessageBox.showinfo("K Means Clustering", "Preprocessing completed successfully!")
Example #23
def getTopNFormulae(text, n):
    global model, multiWordDict, thresholdFormulaConf
    words = pp.parse(text, multiWordDict)
    nameList, valueList = getFormulaConfidence(getConceptConfidence(words))
    result = {}
    index = 0

    for value in valueList:
        result[nameList[index]] = value
        index = index + 1

    return sorted(result, key=result.get, reverse=True)[0:n]
Example #24
def Train():
    """
       Trains the model with data from DATA/input.csv
    :return:
    """
    # initialize source -csv source
    csvhand = Handler.CSVHandler("DATA/input.csv")
    train_data = csvhand.getData()
    train_data_pro = Preprocessor.preprocess(train_data)
    Model.Init()  # initialize the model
    labels_ = Model.Train(train_data_pro)
    return
Example #26
def train_structured_perceptron():
    model = pos.StructuredPerceptron()
    fail = Preprocessor.load_sp_data('data/train_dev_data/PT.5000.train')

    train_data = []
    train_labels = []
    for _, (d, l) in enumerate(fail):
        train_data.append(d)
        train_labels.append(l)

    train_model(train_data, train_labels, model, 'models/sp-pt.model',
                (5, 0.2))
Example #27
    def process_image(self, img_path):
        self._img_path = img_path
        image_array = image.img_to_array(
            image.load_img(img_path, target_size=self._target_size))
        average_edgeness = np.zeros(
            (image_array.shape[0], image_array.shape[1]))
        average_cornerness = np.zeros(
            (image_array.shape[0], image_array.shape[1]))

        #On a channel by channel basis computes the features values of all points
        for color_channel in xrange(image_array.shape[2]):
            channel_array = np.zeros(
                (image_array.shape[0], image_array.shape[1]))
            for x in xrange(len(channel_array)):
                for y in xrange(len(channel_array[0])):
                    channel_array[x][y] = image_array[x][y][color_channel]
            #Could potentially crash if not given a 56 x 56 image as input for this method
            channel_edgeness, channel_cornerness = Preprocessor.process_image_sd2(
                channel_array)
            for x in xrange(len(channel_array)):
                for y in xrange(len(channel_array[0])):
                    #Next 2 lines utilize "magic numbers" that are the maximum that was
                    #found over the MNIST dataset, effectively regularizing the data
                    average_edgeness[x][y] += channel_edgeness[x][y] / 0.634789974395 / image_array.shape[2]
                    average_cornerness[x][y] += channel_cornerness[x][y] / 0.16428281162 / image_array.shape[2]

        #Scales image size down to 28 by 28 to fit the model
        ensmalled_edgeness = np.zeros(
            (image_array.shape[0] / 2, image_array.shape[1] / 2))
        ensmalled_cornerness = np.zeros(
            (image_array.shape[0] / 2, image_array.shape[1] / 2))
        for x in xrange(len(ensmalled_edgeness)):
            for y in xrange(len(ensmalled_edgeness[0])):
                ensmalled_edgeness[x][y] += average_edgeness[2 * x][2 * y]
                ensmalled_edgeness[x][y] += average_edgeness[2 * x + 1][2 * y]
                ensmalled_edgeness[x][y] += average_edgeness[2 * x][2 * y + 1]
                ensmalled_edgeness[x][y] += average_edgeness[2 * x + 1][2 * y + 1]
                ensmalled_edgeness[x][y] /= 4.0
                ensmalled_cornerness[x][y] += average_cornerness[2 * x][2 * y]
                ensmalled_cornerness[x][y] += average_cornerness[2 * x + 1][2 * y]
                ensmalled_cornerness[x][y] += average_cornerness[2 * x][2 * y + 1]
                ensmalled_cornerness[x][y] += average_cornerness[2 * x + 1][2 * y + 1]
                ensmalled_cornerness[x][y] /= 4.0
        self._image_array = np.expand_dims(np.asarray(
            [ensmalled_edgeness, ensmalled_cornerness]),
                                           axis=0)
Example #28
def test_structured_perceptron():
    model = pos.StructuredPerceptron()
    fail = Preprocessor.load_sp_data('data/train_dev_data/EN.dev')

    test_data = []
    test_labels = []
    for _, (d, l) in enumerate(fail):
        test_data.append(d)
        test_labels.append(l)

    acc_en = test_model(test_data, test_labels, model,
                        'models/postagger.model')
    print "Accuracy on english:", acc_en

    model = pos.StructuredPerceptron()
    fail = Preprocessor.load_sp_data('data/train_dev_data/NL.dev')

    test_data = []
    test_labels = []
    for _, (d, l) in enumerate(fail):
        test_data.append(d)
        test_labels.append(l)

    acc_nl = test_model(test_data, test_labels, model, 'models/sp-nl.model')
    print "Accuracy on dutch:", acc_nl

    model = pos.StructuredPerceptron()
    fail = Preprocessor.load_sp_data('data/train_dev_data/PT.dev')

    test_data = []
    test_labels = []
    for _, (d, l) in enumerate(fail):
        test_data.append(d)
        test_labels.append(l)

    acc_pt = test_model(test_data, test_labels, model, 'models/sp-pt.model')

    print "Accuracy on portuguese:", acc_pt
    print "Total accuracy:", (acc_en + acc_nl + acc_pt) / 3.0
Example #29
def run():

    # create the preprocessor object
    dpre = Preprocessor.Preprocessor()

    # run the data preprocessing
    dpre.preprocessing()

    # create the LDA model
    lda = LDAModel.LDAModel(dpre)

    # run the parameter estimation
    lda.est()
Example #30
def plot_NYISO_forecast_error(true_path, pred_path):
    """ Plots and gives metrics for the NYISO forecasts
        Args :
            true_path : (file / folder string)  : True NYISO load data file / folder 
            pred_path : (file / folder string)  : NYISO forecasts load data file / folder 
    """
    import Loader as ld
    import Preprocessor as pre
    import matplotlib.pyplot as plt

    true_ld = ld.NY_Loader(true_path)
    nyiso_ld = ld.NY_Loader(pred_path)
    true_pre = pre.NY_Preprocessor(true_ld.data, 'Integrated Load', year_range=list(range(2008, 2018)))
    nyiso_pre = pre.NY_Preprocessor(nyiso_ld.data, 'Integrated Load', year_range=list(range(2008, 2018)), fix_duplicates='keep_last')
    results = mlu.get_results(true_pre.get_data().values, nyiso_pre.get_data(), true_pre.get_data().index)

    import Plotter
    Plotter.plot_results(results, groupby='month')

    print("Global error : {0}".format(mlu.get_measures(results, 'global', 'MAPE')))
    print("Global error : {0}".format(mlu.get_measures(results, 'global', 'RMSE')))
    plt.show()
Example #31
def getTopNConcepts(text, n):
    global model, multiWordDict
    words = pp.parse(text, multiWordDict)
    conceptConf = getConceptConfidence(words)
    count = 0
    result = []

    for w in sorted(conceptConf, key=conceptConf.get, reverse=True):
        result.append(w)
        count = count + 1
        if count == n:
            break

    return result
def preprocess(type):
    pp = Preprocessor.Preprocessor()
    df = pp.getDataFrame('./data/original/' + type)
    pp.toNumericData(df, save=type)
    df = pd.read_csv('./csv/' + type + '.csv', sep="\t", header=None)

    df, label = pp.toAutoEncoderData(df)
    print('df {} Label {}'.format(df.shape, label.shape))
    print(df.head)
    df.to_csv('./data/processed/' + type + '.csv',
              sep="\t",
              header=None,
              index=False)
    np.save('./data/processed/' + type + '_label.npy', label)
Example #33
File: train.py Project: itlsqin/NLP-JD
 def __init__(self, filename='./corpus/train.csv'):
     if os.path.exists(filename):
         data = pd.read_csv(filename)
         self.data = shuffle(data)
         X_data = pd.DataFrame(data.drop('sentiment', axis=1))
         Y_data = column_or_1d(data[:]['sentiment'], warn=True)
         self.X_train, self.X_val,\
         self.y_train, self.y_val = train_test_split(X_data, Y_data, test_size=0.3, random_state=1)
         self.model = None
         self.load_model()
         self.preprocessor = Preprocessor.Preprocessor()
     else:
         print('No Source!')
         # the preprocessor was never created on this path, so build it before use
         self.preprocessor = Preprocessor.Preprocessor()
         self.preprocessor.process_data()
Example #34
    def addMultiWords(self, listOfMultiWords):

        """
            Updates the internal regex with a
            list of multiwords
        """

        buff = StringIO.StringIO()

        for multiWord in listOfMultiWords:

            if multiWord not in self.multiWordsRegex and multiWord not in buff.getvalue():
                buff.write(self.regexTemplate.format(multiWord))

                # add a normalized (no accents) version
                if multiWord != Preprocessor.normalize(multiWord):
                    buff.write(self.regexTemplate.format(Preprocessor.normalize(multiWord)))

        if len(self.multiWordsRegex) == 0:
            self.multiWordsRegex = buff.getvalue().strip("|")
        else:
            self.multiWordsRegex += "|" + buff.getvalue().strip("|")

        buff.close()
Example #36
def PredictRaw(studentRecord):
    """
        Predicts the needed subjects for a single raw student record
    :studentRecord: data of the format [ID,FI,E1...E2]; ID and FI are required
    :return: dict with the inferred FI and the list of subjects the student should have studied
    """
    studentRecord = formatStudent(studentRecord)
    pred_data = pd.DataFrame().from_records([studentRecord])
    pred_data.columns = ["ID", "FI", "E", "E", "E"]
    pred_data_pro = Preprocessor.preprocess(pred_data, train=False)
    FI = list(Preprocessor.getFI())
    subjects = list(Preprocessor.getSubjects())
    fieldsInData = FI + subjects  # fields are FI_X and S_X
    predictions = Model.Predict(pred_data_pro)
    C = Model.getClusterCenters()  #cluster centers

    pred = predictions[0]  # get the prediction value, which is the only value present
    Ctarget = C[pred]  # get the corresponding cluster center
    student = pred_data_pro.values[0]  # row of values (for a student) for which the prediction is done
    print("Needed ", end=" ")
    result = {"FI": None, "S": []}
    for att_i in range(len(fieldsInData)):
        Catt = Ctarget[att_i]  # cluster attribute value
        studentAtt = student[att_i]  # attribute value present in the student record
        field = fieldsInData[att_i]  # column name
        if Catt > 0.5:  # if the value is high; the threshold might be changed later
            print(field, end=" ")  # print the field or append it to the result array
            if field in FI:
                result["FI"] = field
            else:
                result["S"].append(field)
    print()
    return result
Example #37
 def predict(self, predictedLength):
     """
     Predict probabilities throughout the day where the given block-duration is likely to be available
     
     :param predictedLength: The duration of the predicted block
     :return: A list containing the predicted probabilities
     """
     predicted_times = []
     current_prediction = 0
     for x in p.decimal_range(8, 17, 0.25):
         current_prediction = (self.wknn.predict_proba(
             [[x, predictedLength]])) * 100
         if (current_prediction[0][1] > 0):
             predicted_times.append([x, current_prediction[0][1]])
     return predicted_times
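A hedged sketch of the same idea using scikit-learn's KNeighborsClassifier directly: call predict_proba for every quarter-hour slot between 8:00 and 17:00 and keep the slots whose availability probability is positive (the training data below is invented):

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Invented training data: [hour_of_day, block_length] -> available (1) / busy (0)
X = [[8.0, 1.0], [9.0, 1.0], [10.0, 2.0], [13.0, 1.0], [14.0, 2.0], [16.0, 1.0]]
y = [1, 0, 1, 1, 0, 1]
wknn = KNeighborsClassifier(n_neighbors=3, weights="distance").fit(X, y)

predicted_times = []
for x in np.arange(8, 17, 0.25):                  # quarter-hour steps, like decimal_range
    proba = wknn.predict_proba([[x, 1.0]]) * 100  # percentages for block length 1.0
    if proba[0][1] > 0:                           # probability of the "available" class
        predicted_times.append([float(x), proba[0][1]])
print(predicted_times[:3])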
    def _process_xml_query(self, xml_node):
        query_id = xml_node.getElementsByTagName("QueryNumber")[0].firstChild.nodeValue
        query = xml_node.getElementsByTagName("QueryText")[0].firstChild.nodeValue

        processed_query = " ".join(Preprocessor.preprocessor_tokenizer(query))
        self.query_dict[query_id] = processed_query

        records = xml_node.getElementsByTagName("Records")[0]
        relevant_documents_list = list()
        for item in records.getElementsByTagName("Item"):
            doc_id = item.firstChild.nodeValue

            scores = item.getAttribute("score")
            votes = 0
            for i in range(len(scores)):
                if scores[i]!='0':
                    votes += 1

            relevant_documents_list.append((doc_id, votes))
        self.expected_docs_by_query[query_id] = relevant_documents_list
Example #39
    def query_vector(self, query):
        terms = Preprocessor.preprocessor_tokenizer(query)

        counter = collections.Counter(terms)
        query_vector = dict()
        query_vector_magnitude = 0
        for term in terms:
            if term not in self.document_frequency:
                continue

            max_tf = counter.most_common(1)[0][1]  # most_common returns a list of (item, count) pairs
            tf = counter[term]
            df = self.document_frequency[term]
            val = (0.5 + 0.5*tf/max_tf)*math.log10(self.n_terms/df)
            query_vector[term] = val

            query_vector_magnitude += val*val

        # normalizing step
        query_vector_magnitude = math.sqrt(query_vector_magnitude)
        for term in query_vector:
            query_vector[term] /= query_vector_magnitude

        return query_vector
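The weighting above is the augmented TF-IDF scheme: each query term gets (0.5 + 0.5*tf/max_tf) * log10(n/df), and the resulting vector is L2-normalized. A small worked sketch with made-up collection statistics:

import collections
import math

# Made-up collection statistics for illustration.
n_docs = 1000
document_frequency = {"star": 50, "hydrogen": 10, "the": 900}
terms = ["star", "hydrogen", "star", "the"]

counter = collections.Counter(terms)
max_tf = counter.most_common(1)[0][1]
vec = {t: (0.5 + 0.5 * counter[t] / max_tf) * math.log10(n_docs / df)
       for t, df in document_frequency.items() if t in counter}

norm = math.sqrt(sum(v * v for v in vec.values()))
vec = {t: v / norm for t, v in vec.items()}
print(vec)  # rare terms ("hydrogen") end up with the largest normalized weight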
def loadTargetsFromFile(filename):
    # load targets, their names, nicknames and ergonyms from given file into an array of three lists.
    # returns the array.
    # names with accents and cedilla are duplicated after normalization, except if escaped with ^.
    #
    
    NAMES = 0
    NICKNAMES = 1
    ERGOS = 2
    
    f = codecs.open(filename,"r", "utf-8")
    
    targets = []
    
    for fileLine in f:
        
        line = fileLine.lower()


        # lines starting with "#" are skipped -- mjs 2011.10.27
        if line[0] == '#':
            if debug: print "skipped: ", line
            continue
        
        sepIndex = line.find(":")
        id = line[0:sepIndex]
        
        names = []
        nicknames = []
        ergos = []

        if id != None and id != '':
        
            mentions = line[sepIndex+1:].split(';') 
            
            try:
            
                namesTokens = mentions[NAMES]
                
                for name in namesTokens.split(','):                    
                    
                    cleanName = name.replace("\n","").strip(' ').rstrip(' ')

                    if debug: print "cleanName = ", cleanName
                    
                    if cleanName != '':
                        if cleanName[0] != '^':
                            names.append(cleanName)
                            if cleanName != Preprocessor.normalize(cleanName): 
                                names.append(Preprocessor.normalize(cleanName))
                                if debug: print "appended clean name", cleanName

                        else:
                            names.append(cleanName[1:])
                            if debug: print "appended unclean name", cleanName,  cleanName[1:]
                       
            except IndexError:
                None
            
            try: 
                nicknamesTokens = mentions[NICKNAMES]
                
                for name in nicknamesTokens.split(','):
                    
                    cleanName = name.replace("\n","").strip(' ').rstrip(' ')

                    if debug: print "cleanName (nickname) = ", cleanName


                    if cleanName != '':
                        if cleanName[0] != '^':
                            nicknames.append(cleanName)
                            if cleanName != Preprocessor.normalize(cleanName): 
                                nicknames.append(Preprocessor.normalize(cleanName))
                                if debug: print "appended clean nickname", cleanName, Preprocessor.normalize(cleanName)

                        else:
                            nicknames.append(cleanName[1:])
                            if debug: print "appended uncleaned nickname ", cleanName, cleanName[1:]

            except IndexError:
                None
                
            try:                        
                ergoTokens = mentions[ERGOS]
                
                for name in ergoTokens.split(','):
                    cleanName = name.replace("\n","").strip(' ').rstrip(' ')

                    if debug: print "cleanName (ergonym) = ", cleanName


                    if cleanName != '':
                        if cleanName[0] != '^':
                            ergos.append(cleanName)
                            if cleanName != Preprocessor.normalize(cleanName): 
                                ergos.append(Preprocessor.normalize(cleanName))
                                if debug: print "appended clean ergonym", cleanName, Preprocessor.normalize(cleanName)

                        else:
                            ergos.append(cleanName[1:])  
                            if debug: print "appended uncleaned ergonym", cleanName, cleanName[1:],
            
            except IndexError:
                None
                   
            targets.append(Person(id,names,nicknames,ergos))
                
    f.close()

    return targets
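For reference, the parser above expects lines of the form "id: name1,name2; nickname1,nickname2; ergonym1", where '#' starts a comment line and a leading '^' keeps a name verbatim (no accent-stripped duplicate is added). A made-up example file:

# comment lines are skipped
p1: joão silva,^maria josé; zé; primeiro-ministro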
Example #41
def newGetTweets(beginDate,endDate,proxy):
    
    """
    Gets tweets from a service for a certain period
    Params: begin date 
    end date
    proxy
    
    Returns: list of Opinion instances
    """
    
    print "Getting new tweets..."
    
    username = "******"
    password = "******"
    top_level_url = "http://pattie.fe.up.pt/solr/portugal/select"
    requestTweets = "http://pattie.fe.up.pt/solr/portugal/select?q=created_at:[{0}%20TO%20{1}]&indent=on&wt=json"
    #requestTweets = "http://pattie.fe.up.pt/solr/portugal/select?q=created_at:[2012-05-25T13:00:00Z%20TO%202012-05-25T15:00:00Z]&indent=on&wt=json"
    
    #Password manager because the service requires authentication
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, top_level_url, username, password)
    auth_handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    opener = None
    
  
    
    if proxy != None:
        proxy_handler = urllib2.ProxyHandler({'http': proxy})        
        opener = urllib2.build_opener(auth_handler,proxy_handler)       
    else:
        #opener = urllib2.build_opener(auth_handler)
        opener = urllib2.build_opener()
    
    if beginDate.strftime('%Y') == "1900":
       
        print "Getting Tweets from STDIN ..."
        twitterData = sys.stdin;
        
    else:
        """
        print "Requesting: " + requestTweets.format(beginDate.strftime('%Y-%m-%dT%H:%M:%Sz'),
                                                    endDate.strftime('%Y-%m-%dT%H:%M:%Sz'))
        twitterData = opener.open(requestTweets.format(beginDate.strftime('%Y-%m-%dT%H:%M:%Sz'),
                                                       endDate.strftime('%Y-%m-%dT%H:%M:%Sz')));
                                                       
        """
        
        print "Requesting: " + requestTweets.format(urllib.quote(beginDate.strftime('%Y-%m-%dT%H:%M:%SZ')),
                                                    urllib.quote(endDate.strftime('%Y-%m-%dT%H:%M:%SZ')))
        twitterData = opener.open(requestTweets.format(urllib.quote(beginDate.strftime('%Y-%m-%dT%H:%M:%SZ')),
                                                       urllib.quote(endDate.strftime('%Y-%m-%dT%H:%M:%SZ'))));
        
    #Read the JSON response
    jsonTwitter = simplejson.loads(unicode(twitterData.read().decode("utf-8")))
    
    print jsonTwitter["response"]
    
    politicians = getPoliticians()
    sentiTokens = getSentiTokens()
       
    multiWordTokenizer = getMultiWordsTokenizer(politicians, sentiTokens)
                                
    listOfTweets = []
    
    #Build a dictionary
    for tweet in jsonTwitter["response"]["docs"]:
    
        id = str(tweet["user_id"]) + "_" + str(tweet["id"])
        userId = unicode(tweet["user_id"])
        #print userId
        
        date =  datetime.strptime(tweet["created_at"], '%Y-%m-%dT%H:%M:%Sz')
        
        taggedSentence = multiWordTokenizer.tokenizeMultiWords(unicode(tweet["text"]))
        taggedSentence = Preprocessor.removeURLs(taggedSentence)
        taggedSentence = Preprocessor.removeUsernames(taggedSentence)  
        
        listOfTweets.append(Opinionizers.Opinion(tweet["id"],unicode(tweet["text"]),user=userId,date=date,processedSentence = taggedSentence))
    
    print len(listOfTweets), " tweets loaded\n"  
   
    return listOfTweets
def testOldProcessWithDiagnostics(sourceFile):
    
    results = {"numOf-1":0,"correct-1":0,"numOf0":0,"correct0":0,"numOf1":0,"correct1":0}
    
    #corpus = csv.reader(codecs.open(sourceFile,"r","utf-8"),delimiter='|')
    corpus = codecs.open(sourceFile,"r","utf-8")
    
    listOfTweets = []
    i=0
    
    politicians = getFromCache(PERSONS_CACHE) #getPoliticians()
    sentiTokens = getFromCache(SENTI_CACHE) #getSentiTokens()
    
    rulesClassifier = EuroOpinionizers.Rules(politicians,sentiTokens)     
    naiveClassifier = EuroOpinionizers.Naive(politicians,sentiTokens)    
    multiWordTokenizer = getFromCache(MULTIWORD_CACHE) #getMultiWordsTokenizer(politicians, sentiTokens)
    
    print "loading tweets..."
    
    for line in corpus:
        
        tweet = line.split('|')
        
        #skip the first line
        if tweet[0] != 'ID':
            
            #print tweet
            """
            fullSentence = ''
            
            
            #in some cases the message spawns across several fields so we are concatenating them...
            for block in tweet[TEXT:]:
                fullSentence = fullSentence + block
            """
            
            fullSentence = tweet[TEXT]    
            tokenizedSentence = multiWordTokenizer.tokenizeMultiWords(unicode(fullSentence))
            tokenizedSentence =  Preprocessor.removeURLs(tokenizedSentence)
            tokenizedSentence = Preprocessor.removeUsernames(tokenizedSentence)            
            
            o = Opinion(id = tweet[ID],
                        user = u"Teste",
                        sentence = unicode(fullSentence),
                        processedSentence = unicode(tokenizedSentence),
                        target = unicode(tweet[TARGET]),
                        mention = unicode(tweet[MENTION]),
                        polarity = int(tweet[SENTIMENT_POLARITY]))            
            listOfTweets.append(o)
    
            i = i+1            
            """
            if i!=0 and i%10 == 0:
                
                break
            """
            
    print "tweets loaded..."
    
    falseNeg = []
    falsePos = []
    falseNeut = []
    totalList = []
              
    for tweet in listOfTweets:
        
        rulesTweet = rulesClassifier.inferPolarity(tweet,True)
        naiveTweet = naiveClassifier.inferPolarity(tweet,True)
        
        regex = ur"score:(.*);"
        sentiScore = re.search(regex, naiveTweet.metadata).group(1)
        tweetScore = int(sentiScore) + int(rulesTweet.polarity)
        
        if tweetScore > 0:
            tweet.polarity = 1
        elif tweetScore < 0:
            tweet.polarity = -1
        else:
            tweet.polarity = 0
        
        tweet.metadata = rulesTweet.metadata + ";" + naiveTweet.metadata
        
        totalList.append(tweet)
        
        """
        if tweet.polarity == -1:
            results["numOf-1"] += 1
            if tweet.polarity == classifiedTweet.polarity:
                results["correct-1"] +=1
            else:
                matchedRules = rulesDiagnosticHelper(rulesClassifier.generateFeatureSet(tweet,True))
                classifiedTweet.metadata += " " + matchedRules
                falseNeg.append(classifiedTweet)
                
        elif tweet.polarity == 0:
            results["numOf0"] += 1
            if tweet.polarity == classifiedTweet.polarity:
                results["correct0"] +=1
            else:            
                matchedRules = rulesDiagnosticHelper(rulesClassifier.generateFeatureSet(tweet,True))
                classifiedTweet.metadata += " " + matchedRules
                falseNeut.append(classifiedTweet)
                
        if tweet.polarity == 1:
            results["numOf1"] += 1
            if tweet.polarity == classifiedTweet.polarity:
                results["correct1"] +=1
            else:
                matchedRules = rulesDiagnosticHelper(rulesClassifier.generateFeatureSet(tweet,True))
                classifiedTweet.metadata += " " + matchedRules
                falsePos.append(classifiedTweet)
    
    logTweets(falseNeg,"./falseNegs.csv")
    logTweets(falseNeut,"./falseNeut.csv")
    logTweets(falsePos,"./falsePos.csv")
    """
    logTweets(totalList,"./newProcess.csv")
    
    """
def testSubjectivity(sourceFile):
    
    corpus = codecs.open(sourceFile,"r","utf-8")
    
    listOfTweets = []
    rejectList = []
    i=0
    
    politicians = getPoliticians()
    sentiTokens = getSentiTokens()
    
    #rulesClassifier = EuroOpinionizers.Rules(politicians,sentiTokens)     
    naiveClassifier = EuroOpinionizers.Naive(politicians,sentiTokens)    
    #multiWordTokenizer = getMultiWordsTokenizer(politicians, sentiTokens)
    
    print "loading tweets..."
    
    for line in corpus:
        
        tweet = line.replace("\"","\'").split('|')
        
        #skip the first line
        if tweet[0] != 'PERIOD':
            
            #print tweet
            
            fullSentence = ''
            
            #in some cases the message spawns across several fields so we are concatenating them...
            for block in tweet[TEXT:]:
                fullSentence = fullSentence + block
                
            tokenizedSentence = Preprocessor.separateSpecialSymbols(unicode(fullSentence)) #multiWordTokenizer.tokenizeMultiWords(unicode(fullSentence))
            #tokenizedSentence =  Preprocessor.removeURLs(tokenizedSentence)
            #tokenizedSentence = Preprocessor.removeUsernames(tokenizedSentence)            
            print tokenizedSentence
            
            o = Opinion(id = tweet[ID],
                        user = u"Teste",
                        sentence = unicode(fullSentence),
                        processedSentence = tokenizedSentence,
                        target = unicode(tweet[TARGET]),
                        mention = unicode(tweet[MENTION]),
                        polarity = int(tweet[SENTIMENT_POLARITY]))
            
            matches = re.findall(naiveClassifier.sentiTokensRegex ,tokenizedSentence) 
            
            if matches != None and len(matches) > 0:  
            
                listOfTweets.append(o)
                
            else:
                rejectList.append(o) 
            
            i = i+1
            
            """
            if i!=0 and i%30 == 0:
                
                break
            """
    
    logTweets(listOfTweets,"./listOfTweets.csv")
    logTweets(rejectList,"./rejectList.csv")
def genFeatsWithSubjectivity(isGoldStandard,sourceFile, destinyFile):
    
    corpus = codecs.open(sourceFile,"r","utf-8")
     
    politicians = getPoliticians()
    sentiTokens = getSentiTokens()
    
    rulesClassifier = EuroOpinionizers.Rules(politicians,sentiTokens)     
    naiveClassifier = EuroOpinionizers.Naive(politicians,sentiTokens)    
    multiWordTokenizer = getMultiWordsTokenizer(politicians, sentiTokens)
    listOfTweets = []
    
    i=0
    
    for line in corpus:
        
        tweet = line.replace("\"","\'").split('|')
        
        #skip the first line
        if tweet[0] != 'PERIOD':            
            
            sentence = ''
            
            for block in tweet[TEXT:]:
                sentence = sentence + block
                           
            tokenizedSentence = multiWordTokenizer.tokenizeMultiWords(sentence)            
            tokenizedSentence = Preprocessor.removeURLs(tokenizedSentence)
            tokenizedSentence = Preprocessor.removeUsernames(tokenizedSentence)
            #tokenizedSentence =  separateSpecialSymbols(tokenizedSentence)
            
            o = Opinion(id = tweet[ID],
                        sentence = unicode(sentence),
                        processedSentence = unicode(tokenizedSentence),
                        target = unicode(tweet[TARGET]),
                        mention = unicode(tweet[MENTION]),
                        polarity = int(tweet[SENTIMENT_POLARITY]))
            
            listOfTweets.append(o)
    
            i = i+1
        """          
        if i!=0 and i%50 == 0:
            
            break
        """
        
        
    print "tweets loaded..."
    
    featuresFile = open(destinyFile,"w") 
    featuresFile.write(ARFF_HEADERS)
    unknownSentiFile = codecs.open("./unknownSentiment.csv","w","utf-8")
        
    featurama = cStringIO.StringIO()
    unknownSentiment = cStringIO.StringIO()    
    
    tmp = cStringIO.StringIO()
    
    for tweet in listOfTweets:
                
        #used to verify if an instance matches any rule and\or has any sentiToken
        #if after processing this var still has value 0 then we don't add it to 
        #the training set...
        sentiCounter = 0
        
        tmp = cStringIO.StringIO()
        tmp.write(str(tweet.id)+ ",")
                 
        featureSet = rulesClassifier.genConditionalFeatureSet(tweet,True)
                
        for feature in featureSet:
            tmp.write(str(feature) + ",")
            sentiCounter += feature
               
        naiveClassification = naiveClassifier.inferPolarity(tweet,True)
        tmp.write(str(naiveClassification.polarity) + ",")
        sentiCounter+= naiveClassification.polarity
                        
        pos = naiveClassification.metadata.find("score:")
        score = naiveClassification.metadata[pos+6:].replace(";","")
        tmp.write(str(score) + ",")        
        sentiCounter+= int(score)
        
        isNews = isNewsSource(tweet.sentence)    
    
        tmp.write(str(isNews) + ",")
        sentiCounter+= isNews
        
        if sentiCounter != 0:
            
            if isGoldStandard:
                    
                featurama.write(unicode(tmp.getvalue())+unicode(tweet.polarity))
            
            else:
                featurama.write(tmp.getvalue()+"?\n")
        else:
            unknownSentiment.write(str(tweet.id)+"|"+str(tweet.polarity)+"|"+tweet.sentence+"\n")
            
        
    featuresFile.write(featurama.getvalue())
    featuresFile.close()
    unknownSentiFile.write(unknownSentiment.getvalue())
    unknownSentiFile.close() 
if i=='m' or i=='M':
    corpus='mr'
    binary_classification=True
else:
    corpus='reuters'
# Prepare the cat_num_docs dictionary, which stores the number of documents in the
# training set for each category; also form the training set and the test set.

start_time = time.time()

#Here, apart from the Naive Bayes classifier itself, everything is done by nltk

#2) refer to the comments of the get_testset_trainset() function

li = Preprocessor.get_testset_trainset(corpus)
testset = li[1]
trainset = li[0]
li = Preprocessor.startup()
cat_num_docs = li[1]

###--------------------DEBUG STATEMENTS----------------------
#for f in trainset:
 #   print f , FilenameCategoriesDict[f] 

#print "Freedom\n"

#for f in testset:
 #   print f    
###--------------------DEBUG STATEMENTS----------------------
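# A minimal sketch (not from the original script) of how cat_num_docs could be
# turned into class priors for the hand-rolled Naive Bayes step; total_docs and
# cat_prior are illustrative names, not variables defined elsewhere in this code.
total_docs = sum(cat_num_docs.values())
cat_prior = dict((cat, float(n) / total_docs) for cat, n in cat_num_docs.items())
for cat in sorted(cat_prior):
    print cat, "prior:", cat_prior[cat]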
Example #46
def __init__(self):
    self.stack = Stack()
    self.preprocessor = Preprocessor()
    self.memory = {}
    self.usedWrite = False
Example #47
def init():
    global model, stdFormulaList, drvFormulaList, concepts, multiWordDict

    # Get Data
    print "Loading Physics questions from default dataset ..."
    train_data = readFile('./Datasets/Training1.csv')
    validation_data = readFile('./Datasets/Validation1.csv')
    test_data = readFile('./Datasets/Testing1.csv')
    print "Done!"

    # Get Domain Knowledge
    print "Loading Domain Knowledge ..."
    DM.init()
    stdFormulaList = DM.getStdFormulaList()
    drvFormulaList = DM.getDrvFormulaList()
    concepts = DM.getConcepts()
    multiWordDict = DM.getMultiWordDict()
    print "Done!"

    # Generate Matrix
    print "Generating feature matrix for training ..."
    words = pp.parse(train_data[0,0], multiWordDict)
    X_Train = getFeatureVector(words)
    for i in range(1, train_data.shape[0]):
        words = pp.parse(train_data[i,0], multiWordDict)
        X_Train = np.vstack([X_Train, getFeatureVector(words)])

    words = pp.parse(validation_data[0,0], multiWordDict)
    X_Validation = getFeatureVector(words)
    for i in range(1, validation_data.shape[0]):
        words = pp.parse(validation_data[i,0], multiWordDict)
        X_Validation = np.vstack([X_Validation, getFeatureVector(words)])

    words = pp.parse(test_data[0,0], multiWordDict)
    X_Test = getFeatureVector(words)
    for i in range(1, test_data.shape[0]):
        words = pp.parse(test_data[i,0], multiWordDict)
        X_Test = np.vstack([X_Test, getFeatureVector(words)])

    Y_Train = train_data[:,1]
    Y_Validation = validation_data[:,1]
    Y_Test = test_data[:,1]
    print "Done!"

    # Train the model
    print "Training ..."
    model.fit(X_Train, Y_Train)
    print "Done!"

    # Predict the result
    print "\n Training Data:"
    print "Prediction " + str(model.predict(X_Train))
    print "Actual     " + str(Y_Train)
    print "Accuracy   " + str(model.score(X_Train,Y_Train)*100) + "%"

    print "\n Vaildation Data:"
    print "Prediction " + str(model.predict(X_Validation))
    print "Actual     " + str(Y_Validation)
    print "Accuracy   " + str(model.score(X_Validation,Y_Validation)*100) + "%"

    print "\n Test Data:"
    print "Prediction " + str(model.predict(X_Test))
    print "Actual     " + str(Y_Test)
    print "Accuracy   " + str(model.score(X_Test,Y_Test)*100) + "%"
Example #48
def getCategory(text):
    global model, multiWordDict
    words = pp.parse(text, multiWordDict)
    return getClassName(model.predict([getFeatureVector(words)]))
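# A minimal usage sketch, assuming init() has been run first so that the global
# model and multiWordDict are populated; the question text is a made-up example,
# not a row from the Training/Validation/Testing datasets.
if __name__ == '__main__':
    init()
    print getCategory("A ball is dropped from a height of 20 m. How long does it take to hit the ground?")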
Example #49
import sys

class Interpreter:
    stack = None
    preprocessor = None
    memory = None

    Keywords = [';', ':', 'DO', 'LOOP', 'BEGIN', 'UNTIL', 'EXPECT', 'LEAVE', 'DUP', 'SWAP', 'DROP', 'EMIT', 'MOD', 'KEY', 'DEPTH', 'ROLL', 'PICK']

    def __init__(self):
        self.stack = Stack()
        self.preprocessor = Preprocessor()
        self.memory = {}
        self.usedWrite = False

    def interact(self):
        while True:
            try:
                line = raw_input('>>> ')
                line = self.preprocessor.preprocess(line)
                self.process(line)
                if self.usedWrite:
                    sys.stdout.write('\n')
                    self.usedWrite = False
            except Exception as e:
                print e.message

    def push(self, statement):
        try:
            print '>>>', statement
            statement = self.preprocessor.preprocess(statement)
            self.process(statement)
            if self.usedWrite:
                    sys.stdout.write('\n')
                    self.usedWrite = False
        except Exception as e:
            print e.message

    def process(self, line, is_inside_until=False):
        statement = Tokenizer(line.strip(' ').strip('\n'), ' ')
        while statement.has_next():
            token = statement.next()
            if token in ['.', '.s', 'CR']:
                if token == '.':
                    sys.stdout.write(self.stack.pop())
                    self.usedWrite = True
                elif token == 'CR':
                    print
                else:
                    print self.stack
            elif token in self.memory:
                self.process(self.memory[token])
            elif token in ['DUP', '+', '-', '*', '/', 'SWAP', 'DROP', '<', '>', '<=', '>=', '=', 'EMIT', 'MOD', 'KEY',
                           'DEPTH', 'ROLL', 'PICK']:
                import Operators
                Operators.Op[token](self.stack)
            elif token == ':':
                self.function_definition(statement)
            elif token == 'IF':
                self.handle_if(statement)
            elif token == 'DO':
                self.do_loop(statement)
            elif token == 'BEGIN':
                self.until_loop(statement)
            elif token == 'LEAVE':
                if is_inside_until and self.stack.pop() != '0':
                    return False
            elif token == 'EXPECT':
                self.expect()
            elif token.startswith('."'):
                self.print_string(token, statement)
            elif token.isdigit():
                self.stack.push(token)
            elif token.strip() == '':
                pass
            else:
                raise NameError('Invalid Input: ' + token)
        return True

    def print_string(self, current_token, statement):
        if current_token.endswith('"'):
            print current_token[2:-1]
        else:
            # If the string contains white space then we need to connect its parts.
            string = current_token
            while statement.has_next():
                token = statement.next()
                string = string + ' ' + token
                if string.endswith('"'):
                    break
            print string[2:-1]

    def expect(self):
        count = int(self.stack.pop())
        string = ''
        while len(string) < count:
            string = string + raw_input('... ')
        for index in xrange(count):
            self.stack.push(string[index])

    def until_loop(self, statement):
        loop_body = ''
        pair = 1
        while statement.has_next():
            token = statement.next()
            if token == 'UNTIL' and pair == 1:
                pair -= 1
                break
            elif token == 'BEGIN':
                pair += 1
            elif token == 'UNTIL' and pair > 1:
                pair -= 1
            loop_body = loop_body + ' ' + token
        if pair != 0:
            raise NameError('Unmatched UNTIL Block')
        while True:
            # process() returning False means a LEAVE was executed inside the body
            if not self.process(loop_body, is_inside_until=True):
                break
            elif self.stack.top() == '0':
                break

    def do_loop(self, statement):
        loop_body = ''
        pair = 1
        while statement.has_next():
            token = statement.next()
            if token == 'DO':
                pair += 1
            elif token == 'LOOP' and pair > 1:
                pair -= 1
            elif token == 'LOOP' and pair == 1:
                pair -= 1
                break
            loop_body = loop_body + ' ' + token
        if pair != 0:
            raise NameError('Unmatched DO Block')
        start = int(self.stack.pop())
        end = int(self.stack.pop())
        for i in xrange(start, end):
            self.process(loop_body)

    def handle_if(self, statement):
        condition = bool(int(self.stack.pop()))  # True if the top of the stack was != 0, otherwise False
        if_body = ''
        pair = 1
        if condition:
            while statement.has_next():
                token = statement.next()
                if token == 'IF':
                    pair += 1
                elif token == 'ELSE' and pair == 1:  # if pair was one ELSE is for the current if
                    break
                elif token == 'THEN' and pair > 1:  # this THEN closes a nested IF inside the body, not the current one
                    pair -= 1
                elif token == 'THEN' and pair == 1:  # THEN is for the current if
                    pair -= 1
                    break
                if_body = if_body + ' ' + token
            if pair != 0:
                # The IF body ended on ELSE, so skip the ELSE branch up to and
                # including the THEN that closes the current IF.
                while statement.has_next():
                    token = statement.next()
                    if token == 'IF':
                        pair += 1
                    elif token == 'THEN':
                        pair -= 1
                        if pair == 0:
                            break
        else:
            collect = False
            while statement.has_next():
                token = statement.next()
                if token == 'IF':
                    pair += 1
                elif token == 'ELSE' and pair == 1:
                    collect = True
                    continue
                elif token == 'THEN' and pair > 1:
                    pair -= 1
                elif token == 'THEN' and pair == 1:
                    pair -= 1
                    break
                if collect:
                    if_body = if_body + ' ' + token
        if pair != 0:
            raise NameError('Unmatched IF Block')
        self.process(if_body)

    def function_definition(self, statement):
        name = statement.next()
        self.memory[name] = ''
        while statement.has_next():
            token = statement.next()
            # Return when we reach end of the function definition.
            if token == ';':
                return
            self.memory[name] = self.memory[name] + ' ' + token
        raise NameError('Syntax Error')
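# A minimal usage sketch of the Interpreter above; it assumes the Stack,
# Preprocessor, Tokenizer and Operators helpers behave as their names suggest
# (numbers are kept as strings on the stack and the arithmetic operators convert
# them), so treat the expected output as illustrative only.
if __name__ == '__main__':
    interp = Interpreter()
    interp.push(': SQUARE DUP * ;')  # define a new word
    interp.push('5 SQUARE .')        # expected to print 25
    interp.push('2 3 + .')           # expected to print 5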
def loadSentiTokens(path,pathExceptions):

    f = codecs.open(path,"r", "utf-8")
    adjectives = []
    firstLine = f.next()
    exceptions = unicode(loadExceptionTokens(pathExceptions))       
    
    lemmaRegex = ",(.*?)\."     
    flexRegex = "^(.*?),"    
    polarityRegex = "POL:..=(-1|0|1)(?:;|$)"
    posRegex = "PoS=(.*?);"
    
    currentLemma = re.search(lemmaRegex,firstLine).group(1)
    
    pol = re.findall(polarityRegex,firstLine)
    pol1 = int(pol[0])
    
    try:
        pol2 = int(pol[1])
      
    except IndexError:
        pol2 = 0
    
    currentPolarity = str(pol1 + pol2)
    
    currentPos = re.search(posRegex,firstLine).group(1).lower()
    currentFlex = re.search(flexRegex,firstLine).group(1)
    currentFlexions = []
    currentFlexions.append(currentFlex)
    
    if currentFlex not in exceptions and currentFlex != Preprocessor.normalize(currentFlex):
                        
        currentFlexions.append(Preprocessor.normalize(currentFlex))
       
    for line in f:
        
        try:
            if "REV=Amb" not in line:               
                
                lemma = re.search(lemmaRegex,line).group(1)
                #print re.search(lemmaRegex,line).groups()
                
                if lemma != currentLemma:            
                    
                    if currentLemma not in exceptions and currentLemma != Preprocessor.normalize(currentLemma):
                        
                        currentFlexions.append(Preprocessor.normalize(currentLemma))
                                
                    adjectives.append(SentiToken(currentLemma,currentPolarity,currentPos,currentFlexions))
                                        
                    currentLemma = lemma                    
                    
                    #currentPolarity = re.search(polarityRegex,line).group(1)
                    pol = re.findall(polarityRegex,line)
                    
                    pol1 = int(pol[0])
                    
                    try:
                        pol2 = int(pol[1])
                       
                    except IndexError:
                        pol2 = 0
                    
                    currentPolarity = str(pol1 + pol2)
                                        
                    currentPos = re.search(posRegex,line).group(1).lower()
                    currentFlexions = []
                    currentFlex = re.search(flexRegex,line).group(1)
                    currentFlexions.append(currentFlex)    
                    
                    #print "L:", currentLemma,"P:",currentPolarity,"POS:",currentPos,"F:",currentFlex
                    
                    if currentFlex not in exceptions and currentFlex != Preprocessor.normalize(currentFlex):
                        
                        currentFlexions.append(Preprocessor.normalize(currentFlex))
                    
                else:   
                    currentFlex = re.search(flexRegex,line).group(1)
                    currentFlexions.append(currentFlex)
                    
                    #print "l:", lemma, "f:", currentFlex, "p:", currentPos
                    if currentFlex not in exceptions and currentFlex != Preprocessor.normalize(currentFlex):
                        
                        currentFlexions.append(Preprocessor.normalize(currentFlex))
                 
        except Exception:
            pass  # skip malformed lexicon lines
    f.close()    
    
    return adjectives
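# A hedged illustration of the lexicon line format the regexes above expect; the
# sample line below is made up to show what each expression captures, it is not
# taken from the actual lexicon file.
sampleLine = u"lindo,lindo.PoS=Adj;FLEX=ms;POL:N0=1"
print re.search("^(.*?),", sampleLine).group(1)         # inflected form: lindo
print re.search(",(.*?)\.", sampleLine).group(1)        # lemma: lindo
print re.search("PoS=(.*?);", sampleLine).group(1)      # part of speech: Adj
print re.findall("POL:..=(-1|0|1)(?:;|$)", sampleLine)  # polarity values: ['1']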
def generateTextFeatures(sourceFile,destinyFile):
    
    corpus = codecs.open(sourceFile,"r","utf-8")
     
    politicians = getPoliticians()
    sentiTokens = getSentiTokens()
    
    #rulesClassifier = EuroOpinionizers.Rules(politicians,sentiTokens)     
    #naiveClassifier = EuroOpinionizers.Naive(politicians,sentiTokens)    
    multiWordTokenizer = getMultiWordsTokenizer(politicians, sentiTokens)
    listOfTweets = []
    
    i=0
    headers = """
    

@relation twitometro

@ATTRIBUTE ID NUMERIC 
@ATTRIBUTE tweet STRING
@ATTRIBUTE polarity? {-1,0,1}

@data
 
    """

    featuresFile = open(destinyFile,"w") 
    featuresFile.write(headers)
    
    for line in corpus:
        
        tweet = line.replace("\"","\'").split('|')
        
        #skip the first line
        if tweet[0] != 'PERIOD':            
            
            sentence = ''
            
            for block in tweet[TEXT:]:
                sentence = sentence + block
                           
            tokenizedSentence = multiWordTokenizer.tokenizeMultiWords(sentence)
            tokenizedSentence = Preprocessor.removeURLs(tokenizedSentence)
            tokenizedSentence = Preprocessor.removeUsernames(tokenizedSentence)
            tokenizedSentence = tokenizedSentence.replace(tweet[MENTION]," <TARGET> ")
            tokenizedSentence = Preprocessor.removeStopWords(tokenizedSentence)
            tokenizedSentence = Preprocessor.separateSpecialSymbols(tokenizedSentence)
            tokenizedSentence = tokenizedSentence.replace("\n","") 
            tokenizedSentence = tokenizedSentence.replace(","," ")
            
            featuresFile.write(tweet[ID]+",\""+tokenizedSentence+"\","+tweet[SENTIMENT_POLARITY]+"\n")
            print sentence + " --> " + tokenizedSentence
            
            """
            o = Opinion(id = tweet[ID],
                        sentence = unicode(sentence),
                        processedSentence = unicode(tokenizedSentence),
                        target = unicode(tweet[TARGET]),
                        mention = unicode(tweet[MENTION]),
                        polarity = int(tweet[SENTIMENT_POLARITY]))
            
            listOfTweets.append(o)
            """
            
            i = i+1       
            
            """     
            if i!=0 and i%20 == 0:
            
                break
            """
            
    featuresFile.close()
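# A minimal usage sketch; the file names are placeholders for a pipe-separated
# tweet dump and an ARFF destination, not paths taken from the original project.
if __name__ == '__main__':
    generateTextFeatures("DATA/tweets.csv", "DATA/tweet_text_features.arff")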