Example no. 1
def decrypt():
    # It's important to note that we do not check for a successful decryption or a correct password.
    # If intercepted, we do not want to give attackers the ability to distinguish correct from incorrect attempts.
    print("Please enter path to file")
    file_path = u.read_line()
    print("Please enter Encryption pass phrase")
    password = u.read_line()
    # generating AES cipher using inputted password
    key = c.create_key(password)
    # turning .epub into .zip for ease of use
    zip_name = u.preprocess(file_path)
    with zipfile.ZipFile(zip_name, mode='r') as myzip:
        for name in myzip.namelist():
            if name.endswith(".xhtml") or name.endswith(
                    ".css") or name.endswith(".opf") or name.endswith(".ncx"):
                with myzip.open(name) as in_file:
                    contents = in_file.read()
                    out = c.decrypt_AES(key, contents)
                u.update_zip(zip_name, name, out)
            else:
                with myzip.open(name) as in_file:
                    contents = in_file.read()
                    u.update_zip(zip_name, name, contents)

    u.postprocess(zip_name)
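The decrypt routine leans on two helpers, c.create_key and c.decrypt_AES, that this listing does not include. A minimal sketch of what they might look like, assuming PyCryptodome, AES in EAX mode, and a scrypt-derived key (all assumptions, not the original helper module):

from Crypto.Cipher import AES
from Crypto.Protocol.KDF import scrypt

def create_key(password):
    # Hypothetical key derivation: a fixed salt keeps the key deterministic for a
    # given pass phrase, matching the "no correctness check" design noted above.
    return scrypt(password, salt=b'epub-demo-salt', key_len=16, N=2**14, r=8, p=1)

def decrypt_AES(key, data):
    # Assumes encrypt_AES stored a 16-byte nonce in front of the ciphertext.
    nonce, ciphertext = data[:16], data[16:]
    cipher = AES.new(key, AES.MODE_EAX, nonce=nonce)
    # Deliberately no cipher.verify(): a wrong key still "succeeds" and yields garbage.
    return cipher.decrypt(ciphertext)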
Example no. 2
    def predict_tweets(self, docs, predict_log_p=False):
        """
        Take in a list of docs and create a feature array
        of size [nexamples] x [nfeatures]. This can be a sparse matrix.
        This matrix/array is then sent to predict and log_likelihood.
        """
        nfeatures = len(self.features_)
        nexamples = len(docs)
        X = sparse.lil_matrix((nexamples, nfeatures), dtype=np.float64)
        
        stop_words = util.getStopWords()

        for iexample, tweet in enumerate(docs):
            tweet = util.preprocess(tweet)
            words = [w for w in tweet.split() if (len(w)>=3 and w not in stop_words
                                               and re.search(r'^[a-zA-Z][a-zA-Z0-9]*$',w))]
            for f in words:
                if f in self.features_:
                    X[iexample,self.features_.index(f)] += 1
                
        if not predict_log_p:
            return self.predict(X)
        else:
            return self.predict_logprob(X)
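One performance note on the loop above: self.features_.index(f) is a linear scan over the vocabulary for every matched word. A small, hypothetical variant that builds a word-to-column dictionary once per call (assuming the same self.features_ list and helpers) would be:

        # Hypothetical O(1) lookup; replaces the list.index() call in the loop above.
        feature_index = {f: i for i, f in enumerate(self.features_)}
        for iexample, tweet in enumerate(docs):
            tweet = util.preprocess(tweet)
            words = [w for w in tweet.split() if (len(w) >= 3 and w not in stop_words
                                                  and re.search(r'^[a-zA-Z][a-zA-Z0-9]*$', w))]
            for f in words:
                if f in feature_index:
                    X[iexample, feature_index[f]] += 1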
Example no. 3
	def build(self):
		print("BUILDING MODEL...")
		
		if self.is_built:
			return

		self.is_built = True

		generator_factory = self.create_generator()
		discriminator_factory = self.create_discriminator()
		smoothing = 1
		seed = self.options.seed
		kernel = 4

		self.input_rgb = tf.placeholder(tf.float32, shape=(None, None, None, 3), name='input_rgb')

		self.input_color = preprocess(self.input_rgb, colorspace_in=COLORSPACE_RGB, colorspace_out='LAB')

		self.input_gray = tf.image.rgb_to_grayscale(self.input_rgb)

		generator = generator_factory.create(self.input_gray, kernel, seed)
		discriminator_real = discriminator_factory.create(tf.concat([self.input_gray, self.input_color], 3), kernel, seed)
		discriminator_fake = discriminator_factory.create(tf.concat([self.input_gray, generator], 3), kernel, seed, reuse_variables=True)

		generator_ce = tf.nn.sigmoid_cross_entropy_with_logits(logits=discriminator_fake, labels=tf.ones_like(discriminator_fake))
		discriminator_real_ce = tf.nn.sigmoid_cross_entropy_with_logits(logits=discriminator_real, labels=tf.ones_like(discriminator_real) * smoothing)
		discriminator_fake_ce = tf.nn.sigmoid_cross_entropy_with_logits(logits=discriminator_fake, labels=tf.zeros_like(discriminator_fake))

		self.dis_loss_real = tf.reduce_mean(discriminator_real_ce)
		self.dis_loss_fake = tf.reduce_mean(discriminator_fake_ce)
		self.dis_loss = tf.reduce_mean(discriminator_real_ce + discriminator_fake_ce)

		self.gen_loss_gan = tf.reduce_mean(generator_ce)
		self.gen_loss_l1 = tf.reduce_mean(tf.abs(self.input_color - generator)) * 100.0
		self.gen_loss = self.gen_loss_gan + self.gen_loss_l1

		self.sampler = tf.identity(generator_factory.create(self.input_gray, kernel, seed, reuse_variables=True), name='output')
		self.accuracy = pixelwise_accuracy(self.input_color, generator, 'LAB', 2.0)
		self.learning_rate = tf.constant(self.options.lr)

		if self.options.lr_decay and self.options.lr_decay_rate > 0:
			self.learning_rate = tf.maximum(1e-6, tf.train.exponential_decay(
				learning_rate=self.options.lr,
				global_step=self.global_step,
				decay_steps=self.options.lr_decay_steps,
				decay_rate=self.options.lr_decay_rate))

		self.gen_train = tf.train.AdamOptimizer(
			learning_rate=self.learning_rate,
			beta1=0
		).minimize(self.gen_loss, var_list=generator_factory.var_list)

		self.dis_train = tf.train.AdamOptimizer(
			learning_rate=self.learning_rate / 10,
			beta1=0
		).minimize(self.dis_loss, var_list=discriminator_factory.var_list, global_step=self.global_step)

		self.saver = tf.train.Saver()
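build() only assembles the graph; the training loop lives elsewhere in the project. As a rough usage sketch, assuming model is the built object and batch is a float32 array of RGB images (both assumptions), one TensorFlow 1.x training step could look like:

		# Hypothetical TF 1.x training step driving the tensors defined in build().
		with tf.Session() as sess:
			sess.run(tf.global_variables_initializer())
			feed = {model.input_rgb: batch}
			# Alternate a discriminator update and a generator update on the same batch.
			_, d_loss = sess.run([model.dis_train, model.dis_loss], feed_dict=feed)
			_, g_loss, acc = sess.run([model.gen_train, model.gen_loss, model.accuracy], feed_dict=feed)
			print("d_loss %.4f  g_loss %.4f  accuracy %.4f" % (d_loss, g_loss, acc))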
Example no. 4
def storyline():
    # _items = db.appdb.find()
    # items = [items for items in _items]

    # Processing the text to return data: {line: text, pos_line: {n: [], v: [], adj: [{n: JJ}, {n: JJ}], cd: [{n: CD}]}, pronoun_line: text}
    # preprocess also makes sure that verbs are suffixed with _number in the increasing order in which they appear
    if request.method == 'POST':
        data, charDictList = utility.preprocess(request.json)

    else:
        line = {"data": {"line": request.args.get("line")}, "animCharDict": {}}
        linejson = jsonify(line)

        data, charDictList = utility.preprocess(linejson.json)
    if data is None:
        return jsonify({"error": "Text is empty, I wanna hear your story!"})
    result = par.parse(data, charDictList, db)
    return json.dumps(result, default=str)
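As a usage illustration only, and assuming the view above is registered at a /storyline endpoint on a local Flask server with a POST body shaped like the dict the GET branch builds (all assumptions, the route decorator is not shown in the snippet), a client call might look like:

import requests

# Hypothetical endpoint URL and payload shape.
resp = requests.post("http://localhost:5000/storyline",
                     json={"data": {"line": "Alice throws the ball to Bob."},
                           "animCharDict": {}})
print(resp.json())

# The GET branch accepts a single line as a query parameter instead.
resp = requests.get("http://localhost:5000/storyline",
                    params={"line": "Alice throws the ball to Bob."})
print(resp.json())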
Example no. 5
    def find_intent_match(self, responses, user_message):
        if len(responses) == 0:
            return self.idk_response(user_message)
        processed_message = Counter(preprocess(user_message))

        processed_responses = [
            Counter(preprocess(response)) for response in responses
        ]

        similarity_list = [
            compare_overlap(processed_message, rep)
            for rep in processed_responses
        ]

        # If none of the responses really fit what the user is asking:
        if (max(similarity_list) < 1 or len(responses) == 0):
            return self.idk_response(user_message)

        response_index = similarity_list.index(max(similarity_list))
        return responses[response_index]
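preprocess and compare_overlap are imported helpers that this page does not show. A plausible minimal sketch, assuming compare_overlap simply counts how many processed-message tokens also occur in a candidate response (an assumption, not the original implementation):

import re
from collections import Counter

def preprocess(text):
    # Hypothetical tokenizer: lowercase and keep simple word tokens.
    return re.findall(r"[a-z0-9']+", text.lower())

def compare_overlap(user_counter, response_counter):
    # Sum the per-token overlap between the two bags of words (Counter intersection).
    return sum((user_counter & response_counter).values())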
Example no. 6
def encrypt():
    print("Please enter path to file")
    file_path = u.read_line()
    print("Please enter Encryption pass phrase")
    password = u.read_line()
    manifest = u.get_epub_info(file_path)
    actual_content = []
    # generating AES cipher using inputted password
    key = c.create_key(password)
    # turning .epub into .zip for ease of use
    zip_name = u.preprocess(file_path)
    with zipfile.ZipFile(zip_name, mode='r') as myzip:
        for name in myzip.namelist():
            if name.startswith("OEBPS/"):
                actual_content.append(name[6:])
            # encrypt only the files that require encryption, ignore meta and image files
            if name.endswith(".xhtml") or name.endswith(
                    ".css") or name.endswith(".opf") or name.endswith(".ncx"):
                with myzip.open(name) as in_file:
                    contents = in_file.read()
                    out = c.encrypt_AES(key, contents)
                u.update_zip(zip_name, name, out)
            else:
                with myzip.open(name) as in_file:
                    contents = in_file.read()
                    u.update_zip(zip_name, name, contents)

    # Compare the file manifest to files found
    for item in manifest:
        if item in actual_content:
            actual_content.remove(item)

    # Must remove this file as it isn't tracked in the actual manifest
    # actual_content.remove("package.opf")
    # actual_content.remove("")

    if len(actual_content) > 0:
        # print(len(actual_content)+" Files not listed in manifest")
        print("[WARN] Following items not listed in manifest")
        for item in actual_content:
            print(item)

    u.postprocess(zip_name)
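For symmetry with the decryption sketch after Example no. 1, c.encrypt_AES would be the counterpart that prepends the nonce it generated, again assuming PyCryptodome and AES in EAX mode (an assumption, not the original module):

from Crypto.Cipher import AES

def encrypt_AES(key, data):
    # A fresh 16-byte nonce is generated per file and stored in front of the
    # ciphertext so the hypothetical decrypt_AES above can recover it.
    cipher = AES.new(key, AES.MODE_EAX)
    return cipher.nonce + cipher.encrypt(data)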
Example no. 7
def solve(num_wizards, num_constraints, wizards, constraints):
    """
    Write your algorithm here.
    Input:
        num_wizards: Number of wizards
        num_constraints: Number of constraints
        wizards: An array of wizard names, in no particular order
        constraints: A 2D-array of constraints,
                     where constraints[0] may take the form ['A', 'B', 'C']

    Output:
        An array of wizard names in the ordering your algorithm returns
    """
    constraints = utility.preprocess(wizards, num_constraints, constraints)
    opt, name = utility.find_optimizable(constraints)
    print("optimizable wizards: ", len(name))
    print("related constraints: ", len(opt))
    output = utility.strategy1(num_wizards, wizards, constraints)
    order = [wizards[o] for o in output]

    return order
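As a sanity check on the returned ordering, and assuming the usual reading of this puzzle in which a constraint ['A', 'B', 'C'] means wizard C must not be placed between A and B (an assumption; the docstring does not define the semantics), a small verifier might look like:

def count_violations(order, constraints):
    # Hypothetical checker under the assumed constraint semantics.
    position = {name: i for i, name in enumerate(order)}
    violations = 0
    for a, b, c in constraints:
        lo, hi = sorted((position[a], position[b]))
        if lo < position[c] < hi:
            violations += 1
    return violations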
Example no. 8
import gc
import time

from lightgbm import LGBMClassifier
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

from ensemble import Ensemble
from utility import read_data, preprocess

start = time.time()

# Read in our input data
df_train, df_test = read_data()

df_train, df_test = preprocess(df_train, df_test)

id_test = df_test['id'].values

X = df_train.drop(['id', 'target'], axis=1)
X_test = df_test[X.columns].values
X = X.values
y = df_train['target'].values

df_train = None
df_test = None

print("Loaded and prepared data in %.2f seconds" % (time.time() - start))

gc.collect()
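The Ensemble class imported above is project-specific and not shown here. As a hedged illustration of how the prepared arrays can be used, any of the imported base learners fits directly on X and y; the model choice and parameters below are arbitrary, not the original configuration:

# Hypothetical single-model baseline on the prepared arrays.
clf = LGBMClassifier(n_estimators=200, learning_rate=0.05)
clf.fit(X, y)
proba_test = clf.predict_proba(X_test)[:, 1]
print("Scored %d test rows in %.2f seconds" % (len(id_test), time.time() - start))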
Example no. 9

import pandas as pd


def read_data(dataset_path):
    print('Reading Data...')
    data = pd.read_csv(dataset_path)
    X, y = data.data.values, data.intent.values
    return X, y


def pipeline_sent_enc(text):
    return embed([text]).numpy()[0]


if __name__ == '__main__':
    dataset = read_data('dataset/mainModel.csv')
    dataset = preprocess(dataset, {'tfidf': tfidf, 'tokenizer': tokenizer})

    features = {
        # 'lstm_features' : pipeline_lstm_feature,
        'sent_enc': pipeline_sent_enc,
        # 'glove' : pipeline_avg_glove,
        # 'idf_glove' : pipeline_idf_glove,
        # 'tfidf' : pipeline_tfidf_vectorize,
        # 'elmo' : pipeline_elmo
    }

    features = featurize_and_split(dataset, features)
    """

    rf = {
        'model' : OneVsRestClassifier(RandomForestClassifier()),
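The embed callable used by pipeline_sent_enc is not defined in this excerpt; a common setup that matches the embed([text]).numpy()[0] call is a TensorFlow Hub Universal Sentence Encoder (an assumption about this project, not something shown above):

import tensorflow_hub as hub

# Hypothetical definition of the missing `embed` object.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

vector = pipeline_sent_enc("book a table for two tonight")
print(vector.shape)  # (512,) for this particular encoder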
Example no. 10
import re
import math
import sys
from nltk.stem import PorterStemmer
from utility import preprocess, get_dic_term, remove_stop
import time

indexfile = sys.argv[1]
outfile = open("results.ranked.txt", 'w')
queryfile = open(sys.argv[2])

dic = preprocess(indexfile)  # read index file into dict
ps = PorterStemmer()


# Function for calculating the tf-idf score
def tfidf_score(queries, docID, dictionary):
    score = 0
    for i, phase in enumerate(queries):
        terms_index = get_dic_term(phase, dictionary, processed=True)
        df_fren = len(terms_index)
        if docID in terms_index.keys():
            tf_fren = len(terms_index[docID])
        else:
            tf_fren = 0
        if df_fren == 0 or tf_fren == 0:
            s = 0
        else:
            s = (1 + math.log(tf_fren, 10)) * math.log((5000 / df_fren), 10)

        score += s
Example no. 11
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Train Module
# Module to train a CNN model on character data.

import numpy as np

import cnn
import utility

input_arguments = utility.parse_input_arguments(module="train")
images, labels = utility.load_data(input_arguments.image_path)
class_count = len(np.unique(labels))

images, labels = utility.preprocess(images, labels)
images, labels = utility.shuffle(images, labels)
x_train, x_test, y_train, y_test = utility.split(images, labels, test_size=0.2)

cnn = cnn.CNN(x_train.shape[1], x_train.shape[2], x_train.shape[3], class_count)
cnn.summary()

cnn.train(x_train, y_train, epochs=input_arguments.epochs, batch_size=input_arguments.batch_size,
          validation_split=input_arguments.validation_split, output_path=input_arguments.output_path)
cnn.test(x_test, y_test, output_path=input_arguments.output_path)
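utility.preprocess is again project code that this page does not include; for an image classifier it commonly means scaling pixel values and one-hot encoding the labels. A minimal sketch under those assumptions (not the original utility module):

import numpy as np

def preprocess(images, labels):
    # Hypothetical: scale uint8 pixels to [0, 1] and one-hot encode the class labels.
    images = images.astype("float32") / 255.0
    classes = np.unique(labels)
    one_hot = np.eye(len(classes))[np.searchsorted(classes, labels)]
    return images, one_hot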
Example no. 12
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Train Module
# Module to train a DCGAN model on image data.

import dcgan
import utility

input_arguments = utility.parse_input_arguments(module="train")
images = utility.load_images(input_arguments.image_path)
images = utility.preprocess(images)
utility.shuffle(images)

dcgan = dcgan.DCGAN(images.shape[1], images.shape[2], images.shape[3])
dcgan.summary()

dcgan.train(images,
            epochs=input_arguments.epochs,
            batch_size=input_arguments.batch_size,
            saving_frequency=input_arguments.saving_frequency,
            output_path=input_arguments.output_path)
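In the DCGAN case, utility.preprocess typically maps pixel values into [-1, 1] so they match a tanh generator output; a hedged sketch under that assumption (the real utility module is not shown):

import numpy as np

def preprocess(images):
    # Hypothetical: map uint8 pixels from [0, 255] to [-1, 1] for a tanh generator.
    return images.astype(np.float32) / 127.5 - 1.0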