Example #1
def spectral_cluster():
    t0 = time()
    S = spectral_clustering(
        loadPickle('./models/trump_sample_affinity.pickle'), n_clusters=100)
    savePickle(S, './models/trump_sample_spectral.pickle')
    print(S)
    print("Spectral clustering took {}s".format(time() - t0))
Example #2
File: HSserver.py Project: MaxineLyu/TGA
    def store_all_org(self, codelist=None):
        if not self.codelist and not codelist: self.codelist = self.tga.getCodeList()
        for code in self.codelist:
            self.store_one_org(code)
        utils.savePickle(var=self.sofar, filename="orgs_before_error")
        self.solve_error()

        return self.sofar
Example #3
def kmeans():
    t0 = time()
    K = k_means(loadPickle('./models/trump_sample_vectors.pickle'),
                n_clusters=100,
                n_jobs=-1)
    savePickle(K, './models/trump_sample_kmeans.pickle')
    print(K)
    print("K-means took {}s".format(time() - t0))
Example #4
def TrainInitialModelSample():

    train_data = read_OnlyTrainData(dropFileName=True)
    train_data = train_data[:20]

    X_train = train_data.drop('label', axis=1).values
    y_train = train_data['label']

    test_data = read_OnlyTestData(dropFileName=True, returnXy=False)
    X_test_set = test_data.drop('label', axis=1).values
    y_test_set = test_data['label']

    svmClassifier = SVC(C=10,
                        kernel='linear',
                        gamma=0.001,
                        probability=True,
                        random_state=500156)
    logRegClassifier = LogisticRegression(random_state=789)
    rfClassifier = RandomForestClassifier(criterion='entropy',
                                          random_state=4528)

    classifiers = {
        type(svmClassifier).__name__: svmClassifier,
        type(logRegClassifier).__name__: logRegClassifier,
        type(rfClassifier).__name__: rfClassifier,
    }

    # Train all 3 classifiers with the initial data samples

    experiments = []
    scores = {}
    for clfname, clf in classifiers.items():

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test_set)
        score = accuracy_score(y_pred, y_test_set)
        pred_probs = clf.predict_proba(X_test_set)

        modelObj = {}
        modelObj['classifier_name'] = clfname
        modelObj['acc_score'] = score
        modelObj['pred_probs'] = pred_probs
        modelObj['clf_obj'] = clf

        scores[clfname] = score
        #save models to pickle
        savePickle(clf, clfname)

        #print scores
        #save initial scores to pickle
        savePickle([scores[clfname]], clfname + '_scores')

        experiments.append(modelObj)

    return scores
Example #5
def generateOracleData(df_samples):

    oracleSamples = []
    savePickle(df_samples, 'finalOracleSamples15')
    for df in df_samples:
        clfname = df['classifier'][0]
        for clip_name in df['sample']:
            instance = {'name': clip_name[-1], 'clip': 'static/datafiles/audiofiles/'+clip_name[-1]+'.mp3'}
            oracleSamples.append(instance)

    return oracleSamples
Example #6
def train():

    data_file = '~/data/twitter/ece901/161112politics0.csv'
    model_base = './models/w2v'
    model_pickle = model_base + '.pickle'
    model_bin = model_base + '.bin'

    data = TweetIterator(data_file, False, 'tokenized_tweet')

    t0 = time()
    model = word2vec.Word2Vec(data, workers=multiprocessing.cpu_count(), sg=1)
    print("Training word2vec model took {}s".format(time() - t0))

    savePickle(model, model_pickle)
    saveWord2Vec(model, model_bin)
Example #7
def get_vecs():

    t0 = time()
    tweet2vec = Tweet2Vec(model_file, char=False, chrd=True, word=True)
    print("Loading model took {}s".format(time() - t0))

    source = pd.read_csv(source_file, header=None, sep=chr(1))
    text = source[0]

    t0 = time()
    M = tweet2vec[text]
    print(M)
    print(M.shape)
    print("Grabbing {} vectors took {}s".format(len(text), time() - t0))

    savePickle(M, './models/trump_sample_vectors.pickle')
Example #8
def PrepareHashtags(source, top_n=2000):
    '''
    This function picks out the `top_n` most frequent hashtags and saves them
    as `./models/hashtags.txt`.

    You can then make a MultiLabelBinarizer object with MakeMLB().
    '''

    print(
        "Processing {} and creating MultiLabelBinarizer object".format(source))

    model_dir = './models'
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    counts = {}
    counts_details = {}

    num_tweets = 0

    for i, hashtags in enumerate(TweetIterator(source, True, 'hashtags')):
        num_tweets += 1
        if num_tweets % 1000 == 0:
            print("Processed {} tweets".format(num_tweets))
        for h in hashtags:
            if h not in counts:
                counts[h] = 1
                counts_details[h] = [i]
            else:
                counts[h] += 1
                counts_details[h].append(i)

    counts_sorted = sorted(counts.keys(), key=lambda x: -counts[x])

    top_hashtags = counts_sorted[:top_n]

    hashtag_file = os.path.join(model_dir, 'hashtags.txt')
    hashtag_count_file = os.path.join(model_dir, 'hashtag_counts.pickle')
    saveList(top_hashtags, hashtag_file)
    savePickle(counts, hashtag_count_file)
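The docstring above describes a two-step workflow: collect the most frequent hashtags, then build the binarizer with MakeMLB() (Example #10). A minimal driver sketch under that reading follows; the tweet CSV path is a placeholder and not taken from the original project:

# Hypothetical usage sketch: the source path is illustrative only.
if __name__ == '__main__':
    PrepareHashtags('./data/tweets.csv', top_n=2000)  # writes ./models/hashtags.txt and ./models/hashtag_counts.pickle
    MakeMLB(top_n=1000)                               # writes ./models/mlb.pickle (see Example #10)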
Example #9
def trainModels(trainingData):

    # get test data of 50 instances
    X_test_set, y_test_set = read_OnlyTestData(dropFileName=True, returnXy=True)

    # get pickle files
    LogRegression = getPicklefile('LogisticRegression')
    Svc = getPicklefile('SVC')
    RFClassifier = getPicklefile('RandomForestClassifier')

    classifiers = [LogRegression, Svc, RFClassifier]

    scores = {}
    for execNo in range(len(trainingData)):

        tdata = trainingData[execNo]
        tdata = mapping(tdata)
        X = tdata.drop(['label', 'audio_name'], axis=1).values
        y = tdata['label']

        clf = classifiers[execNo]
        clfname = type(clf).__name__

        # retrain the model
        clf.fit(X, y)
        y_pred = clf.predict(X_test_set)
        score = accuracy_score(y_pred, y_test_set)
        scores[clfname] = score
        # save models to pickle
        savePickle(clf, clfname)

        #get previous score and save it
        oldscores = getPicklefile(clfname+'_scores')

        oldscores.append(score)
        #print "OLD",oldscores
        # save scores to pickle
        savePickle(oldscores, clfname + '_scores')

    return scores
Example #10
def MakeMLB(top_n=1000):
    '''
    This function produces the "MultiLabelBinarizer" object and saves it as a
    pickle file in the ./models directory

    The MultiLabelBinarizer is the object that turns a list of hashtags into a
    binary vector, used as the labels for our model

    Loads the `top_n` hashtags in `./models/hashtags.txt` and makes a MultiLabelBinarizer object
    '''

    model_dir = './models'
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    output_mlb = os.path.join(model_dir, 'mlb.pickle')
    hashtag_file = os.path.join(model_dir, 'hashtags.txt')

    top_hashtags = loadList(hashtag_file)
    top_hashtags = top_hashtags[:top_n]

    mlb = MultiLabelBinarizer(sparse_output=False).fit([top_hashtags])
    savePickle(mlb, output_mlb)

    print("Final set of hashtags: {}".format(mlb.classes_))
Example #11
def getSamples_toAnnotate():
    constDiff = 20
    data = json.loads(request.data)
    runCount = int(data['stepSampleCount'])
    sampleCounter = runCount + constDiff
    if (sampleCounter == 420):
        return render_template('index.html')
    entropied_samples = compute.computeOracle(sampleCounter)

    logRegSample = entropied_samples[0:5]
    svmSample = entropied_samples[20:25]
    rfSample = entropied_samples[40:45]

    savePickle(logRegSample, 'LogisticRegression_Samples')
    savePickle(svmSample, 'SVC_Samples')
    savePickle(rfSample, 'RandomForestClassifier_Samples')

    print(sampleCounter, " samples")
    return json.dumps([logRegSample, svmSample, rfSample])
Example #12
def preprocessDeepModel(sequencesPath, outputPath, maxLen=None):
    """
    Preprocess the sequences to make them trainable by a deep model.

    Parameters:
    -----------
        sequencesPath (str); where are stored the sequences
        outputPath (str): where will be stored the preprocessed sequenced
        maxLen(int): size of the padded sequences

    Returns:
    --------
        (np.arrays): the training and the validation data, and the training and the validation
                     labels.
    """

    modelPath = "./Resources/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin"

    # Download the model if needed
    if not os.path.isfile(modelPath):
        link = " http://embeddings.org/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin"
        os.system("wget -O " + modelPath + link)

    # Load the model
    w2v = word2vec.load(modelPath)
    vocab = set(w2v.vocab)

    # Load the encoder
    encoder = openPickle("./Data/dict.pkl")
    decoder = {encoder[key]: key for key in encoder}

    #if not os.path.isfile(sequencesPath):
    if not os.path.isfile(outputPath):

        fromOldToNew = reIndexToken(w2v, decoder)

        if not os.path.isfile("./Data/newDict.pkl"):

            newCoder = {"pad": 0, "unk": len(decoder) - 1}
            for key in decoder:
                if fromOldToNew[key] != len(decoder) - 1:
                    newCoder[decoder[key]] = fromOldToNew[key]

            savePickle("./Data/newDict.pkl", newCoder)
        else:
            newCoder = openPickle("./Data/newDict.pkl")

        if not os.path.isfile(sequencesPath):
            raise FileNotFoundError("Please run studyWord2Vec.py")

        sequences = openPickle(sequencesPath)
        sequences = reIndexSequences(sequences, fromOldToNew)
        savePickle(outputPath, sequences)
    else:
        sequences = openPickle(outputPath)

    if maxLen is None:
        maxLength = max([len(seq) for seq in sequences])
    else:
        maxLength = maxLen

    return pad_sequences(sequences, maxlen=maxLength)
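A minimal call sketch for preprocessDeepModel, mirroring the invocation in Example #20; the pickle paths and the maxLen value of 409 are taken from that example and are assumed to exist on disk:

paddedTrainSeq = preprocessDeepModel("./Data/Learn/correctedSequences.pkl",
                                     "./Data/Learn/kerasSequences.pkl", maxLen=409)
print(paddedTrainSeq.shape)  # (n_sequences, 409)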
Example #13
import json
import numpy as np

import params
import utils

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

captions = utils.readCaptions(params.TRAIN_CAPTIONS_PATH)
stemmed_dict = utils.stemming(captions)

# flatten the stemmed captions into a single corpus
corpus = []
for stemmed_captions in stemmed_dict.values():
    corpus.extend(stemmed_captions)

bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=3)
transformer = TfidfTransformer(smooth_idf=False)

counts = bigram_vectorizer.fit_transform(corpus)
transformer.fit(counts)

utils.savePickle(object=bigram_vectorizer, PATH=params.REDUCED_BIGRAM_MODEL)
utils.savePickle(object=transformer, PATH=params.REDUCED_TF_IDF_MODEL)
Example #14
from preprocessing import TfIdfTransformer, sparseBagOfWords

# Linear SVC with bag of words
model = LinearSVC(C=0.1, class_weight="balanced")
_, expectedScore = evaluateModel(model, "./Data/Learn/sequences.pkl", "./Data/Learn/labels.pkl")

preds = getPredictions(model, "./Data/Learn/sequences.pkl", "./Data/Learn/labels.pkl",
                       "./Data/Test/sequences.pkl")

mcPreds = convertLabels(preds)

name = "model__%s__preprocesser__%s__expected__%.4f.pkl"%("LinearSVC(C=0.1, weight_class=balanced)",
                                                          "sparseBagOfWords",
                                                           expectedScore)

savePickle(pjoin("./Results/", name), mcPreds)

# Linear SVC with tf-idf
print("\n" + "#"*50 + "\n")
model = LinearSVC(C=11., class_weight="balanced")
preprocesser = TfIdfTransformer(norm="l1")
_, expectedScore = evaluateModel(model, "./Data/Learn/sequences.pkl", "./Data/Learn/labels.pkl",
                                 preprocesser)

preds = getPredictions(model, "./Data/Learn/sequences.pkl", "./Data/Learn/labels.pkl",
                       "./Data/Test/sequences.pkl")

mcPreds = convertLabels(preds)

name = "model__%s__preprocesser__%s__expected__%.4f.pkl"
name = name%("LinearSVC(C=0.001, weight_class=balanced)", preprocesser, expectedScore)
Example #15
def get_affinity():
    t0 = time()
    A = rbf_kernel(loadPickle('./models/trump_sample_vectors.pickle'))
    savePickle(A, './models/trump_sample_affinity.pickle')
    print(A.shape)
    print("Spectral clustering took {}s".format(time() - t0))
Example #16
model.add(Dense(output_dim=128))
model.compile("nadam", "mae")

print('Training...')
i = 0
for epoch in range(EPOCHS):
    random.seed(42)
    random.shuffle(text)
    print('    EPOCH:', epoch)
    for text_descriptors, img_descriptors in utils.getBatch(
            text, images, BATCH_SIZE):

        print(vstack(text_descriptors).shape)
        """    tmp = list(zip(text_descriptors, img_descriptors))
            random.seed(42)
            random.shuffle(tmp)
            text_descriptors, img_descriptors = zip(*tmp)"""
        t0 = time.time()
        mlp.partial_fit(vstack(text_descriptors), img_descriptors)
        print('        Partial fit {} took: {} min, Score {}'.format(
            i, round((time.time() - t0) / 60, 2), mlp.loss_))
        """    
            mlp.fit(vstack(text_descriptors), img_descriptors)
            """
        """    i = i+1
            if i == 2:
                break"""

print('Saving model...')
utils.savePickle(mlp, params.CNN_MLP)
print('Done!')
Example #17
File: vae.py Project: starstorms9/shape
9   03797390   Mug      214   |  10  02880940   Bowl     186
'''
anchor_vects, labels = getRecons(num_to_get=10, cat_label_index=8)
        
#%% Interpolate between 2 set reconstructions from the previous method
interpolateDesigns(anchor_vects, labels, 3, 5)

#%% Run model on all data to get latent vects and loss. Used for streamlit app and other places.
shape2loss = {}
shape2vec = {}
for sample, label in tqdm(zip(all_voxs, all_mids), unit_scale=True, desc="Saving shape 2 vec: ", unit=" encodes", total=len(all_voxs)):
    sample = tf.cast(sample, dtype=tf.float32)
    shape2vec[label] = model.encode(sample[None,...], reparam=True).numpy()[0]
    shape2loss[label] = model.compute_loss(sample[None,...]).numpy()    
    
ut.savePickle(os.path.join(lg.root_dir,"shape2vec.pkl"), shape2vec)
ut.savePickle(os.path.join(lg.root_dir,"shape2loss.pkl"), shape2loss)    

#%% Shapetime journey code for fun. Shapetime journey methods :
def showRandIndices(num_to_show=100) :
    for i in np.random.randint(0, len(shape2vec), size=num_to_show) :
        vox = shapemodel.decode(shape2vec[mids[i]][None,...], apply_sigmoid=True)[0,...,0]    
        ut.plotVox(vox, step=2, limits = cf_limits, title=i)
        
def journey(journey_length=20, vects_sample=8, max_dist=8, interp_points=6, plot_step=2, start_index=715):
    model.training = False
    journey_vecs = []
    visited_indices = [start_index]
    journey_mids = []
    
    mids = list(shape2vec.keys())
Example #18
File: HSserver.py Project: MaxineLyu/TGA
                print "Code:", code
                org = self.tga.getOrgDetails(code)
                self.sofar.append(org)
                if org.DeliveryNotifications and org.Scopes:
                    if code in code2cid.keys():
                        print "Cid:", code2cid[code]
                        score, qualsno = self.tga.get_org_scale(org)
                        print "Score:", score
                        self.cid2scale[(code2cid[code], code)] = (score, qualsno)
                    else:
                        print "New company {} found!".format(code)
                        self.new_company.append(code)
            except Exception, e:
                self.error[code] = e
                time.sleep(15)
        utils.savePickle(var=self.sofar, filename="all_TGA")
        utils.savePickle(var=self.cid2scale, filename="cid2scale")
        utils.savePickle(var=self.error, filename="scalepipeError")
        utils.savePickle(var=self.new_company, filename="new_company")
#        for cid, code in self.cid2scale.keys():
#            print "Updating company with RTOcode: {}".format(code)
#            self._updateCompany('scale_score', self.cid2scale[(cid, code)], companyID=cid)
#            print "Update finished"
#            self.updated.append(code)
        

    def _addContact2Company(self, vid, companyId):
        print "Adding contacts to company"

        url = 'https://api.hubapi.com/companies/v2/companies/{0}/contacts/{1}?hapikey={2}'.format(str(companyId),
                                                                                                  str(vid),
Example #19
from preprocessing import getMeanVectors
from utils import openPickle, savePickle


if not os.path.isfile("./Data/Learn/embeddedMeanSequences.pkl"):
	encoder = openPickle("./Data/dict.pkl")
	decoder = {encoder[key]: key for key in encoder}

	w2v = word2vec.load("./Resources/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin")

	preprocesser = lambda x: getMeanVectors(x, w2v, decoder)

	sequences = np.array(openPickle("./Data/Learn/correctedSequences.pkl"))

	for i in range(len(sequences) // 5000):
		if i == 0:
			embeddedSeq = preprocesser(sequences[0:5000])
		else:
			embeddedSeq = np.vstack((embeddedSeq, preprocesser(sequences[5000 * i: 5000 * (i+1)])))
		print("Process until i = %s"%i)

	# append the remaining tail; the loop above already covered indices up to 5000 * (i + 1)
	embeddedSeq = np.vstack((embeddedSeq, preprocesser(sequences[5000 * (i + 1):])))

	savePickle("./Data/Learn/embeddedMeanSequences.pkl", embeddedSeq)

model = XGBClassifier(n_estimators=500, max_depth=5, reg_alpha=10., reg_lambda=20.)
_, expectedScore = evaluateModel(model, "./Data/Learn/embeddedMeanSequences.pkl",
	"./Data/Learn/labels.pkl", lambda x:x)
print("")

Example #20
paddedTrainSeq = preprocessDeepModel("./Data/Learn/correctedSequences.pkl",
                                     "./Data/Learn/kerasSequences.pkl", 409)
labels = toBoolList(openPickle("./Data/Learn/labels.pkl"))

trainInd, testInd = getTrainTest(labels)

X_train, X_val = paddedTrainSeq[trainInd], paddedTrainSeq[testInd]
y_train, y_val = labels[trainInd], labels[testInd]

# Test Data
sequences = openPickle("./Data/Test/sequences.pkl")
correcter = openPickle("./Resources/tokenCorrecter.pkl")
correctedSequences = sequencesCorrecter(sequences, correcter)

savePickle("./Data/Test/correctedSequences.pkl", correctedSequences)

paddedSeq = preprocessDeepModel("./Data/Test/correctedSequences.pkl",
                                "./Data/Test/kerasSequences.pkl", 409)

# CNN
CNNPath = "./Resources/CNNWeight/CNNWeight.h5"

cnn = load_model(CNNPath)

## Evaluate score

trainPreds = cnn.predict(X_train).flatten()
print("Training score: %.4f" % f1_score(trainPreds > 0.5, y_train))

valPreds = cnn.predict(X_val).flatten()