Example #1
def main(components):
    #Connection to Neo4j
    graph = Graph("http://localhost:7474/db/data/cypher", password="******")

    #Obtain the features for the corresponding InterMine model
    feature_array = create_features()

    #Obtaining the list of Genes
    genes, length_genes = get_genes(graph)

    #Treating each Gene as a set
    sets = create_gene_documents(feature_array)

    #Shingles mapped to IDs
    shingles = generate_shingle_id(sets)

    #Get signatures based on the shingle IDs
    signatures = generate_signatures(shingles, components)

    #similarity_matrix = get_similarity_matrix(signatures,len(genes),components)

    #return similarity_matrix

    b = 10  # number of LSH bands
    r = 4   # rows per band

    #Obtain the matrix formed due to Locality Sensitive Hashing
    lsh_matrix = LSH(b, r, signatures, components)

    #Candidate Genes for Close Inspection
    candidate_gene = candidate_genes(lsh_matrix)

    #Use the information regarding candidate genes to obtain similarity scores
    final_similarity = get_similar_genes(candidate_gene, genes)

    return final_similarity
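The LSH(b, r, signatures, components) step above is the standard banding scheme: each MinHash signature of length b * r is split into b bands of r rows, and genes whose hashed bands collide become candidate pairs. The project's LSH and candidate_genes helpers are not shown here, so the following is only a minimal standalone sketch of that idea, not the implementation used above.

# Minimal sketch of the LSH banding idea (illustrative only, not the project's LSH()):
# split each signature into b bands of r rows, hash every band, and treat items
# that share a band bucket as candidate pairs for closer inspection.
from collections import defaultdict
from itertools import combinations

def lsh_candidate_pairs(signatures, b, r):
    buckets = defaultdict(set)
    for item_id, sig in enumerate(signatures):
        for band in range(b):
            chunk = tuple(sig[band * r:(band + 1) * r])
            buckets[(band, chunk)].add(item_id)
    pairs = set()
    for members in buckets.values():
        pairs.update(combinations(sorted(members), 2))
    return pairs

With b = 10 and r = 4 as above, each signature would be expected to have 40 components.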
Example #2
def predict_extended():
    f = request.files['data_file']
    if not f:
        return "No file"

    stream = io.StringIO(f.stream.read().decode("UTF8"), newline=None)
    stream.seek(0)
    result = transform(stream.read())
    df = pd.read_csv(io.StringIO(result))

    # Preprocessing & Feature Building
    X = create_features(df, chunk_size, 5, 1, .5)
    X = pd.DataFrame(columns=features, data=X)
    array_preds = extended_model.predict(X)
    prediction = stats.mode(array_preds)[0][0]
    display_text = "The predicted resolutions for each interval are: {}.  \n Overall, " \
                   "the most commonly predicted resolution is: {}.".format(array_preds, prediction)

    return render_template('index.html', prediction_text_extended=display_text)
Example #3
def tokenize_words(data: np.ndarray) -> np.ndarray:
    """[summary]
    tokenize the words, and add additional features
    Arguments:
        data {[numpy array]} -- the data
    
    Returns:
        [numpy array] -- the tokenized data, with additional features
    """
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                          lower=True)
    lines = data[:, 6]  # the free-text column
    tokenizer.fit_on_texts(lines)
    x = tokenizer.texts_to_sequences(lines)
    x = pad_sequences(x, maxlen=MAX_SEQUENCE_LENGTH)

    additional_2 = np.array(
        [features.create_features(data[i])[0] for i in range(len(data))])
    x = np.hstack((x, additional_2))
    return x
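The np.hstack at the end simply appends the engineered features column-wise to the padded token matrix; a toy shape check of that pattern (the sizes below are illustrative only):

# Toy illustration of the hstack pattern above: (n, MAX_SEQUENCE_LENGTH) token
# sequences concatenated with (n, k) extra features give (n, MAX_SEQUENCE_LENGTH + k).
import numpy as np

padded = np.zeros((3, 100), dtype=int)   # as if MAX_SEQUENCE_LENGTH were 100
extra = np.ones((3, 4))                  # 4 engineered features per row
combined = np.hstack((padded, extra))
print(combined.shape)                    # (3, 104)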
Example #4
def run_model():
    # Get the Data
    df = acquire.get_telco_data()

    # Prepare the Data
    df = prepare.drop_columns(df)
    df = prepare.fix_dtypes(df)

    # Add Features
    df = features.create_features(df)

    # Encode DataFrame
    df = encode.encode_df(df)

    # Select features to be used in the model
    cols = ['contract_type', 
            'tenure',
            'monthly_charges',
            'payment_type',
            'has_internet']

    X = df[cols]
    y = df.churn
    
    # Create and fit the model
    forest = RandomForestClassifier(n_estimators=100,
                                    max_depth=9,
                                    random_state=123).fit(X, y)

    # Create a DataFrame to hold predictions
    results = pd.DataFrame(
        {'Customer_ID': df.customer_id,
         'Model_Predictions': forest.predict(X),
         'Model_Probabilities': forest.predict_proba(X)[:,1]
        })

    # Generate csv
    results.to_csv('model_results.csv')

    return results
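The [:, 1] slice keeps only the second column returned by predict_proba, i.e. the model's probability for the positive (churn) class. A small self-contained illustration on toy data:

# Toy illustration of the predict_proba slice used above: the classifier returns
# one probability column per class, and [:, 1] keeps the positive-class column.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

X_toy = np.array([[0], [1], [2], [3]])
y_toy = np.array([0, 0, 1, 1])
toy_forest = RandomForestClassifier(n_estimators=10, random_state=0).fit(X_toy, y_toy)
print(toy_forest.predict_proba(X_toy).shape)  # (4, 2): columns for class 0 and class 1
print(toy_forest.predict_proba(X_toy)[:, 1])  # probability of class 1 for each row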
Example #5
def main_operation():
    #Connection to Neo4j
    graph = Graph("http://localhost:7474/db/data/cypher", password="******")

    #Obtain the features for the corresponding InterMine model
    feature_array = create_features()

    #Obtaining the list of Genes
    genes, length_genes = get_genes(graph)

    #Treat each gene as a document: a list of its feature tokens
    gene_documents = create_gene_documents(feature_array)

    #Conversion into Feature Vectors
    tfidf_vectors = compute_tfidf(gene_documents)

    #Obtaining the cluster labels
    cluster_labels = compute_clusters(tfidf_vectors)

    #Obtain clusters in the form of gene IDs
    gene_clusters = get_gene_clusters(cluster_labels, genes)

    print(gene_clusters)
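The compute_tfidf and compute_clusters helpers are project-specific and not shown. A hypothetical scikit-learn equivalent, an assumption about what they might do rather than the project's code, could look like this:

# Hypothetical stand-ins for compute_tfidf / compute_clusters (assumed behaviour,
# not the project's implementation), using scikit-learn.
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tfidf(gene_documents):
    # assumes each gene document is one string of space-separated tokens
    return TfidfVectorizer().fit_transform(gene_documents)

def compute_clusters(tfidf_vectors, n_clusters=10):
    # returns one cluster label per gene document
    return KMeans(n_clusters=n_clusters, random_state=0).fit_predict(tfidf_vectors)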
Example #6
    def main_transitions(self):

        self.labels = self.nodes_labels()  #labels(node,(label, amount))
        self.train_person, self.test_person, self.train, self.test = self.train_test_split()

        # self.train_person = pickle.load(open("./dataset/"+self.dataset_name+"/pkl/train_person.pkl","rb"))
        # self.test_person = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/test_person.pkl", "rb"))
        # self.train = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/train_per_year.pkl", "rb"))
        # self.test = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/test_per_year.pkl", "rb"))
        # self.labels = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/labels.pkl", "rb"))

        #creating the network, connecting the neighbors edges and the timed edges
        mg_dict = self.sort_by_years()
        mg = self.create_multigraph(mg_dict)
        self.community_gnx, total_timed_edges = self.create_gnx(mg)

        # creating input for graphs and labels for the GCN (graph per timestamp, labels, features matrices)
        preparations.main_prep(self.dataset_name, self.edges_path,
                               self.nodes_path, self.number_of_unique_labels,
                               self.dataset_time_range[0])
        create_features(
            self.dataset_name,
            self.time_inds)  # creating feature matrices for the GCN

        params_ = {
            "data_name": self.dataset_name,  # parameters of the GCN model
            "net": Net,
            "epochs": Epochs,
            "activation": "relu",
            "dropout_rate": Dropout_Rate,
            "hidden_sizes": Hidden_Sizes,
            "learning_rate": Learning_Rate,
            "weight_decay": Weight_Decay,
            "time_ins": Time_Inds,
            "num_of_classes": self.number_of_unique_labels
        }
        self.similarity_edges = run_trial(params_)  #runs the GCN model

        t = time.time()
        for u, v in self.similarity_edges:  #adding the similarity edges to the graph with similarity factor weight
            self.community_gnx.add_edge(u, v, weight=self.similarity_factor)
        print("adding similarity edges time: ", time.time() - t)
        self.cd = self.com_det()
        self.com_nodes = self.cd_com_nodes()
        print("number of communities: ", len(self.com_nodes))

        # self.cd = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/cd_" + str(self.name) + ".pkl", "rb"))
        # self.com_nodes = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/com_nodes_" + str(self.name) + ".pkl", "rb"))
        # print("number of communities: ", len(self.com_nodes))

        # paint communities
        t = time.time()
        train_com_nodes = self.com_nodes_t(self.train)
        train_com_labels = self.com_label(train_com_nodes)
        self.top_label = self.paint_com(train_com_labels)
        print("number of communities painted: ", len(self.top_label))
        print("paint communities time: {:.4f}".format(time.time() - t))

        self.test_com_nodes = self.com_nodes_t(self.test)
        test_com_label = self.com_label(self.test_com_nodes)
        # top_label_test = self.paint_com(test_com_label)

        self.com_size_test = {
            com: len(test_com_label[com])
            for com in test_com_label
        }
        com_size_train = {
            com: len(train_com_labels[com])
            for com in train_com_labels
        }

        # Accuracy
        self.node_com_top_label = self.node_comlabel()
        # accuracy train
        t = time.time()
        self.check_communal_accuracy_t(self.train, com_size_train, "train")
        print("accuracy train time: {:.5f}".format(time.time() - t))
        # accuracy test
        t = time.time()
        self.check_communal_accuracy_t(self.test, self.com_size_test, "test")
        print("accuracy test time: {:.5f}".format(time.time() - t))
        # total accuracy
        total_accuracy = self.total_painting_accuracy()
        print("total accuracy in painted communities: {:.5f}".format(
            total_accuracy))

        # Entropy
        # entropy train
        t = time.time()
        self.check_communal_entropy(self.train, com_size_train, "train",
                                    train_com_labels)
        print("entropy train time: {:.5f}".format(time.time() - t))
        # entropy test
        t = time.time()
        self.check_communal_entropy(self.test, self.com_size_test, "test",
                                    test_com_label)
        print("entropy test time: {:.5f}".format(time.time() - t))

        # transitions
        self.transitions_results()
        self.plot_compaint()
        self.plot_changes_per_year()
Example #7
import numpy as np
import cPickle

from features import create_features, PROJECT
from parse import load_data
from dict_vectorizer import DictVectorizer

videos, users, reviews = load_data()
orig_X = np.array([(x['date'], x['text'], x['user']) for x in reviews])
feats = create_features(orig_X, None)
v = DictVectorizer(sparse=False)
feats = v.fit_transform(feats)

# feats is now in vectorized format
# v.transform() is the transformation that needs to be used on test data

with open(PROJECT + "db/dictvectorizer.pickle", "wb") as pickle_file:
    cPickle.dump(v, pickle_file)
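As the comments note, the fitted DictVectorizer must be reused on test data through v.transform(). A minimal sketch of that test-time side, reusing the script's own loading pattern (the test review array here is only a stand-in):

# Test-time counterpart (sketch): load the fitted vectorizer back and reuse its
# transform() on features built exactly like the training ones.
import cPickle
import numpy as np

from features import create_features, PROJECT
from parse import load_data

videos, users, reviews = load_data()
orig_X_test = np.array([(x['date'], x['text'], x['user']) for x in reviews])  # stand-in: held-out reviews go here

with open(PROJECT + "db/dictvectorizer.pickle", "rb") as fh:
    v = cPickle.load(fh)

test_feats = create_features(orig_X_test, None)
test_matrix = v.transform(test_feats)  # same feature space as the training matrix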
Example #8
import fix_paths

from models.commit import Commit
import common
import config
import features

import pickle
from sklearn import svm

session = common.Session()

with open(config.SERIALIZED_SVC_LOCATION, 'rb') as svc_file:
  clf = pickle.load(svc_file)

print('Classifying all commits.')
count = 0
for commit in session.query(Commit).all():
  commit.classification = int(clf.predict([features.create_features(commit)])[0])
  session.add(commit)
  count += 1
  if count % 1000 == 0:
    print(count)
session.commit()
Example #9
def get_real_data():
    df = util.load_data_to_dataframe('dataset/val_test_split.json')
    unseen_test = create_features(df)
    test_feats, test_labels, _ = get_feats_labels_ids(unseen_test)
    return test_feats, test_labels
Example #10
"""
  Runs the project.
"""

# Import local methods.
import loading
import cleaning
import features
import training

# Load raw data into datasets.
loading.load()

# Clean data.
cleaning.clean()

# Create features.
features.create_features()

# Train neural network.
training.train_neural_network()

# Train logistic regression.
training.train_logistic_regression()