Example #1
def main(**kwargs):
    kwargs = parse_cl(sys.argv[1:])
    initialize_logger(kwargs['args'].out, kwargs['args'].debug, kwargs['args'].print_debug)
    logger = mylog.getLogger(__name__)
    start = time.time()
    if "prepare" in kwargs:
        logger.info("Run prepare")
        prepare(kwargs["args"])
    elif "cluster" in kwargs:
        logger.info("Run cluster")
        cluster(kwargs["args"])
    elif "report" in kwargs:
        logger.info("Run report")
        report(kwargs["args"])
    elif "predict" in kwargs:
        logger.info("Run predictions")
        predictions(kwargs["args"])
    elif "explore" in kwargs:
        logger.info("Run explore")
        explore(kwargs["args"])
    elif "stats" in kwargs:
        logger.info("Run stats")
        stats(kwargs["args"])
    elif "collapse" in kwargs:
        logger.info("Run collapse")
        collapse_fastq(kwargs["args"])
    elif "simulator" in kwargs:
        logger.info("Run simulator")
        simulate(kwargs["args"])
    logger.info('It took %.3f minutes' % ((time.time()-start)/60))
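The dispatch above relies on parse_cl returning a dict that contains the chosen subcommand name as a key plus an "args" entry holding the parsed options. The real parse_cl is not shown in this example; below is a minimal sketch of one way to produce that shape with argparse subparsers (the option names are illustrative assumptions):

import argparse

def parse_cl(argv):
    parser = argparse.ArgumentParser(prog="tool")
    subparsers = parser.add_subparsers(dest="command")
    for name in ("prepare", "cluster", "report", "predict",
                 "explore", "stats", "collapse", "simulator"):
        sub = subparsers.add_parser(name)
        sub.add_argument("--out", default=".")
        sub.add_argument("--debug", action="store_true")
        sub.add_argument("--print_debug", action="store_true")
    args = parser.parse_args(argv)
    # Expose the chosen subcommand as a key so checks like "prepare" in kwargs work.
    return {args.command: True, "args": args}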
Example #2
def get_test_data(max_size, n_cats):
    with open('word_to_index_top_30000.json', 'r') as f:
        d = json.load(f)

    data = []
    labels = []

    print("collecting test data and labels..")
    with open('test_data.csv', 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile)
        for r in reader:
            words = preprocess_string(r[0], CUSTOM_FILTERS)
            nums = [0] * len(words)
            for i, word in enumerate(words):
                if word in d:
                    nums[i] = d[word]
            data.append(nums)
            labels.append(r[-1])
    print("collected test data and labels succesfully.")

    print("preparing test data and labels..")
    # data[1:] / labels[1:] skip the first row (presumably the CSV header).
    x_test, y_test = prepare(X=data[1:], y=labels[1:], max_size=max_size, n_cats=n_cats, shuffle_data=True)
    print("prepared test data and labels successfully.")

    return x_test, y_test
Example #3
def get_train_data(max_size, n_cats):
    data = []
    labels = []

    print('collecting training data..')
    with open('data.csv', 'r') as f:
        for row in csv.reader(f):
            nums = [0] * len(row)
            for i, d in enumerate(row):
                nums[i] = int(d)
            data.append(nums)
    print('collected training data successfully.')

    print('collecting training labels..')
    with open('labels.csv', 'r') as f:
        for row in csv.reader(f):
            labels.append(int(row[0]))
    print('collected training labels successfully.')

    print("preparing training data and labels..")
    x_train, y_train = prepare(X=data, y=labels, max_size=max_size, n_cats=n_cats, shuffle_data=True)
    print("prepared training data and labels successfully.")

    return x_train, y_train
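Both get_test_data and get_train_data delegate to a prepare helper that is not defined in these snippets. A minimal sketch of what it might look like, assuming it pads or truncates each sequence to max_size, one-hot encodes labels over n_cats classes, and optionally shuffles (an assumption, not the original implementation):

import random
import numpy as np

def prepare(X, y, max_size, n_cats, shuffle_data=False):
    # Pad with zeros (or truncate) so every sequence has length max_size.
    X = [seq[:max_size] + [0] * max(0, max_size - len(seq)) for seq in X]
    y = [int(label) for label in y]
    if shuffle_data:
        pairs = list(zip(X, y))
        random.shuffle(pairs)
        X, y = zip(*pairs)
    # One-hot encode the labels over n_cats classes.
    y_onehot = np.zeros((len(y), n_cats))
    y_onehot[np.arange(len(y)), list(y)] = 1
    return np.array(X), y_onehot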
Example #4
def main(**kwargs):
    kwargs = parse_cl(sys.argv[1:])
    initialize_logger(kwargs['args'].out, kwargs['args'].debug,
                      kwargs['args'].print_debug)
    logger = mylog.getLogger(__name__)
    start = time.time()
    if "prepare" in kwargs:
        logger.info("Run prepare")
        prepare(kwargs["args"])
    elif "cluster" in kwargs:
        logger.info("Run cluster")
        cluster(kwargs["args"])
    elif "report" in kwargs:
        logger.info("Run report")
        report(kwargs["args"])
    elif "predict" in kwargs:
        logger.info("Run predictions")
        predictions(kwargs["args"])
    elif "target" in kwargs:
        logger.info("Run target annotation")
        targets_enrichment(kwargs["args"])
    elif "seqbuster" in kwargs:
        logger.info("Run seqbuster")
        miraligner(kwargs["args"])
    elif "explore" in kwargs:
        logger.info("Run explore")
        explore(kwargs["args"])
    elif "stats" in kwargs:
        logger.info("Run stats")
        stats(kwargs["args"])
    elif "collapse" in kwargs:
        logger.info("Run collapse")
        collapse_fastq(kwargs["args"])
    elif "simulator" in kwargs:
        logger.info("Run simulator")
        simulate(kwargs["args"])
    logger.info('It took %.3f minutes' % ((time.time() - start) / 60))
Example #5
def train(data_dir, review_dir, embedding_dir, model_dir):
    """
    This module uses natural language toolkit (nltk) to divide reviews into single words. Based on that gensim model is
    trained in order to provide embedding vectors.
    """
    embedding_dim = 100

    x_train, x_test, _, _ = prepare(data_dir)
    all_reviews = x_test + x_train
    review_lines = []
    counter = 0
    # Build the translation table and stop-word set once, outside the loop.
    table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    for line in all_reviews:
        tokens = [w.lower() for w in word_tokenize(line)]
        stripped = [w.translate(table) for w in tokens]
        words = [w for w in stripped if w.isalpha() and w not in stop_words]
        review_lines.append(words)
        counter += 1
        if counter % 10000 == 0:
            print(counter, '/', len(all_reviews))

    with open(review_dir, 'w', newline='') as f:
        wr = csv.writer(f)
        wr.writerows(review_lines)

    print(review_lines[0])
    print(review_lines[3])
    print(len(review_lines))

    model = gensim.models.Word2Vec(sentences=review_lines, size=embedding_dim, window=5, workers=4, min_count=10)

    words = list(model.wv.vocab)
    print('Vocabulary size: %d' % len(words))

    model.save(model_dir)

    model.wv.save_word2vec_format(embedding_dir, binary=False)
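Once saved, the text-format vectors can be reloaded without retraining the model. A short usage sketch with gensim's KeyedVectors (the path stands in for whatever was passed as embedding_dir):

from gensim.models import KeyedVectors

# Load the vectors written by save_word2vec_format above and query them.
wv = KeyedVectors.load_word2vec_format('embeddings.txt', binary=False)
print(wv.most_similar('movie', topn=5))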
Example #6
import prepare_data
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np


titanic = prepare_data.prepare("train.csv")
titanic_test = prepare_data.prepare("test.csv")


kf = KFold(titanic.shape[0], n_folds=3, random_state=1)


algorithms = [
    [RandomForestClassifier(random_state=1, n_estimators=10000, min_samples_split=5, min_samples_leaf=2), ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title", "FamilyId"]],
    [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]


predictions = []
for train, test in kf:
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    for alg, predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
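    # Assumed continuation (the snippet is cut off here): average the two
    # classifiers' probabilities for this fold and threshold at 0.5.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

# Stitch the per-fold predictions back together and score accuracy.
predictions = np.concatenate(predictions, axis=0)
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)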
Example #7
            if j == inp.num_of_relay_positions:
                print("|", end='|')
        print()
        if i == inp.num_of_relay_positions:
            print("---" * 20)

    print("========")

    print('optimal value = ', solver.Objective().Value())
    print()
    print("Time = ", solver.WallTime(), " milliseconds")
    return (dict_constant["l"] * solver.Objective().Value(),
            connect_matrix_result, solver.WallTime())


if __name__ == '__main__':
    _dict_constant, _data_path = parse_config()
    logger = init_log()
    paths = glob.glob(_data_path)
    print(paths)
    # paths.reverse()
    for path in paths:
        logger.info("input path %s: ", path)
        _inp, _is_adj_matrix, _distance_matrix = prepare(path)
        result, connect_matrix, t = solve_by_or_tools(_inp, _is_adj_matrix,
                                                      _distance_matrix,
                                                      _dict_constant)
        # logger.info("Connected Matrix: \n%s", connect_matrix)
        logger.info("Result: %s", result)
        logger.info("Time: %s", t)
Example #8
def train_network(data_dir, review_dir, embedding_dir, models_dir, logs_dir, batch,
                  epochs, transfer):
    """
    In this function structure of neural network is defined. All the training takes place here based on provided data.
    """
    embedding_dim = 100

    _, _, y_train, y_test = prepare(data_dir)
    embedding_matrix, tokenizer_obj, num_words = use(embedding_dir)

    with open(review_dir, 'r') as f:
        review_lines = [list(rec) for rec in csv.reader(f, delimiter=',')]

    # Average review length plus a margin of 100 tokens.
    avg_length = sum(len(r) for r in review_lines) // len(review_lines) + 100

    print('Data prepared')
    print('')

    model = Sequential()
    if transfer:
        embedding_layer = Embedding(num_words, embedding_dim,
                                    input_length=avg_length,
                                    trainable=False)
        embedding_layer.build((None,))
        embedding_layer.set_weights([embedding_matrix])
    else:
        embedding_layer = Embedding(num_words, embedding_dim,
                                    input_length=avg_length,
                                    trainable=True)
    model.add(embedding_layer)
    model.add(Flatten())
    model.add(Dense(256, activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='sigmoid'))
    adam = optimizers.Adam(lr=0.001)
    model.compile(loss="binary_crossentropy",
                  optimizer=adam,
                  metrics=['accuracy'])
    checkpointer = ModelCheckpoint(filepath=models_dir,
                                   verbose=1, save_best_only=False, save_weights_only=False, period=1)
    tensorboard = TensorBoard(log_dir=logs_dir.format(time()))

    x_train = review_lines[200000:]
    x_test = review_lines[:200000]

    x_shuffle_train = list(zip(x_train, y_train))
    random.shuffle(x_shuffle_train)
    x_train, y_train = zip(*x_shuffle_train)
    x_train = list(x_train)
    y_train = list(y_train)

    x_shuffle_test = list(zip(x_test, y_test))
    random.shuffle(x_shuffle_test)
    x_test, y_test = zip(*x_shuffle_test)
    x_test = list(x_test)
    y_test = list(y_test)

    y_train = to_categorical(y_train, num_classes=2)
    y_test = to_categorical(y_test, num_classes=2)

    training_batch_generator = Generator(batch, x_train, y_train, tokenizer_obj, avg_length)
    validation_batch_generator = Generator(batch, x_test, y_test, tokenizer_obj, avg_length)

    print('Model prepared, start training...')
    model.fit_generator(generator=training_batch_generator,
                        steps_per_epoch=(300000 // batch),
                        epochs=epochs,
                        verbose=1,
                        validation_data=validation_batch_generator,
                        validation_steps=(200000 // batch),
                        use_multiprocessing=False,
                        max_queue_size=1,
                        callbacks=[checkpointer, tensorboard])

    if transfer:
        model.layers[0].trainable = True
        sgd = optimizers.SGD(lr=0.00001)
        model.compile(loss="binary_crossentropy",
                      optimizer=sgd,
                      metrics=['accuracy'])
        model.fit_generator(generator=training_batch_generator,
                            steps_per_epoch=(300000 // batch),
                            epochs=2,
                            verbose=1,
                            validation_data=validation_batch_generator,
                            validation_steps=(200000 // batch),
                            use_multiprocessing=False,
                            max_queue_size=1,
                            callbacks=[checkpointer, tensorboard])
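Generator is not defined in this snippet. A minimal sketch of what it could look like, assuming a keras.utils.Sequence subclass that turns each batch of token lists into padded integer sequences on the fly (names and behavior are assumptions):

import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import Sequence

class Generator(Sequence):
    def __init__(self, batch_size, texts, labels, tokenizer_obj, max_length):
        self.batch_size = batch_size
        self.texts = texts
        self.labels = labels
        self.tokenizer_obj = tokenizer_obj
        self.max_length = max_length

    def __len__(self):
        # Number of batches per epoch.
        return int(np.ceil(len(self.texts) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.texts[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.labels[idx * self.batch_size:(idx + 1) * self.batch_size]
        # Map token lists to integer ids and pad to a fixed length.
        seqs = self.tokenizer_obj.texts_to_sequences(batch_x)
        return pad_sequences(seqs, maxlen=self.max_length), np.array(batch_y)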
Example #9
import prepare_data
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

titanic = prepare_data.prepare("train.csv")
titanic_test = prepare_data.prepare("test.csv")

kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

algorithms = [
    [RandomForestClassifier(random_state=1, n_estimators=10000,
                            min_samples_split=5, min_samples_leaf=2),
     ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title",
      "FamilyId"]],
    [LogisticRegression(random_state=1),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]],
]
Example #10
    colors = []
    for article in p[:10]:
        colors.append(colorsys.hsv_to_rgb(0.588, 0.2, random.uniform(0.4, 0.7)))
    # stacked_graph(dsets, baseline_fn = min_weighted_wiggles, color_seq='random')
    sg = pystreamgraph.StreamGraph(views, colors=colors, labels=escaped_p)
    sg.draw("generated_figure.svg", "MH370 related articles", show_labels=True, width=1800, height=8400)
    # pl.savefig('generated_figure.png')
    # pl.show()

if __name__ == '__main__':
    nicknames = ['olympics', 'mh370', 'ebola']
    for nickname in nicknames:
        p = pickle.load(open('../page/' + nickname + '_cluster.pickle', 'rb'))
        views = prepare_data.prepare(p)
        v = np.array(views)
        v2 = np.fliplr(np.rot90(v.copy(), -1))
        with open('streamgraph.js/template.tpl') as template:
            t = template.read()
        t = t.replace("<DATA>", str(v2.tolist()))
        t = t.replace("<TITLES>", str(p))
        with open('streamgraph.js/data/' + nickname + '.js', "w") as h:
            h.write(t)
Example #11
import sys, getopt, random
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as f
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window
from prepare_data import prepare
from prepare_data import train_val_test_split
from pyspark.mllib.tree import RandomForest


filepath = "hdfs:/user/ct2522"
data = prepare(filepath)
train, val, test = train_val_test_split(data)
train_col = train.columns
train_col.remove("Popularity")
rf = RandomForest.trainClassifier(train)
rf.predict(val.drop("Popularity"))
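As written, the last two lines would fail: pyspark.mllib's RandomForest.trainClassifier expects an RDD of LabeledPoint plus numClasses, categoricalFeaturesInfo and numTrees, not a DataFrame. A hedged DataFrame-native alternative with pyspark.ml (column names follow the snippet and are assumptions):

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler

# Assemble the non-label columns into a single feature vector.
assembler = VectorAssembler(inputCols=train_col, outputCol="features")
rf = RandomForestClassifier(labelCol="Popularity", featuresCol="features",
                            numTrees=100)
model = rf.fit(assembler.transform(train))
predictions = model.transform(assembler.transform(val))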