Пример #1
0
def main():
	"""Rank the sentences of ``sentences.txt`` with PageRank and print the top ten.

	Steps, in order:
	  1. Run ``preprocess.main()``.
	  2. Read ``sentences.txt`` line by line, stopping at the first blank line
	     or at end of file, wrapping each line in a ``sentence_node``.
	  3. Build a TF-IDF matrix over the sentences and the pairwise
	     ``cosine_similarity`` matrix.
	  4. Build an undirected graph with one node per sentence index and a
	     weighted edge for every pair with non-zero similarity.
	  5. Run ``nx.pagerank`` (alpha=0.85) and print the ten best
	     ``(index, score)`` pairs followed by the corresponding sentences.
	"""
	preprocess.main()
	nodes = []
	with open('sentences.txt') as f:
		for line in f:
			# A blank line terminates the input early (end of file ends the loop).
			if line == '\n':
				break
			nodes.append(sentence_node(0, line.strip('\n')))
	# Single-argument print(...) behaves identically on Python 2 and Python 3.
	print(len(nodes))
	sentences = [node.sentence for node in nodes]

	tfidf_vectorizer = TfidfVectorizer()
	tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
	similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

	G = nx.Graph()
	G.add_nodes_from(range(len(nodes)))
	for i in range(len(nodes)):
		# Only visit j > i: the graph is undirected, so each pair is added once.
		for j in range(i + 1, len(nodes)):
			if similarity_matrix[i][j] != 0:
				G.add_edge(i, j, weight=similarity_matrix[i][j])

	# NOTE: a leftover pdb.set_trace() breakpoint used to sit here; it stopped
	# every run and has been removed.
	for i in range(len(nodes)):
		if len(G[i]) == 0:
			print("No out edges")
	pr = nx.pagerank(G, alpha=0.85)
	sorted_pr = sorted(pr.items(), key=operator.itemgetter(1), reverse=True)
	print(sorted_pr[:10])
	for item in sorted_pr[:10]:
		print(nodes[item[0]].sentence)
Пример #2
0
def preprocess_lm_data(data_dir):
    """Run the preprocessing entry point on the language-model splits in ``data_dir``.

    Passes ``--only-source`` together with the ``train.out`` / ``valid.out`` /
    ``test.out`` prefixes located in ``data_dir`` and uses ``data_dir`` itself
    as ``--destdir``.
    """
    parser = options.get_preprocessing_parser()

    argv = ['--only-source']
    split_prefixes = (
        ('--trainpref', 'train.out'),
        ('--validpref', 'valid.out'),
        ('--testpref', 'test.out'),
    )
    for flag, file_name in split_prefixes:
        argv += [flag, os.path.join(data_dir, file_name)]
    argv += ['--destdir', data_dir]

    preprocess.main(parser.parse_args(argv))
Пример #3
0
def preprocess_lm_data(data_dir):
    """Preprocess the ``*.out`` language-model files stored in ``data_dir``.

    The parser comes from ``preprocess.get_parser()``; the parsed arguments
    (``--only-source`` plus train/valid/test prefixes, destination
    ``data_dir``) are handed to ``preprocess.main``.
    """
    parser = preprocess.get_parser()

    train_prefix = os.path.join(data_dir, 'train.out')
    valid_prefix = os.path.join(data_dir, 'valid.out')
    test_prefix = os.path.join(data_dir, 'test.out')

    argv = [
        '--only-source',
        '--trainpref', train_prefix,
        '--validpref', valid_prefix,
        '--testpref', test_prefix,
        '--destdir', data_dir,
    ]
    args = parser.parse_args(argv)
    preprocess.main(args)
 def preprocess_data(self, data_dir):
     """Run ``preprocess.main`` on the ``in``/``out`` parallel data in ``data_dir``.

     Uses the ``train`` / ``valid`` / ``test`` prefixes inside ``data_dir``,
     sets both vocabulary thresholds to ``'0'`` and writes to ``data_dir``.
     """
     parser = preprocess.get_parser()

     argv = ['--source-lang', 'in', '--target-lang', 'out']
     for flag, split in (('--trainpref', 'train'),
                         ('--validpref', 'valid'),
                         ('--testpref', 'test')):
         argv.extend([flag, os.path.join(data_dir, split)])
     argv.extend([
         '--thresholdtgt', '0',
         '--thresholdsrc', '0',
         '--destdir', data_dir,
     ])

     preprocess.main(parser.parse_args(argv))
Пример #5
0
def preprocess_lm_data(data_dir):
    """Invoke the preprocessing step for monolingual data under ``data_dir``.

    Builds the command line (``--only-source``, the three ``*.out`` split
    prefixes and ``--destdir data_dir``), parses it with the preprocessing
    parser and forwards the result to ``preprocess.main``.
    """
    parser = options.get_preprocessing_parser()

    def in_data_dir(name):
        # Resolve a split file name relative to data_dir.
        return os.path.join(data_dir, name)

    argv = ["--only-source"]
    argv += ["--trainpref", in_data_dir("train.out")]
    argv += ["--validpref", in_data_dir("valid.out")]
    argv += ["--testpref", in_data_dir("test.out")]
    argv += ["--destdir", data_dir]

    parsed = parser.parse_args(argv)
    preprocess.main(parsed)
Пример #6
0
def preprocess_translation_data(data_dir, extra_flags=None):
    """Run the preprocessing entry point on the ``in``->``out`` data in ``data_dir``.

    Args:
        data_dir: directory holding the ``train`` / ``valid`` / ``test``
            prefixes; also used as ``--destdir``.
        extra_flags: optional list of additional command-line flags appended
            after the fixed ones.
    """
    parser = options.get_preprocessing_parser()

    argv = [
        '--source-lang', 'in',
        '--target-lang', 'out',
        '--trainpref', os.path.join(data_dir, 'train'),
        '--validpref', os.path.join(data_dir, 'valid'),
        '--testpref', os.path.join(data_dir, 'test'),
        '--thresholdtgt', '0',
        '--thresholdsrc', '0',
        '--destdir', data_dir,
    ]
    if extra_flags:
        # List concatenation (not extend) keeps the original type requirements.
        argv = argv + extra_flags

    preprocess.main(parser.parse_args(argv))
Пример #7
0
def preprocess_translation_data(data_dir, extra_flags=None):
    """Preprocess the ``in``/``out`` translation splits found in ``data_dir``.

    The parser is obtained from ``preprocess.get_parser()``. ``extra_flags``,
    when given, is a list of further command-line flags appended to the
    fixed argument list.
    """
    parser = preprocess.get_parser()

    base_args = ['--source-lang', 'in', '--target-lang', 'out']
    for flag, split in (('--trainpref', 'train'),
                        ('--validpref', 'valid'),
                        ('--testpref', 'test')):
        base_args += [flag, os.path.join(data_dir, split)]
    base_args += ['--thresholdtgt', '0', '--thresholdsrc', '0']
    base_args += ['--destdir', data_dir]

    extras = extra_flags or []
    args = parser.parse_args(base_args + extras)
    preprocess.main(args)
Пример #8
0
def test_fl_sms(isolated_filesystem):
    """Download the SMS spam dataset, preprocess it and execute the notebook.

    Changes into the example directory, fetches and unpacks the dataset
    archive there, runs ``preprocess.main()`` and finally executes the
    notebook for a single epoch, checking the type of the returned object.
    """
    os.chdir("advanced/Federated SMS Spam prediction/")
    Path("data").mkdir(parents=True, exist_ok=True)

    dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
    urllib.request.urlretrieve(dataset_url, "data.zip")

    # Unpack every archive member into the current directory.
    with ZipFile("data.zip", "r") as archive:
        archive.extractall()

    # Imported only after chdir and extraction, as in the original.
    import preprocess

    preprocess.main()

    notebook_params = {"epochs": 1}
    result = pm.execute_notebook(
        "Federated SMS Spam prediction.ipynb",
        "/dev/null",
        parameters=notebook_params,
        timeout=300,
    )
    assert isinstance(result, nbformat.notebooknode.NotebookNode)
Пример #9
0
def createDataDump():
    """Collect the preprocessed corpus and pickle it to ``yyy_all_data.pkl``.

    ``preprocess.main()`` must return a 3-tuple, stored under the keys
    ``docList``, ``fullText`` and ``classDict``. The vocabulary built by
    ``createVocabList`` from ``docList`` is stored under ``vocabList``.
    """
    data = {}
    data['docList'], data['fullText'], data['classDict'] = preprocess.main()
    data['vocabList'] = createVocabList(data['docList'])
    # The context manager closes the file even if pickling fails.
    with open('yyy_all_data.pkl', 'wb') as f:
        pickle.dump(data, f)
Пример #10
0
def get_all_tweets(screen_name):
	"""Download a user's timeline, clean each tweet and write the result to CSV.

	Nothing is downloaded or written unless the user has a non-empty
	``location`` and ``lang == "en"``. Otherwise the timeline is fetched in
	pages of 200 tweets, each tweet text is UTF-8 encoded and passed through
	``main``, and the rows are written to ``<screen_name>_tweets.csv`` inside
	the hard-coded output directory.

	A user with an empty timeline yields a CSV containing only the header row
	(previously this raised IndexError).

	Returns None.
	"""
	# Twitter only allows access to a user's most recent 3240 tweets with this method.

	# Authorize twitter, initialize tweepy.
	auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
	auth.set_access_token(access_key, access_secret)
	api = tweepy.API(auth)

	user = api.get_user(screen_name = screen_name)
	location = user.location
	lang = user.lang
	print(location)
	if not (location and lang == "en"):
		return
	print(1)

	# Holds all the tweepy Tweets.
	alltweets = []

	# Initial request for the most recent tweets (200 is the maximum allowed count).
	new_tweets = api.user_timeline(screen_name = screen_name, count=200)
	# Keep grabbing tweets until a request comes back empty.
	while new_tweets:
		alltweets.extend(new_tweets)
		# Id of the oldest tweet less one; used as max_id to prevent duplicates.
		oldest = alltweets[-1].id - 1
		new_tweets = api.user_timeline(screen_name = screen_name, count=200, max_id=oldest)

	# Clean every tweet text, then build the 2D array that populates the csv.
	result = [main(tweet.text.encode("utf-8")) for tweet in alltweets]
	outtweets = [
		[tweet.id_str, tweet.created_at, cleaned, location, screen_name]
		for tweet, cleaned in zip(alltweets, result)
	]

	# Write the csv.
	direct = '/home/minghao/Downloads/big data proj/unsupervised/data_unsupervised'
	with open(direct+'/%s_tweets.csv' % screen_name, 'wb') as f:
		writer = csv.writer(f)
		writer.writerow(["id","created_at","text","location","screen_name"])
		writer.writerows(outtweets)
Пример #11
0
def eval_face(input_dir):
    """
    Recognize the face on a picture.

    input: location of the picture. (str)
    output: whatever ``main`` returns for this input; per the original notes
    this is the predicted label of the picture. (str)

    Error behaviour listed in the original notes (not enforced in this
    function itself - confirm in ``pre.main`` / ``main``):
    + picture does not exist / is not an image - type error
    + no positive match to any of the labels - nonexistent error
    """
    # Preprocess the photo; input_dir is passed as both the first and the
    # second argument, together with the constant 180.
    pre.main(input_dir, input_dir, 180)

    # Evaluate the photo with the model; the final flag is False here.
    prediction = main(
        input_dir,
        conf["model_path"],
        conf["classifier_output_path"],
        conf["batch_size"],
        conf["num_threads"],
        conf["num_epochs"],
        conf["min_num_images_per_class"],
        conf["split_ratio"],
        False,
    )
    return prediction
def main():
    """Load the preprocessed questions, tokenize them and build the model.

    Nothing is returned; the constructed model is only bound to a local name
    in the visible code.
    """
    # NOTE(review): `raw_data` appears twice in the target list, so the first
    # value returned by preprocess.main() is overwritten by the second.
    # Probably a typo for two distinct names - confirm against preprocess.main.
    raw_data, raw_data, duplicate_sets, question_texts = preprocess.main()
    # One category per entry of duplicate_sets.
    number_of_categories = len(duplicate_sets)

    tokenized_sentences, word_index = tokenize_data(question_texts.values())
    # Y_processed = to_categorical(np.asarray(Y_raw), 2)

    embedded_sequences = make_embedding_layer(word_index)
    model = make_model(embedded_sequences, number_of_categories)
Пример #13
0
    def test_train_mode(self):
        """Run the pipeline in train mode and verify its counters and output.

        Every per-step counter (first-claim parsing, feature creation,
        labelling, train/eval/test split, proto creation) must add up to
        ``TOTAL_RECORDS``; one written example is then inspected.
        """
        pipeline = TestPipeline()
        # Per-run output directory (named after the current timestamp),
        # removed again on cleanup.
        output_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time())))
        self.addCleanup(shutil.rmtree, output_dir)

        # PipelineStateMatcher verifies the final pipeline state ("Done"
        # according to the original comment).
        verifiers = [PipelineStateMatcher()]
        run_options = dict(
            project=PROJECT,
            output_path=output_dir,
            on_success_matcher=all_of(*verifiers),
            runner='DirectRunner',
        )

        result = preprocess.main(
            pipeline.get_full_options_as_args(**run_options),
            query=self.TEST_QUERY,
            await_completion=True)

        expected = self.TOTAL_RECORDS

        # Counter coming out of the GetFirstClaim step.
        self.assertEqual(
            expected, get_pipeline_metric(result, 'parse_firstclaim_success'))

        # Counter coming out of the AddFeatures step.
        self.assertEqual(
            expected, get_pipeline_metric(result, 'create_features_success'))

        # AddLabel step: broad and narrow labels together cover every record.
        broad = get_pipeline_metric(result, 'add_label_broad')
        narrow = get_pipeline_metric(result, 'add_label_narrow')
        self.assertEqual(expected, broad + narrow)

        # Train/eval/test split counters together cover every record.
        split_total = sum(
            get_pipeline_metric(result, name)
            for name in ('train_cnt', 'eval_cnt', 'test_cnt'))
        self.assertEqual(expected, split_total)

        # Protos created across the three indexed outputs match as well.
        proto_total = sum(
            get_pipeline_metric(result, 'create_proto_success', index=idx)
            for idx in range(3))
        self.assertEqual(expected, proto_total)

        # Open a tf Example and check its fields.
        example = read_example_proto(output_dir)
        for feature_name in preprocess.FEATURE_NAMES:
            value = get_tf_feature(example, feature_name)
            self.assertGreaterEqual(value, 0)
        # The label feature must be present and be one of the known labels.
        label = get_tf_feature(example, 'label', 'bytes_list')
        self.assertIn(label, ['broad', 'narrow'])
Пример #14
0
def preprocess_translation_data(data_dir, extra_flags=None):
    """Run preprocessing for the ``in``->``out`` language pair in ``data_dir``.

    Args:
        data_dir: directory containing the ``train`` / ``valid`` / ``test``
            prefixes; it is also passed as ``--destdir``.
        extra_flags: optional list of extra command-line flags appended to
            the fixed arguments.
    """
    parser = options.get_preprocessing_parser()

    split_args = []
    for flag, split in (("--trainpref", "train"),
                        ("--validpref", "valid"),
                        ("--testpref", "test")):
        split_args += [flag, os.path.join(data_dir, split)]

    fixed_args = (
        ["--source-lang", "in", "--target-lang", "out"]
        + split_args
        + ["--thresholdtgt", "0", "--thresholdsrc", "0", "--destdir", data_dir]
    )
    parsed = parser.parse_args(fixed_args + (extra_flags or []))
    preprocess.main(parsed)
Пример #15
0
def infer_anomaly_model(clf, infer_data):
    """Preprocess ``infer_data``, scale it and return ``clf``'s predictions.

    Args:
        clf: fitted estimator exposing ``predict``.
        infer_data: input forwarded unchanged to ``preprocess.main``.

    Returns:
        The value returned by ``clf.predict`` on the scaled features (it is
        also printed).
    """
    print("<<<<<< preprocess data")
    frame = preprocess.main(infer_data)

    # Every column except the first one is used as a feature, cast to int.
    features = np.array(frame.iloc[:, 1:]).astype('int')

    # Normalize with the scaler stored at ./scaler.gz.
    scaler = joblib.load('./scaler.gz')
    scaled = scaler.transform(features)

    predictions = clf.predict(scaled)
    print(predictions)
    return predictions
Пример #16
0
def add_face(input_dir):
    """
    Retrain the model to add a new face.

    input: location of the images. (str)
    output: nothing is returned by this function; the original notes mention
    an error code / exception, which would have to come from the callees.

    Error behaviour listed in the original notes (not enforced in this
    function itself - confirm in ``pre.main`` / ``main``):
    + pictures_folder is not a folder - type error
    + pictures_folder contains a non-image - fail silently and continue
      for the rest
    + pictures_folder does not have [enough] pictures - exception and exit
    + pictures_folder contains pictures that are not from the same people -
      this kills the model.
    """
    # Preprocess the photos; input_dir is passed as both the first and the
    # second argument, together with the constant 180.
    pre.main(input_dir, input_dir, 180)

    # Per the original notes: load the original embeddings, retrain the
    # model, and save the new model and embeddings in the same directory.
    # The final flag is True here.
    retrain_args = (
        input_dir,
        conf["model_path"],
        conf["classifier_output_path"],
        conf["batch_size"],
        conf["num_threads"],
        conf["num_epochs"],
        conf["min_num_images_per_class"],
        conf["split_ratio"],
        True,
    )
    main(*retrain_args)
Пример #17
0
def main(input_file, file_type, label_col, model_file):
    """Preprocess the input, fit and persist a scaler, then test each model.

    Args:
        input_file: path forwarded to the preprocessing routine.
        file_type: ``'file'`` to use ``preprocess.main`` or ``'folder'`` to
            use ``preprocess.process_file_list``.
        label_col: label column forwarded to ``split_data``.
        model_file: forwarded to ``model_test``.

    Raises:
        ValueError: if ``file_type`` is neither ``'file'`` nor ``'folder'``.
            Previously this left ``df`` unbound and failed later with an
            unrelated NameError.

    Side effects:
        Writes the fitted ``MinMaxScaler`` to ``./scaler.gz``.
    """
    if file_type not in ('file', 'folder'):
        raise ValueError(
            "file_type must be 'file' or 'folder', got %r" % (file_type,))

    print("<<<<<< preprocess data")
    if file_type == 'file':
        df = preprocess.main(input_file)
    else:
        df = preprocess.process_file_list(input_file)

    print("<<<<< data split")
    y_train, y_test, X_train, X_test = split_data(df, label_col)

    # Data normalization: fit on the training split only, then persist the
    # scaler to ./scaler.gz.
    mm = MinMaxScaler()
    mm.fit(X_train)
    joblib.dump(mm, './scaler.gz')

    X_train_std = mm.transform(X_train)
    X_test_std = mm.transform(X_test)

    for model_name in ['KNN', 'XGBOD']:
        print("<<<<< model: ", model_name)
        model_test(model_name, y_train, y_test, X_train_std, X_test_std,
                   model_file, '0')
Пример #18
0
    def test_inference_mode(self):
        """Run the pipeline in inference mode and verify counters and output.

        In this mode the written example must carry every feature but no
        label.
        """
        pipeline = TestPipeline()
        # Per-run output directory (named after the current timestamp),
        # removed again on cleanup.
        output_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time())))
        self.addCleanup(shutil.rmtree, output_dir)

        # PipelineStateMatcher verifies the final pipeline state ("Done"
        # according to the original comment).
        verifiers = [PipelineStateMatcher()]
        run_options = dict(
            project=PROJECT,
            output_path=output_dir,
            on_success_matcher=all_of(*verifiers),
            runner='DirectRunner',
            pipeline_mode='inference',
        )

        result = preprocess.main(
            pipeline.get_full_options_as_args(**run_options),
            query=self.TEST_QUERY,
            await_completion=True)

        # Both the GetFirstClaim counter and the proto-creation counter must
        # account for every input record.
        for metric_name in ('parse_firstclaim_success', 'create_proto_success'):
            self.assertEqual(
                self.TOTAL_RECORDS, get_pipeline_metric(result, metric_name))

        # Open a tf Example and check its fields.
        example = read_example_proto(output_dir)
        for feature_name in preprocess.FEATURE_NAMES:
            value = get_tf_feature(example, feature_name)
            self.assertGreaterEqual(value, 0)

        # No label feature may be present in inference mode.
        with self.assertRaises(IndexError):
            get_tf_feature(example, 'label', 'bytes_list')
Пример #19
0
def main(argv):
    '''
    Controls the over-arching implementation of the algorithms.

    argv is indexed positionally:
      argv[0] - directory handed to the JSON parser and the observation step
      argv[1] - features object forwarded to every stage
      argv[2] - object whose ``log_reg`` / ``svm`` attributes select which
                algorithms to run

    The results of every selected algorithm are printed. Previously only the
    last one was shown, and selecting none crashed with an unbound variable.
    '''

    directory = argv[0]
    features = argv[1]
    algorithms = argv[2]

    # parsing
    print("parsing json data...")
    clusters, order, data, test_clusters, test_order, test_data, corpusdict = parse_json.main([directory])

    # preprocessing
    vocab = preprocess.main([features, corpusdict])

    # featurization step 1
    print("generating observations and features...")
    train_scores = observations.main([clusters, order, data, directory, features, vocab])
    test_scores = observations.main([test_clusters, test_order, test_data, directory, features, vocab])

    # featurization step 2
    print("generating training and testing data...")
    train_data, train_target = features_and_labels.main([train_scores, features])
    test_data, test_target = features_and_labels.main([test_scores, features])

    # modeling
    print("running algorithms...")
    model_inputs = [train_data, train_target, test_data, test_target]
    all_results = []
    if algorithms.log_reg:
        _, perform_results = log_reg.main(model_inputs)
        all_results.append(perform_results)
    if algorithms.svm:
        _, perform_results = svm.main(model_inputs)
        all_results.append(perform_results)

    # results
    print("Algorithm details and Results:")
    if not all_results:
        print("No algorithm was selected.")
    for perform_results in all_results:
        print(perform_results)
def main():
    """
    Manual smoke test for the feature extraction classes.

    Loads ``all_tweets.txt`` through ``preprocess.main`` and prints the most
    frequent bigrams and unigrams, then the most frequent hashtags ('#') and
    user names ('@') together with the share of tweets containing neither.

    Written for Python 2 (print statements).
    :return: None
    """
    import preprocess
    ftr = FeatureExtraction(6)
    filename = "all_tweets.txt"
    lines = preprocess.main(filename)

    # Each entry's second element (line[1]) is joined with spaces - presumably
    # the token list of one tweet; verify against preprocess.main.
    all_tweets = " ".join([" ".join(line[1]) for line in lines])

    print "The most frequent bigrams are :", ftr.most_frequent_bigrams(all_tweets)
    print "The most frequent unigrams are :", ftr.most_frequent_unigrams(all_tweets)

    hashtag_dic = PatternsFeatures().pattern_classifier(lines, '#')

    print 'The 10 most frequent hashtags', PatternsFeatures().get_most_frequent_pattern(hashtag_dic)
    # NOTE(review): divides by len(lines); an empty input raises ZeroDivisionError.
    print "number of tweets without hashtag is %d, it's %d percent of the data set" % (len(hashtag_dic['no_pattern_tweet']), int(100*len(hashtag_dic['no_pattern_tweet'])/len(lines)))

    name_dic = PatternsFeatures().pattern_classifier(lines, '@')

    print 'The 10 most frequent usernames: ', PatternsFeatures().get_most_frequent_pattern(name_dic)
    # NOTE(review): same division by len(lines) as above.
    print "number of tweets without a user name is %d, it's %d percent of the data set" % (len(name_dic['no_pattern_tweet']), int(100*len(name_dic['no_pattern_tweet'])/len(lines)))
Пример #21
0
def lm_scoring(preprocess_directory, bpe_status, gen_output, pre_gen,
               cur_lm_dict, cur_lm_name, cur_language_model, cur_lm_bpe_code,
               batch_size, lm_score_file, target_lang, source_lang, prefix_len=None):
    """Score generated hypotheses with a language model and save the output.

    Each branch runs ``preprocess.main`` on a target-language file using the
    LM dictionary, then runs ``eval_lm.main`` on the result with stdout
    redirected into ``lm_score_file``.

    ``bpe_status`` selects the branch:
      * ``"no bpe"``    - writes the de-BPE'd source/hypothesis/reference
                          files first and scores ``rescore_data_no_bpe``.
      * ``"shared"``    - scores the existing ``rescore_data`` file; this
                          branch does not pass ``--max-tokens``.
      * ``"different"`` - writes the de-BPE'd files, re-applies the LM's own
                          BPE codes through the subword-nmt script, and
                          scores the re-encoded file.
    Any other value falls through all branches and nothing is done.

    ``prefix_len`` may only be given together with ``bpe_status ==
    "different"`` (enforced by an assert); it is not otherwise used in this
    function. ``cur_lm_name`` is not used in this function.

    Returns None.
    """
    if prefix_len is not None:
        assert bpe_status == "different", "bpe status must be different to use prefix len"
    if bpe_status == "no bpe":
        # run lm on output without bpe
        # NOTE(review): the output suffixes here are hard-coded to ".de"/".en"
        # while --trainpref below uses target_lang - confirm they agree for
        # language pairs other than de-en.
        write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo,
                          gen_output.no_bpe_target, pre_gen+"/rescore_data_no_bpe.de",
                          pre_gen+"/rescore_data_no_bpe.en", pre_gen+"/reference_file_no_bpe")

        preprocess_lm_param = ["--only-source",
                               "--trainpref", pre_gen+"/rescore_data_no_bpe."+target_lang,
                               "--srcdict", cur_lm_dict,
                               "--destdir", preprocess_directory]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)

        # The hypotheses were preprocessed as the "train" split above, hence
        # --gen-subset train.
        eval_lm_param = [preprocess_directory,
                         "--path", cur_language_model,
                         "--output-word-probs",
                         "--batch-size", str(batch_size),
                         "--max-tokens", "1024",
                         "--sample-break-mode", "eos",
                         "--gen-subset", "train"]

        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

        # eval_lm reports through stdout; capture it in lm_score_file.
        with open(lm_score_file, 'w') as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)

    elif bpe_status == "shared":
            # Score the already BPE-encoded rescore_data.<target_lang> file
            # directly; no files are rewritten in this branch.
            preprocess_lm_param = ["--only-source",
                                   "--trainpref", pre_gen+"/rescore_data."+target_lang,
                                   "--srcdict", cur_lm_dict,
                                   "--destdir", preprocess_directory]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_lm_param)
            preprocess.main(input_args)

            # Unlike the other two branches, no --max-tokens is passed here.
            eval_lm_param = [preprocess_directory,
                             "--path", cur_language_model,
                             "--output-word-probs",
                             "--batch-size", str(batch_size),
                             "--sample-break-mode", "eos",
                             "--gen-subset", "train"]

            eval_lm_parser = options.get_eval_lm_parser()
            input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

            with open(lm_score_file, 'w') as f:
                with redirect_stdout(f):
                    eval_lm.main(input_args)

    elif bpe_status == "different":
        rescore_file = pre_gen+"/rescore_data_no_bpe"
        rescore_bpe = pre_gen+"/rescore_data_new_bpe"

        # Turn both prefixes into "<prefix>." so a language code can be appended.
        rescore_file += "."
        rescore_bpe += "."

        write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo,
                          gen_output.no_bpe_target, rescore_file+source_lang,
                          rescore_file+target_lang, pre_gen+"/reference_file_no_bpe",
                          bpe_symbol=None)

        # apply LM bpe to nbest list
        bpe_src_param = ["-c", cur_lm_bpe_code,
                         "--input", rescore_file+target_lang,
                         "--output", rescore_bpe+target_lang]
        # NOTE(review): the return code of the subprocess is ignored, so a
        # failed BPE step only surfaces later when the output file is read.
        subprocess.call(["python",
                         os.path.join(os.path.dirname(__file__),
                                      "subword-nmt/subword_nmt/apply_bpe.py")] + bpe_src_param,
                        shell=False)
        # uncomment to use fastbpe instead of subword-nmt bpe
        # bpe_src_param = [rescore_bpe+target_lang, rescore_file+target_lang, cur_lm_bpe_code]
        # subprocess.call(["/private/home/edunov/fastBPE/fast", "applybpe"] + bpe_src_param, shell=False)

        preprocess_dir = preprocess_directory

        preprocess_lm_param = ["--only-source",
                               "--trainpref", rescore_bpe+target_lang,
                               "--srcdict", cur_lm_dict,
                               "--destdir", preprocess_dir]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)

        eval_lm_param = [preprocess_dir,
                         "--path", cur_language_model,
                         "--output-word-probs",
                         "--batch-size", str(batch_size),
                         "--max-tokens", "1024",
                         "--sample-break-mode", "eos",
                         "--gen-subset", "train"]

        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

        with open(lm_score_file, 'w') as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)
Пример #22
0
    print "+-----------------------------------------------------------------+"
################################################################################

    print " time taken for the classification process %f sec " % (time() - t0)
#####################################################################################################
    x_axis = [i for i in range(10, 200, 20)]
    plt.figure(facecolor='white')
    fig1, = plt.plot(x_axis, accuracy_list_nb, 'r*-', label='Naive bayes accuracy')
    fig2, = plt.plot(x_axis, f_measure_list_nb, 'ro-', label='Naive bayes f-measure')
    fig3, = plt.plot(x_axis, accuracy_list_svm, 'g*-', label='SVM accuracy')
    fig4, = plt.plot(x_axis, f_measure_list_svm, 'go-', label='SVM f-measure')
    fig5, = plt.plot(x_axis, accuracy_list_maxent, '*-', label='max Entropy accuracy')
    fig6, = plt.plot(x_axis, f_measure_list_maxent, 'o-', label='max Entropy f-measure')

    plt.xlabel('Number of features')
    plt.ylabel('Results')
    plt.title('Results of the classification using unigrams and bigrams')
    plt.legend(handles=[fig1, fig2, fig3, fig4, fig5, fig6], loc=4)
    plt.show()


# Script entry: start the timer that the evaluation code above reads as
# `time() - t0` when reporting the classification time.
t0 = time()

# Load the tweets once and reuse them for all three evaluations.
filename = 'all_tweets.txt'
lines = preprocess.main(filename)


# Run the three evaluations in turn on the same data.
bigram_evaluation(lines)
unigram_evaluation(lines)
uni_and_bi_validation(lines)
# Placeholder fed with the dropout keep probability.
KEEP_PROB = tf.placeholder(tf.float32)
### dropout end
  
### network definition starts here
with slim.arg_scope(xception_arg_scope()):
    Y_prediction,end_points = xception(X,
                     num_classes=16,
                     is_training=Is_training,
                     scope='xception',
                     keep_prob=KEEP_PROB)
### network definition ends here
Y_softmax = tf.nn.softmax(Y_prediction)

    
initialization()# initialization function: sets up the training and test sets, obtains their sizes, and looks up the category for each id
preprocess.main()# create TFRecord

variables_to_restore = slim.get_variables_to_restore()
### saver
saver = tf.train.Saver(variables_to_restore,max_to_keep = 1)  # saves all restorable variables; keeps at most 1 checkpoint (the original comment said 10)
model_file=tf.train.latest_checkpoint('./save/')# try to load the most recent training result

# Append the CSV header row: file_id followed by one column per class name.
# NOTE(review): the file is opened in append mode, so every run adds another header row.
with open("./prediction-split-softmax.csv", 'a', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["file_id","Blues","Classical","Country","Easy Listening",'Electronic','Experimental','Folk','Hip-Hop','Instrumental','International','Jazz','Old-Time / Historic','Pop','Rock','Soul-RnB',"Spoken"])
     
    
with tf.Session() as sess: 
    # load the latest model, if a checkpoint was found
    if model_file !=None:
        saver.restore(sess,model_file)
Пример #24
0
import preprocess as p
import rbm
import tensorflow as tf
import numpy as np


# Load the data set; p.main() is expected to return (input_matrix, labels).
input_matrix, labels = p.main()

# NOTE: despite the wording, only the first dimension (shape[0]) is printed.
print "Input matrix shape = ", input_matrix.shape[0]
print "labels shape = ", labels.shape[0]


# First ten entries of the first label row.
print labels[0, 0:10]

#for row in input_matrix:
# Only the first sample is used: its input row is the visible vector and its
# label row determines the hidden size.
visible = input_matrix[0]
hidden = labels[0]
vis = tf.Variable(visible)
r = rbm.RBM("chr0.0",visible.shape[0], hidden.shape[0])

with tf.Session() as session:
        # Run the model
    #session.run(r)
    # NOTE(review): no variable initializer is run in the visible code before
    # evaluating propup on `vis` - confirm whether one is needed.
    session.run(r.propup(vis))
    # Run just the variable y and print 
    #print(session.run(y))
    

# Unfinished scratch lines left by the author:
#sess.Run(rbm 

#x = RBM("test", 
Пример #25
0
    #get rects
    rects = img.find_rects(threshold=0)

    if (len(rects) == 0):
        continue

    #draw raw rects
    for k, r in enumerate(rects):
        c = r.corners()
        for i, p in enumerate(c):
            p_ = c[i - 1]
            if draw:
                img.draw_line(p[0], p[1], p_[0], p_[1], 5, color=(0, 0, 0))

    try:
        theta, translation = preprocess.main(rects, img)
    except NoEdgeException as e:
        print("NoEdgeException")
        continue
    except NotEnoughDataException as e:
        print("NotEnoughDataException")
        continue
    except NoRectException as e:
        print("NoRectException")
        continue

    # gRotation = getGlobalRotation(gRotation, lRotation, theta)
    # protocol.feedGlobalRotation(gRotation, pyb.millis() - startTime,frame_id)
    protocol.feedLocalRotation(theta, pyb.millis() - startTime, frame_id)
    lRotation = theta
Пример #26
0
	def UpdateData(self,dataLocation):
		"""Forward ``dataLocation`` to ``preprocess.main``; its result is discarded."""
		preprocess.main(dataLocation)
Пример #27
0
# Copyright Aleksey Gurtovoy 2001-2004
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /CVSROOT/boost/libs/mpl/preprocessed/preprocess_set.py,v $
# $Date: 2007/10/29 07:32:56 $
# $Revision: 1.1.1.1 $

import preprocess

# Run the shared preprocessing driver for the "set" component with the single
# mode "plain". The third argument is a backslash-separated relative path -
# presumably the directory of the preprocessed headers; confirm against
# preprocess.main.
preprocess.main(["plain"], "set", "boost\\mpl\\set\\aux_\\preprocessed")
Пример #28
0
# Copyright Aleksey Gurtovoy 2001-2004
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /CVSROOT/boost/libs/mpl/preprocessed/preprocess_map.py,v $
# $Date: 2007/10/29 07:32:56 $
# $Revision: 1.1.1.1 $

import preprocess

# Run the shared preprocessing driver for the "map" component with the modes
# "plain", "typeof_based" and "no_ctps". The third argument is a
# backslash-separated relative path - presumably the directory of the
# preprocessed headers; confirm against preprocess.main.
preprocess.main(["plain", "typeof_based", "no_ctps"], "map",
                "boost\\mpl\\map\\aux_\\preprocessed")
Пример #29
0
#!/usr/bin/env python3

import numpy as np
import matplotlib.pyplot as plt
import icepack, icepack.plot

# This function pulls in the mesh and observational data that we'll use.
# (The files read below are expected to exist once it has run.)
import preprocess
preprocess.main()

# Read in the observational data.
# NOTE(review): the file objects opened here are never explicitly closed.
vx_obs = icepack.read_arc_ascii_grid(open("ross-vx.txt", "r"))
vy_obs = icepack.read_arc_ascii_grid(open("ross-vy.txt", "r"))
h_obs = icepack.read_arc_ascii_grid(open("ross-h.txt", "r"))

# Load the mesh and show it with equal axis scaling.
mesh = icepack.read_msh("ross.msh")
fig, ax = plt.subplots()
ax.set_aspect('equal')
icepack.plot.plot_mesh(ax, mesh)
# NOTE(review): `fig` is passed positionally to plt.show - confirm that the
# installed matplotlib version accepts a positional argument here.
plt.show(fig)

# Discretization built from the mesh; the meaning of the second argument (1)
# is not visible here - presumably the polynomial degree; confirm.
discretization = icepack.make_discretization(mesh, 1)

# Interpolate the observed velocity components and thickness onto it.
v = icepack.interpolate(discretization, vx_obs, vy_obs)
h = icepack.interpolate(discretization, h_obs)

# Make a dumb guess for the ice temperature. In "real life", you would want to
# use an inverse method that would tune the temperature to fit observations.
# The constant field returns 253.0 everywhere.
theta = icepack.interpolate(discretization, lambda x: 253.0)
Пример #30
0
# Copyright Aleksey Gurtovoy 2001-2006
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /cvsroot/boost/boost/libs/mpl/preprocessed/preprocess_set.py,v $
# $Date: 2006/11/23 19:57:11 $
# $Revision: 1.2.8.1 $

import preprocess
import os.path

# Run the shared preprocessing driver for the "set" component with the single
# mode "plain". The path is assembled with os.path.join, so it uses the
# separator of the platform the script runs on.
preprocess.main(["plain"], "set",
                os.path.join("boost", "mpl", "set", "aux_", "preprocessed"))
Пример #31
0
# Copyright Aleksey Gurtovoy 2001-2004
#
# Distributed under the Boost Software License, Version 1.0. 
# (See accompanying file LICENSE_1_0.txt or copy at 
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /home/project/cvs/hivm/version2/trunk/src/3rd_party/build/boost_1_33_1/libs/mpl/preprocessed/preprocess_list.py,v $
# $Date: 2006/08/07 05:34:09 $
# $Revision: 1.1 $

import preprocess

# Run the shared preprocessing driver for the "list" component with the single
# mode "plain". The third argument is a backslash-separated relative path -
# presumably the directory of the preprocessed headers; confirm against
# preprocess.main.
preprocess.main(
      [ "plain" ]
    , "list"
    , "boost\\mpl\\list\\aux_\\preprocessed"
    )
Пример #32
0
def gen_and_reprocess_nbest(args):
    """Generate (or load) an n-best list and prepare it for rescoring.

    The function performs three visible steps:

    1. Unless ``args.nbest_list`` is supplied or the predictions file already
       exists, run ``generate.main`` and capture its stdout in
       ``<pre_gen>/generate_output_bpe.txt``.
    2. Parse the predictions with ``rerank_utils.BitextOutputFromGen`` and,
       when a required score file is missing, write plain-text
       source/hypothesis/reference files with
       ``rerank_utils.write_reprocessed``.
    3. Binarize those text files by calling ``preprocess.main``.

    Args:
        args: Parsed command-line namespace. ``args.score_dict_dir`` is
            overwritten with ``args.data`` when it is ``None``.

    Returns:
        The ``rerank_utils.BitextOutputFromGen`` object built from the
        predictions file.

    Raises:
        AssertionError: If mutually incompatible options are combined (prefix
            length with right-to-left models, an n-best list with a second
            scoring model, backwards with right-to-left, or prefix length
            with target prefix fraction).
    """
    if args.score_dict_dir is None:
        args.score_dict_dir = args.data
    if args.prefix_len is not None:
        assert args.right_to_left1 is False, "prefix length not compatible with right to left models"
        assert args.right_to_left2 is False, "prefix length not compatible with right to left models"

    if args.nbest_list is not None:
        assert args.score_model2 is None

    # A "backwards" scorer models the reverse direction, so swap the languages.
    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    store_data = os.path.join(
        os.path.dirname(__file__)) + "/rerank_data/" + args.data_dir_name
    # exist_ok avoids the check-then-create race when several shards start
    # at the same time.
    os.makedirs(store_data, exist_ok=True)

    pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \
        backwards_preprocessed_dir, lm_preprocessed_dir = \
        rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset,
                                     args.gen_model_name, args.shard_id, args.num_shards,
                                     args.sampling, args.prefix_len, args.target_prefix_frac,
                                     args.source_prefix_frac)
    assert not (args.right_to_left1
                and args.backwards1), "backwards right to left not supported"
    assert not (args.right_to_left2
                and args.backwards2), "backwards right to left not supported"
    assert not (args.prefix_len is not None and args.target_prefix_frac is not None), \
        "target prefix frac and target prefix len incompatible"

    # make directory to store generation results
    os.makedirs(pre_gen, exist_ok=True)

    # A scorer that is the generation model itself (and sees the full source)
    # is treated as already scored by generation.
    rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None
    rerank2_is_gen = args.gen_model == args.score_model2 and args.source_prefix_frac is None

    if args.nbest_list is not None:
        rerank2_is_gen = True

    # make directories to store preprocessed nbest list for reranking
    for preprocessed_dir in (left_to_right_preprocessed_dir,
                             right_to_left_preprocessed_dir,
                             lm_preprocessed_dir,
                             backwards_preprocessed_dir):
        os.makedirs(preprocessed_dir, exist_ok=True)

    score1_file = rerank_utils.rescore_file_name(
        pre_gen,
        args.prefix_len,
        args.model1_name,
        target_prefix_frac=args.target_prefix_frac,
        source_prefix_frac=args.source_prefix_frac,
        backwards=args.backwards1)
    # score2_file is only defined (and only read below) when a second scoring
    # model was requested.
    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            args.model2_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards2)

    predictions_bpe_file = pre_gen + "/generate_output_bpe.txt"

    using_nbest = args.nbest_list is not None

    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list

    else:
        # Generation is skipped when a previous run already left its output.
        if not os.path.isfile(predictions_bpe_file):
            print(
                "STEP 1: generate predictions using the p(T|S) model with bpe")
            print(args.data)
            param1 = [
                args.data, "--path", args.gen_model, "--shard-id",
                str(args.shard_id), "--num-shards",
                str(args.num_shards), "--nbest",
                str(args.num_rescore), "--batch-size",
                str(args.batch_size), "--beam",
                str(args.num_rescore), "--max-sentences",
                str(args.num_rescore), "--gen-subset", args.gen_subset,
                "--source-lang", args.source_lang, "--target-lang",
                args.target_lang
            ]
            if args.sampling:
                param1 += ["--sampling"]

            gen_parser = options.get_generation_parser()
            input_args = options.parse_args_and_arch(gen_parser, param1)

            print(input_args)
            # generate.main prints its results, so capture stdout in the file.
            with open(predictions_bpe_file, 'w') as f:
                with redirect_stdout(f):
                    generate.main(input_args)

    gen_output = rerank_utils.BitextOutputFromGen(
        predictions_bpe_file,
        bpe_symbol=args.remove_bpe,
        nbest=using_nbest,
        prefix_len=args.prefix_len,
        target_prefix_frac=args.target_prefix_frac)

    if args.diff_bpe:
        # The rescoring model uses different BPE codes: dump the de-BPE'd text
        # and re-encode it with the rescoring codes via apply_bpe.py.
        rerank_utils.write_reprocessed(
            gen_output.no_bpe_source, gen_output.no_bpe_hypo,
            gen_output.no_bpe_target,
            pre_gen + "/source_gen_bpe." + args.source_lang,
            pre_gen + "/target_gen_bpe." + args.target_lang,
            pre_gen + "/reference_gen_bpe." + args.target_lang)
        bitext_bpe = args.rescore_bpe_code
        bpe_src_param = [
            "-c", bitext_bpe, "--input",
            pre_gen + "/source_gen_bpe." + args.source_lang, "--output",
            pre_gen + "/rescore_data." + args.source_lang
        ]
        bpe_tgt_param = [
            "-c", bitext_bpe, "--input",
            pre_gen + "/target_gen_bpe." + args.target_lang, "--output",
            pre_gen + "/rescore_data." + args.target_lang
        ]

        apply_bpe_script = os.path.join(
            os.path.dirname(__file__), "subword-nmt/subword_nmt/apply_bpe.py")
        # Source side first, then target side; the exit status is ignored as
        # in the original code.
        for bpe_param in (bpe_src_param, bpe_tgt_param):
            subprocess.call(["python", apply_bpe_script] + bpe_param,
                            shell=False)

    if (not os.path.isfile(score1_file) and not rerank1_is_gen) or \
            (args.score_model2 is not None and not os.path.isfile(score2_file) and not rerank2_is_gen):
        print(
            "STEP 2: process the output of generate.py so we have clean text files with the translations"
        )

        rescore_file = "/rescore_data"
        # Default both names to the base file so STEP 3 can never hit an
        # unbound local. Previously this raised NameError when args.diff_bpe
        # was set or when both scorers were right-to-left. With diff_bpe the
        # BPE step above writes exactly pre_gen + "/rescore_data.<lang>".
        bw_rescore_file = rescore_file
        fw_rescore_file = rescore_file

        if args.prefix_len is not None:
            prefix_len_rescore_file = rescore_file + "prefix" + str(
                args.prefix_len)
        if args.target_prefix_frac is not None:
            target_prefix_frac_rescore_file = rescore_file + "target_prefix_frac" + str(
                args.target_prefix_frac)
        if args.source_prefix_frac is not None:
            source_prefix_frac_rescore_file = rescore_file + "source_prefix_frac" + str(
                args.source_prefix_frac)

        if not args.right_to_left1 or not args.right_to_left2:
            if not args.diff_bpe:
                rerank_utils.write_reprocessed(
                    gen_output.source,
                    gen_output.hypo,
                    gen_output.target,
                    pre_gen + rescore_file + "." + args.source_lang,
                    pre_gen + rescore_file + "." + args.target_lang,
                    pre_gen + "/reference_file",
                    bpe_symbol=args.remove_bpe)
                # Files for the backwards scorer: truncated by prefix length
                # or target prefix fraction when one of them is requested.
                if args.prefix_len is not None:
                    bw_rescore_file = prefix_len_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + prefix_len_rescore_file + "." +
                        args.source_lang,
                        pre_gen + prefix_len_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        prefix_len=args.prefix_len,
                        bpe_symbol=args.remove_bpe)
                elif args.target_prefix_frac is not None:
                    bw_rescore_file = target_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + target_prefix_frac_rescore_file + "." +
                        args.source_lang,
                        pre_gen + target_prefix_frac_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        bpe_symbol=args.remove_bpe,
                        target_prefix_frac=args.target_prefix_frac)

                # Files for the forward scorer: truncated by source prefix
                # fraction when requested.
                if args.source_prefix_frac is not None:
                    fw_rescore_file = source_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + source_prefix_frac_rescore_file + "." +
                        args.source_lang,
                        pre_gen + source_prefix_frac_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        bpe_symbol=args.remove_bpe,
                        source_prefix_frac=args.source_prefix_frac)

        if args.right_to_left1 or args.right_to_left2:
            rerank_utils.write_reprocessed(
                gen_output.source,
                gen_output.hypo,
                gen_output.target,
                pre_gen + "/right_to_left_rescore_data." + args.source_lang,
                pre_gen + "/right_to_left_rescore_data." + args.target_lang,
                pre_gen + "/right_to_left_reference_file",
                right_to_left=True,
                bpe_symbol=args.remove_bpe)

        print("STEP 3: binarize the translations")
        # Parentheses spell out the precedence the original relied on
        # implicitly (`and` binds tighter than `or`).
        if (not args.right_to_left1
                or (args.score_model2 is not None and not args.right_to_left2)
                or not rerank1_is_gen):

            if args.backwards1 or args.backwards2:
                if args.backwards_score_dict_dir is not None:
                    bw_dict = args.backwards_score_dict_dir
                else:
                    bw_dict = args.score_dict_dir
                bw_preprocess_param = [
                    "--source-lang", scorer1_src, "--target-lang", scorer1_tgt,
                    "--trainpref", pre_gen + bw_rescore_file, "--srcdict",
                    bw_dict + "/dict." + scorer1_src + ".txt", "--tgtdict",
                    bw_dict + "/dict." + scorer1_tgt + ".txt", "--destdir",
                    backwards_preprocessed_dir
                ]
                preprocess_parser = options.get_preprocessing_parser()
                input_args = preprocess_parser.parse_args(bw_preprocess_param)
                preprocess.main(input_args)

            preprocess_param = [
                "--source-lang", scorer1_src, "--target-lang", scorer1_tgt,
                "--trainpref", pre_gen + fw_rescore_file, "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir", left_to_right_preprocessed_dir
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

        if args.right_to_left1 or args.right_to_left2:
            preprocess_param = [
                "--source-lang", scorer1_src, "--target-lang", scorer1_tgt,
                "--trainpref", pre_gen + "/right_to_left_rescore_data",
                "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir", right_to_left_preprocessed_dir
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

    return gen_output
Пример #33
0
# Copyright Aleksey Gurtovoy 2001-2006
#
# Distributed under the Boost Software License, Version 1.0. 
# (See accompanying file LICENSE_1_0.txt or copy at 
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Id$
# $Date$
# $Revision$

import preprocess
import os.path

# Invoke the shared preprocess driver for the "list" component: modes first,
# then the component name, then the destination directory.
preprocess.main(
    ["plain"],
    "list",
    os.path.join("boost", "mpl", "list", "aux_", "preprocessed"),
)
Пример #34
0
def preprocessing(main_config_fpath):
    """Announce and run the preprocessing stage.

    Args:
        main_config_fpath: Value forwarded unchanged to ``preprocess.main``
            (presumably the path of the main configuration file -- verify
            against ``preprocess``).
    """
    # The parenthesised single-argument form prints the same text on Python 2
    # and Python 3; the bare print statement is a SyntaxError on Python 3.
    print('Running preprocessing...')
    preprocess.main(main_config_fpath)
# Copyright Aleksey Gurtovoy 2001-2004
#
# Distributed under the Boost Software License, Version 1.0. 
# (See accompanying file LICENSE_1_0.txt or copy at 
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /cvsroot/boost/boost/libs/mpl/preprocessed/preprocess_vector.py,v $
# $Date: 2004/09/02 15:41:30 $
# $Revision: 1.2 $

import os.path

import preprocess

# Build the destination directory with os.path.join instead of hard-coding
# backslashes, so the script also works on platforms whose path separator is
# not "\\". On Windows the resulting string is unchanged.
preprocess.main(
    ["no_ctps", "plain", "typeof_based"],
    "vector",
    os.path.join("boost", "mpl", "vector", "aux_", "preprocessed"),
)
Пример #36
0
import preprocess as p
import rbm
import tensorflow as tf
import numpy as np

# Load the data produced by the preprocess module.
input_matrix, labels = p.main()

# Single-argument print(...) calls with explicit formatting give the same
# output on Python 2 and Python 3. The doubled space reproduces what the
# original `print "... = ", value` statement emitted.
print("Input matrix shape =  {}".format(input_matrix.shape[0]))
print("labels shape =  {}".format(labels.shape[0]))

print(labels[0, 0:10])

# Only the first row is used for now (a loop over all rows was sketched but
# never enabled).
#for row in input_matrix:
visible = input_matrix[0]
hidden = labels[0]
vis = tf.Variable(visible)
r = rbm.RBM("chr0.0", visible.shape[0], hidden.shape[0])

with tf.Session() as session:
    # NOTE(review): `vis` is a tf.Variable and no initializer op is run before
    # it is used -- this likely needs a variable-initialization step; confirm
    # against the TensorFlow version in use.
    # Run the model
    #session.run(r)
    session.run(r.propup(vis))
    # Run just the variable y and print
    #print(session.run(y))

#sess.Run(rbm

#x = RBM("test",
#rbm = tf.Variable(x+5, name = 'y')
#sess = tf.Session()
#sess.Run(
Пример #37
0
# Copyright Aleksey Gurtovoy 2001-2006
#
# Distributed under the Boost Software License, Version 1.0. 
# (See accompanying file LICENSE_1_0.txt or copy at 
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Id$
# $Date$
# $Revision$

import preprocess
import os.path

# Invoke the shared preprocess driver for the "vector" component: modes
# first, then the component name, then the destination directory.
preprocess.main(
    ["plain", "typeof_based"],
    "vector",
    os.path.join("boost", "mpl", "vector", "aux_", "preprocessed"),
)
Пример #38
0
def _write_significant_combinations(sex_label, file_dict, countries_list,
                                    age_support_dict, out_fh):
    """Run the trim + Apriori search for one sex and record the results.

    Args:
        sex_label: "1" or "2"; used in the progress output and written to the
            first column of the results file.
        file_dict: Per-country data for this sex, as built by
            ``files2dictionary``.
        countries_list: Keys of ``file_dict``; forwarded to the helpers.
        age_support_dict: Ages that met the minimum support criterion.
        out_fh: Open, writable handle of the results file.
    """
    print("Countries Evaluated: {}\n".format(countries_list))

    # Only the first country's age table drives the search. The original loop
    # visited every country but skipped all except the first via a counter.
    first_country_ages = next(iter(file_dict.values()), None)
    if first_country_ages is None:
        return

    for age, _icds in first_country_ages.items():
        if age not in age_support_dict:
            continue

        qu = [str(i) for i in range(1, 36)]
        qu, insig = bottom_up_trim(qu, file_dict, countries_list, age)
        print("Sex{}:\tAge\t{}\nNew Queue\t{}\nInsignificant\t{}\n".format(
            sex_label, age, qu, insig))

        significant_combinations = apriori_v3(qu, insig, file_dict,
                                              countries_list, age)
        if len(significant_combinations) > 0:
            out_fh.write("{}\t{}\t{}\n".format(sex_label, age,
                                               significant_combinations))
        print("Apriori Significant Combs", significant_combinations)
        print("##################################\nSuccess!")


def main():
    """Run the three-step pipeline: preprocess check, data loading, Apriori.

    Step 1 runs ``preprocess.main()`` when ``runpreprocessing`` reports that
    it is needed. Step 2 loads every ``Sex1_*``/``Sex2_*`` CSV file in the
    current directory into per-sex dictionaries. Step 3 runs the modified
    Apriori search for each sex and writes the significant combinations to
    ``results.tsv``.
    """

    ################################
    # Step 1: Pre-processing Check
    # Pipeline starts here: data is checked to see if pre-processing is
    # necessary. If so, 'preprocess.py' is invoked.

    longdescription()  # Prints a description of the project.

    # Count the country files relative to the biological sex files (should be
    # a 1:2 ratio); if the ratio is different, preprocessing is run.
    parsedfiles, originalfiles = countfiles()

    # Check whether all of the files for each country have been parsed by
    # biological sex (represented by 1 or 2).
    if runpreprocessing(numofcountryfiles=originalfiles,
                        numofsexfiles=parsedfiles):
        preprocess.main()
        print("\nBeginning Analysis")
        time.sleep(2)

    print(
        "\n\n###########################################################################"
    )
    print("# Step 2: Data Storage and Management")
    ################################
    # Step 2: Data Storage and Management
    # Two dictionaries (for each sex) are created to house the data. Minimum
    # support criteria are also calculated here.
    sex1_file_dict = {}
    sex1_age_icd_support = {}

    sex2_file_dict = {}
    sex2_age_icd_support = {}

    # Raw strings avoid invalid-escape warnings for \w and \d, and the dot
    # before "csv" is escaped so it only matches a literal ".".
    sex1_pattern = re.compile(r"(^Sex1_\w+)_\d+\.csv")
    sex2_pattern = re.compile(r"(^Sex2_\w+)_\d+\.csv")

    for filename in os.listdir("."):
        sex1_datafile = sex1_pattern.match(filename)
        sex2_datafile = sex2_pattern.match(filename)
        if sex1_datafile:
            argus = files2dictionary(filename, sex1_datafile.group(1),
                                     sex1_age_icd_support)
            print("Sex1", "\n", argus[0], "\n", argus[1])
            # Merge this file's results into the running dictionaries.
            sex1_file_dict.update(argus[0])
            sex1_age_icd_support.update(argus[1])
        elif sex2_datafile:
            argus = files2dictionary(filename, sex2_datafile.group(1),
                                     sex2_age_icd_support)
            print("Sex2", "\n", argus[0], "\n", argus[1])
            sex2_file_dict.update(argus[0])
            sex2_age_icd_support.update(argus[1])

    print(
        "\n\n###########################################################################"
    )
    print("# Step 3: Apriori Algorithm")
    ################################
    # Step 3: Apriori Algorithm
    # A modified version of the Apriori algorithm speeds up an otherwise
    # exhaustive search.

    # Creating a list of all countries
    sex1_countries_list = sex1_file_dict.keys()
    sex2_countries_list = sex2_file_dict.keys()

    # Build the age support dictionary: keep an entry only when its key is
    # integer-like and its support equals the number of sex-1 countries.
    # NOTE(review): the same sex-1 derived dictionary is also used for sex 2,
    # as in the original code -- confirm this is intended.
    age_support_dict = {}
    for key, value in sex1_age_icd_support.items():
        try:
            int(key)
        except ValueError:
            continue  # keys that are not integer-like are skipped
        if value == len(sex1_countries_list):  # checking minimum support counts
            age_support_dict[key] = value
    print("Age Support Dict: ", age_support_dict)

    # The context manager guarantees the results file is flushed and closed
    # even if the search raises part-way through.
    with open("results.tsv", "w") as signifOUTFH:
        signifOUTFH.write("Sex\tAge\tSignificant_Combination\n")
        _write_significant_combinations("1", sex1_file_dict,
                                        sex1_countries_list,
                                        age_support_dict, signifOUTFH)
        _write_significant_combinations("2", sex2_file_dict,
                                        sex2_countries_list,
                                        age_support_dict, signifOUTFH)
Пример #39
0
# Copyright Aleksey Gurtovoy 2001-2006
#
# Distributed under the Boost Software License, Version 1.0. 
# (See accompanying file LICENSE_1_0.txt or copy at 
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Id: preprocess_map.py 49241 2008-10-10 09:24:39Z agurtovoy $
# $Date: 2008-10-10 02:24:39 -0700 (Fri, 10 Oct 2008) $
# $Revision: 49241 $

import preprocess
import os.path

# Invoke the shared preprocess driver for the "map" component: modes first,
# then the component name, then the destination directory.
preprocess.main(
    ["plain", "typeof_based", "no_ctps"],
    "map",
    os.path.join("boost", "mpl", "map", "aux_", "preprocessed"),
)
# Copyright Aleksey Gurtovoy 2001-2004
#
# Distributed under the Boost Software License, Version 1.0. 
# (See accompanying file LICENSE_1_0.txt or copy at 
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /cvsroot/boost/boost/libs/mpl/preprocessed/preprocess_set.py,v $
# $Date: 2004/09/02 15:41:30 $
# $Revision: 1.2 $

import os.path

import preprocess

# Build the destination directory with os.path.join instead of hard-coding
# backslashes, so the script also works on platforms whose path separator is
# not "\\". On Windows the resulting string is unchanged.
preprocess.main(
    ["plain"],
    "set",
    os.path.join("boost", "mpl", "set", "aux_", "preprocessed"),
)
Пример #41
0
# Copyright Aleksey Gurtovoy 2001-2004
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /CVSROOT/boost/libs/mpl/preprocessed/preprocess_vector.py,v $
# $Date: 2007/10/29 07:32:56 $
# $Revision: 1.1.1.1 $

import os.path

import preprocess

# Build the destination directory with os.path.join instead of hard-coding
# backslashes, so the script also works on platforms whose path separator is
# not "\\". On Windows the resulting string is unchanged.
preprocess.main(["no_ctps", "plain", "typeof_based"], "vector",
                os.path.join("boost", "mpl", "vector", "aux_", "preprocessed"))
        ##Normalize the weight within [0,1]
        W = W / sum_W

        ##for each classifier, update their weights
        for k in range(0, 3):
            W_clf[k] += log((1 - error[k]) / error[k])

    print(W_clf)
    return W_clf


if __name__ == '__main__':

    # Load the two data splits from preprocess.py. The boolean flag selects
    # which split is returned (presumably True = training, False = test, going
    # by the variable names -- verify against preprocess.main).

    train_X, train_y = preprocess.main(True)
    test_X, test_y = preprocess.main(False)

    # Reference signatures for the metrics that may be used for evaluation:
    # sklearn.metrics.precision_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None)
    # sklearn.metrics.recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None)

    # ---------------- KNN ---------------- #

    # Time only the construction of the classifier; the predictions below are
    # made after end_time is taken.
    start_time = time.time()
    knn_classifier = KNN(train_X, train_y,
                         k=20)  # create the KNN classifier; TODO: tune k here
    end_time = time.time()

    # Predict on both splits.
    y_train_pred = knn_classifier.predict(train_X)
    y_test_pred = knn_classifier.predict(test_X)
# Copyright Aleksey Gurtovoy 2001-2004
#
# Distributed under the Boost Software License, Version 1.0. 
# (See accompanying file LICENSE_1_0.txt or copy at 
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /cvsroot/boost/boost/libs/mpl/preprocessed/preprocess_map.py,v $
# $Date: 2004/09/02 15:41:30 $
# $Revision: 1.2 $

import os.path

import preprocess

# Build the destination directory with os.path.join instead of hard-coding
# backslashes, so the script also works on platforms whose path separator is
# not "\\". On Windows the resulting string is unchanged.
preprocess.main(
    ["plain", "typeof_based"],
    "map",
    os.path.join("boost", "mpl", "map", "aux_", "preprocessed"),
)
import preprocess
import lgb


# Stage 1: preprocess the data with both update flags switched off.
preprocess_options = {"update_means_only": False, "forced_update": False}
preprocess.main(**preprocess_options)

# Stage 2: train the model and predict.
lgb.main()

print("done.")
Пример #45
0
# Copyright Aleksey Gurtovoy 2001-2006
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Id$
# $Date$
# $Revision$

import preprocess
import os.path

# Arguments for the shared preprocess driver: the list of modes, the
# component name, and the destination directory (built portably).
modes = ["plain", "typeof_based", "no_ctps"]
destination = os.path.join("boost", "mpl", "map", "aux_", "preprocessed")
preprocess.main(modes, "map", destination)
Пример #46
0
def _report_split_accuracy(header, predictions, answers, questions, tasks,
                           aw_number, questions_per_task=1000):
    """Print global and per-task accuracy for one data split.

    Args:
        header: Banner line printed before the per-task results.
        predictions: Score matrix; the arg-max over axis 1 (plus one, because
            answers are indexed from 1) is taken as the predicted answer.
        answers: Expected answers; flattened before comparison.
        questions: Question matrix; column 0 of the first row of each task's
            slice is reported as the task id.
        tasks: Task ids that were evaluated.
        aw_number: Number of distinct answer words, printed in the summary.
        questions_per_task: Size of each task's contiguous slice. The default
            of 1000 is the value that was hard-coded originally.
    """
    # Select response (index start at 1)
    output = np.argmax(predictions, axis=1) + 1

    # Compute global accuracy
    response = answers.flatten()
    print(len(response))
    accuracy = np.sum(output == response)/(1.*len(output))

    # Accuracy per task: column 0 holds the task id, column 1 the accuracy.
    results = np.ones((len(tasks), 2))
    for i in range(len(tasks)):
        start = questions_per_task*i
        stop = questions_per_task*(i+1)
        results[i, 0] = questions[start, 0]
        results[i, 1] = np.sum(
            output[start:stop] == response[start:stop])/float(questions_per_task)

    print(header)

    for i in range(len(tasks)):
        print('Results for task {}'.format(results[i, 0]))
        print('Average Accuracy is {}'.format(results[i, 1]))
        print('----------------------------------------')

    print('----------------------------------------')
    print('Number of possible answers {}'.format(aw_number))
    print('Results for {}'.format(tasks))
    print('Average Accuracy is {}'.format(accuracy))
    print('----------------------------------------')


def main(arguments):
    """Preprocess the selected tasks, train question embeddings and report
    accuracy on the train and test splits.

    Args:
        arguments: Command-line style argument list; only ``--tasks`` (one or
            more integers, default 1..20) is recognised.

    Side effects:
        Stores the parsed namespace in the module-level ``args`` and prints
        the accuracy reports.
    """
    global args
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # The default is a real list: on Python 3 a bare range() object has no
    # .remove() and is not a list.
    parser.add_argument(
        '--tasks', default=list(range(1, 21)), type=int, nargs='+',
        help='Tasks list')
    args = parser.parse_args(arguments)

    # Filter the tasks expecting more than one output. A new list is built so
    # the parser default is never mutated, then stored back on the shared
    # namespace.
    multi_output_tasks = (19,)
    tasks = [t for t in args.tasks if t not in multi_output_tasks]
    args.tasks = tasks

    # Wrap up in an argument
    new_arguments = ['--task'] + [str(t) for t in tasks]
    # Preprocessing
    preprocess.main(new_arguments)

    # Loading the data
    sentences_train, questions_train, questions_sentences_train, answers_train = read_preprocessed_matrix_data(
        'new_train')
    # NOTE: pickle.load must only be used on trusted files; this one is
    # expected to come from the preprocessing step above.
    with open('../Data/preprocess/new_word2index', 'rb') as word2index_file:
        word2index = pickle.load(word2index_file)

    # ###### Training the questions embeddings
    # Count the answers words (indexed from 1 to len(answer_words))
    answer_words = set(answers_train.flatten())
    aw_number = len(answer_words)

    # Questions embeddings
    questions_embeddings_train = train_question_vector(questions_train, answers_train,
                                                 aw_number, alpha=0.1)

    # ##### Predictions
    # ##### Train
    # Batch predictions
    predictions_train = batch_prediction(questions_train, questions_sentences_train, sentences_train,
                                   aw_number, questions_embeddings_train, word2index)

    _report_split_accuracy('---------------TRAIN------------------',
                           predictions_train, answers_train, questions_train,
                           tasks, aw_number)

    # ##### Test
    sentences_test, questions_test, questions_sentences_test, answers_test = read_preprocessed_matrix_data(
        'new_test')

    # Batch predictions (the embeddings trained on the train split are reused)
    predictions_test = batch_prediction(questions_test, questions_sentences_test, sentences_test,
                                        aw_number, questions_embeddings_train, word2index)

    _report_split_accuracy('---------------TEST------------------',
                           predictions_test, answers_test, questions_test,
                           tasks, aw_number)