def recognize_image(image_name):
    mr.remove_meme_faces(base_path, test_items_path, image_name)
    pp.pre_process(base_path, pre_process_path, image_name)
    ls.line_segment(base_path, test_items_path, pre_processed_path, image_name)
    ws.word_segment(base_path, line_segments_path, image_name)
    cs.character_segment(base_path, words_segments_path, image_name)
    predict_sentence = rc.predict_sentence(base_path, character_segment_path,
                                           image_name)
    result = sc.spell_correct(predict_sentence)

    return result
Example #2
    def __init__(self, train_path, test_path):
        # Preprocessed train and test
        self.processed_train = pre_process(train_path)
        self.processed_test = pre_process(test_path)

        # Sentence array without unknown
        self.train_array_no_unk = sentence_array(self.processed_train)
        # Sentence array without occ -3
        train, test = get_sentences(self.processed_train, self.processed_test)
        self.train_array = train
        self.test_array = test
Example #3
File: main.py  Project: mahshidaln/iBeM
def preprocessing(input):
    metabolites = int(input.readline())
    reactions = int(input.readline())
    print('Metabolites: {0}\nReactions: {1}\n'.format(metabolites, reactions))

    reversibles = [int(x) for x in input.readline().split()]

    stoichio = []
    for line in input:
        stoichio.append([float(x) for x in line.split()])

    pre_process(stoichio, reversibles)
Example #4
def main():

    # Properties
    code = 62

    # Prepare patients into array
    patients = prepare.prepare_patients_()

    # Loops through patients
    for i in range(len(patients)):
        patient = patients[i]
        pre_process.pre_process(patients, patient, code)
    def run(self, args_s):
        args_d = json.loads(args_s)
        iname = args_d['panid']
        self.socketIO.emit('update', {
            'id': iname,
            "phase": 1,
            'val': -1,
            'max': -1
        })
        self.socketIO.wait(seconds=1)
        print("{0} start pre".format(args_d['local_id']))
        pre_process.pre_process(
            namedtuple('Struct', args_d.keys())(*args_d.values()))
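The last call above packs the decoded argument dict into a lightweight object whose attributes mirror the dict keys. A minimal sketch of that namedtuple idiom (the keys and values here are made-up placeholders, not part of the original snippet):

from collections import namedtuple

args_d = {'panid': 'pano_001', 'local_id': 7}          # hypothetical arguments
args = namedtuple('Struct', args_d.keys())(*args_d.values())
print(args.panid, args.local_id)                        # -> pano_001 7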
def pollution_change(pollution, source, year, option='Mean'):

    df1 = pre_process(pollution, source, year, option)
    df2 = pre_process(pollution, source, '2016', option)

    df1[source + ' ' + option] = df1[source + ' ' + option].astype(float)
    df2[source + ' ' + option] = df2[source + ' ' + option].astype(float)

    df = df1
    df[source + ' ' + option] = df2[source + ' ' + option] - df1[source + ' ' + option]

    df[source + ' ' + option] = df[source + ' ' + option].astype(str)
    df["text"] = df["state"] + '<br>' + \
        source + ' ' + option + ' ' + df[source + ' ' + option]

    #scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
    #[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]

    data = [
        dict(
            type='choropleth',
            #colorscale = scl,
            autocolorscale=True,
            locations=df.index,
            z=df[source + ' ' + option].astype(float),
            locationmode='USA-states',
            text=df['text'],
            marker=dict(line=dict(color='rgb(255,255,255)', width=2)),
            colorbar=dict(title=pollution.loc[0, source + ' Units']))
    ]

    layout = dict(
        title=year + ' - 2016 US ' + source +
        ' level change by state<br>(Hover for details)',
        geo=dict(scope='usa',
                 projection=dict(type='albers usa'),
                 showlakes=True,
                 lakecolor='rgb(255, 255, 255)'),
    )

    fig = dict(data=data, layout=layout)
    py.iplot(fig, filename='us-pollution-change-map')
    plotSuccessful = "Pollution change map plotted."
    return fig, plotSuccessful
Example #7
def predict(train_on_gpu, net, test_review, vocab_to_int, sequence_length=200):
    # Prints whether a given review is predicted to be positive or negative in
    # sentiment, using a trained model.
    # Parameters -- net: a trained network; test_review: review text (plain text
    # with punctuation); sequence_length: the padded length of a review.

    # pre-process and tokenize the review
    words, reviews_split = pre_process(test_review)
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in words])

    # test sequence padding
    features = pad_features(test_ints, sequence_length)
    # convert to tensor
    feature_tensor = torch.from_numpy(features)

    net.eval()
    batch_size = feature_tensor.size(0)

    # initialize hidden state
    h = net.init_hidden(batch_size, train_on_gpu)

    if (train_on_gpu):
        feature_tensor = feature_tensor.cuda()

    # get predicted output
    output, h = net(feature_tensor, h)

    # convert output probability to predicted class (0 or 1)
    pred = torch.round(output.squeeze())

    if (pred.item() == 1):
        print("Positive review detected!")
    else:
        print("Negative review detected.")
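A possible way to call the function above, assuming trained_net and vocab_to_int come from an earlier training run (both names are placeholders, not part of the original snippet):

# trained_net and vocab_to_int are hypothetical results of a prior training run
test_review = "This movie had the best acting and a great plot."
predict(train_on_gpu=False, net=trained_net, test_review=test_review,
        vocab_to_int=vocab_to_int, sequence_length=200)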
def prediction():
    if request.method == 'GET':
        argument_dict={}
        dict_tmp =  request.args.to_dict()
        for key in dict_tmp.keys():
            argument_dict = ast.literal_eval(key)

        order = argument_dict['orderTitle']
        desc = argument_dict['description']
        
        text_string = order+" "+desc
        pre_processed_str = pre_process(text_string)
        df_dict = predict_lstm.main(order, desc)

        l1= list(df_dict.keys())
        l2 = list(df_dict.values())

        res = dict(zip(l1, l2))

        result=[]
        for key, value in res.items():
            t={}
            t['Prediction']=key
            t['Confidence']= value
            result.append(t)

        return json.dumps(result)
Example #9
def transforms(img_dir, img_name, pre=False):
    """transforms the img.

    Args:
        img_dir (str): dir. of image.
        img_name (str): file name of image.
        pre (bool): toggle pre-process

    Returns:
        tensor: image after transforms.
    """
    if pre:
        cache_path = os.path.join(img_dir, 'cache', img_name + 'pre_1.png')
        if os.path.isfile(cache_path):
            data = Image.open(cache_path).convert('L')
        else:
            data = pre_process(os.path.join(img_dir, img_name),
                               remove_curve=True)
            data.save(cache_path)
    else:
        data = Image.open(os.path.join(img_dir, img_name)).convert('L')
    transform = T.Compose([
        T.Resize((128, 128)),
        T.ToTensor(),
    ])
    data = transform(data)
    return data
Example #10
def pollution_map(df, source, year, option='Mean'):

    # Pre-processes the pollution data so that it can be plotted by plotly.
    df2 = pre_process(df, source, year, option)

    #scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
    #[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]

    data = [
        dict(
            type='choropleth',
            #colorscale = scl,
            autocolorscale=True,
            locations=df2.index,
            z=df2[source + ' ' + option].astype(float),
            locationmode='USA-states',
            text=df2['text'],
            marker=dict(line=dict(color='rgb(255,255,255)', width=2)),
            colorbar=dict(title=df.loc[0, source + ' Units']))
    ]

    layout = dict(
        title=year + ' US ' + source +
        ' level by state<br>(Hover for details)',
        geo=dict(scope='usa',
                 projection=dict(type='albers usa'),
                 showlakes=True,
                 lakecolor='rgb(255, 255, 255)'),
    )

    fig = dict(data=data, layout=layout)
    py.iplot(fig, filename='us-pollution-map')
    plotSuccessful = "Pollution map plotted."
    return fig, plotSuccessful
Example #11
def main():

    compilable = sys.argv[1]
    preProcessed = pre_process.pre_process(compilable)
    parsed = function_parse.function_parse(preProcessed)
    formatable = order_of_computation.order_span(parsed)
    result = var_parse.var_format(formatable)
    print(result)
Example #12
    def __init__(self, n, using_weighted_distance, weight, p):
        self.n = n
        self.words = {}
        self.points = list()
        self.pre_processor = pre_process.pre_process()
        self.weighted_distance = using_weighted_distance
        self.weight = weight
        self.p = p
Example #15
def SFM(X, y):
    # Select features from a model by importance: similar to stepwise selection,
    # here backward selection, progressively discarding unimportant features
    X_train, X_test, y_train, y_test = pre_process(X, y)
    clf = SVM_recommend()
    m_range = [2000 - 50 * i for i in range(36, 40)]
    for m in m_range:
        selector = SelectFromModel(
            clf, threshold=-np.inf,
            max_features=m)  # the number of selected features is set only by max_features; no threshold
        X_ = selector.fit_transform(np.asarray(X), np.asarray(y))
        X_train, X_test, y_train, y_test = pre_process(X_, y, bReset=True)
        clf = SVM_recommend_run(B_SFM,
                                X_train,
                                X_test,
                                y_train,
                                y_test,
                                paras={'max-features': m})
Example #16
def ocr_core(filename):
    """
    This function will handle the core OCR processing of images.
    """
    # pytesseract.pytesseract.tesseract_cmd = '/app/.apt/usr/bin/tesseract'
    text = pytesseract.image_to_string(
        pre_process(cv2.imread(filename))
    )  # Read the image with OpenCV, run pre_process on it, then use pytesseract to extract the text
    return text
Example #17
def evaluateScore(X, y):
    X_train, X_test, y_train, y_test = pre_process(X, y, bReset=True)
    clf = SVC(C=0.01, max_iter=2000, kernel='linear', probability=True)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    y_classes = set(y)
    y_pred = label_binarize(y_pred, classes=range(1, len(y_classes) + 1))
    y_test = label_binarize(y_test, classes=range(1, len(y_classes) + 1))

    auc = metrics.roc_auc_score(y_test, y_pred, average='micro')
    return auc
Example #18
def UF(X, y):
    # Univariate filtering: score each feature's chi-squared association with the
    # target and keep the k best (this is forward selection)
    k_range = [50 * i for i in range(1, 4)]
    for k in k_range:
        selector = SelectKBest(chi2, k=k)
        X_ = selector.fit_transform(X, y)
        X_train, X_test, y_train, y_test = pre_process(X_, y, bReset=True)
        SVM_recommend_run(F_UF,
                          X_train,
                          X_test,
                          y_train,
                          y_test,
                          paras={'k-best': k})
Example #19
def SVM_base(X, y):
    # model_name is the dimensionality-reduction method adopted; X is the feature data after dimensionality reduction
    X_train, X_test, y_train, y_test = pre_process(X, y)
    for d in DECI_FUNCS:
        for k in KERNELS:
            for C in CS:
                SVM_recommend_run(COMPARE,
                                  X_train,
                                  X_test,
                                  y_train,
                                  y_test,
                                  paras={},
                                  C=C,
                                  kernel=k,
                                  decision_function_shape=d)
Example #20
def VT(X, y):
    # Select features by variance threshold, i.e. drop low-variance features; this is backward elimination
    for var in [0.03 * i for i in range(1, 50)]:
        selector = VarianceThreshold(threshold=var)
        X_ = selector.fit_transform(X)
        # X.shape[1] # selected feature amount
        X_train, X_test, y_train, y_test = pre_process(X_, y, bReset=True)
        # FIXME: there is a problem here
        SVM_recommend_run(B_VT,
                          X_train,
                          X_test,
                          y_train,
                          y_test,
                          paras={
                              'variance': var,
                              'feature-num': X_.shape[1]
                          })
Example #21
def get_csv():
    # Pre-processing
    imgs, img_names = pre_process()
    canny_imgs = []
    # Hand-written Canny edge detection
    # new_gray = guass_smooth(gray_img)
    # dx, dy, M, theta = gradient(new_gray)
    # nms = NMS(M, dx, dy)
    # dt = double_threhold(nms)
    # dt[dt == 1] = 255
    # plt.imshow(dt)
    # plt.show()
    # Canny edge detection
    """OpenCV's Canny algorithm"""
    for img in imgs:
        img = cv2.GaussianBlur(img, (3, 3), 0)
        canny = cv2.Canny(img, 50, 150)
        canny_imgs.append(canny.ravel())
        # print(canny_imgs[0])

    """Store the labelled (and unlabelled) samples obtained via binarization & Canny edge detection"""
    label_df = pd.read_excel(r'after_process\character.xlsx', sheet_name='Sheet1')             # label, file name; 13128 unique samples, 2013 Chinese characters
    img_df = pd.DataFrame({
        'file_name': img_names,
        'img': canny_imgs
    })                                                              # image, file name; 13462 samples (incl. duplicates)
    img_df = img_df.drop_duplicates('file_name', keep='first')                                # drop duplicate file names in img_df; 13430 unique samples

    # Print arrays in full: the CSV stores each array's str representation, so a
    # truncated repr would write ellipses into the file
    np.set_printoptions(threshold=sys.maxsize)

    # Merge on file name to get the label for each image
    label_img_df = pd.merge(label_df, img_df, on='file_name')              # image, file name, label; 13120 labelled unique samples
    label_img_df.to_csv(r'after_process\label_character.csv')

    # Then take the remaining unlabelled samples from img_df
    name_list = label_img_df['file_name'].tolist()
    unlabel_img_df = img_df[~img_df['file_name'].isin(name_list)]           # image, file name; 310 unlabelled unique samples
    unlabel_img_df.to_csv(r'after_process\unlabel_character.csv')

    return None
Example #22
def process_image(image) -> dict:
    result = dict()
    result['timestamp'] = int(time.time())  # unix timestamp

    # Pre-process: Get just the scoreboard portion of the screen
    image_scaled = pre_process.pre_process(image, (70, 38, 350, 62),
                                           RESIZE_FACTOR)
    # image_scaled.show()
    clock_text, total_seconds = get_clock(image_scaled)
    home_score, away_score = get_score(image_scaled)
    home_name = get_home_team_name(image_scaled)
    away_name = get_away_team_name(image_scaled)

    result['clock'] = clock_text
    result['gametime'] = total_seconds  # in seconds
    result['home_score'] = home_score
    result['away_score'] = away_score
    result['home_name'] = home_name
    result['away_name'] = away_name

    return result
Example #23
def generateSummary(sp):
    try:
        page_id = sp.page_id
        page = ParentPage.query.get(page_id)
        format = page.format
        if format == "html":
            content = page.page_content
            print(content)

        elif format == "pdf":
            print(content)

        elif format == "png":
            print(content)

        else:
            pass

        try:
            # modify mycore_parentpage by adding data to the page_content, image_link, facts and title columns
            page.page_content, page.image_link, page.facts, page.title = pre_process(
                content)
        except:
            pass
        article_text = page.page_content
        gensimOut = gensimSum(article_text)
        out = gensimOut
        if out == "":
            sp.status = 3
        else:
            preface = custom_summarize(out, 5, 200)
            out = generate_paragraphs(out)
            out = preface_output_merger(preface, out)
            sp.status = 2
        sp.data = out

    except Exception as e:
        sp.status = 3

    db.session.commit()
Example #24
    def evaluate(self, test_x, test_y):
        # Timekeeping
        print("Start Evaluating.")
        start_time = time.time()

        correct = 0
        for x, y in zip(test_x, test_y):
            max_group = ""
            max_p = 1
            x_words = pre_process(x)

            for candidate_group in self.posts.keys():
                # P(O|H) * P(H) for each candidate group, accumulated in log space
                p = math.log(self.p_group[candidate_group])
                for word in x_words:
                    if word in self.vocabulary:
                        p += math.log(
                            self.p_word_given_group[candidate_group][word])

                if p > max_p or max_p == 1:
                    max_p = p
                    max_group = candidate_group
Example #25
    def train(self, train_x, train_y):
        """
        :param train_x: Words from each document to train on
        :param train_y: Class the document belongs to
        :return:
        """
        # Timekeeping
        print("Start Training.")
        start_time = time.time()

        # Connect data and labels together (x -> y)
        for x, y in zip(train_x, train_y):
            words = pre_process(x)
            for word in words:
                self.posts[y].append(word)
                self.vocabulary.add(word)

        # Calculate P(Hj) and P(Wk|Hj)
        for group in self.posts.keys():
            self.p_word_given_group[group] = {}
            docs_in_group = self.posts[group]
            self.p_group[group] = len(docs_in_group) / len(train_x)

            # Initialize every word count to 1 (Laplace / add-one smoothing)
            for word in self.vocabulary:
                self.p_word_given_group[group][word] = 1.0

            for word in self.posts[group]:
                if word in self.vocabulary:
                    self.p_word_given_group[group][word] += 1.0

            for word in self.vocabulary:
                self.p_word_given_group[group][word] /= len(
                    self.posts[group]) + len(self.vocabulary)

        # Timekeeping
        timed = int(time.time() - start_time)
        print("Training finished in ", timed, "seconds.")
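In effect, the loops above produce an add-one (Laplace) smoothed estimate of P(word|group): (count of the word in the group + 1) divided by (total words in the group + vocabulary size). A tiny standalone sketch of that estimate, with made-up counts:

vocabulary = {"cat", "dog", "fish"}      # |V| = 3
group_words = ["cat", "cat", "dog"]      # all words seen in one group

count = 1.0 + group_words.count("cat")   # start at 1, then add occurrences
p_cat_given_group = count / (len(group_words) + len(vocabulary))
print(p_cat_given_group)                 # (2 + 1) / (3 + 3) = 0.5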
Example #26
def main(path):
	args = utils.get_args()

	filename = path + "/Data/" + args.filename
	
	#read data
	xls_data = binarization.read_xls(filename)
	#drop columns
	xls_reduce_data = utils.reduce_data(xls_data)
	
	#scores_xls, columns_xls = ranking.get_ranking(xls_reduce_data)
	#create_json.ranking_json(scores_xls, columns_xls, path, "pre_process_ranking.json")

	process_data = pre_process.pre_process(xls_data, args)
	process_data.to_csv(path + "/Data/process_data.csv")
	create_json.create_json_categories(process_data, path)
	
	

	scores_ranking, columns_ranking = ranking.get_ranking(process_data)
	create_json.ranking_json(scores_ranking, columns_ranking, path, "after_process_ranking.json")
	
	columns = list(process_data.columns.values)
	print("columns", columns)

	start = time.time()
	binar_data = pd.DataFrame()
	for column in columns:
		print("process column: ", column)
		binar_column = binarization.process_column(process_data[str(column)], column, args)
		binar_data = pd.concat([binar_data, binar_column], axis = 1)
		#binar_data[column] = binar_df

	end = time.time()
	print("binarization took", end-start)
	print(binar_data)
	binar_data.to_csv(path + "/Data/binar_data.csv")
Example #27
            if feature not in good_features:
                selected_features = list(good_features) + [feature]
                Xts = np.column_stack([X[:, j] for j in selected_features])
                score = evaluateScore(Xts, y)
                scores.append((score, feature))
                print("Current AUC : ", np.mean(score))
        good_features.add(sorted(scores)[-1][1])
        score_history.append(sorted(scores)[-1])
        print("Current Features : ", sorted(list(good_features)))

    # Remove last added feature
    good_features.remove(score_history[-1][1])
    good_features = sorted(list(good_features))
    print("Selected Features : ", good_features)
    return good_features


def transform(X, y):
    good_features = selectionLoop(X, y)
    return X[:, good_features]


if __name__ == "__main__":
    os.chdir('..')
    X, y = load_data_small()
    print(X.shape)
    X_ = transform(X, y)
    X_train, X_test, y_train, y_test = pre_process(X_, y, bReset=True)
    SVM_recommend_run(AUC, X_train, X_test, y_train, y_test,
                      {'feature-num': X_.shape[1]})
Example #28
from pre_process import pre_process, const_df
from data_prep import data_const
from frame_prep import get_frames
from ConvLSTM import ConvLSTM_input, model
from performance import performance_metrics
import os
import tensorflow as tf

if __name__ == '__main__':
    fs = 20
    frame_size = fs * 4
    hop_size = fs * 2
    epochs = 150

    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

    preprocessed_list = pre_process()
    df = const_df(preprocessed_list)
    scaled_X = data_const(df)
    X, y = get_frames(scaled_X, frame_size, hop_size)
    print(X.shape)
    X_train, X_test, y_train, y_test = ConvLSTM_input(X, y)
    history, model = model(X_train, X_test, y_train, y_test, epochs)
    performance_metrics(model, history, epochs, X_test, y_test)
Example #29
        objOpt, thetaOpt, lamOpt, muOpt = sub.solve_subproblem(
            y, xBar)  #(objOpt) upper bound

        thetaBar.append(thetaOpt)
        lamBar.append(lamOpt)
        muBar.append(muOpt)

        SUBD = np.amin([objOpt, SUBD])

        print "THETA"
        print thetaOpt
        print "X"
        print xBar

        # preprocess: deal with duplicate hyperplanes, and remove hyperplanes whose coefficients are all zero.
        g_flag, replicated_marker, coefficients = pre.pre_process(
            xBar, thetaOpt, lamOpt, muOpt, y)

        # print the flag and duplicate markers
        #print "g_flag", g_flag
        #print "replicated_marker", replicated_marker
        #print len(coefficients)
        #for co_index in xrange(len(coefficients)):
        #  print coefficients[co_index]

        # Get all the unique hyperplanes and save the coefficients of them.
        linker, unique_coefficients = pre.unique_coeff(g_flag,
                                                       replicated_marker,
                                                       coefficients, M, K, N)

        # Set a threshold as the distance used in the cell enumeration
        distance = [np.spacing(1)]
#!/usr/bin/python
import sys

from IPython import embed

from connect_blop import download_data
from load_data import load_data
from pre_process import pre_process
from fit_predict_test import fit_predict_test


# ---- get data
key_string = str(sys.argv[1])
download_data(key_string)
df_dic = load_data()

# ---- pre process & feature building
data = pre_process(df_dic)

# ---- fit, predict and cross-validate
fit_predict_test(data)
Example #31
    def on_request(self, *args):
        global mutex1, mutex2, mutex, mutex_data
        # tf.reset_default_graph()
        print("got request")
        data = args[0]
        filename, ext = splitext(data['input_path'])
        panid = basename(filename)
        # download file from upper server
        print("download...")
        sshdownload(data)
        args_d = {}
        remote_uuid = "{0}{1}".format(uuid.uuid4(), "_deeplearning")
        socketIO = SocketIO('localhost', ssht2.local_bind_port,
                            LoggingNamespace)
        args_d['remote_uuid'] = remote_uuid
        args_d['socketIO'] = socketIO
        args_d['model'] = "pspnet50_ade20k"

        args_d['sliding'] = True
        args_d['flip'] = True
        args_d['multi_scale'] = True
        print("phase 1...")
        args_d['input_path'] = "./{0}{1}".format(panid, ext)
        args_d['output_path'] = "{2}/{0}{1}".format(panid, ext,
                                                    config_p1_folder)
        pre_process.pre_process(
            namedtuple('Struct', args_d.keys())(*args_d.values()))
        print("phase 2...")
        # args_d['sess']=sess
        # args_d['model_ok']=pspnet
        args_d['input_path'] = config_p1_folder + '/'
        args_d['input_path_filter'] = panid

        args_d['output_path'] = config_p2_folder + '/'
        del args_d['socketIO']
        mutex.acquire()
        with open("temp_arg.json", 'w+') as fout:
            fout.write(json.dumps(args_d))
        mutex.release()
        # mutex1.put(args_d,block=True)
        print("sent task,wait response")
        while (1):
            # print("waiting...")
            mutex.acquire()
            if not os.path.exists("temp_arg.json"):
                break
            mutex.release()
            time.sleep(1)
        mutex.release()
        # mutex2.get(block=True)
        args_d['socketIO'] = socketIO
        print("phase 3...")
        args_d['input_path'] = "./{0}{1}".format(panid, ext)
        args_d['input_path2'] = "{2}/{0}{1}".format(panid, ext,
                                                    config_p2_folder)
        args_d['output_path'] = "{2}/{0}{1}".format(panid, ext,
                                                    config_p3_folder)
        class_scores = img_combine2.img_combine2(
            namedtuple('Struct', args_d.keys())(*args_d.values()))
        print("blended...")
        img = misc.imread("./{0}{1}".format(panid, ext))
        img = misc.imresize(img, 10)

        class_image = np.argmax(class_scores, axis=2)
        pm = np.max(class_scores, axis=2)
        colored_class_image = utils.color_class_image(class_image,
                                                      args_d['model'])
        #colored_class_image is [0.0-1.0] img is [0-255]
        alpha_blended = 0.5 * colored_class_image + 0.5 * img
        misc.imsave(filename + "_seg_blended" + ext, alpha_blended)
        print("upload...")
        sshupload(data, filename + "_seg_blended" + ext)
        print("garbage cleaning")
        print("success")
        self.emit("next")
         # Solve the subproblem
         objOpt, thetaOpt, lamOpt, muOpt = sub.solve_subproblem(y, xBar)   #(objOpt) upper bound
 
         thetaBar.append(thetaOpt)
         lamBar.append(lamOpt)
         muBar.append(muOpt)
 
         SUBD = np.amin([objOpt, SUBD])
                         
         print "THETA"
         print thetaOpt
         print "X"
         print xBar
 
         # preprocess: deal with duplicate hyperplanes, and remove hyperplanes whose coefficients are all zero.
         g_flag, replicated_marker, coefficients =  pre.pre_process(xBar, thetaOpt, lamOpt, muOpt, y)
         
         # print the flag and duplicate markers
         #print "g_flag", g_flag
         #print "replicated_marker", replicated_marker
         #print len(coefficients)
         #for co_index in xrange(len(coefficients)):
         #  print coefficients[co_index]
 
         # Get all the unique hyperplanes and save the coefficients of them.
         linker, unique_coefficients = pre.unique_coeff(g_flag,  replicated_marker,  coefficients,  M,  K,  N)
 
         # Set a threshold as the distance used in the cell enumeration
         distance = [np.spacing(1)]
         for i in xrange(len(unique_coefficients)):
             sum = 0.0
Example #33
def translate(sentence):
	sentence = pre_process(sentence)
	decoder = Decoder()
	sentence = decoder.decode(sentence)
	sentence = post_process(sentence)
	return sentence
Example #34
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from my_confusion_matrix import MyConfusionMatrix
#from pandas_ml import ConfusionMatrix

# Folder with the various subfolders
raw_data_folder = Path("../data/derive")
# Folder with all of the new processed files
proc_data_folder = Path("../data/processed")

# Check if we've already processed the data, pre-process if not
try:
    _ = next(proc_data_folder.iterdir())
except StopIteration:
    # No files in preprocessed, so generate them
    pre_process.pre_process(raw_data_folder, proc_data_folder)

# Use scikit learn's count vectorizer to convert text files into X matrix by word frequency
corpus = proc_data_folder.iterdir()
vectorizer = CountVectorizer(input='filename',
                             token_pattern=r'[a-zA-Z]+-?[a-zA-Z]+')
X = vectorizer.fit_transform(corpus)

# Generate y by iterating through files and extracting labels from the file names. Tested to make sure the order matches the X matrix
y = np.array([])
for file in proc_data_folder.iterdir():
    # labels are stored in the filename after the dash
    label = int(file.stem.split('-')[1])
    # Not most efficient way but we don't expect crazy numbers of files so append should be fine
    y = np.append(y, [label])
Example #35
if __name__ == "__main__":
    online = False

    if online:
        from config_online import *
    else:
        from config_offline import *

    print("******************* Preprocess ***********************")
    start = time.time()

    train_origin = pre.read_csv(path_train)
    print('Loading Train_set Done! Time: %.3f s' % (time.time() - start))
    print('origin_train_shape = [%s,%s]' % (train_origin.shape))

    train_origin = pre.pre_process(train_origin)
    print('pre_process Train_set Done! Time: %.3f s' % (time.time() - start))

    train = pre.feature_engineering(train_origin, True)
    print('feature_engineering Train_set Done! Time: %.3f s' % (time.time() - start))
    print('train_shape = [%s,%s]' % (train.shape))

    del train_origin

    test_origin = pre.read_csv(path_test)
    print('Loading Test_set Done! Time: %.3f s' % (time.time() - start))
    print('origin_test_shape = [%s,%s]' % (test_origin.shape))

    test_origin = pre.pre_process(test_origin)
    print('pre_process Test_set Done! Time: %.3f s' % (time.time() - start))