Example #1
def Base(eta, l2):
    params.outfile = 'Base_model'
    params.dataf = 'data/oct27.traindev.proc.cnn'
    params.dev = 'data/oct27.test.proc.cnn'
    params.test = 'data/daily547.proc.cnn'
    params.batchsize = 10
    params.hidden = 100
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.frac = 0.1
    params.emb = 0

    (words, We) = getWordmap('wordvects.tw100w5-m40-it2')
    We = np.asarray(We).astype('float32')
    tagger = getTagger('data/tagger')
    print tagger
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_' + "LearningRate" + '_' + str(
            params.eta) + '_' + str(
                params.hidden) + '_' + str(l2) + '.pickle'

    traindata = getData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getData(params.dev, words, tagger)
    devx0, devy0 = devdata
    testdata = getData(params.test, words, tagger)
    testx0, testy0 = testdata

    tm = base_model(We, params)
    tm.train(traindata, devdata, testdata, params)
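# A minimal driver for Base() above (a sketch, assuming the module is run directly);
# the eta/L2 grid here is illustrative and not part of the original code.
if __name__ == '__main__':
    for eta in (0.05, 0.01):
        for l2 in (0.0, 1e-5):
            Base(eta, l2)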
def parse_dailykos():
	#JNYTDocument.drop_collection()
	
	#URL for a search for the term US Presidential elections between 05/01/2011 and 05/31/2013
	base_url = "http://www.dailykos.com/search?submit=Search&time_begin=05%2F01%2F2011&text_type=any&search_type=search_stories&order_by=-time&text_expand=contains&text=US%20Presidential%20Elections&time_type=time_published&usernames=%28usernames%29&tags=%28tags%29&time_end=05%2F31%2F2013&page="

	main_url = base_url+"1"
	main_soup = BeautifulSoup(utils.getData(main_url)).find("div",{"class":"ajax-form-results ajax-delay-load"})

	no_of_items = main_soup.find("h4",{"class":"sub-head"}).get_text()
	no_of_items = no_of_items.replace(" results were found","")
	no_of_items = int(no_of_items)

	no_of_pages = no_of_items / 50
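	# Python 2 integer division; note the loop below walks a hard-coded 29 pages instead of using no_of_pages.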

	#They rate-limit requests coming from a bot, so wait before the loop begins.
	#Inside the loop, parsing the page contents introduces enough of a delay between requests.
	time.sleep(10)

	for page_num in range(1, 30):
		url = base_url + `page_num`
		soup = BeautifulSoup(utils.getData(url)).find("div",{"class":"ajax-form-results ajax-delay-load"})

		table_list = soup.find("table",{"class":"styled storiesAsGrid"})

		if table_list != None:
			tbody = table_list.find("tbody")
			if tbody != None:
				link_rows = tbody.findAll("tr")
				for link_row in link_rows:
					dailyKosDoc = JNYTDocument()
					link = link_row.find("td",{"class":"first"}).find("a",{"class":"title"})
					date = link_row.find("td",{"class":"sm date"})

					dailyKosDoc.pub_date = datetime.strptime(date.get_text(),'%m/%d/%Y')
					dailyKosDoc.source = "DailyKos"
					dailyKosDoc.web_url = "http://www.dailykos.com" + link['href']
					dailyKosDoc.headline = link.get_text()
					dailyKosDoc.political_leaning = "Liberal"
					dailyKosDoc.save()

					#Getting the social shares for the URL
					#dailyKosDoc.social_shares = shares.get_social_counts(dailyKosDoc.web_url)
					#dailyKosDoc.save()
					
					#Getting the content of the URL
					try:
						content_soup = BeautifulSoup(utils.getData(dailyKosDoc.web_url)).find("div",{"id":"storyWrapper"}).find("div",{"class":"article-body"})
						dailyKosDoc.content = content_soup.get_text()
						dailyKosDoc.save()
					except:
						pass
					#break
		#if page_num == 2:
		#	break
	return `page_num`
def parse_michelle_malkin():
	#JNYTDocument.drop_collection()
	#http://michellemalkin.com/page/1/?s=presidential+elections+2012

	base_url = "http://michellemalkin.com/page/<<page_num>>/?s=presidential+elections+2012"
	page_num = 1

	while True:
		url = base_url.replace("<<page_num>>", `page_num`)
		soup = BeautifulSoup(utils.getData(url)).find("div",{"id":"content"})

		title = soup.find("h1",{"class":"leadStoryAlt"})

		if title == "Not Found":
			break

		article = soup.find("div",{"class":"article"})

		headings = article.findAll("h2")
		author = article.findAll("div",{"class":"author"})

		for index, h2 in enumerate(headings):
			link = h2.find("a")
			meta_data = [string.strip() for string in author[index].get_text().encode('utf-8').split('\xc2\xa0\xc2\xa0')]
			
			michelleMalkinDoc = JNYTDocument()

			michelleMalkinDoc.web_url = link['href']
			michelleMalkinDoc.political_leaning = "Conservative"
			michelleMalkinDoc.source = "Michelle Malkin"
			michelleMalkinDoc.headline = link.get_text()
			michelleMalkinDoc.pub_date = datetime.strptime(meta_data[2],"%B %d, %Y %I:%M %p")
			michelleMalkinDoc.save()

			#Getting the social shares for the URL
			#michelleMalkinDoc.social_shares = shares.get_social_counts(michelleMalkinDoc.web_url)
			#michelleMalkinDoc.save()

			#Getting the document content.
			content_soup = BeautifulSoup(utils.getData(michelleMalkinDoc.web_url)).find("div",{"class":"blog"}).findAll("p")
			article_content = ""

			for paragraph in content_soup:
				text = paragraph.get_text()
				if text.startswith("**Written by ") or text.startswith("Twitter @"):
					continue
				article_content += " "+ text
			
			michelleMalkinDoc.content = article_content.strip()
			michelleMalkinDoc.save()

		page_num += 1
	return `index`
def parse_fivethirtyeight():
	#JNYTDocument.drop_collection()

	base_url = "http://fivethirtyeight.com/page/<<page_num>>/?s=presidential+elections+2012"
	page_num = 1

	while True:
		url = base_url.replace("<<page_num>>", `page_num`)
		url_content = utils.getData(url)

		if url_content is None:
			break

		# if page_num == 3:
		# 	break
		
		# soup = BeautifulSoup(utils.getData(url))
		# print soup
		soup = BeautifulSoup(url_content).find("div",{"id":"main"})
		posts = soup.findAll("div")

		for index, div in enumerate(posts):
			if index == 0:
				continue
			#do something with the individual posts here..
			date_string = div.find("span",{"class":"datetime updated"}).get_text()
			link = div.find("h2",{"class":"article-title entry-title"}).find("a")

			fivethirtyeightDoc = JNYTDocument()

			fivethirtyeightDoc.web_url = link['href'].strip()
			fivethirtyeightDoc.political_leaning = "Liberal"
			fivethirtyeightDoc.source = "FiveThirtyEight"
			fivethirtyeightDoc.headline = link.get_text().strip()
			fivethirtyeightDoc.pub_date = datetime.strptime(date_string.strip(),"%b %d, %Y")
			fivethirtyeightDoc.save()

			#Getting the social shares for the URL
			#fivethirtyeightDoc.social_shares = shares.get_social_counts(fivethirtyeightDoc.web_url)
			#fivethirtyeightDoc.save()

			try:
				content_soup = BeautifulSoup(utils.getData(fivethirtyeightDoc.web_url)).find("div",{"class":"entry-content"})
				fivethirtyeightDoc.content = content_soup.get_text()
				fivethirtyeightDoc.save()
			except:
				pass
			#print date_string.strip(), title_link.get_text().strip(), title_link['href'].strip()
			# break
		# break
		page_num = page_num + 1
	return `index`
def parse_pj_media():
	# JNYTDocument.drop_collection()
	#http://pjmedia.com/page/1/?s=presidential+elections+2012&submit_x=0&submit_y=0&search_sortby=date
	base_url = "http://pjmedia.com/page/<<page_num>>/?s=presidential+elections+2012&submit_x=0&submit_y=0&search_sortby=date"
	page_num = 1

	while True:
		url = base_url.replace("<<page_num>>",`page_num`)
		html_content = utils.getData(url)

		if html_content == None:
			break
		
		soup = BeautifulSoup(html_content)
		articles = soup.find("div",{"id":"archive-content"}).findAll("div",{"class":"category-story"})

		for article in articles:
			pjMediaDoc = JNYTDocument()
			link = article.find("h2").find("a")
			meta_data = [string.strip() for string in article.find("div",{"class":"category-author2"}).get_text().split('-')]

			date_str = meta_data[0]
			date_str = date_str.replace("th,",",")
			date_str = date_str.replace("st,",",")
			date_str = date_str.replace("nd,",",")
			date_str = date_str.replace("rd,",",")

			pjMediaDoc.web_url = link['href']
			pjMediaDoc.political_leaning = "Conservative"
			pjMediaDoc.source = "PJ Media"
			pjMediaDoc.headline = link.get_text().strip()
			pjMediaDoc.pub_date = datetime.strptime(date_str,"%A, %B %d, %Y")
			pjMediaDoc.save()

			#Getting the social shares for the URL
			#pjMediaDoc.social_shares = shares.get_social_counts(pjMediaDoc.web_url)
			#pjMediaDoc.save()

			#Getting the content of the document
			content_soup = BeautifulSoup(utils.getData(pjMediaDoc.web_url+"?singlepage=true")).find("div",{"class":"post"}).find("div",{"class":"clearingfix"}).findAll("p")
			article_content = ""

			for paragraph in content_soup:
				text = paragraph.get_text()
				article_content += " "+ text

			pjMediaDoc.content = article_content.strip()
			pjMediaDoc.save()

		page_num += 1

	return `page_num`
def parse_talkingpointsmemo():
	#JNYTDocument.drop_collection()
	base_url = "https://www.googleapis.com/customsearch/v1element?key=AIzaSyCVAXiUzRYsML1Pv6RwSG1gunmMikTzQqY&rsz=filtered_cse&num=10&hl=en&prettyPrint=false&source=gcsc&gss=.com&sig=23952f7483f1bca4119a89c020d13def&start=<<start_num>>&cx=partner-pub-7451232131633930:5915231553&q=presidential%20elections%202012&safe=active&googlehost=www.google.com&callback=google.search.Search.apiary272&nocache=1416895033146"
	start_num = 10
	
	while start_num < 100:
		# if start_num == 30:
		# 	break

		url = base_url.replace("<<start_num>>", `start_num`)
		url_content = utils.getData(url)

		url_content = url_content.replace("// API callback","")
		url_content = url_content.replace("google.search.Search.apiary272(","").strip()
		url_content = url_content[:-2]
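		# The response is JSONP: the lines above drop the "// API callback" header and the
		# google.search.Search.apiary272( ... ) wrapper so json.loads can parse the payload.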

		data = json.loads(url_content)
		for result in data["results"]:
			# print result["titleNoFormatting"], result["url"]

			talkingPointsMemoDoc = JNYTDocument()
			talkingPointsMemoDoc.web_url = result["url"]
			talkingPointsMemoDoc.headline = result["titleNoFormatting"]
			talkingPointsMemoDoc.political_leaning = "Liberal"
			talkingPointsMemoDoc.source = "Talking Points Memo"
			talkingPointsMemoDoc.save()

			#Getting the social shares for the URL
			talkingPointsMemoDoc.social_shares = shares.get_social_counts(talkingPointsMemoDoc.web_url)
			talkingPointsMemoDoc.save()

			try:
				content_soup = BeautifulSoup(utils.getData(talkingPointsMemoDoc.web_url))
				by_line = content_soup.find("section",{"class":"byline"}).find("time")

				date_string = by_line.get_text().strip().rsplit(",",1)[0]
				
				talkingPointsMemoDoc.pub_date = datetime.strptime(date_string.strip(),"%B %d, %Y")
				content = content_soup.find("div",{"class":"story-teaser"})
				body_content = content_soup.find("div",{"class":"story-body"})

				main_content = content.get_text() + " " + body_content.get_text()
				talkingPointsMemoDoc.content = main_content.strip()
				talkingPointsMemoDoc.save()
			except:
				print "Exception occured"
				pass

		start_num += 10
		# break
	return 'Anand'
def Base(eta, l2, morepara, emb, batchsize):
    params.outfile = 'POS_CRF_Bilstm_Viterbi_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'
    params.batchsize = batchsize
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.num_labels = 25

    params.morepara = morepara

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    #words.update({'UUUNKKK':0})
    #a=[0]*len(We[0])
    #newWe = []
    #newWe.append(a)
    #We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')
    print tagger
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize
    ) + '_dropout_' + str(params.dropout) + "_LearningRate" + '_' + str(
        params.eta) + '_' + str(l2) + str(morepara) + '_emb_' + str(emb)
    #examples are shuffled data

    traindata = getData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getData(params.dev, words, tagger)
    devx0, devy0 = devdata
    print 'dev set', len(devx0)
    testdata = getData(params.test, words, tagger)
    testx0, testy0 = testdata

    print 'test set', len(testx0)
    #print Y
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile
    #lm = LM_model(params)
    #lm.train(trainy0, devy0, params)

    tm = CRF_model(We, params)
    tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
def parse_crooksnliars():
	#JNYTDocument.drop_collection()
	base_url = "http://crooksandliars.com/solr/presidential%20elections%202012?page=<<page_num>>&filters=im_cl_section%3A1"
	page_num = 0

	while page_num < 313:
		url = base_url.replace("<<page_num>>", `page_num`)
		url_content = utils.getData(url)
		soup = BeautifulSoup(url_content).find("div",{"class":"search-results"})

		content_nodes = soup.findAll("div",{"class":"buildmode-teaser"})

		for index, div in enumerate(content_nodes):
			crooksNLiarsDoc = JNYTDocument()
			title = div.find("div",{"class","field-title"}).find("a")

			field_submitted = div.find("div",{"class":"field field-submitted submitted"})
			author_link = field_submitted.find("a")

			temp_string = field_submitted.get_text()
			temp_string = temp_string.replace("By","")
			

			temp_string = temp_string.replace(author_link.get_text(),"").strip()
			date_string = temp_string.split("-")[0].strip().rsplit(" ",2)[0]
			date_string = date_string.replace("Anonymous","")
			
			crooksNLiarsDoc.web_url = "http://crooksandliars.com" + title["href"]
			crooksNLiarsDoc.headline = title.get_text().strip()
			crooksNLiarsDoc.political_leaning = "Liberal"
			crooksNLiarsDoc.source = "Crooks N Liars"
			crooksNLiarsDoc.pub_date = datetime.strptime(date_string.strip(),"%B %d, %Y")
			crooksNLiarsDoc.save()

			#Getting the social shares for the URL
			#crooksNLiarsDoc.social_shares = shares.get_social_counts(crooksNLiarsDoc.web_url)
			#crooksNLiarsDoc.save()

			try:
				content_soup = BeautifulSoup(utils.getData(crooksNLiarsDoc.web_url)).find("div",{"class":"nd-region-middle-wrapper"})
				crooksNLiarsDoc.content = content_soup.get_text()
				crooksNLiarsDoc.save()
			except:
				pass
			# break
		# break
		page_num = page_num + 1
	return `page_num`
Example #9
def thread(id, filename, nbInputs, seed=None):
    rs = np.random.RandomState(seed)
    with Session() as sess:
        with createSolver(id, nbInputs, sess, RandomState(rs.randint(1E9))) \
                as solver:
            res = evaluateSolver(solver, getData(filename, nbInputs, 1), rs)
    return res
def test(filename):
    X, Y_, _ = utils.getData(filename)
    Y = XGboost_revalue(Y_)
    dataSet_name = filename.split('/')[1].split('.')[0]
    print("------------------------" + dataSet_name +
          "------------------------")
    train_data, train_label, test_data, test_label = utils.splitDataSet(
        X, Y, test_size=0.3)
    # Convert the training and test sets to xgboost's DMatrix format
    dtrain = xgb.DMatrix(train_data, label=train_label)
    dtest = xgb.DMatrix(test_data, label=test_label)
    # Set the training parameters
    parameters = {
        'eta': 0.01,
        'subsample': 0.75,
        'objective': 'multi:softmax',  # error evaluation for multiclass tasks
        'num_class': 2,  # number of classes to predict
        'max_depth': 8  # depth of the trees in the boosting process
    }
    num_round = 500  # the number of training iterations
    bst = xgb.train(parameters, dtrain, num_round)
    preds = bst.predict(dtest)  # with objective multi:softmax the predictions are class labels
    acc, p, r, f1 = utils.calAccuracy(preds, test_label)
    print("正确率:{:.2%}\t查准率:{:.4f}\t查全率:{:.4f}\tF1:{:.4f}".format(
        acc, p, r, f1))
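# Example invocation (a sketch; the path is hypothetical but must follow the
# "<dir>/<name>.<ext>" layout that the split('/') / split('.') calls above assume):
if __name__ == '__main__':
    test('data/sample_dataset.csv')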
Example #11
def svc():
    X, y = getData()

    clf = make_pipeline(StandardScaler(), SVC(gamma="auto"))
    clf.fit(X, y)

    return clf.score(X, y)
Example #12
def visualize():

    pkl_path = './dataset/full_data.pkl'
    main_transform = transforms.Compose([transforms.ToTensor()])
    trainset, testset = getData(pkl_path, train_transform = main_transform, test_transform = main_transform)
    test_dataloader = DataLoader(testset, batch_size = 1, shuffle = True)
    model = custom.CustomNet()
    model.load_state_dict(torch.load('./models/trainedModels/currBest.model'))
    plt.figure(dpi=300)
    curr = 150
    for i,batch in enumerate(test_dataloader):
        if(i >= 10 and i < 15):
            plt.subplot(curr + i + 1)
            data = batch
            img = data['data']
            labels = data['labels'][0].numpy()
            outs = model(img)
            _,preds = torch.max(outs,1)
            outs = outs[0].detach().numpy()
            preds = preds[0].detach().numpy()
            plt.imshow(img.numpy()[0][0], cmap='gray')
            plt.xlabel('Predictions: '+ str(preds[0]) +' '+ str(preds[1])+' ' +str(preds[2]) + '\n Ground Truth:'+ str(labels[0])+' ' +str(labels[1])+' ' +str(labels[2]))

        else:
            if(i > 15):
                break
            else:
                continue
    plt.show()
Example #13
    def getX(self):
        samples = getData()
        x_train = np.zeros((BATCH_SIZE, 576))
        conds = np.zeros((BATCH_SIZE, NUM_CONDS))
        for i in range(BATCH_SIZE):
            if i % 25 == 0:
                x = getSingleSample(samples)
                cond = np.array([1, 0, 0, 0])

            else:
                x = synthData((i % 25) / 25, samples)

                if (i % 25) > 17:
                    cond = np.array([0, 0, 0, 1])
                elif (i % 25) > 8:
                    cond = np.array([0, 0, 1, 0])
                else:
                    cond = np.array([0, 1, 0, 0])

            x_train[i, :] = x
            conds[i, :] = cond

        x_train = np.reshape(x_train, (BATCH_SIZE, 576, 1))

        return x_train, conds
Example #14
    def __init__(self,
                 split=0.7,
                 interval='1min',
                 predict_period=1,
                 days=5,
                 mins_interval=30,
                 start_date='2020-08-24',
                 end_date='2020-08-29',
                 stock_name='SPY',
                 stride=1):
        super(stockGraphGenerator, self).__init__()
        self.__start_date = datetime.datetime.strptime(
            start_date + ' 10:00:00', '%Y-%m-%d %H:%M:%S')
        self.__end_date = datetime.datetime.strptime(end_date + ' 20:00:00',
                                                     '%Y-%m-%d %H:%M:%S')
        self.__mins_interval = mins_interval
        self.__stride = stride
        self.__data_len = self.__calculateLen(days, mins_interval)
        self.__interval = interval
        self.__predict_period = predict_period
        self.__data_raw = utils.getData(stock_name).reset_index()
        self.train_data = torch.utils.data.Subset(
            self, list(range(0, int(split * self.__data_len))))
        self.test_data = torch.utils.data.Subset(
            self,
            list(range(int(split * self.__data_len), int(self.__data_len))))
Example #15
def runMLE():
    #Train on 8000, test on 2000
    X, Y = getData()
    Xtrain = X[:8000]
    Ytrain = Y[:8000]
    Xtest = X[8000:]
    Ytest = Y[8000:]

    means, covs = getConditionals(Xtrain, Ytrain)
    priors = getPriors(Ytrain)

    acc = 0

    Probs = np.zeros((10, len(Xtest)))
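    # Score each test point under each of the 10 class-conditional Gaussians, weighted by the class prior.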
    for j in range(10):
        #Fix for non-invertible matrices
        A = covs[j]
        A = A + .01 * np.identity(np.shape(covs[j])[0])

        #Use logpdf to avoid overflow
        p = stats.multivariate_normal.logpdf(Xtest, mean=means[j], cov=A)
        p = p + np.log(priors[j])  # add the log prior (logpdf already returns a log-likelihood)
        Probs[j] = p
    ypred = np.zeros(len(Xtest))
    for i in range(len(Xtest)):
        ypred[i] = np.argmax(Probs[:, i])
        if (ypred[i] == Ytest[i]):
            acc += 1
    acc = acc / float(len(Xtest))

    return acc
Example #16
def question3(dataDir, imageName, imageName2):
    
    time, data = getData(dataDir)
    
    # part 1
    x = data[:,0:3]
    x_des = data[:,3:6]

    data_label = ["x", "y", "z"]
    data_des_label = ["$x_d$", "$y_d$", "$z_d$"]
    fig, plots = plt.subplots(3, figsize=(5, 6))
    plotXYZ(plots, x, x_des, time, data_label, data_des_label)

    title = "Question 3\nx vs x_desired"
    plots[0].set_title(title, fontsize=13,  font="monospace")
    fig.supylabel('End effector positions (m)', fontsize=11,  font="monospace")
    plots[-1].set_xlabel('Time (seconds)', fontsize=11,  font="monospace")
    fig.tight_layout()
    fig.savefig(imageName)

    # part 2
    delta_phi = data[:,6:9]

    data_label = ["x", "y", "z"]
    fig, plots = plt.subplots(3, figsize=(5, 6))
    plotXYZ(plots, delta_phi, 0, time, data_label, 0)

    title = "Question 3\n$\delta$$\phi$"
    plots[0].set_title(title, fontsize=13,  font="monospace")
    fig.supylabel('Orientation error (rad)', fontsize=11,  font="monospace")
    plots[-1].set_xlabel('Time (seconds)', fontsize=11,  font="monospace")
    fig.tight_layout()
    fig.savefig(imageName2)

    plt.close('all')
Example #17
    def getData(self):
        X, y = utils.getData()

        trainX, testX, trainY, testY = train_test_split(
            X, y, test_size=0.2, shuffle=True, random_state=1234
        )

        return (trainX, trainY), (testX, testY)
def parse_redstate():
	base_url = "http://www.redstate.com/search/presidential+elections+2012/page/"
	
	for page_num in range(1,63):
		url = base_url + `page_num`
		soup = BeautifulSoup(utils.getData(url))

		articles = soup.find("ul",{"class":"story-loop"}).findAll("ul",{"class","post"})

		for index, article in enumerate(articles):
			title_link = article.find("a")
			date_string = article.find("span",{"class":"byline-italic"})
			date_string = date_string.get_text().split(" at ")[0]
			date_string = date_string.replace("th,",",")
			date_string = date_string.replace("st,",",")
			date_string = date_string.replace("nd,",",")
			date_string = date_string.replace("rd,",",")

			redStateDoc = JNYTDocument()
			redStateDoc.web_url = title_link['href']
			redStateDoc.political_leaning = "Conservative"
			redStateDoc.source = "RedState"
			redStateDoc.headline = title_link.get_text().strip()
			redStateDoc.pub_date = datetime.strptime(date_string.strip(),"%B %d, %Y")
			redStateDoc.save()

			#Getting the social shares for the URL
			#redStateDoc.social_shares = shares.get_social_counts(redStateDoc.web_url)
			#redStateDoc.save()

			content_soup = BeautifulSoup(utils.getData(redStateDoc.web_url)).find("div",{"class":"the-content"}).findAll("p")
			article_content = ""
			text = ""

			for paragraph in content_soup:
				text = paragraph.get_text()
				article_content += " "+ text

			article_content = article_content.replace(text,"")

			redStateDoc.content = article_content.strip()
			redStateDoc.save()

		# break

	return `page_num`
Example #19
def featurize():
    print("---- Reading Data ----")
    img_paths = glob.glob(DATA_PATH)

    print("len(img_paths):", len(img_paths))
    random.seed(a=13521)
    random.shuffle(img_paths)

    train_test_split = 0.8
    X_test_paths = img_paths[int(train_test_split * len(img_paths)):]

    dims = (448, 448, 3)

    # Loading Data
    X_test = utils.getData(X_test_paths, dims)
    print("X_test:", X_test.shape)

    # To check NaN pixel images
    nan_pixels_per_image = utils.nansInData(X_test)
    # plt.scatter(x=np.arange(0,len(nan_pixels_per_image)), y=nan_pixels_per_image)
    # plt.savefig("nan_scatter.png")

    # Checking min max to see if normalization is needed or not
    print("Before normalization")
    print(np.nanmin(X_test), np.nanmax(X_test))

    X_test = utils.normalize(X_test)

    # Checking min max after normalization
    print("After normalization")
    print(np.nanmin(X_test), np.nanmax(X_test))

    # Interpolate nan values
    X_test = utils.interpolateNaNValues(X_test)

    # To check NaN pixel images
    nan_pixels_per_image = utils.nansInData(X_test)

    print("---- Reading Model ----")
    model = load_model(OUTPUT_MODEL_PATH)
    print(model.summary())

    print("---- Featurizing Data ----")
    feature_list = extract_features(img_array=X_test,
                                    model=model,
                                    layer_names=['conv2d_8'])

    #     layer_name = 'conv2d_8'
    #     intermediate_layer_model = Model(inputs=model.input,
    #                                      outputs=model.get_layer(layer_name).output)
    #     intermediate_output = intermediate_layer_model.predict(data)
    #     feature_list = intermediate_output

    utils.nansInData(feature_list, data_type="feature")

    # Save the features and the filelist order for later use.
    pickle.dump(feature_list, file=open((FEATURES_OUTPUT), mode='wb'))
    pickle.dump(X_test_paths, file=open((PATH_LIST), mode='wb'))
Example #20
	def testSyntheticData(self):
		#A,S,F = [],[],[]
		x_train, y_train, x_control_train, x_control_test, x_test, y_test = ut.getData()
		dist_params, dist_params_train =  ut.getDistribution(x_train, y_train, x_control_train)

		mean, cov, meanT, covT = dist_params["mean"], dist_params["cov"], dist_params_train["mean"], dist_params_train["cov"]
		#print(mean)
		meanN = [0] * len(mean)
		covN = np.identity(len(mean))

		#clf = GaussianMixture(n_components=2, covariance_type='full')
		means = [mean, meanN]
		covariances = [cov, covN]
		lw = float(sys.argv[2])
		weights = [1-lw, lw]

		#for i in range(0,4):
		LR, LE = len(y_train), len(y_test)
		train, test = [],[]
		for i in range(0, LR):
			j = np.random.choice([0,1], p=weights)
			seed = np.random.randint(10)
			train.append(multivariate_normal(means[j], covariances[j], allow_singular=1).rvs(size=1, random_state=seed))
		for i in range(0, LE):
			j = np.random.choice([0,1], p=weights)
			seed = np.random.randint(10)
			test.append(multivariate_normal(means[j], covariances[j], allow_singular=1).rvs(size=1, random_state=seed))

		x_train, y_train, x_control_train = [], [], []
		for t in train:
			x_train.append(t[:-2])
			if t[len(t)-2] < 0:
				y_train.append(-1)
			else:
				y_train.append(1)
			#y_train.append(t[len(t)-2])
			if t[len(t)-1] < 0.5:
				x_control_train.append(0)
			else:
				x_control_train.append(1)

		x_control_test, x_test, y_test = [], [], []
		for t in test:
			x_test.append(t[:-2])
			if t[len(t)-2] < 0:
				y_test.append(-1)
			else:
				y_test.append(1)
			if t[len(t)-1] < 0.5:
				x_control_test.append(0)
			else:
				x_control_test.append(1)

		#print(x_train, y_train, x_control_train)
		y_res = self.processGivenData(0.9, x_train, y_train, x_control_train, x_test, y_test, x_control_test, dist_params, dist_params_train)
		acc, sr, fdr = ut.getStats(y_test, y_res, x_control_test)
		print("Acc: ", acc, " SR: ", sr, " FDR: ", fdr)
def parse_time():

	current_page_url = "http://search.time.com/results.html?Ntt=immigration+reform&Nf=p_date_range%7cBTWN+20110101+20130531"

	while current_page_url != None:
		soup = BeautifulSoup(utils.getData(current_page_url)).find("div",{"class":"resultsCol"})

		pagination = soup.find("div",{"class":"pagi"}).find("a",{"title":"Next"})
		if pagination != None:
			current_page_url = pagination["href"]
		else:
			current_page_url = None
		
		articles = soup.findAll("div",{"class":"tout"})

		for article in articles:
			image_div = article.find("div",{"class":"img"})
			if image_div != None:
				title_link = article.find("h3").find("a")
				date_string = article.find("span",{"class":"date"}).get_text().strip()
				
				content_soup = BeautifulSoup(utils.getData(title_link['href'])).find("div",{"class":"entry-content"})

				if content_soup != None:
					content_soup = content_soup.findAll("p")

					article_content = ""
					for p in content_soup:
						article_content += p.get_text().strip()
					print title_link['href'], title_link.get_text(), date_string

					timeDoc = JNYTDocument()
					timeDoc.pub_date = datetime.strptime(date_string,'%b %d, %Y')
					timeDoc.source = "Time"
					timeDoc.web_url = title_link['href']
					timeDoc.headline = title_link.get_text()
					timeDoc.content = article_content
					timeDoc.save()

				#Getting the social shares for the URL
				#timeDoc.social_shares = shares.get_social_counts(timeDoc.web_url)
				#timeDoc.save()
		#current_page_url = None
	return current_page_url
Example #22
def testImages():
    pkl_path = './dataset/full_data.pkl'
    main_transform = transforms.Compose([transforms.ToTensor()])
    trainset, testset = getData(pkl_path, train_transform = main_transform, test_transform = main_transform)
    img = trainset[0]['data'][0]
    labels = str(trainset[0]['labels'])
    plt.figure(dpi=300)
    plt.imshow(img, cmap='gray')
    plt.xlabel(labels)
    plt.show()
Example #23
def LModel(eta,batchsize,dSize,relSize, updatewords):
	trainSize = [50]

	acti = ['relu','tanh']
	evaT = ['sum','max','cause']

	layersize =dSize

	params.frac = 1.0
	params.outfile = 'Model_FA'+'_eta_'+str(eta)+'_dSize_'+ str(dSize) + '_batchsize_'+ str(batchsize) + '_relSize_'+ str(relSize) + '_trainSize_'+str(trainSize[0]) + '_updatewords_' + str(updatewords)
	#params.dataf = '../data/conceptnet/AddModelData/omcs_train_new'+str(trainSize[0])+'.txt'
	#params.dataf = '../data/conceptnet/AddModelData/causes_omcs.txt'
	params.dataf = '../data/conceptnet/AddModelData/new_omcs100.txt'
	params.batchsize = batchsize
	params.hiddensize = 25
	params.type = "MAX"
	params.save = True
	params.constraints = False
	params.embedsize = dSize
	params.relsize = relSize
	params.activation = acti[0]
	params.evaType = evaT[0]
	params.usepeep = True
	params.LC = 0.00001
	params.Lw = 0.01
	params.eta = eta
	params.margin = 1
	params.save= True

	(words, We) = getWordmap('../data/conceptnet/embeddings/embeddings.skip.newtask.en.d'+str(dSize)+'.m1.w5.s0.it20.txt')
	#print We.shape
	rel = getRelation('../data/conceptnet/rel.txt')
	params.outfile = "../models/"+params.outfile+"_"+str(params.LC)+"_"+str(params.Lw)+".txt"
	#examples are shuffled data
	examples = getData(params.dataf)

	params.data = examples[0:int(params.frac*len(examples))]

	#print "Using Training Data"+params.dataf
	#print "Using Word Embeddings with Dimension "+str(dSize[0])

	#print "Training on "+str(len(params.data))
	#print "Saving models to: "+params.outfile

	# Initialise each of the 35 relation matrices close to the identity, with small uniform noise.
	Rel_init = np.zeros((35, params.relsize, params.relsize))
	for k in range(35):
		for i in range(params.relsize):
			for j in range(params.relsize):
				if i == j:
					Rel_init[k][i][j] = 1 + random.uniform(-0.2, 0.2)
				else:
					Rel_init[k][i][j] = random.uniform(-0.2, 0.2)

	tm = theano_word_model(We, words, layersize, params.embedsize, rel, params.relsize, Rel_init, params.LC, params.Lw, params.eta, params.margin, params.usepeep, updatewords)
	tm.train( params.data, params)
Example #24
def testTrainingLoop(need_pickle=False):
    if (need_pickle):
        pkl_path = dsetToPickle('./dataset/', 'train.csv')
    else:
        pkl_path = './dataset/full_data.pkl'

    main_transform = transforms.Compose([transforms.ToTensor()])
    trainset, testset = getData(pkl_path, train_transform = main_transform, test_transform = main_transform)
    train_dataloader = DataLoader(trainset, batch_size = 64, shuffle = True)
    test_dataloader = DataLoader(testset, batch_size = 64, shuffle = True)
    trainModel(None, train_dataloader, test_dataloader)
Example #25
def gridSearchScore():
    X, y = getData()
    scores = ["precision", "recall"]
    for score in scores:
        print(f"# Tuning hyper-parameters for {score}")
        clf = gridSearch(X, y)

        means = clf.cv_results_["mean_test_score"]
        stds = clf.cv_results_["std_test_score"]
        for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
            print("{:0.3f} (+/-{:0.03f}) for {}".format(mean, std * 2, params))
Example #26
	def testPreprocessedData(self):
		x_train, y_train, x_control_train, x_control_test, x_test, y_test = ut.getData()
		#checkNormalFit(x_train, y_train, x_control_train)

		for i in range(1,11):
			try : 
				tau = i/10.0
				print("Tau : ", tau)
				y_res = self.processGivenData(tau, x_train, y_train, x_control_train, x_test, y_test, x_control_test, [], [])
				ut.getStats(y_test, y_res, x_control_test)
				print("\n")
			except Exception as e:
				logging.exception(str(tau) + " failed\n" + str(e))
Example #27
def getAddress(url, predictors):
    '''
    Finds all the addresses on the web-page

    Parameters
    ----------
    url : The url of the page

    predictors : a list of tuples which are like (parameters, model)
        Here parameters is a dictionary of the hyper-parameters of the model

    Returns
    -------
    final : A list of lists, where each inner list contains the paragraphs that are
        part of the same address.
    '''

    soup, paras, paradict = parsePage(url)
    # print soup
    addresses = []

    if 'tripadvisor' in url:
        final = TripAdAddr(soup)

    else:
        results = set()

        for params, pred in predictors:
            # get the feature vectors for the text on the web-page, as required by the classifier
            X = getData(paras,
                        params['NUM_FEATURES'],
                        params['BATCH_SIZE'],
                        SEQ_LENGTH=params['SEQ_LENGTH'])
            res = pred(X).flatten()
            addrs = getLabels(res, paras, params['NUM_CLUST'])

            # take the intersection of the results extracted by the classifiers...
            # success depends heavily on the ability of the classifiers to find all the addresses
            results = results.intersection(addrs)
            #print getScores(pred, paras, params)

        # the final address extractor is the hard coded rule-based function which works when
        # there are telephone numbers in the address
        results = results.union(rulEx(paras))

        # to align the addresses based on their position on the page
        addresses = sorted(results, key=lambda x: x[1])
        final = accuAddr(addresses)

    # print final
    return final
Example #28
def testDataset(need_pickle=False):
    if (need_pickle):
        pkl_path = dsetToPickle('./dataset/', 'train.csv')
    else:
        pkl_path = './dataset/full_data.pkl'
    train, test = getData(pkl_path)
    print('Testing...')
    print("train.data:\n", train.data)
    print("train[0]:\n", train[0])
    for i in range(5):
        plt.subplot(150 + i + 1)
        sample = train[i]['data'][0]
        plt.imshow(sample.astype(int), cmap='gray', vmin=0, vmax=255)
    plt.show()
Example #29
def scrape(stats_file, pokedes_file):
    tr_elements = utils.getData('http://pokemondb.net/pokedex/all', '//tr')
    col = []

    # For each row, store each first element (header) and an empty list
    i = 0
    for t in tr_elements[0]:
        name = t.text_content()
        if name == '#':
            name = 'no'

        col.append((utils.clean_string(name), []))
        i += 1

    # Since our first row is the header, data is stored from the second row onwards
    for j in range(1, len(tr_elements)):
        T = tr_elements[j]

        if len(T) != 10:
            break

        i = 0
        for t in T.iterchildren():
            data = t.text_content()

            if i > 0:
                try:
                    data = int(data)
                except:
                    pass

            col[i][1].append(data)
            i += 1

    # Construct Data Frame using Pandas.
    Dict = {title: column for (title, column) in col}
    df = pd.DataFrame(Dict)

    # Apply clean up
    df['name'] = df['name'].apply(utils.str_bracket)
    df['type'] = df['type'].apply(utils.str_break)
    df['img_filename'] = df['name']
    df['img_filename'] = df['img_filename'].apply(utils.generate_img_file_name)

    # Save to json
    df.to_json(stats_file, orient='records')

    # Save image_filename list
    utils.save_df_to_text(df, pokedes_file, 'img_filename')
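# Example invocation (a sketch; both output file names are hypothetical):
if __name__ == '__main__':
    scrape('pokemon_stats.json', 'pokedex_filenames.txt')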
Example #31
    def handle(self):
        # print('Client: ', self.client_address)
        self.logger = logging.getLogger()
        while True:
            try:
                data = utils.getData(self.connection)
                if data is None:
                    break
            except:
                import traceback
                # self.logger.error(traceback.format_exc())
                # traceback.print_exc()
                break
            record = logging.makeLogRecord(data)
            self.handleLogRecord(record)
Example #32
	def handle(self):
		#print('Client: ', self.client_address)
		while True:
			# obj = getData(self.connection)
			# self.protocol(obj)
			try:
				obj = utils.getData(self.connection)
				if obj is None:
					break
				self.protocol(obj)
			except:
				import traceback
				# traceback.print_exc()
				self.resend()
				self._error("exception, exit!")
				break
Example #33
def main():
    if (NEED_PICKLE):
        pkl_path = utils.dsetToPickle('./dataset/', 'train.csv')
    else:
        pkl_path = './dataset/full_data.pkl'

    # set up the data transforms
    train_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.45], std=[0.225]),
    ])
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.45], std=[0.225]),
    ])

    # set up the datasets/loaders
    trainset, testset = utils.getData(pkl_path,
                                      split=0.75,
                                      drop=0.5,
                                      train_transform=train_transform,
                                      test_transform=test_transform)
    trainloader = DataLoader(trainset,
                             batch_size=64,
                             shuffle=True,
                             drop_last=True)
    testloader = DataLoader(testset,
                            batch_size=64,
                            shuffle=True,
                            drop_last=True)

    # create the model
    model = prnet.PretrainedResnet(TOTAL_ROOTS, TOTAL_VOWELS, TOTAL_CONS)
    # model.load_state_dict(torch.load('./best_model.model'))

    # train the model
    model = train.train(model, trainloader, testloader, epochs=35, lr=0.01)

    # save the model
    torch.save(model.state_dict(), './saved_model.model')

    # validate the model
    # acc = train.validate(model, testloader)
    # print("Validation Accuracy: %.3f" % (acc))

    return
Example #34
def add3():
    if request.method == 'POST':
        data = request.form
        print(data)
        sql = "INSERT INTO Card (CardNum,CardName,Type,Sex,Workunit,Address,Telephone,Email,RegisterDate) VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s');" \
              % (data['CardNum'], data['CardName'], data['Type'], data['Sex'],\
                 data['WorkUnit'], data['Address'], data['Telephone'], data['Email'],data['RegisterDate'])
        utils.execu(sql)
        return redirect(url_for('CardIfo'))
    else:
        sql = "select * from %s" % (utils.Table4)
        content, labels = utils.query(sql, utils.Table4)
        sql = "select TypeName from %s" % (utils.Table5)
        TypeName = utils.getData(sql)
        return render_template('CardAdd.html',
                               labels=labels,
                               content=content,
                               TypeName=TypeName)
Example #35
def sqlmodify3():
    if request.method == 'POST':
        data = request.form
        sql = "update %s set CardNum='%s',CardName='%s',TypeName='%s',Sex='%s',WorkUnit='%s',Address='%s',Telephone='%s',Email='%s',RegisterDate='%s' where CardNum=%s" \
              % (utils.Table4, data['CardNum'], data['CardName'], data['TypeName'], data['Sex'], data['WorkUnit'], data['Address'], data['Telephone'], data['Email'],data['RegisterDate'], data['uid'])
        utils.execu(sql)
        return redirect(url_for('CardIfo'))
    else:
        uid = int(request.args.get('uid'))
        sql = "select * from %s" % (utils.Table4)
        content, labels = utils.query(sql, utils.Table4)
        sql = "select TypeName from %s" % (utils.Table5)
        TypeName = utils.getData(sql)
        return render_template('CardModify.html',
                               labels=labels,
                               content=content,
                               uid=uid,
                               TypeName=TypeName)
Example #36
def sqlmodify1():
    if request.method == 'POST':
        data = request.form
        sql = "update %s set BookNum='%s',BookName='%s',Categories='%s',Author='%s',Press='%s',PublicateDate='%s',Price='%s',IsLend='%s' where BookNum=%s" \
              % (utils.Table1, data['BookNum'], data['BookName'], data['Categories'], data['Author'], data['Press'], data['PublicateDate'], data['Price'], data['IsLend'], data['uid'])
        utils.execu(sql)
        return redirect(url_for('BooIfo'))
    else:
        uid = int(request.args.get('uid'))
        sql = "select * from %s" % (utils.Table1)
        content, labels = utils.query(sql, utils.Table1)
        sql = "select categories from %s" % (utils.Table3)
        categories = utils.getData(sql)
        return render_template('BooModify.html',
                               labels=labels,
                               content=content,
                               uid=uid,
                               categories=categories)
Example #37
def question1(dataDir, subNum, imageName):
    
    time, data = getData(dataDir)
    x = data[:,0:3]
    x_des = data[:,3:6]

    data_label = ["x", "y", "z"]
    data_des_label = ["$x_d$", "$y_d$", "$z_d$"]
    fig, plots = plt.subplots(3, figsize=(5, 6))
    plotXYZ(plots, x, x_des, time, data_label, data_des_label)

    title = "Question 1" + subNum + "\nx vs x_desired"
    plots[0].set_title(title, fontsize=13,  font="monospace")
    fig.supylabel('End effector positions (m)', fontsize=11,  font="monospace")
    plots[-1].set_xlabel('Time (seconds)', fontsize=11,  font="monospace")
    fig.tight_layout()
    fig.savefig(imageName)

    plt.close('all')
Example #38
def question2(dataDir, subNum, imageName, imageName2):
    
    time, data = getData(dataDir)
    
    # part 1
    x = data[:,0:3]
    x_des = data[:,3:6]

    data_label = ["x", "y", "z"]
    data_des_label = ["$x_d$", "$y_d$", "$z_d$"]
    fig, plots = plt.subplots(3, figsize=(5, 6))
    plotXYZ(plots, x, x_des, time, data_label, data_des_label)

    title = "Question 2" + subNum + "\nx vs x_desired"
    plots[0].set_title(title, fontsize=13,  font="monospace")
    fig.supylabel('End effector positions (m)', fontsize=11,  font="monospace")
    plots[-1].set_xlabel('Time (seconds)', fontsize=11,  font="monospace")
    fig.tight_layout()
    fig.savefig(imageName)

    # part 2
    q4_data = data[:,6:9]
    q6_data = data[:,9:12]

    fig, plots = plt.subplots(2, figsize=(5, 6))
    title = "Question 2" + subNum + "\nJoint Angle with Joint Limits"

    data_label = [r"$q_{4}$", r"$q_{4_{low}}$", r"$q_{4_{high}}$"]
    lineType = ["c", "c--", "c-."]
    simpleSubplot(plots[0], q4_data, time, data_label, lineType)
    plots[0].set_title(title, fontsize=13,  font="monospace")

    data_label = [r"$q_{6}$", r"$q_{6_{low}}$", r"$q_{6_{high}}$"]
    lineType = ["m", "m--", "m-."]
    simpleSubplot(plots[1], q6_data, time, data_label, lineType)

    fig.supylabel('Joint Angles (rad)', fontsize=11,  font="monospace")
    plots[-1].set_xlabel('Time (seconds)', fontsize=11,  font="monospace")
    fig.tight_layout()
    fig.savefig(imageName2)

    plt.close('all')
Example #39
def rf(classes=3):
    X, y, _ = utils.getData(samplingType="1", classes=classes)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        stratify=y,
                                                        random_state=123456)

    rf = RandomForestClassifier(
        n_estimators=10,
        oob_score=True,
        random_state=123456,
        n_jobs=os.cpu_count(),
        criterion="entropy",
    )
    rf.fit(X_train, y_train)

    predicted = rf.predict(X_test)
    accuracy = accuracy_score(y_test, predicted)

    print(f"Out-of-bag score estimate: {rf.oob_score_:.3}")

    cm = pd.DataFrame(confusion_matrix(y_test, predicted), )
    sns.heatmap(cm, annot=True)

    with open(f"./result/rf_result_{classes}.pkl", "wb") as f:
        pkl.dump(
            [
                ["model", "predict", "accuracy", "out-of-bag", "cm"],
                [rf, predicted, accuracy, rf.oob_score_, cm],
            ],
            f,
        )

    print(f"Mean accuracy score: ", accuracy_score(y_test, predicted))
    print("precision: ", metrics.precision_score(y_test, predicted))
    print("recall: ", metrics.recall_score(y_test, predicted))
    print("f1: ", metrics.f1_score(y_test, predicted))
    print(classification_report(y_test, predicted))

    return y_test, predicted
def parse_nyt():
	url = "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=stock+market+crash+&begin_date=20070101&end_date=20090101&api-key=318a69b2af97848f66071cb4c1fdc831:15:69992102" 
	response = urlopen(url).read()
	response = json.loads(response)
	print "Got response from nytimes"
	articleContent = []
	i = 0
	page = 1
	hits = response["response"]["meta"]["hits"]
	while i<51 and page<(hits/10):
		print 'Getting response for page',page
		url = "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=stock+market+crash+&begin_date=20070101&end_date=20090101&page="+str(page)+"&api-key=318a69b2af97848f66071cb4c1fdc831:15:69992102" 
		try:
			response = urlopen(url).read()
			response = json.loads(response)
			for article in response["response"]["docs"]:
				if random.randint(0,3) == 3:  # keep roughly 1 article in 4 (randint(0,3) is inclusive of both ends)
					print article["web_url"]
					soup1 = BeautifulSoup(utils.getData(article["web_url"]))
					soup = soup1.findAll("p",{"itemprop": "articleBody"})
					if soup == None or len(soup) == 0:
						soup = soup1.find("div", {"id": "articleBody"})
						if soup!=None:
							soup = soup.findAll("p")
					if soup == None or len(soup)==0:
						soup = soup1.find("div", {"class": "articleBody"}) 
						if soup!=None:
							soup = soup.findAll("p")
					if soup!=None and len(soup)>0:
						if article["word_count"]>200 and article["lead_paragraph"]!=None:
							articleContent.append({})
							articleContent[i]["abstract"] = article["abstract"]
							articleContent[i]["pub_date"] = article["pub_date"]
							articleContent[i]["headline"] = article["headline"]["main"]
							articleContent[i]["keywords"] = article["keywords"]
							articleContent[i]["lead_paragraph"] = article["lead_paragraph"]
							articleContent[i]["web_url"] = article["web_url"]
							articleContent[i]["id"] = article["_id"]
							articleContent[i]["word_count"] = article["word_count"]
							keywords = ""
							keywords = getMultiples(article["keywords"],"value")
							# should probably pull these if/else checks into a module
							#	variables = [article["pub_date"], keywords, str(article["headline"]["main"]) if "main" in article["headline"].keys() else "", str(article["source"]) if "source" in article.keys() else "", str(article["document_type"]) if "document_type" in article.keys() else "", article["web_url"] if "web_url" in article.keys() else "",str(article["news_desk"]) if "news_desk" in article.keys() else "",str(article["section_name"]) if "section_name" in article.keys() else "",str(article["lead_paragraph"]).replace("\n","") if "lead_paragraph" in article.keys() else ""]
							#	line = "\t".join(variables)
							#	articleContent[i]["text"] = line
							sent = ""
							if type(soup) is not str:
								sent = " ".join([str(word) for word in soup])
							else:
								sent = soup
							articleContent[i]["text"] = utils.strip(sent)
							print articleContent[i]["headline"],article["keywords"],article["lead_paragraph"]
							i+=1
							print 'Extracted',i,article["pub_date"]
							if i>51:
								break
		except:
			print "Skipped"
		page+=1

	print "Articles Extracted",i	
	return articleContent
Example #41
)

args = parser.parse_args()

params.LW = args.LW
params.outfile = args.outfile
params.batchsize = args.batchsize
params.dim = args.dim
params.wordfile = args.wordfile
params.save = str2bool(args.save)
params.train = args.train
params.margin = args.margin
params.type = args.samplingtype
params.epochs = args.epochs
params.evaluate = str2bool(args.evaluate)
params.learner = str2learner(args.learner)
params.learner = lasagne.updates.adagrad

(words, We) = getWordmap(params.wordfile)
examples = getData(params.train, words)

if args.num_examples:
    examples = examples[0 : args.num_examples]

print "Number of training examples: ", len(examples)
print sys.argv

model = paragram_word_model(We, params)

train(model, examples, words, params)