def query(query, variables = None):
	"""Run ``query`` through ``glc.execute_db_query`` and return its result.

	``variables`` is forwarded as the second positional argument only when it
	is not ``None``; otherwise the helper is called with the query alone.
	"""
	if variables is not None:
		return glc.execute_db_query(query, variables)
	return glc.execute_db_query(query)
示例#2
0
def concatContent():
	"""Concatenate the ``content`` of every row returned by the November 2016 query.

	The query selects ``content`` from ``articles`` with ``source_id = 2`` and a
	``created_at`` between 2016-11-01 00:00:00 and 2016-11-30 23:59:59.

	Returns:
		A single string made of each row's ``content`` value joined in result
		order, or ``""`` when the query returns no rows.
	"""
	# All articles in november
	articles = glc.execute_db_query("SELECT content FROM articles WHERE created_at BETWEEN '2016-11-01 00:00:00' AND '2016-11-30 23:59:59' AND source_id = 2")
	# join builds the text in one pass instead of re-copying it for every article
	return "".join(article["content"] for article in articles)
示例#3
0
def get_authors(ids, cursor = None):
	"""Query ``authors`` for rows whose ``author_id`` matches ``ANY`` of ``ids``.

	``ids`` is bound as the single query parameter and ``cursor`` is forwarded
	unchanged to ``glc.execute_db_query``, whose result is returned.
	"""
	sql = "SELECT * FROM authors WHERE author_id = ANY(%s)"
	params = (ids,)
	return glc.execute_db_query(sql, params, cursor = cursor)
示例#4
0
def get_all_sources(cursor = None):
	"""Return the result of selecting every column of every row in ``SOURCES``.

	``cursor`` is forwarded unchanged to ``glc.execute_db_query``.
	"""
	sql = "SELECT * FROM SOURCES"
	return glc.execute_db_query(sql, cursor = cursor)
示例#5
0
def get_all_authors(cursor = None):
	"""Return the result of selecting every column of every row in ``authors``.

	``cursor`` is forwarded unchanged to ``glc.execute_db_query``.
	"""
	sql = "SELECT * FROM authors"
	return glc.execute_db_query(sql, cursor = cursor)
示例#6
0
def get_author_articles(author_id, cursor = None):
	"""Query ``article_authors`` for the ``article_id`` values tied to ``author_id``.

	``author_id`` is bound as the single query parameter and ``cursor`` is
	forwarded unchanged to ``glc.execute_db_query``, whose result is returned.
	"""
	sql = "SELECT article_id FROM article_authors WHERE author_id = %s"
	params = (author_id,)
	return glc.execute_db_query(sql, params, cursor = cursor)
示例#7
0
def get_authors_with_num_sources(num_sources, cursor = None):
	"""Return the ``author_id`` rows whose per-author source count equals ``num_sources``.

	The count is ``COUNT(source_id)`` over ``source_authors`` grouped by
	``author_id``.

	Args:
		num_sources: Value the per-author count is compared against.
		cursor: Forwarded unchanged to ``glc.execute_db_query``; previously it
			was accepted but silently ignored, unlike the sibling helpers.

	Returns:
		Whatever ``glc.execute_db_query`` returns for the query.
	"""
	return glc.execute_db_query("""WITH author_source_counts AS
		(SELECT author_id, COUNT(source_id) FROM source_authors GROUP BY author_id)
		SELECT author_id FROM author_source_counts WHERE count = %s""", (num_sources,), cursor = cursor)
示例#8
0
def get_all_tags(cursor = None):
	"""Return the result of selecting every column of every row in ``tags``.

	``cursor`` is forwarded unchanged to ``glc.execute_db_query``.
	"""
	sql = "SELECT * FROM tags"
	return glc.execute_db_query(sql, cursor = cursor)
示例#9
0
def get_tags_named(tag_names, cursor = None):
	"""Query ``tags`` for rows whose ``tag_name`` matches ``ANY`` of ``tag_names``.

	``tag_names`` is bound as the single query parameter and ``cursor`` is
	forwarded unchanged to ``glc.execute_db_query``, whose result is returned.
	"""
	sql = "SELECT * FROM tags WHERE tag_name = ANY(%s)"
	params = (tag_names,)
	return glc.execute_db_query(sql, params, cursor = cursor)
示例#10
0
def get_sources_named(names, cursor = None):
	"""Query ``sources`` for rows whose ``name`` matches ``ANY`` of ``names``.

	``names`` is bound as the single query parameter and ``cursor`` is
	forwarded unchanged to ``glc.execute_db_query``, whose result is returned.
	"""
	sql = "SELECT * FROM sources WHERE name = ANY(%s)"
	params = (names,)
	return glc.execute_db_query(sql, params, cursor = cursor)
示例#11
0
def get_sources_linked(base_urls, cursor = None):
	"""Query ``sources`` for rows whose ``base_url`` matches ``ANY`` of ``base_urls``.

	``base_urls`` is bound as the single query parameter and ``cursor`` is
	forwarded unchanged to ``glc.execute_db_query``, whose result is returned.
	"""
	sql = "SELECT * FROM sources WHERE base_url = ANY(%s)"
	params = (base_urls,)
	return glc.execute_db_query(sql, params, cursor = cursor)
示例#12
0
def get_article_tag_link(article_id, tag_id, cursor = None):
	"""Return the first ``article_tags`` result for the given article/tag pair.

	The query result is indexed with ``[0]``; an empty result is not handled
	here, so the indexing error propagates to the caller.
	"""
	sql = "SELECT * FROM article_tags WHERE article_id = %s AND tag_id = %s"
	rows = glc.execute_db_query(sql, (article_id, tag_id), cursor = cursor)
	return rows[0]
示例#13
0
def get_tag_articles(tag_id, cursor = None):
	"""Query ``article_tags`` for the ``article_id`` values tied to ``tag_id``.

	``tag_id`` is bound as the single query parameter and ``cursor`` is
	forwarded unchanged to ``glc.execute_db_query``, whose result is returned.
	"""
	sql = "SELECT article_id FROM article_tags WHERE tag_id = %s"
	params = (tag_id,)
	return glc.execute_db_query(sql, params, cursor = cursor)
示例#14
0
def get_sources(ids, cursor = None):
	"""Query ``sources`` for rows whose ``source_id`` matches ``ANY`` of ``ids``.

	``ids`` is bound as the single query parameter and ``cursor`` is forwarded
	unchanged to ``glc.execute_db_query``, whose result is returned.
	"""
	sql = "SELECT * FROM sources WHERE source_id = ANY(%s)"
	params = (ids,)
	return glc.execute_db_query(sql, params, cursor = cursor)
示例#15
0
def get_author_sources(author_id, cursor = None):
	"""Query ``source_authors`` for the ``source_id`` values tied to ``author_id``.

	Args:
		author_id: Bound to the ``author_id = %s`` placeholder. The previous
			code bound the undefined name ``source_id`` here, which raised
			``NameError`` on every call.
		cursor: Forwarded unchanged to ``glc.execute_db_query``.

	Returns:
		Whatever ``glc.execute_db_query`` returns for the query.
	"""
	sql = """SELECT source_id FROM source_authors WHERE author_id = %s"""
	return glc.execute_db_query(sql, (author_id,), cursor = cursor)
示例#16
0
def get_source_author_link(source_id, author_id, cursor = None):
	"""Return the first ``source_authors`` result for the given source/author pair.

	The query result is indexed with ``[0]``; an empty result is not handled
	here, so the indexing error propagates to the caller.
	"""
	sql = "SELECT * FROM source_authors WHERE source_id = %s AND author_id = %s"
	rows = glc.execute_db_query(sql, (source_id, author_id), cursor = cursor)
	return rows[0]
示例#17
0
def get_author_named(first_name, last_name, cursor = None):
	"""Return the first ``authors`` result matching both ``first_name`` and ``last_name``.

	The query result is indexed with ``[0]``; an empty result is not handled
	here, so the indexing error propagates to the caller.
	"""
	sql = "SELECT * FROM authors WHERE first_name = %s AND last_name = %s"
	rows = glc.execute_db_query(sql, (first_name, last_name), cursor = cursor)
	return rows[0]
示例#18
0
def get_all_articles(cursor = None):
	"""Return the result of selecting every column of every row in ``articles``.

	``cursor`` is forwarded unchanged to ``glc.execute_db_query``.
	"""
	sql = "SELECT * FROM articles"
	return glc.execute_db_query(sql, cursor = cursor)
示例#19
0
def get_authors_last_named(last_name, cursor = None):
	"""Query ``authors`` for rows whose ``last_name`` equals the given value.

	``last_name`` is bound as the single query parameter and ``cursor`` is
	forwarded unchanged to ``glc.execute_db_query``, whose result is returned.
	"""
	sql = "SELECT * FROM authors WHERE last_name = %s"
	params = (last_name,)
	return glc.execute_db_query(sql, params, cursor = cursor)
示例#20
0
	def query_articles(self, query):
		"""Run ``query`` and append every returned row to ``self.articles``.

		Example query: "SELECT * FROM articles  WHERE created_at BETWEEN '2016-11-01 00:00:00' AND '2016-12-30 23:59:59' LIMIT 30"

		Rows are stored exactly as ``glc.execute_db_query`` yields them; no
		encoding or cleaning of the article text happens here.
		"""
		for row in glc.execute_db_query(query):
			self.articles.append(row)
示例#21
0
def get_tags(ids, cursor = None):
	"""Query ``tags`` for rows whose ``tag_id`` matches ``ANY`` of ``ids``.

	``ids`` is bound as the single query parameter and ``cursor`` is forwarded
	unchanged to ``glc.execute_db_query``, whose result is returned.
	"""
	sql = "SELECT * FROM tags WHERE tag_id = ANY(%s)"
	params = (ids,)
	return glc.execute_db_query(sql, params, cursor = cursor)
示例#22
0
def main():
    """Print ``Features.get_features`` for each row of a one-article query.

    A ``Features`` instance is created first, then the ``LIMIT 1`` query on
    ``articles`` is run and each returned row is passed to ``get_features``.
    """
    extractor = Features()

    for row in glc.execute_db_query("SELECT * FROM articles LIMIT 1"):
        print(extractor.get_features(row))
示例#23
0
def get_articles(ids, cursor = None):
	"""Query ``articles`` for rows whose ``article_id`` matches ``ANY`` of ``ids``.

	``ids`` is bound as the single query parameter and ``cursor`` is forwarded
	unchanged to ``glc.execute_db_query``, whose result is returned.
	"""
	sql = "SELECT * FROM articles WHERE article_id = ANY(%s)"
	params = (ids,)
	return glc.execute_db_query(sql, params, cursor = cursor)
示例#24
0
def get_article_author_link(article_id, author_id, cursor = None):
	"""Return the first ``article_authors`` result for the given article/author pair.

	The query result is indexed with ``[0]``; an empty result is not handled
	here, so the indexing error propagates to the caller.
	"""
	sql = "SELECT * FROM article_authors WHERE article_id = %s AND author_id = %s"
	rows = glc.execute_db_query(sql, (article_id, author_id), cursor = cursor)
	return rows[0]
示例#25
0
def query(query):
	"""Execute ``query`` via ``glc.execute_db_query`` and hand back its result."""
	result = glc.execute_db_query(query)
	return result
示例#26
0
def get_articles_linked(urls, cursor = None):
	"""Query ``articles`` for rows whose ``url`` matches ``ANY`` of ``urls``.

	``urls`` is bound as the single query parameter and ``cursor`` is forwarded
	unchanged to ``glc.execute_db_query``, whose result is returned.
	"""
	sql = "SELECT * FROM articles WHERE url = ANY(%s)"
	params = (urls,)
	return glc.execute_db_query(sql, params, cursor = cursor)
示例#27
0
def get_avg_authors_per_article(cursor = None):
	"""Return the average of the per-article author counts.

	The count is ``COUNT(author_id)`` over ``article_authors`` grouped by
	``article_id``; the outer query takes ``AVG`` of those counts.

	Args:
		cursor: Forwarded unchanged to ``glc.execute_db_query``; previously it
			was accepted but silently ignored.

	Returns:
		The ``"avg"`` value of the first row of the query result.
	"""
	rows = glc.execute_db_query("""WITH article_author_count AS
		(SELECT article_id, COUNT(author_id) FROM article_authors GROUP BY article_id)
		SELECT AVG(count) FROM article_author_count""", cursor = cursor)
	# Other helpers in this file treat the query result as a sequence of rows
	# (iterated, len()-ed, indexed with [0]), so pick the first row before
	# reading the column rather than indexing the result with a string key.
	return rows[0]["avg"]
示例#28
0
def get_articles_entitled(title, cursor = None):
	"""Query ``articles`` for rows whose ``title`` equals the given value.

	``title`` is bound as the single query parameter and ``cursor`` is
	forwarded unchanged to ``glc.execute_db_query``, whose result is returned.
	"""
	sql = "SELECT * FROM articles WHERE title = %s"
	params = (title,)
	return glc.execute_db_query(sql, params, cursor = cursor)
示例#29
0
def get_data(num_tests, classifyFromTokens, test_variance = 0, num_sources = None, num_articles = None):
	"""Fetch sources, articles and tokens and build training/test article lists.

	Args:
		num_tests: Target total of test articles across all sources. Only used
			when ``classifyFromTokens`` is not ``0``.
		classifyFromTokens: When ``0``, every fetched article goes into the test
			list, the training list stays empty, token counts are fetched too
			and the returned ``sources`` is a fixed fake/real label table. Any
			other value splits each source's articles into test and training
			parts.
		test_variance: Fraction of the even per-source share by which a
			source's randomly chosen test count may deviate.
		num_sources: ``LIMIT`` for the sources query; no limit when ``None``.
		num_articles: Per-source ``LIMIT`` for the articles query; no limit
			when ``None``.

	Returns:
		The 4-tuple ``(sources, training_articles, test_articles, tokens)``.

	Raises:
		ZeroDivisionError: In split mode when the sources query returns no rows.
	"""
	# Get specified number of sources from database (gets all if no number of sources passed in)
	if num_sources is None:
		sources = glc.execute_db_query("SELECT source_id, name FROM sources")
	else:
		sources = glc.execute_db_query("SELECT source_id, name FROM sources LIMIT %s", (num_sources,))
	print("Sources query completed. Retrieved %d sources from database." % len(sources))

	# Build a dictionary of source_id keys which contain lists of articles from that source as their values
	source_article_dict = {}
	for source in sources:
		current_id = source["source_id"]
		if num_articles is None:
			fetched = glc.execute_db_query("SELECT article_id, source_id, content FROM articles WHERE source_id=%s", (current_id,))
		else:
			fetched = glc.execute_db_query("SELECT article_id, source_id, content FROM articles WHERE source_id=%s LIMIT %s", (current_id, num_articles))
		source_article_dict[current_id] = fetched

		print("%s query completed! Retrived %d articles from database." % (source["name"], len(fetched)))

	# Build a dictionary of all of the tokens in the database
	if classifyFromTokens == 0:
		tokens = glc.execute_db_query("SELECT token, realCount, fakeCount FROM tokens")
	else:
		tokens = glc.execute_db_query("SELECT token FROM tokens")

	print("Tokens query completed! Retrived %d tokens from database." % (len(tokens)))

	print("All queries completed, data fetched!")

	# Defined up front so both modes (and any number of sources) return valid lists
	training_articles = []
	test_articles = []

	if classifyFromTokens == 0:
		# Token-classification mode: every article is test data
		for source in sources:
			test_articles.extend(source_article_dict[source["source_id"]])

		# The caller receives the fixed fake/real label table instead of the queried sources
		sources = {"source_id": [0, 1], "name": ["fake", "real"]}
		return sources, training_articles, test_articles, tokens

	# Get the even distribution percent that would happen with this many sources (ex. 3 sources this would be .33)
	even_split_percent = float(1) / len(sources)
	# Get the range on the bounded percentage split (ex. .33 with a variance of 10% would have a range of .033)
	bound_range = even_split_percent * test_variance
	# Minimum and maximum number of tests a single source can be given (same for every source)
	min_bound = int(num_tests * (even_split_percent - bound_range))
	max_bound = int(num_tests * (even_split_percent + bound_range))

	# Randomly pick a test count for every source except the last one
	source_splits = [randint(min_bound, max_bound) for _ in range(len(sources) - 1)]
	# The last source takes whatever is left. This must happen only after ALL
	# other counts are drawn; doing it (and the slicing below) inside the draw
	# loop used not-yet-drawn float placeholders as slice indices.
	# NOTE(review): with a large test_variance the drawn counts can exceed
	# num_tests, making this remainder negative - confirm intended limits.
	source_splits.append(num_tests - sum(source_splits))

	print("\nTest data contains: ")
	# Create training and test article lists using the generated splits
	for source, split in zip(sources, source_splits):
		source_articles = source_article_dict[source["source_id"]]
		training_articles.extend(source_articles[split:])
		test_articles.extend(source_articles[:split])

		print("\t%d %s articles" % (split, source["name"]))

	return sources, training_articles, test_articles, tokens
示例#30
0
def get_fake_articles(cursor = None):
	"""Return the result of selecting all ``articles`` rows where ``is_fake = TRUE``.

	``cursor`` is forwarded unchanged to ``glc.execute_db_query``.
	"""
	sql = "SELECT * FROM articles WHERE is_fake = TRUE"
	return glc.execute_db_query(sql, cursor = cursor)