def query(query, variables=None):
    """Run ``query`` through ``glc.execute_db_query`` and return its result.

    Args:
        query: SQL text handed to ``glc.execute_db_query``.
        variables: Optional query parameters. When ``None`` the helper is
            called with the query only; otherwise ``variables`` is passed as
            the second positional argument.

    Returns:
        Whatever ``glc.execute_db_query`` returns.
    """
    # Guard clause: no parameters means the single-argument call form.
    if variables is None:
        return glc.execute_db_query(query)
    return glc.execute_db_query(query, variables)
def concatContent():
    """Return the concatenated ``content`` of the November 2016 articles.

    The query selects ``content`` from ``articles`` with ``source_id = 2`` and
    ``created_at`` between 2016-11-01 00:00:00 and 2016-11-30 23:59:59.

    Returns:
        One string made of every row's ``content`` value, in the order the
        rows are returned; the empty string when there are no rows.
    """
    articles = glc.execute_db_query(
        "SELECT content FROM articles WHERE created_at BETWEEN '2016-11-01 00:00:00' AND '2016-11-30 23:59:59' AND source_id = 2"
    )
    # str.join builds the result in one pass instead of re-copying the
    # accumulated text on every ``+`` as the previous loop did.
    return "".join(article["content"] for article in articles)
def get_authors(ids, cursor=None):
    """Fetch the ``authors`` rows whose ``author_id`` is among ``ids``.

    Args:
        ids: Collection of author ids, bound as the single parameter of
            ``= ANY(%s)`` (presumably a list the DB driver adapts to an SQL
            array -- confirm against ``glc``).
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM authors WHERE author_id = ANY(%s)"
    params = (ids,)
    return glc.execute_db_query(sql, params, cursor=cursor)
def get_all_sources(cursor=None):
    """Fetch every row of the sources table.

    Args:
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    # The table name is spelled in upper case in the statement; it is kept
    # exactly as written.
    sql = "SELECT * FROM SOURCES"
    return glc.execute_db_query(sql, cursor=cursor)
def get_all_authors(cursor=None):
    """Fetch every row of the ``authors`` table.

    Args:
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM authors"
    return glc.execute_db_query(sql, cursor=cursor)
def get_author_articles(author_id, cursor=None):
    """Fetch the ``article_id`` values linked to one author.

    Args:
        author_id: Value matched against ``article_authors.author_id``.
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT article_id FROM article_authors WHERE author_id = %s"
    params = (author_id,)
    return glc.execute_db_query(sql, params, cursor=cursor)
def get_authors_with_num_sources(num_sources, cursor=None):
    """Fetch the ids of authors linked to exactly ``num_sources`` sources.

    The statement counts ``source_id`` per ``author_id`` in ``source_authors``
    and keeps the authors whose count equals ``num_sources``.

    Args:
        num_sources: Count each selected author must have.
        cursor: Forwarded as the ``cursor`` keyword of
            ``glc.execute_db_query``. It was previously accepted but silently
            dropped, unlike the other query helpers in this file.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = """WITH author_source_counts AS (SELECT author_id, COUNT(source_id) FROM source_authors GROUP BY author_id) SELECT author_id FROM author_source_counts WHERE count = %s"""
    return glc.execute_db_query(sql, (num_sources,), cursor=cursor)
def get_all_tags(cursor=None):
    """Fetch every row of the ``tags`` table.

    Args:
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM tags"
    return glc.execute_db_query(sql, cursor=cursor)
def get_tags_named(tag_names, cursor=None):
    """Fetch the ``tags`` rows whose ``tag_name`` is among ``tag_names``.

    Args:
        tag_names: Collection of names, bound as the single parameter of
            ``= ANY(%s)`` (presumably a list -- confirm against ``glc``).
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM tags WHERE tag_name = ANY(%s)"
    params = (tag_names,)
    return glc.execute_db_query(sql, params, cursor=cursor)
def get_sources_named(names, cursor=None):
    """Fetch the ``sources`` rows whose ``name`` is among ``names``.

    Args:
        names: Collection of source names, bound as the single parameter of
            ``= ANY(%s)`` (presumably a list -- confirm against ``glc``).
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM sources WHERE name = ANY(%s)"
    params = (names,)
    return glc.execute_db_query(sql, params, cursor=cursor)
def get_sources_linked(base_urls, cursor=None):
    """Fetch the ``sources`` rows whose ``base_url`` is among ``base_urls``.

    Args:
        base_urls: Collection of base URLs, bound as the single parameter of
            ``= ANY(%s)`` (presumably a list -- confirm against ``glc``).
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM sources WHERE base_url = ANY(%s)"
    params = (base_urls,)
    return glc.execute_db_query(sql, params, cursor=cursor)
def get_article_tag_link(article_id, tag_id, cursor=None):
    """Fetch the first ``article_tags`` row linking an article to a tag.

    Args:
        article_id: Value matched against ``article_tags.article_id``.
        tag_id: Value matched against ``article_tags.tag_id``.
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Element ``[0]`` of the query result.

    Note:
        The result is indexed unconditionally, so an empty result makes the
        ``[0]`` lookup fail (presumably ``IndexError`` -- confirm the result
        type of ``glc.execute_db_query``).
    """
    sql = "SELECT * FROM article_tags WHERE article_id = %s AND tag_id = %s"
    params = (article_id, tag_id)
    rows = glc.execute_db_query(sql, params, cursor=cursor)
    return rows[0]
def get_tag_articles(tag_id, cursor=None):
    """Fetch the ``article_id`` values linked to one tag.

    Args:
        tag_id: Value matched against ``article_tags.tag_id``.
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT article_id FROM article_tags WHERE tag_id = %s"
    params = (tag_id,)
    return glc.execute_db_query(sql, params, cursor=cursor)
def get_sources(ids, cursor=None):
    """Fetch the ``sources`` rows whose ``source_id`` is among ``ids``.

    Args:
        ids: Collection of source ids, bound as the single parameter of
            ``= ANY(%s)`` (presumably a list -- confirm against ``glc``).
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM sources WHERE source_id = ANY(%s)"
    params = (ids,)
    return glc.execute_db_query(sql, params, cursor=cursor)
def get_author_sources(author_id, cursor=None):
    """Fetch the ``source_id`` values linked to one author.

    Args:
        author_id: Value matched against ``source_authors.author_id``.
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT source_id FROM source_authors WHERE author_id = %s"
    # Bind the author_id argument. The previous code referenced ``source_id``,
    # which is not defined in this function and raised NameError on every call.
    return glc.execute_db_query(sql, (author_id,), cursor=cursor)
def get_source_author_link(source_id, author_id, cursor=None):
    """Fetch the first ``source_authors`` row linking a source to an author.

    Args:
        source_id: Value matched against ``source_authors.source_id``.
        author_id: Value matched against ``source_authors.author_id``.
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Element ``[0]`` of the query result.

    Note:
        The result is indexed unconditionally, so an empty result makes the
        ``[0]`` lookup fail (presumably ``IndexError`` -- confirm the result
        type of ``glc.execute_db_query``).
    """
    sql = "SELECT * FROM source_authors WHERE source_id = %s AND author_id = %s"
    params = (source_id, author_id)
    rows = glc.execute_db_query(sql, params, cursor=cursor)
    return rows[0]
def get_author_named(first_name, last_name, cursor=None):
    """Fetch the first ``authors`` row with the given first and last name.

    Args:
        first_name: Value matched against ``authors.first_name``.
        last_name: Value matched against ``authors.last_name``.
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Element ``[0]`` of the query result; any further matching rows are
        ignored.

    Note:
        The result is indexed unconditionally, so an empty result makes the
        ``[0]`` lookup fail (presumably ``IndexError`` -- confirm the result
        type of ``glc.execute_db_query``).
    """
    sql = "SELECT * FROM authors WHERE first_name = %s AND last_name = %s"
    params = (first_name, last_name)
    rows = glc.execute_db_query(sql, params, cursor=cursor)
    return rows[0]
def get_all_articles(cursor=None):
    """Fetch every row of the ``articles`` table.

    Args:
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM articles"
    return glc.execute_db_query(sql, cursor=cursor)
def get_authors_last_named(last_name, cursor=None):
    """Fetch the ``authors`` rows with the given last name.

    Args:
        last_name: Value matched against ``authors.last_name``.
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM authors WHERE last_name = %s"
    params = (last_name,)
    return glc.execute_db_query(sql, params, cursor=cursor)
def query_articles(self, query):
    """Run ``query`` and append every returned row to ``self.articles``.

    Example query:
        "SELECT * FROM articles WHERE created_at BETWEEN
        '2016-11-01 00:00:00' AND '2016-12-30 23:59:59' LIMIT 30"

    Args:
        query: SQL text passed as-is to ``glc.execute_db_query``.

    Returns:
        None. The rows are accumulated on ``self.articles``.
    """
    rows = glc.execute_db_query(query)
    for row in rows:
        # NOTE: rows are stored exactly as returned; no encoding or cleaning
        # of the article text is performed in this method.
        self.articles.append(row)
def get_tags(ids, cursor=None):
    """Fetch the ``tags`` rows whose ``tag_id`` is among ``ids``.

    Args:
        ids: Collection of tag ids, bound as the single parameter of
            ``= ANY(%s)`` (presumably a list -- confirm against ``glc``).
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM tags WHERE tag_id = ANY(%s)"
    params = (ids,)
    return glc.execute_db_query(sql, params, cursor=cursor)
def main():
    """Print the features computed for one article pulled from the database.

    Builds a ``Features`` instance, selects a single row from ``articles``
    (``LIMIT 1``) and prints ``get_features(row)`` for each row returned.
    """
    extractor = Features()
    sample_rows = glc.execute_db_query("SELECT * FROM articles LIMIT 1")
    for row in sample_rows:
        features = extractor.get_features(row)
        print(features)
def get_articles(ids, cursor=None):
    """Fetch the ``articles`` rows whose ``article_id`` is among ``ids``.

    Args:
        ids: Collection of article ids, bound as the single parameter of
            ``= ANY(%s)`` (presumably a list -- confirm against ``glc``).
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM articles WHERE article_id = ANY(%s)"
    params = (ids,)
    return glc.execute_db_query(sql, params, cursor=cursor)
def get_article_author_link(article_id, author_id, cursor=None):
    """Fetch the first ``article_authors`` row linking an article to an author.

    Args:
        article_id: Value matched against ``article_authors.article_id``.
        author_id: Value matched against ``article_authors.author_id``.
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Element ``[0]`` of the query result.

    Note:
        The result is indexed unconditionally, so an empty result makes the
        ``[0]`` lookup fail (presumably ``IndexError`` -- confirm the result
        type of ``glc.execute_db_query``).
    """
    sql = "SELECT * FROM article_authors WHERE article_id = %s AND author_id = %s"
    params = (article_id, author_id)
    rows = glc.execute_db_query(sql, params, cursor=cursor)
    return rows[0]
def query(query, variables=None):
    """Run ``query`` through ``glc.execute_db_query`` and return its result.

    This definition comes after an earlier ``query(query, variables=None)``
    in the same file and therefore replaces it at import time. The optional
    ``variables`` parameter is accepted here as well so callers that pass
    query parameters keep working; single-argument calls behave as before.

    Args:
        query: SQL text handed to ``glc.execute_db_query``.
        variables: Optional query parameters. When ``None`` the helper is
            called with the query only.

    Returns:
        Whatever ``glc.execute_db_query`` returns.
    """
    if variables is None:
        return glc.execute_db_query(query)
    return glc.execute_db_query(query, variables)
def get_articles_linked(urls, cursor=None):
    """Fetch the ``articles`` rows whose ``url`` is among ``urls``.

    Args:
        urls: Collection of URLs, bound as the single parameter of
            ``= ANY(%s)`` (presumably a list -- confirm against ``glc``).
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM articles WHERE url = ANY(%s)"
    params = (urls,)
    return glc.execute_db_query(sql, params, cursor=cursor)
def get_avg_authors_per_article(cursor=None):
    """Return the average number of authors per article.

    The statement counts ``author_id`` per ``article_id`` in
    ``article_authors`` and averages those counts.

    Args:
        cursor: Forwarded as the ``cursor`` keyword of
            ``glc.execute_db_query``. It was previously accepted but never
            passed on.

    Returns:
        The ``"avg"`` value of the first result row.
    """
    sql = """WITH article_author_count AS (SELECT article_id, COUNT(author_id) FROM article_authors GROUP BY article_id) SELECT AVG(count) FROM article_author_count"""
    rows = glc.execute_db_query(sql, cursor=cursor)
    # Elsewhere in this file query results are used as sequences of
    # dict-like rows (``[0]``, ``len()``, ``row["column"]``), so the row must
    # be selected before the column. The old code indexed the result set
    # itself with "avg".
    return rows[0]["avg"]
def get_articles_entitled(title, cursor=None):
    """Fetch the ``articles`` rows with the given title.

    Args:
        title: Value matched against ``articles.title``.
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM articles WHERE title = %s"
    params = (title,)
    return glc.execute_db_query(sql, params, cursor=cursor)
def get_data(num_tests, classifyFromTokens, test_variance=0, num_sources=None, num_articles=None):
    """Load sources, their articles and the token table, then split the articles.

    Args:
        num_tests: Total number of test articles to distribute across the
            sources (only used when ``classifyFromTokens`` is not 0).
        classifyFromTokens: When equal to 0, the token query also selects
            ``realCount`` and ``fakeCount``, every fetched article goes into
            the test list, and ``sources`` is replaced by a fixed
            fake/real mapping. Any other value triggers the random
            per-source train/test split.
        test_variance: Fraction of the even per-source share by which each
            randomly drawn split may deviate (0 means an exact even share,
            subject to ``int`` truncation).
        num_sources: Optional ``LIMIT`` for the sources query; ``None``
            fetches all sources.
        num_articles: Optional per-source ``LIMIT`` for the articles query;
            ``None`` fetches all of a source's articles.

    Returns:
        A 4-tuple ``(sources, training_articles, test_articles, tokens)``.

    Note:
        With no sources and ``classifyFromTokens != 0`` the even-share
        computation divides by zero. The last source's split is whatever
        remains of ``num_tests`` after the random draws and is not clamped.
    """
    # Sources: all of them, or the first ``num_sources``.
    if num_sources is not None:
        sources = glc.execute_db_query("SELECT source_id, name FROM sources LIMIT %s", (num_sources,))
    else:
        sources = glc.execute_db_query("SELECT source_id, name FROM sources")
    print("Sources query completed. Retrieved %d sources from database." % len(sources))

    # Map each source_id to the list of articles fetched for that source.
    articles_by_source = {}
    for source in sources:
        sid = source["source_id"]
        if num_articles is not None:
            rows = glc.execute_db_query("SELECT article_id, source_id, content FROM articles WHERE source_id=%s LIMIT %s", (sid, num_articles))
        else:
            rows = glc.execute_db_query("SELECT article_id, source_id, content FROM articles WHERE source_id=%s", (sid,))
        articles_by_source[sid] = rows
        print("%s query completed! Retrived %d articles from database." % (source["name"], len(rows)))

    # Tokens: the count columns are only selected when classifying from tokens.
    if classifyFromTokens == 0:
        token_sql = "SELECT token, realCount, fakeCount FROM tokens"
    else:
        token_sql = "SELECT token FROM tokens"
    tokens = glc.execute_db_query(token_sql)
    print("Tokens query completed! Retrived %d tokens from database." % (len(tokens)))
    print("All queries completed, data fetched!")

    training_articles = []
    test_articles = []

    if classifyFromTokens == 0:
        # Every article is test data; nothing is held out for training.
        for source in sources:
            test_articles.extend(articles_by_source[source["source_id"]])
        # The source rows are replaced by a fixed two-class description.
        sources = {"source_id": [0, 1], "name": ["fake", "real"]}
        return sources, training_articles, test_articles, tokens

    # Even share per source (ex. 3 sources this would be .33).
    even_split_percent = float(1) / len(sources)
    # Allowed deviation from the even share (ex. .33 with a variance of 10%
    # would have a range of .033).
    bound_range = even_split_percent * test_variance

    # Draw a random split for every source except the last one.
    source_splits = []
    for _ in range(len(sources) - 1):
        lowest = int(num_tests * (even_split_percent - bound_range))
        highest = int(num_tests * (even_split_percent + bound_range))
        source_splits.append(randint(lowest, highest))
    # The last source receives whatever is left of num_tests.
    source_splits.append(num_tests - sum(source_splits))

    print("\nTest data contains: ")
    # The first ``split`` articles of each source are test data, the rest
    # training data.
    for source, split in zip(sources, source_splits):
        rows = articles_by_source[source["source_id"]]
        training_articles.extend(rows[split:])
        test_articles.extend(rows[:split])
        print("\t%d %s articles" % (split, source["name"]))

    return sources, training_articles, test_articles, tokens
def get_fake_articles(cursor=None):
    """Fetch the ``articles`` rows whose ``is_fake`` column is TRUE.

    Args:
        cursor: Forwarded unchanged as the ``cursor`` keyword of
            ``glc.execute_db_query``.

    Returns:
        Whatever ``glc.execute_db_query`` returns for the query.
    """
    sql = "SELECT * FROM articles WHERE is_fake = TRUE"
    return glc.execute_db_query(sql, cursor=cursor)