Example #1
    def test_check_db(self):
        from xml.etree import ElementTree
        from db_utils import connect_to_db
        from weather_data import TABLE_NAME
        from collect_data import get_weather_data

        # fetch the live feed and parse the XML payload
        weather_data = get_weather_data()
        main_element = ElementTree.fromstring(weather_data.content)

        connection = connect_to_db()
        if connection:
            try:
                query = (
                    "SELECT "
                    "  datetime, cloudiness, wind_speed, wind_direction, temperature, humidity, pressure "
                    "FROM " + TABLE_NAME + " "
                    "WHERE "
                    "  datetime = '{0}'")
                cursor = connection.cursor()

                # compare each metData element against the stored rows
                check_results(self, cursor, main_element.findall("metData"),
                              query)

                print()  # blank line after the check output
            finally:
                connection.close()
Example #2
def worker_insert_tweets_2017(record_queue, m_q, num_workers):
    table = connect_to_db('drug_viz_data', 'tweets_2017_filter')
    none_count = 0
    while True:
        joined_gdf = record_queue.get()
        if joined_gdf is None:
            # each producer sends one None sentinel; exit once all have finished
            none_count += 1
            if none_count >= num_workers:
                m_q.put('Records finished, time {}'.format(datetime.now()))
                m_q.put(None)
                break
            continue
        # process fields
        joined_gdf = joined_gdf.rename(
            {
                'POP100': 'Pop_2010',
                'GEOID': 'GeoID_2010',
                'TRACT': 'TractID_2010',
                'STATE': 'StateID',
                'COUNTY': 'CountyID_2010'
            },
            axis='columns')
        joined_gdf = joined_gdf[[
            'Followers', 'Friends', 'Statuses', 'category', 'drugCategory',
            'fakeLocalTime', 'geo', 'textNorm', 'textRaw', 'textTokenCounts',
            'textTokens', 'timezone', 'tweetID', 'utcTime', 'Pop_2010',
            'GeoID_2010', 'TractID_2010', 'StateID', 'CountyID_2010'
        ]]
        # turn gdf to list of dicts
        tweet_dicts = joined_gdf.to_dict('records')
        # insert
        m_q.put('Inserting, time {}, size {}'.format(datetime.now(),
                                                     len(tweet_dicts)))
        table.insert_many(tweet_dicts)
Example #3
def process_records_mp():
    # load shpfiles
    all_gdf, main_crs = load_shpfiles()
    # load census data
    all_census_df = load_census_concat()
    # join shpfiles and census data
    pop_gdf = join_census_shpfile(all_gdf, all_census_df)
    # setup process and queues
    manager = Manager()
    in_q = manager.Queue(20)
    record_q = manager.Queue(3)
    m_q = manager.Queue(1000)
    num_workers = 2
    # start processes
    process_pool = []
    for i in range(num_workers):
        process_p = Process(target=worker_prepare_records_positive_tweets,
                            args=(main_crs, pop_gdf, in_q, record_q, m_q))
        process_p.start()
        process_pool.append(process_p)
    out_p = Process(target=worker_insert_tweets_2017,
                    args=(record_q, m_q, num_workers))
    out_p.start()
    # put file path into queue
    # change this path
    path_temp = 'D:/TEMPFILE/2017_filter/2017_positive_filtered_tweets_{}.json'
    for i in range(2):
        p = path_temp.format(i)
        in_q.put(p)
    for i in range(num_workers):
        in_q.put(None)
    print('all in')
    # wait till finish
    while True:
        m = m_q.get()
        if m is None:
            print('all done')
            break
        print(m)
    # make index
    table = connect_to_db('drug_viz_data', 'tweets_2017_filter')
    table.create_index([('textNorm', pymongo.TEXT)])
    table.create_index([('geo', pymongo.GEOSPHERE)])
    table.create_index("utcTime")
    table.create_index("fakeLocalTime")
    table.create_index("textTokens")
    table.create_index("category")
    table.create_index("drugCategory")
    table.create_index("StateID")
    table.create_index("GeoID_2010")
    table.create_index("Pop_2010")
    # done
    for p in process_pool:
        p.join()
    out_p.join()
    print('all processes ended')
    return
Example #4
def prepare():
    from db_utils import connect_to_db, create_tables

    print("Preparing environment ...")
    connection = connect_to_db()
    if connection:
        try:
            create_tables(connection)
        finally:
            connection.close()

        print("Environment successfully prepared.")
Example #5
def collect():
    from xml.etree import ElementTree
    from db_utils import connect_to_db

    connection = connect_to_db()
    if connection:
        try:
            weather_data = get_weather_data()
            main_element = ElementTree.fromstring(weather_data.content)
            data_elements = main_element.findall("metData")
            for data_element in data_elements:
                # rename to avoid shadowing the HTTP response above
                weather_record = convert_xml_to_weather_record(data_element)
                save_to_db(connection, weather_record)
                print("Weather data collected for {0}.".format(
                    weather_record.datetime))

            connection.commit()

        finally:
            connection.close()
Example #6
def user_info_extraction_cycle_db(driver, users_list, is_from_scratch):
    """
    Process of extracting user data of user_list and saving them to a DB
    :param driver: The instantiated web driver
    :param users_list: List of user names
    :param is_from_scratch: boolean, specify if taking info from scratch
    :return: None
    """
    connection = db_utils.connect_to_db(config.DB_HOST, config.DB_USER,
                                        config.DB_PWD, config.DB_NAME)
    if not is_from_scratch:
        existing_users = db_utils.select_users(connection, users_list)
        users_list = get_new_users_db(users_list, existing_users)
    if users_list:
        users_dict = get_users_info(driver, users_list)
        if users_dict:
            db_utils.save_user_info(connection, users_dict)
            logger.info(config.MSG_DICT["SAVING_DB_USERS_COUNT"].format(
                len(users_dict)))
    connection.close()
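
# A hypothetical invocation of the function above; the Selenium driver and the
# user names are placeholder assumptions, not part of the original module.
from selenium import webdriver

driver = webdriver.Chrome()
try:
    user_info_extraction_cycle_db(driver, ['alice', 'bob'], is_from_scratch=False)
finally:
    driver.quit()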
Example #7
    for table in tables:
        if table not in tables_existing:
            tdef = tables[table]
            sql = """
				CREATE TABLE policing.""" + table + """ (
					""" + ', '.join([
                col[0] + ' ' + col[1]
                for col in tdef if col[0] not in ['primary_key', 'index']
            ]) + """ 
					""" + ''.join([
                ', PRIMARY KEY (' + col[1] + ')'
                for col in tdef if col[0] == 'primary_key'
            ]) + """
					""" + ''.join([
                ', INDEX (' + col[1] + ')' for col in tdef if col[0] == 'index'
            ]) + """
				)
			"""
            cur.execute(sql)
            con.commit()
            print('created table: ' + table)


############
##  MAIN  ##
############

con, cur = db_utils.connect_to_db()
tables_existing = check_tables(con, cur)
create_tables(con, cur, tables_existing)
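
# A hypothetical `tables` definition matching the shape the builder above expects:
# ordinary entries are (column_name, column_type) pairs, while entries whose first
# element is 'primary_key' or 'index' carry the target column name in the second slot.
tables = {
    'stops': [
        ('stop_id', 'INT NOT NULL'),
        ('stop_date', 'DATE'),
        ('officer_id', 'INT'),
        ('primary_key', 'stop_id'),
        ('index', 'officer_id'),
    ],
}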
Example #8
# logging.basicConfig(filemode='w',
#                     format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
#                     datefmt='%H:%M:%S',
#                     level=logging.DEBUG)
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)
handler = logging.FileHandler('./log/query_log.log', 'a', 'utf-8')
formatter = logging.Formatter(
    '%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s')
handler.setFormatter(formatter)  # attach the formatter via setFormatter, not attribute assignment
root_logger.addHandler(handler)

census_years = ['2010']
utc = pytz.utc
state_dict = get_state_fips_dict()
db_map = connect_to_db('drug_viz')
db_data = connect_to_db('drug_viz_data')

MAX_KEYWORDS = 100
MAX_SAMPLES = 10

# By default, each query field is joined with AND logic.
# To combine text search with a geo query, only "$geoWithin" can be used
# (MongoDB does not allow $near or $nearSphere together with $text).
# Text search is always performed first; the remaining filter conditions
# are then applied to its matches.

# The query should return the following fields:
#   mapping data: a GeoJSON string of census tracts
#   tweet data: each matched tweet with selected fields, including:
#     - id (not the original tweet ID, for privacy)
#     - local timestamp (for time display)
#     - token counts (for keyword display)
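
# A minimal sketch of such a query in pymongo, assuming connect_to_db('drug_viz_data')
# returns a database handle and using the tweets_2017_filter collection created above.
# The keyword, polygon, and drugCategory value are placeholders, not real data.
keyword = "opioid"
tract_polygon = {
    "type": "Polygon",
    "coordinates": [[[-87.9, 41.6], [-87.5, 41.6], [-87.5, 42.0],
                     [-87.9, 42.0], [-87.9, 41.6]]],
}
cursor = db_data["tweets_2017_filter"].find(
    {
        "$text": {"$search": keyword},  # text search runs first (uses the TEXT index)
        "geo": {"$geoWithin": {"$geometry": tract_polygon}},  # only $geoWithin combines with $text
        "drugCategory": "opioids",  # any further fields are joined with AND
    },
    # project only the fields the caller needs: id, local time, token counts
    {"_id": 1, "fakeLocalTime": 1, "textTokenCounts": 1},
)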