Example No. 1
def import_submissions(course_id = "C00198", dbname="test1"):
	con = psycopg2.connect("dbname=%s"%dbname)
	# subs = generate_submissions(users=10, pbls=100)

	userslist = pd.read_sql_query("select user_id from coursesusers where course_id like '%s';" % course_id,con=con)
	userslist = [u[0] for u in userslist.values if u[0]]
	usersstr = str(userslist).strip('[]')

	users = pd.read_sql_query("select user_id, creation_date from users \
	                            where demo=0 and instructor=0 and administrator=0\
	                            and user_id in (%s);" % usersstr,con=con)

	lusers = users.user_id.tolist()
	usersstr = str(lusers).strip('[]')

	problists = pd.read_sql_query("select list_id from courseslists where course_id like '%s';" % course_id ,con=con)
	problists = [l[0] for l in problists.values if l[0]]
	problistsstr = str(problists).strip('[]')

	probs = pd.read_sql_query("select problem_nm from listitems where list_id \
	                                in(%s);" % problistsstr,con=con)

	lprobs = [p[0] for p in probs.values if p[0]]
	lprobsstr = str(lprobs).strip('[]')

	submissions = pd.read_sql_query("select submission_uid, user_id, problem_id, submission_id, \
	                        state, time_out, time_in, veredict, score \
	                        from submissions where user_id in (%s);" % (usersstr) ,
	                       con=con)

	submissions.problem_id = submissions.problem_id.apply(lambda x: x[:-3])
	#submissions.set_index('submission_uid', inplace=True)

	subs = submissions[submissions.problem_id.isin(lprobs)]
	return subs
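# A minimal standalone sketch (not part of the original example) of the same course-users
# lookup with a parameterized query, letting psycopg2 quote course_id instead of interpolating
# it into the SQL string. The helper name is illustrative; the table and column names come
# from import_submissions above.
import pandas as pd
import psycopg2

def select_course_users(course_id="C00198", dbname="test1"):
    con = psycopg2.connect("dbname=%s" % dbname)
    # pandas forwards `params` to the DB-API driver; psycopg2 uses %(name)s placeholders
    return pd.read_sql_query(
        "select user_id from coursesusers where course_id = %(course_id)s;",
        con=con, params={"course_id": course_id})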
def fetch2DB():
    # init step
    fetch2DB.timestamp = datetime.now()

    # step1: get DB connection
    dcm_sql = dcm(echo=False)
    engine = dcm_sql.getengine()
    conn = dcm_sql.getconn()

    # step2.1: get current stock list
    dfm_stocks = pd.read_sql_query('''select [Stock_ID] from stock_basic_info 
                                        where (Market_ID = 'SH' or Market_ID = 'SZ') 
                                        and (Stock_ID like '0%' or Stock_ID like '3%' or Stock_ID like '6%')

                                            '''
#                                     + " and Stock_ID = '300274'"  #only used for debug and test purpose
                               , engine)
#    print(dfm_stocks)

    # step2:loop at stock list and fetch and save to DB
    for item in dfm_stocks['Stock_ID']:         # get column Stock_ID from dataframe
    # step2.2: get current character list in each loop so that new chars are included.
        dfm_cur_chars = pd.read_sql_query('''select * from ZCFG_character
                                            where Char_Origin = 'Tquant'
                                            and (Char_Usage = 'FIN10' or Char_Usage = 'FIN20' or Char_Usage = 'FIN30' )
                                                '''
                                          , engine)
        fetch2DB_individual(item,dfm_cur_chars,conn)               # item is str type
Example No. 3
def backTest(trainEndDate, code, testDate, predictDate):
    conn = db.get_history_data_db('D')
    df = None
    # train more date
    # model = pickle.load(open('%s/%s.pkl' % (config.model_dir, code), 'r'))
    rng = np.random.RandomState(1)
    model = AdaBoostRegressor(DecisionTreeRegressor(
        max_depth=4), n_estimators=1000, random_state=rng, loss='square')
    df = pd.read_sql_query(
        "select * from history_data where date([date])<='%s' and code='%s' order by code, date([date]) asc" % (
            trainEndDate, code), conn)
    shift_1 = df['close'].shift(-2)
    df['target'] = shift_1
    data = df[df['target'] > -1000]

    X_train = data.ix[:, 'code':'turnover']
    y_train = data.ix[:, 'target']
    if len(X_train) < 500:
        return
    print len(X_train)
    # print data
    # for i in range(0, 10):
    #     model.fit(X_train, y_train)
    model.fit(X_train, y_train)
    # predict tomorrow
    try:
        df = pd.read_sql_query(config.sql_history_data_by_code_date % (code, testDate), conn)
        # print df
    except Exception, e:
        print e
Example No. 4
def get_dfs(station = None, path='./data/'):
  """
   Get all available databases of bikes and weather,
   return dict of { date : [bikes_dataframe, weather_dataframe] } 
  """
  df_dic = {}
  for db in os.listdir(path):
    if db.endswith(".db"):
      with sqlite3.connect(path+db) as con:
        if station: bikes = pd.read_sql_query(
          "SELECT \"index\",\""+station+"\" FROM bikes", con)
        else:       bikes = pd.read_sql_query(
          "SELECT * FROM bikes", con)
        weather = pd.read_sql_query("SELECT * FROM weather", con)
        # Only include full-day records, 2-minute intervals
        # means 60*24/2 ~ 700 scrapes
        if len(bikes['index']) > 700:
          date = datetime.datetime.strptime(
            db.split("_")[0] , "%Y-%m-%d").date()
        # Fix wind speed values and cast temperatures to integers
          weather['Wind_Speed'] =  weather['Wind_Speed'].replace(
                                     to_replace = 'calm', value = 0)
          weather[['Temperature', 'Feels_Like','Wind_Speed']] = weather[
            ['Temperature',  'Feels_Like' ,'Wind_Speed']
            ].astype(int)
          #convert timestamp string to a datetime time object
          bikes = bikes.rename(columns = {'index' : 'Time'})
          for df in [bikes, weather]:
            df['Time'] = pd.to_datetime(df['Time'].apply(lambda x:
                datetime.datetime.strptime(str(date)+' '+str(x),
                    "%Y-%m-%d %H:%M:%S")))

          df_dic[str(date)] = [bikes,weather]

  return df_dic
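# Illustrative usage of get_dfs above; the station name and path are placeholders,
# not values from the original source.
dfs = get_dfs(station="Example Station", path="./data/")
for day, (bikes_df, weather_df) in dfs.items():
    print(day, bikes_df.shape, weather_df.shape)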
def do_test_winloss(username, dbname):

    con = None
    con = psycopg2.connect(database=dbname, user=username)
    known_table = 'winloss'
    
    sql_query = "SELECT COUNT(*) FROM %s;" % (known_table)
    try:
        count_sql = pd.read_sql_query(sql_query, con)
        if count_sql is not None:
            exists = True
    except:
        exists = False
    print '    Table, %s, exists: %s' % (known_table, exists)

    if exists is True:
        print '      Total number of entries in %s: %i' % (known_table, count_sql.loc[0])

        sql_query = "SELECT * FROM %s;" % (known_table)
        try:
            all_sql = pd.read_sql_query(sql_query, con)
        except:
            a = 1
        print '      First 5 entries of %s: ' % (known_table)
        print all_sql.head(5)
    print ''
def do_test_gamestats(username, dbname, year):
        
    con = None
    con = psycopg2.connect(database=dbname, user=username)
    known_table = 'teams' + year 
        
    sql_query = "SELECT COUNT(*) FROM %s;" % (known_table)
    try:
        count_sql = pd.read_sql_query(sql_query, con)
        if count_sql is not None:
            exists = True
    except:
        exists = False
    print '    Table, %s, exists: %s' % (known_table, exists)

    sql_query = "SELECT DISTINCT(game_id) FROM %s;" % (known_table)
    try:
        count_to_get = pd.read_sql_query(sql_query, con)
        print '    There are %s distinct games in the %s table' % (len(count_to_get), known_table)
    except:
        print '  games table, %s, does not exist' % known_table
     
       
    sql_query = "SELECT * FROM %s;" % (known_table)
    try:
        all_sql = pd.read_sql_query(sql_query, con)
        print '      First 5 entries of %s: ' % (known_table)
        print all_sql.head(5)
        print '      Last 10 entries of %s: ' % (known_table)
        print all_sql.tail(10)
    except:
        a = 1
 
    print ''
Example No. 7
    def meet_all(self, day):
        """
        In a site meeting, the latest states of three variable sets are synchronized.
        :param day: the day on which the meeting is held
        :return:
        """
        projects = pd.read_sql_query(
            "SELECT ID as ProjectID FROM Fact_Project WHERE MeetingCycle<>0 AND " + str(day) + " % MeetingCycle =0",
            self.engine)
        if len(projects.ProjectID) == 0 or day == 1:
            return 0
        # information about task progress
        sync_task = pd.read_sql_query("SELECT * FROM Sync_Task", self.engine)
        sync_task = sync_task.merge(projects, how='inner', on=['ProjectID']).reset_index(drop=True)
        sync_task['Day'] = day - 1
        self.log_wp(sync_task)

        # information about production rate
        sync_production_rate = pd.read_sql_query("SELECT * FROM Sync_ProductionRate", self.engine)
        sync_production_rate = sync_production_rate.merge(projects, how='inner', on=['ProjectID']).reset_index(
            drop=True)
        sync_production_rate['Day'] = day - 1
        self.log_production_rate(sync_production_rate)

        # information about workspace priority
        sync_workspace_priority = pd.read_sql_query("SELECT * FROM Sync_WorkSpacePriority", self.engine)
        sync_workspace_priority = sync_workspace_priority.merge(projects, how='inner', on=['ProjectID']).reset_index(
            drop=True)
        sync_workspace_priority['Day'] = day - 1
        self.log_priority_space(sync_workspace_priority)
def step2():
  df = pd.DataFrame()  # must be an instance (not the class) so df.empty works below
  # make list of unique asin
  sql = "SELECT DISTINCT asin FROM subset"
  asinList = pd.read_sql_query(sql, disk_engine)
  print(asinList.head())
  print(len(asinList.index))
  for row in asinList['asin'].tolist():
    print("loading: ", row)
    sql = "SELECT * FROM subset WHERE asin = '" + row + "' LIMIT 20"
    temp_df = pd.read_sql_query(sql, disk_engine, index_col = 'index')
    print("Temp df:\n", temp_df.head())
    if len(temp_df.index)>1:
      print("appending temp_df to df")
      if df.empty:
        df = temp_df
      else:
        df = df.append(temp_df, ignore_index=True)
    
  print(df.head(), df.tail())
  sql = "CREATE TABLE subset AS SELECT * FROM reviews WHERE asin IN ( SELECT asin FROM reviews GROUP BY asin HAVING COUNT (asin)>999)"
  disk_engine.execute(sql)
  print("new table created")
  sql = "SELECT * FROM subset a WHERE a.'index' IN ( SELECT b.'index' FROM subset b WHERE b.'index' IS NOT NULL AND a.'asin' = b.'asin' ORDER BY b.'unixReviewTime', b.'index' LIMIT 20) ORDER BY a.'asin', a.'unixReviewTime'" 

  # df = pd.read_sql_query(sql, disk_engine, index_col = 'index')
  # print(df.head())
  
  #df.to_sql('means', disk_engine, if_exists='replace', index_label = 'index')
  df.to_csv(text_destination)
  print("Success! Hooray!")
  return
Example No. 9
def getData(teamId):
    teamId = str(teamId)
    
    owinningScript = "SELECT Wteam AS team, Wscore AS score, (CAST (Wfgm AS FLOAT))/(CAST(Wfga AS FLOAT)) as fgp,(CAST (Wfgm3 AS FLOAT))/(CAST(Wfga3 AS FLOAT)) as tpp, (CAST (Wftm AS FLOAT))/(CAST(Wfta AS FLOAT)) as ftp, Wor as ofr FROM RegularSeasonDetailedResults WHERE Season >= 2014 AND (Wteam = ?)"
    owinningDf = pd.read_sql_query(owinningScript, conn, params = (teamId, ))
    
    olosingScript ="SELECT Lteam AS team, Lscore AS score, (CAST (Lfgm AS FLOAT))/(CAST(Lfga AS FLOAT)) as fgp,(CAST (Lfgm3 AS FLOAT))/(CAST(Lfga3 AS FLOAT)) as tpp, (CAST (Lftm AS FLOAT))/(CAST(Lfta AS FLOAT)) as ftp, Lor as ofr FROM RegularSeasonDetailedResults WHERE Season >= 2014 AND (Lteam =?)"
    olosingDf = pd.read_sql_query(olosingScript, conn, params = (teamId, ))
    
    oteamDf = owinningDf.append(olosingDf)
    
    o = oteamDf.apply(genOffScore,axis=1)
    
    omean = o.mean(axis=0) 
    #print(omean)

    dwinningScript = "SELECT Wteam as team, Lscore as oppscore, Lto as oppto, Wdr as dr, Wstl as stl, Wblk as blk FROM RegularSeasonDetailedResults WHERE Season >= 2014 AND (Wteam = ?)"
    dwinningDf = pd.read_sql_query(dwinningScript, conn, params = (teamId, ))
    
    dlosingScript = "SELECT Lteam as team, Wscore as oppscore, Wto as oppto, Ldr as dr, Lstl as stl, Lblk as blk FROM RegularSeasonDetailedResults WHERE Season >= 2014 AND (Lteam = ?)"
    dlosingDf = pd.read_sql_query(dlosingScript, conn, params = (teamId, ))
    
    dteamDf = dwinningDf.append(dlosingDf)
    
    d = dteamDf.apply(genDefScore,axis=1)
    dmean = d.mean(axis=0)

    od = pd.concat([o,d],axis=1)
    
    #return(omean,dmean)
    #print(od)
    return(od)
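# Note: DataFrame.append, used above, was deprecated in pandas 1.4 and removed in 2.0;
# an equivalent on current pandas (same variable names as above) would be:
#     oteamDf = pd.concat([owinningDf, olosingDf])
#     dteamDf = pd.concat([dwinningDf, dlosingDf])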
Example No. 10
def cesareans_output():
  #pull 'birth_month' from input field and store it
  patient = request.args.get('birth_month')
  ucase = request.args.get('ucase')
  #pull 'bad_foods' from input field and store it
  bad_foods = request.args.get('bad_foods')
  ucase = request.args.get('ucase')
  #just select the Cesareans  from the birth database for the month that the user inputs
  query = """SELECT * FROM nutrients_table2;"""
  #print query
  query_results=pd.read_sql_query(query,con)

  query2 = """SELECT * FROM dri_table;"""
  #print query
  query_results2=pd.read_sql_query(query2,con)

  #print query_results
  births = []
  for i in range(0,query_results.shape[0]):
      births.append(dict(pos=query_results.iloc[i]['pos'], energy=query_results.iloc[i]['energy'], price_serv=query_results.iloc[i]['price_serv']))
      
  the_result = []
  the_result, result_nutrients, nutrients4, total_cost = ModelIt(query_results, ucase, query_results2, patient, bad_foods)
  #the_result = len(the_result)
  return render_template("output.html", births = births, the_result = the_result, result_nutrients=result_nutrients, nutrients4=nutrients4, total_cost=total_cost, ucase=ucase, sites=query_results.to_html())
Example No. 11
def nearby_station_features(con, station_id):
    sql_query = "SELECT * FROM station_info;"
    station_info = pd.read_sql_query(sql_query, con)
    station_lat = station_info[(station_info['station_id'] == station_id)]['latitude'].values[0]
    station_lon = station_info[(station_info['station_id'] == station_id)]['longitude'].values[0]
    station_info['distance'] = ((station_info['latitude'] - station_lat) * 111.03) ** 2 + \
                               ((station_info['longitude'] - station_lon) * 85.39) ** 2

    counter = 0
    nearest_stations = []
    for station_id in station_info.sort_values('distance')['station_id']:
        ## can update this since new count
        check_station_status = pd.read_sql_query("SELECT * FROM station_statuses WHERE station_id = %d;" %station_id, con)
        if len(check_station_status['event_date']) > 100:
            nearest_stations.append(station_id)
            counter += 1
        if counter == 4:
            break

    nearby_station_data = []
    for index, nearby_id in enumerate(nearest_stations[1:]):
        sql_query = "SELECT event_date, num_bikes FROM station_statuses WHERE station_id = %d;" %nearby_id
        tmp_nearby_data = pd.read_sql_query(sql_query, con)
        nearby_station_data.append(pd.DataFrame(data={'event_date': tmp_nearby_data['event_date'],
                                                      'num_bikes_st%d' %(index + 1): tmp_nearby_data['num_bikes']}))

        # print index, nearby_station_data[-1].info()
    return nearby_station_data
Example No. 12
def validate_mutation_1(uniprot_id, mutation):
    """Select Provean; assert length > 0
    """
    logger.debug(helper.underline("Validating that we have provean..."))
    sql_query = """\
select 1
from {db_schema}.provean
where uniprot_id = '{uniprot_id}' and
provean_supset_filename is not null;
""".format(
        uniprot_id=uniprot_id, db_schema=conf.CONFIGS["db_schema"]
    )
    logger.debug(sql_query)
    df1 = pd.read_sql_query(sql_query, conf.CONFIGS["engine"])
    logger.debug(df1.head(2))
    #
    logger.debug(helper.underline("And that we have at least one domain with a template..."))
    sql_query = """\
select 1
from {db_schema}.uniprot_domain
join {db_schema}.uniprot_domain_template using (uniprot_domain_id)
where uniprot_id = '{uniprot_id}';
""".format(
        uniprot_id=uniprot_id, db_schema=conf.CONFIGS["db_schema"]
    )
    logger.debug(sql_query)
    df2 = pd.read_sql_query(sql_query, conf.CONFIGS["engine"])
    logger.debug(df2.head(2))
    assert len(df1) >= 1 or len(df2) == 0
Example No. 13
def getMyStocks(uid,flag,isSingle=False):
    user_id = uid
    if flag == '0' or flag == '1':
        global_bdf = pd.read_sql_query(
            "select ms.*,sb.zgb,sb.launch_date,sb.grow_type,sb.industry from my_stocks ms,stock_basic sb " \
            "where ms.code=sb.code and sb.flag=0 and ms.user_id = %(uid)s", db.engine, \
            params={'uid': user_id}, \
            index_col='code')
        bdf = global_bdf[global_bdf['flag'] == int(flag)]
        bdf = bdf.sort_values(by='created_time', ascending=False)
    elif isSingle:  # flag holds a stock code
        bdf = pd.read_sql_query(
            "select ms.*,sb.zgb,sb.launch_date,sb.grow_type,sb.industry from my_stocks ms,stock_basic sb " \
            "where ms.code=sb.code and sb.flag=0 and ms.code = %(code)s and ms.user_id = %(uid)s", db.engine,
            params={'code': flag, 'uid': user_id}, \
            index_col='code')
    elif flag == '2':  # all stocks
        bdf = dbs.get_global_basic_data()
    else:
        tf1 = pd.read_sql_query("select sb.* from relation_stocks rs,stock_basic sb " \
                                "where rs.relation_stock=sb.code and sb.flag=0 and rs.main_stock=%(name)s and rs.user_id=%(uid)s",
                                db.engine,
                                params={'name': flag, 'uid': user_id}, \
                                index_col='code')
        # add the stock itself
        tf2 = dbs.get_global_basic_data()
        tf2 = tf2[tf2.index == flag]
        bdf = pd.concat([tf1, tf2])

    return getStockItem(bdf)
Example No. 14
def load_football():
    """ Loads football data
    Dataset of football stats. +25,000 matches, +10,000 players from 11 European Countries with their lead championship
    Seasons 2008 to 2016. It also contains players attributes sourced from EA Sports' FIFA video game series,
    including the weekly updates, team line up with squad formation (X, Y coordinates), betting odds from up to 10 
    providers and detailed match events (goal types, possession, corner, cross, fouls, cards etc...) for +10,000 matches.
    The meaning of the columns can be found here: http://www.football-data.co.uk/notes.txt
    Number of attributes in each table (size of the dataframe):
    countries (11, 2)
    matches (25979, 115)
    leagues (11, 3)
    teams (299, 5)
    players (183978, 42)
    Link to the source: https://www.kaggle.com/hugomathien/soccer
    
    Returns
    -------
    list of pandas DataFrame
    """
    database_path = reduce(os.path.join, _FOOTBALL_PATH, _get_datapath())
    with sqlite3.connect(database_path) as con:
        countries = pd.read_sql_query("SELECT * from Country", con)
        matches = pd.read_sql_query("SELECT * from Match", con)
        leagues = pd.read_sql_query("SELECT * from League", con)
        teams = pd.read_sql_query("SELECT * from Team", con)
        players = pd.read_sql("SELECT * FROM Player_Attributes;", con)
    return countries, matches, leagues, teams, players
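# Illustrative call of load_football above, unpacking the frames it returns.
countries, matches, leagues, teams, players = load_football()
print(matches.shape)  # per the docstring, roughly (25979, 115)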
Example No. 15
def import_all_submissions(dbname="test1"):
    con = psycopg2.connect("dbname=%s" % dbname)
    # subs = generate_submissions(users=10, pbls=100)
    users = pd.read_sql_query("select user_id, creation_date from users \
                        where demo=0 and instructor=0 and administrator=0\
                                ", con=con)

    lusers = users.user_id.tolist()
    usersstr = str(lusers).strip('[]')

    probs = pd.read_sql_query("select problem_nm from abstractproblems where problem_nm like 'P%%'\
                              ", con=con)

    # problem_id like 'P%%'

    lprobs = [p[0] for p in probs.values if p[0]]
    lprobsstr = str(lprobs).strip('[]')

    submissions = pd.read_sql_query("select submission_uid, user_id, problem_id, submission_id, \
                            state, time_out, time_in, veredict, score \
                            from submissions where user_id in (%s);" % (usersstr),
                                    con=con)

    # get rid of languages
    submissions.problem_id = submissions.problem_id.apply(lambda x: x[:-3])
    #submissions.set_index('submission_uid', inplace=True)

    subs = submissions[submissions.problem_id.isin(lprobs)]
    return subs
Example No. 16
def get_record_factor(conn_func, parent_kind):
    conn = conn_func()
    qtype = QUESTION_SUBTYPE_MAP[parent_kind]
    sql = ('select question_type,question_id,user_id,status,date '
           'from question_record_detail where status!=0 and question_type=%s')
    record = pd.read_sql_query(sql, conn, params=(qtype,))
    record.rename(columns = {'question_type':'qtype',
                             'question_id':'qid',
                             'user_id':'uid'}, inplace=True)

    sql = ('select target_kind, target_id, tag_id from knowledge_tag '
           'where target_kind = %s')
    ktags = pd.read_sql_query(sql, conn, params=(qtype,))
    ktags.columns = ['qtype', 'qid', 'tag_id']

    conf = CONF_MAP[parent_kind]['question_conf']
    table_name = conf['table']
    qid_name = conf['qid']
    sql = ('select question_type,%s,difficulty from %s '
           'where question_type=%s' % (qid_name, table_name, qtype))
    diff = pd.read_sql_query(sql, conn)
    if len(diff) == 0:
        return None
    diff.columns = ['qtype', 'qid', 'difficulty']

    def _convert_fac(x):
        res = zip(map(str, x.tag_id), map(str, x.difficulty))
        return "|".join(['%s:%s' % (t, d) for t, d in res])

    fac = pd.merge(ktags, diff).groupby(['qtype', 'qid']
            ).apply(_convert_fac).reset_index()
    fac.columns = ['qtype', 'qid', 'score']

    return pd.merge(record, fac)[['uid', 'status', 'score']]
Example No. 17
	def __getCodeInfo__(self):
		rtn={}
		if self.config['UseCache']:
			rtn=self.__loadCache__('CodeInfo.json')
		if rtn=={}:
			CommodityInfo=pd.read_sql_query('select Future,Exchange,TradeUnit,Tick,Target,DeliveryMethod,Unit1,Unit2'+\
      			',TablePrefix,DominantContracts,AcsyCode from CommodityInfo where AcsyCode like N\'%0000\'',self.dbconn)
			TradeDate=pd.read_sql_query('select * from TradeDate where StartDate is not null',self.dbconn)
			TradeTime=pd.read_sql_query('select * from TradeTime',self.dbconn)
			CommodityInfo.index=CommodityInfo['AcsyCode'].apply(lambda x: x[:-4])
			CommodityInfo.drop(labels='AcsyCode',axis=1,inplace=True)
			CommodityInfo=CommodityInfo.T.to_dict()
			TradeDate.index=TradeDate['AcsyCode']
			TradeDate=TradeDate.T.to_dict()
			TT={}
			for name,group in TradeTime.groupby('AcsyCode'):
				TT[name]=[(datetime.datetime.strptime(v['StartTime'],'%H%M%S%f').time(),datetime.datetime.strptime(v['EndTime'],'%H%M%S%f').time()) for k,v in group.T.to_dict().items()]
			TradeTime={k:{(datetime.datetime.strptime(v['StartDate'],'%Y-%m-%d').date(),datetime.date(9999,12,31) if v['EndDate']==None else datetime.datetime.strptime(v['EndDate'],'%Y-%m-%d').date()):\
			TT[k]} for k,v in TradeDate.items()}
			TT={}
			
			for k,v in TradeTime.items():
				key=k[:-4]
				if key in TT.keys():
					TT[key].update(v)
				else:
					TT[key]=v
			
			rtn={}
			keys=set(TT.keys())
			for k,v in CommodityInfo.items():
				v.update({'TradeTime': TT[k] if k in keys else []})
				rtn[k]=v
			self.__saveCache__('CodeInfo.json',rtn)
		return rtn
Example No. 18
def populate_encoders_scale(table,disk_engine,events_tbl=None):
    df = pd.read_sql_query('select * from {table} limit 5'.format(table=table),disk_engine)
    col_names = df.columns.values
    encoders = {}
    # time_cols = ['AUTHZN_RQST_PROC_TM','PREV_ADR_CHNG_DT','PREV_PMT_DT','PREV_CARD_RQST_DT','frd_ind_swt_dt']
    for c,name in enumerate(col_names):
        tp = df.dtypes[c]
        # print tp

        if tp == 'object':
            # print 'ORIGINAL NAME:',name
            if name not in time_cols:
                print name
                df_cols = pd.read_sql_query('select distinct {col_name} from {table}'.format(col_name=name,table=table),disk_engine,chunksize=100000)
                arr = []
                progress = progressbar.ProgressBar(widgets=[progressbar.Bar('=', '[', ']'), ' ',
                                            progressbar.Percentage(), ' ',
                                            progressbar.ETA()]).start()
                for c,df_col in enumerate(df_cols): 
                    # arr = np.vstack((arr,np.array(df_col)))
                    arr.extend(np.array(df_col))
                    progress.update(c+1)
                if events_tbl is not None:
                    df_cols = pd.read_sql_query('select distinct {col_name} from {table}'.format(col_name=name,table=events_tbl),disk_engine,chunksize=100000)
                    for c,df_col in enumerate(df_cols): 
                        arr.extend(np.array(df_col))
                        progress.update(c+1)
                progress.finish()
                arr = np.array(arr)
                encoders[name] = encode_column(np.array(arr).ravel())
    return encoders
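# Note: as used above, passing chunksize to pd.read_sql_query returns an iterator of DataFrames
# instead of a single frame; a minimal sketch (table and column names are placeholders):
#     for chunk in pd.read_sql_query('select distinct some_col from some_table',
#                                    disk_engine, chunksize=100000):
#         process(chunk)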
Example No. 19
 def load_bars(self, pcontract, dt_start, dt_end, window_size):
     cursor = self.db.cursor()
     id_start, u = datautil.encode2id(pcontract.period, dt_start)
     id_end, u = datautil.encode2id(pcontract.period, dt_end)
     table = str(pcontract.contract).replace('.', '_')
     #sql = "SELECT COUNT(*) FROM {tb} \
             #WHERE {start}<=id AND id<={end}".format(tb=table, start=id_start, end=id_end)
     #max_length = cursor.execute(sql).fetchone()[0]
     #
     sql = "SELECT datetime, open, close, high, low, volume FROM {tb} \
             WHERE {start}<=id AND id<={end}".format(tb=table, start=id_start, end=id_end)
             
     data = pd.read_sql_query(sql, self.db, index_col='datetime')
     if not series.g_rolling:
         data = pd.read_sql_query(sql, self.db, index_col='datetime')
         ## @todo
         return SqliteSourceWrapper(pcontract, data, None, len(data))
     else:
         cursor.execute(sql)
         data = pd.DataFrame({
             'open': [],
             'close': [],
             'high': [],
             'low': [],
             'volume': []
             })
         data.index = []
         return SqliteSourceWrapper(pcontract, data, cursor, window_size)
Example No. 20
def get_breakdown(orig_state, dest_state, info):
    column_code = queries.code_translate[info]
    q = queries.transactionsBetweenStates.params(orig_code=orig_state,
                                                 dest_code=dest_state)
    a = queries.getAux(info)
    
    try:
        data = pd.DataFrame({'counts': pd.read_sql_query(q.limit(QL), ENGINE).groupby(info)[info].count()})
        appendix = pd.read_sql_query(a, ENGINE)
        if data.index.dtype != appendix[column_code].dtype:
            appendix[column_code] = appendix[column_code].astype(str)

        m = pd.merge(appendix, data, left_on=column_code, right_index=True)
        
        # convert all numeric-code columns to be named code for easier manip
        m = m.rename(columns={column_code:'code'})

        # for some reason the mode table has a diff name for description column
        if info == 'MODE':
            m = m.rename(columns={'Mode Description':'Description'})

    except Exception as e:
        return Response(str(e) + " is not a correct column to groupby")
    
    return Response(m.to_json(orient="records"),
                    mimetype='application/json',
                    headers={'Cache-Control': 'no-cache'})
Example No. 21
def get_data(comparison,candidate,labels,top_features,feature_tooltips,party):

    #pull mean info for the right candidate
    topic_string=""
    feature_count=0
    for topic_num in top_features:
        topic_string=topic_string + ", AVG(topic%s) as topic%s"%(topic_num,feature_count)
        feature_count=feature_count+1

    cand_supp_query="SELECT user_candidate.candidate%s FROM user_topics INNER JOIN user_candidate ON (user_topics.user_id = user_candidate.user_id) WHERE user_candidate.candidate='%s' GROUP BY user_candidate.candidate;" %(topic_string, candidate)
    candidate_supp_data=pd.read_sql_query(cand_supp_query,con)

    party_query="SELECT user_candidate.party%s FROM user_topics INNER JOIN user_candidate ON (user_topics.user_id = user_candidate.user_id) WHERE user_candidate.candidate !='%s' AND party = '%s' GROUP BY user_candidate.party;" %(topic_string, candidate, party)
    party_data=pd.read_sql_query(party_query,con)

    all_data=candidate_supp_data.append(party_data)
    all_data['label']='comparison'
    all_data['label'][all_data['candidate']==candidate]="ChosenCandidate"


    all_data.drop('candidate', axis=1, inplace=True)
    all_data.drop('party', axis=1, inplace=True)
    all_data.set_index(['label'],inplace=True)

    flipped=all_data.T
    flipped['index_word']=labels
    flipped['topic_words']=feature_tooltips

    return flipped
Example No. 22
def get_stock_k_line_if_ma_is_null(code):

    sql = 'SELECT min(date) as date FROM {table} where code={code} and ma_12 is NULL'.format(table=STOCK_KLINE_TABLE, code=code)
    df = pd.read_sql_query(sql, engine)

    d_end=datetime.datetime.today()
    #date_end =d_end.strftime('%Y-%m-%d')
    if len(df) > 0:
        date_start = df.ix[0, 'date']
        if date_start is None:
            return None

        date_start = str(date_start)[:10]
        d_start = str_to_datatime(date_start, '%Y-%m-%d')
        delta = d_end - d_start
        days = delta.days + AVR_LONG + 1

    try:
        sql = "select * from {table} where code='{code}' order by date desc limit {count}".format(
               table=STOCK_KLINE_TABLE, code=code,  count=days)

        df = pd.read_sql_query(sql, engine)
        df = df.sort_index(by='date', ascending=True)
        return df
    except Exception as e:
        print str(e)
        return None
Example No. 23
def read_db():
    con = None
    con = psycopg2.connect(database=db_name, user=db_user,
                           host='localhost', password=db_pswd)

    # query:
    weather_query = """
                    SELECT * FROM weather_data_table;
                    """

    record_query = """
                   SELECT * FROM running_data_table;
                   """

    stat_query = """
                 SELECT date, city, COUNT(city)
                 FROM running_data_table
                 GROUP BY date,city;
                 """
    # read database
    weather_data = pd.read_sql_query(weather_query, con)

    running_data = pd.read_sql_query(record_query, con)

    stat_data = pd.read_sql_query(stat_query, con)

    stat = pd.merge(weather_data, stat_data, on=['date', 'city'])

    full = pd.merge(weather_data, running_data, on=['date', 'city'])
    full.to_sql('full_data_table', engine, if_exists='replace')

    return weather_data, running_data, full, stat
Example No. 24
    def __load_tables(db_path, pair_name, session):
        """
        :param db_path: path to historical database
        :param pair_name: name of currency exchange pair
        :return: data frame of time, prices and volume, sessions list
        """
        table_names_pairs = sorted(FxSingleCurrencyBroker.DB_TABLES.items(), key=operator.itemgetter(1))
        logging.info("Loading " + db_path)
        # connect to sqlite database
        con = sqlite3.connect(db_path)
        # fetch table names
        cursor = con.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        cdata = cursor.fetchall()
        table_names = [c[0] for c in cdata]
        # check database
        for key, _ in table_names_pairs:
            if key not in table_names and key != 'TIME':
                raise LookupError("Loaded database doesn't have required table: " + key)
        # read tables data to data frames (pandas)
        df_list = [pd.read_sql_query("SELECT TIME from " + table_names[0], con)]
        df_columns = []
        for key, _ in table_names_pairs:
            if key != 'TIME':
                df_list.append(pd.read_sql_query("SELECT " + pair_name + " from " + key, con))
                logging.info(key + " has been read")
            df_columns.append(key)
        con.close()

        df = pd.concat(df_list, axis=1)
        df.columns = df_columns

        return df, FxSingleCurrencyBroker.__split_sessions(df, session)
Example No. 25
def test_data():
    """Issue some simple queries to test whether tables exist"""

    print "Testing 2 queries!"

    dbname = 'taxi'
    username = '******'

#     engine = create_engine('postgresql://%s@localhost/%s'%(username,dbname))
#     print 'ENGINE', engine.url

    # connect to the server to run a test SQL command
    con = None
    con = psycopg2.connect(database = dbname, user = username)

    # query:
    sql_query = "SELECT COUNT(trip_distance) "+\
        " FROM taxi_trips WHERE trip_distance > 10;"
    subdat = pd.read_sql_query(sql_query,con)
    print sql_query
    print subdat.head()

    sql_query = "SELECT COUNT(fare_amount) "+\
        " FROM taxi_fares WHERE fare_amount > 30;"
    subdat = pd.read_sql_query(sql_query,con)
    print sql_query
    print subdat.head()
    con.close()
Example No. 26
    def get_lesson_info(self):
        '''
        lesson_info: lesson_id, week, root_id, lesson_plan_id
        '''
        conn = self.conn_func()
        sql = ('select id s_id, parent_id, level '
               'from book_hierarchy where level > 0')
        A = pd.read_sql_query(sql, conn)
        A.sort(columns='level', inplace=True)
        root_id_map = {}
        for _, row in A.iterrows():
            s_id, parent_id, level = row
            s_id = int(s_id)
            level = int(level)
            if not pd.isnull(parent_id):
                parent_id = int(parent_id)
            if level == 1:
                root_id_map[s_id] = s_id
            else:
                root_id_map[s_id] = root_id_map[parent_id]
        _f = lambda x: root_id_map[x.s_id]
        sql = 'select id s_id, week from book_hierarchy where level = 4'
        lesson_info = pd.read_sql_query(sql, conn)
        lesson_info['root_id'] = lesson_info.apply(_f, axis=1)
        lesson_info.rename(columns={'s_id':'lesson_id'}, inplace=True)

        sql = ('select id lesson_plan_id, lesson_id '
               'from lesson_plan where subject_id = %s '
               'and status = %s' % (self.subject_id, K_APPROVED))
        lesson_plan = pd.read_sql_query(sql , conn)
        lesson_info = pd.merge(lesson_info, lesson_plan)
        return lesson_info
Example No. 27
def get_data(limit=None, target='severity_final'):
    """Returns train test split of relevant data from database."""
    print '{}: connecting to database'.format(datetime.datetime.now())
    conn = connect_db()

    print '{}: loading data from database'.format(datetime.datetime.now())
    col_list = """
        assigned_to_init, cc_init,
        product_init, version_init,
        component_init, op_sys_init, reporter_bug_cnt,
        desc_init, short_desc_init,
        priority_final, severity_final
        """
    if limit:
        df_original = pd.read_sql_query(
            'select {} from final limit {}'.format(col_list, limit), con=conn)
    else:
        df_original = pd.read_sql_query(
            'select {} from final'.format(col_list), con=conn)

    df = df_original.copy(deep=True)

    # Feature engineering
    print '{}: feature engineering {}'.format(datetime.datetime.now(), target)
    df = create_features(df, target=target)

    y_all = df.pop(target)
    X_all = df

    return train_test_split(X_all, y_all, test_size=0.25, random_state=42)
Example No. 28
def get_courses(con, filtered=True):
    df = pd.read_sql_query("select course_id, title from courses;", con=con)
    kw = df.title.apply(lambda x: x.split()[0])
    kw.name = "kw"
    df = df.join(kw)

    cusers = pd.read_sql_query("select user_id, course_id from coursesusers \
        where course_id in(%s);" % str(df.course_id.tolist()).strip('[]'),
                               con=con)

    # usrcnt = cusers[cusers.user_id != None].course_id.value_counts()
    cusers = cusers.drop_duplicates()
    cusers = cusers.dropna()
    cusers = cusers[cusers.user_id.isin(get_good_users(con))]
    usrcnt = cusers.course_id.value_counts()
    usrcnt.name = 'usrcnt'
    print usrcnt.describe()

    df.set_index(df.course_id, inplace=True)

    pblcnt = pd.Series(name='pblcnt')
    for c in df.course_id:
        pblcnt[c] = len(select_pbls(con, course_id=c))

    df = df.join(pblcnt)
    df.pblcnt = df.pblcnt.fillna(0)
    
    df = df.join(usrcnt)
    df.usrcnt = df.usrcnt.fillna(0)
    # kw_vc = df.kw.value_counts()
    return df
    def to_dataframe(self, timerange=None):
        c = self.conn.cursor()

        select_byrange = True
        select_syntax = None
        df = None

        if timerange is None:
            print "select all"
            select_byrange = False
        elif len(timerange) == 1:
            timestamp1 = int(timerange[0])
            timestamp2 = timestamp1 + 1
        elif len(timerange) == 2:
            timestamp1 = timerange[0]
            timestamp2 = timerange[1]

        if select_byrange:
            select_syntax = "SELECT * FROM %s WHERE Timestamp>=? AND Timestamp<? ORDER BY Timestamp" % self.name
            print "params=", timestamp1, timestamp2
            df = pd.read_sql_query(select_syntax, self.conn, params=[timestamp1, timestamp2])
        else:
            select_syntax = "SELECT * from %s ORDER BY Timestamp" % self.name
            df = pd.read_sql_query(select_syntax, self.conn, index_col=["Timestamp"])

        print select_syntax

        # c.execute(select_syntax, (timestamp1, timestamp2))
        # df = pd.DataFrame(self._c.fetchall())
        # df.columns = c.keys()
        # df.set_index(['Timestamp'])

        # df = pd.read_sql_query(select_syntax, self.conn, params=[timestamp1, timestamp2], index_col=['Timestamp'])

        return df
Example No. 30
    def get_publish_articles(self):

        t1 = time.time()
        print 'begin query...'
        #sql = 'select distinct user_id from %s where user_id not in (select distinct user_id from %s)' % (big_v_table_mysql, archive_table_mysql)
        #df = pd.read_sql_query(sql, engine)
        #user_ids = df['user_id'].get_values()
        sql1 = 'select distinct user_id from %s where fans_count > 1000 and fans_count < 10001 ' % (big_v_table_mysql)
        sql2 = 'select distinct user_id from %s' % archive_table_mysql
        df1 = pd.read_sql_query(sql1, engine)
        df2 = pd.read_sql_query(sql2, engine)
        user_ids1 = df1['user_id'].get_values()
        user_ids2 = df2['user_id'].get_values()
        user_ids = [id for id in set(user_ids1).difference(user_ids2)]
        t2 = time.time()
        print 'query mysql by join cost:', t2-t1, 's'

        for user_id in user_ids:
            try:
                self.get_publish_articles_by_id(user_id)
            except Exception, e:
                se = Series([user_id, GetNowTime(), str(e)], index=['user_id', 'fail_time', 'fail_reason'])
                df = DataFrame(se).T
                df.to_sql(unfinish_arcticle_table_mysql, engine, if_exists='append', index=False)
                print e
Example No. 31
 def find_companies_by_name(self, name):
     query = "SELECT FullNameRu, Founders FROM Minjust2018 WHERE FullNameRu LIKE '%" + name + "%' LIMIT 0, 20"
     df = pd.read_sql_query(query, self.db_connection)
     return df
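# A sketch (not from the original source) of the same LIKE search with a bound parameter
# instead of string concatenation; it assumes a DB-API connection that accepts qmark ("?")
# placeholders (e.g. sqlite3); other drivers use a different placeholder style.
import pandas as pd

def find_companies_by_name_param(db_connection, name):
    query = ("SELECT FullNameRu, Founders FROM Minjust2018 "
             "WHERE FullNameRu LIKE ? LIMIT 0, 20")
    return pd.read_sql_query(query, db_connection, params=("%" + name + "%",))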
Example No. 32
 def find_companies_by_founder(self, founder, nosearch):
     df = pd.read_sql_query(
         "SELECT FullNameRu FROM Minjust2018 WHERE Founders LIKE '%" +
         founder + "%' AND FullNameRu <> '" + nosearch + "' LIMIT 0, 10",
         self.db_connection)
     return df.values.tolist()
Example No. 33
from scipy.stats import skew
import pgeocode


# In[102]:


from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler


# In[103]:


db = sqlite3.connect('home_sales.db')
df = pd.read_sql_query('SELECT * FROM sales;',db)


# In[104]:


# dropping null values
df = df.dropna()


# In[105]:


# feature engineering place_name from zipcode
nomi = pgeocode.Nominatim('us')
for index, row in df.iterrows():
Example No. 34
def build_ensembl_genes(cursor, conn):
    '''queries the MySQL public ensembl database and outputs a gene lookup object
    in JSON format. It also injects into our sqlite database just so that we can
    do the processing directly there.
    '''

    #connect to Ensembl MySQL public server
    core = create_engine(
        'mysql+mysqldb://[email protected]/homo_sapiens_core_92_38'
    )

    q = """
    select
    et.exon_id,
    et.transcript_id,
    g.stable_id as gene_id,
    g.description,
    r.name as chr,
    g.seq_region_start as start,
    g.seq_region_end as end,
    e.seq_region_start as exon_start,
    e.seq_region_end as exon_end,
    t.seq_region_strand as fwdstrand
    from exon_transcript et, exon e, gene g, transcript t, seq_region r
    where
    g.canonical_transcript_id = et.transcript_id and
    g.seq_region_id = r.seq_region_id and
    r.coord_system_id = 4 and
    r.name NOT RLIKE 'CHR' and
    et.transcript_id = t.transcript_id and
    e.exon_id =et.exon_id
    """

    start_time = time.time()

    df = pd.read_sql_query(q, core, index_col='exon_id')
    df['exons'] = list(zip(df.exon_start, df.exon_end))
    df['fwdstrand'] = df['fwdstrand'].map({1: True, -1: False})
    df['tss'] = df.apply(lambda row: row['start']
                         if row['fwdstrand'] else row['end'],
                         axis=1)
    keepcols = [
        'gene_id', 'description', 'tss', 'chr', 'start', 'end', 'fwdstrand'
    ]
    genes = pd.DataFrame(
        df.groupby(keepcols)['exons'].apply(list)).reset_index()
    genes.set_index('gene_id', inplace=True)
    print(genes['chr'].value_counts())
    genes.to_json(OUTGENENAME, orient='index')

    print("--- Genes table completed in %s seconds ---" %
          (time.time() - start_time))
    genes.loc[:, ('chr', 'start', 'end')].to_sql('gene',
                                                 conn,
                                                 if_exists='replace')

    # add indices
    try:
        cursor.execute('''
        CREATE INDEX ix_gene_gene_id ON gene (gene_id);
        ''')
    except sqlite3.OperationalError as operror:
        print(operror)
        pass
Example No. 35
 def get_user_buglist(self):
     # defect type 'user': bugs submitted from the operations side
     zentaodb = self.zentaodb
     bugsql = "select id,title from zt_bug where status='active' and type='user'"
     buglist = pd.read_sql_query(bugsql, zentaodb)
     return buglist
Example No. 36
    def get_data2(self, query):

        cnx = sqlite3.connect(self.db)
        data = pd.read_sql_query(query, cnx)
        data['fecha'] = pd.to_datetime(data['fecha'], format="%d/%m/%Y")
        return data
Example No. 37
from sklearn.ensemble import GradientBoostingRegressor
import skopt
import pickle
from opioid_functions import *
import os

os.chdir('/Users/zach.olivier/Desktop/GTX/CSE_6242/course_project')

# define the file
sqlite_file = 'DVADB/DVADB.db'

# open a connection
conn = sqlite3.connect(sqlite_file)

# read from the main table
df = pd.read_sql_query("SELECT * FROM npi_summary", conn)

# take a sample for analysis / modeling
df_model = df.sample(frac=.3)

# quick summary
print(f'dataframe dimensions: {df_model.shape}')
print(f'column names: {df_model.columns}')
print(f' column types: {df_model.dtypes}')

# set index to npi
df_model = df_model.set_index('npi')

# columns that we cannot use for modeling
drop_cols = [
    'nppes_provider_last_org_name',
Example No. 38
print("\n", "="*50, "\n", sep="")

'''
Create a sql db from adult dataset and name it sqladb
'''
sqladb = db.connect("./adult_data.db")

cursor = sqladb.cursor()
df.to_sql("adult_data", sqladb, if_exists="replace", index=False)

'''
1. Select 10 records from the adult sqladb
'''
print("Select 10 records\n")
query = "SELECT * FROM adult_data LIMIT 10"
print(pd.read_sql_query(query, sqladb))
print("\n", "="*50, "\n", sep="")

'''
2. Show me the average hours per week of all men who are working in private sector
'''
print("Show me the average hours per week of all men who are working in private sector\n")
query = "SELECT AVG(hours_per_week) AS average_hours_per_week FROM adult_data WHERE sex = 'Male' AND workclass = 'Private'"
print(pd.read_sql_query(query, sqladb))
print("\n", "="*50, "\n", sep="")

'''
3. Show me the frequency table for education, occupation and relationship, separately
'''
print("Show me the frequency table for education\n")
query = "SELECT  education, COUNT(education) AS frequency FROM adult_data GROUP BY education ORDER BY frequency DESC"
Example No. 39
import pandas as pd
import sqlite3 as sql

# directory where the sqlite3 database is installed
SQLiteDir = '/Users/griceldacalzada/Documents/Python/TestCarrefoursFeux/SQlite'
# connect to the Hermes sqlite3 database
conn = sql.connect(SQLiteDir + '/Hermes2018.db')
cur = conn.cursor()
# list of b3s tables in the database
B3S_DF = pd.read_sql_query(
    "SELECT name FROM sqlite_master WHERE type='table';", conn)
B3S_list = list(B3S_DF['name'])

for b3s in B3S_list:

    sql = "DELETE FROM {b3s} WHERE Jour NOT BETWEEN '2018-01-01' AND '2019-01-01'; ".format(
        b3s=b3s)
    cur.execute(sql)
    print(sql)

    #cur.execute("SELECT * FROM {b3s} ORDER BY Jour ASC;".format(b3s=b3s))

    conn.commit()

cur.execute("VACUUM;")

conn.close()
Example No. 40
def to_sql(
    df: pd.DataFrame,
    table_name: str,
    creds: SqlCreds,
    sql_type: str = "table",
    schema: str = "dbo",
    index: bool = True,
    if_exists: str = "fail",
    batch_size: int = None,
    debug: bool = False,
    bcp_path: str = None,
):
    """
    Writes the pandas DataFrame to a SQL table or view.

    Will write all columns to the table or view. If the destination table/view doesn't exist, will create it.
    Assumes the SQL table/view has the same number, name, and type of columns.
    To only write parts of the DataFrame, filter it beforehand and pass that to this function.
    Unlike the pandas counterpart, if the DataFrame has no rows, nothing will happen.

    Parameters
    ----------
    df : pandas.DataFrame
    table_name : str
        Name of SQL table or view, without the schema
    creds : bcpandas.SqlCreds
        The credentials used in the SQL database.
    sql_type : {'table'}, can only be 'table'
        The type of SQL object of the destination.
    schema : str, default 'dbo'
        The SQL schema.
    index : bool, default True
        Write DataFrame index as a column. Uses the index name as the column
        name in the table.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the table already exists.
        * fail: Raise a BCPandasValueError.
        * replace: Drop the table before inserting new values.
        * append: Insert new values to the existing table. Matches the dataframe columns to the database columns by name.
            If the database table exists then the dataframe cannot have new columns that aren't in the table, 
            but conversely table columns can be missing from the dataframe.
    batch_size : int, optional
        Rows will be written in batches of this size at a time. By default, BCP sets this to 1000.
    debug : bool, default False
        If True, will not delete the temporary CSV and format files, and will output their location.
    bcp_path : str, default None
        The full path to the BCP utility, useful if it is not in the PATH environment variable
    """
    # validation
    if df.shape[0] == 0 or df.shape[1] == 0:
        return
    assert sql_type == TABLE, "only supporting table, not view, for now"
    assert if_exists in IF_EXISTS_OPTIONS

    if df.columns.has_duplicates:
        raise BCPandasValueError(
            "Columns with duplicate names detected, SQL requires that column names be unique. "
            f"Duplicates: {df.columns[df.columns.duplicated(keep=False)]}")

    # TODO diff way to implement? could be big performance hit with big dataframe
    if index:
        df = df.copy(deep=True).reset_index()

    delim = get_delimiter(df)
    quotechar = get_quotechar(df)

    if batch_size is not None:
        if batch_size == 0:
            raise BCPandasValueError("Param batch_size can't be 0")
        if batch_size > df.shape[0]:
            raise BCPandasValueError(
                "Param batch_size can't be larger than the number of rows in the DataFrame"
            )

    # save to temp path
    csv_file_path = get_temp_file()
    # replace bools with 1 or 0, this is what pandas native does when writing to SQL Server
    df.replace({
        True: 1,
        False: 0
    }).to_csv(
        path_or_buf=csv_file_path,
        sep=delim,
        header=False,
        index=False,  # already set as new col earlier if index=True
        quoting=csv.QUOTE_MINIMAL,  # pandas default
        quotechar=quotechar,
        line_terminator=NEWLINE,
        doublequote=True,
        escapechar=None,  # not needed, as using doublequote
    )
    logger.debug(f"Saved dataframe to temp CSV file at {csv_file_path}")

    # build format file
    fmt_file_path = get_temp_file()

    sql_item_exists = _sql_item_exists(sql_type=sql_type,
                                       schema=schema,
                                       table_name=table_name,
                                       creds=creds)
    cols_dict = None  # for mypy
    if if_exists == "append":
        # get dict of column names -> order of column
        cols_dict = dict(
            pd.read_sql_query(
                """
                SELECT COLUMN_NAME, ORDINAL_POSITION 
                FROM INFORMATION_SCHEMA.COLUMNS 
                WHERE TABLE_SCHEMA = '{_schema}'
                AND TABLE_NAME = '{_tbl}'
            """.format(_schema=schema, _tbl=table_name),
                creds.engine,
            ).values)

        # check that column names match in db and dataframe exactly
        if sql_item_exists:
            # the db cols are always strings, unlike df cols
            extra_cols = [
                str(x) for x in df.columns if str(x) not in cols_dict.keys()
            ]
            if extra_cols:
                raise BCPandasValueError(
                    f"Column(s) detected in the dataframe that are not in the database, "
                    f"cannot have new columns if `if_exists=='append'`, "
                    f"the extra column(s): {extra_cols}")

    fmt_file_txt = build_format_file(df=df,
                                     delimiter=delim,
                                     db_cols_order=cols_dict)
    with open(fmt_file_path, "w") as ff:
        ff.write(fmt_file_txt)
    logger.debug(f"Created BCP format file at {fmt_file_path}")

    try:
        if if_exists == "fail":
            if sql_item_exists:
                raise BCPandasValueError(
                    f"The {sql_type} called {schema}.{table_name} already exists, "
                    f"`if_exists` param was set to `fail`.")
            else:
                _create_table(schema=schema,
                              table_name=table_name,
                              creds=creds,
                              df=df,
                              if_exists=if_exists)
        elif if_exists == "replace":
            _create_table(schema=schema,
                          table_name=table_name,
                          creds=creds,
                          df=df,
                          if_exists=if_exists)
        elif if_exists == "append":
            if not sql_item_exists:
                _create_table(schema=schema,
                              table_name=table_name,
                              creds=creds,
                              df=df,
                              if_exists=if_exists)

        # BCP the data in
        bcp(
            sql_item=table_name,
            direction=IN,
            flat_file=csv_file_path,
            format_file_path=fmt_file_path,
            creds=creds,
            sql_type=sql_type,
            schema=schema,
            batch_size=batch_size,
            bcp_path=bcp_path,
        )
    finally:
        if not debug:
            logger.debug(f"Deleting temp CSV and format files")
            os.remove(csv_file_path)
            os.remove(fmt_file_path)
        else:
            logger.debug(
                f"`to_sql` DEBUG mode, not deleting the files. CSV file is at "
                f"{csv_file_path}, format file is at {fmt_file_path}")
Example No. 41
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# ### Import Dataset

# In[ ]:

conn = pymysql.connect(host='kpmg-server.mysql.database.azure.com',
                       port=int(3306),
                       user='******',
                       passwd='5527563Aas@',
                       db='imap',
                       charset='utf8mb4')

df = pd.read_sql_query("SELECT * FROM imap.data", conn)

# In[4]:

# Convert body to list
data = df.content.values.tolist()

# ### Tokenize words and Clean-up text

# In[5]:


def sent_to_words(sentences):  # tokenize each sentence into words
    for sentence in sentences:
        yield (gensim.utils.simple_preprocess(str(sentence), deacc=True)
               )  # deacc=True removes punctuations
    'POSTGRES_PORT': 5432,  # change to your port
    'POSTGRES_USERNAME': '******',  # change to your username
    'POSTGRES_PASSWORD': '******',  # change to your password
    'POSTGRES_DBNAME': 'test_surfers_bible_db'
}  # change to your db name

# create connection and cursor
conn = ps.connect(host=credentials['POSTGRES_ADDRESS'],
                  database=credentials['POSTGRES_DBNAME'],
                  user=credentials['POSTGRES_USERNAME'],
                  password=credentials['POSTGRES_PASSWORD'],
                  port=credentials['POSTGRES_PORT'])

cur = conn.cursor()

beach_df = pd.read_sql_query("SELECT * FROM BEACH_TABLE;", conn)

conn.close()
cur.close()

# beach_df.head()

# # API KEYS:

# In[27]:

SG_API_KEY_DICT = {}
SG_API_KEY_DICT[
    '1'] = "8aab844c-8cfd-11ea-9f57-0242ac130002-8aab8500-8cfd-11ea-9f57-0242ac130002"
SG_API_KEY_DICT[
    '2'] = "ca3fa016-8cfd-11ea-ad84-0242ac130002-ca3fa0c0-8cfd-11ea-ad84-0242ac130002"
#                         password="******",
#                         host="ec2-54-227-241-179.compute-1.amazonaws.com",
#                         port="5432",
#                         database="d46q2igt2d4vbg",
#                         sslmode="require")

if (conn):
    logger.info("Connection Successful!")
else:
    logger.info("Connection Error!")

logger.info("Get all the Community Partners from the Database")

# Get all the Community Partners from the database
dfCommunity = pd.read_sql_query(
    "SELECT pc.name as Community_Partner,pc.address_line1, pc.address_line2, pc.city, pc.state,pc.zip, hm.mission_name ,p.mission_type, pc.legislative_district,pc.median_household_income, pc2.community_type,pc.website_url FROM partners_communitypartner PC join partners_communitypartnermission p on PC.id = p.community_partner_id join home_missionarea hm on p.mission_area_id = hm.id join partners_communitytype pc2 on PC.community_type_id = pc2.id",
    con=conn)
if len(dfCommunity) == 0:
    logger.critical("No Community Partners fetched from the Database on " + str(currentDT))
else:
    logger.info(repr(len(dfCommunity)) + "Community Partners are in the Database on " + str(currentDT))

# Get all the Projects from the database and get their Campus Partners , Community Partners associated
dfProjects = pd.read_sql_query(
    "SELECT  project_name,academic_year , pc2.name as campus_partner ,um.college_name,ppcp.name as community_partner FROM projects_project P join projects_academicyear pa on P.academic_year_id = pa.id join projects_projectcampuspartner pc on P.id = pc.project_name_id join projects_projectcommunitypartner ppc on P.id = ppc.project_name_id join partners_communitypartner ppcp on ppc.community_partner_id = ppcp.id join partners_campuspartner pc2 on  pc.campus_partner_id= pc2.id join university_college um on um.id = pc2.college_name_id WHERE p.id IN (SELECT project_name_id FROM projects_projectcommunitypartner)",
    con=conn)
if len(dfProjects) == 0:
    logger.critical("No Projects are fetched from the Database as of " + str(currentDT))
else:
    logger.info(repr(len(dfProjects)) + "Projects are in the Database as of " + str(currentDT))
conn.close()
Example No. 44
def read_data():
    conn = db_connection()
    try:
        return pd.read_sql_query("SELECT * FROM reports", conn)
    except Exception as e:
        return "No table with that name found, Error: {}".format(e)
Example No. 45
def get_data():
    cnx = sqlite3.connect('Project2')
    w2 = pd.read_sql_query("SELECT * FROM hawaii", cnx)
    res = w2.to_json(orient='table')
    return res
Exemplo n.º 46
0
 def sample_range_date_time_table(self, club, start, end, step=0):
     self.open_connect()
     query = "SELECT * FROM club_tab WHERE (club = ?) AND (data_time BETWEEN ? AND ?)"
     return pd.read_sql_query(query,
                              self.connect,
                              params=(club, start, end))
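A hypothetical call of the method above (the object name, club value and date-time format are assumptions for illustration only):

# Sketch: fetch one club's rows for May 2020; the ? placeholders above match
# sqlite3-style DB-API drivers, so plain strings can be passed as params.
df = sampler.sample_range_date_time_table(club='surf_club_01',
                                          start='2020-05-01 00:00:00',
                                          end='2020-05-31 23:59:59')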
Exemplo n.º 47
0
    else:
        print('Not CAT4 or DRT')
    return result


#load data from postgres

#postgres_str='postgres://*****:*****@46.101.58.30:5432/academic_tracker_production'

#product

postgres_str = 'postgres://*****:*****@ec2-18-203-229-185.eu-west-1.compute.amazonaws.com:5432/d9k69uia8l4iu'

cnx = create_engine(postgres_str)

df_schools = pd.read_sql_query('''SELECT id, name FROM schools''', cnx)
df_schools_ = df_schools[(df_schools['name'].str.contains("Test") == False)
                         & (df_schools['name'].str.contains("TEST") == False)]
#df_schools_=df_schools_[df_schools_['name'].str.contains("TEST*")==False]

#df2=pd.read_sql_query('''SELECT id, student_id, academical_year, calendar_year FROM student_years''', cnx)

#df1=pd.read_sql_query('''SELECT id, subject, student_year_id, score, title, examined_at,
#       expectation FROM results''', cnx)

###
###df1=part('0', '50000')
###df1=df1.rename(columns = {'id': 'idu'})
#df=pd.read_sql_query('''SELECT results.id, results.subject, results.score, results.expectation, student_years.student_id,  student_years.academical_year, student_years.calendar_year FROM results INNER JOIN student_years ON results.student_year_id = student_years.id ORDER BY results.id''', cnx)
#df3=pd.read_sql_query('''SELECT id, `, gender, year_of_entry, name FROM students''', cnx)
#displaying map
map


# # Fetching Data for Count of Applicants by City

# In[3]:


import connectors
import pandas as pd

conn = connectors.db_conn()
cur = conn.cursor()
sql_query = "select count(apid) as applicants , case when current_state = 'DELHI' then 'Delhi' else current_city end as region from applicants group by 2 order by 1 desc"
db_data = pd.read_sql_query(sql_query, conn)
print(db_data)
## Always close the connection
conn.close()


# # Plotting fetched data on India Map and applying visualisation effects

# In[4]:


folium.Choropleth(geo_data= 'India.geojson', #loading geojson file uploaded
             data=db_data, # my dataset
             columns=['region', 'applicants'], # region is here for matching the geojson regions, applicants is the column that changes the color of regions
             key_on= 'feature.properties.NAME_2', # this path contains region in str type, this region should match with our region column
             fill_color='BuPu', 
Exemplo n.º 49
0
geo_box = (18.005611, 48.987386, -124.626080, -62.361014)

# connect to server
engine = sqlalchemy.create_engine('mysql://%(user)s:%(pass)s@%(host)s' %
                                  config.database)
engine.execute('use %s' % config.database['name'])  # select db
recent_data = (datetime.now() - timedelta(weeks=12)).strftime("%Y-%m-%d")
sql_query = '''SELECT post_date, latitude, longitude, image_url, likes, caption, post_url
			FROM instagram
			WHERE post_date > '%s'
			AND latitude between %s AND %s
			AND longitude between %s AND %s
			ORDER BY post_date DESC, likes DESC
			''' % (recent_data, geo_box[0], geo_box[1], geo_box[2], geo_box[3])

posts = pd.read_sql_query(sql_query, engine, parse_dates=['post_date'])
n_points = posts.shape[0]

posts = posts[posts['caption'].notnull()]
posts = posts.reset_index(drop=True)
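The query above splices the date and bounding-box values straight into the SQL string; a parameterized variant of the same fetch (a sketch, not the original code) keeps the values out of the statement text:

# Sketch: same instagram query with bound parameters instead of string formatting.
from sqlalchemy import text

param_query = text("""SELECT post_date, latitude, longitude, image_url, likes, caption, post_url
                      FROM instagram
                      WHERE post_date > :recent
                        AND latitude BETWEEN :lat_min AND :lat_max
                        AND longitude BETWEEN :lon_min AND :lon_max
                      ORDER BY post_date DESC, likes DESC""")
posts = pd.read_sql_query(param_query, engine,
                          params={"recent": recent_data,
                                  "lat_min": geo_box[0], "lat_max": geo_box[1],
                                  "lon_min": geo_box[2], "lon_max": geo_box[3]},
                          parse_dates=["post_date"])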

sentences = []  # Initialize an empty list of sentences

print "Parsing sentences from training set"
for caption in posts['caption']:
    sentences += caption_to_sentences(caption, tokenizer)

# Set values for various parameters
num_features = 400  # Word vector dimensionality
min_word_count = 30  # Minimum word count
num_workers = 4  # Number of threads to run in parallel
Exemplo n.º 50
0
        print statement indicating successful write to db
    '''
    engine = create_engine(
        'postgres://*****:*****@localhost:5432/bloodmoneydb')
    df.to_sql('model_input_tbl_raw', engine,
              if_exists='fail')  # if need to recreate change this to 'replace'

    return 'Data successfully written to database'


if __name__ == "__main__":
    conn = psycopg2.connect(host="localhost",
                            database="bloodmoneydb",
                            user="******",
                            password="******")

    sql_query = """
        SELECT *
        FROM joined_fight_event_fighters_data
    """

    df = pd.read_sql_query(sql_query, con=conn)
    df = calculate_age_of_fighter(df=df)
    df = fighter_home_court(df)
    df = calculate_pct_of_possible_rounds_fought(df=df)
    df = calculate_win_streak(df=df)
    df = transform_to_wide_by_fight(df)

    write_data_to_tbl(df)

    print('Success')
Exemplo n.º 51
0
def power_ice(conn, start, end):
    #query data from database
    start_str = str(Time(start).mjd)
    end_str = str(Time(end).mjd)

    sql_c = "SELECT * FROM SE_ZIMIRICEA_IDLE WHERE start_time BETWEEN "+start_str+" AND "+end_str+" ORDER BY start_time"
    _idle = pd.read_sql_query(sql_c, conn)
    sql_c = "SELECT * FROM SE_ZIMIRICEA_HV_ON WHERE start_time BETWEEN "+start_str+" AND "+end_str+" ORDER BY start_time"
    _hv = pd.read_sql_query(sql_c, conn)

    voltage = 30
    _idle['average'] *= voltage
    _hv['average'] *= voltage

    _idle['start_time'] = pd.to_datetime( Time(_idle['start_time'], format = "mjd").datetime )
    _hv['start_time'] = pd.to_datetime( Time(_hv['start_time'], format = "mjd").datetime )

    #set column data source
    idle = ColumnDataSource(_idle)
    hv = ColumnDataSource(_hv)

    # create a new plot with a title and axis labels
    p = figure( tools = "pan,wheel_zoom,box_zoom,reset,save",       \
                toolbar_location = "above",                         \
                plot_width = 1120,                                  \
                plot_height = 500,                                  \
                y_range = [5,14],                                   \
                x_axis_type = 'datetime',                           \
                output_backend = "webgl",                           \
                x_axis_label = 'Date', y_axis_label='Power (W)')

    p.grid.visible = True
    p.title.text = "POWER ICE"
    pf.add_basic_layout(p)
    pf.add_limit_box(p, 6, 8, alpha = 0.1, color = "green")


    # add a line renderer with legend and line thickness
    scat1=p.scatter(x = "start_time", y = "average", color = 'orange', legend = "Power idle", source = idle)
    scat2=p.scatter(x = "start_time", y = "average", color = 'red', legend = "Power hv on", source = hv)
    p.line(x = "start_time", y = "average", color = 'orange', legend = "Power idle", source = idle)
    p.line(x = "start_time", y = "average", color = 'red', legend = "Power hv on", source = hv)

    #generate error bars
    err_xs_hv = []
    err_ys_hv = []
    err_xs_idle = []
    err_ys_idle = []

    for index, item in _hv.iterrows():
        err_xs_hv.append((item['start_time'],item['start_time']))
        err_ys_hv.append((item['average'] - item['deviation'], item['average'] + item['deviation']))

    for index, item in _idle.iterrows():
        err_xs_idle.append((item['start_time'],item['start_time']))
        err_ys_idle.append((item['average'] - item['deviation'], item['average'] + item['deviation']))
    # plot them
    p.multi_line(err_xs_hv, err_ys_hv, color='red', legend='Power hv on')
    p.multi_line(err_xs_idle, err_ys_idle, color='orange', legend='Power idle')

    #activate HoverTool for scatter plot
    hover_tool = HoverTool( tooltips =
    [
        ('count', '@data_points'),
        ('mean', '@average'),
        ('deviation', '@deviation'),

    ], mode='mouse', renderers=[scat1,scat2])

    p.tools.append(hover_tool)

    p.legend.location = "bottom_right"
    p.legend.click_policy = "hide"

    return p
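A hypothetical caller (the sqlite file name and the date range are assumptions) could render the returned figure with Bokeh:

# Sketch: build and display the ICE power plot; 'miri.db' is a placeholder database path.
import sqlite3
from bokeh.plotting import output_file, show

conn = sqlite3.connect('miri.db')
output_file('power_ice.html')
show(power_ice(conn, '2019-01-01', '2019-02-01'))
conn.close()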
Exemplo n.º 52
0
def exec_query_file(path: str):
    conn = open_db()
    query = import_query(path)
    return pd.read_sql_query(query, conn)
Exemplo n.º 53
0
#!/usr/bin/env python

import sqlite3
import pandas as pd

move_from = sqlite3.connect("/u/home/c/cloeffle/scratch/sql/bacteria_data.db")

move_to = sqlite3.connect("/u/home/c/cloeffle/scratch/sql/new_bacteria_stats.db")
cur = move_to.cursor()

frame = pd.read_sql_query("SELECT FILENAME, FILEPATH, chromosome_count, avg_length_chromosomes, max_length_chromosomes, min_length_chromosomes, contig_count, avg_length_contig, max_length_contig, min_length_contig, plasmid_count, avg_length_plasmids, max_length_plasmids, min_length_plasmids FROM SPECIESDB WHERE DBNAME='ENSEMBL'", move_from)

for index, ROW in frame.iterrows(): 
	cur.execute ("INSERT INTO SPECIESDB (FILENAME, FILEPATH, chromosome_count, avg_length_chromosomes, max_length_chromosomes, min_length_chromosomes, contig_count, avg_length_contig, max_length_contig, min_length_contig, plasmid_count, avg_length_plasmids, max_length_plasmids, min_length_plasmids, DBNAME) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (ROW["FILENAME"], ROW["FILEPATH"], ROW["chromosome_count"], ROW["avg_length_chromosomes"], ROW["max_length_chromosomes"], ROW["min_length_chromosomes"], ROW["contig_count"], ROW["avg_length_contig"], ROW["max_length_contig"], ROW["min_length_contig"], ROW["plasmid_count"], ROW["avg_length_plasmids"], ROW["max_length_plasmids"], ROW["min_length_plasmids"], "ENSEMBL"))
	move_to.commit()

move_from.close()
cur.close()
move_to.close()
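For reference, the row-by-row INSERT loop above can usually be replaced by a single pandas call executed before the connections are closed (a sketch, assuming the target SPECIESDB table accepts these column names):

# Sketch: add the constant DBNAME column, then append the whole frame at once.
frame["DBNAME"] = "ENSEMBL"
frame.to_sql("SPECIESDB", move_to, if_exists="append", index=False)
move_to.commit()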
Exemplo n.º 54
0
 def query_events(self, task, sql):
     sql_filename = self.convert_lhe(task)
     return pd.read_sql_query(sql, 'sqlite:///' + sql_filename)
Exemplo n.º 55
0
# -*- coding: utf-8 -*-
"""
Created on Sun Jan  3 10:32:44 2016

@author: wayne
"""

import pandas as pd
import sqlite3

# Read sqlite query results into a pandas DataFrame
conn = sqlite3.connect("babysleep_test.db")
data = pd.read_sql_query("SELECT * from sleep_data", conn)

# verify that result of SQL query is stored in the dataframe
print data.head()

conn.close()
Exemplo n.º 56
0
import pandas as pd
import sqlite3
import json
from collections import defaultdict

RASA_DB_FILE = "rasa.db"
RASA_ANA_FILE = "cleanRasa.db"
# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect(RASA_DB_FILE)
conversation_data = pd.read_sql_query("SELECT data FROM conversation_event",
                                      con)

# Divide the conversation into sessions
# inactivity period in the units of seconds
inactivity_period_allowed = 1 * 60
old_sender_id = None
old_timestamp = 0

# extract conversation text and entities information from the database
conversation_text = pd.DataFrame(columns=('sender_id', 'session_id', 'event',
                                          'timestamp', 'text', 'entities'))
for conversation in conversation_data['data']:
    if 'text' in conversation:
        res = json.loads(conversation)
        sender_id = res['sender_id']
        event = res['event']
        timestamp = res['timestamp']
        text = res['text']
        entities = defaultdict(list)
        if 'parse_data' in res:
            for i in res['parse_data']['entities']:
                entities[i['entity']].append(i['value'])
Exemplo n.º 57
0
# Import packages
from sqlalchemy import create_engine
import pandas as pd

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Execute query and store records in DataFrame: df
df = pd.read_sql_query('SELECT * FROM Employee WHERE EmployeeId >= 6 ORDER BY BirthDate;', engine)

# Print head of DataFrame
print(df.head())
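For result sets too large to hold in memory at once, the same call accepts a chunksize argument and then yields DataFrames instead of returning one (a sketch against the same Chinook engine; the chunk size is arbitrary):

# Sketch: stream the employees 1000 rows at a time and process each chunk.
for chunk in pd.read_sql_query('SELECT * FROM Employee ORDER BY BirthDate;',
                               engine, chunksize=1000):
    print(chunk.shape)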
Exemplo n.º 58
0
def names():
    """Return a list of sample names."""
    stmt = belly_db.session.query(Samples).statement
    df = pd.read_sql_query(stmt, belly_db.session.bind)
    return jsonify(list(df.columns)[2:])
Exemplo n.º 59
0
    def export_case(self, save_path, case_name, max_tiles=None):
        main_df = self.format_df()
        main_name = os.path.join(save_path, case_name + "-info.csv")
        main_df.to_csv(main_name)

        if max_tiles is None:
            max_tiles = self.get_dimensions()

        tile_count_query = """SELECT COUNT(NULLIF(pap_area,0)) as pap_count,
                                COUNT(NULLIF(den_area,0)) as den_count,
                                COUNT(NULLIF(hy_area,0)) as hy_count,
                                COUNT(NULLIF(min_area,0)) as min_count
                            FROM master
                            GROUP BY tile_id
                            """
        impacted_count_query = """SELECT COUNT(a.tag) as imp_count
                                    FROM master AS a, impacted AS b
                                    WHERE a.tag = b.tag
                                    GROUP BY a.tile_id
                                """

        count_df = pd.read_sql_query(tile_count_query, self.conn)
        imp_df = pd.read_sql_query(impacted_count_query, self.conn)
        imp_count = imp_df["imp_count"]
        count_df.insert(4, "imp_count", imp_count)
        count_name = os.path.join(save_path, case_name + "-counts.csv")
        count_df.to_csv(count_name)

        impacted_tag_query = """SELECT tag
                                FROM impacted"""

        impacted_df = pd.read_sql_query(impacted_tag_query, self.conn)
        impacted_name = os.path.join(save_path,
                                     case_name + "-impacted-tags.csv")
        impacted_df.to_csv(impacted_name)

        total_count_query = """SELECT COUNT(NULLIF(pap_area,0)) as total_pap,
                                COUNT(NULLIF(den_area,0)) as total_den,
                                COUNT(NULLIF(hy_area,0)) as total_hy,
                                COUNT(NULLIF(min_area,0)) as total_min,
                                COUNT(NULLIF(den_area,0)) * 1.0 / COUNT(NULLIF(pap_area,0)) * 100 AS den_count_perc,
                                COUNT(NULLIF(hy_area,0)) * 1.0 / COUNT(NULLIF(pap_area,0)) * 100 AS hy_count_perc,
                                COUNT(NULLIF(min_area,0)) * 1.0/ COUNT(NULLIF(pap_area,0)) * 100 AS min_count_perc
                            FROM master
                            """

        total_impacted = len(self.pull_impacted_tags())
        total_pap = self.get_counts()[0]
        impacted_perc = round((total_impacted / total_pap) * 100,
                              constants.perc_digits)

        count_df = pd.read_sql_query(total_count_query, self.conn)
        count_df.insert(4, "total_imp", total_impacted)
        count_df.insert(8, "imp_count_perc", impacted_perc)
        count_name = os.path.join(save_path,
                                  case_name + "-count-percentages.csv")
        count_df.to_csv(count_name)

        for fib_type in constants.fib_types:
            points_query = f"""SELECT {fib_type}.*,
                                master.tile_id,
                                tiles.real_tile
                                FROM {fib_type}
                                INNER JOIN master on master.tag = {fib_type}.tag
                                INNER JOIN tiles on tiles.rel_tile = master.tile_id
                                """

            pt_df = pd.read_sql_query(points_query, self.conn)
            # account for padding of the image in tkinter
            pt_df["real_x"] = (pt_df["rel_x"] * constants.zoom_multiplier
                               ) - constants.padding_size
            pt_df["real_y"] = (pt_df["rel_y"] * constants.zoom_multiplier
                               ) - constants.padding_size

            # convert to overall coordinates
            pt_df["real_x"] = pt_df["real_x"] + (
                (pt_df["real_tile"] % max_tiles[0]) * constants.tile_size)
            pt_df["real_y"] = pt_df["real_y"] + (
                (pt_df["real_tile"] // max_tiles[0]) * constants.tile_size)

            pt_df["rel_x"] = pt_df["rel_x"] - (
                constants.padding_size // constants.zoom_multiplier
            )  #set rel coords 0,0 to top left of tile
            pt_df["rel_y"] = pt_df["rel_y"] - (constants.padding_size //
                                               constants.zoom_multiplier)

            points_name = os.path.join(save_path,
                                       case_name + f"-{fib_type}-pts.csv")
            pt_df.to_csv(points_name)
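A hypothetical call of export_case() (the instance name, save path and case name are assumptions for illustration):

# Sketch only: write all per-case CSVs for one case; 'case_db' stands in for
# an instance of the class that owns export_case() and self.conn.
case_db.export_case(save_path="/tmp/exports", case_name="case-001")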
Exemplo n.º 60
0
    con = psycopg2.connect(dbname='mimic')

    #initialize context dictionary
    context_dic= {}

    # Query mimic for notes
    notes_query = \
    """
    select n.subject_id,n.text
    from mimiciii.noteevents n
    where iserror IS NULL --this is null in mimic 1.4, rather than empty space
    and subject_id > %d
    and subject_id < %d
    ;
    """ % (min_id,max_id)
    notes = pd.read_sql_query(notes_query, con)
    text = ''
    for i,row in notes.iterrows():
        toks = tokenize(row.text)
        text += ' '.join(toks)+'\n'
        extract_context(toks,window_size,context_dic)
    with open('context_small.txt','w') as f:
        f.write(text)
    context_dictionary_filename = str(context_dictionary_name)+'.npy'
    np.save(context_dictionary_filename,context_dic) #Save context dictionary after having read all the notes


regex_punctuation  = re.compile(r"[',.\-/\n]")
regex_alphanum     = re.compile(r'[^a-zA-Z0-9_ ]')
regex_num          = re.compile(r'\d[\d ]+')
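The tokenize() helper used in the loop above is not part of this excerpt; a minimal sketch consistent with these patterns (an assumption, not the original implementation) could be:

def tokenize(text):
    # Sketch only: lowercase, turn punctuation into spaces, drop odd characters,
    # collapse digit runs to a placeholder, then split on whitespace.
    text = text.lower()
    text = regex_punctuation.sub(' ', text)
    text = regex_alphanum.sub('', text)
    text = regex_num.sub(' num ', text)
    return text.split()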