from subprocess import call
from lib.database.backend_db import Db

# NOTE(review): this chunk references `path` (os.path), `date`, and `timedelta`
# (datetime) without importing them here — presumably imported earlier in the
# full file; confirm before running this fragment standalone.

# Directory of this pipeline module; SQL files live alongside it.
_PIPELINE_PATH = path.dirname(path.realpath(__file__)) + '/'
_OTHER_SQL_FILE = _PIPELINE_PATH + 'other.sql'
_COUNTS_SQL_FILE = _PIPELINE_PATH + 'counts.sql'
_DUMP_SQL_FILE = _PIPELINE_PATH + 'dump.sql'

#TODO(rrayborn): Make this a relative path
# Absolute destination directory for the generated JSON/CSV artifacts.
_OUTPUT_PATH = '/var/server/server/useradvocacy/data/static_json/'
_OUTPUT_JSON_NAME = 'hello.json'
_OUTPUT_CSV_NAME = 'hello.csv.gz'

#TODO(rrayborn): make this more elegant
# Shared connection/handle to the input database (opened at import time).
_INPUT_DB = Db('input')


# NOTE(review): the default arguments below are evaluated ONCE at import time,
# so `date.today()` is frozen at module load — a long-running process will keep
# using stale dates. Consider `start_date=None` sentinels resolved inside the
# body; left unchanged here because the function body is outside this chunk.
def update(start_date=date.today() - timedelta(days=6 * 7),
           end_date=date.today() - timedelta(days=1),
           last_run_date=date.today() - timedelta(days=1),
           output_path=_OUTPUT_PATH):
    '''
    Updates the Hello files.

    Args:
        start_date (datetime): start date of data to pull, inclusive
            (default: 42 days ago)
        end_date (datetime): end date of data to pull, inclusive
            (default: 1 day ago)
        last_run_date (datetime): last date that the pipeline was run for
            (default: 1 day ago)
        output_path (str): the location where our files should be output
            (default: _OUTPUT_PATH)
    '''
    # NOTE(review): function body continues beyond this chunk — not visible here.
from subprocess import check_output
from collections import Counter

# NOTE(review): this chunk references `path` (os.path), `environ` (os), and
# `Db` without importing them here — presumably imported earlier in the full
# file; confirm before running this fragment standalone.

# Number of trailing days to re-fetch on each run, in case the upstream
# export revises recent data.
_OVERLAP = 2  # INCREASE THIS IF GOOGLE STARTS BACK POPULATING DATA

# Directory of this pipeline module; working CSVs live under ./data/.
_PIPELINE_PATH = path.dirname(path.realpath(__file__)) + '/'
_DATA_PATH = _PIPELINE_PATH + 'data/'
_TMP_CSV = _DATA_PATH + '.google_play_tmp.csv'          # scratch download target
_LATEST_CSV = _DATA_PATH + 'google_play_latest.csv'     # most recent batch
_ALL_CSV = _DATA_PATH + 'google_play_all.csv'           # accumulated history

# Google Cloud Storage glob for the monthly Play-reviews export; the %s slot
# is filled with a YYYYMM-style period string (verify against caller).
_REVIEW_FILE_PATTERN = environ['GS_PLAY_BUCKET'] + \
    '/reviews/reviews_org.mozilla.*%s.csv'

#This should be handled better...
# Shared connection/handle to the sentiment database (opened at import time).
_SENTIMENT_DB = Db('sentiment')

# NOTE(review): dead gflags scaffolding kept verbatim — consider deleting or
# reviving it rather than leaving it commented out.
#FLAGS = gflags.FLAGS
#
#gflags.DEFINE_integer('my_version', 0, 'Version number.')
#gflags.DEFINE_string('filename', None, 'Input file name', short_name='f')
#
#gflags.RegisterValidator('my_version',
#                         lambda value: value % 2 == 0,
#                         message='--my_version must be divisible by 2')
#gflags.MarkFlagAsRequired('filename')
#
#
def main():
    '''Entry point for the Google Play reviews pipeline.

    NOTE(review): body continues beyond this chunk — not visible here.
    '''