Example #1
import os
import re

# GcsConnector and format_dupe_values are helpers from the surrounding
# ISB-CGC ETL package; their import paths depend on the project layout.


def search_files(config):

    # project id
    project_id = config['project_id']
    # bucket name
    bucket_name = config['buckets']['open']

    # connect to google cloud storage
    gcs = GcsConnector(project_id, bucket_name)

    #---------------------
    # search for MAF files
    #---------------------
    maf_file = re.compile(r"^.*\.maf$")
    # search only these tumor types defined in config files
    search_tumors = ["tcga/" + d.lower() for d in config['all_tumor_types']]
    data_library = gcs.search_files(search_patterns=['.maf'],
                                    regex_search_pattern=maf_file,
                                    prefixes=search_tumors)
    data_library['basefilename'] = data_library['filename'].map(
        lambda x: os.path.splitext(os.path.basename(x))[0])
    data_library['unique_filename'] = format_dupe_values(
        data_library['basefilename'])

    return data_library
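
A minimal usage sketch, assuming a config dict with the keys the function reads; the project id, bucket name, and tumor types below are hypothetical placeholders:

config = {
    'project_id': 'my-gcp-project',          # hypothetical project id
    'buckets': {'open': 'my-open-bucket'},   # hypothetical bucket name
    'all_tumor_types': ['BRCA', 'LUAD'],     # hypothetical tumor types
}
data_library = search_files(config)
print(data_library[['filename', 'unique_filename']].head())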
Example #2
import os
import re
import sqlite3

import pandas as pd

# GcsConnector and submit_to_queue are helpers from the surrounding
# ISB-CGC ETL package; their import paths depend on the project layout.


def main(config):

    #    etl = util.DataETL("isb-cgc", "isb-cgc-open")  # this starts a new connection
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    # connect to bucket to get files
    gcs = GcsConnector(project_id, bucket_name)
    isoform_file = re.compile(r"^.*\.isoform\.quantification\.txt\.json$")
    data_library = gcs.search_files(
        search_patterns=['.isoform.quantification.txt'],
        regex_search_pattern=isoform_file,
        prefixes=['tcga/intermediary/mirna/isoform/'])
    # we are eliminating bad files - size 0; could be hg18 etc
    data_library.loc[:, 'basefilename'] = data_library['filename'].map(
        lambda x: os.path.splitext(os.path.basename(x))[0].replace(
            '.json', ''))
    data_library = data_library.query('size > 0')

    conn = sqlite3.connect('../etl-mirna-isoform.db')
    sql = 'SELECT * FROM task_queue'
    all_files_df = pd.read_sql_query(sql, conn)
    conn.close()

    with open('downloadedfiles.txt') as f:
        lines = f.read().splitlines()

    all_files_df = all_files_df[(all_files_df.DatafileName.isin(
        data_library.basefilename))]
    all_files_df = all_files_df[~(all_files_df.DatafileName.isin(lines))]
    data_library = all_files_df
    print(data_library)

    conn = sqlite3.connect('etl-isoform-download.db')
    submit_to_queue(data_library, conn, 'task_queue')
    queue_df = data_library

    # restart ETL; this gets the diff; also takes care of errors
    try:
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        print('completed:', len(queue_df2))
        queue_df = queue_df[~(
            queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey))]
        print('Not completed:', len(queue_df))
    except Exception:
        # the status table may not exist yet on a fresh run; keep the full queue
        pass
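
submit_to_queue is a project-specific helper; a minimal sketch of what it might do, assuming it simply persists the pending-download queue into the given SQLite table via pandas (the behavior here is an assumption, not the project's actual implementation):

def submit_to_queue(df, conn, table_name):
    # hypothetical sketch: store the queue so a restarted run can diff it
    # against task_queue_status
    df.to_sql(table_name, conn, if_exists='replace', index=False)
    conn.commit()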
Example #3
import os
import re
import sqlite3
import time

import pandas as pd

# GcsConnector, configure_logging, submit_to_queue, download_file, and
# process_manager are helpers from the surrounding ISB-CGC ETL package;
# their import paths depend on the project layout.


def main(config):

    log_filename = 'etl_download_isoform.log'
    log_name = 'etl_download_isoform'
    log = configure_logging(log_name, log_filename)
    log.info('begin downloading isoform files')
    #    etl = util.DataETL("isb-cgc", "isb-cgc-open") # this starts a new connection
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    # connect to bucket to get files
    gcs = GcsConnector(project_id, bucket_name)
    isoform_file = re.compile(r"^.*\.isoform\.quantification\.txt\.json$")
    data_library = gcs.search_files(
        search_patterns=['.isoform.quantification.txt'],
        regex_search_pattern=isoform_file,
        prefixes=[config['mirna_isoform_matrix']['isoform_gcs_dir']])
    # we are eliminating bad files - size 0; could be hg18 etc
    data_library.loc[:, 'basefilename'] = data_library['filename'].map(
        lambda x: os.path.splitext(os.path.basename(x))[0].replace(
            '.json', ''))
    data_library = data_library.query('size > 0')

    log.info('\tbegin selecting isoform files from the SQLite isoform db')
    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    sql = 'SELECT * FROM task_queue'
    all_files_df = pd.read_sql_query(sql, conn)
    conn.close()
    log.info('\tfinished selecting isoform files')

    log.info('\tbegin reading from downloaded files')
    with open(config['mirna_isoform_matrix']
              ['isoform_download_prev_files']) as f:
        lines = f.read().splitlines()
    log.info('\tfinished reading from downloaded files')

    log.info(
        'filter files.\n\tfiles in cloud storage: %s\n\tfiles previously marked to download: %s\n%s\n'
        % (len(data_library), len(all_files_df), data_library))
    all_files_df = all_files_df[(all_files_df.DatafileName.isin(
        data_library.basefilename))]
    all_files_df = all_files_df[~(all_files_df.DatafileName.isin(lines))]
    data_library = all_files_df
    log.info('finished filter files: %s\n%s\n' %
             (len(data_library), data_library))

    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    submit_to_queue(data_library, conn, 'task_queue', log)
    queue_df = data_library

    # restart ETL; this gets the diff; also takes care of errors
    try:
        conn = sqlite3.connect('isoform_download.db')
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        log.info('\tso far completed: %s' % len(queue_df2))
        queue_df = queue_df[~(
            queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey))]
        log.info('\tso far not completed: %s' % len(queue_df))
    except Exception:
        log.exception(
            '\n++++++++++++++++++++++\n\tproblem filtering completed jobs, ignoring\n++++++++++++++++++++++\n'
        )

    # -----------------------------------------------------
    # thread this with concurrent futures
    # -----------------------------------------------------
    log.info('\tsubmit jobs to process manager')
    pm = process_manager.ProcessManager(max_workers=200,
                                        db='isoform_download.db',
                                        table='task_queue_status',
                                        log=log)
    # iterrows() yields the DataFrame index, which has gaps after filtering,
    # so use enumerate for a reliable running count
    for count, (_, df) in enumerate(data_library.iterrows()):
        row = df.to_dict()
        if 0 == count % 512:
            time.sleep(10)
        if 0 == count % 2048:
            log.info('\t\tsubmitting %s file: %s' %
                     (count, row['DatafileName']))
        platform_dir = os.path.join(
            config['mirna_isoform_matrix']['isoform_download_dir'],
            row['Platform'])
        if not os.path.isdir(platform_dir):
            os.makedirs(platform_dir)
        outfilename = os.path.join(platform_dir, row['DatafileName'])
        pm.submit(download_file, project_id, bucket_name,
                  row['DatafileNameKey'], outfilename, '')
        time.sleep(0.2)
    log.info('\tsubmitted %s total jobs to process manager' % len(data_library))

    log.info('\tstart process manager completion check')
    pm.start()

    log.info('finished downloading isoform files')
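
The comment above mentions threading this with concurrent futures; process_manager.ProcessManager is project-specific, but a minimal stand-in on top of the standard library would look roughly like this (the class and its submit/start methods are a hypothetical sketch, not the project's implementation):

from concurrent.futures import ThreadPoolExecutor, as_completed

class SimpleProcessManager(object):
    """Hypothetical stand-in for process_manager.ProcessManager."""

    def __init__(self, max_workers, log):
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._futures = []
        self._log = log

    def submit(self, fn, *args):
        # queue one job; exceptions surface when the future is resolved
        self._futures.append(self._executor.submit(fn, *args))

    def start(self):
        # block until every submitted job finishes, logging any failures
        for future in as_completed(self._futures):
            if future.exception() is not None:
                self._log.error('job failed: %s' % future.exception())
        self._executor.shutdown()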