Example #1
def process_cghub(config,
                  type_uri='detail',
                  log=None,
                  removedups=False,
                  limit=-1,
                  verbose=False,
                  print_response=False):
    """
    return type:
        tumor_type2cghub_records: organizes the cghub record classes per tumor type
    """
    log_info(log, 'begin process cghub')
    module = import_module(config['cghub_module'])
    mappings = config['metadata_locations']['cghub']
    cghub_records, _ = module.main(mappings['study'],
                                   log=log,
                                   removedups=removedups,
                                   limit=limit)
    tumor_type2cghub_records = {}
    count = 0
    seen_bad_codes = set()
    for cghub_record in cghub_records:
        if 0 == count % 8192:
            log_info(log, '\tprocess %s cghub records' % (count))
        count += 1
        tumor_type2cghub_records.setdefault(
            cghub_record.disease_abbr, []).append(
                create_cghub_metadata(mappings, cghub_record, seen_bad_codes,
                                      log))
    log_info(log, 'finished process cghub: %s total records' % (count))
    return tumor_type2cghub_records
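
A minimal usage sketch of the function above; the config and log objects are assumed to be set up as in the other examples, and the 100-record limit is purely illustrative:

# hedged usage sketch; config, log and the limit value are assumptions
tumor_type2cghub_records = process_cghub(config, log=log, removedups=True, limit=100)
for tumor_type, records in tumor_type2cghub_records.items():
    log_info(log, '%s: %s cghub records' % (tumor_type, len(records)))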
Example #2
File: _mem.py  Project: johntyree/mem
    def __init__(self, mem, subdir, memfile="Memfile"):
        def set_attr(attr, val):
            object.__setattr__(self, attr, val)
        set_attr("mem", mem)
        set_attr("orig_dir", os.path.abspath(os.curdir))
        set_attr("subdir", os.path.join(self.orig_dir, subdir))
        set_attr("memfile", os.path.join(self.subdir, memfile))
        set_attr("mf", util.import_module(self.memfile, self.memfile))
def insert_metadata(config, table_columns, table_rows, log):
    try:
        datastore = import_module(config['database_module'])
        tables = ['metadata_clinical', 'metadata_biospecimen', 'metadata_data', 'metadata_samples']
        for index in range(len(tables)):
            datastore.ISBCGC_database_helper.column_insert(config, list(table_rows[index]), tables[index], table_columns[index], log)
    
    except Exception as e:
        log.exception('problem saving metadata to the database')
        raise e
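
The function expects table_columns and table_rows to be parallel lists, one entry per table in the hard-coded order. A sketch of the shapes involved; the column names and row values below are hypothetical:

# illustrative shapes only; column names and rows are hypothetical
table_columns = [
    ['case_barcode', 'vital_status'],         # metadata_clinical
    ['sample_barcode', 'sample_type'],        # metadata_biospecimen
    ['file_name', 'data_type'],               # metadata_data
    ['sample_barcode', 'project_short_name']  # metadata_samples
]
table_rows = [
    [('TCGA-XX-0001', 'Alive')],
    [('TCGA-XX-0001-01A', 'Primary Tumor')],
    [('somefile.txt', 'Clinical')],
    [('TCGA-XX-0001-01A', 'TCGA-UCS')]
]
insert_metadata(config, table_columns, table_rows, log)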
Example #4
def get_song_checker(player_name):
    'Returns the "currentSong" function for the specified player.'

    if player_name not in song_checkers:
        modname = 'nowplaying.%s' % player_name
        if modname not in sys.modules:
            sys.modules[modname] = import_module(modname)
    try:
        return song_checkers[player_name]
    except KeyError:
        return None
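
The lookup assumes that importing nowplaying.<player> populates the shared song_checkers registry as a side effect. A hedged sketch of what such a player module might look like; the module path and the registration mechanism are assumptions:

# hypothetical nowplaying/mpd.py; the shared registry import is an assumption
from nowplaying import song_checkers

def current_song():
    return 'Artist - Title'   # placeholder implementation

song_checkers['mpd'] = current_song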
Example #5
File: spider.py  Project: Gzure/spider
def add_task():
    """
    :param url: str target url
    :param name: task name
    :param url_type: desc info
    :param script: python script for handler url response
    :param start method: script scheduler method
    :param depends: requirments txt
    :param scheduler_type: interval or cron
    :return:
    """
    script = request.files.get('script')
    task = request.form.copy().to_dict()
    if script:
        path = 'crawlers/' + script.filename
        script.save(path)
    else:
        if 'script' in task:
            del task['script']

    depends = task.get('depends', None)
    if depends:
        for depend in depends.split(','):
            c_o = commands.getoutput('pip install %s' % depend)
            LOG.info('c_o:%s', c_o)
    task.pop('depends', None)  # avoid a KeyError when no depends field was posted

    trigger_value = task['trigger_value']
    trigger_dict = {}
    for entry in trigger_value.split(','):
        key, value = entry.split(':', 1)
        trigger_dict[key] = int(value) if value.isdigit() else value

    del task['trigger_value']
    task.update(trigger_dict)

    try:
        if scheduler.get_job(task['id']):
            scheduler.remove_job(task['id'])

        # if crawler file change, we should reload it
        crawler_module = util.import_module(task['func'])
        reload(crawler_module)

        job = scheduler.add_job(**task)
        return redirect('/index.html')
    except ConflictingIdError:
        return jsonify(dict(error_message='Job %s already exists.' %
                            task.get('id')),
                       status=409)
    except Exception as e:
        LOG.exception(e)
        return jsonify(dict(error_message=str(e)), status=500)
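
For reference, the trigger_value form field is a comma-separated list of key:value pairs that becomes keyword arguments for scheduler.add_job; a short sketch of the parsing above with an assumed interval trigger:

# illustrative parse of a hypothetical trigger_value for an interval trigger
trigger_value = 'trigger:interval,hours:1,minutes:30'
trigger_dict = {}
for entry in trigger_value.split(','):
    key, value = entry.split(':', 1)
    trigger_dict[key] = int(value) if value.isdigit() else value
# trigger_dict -> {'trigger': 'interval', 'hours': 1, 'minutes': 30}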
Example #6
    def test_mirna_isoform(self):
        self.config[
            'input_id_file'] = 'gdc/doc/gdc_manifest_mirnaiso.2016-12-12_test_40.tsv'
        module = import_module(self.config['gcs_wrapper'])
        module.open_connection(self.config, self.log)

        try:
            project = 'TCGA-UCS'
            data_type = 'Isoform Expression Quantification'
            self.run_upload(self.config, project, data_type, self.log)
        finally:
            module.close_connection()
Example #7
def instantiate_etl_class(config, program_name, data_type, log):
    etl_class = None
    if data_type in config[program_name]['process_files']['datatype2bqscript']:
        log.info('\t\t\tinstantiating etl class %s' %
                 (config[program_name]['process_files']['datatype2bqscript']
                  [data_type]['class']))
        etl_module_name = config[program_name]['process_files'][
            'datatype2bqscript'][data_type]['python_module']
        module = import_module(etl_module_name)
        etl_class_name = config[program_name]['process_files'][
            'datatype2bqscript'][data_type]['class']
        Etl_class = getattr(module, etl_class_name)
        etl_class = Etl_class(config)
    return etl_class
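
The lookups above imply a nested config layout keyed by program, then data type; a minimal sketch of such a fragment, where the module path and class name are hypothetical:

# hypothetical config fragment consumed by instantiate_etl_class()
config = {
    'TCGA': {
        'process_files': {
            'datatype2bqscript': {
                'Isoform Expression Quantification': {
                    'python_module': 'gdc.etl.isoform_expression',  # assumed module path
                    'class': 'IsoformExpression'                    # assumed class name
                }
            }
        }
    }
}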
Example #8
def store_metadata(config, log, table, key2metadata):
    '''
    calls the store_metadata method in the module specified by the configuration file
    
    parameters:
        config: the configuration map
        log: logger to log any messages
        table: the mysql table to save the metadata to
        key2metadata: the metadata to save
    '''
    metadata_modules = config['metadata_modules']
    for metadata_module in metadata_modules:
        module = import_module(metadata_module)
        module.store_metadata(config, log, table, key2metadata)
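
Each module named in config['metadata_modules'] therefore needs a store_metadata function with this signature; a minimal hedged sketch of one such module:

# hypothetical module listed in config['metadata_modules']
def store_metadata(config, log, table, key2metadata):
    log.info('storing %s records into %s' % (len(key2metadata), table))
    # actual persistence of key2metadata would go here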
Example #9
def finalize(config, log):
    if config['process_annotation']:
        call_metadata2annotation(config, log)

    if config['process_case'] and config['process_metadata_attrs']:
        for program_name in config['program_names']:
            if 0 == len(config['program_name_restrict']
                        ) or program_name in config['program_name_restrict']:
                postproc_module = import_module(
                    config[program_name]['process_cases']['postproc_case']
                    ['postproc_module'])
                postproc_module.process_metadata_attrs(config, log)

    if 'process_images' in config and config['process_images']:
        process_images(config, log)
Example #10
def insert_metadata(config, table_columns, table_rows, log):
    try:
        datastore = import_module(config['database_module'])
        tables = [
            'metadata_clinical', 'metadata_biospecimen', 'metadata_data',
            'metadata_samples'
        ]
        for index in range(len(tables)):
            datastore.ISBCGC_database_helper.column_insert(
                config, list(table_rows[index]), tables[index],
                table_columns[index], log)

    except Exception as e:
        log.exception('problem saving metadata to the database')
        raise e
Example #11
def initializeDB(config, log):
    module = import_module(config['database_module'])
    helper = module.ISBCGC_database_helper
    helper.initialize(config, log)

    # populate the data_availability tables
    if config['update_schema']:
        isb_labels = set(config['data_type2isb_label'].values())
        for build in config['genomic_builds']:
            params = [[build, isb_label] for isb_label in isb_labels]
            for program_name in config['program_name_restrict']:
                helper.column_insert(
                    config, params,
                    '%s_metadata_data_type_availability' % (program_name),
                    ['genomic_build', 'isb_label'], log)
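
The update_schema branch inserts one row per (genomic_build, isb_label) pair into each program's availability table; a sketch of the rows that end up in such a table, with build and label values that are only illustrative:

# illustrative values; real builds and labels come from the configuration
isb_labels = {'DNA_Methylation', 'RNA_Expression'}
params = [[build, isb_label]
          for build in ('HG19', 'HG38')
          for isb_label in isb_labels]
# e.g. [['HG19', 'DNA_Methylation'], ['HG19', 'RNA_Expression'], ['HG38', ...], ...]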
Example #12
def process_cghub(config, type_uri = 'detail', log = None, removedups = False, limit = -1, verbose = False, print_response = False):
    """
    return type:
        tumor_type2cghub_records: organizes the cghub record classes per tumor type
    """
    log_info(log, 'begin process cghub')
    module = import_module(config['cghub_module'])
    mappings = config['metadata_locations']['cghub']
    cghub_records, _ = module.main(mappings['study'], log = log, removedups = removedups, limit = limit)
    tumor_type2cghub_records = {}
    count = 0
    seen_bad_codes = set()
    for cghub_record in cghub_records:
        if 0 == count % 8192:
            log_info(log, '\tprocess %s cghub records' % (count))
        count += 1
        tumor_type2cghub_records.setdefault(cghub_record.disease_abbr, []).append(create_cghub_metadata(mappings, cghub_record, seen_bad_codes, log))
    log_info(log, 'finished process cghub: %s total records' % (count))
    return tumor_type2cghub_records
Example #13
    def setup(self, opts):
        self.uri = opts.get('Uri')      # url to load
        self.rate = int(opts.get('Rate', 30))   # rate in seconds
        self.timeout = int(opts.get('Timeout', 30)) # timeout for IO operations

        # load known URI schemes
        scheme = urlparse.urlparse(self.uri).scheme
        if scheme == 'http' or scheme == 'https':
            # load a page over http
            self.update = self.update_http
        elif scheme == 'file':
            # load data from a file
            self.update = self.update_file
        elif scheme == 'python':
            # load data by calling a python function
            u = util.import_module(urlparse.urlparse(self.uri).netloc)
            self.update = lambda: self.process(u(opts))
        else:
            raise ValueError("Unknown URI scheme: " + scheme)
Example #14
    def __init__(self, channel_name, encoder_names):
        # the channel this tube will use
        self.channel = dict()
        # the list of encoders this tube will use - order matters
        self.encoders = list()

        if not isinstance(encoder_names, list):
            raise TypeError("Encoders must be specified as a list of string names.")
        if not channel_name or not isinstance(channel_name, str):
            raise TypeError("Channel name must be specified as a string.")

        channel_class = util.import_module('sneakers.channels', channel_name)
        for encoder in encoder_names:
            if not encoder or not isinstance(encoder, str):
                raise TypeError("Encoders must be specified as a list of string names.")
            encoder_class = util.import_module('sneakers.encoders', encoder.lower())
            self.encoders.append({'name': encoder, 'class': encoder_class()})

        self.channel = {'name': channel_name, 'class': channel_class()}
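
A hedged usage sketch, assuming the constructor belongs to a class named Tube and that the named channel and encoder modules exist under sneakers.channels and sneakers.encoders:

# hypothetical channel/encoder names; they must resolve to modules under the sneakers package
tube = Tube('icmp', ['b64', 'xor'])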
Example #15
def uploadTCGA(configFileName):
    print datetime.now(), 'begin uploadTCGA()'
    global executor
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        
        log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'top_processing')
        log = logging.getLogger(log_name)
        log.info('begin uploadTCGA()')
        
        executor = futures.ThreadPoolExecutor(max_workers=config['threads'])
        
        module = import_module(config['database_module'])
        module.ISBCGC_database_helper.initialize(config, log)
     
        if config['upload_files'] or config['upload_etl_files']:
            # open the GCS wrapper here so it can be used by all the tumor types/platforms to save files
            gcs_wrapper.open_connection()
        info_status(config, log)
        tumor_type2platform2archive_types2archives, platform2archive2metadata = process_latestarchive(config, log_name)
        prepare_upload(tumor_type2platform2archive_types2archives, log)
        if 'process_cghub' not in config or config['process_cghub']:
            tumor_type2cghub_records = process_cghub(config, log=log, removedups=True, limit=-1)
        else:
            log.warning('\n\t====================\n\tnot processing CGHub records this run!\n\t====================')
            tumor_type2cghub_records = {}
        barcode2metadata = process_metadata_current(config, log_name)
        if 'process_annotations' not in config or config['process_annotations']:
            barcode2annotations = process_annotations(config, log_name)
        else:
            log.warning('\n\t====================\n\tnot processing annotations this run!\n\t====================')
            barcode2annotations = {}
        process_tumortypes(config, log_dir, tumor_type2platform2archive_types2archives, platform2archive2metadata, tumor_type2cghub_records, barcode2metadata, barcode2annotations, log)
    finally:
        if executor:
            executor.shutdown(wait=False)
        if gcs_wrapper:
            gcs_wrapper.close_connection()
    log.info('finish uploadTCGA()')
    print datetime.now(), 'finish uploadTCGA()'
Example #16
def register_config_items(configitem):
    """Registers config items with Mercurial's registrar.

    The argument is a ``registrar.configitem`` instance.
    """
    # TRACKING hg43
    configitems = import_module('mercurial.configitems')

    configitem(b'bugzilla', b'username', default=configitems.dynamicdefault)
    configitem(b'bugzilla', b'apikey', default=configitems.dynamicdefault)
    configitem(b'bugzilla', b'password', default=configitems.dynamicdefault)
    configitem(b'bugzilla', b'userid', default=configitems.dynamicdefault)
    configitem(b'bugzilla', b'cookie', default=configitems.dynamicdefault)
    configitem(b'bugzilla',
               b'firefoxprofile',
               default=configitems.dynamicdefault)
    configitem(b'bugzilla', b'url', default=configitems.dynamicdefault)
    configitem(b'mozilla',
               b'trustedbmoapikeyservices',
               default=configitems.dynamicdefault)
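
A sketch of the caller side, assuming the standard Mercurial extension boilerplate for building a configitem registrar; the surrounding extension is hypothetical:

# typical call site in a Mercurial extension; the extension itself is hypothetical
from mercurial import registrar

configtable = {}
configitem = registrar.configitem(configtable)

register_config_items(configitem)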
Example #17
def uploadGDC():
    print datetime.now(), 'begin uploadGDC()'

    gcs_wrapper = None
    try:
        args = parseargs()
        with open(args.config) as configFile:
            config = json.load(configFile)

        log_dir = str(date.today()).replace(
            '-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'top_processing')
        log = logging.getLogger(log_name)

        log.info('begin uploadGDC()')

        initializeDB(config, log)

        if config['upload_files'] or config['upload_etl_files']:
            # open the GCS wrapper here so it can be used by all the projects/platforms to save files
            gcs_wrapper = import_module(config['gcs_wrapper'])
            gcs_wrapper.open_connection(config, log)

        for endpt_type in config['endpt_types']:
            log.info('processing %s endpoints' % (endpt_type))
            if config['process_annotation']:
                process_annotations(config, endpt_type, log_dir)
            else:
                log.warning(
                    '\n\t====================\n\tnot processing annotations this run!\n\t===================='
                )
            process_programs(config, endpt_type, log_dir, log)
        finalize(config, log)
    except:
        raise
    finally:
        if gcs_wrapper:
            gcs_wrapper.close_connection()

    log.info('finished uploadGDC()')
    print datetime.now(), 'finished uploadGDC()'
Example #18
def __insert_rows(config, endpt_type, tablename, values, mapfilter, log):
    maps = []
    for value in values:
        maps += flatten_map(value, mapfilter)
    print_list_synopsis(maps, '\t\trows to save for %s' % (tablename), log)

    module = import_module(config['database_module'])
    fieldnames = module.ISBCGC_database_helper.field_names(tablename)
    rows = []
    for nextmap in maps:
        rows += __addrow(endpt_type, fieldnames, nextmap, log)
    if config['update_cloudsql']:
        #     def select(cls, config, stmt, log, params = [], verbose = True):
        wherelist = []
        for fieldname in fieldnames:
            wherelist += ['%s = %%s' % (fieldname)]
        stmt = 'select %s from %s where %s' % (fieldnames[0], tablename,
                                               ' and '.join(wherelist))
        count = 0
        for index in range(8):
            if len(rows) == index:
                break
            result = module.ISBCGC_database_helper.select(
                config, stmt, log, rows[index])
            count += 1 if len(result) > 0 else 0
        if count == min(len(rows), 8):
            log.warning(
                '\n\t====================\n\tfirst %d records already saved for %s, skipping\n\t===================='
                % (count, tablename))
            return
        elif 0 < count:
            raise ValueError(
                'only some of the first %d records were saved for %s' %
                (count, tablename))
        module.ISBCGC_database_helper.column_insert(config, rows, tablename,
                                                    fieldnames, log)
    else:
        log.warning(
            '\n\t====================\n\tnot saving to cloudsql to %s this run!\n\t===================='
            % (tablename))
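
Before inserting, the helper probes up to the first eight rows with a SELECT over every field; a sketch of the statement the loop builds, for a hypothetical two-column table:

# illustrative output of the stmt-building loop; table and field names are hypothetical
fieldnames = ['file_gdc_id', 'case_gdc_id']
wherelist = ['%s = %%s' % (fieldname) for fieldname in fieldnames]
stmt = 'select %s from %s where %s' % (fieldnames[0], 'TCGA_metadata_data_HG38',
                                       ' and '.join(wherelist))
# -> 'select file_gdc_id from TCGA_metadata_data_HG38 where file_gdc_id = %s and case_gdc_id = %s'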
Example #19
def process_cases(config, endpt_type, program_name, project_name, log_dir):
    try:
        log_name = create_log(log_dir, project_name + '_cases')
        log = logging.getLogger(log_name)

        log.info('begin process_cases(%s)' % (project_name))
        case2info = get_map_rows(config, endpt_type, 'case', program_name,
                                 get_filter(project_name), log)
        save2db(
            config, endpt_type, '%s_metadata_clinical' % (program_name),
            case2info,
            config[program_name]['process_cases']['clinical_table_mapping'],
            log)

        remove_null_samples(case2info, log)
        save2db(config, endpt_type, '%s_metadata_biospecimen' % (program_name),
                case2info,
                config[program_name]['process_cases']['sample_table_mapping'],
                log)

        # fill in the rest of the metadata depending on the program
        if 0 < len(case2info.values()):
            postproc_module = import_module(
                config[program_name]['process_cases']['postproc_case']
                ['postproc_module'])
            postproc_module.postprocess(config, project_name, endpt_type, log)

        log.info('finished process_cases(%s)' % (project_name))

        #         log.info('begin process_cases(%s) for omf files' % (project_name))
        #         omf2info = get_omf_map_rows(config, project_name, log)
        #         save2db(config, 'metadata_gdc_clinical', case2info, config['process_cases']['clinical_table_mapping'], log)
        #         log.info('finished process_cases(%s) for omf files' % (project_name))

        return case2info
    except:
        log.exception('problem processing cases(%s):' % (project_name))
        raise
    finally:
        close_log(log)
Example #20
def store_metadata(config, log, table, key_metadata):
    if not config['process_bio']:
        return
    count = 0
    count_upload = 0
    not_data_fields = set()
    upload_exts = set()
    field2stats = {}
    nospecies = []
    nosdrf = set()
    insert_str_listlist = []
    datastore = import_module(config['database_module'])
    field_names = datastore.ISBCGC_database_helper.field_names(table)
    field_name2column = dict([(column_name, index) for index, column_name in enumerate(field_names)])
    list_fields = config['list_fields']
    log.info('\tstarting store metadata')
    for metadata in key_metadata.itervalues():
        try:
            # skip cellline/control samples:
            if 'SampleBarcode' in metadata and '20' == metadata['SampleBarcode'][13:15]:
                continue

            list_values = [None for _ in range(len(field_name2column))]
            for field, value in metadata.iteritems():
                # make sure every string field is stored as a string rather than unicode
                # unicode is stored as a blob in the datastore
                try:
                    if value in (None, '->'):
                        metadata[field] = None
                    else:
                        if field in list_fields:
                            metadata[field] = [str(v.encode('ascii', 'ignore').strip()) for v in value]
                            try:
                                value = '(' + ','.join(value) + ')'
                            except:
                                log.exception('problem setting list %s: %s' % (field, value))
                        else:
                            metadata[field] = str(value.encode('ascii', 'ignore').strip())
                        # and add to the statistics for this field
                        stats = field2stats.setdefault(field, [0, set()])
                        stats[0] += 1
                        try:
                            stats[1].add(value)
                        except:
                            log.exception('problem adding value %s' % (value))
                        
                    if field in field_name2column:
                        list_values[field_name2column[field]] = value
                    else:
                        not_data_fields.add(field)
                    
                    if 'DatafileUploaded' == field and 'true' == value:
                        count_upload += 1
                        upload_exts.add(metadata['DatafileName'][metadata['DatafileName'].rfind('.') + 1:])
                except Exception as e:
                    log.exception("problem with converting to string and recording stats for %s: %s" % (field, value))
                    raise e
                    
            insert_str_listlist += [list_values]
            archive = metadata['DataArchiveName'] if 'DataArchiveName' in metadata else 'NO_ARCHIVE'
            if 'metadata_data' == table and 'Species' not in metadata:
                nospecies.append(metadata['DatafileName'] + ' ' + metadata['DataLevel'] + ' ' + metadata['DatafileUploaded'] + ' ' + archive)
            if 'metadata_data' == table and 'SDRFFileName' not in metadata and 'project_accession' not in metadata:
                nosdrf.add(metadata['DatafileName'] + ' ' + metadata['DataLevel'] + ' ' + metadata['DatafileUploaded'] + ' ' + archive)

            if 0 == count % 1024:
                log.info('\tinsert statement %s file metadata.  latest: %s' % (count, metadata))
            count += 1

        
        except Exception as e:
            log.exception('problem in store_metadata()')
            raise e
    log.info('\tsetup %s total records, %s to upload with extensions %s' % (count, count_upload, ','.join(upload_exts)))

    # now save to cloudsql
    datastore.ISBCGC_database_helper.insert(config, insert_str_listlist, table, log)
    
    log.info('\tstatistics:')
    fields = field2stats.keys()
    fields.sort()
    for field in fields:
        stats = field2stats[field]
        try:
            log.info('\t\tstats for %s(%s:%s): %s' % (field, len(stats[1]), stats[0], ','.join(stats[1]) if 21 > len(stats[1]) else stats[1].pop()))
        except:
            log.exception('problem with %s(total: %s distinct: %s)' % (field, stats[0], len(stats[1])))
    if 0 < len(nospecies) or 0 < len(nosdrf):
        if nospecies == nosdrf:
            log.warning( '\tfiles with no species match files with no sdrf file(%s):\n\t\t%s' % (len(nospecies), '\n\t\t'.join(list(nospecies)[:150])))
        else:
            log.warning( '\tfiles with no species(%s):\n\t\t%s' % (len(nospecies), '\n\t\t'.join(list(nospecies)[:150])))
            log.warning( '\tfiles with no sdrf file(%s):\n\t\t%s' % (len(nosdrf), '\n\t\t'.join(list(nosdrf)[:150])))
    
    log.info('\tfinished store metadata.  stored %s total records' % (count))
Example #21
def store_metadata(config, log, table, key_metadata):
    if not config['process_bio']:
        return

    count = 0
    count_upload = 0
    not_data_fields = set()
    upload_exts = set()
    field2stats = {}
    fields2value = {}      # per-field count of non-null values (referenced below)
    fields2maxlength = {}  # per-field maximum value length (referenced below)
    nospecies = []
    nosdrf = set()
    insert_str_listlist = []
    datastore = import_module(config['database_module'])
    field_names = datastore.ISBCGC_database_helper.field_names(table)
    field_name2column = dict([
        (column_name, index) for index, column_name in enumerate(field_names)
    ])
    list_fields = config['list_fields']
    log.info('\tstarting store metadata for %s' % (table))
    # test issue 758
    maftype2protocol2count = {}
    for metadata in key_metadata.itervalues():
        try:
            # skip cellline/control samples:
            if 'SampleBarcode' in metadata and '20' == metadata[
                    'SampleBarcode'][13:15]:
                continue

            list_values = [None for _ in range(len(field_name2column))]
            for field, value in metadata.iteritems():
                # make sure every string field is stored as a string rather than unicode
                # unicode is stored as a blob in the datastore
                try:
                    if value in (None, '->', '', 'None'):
                        metadata[field] = None
                    else:
                        fields2value[field] = fields2value.setdefault(
                            field, 0) + 1
                        fields2maxlength[field] = max(
                            fields2maxlength.setdefault(field, 0), len(value))
                        if field in list_fields:
                            metadata[field] = [
                                str(v.encode('ascii', 'ignore').strip())
                                for v in value
                            ]
                            try:
                                value = '(' + ','.join(value) + ')'
                            except:
                                log.exception('problem setting list %s: %s' %
                                              (field, value))
                        else:
                            metadata[field] = str(
                                value.encode('ascii', 'ignore').strip())
                        # and add to the statistics for this field
                        stats = field2stats.setdefault(field, [0, set()])
                        stats[0] += 1
                        try:
                            stats[1].add(value)
                        except:
                            log.exception('problem adding value %s' % (value))

                    if field in field_name2column:
                        list_values[field_name2column[field]] = value
                    else:
                        not_data_fields.add(field)

                    # test issue 758
                    if 'SecurityProtocol' == field:
                        protocol2count = None
                        if metadata['DatafileName'].endswith('protected.maf'):
                            protocol2count = maftype2protocol2count.setdefault(
                                'protected', {})
                        elif metadata['DatafileName'].endswith('somatic.maf'):
                            protocol2count = maftype2protocol2count.setdefault(
                                'somatic', {})
                        elif metadata['DatafileName'].endswith('vcf'):
                            protocol2count = maftype2protocol2count.setdefault(
                                'vcf', {})

                        if dict == type(protocol2count):
                            newcount = protocol2count.setdefault(
                                metadata['SecurityProtocol'], 0) + 1
                            protocol2count[
                                metadata['SecurityProtocol']] = newcount

                    if 'DatafileUploaded' == field and 'true' == value:
                        log.info(
                            'DatafileUploaded should not be true for %s:%s' %
                            (metadata['Platform'], metadata['DatafileName']))
                        count_upload += 1
                        upload_exts.add(metadata['DatafileName']
                                        [metadata['DatafileName'].rfind('.') +
                                         1:])
                except Exception as e:
                    log.exception(
                        "problem with converting to string and recording stats for %s: %s"
                        % (field, value))
                    raise e

            insert_str_listlist += [list_values]
            archive = metadata[
                'DataArchiveName'] if 'DataArchiveName' in metadata else 'NO_ARCHIVE'
            if 'metadata_data' == table and 'Species' not in metadata:
                nospecies.append(metadata['DatafileName'] + ' ' +
                                 metadata['DataLevel'] + ' ' +
                                 metadata['DatafileUploaded'] + ' ' + archive)
            if 'metadata_data' == table and 'SDRFFileName' not in metadata and 'project_accession' not in metadata:
                nosdrf.add(metadata['DatafileName'] + ' ' +
                           metadata['DataLevel'] + ' ' +
                           metadata['DatafileUploaded'] + ' ' + archive)

            if 0 == count % 1024:
                log.info('\tinsert statement %s file metadata.  latest: %s' %
                         (count, metadata))
            count += 1

        except Exception as e:
            log.exception('problem in store_metadata()')
            raise e
    log.info('\tsetup %s total records, %s to upload with extensions %s' %
             (count, count_upload, ','.join(upload_exts)))

    # test issue 758
    if 0 < len(maftype2protocol2count):
        log.info('\tmaf security binning:')
        for filetype, protocol2count in maftype2protocol2count.iteritems():
            for protocol, count in protocol2count.iteritems():
                log.info('\t\t%s: %s\t%s' % (filetype, protocol, count))

    # now save to cloudsql
    datastore.ISBCGC_database_helper.insert(config, insert_str_listlist, table,
                                            log)

    log.info('\tstatistics:')
    fields = field2stats.keys()
    fields.sort()
    for field in fields:
        stats = field2stats[field]
        try:
            log.info(
                '\t\tstats for %s(%s:%s): %s' %
                (field, len(stats[1]), stats[0],
                 ','.join(stats[1]) if 21 > len(stats[1]) else stats[1].pop()))
        except:
            log.exception('problem with %s(total: %s distinct: %s)' %
                          (field, stats[0], len(stats[1])))
    if 0 < len(nospecies) or 0 < len(nosdrf):
        if nospecies == nosdrf:
            log.warning(
                '\tfiles with no species match files with no sdrf file(%s):\n\t\t%s'
                % (len(nospecies), '\n\t\t'.join(list(nospecies)[:150])))
        else:
            log.warning('\tfiles with no species(%s):\n\t\t%s' %
                        (len(nospecies), '\n\t\t'.join(list(nospecies)[:150])))
            log.warning('\tfiles with no sdrf file(%s):\n\t\t%s' %
                        (len(nosdrf), '\n\t\t'.join(list(nosdrf)[:150])))

    log.info('\tfinished store metadata.  stored %s total records' % (count))
Example #22
def uploadTCGA(configFileName):
    '''
    based on the configuration map loaded from the configFileName, loads the DCC data into GCS.  also
    obtains metadata based on file paths, SDRF values, and CGHub manifest values
    
    parameters:
        configFileName: the file name of the configuration map
    '''
    print datetime.now(), 'begin uploadTCGA()'
    global executor
    gcs_wrapper = None
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)

        run_dir = str(date.today()).replace(
            '-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(run_dir, 'top_processing')
        log = logging.getLogger(log_name)
        log.info('begin uploadTCGA()')
        executor = futures.ThreadPoolExecutor(max_workers=config['threads'])
        info_status(config, log)

        setup_database(config, log)

        # open the GCS wrapper here so it can be used by all the tumor types/platforms to save files
        gcs_wrapper = import_module(config['gcs_wrapper'])
        gcs_wrapper.open_connection(config, log)

        tumor_type2platform2archive_types2archives, platform2archive2metadata = process_latestarchive(
            config, run_dir, log_name)
        prepare_upload(tumor_type2platform2archive_types2archives, log)
        if 'process_cghub' not in config or config['process_cghub']:
            tumor_type2cghub_records = process_cghub(config,
                                                     run_dir,
                                                     log=log,
                                                     removedups=True,
                                                     limit=-1)
        else:
            log.warning(
                '\n\t====================\n\tnot processing CGHub records this run!\n\t===================='
            )
            tumor_type2cghub_records = {}
        barcode2metadata = process_metadata_current(config, run_dir, log_name)
        if 'process_annotations' not in config or config['process_annotations']:
            barcode2annotations = process_annotations(config, run_dir,
                                                      log_name)
        else:
            log.warning(
                '\n\t====================\n\tnot processing annotations this run!\n\t===================='
            )
            barcode2annotations = {}
        process_tumortypes(config, run_dir,
                           tumor_type2platform2archive_types2archives,
                           platform2archive2metadata, tumor_type2cghub_records,
                           barcode2metadata, barcode2annotations, log)

        # associate the annotation metadata with the other metadata tables
        associate_metadata2annotation(config, log)

        # print out the stats
        metadata_modules = config['metadata_modules']
        for metadata_module in metadata_modules:
            module = import_module(metadata_module)
            module.print_combined_stats(log)
    finally:
        if executor:
            executor.shutdown(wait=False)
    log.info('finish uploadTCGA()')

    try:
        # upload the logs and TCGA files used for upload to GCS
        upload_run_files(config, run_dir, log)
    except Exception as e:
        log.exception('problem moving the logs and run files to GCS')
    finally:
        if gcs_wrapper:
            gcs_wrapper.close_connection()

    print datetime.now(), 'finish uploadTCGA()'
Example #23
def store_metadata(config, log, table, key2metadata):   
    metadata_modules = config['metadata_modules']
    for metadata_module in metadata_modules:
        module = import_module(metadata_module)
        module.store_metadata(config, log, table, key2metadata)
Example #24
from functools import wraps
from unittest import TestCase
from mock import patch, MagicMock, call
import util
import gpg_wrapper_common


MODULE_NAME = "gpg_key"
LIBRARY_PATH = "library"

module = util.import_module(MODULE_NAME, LIBRARY_PATH)

class GPGWrapperTestCase(gpg_wrapper_common.GPGKeyTestCase):
    gpg_class = module.GPG

def patch_path(keyring='', exists=True, isdir=False):
    from os.path import isdir as original_isdir
    from os.path import exists as original_exists

    def _isdir(path):
        if path == keyring:
            return isdir
        return original_isdir

    def _exists(path):
        if path == keyring:
            return exists
        return original_exists

    def decorator(method):
        @wraps(method)
Example #25
File: _mem.py  Project: davidkhess/mem
    def __init__(self, mem, subdir, memfile="Memfile"):
        self.mem = mem
        self.orig_dir = os.path.abspath(os.curdir)
        self.subdir = os.path.join(self.orig_dir, subdir)
        self.memfile = os.path.join(self.subdir, memfile)
        self.mf = util.import_module(self.memfile, self.memfile)
Example #26
from mock import patch, call, MagicMock
from functools import wraps
from unittest import TestCase
from mock import Mock
from ansible.runner.return_data import ReturnData
import gpg_common
import gpg_remote_common
import gpg_plugin_common
import gpg_wrapper_common
from gpg_remote_common import patch_execute_module, provided_mocks
import util

MODULE_NAME = "gpg"
LIBRARY_PATH = "plugins/action_plugins"

module = util.import_module(MODULE_NAME, LIBRARY_PATH, _as="_gpg")

class ConnectionTestCase(gpg_remote_common.ConnectionTestCase):
    connection_class = module.Connection
    connection_error = module.ConnectionError

    @patch_execute_module(return_value=1)
    def test_run_gpg(self, e_m):
        c = self.get_connection()
        result = c.run_gpg(a=1, b=2)
        args = dict(a=1, b=2)
        e_m.assert_called_once_with('gpg', args, check=False)
        self.assertEqual(result, 1)

    def patch_tempfile(name="saymyname"):
        def decorator(method):
Example #27
def main(configFilename, openETLFilename, contETLFilename, outputFilename,
         contBucketContentFilename):
    '''
    process the two ETL files and obtain the metadata for the filenames in contETLFilename
    that aren't in openETLFilename and save to the output file
    
    parameters:
        configFilename: the file name of the configuration map
        openETLFilename: the data metadata file produced by the open access upload run (note: this will have controlled access
        metadata for those controlled access files that appear in the metadata)
        contETLFilename: the data metadata file produced by the controlled access upload run
        outputFilename: the file to write the results to
        contBucketContentFilename: the listing of files in the controlled access bucket
    '''
    with open(configFilename) as configFile:
        config = json.load(configFile)
    run_dir = str(date.today()).replace(
        '-', '_') + '_' + config['log_dir_tag'] + '/'
    log_name = create_log(run_dir, 'top_processing')
    log = logging.getLogger(log_name)
    log.info('%s: start processing controlled vs open upload metadata' %
             (str(datetime.now())))

    openFilename2aliquot2metadata = {}
    processETLFile(openETLFilename, openFilename2aliquot2metadata, log)
    contFilename2aliquot2metadata = {}
    processETLFile(contETLFilename, contFilename2aliquot2metadata, log)

    openkeys = set(openFilename2aliquot2metadata.keys())
    contkeys = set(contFilename2aliquot2metadata.keys())
    onlycontkeys = contkeys - openkeys

    bucketFiles = set()
    processBucketContents(contBucketContentFilename, bucketFiles, log)
    intersect = bucketFiles & onlycontkeys
    log.info('\n\tthe count of files only in the controlled access ETL: %s\n\tthe intersection count with the bucket: %s\n\tthe non intersect left in the ETL: %s' % \
        (len(onlycontkeys), len(intersect), len(onlycontkeys - bucketFiles)))

    headerCols = [
        'Pipeline', 'DataArchiveURL', 'DatafileUploaded', 'Datatype', 'Study',
        'DataArchiveVersion', 'DataCenterType', 'Project', 'Platform',
        'DataLevel', 'ParticpantBarcode', 'SecurityProtocol', 'SampleBarcode',
        'AliquotBarcode', 'IncludeForAnalysis', 'DataCenterName',
        'DatafileName', 'DataCenterCode', 'DataArchiveName', 'Species',
        'AliquotUUID'
    ]
    inserts = []
    missing_cols = {}
    missing_filenames = set()
    missing_aliquots = set()
    with open(outputFilename, 'w') as outfile:
        for key in onlycontkeys:
            for metadata in contFilename2aliquot2metadata[key].itervalues():
                outfile.write(metadata)
                metadata_json = json.loads(metadata)
                if len(metadata_json) > len(headerCols):
                    raise ValueError(
                        'found unknown column(s): %s' %
                        (','.join(set(metadata_json.keys()) -
                                  set(headerCols))))
                newinsert = []
                for header in headerCols:
                    if header in metadata_json:
                        newinsert += [metadata_json[header]]
                    else:
                        newinsert += [None]
                        count = missing_cols.setdefault(header, 0) + 1
                        missing_filenames.add(key)
                        missing_aliquots.add(metadata_json['AliquotBarcode'])
                        missing_cols[header] = count
            inserts += [newinsert]
        log.info('\n\tmissing cols: %s from files %s and aliquots %s\n' %
                 (', '.join('%s: %s' % (column, count)
                            for column, count in missing_cols.iteritems()),
                  len(missing_filenames), len(missing_aliquots)))

    db_module = import_module(config['database_module'])
    db_module.ISBCGC_database_helper.column_insert(config, inserts,
                                                   'metadata_data', headerCols,
                                                   log)

    log.info('%s: finished processing controlled vs open upload metadata' %
             (str(datetime.now())))
Example #28
File: __init__.py  Project: johntyree/mem
def import_memfile(f):
    return util.import_module(f, f)
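
Several of these examples call util.import_module with both a module name and a file-system path, unlike the single-argument importlib.import_module. A minimal sketch of what such a path-based helper could look like; this is an assumption for illustration, not any of the projects' actual implementation:

# hedged sketch of a path-based import_module helper; not the projects' actual util.import_module
import importlib.util
import os
import sys

def import_module(name, path):
    """Load module `name` from `path` (a .py file or a directory containing name + '.py')."""
    if os.path.isdir(path):
        path = os.path.join(path, name + '.py')
    spec = importlib.util.spec_from_file_location(name, path)
    module = importlib.util.module_from_spec(spec)
    sys.modules[name] = module
    spec.loader.exec_module(module)
    return module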