Exemplo n.º 1
    def _init(this, source, target, encoding=DEFAULT_ENCODING):
        """Load and parse the SDF file named by `source`.

        The parent class's `_init` is invoked with `target` first. The file
        at `source` is then read in full, decoded with `encoding`, and handed
        to `s2p.parse_sdf`; the parsed result is kept on `this._sdfrecords`.

        `source` must satisfy `tc.isstring` (checked with an assert).
        """
        super(populate_from_sdf, this)._init(target)

        assert tc.isstring(source)
        with open(source) as sdf_file:
            raw = sdf_file.read()
        # Decode once the whole file has been read.
        text = raw.decode(encoding)

        this._sdfrecords = s2p.parse_sdf(text)
def main(path):
    """Read in the sdf file at `path` and build a SmallMolecule per record.

    Every parsed SDF record is translated into keyword arguments for
    `SmallMolecule` using the `labels` table defined below. For each label
    the raw value is run through its converter (if any), replaced by the
    default when it is None, and the literal string 'n/a' is treated as
    "no value".

    Raises:
        AssertionError: if `path` does not satisfy `typecheck.isstring`.
        Exception: re-raised, after logging, when a field cannot be
            converted, when a required field ends up without a value, or
            when constructing the `SmallMolecule` fails.
    """
    # map field labels to model fields
    properties = ('model_field','required','default','converter')
    # NOTE(review): these two helpers are not used in the code visible here;
    # presumably they belong to the "chemical_name" kludge mentioned below.
    get_primary_name = lambda x: x.split(';')[0].strip()
    get_alternate_names = lambda x: ';'.join([x.strip() for x in x.split(';')[1:]])
    labels = { s2p.MOLDATAKEY:('molfile',True),
               # NOTE: even though these db field are not integers,
               # it is convenient to convert the read in values to INT to make sure they are not interpreted as float values
               'facility_reagent_id': ('facility_id',True,None, lambda x: util.convertdata(x[x.index('HMSL')+4:],int)),
               'salt_id': ('salt_id',True,None, lambda x: util.convertdata(x,int)),
               'lincs_id':('lincs_id',False), #None,lambda x:util.convertdata(x,int)),
               'smiles': ('_smiles',True),
               'molecular_mass':('_molecular_mass',False,None, lambda x: round(util.convertdata(x, float),2)),
               # 'concentration':'concentration',
               }
    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    labels = util.fill_in_column_definitions(properties,labels)

    assert typecheck.isstring(path)
    with open(path) as fh:
        data = fh.read().decode(DEFAULT_ENCODING)

    records = s2p.parse_sdf(data)
    logger.info(str(('read rows: ', len(records))))

    count = 0
    for record in records:
        logger.debug(str(('record', record)))
        initializer = {}
        # `column` (rather than `properties`) avoids shadowing the tuple above.
        for key, column in labels.items():
            logger.debug(str(('look for key: ', key, ', properties: ', column)))
            required = column['required']
            default = column['default']
            converter = column['converter']
            model_field = column['model_field']
            value = record.get(key)

            # Todo, refactor to a method
            try:
                logger.debug(str(('raw value', value)))
                if converter is not None:
                    value = converter(value)
                if value is None and default is not None:
                    value = default
                if value == 'n/a':
                    value = None
                if value is None and required == True:
                    raise Exception(str(('Field is required: ', key, initializer, 'record:', count)))
                logger.debug(str(('model_field: ' , model_field, ', value: ', value)))
                initializer[model_field] = value
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                logger.error(str((exc_type, fname, exc_tb.tb_lineno)))
                logger.error(str(('invalid input', e, 'count', count)))
                # Bare raise keeps the original traceback intact.
                raise

        # follows is a kludge, to split up the entered "chemical_name" field, on ';' - TODO: just have two fields that get entered
        # NOTE(review): no such splitting code is visible here - confirm
        # against the upstream file.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(str(('initializer: ', initializer)))
        try:
            # NOTE(review): the error message below mentions saving, but no
            # save call is visible in this block - confirm whether one is
            # expected here.
            sm = SmallMolecule(**initializer)
            logger.info(str(('sm created:', sm)))
            count += 1
        except Exception as e:
            logger.error(str(('save failed for: ', initializer, 'error',e, 'count: ', count)))
            raise
Exemplo n.º 3
def main(path):
    # Read the SDF file at `path`, map each parsed record onto model fields
    # using the `labels` table, and construct a SmallMolecule per record.
    #
    # NOTE(review): this listing appears to have lost lines (see the notes
    # below); compare against the upstream file before relying on it.

    # Presumably the names of the positions in each `labels` tuple; the loop
    # below reads these same names back as dict keys.
    properties = ('model_field','required','default','converter')
    # Helpers for a ';'-separated string: the first entry vs. the remaining
    # entries re-joined with '; '.
    # NOTE(review): neither helper is used in the code visible here.
    get_primary_name = lambda x: x.split(';')[0].strip()
    get_alternate_names = (
        lambda x: '; '.join([x.strip() for x in x.split(';')[1:]]))
    # NOTE(review): this dict literal looks truncated - the
    # 'facility_reagent_id' entry holds only a converter lambda, a lambda
    # below has no key, the 'date_publicly_available' tuple is not closed,
    # and the dict itself is never closed.
    labels = { s2p.MOLDATAKEY:('molfile',True),
        'facility_reagent_id': (
            lambda x: util.convertdata(x[x.index('HMSL')+4:],int)), 
        'salt_id': ('salt_id',True,None, lambda x: util.convertdata(x,int)),
        'smiles': ('_smiles',False),
            lambda x: round(util.convertdata(x, float),2)),
        'relevant_citations': '_relevant_citations',
        'date_loaded': ('date_loaded',False,None,util.date_converter),
        'date_publicly_available': ('date_publicly_available',False,None,
        'date_updated': ('date_updated',False,None,util.date_converter),
    # Expand the tuples into dicts keyed by the names in `properties`
    # (presumably filling in omitted trailing items - verify in util).
    labels = util.fill_in_column_definitions(properties,labels)
    assert typecheck.isstring(path)
    with open(path) as fh:
        data = fh.read().decode(DEFAULT_ENCODING)

    records = s2p.parse_sdf(data)
    logger.info('rows read: %d ', len(records))
    # Number of records for which a SmallMolecule has been constructed so
    # far; also used as the row number in log messages.
    count = 0
    for record in records:
        # Keyword arguments collected for the SmallMolecule constructor.
        initializer = {}
        for key,properties in labels.items():
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']
            value = record.get(key)

                # NOTE(review): an opening `try:` appears to be missing
                # before this indented block (see the `except` below).
                if(converter != None):
                    value = converter(value)
                # Fall back to the default when there is no value.
                if(value == None ):
                    if( default != None ):
                        value = default
                # The literal 'n/a' is treated as "no value".
                if(value == 'n/a'): value = None
                if(value == None and  required == True):
                    raise Exception(
                        'Field is required: %r, values: %r, row: %d'
                        % (key,initializer,count))
                initializer[model_field] = value
            except Exception, e:
                logger.exception('invalid input, row: %d', count)
                raise e
        # follows is a kludge, to split up the entered "chemical_name" field, 
        # on ';' - TODO: just have two fields that get entered
            # NOTE(review): an opening `try:` appears to be missing before
            # this block as well, and no splitting code is visible.
            sm = SmallMolecule(**initializer)
            count += 1
            # create a default batch - 0
        # NOTE(review): unlike the per-field handler above, this handler
        # logs the failure without re-raising - confirm that is intended.
        except Exception:
            logger.exception('save failed for: %r, row: %d', initializer, count)