Exemplo n.º 1
0
def make_grid_for_file(results_dir, list_file, grid_file):
    """
    read the hit list results_dir/list_file as a data frame and write it
    back out as a 12 x 12 grid to results_dir/grid_file

    the list file must provide a 'pair' column, whose entries look like
    'H3 x V7' (horizontal pool, ' x ', vertical pool), and an 'id' column
    each grid cell holds the sorted ids of the hits for that pool pair,
    joined by single spaces (so the ids are expected to be strings)
    the first output column is headed by grid_file and holds the row names
    H1..H12; the remaining columns are V1..V12

    raises ValueError if a pair does not split into exactly two parts on
    ' x ', and KeyError if it names a pool outside H1..H12 / V1..V12
    """
    pool_range = range(1, 13)
    logger.info('%s => %s', list_file, grid_file)
    df = DataFrame(filename=os.path.join(results_dir, list_file))
    (pairs, hit_ids) = df.get_columns('pair', 'id')
    logger.info('%d hits', len(pairs))
    col_names = ['V' + str(i) for i in pool_range]
    row_names = ['H' + str(i) for i in pool_range]
    # one (initially empty) list of hits for every row, column pair; the
    # cells are created up front so that an unknown pool is a KeyError
    hits_by_cell = {(r, c): [] for r in row_names for c in col_names}
    for (pair, hit_id) in zip(pairs, hit_ids):
        (horiz, vert) = pair.split(' x ')
        # append in place rather than rebuilding the list for every hit
        hits_by_cell[(horiz, vert)].append(hit_id)
    # build the new data frame as a list of (column name, column list)
    # tuples; the first column is the row names
    data_by_column = [(grid_file, row_names)]
    # subsequent columns are by vertical pool
    for c in col_names:
        col_data = [' '.join(sorted(hits_by_cell[(r, c)])) for r in row_names]
        data_by_column.append((c, col_data))
    grid_dataframe = DataFrame(data=data_by_column)
    grid_dataframe.write(os.path.join(results_dir, grid_file))
Exemplo n.º 2
0
def get_control_from_file(filename, simple=True):
    """
    read the file as a data frame and return a dict whose keys are the
    (id, name) pairs to treat as controls; every value is True

    simple mode: every (id, name) row of the file becomes a key

    otherwise the file must also have 'control' and 'exptl' columns, and a
    row becomes a key when any of these holds:
      - its 'control' value is >= its 'exptl' value
      - its name is 'ND', 'nd' or 'N.D.'
      - its id is 'CONTROL'
      - its name is 'IgG'
    the key ('CONTROL', 'IgG') is always added in this mode
    NOTE(review): the >= test is applied to the column values exactly as
    the data frame returns them; confirm they are numeric, not strings

    side effect (non-simple mode only): writes 'id_to_names.txt' in the
    current directory with one row per id, sorted by id, giving the number
    of distinct names seen for that id and those names sorted and joined
    by commas
    """
    logger.info('reading controls from %s', filename)
    control_df = DataFrame(filename=filename)
    control_dict = dict()

    if simple:
        (ids, names) = control_df.get_columns('id', 'name')
        for (i, n) in zip(ids, names):
            control_dict[(i, n)] = True
    else:
        (ids, names, control_vals, exptl_vals) = control_df.get_columns(
            'id', 'name', 'control', 'exptl')
        for (i, n, c, e) in zip(ids, names, control_vals, exptl_vals):
            is_nd = n in ('ND', 'nd', 'N.D.')
            if c >= e or is_nd or i == 'CONTROL' or n == 'IgG':
                control_dict[(i, n)] = True

        # insert some special cases
        control_dict[('CONTROL', 'IgG')] = True

        # collect the distinct names seen for each id
        names_by_id = dict()
        for (i, n) in zip(ids, names):
            names_by_id.setdefault(i, set()).add(n)

        # report them, one row per id
        sorted_ids = sorted(names_by_id)
        cnts = [len(names_by_id[i]) for i in sorted_ids]
        name_strs = [','.join(sorted(names_by_id[i])) for i in sorted_ids]
        df = DataFrame(data=[('id', sorted_ids), ('cnt', cnts), ('names', name_strs)])
        df.write('id_to_names.txt')
    return control_dict