Example No. 1
def __build_reader(
    path: str,
    delimiter: str = ',',
    logger: Logger = getLogger(__name__)
) -> 'csv_reader':
    """Build a CSV reader object

    Parameters
    ----------
    path: str
        Path to the file to build the reader from
    delimiter: str, optional
        Pattern to separate columns
    logger: Logger, optional
        Logger used to report read errors

    Returns
    -------
    CSV reader object
    """
    if isinstance(path, bytes):
        reader = csv_reader(
            StringIO(path.decode('utf-8')),
            delimiter=delimiter
        )
    else:
        try:
            reader = csv_reader(
                open(path, 'r'),
                delimiter=delimiter
            )
        except FileNotFoundError:
            logger.error('Could not read file: '+str(path))
            return None
    next(reader)  # skip the first (header) row
    return reader
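
A minimal usage sketch for the helper above (the data.csv path and the delimiter are hypothetical); it relies on the fact that the function returns None when the file cannot be read and has already consumed the header row:

reader = __build_reader('data.csv', delimiter=';')
if reader is not None:
    for row in reader:
        # the header row was already consumed by __build_reader
        print(row)

# Raw bytes are also accepted and wrapped in a StringIO
reader = __build_reader(b'a;b;c\n1;2;3\n', delimiter=';')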
Example No. 2
    def parseSignificativeFeaturesFile(self, fileName, isBedFormat=False):
        #TODO: HEADER
        relevantFeatures = {}
        if os_path.isfile(fileName):
            with open(fileName, 'rU') as inputDataFile:
                for line in csv_reader(inputDataFile, delimiter="\t"):
                    if isBedFormat:
                        lineProc = line[0] + "_" + line[1] + "_" + line[2]
                    else:
                        lineProc = line[0]

                    # If the relevant-features file is not in BED format and has more
                    # than one column, the second column contains the original ID
                    if len(line) > 1 and not isBedFormat:
                        featureID = ":::".join([line[0], line[1]]).lower()
                    else:
                        featureID = lineProc.lower()
                    relevantFeatures[featureID] = 1
            logging.info("PARSING RELEVANT FEATURES FILE (" + fileName +
                         ")... THE FILE CONTAINS " +
                         str(len(relevantFeatures.keys())) +
                         " RELEVANT FEATURES")
        else:
            logging.info("PARSING RELEVANT FEATURES FILE (" + fileName +
                         ")... NO RELEVANT FEATURES FILE SUBMITTED")

        return relevantFeatures
Example No. 3
def CreateConstMuskingumXFile(x_value,
                              in_connectivity_file,
                              out_x_file):
    """
    Create muskingum X file from value that is constant all the way through for each river segment.
    
    Args:
        x_value(float): Value for the muskingum X parameter [0-0.5].
        in_connectivity_file(str): The path to the RAPID connectivity file.
        out_x_file(str): The path to the output x file.
    
    Example::
    
        from RAPIDpy.gis.muskingum import CreateConstMuskingumXFile
        #------------------------------------------------------------------------------
        #main process
        #------------------------------------------------------------------------------
        if __name__ == "__main__":
            CreateConstMuskingumXFile(x_value=0.3,
                                      in_connectivity_file='/path/to/rapid_connect.csv',
                                      out_x_file='/path/to/x.csv',
                                      )
    """
    num_rivers = 0
    with open_csv(in_connectivity_file, "r") as csvfile:
        reader = csv_reader(csvfile)
        for row in reader:
            num_rivers+=1

    with open_csv(out_x_file,'w') as kfile:
        x_writer = csv_writer(kfile)
        for idx in xrange(num_rivers):
            x_writer.writerow([x_value])    
Example No. 4
def load_csv(filename):
    dataset = []
    with open(filename, 'r') as file:
        reader = csv_reader(file)
        for row in reader:
            dataset.append(row)
    return dataset
Example No. 5
def get_ref_band_weights(file: str, col: int, nbands: int) -> np.ndarray:
    ref_band_weights_reader = csv_reader(open(file, "r"))
    ban_data = {
        int(row[0][1:]): float(row[col])
        for row in ref_band_weights_reader
    }
    return np.array([ban_data.get(iW, 0.0) for iW in range(nbands)])
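
The comprehension above assumes the first CSV column holds band labels such as 'B0', 'B2', ...; bands without a row fall back to 0.0. A small sketch with a hypothetical weights.csv:

# weights.csv (hypothetical contents):
#   B0,0.10,0.20
#   B2,0.30,0.40
weights = get_ref_band_weights('weights.csv', col=2, nbands=4)
# -> array([0.2, 0. , 0.4, 0. ])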
Example No. 6
def create_namedtuple_from_csv(name, csv):
    l = logic.Mutex()

    l.is_filename = (isinstance(csv, str) and '\n' not in csv
                     and os.path.isfile(csv))

    l.is_csv_text = (isinstance(csv, str) and '\n' in csv and ',' in csv)

    l.is_csv_lines = (not isinstance(csv, str)
                      and (hasattr(csv, '__iter__') or hasattr(csv, 'next')))

    lines = None

    with l as g:
        if g.is_filename:
            with open(csv, 'r') as f:
                lines = f.read().split('\n')

        elif g.is_csv_text:
            lines = csv.split('\n')

        elif g.is_csv_lines:
            lines = csv

    data = [r for r in csv_reader(lines)]
    mutable_sheets = set()
    mutable = True if name in mutable_sheets else False
    return create_namedtuple(name, data, mutable=mutable)
Example No. 7
def copy_all(fname,dt=None):
    if dt!=None:
        DT_TD=timedelta(seconds=dt)
    ten_min=timedelta(minutes=10)

    with open(fname+'.csv', 'rU') as fin:
        csv_file=csv_reader(fin)
        csv_file.next() ##read off header

        for line in csv_file:
            if len(line)==0:
                continue

            event_num=line[1]
            timestamp=datetime.strptime(line[0].split('.')[0], '%Y-%m-%d %H:%M:%S')
            if dt==None:
                DT_TD=timedelta(seconds=int(line[2]))

            print
            print "copying event", event_num
            present_time=timestamp-DT_TD
            while True:
                print "  start time:", present_time
                file_start,s=load_file_10MIN(present_time)
                if file_start+ten_min>=timestamp+DT_TD:
                    break
                present_time=file_start+ten_min
Example No. 8
def CreateConstMuskingumXFile(x_value, in_connectivity_file, out_x_file):
    """
    Create muskingum X file from value that is constant all the way through
    for each river segment.

    Parameters
    ----------
    x_value: float
        Value for the muskingum X parameter [0-0.5].
    in_connectivity_file: str
        The path to the RAPID connectivity file.
    out_x_file: str
        The path to the output x file.


    Example::

        from RAPIDpy.gis.muskingum import CreateConstMuskingumXFile

        CreateConstMuskingumXFile(
            x_value=0.3,
            in_connectivity_file='/path/to/rapid_connect.csv',
            out_x_file='/path/to/x.csv')

    """
    num_rivers = 0
    with open_csv(in_connectivity_file, "r") as csvfile:
        reader = csv_reader(csvfile)
        for _ in reader:
            num_rivers += 1

    with open_csv(out_x_file, 'w') as kfile:
        x_writer = csv_writer(kfile)
        for _ in xrange(num_rivers):
            x_writer.writerow([x_value])
Example No. 9
 def create_attn_list(self):
     """
     Parses the CSV file containing the attenuation to apply during the test.
     """
     file_content = None
     if self._scenario_uri.startswith('http'):
         file_to_load = NamedTemporaryFile(delete=True)
         urlretrieve(self._scenario_uri, file_to_load.name)
         file_content = open(file_to_load.name, 'r')
         self._log.info("SCENARIO_FILE=%s" % file_to_load.name)
     else:
         if (os_path.isfile(self._scenario_uri)):
             file_content = open(self._scenario_uri, 'r')
             self._log.info("SCENARIO_FILE=%s" %
                            os_path.basename(self._scenario_uri))
         else:
             self._log.error("Can't open the scenario file %s" %
                             self._scenario_uri)
     if file_content is not None:
         self._log.info("Loading scenario...")
         reader = csv_reader(file_content, delimiter=';')  # semi_column
         for row in reader:
             if reader.line_num == 1:
                 self._num_of_aps = len(row) - SHIFT_COLUMN
                 self._log.info("Found %d APs" % self._num_of_aps)
                 self._ap_list = row[SHIFT_COLUMN:len(row)]
                 self._log.debug("APs list is: %s" % self._ap_list)
             else:
                 self._scenario_values.append(row)
Example No. 10
def CreateConstMuskingumXFile(x_value, in_connectivity_file, out_x_file):
    """
    Create muskingum X file from value that is constant all the way through for each river segment.
    
    Args:
        x_value(float): Value for the muskingum X parameter [0-0.5].
        in_connectivity_file(str): The path to the RAPID connectivity file.
        out_x_file(str): The path to the output x file.
    
    Example::
    
        from RAPIDpy.gis.muskingum import CreateConstMuskingumXFile
        #------------------------------------------------------------------------------
        #main process
        #------------------------------------------------------------------------------
        if __name__ == "__main__":
            CreateConstMuskingumXFile(x_value=0.3,
                                      in_connectivity_file='/path/to/rapid_connect.csv',
                                      out_x_file='/path/to/x.csv',
                                      )
    """
    num_rivers = 0
    with open_csv(in_connectivity_file, "r") as csvfile:
        reader = csv_reader(csvfile)
        for row in reader:
            num_rivers += 1

    with open_csv(out_x_file, 'w') as kfile:
        x_writer = csv_writer(kfile)
        for idx in xrange(num_rivers):
            x_writer.writerow([x_value])
Example No. 11
def main():
    for infilename in sys.argv[1:]:
        outfilename = sub("\.csv", "_pad.csv", infilename)
        prev_dt = -1
        week = timedelta(days=7)
        one = timedelta(days=1)
        with open(outfilename, "wb") as outfile:
            w = csv_writer(outfile)
            with open(infilename, "rb") as infile:
                r = csv_reader(infile)
                header = r.next()
                w.writerow(header)
                for row in r:
                    dt = dt_parser.parse(row[0])
                    if prev_dt != -1:
                        # we're past the first line... compare!
                        diff = dt - prev_dt
                        if diff > one:
                            for i in reversed(range(diff.days - 1)):
                                wahoo = timedelta(days=(i+1))
                                pad = dt - wahoo
                                #print >> sys.stderr, "padding:%s" % pad
                                w.writerow([_get_dt_str(pad), 0])
                    w.writerow([_get_dt_str(dt), row[1]])
                    prev_dt = dt
Example No. 12
def read_colorbrewer(iterable):
    res = defaultdict(dict)

    iterator = iter(iterable)
    fieldnames = next(csv_reader(iterator, DIALECT))
    reader = DictReader(iterator, fieldnames, dialect=DIALECT)

    for row in reader:

        def int_cell(colname):
            return int(row[colname])

        color_name = row["ColorName"]

        if color_name:
            num_of_colors = int_cell("NumOfColors")

            colors = []
            res[color_name][num_of_colors] = colors

        try:
            colors.append(tuple(map(int_cell, "RGB")))  # reads the 'R', 'G', 'B' columns
        except ValueError:
            # data section is over
            break

    return res
Example No. 13
def csv_file_column_names(filename):
    path = join(BASE_DATASOURCES_DIR, filename)
    with open(path, "r") as f:
        reader = csv_reader(f)
        for row in reader:
            return row
        raise IOError("csv file {0} has no rows".format(filename))
Example No. 14
def main():
    csv_input_file = open(PATH_TO_CSV, 'r')
    reader = csv_reader(csv_input_file, delimiter=',')

    csv_output_file = open("out.csv", 'w', encoding='utf-8-sig', newline='')
    fieldnames = ['keyword', 'freq']
    writer = DictWriter(csv_output_file, fieldnames=fieldnames)  # csv.DictWriter is needed for fieldnames/writeheader and dict rows
    writer.writeheader()

    for row in reader:
        pdf_path = PATH_TO_PDF + row[0] + ".pdf"
        json_path = PATH_TO_JSON + row[0] + ".json"

        keywords = get_keywords(json_path)
        cleared_keywords = get_cleared_keywords(keywords)
        pdf_words = get_cleared_pdf(pdf_path)

        keywords_freq = get_keywords_freq(pdf_words, cleared_keywords)

        for i in range(len(keywords)):
            writer.writerow({
                'keyword': keywords[i],
                'freq': keywords_freq[i]
            })

    csv_input_file.close()
    csv_output_file.close()
Example No. 15
def main(args: Namespace) -> None:
    caption_indices = set()  # Safe to do b/c embeddings inserted in right order
    embeddings = []
    filenames = glob(path.join(args.data_dir, args.embed_dir, '*.pickle'))
    # Sort filenames in-place by numerical value of file (not lexicographically)
    filenames.sort(key=lambda filename: int(filename.split('/')[-1][:-7]))
    print('Sorted partial embedding files')
    for filename in filenames:
        with open(filename, 'rb') as partial_embed:
            # Zip iterator of (caption index, 2048-dim NumPy image embedding)
            for index, embed in load(partial_embed):
                caption_indices.add(index)
                embeddings.append(embed)
    print('Started stacking embeddings after loading them into memory')
    # Stack embeddings together into single matrix before saving
    embeddings = stack(embeddings)
    print('Finished stacking embeddings')
    save(path.join(args.data_dir, args.outfile), embeddings)
    print('Finished saving embeddings')
    # Save pruned captions as simple text file (no need for TSV anymore)
    with open(path.join(args.data_dir, args.infile), newline='') as tsvfile:
        tsv_reader = csv_reader(tsvfile, delimiter='\t')
        with open(path.join(args.data_dir, args.pruned_captions),
                  'w') as outfile:
            for i, row in enumerate(tsv_reader):
                if i in caption_indices:
                    outfile.write(f'{row[0]}\n')
    print('Finished saving pruned captions')
Example No. 16
def get_arp_table(*, modify=False, host=None):
    '''
    return arp table as dictionary

        {IPv4Address(ip): mac} = get_arp_table(modify=True)

    if modify is set to True, the ":" will be removed from the mac addresses.

    if host is specified, return just the mac address of the host sent in, returning None if host is not present.
    '''

    with open('/proc/net/arp') as arp_table:
        # 'IP address', 'HW type', 'Flags', 'HW address', 'Mask', 'Device'
        arp_table = list(
            csv_reader(arp_table, skipinitialspace=True, delimiter=' ')
        )

    if (modify):
        arp_table = {IPv4Address(a[0]): a[3].replace(':', '') for a in arp_table[1:]}

    else:
        arp_table = {IPv4Address(a[0]): a[3] for a in arp_table[1:]}

    if (host):
        return arp_table.get(host, None)

    else:
        return arp_table
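
A short usage sketch for the helper above; the sample address is hypothetical, and host lookups expect the same IPv4Address keys the function builds:

from ipaddress import IPv4Address

# full table, with ':' stripped from the MAC addresses
table = get_arp_table(modify=True)

# single host lookup; returns None if the host is not in the ARP cache
mac = get_arp_table(host=IPv4Address('192.168.1.1'))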
Example No. 17
def get_exwoce_params():
    """Return a dictionary of WOCE parameters allowed for Exchange conversion.

    Returns:
        {'PMNEMON': {
            'unit_mnemonic': 'WOCE', 'range': [0.0, 10.0], 'format': '%8.3f'}}

    """
    reader = csv_reader(open(WOCE_PARAMS_FOR_EXWOCE, 'r'))

    # First line is header
    reader.next()

    params = {}
    for order, row in enumerate(reader):
        if row[-1] == 'x':
            continue
        if not row[1]:
            row[1] = None
        if row[2]:
            prange = map(float, row[2].split(','))
        else:
            prange = None
        if not row[3]:
            row[3] = None
        params[row[0]] = {
            'unit_mnemonic': row[1],
            'range': prange,
            'format': convert_fortran_format_to_c(row[3]),
            'order': order,
        }
    return params
Example No. 18
        def unicode_csv_reader(csvfile):
            
            records = []
            for data in csv_reader(csvfile):
              
                #skip the first line of the csv
                if data[0] == "Identifier":
                    continue
                
                #label = offense_type
                
                #features used include:
                    #Day of Week,Occurrence Month,
                    #Occurrence Day,
                    #Occurrence Hour,CompStat Month,
                    #CompStat Day,
                    #Sector,Precinct,
                    #Borough,Jurisdiction,
                    #XCoordinate, YCoordinate, Location
                    
                label = data[10]
                feature = data[3:5] + data[6:9] + data[11:-1]
                
                records.append([label, feature])

            return records
Example No. 19
def check_src_in_sink_1(
    source_inchi: str, sink_file: str,
    logger: Logger = getLogger(__name__)) -> int:
    """
    Check if source is present in sink file. InChIs have to be strictly equal.

    Parameters
    ----------
    source_inchi: str
        InChI string of the source compound.
    sink_file: str
        Path to file containing the sink.
    logger : Logger
        The logger object.

    Returns
    -------
    int Return code.

    """

    logger.info('   |- Source in Sink (simple)')

    try:
        with open(sink_file, 'r') as f:
            for row in csv_reader(f, delimiter=',', quotechar='"'):
                if source_inchi == row[1]:
                    logger.error('        source has been found in sink')
                    return -1

    except FileNotFoundError as e:
        logger.error(e)
        return -2

    return 0
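
A usage sketch for the check above, with a hypothetical sink file; the return codes come straight from the function body (0 on success, -1 if the source InChI is found in the sink, -2 if the sink file is missing):

ret = check_src_in_sink_1(
    source_inchi='InChI=1S/H2O/h1H2',
    sink_file='/path/to/sink.csv',
)
if ret == -1:
    print('source is already in the sink')
elif ret == -2:
    print('sink file not found')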
Example No. 20
def read_scorefile(filepath, gene, offset):
    if not exists(filepath):
        raise ValueError("file '%s' doesn't exist!" % filepath)

    with open(filepath) as fh:
        reader = csv_reader(fh, delimiter='\t')

        header = reader.next()

        # some error correction
        if len(header) < 3:
            raise RuntimeError('expected at least 3 columns')
        if header[0].lower() != 'position':
            raise RuntimeError("expected first column label to be 'position'")
        if header[1].lower() != 'aa':
            raise RuntimeError("expected second column label to be 'aa'")

        drug_idxs = [(i + 2, d) for i, d in enumerate(header[2:])]
        pos_by_drug = dict((d, []) for d in header[2:])

        for row in reader:
            # coordinates are assumed 1-indexed
            coord = int(row[0]) + offset - 1
            amino = row[1].upper()
            klass = (DrugCoord.DELETION  if amino == '-' else
                     DrugCoord.INSERTION if amino == 'INSERT' else
                     DrugCoord.VALUE)
            for idx, drug in drug_idxs:
                val = int(row[idx])
                if val > 0:
                    pos_by_drug[drug].append(DrugCoord(coord, klass, amino, val))

        return dict((d, DrugData(d, gene, pos_by_drug[d])) for _, d in drug_idxs)
Example No. 21
def process_gcc_split(data_dir: str, tsvname: str) -> List[List[str]]:
    lines = []
    with open(path.join(data_dir, tsvname), newline='') as tsvfile:
        tsv_reader = csv_reader(tsvfile, delimiter='\t')
        for line in tsv_reader:
            lines.append([detokenizer.detokenize(line[0].split()), line[1]])
    return lines
Example No. 22
def load_attributes(filename, solr_sources, bq_sources):
    attr_file = open(filename, "r")
    for line in csv_reader(attr_file):
        if line[0] not in ATTR_SET:
            ATTR_SET[line[0]] = new_attribute(
                line[0], line[0].replace("_", " ").title() if re.search(
                    r'_', line[1]) else line[1], Attribute.CATEGORICAL
                if line[2] == 'CATEGORICAL STRING' else Attribute.STRING
                if line[2] == "STRING" else Attribute.CONTINUOUS_NUMERIC,
                True if line[-1] == 'True' else False, True)
        attr = ATTR_SET[line[0]]
        if attr['name'] != 'gcs_url':
            attr['solr_collex'].extend(solr_sources)
        attr['bq_tables'].extend(bq_sources)

        attr['set_types'].append({
            'set': DataSetType.IMAGE_DATA,
            'child_record_search': 'StudyInstanceUID'
        })

        if attr['name'] in DISPLAY_VALS:
            if 'preformatted_values' in DISPLAY_VALS[attr['name']]:
                attr['preformatted_values'] = True
            else:
                attr['display_vals'] = DISPLAY_VALS[attr['name']]['vals']

    attr_file.close()
Example No. 23
def parse_weights(weights_data_file, transcription_factors_line,
                  start_of_table_line, end_of_table):
    deleted_genes = []
    read_out_genes = []
    _data_matrix = []

    with open(weights_data_file, 'rb') as in_file:
        reader = csv_reader(in_file, delimiter='\t')
        for line_number, line in enumerate(reader):
            if line_number == transcription_factors_line:
                deleted_genes = [string.split(' ')[0] for string in line[1:]]
            if end_of_table > line_number > start_of_table_line:
                read_out_genes.append(line[0])
                # print line
                # print line_number
                _data_matrix.append([nan_helper(value) for value in line[1:]])
                # one line is all experimental conditions for a single gene

    _data_matrix = np.array(_data_matrix)

    # check that we got everything properly
    logging.info('deleted genes:\t%s', len(deleted_genes))
    logging.info('genes in read-out:\t%s', len(read_out_genes))
    logging.info('data matrix shape:\t%s', _data_matrix.shape)

    return _data_matrix, deleted_genes, read_out_genes
Example No. 24
def open_csv(inpath, namefile, convert_to_float=False):
#===============================================================================

    from csv import reader as csv_reader

    # open file, read all lines
    inputpath = os.path.join(inpath,namefile)
    f=open(inputpath,'rU') 
    reader=csv_reader(f, delimiter=',', skipinitialspace=True)
    lines=[]
    for row in reader:
        lines.append(row)
    f.close()

    # storing headers in list headerow
    headerow=lines[0]

    # deleting rows that are not data (first and last rows of the file)
    del lines[0]

    # transforming data from string to float type
    converted_data=[]
    for line in lines:
        if convert_to_float==True:
            converted_data.append(map(float,line))
        else:
            converted_data.append(line)
    data = np.array(converted_data)

    # creating one dictionnary and storing the float data in it
    dictnamelist= {}
    for j,varname in enumerate(headerow):
        dictnamelist[varname]=data[:,j]
    
    return dictnamelist
Example No. 25
    def __parse_file_list(cls, dir_info, file_filter=FAFileFilterEnum.FILES_AND_DIRS):
        file_list = []
        f = StringIO(dir_info)
        reader = csv_reader(f, delimiter=",")
        rows = []
        for row in reader:
            rows.append(row)
        if len(rows) > 0:
            if len(rows[0]) != 1 or rows[0][0] != "WLANSD_FILELIST":
                raise FACommandAPIException("Unexpected file entry result at first line", rows[0])
            # TODO: implement a mapping function for filtering by type
            logging.getLogger().debug("Row count of file information : " + str(len(rows)))
            for row in rows[1:]:
                if len(row) != 6:
                    raise FACommandAPIException("Unknown file entry ", row)
                if file_filter is None or file_filter is FAFileFilterEnum.FILES_AND_DIRS:
                    logging.getLogger().debug("Filtering disabled.")
                    file_list.append({"Path": row[0] + "/" + row[1], "DecimalDate": row[4], "DecimalTime": row[5]})
                elif file_filter is FAFileFilterEnum.FILES_ONLY:  # only files
                    if int(row[3]) == FA_FILE_IDENTIFIER:
                        logging.getLogger().debug("Filtering only files.")
                        file_list.append({"Path": row[0] + "/" + row[1], "DecimalDate": row[4], "DecimalTime": row[5]})
                elif file_filter is FAFileFilterEnum.DIRS_ONLY:  # only directories
                    if int(row[3]) == FA_DIR_IDENTIFIER:
                        logging.getLogger().debug("Filtering only directories.")
                        file_list.append({"Path": row[0] + "/" + row[1], "DecimalDate": row[4], "DecimalTime": row[5]})
                else:
                    raise FACommandAPIException("Unknown file filtering!")

        return file_list
Example No. 26
def csv_to_dict(ppl, namesfile):
    with open(namesfile, 'r') as fd:
        read_csv = csv_reader(fd, delimiter=',')
        for row in read_csv:
            row[0] = row[0].strip().title()  # The person name
            row[1] = row[1].strip()  # The e-mail
            ppl.append(tuple((row[0], row[1])))
Example No. 27
 def _m_chemXref(self, chem_xref_path):
     chemXref = {}
     with open(chem_xref_path) as f:
         c = csv_reader(f, delimiter='\t')
         for row in c:
             if not row[0][0] == '#':
                 mnx = self._checkMNXMdeprecated(row[1])
                 if len(row[0].split(':')) == 1:
                     dbName = 'mnx'
                     dbId = row[0]
                 else:
                     dbName = row[0].split(':')[0]
                     dbId = ''.join(row[0].split(':')[1:])
                     if dbName == 'deprecated':
                         dbName = 'mnx'
                 #mnx
                 if not mnx in chemXref:
                     chemXref[mnx] = {}
                 if not dbName in chemXref[mnx]:
                     chemXref[mnx][dbName] = []
                 if not dbId in chemXref[mnx][dbName]:
                     chemXref[mnx][dbName].append(dbId)
                 ### DB ###
                 if not dbName in chemXref:
                     chemXref[dbName] = {}
                 if not dbId in chemXref[dbName]:
                     chemXref[dbName][dbId] = mnx
     return chemXref
Example No. 28
 def _m_compXref(self, compXref_path):
     compXref = {}
     name_compXref = {}
     try:
         with open(compXref_path) as f:
             c = csv_reader(f, delimiter='\t')
             #not_recognised = []
             for row in c:
                 #cid = row[0].split(':')
                 if not row[0][0] == '#':
                     #collect the info
                     mnxc = row[1]
                     if len(row[0].split(':')) == 1:
                         dbName = 'mnx'
                         dbCompId = row[0]
                     else:
                         dbName = row[0].split(':')[0]
                         dbCompId = ''.join(row[0].split(':')[1:])
                         dbCompId = dbCompId.lower()
                     if dbName == 'deprecated':
                         dbName = 'mnx'
                     #create the dicts
                     if not mnxc in compXref:
                         compXref[mnxc] = {}
                     if not dbName in compXref[mnxc]:
                         compXref[mnxc][dbName] = []
                     if not dbCompId in compXref[mnxc][dbName]:
                         compXref[mnxc][dbName].append(dbCompId)
                     #create the reverse dict
                     if not dbCompId in name_compXref:
                         name_compXref[dbCompId] = mnxc
     except FileNotFoundError:
         self.logger.error('compXref file not found')
         return {}
     return compXref, name_compXref
Example No. 29
def csv_open(file, expected_columns):
    """
    Yields rows of csv file as dictionaries

    Parameters:
        file - Path, or file-like object, of the CSV file to use
        expected_columns - Columns of the csv file
            If the first row of the CSV file are these labels, take the columns in that order
            Otherwise, take the columns in the order given by expected_columns
    """

    if isinstance(file, str):
        with open(file, encoding='utf-8') as f:
            yield from csv_open(f, expected_columns=expected_columns)
            return

    expected_columns = tuple(expected_columns)

    csv_iter = csv_reader(file)

    first_row = next(csv_iter)

    if set(first_row) == set(expected_columns):
        columns = first_row
    else:
        columns = expected_columns
        csv_iter = chain([first_row], csv_iter)

    for row in csv_iter:
        if len(row) < len(columns):
            raise IndexError("Too few columns in row {!r}".format(row))

        yield dict(zip(columns, row))
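
A minimal usage sketch, assuming a hypothetical people.csv; the generator yields one dict per row whether or not the file starts with a matching header line:

for record in csv_open('people.csv', expected_columns=('name', 'email')):
    print(record['name'], record['email'])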
Example No. 30
def read_from_csv(file_name: str) -> ColourList:
    reader = csv_reader(
        open(file_name, "r", newline=""),
        delimiter=" ",
        quotechar='"',
        quoting=QUOTE_NONNUMERIC,
    )
    return [row for row in reader]
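
Because the reader above is built with QUOTE_NONNUMERIC, every unquoted field comes back as a float; a sketch with a hypothetical, space-delimited colours.csv:

# colours.csv (hypothetical contents):
#   "red" 255 0 0
#   "blue" 0 0 255
colours = read_from_csv('colours.csv')
# -> [['red', 255.0, 0.0, 0.0], ['blue', 0.0, 0.0, 255.0]]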
Example No. 31
def read_csv(csv_file, encoding=r'UTF-8'):
    from csv import reader as csv_reader
    from pathlib import Path
    with Path(csv_file).open(r'rt', encoding=encoding) as istream:
#         return csv_reader(istream) # I/O operation on closed file.
        r = csv_reader(istream)
        for y in r:
            yield y
Example No. 32
def get_info_from_csv(csv_file, Headers_and_data=Headers_and_data):

    from csv import reader as csv_reader

    reader = csv_reader(csv_file)
    iter_reader = iter(reader)
    headers = next(iter_reader)  # Drop the row with column names
    return Headers_and_data(headers, iter_reader)
Example No. 33
def parse_data_summary(file_location):
    data_table = []
    with open(file_location, 'rb') as source:
        reader = csv_reader(source, delimiter='\t')
        header = reader.next()
        for line in reader:
            data_table.append(line)
    return np.array(data_table).astype(np.float)
Example No. 34
 def _get_data(file_name: str):
     DataResolver._validate_file_name(file_name)
     with open(path_join(DataResolver.RES_DIR_PATH, file_name)) as f:
         reader = csv_reader(f)
         try:
             return [[int(elem) for elem in line] for line in reader]
         except ValueError:
             raise ValueError(INVALID_CHAR_IN_RES.format(file_name))
Example No. 35
def loadDictionaryOfTranslationsIfNecessary():
    global phrase_translation
    if not phrase_translation:
        with open(filepath + '/data/translations.csv') as f:
            reader = csv_reader(f, delimiter=',', quotechar='"')
            for row in reader:
                phrase_translation[row[0]] = row[1]
            print("phrase_translation is", phrase_translation)
Example No. 36
def tsv_init(filename):
    """
    Args:
        filename (str)
    """
    tsv_file = open(filename)
    tsv_read = csv_reader(tsv_file, delimiter="\t")
    return tsv_file, tsv_read
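
The helper returns the open file handle together with the reader, so the caller is responsible for closing it; a sketch with a hypothetical data.tsv:

tsv_file, tsv_read = tsv_init('data.tsv')
try:
    for row in tsv_read:
        print(row)
finally:
    tsv_file.close()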
Example No. 37
def parse_direct_connections(direct_connections_data_file):
    TF_2_Genes = defaultdict(lambda: 0)
    with open(direct_connections_data_file) as source:
        reader = csv_reader(source, delimiter=';')
        for line in reader:
            TF_2_Genes[line[0], line[1]] = 1

    return TF_2_Genes
Example No. 38
 def _get_records(self, local=False):
     RECORD = os.path.join(self.path, 'RECORD')
     record_reader = csv_reader(open(RECORD, 'rb'), delimiter=',')
     for row in record_reader:
         path, md5, size = row[:] + [None for i in xrange(len(row), 3)]
         if local:
             path = path.replace('/', os.sep)
             path = os.path.join(sys.prefix, path)
         yield path, md5, size
Example No. 39
def read_csv_file(filename):
    """Read the contents of a CVS file into a dict"""
    with open(filename, encoding='latin-1') as file:
        reader = csv_reader(file)
        next(reader)  # skip header
        for line in reader:
            entry = {}
            for key in CSV_MAP:
                entry[key] = line[CSV_MAP[key]]
            yield entry
Example No. 40
    def run(self):
        dump_logger = getLogger('dumpscraper')
        # Let's invoke the getscore runner and tell him to work on training data
        dump_logger.info("Calculating dump score...")
        running = getscore.DumpScraperGetscore(self.settings, self.parentArgs)
        running.run()

        # First of all let's feed the classifier with the training data
        training = scipy_genfromtxt(self.settings['data_dir'] + "/" + "training/features.csv", delimiter=",", skip_header=1, usecols=(0, 1, 2))
        target = scipy_genfromtxt(self.settings['data_dir'] + "/" + "training/features.csv", delimiter=",", skip_header=1, usecols=(-2))

        clf = sklearn.neighbors.KNeighborsClassifier(10, weights='uniform')
        clf.fit(training, target)

        trash_count = hash_count = plain_count = 0
        cleared = []

        with open(self.settings['data_dir'] + "/" + 'features.csv', 'rb') as csvfile:
            reader = csv_reader(csvfile)

            for line in reader:
                if line[0] == 'Trash score':
                    continue

                features = np_array(line[0:3])
                features = features.reshape(1, -1)
                label = clf.predict(features)

                if label == 0:
                    folder = 'trash'
                    trash_count += 1
                elif label == 1:
                    folder = 'hash'
                    hash_count += 1
                elif label == 2:
                    folder = 'plain'
                    plain_count += 1

                target_file = self.settings['data_dir'] + "/" + 'organized/' + folder + "/" + line[-1]
                target_dir = path.dirname(target_file)

                # If asked for a clean run, let's delete the entire folder before copying any file
                if self.parentArgs.clean and target_dir not in cleared and path.exists(target_dir):
                    cleared.append(target_dir)
                    shutil_rmtree(target_dir)

                if not path.exists(target_dir):
                    makedirs(target_dir)

                shutil_copyfile(self.settings['data_dir'] + "/" + 'raw/' + line[-1], target_file)

        dump_logger.info("Trash files: " + str(trash_count))
        dump_logger.info("Hash files: " + str(hash_count))
        dump_logger.info("Plain files: " + str(plain_count))
        dump_logger.info("Operation completed")
Example No. 41
    def get_paths(self):
        """
        Read the list of installed paths from record or source file.

        Example
        -------
        [(u'skdata/__init__.py', u'sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU', 0),
         (u'skdata/diabetes.py', None, None),
         ...
        ]
        """
        manifest_full_path = self.manifest_full_path
        if manifest_full_path:
            python_version = self.python_version
            sp_dir = get_python_site_packages_short_path(python_version) + "/"
            prepend_metadata_dirname = basename(manifest_full_path) == "installed-files.txt"
            if prepend_metadata_dirname:
                path_prepender = basename(dirname(manifest_full_path)) + "/"
            else:
                path_prepender = ""

            def process_csv_row(row):
                cleaned_path = posix_normpath("%s%s%s" % (sp_dir, path_prepender, row[0]))
                if len(row) == 3:
                    checksum, size = row[1:]
                    if checksum:
                        assert checksum.startswith('sha256='), (self._metadata_dir_full_path,
                                                                cleaned_path, checksum)
                        checksum = checksum[7:]
                    else:
                        checksum = None
                    size = int(size) if size else None
                else:
                    checksum = size = None
                return cleaned_path, checksum, size

            csv_delimiter = ','
            if PY2:
                csv_delimiter = csv_delimiter.encode('utf-8')
            with open(manifest_full_path) as csvfile:
                record_reader = csv_reader(csvfile, delimiter=csv_delimiter)
                # format of each record is (path, checksum, size)
                records = tuple(process_csv_row(row) for row in record_reader if row[0])
            files_set = set(record[0] for record in records)

            _pyc_path, _py_file_re = pyc_path, PY_FILE_RE
            py_ver_mm = get_major_minor_version(python_version, with_dot=False)
            missing_pyc_files = (ff for ff in (
                _pyc_path(f, py_ver_mm) for f in files_set if _py_file_re.match(f)
            ) if ff not in files_set)
            records = sorted(concatv(records, ((pf, None, None) for pf in missing_pyc_files)))
            return records

        return []
Example No. 42
 def load_ami_map(cls):
     ami_map = {}
     with open(dirname(__file__) + "/ami_map.csv", "r") as fd:
         reader = csv_reader(fd, dialect='excel-tab')
         header = reader.next()
         
         for row in reader:
             data = dict(zip(header, row))
             key = (data['os_id'], data['version'], data['region'],
                    data['virtualization_type'])
             ami_map[key] = data['ami_id']
     return ami_map
Example No. 43
    def open_marbach(marbach_file, insertion_index):
        with open(marbach_file, 'rb') as source:
            reader = csv_reader(source, delimiter='\t')
            for line in reader:
                interaction_from = line[0]
                interaction_to = line[1]

                if len(line) > 2:
                    weight = np.abs(float(line[2]))
                else:
                    weight = np.nan

                master_accumulator[(interaction_from, interaction_to)][insertion_index] = weight
Example No. 44
def aggregate_monthly_data(csv_data):
    """
    Pass your `csv_data` as an iterable whose members are individual
    lines of data (e.g. using a generator returned by the `iter_lines()`
    method of a `requests` library `Response` object) from a Climate
    Data Online (CDO)-style CSV file. Your CSV file must include the
    date (`DATE`), precipitation (`PRCP`), minimum temperature (`TMIN`),
    and maximum temperature (`TMAX`). The first line of your data file
    must be a header line.

    Returns a 12-member list of structured monthly data, each of which
    is a dict containing

    - `days_of_data`,
    - `precipitation_total`,
    - `min_temperature_total`,
    - `max_temperature_total`,
    - `all_min_temperatures`, and
    - `all_max_temperatures`.
    """

    csv_data = csv_reader(csv_data)

    header_row = next(csv_data)
    date_index = header_row.index('DATE')
    prcp_index = header_row.index('PRCP')
    tmin_index = header_row.index('TMIN')
    tmax_index = header_row.index('TMAX')

    monthlies = [dict(days_of_data=0, precipitation_total=0,
                      min_temperature_total=0, max_temperature_total=0,
                      all_min_temperatures=[], all_max_temperatures=[])
                 for _ in range(12)]

    for data_row in csv_data:
        row_month = int(data_row[date_index][4:6])
        row_prcp = int(data_row[prcp_index])
        row_tmin = int(data_row[tmin_index])
        row_tmax = int(data_row[tmax_index])

        monthly = monthlies[row_month - 1]
        monthly['days_of_data'] += 1
        monthly['precipitation_total'] += row_prcp
        monthly['min_temperature_total'] += row_tmin
        monthly['max_temperature_total'] += row_tmax
        monthly['all_min_temperatures'].append(row_tmin)
        monthly['all_max_temperatures'].append(row_tmax)

    return monthlies
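
A small sketch with made-up CDO-style lines (any iterable of strings works, for instance the iter_lines() generator mentioned in the docstring); only the header names DATE, PRCP, TMIN and TMAX matter:

csv_lines = [
    'STATION,DATE,PRCP,TMIN,TMAX',
    'X1,20200115,12,-30,45',
    'X1,20200216,0,-10,60',
]
monthlies = aggregate_monthly_data(csv_lines)
january = monthlies[0]
print(january['days_of_data'], january['precipitation_total'])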
Example No. 45
 def antibodies(self):
     antibodies = []
     with open(self.__csvfile) as fh:
         sample = fh.read(MonogramData.__sample_len)
         sniffer = csv_sniffer()
         dialect = sniffer.sniff(sample)
         if not sniffer.has_header(sample):
             raise ValueError(MonogramData.__no_header_msg)
         fh.seek(0)
         reader = csv_reader(fh, dialect)
         # grab everything after the accession column in the header row
         for row in reader:
             antibodies.extend(r.strip() for r in row[1:])
             break
     return antibodies
Example No. 46
def load_vector(file):
    # At the beginning we don't know how large this vector will be.
    chunk_rows = 32768
    cur_len = chunk_rows
    b = np.ndarray(shape=[cur_len], dtype=float)
    with open(file, 'r') as f:
        reader = csv_reader(f,'excel-tab')
        for i, row in enumerate(reader):
            if i >= cur_len:
                # Enlarge the vector if we have to.
                cur_len += chunk_rows
                b.resize([cur_len])
            b[i] = row[0]
    # Probably our vector is now a bit longer than the file ... shrink it!
    b.resize([i+1])
    return b
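
The loader grows its buffer in 32768-row chunks and trims it to the number of rows actually read; a sketch with a hypothetical single-column vector.tsv:

# vector.tsv (hypothetical): one numeric value per line
b = load_vector('vector.tsv')
print(b.shape, b[:5])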
Example No. 47
def open_csv(inpath,filelist,convert_to_float=False):
#===============================================================================

    from csv import reader as csv_reader

    Dict = {}

    for i,namefile in enumerate(filelist):
         
        #print "\nOpening %s......"%(namefile)

        # open file, read all lines
        inputpath = os.path.join(inpath,namefile)
        f=open(inputpath,'rU') 
        reader=csv_reader(f, delimiter=',', skipinitialspace=True)
        lines=[]
        for row in reader:
            lines.append(row)
        f.close()

        # storing headers in list headerow
        headerow=lines[0]

        # deleting rows that are not data (first and last rows of the file)
        del lines[0]

        # transforming data from string to float type
        converted_data=[]
        for line in lines:
            if '' in line:
                newline = []
                for it in line:
                    if it=='': newline += ['-9999.']
                    if it!='': newline += [it]
                line = newline
            converted_data.append(map(float,line))
        data = np.array(converted_data)

        # creating one dictionnary and storing the float data in it
        dictnamelist= {}
        for j,varname in enumerate(headerow):
            dictnamelist[varname]=data[:,j]
        Dict[namefile] = dictnamelist
    
        #print "Dictionary created!"

    return Dict
Example No. 48
File: p12nrpn.py Project: llvtt/P12
def bank_from_csv(filename):
    """Load a bank of NRPN configurations from a CSV file."""
    settings = []
    with open(filename, 'r') as fd:
        reader = csv_reader(fd)
        for row in reader:
            try:
                setting = Setting(
                    name=row[0],
                    number=int(row[1]),
                    min=int(row[2]),
                    max=int(row[3])
                )
                settings.append(setting)
            except Exception:
                print("Could not parse row: %r" % row)
    return settings
Example No. 49
def parse_complex_portal(complex_portal_file):

    def unpack_complex_contents(complex_name):
        unpacked_subnodes = []
        subnode_list = new_nodes[complex_name]['components']

        for sub_node in subnode_list:

            if sub_node in new_nodes[complex_name].keys():
                unpacked_subnodes += unpack_complex_contents(sub_node)

            else:
                if ':' in sub_node or '_9606' in sub_node:
                    pass
                elif '-' in sub_node:
                    unpacked_subnodes.append(sub_node.split('-')[0])
                else:
                    unpacked_subnodes.append(sub_node)

        return unpacked_subnodes

    base = []
    new_nodes = {}

    with open(complex_portal_file, 'rb') as source:
        reader = csv_reader(source, delimiter='\t')
        header = reader.next()
        for line in reader:
            legacy_id = line[0]
            display_name = line[1]
            components = line[4].split('|')
            components = [comp.split('(')[0] for comp in components]
            node = {'ID': legacy_id, 'displayName': display_name, 'components': components}
            new_nodes[node['ID']] = node

    # print new_nodes

    for node in new_nodes.itervalues():
        node['components'] = unpack_complex_contents(node['ID'])
        base += node['components']

    # print new_nodes

    base = list(set(base))

    return new_nodes, base
Example No. 50
def parse_hint(_hint_csv):
    """
    Reads protein-protein relationships from a HiNT database file

    :param _hint_csv: location of the HiNT database tsv file
    :return: {UP_Identifier:[UP_ID1, UP_ID2, ...]}
    """
    local_relations = defaultdict(list)

    with open(_hint_csv, 'r') as source_file:
        hint_reader = csv_reader(source_file, delimiter='\t')
        hint_reader.next()
        for i, fields in enumerate(hint_reader):
            if fields[2] != fields[3]:
                local_relations[fields[3]].append(fields[2])
                local_relations[fields[2]].append(fields[3])
    return dict(local_relations)
Example No. 51
def parse_TRRUST(trrust_file):
    base = []
    ret_dict = {}

    with open(trrust_file, 'rb') as source:
        reader = csv_reader(source, delimiter='\t')
        for line in reader:
            interaction_from = line[0]
            interaction_to = line[1]
            interaction_type = line[2]
            evidence = line[3].split(';')
            evidence_redundancy = len(evidence)
            base.append(interaction_to)
            base.append(interaction_from)
            ret_dict[(interaction_from, interaction_to)] = evidence_redundancy

    base = list(set(base))

    return ret_dict, base
Example No. 52
def parse_bio_grid(bio_grid):
    """
    Parses the given file as a BioGrid file and returns its contents

    :param bio_grid: the location of the BioGrid file that needs to be parsed
    :return:
    """
    ret_dict = {}
    base = []

    with open(bio_grid, 'rb') as source_file:
        biogrid_reader = csv_reader(source_file, 'excel-tab')
        biogrid_reader.next()
        for fields in biogrid_reader:
            ret_dict[tuple(fields[7:9])] = [fields[17]]
            if fields[18] != '-':
                ret_dict[tuple(fields[7:9])].append(fields[18])
            base.append(fields[7])
            base.append(fields[8])

    return ret_dict, base
Example No. 53
def parse_cellnet_grn(cellnet_file):
    base = []
    ret_dict = {}

    with open(cellnet_file, 'rb') as source:
        reader = csv_reader(source, delimiter=',')
        header = reader.next()
        # print header
        for line in reader:
            interaction_no = int(line[0])
            interaction_from = line[1]
            interaction_to = line[2]
            interaction_z_score = float(line[3])
            interaction_correlation = float(line[4])
            base.append(interaction_to)
            base.append(interaction_from)
            ret_dict[(interaction_from, interaction_to)] = interaction_correlation

    base = list(set(base))

    return ret_dict, base
Example No. 54
def create_namedtuple_from_csv(name, csv):
    l = logic.Mutex()

    l.is_filename = (
        isinstance(csv, str) and
        '\n' not in csv and
        os.path.isfile(csv)
    )

    l.is_csv_text = (
        isinstance(csv, str) and
        '\n' in csv and
        ',' in csv
    )

    l.is_csv_lines = (
        not isinstance(csv, str) and (
            hasattr(csv, '__iter__') or
            hasattr(csv, 'next')
        )
    )

    lines = None

    with l as g:
        if g.is_filename:
            with open(csv, 'r') as f:
                lines = f.read().split('\n')

        elif g.is_csv_text:
            lines = csv.split('\n')

        elif g.is_csv_lines:
            lines = csv

    data = [ r for r in csv_reader(lines) ]
    mutable_sheets = set()
    mutable = True if name in mutable_sheets else False
    return create_namedtuple(name, data, mutable=mutable)
Example No. 55
def csv2numpy(source, c_header=True, r_header=True):

    def correct_line(_row):
        return [float(item) if item not in ['inf', '', ' '] else np.inf for item in _row]

    with open(source, 'r') as source_file:
        reader = csv_reader(source_file)

        if c_header:
            c_headers = reader.next()
        else:
            c_headers = []

        r_headers = []
        data_container = []

        for row in reader:
            if r_header:
                r_headers.append(row[0])
                row = row[1:]
            data_container.append(correct_line(row))

        return np.array(data_container), c_headers, r_headers
Example No. 56
genes_to_ids_dict = {}

# translation_file_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Veena data/Mouse_2_human.tsv'
# gene_to_id_file_location = ''
# data_source_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Veena data/both_ENSMUG.csv'
# data_dump_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Veena data/both_ENSHUM.csv'


translation_file_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Veena data/Mouse_2_human.tsv'
gene_to_id_file_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Kp_Km data/mouse_look_up_table.tsv'
data_source_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Kp_Km data/all_significant.csv'
data_dump_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Kp_Km data/all_sig_hum.csv'


with open(translation_file_location, 'r') as source:
    reader = csv_reader(source, delimiter='\t')
    print reader.next()
    for line in reader:
        if line[0] and line[1]:
            if int(line[3]):
                # We still need to account for the confidence in mapping
                high_conf_translation_dict[line[0]] = [line[1], line[2]]
                # print line[0:4]
            else:
                low_conf_translation_dict[line[0]] = [line[1], line[2]]

high_conf_trans = []
low_conf_trans = []


if gene_to_id_file_location:
Example No. 57
log = get_logger(__name__)


interactome_interface_instance = InteractomeInterface(True, True)
interactome_interface_instance.fast_load()

md5_hash = interactome_interface_instance.md5_hash()

print "samples found to test against:\t %s" % interactome_rand_samp_db.find({'size': 2,
                                                                          'sys_hash': md5_hash,
                                                                          'sparse_rounds': False}).count()

essential_genes_bulbs_ids = []

with open(Dumps.analysis_set_bulbs_ids, 'r') as source:
    reader = csv_reader(source)
    for line in reader:
        essential_genes_bulbs_ids += line

essential_genes_bulbs_ids = [int(gene) for gene in essential_genes_bulbs_ids]

values = []
length_width_accumulator = []
essentiality_percentage = []

for i, sample in enumerate(interactome_rand_samp_db.find({'size': 2, 'sys_hash': md5_hash,
                                                                  'sparse_rounds': False})):

    # if i > 10:
    #     break
Example No. 58
 def unicode_csv_reader(csvfile, *args, **kwargs):
     for row in csv_reader(csvfile, *args, **kwargs):
         yield [unicode(cell, encoding) for cell in row]
Example No. 59
    def seqrecords(self, antibodies, clonal=False):
        if clonal:
            raise ValueError('clonal property is not available with Monogram datasets')
        if len(antibodies) > 1:
            raise ValueError('only one antibody can be interrogated with Monogram datasets')

        seqrecords = []
        with open(self.__fastafile) as h:
            source = Verifier(SeqIO.parse(h, 'fasta'), DNAAlphabet)
            try:
                seqrecords = list(source)
            except VerifyError:
                source.set_alphabet(AminoAlphabet)
                seqrecords = list(source)

        underdash = re_compile(r'[_-](\d+)$')
        for r in seqrecords:
            r.id = underdash.sub(r'_\1', r.id)

        ic50s = dict((r.id, []) for r in seqrecords)

        with open(self.__csvfile) as fh:
            sample = fh.read(MonogramData.__sample_len)
            sniffer = csv_sniffer()
            dialect = sniffer.sniff(sample)
            if not sniffer.has_header(sample):
                raise ValueError(MonogramData.__no_header_msg)
            fh.seek(0)
            reader = csv_reader(fh, dialect)
            columns = None
            for i, row in enumerate(reader):
                if columns is None:
                    columns = dict((v.strip(), j) for j, v in enumerate(row))
                    missing = set(antibodies) - set(columns.keys())
                    if len(missing):
                        raise ValueError("antibodies ('%s') not found!" % "', '".join(missing))
                else:
                    acc = underdash.sub(r'_\1', row[0])
                    try:
                        if acc in ic50s:
                            cln_ic50s = [float(row[columns[ab]].strip().lstrip('<>'))
                                         for ab in antibodies
                                         if ab in columns and columns[ab] < len(row)]
                            ic50s[acc].extend(cln_ic50s)
                    except:
                        pass

        drop = []
        for i, r in enumerate(seqrecords):
            if r.id not in ic50s or len(ic50s[r.id]) == 0:
                drop.append(i)
                warn("skipping sequence '%s', VALUE not found" % r.id)
            else:
                values = {'IC50': ic50s[r.id]}
                r.description = json_dumps({
                    'ab': antibodies[0],
                    'values': values
                    })
                r.annotations['antibody'] = values

        for i in sorted(drop, reverse=True):
            del seqrecords[i]

        return seqrecords, clonal, antibodies