def __build_reader(
    path: str,
    delimiter: str = ',',
    logger: Logger = getLogger(__name__)
) -> 'csv_reader':
    """Build a CSV reader object

    Parameters
    ----------
    path: str
        Path to the file to build the reader from
    delimiter: str, optional
        Pattern to separate columns
    logger: Logger, optional

    Returns
    -------
    CSV reader object
    """
    if isinstance(path, bytes):
        reader = csv_reader(
            StringIO(path.decode('utf-8')),
            delimiter=delimiter
        )
    else:
        try:
            reader = csv_reader(
                open(path, 'r'),
                delimiter=delimiter
            )
        except FileNotFoundError:
            logger.error('Could not read file: ' + str(path))
            return None
    # Skip the header line before handing the reader back
    next(reader)
    return reader
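# Usage sketch (illustrative, not part of the original module): exercises the
# bytes branch of __build_reader, which needs no file on disk; note that the
# header row 'a,b' has already been consumed when the reader is returned.
if __name__ == '__main__':
    demo_reader = __build_reader(b'a,b\n1,2\n3,4\n')
    for demo_row in demo_reader:
        print(demo_row)  # ['1', '2'] then ['3', '4']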
def parseSignificativeFeaturesFile(self, fileName, isBedFormat=False):
    # TODO: HEADER
    relevantFeatures = {}

    if os_path.isfile(fileName):
        with open(fileName, 'r') as inputDataFile:
            for line in csv_reader(inputDataFile, delimiter="\t"):
                if isBedFormat:
                    lineProc = line[0] + "_" + line[1] + "_" + line[2]
                else:
                    lineProc = line[0]

                # If the relevants file is not in BED format and contains more than
                # one column, the second column holds the original ID.
                if len(line) > 1 and not isBedFormat:
                    featureID = ":::".join([line[0], line[1]]).lower()
                else:
                    featureID = lineProc.lower()

                relevantFeatures[featureID] = 1

        logging.info("PARSING RELEVANT FEATURES FILE (" + fileName + ")... THE FILE CONTAINS " +
                     str(len(relevantFeatures.keys())) + " RELEVANT FEATURES")
    else:
        logging.info("PARSING RELEVANT FEATURES FILE (" + fileName + ")... NO RELEVANT FEATURES FILE SUBMITTED")

    return relevantFeatures
def CreateConstMuskingumXFile(x_value, in_connectivity_file, out_x_file):
    """
    Create muskingum X file from value that is constant all the way through
    for each river segment.

    Args:
        x_value(float): Value for the muskingum X parameter [0-0.5].
        in_connectivity_file(str): The path to the RAPID connectivity file.
        out_x_file(str): The path to the output x file.

    Example::

        from RAPIDpy.gis.muskingum import CreateConstMuskingumXFile

        if __name__ == "__main__":
            CreateConstMuskingumXFile(x_value=0.3,
                                      in_connectivity_file='/path/to/rapid_connect.csv',
                                      out_x_file='/path/to/x.csv',
                                      )
    """
    num_rivers = 0
    with open_csv(in_connectivity_file, "r") as csvfile:
        reader = csv_reader(csvfile)
        for row in reader:
            num_rivers += 1

    with open_csv(out_x_file, 'w') as kfile:
        x_writer = csv_writer(kfile)
        for idx in xrange(num_rivers):
            x_writer.writerow([x_value])
def load_csv(filename):
    dataset = []
    with open(filename, 'r') as file:
        reader = csv_reader(file)
        for row in reader:
            dataset.append(row)
    return dataset
def get_ref_band_weights(file: str, col: int, nbands: int) -> np.array:
    ref_band_weights_reader = csv_reader(open(file, "r"))
    ban_data = {
        int(row[0][1:]): float(row[col])
        for row in ref_band_weights_reader
    }
    return np.array([ban_data.get(iW, 0.0) for iW in range(nbands)])
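# Usage sketch (illustrative; the file name is hypothetical): the first column
# is expected to look like 'W12' (a band index after a one-character prefix),
# `col` selects which weight column to read, and bands absent from the file
# default to a weight of 0.0.
# weights = get_ref_band_weights('band_weights.csv', col=1, nbands=64)
# assert weights.shape == (64,)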
def create_namedtuple_from_csv(name, csv):
    l = logic.Mutex()
    l.is_filename = (isinstance(csv, str) and
                     '\n' not in csv and
                     os.path.isfile(csv))
    l.is_csv_text = (isinstance(csv, str) and
                     '\n' in csv and
                     ',' in csv)
    l.is_csv_lines = (not isinstance(csv, str) and
                      (hasattr(csv, '__iter__') or hasattr(csv, 'next')))

    lines = None
    with l as g:
        if g.is_filename:
            with open(csv, 'r') as f:
                lines = f.read().split('\n')
        elif g.is_csv_text:
            lines = csv.split('\n')
        elif g.is_csv_lines:
            lines = csv

    data = [r for r in csv_reader(lines)]
    mutable_sheets = set()
    mutable = True if name in mutable_sheets else False
    return create_namedtuple(name, data, mutable=mutable)
def copy_all(fname, dt=None):
    if dt != None:
        DT_TD = timedelta(seconds=dt)
    ten_min = timedelta(minutes=10)

    with open(fname + '.csv', 'rU') as fin:
        csv_file = csv_reader(fin)
        csv_file.next()  # read off header
        for line in csv_file:
            if len(line) == 0:
                continue

            event_num = line[1]
            timestamp = datetime.strptime(line[0].split('.')[0], '%Y-%m-%d %H:%M:%S')
            if dt == None:
                DT_TD = timedelta(seconds=int(line[2]))

            print
            print "copying event", event_num

            present_time = timestamp - DT_TD
            while True:
                print " start time:", present_time
                file_start, s = load_file_10MIN(present_time)

                if file_start + ten_min >= timestamp + DT_TD:
                    break
                present_time = file_start + ten_min
def CreateConstMuskingumXFile(x_value, in_connectivity_file, out_x_file):
    """
    Create muskingum X file from value that is constant
    all the way through for each river segment.

    Parameters
    ----------
    x_value: float
        Value for the muskingum X parameter [0-0.5].
    in_connectivity_file: str
        The path to the RAPID connectivity file.
    out_x_file: str
        The path to the output x file.

    Example::

        from RAPIDpy.gis.muskingum import CreateConstMuskingumXFile

        CreateConstMuskingumXFile(
            x_value=0.3,
            in_connectivity_file='/path/to/rapid_connect.csv',
            out_x_file='/path/to/x.csv')
    """
    num_rivers = 0
    with open_csv(in_connectivity_file, "r") as csvfile:
        reader = csv_reader(csvfile)
        for _ in reader:
            num_rivers += 1

    with open_csv(out_x_file, 'w') as kfile:
        x_writer = csv_writer(kfile)
        for _ in xrange(num_rivers):
            x_writer.writerow([x_value])
def create_attn_list(self):
    """
    Parses the CSV file containing the attenuation to apply during the test.
    """
    file_content = None
    if self._scenario_uri.startswith('http'):
        file_to_load = NamedTemporaryFile(delete=True)
        urlretrieve(self._scenario_uri, file_to_load.name)
        file_content = open(file_to_load.name, 'r')
        self._log.info("SCENARIO_FILE=%s" % file_to_load.name)
    else:
        if os_path.isfile(self._scenario_uri):
            file_content = open(self._scenario_uri, 'r')
            self._log.info("SCENARIO_FILE=%s" % os_path.basename(self._scenario_uri))
        else:
            self._log.error("Can't open the scenario file %s" % self._scenario_uri)

    if file_content is not None:
        self._log.info("Loading scenario...")
        reader = csv_reader(file_content, delimiter=';')  # semicolon-separated
        for row in reader:
            if reader.line_num == 1:
                self._num_of_aps = len(row) - SHIFT_COLUMN
                self._log.info("Found %d APs" % self._num_of_aps)
                self._ap_list = row[SHIFT_COLUMN:len(row)]
                self._log.debug("APs list is: %s" % self._ap_list)
            else:
                self._scenario_values.append(row)
def main():
    for infilename in sys.argv[1:]:
        outfilename = sub("\.csv", "_pad.csv", infilename)
        prev_dt = -1
        week = timedelta(days=7)
        one = timedelta(days=1)
        with open(outfilename, "wb") as outfile:
            w = csv_writer(outfile)
            with open(infilename, "rb") as infile:
                r = csv_reader(infile)
                header = r.next()
                w.writerow(header)
                for row in r:
                    dt = dt_parser.parse(row[0])
                    if prev_dt != -1:
                        # we're past the first line... compare!
                        diff = dt - prev_dt
                        if diff > one:
                            for i in reversed(range(diff.days - 1)):
                                wahoo = timedelta(days=(i + 1))
                                pad = dt - wahoo
                                # print >> sys.stderr, "padding:%s" % pad
                                w.writerow([_get_dt_str(pad), 0])
                    w.writerow([_get_dt_str(dt), row[1]])
                    prev_dt = dt
def read_colorbrewer(iterable):
    res = defaultdict(dict)

    iterator = iter(iterable)
    fieldnames = next(csv_reader(iterator, DIALECT))
    reader = DictReader(iterator, fieldnames, dialect=DIALECT)

    for row in reader:
        def int_cell(colname):
            return int(row[colname])

        color_name = row["ColorName"]
        if color_name:
            num_of_colors = int_cell("NumOfColors")
            colors = []
            res[color_name][num_of_colors] = colors

        try:
            colors.append(tuple(map(int_cell, "RGB")))
        except ValueError:
            # data section is over
            break

    return res
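# Usage sketch (illustrative; assumes DIALECT parses comma-separated text,
# which the original module does not guarantee). Rows with an empty ColorName
# extend the palette started on the last named row.
_demo_rows = [
    "ColorName,NumOfColors,Type,R,G,B",
    "Blues,3,seq,222,235,247",
    ",,,158,202,225",
    ",,,49,130,189",
]
# read_colorbrewer(_demo_rows) would then yield:
# {'Blues': {3: [(222, 235, 247), (158, 202, 225), (49, 130, 189)]}}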
def csv_file_column_names(filename):
    path = join(BASE_DATASOURCES_DIR, filename)
    with open(path, "r") as f:
        reader = csv_reader(f)
        for row in reader:
            return row
    raise IOError("csv file {0} has no rows".format(filename))
def main():
    csv_input_file = open(PATH_TO_CSV, 'r')
    reader = csv_reader(csv_input_file, delimiter=',')

    csv_output_file = open("out.csv", 'w', encoding='utf-8-sig', newline='')
    fieldnames = ['keyword', 'freq']
    writer = csv_writer(csv_output_file)
    # csv.writer has no fieldnames/writeheader(); emit the header as a plain row.
    writer.writerow(fieldnames)

    for row in reader:
        pdf_path = PATH_TO_PDF + row[0] + ".pdf"
        json_path = PATH_TO_JSON + row[0] + ".json"

        keywords = get_keywords(json_path)
        cleared_keywords = get_cleared_keywords(keywords)
        pdf_words = get_cleared_pdf(pdf_path)
        keywords_freq = get_keywords_freq(pdf_words, cleared_keywords)

        for i in range(len(keywords)):
            # The output file is already UTF-8 encoded, so write str, not bytes.
            writer.writerow([keywords[i], keywords_freq[i]])

    csv_input_file.close()
    csv_output_file.close()
def main(args: Namespace) -> None:
    # Safe to do b/c embeddings inserted in right order
    caption_indices = set()
    embeddings = []
    filenames = glob(path.join(args.data_dir, args.embed_dir, '*.pickle'))
    # Sort filenames in-place by numerical value of file (not lexicographically)
    filenames.sort(key=lambda filename: int(filename.split('/')[-1][:-7]))
    print('Sorted partial embedding files')

    for filename in filenames:
        with open(filename, 'rb') as partial_embed:
            # Zip iterator of (caption index, 2048-dim NumPy image embedding)
            for index, embed in load(partial_embed):
                caption_indices.add(index)
                embeddings.append(embed)

    print('Started stacking embeddings after loading them into memory')
    # Stack embeddings together into single matrix before saving
    embeddings = stack(embeddings)
    print('Finished stacking embeddings')
    save(path.join(args.data_dir, args.outfile), embeddings)
    print('Finished saving embeddings')

    # Save pruned captions as simple text file (no need for TSV anymore)
    with open(path.join(args.data_dir, args.infile), newline='') as tsvfile:
        tsv_reader = csv_reader(tsvfile, delimiter='\t')
        with open(path.join(args.data_dir, args.pruned_captions), 'w') as outfile:
            for i, row in enumerate(tsv_reader):
                if i in caption_indices:
                    outfile.write(f'{row[0]}\n')
    print('Finished saving pruned captions')
def get_arp_table(*, modify=False, host=None):
    '''
    Return the ARP table as a dictionary.

        {IPv4Address(ip): mac} = get_arp_table(modify=True)

    If modify is True, the ":" separators are removed from the MAC addresses.

    If host is specified, return just the MAC address of that host, or None
    if the host is not present.
    '''
    with open('/proc/net/arp') as arp_table:
        # 'IP address', 'HW type', 'Flags', 'HW address', 'Mask', 'Device'
        arp_table = list(
            csv_reader(arp_table, skipinitialspace=True, delimiter=' ')
        )

    if (modify):
        arp_table = {IPv4Address(a[0]): a[3].replace(':', '') for a in arp_table[1:]}
    else:
        arp_table = {IPv4Address(a[0]): a[3] for a in arp_table[1:]}

    if (host):
        return arp_table.get(host, None)
    else:
        return arp_table
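# Usage sketch (illustrative, Linux-only since it reads /proc/net/arp):
# full_table = get_arp_table()                               # {IPv4Address('10.0.0.1'): 'aa:bb:cc:dd:ee:ff', ...}
# flat_macs = get_arp_table(modify=True)                     # MACs without ':' separators
# gateway_mac = get_arp_table(host=IPv4Address('10.0.0.1'))  # None if the host is absent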
def get_exwoce_params():
    """Return a dictionary of WOCE parameters allowed for Exchange conversion.

    Returns:
        {'PMNEMON': {
            'unit_mnemonic': 'WOCE',
            'range': [0.0, 10.0],
            'format': '%8.3f'}}
    """
    reader = csv_reader(open(WOCE_PARAMS_FOR_EXWOCE, 'r'))

    # First line is header
    reader.next()

    params = {}
    for order, row in enumerate(reader):
        if row[-1] == 'x':
            continue
        if not row[1]:
            row[1] = None
        if row[2]:
            prange = map(float, row[2].split(','))
        else:
            prange = None
        if not row[3]:
            row[3] = None
        params[row[0]] = {
            'unit_mnemonic': row[1],
            'range': prange,
            'format': convert_fortran_format_to_c(row[3]),
            'order': order,
        }
    return params
def unicode_csv_reader(csvfile):
    records = []
    for data in csv_reader(csvfile):
        # skip the header line of the csv
        if data[0] == "Identifier":
            continue
        # label = offense_type
        # features used include:
        #   Day of Week, Occurrence Month, Occurrence Day, Occurrence Hour,
        #   CompStat Month, CompStat Day, Sector, Precinct, Borough,
        #   Jurisdiction, XCoordinate, YCoordinate, Location
        label = data[10]
        feature = data[3:5] + data[6:9] + data[11:-1]
        records.append([label, feature])
    return records
def check_src_in_sink_1(
    source_inchi: str,
    sink_file: str,
    logger: Logger = getLogger(__name__)
) -> int:
    """
    Check if source is present in sink file. InChIs have to be strictly equal.

    Parameters
    ----------
    source_inchi: str
        InChI string of the source compound.
    sink_file: str
        Path to file containing the sink.
    logger : Logger
        The logger object.

    Returns
    -------
    int
        Return code.
    """
    logger.info('   |- Source in Sink (simple)')
    try:
        with open(sink_file, 'r') as f:
            for row in csv_reader(f, delimiter=',', quotechar='"'):
                if source_inchi == row[1]:
                    logger.error('      source has been found in sink')
                    return -1
    except FileNotFoundError as e:
        logger.error(e)
        return -2
    return 0
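# Usage sketch (illustrative; the sink path is hypothetical): 0 means the
# source InChI was not found in the sink, -1 means it was found, and -2 means
# the sink file could not be opened.
# status = check_src_in_sink_1(
#     source_inchi='InChI=1S/H2O/h1H2',
#     sink_file='sink.csv')
# if status != 0:
#     handle_error(status)   # hypothetical error handler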
def read_scorefile(filepath, gene, offset):
    if not exists(filepath):
        raise ValueError("file '%s' doesn't exist!" % filepath)

    with open(filepath) as fh:
        reader = csv_reader(fh, delimiter='\t')

        header = reader.next()

        # some error correction
        if len(header) < 3:
            raise RuntimeError('expected at least 3 columns')
        if header[0].lower() != 'position':
            raise RuntimeError("expected first column label to be 'position'")
        if header[1].lower() != 'aa':
            raise RuntimeError("expected second column label to be 'aa'")

        drug_idxs = [(i + 2, d) for i, d in enumerate(header[2:])]
        pos_by_drug = dict((d, []) for d in header[2:])

        for row in reader:
            # coordinates are assumed 1-indexed
            coord = int(row[0]) + offset - 1
            amino = row[1].upper()
            klass = (DrugCoord.DELETION if amino == '-' else
                     DrugCoord.INSERTION if amino == 'INSERT' else
                     DrugCoord.VALUE)
            for idx, drug in drug_idxs:
                val = int(row[idx])
                if val > 0:
                    pos_by_drug[drug].append(DrugCoord(coord, klass, amino, val))

    return dict((d, DrugData(d, gene, pos_by_drug[d])) for _, d in drug_idxs)
def process_gcc_split(data_dir: str, tsvname: str) -> List[List[str]]:
    lines = []
    with open(path.join(data_dir, tsvname), newline='') as tsvfile:
        tsv_reader = csv_reader(tsvfile, delimiter='\t')
        for line in tsv_reader:
            lines.append([detokenizer.detokenize(line[0].split()), line[1]])
    return lines
def load_attributes(filename, solr_sources, bq_sources):
    attr_file = open(filename, "r")
    for line in csv_reader(attr_file):
        if line[0] not in ATTR_SET:
            ATTR_SET[line[0]] = new_attribute(
                line[0],
                line[0].replace("_", " ").title() if re.search(r'_', line[1]) else line[1],
                Attribute.CATEGORICAL if line[2] == 'CATEGORICAL STRING' else
                Attribute.STRING if line[2] == "STRING" else
                Attribute.CONTINUOUS_NUMERIC,
                True if line[-1] == 'True' else False,
                True)
        attr = ATTR_SET[line[0]]
        if attr['name'] != 'gcs_url':
            attr['solr_collex'].extend(solr_sources)
            attr['bq_tables'].extend(bq_sources)
            attr['set_types'].append({
                'set': DataSetType.IMAGE_DATA,
                'child_record_search': 'StudyInstanceUID'
            })
        if attr['name'] in DISPLAY_VALS:
            if 'preformatted_values' in DISPLAY_VALS[attr['name']]:
                attr['preformatted_values'] = True
            else:
                attr['display_vals'] = DISPLAY_VALS[attr['name']]['vals']
    attr_file.close()
def parse_weights(weights_data_file, transcription_factors_line,
                  start_of_table_line, end_of_table):
    deleted_genes = []
    read_out_genes = []
    _data_matrix = []

    with open(weights_data_file, 'rb') as in_file:
        reader = csv_reader(in_file, delimiter='\t')
        for line_number, line in enumerate(reader):
            if line_number == transcription_factors_line:
                deleted_genes = [string.split(' ')[0] for string in line[1:]]
            if end_of_table > line_number > start_of_table_line:
                read_out_genes.append(line[0])
                # one line is all experimental conditions for a single gene
                _data_matrix.append([nan_helper(value) for value in line[1:]])

    _data_matrix = np.array(_data_matrix)

    # check that we got everything properly
    logging.info('deleted genes:\t%s', len(deleted_genes))
    logging.info('genes in read-out:\t%s', len(read_out_genes))
    logging.info('data matrix shape:\t%s', _data_matrix.shape)

    return _data_matrix, deleted_genes, read_out_genes
def open_csv(inpath, namefile, convert_to_float=False):
# ===============================================================================
    from csv import reader as csv_reader

    # open file, read all lines
    inputpath = os.path.join(inpath, namefile)
    f = open(inputpath, 'rU')
    reader = csv_reader(f, delimiter=',', skipinitialspace=True)
    lines = []
    for row in reader:
        lines.append(row)
    f.close()

    # storing headers in list headerow
    headerow = lines[0]

    # deleting rows that are not data (first and last rows of the file)
    del lines[0]

    # transforming data from string to float type
    converted_data = []
    for line in lines:
        if convert_to_float == True:
            converted_data.append(map(float, line))
        else:
            converted_data.append(line)
    data = np.array(converted_data)

    # creating one dictionary and storing the float data in it
    dictnamelist = {}
    for j, varname in enumerate(headerow):
        dictnamelist[varname] = data[:, j]

    return dictnamelist
def __parse_file_list(cls, dir_info, file_filter=FAFileFilterEnum.FILES_AND_DIRS):
    file_list = []
    f = StringIO(dir_info)
    reader = csv_reader(f, delimiter=",")
    rows = []
    for row in reader:
        rows.append(row)

    if len(rows) > 0:
        if len(rows[0]) != 1 and rows[0] != "WLANSD_FILELIST":
            raise FACommandAPIException("Unexpected file entry result at first line", rows[0])

        # TODO: implement a mapping function for filtering by type
        logging.getLogger().debug("Row count of file information : " + str(len(rows)))
        for row in rows[1:]:
            if len(row) != 6:
                raise FACommandAPIException("Unknown file entry ", row)
            if file_filter is None or file_filter is FAFileFilterEnum.FILES_AND_DIRS:
                logging.getLogger().debug("Filtering disabled.")
                file_list.append({"Path": row[0] + "/" + row[1],
                                  "DecimalDate": row[4],
                                  "DecimalTime": row[5]})
            elif file_filter is FAFileFilterEnum.FILES_ONLY:
                # only files
                if int(row[3]) == FA_FILE_IDENTIFIER:
                    logging.getLogger().debug("Filtering only files.")
                    file_list.append({"Path": row[0] + "/" + row[1],
                                      "DecimalDate": row[4],
                                      "DecimalTime": row[5]})
            elif file_filter is FAFileFilterEnum.DIRS_ONLY:
                # only directories
                if int(row[3]) == FA_DIR_IDENTIFIER:
                    logging.getLogger().debug("Filtering only directories.")
                    file_list.append({"Path": row[0] + "/" + row[1],
                                      "DecimalDate": row[4],
                                      "DecimalTime": row[5]})
            else:
                raise FACommandAPIException("Unknown file filtering!")

    return file_list
def csv_to_dict(ppl, namesfile):
    with open(namesfile, 'r') as fd:
        read_csv = csv_reader(fd, delimiter=',')
        for row in read_csv:
            row[0] = row[0].strip().title()  # The person name
            row[1] = row[1].strip()          # The e-mail
            ppl.append(tuple((row[0], row[1])))
def _m_chemXref(self, chem_xref_path):
    chemXref = {}
    with open(chem_xref_path) as f:
        c = csv_reader(f, delimiter='\t')
        for row in c:
            if not row[0][0] == '#':
                mnx = self._checkMNXMdeprecated(row[1])
                if len(row[0].split(':')) == 1:
                    dbName = 'mnx'
                    dbId = row[0]
                else:
                    dbName = row[0].split(':')[0]
                    dbId = ''.join(row[0].split(':')[1:])
                    if dbName == 'deprecated':
                        dbName = 'mnx'
                # mnx
                if not mnx in chemXref:
                    chemXref[mnx] = {}
                if not dbName in chemXref[mnx]:
                    chemXref[mnx][dbName] = []
                if not dbId in chemXref[mnx][dbName]:
                    chemXref[mnx][dbName].append(dbId)
                ### DB ###
                if not dbName in chemXref:
                    chemXref[dbName] = {}
                if not dbId in chemXref[dbName]:
                    chemXref[dbName][dbId] = mnx
    return chemXref
def _m_compXref(self, compXref_path):
    compXref = {}
    name_compXref = {}
    try:
        with open(compXref_path) as f:
            c = csv_reader(f, delimiter='\t')
            # not_recognised = []
            for row in c:
                # cid = row[0].split(':')
                if not row[0][0] == '#':
                    # collect the info
                    mnxc = row[1]
                    if len(row[0].split(':')) == 1:
                        dbName = 'mnx'
                        dbCompId = row[0]
                    else:
                        dbName = row[0].split(':')[0]
                        dbCompId = ''.join(row[0].split(':')[1:])
                        dbCompId = dbCompId.lower()
                    if dbName == 'deprecated':
                        dbName = 'mnx'
                    # create the dicts
                    if not mnxc in compXref:
                        compXref[mnxc] = {}
                    if not dbName in compXref[mnxc]:
                        compXref[mnxc][dbName] = []
                    if not dbCompId in compXref[mnxc][dbName]:
                        compXref[mnxc][dbName].append(dbCompId)
                    # create the reverse dict
                    if not dbCompId in name_compXref:
                        name_compXref[dbCompId] = mnxc
    except FileNotFoundError:
        self.logger.error('compXref file not found')
        return {}
    return compXref, name_compXref
def csv_open(file, expected_columns):
    """
    Yields rows of a CSV file as dictionaries

    Parameters:
        file - Path, or file-like object, of the CSV file to use
        expected_columns - Columns of the csv file
            If the first row of the CSV file contains exactly these labels,
            take the columns in that order
            Otherwise, take the columns in the order given by expected_columns
    """
    if isinstance(file, str):
        with open(file, encoding='utf-8') as f:
            yield from csv_open(f, expected_columns=expected_columns)
        return

    expected_columns = tuple(expected_columns)
    csv_iter = csv_reader(file)
    first_row = next(csv_iter)
    if set(first_row) == set(expected_columns):
        columns = first_row
    else:
        columns = expected_columns
        csv_iter = chain([first_row], csv_iter)

    for row in csv_iter:
        if len(row) < len(columns):
            raise IndexError("Too few columns in row {!r}".format(row))
        yield dict(zip(columns, row))
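# Usage sketch (illustrative; 'people.csv' is a hypothetical file): rows come
# back as dicts keyed by the file's own header when it matches, otherwise by
# expected_columns in the order given.
# for person in csv_open('people.csv', expected_columns=('name', 'email')):
#     print(person['name'], person['email'])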
def read_from_csv(file_name: str) -> ColourList:
    reader = csv_reader(
        open(file_name, "r", newline=""),
        delimiter=" ",
        quotechar='"',
        quoting=QUOTE_NONNUMERIC,
    )
    return [row for row in reader]
def read_csv(csv_file, encoding=r'UTF-8'):
    from csv import reader as csv_reader
    from pathlib import Path

    with Path(csv_file).open(r'rt', encoding=encoding) as istream:
        # return csv_reader(istream)  # I/O operation on closed file.
        r = csv_reader(istream)
        for y in r:
            yield y
def get_info_from_csv(csv_file, Headers_and_data=Headers_and_data):
    from csv import reader as csv_reader

    reader = csv_reader(csv_file)
    iter_reader = iter(reader)
    headers = next(iter_reader)  # Drop the row with column names
    return Headers_and_data(headers, iter_reader)
def parse_data_summary(file_location):
    data_table = []
    with open(file_location, 'rb') as source:
        reader = csv_reader(source, delimiter='\t')
        header = reader.next()
        for line in reader:
            data_table.append(line)
    return np.array(data_table).astype(np.float)
def _get_data(file_name: str):
    DataResolver._validate_file_name(file_name)
    with open(path_join(DataResolver.RES_DIR_PATH, file_name)) as f:
        reader = csv_reader(f)
        try:
            return [[int(elem) for elem in line] for line in reader]
        except ValueError:
            raise ValueError(INVALID_CHAR_IN_RES.format(file_name))
def loadDictionaryOfTranslationsIfNecessary():
    global phrase_translation
    if not phrase_translation:
        with open(filepath + '/data/translations.csv') as f:
            reader = csv_reader(f, delimiter=',', quotechar='"')
            for row in reader:
                phrase_translation[row[0]] = row[1]
        print("phrase_translation is", phrase_translation)
def tsv_init(filename):
    """
    Args:
        filename (str)
    """
    tsv_file = open(filename)
    tsv_read = csv_reader(tsv_file, delimiter="\t")
    return tsv_file, tsv_read
def parse_direct_connections(direct_connections_data_file):
    TF_2_Genes = defaultdict(lambda: 0)

    with open(direct_connections_data_file) as source:
        reader = csv_reader(source, delimiter=';')
        for line in reader:
            TF_2_Genes[line[0], line[1]] = 1

    return TF_2_Genes
def _get_records(self, local=False):
    RECORD = os.path.join(self.path, 'RECORD')
    record_reader = csv_reader(open(RECORD, 'rb'), delimiter=',')
    for row in record_reader:
        path, md5, size = row[:] + [None for i in xrange(len(row), 3)]
        if local:
            path = path.replace('/', os.sep)
            path = os.path.join(sys.prefix, path)
        yield path, md5, size
def read_csv_file(filename):
    """Yield each data row of a CSV file as a dict keyed by CSV_MAP"""
    with open(filename, encoding='latin-1') as file:
        reader = csv_reader(file)
        next(reader)  # skip header
        for line in reader:
            entry = {}
            for key in CSV_MAP:
                entry[key] = line[CSV_MAP[key]]
            yield entry
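# Usage sketch (illustrative): CSV_MAP maps output keys to column indices,
# e.g. CSV_MAP = {'name': 0, 'city': 2}; each yielded entry is one data row
# repackaged as a dict under those keys.
# for entry in read_csv_file('contacts.csv'):   # 'contacts.csv' is hypothetical
#     print(entry['name'], entry['city'])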
def run(self):
    dump_logger = getLogger('dumpscraper')

    # Let's invoke the getscore runner and tell him to work on training data
    dump_logger.info("Calculating dump score...")
    running = getscore.DumpScraperGetscore(self.settings, self.parentArgs)
    running.run()

    # First of all let's feed the classifier with the training data
    training = scipy_genfromtxt(self.settings['data_dir'] + "/" + "training/features.csv",
                                delimiter=",", skip_header=1, usecols=(0, 1, 2))
    target = scipy_genfromtxt(self.settings['data_dir'] + "/" + "training/features.csv",
                              delimiter=",", skip_header=1, usecols=(-2))

    clf = sklearn.neighbors.KNeighborsClassifier(10, weights='uniform')
    clf.fit(training, target)

    trash_count = hash_count = plain_count = 0
    cleared = []

    with open(self.settings['data_dir'] + "/" + 'features.csv', 'rb') as csvfile:
        reader = csv_reader(csvfile)

        for line in reader:
            if line[0] == 'Trash score':
                continue

            features = np_array(line[0:3])
            features = features.reshape(1, -1)
            label = clf.predict(features)

            if label == 0:
                folder = 'trash'
                trash_count += 1
            elif label == 1:
                folder = 'hash'
                hash_count += 1
            elif label == 2:
                folder = 'plain'
                plain_count += 1

            target_file = self.settings['data_dir'] + "/" + 'organized/' + folder + "/" + line[-1]
            target_dir = path.dirname(target_file)

            # If asked for a clean run, let's delete the entire folder before copying any file
            if self.parentArgs.clean and target_dir not in cleared and path.exists(target_dir):
                cleared.append(target_dir)
                shutil_rmtree(target_dir)

            if not path.exists(target_dir):
                makedirs(target_dir)

            shutil_copyfile(self.settings['data_dir'] + "/" + 'raw/' + line[-1], target_file)

    dump_logger.info("Trash files: " + str(trash_count))
    dump_logger.info("Hash files: " + str(hash_count))
    dump_logger.info("Plain files: " + str(plain_count))
    dump_logger.info("Operation completed")
def get_paths(self):
    """
    Read the list of installed paths from record or source file.

    Example
    -------
    [(u'skdata/__init__.py', u'sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU', 0),
     (u'skdata/diabetes.py', None, None),
     ...
    ]
    """
    manifest_full_path = self.manifest_full_path
    if manifest_full_path:
        python_version = self.python_version
        sp_dir = get_python_site_packages_short_path(python_version) + "/"
        prepend_metadata_dirname = basename(manifest_full_path) == "installed-files.txt"
        if prepend_metadata_dirname:
            path_prepender = basename(dirname(manifest_full_path)) + "/"
        else:
            path_prepender = ""

        def process_csv_row(row):
            cleaned_path = posix_normpath("%s%s%s" % (sp_dir, path_prepender, row[0]))
            if len(row) == 3:
                checksum, size = row[1:]
                if checksum:
                    assert checksum.startswith('sha256='), (self._metadata_dir_full_path,
                                                            cleaned_path, checksum)
                    checksum = checksum[7:]
                else:
                    checksum = None
                size = int(size) if size else None
            else:
                checksum = size = None
            return cleaned_path, checksum, size

        csv_delimiter = ','
        if PY2:
            csv_delimiter = csv_delimiter.encode('utf-8')
        with open(manifest_full_path) as csvfile:
            record_reader = csv_reader(csvfile, delimiter=csv_delimiter)
            # format of each record is (path, checksum, size)
            records = tuple(process_csv_row(row) for row in record_reader if row[0])
        files_set = set(record[0] for record in records)

        _pyc_path, _py_file_re = pyc_path, PY_FILE_RE
        py_ver_mm = get_major_minor_version(python_version, with_dot=False)
        missing_pyc_files = (ff for ff in (
            _pyc_path(f, py_ver_mm) for f in files_set if _py_file_re.match(f)
        ) if ff not in files_set)
        records = sorted(concatv(records, ((pf, None, None) for pf in missing_pyc_files)))
        return records

    return []
def load_ami_map(cls):
    ami_map = {}
    with open(dirname(__file__) + "/ami_map.csv", "r") as fd:
        reader = csv_reader(fd, dialect='excel-tab')
        header = reader.next()
        for row in reader:
            data = dict(zip(header, row))
            key = (data['os_id'], data['version'], data['region'],
                   data['virtualization_type'])
            ami_map[key] = data['ami_id']
    return ami_map
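# Expected input sketch (illustrative, not taken from the project): ami_map.csv
# is tab-separated with a header naming at least these columns; the loader keys
# the map by (os_id, version, region, virtualization_type).
#
#   os_id   version   region      virtualization_type   ami_id
#   amzn2   2.0       us-east-1   hvm                    ami-0123456789abcdef0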
def open_marbach(marbach_file, insertion_index):
    with open(marbach_file, 'rb') as source:
        reader = csv_reader(source, delimiter='\t')
        for line in reader:
            interaction_from = line[0]
            interaction_to = line[1]
            if len(line) > 2:
                weight = np.abs(float(line[2]))
            else:
                weight = np.nan
            master_accumulator[(interaction_from, interaction_to)][insertion_index] = weight
def aggregate_monthly_data(csv_data):
    """
    Pass your `csv_data` as an iterable whose members are individual lines of
    data (e.g. using a generator returned by the `iter_lines()` method of a
    `requests` library `Response` object) from a Climate Data Online
    (CDO)-style CSV file.

    Your CSV file must include the date (`DATE`), precipitation (`PRCP`),
    minimum temperature (`TMIN`), and maximum temperature (`TMAX`). The first
    line of your data file must be a header line.

    Returns a 12-member list of structured monthly data, each of which is a
    dict containing
    - `days_of_data`,
    - `precipitation_total`,
    - `min_temperature_total`,
    - `max_temperature_total`,
    - `all_min_temperatures`, and
    - `all_max_temperatures`.
    """
    csv_data = csv_reader(csv_data)

    header_row = next(csv_data)
    date_index = header_row.index('DATE')
    prcp_index = header_row.index('PRCP')
    tmin_index = header_row.index('TMIN')
    tmax_index = header_row.index('TMAX')

    monthlies = [dict(days_of_data=0,
                      precipitation_total=0,
                      min_temperature_total=0,
                      max_temperature_total=0,
                      all_min_temperatures=[],
                      all_max_temperatures=[])
                 for _ in range(12)]

    for data_row in csv_data:
        row_month = int(data_row[date_index][4:6])
        row_prcp = int(data_row[prcp_index])
        row_tmin = int(data_row[tmin_index])
        row_tmax = int(data_row[tmax_index])

        monthly = monthlies[row_month - 1]
        monthly['days_of_data'] += 1
        monthly['precipitation_total'] += row_prcp
        monthly['min_temperature_total'] += row_tmin
        monthly['max_temperature_total'] += row_tmax
        monthly['all_min_temperatures'].append(row_tmin)
        monthly['all_max_temperatures'].append(row_tmax)

    return monthlies
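# Usage sketch (not from the original module): feed the header plus data lines
# directly and derive a per-month average. Assumes CDO-style integer fields and
# DATE values whose 5th and 6th characters are the month (e.g. YYYYMMDD).
if __name__ == '__main__':
    _demo_lines = [
        "DATE,PRCP,TMIN,TMAX",
        "20230115,12,-30,15",
        "20230116,0,-25,20",
        "20230214,5,-10,40",
    ]
    _january = aggregate_monthly_data(_demo_lines)[0]
    print(_january['days_of_data'])                                          # 2
    print(sum(_january['all_max_temperatures']) / _january['days_of_data'])  # 17.5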
def antibodies(self):
    antibodies = []
    with open(self.__csvfile) as fh:
        sample = fh.read(MonogramData.__sample_len)
        sniffer = csv_sniffer()
        dialect = sniffer.sniff(sample)
        if not sniffer.has_header(sample):
            raise ValueError(MonogramData.__no_header_msg)
        fh.seek(0)
        reader = csv_reader(fh, dialect)
        # grab everything after the accession column in the header row
        for row in reader:
            antibodies.extend(r.strip() for r in row[1:])
            break
    return antibodies
def load_vector(file):
    # At the beginning we don't know how large this vector will be.
    chunk_rows = 32768
    cur_len = chunk_rows
    b = np.ndarray(shape=[cur_len], dtype=float)

    with open(file, 'r') as f:
        reader = csv_reader(f, 'excel-tab')
        for i, row in enumerate(reader):
            if i >= cur_len:
                # Enlarge the vector if we have to.
                cur_len += chunk_rows
                b.resize([cur_len])
            b[i] = row[0]

    # Probably our vector is now a bit longer than the file ... shrink it!
    b.resize([i + 1])
    return b
def open_csv(inpath, filelist, convert_to_float=False):
# ===============================================================================
    from csv import reader as csv_reader

    Dict = {}
    for i, namefile in enumerate(filelist):
        # print "\nOpening %s......" % namefile

        # open file, read all lines
        inputpath = os.path.join(inpath, namefile)
        f = open(inputpath, 'rU')
        reader = csv_reader(f, delimiter=',', skipinitialspace=True)
        lines = []
        for row in reader:
            lines.append(row)
        f.close()

        # storing headers in list headerow
        headerow = lines[0]

        # deleting rows that are not data (first and last rows of the file)
        del lines[0]

        # transforming data from string to float type,
        # replacing empty cells with the -9999. missing-value marker
        converted_data = []
        for line in lines:
            if '' in line:
                newline = []
                for it in line:
                    if it == '':
                        newline += ['-9999.']
                    if it != '':
                        newline += [it]
                line = newline
            converted_data.append(map(float, line))
        data = np.array(converted_data)

        # creating one dictionary and storing the float data in it
        dictnamelist = {}
        for j, varname in enumerate(headerow):
            dictnamelist[varname] = data[:, j]
        Dict[namefile] = dictnamelist
        # print "Dictionary created!"

    return Dict
def bank_from_csv(filename):
    """Load a bank of NRPN configurations from a CSV file."""
    settings = []
    with open(filename, 'r') as fd:
        reader = csv_reader(fd)
        for row in reader:
            try:
                setting = Setting(
                    name=row[0],
                    number=int(row[1]),
                    min=int(row[2]),
                    max=int(row[3])
                )
                settings.append(setting)
            except Exception:
                print("Could not parse row: %r" % row)
    return settings
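# Expected input sketch (illustrative): each CSV row is name,number,min,max,
# for example
#   cutoff,105,0,127
#   resonance,106,0,127
# Rows that fail to parse are reported and skipped.
# bank = bank_from_csv('nrpn_bank.csv')   # 'nrpn_bank.csv' is a hypothetical path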
def parse_complex_portal(complex_portal_file):

    def unpack_complex_contents(complex_name):
        unpacked_subnodes = []
        subnode_list = new_nodes[complex_name]['components']
        for sub_node in subnode_list:
            if sub_node in new_nodes.keys():
                unpacked_subnodes += unpack_complex_contents(sub_node)
            else:
                if ':' in sub_node or '_9606' in sub_node:
                    pass
                elif '-' in sub_node:
                    unpacked_subnodes.append(sub_node.split('-')[0])
                else:
                    unpacked_subnodes.append(sub_node)
        return unpacked_subnodes

    base = []
    new_nodes = {}

    with open(complex_portal_file, 'rb') as source:
        reader = csv_reader(source, delimiter='\t')
        header = reader.next()
        for line in reader:
            legacy_id = line[0]
            display_name = line[1]
            componenets = line[4].split('|')
            componenets = [comp.split('(')[0] for comp in componenets]
            node = {'ID': legacy_id,
                    'displayName': display_name,
                    'components': componenets}
            new_nodes[node['ID']] = node

    for node in new_nodes.itervalues():
        node['components'] = unpack_complex_contents(node['ID'])
        base += node['components']

    base = list(set(base))

    return new_nodes, base
def parse_hint(_hint_csv):
    """
    Reads protein-protein relationships from a HiNT database file

    :param _hint_csv: location of the HiNT database tsv file
    :return: {UP_Identifier: [UP_ID1, UP_ID2, ...]}
    """
    local_relations = defaultdict(list)

    with open(_hint_csv, 'r') as source_file:
        hint_reader = csv_reader(source_file, delimiter='\t')
        hint_reader.next()  # skip the header line
        for i, fields in enumerate(hint_reader):
            if fields[2] != fields[3]:
                local_relations[fields[3]].append(fields[2])
                local_relations[fields[2]].append(fields[3])

    return dict(local_relations)
def parse_TRRUST(trrust_file):
    base = []
    ret_dict = {}

    with open(trrust_file, 'rb') as source:
        reader = csv_reader(source, delimiter='\t')
        for line in reader:
            interaction_from = line[0]
            interaction_to = line[1]
            interaction_type = line[2]
            evidence = line[3].split(';')
            evidence_redundancy = len(evidence)
            base.append(interaction_to)
            base.append(interaction_from)
            ret_dict[(interaction_from, interaction_to)] = evidence_redundancy

    base = list(set(base))

    return ret_dict, base
def parse_bio_grid(bio_grid):
    """
    Parses the given file as a BioGRID file and returns the parsed contents.

    :param bio_grid: the location of the BioGRID file that needs to be parsed
    :return:
    """
    ret_dict = {}
    base = []

    with open(bio_grid, 'rb') as source_file:
        biogrid_reader = csv_reader(source_file, 'excel-tab')
        biogrid_reader.next()
        for fields in biogrid_reader:
            ret_dict[tuple(fields[7:9])] = [fields[17]]
            if fields[18] != '-':
                ret_dict[tuple(fields[7:9])].append(fields[18])
            base.append(fields[7])
            base.append(fields[8])

    return ret_dict, base
def parse_cellnet_grn(cellnet_file):
    base = []
    ret_dict = {}

    with open(cellnet_file, 'rb') as source:
        reader = csv_reader(source, delimiter=',')
        header = reader.next()
        for line in reader:
            interaction_no = int(line[0])
            interaction_from = line[1]
            interaction_to = line[2]
            interaction_z_score = float(line[3])
            interaction_correlation = float(line[4])
            base.append(interaction_to)
            base.append(interaction_from)
            ret_dict[(interaction_from, interaction_to)] = interaction_correlation

    base = list(set(base))

    return ret_dict, base
def csv2numpy(source, c_header=True, r_header=True):

    def correct_line(_row):
        return [float(item) if item not in ['inf', '', ' '] else np.inf
                for item in _row]

    with open(source, 'r') as source_file:
        reader = csv_reader(source_file)
        if c_header:
            c_headers = reader.next()
        else:
            c_headers = []

        r_headers = []
        data_container = []
        for row in reader:
            if r_header:
                r_headers.append(row[0])
                row = row[1:]
            data_container.append(correct_line(row))

    return np.array(data_container), c_headers, r_headers
genes_to_ids_dict = {}

# translation_file_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Veena data/Mouse_2_human.tsv'
# gene_to_id_file_location = ''
# data_source_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Veena data/both_ENSMUG.csv'
# data_dump_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Veena data/both_ENSHUM.csv'

translation_file_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Veena data/Mouse_2_human.tsv'
gene_to_id_file_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Kp_Km data/mouse_look_up_table.tsv'
data_source_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Kp_Km data/all_significant.csv'
data_dump_location = '/home/andrei/Dropbox/workspaces/JHU/Ewald Lab/Kp_Km data/all_sig_hum.csv'

with open(translation_file_location, 'r') as source:
    reader = csv_reader(source, delimiter='\t')
    print reader.next()
    for line in reader:
        if line[0] and line[1]:
            # We still need to account for the confidence in mapping
            if int(line[3]):
                high_conf_translation_dict[line[0]] = [line[1], line[2]]
            else:
                low_conf_translation_dict[line[0]] = [line[1], line[2]]

high_conf_trans = []
low_conf_trans = []

if gene_to_id_file_location:
log = get_logger(__name__)

interactome_interface_instance = InteractomeInterface(True, True)
interactome_interface_instance.fast_load()

md5_hash = interactome_interface_instance.md5_hash()

print "samples found to test against:\t %s" % interactome_rand_samp_db.find(
    {'size': 2, 'sys_hash': md5_hash, 'sparse_rounds': False}).count()

essential_genes_bulbs_ids = []

with open(Dumps.analysis_set_bulbs_ids, 'r') as source:
    reader = csv_reader(source)
    for line in reader:
        essential_genes_bulbs_ids += line

essential_genes_bulbs_ids = [int(gene) for gene in essential_genes_bulbs_ids]

values = []
length_width_accumulator = []
essentiality_percentage = []

for i, sample in enumerate(interactome_rand_samp_db.find(
        {'size': 2, 'sys_hash': md5_hash, 'sparse_rounds': False})):
    # if i > 10:
    #     break
def unicode_csv_reader(csvfile, *args, **kwargs):
    for row in csv_reader(csvfile, *args, **kwargs):
        yield [unicode(cell, encoding) for cell in row]
def seqrecords(self, antibodies, clonal=False):
    if clonal:
        raise ValueError('clonal property is not available with Monogram datasets')
    if len(antibodies) > 1:
        raise ValueError('only one antibody can be interrogated with Monogram datasets')

    seqrecords = []
    with open(self.__fastafile) as h:
        source = Verifier(SeqIO.parse(h, 'fasta'), DNAAlphabet)
        try:
            seqrecords = list(source)
        except VerifyError:
            source.set_alphabet(AminoAlphabet)
            seqrecords = list(source)

    underdash = re_compile(r'[_-](\d+)$')
    for r in seqrecords:
        r.id = underdash.sub(r'_\1', r.id)

    ic50s = dict((r.id, []) for r in seqrecords)

    with open(self.__csvfile) as fh:
        sample = fh.read(MonogramData.__sample_len)
        sniffer = csv_sniffer()
        dialect = sniffer.sniff(sample)
        if not sniffer.has_header(sample):
            raise ValueError(MonogramData.__no_header_msg)
        fh.seek(0)
        reader = csv_reader(fh, dialect)
        columns = None
        for i, row in enumerate(reader):
            if columns is None:
                columns = dict((v.strip(), j) for j, v in enumerate(row))
                missing = set(antibodies) - set(columns.keys())
                if len(missing):
                    raise ValueError("antibodies ('%s') not found!" % "', '".join(missing))
            else:
                acc = underdash.sub(r'_\1', row[0])
                try:
                    if acc in ic50s:
                        cln_ic50s = [float(row[columns[ab]].strip().lstrip('<>'))
                                     for ab in antibodies
                                     if ab in columns and columns[ab] < len(row)]
                        ic50s[acc].extend(cln_ic50s)
                except:
                    pass

    drop = []
    for i, r in enumerate(seqrecords):
        if r.id not in ic50s or len(ic50s[r.id]) == 0:
            drop.append(i)
            warn("skipping sequence '%s', VALUE not found" % r.id)
        else:
            values = {'IC50': ic50s[r.id]}
            r.description = json_dumps({'ab': antibodies[0], 'values': values})
            r.annotations['antibody'] = values

    for i in sorted(drop, reverse=True):
        del seqrecords[i]

    return seqrecords, clonal, antibodies